Mirror of https://github.com/zama-ai/concrete.git, synced 2026-01-11 22:08:09 -05:00

bench(backend): add a benchmark tool for concrete-cuda
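The commit moves the existing GoogleTest suite under a new `test_and_benchmark/` directory and adds a Google Benchmark based `benchmark_concrete_cuda` binary alongside it. As a quick orientation, here is a minimal sketch of running both from an existing build directory; the test path is taken verbatim from the CI workflow below, while the benchmark path is only inferred from the CMake layout in this diff and may differ in your tree:

```bash
cd backends/concrete-cuda/implementation/build
./test_and_benchmark/test/test_concrete_cuda            # moved GoogleTest suite
./test_and_benchmark/benchmark/benchmark_concrete_cuda  # new Google Benchmark tool (path inferred)
```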
.github/workflows/concrete_cuda_test.yml (vendored): 4 changed lines

@@ -104,7 +104,7 @@ jobs:
if: ${{ !cancelled() }}
run: |
cd backends/concrete-cuda/implementation/build
./test/test_concrete_cuda
./test_and_benchmark/test/test_concrete_cuda

- name: Export variables for CUDA 11.1
run: |
@@ -124,7 +124,7 @@ jobs:
if: ${{ !cancelled() }}
run: |
cd backends/concrete-cuda/implementation/build-old-cuda
./test/test_concrete_cuda --gtest_filter="Wop*"
./test_and_benchmark/test/test_concrete_cuda --gtest_filter="Wop*"

stop-runner:
name: Stop EC2 runner

@@ -66,7 +66,7 @@ set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -ccbin ${CMAKE_CXX_COMPILER} -O3 ${CUD
set(INCLUDE_DIR include)

add_subdirectory(src)
add_subdirectory(test)
add_subdirectory(test_and_benchmark)
target_include_directories(concrete_cuda PRIVATE ${INCLUDE_DIR})

# This is required for rust cargo build

@@ -1,7 +1,7 @@
#!/bin/bash

find ./{include,src,test} -iregex '^.*\.\(cpp\|cu\|h\|cuh\)$' -print | xargs clang-format-11 -i -style='file'
find ./{include,src,test_and_benchmark} -iregex '^.*\.\(cpp\|cu\|h\|cuh\)$' -print | xargs clang-format-11 -i -style='file'
cmake-format -i CMakeLists.txt -c ../../../compilers/concrete-compiler/compiler/.cmake-format-config.py

find ./{include,src,test} -type f -name "CMakeLists.txt" | xargs -I % sh -c 'cmake-format -i % -c ../../../compilers/concrete-compiler/compiler/.cmake-format-config.py'
find ./{include,src,test_and_benchmark} -type f -name "CMakeLists.txt" | xargs -I % sh -c 'cmake-format -i % -c ../../../compilers/concrete-compiler/compiler/.cmake-format-config.py'

@@ -1,340 +0,0 @@
#include "../include/bootstrap.h"
#include "../include/device.h"
#include "concrete-cpu.h"
#include "utils.h"
#include "gtest/gtest.h"
#include <cstdint>
#include <functional>
#include <stdio.h>
#include <stdlib.h>

typedef struct {
  int lwe_dimension;
  int glwe_dimension;
  int polynomial_size;
  double lwe_modular_variance;
  double glwe_modular_variance;
  int pbs_base_log;
  int pbs_level;
  int message_modulus;
  int carry_modulus;
  int number_of_inputs;
  int repetitions;
  int samples;
} BootstrapTestParams;

class BootstrapTestPrimitives_u64
    : public ::testing::TestWithParam<BootstrapTestParams> {
protected:
  int lwe_dimension;
  int glwe_dimension;
  int polynomial_size;
  double lwe_modular_variance;
  double glwe_modular_variance;
  int pbs_base_log;
  int pbs_level;
  int message_modulus;
  int carry_modulus;
  int payload_modulus;
  int number_of_inputs;
  int repetitions;
  int samples;
  uint64_t delta;
  Csprng *csprng;
  cudaStream_t *stream;
  int gpu_index = 0;
  uint64_t *lwe_sk_in_array;
  uint64_t *lwe_sk_out_array;
  uint64_t *plaintexts;
  double *d_fourier_bsk_array;
  uint64_t *d_lut_pbs_identity;
  uint64_t *d_lut_pbs_indexes;
  uint64_t *d_lwe_ct_in_array;
  uint64_t *d_lwe_ct_out_array;

public:
  // Test arithmetic functions
  void SetUp() {
    stream = cuda_create_stream(0);
    void *v_stream = (void *)stream;

    // TestParams
    lwe_dimension = (int)GetParam().lwe_dimension;
    glwe_dimension = (int)GetParam().glwe_dimension;
    polynomial_size = (int)GetParam().polynomial_size;
    lwe_modular_variance = (int)GetParam().lwe_modular_variance;
    glwe_modular_variance = (int)GetParam().glwe_modular_variance;
    pbs_base_log = (int)GetParam().pbs_base_log;
    pbs_level = (int)GetParam().pbs_level;
    message_modulus = (int)GetParam().message_modulus;
    carry_modulus = (int)GetParam().carry_modulus;
    number_of_inputs = (int)GetParam().number_of_inputs;
    repetitions = (int)GetParam().repetitions;
    samples = (int)GetParam().samples;

    payload_modulus = message_modulus * carry_modulus;
    // Value of the shift we multiply our messages by
    delta = ((uint64_t)(1) << 63) / (uint64_t)(payload_modulus);

    // Create a Csprng
    csprng =
        (Csprng *)aligned_alloc(CONCRETE_CSPRNG_ALIGN, CONCRETE_CSPRNG_SIZE);
    uint8_t seed[16] = {(uint8_t)0};
    concrete_cpu_construct_concrete_csprng(
        csprng, Uint128{.little_endian_bytes = {*seed}});

    // Generate the keys
    generate_lwe_secret_keys(&lwe_sk_in_array, lwe_dimension, csprng,
                             repetitions);
    generate_lwe_secret_keys(&lwe_sk_out_array,
                             glwe_dimension * polynomial_size, csprng,
                             repetitions);
    generate_lwe_bootstrap_keys(
        stream, gpu_index, &d_fourier_bsk_array, lwe_sk_in_array,
        lwe_sk_out_array, lwe_dimension, glwe_dimension, polynomial_size,
        pbs_level, pbs_base_log, csprng, glwe_modular_variance, repetitions);
    plaintexts = generate_plaintexts(payload_modulus, delta, number_of_inputs,
                                     repetitions, samples);

    // Create the LUT
    uint64_t *lut_pbs_identity = generate_identity_lut_pbs(
        polynomial_size, glwe_dimension, message_modulus, carry_modulus,
        [](int x) -> int { return x; });

    // Copy the LUT
    d_lut_pbs_identity = (uint64_t *)cuda_malloc_async(
        (glwe_dimension + 1) * polynomial_size * sizeof(uint64_t), stream,
        gpu_index);
    d_lut_pbs_indexes = (uint64_t *)cuda_malloc_async(
        number_of_inputs * sizeof(uint64_t), stream, gpu_index);
    cuda_synchronize_stream(v_stream);
    cuda_memset_async(d_lut_pbs_indexes, 0, number_of_inputs * sizeof(uint64_t),
                      stream, gpu_index);
    cuda_memcpy_async_to_gpu(d_lut_pbs_identity, lut_pbs_identity,
                             polynomial_size * (glwe_dimension + 1) *
                                 sizeof(uint64_t),
                             stream, gpu_index);
    cuda_synchronize_stream(v_stream);
    free(lut_pbs_identity);

    d_lwe_ct_out_array =
        (uint64_t *)cuda_malloc_async((glwe_dimension * polynomial_size + 1) *
                                          number_of_inputs * sizeof(uint64_t),
                                      stream, gpu_index);
    d_lwe_ct_in_array = (uint64_t *)cuda_malloc_async(
        (lwe_dimension + 1) * number_of_inputs * repetitions * samples *
            sizeof(uint64_t),
        stream, gpu_index);
    uint64_t *lwe_ct_in_array =
        (uint64_t *)malloc((lwe_dimension + 1) * number_of_inputs *
                           repetitions * samples * sizeof(uint64_t));
    // Create the input/output ciphertexts
    for (int r = 0; r < repetitions; r++) {
      uint64_t *lwe_sk_in = lwe_sk_in_array + (ptrdiff_t)(r * lwe_dimension);
      for (int s = 0; s < samples; s++) {
        for (int i = 0; i < number_of_inputs; i++) {
          uint64_t plaintext = plaintexts[r * samples * number_of_inputs +
                                          s * number_of_inputs + i];
          uint64_t *lwe_ct_in =
              lwe_ct_in_array + (ptrdiff_t)((r * samples * number_of_inputs +
                                             s * number_of_inputs + i) *
                                            (lwe_dimension + 1));
          concrete_cpu_encrypt_lwe_ciphertext_u64(
              lwe_sk_in, lwe_ct_in, plaintext, lwe_dimension,
              lwe_modular_variance, csprng, &CONCRETE_CSPRNG_VTABLE);
        }
      }
    }
    cuda_synchronize_stream(v_stream);
    cuda_memcpy_async_to_gpu(d_lwe_ct_in_array, lwe_ct_in_array,
                             repetitions * samples * number_of_inputs *
                                 (lwe_dimension + 1) * sizeof(uint64_t),
                             stream, gpu_index);
    free(lwe_ct_in_array);
  }

  void TearDown() {
    concrete_cpu_destroy_concrete_csprng(csprng);
    free(csprng);
    free(lwe_sk_in_array);
    free(lwe_sk_out_array);
    free(plaintexts);
    cuda_drop_async(d_fourier_bsk_array, stream, gpu_index);
    cuda_drop_async(d_lut_pbs_identity, stream, gpu_index);
    cuda_drop_async(d_lut_pbs_indexes, stream, gpu_index);
    cuda_drop_async(d_lwe_ct_in_array, stream, gpu_index);
    cuda_drop_async(d_lwe_ct_out_array, stream, gpu_index);
    cuda_destroy_stream(stream, gpu_index);
  }
};

TEST_P(BootstrapTestPrimitives_u64, amortized_bootstrap) {
  uint64_t *lwe_ct_out_array =
      (uint64_t *)malloc((glwe_dimension * polynomial_size + 1) *
                         number_of_inputs * sizeof(uint64_t));
  int8_t *pbs_buffer = nullptr;
  scratch_cuda_bootstrap_amortized_64(
      stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
      number_of_inputs, cuda_get_max_shared_memory(gpu_index), true);
  int bsk_size = (glwe_dimension + 1) * (glwe_dimension + 1) * pbs_level *
                 polynomial_size * (lwe_dimension + 1);
  // Here execute the PBS
  for (int r = 0; r < repetitions; r++) {
    double *d_fourier_bsk = d_fourier_bsk_array + (ptrdiff_t)(bsk_size * r);
    uint64_t *lwe_sk_out =
        lwe_sk_out_array + (ptrdiff_t)(r * glwe_dimension * polynomial_size);
    for (int s = 0; s < samples; s++) {
      uint64_t *d_lwe_ct_in =
          d_lwe_ct_in_array +
          (ptrdiff_t)((r * samples * number_of_inputs + s * number_of_inputs) *
                      (lwe_dimension + 1));
      // Execute PBS
      cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
          stream, gpu_index, (void *)d_lwe_ct_out_array,
          (void *)d_lut_pbs_identity, (void *)d_lut_pbs_indexes,
          (void *)d_lwe_ct_in, (void *)d_fourier_bsk, pbs_buffer, lwe_dimension,
          glwe_dimension, polynomial_size, pbs_base_log, pbs_level,
          number_of_inputs, 1, 0, cuda_get_max_shared_memory(gpu_index));
      // Copy result back
      cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array,
                               (glwe_dimension * polynomial_size + 1) *
                                   number_of_inputs * sizeof(uint64_t),
                               stream, gpu_index);

      for (int j = 0; j < number_of_inputs; j++) {
        uint64_t *result =
            lwe_ct_out_array +
            (ptrdiff_t)(j * (glwe_dimension * polynomial_size + 1));
        uint64_t plaintext = plaintexts[r * samples * number_of_inputs +
                                        s * number_of_inputs + j];
        uint64_t decrypted = 0;
        concrete_cpu_decrypt_lwe_ciphertext_u64(
            lwe_sk_out, result, glwe_dimension * polynomial_size, &decrypted);
        EXPECT_NE(decrypted, plaintext);
        // let err = (decrypted >= plaintext) ? decrypted - plaintext :
        // plaintext
        // - decrypted;
        // error_sample_vec.push(err);

        // The bit before the message
        uint64_t rounding_bit = delta >> 1;
        // Compute the rounding bit
        uint64_t rounding = (decrypted & rounding_bit) << 1;
        uint64_t decoded = (decrypted + rounding) / delta;
        EXPECT_EQ(decoded, plaintext / delta);
      }
    }
  }
  cleanup_cuda_bootstrap_amortized(stream, gpu_index, &pbs_buffer);
  free(lwe_ct_out_array);
}

TEST_P(BootstrapTestPrimitives_u64, low_latency_bootstrap) {
  int number_of_sm = 0;
  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
  if (number_of_inputs > number_of_sm * 4 / (glwe_dimension + 1) / pbs_level)
    GTEST_SKIP() << "The Low Latency PBS does not support this configuration";
  uint64_t *lwe_ct_out_array =
      (uint64_t *)malloc((glwe_dimension * polynomial_size + 1) *
                         number_of_inputs * sizeof(uint64_t));
  int8_t *pbs_buffer = nullptr;
  scratch_cuda_bootstrap_low_latency_64(
      stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
      pbs_level, number_of_inputs, cuda_get_max_shared_memory(gpu_index), true);
  int bsk_size = (glwe_dimension + 1) * (glwe_dimension + 1) * pbs_level *
                 polynomial_size * (lwe_dimension + 1);
  // Here execute the PBS
  for (int r = 0; r < repetitions; r++) {
    double *d_fourier_bsk = d_fourier_bsk_array + (ptrdiff_t)(bsk_size * r);
    uint64_t *lwe_sk_out =
        lwe_sk_out_array + (ptrdiff_t)(r * glwe_dimension * polynomial_size);
    for (int s = 0; s < samples; s++) {
      uint64_t *d_lwe_ct_in =
          d_lwe_ct_in_array +
          (ptrdiff_t)((r * samples * number_of_inputs + s * number_of_inputs) *
                      (lwe_dimension + 1));
      // Execute PBS
      cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
          stream, gpu_index, (void *)d_lwe_ct_out_array,
          (void *)d_lut_pbs_identity, (void *)d_lut_pbs_indexes,
          (void *)d_lwe_ct_in, (void *)d_fourier_bsk, pbs_buffer, lwe_dimension,
          glwe_dimension, polynomial_size, pbs_base_log, pbs_level,
          number_of_inputs, 1, 0, cuda_get_max_shared_memory(gpu_index));
      // Copy result back
      cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array,
                               (glwe_dimension * polynomial_size + 1) *
                                   number_of_inputs * sizeof(uint64_t),
                               stream, gpu_index);

      for (int j = 0; j < number_of_inputs; j++) {
        uint64_t *result =
            lwe_ct_out_array +
            (ptrdiff_t)(j * (glwe_dimension * polynomial_size + 1));
        uint64_t plaintext = plaintexts[r * samples * number_of_inputs +
                                        s * number_of_inputs + j];
        uint64_t decrypted = 0;
        concrete_cpu_decrypt_lwe_ciphertext_u64(
            lwe_sk_out, result, glwe_dimension * polynomial_size, &decrypted);
        EXPECT_NE(decrypted, plaintext);
        // let err = (decrypted >= plaintext) ? decrypted - plaintext :
        // plaintext
        // - decrypted;
        // error_sample_vec.push(err);

        // The bit before the message
        uint64_t rounding_bit = delta >> 1;
        // Compute the rounding bit
        uint64_t rounding = (decrypted & rounding_bit) << 1;
        uint64_t decoded = (decrypted + rounding) / delta;
        EXPECT_EQ(decoded, plaintext / delta);
      }
    }
  }
  cleanup_cuda_bootstrap_low_latency(stream, gpu_index, &pbs_buffer);
  free(lwe_ct_out_array);
}

// Defines for which parameters set the PBS will be tested.
// It executes each test for all pairs on phis X qs (Cartesian product)
::testing::internal::ParamGenerator<BootstrapTestParams> pbs_params_u64 =
    ::testing::Values(
        // n, k, N, lwe_variance, glwe_variance, pbs_base_log, pbs_level,
        // message_modulus, carry_modulus, number_of_inputs, repetitions,
        // samples
        (BootstrapTestParams){567, 5, 256, 0.000007069849454709433,
                              0.00000000000000029403601535432533, 15, 1, 2, 1,
                              5, 2, 50},
        (BootstrapTestParams){623, 6, 256, 7.52316384526264e-37,
                              7.52316384526264e-37, 9, 3, 2, 2, 5, 2, 50},
        (BootstrapTestParams){694, 3, 512, 0.000007069849454709433,
                              0.00000000000000029403601535432533, 18, 1, 2, 1,
                              5, 2, 50},
        (BootstrapTestParams){769, 2, 1024, 0.000007069849454709433,
                              0.00000000000000029403601535432533, 23, 1, 2, 1,
                              5, 2, 50},
        (BootstrapTestParams){754, 1, 2048, 0.000007069849454709433,
                              0.00000000000000029403601535432533, 23, 1, 4, 1,
                              5, 2, 50},
        (BootstrapTestParams){847, 1, 4096, 0.000007069849454709433,
                              0.00000000000000029403601535432533, 2, 12, 2, 1,
                              2, 1, 50},
        (BootstrapTestParams){881, 1, 8192, 0.000007069849454709433,
                              0.00000000000000029403601535432533, 22, 1, 2, 1,
                              2, 1, 25},
        (BootstrapTestParams){976, 1, 16384, 0.000007069849454709433,
                              0.00000000000000029403601535432533, 11, 3, 4, 1,
                              2, 1, 10});

std::string printParamName(::testing::TestParamInfo<BootstrapTestParams> p) {
  BootstrapTestParams params = p.param;

  return "n_" + std::to_string(params.lwe_dimension) + "_k_" +
         std::to_string(params.glwe_dimension) + "_N_" +
         std::to_string(params.polynomial_size) + "_pbs_base_log_" +
         std::to_string(params.pbs_base_log) + "_pbs_level_" +
         std::to_string(params.pbs_level) + "_number_of_inputs_" +
         std::to_string(params.number_of_inputs);
}

INSTANTIATE_TEST_CASE_P(BootstrapInstantiation, BootstrapTestPrimitives_u64,
                        pbs_params_u64, printParamName);

@@ -1,340 +0,0 @@
#include "../include/device.h"
#include "../include/linear_algebra.h"
#include "concrete-cpu.h"
#include "utils.h"
#include "gtest/gtest.h"
#include <cstdint>
#include <functional>
#include <stdio.h>
#include <stdlib.h>

const unsigned REPETITIONS = 5;
const unsigned SAMPLES = 100;

typedef struct {
  int lwe_dimension;
  double noise_variance;
  int message_modulus;
  int carry_modulus;
  int number_of_inputs;
} LinearAlgebraTestParams;

class LinearAlgebraTestPrimitives_u64
    : public ::testing::TestWithParam<LinearAlgebraTestParams> {
protected:
  int lwe_dimension;
  double noise_variance;
  int message_modulus;
  int carry_modulus;
  int number_of_inputs;
  int payload_modulus;
  uint64_t delta;
  Csprng *csprng;
  cudaStream_t *stream;
  int gpu_index = 0;
  uint64_t *lwe_sk_array;
  uint64_t *d_lwe_in_1_ct;
  uint64_t *d_lwe_in_2_ct;
  uint64_t *d_lwe_out_ct;
  uint64_t *lwe_in_1_ct;
  uint64_t *lwe_in_2_ct;
  uint64_t *lwe_out_ct;
  uint64_t *plaintexts_1;
  uint64_t *plaintexts_2;
  int num_samples;

public:
  // Test arithmetic functions
  void SetUp() {
    stream = cuda_create_stream(0);
    void *v_stream = (void *)stream;

    // TestParams
    lwe_dimension = (int)GetParam().lwe_dimension;
    noise_variance = (double)GetParam().noise_variance;
    message_modulus = (int)GetParam().message_modulus;
    carry_modulus = (int)GetParam().carry_modulus;
    number_of_inputs = (int)GetParam().number_of_inputs;

    payload_modulus = message_modulus * carry_modulus;
    // Value of the shift we multiply our messages by
    delta = ((uint64_t)(1) << 63) / (uint64_t)(payload_modulus);

    // Create a Csprng
    csprng =
        (Csprng *)aligned_alloc(CONCRETE_CSPRNG_ALIGN, CONCRETE_CSPRNG_SIZE);
    uint8_t seed[16] = {(uint8_t)0};
    concrete_cpu_construct_concrete_csprng(
        csprng, Uint128{.little_endian_bytes = {*seed}});

    // Generate the keys
    generate_lwe_secret_keys(&lwe_sk_array, lwe_dimension, csprng, REPETITIONS);
    plaintexts_1 = generate_plaintexts(payload_modulus, delta, number_of_inputs,
                                       REPETITIONS, SAMPLES);
    plaintexts_2 = generate_plaintexts(payload_modulus, delta, number_of_inputs,
                                       REPETITIONS, SAMPLES);

    d_lwe_in_1_ct = (uint64_t *)cuda_malloc_async(
        number_of_inputs * (lwe_dimension + 1) * sizeof(uint64_t), stream,
        gpu_index);
    d_lwe_in_2_ct = (uint64_t *)cuda_malloc_async(
        number_of_inputs * (lwe_dimension + 1) * sizeof(uint64_t), stream,
        gpu_index);
    d_lwe_out_ct = (uint64_t *)cuda_malloc_async(
        number_of_inputs * (lwe_dimension + 1) * sizeof(uint64_t), stream,
        gpu_index);

    lwe_in_1_ct = (uint64_t *)malloc(number_of_inputs * (lwe_dimension + 1) *
                                     sizeof(uint64_t));
    lwe_in_2_ct = (uint64_t *)malloc(number_of_inputs * (lwe_dimension + 1) *
                                     sizeof(uint64_t));
    lwe_out_ct = (uint64_t *)malloc(number_of_inputs * (lwe_dimension + 1) *
                                    sizeof(uint64_t));

    cuda_synchronize_stream(v_stream);
  }

  void TearDown() {
    void *v_stream = (void *)stream;

    cuda_synchronize_stream(v_stream);
    concrete_cpu_destroy_concrete_csprng(csprng);
    free(csprng);
    cuda_drop_async(d_lwe_in_1_ct, stream, gpu_index);
    cuda_drop_async(d_lwe_in_2_ct, stream, gpu_index);
    cuda_drop_async(d_lwe_out_ct, stream, gpu_index);
    free(lwe_in_1_ct);
    free(lwe_in_2_ct);
    free(lwe_out_ct);
    free(lwe_sk_array);
    free(plaintexts_1);
    free(plaintexts_2);
  }
};

TEST_P(LinearAlgebraTestPrimitives_u64, addition) {
  // Here execute the PBS
  for (uint r = 0; r < REPETITIONS; r++) {
    for (uint s = 0; s < SAMPLES; s++) {
      uint64_t *lwe_sk = lwe_sk_array + (ptrdiff_t)(r * lwe_dimension);
      for (int i = 0; i < number_of_inputs; i++) {
        uint64_t plaintext_1 = plaintexts_1[r * SAMPLES * number_of_inputs +
                                            s * number_of_inputs + i];
        uint64_t plaintext_2 = plaintexts_2[r * SAMPLES * number_of_inputs +
                                            s * number_of_inputs + i];
        concrete_cpu_encrypt_lwe_ciphertext_u64(
            lwe_sk, lwe_in_1_ct + i * (lwe_dimension + 1), plaintext_1,
            lwe_dimension, noise_variance, csprng, &CONCRETE_CSPRNG_VTABLE);
        concrete_cpu_encrypt_lwe_ciphertext_u64(
            lwe_sk, lwe_in_2_ct + i * (lwe_dimension + 1), plaintext_2,
            lwe_dimension, noise_variance, csprng, &CONCRETE_CSPRNG_VTABLE);
      }
      cuda_memcpy_async_to_gpu(d_lwe_in_1_ct, lwe_in_1_ct,
                               number_of_inputs * (lwe_dimension + 1) *
                                   sizeof(uint64_t),
                               stream, gpu_index);
      cuda_memcpy_async_to_gpu(d_lwe_in_2_ct, lwe_in_2_ct,
                               number_of_inputs * (lwe_dimension + 1) *
                                   sizeof(uint64_t),
                               stream, gpu_index);
      // Execute addition
      cuda_add_lwe_ciphertext_vector_64(
          stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_in_1_ct,
          (void *)d_lwe_in_2_ct, lwe_dimension, number_of_inputs);

      // Copy result back
      cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct,
                               number_of_inputs * (lwe_dimension + 1) *
                                   sizeof(uint64_t),
                               stream, gpu_index);
      for (int i = 0; i < number_of_inputs; i++) {
        uint64_t plaintext_1 = plaintexts_1[r * SAMPLES * number_of_inputs +
                                            s * number_of_inputs + i];
        uint64_t plaintext_2 = plaintexts_2[r * SAMPLES * number_of_inputs +
                                            s * number_of_inputs + i];
        uint64_t decrypted = 0;
        concrete_cpu_decrypt_lwe_ciphertext_u64(
            lwe_sk, lwe_out_ct + i * (lwe_dimension + 1), lwe_dimension,
            &decrypted);
        // The bit before the message
        uint64_t rounding_bit = delta >> 1;
        // Compute the rounding bit
        uint64_t rounding = (decrypted & rounding_bit) << 1;
        uint64_t decoded = (decrypted + rounding) / delta;
        EXPECT_EQ(decoded, (plaintext_1 + plaintext_2) / delta);
      }
    }
  }
}

TEST_P(LinearAlgebraTestPrimitives_u64, plaintext_addition) {
  // Here execute the PBS
  for (uint r = 0; r < REPETITIONS; r++) {
    for (uint s = 0; s < SAMPLES; s++) {
      uint64_t *lwe_sk = lwe_sk_array + (ptrdiff_t)(r * lwe_dimension);
      for (int i = 0; i < number_of_inputs; i++) {
        uint64_t plaintext_1 = plaintexts_1[r * SAMPLES * number_of_inputs +
                                            s * number_of_inputs + i];
        concrete_cpu_encrypt_lwe_ciphertext_u64(
            lwe_sk, lwe_in_1_ct + i * (lwe_dimension + 1), plaintext_1,
            lwe_dimension, noise_variance, csprng, &CONCRETE_CSPRNG_VTABLE);
      }
      cuda_memcpy_async_to_gpu(d_lwe_in_1_ct, lwe_in_1_ct,
                               number_of_inputs * (lwe_dimension + 1) *
                                   sizeof(uint64_t),
                               stream, gpu_index);
      cuda_memcpy_async_to_gpu(
          d_lwe_in_2_ct,
          &plaintexts_2[r * SAMPLES * number_of_inputs + s * number_of_inputs],
          number_of_inputs * sizeof(uint64_t), stream, gpu_index);
      // Execute addition
      cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
          stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_in_1_ct,
          (void *)d_lwe_in_2_ct, lwe_dimension, number_of_inputs);
      // Copy result back
      cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct,
                               number_of_inputs * (lwe_dimension + 1) *
                                   sizeof(uint64_t),
                               stream, gpu_index);
      for (int i = 0; i < number_of_inputs; i++) {
        uint64_t plaintext_1 = plaintexts_1[r * SAMPLES * number_of_inputs +
                                            s * number_of_inputs + i];
        uint64_t plaintext_2 = plaintexts_2[r * SAMPLES * number_of_inputs +
                                            s * number_of_inputs + i];
        uint64_t decrypted = 0;
        concrete_cpu_decrypt_lwe_ciphertext_u64(
            lwe_sk, lwe_out_ct + i * (lwe_dimension + 1), lwe_dimension,
            &decrypted);
        // The bit before the message
        uint64_t rounding_bit = delta >> 1;
        // Compute the rounding bit
        uint64_t rounding = (decrypted & rounding_bit) << 1;
        uint64_t decoded = (decrypted + rounding) / delta;
        EXPECT_EQ(decoded, (plaintext_1 + plaintext_2) / delta);
      }
    }
  }
}

TEST_P(LinearAlgebraTestPrimitives_u64, cleartext_multiplication) {
  void *v_stream = (void *)stream;
  uint64_t delta_2 =
      ((uint64_t)(1) << 63) / (uint64_t)(payload_modulus * payload_modulus);
  // Here execute the PBS
  for (uint r = 0; r < REPETITIONS; r++) {
    for (uint s = 0; s < SAMPLES; s++) {
      uint64_t *lwe_sk = lwe_sk_array + (ptrdiff_t)(r * lwe_dimension);
      uint64_t *cleartext_array =
          (uint64_t *)malloc(number_of_inputs * sizeof(uint64_t));
      for (int i = 0; i < number_of_inputs; i++) {
        uint64_t plaintext_1 = plaintexts_1[r * SAMPLES * number_of_inputs +
                                            s * number_of_inputs + i] /
                               delta * delta_2;
        uint64_t plaintext_2 = plaintexts_2[r * SAMPLES * number_of_inputs +
                                            s * number_of_inputs + i];
        cleartext_array[i] = plaintext_2 / delta;
        concrete_cpu_encrypt_lwe_ciphertext_u64(
            lwe_sk, lwe_in_1_ct + i * (lwe_dimension + 1), plaintext_1,
            lwe_dimension, noise_variance, csprng, &CONCRETE_CSPRNG_VTABLE);
      }
      cuda_synchronize_stream(v_stream);
      cuda_memcpy_async_to_gpu(d_lwe_in_1_ct, lwe_in_1_ct,
                               number_of_inputs * (lwe_dimension + 1) *
                                   sizeof(uint64_t),
                               stream, gpu_index);
      cuda_memcpy_async_to_gpu(d_lwe_in_2_ct, cleartext_array,
                               number_of_inputs * sizeof(uint64_t), stream,
                               gpu_index);
      // Execute cleartext multiplication
      cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
          stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_in_1_ct,
          (void *)d_lwe_in_2_ct, lwe_dimension, number_of_inputs);
      // Copy result back
      cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct,
                               number_of_inputs * (lwe_dimension + 1) *
                                   sizeof(uint64_t),
                               stream, gpu_index);
      cuda_synchronize_stream(v_stream);
      for (int i = 0; i < number_of_inputs; i++) {
        uint64_t plaintext = plaintexts_1[r * SAMPLES * number_of_inputs +
                                          s * number_of_inputs + i] /
                             delta * delta_2;
        uint64_t cleartext = plaintexts_2[r * SAMPLES * number_of_inputs +
                                          s * number_of_inputs + i] /
                             delta;
        uint64_t decrypted = 0;
        concrete_cpu_decrypt_lwe_ciphertext_u64(
            lwe_sk, lwe_out_ct + i * (lwe_dimension + 1), lwe_dimension,
            &decrypted);
        // The bit before the message
        uint64_t rounding_bit = delta_2 >> 1;
        // Compute the rounding bit
        uint64_t rounding = (decrypted & rounding_bit) << 1;
        uint64_t decoded = (decrypted + rounding) / delta_2;
        EXPECT_EQ(decoded, plaintext / delta_2 * cleartext);
      }
    }
  }
}

TEST_P(LinearAlgebraTestPrimitives_u64, negate) {
  // Here execute the PBS
  for (uint r = 0; r < REPETITIONS; r++) {
    for (uint s = 0; s < SAMPLES; s++) {
      uint64_t *lwe_sk = lwe_sk_array + (ptrdiff_t)(r * lwe_dimension);
      for (int i = 0; i < number_of_inputs; i++) {
        uint64_t plaintext = plaintexts_1[r * SAMPLES * number_of_inputs +
                                          s * number_of_inputs + i];
        concrete_cpu_encrypt_lwe_ciphertext_u64(
            lwe_sk, lwe_in_1_ct + i * (lwe_dimension + 1), plaintext,
            lwe_dimension, noise_variance, csprng, &CONCRETE_CSPRNG_VTABLE);
      }
      cuda_memcpy_async_to_gpu(d_lwe_in_1_ct, lwe_in_1_ct,
                               number_of_inputs * (lwe_dimension + 1) *
                                   sizeof(uint64_t),
                               stream, gpu_index);
      // Execute addition
      cuda_negate_lwe_ciphertext_vector_64(
          stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_in_1_ct,
          lwe_dimension, number_of_inputs);

      // Copy result back
      cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct,
                               number_of_inputs * (lwe_dimension + 1) *
                                   sizeof(uint64_t),
                               stream, gpu_index);
      for (int i = 0; i < number_of_inputs; i++) {
        uint64_t plaintext = plaintexts_1[r * SAMPLES * number_of_inputs +
                                          s * number_of_inputs + i];
        uint64_t decrypted = 0;
        concrete_cpu_decrypt_lwe_ciphertext_u64(
            lwe_sk, lwe_out_ct + i * (lwe_dimension + 1), lwe_dimension,
            &decrypted);
        // The bit before the message
        uint64_t rounding_bit = delta >> 1;
        // Compute the rounding bit
        uint64_t rounding = (decrypted & rounding_bit) << 1;
        uint64_t decoded = (decrypted + rounding) / delta;
        EXPECT_EQ(decoded, -plaintext / delta);
      }
    }
  }
}

// Defines for which parameters set the linear algebra operations will be
// tested. It executes each test for all pairs on phis X qs (Cartesian product)
::testing::internal::ParamGenerator<LinearAlgebraTestParams>
    linear_algebra_params_u64 = ::testing::Values(
        // n, lwe_std_dev, message_modulus, carry_modulus, number_of_inputs
        (LinearAlgebraTestParams){600, 7.52316384526264e-37, 2, 2, 10});

std::string
printParamName(::testing::TestParamInfo<LinearAlgebraTestParams> p) {
  LinearAlgebraTestParams params = p.param;

  return "n_" + std::to_string(params.lwe_dimension);
}

INSTANTIATE_TEST_CASE_P(LinearAlgebraInstantiation,
                        LinearAlgebraTestPrimitives_u64,
                        linear_algebra_params_u64, printParamName);

@@ -0,0 +1,2 @@
add_subdirectory(test)
add_subdirectory(benchmark)

@@ -0,0 +1,48 @@
cmake_minimum_required(VERSION 3.8 FATAL_ERROR)
find_package(CUDAToolkit)

if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Release)
endif()

# Disable the Google Benchmark requirement on Google Test
set(BENCHMARK_ENABLE_GTEST_TESTS OFF)
set(BENCHMARK_ENABLE_TESTING OFF)

FetchContent_Declare(
  googlebenchmark
  GIT_REPOSITORY https://github.com/google/benchmark.git
  GIT_TAG v1.7.1)
FetchContent_MakeAvailable(googlebenchmark)

# Enable ExternalProject CMake module
include(ExternalProject)

set(CONCRETE_CPU_BINARY_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../../../concrete-cpu/implementation/target/release")
set(CONCRETE_CPU_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../../../concrete-cpu/implementation")
set(CONCRETE_CUDA_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../")

include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../include)
include_directories(${CONCRETE_CPU_SOURCE_DIR}/include)
include_directories(${CONCRETE_CUDA_SOURCE_DIR}/include)

add_library(concrete_cpu_lib STATIC IMPORTED)
set_target_properties(concrete_cpu_lib PROPERTIES IMPORTED_LOCATION ${CONCRETE_CPU_BINARY_DIR}/libconcrete_cpu.a)

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,--no-as-needed -ldl")

set(BINARY benchmark_concrete_cuda)

file(
  GLOB_RECURSE BENCH_SOURCES
  LIST_DIRECTORIES false
  benchmark*.cpp main.cpp)
set(SOURCES ${BENCH_SOURCES})

add_executable(${BINARY} ${BENCH_SOURCES} ../utils.cpp ../setup_and_teardown.cpp)

set_target_properties(benchmark_concrete_cuda PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(
  benchmark_concrete_cuda
  PUBLIC benchmark::benchmark concrete_cpu_lib concrete_cuda
  PRIVATE CUDA::cudart)

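The imported `concrete_cpu_lib` target expects `libconcrete_cpu.a` to already exist under `CONCRETE_CPU_BINARY_DIR`. A minimal sketch of producing it first, assuming the concrete-cpu implementation is the Cargo crate at the relative path used above (the build command and profile are assumptions, not part of this diff):

```bash
# Assumed prerequisite: build the Rust concrete-cpu static library in release mode,
# which places libconcrete_cpu.a under target/release as the CMakeLists expects.
cd backends/concrete-cpu/implementation
cargo build --release
```
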
@@ -0,0 +1,78 @@
# benchmark_concrete_cuda

This benchmark tool is built on top of the Google Benchmark library. It measures the performance
of the CUDA-accelerated functions of the Concrete framework and helps identify potential
bottlenecks.

## How to Compile

The first step in compiling code with CMake is to create a build directory. This directory will
contain all the files generated during the build process, such as object files and executables.
We recommend creating this directory outside of the source tree, but inside the implementation
folder, to keep the source directory clean.

```bash
$ cd concrete-open-source/backends/concrete-cuda/implementation
$ mkdir build
$ cd build
```

Run CMake to generate the build files, then use make to compile the project.

```bash
$ cmake ..
$ make
```
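If you want the configuration to be explicit, note that the benchmark's CMakeLists defaults `CMAKE_BUILD_TYPE` to Release when it is unset; the sketch below simply makes that choice visible and parallelizes the build with standard CMake/make options (nothing here is specific to this tool):

```bash
$ cmake .. -DCMAKE_BUILD_TYPE=Release   # same default the CMakeLists applies when unset
$ make -j"$(nproc)"                     # run one build job per CPU core
```
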

## How to Run Benchmarks

To run the benchmarks, simply execute the `benchmark_concrete_cuda` executable with no arguments:

```bash
$ benchmark/benchmark_concrete_cuda
```

This will run all the benchmarks registered in the code.

## How to Filter Benchmarks

You can filter benchmarks by passing a regular expression as an argument. Only benchmarks whose
name matches the regular expression will be executed.

For example, to run only the benchmarks whose name contains the word "Bootstrap", execute:

```bash
$ benchmark/benchmark_concrete_cuda --benchmark_filter=Bootstrap
```
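Benchmark names follow Google Benchmark's `Fixture/Case/args...` convention, so it can help to list them before writing a filter. The listing flag is a standard Google Benchmark option; the filter below targets the keyswitch fixture registered in this commit and is only an illustration:

```bash
$ benchmark/benchmark_concrete_cuda --benchmark_list_tests=true
$ benchmark/benchmark_concrete_cuda --benchmark_filter='KeyswitchBenchmark_u64/Keyswitch'
```
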

## How to Set the Time Unit

By default, timings are reported in nanoseconds. You can change the time unit to one of the
following:

* `ns` (nanoseconds)
* `us` (microseconds)
* `ms` (milliseconds)
* `s` (seconds)

To set the time unit, use the `--benchmark_time_unit` option followed by the desired unit:

```bash
$ benchmark/benchmark_concrete_cuda --benchmark_time_unit=us
```

## How to Set the Number of Iterations

By default, each benchmark runs for a number of iterations that is determined automatically by
the Google Benchmark library. You can increase the minimum measurement time, and therefore the
number of iterations, with `--benchmark_min_time`. For instance:

```bash
$ benchmark/benchmark_concrete_cuda --benchmark_min_time=10
```

forces the tool to run each benchmark for at least 10 seconds' worth of iterations.
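For noisy GPU measurements it can also help to repeat each benchmark several times and export machine-readable results. The flags below are standard Google Benchmark options; the output file name is just an example:

```bash
$ benchmark/benchmark_concrete_cuda \
    --benchmark_repetitions=5 \
    --benchmark_report_aggregates_only=true \
    --benchmark_out=results.json --benchmark_out_format=json
```
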

## Conclusion

With these options, you can easily run benchmarks, filter them, set the time unit, and control
the number of iterations of `benchmark_concrete_cuda`. If you have any questions or issues,
please feel free to contact us.
To learn more about the Google Benchmark library, please refer to the
[official user guide](https://github.com/google/benchmark/blob/main/docs/user_guide.md).

@@ -0,0 +1,113 @@
#include <benchmark/benchmark.h>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <setup_and_teardown.h>

typedef struct {
  int lwe_dimension;
  int glwe_dimension;
  int polynomial_size;
  int pbs_base_log;
  int pbs_level;
  int ks_base_log;
  int ks_level;
  int number_of_bits_of_message_including_padding;
  int number_of_bits_to_extract;
  int number_of_inputs;
} BitExtractionBenchmarkParams;

class BitExtractionBenchmark_u64 : public benchmark::Fixture {
protected:
  int lwe_dimension;
  int glwe_dimension;
  int polynomial_size;
  double lwe_modular_variance = 7.52316384526264e-37;
  double glwe_modular_variance = 7.52316384526264e-37;
  int pbs_base_log;
  int pbs_level;
  int ks_base_log;
  int ks_level;
  int number_of_bits_of_message_including_padding;
  int number_of_bits_to_extract;
  int number_of_inputs;
  uint64_t delta;
  int delta_log;
  Csprng *csprng;
  cudaStream_t *stream;
  int gpu_index = 0;
  uint64_t *plaintexts;
  double *d_fourier_bsk;
  uint64_t *d_ksk;
  uint64_t *d_lwe_ct_in_array;
  uint64_t *d_lwe_ct_out_array;
  int8_t *bit_extract_buffer;
  uint64_t *lwe_sk_in;
  uint64_t *lwe_sk_out;

public:
  // Test arithmetic functions
  void SetUp(const ::benchmark::State &state) {
    stream = cuda_create_stream(0);

    // TestParams
    lwe_dimension = state.range(0);
    glwe_dimension = state.range(1);
    polynomial_size = state.range(2);
    pbs_base_log = state.range(3);
    pbs_level = state.range(4);
    ks_base_log = state.range(5);
    ks_level = state.range(6);
    number_of_bits_of_message_including_padding = state.range(7);
    number_of_bits_to_extract = state.range(8);
    number_of_inputs = state.range(9);

    bit_extraction_setup(
        stream, &csprng, &lwe_sk_in, &lwe_sk_out, &d_fourier_bsk, &d_ksk,
        &plaintexts, &d_lwe_ct_in_array, &d_lwe_ct_out_array,
        &bit_extract_buffer, lwe_dimension, glwe_dimension, polynomial_size,
        lwe_modular_variance, glwe_modular_variance, ks_base_log, ks_level,
        pbs_base_log, pbs_level, number_of_bits_of_message_including_padding,
        number_of_bits_to_extract, &delta_log, &delta, number_of_inputs, 1, 1,
        gpu_index);
  }

  void TearDown() {
    bit_extraction_teardown(stream, csprng, lwe_sk_in, lwe_sk_out,
                            d_fourier_bsk, d_ksk, plaintexts, d_lwe_ct_in_array,
                            d_lwe_ct_out_array, bit_extract_buffer, gpu_index);
  }
};

BENCHMARK_DEFINE_F(BitExtractionBenchmark_u64, BitExtraction)
(benchmark::State &st) {
  for (auto _ : st) {
    // Execute bit extract
    cuda_extract_bits_64(
        stream, gpu_index, (void *)d_lwe_ct_out_array,
        (void *)d_lwe_ct_in_array, bit_extract_buffer, (void *)d_ksk,
        (void *)d_fourier_bsk, number_of_bits_to_extract, delta_log,
        glwe_dimension * polynomial_size, lwe_dimension, glwe_dimension,
        polynomial_size, pbs_base_log, pbs_level, ks_base_log, ks_level,
        number_of_inputs, cuda_get_max_shared_memory(gpu_index));
    cuda_synchronize_stream(stream);
  }
}

static void
BitExtractionBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
  // Define the parameters to benchmark
  std::vector<BitExtractionBenchmarkParams> params = {
      (BitExtractionBenchmarkParams){585, 1, 1024, 10, 2, 4, 7, 5, 5, 1} //,
  };

  // Add to the list of parameters to benchmark
  for (auto x : params)
    b->Args({x.lwe_dimension, x.glwe_dimension, x.polynomial_size,
             x.pbs_base_log, x.pbs_level, x.ks_base_log, x.ks_level,
             x.number_of_bits_of_message_including_padding,
             x.number_of_bits_to_extract, x.number_of_inputs});
}

BENCHMARK_REGISTER_F(BitExtractionBenchmark_u64, BitExtraction)
    ->Apply(BitExtractionBenchmarkGenerateParams);

@@ -0,0 +1,195 @@
#include <benchmark/benchmark.h>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <setup_and_teardown.h>

typedef struct {
  int lwe_dimension;
  int glwe_dimension;
  int polynomial_size;
  int pbs_base_log;
  int pbs_level;
  int input_lwe_ciphertext_count;
} BootstrapBenchmarkParams;

class BootstrapBenchmark_u64 : public benchmark::Fixture {
protected:
  int lwe_dimension;
  int glwe_dimension;
  int polynomial_size;
  int input_lwe_ciphertext_count;
  double lwe_modular_variance = 0.000007069849454709433;
  double glwe_modular_variance = 0.00000000000000029403601535432533;
  int pbs_base_log;
  int pbs_level;
  int message_modulus = 4;
  int carry_modulus = 4;
  int payload_modulus;
  uint64_t delta;
  double *d_fourier_bsk_array;
  uint64_t *d_lut_pbs_identity;
  uint64_t *d_lut_pbs_indexes;
  uint64_t *d_lwe_ct_in_array;
  uint64_t *d_lwe_ct_out_array;
  uint64_t *lwe_ct_array;
  uint64_t *lwe_sk_in_array;
  uint64_t *lwe_sk_out_array;
  uint64_t *plaintexts;
  Csprng *csprng;
  cudaStream_t *stream;
  int gpu_index = 0;
  int8_t *amortized_pbs_buffer;
  int8_t *lowlat_pbs_buffer;

public:
  void SetUp(const ::benchmark::State &state) {
    stream = cuda_create_stream(0);

    lwe_dimension = state.range(0);
    glwe_dimension = state.range(1);
    polynomial_size = state.range(2);
    pbs_base_log = state.range(3);
    pbs_level = state.range(4);
    input_lwe_ciphertext_count = state.range(5);

    bootstrap_setup(stream, &csprng, &lwe_sk_in_array, &lwe_sk_out_array,
                    &d_fourier_bsk_array, &plaintexts, &d_lut_pbs_identity,
                    &d_lut_pbs_indexes, &d_lwe_ct_in_array, &d_lwe_ct_out_array,
                    &amortized_pbs_buffer, &lowlat_pbs_buffer, lwe_dimension,
                    glwe_dimension, polynomial_size, lwe_modular_variance,
                    glwe_modular_variance, pbs_base_log, pbs_level,
                    message_modulus, carry_modulus, &payload_modulus, &delta,
                    input_lwe_ciphertext_count, 1, 1, gpu_index);

    // We keep the following for the benchmarks with copies
    lwe_ct_array = (uint64_t *)malloc(
        (lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint64_t));
  }

  void TearDown() {
    bootstrap_teardown(stream, csprng, lwe_sk_in_array, lwe_sk_out_array,
                       d_fourier_bsk_array, plaintexts, d_lut_pbs_identity,
                       d_lut_pbs_indexes, d_lwe_ct_in_array, d_lwe_ct_out_array,
                       amortized_pbs_buffer, lowlat_pbs_buffer, gpu_index);
    free(lwe_ct_array);
  }
};

BENCHMARK_DEFINE_F(BootstrapBenchmark_u64, AmortizedPBS)(benchmark::State &st) {
  void *v_stream = (void *)stream;

  for (auto _ : st) {
    // Execute PBS
    cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
        stream, gpu_index, (void *)d_lwe_ct_out_array,
        (void *)d_lut_pbs_identity, (void *)d_lut_pbs_indexes,
        (void *)d_lwe_ct_in_array, (void *)d_fourier_bsk_array,
        amortized_pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
        pbs_base_log, pbs_level, input_lwe_ciphertext_count,
        input_lwe_ciphertext_count, 0, cuda_get_max_shared_memory(gpu_index));
    cuda_synchronize_stream(v_stream);
  }
}

BENCHMARK_DEFINE_F(BootstrapBenchmark_u64, CopiesPlusAmortizedPBS)
(benchmark::State &st) {
  void *v_stream = (void *)stream;

  for (auto _ : st) {
    cuda_memcpy_async_to_gpu(d_lwe_ct_in_array, lwe_ct_array,
                             (lwe_dimension + 1) * input_lwe_ciphertext_count *
                                 sizeof(uint64_t),
                             stream, gpu_index);

    // Execute PBS
    cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
        stream, gpu_index, (void *)d_lwe_ct_out_array,
        (void *)d_lut_pbs_identity, (void *)d_lut_pbs_indexes,
        (void *)d_lwe_ct_in_array, (void *)d_fourier_bsk_array,
        amortized_pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
        pbs_base_log, pbs_level, input_lwe_ciphertext_count,
        input_lwe_ciphertext_count, 0, cuda_get_max_shared_memory(gpu_index));

    cuda_memcpy_async_to_cpu(lwe_ct_array, d_lwe_ct_out_array,
                             (lwe_dimension + 1) * input_lwe_ciphertext_count *
                                 sizeof(uint64_t),
                             stream, gpu_index);
    cuda_synchronize_stream(v_stream);
  }
}

BENCHMARK_DEFINE_F(BootstrapBenchmark_u64, LowLatencyPBS)
(benchmark::State &st) {
  for (auto _ : st) {
    // Execute PBS
    cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
        stream, gpu_index, (void *)d_lwe_ct_out_array,
        (void *)d_lut_pbs_identity, (void *)d_lut_pbs_indexes,
        (void *)d_lwe_ct_in_array, (void *)d_fourier_bsk_array,
        lowlat_pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
        pbs_base_log, pbs_level, 1, 1, 0,
        cuda_get_max_shared_memory(gpu_index));
    cuda_synchronize_stream(stream);
  }
}

BENCHMARK_DEFINE_F(BootstrapBenchmark_u64, CopiesPlusLowLatencyPBS)
(benchmark::State &st) {
  void *v_stream = (void *)stream;

  for (auto _ : st) {
    cuda_memcpy_async_to_gpu(d_lwe_ct_in_array, lwe_ct_array,
                             (lwe_dimension + 1) * input_lwe_ciphertext_count *
                                 sizeof(uint64_t),
                             stream, gpu_index);
    // Execute PBS
    cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
        stream, gpu_index, (void *)d_lwe_ct_out_array,
        (void *)d_lut_pbs_identity, (void *)d_lut_pbs_indexes,
        (void *)d_lwe_ct_in_array, (void *)d_fourier_bsk_array,
        lowlat_pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
        pbs_base_log, pbs_level, 1, 1, 0,
        cuda_get_max_shared_memory(gpu_index));

    cuda_memcpy_async_to_cpu(lwe_ct_array, d_lwe_ct_out_array,
                             (lwe_dimension + 1) * input_lwe_ciphertext_count *
                                 sizeof(uint64_t),
                             stream, gpu_index);
    cuda_synchronize_stream(v_stream);
  }
}

static void
BootstrapBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
  // Define the parameters to benchmark
  // lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log, pbs_level,
  // input_lwe_ciphertext_count
  std::vector<BootstrapBenchmarkParams> params = {
      (BootstrapBenchmarkParams){567, 5, 256, 15, 1, 1},
      (BootstrapBenchmarkParams){577, 6, 256, 12, 3, 1},
      (BootstrapBenchmarkParams){553, 4, 512, 12, 3, 1},
      (BootstrapBenchmarkParams){769, 2, 1024, 23, 1, 1},
      (BootstrapBenchmarkParams){714, 2, 1024, 15, 2, 1},
      (BootstrapBenchmarkParams){694, 2, 1024, 8, 5, 1},
      (BootstrapBenchmarkParams){881, 1, 8192, 22, 1, 1},
      (BootstrapBenchmarkParams){879, 1, 8192, 11, 3, 1},
  };

  // Add to the list of parameters to benchmark
  for (auto x : params)
    for (int num_samples = 1; num_samples <= 10000; num_samples *= 10) {
      b->Args({x.lwe_dimension, x.glwe_dimension, x.polynomial_size,
               x.pbs_base_log, x.pbs_level, num_samples});
    }
}

BENCHMARK_REGISTER_F(BootstrapBenchmark_u64, AmortizedPBS)
    ->Apply(BootstrapBenchmarkGenerateParams);
BENCHMARK_REGISTER_F(BootstrapBenchmark_u64, LowLatencyPBS)
    ->Apply(BootstrapBenchmarkGenerateParams);

BENCHMARK_REGISTER_F(BootstrapBenchmark_u64, CopiesPlusAmortizedPBS)
    ->Apply(BootstrapBenchmarkGenerateParams);
BENCHMARK_REGISTER_F(BootstrapBenchmark_u64, CopiesPlusLowLatencyPBS)
    ->Apply(BootstrapBenchmarkGenerateParams);

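To benchmark only the PBS kernels registered above and skip the variants that include host/device copies, a Google Benchmark filter over the fixture and case names defined in this file is enough; this is just the `--benchmark_filter` flag from the README applied to these registrations:

```bash
$ benchmark/benchmark_concrete_cuda \
    --benchmark_filter='BootstrapBenchmark_u64/(AmortizedPBS|LowLatencyPBS)/'
```
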
@@ -0,0 +1,120 @@
#include <benchmark/benchmark.h>
#include <cstdint>
#include <setup_and_teardown.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct {
  int lwe_dimension;
  int glwe_dimension;
  int polynomial_size;
  int pbs_base_log;
  int pbs_level;
  int pksk_base_log;
  int pksk_level;
  int cbs_base_log;
  int cbs_level;
  int number_of_inputs;
} CircuitBootstrapBenchmarkParams;

class CircuitBootstrapBenchmark_u64 : public benchmark::Fixture {
protected:
  int lwe_dimension;
  int glwe_dimension;
  int polynomial_size;
  double lwe_modular_variance = 7.52316384526264e-37;
  double glwe_modular_variance = 7.52316384526264e-37;
  int pbs_base_log;
  int pbs_level;
  int pksk_base_log;
  int pksk_level;
  int cbs_base_log;
  int cbs_level;
  int number_of_inputs;
  int number_of_bits_of_message_including_padding;
  int ggsw_size;
  uint64_t delta;
  int delta_log;
  Csprng *csprng;
  cudaStream_t *stream;
  int gpu_index = 0;
  uint64_t *lwe_sk_in;
  uint64_t *lwe_sk_out;
  uint64_t *plaintexts;
  double *d_fourier_bsk;
  uint64_t *d_pksk;
  uint64_t *d_lwe_ct_in_array;
  uint64_t *d_ggsw_ct_out_array;
  uint64_t *d_lut_vector_indexes;
  int8_t *cbs_buffer;

public:
  // Test arithmetic functions
  void SetUp(const ::benchmark::State &state) {
    stream = cuda_create_stream(0);

    // TestParams
    lwe_dimension = state.range(0);
    glwe_dimension = state.range(1);
    polynomial_size = state.range(2);
    pbs_base_log = state.range(3);
    pbs_level = state.range(4);
    pksk_base_log = state.range(5);
    pksk_level = state.range(6);
    cbs_base_log = state.range(7);
    cbs_level = state.range(8);
    number_of_inputs = state.range(9);

    // We generate binary messages
    number_of_bits_of_message_including_padding = 2;
    ggsw_size = cbs_level * (glwe_dimension + 1) * (glwe_dimension + 1) *
                polynomial_size;
    circuit_bootstrap_setup(
        stream, &csprng, &lwe_sk_in, &lwe_sk_out, &d_fourier_bsk, &d_pksk,
        &plaintexts, &d_lwe_ct_in_array, &d_ggsw_ct_out_array,
        &d_lut_vector_indexes, &cbs_buffer, lwe_dimension, glwe_dimension,
        polynomial_size, lwe_modular_variance, glwe_modular_variance,
        pksk_base_log, pksk_level, pbs_base_log, pbs_level, cbs_level,
        number_of_bits_of_message_including_padding, ggsw_size, &delta_log,
        &delta, number_of_inputs, 1, 1, gpu_index);
  }

  void TearDown() {
    circuit_bootstrap_teardown(stream, csprng, lwe_sk_in, lwe_sk_out,
                               d_fourier_bsk, d_pksk, plaintexts,
                               d_lwe_ct_in_array, d_lut_vector_indexes,
                               d_ggsw_ct_out_array, cbs_buffer, gpu_index);
  }
};

BENCHMARK_DEFINE_F(CircuitBootstrapBenchmark_u64, CircuitBootstrap)
(benchmark::State &st) {
  for (auto _ : st) {
    // Execute circuit bootstrap
    cuda_circuit_bootstrap_64(
        stream, gpu_index, (void *)d_ggsw_ct_out_array,
        (void *)d_lwe_ct_in_array, (void *)d_fourier_bsk, (void *)d_pksk,
        (void *)d_lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
        glwe_dimension, lwe_dimension, pbs_level, pbs_base_log, pksk_level,
        pksk_base_log, cbs_level, cbs_base_log, number_of_inputs,
        cuda_get_max_shared_memory(gpu_index));
    cuda_synchronize_stream(stream);
  }
}

static void
CircuitBootstrapBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
  // Define the parameters to benchmark
  std::vector<CircuitBootstrapBenchmarkParams> params = {
      (CircuitBootstrapBenchmarkParams){10, 2, 512, 11, 2, 15, 2, 10, 1, 10} //,
  };

  // Add to the list of parameters to benchmark
  for (auto x : params)
    b->Args({x.lwe_dimension, x.glwe_dimension, x.polynomial_size,
             x.pbs_base_log, x.pbs_level, x.pksk_base_log, x.pksk_level,
             x.cbs_base_log, x.cbs_level, x.number_of_inputs});
}

BENCHMARK_REGISTER_F(CircuitBootstrapBenchmark_u64, CircuitBootstrap)
    ->Apply(CircuitBootstrapBenchmarkGenerateParams);

@@ -0,0 +1,94 @@
#include <benchmark/benchmark.h>
#include <cstdint>
#include <functional>
#include <setup_and_teardown.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct {
  int glwe_dimension;
  int polynomial_size;
  int r;
  int tau;
  int base_log;
  int level_count;
} CMUXTreeBenchmarkParams;

class CMUXTreeBenchmark_u64 : public benchmark::Fixture {
protected:
  int glwe_dimension;
  int polynomial_size;
  int r_lut;
  int tau;
  double glwe_modular_variance = 0.00000000000000029403601535432533;
  int base_log;
  int level_count;
  uint64_t delta;
  int delta_log = 60;
  Csprng *csprng;
  cudaStream_t *stream;
  int gpu_index = 0;
  uint64_t *d_lut_identity;
  uint64_t *d_ggsw_bit_array;
  uint64_t *plaintexts;
  uint64_t *d_glwe_out;
  uint64_t *glwe_sk;
  int8_t *cmux_tree_buffer = nullptr;

public:
  // Test arithmetic functions
  void SetUp(const ::benchmark::State &state) {
    stream = cuda_create_stream(0);

    // TestParams
    glwe_dimension = state.range(0);
    polynomial_size = state.range(1);
    r_lut = state.range(2);
    tau = state.range(3);
    base_log = state.range(4);
    level_count = state.range(5);

    // Value of the shift we multiply our messages by
    delta = ((uint64_t)(1) << delta_log);

    cmux_tree_setup(stream, &csprng, &glwe_sk, &d_lut_identity, &plaintexts,
                    &d_ggsw_bit_array, &cmux_tree_buffer, &d_glwe_out,
                    glwe_dimension, polynomial_size, base_log, level_count,
                    glwe_modular_variance, r_lut, tau, delta_log, 1, 1,
                    gpu_index);
  }

  void TearDown() {
    cmux_tree_teardown(stream, &csprng, &glwe_sk, &d_lut_identity, &plaintexts,
                       &d_ggsw_bit_array, &cmux_tree_buffer, &d_glwe_out,
                       gpu_index);
  }
};

BENCHMARK_DEFINE_F(CMUXTreeBenchmark_u64, CMUXTree)(benchmark::State &st) {
  for (auto _ : st) {
    // Execute scratch/CMUX tree/cleanup
    cuda_cmux_tree_64(stream, gpu_index, (void *)d_glwe_out,
                      (void *)d_ggsw_bit_array, (void *)d_lut_identity,
                      cmux_tree_buffer, glwe_dimension, polynomial_size,
                      base_log, level_count, r_lut, tau,
                      cuda_get_max_shared_memory(gpu_index));
    cuda_synchronize_stream(stream);
  }
}

// k, N, r, tau, base_log, level_count
static void CMUXTreeBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
  // Define the parameters to benchmark
  std::vector<CMUXTreeBenchmarkParams> params = {
      (CMUXTreeBenchmarkParams){2, 256, 10, 6, 6, 3},
  };

  // Add to the list of parameters to benchmark
  for (auto x : params)
    b->Args({x.glwe_dimension, x.polynomial_size, x.r, x.tau, x.base_log,
             x.level_count});
}

BENCHMARK_REGISTER_F(CMUXTreeBenchmark_u64, CMUXTree)
    ->Apply(CMUXTreeBenchmarkGenerateParams);

@@ -0,0 +1,117 @@
#include <benchmark/benchmark.h>
#include <cstdint>
#include <setup_and_teardown.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct {
  int input_lwe_dimension;
  int output_lwe_dimension;
  int ksk_base_log;
  int ksk_level;
  int number_of_inputs;
} KeyswitchBenchmarkParams;

class KeyswitchBenchmark_u64 : public benchmark::Fixture {
protected:
  int input_lwe_dimension;
  int output_lwe_dimension;
  double noise_variance = 2.9802322387695312e-08;
  int ksk_base_log;
  int ksk_level;
  int message_modulus = 4;
  int carry_modulus = 4;
  int payload_modulus;
  int number_of_inputs;
  uint64_t delta;
  Csprng *csprng;
  cudaStream_t *stream;
  int gpu_index = 0;
  uint64_t *plaintexts;
  uint64_t *d_ksk_array;
  uint64_t *d_lwe_out_ct_array;
  uint64_t *d_lwe_in_ct_array;
  uint64_t *lwe_sk_in_array;
  uint64_t *lwe_sk_out_array;

public:
  // Test arithmetic functions
  void SetUp(const ::benchmark::State &state) {
    stream = cuda_create_stream(0);

    // TestParams
    input_lwe_dimension = state.range(0);
    output_lwe_dimension = state.range(1);
    ksk_base_log = state.range(2);
    ksk_level = state.range(3);
    number_of_inputs = state.range(4);

    keyswitch_setup(stream, &csprng, &lwe_sk_in_array, &lwe_sk_out_array,
                    &d_ksk_array, &plaintexts, &d_lwe_in_ct_array,
                    &d_lwe_out_ct_array, input_lwe_dimension,
                    output_lwe_dimension, noise_variance, ksk_base_log,
                    ksk_level, message_modulus, carry_modulus, &payload_modulus,
                    &delta, number_of_inputs, 1, 1, gpu_index);
  }

  void TearDown() {
    keyswitch_teardown(stream, csprng, lwe_sk_in_array, lwe_sk_out_array,
                       d_ksk_array, plaintexts, d_lwe_in_ct_array,
                       d_lwe_out_ct_array, gpu_index);
  }
};

BENCHMARK_DEFINE_F(KeyswitchBenchmark_u64, Keyswitch)(benchmark::State &st) {
  for (auto _ : st) {
    // Execute keyswitch
    cuda_keyswitch_lwe_ciphertext_vector_64(
        stream, gpu_index, (void *)d_lwe_out_ct_array,
        (void *)d_lwe_in_ct_array, (void *)d_ksk_array, input_lwe_dimension,
        output_lwe_dimension, ksk_base_log, ksk_level, number_of_inputs);
    cuda_synchronize_stream(stream);
  }
}

BENCHMARK_DEFINE_F(KeyswitchBenchmark_u64, CopiesPlusKeyswitch)
(benchmark::State &st) {
  uint64_t *lwe_in_ct = (uint64_t *)malloc(
      number_of_inputs * (input_lwe_dimension + 1) * sizeof(uint64_t));
  void *v_stream = (void *)stream;
  for (auto _ : st) {
    cuda_memcpy_async_to_gpu(d_lwe_in_ct_array, lwe_in_ct,
                             number_of_inputs * (input_lwe_dimension + 1) *
                                 sizeof(uint64_t),
                             stream, gpu_index);
    // Execute keyswitch
    cuda_keyswitch_lwe_ciphertext_vector_64(
        stream, gpu_index, (void *)d_lwe_out_ct_array,
        (void *)d_lwe_in_ct_array, (void *)d_ksk_array, input_lwe_dimension,
        output_lwe_dimension, ksk_base_log, ksk_level, number_of_inputs);
    cuda_memcpy_async_to_cpu(lwe_in_ct, d_lwe_out_ct_array,
                             number_of_inputs * (output_lwe_dimension + 1) *
                                 sizeof(uint64_t),
                             stream, gpu_index);
    cuda_synchronize_stream(v_stream);
  }
  free(lwe_in_ct);
}

static void
KeyswitchBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
  // Define the parameters to benchmark
  // na, nb, base_log, level, number_of_inputs
  std::vector<KeyswitchBenchmarkParams> params = {
      (KeyswitchBenchmarkParams){600, 1024, 3, 8, 10},
  };

  // Add to the list of parameters to benchmark
  for (auto x : params)
    b->Args({x.input_lwe_dimension, x.output_lwe_dimension, x.ksk_base_log,
             x.ksk_level, x.number_of_inputs});
}

BENCHMARK_REGISTER_F(KeyswitchBenchmark_u64, Keyswitch)
    ->Apply(KeyswitchBenchmarkGenerateParams);

BENCHMARK_REGISTER_F(KeyswitchBenchmark_u64, CopiesPlusKeyswitch)
    ->Apply(KeyswitchBenchmarkGenerateParams);

@@ -0,0 +1,227 @@
#include <benchmark/benchmark.h>
#include <cstdint>
#include <setup_and_teardown.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct {
  int lwe_dimension;
  int input_lwe_ciphertext_count;
} LinearAlgebraBenchmarkParams;

class LinearAlgebraBenchmark_u64 : public benchmark::Fixture {
protected:
  int lwe_dimension;
  double noise_variance = 2.9802322387695312e-08;
  int ksk_base_log;
  int ksk_level;
  int message_modulus = 4;
  int carry_modulus = 4;
  int num_samples;
  uint64_t delta;
  Csprng *csprng;
  cudaStream_t *stream;
  int gpu_index = 0;
  uint64_t *d_lwe_in_1_ct;
  uint64_t *d_lwe_in_2_ct;
  uint64_t *d_lwe_out_ct;
  uint64_t *plaintexts_1;
  uint64_t *plaintexts_2;
  uint64_t *d_plaintext_2;
  uint64_t *d_cleartext;
  uint64_t *lwe_in_1_ct;
  uint64_t *lwe_in_2_ct;
  uint64_t *lwe_out_ct;
  uint64_t *lwe_sk_array;

public:
  // Test arithmetic functions
  void SetUp(const ::benchmark::State &state) {
    stream = cuda_create_stream(0);

    // TestParams
    lwe_dimension = state.range(0);
    num_samples = state.range(1);

    int payload_modulus = message_modulus * carry_modulus;
    // Value of the shift we multiply our messages by
    delta = ((uint64_t)(1) << 63) / (uint64_t)(payload_modulus);

    linear_algebra_setup(
        stream, &csprng, &lwe_sk_array, &d_lwe_in_1_ct, &d_lwe_in_2_ct,
        &d_lwe_out_ct, &lwe_in_1_ct, &lwe_in_2_ct, &lwe_out_ct, &plaintexts_1,
        &plaintexts_2, &d_plaintext_2, &d_cleartext, lwe_dimension,
        noise_variance, payload_modulus, delta, num_samples, 1, 1, gpu_index);
  }

  void TearDown() {
    linear_algebra_teardown(
        stream, &csprng, &lwe_sk_array, &d_lwe_in_1_ct, &d_lwe_in_2_ct,
        &d_lwe_out_ct, &lwe_in_1_ct, &lwe_in_2_ct, &lwe_out_ct, &plaintexts_1,
        &plaintexts_2, &d_plaintext_2, &d_cleartext, gpu_index);
  }
};

BENCHMARK_DEFINE_F(LinearAlgebraBenchmark_u64, Addition)(benchmark::State &st) {
  // Execute addition
  for (auto _ : st) {
    cuda_add_lwe_ciphertext_vector_64(
        stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_in_1_ct,
        (void *)d_lwe_in_2_ct, lwe_dimension, num_samples);
    cuda_synchronize_stream(stream);
  }
}

BENCHMARK_DEFINE_F(LinearAlgebraBenchmark_u64, CopiesPlusAddition)
(benchmark::State &st) {
  // Execute addition
  for (auto _ : st) {

    cuda_memcpy_async_to_gpu(d_lwe_in_1_ct, lwe_in_1_ct,
                             num_samples * (lwe_dimension + 1) *
                                 sizeof(uint64_t),
                             stream, gpu_index);
    cuda_memcpy_async_to_gpu(d_lwe_in_2_ct, lwe_in_2_ct,
                             num_samples * (lwe_dimension + 1) *
                                 sizeof(uint64_t),
                             stream, gpu_index);
    cuda_add_lwe_ciphertext_vector_64(
        stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_in_1_ct,
        (void *)d_lwe_in_2_ct, lwe_dimension, num_samples);

    cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct,
                             num_samples * (lwe_dimension + 1) *
                                 sizeof(uint64_t),
                             stream, gpu_index);
    cuda_synchronize_stream(stream);
  }
}

BENCHMARK_DEFINE_F(LinearAlgebraBenchmark_u64, PlaintextAddition)
(benchmark::State &st) {
  for (auto _ : st) {
    // Execute plaintext addition
    cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
        stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_in_1_ct,
        (void *)d_plaintext_2, lwe_dimension, num_samples);
    cuda_synchronize_stream(stream);
  }
}

BENCHMARK_DEFINE_F(LinearAlgebraBenchmark_u64, CopiesPlusPlaintextAddition)
(benchmark::State &st) {
  for (auto _ : st) {

    cuda_memcpy_async_to_gpu(d_lwe_in_1_ct, lwe_in_1_ct,
                             num_samples * (lwe_dimension + 1) *
                                 sizeof(uint64_t),
                             stream, gpu_index);
    cuda_memcpy_async_to_gpu(d_plaintext_2, plaintexts_2,
                             num_samples * sizeof(uint64_t), stream, gpu_index);
    // Execute plaintext addition
    cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
        stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_in_1_ct,
        (void *)d_plaintext_2, lwe_dimension, num_samples);

    cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct,
                             num_samples * (lwe_dimension + 1) *
                                 sizeof(uint64_t),
                             stream, gpu_index);
    cuda_synchronize_stream(stream);
  }
}

BENCHMARK_DEFINE_F(LinearAlgebraBenchmark_u64, PlaintextMultiplication)
(benchmark::State &st) {
  for (auto _ : st) {
    // Execute cleartext multiplication
    cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
        stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_in_1_ct,
        (void *)d_cleartext, lwe_dimension, num_samples);
    cuda_synchronize_stream(stream);
  }
}

BENCHMARK_DEFINE_F(LinearAlgebraBenchmark_u64,
                   CopiesPlusPlaintextMultiplication)
(benchmark::State &st) {
  for (auto _ : st) {
    cuda_memcpy_async_to_gpu(d_lwe_in_1_ct, lwe_in_1_ct,
                             num_samples * (lwe_dimension + 1) *
                                 sizeof(uint64_t),
                             stream, gpu_index);
    cuda_memcpy_async_to_gpu(d_cleartext, plaintexts_2,
                             num_samples * sizeof(uint64_t), stream, gpu_index);
    // Execute cleartext multiplication
    cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
        stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_in_1_ct,
        (void *)d_cleartext, lwe_dimension, num_samples);

    cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct,
                             num_samples * (lwe_dimension + 1) *
                                 sizeof(uint64_t),
                             stream, gpu_index);
    cuda_synchronize_stream(stream);
  }
}

BENCHMARK_DEFINE_F(LinearAlgebraBenchmark_u64, Negate)(benchmark::State &st) {
  for (auto _ : st) {
    // Execute negation
    cuda_negate_lwe_ciphertext_vector_64(
        stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_in_1_ct,
        lwe_dimension, num_samples);
    cuda_synchronize_stream(stream);
  }
}

BENCHMARK_DEFINE_F(LinearAlgebraBenchmark_u64, CopiesPlusNegate)
(benchmark::State &st) {
  for (auto _ : st) {
    cuda_memcpy_async_to_gpu(d_lwe_in_1_ct, lwe_in_1_ct,
                             num_samples * (lwe_dimension + 1) *
                                 sizeof(uint64_t),
                             stream, gpu_index);
    // Execute negation
    cuda_negate_lwe_ciphertext_vector_64(
        stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_in_1_ct,
        lwe_dimension, num_samples);

    cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct,
                             num_samples * (lwe_dimension + 1) *
                                 sizeof(uint64_t),
                             stream, gpu_index);
    cuda_synchronize_stream(stream);
  }
}

static void
LinearAlgebraBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
  // Define the parameters to benchmark
  // n, input_lwe_ciphertext_count
  std::vector<LinearAlgebraBenchmarkParams> params = {
      (LinearAlgebraBenchmarkParams){600, 10},
  };

  // Add to the list of parameters to benchmark
  for (auto x : params)
    b->Args({x.lwe_dimension, x.input_lwe_ciphertext_count});
}

BENCHMARK_REGISTER_F(LinearAlgebraBenchmark_u64, Addition)
    ->Apply(LinearAlgebraBenchmarkGenerateParams);
BENCHMARK_REGISTER_F(LinearAlgebraBenchmark_u64, CopiesPlusAddition)
    ->Apply(LinearAlgebraBenchmarkGenerateParams);
BENCHMARK_REGISTER_F(LinearAlgebraBenchmark_u64, PlaintextAddition)
    ->Apply(LinearAlgebraBenchmarkGenerateParams);
BENCHMARK_REGISTER_F(LinearAlgebraBenchmark_u64, CopiesPlusPlaintextAddition)
    ->Apply(LinearAlgebraBenchmarkGenerateParams);
BENCHMARK_REGISTER_F(LinearAlgebraBenchmark_u64, PlaintextMultiplication)
    ->Apply(LinearAlgebraBenchmarkGenerateParams);
BENCHMARK_REGISTER_F(LinearAlgebraBenchmark_u64,
                     CopiesPlusPlaintextMultiplication)
    ->Apply(LinearAlgebraBenchmarkGenerateParams);
BENCHMARK_REGISTER_F(LinearAlgebraBenchmark_u64, Negate)
    ->Apply(LinearAlgebraBenchmarkGenerateParams);
BENCHMARK_REGISTER_F(LinearAlgebraBenchmark_u64, CopiesPlusNegate)
    ->Apply(LinearAlgebraBenchmarkGenerateParams);
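Note (editor's addition, not part of the diff): every operation above comes in two flavours, a kernel-only benchmark and a "CopiesPlus" variant that also times the host-to-device and device-to-host transfers, so the cost of data movement can be read off by comparing the two. An alternative, shown only as a sketch under the assumption that Google Benchmark's PauseTiming/ResumeTiming controls are acceptable here, is to exclude the copies from a single benchmark; the std::vector copy below stands in for the cuda_memcpy_async_* calls.

#include <benchmark/benchmark.h>
#include <cstdint>
#include <vector>

// Sketch: exclude data movement from the measured region instead of
// maintaining a separate "CopiesPlus" benchmark.
static void BM_ComputeOnly(benchmark::State &st) {
  std::vector<uint64_t> src(1 << 16, 1), dst(1 << 16, 0);
  for (auto _ : st) {
    st.PauseTiming();
    dst = src; // "copy in" (untimed)
    st.ResumeTiming();
    uint64_t acc = 0;
    for (uint64_t v : dst)
      acc += v; // "kernel" (timed)
    benchmark::DoNotOptimize(acc);
  }
}
BENCHMARK(BM_ComputeOnly);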
@@ -0,0 +1,169 @@
#include <benchmark/benchmark.h>
#include <cstdint>
#include <setup_and_teardown.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct {
  int lwe_dimension;
  int glwe_dimension;
  int polynomial_size;
  int pbs_base_log;
  int pbs_level;
  int ks_base_log;
  int ks_level;
  int pksk_base_log;
  int pksk_level;
  int cbs_base_log;
  int cbs_level;
  int tau;
} WopPBSBenchmarkParams;

class WopPBSBenchmark_u64 : public benchmark::Fixture {
protected:
  int lwe_dimension;
  int glwe_dimension;
  int polynomial_size;
  double lwe_modular_variance = 7.52316384526264e-37;
  double glwe_modular_variance = 7.52316384526264e-37;
  int pbs_base_log;
  int pbs_level;
  int ks_base_log;
  int ks_level;
  int pksk_base_log;
  int pksk_level;
  int cbs_base_log;
  int cbs_level;
  int tau;
  int p;
  int input_lwe_dimension;
  uint64_t delta;
  int cbs_delta_log;
  int delta_log;
  int delta_log_lut;
  Csprng *csprng;
  cudaStream_t *stream;
  int gpu_index = 0;
  uint64_t *plaintexts;
  double *d_fourier_bsk;
  uint64_t *lwe_sk_in;
  uint64_t *lwe_sk_out;
  uint64_t *d_ksk;
  uint64_t *d_pksk;
  uint64_t *d_lwe_ct_in_array;
  uint64_t *d_lwe_ct_out_array;
  uint64_t *d_lut_vector;
  int8_t *wop_pbs_buffer;
  uint64_t *lwe_ct_in_array;
  uint64_t *lwe_ct_out_array;

public:
  // Test arithmetic functions
  void SetUp(const ::benchmark::State &state) {
    stream = cuda_create_stream(0);

    // TestParams
    lwe_dimension = state.range(0);
    glwe_dimension = state.range(1);
    polynomial_size = state.range(2);
    pbs_base_log = state.range(3);
    pbs_level = state.range(4);
    ks_base_log = state.range(5);
    ks_level = state.range(6);
    pksk_base_log = state.range(7);
    pksk_level = state.range(8);
    cbs_base_log = state.range(9);
    cbs_level = state.range(10);
    tau = state.range(11);
    p = 10 / tau;
    wop_pbs_setup(stream, &csprng, &lwe_sk_in, &lwe_sk_out, &d_ksk,
                  &d_fourier_bsk, &d_pksk, &plaintexts, &d_lwe_ct_in_array,
                  &d_lwe_ct_out_array, &d_lut_vector, &wop_pbs_buffer,
                  lwe_dimension, glwe_dimension, polynomial_size,
                  lwe_modular_variance, glwe_modular_variance, ks_base_log,
                  ks_level, pksk_base_log, pksk_level, pbs_base_log, pbs_level,
                  cbs_level, p, &delta_log, &cbs_delta_log, &delta_log_lut,
                  &delta, tau, 1, 1, gpu_index);

    // We keep the following for the benchmarks with copies
    lwe_ct_in_array = (uint64_t *)malloc(
        (glwe_dimension * polynomial_size + 1) * tau * sizeof(uint64_t));
    for (int i = 0; i < tau; i++) {
      uint64_t plaintext = plaintexts[i];
      uint64_t *lwe_ct_in =
          lwe_ct_in_array +
          (ptrdiff_t)(i * (glwe_dimension * polynomial_size + 1));
      concrete_cpu_encrypt_lwe_ciphertext_u64(
          lwe_sk_in, lwe_ct_in, plaintext, glwe_dimension * polynomial_size,
          lwe_modular_variance, csprng, &CONCRETE_CSPRNG_VTABLE);
    }
    lwe_ct_out_array = (uint64_t *)malloc(
        (glwe_dimension * polynomial_size + 1) * tau * sizeof(uint64_t));
  }

  void TearDown() {
    wop_pbs_teardown(stream, csprng, lwe_sk_in, lwe_sk_out, d_ksk,
                     d_fourier_bsk, d_pksk, plaintexts, d_lwe_ct_in_array,
                     d_lut_vector, d_lwe_ct_out_array, wop_pbs_buffer,
                     gpu_index);
    free(lwe_ct_in_array);
    free(lwe_ct_out_array);
  }
};

BENCHMARK_DEFINE_F(WopPBSBenchmark_u64, WopPBS)(benchmark::State &st) {
  for (auto _ : st) {
    // Execute wop pbs
    cuda_wop_pbs_64(
        stream, gpu_index, (void *)d_lwe_ct_out_array,
        (void *)d_lwe_ct_in_array, (void *)d_lut_vector, (void *)d_fourier_bsk,
        (void *)d_ksk, (void *)d_pksk, wop_pbs_buffer, cbs_delta_log,
        glwe_dimension, lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
        ks_base_log, ks_level, pksk_base_log, pksk_level, cbs_base_log,
        cbs_level, p, p, delta_log, tau, cuda_get_max_shared_memory(gpu_index));
    cuda_synchronize_stream(stream);
  }
}

BENCHMARK_DEFINE_F(WopPBSBenchmark_u64, CopiesPlusWopPBS)
(benchmark::State &st) {
  for (auto _ : st) {
    cuda_memcpy_async_to_gpu(d_lwe_ct_in_array, lwe_ct_in_array,
                             (input_lwe_dimension + 1) * tau * sizeof(uint64_t),
                             stream, gpu_index);
    // Execute wop pbs
    cuda_wop_pbs_64(
        stream, gpu_index, (void *)d_lwe_ct_out_array,
        (void *)d_lwe_ct_in_array, (void *)d_lut_vector, (void *)d_fourier_bsk,
        (void *)d_ksk, (void *)d_pksk, wop_pbs_buffer, cbs_delta_log,
        glwe_dimension, lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
        ks_base_log, ks_level, pksk_base_log, pksk_level, cbs_base_log,
        cbs_level, p, p, delta_log, tau, cuda_get_max_shared_memory(gpu_index));

    cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array,
                             (input_lwe_dimension + 1) * tau * sizeof(uint64_t),
                             stream, gpu_index);
    cuda_synchronize_stream(stream);
  }
}

static void WopPBSBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
  // Define the parameters to benchmark
  // n, k, N, pbs_base_log, pbs_level, ks_base_log, ks_level,
  // pksk_base_log, pksk_level, cbs_base_log, cbs_level, tau
  std::vector<WopPBSBenchmarkParams> params = {
      (WopPBSBenchmarkParams){481, 2, 512, 4, 9, 1, 9, 4, 9, 6, 4, 1} //,
  };

  // Add to the list of parameters to benchmark
  for (auto x : params)
    b->Args({x.lwe_dimension, x.glwe_dimension, x.polynomial_size,
             x.pbs_base_log, x.pbs_level, x.ks_base_log, x.ks_level,
             x.pksk_base_log, x.pksk_level, x.cbs_base_log, x.cbs_level,
             x.tau});
}

BENCHMARK_REGISTER_F(WopPBSBenchmark_u64, WopPBS)
    ->Apply(WopPBSBenchmarkGenerateParams);
BENCHMARK_REGISTER_F(WopPBSBenchmark_u64, CopiesPlusWopPBS)
    ->Apply(WopPBSBenchmarkGenerateParams);
@@ -0,0 +1,3 @@
#include <benchmark/benchmark.h>

BENCHMARK_MAIN();
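Note (editor's addition, not part of the diff): BENCHMARK_MAIN() is the Google Benchmark macro that generates the program entry point, so every fixture registered above ends up in one binary and can be selected at run time with the standard --benchmark_filter flag. Roughly, and only as a sketch rather than the exact macro expansion, it corresponds to:

#include <benchmark/benchmark.h>

int main(int argc, char **argv) {
  ::benchmark::Initialize(&argc, argv); // parses --benchmark_* flags
  ::benchmark::RunSpecifiedBenchmarks();
  return 0;
}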
@@ -0,0 +1,146 @@
#ifndef SETUP_AND_TEARDOWN_H
#define SETUP_AND_TEARDOWN_H

#include <bit_extraction.h>
#include <bootstrap.h>
#include <circuit_bootstrap.h>
#include <concrete-cpu.h>
#include <device.h>
#include <keyswitch.h>
#include <linear_algebra.h>
#include <utils.h>
#include <vertical_packing.h>

void bootstrap_setup(cudaStream_t *stream, Csprng **csprng,
                     uint64_t **lwe_sk_in_array, uint64_t **lwe_sk_out_array,
                     double **d_fourier_bsk_array, uint64_t **plaintexts,
                     uint64_t **d_lut_pbs_identity,
                     uint64_t **d_lut_pbs_indexes, uint64_t **d_lwe_ct_in_array,
                     uint64_t **d_lwe_ct_out_array,
                     int8_t **amortized_pbs_buffer, int8_t **lowlat_pbs_buffer,
                     int lwe_dimension, int glwe_dimension, int polynomial_size,
                     double lwe_modular_variance, double glwe_modular_variance,
                     int pbs_base_log, int pbs_level, int message_modulus,
                     int carry_modulus, int *payload_modulus, uint64_t *delta,
                     int number_of_inputs, int repetitions, int samples,
                     int gpu_index);
void bootstrap_teardown(cudaStream_t *stream, Csprng *csprng,
                        uint64_t *lwe_sk_in_array, uint64_t *lwe_sk_out_array,
                        double *d_fourier_bsk_array, uint64_t *plaintexts,
                        uint64_t *d_lut_pbs_identity,
                        uint64_t *d_lut_pbs_indexes,
                        uint64_t *d_lwe_ct_in_array,
                        uint64_t *d_lwe_ct_out_array,
                        int8_t *amortized_pbs_buffer, int8_t *lowlat_pbs_buffer,
                        int gpu_index);

void keyswitch_setup(cudaStream_t *stream, Csprng **csprng,
                     uint64_t **lwe_sk_in_array, uint64_t **lwe_sk_out_array,
                     uint64_t **d_ksk_array, uint64_t **plaintexts,
                     uint64_t **d_lwe_ct_in_array,
                     uint64_t **d_lwe_ct_out_array, int input_lwe_dimension,
                     int output_lwe_dimension, double lwe_modular_variance,
                     int ksk_base_log, int ksk_level, int message_modulus,
                     int carry_modulus, int *payload_modulus, uint64_t *delta,
                     int number_of_inputs, int repetitions, int samples,
                     int gpu_index);
void keyswitch_teardown(cudaStream_t *stream, Csprng *csprng,
                        uint64_t *lwe_sk_in_array, uint64_t *lwe_sk_out_array,
                        uint64_t *d_ksk_array, uint64_t *plaintexts,
                        uint64_t *d_lwe_ct_in_array,
                        uint64_t *d_lwe_ct_out_array, int gpu_index);

void bit_extraction_setup(
    cudaStream_t *stream, Csprng **csprng, uint64_t **lwe_sk_in_array,
    uint64_t **lwe_sk_out_array, double **d_fourier_bsk_array,
    uint64_t **d_ksk_array, uint64_t **plaintexts, uint64_t **d_lwe_ct_in_array,
    uint64_t **d_lwe_ct_out_array, int8_t **bit_extract_buffer,
    int lwe_dimension, int glwe_dimension, int polynomial_size,
    double lwe_modular_variance, double glwe_modular_variance, int ks_base_log,
    int ks_level, int pbs_base_log, int pbs_level,
    int number_of_bits_of_message_including_padding,
    int number_of_bits_to_extract, int *delta_log, uint64_t *delta,
    int number_of_inputs, int repetitions, int samples, int gpu_index);

void bit_extraction_teardown(cudaStream_t *stream, Csprng *csprng,
                             uint64_t *lwe_sk_in_array,
                             uint64_t *lwe_sk_out_array,
                             double *d_fourier_bsk_array, uint64_t *d_ksk_array,
                             uint64_t *plaintexts, uint64_t *d_lwe_ct_in_array,
                             uint64_t *d_lwe_ct_out_array,
                             int8_t *bit_extract_buffer, int gpu_index);

void circuit_bootstrap_setup(
    cudaStream_t *stream, Csprng **csprng, uint64_t **lwe_sk_in_array,
    uint64_t **lwe_sk_out_array, double **d_fourier_bsk_array,
    uint64_t **d_pksk_array, uint64_t **plaintexts,
    uint64_t **d_lwe_ct_in_array, uint64_t **d_ggsw_ct_out_array,
    uint64_t **d_lut_vector_indexes, int8_t **cbs_buffer, int lwe_dimension,
    int glwe_dimension, int polynomial_size, double lwe_modular_variance,
    double glwe_modular_variance, int pksk_base_log, int pksk_level,
    int pbs_base_log, int pbs_level, int cbs_level,
    int number_of_bits_of_message_including_padding, int ggsw_size,
    int *delta_log, uint64_t *delta, int number_of_inputs, int repetitions,
    int samples, int gpu_index);

void circuit_bootstrap_teardown(
    cudaStream_t *stream, Csprng *csprng, uint64_t *lwe_sk_in_array,
    uint64_t *lwe_sk_out_array, double *d_fourier_bsk_array,
    uint64_t *d_pksk_array, uint64_t *plaintexts, uint64_t *d_lwe_ct_in_array,
    uint64_t *d_lut_vector_indexes, uint64_t *d_ggsw_ct_out_array,
    int8_t *cbs_buffer, int gpu_index);

void cmux_tree_setup(cudaStream_t *stream, Csprng **csprng, uint64_t **glwe_sk,
                     uint64_t **d_lut_identity, uint64_t **plaintexts,
                     uint64_t **d_ggsw_bit_array, int8_t **cmux_tree_buffer,
                     uint64_t **d_glwe_out, int glwe_dimension,
                     int polynomial_size, int base_log, int level_count,
                     double glwe_modular_variance, int r_lut, int tau,
                     uint64_t delta_log, int repetitions, int samples,
                     int gpu_index);
void cmux_tree_teardown(cudaStream_t *stream, Csprng **csprng,
                        uint64_t **glwe_sk, uint64_t **d_lut_identity,
                        uint64_t **plaintexts, uint64_t **d_ggsw_bit_array,
                        int8_t **cmux_tree_buffer, uint64_t **d_glwe_out,
                        int gpu_index);
void wop_pbs_setup(cudaStream_t *stream, Csprng **csprng,
                   uint64_t **lwe_sk_in_array, uint64_t **lwe_sk_out_array,
                   uint64_t **d_ksk_array, double **d_fourier_bsk_array,
                   uint64_t **d_pksk_array, uint64_t **plaintexts,
                   uint64_t **d_lwe_ct_in_array, uint64_t **d_lwe_ct_out_array,
                   uint64_t **d_lut_vector, int8_t **wop_pbs_buffer,
                   int lwe_dimension, int glwe_dimension, int polynomial_size,
                   double lwe_modular_variance, double glwe_modular_variance,
                   int ks_base_log, int ks_level, int pksk_base_log,
                   int pksk_level, int pbs_base_log, int pbs_level,
                   int cbs_level, int p, int *delta_log, int *cbs_delta_log,
                   int *delta_log_lut, uint64_t *delta, int tau,
                   int repetitions, int samples, int gpu_index);

void wop_pbs_teardown(cudaStream_t *stream, Csprng *csprng,
                      uint64_t *lwe_sk_in_array, uint64_t *lwe_sk_out_array,
                      uint64_t *d_ksk_array, double *d_fourier_bsk_array,
                      uint64_t *d_pksk_array, uint64_t *plaintexts,
                      uint64_t *d_lwe_ct_in_array, uint64_t *d_lut_vector,
                      uint64_t *d_lwe_ct_out_array, int8_t *wop_pbs_buffer,
                      int gpu_index);

void linear_algebra_setup(cudaStream_t *stream, Csprng **csprng,
                          uint64_t **lwe_sk_array, uint64_t **d_lwe_in_1_ct,
                          uint64_t **d_lwe_in_2_ct, uint64_t **d_lwe_out_ct,
                          uint64_t **lwe_in_1_ct, uint64_t **lwe_in_2_ct,
                          uint64_t **lwe_out_ct, uint64_t **plaintexts_1,
                          uint64_t **plaintexts_2, uint64_t **d_plaintext_2,
                          uint64_t **d_plaintexts_2_mul, int lwe_dimension,
                          double noise_variance, int payload_modulus,
                          uint64_t delta, int number_of_inputs, int repetitions,
                          int samples, int gpu_index);

void linear_algebra_teardown(cudaStream_t *stream, Csprng **csprng,
                             uint64_t **lwe_sk_array, uint64_t **d_lwe_in_1_ct,
                             uint64_t **d_lwe_in_2_ct, uint64_t **d_lwe_out_ct,
                             uint64_t **lwe_in_1_ct, uint64_t **lwe_in_2_ct,
                             uint64_t **lwe_out_ct, uint64_t **plaintexts_1,
                             uint64_t **plaintexts_2, uint64_t **d_plaintext_2,
                             uint64_t **d_plaintexts_2_mul, int gpu_index);
#endif // SETUP_AND_TEARDOWN_H
@@ -1,8 +1,8 @@
#ifndef TEST_UTILS_H
#define TEST_UTILS_H
#ifndef UTILS_H
#define UTILS_H

#include "../include/device.h"
#include "concrete-cpu.h"
#include <concrete-cpu.h>
#include <device.h>
#include <functional>

uint64_t *generate_plaintexts(uint64_t payload_modulus, uint64_t delta,
@@ -0,0 +1,808 @@
|
||||
#include <cmath>
|
||||
#include <setup_and_teardown.h>
|
||||
|
||||
void bootstrap_setup(cudaStream_t *stream, Csprng **csprng,
|
||||
uint64_t **lwe_sk_in_array, uint64_t **lwe_sk_out_array,
|
||||
double **d_fourier_bsk_array, uint64_t **plaintexts,
|
||||
uint64_t **d_lut_pbs_identity,
|
||||
uint64_t **d_lut_pbs_indexes, uint64_t **d_lwe_ct_in_array,
|
||||
uint64_t **d_lwe_ct_out_array,
|
||||
int8_t **amortized_pbs_buffer, int8_t **lowlat_pbs_buffer,
|
||||
int lwe_dimension, int glwe_dimension, int polynomial_size,
|
||||
double lwe_modular_variance, double glwe_modular_variance,
|
||||
int pbs_base_log, int pbs_level, int message_modulus,
|
||||
int carry_modulus, int *payload_modulus, uint64_t *delta,
|
||||
int number_of_inputs, int repetitions, int samples,
|
||||
int gpu_index) {
|
||||
|
||||
void *v_stream = (void *)stream;
|
||||
*payload_modulus = message_modulus * carry_modulus;
|
||||
// Value of the shift we multiply our messages by
|
||||
*delta = ((uint64_t)(1) << 63) / (uint64_t)(*payload_modulus);
|
||||
|
||||
// Create a Csprng
|
||||
*csprng =
|
||||
(Csprng *)aligned_alloc(CONCRETE_CSPRNG_ALIGN, CONCRETE_CSPRNG_SIZE);
|
||||
uint8_t seed[16] = {(uint8_t)0};
|
||||
concrete_cpu_construct_concrete_csprng(
|
||||
*csprng, Uint128{.little_endian_bytes = {*seed}});
|
||||
|
||||
// Generate the keys
|
||||
generate_lwe_secret_keys(lwe_sk_in_array, lwe_dimension, *csprng,
|
||||
repetitions);
|
||||
generate_lwe_secret_keys(lwe_sk_out_array, glwe_dimension * polynomial_size,
|
||||
*csprng, repetitions);
|
||||
generate_lwe_bootstrap_keys(
|
||||
stream, gpu_index, d_fourier_bsk_array, *lwe_sk_in_array,
|
||||
*lwe_sk_out_array, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
pbs_level, pbs_base_log, *csprng, glwe_modular_variance, repetitions);
|
||||
*plaintexts = generate_plaintexts(*payload_modulus, *delta, number_of_inputs,
|
||||
repetitions, samples);
|
||||
|
||||
// Create the LUT
|
||||
uint64_t *lut_pbs_identity = generate_identity_lut_pbs(
|
||||
polynomial_size, glwe_dimension, message_modulus, carry_modulus,
|
||||
[](int x) -> int { return x; });
|
||||
uint64_t *lwe_ct_in_array =
|
||||
(uint64_t *)malloc((lwe_dimension + 1) * number_of_inputs * repetitions *
|
||||
samples * sizeof(uint64_t));
|
||||
// Create the input/output ciphertexts
|
||||
for (int r = 0; r < repetitions; r++) {
|
||||
uint64_t *lwe_sk_in = *lwe_sk_in_array + (ptrdiff_t)(r * lwe_dimension);
|
||||
for (int s = 0; s < samples; s++) {
|
||||
for (int i = 0; i < number_of_inputs; i++) {
|
||||
uint64_t plaintext = (*plaintexts)[r * samples * number_of_inputs +
|
||||
s * number_of_inputs + i];
|
||||
uint64_t *lwe_ct_in =
|
||||
lwe_ct_in_array + (ptrdiff_t)((r * samples * number_of_inputs +
|
||||
s * number_of_inputs + i) *
|
||||
(lwe_dimension + 1));
|
||||
concrete_cpu_encrypt_lwe_ciphertext_u64(
|
||||
lwe_sk_in, lwe_ct_in, plaintext, lwe_dimension,
|
||||
lwe_modular_variance, *csprng, &CONCRETE_CSPRNG_VTABLE);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Initialize and copy things in/to the device
|
||||
*d_lut_pbs_identity = (uint64_t *)cuda_malloc_async(
|
||||
(glwe_dimension + 1) * polynomial_size * sizeof(uint64_t), stream,
|
||||
gpu_index);
|
||||
cuda_memcpy_async_to_gpu(*d_lut_pbs_identity, lut_pbs_identity,
|
||||
polynomial_size * (glwe_dimension + 1) *
|
||||
sizeof(uint64_t),
|
||||
stream, gpu_index);
|
||||
*d_lut_pbs_indexes = (uint64_t *)cuda_malloc_async(
|
||||
number_of_inputs * sizeof(uint64_t), stream, gpu_index);
|
||||
cuda_memset_async(*d_lut_pbs_indexes, 0, number_of_inputs * sizeof(uint64_t),
|
||||
stream, gpu_index);
|
||||
|
||||
// Input and output LWEs
|
||||
*d_lwe_ct_out_array =
|
||||
(uint64_t *)cuda_malloc_async((glwe_dimension * polynomial_size + 1) *
|
||||
number_of_inputs * sizeof(uint64_t),
|
||||
stream, gpu_index);
|
||||
*d_lwe_ct_in_array = (uint64_t *)cuda_malloc_async(
|
||||
(lwe_dimension + 1) * number_of_inputs * repetitions * samples *
|
||||
sizeof(uint64_t),
|
||||
stream, gpu_index);
|
||||
|
||||
cuda_memcpy_async_to_gpu(*d_lwe_ct_in_array, lwe_ct_in_array,
|
||||
repetitions * samples * number_of_inputs *
|
||||
(lwe_dimension + 1) * sizeof(uint64_t),
|
||||
stream, gpu_index);
|
||||
|
||||
scratch_cuda_bootstrap_amortized_64(
|
||||
stream, gpu_index, amortized_pbs_buffer, glwe_dimension, polynomial_size,
|
||||
number_of_inputs, cuda_get_max_shared_memory(gpu_index), true);
|
||||
scratch_cuda_bootstrap_low_latency_64(
|
||||
stream, gpu_index, lowlat_pbs_buffer, glwe_dimension, polynomial_size,
|
||||
pbs_level, number_of_inputs, cuda_get_max_shared_memory(gpu_index), true);
|
||||
|
||||
cuda_synchronize_stream(v_stream);
|
||||
|
||||
free(lwe_ct_in_array);
|
||||
free(lut_pbs_identity);
|
||||
}
|
||||
|
||||
void bootstrap_teardown(cudaStream_t *stream, Csprng *csprng,
|
||||
uint64_t *lwe_sk_in_array, uint64_t *lwe_sk_out_array,
|
||||
double *d_fourier_bsk_array, uint64_t *plaintexts,
|
||||
uint64_t *d_lut_pbs_identity,
|
||||
uint64_t *d_lut_pbs_indexes,
|
||||
uint64_t *d_lwe_ct_in_array,
|
||||
uint64_t *d_lwe_ct_out_array,
|
||||
int8_t *amortized_pbs_buffer, int8_t *lowlat_pbs_buffer,
|
||||
int gpu_index) {
|
||||
void *v_stream = (void *)stream;
|
||||
cuda_synchronize_stream(v_stream);
|
||||
|
||||
concrete_cpu_destroy_concrete_csprng(csprng);
|
||||
free(csprng);
|
||||
free(lwe_sk_in_array);
|
||||
free(lwe_sk_out_array);
|
||||
free(plaintexts);
|
||||
cuda_drop_async(d_fourier_bsk_array, stream, gpu_index);
|
||||
cuda_drop_async(d_lut_pbs_identity, stream, gpu_index);
|
||||
cuda_drop_async(d_lut_pbs_indexes, stream, gpu_index);
|
||||
cuda_drop_async(d_lwe_ct_in_array, stream, gpu_index);
|
||||
cuda_drop_async(d_lwe_ct_out_array, stream, gpu_index);
|
||||
cleanup_cuda_bootstrap_amortized(stream, gpu_index, &amortized_pbs_buffer);
|
||||
cleanup_cuda_bootstrap_low_latency(stream, gpu_index, &lowlat_pbs_buffer);
|
||||
cuda_destroy_stream(stream, gpu_index);
|
||||
}
|
||||
|
||||
void keyswitch_setup(cudaStream_t *stream, Csprng **csprng,
|
||||
uint64_t **lwe_sk_in_array, uint64_t **lwe_sk_out_array,
|
||||
uint64_t **d_ksk_array, uint64_t **plaintexts,
|
||||
uint64_t **d_lwe_ct_in_array,
|
||||
uint64_t **d_lwe_ct_out_array, int input_lwe_dimension,
|
||||
int output_lwe_dimension, double lwe_modular_variance,
|
||||
int ksk_base_log, int ksk_level, int message_modulus,
|
||||
int carry_modulus, int *payload_modulus, uint64_t *delta,
|
||||
int number_of_inputs, int repetitions, int samples,
|
||||
int gpu_index) {
|
||||
|
||||
*payload_modulus = message_modulus * carry_modulus;
|
||||
// Value of the shift we multiply our messages by
|
||||
*delta = ((uint64_t)(1) << 63) / (uint64_t)(*payload_modulus);
|
||||
|
||||
void *v_stream = (void *)stream;
|
||||
// Create a Csprng
|
||||
*csprng =
|
||||
(Csprng *)aligned_alloc(CONCRETE_CSPRNG_ALIGN, CONCRETE_CSPRNG_SIZE);
|
||||
uint8_t seed[16] = {(uint8_t)0};
|
||||
concrete_cpu_construct_concrete_csprng(
|
||||
*csprng, Uint128{.little_endian_bytes = {*seed}});
|
||||
|
||||
// Generate the keys
|
||||
generate_lwe_secret_keys(lwe_sk_in_array, input_lwe_dimension, *csprng,
|
||||
repetitions);
|
||||
generate_lwe_secret_keys(lwe_sk_out_array, output_lwe_dimension, *csprng,
|
||||
repetitions);
|
||||
generate_lwe_keyswitch_keys(stream, gpu_index, d_ksk_array, *lwe_sk_in_array,
|
||||
*lwe_sk_out_array, input_lwe_dimension,
|
||||
output_lwe_dimension, ksk_level, ksk_base_log,
|
||||
*csprng, lwe_modular_variance, repetitions);
|
||||
*plaintexts = generate_plaintexts(*payload_modulus, *delta, number_of_inputs,
|
||||
repetitions, samples);
|
||||
|
||||
*d_lwe_ct_out_array = (uint64_t *)cuda_malloc_async(
|
||||
(output_lwe_dimension + 1) * number_of_inputs * sizeof(uint64_t), stream,
|
||||
gpu_index);
|
||||
*d_lwe_ct_in_array = (uint64_t *)cuda_malloc_async(
|
||||
(input_lwe_dimension + 1) * number_of_inputs * repetitions * samples *
|
||||
sizeof(uint64_t),
|
||||
stream, gpu_index);
|
||||
uint64_t *lwe_ct_in_array =
|
||||
(uint64_t *)malloc((input_lwe_dimension + 1) * number_of_inputs *
|
||||
repetitions * samples * sizeof(uint64_t));
|
||||
// Create the input/output ciphertexts
|
||||
for (int r = 0; r < repetitions; r++) {
|
||||
uint64_t *lwe_sk_in =
|
||||
*lwe_sk_in_array + (ptrdiff_t)(r * input_lwe_dimension);
|
||||
for (int s = 0; s < samples; s++) {
|
||||
for (int i = 0; i < number_of_inputs; i++) {
|
||||
uint64_t plaintext = (*plaintexts)[r * samples * number_of_inputs +
|
||||
s * number_of_inputs + i];
|
||||
uint64_t *lwe_ct_in =
|
||||
lwe_ct_in_array + (ptrdiff_t)((r * samples * number_of_inputs +
|
||||
s * number_of_inputs + i) *
|
||||
(input_lwe_dimension + 1));
|
||||
concrete_cpu_encrypt_lwe_ciphertext_u64(
|
||||
lwe_sk_in, lwe_ct_in, plaintext, input_lwe_dimension,
|
||||
lwe_modular_variance, *csprng, &CONCRETE_CSPRNG_VTABLE);
|
||||
}
|
||||
}
|
||||
}
|
||||
cuda_memcpy_async_to_gpu(*d_lwe_ct_in_array, lwe_ct_in_array,
|
||||
repetitions * samples * number_of_inputs *
|
||||
(input_lwe_dimension + 1) * sizeof(uint64_t),
|
||||
stream, gpu_index);
|
||||
cuda_synchronize_stream(v_stream);
|
||||
free(lwe_ct_in_array);
|
||||
}
|
||||
|
||||
void keyswitch_teardown(cudaStream_t *stream, Csprng *csprng,
|
||||
uint64_t *lwe_sk_in_array, uint64_t *lwe_sk_out_array,
|
||||
uint64_t *d_ksk_array, uint64_t *plaintexts,
|
||||
uint64_t *d_lwe_ct_in_array,
|
||||
uint64_t *d_lwe_ct_out_array, int gpu_index) {
|
||||
void *v_stream = (void *)stream;
|
||||
cuda_synchronize_stream(v_stream);
|
||||
concrete_cpu_destroy_concrete_csprng(csprng);
|
||||
free(csprng);
|
||||
free(lwe_sk_in_array);
|
||||
free(lwe_sk_out_array);
|
||||
free(plaintexts);
|
||||
cuda_drop_async(d_ksk_array, stream, gpu_index);
|
||||
cuda_drop_async(d_lwe_ct_in_array, stream, gpu_index);
|
||||
cuda_drop_async(d_lwe_ct_out_array, stream, gpu_index);
|
||||
cuda_destroy_stream(stream, gpu_index);
|
||||
}
|
||||
|
||||
void linear_algebra_setup(cudaStream_t *stream, Csprng **csprng,
|
||||
uint64_t **lwe_sk_array, uint64_t **d_lwe_in_1_ct,
|
||||
uint64_t **d_lwe_in_2_ct, uint64_t **d_lwe_out_ct,
|
||||
uint64_t **lwe_in_1_ct, uint64_t **lwe_in_2_ct,
|
||||
uint64_t **lwe_out_ct, uint64_t **plaintexts_1,
|
||||
uint64_t **plaintexts_2, uint64_t **d_plaintexts_2,
|
||||
uint64_t **d_cleartext_2, int lwe_dimension,
|
||||
double noise_variance, int payload_modulus,
|
||||
uint64_t delta, int number_of_inputs, int repetitions,
|
||||
int samples, int gpu_index) {
|
||||
|
||||
// Create a Csprng
|
||||
*csprng =
|
||||
(Csprng *)aligned_alloc(CONCRETE_CSPRNG_ALIGN, CONCRETE_CSPRNG_SIZE);
|
||||
uint8_t seed[16] = {(uint8_t)0};
|
||||
concrete_cpu_construct_concrete_csprng(
|
||||
*csprng, Uint128{.little_endian_bytes = {*seed}});
|
||||
|
||||
// Generate the keys
|
||||
generate_lwe_secret_keys(lwe_sk_array, lwe_dimension, *csprng, repetitions);
|
||||
*plaintexts_1 = generate_plaintexts(payload_modulus, delta, number_of_inputs,
|
||||
repetitions, samples);
|
||||
*plaintexts_2 = generate_plaintexts(payload_modulus, delta, number_of_inputs,
|
||||
repetitions, samples);
|
||||
|
||||
*lwe_in_1_ct = (uint64_t *)malloc(repetitions * samples * number_of_inputs *
|
||||
(lwe_dimension + 1) * sizeof(uint64_t));
|
||||
*lwe_in_2_ct = (uint64_t *)malloc(repetitions * samples * number_of_inputs *
|
||||
(lwe_dimension + 1) * sizeof(uint64_t));
|
||||
uint64_t *cleartext_2 = (uint64_t *)malloc(
|
||||
repetitions * samples * number_of_inputs * sizeof(uint64_t));
|
||||
*lwe_out_ct = (uint64_t *)malloc(number_of_inputs * (lwe_dimension + 1) *
|
||||
sizeof(uint64_t));
|
||||
|
||||
// Create the input/output ciphertexts
|
||||
for (int r = 0; r < repetitions; r++) {
|
||||
uint64_t *lwe_sk = *lwe_sk_array + (ptrdiff_t)(r * lwe_dimension);
|
||||
for (int s = 0; s < samples; s++) {
|
||||
for (int i = 0; i < number_of_inputs; i++) {
|
||||
uint64_t plaintext_1 = (*plaintexts_1)[r * samples * number_of_inputs +
|
||||
s * number_of_inputs + i];
|
||||
uint64_t plaintext_2 = (*plaintexts_2)[r * samples * number_of_inputs +
|
||||
s * number_of_inputs + i];
|
||||
uint64_t *lwe_1_in =
|
||||
(*lwe_in_1_ct) + (ptrdiff_t)((r * samples * number_of_inputs +
|
||||
s * number_of_inputs + i) *
|
||||
(lwe_dimension + 1));
|
||||
uint64_t *lwe_2_in =
|
||||
(*lwe_in_2_ct) + (ptrdiff_t)((r * samples * number_of_inputs +
|
||||
s * number_of_inputs + i) *
|
||||
(lwe_dimension + 1));
|
||||
|
||||
concrete_cpu_encrypt_lwe_ciphertext_u64(
|
||||
lwe_sk, lwe_1_in, plaintext_1, lwe_dimension, noise_variance,
|
||||
*csprng, &CONCRETE_CSPRNG_VTABLE);
|
||||
concrete_cpu_encrypt_lwe_ciphertext_u64(
|
||||
lwe_sk, lwe_2_in, plaintext_2, lwe_dimension, noise_variance,
|
||||
*csprng, &CONCRETE_CSPRNG_VTABLE);
|
||||
cleartext_2[r * samples * number_of_inputs + s * number_of_inputs + i] =
|
||||
plaintext_2 / delta;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Initialize and copy things in/to the device
|
||||
*d_lwe_in_1_ct =
|
||||
(uint64_t *)cuda_malloc_async(repetitions * samples * number_of_inputs *
|
||||
(lwe_dimension + 1) * sizeof(uint64_t),
|
||||
stream, gpu_index);
|
||||
*d_lwe_in_2_ct =
|
||||
(uint64_t *)cuda_malloc_async(repetitions * samples * number_of_inputs *
|
||||
(lwe_dimension + 1) * sizeof(uint64_t),
|
||||
stream, gpu_index);
|
||||
*d_plaintexts_2 = (uint64_t *)cuda_malloc_async(
|
||||
repetitions * samples * number_of_inputs * sizeof(uint64_t), stream,
|
||||
gpu_index);
|
||||
*d_cleartext_2 = (uint64_t *)cuda_malloc_async(
|
||||
repetitions * samples * number_of_inputs * sizeof(uint64_t), stream,
|
||||
gpu_index);
|
||||
*d_lwe_out_ct = (uint64_t *)cuda_malloc_async(
|
||||
number_of_inputs * (lwe_dimension + 1) * sizeof(uint64_t), stream,
|
||||
gpu_index);
|
||||
|
||||
cuda_memcpy_async_to_gpu(*d_lwe_in_1_ct, *lwe_in_1_ct,
|
||||
repetitions * samples * number_of_inputs *
|
||||
(lwe_dimension + 1) * sizeof(uint64_t),
|
||||
stream, gpu_index);
|
||||
cuda_memcpy_async_to_gpu(*d_lwe_in_2_ct, *lwe_in_2_ct,
|
||||
repetitions * samples * number_of_inputs *
|
||||
(lwe_dimension + 1) * sizeof(uint64_t),
|
||||
stream, gpu_index);
|
||||
cuda_memcpy_async_to_gpu(*d_plaintexts_2, *plaintexts_2,
|
||||
repetitions * samples * number_of_inputs *
|
||||
sizeof(uint64_t),
|
||||
stream, gpu_index);
|
||||
cuda_memcpy_async_to_gpu(*d_cleartext_2, cleartext_2,
|
||||
repetitions * samples * number_of_inputs *
|
||||
sizeof(uint64_t),
|
||||
stream, gpu_index);
|
||||
|
||||
void *v_stream = (void *)stream;
|
||||
cuda_synchronize_stream(v_stream);
|
||||
free(cleartext_2);
|
||||
}
|
||||
|
||||
void linear_algebra_teardown(cudaStream_t *stream, Csprng **csprng,
|
||||
uint64_t **lwe_sk_array, uint64_t **d_lwe_in_1_ct,
|
||||
uint64_t **d_lwe_in_2_ct, uint64_t **d_lwe_out_ct,
|
||||
uint64_t **lwe_in_1_ct, uint64_t **lwe_in_2_ct,
|
||||
uint64_t **lwe_out_ct, uint64_t **plaintexts_1,
|
||||
uint64_t **plaintexts_2, uint64_t **d_plaintexts_2,
|
||||
uint64_t **d_cleartext_2, int gpu_index) {
|
||||
void *v_stream = (void *)stream;
|
||||
|
||||
cuda_synchronize_stream(v_stream);
|
||||
concrete_cpu_destroy_concrete_csprng(*csprng);
|
||||
free(*csprng);
|
||||
cuda_drop_async(*d_lwe_in_1_ct, stream, gpu_index);
|
||||
cuda_drop_async(*d_lwe_in_2_ct, stream, gpu_index);
|
||||
cuda_drop_async(*d_plaintexts_2, stream, gpu_index);
|
||||
cuda_drop_async(*d_cleartext_2, stream, gpu_index);
|
||||
cuda_drop_async(*d_lwe_out_ct, stream, gpu_index);
|
||||
free(*lwe_out_ct);
|
||||
free(*lwe_sk_array);
|
||||
free(*plaintexts_1);
|
||||
free(*plaintexts_2);
|
||||
free(*lwe_in_1_ct);
|
||||
free(*lwe_in_2_ct);
|
||||
}
|
||||
|
||||
void bit_extraction_setup(
|
||||
cudaStream_t *stream, Csprng **csprng, uint64_t **lwe_sk_in_array,
|
||||
uint64_t **lwe_sk_out_array, double **d_fourier_bsk_array,
|
||||
uint64_t **d_ksk_array, uint64_t **plaintexts, uint64_t **d_lwe_ct_in_array,
|
||||
uint64_t **d_lwe_ct_out_array, int8_t **bit_extract_buffer,
|
||||
int lwe_dimension, int glwe_dimension, int polynomial_size,
|
||||
double lwe_modular_variance, double glwe_modular_variance, int ks_base_log,
|
||||
int ks_level, int pbs_base_log, int pbs_level,
|
||||
int number_of_bits_of_message_including_padding,
|
||||
int number_of_bits_to_extract, int *delta_log, uint64_t *delta,
|
||||
int number_of_inputs, int repetitions, int samples, int gpu_index) {
|
||||
void *v_stream = (void *)stream;
|
||||
*delta_log = 64 - number_of_bits_of_message_including_padding;
|
||||
*delta = (uint64_t)(1) << *delta_log;
|
||||
|
||||
// Create a Csprng
|
||||
*csprng =
|
||||
(Csprng *)aligned_alloc(CONCRETE_CSPRNG_ALIGN, CONCRETE_CSPRNG_SIZE);
|
||||
uint8_t seed[16] = {(uint8_t)0};
|
||||
concrete_cpu_construct_concrete_csprng(
|
||||
*csprng, Uint128{.little_endian_bytes = {*seed}});
|
||||
|
||||
int input_lwe_dimension = glwe_dimension * polynomial_size;
|
||||
int output_lwe_dimension = lwe_dimension;
|
||||
// Generate the keys
|
||||
generate_lwe_secret_keys(lwe_sk_in_array, input_lwe_dimension, *csprng,
|
||||
repetitions);
|
||||
generate_lwe_secret_keys(lwe_sk_out_array, output_lwe_dimension, *csprng,
|
||||
repetitions);
|
||||
generate_lwe_keyswitch_keys(stream, gpu_index, d_ksk_array, *lwe_sk_in_array,
|
||||
*lwe_sk_out_array, input_lwe_dimension,
|
||||
output_lwe_dimension, ks_level, ks_base_log,
|
||||
*csprng, lwe_modular_variance, repetitions);
|
||||
|
||||
generate_lwe_bootstrap_keys(
|
||||
stream, gpu_index, d_fourier_bsk_array, *lwe_sk_out_array,
|
||||
*lwe_sk_in_array, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
pbs_level, pbs_base_log, *csprng, glwe_modular_variance, repetitions);
|
||||
*plaintexts =
|
||||
generate_plaintexts(number_of_bits_of_message_including_padding, *delta,
|
||||
number_of_inputs, repetitions, samples);
|
||||
|
||||
*d_lwe_ct_out_array = (uint64_t *)cuda_malloc_async(
|
||||
(output_lwe_dimension + 1) * number_of_bits_to_extract *
|
||||
number_of_inputs * sizeof(uint64_t),
|
||||
stream, gpu_index);
|
||||
|
||||
*d_lwe_ct_in_array = (uint64_t *)cuda_malloc_async(
|
||||
(input_lwe_dimension + 1) * number_of_inputs * repetitions * samples *
|
||||
sizeof(uint64_t),
|
||||
stream, gpu_index);
|
||||
|
||||
uint64_t *lwe_ct_in_array =
|
||||
(uint64_t *)malloc(repetitions * samples * (input_lwe_dimension + 1) *
|
||||
number_of_inputs * sizeof(uint64_t));
|
||||
|
||||
// Create the input ciphertexts
|
||||
for (int r = 0; r < repetitions; r++) {
|
||||
uint64_t *lwe_sk_in =
|
||||
*lwe_sk_in_array + (ptrdiff_t)(r * input_lwe_dimension);
|
||||
for (int s = 0; s < samples; s++) {
|
||||
for (int i = 0; i < number_of_inputs; i++) {
|
||||
uint64_t plaintext = (*plaintexts)[r * samples * number_of_inputs +
|
||||
s * number_of_inputs + i];
|
||||
uint64_t *lwe_ct_in =
|
||||
lwe_ct_in_array + (ptrdiff_t)((r * samples * number_of_inputs +
|
||||
s * number_of_inputs + i) *
|
||||
(input_lwe_dimension + 1));
|
||||
concrete_cpu_encrypt_lwe_ciphertext_u64(
|
||||
lwe_sk_in, lwe_ct_in, plaintext, input_lwe_dimension,
|
||||
lwe_modular_variance, *csprng, &CONCRETE_CSPRNG_VTABLE);
|
||||
}
|
||||
}
|
||||
}
|
||||
cuda_memcpy_async_to_gpu(*d_lwe_ct_in_array, lwe_ct_in_array,
|
||||
repetitions * samples * number_of_inputs *
|
||||
(input_lwe_dimension + 1) * sizeof(uint64_t),
|
||||
stream, gpu_index);
|
||||
// Execute scratch
|
||||
scratch_cuda_extract_bits_64(stream, gpu_index, bit_extract_buffer,
|
||||
glwe_dimension, lwe_dimension, polynomial_size,
|
||||
pbs_level, number_of_inputs,
|
||||
cuda_get_max_shared_memory(gpu_index), true);
|
||||
|
||||
cuda_synchronize_stream(v_stream);
|
||||
free(lwe_ct_in_array);
|
||||
}
|
||||
|
||||
void bit_extraction_teardown(cudaStream_t *stream, Csprng *csprng,
|
||||
uint64_t *lwe_sk_in_array,
|
||||
uint64_t *lwe_sk_out_array,
|
||||
double *d_fourier_bsk_array, uint64_t *d_ksk_array,
|
||||
uint64_t *plaintexts, uint64_t *d_lwe_ct_in_array,
|
||||
uint64_t *d_lwe_ct_out_array,
|
||||
int8_t *bit_extract_buffer, int gpu_index) {
|
||||
void *v_stream = (void *)stream;
|
||||
cuda_synchronize_stream(v_stream);
|
||||
concrete_cpu_destroy_concrete_csprng(csprng);
|
||||
free(csprng);
|
||||
free(lwe_sk_in_array);
|
||||
free(lwe_sk_out_array);
|
||||
free(plaintexts);
|
||||
cleanup_cuda_extract_bits(stream, gpu_index, &bit_extract_buffer);
|
||||
cuda_drop_async(d_fourier_bsk_array, stream, gpu_index);
|
||||
cuda_drop_async(d_ksk_array, stream, gpu_index);
|
||||
cuda_drop_async(d_lwe_ct_in_array, stream, gpu_index);
|
||||
cuda_drop_async(d_lwe_ct_out_array, stream, gpu_index);
|
||||
cuda_destroy_stream(stream, gpu_index);
|
||||
}
|
||||
|
||||
void circuit_bootstrap_setup(
|
||||
cudaStream_t *stream, Csprng **csprng, uint64_t **lwe_sk_in_array,
|
||||
uint64_t **lwe_sk_out_array, double **d_fourier_bsk_array,
|
||||
uint64_t **d_pksk_array, uint64_t **plaintexts,
|
||||
uint64_t **d_lwe_ct_in_array, uint64_t **d_ggsw_ct_out_array,
|
||||
uint64_t **d_lut_vector_indexes, int8_t **cbs_buffer, int lwe_dimension,
|
||||
int glwe_dimension, int polynomial_size, double lwe_modular_variance,
|
||||
double glwe_modular_variance, int pksk_base_log, int pksk_level,
|
||||
int pbs_base_log, int pbs_level, int cbs_level,
|
||||
int number_of_bits_of_message_including_padding, int ggsw_size,
|
||||
int *delta_log, uint64_t *delta, int number_of_inputs, int repetitions,
|
||||
int samples, int gpu_index) {
|
||||
|
||||
void *v_stream = (void *)stream;
|
||||
*delta_log = 60;
|
||||
|
||||
*delta = (uint64_t)(1) << *delta_log;
|
||||
// Create a Csprng
|
||||
*csprng =
|
||||
(Csprng *)aligned_alloc(CONCRETE_CSPRNG_ALIGN, CONCRETE_CSPRNG_SIZE);
|
||||
uint8_t seed[16] = {(uint8_t)0};
|
||||
concrete_cpu_construct_concrete_csprng(
|
||||
*csprng, Uint128{.little_endian_bytes = {*seed}});
|
||||
|
||||
// Generate the keys
|
||||
generate_lwe_secret_keys(lwe_sk_in_array, lwe_dimension, *csprng,
|
||||
repetitions);
|
||||
generate_lwe_secret_keys(lwe_sk_out_array, glwe_dimension * polynomial_size,
|
||||
*csprng, repetitions);
|
||||
generate_lwe_bootstrap_keys(
|
||||
stream, gpu_index, d_fourier_bsk_array, *lwe_sk_in_array,
|
||||
*lwe_sk_out_array, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
pbs_level, pbs_base_log, *csprng, glwe_modular_variance, repetitions);
|
||||
generate_lwe_private_functional_keyswitch_key_lists(
|
||||
stream, gpu_index, d_pksk_array, *lwe_sk_out_array, *lwe_sk_out_array,
|
||||
glwe_dimension * polynomial_size, glwe_dimension, polynomial_size,
|
||||
pksk_level, pksk_base_log, *csprng, lwe_modular_variance, repetitions);
|
||||
*plaintexts =
|
||||
generate_plaintexts(number_of_bits_of_message_including_padding, *delta,
|
||||
number_of_inputs, repetitions, samples);
|
||||
|
||||
*d_ggsw_ct_out_array = (uint64_t *)cuda_malloc_async(
|
||||
repetitions * samples * number_of_inputs * ggsw_size * sizeof(uint64_t),
|
||||
stream, gpu_index);
|
||||
|
||||
*d_lwe_ct_in_array =
|
||||
(uint64_t *)cuda_malloc_async(repetitions * samples * number_of_inputs *
|
||||
(lwe_dimension + 1) * sizeof(uint64_t),
|
||||
stream, gpu_index);
|
||||
|
||||
uint64_t *lwe_ct_in_array =
|
||||
(uint64_t *)malloc(repetitions * samples * number_of_inputs *
|
||||
(lwe_dimension + 1) * sizeof(uint64_t));
|
||||
// Create the input ciphertexts
|
||||
for (int r = 0; r < repetitions; r++) {
|
||||
uint64_t *lwe_sk_in = *lwe_sk_in_array + (ptrdiff_t)(r * lwe_dimension);
|
||||
for (int s = 0; s < samples; s++) {
|
||||
for (int i = 0; i < number_of_inputs; i++) {
|
||||
uint64_t plaintext = (*plaintexts)[r * samples * number_of_inputs +
|
||||
s * number_of_inputs + i];
|
||||
uint64_t *lwe_ct_in =
|
||||
lwe_ct_in_array + (ptrdiff_t)((r * samples * number_of_inputs +
|
||||
s * number_of_inputs + i) *
|
||||
(lwe_dimension + 1));
|
||||
concrete_cpu_encrypt_lwe_ciphertext_u64(
|
||||
lwe_sk_in, lwe_ct_in, plaintext, lwe_dimension,
|
||||
lwe_modular_variance, *csprng, &CONCRETE_CSPRNG_VTABLE);
|
||||
}
|
||||
}
|
||||
}
|
||||
cuda_memcpy_async_to_gpu(*d_lwe_ct_in_array, lwe_ct_in_array,
|
||||
repetitions * samples * number_of_inputs *
|
||||
(lwe_dimension + 1) * sizeof(uint64_t),
|
||||
stream, gpu_index);
|
||||
|
||||
// Execute cbs scratch
|
||||
scratch_cuda_circuit_bootstrap_64(
|
||||
stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, cbs_level, number_of_inputs,
|
||||
cuda_get_max_shared_memory(gpu_index), true);
|
||||
|
||||
// Build LUT vector indexes
|
||||
uint64_t *h_lut_vector_indexes =
|
||||
(uint64_t *)malloc(number_of_inputs * cbs_level * sizeof(uint64_t));
|
||||
for (int index = 0; index < cbs_level * number_of_inputs; index++) {
|
||||
h_lut_vector_indexes[index] = index % cbs_level;
|
||||
}
|
||||
*d_lut_vector_indexes = (uint64_t *)cuda_malloc_async(
|
||||
number_of_inputs * cbs_level * sizeof(uint64_t), stream, gpu_index);
|
||||
cuda_memcpy_async_to_gpu(*d_lut_vector_indexes, h_lut_vector_indexes,
|
||||
number_of_inputs * cbs_level * sizeof(uint64_t),
|
||||
stream, gpu_index);
|
||||
cuda_synchronize_stream(v_stream);
|
||||
free(h_lut_vector_indexes);
|
||||
free(lwe_ct_in_array);
|
||||
}
|
||||
|
||||
void circuit_bootstrap_teardown(
|
||||
cudaStream_t *stream, Csprng *csprng, uint64_t *lwe_sk_in_array,
|
||||
uint64_t *lwe_sk_out_array, double *d_fourier_bsk_array,
|
||||
uint64_t *d_pksk_array, uint64_t *plaintexts, uint64_t *d_lwe_ct_in_array,
|
||||
uint64_t *d_lut_vector_indexes, uint64_t *d_ggsw_ct_out_array,
|
||||
int8_t *cbs_buffer, int gpu_index) {
|
||||
void *v_stream = (void *)stream;
|
||||
cuda_synchronize_stream(v_stream);
|
||||
concrete_cpu_destroy_concrete_csprng(csprng);
|
||||
free(csprng);
|
||||
free(lwe_sk_in_array);
|
||||
free(lwe_sk_out_array);
|
||||
free(plaintexts);
|
||||
cleanup_cuda_circuit_bootstrap(stream, gpu_index, &cbs_buffer);
|
||||
cuda_drop_async(d_fourier_bsk_array, stream, gpu_index);
|
||||
cuda_drop_async(d_pksk_array, stream, gpu_index);
|
||||
cuda_drop_async(d_lwe_ct_in_array, stream, gpu_index);
|
||||
cuda_drop_async(d_ggsw_ct_out_array, stream, gpu_index);
|
||||
cuda_drop_async(d_lut_vector_indexes, stream, gpu_index);
|
||||
cuda_destroy_stream(stream, gpu_index);
|
||||
}
|
||||
|
||||
void cmux_tree_setup(cudaStream_t *stream, Csprng **csprng, uint64_t **glwe_sk,
|
||||
uint64_t **d_lut_identity, uint64_t **plaintexts,
|
||||
uint64_t **d_ggsw_bit_array, int8_t **cmux_tree_buffer,
|
||||
uint64_t **d_glwe_out, int glwe_dimension,
|
||||
int polynomial_size, int base_log, int level_count,
|
||||
double glwe_modular_variance, int r_lut, int tau,
|
||||
uint64_t delta_log, int repetitions, int samples,
|
||||
int gpu_index) {
|
||||
void *v_stream = (void *)stream;
|
||||
int ggsw_size = polynomial_size * (glwe_dimension + 1) *
|
||||
(glwe_dimension + 1) * level_count;
|
||||
int glwe_size = (glwe_dimension + 1) * polynomial_size;
|
||||
|
||||
// Create a Csprng
|
||||
*csprng =
|
||||
(Csprng *)aligned_alloc(CONCRETE_CSPRNG_ALIGN, CONCRETE_CSPRNG_SIZE);
|
||||
uint8_t seed[16] = {(uint8_t)0};
|
||||
concrete_cpu_construct_concrete_csprng(
|
||||
*csprng, Uint128{.little_endian_bytes = {*seed}});
|
||||
|
||||
// Generate the keys
|
||||
generate_glwe_secret_keys(glwe_sk, glwe_dimension, polynomial_size, *csprng,
|
||||
repetitions);
|
||||
*plaintexts = generate_plaintexts(r_lut, 1, 1, repetitions, samples);
|
||||
|
||||
// Create the LUT
|
||||
int num_lut = (1 << r_lut);
|
||||
*d_lut_identity = (uint64_t *)cuda_malloc_async(
|
||||
polynomial_size * num_lut * tau * sizeof(uint64_t), stream, gpu_index);
|
||||
uint64_t *lut_cmux_tree_identity =
|
||||
generate_identity_lut_cmux_tree(polynomial_size, num_lut, tau, delta_log);
|
||||
|
||||
// Encrypt one bit per GGSW
|
||||
uint64_t *ggsw_bit_array = (uint64_t *)malloc(repetitions * samples * r_lut *
|
||||
ggsw_size * sizeof(uint64_t));
|
||||
for (int r = 0; r < repetitions; r++) {
|
||||
for (int s = 0; s < samples; s++) {
|
||||
uint64_t witness = (*plaintexts)[r * samples + s];
|
||||
|
||||
// Instantiate the GGSW m^tree ciphertexts
|
||||
// We need r GGSW ciphertexts
|
||||
// Bit decomposition of the value from MSB to LSB
|
||||
uint64_t *bit_array = bit_decompose_value(witness, r_lut);
|
||||
for (int i = 0; i < r_lut; i++) {
|
||||
uint64_t *ggsw_slice =
|
||||
ggsw_bit_array +
|
||||
(ptrdiff_t)((r * samples * r_lut + s * r_lut + i) * ggsw_size);
|
||||
concrete_cpu_encrypt_ggsw_ciphertext_u64(
|
||||
*glwe_sk, ggsw_slice, bit_array[i], glwe_dimension, polynomial_size,
|
||||
level_count, base_log, glwe_modular_variance, *csprng,
|
||||
&CONCRETE_CSPRNG_VTABLE);
|
||||
}
|
||||
free(bit_array);
|
||||
}
|
||||
}
|
||||
// Allocate and copy things to the device
|
||||
*d_glwe_out = (uint64_t *)cuda_malloc_async(
|
||||
tau * glwe_size * sizeof(uint64_t), stream, gpu_index);
|
||||
*d_ggsw_bit_array = (uint64_t *)cuda_malloc_async(
|
||||
repetitions * samples * r_lut * ggsw_size * sizeof(uint64_t), stream,
|
||||
gpu_index);
|
||||
cuda_memcpy_async_to_gpu(*d_lut_identity, lut_cmux_tree_identity,
|
||||
polynomial_size * num_lut * tau * sizeof(uint64_t),
|
||||
stream, gpu_index);
|
||||
cuda_memcpy_async_to_gpu(*d_ggsw_bit_array, ggsw_bit_array,
|
||||
repetitions * samples * r_lut * ggsw_size *
|
||||
sizeof(uint64_t),
|
||||
stream, gpu_index);
|
||||
|
||||
scratch_cuda_cmux_tree_64(stream, gpu_index, cmux_tree_buffer, glwe_dimension,
|
||||
polynomial_size, level_count, r_lut, tau,
|
||||
cuda_get_max_shared_memory(gpu_index), true);
|
||||
cuda_synchronize_stream(v_stream);
|
||||
free(lut_cmux_tree_identity);
|
||||
free(ggsw_bit_array);
|
||||
}
|
||||
void cmux_tree_teardown(cudaStream_t *stream, Csprng **csprng,
|
||||
uint64_t **glwe_sk, uint64_t **d_lut_identity,
|
||||
uint64_t **plaintexts, uint64_t **d_ggsw_bit_array,
|
||||
int8_t **cmux_tree_buffer, uint64_t **d_glwe_out,
|
||||
int gpu_index) {
|
||||
cuda_synchronize_stream(stream);
|
||||
concrete_cpu_destroy_concrete_csprng(*csprng);
|
||||
free(*plaintexts);
|
||||
free(*csprng);
|
||||
free(*glwe_sk);
|
||||
cuda_drop_async(*d_lut_identity, stream, gpu_index);
|
||||
cuda_drop_async(*d_ggsw_bit_array, stream, gpu_index);
|
||||
cuda_drop_async(*d_glwe_out, stream, gpu_index);
|
||||
cleanup_cuda_cmux_tree(stream, gpu_index, cmux_tree_buffer);
|
||||
cuda_destroy_stream(stream, gpu_index);
|
||||
}
|
||||
|
||||
void wop_pbs_setup(cudaStream_t *stream, Csprng **csprng,
                   uint64_t **lwe_sk_in_array, uint64_t **lwe_sk_out_array,
                   uint64_t **d_ksk_array, double **d_fourier_bsk_array,
                   uint64_t **d_pksk_array, uint64_t **plaintexts,
                   uint64_t **d_lwe_ct_in_array, uint64_t **d_lwe_ct_out_array,
                   uint64_t **d_lut_vector, int8_t **wop_pbs_buffer,
                   int lwe_dimension, int glwe_dimension, int polynomial_size,
                   double lwe_modular_variance, double glwe_modular_variance,
                   int ks_base_log, int ks_level, int pksk_base_log,
                   int pksk_level, int pbs_base_log, int pbs_level,
                   int cbs_level, int p, int *delta_log, int *cbs_delta_log,
                   int *delta_log_lut, uint64_t *delta, int tau,
                   int repetitions, int samples, int gpu_index) {

  void *v_stream = (void *)stream;
  int input_lwe_dimension = glwe_dimension * polynomial_size;
  *delta_log = 64 - p;
  *delta_log_lut = *delta_log;
  *delta = (uint64_t)(1) << *delta_log;
  // Create a Csprng
  *csprng =
      (Csprng *)aligned_alloc(CONCRETE_CSPRNG_ALIGN, CONCRETE_CSPRNG_SIZE);
  uint8_t seed[16] = {(uint8_t)0};
  concrete_cpu_construct_concrete_csprng(
      *csprng, Uint128{.little_endian_bytes = {*seed}});

  // Generate the keys
  generate_lwe_secret_keys(lwe_sk_in_array, input_lwe_dimension, *csprng,
                           repetitions);
  generate_lwe_secret_keys(lwe_sk_out_array, lwe_dimension, *csprng,
                           repetitions);
  generate_lwe_keyswitch_keys(stream, gpu_index, d_ksk_array, *lwe_sk_in_array,
                              *lwe_sk_out_array, input_lwe_dimension,
                              lwe_dimension, ks_level, ks_base_log, *csprng,
                              lwe_modular_variance, repetitions);
  generate_lwe_bootstrap_keys(
      stream, gpu_index, d_fourier_bsk_array, *lwe_sk_out_array,
      *lwe_sk_in_array, lwe_dimension, glwe_dimension, polynomial_size,
      pbs_level, pbs_base_log, *csprng, glwe_modular_variance, repetitions);
  generate_lwe_private_functional_keyswitch_key_lists(
      stream, gpu_index, d_pksk_array, *lwe_sk_in_array, *lwe_sk_in_array,
      input_lwe_dimension, glwe_dimension, polynomial_size, pksk_level,
      pksk_base_log, *csprng, lwe_modular_variance, repetitions);
  *plaintexts = generate_plaintexts(p, *delta, tau, repetitions, samples);

  // LUT creation
  int lut_size = polynomial_size;
  int lut_num = tau << (tau * p - (int)log2(polynomial_size)); // r

  uint64_t *big_lut = (uint64_t *)malloc(lut_num * lut_size * sizeof(uint64_t));
  for (int t = tau - 1; t >= 0; t--) {
    uint64_t *small_lut = big_lut + (ptrdiff_t)(t * (1 << (tau * p)));
    for (uint64_t value = 0; value < (uint64_t)(1 << (tau * p)); value++) {
      int nbits = t * p;
      uint64_t x = (value >> nbits) & (uint64_t)((1 << p) - 1);
      small_lut[value] =
          ((x % (uint64_t)(1 << (64 - *delta_log))) << *delta_log_lut);
    }
  }
  *d_lut_vector = (uint64_t *)cuda_malloc_async(
      lut_num * lut_size * sizeof(uint64_t), stream, gpu_index);
  cuda_memcpy_async_to_gpu(*d_lut_vector, big_lut,
                           lut_num * lut_size * sizeof(uint64_t), stream,
                           gpu_index);
  // Allocate input
  *d_lwe_ct_in_array = (uint64_t *)cuda_malloc_async(
      repetitions * samples * (input_lwe_dimension + 1) * tau *
          sizeof(uint64_t),
      stream, gpu_index);
  // Allocate output
  *d_lwe_ct_out_array = (uint64_t *)cuda_malloc_async(
      repetitions * samples * (input_lwe_dimension + 1) * tau *
          sizeof(uint64_t),
      stream, gpu_index);
  uint64_t *lwe_ct_in_array =
      (uint64_t *)malloc(repetitions * samples * (input_lwe_dimension + 1) *
                         tau * sizeof(uint64_t));
  // Create the input ciphertexts
  for (int r = 0; r < repetitions; r++) {
    uint64_t *lwe_sk_in =
        *lwe_sk_in_array + (ptrdiff_t)(r * input_lwe_dimension);
    for (int s = 0; s < samples; s++) {
      for (int i = 0; i < tau; i++) {
        uint64_t plaintext = (*plaintexts)[r * samples * tau + s * tau + i];
        uint64_t *lwe_ct_in =
            lwe_ct_in_array + (ptrdiff_t)((r * samples * tau + s * tau + i) *
                                          (input_lwe_dimension + 1));
        concrete_cpu_encrypt_lwe_ciphertext_u64(
            lwe_sk_in, lwe_ct_in, plaintext, input_lwe_dimension,
            lwe_modular_variance, *csprng, &CONCRETE_CSPRNG_VTABLE);
      }
    }
  }
  cuda_memcpy_async_to_gpu(*d_lwe_ct_in_array, lwe_ct_in_array,
                           repetitions * samples * tau *
                               (input_lwe_dimension + 1) * sizeof(uint64_t),
                           stream, gpu_index);
  // Execute scratch
  scratch_cuda_wop_pbs_64(stream, gpu_index, wop_pbs_buffer,
                          (uint32_t *)delta_log, (uint32_t *)cbs_delta_log,
                          glwe_dimension, lwe_dimension, polynomial_size,
                          cbs_level, pbs_level, p, p, tau,
                          cuda_get_max_shared_memory(gpu_index), true);

  cuda_synchronize_stream(v_stream);
  free(lwe_ct_in_array);
  free(big_lut);
}

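// Sizing note for the LUT built in wop_pbs_setup: the big LUT holds
// tau * 2^(tau * p) entries, split into lut_num = tau << (tau * p - log2(N))
// polynomials of N = polynomial_size coefficients. As a worked example with
// illustrative numbers only: tau = 2, p = 5, N = 512 gives 2 * 2^10 = 2048
// entries, i.e. lut_num = 4 polynomials of 512 coefficients each.
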
void wop_pbs_teardown(cudaStream_t *stream, Csprng *csprng,
                      uint64_t *lwe_sk_in_array, uint64_t *lwe_sk_out_array,
                      uint64_t *d_ksk_array, double *d_fourier_bsk_array,
                      uint64_t *d_pksk_array, uint64_t *plaintexts,
                      uint64_t *d_lwe_ct_in_array, uint64_t *d_lut_vector,
                      uint64_t *d_lwe_ct_out_array, int8_t *wop_pbs_buffer,
                      int gpu_index) {
  void *v_stream = (void *)stream;
  cuda_synchronize_stream(v_stream);
  concrete_cpu_destroy_concrete_csprng(csprng);
  free(csprng);
  free(lwe_sk_in_array);
  free(lwe_sk_out_array);
  free(plaintexts);
  cleanup_cuda_circuit_bootstrap_vertical_packing(stream, gpu_index,
                                                  &wop_pbs_buffer);
  cuda_drop_async(d_fourier_bsk_array, stream, gpu_index);
  cuda_drop_async(d_ksk_array, stream, gpu_index);
  cuda_drop_async(d_pksk_array, stream, gpu_index);
  cuda_drop_async(d_lwe_ct_in_array, stream, gpu_index);
  cuda_drop_async(d_lwe_ct_out_array, stream, gpu_index);
  cuda_drop_async(d_lut_vector, stream, gpu_index);
  cuda_destroy_stream(stream, gpu_index);
}

@@ -7,8 +7,9 @@ set(gtest_force_shared_crt
    CACHE BOOL "" FORCE)
FetchContent_MakeAvailable(googletest)

set(CONCRETE_CPU_BINARY_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../../concrete-cpu/implementation/target/release")
set(CONCRETE_CPU_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../../concrete-cpu/implementation")
set(CONCRETE_CPU_BINARY_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../../../concrete-cpu/implementation/target/release")
set(CONCRETE_CPU_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../../../concrete-cpu/implementation")
set(CONCRETE_CUDA_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../")

if(NOT TARGET concrete_cpu)
  # Enable ExternalProject CMake module
@@ -28,7 +29,10 @@ if(NOT TARGET concrete_cpu)
    LOG_BUILD ON)
endif()

include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../include)
include_directories(${CONCRETE_CPU_SOURCE_DIR}/include)
include_directories(${CONCRETE_CUDA_SOURCE_DIR}/include)

add_library(concrete_cpu_lib STATIC IMPORTED)
set_target_properties(concrete_cpu_lib PROPERTIES IMPORTED_LOCATION ${CONCRETE_CPU_BINARY_DIR}/libconcrete_cpu.a)

@@ -43,7 +47,7 @@ file(

set(SOURCES ${TEST_SOURCES})

add_executable(${BINARY} ${TEST_SOURCES})
add_executable(${BINARY} ${TEST_SOURCES} ../utils.cpp ../setup_and_teardown.cpp)

add_test(NAME ${BINARY} COMMAND ${BINARY})

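Note: the CONCRETE_CPU_* paths above gain one extra "../" because this
CMakeLists.txt now appears to live one directory deeper in the tree than
before, so the relative path back to concrete-cpu needs an additional parent
hop.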
@@ -1,9 +1,6 @@
|
||||
#include "../include/bit_extraction.h"
|
||||
#include "../include/device.h"
|
||||
#include "concrete-cpu.h"
|
||||
#include "utils.h"
|
||||
#include "gtest/gtest.h"
|
||||
#include <cstdint>
|
||||
#include <gtest/gtest.h>
|
||||
#include <setup_and_teardown.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
@@ -47,13 +44,12 @@ protected:
|
||||
int gpu_index = 0;
|
||||
uint64_t *lwe_sk_in_array;
|
||||
uint64_t *lwe_sk_out_array;
|
||||
uint64_t *lwe_in_ct_array;
|
||||
uint64_t *lwe_out_ct_array;
|
||||
uint64_t *lwe_ct_in_array;
|
||||
uint64_t *plaintexts;
|
||||
double *d_fourier_bsk_array;
|
||||
uint64_t *d_ksk_array;
|
||||
uint64_t *d_lwe_in_ct_array;
|
||||
uint64_t *d_lwe_out_ct_array;
|
||||
uint64_t *d_lwe_ct_in_array;
|
||||
uint64_t *d_lwe_ct_out_array;
|
||||
int8_t *bit_extract_buffer;
|
||||
int input_lwe_dimension;
|
||||
int output_lwe_dimension;
|
||||
@@ -77,114 +73,56 @@ public:
|
||||
(int)GetParam().number_of_bits_of_message_including_padding;
|
||||
number_of_bits_to_extract = (int)GetParam().number_of_bits_to_extract;
|
||||
number_of_inputs = (int)GetParam().number_of_inputs;
|
||||
delta_log = 64 - number_of_bits_of_message_including_padding;
|
||||
delta = (uint64_t)(1) << delta_log;
|
||||
|
||||
// Create a Csprng
|
||||
csprng =
|
||||
(Csprng *)aligned_alloc(CONCRETE_CSPRNG_ALIGN, CONCRETE_CSPRNG_SIZE);
|
||||
uint8_t seed[16] = {(uint8_t)0};
|
||||
concrete_cpu_construct_concrete_csprng(
|
||||
csprng, Uint128{.little_endian_bytes = {*seed}});
|
||||
|
||||
input_lwe_dimension = glwe_dimension * polynomial_size;
|
||||
output_lwe_dimension = lwe_dimension;
|
||||
// Generate the keys
|
||||
generate_lwe_secret_keys(&lwe_sk_in_array, input_lwe_dimension, csprng,
|
||||
REPETITIONS);
|
||||
generate_lwe_secret_keys(&lwe_sk_out_array, output_lwe_dimension, csprng,
|
||||
REPETITIONS);
|
||||
generate_lwe_keyswitch_keys(
|
||||
stream, gpu_index, &d_ksk_array, lwe_sk_in_array, lwe_sk_out_array,
|
||||
input_lwe_dimension, output_lwe_dimension, ks_level, ks_base_log,
|
||||
csprng, lwe_modular_variance, REPETITIONS);
|
||||
generate_lwe_bootstrap_keys(
|
||||
stream, gpu_index, &d_fourier_bsk_array, lwe_sk_out_array,
|
||||
lwe_sk_in_array, output_lwe_dimension, glwe_dimension, polynomial_size,
|
||||
pbs_level, pbs_base_log, csprng, glwe_modular_variance, REPETITIONS);
|
||||
plaintexts =
|
||||
generate_plaintexts(number_of_bits_of_message_including_padding, delta,
|
||||
number_of_inputs, REPETITIONS, SAMPLES);
|
||||
|
||||
d_lwe_out_ct_array = (uint64_t *)cuda_malloc_async(
|
||||
(output_lwe_dimension + 1) * number_of_bits_to_extract *
|
||||
number_of_inputs * sizeof(uint64_t),
|
||||
stream, gpu_index);
|
||||
|
||||
d_lwe_in_ct_array = (uint64_t *)cuda_malloc_async(
|
||||
(input_lwe_dimension + 1) * number_of_inputs * sizeof(uint64_t), stream,
|
||||
gpu_index);
|
||||
|
||||
lwe_in_ct_array = (uint64_t *)malloc((input_lwe_dimension + 1) *
|
||||
number_of_inputs * sizeof(uint64_t));
|
||||
lwe_out_ct_array = (uint64_t *)malloc((output_lwe_dimension + 1) *
|
||||
number_of_bits_to_extract *
|
||||
number_of_inputs * sizeof(uint64_t));
|
||||
// Execute scratch
|
||||
scratch_cuda_extract_bits_64(stream, gpu_index, &bit_extract_buffer,
|
||||
glwe_dimension, lwe_dimension, polynomial_size,
|
||||
pbs_level, number_of_inputs,
|
||||
cuda_get_max_shared_memory(gpu_index), true);
|
||||
bit_extraction_setup(
|
||||
stream, &csprng, &lwe_sk_in_array, &lwe_sk_out_array,
|
||||
&d_fourier_bsk_array, &d_ksk_array, &plaintexts, &d_lwe_ct_in_array,
|
||||
&d_lwe_ct_out_array, &bit_extract_buffer, lwe_dimension, glwe_dimension,
|
||||
polynomial_size, lwe_modular_variance, glwe_modular_variance,
|
||||
ks_base_log, ks_level, pbs_base_log, pbs_level,
|
||||
number_of_bits_of_message_including_padding, number_of_bits_to_extract,
|
||||
&delta_log, &delta, number_of_inputs, REPETITIONS, SAMPLES, gpu_index);
|
||||
}
|
||||
|
||||
void TearDown() {
|
||||
void *v_stream = (void *)stream;
|
||||
|
||||
cuda_synchronize_stream(v_stream);
|
||||
concrete_cpu_destroy_concrete_csprng(csprng);
|
||||
free(csprng);
|
||||
free(lwe_sk_in_array);
|
||||
free(lwe_sk_out_array);
|
||||
free(plaintexts);
|
||||
free(lwe_in_ct_array);
|
||||
free(lwe_out_ct_array);
|
||||
cleanup_cuda_extract_bits(stream, gpu_index, &bit_extract_buffer);
|
||||
cuda_drop_async(d_fourier_bsk_array, stream, gpu_index);
|
||||
cuda_drop_async(d_ksk_array, stream, gpu_index);
|
||||
cuda_drop_async(d_lwe_in_ct_array, stream, gpu_index);
|
||||
cuda_drop_async(d_lwe_out_ct_array, stream, gpu_index);
|
||||
cuda_destroy_stream(stream, gpu_index);
|
||||
bit_extraction_teardown(stream, csprng, lwe_sk_in_array, lwe_sk_out_array,
|
||||
d_fourier_bsk_array, d_ksk_array, plaintexts,
|
||||
d_lwe_ct_in_array, d_lwe_ct_out_array,
|
||||
bit_extract_buffer, gpu_index);
|
||||
}
|
||||
};
|
||||
|
||||
TEST_P(BitExtractionTestPrimitives_u64, bit_extraction) {
|
||||
void *v_stream = (void *)stream;
|
||||
uint64_t *lwe_ct_out_array = (uint64_t *)malloc(
|
||||
(output_lwe_dimension + 1) * number_of_bits_to_extract *
|
||||
number_of_inputs * sizeof(uint64_t));
|
||||
int bsk_size = (glwe_dimension + 1) * (glwe_dimension + 1) * pbs_level *
|
||||
polynomial_size * (output_lwe_dimension + 1);
|
||||
int ksk_size = ks_level * input_lwe_dimension * (output_lwe_dimension + 1);
|
||||
for (uint r = 0; r < REPETITIONS; r++) {
|
||||
double *d_fourier_bsk = d_fourier_bsk_array + (ptrdiff_t)(bsk_size * r);
|
||||
uint64_t *d_ksk = d_ksk_array + (ptrdiff_t)(ksk_size * r);
|
||||
uint64_t *lwe_in_sk =
|
||||
lwe_sk_in_array + (ptrdiff_t)(input_lwe_dimension * r);
|
||||
uint64_t *lwe_sk_out =
|
||||
lwe_sk_out_array + (ptrdiff_t)(r * output_lwe_dimension);
|
||||
for (uint s = 0; s < SAMPLES; s++) {
|
||||
for (int i = 0; i < number_of_inputs; i++) {
|
||||
uint64_t plaintext = plaintexts[r * SAMPLES * number_of_inputs +
|
||||
s * number_of_inputs + i];
|
||||
uint64_t *lwe_in_ct =
|
||||
lwe_in_ct_array + (ptrdiff_t)(i * (input_lwe_dimension + 1));
|
||||
concrete_cpu_encrypt_lwe_ciphertext_u64(
|
||||
lwe_in_sk, lwe_in_ct, plaintext, input_lwe_dimension,
|
||||
lwe_modular_variance, csprng, &CONCRETE_CSPRNG_VTABLE);
|
||||
}
|
||||
cuda_memcpy_async_to_gpu(d_lwe_in_ct_array, lwe_in_ct_array,
|
||||
(input_lwe_dimension + 1) * number_of_inputs *
|
||||
sizeof(uint64_t),
|
||||
stream, gpu_index);
|
||||
uint64_t *d_lwe_ct_in =
|
||||
d_lwe_ct_in_array +
|
||||
(ptrdiff_t)((r * SAMPLES * number_of_inputs + s * number_of_inputs) *
|
||||
(input_lwe_dimension + 1));
|
||||
|
||||
// Execute bit extract
|
||||
cuda_extract_bits_64(
|
||||
stream, gpu_index, (void *)d_lwe_out_ct_array,
|
||||
(void *)d_lwe_in_ct_array, bit_extract_buffer, (void *)d_ksk,
|
||||
(void *)d_fourier_bsk, number_of_bits_to_extract, delta_log,
|
||||
input_lwe_dimension, output_lwe_dimension, glwe_dimension,
|
||||
polynomial_size, pbs_base_log, pbs_level, ks_base_log, ks_level,
|
||||
number_of_inputs, cuda_get_max_shared_memory(gpu_index));
|
||||
stream, gpu_index, (void *)d_lwe_ct_out_array, (void *)d_lwe_ct_in,
|
||||
bit_extract_buffer, (void *)d_ksk, (void *)d_fourier_bsk,
|
||||
number_of_bits_to_extract, delta_log, input_lwe_dimension,
|
||||
output_lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log,
|
||||
pbs_level, ks_base_log, ks_level, number_of_inputs,
|
||||
cuda_get_max_shared_memory(gpu_index));
|
||||
|
||||
// Copy result back
|
||||
cuda_memcpy_async_to_cpu(lwe_out_ct_array, d_lwe_out_ct_array,
|
||||
cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array,
|
||||
(output_lwe_dimension + 1) *
|
||||
number_of_bits_to_extract *
|
||||
number_of_inputs * sizeof(uint64_t),
|
||||
@@ -192,7 +130,7 @@ TEST_P(BitExtractionTestPrimitives_u64, bit_extraction) {
|
||||
cuda_synchronize_stream(v_stream);
|
||||
for (int j = 0; j < number_of_inputs; j++) {
|
||||
uint64_t *result_array =
|
||||
lwe_out_ct_array + (ptrdiff_t)(j * number_of_bits_to_extract *
|
||||
lwe_ct_out_array + (ptrdiff_t)(j * number_of_bits_to_extract *
|
||||
(output_lwe_dimension + 1));
|
||||
uint64_t plaintext = plaintexts[r * SAMPLES * number_of_inputs +
|
||||
s * number_of_inputs + j];
|
||||
@@ -0,0 +1,245 @@
|
||||
#include <cstdint>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <functional>
|
||||
#include <gtest/gtest.h>
|
||||
#include <setup_and_teardown.h>
|
||||
#include <utils.h>
|
||||
|
||||
typedef struct {
|
||||
int lwe_dimension;
|
||||
int glwe_dimension;
|
||||
int polynomial_size;
|
||||
double lwe_modular_variance;
|
||||
double glwe_modular_variance;
|
||||
int pbs_base_log;
|
||||
int pbs_level;
|
||||
int message_modulus;
|
||||
int carry_modulus;
|
||||
int number_of_inputs;
|
||||
int repetitions;
|
||||
int samples;
|
||||
} BootstrapTestParams;
|
||||
|
||||
class BootstrapTestPrimitives_u64
|
||||
: public ::testing::TestWithParam<BootstrapTestParams> {
|
||||
protected:
|
||||
int lwe_dimension;
|
||||
int glwe_dimension;
|
||||
int polynomial_size;
|
||||
double lwe_modular_variance;
|
||||
double glwe_modular_variance;
|
||||
int pbs_base_log;
|
||||
int pbs_level;
|
||||
int message_modulus;
|
||||
int carry_modulus;
|
||||
int payload_modulus;
|
||||
int number_of_inputs;
|
||||
int repetitions;
|
||||
int samples;
|
||||
uint64_t delta;
|
||||
Csprng *csprng;
|
||||
cudaStream_t *stream;
|
||||
int gpu_index = 0;
|
||||
uint64_t *lwe_sk_in_array;
|
||||
uint64_t *lwe_sk_out_array;
|
||||
uint64_t *plaintexts;
|
||||
double *d_fourier_bsk_array;
|
||||
uint64_t *d_lut_pbs_identity;
|
||||
uint64_t *d_lut_pbs_indexes;
|
||||
uint64_t *d_lwe_ct_in_array;
|
||||
uint64_t *d_lwe_ct_out_array;
|
||||
uint64_t *lwe_ct_out_array;
|
||||
int8_t *amortized_pbs_buffer;
|
||||
int8_t *lowlat_pbs_buffer;
|
||||
|
||||
public:
|
||||
// Test arithmetic functions
|
||||
void SetUp() {
|
||||
stream = cuda_create_stream(0);
|
||||
|
||||
// TestParams
|
||||
lwe_dimension = (int)GetParam().lwe_dimension;
|
||||
glwe_dimension = (int)GetParam().glwe_dimension;
|
||||
polynomial_size = (int)GetParam().polynomial_size;
|
||||
lwe_modular_variance = (double)GetParam().lwe_modular_variance;
|
||||
glwe_modular_variance = (double)GetParam().glwe_modular_variance;
|
||||
pbs_base_log = (int)GetParam().pbs_base_log;
|
||||
pbs_level = (int)GetParam().pbs_level;
|
||||
message_modulus = (int)GetParam().message_modulus;
|
||||
carry_modulus = (int)GetParam().carry_modulus;
|
||||
number_of_inputs = (int)GetParam().number_of_inputs;
|
||||
repetitions = (int)GetParam().repetitions;
|
||||
samples = (int)GetParam().samples;
|
||||
|
||||
bootstrap_setup(stream, &csprng, &lwe_sk_in_array, &lwe_sk_out_array,
|
||||
&d_fourier_bsk_array, &plaintexts, &d_lut_pbs_identity,
|
||||
&d_lut_pbs_indexes, &d_lwe_ct_in_array, &d_lwe_ct_out_array,
|
||||
&amortized_pbs_buffer, &lowlat_pbs_buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, lwe_modular_variance,
|
||||
glwe_modular_variance, pbs_base_log, pbs_level,
|
||||
message_modulus, carry_modulus, &payload_modulus, &delta,
|
||||
number_of_inputs, repetitions, samples, gpu_index);
|
||||
|
||||
lwe_ct_out_array =
|
||||
(uint64_t *)malloc((glwe_dimension * polynomial_size + 1) *
|
||||
number_of_inputs * sizeof(uint64_t));
|
||||
}
|
||||
|
||||
void TearDown() {
|
||||
free(lwe_ct_out_array);
|
||||
bootstrap_teardown(stream, csprng, lwe_sk_in_array, lwe_sk_out_array,
|
||||
d_fourier_bsk_array, plaintexts, d_lut_pbs_identity,
|
||||
d_lut_pbs_indexes, d_lwe_ct_in_array, d_lwe_ct_out_array,
|
||||
amortized_pbs_buffer, lowlat_pbs_buffer, gpu_index);
|
||||
}
|
||||
};
|
||||
|
||||
TEST_P(BootstrapTestPrimitives_u64, amortized_bootstrap) {
|
||||
int bsk_size = (glwe_dimension + 1) * (glwe_dimension + 1) * pbs_level *
|
||||
polynomial_size * (lwe_dimension + 1);
|
||||
// Here execute the PBS
|
||||
for (int r = 0; r < repetitions; r++) {
|
||||
double *d_fourier_bsk = d_fourier_bsk_array + (ptrdiff_t)(bsk_size * r);
|
||||
uint64_t *lwe_sk_out =
|
||||
lwe_sk_out_array + (ptrdiff_t)(r * glwe_dimension * polynomial_size);
|
||||
for (int s = 0; s < samples; s++) {
|
||||
uint64_t *d_lwe_ct_in =
|
||||
d_lwe_ct_in_array +
|
||||
(ptrdiff_t)((r * samples * number_of_inputs + s * number_of_inputs) *
|
||||
(lwe_dimension + 1));
|
||||
// Execute PBS
|
||||
cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
|
||||
stream, gpu_index, (void *)d_lwe_ct_out_array,
|
||||
(void *)d_lut_pbs_identity, (void *)d_lut_pbs_indexes,
|
||||
(void *)d_lwe_ct_in, (void *)d_fourier_bsk, amortized_pbs_buffer,
|
||||
lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log,
|
||||
pbs_level, number_of_inputs, 1, 0,
|
||||
cuda_get_max_shared_memory(gpu_index));
|
||||
// Copy result back
|
||||
cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array,
|
||||
(glwe_dimension * polynomial_size + 1) *
|
||||
number_of_inputs * sizeof(uint64_t),
|
||||
stream, gpu_index);
|
||||
|
||||
for (int j = 0; j < number_of_inputs; j++) {
|
||||
uint64_t *result =
|
||||
lwe_ct_out_array +
|
||||
(ptrdiff_t)(j * (glwe_dimension * polynomial_size + 1));
|
||||
uint64_t plaintext = plaintexts[r * samples * number_of_inputs +
|
||||
s * number_of_inputs + j];
|
||||
uint64_t decrypted = 0;
|
||||
concrete_cpu_decrypt_lwe_ciphertext_u64(
|
||||
lwe_sk_out, result, glwe_dimension * polynomial_size, &decrypted);
|
||||
EXPECT_NE(decrypted, plaintext);
|
||||
// let err = (decrypted >= plaintext) ? decrypted - plaintext :
|
||||
// plaintext
|
||||
// - decrypted;
|
||||
// error_sample_vec.push(err);
|
||||
|
||||
// The bit before the message
|
||||
uint64_t rounding_bit = delta >> 1;
|
||||
// Compute the rounding bit
|
||||
uint64_t rounding = (decrypted & rounding_bit) << 1;
|
||||
uint64_t decoded = (decrypted + rounding) / delta;
|
||||
EXPECT_EQ(decoded, plaintext / delta)
|
||||
<< "Repetition: " << r << ", sample: " << s;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
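
// Decoding note for the checks above: the plaintexts already carry the
// scaling by delta (the test compares against plaintext / delta), so
// decrypted = plaintext + noise. Adding the bit just below the message
// position (rounding_bit = delta >> 1) before dividing by delta rounds to the
// nearest multiple of delta, which recovers the message whenever the noise
// magnitude stays under delta / 2. Worked example with illustrative numbers:
// for delta = 2^61 and plaintext = 3 * delta with a small negative noise,
// decrypted sits just under 3 * delta, the rounding bit is set, rounding adds
// delta, and (decrypted + rounding) / delta is 3 again.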
|
||||
|
||||
TEST_P(BootstrapTestPrimitives_u64, low_latency_bootstrap) {
|
||||
int number_of_sm = 0;
|
||||
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
|
||||
if (number_of_inputs > number_of_sm * 4 / (glwe_dimension + 1) / pbs_level)
|
||||
GTEST_SKIP() << "The Low Latency PBS does not support this configuration";
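  // The bound above mirrors an occupancy constraint: the low latency PBS
  // appears to keep about (glwe_dimension + 1) * pbs_level blocks resident per
  // input, and (as assumed here) the device can host roughly 4 such blocks per
  // SM, hence number_of_inputs must stay below
  // number_of_sm * 4 / (glwe_dimension + 1) / pbs_level.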
|
||||
int bsk_size = (glwe_dimension + 1) * (glwe_dimension + 1) * pbs_level *
|
||||
polynomial_size * (lwe_dimension + 1);
|
||||
// Here execute the PBS
|
||||
for (int r = 0; r < repetitions; r++) {
|
||||
double *d_fourier_bsk = d_fourier_bsk_array + (ptrdiff_t)(bsk_size * r);
|
||||
uint64_t *lwe_sk_out =
|
||||
lwe_sk_out_array + (ptrdiff_t)(r * glwe_dimension * polynomial_size);
|
||||
for (int s = 0; s < samples; s++) {
|
||||
uint64_t *d_lwe_ct_in =
|
||||
d_lwe_ct_in_array +
|
||||
(ptrdiff_t)((r * samples * number_of_inputs + s * number_of_inputs) *
|
||||
(lwe_dimension + 1));
|
||||
// Execute PBS
|
||||
cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
|
||||
stream, gpu_index, (void *)d_lwe_ct_out_array,
|
||||
(void *)d_lut_pbs_identity, (void *)d_lut_pbs_indexes,
|
||||
(void *)d_lwe_ct_in, (void *)d_fourier_bsk, lowlat_pbs_buffer,
|
||||
lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log,
|
||||
pbs_level, number_of_inputs, 1, 0,
|
||||
cuda_get_max_shared_memory(gpu_index));
|
||||
// Copy result back
|
||||
cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array,
|
||||
(glwe_dimension * polynomial_size + 1) *
|
||||
number_of_inputs * sizeof(uint64_t),
|
||||
stream, gpu_index);
|
||||
|
||||
for (int j = 0; j < number_of_inputs; j++) {
|
||||
uint64_t *result =
|
||||
lwe_ct_out_array +
|
||||
(ptrdiff_t)(j * (glwe_dimension * polynomial_size + 1));
|
||||
uint64_t plaintext = plaintexts[r * samples * number_of_inputs +
|
||||
s * number_of_inputs + j];
|
||||
uint64_t decrypted = 0;
|
||||
concrete_cpu_decrypt_lwe_ciphertext_u64(
|
||||
lwe_sk_out, result, glwe_dimension * polynomial_size, &decrypted);
|
||||
EXPECT_NE(decrypted, plaintext);
|
||||
// let err = (decrypted >= plaintext) ? decrypted - plaintext :
|
||||
// plaintext
|
||||
// - decrypted;
|
||||
// error_sample_vec.push(err);
|
||||
|
||||
// The bit before the message
|
||||
uint64_t rounding_bit = delta >> 1;
|
||||
// Compute the rounding bit
|
||||
uint64_t rounding = (decrypted & rounding_bit) << 1;
|
||||
uint64_t decoded = (decrypted + rounding) / delta;
|
||||
EXPECT_EQ(decoded, plaintext / delta);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Defines for which parameter sets the PBS will be tested.
// It executes each test for all pairs in phis X qs (Cartesian product)
::testing::internal::ParamGenerator<BootstrapTestParams> pbs_params_u64 =
|
||||
::testing::Values(
|
||||
// n, k, N, lwe_variance, glwe_variance, pbs_base_log, pbs_level,
|
||||
// message_modulus, carry_modulus, number_of_inputs, repetitions,
|
||||
// samples
|
||||
(BootstrapTestParams){567, 5, 256, 7.52316384526264e-25,
|
||||
7.52316384526264e-25, 15, 1, 2, 1, 5, 2, 5},
|
||||
(BootstrapTestParams){623, 6, 256, 7.52316384526264e-25,
|
||||
7.52316384526264e-25, 9, 3, 2, 2, 5, 2, 50},
|
||||
(BootstrapTestParams){694, 3, 512, 7.52316384526264e-25,
|
||||
7.52316384526264e-25, 18, 1, 2, 1, 5, 2, 50},
|
||||
(BootstrapTestParams){769, 2, 1024, 7.52316384526264e-25,
|
||||
7.52316384526264e-25, 23, 1, 2, 1, 5, 2, 50},
|
||||
(BootstrapTestParams){754, 1, 2048, 7.52316384526264e-25,
|
||||
7.52316384526264e-25, 23, 1, 4, 1, 5, 2, 50},
|
||||
(BootstrapTestParams){847, 1, 4096, 7.52316384526264e-25,
|
||||
7.52316384526264e-25, 2, 12, 2, 1, 2, 1, 50},
|
||||
(BootstrapTestParams){881, 1, 8192, 7.52316384526264e-25,
|
||||
7.52316384526264e-25, 22, 1, 2, 1, 2, 1, 25},
|
||||
(BootstrapTestParams){976, 1, 16384, 7.52316384526264e-25,
|
||||
7.52316384526264e-25, 11, 3, 4, 1, 2, 1, 10});
|
||||
|
||||
std::string printParamName(::testing::TestParamInfo<BootstrapTestParams> p) {
|
||||
BootstrapTestParams params = p.param;
|
||||
|
||||
return "n_" + std::to_string(params.lwe_dimension) + "_k_" +
|
||||
std::to_string(params.glwe_dimension) + "_N_" +
|
||||
std::to_string(params.polynomial_size) + "_pbs_base_log_" +
|
||||
std::to_string(params.pbs_base_log) + "_pbs_level_" +
|
||||
std::to_string(params.pbs_level) + "_number_of_inputs_" +
|
||||
std::to_string(params.number_of_inputs);
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(BootstrapInstantiation, BootstrapTestPrimitives_u64,
|
||||
pbs_params_u64, printParamName);
|
||||
@@ -1,9 +1,6 @@
|
||||
#include "../include/circuit_bootstrap.h"
|
||||
#include "../include/device.h"
|
||||
#include "concrete-cpu.h"
|
||||
#include "utils.h"
|
||||
#include "gtest/gtest.h"
|
||||
#include <cstdint>
|
||||
#include <gtest/gtest.h>
|
||||
#include <setup_and_teardown.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
@@ -49,13 +46,11 @@ protected:
|
||||
int gpu_index = 0;
|
||||
uint64_t *lwe_sk_in_array;
|
||||
uint64_t *lwe_sk_out_array;
|
||||
uint64_t *lwe_in_ct;
|
||||
uint64_t *ggsw_out_ct;
|
||||
uint64_t *plaintexts;
|
||||
double *d_fourier_bsk_array;
|
||||
uint64_t *d_pksk_array;
|
||||
uint64_t *d_lwe_in_ct;
|
||||
uint64_t *d_ggsw_out_ct;
|
||||
uint64_t *d_lwe_ct_in_array;
|
||||
uint64_t *d_ggsw_ct_out_array;
|
||||
uint64_t *d_lut_vector_indexes;
|
||||
int8_t *cbs_buffer;
|
||||
|
||||
@@ -77,90 +72,34 @@ public:
|
||||
cbs_base_log = (int)GetParam().cbs_base_log;
|
||||
cbs_level = (int)GetParam().cbs_level;
|
||||
number_of_inputs = (int)GetParam().number_of_inputs;
|
||||
|
||||
// We generate binary messages
|
||||
number_of_bits_of_message_including_padding = 2;
|
||||
delta_log = 60;
|
||||
delta = (uint64_t)(1) << delta_log;
|
||||
ggsw_size = cbs_level * (glwe_dimension + 1) * (glwe_dimension + 1) *
|
||||
polynomial_size;
|
||||
|
||||
// Create a Csprng
|
||||
csprng =
|
||||
(Csprng *)aligned_alloc(CONCRETE_CSPRNG_ALIGN, CONCRETE_CSPRNG_SIZE);
|
||||
uint8_t seed[16] = {(uint8_t)0};
|
||||
concrete_cpu_construct_concrete_csprng(
|
||||
csprng, Uint128{.little_endian_bytes = {*seed}});
|
||||
|
||||
// Generate the keys
|
||||
generate_lwe_secret_keys(&lwe_sk_in_array, lwe_dimension, csprng,
|
||||
REPETITIONS);
|
||||
generate_lwe_secret_keys(&lwe_sk_out_array,
|
||||
glwe_dimension * polynomial_size, csprng,
|
||||
REPETITIONS);
|
||||
generate_lwe_bootstrap_keys(
|
||||
stream, gpu_index, &d_fourier_bsk_array, lwe_sk_in_array,
|
||||
lwe_sk_out_array, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
pbs_level, pbs_base_log, csprng, glwe_modular_variance, REPETITIONS);
|
||||
generate_lwe_private_functional_keyswitch_key_lists(
|
||||
stream, gpu_index, &d_pksk_array, lwe_sk_out_array, lwe_sk_out_array,
|
||||
glwe_dimension * polynomial_size, glwe_dimension, polynomial_size,
|
||||
pksk_level, pksk_base_log, csprng, lwe_modular_variance, REPETITIONS);
|
||||
plaintexts =
|
||||
generate_plaintexts(number_of_bits_of_message_including_padding, delta,
|
||||
number_of_inputs, REPETITIONS, SAMPLES);
|
||||
|
||||
d_ggsw_out_ct = (uint64_t *)cuda_malloc_async(
|
||||
number_of_inputs * ggsw_size * sizeof(uint64_t), stream, gpu_index);
|
||||
|
||||
d_lwe_in_ct = (uint64_t *)cuda_malloc_async(
|
||||
number_of_inputs * (lwe_dimension + 1) * sizeof(uint64_t), stream,
|
||||
circuit_bootstrap_setup(
|
||||
stream, &csprng, &lwe_sk_in_array, &lwe_sk_out_array,
|
||||
&d_fourier_bsk_array, &d_pksk_array, &plaintexts, &d_lwe_ct_in_array,
|
||||
&d_ggsw_ct_out_array, &d_lut_vector_indexes, &cbs_buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, lwe_modular_variance,
|
||||
glwe_modular_variance, pksk_base_log, pksk_level, pbs_base_log,
|
||||
pbs_level, cbs_level, number_of_bits_of_message_including_padding,
|
||||
ggsw_size, &delta_log, &delta, number_of_inputs, REPETITIONS, SAMPLES,
|
||||
gpu_index);
|
||||
|
||||
lwe_in_ct = (uint64_t *)malloc(number_of_inputs * (lwe_dimension + 1) *
|
||||
sizeof(uint64_t));
|
||||
ggsw_out_ct = (uint64_t *)malloc(ggsw_size * sizeof(uint64_t));
|
||||
// Execute cbs scratch
|
||||
scratch_cuda_circuit_bootstrap_64(
|
||||
stream, gpu_index, &cbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, cbs_level, number_of_inputs,
|
||||
cuda_get_max_shared_memory(gpu_index), true);
|
||||
// Build LUT vector indexes
|
||||
uint64_t *h_lut_vector_indexes =
|
||||
(uint64_t *)malloc(number_of_inputs * cbs_level * sizeof(uint64_t));
|
||||
for (int index = 0; index < cbs_level * number_of_inputs; index++) {
|
||||
h_lut_vector_indexes[index] = index % cbs_level;
|
||||
}
|
||||
d_lut_vector_indexes = (uint64_t *)cuda_malloc_async(
|
||||
number_of_inputs * cbs_level * sizeof(uint64_t), stream, gpu_index);
|
||||
cuda_memcpy_async_to_gpu(d_lut_vector_indexes, h_lut_vector_indexes,
|
||||
number_of_inputs * cbs_level * sizeof(uint64_t),
|
||||
stream, gpu_index);
|
||||
free(h_lut_vector_indexes);
|
||||
}
|
||||
|
||||
void TearDown() {
|
||||
void *v_stream = (void *)stream;
|
||||
|
||||
cuda_synchronize_stream(v_stream);
|
||||
concrete_cpu_destroy_concrete_csprng(csprng);
|
||||
free(csprng);
|
||||
free(lwe_sk_in_array);
|
||||
free(lwe_sk_out_array);
|
||||
free(plaintexts);
|
||||
free(lwe_in_ct);
|
||||
free(ggsw_out_ct);
|
||||
cleanup_cuda_circuit_bootstrap(stream, gpu_index, &cbs_buffer);
|
||||
cuda_drop_async(d_fourier_bsk_array, stream, gpu_index);
|
||||
cuda_drop_async(d_pksk_array, stream, gpu_index);
|
||||
cuda_drop_async(d_lwe_in_ct, stream, gpu_index);
|
||||
cuda_drop_async(d_ggsw_out_ct, stream, gpu_index);
|
||||
cuda_drop_async(d_lut_vector_indexes, stream, gpu_index);
|
||||
cuda_destroy_stream(stream, gpu_index);
|
||||
circuit_bootstrap_teardown(
|
||||
stream, csprng, lwe_sk_in_array, lwe_sk_out_array, d_fourier_bsk_array,
|
||||
d_pksk_array, plaintexts, d_lwe_ct_in_array, d_lut_vector_indexes,
|
||||
d_ggsw_ct_out_array, cbs_buffer, gpu_index);
|
||||
}
|
||||
};
|
||||
|
||||
TEST_P(CircuitBootstrapTestPrimitives_u64, circuit_bootstrap) {
|
||||
void *v_stream = (void *)stream;
|
||||
uint64_t *ggsw_ct_out = (uint64_t *)malloc(ggsw_size * sizeof(uint64_t));
|
||||
for (uint r = 0; r < REPETITIONS; r++) {
|
||||
int bsk_size = (glwe_dimension + 1) * (glwe_dimension + 1) * pbs_level *
|
||||
polynomial_size * (lwe_dimension + 1);
|
||||
@@ -169,27 +108,17 @@ TEST_P(CircuitBootstrapTestPrimitives_u64, circuit_bootstrap) {
|
||||
(glwe_dimension * polynomial_size + 1) *
|
||||
(glwe_dimension + 1);
|
||||
uint64_t *d_pksk_list = d_pksk_array + (ptrdiff_t)(pksk_list_size * r);
|
||||
uint64_t *lwe_in_sk = lwe_sk_in_array + (ptrdiff_t)(lwe_dimension * r);
|
||||
uint64_t *lwe_sk_out =
|
||||
lwe_sk_out_array + (ptrdiff_t)(r * glwe_dimension * polynomial_size);
|
||||
for (uint s = 0; s < SAMPLES; s++) {
|
||||
for (int i = 0; i < number_of_inputs; i++) {
|
||||
uint64_t plaintext = plaintexts[r * SAMPLES * number_of_inputs +
|
||||
s * number_of_inputs + i];
|
||||
concrete_cpu_encrypt_lwe_ciphertext_u64(
|
||||
lwe_in_sk, lwe_in_ct + i * (lwe_dimension + 1), plaintext,
|
||||
lwe_dimension, lwe_modular_variance, csprng,
|
||||
&CONCRETE_CSPRNG_VTABLE);
|
||||
}
|
||||
cuda_synchronize_stream(v_stream);
|
||||
cuda_memcpy_async_to_gpu(d_lwe_in_ct, lwe_in_ct,
|
||||
number_of_inputs * (lwe_dimension + 1) *
|
||||
sizeof(uint64_t),
|
||||
stream, gpu_index);
|
||||
uint64_t *d_lwe_ct_in =
|
||||
d_lwe_ct_in_array +
|
||||
(ptrdiff_t)((r * SAMPLES * number_of_inputs + s * number_of_inputs) *
|
||||
(lwe_dimension + 1));
|
||||
|
||||
// Execute circuit bootstrap
|
||||
cuda_circuit_bootstrap_64(
|
||||
stream, gpu_index, (void *)d_ggsw_out_ct, (void *)d_lwe_in_ct,
|
||||
stream, gpu_index, (void *)d_ggsw_ct_out_array, (void *)d_lwe_ct_in,
|
||||
(void *)d_fourier_bsk, (void *)d_pksk_list,
|
||||
(void *)d_lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
|
||||
glwe_dimension, lwe_dimension, pbs_level, pbs_base_log, pksk_level,
|
||||
@@ -203,9 +132,9 @@ TEST_P(CircuitBootstrapTestPrimitives_u64, circuit_bootstrap) {
|
||||
(uint64_t *)malloc(polynomial_size * (glwe_dimension + 1) *
|
||||
cbs_level * sizeof(uint64_t));
|
||||
// Copy result back
|
||||
cuda_memcpy_async_to_cpu(ggsw_out_ct, d_ggsw_out_ct + i * ggsw_size,
|
||||
ggsw_size * sizeof(uint64_t), stream,
|
||||
gpu_index);
|
||||
cuda_memcpy_async_to_cpu(
|
||||
ggsw_ct_out, d_ggsw_ct_out_array + i * ggsw_size,
|
||||
ggsw_size * sizeof(uint64_t), stream, gpu_index);
|
||||
cuda_synchronize_stream(v_stream);
|
||||
|
||||
uint64_t multiplying_factor = -(plaintext >> delta_log);
|
||||
@@ -215,7 +144,7 @@ TEST_P(CircuitBootstrapTestPrimitives_u64, circuit_bootstrap) {
|
||||
(glwe_dimension + 1) +
|
||||
j * polynomial_size);
|
||||
uint64_t *glwe_ct_out =
|
||||
ggsw_out_ct +
|
||||
ggsw_ct_out +
|
||||
(ptrdiff_t)((l - 1) * polynomial_size * (glwe_dimension + 1) *
|
||||
(glwe_dimension + 1) +
|
||||
j * polynomial_size * (glwe_dimension + 1));
|
||||
@@ -239,7 +168,7 @@ TEST_P(CircuitBootstrapTestPrimitives_u64, circuit_bootstrap) {
|
||||
(glwe_dimension + 1) +
|
||||
glwe_dimension * polynomial_size);
|
||||
uint64_t *glwe_ct_out =
|
||||
ggsw_out_ct +
|
||||
ggsw_ct_out +
|
||||
(ptrdiff_t)((cbs_level - 1) * polynomial_size *
|
||||
(glwe_dimension + 1) * (glwe_dimension + 1) +
|
||||
glwe_dimension * polynomial_size *
|
||||
@@ -258,6 +187,7 @@ TEST_P(CircuitBootstrapTestPrimitives_u64, circuit_bootstrap) {
|
||||
}
|
||||
}
|
||||
}
|
||||
free(ggsw_ct_out);
|
||||
}
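
// Layout reminder for the GGSW checks above: one GGSW ciphertext is
// cbs_level * (glwe_dimension + 1) GLWE rows of
// (glwe_dimension + 1) * polynomial_size coefficients, matching
// ggsw_size = cbs_level * (glwe_dimension + 1) * (glwe_dimension + 1) *
// polynomial_size, and each decomposition level l occupies one block of
// (glwe_dimension + 1) * (glwe_dimension + 1) * polynomial_size entries, as
// used by the indexing above.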
|
||||
|
||||
// Defines for which parameter sets the PBS will be tested.
@@ -1,10 +1,7 @@
|
||||
#include "../include/device.h"
|
||||
#include "../include/vertical_packing.h"
|
||||
#include "concrete-cpu.h"
|
||||
#include "utils.h"
|
||||
#include "gtest/gtest.h"
|
||||
#include <cstdint>
|
||||
#include <functional>
|
||||
#include <gtest/gtest.h>
|
||||
#include <setup_and_teardown.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
const unsigned REPETITIONS = 5;
|
||||
@@ -39,12 +36,15 @@ protected:
|
||||
int gpu_index = 0;
|
||||
uint64_t *glwe_sk;
|
||||
uint64_t *d_lut_identity;
|
||||
int8_t *cmux_tree_buffer = nullptr;
|
||||
uint64_t *d_ggsw_bit_array;
|
||||
uint64_t *d_glwe_out;
|
||||
uint64_t *glwe_out;
|
||||
|
||||
public:
|
||||
// Test arithmetic functions
|
||||
void SetUp() {
|
||||
stream = cuda_create_stream(0);
|
||||
void *v_stream = (void *)stream;
|
||||
|
||||
// TestParams
|
||||
glwe_dimension = (int)GetParam().glwe_dimension;
|
||||
@@ -59,41 +59,20 @@ public:
|
||||
// Value of the shift we multiply our messages by
|
||||
delta = ((uint64_t)(1) << delta_log);
|
||||
|
||||
// Create a Csprng
|
||||
csprng =
|
||||
(Csprng *)aligned_alloc(CONCRETE_CSPRNG_ALIGN, CONCRETE_CSPRNG_SIZE);
|
||||
uint8_t seed[16] = {(uint8_t)0};
|
||||
concrete_cpu_construct_concrete_csprng(
|
||||
csprng, Uint128{.little_endian_bytes = {*seed}});
|
||||
|
||||
// Generate the keys
|
||||
generate_glwe_secret_keys(&glwe_sk, glwe_dimension, polynomial_size, csprng,
|
||||
REPETITIONS);
|
||||
plaintexts = generate_plaintexts(r_lut, 1, 1, REPETITIONS, SAMPLES);
|
||||
|
||||
// Create the LUT
|
||||
int num_lut = (1 << r_lut);
|
||||
d_lut_identity = (uint64_t *)cuda_malloc_async(
|
||||
polynomial_size * num_lut * tau * sizeof(uint64_t), stream, gpu_index);
|
||||
uint64_t *lut_cmux_tree_identity = generate_identity_lut_cmux_tree(
|
||||
polynomial_size, num_lut, tau, delta_log);
|
||||
|
||||
// Copy all LUTs
|
||||
cuda_memcpy_async_to_gpu(d_lut_identity, lut_cmux_tree_identity,
|
||||
polynomial_size * num_lut * tau * sizeof(uint64_t),
|
||||
stream, gpu_index);
|
||||
|
||||
cuda_synchronize_stream(v_stream);
|
||||
free(lut_cmux_tree_identity);
|
||||
cmux_tree_setup(stream, &csprng, &glwe_sk, &d_lut_identity, &plaintexts,
|
||||
&d_ggsw_bit_array, &cmux_tree_buffer, &d_glwe_out,
|
||||
glwe_dimension, polynomial_size, base_log, level_count,
|
||||
glwe_modular_variance, r_lut, tau, delta_log, REPETITIONS,
|
||||
SAMPLES, gpu_index);
|
||||
glwe_out = (uint64_t *)malloc(tau * (glwe_dimension + 1) * polynomial_size *
|
||||
sizeof(uint64_t));
|
||||
}
|
||||
|
||||
void TearDown() {
|
||||
cuda_synchronize_stream(stream);
|
||||
concrete_cpu_destroy_concrete_csprng(csprng);
|
||||
free(plaintexts);
|
||||
free(csprng);
|
||||
cuda_drop_async(d_lut_identity, stream, gpu_index);
|
||||
cuda_destroy_stream(stream, gpu_index);
|
||||
free(glwe_out);
|
||||
cmux_tree_teardown(stream, &csprng, &glwe_sk, &d_lut_identity, &plaintexts,
|
||||
&d_ggsw_bit_array, &cmux_tree_buffer, &d_glwe_out,
|
||||
gpu_index);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -101,54 +80,30 @@ TEST_P(CMUXTreeTestPrimitives_u64, cmux_tree) {
|
||||
int ggsw_size = polynomial_size * (glwe_dimension + 1) *
|
||||
(glwe_dimension + 1) * level_count;
|
||||
int glwe_size = (glwe_dimension + 1) * polynomial_size;
|
||||
uint64_t *d_ggsw_bit_array = (uint64_t *)cuda_malloc_async(
|
||||
r_lut * ggsw_size * sizeof(uint64_t), stream, gpu_index);
|
||||
uint64_t *d_results = (uint64_t *)cuda_malloc_async(
|
||||
tau * glwe_size * sizeof(uint64_t), stream, gpu_index);
|
||||
uint64_t *results = (uint64_t *)malloc(tau * glwe_size * sizeof(uint64_t));
|
||||
uint64_t *ggsw = (uint64_t *)malloc(ggsw_size * sizeof(uint64_t));
|
||||
|
||||
int8_t *cmux_tree_buffer = nullptr;
|
||||
scratch_cuda_cmux_tree_64(stream, gpu_index, &cmux_tree_buffer,
|
||||
glwe_dimension, polynomial_size, level_count, r_lut,
|
||||
tau, cuda_get_max_shared_memory(gpu_index), true);
|
||||
|
||||
// Here execute the PBS
|
||||
for (uint r = 0; r < REPETITIONS; r++) {
|
||||
for (uint s = 0; s < SAMPLES; s++) {
|
||||
uint64_t witness = plaintexts[r * SAMPLES + s];
|
||||
|
||||
// Instantiate the GGSW m^tree ciphertexts
|
||||
// We need r GGSW ciphertexts
|
||||
// Bit decomposition of the value from MSB to LSB
|
||||
uint64_t *bit_array = bit_decompose_value(witness, r_lut);
|
||||
uint64_t *d_ggsw_bit_array_slice =
|
||||
d_ggsw_bit_array +
|
||||
(ptrdiff_t)((r * SAMPLES * r_lut + s * r_lut) * ggsw_size);
|
||||
|
||||
for (int i = 0; i < r_lut; i++) {
|
||||
uint64_t *d_ggsw_slice = d_ggsw_bit_array + i * ggsw_size;
|
||||
concrete_cpu_encrypt_ggsw_ciphertext_u64(
|
||||
glwe_sk, ggsw, bit_array[i], glwe_dimension, polynomial_size,
|
||||
level_count, base_log, glwe_modular_variance, csprng,
|
||||
&CONCRETE_CSPRNG_VTABLE);
|
||||
cuda_memcpy_async_to_gpu(d_ggsw_slice, ggsw,
|
||||
ggsw_size * sizeof(uint64_t), stream,
|
||||
gpu_index);
|
||||
}
|
||||
cuda_synchronize_stream(stream);
|
||||
|
||||
// Execute scratch/CMUX tree/cleanup
|
||||
cuda_cmux_tree_64(stream, gpu_index, (void *)d_results,
|
||||
(void *)d_ggsw_bit_array, (void *)d_lut_identity,
|
||||
// Execute CMUX tree
|
||||
cuda_cmux_tree_64(stream, gpu_index, (void *)d_glwe_out,
|
||||
(void *)d_ggsw_bit_array_slice, (void *)d_lut_identity,
|
||||
cmux_tree_buffer, glwe_dimension, polynomial_size,
|
||||
base_log, level_count, r_lut, tau,
|
||||
cuda_get_max_shared_memory(gpu_index));
|
||||
|
||||
// Copy result back
|
||||
cuda_memcpy_async_to_cpu(results, d_results,
|
||||
cuda_memcpy_async_to_cpu(glwe_out, d_glwe_out,
|
||||
tau * glwe_size * sizeof(uint64_t), stream,
|
||||
gpu_index);
|
||||
cuda_synchronize_stream(stream);
|
||||
for (int tree = 0; tree < tau; tree++) {
|
||||
uint64_t *result = results + tree * glwe_size;
|
||||
uint64_t *result = glwe_out + tree * glwe_size;
|
||||
uint64_t *decrypted =
|
||||
(uint64_t *)malloc(polynomial_size * sizeof(uint64_t));
|
||||
concrete_cpu_decrypt_glwe_ciphertext_u64(
|
||||
@@ -158,27 +113,15 @@ TEST_P(CMUXTreeTestPrimitives_u64, cmux_tree) {
|
||||
// Compute the rounding bit
|
||||
uint64_t rounding = (decrypted[0] & rounding_bit) << 1;
|
||||
uint64_t decoded = (decrypted[0] + rounding) / delta;
|
||||
EXPECT_EQ(decoded, (witness + tree) % (1 << (64 - delta_log)));
|
||||
EXPECT_EQ(decoded, (witness + tree) % (1 << (64 - delta_log)))
|
||||
<< "Repetition: " << r << ", sample: " << s << ", tree: " << tree;
|
||||
free(decrypted);
|
||||
}
|
||||
free(bit_array);
|
||||
}
|
||||
}
|
||||
cuda_synchronize_stream(stream);
|
||||
cleanup_cuda_cmux_tree(stream, gpu_index, &cmux_tree_buffer);
|
||||
free(ggsw);
|
||||
|
||||
cuda_drop_async(d_ggsw_bit_array, stream, gpu_index);
|
||||
}
|
||||
|
||||
int glwe_dimension;
|
||||
int polynomial_size;
|
||||
double glwe_modular_variance;
|
||||
int base_log;
|
||||
int level_count;
|
||||
int message_modulus;
|
||||
int carry_modulus;
|
||||
|
||||
// Defines for which parameters set the PBS will be tested.
|
||||
// It executes each test for all pairs on phis X qs (Cartesian product)
|
||||
::testing::internal::ParamGenerator<CMUXTreeTestParams> cmux_tree_params_u64 =
|
||||
@@ -1,10 +1,6 @@
|
||||
#include "../include/device.h"
|
||||
#include "../include/keyswitch.h"
|
||||
#include "concrete-cpu.h"
|
||||
#include "utils.h"
|
||||
#include "gtest/gtest.h"
|
||||
#include <cstdint>
|
||||
#include <functional>
|
||||
#include <gtest/gtest.h>
|
||||
#include <setup_and_teardown.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
@@ -42,8 +38,8 @@ protected:
|
||||
uint64_t *lwe_sk_out_array;
|
||||
uint64_t *plaintexts;
|
||||
uint64_t *d_ksk_array;
|
||||
uint64_t *d_lwe_out_ct;
|
||||
uint64_t *d_lwe_in_ct;
|
||||
uint64_t *d_lwe_ct_out_array;
|
||||
uint64_t *d_lwe_ct_in_array;
|
||||
uint64_t *lwe_in_ct;
|
||||
uint64_t *lwe_out_ct;
|
||||
|
||||
@@ -51,7 +47,6 @@ public:
|
||||
// Test arithmetic functions
|
||||
void SetUp() {
|
||||
stream = cuda_create_stream(0);
|
||||
void *v_stream = (void *)stream;
|
||||
|
||||
// TestParams
|
||||
input_lwe_dimension = (int)GetParam().input_lwe_dimension;
|
||||
@@ -63,94 +58,42 @@ public:
|
||||
carry_modulus = (int)GetParam().carry_modulus;
|
||||
number_of_inputs = (int)GetParam().number_of_inputs;
|
||||
|
||||
payload_modulus = message_modulus * carry_modulus;
|
||||
// Value of the shift we multiply our messages by
|
||||
delta = ((uint64_t)(1) << 63) / (uint64_t)(payload_modulus);
|
||||
|
||||
// Create a Csprng
|
||||
csprng =
|
||||
(Csprng *)aligned_alloc(CONCRETE_CSPRNG_ALIGN, CONCRETE_CSPRNG_SIZE);
|
||||
uint8_t seed[16] = {(uint8_t)0};
|
||||
concrete_cpu_construct_concrete_csprng(
|
||||
csprng, Uint128{.little_endian_bytes = {*seed}});
|
||||
|
||||
// Generate the keys
|
||||
generate_lwe_secret_keys(&lwe_sk_in_array, input_lwe_dimension, csprng,
|
||||
REPETITIONS);
|
||||
generate_lwe_secret_keys(&lwe_sk_out_array, output_lwe_dimension, csprng,
|
||||
REPETITIONS);
|
||||
generate_lwe_keyswitch_keys(
|
||||
stream, gpu_index, &d_ksk_array, lwe_sk_in_array, lwe_sk_out_array,
|
||||
input_lwe_dimension, output_lwe_dimension, ksk_level, ksk_base_log,
|
||||
csprng, noise_variance, REPETITIONS);
|
||||
plaintexts = generate_plaintexts(payload_modulus, delta, number_of_inputs,
|
||||
REPETITIONS, SAMPLES);
|
||||
|
||||
d_lwe_out_ct = (uint64_t *)cuda_malloc_async(
|
||||
number_of_inputs * (output_lwe_dimension + 1) * sizeof(uint64_t),
|
||||
stream, gpu_index);
|
||||
|
||||
d_lwe_in_ct = (uint64_t *)cuda_malloc_async(
|
||||
number_of_inputs * (input_lwe_dimension + 1) * sizeof(uint64_t), stream,
|
||||
gpu_index);
|
||||
|
||||
lwe_in_ct = (uint64_t *)malloc(
|
||||
number_of_inputs * (input_lwe_dimension + 1) * sizeof(uint64_t));
|
||||
lwe_out_ct = (uint64_t *)malloc(
|
||||
number_of_inputs * (output_lwe_dimension + 1) * sizeof(uint64_t));
|
||||
|
||||
cuda_synchronize_stream(v_stream);
|
||||
keyswitch_setup(stream, &csprng, &lwe_sk_in_array, &lwe_sk_out_array,
|
||||
&d_ksk_array, &plaintexts, &d_lwe_ct_in_array,
|
||||
&d_lwe_ct_out_array, input_lwe_dimension,
|
||||
output_lwe_dimension, noise_variance, ksk_base_log,
|
||||
ksk_level, message_modulus, carry_modulus, &payload_modulus,
|
||||
&delta, number_of_inputs, REPETITIONS, SAMPLES, gpu_index);
|
||||
}
|
||||
|
||||
void TearDown() {
|
||||
void *v_stream = (void *)stream;
|
||||
|
||||
cuda_synchronize_stream(v_stream);
|
||||
concrete_cpu_destroy_concrete_csprng(csprng);
|
||||
free(csprng);
|
||||
cuda_drop_async(d_lwe_in_ct, stream, gpu_index);
|
||||
cuda_drop_async(d_lwe_out_ct, stream, gpu_index);
|
||||
free(lwe_in_ct);
|
||||
free(lwe_out_ct);
|
||||
free(lwe_sk_in_array);
|
||||
free(lwe_sk_out_array);
|
||||
free(plaintexts);
|
||||
cuda_drop_async(d_ksk_array, stream, gpu_index);
|
||||
cuda_destroy_stream(stream, gpu_index);
|
||||
keyswitch_teardown(stream, csprng, lwe_sk_in_array, lwe_sk_out_array,
|
||||
d_ksk_array, plaintexts, d_lwe_ct_in_array,
|
||||
d_lwe_ct_out_array, gpu_index);
|
||||
}
|
||||
};
|
||||
|
||||
TEST_P(KeyswitchTestPrimitives_u64, keyswitch) {
|
||||
void *v_stream = (void *)stream;
|
||||
uint64_t *lwe_out_ct = (uint64_t *)malloc(
|
||||
(output_lwe_dimension + 1) * number_of_inputs * sizeof(uint64_t));
|
||||
for (uint r = 0; r < REPETITIONS; r++) {
|
||||
uint64_t *lwe_in_sk =
|
||||
lwe_sk_in_array + (ptrdiff_t)(r * input_lwe_dimension);
|
||||
uint64_t *lwe_out_sk =
|
||||
lwe_sk_out_array + (ptrdiff_t)(r * output_lwe_dimension);
|
||||
int ksk_size = ksk_level * (output_lwe_dimension + 1) * input_lwe_dimension;
|
||||
uint64_t *d_ksk = d_ksk_array + (ptrdiff_t)(ksk_size * r);
|
||||
for (uint s = 0; s < SAMPLES; s++) {
|
||||
for (int i = 0; i < number_of_inputs; i++) {
|
||||
uint64_t plaintext = plaintexts[r * SAMPLES * number_of_inputs +
|
||||
s * number_of_inputs + i];
|
||||
concrete_cpu_encrypt_lwe_ciphertext_u64(
|
||||
lwe_in_sk, lwe_in_ct + i * (input_lwe_dimension + 1), plaintext,
|
||||
input_lwe_dimension, noise_variance, csprng,
|
||||
&CONCRETE_CSPRNG_VTABLE);
|
||||
}
|
||||
cuda_synchronize_stream(v_stream);
|
||||
cuda_memcpy_async_to_gpu(d_lwe_in_ct, lwe_in_ct,
|
||||
number_of_inputs * (input_lwe_dimension + 1) *
|
||||
sizeof(uint64_t),
|
||||
stream, gpu_index);
|
||||
uint64_t *d_lwe_ct_in =
|
||||
d_lwe_ct_in_array +
|
||||
(ptrdiff_t)((r * SAMPLES * number_of_inputs + s * number_of_inputs) *
|
||||
(input_lwe_dimension + 1));
|
||||
// Execute keyswitch
|
||||
cuda_keyswitch_lwe_ciphertext_vector_64(
|
||||
stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_in_ct,
|
||||
stream, gpu_index, (void *)d_lwe_ct_out_array, (void *)d_lwe_ct_in,
|
||||
(void *)d_ksk, input_lwe_dimension, output_lwe_dimension,
|
||||
ksk_base_log, ksk_level, number_of_inputs);
|
||||
|
||||
// Copy result back
|
||||
cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct,
|
||||
cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_ct_out_array,
|
||||
number_of_inputs * (output_lwe_dimension + 1) *
|
||||
sizeof(uint64_t),
|
||||
stream, gpu_index);
|
||||
@@ -162,11 +105,6 @@ TEST_P(KeyswitchTestPrimitives_u64, keyswitch) {
|
||||
lwe_out_sk, lwe_out_ct + i * (output_lwe_dimension + 1),
|
||||
output_lwe_dimension, &decrypted);
|
||||
EXPECT_NE(decrypted, plaintext);
|
||||
// let err = (decrypted >= plaintext) ? decrypted - plaintext :
|
||||
// plaintext
|
||||
// - decrypted;
|
||||
// error_sample_vec.push(err);
|
||||
|
||||
// The bit before the message
|
||||
uint64_t rounding_bit = delta >> 1;
|
||||
// Compute the rounding bit
|
||||
@@ -176,6 +114,7 @@ TEST_P(KeyswitchTestPrimitives_u64, keyswitch) {
|
||||
}
|
||||
}
|
||||
}
|
||||
free(lwe_out_ct);
|
||||
}
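
// Encoding note: payload_modulus = message_modulus * carry_modulus and
// delta = 2^63 / payload_modulus, so for message_modulus = carry_modulus = 2
// (illustrative values) the payload is 4 and delta = 2^61. The keyswitch must
// keep the accumulated noise below delta / 2 for the rounding step above to
// decode back to the original message.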
|
||||
|
||||
// Defines for which parameter sets the PBS will be tested.
@@ -0,0 +1,269 @@
|
||||
#include <cstdint>
|
||||
#include <gtest/gtest.h>
|
||||
#include <setup_and_teardown.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
const unsigned REPETITIONS = 5;
|
||||
const unsigned SAMPLES = 100;
|
||||
|
||||
typedef struct {
|
||||
int lwe_dimension;
|
||||
double noise_variance;
|
||||
int message_modulus;
|
||||
int carry_modulus;
|
||||
int number_of_inputs;
|
||||
} LinearAlgebraTestParams;
|
||||
|
||||
class LinearAlgebraTestPrimitives_u64
|
||||
: public ::testing::TestWithParam<LinearAlgebraTestParams> {
|
||||
protected:
|
||||
int lwe_dimension;
|
||||
double noise_variance;
|
||||
int message_modulus;
|
||||
int carry_modulus;
|
||||
int number_of_inputs;
|
||||
int payload_modulus;
|
||||
uint64_t delta;
|
||||
Csprng *csprng;
|
||||
cudaStream_t *stream;
|
||||
int gpu_index = 0;
|
||||
uint64_t *lwe_sk_array;
|
||||
uint64_t *d_lwe_in_1_ct;
|
||||
uint64_t *d_lwe_in_2_ct;
|
||||
uint64_t *d_plaintext_2;
|
||||
uint64_t *d_cleartext;
|
||||
uint64_t *d_lwe_out_ct;
|
||||
uint64_t *lwe_in_1_ct;
|
||||
uint64_t *lwe_in_2_ct;
|
||||
uint64_t *lwe_out_ct;
|
||||
uint64_t *plaintexts_1;
|
||||
uint64_t *plaintexts_2;
|
||||
int num_samples;
|
||||
|
||||
public:
|
||||
// Test arithmetic functions
|
||||
void SetUp() {
|
||||
stream = cuda_create_stream(0);
|
||||
|
||||
// TestParams
|
||||
lwe_dimension = (int)GetParam().lwe_dimension;
|
||||
noise_variance = (double)GetParam().noise_variance;
|
||||
message_modulus = (int)GetParam().message_modulus;
|
||||
carry_modulus = (int)GetParam().carry_modulus;
|
||||
number_of_inputs = (int)GetParam().number_of_inputs;
|
||||
|
||||
payload_modulus = message_modulus * carry_modulus;
|
||||
// Value of the shift we multiply our messages by
|
||||
// In this test we use a smaller delta to avoid an overflow during
|
||||
// multiplication
|
||||
delta =
|
||||
((uint64_t)(1) << 63) / (uint64_t)(payload_modulus * payload_modulus);
|
||||
|
||||
linear_algebra_setup(stream, &csprng, &lwe_sk_array, &d_lwe_in_1_ct,
|
||||
&d_lwe_in_2_ct, &d_lwe_out_ct, &lwe_in_1_ct,
|
||||
&lwe_in_2_ct, &lwe_out_ct, &plaintexts_1,
|
||||
&plaintexts_2, &d_plaintext_2, &d_cleartext,
|
||||
lwe_dimension, noise_variance, payload_modulus, delta,
|
||||
number_of_inputs, REPETITIONS, SAMPLES, gpu_index);
|
||||
}
|
||||
|
||||
void TearDown() {
|
||||
linear_algebra_teardown(
|
||||
stream, &csprng, &lwe_sk_array, &d_lwe_in_1_ct, &d_lwe_in_2_ct,
|
||||
&d_lwe_out_ct, &lwe_in_1_ct, &lwe_in_2_ct, &lwe_out_ct, &plaintexts_1,
|
||||
&plaintexts_2, &d_plaintext_2, &d_cleartext, gpu_index);
|
||||
}
|
||||
};
|
||||
|
||||
TEST_P(LinearAlgebraTestPrimitives_u64, addition) {
|
||||
void *v_stream = (void *)stream;
|
||||
// Here execute the PBS
|
||||
for (uint r = 0; r < REPETITIONS; r++) {
|
||||
uint64_t *lwe_sk = lwe_sk_array + (ptrdiff_t)(r * lwe_dimension);
|
||||
for (uint s = 0; s < SAMPLES; s++) {
|
||||
uint64_t *d_lwe_1_in =
|
||||
d_lwe_in_1_ct +
|
||||
(ptrdiff_t)((r * SAMPLES * number_of_inputs + s * number_of_inputs) *
|
||||
(lwe_dimension + 1));
|
||||
uint64_t *d_lwe_2_in =
|
||||
d_lwe_in_2_ct +
|
||||
(ptrdiff_t)((r * SAMPLES * number_of_inputs + s * number_of_inputs) *
|
||||
(lwe_dimension + 1));
|
||||
// Execute addition
|
||||
cuda_add_lwe_ciphertext_vector_64(stream, gpu_index, (void *)d_lwe_out_ct,
|
||||
(void *)d_lwe_1_in, (void *)d_lwe_2_in,
|
||||
lwe_dimension, number_of_inputs);
|
||||
|
||||
// Copy result back
|
||||
cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct,
|
||||
number_of_inputs * (lwe_dimension + 1) *
|
||||
sizeof(uint64_t),
|
||||
stream, gpu_index);
|
||||
cuda_synchronize_stream(v_stream);
|
||||
for (int i = 0; i < number_of_inputs; i++) {
|
||||
uint64_t plaintext_1 = plaintexts_1[r * SAMPLES * number_of_inputs +
|
||||
s * number_of_inputs + i];
|
||||
uint64_t plaintext_2 = plaintexts_2[r * SAMPLES * number_of_inputs +
|
||||
s * number_of_inputs + i];
|
||||
uint64_t decrypted = 0;
|
||||
concrete_cpu_decrypt_lwe_ciphertext_u64(
|
||||
lwe_sk, lwe_out_ct + i * (lwe_dimension + 1), lwe_dimension,
|
||||
&decrypted);
|
||||
// The bit before the message
|
||||
uint64_t rounding_bit = delta >> 1;
|
||||
// Compute the rounding bit
|
||||
uint64_t rounding = (decrypted & rounding_bit) << 1;
|
||||
uint64_t decoded = (decrypted + rounding) / delta;
|
||||
EXPECT_EQ(decoded, (plaintext_1 + plaintext_2) / delta)
|
||||
<< "Repetition: " << r << ", sample: " << s;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
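
// The sum above is compared against (plaintext_1 + plaintext_2) / delta with
// no modular reduction: with the fixture's smaller
// delta = 2^63 / payload_modulus^2 and messages below payload_modulus
// (assumed from how the plaintexts are generated), the sum of two encodings
// stays well under 2^64, so no wrap-around can occur.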
|
||||
|
||||
TEST_P(LinearAlgebraTestPrimitives_u64, plaintext_addition) {
|
||||
// Here execute the PBS
|
||||
for (uint r = 0; r < REPETITIONS; r++) {
|
||||
uint64_t *lwe_sk = lwe_sk_array + (ptrdiff_t)(r * lwe_dimension);
|
||||
for (uint s = 0; s < SAMPLES; s++) {
|
||||
uint64_t *d_lwe_1_slice =
|
||||
d_lwe_in_1_ct +
|
||||
(ptrdiff_t)((r * SAMPLES * number_of_inputs + s * number_of_inputs) *
|
||||
(lwe_dimension + 1));
|
||||
uint64_t *d_plaintext_2_in =
|
||||
d_plaintext_2 +
|
||||
(ptrdiff_t)((r * SAMPLES * number_of_inputs + s * number_of_inputs));
|
||||
// Execute addition
|
||||
cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
|
||||
stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_1_slice,
|
||||
(void *)d_plaintext_2_in, lwe_dimension, number_of_inputs);
|
||||
// Copy result back
|
||||
cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct,
|
||||
number_of_inputs * (lwe_dimension + 1) *
|
||||
sizeof(uint64_t),
|
||||
stream, gpu_index);
|
||||
for (int i = 0; i < number_of_inputs; i++) {
|
||||
uint64_t plaintext_1 = plaintexts_1[r * SAMPLES * number_of_inputs +
|
||||
s * number_of_inputs + i];
|
||||
uint64_t plaintext_2 = plaintexts_2[r * SAMPLES * number_of_inputs +
|
||||
s * number_of_inputs + i];
|
||||
uint64_t decrypted = 0;
|
||||
concrete_cpu_decrypt_lwe_ciphertext_u64(
|
||||
lwe_sk, lwe_out_ct + i * (lwe_dimension + 1), lwe_dimension,
|
||||
&decrypted);
|
||||
// The bit before the message
|
||||
uint64_t rounding_bit = delta >> 1;
|
||||
// Compute the rounding bit
|
||||
uint64_t rounding = (decrypted & rounding_bit) << 1;
|
||||
uint64_t decoded = (decrypted + rounding) / delta;
|
||||
EXPECT_EQ(decoded, (plaintext_1 + plaintext_2) / delta)
|
||||
<< "Repetition: " << r << ", sample: " << s << " i: " << i << ") "
|
||||
<< plaintext_1 / delta << " + " << plaintext_2 / delta;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
TEST_P(LinearAlgebraTestPrimitives_u64, cleartext_multiplication) {
|
||||
void *v_stream = (void *)stream;
|
||||
// Here execute the PBS
|
||||
for (uint r = 0; r < REPETITIONS; r++) {
|
||||
uint64_t *lwe_sk = lwe_sk_array + (ptrdiff_t)(r * lwe_dimension);
|
||||
for (uint s = 0; s < SAMPLES; s++) {
|
||||
uint64_t *d_lwe_1_slice =
|
||||
d_lwe_in_1_ct +
|
||||
(ptrdiff_t)((r * SAMPLES * number_of_inputs + s * number_of_inputs) *
|
||||
(lwe_dimension + 1));
|
||||
uint64_t *d_cleartext_in =
|
||||
d_cleartext +
|
||||
(ptrdiff_t)((r * SAMPLES * number_of_inputs + s * number_of_inputs));
|
||||
// Execute cleartext multiplication
|
||||
cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
|
||||
stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_1_slice,
|
||||
(void *)d_cleartext_in, lwe_dimension, number_of_inputs);
|
||||
// Copy result back
|
||||
cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct,
|
||||
number_of_inputs * (lwe_dimension + 1) *
|
||||
sizeof(uint64_t),
|
||||
stream, gpu_index);
|
||||
cuda_synchronize_stream(v_stream);
|
||||
for (int i = 0; i < number_of_inputs; i++) {
|
||||
uint64_t cleartext_1 = plaintexts_1[r * SAMPLES * number_of_inputs +
|
||||
s * number_of_inputs + i] /
|
||||
delta;
|
||||
uint64_t cleartext_2 = plaintexts_2[r * SAMPLES * number_of_inputs +
|
||||
s * number_of_inputs + i] /
|
||||
delta;
|
||||
uint64_t decrypted = 0;
|
||||
concrete_cpu_decrypt_lwe_ciphertext_u64(
|
||||
lwe_sk, lwe_out_ct + i * (lwe_dimension + 1), lwe_dimension,
|
||||
&decrypted);
|
||||
// The bit before the message
|
||||
uint64_t rounding_bit = delta >> 1;
|
||||
// Compute the rounding bit
|
||||
uint64_t rounding = (decrypted & rounding_bit) << 1;
|
||||
uint64_t decoded = (decrypted + rounding) / delta;
|
||||
EXPECT_EQ(decoded, cleartext_1 * cleartext_2)
|
||||
<< "Repetition: " << r << ", sample: " << s << " i: " << i
|
||||
<< ", decrypted: " << decrypted;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
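
// With delta = 2^63 / payload_modulus^2, the product of two cleartexts that
// are each below payload_modulus (assumed from the plaintext generation) stays
// below payload_modulus^2, so cleartext_1 * cleartext_2 * delta still fits
// under 2^63 and the decoding above recovers the exact product.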

TEST_P(LinearAlgebraTestPrimitives_u64, negate) {
  void *v_stream = (void *)stream;
  // Here execute the negation
  for (uint r = 0; r < REPETITIONS; r++) {
    uint64_t *lwe_sk = lwe_sk_array + (ptrdiff_t)(r * lwe_dimension);
    for (uint s = 0; s < SAMPLES; s++) {
      uint64_t *d_lwe_1_slice =
          d_lwe_in_1_ct +
          (ptrdiff_t)((r * SAMPLES * number_of_inputs + s * number_of_inputs) *
                      (lwe_dimension + 1));
      // Execute negate
      cuda_negate_lwe_ciphertext_vector_64(
          stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_1_slice,
          lwe_dimension, number_of_inputs);

      // Copy result back
      cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct,
                               number_of_inputs * (lwe_dimension + 1) *
                                   sizeof(uint64_t),
                               stream, gpu_index);
      // Wait for the async copy to finish before decrypting on the host
      cuda_synchronize_stream(v_stream);
      for (int i = 0; i < number_of_inputs; i++) {
        uint64_t plaintext = plaintexts_1[r * SAMPLES * number_of_inputs +
                                          s * number_of_inputs + i];
        uint64_t decrypted = 0;
        concrete_cpu_decrypt_lwe_ciphertext_u64(
            lwe_sk, lwe_out_ct + i * (lwe_dimension + 1), lwe_dimension,
            &decrypted);
        // The bit before the message
        uint64_t rounding_bit = delta >> 1;
        // Compute the rounding bit
        uint64_t rounding = (decrypted & rounding_bit) << 1;
        uint64_t decoded = (decrypted + rounding) / delta;
        EXPECT_EQ(decoded, -plaintext / delta)
            << "Repetition: " << r << ", sample: " << s << " i: " << i;
      }
    }
  }
}
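The expected value -plaintext / delta relies on unsigned wrap-around: negating a uint64_t plaintext yields 2^64 - plaintext, so the expectation and the decoded result live in the same modular encoding. A small standalone check with illustrative values (delta = 2^60, message = 3; not taken from the parameter sets in this file):

#include <cassert>
#include <cstdint>

// Unsigned negation wraps modulo 2^64: -plaintext evaluates to
// 2^64 - plaintext, so dividing by delta gives (2^64 / delta) - message,
// which is what decrypting the negated ciphertext decodes to.
int main() {
  const uint64_t delta = uint64_t(1) << 60;      // hypothetical scaling factor
  const uint64_t message = 3;
  const uint64_t plaintext = message * delta;
  const uint64_t expected = -plaintext / delta;  // wraps before the division
  assert(expected == (uint64_t(1) << 4) - message);  // 2^64 / delta == 16
  return 0;
}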

// Defines for which parameter sets the linear algebra operations will be
// tested. Each test is executed for every parameter set in the list below.
::testing::internal::ParamGenerator<LinearAlgebraTestParams>
    linear_algebra_params_u64 = ::testing::Values(
        // n, lwe_std_dev, message_modulus, carry_modulus, number_of_inputs
        (LinearAlgebraTestParams){600, 7.52316384526264e-37, 2, 2, 10});

std::string
printParamName(::testing::TestParamInfo<LinearAlgebraTestParams> p) {
  LinearAlgebraTestParams params = p.param;

  return "n_" + std::to_string(params.lwe_dimension);
}

INSTANTIATE_TEST_CASE_P(LinearAlgebraInstantiation,
                        LinearAlgebraTestPrimitives_u64,
                        linear_algebra_params_u64, printParamName);
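Only one parameter set is registered above. Further sets can be appended to the same ::testing::Values call; a hypothetical second entry (values chosen purely for illustration, not validated parameters) would look like the sketch below. Note that printParamName derives the test-name suffix from lwe_dimension only, so each entry should use a distinct n.

::testing::internal::ParamGenerator<LinearAlgebraTestParams>
    linear_algebra_params_u64 = ::testing::Values(
        // n, lwe_std_dev, message_modulus, carry_modulus, number_of_inputs
        (LinearAlgebraTestParams){600, 7.52316384526264e-37, 2, 2, 10},
        // hypothetical extra entry, for illustration only
        (LinearAlgebraTestParams){750, 2.9802322387695312e-8, 4, 4, 5});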

@@ -1,10 +1,6 @@
#include "../include/bootstrap.h"
#include "../include/device.h"
#include "concrete-cpu.h"
#include "utils.h"
#include "gtest/gtest.h"
#include <cmath>
#include <cstdint>
#include <gtest/gtest.h>
#include <setup_and_teardown.h>
#include <stdio.h>
#include <stdlib.h>

@@ -47,7 +43,7 @@ protected:
  int tau;
  int p;
  uint64_t delta;
  uint32_t cbs_delta_log;
  int cbs_delta_log;
  int delta_log;
  int delta_log_lut;
  Csprng *csprng;
@@ -71,7 +67,6 @@ public:
  // Test arithmetic functions
  void SetUp() {
    stream = cuda_create_stream(0);
    void *v_stream = (void *)stream;

    // TestParams
    lwe_dimension = (int)GetParam().lwe_dimension;
@@ -89,104 +84,30 @@ public:
    cbs_level = (int)GetParam().cbs_level;
    tau = (int)GetParam().tau;
    p = 10 / tau;
    delta_log = 64 - p;
    delta_log_lut = delta_log;
    delta = (uint64_t)(1) << delta_log;

    // Create a Csprng
    csprng =
        (Csprng *)aligned_alloc(CONCRETE_CSPRNG_ALIGN, CONCRETE_CSPRNG_SIZE);
    uint8_t seed[16] = {(uint8_t)0};
    concrete_cpu_construct_concrete_csprng(
        csprng, Uint128{.little_endian_bytes = {*seed}});

    input_lwe_dimension = glwe_dimension * polynomial_size;
    // Generate the keys
    generate_lwe_secret_keys(&lwe_sk_in_array, input_lwe_dimension, csprng,
                             REPETITIONS);
    generate_lwe_secret_keys(&lwe_sk_out_array, lwe_dimension, csprng,
                             REPETITIONS);
    generate_lwe_keyswitch_keys(
        stream, gpu_index, &d_ksk_array, lwe_sk_in_array, lwe_sk_out_array,
        input_lwe_dimension, lwe_dimension, ks_level, ks_base_log, csprng,
        lwe_modular_variance, REPETITIONS);
    generate_lwe_bootstrap_keys(
        stream, gpu_index, &d_fourier_bsk_array, lwe_sk_out_array,
        lwe_sk_in_array, lwe_dimension, glwe_dimension, polynomial_size,
        pbs_level, pbs_base_log, csprng, glwe_modular_variance, REPETITIONS);
    generate_lwe_private_functional_keyswitch_key_lists(
        stream, gpu_index, &d_pksk_array, lwe_sk_in_array, lwe_sk_in_array,
        input_lwe_dimension, glwe_dimension, polynomial_size, pksk_level,
        pksk_base_log, csprng, lwe_modular_variance, REPETITIONS);
    plaintexts = generate_plaintexts(p, delta, tau, REPETITIONS, SAMPLES);

    // LUT creation
    int lut_size = polynomial_size;
    int lut_num = tau << (tau * p - (int)log2(polynomial_size)); // r

    uint64_t *big_lut =
        (uint64_t *)malloc(lut_num * lut_size * sizeof(uint64_t));
    for (int t = tau - 1; t >= 0; t--) {
      uint64_t *small_lut = big_lut + (ptrdiff_t)(t * (1 << (tau * p)));
      for (uint64_t value = 0; value < (uint64_t)(1 << (tau * p)); value++) {
        int nbits = t * p;
        uint64_t x = (value >> nbits) & (uint64_t)((1 << p) - 1);
        small_lut[value] =
            ((x % (uint64_t)(1 << (64 - delta_log))) << delta_log_lut);
      }
    }
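    // Size check with illustrative values (hypothetical, not tied to the
    // parameter sets registered for this fixture): tau = 2 gives
    // p = 10 / tau = 5, so each small LUT spans 2^(tau * p) = 1024 entries;
    // with polynomial_size = 1024 this makes
    // lut_num = tau << (tau * p - log2(polynomial_size)) = 2, and big_lut
    // holds lut_num * lut_size = 2 * 1024 words in total.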
    d_lut_vector = (uint64_t *)cuda_malloc_async(
        lut_num * lut_size * sizeof(uint64_t), stream, gpu_index);
    cuda_memcpy_async_to_gpu(d_lut_vector, big_lut,
                             lut_num * lut_size * sizeof(uint64_t), stream,
                             gpu_index);
    // Execute scratch
    scratch_cuda_wop_pbs_64(stream, gpu_index, &wop_pbs_buffer,
                            (uint32_t *)&delta_log, &cbs_delta_log,
                            glwe_dimension, lwe_dimension, polynomial_size,
                            cbs_level, pbs_level, p, p, tau,
                            cuda_get_max_shared_memory(gpu_index), true);
    // Allocate input
    d_lwe_ct_in_array = (uint64_t *)cuda_malloc_async(
        (input_lwe_dimension + 1) * tau * sizeof(uint64_t), stream, gpu_index);
    // Allocate output
    d_lwe_ct_out_array = (uint64_t *)cuda_malloc_async(
        (input_lwe_dimension + 1) * tau * sizeof(uint64_t), stream, gpu_index);
    lwe_in_ct_array =
        (uint64_t *)malloc((input_lwe_dimension + 1) * tau * sizeof(uint64_t));
    lwe_out_ct_array =
        (uint64_t *)malloc((input_lwe_dimension + 1) * tau * sizeof(uint64_t));

    cuda_synchronize_stream(v_stream);
    free(big_lut);
    wop_pbs_setup(
        stream, &csprng, &lwe_sk_in_array, &lwe_sk_out_array, &d_ksk_array,
        &d_fourier_bsk_array, &d_pksk_array, &plaintexts, &d_lwe_ct_in_array,
        &d_lwe_ct_out_array, &d_lut_vector, &wop_pbs_buffer, lwe_dimension,
        glwe_dimension, polynomial_size, lwe_modular_variance,
        glwe_modular_variance, ks_base_log, ks_level, pksk_base_log, pksk_level,
        pbs_base_log, pbs_level, cbs_level, p, &delta_log, &cbs_delta_log,
        &delta_log_lut, &delta, tau, REPETITIONS, SAMPLES, gpu_index);
  }

  void TearDown() {
    void *v_stream = (void *)stream;

    cuda_synchronize_stream(v_stream);
    concrete_cpu_destroy_concrete_csprng(csprng);
    free(csprng);
    free(lwe_sk_in_array);
    free(lwe_sk_out_array);
    free(plaintexts);
    free(lwe_in_ct_array);
    free(lwe_out_ct_array);
    cleanup_cuda_circuit_bootstrap_vertical_packing(stream, gpu_index,
                                                    &wop_pbs_buffer);
    cuda_drop_async(d_fourier_bsk_array, stream, gpu_index);
    cuda_drop_async(d_ksk_array, stream, gpu_index);
    cuda_drop_async(d_pksk_array, stream, gpu_index);
    cuda_drop_async(d_lwe_ct_in_array, stream, gpu_index);
    cuda_drop_async(d_lwe_ct_out_array, stream, gpu_index);
    cuda_drop_async(d_lut_vector, stream, gpu_index);
    cuda_destroy_stream(stream, gpu_index);
    wop_pbs_teardown(stream, csprng, lwe_sk_in_array, lwe_sk_out_array,
                     d_ksk_array, d_fourier_bsk_array, d_pksk_array, plaintexts,
                     d_lwe_ct_in_array, d_lut_vector, d_lwe_ct_out_array,
                     wop_pbs_buffer, gpu_index);
  }
};

TEST_P(WopBootstrapTestPrimitives_u64, wop_pbs) {
  void *v_stream = (void *)stream;
  int input_lwe_dimension = glwe_dimension * polynomial_size;
  uint64_t *lwe_out_ct_array =
      (uint64_t *)malloc((input_lwe_dimension + 1) * tau * sizeof(uint64_t));
  int bsk_size = (glwe_dimension + 1) * (glwe_dimension + 1) * pbs_level *
                 polynomial_size * (lwe_dimension + 1);
  int ksk_size =
@@ -196,33 +117,23 @@ TEST_P(WopBootstrapTestPrimitives_u64, wop_pbs) {
      (glwe_dimension + 1);
  for (uint r = 0; r < REPETITIONS; r++) {
    double *d_fourier_bsk = d_fourier_bsk_array + (ptrdiff_t)(bsk_size * r);
    uint64_t *lwe_sk_in =
        lwe_sk_in_array + (ptrdiff_t)(r * glwe_dimension * polynomial_size);
    uint64_t *d_ksk = d_ksk_array + (ptrdiff_t)(ksk_size * r);
    uint64_t *d_pksk_list = d_pksk_array + (ptrdiff_t)(pksk_list_size * r);
    uint64_t *lwe_sk_in =
        lwe_sk_in_array + (ptrdiff_t)(input_lwe_dimension * r);
    for (uint s = 0; s < SAMPLES; s++) {
      for (int t = 0; t < tau; t++) {
        uint64_t plaintext = plaintexts[r * SAMPLES * tau + s * tau + t];
        uint64_t *lwe_in_ct =
            lwe_in_ct_array + (ptrdiff_t)(t * (input_lwe_dimension + 1));
        concrete_cpu_encrypt_lwe_ciphertext_u64(
            lwe_sk_in, lwe_in_ct, plaintext, input_lwe_dimension,
            lwe_modular_variance, csprng, &CONCRETE_CSPRNG_VTABLE);
      }
      cuda_memcpy_async_to_gpu(d_lwe_ct_in_array, lwe_in_ct_array,
                               (input_lwe_dimension + 1) * tau *
                                   sizeof(uint64_t),
                               stream, gpu_index);
      uint64_t *d_lwe_ct_in =
          d_lwe_ct_in_array + (ptrdiff_t)((r * SAMPLES * tau + s * tau) *
                                          (input_lwe_dimension + 1));

      // Execute wop pbs
      cuda_wop_pbs_64(stream, gpu_index, (void *)d_lwe_ct_out_array,
                      (void *)d_lwe_ct_in_array, (void *)d_lut_vector,
                      (void *)d_fourier_bsk, (void *)d_ksk, (void *)d_pksk_list,
                      wop_pbs_buffer, cbs_delta_log, glwe_dimension,
                      lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
                      ks_base_log, ks_level, pksk_base_log, pksk_level,
                      cbs_base_log, cbs_level, p, p, delta_log, tau,
                      cuda_get_max_shared_memory(gpu_index));
      cuda_wop_pbs_64(
          stream, gpu_index, (void *)d_lwe_ct_out_array, (void *)d_lwe_ct_in,
          (void *)d_lut_vector, (void *)d_fourier_bsk, (void *)d_ksk,
          (void *)d_pksk_list, wop_pbs_buffer, cbs_delta_log, glwe_dimension,
          lwe_dimension, polynomial_size, pbs_base_log, pbs_level, ks_base_log,
          ks_level, pksk_base_log, pksk_level, cbs_base_log, cbs_level, p, p,
          delta_log, tau, cuda_get_max_shared_memory(gpu_index));

      //// Copy result back
      cuda_memcpy_async_to_cpu(lwe_out_ct_array, d_lwe_ct_out_array,
@@ -1,12 +1,12 @@
#include "utils.h"
#include "../include/bootstrap.h"
#include "../include/device.h"
#include "concrete-cpu.h"
#include <bootstrap.h>
#include <cmath>
#include <concrete-cpu.h>
#include <cstdint>
#include <cstdlib>
#include <device.h>
#include <functional>
#include <random>
#include <utils.h>

// For each sample and repetition, create a plaintext
// The payload_modulus is the message modulus times the carry modulus
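The definition this comment introduces is not shown here. As a rough, hypothetical sketch only (the real helper's name, signature, and randomness source may differ), a generator matching the description above could look like the following, using only headers already included in this file:

// Hypothetical sketch, not the actual implementation: draw one uniform
// message below payload_modulus per repetition, sample and input, and
// encode it by scaling with delta.
uint64_t *generate_plaintexts_sketch(uint64_t payload_modulus, uint64_t delta,
                                     int number_of_inputs, int repetitions,
                                     int samples) {
  auto *out = (uint64_t *)malloc((size_t)repetitions * samples *
                                 number_of_inputs * sizeof(uint64_t));
  std::mt19937_64 gen(0);
  std::uniform_int_distribution<uint64_t> dist(0, payload_modulus - 1);
  for (int i = 0; i < repetitions * samples * number_of_inputs; i++)
    out[i] = dist(gen) * delta;
  return out;
}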