test(concrete_cuda): add C++ tests to concrete-cuda

This commit is contained in:
Agnes Leroy
2023-03-13 15:42:30 +01:00
committed by Agnès Leroy
parent 7d7780bd23
commit 39800f2d8a
14 changed files with 2245 additions and 27 deletions

View File

@@ -39,7 +39,7 @@ jobs:
with:
mode: start
github-token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
ec2-image-id: ami-0c4d39cb3dba0fcff
ec2-image-id: ami-03f11dc8c6a5f5c0a
ec2-instance-type: p3.2xlarge
subnet-id: subnet-8123c9e7
security-group-id: sg-0466d33ced960ba35
@@ -91,31 +91,31 @@ jobs:
cd build
cmake ..
make -j8
#- name: Test concrete-cuda with Cuda 11.8
# if: ${{ !cancelled() }}
# run: |
# cd backends/concrete-cuda/implementation/build
# ./test/test_concrete_cuda
- name: Test concrete-cuda with Cuda 11.8
if: ${{ !cancelled() }}
run: |
cd backends/concrete-cuda/implementation/build
./test/test_concrete_cuda
#- name: Export variables for CUDA 11.1
# run: |
# echo "CUDA_PATH=$OLD_CUDA_PATH" >> "${GITHUB_ENV}"
# echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
# echo "LD_LIBRARY_PATH=$OLD_CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
# echo "CUDACXX=$OLD_CUDA_PATH/bin/nvcc" >> "${GITHUB_ENV}"
#- name: Build concrete-cuda with Cuda 11.1
# if: ${{ !cancelled() }}
# run: |
# cd backends/concrete-cuda/implementation
# mkdir build-old-cuda
# cd build-old-cuda
# cmake ..
# make -j8
#- name: Test concrete-cuda with Cuda 11.1
# if: ${{ !cancelled() }}
# run: |
# cd backends/concrete-cuda/implementation/build-old-cuda
# ./test/test_concrete_cuda
- name: Export variables for CUDA 11.1
run: |
echo "CUDA_PATH=$OLD_CUDA_PATH" >> "${GITHUB_ENV}"
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
echo "LD_LIBRARY_PATH=$OLD_CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
echo "CUDACXX=$OLD_CUDA_PATH/bin/nvcc" >> "${GITHUB_ENV}"
- name: Build concrete-cuda with Cuda 11.1
if: ${{ !cancelled() }}
run: |
cd backends/concrete-cuda/implementation
mkdir build-old-cuda
cd build-old-cuda
cmake ..
make -j8
- name: Test concrete-cuda with Cuda 11.1
if: ${{ !cancelled() }}
run: |
cd backends/concrete-cuda/implementation/build-old-cuda
./test/test_concrete_cuda
- name: Slack Notification
if: ${{ always() }}

View File

@@ -1,8 +1,6 @@
cmake_minimum_required(VERSION 3.8 FATAL_ERROR)
project(concrete_cuda LANGUAGES CXX CUDA)
include(CTest)
# See if the minimum CUDA version is available. If not, only enable documentation building.
set(MINIMUM_SUPPORTED_CUDA_VERSION 10.0)
include(CheckLanguage)
@@ -69,6 +67,7 @@ set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -ccbin ${CMAKE_CXX_COMPILER} -O3 ${CUD
set(INCLUDE_DIR include)
add_subdirectory(src)
add_subdirectory(test)
add_subdirectory(parameters)
target_include_directories(concrete_cuda PRIVATE ${INCLUDE_DIR})
@@ -87,3 +86,6 @@ if (CPPLINT)
set_target_properties(all_lint PROPERTIES EXCLUDE_FROM_ALL TRUE)
# set_target_properties(all_lint PROPERTIES EXCLUDE_FROM_DEFAULT_BUILD TRUE)
endif ()
enable_testing()

View File

@@ -32,6 +32,10 @@ int cuda_memcpy_to_gpu(void *dest, void *src, uint64_t size,
int cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
cudaStream_t *stream, uint32_t gpu_index);
int cuda_memset_async(void *dest, uint64_t val, uint64_t size,
cudaStream_t *stream, uint32_t gpu_index);
int cuda_get_number_of_gpus();
int cuda_synchronize_device(uint32_t gpu_index);

View File

@@ -111,6 +111,28 @@ int cuda_synchronize_device(uint32_t gpu_index) {
return 0;
}
/// Asynchronously sets `size` bytes at device address `dest` on the stream
/// `stream` of GPU `gpu_index`.
///
/// Note: as with cudaMemsetAsync, only the least significant byte of `val`
/// is used as the fill value.
///
/// Returns:
///   0: success
///  -1: error, invalid device pointer
///  -2: error, invalid gpu_index
///  -3: error, zero memset size
int cuda_memset_async(void *dest, uint64_t val, uint64_t size,
                      cudaStream_t *stream, uint32_t gpu_index) {
  if (size == 0) {
    // error code: zero copy size
    return -3;
  }
  if (gpu_index >= cuda_get_number_of_gpus()) {
    // error code: invalid gpu_index
    return -2;
  }
  cudaPointerAttributes attr;
  cudaPointerGetAttributes(&attr, dest);
  // `dest` must be device memory AND it must live on the requested GPU.
  // (The previous `&&` accepted a device pointer allocated on another GPU.)
  if (attr.device != gpu_index || attr.type != cudaMemoryTypeDevice) {
    // error code: invalid device pointer
    return -1;
  }
  cudaSetDevice(gpu_index);
  cudaMemsetAsync(dest, val, size, *stream);
  return 0;
}
/// Tries to copy memory to the GPU asynchronously
/// 0: success
/// -1: error, invalid device pointer

View File

@@ -0,0 +1,54 @@
# Fetch googletest pinned to a specific commit for reproducible test builds
include(FetchContent)
FetchContent_Declare(
  googletest
  URL https://github.com/google/googletest/archive/03597a01ee50ed33e9dfd640b249b4be3799d395.zip
)
# For Windows: Prevent overriding the parent project's compiler/linker settings
set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
FetchContent_MakeAvailable(googletest)

# Enable ExternalProject CMake module
include(ExternalProject)

set(CONCRETE_CPU_BINARY_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../../concrete-cpu/target/release")
set(CONCRETE_CPU_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../../concrete-cpu")

# Build concrete-cpu with cargo as an external project.
# Fix: the cargo invocation was duplicated
# (BUILD_COMMAND cargo build --release COMMAND cargo build --release),
# which ran the build twice; a single invocation is enough.
ExternalProject_Add(
  concrete_cpu
  SOURCE_DIR ${CONCRETE_CPU_SOURCE_DIR}
  DOWNLOAD_COMMAND ""
  CONFIGURE_COMMAND ""
  BUILD_COMMAND cargo build --release
  BINARY_DIR ${CONCRETE_CPU_BINARY_DIR}
  INSTALL_COMMAND ""
  LOG_BUILD ON)

include_directories(${CONCRETE_CPU_SOURCE_DIR}/include)

# Import the static library produced by cargo
add_library(concrete_cpu_lib STATIC IMPORTED)
set_target_properties(concrete_cpu_lib PROPERTIES IMPORTED_LOCATION
                      ${CONCRETE_CPU_BINARY_DIR}/libconcrete_cpu.a)

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,--no-as-needed -ldl")

set(BINARY test_concrete_cuda)
file(GLOB_RECURSE TEST_SOURCES LIST_DIRECTORIES false *.h *.cpp)

add_executable(${BINARY} ${TEST_SOURCES})
add_test(NAME ${BINARY} COMMAND ${BINARY})
# Ensure cargo runs before the test binary links against libconcrete_cpu.a
add_dependencies(${BINARY} concrete_cpu)
set_target_properties(${BINARY} PROPERTIES CUDA_SEPARABLE_COMPILATION ON
                                           CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(${BINARY} PUBLIC GTest::gtest_main concrete_cpu_lib
                                concrete_cuda cudart)

find_package(CUDA REQUIRED)
include_directories("${CUDA_INCLUDE_DIRS}" "${CMAKE_CURRENT_SOURCE_DIR}")

include(GoogleTest)
gtest_discover_tests(${BINARY})

View File

@@ -0,0 +1,250 @@
#include "../include/bit_extraction.h"
#include "../include/device.h"
#include "concrete-cpu.h"
#include "utils.h"
#include "gtest/gtest.h"
#include <cstdint>
#include <stdio.h>
#include <stdlib.h>
const unsigned REPETITIONS = 5;
const unsigned SAMPLES = 100;
// One parameter set for the bit extraction tests; field order matches the
// brace-initializers in bit_extract_params_u64 below.
typedef struct {
  int lwe_dimension;            // output LWE dimension (n)
  int glwe_dimension;           // GLWE dimension (k)
  int polynomial_size;          // polynomial size (N)
  double lwe_modular_variance;  // noise variance for LWE encryptions
  double glwe_modular_variance; // noise variance used for key generation
  int pbs_base_log;             // PBS decomposition base log
  int pbs_level;                // PBS decomposition level count
  int ks_base_log;              // keyswitch decomposition base log
  int ks_level;                 // keyswitch decomposition level count
  int number_of_bits_of_message_including_padding;
  int number_of_bits_to_extract;
  int number_of_inputs;         // ciphertexts processed per call
} BitExtractionTestParams;
// GPU bit-extraction test fixture (64-bit torus). SetUp creates the CSPRNG,
// secret keys, keyswitch and bootstrap keys, sample plaintexts and the
// host/device ciphertext buffers; TearDown releases all of them.
class BitExtractionTestPrimitives_u64
    : public ::testing::TestWithParam<BitExtractionTestParams> {
protected:
  // Test parameters, copied from GetParam() in SetUp
  int lwe_dimension;
  int glwe_dimension;
  int polynomial_size;
  double lwe_modular_variance;
  double glwe_modular_variance;
  int pbs_base_log;
  int pbs_level;
  int ks_base_log;
  int ks_level;
  int number_of_bits_of_message_including_padding;
  int number_of_bits_to_extract;
  int number_of_inputs;
  uint64_t delta;     // encoding scale: plaintexts are multiples of delta
  int delta_log;      // log2(delta)
  Csprng *csprng;     // concrete-cpu CSPRNG used for all encryptions
  cudaStream_t *stream;
  int gpu_index = 0;
  // Host-side buffers
  uint64_t *lwe_sk_in_array;  // input secret keys, one per repetition
  uint64_t *lwe_sk_out_array; // output secret keys, one per repetition
  uint64_t *lwe_in_ct_array;
  uint64_t *lwe_out_ct_array;
  uint64_t *plaintexts;
  // Device-side buffers
  double *d_fourier_bsk_array; // bootstrap keys (Fourier domain)
  uint64_t *d_ksk_array;       // keyswitch keys
  uint64_t *d_lwe_in_ct_array;
  uint64_t *d_lwe_out_ct_array;
  int8_t *bit_extract_buffer;  // scratch space for cuda_extract_bits_64

public:
  // Allocates and initializes everything the bit extraction test needs.
  void SetUp() {
    stream = cuda_create_stream(0);

    // TestParams
    lwe_dimension = (int)GetParam().lwe_dimension;
    glwe_dimension = (int)GetParam().glwe_dimension;
    polynomial_size = (int)GetParam().polynomial_size;
    lwe_modular_variance = (double)GetParam().lwe_modular_variance;
    glwe_modular_variance = (double)GetParam().glwe_modular_variance;
    pbs_base_log = (int)GetParam().pbs_base_log;
    pbs_level = (int)GetParam().pbs_level;
    ks_base_log = (int)GetParam().ks_base_log;
    ks_level = (int)GetParam().ks_level;
    number_of_bits_of_message_including_padding =
        (int)GetParam().number_of_bits_of_message_including_padding;
    number_of_bits_to_extract = (int)GetParam().number_of_bits_to_extract;
    number_of_inputs = (int)GetParam().number_of_inputs;

    // The message sits in the most significant bits of the 64-bit torus
    delta_log = 64 - number_of_bits_of_message_including_padding;
    delta = (uint64_t)(1) << delta_log;

    // Create a Csprng seeded with zeros for reproducible key material
    csprng =
        (Csprng *)aligned_alloc(CONCRETE_CSPRNG_ALIGN, CONCRETE_CSPRNG_SIZE);
    uint8_t seed[16] = {(uint8_t)0};
    concrete_cpu_construct_concrete_csprng(
        csprng, Uint128{.little_endian_bytes = {*seed}});

    // Bit extraction goes from the large (GLWE-derived) LWE dimension down
    // to the small LWE dimension
    int input_lwe_dimension = glwe_dimension * polynomial_size;
    int output_lwe_dimension = lwe_dimension;

    // Generate the keys (REPETITIONS independent key sets)
    generate_lwe_secret_keys(&lwe_sk_in_array, input_lwe_dimension, csprng,
                             REPETITIONS);
    generate_lwe_secret_keys(&lwe_sk_out_array, output_lwe_dimension, csprng,
                             REPETITIONS);
    generate_lwe_keyswitch_keys(
        stream, gpu_index, &d_ksk_array, lwe_sk_in_array, lwe_sk_out_array,
        input_lwe_dimension, output_lwe_dimension, ks_level, ks_base_log,
        csprng, lwe_modular_variance, REPETITIONS);
    generate_lwe_bootstrap_keys(
        stream, gpu_index, &d_fourier_bsk_array, lwe_sk_out_array,
        lwe_sk_in_array, lwe_dimension, glwe_dimension, polynomial_size,
        pbs_level, pbs_base_log, csprng, glwe_modular_variance, REPETITIONS);
    plaintexts = generate_plaintexts(
        number_of_bits_of_message_including_padding, delta, number_of_inputs,
        REPETITIONS, SAMPLES);

    // Ciphertext buffers: one extracted bit per output ciphertext
    d_lwe_out_ct_array = (uint64_t *)cuda_malloc_async(
        (output_lwe_dimension + 1) * number_of_bits_to_extract *
            number_of_inputs * sizeof(uint64_t),
        stream, gpu_index);
    d_lwe_in_ct_array = (uint64_t *)cuda_malloc_async(
        (input_lwe_dimension + 1) * number_of_inputs * sizeof(uint64_t), stream,
        gpu_index);
    lwe_in_ct_array = (uint64_t *)malloc((input_lwe_dimension + 1) *
                                         number_of_inputs * sizeof(uint64_t));
    lwe_out_ct_array = (uint64_t *)malloc((output_lwe_dimension + 1) *
                                          number_of_bits_to_extract *
                                          number_of_inputs * sizeof(uint64_t));

    // Execute scratch: sets up bit_extract_buffer for cuda_extract_bits_64
    scratch_cuda_extract_bits_64(stream, gpu_index, &bit_extract_buffer,
                                 glwe_dimension, lwe_dimension, polynomial_size,
                                 pbs_level, number_of_inputs,
                                 cuda_get_max_shared_memory(gpu_index), true);
  }

  // Releases every host and device resource allocated in SetUp.
  void TearDown() {
    void *v_stream = (void *)stream;
    cuda_synchronize_stream(v_stream);
    concrete_cpu_destroy_concrete_csprng(csprng);
    free(csprng);
    free(lwe_sk_in_array);
    free(lwe_sk_out_array);
    free(plaintexts);
    free(lwe_in_ct_array);
    free(lwe_out_ct_array);
    cleanup_cuda_extract_bits(stream, gpu_index, &bit_extract_buffer);
    cuda_drop_async(d_fourier_bsk_array, stream, gpu_index);
    cuda_drop_async(d_ksk_array, stream, gpu_index);
    cuda_drop_async(d_lwe_in_ct_array, stream, gpu_index);
    cuda_drop_async(d_lwe_out_ct_array, stream, gpu_index);
    cuda_destroy_stream(stream, gpu_index);
  }
};
// For every repetition/sample: encrypt plaintexts on the CPU, run the GPU bit
// extraction, then decrypt each output ciphertext and check the extracted bit
// matches the corresponding bit of the original plaintext.
TEST_P(BitExtractionTestPrimitives_u64, bit_extraction) {
  void *v_stream = (void *)stream;
  int bsk_size = (glwe_dimension + 1) * (glwe_dimension + 1) * pbs_level *
                 polynomial_size * (lwe_dimension + 1);
  int ksk_size =
      ks_level * (lwe_dimension + 1) * glwe_dimension * polynomial_size;
  for (uint r = 0; r < REPETITIONS; r++) {
    // Select the key set of this repetition
    double *d_fourier_bsk = d_fourier_bsk_array + (ptrdiff_t)(bsk_size * r);
    uint64_t *d_ksk = d_ksk_array + (ptrdiff_t)(ksk_size * r);
    uint64_t *lwe_in_sk =
        lwe_sk_in_array + (ptrdiff_t)(glwe_dimension * polynomial_size * r);
    uint64_t *lwe_sk_out = lwe_sk_out_array + (ptrdiff_t)(r * lwe_dimension);
    for (uint s = 0; s < SAMPLES; s++) {
      // Encrypt the batch of inputs on the CPU
      for (int i = 0; i < number_of_inputs; i++) {
        uint64_t plaintext = plaintexts[r * SAMPLES * number_of_inputs +
                                        s * number_of_inputs + i];
        uint64_t *lwe_in_ct =
            lwe_in_ct_array +
            (ptrdiff_t)(
                (r * SAMPLES * number_of_inputs + s * number_of_inputs + i) *
                (glwe_dimension * polynomial_size + 1));
        concrete_cpu_encrypt_lwe_ciphertext_u64(
            lwe_in_sk, lwe_in_ct, plaintext, glwe_dimension * polynomial_size,
            lwe_modular_variance, csprng, &CONCRETE_CSPRNG_VTABLE);
      }
      cuda_synchronize_stream(v_stream);
      cuda_memcpy_async_to_gpu(d_lwe_in_ct_array, lwe_in_ct_array,
                               (glwe_dimension * polynomial_size + 1) *
                                   number_of_inputs * sizeof(uint64_t),
                               stream, gpu_index);
      // Execute bit extract
      cuda_extract_bits_64(
          stream, gpu_index, (void *)d_lwe_out_ct_array,
          (void *)d_lwe_in_ct_array, bit_extract_buffer, (void *)d_ksk,
          (void *)d_fourier_bsk, number_of_bits_to_extract, delta_log,
          glwe_dimension * polynomial_size, lwe_dimension, glwe_dimension,
          polynomial_size, pbs_base_log, pbs_level, ks_base_log, ks_level,
          number_of_inputs, cuda_get_max_shared_memory(gpu_index));
      // Copy result back, syncing around the async copy before host reads
      cuda_synchronize_stream(v_stream);
      cuda_memcpy_async_to_cpu(lwe_out_ct_array, d_lwe_out_ct_array,
                               (lwe_dimension + 1) * number_of_bits_to_extract *
                                   number_of_inputs * sizeof(uint64_t),
                               stream, gpu_index);
      cuda_synchronize_stream(v_stream);
      for (int j = 0; j < number_of_inputs; j++) {
        uint64_t *result_array =
            lwe_out_ct_array +
            (ptrdiff_t)(j * number_of_bits_to_extract * (lwe_dimension + 1));
        uint64_t plaintext = plaintexts[r * SAMPLES * number_of_inputs +
                                        s * number_of_inputs + j];
        for (int i = 0; i < number_of_bits_to_extract; i++) {
          // Output bits are stored MSB-first: index from the end for bit i
          uint64_t *result_ct =
              result_array + (ptrdiff_t)((number_of_bits_to_extract - 1 - i) *
                                         (lwe_dimension + 1));
          uint64_t decrypted_message = 0;
          concrete_cpu_decrypt_lwe_ciphertext_u64(
              lwe_sk_out, result_ct, lwe_dimension, &decrypted_message);
          // Round after decryption
          uint64_t decrypted_rounded =
              closest_representable(decrypted_message, 1, 1);
          // Bring back the extracted bit found in the MSB in the LSB
          uint64_t decrypted_extract_bit = decrypted_rounded >> 63;
          uint64_t expected = ((plaintext >> delta_log) >> i) & (uint64_t)(1);
          EXPECT_EQ(decrypted_extract_bit, expected);
        }
      }
    }
  }
}
// Defines for which parameter sets the bit extraction will be tested.
// Field order:
// n, k, N, lwe_variance, glwe_variance, pbs_base_log, pbs_level,
// ks_base_log, ks_level, number_of_message_bits,
// number_of_bits_to_extract, number_of_inputs
::testing::internal::ParamGenerator<BitExtractionTestParams>
    bit_extract_params_u64 = ::testing::Values(
        (BitExtractionTestParams){585, 1, 1024, 7.52316384526264e-37,
                                  7.52316384526264e-37, 10, 2, 4, 7, 5, 5, 1});
// Second parameter set kept disabled for now (number_of_inputs = 2):
// (BitExtractionTestParams){585, 1, 1024, 7.52316384526264e-37,
//                           7.52316384526264e-37, 10, 2, 4, 7, 5, 5, 2});
// Builds the gtest-visible name suffix for one bit-extraction parameter set,
// e.g. "n_585_k_1_N_1024_...". Must stay a valid gtest identifier
// (alphanumerics and underscores only).
std::string
printParamName(::testing::TestParamInfo<BitExtractionTestParams> p) {
  BitExtractionTestParams params = p.param;
  std::string name = "n_" + std::to_string(params.lwe_dimension);
  name += "_k_" + std::to_string(params.glwe_dimension);
  name += "_N_" + std::to_string(params.polynomial_size);
  name += "_pbs_base_log_" + std::to_string(params.pbs_base_log);
  name += "_pbs_level_" + std::to_string(params.pbs_level);
  name += "_ks_base_log_" + std::to_string(params.ks_base_log);
  name += "_ks_level_" + std::to_string(params.ks_level);
  name += "_number_of_message_bits_" +
          std::to_string(params.number_of_bits_of_message_including_padding);
  name += "_number_of_bits_to_extract_" +
          std::to_string(params.number_of_bits_to_extract);
  name += "_number_of_inputs_" + std::to_string(params.number_of_inputs);
  return name;
}
// Instantiate the parameterized suite. INSTANTIATE_TEST_CASE_P is the legacy
// spelling of INSTANTIATE_TEST_SUITE_P, kept for older googletest versions.
INSTANTIATE_TEST_CASE_P(BitExtractionInstantiation,
                        BitExtractionTestPrimitives_u64, bit_extract_params_u64,
                        printParamName);

View File

@@ -0,0 +1,313 @@
#include "../include/bootstrap.h"
#include "../include/device.h"
#include "concrete-cpu.h"
#include "utils.h"
#include "gtest/gtest.h"
#include <cstdint>
#include <functional>
#include <stdio.h>
#include <stdlib.h>
const unsigned REPETITIONS = 5;
const unsigned SAMPLES = 100;
// One parameter set for the PBS tests; field order matches the
// brace-initializers in pbs_params_u64 below.
typedef struct {
  int lwe_dimension;            // input LWE dimension (n)
  int glwe_dimension;           // GLWE dimension (k)
  int polynomial_size;          // polynomial size (N)
  double lwe_modular_variance;  // noise variance for LWE encryptions
  double glwe_modular_variance; // noise variance used for key generation
  int pbs_base_log;             // PBS decomposition base log
  int pbs_level;                // PBS decomposition level count
  int message_modulus;          // modulus of the message space
  int carry_modulus;            // modulus of the carry space
  int number_of_inputs;         // ciphertexts bootstrapped per call
} BootstrapTestParams;
// GPU programmable-bootstrap test fixture (64-bit torus). SetUp creates the
// CSPRNG, secret keys, Fourier bootstrap keys, an identity LUT and the
// input/output ciphertext buffers; TearDown releases all of them.
class BootstrapTestPrimitives_u64
    : public ::testing::TestWithParam<BootstrapTestParams> {
protected:
  // Test parameters, copied from GetParam() in SetUp
  int lwe_dimension;
  int glwe_dimension;
  int polynomial_size;
  double lwe_modular_variance;
  double glwe_modular_variance;
  int pbs_base_log;
  int pbs_level;
  int message_modulus;
  int carry_modulus;
  int payload_modulus; // message_modulus * carry_modulus
  int number_of_inputs;
  uint64_t delta;      // encoding scale: plaintexts are multiples of delta
  Csprng *csprng;      // concrete-cpu CSPRNG used for all encryptions
  cudaStream_t *stream;
  int gpu_index = 0;
  uint64_t *lwe_sk_in_array;   // input secret keys, one per repetition
  uint64_t *lwe_sk_out_array;  // output secret keys, one per repetition
  uint64_t *plaintexts;
  double *d_fourier_bsk_array; // bootstrap keys (Fourier domain, device)
  uint64_t *d_lut_pbs_identity;
  uint64_t *d_lut_pbs_indexes;
  uint64_t *d_lwe_ct_in_array;
  uint64_t *d_lwe_ct_out_array;

public:
  // Allocates and initializes everything the bootstrap tests need.
  void SetUp() {
    stream = cuda_create_stream(0);
    void *v_stream = (void *)stream;

    // TestParams
    lwe_dimension = (int)GetParam().lwe_dimension;
    glwe_dimension = (int)GetParam().glwe_dimension;
    polynomial_size = (int)GetParam().polynomial_size;
    // Bug fix: these two were cast with (int), which truncated the sub-1
    // variance values (e.g. 7.07e-6) to 0 before storing them in the double
    // members. Cast with (double) like the other test fixtures.
    lwe_modular_variance = (double)GetParam().lwe_modular_variance;
    glwe_modular_variance = (double)GetParam().glwe_modular_variance;
    pbs_base_log = (int)GetParam().pbs_base_log;
    pbs_level = (int)GetParam().pbs_level;
    message_modulus = (int)GetParam().message_modulus;
    carry_modulus = (int)GetParam().carry_modulus;
    number_of_inputs = (int)GetParam().number_of_inputs;
    payload_modulus = message_modulus * carry_modulus;
    // Value of the shift we multiply our messages by
    delta = ((uint64_t)(1) << 63) / (uint64_t)(payload_modulus);

    // Create a Csprng seeded with zeros for reproducible key material
    csprng =
        (Csprng *)aligned_alloc(CONCRETE_CSPRNG_ALIGN, CONCRETE_CSPRNG_SIZE);
    uint8_t seed[16] = {(uint8_t)0};
    concrete_cpu_construct_concrete_csprng(
        csprng, Uint128{.little_endian_bytes = {*seed}});

    // Generate the keys (REPETITIONS independent key sets)
    generate_lwe_secret_keys(&lwe_sk_in_array, lwe_dimension, csprng,
                             REPETITIONS);
    generate_lwe_secret_keys(&lwe_sk_out_array,
                             glwe_dimension * polynomial_size, csprng,
                             REPETITIONS);
    generate_lwe_bootstrap_keys(
        stream, gpu_index, &d_fourier_bsk_array, lwe_sk_in_array,
        lwe_sk_out_array, lwe_dimension, glwe_dimension, polynomial_size,
        pbs_level, pbs_base_log, csprng, glwe_modular_variance, REPETITIONS);
    plaintexts = generate_plaintexts(payload_modulus, delta, number_of_inputs,
                                     REPETITIONS, SAMPLES);

    // Create the LUT (identity function)
    uint64_t *lut_pbs_identity = generate_identity_lut_pbs(
        polynomial_size, glwe_dimension, message_modulus, carry_modulus,
        [](int x) -> int { return x; });
    // Copy the LUT to the GPU; all inputs use LUT index 0
    d_lut_pbs_identity = (uint64_t *)cuda_malloc_async(
        (glwe_dimension + 1) * polynomial_size * sizeof(uint64_t), stream,
        gpu_index);
    d_lut_pbs_indexes = (uint64_t *)cuda_malloc_async(
        number_of_inputs * sizeof(uint64_t), stream, gpu_index);
    cuda_synchronize_stream(v_stream);
    cuda_memset_async(d_lut_pbs_indexes, 0, number_of_inputs * sizeof(uint64_t),
                      stream, gpu_index);
    cuda_memcpy_async_to_gpu(d_lut_pbs_identity, lut_pbs_identity,
                             polynomial_size * (glwe_dimension + 1) *
                                 sizeof(uint64_t),
                             stream, gpu_index);
    cuda_synchronize_stream(v_stream);
    free(lut_pbs_identity);

    // Output buffer holds one batch; input buffer holds every
    // repetition/sample batch, encrypted once up front
    d_lwe_ct_out_array =
        (uint64_t *)cuda_malloc_async((glwe_dimension * polynomial_size + 1) *
                                          number_of_inputs * sizeof(uint64_t),
                                      stream, gpu_index);
    d_lwe_ct_in_array = (uint64_t *)cuda_malloc_async(
        (lwe_dimension + 1) * number_of_inputs * REPETITIONS * SAMPLES *
            sizeof(uint64_t),
        stream, gpu_index);
    uint64_t *lwe_ct_in_array =
        (uint64_t *)malloc((lwe_dimension + 1) * number_of_inputs *
                           REPETITIONS * SAMPLES * sizeof(uint64_t));
    // Create the input/output ciphertexts
    for (uint r = 0; r < REPETITIONS; r++) {
      uint64_t *lwe_sk_in = lwe_sk_in_array + (ptrdiff_t)(r * lwe_dimension);
      for (uint s = 0; s < SAMPLES; s++) {
        for (int i = 0; i < number_of_inputs; i++) {
          uint64_t plaintext = plaintexts[r * SAMPLES * number_of_inputs +
                                          s * number_of_inputs + i];
          uint64_t *lwe_ct_in =
              lwe_ct_in_array + (ptrdiff_t)((r * SAMPLES * number_of_inputs +
                                             s * number_of_inputs + i) *
                                            (lwe_dimension + 1));
          concrete_cpu_encrypt_lwe_ciphertext_u64(
              lwe_sk_in, lwe_ct_in, plaintext, lwe_dimension,
              lwe_modular_variance, csprng, &CONCRETE_CSPRNG_VTABLE);
        }
      }
    }
    cuda_synchronize_stream(v_stream);
    cuda_memcpy_async_to_gpu(d_lwe_ct_in_array, lwe_ct_in_array,
                             REPETITIONS * SAMPLES * number_of_inputs *
                                 (lwe_dimension + 1) * sizeof(uint64_t),
                             stream, gpu_index);
    free(lwe_ct_in_array);
  }

  // Releases every host and device resource allocated in SetUp.
  void TearDown() {
    concrete_cpu_destroy_concrete_csprng(csprng);
    free(csprng);
    free(lwe_sk_in_array);
    free(lwe_sk_out_array);
    free(plaintexts);
    cuda_drop_async(d_fourier_bsk_array, stream, gpu_index);
    cuda_drop_async(d_lut_pbs_identity, stream, gpu_index);
    cuda_drop_async(d_lut_pbs_indexes, stream, gpu_index);
    cuda_drop_async(d_lwe_ct_in_array, stream, gpu_index);
    cuda_drop_async(d_lwe_ct_out_array, stream, gpu_index);
    cuda_destroy_stream(stream, gpu_index);
  }
};
// Runs the amortized PBS on each pre-encrypted batch, then decrypts and
// decodes on the host, checking the identity LUT returned the message intact.
TEST_P(BootstrapTestPrimitives_u64, amortized_bootstrap) {
  void *v_stream = (void *)stream;
  uint64_t *lwe_ct_out_array =
      (uint64_t *)malloc((glwe_dimension * polynomial_size + 1) *
                         number_of_inputs * sizeof(uint64_t));
  int8_t *pbs_buffer = nullptr;
  scratch_cuda_bootstrap_amortized_64(
      stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
      number_of_inputs, cuda_get_max_shared_memory(gpu_index), true);
  int bsk_size = (glwe_dimension + 1) * (glwe_dimension + 1) * pbs_level *
                 polynomial_size * (lwe_dimension + 1);
  // Here execute the PBS
  for (uint r = 0; r < REPETITIONS; r++) {
    double *d_fourier_bsk = d_fourier_bsk_array + (ptrdiff_t)(bsk_size * r);
    uint64_t *lwe_sk_out =
        lwe_sk_out_array + (ptrdiff_t)(r * glwe_dimension * polynomial_size);
    for (uint s = 0; s < SAMPLES; s++) {
      uint64_t *d_lwe_ct_in =
          d_lwe_ct_in_array +
          (ptrdiff_t)((r * SAMPLES * number_of_inputs + s * number_of_inputs) *
                      (lwe_dimension + 1));
      // Execute PBS
      cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
          stream, gpu_index, (void *)d_lwe_ct_out_array,
          (void *)d_lut_pbs_identity, (void *)d_lut_pbs_indexes,
          (void *)d_lwe_ct_in, (void *)d_fourier_bsk, pbs_buffer, lwe_dimension,
          glwe_dimension, polynomial_size, pbs_base_log, pbs_level,
          number_of_inputs, 1, 0, cuda_get_max_shared_memory(gpu_index));
      // Copy result back
      cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array,
                               (glwe_dimension * polynomial_size + 1) *
                                   number_of_inputs * sizeof(uint64_t),
                               stream, gpu_index);
      // Bug fix: the copy above is asynchronous, so wait for it to complete
      // before reading lwe_ct_out_array on the host (the bit extraction test
      // does the same after its async device-to-host copy).
      cuda_synchronize_stream(v_stream);
      for (int j = 0; j < number_of_inputs; j++) {
        uint64_t *result =
            lwe_ct_out_array +
            (ptrdiff_t)(j * (glwe_dimension * polynomial_size + 1));
        uint64_t plaintext = plaintexts[r * SAMPLES * number_of_inputs +
                                        s * number_of_inputs + j];
        uint64_t decrypted = 0;
        concrete_cpu_decrypt_lwe_ciphertext_u64(
            lwe_sk_out, result, glwe_dimension * polynomial_size, &decrypted);
        // Decryption must carry noise, so an exact match would be suspicious
        EXPECT_NE(decrypted, plaintext);
        // The bit before the message
        uint64_t rounding_bit = delta >> 1;
        // Compute the rounding bit
        uint64_t rounding = (decrypted & rounding_bit) << 1;
        uint64_t decoded = (decrypted + rounding) / delta;
        EXPECT_EQ(decoded, plaintext / delta);
      }
    }
  }
  cleanup_cuda_bootstrap_amortized(stream, gpu_index, &pbs_buffer);
  free(lwe_ct_out_array);
}
// Runs the low-latency PBS on each pre-encrypted batch, then decrypts and
// decodes on the host, checking the identity LUT returned the message intact.
TEST_P(BootstrapTestPrimitives_u64, low_latency_bootstrap) {
  void *v_stream = (void *)stream;
  uint64_t *lwe_ct_out_array =
      (uint64_t *)malloc((glwe_dimension * polynomial_size + 1) *
                         number_of_inputs * sizeof(uint64_t));
  int8_t *pbs_buffer = nullptr;
  scratch_cuda_bootstrap_low_latency_64(
      stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
      pbs_level, number_of_inputs, cuda_get_max_shared_memory(gpu_index), true);
  int bsk_size = (glwe_dimension + 1) * (glwe_dimension + 1) * pbs_level *
                 polynomial_size * (lwe_dimension + 1);
  // Here execute the PBS
  for (uint r = 0; r < REPETITIONS; r++) {
    double *d_fourier_bsk = d_fourier_bsk_array + (ptrdiff_t)(bsk_size * r);
    uint64_t *lwe_sk_out =
        lwe_sk_out_array + (ptrdiff_t)(r * glwe_dimension * polynomial_size);
    for (uint s = 0; s < SAMPLES; s++) {
      uint64_t *d_lwe_ct_in =
          d_lwe_ct_in_array +
          (ptrdiff_t)((r * SAMPLES * number_of_inputs + s * number_of_inputs) *
                      (lwe_dimension + 1));
      // Execute PBS
      cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
          stream, gpu_index, (void *)d_lwe_ct_out_array,
          (void *)d_lut_pbs_identity, (void *)d_lut_pbs_indexes,
          (void *)d_lwe_ct_in, (void *)d_fourier_bsk, pbs_buffer, lwe_dimension,
          glwe_dimension, polynomial_size, pbs_base_log, pbs_level,
          number_of_inputs, 1, 0, cuda_get_max_shared_memory(gpu_index));
      // Copy result back
      cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array,
                               (glwe_dimension * polynomial_size + 1) *
                                   number_of_inputs * sizeof(uint64_t),
                               stream, gpu_index);
      // Bug fix: the copy above is asynchronous, so wait for it to complete
      // before reading lwe_ct_out_array on the host (the bit extraction test
      // does the same after its async device-to-host copy).
      cuda_synchronize_stream(v_stream);
      for (int j = 0; j < number_of_inputs; j++) {
        uint64_t *result =
            lwe_ct_out_array +
            (ptrdiff_t)(j * (glwe_dimension * polynomial_size + 1));
        uint64_t plaintext = plaintexts[r * SAMPLES * number_of_inputs +
                                        s * number_of_inputs + j];
        uint64_t decrypted = 0;
        concrete_cpu_decrypt_lwe_ciphertext_u64(
            lwe_sk_out, result, glwe_dimension * polynomial_size, &decrypted);
        // Decryption must carry noise, so an exact match would be suspicious
        EXPECT_NE(decrypted, plaintext);
        // The bit before the message
        uint64_t rounding_bit = delta >> 1;
        // Compute the rounding bit
        uint64_t rounding = (decrypted & rounding_bit) << 1;
        uint64_t decoded = (decrypted + rounding) / delta;
        EXPECT_EQ(decoded, plaintext / delta);
      }
    }
  }
  cleanup_cuda_bootstrap_low_latency(stream, gpu_index, &pbs_buffer);
  free(lwe_ct_out_array);
}
// Defines for which parameter sets the PBS will be tested.
// Field order:
// n, k, N, lwe_variance, glwe_variance, pbs_base_log, pbs_level,
// message_modulus, carry_modulus, number_of_inputs
::testing::internal::ParamGenerator<BootstrapTestParams> pbs_params_u64 =
    ::testing::Values(
        (BootstrapTestParams){500, 1, 1024, 0.000007069849454709433,
                              0.00000000000000029403601535432533, 23, 2, 4, 4,
                              1},
        (BootstrapTestParams){500, 1, 1024, 0.000007069849454709433,
                              0.00000000000000029403601535432533, 23, 2, 4, 4,
                              3});
// Builds the gtest-visible name suffix for one PBS parameter set,
// e.g. "n_500_k_1_N_1024_...". Must stay a valid gtest identifier
// (alphanumerics and underscores only).
std::string printParamName(::testing::TestParamInfo<BootstrapTestParams> p) {
  BootstrapTestParams params = p.param;
  std::string name = "n_" + std::to_string(params.lwe_dimension);
  name += "_k_" + std::to_string(params.glwe_dimension);
  name += "_N_" + std::to_string(params.polynomial_size);
  name += "_pbs_base_log_" + std::to_string(params.pbs_base_log);
  name += "_pbs_level_" + std::to_string(params.pbs_level);
  name += "_number_of_inputs_" + std::to_string(params.number_of_inputs);
  return name;
}
// Instantiate the parameterized suite. INSTANTIATE_TEST_CASE_P is the legacy
// spelling of INSTANTIATE_TEST_SUITE_P, kept for older googletest versions.
INSTANTIATE_TEST_CASE_P(BootstrapInstantiation, BootstrapTestPrimitives_u64,
                        pbs_params_u64, printParamName);

View File

@@ -0,0 +1,271 @@
#include "../include/circuit_bootstrap.h"
#include "../include/device.h"
#include "concrete-cpu.h"
#include "utils.h"
#include "gtest/gtest.h"
#include <cstdint>
#include <stdio.h>
#include <stdlib.h>
const unsigned REPETITIONS = 5;
const unsigned SAMPLES = 100;
// One parameter set for the circuit bootstrap tests; field order matches the
// brace-initializers used when instantiating the suite.
typedef struct {
  int lwe_dimension;            // input LWE dimension (n)
  int glwe_dimension;           // GLWE dimension (k)
  int polynomial_size;          // polynomial size (N)
  double lwe_modular_variance;  // noise variance for LWE encryptions
  double glwe_modular_variance; // noise variance used for key generation
  int pbs_base_log;             // PBS decomposition base log
  int pbs_level;                // PBS decomposition level count
  int pksk_base_log;            // private functional KS key base log
  int pksk_level;               // private functional KS key level count
  int cbs_base_log;             // circuit bootstrap decomposition base log
  int cbs_level;                // circuit bootstrap level count
} CircuitBootstrapTestParams;
// GPU circuit bootstrap test fixture (64-bit torus). SetUp creates the
// CSPRNG, secret keys, bootstrap and private functional keyswitch keys,
// plaintexts and the host/device buffers; TearDown releases all of them.
class CircuitBootstrapTestPrimitives_u64
    : public ::testing::TestWithParam<CircuitBootstrapTestParams> {
protected:
  // Test parameters, copied from GetParam() in SetUp
  int lwe_dimension;
  int glwe_dimension;
  int polynomial_size;
  double lwe_modular_variance;
  double glwe_modular_variance;
  int pbs_base_log;
  int pbs_level;
  int pksk_base_log;
  int pksk_level;
  int cbs_base_log;
  int cbs_level;
  int number_of_bits_of_message_including_padding;
  int ggsw_size;      // number of uint64_t words in one output GGSW
  uint64_t delta;     // encoding scale: plaintexts are multiples of delta
  int delta_log;      // log2(delta)
  Csprng *csprng;     // concrete-cpu CSPRNG used for all encryptions
  cudaStream_t *stream;
  int gpu_index = 0;
  // Host-side buffers
  uint64_t *lwe_sk_in_array;  // input secret keys, one per repetition
  uint64_t *lwe_sk_out_array; // output secret keys, one per repetition
  uint64_t *lwe_in_ct;
  uint64_t *ggsw_out_ct;
  uint64_t *plaintexts;
  // Device-side buffers
  double *d_fourier_bsk_array; // bootstrap keys (Fourier domain)
  uint64_t *d_pksk_array;      // private functional keyswitch key lists
  uint64_t *d_lwe_in_ct;
  uint64_t *d_ggsw_out_ct;
  uint64_t *d_lut_vector_indexes;
  int8_t *cbs_buffer;          // scratch for cuda_circuit_bootstrap_64

public:
  // Allocates and initializes everything the circuit bootstrap test needs.
  void SetUp() {
    stream = cuda_create_stream(0);

    // TestParams
    lwe_dimension = (int)GetParam().lwe_dimension;
    glwe_dimension = (int)GetParam().glwe_dimension;
    polynomial_size = (int)GetParam().polynomial_size;
    lwe_modular_variance = (double)GetParam().lwe_modular_variance;
    glwe_modular_variance = (double)GetParam().glwe_modular_variance;
    pbs_base_log = (int)GetParam().pbs_base_log;
    pbs_level = (int)GetParam().pbs_level;
    pksk_base_log = (int)GetParam().pksk_base_log;
    pksk_level = (int)GetParam().pksk_level;
    cbs_base_log = (int)GetParam().cbs_base_log;
    cbs_level = (int)GetParam().cbs_level;

    // We generate binary messages
    number_of_bits_of_message_including_padding = 2;
    delta_log = 60;
    delta = (uint64_t)(1) << delta_log;
    ggsw_size = cbs_level * (glwe_dimension + 1) * (glwe_dimension + 1) *
                polynomial_size;

    // Create a Csprng seeded with zeros for reproducible key material
    csprng =
        (Csprng *)aligned_alloc(CONCRETE_CSPRNG_ALIGN, CONCRETE_CSPRNG_SIZE);
    uint8_t seed[16] = {(uint8_t)0};
    concrete_cpu_construct_concrete_csprng(
        csprng, Uint128{.little_endian_bytes = {*seed}});

    // Generate the keys (REPETITIONS independent key sets)
    generate_lwe_secret_keys(&lwe_sk_in_array, lwe_dimension, csprng,
                             REPETITIONS);
    generate_lwe_secret_keys(&lwe_sk_out_array,
                             glwe_dimension * polynomial_size, csprng,
                             REPETITIONS);
    generate_lwe_bootstrap_keys(
        stream, gpu_index, &d_fourier_bsk_array, lwe_sk_in_array,
        lwe_sk_out_array, lwe_dimension, glwe_dimension, polynomial_size,
        pbs_level, pbs_base_log, csprng, glwe_modular_variance, REPETITIONS);
    generate_lwe_private_functional_keyswitch_key_lists(
        stream, gpu_index, &d_pksk_array, lwe_sk_out_array, lwe_sk_out_array,
        glwe_dimension * polynomial_size, glwe_dimension, polynomial_size,
        pksk_level, pksk_base_log, csprng, lwe_modular_variance, REPETITIONS);
    plaintexts = generate_plaintexts(
        number_of_bits_of_message_including_padding, delta, 1, REPETITIONS,
        SAMPLES);

    // One input LWE ciphertext and one output GGSW at a time
    d_ggsw_out_ct = (uint64_t *)cuda_malloc_async(ggsw_size * sizeof(uint64_t),
                                                  stream, gpu_index);
    d_lwe_in_ct = (uint64_t *)cuda_malloc_async(
        (lwe_dimension + 1) * sizeof(uint64_t), stream, gpu_index);
    lwe_in_ct = (uint64_t *)malloc((lwe_dimension + 1) * sizeof(uint64_t));
    ggsw_out_ct = (uint64_t *)malloc(ggsw_size * sizeof(uint64_t));

    // Execute cbs scratch: sets up cbs_buffer for cuda_circuit_bootstrap_64
    scratch_cuda_circuit_bootstrap_64(
        stream, gpu_index, &cbs_buffer, glwe_dimension, lwe_dimension,
        polynomial_size, cbs_level, 1, cuda_get_max_shared_memory(gpu_index),
        true);

    // Build LUT vector indexes: all levels use LUT 0
    uint64_t *h_lut_vector_indexes =
        (uint64_t *)malloc(cbs_level * sizeof(uint64_t));
    for (int index = 0; index < cbs_level; index++) {
      h_lut_vector_indexes[index] = 0; // index % cbs_level;
    }
    d_lut_vector_indexes = (uint64_t *)cuda_malloc_async(
        cbs_level * sizeof(uint64_t), stream, gpu_index);
    cuda_memcpy_async_to_gpu(d_lut_vector_indexes, h_lut_vector_indexes,
                             cbs_level * sizeof(uint64_t), stream, gpu_index);
    free(h_lut_vector_indexes);
  }

  // Releases every host and device resource allocated in SetUp.
  void TearDown() {
    void *v_stream = (void *)stream;
    cuda_synchronize_stream(v_stream);
    concrete_cpu_destroy_concrete_csprng(csprng);
    free(csprng);
    free(lwe_sk_in_array);
    free(lwe_sk_out_array);
    free(plaintexts);
    free(lwe_in_ct);
    free(ggsw_out_ct);
    cleanup_cuda_circuit_bootstrap(stream, gpu_index, &cbs_buffer);
    cuda_drop_async(d_fourier_bsk_array, stream, gpu_index);
    cuda_drop_async(d_pksk_array, stream, gpu_index);
    cuda_drop_async(d_lwe_in_ct, stream, gpu_index);
    cuda_drop_async(d_ggsw_out_ct, stream, gpu_index);
    cuda_drop_async(d_lut_vector_indexes, stream, gpu_index);
    cuda_destroy_stream(stream, gpu_index);
  }
};
// For each repetition/sample: encrypt one message on the CPU, run the GPU
// circuit bootstrap, copy the GGSW back and decrypt each GLWE row, checking
// the decomposed secret-key products and the final message body.
TEST_P(CircuitBootstrapTestPrimitives_u64, circuit_bootstrap) {
  void *v_stream = (void *)stream;
  for (uint r = 0; r < REPETITIONS; r++) {
    for (uint s = 0; s < SAMPLES; s++) {
      uint64_t plaintext = plaintexts[r * SAMPLES + s];
      // Select the key set of this repetition
      int bsk_size = (glwe_dimension + 1) * (glwe_dimension + 1) * pbs_level *
                     polynomial_size * (lwe_dimension + 1);
      double *d_fourier_bsk = d_fourier_bsk_array + (ptrdiff_t)(bsk_size * r);
      int pksk_list_size = pksk_level * (glwe_dimension + 1) * polynomial_size *
                           (glwe_dimension * polynomial_size + 1) *
                           (glwe_dimension + 1);
      uint64_t *d_pksk_list = d_pksk_array + (ptrdiff_t)(pksk_list_size * r);
      uint64_t *lwe_in_sk = lwe_sk_in_array + (ptrdiff_t)(lwe_dimension * r);
      uint64_t *lwe_sk_out =
          lwe_sk_out_array + (ptrdiff_t)(r * glwe_dimension * polynomial_size);
      concrete_cpu_encrypt_lwe_ciphertext_u64(
          lwe_in_sk, lwe_in_ct, plaintext, lwe_dimension, lwe_modular_variance,
          csprng, &CONCRETE_CSPRNG_VTABLE);
      cuda_synchronize_stream(v_stream);
      cuda_memcpy_async_to_gpu(d_lwe_in_ct, lwe_in_ct,
                               (lwe_dimension + 1) * sizeof(uint64_t), stream,
                               gpu_index);
      // Execute circuit bootstrap
      cuda_circuit_bootstrap_64(
          stream, gpu_index, (void *)d_ggsw_out_ct, (void *)d_lwe_in_ct,
          (void *)d_fourier_bsk, (void *)d_pksk_list,
          (void *)d_lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
          glwe_dimension, lwe_dimension, pbs_level, pbs_base_log, pksk_level,
          pksk_base_log, cbs_level, cbs_base_log, 1,
          cuda_get_max_shared_memory(gpu_index));
      // Copy result back and wait for the async copy before reading it
      cuda_memcpy_async_to_cpu(ggsw_out_ct, d_ggsw_out_ct,
                               ggsw_size * sizeof(uint64_t), stream, gpu_index);
      cuda_synchronize_stream(v_stream);
      uint64_t *decrypted =
          (uint64_t *)malloc(polynomial_size * (glwe_dimension + 1) *
                             cbs_level * sizeof(uint64_t));
      // The GGSW mask rows encrypt -bit * secret key
      uint64_t multiplying_factor = -(plaintext >> delta_log);
      for (int l = 1; l < cbs_level + 1; l++) {
        for (int j = 0; j < glwe_dimension; j++) {
          uint64_t *res = decrypted + (ptrdiff_t)((l - 1) * polynomial_size *
                                                      (glwe_dimension + 1) +
                                                  j * polynomial_size);
          uint64_t *glwe_ct_out =
              ggsw_out_ct +
              (ptrdiff_t)((l - 1) * polynomial_size * (glwe_dimension + 1) *
                              (glwe_dimension + 1) +
                          j * polynomial_size * (glwe_dimension + 1));
          concrete_cpu_decrypt_glwe_ciphertext_u64(
              lwe_sk_out, res, glwe_ct_out, glwe_dimension, polynomial_size);
          for (int k = 0; k < polynomial_size; k++) {
            uint64_t expected_decryption =
                lwe_sk_out[j * polynomial_size + k] * multiplying_factor;
            expected_decryption >>= (64 - cbs_base_log * l);
            uint64_t decoded_plaintext =
                closest_representable(res[k], l, cbs_base_log) >>
                (64 - cbs_base_log * l);
            EXPECT_EQ(expected_decryption, decoded_plaintext);
          }
        }
      }
      // Check last glwe on last level: its body carries the message itself
      uint64_t *res =
          decrypted +
          (ptrdiff_t)((cbs_level - 1) * polynomial_size * (glwe_dimension + 1) +
                      glwe_dimension * polynomial_size);
      uint64_t *glwe_ct_out =
          ggsw_out_ct +
          (ptrdiff_t)((cbs_level - 1) * polynomial_size * (glwe_dimension + 1) *
                          (glwe_dimension + 1) +
                      glwe_dimension * polynomial_size * (glwe_dimension + 1));
      concrete_cpu_decrypt_glwe_ciphertext_u64(lwe_sk_out, res, glwe_ct_out,
                                               glwe_dimension, polynomial_size);
      for (int k = 0; k < polynomial_size; k++) {
        // Only the constant coefficient should carry the message
        uint64_t expected_decryption = (k == 0) ? plaintext / delta : 0;
        uint64_t decoded_plaintext =
            closest_representable(res[k], cbs_level, cbs_base_log) >>
            (64 - cbs_base_log * cbs_level);
        EXPECT_EQ(expected_decryption, decoded_plaintext);
      }
      free(decrypted);
    }
  }
}
// Defines for which parameter sets the circuit bootstrap will be tested.
// Each entry is one full CircuitBootstrapTestParams configuration; gtest
// instantiates every TEST_P above once per entry.
::testing::internal::ParamGenerator<CircuitBootstrapTestParams> cbs_params_u64 =
    ::testing::Values(
        // n, k, N, lwe_variance, glwe_variance, pbs_base_log, pbs_level,
        // pksk_base_log, pksk_level, cbs_base_log, cbs_level
        (CircuitBootstrapTestParams){10, 2, 512, 7.52316384526264e-37,
                                     7.52316384526264e-37, 11, 2, 15, 2, 10,
                                     1});
std::string
printParamName(::testing::TestParamInfo<CircuitBootstrapTestParams> p) {
  // Build a human-readable gtest name encoding every circuit bootstrap
  // parameter of this configuration.
  const CircuitBootstrapTestParams &tp = p.param;
  std::string name = "n_" + std::to_string(tp.lwe_dimension);
  name += "_k_" + std::to_string(tp.glwe_dimension);
  name += "_N_" + std::to_string(tp.polynomial_size);
  name += "_pbs_base_log_" + std::to_string(tp.pbs_base_log);
  name += "_pbs_level_" + std::to_string(tp.pbs_level);
  name += "_pksk_base_log_" + std::to_string(tp.pksk_base_log);
  name += "_pksk_level_" + std::to_string(tp.pksk_level);
  name += "_cbs_base_log_" + std::to_string(tp.cbs_base_log);
  name += "_cbs_level_" + std::to_string(tp.cbs_level);
  return name;
}
// Register the parameterized circuit bootstrap tests with the parameter set
// above; test names are produced by printParamName.
INSTANTIATE_TEST_CASE_P(CircuitBootstrapInstantiation,
                        CircuitBootstrapTestPrimitives_u64, cbs_params_u64,
                        printParamName);

View File

@@ -0,0 +1,202 @@
#include "../include/device.h"
#include "../include/vertical_packing.h"
#include "concrete-cpu.h"
#include "utils.h"
#include "gtest/gtest.h"
#include <cstdint>
#include <functional>
#include <stdio.h>
#include <stdlib.h>
// Number of independent key sets, and number of samples tested per key set.
const unsigned REPETITIONS = 5;
const unsigned SAMPLES = 100;
// One CMUX-tree test configuration (see the "k, N, r, tau, ..." comment on
// the parameter list below for the field order used by the initializers).
typedef struct {
  int glwe_dimension;           // k
  int polynomial_size;          // N
  int r;                        // number of GGSW selector bits (LUT count = 2^r)
  int tau;                      // number of LUT trees evaluated at once
  double glwe_modular_variance; // GLWE encryption noise variance
  int base_log;                 // decomposition base log
  int level_count;              // decomposition level count
  int delta_log;                // log2 of the message scaling factor
} CMUXTreeTestParams;
// Fixture for the CMUX-tree tests: owns the CSPRNG, the GLWE secret key,
// the random plaintexts and the identity LUT uploaded to the device.
class CMUXTreeTestPrimitives_u64
    : public ::testing::TestWithParam<CMUXTreeTestParams> {
protected:
  int glwe_dimension;
  int polynomial_size;
  int r_lut;
  int tau;
  double glwe_modular_variance;
  int base_log;
  int level_count;
  uint64_t delta;
  int delta_log;
  Csprng *csprng;
  uint64_t *plaintexts;
  cudaStream_t *stream;
  int gpu_index = 0;
  uint64_t *glwe_sk;
  uint64_t *d_lut_identity;

public:
  // Allocates keys, plaintexts and the device-side identity LUT.
  void SetUp() {
    stream = cuda_create_stream(0);
    void *v_stream = (void *)stream;
    // TestParams
    glwe_dimension = (int)GetParam().glwe_dimension;
    polynomial_size = (int)GetParam().polynomial_size;
    r_lut = (int)GetParam().r;
    tau = (int)GetParam().tau;
    // Fix: the variance is a double in (0, 1); the previous (int) cast
    // truncated it to 0, making every encryption noiseless.
    glwe_modular_variance = (double)GetParam().glwe_modular_variance;
    base_log = (int)GetParam().base_log;
    level_count = (int)GetParam().level_count;
    delta_log = (int)GetParam().delta_log;
    // Value of the shift we multiply our messages by
    delta = ((uint64_t)(1) << delta_log);
    // Create a Csprng
    csprng =
        (Csprng *)aligned_alloc(CONCRETE_CSPRNG_ALIGN, CONCRETE_CSPRNG_SIZE);
    uint8_t seed[16] = {(uint8_t)0};
    concrete_cpu_construct_concrete_csprng(
        csprng, Uint128{.little_endian_bytes = {*seed}});
    // Generate the keys
    generate_glwe_secret_keys(&glwe_sk, glwe_dimension, polynomial_size,
                              csprng, REPETITIONS);
    plaintexts = generate_plaintexts(r_lut, 1, 1, REPETITIONS, SAMPLES);
    // Create the LUT
    int num_lut = (1 << r_lut);
    d_lut_identity = (uint64_t *)cuda_malloc_async(
        polynomial_size * num_lut * tau * sizeof(uint64_t), stream, gpu_index);
    uint64_t *lut_cmux_tree_identity = generate_identity_lut_cmux_tree(
        polynomial_size, num_lut, tau, delta_log);
    // Copy all LUTs
    cuda_memcpy_async_to_gpu(d_lut_identity, lut_cmux_tree_identity,
                             polynomial_size * num_lut * tau * sizeof(uint64_t),
                             stream, gpu_index);
    // The upload is asynchronous: wait for it before freeing the host copy.
    cuda_synchronize_stream(v_stream);
    free(lut_cmux_tree_identity);
  }
  // Releases every resource allocated in SetUp.
  void TearDown() {
    cuda_synchronize_stream(stream);
    concrete_cpu_destroy_concrete_csprng(csprng);
    free(plaintexts);
    free(csprng);
    // Fix: glwe_sk was allocated by generate_glwe_secret_keys but never
    // released (the other fixtures free their secret key arrays).
    free(glwe_sk);
    cuda_drop_async(d_lut_identity, stream, gpu_index);
    cuda_destroy_stream(stream, gpu_index);
  }
};
// CMUX tree correctness test: encrypt the bits of a random witness as GGSW
// selectors, evaluate the identity LUT tree on the GPU, and check that each
// output GLWE decodes to the expected (witness + tree) value.
TEST_P(CMUXTreeTestPrimitives_u64, cmux_tree) {
  int ggsw_size = polynomial_size * (glwe_dimension + 1) *
                  (glwe_dimension + 1) * level_count;
  int glwe_size = (glwe_dimension + 1) * polynomial_size;
  uint64_t *d_ggsw_bit_array = (uint64_t *)cuda_malloc_async(
      r_lut * ggsw_size * sizeof(uint64_t), stream, gpu_index);
  uint64_t *d_results = (uint64_t *)cuda_malloc_async(
      tau * glwe_size * sizeof(uint64_t), stream, gpu_index);
  uint64_t *results = (uint64_t *)malloc(tau * glwe_size * sizeof(uint64_t));
  uint64_t *ggsw = (uint64_t *)malloc(ggsw_size * sizeof(uint64_t));
  int8_t *cmux_tree_buffer = nullptr;
  scratch_cuda_cmux_tree_64(stream, gpu_index, &cmux_tree_buffer,
                            glwe_dimension, polynomial_size, level_count, r_lut,
                            tau, cuda_get_max_shared_memory(gpu_index), true);
  for (uint r = 0; r < REPETITIONS; r++) {
    for (uint s = 0; s < SAMPLES; s++) {
      uint64_t witness = plaintexts[r * SAMPLES + s];
      // Instantiate the GGSW m^tree ciphertexts
      // We need r GGSW ciphertexts
      // Bit decomposition of the value from MSB to LSB
      uint64_t *bit_array = bit_decompose_value(witness, r_lut);
      for (int i = 0; i < r_lut; i++) {
        uint64_t *d_ggsw_slice = d_ggsw_bit_array + i * ggsw_size;
        concrete_cpu_encrypt_ggsw_ciphertext_u64(
            glwe_sk, ggsw, bit_array[i], glwe_dimension, polynomial_size,
            level_count, base_log, glwe_modular_variance, csprng,
            &CONCRETE_CSPRNG_VTABLE);
        cuda_memcpy_async_to_gpu(d_ggsw_slice, ggsw,
                                 ggsw_size * sizeof(uint64_t), stream,
                                 gpu_index);
      }
      cuda_synchronize_stream(stream);
      // Execute the CMUX tree
      cuda_cmux_tree_64(stream, gpu_index, (void *)d_results,
                        (void *)d_ggsw_bit_array, (void *)d_lut_identity,
                        cmux_tree_buffer, glwe_dimension, polynomial_size,
                        base_log, level_count, r_lut, tau,
                        cuda_get_max_shared_memory(gpu_index));
      // Copy result back and wait for it before reading on the host
      cuda_memcpy_async_to_cpu(results, d_results,
                               tau * glwe_size * sizeof(uint64_t), stream,
                               gpu_index);
      cuda_synchronize_stream(stream);
      for (int tree = 0; tree < tau; tree++) {
        uint64_t *result = results + tree * glwe_size;
        uint64_t *decrypted =
            (uint64_t *)malloc(polynomial_size * sizeof(uint64_t));
        concrete_cpu_decrypt_glwe_ciphertext_u64(
            glwe_sk, decrypted, result, glwe_dimension, polynomial_size);
        // The bit before the message
        uint64_t rounding_bit = delta >> 1;
        // Compute the rounding bit
        uint64_t rounding = (decrypted[0] & rounding_bit) << 1;
        uint64_t decoded = (decrypted[0] + rounding) / delta;
        EXPECT_EQ(decoded, (witness + tree) % (1 << (64 - delta_log)));
        free(decrypted);
      }
      free(bit_array);
    }
  }
  cuda_synchronize_stream(stream);
  cleanup_cuda_cmux_tree(stream, gpu_index, &cmux_tree_buffer);
  free(ggsw);
  // Fix: results was leaked and d_results was never released on the device.
  free(results);
  cuda_drop_async(d_ggsw_bit_array, stream, gpu_index);
  cuda_drop_async(d_results, stream, gpu_index);
}
// NOTE(review): the following file-scope variables appear unused in this test
// file -- the CMUX tree test reads its parameters from the fixture members
// above. Confirm against the rest of the file and consider removing them.
int glwe_dimension;
int polynomial_size;
double glwe_modular_variance;
int base_log;
int level_count;
int message_modulus;
int carry_modulus;
// Defines for which parameter sets the CMUX tree will be tested.
// Each entry is one full CMUXTreeTestParams configuration.
::testing::internal::ParamGenerator<CMUXTreeTestParams> cmux_tree_params_u64 =
    ::testing::Values(
        // k, N, r, tau, glwe_variance, base_log, level_count, delta_log
        (CMUXTreeTestParams){2, 256, 10, 6, 0.00000000000000029403601535432533,
                             6, 3, 60});
std::string printParamName(::testing::TestParamInfo<CMUXTreeTestParams> p) {
  // Build a readable gtest name from the CMUX-tree parameters.
  const CMUXTreeTestParams &tp = p.param;
  std::string name = "k_" + std::to_string(tp.glwe_dimension);
  name += "_N_" + std::to_string(tp.polynomial_size);
  name += "_tau_" + std::to_string(tp.tau);
  name += "_base_log_" + std::to_string(tp.base_log);
  name += "_level_count_" + std::to_string(tp.level_count);
  return name;
}
// Register the parameterized CMUX tree tests; names come from printParamName.
INSTANTIATE_TEST_CASE_P(CMUXTreeInstantiation, CMUXTreeTestPrimitives_u64,
                        cmux_tree_params_u64, printParamName);

View File

@@ -0,0 +1,192 @@
#include "../include/device.h"
#include "../include/keyswitch.h"
#include "concrete-cpu.h"
#include "utils.h"
#include "gtest/gtest.h"
#include <cstdint>
#include <functional>
#include <stdio.h>
#include <stdlib.h>
// Number of independent key sets, and number of samples tested per key set.
const unsigned REPETITIONS = 5;
const unsigned SAMPLES = 100;
// One keyswitch test configuration (field order matches the
// "n, k*N, noise_variance, ks_base_log, ks_level, ..." initializers below).
typedef struct {
  int input_lwe_dimension;  // LWE dimension before keyswitch (k*N)
  int output_lwe_dimension; // LWE dimension after keyswitch (n)
  double noise_variance;    // encryption noise variance
  int ksk_base_log;         // keyswitch key decomposition base log
  int ksk_level;            // keyswitch key decomposition level count
  int message_modulus;      // message space size
  int carry_modulus;        // carry space size
} KeyswitchTestParams;
// Fixture for the keyswitch tests: owns the CSPRNG, the input/output LWE
// secret keys, the keyswitch keys on the device and the ciphertext buffers.
class KeyswitchTestPrimitives_u64
    : public ::testing::TestWithParam<KeyswitchTestParams> {
protected:
  int input_lwe_dimension;
  int output_lwe_dimension;
  double noise_variance;
  int ksk_base_log;
  int ksk_level;
  int message_modulus;
  int carry_modulus;
  int payload_modulus;
  uint64_t delta;
  Csprng *csprng;
  cudaStream_t *stream;
  int gpu_index = 0;
  uint64_t *lwe_sk_in_array;
  uint64_t *lwe_sk_out_array;
  uint64_t *plaintexts;
  uint64_t *d_ksk_array;
  uint64_t *d_lwe_out_ct;
  uint64_t *d_lwe_in_ct;
  uint64_t *lwe_in_ct;
  uint64_t *lwe_out_ct;
  int num_samples;

public:
  // Allocates keys, plaintexts and host/device ciphertext buffers.
  void SetUp() {
    stream = cuda_create_stream(0);
    void *v_stream = (void *)stream;
    // TestParams
    input_lwe_dimension = (int)GetParam().input_lwe_dimension;
    output_lwe_dimension = (int)GetParam().output_lwe_dimension;
    // Fix: the variance is a double in (0, 1); the previous (int) cast
    // truncated it to 0, making every encryption noiseless.
    noise_variance = (double)GetParam().noise_variance;
    ksk_base_log = (int)GetParam().ksk_base_log;
    ksk_level = (int)GetParam().ksk_level;
    message_modulus = (int)GetParam().message_modulus;
    carry_modulus = (int)GetParam().carry_modulus;
    payload_modulus = message_modulus * carry_modulus;
    // Value of the shift we multiply our messages by
    delta = ((uint64_t)(1) << 63) / (uint64_t)(payload_modulus);
    // Create a Csprng
    csprng =
        (Csprng *)aligned_alloc(CONCRETE_CSPRNG_ALIGN, CONCRETE_CSPRNG_SIZE);
    uint8_t seed[16] = {(uint8_t)0};
    concrete_cpu_construct_concrete_csprng(
        csprng, Uint128{.little_endian_bytes = {*seed}});
    // Generate the keys
    generate_lwe_secret_keys(&lwe_sk_in_array, input_lwe_dimension, csprng, REPETITIONS);
    generate_lwe_secret_keys(&lwe_sk_out_array, output_lwe_dimension, csprng, REPETITIONS);
    generate_lwe_keyswitch_keys(
        stream, gpu_index, &d_ksk_array, lwe_sk_in_array, lwe_sk_out_array,
        input_lwe_dimension, output_lwe_dimension, ksk_level, ksk_base_log,
        csprng, noise_variance, REPETITIONS);
    plaintexts = generate_plaintexts(payload_modulus, delta, 1, REPETITIONS, SAMPLES);
    d_lwe_out_ct = (uint64_t *)cuda_malloc_async(
        (output_lwe_dimension + 1) * sizeof(uint64_t), stream, gpu_index);
    d_lwe_in_ct = (uint64_t *)cuda_malloc_async(
        (input_lwe_dimension + 1) * sizeof(uint64_t), stream, gpu_index);
    lwe_in_ct =
        (uint64_t *)malloc((input_lwe_dimension + 1) * sizeof(uint64_t));
    lwe_out_ct =
        (uint64_t *)malloc((output_lwe_dimension + 1) * sizeof(uint64_t));
    cuda_synchronize_stream(v_stream);
  }
  // Releases every resource allocated in SetUp.
  void TearDown() {
    void *v_stream = (void *)stream;
    cuda_synchronize_stream(v_stream);
    concrete_cpu_destroy_concrete_csprng(csprng);
    free(csprng);
    cuda_drop_async(d_lwe_in_ct, stream, gpu_index);
    cuda_drop_async(d_lwe_out_ct, stream, gpu_index);
    free(lwe_in_ct);
    free(lwe_out_ct);
    free(lwe_sk_in_array);
    free(lwe_sk_out_array);
    free(plaintexts);
    cuda_drop_async(d_ksk_array, stream, gpu_index);
    cuda_destroy_stream(stream, gpu_index);
  }
};
// Keyswitch correctness test: encrypt under the input key, keyswitch on the
// GPU, decrypt under the output key and check the decoded message.
TEST_P(KeyswitchTestPrimitives_u64, keyswitch) {
  void *v_stream = (void *)stream;
  for (uint r = 0; r < REPETITIONS; r++) {
    for (uint s = 0; s < SAMPLES; s++) {
      uint64_t plaintext = plaintexts[r * SAMPLES + s];
      // Per-repetition offsets into the key arrays.
      uint64_t *lwe_in_sk =
          lwe_sk_in_array + (ptrdiff_t)(r * input_lwe_dimension);
      uint64_t *lwe_out_sk =
          lwe_sk_out_array + (ptrdiff_t)(r * output_lwe_dimension);
      int ksk_size =
          ksk_level * (output_lwe_dimension + 1) * input_lwe_dimension;
      uint64_t *d_ksk = d_ksk_array + (ptrdiff_t)(ksk_size * r);
      concrete_cpu_encrypt_lwe_ciphertext_u64(
          lwe_in_sk, lwe_in_ct, plaintext, input_lwe_dimension, noise_variance,
          csprng, &CONCRETE_CSPRNG_VTABLE);
      cuda_synchronize_stream(v_stream);
      cuda_memcpy_async_to_gpu(d_lwe_in_ct, lwe_in_ct,
                               (input_lwe_dimension + 1) * sizeof(uint64_t),
                               stream, gpu_index);
      // Execute keyswitch
      cuda_keyswitch_lwe_ciphertext_vector_64(
          stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_in_ct,
          (void *)d_ksk, input_lwe_dimension, output_lwe_dimension,
          ksk_base_log, ksk_level, 1);
      // Copy result back
      cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct,
                               (output_lwe_dimension + 1) * sizeof(uint64_t),
                               stream, gpu_index);
      // Fix: the device-to-host copy is asynchronous; synchronize before
      // reading lwe_out_ct on the host (previously there was no sync at all,
      // a read-before-copy race).
      cuda_synchronize_stream(v_stream);
      uint64_t decrypted = 0;
      concrete_cpu_decrypt_lwe_ciphertext_u64(lwe_out_sk, lwe_out_ct,
                                              output_lwe_dimension, &decrypted);
      // The raw decryption carries noise, so it should differ from the
      // noiseless plaintext.
      EXPECT_NE(decrypted, plaintext);
      // let err = (decrypted >= plaintext) ? decrypted - plaintext : plaintext
      // - decrypted;
      // error_sample_vec.push(err);
      // The bit before the message
      uint64_t rounding_bit = delta >> 1;
      // Compute the rounding bit
      uint64_t rounding = (decrypted & rounding_bit) << 1;
      uint64_t decoded = (decrypted + rounding) / delta;
      ASSERT_EQ(decoded, plaintext / delta);
    }
  }
}
// Defines for which parameter sets the keyswitch will be tested.
// Each entry is one full KeyswitchTestParams configuration.
::testing::internal::ParamGenerator<KeyswitchTestParams> ksk_params_u64 =
    ::testing::Values(
        // n, k*N, noise_variance, ks_base_log, ks_level,
        // message_modulus, carry_modulus
        // 1 bit message 0 bit carry parameters
        (KeyswitchTestParams){567, 1280, 2.9802322387695312e-08, 3, 3, 2, 1},
        // 3 bits message 0 bit carry parameters
        (KeyswitchTestParams){694, 1536, 2.9802322387695312e-08, 4, 3, 4, 1},
        // 4 bits message 0 bit carry parameters
        (KeyswitchTestParams){769, 2048, 2.9802322387695312e-08, 4, 3, 5, 1},
        // 5 bits message 0 bit carry parameters
        (KeyswitchTestParams){754, 2048, 2.9802322387695312e-08, 3, 5, 6, 1},
        // 6 bits message 0 bit carry parameters
        (KeyswitchTestParams){847, 4096, 2.9802322387695312e-08, 4, 4, 7, 1},
        // 7 bits message 0 bit carry parameters
        (KeyswitchTestParams){881, 8192, 2.9802322387695312e-08, 3, 6, 8, 1});
std::string printParamName(::testing::TestParamInfo<KeyswitchTestParams> p) {
  // Build a readable gtest name from the keyswitch parameters.
  const KeyswitchTestParams &tp = p.param;
  std::string name = "na_" + std::to_string(tp.input_lwe_dimension);
  name += "_nb_" + std::to_string(tp.output_lwe_dimension);
  name += "_baselog_" + std::to_string(tp.ksk_base_log);
  name += "_ksk_level_" + std::to_string(tp.ksk_level);
  return name;
}
// Register the parameterized keyswitch tests; names come from printParamName.
INSTANTIATE_TEST_CASE_P(KeyswitchInstantiation, KeyswitchTestPrimitives_u64,
                        ksk_params_u64, printParamName);

View File

@@ -0,0 +1,279 @@
#include "../include/device.h"
#include "../include/linear_algebra.h"
#include "concrete-cpu.h"
#include "utils.h"
#include "gtest/gtest.h"
#include <cstdint>
#include <functional>
#include <stdio.h>
#include <stdlib.h>
// Number of independent key sets, and number of samples tested per key set.
const unsigned REPETITIONS = 5;
const unsigned SAMPLES = 100;
// One linear-algebra test configuration (field order matches the
// "n, lwe_std_dev, message_modulus, carry_modulus" initializers below).
typedef struct {
  int lwe_dimension;     // n
  double noise_variance; // encryption noise variance
  int message_modulus;   // message space size
  int carry_modulus;     // carry space size
} LinearAlgebraTestParams;
// Fixture for the LWE linear-algebra tests (add, plaintext add, cleartext
// mult, negate): owns the CSPRNG, the secret keys, the plaintexts and the
// host/device ciphertext buffers.
class LinearAlgebraTestPrimitives_u64
    : public ::testing::TestWithParam<LinearAlgebraTestParams> {
protected:
  int lwe_dimension;
  double noise_variance;
  int message_modulus;
  int carry_modulus;
  int payload_modulus;
  uint64_t delta;
  Csprng *csprng;
  cudaStream_t *stream;
  int gpu_index = 0;
  uint64_t *lwe_sk_array;
  uint64_t *d_lwe_in_1_ct;
  uint64_t *d_lwe_in_2_ct;
  uint64_t *d_lwe_out_ct;
  uint64_t *lwe_in_1_ct;
  uint64_t *lwe_in_2_ct;
  uint64_t *lwe_out_ct;
  uint64_t *plaintexts_1;
  uint64_t *plaintexts_2;
  int num_samples;

public:
  // Allocates keys, plaintexts and host/device ciphertext buffers.
  void SetUp() {
    stream = cuda_create_stream(0);
    void *v_stream = (void *)stream;
    // TestParams
    lwe_dimension = (int)GetParam().lwe_dimension;
    // Fix: the variance is a double in (0, 1); the previous (int) cast
    // truncated it to 0, making every encryption noiseless.
    noise_variance = (double)GetParam().noise_variance;
    message_modulus = (int)GetParam().message_modulus;
    carry_modulus = (int)GetParam().carry_modulus;
    payload_modulus = message_modulus * carry_modulus;
    // Value of the shift we multiply our messages by
    delta = ((uint64_t)(1) << 63) / (uint64_t)(payload_modulus);
    // Create a Csprng
    csprng =
        (Csprng *)aligned_alloc(CONCRETE_CSPRNG_ALIGN, CONCRETE_CSPRNG_SIZE);
    uint8_t seed[16] = {(uint8_t)0};
    concrete_cpu_construct_concrete_csprng(
        csprng, Uint128{.little_endian_bytes = {*seed}});
    // Generate the keys
    generate_lwe_secret_keys(&lwe_sk_array, lwe_dimension, csprng, REPETITIONS);
    plaintexts_1 = generate_plaintexts(payload_modulus, delta, 1, REPETITIONS, SAMPLES);
    plaintexts_2 = generate_plaintexts(payload_modulus, delta, 1, REPETITIONS, SAMPLES);
    d_lwe_in_1_ct = (uint64_t *)cuda_malloc_async(
        (lwe_dimension + 1) * sizeof(uint64_t), stream, gpu_index);
    d_lwe_in_2_ct = (uint64_t *)cuda_malloc_async(
        (lwe_dimension + 1) * sizeof(uint64_t), stream, gpu_index);
    d_lwe_out_ct = (uint64_t *)cuda_malloc_async(
        (lwe_dimension + 1) * sizeof(uint64_t), stream, gpu_index);
    lwe_in_1_ct = (uint64_t *)malloc((lwe_dimension + 1) * sizeof(uint64_t));
    lwe_in_2_ct = (uint64_t *)malloc((lwe_dimension + 1) * sizeof(uint64_t));
    lwe_out_ct = (uint64_t *)malloc((lwe_dimension + 1) * sizeof(uint64_t));
    cuda_synchronize_stream(v_stream);
  }
  // Releases every resource allocated in SetUp.
  void TearDown() {
    void *v_stream = (void *)stream;
    cuda_synchronize_stream(v_stream);
    concrete_cpu_destroy_concrete_csprng(csprng);
    free(csprng);
    cuda_drop_async(d_lwe_in_1_ct, stream, gpu_index);
    cuda_drop_async(d_lwe_in_2_ct, stream, gpu_index);
    cuda_drop_async(d_lwe_out_ct, stream, gpu_index);
    free(lwe_in_1_ct);
    free(lwe_in_2_ct);
    free(lwe_out_ct);
    free(lwe_sk_array);
    free(plaintexts_1);
    free(plaintexts_2);
    // Fix: the stream was created in SetUp but never destroyed (every other
    // fixture in this test suite destroys its stream in TearDown).
    cuda_destroy_stream(stream, gpu_index);
  }
};
// Ciphertext + ciphertext addition test: the decrypted sum must decode to
// the sum of the two plaintexts.
TEST_P(LinearAlgebraTestPrimitives_u64, addition) {
  void *v_stream = (void *)stream;
  for (uint r = 0; r < REPETITIONS; r++) {
    for (uint s = 0; s < SAMPLES; s++) {
      uint64_t plaintext_1 = plaintexts_1[r * SAMPLES + s];
      uint64_t plaintext_2 = plaintexts_2[r * SAMPLES + s];
      uint64_t *lwe_sk = lwe_sk_array + (ptrdiff_t)(r * lwe_dimension);
      concrete_cpu_encrypt_lwe_ciphertext_u64(lwe_sk, lwe_in_1_ct, plaintext_1,
                                              lwe_dimension, noise_variance,
                                              csprng, &CONCRETE_CSPRNG_VTABLE);
      concrete_cpu_encrypt_lwe_ciphertext_u64(lwe_sk, lwe_in_2_ct, plaintext_2,
                                              lwe_dimension, noise_variance,
                                              csprng, &CONCRETE_CSPRNG_VTABLE);
      cuda_synchronize_stream(v_stream);
      cuda_memcpy_async_to_gpu(d_lwe_in_1_ct, lwe_in_1_ct,
                               (lwe_dimension + 1) * sizeof(uint64_t), stream,
                               gpu_index);
      cuda_memcpy_async_to_gpu(d_lwe_in_2_ct, lwe_in_2_ct,
                               (lwe_dimension + 1) * sizeof(uint64_t), stream,
                               gpu_index);
      // Execute addition
      cuda_add_lwe_ciphertext_vector_64(
          stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_in_1_ct,
          (void *)d_lwe_in_2_ct, lwe_dimension, 1);
      // Copy result back
      cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct,
                               (lwe_dimension + 1) * sizeof(uint64_t), stream,
                               gpu_index);
      // Fix: the device-to-host copy is asynchronous; synchronize BEFORE
      // reading lwe_out_ct (the sync used to happen after the assertion,
      // a read-before-copy race).
      cuda_synchronize_stream(v_stream);
      uint64_t decrypted = 0;
      concrete_cpu_decrypt_lwe_ciphertext_u64(lwe_sk, lwe_out_ct, lwe_dimension,
                                              &decrypted);
      // The bit before the message
      uint64_t rounding_bit = delta >> 1;
      // Compute the rounding bit
      uint64_t rounding = (decrypted & rounding_bit) << 1;
      uint64_t decoded = (decrypted + rounding) / delta;
      ASSERT_EQ(decoded, (plaintext_1 + plaintext_2) / delta);
    }
  }
}
// Ciphertext + plaintext addition test: the decrypted result must decode to
// the sum of both plaintexts.
TEST_P(LinearAlgebraTestPrimitives_u64, plaintext_addition) {
  void *v_stream = (void *)stream;
  for (uint r = 0; r < REPETITIONS; r++) {
    for (uint s = 0; s < SAMPLES; s++) {
      uint64_t plaintext_1 = plaintexts_1[r * SAMPLES + s];
      uint64_t plaintext_2 = plaintexts_2[r * SAMPLES + s];
      uint64_t *lwe_sk = lwe_sk_array + (ptrdiff_t)(r * lwe_dimension);
      concrete_cpu_encrypt_lwe_ciphertext_u64(lwe_sk, lwe_in_1_ct, plaintext_1,
                                              lwe_dimension, noise_variance,
                                              csprng, &CONCRETE_CSPRNG_VTABLE);
      cuda_synchronize_stream(v_stream);
      cuda_memcpy_async_to_gpu(d_lwe_in_1_ct, lwe_in_1_ct,
                               (lwe_dimension + 1) * sizeof(uint64_t), stream,
                               gpu_index);
      // d_lwe_in_2_ct is reused as the device plaintext vector (length 1).
      cuda_memcpy_async_to_gpu(d_lwe_in_2_ct, &plaintext_2, sizeof(uint64_t),
                               stream, gpu_index);
      // Execute addition
      cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
          stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_in_1_ct,
          (void *)d_lwe_in_2_ct, lwe_dimension, 1);
      // Copy result back
      cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct,
                               (lwe_dimension + 1) * sizeof(uint64_t), stream,
                               gpu_index);
      // Fix: synchronize BEFORE reading lwe_out_ct on the host (the sync
      // used to happen after the assertion, a read-before-copy race).
      cuda_synchronize_stream(v_stream);
      uint64_t decrypted = 0;
      concrete_cpu_decrypt_lwe_ciphertext_u64(lwe_sk, lwe_out_ct, lwe_dimension,
                                              &decrypted);
      // The bit before the message
      uint64_t rounding_bit = delta >> 1;
      // Compute the rounding bit
      uint64_t rounding = (decrypted & rounding_bit) << 1;
      uint64_t decoded = (decrypted + rounding) / delta;
      ASSERT_EQ(decoded, (plaintext_1 + plaintext_2) / delta);
    }
  }
}
// Ciphertext x cleartext multiplication test: the decrypted result must
// decode to the product of the two plaintexts (modulo 2^64, rescaled).
TEST_P(LinearAlgebraTestPrimitives_u64, plaintext_multiplication) {
  void *v_stream = (void *)stream;
  for (uint r = 0; r < REPETITIONS; r++) {
    for (uint s = 0; s < SAMPLES; s++) {
      uint64_t plaintext_1 = plaintexts_1[r * SAMPLES + s];
      uint64_t plaintext_2 = plaintexts_2[r * SAMPLES + s];
      uint64_t *lwe_sk = lwe_sk_array + (ptrdiff_t)(r * lwe_dimension);
      concrete_cpu_encrypt_lwe_ciphertext_u64(lwe_sk, lwe_in_1_ct, plaintext_1,
                                              lwe_dimension, noise_variance,
                                              csprng, &CONCRETE_CSPRNG_VTABLE);
      cuda_synchronize_stream(v_stream);
      cuda_memcpy_async_to_gpu(d_lwe_in_1_ct, lwe_in_1_ct,
                               (lwe_dimension + 1) * sizeof(uint64_t), stream,
                               gpu_index);
      // Fix: the cleartext multiplier must be plaintext_2, since the
      // assertion below expects plaintext_1 * plaintext_2 (plaintext_1 was
      // uploaded before, which only matched when both plaintexts were equal).
      cuda_memcpy_async_to_gpu(d_lwe_in_2_ct, &plaintext_2, sizeof(uint64_t),
                               stream, gpu_index);
      // Execute multiplication
      cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
          stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_in_1_ct,
          (void *)d_lwe_in_2_ct, lwe_dimension, 1);
      // Copy result back
      cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct,
                               (lwe_dimension + 1) * sizeof(uint64_t), stream,
                               gpu_index);
      // Fix: synchronize BEFORE reading lwe_out_ct on the host (the sync
      // used to happen after the assertion, a read-before-copy race).
      cuda_synchronize_stream(v_stream);
      uint64_t decrypted = 0;
      concrete_cpu_decrypt_lwe_ciphertext_u64(lwe_sk, lwe_out_ct, lwe_dimension,
                                              &decrypted);
      // The bit before the message
      uint64_t rounding_bit = delta >> 1;
      // Compute the rounding bit
      uint64_t rounding = (decrypted & rounding_bit) << 1;
      uint64_t decoded = (decrypted + rounding) / delta;
      ASSERT_EQ(decoded, (plaintext_1 * plaintext_2) / delta);
    }
  }
}
// Ciphertext negation test: the decrypted result must decode to the
// (wrapping, unsigned) negation of the plaintext.
TEST_P(LinearAlgebraTestPrimitives_u64, negate) {
  void *v_stream = (void *)stream;
  for (uint r = 0; r < REPETITIONS; r++) {
    for (uint s = 0; s < SAMPLES; s++) {
      uint64_t plaintext = plaintexts_1[r * SAMPLES + s];
      uint64_t *lwe_sk = lwe_sk_array + (ptrdiff_t)(r * lwe_dimension);
      concrete_cpu_encrypt_lwe_ciphertext_u64(lwe_sk, lwe_in_1_ct, plaintext,
                                              lwe_dimension, noise_variance,
                                              csprng, &CONCRETE_CSPRNG_VTABLE);
      cuda_synchronize_stream(v_stream);
      cuda_memcpy_async_to_gpu(d_lwe_in_1_ct, lwe_in_1_ct,
                               (lwe_dimension + 1) * sizeof(uint64_t), stream,
                               gpu_index);
      // Execute negation
      cuda_negate_lwe_ciphertext_vector_64(
          stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_in_1_ct,
          lwe_dimension, 1);
      // Copy result back
      cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct,
                               (lwe_dimension + 1) * sizeof(uint64_t), stream,
                               gpu_index);
      // Fix: synchronize BEFORE reading lwe_out_ct on the host (the sync
      // used to happen after the assertion, a read-before-copy race).
      cuda_synchronize_stream(v_stream);
      uint64_t decrypted = 0;
      concrete_cpu_decrypt_lwe_ciphertext_u64(lwe_sk, lwe_out_ct, lwe_dimension,
                                              &decrypted);
      // The bit before the message
      uint64_t rounding_bit = delta >> 1;
      // Compute the rounding bit
      uint64_t rounding = (decrypted & rounding_bit) << 1;
      uint64_t decoded = (decrypted + rounding) / delta;
      // -plaintext wraps modulo 2^64, matching the homomorphic negation.
      ASSERT_EQ(decoded, -plaintext / delta);
    }
  }
}
// Defines for which parameter sets the linear algebra operations will be
// tested. Each entry is one full LinearAlgebraTestParams configuration.
::testing::internal::ParamGenerator<LinearAlgebraTestParams>
    linear_algebra_params_u64 = ::testing::Values(
        // n, lwe_std_dev, message_modulus, carry_modulus
        (LinearAlgebraTestParams){600, 0.000007069849454709433, 4, 4});
std::string
printParamName(::testing::TestParamInfo<LinearAlgebraTestParams> p) {
  // Name each parameterized test after its LWE dimension.
  const LinearAlgebraTestParams &tp = p.param;
  return std::string("n_").append(std::to_string(tp.lwe_dimension));
}
// Register the parameterized linear algebra tests; names come from
// printParamName.
INSTANTIATE_TEST_CASE_P(LinearAlgebraInstantiation,
                        LinearAlgebraTestPrimitives_u64,
                        linear_algebra_params_u64, printParamName);

View File

@@ -0,0 +1,289 @@
#include "../include/bootstrap.h"
#include "../include/device.h"
#include "concrete-cpu.h"
#include "utils.h"
#include "gtest/gtest.h"
#include <cmath>
#include <cstdint>
#include <stdio.h>
#include <stdlib.h>
// Number of independent key sets, and number of samples tested per key set.
const unsigned REPETITIONS = 5;
const unsigned SAMPLES = 100;
// One WoP-PBS (bootstrap without padding) test configuration.
typedef struct {
  int lwe_dimension;            // n (after keyswitch)
  int glwe_dimension;           // k
  int polynomial_size;          // N
  double lwe_modular_variance;  // LWE encryption noise variance
  double glwe_modular_variance; // GLWE encryption noise variance
  int pbs_base_log;             // PBS decomposition base log
  int pbs_level;                // PBS decomposition level count
  int ks_base_log;              // keyswitch decomposition base log
  int ks_level;                 // keyswitch decomposition level count
  int pksk_base_log;            // packing keyswitch decomposition base log
  int pksk_level;               // packing keyswitch decomposition level count
  int cbs_base_log;             // circuit bootstrap decomposition base log
  int cbs_level;                // circuit bootstrap decomposition level count
  int tau;                      // number of LWE ciphertexts processed at once
} WopBootstrapTestParams;
// Fixture for the WoP-PBS tests: owns the CSPRNG, all key material (BSK,
// KSK, PKSK), the LUT vector on the device and the ciphertext buffers.
class WopBootstrapTestPrimitives_u64
    : public ::testing::TestWithParam<WopBootstrapTestParams> {
protected:
  int lwe_dimension;
  int glwe_dimension;
  int polynomial_size;
  double lwe_modular_variance;
  double glwe_modular_variance;
  int pbs_base_log;
  int pbs_level;
  int ks_base_log;
  int ks_level;
  int pksk_base_log;
  int pksk_level;
  int cbs_base_log;
  int cbs_level;
  int tau;
  int p;
  uint64_t delta;
  uint32_t cbs_delta_log;
  int delta_log;
  int delta_log_lut;
  Csprng *csprng;
  cudaStream_t *stream;
  int gpu_index = 0;
  uint64_t *lwe_sk_in_array;
  uint64_t *lwe_sk_out_array;
  uint64_t *lwe_in_ct_array;
  uint64_t *lwe_out_ct_array;
  uint64_t *plaintexts;
  double *d_fourier_bsk_array;
  uint64_t *d_ksk_array;
  uint64_t *d_pksk_array;
  uint64_t *d_lwe_ct_in_array;
  uint64_t *d_lwe_ct_out_array;
  uint64_t *d_lut_vector;
  int8_t *wop_pbs_buffer;

public:
  // Allocates key material, plaintexts, the LUT vector and the ciphertext
  // buffers for all repetitions.
  void SetUp() {
    stream = cuda_create_stream(0);
    void *v_stream = (void *)stream;
    // TestParams
    lwe_dimension = (int)GetParam().lwe_dimension;
    glwe_dimension = (int)GetParam().glwe_dimension;
    polynomial_size = (int)GetParam().polynomial_size;
    lwe_modular_variance = (double)GetParam().lwe_modular_variance;
    glwe_modular_variance = (double)GetParam().glwe_modular_variance;
    pbs_base_log = (int)GetParam().pbs_base_log;
    pbs_level = (int)GetParam().pbs_level;
    ks_base_log = (int)GetParam().ks_base_log;
    ks_level = (int)GetParam().ks_level;
    pksk_base_log = (int)GetParam().pksk_base_log;
    pksk_level = (int)GetParam().pksk_level;
    cbs_base_log = (int)GetParam().cbs_base_log;
    cbs_level = (int)GetParam().cbs_level;
    tau = (int)GetParam().tau;
    // p bits are extracted per ciphertext so that tau * p = 10 total bits.
    p = 10 / tau;
    delta_log = 64 - p;
    delta_log_lut = delta_log;
    delta = (uint64_t)(1) << delta_log;
    // Create a Csprng
    csprng =
        (Csprng *)aligned_alloc(CONCRETE_CSPRNG_ALIGN, CONCRETE_CSPRNG_SIZE);
    uint8_t seed[16] = {(uint8_t)0};
    concrete_cpu_construct_concrete_csprng(
        csprng, Uint128{.little_endian_bytes = {*seed}});
    int input_lwe_dimension = glwe_dimension * polynomial_size;
    // Generate the keys
    generate_lwe_secret_keys(&lwe_sk_in_array, input_lwe_dimension, csprng, REPETITIONS);
    generate_lwe_secret_keys(&lwe_sk_out_array, lwe_dimension, csprng, REPETITIONS);
    generate_lwe_keyswitch_keys(stream, gpu_index, &d_ksk_array,
                                lwe_sk_in_array, lwe_sk_out_array,
                                input_lwe_dimension, lwe_dimension, ks_level,
                                ks_base_log, csprng, lwe_modular_variance, REPETITIONS);
    generate_lwe_bootstrap_keys(
        stream, gpu_index, &d_fourier_bsk_array, lwe_sk_out_array,
        lwe_sk_in_array, lwe_dimension, glwe_dimension, polynomial_size,
        pbs_level, pbs_base_log, csprng, glwe_modular_variance, REPETITIONS);
    generate_lwe_private_functional_keyswitch_key_lists(
        stream, gpu_index, &d_pksk_array, lwe_sk_in_array, lwe_sk_in_array,
        input_lwe_dimension, glwe_dimension, polynomial_size, pksk_level,
        pksk_base_log, csprng, lwe_modular_variance, REPETITIONS);
    plaintexts = generate_plaintexts(p, delta, tau, REPETITIONS, SAMPLES);
    // LUT creation: one identity-like LUT of tau * 2^(tau*p) entries.
    int lut_size = polynomial_size;
    int lut_num = tau << (tau * p - (int)log2(polynomial_size)); // r
    uint64_t *big_lut =
        (uint64_t *)malloc(lut_num * lut_size * sizeof(uint64_t));
    for (int t = tau - 1; t >= 0; t--) {
      uint64_t *small_lut = big_lut + (ptrdiff_t)(t * (1 << (tau * p)));
      for (uint64_t value = 0; value < (uint64_t)(1 << (tau * p)); value++) {
        int nbits = t * p;
        uint64_t x = (value >> nbits) & (uint64_t)((1 << p) - 1);
        small_lut[value] =
            ((x % (uint64_t)(1 << (64 - delta_log))) << delta_log_lut);
      }
    }
    d_lut_vector = (uint64_t *)cuda_malloc_async(
        lut_num * lut_size * sizeof(uint64_t), stream, gpu_index);
    cuda_memcpy_async_to_gpu(d_lut_vector, big_lut,
                             lut_num * lut_size * sizeof(uint64_t), stream,
                             gpu_index);
    // Fix: the upload is asynchronous; wait for it to complete before
    // freeing the host buffer (previously big_lut could be freed while the
    // copy was still in flight).
    cuda_synchronize_stream(v_stream);
    free(big_lut);
    // Execute scratch
    // NOTE(review): delta_log is an int reinterpreted as uint32_t* here --
    // confirm scratch_cuda_wop_pbs_64 only writes 32 bits.
    scratch_cuda_wop_pbs_64(stream, gpu_index, &wop_pbs_buffer,
                            (uint32_t *)&delta_log, &cbs_delta_log,
                            glwe_dimension, lwe_dimension, polynomial_size,
                            cbs_level, pbs_level, p, p, tau,
                            cuda_get_max_shared_memory(gpu_index), true);
    // Allocate input
    d_lwe_ct_in_array = (uint64_t *)cuda_malloc_async(
        (input_lwe_dimension + 1) * tau * sizeof(uint64_t), stream, gpu_index);
    // Allocate output
    d_lwe_ct_out_array = (uint64_t *)cuda_malloc_async(
        (input_lwe_dimension + 1) * tau * sizeof(uint64_t), stream, gpu_index);
    // Host buffers hold tau ciphertexts, reused for every sample.
    lwe_in_ct_array =
        (uint64_t *)malloc((input_lwe_dimension + 1) * tau * sizeof(uint64_t));
    lwe_out_ct_array =
        (uint64_t *)malloc((input_lwe_dimension + 1) * tau * sizeof(uint64_t));
  }
  // Releases every resource allocated in SetUp.
  void TearDown() {
    void *v_stream = (void *)stream;
    cuda_synchronize_stream(v_stream);
    concrete_cpu_destroy_concrete_csprng(csprng);
    free(csprng);
    free(lwe_sk_in_array);
    free(lwe_sk_out_array);
    free(plaintexts);
    free(lwe_in_ct_array);
    free(lwe_out_ct_array);
    // NOTE(review): buffer was created by scratch_cuda_wop_pbs_64 but is
    // released with the CBS-VP cleanup -- confirm these share the same
    // allocation scheme.
    cleanup_cuda_circuit_bootstrap_vertical_packing(stream, gpu_index,
                                                    &wop_pbs_buffer);
    cuda_drop_async(d_fourier_bsk_array, stream, gpu_index);
    cuda_drop_async(d_ksk_array, stream, gpu_index);
    cuda_drop_async(d_pksk_array, stream, gpu_index);
    cuda_drop_async(d_lwe_ct_in_array, stream, gpu_index);
    cuda_drop_async(d_lwe_ct_out_array, stream, gpu_index);
    cuda_drop_async(d_lut_vector, stream, gpu_index);
    cuda_destroy_stream(stream, gpu_index);
  }
};
// WoP-PBS smoke test: encrypt tau ciphertexts per sample and run the full
// bootstrap-without-padding pipeline on the GPU (the decryption check is
// still commented out below).
TEST_P(WopBootstrapTestPrimitives_u64, wop_pbs) {
  void *v_stream = (void *)stream;
  int input_lwe_dimension = glwe_dimension * polynomial_size;
  int bsk_size = (glwe_dimension + 1) * (glwe_dimension + 1) * pbs_level *
                 polynomial_size * (lwe_dimension + 1);
  int ksk_size =
      ks_level * (lwe_dimension + 1) * glwe_dimension * polynomial_size;
  int pksk_list_size = pksk_level * (glwe_dimension + 1) * polynomial_size *
                       (glwe_dimension * polynomial_size + 1) *
                       (glwe_dimension + 1);
  for (uint r = 0; r < REPETITIONS; r++) {
    // Per-repetition offsets into the key arrays.
    double *d_fourier_bsk = d_fourier_bsk_array + (ptrdiff_t)(bsk_size * r);
    uint64_t *lwe_sk_in =
        lwe_sk_in_array + (ptrdiff_t)(r * glwe_dimension * polynomial_size);
    uint64_t *d_ksk = d_ksk_array + (ptrdiff_t)(ksk_size * r);
    uint64_t *d_pksk_list = d_pksk_array + (ptrdiff_t)(pksk_list_size * r);
    for (uint s = 0; s < SAMPLES; s++) {
      for (int t = 0; t < tau; t++) {
        uint64_t plaintext = plaintexts[r * SAMPLES * tau + s * tau + t];
        // Fix: lwe_in_ct_array only holds tau ciphertexts and is reused for
        // every (repetition, sample) pair, so the offset must be computed
        // from t alone. The previous offset
        // (r * SAMPLES * tau + s * tau + t) * (input_lwe_dimension + 1)
        // wrote past the allocation for every sample after the first.
        uint64_t *lwe_in_ct =
            lwe_in_ct_array + (ptrdiff_t)(t * (input_lwe_dimension + 1));
        concrete_cpu_encrypt_lwe_ciphertext_u64(
            lwe_sk_in, lwe_in_ct, plaintext, input_lwe_dimension,
            lwe_modular_variance, csprng, &CONCRETE_CSPRNG_VTABLE);
      }
      cuda_synchronize_stream(v_stream);
      cuda_memcpy_async_to_gpu(d_lwe_ct_in_array, lwe_in_ct_array,
                               (input_lwe_dimension + 1) * tau *
                                   sizeof(uint64_t),
                               stream, gpu_index);
      // Execute wop pbs
      cuda_wop_pbs_64(stream, gpu_index, (void *)d_lwe_ct_out_array,
                      (void *)d_lwe_ct_in_array, (void *)d_lut_vector,
                      (void *)d_fourier_bsk, (void *)d_ksk, (void *)d_pksk_list,
                      wop_pbs_buffer, cbs_delta_log, glwe_dimension,
                      lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
                      ks_base_log, ks_level, pksk_base_log, pksk_level,
                      cbs_base_log, cbs_level, p, p, delta_log, tau,
                      cuda_get_max_shared_memory(gpu_index));
      //// Copy result back
      // cuda_memcpy_async_to_cpu(lwe_out_ct_array, d_lwe_ct_out_array,
      //(input_lwe_dimension + 1) * tau * sizeof(uint64_t), stream, gpu_index);
      // cuda_synchronize_stream(v_stream);
      // for (int i = 0; i < tau; i++) {
      //  uint64_t *result_ct =
      //      lwe_out_ct_array + (ptrdiff_t)(i * (input_lwe_dimension + 1));
      //  uint64_t decrypted_message = 0;
      //  concrete_cpu_decrypt_lwe_ciphertext_u64(
      //      lwe_sk_in, result_ct, input_lwe_dimension, &decrypted_message);
      //  // Round after decryption
      //  uint64_t decrypted =
      //      closest_representable(decrypted_message, 1, p) >> delta_log;
      //  uint64_t expected = plaintext >> delta_log;
      //  EXPECT_EQ(decrypted, expected);
      //}
    }
  }
}
// Defines for which parameter sets the WoP-PBS will be tested.
// Each test is executed once per parameter set below.
// NOTE(review): the aggregate has 14 initializers; presumed field order is
// n, k, N, lwe_variance, glwe_variance, pbs_base_log, pbs_level, ks_base_log,
// ks_level, pksk_base_log, pksk_level, cbs_base_log, cbs_level, tau — confirm
// against the WopBootstrapTestParams definition.
::testing::internal::ParamGenerator<WopBootstrapTestParams> wop_pbs_params_u64 =
    ::testing::Values(
        // n, k, N, lwe_variance, glwe_variance, pbs_base_log, pbs_level,
        // ks_base_log, ks_level, pksk_base_log, pksk_level, cbs_base_log,
        // cbs_level, tau
        (WopBootstrapTestParams){481, 2, 512, 7.52316384526264e-37,
                                 7.52316384526264e-37, 4,
                                 9, 1, 9, 4, 9, 6, 4, 1}//,
        // Additional parameter sets kept disabled for now:
        //(WopBootstrapTestParams){481, 2, 512, 7.52316384526264e-37,
        //                         7.52316384526264e-37, 4, 9, 1, 9, 4, 9, 6, 4,
        //                         2} //,
        //(WopBootstrapTestParams){481, 2, 1024, 7.52316384526264e-37,
        //                         7.52316384526264e-37, 4,
        //                         9, 1, 9, 4, 9, 6, 4, 1},
        //(WopBootstrapTestParams){481, 2, 1024, 7.52316384526264e-37,
        //                         7.52316384526264e-37, 4,
        //                         9, 1, 9, 4, 9, 6, 4, 2}
    );
// Build a readable gtest parameter name for a WopBootstrapTestParams set.
// N == 512 names the path where the cmux tree is a single cmux; N == 1024
// names the path where the vertical packing skips the cmux tree.
std::string printParamName(::testing::TestParamInfo<WopBootstrapTestParams> p) {
  WopBootstrapTestParams params = p.param;
  // Common suffix encoding the key dimensions of the parameter set.
  const std::string suffix = "_n_" + std::to_string(params.lwe_dimension) +
                             "_k_" + std::to_string(params.glwe_dimension) +
                             "_N_" + std::to_string(params.polynomial_size) +
                             "_tau_" + std::to_string(params.tau);
  switch (params.polynomial_size) {
  case 512:
    // log_2_poly_size == 9: the cmux tree is done with a single cmux.
    return "wop_pbs_cmux_tree_with_single_cmux" + suffix;
  case 1024:
    // log_2_poly_size == 10: the VP skips the cmux tree.
    return "wop_pbs_without_cmux_tree" + suffix;
  default:
    return "Unknown_parameter_set";
  }
}
// Instantiate the wop_pbs test for every parameter set in wop_pbs_params_u64,
// naming each instance via printParamName.
// NOTE(review): INSTANTIATE_TEST_CASE_P is deprecated in recent googletest in
// favor of INSTANTIATE_TEST_SUITE_P — keep as-is only if the pinned gtest
// version predates the rename; confirm.
INSTANTIATE_TEST_CASE_P(WopBootstrapInstantiation,
                        WopBootstrapTestPrimitives_u64, wop_pbs_params_u64,
                        printParamName);

// ===========================================================================
// (diff artifact replaced) File boundary: start of the new test utilities
// implementation file (utils.cpp, +290 lines in the original diff).
// ===========================================================================
#include "utils.h"
#include "../include/bootstrap.h"
#include "../include/device.h"
#include "concrete-cpu.h"
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <limits>
#include <random>
// For each repetition and sample, draw `number_of_inputs` random plaintexts.
// Each plaintext is a uniform value modulo payload_modulus (the message
// modulus times the carry modulus), scaled up by delta.
// Returns a freshly malloc'd array of repetitions * samples * number_of_inputs
// words; the caller owns and frees it.
uint64_t *generate_plaintexts(uint64_t payload_modulus, uint64_t delta,
                              int number_of_inputs, const unsigned repetitions, const unsigned
                              samples) {
  const size_t total = (size_t)repetitions * samples * number_of_inputs;
  uint64_t *out = (uint64_t *)malloc(total * sizeof(uint64_t));
  std::random_device seed_source;
  std::mt19937 rng(seed_source());
  std::uniform_int_distribution<unsigned long long> draw(
      std::numeric_limits<std::uint64_t>::min(),
      std::numeric_limits<std::uint64_t>::max());
  // Fill the flattened [repetition][sample][input] array in order.
  for (size_t idx = 0; idx < total; idx++)
    out[idx] = (draw(rng) % payload_modulus) * delta;
  return out;
}
// Decompose `value` into its r least significant bits.
// bit_array[0] holds the LSB and bit_array[r - 1] the r-th bit (LSB first).
// Returns a freshly malloc'd array of r words; the caller frees it.
uint64_t *bit_decompose_value(uint64_t value, int r) {
  uint64_t *bit_array = (uint64_t *)malloc(r * sizeof(uint64_t));
  for (int pos = 0; pos < r; pos++)
    bit_array[pos] = (value >> pos) & 1;
  return bit_array;
}
// Build a trivial GLWE lut for the PBS that encodes `func` over the full
// message space (message_modulus * carry_modulus values): a zero mask and a
// body polynomial whose i-th box holds func(i) * delta, negated over the
// first half box and rotated by half a box.
// Returns a freshly malloc'd (glwe_dimension + 1) * polynomial_size array;
// the caller frees it.
uint64_t *generate_identity_lut_pbs(int polynomial_size, int glwe_dimension,
                                    int message_modulus, int carry_modulus,
                                    std::function<uint64_t(uint64_t)> func) {
  // Total plaintext modulus: message bits and carry bits together.
  uint64_t modulus_sup = message_modulus * carry_modulus;
  // Width of the lut box dedicated to each of the modulus_sup values.
  uint64_t box_size = polynomial_size / modulus_sup;
  // Scaling factor applied to each encoded value.
  uint64_t delta = ((uint64_t)1 << 63) / (uint64_t)(modulus_sup);

  // Body polynomial: func(v) * delta repeated over each box.
  uint64_t *body = (uint64_t *)malloc(polynomial_size * sizeof(uint64_t));
  for (uint64_t v = 0; v < modulus_sup; v++) {
    uint64_t start = v * box_size;
    for (uint64_t j = 0; j < box_size; j++)
      body[start + j] = func(v) * delta;
  }

  // Negate the first half box (two's-complement wrap on uint64_t) and rotate
  // it to the end of the polynomial.
  uint64_t half_box_size = box_size / 2;
  for (uint64_t j = 0; j < half_box_size; j++)
    body[j] = -body[j];
  std::rotate(body, body + half_box_size, body + polynomial_size);

  // Assemble the trivial GLWE lut: zero mask, body in the last polynomial.
  uint64_t *lut_pbs = (uint64_t *)malloc(
      polynomial_size * (glwe_dimension + 1) * sizeof(uint64_t));
  for (int i = 0; i < polynomial_size * glwe_dimension; i++)
    lut_pbs[i] = 0;
  for (int i = 0; i < polynomial_size; i++)
    lut_pbs[glwe_dimension * polynomial_size + i] = body[i];
  free(body);
  return lut_pbs;
}
// Build tau * num_lut constant test luts for the cmux tree: every coefficient
// of lut i of tree `tree` equals ((i + tree) mod 2^(64 - delta_log)) shifted
// left by delta_log. Requires 0 < delta_log < 64.
// Returns a freshly malloc'd num_lut * tau * polynomial_size array; the
// caller owns and frees it.
uint64_t *generate_identity_lut_cmux_tree(int polynomial_size, int num_lut,
                                          int tau, int delta_log) {
  // Create the plaintext lut_cmux_tree
  uint64_t *plaintext_lut_cmux_tree =
      (uint64_t *)malloc(num_lut * tau * polynomial_size * sizeof(uint64_t));
  for (int tree = 0; tree < tau; tree++)
    for (int i = 0; i < num_lut; i++) {
      // Slice for lut i of tree `tree` in the flattened array.
      uint64_t *plaintext_lut_slice = plaintext_lut_cmux_tree +
                                      i * polynomial_size +
                                      tree * num_lut * polynomial_size;
      // Shift with a 64-bit 1: `1 << (64 - delta_log)` with an int literal is
      // undefined behaviour as soon as delta_log <= 33 (shift count >= 31 on a
      // 32-bit int).
      uint64_t coeff =
          (((uint64_t)(i + tree) % ((uint64_t)1 << (64 - delta_log))))
          << delta_log;
      for (int p = 0; p < polynomial_size; p++)
        plaintext_lut_slice[p] = coeff;
    }
  return plaintext_lut_cmux_tree;
}
// Generate `repetitions` independent LWE secret keys, stored contiguously in
// a single freshly malloc'd array written into *lwe_sk_array (caller frees).
void generate_lwe_secret_keys(uint64_t **lwe_sk_array, int lwe_dimension,
                              Csprng *csprng, const unsigned repetitions) {
  *lwe_sk_array = (uint64_t *)malloc((size_t)lwe_dimension * repetitions *
                                     sizeof(uint64_t));
  for (uint r = 0; r < repetitions; r++) {
    // Sample a fresh key into the r-th slot.
    concrete_cpu_init_secret_key_u64(
        *lwe_sk_array + (ptrdiff_t)((size_t)r * lwe_dimension), lwe_dimension,
        csprng, &CONCRETE_CSPRNG_VTABLE);
  }
}
// Generate `repetitions` independent GLWE secret keys (each of
// glwe_dimension * polynomial_size words), stored contiguously in a single
// freshly malloc'd array written into *glwe_sk_array (caller frees).
void generate_glwe_secret_keys(uint64_t **glwe_sk_array, int glwe_dimension,
                               int polynomial_size, Csprng *csprng,
                               const unsigned repetitions) {
  const int key_size = glwe_dimension * polynomial_size;
  *glwe_sk_array =
      (uint64_t *)malloc((size_t)key_size * repetitions * sizeof(uint64_t));
  for (uint r = 0; r < repetitions; r++) {
    // Sample a fresh key into the r-th slot.
    concrete_cpu_init_secret_key_u64(
        *glwe_sk_array + (ptrdiff_t)((size_t)r * key_size), key_size, csprng,
        &CONCRETE_CSPRNG_VTABLE);
  }
}
// Generate `repetitions` LWE bootstrap keys on the CPU and convert each one
// to the Fourier domain on the GPU, into a single contiguous device array
// allocated here into *d_fourier_bsk_array (caller drops it with
// cuda_drop_async).
void generate_lwe_bootstrap_keys(cudaStream_t *stream, int gpu_index,
                                 double **d_fourier_bsk_array,
                                 uint64_t *lwe_sk_in_array,
                                 uint64_t *lwe_sk_out_array, int lwe_dimension,
                                 int glwe_dimension, int polynomial_size,
                                 int pbs_level, int pbs_base_log,
                                 Csprng *csprng, double variance, const unsigned repetitions) {
  void *v_stream = (void *)stream;
  // Number of 64-bit (resp. double) elements in one bootstrap key.
  int bsk_size = (glwe_dimension + 1) * (glwe_dimension + 1) * pbs_level *
                 polynomial_size * (lwe_dimension + 1);
  int bsk_array_size = bsk_size * repetitions;
  uint64_t *bsk_array = (uint64_t *)malloc(bsk_array_size * sizeof(uint64_t));
  // Device buffer holds the Fourier-domain key as doubles; element counts
  // match because sizeof(double) == sizeof(uint64_t).
  *d_fourier_bsk_array = (double *)cuda_malloc_async(
      bsk_array_size * sizeof(double), stream, gpu_index);
  int shift_in = 0;
  int shift_out = 0;
  int shift_bsk = 0;
  for (uint r = 0; r < repetitions; r++) {
    // Generate the bootstrap key for each repetition
    concrete_cpu_init_lwe_bootstrap_key_u64(
        bsk_array + (ptrdiff_t)(shift_bsk),
        lwe_sk_in_array + (ptrdiff_t)(shift_in),
        lwe_sk_out_array + (ptrdiff_t)(shift_out), lwe_dimension,
        polynomial_size, glwe_dimension, pbs_level, pbs_base_log, variance,
        Parallelism(1), csprng, &CONCRETE_CSPRNG_VTABLE);
    cuda_synchronize_stream(v_stream);
    double *d_fourier_bsk = *d_fourier_bsk_array + (ptrdiff_t)(shift_bsk);
    uint64_t *bsk = bsk_array + (ptrdiff_t)(shift_bsk);
    // NOTE(review): this second synchronize looks redundant with the one a
    // few lines above (no GPU work is enqueued in between) — confirm whether
    // one of the two can be removed.
    cuda_synchronize_stream(v_stream);
    cuda_convert_lwe_bootstrap_key_64(
        (void *)(d_fourier_bsk), (void *)(bsk), v_stream, gpu_index,
        lwe_dimension, glwe_dimension, pbs_level, polynomial_size);
    shift_in += lwe_dimension;
    shift_out += glwe_dimension * polynomial_size;
    shift_bsk += bsk_size;
  }
  // NOTE(review): bsk_array is freed right after the loop; this assumes
  // cuda_convert_lwe_bootstrap_key_64 has finished reading the host buffer
  // (or copies it synchronously) by now — confirm the wrapper's semantics.
  free(bsk_array);
}
// Generate `repetitions` LWE keyswitch keys on the CPU and upload them to a
// single contiguous device array allocated here into *d_ksk_array (caller
// drops it with cuda_drop_async).
void generate_lwe_keyswitch_keys(
    cudaStream_t *stream, int gpu_index, uint64_t **d_ksk_array,
    uint64_t *lwe_sk_in_array, uint64_t *lwe_sk_out_array,
    int input_lwe_dimension, int output_lwe_dimension, int ksk_level,
    int ksk_base_log, Csprng *csprng, double variance, const unsigned repetitions) {
  // Number of 64-bit words in one keyswitch key.
  int ksk_size = ksk_level * (output_lwe_dimension + 1) * input_lwe_dimension;
  int ksk_array_size = ksk_size * repetitions;
  uint64_t *ksk_array = (uint64_t *)malloc(ksk_array_size * sizeof(uint64_t));
  *d_ksk_array = (uint64_t *)cuda_malloc_async(
      ksk_array_size * sizeof(uint64_t), stream, gpu_index);
  int shift_in = 0;
  int shift_out = 0;
  int shift_ksk = 0;
  for (uint r = 0; r < repetitions; r++) {
    // Generate the keyswitch key for each repetition
    concrete_cpu_init_lwe_keyswitch_key_u64(
        ksk_array + (ptrdiff_t)(shift_ksk),
        lwe_sk_in_array + (ptrdiff_t)(shift_in),
        lwe_sk_out_array + (ptrdiff_t)(shift_out), input_lwe_dimension,
        output_lwe_dimension, ksk_level, ksk_base_log, variance, csprng,
        &CONCRETE_CSPRNG_VTABLE);
    uint64_t *d_ksk = *d_ksk_array + (ptrdiff_t)(shift_ksk);
    uint64_t *ksk = ksk_array + (ptrdiff_t)(shift_ksk);
    cuda_memcpy_async_to_gpu(d_ksk, ksk, ksk_size * sizeof(uint64_t), stream,
                             gpu_index);
    shift_in += input_lwe_dimension;
    shift_out += output_lwe_dimension;
    shift_ksk += ksk_size;
  }
  // NOTE(review): ksk_array is freed without synchronizing the stream; this
  // assumes cuda_memcpy_async_to_gpu has finished reading the host buffer
  // (e.g. pageable memory is staged synchronously) — confirm the wrapper's
  // semantics.
  free(ksk_array);
}
// Generate `repetitions` private functional packing keyswitch key lists (with
// (k + 1) keys each) on the CPU and upload them to a single contiguous device
// array allocated here into *d_pksk_array (caller drops it with
// cuda_drop_async).
void generate_lwe_private_functional_keyswitch_key_lists(
    cudaStream_t *stream, int gpu_index, uint64_t **d_pksk_array,
    uint64_t *lwe_sk_in_array, uint64_t *lwe_sk_out_array,
    int input_lwe_dimension, int output_glwe_dimension,
    int output_polynomial_size, int pksk_level, int pksk_base_log,
    Csprng *csprng, double variance, const unsigned repetitions) {
  // Number of 64-bit words in one list of (k + 1) pksk keys.
  int pksk_list_size = pksk_level * (output_glwe_dimension + 1) *
                       output_polynomial_size * (input_lwe_dimension + 1) *
                       (output_glwe_dimension + 1);
  int pksk_array_size = pksk_list_size * repetitions;
  uint64_t *pksk_array = (uint64_t *)malloc(pksk_array_size * sizeof(uint64_t));
  *d_pksk_array = (uint64_t *)cuda_malloc_async(
      pksk_array_size * sizeof(uint64_t), stream, gpu_index);
  int shift_in = 0;
  int shift_out = 0;
  int shift_pksk_list = 0;
  for (uint r = 0; r < repetitions; r++) {
    // Generate the (k + 1) private functional keyswitch keys for each
    // repetition
    concrete_cpu_init_lwe_circuit_bootstrap_private_functional_packing_keyswitch_keys_u64(
        pksk_array + (ptrdiff_t)(shift_pksk_list),
        lwe_sk_in_array + (ptrdiff_t)(shift_in),
        lwe_sk_out_array + (ptrdiff_t)(shift_out), input_lwe_dimension,
        output_polynomial_size, output_glwe_dimension, pksk_level,
        pksk_base_log, variance, Parallelism(1), csprng,
        &CONCRETE_CSPRNG_VTABLE);
    uint64_t *d_pksk_list = *d_pksk_array + (ptrdiff_t)(shift_pksk_list);
    uint64_t *pksk_list = pksk_array + (ptrdiff_t)(shift_pksk_list);
    cuda_memcpy_async_to_gpu(d_pksk_list, pksk_list,
                             pksk_list_size * sizeof(uint64_t), stream,
                             gpu_index);
    shift_in += input_lwe_dimension;
    shift_out += output_glwe_dimension * output_polynomial_size;
    shift_pksk_list += pksk_list_size;
  }
  // NOTE(review): pksk_array is freed without synchronizing the stream; this
  // assumes cuda_memcpy_async_to_gpu has finished reading the host buffer —
  // confirm the wrapper's semantics.
  free(pksk_array);
}
// Round `input` to the closest value representable by a decomposition with
// level_count levels of base_log bits each, i.e. round away the lowest
// 64 - level_count * base_log bits (the rounding carry may wrap mod 2^64).
// Requires level_count * base_log < 64.
uint64_t closest_representable(uint64_t input, int level_count, int base_log) {
  // Number of low-order bits the decomposition cannot represent.
  const int dropped_bits = 64 - (level_count * base_log);
  // Rounding bit: the most significant of the dropped bits.
  const uint64_t round_bit = (input >> (dropped_bits - 1)) & (uint64_t)1;
  // Truncate, apply the rounding carry, and shift back into place.
  return ((input >> dropped_bits) + round_bit) << dropped_bits;
}

// ===========================================================================
// (diff artifact replaced) File boundary: start of the new test utilities
// header file (utils.h, +50 lines in the original diff).
// ===========================================================================
#ifndef TEST_UTILS_H
#define TEST_UTILS_H
#include "../include/device.h"
#include "concrete-cpu.h"
#include <functional>
// NOTE(review): uint64_t and cudaStream_t are used below without including
// <cstdint> or a CUDA runtime header directly — presumably provided
// transitively by device.h / concrete-cpu.h; confirm.

// Create repetitions * samples * number_of_inputs random plaintexts; each is
// a multiple of delta below payload_modulus * delta. Caller frees.
uint64_t *generate_plaintexts(uint64_t payload_modulus, uint64_t delta,
                              int number_of_inputs, const unsigned repetitions, const unsigned
                              samples);
// Build a trivial GLWE lut for the PBS encoding `func` over the
// message_modulus * carry_modulus value space. Caller frees.
uint64_t *generate_identity_lut_pbs(int polynomial_size, int glwe_dimension,
                                    int message_modulus, int carry_modulus,
                                    std::function<uint64_t(uint64_t)> func);
// Build tau * num_lut constant test luts for the cmux tree. Caller frees.
uint64_t *generate_identity_lut_cmux_tree(int polynomial_size, int num_lut,
                                          int tau, int delta_log);
// Generate `repetitions` LWE secret keys into a single malloc'd array
// written to *lwe_sk_array. Caller frees.
void generate_lwe_secret_keys(uint64_t **lwe_sk_array, int lwe_dimension,
                              Csprng *csprng, const unsigned repetitions);
// Generate `repetitions` GLWE secret keys into a single malloc'd array
// written to *glwe_sk_array. Caller frees.
void generate_glwe_secret_keys(uint64_t **glwe_sk_array, int glwe_dimension,
                               int polynomial_size, Csprng *csprng, const unsigned repetitions);
// Generate `repetitions` bootstrap keys and place their Fourier-domain form
// in a device array allocated into *d_fourier_bsk_array. Caller drops it.
void generate_lwe_bootstrap_keys(cudaStream_t *stream, int gpu_index,
                                 double **d_fourier_bsk_array,
                                 uint64_t *lwe_sk_in_array,
                                 uint64_t *lwe_sk_out_array, int lwe_dimension,
                                 int glwe_dimension, int polynomial_size,
                                 int pbs_level, int pbs_base_log,
                                 Csprng *csprng, double variance, const unsigned repetitions);
// Generate `repetitions` keyswitch keys and upload them to a device array
// allocated into *d_ksk_array. Caller drops it.
void generate_lwe_keyswitch_keys(
    cudaStream_t *stream, int gpu_index, uint64_t **d_ksk_array,
    uint64_t *lwe_sk_in_array, uint64_t *lwe_sk_out_array,
    int input_lwe_dimension, int output_lwe_dimension, int ksk_level,
    int ksk_base_log, Csprng *csprng, double variance, const unsigned repetitions);
// Generate `repetitions` lists of private functional packing keyswitch keys
// and upload them to a device array allocated into *d_pksk_array.
void generate_lwe_private_functional_keyswitch_key_lists(
    cudaStream_t *stream, int gpu_index, uint64_t **d_pksk_array,
    uint64_t *lwe_sk_in_array, uint64_t *lwe_sk_out_array,
    int input_lwe_dimension, int output_glwe_dimension,
    int output_polynomial_size, int pksk_level, int pksk_base_log,
    Csprng *csprng, double variance, const unsigned repetitions);
// Round input at the bit just below the level_count * base_log most
// significant (representable) bits.
uint64_t closest_representable(uint64_t input, int level_count, int base_log);
// Decompose value into its r least significant bits, LSB first. Caller frees.
uint64_t *bit_decompose_value(uint64_t value, int r);
#endif