mirror of
https://github.com/zama-ai/concrete.git
synced 2026-02-08 03:25:05 -05:00
test(concrete_cuda): add C++ tests to concrete-cuda
This commit is contained in:
50
.github/workflows/concrete_cuda_test.yml
vendored
50
.github/workflows/concrete_cuda_test.yml
vendored
@@ -39,7 +39,7 @@ jobs:
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
|
||||
ec2-image-id: ami-0c4d39cb3dba0fcff
|
||||
ec2-image-id: ami-03f11dc8c6a5f5c0a
|
||||
ec2-instance-type: p3.2xlarge
|
||||
subnet-id: subnet-8123c9e7
|
||||
security-group-id: sg-0466d33ced960ba35
|
||||
@@ -91,31 +91,31 @@ jobs:
|
||||
cd build
|
||||
cmake ..
|
||||
make -j8
|
||||
#- name: Test concrete-cuda with Cuda 11.8
|
||||
# if: ${{ !cancelled() }}
|
||||
# run: |
|
||||
# cd backends/concrete-cuda/implementation/build
|
||||
# ./test/test_concrete_cuda
|
||||
- name: Test concrete-cuda with Cuda 11.8
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
cd backends/concrete-cuda/implementation/build
|
||||
./test/test_concrete_cuda
|
||||
|
||||
#- name: Export variables for CUDA 11.1
|
||||
# run: |
|
||||
# echo "CUDA_PATH=$OLD_CUDA_PATH" >> "${GITHUB_ENV}"
|
||||
# echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
|
||||
# echo "LD_LIBRARY_PATH=$OLD_CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
|
||||
# echo "CUDACXX=$OLD_CUDA_PATH/bin/nvcc" >> "${GITHUB_ENV}"
|
||||
#- name: Build concrete-cuda with Cuda 11.1
|
||||
# if: ${{ !cancelled() }}
|
||||
# run: |
|
||||
# cd backends/concrete-cuda/implementation
|
||||
# mkdir build-old-cuda
|
||||
# cd build-old-cuda
|
||||
# cmake ..
|
||||
# make -j8
|
||||
#- name: Test concrete-cuda with Cuda 11.1
|
||||
# if: ${{ !cancelled() }}
|
||||
# run: |
|
||||
# cd backends/concrete-cuda/implementation/build-old-cuda
|
||||
# ./test/test_concrete_cuda
|
||||
- name: Export variables for CUDA 11.1
|
||||
run: |
|
||||
echo "CUDA_PATH=$OLD_CUDA_PATH" >> "${GITHUB_ENV}"
|
||||
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
|
||||
echo "LD_LIBRARY_PATH=$OLD_CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
|
||||
echo "CUDACXX=$OLD_CUDA_PATH/bin/nvcc" >> "${GITHUB_ENV}"
|
||||
- name: Build concrete-cuda with Cuda 11.1
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
cd backends/concrete-cuda/implementation
|
||||
mkdir build-old-cuda
|
||||
cd build-old-cuda
|
||||
cmake ..
|
||||
make -j8
|
||||
- name: Test concrete-cuda with Cuda 11.1
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
cd backends/concrete-cuda/implementation/build-old-cuda
|
||||
./test/test_concrete_cuda
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ always() }}
|
||||
|
||||
@@ -1,8 +1,6 @@
|
||||
cmake_minimum_required(VERSION 3.8 FATAL_ERROR)
|
||||
project(concrete_cuda LANGUAGES CXX CUDA)
|
||||
|
||||
include(CTest)
|
||||
|
||||
# See if the minimum CUDA version is available. If not, only enable documentation building.
|
||||
set(MINIMUM_SUPPORTED_CUDA_VERSION 10.0)
|
||||
include(CheckLanguage)
|
||||
@@ -69,6 +67,7 @@ set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -ccbin ${CMAKE_CXX_COMPILER} -O3 ${CUD
|
||||
set(INCLUDE_DIR include)
|
||||
|
||||
add_subdirectory(src)
|
||||
add_subdirectory(test)
|
||||
add_subdirectory(parameters)
|
||||
target_include_directories(concrete_cuda PRIVATE ${INCLUDE_DIR})
|
||||
|
||||
@@ -87,3 +86,6 @@ if (CPPLINT)
|
||||
set_target_properties(all_lint PROPERTIES EXCLUDE_FROM_ALL TRUE)
|
||||
# set_target_properties(all_lint PROPERTIES EXCLUDE_FROM_DEFAULT_BUILD TRUE)
|
||||
endif ()
|
||||
|
||||
enable_testing()
|
||||
|
||||
|
||||
@@ -32,6 +32,10 @@ int cuda_memcpy_to_gpu(void *dest, void *src, uint64_t size,
|
||||
|
||||
int cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
|
||||
cudaStream_t *stream, uint32_t gpu_index);
|
||||
|
||||
int cuda_memset_async(void *dest, uint64_t val, uint64_t size,
|
||||
cudaStream_t *stream, uint32_t gpu_index);
|
||||
|
||||
int cuda_get_number_of_gpus();
|
||||
|
||||
int cuda_synchronize_device(uint32_t gpu_index);
|
||||
|
||||
@@ -111,6 +111,28 @@ int cuda_synchronize_device(uint32_t gpu_index) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/// Enqueues an asynchronous memset of `size` bytes at `dest` on `stream`.
/// `val` is forwarded to cudaMemsetAsync, whose fill value is an `int`
/// (NOTE(review): CUDA documents it as a byte fill, so presumably only the low
/// 8 bits of `val` are meaningful - confirm before passing anything but 0).
/// 0: success
/// -1: error, invalid device pointer (including pointers whose attributes
///     cannot be queried)
/// -2: error, gpu index doesn't exist
/// -3: error, zero copy size
int cuda_memset_async(void *dest, uint64_t val, uint64_t size,
                      cudaStream_t *stream, uint32_t gpu_index) {
  if (size == 0) {
    // error code: zero copy size
    return -3;
  }

  if (gpu_index >= cuda_get_number_of_gpus()) {
    // error code: invalid gpu_index
    return -2;
  }

  cudaPointerAttributes attr;
  if (cudaPointerGetAttributes(&attr, dest) != cudaSuccess) {
    // The query failed, so `attr` was never filled in: do not read it.
    // Reset the runtime's last-error state so that an unrelated later
    // cudaGetLastError() check does not report this failure.
    (void)cudaGetLastError();
    // error code: invalid device pointer
    return -1;
  }
  // NOTE(review): this rejects the pointer only when BOTH the device and the
  // memory type mismatch; `||` may have been intended. Left unchanged here.
  if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
    // error code: invalid device pointer
    return -1;
  }

  cudaSetDevice(gpu_index);
  cudaMemsetAsync(dest, val, size, *stream);
  return 0;
}
|
||||
|
||||
/// Tries to copy memory to the GPU asynchronously
|
||||
/// 0: success
|
||||
/// -1: error, invalid device pointer
|
||||
|
||||
54
backends/concrete-cuda/implementation/test/CMakeLists.txt
Normal file
54
backends/concrete-cuda/implementation/test/CMakeLists.txt
Normal file
@@ -0,0 +1,54 @@
|
||||
# Fetch a pinned revision of GoogleTest at configure time.
include(FetchContent)
FetchContent_Declare(
  googletest
  URL https://github.com/google/googletest/archive/03597a01ee50ed33e9dfd640b249b4be3799d395.zip
)
# For Windows: Prevent overriding the parent project's compiler/linker settings
set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
FetchContent_MakeAvailable(googletest)

# Enable ExternalProject CMake module
include(ExternalProject)

set(CONCRETE_CPU_BINARY_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../../concrete-cpu/target/release")
set(CONCRETE_CPU_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../../concrete-cpu")

# Build the concrete-cpu Rust crate with cargo and expose it as a CMake target.
# BUILD_BYPRODUCTS tells the generator which step produces the static library
# that the imported target below points at.
ExternalProject_Add(
  concrete_cpu
  SOURCE_DIR ${CONCRETE_CPU_SOURCE_DIR}
  DOWNLOAD_COMMAND ""
  CONFIGURE_COMMAND ""
  BUILD_COMMAND cargo build --release
  BINARY_DIR ${CONCRETE_CPU_BINARY_DIR}
  BUILD_BYPRODUCTS ${CONCRETE_CPU_BINARY_DIR}/libconcrete_cpu.a
  INSTALL_COMMAND ""
  LOG_BUILD ON)

include_directories(${CONCRETE_CPU_SOURCE_DIR}/include)
add_library(concrete_cpu_lib STATIC IMPORTED)
set_target_properties(concrete_cpu_lib PROPERTIES IMPORTED_LOCATION
                      ${CONCRETE_CPU_BINARY_DIR}/libconcrete_cpu.a)

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,--no-as-needed -ldl")

set(BINARY test_concrete_cuda)

# Every header and source file under this directory is part of the test binary.
file(GLOB_RECURSE TEST_SOURCES LIST_DIRECTORIES false *.h *.cpp)

set(SOURCES ${TEST_SOURCES})

add_executable(${BINARY} ${TEST_SOURCES})
# libconcrete_cpu.a is produced by the concrete_cpu external project; make sure
# cargo has run before the test binary is linked (matters for parallel builds).
add_dependencies(${BINARY} concrete_cpu)

add_test(NAME ${BINARY} COMMAND ${BINARY})

set_target_properties(test_concrete_cuda PROPERTIES CUDA_SEPARABLE_COMPILATION ON
                      CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(test_concrete_cuda PUBLIC GTest::gtest_main concrete_cpu_lib
                      concrete_cuda cudart)
find_package(CUDA REQUIRED)
include_directories("${CUDA_INCLUDE_DIRS}" "${CMAKE_CURRENT_SOURCE_DIR}")

include(GoogleTest)
gtest_discover_tests(test_concrete_cuda)
|
||||
|
||||
|
||||
@@ -0,0 +1,250 @@
|
||||
#include "../include/bit_extraction.h"
|
||||
#include "../include/device.h"
|
||||
#include "concrete-cpu.h"
|
||||
#include "utils.h"
|
||||
#include "gtest/gtest.h"
|
||||
#include <cstdint>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
// Loop bound over the key sets: passed to the key/plaintext generators and
// used as the outer repetition count in the test body.
constexpr unsigned REPETITIONS = 5;
// Loop bound over the plaintext batches processed per repetition.
constexpr unsigned SAMPLES = 100;

// One parameter set for the bit extraction test. The fields are initialized
// positionally in the parameter list, so their order must not change.
struct BitExtractionTestParams {
  // Dimensions.
  int lwe_dimension;
  int glwe_dimension;
  int polynomial_size;
  // Noise passed to the encryption / key generation helpers.
  double lwe_modular_variance;
  double glwe_modular_variance;
  // Decomposition parameters of the bootstrap key.
  int pbs_base_log;
  int pbs_level;
  // Decomposition parameters of the keyswitch key.
  int ks_base_log;
  int ks_level;
  // Message layout and batch size.
  int number_of_bits_of_message_including_padding;
  int number_of_bits_to_extract;
  int number_of_inputs;
};
|
||||
|
||||
class BitExtractionTestPrimitives_u64
|
||||
: public ::testing::TestWithParam<BitExtractionTestParams> {
|
||||
protected:
|
||||
int lwe_dimension;
|
||||
int glwe_dimension;
|
||||
int polynomial_size;
|
||||
double lwe_modular_variance;
|
||||
double glwe_modular_variance;
|
||||
int pbs_base_log;
|
||||
int pbs_level;
|
||||
int ks_base_log;
|
||||
int ks_level;
|
||||
int number_of_bits_of_message_including_padding;
|
||||
int number_of_bits_to_extract;
|
||||
int number_of_inputs;
|
||||
uint64_t delta;
|
||||
int delta_log;
|
||||
Csprng *csprng;
|
||||
cudaStream_t *stream;
|
||||
int gpu_index = 0;
|
||||
uint64_t *lwe_sk_in_array;
|
||||
uint64_t *lwe_sk_out_array;
|
||||
uint64_t *lwe_in_ct_array;
|
||||
uint64_t *lwe_out_ct_array;
|
||||
uint64_t *plaintexts;
|
||||
double *d_fourier_bsk_array;
|
||||
uint64_t *d_ksk_array;
|
||||
uint64_t *d_lwe_in_ct_array;
|
||||
uint64_t *d_lwe_out_ct_array;
|
||||
int8_t *bit_extract_buffer;
|
||||
|
||||
public:
|
||||
// Test arithmetic functions
|
||||
void SetUp() {
|
||||
stream = cuda_create_stream(0);
|
||||
|
||||
// TestParams
|
||||
lwe_dimension = (int)GetParam().lwe_dimension;
|
||||
glwe_dimension = (int)GetParam().glwe_dimension;
|
||||
polynomial_size = (int)GetParam().polynomial_size;
|
||||
lwe_modular_variance = (double)GetParam().lwe_modular_variance;
|
||||
glwe_modular_variance = (double)GetParam().glwe_modular_variance;
|
||||
pbs_base_log = (int)GetParam().pbs_base_log;
|
||||
pbs_level = (int)GetParam().pbs_level;
|
||||
ks_base_log = (int)GetParam().ks_base_log;
|
||||
ks_level = (int)GetParam().ks_level;
|
||||
number_of_bits_of_message_including_padding =
|
||||
(int)GetParam().number_of_bits_of_message_including_padding;
|
||||
number_of_bits_to_extract = (int)GetParam().number_of_bits_to_extract;
|
||||
number_of_inputs = (int)GetParam().number_of_inputs;
|
||||
delta_log = 64 - number_of_bits_of_message_including_padding;
|
||||
delta = (uint64_t)(1) << delta_log;
|
||||
|
||||
// Create a Csprng
|
||||
csprng =
|
||||
(Csprng *)aligned_alloc(CONCRETE_CSPRNG_ALIGN, CONCRETE_CSPRNG_SIZE);
|
||||
uint8_t seed[16] = {(uint8_t)0};
|
||||
concrete_cpu_construct_concrete_csprng(
|
||||
csprng, Uint128{.little_endian_bytes = {*seed}});
|
||||
|
||||
int input_lwe_dimension = glwe_dimension * polynomial_size;
|
||||
int output_lwe_dimension = lwe_dimension;
|
||||
// Generate the keys
|
||||
generate_lwe_secret_keys(&lwe_sk_in_array, input_lwe_dimension, csprng, REPETITIONS);
|
||||
generate_lwe_secret_keys(&lwe_sk_out_array, output_lwe_dimension, csprng, REPETITIONS);
|
||||
generate_lwe_keyswitch_keys(
|
||||
stream, gpu_index, &d_ksk_array, lwe_sk_in_array, lwe_sk_out_array,
|
||||
input_lwe_dimension, output_lwe_dimension, ks_level, ks_base_log,
|
||||
csprng, lwe_modular_variance, REPETITIONS);
|
||||
generate_lwe_bootstrap_keys(
|
||||
stream, gpu_index, &d_fourier_bsk_array, lwe_sk_out_array,
|
||||
lwe_sk_in_array, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
pbs_level, pbs_base_log, csprng, glwe_modular_variance, REPETITIONS);
|
||||
plaintexts = generate_plaintexts(
|
||||
number_of_bits_of_message_including_padding, delta, number_of_inputs, REPETITIONS, SAMPLES);
|
||||
|
||||
d_lwe_out_ct_array = (uint64_t *)cuda_malloc_async(
|
||||
(output_lwe_dimension + 1) * number_of_bits_to_extract *
|
||||
number_of_inputs * sizeof(uint64_t),
|
||||
stream, gpu_index);
|
||||
|
||||
d_lwe_in_ct_array = (uint64_t *)cuda_malloc_async(
|
||||
(input_lwe_dimension + 1) * number_of_inputs * sizeof(uint64_t), stream,
|
||||
gpu_index);
|
||||
|
||||
lwe_in_ct_array = (uint64_t *)malloc((input_lwe_dimension + 1) *
|
||||
number_of_inputs * sizeof(uint64_t));
|
||||
lwe_out_ct_array = (uint64_t *)malloc((output_lwe_dimension + 1) *
|
||||
number_of_bits_to_extract *
|
||||
number_of_inputs * sizeof(uint64_t));
|
||||
// Execute scratch
|
||||
scratch_cuda_extract_bits_64(stream, gpu_index, &bit_extract_buffer,
|
||||
glwe_dimension, lwe_dimension, polynomial_size,
|
||||
pbs_level, number_of_inputs,
|
||||
cuda_get_max_shared_memory(gpu_index), true);
|
||||
}
|
||||
|
||||
void TearDown() {
|
||||
void *v_stream = (void *)stream;
|
||||
|
||||
cuda_synchronize_stream(v_stream);
|
||||
concrete_cpu_destroy_concrete_csprng(csprng);
|
||||
free(csprng);
|
||||
free(lwe_sk_in_array);
|
||||
free(lwe_sk_out_array);
|
||||
free(plaintexts);
|
||||
free(lwe_in_ct_array);
|
||||
free(lwe_out_ct_array);
|
||||
cleanup_cuda_extract_bits(stream, gpu_index, &bit_extract_buffer);
|
||||
cuda_drop_async(d_fourier_bsk_array, stream, gpu_index);
|
||||
cuda_drop_async(d_ksk_array, stream, gpu_index);
|
||||
cuda_drop_async(d_lwe_in_ct_array, stream, gpu_index);
|
||||
cuda_drop_async(d_lwe_out_ct_array, stream, gpu_index);
|
||||
cuda_destroy_stream(stream, gpu_index);
|
||||
}
|
||||
};
|
||||
|
||||
// For every key set (repetition) and every plaintext batch (sample):
// encrypt the batch on the host, upload it, run the bit extraction on the
// GPU, download the result, and check each decrypted bit against the
// corresponding bit of the plaintext.
TEST_P(BitExtractionTestPrimitives_u64, bit_extraction) {
  void *v_stream = (void *)stream;
  // Dimension of the input ciphertexts (they are encrypted under the
  // GLWE-sized key).
  int input_lwe_dimension = glwe_dimension * polynomial_size;
  // Number of elements of one bootstrap key / one keyswitch key, used to step
  // through the per-repetition key arrays.
  int bsk_size = (glwe_dimension + 1) * (glwe_dimension + 1) * pbs_level *
                 polynomial_size * (lwe_dimension + 1);
  int ksk_size =
      ks_level * (lwe_dimension + 1) * glwe_dimension * polynomial_size;
  for (uint r = 0; r < REPETITIONS; r++) {
    double *d_fourier_bsk = d_fourier_bsk_array + (ptrdiff_t)(bsk_size * r);
    uint64_t *d_ksk = d_ksk_array + (ptrdiff_t)(ksk_size * r);
    uint64_t *lwe_in_sk =
        lwe_sk_in_array + (ptrdiff_t)(input_lwe_dimension * r);
    uint64_t *lwe_sk_out = lwe_sk_out_array + (ptrdiff_t)(r * lwe_dimension);
    for (uint s = 0; s < SAMPLES; s++) {
      for (int i = 0; i < number_of_inputs; i++) {
        uint64_t plaintext = plaintexts[r * SAMPLES * number_of_inputs +
                                        s * number_of_inputs + i];
        // lwe_in_ct_array holds exactly one batch of number_of_inputs
        // ciphertexts (see SetUp) and is re-used for every (r, s), so the
        // slot only depends on i. Indexing it with r and s would write past
        // the end of the allocation and leave the uploaded region stale.
        uint64_t *lwe_in_ct =
            lwe_in_ct_array + (ptrdiff_t)(i * (input_lwe_dimension + 1));
        concrete_cpu_encrypt_lwe_ciphertext_u64(
            lwe_in_sk, lwe_in_ct, plaintext, input_lwe_dimension,
            lwe_modular_variance, csprng, &CONCRETE_CSPRNG_VTABLE);
      }
      // Make sure previous work on the stream is done before re-using the
      // device input buffer.
      cuda_synchronize_stream(v_stream);
      cuda_memcpy_async_to_gpu(d_lwe_in_ct_array, lwe_in_ct_array,
                               (input_lwe_dimension + 1) * number_of_inputs *
                                   sizeof(uint64_t),
                               stream, gpu_index);

      // Execute bit extract
      cuda_extract_bits_64(
          stream, gpu_index, (void *)d_lwe_out_ct_array,
          (void *)d_lwe_in_ct_array, bit_extract_buffer, (void *)d_ksk,
          (void *)d_fourier_bsk, number_of_bits_to_extract, delta_log,
          input_lwe_dimension, lwe_dimension, glwe_dimension, polynomial_size,
          pbs_base_log, pbs_level, ks_base_log, ks_level, number_of_inputs,
          cuda_get_max_shared_memory(gpu_index));

      // Copy result back, and wait for the copy before reading it on the host
      cuda_synchronize_stream(v_stream);
      cuda_memcpy_async_to_cpu(lwe_out_ct_array, d_lwe_out_ct_array,
                               (lwe_dimension + 1) * number_of_bits_to_extract *
                                   number_of_inputs * sizeof(uint64_t),
                               stream, gpu_index);
      cuda_synchronize_stream(v_stream);

      for (int j = 0; j < number_of_inputs; j++) {
        // Output ciphertexts of input j.
        uint64_t *result_array =
            lwe_out_ct_array +
            (ptrdiff_t)(j * number_of_bits_to_extract * (lwe_dimension + 1));
        uint64_t plaintext = plaintexts[r * SAMPLES * number_of_inputs +
                                        s * number_of_inputs + j];
        for (int i = 0; i < number_of_bits_to_extract; i++) {
          // Bit i of the message is read from slot
          // (number_of_bits_to_extract - 1 - i) of the output.
          uint64_t *result_ct =
              result_array + (ptrdiff_t)((number_of_bits_to_extract - 1 - i) *
                                         (lwe_dimension + 1));
          uint64_t decrypted_message = 0;
          concrete_cpu_decrypt_lwe_ciphertext_u64(
              lwe_sk_out, result_ct, lwe_dimension, &decrypted_message);
          // Round after decryption
          uint64_t decrypted_rounded =
              closest_representable(decrypted_message, 1, 1);
          // Bring back the extracted bit found in the MSB in the LSB
          uint64_t decrypted_extract_bit = decrypted_rounded >> 63;
          uint64_t expected = ((plaintext >> delta_log) >> i) & (uint64_t)(1);
          EXPECT_EQ(decrypted_extract_bit, expected);
        }
      }
    }
  }
}
|
||||
|
||||
// Defines for which parameters set the PBS will be tested.
|
||||
// It executes each test for all pairs on phis X qs (Cartesian product)
|
||||
::testing::internal::ParamGenerator<BitExtractionTestParams>
|
||||
bit_extract_params_u64 = ::testing::Values(
|
||||
// n, k, N, lwe_variance, glwe_variance, pbs_base_log, pbs_level,
|
||||
// ks_base_log, ks_level, number_of_message_bits,
|
||||
// number_of_bits_to_extract
|
||||
(BitExtractionTestParams){585, 1, 1024, 7.52316384526264e-37,
|
||||
7.52316384526264e-37, 10, 2, 4, 7, 5, 5, 1});//,
|
||||
// (BitExtractionTestParams){585, 1, 1024, 7.52316384526264e-37,
|
||||
// 7.52316384526264e-37, 10, 2, 4, 7, 5, 5, 2});
|
||||
|
||||
std::string
|
||||
printParamName(::testing::TestParamInfo<BitExtractionTestParams> p) {
|
||||
BitExtractionTestParams params = p.param;
|
||||
|
||||
return "n_" + std::to_string(params.lwe_dimension) + "_k_" +
|
||||
std::to_string(params.glwe_dimension) + "_N_" +
|
||||
std::to_string(params.polynomial_size) + "_pbs_base_log_" +
|
||||
std::to_string(params.pbs_base_log) + "_pbs_level_" +
|
||||
std::to_string(params.pbs_level) + "_ks_base_log_" +
|
||||
std::to_string(params.ks_base_log) + "_ks_level_" +
|
||||
std::to_string(params.ks_level) + "_number_of_message_bits_" +
|
||||
std::to_string(params.number_of_bits_of_message_including_padding) +
|
||||
"_number_of_bits_to_extract_" +
|
||||
std::to_string(params.number_of_bits_to_extract) +
|
||||
"_number_of_inputs_" + std::to_string(params.number_of_inputs);
|
||||
}
|
||||
|
||||
// Instantiate the bit extraction tests for every entry of
// bit_extract_params_u64, naming each instance with printParamName.
// INSTANTIATE_TEST_SUITE_P replaces the deprecated INSTANTIATE_TEST_CASE_P.
INSTANTIATE_TEST_SUITE_P(BitExtractionInstantiation,
                         BitExtractionTestPrimitives_u64,
                         bit_extract_params_u64, printParamName);
|
||||
313
backends/concrete-cuda/implementation/test/test_bootstrap.cpp
Normal file
313
backends/concrete-cuda/implementation/test/test_bootstrap.cpp
Normal file
@@ -0,0 +1,313 @@
|
||||
#include "../include/bootstrap.h"
|
||||
#include "../include/device.h"
|
||||
#include "concrete-cpu.h"
|
||||
#include "utils.h"
|
||||
#include "gtest/gtest.h"
|
||||
#include <cstdint>
|
||||
#include <functional>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
// Loop bound over the key sets: passed to the key/plaintext generators and
// used as the outer repetition count in the test bodies.
constexpr unsigned REPETITIONS = 5;
// Loop bound over the plaintext batches processed per repetition.
constexpr unsigned SAMPLES = 100;

// One parameter set for the bootstrap tests. The fields are initialized
// positionally in the parameter list, so their order must not change.
struct BootstrapTestParams {
  // Dimensions.
  int lwe_dimension;
  int glwe_dimension;
  int polynomial_size;
  // Noise passed to the encryption / key generation helpers.
  double lwe_modular_variance;
  double glwe_modular_variance;
  // Decomposition parameters of the bootstrap key.
  int pbs_base_log;
  int pbs_level;
  // Message encoding: the payload modulus is message_modulus * carry_modulus.
  int message_modulus;
  int carry_modulus;
  // Number of ciphertexts processed per batch.
  int number_of_inputs;
};
|
||||
|
||||
class BootstrapTestPrimitives_u64
|
||||
: public ::testing::TestWithParam<BootstrapTestParams> {
|
||||
protected:
|
||||
int lwe_dimension;
|
||||
int glwe_dimension;
|
||||
int polynomial_size;
|
||||
double lwe_modular_variance;
|
||||
double glwe_modular_variance;
|
||||
int pbs_base_log;
|
||||
int pbs_level;
|
||||
int message_modulus;
|
||||
int carry_modulus;
|
||||
int payload_modulus;
|
||||
int number_of_inputs;
|
||||
uint64_t delta;
|
||||
Csprng *csprng;
|
||||
cudaStream_t *stream;
|
||||
int gpu_index = 0;
|
||||
uint64_t *lwe_sk_in_array;
|
||||
uint64_t *lwe_sk_out_array;
|
||||
uint64_t *plaintexts;
|
||||
double *d_fourier_bsk_array;
|
||||
uint64_t *d_lut_pbs_identity;
|
||||
uint64_t *d_lut_pbs_indexes;
|
||||
uint64_t *d_lwe_ct_in_array;
|
||||
uint64_t *d_lwe_ct_out_array;
|
||||
|
||||
public:
|
||||
// Test arithmetic functions
|
||||
void SetUp() {
|
||||
stream = cuda_create_stream(0);
|
||||
void *v_stream = (void *)stream;
|
||||
|
||||
// TestParams
|
||||
lwe_dimension = (int)GetParam().lwe_dimension;
|
||||
glwe_dimension = (int)GetParam().glwe_dimension;
|
||||
polynomial_size = (int)GetParam().polynomial_size;
|
||||
lwe_modular_variance = (int)GetParam().lwe_modular_variance;
|
||||
glwe_modular_variance = (int)GetParam().glwe_modular_variance;
|
||||
pbs_base_log = (int)GetParam().pbs_base_log;
|
||||
pbs_level = (int)GetParam().pbs_level;
|
||||
message_modulus = (int)GetParam().message_modulus;
|
||||
carry_modulus = (int)GetParam().carry_modulus;
|
||||
number_of_inputs = (int)GetParam().number_of_inputs;
|
||||
|
||||
payload_modulus = message_modulus * carry_modulus;
|
||||
// Value of the shift we multiply our messages by
|
||||
delta = ((uint64_t)(1) << 63) / (uint64_t)(payload_modulus);
|
||||
|
||||
// Create a Csprng
|
||||
csprng =
|
||||
(Csprng *)aligned_alloc(CONCRETE_CSPRNG_ALIGN, CONCRETE_CSPRNG_SIZE);
|
||||
uint8_t seed[16] = {(uint8_t)0};
|
||||
concrete_cpu_construct_concrete_csprng(
|
||||
csprng, Uint128{.little_endian_bytes = {*seed}});
|
||||
|
||||
// Generate the keys
|
||||
generate_lwe_secret_keys(&lwe_sk_in_array, lwe_dimension, csprng, REPETITIONS);
|
||||
generate_lwe_secret_keys(&lwe_sk_out_array,
|
||||
glwe_dimension * polynomial_size, csprng, REPETITIONS);
|
||||
generate_lwe_bootstrap_keys(
|
||||
stream, gpu_index, &d_fourier_bsk_array, lwe_sk_in_array,
|
||||
lwe_sk_out_array, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
pbs_level, pbs_base_log, csprng, glwe_modular_variance, REPETITIONS);
|
||||
plaintexts = generate_plaintexts(payload_modulus, delta, number_of_inputs, REPETITIONS,
|
||||
SAMPLES);
|
||||
|
||||
// Create the LUT
|
||||
uint64_t *lut_pbs_identity = generate_identity_lut_pbs(
|
||||
polynomial_size, glwe_dimension, message_modulus, carry_modulus,
|
||||
[](int x) -> int { return x; });
|
||||
|
||||
// Copy the LUT
|
||||
d_lut_pbs_identity = (uint64_t *)cuda_malloc_async(
|
||||
(glwe_dimension + 1) * polynomial_size * sizeof(uint64_t), stream,
|
||||
gpu_index);
|
||||
d_lut_pbs_indexes = (uint64_t *)cuda_malloc_async(
|
||||
number_of_inputs * sizeof(uint64_t), stream, gpu_index);
|
||||
cuda_synchronize_stream(v_stream);
|
||||
cuda_memset_async(d_lut_pbs_indexes, 0, number_of_inputs * sizeof(uint64_t),
|
||||
stream, gpu_index);
|
||||
cuda_memcpy_async_to_gpu(d_lut_pbs_identity, lut_pbs_identity,
|
||||
polynomial_size * (glwe_dimension + 1) *
|
||||
sizeof(uint64_t),
|
||||
stream, gpu_index);
|
||||
cuda_synchronize_stream(v_stream);
|
||||
free(lut_pbs_identity);
|
||||
|
||||
d_lwe_ct_out_array =
|
||||
(uint64_t *)cuda_malloc_async((glwe_dimension * polynomial_size + 1) *
|
||||
number_of_inputs * sizeof(uint64_t),
|
||||
stream, gpu_index);
|
||||
d_lwe_ct_in_array = (uint64_t *)cuda_malloc_async(
|
||||
(lwe_dimension + 1) * number_of_inputs * REPETITIONS * SAMPLES *
|
||||
sizeof(uint64_t),
|
||||
stream, gpu_index);
|
||||
uint64_t *lwe_ct_in_array =
|
||||
(uint64_t *)malloc((lwe_dimension + 1) * number_of_inputs *
|
||||
REPETITIONS * SAMPLES * sizeof(uint64_t));
|
||||
// Create the input/output ciphertexts
|
||||
for (uint r = 0; r < REPETITIONS; r++) {
|
||||
uint64_t *lwe_sk_in = lwe_sk_in_array + (ptrdiff_t)(r * lwe_dimension);
|
||||
for (uint s = 0; s < SAMPLES; s++) {
|
||||
for (int i = 0; i < number_of_inputs; i++) {
|
||||
uint64_t plaintext = plaintexts[r * SAMPLES * number_of_inputs +
|
||||
s * number_of_inputs + i];
|
||||
uint64_t *lwe_ct_in =
|
||||
lwe_ct_in_array + (ptrdiff_t)((r * SAMPLES * number_of_inputs +
|
||||
s * number_of_inputs + i) *
|
||||
(lwe_dimension + 1));
|
||||
concrete_cpu_encrypt_lwe_ciphertext_u64(
|
||||
lwe_sk_in, lwe_ct_in, plaintext, lwe_dimension,
|
||||
lwe_modular_variance, csprng, &CONCRETE_CSPRNG_VTABLE);
|
||||
}
|
||||
}
|
||||
}
|
||||
cuda_synchronize_stream(v_stream);
|
||||
cuda_memcpy_async_to_gpu(d_lwe_ct_in_array, lwe_ct_in_array,
|
||||
REPETITIONS * SAMPLES * number_of_inputs *
|
||||
(lwe_dimension + 1) * sizeof(uint64_t),
|
||||
stream, gpu_index);
|
||||
free(lwe_ct_in_array);
|
||||
}
|
||||
|
||||
void TearDown() {
|
||||
concrete_cpu_destroy_concrete_csprng(csprng);
|
||||
free(csprng);
|
||||
free(lwe_sk_in_array);
|
||||
free(lwe_sk_out_array);
|
||||
free(plaintexts);
|
||||
cuda_drop_async(d_fourier_bsk_array, stream, gpu_index);
|
||||
cuda_drop_async(d_lut_pbs_identity, stream, gpu_index);
|
||||
cuda_drop_async(d_lut_pbs_indexes, stream, gpu_index);
|
||||
cuda_drop_async(d_lwe_ct_in_array, stream, gpu_index);
|
||||
cuda_drop_async(d_lwe_ct_out_array, stream, gpu_index);
|
||||
cuda_destroy_stream(stream, gpu_index);
|
||||
}
|
||||
};
|
||||
|
||||
// Runs the amortized PBS with the identity LUT on every (repetition, sample)
// batch prepared by the fixture, decrypts the outputs on the host and checks
// that each decoded message equals the encoded plaintext message.
TEST_P(BootstrapTestPrimitives_u64, amortized_bootstrap) {
  void *v_stream = (void *)stream;
  // Host buffer receiving one batch of output ciphertexts.
  uint64_t *lwe_ct_out_array =
      (uint64_t *)malloc((glwe_dimension * polynomial_size + 1) *
                         number_of_inputs * sizeof(uint64_t));
  int8_t *pbs_buffer = nullptr;
  scratch_cuda_bootstrap_amortized_64(
      stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
      number_of_inputs, cuda_get_max_shared_memory(gpu_index), true);
  // Number of elements of one bootstrap key, used to step through the
  // per-repetition key array.
  int bsk_size = (glwe_dimension + 1) * (glwe_dimension + 1) * pbs_level *
                 polynomial_size * (lwe_dimension + 1);
  // Here execute the PBS
  for (uint r = 0; r < REPETITIONS; r++) {
    double *d_fourier_bsk = d_fourier_bsk_array + (ptrdiff_t)(bsk_size * r);
    uint64_t *lwe_sk_out =
        lwe_sk_out_array + (ptrdiff_t)(r * glwe_dimension * polynomial_size);
    for (uint s = 0; s < SAMPLES; s++) {
      // First input ciphertext of batch (r, s) on the device.
      uint64_t *d_lwe_ct_in =
          d_lwe_ct_in_array +
          (ptrdiff_t)((r * SAMPLES * number_of_inputs + s * number_of_inputs) *
                      (lwe_dimension + 1));
      // Execute PBS
      cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
          stream, gpu_index, (void *)d_lwe_ct_out_array,
          (void *)d_lut_pbs_identity, (void *)d_lut_pbs_indexes,
          (void *)d_lwe_ct_in, (void *)d_fourier_bsk, pbs_buffer, lwe_dimension,
          glwe_dimension, polynomial_size, pbs_base_log, pbs_level,
          number_of_inputs, 1, 0, cuda_get_max_shared_memory(gpu_index));
      // Copy result back
      cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array,
                               (glwe_dimension * polynomial_size + 1) *
                                   number_of_inputs * sizeof(uint64_t),
                               stream, gpu_index);
      // The copy is asynchronous: wait for it before reading the host buffer.
      cuda_synchronize_stream(v_stream);

      for (int j = 0; j < number_of_inputs; j++) {
        uint64_t *result =
            lwe_ct_out_array +
            (ptrdiff_t)(j * (glwe_dimension * polynomial_size + 1));
        uint64_t plaintext = plaintexts[r * SAMPLES * number_of_inputs +
                                        s * number_of_inputs + j];
        uint64_t decrypted = 0;
        concrete_cpu_decrypt_lwe_ciphertext_u64(
            lwe_sk_out, result, glwe_dimension * polynomial_size, &decrypted);
        // The raw decryption is expected to differ from the exact plaintext
        // (it still carries noise).
        EXPECT_NE(decrypted, plaintext);
        // let err = (decrypted >= plaintext) ? decrypted - plaintext :
        // plaintext
        // - decrypted;
        // error_sample_vec.push(err);

        // The bit before the message
        uint64_t rounding_bit = delta >> 1;
        // Compute the rounding bit
        uint64_t rounding = (decrypted & rounding_bit) << 1;
        uint64_t decoded = (decrypted + rounding) / delta;
        EXPECT_EQ(decoded, plaintext / delta);
      }
    }
  }
  cleanup_cuda_bootstrap_amortized(stream, gpu_index, &pbs_buffer);
  free(lwe_ct_out_array);
}
|
||||
|
||||
// Runs the low latency PBS with the identity LUT on every (repetition,
// sample) batch prepared by the fixture, decrypts the outputs on the host and
// checks that each decoded message equals the encoded plaintext message.
TEST_P(BootstrapTestPrimitives_u64, low_latency_bootstrap) {
  void *v_stream = (void *)stream;
  // Host buffer receiving one batch of output ciphertexts.
  uint64_t *lwe_ct_out_array =
      (uint64_t *)malloc((glwe_dimension * polynomial_size + 1) *
                         number_of_inputs * sizeof(uint64_t));
  int8_t *pbs_buffer = nullptr;
  scratch_cuda_bootstrap_low_latency_64(
      stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
      pbs_level, number_of_inputs, cuda_get_max_shared_memory(gpu_index), true);
  // Number of elements of one bootstrap key, used to step through the
  // per-repetition key array.
  int bsk_size = (glwe_dimension + 1) * (glwe_dimension + 1) * pbs_level *
                 polynomial_size * (lwe_dimension + 1);
  // Here execute the PBS
  for (uint r = 0; r < REPETITIONS; r++) {
    double *d_fourier_bsk = d_fourier_bsk_array + (ptrdiff_t)(bsk_size * r);
    uint64_t *lwe_sk_out =
        lwe_sk_out_array + (ptrdiff_t)(r * glwe_dimension * polynomial_size);
    for (uint s = 0; s < SAMPLES; s++) {
      // First input ciphertext of batch (r, s) on the device.
      uint64_t *d_lwe_ct_in =
          d_lwe_ct_in_array +
          (ptrdiff_t)((r * SAMPLES * number_of_inputs + s * number_of_inputs) *
                      (lwe_dimension + 1));
      // Execute PBS
      cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
          stream, gpu_index, (void *)d_lwe_ct_out_array,
          (void *)d_lut_pbs_identity, (void *)d_lut_pbs_indexes,
          (void *)d_lwe_ct_in, (void *)d_fourier_bsk, pbs_buffer, lwe_dimension,
          glwe_dimension, polynomial_size, pbs_base_log, pbs_level,
          number_of_inputs, 1, 0, cuda_get_max_shared_memory(gpu_index));
      // Copy result back
      cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array,
                               (glwe_dimension * polynomial_size + 1) *
                                   number_of_inputs * sizeof(uint64_t),
                               stream, gpu_index);
      // The copy is asynchronous: wait for it before reading the host buffer.
      cuda_synchronize_stream(v_stream);

      for (int j = 0; j < number_of_inputs; j++) {
        uint64_t *result =
            lwe_ct_out_array +
            (ptrdiff_t)(j * (glwe_dimension * polynomial_size + 1));
        uint64_t plaintext = plaintexts[r * SAMPLES * number_of_inputs +
                                        s * number_of_inputs + j];
        uint64_t decrypted = 0;
        concrete_cpu_decrypt_lwe_ciphertext_u64(
            lwe_sk_out, result, glwe_dimension * polynomial_size, &decrypted);
        // The raw decryption is expected to differ from the exact plaintext
        // (it still carries noise).
        EXPECT_NE(decrypted, plaintext);
        // let err = (decrypted >= plaintext) ? decrypted - plaintext :
        // plaintext
        // - decrypted;
        // error_sample_vec.push(err);

        // The bit before the message
        uint64_t rounding_bit = delta >> 1;
        // Compute the rounding bit
        uint64_t rounding = (decrypted & rounding_bit) << 1;
        uint64_t decoded = (decrypted + rounding) / delta;
        EXPECT_EQ(decoded, plaintext / delta);
      }
    }
  }
  cleanup_cuda_bootstrap_low_latency(stream, gpu_index, &pbs_buffer);
  free(lwe_ct_out_array);
}
|
||||
|
||||
// Defines for which parameters set the PBS will be tested.
|
||||
// It executes each test for all pairs on phis X qs (Cartesian product)
|
||||
::testing::internal::ParamGenerator<BootstrapTestParams> pbs_params_u64 =
|
||||
::testing::Values(
|
||||
// n, k, N, lwe_variance, glwe_variance, pbs_base_log, pbs_level,
|
||||
// message_modulus, carry_modulus
|
||||
(BootstrapTestParams){500, 1, 1024, 0.000007069849454709433,
|
||||
0.00000000000000029403601535432533, 23, 2, 4, 4,
|
||||
1},
|
||||
(BootstrapTestParams){500, 1, 1024, 0.000007069849454709433,
|
||||
0.00000000000000029403601535432533, 23, 2, 4, 4,
|
||||
3});
|
||||
|
||||
// Builds the test-name suffix for one bootstrap parameter set by
// concatenating "<label>_<value>" pairs for a subset of its integer fields.
std::string printParamName(::testing::TestParamInfo<BootstrapTestParams> p) {
  const BootstrapTestParams &cfg = p.param;

  std::string name = "n_";
  name += std::to_string(cfg.lwe_dimension);
  name += "_k_";
  name += std::to_string(cfg.glwe_dimension);
  name += "_N_";
  name += std::to_string(cfg.polynomial_size);
  name += "_pbs_base_log_";
  name += std::to_string(cfg.pbs_base_log);
  name += "_pbs_level_";
  name += std::to_string(cfg.pbs_level);
  name += "_number_of_inputs_";
  name += std::to_string(cfg.number_of_inputs);
  return name;
}
|
||||
|
||||
// Instantiate the bootstrap tests for every entry of pbs_params_u64, naming
// each instance with printParamName.
// INSTANTIATE_TEST_SUITE_P replaces the deprecated INSTANTIATE_TEST_CASE_P.
INSTANTIATE_TEST_SUITE_P(BootstrapInstantiation, BootstrapTestPrimitives_u64,
                         pbs_params_u64, printParamName);
|
||||
@@ -0,0 +1,271 @@
|
||||
#include "../include/circuit_bootstrap.h"
|
||||
#include "../include/device.h"
|
||||
#include "concrete-cpu.h"
|
||||
#include "utils.h"
|
||||
#include "gtest/gtest.h"
|
||||
#include <cstdint>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
// Repetition and sample counts for the circuit bootstrap tests.
// NOTE(review): their use is outside the visible part of this file;
// presumably the same role as in the other test files - confirm.
constexpr unsigned REPETITIONS = 5;
constexpr unsigned SAMPLES = 100;

// One parameter set for the circuit bootstrap test. Keep the field order
// unchanged in case the struct is initialized positionally.
struct CircuitBootstrapTestParams {
  // Dimensions.
  int lwe_dimension;
  int glwe_dimension;
  int polynomial_size;
  // Noise parameters.
  double lwe_modular_variance;
  double glwe_modular_variance;
  // Decomposition parameters (base log / level) for the pbs, pksk and cbs
  // stages respectively.
  int pbs_base_log;
  int pbs_level;
  int pksk_base_log;
  int pksk_level;
  int cbs_base_log;
  int cbs_level;
};
|
||||
|
||||
class CircuitBootstrapTestPrimitives_u64
|
||||
: public ::testing::TestWithParam<CircuitBootstrapTestParams> {
|
||||
protected:
|
||||
int lwe_dimension;
|
||||
int glwe_dimension;
|
||||
int polynomial_size;
|
||||
double lwe_modular_variance;
|
||||
double glwe_modular_variance;
|
||||
int pbs_base_log;
|
||||
int pbs_level;
|
||||
int pksk_base_log;
|
||||
int pksk_level;
|
||||
int cbs_base_log;
|
||||
int cbs_level;
|
||||
int number_of_bits_of_message_including_padding;
|
||||
int ggsw_size;
|
||||
uint64_t delta;
|
||||
int delta_log;
|
||||
Csprng *csprng;
|
||||
cudaStream_t *stream;
|
||||
int gpu_index = 0;
|
||||
uint64_t *lwe_sk_in_array;
|
||||
uint64_t *lwe_sk_out_array;
|
||||
uint64_t *lwe_in_ct;
|
||||
uint64_t *ggsw_out_ct;
|
||||
uint64_t *plaintexts;
|
||||
double *d_fourier_bsk_array;
|
||||
uint64_t *d_pksk_array;
|
||||
uint64_t *d_lwe_in_ct;
|
||||
uint64_t *d_ggsw_out_ct;
|
||||
uint64_t *d_lut_vector_indexes;
|
||||
int8_t *cbs_buffer;
|
||||
|
||||
public:
|
||||
// Test arithmetic functions
|
||||
void SetUp() {
|
||||
stream = cuda_create_stream(0);
|
||||
|
||||
// TestParams
|
||||
lwe_dimension = (int)GetParam().lwe_dimension;
|
||||
glwe_dimension = (int)GetParam().glwe_dimension;
|
||||
polynomial_size = (int)GetParam().polynomial_size;
|
||||
lwe_modular_variance = (double)GetParam().lwe_modular_variance;
|
||||
glwe_modular_variance = (double)GetParam().glwe_modular_variance;
|
||||
pbs_base_log = (int)GetParam().pbs_base_log;
|
||||
pbs_level = (int)GetParam().pbs_level;
|
||||
pksk_base_log = (int)GetParam().pksk_base_log;
|
||||
pksk_level = (int)GetParam().pksk_level;
|
||||
cbs_base_log = (int)GetParam().cbs_base_log;
|
||||
cbs_level = (int)GetParam().cbs_level;
|
||||
// We generate binary messages
|
||||
number_of_bits_of_message_including_padding = 2;
|
||||
delta_log = 60;
|
||||
delta = (uint64_t)(1) << delta_log;
|
||||
ggsw_size = cbs_level * (glwe_dimension + 1) * (glwe_dimension + 1) *
|
||||
polynomial_size;
|
||||
|
||||
// Create a Csprng
|
||||
csprng =
|
||||
(Csprng *)aligned_alloc(CONCRETE_CSPRNG_ALIGN, CONCRETE_CSPRNG_SIZE);
|
||||
uint8_t seed[16] = {(uint8_t)0};
|
||||
concrete_cpu_construct_concrete_csprng(
|
||||
csprng, Uint128{.little_endian_bytes = {*seed}});
|
||||
|
||||
// Generate the keys
|
||||
generate_lwe_secret_keys(&lwe_sk_in_array, lwe_dimension, csprng, REPETITIONS);
|
||||
generate_lwe_secret_keys(&lwe_sk_out_array,
|
||||
glwe_dimension * polynomial_size, csprng, REPETITIONS);
|
||||
generate_lwe_bootstrap_keys(
|
||||
stream, gpu_index, &d_fourier_bsk_array, lwe_sk_in_array,
|
||||
lwe_sk_out_array, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
pbs_level, pbs_base_log, csprng, glwe_modular_variance, REPETITIONS);
|
||||
generate_lwe_private_functional_keyswitch_key_lists(
|
||||
stream, gpu_index, &d_pksk_array, lwe_sk_out_array, lwe_sk_out_array,
|
||||
glwe_dimension * polynomial_size, glwe_dimension, polynomial_size,
|
||||
pksk_level, pksk_base_log, csprng, lwe_modular_variance, REPETITIONS);
|
||||
plaintexts = generate_plaintexts(
|
||||
number_of_bits_of_message_including_padding, delta, 1, REPETITIONS, SAMPLES);
|
||||
|
||||
d_ggsw_out_ct = (uint64_t *)cuda_malloc_async(ggsw_size * sizeof(uint64_t),
|
||||
stream, gpu_index);
|
||||
|
||||
d_lwe_in_ct = (uint64_t *)cuda_malloc_async(
|
||||
(lwe_dimension + 1) * sizeof(uint64_t), stream, gpu_index);
|
||||
|
||||
lwe_in_ct = (uint64_t *)malloc((lwe_dimension + 1) * sizeof(uint64_t));
|
||||
ggsw_out_ct = (uint64_t *)malloc(ggsw_size * sizeof(uint64_t));
|
||||
// Execute cbs scratch
|
||||
scratch_cuda_circuit_bootstrap_64(
|
||||
stream, gpu_index, &cbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, cbs_level, 1, cuda_get_max_shared_memory(gpu_index),
|
||||
true);
|
||||
// Build LUT vector indexes
|
||||
uint64_t *h_lut_vector_indexes =
|
||||
(uint64_t *)malloc(cbs_level * sizeof(uint64_t));
|
||||
for (int index = 0; index < cbs_level; index++) {
|
||||
h_lut_vector_indexes[index] = 0; // index % cbs_level;
|
||||
}
|
||||
d_lut_vector_indexes = (uint64_t *)cuda_malloc_async(
|
||||
cbs_level * sizeof(uint64_t), stream, gpu_index);
|
||||
cuda_memcpy_async_to_gpu(d_lut_vector_indexes, h_lut_vector_indexes,
|
||||
cbs_level * sizeof(uint64_t), stream, gpu_index);
|
||||
free(h_lut_vector_indexes);
|
||||
}
|
||||
|
||||
void TearDown() {
|
||||
void *v_stream = (void *)stream;
|
||||
|
||||
cuda_synchronize_stream(v_stream);
|
||||
concrete_cpu_destroy_concrete_csprng(csprng);
|
||||
free(csprng);
|
||||
free(lwe_sk_in_array);
|
||||
free(lwe_sk_out_array);
|
||||
free(plaintexts);
|
||||
free(lwe_in_ct);
|
||||
free(ggsw_out_ct);
|
||||
cleanup_cuda_circuit_bootstrap(stream, gpu_index, &cbs_buffer);
|
||||
cuda_drop_async(d_fourier_bsk_array, stream, gpu_index);
|
||||
cuda_drop_async(d_pksk_array, stream, gpu_index);
|
||||
cuda_drop_async(d_lwe_in_ct, stream, gpu_index);
|
||||
cuda_drop_async(d_ggsw_out_ct, stream, gpu_index);
|
||||
cuda_drop_async(d_lut_vector_indexes, stream, gpu_index);
|
||||
cuda_destroy_stream(stream, gpu_index);
|
||||
}
|
||||
};
|
||||
|
||||
// Encrypts each plaintext on the CPU, runs the circuit bootstrap on the GPU,
// then decrypts every GLWE of the resulting GGSW on the CPU and compares it
// with the value expected from the output secret key and the message bit.
TEST_P(CircuitBootstrapTestPrimitives_u64, circuit_bootstrap) {
  void *v_stream = (void *)stream;

  // Element counts are computed in size_t so that per-repetition offsets
  // cannot wrap in 32-bit arithmetic for large parameter sets.
  const size_t glwe_size = (size_t)(glwe_dimension + 1) * polynomial_size;
  // One GGSW level holds (glwe_dimension + 1) GLWE ciphertexts.
  const size_t ggsw_level_size = glwe_size * (glwe_dimension + 1);
  const size_t lwe_sk_out_size = (size_t)glwe_dimension * polynomial_size;
  const size_t bsk_size = ggsw_level_size * pbs_level * (lwe_dimension + 1);
  const size_t pksk_list_size =
      ggsw_level_size * pksk_level * (lwe_sk_out_size + 1);

  // Scratch space for the decrypted polynomials, reused by every sample.
  uint64_t *decrypted =
      (uint64_t *)malloc(glwe_size * cbs_level * sizeof(uint64_t));

  for (unsigned r = 0; r < REPETITIONS; r++) {
    // Key material belonging to repetition r.
    double *d_fourier_bsk = d_fourier_bsk_array + bsk_size * r;
    uint64_t *d_pksk_list = d_pksk_array + pksk_list_size * r;
    uint64_t *lwe_in_sk = lwe_sk_in_array + (size_t)lwe_dimension * r;
    uint64_t *lwe_sk_out = lwe_sk_out_array + lwe_sk_out_size * r;

    for (unsigned s = 0; s < SAMPLES; s++) {
      uint64_t plaintext = plaintexts[r * SAMPLES + s];

      concrete_cpu_encrypt_lwe_ciphertext_u64(
          lwe_in_sk, lwe_in_ct, plaintext, lwe_dimension, lwe_modular_variance,
          csprng, &CONCRETE_CSPRNG_VTABLE);
      cuda_synchronize_stream(v_stream);
      cuda_memcpy_async_to_gpu(d_lwe_in_ct, lwe_in_ct,
                               (lwe_dimension + 1) * sizeof(uint64_t), stream,
                               gpu_index);

      // Execute circuit bootstrap
      cuda_circuit_bootstrap_64(
          stream, gpu_index, (void *)d_ggsw_out_ct, (void *)d_lwe_in_ct,
          (void *)d_fourier_bsk, (void *)d_pksk_list,
          (void *)d_lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
          glwe_dimension, lwe_dimension, pbs_level, pbs_base_log, pksk_level,
          pksk_base_log, cbs_level, cbs_base_log, 1,
          cuda_get_max_shared_memory(gpu_index));

      // Copy result back and wait for it before reading it on the host.
      cuda_memcpy_async_to_cpu(ggsw_out_ct, d_ggsw_out_ct,
                               ggsw_size * sizeof(uint64_t), stream, gpu_index);
      cuda_synchronize_stream(v_stream);

      // Two's-complement negation of the encoded message.
      uint64_t multiplying_factor = -(plaintext >> delta_log);
      for (int l = 1; l < cbs_level + 1; l++) {
        const int shift = 64 - cbs_base_log * l;
        for (int j = 0; j < glwe_dimension; j++) {
          uint64_t *res =
              decrypted + (size_t)(l - 1) * glwe_size +
              (size_t)j * polynomial_size;
          uint64_t *glwe_ct_out = ggsw_out_ct +
                                  (size_t)(l - 1) * ggsw_level_size +
                                  (size_t)j * glwe_size;
          concrete_cpu_decrypt_glwe_ciphertext_u64(
              lwe_sk_out, res, glwe_ct_out, glwe_dimension, polynomial_size);

          for (int k = 0; k < polynomial_size; k++) {
            uint64_t expected_decryption =
                lwe_sk_out[j * polynomial_size + k] * multiplying_factor;
            expected_decryption >>= shift;
            uint64_t decoded_plaintext =
                closest_representable(res[k], l, cbs_base_log) >> shift;
            EXPECT_EQ(expected_decryption, decoded_plaintext);
          }
        }
      }
      // Check last glwe on last level
      uint64_t *res = decrypted + (size_t)(cbs_level - 1) * glwe_size +
                      (size_t)glwe_dimension * polynomial_size;
      uint64_t *glwe_ct_out = ggsw_out_ct +
                              (size_t)(cbs_level - 1) * ggsw_level_size +
                              (size_t)glwe_dimension * glwe_size;
      concrete_cpu_decrypt_glwe_ciphertext_u64(lwe_sk_out, res, glwe_ct_out,
                                               glwe_dimension, polynomial_size);

      for (int k = 0; k < polynomial_size; k++) {
        // Only the constant coefficient is expected to carry the message.
        uint64_t expected_decryption = (k == 0) ? plaintext / delta : 0;
        uint64_t decoded_plaintext =
            closest_representable(res[k], cbs_level, cbs_base_log) >>
            (64 - cbs_base_log * cbs_level);
        EXPECT_EQ(expected_decryption, decoded_plaintext);
      }
    }
  }
  free(decrypted);
}
|
||||
|
||||
// Defines for which parameters set the PBS will be tested.
|
||||
// It executes each test for all pairs on phis X qs (Cartesian product)
|
||||
::testing::internal::ParamGenerator<CircuitBootstrapTestParams> cbs_params_u64 =
|
||||
::testing::Values(
|
||||
// n, k, N, lwe_variance, glwe_variance, pbs_base_log, pbs_level,
|
||||
// pksk_base_log, pksk_level, cbs_base_log, cbs_level
|
||||
(CircuitBootstrapTestParams){10, 2, 512, 7.52316384526264e-37,
|
||||
7.52316384526264e-37, 11, 2, 15, 2, 10,
|
||||
1});
|
||||
|
||||
std::string
|
||||
printParamName(::testing::TestParamInfo<CircuitBootstrapTestParams> p) {
|
||||
CircuitBootstrapTestParams params = p.param;
|
||||
|
||||
return "n_" + std::to_string(params.lwe_dimension) + "_k_" +
|
||||
std::to_string(params.glwe_dimension) + "_N_" +
|
||||
std::to_string(params.polynomial_size) + "_pbs_base_log_" +
|
||||
std::to_string(params.pbs_base_log) + "_pbs_level_" +
|
||||
std::to_string(params.pbs_level) + "_pksk_base_log_" +
|
||||
std::to_string(params.pksk_base_log) + "_pksk_level_" +
|
||||
std::to_string(params.pksk_level) + "_cbs_base_log_" +
|
||||
std::to_string(params.cbs_base_log) + "_cbs_level_" +
|
||||
std::to_string(params.cbs_level);
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(CircuitBootstrapInstantiation,
|
||||
CircuitBootstrapTestPrimitives_u64, cbs_params_u64,
|
||||
printParamName);
|
||||
202
backends/concrete-cuda/implementation/test/test_cmux_tree.cpp
Normal file
202
backends/concrete-cuda/implementation/test/test_cmux_tree.cpp
Normal file
@@ -0,0 +1,202 @@
|
||||
#include "../include/device.h"
|
||||
#include "../include/vertical_packing.h"
|
||||
#include "concrete-cpu.h"
|
||||
#include "utils.h"
|
||||
#include "gtest/gtest.h"
|
||||
#include <cstdint>
|
||||
#include <functional>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
// Number of independent key sets generated for each parameter set.
constexpr unsigned REPETITIONS = 5;
// Number of plaintexts exercised with each key set.
constexpr unsigned SAMPLES = 100;

// One parameter set for the CMUX tree test. Field order matches the
// positional initializers used when the test suite is instantiated.
struct CMUXTreeTestParams {
  int glwe_dimension;           // k
  int polynomial_size;          // N
  int r;                        // number of GGSW selector bits; 2^r LUTs
  int tau;                      // number of trees evaluated
  double glwe_modular_variance; // noise variance passed to GGSW encryption
  int base_log;
  int level_count;
  int delta_log;                // messages are scaled by 2^delta_log
};
|
||||
|
||||
class CMUXTreeTestPrimitives_u64
|
||||
: public ::testing::TestWithParam<CMUXTreeTestParams> {
|
||||
protected:
|
||||
int glwe_dimension;
|
||||
int polynomial_size;
|
||||
int r_lut;
|
||||
int tau;
|
||||
double glwe_modular_variance;
|
||||
int base_log;
|
||||
int level_count;
|
||||
uint64_t delta;
|
||||
int delta_log;
|
||||
Csprng *csprng;
|
||||
uint64_t *plaintexts;
|
||||
cudaStream_t *stream;
|
||||
int gpu_index = 0;
|
||||
uint64_t *glwe_sk;
|
||||
uint64_t *d_lut_identity;
|
||||
|
||||
public:
|
||||
// Test arithmetic functions
|
||||
void SetUp() {
|
||||
stream = cuda_create_stream(0);
|
||||
void *v_stream = (void *)stream;
|
||||
|
||||
// TestParams
|
||||
glwe_dimension = (int)GetParam().glwe_dimension;
|
||||
polynomial_size = (int)GetParam().polynomial_size;
|
||||
r_lut = (int)GetParam().r;
|
||||
tau = (int)GetParam().tau;
|
||||
glwe_modular_variance = (int)GetParam().glwe_modular_variance;
|
||||
base_log = (int)GetParam().base_log;
|
||||
level_count = (int)GetParam().level_count;
|
||||
delta_log = (int)GetParam().delta_log;
|
||||
|
||||
// Value of the shift we multiply our messages by
|
||||
delta = ((uint64_t)(1) << delta_log);
|
||||
|
||||
// Create a Csprng
|
||||
csprng =
|
||||
(Csprng *)aligned_alloc(CONCRETE_CSPRNG_ALIGN, CONCRETE_CSPRNG_SIZE);
|
||||
uint8_t seed[16] = {(uint8_t)0};
|
||||
concrete_cpu_construct_concrete_csprng(
|
||||
csprng, Uint128{.little_endian_bytes = {*seed}});
|
||||
|
||||
// Generate the keys
|
||||
generate_glwe_secret_keys(&glwe_sk, glwe_dimension, polynomial_size,
|
||||
csprng, REPETITIONS);
|
||||
plaintexts = generate_plaintexts(r_lut, 1, 1, REPETITIONS, SAMPLES);
|
||||
|
||||
// Create the LUT
|
||||
int num_lut = (1 << r_lut);
|
||||
d_lut_identity = (uint64_t *)cuda_malloc_async(
|
||||
polynomial_size * num_lut * tau * sizeof(uint64_t), stream, gpu_index);
|
||||
uint64_t *lut_cmux_tree_identity = generate_identity_lut_cmux_tree(
|
||||
polynomial_size, num_lut, tau, delta_log);
|
||||
|
||||
// Copy all LUTs
|
||||
cuda_memcpy_async_to_gpu(d_lut_identity, lut_cmux_tree_identity,
|
||||
polynomial_size * num_lut * tau * sizeof(uint64_t),
|
||||
stream, gpu_index);
|
||||
|
||||
cuda_synchronize_stream(v_stream);
|
||||
free(lut_cmux_tree_identity);
|
||||
}
|
||||
|
||||
void TearDown() {
|
||||
cuda_synchronize_stream(stream);
|
||||
concrete_cpu_destroy_concrete_csprng(csprng);
|
||||
free(plaintexts);
|
||||
free(csprng);
|
||||
cuda_drop_async(d_lut_identity, stream, gpu_index);
|
||||
cuda_destroy_stream(stream, gpu_index);
|
||||
}
|
||||
};
|
||||
|
||||
// For every plaintext, encrypts its bit decomposition as GGSW ciphertexts on
// the CPU, evaluates the CMUX tree on the GPU against the identity LUT, and
// checks that each of the tau decrypted outputs decodes to the expected value.
TEST_P(CMUXTreeTestPrimitives_u64, cmux_tree) {
  // Element counts are kept in size_t so offsets and byte sizes cannot wrap
  // in 32-bit arithmetic.
  const size_t glwe_size = (size_t)(glwe_dimension + 1) * polynomial_size;
  const size_t ggsw_size = glwe_size * (glwe_dimension + 1) * level_count;
  const size_t results_bytes = (size_t)tau * glwe_size * sizeof(uint64_t);

  uint64_t *d_ggsw_bit_array = (uint64_t *)cuda_malloc_async(
      (size_t)r_lut * ggsw_size * sizeof(uint64_t), stream, gpu_index);
  uint64_t *d_results =
      (uint64_t *)cuda_malloc_async(results_bytes, stream, gpu_index);
  uint64_t *results = (uint64_t *)malloc(results_bytes);
  uint64_t *ggsw = (uint64_t *)malloc(ggsw_size * sizeof(uint64_t));
  // Scratch space for one decrypted polynomial, reused by every tree.
  uint64_t *decrypted = (uint64_t *)malloc(polynomial_size * sizeof(uint64_t));

  int8_t *cmux_tree_buffer = nullptr;
  scratch_cuda_cmux_tree_64(stream, gpu_index, &cmux_tree_buffer,
                            glwe_dimension, polynomial_size, level_count, r_lut,
                            tau, cuda_get_max_shared_memory(gpu_index), true);

  // The bit just below the message, used to round to the nearest multiple
  // of delta when decoding.
  const uint64_t rounding_bit = delta >> 1;
  // Number of distinct messages representable above delta_log.
  const uint64_t message_space = (uint64_t)1 << (64 - delta_log);

  for (unsigned r = 0; r < REPETITIONS; r++) {
    for (unsigned s = 0; s < SAMPLES; s++) {
      uint64_t witness = plaintexts[r * SAMPLES + s];

      // Instantiate the GGSW m^tree ciphertexts
      // We need r GGSW ciphertexts
      // Bit decomposition of the value from MSB to LSB
      uint64_t *bit_array = bit_decompose_value(witness, r_lut);

      for (int i = 0; i < r_lut; i++) {
        uint64_t *d_ggsw_slice = d_ggsw_bit_array + (size_t)i * ggsw_size;
        concrete_cpu_encrypt_ggsw_ciphertext_u64(
            glwe_sk, ggsw, bit_array[i], glwe_dimension, polynomial_size,
            level_count, base_log, glwe_modular_variance, csprng,
            &CONCRETE_CSPRNG_VTABLE);
        cuda_memcpy_async_to_gpu(d_ggsw_slice, ggsw,
                                 ggsw_size * sizeof(uint64_t), stream,
                                 gpu_index);
      }
      cuda_synchronize_stream(stream);

      // Execute the CMUX tree
      cuda_cmux_tree_64(stream, gpu_index, (void *)d_results,
                        (void *)d_ggsw_bit_array, (void *)d_lut_identity,
                        cmux_tree_buffer, glwe_dimension, polynomial_size,
                        base_log, level_count, r_lut, tau,
                        cuda_get_max_shared_memory(gpu_index));

      // Copy result back and wait for it before reading it on the host.
      cuda_memcpy_async_to_cpu(results, d_results, results_bytes, stream,
                               gpu_index);
      cuda_synchronize_stream(stream);
      for (int tree = 0; tree < tau; tree++) {
        uint64_t *result = results + (size_t)tree * glwe_size;
        concrete_cpu_decrypt_glwe_ciphertext_u64(
            glwe_sk, decrypted, result, glwe_dimension, polynomial_size);
        // Compute the rounding bit
        uint64_t rounding = (decrypted[0] & rounding_bit) << 1;
        uint64_t decoded = (decrypted[0] + rounding) / delta;
        EXPECT_EQ(decoded, (witness + tree) % message_space);
      }
      free(bit_array);
    }
  }
  cuda_synchronize_stream(stream);
  cleanup_cuda_cmux_tree(stream, gpu_index, &cmux_tree_buffer);
  free(decrypted);
  free(ggsw);
  // The result buffers were previously never released.
  free(results);
  cuda_drop_async(d_results, stream, gpu_index);
  cuda_drop_async(d_ggsw_bit_array, stream, gpu_index);
}
|
||||
|
||||
int glwe_dimension;
|
||||
int polynomial_size;
|
||||
double glwe_modular_variance;
|
||||
int base_log;
|
||||
int level_count;
|
||||
int message_modulus;
|
||||
int carry_modulus;
|
||||
|
||||
// Defines for which parameters set the PBS will be tested.
|
||||
// It executes each test for all pairs on phis X qs (Cartesian product)
|
||||
::testing::internal::ParamGenerator<CMUXTreeTestParams> cmux_tree_params_u64 =
|
||||
::testing::Values(
|
||||
// k, N, r, tau, glwe_variance, base_log, level_count, delta_log
|
||||
(CMUXTreeTestParams){2, 256, 10, 6, 0.00000000000000029403601535432533,
|
||||
6, 3, 60});
|
||||
|
||||
std::string printParamName(::testing::TestParamInfo<CMUXTreeTestParams> p) {
|
||||
CMUXTreeTestParams params = p.param;
|
||||
|
||||
return "k_" + std::to_string(params.glwe_dimension) + "_N_" +
|
||||
std::to_string(params.polynomial_size) + "_tau_" +
|
||||
std::to_string(params.tau) + "_base_log_" +
|
||||
std::to_string(params.base_log) + "_level_count_" +
|
||||
std::to_string(params.level_count);
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(CMUXTreeInstantiation, CMUXTreeTestPrimitives_u64,
|
||||
cmux_tree_params_u64, printParamName);
|
||||
192
backends/concrete-cuda/implementation/test/test_keyswitch.cpp
Normal file
192
backends/concrete-cuda/implementation/test/test_keyswitch.cpp
Normal file
@@ -0,0 +1,192 @@
|
||||
#include "../include/device.h"
|
||||
#include "../include/keyswitch.h"
|
||||
#include "concrete-cpu.h"
|
||||
#include "utils.h"
|
||||
#include "gtest/gtest.h"
|
||||
#include <cstdint>
|
||||
#include <functional>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
// Number of independent key sets generated for each parameter set.
constexpr unsigned REPETITIONS = 5;
// Number of plaintexts exercised with each key set.
constexpr unsigned SAMPLES = 100;

// One parameter set for the keyswitch test. Field order matches the
// positional initializers used when the test suite is instantiated.
struct KeyswitchTestParams {
  int input_lwe_dimension;
  int output_lwe_dimension;
  double noise_variance; // used for both encryption and KSK generation
  int ksk_base_log;
  int ksk_level;
  int message_modulus;
  int carry_modulus;
};
|
||||
|
||||
class KeyswitchTestPrimitives_u64
|
||||
: public ::testing::TestWithParam<KeyswitchTestParams> {
|
||||
protected:
|
||||
int input_lwe_dimension;
|
||||
int output_lwe_dimension;
|
||||
double noise_variance;
|
||||
int ksk_base_log;
|
||||
int ksk_level;
|
||||
int message_modulus;
|
||||
int carry_modulus;
|
||||
int payload_modulus;
|
||||
uint64_t delta;
|
||||
Csprng *csprng;
|
||||
cudaStream_t *stream;
|
||||
int gpu_index = 0;
|
||||
uint64_t *lwe_sk_in_array;
|
||||
uint64_t *lwe_sk_out_array;
|
||||
uint64_t *plaintexts;
|
||||
uint64_t *d_ksk_array;
|
||||
uint64_t *d_lwe_out_ct;
|
||||
uint64_t *d_lwe_in_ct;
|
||||
uint64_t *lwe_in_ct;
|
||||
uint64_t *lwe_out_ct;
|
||||
int num_samples;
|
||||
|
||||
public:
|
||||
// Test arithmetic functions
|
||||
void SetUp() {
|
||||
stream = cuda_create_stream(0);
|
||||
void *v_stream = (void *)stream;
|
||||
|
||||
// TestParams
|
||||
input_lwe_dimension = (int)GetParam().input_lwe_dimension;
|
||||
output_lwe_dimension = (int)GetParam().output_lwe_dimension;
|
||||
noise_variance = (int)GetParam().noise_variance;
|
||||
ksk_base_log = (int)GetParam().ksk_base_log;
|
||||
ksk_level = (int)GetParam().ksk_level;
|
||||
message_modulus = (int)GetParam().message_modulus;
|
||||
carry_modulus = (int)GetParam().carry_modulus;
|
||||
|
||||
payload_modulus = message_modulus * carry_modulus;
|
||||
// Value of the shift we multiply our messages by
|
||||
delta = ((uint64_t)(1) << 63) / (uint64_t)(payload_modulus);
|
||||
|
||||
// Create a Csprng
|
||||
csprng =
|
||||
(Csprng *)aligned_alloc(CONCRETE_CSPRNG_ALIGN, CONCRETE_CSPRNG_SIZE);
|
||||
uint8_t seed[16] = {(uint8_t)0};
|
||||
concrete_cpu_construct_concrete_csprng(
|
||||
csprng, Uint128{.little_endian_bytes = {*seed}});
|
||||
|
||||
// Generate the keys
|
||||
generate_lwe_secret_keys(&lwe_sk_in_array, input_lwe_dimension, csprng, REPETITIONS);
|
||||
generate_lwe_secret_keys(&lwe_sk_out_array, output_lwe_dimension, csprng, REPETITIONS);
|
||||
generate_lwe_keyswitch_keys(
|
||||
stream, gpu_index, &d_ksk_array, lwe_sk_in_array, lwe_sk_out_array,
|
||||
input_lwe_dimension, output_lwe_dimension, ksk_level, ksk_base_log,
|
||||
csprng, noise_variance, REPETITIONS);
|
||||
plaintexts = generate_plaintexts(payload_modulus, delta, 1, REPETITIONS, SAMPLES);
|
||||
|
||||
d_lwe_out_ct = (uint64_t *)cuda_malloc_async(
|
||||
(output_lwe_dimension + 1) * sizeof(uint64_t), stream, gpu_index);
|
||||
|
||||
d_lwe_in_ct = (uint64_t *)cuda_malloc_async(
|
||||
(input_lwe_dimension + 1) * sizeof(uint64_t), stream, gpu_index);
|
||||
|
||||
lwe_in_ct =
|
||||
(uint64_t *)malloc((input_lwe_dimension + 1) * sizeof(uint64_t));
|
||||
lwe_out_ct =
|
||||
(uint64_t *)malloc((output_lwe_dimension + 1) * sizeof(uint64_t));
|
||||
|
||||
cuda_synchronize_stream(v_stream);
|
||||
}
|
||||
|
||||
void TearDown() {
|
||||
void *v_stream = (void *)stream;
|
||||
|
||||
cuda_synchronize_stream(v_stream);
|
||||
concrete_cpu_destroy_concrete_csprng(csprng);
|
||||
free(csprng);
|
||||
cuda_drop_async(d_lwe_in_ct, stream, gpu_index);
|
||||
cuda_drop_async(d_lwe_out_ct, stream, gpu_index);
|
||||
free(lwe_in_ct);
|
||||
free(lwe_out_ct);
|
||||
free(lwe_sk_in_array);
|
||||
free(lwe_sk_out_array);
|
||||
free(plaintexts);
|
||||
cuda_drop_async(d_ksk_array, stream, gpu_index);
|
||||
cuda_destroy_stream(stream, gpu_index);
|
||||
}
|
||||
};
|
||||
|
||||
// Encrypts each plaintext under the input key on the CPU, keyswitches it on
// the GPU, decrypts the result under the output key on the CPU and checks
// that it decodes to the original message.
TEST_P(KeyswitchTestPrimitives_u64, keyswitch) {
  void *v_stream = (void *)stream;

  // Element counts are kept in size_t so per-repetition offsets cannot wrap
  // in 32-bit arithmetic for large parameter sets.
  const size_t ksk_size =
      (size_t)ksk_level * (output_lwe_dimension + 1) * input_lwe_dimension;
  // The bit just below the message, used to round to the nearest multiple
  // of delta when decoding.
  const uint64_t rounding_bit = delta >> 1;

  for (unsigned r = 0; r < REPETITIONS; r++) {
    // Key material belonging to repetition r.
    uint64_t *lwe_in_sk = lwe_sk_in_array + (size_t)r * input_lwe_dimension;
    uint64_t *lwe_out_sk = lwe_sk_out_array + (size_t)r * output_lwe_dimension;
    uint64_t *d_ksk = d_ksk_array + ksk_size * r;

    for (unsigned s = 0; s < SAMPLES; s++) {
      uint64_t plaintext = plaintexts[r * SAMPLES + s];

      concrete_cpu_encrypt_lwe_ciphertext_u64(
          lwe_in_sk, lwe_in_ct, plaintext, input_lwe_dimension, noise_variance,
          csprng, &CONCRETE_CSPRNG_VTABLE);
      cuda_synchronize_stream(v_stream);
      cuda_memcpy_async_to_gpu(d_lwe_in_ct, lwe_in_ct,
                               (input_lwe_dimension + 1) * sizeof(uint64_t),
                               stream, gpu_index);
      // Execute keyswitch
      cuda_keyswitch_lwe_ciphertext_vector_64(
          stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_in_ct,
          (void *)d_ksk, input_lwe_dimension, output_lwe_dimension,
          ksk_base_log, ksk_level, 1);

      // Copy result back
      cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct,
                               (output_lwe_dimension + 1) * sizeof(uint64_t),
                               stream, gpu_index);
      // The copy is asynchronous: wait for it before the host reads
      // lwe_out_ct.
      cuda_synchronize_stream(v_stream);

      uint64_t decrypted = 0;
      concrete_cpu_decrypt_lwe_ciphertext_u64(lwe_out_sk, lwe_out_ct,
                                              output_lwe_dimension, &decrypted);
      // The raw decryption is expected to differ from the plaintext.
      EXPECT_NE(decrypted, plaintext);

      // Compute the rounding bit
      uint64_t rounding = (decrypted & rounding_bit) << 1;
      uint64_t decoded = (decrypted + rounding) / delta;
      ASSERT_EQ(decoded, plaintext / delta);
    }
  }
}
|
||||
|
||||
// Defines for which parameters set the PBS will be tested.
|
||||
// It executes each test for all pairs on phis X qs (Cartesian product)
|
||||
::testing::internal::ParamGenerator<KeyswitchTestParams> ksk_params_u64 =
|
||||
::testing::Values(
|
||||
// n, k*N, noise_variance, ks_base_log, ks_level,
|
||||
// message_modulus, carry_modulus
|
||||
// 1 bit message 0 bit carry parameters
|
||||
(KeyswitchTestParams){567, 1280, 2.9802322387695312e-08, 3, 3, 2, 1},
|
||||
// 3 bits message 0 bit carry parameters
|
||||
(KeyswitchTestParams){694, 1536, 2.9802322387695312e-08, 4, 3, 4, 1},
|
||||
// 4 bits message 0 bit carry parameters
|
||||
(KeyswitchTestParams){769, 2048, 2.9802322387695312e-08, 4, 3, 5, 1},
|
||||
// 5 bits message 0 bit carry parameters
|
||||
(KeyswitchTestParams){754, 2048, 2.9802322387695312e-08, 3, 5, 6, 1},
|
||||
// 6 bits message 0 bit carry parameters
|
||||
(KeyswitchTestParams){847, 4096, 2.9802322387695312e-08, 4, 4, 7, 1},
|
||||
// 7 bits message 0 bit carry parameters
|
||||
(KeyswitchTestParams){881, 8192, 2.9802322387695312e-08, 3, 6, 8, 1});
|
||||
|
||||
std::string printParamName(::testing::TestParamInfo<KeyswitchTestParams> p) {
|
||||
KeyswitchTestParams params = p.param;
|
||||
|
||||
return "na_" + std::to_string(params.input_lwe_dimension) + "_nb_" +
|
||||
std::to_string(params.output_lwe_dimension) + "_baselog_" +
|
||||
std::to_string(params.ksk_base_log) + "_ksk_level_" +
|
||||
std::to_string(params.ksk_level);
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(KeyswitchInstantiation, KeyswitchTestPrimitives_u64,
|
||||
ksk_params_u64, printParamName);
|
||||
@@ -0,0 +1,279 @@
|
||||
#include "../include/device.h"
|
||||
#include "../include/linear_algebra.h"
|
||||
#include "concrete-cpu.h"
|
||||
#include "utils.h"
|
||||
#include "gtest/gtest.h"
|
||||
#include <cstdint>
|
||||
#include <functional>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
// Number of independent key sets generated for each parameter set.
constexpr unsigned REPETITIONS = 5;
// Number of plaintext pairs exercised with each key set.
constexpr unsigned SAMPLES = 100;

// One parameter set for the linear algebra tests.
struct LinearAlgebraTestParams {
  int lwe_dimension;
  double noise_variance; // noise variance passed to LWE encryption
  int message_modulus;
  int carry_modulus;
};
|
||||
|
||||
class LinearAlgebraTestPrimitives_u64
|
||||
: public ::testing::TestWithParam<LinearAlgebraTestParams> {
|
||||
protected:
|
||||
int lwe_dimension;
|
||||
double noise_variance;
|
||||
int message_modulus;
|
||||
int carry_modulus;
|
||||
int payload_modulus;
|
||||
uint64_t delta;
|
||||
Csprng *csprng;
|
||||
cudaStream_t *stream;
|
||||
int gpu_index = 0;
|
||||
uint64_t *lwe_sk_array;
|
||||
uint64_t *d_lwe_in_1_ct;
|
||||
uint64_t *d_lwe_in_2_ct;
|
||||
uint64_t *d_lwe_out_ct;
|
||||
uint64_t *lwe_in_1_ct;
|
||||
uint64_t *lwe_in_2_ct;
|
||||
uint64_t *lwe_out_ct;
|
||||
uint64_t *plaintexts_1;
|
||||
uint64_t *plaintexts_2;
|
||||
int num_samples;
|
||||
|
||||
public:
|
||||
// Test arithmetic functions
|
||||
void SetUp() {
|
||||
stream = cuda_create_stream(0);
|
||||
void *v_stream = (void *)stream;
|
||||
|
||||
// TestParams
|
||||
lwe_dimension = (int)GetParam().lwe_dimension;
|
||||
noise_variance = (int)GetParam().noise_variance;
|
||||
message_modulus = (int)GetParam().message_modulus;
|
||||
carry_modulus = (int)GetParam().carry_modulus;
|
||||
|
||||
payload_modulus = message_modulus * carry_modulus;
|
||||
// Value of the shift we multiply our messages by
|
||||
delta = ((uint64_t)(1) << 63) / (uint64_t)(payload_modulus);
|
||||
|
||||
// Create a Csprng
|
||||
csprng =
|
||||
(Csprng *)aligned_alloc(CONCRETE_CSPRNG_ALIGN, CONCRETE_CSPRNG_SIZE);
|
||||
uint8_t seed[16] = {(uint8_t)0};
|
||||
concrete_cpu_construct_concrete_csprng(
|
||||
csprng, Uint128{.little_endian_bytes = {*seed}});
|
||||
|
||||
// Generate the keys
|
||||
generate_lwe_secret_keys(&lwe_sk_array, lwe_dimension, csprng, REPETITIONS);
|
||||
plaintexts_1 = generate_plaintexts(payload_modulus, delta, 1, REPETITIONS, SAMPLES);
|
||||
plaintexts_2 = generate_plaintexts(payload_modulus, delta, 1, REPETITIONS, SAMPLES);
|
||||
|
||||
d_lwe_in_1_ct = (uint64_t *)cuda_malloc_async(
|
||||
(lwe_dimension + 1) * sizeof(uint64_t), stream, gpu_index);
|
||||
d_lwe_in_2_ct = (uint64_t *)cuda_malloc_async(
|
||||
(lwe_dimension + 1) * sizeof(uint64_t), stream, gpu_index);
|
||||
d_lwe_out_ct = (uint64_t *)cuda_malloc_async(
|
||||
(lwe_dimension + 1) * sizeof(uint64_t), stream, gpu_index);
|
||||
|
||||
lwe_in_1_ct = (uint64_t *)malloc((lwe_dimension + 1) * sizeof(uint64_t));
|
||||
lwe_in_2_ct = (uint64_t *)malloc((lwe_dimension + 1) * sizeof(uint64_t));
|
||||
lwe_out_ct = (uint64_t *)malloc((lwe_dimension + 1) * sizeof(uint64_t));
|
||||
|
||||
cuda_synchronize_stream(v_stream);
|
||||
}
|
||||
|
||||
void TearDown() {
|
||||
void *v_stream = (void *)stream;
|
||||
|
||||
cuda_synchronize_stream(v_stream);
|
||||
concrete_cpu_destroy_concrete_csprng(csprng);
|
||||
free(csprng);
|
||||
cuda_drop_async(d_lwe_in_1_ct, stream, gpu_index);
|
||||
cuda_drop_async(d_lwe_in_2_ct, stream, gpu_index);
|
||||
cuda_drop_async(d_lwe_out_ct, stream, gpu_index);
|
||||
free(lwe_in_1_ct);
|
||||
free(lwe_in_2_ct);
|
||||
free(lwe_out_ct);
|
||||
free(lwe_sk_array);
|
||||
free(plaintexts_1);
|
||||
free(plaintexts_2);
|
||||
}
|
||||
};
|
||||
|
||||
// Encrypts two plaintexts on the CPU, adds the ciphertexts on the GPU,
// decrypts the sum on the CPU and checks it decodes to the sum of the
// messages.
TEST_P(LinearAlgebraTestPrimitives_u64, addition) {
  void *v_stream = (void *)stream;
  const size_t lwe_bytes = (size_t)(lwe_dimension + 1) * sizeof(uint64_t);
  // The bit just below the message, used to round to the nearest multiple
  // of delta when decoding.
  const uint64_t rounding_bit = delta >> 1;

  for (unsigned r = 0; r < REPETITIONS; r++) {
    // Secret key belonging to repetition r.
    uint64_t *lwe_sk = lwe_sk_array + (size_t)r * lwe_dimension;

    for (unsigned s = 0; s < SAMPLES; s++) {
      uint64_t plaintext_1 = plaintexts_1[r * SAMPLES + s];
      uint64_t plaintext_2 = plaintexts_2[r * SAMPLES + s];

      concrete_cpu_encrypt_lwe_ciphertext_u64(lwe_sk, lwe_in_1_ct, plaintext_1,
                                              lwe_dimension, noise_variance,
                                              csprng, &CONCRETE_CSPRNG_VTABLE);
      concrete_cpu_encrypt_lwe_ciphertext_u64(lwe_sk, lwe_in_2_ct, plaintext_2,
                                              lwe_dimension, noise_variance,
                                              csprng, &CONCRETE_CSPRNG_VTABLE);
      cuda_synchronize_stream(v_stream);
      cuda_memcpy_async_to_gpu(d_lwe_in_1_ct, lwe_in_1_ct, lwe_bytes, stream,
                               gpu_index);
      cuda_memcpy_async_to_gpu(d_lwe_in_2_ct, lwe_in_2_ct, lwe_bytes, stream,
                               gpu_index);
      // Execute addition
      cuda_add_lwe_ciphertext_vector_64(
          stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_in_1_ct,
          (void *)d_lwe_in_2_ct, lwe_dimension, 1);

      // Copy result back
      cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct, lwe_bytes, stream,
                               gpu_index);
      // The copy is asynchronous: wait for it before the host reads
      // lwe_out_ct.
      cuda_synchronize_stream(v_stream);

      uint64_t decrypted = 0;
      concrete_cpu_decrypt_lwe_ciphertext_u64(lwe_sk, lwe_out_ct, lwe_dimension,
                                              &decrypted);
      // Compute the rounding bit
      uint64_t rounding = (decrypted & rounding_bit) << 1;
      uint64_t decoded = (decrypted + rounding) / delta;
      ASSERT_EQ(decoded, (plaintext_1 + plaintext_2) / delta);
    }
  }
}
|
||||
|
||||
// Encrypts one plaintext on the CPU, adds a second (clear) plaintext to the
// ciphertext on the GPU, decrypts the result on the CPU and checks it decodes
// to the sum of the messages.
TEST_P(LinearAlgebraTestPrimitives_u64, plaintext_addition) {
  void *v_stream = (void *)stream;
  const size_t lwe_bytes = (size_t)(lwe_dimension + 1) * sizeof(uint64_t);
  // The bit just below the message, used to round to the nearest multiple
  // of delta when decoding.
  const uint64_t rounding_bit = delta >> 1;

  for (unsigned r = 0; r < REPETITIONS; r++) {
    // Secret key belonging to repetition r.
    uint64_t *lwe_sk = lwe_sk_array + (size_t)r * lwe_dimension;

    for (unsigned s = 0; s < SAMPLES; s++) {
      uint64_t plaintext_1 = plaintexts_1[r * SAMPLES + s];
      uint64_t plaintext_2 = plaintexts_2[r * SAMPLES + s];

      concrete_cpu_encrypt_lwe_ciphertext_u64(lwe_sk, lwe_in_1_ct, plaintext_1,
                                              lwe_dimension, noise_variance,
                                              csprng, &CONCRETE_CSPRNG_VTABLE);
      cuda_synchronize_stream(v_stream);
      cuda_memcpy_async_to_gpu(d_lwe_in_1_ct, lwe_in_1_ct, lwe_bytes, stream,
                               gpu_index);
      // The second operand is a single clear plaintext word; it reuses the
      // second input buffer on the device.
      cuda_memcpy_async_to_gpu(d_lwe_in_2_ct, &plaintext_2, sizeof(uint64_t),
                               stream, gpu_index);
      // Execute addition
      cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
          stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_in_1_ct,
          (void *)d_lwe_in_2_ct, lwe_dimension, 1);
      // Copy result back
      cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct, lwe_bytes, stream,
                               gpu_index);
      // The copy is asynchronous: wait for it before the host reads
      // lwe_out_ct.
      cuda_synchronize_stream(v_stream);

      uint64_t decrypted = 0;
      concrete_cpu_decrypt_lwe_ciphertext_u64(lwe_sk, lwe_out_ct, lwe_dimension,
                                              &decrypted);
      // Compute the rounding bit
      uint64_t rounding = (decrypted & rounding_bit) << 1;
      uint64_t decoded = (decrypted + rounding) / delta;
      ASSERT_EQ(decoded, (plaintext_1 + plaintext_2) / delta);
    }
  }
}
|
||||
|
||||
// Ciphertext * cleartext multiplication on the GPU.
// For every (repetition, sample) pair: encrypt plaintext_1 on the CPU, upload
// it together with plaintext_2 as the multiplier, multiply on the device, then
// decrypt on the CPU and compare the decoded message with the product.
TEST_P(LinearAlgebraTestPrimitives_u64, plaintext_multiplication) {
  void *v_stream = (void *)stream;
  // Size in bytes of one ciphertext buffer (lwe_dimension + 1 words).
  const auto lwe_ct_bytes = (lwe_dimension + 1) * sizeof(uint64_t);

  for (unsigned r = 0; r < REPETITIONS; r++) {
    // One secret key per repetition.
    uint64_t *lwe_sk = lwe_sk_array + (ptrdiff_t)(r * lwe_dimension);
    for (unsigned s = 0; s < SAMPLES; s++) {
      uint64_t plaintext_1 = plaintexts_1[r * SAMPLES + s];
      uint64_t plaintext_2 = plaintexts_2[r * SAMPLES + s];

      // Encrypt the ciphertext operand on the CPU
      concrete_cpu_encrypt_lwe_ciphertext_u64(lwe_sk, lwe_in_1_ct, plaintext_1,
                                              lwe_dimension, noise_variance,
                                              csprng, &CONCRETE_CSPRNG_VTABLE);
      cuda_synchronize_stream(v_stream);

      cuda_memcpy_async_to_gpu(d_lwe_in_1_ct, lwe_in_1_ct, lwe_ct_bytes,
                               stream, gpu_index);
      // The multiplier must be plaintext_2: the assertion below expects
      // plaintext_1 * plaintext_2 (plaintext_1 was uploaded here before).
      // NOTE(review): plaintext_2 is used unscaled by delta here, as in the
      // expected value below - confirm this is the intended cleartext encoding.
      cuda_memcpy_async_to_gpu(d_lwe_in_2_ct, &plaintext_2, sizeof(uint64_t),
                               stream, gpu_index);

      // Execute cleartext multiplication
      cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
          stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_in_1_ct,
          (void *)d_lwe_in_2_ct, lwe_dimension, 1);

      // Copy result back
      cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct, lwe_ct_bytes, stream,
                               gpu_index);
      // The copy is asynchronous: wait for the stream before the host reads
      // lwe_out_ct, instead of synchronizing only after the assertion.
      cuda_synchronize_stream(v_stream);

      uint64_t decrypted = 0;
      concrete_cpu_decrypt_lwe_ciphertext_u64(lwe_sk, lwe_out_ct,
                                              lwe_dimension, &decrypted);

      // Round to the nearest multiple of delta: if the bit just below the
      // message is set, add delta before dividing.
      uint64_t rounding_bit = delta >> 1;
      uint64_t rounding = (decrypted & rounding_bit) << 1;
      uint64_t decoded = (decrypted + rounding) / delta;
      ASSERT_EQ(decoded, (plaintext_1 * plaintext_2) / delta);
    }
  }
}
|
||||
|
||||
// Ciphertext negation on the GPU.
// For every (repetition, sample) pair: encrypt the plaintext on the CPU,
// negate the ciphertext on the device, then decrypt on the CPU and compare the
// decoded message with the negated plaintext.
TEST_P(LinearAlgebraTestPrimitives_u64, negate) {
  void *v_stream = (void *)stream;
  // Size in bytes of one ciphertext buffer (lwe_dimension + 1 words).
  const auto lwe_ct_bytes = (lwe_dimension + 1) * sizeof(uint64_t);

  for (unsigned r = 0; r < REPETITIONS; r++) {
    // One secret key per repetition.
    uint64_t *lwe_sk = lwe_sk_array + (ptrdiff_t)(r * lwe_dimension);
    for (unsigned s = 0; s < SAMPLES; s++) {
      uint64_t plaintext = plaintexts_1[r * SAMPLES + s];

      // Encrypt the input on the CPU
      concrete_cpu_encrypt_lwe_ciphertext_u64(lwe_sk, lwe_in_1_ct, plaintext,
                                              lwe_dimension, noise_variance,
                                              csprng, &CONCRETE_CSPRNG_VTABLE);
      cuda_synchronize_stream(v_stream);

      cuda_memcpy_async_to_gpu(d_lwe_in_1_ct, lwe_in_1_ct, lwe_ct_bytes,
                               stream, gpu_index);

      // Execute negation
      cuda_negate_lwe_ciphertext_vector_64(
          stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_in_1_ct,
          lwe_dimension, 1);

      // Copy result back
      cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct, lwe_ct_bytes, stream,
                               gpu_index);
      // The copy is asynchronous: wait for the stream before the host reads
      // lwe_out_ct, instead of synchronizing only after the assertion.
      cuda_synchronize_stream(v_stream);

      uint64_t decrypted = 0;
      concrete_cpu_decrypt_lwe_ciphertext_u64(lwe_sk, lwe_out_ct,
                                              lwe_dimension, &decrypted);

      // Round to the nearest multiple of delta: if the bit just below the
      // message is set, add delta before dividing.
      uint64_t rounding_bit = delta >> 1;
      uint64_t rounding = (decrypted & rounding_bit) << 1;
      uint64_t decoded = (decrypted + rounding) / delta;
      ASSERT_EQ(decoded, -plaintext / delta);
    }
  }
}
|
||||
|
||||
// Defines for which parameters set the linear algebra operations will be
|
||||
// tested. It executes each test for all pairs on phis X qs (Cartesian product)
|
||||
::testing::internal::ParamGenerator<LinearAlgebraTestParams>
|
||||
linear_algebra_params_u64 = ::testing::Values(
|
||||
// n, lwe_std_dev, message_modulus, carry_modulus
|
||||
(LinearAlgebraTestParams){600, 0.000007069849454709433, 4, 4});
|
||||
|
||||
std::string
|
||||
printParamName(::testing::TestParamInfo<LinearAlgebraTestParams> p) {
|
||||
LinearAlgebraTestParams params = p.param;
|
||||
|
||||
return "n_" + std::to_string(params.lwe_dimension);
|
||||
}
|
||||
|
||||
// Instantiate every LinearAlgebraTestPrimitives_u64 test for each entry of
// linear_algebra_params_u64, naming the instances with printParamName.
INSTANTIATE_TEST_CASE_P(
    LinearAlgebraInstantiation, LinearAlgebraTestPrimitives_u64,
    linear_algebra_params_u64, printParamName);
|
||||
@@ -0,0 +1,289 @@
|
||||
#include "../include/bootstrap.h"
|
||||
#include "../include/device.h"
|
||||
#include "concrete-cpu.h"
|
||||
#include "utils.h"
|
||||
#include "gtest/gtest.h"
|
||||
#include <cmath>
|
||||
#include <cstdint>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
// Number of independent key sets generated by the fixture (one per repetition).
const unsigned int REPETITIONS = 5u;
// Number of inputs run against each key set.
const unsigned int SAMPLES = 100u;
|
||||
|
||||
// One parameter set for the WoP-PBS test. Plain aggregate: instances are
// created by listing the fields in declaration order.
struct WopBootstrapTestParams {
  int lwe_dimension;            // n
  int glwe_dimension;           // k
  int polynomial_size;          // N
  double lwe_modular_variance;  // noise variance used for LWE encryptions and
                                // for the keyswitch / private functional
                                // keyswitch keys
  double glwe_modular_variance; // noise variance used for the bootstrap key
  int pbs_base_log;             // bootstrap decomposition base (log2)
  int pbs_level;                // bootstrap decomposition level count
  int ks_base_log;              // keyswitch decomposition base (log2)
  int ks_level;                 // keyswitch decomposition level count
  int pksk_base_log;            // private functional keyswitch base (log2)
  int pksk_level;               // private functional keyswitch level count
  int cbs_base_log;             // circuit bootstrap base (log2)
  int cbs_level;                // circuit bootstrap level count
  int tau;                      // number of input ciphertexts per WoP-PBS call
};
|
||||
|
||||
class WopBootstrapTestPrimitives_u64
|
||||
: public ::testing::TestWithParam<WopBootstrapTestParams> {
|
||||
protected:
|
||||
int lwe_dimension;
|
||||
int glwe_dimension;
|
||||
int polynomial_size;
|
||||
double lwe_modular_variance;
|
||||
double glwe_modular_variance;
|
||||
int pbs_base_log;
|
||||
int pbs_level;
|
||||
int ks_base_log;
|
||||
int ks_level;
|
||||
int pksk_base_log;
|
||||
int pksk_level;
|
||||
int cbs_base_log;
|
||||
int cbs_level;
|
||||
int tau;
|
||||
int p;
|
||||
uint64_t delta;
|
||||
uint32_t cbs_delta_log;
|
||||
int delta_log;
|
||||
int delta_log_lut;
|
||||
Csprng *csprng;
|
||||
cudaStream_t *stream;
|
||||
int gpu_index = 0;
|
||||
uint64_t *lwe_sk_in_array;
|
||||
uint64_t *lwe_sk_out_array;
|
||||
uint64_t *lwe_in_ct_array;
|
||||
uint64_t *lwe_out_ct_array;
|
||||
uint64_t *plaintexts;
|
||||
double *d_fourier_bsk_array;
|
||||
uint64_t *d_ksk_array;
|
||||
uint64_t *d_pksk_array;
|
||||
uint64_t *d_lwe_ct_in_array;
|
||||
uint64_t *d_lwe_ct_out_array;
|
||||
uint64_t *d_lut_vector;
|
||||
int8_t *wop_pbs_buffer;
|
||||
|
||||
public:
|
||||
// Test arithmetic functions
|
||||
void SetUp() {
|
||||
stream = cuda_create_stream(0);
|
||||
void *v_stream = (void *)stream;
|
||||
|
||||
// TestParams
|
||||
lwe_dimension = (int)GetParam().lwe_dimension;
|
||||
glwe_dimension = (int)GetParam().glwe_dimension;
|
||||
polynomial_size = (int)GetParam().polynomial_size;
|
||||
lwe_modular_variance = (double)GetParam().lwe_modular_variance;
|
||||
glwe_modular_variance = (double)GetParam().glwe_modular_variance;
|
||||
pbs_base_log = (int)GetParam().pbs_base_log;
|
||||
pbs_level = (int)GetParam().pbs_level;
|
||||
ks_base_log = (int)GetParam().ks_base_log;
|
||||
ks_level = (int)GetParam().ks_level;
|
||||
pksk_base_log = (int)GetParam().pksk_base_log;
|
||||
pksk_level = (int)GetParam().pksk_level;
|
||||
cbs_base_log = (int)GetParam().cbs_base_log;
|
||||
cbs_level = (int)GetParam().cbs_level;
|
||||
tau = (int)GetParam().tau;
|
||||
p = 10 / tau;
|
||||
delta_log = 64 - p;
|
||||
delta_log_lut = delta_log;
|
||||
delta = (uint64_t)(1) << delta_log;
|
||||
|
||||
// Create a Csprng
|
||||
csprng =
|
||||
(Csprng *)aligned_alloc(CONCRETE_CSPRNG_ALIGN, CONCRETE_CSPRNG_SIZE);
|
||||
uint8_t seed[16] = {(uint8_t)0};
|
||||
concrete_cpu_construct_concrete_csprng(
|
||||
csprng, Uint128{.little_endian_bytes = {*seed}});
|
||||
|
||||
int input_lwe_dimension = glwe_dimension * polynomial_size;
|
||||
// Generate the keys
|
||||
generate_lwe_secret_keys(&lwe_sk_in_array, input_lwe_dimension, csprng, REPETITIONS);
|
||||
generate_lwe_secret_keys(&lwe_sk_out_array, lwe_dimension, csprng, REPETITIONS);
|
||||
generate_lwe_keyswitch_keys(stream, gpu_index, &d_ksk_array,
|
||||
lwe_sk_in_array, lwe_sk_out_array,
|
||||
input_lwe_dimension, lwe_dimension, ks_level,
|
||||
ks_base_log, csprng, lwe_modular_variance, REPETITIONS);
|
||||
generate_lwe_bootstrap_keys(
|
||||
stream, gpu_index, &d_fourier_bsk_array, lwe_sk_out_array,
|
||||
lwe_sk_in_array, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
pbs_level, pbs_base_log, csprng, glwe_modular_variance, REPETITIONS);
|
||||
generate_lwe_private_functional_keyswitch_key_lists(
|
||||
stream, gpu_index, &d_pksk_array, lwe_sk_in_array, lwe_sk_in_array,
|
||||
input_lwe_dimension, glwe_dimension, polynomial_size, pksk_level,
|
||||
pksk_base_log, csprng, lwe_modular_variance, REPETITIONS);
|
||||
plaintexts = generate_plaintexts(p, delta, tau, REPETITIONS, SAMPLES);
|
||||
|
||||
// LUT creation
|
||||
int lut_size = polynomial_size;
|
||||
int lut_num = tau << (tau * p - (int)log2(polynomial_size)); // r
|
||||
|
||||
uint64_t *big_lut =
|
||||
(uint64_t *)malloc(lut_num * lut_size * sizeof(uint64_t));
|
||||
for (int t = tau - 1; t >= 0; t--) {
|
||||
uint64_t *small_lut = big_lut + (ptrdiff_t)(t * (1 << (tau * p)));
|
||||
for (uint64_t value = 0; value < (uint64_t)(1 << (tau * p)); value++) {
|
||||
int nbits = t * p;
|
||||
uint64_t x = (value >> nbits) & (uint64_t)((1 << p) - 1);
|
||||
small_lut[value] =
|
||||
((x % (uint64_t)(1 << (64 - delta_log))) << delta_log_lut);
|
||||
}
|
||||
}
|
||||
d_lut_vector = (uint64_t *)cuda_malloc_async(
|
||||
lut_num * lut_size * sizeof(uint64_t), stream, gpu_index);
|
||||
cuda_memcpy_async_to_gpu(d_lut_vector, big_lut,
|
||||
lut_num * lut_size * sizeof(uint64_t), stream,
|
||||
gpu_index);
|
||||
free(big_lut);
|
||||
// Execute scratch
|
||||
scratch_cuda_wop_pbs_64(stream, gpu_index, &wop_pbs_buffer,
|
||||
(uint32_t *)&delta_log, &cbs_delta_log,
|
||||
glwe_dimension, lwe_dimension, polynomial_size,
|
||||
cbs_level, pbs_level, p, p, tau,
|
||||
cuda_get_max_shared_memory(gpu_index), true);
|
||||
// Allocate input
|
||||
d_lwe_ct_in_array = (uint64_t *)cuda_malloc_async(
|
||||
(input_lwe_dimension + 1) * tau * sizeof(uint64_t), stream, gpu_index);
|
||||
// Allocate output
|
||||
d_lwe_ct_out_array = (uint64_t *)cuda_malloc_async(
|
||||
(input_lwe_dimension + 1) * tau * sizeof(uint64_t), stream, gpu_index);
|
||||
lwe_in_ct_array =
|
||||
(uint64_t *)malloc((input_lwe_dimension + 1) * tau * sizeof(uint64_t));
|
||||
lwe_out_ct_array =
|
||||
(uint64_t *)malloc((input_lwe_dimension + 1) * tau * sizeof(uint64_t));
|
||||
}
|
||||
|
||||
void TearDown() {
|
||||
void *v_stream = (void *)stream;
|
||||
|
||||
cuda_synchronize_stream(v_stream);
|
||||
concrete_cpu_destroy_concrete_csprng(csprng);
|
||||
free(csprng);
|
||||
free(lwe_sk_in_array);
|
||||
free(lwe_sk_out_array);
|
||||
free(plaintexts);
|
||||
free(lwe_in_ct_array);
|
||||
free(lwe_out_ct_array);
|
||||
cleanup_cuda_circuit_bootstrap_vertical_packing(stream, gpu_index,
|
||||
&wop_pbs_buffer);
|
||||
cuda_drop_async(d_fourier_bsk_array, stream, gpu_index);
|
||||
cuda_drop_async(d_ksk_array, stream, gpu_index);
|
||||
cuda_drop_async(d_pksk_array, stream, gpu_index);
|
||||
cuda_drop_async(d_lwe_ct_in_array, stream, gpu_index);
|
||||
cuda_drop_async(d_lwe_ct_out_array, stream, gpu_index);
|
||||
cuda_drop_async(d_lut_vector, stream, gpu_index);
|
||||
cuda_destroy_stream(stream, gpu_index);
|
||||
}
|
||||
};
|
||||
|
||||
// Runs the GPU WoP-PBS on SAMPLES batches of tau ciphertexts for each of the
// REPETITIONS key sets. Each batch is encrypted on the CPU under the
// repetition's input key, uploaded and processed on the device.
// The decryption/verification step is currently disabled (see below), so this
// test only checks that the call sequence runs.
TEST_P(WopBootstrapTestPrimitives_u64, wop_pbs) {
  void *v_stream = (void *)stream;
  int input_lwe_dimension = glwe_dimension * polynomial_size;
  // Per-repetition strides (in elements) inside the key arrays; they mirror
  // the sizes used by the key generators.
  int bsk_size = (glwe_dimension + 1) * (glwe_dimension + 1) * pbs_level *
                 polynomial_size * (lwe_dimension + 1);
  int ksk_size =
      ks_level * (lwe_dimension + 1) * glwe_dimension * polynomial_size;
  int pksk_list_size = pksk_level * (glwe_dimension + 1) * polynomial_size *
                       (glwe_dimension * polynomial_size + 1) *
                       (glwe_dimension + 1);

  for (unsigned r = 0; r < REPETITIONS; r++) {
    // Keys of the current repetition.
    double *d_fourier_bsk = d_fourier_bsk_array + (ptrdiff_t)(bsk_size * r);
    uint64_t *lwe_sk_in =
        lwe_sk_in_array + (ptrdiff_t)(r * glwe_dimension * polynomial_size);
    uint64_t *d_ksk = d_ksk_array + (ptrdiff_t)(ksk_size * r);
    uint64_t *d_pksk_list = d_pksk_array + (ptrdiff_t)(pksk_list_size * r);

    for (unsigned s = 0; s < SAMPLES; s++) {
      // Encrypt the tau inputs of this sample.
      for (int t = 0; t < tau; t++) {
        uint64_t plaintext = plaintexts[r * SAMPLES * tau + s * tau + t];
        // lwe_in_ct_array only holds tau ciphertexts (see SetUp) and is the
        // staging buffer uploaded below, so it is indexed by t alone.
        // Indexing it by (r, s, t) wrote past the end of the allocation.
        uint64_t *lwe_in_ct =
            lwe_in_ct_array + (ptrdiff_t)(t * (input_lwe_dimension + 1));
        concrete_cpu_encrypt_lwe_ciphertext_u64(
            lwe_sk_in, lwe_in_ct, plaintext, input_lwe_dimension,
            lwe_modular_variance, csprng, &CONCRETE_CSPRNG_VTABLE);
      }
      cuda_synchronize_stream(v_stream);
      cuda_memcpy_async_to_gpu(d_lwe_ct_in_array, lwe_in_ct_array,
                               (input_lwe_dimension + 1) * tau *
                                   sizeof(uint64_t),
                               stream, gpu_index);

      // Execute wop pbs
      cuda_wop_pbs_64(stream, gpu_index, (void *)d_lwe_ct_out_array,
                      (void *)d_lwe_ct_in_array, (void *)d_lut_vector,
                      (void *)d_fourier_bsk, (void *)d_ksk, (void *)d_pksk_list,
                      wop_pbs_buffer, cbs_delta_log, glwe_dimension,
                      lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
                      ks_base_log, ks_level, pksk_base_log, pksk_level,
                      cbs_base_log, cbs_level, p, p, delta_log, tau,
                      cuda_get_max_shared_memory(gpu_index));

      // TODO: result verification is disabled. Note that `plaintext` is not
      // in scope in the loop below; it has to be re-read from `plaintexts`
      // when this is re-enabled.
      //
      //// Copy result back
      // cuda_memcpy_async_to_cpu(lwe_out_ct_array, d_lwe_ct_out_array,
      //(input_lwe_dimension + 1) * tau * sizeof(uint64_t), stream, gpu_index);
      // cuda_synchronize_stream(v_stream);

      // for (int i = 0; i < tau; i++) {
      //   uint64_t *result_ct =
      //       lwe_out_ct_array + (ptrdiff_t)(i * (input_lwe_dimension + 1));
      //   uint64_t decrypted_message = 0;
      //   concrete_cpu_decrypt_lwe_ciphertext_u64(
      //       lwe_sk_in, result_ct, input_lwe_dimension, &decrypted_message);
      //   // Round after decryption
      //   uint64_t decrypted =
      //       closest_representable(decrypted_message, 1, p) >> delta_log;
      //   uint64_t expected = plaintext >> delta_log;
      //   EXPECT_EQ(decrypted, expected);
      //}
    }
  }
}
|
||||
|
||||
// Defines for which parameters set the PBS will be tested.
|
||||
// It executes each test for all pairs on phis X qs (Cartesian product)
|
||||
::testing::internal::ParamGenerator<WopBootstrapTestParams> wop_pbs_params_u64 =
|
||||
::testing::Values(
|
||||
// n, k, N, lwe_variance, glwe_variance, pbs_base_log, pbs_level,
|
||||
// ks_base_log, ks_level, tau
|
||||
(WopBootstrapTestParams){481, 2, 512, 7.52316384526264e-37,
|
||||
7.52316384526264e-37, 4,
|
||||
9, 1, 9, 4, 9, 6, 4, 1}//,
|
||||
//(WopBootstrapTestParams){481, 2, 512, 7.52316384526264e-37,
|
||||
// 7.52316384526264e-37, 4, 9, 1, 9, 4, 9, 6, 4,
|
||||
// 2} //,
|
||||
//(WopBootstrapTestParams){481, 2, 1024, 7.52316384526264e-37,
|
||||
// 7.52316384526264e-37, 4,
|
||||
// 9, 1, 9, 4, 9, 6, 4, 1},
|
||||
//(WopBootstrapTestParams){481, 2, 1024, 7.52316384526264e-37,
|
||||
// 7.52316384526264e-37, 4,
|
||||
// 9, 1, 9, 4, 9, 6, 4, 2}
|
||||
);
|
||||
|
||||
// Builds the human-readable suffix of an instantiated WoP-PBS test name.
// The prefix depends on the polynomial size (512 or 1024); any other size
// yields "Unknown_parameter_set".
std::string printParamName(::testing::TestParamInfo<WopBootstrapTestParams> p) {
  const WopBootstrapTestParams params = p.param;

  const char *prefix = nullptr;
  switch (params.polynomial_size) {
  case 512:
    // When log_2_poly_size == 9 we have a cmux tree done with a single cmux.
    prefix = "wop_pbs_cmux_tree_with_single_cmux_n_";
    break;
  case 1024:
    // When log_2_poly_size == 10 the VP skips the cmux tree.
    prefix = "wop_pbs_without_cmux_tree_n_";
    break;
  default:
    return "Unknown_parameter_set";
  }

  std::string message = prefix;
  message += std::to_string(params.lwe_dimension);
  message += "_k_";
  message += std::to_string(params.glwe_dimension);
  message += "_N_";
  message += std::to_string(params.polynomial_size);
  message += "_tau_";
  message += std::to_string(params.tau);
  return message;
}
|
||||
|
||||
// Instantiate every WopBootstrapTestPrimitives_u64 test for each entry of
// wop_pbs_params_u64, naming the instances with printParamName.
INSTANTIATE_TEST_CASE_P(
    WopBootstrapInstantiation, WopBootstrapTestPrimitives_u64,
    wop_pbs_params_u64, printParamName);
|
||||
290
backends/concrete-cuda/implementation/test/utils.cpp
Normal file
290
backends/concrete-cuda/implementation/test/utils.cpp
Normal file
@@ -0,0 +1,290 @@
|
||||
#include "utils.h"
#include "../include/bootstrap.h"
#include "../include/device.h"
#include "concrete-cpu.h"
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <limits>
#include <random>
#include <vector>
|
||||
|
||||
// Allocates and fills an array of repetitions * samples * number_of_inputs
// random plaintexts, laid out repetition-major, then sample, then input.
// Each plaintext is a uniformly drawn 64-bit value reduced modulo
// payload_modulus and multiplied by delta.
// The payload_modulus is the message modulus times the carry modulus
// (so the total message modulus).
// The returned buffer is malloc'ed; the caller releases it with free().
uint64_t *generate_plaintexts(uint64_t payload_modulus, uint64_t delta,
                              int number_of_inputs, const unsigned repetitions,
                              const unsigned samples) {
  uint64_t *plaintext_array = (uint64_t *)malloc(
      repetitions * samples * number_of_inputs * sizeof(uint64_t));

  // Non-deterministically seeded generator over the full 64-bit range.
  std::random_device rd;
  std::mt19937 gen(rd());
  std::uniform_int_distribution<unsigned long long> dis(
      std::numeric_limits<std::uint64_t>::min(),
      std::numeric_limits<std::uint64_t>::max());

  // The three nested loops visit the array in storage order, so a running
  // cursor replaces the explicit index computation.
  uint64_t *cursor = plaintext_array;
  for (unsigned r = 0; r < repetitions; r++) {
    for (unsigned s = 0; s < samples; s++) {
      for (int i = 0; i < number_of_inputs; i++) {
        const uint64_t message = dis(gen) % payload_modulus;
        *cursor++ = message * delta;
      }
    }
  }
  return plaintext_array;
}
|
||||
|
||||
// Decompose value in r bits.
// Returns a malloc'ed array of r words where element i holds bit i of value,
// i.e. the least significant bit comes first. The caller releases the array
// with free().
uint64_t *bit_decompose_value(uint64_t value, int r) {
  uint64_t *bit_array = (uint64_t *)malloc(r * sizeof(uint64_t));

  uint64_t remaining = value;
  int i = 0;
  while (i < r) {
    bit_array[i] = remaining % 2;
    remaining /= 2;
    ++i;
  }
  return bit_array;
}
|
||||
|
||||
// Builds the lookup table used by the PBS for the function `func`, as a
// trivial GLWE ciphertext: glwe_dimension zero polynomials followed by the
// body polynomial.
//
// The body polynomial is split into message_modulus * carry_modulus boxes of
// polynomial_size / modulus_sup coefficients; every coefficient of box i holds
// func(i) * delta, with delta = 2^63 / modulus_sup. The first half box is then
// negated and the whole polynomial rotated left by half a box.
//
// Coefficients not covered by a box (polynomial_size not a multiple of
// modulus_sup) are zero; they used to be left uninitialized.
//
// Returns a malloc'ed array of polynomial_size * (glwe_dimension + 1) words
// (nullptr if the allocation fails); the caller releases it with free().
uint64_t *generate_identity_lut_pbs(int polynomial_size, int glwe_dimension,
                                    int message_modulus, int carry_modulus,
                                    std::function<uint64_t(uint64_t)> func) {
  // Modulus of the msg contained in the msg bits and operations buffer
  uint64_t modulus_sup = message_modulus * carry_modulus;

  // Size of each box
  uint64_t box_size = polynomial_size / modulus_sup;

  // Value of the shift we multiply our messages by
  uint64_t delta = ((uint64_t)1 << 63) / (uint64_t)(modulus_sup);

  // Plaintext body polynomial, zero-initialised and released automatically.
  std::vector<uint64_t> plaintext_lut_pbs(polynomial_size, 0);

  // Fill box i with func(i) * delta
  for (uint64_t i = 0; i < modulus_sup; i++) {
    uint64_t index = i * box_size;
    for (uint64_t j = index; j < index + box_size; j++) {
      plaintext_lut_pbs[j] = func(i) * delta;
    }
  }

  uint64_t half_box_size = box_size / 2;

  // Negate the first half_box_size coefficients
  for (uint64_t i = 0; i < half_box_size; i++) {
    plaintext_lut_pbs[i] = -plaintext_lut_pbs[i];
  }

  // Rotate the plaintext_lut_pbs
  std::rotate(plaintext_lut_pbs.begin(),
              plaintext_lut_pbs.begin() + half_box_size,
              plaintext_lut_pbs.end());

  // Create the GLWE lut_pbs: zero mask polynomials, then the body
  uint64_t *lut_pbs = (uint64_t *)malloc(
      polynomial_size * (glwe_dimension + 1) * sizeof(uint64_t));
  if (lut_pbs == nullptr) {
    return nullptr;
  }
  std::fill_n(lut_pbs, polynomial_size * glwe_dimension, (uint64_t)0);
  std::copy(plaintext_lut_pbs.begin(), plaintext_lut_pbs.end(),
            lut_pbs + glwe_dimension * polynomial_size);

  return lut_pbs;
}
|
||||
|
||||
// Builds the plaintext lookup tables for `tau` cmux trees of `num_lut`
// polynomials each. Every coefficient of polynomial i of tree `tree` holds
//   ((i + tree) mod 2^(64 - delta_log)) << delta_log.
//
// The reduction is done with 64-bit arithmetic; the previous `1 << (64 -
// delta_log)` was an int shift, undefined as soon as 64 - delta_log >= 31.
// A delta_log outside [0, 63] yields all-zero coefficients.
//
// Returns a malloc'ed array of num_lut * tau * polynomial_size words, tree
// after tree (nullptr if the allocation fails); the caller releases it with
// free().
uint64_t *generate_identity_lut_cmux_tree(int polynomial_size, int num_lut,
                                          int tau, int delta_log) {
  uint64_t *plaintext_lut_cmux_tree =
      (uint64_t *)malloc(num_lut * tau * polynomial_size * sizeof(uint64_t));
  if (plaintext_lut_cmux_tree == nullptr) {
    return nullptr;
  }

  const bool shift_in_range = delta_log >= 0 && delta_log < 64;
  // Keeps the low (64 - delta_log) bits; delta_log == 0 keeps all 64 bits.
  const uint64_t message_mask =
      (shift_in_range && delta_log > 0)
          ? (((uint64_t)1 << (64 - delta_log)) - 1)
          : ~(uint64_t)0;

  for (int tree = 0; tree < tau; tree++) {
    for (int i = 0; i < num_lut; i++) {
      uint64_t *plaintext_lut_slice =
          plaintext_lut_cmux_tree +
          ((ptrdiff_t)tree * num_lut + i) * (ptrdiff_t)polynomial_size;
      uint64_t coeff = 0;
      if (shift_in_range) {
        coeff = ((uint64_t)(i + tree) & message_mask) << delta_log;
      }
      for (int p = 0; p < polynomial_size; p++) {
        plaintext_lut_slice[p] = coeff;
      }
    }
  }

  return plaintext_lut_cmux_tree;
}
|
||||
|
||||
// Generate repetitions LWE secret keys
|
||||
void generate_lwe_secret_keys(uint64_t **lwe_sk_array, int lwe_dimension,
|
||||
Csprng *csprng, const unsigned repetitions) {
|
||||
int lwe_sk_array_size = lwe_dimension * repetitions;
|
||||
*lwe_sk_array = (uint64_t *)malloc(lwe_sk_array_size * sizeof(uint64_t));
|
||||
int shift = 0;
|
||||
for (uint r = 0; r < repetitions; r++) {
|
||||
// Generate the lwe secret key for each repetition
|
||||
concrete_cpu_init_secret_key_u64(*lwe_sk_array + (ptrdiff_t)(shift),
|
||||
lwe_dimension, csprng,
|
||||
&CONCRETE_CSPRNG_VTABLE);
|
||||
shift += lwe_dimension;
|
||||
}
|
||||
}
|
||||
|
||||
// Generate repetitions GLWE secret keys
|
||||
void generate_glwe_secret_keys(uint64_t **glwe_sk_array, int glwe_dimension,
|
||||
int polynomial_size, Csprng *csprng, const unsigned repetitions) {
|
||||
int glwe_sk_array_size = glwe_dimension * polynomial_size * repetitions;
|
||||
*glwe_sk_array = (uint64_t *)malloc(glwe_sk_array_size * sizeof(uint64_t));
|
||||
int shift = 0;
|
||||
for (uint r = 0; r < repetitions; r++) {
|
||||
// Generate the lwe secret key for each repetition
|
||||
concrete_cpu_init_secret_key_u64(*glwe_sk_array + (ptrdiff_t)(shift),
|
||||
glwe_dimension * polynomial_size, csprng,
|
||||
&CONCRETE_CSPRNG_VTABLE);
|
||||
shift += glwe_dimension * polynomial_size;
|
||||
}
|
||||
}
|
||||
|
||||
// Generate repetitions LWE bootstrap keys
|
||||
void generate_lwe_bootstrap_keys(cudaStream_t *stream, int gpu_index,
|
||||
double **d_fourier_bsk_array,
|
||||
uint64_t *lwe_sk_in_array,
|
||||
uint64_t *lwe_sk_out_array, int lwe_dimension,
|
||||
int glwe_dimension, int polynomial_size,
|
||||
int pbs_level, int pbs_base_log,
|
||||
Csprng *csprng, double variance, const unsigned repetitions) {
|
||||
void *v_stream = (void *)stream;
|
||||
int bsk_size = (glwe_dimension + 1) * (glwe_dimension + 1) * pbs_level *
|
||||
polynomial_size * (lwe_dimension + 1);
|
||||
int bsk_array_size = bsk_size * repetitions;
|
||||
|
||||
uint64_t *bsk_array = (uint64_t *)malloc(bsk_array_size * sizeof(uint64_t));
|
||||
*d_fourier_bsk_array = (double *)cuda_malloc_async(
|
||||
bsk_array_size * sizeof(double), stream, gpu_index);
|
||||
int shift_in = 0;
|
||||
int shift_out = 0;
|
||||
int shift_bsk = 0;
|
||||
|
||||
for (uint r = 0; r < repetitions; r++) {
|
||||
// Generate the bootstrap key for each repetition
|
||||
concrete_cpu_init_lwe_bootstrap_key_u64(
|
||||
bsk_array + (ptrdiff_t)(shift_bsk),
|
||||
lwe_sk_in_array + (ptrdiff_t)(shift_in),
|
||||
lwe_sk_out_array + (ptrdiff_t)(shift_out), lwe_dimension,
|
||||
polynomial_size, glwe_dimension, pbs_level, pbs_base_log, variance,
|
||||
Parallelism(1), csprng, &CONCRETE_CSPRNG_VTABLE);
|
||||
cuda_synchronize_stream(v_stream);
|
||||
double *d_fourier_bsk = *d_fourier_bsk_array + (ptrdiff_t)(shift_bsk);
|
||||
uint64_t *bsk = bsk_array + (ptrdiff_t)(shift_bsk);
|
||||
cuda_synchronize_stream(v_stream);
|
||||
cuda_convert_lwe_bootstrap_key_64(
|
||||
(void *)(d_fourier_bsk), (void *)(bsk), v_stream, gpu_index,
|
||||
lwe_dimension, glwe_dimension, pbs_level, polynomial_size);
|
||||
shift_in += lwe_dimension;
|
||||
shift_out += glwe_dimension * polynomial_size;
|
||||
shift_bsk += bsk_size;
|
||||
}
|
||||
free(bsk_array);
|
||||
}
|
||||
|
||||
// Generate repetitions keyswitch keys
|
||||
void generate_lwe_keyswitch_keys(
|
||||
cudaStream_t *stream, int gpu_index, uint64_t **d_ksk_array,
|
||||
uint64_t *lwe_sk_in_array, uint64_t *lwe_sk_out_array,
|
||||
int input_lwe_dimension, int output_lwe_dimension, int ksk_level,
|
||||
int ksk_base_log, Csprng *csprng, double variance, const unsigned repetitions) {
|
||||
|
||||
int ksk_size = ksk_level * (output_lwe_dimension + 1) * input_lwe_dimension;
|
||||
int ksk_array_size = ksk_size * repetitions;
|
||||
|
||||
uint64_t *ksk_array = (uint64_t *)malloc(ksk_array_size * sizeof(uint64_t));
|
||||
*d_ksk_array = (uint64_t *)cuda_malloc_async(
|
||||
ksk_array_size * sizeof(uint64_t), stream, gpu_index);
|
||||
int shift_in = 0;
|
||||
int shift_out = 0;
|
||||
int shift_ksk = 0;
|
||||
|
||||
for (uint r = 0; r < repetitions; r++) {
|
||||
// Generate the keyswitch key for each repetition
|
||||
concrete_cpu_init_lwe_keyswitch_key_u64(
|
||||
ksk_array + (ptrdiff_t)(shift_ksk),
|
||||
lwe_sk_in_array + (ptrdiff_t)(shift_in),
|
||||
lwe_sk_out_array + (ptrdiff_t)(shift_out), input_lwe_dimension,
|
||||
output_lwe_dimension, ksk_level, ksk_base_log, variance, csprng,
|
||||
&CONCRETE_CSPRNG_VTABLE);
|
||||
uint64_t *d_ksk = *d_ksk_array + (ptrdiff_t)(shift_ksk);
|
||||
uint64_t *ksk = ksk_array + (ptrdiff_t)(shift_ksk);
|
||||
cuda_memcpy_async_to_gpu(d_ksk, ksk, ksk_size * sizeof(uint64_t), stream,
|
||||
gpu_index);
|
||||
|
||||
shift_in += input_lwe_dimension;
|
||||
shift_out += output_lwe_dimension;
|
||||
shift_ksk += ksk_size;
|
||||
}
|
||||
free(ksk_array);
|
||||
}
|
||||
|
||||
// Generate repetitions private functional keyswitch key lists (with (k + 1)
|
||||
// keys each)
|
||||
void generate_lwe_private_functional_keyswitch_key_lists(
|
||||
cudaStream_t *stream, int gpu_index, uint64_t **d_pksk_array,
|
||||
uint64_t *lwe_sk_in_array, uint64_t *lwe_sk_out_array,
|
||||
int input_lwe_dimension, int output_glwe_dimension,
|
||||
int output_polynomial_size, int pksk_level, int pksk_base_log,
|
||||
Csprng *csprng, double variance, const unsigned repetitions) {
|
||||
|
||||
int pksk_list_size = pksk_level * (output_glwe_dimension + 1) *
|
||||
output_polynomial_size * (input_lwe_dimension + 1) *
|
||||
(output_glwe_dimension + 1);
|
||||
int pksk_array_size = pksk_list_size * repetitions;
|
||||
|
||||
uint64_t *pksk_array = (uint64_t *)malloc(pksk_array_size * sizeof(uint64_t));
|
||||
*d_pksk_array = (uint64_t *)cuda_malloc_async(
|
||||
pksk_array_size * sizeof(uint64_t), stream, gpu_index);
|
||||
int shift_in = 0;
|
||||
int shift_out = 0;
|
||||
int shift_pksk_list = 0;
|
||||
|
||||
for (uint r = 0; r < repetitions; r++) {
|
||||
// Generate the (k + 1) private functional keyswitch keys for each
|
||||
// repetition
|
||||
concrete_cpu_init_lwe_circuit_bootstrap_private_functional_packing_keyswitch_keys_u64(
|
||||
pksk_array + (ptrdiff_t)(shift_pksk_list),
|
||||
lwe_sk_in_array + (ptrdiff_t)(shift_in),
|
||||
lwe_sk_out_array + (ptrdiff_t)(shift_out), input_lwe_dimension,
|
||||
output_polynomial_size, output_glwe_dimension, pksk_level,
|
||||
pksk_base_log, variance, Parallelism(1), csprng,
|
||||
&CONCRETE_CSPRNG_VTABLE);
|
||||
uint64_t *d_pksk_list = *d_pksk_array + (ptrdiff_t)(shift_pksk_list);
|
||||
uint64_t *pksk_list = pksk_array + (ptrdiff_t)(shift_pksk_list);
|
||||
cuda_memcpy_async_to_gpu(d_pksk_list, pksk_list,
|
||||
pksk_list_size * sizeof(uint64_t), stream,
|
||||
gpu_index);
|
||||
|
||||
shift_in += input_lwe_dimension;
|
||||
shift_out += output_glwe_dimension * output_polynomial_size;
|
||||
shift_pksk_list += pksk_list_size;
|
||||
}
|
||||
free(pksk_array);
|
||||
}
|
||||
|
||||
// The closest number representable by the decomposition can be computed by
// performing the rounding at the appropriate bit.
//
// Only the level_count * base_log most significant bits of a 64-bit word are
// representable. The input is rounded to the nearest multiple of
// 2^(64 - level_count * base_log); a round-up out of the top bit wraps
// modulo 2^64.
//
// Edge cases, which previously shifted by an out-of-range amount:
//  - level_count * base_log >= 64: every bit is representable, the input is
//    returned unchanged;
//  - level_count * base_log <= 0: nothing is representable, 0 is returned.
uint64_t closest_representable(uint64_t input, int level_count, int base_log) {
  // Compute the number of least significant bits which can not be represented
  // by the decomposition
  int non_rep_bit_count = 64 - (level_count * base_log);
  if (non_rep_bit_count <= 0) {
    return input;
  }
  if (non_rep_bit_count >= 64) {
    return 0;
  }
  // Extract the msb of the non representable bits to perform the rounding
  uint64_t non_rep_msb = (input >> (non_rep_bit_count - 1)) & (uint64_t)1;
  // Remove the non-representable bits and perform the rounding
  uint64_t res = input >> non_rep_bit_count;
  res += non_rep_msb;
  return res << non_rep_bit_count;
}
|
||||
50
backends/concrete-cuda/implementation/test/utils.h
Normal file
50
backends/concrete-cuda/implementation/test/utils.h
Normal file
@@ -0,0 +1,50 @@
|
||||
#ifndef TEST_UTILS_H
#define TEST_UTILS_H

#include "../include/device.h"
#include "concrete-cpu.h"
#include <functional>

// Helpers shared by the concrete-cuda tests. Every function returning a
// pointer returns a malloc'ed host buffer the caller must free(); the d_*
// output parameters receive GPU buffers allocated with cuda_malloc_async.

// Random plaintexts, repetitions * samples * number_of_inputs of them, each
// equal to (random value mod payload_modulus) * delta.
uint64_t *generate_plaintexts(uint64_t payload_modulus, uint64_t delta,
                              int number_of_inputs, const unsigned repetitions,
                              const unsigned samples);

// PBS lookup table for `func`, as glwe_dimension zero polynomials followed by
// the body polynomial.
uint64_t *generate_identity_lut_pbs(int polynomial_size, int glwe_dimension,
                                    int message_modulus, int carry_modulus,
                                    std::function<uint64_t(uint64_t)> func);

// Plaintext lookup tables for tau cmux trees of num_lut polynomials each.
uint64_t *generate_identity_lut_cmux_tree(int polynomial_size, int num_lut,
                                          int tau, int delta_log);

// `repetitions` consecutive LWE secret keys of lwe_dimension words.
void generate_lwe_secret_keys(uint64_t **lwe_sk_array, int lwe_dimension,
                              Csprng *csprng, const unsigned repetitions);

// `repetitions` consecutive GLWE secret keys of
// glwe_dimension * polynomial_size words.
void generate_glwe_secret_keys(uint64_t **glwe_sk_array, int glwe_dimension,
                               int polynomial_size, Csprng *csprng,
                               const unsigned repetitions);

// `repetitions` bootstrap keys, converted into *d_fourier_bsk_array on the GPU.
void generate_lwe_bootstrap_keys(cudaStream_t *stream, int gpu_index,
                                 double **d_fourier_bsk_array,
                                 uint64_t *lwe_sk_in_array,
                                 uint64_t *lwe_sk_out_array, int lwe_dimension,
                                 int glwe_dimension, int polynomial_size,
                                 int pbs_level, int pbs_base_log,
                                 Csprng *csprng, double variance,
                                 const unsigned repetitions);

// `repetitions` keyswitch keys, uploaded into *d_ksk_array on the GPU.
void generate_lwe_keyswitch_keys(
    cudaStream_t *stream, int gpu_index, uint64_t **d_ksk_array,
    uint64_t *lwe_sk_in_array, uint64_t *lwe_sk_out_array,
    int input_lwe_dimension, int output_lwe_dimension, int ksk_level,
    int ksk_base_log, Csprng *csprng, double variance,
    const unsigned repetitions);

// `repetitions` private functional keyswitch key lists ((k + 1) keys each),
// uploaded into *d_pksk_array on the GPU.
void generate_lwe_private_functional_keyswitch_key_lists(
    cudaStream_t *stream, int gpu_index, uint64_t **d_pksk_array,
    uint64_t *lwe_sk_in_array, uint64_t *lwe_sk_out_array,
    int input_lwe_dimension, int output_glwe_dimension,
    int output_polynomial_size, int pksk_level, int pksk_base_log,
    Csprng *csprng, double variance, const unsigned repetitions);

// Rounds `input` to the closest value representable with level_count levels of
// base_log bits.
uint64_t closest_representable(uint64_t input, int level_count, int base_log);

// The r lowest bits of `value`, one per array element, least significant
// first.
uint64_t *bit_decompose_value(uint64_t value, int r);

#endif
|
||||
Reference in New Issue
Block a user