diff --git a/.github/workflows/concrete_cuda_test.yml b/.github/workflows/concrete_cuda_test.yml index 1d6c21168..5d0d35345 100644 --- a/.github/workflows/concrete_cuda_test.yml +++ b/.github/workflows/concrete_cuda_test.yml @@ -39,7 +39,7 @@ jobs: with: mode: start github-token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }} - ec2-image-id: ami-0c4d39cb3dba0fcff + ec2-image-id: ami-03f11dc8c6a5f5c0a ec2-instance-type: p3.2xlarge subnet-id: subnet-8123c9e7 security-group-id: sg-0466d33ced960ba35 @@ -91,31 +91,31 @@ jobs: cd build cmake .. make -j8 - #- name: Test concrete-cuda with Cuda 11.8 - # if: ${{ !cancelled() }} - # run: | - # cd backends/concrete-cuda/implementation/build - # ./test/test_concrete_cuda + - name: Test concrete-cuda with Cuda 11.8 + if: ${{ !cancelled() }} + run: | + cd backends/concrete-cuda/implementation/build + ./test/test_concrete_cuda - #- name: Export variables for CUDA 11.1 - # run: | - # echo "CUDA_PATH=$OLD_CUDA_PATH" >> "${GITHUB_ENV}" - # echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}" - # echo "LD_LIBRARY_PATH=$OLD_CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}" - # echo "CUDACXX=$OLD_CUDA_PATH/bin/nvcc" >> "${GITHUB_ENV}" - #- name: Build concrete-cuda with Cuda 11.1 - # if: ${{ !cancelled() }} - # run: | - # cd backends/concrete-cuda/implementation - # mkdir build-old-cuda - # cd build-old-cuda - # cmake .. 
- # make -j8 - #- name: Test concrete-cuda with Cuda 11.1 - # if: ${{ !cancelled() }} - # run: | - # cd backends/concrete-cuda/implementation/build-old-cuda - # ./test/test_concrete_cuda + - name: Export variables for CUDA 11.1 + run: | + echo "CUDA_PATH=$OLD_CUDA_PATH" >> "${GITHUB_ENV}" + echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}" + echo "LD_LIBRARY_PATH=$OLD_CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}" + echo "CUDACXX=$OLD_CUDA_PATH/bin/nvcc" >> "${GITHUB_ENV}" + - name: Build concrete-cuda with Cuda 11.1 + if: ${{ !cancelled() }} + run: | + cd backends/concrete-cuda/implementation + mkdir build-old-cuda + cd build-old-cuda + cmake .. + make -j8 + - name: Test concrete-cuda with Cuda 11.1 + if: ${{ !cancelled() }} + run: | + cd backends/concrete-cuda/implementation/build-old-cuda + ./test/test_concrete_cuda - name: Slack Notification if: ${{ always() }} diff --git a/backends/concrete-cuda/implementation/CMakeLists.txt b/backends/concrete-cuda/implementation/CMakeLists.txt index 640cfd68a..38bd4d0f1 100644 --- a/backends/concrete-cuda/implementation/CMakeLists.txt +++ b/backends/concrete-cuda/implementation/CMakeLists.txt @@ -1,8 +1,6 @@ cmake_minimum_required(VERSION 3.8 FATAL_ERROR) project(concrete_cuda LANGUAGES CXX CUDA) -include(CTest) - # See if the minimum CUDA version is available. If not, only enable documentation building. 
set(MINIMUM_SUPPORTED_CUDA_VERSION 10.0) include(CheckLanguage) @@ -69,6 +67,7 @@ set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -ccbin ${CMAKE_CXX_COMPILER} -O3 ${CUD set(INCLUDE_DIR include) add_subdirectory(src) +add_subdirectory(test) add_subdirectory(parameters) target_include_directories(concrete_cuda PRIVATE ${INCLUDE_DIR}) @@ -87,3 +86,6 @@ if (CPPLINT) set_target_properties(all_lint PROPERTIES EXCLUDE_FROM_ALL TRUE) # set_target_properties(all_lint PROPERTIES EXCLUDE_FROM_DEFAULT_BUILD TRUE) endif () + +enable_testing() + diff --git a/backends/concrete-cuda/implementation/include/device.h b/backends/concrete-cuda/implementation/include/device.h index 9e5bc0fb3..9bcdcc886 100644 --- a/backends/concrete-cuda/implementation/include/device.h +++ b/backends/concrete-cuda/implementation/include/device.h @@ -32,6 +32,10 @@ int cuda_memcpy_to_gpu(void *dest, void *src, uint64_t size, int cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size, cudaStream_t *stream, uint32_t gpu_index); + +int cuda_memset_async(void *dest, uint64_t val, uint64_t size, + cudaStream_t *stream, uint32_t gpu_index); + int cuda_get_number_of_gpus(); int cuda_synchronize_device(uint32_t gpu_index); diff --git a/backends/concrete-cuda/implementation/src/device.cu b/backends/concrete-cuda/implementation/src/device.cu index f8666846f..976893481 100644 --- a/backends/concrete-cuda/implementation/src/device.cu +++ b/backends/concrete-cuda/implementation/src/device.cu @@ -111,6 +111,28 @@ int cuda_synchronize_device(uint32_t gpu_index) { return 0; } +int cuda_memset_async(void *dest, uint64_t val, uint64_t size, + cudaStream_t *stream, uint32_t gpu_index) { + if (size == 0) { + // error code: zero copy size + return -3; + } + + if (gpu_index >= cuda_get_number_of_gpus()) { + // error code: invalid gpu_index + return -2; + } + cudaPointerAttributes attr; + cudaPointerGetAttributes(&attr, dest); + if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) { + // error code: 
invalid device pointer + return -1; + } + cudaSetDevice(gpu_index); + cudaMemsetAsync(dest, val, size, *stream); + return 0; +} + /// Tries to copy memory to the GPU asynchronously /// 0: success /// -1: error, invalid device pointer diff --git a/backends/concrete-cuda/implementation/test/CMakeLists.txt b/backends/concrete-cuda/implementation/test/CMakeLists.txt new file mode 100644 index 000000000..9e34d0e7d --- /dev/null +++ b/backends/concrete-cuda/implementation/test/CMakeLists.txt @@ -0,0 +1,54 @@ +include(FetchContent) +FetchContent_Declare( + googletest + URL https://github.com/google/googletest/archive/03597a01ee50ed33e9dfd640b249b4be3799d395.zip +) +# For Windows: Prevent overriding the parent project's compiler/linker settings +set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) +FetchContent_MakeAvailable(googletest) + +# Enable ExternalProject CMake module +include(ExternalProject) + +set(CONCRETE_CPU_BINARY_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../../concrete-cpu/target/release") +set(CONCRETE_CPU_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../../concrete-cpu") + +# Add rust_example as a CMake target +ExternalProject_Add( + concrete_cpu + SOURCE_DIR ${CONCRETE_CPU_SOURCE_DIR} + DOWNLOAD_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND cargo build --release COMMAND cargo build --release + BINARY_DIR ${CONCRETE_CPU_BINARY_DIR} + INSTALL_COMMAND "" + LOG_BUILD ON) + +include_directories(${CONCRETE_CPU_SOURCE_DIR}/include) +add_library(concrete_cpu_lib STATIC IMPORTED) +set_target_properties(concrete_cpu_lib PROPERTIES IMPORTED_LOCATION + ${CONCRETE_CPU_BINARY_DIR}/libconcrete_cpu.a) + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,--no-as-needed -ldl") + +set(BINARY test_concrete_cuda) + +file(GLOB_RECURSE TEST_SOURCES LIST_DIRECTORIES false *.h *.cpp) + +set(SOURCES ${TEST_SOURCES}) + +add_executable(${BINARY} ${TEST_SOURCES}) + +add_test(NAME ${BINARY} COMMAND ${BINARY}) + +set_target_properties(test_concrete_cuda PROPERTIES CUDA_SEPARABLE_COMPILATION ON + 
CUDA_RESOLVE_DEVICE_SYMBOLS ON) +target_link_libraries(test_concrete_cuda PUBLIC GTest::gtest_main concrete_cpu_lib + concrete_cuda cudart) +find_package(CUDA REQUIRED) +include_directories("${CUDA_INCLUDE_DIRS}" "${CMAKE_CURRENT_SOURCE_DIR}") + +include(GoogleTest) +gtest_discover_tests(test_concrete_cuda) + + diff --git a/backends/concrete-cuda/implementation/test/test_bit_extraction.cpp b/backends/concrete-cuda/implementation/test/test_bit_extraction.cpp new file mode 100644 index 000000000..ad58c7d53 --- /dev/null +++ b/backends/concrete-cuda/implementation/test/test_bit_extraction.cpp @@ -0,0 +1,250 @@ +#include "../include/bit_extraction.h" +#include "../include/device.h" +#include "concrete-cpu.h" +#include "utils.h" +#include "gtest/gtest.h" +#include +#include +#include + +const unsigned REPETITIONS = 5; +const unsigned SAMPLES = 100; + +typedef struct { + int lwe_dimension; + int glwe_dimension; + int polynomial_size; + double lwe_modular_variance; + double glwe_modular_variance; + int pbs_base_log; + int pbs_level; + int ks_base_log; + int ks_level; + int number_of_bits_of_message_including_padding; + int number_of_bits_to_extract; + int number_of_inputs; +} BitExtractionTestParams; + +class BitExtractionTestPrimitives_u64 + : public ::testing::TestWithParam { +protected: + int lwe_dimension; + int glwe_dimension; + int polynomial_size; + double lwe_modular_variance; + double glwe_modular_variance; + int pbs_base_log; + int pbs_level; + int ks_base_log; + int ks_level; + int number_of_bits_of_message_including_padding; + int number_of_bits_to_extract; + int number_of_inputs; + uint64_t delta; + int delta_log; + Csprng *csprng; + cudaStream_t *stream; + int gpu_index = 0; + uint64_t *lwe_sk_in_array; + uint64_t *lwe_sk_out_array; + uint64_t *lwe_in_ct_array; + uint64_t *lwe_out_ct_array; + uint64_t *plaintexts; + double *d_fourier_bsk_array; + uint64_t *d_ksk_array; + uint64_t *d_lwe_in_ct_array; + uint64_t *d_lwe_out_ct_array; + int8_t 
*bit_extract_buffer; + +public: + // Test arithmetic functions + void SetUp() { + stream = cuda_create_stream(0); + + // TestParams + lwe_dimension = (int)GetParam().lwe_dimension; + glwe_dimension = (int)GetParam().glwe_dimension; + polynomial_size = (int)GetParam().polynomial_size; + lwe_modular_variance = (double)GetParam().lwe_modular_variance; + glwe_modular_variance = (double)GetParam().glwe_modular_variance; + pbs_base_log = (int)GetParam().pbs_base_log; + pbs_level = (int)GetParam().pbs_level; + ks_base_log = (int)GetParam().ks_base_log; + ks_level = (int)GetParam().ks_level; + number_of_bits_of_message_including_padding = + (int)GetParam().number_of_bits_of_message_including_padding; + number_of_bits_to_extract = (int)GetParam().number_of_bits_to_extract; + number_of_inputs = (int)GetParam().number_of_inputs; + delta_log = 64 - number_of_bits_of_message_including_padding; + delta = (uint64_t)(1) << delta_log; + + // Create a Csprng + csprng = + (Csprng *)aligned_alloc(CONCRETE_CSPRNG_ALIGN, CONCRETE_CSPRNG_SIZE); + uint8_t seed[16] = {(uint8_t)0}; + concrete_cpu_construct_concrete_csprng( + csprng, Uint128{.little_endian_bytes = {*seed}}); + + int input_lwe_dimension = glwe_dimension * polynomial_size; + int output_lwe_dimension = lwe_dimension; + // Generate the keys + generate_lwe_secret_keys(&lwe_sk_in_array, input_lwe_dimension, csprng, REPETITIONS); + generate_lwe_secret_keys(&lwe_sk_out_array, output_lwe_dimension, csprng, REPETITIONS); + generate_lwe_keyswitch_keys( + stream, gpu_index, &d_ksk_array, lwe_sk_in_array, lwe_sk_out_array, + input_lwe_dimension, output_lwe_dimension, ks_level, ks_base_log, + csprng, lwe_modular_variance, REPETITIONS); + generate_lwe_bootstrap_keys( + stream, gpu_index, &d_fourier_bsk_array, lwe_sk_out_array, + lwe_sk_in_array, lwe_dimension, glwe_dimension, polynomial_size, + pbs_level, pbs_base_log, csprng, glwe_modular_variance, REPETITIONS); + plaintexts = generate_plaintexts( + 
number_of_bits_of_message_including_padding, delta, number_of_inputs, REPETITIONS, SAMPLES); + + d_lwe_out_ct_array = (uint64_t *)cuda_malloc_async( + (output_lwe_dimension + 1) * number_of_bits_to_extract * + number_of_inputs * sizeof(uint64_t), + stream, gpu_index); + + d_lwe_in_ct_array = (uint64_t *)cuda_malloc_async( + (input_lwe_dimension + 1) * number_of_inputs * sizeof(uint64_t), stream, + gpu_index); + + lwe_in_ct_array = (uint64_t *)malloc((input_lwe_dimension + 1) * + number_of_inputs * sizeof(uint64_t)); + lwe_out_ct_array = (uint64_t *)malloc((output_lwe_dimension + 1) * + number_of_bits_to_extract * + number_of_inputs * sizeof(uint64_t)); + // Execute scratch + scratch_cuda_extract_bits_64(stream, gpu_index, &bit_extract_buffer, + glwe_dimension, lwe_dimension, polynomial_size, + pbs_level, number_of_inputs, + cuda_get_max_shared_memory(gpu_index), true); + } + + void TearDown() { + void *v_stream = (void *)stream; + + cuda_synchronize_stream(v_stream); + concrete_cpu_destroy_concrete_csprng(csprng); + free(csprng); + free(lwe_sk_in_array); + free(lwe_sk_out_array); + free(plaintexts); + free(lwe_in_ct_array); + free(lwe_out_ct_array); + cleanup_cuda_extract_bits(stream, gpu_index, &bit_extract_buffer); + cuda_drop_async(d_fourier_bsk_array, stream, gpu_index); + cuda_drop_async(d_ksk_array, stream, gpu_index); + cuda_drop_async(d_lwe_in_ct_array, stream, gpu_index); + cuda_drop_async(d_lwe_out_ct_array, stream, gpu_index); + cuda_destroy_stream(stream, gpu_index); + } +}; + +TEST_P(BitExtractionTestPrimitives_u64, bit_extraction) { + void *v_stream = (void *)stream; + int bsk_size = (glwe_dimension + 1) * (glwe_dimension + 1) * pbs_level * + polynomial_size * (lwe_dimension + 1); + int ksk_size = + ks_level * (lwe_dimension + 1) * glwe_dimension * polynomial_size; + for (uint r = 0; r < REPETITIONS; r++) { + double *d_fourier_bsk = d_fourier_bsk_array + (ptrdiff_t)(bsk_size * r); + uint64_t *d_ksk = d_ksk_array + (ptrdiff_t)(ksk_size * r); + uint64_t 
*lwe_in_sk = + lwe_sk_in_array + (ptrdiff_t)(glwe_dimension * polynomial_size * r); + uint64_t *lwe_sk_out = lwe_sk_out_array + (ptrdiff_t)(r * lwe_dimension); + for (uint s = 0; s < SAMPLES; s++) { + for (int i = 0; i < number_of_inputs; i++) { + uint64_t plaintext = plaintexts[r * SAMPLES * number_of_inputs + + s * number_of_inputs + i]; + uint64_t *lwe_in_ct = + lwe_in_ct_array + + (ptrdiff_t)( + (r * SAMPLES * number_of_inputs + s * number_of_inputs + i) * + (glwe_dimension * polynomial_size + 1)); + concrete_cpu_encrypt_lwe_ciphertext_u64( + lwe_in_sk, lwe_in_ct, plaintext, glwe_dimension * polynomial_size, + lwe_modular_variance, csprng, &CONCRETE_CSPRNG_VTABLE); + } + cuda_synchronize_stream(v_stream); + cuda_memcpy_async_to_gpu(d_lwe_in_ct_array, lwe_in_ct_array, + (glwe_dimension * polynomial_size + 1) * + number_of_inputs * sizeof(uint64_t), + stream, gpu_index); + + // Execute bit extract + cuda_extract_bits_64( + stream, gpu_index, (void *)d_lwe_out_ct_array, + (void *)d_lwe_in_ct_array, bit_extract_buffer, (void *)d_ksk, + (void *)d_fourier_bsk, number_of_bits_to_extract, delta_log, + glwe_dimension * polynomial_size, lwe_dimension, glwe_dimension, + polynomial_size, pbs_base_log, pbs_level, ks_base_log, ks_level, + number_of_inputs, cuda_get_max_shared_memory(gpu_index)); + + // Copy result back + cuda_synchronize_stream(v_stream); + cuda_memcpy_async_to_cpu(lwe_out_ct_array, d_lwe_out_ct_array, + (lwe_dimension + 1) * number_of_bits_to_extract * + number_of_inputs * sizeof(uint64_t), + stream, gpu_index); + cuda_synchronize_stream(v_stream); + + for (int j = 0; j < number_of_inputs; j++) { + uint64_t *result_array = + lwe_out_ct_array + + (ptrdiff_t)(j * number_of_bits_to_extract * (lwe_dimension + 1)); + uint64_t plaintext = plaintexts[r * SAMPLES * number_of_inputs + + s * number_of_inputs + j]; + for (int i = 0; i < number_of_bits_to_extract; i++) { + uint64_t *result_ct = + result_array + (ptrdiff_t)((number_of_bits_to_extract - 1 - i) * + 
(lwe_dimension + 1)); + uint64_t decrypted_message = 0; + concrete_cpu_decrypt_lwe_ciphertext_u64( + lwe_sk_out, result_ct, lwe_dimension, &decrypted_message); + // Round after decryption + uint64_t decrypted_rounded = + closest_representable(decrypted_message, 1, 1); + // Bring back the extracted bit found in the MSB in the LSB + uint64_t decrypted_extract_bit = decrypted_rounded >> 63; + uint64_t expected = ((plaintext >> delta_log) >> i) & (uint64_t)(1); + EXPECT_EQ(decrypted_extract_bit, expected); + } + } + } + } +} + +// Defines for which parameters set the PBS will be tested. +// It executes each test for all pairs on phis X qs (Cartesian product) +::testing::internal::ParamGenerator + bit_extract_params_u64 = ::testing::Values( + // n, k, N, lwe_variance, glwe_variance, pbs_base_log, pbs_level, + // ks_base_log, ks_level, number_of_message_bits, + // number_of_bits_to_extract + (BitExtractionTestParams){585, 1, 1024, 7.52316384526264e-37, + 7.52316384526264e-37, 10, 2, 4, 7, 5, 5, 1});//, +// (BitExtractionTestParams){585, 1, 1024, 7.52316384526264e-37, +// 7.52316384526264e-37, 10, 2, 4, 7, 5, 5, 2}); + +std::string +printParamName(::testing::TestParamInfo p) { + BitExtractionTestParams params = p.param; + + return "n_" + std::to_string(params.lwe_dimension) + "_k_" + + std::to_string(params.glwe_dimension) + "_N_" + + std::to_string(params.polynomial_size) + "_pbs_base_log_" + + std::to_string(params.pbs_base_log) + "_pbs_level_" + + std::to_string(params.pbs_level) + "_ks_base_log_" + + std::to_string(params.ks_base_log) + "_ks_level_" + + std::to_string(params.ks_level) + "_number_of_message_bits_" + + std::to_string(params.number_of_bits_of_message_including_padding) + + "_number_of_bits_to_extract_" + + std::to_string(params.number_of_bits_to_extract) + + "_number_of_inputs_" + std::to_string(params.number_of_inputs); +} + +INSTANTIATE_TEST_CASE_P(BitExtractionInstantiation, + BitExtractionTestPrimitives_u64, bit_extract_params_u64, + printParamName); 
\ No newline at end of file diff --git a/backends/concrete-cuda/implementation/test/test_bootstrap.cpp b/backends/concrete-cuda/implementation/test/test_bootstrap.cpp new file mode 100644 index 000000000..c3022b922 --- /dev/null +++ b/backends/concrete-cuda/implementation/test/test_bootstrap.cpp @@ -0,0 +1,313 @@ +#include "../include/bootstrap.h" +#include "../include/device.h" +#include "concrete-cpu.h" +#include "utils.h" +#include "gtest/gtest.h" +#include +#include +#include +#include + +const unsigned REPETITIONS = 5; +const unsigned SAMPLES = 100; + +typedef struct { + int lwe_dimension; + int glwe_dimension; + int polynomial_size; + double lwe_modular_variance; + double glwe_modular_variance; + int pbs_base_log; + int pbs_level; + int message_modulus; + int carry_modulus; + int number_of_inputs; +} BootstrapTestParams; + +class BootstrapTestPrimitives_u64 + : public ::testing::TestWithParam { +protected: + int lwe_dimension; + int glwe_dimension; + int polynomial_size; + double lwe_modular_variance; + double glwe_modular_variance; + int pbs_base_log; + int pbs_level; + int message_modulus; + int carry_modulus; + int payload_modulus; + int number_of_inputs; + uint64_t delta; + Csprng *csprng; + cudaStream_t *stream; + int gpu_index = 0; + uint64_t *lwe_sk_in_array; + uint64_t *lwe_sk_out_array; + uint64_t *plaintexts; + double *d_fourier_bsk_array; + uint64_t *d_lut_pbs_identity; + uint64_t *d_lut_pbs_indexes; + uint64_t *d_lwe_ct_in_array; + uint64_t *d_lwe_ct_out_array; + +public: + // Test arithmetic functions + void SetUp() { + stream = cuda_create_stream(0); + void *v_stream = (void *)stream; + + // TestParams + lwe_dimension = (int)GetParam().lwe_dimension; + glwe_dimension = (int)GetParam().glwe_dimension; + polynomial_size = (int)GetParam().polynomial_size; + lwe_modular_variance = (int)GetParam().lwe_modular_variance; + glwe_modular_variance = (int)GetParam().glwe_modular_variance; + pbs_base_log = (int)GetParam().pbs_base_log; + pbs_level = 
(int)GetParam().pbs_level; + message_modulus = (int)GetParam().message_modulus; + carry_modulus = (int)GetParam().carry_modulus; + number_of_inputs = (int)GetParam().number_of_inputs; + + payload_modulus = message_modulus * carry_modulus; + // Value of the shift we multiply our messages by + delta = ((uint64_t)(1) << 63) / (uint64_t)(payload_modulus); + + // Create a Csprng + csprng = + (Csprng *)aligned_alloc(CONCRETE_CSPRNG_ALIGN, CONCRETE_CSPRNG_SIZE); + uint8_t seed[16] = {(uint8_t)0}; + concrete_cpu_construct_concrete_csprng( + csprng, Uint128{.little_endian_bytes = {*seed}}); + + // Generate the keys + generate_lwe_secret_keys(&lwe_sk_in_array, lwe_dimension, csprng, REPETITIONS); + generate_lwe_secret_keys(&lwe_sk_out_array, + glwe_dimension * polynomial_size, csprng, REPETITIONS); + generate_lwe_bootstrap_keys( + stream, gpu_index, &d_fourier_bsk_array, lwe_sk_in_array, + lwe_sk_out_array, lwe_dimension, glwe_dimension, polynomial_size, + pbs_level, pbs_base_log, csprng, glwe_modular_variance, REPETITIONS); + plaintexts = generate_plaintexts(payload_modulus, delta, number_of_inputs, REPETITIONS, + SAMPLES); + + // Create the LUT + uint64_t *lut_pbs_identity = generate_identity_lut_pbs( + polynomial_size, glwe_dimension, message_modulus, carry_modulus, + [](int x) -> int { return x; }); + + // Copy the LUT + d_lut_pbs_identity = (uint64_t *)cuda_malloc_async( + (glwe_dimension + 1) * polynomial_size * sizeof(uint64_t), stream, + gpu_index); + d_lut_pbs_indexes = (uint64_t *)cuda_malloc_async( + number_of_inputs * sizeof(uint64_t), stream, gpu_index); + cuda_synchronize_stream(v_stream); + cuda_memset_async(d_lut_pbs_indexes, 0, number_of_inputs * sizeof(uint64_t), + stream, gpu_index); + cuda_memcpy_async_to_gpu(d_lut_pbs_identity, lut_pbs_identity, + polynomial_size * (glwe_dimension + 1) * + sizeof(uint64_t), + stream, gpu_index); + cuda_synchronize_stream(v_stream); + free(lut_pbs_identity); + + d_lwe_ct_out_array = + (uint64_t 
*)cuda_malloc_async((glwe_dimension * polynomial_size + 1) * + number_of_inputs * sizeof(uint64_t), + stream, gpu_index); + d_lwe_ct_in_array = (uint64_t *)cuda_malloc_async( + (lwe_dimension + 1) * number_of_inputs * REPETITIONS * SAMPLES * + sizeof(uint64_t), + stream, gpu_index); + uint64_t *lwe_ct_in_array = + (uint64_t *)malloc((lwe_dimension + 1) * number_of_inputs * + REPETITIONS * SAMPLES * sizeof(uint64_t)); + // Create the input/output ciphertexts + for (uint r = 0; r < REPETITIONS; r++) { + uint64_t *lwe_sk_in = lwe_sk_in_array + (ptrdiff_t)(r * lwe_dimension); + for (uint s = 0; s < SAMPLES; s++) { + for (int i = 0; i < number_of_inputs; i++) { + uint64_t plaintext = plaintexts[r * SAMPLES * number_of_inputs + + s * number_of_inputs + i]; + uint64_t *lwe_ct_in = + lwe_ct_in_array + (ptrdiff_t)((r * SAMPLES * number_of_inputs + + s * number_of_inputs + i) * + (lwe_dimension + 1)); + concrete_cpu_encrypt_lwe_ciphertext_u64( + lwe_sk_in, lwe_ct_in, plaintext, lwe_dimension, + lwe_modular_variance, csprng, &CONCRETE_CSPRNG_VTABLE); + } + } + } + cuda_synchronize_stream(v_stream); + cuda_memcpy_async_to_gpu(d_lwe_ct_in_array, lwe_ct_in_array, + REPETITIONS * SAMPLES * number_of_inputs * + (lwe_dimension + 1) * sizeof(uint64_t), + stream, gpu_index); + free(lwe_ct_in_array); + } + + void TearDown() { + concrete_cpu_destroy_concrete_csprng(csprng); + free(csprng); + free(lwe_sk_in_array); + free(lwe_sk_out_array); + free(plaintexts); + cuda_drop_async(d_fourier_bsk_array, stream, gpu_index); + cuda_drop_async(d_lut_pbs_identity, stream, gpu_index); + cuda_drop_async(d_lut_pbs_indexes, stream, gpu_index); + cuda_drop_async(d_lwe_ct_in_array, stream, gpu_index); + cuda_drop_async(d_lwe_ct_out_array, stream, gpu_index); + cuda_destroy_stream(stream, gpu_index); + } +}; + +TEST_P(BootstrapTestPrimitives_u64, amortized_bootstrap) { + uint64_t *lwe_ct_out_array = + (uint64_t *)malloc((glwe_dimension * polynomial_size + 1) * + number_of_inputs * sizeof(uint64_t)); + 
int8_t *pbs_buffer = nullptr; + scratch_cuda_bootstrap_amortized_64( + stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size, + number_of_inputs, cuda_get_max_shared_memory(gpu_index), true); + int bsk_size = (glwe_dimension + 1) * (glwe_dimension + 1) * pbs_level * + polynomial_size * (lwe_dimension + 1); + // Here execute the PBS + for (uint r = 0; r < REPETITIONS; r++) { + double *d_fourier_bsk = d_fourier_bsk_array + (ptrdiff_t)(bsk_size * r); + uint64_t *lwe_sk_out = + lwe_sk_out_array + (ptrdiff_t)(r * glwe_dimension * polynomial_size); + for (uint s = 0; s < SAMPLES; s++) { + uint64_t *d_lwe_ct_in = + d_lwe_ct_in_array + + (ptrdiff_t)((r * SAMPLES * number_of_inputs + s * number_of_inputs) * + (lwe_dimension + 1)); + // Execute PBS + cuda_bootstrap_amortized_lwe_ciphertext_vector_64( + stream, gpu_index, (void *)d_lwe_ct_out_array, + (void *)d_lut_pbs_identity, (void *)d_lut_pbs_indexes, + (void *)d_lwe_ct_in, (void *)d_fourier_bsk, pbs_buffer, lwe_dimension, + glwe_dimension, polynomial_size, pbs_base_log, pbs_level, + number_of_inputs, 1, 0, cuda_get_max_shared_memory(gpu_index)); + // Copy result back + cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array, + (glwe_dimension * polynomial_size + 1) * + number_of_inputs * sizeof(uint64_t), + stream, gpu_index); + + for (int j = 0; j < number_of_inputs; j++) { + uint64_t *result = + lwe_ct_out_array + + (ptrdiff_t)(j * (glwe_dimension * polynomial_size + 1)); + uint64_t plaintext = plaintexts[r * SAMPLES * number_of_inputs + + s * number_of_inputs + j]; + uint64_t decrypted = 0; + concrete_cpu_decrypt_lwe_ciphertext_u64( + lwe_sk_out, result, glwe_dimension * polynomial_size, &decrypted); + EXPECT_NE(decrypted, plaintext); + // let err = (decrypted >= plaintext) ? 
decrypted - plaintext : + // plaintext + // - decrypted; + // error_sample_vec.push(err); + + // The bit before the message + uint64_t rounding_bit = delta >> 1; + // Compute the rounding bit + uint64_t rounding = (decrypted & rounding_bit) << 1; + uint64_t decoded = (decrypted + rounding) / delta; + EXPECT_EQ(decoded, plaintext / delta); + } + } + } + cleanup_cuda_bootstrap_amortized(stream, gpu_index, &pbs_buffer); + free(lwe_ct_out_array); +} + +TEST_P(BootstrapTestPrimitives_u64, low_latency_bootstrap) { + uint64_t *lwe_ct_out_array = + (uint64_t *)malloc((glwe_dimension * polynomial_size + 1) * + number_of_inputs * sizeof(uint64_t)); + int8_t *pbs_buffer = nullptr; + scratch_cuda_bootstrap_low_latency_64( + stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size, + pbs_level, number_of_inputs, cuda_get_max_shared_memory(gpu_index), true); + int bsk_size = (glwe_dimension + 1) * (glwe_dimension + 1) * pbs_level * + polynomial_size * (lwe_dimension + 1); + // Here execute the PBS + for (uint r = 0; r < REPETITIONS; r++) { + double *d_fourier_bsk = d_fourier_bsk_array + (ptrdiff_t)(bsk_size * r); + uint64_t *lwe_sk_out = + lwe_sk_out_array + (ptrdiff_t)(r * glwe_dimension * polynomial_size); + for (uint s = 0; s < SAMPLES; s++) { + uint64_t *d_lwe_ct_in = + d_lwe_ct_in_array + + (ptrdiff_t)((r * SAMPLES * number_of_inputs + s * number_of_inputs) * + (lwe_dimension + 1)); + // Execute PBS + cuda_bootstrap_low_latency_lwe_ciphertext_vector_64( + stream, gpu_index, (void *)d_lwe_ct_out_array, + (void *)d_lut_pbs_identity, (void *)d_lut_pbs_indexes, + (void *)d_lwe_ct_in, (void *)d_fourier_bsk, pbs_buffer, lwe_dimension, + glwe_dimension, polynomial_size, pbs_base_log, pbs_level, + number_of_inputs, 1, 0, cuda_get_max_shared_memory(gpu_index)); + // Copy result back + cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array, + (glwe_dimension * polynomial_size + 1) * + number_of_inputs * sizeof(uint64_t), + stream, gpu_index); + + for (int j = 0; j < 
number_of_inputs; j++) { + uint64_t *result = + lwe_ct_out_array + + (ptrdiff_t)(j * (glwe_dimension * polynomial_size + 1)); + uint64_t plaintext = plaintexts[r * SAMPLES * number_of_inputs + + s * number_of_inputs + j]; + uint64_t decrypted = 0; + concrete_cpu_decrypt_lwe_ciphertext_u64( + lwe_sk_out, result, glwe_dimension * polynomial_size, &decrypted); + EXPECT_NE(decrypted, plaintext); + // let err = (decrypted >= plaintext) ? decrypted - plaintext : + // plaintext + // - decrypted; + // error_sample_vec.push(err); + + // The bit before the message + uint64_t rounding_bit = delta >> 1; + // Compute the rounding bit + uint64_t rounding = (decrypted & rounding_bit) << 1; + uint64_t decoded = (decrypted + rounding) / delta; + EXPECT_EQ(decoded, plaintext / delta); + } + } + } + cleanup_cuda_bootstrap_low_latency(stream, gpu_index, &pbs_buffer); + free(lwe_ct_out_array); +} + +// Defines for which parameters set the PBS will be tested. +// It executes each test for all pairs on phis X qs (Cartesian product) +::testing::internal::ParamGenerator pbs_params_u64 = + ::testing::Values( + // n, k, N, lwe_variance, glwe_variance, pbs_base_log, pbs_level, + // message_modulus, carry_modulus + (BootstrapTestParams){500, 1, 1024, 0.000007069849454709433, + 0.00000000000000029403601535432533, 23, 2, 4, 4, + 1}, + (BootstrapTestParams){500, 1, 1024, 0.000007069849454709433, + 0.00000000000000029403601535432533, 23, 2, 4, 4, + 3}); + +std::string printParamName(::testing::TestParamInfo p) { + BootstrapTestParams params = p.param; + + return "n_" + std::to_string(params.lwe_dimension) + "_k_" + + std::to_string(params.glwe_dimension) + "_N_" + + std::to_string(params.polynomial_size) + "_pbs_base_log_" + + std::to_string(params.pbs_base_log) + "_pbs_level_" + + std::to_string(params.pbs_level) + "_number_of_inputs_" + + std::to_string(params.number_of_inputs); +} + +INSTANTIATE_TEST_CASE_P(BootstrapInstantiation, BootstrapTestPrimitives_u64, + pbs_params_u64, printParamName); 
\ No newline at end of file diff --git a/backends/concrete-cuda/implementation/test/test_circuit_bootstrap.cpp b/backends/concrete-cuda/implementation/test/test_circuit_bootstrap.cpp new file mode 100644 index 000000000..1c0c10655 --- /dev/null +++ b/backends/concrete-cuda/implementation/test/test_circuit_bootstrap.cpp @@ -0,0 +1,271 @@ +#include "../include/circuit_bootstrap.h" +#include "../include/device.h" +#include "concrete-cpu.h" +#include "utils.h" +#include "gtest/gtest.h" +#include +#include +#include + +const unsigned REPETITIONS = 5; +const unsigned SAMPLES = 100; + +typedef struct { + int lwe_dimension; + int glwe_dimension; + int polynomial_size; + double lwe_modular_variance; + double glwe_modular_variance; + int pbs_base_log; + int pbs_level; + int pksk_base_log; + int pksk_level; + int cbs_base_log; + int cbs_level; +} CircuitBootstrapTestParams; + +class CircuitBootstrapTestPrimitives_u64 + : public ::testing::TestWithParam { +protected: + int lwe_dimension; + int glwe_dimension; + int polynomial_size; + double lwe_modular_variance; + double glwe_modular_variance; + int pbs_base_log; + int pbs_level; + int pksk_base_log; + int pksk_level; + int cbs_base_log; + int cbs_level; + int number_of_bits_of_message_including_padding; + int ggsw_size; + uint64_t delta; + int delta_log; + Csprng *csprng; + cudaStream_t *stream; + int gpu_index = 0; + uint64_t *lwe_sk_in_array; + uint64_t *lwe_sk_out_array; + uint64_t *lwe_in_ct; + uint64_t *ggsw_out_ct; + uint64_t *plaintexts; + double *d_fourier_bsk_array; + uint64_t *d_pksk_array; + uint64_t *d_lwe_in_ct; + uint64_t *d_ggsw_out_ct; + uint64_t *d_lut_vector_indexes; + int8_t *cbs_buffer; + +public: + // Test arithmetic functions + void SetUp() { + stream = cuda_create_stream(0); + + // TestParams + lwe_dimension = (int)GetParam().lwe_dimension; + glwe_dimension = (int)GetParam().glwe_dimension; + polynomial_size = (int)GetParam().polynomial_size; + lwe_modular_variance = 
(double)GetParam().lwe_modular_variance; + glwe_modular_variance = (double)GetParam().glwe_modular_variance; + pbs_base_log = (int)GetParam().pbs_base_log; + pbs_level = (int)GetParam().pbs_level; + pksk_base_log = (int)GetParam().pksk_base_log; + pksk_level = (int)GetParam().pksk_level; + cbs_base_log = (int)GetParam().cbs_base_log; + cbs_level = (int)GetParam().cbs_level; + // We generate binary messages + number_of_bits_of_message_including_padding = 2; + delta_log = 60; + delta = (uint64_t)(1) << delta_log; + ggsw_size = cbs_level * (glwe_dimension + 1) * (glwe_dimension + 1) * + polynomial_size; + + // Create a Csprng + csprng = + (Csprng *)aligned_alloc(CONCRETE_CSPRNG_ALIGN, CONCRETE_CSPRNG_SIZE); + uint8_t seed[16] = {(uint8_t)0}; + concrete_cpu_construct_concrete_csprng( + csprng, Uint128{.little_endian_bytes = {*seed}}); + + // Generate the keys + generate_lwe_secret_keys(&lwe_sk_in_array, lwe_dimension, csprng, REPETITIONS); + generate_lwe_secret_keys(&lwe_sk_out_array, + glwe_dimension * polynomial_size, csprng, REPETITIONS); + generate_lwe_bootstrap_keys( + stream, gpu_index, &d_fourier_bsk_array, lwe_sk_in_array, + lwe_sk_out_array, lwe_dimension, glwe_dimension, polynomial_size, + pbs_level, pbs_base_log, csprng, glwe_modular_variance, REPETITIONS); + generate_lwe_private_functional_keyswitch_key_lists( + stream, gpu_index, &d_pksk_array, lwe_sk_out_array, lwe_sk_out_array, + glwe_dimension * polynomial_size, glwe_dimension, polynomial_size, + pksk_level, pksk_base_log, csprng, lwe_modular_variance, REPETITIONS); + plaintexts = generate_plaintexts( + number_of_bits_of_message_including_padding, delta, 1, REPETITIONS, SAMPLES); + + d_ggsw_out_ct = (uint64_t *)cuda_malloc_async(ggsw_size * sizeof(uint64_t), + stream, gpu_index); + + d_lwe_in_ct = (uint64_t *)cuda_malloc_async( + (lwe_dimension + 1) * sizeof(uint64_t), stream, gpu_index); + + lwe_in_ct = (uint64_t *)malloc((lwe_dimension + 1) * sizeof(uint64_t)); + ggsw_out_ct = (uint64_t 
*)malloc(ggsw_size * sizeof(uint64_t)); + // Execute cbs scratch + scratch_cuda_circuit_bootstrap_64( + stream, gpu_index, &cbs_buffer, glwe_dimension, lwe_dimension, + polynomial_size, cbs_level, 1, cuda_get_max_shared_memory(gpu_index), + true); + // Build LUT vector indexes + uint64_t *h_lut_vector_indexes = + (uint64_t *)malloc(cbs_level * sizeof(uint64_t)); + for (int index = 0; index < cbs_level; index++) { + h_lut_vector_indexes[index] = 0; // index % cbs_level; + } + d_lut_vector_indexes = (uint64_t *)cuda_malloc_async( + cbs_level * sizeof(uint64_t), stream, gpu_index); + cuda_memcpy_async_to_gpu(d_lut_vector_indexes, h_lut_vector_indexes, + cbs_level * sizeof(uint64_t), stream, gpu_index); + free(h_lut_vector_indexes); + } + + void TearDown() { + void *v_stream = (void *)stream; + + cuda_synchronize_stream(v_stream); + concrete_cpu_destroy_concrete_csprng(csprng); + free(csprng); + free(lwe_sk_in_array); + free(lwe_sk_out_array); + free(plaintexts); + free(lwe_in_ct); + free(ggsw_out_ct); + cleanup_cuda_circuit_bootstrap(stream, gpu_index, &cbs_buffer); + cuda_drop_async(d_fourier_bsk_array, stream, gpu_index); + cuda_drop_async(d_pksk_array, stream, gpu_index); + cuda_drop_async(d_lwe_in_ct, stream, gpu_index); + cuda_drop_async(d_ggsw_out_ct, stream, gpu_index); + cuda_drop_async(d_lut_vector_indexes, stream, gpu_index); + cuda_destroy_stream(stream, gpu_index); + } +}; + +TEST_P(CircuitBootstrapTestPrimitives_u64, circuit_bootstrap) { + void *v_stream = (void *)stream; + for (uint r = 0; r < REPETITIONS; r++) { + for (uint s = 0; s < SAMPLES; s++) { + uint64_t plaintext = plaintexts[r * SAMPLES + s]; + int bsk_size = (glwe_dimension + 1) * (glwe_dimension + 1) * pbs_level * + polynomial_size * (lwe_dimension + 1); + double *d_fourier_bsk = d_fourier_bsk_array + (ptrdiff_t)(bsk_size * r); + int pksk_list_size = pksk_level * (glwe_dimension + 1) * polynomial_size * + (glwe_dimension * polynomial_size + 1) * + (glwe_dimension + 1); + uint64_t *d_pksk_list 
= d_pksk_array + (ptrdiff_t)(pksk_list_size * r); + uint64_t *lwe_in_sk = lwe_sk_in_array + (ptrdiff_t)(lwe_dimension * r); + uint64_t *lwe_sk_out = + lwe_sk_out_array + (ptrdiff_t)(r * glwe_dimension * polynomial_size); + concrete_cpu_encrypt_lwe_ciphertext_u64( + lwe_in_sk, lwe_in_ct, plaintext, lwe_dimension, lwe_modular_variance, + csprng, &CONCRETE_CSPRNG_VTABLE); + cuda_synchronize_stream(v_stream); + cuda_memcpy_async_to_gpu(d_lwe_in_ct, lwe_in_ct, + (lwe_dimension + 1) * sizeof(uint64_t), stream, + gpu_index); + + // Execute circuit bootstrap + cuda_circuit_bootstrap_64( + stream, gpu_index, (void *)d_ggsw_out_ct, (void *)d_lwe_in_ct, + (void *)d_fourier_bsk, (void *)d_pksk_list, + (void *)d_lut_vector_indexes, cbs_buffer, delta_log, polynomial_size, + glwe_dimension, lwe_dimension, pbs_level, pbs_base_log, pksk_level, + pksk_base_log, cbs_level, cbs_base_log, 1, + cuda_get_max_shared_memory(gpu_index)); + + // Copy result back + cuda_memcpy_async_to_cpu(ggsw_out_ct, d_ggsw_out_ct, + ggsw_size * sizeof(uint64_t), stream, gpu_index); + cuda_synchronize_stream(v_stream); + + uint64_t *decrypted = + (uint64_t *)malloc(polynomial_size * (glwe_dimension + 1) * + cbs_level * sizeof(uint64_t)); + + uint64_t multiplying_factor = -(plaintext >> delta_log); + for (int l = 1; l < cbs_level + 1; l++) { + for (int j = 0; j < glwe_dimension; j++) { + uint64_t *res = decrypted + (ptrdiff_t)((l - 1) * polynomial_size * + (glwe_dimension + 1) + + j * polynomial_size); + uint64_t *glwe_ct_out = + ggsw_out_ct + + (ptrdiff_t)((l - 1) * polynomial_size * (glwe_dimension + 1) * + (glwe_dimension + 1) + + j * polynomial_size * (glwe_dimension + 1)); + concrete_cpu_decrypt_glwe_ciphertext_u64( + lwe_sk_out, res, glwe_ct_out, glwe_dimension, polynomial_size); + + for (int k = 0; k < polynomial_size; k++) { + uint64_t expected_decryption = + lwe_sk_out[j * polynomial_size + k] * multiplying_factor; + expected_decryption >>= (64 - cbs_base_log * l); + uint64_t decoded_plaintext = + 
closest_representable(res[k], l, cbs_base_log) >> + (64 - cbs_base_log * l); + EXPECT_EQ(expected_decryption, decoded_plaintext); + } + } + } + // Check last glwe on last level + uint64_t *res = + decrypted + + (ptrdiff_t)((cbs_level - 1) * polynomial_size * (glwe_dimension + 1) + + glwe_dimension * polynomial_size); + uint64_t *glwe_ct_out = + ggsw_out_ct + + (ptrdiff_t)((cbs_level - 1) * polynomial_size * (glwe_dimension + 1) * + (glwe_dimension + 1) + + glwe_dimension * polynomial_size * (glwe_dimension + 1)); + concrete_cpu_decrypt_glwe_ciphertext_u64(lwe_sk_out, res, glwe_ct_out, + glwe_dimension, polynomial_size); + + for (int k = 0; k < polynomial_size; k++) { + uint64_t expected_decryption = (k == 0) ? plaintext / delta : 0; + uint64_t decoded_plaintext = + closest_representable(res[k], cbs_level, cbs_base_log) >> + (64 - cbs_base_log * cbs_level); + EXPECT_EQ(expected_decryption, decoded_plaintext); + } + free(decrypted); + } + } +} + +// Defines for which parameters set the PBS will be tested. 
+// It executes each test for all pairs on phis X qs (Cartesian product) +::testing::internal::ParamGenerator<CircuitBootstrapTestParams> cbs_params_u64 = + ::testing::Values( + // n, k, N, lwe_variance, glwe_variance, pbs_base_log, pbs_level, + // pksk_base_log, pksk_level, cbs_base_log, cbs_level + (CircuitBootstrapTestParams){10, 2, 512, 7.52316384526264e-37, + 7.52316384526264e-37, 11, 2, 15, 2, 10, + 1}); + +std::string +printParamName(::testing::TestParamInfo<CircuitBootstrapTestParams> p) { + CircuitBootstrapTestParams params = p.param; + + return "n_" + std::to_string(params.lwe_dimension) + "_k_" + + std::to_string(params.glwe_dimension) + "_N_" + + std::to_string(params.polynomial_size) + "_pbs_base_log_" + + std::to_string(params.pbs_base_log) + "_pbs_level_" + + std::to_string(params.pbs_level) + "_pksk_base_log_" + + std::to_string(params.pksk_base_log) + "_pksk_level_" + + std::to_string(params.pksk_level) + "_cbs_base_log_" + + std::to_string(params.cbs_base_log) + "_cbs_level_" + + std::to_string(params.cbs_level); +} + +INSTANTIATE_TEST_CASE_P(CircuitBootstrapInstantiation, + CircuitBootstrapTestPrimitives_u64, cbs_params_u64, + printParamName); \ No newline at end of file diff --git a/backends/concrete-cuda/implementation/test/test_cmux_tree.cpp b/backends/concrete-cuda/implementation/test/test_cmux_tree.cpp new file mode 100644 index 000000000..878aa2c60 --- /dev/null +++ b/backends/concrete-cuda/implementation/test/test_cmux_tree.cpp @@ -0,0 +1,202 @@ +#include "../include/device.h" +#include "../include/vertical_packing.h" +#include "concrete-cpu.h" +#include "utils.h" +#include "gtest/gtest.h" +#include +#include +#include +#include + +const unsigned REPETITIONS = 5; +const unsigned SAMPLES = 100; + +typedef struct { + int glwe_dimension; + int polynomial_size; + int r; + int tau; + double glwe_modular_variance; + int base_log; + int level_count; + int delta_log; +} CMUXTreeTestParams; + +class CMUXTreeTestPrimitives_u64 + : public ::testing::TestWithParam<CMUXTreeTestParams> { +protected: + int glwe_dimension; + int
polynomial_size; + int r_lut; + int tau; + double glwe_modular_variance; + int base_log; + int level_count; + uint64_t delta; + int delta_log; + Csprng *csprng; + uint64_t *plaintexts; + cudaStream_t *stream; + int gpu_index = 0; + uint64_t *glwe_sk; + uint64_t *d_lut_identity; + +public: + // Test arithmetic functions + void SetUp() { + stream = cuda_create_stream(0); + void *v_stream = (void *)stream; + + // TestParams + glwe_dimension = (int)GetParam().glwe_dimension; + polynomial_size = (int)GetParam().polynomial_size; + r_lut = (int)GetParam().r; + tau = (int)GetParam().tau; + glwe_modular_variance = (int)GetParam().glwe_modular_variance; + base_log = (int)GetParam().base_log; + level_count = (int)GetParam().level_count; + delta_log = (int)GetParam().delta_log; + + // Value of the shift we multiply our messages by + delta = ((uint64_t)(1) << delta_log); + + // Create a Csprng + csprng = + (Csprng *)aligned_alloc(CONCRETE_CSPRNG_ALIGN, CONCRETE_CSPRNG_SIZE); + uint8_t seed[16] = {(uint8_t)0}; + concrete_cpu_construct_concrete_csprng( + csprng, Uint128{.little_endian_bytes = {*seed}}); + + // Generate the keys + generate_glwe_secret_keys(&glwe_sk, glwe_dimension, polynomial_size, + csprng, REPETITIONS); + plaintexts = generate_plaintexts(r_lut, 1, 1, REPETITIONS, SAMPLES); + + // Create the LUT + int num_lut = (1 << r_lut); + d_lut_identity = (uint64_t *)cuda_malloc_async( + polynomial_size * num_lut * tau * sizeof(uint64_t), stream, gpu_index); + uint64_t *lut_cmux_tree_identity = generate_identity_lut_cmux_tree( + polynomial_size, num_lut, tau, delta_log); + + // Copy all LUTs + cuda_memcpy_async_to_gpu(d_lut_identity, lut_cmux_tree_identity, + polynomial_size * num_lut * tau * sizeof(uint64_t), + stream, gpu_index); + + cuda_synchronize_stream(v_stream); + free(lut_cmux_tree_identity); + } + + void TearDown() { + cuda_synchronize_stream(stream); + concrete_cpu_destroy_concrete_csprng(csprng); + free(plaintexts); + free(csprng); + 
cuda_drop_async(d_lut_identity, stream, gpu_index); + cuda_destroy_stream(stream, gpu_index); + } +}; + +TEST_P(CMUXTreeTestPrimitives_u64, cmux_tree) { + int ggsw_size = polynomial_size * (glwe_dimension + 1) * + (glwe_dimension + 1) * level_count; + int glwe_size = (glwe_dimension + 1) * polynomial_size; + uint64_t *d_ggsw_bit_array = (uint64_t *)cuda_malloc_async( + r_lut * ggsw_size * sizeof(uint64_t), stream, gpu_index); + uint64_t *d_results = (uint64_t *)cuda_malloc_async( + tau * glwe_size * sizeof(uint64_t), stream, gpu_index); + uint64_t *results = (uint64_t *)malloc(tau * glwe_size * sizeof(uint64_t)); + uint64_t *ggsw = (uint64_t *)malloc(ggsw_size * sizeof(uint64_t)); + + int8_t *cmux_tree_buffer = nullptr; + scratch_cuda_cmux_tree_64(stream, gpu_index, &cmux_tree_buffer, + glwe_dimension, polynomial_size, level_count, r_lut, + tau, cuda_get_max_shared_memory(gpu_index), true); + + // Here execute the PBS + for (uint r = 0; r < REPETITIONS; r++) { + for (uint s = 0; s < SAMPLES; s++) { + uint64_t witness = plaintexts[r * SAMPLES + s]; + + // Instantiate the GGSW m^tree ciphertexts + // We need r GGSW ciphertexts + // Bit decomposition of the value from MSB to LSB + uint64_t *bit_array = bit_decompose_value(witness, r_lut); + + for (int i = 0; i < r_lut; i++) { + uint64_t *d_ggsw_slice = d_ggsw_bit_array + i * ggsw_size; + concrete_cpu_encrypt_ggsw_ciphertext_u64( + glwe_sk, ggsw, bit_array[i], glwe_dimension, polynomial_size, + level_count, base_log, glwe_modular_variance, csprng, + &CONCRETE_CSPRNG_VTABLE); + cuda_memcpy_async_to_gpu(d_ggsw_slice, ggsw, + ggsw_size * sizeof(uint64_t), stream, + gpu_index); + } + cuda_synchronize_stream(stream); + + // Execute scratch/CMUX tree/cleanup + cuda_cmux_tree_64(stream, gpu_index, (void *)d_results, + (void *)d_ggsw_bit_array, (void *)d_lut_identity, + cmux_tree_buffer, glwe_dimension, polynomial_size, + base_log, level_count, r_lut, tau, + cuda_get_max_shared_memory(gpu_index)); + + // Copy result back + 
cuda_memcpy_async_to_cpu(results, d_results, + tau * glwe_size * sizeof(uint64_t), stream, + gpu_index); + cuda_synchronize_stream(stream); + for (int tree = 0; tree < tau; tree++) { + uint64_t *result = results + tree * glwe_size; + uint64_t *decrypted = + (uint64_t *)malloc(polynomial_size * sizeof(uint64_t)); + concrete_cpu_decrypt_glwe_ciphertext_u64( + glwe_sk, decrypted, result, glwe_dimension, polynomial_size); + // The bit before the message + uint64_t rounding_bit = delta >> 1; + // Compute the rounding bit + uint64_t rounding = (decrypted[0] & rounding_bit) << 1; + uint64_t decoded = (decrypted[0] + rounding) / delta; + EXPECT_EQ(decoded, (witness + tree) % (1 << (64 - delta_log))); + free(decrypted); + } + free(bit_array); + } + } + cuda_synchronize_stream(stream); + cleanup_cuda_cmux_tree(stream, gpu_index, &cmux_tree_buffer); + free(ggsw); + + cuda_drop_async(d_ggsw_bit_array, stream, gpu_index); +} + +int glwe_dimension; +int polynomial_size; +double glwe_modular_variance; +int base_log; +int level_count; +int message_modulus; +int carry_modulus; + +// Defines for which parameters set the PBS will be tested. 
+// It executes each test for all pairs on phis X qs (Cartesian product) +::testing::internal::ParamGenerator<CMUXTreeTestParams> cmux_tree_params_u64 = + ::testing::Values( + // k, N, r, tau, glwe_variance, base_log, level_count, delta_log + (CMUXTreeTestParams){2, 256, 10, 6, 0.00000000000000029403601535432533, + 6, 3, 60}); + +std::string printParamName(::testing::TestParamInfo<CMUXTreeTestParams> p) { + CMUXTreeTestParams params = p.param; + + return "k_" + std::to_string(params.glwe_dimension) + "_N_" + + std::to_string(params.polynomial_size) + "_tau_" + + std::to_string(params.tau) + "_base_log_" + + std::to_string(params.base_log) + "_level_count_" + + std::to_string(params.level_count); +} + +INSTANTIATE_TEST_CASE_P(CMUXTreeInstantiation, CMUXTreeTestPrimitives_u64, + cmux_tree_params_u64, printParamName); \ No newline at end of file diff --git a/backends/concrete-cuda/implementation/test/test_keyswitch.cpp b/backends/concrete-cuda/implementation/test/test_keyswitch.cpp new file mode 100644 index 000000000..6926f8723 --- /dev/null +++ b/backends/concrete-cuda/implementation/test/test_keyswitch.cpp @@ -0,0 +1,192 @@ +#include "../include/device.h" +#include "../include/keyswitch.h" +#include "concrete-cpu.h" +#include "utils.h" +#include "gtest/gtest.h" +#include +#include +#include +#include + +const unsigned REPETITIONS = 5; +const unsigned SAMPLES = 100; + +typedef struct { + int input_lwe_dimension; + int output_lwe_dimension; + double noise_variance; + int ksk_base_log; + int ksk_level; + int message_modulus; + int carry_modulus; +} KeyswitchTestParams; + +class KeyswitchTestPrimitives_u64 + : public ::testing::TestWithParam<KeyswitchTestParams> { +protected: + int input_lwe_dimension; + int output_lwe_dimension; + double noise_variance; + int ksk_base_log; + int ksk_level; + int message_modulus; + int carry_modulus; + int payload_modulus; + uint64_t delta; + Csprng *csprng; + cudaStream_t *stream; + int gpu_index = 0; + uint64_t *lwe_sk_in_array; + uint64_t *lwe_sk_out_array; + uint64_t *plaintexts; + uint64_t
*d_ksk_array; + uint64_t *d_lwe_out_ct; + uint64_t *d_lwe_in_ct; + uint64_t *lwe_in_ct; + uint64_t *lwe_out_ct; + int num_samples; + +public: + // Test arithmetic functions + void SetUp() { + stream = cuda_create_stream(0); + void *v_stream = (void *)stream; + + // TestParams + input_lwe_dimension = (int)GetParam().input_lwe_dimension; + output_lwe_dimension = (int)GetParam().output_lwe_dimension; + noise_variance = (int)GetParam().noise_variance; + ksk_base_log = (int)GetParam().ksk_base_log; + ksk_level = (int)GetParam().ksk_level; + message_modulus = (int)GetParam().message_modulus; + carry_modulus = (int)GetParam().carry_modulus; + + payload_modulus = message_modulus * carry_modulus; + // Value of the shift we multiply our messages by + delta = ((uint64_t)(1) << 63) / (uint64_t)(payload_modulus); + + // Create a Csprng + csprng = + (Csprng *)aligned_alloc(CONCRETE_CSPRNG_ALIGN, CONCRETE_CSPRNG_SIZE); + uint8_t seed[16] = {(uint8_t)0}; + concrete_cpu_construct_concrete_csprng( + csprng, Uint128{.little_endian_bytes = {*seed}}); + + // Generate the keys + generate_lwe_secret_keys(&lwe_sk_in_array, input_lwe_dimension, csprng, REPETITIONS); + generate_lwe_secret_keys(&lwe_sk_out_array, output_lwe_dimension, csprng, REPETITIONS); + generate_lwe_keyswitch_keys( + stream, gpu_index, &d_ksk_array, lwe_sk_in_array, lwe_sk_out_array, + input_lwe_dimension, output_lwe_dimension, ksk_level, ksk_base_log, + csprng, noise_variance, REPETITIONS); + plaintexts = generate_plaintexts(payload_modulus, delta, 1, REPETITIONS, SAMPLES); + + d_lwe_out_ct = (uint64_t *)cuda_malloc_async( + (output_lwe_dimension + 1) * sizeof(uint64_t), stream, gpu_index); + + d_lwe_in_ct = (uint64_t *)cuda_malloc_async( + (input_lwe_dimension + 1) * sizeof(uint64_t), stream, gpu_index); + + lwe_in_ct = + (uint64_t *)malloc((input_lwe_dimension + 1) * sizeof(uint64_t)); + lwe_out_ct = + (uint64_t *)malloc((output_lwe_dimension + 1) * sizeof(uint64_t)); + + cuda_synchronize_stream(v_stream); + } + + 
void TearDown() { + void *v_stream = (void *)stream; + + cuda_synchronize_stream(v_stream); + concrete_cpu_destroy_concrete_csprng(csprng); + free(csprng); + cuda_drop_async(d_lwe_in_ct, stream, gpu_index); + cuda_drop_async(d_lwe_out_ct, stream, gpu_index); + free(lwe_in_ct); + free(lwe_out_ct); + free(lwe_sk_in_array); + free(lwe_sk_out_array); + free(plaintexts); + cuda_drop_async(d_ksk_array, stream, gpu_index); + cuda_destroy_stream(stream, gpu_index); + } +}; + +TEST_P(KeyswitchTestPrimitives_u64, keyswitch) { + void *v_stream = (void *)stream; + for (uint r = 0; r < REPETITIONS; r++) { + for (uint s = 0; s < SAMPLES; s++) { + uint64_t plaintext = plaintexts[r * SAMPLES + s]; + uint64_t *lwe_in_sk = + lwe_sk_in_array + (ptrdiff_t)(r * input_lwe_dimension); + uint64_t *lwe_out_sk = + lwe_sk_out_array + (ptrdiff_t)(r * output_lwe_dimension); + int ksk_size = + ksk_level * (output_lwe_dimension + 1) * input_lwe_dimension; + uint64_t *d_ksk = d_ksk_array + (ptrdiff_t)(ksk_size * r); + concrete_cpu_encrypt_lwe_ciphertext_u64( + lwe_in_sk, lwe_in_ct, plaintext, input_lwe_dimension, noise_variance, + csprng, &CONCRETE_CSPRNG_VTABLE); + cuda_synchronize_stream(v_stream); + cuda_memcpy_async_to_gpu(d_lwe_in_ct, lwe_in_ct, + (input_lwe_dimension + 1) * sizeof(uint64_t), + stream, gpu_index); + // Execute keyswitch + cuda_keyswitch_lwe_ciphertext_vector_64( + stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_in_ct, + (void *)d_ksk, input_lwe_dimension, output_lwe_dimension, + ksk_base_log, ksk_level, 1); + + // Copy result back + cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct, + (output_lwe_dimension + 1) * sizeof(uint64_t), + stream, gpu_index); + uint64_t decrypted = 0; + concrete_cpu_decrypt_lwe_ciphertext_u64(lwe_out_sk, lwe_out_ct, + output_lwe_dimension, &decrypted); + EXPECT_NE(decrypted, plaintext); + // let err = (decrypted >= plaintext) ? 
decrypted - plaintext : plaintext + // - decrypted; + // error_sample_vec.push(err); + + // The bit before the message + uint64_t rounding_bit = delta >> 1; + // Compute the rounding bit + uint64_t rounding = (decrypted & rounding_bit) << 1; + uint64_t decoded = (decrypted + rounding) / delta; + ASSERT_EQ(decoded, plaintext / delta); + } + } +} + +// Defines for which parameters set the PBS will be tested. +// It executes each test for all pairs on phis X qs (Cartesian product) +::testing::internal::ParamGenerator<KeyswitchTestParams> ksk_params_u64 = + ::testing::Values( + // n, k*N, noise_variance, ks_base_log, ks_level, + // message_modulus, carry_modulus + // 1 bit message 0 bit carry parameters + (KeyswitchTestParams){567, 1280, 2.9802322387695312e-08, 3, 3, 2, 1}, + // 3 bits message 0 bit carry parameters + (KeyswitchTestParams){694, 1536, 2.9802322387695312e-08, 4, 3, 4, 1}, + // 4 bits message 0 bit carry parameters + (KeyswitchTestParams){769, 2048, 2.9802322387695312e-08, 4, 3, 5, 1}, + // 5 bits message 0 bit carry parameters + (KeyswitchTestParams){754, 2048, 2.9802322387695312e-08, 3, 5, 6, 1}, + // 6 bits message 0 bit carry parameters + (KeyswitchTestParams){847, 4096, 2.9802322387695312e-08, 4, 4, 7, 1}, + // 7 bits message 0 bit carry parameters + (KeyswitchTestParams){881, 8192, 2.9802322387695312e-08, 3, 6, 8, 1}); + +std::string printParamName(::testing::TestParamInfo<KeyswitchTestParams> p) { + KeyswitchTestParams params = p.param; + + return "na_" + std::to_string(params.input_lwe_dimension) + "_nb_" + + std::to_string(params.output_lwe_dimension) + "_baselog_" + + std::to_string(params.ksk_base_log) + "_ksk_level_" + + std::to_string(params.ksk_level); +} + +INSTANTIATE_TEST_CASE_P(KeyswitchInstantiation, KeyswitchTestPrimitives_u64, + ksk_params_u64, printParamName); diff --git a/backends/concrete-cuda/implementation/test/test_linear_algebra.cpp b/backends/concrete-cuda/implementation/test/test_linear_algebra.cpp new file mode 100644 index 000000000..34df583ac --- /dev/null +++
b/backends/concrete-cuda/implementation/test/test_linear_algebra.cpp @@ -0,0 +1,279 @@ +#include "../include/device.h" +#include "../include/linear_algebra.h" +#include "concrete-cpu.h" +#include "utils.h" +#include "gtest/gtest.h" +#include +#include +#include +#include + +const unsigned REPETITIONS = 5; +const unsigned SAMPLES = 100; + +typedef struct { + int lwe_dimension; + double noise_variance; + int message_modulus; + int carry_modulus; +} LinearAlgebraTestParams; + +class LinearAlgebraTestPrimitives_u64 + : public ::testing::TestWithParam { +protected: + int lwe_dimension; + double noise_variance; + int message_modulus; + int carry_modulus; + int payload_modulus; + uint64_t delta; + Csprng *csprng; + cudaStream_t *stream; + int gpu_index = 0; + uint64_t *lwe_sk_array; + uint64_t *d_lwe_in_1_ct; + uint64_t *d_lwe_in_2_ct; + uint64_t *d_lwe_out_ct; + uint64_t *lwe_in_1_ct; + uint64_t *lwe_in_2_ct; + uint64_t *lwe_out_ct; + uint64_t *plaintexts_1; + uint64_t *plaintexts_2; + int num_samples; + +public: + // Test arithmetic functions + void SetUp() { + stream = cuda_create_stream(0); + void *v_stream = (void *)stream; + + // TestParams + lwe_dimension = (int)GetParam().lwe_dimension; + noise_variance = (int)GetParam().noise_variance; + message_modulus = (int)GetParam().message_modulus; + carry_modulus = (int)GetParam().carry_modulus; + + payload_modulus = message_modulus * carry_modulus; + // Value of the shift we multiply our messages by + delta = ((uint64_t)(1) << 63) / (uint64_t)(payload_modulus); + + // Create a Csprng + csprng = + (Csprng *)aligned_alloc(CONCRETE_CSPRNG_ALIGN, CONCRETE_CSPRNG_SIZE); + uint8_t seed[16] = {(uint8_t)0}; + concrete_cpu_construct_concrete_csprng( + csprng, Uint128{.little_endian_bytes = {*seed}}); + + // Generate the keys + generate_lwe_secret_keys(&lwe_sk_array, lwe_dimension, csprng, REPETITIONS); + plaintexts_1 = generate_plaintexts(payload_modulus, delta, 1, REPETITIONS, SAMPLES); + plaintexts_2 = 
generate_plaintexts(payload_modulus, delta, 1, REPETITIONS, SAMPLES); + + d_lwe_in_1_ct = (uint64_t *)cuda_malloc_async( + (lwe_dimension + 1) * sizeof(uint64_t), stream, gpu_index); + d_lwe_in_2_ct = (uint64_t *)cuda_malloc_async( + (lwe_dimension + 1) * sizeof(uint64_t), stream, gpu_index); + d_lwe_out_ct = (uint64_t *)cuda_malloc_async( + (lwe_dimension + 1) * sizeof(uint64_t), stream, gpu_index); + + lwe_in_1_ct = (uint64_t *)malloc((lwe_dimension + 1) * sizeof(uint64_t)); + lwe_in_2_ct = (uint64_t *)malloc((lwe_dimension + 1) * sizeof(uint64_t)); + lwe_out_ct = (uint64_t *)malloc((lwe_dimension + 1) * sizeof(uint64_t)); + + cuda_synchronize_stream(v_stream); + } + + void TearDown() { + void *v_stream = (void *)stream; + + cuda_synchronize_stream(v_stream); + concrete_cpu_destroy_concrete_csprng(csprng); + free(csprng); + cuda_drop_async(d_lwe_in_1_ct, stream, gpu_index); + cuda_drop_async(d_lwe_in_2_ct, stream, gpu_index); + cuda_drop_async(d_lwe_out_ct, stream, gpu_index); + free(lwe_in_1_ct); + free(lwe_in_2_ct); + free(lwe_out_ct); + free(lwe_sk_array); + free(plaintexts_1); + free(plaintexts_2); + } +}; + +TEST_P(LinearAlgebraTestPrimitives_u64, addition) { + void *v_stream = (void *)stream; + // Here execute the PBS + for (uint r = 0; r < REPETITIONS; r++) { + for (uint s = 0; s < SAMPLES; s++) { + uint64_t plaintext_1 = plaintexts_1[r * SAMPLES + s]; + uint64_t plaintext_2 = plaintexts_2[r * SAMPLES + s]; + uint64_t *lwe_sk = lwe_sk_array + (ptrdiff_t)(r * lwe_dimension); + concrete_cpu_encrypt_lwe_ciphertext_u64(lwe_sk, lwe_in_1_ct, plaintext_1, + lwe_dimension, noise_variance, + csprng, &CONCRETE_CSPRNG_VTABLE); + concrete_cpu_encrypt_lwe_ciphertext_u64(lwe_sk, lwe_in_2_ct, plaintext_2, + lwe_dimension, noise_variance, + csprng, &CONCRETE_CSPRNG_VTABLE); + cuda_synchronize_stream(v_stream); + cuda_memcpy_async_to_gpu(d_lwe_in_1_ct, lwe_in_1_ct, + (lwe_dimension + 1) * sizeof(uint64_t), stream, + gpu_index); + cuda_memcpy_async_to_gpu(d_lwe_in_2_ct, 
lwe_in_2_ct, + (lwe_dimension + 1) * sizeof(uint64_t), stream, + gpu_index); + // Execute addition + cuda_add_lwe_ciphertext_vector_64( + stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_in_1_ct, + (void *)d_lwe_in_2_ct, lwe_dimension, 1); + + // Copy result back + cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct, + (lwe_dimension + 1) * sizeof(uint64_t), stream, + gpu_index); + uint64_t decrypted = 0; + concrete_cpu_decrypt_lwe_ciphertext_u64(lwe_sk, lwe_out_ct, lwe_dimension, + &decrypted); + // The bit before the message + uint64_t rounding_bit = delta >> 1; + // Compute the rounding bit + uint64_t rounding = (decrypted & rounding_bit) << 1; + uint64_t decoded = (decrypted + rounding) / delta; + ASSERT_EQ(decoded, (plaintext_1 + plaintext_2) / delta); + cuda_synchronize_stream(v_stream); + } + } +} + +TEST_P(LinearAlgebraTestPrimitives_u64, plaintext_addition) { + void *v_stream = (void *)stream; + // Here execute the PBS + for (uint r = 0; r < REPETITIONS; r++) { + for (uint s = 0; s < SAMPLES; s++) { + uint64_t plaintext_1 = plaintexts_1[r * SAMPLES + s]; + uint64_t plaintext_2 = plaintexts_2[r * SAMPLES + s]; + uint64_t *lwe_sk = lwe_sk_array + (ptrdiff_t)(r * lwe_dimension); + concrete_cpu_encrypt_lwe_ciphertext_u64(lwe_sk, lwe_in_1_ct, plaintext_1, + lwe_dimension, noise_variance, + csprng, &CONCRETE_CSPRNG_VTABLE); + cuda_synchronize_stream(v_stream); + cuda_memcpy_async_to_gpu(d_lwe_in_1_ct, lwe_in_1_ct, + (lwe_dimension + 1) * sizeof(uint64_t), stream, + gpu_index); + cuda_memcpy_async_to_gpu(d_lwe_in_2_ct, &plaintext_2, sizeof(uint64_t), + stream, gpu_index); + // Execute addition + cuda_add_lwe_ciphertext_vector_plaintext_vector_64( + stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_in_1_ct, + (void *)d_lwe_in_2_ct, lwe_dimension, 1); + // Copy result back + cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct, + (lwe_dimension + 1) * sizeof(uint64_t), stream, + gpu_index); + uint64_t decrypted = 0; + 
concrete_cpu_decrypt_lwe_ciphertext_u64(lwe_sk, lwe_out_ct, lwe_dimension, + &decrypted); + // The bit before the message + uint64_t rounding_bit = delta >> 1; + // Compute the rounding bit + uint64_t rounding = (decrypted & rounding_bit) << 1; + uint64_t decoded = (decrypted + rounding) / delta; + ASSERT_EQ(decoded, (plaintext_1 + plaintext_2) / delta); + cuda_synchronize_stream(v_stream); + } + } +} + +TEST_P(LinearAlgebraTestPrimitives_u64, plaintext_multiplication) { + void *v_stream = (void *)stream; + // Here execute the PBS + for (uint r = 0; r < REPETITIONS; r++) { + for (uint s = 0; s < SAMPLES; s++) { + uint64_t plaintext_1 = plaintexts_1[r * SAMPLES + s]; + uint64_t plaintext_2 = plaintexts_2[r * SAMPLES + s]; + uint64_t *lwe_sk = lwe_sk_array + (ptrdiff_t)(r * lwe_dimension); + concrete_cpu_encrypt_lwe_ciphertext_u64(lwe_sk, lwe_in_1_ct, plaintext_1, + lwe_dimension, noise_variance, + csprng, &CONCRETE_CSPRNG_VTABLE); + cuda_synchronize_stream(v_stream); + cuda_memcpy_async_to_gpu(d_lwe_in_1_ct, lwe_in_1_ct, + (lwe_dimension + 1) * sizeof(uint64_t), stream, + gpu_index); + cuda_memcpy_async_to_gpu(d_lwe_in_2_ct, &plaintext_1, sizeof(uint64_t), + stream, gpu_index); + // Execute addition + cuda_mult_lwe_ciphertext_vector_cleartext_vector_64( + stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_in_1_ct, + (void *)d_lwe_in_2_ct, lwe_dimension, 1); + // Copy result back + cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct, + (lwe_dimension + 1) * sizeof(uint64_t), stream, + gpu_index); + uint64_t decrypted = 0; + concrete_cpu_decrypt_lwe_ciphertext_u64(lwe_sk, lwe_out_ct, lwe_dimension, + &decrypted); + // The bit before the message + uint64_t rounding_bit = delta >> 1; + // Compute the rounding bit + uint64_t rounding = (decrypted & rounding_bit) << 1; + uint64_t decoded = (decrypted + rounding) / delta; + ASSERT_EQ(decoded, (plaintext_1 * plaintext_2) / delta); + cuda_synchronize_stream(v_stream); + } + } +} + +TEST_P(LinearAlgebraTestPrimitives_u64, 
negate) { + void *v_stream = (void *)stream; + // Here execute the PBS + for (uint r = 0; r < REPETITIONS; r++) { + for (uint s = 0; s < SAMPLES; s++) { + uint64_t plaintext = plaintexts_1[r * SAMPLES + s]; + uint64_t *lwe_sk = lwe_sk_array + (ptrdiff_t)(r * lwe_dimension); + concrete_cpu_encrypt_lwe_ciphertext_u64(lwe_sk, lwe_in_1_ct, plaintext, + lwe_dimension, noise_variance, + csprng, &CONCRETE_CSPRNG_VTABLE); + cuda_synchronize_stream(v_stream); + cuda_memcpy_async_to_gpu(d_lwe_in_1_ct, lwe_in_1_ct, + (lwe_dimension + 1) * sizeof(uint64_t), stream, + gpu_index); + // Execute addition + cuda_negate_lwe_ciphertext_vector_64( + stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_in_1_ct, + lwe_dimension, 1); + + // Copy result back + cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct, + (lwe_dimension + 1) * sizeof(uint64_t), stream, + gpu_index); + uint64_t decrypted = 0; + concrete_cpu_decrypt_lwe_ciphertext_u64(lwe_sk, lwe_out_ct, lwe_dimension, + &decrypted); + // The bit before the message + uint64_t rounding_bit = delta >> 1; + // Compute the rounding bit + uint64_t rounding = (decrypted & rounding_bit) << 1; + uint64_t decoded = (decrypted + rounding) / delta; + ASSERT_EQ(decoded, -plaintext / delta); + cuda_synchronize_stream(v_stream); + } + } +} + +// Defines for which parameters set the linear algebra operations will be +// tested. 
It executes each test for all pairs on phis X qs (Cartesian product) +::testing::internal::ParamGenerator + linear_algebra_params_u64 = ::testing::Values( + // n, lwe_std_dev, message_modulus, carry_modulus + (LinearAlgebraTestParams){600, 0.000007069849454709433, 4, 4}); + +std::string +printParamName(::testing::TestParamInfo p) { + LinearAlgebraTestParams params = p.param; + + return "n_" + std::to_string(params.lwe_dimension); +} + +INSTANTIATE_TEST_CASE_P(LinearAlgebraInstantiation, + LinearAlgebraTestPrimitives_u64, + linear_algebra_params_u64, printParamName); \ No newline at end of file diff --git a/backends/concrete-cuda/implementation/test/test_wop_bootstrap.cpp b/backends/concrete-cuda/implementation/test/test_wop_bootstrap.cpp new file mode 100644 index 000000000..4363ca8d7 --- /dev/null +++ b/backends/concrete-cuda/implementation/test/test_wop_bootstrap.cpp @@ -0,0 +1,289 @@ +#include "../include/bootstrap.h" +#include "../include/device.h" +#include "concrete-cpu.h" +#include "utils.h" +#include "gtest/gtest.h" +#include +#include +#include +#include + +const unsigned REPETITIONS = 5; +const unsigned SAMPLES = 100; + +typedef struct { + int lwe_dimension; + int glwe_dimension; + int polynomial_size; + double lwe_modular_variance; + double glwe_modular_variance; + int pbs_base_log; + int pbs_level; + int ks_base_log; + int ks_level; + int pksk_base_log; + int pksk_level; + int cbs_base_log; + int cbs_level; + int tau; +} WopBootstrapTestParams; + +class WopBootstrapTestPrimitives_u64 + : public ::testing::TestWithParam { +protected: + int lwe_dimension; + int glwe_dimension; + int polynomial_size; + double lwe_modular_variance; + double glwe_modular_variance; + int pbs_base_log; + int pbs_level; + int ks_base_log; + int ks_level; + int pksk_base_log; + int pksk_level; + int cbs_base_log; + int cbs_level; + int tau; + int p; + uint64_t delta; + uint32_t cbs_delta_log; + int delta_log; + int delta_log_lut; + Csprng *csprng; + cudaStream_t *stream; + int 
gpu_index = 0; + uint64_t *lwe_sk_in_array; + uint64_t *lwe_sk_out_array; + uint64_t *lwe_in_ct_array; + uint64_t *lwe_out_ct_array; + uint64_t *plaintexts; + double *d_fourier_bsk_array; + uint64_t *d_ksk_array; + uint64_t *d_pksk_array; + uint64_t *d_lwe_ct_in_array; + uint64_t *d_lwe_ct_out_array; + uint64_t *d_lut_vector; + int8_t *wop_pbs_buffer; + +public: + // Test arithmetic functions + void SetUp() { + stream = cuda_create_stream(0); + void *v_stream = (void *)stream; + + // TestParams + lwe_dimension = (int)GetParam().lwe_dimension; + glwe_dimension = (int)GetParam().glwe_dimension; + polynomial_size = (int)GetParam().polynomial_size; + lwe_modular_variance = (double)GetParam().lwe_modular_variance; + glwe_modular_variance = (double)GetParam().glwe_modular_variance; + pbs_base_log = (int)GetParam().pbs_base_log; + pbs_level = (int)GetParam().pbs_level; + ks_base_log = (int)GetParam().ks_base_log; + ks_level = (int)GetParam().ks_level; + pksk_base_log = (int)GetParam().pksk_base_log; + pksk_level = (int)GetParam().pksk_level; + cbs_base_log = (int)GetParam().cbs_base_log; + cbs_level = (int)GetParam().cbs_level; + tau = (int)GetParam().tau; + p = 10 / tau; + delta_log = 64 - p; + delta_log_lut = delta_log; + delta = (uint64_t)(1) << delta_log; + + // Create a Csprng + csprng = + (Csprng *)aligned_alloc(CONCRETE_CSPRNG_ALIGN, CONCRETE_CSPRNG_SIZE); + uint8_t seed[16] = {(uint8_t)0}; + concrete_cpu_construct_concrete_csprng( + csprng, Uint128{.little_endian_bytes = {*seed}}); + + int input_lwe_dimension = glwe_dimension * polynomial_size; + // Generate the keys + generate_lwe_secret_keys(&lwe_sk_in_array, input_lwe_dimension, csprng, REPETITIONS); + generate_lwe_secret_keys(&lwe_sk_out_array, lwe_dimension, csprng, REPETITIONS); + generate_lwe_keyswitch_keys(stream, gpu_index, &d_ksk_array, + lwe_sk_in_array, lwe_sk_out_array, + input_lwe_dimension, lwe_dimension, ks_level, + ks_base_log, csprng, lwe_modular_variance, REPETITIONS); + 
generate_lwe_bootstrap_keys( + stream, gpu_index, &d_fourier_bsk_array, lwe_sk_out_array, + lwe_sk_in_array, lwe_dimension, glwe_dimension, polynomial_size, + pbs_level, pbs_base_log, csprng, glwe_modular_variance, REPETITIONS); + generate_lwe_private_functional_keyswitch_key_lists( + stream, gpu_index, &d_pksk_array, lwe_sk_in_array, lwe_sk_in_array, + input_lwe_dimension, glwe_dimension, polynomial_size, pksk_level, + pksk_base_log, csprng, lwe_modular_variance, REPETITIONS); + plaintexts = generate_plaintexts(p, delta, tau, REPETITIONS, SAMPLES); + + // LUT creation + int lut_size = polynomial_size; + int lut_num = tau << (tau * p - (int)log2(polynomial_size)); // r + + uint64_t *big_lut = + (uint64_t *)malloc(lut_num * lut_size * sizeof(uint64_t)); + for (int t = tau - 1; t >= 0; t--) { + uint64_t *small_lut = big_lut + (ptrdiff_t)(t * (1 << (tau * p))); + for (uint64_t value = 0; value < (uint64_t)(1 << (tau * p)); value++) { + int nbits = t * p; + uint64_t x = (value >> nbits) & (uint64_t)((1 << p) - 1); + small_lut[value] = + ((x % (uint64_t)(1 << (64 - delta_log))) << delta_log_lut); + } + } + d_lut_vector = (uint64_t *)cuda_malloc_async( + lut_num * lut_size * sizeof(uint64_t), stream, gpu_index); + cuda_memcpy_async_to_gpu(d_lut_vector, big_lut, + lut_num * lut_size * sizeof(uint64_t), stream, + gpu_index); + free(big_lut); + // Execute scratch + scratch_cuda_wop_pbs_64(stream, gpu_index, &wop_pbs_buffer, + (uint32_t *)&delta_log, &cbs_delta_log, + glwe_dimension, lwe_dimension, polynomial_size, + cbs_level, pbs_level, p, p, tau, + cuda_get_max_shared_memory(gpu_index), true); + // Allocate input + d_lwe_ct_in_array = (uint64_t *)cuda_malloc_async( + (input_lwe_dimension + 1) * tau * sizeof(uint64_t), stream, gpu_index); + // Allocate output + d_lwe_ct_out_array = (uint64_t *)cuda_malloc_async( + (input_lwe_dimension + 1) * tau * sizeof(uint64_t), stream, gpu_index); + lwe_in_ct_array = + (uint64_t *)malloc((input_lwe_dimension + 1) * tau * 
sizeof(uint64_t)); + lwe_out_ct_array = + (uint64_t *)malloc((input_lwe_dimension + 1) * tau * sizeof(uint64_t)); + } + + void TearDown() { + void *v_stream = (void *)stream; + + cuda_synchronize_stream(v_stream); + concrete_cpu_destroy_concrete_csprng(csprng); + free(csprng); + free(lwe_sk_in_array); + free(lwe_sk_out_array); + free(plaintexts); + free(lwe_in_ct_array); + free(lwe_out_ct_array); + cleanup_cuda_circuit_bootstrap_vertical_packing(stream, gpu_index, + &wop_pbs_buffer); + cuda_drop_async(d_fourier_bsk_array, stream, gpu_index); + cuda_drop_async(d_ksk_array, stream, gpu_index); + cuda_drop_async(d_pksk_array, stream, gpu_index); + cuda_drop_async(d_lwe_ct_in_array, stream, gpu_index); + cuda_drop_async(d_lwe_ct_out_array, stream, gpu_index); + cuda_drop_async(d_lut_vector, stream, gpu_index); + cuda_destroy_stream(stream, gpu_index); + } +}; + +TEST_P(WopBootstrapTestPrimitives_u64, wop_pbs) { + void *v_stream = (void *)stream; + int input_lwe_dimension = glwe_dimension * polynomial_size; + int bsk_size = (glwe_dimension + 1) * (glwe_dimension + 1) * pbs_level * + polynomial_size * (lwe_dimension + 1); + int ksk_size = + ks_level * (lwe_dimension + 1) * glwe_dimension * polynomial_size; + int pksk_list_size = pksk_level * (glwe_dimension + 1) * polynomial_size * + (glwe_dimension * polynomial_size + 1) * + (glwe_dimension + 1); + for (uint r = 0; r < REPETITIONS; r++) { + double *d_fourier_bsk = d_fourier_bsk_array + (ptrdiff_t)(bsk_size * r); + uint64_t *lwe_sk_in = + lwe_sk_in_array + (ptrdiff_t)(r * glwe_dimension * polynomial_size); + uint64_t *d_ksk = d_ksk_array + (ptrdiff_t)(ksk_size * r); + uint64_t *d_pksk_list = d_pksk_array + (ptrdiff_t)(pksk_list_size * r); + for (uint s = 0; s < SAMPLES; s++) { + for (int t = 0; t < tau; t++) { + uint64_t plaintext = plaintexts[r * SAMPLES * tau + s * tau + t]; + uint64_t *lwe_in_ct = + lwe_in_ct_array + (ptrdiff_t)((r * SAMPLES * tau + s * tau + t) * + (input_lwe_dimension + 1)); + 
concrete_cpu_encrypt_lwe_ciphertext_u64( + lwe_sk_in, lwe_in_ct, plaintext, input_lwe_dimension, + lwe_modular_variance, csprng, &CONCRETE_CSPRNG_VTABLE); + } + cuda_synchronize_stream(v_stream); + cuda_memcpy_async_to_gpu(d_lwe_ct_in_array, lwe_in_ct_array, + (input_lwe_dimension + 1) * tau * + sizeof(uint64_t), + stream, gpu_index); + + // Execute wop pbs + cuda_wop_pbs_64(stream, gpu_index, (void *)d_lwe_ct_out_array, + (void *)d_lwe_ct_in_array, (void *)d_lut_vector, + (void *)d_fourier_bsk, (void *)d_ksk, (void *)d_pksk_list, + wop_pbs_buffer, cbs_delta_log, glwe_dimension, + lwe_dimension, polynomial_size, pbs_base_log, pbs_level, + ks_base_log, ks_level, pksk_base_log, pksk_level, + cbs_base_log, cbs_level, p, p, delta_log, tau, + cuda_get_max_shared_memory(gpu_index)); + + //// Copy result back + // cuda_memcpy_async_to_cpu(lwe_out_ct_array, d_lwe_ct_out_array, + //(input_lwe_dimension + 1) * tau * sizeof(uint64_t), stream, gpu_index); + // cuda_synchronize_stream(v_stream); + + // for (int i = 0; i < tau; i++) { + // uint64_t *result_ct = + // lwe_out_ct_array + (ptrdiff_t)(i * (input_lwe_dimension + 1)); + // uint64_t decrypted_message = 0; + // concrete_cpu_decrypt_lwe_ciphertext_u64( + // lwe_sk_in, result_ct, input_lwe_dimension, &decrypted_message); + // // Round after decryption + // uint64_t decrypted = + // closest_representable(decrypted_message, 1, p) >> delta_log; + // uint64_t expected = plaintext >> delta_log; + // EXPECT_EQ(decrypted, expected); + //} + } + } +} + +// Defines for which parameters set the PBS will be tested. 
+// It executes each test for all pairs on phis X qs (Cartesian product) +::testing::internal::ParamGenerator wop_pbs_params_u64 = + ::testing::Values( + // n, k, N, lwe_variance, glwe_variance, pbs_base_log, pbs_level, + // ks_base_log, ks_level, tau + (WopBootstrapTestParams){481, 2, 512, 7.52316384526264e-37, + 7.52316384526264e-37, 4, + 9, 1, 9, 4, 9, 6, 4, 1}//, + //(WopBootstrapTestParams){481, 2, 512, 7.52316384526264e-37, + // 7.52316384526264e-37, 4, 9, 1, 9, 4, 9, 6, 4, + // 2} //, + //(WopBootstrapTestParams){481, 2, 1024, 7.52316384526264e-37, + // 7.52316384526264e-37, 4, + // 9, 1, 9, 4, 9, 6, 4, 1}, + //(WopBootstrapTestParams){481, 2, 1024, 7.52316384526264e-37, + // 7.52316384526264e-37, 4, + // 9, 1, 9, 4, 9, 6, 4, 2} + ); + +std::string printParamName(::testing::TestParamInfo p) { + WopBootstrapTestParams params = p.param; + + std::string message = "Unknown_parameter_set"; + if (params.polynomial_size == 512) { + // When log_2_poly_size == 9 we have a cmux tree done with a single cmux. + message = "wop_pbs_cmux_tree_with_single_cmux_n_" + + std::to_string(params.lwe_dimension) + "_k_" + + std::to_string(params.glwe_dimension) + "_N_" + + std::to_string(params.polynomial_size) + "_tau_" + + std::to_string(params.tau); + } else if (params.polynomial_size == 1024) { + // When log_2_poly_size == 10 the VP skips the cmux tree. 
+ message = "wop_pbs_without_cmux_tree_n_" + + std::to_string(params.lwe_dimension) + "_k_" + + std::to_string(params.glwe_dimension) + "_N_" + + std::to_string(params.polynomial_size) + "_tau_" + + std::to_string(params.tau); + } + return message; +} + +INSTANTIATE_TEST_CASE_P(WopBootstrapInstantiation, + WopBootstrapTestPrimitives_u64, wop_pbs_params_u64, + printParamName); diff --git a/backends/concrete-cuda/implementation/test/utils.cpp b/backends/concrete-cuda/implementation/test/utils.cpp new file mode 100644 index 000000000..ff58a5ff6 --- /dev/null +++ b/backends/concrete-cuda/implementation/test/utils.cpp @@ -0,0 +1,290 @@ +#include "utils.h" +#include "../include/bootstrap.h" +#include "../include/device.h" +#include "concrete-cpu.h" +#include +#include +#include +#include +#include + +// For each sample and repetition, create a plaintext +// The payload_modulus is the message modulus times the carry modulus +// (so the total message modulus) +uint64_t *generate_plaintexts(uint64_t payload_modulus, uint64_t delta, + int number_of_inputs, const unsigned repetitions, const unsigned + samples) { + uint64_t *plaintext_array = (uint64_t *)malloc( + repetitions * samples * number_of_inputs * sizeof(uint64_t)); + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution dis( + std::numeric_limits::min(), + std::numeric_limits::max()); + for (uint r = 0; r < repetitions; r++) { + for (uint s = 0; s < samples; s++) { + for (int i = 0; i < number_of_inputs; i++) { + plaintext_array[r * samples * number_of_inputs + s * number_of_inputs + + i] = (dis(gen) % payload_modulus) * delta; + } + } + } + return plaintext_array; +} + +// Decompose value in r bits +// Bit decomposition of the value from MSB to LSB +uint64_t *bit_decompose_value(uint64_t value, int r) { + uint64_t *bit_array = (uint64_t *)malloc(r * sizeof(uint64_t)); + + uint64_t x = value; + for (int i = 0; i < r; i++) { + bit_array[i] = x & 1; + x >>= 1; + } + return bit_array; +} + 
+uint64_t *generate_identity_lut_pbs(int polynomial_size, int glwe_dimension, + int message_modulus, int carry_modulus, + std::function func) { + // Modulus of the msg contained in the msg bits and operations buffer + uint64_t modulus_sup = message_modulus * carry_modulus; + + // N/(p/2) = size of each block + uint64_t box_size = polynomial_size / modulus_sup; + + // Value of the shift we multiply our messages by + uint64_t delta = ((uint64_t)1 << 63) / (uint64_t)(modulus_sup); + + // Create the plaintext lut_pbs + uint64_t *plaintext_lut_pbs = + (uint64_t *)malloc(polynomial_size * sizeof(uint64_t)); + + // This plaintext_lut_pbs extracts the carry bits + for (uint64_t i = 0; i < modulus_sup; i++) { + uint64_t index = i * box_size; + for (uint64_t j = index; j < index + box_size; j++) { + plaintext_lut_pbs[j] = func(i) * delta; + } + } + + uint64_t half_box_size = box_size / 2; + + // Negate the first half_box_size coefficients + for (uint64_t i = 0; i < half_box_size; i++) { + plaintext_lut_pbs[i] = -plaintext_lut_pbs[i]; + } + + // Rotate the plaintext_lut_pbs + std::rotate(plaintext_lut_pbs, plaintext_lut_pbs + half_box_size, + plaintext_lut_pbs + polynomial_size); + + // Create the GLWE lut_pbs + uint64_t *lut_pbs = (uint64_t *)malloc( + polynomial_size * (glwe_dimension + 1) * sizeof(uint64_t)); + for (int i = 0; i < polynomial_size * glwe_dimension; i++) { + lut_pbs[i] = 0; + } + for (int i = 0; i < polynomial_size; i++) { + int glwe_index = glwe_dimension * polynomial_size + i; + lut_pbs[glwe_index] = plaintext_lut_pbs[i]; + } + + free(plaintext_lut_pbs); + return lut_pbs; +} + +uint64_t *generate_identity_lut_cmux_tree(int polynomial_size, int num_lut, + int tau, int delta_log) { + + // Create the plaintext lut_pbs + uint64_t *plaintext_lut_cmux_tree = + (uint64_t *)malloc(num_lut * tau * polynomial_size * sizeof(uint64_t)); + + // This plaintext_lut_cmux_tree extracts the carry bits + for (int tree = 0; tree < tau; tree++) + for (int i = 0; i < num_lut; 
i++) { + uint64_t *plaintext_lut_slice = plaintext_lut_cmux_tree + + i * polynomial_size + + tree * num_lut * polynomial_size; + uint64_t coeff = (((uint64_t)(i + tree) % (1 << (64 - delta_log)))) + << delta_log; + for (int p = 0; p < polynomial_size; p++) + plaintext_lut_slice[p] = coeff; + } + + return plaintext_lut_cmux_tree; +} + +// Generate repetitions LWE secret keys +void generate_lwe_secret_keys(uint64_t **lwe_sk_array, int lwe_dimension, + Csprng *csprng, const unsigned repetitions) { + int lwe_sk_array_size = lwe_dimension * repetitions; + *lwe_sk_array = (uint64_t *)malloc(lwe_sk_array_size * sizeof(uint64_t)); + int shift = 0; + for (uint r = 0; r < repetitions; r++) { + // Generate the lwe secret key for each repetition + concrete_cpu_init_secret_key_u64(*lwe_sk_array + (ptrdiff_t)(shift), + lwe_dimension, csprng, + &CONCRETE_CSPRNG_VTABLE); + shift += lwe_dimension; + } +} + +// Generate repetitions GLWE secret keys +void generate_glwe_secret_keys(uint64_t **glwe_sk_array, int glwe_dimension, + int polynomial_size, Csprng *csprng, const unsigned repetitions) { + int glwe_sk_array_size = glwe_dimension * polynomial_size * repetitions; + *glwe_sk_array = (uint64_t *)malloc(glwe_sk_array_size * sizeof(uint64_t)); + int shift = 0; + for (uint r = 0; r < repetitions; r++) { + // Generate the lwe secret key for each repetition + concrete_cpu_init_secret_key_u64(*glwe_sk_array + (ptrdiff_t)(shift), + glwe_dimension * polynomial_size, csprng, + &CONCRETE_CSPRNG_VTABLE); + shift += glwe_dimension * polynomial_size; + } +} + +// Generate repetitions LWE bootstrap keys +void generate_lwe_bootstrap_keys(cudaStream_t *stream, int gpu_index, + double **d_fourier_bsk_array, + uint64_t *lwe_sk_in_array, + uint64_t *lwe_sk_out_array, int lwe_dimension, + int glwe_dimension, int polynomial_size, + int pbs_level, int pbs_base_log, + Csprng *csprng, double variance, const unsigned repetitions) { + void *v_stream = (void *)stream; + int bsk_size = (glwe_dimension + 1) * 
(glwe_dimension + 1) * pbs_level * + polynomial_size * (lwe_dimension + 1); + int bsk_array_size = bsk_size * repetitions; + + uint64_t *bsk_array = (uint64_t *)malloc(bsk_array_size * sizeof(uint64_t)); + *d_fourier_bsk_array = (double *)cuda_malloc_async( + bsk_array_size * sizeof(double), stream, gpu_index); + int shift_in = 0; + int shift_out = 0; + int shift_bsk = 0; + + for (uint r = 0; r < repetitions; r++) { + // Generate the bootstrap key for each repetition + concrete_cpu_init_lwe_bootstrap_key_u64( + bsk_array + (ptrdiff_t)(shift_bsk), + lwe_sk_in_array + (ptrdiff_t)(shift_in), + lwe_sk_out_array + (ptrdiff_t)(shift_out), lwe_dimension, + polynomial_size, glwe_dimension, pbs_level, pbs_base_log, variance, + Parallelism(1), csprng, &CONCRETE_CSPRNG_VTABLE); + cuda_synchronize_stream(v_stream); + double *d_fourier_bsk = *d_fourier_bsk_array + (ptrdiff_t)(shift_bsk); + uint64_t *bsk = bsk_array + (ptrdiff_t)(shift_bsk); + cuda_synchronize_stream(v_stream); + cuda_convert_lwe_bootstrap_key_64( + (void *)(d_fourier_bsk), (void *)(bsk), v_stream, gpu_index, + lwe_dimension, glwe_dimension, pbs_level, polynomial_size); + shift_in += lwe_dimension; + shift_out += glwe_dimension * polynomial_size; + shift_bsk += bsk_size; + } + free(bsk_array); +} + +// Generate repetitions keyswitch keys +void generate_lwe_keyswitch_keys( + cudaStream_t *stream, int gpu_index, uint64_t **d_ksk_array, + uint64_t *lwe_sk_in_array, uint64_t *lwe_sk_out_array, + int input_lwe_dimension, int output_lwe_dimension, int ksk_level, + int ksk_base_log, Csprng *csprng, double variance, const unsigned repetitions) { + + int ksk_size = ksk_level * (output_lwe_dimension + 1) * input_lwe_dimension; + int ksk_array_size = ksk_size * repetitions; + + uint64_t *ksk_array = (uint64_t *)malloc(ksk_array_size * sizeof(uint64_t)); + *d_ksk_array = (uint64_t *)cuda_malloc_async( + ksk_array_size * sizeof(uint64_t), stream, gpu_index); + int shift_in = 0; + int shift_out = 0; + int shift_ksk = 0; + + 
for (uint r = 0; r < repetitions; r++) { + // Generate the keyswitch key for each repetition + concrete_cpu_init_lwe_keyswitch_key_u64( + ksk_array + (ptrdiff_t)(shift_ksk), + lwe_sk_in_array + (ptrdiff_t)(shift_in), + lwe_sk_out_array + (ptrdiff_t)(shift_out), input_lwe_dimension, + output_lwe_dimension, ksk_level, ksk_base_log, variance, csprng, + &CONCRETE_CSPRNG_VTABLE); + uint64_t *d_ksk = *d_ksk_array + (ptrdiff_t)(shift_ksk); + uint64_t *ksk = ksk_array + (ptrdiff_t)(shift_ksk); + cuda_memcpy_async_to_gpu(d_ksk, ksk, ksk_size * sizeof(uint64_t), stream, + gpu_index); + + shift_in += input_lwe_dimension; + shift_out += output_lwe_dimension; + shift_ksk += ksk_size; + } + free(ksk_array); +} + +// Generate repetitions private functional keyswitch key lists (with (k + 1) +// keys each) +void generate_lwe_private_functional_keyswitch_key_lists( + cudaStream_t *stream, int gpu_index, uint64_t **d_pksk_array, + uint64_t *lwe_sk_in_array, uint64_t *lwe_sk_out_array, + int input_lwe_dimension, int output_glwe_dimension, + int output_polynomial_size, int pksk_level, int pksk_base_log, + Csprng *csprng, double variance, const unsigned repetitions) { + + int pksk_list_size = pksk_level * (output_glwe_dimension + 1) * + output_polynomial_size * (input_lwe_dimension + 1) * + (output_glwe_dimension + 1); + int pksk_array_size = pksk_list_size * repetitions; + + uint64_t *pksk_array = (uint64_t *)malloc(pksk_array_size * sizeof(uint64_t)); + *d_pksk_array = (uint64_t *)cuda_malloc_async( + pksk_array_size * sizeof(uint64_t), stream, gpu_index); + int shift_in = 0; + int shift_out = 0; + int shift_pksk_list = 0; + + for (uint r = 0; r < repetitions; r++) { + // Generate the (k + 1) private functional keyswitch keys for each + // repetition + concrete_cpu_init_lwe_circuit_bootstrap_private_functional_packing_keyswitch_keys_u64( + pksk_array + (ptrdiff_t)(shift_pksk_list), + lwe_sk_in_array + (ptrdiff_t)(shift_in), + lwe_sk_out_array + (ptrdiff_t)(shift_out), 
input_lwe_dimension, + output_polynomial_size, output_glwe_dimension, pksk_level, + pksk_base_log, variance, Parallelism(1), csprng, + &CONCRETE_CSPRNG_VTABLE); + uint64_t *d_pksk_list = *d_pksk_array + (ptrdiff_t)(shift_pksk_list); + uint64_t *pksk_list = pksk_array + (ptrdiff_t)(shift_pksk_list); + cuda_memcpy_async_to_gpu(d_pksk_list, pksk_list, + pksk_list_size * sizeof(uint64_t), stream, + gpu_index); + + shift_in += input_lwe_dimension; + shift_out += output_glwe_dimension * output_polynomial_size; + shift_pksk_list += pksk_list_size; + } + free(pksk_array); +} + +// The closest number representable by the decomposition can be computed by +// performing the rounding at the appropriate bit. +uint64_t closest_representable(uint64_t input, int level_count, int base_log) { + // Compute the number of least significant bits which can not be represented + // by the decomposition + int non_rep_bit_count = 64 - (level_count * base_log); + // Generate a mask which captures the non representable bits + uint64_t one = 1; + uint64_t non_rep_mask = one << (non_rep_bit_count - 1); + // Retrieve the non representable bits + uint64_t non_rep_bits = input & non_rep_mask; + // Extract the msb of the non representable bits to perform the rounding + uint64_t non_rep_msb = non_rep_bits >> (non_rep_bit_count - 1); + // Remove the non-representable bits and perform the rounding + uint64_t res = input >> non_rep_bit_count; + res += non_rep_msb; + return res << non_rep_bit_count; +} \ No newline at end of file diff --git a/backends/concrete-cuda/implementation/test/utils.h b/backends/concrete-cuda/implementation/test/utils.h new file mode 100644 index 000000000..ef5b60e5a --- /dev/null +++ b/backends/concrete-cuda/implementation/test/utils.h @@ -0,0 +1,50 @@ +#ifndef TEST_UTILS_H +#define TEST_UTILS_H + +#include "../include/device.h" +#include "concrete-cpu.h" +#include + +uint64_t *generate_plaintexts(uint64_t payload_modulus, uint64_t delta, + int number_of_inputs, const unsigned 
repetitions, const unsigned + samples); + +uint64_t *generate_identity_lut_pbs(int polynomial_size, int glwe_dimension, + int message_modulus, int carry_modulus, + std::function func); + +uint64_t *generate_identity_lut_cmux_tree(int polynomial_size, int num_lut, + int tau, int delta_log); + +void generate_lwe_secret_keys(uint64_t **lwe_sk_array, int lwe_dimension, + Csprng *csprng, const unsigned repetitions); + +void generate_glwe_secret_keys(uint64_t **glwe_sk_array, int glwe_dimension, + int polynomial_size, Csprng *csprng, const unsigned repetitions); + +void generate_lwe_bootstrap_keys(cudaStream_t *stream, int gpu_index, + double **d_fourier_bsk_array, + uint64_t *lwe_sk_in_array, + uint64_t *lwe_sk_out_array, int lwe_dimension, + int glwe_dimension, int polynomial_size, + int pbs_level, int pbs_base_log, + Csprng *csprng, double variance, const unsigned repetitions); + +void generate_lwe_keyswitch_keys( + cudaStream_t *stream, int gpu_index, uint64_t **d_ksk_array, + uint64_t *lwe_sk_in_array, uint64_t *lwe_sk_out_array, + int input_lwe_dimension, int output_lwe_dimension, int ksk_level, + int ksk_base_log, Csprng *csprng, double variance, const unsigned repetitions); + +void generate_lwe_private_functional_keyswitch_key_lists( + cudaStream_t *stream, int gpu_index, uint64_t **d_pksk_array, + uint64_t *lwe_sk_in_array, uint64_t *lwe_sk_out_array, + int input_lwe_dimension, int output_glwe_dimension, + int output_polynomial_size, int pksk_level, int pksk_base_log, + Csprng *csprng, double variance, const unsigned repetitions); + +uint64_t closest_representable(uint64_t input, int level_count, int base_log); + +uint64_t *bit_decompose_value(uint64_t value, int r); + +#endif \ No newline at end of file