diff --git a/backends/concrete-cuda/implementation/test/CMakeLists.txt b/backends/concrete-cuda/implementation/test/CMakeLists.txt index 9e34d0e7d..c5a1b6720 100644 --- a/backends/concrete-cuda/implementation/test/CMakeLists.txt +++ b/backends/concrete-cuda/implementation/test/CMakeLists.txt @@ -21,6 +21,7 @@ ExternalProject_Add( CONFIGURE_COMMAND "" BUILD_COMMAND cargo build --release COMMAND cargo build --release BINARY_DIR ${CONCRETE_CPU_BINARY_DIR} + BUILD_ALWAYS true INSTALL_COMMAND "" LOG_BUILD ON) diff --git a/backends/concrete-cuda/implementation/test/test_bit_extraction.cpp b/backends/concrete-cuda/implementation/test/test_bit_extraction.cpp index b83e7edcc..6762ca1e3 100644 --- a/backends/concrete-cuda/implementation/test/test_bit_extraction.cpp +++ b/backends/concrete-cuda/implementation/test/test_bit_extraction.cpp @@ -55,6 +55,8 @@ protected: uint64_t *d_lwe_in_ct_array; uint64_t *d_lwe_out_ct_array; int8_t *bit_extract_buffer; + int input_lwe_dimension; + int output_lwe_dimension; public: // Test arithmetic functions @@ -85,24 +87,21 @@ public: concrete_cpu_construct_concrete_csprng( csprng, Uint128{.little_endian_bytes = {*seed}}); - int input_lwe_dimension = glwe_dimension * polynomial_size; - int output_lwe_dimension = lwe_dimension; + input_lwe_dimension = glwe_dimension * polynomial_size; + output_lwe_dimension = lwe_dimension; // Generate the keys - generate_lwe_secret_keys(&lwe_sk_in_array, input_lwe_dimension, csprng, - REPETITIONS); - generate_lwe_secret_keys(&lwe_sk_out_array, output_lwe_dimension, csprng, - REPETITIONS); + generate_lwe_secret_keys(&lwe_sk_in_array, input_lwe_dimension, csprng, REPETITIONS); + generate_lwe_secret_keys(&lwe_sk_out_array, output_lwe_dimension, csprng, REPETITIONS); generate_lwe_keyswitch_keys( stream, gpu_index, &d_ksk_array, lwe_sk_in_array, lwe_sk_out_array, input_lwe_dimension, output_lwe_dimension, ks_level, ks_base_log, csprng, lwe_modular_variance, REPETITIONS); generate_lwe_bootstrap_keys( stream, gpu_index, &d_fourier_bsk_array, lwe_sk_out_array, - lwe_sk_in_array, lwe_dimension, glwe_dimension, polynomial_size, + lwe_sk_in_array, output_lwe_dimension, glwe_dimension, polynomial_size, pbs_level, pbs_base_log, csprng, glwe_modular_variance, REPETITIONS); - plaintexts = - generate_plaintexts(number_of_bits_of_message_including_padding, delta, - number_of_inputs, REPETITIONS, SAMPLES); + plaintexts = generate_plaintexts( + number_of_bits_of_message_including_padding, delta, number_of_inputs, REPETITIONS, SAMPLES); d_lwe_out_ct_array = (uint64_t *)cuda_malloc_async( (output_lwe_dimension + 1) * number_of_bits_to_extract * @@ -148,15 +147,15 @@ public: TEST_P(BitExtractionTestPrimitives_u64, bit_extraction) { void *v_stream = (void *)stream; int bsk_size = (glwe_dimension + 1) * (glwe_dimension + 1) * pbs_level * - polynomial_size * (lwe_dimension + 1); + polynomial_size * (output_lwe_dimension + 1); int ksk_size = - ks_level * (lwe_dimension + 1) * glwe_dimension * polynomial_size; + ks_level * input_lwe_dimension * (output_lwe_dimension + 1); for (uint r = 0; r < REPETITIONS; r++) { double *d_fourier_bsk = d_fourier_bsk_array + (ptrdiff_t)(bsk_size * r); uint64_t *d_ksk = d_ksk_array + (ptrdiff_t)(ksk_size * r); uint64_t *lwe_in_sk = - lwe_sk_in_array + (ptrdiff_t)(glwe_dimension * polynomial_size * r); - uint64_t *lwe_sk_out = lwe_sk_out_array + (ptrdiff_t)(r * lwe_dimension); + lwe_sk_in_array + (ptrdiff_t)(input_lwe_dimension * r); + uint64_t *lwe_sk_out = lwe_sk_out_array + (ptrdiff_t)(r * output_lwe_dimension); for (uint s = 0; s < SAMPLES; s++) { for (int i = 0; i < number_of_inputs; i++) { uint64_t plaintext = plaintexts[r * SAMPLES * number_of_inputs + @@ -164,15 +163,13 @@ TEST_P(BitExtractionTestPrimitives_u64, bit_extraction) { uint64_t *lwe_in_ct = lwe_in_ct_array + (ptrdiff_t)( - (r * SAMPLES * number_of_inputs + s * number_of_inputs + i) * - (glwe_dimension * polynomial_size + 1)); + i * (input_lwe_dimension + 1)); concrete_cpu_encrypt_lwe_ciphertext_u64( - lwe_in_sk, lwe_in_ct, plaintext, glwe_dimension * polynomial_size, + lwe_in_sk, lwe_in_ct, plaintext, input_lwe_dimension, lwe_modular_variance, csprng, &CONCRETE_CSPRNG_VTABLE); } - cuda_synchronize_stream(v_stream); cuda_memcpy_async_to_gpu(d_lwe_in_ct_array, lwe_in_ct_array, - (glwe_dimension * polynomial_size + 1) * + (input_lwe_dimension + 1) * number_of_inputs * sizeof(uint64_t), stream, gpu_index); @@ -181,31 +178,29 @@ TEST_P(BitExtractionTestPrimitives_u64, bit_extraction) { stream, gpu_index, (void *)d_lwe_out_ct_array, (void *)d_lwe_in_ct_array, bit_extract_buffer, (void *)d_ksk, (void *)d_fourier_bsk, number_of_bits_to_extract, delta_log, - glwe_dimension * polynomial_size, lwe_dimension, glwe_dimension, + input_lwe_dimension, output_lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log, pbs_level, ks_base_log, ks_level, number_of_inputs, cuda_get_max_shared_memory(gpu_index)); // Copy result back - cuda_synchronize_stream(v_stream); cuda_memcpy_async_to_cpu(lwe_out_ct_array, d_lwe_out_ct_array, - (lwe_dimension + 1) * number_of_bits_to_extract * + (output_lwe_dimension + 1) * number_of_bits_to_extract * number_of_inputs * sizeof(uint64_t), stream, gpu_index); cuda_synchronize_stream(v_stream); - for (int j = 0; j < number_of_inputs; j++) { uint64_t *result_array = lwe_out_ct_array + - (ptrdiff_t)(j * number_of_bits_to_extract * (lwe_dimension + 1)); + (ptrdiff_t)(j * number_of_bits_to_extract * (output_lwe_dimension + 1)); uint64_t plaintext = plaintexts[r * SAMPLES * number_of_inputs + s * number_of_inputs + j]; for (int i = 0; i < number_of_bits_to_extract; i++) { uint64_t *result_ct = result_array + (ptrdiff_t)((number_of_bits_to_extract - 1 - i) * - (lwe_dimension + 1)); + (output_lwe_dimension + 1)); uint64_t decrypted_message = 0; concrete_cpu_decrypt_lwe_ciphertext_u64( - lwe_sk_out, result_ct, lwe_dimension, &decrypted_message); + lwe_sk_out, result_ct, output_lwe_dimension, &decrypted_message); // Round after decryption uint64_t decrypted_rounded = closest_representable(decrypted_message, 1, 1); @@ -225,13 +220,11 @@ TEST_P(BitExtractionTestPrimitives_u64, bit_extraction) { bit_extract_params_u64 = ::testing::Values( // n, k, N, lwe_variance, glwe_variance, pbs_base_log, pbs_level, // ks_base_log, ks_level, number_of_message_bits, - // number_of_bits_to_extract + // number_of_bits_to_extract, number_of_inputs (BitExtractionTestParams){585, 1, 1024, 7.52316384526264e-37, - 7.52316384526264e-37, 10, 2, 4, 7, 5, 5, - 1}); //, -// (BitExtractionTestParams){585, 1, 1024, 7.52316384526264e-37, -// 7.52316384526264e-37, 10, 2, 4, 7, 5, 5, -// 2}); + 7.52316384526264e-37, 10, 2, 4, 7, 5, 5, 1}, + (BitExtractionTestParams){481, 1, 1024, 7.52316384526264e-37, + 7.52316384526264e-37, 4, 7, 1, 9, 5, 5, 1}); std::string printParamName(::testing::TestParamInfo p) { diff --git a/backends/concrete-cuda/implementation/test/test_bootstrap.cpp b/backends/concrete-cuda/implementation/test/test_bootstrap.cpp index fa4493209..cb6e744d8 100644 --- a/backends/concrete-cuda/implementation/test/test_bootstrap.cpp +++ b/backends/concrete-cuda/implementation/test/test_bootstrap.cpp @@ -81,17 +81,15 @@ public: csprng, Uint128{.little_endian_bytes = {*seed}}); // Generate the keys - generate_lwe_secret_keys(&lwe_sk_in_array, lwe_dimension, csprng, - REPETITIONS); + generate_lwe_secret_keys(&lwe_sk_in_array, lwe_dimension, csprng, REPETITIONS); generate_lwe_secret_keys(&lwe_sk_out_array, - glwe_dimension * polynomial_size, csprng, - REPETITIONS); + glwe_dimension * polynomial_size, csprng, REPETITIONS); generate_lwe_bootstrap_keys( stream, gpu_index, &d_fourier_bsk_array, lwe_sk_in_array, lwe_sk_out_array, lwe_dimension, glwe_dimension, polynomial_size, pbs_level, pbs_base_log, csprng, glwe_modular_variance, REPETITIONS); - plaintexts = generate_plaintexts(payload_modulus, delta, number_of_inputs, - REPETITIONS, SAMPLES); + plaintexts = generate_plaintexts(payload_modulus, delta, number_of_inputs, REPETITIONS, + SAMPLES); // Create the LUT uint64_t *lut_pbs_identity = generate_identity_lut_pbs( @@ -227,6 +225,10 @@ TEST_P(BootstrapTestPrimitives_u64, amortized_bootstrap) { } TEST_P(BootstrapTestPrimitives_u64, low_latency_bootstrap) { + int number_of_sm = 0; + cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0); + if(number_of_inputs > number_of_sm * 4 / (glwe_dimension + 1) / pbs_level) + GTEST_SKIP() << "The Low Latency PBS does not support this configuration"; uint64_t *lwe_ct_out_array = (uint64_t *)malloc((glwe_dimension * polynomial_size + 1) * number_of_inputs * sizeof(uint64_t)); diff --git a/backends/concrete-cuda/implementation/test/test_wop_bootstrap.cpp b/backends/concrete-cuda/implementation/test/test_wop_bootstrap.cpp index ba35befa6..57212623f 100644 --- a/backends/concrete-cuda/implementation/test/test_wop_bootstrap.cpp +++ b/backends/concrete-cuda/implementation/test/test_wop_bootstrap.cpp @@ -9,7 +9,7 @@ #include const unsigned REPETITIONS = 5; -const unsigned SAMPLES = 100; +const unsigned SAMPLES = 10; typedef struct { int lwe_dimension; @@ -65,11 +65,13 @@ protected: uint64_t *d_lwe_ct_out_array; uint64_t *d_lut_vector; int8_t *wop_pbs_buffer; + int input_lwe_dimension; public: // Test arithmetic functions void SetUp() { stream = cuda_create_stream(0); + void *v_stream = (void *)stream; // TestParams lwe_dimension = (int)GetParam().lwe_dimension; @@ -98,16 +100,14 @@ public: concrete_cpu_construct_concrete_csprng( csprng, Uint128{.little_endian_bytes = {*seed}}); - int input_lwe_dimension = glwe_dimension * polynomial_size; + input_lwe_dimension = glwe_dimension * polynomial_size; // Generate the keys - generate_lwe_secret_keys(&lwe_sk_in_array, input_lwe_dimension, csprng, - REPETITIONS); - generate_lwe_secret_keys(&lwe_sk_out_array, lwe_dimension, csprng, - REPETITIONS); - generate_lwe_keyswitch_keys( - stream, gpu_index, &d_ksk_array, lwe_sk_in_array, lwe_sk_out_array, - input_lwe_dimension, lwe_dimension, ks_level, ks_base_log, csprng, - lwe_modular_variance, REPETITIONS); + generate_lwe_secret_keys(&lwe_sk_in_array, input_lwe_dimension, csprng, REPETITIONS); + generate_lwe_secret_keys(&lwe_sk_out_array, lwe_dimension, csprng, REPETITIONS); + generate_lwe_keyswitch_keys(stream, gpu_index, &d_ksk_array, + lwe_sk_in_array, lwe_sk_out_array, + input_lwe_dimension, lwe_dimension, ks_level, + ks_base_log, csprng, lwe_modular_variance, REPETITIONS); generate_lwe_bootstrap_keys( stream, gpu_index, &d_fourier_bsk_array, lwe_sk_out_array, lwe_sk_in_array, lwe_dimension, glwe_dimension, polynomial_size, @@ -138,7 +138,6 @@ public: cuda_memcpy_async_to_gpu(d_lut_vector, big_lut, lut_num * lut_size * sizeof(uint64_t), stream, gpu_index); - free(big_lut); // Execute scratch scratch_cuda_wop_pbs_64(stream, gpu_index, &wop_pbs_buffer, (uint32_t *)&delta_log, &cbs_delta_log, @@ -155,6 +154,9 @@ public: (uint64_t *)malloc((input_lwe_dimension + 1) * tau * sizeof(uint64_t)); lwe_out_ct_array = (uint64_t *)malloc((input_lwe_dimension + 1) * tau * sizeof(uint64_t)); + + cuda_synchronize_stream(v_stream); + free(big_lut); } void TearDown() { @@ -200,13 +202,11 @@ TEST_P(WopBootstrapTestPrimitives_u64, wop_pbs) { for (int t = 0; t < tau; t++) { uint64_t plaintext = plaintexts[r * SAMPLES * tau + s * tau + t]; uint64_t *lwe_in_ct = - lwe_in_ct_array + (ptrdiff_t)((r * SAMPLES * tau + s * tau + t) * - (input_lwe_dimension + 1)); + lwe_in_ct_array + (ptrdiff_t)(t * (input_lwe_dimension + 1)); concrete_cpu_encrypt_lwe_ciphertext_u64( lwe_sk_in, lwe_in_ct, plaintext, input_lwe_dimension, lwe_modular_variance, csprng, &CONCRETE_CSPRNG_VTABLE); } - cuda_synchronize_stream(v_stream); cuda_memcpy_async_to_gpu(d_lwe_ct_in_array, lwe_in_ct_array, (input_lwe_dimension + 1) * tau * sizeof(uint64_t), @@ -223,22 +223,23 @@ TEST_P(WopBootstrapTestPrimitives_u64, wop_pbs) { cuda_get_max_shared_memory(gpu_index)); //// Copy result back - // cuda_memcpy_async_to_cpu(lwe_out_ct_array, d_lwe_ct_out_array, - //(input_lwe_dimension + 1) * tau * sizeof(uint64_t), stream, gpu_index); - // cuda_synchronize_stream(v_stream); + cuda_memcpy_async_to_cpu(lwe_out_ct_array, d_lwe_ct_out_array, + (input_lwe_dimension + 1) * tau * sizeof(uint64_t), stream, gpu_index); + cuda_synchronize_stream(v_stream); - // for (int i = 0; i < tau; i++) { - // uint64_t *result_ct = - // lwe_out_ct_array + (ptrdiff_t)(i * (input_lwe_dimension + 1)); - // uint64_t decrypted_message = 0; - // concrete_cpu_decrypt_lwe_ciphertext_u64( - // lwe_sk_in, result_ct, input_lwe_dimension, &decrypted_message); - // // Round after decryption - // uint64_t decrypted = - // closest_representable(decrypted_message, 1, p) >> delta_log; - // uint64_t expected = plaintext >> delta_log; - // EXPECT_EQ(decrypted, expected); - //} + for (int i = 0; i < tau; i++) { + uint64_t plaintext = plaintexts[r * SAMPLES * tau + s * tau + i]; + uint64_t *result_ct = + lwe_out_ct_array + (ptrdiff_t)(i * (input_lwe_dimension + 1)); + uint64_t decrypted_message = 0; + concrete_cpu_decrypt_lwe_ciphertext_u64( + lwe_sk_in, result_ct, input_lwe_dimension, &decrypted_message); + // Round after decryption + uint64_t decrypted = + closest_representable(decrypted_message, 1, p) >> delta_log; + uint64_t expected = plaintext >> delta_log; + EXPECT_EQ(decrypted, expected); + } } } } @@ -250,17 +251,17 @@ TEST_P(WopBootstrapTestPrimitives_u64, wop_pbs) { // n, k, N, lwe_variance, glwe_variance, pbs_base_log, pbs_level, // ks_base_log, ks_level, tau (WopBootstrapTestParams){481, 2, 512, 7.52316384526264e-37, - 7.52316384526264e-37, 4, 9, 1, 9, 4, 9, 6, 4, - 1} //, - //(WopBootstrapTestParams){481, 2, 512, 7.52316384526264e-37, - // 7.52316384526264e-37, 4, 9, 1, 9, 4, 9, 6, 4, - // 2} //, - //(WopBootstrapTestParams){481, 2, 1024, 7.52316384526264e-37, - // 7.52316384526264e-37, 4, - // 9, 1, 9, 4, 9, 6, 4, 1}, - //(WopBootstrapTestParams){481, 2, 1024, 7.52316384526264e-37, - // 7.52316384526264e-37, 4, - // 9, 1, 9, 4, 9, 6, 4, 2} + 7.52316384526264e-37, 4, + 9, 1, 9, 4, 9, 6, 4, 1} +// (WopBootstrapTestParams){481, 2, 512, 7.52316384526264e-37, +// 7.52316384526264e-37, 4, 9, 1, 9, 4, 9, 6, 4, +// 2} , +// (WopBootstrapTestParams){481, 2, 1024, 7.52316384526264e-37, +// 7.52316384526264e-37, 4, +// 9, 1, 9, 4, 9, 6, 4, 1}, +// (WopBootstrapTestParams){481, 2, 1024, 7.52316384526264e-37, +// 7.52316384526264e-37, 4, +// 9, 1, 9, 4, 9, 6, 4, 2} ); std::string printParamName(::testing::TestParamInfo p) { diff --git a/backends/concrete-cuda/implementation/test/utils.cpp b/backends/concrete-cuda/implementation/test/utils.cpp index 1938ee5c1..3dff50b3b 100644 --- a/backends/concrete-cuda/implementation/test/utils.cpp +++ b/backends/concrete-cuda/implementation/test/utils.cpp @@ -12,8 +12,9 @@ // The payload_modulus is the message modulus times the carry modulus // (so the total message modulus) uint64_t *generate_plaintexts(uint64_t payload_modulus, uint64_t delta, - int number_of_inputs, const unsigned repetitions, - const unsigned samples) { + int number_of_inputs, const unsigned repetitions, const unsigned + samples) { + uint64_t *plaintext_array = (uint64_t *)malloc( repetitions * samples * number_of_inputs * sizeof(uint64_t)); std::random_device rd; @@ -120,8 +121,7 @@ uint64_t *generate_identity_lut_cmux_tree(int polynomial_size, int num_lut, // Generate repetitions LWE secret keys void generate_lwe_secret_keys(uint64_t **lwe_sk_array, int lwe_dimension, Csprng *csprng, const unsigned repetitions) { - int lwe_sk_array_size = lwe_dimension * repetitions; - *lwe_sk_array = (uint64_t *)malloc(lwe_sk_array_size * sizeof(uint64_t)); + *lwe_sk_array = (uint64_t *)malloc(lwe_dimension * repetitions * sizeof(uint64_t)); int shift = 0; for (uint r = 0; r < repetitions; r++) { // Generate the lwe secret key for each repetition @@ -134,8 +134,7 @@ void generate_lwe_secret_keys(uint64_t **lwe_sk_array, int lwe_dimension, // Generate repetitions GLWE secret keys void generate_glwe_secret_keys(uint64_t **glwe_sk_array, int glwe_dimension, - int polynomial_size, Csprng *csprng, - const unsigned repetitions) { + int polynomial_size, Csprng *csprng, const unsigned repetitions) { int glwe_sk_array_size = glwe_dimension * polynomial_size * repetitions; *glwe_sk_array = (uint64_t *)malloc(glwe_sk_array_size * sizeof(uint64_t)); int shift = 0; @@ -149,11 +148,13 @@ void generate_glwe_secret_keys(uint64_t **glwe_sk_array, int glwe_dimension, } // Generate repetitions LWE bootstrap keys -void generate_lwe_bootstrap_keys( - cudaStream_t *stream, int gpu_index, double **d_fourier_bsk_array, - uint64_t *lwe_sk_in_array, uint64_t *lwe_sk_out_array, int lwe_dimension, - int glwe_dimension, int polynomial_size, int pbs_level, int pbs_base_log, - Csprng *csprng, double variance, const unsigned repetitions) { +void generate_lwe_bootstrap_keys(cudaStream_t *stream, int gpu_index, + double **d_fourier_bsk_array, + uint64_t *lwe_sk_in_array, + uint64_t *lwe_sk_out_array, int lwe_dimension, + int glwe_dimension, int polynomial_size, + int pbs_level, int pbs_base_log, + Csprng *csprng, double variance, const unsigned repetitions) { void *v_stream = (void *)stream; int bsk_size = (glwe_dimension + 1) * (glwe_dimension + 1) * pbs_level * polynomial_size * (lwe_dimension + 1); @@ -174,7 +175,6 @@ void generate_lwe_bootstrap_keys( lwe_sk_out_array + (ptrdiff_t)(shift_out), lwe_dimension, polynomial_size, glwe_dimension, pbs_level, pbs_base_log, variance, Parallelism(1), csprng, &CONCRETE_CSPRNG_VTABLE); - cuda_synchronize_stream(v_stream); double *d_fourier_bsk = *d_fourier_bsk_array + (ptrdiff_t)(shift_bsk); uint64_t *bsk = bsk_array + (ptrdiff_t)(shift_bsk); cuda_synchronize_stream(v_stream); @@ -185,18 +185,16 @@ void generate_lwe_bootstrap_keys( shift_out += glwe_dimension * polynomial_size; shift_bsk += bsk_size; } + cuda_synchronize_stream(v_stream); free(bsk_array); } // Generate repetitions keyswitch keys -void generate_lwe_keyswitch_keys(cudaStream_t *stream, int gpu_index, - uint64_t **d_ksk_array, - uint64_t *lwe_sk_in_array, - uint64_t *lwe_sk_out_array, - int input_lwe_dimension, - int output_lwe_dimension, int ksk_level, - int ksk_base_log, Csprng *csprng, - double variance, const unsigned repetitions) { +void generate_lwe_keyswitch_keys( + cudaStream_t *stream, int gpu_index, uint64_t **d_ksk_array, + uint64_t *lwe_sk_in_array, uint64_t *lwe_sk_out_array, + int input_lwe_dimension, int output_lwe_dimension, int ksk_level, + int ksk_base_log, Csprng *csprng, double variance, const unsigned repetitions) { int ksk_size = ksk_level * (output_lwe_dimension + 1) * input_lwe_dimension; int ksk_array_size = ksk_size * repetitions; @@ -225,6 +223,7 @@ void generate_lwe_keyswitch_keys(cudaStream_t *stream, int gpu_index, shift_out += output_lwe_dimension; shift_ksk += ksk_size; } + cuda_synchronize_stream(stream); free(ksk_array); }