From 6efe5f3a3b04148bbc778c930cdb45a85437179a Mon Sep 17 00:00:00 2001
From: Agnes Leroy
Date: Thu, 16 Mar 2023 13:30:32 +0100
Subject: [PATCH] fix(concrete_cuda): fix cleartext mult and enhance linalg
 tests

---
 .../implementation/src/multiplication.cuh    |   4 +-
 .../implementation/test/test_keyswitch.cpp   |   2 +-
 .../test/test_linear_algebra.cpp             | 271 +++++++++++-------
 3 files changed, 168 insertions(+), 109 deletions(-)

diff --git a/backends/concrete-cuda/implementation/src/multiplication.cuh b/backends/concrete-cuda/implementation/src/multiplication.cuh
index 0fa7cc662..8c36caaec 100644
--- a/backends/concrete-cuda/implementation/src/multiplication.cuh
+++ b/backends/concrete-cuda/implementation/src/multiplication.cuh
@@ -16,8 +16,8 @@ cleartext_multiplication(T *output, T *lwe_input, T *cleartext_input,
                          uint32_t input_lwe_dimension,
                          uint32_t num_entries) {
   int tid = threadIdx.x;
-  if (tid < num_entries) {
-    int index = blockIdx.x * blockDim.x + tid;
+  int index = blockIdx.x * blockDim.x + tid;
+  if (index < num_entries) {
     int cleartext_index = index / (input_lwe_dimension + 1);
     // Here we take advantage of the wrapping behaviour of uint
     output[index] = lwe_input[index] * cleartext_input[cleartext_index];
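Note: the hunk above fixes an out-of-bounds access. The old guard compared
threadIdx.x (the index within a single block) against num_entries, so in any
multi-block launch every thread with threadIdx.x < num_entries passed the
check and threads of later blocks wrote past the end of the buffers. The fix
bounds the global index instead. A minimal standalone sketch of the corrected
pattern (illustrative names, not the library's API):

    template <typename T>
    __global__ void scale_entries(T *output, const T *lwe_input,
                                  const T *cleartext_input,
                                  uint32_t input_lwe_dimension,
                                  uint32_t num_entries) {
      // Global index across the whole grid, not just within this block
      int index = blockIdx.x * blockDim.x + threadIdx.x;
      if (index < num_entries) {
        // One cleartext per ciphertext of (input_lwe_dimension + 1) entries
        int cleartext_index = index / (input_lwe_dimension + 1);
        output[index] = lwe_input[index] * cleartext_input[cleartext_index];
      }
    }

    // A typical launch covers all entries with enough blocks:
    //   int threads = 256;
    //   int blocks = (num_entries + threads - 1) / threads;
    //   scale_entries<<<blocks, threads>>>(d_out, d_in, d_clear,
    //                                      lwe_dimension, num_entries);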
diff --git a/backends/concrete-cuda/implementation/test/test_keyswitch.cpp b/backends/concrete-cuda/implementation/test/test_keyswitch.cpp
index 4d555e24d..b346eae9c 100644
--- a/backends/concrete-cuda/implementation/test/test_keyswitch.cpp
+++ b/backends/concrete-cuda/implementation/test/test_keyswitch.cpp
@@ -158,7 +158,7 @@ TEST_P(KeyswitchTestPrimitives_u64, keyswitch) {
       // Compute the rounding bit
       uint64_t rounding = (decrypted & rounding_bit) << 1;
       uint64_t decoded = (decrypted + rounding) / delta;
-      ASSERT_EQ(decoded, plaintext / delta);
+      EXPECT_EQ(decoded, plaintext / delta);
     }
   }
 }
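Note: the ASSERT_EQ to EXPECT_EQ switch here, and throughout the tests below,
changes how a mismatch is reported, not what is checked: ASSERT_EQ returns
from the test body at the first failing sample, while EXPECT_EQ records the
failure and lets the remaining repetitions, samples, and inputs run, so one
bad decryption reports every failing index instead of hiding the rest. A toy
illustration (standard GoogleTest, not code from this patch):

    #include <gtest/gtest.h>

    TEST(Example, expect_vs_assert) {
      for (int i = 0; i < 4; i++) {
        EXPECT_EQ(i % 2, 0);  // logs failures at i = 1 and i = 3, keeps looping
        // ASSERT_EQ(i % 2, 0);  // would end the test body at i = 1
      }
    }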
diff --git a/backends/concrete-cuda/implementation/test/test_linear_algebra.cpp b/backends/concrete-cuda/implementation/test/test_linear_algebra.cpp
index edd116bcd..778112be2 100644
--- a/backends/concrete-cuda/implementation/test/test_linear_algebra.cpp
+++ b/backends/concrete-cuda/implementation/test/test_linear_algebra.cpp
@@ -16,6 +16,7 @@ typedef struct {
   double noise_variance;
   int message_modulus;
   int carry_modulus;
+  int number_of_inputs;
 } LinearAlgebraTestParams;
 
 class LinearAlgebraTestPrimitives_u64
@@ -25,6 +26,7 @@ protected:
   double noise_variance;
   int message_modulus;
   int carry_modulus;
+  int number_of_inputs;
   int payload_modulus;
   uint64_t delta;
   Csprng *csprng;
@@ -49,9 +51,10 @@ public:
     // TestParams
     lwe_dimension = (int)GetParam().lwe_dimension;
-    noise_variance = (int)GetParam().noise_variance;
+    noise_variance = (double)GetParam().noise_variance;
     message_modulus = (int)GetParam().message_modulus;
     carry_modulus = (int)GetParam().carry_modulus;
+    number_of_inputs = (int)GetParam().number_of_inputs;
     payload_modulus = message_modulus * carry_modulus;
     // Value of the shift we multiply our messages by
@@ -66,21 +69,27 @@ public:
     // Generate the keys
     generate_lwe_secret_keys(&lwe_sk_array, lwe_dimension, csprng,
                              REPETITIONS);
-    plaintexts_1 =
-        generate_plaintexts(payload_modulus, delta, 1, REPETITIONS, SAMPLES);
-    plaintexts_2 =
-        generate_plaintexts(payload_modulus, delta, 1, REPETITIONS, SAMPLES);
+    plaintexts_1 = generate_plaintexts(payload_modulus, delta, number_of_inputs,
+                                       REPETITIONS, SAMPLES);
+    plaintexts_2 = generate_plaintexts(payload_modulus, delta, number_of_inputs,
+                                       REPETITIONS, SAMPLES);
     d_lwe_in_1_ct = (uint64_t *)cuda_malloc_async(
-        (lwe_dimension + 1) * sizeof(uint64_t), stream, gpu_index);
+        number_of_inputs * (lwe_dimension + 1) * sizeof(uint64_t), stream,
+        gpu_index);
     d_lwe_in_2_ct = (uint64_t *)cuda_malloc_async(
-        (lwe_dimension + 1) * sizeof(uint64_t), stream, gpu_index);
+        number_of_inputs * (lwe_dimension + 1) * sizeof(uint64_t), stream,
+        gpu_index);
     d_lwe_out_ct = (uint64_t *)cuda_malloc_async(
-        (lwe_dimension + 1) * sizeof(uint64_t), stream, gpu_index);
+        number_of_inputs * (lwe_dimension + 1) * sizeof(uint64_t), stream,
+        gpu_index);
 
-    lwe_in_1_ct = (uint64_t *)malloc((lwe_dimension + 1) * sizeof(uint64_t));
-    lwe_in_2_ct = (uint64_t *)malloc((lwe_dimension + 1) * sizeof(uint64_t));
-    lwe_out_ct = (uint64_t *)malloc((lwe_dimension + 1) * sizeof(uint64_t));
+    lwe_in_1_ct = (uint64_t *)malloc(number_of_inputs * (lwe_dimension + 1) *
+                                     sizeof(uint64_t));
+    lwe_in_2_ct = (uint64_t *)malloc(number_of_inputs * (lwe_dimension + 1) *
+                                     sizeof(uint64_t));
+    lwe_out_ct = (uint64_t *)malloc(number_of_inputs * (lwe_dimension + 1) *
+                                    sizeof(uint64_t));
     cuda_synchronize_stream(v_stream);
   }
 
@@ -104,160 +113,210 @@ public:
 };
 
 TEST_P(LinearAlgebraTestPrimitives_u64, addition) {
-  void *v_stream = (void *)stream;
   // Here execute the addition
   for (uint r = 0; r < REPETITIONS; r++) {
     for (uint s = 0; s < SAMPLES; s++) {
-      uint64_t plaintext_1 = plaintexts_1[r * SAMPLES + s];
-      uint64_t plaintext_2 = plaintexts_2[r * SAMPLES + s];
       uint64_t *lwe_sk = lwe_sk_array + (ptrdiff_t)(r * lwe_dimension);
-      concrete_cpu_encrypt_lwe_ciphertext_u64(lwe_sk, lwe_in_1_ct, plaintext_1,
-                                              lwe_dimension, noise_variance,
-                                              csprng, &CONCRETE_CSPRNG_VTABLE);
-      concrete_cpu_encrypt_lwe_ciphertext_u64(lwe_sk, lwe_in_2_ct, plaintext_2,
-                                              lwe_dimension, noise_variance,
-                                              csprng, &CONCRETE_CSPRNG_VTABLE);
-      cuda_synchronize_stream(v_stream);
+      for (int i = 0; i < number_of_inputs; i++) {
+        uint64_t plaintext_1 = plaintexts_1[r * SAMPLES * number_of_inputs +
+                                            s * number_of_inputs + i];
+        uint64_t plaintext_2 = plaintexts_2[r * SAMPLES * number_of_inputs +
+                                            s * number_of_inputs + i];
+        concrete_cpu_encrypt_lwe_ciphertext_u64(
+            lwe_sk, lwe_in_1_ct + i * (lwe_dimension + 1), plaintext_1,
+            lwe_dimension, noise_variance, csprng, &CONCRETE_CSPRNG_VTABLE);
+        concrete_cpu_encrypt_lwe_ciphertext_u64(
+            lwe_sk, lwe_in_2_ct + i * (lwe_dimension + 1), plaintext_2,
+            lwe_dimension, noise_variance, csprng, &CONCRETE_CSPRNG_VTABLE);
+      }
       cuda_memcpy_async_to_gpu(d_lwe_in_1_ct, lwe_in_1_ct,
-                               (lwe_dimension + 1) * sizeof(uint64_t), stream,
-                               gpu_index);
+                               number_of_inputs * (lwe_dimension + 1) *
+                                   sizeof(uint64_t),
+                               stream, gpu_index);
       cuda_memcpy_async_to_gpu(d_lwe_in_2_ct, lwe_in_2_ct,
-                               (lwe_dimension + 1) * sizeof(uint64_t), stream,
-                               gpu_index);
+                               number_of_inputs * (lwe_dimension + 1) *
+                                   sizeof(uint64_t),
+                               stream, gpu_index);
       // Execute addition
       cuda_add_lwe_ciphertext_vector_64(
           stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_in_1_ct,
-          (void *)d_lwe_in_2_ct, lwe_dimension, 1);
+          (void *)d_lwe_in_2_ct, lwe_dimension, number_of_inputs);
       // Copy result back
       cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct,
-                               (lwe_dimension + 1) * sizeof(uint64_t), stream,
-                               gpu_index);
-      uint64_t decrypted = 0;
-      concrete_cpu_decrypt_lwe_ciphertext_u64(lwe_sk, lwe_out_ct, lwe_dimension,
-                                              &decrypted);
-      // The bit before the message
-      uint64_t rounding_bit = delta >> 1;
-      // Compute the rounding bit
-      uint64_t rounding = (decrypted & rounding_bit) << 1;
-      uint64_t decoded = (decrypted + rounding) / delta;
-      ASSERT_EQ(decoded, (plaintext_1 + plaintext_2) / delta);
-      cuda_synchronize_stream(v_stream);
+                               number_of_inputs * (lwe_dimension + 1) *
+                                   sizeof(uint64_t),
+                               stream, gpu_index);
+      for (int i = 0; i < number_of_inputs; i++) {
+        uint64_t plaintext_1 = plaintexts_1[r * SAMPLES * number_of_inputs +
+                                            s * number_of_inputs + i];
+        uint64_t plaintext_2 = plaintexts_2[r * SAMPLES * number_of_inputs +
+                                            s * number_of_inputs + i];
+        uint64_t decrypted = 0;
+        concrete_cpu_decrypt_lwe_ciphertext_u64(
+            lwe_sk, lwe_out_ct + i * (lwe_dimension + 1), lwe_dimension,
+            &decrypted);
+        // The bit before the message
+        uint64_t rounding_bit = delta >> 1;
+        // Compute the rounding bit
+        uint64_t rounding = (decrypted & rounding_bit) << 1;
+        uint64_t decoded = (decrypted + rounding) / delta;
+        EXPECT_EQ(decoded, (plaintext_1 + plaintext_2) / delta);
+      }
     }
   }
 }
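Note: with number_of_inputs ciphertexts per sample, the plaintext arrays are
laid out as [repetition][sample][input], flattened row-major, which is why
the tests index them with
r * SAMPLES * number_of_inputs + s * number_of_inputs + i. A hypothetical
helper (not part of the patch) making that layout explicit:

    #include <cstddef>

    // Index into a REPETITIONS x SAMPLES x number_of_inputs array flattened
    // row-major, as filled by generate_plaintexts
    static inline size_t plaintext_index(size_t r, size_t s, size_t i,
                                         size_t samples,
                                         size_t number_of_inputs) {
      return (r * samples + s) * number_of_inputs + i;
    }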
 
 TEST_P(LinearAlgebraTestPrimitives_u64, plaintext_addition) {
-  void *v_stream = (void *)stream;
   // Here execute the plaintext addition
   for (uint r = 0; r < REPETITIONS; r++) {
     for (uint s = 0; s < SAMPLES; s++) {
-      uint64_t plaintext_1 = plaintexts_1[r * SAMPLES + s];
-      uint64_t plaintext_2 = plaintexts_2[r * SAMPLES + s];
       uint64_t *lwe_sk = lwe_sk_array + (ptrdiff_t)(r * lwe_dimension);
-      concrete_cpu_encrypt_lwe_ciphertext_u64(lwe_sk, lwe_in_1_ct, plaintext_1,
-                                              lwe_dimension, noise_variance,
-                                              csprng, &CONCRETE_CSPRNG_VTABLE);
-      cuda_synchronize_stream(v_stream);
+      for (int i = 0; i < number_of_inputs; i++) {
+        uint64_t plaintext_1 = plaintexts_1[r * SAMPLES * number_of_inputs +
+                                            s * number_of_inputs + i];
+        concrete_cpu_encrypt_lwe_ciphertext_u64(
+            lwe_sk, lwe_in_1_ct + i * (lwe_dimension + 1), plaintext_1,
+            lwe_dimension, noise_variance, csprng, &CONCRETE_CSPRNG_VTABLE);
+      }
       cuda_memcpy_async_to_gpu(d_lwe_in_1_ct, lwe_in_1_ct,
-                               (lwe_dimension + 1) * sizeof(uint64_t), stream,
-                               gpu_index);
-      cuda_memcpy_async_to_gpu(d_lwe_in_2_ct, &plaintext_2, sizeof(uint64_t),
+                               number_of_inputs * (lwe_dimension + 1) *
+                                   sizeof(uint64_t),
                                stream, gpu_index);
+      cuda_memcpy_async_to_gpu(
+          d_lwe_in_2_ct,
+          &plaintexts_2[r * SAMPLES * number_of_inputs + s * number_of_inputs],
+          number_of_inputs * sizeof(uint64_t), stream, gpu_index);
       // Execute addition
       cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
          stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_in_1_ct,
-          (void *)d_lwe_in_2_ct, lwe_dimension, 1);
+          (void *)d_lwe_in_2_ct, lwe_dimension, number_of_inputs);
       // Copy result back
       cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct,
-                               (lwe_dimension + 1) * sizeof(uint64_t), stream,
-                               gpu_index);
-      uint64_t decrypted = 0;
-      concrete_cpu_decrypt_lwe_ciphertext_u64(lwe_sk, lwe_out_ct, lwe_dimension,
-                                              &decrypted);
-      // The bit before the message
-      uint64_t rounding_bit = delta >> 1;
-      // Compute the rounding bit
-      uint64_t rounding = (decrypted & rounding_bit) << 1;
-      uint64_t decoded = (decrypted + rounding) / delta;
-      ASSERT_EQ(decoded, (plaintext_1 + plaintext_2) / delta);
-      cuda_synchronize_stream(v_stream);
+                               number_of_inputs * (lwe_dimension + 1) *
+                                   sizeof(uint64_t),
+                               stream, gpu_index);
+      for (int i = 0; i < number_of_inputs; i++) {
+        uint64_t plaintext_1 = plaintexts_1[r * SAMPLES * number_of_inputs +
+                                            s * number_of_inputs + i];
+        uint64_t plaintext_2 = plaintexts_2[r * SAMPLES * number_of_inputs +
+                                            s * number_of_inputs + i];
+        uint64_t decrypted = 0;
+        concrete_cpu_decrypt_lwe_ciphertext_u64(
+            lwe_sk, lwe_out_ct + i * (lwe_dimension + 1), lwe_dimension,
+            &decrypted);
+        // The bit before the message
+        uint64_t rounding_bit = delta >> 1;
+        // Compute the rounding bit
+        uint64_t rounding = (decrypted & rounding_bit) << 1;
+        uint64_t decoded = (decrypted + rounding) / delta;
+        EXPECT_EQ(decoded, (plaintext_1 + plaintext_2) / delta);
+      }
     }
   }
 }
 
-TEST_P(LinearAlgebraTestPrimitives_u64, plaintext_multiplication) {
+TEST_P(LinearAlgebraTestPrimitives_u64, cleartext_multiplication) {
   void *v_stream = (void *)stream;
+  uint64_t delta_2 =
+      ((uint64_t)(1) << 63) / (uint64_t)(payload_modulus * payload_modulus);
   // Here execute the cleartext multiplication
   for (uint r = 0; r < REPETITIONS; r++) {
     for (uint s = 0; s < SAMPLES; s++) {
-      uint64_t plaintext_1 = plaintexts_1[r * SAMPLES + s];
-      uint64_t plaintext_2 = plaintexts_2[r * SAMPLES + s];
       uint64_t *lwe_sk = lwe_sk_array + (ptrdiff_t)(r * lwe_dimension);
-      concrete_cpu_encrypt_lwe_ciphertext_u64(lwe_sk, lwe_in_1_ct, plaintext_1,
-                                              lwe_dimension, noise_variance,
-                                              csprng, &CONCRETE_CSPRNG_VTABLE);
+      uint64_t *cleartext_array =
+          (uint64_t *)malloc(number_of_inputs * sizeof(uint64_t));
+      for (int i = 0; i < number_of_inputs; i++) {
+        uint64_t plaintext_1 = plaintexts_1[r * SAMPLES * number_of_inputs +
+                                            s * number_of_inputs + i] /
+                               delta * delta_2;
+        uint64_t plaintext_2 = plaintexts_2[r * SAMPLES * number_of_inputs +
+                                            s * number_of_inputs + i];
+        cleartext_array[i] = plaintext_2 / delta;
+        concrete_cpu_encrypt_lwe_ciphertext_u64(
+            lwe_sk, lwe_in_1_ct + i * (lwe_dimension + 1), plaintext_1,
+            lwe_dimension, noise_variance, csprng, &CONCRETE_CSPRNG_VTABLE);
+      }
       cuda_synchronize_stream(v_stream);
       cuda_memcpy_async_to_gpu(d_lwe_in_1_ct, lwe_in_1_ct,
-                               (lwe_dimension + 1) * sizeof(uint64_t), stream,
-                               gpu_index);
-      cuda_memcpy_async_to_gpu(d_lwe_in_2_ct, &plaintext_1, sizeof(uint64_t),
+                               number_of_inputs * (lwe_dimension + 1) *
+                                   sizeof(uint64_t),
                                stream, gpu_index);
-      // Execute addition
+      cuda_memcpy_async_to_gpu(d_lwe_in_2_ct, cleartext_array,
+                               number_of_inputs * sizeof(uint64_t), stream,
+                               gpu_index);
+      // Execute cleartext multiplication
       cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
           stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_in_1_ct,
-          (void *)d_lwe_in_2_ct, lwe_dimension, 1);
+          (void *)d_lwe_in_2_ct, lwe_dimension, number_of_inputs);
       // Copy result back
       cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct,
-                               (lwe_dimension + 1) * sizeof(uint64_t), stream,
-                               gpu_index);
-      uint64_t decrypted = 0;
-      concrete_cpu_decrypt_lwe_ciphertext_u64(lwe_sk, lwe_out_ct, lwe_dimension,
-                                              &decrypted);
-      // The bit before the message
-      uint64_t rounding_bit = delta >> 1;
-      // Compute the rounding bit
-      uint64_t rounding = (decrypted & rounding_bit) << 1;
-      uint64_t decoded = (decrypted + rounding) / delta;
-      ASSERT_EQ(decoded, (plaintext_1 * plaintext_2) / delta);
+                               number_of_inputs * (lwe_dimension + 1) *
+                                   sizeof(uint64_t),
+                               stream, gpu_index);
       cuda_synchronize_stream(v_stream);
+      for (int i = 0; i < number_of_inputs; i++) {
+        uint64_t plaintext = plaintexts_1[r * SAMPLES * number_of_inputs +
+                                          s * number_of_inputs + i] /
+                             delta * delta_2;
+        uint64_t cleartext = plaintexts_2[r * SAMPLES * number_of_inputs +
+                                          s * number_of_inputs + i] /
+                             delta;
+        uint64_t decrypted = 0;
+        concrete_cpu_decrypt_lwe_ciphertext_u64(
+            lwe_sk, lwe_out_ct + i * (lwe_dimension + 1), lwe_dimension,
+            &decrypted);
+        // The bit before the message
+        uint64_t rounding_bit = delta_2 >> 1;
+        // Compute the rounding bit
+        uint64_t rounding = (decrypted & rounding_bit) << 1;
+        uint64_t decoded = (decrypted + rounding) / delta_2;
+        EXPECT_EQ(decoded, plaintext / delta_2 * cleartext);
+      }
+      free(cleartext_array);
     }
   }
 }
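Note: the cleartext_multiplication test re-encodes its inputs with
delta_2 = 2^63 / payload_modulus^2 instead of delta = 2^63 / payload_modulus,
because the decoded result is message * cleartext, which can be as large as
payload_modulus^2 and would overflow the message space under the original
encoding. A standalone, noise-free arithmetic check of that reasoning, with
payload_modulus = 4 as in the parameter set at the bottom of the file:

    #include <cassert>
    #include <cstdint>

    int main() {
      uint64_t payload_modulus = 4; // message_modulus * carry_modulus = 2 * 2
      uint64_t delta_2 =
          ((uint64_t)1 << 63) / (payload_modulus * payload_modulus); // 2^59
      uint64_t message = 3, cleartext = 2; // product 6 < payload_modulus^2
      uint64_t encoded = message * delta_2;   // noiseless stand-in for the body
      uint64_t product = encoded * cleartext; // what the GPU kernel computes
      assert(product / delta_2 == message * cleartext); // decodes to 6
      return 0;
    }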
 
 TEST_P(LinearAlgebraTestPrimitives_u64, negate) {
-  void *v_stream = (void *)stream;
   // Here execute the negation
   for (uint r = 0; r < REPETITIONS; r++) {
     for (uint s = 0; s < SAMPLES; s++) {
-      uint64_t plaintext = plaintexts_1[r * SAMPLES + s];
       uint64_t *lwe_sk = lwe_sk_array + (ptrdiff_t)(r * lwe_dimension);
-      concrete_cpu_encrypt_lwe_ciphertext_u64(lwe_sk, lwe_in_1_ct, plaintext,
-                                              lwe_dimension, noise_variance,
-                                              csprng, &CONCRETE_CSPRNG_VTABLE);
-      cuda_synchronize_stream(v_stream);
+      for (int i = 0; i < number_of_inputs; i++) {
+        uint64_t plaintext = plaintexts_1[r * SAMPLES * number_of_inputs +
+                                          s * number_of_inputs + i];
+        concrete_cpu_encrypt_lwe_ciphertext_u64(
+            lwe_sk, lwe_in_1_ct + i * (lwe_dimension + 1), plaintext,
+            lwe_dimension, noise_variance, csprng, &CONCRETE_CSPRNG_VTABLE);
+      }
       cuda_memcpy_async_to_gpu(d_lwe_in_1_ct, lwe_in_1_ct,
-                               (lwe_dimension + 1) * sizeof(uint64_t), stream,
-                               gpu_index);
+                               number_of_inputs * (lwe_dimension + 1) *
+                                   sizeof(uint64_t),
+                               stream, gpu_index);
       // Execute negation
       cuda_negate_lwe_ciphertext_vector_64(
           stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_in_1_ct,
-          lwe_dimension, 1);
+          lwe_dimension, number_of_inputs);
       // Copy result back
       cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct,
-                               (lwe_dimension + 1) * sizeof(uint64_t), stream,
-                               gpu_index);
-      uint64_t decrypted = 0;
-      concrete_cpu_decrypt_lwe_ciphertext_u64(lwe_sk, lwe_out_ct, lwe_dimension,
-                                              &decrypted);
-      // The bit before the message
-      uint64_t rounding_bit = delta >> 1;
-      // Compute the rounding bit
-      uint64_t rounding = (decrypted & rounding_bit) << 1;
-      uint64_t decoded = (decrypted + rounding) / delta;
-      ASSERT_EQ(decoded, -plaintext / delta);
-      cuda_synchronize_stream(v_stream);
+                               number_of_inputs * (lwe_dimension + 1) *
+                                   sizeof(uint64_t),
+                               stream, gpu_index);
+      for (int i = 0; i < number_of_inputs; i++) {
+        uint64_t plaintext = plaintexts_1[r * SAMPLES * number_of_inputs +
+                                          s * number_of_inputs + i];
+        uint64_t decrypted = 0;
+        concrete_cpu_decrypt_lwe_ciphertext_u64(
+            lwe_sk, lwe_out_ct + i * (lwe_dimension + 1), lwe_dimension,
+            &decrypted);
+        // The bit before the message
+        uint64_t rounding_bit = delta >> 1;
+        // Compute the rounding bit
+        uint64_t rounding = (decrypted & rounding_bit) << 1;
+        uint64_t decoded = (decrypted + rounding) / delta;
+        EXPECT_EQ(decoded, -plaintext / delta);
+      }
     }
   }
 }
 
@@ -266,8 +325,8 @@ TEST_P(LinearAlgebraTestPrimitives_u64, negate) {
 // tested. It executes each test for all pairs on phis X qs (Cartesian product)
 ::testing::internal::ParamGenerator<LinearAlgebraTestParams>
     linear_algebra_params_u64 = ::testing::Values(
-        // n, lwe_std_dev, message_modulus, carry_modulus
-        (LinearAlgebraTestParams){600, 0.000007069849454709433, 4, 4});
+        // n, noise_variance, message_modulus, carry_modulus, number_of_inputs
+        (LinearAlgebraTestParams){600, 7.52316384526264e-37, 2, 2, 10});
 
 std::string printParamName(::testing::TestParamInfo<LinearAlgebraTestParams> p) {