From 6efe5f3a3b04148bbc778c930cdb45a85437179a Mon Sep 17 00:00:00 2001
From: Agnes Leroy
Date: Thu, 16 Mar 2023 13:30:32 +0100
Subject: [PATCH] fix(concrete_cuda): fix cleartext mult and enhance linalg
 tests

---
 .../implementation/src/multiplication.cuh    |   4 +-
 .../implementation/test/test_keyswitch.cpp   |   2 +-
 .../test/test_linear_algebra.cpp             | 271 +++++++++++-------
 3 files changed, 168 insertions(+), 109 deletions(-)

diff --git a/backends/concrete-cuda/implementation/src/multiplication.cuh b/backends/concrete-cuda/implementation/src/multiplication.cuh
index 0fa7cc662..8c36caaec 100644
--- a/backends/concrete-cuda/implementation/src/multiplication.cuh
+++ b/backends/concrete-cuda/implementation/src/multiplication.cuh
@@ -16,8 +16,8 @@ cleartext_multiplication(T *output, T *lwe_input, T *cleartext_input,
                          uint32_t input_lwe_dimension,
                          uint32_t num_entries) {
   int tid = threadIdx.x;
-  if (tid < num_entries) {
-    int index = blockIdx.x * blockDim.x + tid;
+  int index = blockIdx.x * blockDim.x + tid;
+  if (index < num_entries) {
     int cleartext_index = index / (input_lwe_dimension + 1);
     // Here we take advantage of the wrapping behaviour of uint
     output[index] = lwe_input[index] * cleartext_input[cleartext_index];
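Note: the hunk above fixes an out-of-bounds access. The old guard compared
threadIdx.x (the index within a single block) against num_entries, so in any
multi-block launch every thread with threadIdx.x < num_entries passed the
check and threads of later blocks wrote past the end of the buffers. The fix
bounds the global index instead. A minimal standalone sketch of the corrected
pattern (illustrative names, not the library's API):

    template <typename T>
    __global__ void scale_entries(T *output, const T *lwe_input,
                                  const T *cleartext_input,
                                  uint32_t input_lwe_dimension,
                                  uint32_t num_entries) {
      // Global index across the whole grid, not just within this block
      int index = blockIdx.x * blockDim.x + threadIdx.x;
      if (index < num_entries) {
        // One cleartext per ciphertext of (input_lwe_dimension + 1) entries
        int cleartext_index = index / (input_lwe_dimension + 1);
        output[index] = lwe_input[index] * cleartext_input[cleartext_index];
      }
    }

    // A typical launch covers all entries with enough blocks:
    //   int threads = 256;
    //   int blocks = (num_entries + threads - 1) / threads;
    //   scale_entries<<<blocks, threads>>>(d_out, d_in, d_clear,
    //                                      lwe_dimension, num_entries);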
diff --git a/backends/concrete-cuda/implementation/test/test_keyswitch.cpp b/backends/concrete-cuda/implementation/test/test_keyswitch.cpp
index 4d555e24d..b346eae9c 100644
--- a/backends/concrete-cuda/implementation/test/test_keyswitch.cpp
+++ b/backends/concrete-cuda/implementation/test/test_keyswitch.cpp
@@ -158,7 +158,7 @@ TEST_P(KeyswitchTestPrimitives_u64, keyswitch) {
       // Compute the rounding bit
       uint64_t rounding = (decrypted & rounding_bit) << 1;
       uint64_t decoded = (decrypted + rounding) / delta;
-      ASSERT_EQ(decoded, plaintext / delta);
+      EXPECT_EQ(decoded, plaintext / delta);
     }
   }
 }
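Note: the ASSERT_EQ to EXPECT_EQ switch here, and throughout the tests below,
changes how a mismatch is reported, not what is checked: ASSERT_EQ returns
from the test body at the first failing sample, while EXPECT_EQ records the
failure and lets the remaining repetitions, samples, and inputs run, so one
bad decryption reports every failing index instead of hiding the rest. A toy
illustration (standard GoogleTest, not code from this patch):

    #include <gtest/gtest.h>

    TEST(Example, expect_vs_assert) {
      for (int i = 0; i < 4; i++) {
        EXPECT_EQ(i % 2, 0);  // logs failures at i = 1 and i = 3, keeps looping
        // ASSERT_EQ(i % 2, 0);  // would end the test body at i = 1
      }
    }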
diff --git a/backends/concrete-cuda/implementation/test/test_linear_algebra.cpp b/backends/concrete-cuda/implementation/test/test_linear_algebra.cpp
index edd116bcd..778112be2 100644
--- a/backends/concrete-cuda/implementation/test/test_linear_algebra.cpp
+++ b/backends/concrete-cuda/implementation/test/test_linear_algebra.cpp
@@ -16,6 +16,7 @@ typedef struct {
   double noise_variance;
   int message_modulus;
   int carry_modulus;
+  int number_of_inputs;
 } LinearAlgebraTestParams;
 
 class LinearAlgebraTestPrimitives_u64
@@ -25,6 +26,7 @@ protected:
   double noise_variance;
   int message_modulus;
   int carry_modulus;
+  int number_of_inputs;
   int payload_modulus;
   uint64_t delta;
   Csprng *csprng;
@@ -49,9 +51,10 @@ public:
     // TestParams
     lwe_dimension = (int)GetParam().lwe_dimension;
-    noise_variance = (int)GetParam().noise_variance;
+    noise_variance = (double)GetParam().noise_variance;
     message_modulus = (int)GetParam().message_modulus;
     carry_modulus = (int)GetParam().carry_modulus;
+    number_of_inputs = (int)GetParam().number_of_inputs;
     payload_modulus = message_modulus * carry_modulus;
     // Value of the shift we multiply our messages by
@@ -66,21 +69,27 @@ public:
     // Generate the keys
     generate_lwe_secret_keys(&lwe_sk_array, lwe_dimension, csprng,
                              REPETITIONS);
-    plaintexts_1 =
-        generate_plaintexts(payload_modulus, delta, 1, REPETITIONS, SAMPLES);
-    plaintexts_2 =
-        generate_plaintexts(payload_modulus, delta, 1, REPETITIONS, SAMPLES);
+    plaintexts_1 = generate_plaintexts(payload_modulus, delta, number_of_inputs,
+                                       REPETITIONS, SAMPLES);
+    plaintexts_2 = generate_plaintexts(payload_modulus, delta, number_of_inputs,
+                                       REPETITIONS, SAMPLES);
     d_lwe_in_1_ct = (uint64_t *)cuda_malloc_async(
-        (lwe_dimension + 1) * sizeof(uint64_t), stream, gpu_index);
+        number_of_inputs * (lwe_dimension + 1) * sizeof(uint64_t), stream,
+        gpu_index);
     d_lwe_in_2_ct = (uint64_t *)cuda_malloc_async(
-        (lwe_dimension + 1) * sizeof(uint64_t), stream, gpu_index);
+        number_of_inputs * (lwe_dimension + 1) * sizeof(uint64_t), stream,
+        gpu_index);
     d_lwe_out_ct = (uint64_t *)cuda_malloc_async(
-        (lwe_dimension + 1) * sizeof(uint64_t), stream, gpu_index);
+        number_of_inputs * (lwe_dimension + 1) * sizeof(uint64_t), stream,
+        gpu_index);
 
-    lwe_in_1_ct = (uint64_t *)malloc((lwe_dimension + 1) * sizeof(uint64_t));
-    lwe_in_2_ct = (uint64_t *)malloc((lwe_dimension + 1) * sizeof(uint64_t));
-    lwe_out_ct = (uint64_t *)malloc((lwe_dimension + 1) * sizeof(uint64_t));
+    lwe_in_1_ct = (uint64_t *)malloc(number_of_inputs * (lwe_dimension + 1) *
+                                     sizeof(uint64_t));
+    lwe_in_2_ct = (uint64_t *)malloc(number_of_inputs * (lwe_dimension + 1) *
+                                     sizeof(uint64_t));
+    lwe_out_ct = (uint64_t *)malloc(number_of_inputs * (lwe_dimension + 1) *
+                                    sizeof(uint64_t));
     cuda_synchronize_stream(v_stream);
   }
 
@@ -104,160 +113,210 @@ public:
 };
 
 TEST_P(LinearAlgebraTestPrimitives_u64, addition) {
-  void *v_stream = (void *)stream;
   // Here execute the addition
   for (uint r = 0; r < REPETITIONS; r++) {
     for (uint s = 0; s < SAMPLES; s++) {
-      uint64_t plaintext_1 = plaintexts_1[r * SAMPLES + s];
-      uint64_t plaintext_2 = plaintexts_2[r * SAMPLES + s];
       uint64_t *lwe_sk = lwe_sk_array + (ptrdiff_t)(r * lwe_dimension);
-      concrete_cpu_encrypt_lwe_ciphertext_u64(lwe_sk, lwe_in_1_ct, plaintext_1,
-                                              lwe_dimension, noise_variance,
-                                              csprng, &CONCRETE_CSPRNG_VTABLE);
-      concrete_cpu_encrypt_lwe_ciphertext_u64(lwe_sk, lwe_in_2_ct, plaintext_2,
-                                              lwe_dimension, noise_variance,
-                                              csprng, &CONCRETE_CSPRNG_VTABLE);
-      cuda_synchronize_stream(v_stream);
+      for (int i = 0; i < number_of_inputs; i++) {
+        uint64_t plaintext_1 = plaintexts_1[r * SAMPLES * number_of_inputs +
+                                            s * number_of_inputs + i];
+        uint64_t plaintext_2 = plaintexts_2[r * SAMPLES * number_of_inputs +
+                                            s * number_of_inputs + i];
+        concrete_cpu_encrypt_lwe_ciphertext_u64(
+            lwe_sk, lwe_in_1_ct + i * (lwe_dimension + 1), plaintext_1,
+            lwe_dimension, noise_variance, csprng, &CONCRETE_CSPRNG_VTABLE);
+        concrete_cpu_encrypt_lwe_ciphertext_u64(
+            lwe_sk, lwe_in_2_ct + i * (lwe_dimension + 1), plaintext_2,
+            lwe_dimension, noise_variance, csprng, &CONCRETE_CSPRNG_VTABLE);
+      }
       cuda_memcpy_async_to_gpu(d_lwe_in_1_ct, lwe_in_1_ct,
-                               (lwe_dimension + 1) * sizeof(uint64_t), stream,
-                               gpu_index);
+                               number_of_inputs * (lwe_dimension + 1) *
+                                   sizeof(uint64_t),
+                               stream, gpu_index);
       cuda_memcpy_async_to_gpu(d_lwe_in_2_ct, lwe_in_2_ct,
-                               (lwe_dimension + 1) * sizeof(uint64_t), stream,
-                               gpu_index);
+                               number_of_inputs * (lwe_dimension + 1) *
+                                   sizeof(uint64_t),
+                               stream, gpu_index);
       // Execute addition
       cuda_add_lwe_ciphertext_vector_64(
           stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_in_1_ct,
-          (void *)d_lwe_in_2_ct, lwe_dimension, 1);
+          (void *)d_lwe_in_2_ct, lwe_dimension, number_of_inputs);
       // Copy result back
       cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct,
-                               (lwe_dimension + 1) * sizeof(uint64_t), stream,
-                               gpu_index);
-      uint64_t decrypted = 0;
-      concrete_cpu_decrypt_lwe_ciphertext_u64(lwe_sk, lwe_out_ct, lwe_dimension,
-                                              &decrypted);
-      // The bit before the message
-      uint64_t rounding_bit = delta >> 1;
-      // Compute the rounding bit
-      uint64_t rounding = (decrypted & rounding_bit) << 1;
-      uint64_t decoded = (decrypted + rounding) / delta;
-      ASSERT_EQ(decoded, (plaintext_1 + plaintext_2) / delta);
-      cuda_synchronize_stream(v_stream);
+                               number_of_inputs * (lwe_dimension + 1) *
+                                   sizeof(uint64_t),
+                               stream, gpu_index);
+      for (int i = 0; i < number_of_inputs; i++) {
+        uint64_t plaintext_1 = plaintexts_1[r * SAMPLES * number_of_inputs +
+                                            s * number_of_inputs + i];
+        uint64_t plaintext_2 = plaintexts_2[r * SAMPLES * number_of_inputs +
+                                            s * number_of_inputs + i];
+        uint64_t decrypted = 0;
+        concrete_cpu_decrypt_lwe_ciphertext_u64(
+            lwe_sk, lwe_out_ct + i * (lwe_dimension + 1), lwe_dimension,
+            &decrypted);
+        // The bit before the message
+        uint64_t rounding_bit = delta >> 1;
+        // Compute the rounding bit
+        uint64_t rounding = (decrypted & rounding_bit) << 1;
+        uint64_t decoded = (decrypted + rounding) / delta;
+        EXPECT_EQ(decoded, (plaintext_1 + plaintext_2) / delta);
+      }
     }
   }
 }
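Note: with number_of_inputs ciphertexts per sample, the plaintext arrays are
laid out as [repetition][sample][input], flattened row-major, which is why
the tests index them with
r * SAMPLES * number_of_inputs + s * number_of_inputs + i. A hypothetical
helper (not part of the patch) making that layout explicit:

    #include <cstddef>

    // Index into a REPETITIONS x SAMPLES x number_of_inputs array flattened
    // row-major, as filled by generate_plaintexts
    static inline size_t plaintext_index(size_t r, size_t s, size_t i,
                                         size_t samples,
                                         size_t number_of_inputs) {
      return (r * samples + s) * number_of_inputs + i;
    }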
 
 TEST_P(LinearAlgebraTestPrimitives_u64, plaintext_addition) {
-  void *v_stream = (void *)stream;
   // Here execute the plaintext addition
   for (uint r = 0; r < REPETITIONS; r++) {
     for (uint s = 0; s < SAMPLES; s++) {
-      uint64_t plaintext_1 = plaintexts_1[r * SAMPLES + s];
-      uint64_t plaintext_2 = plaintexts_2[r * SAMPLES + s];
       uint64_t *lwe_sk = lwe_sk_array + (ptrdiff_t)(r * lwe_dimension);
-      concrete_cpu_encrypt_lwe_ciphertext_u64(lwe_sk, lwe_in_1_ct, plaintext_1,
-                                              lwe_dimension, noise_variance,
-                                              csprng, &CONCRETE_CSPRNG_VTABLE);
-      cuda_synchronize_stream(v_stream);
+      for (int i = 0; i < number_of_inputs; i++) {
+        uint64_t plaintext_1 = plaintexts_1[r * SAMPLES * number_of_inputs +
+                                            s * number_of_inputs + i];
+        concrete_cpu_encrypt_lwe_ciphertext_u64(
+            lwe_sk, lwe_in_1_ct + i * (lwe_dimension + 1), plaintext_1,
+            lwe_dimension, noise_variance, csprng, &CONCRETE_CSPRNG_VTABLE);
+      }
       cuda_memcpy_async_to_gpu(d_lwe_in_1_ct, lwe_in_1_ct,
-                               (lwe_dimension + 1) * sizeof(uint64_t), stream,
-                               gpu_index);
-      cuda_memcpy_async_to_gpu(d_lwe_in_2_ct, &plaintext_2, sizeof(uint64_t),
+                               number_of_inputs * (lwe_dimension + 1) *
+                                   sizeof(uint64_t),
                                stream, gpu_index);
+      cuda_memcpy_async_to_gpu(
+          d_lwe_in_2_ct,
+          &plaintexts_2[r * SAMPLES * number_of_inputs + s * number_of_inputs],
+          number_of_inputs * sizeof(uint64_t), stream, gpu_index);
       // Execute addition
       cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
          stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_in_1_ct,
-          (void *)d_lwe_in_2_ct, lwe_dimension, 1);
+          (void *)d_lwe_in_2_ct, lwe_dimension, number_of_inputs);
       // Copy result back
       cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct,
-                               (lwe_dimension + 1) * sizeof(uint64_t), stream,
-                               gpu_index);
-      uint64_t decrypted = 0;
-      concrete_cpu_decrypt_lwe_ciphertext_u64(lwe_sk, lwe_out_ct, lwe_dimension,
-                                              &decrypted);
-      // The bit before the message
-      uint64_t rounding_bit = delta >> 1;
-      // Compute the rounding bit
-      uint64_t rounding = (decrypted & rounding_bit) << 1;
-      uint64_t decoded = (decrypted + rounding) / delta;
-      ASSERT_EQ(decoded, (plaintext_1 + plaintext_2) / delta);
-      cuda_synchronize_stream(v_stream);
+                               number_of_inputs * (lwe_dimension + 1) *
+                                   sizeof(uint64_t),
+                               stream, gpu_index);
+      for (int i = 0; i < number_of_inputs; i++) {
+        uint64_t plaintext_1 = plaintexts_1[r * SAMPLES * number_of_inputs +
+                                            s * number_of_inputs + i];
+        uint64_t plaintext_2 = plaintexts_2[r * SAMPLES * number_of_inputs +
+                                            s * number_of_inputs + i];
+        uint64_t decrypted = 0;
+        concrete_cpu_decrypt_lwe_ciphertext_u64(
+            lwe_sk, lwe_out_ct + i * (lwe_dimension + 1), lwe_dimension,
+            &decrypted);
+        // The bit before the message
+        uint64_t rounding_bit = delta >> 1;
+        // Compute the rounding bit
+        uint64_t rounding = (decrypted & rounding_bit) << 1;
+        uint64_t decoded = (decrypted + rounding) / delta;
+        EXPECT_EQ(decoded, (plaintext_1 + plaintext_2) / delta);
+      }
     }
   }
 }
 
-TEST_P(LinearAlgebraTestPrimitives_u64, plaintext_multiplication) {
+TEST_P(LinearAlgebraTestPrimitives_u64, cleartext_multiplication) {
   void *v_stream = (void *)stream;
+  uint64_t delta_2 =
+      ((uint64_t)(1) << 63) / (uint64_t)(payload_modulus * payload_modulus);
   // Here execute the cleartext multiplication
   for (uint r = 0; r < REPETITIONS; r++) {
     for (uint s = 0; s < SAMPLES; s++) {
-      uint64_t plaintext_1 = plaintexts_1[r * SAMPLES + s];
-      uint64_t plaintext_2 = plaintexts_2[r * SAMPLES + s];
       uint64_t *lwe_sk = lwe_sk_array + (ptrdiff_t)(r * lwe_dimension);
-      concrete_cpu_encrypt_lwe_ciphertext_u64(lwe_sk, lwe_in_1_ct, plaintext_1,
-                                              lwe_dimension, noise_variance,
-                                              csprng, &CONCRETE_CSPRNG_VTABLE);
+      uint64_t *cleartext_array =
+          (uint64_t *)malloc(number_of_inputs * sizeof(uint64_t));
+      for (int i = 0; i < number_of_inputs; i++) {
+        uint64_t plaintext_1 = plaintexts_1[r * SAMPLES * number_of_inputs +
+                                            s * number_of_inputs + i] /
+                               delta * delta_2;
+        uint64_t plaintext_2 = plaintexts_2[r * SAMPLES * number_of_inputs +
+                                            s * number_of_inputs + i];
+        cleartext_array[i] = plaintext_2 / delta;
+        concrete_cpu_encrypt_lwe_ciphertext_u64(
+            lwe_sk, lwe_in_1_ct + i * (lwe_dimension + 1), plaintext_1,
+            lwe_dimension, noise_variance, csprng, &CONCRETE_CSPRNG_VTABLE);
+      }
       cuda_synchronize_stream(v_stream);
       cuda_memcpy_async_to_gpu(d_lwe_in_1_ct, lwe_in_1_ct,
-                               (lwe_dimension + 1) * sizeof(uint64_t), stream,
-                               gpu_index);
-      cuda_memcpy_async_to_gpu(d_lwe_in_2_ct, &plaintext_1, sizeof(uint64_t),
+                               number_of_inputs * (lwe_dimension + 1) *
+                                   sizeof(uint64_t),
                                stream, gpu_index);
-      // Execute addition
+      cuda_memcpy_async_to_gpu(d_lwe_in_2_ct, cleartext_array,
+                               number_of_inputs * sizeof(uint64_t), stream,
+                               gpu_index);
+      // Execute cleartext multiplication
       cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
           stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_in_1_ct,
-          (void *)d_lwe_in_2_ct, lwe_dimension, 1);
+          (void *)d_lwe_in_2_ct, lwe_dimension, number_of_inputs);
       // Copy result back
       cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct,
-                               (lwe_dimension + 1) * sizeof(uint64_t), stream,
-                               gpu_index);
-      uint64_t decrypted = 0;
-      concrete_cpu_decrypt_lwe_ciphertext_u64(lwe_sk, lwe_out_ct, lwe_dimension,
-                                              &decrypted);
-      // The bit before the message
-      uint64_t rounding_bit = delta >> 1;
-      // Compute the rounding bit
-      uint64_t rounding = (decrypted & rounding_bit) << 1;
-      uint64_t decoded = (decrypted + rounding) / delta;
-      ASSERT_EQ(decoded, (plaintext_1 * plaintext_2) / delta);
+                               number_of_inputs * (lwe_dimension + 1) *
+                                   sizeof(uint64_t),
+                               stream, gpu_index);
       cuda_synchronize_stream(v_stream);
+      for (int i = 0; i < number_of_inputs; i++) {
+        uint64_t plaintext = plaintexts_1[r * SAMPLES * number_of_inputs +
+                                          s * number_of_inputs + i] /
+                             delta * delta_2;
+        uint64_t cleartext = plaintexts_2[r * SAMPLES * number_of_inputs +
+                                          s * number_of_inputs + i] /
+                             delta;
+        uint64_t decrypted = 0;
+        concrete_cpu_decrypt_lwe_ciphertext_u64(
+            lwe_sk, lwe_out_ct + i * (lwe_dimension + 1), lwe_dimension,
+            &decrypted);
+        // The bit before the message
+        uint64_t rounding_bit = delta_2 >> 1;
+        // Compute the rounding bit
+        uint64_t rounding = (decrypted & rounding_bit) << 1;
+        uint64_t decoded = (decrypted + rounding) / delta_2;
+        EXPECT_EQ(decoded, plaintext / delta_2 * cleartext);
+      }
+      free(cleartext_array);
     }
   }
 }
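Note: the cleartext_multiplication test re-encodes its inputs with
delta_2 = 2^63 / payload_modulus^2 instead of delta = 2^63 / payload_modulus,
because the decoded result is message * cleartext, which can be as large as
payload_modulus^2 and would overflow the message space under the original
encoding. A standalone, noise-free arithmetic check of that reasoning, with
payload_modulus = 4 as in the parameter set at the bottom of the file:

    #include <cassert>
    #include <cstdint>

    int main() {
      uint64_t payload_modulus = 4; // message_modulus * carry_modulus = 2 * 2
      uint64_t delta_2 =
          ((uint64_t)1 << 63) / (payload_modulus * payload_modulus); // 2^59
      uint64_t message = 3, cleartext = 2; // product 6 < payload_modulus^2
      uint64_t encoded = message * delta_2;   // noiseless stand-in for the body
      uint64_t product = encoded * cleartext; // what the GPU kernel computes
      assert(product / delta_2 == message * cleartext); // decodes to 6
      return 0;
    }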
 
 TEST_P(LinearAlgebraTestPrimitives_u64, negate) {
-  void *v_stream = (void *)stream;
   // Here execute the negation
   for (uint r = 0; r < REPETITIONS; r++) {
     for (uint s = 0; s < SAMPLES; s++) {
-      uint64_t plaintext = plaintexts_1[r * SAMPLES + s];
       uint64_t *lwe_sk = lwe_sk_array + (ptrdiff_t)(r * lwe_dimension);
-      concrete_cpu_encrypt_lwe_ciphertext_u64(lwe_sk, lwe_in_1_ct, plaintext,
-                                              lwe_dimension, noise_variance,
-                                              csprng, &CONCRETE_CSPRNG_VTABLE);
-      cuda_synchronize_stream(v_stream);
+      for (int i = 0; i < number_of_inputs; i++) {
+        uint64_t plaintext = plaintexts_1[r * SAMPLES * number_of_inputs +
+                                          s * number_of_inputs + i];
+        concrete_cpu_encrypt_lwe_ciphertext_u64(
+            lwe_sk, lwe_in_1_ct + i * (lwe_dimension + 1), plaintext,
+            lwe_dimension, noise_variance, csprng, &CONCRETE_CSPRNG_VTABLE);
+      }
       cuda_memcpy_async_to_gpu(d_lwe_in_1_ct, lwe_in_1_ct,
-                               (lwe_dimension + 1) * sizeof(uint64_t), stream,
-                               gpu_index);
+                               number_of_inputs * (lwe_dimension + 1) *
+                                   sizeof(uint64_t),
+                               stream, gpu_index);
       // Execute negation
       cuda_negate_lwe_ciphertext_vector_64(
           stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_in_1_ct,
-          lwe_dimension, 1);
+          lwe_dimension, number_of_inputs);
       // Copy result back
       cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct,
-                               (lwe_dimension + 1) * sizeof(uint64_t), stream,
-                               gpu_index);
-      uint64_t decrypted = 0;
-      concrete_cpu_decrypt_lwe_ciphertext_u64(lwe_sk, lwe_out_ct, lwe_dimension,
-                                              &decrypted);
-      // The bit before the message
-      uint64_t rounding_bit = delta >> 1;
-      // Compute the rounding bit
-      uint64_t rounding = (decrypted & rounding_bit) << 1;
-      uint64_t decoded = (decrypted + rounding) / delta;
-      ASSERT_EQ(decoded, -plaintext / delta);
-      cuda_synchronize_stream(v_stream);
+                               number_of_inputs * (lwe_dimension + 1) *
+                                   sizeof(uint64_t),
+                               stream, gpu_index);
+      for (int i = 0; i < number_of_inputs; i++) {
+        uint64_t plaintext = plaintexts_1[r * SAMPLES * number_of_inputs +
+                                          s * number_of_inputs + i];
+        uint64_t decrypted = 0;
+        concrete_cpu_decrypt_lwe_ciphertext_u64(
+            lwe_sk, lwe_out_ct + i * (lwe_dimension + 1), lwe_dimension,
+            &decrypted);
+        // The bit before the message
+        uint64_t rounding_bit = delta >> 1;
+        // Compute the rounding bit
+        uint64_t rounding = (decrypted & rounding_bit) << 1;
+        uint64_t decoded = (decrypted + rounding) / delta;
+        EXPECT_EQ(decoded, -plaintext / delta);
+      }
     }
   }
 }
 
@@ -266,8 +325,8 @@ TEST_P(LinearAlgebraTestPrimitives_u64, negate) {
 // tested. It executes each test for all pairs on phis X qs (Cartesian product)
 ::testing::internal::ParamGenerator<LinearAlgebraTestParams>
     linear_algebra_params_u64 = ::testing::Values(
-        // n, lwe_std_dev, message_modulus, carry_modulus
-        (LinearAlgebraTestParams){600, 0.000007069849454709433, 4, 4});
+        // n, noise_variance, message_modulus, carry_modulus, number_of_inputs
+        (LinearAlgebraTestParams){600, 7.52316384526264e-37, 2, 2, 10});
 
 std::string printParamName(::testing::TestParamInfo<LinearAlgebraTestParams> p) {