From 703c74401ca52936a60fcd99db7531f12ad51d5c Mon Sep 17 00:00:00 2001
From: Agnes Leroy
Date: Fri, 14 Oct 2022 11:13:58 +0200
Subject: [PATCH] chore(cuda): add asserts on base log, poly size and num
 samples values

---
 src/bootstrap_amortized.cu   | 10 ++++++++
 src/bootstrap_low_latency.cu | 24 +++++++++++++++++
 src/bootstrap_wop.cu         | 50 ++++++++++++++++++++++++++++++++++++
 src/bootstrap_wop.cuh        |  3 ---
 4 files changed, 84 insertions(+), 3 deletions(-)

diff --git a/src/bootstrap_amortized.cu b/src/bootstrap_amortized.cu
index 15647b4bc..6504d4b9e 100644
--- a/src/bootstrap_amortized.cu
+++ b/src/bootstrap_amortized.cu
@@ -73,6 +73,11 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
     uint32_t lwe_idx, uint32_t max_shared_memory) {
 
+  assert(("Error (GPU amortized PBS): base log should be <= 16", base_log <= 16));
+  assert(("Error (GPU amortized PBS): polynomial size should be one of 512, 1024, 2048, 4096, 8192",
+          polynomial_size == 512 || polynomial_size == 1024 || polynomial_size == 2048 ||
+          polynomial_size == 4096 || polynomial_size == 8192));
+
   switch (polynomial_size) {
   case 512:
     host_bootstrap_amortized<uint32_t, Degree<512>>(
@@ -131,6 +136,11 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
     uint32_t lwe_idx, uint32_t max_shared_memory) {
 
+  assert(("Error (GPU amortized PBS): base log should be <= 16", base_log <= 16));
+  assert(("Error (GPU amortized PBS): polynomial size should be one of 512, 1024, 2048, 4096, 8192",
+          polynomial_size == 512 || polynomial_size == 1024 || polynomial_size == 2048 ||
+          polynomial_size == 4096 || polynomial_size == 8192));
+
   switch (polynomial_size) {
   case 512:
     host_bootstrap_amortized<uint64_t, Degree<512>>(
diff --git a/src/bootstrap_low_latency.cu b/src/bootstrap_low_latency.cu
index b89b754c4..dfa80ce6b 100644
--- a/src/bootstrap_low_latency.cu
+++ b/src/bootstrap_low_latency.cu
@@ -72,6 +72,18 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
     uint32_t lwe_idx, uint32_t max_shared_memory) {
 
+  assert(("Error (GPU low latency PBS): base log should be <= 16", base_log <= 16));
+  assert(("Error (GPU low latency PBS): polynomial size should be one of 512, 1024, 2048",
+          polynomial_size == 512 || polynomial_size == 1024 || polynomial_size == 2048));
+  // The number of samples should be at most SM / (4 * (k + 1) * l) (the
+  // factor 4 being related to the occupancy of 50%). The only supported
+  // value for k is 1, so k + 1 = 2 for now.
+  int number_of_sm = 0;
+  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
+  assert(("Error (GPU low latency PBS): the number of input LWEs must be lower than or equal to "
+          "the number of streaming multiprocessors on the device divided by 8 * l_gadget",
+          num_samples <= number_of_sm / 4. / 2. / l_gadget));
+
   switch (polynomial_size) {
   case 512:
     host_bootstrap_low_latency<uint32_t, Degree<512>>(
@@ -134,6 +146,18 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
     uint32_t lwe_idx, uint32_t max_shared_memory) {
 
+  assert(("Error (GPU low latency PBS): base log should be <= 16", base_log <= 16));
+  assert(("Error (GPU low latency PBS): polynomial size should be one of 512, 1024, 2048",
+          polynomial_size == 512 || polynomial_size == 1024 || polynomial_size == 2048));
+  // The number of samples should be at most SM / (4 * (k + 1) * l) (the
+  // factor 4 being related to the occupancy of 50%). The only supported
+  // value for k is 1, so k + 1 = 2 for now.
+  int number_of_sm = 0;
+  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
+  assert(("Error (GPU low latency PBS): the number of input LWEs must be lower than or equal to "
+          "the number of streaming multiprocessors on the device divided by 8 * l_gadget",
+          num_samples <= number_of_sm / 4. / 2. / l_gadget));
+
   switch (polynomial_size) {
   case 512:
     host_bootstrap_low_latency<uint64_t, Degree<512>>(
diff --git a/src/bootstrap_wop.cu b/src/bootstrap_wop.cu
index 05453f56e..653dead57 100644
--- a/src/bootstrap_wop.cu
+++ b/src/bootstrap_wop.cu
@@ -12,6 +12,15 @@ void cuda_cmux_tree_32(
     uint32_t r, uint32_t max_shared_memory) {
 
+  assert(("Error (GPU Cmux tree): base log should be <= 16", base_log <= 16));
+  assert(("Error (GPU Cmux tree): polynomial size should be one of 512, 1024, 2048, 4096, 8192",
+          polynomial_size == 512 || polynomial_size == 1024 || polynomial_size == 2048 ||
+          polynomial_size == 4096 || polynomial_size == 8192));
+  // For larger k we will need to adjust the mask size
+  assert(("Error (GPU Cmux tree): glwe_dimension should be equal to 1", glwe_dimension == 1));
+  assert(("Error (GPU Cmux tree): r, the number of layers in the tree, should be >= 1",
+          r >= 1));
+
   switch (polynomial_size) {
   case 512:
     host_cmux_tree<uint32_t, Degree<512>>(
@@ -48,6 +57,8 @@
         glwe_dimension, polynomial_size, base_log, l_gadget, r,
         max_shared_memory);
     break;
+  default:
+    break;
   }
 }
@@ -63,6 +74,15 @@ void cuda_cmux_tree_64(
     uint32_t r, uint32_t max_shared_memory) {
 
+  assert(("Error (GPU Cmux tree): base log should be <= 16", base_log <= 16));
+  assert(("Error (GPU Cmux tree): polynomial size should be one of 512, 1024, 2048, 4096, 8192",
+          polynomial_size == 512 || polynomial_size == 1024 || polynomial_size == 2048 ||
+          polynomial_size == 4096 || polynomial_size == 8192));
+  // For larger k we will need to adjust the mask size
+  assert(("Error (GPU Cmux tree): glwe_dimension should be equal to 1", glwe_dimension == 1));
+  assert(("Error (GPU Cmux tree): r, the number of layers in the tree, should be >= 1",
+          r >= 1));
+
   switch (polynomial_size) {
   case 512:
     host_cmux_tree<uint64_t, Degree<512>>(
@@ -99,6 +119,8 @@
         glwe_dimension, polynomial_size, base_log, l_gadget, r,
         max_shared_memory);
     break;
+  default:
+    break;
   }
 }
@@ -125,6 +147,20 @@ void cuda_extract_bits_32(
     uint32_t l_gadget_ksk, uint32_t number_of_samples) {
 
+  assert(("Error (GPU extract bits): base log should be <= 16", base_log_bsk <= 16));
+  assert(("Error (GPU extract bits): lwe_dimension_before should be one of 512, 1024, 2048",
+          lwe_dimension_before == 512 || lwe_dimension_before == 1024 ||
+          lwe_dimension_before == 2048));
+  // The number of samples should be at most the number of streaming
+  // multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
+  // to the occupancy of 50%). The only supported value for k is 1, so
+  // k + 1 = 2 for now.
+  int number_of_sm = 0;
+  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
+  assert(("Error (GPU extract bits): the number of input LWEs must be lower than or equal to "
+          "the number of streaming multiprocessors on the device divided by 8 * l_gadget_bsk",
+          number_of_samples <= number_of_sm / 4. / 2. / l_gadget_bsk));
+
   switch (lwe_dimension_before) {
   case 512:
     host_extract_bits<uint32_t, Degree<512>>(
@@ -186,6 +222,20 @@ void cuda_extract_bits_64(
     uint32_t l_gadget_ksk, uint32_t number_of_samples) {
 
+  assert(("Error (GPU extract bits): base log should be <= 16", base_log_bsk <= 16));
+  assert(("Error (GPU extract bits): lwe_dimension_before should be one of 512, 1024, 2048",
+          lwe_dimension_before == 512 || lwe_dimension_before == 1024 ||
+          lwe_dimension_before == 2048));
+  // The number of samples should be at most the number of streaming
+  // multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
+  // to the occupancy of 50%). The only supported value for k is 1, so
+  // k + 1 = 2 for now.
+  int number_of_sm = 0;
+  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
+  assert(("Error (GPU extract bits): the number of input LWEs must be lower than or equal to "
+          "the number of streaming multiprocessors on the device divided by 8 * l_gadget_bsk",
+          number_of_samples <= number_of_sm / 4. / 2. / l_gadget_bsk));
+
   switch (lwe_dimension_before) {
   case 512:
     host_extract_bits<uint64_t, Degree<512>>(
diff --git a/src/bootstrap_wop.cuh b/src/bootstrap_wop.cuh
index a1aff02f8..964d55071 100644
--- a/src/bootstrap_wop.cuh
+++ b/src/bootstrap_wop.cuh
@@ -298,9 +298,6 @@ void host_cmux_tree(
     uint32_t r, uint32_t max_shared_memory) {
 
-  assert(glwe_dimension == 1); // For larger k we will need to adjust the mask size
-  assert(r >= 1);
-
   auto stream = static_cast<cudaStream_t *>(v_stream);
   int num_lut = (1 << r);
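
Note on the assert style used throughout this patch: assert(("message", condition))
relies on the C/C++ comma operator, so only the condition is actually tested,
while the string literal still appears in the stringified expression that the
standard assert macro prints before aborting. A minimal self-contained sketch of
the idiom (the helper name and parameter are illustrative, not part of the patch):

    #include <cassert>
    #include <cstdint>

    // Illustrative helper (not in the patch): the string literal on the left
    // of the comma is discarded at run time, but shows up verbatim in the
    // diagnostic assert prints when the condition on the right is false.
    static void check_base_log(uint32_t base_log) {
      assert(("Error (GPU PBS): base log should be <= 16", base_log <= 16));
    }

    int main() {
      check_base_log(8);    // passes silently
      // check_base_log(32); // would abort and print the full expression
      return 0;
    }

Keep in mind that these checks vanish entirely when NDEBUG is defined, so builds
compiled with -DNDEBUG get no protection from the new asserts.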
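
The sample-count bound is worth a worked example. With the only supported GLWE
dimension k = 1, the new checks require num_samples <= SM / (4 * (k + 1) * l)
= SM / (8 * l_gadget), the factor 4 coming from the 50% occupancy target
mentioned in the comments. A sketch of the same computation as a standalone
helper, assuming device 0 is the target GPU (the helper name is hypothetical):

    #include <cstdint>
    #include <cstdio>
    #include <cuda_runtime.h>

    // Hypothetical helper mirroring the bound asserted in the patch:
    // num_samples <= SM_count / 4. / 2. / l_gadget, i.e. SM_count / (8 * l).
    static uint32_t max_low_latency_samples(uint32_t l_gadget, int device = 0) {
      int number_of_sm = 0;
      cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount,
                             device);
      // The divisor 2 is k + 1 with k = 1, the only supported GLWE dimension.
      return static_cast<uint32_t>(number_of_sm / 4. / 2. / l_gadget);
    }

    int main() {
      printf("max low-latency PBS samples: %u\n", max_low_latency_samples(2));
      return 0;
    }

For instance, on a 108-SM device (e.g. an A100) with l_gadget = 1 the bound
works out to 108 / 8 = 13 input LWEs per call, so the assert fires well before
any kernel launch would fail for lack of resident blocks.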