chore(cuda): add asserts on base log, poly size and num samples values

Agnes Leroy
2022-10-14 11:13:58 +02:00
committed by Agnès Leroy
parent 1a76cadaa8
commit 703c74401c
4 changed files with 84 additions and 3 deletions

View File

@@ -73,6 +73,11 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
uint32_t lwe_idx,
uint32_t max_shared_memory) {
assert(("Error (GPU amortized PBS): base log should be <= 16", base_log <= 16));
assert(("Error (GPU amortized PBS): polynomial size should be one of 512, 1024, 2048, 4096, 8192",
polynomial_size == 512 || polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
switch (polynomial_size) {
case 512:
host_bootstrap_amortized<uint32_t, Degree<512>>(
@@ -131,6 +136,11 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
uint32_t lwe_idx,
uint32_t max_shared_memory) {
assert(("Error (GPU amortized PBS): base log should be <= 16", base_log <= 16));
assert(("Error (GPU amortized PBS): polynomial size should be one of 512, 1024, 2048, 4096, 8192",
polynomial_size == 512 || polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
switch (polynomial_size) {
case 512:
host_bootstrap_amortized<uint64_t, Degree<512>>(
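
A note on the assert form used throughout this commit: assert(("message", condition)) wraps a comma expression in an extra pair of parentheses, so the string literal is evaluated and discarded and only the condition decides the assert, while the stringified expression (message included) appears in the diagnostic a failed assert prints. A minimal standalone sketch of the idiom, not code from this commit:

#include <cassert>
#include <cstdint>

void check_base_log(uint32_t base_log) {
  // The extra parentheses make the argument a single comma expression:
  // the string is the discarded left operand, the condition on the right
  // determines the result. On failure, assert prints the whole stringified
  // expression, message included.
  assert(("base log should be <= 16", base_log <= 16));
}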

View File

@@ -72,6 +72,18 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
uint32_t lwe_idx,
uint32_t max_shared_memory) {
assert(("Error (GPU low latency PBS): base log should be <= 16", base_log <= 16));
assert(("Error (GPU low latency PBS): polynomial size should be one of 512, 1024, 2048",
polynomial_size == 512 || polynomial_size == 1024 || polynomial_size == 2048));
// The number of samples must be lower than or equal to the number of
// streaming multiprocessors divided by (4 * (k + 1) * l), the factor 4
// accounting for the 50% occupancy. The only supported value for k is 1,
// so k + 1 = 2 for now.
int number_of_sm = 0;
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
assert(("Error (GPU low latency PBS): the number of input LWEs must be lower than or equal "
"to the number of streaming multiprocessors on the device divided by 8 * l_gadget",
num_samples <= number_of_sm / 4. / 2. / l_gadget));
switch (polynomial_size) {
case 512:
host_bootstrap_low_latency<uint32_t, Degree<512>>(
@@ -134,6 +146,18 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
uint32_t lwe_idx,
uint32_t max_shared_memory) {
assert(("Error (GPU low latency PBS): base log should be <= 16", base_log <= 16));
assert(("Error (GPU low latency PBS): polynomial size should be one of 512, 1024, 2048",
polynomial_size == 512 || polynomial_size == 1024 || polynomial_size == 2048));
// The number of samples must be lower than or equal to the number of
// streaming multiprocessors divided by (4 * (k + 1) * l), the factor 4
// accounting for the 50% occupancy. The only supported value for k is 1,
// so k + 1 = 2 for now.
int number_of_sm = 0;
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
assert(("Error (GPU low latency PBS): the number of input LWEs must be lower than or equal "
"to the number of streaming multiprocessors on the device divided by 8 * l_gadget",
num_samples <= number_of_sm / 4. / 2. / l_gadget));
switch (polynomial_size) {
case 512:
host_bootstrap_low_latency<uint64_t, Degree<512>>(
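
For reference, the bound these asserts check can be computed up front. The sketch below is illustrative, not part of the commit; the helper name is hypothetical, and it hard-codes k = 1 so the divisor 4 * (k + 1) * l_gadget reduces to 8 * l_gadget:

#include <cuda_runtime.h>

// Hypothetical helper: the largest num_samples the low latency PBS
// accepts on device 0, assuming k = 1 and the factor 4 for 50% occupancy.
int max_low_latency_samples(int l_gadget) {
  int number_of_sm = 0;
  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
  // num_samples must satisfy:
  //   num_samples <= number_of_sm / (4 * (k + 1) * l_gadget), with k = 1
  return number_of_sm / (4 * 2 * l_gadget);
}

On a device with 108 streaming multiprocessors and l_gadget = 1, for example, this caps a batch at 13 samples per call.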

View File

@@ -12,6 +12,15 @@ void cuda_cmux_tree_32(
uint32_t r,
uint32_t max_shared_memory) {
assert(("Error (GPU Cmux tree): base log should be <= 16", base_log <= 16));
assert(("Error (GPU Cmux tree): polynomial size should be one of 512, 1024, 2048, 4096, 8192",
polynomial_size == 512 || polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
// For larger k we will need to adjust the mask size
assert(("Error (GPU Cmux tree): glwe_dimension should be equal to 1", glwe_dimension == 1));
assert(("Error (GPU Cmux tree): r, the number of layers in the tree, should be >= 1 ",
r >= 1));
switch (polynomial_size) {
case 512:
host_cmux_tree<uint32_t, int32_t, Degree<512>>(
@@ -48,6 +57,8 @@ void cuda_cmux_tree_32(
glwe_dimension, polynomial_size, base_log, l_gadget, r,
max_shared_memory);
break;
default:
break;
}
}
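
The switch statements above (now with an explicit default: break;) all follow one pattern: a runtime polynomial_size is mapped to a compile-time Degree<N> template argument, and the preceding assert guarantees only sizes with a matching case reach the switch. A condensed sketch of the pattern, with hypothetical names standing in for the real host functions:

#include <cstdint>

template <int N> struct Degree { static constexpr int value = N; };

// Hypothetical stand-in for host_cmux_tree and friends; the real
// functions launch kernels specialized on params::value.
template <typename Torus, class params> void host_op() {}

void dispatch(uint32_t polynomial_size) {
  switch (polynomial_size) {
  case 512:
    host_op<uint64_t, Degree<512>>();
    break;
  case 1024:
    host_op<uint64_t, Degree<1024>>();
    break;
  default:
    break; // unsupported sizes: caught by the assert above in debug builds
  }
}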
@@ -63,6 +74,15 @@ void cuda_cmux_tree_64(
uint32_t r,
uint32_t max_shared_memory) {
assert(("Error (GPU Cmux tree): base log should be <= 16", base_log <= 16));
assert(("Error (GPU Cmux tree): polynomial size should be one of 512, 1024, 2048, 4096, 8192",
polynomial_size == 512 || polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
// For larger k we will need to adjust the mask size
assert(("Error (GPU Cmux tree): glwe_dimension should be equal to 1", glwe_dimension == 1));
assert(("Error (GPU Cmux tree): r, the number of layers in the tree, should be >= 1 ",
r >= 1));
switch (polynomial_size) {
case 512:
host_cmux_tree<uint64_t, int64_t, Degree<512>>(
@@ -99,6 +119,8 @@ void cuda_cmux_tree_64(
glwe_dimension, polynomial_size, base_log, l_gadget, r,
max_shared_memory);
break;
default:
break;
}
}
@@ -125,6 +147,20 @@ void cuda_extract_bits_32(
uint32_t l_gadget_ksk,
uint32_t number_of_samples)
{
assert(("Error (GPU extract bits): base log should be <= 16", base_log_bsk <= 16));
assert(("Error (GPU extract bits): lwe_dimension_before should be one of 512, 1024, 2048",
lwe_dimension_before == 512 || lwe_dimension_before == 1024 ||
lwe_dimension_before == 2048));
// The number of samples must be lower than or equal to the number of
// streaming multiprocessors divided by (4 * (k + 1) * l), the factor 4
// accounting for the 50% occupancy. The only supported value for k is 1,
// so k + 1 = 2 for now.
int number_of_sm = 0;
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
assert(("Error (GPU extract bits): the number of input LWEs must be lower than or equal "
"to the number of streaming multiprocessors on the device divided by 8 * l_gadget_bsk",
number_of_samples <= number_of_sm / 4. / 2. / l_gadget_bsk));
switch (lwe_dimension_before) {
case 512:
host_extract_bits<uint32_t, Degree<512>>(
@@ -186,6 +222,20 @@ void cuda_extract_bits_64(
uint32_t l_gadget_ksk,
uint32_t number_of_samples)
{
assert(("Error (GPU extract bits): base log should be <= 16", base_log_bsk <= 16));
assert(("Error (GPU extract bits): lwe_dimension_before should be one of 512, 1024, 2048",
lwe_dimension_before == 512 || lwe_dimension_before == 1024 ||
lwe_dimension_before == 2048));
// The number of samples must be lower than or equal to the number of
// streaming multiprocessors divided by (4 * (k + 1) * l), the factor 4
// accounting for the 50% occupancy. The only supported value for k is 1,
// so k + 1 = 2 for now.
int number_of_sm = 0;
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
assert(("Error (GPU extract bits): the number of input LWEs must be lower than or equal "
"to the number of streaming multiprocessors on the device divided by 8 * l_gadget_bsk",
number_of_samples <= number_of_sm / 4. / 2. / l_gadget_bsk));
switch (lwe_dimension_before) {
case 512:
host_extract_bits<uint64_t, Degree<512>>(

View File

@@ -298,9 +298,6 @@ void host_cmux_tree(
uint32_t r,
uint32_t max_shared_memory) {
assert(glwe_dimension == 1); // For larger k we will need to adjust the mask size
assert(r >= 1);
auto stream = static_cast<cudaStream_t *>(v_stream);
int num_lut = (1<<r);
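
The r >= 1 checks added at the API level match this sizing: a tree with r layers selects among num_lut = 2^r lookup tables, so r = 0 would leave nothing to select. A small illustration of the count, with a hypothetical function name:

#include <cassert>
#include <cstdint>

// Illustration of the tree sizing: r layers of cmux gates reduce
// 2^r candidate LUTs down to a single selected one.
void cmux_tree_shape(uint32_t r) {
  assert(r >= 1);
  uint32_t num_lut = 1u << r; // as in host_cmux_tree above
  for (uint32_t layer = 0; layer < r; ++layer)
    num_lut /= 2; // each layer halves the number of candidates
  assert(num_lut == 1);
}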