fix(cuda): fix the assert on the number of inputs in the low lat pbs

2026-02-08 19:44:57 -05:00 · 2022-12-02 10:13:36 +01:00
parent 9bcf0f8a70
commit 0a0c45338c
2 changed files with 11 additions and 11 deletions
--- a/src/bootstrap_low_latency.cu
+++ b/src/bootstrap_low_latency.cu
@@ -72,7 +72,7 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
          polynomial_size == 512 || polynomial_size == 1024 ||
              polynomial_size == 2048 || polynomial_size == 4096 ||
              polynomial_size == 8192));
-  // The number of samples should be lower than SM/(4 * (k + 1) * l) (the
+  // The number of samples should be lower than 4 * SM/((k + 1) * l) (the
  // factor 4 being related to the occupancy of 50%). The only supported
  // value for k is 1, so k + 1 = 2 for now.
  int number_of_sm = 0;
@@ -81,7 +81,7 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
          "or equal to the "
          "number of streaming multiprocessors on the device divided by 8 * "
          "level_count",
-          num_samples <= number_of_sm / 4. / 2. / level_count));
+          num_samples <= number_of_sm * 4. / 2. / level_count));

  switch (polynomial_size) {
  case 512:
@@ -140,7 +140,7 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
          polynomial_size == 512 || polynomial_size == 1024 ||
              polynomial_size == 2048 || polynomial_size == 4096 ||
              polynomial_size == 8192));
-  // The number of samples should be lower than SM/(4 * (k + 1) * l) (the
+  // The number of samples should be lower than 4 * SM/((k + 1) * l) (the
  // factor 4 being related to the occupancy of 50%). The only supported
  // value for k is 1, so k + 1 = 2 for now.
  int number_of_sm = 0;
@@ -149,7 +149,7 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
          "or equal to the "
          "number of streaming multiprocessors on the device divided by 8 * "
          "level_count",
-          num_samples <= number_of_sm / 4. / 2. / level_count));
+          num_samples <= number_of_sm * 4. / 2. / level_count));

  switch (polynomial_size) {
  case 512:
--- a/src/bootstrap_wop.cu
+++ b/src/bootstrap_wop.cu
@@ -129,8 +129,8 @@ void cuda_extract_bits_32(
          lwe_dimension_in == 512 || lwe_dimension_in == 1024 ||
              lwe_dimension_in == 2048 || lwe_dimension_in == 4096 ||
              lwe_dimension_in == 8192));
-  // The number of samples should be lower than the number of streaming
-  // multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
+  // The number of samples should be lower than 4 times the number of streaming
+  // multiprocessors divided by ((k + 1) * l) (the factor 4 being related
  // to the occupancy of 50%). The only supported value for k is 1, so
  // k + 1 = 2 for now.
  int number_of_sm = 0;
@@ -139,7 +139,7 @@ void cuda_extract_bits_32(
          "equal to the "
          "number of streaming multiprocessors on the device divided by 8 * "
          "level_count_bsk",
-          number_of_samples <= number_of_sm / 4. / 2. / level_count_bsk));
+          number_of_samples <= number_of_sm * 4. / 2. / level_count_bsk));

  switch (lwe_dimension_in) {
  case 512:
@@ -226,9 +226,9 @@ void cuda_extract_bits_64(
          lwe_dimension_in == 512 || lwe_dimension_in == 1024 ||
              lwe_dimension_in == 2048 || lwe_dimension_in == 4096 ||
              lwe_dimension_in == 8192));
-  // The number of samples should be lower than the number of streaming
-  // multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
-  // to the occupancy of 50%). The only supported value for k is 1, so
+  // The number of samples should be lower than four times the number of
+  // streaming multiprocessors divided by ((k + 1) * l) (the factor 4 being
+  // related to the occupancy of 50%). The only supported value for k is 1, so
  // k + 1 = 2 for now.
  int number_of_sm = 0;
  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
@@ -236,7 +236,7 @@ void cuda_extract_bits_64(
          "equal to the "
          "number of streaming multiprocessors on the device divided by 8 * "
          "level_count_bsk",
-          number_of_samples <= number_of_sm / 4. / 2. / level_count_bsk));
+          number_of_samples <= number_of_sm * 4. / 2. / level_count_bsk));

  switch (lwe_dimension_in) {
  case 512: