chore(concrete_cuda): Add checks to validate inputs passed to the scratch functions

This commit is contained in:
Pedro Alves
2023-03-13 15:37:40 +01:00
committed by Agnès Leroy
parent e36e2bd340
commit 5cb4e5ff4e
6 changed files with 261 additions and 176 deletions

View File

@@ -1,5 +1,41 @@
#include "bit_extraction.cuh"
/*
 * Runs standard checks to validate the inputs
 *
 * Validates the parameters shared by the bit-extraction scratch and run entry
 * points: `polynomial_size` must be one of the supported power-of-two degrees,
 * and `number_of_samples` must not exceed the occupancy-derived launch limit
 * SM / (4 * (k + 1) * l) = SM / (8 * level_count_bsk) for k = 1.
 * `nbits` is unused here; it is kept so this signature mirrors
 * checks_extract_bits. Aborts via assert on failure (no-op under NDEBUG).
 * NOTE(review): queries device 0 unconditionally — confirm this matches the
 * gpu_index used by the callers.
 */
void checks_fast_extract_bits(int nbits, int polynomial_size,
                              int level_count_bsk, int number_of_samples) {
  // (message, condition) comma expression: assert tests only the condition;
  // the string literal documents the failure in the assert printout.
  assert(("Error (GPU extract bits): polynomial_size should be one of "
          "256, 512, 1024, 2048, 4096, 8192",
          polynomial_size == 256 || polynomial_size == 512 ||
              polynomial_size == 1024 || polynomial_size == 2048 ||
              polynomial_size == 4096 || polynomial_size == 8192));
  // The enforced bound is SM / (4 * (k + 1) * l) = SM / (8 * level_count_bsk)
  // (the factor 4 being related to the occupancy of 50%). The only supported
  // value for k is 1, so k + 1 = 2 for now.
  int number_of_sm = 0;
  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
  assert(("Error (GPU extract bits): the number of input LWEs must be lower or "
          "equal to the "
          "number of streaming multiprocessors on the device divided by 8 * "
          "level_count_bsk",
          number_of_samples <= number_of_sm / 4. / 2. / level_count_bsk));
}
/*
 * Runs standard checks to validate the inputs
 *
 * Full validation for the bit-extraction run entry points: the decomposition
 * base log must fit in the ciphertext width (`nbits`), then the remaining
 * checks are delegated to checks_fast_extract_bits.
 */
void checks_extract_bits(int nbits, int polynomial_size, int base_log_bsk,
                         int level_count_bsk, int number_of_samples) {
  // A base log wider than the torus representation is meaningless.
  assert(("Error (GPU extract bits): base log should be <= nbits",
          base_log_bsk <= nbits));
  // Base-log-independent checks (degree, occupancy bound).
  checks_fast_extract_bits(nbits, polynomial_size, level_count_bsk,
                           number_of_samples);
}
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the bit extraction on 32 bits inputs, into `cbs_buffer`. It also
@@ -11,6 +47,8 @@ void scratch_cuda_extract_bits_32(
uint32_t level_count, uint32_t number_of_inputs, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
checks_fast_extract_bits(32, polynomial_size, level_count, number_of_inputs);
switch (polynomial_size) {
case 256:
scratch_extract_bits<uint32_t, int32_t, Degree<256>>(
@@ -63,6 +101,7 @@ void scratch_cuda_extract_bits_64(
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t number_of_inputs, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
checks_fast_extract_bits(64, polynomial_size, level_count, number_of_inputs);
switch (polynomial_size) {
case 256:
@@ -119,24 +158,8 @@ void cuda_extract_bits_32(void *v_stream, uint32_t gpu_index,
uint32_t level_count_bsk, uint32_t base_log_ksk,
uint32_t level_count_ksk, uint32_t number_of_samples,
uint32_t max_shared_memory) {
assert(("Error (GPU extract bits): base log should be <= 32",
base_log_bsk <= 32));
assert(("Error (GPU extract bits): polynomial_size should be one of "
"256, 512, 1024, 2048, 4096, 8192",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
// The number of samples should be lower than four time the number of
// streaming multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being
// related to the occupancy of 50%). The only supported value for k is 1, so
// k + 1 = 2 for now.
int number_of_sm = 0;
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
assert(("Error (GPU extract bits): the number of input LWEs must be lower or "
"equal to the "
"number of streaming multiprocessors on the device divided by 8 * "
"level_count_bsk",
number_of_samples <= number_of_sm / 4. / 2. / level_count_bsk));
checks_extract_bits(32, polynomial_size, base_log_bsk, level_count_bsk,
number_of_samples);
switch (polynomial_size) {
case 256:
@@ -253,24 +276,8 @@ void cuda_extract_bits_64(void *v_stream, uint32_t gpu_index,
uint32_t level_count_bsk, uint32_t base_log_ksk,
uint32_t level_count_ksk, uint32_t number_of_samples,
uint32_t max_shared_memory) {
assert(("Error (GPU extract bits): base log should be <= 64",
base_log_bsk <= 64));
assert(("Error (GPU extract bits): polynomial_size should be one of "
"256, 512, 1024, 2048, 4096, 8192",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
// The number of samples should be lower than four time the number of
// streaming multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being
// related to the occupancy of 50%). The only supported value for k is 1, so
// k + 1 = 2 for now.
int number_of_sm = 0;
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
assert(("Error (GPU extract bits): the number of input LWEs must be lower or "
"equal to the "
"number of streaming multiprocessors on the device divided by 8 * "
"level_count_bsk",
number_of_samples <= number_of_sm / 4. / 2. / level_count_bsk));
checks_extract_bits(64, polynomial_size, base_log_bsk, level_count_bsk,
number_of_samples);
switch (polynomial_size) {
case 256:

View File

@@ -1,5 +1,26 @@
#include "bootstrap_amortized.cuh"
/*
 * Runs standard checks to validate the inputs
 *
 * Validates the parameters shared by the amortized PBS scratch and run entry
 * points. `nbits` is unused here; it keeps the signature symmetric with
 * checks_bootstrap_amortized. Aborts via assert on failure.
 */
void checks_fast_bootstrap_amortized(int nbits, int polynomial_size) {
  // Only these power-of-two degrees have template instantiations.
  const bool supported_degree =
      polynomial_size == 256 || polynomial_size == 512 ||
      polynomial_size == 1024 || polynomial_size == 2048 ||
      polynomial_size == 4096 || polynomial_size == 8192;
  assert(
      ("Error (GPU amortized PBS): polynomial size should be one of 256, 512, "
       "1024, 2048, 4096, 8192",
       supported_degree));
}
/*
 * Runs standard checks to validate the inputs
 *
 * Full validation for the amortized PBS run entry points: the decomposition
 * base log must fit in the ciphertext width (`nbits`), then delegates the
 * degree check to checks_fast_bootstrap_amortized.
 */
void checks_bootstrap_amortized(int nbits, int base_log, int polynomial_size) {
  // A base log wider than the torus representation is meaningless.
  assert(("Error (GPU amortized PBS): base log should be <= nbits",
          base_log <= nbits));
  checks_fast_bootstrap_amortized(nbits, polynomial_size);
}
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the amortized PBS on 32 bits inputs, into `pbs_buffer`. It also
@@ -13,6 +34,7 @@ void scratch_cuda_bootstrap_amortized_32(void *v_stream, uint32_t gpu_index,
uint32_t input_lwe_ciphertext_count,
uint32_t max_shared_memory,
bool allocate_gpu_memory) {
checks_fast_bootstrap_amortized(32, polynomial_size);
switch (polynomial_size) {
case 256:
@@ -63,6 +85,7 @@ void scratch_cuda_bootstrap_amortized_64(void *v_stream, uint32_t gpu_index,
uint32_t input_lwe_ciphertext_count,
uint32_t max_shared_memory,
bool allocate_gpu_memory) {
checks_fast_bootstrap_amortized(64, polynomial_size);
switch (polynomial_size) {
case 256:
@@ -111,14 +134,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
uint32_t max_shared_memory) {
assert(
("Error (GPU amortized PBS): base log should be <= 32", base_log <= 32));
assert(
("Error (GPU amortized PBS): polynomial size should be one of 256, 512, "
"1024, 2048, 4096, 8192",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
checks_bootstrap_amortized(32, base_log, polynomial_size);
switch (polynomial_size) {
case 256:
@@ -247,14 +263,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
uint32_t max_shared_memory) {
assert(
("Error (GPU amortized PBS): base log should be <= 64", base_log <= 64));
assert(
("Error (GPU amortized PBS): polynomial size should be one of 256, 512, "
"1024, 2048, 4096, 8192",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
checks_bootstrap_amortized(64, base_log, polynomial_size);
switch (polynomial_size) {
case 256:

View File

@@ -1,5 +1,41 @@
#include "bootstrap_low_latency.cuh"
/*
 * Runs standard checks to validate the inputs
 *
 * Validates the parameters shared by the low latency PBS scratch and run
 * entry points: the polynomial degree must be supported, and the number of
 * samples must fit the launch-capacity bound 4 * SM / (2 * level_count).
 * `nbits` is unused here; it keeps the signature symmetric with
 * checks_bootstrap_low_latency. Aborts via assert on failure.
 * NOTE(review): queries device 0 unconditionally — confirm this matches the
 * gpu_index used by the callers.
 */
void checks_fast_bootstrap_low_latency(int nbits, int level_count,
                                       int polynomial_size, int num_samples) {
  assert((
      "Error (GPU low latency PBS): polynomial size should be one of 256, 512, "
      "1024, 2048, 4096, 8192",
      polynomial_size == 256 || polynomial_size == 512 ||
          polynomial_size == 1024 || polynomial_size == 2048 ||
          polynomial_size == 4096 || polynomial_size == 8192));
  // The number of samples should be lower than 4 * SM/((k + 1) * l) (the
  // factor 4 being related to the occupancy of 50%). The only supported
  // value for k is 1, so k + 1 = 2 for now.
  int number_of_sm = 0;
  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
  // Fixed: the message previously claimed the bound was SM / (8 * level_count)
  // while the condition actually enforces 4 * SM / (2 * level_count).
  assert(("Error (GPU low latency PBS): the number of input LWEs must be lower "
          "or equal to "
          "4 * the number of streaming multiprocessors on the device divided "
          "by 2 * level_count",
          num_samples <= number_of_sm * 4. / 2. / level_count));
}
/*
 * Runs standard checks to validate the inputs
 *
 * Full validation for the low latency PBS run entry points: the decomposition
 * base log must fit in the ciphertext width (`nbits`), then delegates the
 * remaining checks to checks_fast_bootstrap_low_latency.
 */
void checks_bootstrap_low_latency(int nbits, int level_count, int base_log,
                                  int polynomial_size, int num_samples) {
  // A base log wider than the torus representation is meaningless.
  assert(("Error (GPU low latency PBS): base log should be <= nbits",
          base_log <= nbits));
  checks_fast_bootstrap_low_latency(nbits, level_count, polynomial_size,
                                    num_samples);
}
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the low latency PBS on 32 bits inputs, into `pbs_buffer`. It also
@@ -11,6 +47,8 @@ void scratch_cuda_bootstrap_low_latency_32(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
checks_fast_bootstrap_low_latency(32, level_count, polynomial_size,
input_lwe_ciphertext_count);
switch (polynomial_size) {
case 256:
@@ -65,6 +103,8 @@ void scratch_cuda_bootstrap_low_latency_64(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
checks_fast_bootstrap_low_latency(64, level_count, polynomial_size,
input_lwe_ciphertext_count);
switch (polynomial_size) {
case 256:
@@ -123,24 +163,8 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
uint32_t max_shared_memory) {
assert(("Error (GPU low latency PBS): base log should be <= 32",
base_log <= 32));
assert((
"Error (GPU low latency PBS): polynomial size should be one of 256, 512, "
"1024, 2048, 4096, 8192",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
// The number of samples should be lower than 4 * SM/((k + 1) * l) (the
// factor 4 being related to the occupancy of 50%). The only supported
// value for k is 1, so k + 1 = 2 for now.
int number_of_sm = 0;
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
assert(("Error (GPU low latency PBS): the number of input LWEs must be lower "
"or equal to the "
"number of streaming multiprocessors on the device divided by 8 * "
"level_count",
num_samples <= number_of_sm * 4. / 2. / level_count));
checks_bootstrap_low_latency(32, level_count, base_log, polynomial_size,
num_samples);
switch (polynomial_size) {
case 256:
@@ -280,24 +304,8 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
uint32_t max_shared_memory) {
assert(("Error (GPU low latency PBS): base log should be <= 64",
base_log <= 64));
assert((
"Error (GPU low latency PBS): polynomial size should be one of 256, 512, "
"1024, 2048, 4096, 8192",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
// The number of samples should be lower than 4 * SM/((k + 1) * l) (the
// factor 4 being related to the occupancy of 50%). The only supported
// value for k is 1, so k + 1 = 2 for now.
int number_of_sm = 0;
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
assert(("Error (GPU low latency PBS): the number of input LWEs must be lower "
"or equal to the "
"number of streaming multiprocessors on the device divided by 8 * "
"level_count",
num_samples <= number_of_sm * 4. / 2. / level_count));
checks_bootstrap_low_latency(64, level_count, base_log, polynomial_size,
num_samples);
switch (polynomial_size) {
case 256:

View File

@@ -1,6 +1,38 @@
#include "circuit_bootstrap.cuh"
#include "circuit_bootstrap.h"
/*
 * Runs standard checks to validate the inputs
 *
 * Validates the polynomial degree shared by the circuit bootstrap scratch and
 * run entry points. `number_of_inputs` is unused here; the input-count bound
 * is enforced by checks_circuit_bootstrap. Aborts via assert on failure.
 */
void checks_fast_circuit_bootstrap(int polynomial_size, int number_of_inputs) {
  // Only these power-of-two degrees have template instantiations.
  const bool supported_degree =
      polynomial_size == 256 || polynomial_size == 512 ||
      polynomial_size == 1024 || polynomial_size == 2048 ||
      polynomial_size == 4096 || polynomial_size == 8192;
  assert(("Error (GPU circuit bootstrap): polynomial_size should be one of "
          "256, 512, 1024, 2048, 4096, 8192",
          supported_degree));
}
/*
 * Runs standard checks to validate the inputs
 *
 * Full validation for the circuit bootstrap run entry points: the number of
 * inputs must fit the occupancy-derived launch bound
 * SM / (4 * (k + 1) * l) = SM / (8 * level_bsk) for k = 1, then the degree
 * check is delegated to checks_fast_circuit_bootstrap.
 * NOTE(review): queries device 0 unconditionally — confirm this matches the
 * gpu_index used by the callers.
 */
void checks_circuit_bootstrap(int polynomial_size, int level_bsk,
                              int number_of_inputs) {
  // The number of samples should be lower than the number of streaming
  // multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
  // to the occupancy of 50%). The only supported value for k is 1, so
  // k + 1 = 2 for now.
  int number_of_sm = 0;
  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
  // Fixed: the message was copy-pasted from the bit extraction checks — it
  // reported "GPU extract bits" and "level_count_bsk" for a circuit
  // bootstrap failure whose parameter is named level_bsk.
  assert(("Error (GPU circuit bootstrap): the number of input LWEs must be "
          "lower or equal to the "
          "number of streaming multiprocessors on the device divided by 8 * "
          "level_bsk",
          number_of_inputs <= number_of_sm / 4. / 2. / level_bsk));
  checks_fast_circuit_bootstrap(polynomial_size, number_of_inputs);
}
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the circuit bootstrap on 32 bits inputs, into `cbs_buffer`. It also
@@ -12,6 +44,8 @@ void scratch_cuda_circuit_bootstrap_32(
uint32_t level_count_cbs, uint32_t number_of_inputs,
uint32_t max_shared_memory, bool allocate_gpu_memory) {
checks_fast_circuit_bootstrap(polynomial_size, number_of_inputs);
switch (polynomial_size) {
case 256:
scratch_circuit_bootstrap<uint32_t, int32_t, Degree<256>>(
@@ -65,6 +99,8 @@ void scratch_cuda_circuit_bootstrap_64(
uint32_t level_count_cbs, uint32_t number_of_inputs,
uint32_t max_shared_memory, bool allocate_gpu_memory) {
checks_fast_circuit_bootstrap(polynomial_size, number_of_inputs);
switch (polynomial_size) {
case 256:
scratch_circuit_bootstrap<uint64_t, int64_t, Degree<256>>(
@@ -119,22 +155,9 @@ void cuda_circuit_bootstrap_32(
uint32_t base_log_bsk, uint32_t level_pksk, uint32_t base_log_pksk,
uint32_t level_cbs, uint32_t base_log_cbs, uint32_t number_of_inputs,
uint32_t max_shared_memory) {
assert(("Error (GPU circuit bootstrap): polynomial_size should be one of "
"256, 512, 1024, 2048, 4096, 8192",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
// The number of samples should be lower than the number of streaming
// multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
// to the occupancy of 50%). The only supported value for k is 1, so
// k + 1 = 2 for now.
int number_of_sm = 0;
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
assert(("Error (GPU extract bits): the number of input LWEs must be lower or "
"equal to the "
"number of streaming multiprocessors on the device divided by 8 * "
"level_count_bsk",
number_of_inputs <= number_of_sm / 4. / 2. / level_bsk));
checks_circuit_bootstrap(polynomial_size, level_bsk, number_of_inputs);
switch (polynomial_size) {
case 256:
host_circuit_bootstrap<uint32_t, Degree<256>>(
@@ -228,23 +251,9 @@ void cuda_circuit_bootstrap_64(
uint32_t base_log_bsk, uint32_t level_pksk, uint32_t base_log_pksk,
uint32_t level_cbs, uint32_t base_log_cbs, uint32_t number_of_inputs,
uint32_t max_shared_memory) {
assert(("Error (GPU circuit bootstrap): polynomial_size should be one of "
"256, 512, 1024, 2048, 4096, 8192",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
// The number of samples should be lower than the number of streaming
// multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
// to the occupancy of 50%). The only supported value for k is 1, so
// k + 1 = 2 for now.
int number_of_sm = 0;
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
assert(("Error (GPU extract bits): the number of input LWEs must be lower or "
"equal to the "
"number of streaming multiprocessors on the device divided by 8 * "
"level_count_bsk",
number_of_inputs <= number_of_sm / 4. / 2. / level_bsk));
// The number of samples should be lower than the number of streaming
checks_circuit_bootstrap(polynomial_size, level_bsk, number_of_inputs);
switch (polynomial_size) {
case 256:
host_circuit_bootstrap<uint64_t, Degree<256>>(

View File

@@ -2,6 +2,46 @@
#include "vertical_packing.h"
#include <cassert>
/*
 * Runs standard checks to validate the inputs
 *
 * Validates the parameters shared by the Cmux tree scratch and run entry
 * points: a supported polynomial degree and at least one tree layer.
 * `nbits` is unused here; it keeps the signature symmetric with
 * checks_cmux_tree. Aborts via assert on failure.
 */
void checks_fast_cmux_tree(int nbits, int polynomial_size, int r) {
  // Only these power-of-two degrees have template instantiations.
  const bool supported_degree =
      polynomial_size == 256 || polynomial_size == 512 ||
      polynomial_size == 1024 || polynomial_size == 2048 ||
      polynomial_size == 4096 || polynomial_size == 8192;
  assert((
      "Error (GPU Cmux tree): polynomial size should be one of 256, 512, 1024, "
      "2048, 4096, 8192",
      supported_degree));
  // For larger k we will need to adjust the mask size
  assert(("Error (GPU Cmux tree): r, the number of layers in the tree, should "
          "be >= 1 ",
          r >= 1));
}
/*
 * Runs standard checks to validate the inputs
 *
 * Full validation for the Cmux tree run entry points: the decomposition base
 * log must fit in the ciphertext width (`nbits`), then delegates the degree
 * and layer-count checks to checks_fast_cmux_tree.
 */
void checks_cmux_tree(int nbits, int polynomial_size, int base_log, int r) {
  // A base log wider than the torus representation is meaningless.
  assert(("Error (GPU Cmux tree): base log should be <= nbits",
          base_log <= nbits));
  checks_fast_cmux_tree(nbits, polynomial_size, r);
}
/*
 * Runs standard checks to validate the inputs
 *
 * Validates that the polynomial degree used by the blind rotation + sample
 * extraction entry points is one of the supported power-of-two sizes.
 * Aborts via assert on failure.
 */
void checks_blind_rotation_and_sample_extraction(int polynomial_size) {
  // Only these power-of-two degrees have template instantiations.
  const bool supported_degree =
      polynomial_size == 256 || polynomial_size == 512 ||
      polynomial_size == 1024 || polynomial_size == 2048 ||
      polynomial_size == 4096 || polynomial_size == 8192;
  assert(("Error (GPU Blind rotation + sample extraction): polynomial size "
          "should be one of 256, 512, "
          "1024, "
          "2048, 4096, 8192",
          supported_degree));
}
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the Cmux tree on 32 bits inputs, into `cmux_tree_buffer`. It also configures
@@ -14,6 +54,7 @@ void scratch_cuda_cmux_tree_32(void *v_stream, uint32_t gpu_index,
uint32_t r, uint32_t tau,
uint32_t max_shared_memory,
bool allocate_gpu_memory) {
checks_fast_cmux_tree(32, polynomial_size, r);
switch (polynomial_size) {
case 256:
@@ -63,6 +104,8 @@ void scratch_cuda_cmux_tree_64(void *v_stream, uint32_t gpu_index,
uint32_t r, uint32_t tau,
uint32_t max_shared_memory,
bool allocate_gpu_memory) {
checks_fast_cmux_tree(64, polynomial_size, r);
switch (polynomial_size) {
case 256:
scratch_cmux_tree<uint64_t, int64_t, Degree<256>>(
@@ -110,17 +153,7 @@ void cuda_cmux_tree_32(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
uint32_t level_count, uint32_t r, uint32_t tau,
uint32_t max_shared_memory) {
assert(("Error (GPU Cmux tree): base log should be <= 32", base_log <= 32));
assert((
"Error (GPU Cmux tree): polynomial size should be one of 256, 512, 1024, "
"2048, 4096, 8192",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
// For larger k we will need to adjust the mask size
assert(("Error (GPU Cmux tree): r, the number of layers in the tree, should "
"be >= 1 ",
r >= 1));
checks_cmux_tree(32, polynomial_size, base_log, r);
switch (polynomial_size) {
case 256:
@@ -197,18 +230,7 @@ void cuda_cmux_tree_64(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t r, uint32_t tau,
uint32_t max_shared_memory) {
assert(("Error (GPU Cmux tree): base log should be <= 64", base_log <= 64));
assert((
"Error (GPU Cmux tree): polynomial size should be one of 256, 512, 1024, "
"2048, 4096, 8192",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
// For larger k we will need to adjust the mask size
assert(("Error (GPU Cmux tree): r, the number of layers in the tree, should "
"be >= 1 ",
r >= 1));
checks_cmux_tree(64, polynomial_size, base_log, r);
switch (polynomial_size) {
case 256:
@@ -273,6 +295,7 @@ void scratch_cuda_blind_rotation_sample_extraction_32(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t mbr_size, uint32_t tau, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
checks_blind_rotation_and_sample_extraction(polynomial_size);
switch (polynomial_size) {
case 256:
@@ -320,6 +343,7 @@ void scratch_cuda_blind_rotation_sample_extraction_64(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t mbr_size, uint32_t tau, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
checks_blind_rotation_and_sample_extraction(polynomial_size);
switch (polynomial_size) {
case 256:
@@ -386,6 +410,7 @@ void cuda_blind_rotate_and_sample_extraction_64(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t l_gadget, uint32_t max_shared_memory) {
checks_blind_rotation_and_sample_extraction(polynomial_size);
switch (polynomial_size) {
case 256:
host_blind_rotate_and_sample_extraction<uint64_t, int64_t, Degree<256>>(

View File

@@ -1,5 +1,53 @@
#include "wop_bootstrap.cuh"
/*
 * Runs standard checks to validate the inputs
 *
 * Validates the WOP PBS parameters: a supported polynomial degree, and a
 * number of inputs fitting the occupancy-derived launch bound
 * SM / (4 * (k + 1) * l) = SM / (8 * level_count_bsk) for k = 1.
 * Aborts via assert on failure.
 * NOTE(review): queries device 0 unconditionally — confirm this matches the
 * gpu_index used by the callers.
 */
void checks_wop_pbs(int polynomial_size, int level_count_bsk,
                    int number_of_inputs) {
  // Only these power-of-two degrees have template instantiations.
  const bool supported_degree =
      polynomial_size == 256 || polynomial_size == 512 ||
      polynomial_size == 1024 || polynomial_size == 2048 ||
      polynomial_size == 4096 || polynomial_size == 8192;
  assert(("Error (GPU WOP PBS): polynomial_size should be one of "
          "256, 512, 1024, 2048, 4096, 8192",
          supported_degree));
  // The number of inputs should be lower than the number of streaming
  // multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
  // to the occupancy of 50%). The only supported value for k is 1, so
  // k + 1 = 2 for now.
  int sm_count = 0;
  cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, 0);
  assert(("Error (GPU WOP PBS): the number of input LWEs must be lower or "
          "equal to the "
          "number of streaming multiprocessors on the device divided by 8 * "
          "level_count_bsk",
          number_of_inputs <= sm_count / 4. / 2. / level_count_bsk));
}
/*
 * Runs standard checks to validate the inputs
 *
 * Validates the polynomial degree shared by the circuit bootstrap + vertical
 * packing scratch and run entry points. Aborts via assert on failure.
 */
void checks_fast_circuit_bootstrap_vertical_packing(int polynomial_size) {
  // Only these power-of-two degrees have template instantiations.
  const bool supported_degree =
      polynomial_size == 256 || polynomial_size == 512 ||
      polynomial_size == 1024 || polynomial_size == 2048 ||
      polynomial_size == 4096 || polynomial_size == 8192;
  assert(("Error (GPU circuit bootstrap): polynomial_size should be one of "
          "256, 512, 1024, 2048, 4096, 8192",
          supported_degree));
}
/*
 * Runs standard checks to validate the inputs
 *
 * Full validation for the circuit bootstrap + vertical packing run entry
 * points: the number of inputs must fit the occupancy-derived launch bound
 * SM / (4 * (k + 1) * l) = SM / (8 * level_count_bsk) for k = 1, then the
 * degree check is delegated to
 * checks_fast_circuit_bootstrap_vertical_packing.
 * NOTE(review): queries device 0 unconditionally — confirm this matches the
 * gpu_index used by the callers.
 */
void checks_circuit_bootstrap_vertical_packing(int polynomial_size,
                                               int number_of_inputs,
                                               int level_count_bsk) {
  // The number of inputs should be lower than the number of streaming
  // multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
  // to the occupancy of 50%). The only supported value for k is 1, so
  // k + 1 = 2 for now.
  int number_of_sm = 0;
  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
  // Fixed: the message was copy-pasted from the bit extraction checks and
  // reported "GPU extract bits" for a circuit bootstrap failure.
  assert(("Error (GPU circuit bootstrap): the number of input LWEs must be "
          "lower or equal to the "
          "number of streaming multiprocessors on the device divided by 8 * "
          "level_count_bsk",
          number_of_inputs <= number_of_sm / 4. / 2. / level_count_bsk));
  checks_fast_circuit_bootstrap_vertical_packing(polynomial_size);
}
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the circuit bootstrap and vertical packing on 32 bits inputs, into
@@ -13,6 +61,8 @@ void scratch_cuda_circuit_bootstrap_vertical_packing_32(
uint32_t number_of_inputs, uint32_t tau, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
checks_fast_circuit_bootstrap_vertical_packing(polynomial_size);
switch (polynomial_size) {
case 256:
scratch_circuit_bootstrap_vertical_packing<uint32_t, int32_t, Degree<256>>(
@@ -68,6 +118,8 @@ void scratch_cuda_circuit_bootstrap_vertical_packing_64(
uint32_t number_of_inputs, uint32_t tau, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
checks_fast_circuit_bootstrap_vertical_packing(polynomial_size);
switch (polynomial_size) {
case 256:
scratch_circuit_bootstrap_vertical_packing<uint64_t, int64_t, Degree<256>>(
@@ -124,6 +176,7 @@ void scratch_cuda_wop_pbs_32(
uint32_t number_of_bits_of_message_including_padding,
uint32_t number_of_bits_to_extract, uint32_t number_of_inputs,
uint32_t max_shared_memory, bool allocate_gpu_memory) {
checks_wop_pbs(polynomial_size, level_count_bsk, number_of_inputs);
switch (polynomial_size) {
case 256:
scratch_wop_pbs<uint32_t, int32_t, Degree<256>>(
@@ -192,6 +245,7 @@ void scratch_cuda_wop_pbs_64(
uint32_t number_of_bits_of_message_including_padding,
uint32_t number_of_bits_to_extract, uint32_t number_of_inputs,
uint32_t max_shared_memory, bool allocate_gpu_memory) {
checks_wop_pbs(polynomial_size, level_count_bsk, number_of_inputs);
switch (polynomial_size) {
case 256:
scratch_wop_pbs<uint64_t, int64_t, Degree<256>>(
@@ -282,22 +336,10 @@ void cuda_circuit_bootstrap_vertical_packing_64(
uint32_t level_count_pksk, uint32_t base_log_pksk, uint32_t level_count_cbs,
uint32_t base_log_cbs, uint32_t number_of_inputs, uint32_t lut_number,
uint32_t max_shared_memory) {
assert(("Error (GPU circuit bootstrap): polynomial_size should be one of "
"256, 512, 1024, 2048, 4096, 8192",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
// The number of inputs should be lower than the number of streaming
// multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
// to the occupancy of 50%). The only supported value for k is 1, so
// k + 1 = 2 for now.
int number_of_sm = 0;
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
assert(("Error (GPU extract bits): the number of input LWEs must be lower or "
"equal to the "
"number of streaming multiprocessors on the device divided by 8 * "
"level_count_bsk",
number_of_inputs <= number_of_sm / 4. / 2. / level_count_bsk));
checks_circuit_bootstrap_vertical_packing(polynomial_size, number_of_inputs,
level_count_bsk);
switch (polynomial_size) {
case 256:
host_circuit_bootstrap_vertical_packing<uint64_t, int64_t, Degree<256>>(
@@ -411,22 +453,7 @@ void cuda_wop_pbs_64(void *v_stream, uint32_t gpu_index, void *lwe_array_out,
uint32_t number_of_bits_of_message_including_padding,
uint32_t number_of_bits_to_extract, uint32_t delta_log,
uint32_t number_of_inputs, uint32_t max_shared_memory) {
assert(("Error (GPU WOP PBS): polynomial_size should be one of "
"256, 512, 1024, 2048, 4096, 8192",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
// The number of inputs should be lower than the number of streaming
// multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
// to the occupancy of 50%). The only supported value for k is 1, so
// k + 1 = 2 for now.
int number_of_sm = 0;
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
assert(("Error (GPU WOP PBS): the number of input LWEs must be lower or "
"equal to the "
"number of streaming multiprocessors on the device divided by 8 * "
"level_count_bsk",
number_of_inputs <= number_of_sm / 4. / 2. / level_count_bsk));
checks_wop_pbs(polynomial_size, level_count_bsk, number_of_inputs);
switch (polynomial_size) {
case 256:
host_wop_pbs<uint64_t, int64_t, Degree<256>>(