diff --git a/backends/concrete-cuda/implementation/src/bit_extraction.cu b/backends/concrete-cuda/implementation/src/bit_extraction.cu
index 571c3e14c..5d057b088 100644
--- a/backends/concrete-cuda/implementation/src/bit_extraction.cu
+++ b/backends/concrete-cuda/implementation/src/bit_extraction.cu
@@ -1,5 +1,41 @@
 #include "bit_extraction.cuh"
 
+/*
+ * Runs standard checks to validate the inputs
+ */
+void checks_fast_extract_bits(int nbits, int polynomial_size,
+                              int level_count_bsk, int number_of_samples) {
+
+  assert(("Error (GPU extract bits): polynomial_size should be one of "
+          "256, 512, 1024, 2048, 4096, 8192",
+          polynomial_size == 256 || polynomial_size == 512 ||
+              polynomial_size == 1024 || polynomial_size == 2048 ||
+              polynomial_size == 4096 || polynomial_size == 8192));
+  // The number of samples should be lower than the number of streaming
+  // multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being
+  // related to the occupancy of 50%). The only supported value for k is 1, so
+  // k + 1 = 2 for now.
+  int number_of_sm = 0;
+  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
+  assert(("Error (GPU extract bits): the number of input LWEs must be lower or "
+          "equal to the "
+          "number of streaming multiprocessors on the device divided by 8 * "
+          "level_count_bsk",
+          number_of_samples <= number_of_sm / 4. / 2. / level_count_bsk));
+}
+
+/*
+ * Runs standard checks to validate the inputs
+ */
+void checks_extract_bits(int nbits, int polynomial_size, int base_log_bsk,
+                         int level_count_bsk, int number_of_samples) {
+
+  assert(("Error (GPU extract bits): base log should be <= nbits",
+          base_log_bsk <= nbits));
+  checks_fast_extract_bits(nbits, polynomial_size, level_count_bsk,
+                           number_of_samples);
+}
+
 /*
  * This scratch function allocates the necessary amount of data on the GPU for
  * the bit extraction on 32 bits inputs, into `cbs_buffer`. It also
@@ -11,6 +47,8 @@ void scratch_cuda_extract_bits_32(
     uint32_t level_count, uint32_t number_of_inputs, uint32_t max_shared_memory,
     bool allocate_gpu_memory) {
 
+  checks_fast_extract_bits(32, polynomial_size, level_count, number_of_inputs);
+
   switch (polynomial_size) {
   case 256:
     scratch_extract_bits>(
@@ -63,6 +101,7 @@ void scratch_cuda_extract_bits_64(
     uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
     uint32_t level_count, uint32_t number_of_inputs, uint32_t max_shared_memory,
     bool allocate_gpu_memory) {
+  checks_fast_extract_bits(64, polynomial_size, level_count, number_of_inputs);
 
   switch (polynomial_size) {
   case 256:
@@ -119,24 +158,8 @@ void cuda_extract_bits_32(void *v_stream, uint32_t gpu_index,
                           uint32_t level_count_bsk, uint32_t base_log_ksk,
                           uint32_t level_count_ksk, uint32_t number_of_samples,
                           uint32_t max_shared_memory) {
-  assert(("Error (GPU extract bits): base log should be <= 32",
-          base_log_bsk <= 32));
-  assert(("Error (GPU extract bits): polynomial_size should be one of "
-          "256, 512, 1024, 2048, 4096, 8192",
-          polynomial_size == 256 || polynomial_size == 512 ||
-              polynomial_size == 1024 || polynomial_size == 2048 ||
-              polynomial_size == 4096 || polynomial_size == 8192));
-  // The number of samples should be lower than four time the number of
-  // streaming multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being
-  // related to the occupancy of 50%). The only supported value for k is 1, so
-  // k + 1 = 2 for now.
-  int number_of_sm = 0;
-  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
-  assert(("Error (GPU extract bits): the number of input LWEs must be lower or "
-          "equal to the "
-          "number of streaming multiprocessors on the device divided by 8 * "
-          "level_count_bsk",
-          number_of_samples <= number_of_sm / 4. / 2. / level_count_bsk));
+  checks_extract_bits(32, polynomial_size, base_log_bsk, level_count_bsk,
+                      number_of_samples);
 
   switch (polynomial_size) {
   case 256:
@@ -253,24 +276,8 @@ void cuda_extract_bits_64(void *v_stream, uint32_t gpu_index,
                           uint32_t level_count_bsk, uint32_t base_log_ksk,
                           uint32_t level_count_ksk, uint32_t number_of_samples,
                           uint32_t max_shared_memory) {
-  assert(("Error (GPU extract bits): base log should be <= 64",
-          base_log_bsk <= 64));
-  assert(("Error (GPU extract bits): polynomial_size should be one of "
-          "256, 512, 1024, 2048, 4096, 8192",
-          polynomial_size == 256 || polynomial_size == 512 ||
-              polynomial_size == 1024 || polynomial_size == 2048 ||
-              polynomial_size == 4096 || polynomial_size == 8192));
-  // The number of samples should be lower than four time the number of
-  // streaming multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being
-  // related to the occupancy of 50%). The only supported value for k is 1, so
-  // k + 1 = 2 for now.
-  int number_of_sm = 0;
-  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
-  assert(("Error (GPU extract bits): the number of input LWEs must be lower or "
-          "equal to the "
-          "number of streaming multiprocessors on the device divided by 8 * "
-          "level_count_bsk",
-          number_of_samples <= number_of_sm / 4. / 2. / level_count_bsk));
+  checks_extract_bits(64, polynomial_size, base_log_bsk, level_count_bsk,
+                      number_of_samples);
 
   switch (polynomial_size) {
   case 256:
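All of these validators reuse the same `assert(("message", condition))` construction, which is worth spelling out once since it looks unusual. The inner parentheses make the whole thing a single macro argument, and the comma operator evaluates the string, discards it, and yields the condition, so only the condition is tested; the message still shows up in the diagnostic because `assert` prints the failing expression verbatim. A minimal standalone sketch (illustrative, not code from this patch):

```cpp
// Sketch of the assert-with-message idiom used by these checks. The comma
// operator discards the string literal and tests only the condition; on
// failure, the printed expression includes the message text.
#include <cassert>

int main() {
  int polynomial_size = 1024; // hypothetical input
  assert(("polynomial_size should be one of 256, 512, 1024, 2048, 4096, 8192",
          polynomial_size == 256 || polynomial_size == 512 ||
          polynomial_size == 1024 || polynomial_size == 2048 ||
          polynomial_size == 4096 || polynomial_size == 8192));
  return 0;
}
```

A stray pair of parentheses is all that separates this from `assert("message", condition)`, which would not compile as intended; centralizing the idiom in shared helpers, as this patch does, limits the places it can be mistyped.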
diff --git a/backends/concrete-cuda/implementation/src/bootstrap_amortized.cu b/backends/concrete-cuda/implementation/src/bootstrap_amortized.cu
index 4580b0ca8..c1a64bd7d 100644
--- a/backends/concrete-cuda/implementation/src/bootstrap_amortized.cu
+++ b/backends/concrete-cuda/implementation/src/bootstrap_amortized.cu
@@ -1,5 +1,26 @@
 #include "bootstrap_amortized.cuh"
 
+/*
+ * Runs standard checks to validate the inputs
+ */
+void checks_fast_bootstrap_amortized(int nbits, int polynomial_size) {
+  assert(
+      ("Error (GPU amortized PBS): polynomial size should be one of 256, 512, "
+       "1024, 2048, 4096, 8192",
+       polynomial_size == 256 || polynomial_size == 512 ||
+           polynomial_size == 1024 || polynomial_size == 2048 ||
+           polynomial_size == 4096 || polynomial_size == 8192));
+}
+
+/*
+ * Runs standard checks to validate the inputs
+ */
+void checks_bootstrap_amortized(int nbits, int base_log, int polynomial_size) {
+  assert(("Error (GPU amortized PBS): base log should be <= nbits",
+          base_log <= nbits));
+  checks_fast_bootstrap_amortized(nbits, polynomial_size);
+}
+
 /*
  * This scratch function allocates the necessary amount of data on the GPU for
  * the amortized PBS on 32 bits inputs, into `pbs_buffer`. It also
@@ -13,6 +34,7 @@ void scratch_cuda_bootstrap_amortized_32(void *v_stream, uint32_t gpu_index,
                                          uint32_t input_lwe_ciphertext_count,
                                          uint32_t max_shared_memory,
                                          bool allocate_gpu_memory) {
+  checks_fast_bootstrap_amortized(32, polynomial_size);
 
   switch (polynomial_size) {
   case 256:
@@ -63,6 +85,7 @@ void scratch_cuda_bootstrap_amortized_64(void *v_stream, uint32_t gpu_index,
                                          uint32_t input_lwe_ciphertext_count,
                                          uint32_t max_shared_memory,
                                          bool allocate_gpu_memory) {
+  checks_fast_bootstrap_amortized(64, polynomial_size);
 
   switch (polynomial_size) {
   case 256:
@@ -111,14 +134,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
     uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
     uint32_t max_shared_memory) {
 
-  assert(
-      ("Error (GPU amortized PBS): base log should be <= 32", base_log <= 32));
-  assert(
-      ("Error (GPU amortized PBS): polynomial size should be one of 256, 512, "
-       "1024, 2048, 4096, 8192",
-       polynomial_size == 256 || polynomial_size == 512 ||
-           polynomial_size == 1024 || polynomial_size == 2048 ||
-           polynomial_size == 4096 || polynomial_size == 8192));
+  checks_bootstrap_amortized(32, base_log, polynomial_size);
 
   switch (polynomial_size) {
   case 256:
@@ -247,14 +263,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
     uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
     uint32_t max_shared_memory) {
 
-  assert(
-      ("Error (GPU amortized PBS): base log should be <= 64", base_log <= 64));
-  assert(
-      ("Error (GPU amortized PBS): polynomial size should be one of 256, 512, "
-       "1024, 2048, 4096, 8192",
-       polynomial_size == 256 || polynomial_size == 512 ||
-           polynomial_size == 1024 || polynomial_size == 2048 ||
-           polynomial_size == 4096 || polynomial_size == 8192));
+  checks_bootstrap_amortized(64, base_log, polynomial_size);
 
   switch (polynomial_size) {
   case 256:
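Now that the six-way size whitelist lives in one helper per file, it may be worth noting that the accepted values are exactly the powers of two from 256 to 8192, so the disjunction could be collapsed into a range-plus-power-of-two test. A sketch of that alternative (a possible future cleanup, not something this patch does):

```cpp
// Compact equivalent of the repeated six-way polynomial-size check:
// the supported sizes are exactly the powers of two in [256, 8192].
#include <cassert>
#include <cstdint>

bool is_supported_polynomial_size(uint32_t n) {
  // A power of two has a single set bit, i.e. (n & (n - 1)) == 0 for n != 0.
  return n >= 256 && n <= 8192 && (n & (n - 1)) == 0;
}

int main() {
  assert(is_supported_polynomial_size(2048));
  assert(!is_supported_polynomial_size(1000));
  return 0;
}
```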
diff --git a/backends/concrete-cuda/implementation/src/bootstrap_low_latency.cu b/backends/concrete-cuda/implementation/src/bootstrap_low_latency.cu
index 94f906bd8..31ae804e0 100644
--- a/backends/concrete-cuda/implementation/src/bootstrap_low_latency.cu
+++ b/backends/concrete-cuda/implementation/src/bootstrap_low_latency.cu
@@ -1,5 +1,41 @@
 #include "bootstrap_low_latency.cuh"
 
+/*
+ * Runs standard checks to validate the inputs
+ */
+void checks_fast_bootstrap_low_latency(int nbits, int level_count,
+                                       int polynomial_size, int num_samples) {
+
+  assert((
+      "Error (GPU low latency PBS): polynomial size should be one of 256, 512, "
+      "1024, 2048, 4096, 8192",
+      polynomial_size == 256 || polynomial_size == 512 ||
+          polynomial_size == 1024 || polynomial_size == 2048 ||
+          polynomial_size == 4096 || polynomial_size == 8192));
+  // The number of samples should be lower than 4 * SM/((k + 1) * l) (the
+  // factor 4 being related to the occupancy of 50%). The only supported
+  // value for k is 1, so k + 1 = 2 for now.
+  int number_of_sm = 0;
+  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
+  assert(("Error (GPU low latency PBS): the number of input LWEs must be lower "
+          "or equal to "
+          "4 * the number of streaming multiprocessors on the device divided "
+          "by (2 * level_count)",
+          num_samples <= number_of_sm * 4. / 2. / level_count));
+}
+
+/*
+ * Runs standard checks to validate the inputs
+ */
+void checks_bootstrap_low_latency(int nbits, int level_count, int base_log,
+                                  int polynomial_size, int num_samples) {
+
+  assert(("Error (GPU low latency PBS): base log should be <= nbits",
+          base_log <= nbits));
+  checks_fast_bootstrap_low_latency(nbits, level_count, polynomial_size,
+                                    num_samples);
+}
+
 /*
  * This scratch function allocates the necessary amount of data on the GPU for
  * the low latency PBS on 32 bits inputs, into `pbs_buffer`. It also
@@ -11,6 +47,8 @@ void scratch_cuda_bootstrap_low_latency_32(
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
     uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
     bool allocate_gpu_memory) {
+  checks_fast_bootstrap_low_latency(32, level_count, polynomial_size,
+                                    input_lwe_ciphertext_count);
 
   switch (polynomial_size) {
   case 256:
@@ -65,6 +103,8 @@ void scratch_cuda_bootstrap_low_latency_64(
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
     uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
     bool allocate_gpu_memory) {
+  checks_fast_bootstrap_low_latency(64, level_count, polynomial_size,
+                                    input_lwe_ciphertext_count);
 
   switch (polynomial_size) {
   case 256:
@@ -123,24 +163,8 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
     uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
     uint32_t max_shared_memory) {
 
-  assert(("Error (GPU low latency PBS): base log should be <= 32",
-          base_log <= 32));
-  assert((
-      "Error (GPU low latency PBS): polynomial size should be one of 256, 512, "
-      "1024, 2048, 4096, 8192",
-      polynomial_size == 256 || polynomial_size == 512 ||
-          polynomial_size == 1024 || polynomial_size == 2048 ||
-          polynomial_size == 4096 || polynomial_size == 8192));
-  // The number of samples should be lower than 4 * SM/((k + 1) * l) (the
-  // factor 4 being related to the occupancy of 50%). The only supported
-  // value for k is 1, so k + 1 = 2 for now.
-  int number_of_sm = 0;
-  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
-  assert(("Error (GPU low latency PBS): the number of input LWEs must be lower "
-          "or equal to the "
-          "number of streaming multiprocessors on the device divided by 8 * "
-          "level_count",
-          num_samples <= number_of_sm * 4. / 2. / level_count));
+  checks_bootstrap_low_latency(32, level_count, base_log, polynomial_size,
+                               num_samples);
 
   switch (polynomial_size) {
   case 256:
@@ -280,24 +304,8 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
     uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
     uint32_t max_shared_memory) {
 
-  assert(("Error (GPU low latency PBS): base log should be <= 64",
-          base_log <= 64));
-  assert((
-      "Error (GPU low latency PBS): polynomial size should be one of 256, 512, "
-      "1024, 2048, 4096, 8192",
-      polynomial_size == 256 || polynomial_size == 512 ||
-          polynomial_size == 1024 || polynomial_size == 2048 ||
-          polynomial_size == 4096 || polynomial_size == 8192));
-  // The number of samples should be lower than 4 * SM/((k + 1) * l) (the
-  // factor 4 being related to the occupancy of 50%). The only supported
-  // value for k is 1, so k + 1 = 2 for now.
-  int number_of_sm = 0;
-  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
-  assert(("Error (GPU low latency PBS): the number of input LWEs must be lower "
-          "or equal to the "
-          "number of streaming multiprocessors on the device divided by 8 * "
-          "level_count",
-          num_samples <= number_of_sm * 4. / 2. / level_count));
+  checks_bootstrap_low_latency(64, level_count, base_log, polynomial_size,
+                               num_samples);
 
   switch (polynomial_size) {
   case 256:
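The low latency path is the one place where the limit multiplies rather than divides by 4: per the comment, with 50% occupancy 4 blocks fit per SM, and each sample occupies (k + 1) * level_count = 2 * level_count blocks, so the ceiling is 4 * SM / (2 * level_count). Written out as a host-side helper (my reading of the check; the function name is illustrative, not part of the API):

```cpp
// Sketch of the occupancy-derived sample bound the low-latency check
// enforces: 4 resident blocks per SM at 50% occupancy, and
// (k + 1) * level_count = 2 * level_count blocks per sample with k == 1.
#include <cstdio>
#include <cuda_runtime.h>

int max_low_latency_samples(int level_count, int device = 0) {
  int number_of_sm = 0;
  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, device);
  return number_of_sm * 4 / (2 * level_count);
}

int main() {
  printf("bound for level_count = 2: %d samples\n",
         max_low_latency_samples(2));
  return 0;
}
```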
diff --git a/backends/concrete-cuda/implementation/src/circuit_bootstrap.cu b/backends/concrete-cuda/implementation/src/circuit_bootstrap.cu
index 55236f84c..e4716db0d 100644
--- a/backends/concrete-cuda/implementation/src/circuit_bootstrap.cu
+++ b/backends/concrete-cuda/implementation/src/circuit_bootstrap.cu
@@ -1,6 +1,38 @@
 #include "circuit_bootstrap.cuh"
 #include "circuit_bootstrap.h"
 
+/*
+ * Runs standard checks to validate the inputs
+ */
+void checks_fast_circuit_bootstrap(int polynomial_size, int number_of_inputs) {
+
+  assert(("Error (GPU circuit bootstrap): polynomial_size should be one of "
+          "256, 512, 1024, 2048, 4096, 8192",
+          polynomial_size == 256 || polynomial_size == 512 ||
+              polynomial_size == 1024 || polynomial_size == 2048 ||
+              polynomial_size == 4096 || polynomial_size == 8192));
+}
+
+/*
+ * Runs standard checks to validate the inputs
+ */
+void checks_circuit_bootstrap(int polynomial_size, int level_bsk,
+                              int number_of_inputs) {
+  // The number of samples should be lower than the number of streaming
+  // multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
+  // to the occupancy of 50%). The only supported value for k is 1, so
+  // k + 1 = 2 for now.
+  int number_of_sm = 0;
+  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
+  assert(("Error (GPU circuit bootstrap): the number of input LWEs must be "
+          "lower or equal to the "
+          "number of streaming multiprocessors on the device divided by 8 * "
+          "level_bsk",
+          number_of_inputs <= number_of_sm / 4. / 2. / level_bsk));
+
+  checks_fast_circuit_bootstrap(polynomial_size, number_of_inputs);
+}
+
 /*
  * This scratch function allocates the necessary amount of data on the GPU for
  * the circuit bootstrap on 32 bits inputs, into `cbs_buffer`. It also
@@ -12,6 +44,8 @@ void scratch_cuda_circuit_bootstrap_32(
     uint32_t level_count_cbs, uint32_t number_of_inputs,
     uint32_t max_shared_memory, bool allocate_gpu_memory) {
 
+  checks_fast_circuit_bootstrap(polynomial_size, number_of_inputs);
+
   switch (polynomial_size) {
   case 256:
     scratch_circuit_bootstrap>(
@@ -65,6 +99,8 @@ void scratch_cuda_circuit_bootstrap_64(
     uint32_t level_count_cbs, uint32_t number_of_inputs,
     uint32_t max_shared_memory, bool allocate_gpu_memory) {
 
+  checks_fast_circuit_bootstrap(polynomial_size, number_of_inputs);
+
   switch (polynomial_size) {
   case 256:
     scratch_circuit_bootstrap>(
@@ -119,22 +155,9 @@ void cuda_circuit_bootstrap_32(
     uint32_t base_log_bsk, uint32_t level_pksk, uint32_t base_log_pksk,
     uint32_t level_cbs, uint32_t base_log_cbs, uint32_t number_of_inputs,
     uint32_t max_shared_memory) {
-  assert(("Error (GPU circuit bootstrap): polynomial_size should be one of "
-          "256, 512, 1024, 2048, 4096, 8192",
-          polynomial_size == 256 || polynomial_size == 512 ||
-              polynomial_size == 1024 || polynomial_size == 2048 ||
-              polynomial_size == 4096 || polynomial_size == 8192));
-  // The number of samples should be lower than the number of streaming
-  // multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
-  // to the occupancy of 50%). The only supported value for k is 1, so
-  // k + 1 = 2 for now.
-  int number_of_sm = 0;
-  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
-  assert(("Error (GPU extract bits): the number of input LWEs must be lower or "
-          "equal to the "
-          "number of streaming multiprocessors on the device divided by 8 * "
-          "level_count_bsk",
-          number_of_inputs <= number_of_sm / 4. / 2. / level_bsk));
+
+  checks_circuit_bootstrap(polynomial_size, level_bsk, number_of_inputs);
+
   switch (polynomial_size) {
   case 256:
     host_circuit_bootstrap>(
@@ -228,23 +251,9 @@ void cuda_circuit_bootstrap_64(
     uint32_t base_log_bsk, uint32_t level_pksk, uint32_t base_log_pksk,
     uint32_t level_cbs, uint32_t base_log_cbs, uint32_t number_of_inputs,
     uint32_t max_shared_memory) {
-  assert(("Error (GPU circuit bootstrap): polynomial_size should be one of "
-          "256, 512, 1024, 2048, 4096, 8192",
-          polynomial_size == 256 || polynomial_size == 512 ||
-              polynomial_size == 1024 || polynomial_size == 2048 ||
-              polynomial_size == 4096 || polynomial_size == 8192));
-  // The number of samples should be lower than the number of streaming
-  // multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
-  // to the occupancy of 50%). The only supported value for k is 1, so
-  // k + 1 = 2 for now.
-  int number_of_sm = 0;
-  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
-  assert(("Error (GPU extract bits): the number of input LWEs must be lower or "
-          "equal to the "
-          "number of streaming multiprocessors on the device divided by 8 * "
-          "level_count_bsk",
-          number_of_inputs <= number_of_sm / 4. / 2. / level_bsk));
-  // The number of samples should be lower than the number of streaming
+
+  checks_circuit_bootstrap(polynomial_size, level_bsk, number_of_inputs);
+
   switch (polynomial_size) {
   case 256:
     host_circuit_bootstrap>(
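One caveat worth keeping in mind for all of these helpers: they rely on `assert`, so they disappear entirely from builds compiled with `-DNDEBUG`, and invalid sizes then reach the kernels unchecked. A small demonstration (standard C++ behavior, nothing specific to this patch):

```cpp
// Without NDEBUG this aborts at the assert with the message in the output;
// with -DNDEBUG the assert expands to nothing and "reached" is printed.
#include <cassert>
#include <cstdio>

int main() {
  int polynomial_size = 1000; // not a supported size
  assert(("polynomial_size should be a supported power of two",
          polynomial_size == 256 || polynomial_size == 512 ||
          polynomial_size == 1024 || polynomial_size == 2048 ||
          polynomial_size == 4096 || polynomial_size == 8192));
  printf("reached: the check was compiled out\n");
  return 0;
}
```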
diff --git a/backends/concrete-cuda/implementation/src/vertical_packing.cu b/backends/concrete-cuda/implementation/src/vertical_packing.cu
index eb6a437dd..cb7adfbe5 100644
--- a/backends/concrete-cuda/implementation/src/vertical_packing.cu
+++ b/backends/concrete-cuda/implementation/src/vertical_packing.cu
@@ -2,6 +2,46 @@
 #include "vertical_packing.h"
 #include 
 
+/*
+ * Runs standard checks to validate the inputs
+ */
+void checks_fast_cmux_tree(int nbits, int polynomial_size, int r) {
+  assert((
+      "Error (GPU Cmux tree): polynomial size should be one of 256, 512, 1024, "
+      "2048, 4096, 8192",
+      polynomial_size == 256 || polynomial_size == 512 ||
+          polynomial_size == 1024 || polynomial_size == 2048 ||
+          polynomial_size == 4096 || polynomial_size == 8192));
+  // For larger k we will need to adjust the mask size
+  assert(("Error (GPU Cmux tree): r, the number of layers in the tree, should "
+          "be >= 1 ",
+          r >= 1));
+}
+
+/*
+ * Runs standard checks to validate the inputs
+ */
+void checks_cmux_tree(int nbits, int polynomial_size, int base_log, int r) {
+
+  assert(("Error (GPU Cmux tree): base log should be <= nbits",
+          base_log <= nbits));
+  checks_fast_cmux_tree(nbits, polynomial_size, r);
+}
+
+/*
+ * Runs standard checks to validate the inputs
+ */
+void checks_blind_rotation_and_sample_extraction(int polynomial_size) {
+
+  assert(("Error (GPU Blind rotation + sample extraction): polynomial size "
+          "should be one of 256, 512, "
+          "1024, "
+          "2048, 4096, 8192",
+          polynomial_size == 256 || polynomial_size == 512 ||
+              polynomial_size == 1024 || polynomial_size == 2048 ||
+              polynomial_size == 4096 || polynomial_size == 8192));
+}
+
 /*
  * This scratch function allocates the necessary amount of data on the GPU for
  * the Cmux tree on 32 bits inputs, into `cmux_tree_buffer`. It also configures
@@ -14,6 +54,7 @@ void scratch_cuda_cmux_tree_32(void *v_stream, uint32_t gpu_index,
                                uint32_t r, uint32_t tau,
                                uint32_t max_shared_memory,
                                bool allocate_gpu_memory) {
+  checks_fast_cmux_tree(32, polynomial_size, r);
 
   switch (polynomial_size) {
   case 256:
@@ -63,6 +104,8 @@ void scratch_cuda_cmux_tree_64(void *v_stream, uint32_t gpu_index,
                                uint32_t r, uint32_t tau,
                                uint32_t max_shared_memory,
                                bool allocate_gpu_memory) {
+  checks_fast_cmux_tree(64, polynomial_size, r);
+
   switch (polynomial_size) {
   case 256:
     scratch_cmux_tree>(
@@ -110,17 +153,7 @@ void cuda_cmux_tree_32(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
                        uint32_t level_count, uint32_t r, uint32_t tau,
                        uint32_t max_shared_memory) {
 
-  assert(("Error (GPU Cmux tree): base log should be <= 32", base_log <= 32));
-  assert((
-      "Error (GPU Cmux tree): polynomial size should be one of 256, 512, 1024, "
-      "2048, 4096, 8192",
-      polynomial_size == 256 || polynomial_size == 512 ||
-          polynomial_size == 1024 || polynomial_size == 2048 ||
-          polynomial_size == 4096 || polynomial_size == 8192));
-  // For larger k we will need to adjust the mask size
-  assert(("Error (GPU Cmux tree): r, the number of layers in the tree, should "
-          "be >= 1 ",
-          r >= 1));
+  checks_cmux_tree(32, polynomial_size, base_log, r);
 
   switch (polynomial_size) {
   case 256:
@@ -197,18 +230,7 @@ void cuda_cmux_tree_64(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
                        uint32_t polynomial_size, uint32_t base_log,
                        uint32_t level_count, uint32_t r, uint32_t tau,
                        uint32_t max_shared_memory) {
-
-  assert(("Error (GPU Cmux tree): base log should be <= 64", base_log <= 64));
-  assert((
-      "Error (GPU Cmux tree): polynomial size should be one of 256, 512, 1024, "
-      "2048, 4096, 8192",
-      polynomial_size == 256 || polynomial_size == 512 ||
-          polynomial_size == 1024 || polynomial_size == 2048 ||
-          polynomial_size == 4096 || polynomial_size == 8192));
-  // For larger k we will need to adjust the mask size
-  assert(("Error (GPU Cmux tree): r, the number of layers in the tree, should "
-          "be >= 1 ",
-          r >= 1));
+  checks_cmux_tree(64, polynomial_size, base_log, r);
 
   switch (polynomial_size) {
   case 256:
@@ -273,6 +295,7 @@ void scratch_cuda_blind_rotation_sample_extraction_32(
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
     uint32_t mbr_size, uint32_t tau, uint32_t max_shared_memory,
     bool allocate_gpu_memory) {
+  checks_blind_rotation_and_sample_extraction(polynomial_size);
 
   switch (polynomial_size) {
   case 256:
@@ -320,6 +343,7 @@ void scratch_cuda_blind_rotation_sample_extraction_64(
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
     uint32_t mbr_size, uint32_t tau, uint32_t max_shared_memory,
     bool allocate_gpu_memory) {
+  checks_blind_rotation_and_sample_extraction(polynomial_size);
 
   switch (polynomial_size) {
   case 256:
@@ -386,6 +410,7 @@ void cuda_blind_rotate_and_sample_extraction_64(
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
     uint32_t l_gadget, uint32_t max_shared_memory) {
 
+  checks_blind_rotation_and_sample_extraction(polynomial_size);
   switch (polynomial_size) {
   case 256:
     host_blind_rotate_and_sample_extraction>(
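Because the validators are ordinary host functions, a caller can also run them up front, before any device allocation, and get an immediate abort with a readable message rather than a delayed kernel failure. A hypothetical caller-side sketch (parameter values invented; linking against the concrete-cuda objects is assumed):

```cpp
// Usage sketch: validate Cmux tree parameters before touching the device.
// The declaration matches the helper added in this patch.
void checks_cmux_tree(int nbits, int polynomial_size, int base_log, int r);

int main() {
  // Passes silently for valid parameters; aborts with the assert message
  // otherwise (in debug builds).
  checks_cmux_tree(/*nbits=*/64, /*polynomial_size=*/2048,
                   /*base_log=*/10, /*r=*/3);
  return 0;
}
```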
diff --git a/backends/concrete-cuda/implementation/src/wop_bootstrap.cu b/backends/concrete-cuda/implementation/src/wop_bootstrap.cu
index e21b3945c..15a740d0c 100644
--- a/backends/concrete-cuda/implementation/src/wop_bootstrap.cu
+++ b/backends/concrete-cuda/implementation/src/wop_bootstrap.cu
@@ -1,5 +1,53 @@
 #include "wop_bootstrap.cuh"
 
+/*
+ * Runs standard checks to validate the inputs
+ */
+void checks_wop_pbs(int polynomial_size, int level_count_bsk,
+                    int number_of_inputs) {
+  assert(("Error (GPU WOP PBS): polynomial_size should be one of "
+          "256, 512, 1024, 2048, 4096, 8192",
+          polynomial_size == 256 || polynomial_size == 512 ||
+              polynomial_size == 1024 || polynomial_size == 2048 ||
+              polynomial_size == 4096 || polynomial_size == 8192));
+  // The number of inputs should be lower than the number of streaming
+  // multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
+  // to the occupancy of 50%). The only supported value for k is 1, so
+  // k + 1 = 2 for now.
+  int number_of_sm = 0;
+  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
+  assert(("Error (GPU WOP PBS): the number of input LWEs must be lower or "
+          "equal to the "
+          "number of streaming multiprocessors on the device divided by 8 * "
+          "level_count_bsk",
+          number_of_inputs <= number_of_sm / 4. / 2. / level_count_bsk));
+}
+
+void checks_fast_circuit_bootstrap_vertical_packing(int polynomial_size) {
+  assert(("Error (GPU circuit bootstrap): polynomial_size should be one of "
+          "256, 512, 1024, 2048, 4096, 8192",
+          polynomial_size == 256 || polynomial_size == 512 ||
+              polynomial_size == 1024 || polynomial_size == 2048 ||
+              polynomial_size == 4096 || polynomial_size == 8192));
+}
+
+void checks_circuit_bootstrap_vertical_packing(int polynomial_size,
+                                               int number_of_inputs,
+                                               int level_count_bsk) {
+  // The number of inputs should be lower than the number of streaming
+  // multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
+  // to the occupancy of 50%). The only supported value for k is 1, so
+  // k + 1 = 2 for now.
+  int number_of_sm = 0;
+  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
+  assert(("Error (GPU circuit bootstrap): the number of input LWEs must be "
+          "lower or equal to the "
+          "number of streaming multiprocessors on the device divided by 8 * "
+          "level_count_bsk",
+          number_of_inputs <= number_of_sm / 4. / 2. / level_count_bsk));
+  checks_fast_circuit_bootstrap_vertical_packing(polynomial_size);
+}
+
 /*
  * This scratch function allocates the necessary amount of data on the GPU for
  * the circuit bootstrap and vertical packing on 32 bits inputs, into
@@ -13,6 +61,8 @@ void scratch_cuda_circuit_bootstrap_vertical_packing_32(
     uint32_t number_of_inputs, uint32_t tau, uint32_t max_shared_memory,
     bool allocate_gpu_memory) {
 
+  checks_fast_circuit_bootstrap_vertical_packing(polynomial_size);
+
   switch (polynomial_size) {
   case 256:
     scratch_circuit_bootstrap_vertical_packing>(
@@ -68,6 +118,8 @@ void scratch_cuda_circuit_bootstrap_vertical_packing_64(
     uint32_t number_of_inputs, uint32_t tau, uint32_t max_shared_memory,
     bool allocate_gpu_memory) {
 
+  checks_fast_circuit_bootstrap_vertical_packing(polynomial_size);
+
   switch (polynomial_size) {
   case 256:
     scratch_circuit_bootstrap_vertical_packing>(
@@ -124,6 +176,7 @@ void scratch_cuda_wop_pbs_32(
     uint32_t number_of_bits_of_message_including_padding,
     uint32_t number_of_bits_to_extract, uint32_t number_of_inputs,
     uint32_t max_shared_memory, bool allocate_gpu_memory) {
+  checks_wop_pbs(polynomial_size, level_count_bsk, number_of_inputs);
   switch (polynomial_size) {
   case 256:
     scratch_wop_pbs>(
@@ -192,6 +245,7 @@ void scratch_cuda_wop_pbs_64(
     uint32_t number_of_bits_of_message_including_padding,
     uint32_t number_of_bits_to_extract, uint32_t number_of_inputs,
     uint32_t max_shared_memory, bool allocate_gpu_memory) {
+  checks_wop_pbs(polynomial_size, level_count_bsk, number_of_inputs);
   switch (polynomial_size) {
   case 256:
     scratch_wop_pbs>(
@@ -282,22 +336,10 @@ void cuda_circuit_bootstrap_vertical_packing_64(
     uint32_t level_count_pksk, uint32_t base_log_pksk, uint32_t level_count_cbs,
     uint32_t base_log_cbs, uint32_t number_of_inputs, uint32_t lut_number,
     uint32_t max_shared_memory) {
-  assert(("Error (GPU circuit bootstrap): polynomial_size should be one of "
-          "256, 512, 1024, 2048, 4096, 8192",
-          polynomial_size == 256 || polynomial_size == 512 ||
-              polynomial_size == 1024 || polynomial_size == 2048 ||
-              polynomial_size == 4096 || polynomial_size == 8192));
-  // The number of inputs should be lower than the number of streaming
-  // multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
-  // to the occupancy of 50%). The only supported value for k is 1, so
-  // k + 1 = 2 for now.
-  int number_of_sm = 0;
-  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
-  assert(("Error (GPU extract bits): the number of input LWEs must be lower or "
-          "equal to the "
-          "number of streaming multiprocessors on the device divided by 8 * "
-          "level_count_bsk",
-          number_of_inputs <= number_of_sm / 4. / 2. / level_count_bsk));
+
+  checks_circuit_bootstrap_vertical_packing(polynomial_size, number_of_inputs,
+                                            level_count_bsk);
+
   switch (polynomial_size) {
   case 256:
     host_circuit_bootstrap_vertical_packing>(
@@ -411,22 +453,7 @@ void cuda_wop_pbs_64(void *v_stream, uint32_t gpu_index, void *lwe_array_out,
                      uint32_t number_of_bits_of_message_including_padding,
                      uint32_t number_of_bits_to_extract, uint32_t delta_log,
                      uint32_t number_of_inputs, uint32_t max_shared_memory) {
-  assert(("Error (GPU WOP PBS): polynomial_size should be one of "
-          "256, 512, 1024, 2048, 4096, 8192",
-          polynomial_size == 256 || polynomial_size == 512 ||
-              polynomial_size == 1024 || polynomial_size == 2048 ||
-              polynomial_size == 4096 || polynomial_size == 8192));
-  // The number of inputs should be lower than the number of streaming
-  // multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
-  // to the occupancy of 50%). The only supported value for k is 1, so
-  // k + 1 = 2 for now.
-  int number_of_sm = 0;
-  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
-  assert(("Error (GPU WOP PBS): the number of input LWEs must be lower or "
-          "equal to the "
-          "number of streaming multiprocessors on the device divided by 8 * "
-          "level_count_bsk",
-          number_of_inputs <= number_of_sm / 4. / 2. / level_count_bsk));
+  checks_wop_pbs(polynomial_size, level_count_bsk, number_of_inputs);
   switch (polynomial_size) {
   case 256:
     host_wop_pbs>(
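Stepping back, every file in this patch applies the same two-tier shape: a `checks_fast_*` validator covering what the scratch/allocation entry points already know (sizes, level counts, batch size), and a full `checks_*` that additionally verifies `base_log` against the word size for the execution entry points. A distilled sketch of the pattern (names illustrative, not the real API):

```cpp
// Distilled form of the refactor: hoist the duplicated assert blocks into
// one parameterized checker per concern and call it from both bit-width
// entry points, so the 32- and 64-bit paths cannot drift apart.
#include <cassert>

static void checks_fast_example(int polynomial_size) {
  // Checks available at allocation (scratch) time.
  assert(("polynomial size should be a supported power of two",
          polynomial_size >= 256 && polynomial_size <= 8192 &&
          (polynomial_size & (polynomial_size - 1)) == 0));
}

static void checks_example(int nbits, int base_log, int polynomial_size) {
  // Checks that additionally need base_log, known only at execution time.
  assert(("base log should be <= nbits", base_log <= nbits));
  checks_fast_example(polynomial_size);
}

void entry_point_32(int base_log, int polynomial_size) {
  checks_example(32, base_log, polynomial_size);
  // ... dispatch on polynomial_size ...
}

void entry_point_64(int base_log, int polynomial_size) {
  checks_example(64, base_log, polynomial_size);
  // ... dispatch on polynomial_size ...
}
```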