diff --git a/backends/concrete-cuda/implementation/src/bit_extraction.cu b/backends/concrete-cuda/implementation/src/bit_extraction.cu
index 571c3e14c..5d057b088 100644
--- a/backends/concrete-cuda/implementation/src/bit_extraction.cu
+++ b/backends/concrete-cuda/implementation/src/bit_extraction.cu
@@ -1,5 +1,41 @@
 #include "bit_extraction.cuh"
 
+/*
+ * Runs standard checks to validate the inputs
+ */
+void checks_fast_extract_bits(int nbits, int polynomial_size,
+                              int level_count_bsk, int number_of_samples) {
+
+  assert(("Error (GPU extract bits): polynomial_size should be one of "
+          "256, 512, 1024, 2048, 4096, 8192",
+          polynomial_size == 256 || polynomial_size == 512 ||
+              polynomial_size == 1024 || polynomial_size == 2048 ||
+              polynomial_size == 4096 || polynomial_size == 8192));
+  // The number of samples should be lower than the number of streaming
+  // multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being
+  // related to the occupancy of 50%). The only supported value for k is 1, so
+  // k + 1 = 2 for now.
+  int number_of_sm = 0;
+  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
+  assert(("Error (GPU extract bits): the number of input LWEs must be lower or "
+          "equal to the "
+          "number of streaming multiprocessors on the device divided by 8 * "
+          "level_count_bsk",
+          number_of_samples <= number_of_sm / 4. / 2. / level_count_bsk));
+}
+
+/*
+ * Runs standard checks to validate the inputs
+ */
+void checks_extract_bits(int nbits, int polynomial_size, int base_log_bsk,
+                         int level_count_bsk, int number_of_samples) {
+
+  assert(("Error (GPU extract bits): base log should be <= nbits",
+          base_log_bsk <= nbits));
+  checks_fast_extract_bits(nbits, polynomial_size, level_count_bsk,
+                           number_of_samples);
+}
+
 /*
  * This scratch function allocates the necessary amount of data on the GPU for
  * the bit extraction on 32 bits inputs, into `cbs_buffer`. It also
@@ -11,6 +47,8 @@ void scratch_cuda_extract_bits_32(
     uint32_t level_count, uint32_t number_of_inputs, uint32_t max_shared_memory,
     bool allocate_gpu_memory) {
 
+  checks_fast_extract_bits(32, polynomial_size, level_count, number_of_inputs);
+
   switch (polynomial_size) {
   case 256:
     scratch_extract_bits>(
@@ -63,6 +101,7 @@ void scratch_cuda_extract_bits_64(
     uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
     uint32_t level_count, uint32_t number_of_inputs, uint32_t max_shared_memory,
     bool allocate_gpu_memory) {
+  checks_fast_extract_bits(64, polynomial_size, level_count, number_of_inputs);
 
   switch (polynomial_size) {
   case 256:
@@ -119,24 +158,8 @@ void cuda_extract_bits_32(void *v_stream, uint32_t gpu_index,
                           uint32_t level_count_bsk, uint32_t base_log_ksk,
                           uint32_t level_count_ksk, uint32_t number_of_samples,
                           uint32_t max_shared_memory) {
-  assert(("Error (GPU extract bits): base log should be <= 32",
-          base_log_bsk <= 32));
-  assert(("Error (GPU extract bits): polynomial_size should be one of "
-          "256, 512, 1024, 2048, 4096, 8192",
-          polynomial_size == 256 || polynomial_size == 512 ||
-              polynomial_size == 1024 || polynomial_size == 2048 ||
-              polynomial_size == 4096 || polynomial_size == 8192));
-  // The number of samples should be lower than four time the number of
-  // streaming multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being
-  // related to the occupancy of 50%). The only supported value for k is 1, so
-  // k + 1 = 2 for now.
-  int number_of_sm = 0;
-  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
-  assert(("Error (GPU extract bits): the number of input LWEs must be lower or "
-          "equal to the "
-          "number of streaming multiprocessors on the device divided by 8 * "
-          "level_count_bsk",
-          number_of_samples <= number_of_sm / 4. / 2. / level_count_bsk));
+  checks_extract_bits(32, polynomial_size, base_log_bsk, level_count_bsk,
+                      number_of_samples);
 
   switch (polynomial_size) {
   case 256:
@@ -253,24 +276,8 @@ void cuda_extract_bits_64(void *v_stream, uint32_t gpu_index,
                           uint32_t level_count_bsk, uint32_t base_log_ksk,
                           uint32_t level_count_ksk, uint32_t number_of_samples,
                           uint32_t max_shared_memory) {
-  assert(("Error (GPU extract bits): base log should be <= 64",
-          base_log_bsk <= 64));
-  assert(("Error (GPU extract bits): polynomial_size should be one of "
-          "256, 512, 1024, 2048, 4096, 8192",
-          polynomial_size == 256 || polynomial_size == 512 ||
-              polynomial_size == 1024 || polynomial_size == 2048 ||
-              polynomial_size == 4096 || polynomial_size == 8192));
-  // The number of samples should be lower than four time the number of
-  // streaming multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being
-  // related to the occupancy of 50%). The only supported value for k is 1, so
-  // k + 1 = 2 for now.
-  int number_of_sm = 0;
-  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
-  assert(("Error (GPU extract bits): the number of input LWEs must be lower or "
-          "equal to the "
-          "number of streaming multiprocessors on the device divided by 8 * "
-          "level_count_bsk",
-          number_of_samples <= number_of_sm / 4. / 2. / level_count_bsk));
+  checks_extract_bits(64, polynomial_size, base_log_bsk, level_count_bsk,
+                      number_of_samples);
 
   switch (polynomial_size) {
   case 256:
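All of these validators reuse the same `assert(("message", condition))` construction, which is worth spelling out once since it looks unusual. The inner parentheses make the whole thing a single macro argument, and the comma operator evaluates the string, discards it, and yields the condition, so only the condition is tested; the message still shows up in the diagnostic because `assert` prints the failing expression verbatim. A minimal standalone sketch (illustrative, not code from this patch):

```cpp
// Sketch of the assert-with-message idiom used by these checks. The comma
// operator discards the string literal and tests only the condition; on
// failure, the printed expression includes the message text.
#include <cassert>

int main() {
  int polynomial_size = 1024; // hypothetical input
  assert(("polynomial_size should be one of 256, 512, 1024, 2048, 4096, 8192",
          polynomial_size == 256 || polynomial_size == 512 ||
          polynomial_size == 1024 || polynomial_size == 2048 ||
          polynomial_size == 4096 || polynomial_size == 8192));
  return 0;
}
```

A stray pair of parentheses is all that separates this from `assert("message", condition)`, which would not compile as intended; centralizing the idiom in shared helpers, as this patch does, limits the places it can be mistyped.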
diff --git a/backends/concrete-cuda/implementation/src/bootstrap_amortized.cu b/backends/concrete-cuda/implementation/src/bootstrap_amortized.cu
index 4580b0ca8..c1a64bd7d 100644
--- a/backends/concrete-cuda/implementation/src/bootstrap_amortized.cu
+++ b/backends/concrete-cuda/implementation/src/bootstrap_amortized.cu
@@ -1,5 +1,26 @@
 #include "bootstrap_amortized.cuh"
 
+/*
+ * Runs standard checks to validate the inputs
+ */
+void checks_fast_bootstrap_amortized(int nbits, int polynomial_size) {
+  assert(
+      ("Error (GPU amortized PBS): polynomial size should be one of 256, 512, "
+       "1024, 2048, 4096, 8192",
+       polynomial_size == 256 || polynomial_size == 512 ||
+           polynomial_size == 1024 || polynomial_size == 2048 ||
+           polynomial_size == 4096 || polynomial_size == 8192));
+}
+
+/*
+ * Runs standard checks to validate the inputs
+ */
+void checks_bootstrap_amortized(int nbits, int base_log, int polynomial_size) {
+  assert(("Error (GPU amortized PBS): base log should be <= nbits",
+          base_log <= nbits));
+  checks_fast_bootstrap_amortized(nbits, polynomial_size);
+}
+
 /*
  * This scratch function allocates the necessary amount of data on the GPU for
  * the amortized PBS on 32 bits inputs, into `pbs_buffer`. It also
@@ -13,6 +34,7 @@ void scratch_cuda_bootstrap_amortized_32(void *v_stream, uint32_t gpu_index,
                                          uint32_t input_lwe_ciphertext_count,
                                          uint32_t max_shared_memory,
                                          bool allocate_gpu_memory) {
+  checks_fast_bootstrap_amortized(32, polynomial_size);
 
   switch (polynomial_size) {
   case 256:
@@ -63,6 +85,7 @@ void scratch_cuda_bootstrap_amortized_64(void *v_stream, uint32_t gpu_index,
                                          uint32_t input_lwe_ciphertext_count,
                                          uint32_t max_shared_memory,
                                          bool allocate_gpu_memory) {
+  checks_fast_bootstrap_amortized(64, polynomial_size);
 
   switch (polynomial_size) {
   case 256:
@@ -111,14 +134,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
     uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
     uint32_t max_shared_memory) {
 
-  assert(
-      ("Error (GPU amortized PBS): base log should be <= 32", base_log <= 32));
-  assert(
-      ("Error (GPU amortized PBS): polynomial size should be one of 256, 512, "
-       "1024, 2048, 4096, 8192",
-       polynomial_size == 256 || polynomial_size == 512 ||
-           polynomial_size == 1024 || polynomial_size == 2048 ||
-           polynomial_size == 4096 || polynomial_size == 8192));
+  checks_bootstrap_amortized(32, base_log, polynomial_size);
 
   switch (polynomial_size) {
   case 256:
@@ -247,14 +263,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
     uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
     uint32_t max_shared_memory) {
 
-  assert(
-      ("Error (GPU amortized PBS): base log should be <= 64", base_log <= 64));
-  assert(
-      ("Error (GPU amortized PBS): polynomial size should be one of 256, 512, "
-       "1024, 2048, 4096, 8192",
-       polynomial_size == 256 || polynomial_size == 512 ||
-           polynomial_size == 1024 || polynomial_size == 2048 ||
-           polynomial_size == 4096 || polynomial_size == 8192));
+  checks_bootstrap_amortized(64, base_log, polynomial_size);
 
   switch (polynomial_size) {
   case 256:
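Now that the six-way size whitelist lives in one helper per file, it may be worth noting that the accepted values are exactly the powers of two from 256 to 8192, so the disjunction could be collapsed into a range-plus-power-of-two test. A sketch of that alternative (a possible future cleanup, not something this patch does):

```cpp
// Compact equivalent of the repeated six-way polynomial-size check:
// the supported sizes are exactly the powers of two in [256, 8192].
#include <cassert>
#include <cstdint>

bool is_supported_polynomial_size(uint32_t n) {
  // A power of two has a single set bit, i.e. (n & (n - 1)) == 0 for n != 0.
  return n >= 256 && n <= 8192 && (n & (n - 1)) == 0;
}

int main() {
  assert(is_supported_polynomial_size(2048));
  assert(!is_supported_polynomial_size(1000));
  return 0;
}
```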
diff --git a/backends/concrete-cuda/implementation/src/bootstrap_low_latency.cu b/backends/concrete-cuda/implementation/src/bootstrap_low_latency.cu
index 94f906bd8..31ae804e0 100644
--- a/backends/concrete-cuda/implementation/src/bootstrap_low_latency.cu
+++ b/backends/concrete-cuda/implementation/src/bootstrap_low_latency.cu
@@ -1,5 +1,41 @@
 #include "bootstrap_low_latency.cuh"
 
+/*
+ * Runs standard checks to validate the inputs
+ */
+void checks_fast_bootstrap_low_latency(int nbits, int level_count,
+                                       int polynomial_size, int num_samples) {
+
+  assert((
+      "Error (GPU low latency PBS): polynomial size should be one of 256, 512, "
+      "1024, 2048, 4096, 8192",
+      polynomial_size == 256 || polynomial_size == 512 ||
+          polynomial_size == 1024 || polynomial_size == 2048 ||
+          polynomial_size == 4096 || polynomial_size == 8192));
+  // The number of samples should be lower than 4 * SM/((k + 1) * l) (the
+  // factor 4 being related to the occupancy of 50%). The only supported
+  // value for k is 1, so k + 1 = 2 for now.
+  int number_of_sm = 0;
+  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
+  assert(("Error (GPU low latency PBS): the number of input LWEs must be lower "
+          "or equal to "
+          "4 * the number of streaming multiprocessors on the device divided "
+          "by (2 * level_count)",
+          num_samples <= number_of_sm * 4. / 2. / level_count));
+}
+
+/*
+ * Runs standard checks to validate the inputs
+ */
+void checks_bootstrap_low_latency(int nbits, int level_count, int base_log,
+                                  int polynomial_size, int num_samples) {
+
+  assert(("Error (GPU low latency PBS): base log should be <= nbits",
+          base_log <= nbits));
+  checks_fast_bootstrap_low_latency(nbits, level_count, polynomial_size,
+                                    num_samples);
+}
+
 /*
  * This scratch function allocates the necessary amount of data on the GPU for
  * the low latency PBS on 32 bits inputs, into `pbs_buffer`. It also
@@ -11,6 +47,8 @@ void scratch_cuda_bootstrap_low_latency_32(
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
     uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
     bool allocate_gpu_memory) {
+  checks_fast_bootstrap_low_latency(32, level_count, polynomial_size,
+                                    input_lwe_ciphertext_count);
 
   switch (polynomial_size) {
   case 256:
@@ -65,6 +103,8 @@ void scratch_cuda_bootstrap_low_latency_64(
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
     uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
     bool allocate_gpu_memory) {
+  checks_fast_bootstrap_low_latency(64, level_count, polynomial_size,
+                                    input_lwe_ciphertext_count);
 
   switch (polynomial_size) {
   case 256:
@@ -123,24 +163,8 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
     uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
     uint32_t max_shared_memory) {
 
-  assert(("Error (GPU low latency PBS): base log should be <= 32",
-          base_log <= 32));
-  assert((
-      "Error (GPU low latency PBS): polynomial size should be one of 256, 512, "
-      "1024, 2048, 4096, 8192",
-      polynomial_size == 256 || polynomial_size == 512 ||
-          polynomial_size == 1024 || polynomial_size == 2048 ||
-          polynomial_size == 4096 || polynomial_size == 8192));
-  // The number of samples should be lower than 4 * SM/((k + 1) * l) (the
-  // factor 4 being related to the occupancy of 50%). The only supported
-  // value for k is 1, so k + 1 = 2 for now.
-  int number_of_sm = 0;
-  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
-  assert(("Error (GPU low latency PBS): the number of input LWEs must be lower "
-          "or equal to the "
-          "number of streaming multiprocessors on the device divided by 8 * "
-          "level_count",
-          num_samples <= number_of_sm * 4. / 2. / level_count));
+  checks_bootstrap_low_latency(32, level_count, base_log, polynomial_size,
+                               num_samples);
 
   switch (polynomial_size) {
   case 256:
@@ -280,24 +304,8 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
     uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
     uint32_t max_shared_memory) {
 
-  assert(("Error (GPU low latency PBS): base log should be <= 64",
-          base_log <= 64));
-  assert((
-      "Error (GPU low latency PBS): polynomial size should be one of 256, 512, "
-      "1024, 2048, 4096, 8192",
-      polynomial_size == 256 || polynomial_size == 512 ||
-          polynomial_size == 1024 || polynomial_size == 2048 ||
-          polynomial_size == 4096 || polynomial_size == 8192));
-  // The number of samples should be lower than 4 * SM/((k + 1) * l) (the
-  // factor 4 being related to the occupancy of 50%). The only supported
-  // value for k is 1, so k + 1 = 2 for now.
-  int number_of_sm = 0;
-  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
-  assert(("Error (GPU low latency PBS): the number of input LWEs must be lower "
-          "or equal to the "
-          "number of streaming multiprocessors on the device divided by 8 * "
-          "level_count",
-          num_samples <= number_of_sm * 4. / 2. / level_count));
+  checks_bootstrap_low_latency(64, level_count, base_log, polynomial_size,
+                               num_samples);
 
   switch (polynomial_size) {
   case 256:
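The low latency path is the one place where the limit multiplies rather than divides by 4: per the comment, with 50% occupancy 4 blocks fit per SM, and each sample occupies (k + 1) * level_count = 2 * level_count blocks, so the ceiling is 4 * SM / (2 * level_count). Written out as a host-side helper (my reading of the check; the function name is illustrative, not part of the API):

```cpp
// Sketch of the occupancy-derived sample bound the low-latency check
// enforces: 4 resident blocks per SM at 50% occupancy, and
// (k + 1) * level_count = 2 * level_count blocks per sample with k == 1.
#include <cstdio>
#include <cuda_runtime.h>

int max_low_latency_samples(int level_count, int device = 0) {
  int number_of_sm = 0;
  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, device);
  return number_of_sm * 4 / (2 * level_count);
}

int main() {
  printf("bound for level_count = 2: %d samples\n",
         max_low_latency_samples(2));
  return 0;
}
```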
diff --git a/backends/concrete-cuda/implementation/src/circuit_bootstrap.cu b/backends/concrete-cuda/implementation/src/circuit_bootstrap.cu
index 55236f84c..e4716db0d 100644
--- a/backends/concrete-cuda/implementation/src/circuit_bootstrap.cu
+++ b/backends/concrete-cuda/implementation/src/circuit_bootstrap.cu
@@ -1,6 +1,38 @@
 #include "circuit_bootstrap.cuh"
 #include "circuit_bootstrap.h"
 
+/*
+ * Runs standard checks to validate the inputs
+ */
+void checks_fast_circuit_bootstrap(int polynomial_size, int number_of_inputs) {
+
+  assert(("Error (GPU circuit bootstrap): polynomial_size should be one of "
+          "256, 512, 1024, 2048, 4096, 8192",
+          polynomial_size == 256 || polynomial_size == 512 ||
+              polynomial_size == 1024 || polynomial_size == 2048 ||
+              polynomial_size == 4096 || polynomial_size == 8192));
+}
+
+/*
+ * Runs standard checks to validate the inputs
+ */
+void checks_circuit_bootstrap(int polynomial_size, int level_bsk,
+                              int number_of_inputs) {
+  // The number of samples should be lower than the number of streaming
+  // multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
+  // to the occupancy of 50%). The only supported value for k is 1, so
+  // k + 1 = 2 for now.
+  int number_of_sm = 0;
+  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
+  assert(("Error (GPU circuit bootstrap): the number of input LWEs must be "
+          "lower or equal to the "
+          "number of streaming multiprocessors on the device divided by 8 * "
+          "level_bsk",
+          number_of_inputs <= number_of_sm / 4. / 2. / level_bsk));
+
+  checks_fast_circuit_bootstrap(polynomial_size, number_of_inputs);
+}
+
 /*
  * This scratch function allocates the necessary amount of data on the GPU for
  * the circuit bootstrap on 32 bits inputs, into `cbs_buffer`. It also
@@ -12,6 +44,8 @@ void scratch_cuda_circuit_bootstrap_32(
     uint32_t level_count_cbs, uint32_t number_of_inputs,
     uint32_t max_shared_memory, bool allocate_gpu_memory) {
 
+  checks_fast_circuit_bootstrap(polynomial_size, number_of_inputs);
+
   switch (polynomial_size) {
   case 256:
     scratch_circuit_bootstrap>(
@@ -65,6 +99,8 @@ void scratch_cuda_circuit_bootstrap_64(
     uint32_t level_count_cbs, uint32_t number_of_inputs,
     uint32_t max_shared_memory, bool allocate_gpu_memory) {
 
+  checks_fast_circuit_bootstrap(polynomial_size, number_of_inputs);
+
   switch (polynomial_size) {
   case 256:
     scratch_circuit_bootstrap>(
@@ -119,22 +155,9 @@ void cuda_circuit_bootstrap_32(
     uint32_t base_log_bsk, uint32_t level_pksk, uint32_t base_log_pksk,
     uint32_t level_cbs, uint32_t base_log_cbs, uint32_t number_of_inputs,
     uint32_t max_shared_memory) {
-  assert(("Error (GPU circuit bootstrap): polynomial_size should be one of "
-          "256, 512, 1024, 2048, 4096, 8192",
-          polynomial_size == 256 || polynomial_size == 512 ||
-              polynomial_size == 1024 || polynomial_size == 2048 ||
-              polynomial_size == 4096 || polynomial_size == 8192));
-  // The number of samples should be lower than the number of streaming
-  // multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
-  // to the occupancy of 50%). The only supported value for k is 1, so
-  // k + 1 = 2 for now.
-  int number_of_sm = 0;
-  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
-  assert(("Error (GPU extract bits): the number of input LWEs must be lower or "
-          "equal to the "
-          "number of streaming multiprocessors on the device divided by 8 * "
-          "level_count_bsk",
-          number_of_inputs <= number_of_sm / 4. / 2. / level_bsk));
+
+  checks_circuit_bootstrap(polynomial_size, level_bsk, number_of_inputs);
+
   switch (polynomial_size) {
   case 256:
     host_circuit_bootstrap>(
@@ -228,23 +251,9 @@ void cuda_circuit_bootstrap_64(
     uint32_t base_log_bsk, uint32_t level_pksk, uint32_t base_log_pksk,
     uint32_t level_cbs, uint32_t base_log_cbs, uint32_t number_of_inputs,
     uint32_t max_shared_memory) {
-  assert(("Error (GPU circuit bootstrap): polynomial_size should be one of "
-          "256, 512, 1024, 2048, 4096, 8192",
-          polynomial_size == 256 || polynomial_size == 512 ||
-              polynomial_size == 1024 || polynomial_size == 2048 ||
-              polynomial_size == 4096 || polynomial_size == 8192));
-  // The number of samples should be lower than the number of streaming
-  // multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
-  // to the occupancy of 50%). The only supported value for k is 1, so
-  // k + 1 = 2 for now.
-  int number_of_sm = 0;
-  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
-  assert(("Error (GPU extract bits): the number of input LWEs must be lower or "
-          "equal to the "
-          "number of streaming multiprocessors on the device divided by 8 * "
-          "level_count_bsk",
-          number_of_inputs <= number_of_sm / 4. / 2. / level_bsk));
-  // The number of samples should be lower than the number of streaming
+
+  checks_circuit_bootstrap(polynomial_size, level_bsk, number_of_inputs);
+
   switch (polynomial_size) {
   case 256:
     host_circuit_bootstrap>(
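One caveat worth keeping in mind for all of these helpers: they rely on `assert`, so they disappear entirely from builds compiled with `-DNDEBUG`, and invalid sizes then reach the kernels unchecked. A small demonstration (standard C++ behavior, nothing specific to this patch):

```cpp
// Without NDEBUG this aborts at the assert with the message in the output;
// with -DNDEBUG the assert expands to nothing and "reached" is printed.
#include <cassert>
#include <cstdio>

int main() {
  int polynomial_size = 1000; // not a supported size
  assert(("polynomial_size should be a supported power of two",
          polynomial_size == 256 || polynomial_size == 512 ||
          polynomial_size == 1024 || polynomial_size == 2048 ||
          polynomial_size == 4096 || polynomial_size == 8192));
  printf("reached: the check was compiled out\n");
  return 0;
}
```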
diff --git a/backends/concrete-cuda/implementation/src/vertical_packing.cu b/backends/concrete-cuda/implementation/src/vertical_packing.cu
index eb6a437dd..cb7adfbe5 100644
--- a/backends/concrete-cuda/implementation/src/vertical_packing.cu
+++ b/backends/concrete-cuda/implementation/src/vertical_packing.cu
@@ -2,6 +2,46 @@
 #include "vertical_packing.h"
 #include 
 
+/*
+ * Runs standard checks to validate the inputs
+ */
+void checks_fast_cmux_tree(int nbits, int polynomial_size, int r) {
+  assert((
+      "Error (GPU Cmux tree): polynomial size should be one of 256, 512, 1024, "
+      "2048, 4096, 8192",
+      polynomial_size == 256 || polynomial_size == 512 ||
+          polynomial_size == 1024 || polynomial_size == 2048 ||
+          polynomial_size == 4096 || polynomial_size == 8192));
+  // For larger k we will need to adjust the mask size
+  assert(("Error (GPU Cmux tree): r, the number of layers in the tree, should "
+          "be >= 1 ",
+          r >= 1));
+}
+
+/*
+ * Runs standard checks to validate the inputs
+ */
+void checks_cmux_tree(int nbits, int polynomial_size, int base_log, int r) {
+
+  assert(("Error (GPU Cmux tree): base log should be <= nbits",
+          base_log <= nbits));
+  checks_fast_cmux_tree(nbits, polynomial_size, r);
+}
+
+/*
+ * Runs standard checks to validate the inputs
+ */
+void checks_blind_rotation_and_sample_extraction(int polynomial_size) {
+
+  assert(("Error (GPU Blind rotation + sample extraction): polynomial size "
+          "should be one of 256, 512, "
+          "1024, "
+          "2048, 4096, 8192",
+          polynomial_size == 256 || polynomial_size == 512 ||
+              polynomial_size == 1024 || polynomial_size == 2048 ||
+              polynomial_size == 4096 || polynomial_size == 8192));
+}
+
 /*
  * This scratch function allocates the necessary amount of data on the GPU for
  * the Cmux tree on 32 bits inputs, into `cmux_tree_buffer`. It also configures
@@ -14,6 +54,7 @@ void scratch_cuda_cmux_tree_32(void *v_stream, uint32_t gpu_index,
                                uint32_t r, uint32_t tau,
                                uint32_t max_shared_memory,
                                bool allocate_gpu_memory) {
+  checks_fast_cmux_tree(32, polynomial_size, r);
 
   switch (polynomial_size) {
   case 256:
@@ -63,6 +104,8 @@ void scratch_cuda_cmux_tree_64(void *v_stream, uint32_t gpu_index,
                                uint32_t r, uint32_t tau,
                                uint32_t max_shared_memory,
                                bool allocate_gpu_memory) {
+  checks_fast_cmux_tree(64, polynomial_size, r);
+
   switch (polynomial_size) {
   case 256:
     scratch_cmux_tree>(
@@ -110,17 +153,7 @@ void cuda_cmux_tree_32(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
                        uint32_t level_count, uint32_t r, uint32_t tau,
                        uint32_t max_shared_memory) {
 
-  assert(("Error (GPU Cmux tree): base log should be <= 32", base_log <= 32));
-  assert((
-      "Error (GPU Cmux tree): polynomial size should be one of 256, 512, 1024, "
-      "2048, 4096, 8192",
-      polynomial_size == 256 || polynomial_size == 512 ||
-          polynomial_size == 1024 || polynomial_size == 2048 ||
-          polynomial_size == 4096 || polynomial_size == 8192));
-  // For larger k we will need to adjust the mask size
-  assert(("Error (GPU Cmux tree): r, the number of layers in the tree, should "
-          "be >= 1 ",
-          r >= 1));
+  checks_cmux_tree(32, polynomial_size, base_log, r);
 
   switch (polynomial_size) {
   case 256:
@@ -197,18 +230,7 @@ void cuda_cmux_tree_64(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
                        uint32_t polynomial_size, uint32_t base_log,
                        uint32_t level_count, uint32_t r, uint32_t tau,
                        uint32_t max_shared_memory) {
-
-  assert(("Error (GPU Cmux tree): base log should be <= 64", base_log <= 64));
-  assert((
-      "Error (GPU Cmux tree): polynomial size should be one of 256, 512, 1024, "
-      "2048, 4096, 8192",
-      polynomial_size == 256 || polynomial_size == 512 ||
-          polynomial_size == 1024 || polynomial_size == 2048 ||
-          polynomial_size == 4096 || polynomial_size == 8192));
-  // For larger k we will need to adjust the mask size
-  assert(("Error (GPU Cmux tree): r, the number of layers in the tree, should "
-          "be >= 1 ",
-          r >= 1));
+  checks_cmux_tree(64, polynomial_size, base_log, r);
 
   switch (polynomial_size) {
   case 256:
@@ -273,6 +295,7 @@ void scratch_cuda_blind_rotation_sample_extraction_32(
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
     uint32_t mbr_size, uint32_t tau, uint32_t max_shared_memory,
     bool allocate_gpu_memory) {
+  checks_blind_rotation_and_sample_extraction(polynomial_size);
 
   switch (polynomial_size) {
   case 256:
@@ -320,6 +343,7 @@ void scratch_cuda_blind_rotation_sample_extraction_64(
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
     uint32_t mbr_size, uint32_t tau, uint32_t max_shared_memory,
     bool allocate_gpu_memory) {
+  checks_blind_rotation_and_sample_extraction(polynomial_size);
 
   switch (polynomial_size) {
   case 256:
@@ -386,6 +410,7 @@ void cuda_blind_rotate_and_sample_extraction_64(
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
     uint32_t l_gadget, uint32_t max_shared_memory) {
 
+  checks_blind_rotation_and_sample_extraction(polynomial_size);
   switch (polynomial_size) {
   case 256:
     host_blind_rotate_and_sample_extraction>(
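Because the validators are ordinary host functions, a caller can also run them up front, before any device allocation, and get an immediate abort with a readable message rather than a delayed kernel failure. A hypothetical caller-side sketch (parameter values invented; linking against the concrete-cuda objects is assumed):

```cpp
// Usage sketch: validate Cmux tree parameters before touching the device.
// The declaration matches the helper added in this patch.
void checks_cmux_tree(int nbits, int polynomial_size, int base_log, int r);

int main() {
  // Passes silently for valid parameters; aborts with the assert message
  // otherwise (in debug builds).
  checks_cmux_tree(/*nbits=*/64, /*polynomial_size=*/2048,
                   /*base_log=*/10, /*r=*/3);
  return 0;
}
```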
diff --git a/backends/concrete-cuda/implementation/src/wop_bootstrap.cu b/backends/concrete-cuda/implementation/src/wop_bootstrap.cu
index e21b3945c..15a740d0c 100644
--- a/backends/concrete-cuda/implementation/src/wop_bootstrap.cu
+++ b/backends/concrete-cuda/implementation/src/wop_bootstrap.cu
@@ -1,5 +1,53 @@
 #include "wop_bootstrap.cuh"
 
+/*
+ * Runs standard checks to validate the inputs
+ */
+void checks_wop_pbs(int polynomial_size, int level_count_bsk,
+                    int number_of_inputs) {
+  assert(("Error (GPU WOP PBS): polynomial_size should be one of "
+          "256, 512, 1024, 2048, 4096, 8192",
+          polynomial_size == 256 || polynomial_size == 512 ||
+              polynomial_size == 1024 || polynomial_size == 2048 ||
+              polynomial_size == 4096 || polynomial_size == 8192));
+  // The number of inputs should be lower than the number of streaming
+  // multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
+  // to the occupancy of 50%). The only supported value for k is 1, so
+  // k + 1 = 2 for now.
+  int number_of_sm = 0;
+  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
+  assert(("Error (GPU WOP PBS): the number of input LWEs must be lower or "
+          "equal to the "
+          "number of streaming multiprocessors on the device divided by 8 * "
+          "level_count_bsk",
+          number_of_inputs <= number_of_sm / 4. / 2. / level_count_bsk));
+}
+
+void checks_fast_circuit_bootstrap_vertical_packing(int polynomial_size) {
+  assert(("Error (GPU circuit bootstrap): polynomial_size should be one of "
+          "256, 512, 1024, 2048, 4096, 8192",
+          polynomial_size == 256 || polynomial_size == 512 ||
+              polynomial_size == 1024 || polynomial_size == 2048 ||
+              polynomial_size == 4096 || polynomial_size == 8192));
+}
+
+void checks_circuit_bootstrap_vertical_packing(int polynomial_size,
+                                               int number_of_inputs,
+                                               int level_count_bsk) {
+  // The number of inputs should be lower than the number of streaming
+  // multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
+  // to the occupancy of 50%). The only supported value for k is 1, so
+  // k + 1 = 2 for now.
+  int number_of_sm = 0;
+  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
+  assert(("Error (GPU circuit bootstrap): the number of input LWEs must be "
+          "lower or equal to the "
+          "number of streaming multiprocessors on the device divided by 8 * "
+          "level_count_bsk",
+          number_of_inputs <= number_of_sm / 4. / 2. / level_count_bsk));
+  checks_fast_circuit_bootstrap_vertical_packing(polynomial_size);
+}
+
 /*
  * This scratch function allocates the necessary amount of data on the GPU for
  * the circuit bootstrap and vertical packing on 32 bits inputs, into
@@ -13,6 +61,8 @@ void scratch_cuda_circuit_bootstrap_vertical_packing_32(
     uint32_t number_of_inputs, uint32_t tau, uint32_t max_shared_memory,
     bool allocate_gpu_memory) {
 
+  checks_fast_circuit_bootstrap_vertical_packing(polynomial_size);
+
   switch (polynomial_size) {
   case 256:
     scratch_circuit_bootstrap_vertical_packing>(
@@ -68,6 +118,8 @@ void scratch_cuda_circuit_bootstrap_vertical_packing_64(
     uint32_t number_of_inputs, uint32_t tau, uint32_t max_shared_memory,
     bool allocate_gpu_memory) {
 
+  checks_fast_circuit_bootstrap_vertical_packing(polynomial_size);
+
   switch (polynomial_size) {
   case 256:
     scratch_circuit_bootstrap_vertical_packing>(
@@ -124,6 +176,7 @@ void scratch_cuda_wop_pbs_32(
     uint32_t number_of_bits_of_message_including_padding,
     uint32_t number_of_bits_to_extract, uint32_t number_of_inputs,
     uint32_t max_shared_memory, bool allocate_gpu_memory) {
+  checks_wop_pbs(polynomial_size, level_count_bsk, number_of_inputs);
   switch (polynomial_size) {
   case 256:
     scratch_wop_pbs>(
@@ -192,6 +245,7 @@ void scratch_cuda_wop_pbs_64(
     uint32_t number_of_bits_of_message_including_padding,
     uint32_t number_of_bits_to_extract, uint32_t number_of_inputs,
     uint32_t max_shared_memory, bool allocate_gpu_memory) {
+  checks_wop_pbs(polynomial_size, level_count_bsk, number_of_inputs);
   switch (polynomial_size) {
   case 256:
     scratch_wop_pbs>(
@@ -282,22 +336,10 @@ void cuda_circuit_bootstrap_vertical_packing_64(
     uint32_t level_count_pksk, uint32_t base_log_pksk, uint32_t level_count_cbs,
     uint32_t base_log_cbs, uint32_t number_of_inputs, uint32_t lut_number,
     uint32_t max_shared_memory) {
-  assert(("Error (GPU circuit bootstrap): polynomial_size should be one of "
-          "256, 512, 1024, 2048, 4096, 8192",
-          polynomial_size == 256 || polynomial_size == 512 ||
-              polynomial_size == 1024 || polynomial_size == 2048 ||
-              polynomial_size == 4096 || polynomial_size == 8192));
-  // The number of inputs should be lower than the number of streaming
-  // multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
-  // to the occupancy of 50%). The only supported value for k is 1, so
-  // k + 1 = 2 for now.
-  int number_of_sm = 0;
-  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
-  assert(("Error (GPU extract bits): the number of input LWEs must be lower or "
-          "equal to the "
-          "number of streaming multiprocessors on the device divided by 8 * "
-          "level_count_bsk",
-          number_of_inputs <= number_of_sm / 4. / 2. / level_count_bsk));
+
+  checks_circuit_bootstrap_vertical_packing(polynomial_size, number_of_inputs,
+                                            level_count_bsk);
+
   switch (polynomial_size) {
   case 256:
     host_circuit_bootstrap_vertical_packing>(
@@ -411,22 +453,7 @@ void cuda_wop_pbs_64(void *v_stream, uint32_t gpu_index, void *lwe_array_out,
                      uint32_t number_of_bits_of_message_including_padding,
                      uint32_t number_of_bits_to_extract, uint32_t delta_log,
                      uint32_t number_of_inputs, uint32_t max_shared_memory) {
-  assert(("Error (GPU WOP PBS): polynomial_size should be one of "
-          "256, 512, 1024, 2048, 4096, 8192",
-          polynomial_size == 256 || polynomial_size == 512 ||
-              polynomial_size == 1024 || polynomial_size == 2048 ||
-              polynomial_size == 4096 || polynomial_size == 8192));
-  // The number of inputs should be lower than the number of streaming
-  // multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
-  // to the occupancy of 50%). The only supported value for k is 1, so
-  // k + 1 = 2 for now.
-  int number_of_sm = 0;
-  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
-  assert(("Error (GPU WOP PBS): the number of input LWEs must be lower or "
-          "equal to the "
-          "number of streaming multiprocessors on the device divided by 8 * "
-          "level_count_bsk",
-          number_of_inputs <= number_of_sm / 4. / 2. / level_count_bsk));
+  checks_wop_pbs(polynomial_size, level_count_bsk, number_of_inputs);
   switch (polynomial_size) {
   case 256:
     host_wop_pbs>(
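Stepping back, every file in this patch applies the same two-tier shape: a `checks_fast_*` validator covering what the scratch/allocation entry points already know (sizes, level counts, batch size), and a full `checks_*` that additionally verifies `base_log` against the word size for the execution entry points. A distilled sketch of the pattern (names illustrative, not the real API):

```cpp
// Distilled form of the refactor: hoist the duplicated assert blocks into
// one parameterized checker per concern and call it from both bit-width
// entry points, so the 32- and 64-bit paths cannot drift apart.
#include <cassert>

static void checks_fast_example(int polynomial_size) {
  // Checks available at allocation (scratch) time.
  assert(("polynomial size should be a supported power of two",
          polynomial_size >= 256 && polynomial_size <= 8192 &&
          (polynomial_size & (polynomial_size - 1)) == 0));
}

static void checks_example(int nbits, int base_log, int polynomial_size) {
  // Checks that additionally need base_log, known only at execution time.
  assert(("base log should be <= nbits", base_log <= nbits));
  checks_fast_example(polynomial_size);
}

void entry_point_32(int base_log, int polynomial_size) {
  checks_example(32, base_log, polynomial_size);
  // ... dispatch on polynomial_size ...
}

void entry_point_64(int base_log, int polynomial_size) {
  checks_example(64, base_log, polynomial_size);
  // ... dispatch on polynomial_size ...
}
```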