From e0c52515bf7ba581c9bc05d7624870cf31e8d3c5 Mon Sep 17 00:00:00 2001 From: Agnes Leroy Date: Thu, 16 Mar 2023 10:14:12 +0100 Subject: [PATCH] fix(concrete_cuda): make sure r > 0 in the wop PBS Sometimes the optimizer provides inconsistent parameter sets for the wop PBS depending on how many inputs & bits to extract we're going to need for a certain precision. This results in r < 0 and until now the corresponding error in concrete-cuda was very hard to understand. This commit fixes this behavior. Also, when introducing the support for k > 1 we forgot to update the checks on the low lat PBS. This commit also fixes them. --- .../implementation/src/bit_extraction.cu | 33 +++++++++------- .../implementation/src/bootstrap_amortized.cu | 8 ++-- .../src/bootstrap_low_latency.cu | 36 +++++++++-------- .../implementation/src/circuit_bootstrap.cu | 25 ++++++------ .../implementation/src/vertical_packing.cu | 14 +++---- .../implementation/src/wop_bootstrap.cu | 39 +++++++++++++------ 6 files changed, 88 insertions(+), 67 deletions(-) diff --git a/backends/concrete-cuda/implementation/src/bit_extraction.cu b/backends/concrete-cuda/implementation/src/bit_extraction.cu index 5d057b088..e97a324d9 100644 --- a/backends/concrete-cuda/implementation/src/bit_extraction.cu +++ b/backends/concrete-cuda/implementation/src/bit_extraction.cu @@ -3,7 +3,7 @@ /* * Runs standard checks to validate the inputs */ -void checks_fast_extract_bits(int nbits, int polynomial_size, +void checks_fast_extract_bits(int glwe_dimension, int polynomial_size, int level_count_bsk, int number_of_samples) { assert(("Error (GPU extract bits): polynomial_size should be one of " @@ -13,26 +13,27 @@ void checks_fast_extract_bits(int nbits, int polynomial_size, polynomial_size == 4096 || polynomial_size == 8192)); // The number of samples should be lower than four time the number of // streaming multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being - // related to the occupancy of 50%). The only supported value for k is 1, so - // k + 1 = 2 for now. + // related to the occupancy of 50%). int number_of_sm = 0; cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0); assert(("Error (GPU extract bits): the number of input LWEs must be lower or " - "equal to the " - "number of streaming multiprocessors on the device divided by 8 * " + "equal to the number of streaming multiprocessors on the device " + "divided by 4 * (k + 1) " "level_count_bsk", - number_of_samples <= number_of_sm / 4. / 2. / level_count_bsk)); + number_of_samples <= + number_of_sm / 4. / (glwe_dimension + 1) / level_count_bsk)); } /* * Runs standard checks to validate the inputs */ -void checks_extract_bits(int nbits, int polynomial_size, int base_log_bsk, - int level_count_bsk, int number_of_samples) { +void checks_extract_bits(int nbits, int glwe_dimension, int polynomial_size, + int base_log_bsk, int level_count_bsk, + int number_of_samples) { assert(("Error (GPU extract bits): base log should be <= nbits", base_log_bsk <= nbits)); - checks_fast_extract_bits(nbits, polynomial_size, level_count_bsk, + checks_fast_extract_bits(glwe_dimension, polynomial_size, level_count_bsk, number_of_samples); } @@ -47,7 +48,8 @@ void scratch_cuda_extract_bits_32( uint32_t level_count, uint32_t number_of_inputs, uint32_t max_shared_memory, bool allocate_gpu_memory) { - checks_fast_extract_bits(32, polynomial_size, level_count, number_of_inputs); + checks_fast_extract_bits(glwe_dimension, polynomial_size, level_count, + number_of_inputs); switch (polynomial_size) { case 256: @@ -101,7 +103,8 @@ void scratch_cuda_extract_bits_64( uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t number_of_inputs, uint32_t max_shared_memory, bool allocate_gpu_memory) { - checks_fast_extract_bits(64, polynomial_size, level_count, number_of_inputs); + checks_fast_extract_bits(glwe_dimension, polynomial_size, level_count, + number_of_inputs); switch (polynomial_size) { case 256: @@ -158,8 +161,8 @@ void cuda_extract_bits_32(void *v_stream, uint32_t gpu_index, uint32_t level_count_bsk, uint32_t base_log_ksk, uint32_t level_count_ksk, uint32_t number_of_samples, uint32_t max_shared_memory) { - checks_extract_bits(32, polynomial_size, base_log_bsk, level_count_bsk, - number_of_samples); + checks_extract_bits(32, glwe_dimension, polynomial_size, base_log_bsk, + level_count_bsk, number_of_samples); switch (polynomial_size) { case 256: @@ -276,8 +279,8 @@ void cuda_extract_bits_64(void *v_stream, uint32_t gpu_index, uint32_t level_count_bsk, uint32_t base_log_ksk, uint32_t level_count_ksk, uint32_t number_of_samples, uint32_t max_shared_memory) { - checks_extract_bits(64, polynomial_size, base_log_bsk, level_count_bsk, - number_of_samples); + checks_extract_bits(64, glwe_dimension, polynomial_size, base_log_bsk, + level_count_bsk, number_of_samples); switch (polynomial_size) { case 256: diff --git a/backends/concrete-cuda/implementation/src/bootstrap_amortized.cu b/backends/concrete-cuda/implementation/src/bootstrap_amortized.cu index c1a64bd7d..d0393de56 100644 --- a/backends/concrete-cuda/implementation/src/bootstrap_amortized.cu +++ b/backends/concrete-cuda/implementation/src/bootstrap_amortized.cu @@ -3,7 +3,7 @@ /* * Runs standard checks to validate the inputs */ -void checks_fast_bootstrap_amortized(int nbits, int polynomial_size) { +void checks_fast_bootstrap_amortized(int polynomial_size) { assert( ("Error (GPU amortized PBS): polynomial size should be one of 256, 512, " "1024, 2048, 4096, 8192", @@ -18,7 +18,7 @@ void checks_fast_bootstrap_amortized(int nbits, int polynomial_size) { void checks_bootstrap_amortized(int nbits, int base_log, int polynomial_size) { assert(("Error (GPU amortized PBS): base log should be <= nbits", base_log <= nbits)); - checks_fast_bootstrap_amortized(nbits, polynomial_size); + checks_fast_bootstrap_amortized(polynomial_size); } /* @@ -34,7 +34,7 @@ void scratch_cuda_bootstrap_amortized_32(void *v_stream, uint32_t gpu_index, uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, bool allocate_gpu_memory) { - checks_fast_bootstrap_amortized(32, polynomial_size); + checks_fast_bootstrap_amortized(polynomial_size); switch (polynomial_size) { case 256: @@ -85,7 +85,7 @@ void scratch_cuda_bootstrap_amortized_64(void *v_stream, uint32_t gpu_index, uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, bool allocate_gpu_memory) { - checks_fast_bootstrap_amortized(64, polynomial_size); + checks_fast_bootstrap_amortized(polynomial_size); switch (polynomial_size) { case 256: diff --git a/backends/concrete-cuda/implementation/src/bootstrap_low_latency.cu b/backends/concrete-cuda/implementation/src/bootstrap_low_latency.cu index 31ae804e0..2da5378af 100644 --- a/backends/concrete-cuda/implementation/src/bootstrap_low_latency.cu +++ b/backends/concrete-cuda/implementation/src/bootstrap_low_latency.cu @@ -3,7 +3,7 @@ /* * Runs standard checks to validate the inputs */ -void checks_fast_bootstrap_low_latency(int nbits, int level_count, +void checks_fast_bootstrap_low_latency(int glwe_dimension, int level_count, int polynomial_size, int num_samples) { assert(( @@ -17,23 +17,25 @@ void checks_fast_bootstrap_low_latency(int nbits, int level_count, // value for k is 1, so k + 1 = 2 for now. int number_of_sm = 0; cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0); - assert(("Error (GPU low latency PBS): the number of input LWEs must be lower " - "or equal to the " - "number of streaming multiprocessors on the device divided by 8 * " - "level_count", - num_samples <= number_of_sm * 4. / 2. / level_count)); + assert( + ("Error (GPU low latency PBS): the number of input LWEs must be lower " + "or equal to the number of streaming multiprocessors on the device " + "divided by 4 * " + "(k + 1) * level_count", + num_samples <= number_of_sm * 4. / (glwe_dimension + 1) / level_count)); } /* * Runs standard checks to validate the inputs */ -void checks_bootstrap_low_latency(int nbits, int level_count, int base_log, +void checks_bootstrap_low_latency(int nbits, int glwe_dimension, + int level_count, int base_log, int polynomial_size, int num_samples) { assert(("Error (GPU low latency PBS): base log should be <= nbits", base_log <= nbits)); - checks_fast_bootstrap_low_latency(nbits, level_count, polynomial_size, - num_samples); + checks_fast_bootstrap_low_latency(glwe_dimension, level_count, + polynomial_size, num_samples); } /* @@ -47,8 +49,8 @@ void scratch_cuda_bootstrap_low_latency_32( uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, bool allocate_gpu_memory) { - checks_fast_bootstrap_low_latency(32, level_count, polynomial_size, - input_lwe_ciphertext_count); + checks_fast_bootstrap_low_latency( + glwe_dimension, level_count, polynomial_size, input_lwe_ciphertext_count); switch (polynomial_size) { case 256: @@ -103,8 +105,8 @@ void scratch_cuda_bootstrap_low_latency_64( uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, bool allocate_gpu_memory) { - checks_fast_bootstrap_low_latency(64, level_count, polynomial_size, - input_lwe_ciphertext_count); + checks_fast_bootstrap_low_latency( + glwe_dimension, level_count, polynomial_size, input_lwe_ciphertext_count); switch (polynomial_size) { case 256: @@ -163,8 +165,8 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32( uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx, uint32_t max_shared_memory) { - checks_bootstrap_low_latency(32, level_count, base_log, polynomial_size, - num_samples); + checks_bootstrap_low_latency(32, glwe_dimension, level_count, base_log, + polynomial_size, num_samples); switch (polynomial_size) { case 256: @@ -304,8 +306,8 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64( uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx, uint32_t max_shared_memory) { - checks_bootstrap_low_latency(64, level_count, base_log, polynomial_size, - num_samples); + checks_bootstrap_low_latency(64, glwe_dimension, level_count, base_log, + polynomial_size, num_samples); switch (polynomial_size) { case 256: diff --git a/backends/concrete-cuda/implementation/src/circuit_bootstrap.cu b/backends/concrete-cuda/implementation/src/circuit_bootstrap.cu index e4716db0d..efc9d2e7a 100644 --- a/backends/concrete-cuda/implementation/src/circuit_bootstrap.cu +++ b/backends/concrete-cuda/implementation/src/circuit_bootstrap.cu @@ -4,7 +4,7 @@ /* * Runs standard checks to validate the inputs */ -void checks_fast_circuit_bootstrap(int polynomial_size, int number_of_inputs) { +void checks_fast_circuit_bootstrap(int polynomial_size) { assert(("Error (GPU circuit bootstrap): polynomial_size should be one of " "256, 512, 1024, 2048, 4096, 8192", @@ -16,8 +16,8 @@ void checks_fast_circuit_bootstrap(int polynomial_size, int number_of_inputs) { /* * Runs standard checks to validate the inputs */ -void checks_circuit_bootstrap(int polynomial_size, int level_bsk, - int number_of_inputs) { +void checks_circuit_bootstrap(int glwe_dimension, int polynomial_size, + int level_bsk, int number_of_inputs) { // The number of samples should be lower than the number of streaming // multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related // to the occupancy of 50%). The only supported value for k is 1, so @@ -26,11 +26,12 @@ void checks_circuit_bootstrap(int polynomial_size, int level_bsk, cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0); assert(("Error (GPU extract bits): the number of input LWEs must be lower or " "equal to the " - "number of streaming multiprocessors on the device divided by 8 * " - "level_count_bsk", - number_of_inputs <= number_of_sm / 4. / 2. / level_bsk)); + "number of streaming multiprocessors on the device divided by 4 * " + "(k + 1) * level_count_bsk", + number_of_inputs <= + number_of_sm / 4. / (glwe_dimension + 1) / level_bsk)); - checks_fast_circuit_bootstrap(polynomial_size, number_of_inputs); + checks_fast_circuit_bootstrap(polynomial_size); } /* @@ -44,7 +45,7 @@ void scratch_cuda_circuit_bootstrap_32( uint32_t level_count_cbs, uint32_t number_of_inputs, uint32_t max_shared_memory, bool allocate_gpu_memory) { - checks_fast_circuit_bootstrap(polynomial_size, number_of_inputs); + checks_fast_circuit_bootstrap(polynomial_size); switch (polynomial_size) { case 256: @@ -99,7 +100,7 @@ void scratch_cuda_circuit_bootstrap_64( uint32_t level_count_cbs, uint32_t number_of_inputs, uint32_t max_shared_memory, bool allocate_gpu_memory) { - checks_fast_circuit_bootstrap(polynomial_size, number_of_inputs); + checks_fast_circuit_bootstrap(polynomial_size); switch (polynomial_size) { case 256: @@ -156,7 +157,8 @@ void cuda_circuit_bootstrap_32( uint32_t level_cbs, uint32_t base_log_cbs, uint32_t number_of_inputs, uint32_t max_shared_memory) { - checks_circuit_bootstrap(polynomial_size, level_bsk, number_of_inputs); + checks_circuit_bootstrap(glwe_dimension, polynomial_size, level_bsk, + number_of_inputs); switch (polynomial_size) { case 256: @@ -252,7 +254,8 @@ void cuda_circuit_bootstrap_64( uint32_t level_cbs, uint32_t base_log_cbs, uint32_t number_of_inputs, uint32_t max_shared_memory) { - checks_circuit_bootstrap(polynomial_size, level_bsk, number_of_inputs); + checks_circuit_bootstrap(glwe_dimension, polynomial_size, level_bsk, + number_of_inputs); switch (polynomial_size) { case 256: diff --git a/backends/concrete-cuda/implementation/src/vertical_packing.cu b/backends/concrete-cuda/implementation/src/vertical_packing.cu index cb7adfbe5..50d3817fd 100644 --- a/backends/concrete-cuda/implementation/src/vertical_packing.cu +++ b/backends/concrete-cuda/implementation/src/vertical_packing.cu @@ -5,7 +5,7 @@ /* * Runs standard checks to validate the inputs */ -void checks_fast_cmux_tree(int nbits, int polynomial_size, int r) { +void checks_fast_cmux_tree(int polynomial_size, int r) { assert(( "Error (GPU Cmux tree): polynomial size should be one of 256, 512, 1024, " "2048, 4096, 8192", @@ -14,7 +14,7 @@ void checks_fast_cmux_tree(int nbits, int polynomial_size, int r) { polynomial_size == 4096 || polynomial_size == 8192)); // For larger k we will need to adjust the mask size assert(("Error (GPU Cmux tree): r, the number of layers in the tree, should " - "be >= 1 ", + "be >= 1", r >= 1)); } @@ -25,7 +25,7 @@ void checks_cmux_tree(int nbits, int polynomial_size, int base_log, int r) { assert(("Error (GPU Cmux tree): base log should be <= nbits", base_log <= nbits)); - checks_fast_cmux_tree(nbits, polynomial_size, r); + checks_fast_cmux_tree(polynomial_size, r); } /* @@ -34,9 +34,7 @@ void checks_cmux_tree(int nbits, int polynomial_size, int base_log, int r) { void checks_blind_rotation_and_sample_extraction(int polynomial_size) { assert(("Error (GPU Blind rotation + sample extraction): polynomial size " - "should be one of 256, 512, " - "1024, " - "2048, 4096, 8192", + "should be one of 256, 512, 1024, 2048, 4096, 8192", polynomial_size == 256 || polynomial_size == 512 || polynomial_size == 1024 || polynomial_size == 2048 || polynomial_size == 4096 || polynomial_size == 8192)); @@ -54,7 +52,7 @@ void scratch_cuda_cmux_tree_32(void *v_stream, uint32_t gpu_index, uint32_t r, uint32_t tau, uint32_t max_shared_memory, bool allocate_gpu_memory) { - checks_fast_cmux_tree(32, polynomial_size, r); + checks_fast_cmux_tree(polynomial_size, r); switch (polynomial_size) { case 256: @@ -104,7 +102,7 @@ void scratch_cuda_cmux_tree_64(void *v_stream, uint32_t gpu_index, uint32_t r, uint32_t tau, uint32_t max_shared_memory, bool allocate_gpu_memory) { - checks_fast_cmux_tree(64, polynomial_size, r); + checks_fast_cmux_tree(polynomial_size, r); switch (polynomial_size) { case 256: diff --git a/backends/concrete-cuda/implementation/src/wop_bootstrap.cu b/backends/concrete-cuda/implementation/src/wop_bootstrap.cu index 15a740d0c..aa2014fa8 100644 --- a/backends/concrete-cuda/implementation/src/wop_bootstrap.cu +++ b/backends/concrete-cuda/implementation/src/wop_bootstrap.cu @@ -1,10 +1,12 @@ #include "wop_bootstrap.cuh" +#include /* * Runs standard checks to validate the inputs */ -void checks_wop_pbs(int polynomial_size, int level_count_bsk, - int number_of_inputs) { +void checks_wop_pbs(int glwe_dimension, int polynomial_size, + int level_count_bsk, int number_of_inputs, + int number_of_bits_to_extract) { assert(("Error (GPU WOP PBS): polynomial_size should be one of " "256, 512, 1024, 2048, 4096, 8192", polynomial_size == 256 || polynomial_size == 512 || @@ -18,9 +20,16 @@ void checks_wop_pbs(int polynomial_size, int level_count_bsk, cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0); assert(("Error (GPU WOP PBS): the number of input LWEs must be lower or " "equal to the " - "number of streaming multiprocessors on the device divided by 8 * " + "number of streaming multiprocessors on the device divided by 4 * (k " + "+ 1) * " "level_count_bsk", - number_of_inputs <= number_of_sm / 4. / 2. / level_count_bsk)); + number_of_inputs <= + number_of_sm / 4. / (glwe_dimension + 1) / level_count_bsk)); + assert( + ("Error (GPU WOP PBS): the number of inputs x the number of extracted " + "bits should be " + "larger than log2 of the polynomial size", + number_of_inputs * number_of_bits_to_extract >= log2(polynomial_size))); } void checks_fast_circuit_bootstrap_vertical_packing(int polynomial_size) { @@ -31,7 +40,8 @@ void checks_fast_circuit_bootstrap_vertical_packing(int polynomial_size) { polynomial_size == 4096 || polynomial_size == 8192)); } -void checks_circuit_bootstrap_vertical_packing(int polynomial_size, +void checks_circuit_bootstrap_vertical_packing(int glwe_dimension, + int polynomial_size, int number_of_inputs, int level_count_bsk) { // The number of inputs should be lower than the number of streaming @@ -42,9 +52,11 @@ void checks_circuit_bootstrap_vertical_packing(int polynomial_size, cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0); assert(("Error (GPU extract bits): the number of input LWEs must be lower or " "equal to the " - "number of streaming multiprocessors on the device divided by 8 * " + "number of streaming multiprocessors on the device divided by 4 * (k " + "+ 1) " "level_count_bsk", - number_of_inputs <= number_of_sm / 4. / 2. / level_count_bsk)); + number_of_inputs <= + number_of_sm / 4. / (glwe_dimension + 1) / level_count_bsk)); checks_fast_circuit_bootstrap_vertical_packing(polynomial_size); } @@ -176,7 +188,8 @@ void scratch_cuda_wop_pbs_32( uint32_t number_of_bits_of_message_including_padding, uint32_t number_of_bits_to_extract, uint32_t number_of_inputs, uint32_t max_shared_memory, bool allocate_gpu_memory) { - checks_wop_pbs(polynomial_size, level_count_bsk, number_of_inputs); + checks_wop_pbs(glwe_dimension, polynomial_size, level_count_bsk, + number_of_inputs, number_of_bits_to_extract); switch (polynomial_size) { case 256: scratch_wop_pbs>( @@ -245,7 +258,8 @@ void scratch_cuda_wop_pbs_64( uint32_t number_of_bits_of_message_including_padding, uint32_t number_of_bits_to_extract, uint32_t number_of_inputs, uint32_t max_shared_memory, bool allocate_gpu_memory) { - checks_wop_pbs(polynomial_size, level_count_bsk, number_of_inputs); + checks_wop_pbs(glwe_dimension, polynomial_size, level_count_bsk, + number_of_inputs, number_of_bits_to_extract); switch (polynomial_size) { case 256: scratch_wop_pbs>( @@ -337,8 +351,8 @@ void cuda_circuit_bootstrap_vertical_packing_64( uint32_t base_log_cbs, uint32_t number_of_inputs, uint32_t lut_number, uint32_t max_shared_memory) { - checks_circuit_bootstrap_vertical_packing(polynomial_size, number_of_inputs, - level_count_bsk); + checks_circuit_bootstrap_vertical_packing(glwe_dimension, polynomial_size, + number_of_inputs, level_count_bsk); switch (polynomial_size) { case 256: @@ -453,7 +467,8 @@ void cuda_wop_pbs_64(void *v_stream, uint32_t gpu_index, void *lwe_array_out, uint32_t number_of_bits_of_message_including_padding, uint32_t number_of_bits_to_extract, uint32_t delta_log, uint32_t number_of_inputs, uint32_t max_shared_memory) { - checks_wop_pbs(polynomial_size, level_count_bsk, number_of_inputs); + checks_wop_pbs(glwe_dimension, polynomial_size, level_count_bsk, + number_of_inputs, number_of_bits_to_extract); switch (polynomial_size) { case 256: host_wop_pbs>(