From 703c74401ca52936a60fcd99db7531f12ad51d5c Mon Sep 17 00:00:00 2001
From: Agnes Leroy
Date: Fri, 14 Oct 2022 11:13:58 +0200
Subject: [PATCH] chore(cuda): add asserts on base log, poly size and num
 samples values

---
 src/bootstrap_amortized.cu   | 10 ++++++++
 src/bootstrap_low_latency.cu | 24 +++++++++++++++++
 src/bootstrap_wop.cu         | 50 ++++++++++++++++++++++++++++++++++++
 src/bootstrap_wop.cuh        |  3 ---
 4 files changed, 84 insertions(+), 3 deletions(-)

diff --git a/src/bootstrap_amortized.cu b/src/bootstrap_amortized.cu
index 15647b4bc..6504d4b9e 100644
--- a/src/bootstrap_amortized.cu
+++ b/src/bootstrap_amortized.cu
@@ -73,6 +73,11 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
     uint32_t lwe_idx, uint32_t max_shared_memory) {
 
+  assert(("Error (GPU amortized PBS): base log should be <= 16", base_log <= 16));
+  assert(("Error (GPU amortized PBS): polynomial size should be one of 512, 1024, 2048, 4096, 8192",
+          polynomial_size == 512 || polynomial_size == 1024 || polynomial_size == 2048 ||
+          polynomial_size == 4096 || polynomial_size == 8192));
+
   switch (polynomial_size) {
   case 512:
     host_bootstrap_amortized<uint32_t, Degree<512>>(
@@ -131,6 +136,11 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
     uint32_t lwe_idx, uint32_t max_shared_memory) {
 
+  assert(("Error (GPU amortized PBS): base log should be <= 16", base_log <= 16));
+  assert(("Error (GPU amortized PBS): polynomial size should be one of 512, 1024, 2048, 4096, 8192",
+          polynomial_size == 512 || polynomial_size == 1024 || polynomial_size == 2048 ||
+          polynomial_size == 4096 || polynomial_size == 8192));
+
   switch (polynomial_size) {
   case 512:
     host_bootstrap_amortized<uint64_t, Degree<512>>(
diff --git a/src/bootstrap_low_latency.cu b/src/bootstrap_low_latency.cu
index b89b754c4..dfa80ce6b 100644
--- a/src/bootstrap_low_latency.cu
+++ b/src/bootstrap_low_latency.cu
@@ -72,6 +72,18 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
     uint32_t lwe_idx, uint32_t max_shared_memory) {
 
+  assert(("Error (GPU low latency PBS): base log should be <= 16", base_log <= 16));
+  assert(("Error (GPU low latency PBS): polynomial size should be one of 512, 1024, 2048",
+          polynomial_size == 512 || polynomial_size == 1024 || polynomial_size == 2048));
+  // The number of samples should be at most SM / (4 * (k + 1) * l) (the
+  // factor 4 being related to the occupancy of 50%). The only supported
+  // value for k is 1, so k + 1 = 2 for now.
+  int number_of_sm = 0;
+  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
+  assert(("Error (GPU low latency PBS): the number of input LWEs must be lower than or equal to "
+          "the number of streaming multiprocessors on the device divided by 8 * l_gadget",
+          num_samples <= number_of_sm / 4. / 2. / l_gadget));
+
   switch (polynomial_size) {
   case 512:
     host_bootstrap_low_latency<uint32_t, Degree<512>>(
@@ -134,6 +146,18 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
     uint32_t lwe_idx, uint32_t max_shared_memory) {
 
+  assert(("Error (GPU low latency PBS): base log should be <= 16", base_log <= 16));
+  assert(("Error (GPU low latency PBS): polynomial size should be one of 512, 1024, 2048",
+          polynomial_size == 512 || polynomial_size == 1024 || polynomial_size == 2048));
+  // The number of samples should be at most SM / (4 * (k + 1) * l) (the
+  // factor 4 being related to the occupancy of 50%). The only supported
+  // value for k is 1, so k + 1 = 2 for now.
+  int number_of_sm = 0;
+  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
+  assert(("Error (GPU low latency PBS): the number of input LWEs must be lower than or equal to "
+          "the number of streaming multiprocessors on the device divided by 8 * l_gadget",
+          num_samples <= number_of_sm / 4. / 2. / l_gadget));
+
   switch (polynomial_size) {
   case 512:
     host_bootstrap_low_latency<uint64_t, Degree<512>>(
diff --git a/src/bootstrap_wop.cu b/src/bootstrap_wop.cu
index 05453f56e..653dead57 100644
--- a/src/bootstrap_wop.cu
+++ b/src/bootstrap_wop.cu
@@ -12,6 +12,15 @@ void cuda_cmux_tree_32(
     uint32_t r, uint32_t max_shared_memory) {
 
+  assert(("Error (GPU Cmux tree): base log should be <= 16", base_log <= 16));
+  assert(("Error (GPU Cmux tree): polynomial size should be one of 512, 1024, 2048, 4096, 8192",
+          polynomial_size == 512 || polynomial_size == 1024 || polynomial_size == 2048 ||
+          polynomial_size == 4096 || polynomial_size == 8192));
+  // For larger k we will need to adjust the mask size
+  assert(("Error (GPU Cmux tree): glwe_dimension should be equal to 1", glwe_dimension == 1));
+  assert(("Error (GPU Cmux tree): r, the number of layers in the tree, should be >= 1",
+          r >= 1));
+
   switch (polynomial_size) {
   case 512:
     host_cmux_tree<uint32_t, Degree<512>>(
@@ -48,6 +57,8 @@
         glwe_dimension, polynomial_size, base_log, l_gadget, r,
         max_shared_memory);
     break;
+  default:
+    break;
   }
 }
@@ -63,6 +74,15 @@ void cuda_cmux_tree_64(
     uint32_t r, uint32_t max_shared_memory) {
 
+  assert(("Error (GPU Cmux tree): base log should be <= 16", base_log <= 16));
+  assert(("Error (GPU Cmux tree): polynomial size should be one of 512, 1024, 2048, 4096, 8192",
+          polynomial_size == 512 || polynomial_size == 1024 || polynomial_size == 2048 ||
+          polynomial_size == 4096 || polynomial_size == 8192));
+  // For larger k we will need to adjust the mask size
+  assert(("Error (GPU Cmux tree): glwe_dimension should be equal to 1", glwe_dimension == 1));
+  assert(("Error (GPU Cmux tree): r, the number of layers in the tree, should be >= 1",
+          r >= 1));
+
   switch (polynomial_size) {
   case 512:
     host_cmux_tree<uint64_t, Degree<512>>(
@@ -99,6 +119,8 @@
         glwe_dimension, polynomial_size, base_log, l_gadget, r,
         max_shared_memory);
     break;
+  default:
+    break;
   }
 }
@@ -125,6 +147,20 @@ void cuda_extract_bits_32(
     uint32_t l_gadget_ksk, uint32_t number_of_samples) {
 
+  assert(("Error (GPU extract bits): base log should be <= 16", base_log_bsk <= 16));
+  assert(("Error (GPU extract bits): lwe_dimension_before should be one of 512, 1024, 2048",
+          lwe_dimension_before == 512 || lwe_dimension_before == 1024 ||
+          lwe_dimension_before == 2048));
+  // The number of samples should be at most the number of streaming
+  // multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
+  // to the occupancy of 50%). The only supported value for k is 1, so
+  // k + 1 = 2 for now.
+  int number_of_sm = 0;
+  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
+  assert(("Error (GPU extract bits): the number of input LWEs must be lower than or equal to "
+          "the number of streaming multiprocessors on the device divided by 8 * l_gadget_bsk",
+          number_of_samples <= number_of_sm / 4. / 2. / l_gadget_bsk));
+
   switch (lwe_dimension_before) {
   case 512:
     host_extract_bits<uint32_t, Degree<512>>(
@@ -186,6 +222,20 @@ void cuda_extract_bits_64(
     uint32_t l_gadget_ksk, uint32_t number_of_samples) {
 
+  assert(("Error (GPU extract bits): base log should be <= 16", base_log_bsk <= 16));
+  assert(("Error (GPU extract bits): lwe_dimension_before should be one of 512, 1024, 2048",
+          lwe_dimension_before == 512 || lwe_dimension_before == 1024 ||
+          lwe_dimension_before == 2048));
+  // The number of samples should be at most the number of streaming
+  // multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
+  // to the occupancy of 50%). The only supported value for k is 1, so
+  // k + 1 = 2 for now.
+  int number_of_sm = 0;
+  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
+  assert(("Error (GPU extract bits): the number of input LWEs must be lower than or equal to "
+          "the number of streaming multiprocessors on the device divided by 8 * l_gadget_bsk",
+          number_of_samples <= number_of_sm / 4. / 2. / l_gadget_bsk));
+
   switch (lwe_dimension_before) {
   case 512:
     host_extract_bits<uint64_t, Degree<512>>(
diff --git a/src/bootstrap_wop.cuh b/src/bootstrap_wop.cuh
index a1aff02f8..964d55071 100644
--- a/src/bootstrap_wop.cuh
+++ b/src/bootstrap_wop.cuh
@@ -298,9 +298,6 @@ void host_cmux_tree(
     uint32_t r, uint32_t max_shared_memory) {
 
-  assert(glwe_dimension == 1); // For larger k we will need to adjust the mask size
-  assert(r >= 1);
-
   auto stream = static_cast<cudaStream_t *>(v_stream);
   int num_lut = (1 << r);
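
Note on the assert style used throughout this patch: assert(("message", condition))
relies on the C/C++ comma operator, so only the condition is actually tested,
while the string literal still appears in the stringified expression that the
standard assert macro prints before aborting. A minimal self-contained sketch of
the idiom (the helper name and parameter are illustrative, not part of the patch):

    #include <cassert>
    #include <cstdint>

    // Illustrative helper (not in the patch): the string literal on the left
    // of the comma is discarded at run time, but shows up verbatim in the
    // diagnostic assert prints when the condition on the right is false.
    static void check_base_log(uint32_t base_log) {
      assert(("Error (GPU PBS): base log should be <= 16", base_log <= 16));
    }

    int main() {
      check_base_log(8);    // passes silently
      // check_base_log(32); // would abort and print the full expression
      return 0;
    }

Keep in mind that these checks vanish entirely when NDEBUG is defined, so builds
compiled with -DNDEBUG get no protection from the new asserts.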
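
The sample-count bound is worth a worked example. With the only supported GLWE
dimension k = 1, the new checks require num_samples <= SM / (4 * (k + 1) * l)
= SM / (8 * l_gadget), the factor 4 coming from the 50% occupancy target
mentioned in the comments. A sketch of the same computation as a standalone
helper, assuming device 0 is the target GPU (the helper name is hypothetical):

    #include <cstdint>
    #include <cstdio>
    #include <cuda_runtime.h>

    // Hypothetical helper mirroring the bound asserted in the patch:
    // num_samples <= SM_count / 4. / 2. / l_gadget, i.e. SM_count / (8 * l).
    static uint32_t max_low_latency_samples(uint32_t l_gadget, int device = 0) {
      int number_of_sm = 0;
      cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount,
                             device);
      // The divisor 2 is k + 1 with k = 1, the only supported GLWE dimension.
      return static_cast<uint32_t>(number_of_sm / 4. / 2. / l_gadget);
    }

    int main() {
      printf("max low-latency PBS samples: %u\n", max_low_latency_samples(2));
      return 0;
    }

For instance, on a 108-SM device (e.g. an A100) with l_gadget = 1 the bound
works out to 108 / 8 = 13 input LWEs per call, so the assert fires well before
any kernel launch would fail for lack of resident blocks.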