From 184d45338758d77406ccf709a765f7945d5eaf91 Mon Sep 17 00:00:00 2001 From: Pedro Alves Date: Wed, 22 Feb 2023 11:13:38 -0300 Subject: [PATCH] refactor(cuda): Implement support for N=256 in the cmux tree, bit extraction, and cbs. --- src/bit_extraction.cu | 48 +++++++++++++++++++++------ src/circuit_bootstrap.cu | 46 +++++++++++++++++++++----- src/vertical_packing.cu | 60 ++++++++++++++++++++++++++++------ src/wop_bootstrap.cu | 70 ++++++++++++++++++++++++++++++++++------ 4 files changed, 187 insertions(+), 37 deletions(-) diff --git a/src/bit_extraction.cu b/src/bit_extraction.cu index ac7a8f427..a53b4841e 100644 --- a/src/bit_extraction.cu +++ b/src/bit_extraction.cu @@ -12,6 +12,12 @@ void scratch_cuda_extract_bits_32( bool allocate_gpu_memory) { switch (polynomial_size) { + case 256: + scratch_extract_bits>( + v_stream, gpu_index, bit_extract_buffer, glwe_dimension, lwe_dimension, + polynomial_size, level_count, number_of_inputs, max_shared_memory, + allocate_gpu_memory); + break; case 512: scratch_extract_bits>( v_stream, gpu_index, bit_extract_buffer, glwe_dimension, lwe_dimension, @@ -59,6 +65,12 @@ void scratch_cuda_extract_bits_64( bool allocate_gpu_memory) { switch (polynomial_size) { + case 256: + scratch_extract_bits>( + v_stream, gpu_index, bit_extract_buffer, glwe_dimension, lwe_dimension, + polynomial_size, level_count, number_of_inputs, max_shared_memory, + allocate_gpu_memory); + break; case 512: scratch_extract_bits>( v_stream, gpu_index, bit_extract_buffer, glwe_dimension, lwe_dimension, @@ -110,10 +122,10 @@ void cuda_extract_bits_32(void *v_stream, uint32_t gpu_index, assert(("Error (GPU extract bits): base log should be <= 32", base_log_bsk <= 32)); assert(("Error (GPU extract bits): lwe_dimension_in should be one of " - "512, 1024, 2048, 4096, 8192", - lwe_dimension_in == 512 || lwe_dimension_in == 1024 || - lwe_dimension_in == 2048 || lwe_dimension_in == 4096 || - lwe_dimension_in == 8192)); + "256, 512, 1024, 2048, 4096, 8192", 
lwe_dimension_in == 256 || lwe_dimension_in == 512 || + lwe_dimension_in == 1024 || lwe_dimension_in == 2048 || + lwe_dimension_in == 4096 || lwe_dimension_in == 8192)); assert(("Error (GPU extract bits): lwe_dimension_in should be equal to " "polynomial_size", lwe_dimension_in == polynomial_size)); @@ -130,6 +142,15 @@ void cuda_extract_bits_32(void *v_stream, uint32_t gpu_index, number_of_samples <= number_of_sm / 4. / 2. / level_count_bsk)); switch (lwe_dimension_in) { + case 256: + host_extract_bits>( + v_stream, gpu_index, (uint32_t *)list_lwe_array_out, + (uint32_t *)lwe_array_in, bit_extract_buffer, (uint32_t *)ksk, + (double2 *)fourier_bsk, number_of_bits, delta_log, lwe_dimension_in, + lwe_dimension_out, glwe_dimension, polynomial_size, base_log_bsk, + level_count_bsk, base_log_ksk, level_count_ksk, number_of_samples, + max_shared_memory); + break; case 512: host_extract_bits>( v_stream, gpu_index, (uint32_t *)list_lwe_array_out, @@ -210,7 +231,7 @@ void cuda_extract_bits_32(void *v_stream, uint32_t gpu_index, * - 'ksk' keyswitch key * - 'fourier_bsk' complex compressed bsk in fourier domain * - 'lwe_dimension_in' input LWE ciphertext dimension, supported input - * dimensions are: {512, 1024,2048, 4096, 8192} + * dimensions are: {256, 512, 1024,2048, 4096, 8192} * - 'lwe_dimension_out' output LWE ciphertext dimension * - 'glwe_dimension' GLWE dimension, only glwe_dimension = 1 is supported * for now @@ -238,10 +259,10 @@ void cuda_extract_bits_64(void *v_stream, uint32_t gpu_index, assert(("Error (GPU extract bits): base log should be <= 64", base_log_bsk <= 64)); assert(("Error (GPU extract bits): lwe_dimension_in should be one of " - "512, 1024, 2048, 4096, 8192", - lwe_dimension_in == 512 || lwe_dimension_in == 1024 || - lwe_dimension_in == 2048 || lwe_dimension_in == 4096 || - lwe_dimension_in == 8192)); + "256, 512, 1024, 2048, 4096, 8192", + lwe_dimension_in == 256 || lwe_dimension_in == 512 || + lwe_dimension_in == 1024 || lwe_dimension_in == 2048 
|| + lwe_dimension_in == 4096 || lwe_dimension_in == 8192)); assert(("Error (GPU extract bits): lwe_dimension_in should be equal to " "polynomial_size", lwe_dimension_in == polynomial_size)); @@ -258,6 +279,15 @@ void cuda_extract_bits_64(void *v_stream, uint32_t gpu_index, number_of_samples <= number_of_sm / 4. / 2. / level_count_bsk)); switch (lwe_dimension_in) { + case 256: + host_extract_bits>( + v_stream, gpu_index, (uint64_t *)list_lwe_array_out, + (uint64_t *)lwe_array_in, bit_extract_buffer, (uint64_t *)ksk, + (double2 *)fourier_bsk, number_of_bits, delta_log, lwe_dimension_in, + lwe_dimension_out, glwe_dimension, polynomial_size, base_log_bsk, + level_count_bsk, base_log_ksk, level_count_ksk, number_of_samples, + max_shared_memory); + break; case 512: host_extract_bits>( v_stream, gpu_index, (uint64_t *)list_lwe_array_out, diff --git a/src/circuit_bootstrap.cu b/src/circuit_bootstrap.cu index 51256b2cd..55236f84c 100644 --- a/src/circuit_bootstrap.cu +++ b/src/circuit_bootstrap.cu @@ -13,6 +13,12 @@ void scratch_cuda_circuit_bootstrap_32( uint32_t max_shared_memory, bool allocate_gpu_memory) { switch (polynomial_size) { + case 256: + scratch_circuit_bootstrap>( + v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension, + polynomial_size, level_count_cbs, number_of_inputs, max_shared_memory, + allocate_gpu_memory); + break; case 512: scratch_circuit_bootstrap>( v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension, @@ -60,6 +66,12 @@ void scratch_cuda_circuit_bootstrap_64( uint32_t max_shared_memory, bool allocate_gpu_memory) { switch (polynomial_size) { + case 256: + scratch_circuit_bootstrap>( + v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension, + polynomial_size, level_count_cbs, number_of_inputs, max_shared_memory, + allocate_gpu_memory); + break; case 512: scratch_circuit_bootstrap>( v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension, @@ -108,10 +120,10 @@ void cuda_circuit_bootstrap_32( uint32_t level_cbs, 
uint32_t base_log_cbs, uint32_t number_of_inputs, uint32_t max_shared_memory) { assert(("Error (GPU circuit bootstrap): polynomial_size should be one of " - "512, 1024, 2048, 4096, 8192", - polynomial_size == 512 || polynomial_size == 1024 || - polynomial_size == 2048 || polynomial_size == 4096 || - polynomial_size == 8192)); + "256, 512, 1024, 2048, 4096, 8192", + polynomial_size == 256 || polynomial_size == 512 || + polynomial_size == 1024 || polynomial_size == 2048 || + polynomial_size == 4096 || polynomial_size == 8192)); // The number of samples should be lower than the number of streaming // multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related // to the occupancy of 50%). The only supported value for k is 1, so @@ -124,6 +136,15 @@ void cuda_circuit_bootstrap_32( "level_count_bsk", number_of_inputs <= number_of_sm / 4. / 2. / level_bsk)); switch (polynomial_size) { + case 256: + host_circuit_bootstrap>( + v_stream, gpu_index, (uint32_t *)ggsw_out, (uint32_t *)lwe_array_in, + (double2 *)fourier_bsk, (uint32_t *)fp_ksk_array, + (uint32_t *)lut_vector_indexes, cbs_buffer, delta_log, polynomial_size, + glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk, + base_log_pksk, level_cbs, base_log_cbs, number_of_inputs, + max_shared_memory); + break; case 512: host_circuit_bootstrap>( v_stream, gpu_index, (uint32_t *)ggsw_out, (uint32_t *)lwe_array_in, @@ -208,10 +229,10 @@ void cuda_circuit_bootstrap_64( uint32_t level_cbs, uint32_t base_log_cbs, uint32_t number_of_inputs, uint32_t max_shared_memory) { assert(("Error (GPU circuit bootstrap): polynomial_size should be one of " - "512, 1024, 2048, 4096, 8192", - polynomial_size == 512 || polynomial_size == 1024 || - polynomial_size == 2048 || polynomial_size == 4096 || - polynomial_size == 8192)); + "256, 512, 1024, 2048, 4096, 8192", + polynomial_size == 256 || polynomial_size == 512 || + polynomial_size == 1024 || polynomial_size == 2048 || + polynomial_size == 4096 || polynomial_size 
== 8192)); // The number of samples should be lower than the number of streaming // multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related // to the occupancy of 50%). The only supported value for k is 1, so @@ -225,6 +246,15 @@ void cuda_circuit_bootstrap_64( number_of_inputs <= number_of_sm / 4. / 2. / level_bsk)); // The number of samples should be lower than the number of streaming switch (polynomial_size) { + case 256: + host_circuit_bootstrap>( + v_stream, gpu_index, (uint64_t *)ggsw_out, (uint64_t *)lwe_array_in, + (double2 *)fourier_bsk, (uint64_t *)fp_ksk_array, + (uint64_t *)lut_vector_indexes, cbs_buffer, delta_log, polynomial_size, + glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk, + base_log_pksk, level_cbs, base_log_cbs, number_of_inputs, + max_shared_memory); + break; case 512: host_circuit_bootstrap>( v_stream, gpu_index, (uint64_t *)ggsw_out, (uint64_t *)lwe_array_in, diff --git a/src/vertical_packing.cu b/src/vertical_packing.cu index 945343373..eb6a437dd 100644 --- a/src/vertical_packing.cu +++ b/src/vertical_packing.cu @@ -16,6 +16,11 @@ void scratch_cuda_cmux_tree_32(void *v_stream, uint32_t gpu_index, bool allocate_gpu_memory) { switch (polynomial_size) { + case 256: + scratch_cmux_tree>( + v_stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size, + level_count, r, tau, max_shared_memory, allocate_gpu_memory); + break; case 512: scratch_cmux_tree>( v_stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size, @@ -59,6 +64,11 @@ void scratch_cuda_cmux_tree_64(void *v_stream, uint32_t gpu_index, uint32_t max_shared_memory, bool allocate_gpu_memory) { switch (polynomial_size) { + case 256: + scratch_cmux_tree>( + v_stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size, + level_count, r, tau, max_shared_memory, allocate_gpu_memory); + break; case 512: scratch_cmux_tree>( v_stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size, @@ -101,17 +111,24 @@ void 
cuda_cmux_tree_32(void *v_stream, uint32_t gpu_index, void *glwe_array_out, uint32_t max_shared_memory) { assert(("Error (GPU Cmux tree): base log should be <= 32", base_log <= 32)); - assert(("Error (GPU Cmux tree): polynomial size should be one of 512, 1024, " - "2048, 4096, 8192", - polynomial_size == 512 || polynomial_size == 1024 || - polynomial_size == 2048 || polynomial_size == 4096 || - polynomial_size == 8192)); + assert(( + "Error (GPU Cmux tree): polynomial size should be one of 256, 512, 1024, " + "2048, 4096, 8192", + polynomial_size == 256 || polynomial_size == 512 || + polynomial_size == 1024 || polynomial_size == 2048 || + polynomial_size == 4096 || polynomial_size == 8192)); // For larger k we will need to adjust the mask size assert(("Error (GPU Cmux tree): r, the number of layers in the tree, should " "be >= 1 ", r >= 1)); switch (polynomial_size) { + case 256: + host_cmux_tree>( + v_stream, gpu_index, (uint32_t *)glwe_array_out, (uint32_t *)ggsw_in, + (uint32_t *)lut_vector, cmux_tree_buffer, glwe_dimension, + polynomial_size, base_log, level_count, r, tau, max_shared_memory); + break; case 512: host_cmux_tree>( v_stream, gpu_index, (uint32_t *)glwe_array_out, (uint32_t *)ggsw_in, @@ -182,17 +199,24 @@ void cuda_cmux_tree_64(void *v_stream, uint32_t gpu_index, void *glwe_array_out, uint32_t max_shared_memory) { assert(("Error (GPU Cmux tree): base log should be <= 64", base_log <= 64)); - assert(("Error (GPU Cmux tree): polynomial size should be one of 512, 1024, " - "2048, 4096, 8192", - polynomial_size == 512 || polynomial_size == 1024 || - polynomial_size == 2048 || polynomial_size == 4096 || - polynomial_size == 8192)); + assert(( + "Error (GPU Cmux tree): polynomial size should be one of 256, 512, 1024, " + "2048, 4096, 8192", + polynomial_size == 256 || polynomial_size == 512 || + polynomial_size == 1024 || polynomial_size == 2048 || + polynomial_size == 4096 || polynomial_size == 8192)); // For larger k we will need to adjust the mask 
size assert(("Error (GPU Cmux tree): r, the number of layers in the tree, should " "be >= 1 ", r >= 1)); switch (polynomial_size) { + case 256: + host_cmux_tree>( + v_stream, gpu_index, (uint64_t *)glwe_array_out, (uint64_t *)ggsw_in, + (uint64_t *)lut_vector, cmux_tree_buffer, glwe_dimension, + polynomial_size, base_log, level_count, r, tau, max_shared_memory); + break; case 512: host_cmux_tree>( v_stream, gpu_index, (uint64_t *)glwe_array_out, (uint64_t *)ggsw_in, @@ -251,6 +275,11 @@ void scratch_cuda_blind_rotation_sample_extraction_32( bool allocate_gpu_memory) { switch (polynomial_size) { + case 256: + scratch_blind_rotation_sample_extraction>( + v_stream, gpu_index, br_se_buffer, glwe_dimension, polynomial_size, + level_count, mbr_size, tau, max_shared_memory, allocate_gpu_memory); + break; case 512: scratch_blind_rotation_sample_extraction>( v_stream, gpu_index, br_se_buffer, glwe_dimension, polynomial_size, @@ -293,6 +322,11 @@ void scratch_cuda_blind_rotation_sample_extraction_64( bool allocate_gpu_memory) { switch (polynomial_size) { + case 256: + scratch_blind_rotation_sample_extraction>( + v_stream, gpu_index, br_se_buffer, glwe_dimension, polynomial_size, + level_count, mbr_size, tau, max_shared_memory, allocate_gpu_memory); + break; case 512: scratch_blind_rotation_sample_extraction>( v_stream, gpu_index, br_se_buffer, glwe_dimension, polynomial_size, @@ -353,6 +387,12 @@ void cuda_blind_rotate_and_sample_extraction_64( uint32_t l_gadget, uint32_t max_shared_memory) { switch (polynomial_size) { + case 256: + host_blind_rotate_and_sample_extraction>( + v_stream, gpu_index, (uint64_t *)lwe_out, (uint64_t *)ggsw_in, + (uint64_t *)lut_vector, br_se_buffer, mbr_size, tau, glwe_dimension, + polynomial_size, base_log, l_gadget, max_shared_memory); + break; case 512: host_blind_rotate_and_sample_extraction>( v_stream, gpu_index, (uint64_t *)lwe_out, (uint64_t *)ggsw_in, diff --git a/src/wop_bootstrap.cu b/src/wop_bootstrap.cu index ae870a522..069c767e6 
100644 --- a/src/wop_bootstrap.cu +++ b/src/wop_bootstrap.cu @@ -14,6 +14,12 @@ void scratch_cuda_circuit_bootstrap_vertical_packing_32( bool allocate_gpu_memory) { switch (polynomial_size) { + case 256: + scratch_circuit_bootstrap_vertical_packing>( + v_stream, gpu_index, cbs_vp_buffer, cbs_delta_log, glwe_dimension, + lwe_dimension, polynomial_size, level_count_cbs, number_of_inputs, tau, + max_shared_memory, allocate_gpu_memory); + break; case 512: scratch_circuit_bootstrap_vertical_packing>( v_stream, gpu_index, cbs_vp_buffer, cbs_delta_log, glwe_dimension, @@ -63,6 +69,12 @@ void scratch_cuda_circuit_bootstrap_vertical_packing_64( bool allocate_gpu_memory) { switch (polynomial_size) { + case 256: + scratch_circuit_bootstrap_vertical_packing>( + v_stream, gpu_index, cbs_vp_buffer, cbs_delta_log, glwe_dimension, + lwe_dimension, polynomial_size, level_count_cbs, number_of_inputs, tau, + max_shared_memory, allocate_gpu_memory); + break; case 512: scratch_circuit_bootstrap_vertical_packing>( v_stream, gpu_index, cbs_vp_buffer, cbs_delta_log, glwe_dimension, @@ -113,6 +125,14 @@ void scratch_cuda_wop_pbs_32( uint32_t number_of_bits_to_extract, uint32_t number_of_inputs, uint32_t max_shared_memory, bool allocate_gpu_memory) { switch (polynomial_size) { + case 256: + scratch_wop_pbs>( + v_stream, gpu_index, wop_pbs_buffer, delta_log, cbs_delta_log, + glwe_dimension, lwe_dimension, polynomial_size, level_count_cbs, + level_count_bsk, number_of_bits_of_message_including_padding, + number_of_bits_to_extract, number_of_inputs, max_shared_memory, + allocate_gpu_memory); + break; case 512: scratch_wop_pbs>( v_stream, gpu_index, wop_pbs_buffer, delta_log, cbs_delta_log, @@ -173,6 +193,14 @@ void scratch_cuda_wop_pbs_64( uint32_t number_of_bits_to_extract, uint32_t number_of_inputs, uint32_t max_shared_memory, bool allocate_gpu_memory) { switch (polynomial_size) { + case 256: + scratch_wop_pbs>( + v_stream, gpu_index, wop_pbs_buffer, delta_log, cbs_delta_log, + 
glwe_dimension, lwe_dimension, polynomial_size, level_count_cbs, + level_count_bsk, number_of_bits_of_message_including_padding, + number_of_bits_to_extract, number_of_inputs, max_shared_memory, + allocate_gpu_memory); + break; case 512: scratch_wop_pbs>( v_stream, gpu_index, wop_pbs_buffer, delta_log, cbs_delta_log, @@ -232,7 +260,7 @@ void scratch_cuda_wop_pbs_64( * - 'lut_vector' list of test vectors * - 'cbs_vp_buffer' a pre-allocated array to store intermediate results * - 'polynomial_size' size of the test polynomial, supported sizes: - * {512, 1024, 2048, 4096, 8192} + * {256, 512, 1024, 2048, 4096, 8192} * - 'glwe_dimension' supported dimensions: {1} * - 'lwe_dimension' dimension of input LWE ciphertexts * - 'level_count_bsk' decomposition level for bootstrapping @@ -255,10 +283,10 @@ void cuda_circuit_bootstrap_vertical_packing_64( uint32_t base_log_cbs, uint32_t number_of_inputs, uint32_t lut_number, uint32_t max_shared_memory) { assert(("Error (GPU circuit bootstrap): polynomial_size should be one of " - "512, 1024, 2048, 4096, 8192", - polynomial_size == 512 || polynomial_size == 1024 || - polynomial_size == 2048 || polynomial_size == 4096 || - polynomial_size == 8192)); + "256, 512, 1024, 2048, 4096, 8192", + polynomial_size == 256 || polynomial_size == 512 || + polynomial_size == 1024 || polynomial_size == 2048 || + polynomial_size == 4096 || polynomial_size == 8192)); // The number of inputs should be lower than the number of streaming // multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related // to the occupancy of 50%). The only supported value for k is 1, so @@ -271,6 +299,16 @@ void cuda_circuit_bootstrap_vertical_packing_64( "level_count_bsk", number_of_inputs <= number_of_sm / 4. / 2. 
/ level_count_bsk)); switch (polynomial_size) { + case 256: + host_circuit_bootstrap_vertical_packing>( + v_stream, gpu_index, (uint64_t *)lwe_array_out, + (uint64_t *)lwe_array_in, (uint64_t *)lut_vector, + (double2 *)fourier_bsk, (uint64_t *)cbs_fpksk, cbs_vp_buffer, + cbs_delta_log, glwe_dimension, lwe_dimension, polynomial_size, + base_log_bsk, level_count_bsk, base_log_pksk, level_count_pksk, + base_log_cbs, level_count_cbs, number_of_inputs, lut_number, + max_shared_memory); + break; case 512: host_circuit_bootstrap_vertical_packing>( v_stream, gpu_index, (uint64_t *)lwe_array_out, @@ -343,7 +381,7 @@ void cuda_circuit_bootstrap_vertical_packing_64( * - 'glwe_dimension' supported dimensions: {1} * - 'lwe_dimension' dimension of input lwe ciphertexts * - 'polynomial_size' size of the test polynomial, supported sizes: - * {512, 1024, 2048, 4096, 8192} + * {256, 512, 1024, 2048, 4096, 8192} * - 'base_log_bsk' base log parameter for bootstrapping * - 'level_count_bsk' decomposition level for bootstrapping * - 'base_log_ksk' base log parameter for keyswitch @@ -374,10 +412,10 @@ void cuda_wop_pbs_64(void *v_stream, uint32_t gpu_index, void *lwe_array_out, uint32_t number_of_bits_to_extract, uint32_t delta_log, uint32_t number_of_inputs, uint32_t max_shared_memory) { assert(("Error (GPU WOP PBS): polynomial_size should be one of " - "512, 1024, 2048, 4096, 8192", - polynomial_size == 512 || polynomial_size == 1024 || - polynomial_size == 2048 || polynomial_size == 4096 || - polynomial_size == 8192)); + "256, 512, 1024, 2048, 4096, 8192", + polynomial_size == 256 || polynomial_size == 512 || + polynomial_size == 1024 || polynomial_size == 2048 || + polynomial_size == 4096 || polynomial_size == 8192)); // The number of inputs should be lower than the number of streaming // multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related // to the occupancy of 50%). 
The only supported value for k is 1, so @@ -390,6 +428,18 @@ void cuda_wop_pbs_64(void *v_stream, uint32_t gpu_index, void *lwe_array_out, "level_count_bsk", number_of_inputs <= number_of_sm / 4. / 2. / level_count_bsk)); switch (polynomial_size) { + case 256: + host_wop_pbs>( + v_stream, gpu_index, (uint64_t *)lwe_array_out, + (uint64_t *)lwe_array_in, (uint64_t *)lut_vector, + (double2 *)fourier_bsk, (uint64_t *)ksk, (uint64_t *)cbs_fpksk, + wop_pbs_buffer, cbs_delta_log, glwe_dimension, lwe_dimension, + polynomial_size, base_log_bsk, level_count_bsk, base_log_ksk, + level_count_ksk, base_log_pksk, level_count_pksk, base_log_cbs, + level_count_cbs, number_of_bits_of_message_including_padding, + number_of_bits_to_extract, delta_log, number_of_inputs, + max_shared_memory); + break; case 512: host_wop_pbs>( v_stream, gpu_index, (uint64_t *)lwe_array_out,