diff --git a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h
index 0c8c187fc..21889dde4 100644
--- a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h
@@ -7,23 +7,23 @@ template <typename Torus>
 bool supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
     uint32_t polynomial_size, uint32_t max_shared_memory);
 
-template <typename Torus>
+template <typename InputTorus, typename Torus>
 bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit(
     uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
     uint32_t level_count, uint32_t max_shared_memory);
 
 #if CUDA_ARCH >= 900
-template <typename Torus>
+template <typename InputTorus, typename Torus>
 uint64_t scratch_cuda_tbc_multi_bit_programmable_bootstrap(
     void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **buffer,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
     uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
 
-template <typename Torus>
+template <typename InputTorus, typename Torus>
 void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
     void *stream, uint32_t gpu_index, Torus *lwe_array_out,
     Torus const *lwe_output_indexes, Torus const *lut_vector,
-    Torus const *lut_vector_indexes, Torus const *lwe_array_in,
+    Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
     Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
     pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
@@ -48,7 +48,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
     uint32_t base_log, uint32_t level_count, uint32_t num_samples,
     uint32_t num_many_lut, uint32_t lut_stride);
 
-template <typename Torus>
+template <typename InputTorus, typename Torus>
 uint64_t scratch_cuda_multi_bit_programmable_bootstrap(
     void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
@@ -96,7 +96,7 @@ template <typename Torus>
 uint64_t get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap(
     uint32_t polynomial_size);
 
-template <typename Torus>
+template <typename InputTorus, typename Torus>
 uint64_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
                             uint32_t polynomial_size, uint32_t glwe_dimension,
                             uint32_t level_count, uint64_t full_sm_keybundle);
diff --git a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h
index b131e8ff2..75587618f 100644
--- a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h
@@ -428,29 +428,29 @@ uint64_t get_buffer_size_programmable_bootstrap_cg(
   return buffer_size + buffer_size % sizeof(double2);
 }
 
-template <typename Torus>
+template <typename InputTorus, typename Torus>
 bool has_support_to_cuda_programmable_bootstrap_cg(uint32_t glwe_dimension,
                                                    uint32_t polynomial_size,
                                                    uint32_t level_count,
                                                    uint32_t num_samples,
                                                    uint32_t max_shared_memory);
 
-template <typename Torus>
+template <typename InputTorus, typename Torus>
 void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
     void *stream, uint32_t gpu_index, Torus *lwe_array_out,
     Torus const *lwe_output_indexes, Torus const *lut_vector,
-    Torus const *lut_vector_indexes, Torus const *lwe_array_in,
+    Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
     Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
     pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
     uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
     uint32_t lut_stride);
 
-template <typename Torus>
+template <typename InputTorus, typename Torus>
 void cuda_programmable_bootstrap_lwe_ciphertext_vector(
     void *stream, uint32_t gpu_index, Torus *lwe_array_out,
     Torus const *lwe_output_indexes, Torus const *lut_vector,
-    Torus const *lut_vector_indexes, Torus const *lwe_array_in,
+    Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
     Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
     pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
@@ -458,11 +458,11 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
     uint32_t lut_stride);
 
 #if (CUDA_ARCH >= 900)
-template <typename Torus>
+template <typename InputTorus, typename Torus>
 void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
     void *stream, uint32_t gpu_index, Torus *lwe_array_out,
     Torus const *lwe_output_indexes, Torus const *lut_vector,
-    Torus const *lut_vector_indexes, Torus const *lwe_array_in,
+    Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
     Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
     pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
@@ -477,14 +477,14 @@ uint64_t scratch_cuda_programmable_bootstrap_tbc(
     bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
 #endif
 
-template <typename Torus>
+template <typename InputTorus, typename Torus>
 uint64_t scratch_cuda_programmable_bootstrap_cg(
     void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
     uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
     uint32_t level_count, uint32_t input_lwe_ciphertext_count,
     bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
 
-template <typename Torus>
+template <typename InputTorus, typename Torus>
 uint64_t scratch_cuda_programmable_bootstrap(
     void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **buffer,
     uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
diff --git a/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap.h b/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap.h
index 0b87999f6..5cc3cb509 100644
--- a/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap.h
+++ b/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap.h
@@ -69,13 +69,19 @@ uint64_t scratch_cuda_programmable_bootstrap_64(
     uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
     PBS_MS_REDUCTION_T noise_reduction_type);
 
+uint64_t scratch_cuda_programmable_bootstrap_32_64(
+    void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t lwe_dimension,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
+    PBS_MS_REDUCTION_T noise_reduction_type);
+
 uint64_t scratch_cuda_programmable_bootstrap_128(
     void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
     uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
     PBS_MS_REDUCTION_T noise_reduction_type);
 
-void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
+void cuda_programmable_bootstrap_lwe_ciphertext_vector_64_64(
     void *stream, uint32_t gpu_index, void *lwe_array_out,
     void const *lwe_output_indexes, void const *lut_vector,
     void const *lut_vector_indexes, void const *lwe_array_in,
@@ -84,7 +90,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
     uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
     uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride);
 
-void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
+void cuda_programmable_bootstrap_lwe_ciphertext_vector_32_64(
     void *stream, uint32_t gpu_index, void *lwe_array_out,
     void const *lwe_output_indexes, void const *lut_vector,
     void const *lut_vector_indexes, void const *lwe_array_in,
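For orientation, this is roughly how the new 32-to-64-bit classic PBS entry points declared above are driven from the host. A minimal sketch, assuming device pointers, keys and PBS parameters were set up elsewhere (the `d_*` variable names are illustrative, not part of this change); note that the input ciphertexts are u32 while the LUTs, indexes and outputs stay u64:

    // Allocate scratch for a batch of num_samples bootstraps (sketch).
    int8_t *pbs_scratch = nullptr;
    scratch_cuda_programmable_bootstrap_32_64(
        stream, gpu_index, &pbs_scratch, lwe_dimension, glwe_dimension,
        polynomial_size, level_count, num_samples,
        /*allocate_gpu_memory=*/true, noise_reduction_type);

    // Bootstrap u32 input LWEs into u64 output LWEs.
    cuda_programmable_bootstrap_lwe_ciphertext_vector_32_64(
        stream, gpu_index, d_lwe_out_u64, d_output_indexes, d_lut_vector,
        d_lut_vector_indexes, d_lwe_in_u32, d_input_indexes,
        d_bootstrapping_key, pbs_scratch, lwe_dimension, glwe_dimension,
        polynomial_size, base_log, level_count, num_samples, num_many_lut,
        lut_stride);

    // Release the scratch buffer once the stream has been synchronized.
    cleanup_cuda_programmable_bootstrap(stream, gpu_index, &pbs_scratch);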
diff --git a/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h b/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h
index 6679ed09c..ca7fc6214 100644
--- a/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h
+++ b/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h
@@ -4,12 +4,12 @@
 #include "pbs_enums.h"
 #include "stdint.h"
 
-extern "C" {
-
 bool has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
     uint32_t num_samples, uint32_t max_shared_memory);
 
+extern "C" {
+
 void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
     void *stream, uint32_t gpu_index, void *dest, void const *src,
     uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
@@ -20,6 +20,11 @@ void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_128(
     uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
     uint32_t polynomial_size, uint32_t grouping_factor);
 
+uint64_t scratch_cuda_multi_bit_programmable_bootstrap_32_64(
+    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
+
 uint64_t scratch_cuda_multi_bit_programmable_bootstrap_64(
     void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
@@ -35,6 +40,16 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
     uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
     uint32_t lut_stride);
 
+void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_32_64(
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void const *lwe_output_indexes, void const *lut_vector,
+    void const *lut_vector_indexes, void const *lwe_array_in,
+    void const *lwe_input_indexes, void const *bootstrapping_key,
+    int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
+    uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
+    uint32_t lut_stride);
+
 void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
                                                    uint32_t gpu_index,
                                                    int8_t **pbs_buffer);
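The multi-bit path mirrors the classic one. A minimal sketch of driving the new 32-to-64-bit multi-bit entry points, under the same assumptions as the classic sketch above (illustrative variable names; u32 inputs, u64 outputs, plus the extra `grouping_factor` parameter and no noise-reduction argument on the scratch call):

    int8_t *mb_scratch = nullptr;
    scratch_cuda_multi_bit_programmable_bootstrap_32_64(
        stream, gpu_index, &mb_scratch, glwe_dimension, polynomial_size,
        level_count, num_samples, /*allocate_gpu_memory=*/true);

    cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_32_64(
        stream, gpu_index, d_lwe_out_u64, d_output_indexes, d_lut_vector,
        d_lut_vector_indexes, d_lwe_in_u32, d_input_indexes,
        d_bootstrapping_key, mb_scratch, lwe_dimension, glwe_dimension,
        polynomial_size, grouping_factor, base_log, level_count, num_samples,
        num_many_lut, lut_stride);

    cleanup_cuda_multi_bit_programmable_bootstrap(stream, gpu_index,
                                                  &mb_scratch);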
diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
index 9b024c73b..efb83da0c 100644
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
@@ -535,7 +535,6 @@ __host__ void host_gemm_keyswitch_lwe_ciphertext_vector(
   dim3 grid_negate(CEIL_DIV(lwe_dimension_out + 1, BLOCK_SIZE_DECOMP),
                    CEIL_DIV(num_samples, BLOCK_SIZE_DECOMP));
   dim3 threads_negate(BLOCK_SIZE_DECOMP, BLOCK_SIZE_DECOMP);
-  // Negate all outputs in the output LWEs. This is the final step in the GEMM
   // keyswitch computed as: -(-b + sum(a_i A_KSK))
   keyswitch_negate_with_output_indices
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh
index 9d9aa757e..ea2f76a34 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh
@@ -263,42 +263,7 @@ void execute_pbs_async(CudaStreams streams,
                        uint32_t num_many_lut, uint32_t lut_stride) {
 
   if constexpr (std::is_same_v<Torus, uint32_t>) {
-    // 32 bits
-    switch (pbs_type) {
-    case MULTI_BIT:
-      PANIC("Error: 32-bit multibit PBS is not supported.\n")
-    case CLASSICAL:
-      for (uint i = 0; i < streams.count(); i++) {
-        int num_inputs_on_gpu = get_num_inputs_on_gpu(
-            input_lwe_ciphertext_count, i, streams.count());
-
-        int gpu_offset =
-            get_gpu_offset(input_lwe_ciphertext_count, i, streams.count());
-        auto d_lut_vector_indexes =
-            lut_indexes_vec[i] + (ptrdiff_t)(gpu_offset);
-
-        // Use the macro to get the correct elements for the current iteration
-        // Handles the case when the input/output are scattered through
-        // different gpus and when it is not
-        auto current_lwe_array_out = get_variant_element(lwe_array_out, i);
-        auto current_lwe_output_indexes =
-            get_variant_element(lwe_output_indexes, i);
-        auto current_lwe_array_in = get_variant_element(lwe_array_in, i);
-        auto current_lwe_input_indexes =
-            get_variant_element(lwe_input_indexes, i);
-
-        cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
-            streams.stream(i), streams.gpu_index(i), current_lwe_array_out,
-            current_lwe_output_indexes, lut_vec[i], d_lut_vector_indexes,
-            current_lwe_array_in, current_lwe_input_indexes,
-            bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension,
-            polynomial_size, base_log, level_count, num_inputs_on_gpu,
-            num_many_lut, lut_stride);
-      }
-      break;
-    default:
-      PANIC("Error: unsupported cuda PBS type.")
-    }
+    PANIC("Error: 32-bit CUDA PBS is not supported.")
   } else if constexpr (std::is_same_v<Torus, uint64_t>) {
     // 64 bits
     switch (pbs_type) {
@@ -353,7 +318,7 @@ void execute_pbs_async(CudaStreams streams,
         auto d_lut_vector_indexes =
             lut_indexes_vec[i] + (ptrdiff_t)(gpu_offset);
 
-        cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
+        cuda_programmable_bootstrap_lwe_ciphertext_vector_64_64(
            streams.stream(i), streams.gpu_index(i), current_lwe_array_out,
             current_lwe_output_indexes, lut_vec[i], d_lut_vector_indexes,
             current_lwe_array_in, current_lwe_input_indexes,
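The convention introduced by this refactor is that `InputTorus` is the torus of the incoming LWE ciphertext (u32 or u64), while `Torus` stays the accumulator/output torus (u64 everywhere below). A minimal, hypothetical illustration of the split, not a verbatim excerpt from the patch:

    // Only the input ciphertext, and the values the modulus switch derives
    // from it (a_hat, b_hat, correction), live on the input torus; LUTs,
    // indexes and outputs remain on the output torus.
    template <typename InputTorus, typename Torus>
    void pbs_shape_example(Torus *lwe_array_out,           // u64 outputs
                           const Torus *lut_vector,        // u64 accumulators
                           const InputTorus *lwe_array_in  // u32 or u64 inputs
    ) {
      InputTorus b_hat = 0; // modulus-switched body stays on the input torus
      // ... rotate the u64 accumulator by b_hat, then run external products
    }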
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh
index 48601c3ab..309004ee2 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh
@@ -35,12 +35,13 @@ namespace cg = cooperative_groups;
 *
 * Each y-block computes one element of the lwe_array_out.
 */
-template <typename Torus, class params, sharedMemDegree SMD>
+template <typename InputTorus, typename Torus, class params,
+          sharedMemDegree SMD>
__global__ void device_programmable_bootstrap_cg(
    Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
    const Torus *__restrict__ lut_vector,
    const Torus *__restrict__ lut_vector_indexes,
-    const Torus *__restrict__ lwe_array_in,
+    const InputTorus *__restrict__ lwe_array_in,
    const Torus *__restrict__ lwe_input_indexes,
    const double2 *__restrict__ bootstrapping_key, double2 *join_buffer,
    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
@@ -79,7 +80,7 @@ __global__ void device_programmable_bootstrap_cg(
 
  // The third dimension of the block is used to determine on which ciphertext
  // this block is operating, in the case of batch bootstraps
-  const Torus *block_lwe_array_in =
+  const InputTorus *block_lwe_array_in =
      &lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
 
  const Torus *block_lut_vector =
@@ -95,8 +96,8 @@ __global__ void device_programmable_bootstrap_cg(
 
    // Put "b" in [0, 2N[
    constexpr auto log_modulus = params::log2_degree + 1;
-    Torus b_hat = 0;
-    Torus correction = 0;
+    InputTorus b_hat = 0;
+    InputTorus correction = 0;
    if (noise_reduction_type == PBS_MS_REDUCTION_T::CENTERED) {
      correction = centered_binary_modulus_switch_body_correction_to_add(
          block_lwe_array_in, lwe_dimension, log_modulus);
@@ -113,7 +114,7 @@ __global__ void device_programmable_bootstrap_cg(
    __syncthreads();
 
    // Put "a" in [0, 2N[
-    Torus a_hat = 0;
+    InputTorus a_hat = 0;
    modulus_switch(block_lwe_array_in[i], a_hat, params::log2_degree + 1);
 
    // Perform ACC * (X^ä - 1)
@@ -193,7 +194,7 @@ __global__ void device_programmable_bootstrap_cg(
  }
}
 
-template <typename Torus, class params>
+template <typename InputTorus, typename Torus, class params>
__host__ uint64_t scratch_programmable_bootstrap_cg(
    cudaStream_t stream, uint32_t gpu_index,
    pbs_buffer<Torus, CLASSICAL> **buffer, uint32_t lwe_dimension,
@@ -210,18 +211,18 @@ __host__ uint64_t scratch_programmable_bootstrap_cg(
  auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
  if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) {
    check_cuda_error(cudaFuncSetAttribute(
-        device_programmable_bootstrap_cg<Torus, params, PARTIALSM>,
+        device_programmable_bootstrap_cg<InputTorus, Torus, params, PARTIALSM>,
        cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
    cudaFuncSetCacheConfig(
-        device_programmable_bootstrap_cg<Torus, params, PARTIALSM>,
+        device_programmable_bootstrap_cg<InputTorus, Torus, params, PARTIALSM>,
        cudaFuncCachePreferShared);
    check_cuda_error(cudaGetLastError());
  } else if (max_shared_memory >= partial_sm) {
    check_cuda_error(cudaFuncSetAttribute(
-        device_programmable_bootstrap_cg<Torus, params, FULLSM>,
+        device_programmable_bootstrap_cg<InputTorus, Torus, params, FULLSM>,
        cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm));
    cudaFuncSetCacheConfig(
-        device_programmable_bootstrap_cg<Torus, params, FULLSM>,
+        device_programmable_bootstrap_cg<InputTorus, Torus, params, FULLSM>,
        cudaFuncCachePreferShared);
    check_cuda_error(cudaGetLastError());
  }
@@ -237,11 +238,11 @@ __host__ uint64_t scratch_programmable_bootstrap_cg(
/*
 * Host wrapper
 */
-template <typename Torus, class params>
+template <typename InputTorus, typename Torus, class params>
__host__ void host_programmable_bootstrap_cg(
    cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
    Torus const *lwe_output_indexes, Torus const *lut_vector,
-    Torus const *lut_vector_indexes, Torus const *lwe_array_in,
+    Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
    Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
    pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t glwe_dimension,
    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
@@ -293,26 +294,29 @@ __host__ void host_programmable_bootstrap_cg(
  if (max_shared_memory < partial_sm) {
    kernel_args[13] = &full_dm;
    check_cuda_error(cudaLaunchCooperativeKernel(
-        (void *)device_programmable_bootstrap_cg<Torus, params, NOSM>, grid,
-        thds, (void **)kernel_args, 0,
stream)); + (void *) + device_programmable_bootstrap_cg, + grid, thds, (void **)kernel_args, 0, stream)); } else if (max_shared_memory < full_sm) { kernel_args[13] = &partial_dm; check_cuda_error(cudaLaunchCooperativeKernel( - (void *)device_programmable_bootstrap_cg, + (void *)device_programmable_bootstrap_cg, grid, thds, (void **)kernel_args, partial_sm, stream)); } else { int no_dm = 0; kernel_args[13] = &no_dm; check_cuda_error(cudaLaunchCooperativeKernel( - (void *)device_programmable_bootstrap_cg, grid, - thds, (void **)kernel_args, full_sm, stream)); + (void *) + device_programmable_bootstrap_cg, + grid, thds, (void **)kernel_args, full_sm, stream)); } check_cuda_error(cudaGetLastError()); } // Verify if the grid size satisfies the cooperative group constraints -template +template __host__ bool verify_cuda_programmable_bootstrap_cg_grid_size( int glwe_dimension, int level_count, int num_samples, uint32_t max_shared_memory) { @@ -338,30 +342,34 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_grid_size( if (max_shared_memory < partial_sm) { cudaOccupancyMaxActiveBlocksPerMultiprocessor( &max_active_blocks_per_sm, - (void *)device_programmable_bootstrap_cg, thds, 0); + (void *) + device_programmable_bootstrap_cg, + thds, 0); } else if (max_shared_memory < full_sm) { check_cuda_error(cudaFuncSetAttribute( - device_programmable_bootstrap_cg, + device_programmable_bootstrap_cg, cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm)); cudaFuncSetCacheConfig( - device_programmable_bootstrap_cg, + device_programmable_bootstrap_cg, cudaFuncCachePreferShared); cudaOccupancyMaxActiveBlocksPerMultiprocessor( &max_active_blocks_per_sm, - (void *)device_programmable_bootstrap_cg, + (void *)device_programmable_bootstrap_cg, thds, partial_sm); check_cuda_error(cudaGetLastError()); } else { check_cuda_error(cudaFuncSetAttribute( - device_programmable_bootstrap_cg, + device_programmable_bootstrap_cg, cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm)); cudaFuncSetCacheConfig( - device_programmable_bootstrap_cg, + device_programmable_bootstrap_cg, cudaFuncCachePreferShared); cudaOccupancyMaxActiveBlocksPerMultiprocessor( &max_active_blocks_per_sm, - (void *)device_programmable_bootstrap_cg, thds, - full_sm); + (void *) + device_programmable_bootstrap_cg, + thds, full_sm); check_cuda_error(cudaGetLastError()); } @@ -372,39 +380,39 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_grid_size( } // Verify if the grid size satisfies the cooperative group constraints -template +template __host__ bool supports_cooperative_groups_on_programmable_bootstrap( int glwe_dimension, int polynomial_size, int level_count, int num_samples, uint32_t max_shared_memory) { switch (polynomial_size) { case 256: return verify_cuda_programmable_bootstrap_cg_grid_size< - Torus, AmortizedDegree<256>>(glwe_dimension, level_count, num_samples, - max_shared_memory); + InputTorus, Torus, AmortizedDegree<256>>( + glwe_dimension, level_count, num_samples, max_shared_memory); case 512: return verify_cuda_programmable_bootstrap_cg_grid_size< - Torus, AmortizedDegree<512>>(glwe_dimension, level_count, num_samples, - max_shared_memory); + InputTorus, Torus, AmortizedDegree<512>>( + glwe_dimension, level_count, num_samples, max_shared_memory); case 1024: return verify_cuda_programmable_bootstrap_cg_grid_size< - Torus, AmortizedDegree<1024>>(glwe_dimension, level_count, num_samples, - max_shared_memory); + InputTorus, Torus, AmortizedDegree<1024>>( + glwe_dimension, level_count, num_samples, max_shared_memory); case 2048: 
return verify_cuda_programmable_bootstrap_cg_grid_size< - Torus, AmortizedDegree<2048>>(glwe_dimension, level_count, num_samples, - max_shared_memory); + InputTorus, Torus, AmortizedDegree<2048>>( + glwe_dimension, level_count, num_samples, max_shared_memory); case 4096: return verify_cuda_programmable_bootstrap_cg_grid_size< - Torus, AmortizedDegree<4096>>(glwe_dimension, level_count, num_samples, - max_shared_memory); + InputTorus, Torus, AmortizedDegree<4096>>( + glwe_dimension, level_count, num_samples, max_shared_memory); case 8192: return verify_cuda_programmable_bootstrap_cg_grid_size< - Torus, AmortizedDegree<8192>>(glwe_dimension, level_count, num_samples, - max_shared_memory); + InputTorus, Torus, AmortizedDegree<8192>>( + glwe_dimension, level_count, num_samples, max_shared_memory); case 16384: return verify_cuda_programmable_bootstrap_cg_grid_size< - Torus, AmortizedDegree<16384>>(glwe_dimension, level_count, num_samples, - max_shared_memory); + InputTorus, Torus, AmortizedDegree<16384>>( + glwe_dimension, level_count, num_samples, max_shared_memory); default: PANIC("Cuda error (classical PBS): unsupported polynomial size. " "Supported N's are powers of two" diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh index db0c27959..bbdacfce6 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh @@ -18,13 +18,14 @@ #include "types/complex/operations.cuh" #include -template +template __global__ void __launch_bounds__(params::degree / params::opt) device_multi_bit_programmable_bootstrap_cg_accumulate( Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes, const Torus *__restrict__ lut_vector, const Torus *__restrict__ lut_vector_indexes, - const Torus *__restrict__ lwe_array_in, + const InputTorus *__restrict__ lwe_array_in, const Torus *__restrict__ lwe_input_indexes, const double2 *__restrict__ keybundle_array, double2 *join_buffer, Torus *global_accumulator, uint32_t lwe_dimension, @@ -60,7 +61,7 @@ __global__ void __launch_bounds__(params::degree / params::opt) // The third dimension of the block is used to determine on which ciphertext // this block is operating, in the case of batch bootstraps - const Torus *block_lwe_array_in = + const InputTorus *block_lwe_array_in = &lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)]; const Torus *block_lut_vector = @@ -80,7 +81,7 @@ __global__ void __launch_bounds__(params::degree / params::opt) if (lwe_offset == 0) { // Put "b" in [0, 2N[ - Torus b_hat = 0; + InputTorus b_hat = 0; modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, params::log2_degree + 1); @@ -208,7 +209,7 @@ uint64_t get_buffer_size_cg_multibit_programmable_bootstrap( return buffer_size + buffer_size % sizeof(double2); } -template +template __host__ uint64_t scratch_cg_multi_bit_programmable_bootstrap( cudaStream_t stream, uint32_t gpu_index, pbs_buffer **buffer, uint32_t glwe_dimension, @@ -230,57 +231,59 @@ __host__ uint64_t scratch_cg_multi_bit_programmable_bootstrap( auto max_shared_memory = cuda_get_max_shared_memory(gpu_index); if (max_shared_memory < full_sm_keybundle) { check_cuda_error(cudaFuncSetAttribute( - device_multi_bit_programmable_bootstrap_keybundle, + device_multi_bit_programmable_bootstrap_keybundle, cudaFuncAttributeMaxDynamicSharedMemorySize, 0)); cudaFuncSetCacheConfig( - 
device_multi_bit_programmable_bootstrap_keybundle, + device_multi_bit_programmable_bootstrap_keybundle, cudaFuncCachePreferShared); check_cuda_error(cudaGetLastError()); } else { check_cuda_error(cudaFuncSetAttribute( - device_multi_bit_programmable_bootstrap_keybundle, + device_multi_bit_programmable_bootstrap_keybundle, cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_keybundle)); cudaFuncSetCacheConfig( - device_multi_bit_programmable_bootstrap_keybundle, + device_multi_bit_programmable_bootstrap_keybundle, cudaFuncCachePreferShared); check_cuda_error(cudaGetLastError()); } if (max_shared_memory < partial_sm_cg_accumulate) { check_cuda_error(cudaFuncSetAttribute( - device_multi_bit_programmable_bootstrap_cg_accumulate, + device_multi_bit_programmable_bootstrap_cg_accumulate, cudaFuncAttributeMaxDynamicSharedMemorySize, 0)); cudaFuncSetCacheConfig( - device_multi_bit_programmable_bootstrap_cg_accumulate, + device_multi_bit_programmable_bootstrap_cg_accumulate, cudaFuncCachePreferShared); check_cuda_error(cudaGetLastError()); } else if (max_shared_memory < full_sm_cg_accumulate) { check_cuda_error(cudaFuncSetAttribute( - device_multi_bit_programmable_bootstrap_cg_accumulate, + device_multi_bit_programmable_bootstrap_cg_accumulate< + InputTorus, Torus, params, PARTIALSM>, cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm_cg_accumulate)); cudaFuncSetCacheConfig( - device_multi_bit_programmable_bootstrap_cg_accumulate, + device_multi_bit_programmable_bootstrap_cg_accumulate< + InputTorus, Torus, params, PARTIALSM>, cudaFuncCachePreferShared); check_cuda_error(cudaGetLastError()); } else { check_cuda_error(cudaFuncSetAttribute( - device_multi_bit_programmable_bootstrap_cg_accumulate, + device_multi_bit_programmable_bootstrap_cg_accumulate, cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_cg_accumulate)); cudaFuncSetCacheConfig( - device_multi_bit_programmable_bootstrap_cg_accumulate, + device_multi_bit_programmable_bootstrap_cg_accumulate, cudaFuncCachePreferShared); check_cuda_error(cudaGetLastError()); } - auto lwe_chunk_size = get_lwe_chunk_size( + auto lwe_chunk_size = get_lwe_chunk_size( gpu_index, input_lwe_ciphertext_count, polynomial_size, glwe_dimension, level_count, full_sm_keybundle); uint64_t size_tracker = 0; @@ -291,10 +294,10 @@ __host__ uint64_t scratch_cg_multi_bit_programmable_bootstrap( return size_tracker; } -template +template __host__ void execute_cg_external_product_loop( cudaStream_t stream, uint32_t gpu_index, Torus const *lut_vector, - Torus const *lut_vector_indexes, Torus const *lwe_array_in, + Torus const *lut_vector_indexes, InputTorus const *lwe_array_in, Torus const *lwe_input_indexes, Torus *lwe_array_out, Torus const *lwe_output_indexes, pbs_buffer *buffer, uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension, @@ -359,28 +362,28 @@ __host__ void execute_cg_external_product_loop( kernel_args[19] = &full_dm; check_cuda_error(cudaLaunchCooperativeKernel( (void *)device_multi_bit_programmable_bootstrap_cg_accumulate< - Torus, params, NOSM>, + InputTorus, Torus, params, NOSM>, grid_accumulate, thds, (void **)kernel_args, 0, stream)); } else if (max_shared_memory < full_dm) { kernel_args[19] = &partial_dm; check_cuda_error(cudaLaunchCooperativeKernel( (void *)device_multi_bit_programmable_bootstrap_cg_accumulate< - Torus, params, PARTIALSM>, + InputTorus, Torus, params, PARTIALSM>, grid_accumulate, thds, (void **)kernel_args, partial_sm, stream)); } else { kernel_args[19] = &no_dm; check_cuda_error(cudaLaunchCooperativeKernel( (void 
*)device_multi_bit_programmable_bootstrap_cg_accumulate< - Torus, params, FULLSM>, + InputTorus, Torus, params, FULLSM>, grid_accumulate, thds, (void **)kernel_args, full_sm, stream)); } } -template +template __host__ void host_cg_multi_bit_programmable_bootstrap( cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out, Torus const *lwe_output_indexes, Torus const *lut_vector, - Torus const *lut_vector_indexes, Torus const *lwe_array_in, + Torus const *lut_vector_indexes, InputTorus const *lwe_array_in, Torus const *lwe_input_indexes, uint64_t const *bootstrapping_key, pbs_buffer *buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, @@ -393,13 +396,13 @@ __host__ void host_cg_multi_bit_programmable_bootstrap( lwe_offset += lwe_chunk_size) { // Compute a keybundle - execute_compute_keybundle( + execute_compute_keybundle( stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, level_count, lwe_offset); // Accumulate - execute_cg_external_product_loop( + execute_cg_external_product_loop( stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size, @@ -409,7 +412,7 @@ __host__ void host_cg_multi_bit_programmable_bootstrap( } // Verify if the grid size satisfies the cooperative group constraints -template +template __host__ bool verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size( int glwe_dimension, int level_count, int num_samples, uint32_t max_shared_memory) { @@ -436,36 +439,36 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size( cudaOccupancyMaxActiveBlocksPerMultiprocessor( &max_active_blocks_per_sm, (void *)device_multi_bit_programmable_bootstrap_cg_accumulate< - Torus, params, NOSM>, + InputTorus, Torus, params, NOSM>, thds, 0); } else if (max_shared_memory < full_sm_cg_accumulate) { check_cuda_error(cudaFuncSetAttribute( - device_multi_bit_programmable_bootstrap_cg_accumulate, + device_multi_bit_programmable_bootstrap_cg_accumulate< + InputTorus, Torus, params, PARTIALSM>, cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm_cg_accumulate)); cudaFuncSetCacheConfig( - device_multi_bit_programmable_bootstrap_cg_accumulate, + device_multi_bit_programmable_bootstrap_cg_accumulate< + InputTorus, Torus, params, PARTIALSM>, cudaFuncCachePreferShared); cudaOccupancyMaxActiveBlocksPerMultiprocessor( &max_active_blocks_per_sm, (void *)device_multi_bit_programmable_bootstrap_cg_accumulate< - Torus, params, PARTIALSM>, + InputTorus, Torus, params, PARTIALSM>, thds, partial_sm_cg_accumulate); check_cuda_error(cudaGetLastError()); } else { check_cuda_error(cudaFuncSetAttribute( - device_multi_bit_programmable_bootstrap_cg_accumulate, + device_multi_bit_programmable_bootstrap_cg_accumulate, cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_cg_accumulate)); cudaFuncSetCacheConfig( - device_multi_bit_programmable_bootstrap_cg_accumulate, + device_multi_bit_programmable_bootstrap_cg_accumulate, cudaFuncCachePreferShared); cudaOccupancyMaxActiveBlocksPerMultiprocessor( &max_active_blocks_per_sm, (void *)device_multi_bit_programmable_bootstrap_cg_accumulate< - Torus, params, FULLSM>, + InputTorus, Torus, params, FULLSM>, thds, full_sm_cg_accumulate); check_cuda_error(cudaGetLastError()); } @@ -478,39 +481,39 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size( // 
Verify if the grid size for the multi-bit kernel satisfies the cooperative // group constraints -template +template __host__ bool supports_cooperative_groups_on_multibit_programmable_bootstrap( int glwe_dimension, int polynomial_size, int level_count, int num_samples, uint32_t max_shared_memory) { switch (polynomial_size) { case 256: return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size< - Torus, AmortizedDegree<256>>(glwe_dimension, level_count, num_samples, - max_shared_memory); + InputTorus, Torus, AmortizedDegree<256>>( + glwe_dimension, level_count, num_samples, max_shared_memory); case 512: return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size< - Torus, AmortizedDegree<512>>(glwe_dimension, level_count, num_samples, - max_shared_memory); + InputTorus, Torus, AmortizedDegree<512>>( + glwe_dimension, level_count, num_samples, max_shared_memory); case 1024: return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size< - Torus, AmortizedDegree<1024>>(glwe_dimension, level_count, num_samples, - max_shared_memory); + InputTorus, Torus, AmortizedDegree<1024>>( + glwe_dimension, level_count, num_samples, max_shared_memory); case 2048: return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size< - Torus, AmortizedDegree<2048>>(glwe_dimension, level_count, num_samples, - max_shared_memory); + InputTorus, Torus, AmortizedDegree<2048>>( + glwe_dimension, level_count, num_samples, max_shared_memory); case 4096: return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size< - Torus, AmortizedDegree<4096>>(glwe_dimension, level_count, num_samples, - max_shared_memory); + InputTorus, Torus, AmortizedDegree<4096>>( + glwe_dimension, level_count, num_samples, max_shared_memory); case 8192: return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size< - Torus, AmortizedDegree<8192>>(glwe_dimension, level_count, num_samples, - max_shared_memory); + InputTorus, Torus, AmortizedDegree<8192>>( + glwe_dimension, level_count, num_samples, max_shared_memory); case 16384: return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size< - Torus, AmortizedDegree<16384>>(glwe_dimension, level_count, num_samples, - max_shared_memory); + InputTorus, Torus, AmortizedDegree<16384>>( + glwe_dimension, level_count, num_samples, max_shared_memory); default: PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. 
Supported " "N's are powers of two" diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu index b95260f33..5ac3913f5 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu @@ -7,18 +7,19 @@ #include -template +template bool has_support_to_cuda_programmable_bootstrap_cg(uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory) { - return supports_cooperative_groups_on_programmable_bootstrap( + return supports_cooperative_groups_on_programmable_bootstrap( glwe_dimension, polynomial_size, level_count, num_samples, max_shared_memory); } -template +template bool has_support_to_cuda_programmable_bootstrap_tbc( uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t max_shared_memory) { @@ -28,38 +29,39 @@ bool has_support_to_cuda_programmable_bootstrap_tbc( switch (polynomial_size) { case 256: return supports_thread_block_clusters_on_classic_programmable_bootstrap< - Torus, AmortizedDegree<256>>(num_samples, glwe_dimension, - polynomial_size, level_count, - max_shared_memory); + InputTorus, Torus, AmortizedDegree<256>>(num_samples, glwe_dimension, + polynomial_size, level_count, + max_shared_memory); case 512: return supports_thread_block_clusters_on_classic_programmable_bootstrap< - Torus, AmortizedDegree<512>>(num_samples, glwe_dimension, - polynomial_size, level_count, - max_shared_memory); + InputTorus, Torus, AmortizedDegree<512>>(num_samples, glwe_dimension, + polynomial_size, level_count, + max_shared_memory); case 1024: return supports_thread_block_clusters_on_classic_programmable_bootstrap< - Torus, AmortizedDegree<1024>>(num_samples, glwe_dimension, - polynomial_size, level_count, - max_shared_memory); + InputTorus, Torus, AmortizedDegree<1024>>(num_samples, glwe_dimension, + polynomial_size, level_count, + max_shared_memory); case 2048: return supports_thread_block_clusters_on_classic_programmable_bootstrap< - Torus, Degree<2048>>(num_samples, glwe_dimension, polynomial_size, - level_count, max_shared_memory); + InputTorus, Torus, Degree<2048>>(num_samples, glwe_dimension, + polynomial_size, level_count, + max_shared_memory); case 4096: return supports_thread_block_clusters_on_classic_programmable_bootstrap< - Torus, AmortizedDegree<4096>>(num_samples, glwe_dimension, - polynomial_size, level_count, - max_shared_memory); + InputTorus, Torus, AmortizedDegree<4096>>(num_samples, glwe_dimension, + polynomial_size, level_count, + max_shared_memory); case 8192: return supports_thread_block_clusters_on_classic_programmable_bootstrap< - Torus, AmortizedDegree<8192>>(num_samples, glwe_dimension, - polynomial_size, level_count, - max_shared_memory); + InputTorus, Torus, AmortizedDegree<8192>>(num_samples, glwe_dimension, + polynomial_size, level_count, + max_shared_memory); case 16384: return supports_thread_block_clusters_on_classic_programmable_bootstrap< - Torus, AmortizedDegree<16384>>(num_samples, glwe_dimension, - polynomial_size, level_count, - max_shared_memory); + InputTorus, Torus, AmortizedDegree<16384>>(num_samples, glwe_dimension, + polynomial_size, level_count, + max_shared_memory); default: PANIC("Cuda error (classical PBS): unsupported polynomial size. 
Supported " "N's are powers of two" @@ -71,7 +73,7 @@ bool has_support_to_cuda_programmable_bootstrap_tbc( } #if (CUDA_ARCH >= 900) -template +template uint64_t scratch_cuda_programmable_bootstrap_tbc( void *stream, uint32_t gpu_index, pbs_buffer **pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, @@ -80,37 +82,43 @@ uint64_t scratch_cuda_programmable_bootstrap_tbc( switch (polynomial_size) { case 256: - return scratch_programmable_bootstrap_tbc>( + return scratch_programmable_bootstrap_tbc>( static_cast(stream), gpu_index, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type); case 512: - return scratch_programmable_bootstrap_tbc>( + return scratch_programmable_bootstrap_tbc>( static_cast(stream), gpu_index, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type); case 1024: - return scratch_programmable_bootstrap_tbc>( + return scratch_programmable_bootstrap_tbc>( static_cast(stream), gpu_index, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type); case 2048: - return scratch_programmable_bootstrap_tbc>( + return scratch_programmable_bootstrap_tbc>( static_cast(stream), gpu_index, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type); case 4096: - return scratch_programmable_bootstrap_tbc>( + return scratch_programmable_bootstrap_tbc>( static_cast(stream), gpu_index, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type); case 8192: - return scratch_programmable_bootstrap_tbc>( + return scratch_programmable_bootstrap_tbc>( static_cast(stream), gpu_index, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type); case 16384: - return scratch_programmable_bootstrap_tbc>( + return scratch_programmable_bootstrap_tbc>( static_cast(stream), gpu_index, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type); @@ -121,11 +129,11 @@ uint64_t scratch_cuda_programmable_bootstrap_tbc( } } -template +template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, Torus *lwe_array_out, Torus const *lwe_output_indexes, Torus const *lut_vector, - Torus const *lut_vector_indexes, Torus const *lwe_array_in, + Torus const *lut_vector_indexes, InputTorus const *lwe_array_in, Torus const *lwe_input_indexes, double2 const *bootstrapping_key, pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, @@ -134,7 +142,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( switch (polynomial_size) { case 256: - host_programmable_bootstrap_tbc>( + host_programmable_bootstrap_tbc>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, @@ -142,7 +150,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( num_many_lut, lut_stride); break; case 512: - host_programmable_bootstrap_tbc>( + host_programmable_bootstrap_tbc>( static_cast(stream), 
gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, @@ -150,7 +158,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( num_many_lut, lut_stride); break; case 1024: - host_programmable_bootstrap_tbc>( + host_programmable_bootstrap_tbc>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, @@ -158,7 +166,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( num_many_lut, lut_stride); break; case 2048: - host_programmable_bootstrap_tbc>( + host_programmable_bootstrap_tbc>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, @@ -166,7 +174,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( num_many_lut, lut_stride); break; case 4096: - host_programmable_bootstrap_tbc>( + host_programmable_bootstrap_tbc>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, @@ -174,7 +182,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( num_many_lut, lut_stride); break; case 8192: - host_programmable_bootstrap_tbc>( + host_programmable_bootstrap_tbc>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, @@ -182,7 +190,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( num_many_lut, lut_stride); break; case 16384: - host_programmable_bootstrap_tbc>( + host_programmable_bootstrap_tbc>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, @@ -197,7 +205,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( } #endif -template +template uint64_t scratch_cuda_programmable_bootstrap_cg( void *stream, uint32_t gpu_index, pbs_buffer **pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, @@ -206,37 +214,44 @@ uint64_t scratch_cuda_programmable_bootstrap_cg( switch (polynomial_size) { case 256: - return scratch_programmable_bootstrap_cg>( + return scratch_programmable_bootstrap_cg>( static_cast(stream), gpu_index, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type); case 512: - return scratch_programmable_bootstrap_cg>( + return scratch_programmable_bootstrap_cg>( static_cast(stream), gpu_index, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type); case 1024: - return scratch_programmable_bootstrap_cg>( + return scratch_programmable_bootstrap_cg>( static_cast(stream), gpu_index, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type); case 2048: - return scratch_programmable_bootstrap_cg>( + return scratch_programmable_bootstrap_cg>( static_cast(stream), gpu_index, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type); case 4096: - return 
scratch_programmable_bootstrap_cg<Torus, AmortizedDegree<4096>>(
+    return scratch_programmable_bootstrap_cg<InputTorus, Torus,
+                                             AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
        lwe_dimension, glwe_dimension, polynomial_size, level_count,
        input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
  case 8192:
-    return scratch_programmable_bootstrap_cg<Torus, AmortizedDegree<8192>>(
+    return scratch_programmable_bootstrap_cg<InputTorus, Torus,
+                                             AmortizedDegree<8192>>(
        static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
        lwe_dimension, glwe_dimension, polynomial_size, level_count,
        input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
  case 16384:
-    return scratch_programmable_bootstrap_cg<Torus, AmortizedDegree<16384>>(
+    return scratch_programmable_bootstrap_cg<InputTorus, Torus,
+                                             AmortizedDegree<16384>>(
        static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
        lwe_dimension, glwe_dimension, polynomial_size, level_count,
        input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
@@ -247,7 +262,7 @@ uint64_t scratch_cuda_programmable_bootstrap_cg(
  }
}
 
-template <typename Torus>
+template <typename InputTorus, typename Torus>
uint64_t scratch_cuda_programmable_bootstrap(
    void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
@@ -256,37 +271,44 @@ uint64_t scratch_cuda_programmable_bootstrap(
 
  switch (polynomial_size) {
  case 256:
-    return scratch_programmable_bootstrap<Torus, AmortizedDegree<256>>(
+    return scratch_programmable_bootstrap<InputTorus, Torus,
+                                          AmortizedDegree<256>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
        glwe_dimension, polynomial_size, level_count,
        input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
  case 512:
-    return scratch_programmable_bootstrap<Torus, AmortizedDegree<512>>(
+    return scratch_programmable_bootstrap<InputTorus, Torus,
+                                          AmortizedDegree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
        glwe_dimension, polynomial_size, level_count,
        input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
  case 1024:
-    return scratch_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
+    return scratch_programmable_bootstrap<InputTorus, Torus,
+                                          AmortizedDegree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
        glwe_dimension, polynomial_size, level_count,
        input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
  case 2048:
-    return scratch_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
+    return scratch_programmable_bootstrap<InputTorus, Torus,
+                                          AmortizedDegree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
        glwe_dimension, polynomial_size, level_count,
        input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
  case 4096:
-    return scratch_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
+    return scratch_programmable_bootstrap<InputTorus, Torus,
+                                          AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
        glwe_dimension, polynomial_size, level_count,
        input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
  case 8192:
-    return scratch_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
+    return scratch_programmable_bootstrap<InputTorus, Torus,
+                                          AmortizedDegree<8192>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
        glwe_dimension, polynomial_size, level_count,
        input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
  case 16384:
-    return scratch_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
+    return scratch_programmable_bootstrap<InputTorus, Torus,
+                                          AmortizedDegree<16384>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
        glwe_dimension, polynomial_size, level_count,
        input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
@@ -299,11 +321,10 @@ uint64_t scratch_cuda_programmable_bootstrap(
 
/*
 * This scratch function allocates the necessary amount of data on the GPU for
- * the classical PBS on 32 bits inputs, into `buffer`. It also
- * configures SM options on the GPU in case FULLSM or PARTIALSM mode is going to
- * be used.
+ * the PBS with 32-bit inputs and 64-bit outputs, into `buffer`. It also
+ * configures SM options on the GPU in case FULLSM or PARTIALSM mode is used.
*/ -uint64_t scratch_cuda_programmable_bootstrap_32( +uint64_t scratch_cuda_programmable_bootstrap_32_64( void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory, @@ -311,25 +332,25 @@ uint64_t scratch_cuda_programmable_bootstrap_32( auto max_shared_memory = cuda_get_max_shared_memory(gpu_index); #if (CUDA_ARCH >= 900) - if (has_support_to_cuda_programmable_bootstrap_tbc( + if (has_support_to_cuda_programmable_bootstrap_tbc( input_lwe_ciphertext_count, glwe_dimension, polynomial_size, level_count, max_shared_memory)) - return scratch_cuda_programmable_bootstrap_tbc( - stream, gpu_index, (pbs_buffer **)buffer, + return scratch_cuda_programmable_bootstrap_tbc( + stream, gpu_index, (pbs_buffer **)buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type); else #endif - if (has_support_to_cuda_programmable_bootstrap_cg( + if (has_support_to_cuda_programmable_bootstrap_cg( glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, max_shared_memory)) - return scratch_cuda_programmable_bootstrap_cg( - stream, gpu_index, (pbs_buffer **)buffer, + return scratch_cuda_programmable_bootstrap_cg( + stream, gpu_index, (pbs_buffer **)buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type); else - return scratch_cuda_programmable_bootstrap( - stream, gpu_index, (pbs_buffer **)buffer, + return scratch_cuda_programmable_bootstrap( + stream, gpu_index, (pbs_buffer **)buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type); } @@ -347,34 +368,34 @@ uint64_t scratch_cuda_programmable_bootstrap_64( auto max_shared_memory = cuda_get_max_shared_memory(gpu_index); #if (CUDA_ARCH >= 900) - if (has_support_to_cuda_programmable_bootstrap_tbc( + if (has_support_to_cuda_programmable_bootstrap_tbc( input_lwe_ciphertext_count, glwe_dimension, polynomial_size, level_count, max_shared_memory)) - return scratch_cuda_programmable_bootstrap_tbc( + return scratch_cuda_programmable_bootstrap_tbc( stream, gpu_index, (pbs_buffer **)buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type); else #endif - if (has_support_to_cuda_programmable_bootstrap_cg( + if (has_support_to_cuda_programmable_bootstrap_cg( glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, max_shared_memory)) - return scratch_cuda_programmable_bootstrap_cg( + return scratch_cuda_programmable_bootstrap_cg( stream, gpu_index, (pbs_buffer **)buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type); else - return scratch_cuda_programmable_bootstrap( + return scratch_cuda_programmable_bootstrap( stream, gpu_index, (pbs_buffer **)buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type); } -template +template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, Torus *lwe_array_out, Torus const *lwe_output_indexes, Torus const *lut_vector, - Torus const *lut_vector_indexes, Torus const *lwe_array_in, + Torus const *lut_vector_indexes, InputTorus const 
*lwe_array_in, Torus const *lwe_input_indexes, double2 const *bootstrapping_key, pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, @@ -383,7 +404,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( switch (polynomial_size) { case 256: - host_programmable_bootstrap_cg>( + host_programmable_bootstrap_cg>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, @@ -391,7 +412,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( num_many_lut, lut_stride); break; case 512: - host_programmable_bootstrap_cg>( + host_programmable_bootstrap_cg>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, @@ -399,7 +420,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( num_many_lut, lut_stride); break; case 1024: - host_programmable_bootstrap_cg>( + host_programmable_bootstrap_cg>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, @@ -407,7 +428,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( num_many_lut, lut_stride); break; case 2048: - host_programmable_bootstrap_cg>( + host_programmable_bootstrap_cg>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, @@ -415,7 +436,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( num_many_lut, lut_stride); break; case 4096: - host_programmable_bootstrap_cg>( + host_programmable_bootstrap_cg>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, @@ -423,7 +444,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( num_many_lut, lut_stride); break; case 8192: - host_programmable_bootstrap_cg>( + host_programmable_bootstrap_cg>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, @@ -431,7 +452,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( num_many_lut, lut_stride); break; case 16384: - host_programmable_bootstrap_cg>( + host_programmable_bootstrap_cg>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, @@ -445,11 +466,11 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( } } -template +template void cuda_programmable_bootstrap_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, Torus *lwe_array_out, Torus const *lwe_output_indexes, Torus const *lut_vector, - Torus const *lut_vector_indexes, Torus const *lwe_array_in, + Torus const *lut_vector_indexes, InputTorus const *lwe_array_in, Torus const *lwe_input_indexes, double2 const *bootstrapping_key, pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, @@ -458,7 +479,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector( switch (polynomial_size) { case 256: - host_programmable_bootstrap>( + host_programmable_bootstrap>( static_cast(stream), 
gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
@@ -466,7 +487,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
        num_many_lut, lut_stride);
    break;
  case 512:
-    host_programmable_bootstrap<Torus, AmortizedDegree<512>>(
+    host_programmable_bootstrap<InputTorus, Torus, AmortizedDegree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
@@ -474,7 +495,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
        num_many_lut, lut_stride);
    break;
  case 1024:
-    host_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
+    host_programmable_bootstrap<InputTorus, Torus, AmortizedDegree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
@@ -482,7 +503,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
        num_many_lut, lut_stride);
    break;
  case 2048:
-    host_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
+    host_programmable_bootstrap<InputTorus, Torus, AmortizedDegree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
@@ -490,7 +511,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
        num_many_lut, lut_stride);
    break;
  case 4096:
-    host_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
+    host_programmable_bootstrap<InputTorus, Torus, AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
@@ -498,7 +519,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
        num_many_lut, lut_stride);
    break;
  case 8192:
-    host_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
+    host_programmable_bootstrap<InputTorus, Torus, AmortizedDegree<8192>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
@@ -506,7 +527,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
        num_many_lut, lut_stride);
    break;
  case 16384:
-    host_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
+    host_programmable_bootstrap<InputTorus, Torus, AmortizedDegree<16384>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
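Before the wrapper itself, it helps to pin down what the 32/64 suffix pair means for callers: only the input ciphertext list is on the 32-bit torus; LUTs, indexes, outputs and the accumulator the kernels rotate all stay on the 64-bit torus. A minimal host-side sizing sketch under the standard LWE layout assumption (one mask-plus-body row of lwe_dimension + 1 words per sample; names are illustrative):

    // u32 inputs: num_samples rows of (lwe_dimension + 1) u32 words.
    size_t in_bytes =
        (size_t)num_samples * (lwe_dimension + 1) * sizeof(uint32_t);
    // u64 outputs: num_samples rows of (glwe_dimension * polynomial_size + 1)
    // u64 words, i.e. the big-LWE dimension produced by bootstrapping.
    size_t out_bytes = (size_t)num_samples *
                       (glwe_dimension * polynomial_size + 1) *
                       sizeof(uint64_t);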
@@ -522,7 +543,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
 
/* Perform bootstrapping on a batch of input u32 LWE ciphertexts.
 */
-void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
+void cuda_programmable_bootstrap_lwe_ciphertext_vector_32_64(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const *lwe_output_indexes, void const *lut_vector,
    void const *lut_vector_indexes, void const *lwe_array_in,
@@ -530,23 +551,24 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
    int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
    uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride) {
+  if (base_log > 64)
+    PANIC("Cuda error (classical PBS): base log should be <= 64")
 
-  if (base_log > 32)
-    PANIC("Cuda error (classical PBS): base log should be <= 32")
+  pbs_buffer<uint64_t, CLASSICAL> *buffer =
+      (pbs_buffer<uint64_t, CLASSICAL> *)mem_ptr;
 
-  pbs_buffer<uint32_t, CLASSICAL> *buffer =
-      (pbs_buffer<uint32_t, CLASSICAL> *)mem_ptr;
+  check_cuda_error(cudaGetLastError());
 
  switch (buffer->pbs_variant) {
-  case TBC:
-#if CUDA_ARCH >= 900
-    cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint32_t>(
-        stream, gpu_index, static_cast<uint32_t *>(lwe_array_out),
-        static_cast<const uint32_t *>(lwe_output_indexes),
-        static_cast<const uint32_t *>(lut_vector),
-        static_cast<const uint32_t *>(lut_vector_indexes),
+  case PBS_VARIANT::TBC:
+#if (CUDA_ARCH >= 900)
+    cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint32_t, uint64_t>(
+        stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
+        static_cast<const uint64_t *>(lwe_output_indexes),
+        static_cast<const uint64_t *>(lut_vector),
+        static_cast<const uint64_t *>(lut_vector_indexes),
        static_cast<const uint32_t *>(lwe_array_in),
-        static_cast<const uint32_t *>(lwe_input_indexes),
+        static_cast<const uint64_t *>(lwe_input_indexes),
        static_cast<const double2 *>(bootstrapping_key), buffer, lwe_dimension,
        glwe_dimension, polynomial_size, base_log, level_count, num_samples,
        num_many_lut, lut_stride);
@@ -554,26 +576,26 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
#else
    PANIC("Cuda error (PBS): TBC pbs is not supported.")
#endif
-  case CG:
-    cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint32_t>(
-        stream, gpu_index, static_cast<uint32_t *>(lwe_array_out),
-        static_cast<const uint32_t *>(lwe_output_indexes),
-        static_cast<const uint32_t *>(lut_vector),
-        static_cast<const uint32_t *>(lut_vector_indexes),
+  case PBS_VARIANT::CG:
+    cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint32_t, uint64_t>(
+        stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
+        static_cast<const uint64_t *>(lwe_output_indexes),
+        static_cast<const uint64_t *>(lut_vector),
+        static_cast<const uint64_t *>(lut_vector_indexes),
        static_cast<const uint32_t *>(lwe_array_in),
-        static_cast<const uint32_t *>(lwe_input_indexes),
+        static_cast<const uint64_t *>(lwe_input_indexes),
        static_cast<const double2 *>(bootstrapping_key), buffer, lwe_dimension,
        glwe_dimension, polynomial_size, base_log, level_count, num_samples,
        num_many_lut, lut_stride);
    break;
-  case DEFAULT:
-    cuda_programmable_bootstrap_lwe_ciphertext_vector<uint32_t>(
-        stream, gpu_index, static_cast<uint32_t *>(lwe_array_out),
-        static_cast<const uint32_t *>(lwe_output_indexes),
-        static_cast<const uint32_t *>(lut_vector),
-        static_cast<const uint32_t *>(lut_vector_indexes),
+  case PBS_VARIANT::DEFAULT:
+    cuda_programmable_bootstrap_lwe_ciphertext_vector<uint32_t, uint64_t>(
+        stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
+        static_cast<const uint64_t *>(lwe_output_indexes),
+        static_cast<const uint64_t *>(lut_vector),
+        static_cast<const uint64_t *>(lut_vector_indexes),
        static_cast<const uint32_t *>(lwe_array_in),
-        static_cast<const uint32_t *>(lwe_input_indexes),
+        static_cast<const uint64_t *>(lwe_input_indexes),
        static_cast<const double2 *>(bootstrapping_key), buffer, lwe_dimension,
        glwe_dimension, polynomial_size, base_log, level_count, num_samples,
        num_many_lut, lut_stride);
@@ -644,7 +666,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
 * - the constant memory (64K) is used for storing the roots of identity
 *   values for the FFT
 */
-void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
+void cuda_programmable_bootstrap_lwe_ciphertext_vector_64_64(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const
*lwe_output_indexes, void const *lut_vector, void const *lut_vector_indexes, void const *lwe_array_in, @@ -663,7 +685,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64( switch (buffer->pbs_variant) { case PBS_VARIANT::TBC: #if (CUDA_ARCH >= 900) - cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( + cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( stream, gpu_index, static_cast(lwe_array_out), static_cast(lwe_output_indexes), static_cast(lut_vector), @@ -678,7 +700,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64( PANIC("Cuda error (PBS): TBC pbs is not supported.") #endif case PBS_VARIANT::CG: - cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( + cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( stream, gpu_index, static_cast(lwe_array_out), static_cast(lwe_output_indexes), static_cast(lut_vector), @@ -690,7 +712,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64( num_many_lut, lut_stride); break; case PBS_VARIANT::DEFAULT: - cuda_programmable_bootstrap_lwe_ciphertext_vector( + cuda_programmable_bootstrap_lwe_ciphertext_vector( stream, gpu_index, static_cast(lwe_array_out), static_cast(lwe_output_indexes), static_cast(lut_vector), @@ -705,7 +727,6 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64( PANIC("Cuda error (PBS): unknown pbs variant.") } } - /* * This cleanup function frees the data on GPU for the PBS buffer for 32 or 64 * bits inputs. @@ -718,11 +739,16 @@ void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index, *buffer = nullptr; } -template bool has_support_to_cuda_programmable_bootstrap_cg( +template bool has_support_to_cuda_programmable_bootstrap_cg( uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory); -template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( +template bool has_support_to_cuda_programmable_bootstrap_cg( + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, + uint32_t num_samples, uint32_t max_shared_memory); + +template void +cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, uint64_t *lwe_array_out, uint64_t const *lwe_output_indexes, uint64_t const *lut_vector, uint64_t const *lut_vector_indexes, uint64_t const *lwe_array_in, @@ -732,7 +758,8 @@ template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride); -template void cuda_programmable_bootstrap_lwe_ciphertext_vector( +template void +cuda_programmable_bootstrap_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, uint64_t *lwe_array_out, uint64_t const *lwe_output_indexes, uint64_t const *lut_vector, uint64_t const *lut_vector_indexes, uint64_t const *lwe_array_in, @@ -742,14 +769,14 @@ template void cuda_programmable_bootstrap_lwe_ciphertext_vector( uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride); -template uint64_t scratch_cuda_programmable_bootstrap_cg( +template uint64_t scratch_cuda_programmable_bootstrap_cg( void *stream, uint32_t gpu_index, pbs_buffer **pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type); -template uint64_t scratch_cuda_programmable_bootstrap( +template uint64_t scratch_cuda_programmable_bootstrap( void *stream, uint32_t gpu_index, pbs_buffer **buffer, uint32_t 
lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t input_lwe_ciphertext_count, @@ -765,38 +792,31 @@ template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride); -template void cuda_programmable_bootstrap_lwe_ciphertext_vector( - void *stream, uint32_t gpu_index, uint32_t *lwe_array_out, - uint32_t const *lwe_output_indexes, uint32_t const *lut_vector, - uint32_t const *lut_vector_indexes, uint32_t const *lwe_array_in, - uint32_t const *lwe_input_indexes, double2 const *bootstrapping_key, - pbs_buffer *pbs_buffer, uint32_t lwe_dimension, - uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut, - uint32_t lut_stride); - -template uint64_t scratch_cuda_programmable_bootstrap_cg( +template uint64_t scratch_cuda_programmable_bootstrap_cg( void *stream, uint32_t gpu_index, - pbs_buffer **pbs_buffer, uint32_t lwe_dimension, + pbs_buffer **pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type); -template uint64_t scratch_cuda_programmable_bootstrap( +template uint64_t scratch_cuda_programmable_bootstrap( void *stream, uint32_t gpu_index, pbs_buffer **buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type); -template bool has_support_to_cuda_programmable_bootstrap_tbc( +template bool +has_support_to_cuda_programmable_bootstrap_tbc( uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t max_shared_memory); -template bool has_support_to_cuda_programmable_bootstrap_tbc( +template bool +has_support_to_cuda_programmable_bootstrap_tbc( uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t max_shared_memory); #if CUDA_ARCH >= 900 -template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( +template void +cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, uint32_t *lwe_array_out, uint32_t const *lwe_output_indexes, uint32_t const *lut_vector, uint32_t const *lut_vector_indexes, uint32_t const *lwe_array_in, @@ -805,7 +825,8 @@ template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride); -template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( +template void +cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, uint64_t *lwe_array_out, uint64_t const *lwe_output_indexes, uint64_t const *lut_vector, uint64_t const *lut_vector_indexes, uint64_t const *lwe_array_in, @@ -814,18 +835,35 @@ template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride); -template uint64_t scratch_cuda_programmable_bootstrap_tbc( +template void +cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( + void *stream, uint32_t gpu_index, uint64_t *lwe_array_out, + uint64_t const 
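The `template void ...` and `template uint64_t ...` lines in this region are explicit instantiation definitions: the templated host functions are defined in .cu/.cuh translation units, so each (InputTorus, Torus) combination reachable from the untyped C entry points has to be materialized here by hand. A toy, self-contained illustration of the same pattern; `convert_sample` is hypothetical, not part of the backend:

#include <cstddef>
#include <cstdint>

// Generic conversion helper, templated the same way as the PBS hosts.
template <typename InputTorus, typename Torus>
void convert_sample(Torus *out, InputTorus const *in, size_t n) {
  for (size_t i = 0; i < n; i++)
    // Embed a narrower torus element into a wider one (shift is 0 when the
    // two types coincide).
    out[i] = static_cast<Torus>(in[i])
             << (8 * (sizeof(Torus) - sizeof(InputTorus)));
}

// One explicit instantiation per pair used by the C API, mirroring the
// <uint32_t, uint64_t> additions above.
template void convert_sample<uint32_t, uint64_t>(uint64_t *, uint32_t const *,
                                                 size_t);
template void convert_sample<uint64_t, uint64_t>(uint64_t *, uint64_t const *,
                                                 size_t);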
*lwe_output_indexes, uint64_t const *lut_vector, + uint64_t const *lut_vector_indexes, uint32_t const *lwe_array_in, + uint64_t const *lwe_input_indexes, double2 const *bootstrapping_key, + pbs_buffer *buffer, uint32_t lwe_dimension, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, + uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut, + uint32_t lut_stride); +template uint64_t scratch_cuda_programmable_bootstrap_tbc( void *stream, uint32_t gpu_index, pbs_buffer **pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type); -template uint64_t scratch_cuda_programmable_bootstrap_tbc( +template uint64_t scratch_cuda_programmable_bootstrap_tbc( void *stream, uint32_t gpu_index, pbs_buffer **pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type); +template uint64_t scratch_cuda_programmable_bootstrap_tbc( + void *stream, uint32_t gpu_index, + pbs_buffer **pbs_buffer, uint32_t lwe_dimension, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory, + PBS_MS_REDUCTION_T noise_reduction_type); + template bool supports_distributed_shared_memory_on_classic_programmable_bootstrap< __uint128_t>(uint32_t polynomial_size, uint32_t max_shared_memory); diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh index 79e389056..d52a951b4 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh @@ -17,12 +17,13 @@ #include "polynomial/polynomial_math.cuh" #include "types/complex/operations.cuh" -template +template __global__ void __launch_bounds__(params::degree / params::opt) device_programmable_bootstrap_step_one( const Torus *__restrict__ lut_vector, const Torus *__restrict__ lut_vector_indexes, - const Torus *__restrict__ lwe_array_in, + const InputTorus *__restrict__ lwe_array_in, const Torus *__restrict__ lwe_input_indexes, Torus *global_accumulator, double2 *global_join_buffer, uint32_t lwe_iteration, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, @@ -55,7 +56,7 @@ __global__ void __launch_bounds__(params::degree / params::opt) // The third dimension of the block is used to determine on which ciphertext // this block is operating, in the case of batch bootstraps - const Torus *block_lwe_array_in = + const InputTorus *block_lwe_array_in = &lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)]; const Torus *block_lut_vector = @@ -75,8 +76,8 @@ __global__ void __launch_bounds__(params::degree / params::opt) // First iteration // Put "b" in [0, 2N[ constexpr auto log_modulus = params::log2_degree + 1; - Torus b_hat = 0; - Torus correction = 0; + InputTorus b_hat = 0; + InputTorus correction = 0; if (noise_reduction_type == PBS_MS_REDUCTION_T::CENTERED) { correction = centered_binary_modulus_switch_body_correction_to_add( block_lwe_array_in, lwe_dimension, log_modulus); @@ -86,6 +87,7 @@ __global__ void __launch_bounds__(params::degree / params::opt) // The y-dimension is used to select the element of the GLWE this block will // 
compute + // b_hat is cast from InputTorus (i.e. 64b or 32b) to uint32_t here divide_by_monomial_negacyclic_inplace( accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat, @@ -100,7 +102,7 @@ __global__ void __launch_bounds__(params::degree / params::opt) } // Put "a" in [0, 2N[ - Torus a_hat = 0; + InputTorus a_hat = 0; modulus_switch(block_lwe_array_in[lwe_iteration], a_hat, params::log2_degree + 1); // 2 * params::log2_degree + 1); @@ -307,7 +309,7 @@ uint64_t get_buffer_size_programmable_bootstrap( return buffer_size + buffer_size % sizeof(double2); } -template +template __host__ uint64_t scratch_programmable_bootstrap( cudaStream_t stream, uint32_t gpu_index, pbs_buffer **buffer, uint32_t lwe_dimension, @@ -331,30 +333,38 @@ __host__ uint64_t scratch_programmable_bootstrap( // Configure step one if (max_shared_memory >= partial_sm && max_shared_memory < full_sm_step_one) { check_cuda_error(cudaFuncSetAttribute( - device_programmable_bootstrap_step_one, + device_programmable_bootstrap_step_one, cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm)); cudaFuncSetCacheConfig( - device_programmable_bootstrap_step_one, + device_programmable_bootstrap_step_one, cudaFuncCachePreferShared); check_cuda_error(cudaFuncSetAttribute( - device_programmable_bootstrap_step_one, + device_programmable_bootstrap_step_one, cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm)); cudaFuncSetCacheConfig( - device_programmable_bootstrap_step_one, + device_programmable_bootstrap_step_one, cudaFuncCachePreferShared); check_cuda_error(cudaGetLastError()); } else if (max_shared_memory >= partial_sm) { check_cuda_error(cudaFuncSetAttribute( - device_programmable_bootstrap_step_one, + device_programmable_bootstrap_step_one, cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_step_one)); cudaFuncSetCacheConfig( - device_programmable_bootstrap_step_one, + device_programmable_bootstrap_step_one, cudaFuncCachePreferShared); check_cuda_error(cudaFuncSetAttribute( - device_programmable_bootstrap_step_one, + device_programmable_bootstrap_step_one, cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_step_one)); cudaFuncSetCacheConfig( - device_programmable_bootstrap_step_one, + device_programmable_bootstrap_step_one, cudaFuncCachePreferShared); check_cuda_error(cudaGetLastError()); } @@ -398,10 +408,10 @@ __host__ uint64_t scratch_programmable_bootstrap( return size_tracker; } -template +template __host__ void execute_step_one( cudaStream_t stream, uint32_t gpu_index, Torus const *lut_vector, - Torus const *lut_vector_indexes, Torus const *lwe_array_in, + Torus const *lut_vector_indexes, InputTorus const *lwe_array_in, Torus const *lwe_input_indexes, double2 const *bootstrapping_key, Torus *global_accumulator, double2 *global_join_buffer, uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension, @@ -416,21 +426,24 @@ __host__ void execute_step_one( dim3 grid(input_lwe_ciphertext_count, glwe_dimension + 1, level_count); if (max_shared_memory < partial_sm) { - device_programmable_bootstrap_step_one + device_programmable_bootstrap_step_one <<>>( lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, global_accumulator, global_join_buffer, lwe_iteration, lwe_dimension, polynomial_size, base_log, level_count, d_mem, full_dm, noise_reduction_type); } else if (max_shared_memory < full_sm) { - device_programmable_bootstrap_step_one + device_programmable_bootstrap_step_one <<>>( lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, global_accumulator, global_join_buffer, lwe_iteration, 
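Both `a_hat` and `b_hat` above come out of `modulus_switch`, which reduces a torus element living modulo 2^w into [0, 2N) by keeping its top log2(2N) bits with rounding. A minimal stand-alone sketch of that operation, assuming the usual rounding definition and log_modulus < w; the backend's actual helper lives in its crypto headers and may differ in detail:

#include <cstdint>

// Rounding modulus switch: keep the top `log_modulus` bits of `input`,
// rounding to nearest by adding half of the dropped range first.
template <typename InputTorus>
__host__ __device__ inline void
modulus_switch_sketch(InputTorus input, InputTorus &output,
                      uint32_t log_modulus) {
  constexpr uint32_t w = sizeof(InputTorus) * 8;
  output = (input + (InputTorus(1) << (w - log_modulus - 1))) >>
           (w - log_modulus);
}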
lwe_dimension, polynomial_size, base_log, level_count, d_mem, partial_dm, noise_reduction_type); } else { - device_programmable_bootstrap_step_one + device_programmable_bootstrap_step_one <<>>( lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, global_accumulator, global_join_buffer, lwe_iteration, @@ -484,11 +497,11 @@ __host__ void execute_step_two( /* * Host wrapper to the programmable bootstrap */ -template +template __host__ void host_programmable_bootstrap( cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out, Torus const *lwe_output_indexes, Torus const *lut_vector, - Torus const *lut_vector_indexes, Torus const *lwe_array_in, + Torus const *lut_vector_indexes, InputTorus const *lwe_array_in, Torus const *lwe_input_indexes, double2 const *bootstrapping_key, pbs_buffer *pbs_buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, @@ -520,7 +533,7 @@ __host__ void host_programmable_bootstrap( for (int i = 0; i < lwe_dimension; i++) { if (i == 0) { - execute_step_one( + execute_step_one( stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, global_accumulator, global_join_buffer, input_lwe_ciphertext_count, lwe_dimension, @@ -528,7 +541,7 @@ __host__ void host_programmable_bootstrap( partial_sm, partial_dm_step_one, full_sm_step_one, full_dm_step_one, noise_reduction_type); } else { - execute_step_one( + execute_step_one( stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, global_accumulator, global_join_buffer, input_lwe_ciphertext_count, lwe_dimension, diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu index 0a37ca66e..498ea0bd7 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu @@ -7,15 +7,16 @@ #include "programmable_bootstrap_tbc_multibit.cuh" #endif +// Used in CPP benchmarks, not used in rust code bool has_support_to_cuda_programmable_bootstrap_cg_multi_bit( uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory) { return supports_cooperative_groups_on_multibit_programmable_bootstrap< - uint64_t>(glwe_dimension, polynomial_size, level_count, num_samples, - max_shared_memory); + uint64_t, uint64_t>(glwe_dimension, polynomial_size, level_count, + num_samples, max_shared_memory); } -template +template bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit( uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t max_shared_memory) { @@ -26,39 +27,39 @@ bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit( switch (polynomial_size) { case 256: return supports_thread_block_clusters_on_multibit_programmable_bootstrap< - Torus, AmortizedDegree<256>>(num_samples, glwe_dimension, - polynomial_size, level_count, - max_shared_memory); + InputTorus, Torus, AmortizedDegree<256>>(num_samples, glwe_dimension, + polynomial_size, level_count, + max_shared_memory); case 512: return supports_thread_block_clusters_on_multibit_programmable_bootstrap< - Torus, AmortizedDegree<512>>(num_samples, glwe_dimension, - polynomial_size, level_count, - max_shared_memory); + InputTorus, Torus, AmortizedDegree<512>>(num_samples, glwe_dimension, + polynomial_size, level_count, + 
max_shared_memory); case 1024: return supports_thread_block_clusters_on_multibit_programmable_bootstrap< - Torus, AmortizedDegree<1024>>(num_samples, glwe_dimension, - polynomial_size, level_count, - max_shared_memory); + InputTorus, Torus, AmortizedDegree<1024>>(num_samples, glwe_dimension, + polynomial_size, level_count, + max_shared_memory); case 2048: return supports_thread_block_clusters_on_multibit_programmable_bootstrap< - Torus, AmortizedDegree<2048>>(num_samples, glwe_dimension, - polynomial_size, level_count, - max_shared_memory); + InputTorus, Torus, AmortizedDegree<2048>>(num_samples, glwe_dimension, + polynomial_size, level_count, + max_shared_memory); case 4096: return supports_thread_block_clusters_on_multibit_programmable_bootstrap< - Torus, AmortizedDegree<4096>>(num_samples, glwe_dimension, - polynomial_size, level_count, - max_shared_memory); + InputTorus, Torus, AmortizedDegree<4096>>(num_samples, glwe_dimension, + polynomial_size, level_count, + max_shared_memory); case 8192: return supports_thread_block_clusters_on_multibit_programmable_bootstrap< - Torus, AmortizedDegree<8192>>(num_samples, glwe_dimension, - polynomial_size, level_count, - max_shared_memory); + InputTorus, Torus, AmortizedDegree<8192>>(num_samples, glwe_dimension, + polynomial_size, level_count, + max_shared_memory); case 16384: return supports_thread_block_clusters_on_multibit_programmable_bootstrap< - Torus, AmortizedDegree<16384>>(num_samples, glwe_dimension, - polynomial_size, level_count, - max_shared_memory); + InputTorus, Torus, AmortizedDegree<16384>>(num_samples, glwe_dimension, + polynomial_size, level_count, + max_shared_memory); default: PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported " "N's are powers of two" @@ -69,11 +70,11 @@ bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit( #endif } -template +template void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, Torus *lwe_array_out, Torus const *lwe_output_indexes, Torus const *lut_vector, - Torus const *lut_vector_indexes, Torus const *lwe_array_in, + Torus const *lut_vector_indexes, InputTorus const *lwe_array_in, Torus const *lwe_input_indexes, Torus const *bootstrapping_key, pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, @@ -82,7 +83,8 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( switch (polynomial_size) { case 256: - host_cg_multi_bit_programmable_bootstrap>( + host_cg_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, @@ -90,7 +92,8 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( num_samples, num_many_lut, lut_stride); break; case 512: - host_cg_multi_bit_programmable_bootstrap>( + host_cg_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, @@ -98,7 +101,8 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( num_samples, num_many_lut, lut_stride); break; case 1024: - host_cg_multi_bit_programmable_bootstrap>( + host_cg_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, 
bootstrapping_key, pbs_buffer, glwe_dimension, @@ -106,7 +110,8 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( num_samples, num_many_lut, lut_stride); break; case 2048: - host_cg_multi_bit_programmable_bootstrap>( + host_cg_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, @@ -114,7 +119,8 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( num_samples, num_many_lut, lut_stride); break; case 4096: - host_cg_multi_bit_programmable_bootstrap>( + host_cg_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, @@ -122,7 +128,8 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( num_samples, num_many_lut, lut_stride); break; case 8192: - host_cg_multi_bit_programmable_bootstrap>( + host_cg_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, @@ -130,7 +137,8 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( num_samples, num_many_lut, lut_stride); break; case 16384: - host_cg_multi_bit_programmable_bootstrap>( + host_cg_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, @@ -144,11 +152,11 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( } } -template +template void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, Torus *lwe_array_out, Torus const *lwe_output_indexes, Torus const *lut_vector, - Torus const *lut_vector_indexes, Torus const *lwe_array_in, + Torus const *lut_vector_indexes, InputTorus const *lwe_array_in, Torus const *lwe_input_indexes, Torus const *bootstrapping_key, pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, @@ -157,7 +165,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( switch (polynomial_size) { case 256: - host_multi_bit_programmable_bootstrap>( + host_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, @@ -165,7 +174,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( num_samples, num_many_lut, lut_stride); break; case 512: - host_multi_bit_programmable_bootstrap>( + host_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, @@ -173,7 +183,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( num_samples, num_many_lut, lut_stride); break; case 1024: - host_multi_bit_programmable_bootstrap>( + host_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, @@ -181,7 +192,8 @@ void 
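`params::degree` has to be a compile-time constant (it feeds `__launch_bounds__` and the per-thread coefficient count `params::opt`), which is why every entry point funnels the runtime `polynomial_size` through a switch with one explicit instantiation per supported N. The same pattern in a toy, stand-alone form; `run_for_degree` is hypothetical:

#include <cstdint>
#include <stdexcept>

template <uint32_t N> void run_for_degree() { /* kernel launch for degree N */ }

// Map a runtime polynomial size onto a compile-time template parameter.
inline void dispatch_degree(uint32_t polynomial_size) {
  switch (polynomial_size) {
  case 256:   run_for_degree<256>();   break;
  case 512:   run_for_degree<512>();   break;
  case 1024:  run_for_degree<1024>();  break;
  case 2048:  run_for_degree<2048>();  break;
  case 4096:  run_for_degree<4096>();  break;
  case 8192:  run_for_degree<8192>();  break;
  case 16384: run_for_degree<16384>(); break;
  default:
    throw std::invalid_argument("unsupported polynomial size");
  }
}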
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( num_samples, num_many_lut, lut_stride); break; case 2048: - host_multi_bit_programmable_bootstrap>( + host_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, @@ -189,7 +201,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( num_samples, num_many_lut, lut_stride); break; case 4096: - host_multi_bit_programmable_bootstrap>( + host_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, @@ -197,7 +210,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( num_samples, num_many_lut, lut_stride); break; case 8192: - host_multi_bit_programmable_bootstrap>( + host_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, @@ -205,7 +219,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( num_samples, num_many_lut, lut_stride); break; case 16384: - host_multi_bit_programmable_bootstrap>( + host_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, @@ -239,7 +254,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( switch (buffer->pbs_variant) { case PBS_VARIANT::TBC: #if CUDA_ARCH >= 900 - cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( + cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( stream, gpu_index, static_cast(lwe_array_out), static_cast(lwe_output_indexes), static_cast(lut_vector), @@ -254,7 +270,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( PANIC("Cuda error (multi-bit PBS): TBC pbs is not supported.") #endif case PBS_VARIANT::CG: - cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( + cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( stream, gpu_index, static_cast(lwe_array_out), static_cast(lwe_output_indexes), static_cast(lut_vector), @@ -266,7 +283,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( num_samples, num_many_lut, lut_stride); break; case PBS_VARIANT::DEFAULT: - cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( + cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( stream, gpu_index, static_cast(lwe_array_out), static_cast(lwe_output_indexes), static_cast(lut_vector), @@ -282,7 +300,73 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( } } -template +void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_32_64( + void *stream, uint32_t gpu_index, void *lwe_array_out, + void const *lwe_output_indexes, void const *lut_vector, + void const *lut_vector_indexes, void const *lwe_array_in, + void const *lwe_input_indexes, void const *bootstrapping_key, + int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, + uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut, + uint32_t lut_stride) { + + PANIC_IF_FALSE(base_log <= 64, + "Cuda error (multi-bit PBS): base log (%d) 
should be <= 64", + base_log); + + pbs_buffer *buffer = + (pbs_buffer *)mem_ptr; + + switch (buffer->pbs_variant) { + case PBS_VARIANT::TBC: +#if CUDA_ARCH >= 900 + cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( + stream, gpu_index, static_cast(lwe_array_out), + static_cast(lwe_output_indexes), + static_cast(lut_vector), + static_cast(lut_vector_indexes), + static_cast(lwe_array_in), + static_cast(lwe_input_indexes), + static_cast(bootstrapping_key), buffer, lwe_dimension, + glwe_dimension, polynomial_size, grouping_factor, base_log, level_count, + num_samples, num_many_lut, lut_stride); + break; +#else + PANIC("Cuda error (multi-bit PBS): TBC pbs is not supported.") +#endif + case PBS_VARIANT::CG: + cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( + stream, gpu_index, static_cast(lwe_array_out), + static_cast(lwe_output_indexes), + static_cast(lut_vector), + static_cast(lut_vector_indexes), + static_cast(lwe_array_in), + static_cast(lwe_input_indexes), + static_cast(bootstrapping_key), buffer, lwe_dimension, + glwe_dimension, polynomial_size, grouping_factor, base_log, level_count, + num_samples, num_many_lut, lut_stride); + break; + case PBS_VARIANT::DEFAULT: + cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( + stream, gpu_index, static_cast(lwe_array_out), + static_cast(lwe_output_indexes), + static_cast(lut_vector), + static_cast(lut_vector_indexes), + static_cast(lwe_array_in), + static_cast(lwe_input_indexes), + static_cast(bootstrapping_key), buffer, lwe_dimension, + glwe_dimension, polynomial_size, grouping_factor, base_log, level_count, + num_samples, num_many_lut, lut_stride); + break; + default: + PANIC("Cuda error (multi-bit PBS): unsupported implementation variant.") + } +} + +template uint64_t scratch_cuda_cg_multi_bit_programmable_bootstrap( void *stream, uint32_t gpu_index, pbs_buffer **buffer, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, @@ -290,43 +374,43 @@ uint64_t scratch_cuda_cg_multi_bit_programmable_bootstrap( switch (polynomial_size) { case 256: - return scratch_cg_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory); case 512: - return scratch_cg_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory); case 1024: - return scratch_cg_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory); case 2048: - return scratch_cg_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory); case 4096: - return scratch_cg_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory); case 8192: - return scratch_cg_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory); case 16384: - return scratch_cg_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, @@ -338,7 +422,7 @@ uint64_t scratch_cuda_cg_multi_bit_programmable_bootstrap( } } -template 
+template uint64_t scratch_cuda_multi_bit_programmable_bootstrap( void *stream, uint32_t gpu_index, pbs_buffer **buffer, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, @@ -346,43 +430,43 @@ uint64_t scratch_cuda_multi_bit_programmable_bootstrap( switch (polynomial_size) { case 256: - return scratch_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory); case 512: - return scratch_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory); case 1024: - return scratch_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory); case 2048: - return scratch_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory); case 4096: - return scratch_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory); case 8192: - return scratch_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory); case 16384: - return scratch_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, @@ -400,7 +484,8 @@ uint64_t scratch_cuda_multi_bit_programmable_bootstrap_64( uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) { bool supports_cg = - supports_cooperative_groups_on_multibit_programmable_bootstrap( + supports_cooperative_groups_on_multibit_programmable_bootstrap( glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, cuda_get_max_shared_memory(gpu_index)); #if (CUDA_ARCH >= 900) @@ -412,24 +497,70 @@ uint64_t scratch_cuda_multi_bit_programmable_bootstrap_64( &num_sms, cudaDevAttrMultiProcessorCount, gpu_index)); bool supports_tbc = - has_support_to_cuda_programmable_bootstrap_tbc_multi_bit( + has_support_to_cuda_programmable_bootstrap_tbc_multi_bit( input_lwe_ciphertext_count, glwe_dimension, polynomial_size, level_count, cuda_get_max_shared_memory(gpu_index)); if (supports_tbc) - return scratch_cuda_tbc_multi_bit_programmable_bootstrap( + return scratch_cuda_tbc_multi_bit_programmable_bootstrap( stream, gpu_index, (pbs_buffer **)buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory); else #endif if (supports_cg) - return scratch_cuda_cg_multi_bit_programmable_bootstrap( + return scratch_cuda_cg_multi_bit_programmable_bootstrap( stream, gpu_index, (pbs_buffer **)buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory); else - return scratch_cuda_multi_bit_programmable_bootstrap( + return scratch_cuda_multi_bit_programmable_bootstrap( + stream, gpu_index, (pbs_buffer **)buffer, + glwe_dimension, polynomial_size, level_count, + input_lwe_ciphertext_count, allocate_gpu_memory); +} + +uint64_t scratch_cuda_multi_bit_programmable_bootstrap_32_64( + void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t level_count, + uint32_t input_lwe_ciphertext_count, bool 
    allocate_gpu_memory) {
+
+  bool supports_cg =
+      supports_cooperative_groups_on_multibit_programmable_bootstrap<
+          uint32_t, uint64_t>(glwe_dimension, polynomial_size, level_count,
+                              input_lwe_ciphertext_count,
+                              cuda_get_max_shared_memory(gpu_index));
+#if (CUDA_ARCH >= 900)
+  // On H100s, TBC should be used while num_samples < num_sms / 2; past that
+  // point we switch to CG for as long as it is supported, and fall back to
+  // TBC once it no longer is.
+  int num_sms = 0;
+  check_cuda_error(cudaDeviceGetAttribute(
+      &num_sms, cudaDevAttrMultiProcessorCount, gpu_index));
+
+  bool supports_tbc =
+      has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint32_t,
+                                                               uint64_t>(
+          input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
+          level_count, cuda_get_max_shared_memory(gpu_index));
+
+  if (supports_tbc)
+    return scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint32_t,
+                                                             uint64_t>(
+        stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer,
+        glwe_dimension, polynomial_size, level_count,
+        input_lwe_ciphertext_count, allocate_gpu_memory);
+  else
+#endif
+    if (supports_cg)
+    return scratch_cuda_cg_multi_bit_programmable_bootstrap<uint32_t,
+                                                            uint64_t>(
+        stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer,
+        glwe_dimension, polynomial_size, level_count,
+        input_lwe_ciphertext_count, allocate_gpu_memory);
+  else
+    return scratch_cuda_multi_bit_programmable_bootstrap<uint32_t, uint64_t>(
         stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer,
         glwe_dimension, polynomial_size, level_count,
         input_lwe_ciphertext_count, allocate_gpu_memory);
@@ -455,7 +586,7 @@ void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
  * The value 13 was empirically determined based on memory requirements for
  * benchmarking on an RTX 4090 GPU, balancing performance and resource use.
  */
-template <typename Torus, class params>
+template <typename InputTorus, typename Torus, class params>
 uint64_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
                             uint32_t polynomial_size, uint32_t glwe_dimension,
                             uint32_t level_count, uint64_t full_sm_keybundle) {
@@ -466,13 +597,14 @@ uint64_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
   if (max_shared_memory < full_sm_keybundle)
     cudaOccupancyMaxActiveBlocksPerMultiprocessor(
         &max_blocks_per_sm,
-        device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
+        device_multi_bit_programmable_bootstrap_keybundle<InputTorus, Torus,
+                                                          params, NOSM>,
         polynomial_size / params::opt, full_sm_keybundle);
   else
     cudaOccupancyMaxActiveBlocksPerMultiprocessor(
         &max_blocks_per_sm,
-        device_multi_bit_programmable_bootstrap_keybundle<Torus, params,
-                                                          FULLSM>,
+        device_multi_bit_programmable_bootstrap_keybundle<InputTorus, Torus,
+                                                          params, FULLSM>,
         polynomial_size / params::opt, 0);
 
   int num_sms = 0;
@@ -535,14 +667,15 @@ uint64_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
   return (max_num_chunks > divisor) ?
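Distilling the branch structure of both scratch functions: the variant policy is identical for the 64→64 and 32→64 layouts, only the template arguments differ. A condensed, illustrative form, not a function in the backend:

// The two `supports_*` flags are computed by the helpers used above.
PBS_VARIANT select_multi_bit_variant_sketch(bool supports_tbc,
                                            bool supports_cg) {
#if CUDA_ARCH >= 900
  if (supports_tbc)
    return PBS_VARIANT::TBC; // Hopper+: thread block clusters first
#endif
  if (supports_cg)
    return PBS_VARIANT::CG; // cooperative groups as the next-best launch mode
  return PBS_VARIANT::DEFAULT; // always-available fallback
}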
divisor : max_num_chunks; } -template uint64_t scratch_cuda_multi_bit_programmable_bootstrap( +template uint64_t +scratch_cuda_multi_bit_programmable_bootstrap( void *stream, uint32_t gpu_index, pbs_buffer **pbs_buffer, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory); template void -cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( +cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, uint64_t *lwe_array_out, uint64_t const *lwe_output_indexes, uint64_t const *lut_vector, uint64_t const *lut_vector_indexes, uint64_t const *lwe_array_in, @@ -552,14 +685,16 @@ cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride); -template uint64_t scratch_cuda_cg_multi_bit_programmable_bootstrap( +template uint64_t +scratch_cuda_cg_multi_bit_programmable_bootstrap( void *stream, uint32_t gpu_index, pbs_buffer **pbs_buffer, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory); template void -cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( +cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, uint64_t *lwe_array_out, uint64_t const *lwe_output_indexes, uint64_t const *lut_vector, uint64_t const *lut_vector_indexes, uint64_t const *lwe_array_in, @@ -570,12 +705,12 @@ cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( uint32_t num_many_lut, uint32_t lut_stride); template bool -has_support_to_cuda_programmable_bootstrap_tbc_multi_bit( +has_support_to_cuda_programmable_bootstrap_tbc_multi_bit( uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t max_shared_memory); #if (CUDA_ARCH >= 900) -template +template uint64_t scratch_cuda_tbc_multi_bit_programmable_bootstrap( void *stream, uint32_t gpu_index, pbs_buffer **buffer, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, @@ -583,43 +718,43 @@ uint64_t scratch_cuda_tbc_multi_bit_programmable_bootstrap( switch (polynomial_size) { case 256: - return scratch_tbc_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory); case 512: - return scratch_tbc_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory); case 1024: - return scratch_tbc_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory); case 2048: - return scratch_tbc_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory); case 4096: - return scratch_tbc_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory); case 8192: - return scratch_tbc_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory); case 16384: - return scratch_tbc_multi_bit_programmable_bootstrap>( 
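In `get_lwe_chunk_size` above, the chunk budget is derived from kernel occupancy: how many keybundle blocks fit on one SM, times the SM count. A hedged sketch of that query; the kernel and helper names here are placeholders:

#include <cuda_runtime.h>

__global__ void keybundle_kernel_placeholder() {}

// Upper bound on concurrently resident keybundle blocks for this GPU.
inline uint64_t chunk_budget_sketch(uint32_t gpu_index, int block_size,
                                    size_t dynamic_smem_bytes) {
  int max_blocks_per_sm = 0;
  cudaOccupancyMaxActiveBlocksPerMultiprocessor(
      &max_blocks_per_sm, keybundle_kernel_placeholder, block_size,
      dynamic_smem_bytes);
  int num_sms = 0;
  cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, gpu_index);
  return static_cast<uint64_t>(max_blocks_per_sm) * num_sms;
}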
static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, @@ -630,11 +765,11 @@ uint64_t scratch_cuda_tbc_multi_bit_programmable_bootstrap( " in the interval [256..16384].") } } -template +template void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, Torus *lwe_array_out, Torus const *lwe_output_indexes, Torus const *lut_vector, - Torus const *lut_vector_indexes, Torus const *lwe_array_in, + Torus const *lut_vector_indexes, InputTorus const *lwe_array_in, Torus const *lwe_input_indexes, Torus const *bootstrapping_key, pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, @@ -646,7 +781,8 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( switch (polynomial_size) { case 256: - host_tbc_multi_bit_programmable_bootstrap>( + host_tbc_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, @@ -654,7 +790,8 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( num_samples, num_many_lut, lut_stride); break; case 512: - host_tbc_multi_bit_programmable_bootstrap>( + host_tbc_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, @@ -662,7 +799,8 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( num_samples, num_many_lut, lut_stride); break; case 1024: - host_tbc_multi_bit_programmable_bootstrap>( + host_tbc_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, @@ -675,14 +813,16 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( &num_sms, cudaDevAttrMultiProcessorCount, gpu_index)); if (4 * num_sms < num_samples * level_count * (glwe_dimension + 1)) - host_tbc_multi_bit_programmable_bootstrap>( + host_tbc_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, num_samples, num_many_lut, lut_stride); else - host_tbc_multi_bit_programmable_bootstrap>( + host_tbc_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, @@ -692,7 +832,8 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( break; } case 4096: - host_tbc_multi_bit_programmable_bootstrap>( + host_tbc_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, @@ -700,7 +841,8 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( num_samples, num_many_lut, lut_stride); break; case 8192: - host_tbc_multi_bit_programmable_bootstrap>( + host_tbc_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, 
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, @@ -708,7 +850,8 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( num_samples, num_many_lut, lut_stride); break; case 16384: - host_tbc_multi_bit_programmable_bootstrap>( + host_tbc_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, @@ -722,13 +865,15 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( } } -template uint64_t scratch_cuda_tbc_multi_bit_programmable_bootstrap( +template uint64_t +scratch_cuda_tbc_multi_bit_programmable_bootstrap( void *stream, uint32_t gpu_index, pbs_buffer **buffer, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory); template void -cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( +cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, uint64_t *lwe_array_out, uint64_t const *lwe_output_indexes, uint64_t const *lut_vector, uint64_t const *lut_vector_indexes, uint64_t const *lwe_array_in, @@ -737,4 +882,16 @@ cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride); + +template void +cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( + void *stream, uint32_t gpu_index, uint64_t *lwe_array_out, + uint64_t const *lwe_output_indexes, uint64_t const *lut_vector, + uint64_t const *lut_vector_indexes, uint32_t const *lwe_array_in, + uint64_t const *lwe_input_indexes, uint64_t const *bootstrapping_key, + pbs_buffer *pbs_buffer, uint32_t lwe_dimension, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, + uint32_t base_log, uint32_t level_count, uint32_t num_samples, + uint32_t num_many_lut, uint32_t lut_stride); #endif diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh index 64c04050c..50b155218 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh @@ -25,9 +25,10 @@ get_start_ith_ggsw_offset(uint32_t polynomial_size, int glwe_dimension, level_count; } -template +template __global__ void device_multi_bit_programmable_bootstrap_keybundle( - const Torus *__restrict__ lwe_array_in, + const InputTorus *__restrict__ lwe_array_in, const Torus *__restrict__ lwe_input_indexes, double2 *keybundle_array, const Torus *__restrict__ bootstrapping_key, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, @@ -55,7 +56,7 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle( if (lwe_iteration < (lwe_dimension / grouping_factor)) { - const Torus *block_lwe_array_in = + const InputTorus *block_lwe_array_in = &lwe_array_in[lwe_input_indexes[input_idx] * (lwe_dimension + 1)]; double2 *keybundle = keybundle_array + @@ -86,10 +87,11 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle( // Precalculate the monomial degrees and store them in shared memory uint32_t *monomial_degrees = (uint32_t 
*)selected_memory; if (threadIdx.x < (1 << grouping_factor)) { - const Torus *lwe_array_group = + const InputTorus *lwe_array_group = block_lwe_array_in + rev_lwe_iteration * grouping_factor; - monomial_degrees[threadIdx.x] = calculates_monomial_degree( - lwe_array_group, threadIdx.x, grouping_factor); + monomial_degrees[threadIdx.x] = + calculates_monomial_degree( + lwe_array_group, threadIdx.x, grouping_factor); } __syncthreads(); @@ -145,9 +147,10 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle( // Then we can just calculate the offset needed to apply this coefficients, and // the operation transforms into a pointwise vector multiplication, avoiding to // perform extra instructions other than MADD -template +template __global__ void device_multi_bit_programmable_bootstrap_keybundle_2_2_params( - const Torus *__restrict__ lwe_array_in, + const InputTorus *__restrict__ lwe_array_in, const Torus *__restrict__ lwe_input_indexes, double2 *keybundle_array, const Torus *__restrict__ bootstrapping_key, uint32_t lwe_dimension, uint32_t lwe_offset, uint64_t lwe_chunk_size, @@ -188,7 +191,7 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle_2_2_params( if (lwe_iteration < (lwe_dimension / grouping_factor)) { - const Torus *block_lwe_array_in = + const InputTorus *block_lwe_array_in = &lwe_array_in[lwe_input_indexes[input_idx] * (lwe_dimension + 1)]; double2 *keybundle = keybundle_array + @@ -219,10 +222,11 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle_2_2_params( uint32_t *monomial_degrees = (uint32_t *)selected_memory; if (threadIdx.x < (1 << grouping_factor)) { - const Torus *lwe_array_group = + const InputTorus *lwe_array_group = block_lwe_array_in + rev_lwe_iteration * grouping_factor; - monomial_degrees[threadIdx.x] = calculates_monomial_degree( - lwe_array_group, threadIdx.x, grouping_factor); + monomial_degrees[threadIdx.x] = + calculates_monomial_degree( + lwe_array_group, threadIdx.x, grouping_factor); } __syncthreads(); @@ -269,10 +273,11 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle_2_2_params( } } -template +template __global__ void __launch_bounds__(params::degree / params::opt) device_multi_bit_programmable_bootstrap_accumulate_step_one( - const Torus *__restrict__ lwe_array_in, + const InputTorus *__restrict__ lwe_array_in, const Torus *__restrict__ lwe_input_indexes, const Torus *__restrict__ lut_vector, const Torus *__restrict__ lut_vector_indexes, Torus *global_accumulator, @@ -305,7 +310,7 @@ __global__ void __launch_bounds__(params::degree / params::opt) if constexpr (SMD == PARTIALSM) accumulator_fft = (double2 *)sharedmem; - const Torus *block_lwe_array_in = + const InputTorus *block_lwe_array_in = &lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)]; const Torus *block_lut_vector = @@ -327,7 +332,7 @@ __global__ void __launch_bounds__(params::degree / params::opt) //////////////////////////////////////////////////////////// // Initializes the accumulator with the body of LWE // Put "b" in [0, 2N[ - Torus b_hat = 0; + InputTorus b_hat = 0; modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, params::log2_degree + 1); @@ -501,7 +506,7 @@ uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two( return sizeof(Torus) * polynomial_size; // accumulator } -template +template __host__ uint64_t scratch_multi_bit_programmable_bootstrap( cudaStream_t stream, uint32_t gpu_index, pbs_buffer **buffer, uint32_t glwe_dimension, @@ -526,20 +531,22 @@ __host__ uint64_t 
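`calculates_monomial_degree` condenses one group of `grouping_factor` mask elements and a GGSW selector `t` into a rotation amount in [0, 2N). A hedged sketch of the computation it performs; the exact backend formula may differ, for instance in sign handling:

#include <cstdint>

// Sum the mask elements selected by the bits of `t`, then modulus-switch the
// sum into [0, 2N) with rounding, as in the classic path.
template <typename InputTorus>
uint32_t monomial_degree_sketch(const InputTorus *lwe_array_group, uint32_t t,
                                uint32_t grouping_factor,
                                uint32_t log_modulus) {
  InputTorus sum = 0;
  for (uint32_t i = 0; i < grouping_factor; i++)
    if (t & (uint32_t(1) << i))
      sum += lwe_array_group[i];
  constexpr uint32_t w = sizeof(InputTorus) * 8;
  return uint32_t((sum + (InputTorus(1) << (w - log_modulus - 1))) >>
                  (w - log_modulus));
}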
scratch_multi_bit_programmable_bootstrap( if (max_shared_memory < full_sm_keybundle) { check_cuda_error(cudaFuncSetAttribute( - device_multi_bit_programmable_bootstrap_keybundle, + device_multi_bit_programmable_bootstrap_keybundle, cudaFuncAttributeMaxDynamicSharedMemorySize, 0)); cudaFuncSetCacheConfig( - device_multi_bit_programmable_bootstrap_keybundle, + device_multi_bit_programmable_bootstrap_keybundle, cudaFuncCachePreferShared); check_cuda_error(cudaGetLastError()); } else { check_cuda_error(cudaFuncSetAttribute( - device_multi_bit_programmable_bootstrap_keybundle, + device_multi_bit_programmable_bootstrap_keybundle, cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_keybundle)); cudaFuncSetCacheConfig( - device_multi_bit_programmable_bootstrap_keybundle, + device_multi_bit_programmable_bootstrap_keybundle, cudaFuncCachePreferShared); check_cuda_error(cudaGetLastError()); } @@ -547,59 +554,59 @@ __host__ uint64_t scratch_multi_bit_programmable_bootstrap( if (max_shared_memory < partial_sm_accumulate_step_one) { check_cuda_error(cudaFuncSetAttribute( device_multi_bit_programmable_bootstrap_accumulate_step_one< - Torus, params, NOSM, false>, + InputTorus, Torus, params, NOSM, false>, cudaFuncAttributeMaxDynamicSharedMemorySize, 0)); cudaFuncSetCacheConfig( device_multi_bit_programmable_bootstrap_accumulate_step_one< - Torus, params, NOSM, false>, + InputTorus, Torus, params, NOSM, false>, cudaFuncCachePreferShared); check_cuda_error(cudaFuncSetAttribute( device_multi_bit_programmable_bootstrap_accumulate_step_one< - Torus, params, NOSM, true>, + InputTorus, Torus, params, NOSM, true>, cudaFuncAttributeMaxDynamicSharedMemorySize, 0)); cudaFuncSetCacheConfig( device_multi_bit_programmable_bootstrap_accumulate_step_one< - Torus, params, NOSM, true>, + InputTorus, Torus, params, NOSM, true>, cudaFuncCachePreferShared); check_cuda_error(cudaGetLastError()); } else if (max_shared_memory < full_sm_accumulate_step_one) { check_cuda_error(cudaFuncSetAttribute( device_multi_bit_programmable_bootstrap_accumulate_step_one< - Torus, params, PARTIALSM, false>, + InputTorus, Torus, params, PARTIALSM, false>, cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm_accumulate_step_one)); cudaFuncSetCacheConfig( device_multi_bit_programmable_bootstrap_accumulate_step_one< - Torus, params, PARTIALSM, false>, + InputTorus, Torus, params, PARTIALSM, false>, cudaFuncCachePreferShared); check_cuda_error(cudaFuncSetAttribute( device_multi_bit_programmable_bootstrap_accumulate_step_one< - Torus, params, PARTIALSM, true>, + InputTorus, Torus, params, PARTIALSM, true>, cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm_accumulate_step_one)); cudaFuncSetCacheConfig( device_multi_bit_programmable_bootstrap_accumulate_step_one< - Torus, params, PARTIALSM, true>, + InputTorus, Torus, params, PARTIALSM, true>, cudaFuncCachePreferShared); check_cuda_error(cudaGetLastError()); } else { check_cuda_error(cudaFuncSetAttribute( device_multi_bit_programmable_bootstrap_accumulate_step_one< - Torus, params, FULLSM, false>, + InputTorus, Torus, params, FULLSM, false>, cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_accumulate_step_one)); cudaFuncSetCacheConfig( device_multi_bit_programmable_bootstrap_accumulate_step_one< - Torus, params, FULLSM, false>, + InputTorus, Torus, params, FULLSM, false>, cudaFuncCachePreferShared); check_cuda_error(cudaFuncSetAttribute( device_multi_bit_programmable_bootstrap_accumulate_step_one< - Torus, params, FULLSM, true>, + InputTorus, Torus, params, FULLSM, true>, 
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_accumulate_step_one)); cudaFuncSetCacheConfig( device_multi_bit_programmable_bootstrap_accumulate_step_one< - Torus, params, FULLSM, true>, + InputTorus, Torus, params, FULLSM, true>, cudaFuncCachePreferShared); check_cuda_error(cudaGetLastError()); } @@ -644,7 +651,7 @@ __host__ uint64_t scratch_multi_bit_programmable_bootstrap( check_cuda_error(cudaGetLastError()); } - auto lwe_chunk_size = get_lwe_chunk_size( + auto lwe_chunk_size = get_lwe_chunk_size( gpu_index, input_lwe_ciphertext_count, polynomial_size, glwe_dimension, level_count, full_sm_keybundle); uint64_t size_tracker = 0; @@ -655,9 +662,9 @@ __host__ uint64_t scratch_multi_bit_programmable_bootstrap( return size_tracker; } -template +template __host__ void execute_compute_keybundle( - cudaStream_t stream, uint32_t gpu_index, Torus const *lwe_array_in, + cudaStream_t stream, uint32_t gpu_index, InputTorus const *lwe_array_in, Torus const *lwe_input_indexes, Torus const *bootstrapping_key, pbs_buffer *buffer, uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, @@ -686,7 +693,8 @@ __host__ void execute_compute_keybundle( dim3 thds(polynomial_size / params::opt, 1, 1); if (max_shared_memory < full_sm_keybundle) { - device_multi_bit_programmable_bootstrap_keybundle + device_multi_bit_programmable_bootstrap_keybundle <<>>( lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, @@ -694,7 +702,8 @@ __host__ void execute_compute_keybundle( d_mem, full_sm_keybundle); } else { bool supports_tbc = - has_support_to_cuda_programmable_bootstrap_tbc_multi_bit( + has_support_to_cuda_programmable_bootstrap_tbc_multi_bit( num_samples, glwe_dimension, polynomial_size, level_count, cuda_get_max_shared_memory(gpu_index)); @@ -703,20 +712,22 @@ __host__ void execute_compute_keybundle( dim3 thds_new_keybundle(512, 1, 1); check_cuda_error(cudaFuncSetAttribute( device_multi_bit_programmable_bootstrap_keybundle_2_2_params< - Torus, Degree<2048>, FULLSM>, + InputTorus, Torus, Degree<2048>, FULLSM>, cudaFuncAttributeMaxDynamicSharedMemorySize, 3 * full_sm_keybundle)); cudaFuncSetCacheConfig( device_multi_bit_programmable_bootstrap_keybundle_2_2_params< - Torus, Degree<2048>, FULLSM>, + InputTorus, Torus, Degree<2048>, FULLSM>, cudaFuncCachePreferShared); check_cuda_error(cudaGetLastError()); device_multi_bit_programmable_bootstrap_keybundle_2_2_params< - Torus, Degree<2048>, FULLSM><<>>( - lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key, - lwe_dimension, lwe_offset, chunk_size, keybundle_size_per_input); + InputTorus, Torus, Degree<2048>, FULLSM> + <<>>(lwe_array_in, lwe_input_indexes, keybundle_fft, + bootstrapping_key, lwe_dimension, lwe_offset, chunk_size, + keybundle_size_per_input); } else { - device_multi_bit_programmable_bootstrap_keybundle + device_multi_bit_programmable_bootstrap_keybundle <<>>( lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, @@ -727,10 +738,10 @@ __host__ void execute_compute_keybundle( check_cuda_error(cudaGetLastError()); } -template +template __host__ void execute_step_one( cudaStream_t stream, uint32_t gpu_index, Torus const *lut_vector, - Torus const *lut_vector_indexes, Torus const *lwe_array_in, + Torus const *lut_vector_indexes, InputTorus const *lwe_array_in, Torus const *lwe_input_indexes, pbs_buffer *buffer, uint32_t num_samples, uint32_t 
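All three branches above follow one pattern: opt the kernel into the largest dynamic shared-memory carveout its tier (NOSM, PARTIALSM, FULLSM) allows, then prefer shared memory in the cache configuration. A condensed, illustrative helper, not a function in the backend:

// `kernel` is any __global__ function pointer; the three byte counts are the
// per-tier shared-memory requirements computed by the scratch functions.
template <typename Kernel>
void configure_smem_tier_sketch(Kernel kernel, uint64_t max_shared_memory,
                                uint64_t partial_sm, uint64_t full_sm) {
  uint64_t dynamic_bytes = 0; // NOSM: work entirely out of global memory
  if (max_shared_memory >= full_sm)
    dynamic_bytes = full_sm; // FULLSM: whole working set in shared memory
  else if (max_shared_memory >= partial_sm)
    dynamic_bytes = partial_sm; // PARTIALSM: only the FFT buffer in shared
  check_cuda_error(cudaFuncSetAttribute(
      kernel, cudaFuncAttributeMaxDynamicSharedMemorySize,
      static_cast<int>(dynamic_bytes)));
  cudaFuncSetCacheConfig(kernel, cudaFuncCachePreferShared);
}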
lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count) { @@ -754,7 +765,7 @@ __host__ void execute_step_one( if (max_shared_memory < partial_sm_accumulate_step_one) device_multi_bit_programmable_bootstrap_accumulate_step_one< - Torus, params, NOSM, is_first_iter> + InputTorus, Torus, params, NOSM, is_first_iter> <<>>( lwe_array_in, lwe_input_indexes, lut_vector, lut_vector_indexes, global_accumulator, global_accumulator_fft, lwe_dimension, @@ -762,7 +773,7 @@ __host__ void execute_step_one( full_sm_accumulate_step_one); else if (max_shared_memory < full_sm_accumulate_step_one) device_multi_bit_programmable_bootstrap_accumulate_step_one< - Torus, params, PARTIALSM, is_first_iter> + InputTorus, Torus, params, PARTIALSM, is_first_iter> <<>>(lwe_array_in, lwe_input_indexes, lut_vector, lut_vector_indexes, global_accumulator, @@ -771,7 +782,7 @@ __host__ void execute_step_one( partial_sm_accumulate_step_one); else device_multi_bit_programmable_bootstrap_accumulate_step_one< - Torus, params, FULLSM, is_first_iter> + InputTorus, Torus, params, FULLSM, is_first_iter> <<>>(lwe_array_in, lwe_input_indexes, lut_vector, lut_vector_indexes, global_accumulator, @@ -823,11 +834,11 @@ execute_step_two(cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out, check_cuda_error(cudaGetLastError()); } -template +template __host__ void host_multi_bit_programmable_bootstrap( cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out, Torus const *lwe_output_indexes, Torus const *lut_vector, - Torus const *lut_vector_indexes, Torus const *lwe_array_in, + Torus const *lut_vector_indexes, InputTorus const *lwe_array_in, Torus const *lwe_input_indexes, Torus const *bootstrapping_key, pbs_buffer *buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, @@ -840,7 +851,7 @@ __host__ void host_multi_bit_programmable_bootstrap( lwe_offset += lwe_chunk_size) { // Compute a keybundle - execute_compute_keybundle( + execute_compute_keybundle( stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, level_count, lwe_offset); @@ -853,12 +864,12 @@ __host__ void host_multi_bit_programmable_bootstrap( bool is_last_iter = (j + lwe_offset) + 1 == (lwe_dimension / grouping_factor); if (is_first_iter) { - execute_step_one( + execute_step_one( stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count); } else { - execute_step_one( + execute_step_one( stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count); diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit_128.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit_128.cu index 0a3dc9ac4..d0702df53 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit_128.cu +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit_128.cu @@ -98,9 +98,9 @@ uint64_t scratch_cuda_multi_bit_programmable_bootstrap_128_vector_64( bool supports_cg = supports_cooperative_groups_on_multibit_programmable_bootstrap< - __uint128_t>(glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, - cuda_get_max_shared_memory(gpu_index)); + uint64_t, 
__uint128_t>(glwe_dimension, polynomial_size, level_count, + input_lwe_ciphertext_count, + cuda_get_max_shared_memory(gpu_index)); if (supports_cg) return scratch_cuda_cg_multi_bit_programmable_bootstrap_128( diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh index 758e7bd98..8e12342bf 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh @@ -35,12 +35,13 @@ namespace cg = cooperative_groups; * * Each y-block computes one element of the lwe_array_out. */ -template <typename Torus, class params, sharedMemDegree SMD> +template <typename InputTorus, typename Torus, class params, sharedMemDegree SMD> __global__ void device_programmable_bootstrap_tbc( Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes, const Torus *__restrict__ lut_vector, const Torus *__restrict__ lut_vector_indexes, - const Torus *__restrict__ lwe_array_in, + const InputTorus *__restrict__ lwe_array_in, const Torus *__restrict__ lwe_input_indexes, const double2 *__restrict__ bootstrapping_key, double2 *join_buffer, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, @@ -83,7 +84,7 @@ __global__ void device_programmable_bootstrap_tbc( // The third dimension of the block is used to determine on which ciphertext // this block is operating, in the case of batch bootstraps - const Torus *block_lwe_array_in = + const InputTorus *block_lwe_array_in = &lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)]; const Torus *block_lut_vector = @@ -99,8 +100,8 @@ __global__ void device_programmable_bootstrap_tbc( // Put "b" in [0, 2N[ constexpr auto log_modulus = params::log2_degree + 1; - Torus b_hat = 0; - Torus correction = 0; + InputTorus b_hat = 0; + InputTorus correction = 0; if (noise_reduction_type == PBS_MS_REDUCTION_T::CENTERED) { correction = centered_binary_modulus_switch_body_correction_to_add( block_lwe_array_in, lwe_dimension, log_modulus); @@ -117,7 +118,7 @@ __global__ void device_programmable_bootstrap_tbc( __syncthreads(); // Put "a" in [0, 2N[ - Torus a_hat = 0; + InputTorus a_hat = 0; modulus_switch(block_lwe_array_in[i], a_hat, log_modulus); // Perform ACC * (X^ä - 1) @@ -200,12 +201,13 @@ __global__ void device_programmable_bootstrap_tbc( } } -template <typename Torus, class params, sharedMemDegree SMD> +template <typename InputTorus, typename Torus, class params, sharedMemDegree SMD> __global__ void device_programmable_bootstrap_tbc_2_2_params( Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes, const Torus *__restrict__ lut_vector, const Torus *__restrict__ lut_vector_indexes, - const Torus *__restrict__ lwe_array_in, + const InputTorus *__restrict__ lwe_array_in, const Torus *__restrict__ lwe_input_indexes, const double2 *__restrict__ bootstrapping_key, double2 *join_buffer, uint32_t lwe_dimension, uint32_t num_many_lut, uint32_t lut_stride, @@ -247,7 +249,7 @@ __global__ void device_programmable_bootstrap_tbc_2_2_params( } // The third dimension of the block is used to determine on which ciphertext // this block is operating, in the case of batch bootstraps - const Torus *block_lwe_array_in = + const InputTorus *block_lwe_array_in = &lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)]; const Torus *block_lut_vector = @@ -263,8 +265,8 @@ __global__ void device_programmable_bootstrap_tbc_2_2_params( // Put "b" in [0, 2N[ constexpr auto log_modulus = params::log2_degree + 1; - Torus b_hat = 0; - Torus correction = 0; + InputTorus b_hat = 0; + InputTorus correction = 0; if (noise_reduction_type == PBS_MS_REDUCTION_T::CENTERED) { correction = 
centered_binary_modulus_switch_body_correction_to_add( block_lwe_array_in, lwe_dimension, log_modulus); @@ -276,7 +278,7 @@ __global__ void device_programmable_bootstrap_tbc_2_2_params( params::degree / params::opt>( accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat, false); - Torus temp_a_hat = 0; + InputTorus temp_a_hat = 0; for (int i = 0; i < lwe_dimension; i++) { // We calculate the modulus switch of a warp size of elements @@ -382,7 +384,7 @@ __global__ void device_programmable_bootstrap_tbc_2_2_params( cluster.sync(); } -template <typename Torus, class params> +template <typename InputTorus, typename Torus, class params> __host__ uint64_t scratch_programmable_bootstrap_tbc( cudaStream_t stream, uint32_t gpu_index, pbs_buffer **buffer, uint32_t lwe_dimension, @@ -410,27 +412,27 @@ __host__ uint64_t scratch_programmable_bootstrap_tbc( if (max_shared_memory >= full_sm + minimum_sm_tbc) { check_cuda_error(cudaFuncSetAttribute( - device_programmable_bootstrap_tbc<Torus, params, FULLSM>, + device_programmable_bootstrap_tbc<InputTorus, Torus, params, FULLSM>, cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm + minimum_sm_tbc)); cudaFuncSetCacheConfig( - device_programmable_bootstrap_tbc<Torus, params, FULLSM>, + device_programmable_bootstrap_tbc<InputTorus, Torus, params, FULLSM>, cudaFuncCachePreferShared); check_cuda_error(cudaGetLastError()); } else if (max_shared_memory >= partial_sm + minimum_sm_tbc) { check_cuda_error(cudaFuncSetAttribute( - device_programmable_bootstrap_tbc<Torus, params, PARTIALSM>, + device_programmable_bootstrap_tbc<InputTorus, Torus, params, PARTIALSM>, cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm + minimum_sm_tbc)); cudaFuncSetCacheConfig( - device_programmable_bootstrap_tbc<Torus, params, PARTIALSM>, + device_programmable_bootstrap_tbc<InputTorus, Torus, params, PARTIALSM>, cudaFuncCachePreferShared); check_cuda_error(cudaGetLastError()); } else { check_cuda_error(cudaFuncSetAttribute( - device_programmable_bootstrap_tbc<Torus, params, NOSM>, + device_programmable_bootstrap_tbc<InputTorus, Torus, params, NOSM>, cudaFuncAttributeMaxDynamicSharedMemorySize, minimum_sm_tbc)); cudaFuncSetCacheConfig( - device_programmable_bootstrap_tbc<Torus, params, NOSM>, + device_programmable_bootstrap_tbc<InputTorus, Torus, params, NOSM>, cudaFuncCachePreferShared); check_cuda_error(cudaGetLastError()); } @@ -446,11 +448,11 @@ __host__ uint64_t scratch_programmable_bootstrap_tbc( /* * Host wrapper */ -template <typename Torus, class params> +template <typename InputTorus, typename Torus, class params> __host__ void host_programmable_bootstrap_tbc( cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out, Torus const *lwe_output_indexes, Torus const *lut_vector, - Torus const *lut_vector_indexes, Torus const *lwe_array_in, + Torus const *lut_vector_indexes, InputTorus const *lwe_array_in, Torus const *lwe_input_indexes, double2 const *bootstrapping_key, pbs_buffer *buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, @@ -506,7 +508,8 @@ __host__ void host_programmable_bootstrap_tbc( config.dynamicSmemBytes = minimum_sm_tbc; check_cuda_error(cudaLaunchKernelEx( - &config, device_programmable_bootstrap_tbc<Torus, params, NOSM>, + &config, + device_programmable_bootstrap_tbc<InputTorus, Torus, params, NOSM>, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft, lwe_dimension, polynomial_size, base_log, level_count, d_mem, full_dm, @@ -515,7 +518,8 @@ __host__ void host_programmable_bootstrap_tbc( config.dynamicSmemBytes = partial_sm + minimum_sm_tbc; check_cuda_error(cudaLaunchKernelEx( - &config, device_programmable_bootstrap_tbc<Torus, params, PARTIALSM>, + &config, + device_programmable_bootstrap_tbc<InputTorus, Torus, params, PARTIALSM>, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft, lwe_dimension, polynomial_size, base_log, level_count, d_mem, @@ -530,18 +534,22 @@ __host__ void host_programmable_bootstrap_tbc( config.dynamicSmemBytes = full_sm_2_2; 
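// Illustrative sketch (names and dimensions assumed, not taken from this
// patch): the cudaLaunchKernelEx calls above pair a dynamic shared-memory
// size with a thread-block-cluster attribute. Stripped to its essentials,
// the launch pattern used throughout these host wrappers looks like this:
#include <cuda_runtime.h>

__global__ void dummy_kernel() {}

cudaError_t launch_with_cluster(dim3 grid, dim3 block, size_t dynamic_smem) {
  cudaLaunchConfig_t config = {};
  config.gridDim = grid;
  config.blockDim = block;
  config.dynamicSmemBytes = dynamic_smem;

  // Request a cluster of 2 blocks along x (illustrative size); non-portable
  // cluster sizes must be explicitly enabled on the kernel beforehand.
  cudaLaunchAttribute attrs[1];
  attrs[0].id = cudaLaunchAttributeClusterDimension;
  attrs[0].val.clusterDim.x = 2;
  attrs[0].val.clusterDim.y = 1;
  attrs[0].val.clusterDim.z = 1;
  config.attrs = attrs;
  config.numAttrs = 1;

  return cudaLaunchKernelEx(&config, dummy_kernel);
}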
check_cuda_error(cudaFuncSetAttribute( - device_programmable_bootstrap_tbc_2_2_params, + device_programmable_bootstrap_tbc_2_2_params, cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_2_2)); check_cuda_error(cudaFuncSetAttribute( - device_programmable_bootstrap_tbc_2_2_params, + device_programmable_bootstrap_tbc_2_2_params, cudaFuncAttributePreferredSharedMemoryCarveout, cudaSharedmemCarveoutMaxShared)); check_cuda_error(cudaFuncSetCacheConfig( - device_programmable_bootstrap_tbc_2_2_params, + device_programmable_bootstrap_tbc_2_2_params, cudaFuncCachePreferShared)); check_cuda_error(cudaLaunchKernelEx( &config, - device_programmable_bootstrap_tbc_2_2_params, + device_programmable_bootstrap_tbc_2_2_params, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft, lwe_dimension, num_many_lut, lut_stride, noise_reduction_type)); @@ -549,7 +557,8 @@ __host__ void host_programmable_bootstrap_tbc( config.dynamicSmemBytes = full_sm + minimum_sm_tbc; check_cuda_error(cudaLaunchKernelEx( - &config, device_programmable_bootstrap_tbc, + &config, + device_programmable_bootstrap_tbc, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft, lwe_dimension, polynomial_size, base_log, level_count, d_mem, 0, @@ -560,7 +569,7 @@ __host__ void host_programmable_bootstrap_tbc( } // Verify if the grid size satisfies the cooperative group constraints -template +template __host__ bool verify_cuda_programmable_bootstrap_tbc_grid_size( int glwe_dimension, int level_count, int num_samples, uint32_t max_shared_memory) { @@ -585,18 +594,21 @@ __host__ bool verify_cuda_programmable_bootstrap_tbc_grid_size( if (max_shared_memory < partial_sm) { cudaOccupancyMaxActiveBlocksPerMultiprocessor( &max_active_blocks_per_sm, - (void *)device_programmable_bootstrap_tbc, thds, - 0); + (void *) + device_programmable_bootstrap_tbc, + thds, 0); } else if (max_shared_memory < full_sm) { cudaOccupancyMaxActiveBlocksPerMultiprocessor( &max_active_blocks_per_sm, - (void *)device_programmable_bootstrap_tbc, + (void *)device_programmable_bootstrap_tbc, thds, partial_sm); } else { cudaOccupancyMaxActiveBlocksPerMultiprocessor( &max_active_blocks_per_sm, - (void *)device_programmable_bootstrap_tbc, thds, - full_sm); + (void *)device_programmable_bootstrap_tbc, + thds, full_sm); } // Get the number of streaming multiprocessors @@ -621,7 +633,7 @@ bool supports_distributed_shared_memory_on_classic_programmable_bootstrap( } } -template +template __host__ bool supports_thread_block_clusters_on_classic_programmable_bootstrap( uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t max_shared_memory) { @@ -661,34 +673,39 @@ __host__ bool supports_thread_block_clusters_on_classic_programmable_bootstrap( * disable cudaFuncAttributeNonPortableClusterSizeAllowed */ if (max_shared_memory < partial_sm + minimum_sm_tbc) { check_cuda_error(cudaFuncSetAttribute( - device_programmable_bootstrap_tbc, - cudaFuncAttributeNonPortableClusterSizeAllowed, false)); - check_cuda_error(cudaOccupancyMaxPotentialClusterSize( - &cluster_size, device_programmable_bootstrap_tbc, - &config)); - } else if (max_shared_memory < full_sm + minimum_sm_tbc) { - check_cuda_error(cudaFuncSetAttribute( - device_programmable_bootstrap_tbc, + device_programmable_bootstrap_tbc, cudaFuncAttributeNonPortableClusterSizeAllowed, false)); check_cuda_error(cudaOccupancyMaxPotentialClusterSize( 
&cluster_size, - device_programmable_bootstrap_tbc, &config)); + device_programmable_bootstrap_tbc, + &config)); + } else if (max_shared_memory < full_sm + minimum_sm_tbc) { + check_cuda_error(cudaFuncSetAttribute( + device_programmable_bootstrap_tbc, + cudaFuncAttributeNonPortableClusterSizeAllowed, false)); + check_cuda_error(cudaOccupancyMaxPotentialClusterSize( + &cluster_size, + device_programmable_bootstrap_tbc, + &config)); } else { if (polynomial_size == 2048 && level_count == 1 && glwe_dimension == 1) { check_cuda_error(cudaFuncSetAttribute( - device_programmable_bootstrap_tbc_2_2_params, + device_programmable_bootstrap_tbc_2_2_params, cudaFuncAttributeNonPortableClusterSizeAllowed, false)); check_cuda_error(cudaOccupancyMaxPotentialClusterSize( &cluster_size, - device_programmable_bootstrap_tbc_2_2_params, + device_programmable_bootstrap_tbc_2_2_params, &config)); } else { check_cuda_error(cudaFuncSetAttribute( - device_programmable_bootstrap_tbc, + device_programmable_bootstrap_tbc, cudaFuncAttributeNonPortableClusterSizeAllowed, false)); check_cuda_error(cudaOccupancyMaxPotentialClusterSize( &cluster_size, - device_programmable_bootstrap_tbc, &config)); + device_programmable_bootstrap_tbc, + &config)); } } diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh index 9891dfff8..29ea2ca21 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh @@ -18,13 +18,14 @@ #include "types/complex/operations.cuh" #include -template +template __global__ void __launch_bounds__(params::degree / params::opt) device_multi_bit_programmable_bootstrap_tbc_accumulate( Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes, const Torus *__restrict__ lut_vector, const Torus *__restrict__ lut_vector_indexes, - const Torus *__restrict__ lwe_array_in, + const InputTorus *__restrict__ lwe_array_in, const Torus *__restrict__ lwe_input_indexes, const double2 *__restrict__ keybundle_array, double2 *join_buffer, Torus *global_accumulator, uint32_t lwe_dimension, @@ -67,7 +68,7 @@ __global__ void __launch_bounds__(params::degree / params::opt) // The first dimension of the block is used to determine on which ciphertext // this block is operating, in the case of batch bootstraps - const Torus *block_lwe_array_in = + const InputTorus *block_lwe_array_in = &lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)]; const Torus *block_lut_vector = @@ -87,7 +88,7 @@ __global__ void __launch_bounds__(params::degree / params::opt) if (lwe_offset == 0) { // Put "b" in [0, 2N[ - Torus b_hat = 0; + InputTorus b_hat = 0; modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, params::log2_degree + 1); @@ -197,13 +198,14 @@ __global__ void __launch_bounds__(params::degree / params::opt) //- Use a register based fft that uses the minimal synchronizations //- Register based fourier domain multiplication. Transfer fft's between blocks // instead of accumulator. 
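// Illustrative sketch (not part of the patch): the modulus_switch calls in
// these kernels map a torus element stored in an InputTorus integer (now
// either 32- or 64-bit, which is the point of the InputTorus parameter) onto
// [0, 2N[. A common rounding formulation, assuming a power-of-two torus:
template <typename InputTorus>
__device__ inline void modulus_switch_sketch(InputTorus x, InputTorus &out,
                                             uint32_t log_modulus) {
  constexpr uint32_t BITS = sizeof(InputTorus) * 8;
  // Round x to the nearest multiple of 2^(BITS - log_modulus), keeping the
  // top log_modulus bits; wrap-around on overflow is the intended torus
  // arithmetic. Works identically for uint32_t and uint64_t inputs.
  out = (x + (InputTorus(1) << (BITS - log_modulus - 1))) >>
        (BITS - log_modulus);
}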
-template +template __global__ void __launch_bounds__(params::degree / params::opt) device_multi_bit_programmable_bootstrap_tbc_accumulate_2_2_params( Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes, const Torus *__restrict__ lut_vector, const Torus *__restrict__ lut_vector_indexes, - const Torus *__restrict__ lwe_array_in, + const InputTorus *__restrict__ lwe_array_in, const Torus *__restrict__ lwe_input_indexes, const double2 *__restrict__ keybundle_array, double2 *join_buffer, Torus *global_accumulator, uint32_t lwe_dimension, uint32_t lwe_offset, @@ -247,7 +249,7 @@ __global__ void __launch_bounds__(params::degree / params::opt) // The first dimension of the block is used to determine on which ciphertext // this block is operating, in the case of batch bootstraps - const Torus *block_lwe_array_in = + const InputTorus *block_lwe_array_in = &lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)]; const Torus *block_lut_vector = @@ -266,7 +268,7 @@ __global__ void __launch_bounds__(params::degree / params::opt) Torus reg_acc_rotated[params::opt]; if (lwe_offset == 0) { // Put "b" in [0, 2N[ - Torus b_hat = 0; + InputTorus b_hat = 0; modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, params::log2_degree + 1); @@ -400,7 +402,7 @@ uint64_t get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap( return sizeof(Torus) * polynomial_size * 2; // accumulator } -template +template __host__ uint64_t scratch_tbc_multi_bit_programmable_bootstrap( cudaStream_t stream, uint32_t gpu_index, pbs_buffer **buffer, uint32_t glwe_dimension, @@ -430,20 +432,22 @@ __host__ uint64_t scratch_tbc_multi_bit_programmable_bootstrap( if (max_shared_memory < full_sm_keybundle) { check_cuda_error(cudaFuncSetAttribute( - device_multi_bit_programmable_bootstrap_keybundle, + device_multi_bit_programmable_bootstrap_keybundle, cudaFuncAttributeMaxDynamicSharedMemorySize, 0)); cudaFuncSetCacheConfig( - device_multi_bit_programmable_bootstrap_keybundle, + device_multi_bit_programmable_bootstrap_keybundle, cudaFuncCachePreferShared); check_cuda_error(cudaGetLastError()); } else { check_cuda_error(cudaFuncSetAttribute( - device_multi_bit_programmable_bootstrap_keybundle, + device_multi_bit_programmable_bootstrap_keybundle, cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_keybundle)); cudaFuncSetCacheConfig( - device_multi_bit_programmable_bootstrap_keybundle, + device_multi_bit_programmable_bootstrap_keybundle, cudaFuncCachePreferShared); check_cuda_error(cudaGetLastError()); } @@ -451,58 +455,58 @@ __host__ uint64_t scratch_tbc_multi_bit_programmable_bootstrap( if (max_shared_memory < partial_sm_tbc_accumulate + minimum_sm_tbc_accumulate) { check_cuda_error(cudaFuncSetAttribute( - device_multi_bit_programmable_bootstrap_tbc_accumulate, + device_multi_bit_programmable_bootstrap_tbc_accumulate< + InputTorus, Torus, params, NOSM>, cudaFuncAttributeMaxDynamicSharedMemorySize, minimum_sm_tbc_accumulate)); cudaFuncSetCacheConfig( - device_multi_bit_programmable_bootstrap_tbc_accumulate, + device_multi_bit_programmable_bootstrap_tbc_accumulate< + InputTorus, Torus, params, NOSM>, cudaFuncCachePreferShared); check_cuda_error(cudaGetLastError()); } else if (max_shared_memory < full_sm_tbc_accumulate + minimum_sm_tbc_accumulate) { check_cuda_error(cudaFuncSetAttribute( - device_multi_bit_programmable_bootstrap_tbc_accumulate, + device_multi_bit_programmable_bootstrap_tbc_accumulate< + InputTorus, Torus, params, PARTIALSM>, cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm_tbc_accumulate + 
minimum_sm_tbc_accumulate)); cudaFuncSetCacheConfig( - device_multi_bit_programmable_bootstrap_tbc_accumulate, + device_multi_bit_programmable_bootstrap_tbc_accumulate< + InputTorus, Torus, params, PARTIALSM>, cudaFuncCachePreferShared); check_cuda_error(cudaGetLastError()); } else { if (polynomial_size == 2048 && level_count == 1 && glwe_dimension == 1) { check_cuda_error(cudaFuncSetAttribute( device_multi_bit_programmable_bootstrap_tbc_accumulate_2_2_params< - Torus, params, FULLSM>, + InputTorus, Torus, params, FULLSM>, cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_tbc_accumulate + 2 * minimum_sm_tbc_accumulate)); check_cuda_error(cudaFuncSetAttribute( device_multi_bit_programmable_bootstrap_tbc_accumulate_2_2_params< - Torus, params, FULLSM>, + InputTorus, Torus, params, FULLSM>, cudaFuncAttributePreferredSharedMemoryCarveout, cudaSharedmemCarveoutMaxShared)); check_cuda_error(cudaFuncSetCacheConfig( device_multi_bit_programmable_bootstrap_tbc_accumulate_2_2_params< - Torus, params, FULLSM>, + InputTorus, Torus, params, FULLSM>, cudaFuncCachePreferShared)); } else { check_cuda_error(cudaFuncSetAttribute( - device_multi_bit_programmable_bootstrap_tbc_accumulate, + device_multi_bit_programmable_bootstrap_tbc_accumulate< + InputTorus, Torus, params, FULLSM>, cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_tbc_accumulate + minimum_sm_tbc_accumulate)); cudaFuncSetCacheConfig( - device_multi_bit_programmable_bootstrap_tbc_accumulate, + device_multi_bit_programmable_bootstrap_tbc_accumulate< + InputTorus, Torus, params, FULLSM>, cudaFuncCachePreferShared); } check_cuda_error(cudaGetLastError()); } - auto lwe_chunk_size = get_lwe_chunk_size( + auto lwe_chunk_size = get_lwe_chunk_size( gpu_index, input_lwe_ciphertext_count, polynomial_size, glwe_dimension, level_count, full_sm_keybundle); uint64_t size_tracker = 0; @@ -513,10 +517,10 @@ __host__ uint64_t scratch_tbc_multi_bit_programmable_bootstrap( return size_tracker; } -template +template __host__ void execute_tbc_external_product_loop( cudaStream_t stream, uint32_t gpu_index, Torus const *lut_vector, - Torus const *lut_vector_indexes, Torus const *lwe_array_in, + Torus const *lut_vector_indexes, InputTorus const *lwe_array_in, Torus const *lwe_input_indexes, Torus *lwe_array_out, Torus const *lwe_output_indexes, pbs_buffer *buffer, uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension, @@ -579,8 +583,8 @@ __host__ void execute_tbc_external_product_loop( config.dynamicSmemBytes = minimum_dm; check_cuda_error(cudaLaunchKernelEx( &config, - device_multi_bit_programmable_bootstrap_tbc_accumulate, + device_multi_bit_programmable_bootstrap_tbc_accumulate< + InputTorus, Torus, params, NOSM>, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, keybundle_fft, buffer_fft, global_accumulator, lwe_dimension, glwe_dimension, polynomial_size, @@ -591,8 +595,8 @@ __host__ void execute_tbc_external_product_loop( config.dynamicSmemBytes = partial_dm + minimum_dm; check_cuda_error(cudaLaunchKernelEx( &config, - device_multi_bit_programmable_bootstrap_tbc_accumulate, + device_multi_bit_programmable_bootstrap_tbc_accumulate< + InputTorus, Torus, params, PARTIALSM>, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, keybundle_fft, buffer_fft, global_accumulator, lwe_dimension, glwe_dimension, polynomial_size, @@ -607,22 +611,22 @@ __host__ void execute_tbc_external_product_loop( config.dynamicSmemBytes = full_dm + 2 * minimum_dm; 
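// Illustrative sketch (helper name and enum are assumptions, not from this
// patch): the NOSM / PARTIALSM / FULLSM branches above all follow one
// pattern: query the opt-in shared-memory limit once, then pick the largest
// kernel variant whose working set fits.
#include <cuda_runtime.h>

enum SharedMemTier { TIER_NOSM, TIER_PARTIALSM, TIER_FULLSM };

SharedMemTier pick_tier(int device, size_t partial_dm, size_t full_dm,
                        size_t minimum_dm) {
  int max_sm = 0;
  cudaDeviceGetAttribute(&max_sm, cudaDevAttrMaxSharedMemoryPerBlockOptin,
                         device);
  if ((size_t)max_sm < partial_dm + minimum_dm)
    return TIER_NOSM;      // everything through global-memory scratch (d_mem)
  if ((size_t)max_sm < full_dm + minimum_dm)
    return TIER_PARTIALSM; // accumulator in shared memory, rest in d_mem
  return TIER_FULLSM;      // whole working set in dynamic shared memory
}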
check_cuda_error(cudaFuncSetAttribute( device_multi_bit_programmable_bootstrap_tbc_accumulate_2_2_params< - Torus, params, FULLSM>, + InputTorus, Torus, params, FULLSM>, cudaFuncAttributeMaxDynamicSharedMemorySize, full_dm + 2 * minimum_dm)); check_cuda_error(cudaFuncSetAttribute( device_multi_bit_programmable_bootstrap_tbc_accumulate_2_2_params< - Torus, params, FULLSM>, + InputTorus, Torus, params, FULLSM>, cudaFuncAttributePreferredSharedMemoryCarveout, cudaSharedmemCarveoutMaxShared)); check_cuda_error(cudaFuncSetCacheConfig( device_multi_bit_programmable_bootstrap_tbc_accumulate_2_2_params< - Torus, params, FULLSM>, + InputTorus, Torus, params, FULLSM>, cudaFuncCachePreferShared)); check_cuda_error(cudaLaunchKernelEx( &config, device_multi_bit_programmable_bootstrap_tbc_accumulate_2_2_params< - Torus, params, FULLSM>, + InputTorus, Torus, params, FULLSM>, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, keybundle_fft, buffer_fft, global_accumulator, lwe_dimension, lwe_offset, chunk_size, @@ -630,8 +634,8 @@ __host__ void execute_tbc_external_product_loop( } else { check_cuda_error(cudaLaunchKernelEx( &config, - device_multi_bit_programmable_bootstrap_tbc_accumulate, + device_multi_bit_programmable_bootstrap_tbc_accumulate< + InputTorus, Torus, params, FULLSM>, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, keybundle_fft, buffer_fft, global_accumulator, lwe_dimension, glwe_dimension, polynomial_size, @@ -642,11 +646,11 @@ __host__ void execute_tbc_external_product_loop( } } -template +template __host__ void host_tbc_multi_bit_programmable_bootstrap( cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out, Torus const *lwe_output_indexes, Torus const *lut_vector, - Torus const *lut_vector_indexes, Torus const *lwe_array_in, + Torus const *lut_vector_indexes, InputTorus const *lwe_array_in, Torus const *lwe_input_indexes, Torus const *bootstrapping_key, pbs_buffer *buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, @@ -659,13 +663,13 @@ __host__ void host_tbc_multi_bit_programmable_bootstrap( lwe_offset += lwe_chunk_size) { // Compute a keybundle - execute_compute_keybundle( + execute_compute_keybundle( stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, level_count, lwe_offset); // Accumulate - execute_tbc_external_product_loop( + execute_tbc_external_product_loop( stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size, @@ -690,7 +694,7 @@ bool supports_distributed_shared_memory_on_multibit_programmable_bootstrap( } } -template +template __host__ bool supports_thread_block_clusters_on_multibit_programmable_bootstrap( uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t max_shared_memory) { @@ -732,45 +736,45 @@ __host__ bool supports_thread_block_clusters_on_multibit_programmable_bootstrap( if (max_shared_memory < partial_sm_tbc_accumulate + minimum_sm_tbc_accumulate) { check_cuda_error(cudaFuncSetAttribute( - device_multi_bit_programmable_bootstrap_tbc_accumulate, + device_multi_bit_programmable_bootstrap_tbc_accumulate< + InputTorus, Torus, params, NOSM>, cudaFuncAttributeNonPortableClusterSizeAllowed, false)); 
check_cuda_error(cudaOccupancyMaxPotentialClusterSize( &cluster_size, - device_multi_bit_programmable_bootstrap_tbc_accumulate, + device_multi_bit_programmable_bootstrap_tbc_accumulate< + InputTorus, Torus, params, NOSM>, &config)); } else if (max_shared_memory < full_sm_tbc_accumulate + minimum_sm_tbc_accumulate) { check_cuda_error(cudaFuncSetAttribute( - device_multi_bit_programmable_bootstrap_tbc_accumulate, + device_multi_bit_programmable_bootstrap_tbc_accumulate< + InputTorus, Torus, params, PARTIALSM>, cudaFuncAttributeNonPortableClusterSizeAllowed, false)); check_cuda_error(cudaOccupancyMaxPotentialClusterSize( &cluster_size, - device_multi_bit_programmable_bootstrap_tbc_accumulate, + device_multi_bit_programmable_bootstrap_tbc_accumulate< + InputTorus, Torus, params, PARTIALSM>, &config)); } else { if (polynomial_size == 2048 && level_count == 1 && glwe_dimension == 1) { check_cuda_error(cudaFuncSetAttribute( device_multi_bit_programmable_bootstrap_tbc_accumulate_2_2_params< - Torus, params, FULLSM>, + InputTorus, Torus, params, FULLSM>, cudaFuncAttributeNonPortableClusterSizeAllowed, false)); check_cuda_error(cudaOccupancyMaxPotentialClusterSize( &cluster_size, device_multi_bit_programmable_bootstrap_tbc_accumulate_2_2_params< - Torus, params, FULLSM>, + InputTorus, Torus, params, FULLSM>, &config)); } else { check_cuda_error(cudaFuncSetAttribute( - device_multi_bit_programmable_bootstrap_tbc_accumulate, + device_multi_bit_programmable_bootstrap_tbc_accumulate< + InputTorus, Torus, params, FULLSM>, cudaFuncAttributeNonPortableClusterSizeAllowed, false)); check_cuda_error(cudaOccupancyMaxPotentialClusterSize( &cluster_size, - device_multi_bit_programmable_bootstrap_tbc_accumulate, + device_multi_bit_programmable_bootstrap_tbc_accumulate< + InputTorus, Torus, params, FULLSM>, &config)); } } diff --git a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_pbs.cpp b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_pbs.cpp index 7196b9210..8ad6ab768 100644 --- a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_pbs.cpp +++ b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_pbs.cpp @@ -166,14 +166,15 @@ public: #if CUDA_ARCH >= 900 BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, TbcMultiBit) (benchmark::State &st) { - if (!has_support_to_cuda_programmable_bootstrap_tbc_multi_bit( + if (!has_support_to_cuda_programmable_bootstrap_tbc_multi_bit( input_lwe_ciphertext_count, glwe_dimension, polynomial_size, pbs_level, cuda_get_max_shared_memory(0))) { st.SkipWithError("Configuration not supported for tbc operation"); return; } - scratch_cuda_tbc_multi_bit_programmable_bootstrap( + scratch_cuda_tbc_multi_bit_programmable_bootstrap( stream, gpu_index, (pbs_buffer **)&buffer, glwe_dimension, polynomial_size, pbs_level, input_lwe_ciphertext_count, true); @@ -181,7 +182,8 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, TbcMultiBit) uint32_t lut_stride = 0; for (auto _ : st) { // Execute PBS - cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( + cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( stream, gpu_index, d_lwe_ct_out_array, d_lwe_output_indexes, d_lut_pbs_identity, d_lut_pbs_indexes, d_lwe_ct_in_array, d_lwe_input_indexes, d_bsk, (pbs_buffer *)buffer, @@ -197,7 +199,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, TbcMultiBit) BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, CgMultiBit) (benchmark::State &st) { - if (!has_support_to_cuda_programmable_bootstrap_cg_multi_bit( + if 
(!has_support_to_cuda_programmable_bootstrap_cg_multi_bit_64( glwe_dimension, polynomial_size, pbs_level, input_lwe_ciphertext_count, cuda_get_max_shared_memory(gpu_index))) { st.SkipWithError("Configuration not supported for fast operation"); @@ -254,7 +256,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, DefaultMultiBit) #if CUDA_ARCH >= 900 BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, TbcPBC) (benchmark::State &st) { - if (!has_support_to_cuda_programmable_bootstrap_tbc( + if (!has_support_to_cuda_programmable_bootstrap_tbc( input_lwe_ciphertext_count, glwe_dimension, polynomial_size, pbs_level, cuda_get_max_shared_memory(0))) { st.SkipWithError("Configuration not supported for tbc operation"); @@ -269,7 +271,7 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, TbcPBC) uint32_t lut_stride = 0; for (auto _ : st) { // Execute PBS - cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( + cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( stream, gpu_index, (uint64_t *)d_lwe_ct_out_array, (uint64_t *)d_lwe_output_indexes, (uint64_t *)d_lut_pbs_identity, (uint64_t *)d_lut_pbs_indexes, (uint64_t *)d_lwe_ct_in_array, @@ -301,7 +303,7 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, CgPBS) uint32_t lut_stride = 0; for (auto _ : st) { // Execute PBS - cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( + cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( stream, gpu_index, (uint64_t *)d_lwe_ct_out_array, (uint64_t *)d_lwe_output_indexes, (uint64_t *)d_lut_pbs_identity, (uint64_t *)d_lut_pbs_indexes, (uint64_t *)d_lwe_ct_in_array, diff --git a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_classical_pbs.cpp b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_classical_pbs.cpp index f25653fcd..f671271bd 100644 --- a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_classical_pbs.cpp +++ b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_classical_pbs.cpp @@ -187,7 +187,7 @@ TEST_P(ClassicalProgrammableBootstrapTestPrimitives_u64, bootstrap) { (ptrdiff_t)((r * samples * number_of_inputs + s * number_of_inputs) * (lwe_dimension + 1)); // Execute PBS - cuda_programmable_bootstrap_lwe_ciphertext_vector_64( + cuda_programmable_bootstrap_lwe_ciphertext_vector_64_64( stream, gpu_index, (void *)d_lwe_ct_out_array, (void *)d_lwe_output_indexes, (void *)d_lut_pbs_identity, (void *)d_lut_pbs_indexes, (void *)d_lwe_ct_in, diff --git a/backends/tfhe-cuda-backend/src/bindings.rs b/backends/tfhe-cuda-backend/src/bindings.rs index d58512a03..5eed4f0f8 100644 --- a/backends/tfhe-cuda-backend/src/bindings.rs +++ b/backends/tfhe-cuda-backend/src/bindings.rs @@ -3020,6 +3020,20 @@ unsafe extern "C" { noise_reduction_type: PBS_MS_REDUCTION_T, ) -> u64; } +unsafe extern "C" { + pub fn scratch_cuda_programmable_bootstrap_32_64( + stream: *mut ffi::c_void, + gpu_index: u32, + buffer: *mut *mut i8, + lwe_dimension: u32, + glwe_dimension: u32, + polynomial_size: u32, + level_count: u32, + input_lwe_ciphertext_count: u32, + allocate_gpu_memory: bool, + noise_reduction_type: PBS_MS_REDUCTION_T, + ) -> u64; +} unsafe extern "C" { pub fn scratch_cuda_programmable_bootstrap_128( stream: *mut ffi::c_void, @@ -3035,7 +3049,7 @@ unsafe extern "C" { ) -> u64; } unsafe extern "C" { - pub fn cuda_programmable_bootstrap_lwe_ciphertext_vector_32( + pub fn cuda_programmable_bootstrap_lwe_ciphertext_vector_64_64( stream: *mut ffi::c_void, gpu_index: u32, lwe_array_out: *mut ffi::c_void, @@ -3057,7 +3071,7 @@ unsafe extern "C" { ); } unsafe extern "C" { - pub fn 
cuda_programmable_bootstrap_lwe_ciphertext_vector_64( + pub fn cuda_programmable_bootstrap_lwe_ciphertext_vector_32_64( stream: *mut ffi::c_void, gpu_index: u32, lwe_array_out: *mut ffi::c_void, @@ -3110,6 +3124,7 @@ unsafe extern "C" { ); } unsafe extern "C" { + #[link_name = "\u{1}_Z55has_support_to_cuda_programmable_bootstrap_cg_multi_bitjjjjj"] pub fn has_support_to_cuda_programmable_bootstrap_cg_multi_bit( glwe_dimension: u32, polynomial_size: u32, @@ -3144,6 +3159,18 @@ unsafe extern "C" { grouping_factor: u32, ); } +unsafe extern "C" { + pub fn scratch_cuda_multi_bit_programmable_bootstrap_32_64( + stream: *mut ffi::c_void, + gpu_index: u32, + pbs_buffer: *mut *mut i8, + glwe_dimension: u32, + polynomial_size: u32, + level_count: u32, + input_lwe_ciphertext_count: u32, + allocate_gpu_memory: bool, + ) -> u64; +} unsafe extern "C" { pub fn scratch_cuda_multi_bit_programmable_bootstrap_64( stream: *mut ffi::c_void, @@ -3179,6 +3206,29 @@ unsafe extern "C" { lut_stride: u32, ); } +unsafe extern "C" { + pub fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_32_64( + stream: *mut ffi::c_void, + gpu_index: u32, + lwe_array_out: *mut ffi::c_void, + lwe_output_indexes: *const ffi::c_void, + lut_vector: *const ffi::c_void, + lut_vector_indexes: *const ffi::c_void, + lwe_array_in: *const ffi::c_void, + lwe_input_indexes: *const ffi::c_void, + bootstrapping_key: *const ffi::c_void, + buffer: *mut i8, + lwe_dimension: u32, + glwe_dimension: u32, + polynomial_size: u32, + grouping_factor: u32, + base_log: u32, + level_count: u32, + num_samples: u32, + num_many_lut: u32, + lut_stride: u32, + ); +} unsafe extern "C" { pub fn cleanup_cuda_multi_bit_programmable_bootstrap( stream: *mut ffi::c_void, diff --git a/tfhe-benchmark/benches/core_crypto/ks_bench.rs b/tfhe-benchmark/benches/core_crypto/ks_bench.rs index 285a2019a..0c416ebf2 100644 --- a/tfhe-benchmark/benches/core_crypto/ks_bench.rs +++ b/tfhe-benchmark/benches/core_crypto/ks_bench.rs @@ -415,7 +415,7 @@ mod cuda { let big_lwe_sk_64 = glwe_sk_64.into_lwe_secret_key(); let ciphertext_modulus_out = CiphertextModulus::::new_native(); - let cpu_keys: CpuKeys<_> = CpuKeysBuilder::new() + let cpu_keys: CpuKeys = CpuKeysBuilder::new() .keyswitch_key(ksk_big_to_small) .build(); @@ -667,7 +667,8 @@ mod cuda { &mut encryption_generator, ); - let cpu_keys: CpuKeys<_> = CpuKeysBuilder::new().packing_keyswitch_key(pksk).build(); + let cpu_keys: CpuKeys = + CpuKeysBuilder::new().packing_keyswitch_key(pksk).build(); let bench_id; match get_bench_type() { diff --git a/tfhe-benchmark/benches/core_crypto/ks_pbs_bench.rs b/tfhe-benchmark/benches/core_crypto/ks_pbs_bench.rs index f70d5b7b0..fc79f6984 100644 --- a/tfhe-benchmark/benches/core_crypto/ks_pbs_bench.rs +++ b/tfhe-benchmark/benches/core_crypto/ks_pbs_bench.rs @@ -511,12 +511,19 @@ mod cuda { }; use tfhe::core_crypto::prelude::*; - fn cuda_ks_pbs + CastFrom + Serialize>( + fn cuda_ks_pbs< + Scalar: UnsignedTorus + CastInto + CastFrom + Serialize, + KeyswitchScalar: UnsignedTorus + CastFrom + CastInto, + >( c: &mut Criterion, parameters: &[(String, CryptoParametersRecord)], ) { - let bench_name = "core_crypto::cuda::ks_pbs"; - let mut bench_group = c.benchmark_group(bench_name); + let bench_name = format!( + "core_crypto::cuda::ks_pbs::ks{}::pbs{}", + KeyswitchScalar::BITS, + Scalar::BITS + ); + let mut bench_group = c.benchmark_group(bench_name.clone()); bench_group .sample_size(10) .measurement_time(std::time::Duration::from_secs(30)); @@ -535,29 +542,57 @@ mod cuda { continue; } - // 
Create the LweSecretKey - let input_lwe_secret_key = allocate_and_generate_new_binary_lwe_secret_key( - params.lwe_dimension.unwrap(), - &mut secret_generator, - ); - let output_glwe_secret_key: GlweSecretKeyOwned = + let lwe_noise_distribution_ksk = match params.lwe_noise_distribution.unwrap() { + DynamicDistribution::Gaussian(gaussian_lwe_noise_distribution) => { + DynamicDistribution::::new_gaussian( + gaussian_lwe_noise_distribution.standard_dev(), + ) + } + DynamicDistribution::TUniform(uniform_lwe_noise_distribution) => { + DynamicDistribution::::new_t_uniform( + match KeyswitchScalar::BITS { + 32 => uniform_lwe_noise_distribution.bound_log2() - 32, + 64 => uniform_lwe_noise_distribution.bound_log2(), + _ => panic!("Unsupported Keyswitch scalar input dtype"), + }, + ) + } + }; + + let lwe_sk: LweSecretKeyOwned = + allocate_and_generate_new_binary_lwe_secret_key( + params.lwe_dimension.unwrap(), + &mut secret_generator, + ); + + let glwe_sk: GlweSecretKeyOwned = allocate_and_generate_new_binary_glwe_secret_key( params.glwe_dimension.unwrap(), params.polynomial_size.unwrap(), &mut secret_generator, ); - let output_lwe_secret_key = output_glwe_secret_key.into_lwe_secret_key(); + let big_lwe_sk = glwe_sk.into_lwe_secret_key(); let ksk_big_to_small = allocate_and_generate_new_lwe_keyswitch_key( - &output_lwe_secret_key, - &input_lwe_secret_key, + &big_lwe_sk, + &lwe_sk, params.ks_base_log.unwrap(), params.ks_level.unwrap(), - params.lwe_noise_distribution.unwrap(), + lwe_noise_distribution_ksk, CiphertextModulus::new_native(), &mut encryption_generator, ); + let glwe_sk_64: GlweSecretKeyOwned = + allocate_and_generate_new_binary_glwe_secret_key( + params.glwe_dimension.unwrap(), + params.polynomial_size.unwrap(), + &mut secret_generator, + ); + + let big_lwe_sk_64 = glwe_sk_64.into_lwe_secret_key(); + let ciphertext_modulus_out = CiphertextModulus::::new_native(); + let bsk = LweBootstrapKey::new( Scalar::ZERO, params.glwe_dimension.unwrap().to_glwe_size(), @@ -568,7 +603,7 @@ mod cuda { params.ciphertext_modulus.unwrap(), ); - let cpu_keys: CpuKeys<_> = CpuKeysBuilder::new() + let cpu_keys: CpuKeys<_, _> = CpuKeysBuilder::new() .keyswitch_key(ksk_big_to_small) .bootstrap_key(bsk) .build(); @@ -582,7 +617,7 @@ mod cuda { // Allocate a new LweCiphertext and encrypt our plaintext let input_ks_ct = allocate_and_encrypt_new_lwe_ciphertext( - &output_lwe_secret_key, + &big_lwe_sk_64, Plaintext(Scalar::ZERO), params.lwe_noise_distribution.unwrap(), params.ciphertext_modulus.unwrap(), @@ -591,10 +626,10 @@ mod cuda { let input_ks_ct_gpu = CudaLweCiphertextList::from_lwe_ciphertext(&input_ks_ct, &streams); - let output_ks_ct: LweCiphertextOwned = LweCiphertext::new( - Scalar::ZERO, - input_lwe_secret_key.lwe_dimension().to_lwe_size(), - params.ciphertext_modulus.unwrap(), + let output_ks_ct: LweCiphertextOwned = LweCiphertext::new( + KeyswitchScalar::ZERO, + lwe_sk.lwe_dimension().to_lwe_size(), + ciphertext_modulus_out, ); let mut output_ks_ct_gpu = CudaLweCiphertextList::from_lwe_ciphertext(&output_ks_ct, &streams); @@ -611,7 +646,7 @@ mod cuda { // Allocate the LweCiphertext to store the result of the PBS let output_pbs_ct = LweCiphertext::new( Scalar::ZERO, - output_lwe_secret_key.lwe_dimension().to_lwe_size(), + big_lwe_sk_64.lwe_dimension().to_lwe_size(), params.ciphertext_modulus.unwrap(), ); let mut output_pbs_ct_gpu = @@ -671,12 +706,12 @@ mod cuda { .map(|i| { let mut input_ks_list = LweCiphertextList::new( Scalar::ZERO, - output_lwe_secret_key.lwe_dimension().to_lwe_size(), + 
big_lwe_sk_64.lwe_dimension().to_lwe_size(), LweCiphertextCount(elements_per_stream), params.ciphertext_modulus.unwrap(), ); encrypt_lwe_ciphertext_list( - &output_lwe_secret_key, + &big_lwe_sk_64, &mut input_ks_list, &plaintext_list, params.lwe_noise_distribution.unwrap(), @@ -692,10 +727,10 @@ mod cuda { let output_ks_cts = (0..gpu_count) .map(|i| { let output_ks_list = LweCiphertextList::new( - Scalar::ZERO, - input_lwe_secret_key.lwe_dimension().to_lwe_size(), + KeyswitchScalar::ZERO, + lwe_sk.lwe_dimension().to_lwe_size(), LweCiphertextCount(elements_per_stream), - params.ciphertext_modulus.unwrap(), + ciphertext_modulus_out, ); CudaLweCiphertextList::from_lwe_ciphertext_list( &output_ks_list, @@ -724,7 +759,7 @@ mod cuda { .map(|i| { let output_pbs_ct = LweCiphertextList::new( Scalar::ZERO, - output_lwe_secret_key.lwe_dimension().to_lwe_size(), + big_lwe_sk_64.lwe_dimension().to_lwe_size(), LweCiphertextCount(elements_per_stream), params.ciphertext_modulus.unwrap(), ); @@ -822,12 +857,18 @@ mod cuda { fn cuda_multi_bit_ks_pbs< Scalar: UnsignedTorus + CastInto + CastFrom + Default + Serialize + Sync, + KeyswitchScalar: UnsignedTorus + CastFrom + CastInto, >( c: &mut Criterion, parameters: &[(String, CryptoParametersRecord, LweBskGroupingFactor)], ) { - let bench_name = "core_crypto::cuda::multi_bit_ks_pbs"; - let mut bench_group = c.benchmark_group(bench_name); + let bench_name = format!( + "core_crypto::cuda::multi_bit_ks_pbs::ks_pbs::ks{}::pbs{}", + KeyswitchScalar::BITS, + Scalar::BITS + ); + + let mut bench_group = c.benchmark_group(bench_name.clone()); bench_group .sample_size(10) .measurement_time(std::time::Duration::from_secs(30)); @@ -846,29 +887,57 @@ mod cuda { continue; } - // Create the LweSecretKey - let input_lwe_secret_key = allocate_and_generate_new_binary_lwe_secret_key( - params.lwe_dimension.unwrap(), - &mut secret_generator, - ); - let output_glwe_secret_key: GlweSecretKeyOwned = + let lwe_noise_distribution_ksk = match params.lwe_noise_distribution.unwrap() { + DynamicDistribution::Gaussian(gaussian_lwe_noise_distribution) => { + DynamicDistribution::::new_gaussian( + gaussian_lwe_noise_distribution.standard_dev(), + ) + } + DynamicDistribution::TUniform(uniform_lwe_noise_distribution) => { + DynamicDistribution::::new_t_uniform( + match KeyswitchScalar::BITS { + 32 => uniform_lwe_noise_distribution.bound_log2() - 32, + 64 => uniform_lwe_noise_distribution.bound_log2(), + _ => panic!("Unsupported Keyswitch scalar input dtype"), + }, + ) + } + }; + + let lwe_sk: LweSecretKeyOwned = + allocate_and_generate_new_binary_lwe_secret_key( + params.lwe_dimension.unwrap(), + &mut secret_generator, + ); + + let glwe_sk: GlweSecretKeyOwned = allocate_and_generate_new_binary_glwe_secret_key( params.glwe_dimension.unwrap(), params.polynomial_size.unwrap(), &mut secret_generator, ); - let output_lwe_secret_key = output_glwe_secret_key.into_lwe_secret_key(); + let big_lwe_sk = glwe_sk.into_lwe_secret_key(); let ksk_big_to_small = allocate_and_generate_new_lwe_keyswitch_key( - &output_lwe_secret_key, - &input_lwe_secret_key, + &big_lwe_sk, + &lwe_sk, params.ks_base_log.unwrap(), params.ks_level.unwrap(), - params.lwe_noise_distribution.unwrap(), + lwe_noise_distribution_ksk, CiphertextModulus::new_native(), &mut encryption_generator, ); + let glwe_sk_64: GlweSecretKeyOwned = + allocate_and_generate_new_binary_glwe_secret_key( + params.glwe_dimension.unwrap(), + params.polynomial_size.unwrap(), + &mut secret_generator, + ); + + let big_lwe_sk_64 = glwe_sk_64.into_lwe_secret_key(); 
+ let ciphertext_modulus_out = CiphertextModulus::::new_native(); + let multi_bit_bsk = LweMultiBitBootstrapKey::new( Scalar::ZERO, params.glwe_dimension.unwrap().to_glwe_size(), @@ -880,7 +949,7 @@ mod cuda { params.ciphertext_modulus.unwrap(), ); - let cpu_keys: CpuKeys<_> = CpuKeysBuilder::new() + let cpu_keys: CpuKeys<_, _> = CpuKeysBuilder::new() .keyswitch_key(ksk_big_to_small) .multi_bit_bootstrap_key(multi_bit_bsk) .build(); @@ -894,7 +963,7 @@ mod cuda { // Allocate a new LweCiphertext and encrypt our plaintext let input_ks_ct = allocate_and_encrypt_new_lwe_ciphertext( - &output_lwe_secret_key, + &big_lwe_sk_64, Plaintext(Scalar::ZERO), params.lwe_noise_distribution.unwrap(), params.ciphertext_modulus.unwrap(), @@ -903,10 +972,10 @@ mod cuda { let input_ks_ct_gpu = CudaLweCiphertextList::from_lwe_ciphertext(&input_ks_ct, &streams); - let output_ks_ct: LweCiphertextOwned = LweCiphertext::new( - Scalar::ZERO, - input_lwe_secret_key.lwe_dimension().to_lwe_size(), - params.ciphertext_modulus.unwrap(), + let output_ks_ct: LweCiphertextOwned = LweCiphertext::new( + KeyswitchScalar::ZERO, + lwe_sk.lwe_dimension().to_lwe_size(), + ciphertext_modulus_out, ); let mut output_ks_ct_gpu = CudaLweCiphertextList::from_lwe_ciphertext(&output_ks_ct, &streams); @@ -923,7 +992,7 @@ mod cuda { // Allocate the LweCiphertext to store the result of the PBS let output_pbs_ct = LweCiphertext::new( Scalar::ZERO, - output_lwe_secret_key.lwe_dimension().to_lwe_size(), + big_lwe_sk_64.lwe_dimension().to_lwe_size(), params.ciphertext_modulus.unwrap(), ); let mut output_pbs_ct_gpu = @@ -981,12 +1050,12 @@ mod cuda { .map(|i| { let mut input_ks_list = LweCiphertextList::new( Scalar::ZERO, - output_lwe_secret_key.lwe_dimension().to_lwe_size(), + big_lwe_sk_64.lwe_dimension().to_lwe_size(), LweCiphertextCount(elements_per_stream), params.ciphertext_modulus.unwrap(), ); encrypt_lwe_ciphertext_list( - &output_lwe_secret_key, + &big_lwe_sk_64, &mut input_ks_list, &plaintext_list, params.lwe_noise_distribution.unwrap(), @@ -1002,10 +1071,10 @@ mod cuda { let output_ks_cts = (0..gpu_count) .map(|i| { let output_ks_list = LweCiphertextList::new( - Scalar::ZERO, - input_lwe_secret_key.lwe_dimension().to_lwe_size(), + KeyswitchScalar::ZERO, + lwe_sk.lwe_dimension().to_lwe_size(), LweCiphertextCount(elements_per_stream), - params.ciphertext_modulus.unwrap(), + ciphertext_modulus_out, ); CudaLweCiphertextList::from_lwe_ciphertext_list( &output_ks_list, @@ -1034,7 +1103,7 @@ mod cuda { .map(|i| { let output_pbs_ct = LweCiphertextList::new( Scalar::ZERO, - output_lwe_secret_key.lwe_dimension().to_lwe_size(), + big_lwe_sk_64.lwe_dimension().to_lwe_size(), LweCiphertextCount(elements_per_stream), params.ciphertext_modulus.unwrap(), ); @@ -1132,12 +1201,17 @@ mod cuda { pub fn cuda_ks_pbs_group() { let mut criterion: Criterion<_> = (Criterion::default()).configure_from_args(); - cuda_ks_pbs(&mut criterion, &benchmark_parameters()); + cuda_ks_pbs::(&mut criterion, &benchmark_parameters()); + cuda_ks_pbs::(&mut criterion, &benchmark_parameters()); } pub fn cuda_multi_bit_ks_pbs_group() { let mut criterion: Criterion<_> = (Criterion::default()).configure_from_args(); - cuda_multi_bit_ks_pbs( + cuda_multi_bit_ks_pbs::( + &mut criterion, + &multi_bit_benchmark_parameters_with_grouping(), + ); + cuda_multi_bit_ks_pbs::( &mut criterion, &multi_bit_benchmark_parameters_with_grouping(), ); diff --git a/tfhe-benchmark/benches/core_crypto/pbs128_bench.rs b/tfhe-benchmark/benches/core_crypto/pbs128_bench.rs index 3c7503252..e0f6a9d7d 100644 
--- a/tfhe-benchmark/benches/core_crypto/pbs128_bench.rs +++ b/tfhe-benchmark/benches/core_crypto/pbs128_bench.rs @@ -250,7 +250,7 @@ mod cuda { } }; - let cpu_keys: CpuKeys<_> = CpuKeysBuilder::new().bootstrap_key(bsk).build(); + let cpu_keys: CpuKeys<_, u64> = CpuKeysBuilder::new().bootstrap_key(bsk).build(); let message_modulus: u64 = 1 << 4; let input_message: u64 = 3; @@ -493,7 +493,7 @@ mod cuda { squash_params.ciphertext_modulus, ); - let cpu_keys: CpuKeys<_> = CpuKeysBuilder::new() + let cpu_keys: CpuKeys<_, u64> = CpuKeysBuilder::new() .multi_bit_bootstrap_key(multi_bit_bsk) .build(); diff --git a/tfhe-benchmark/benches/core_crypto/pbs_bench.rs b/tfhe-benchmark/benches/core_crypto/pbs_bench.rs index 4e8044bd3..d00554e9e 100644 --- a/tfhe-benchmark/benches/core_crypto/pbs_bench.rs +++ b/tfhe-benchmark/benches/core_crypto/pbs_bench.rs @@ -978,7 +978,8 @@ mod cuda { params.ciphertext_modulus.unwrap(), ); - let cpu_keys: CpuKeys<_> = CpuKeysBuilder::new().bootstrap_key(bsk).build(); + let cpu_keys: CpuKeys = + CpuKeysBuilder::new().bootstrap_key(bsk).build(); let bench_id; @@ -1236,7 +1237,7 @@ mod cuda { params.ciphertext_modulus.unwrap(), ); - let cpu_keys: CpuKeys<_> = CpuKeysBuilder::new() + let cpu_keys: CpuKeys = CpuKeysBuilder::new() .multi_bit_bootstrap_key(multi_bit_bsk) .build(); diff --git a/tfhe-benchmark/src/utilities.rs b/tfhe-benchmark/src/utilities.rs index fc16e098c..5c71e69fe 100644 --- a/tfhe-benchmark/src/utilities.rs +++ b/tfhe-benchmark/src/utilities.rs @@ -575,28 +575,28 @@ mod cuda_utils { } /// Computing keys in their CPU flavor. - pub struct CpuKeys { - ksk: Option>, + pub struct CpuKeys { + ksk: Option>, pksk: Option>, bsk: Option>, multi_bit_bsk: Option>, } - impl CpuKeys { - pub fn builder() -> CpuKeysBuilder { + impl CpuKeys { + pub fn builder() -> CpuKeysBuilder { CpuKeysBuilder::new() } } - pub struct CpuKeysBuilder { - ksk: Option>, + pub struct CpuKeysBuilder { + ksk: Option>, pksk: Option>, bsk: Option>, multi_bit_bsk: Option>, } - impl CpuKeysBuilder { - pub fn new() -> CpuKeysBuilder { + impl CpuKeysBuilder { + pub fn new() -> CpuKeysBuilder { Self { ksk: None, pksk: None, @@ -605,7 +605,7 @@ mod cuda_utils { } } - pub fn keyswitch_key(mut self, ksk: LweKeyswitchKeyOwned) -> CpuKeysBuilder { + pub fn keyswitch_key(mut self, ksk: LweKeyswitchKeyOwned) -> CpuKeysBuilder { self.ksk = Some(ksk); self } @@ -613,12 +613,12 @@ mod cuda_utils { pub fn packing_keyswitch_key( mut self, pksk: LwePackingKeyswitchKeyOwned, - ) -> CpuKeysBuilder { + ) -> CpuKeysBuilder { self.pksk = Some(pksk); self } - pub fn bootstrap_key(mut self, bsk: LweBootstrapKeyOwned) -> CpuKeysBuilder { + pub fn bootstrap_key(mut self, bsk: LweBootstrapKeyOwned) -> CpuKeysBuilder { self.bsk = Some(bsk); self } @@ -626,12 +626,12 @@ mod cuda_utils { pub fn multi_bit_bootstrap_key( mut self, mb_bsk: LweMultiBitBootstrapKeyOwned, - ) -> CpuKeysBuilder { + ) -> CpuKeysBuilder { self.multi_bit_bsk = Some(mb_bsk); self } - pub fn build(self) -> CpuKeys { + pub fn build(self) -> CpuKeys { CpuKeys { ksk: self.ksk, pksk: self.pksk, @@ -640,7 +640,7 @@ mod cuda_utils { } } } - impl Default for CpuKeysBuilder { + impl Default for CpuKeysBuilder { fn default() -> Self { Self::new() } @@ -648,17 +648,17 @@ mod cuda_utils { /// Computing keys in their Cuda flavor. 
#[allow(dead_code)] - pub struct CudaLocalKeys { - pub ksk: Option>, + pub struct CudaLocalKeys { + pub ksk: Option>, pub pksk: Option>, pub bsk: Option, pub multi_bit_bsk: Option>, } #[allow(dead_code)] - impl CudaLocalKeys { + impl CudaLocalKeys { pub fn from_cpu_keys( - cpu_keys: &CpuKeys, + cpu_keys: &CpuKeys, ms_noise_reduction: Option, stream: &CudaStreams, ) -> Self { @@ -681,10 +681,10 @@ mod cuda_utils { } /// Instantiate Cuda computing keys to each available GPU. - pub fn cuda_local_keys_core( - cpu_keys: &CpuKeys, + pub fn cuda_local_keys_core( + cpu_keys: &CpuKeys, ms_noise_reduction: Option, - ) -> Vec> { + ) -> Vec> { let gpu_count = get_number_of_gpus() as usize; let mut gpu_keys_vec = Vec::with_capacity(gpu_count); for i in 0..gpu_count { diff --git a/tfhe/src/core_crypto/gpu/algorithms/lwe_multi_bit_programmable_bootstrapping.rs b/tfhe/src/core_crypto/gpu/algorithms/lwe_multi_bit_programmable_bootstrapping.rs index 326ad192c..1143a6140 100644 --- a/tfhe/src/core_crypto/gpu/algorithms/lwe_multi_bit_programmable_bootstrapping.rs +++ b/tfhe/src/core_crypto/gpu/algorithms/lwe_multi_bit_programmable_bootstrapping.rs @@ -10,8 +10,8 @@ use crate::core_crypto::prelude::{CastInto, UnsignedTorus}; /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must not /// be dropped until streams is synchronised #[allow(clippy::too_many_arguments)] -pub unsafe fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_async( - input: &CudaLweCiphertextList, +pub unsafe fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_async( + input: &CudaLweCiphertextList, output: &mut CudaLweCiphertextList, accumulator: &CudaGlweCiphertextList, lut_indexes: &CudaVec, @@ -20,6 +20,7 @@ pub unsafe fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_async multi_bit_bsk: &CudaLweMultiBitBootstrapKey, streams: &CudaStreams, ) where + InputScalar: UnsignedTorus + CastInto, // CastInto required for PBS modulus switch which returns a usize Scalar: UnsignedTorus + CastInto, { @@ -59,21 +60,22 @@ pub unsafe fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_async multi_bit_bsk.polynomial_size(), ); - assert_eq!( - input.ciphertext_modulus(), - output.ciphertext_modulus(), + assert!( + input.ciphertext_modulus().associated_scalar_bits() + <= output.ciphertext_modulus().associated_scalar_bits(), "Mismatched CiphertextModulus between input ({:?}) and output ({:?})", input.ciphertext_modulus(), output.ciphertext_modulus(), ); assert_eq!( - input.ciphertext_modulus(), + output.ciphertext_modulus(), accumulator.ciphertext_modulus(), - "Mismatched CiphertextModulus between input ({:?}) and accumulator ({:?})", + "Mismatched CiphertextModulus between output ({:?}) and accumulator ({:?})", input.ciphertext_modulus(), accumulator.ciphertext_modulus(), ); + assert_eq!( streams.gpu_indexes[0], multi_bit_bsk.d_vec.gpu_index(0), @@ -144,8 +146,8 @@ pub unsafe fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_async } #[allow(clippy::too_many_arguments)] -pub fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext( - input: &CudaLweCiphertextList, +pub fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext( + input: &CudaLweCiphertextList, output: &mut CudaLweCiphertextList, accumulator: &CudaGlweCiphertextList, lut_indexes: &CudaVec, @@ -154,6 +156,7 @@ pub fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext( multi_bit_bsk: &CudaLweMultiBitBootstrapKey, streams: &CudaStreams, ) where + InputScalar: UnsignedTorus + CastInto, // CastInto required for PBS modulus switch 
which returns a usize Scalar: UnsignedTorus + CastInto, { diff --git a/tfhe/src/core_crypto/gpu/algorithms/lwe_programmable_bootstrapping.rs b/tfhe/src/core_crypto/gpu/algorithms/lwe_programmable_bootstrapping.rs index 6406b544c..98842d21c 100644 --- a/tfhe/src/core_crypto/gpu/algorithms/lwe_programmable_bootstrapping.rs +++ b/tfhe/src/core_crypto/gpu/algorithms/lwe_programmable_bootstrapping.rs @@ -12,8 +12,8 @@ use crate::core_crypto::prelude::{CastInto, UnsignedTorus}; /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must not /// be dropped until streams is synchronised #[allow(clippy::too_many_arguments)] -pub unsafe fn cuda_programmable_bootstrap_lwe_ciphertext_async( - input: &CudaLweCiphertextList, +pub unsafe fn cuda_programmable_bootstrap_lwe_ciphertext_async( + input: &CudaLweCiphertextList, output: &mut CudaLweCiphertextList, accumulator: &CudaGlweCiphertextList, lut_indexes: &CudaVec, @@ -22,6 +22,7 @@ pub unsafe fn cuda_programmable_bootstrap_lwe_ciphertext_async( bsk: &CudaLweBootstrapKey, streams: &CudaStreams, ) where + InputScalar: UnsignedTorus + CastInto, // CastInto required for PBS modulus switch which returns a usize Scalar: UnsignedTorus + CastInto, { @@ -259,8 +260,8 @@ pub unsafe fn cuda_programmable_bootstrap_128_lwe_ciphertext_async( } #[allow(clippy::too_many_arguments)] -pub fn cuda_programmable_bootstrap_lwe_ciphertext( - input: &CudaLweCiphertextList, +pub fn cuda_programmable_bootstrap_lwe_ciphertext( + input: &CudaLweCiphertextList, output: &mut CudaLweCiphertextList, accumulator: &CudaGlweCiphertextList, lut_indexes: &CudaVec, @@ -269,6 +270,7 @@ pub fn cuda_programmable_bootstrap_lwe_ciphertext( bsk: &CudaLweBootstrapKey, streams: &CudaStreams, ) where + InputScalar: UnsignedTorus + CastInto, Scalar: UnsignedTorus + CastInto, { unsafe { diff --git a/tfhe/src/core_crypto/gpu/algorithms/test/lwe_programmable_bootstrapping.rs b/tfhe/src/core_crypto/gpu/algorithms/test/lwe_programmable_bootstrapping.rs index 2473aa3bd..11b368708 100644 --- a/tfhe/src/core_crypto/gpu/algorithms/test/lwe_programmable_bootstrapping.rs +++ b/tfhe/src/core_crypto/gpu/algorithms/test/lwe_programmable_bootstrapping.rs @@ -4,8 +4,11 @@ use crate::core_crypto::gpu::lwe_bootstrap_key::{ CudaLweBootstrapKey, CudaModulusSwitchNoiseReductionConfiguration, }; use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList; +use crate::core_crypto::gpu::lwe_keyswitch_key::CudaLweKeyswitchKey; use crate::core_crypto::gpu::vec::{CudaVec, GpuIndex}; -use crate::core_crypto::gpu::{cuda_programmable_bootstrap_lwe_ciphertext, CudaStreams}; +use crate::core_crypto::gpu::{ + cuda_keyswitch_lwe_ciphertext, cuda_programmable_bootstrap_lwe_ciphertext, CudaStreams, +}; use crate::core_crypto::prelude::misc::check_encrypted_content_respects_mod; use itertools::Itertools; @@ -174,6 +177,218 @@ fn lwe_encrypt_pbs_decrypt< } } +fn lwe_ks_encrypt_pbs_decrypt_compare_32_64< + Scalar: UnsignedTorus + Sync + Send + CastFrom + CastInto + CastFrom + CastInto, +>( + params: ClassicTestParams, +) { + assert!(Scalar::BITS <= 64); + + let input_lwe_dimension = params.lwe_dimension; + let lwe_noise_distribution = params.lwe_noise_distribution; + let glwe_noise_distribution = params.glwe_noise_distribution; + let ciphertext_modulus = params.ciphertext_modulus; + let message_modulus_log = params.message_modulus_log; + let msg_modulus = Scalar::ONE.shl(message_modulus_log.0); + let encoding_with_padding = get_encoding_with_padding(ciphertext_modulus); + let 
diff --git a/tfhe/src/core_crypto/gpu/algorithms/test/lwe_programmable_bootstrapping.rs b/tfhe/src/core_crypto/gpu/algorithms/test/lwe_programmable_bootstrapping.rs
index 2473aa3bd..11b368708 100644
--- a/tfhe/src/core_crypto/gpu/algorithms/test/lwe_programmable_bootstrapping.rs
+++ b/tfhe/src/core_crypto/gpu/algorithms/test/lwe_programmable_bootstrapping.rs
@@ -4,8 +4,11 @@ use crate::core_crypto::gpu::lwe_bootstrap_key::{
     CudaLweBootstrapKey, CudaModulusSwitchNoiseReductionConfiguration,
 };
 use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
+use crate::core_crypto::gpu::lwe_keyswitch_key::CudaLweKeyswitchKey;
 use crate::core_crypto::gpu::vec::{CudaVec, GpuIndex};
-use crate::core_crypto::gpu::{cuda_programmable_bootstrap_lwe_ciphertext, CudaStreams};
+use crate::core_crypto::gpu::{
+    cuda_keyswitch_lwe_ciphertext, cuda_programmable_bootstrap_lwe_ciphertext, CudaStreams,
+};
 use crate::core_crypto::prelude::misc::check_encrypted_content_respects_mod;
 use itertools::Itertools;
@@ -174,6 +177,218 @@ fn lwe_encrypt_pbs_decrypt<
     }
 }

+fn lwe_ks_encrypt_pbs_decrypt_compare_32_64<
+    Scalar: UnsignedTorus + Sync + Send + CastFrom<usize> + CastInto<usize> + CastFrom<u32> + CastInto<u32>,
+>(
+    params: ClassicTestParams<Scalar>,
+) {
+    assert!(Scalar::BITS <= 64);
+
+    let input_lwe_dimension = params.lwe_dimension;
+    let lwe_noise_distribution = params.lwe_noise_distribution;
+    let glwe_noise_distribution = params.glwe_noise_distribution;
+    let ciphertext_modulus = params.ciphertext_modulus;
+    let message_modulus_log = params.message_modulus_log;
+    let msg_modulus = Scalar::ONE.shl(message_modulus_log.0);
+    let encoding_with_padding = get_encoding_with_padding(ciphertext_modulus);
+    let glwe_dimension = params.glwe_dimension;
+    let polynomial_size = params.polynomial_size;
+    let decomp_base_log = params.pbs_base_log;
+    let decomp_level_count = params.pbs_level;
+
+    let gpu_index = 0;
+    let stream = CudaStreams::new_single_gpu(GpuIndex::new(gpu_index));
+
+    let mut rsc = TestResources::new();
+
+    let f = |x: Scalar| {
+        x.wrapping_mul(Scalar::TWO)
+            .wrapping_sub(Scalar::ONE)
+            .wrapping_rem(msg_modulus)
+    };
+
+    let delta: Scalar = encoding_with_padding / msg_modulus;
+    let mut msg = msg_modulus;
+    const NB_TESTS: usize = 10;
+    let number_of_messages = 1;
+
+    let accumulator = generate_programmable_bootstrap_glwe_lut(
+        polynomial_size,
+        glwe_dimension.to_glwe_size(),
+        msg_modulus.cast_into(),
+        ciphertext_modulus,
+        delta,
+        f,
+    );
+
+    assert!(check_encrypted_content_respects_mod(
+        &accumulator,
+        ciphertext_modulus
+    ));
+
+    let lwe_noise_distribution_ksk_32 = match params.lwe_noise_distribution {
+        DynamicDistribution::Gaussian(gaussian_lwe_noise_distribution) => {
+            DynamicDistribution::<u32>::new_gaussian(gaussian_lwe_noise_distribution.standard_dev())
+        }
+        DynamicDistribution::TUniform(uniform_lwe_noise_distribution) => {
+            // A TUniform bound is a bit count over the integer torus: moving from a
+            // 64-bit to a 32-bit modulus drops the 32 least significant bits, so the
+            // equivalent bound shrinks by 32.
+            DynamicDistribution::<u32>::new_t_uniform(
+                uniform_lwe_noise_distribution.bound_log2() - 32,
+            )
+        }
+    };
+
+    // Create the LweSecretKey
+    let intermediate_lwe_secret_key_32: LweSecretKeyOwned<u32> =
+        allocate_and_generate_new_binary_lwe_secret_key(
+            input_lwe_dimension,
+            &mut rsc.secret_random_generator,
+        ); // lwe_sk
+
+    let glwe_sk_64: GlweSecretKeyOwned<Scalar> = allocate_and_generate_new_binary_glwe_secret_key(
+        params.glwe_dimension,
+        params.polynomial_size,
+        &mut rsc.secret_random_generator,
+    );
+
+    let big_lwe_sk_64 = glwe_sk_64.clone().into_lwe_secret_key();
+
+    let ksk_big_to_small = allocate_and_generate_new_lwe_keyswitch_key(
+        &big_lwe_sk_64,
+        &intermediate_lwe_secret_key_32,
+        params.ks_base_log,
+        params.ks_level,
+        lwe_noise_distribution_ksk_32,
+        CiphertextModulus::<u32>::new_native(),
+        &mut rsc.encryption_random_generator,
+    );
+    let d_ksk_big_to_small =
+        CudaLweKeyswitchKey::from_lwe_keyswitch_key(&ksk_big_to_small, &stream);
+
+    let ciphertext_modulus_ks32 = CiphertextModulus::<u32>::new_native();
+
+    let output_lwe_dimension = big_lwe_sk_64.lwe_dimension();
+
+    let mut bsk = LweBootstrapKey::new(
+        Scalar::ZERO,
+        glwe_dimension.to_glwe_size(),
+        polynomial_size,
+        decomp_base_log,
+        decomp_level_count,
+        input_lwe_dimension,
+        ciphertext_modulus,
+    );
+
+    par_generate_lwe_bootstrap_key(
+        &intermediate_lwe_secret_key_32,
+        &glwe_sk_64,
+        &mut bsk,
+        glwe_noise_distribution,
+        &mut rsc.encryption_random_generator,
+    );
+
+    assert!(check_encrypted_content_respects_mod(
+        &*bsk,
+        ciphertext_modulus
+    ));
+
+    let d_bsk = CudaLweBootstrapKey::from_lwe_bootstrap_key(&bsk, None, &stream);
+
+    while msg != Scalar::ZERO {
+        msg = msg.wrapping_sub(Scalar::ONE);
+        for _ in 0..NB_TESTS {
+            let plaintext = Plaintext(msg * delta);
+
+            let lwe_ciphertext_in = allocate_and_encrypt_new_lwe_ciphertext(
+                &big_lwe_sk_64,
+                plaintext,
+                lwe_noise_distribution,
+                ciphertext_modulus,
+                &mut rsc.encryption_random_generator,
+            );
+
+            assert!(check_encrypted_content_respects_mod(
+                &lwe_ciphertext_in,
+                ciphertext_modulus
+            ));
+
+            let d_lwe_ciphertext_in =
+                CudaLweCiphertextList::from_lwe_ciphertext(&lwe_ciphertext_in, &stream);
+
+            let output_ks_ct: LweCiphertextOwned<u32> = LweCiphertext::new(
+                0u32,
+                intermediate_lwe_secret_key_32.lwe_dimension().to_lwe_size(),
+                ciphertext_modulus_ks32,
+            );
+            let mut output_ks_ct_gpu =
+                CudaLweCiphertextList::from_lwe_ciphertext(&output_ks_ct, &stream);
+
+            let mut d_out_pbs_ct = CudaLweCiphertextList::new(
+                output_lwe_dimension,
+                LweCiphertextCount(1),
+                ciphertext_modulus,
+                &stream,
+            );
+
+            let d_accumulator = CudaGlweCiphertextList::from_glwe_ciphertext(&accumulator, &stream);
+
+            let test_vector_indexes: Vec<Scalar> = vec![Scalar::ZERO; number_of_messages];
+
+            let mut d_test_vector_indexes =
+                unsafe { CudaVec::<Scalar>::new_async(number_of_messages, &stream, 0) };
+            unsafe { d_test_vector_indexes.copy_from_cpu_async(&test_vector_indexes, &stream, 0) };
+
+            let num_blocks = d_lwe_ciphertext_in.0.lwe_ciphertext_count.0;
+            let lwe_indexes_usize: Vec<usize> = (0..num_blocks).collect_vec();
+            let lwe_indexes = lwe_indexes_usize
+                .iter()
+                .map(|&x| <usize as CastInto<Scalar>>::cast_into(x))
+                .collect_vec();
+            let mut d_output_indexes =
+                unsafe { CudaVec::<Scalar>::new_async(num_blocks, &stream, 0) };
+            let mut d_input_indexes =
+                unsafe { CudaVec::<Scalar>::new_async(num_blocks, &stream, 0) };
+            unsafe {
+                d_input_indexes.copy_from_cpu_async(&lwe_indexes, &stream, 0);
+                d_output_indexes.copy_from_cpu_async(&lwe_indexes, &stream, 0);
+            }
+
+            cuda_keyswitch_lwe_ciphertext(
+                &d_ksk_big_to_small,
+                &d_lwe_ciphertext_in,
+                &mut output_ks_ct_gpu,
+                &d_input_indexes,
+                &d_output_indexes,
+                true,
+                &stream,
+                false,
+            );
+
+            cuda_programmable_bootstrap_lwe_ciphertext(
+                &output_ks_ct_gpu,
+                &mut d_out_pbs_ct,
+                &d_accumulator,
+                &d_test_vector_indexes,
+                &d_output_indexes,
+                &d_input_indexes,
+                &d_bsk,
+                &stream,
+            );
+
+            let out_pbs_ct = d_out_pbs_ct.into_lwe_ciphertext(&stream);
+            assert!(check_encrypted_content_respects_mod(
+                &out_pbs_ct,
+                ciphertext_modulus
+            ));
+
+            let decrypted = decrypt_lwe_ciphertext(&big_lwe_sk_64, &out_pbs_ct);
+
+            let decoded = round_decode(decrypted.0, delta) % msg_modulus;
+
+            assert_eq!(decoded, f(msg));
+        }
+    }
+}
+
 fn lwe_encrypt_centered_ms_pbs_decrypt<
     Scalar: UnsignedTorus + Sync + Send + CastFrom<usize> + CastInto<usize>,
 >(
@@ -327,6 +542,6 @@ fn lwe_encrypt_centered_ms_pbs_decrypt<
         }
     }
 }
-
+create_gpu_parameterized_test!(lwe_ks_encrypt_pbs_decrypt_compare_32_64);
 create_gpu_parameterized_test!(lwe_encrypt_pbs_decrypt);
 create_gpu_parameterized_test!(lwe_encrypt_centered_ms_pbs_decrypt);
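One subtlety in the test above is the keyswitch key's noise distribution. A TUniform bound is expressed as a bit count relative to the integer representation of the torus, so re-expressing the same torus distribution over a 32-bit modulus instead of a 64-bit one shifts the bound down by 64 - 32 = 32 bits, which is the `bound_log2() - 32` adjustment; the Gaussian branch needs no adjustment, presumably because its standard deviation is already expressed relative to the torus. A toy check with a hypothetical bound:

```rust
fn main() {
    // Hypothetical TUniform bound for a 64-bit torus: noise uniform in
    // [-2^41, 2^41] out of 2^64, i.e. the top 64 - 41 = 23 bits are clean.
    let bound_log2_64: i64 = 41;

    // The same torus distribution over a 2^32 modulus keeps 23 clean top
    // bits, so the bound measured in integer bits drops by 64 - 32 = 32.
    let bound_log2_32 = bound_log2_64 - 32;

    assert_eq!(64 - bound_log2_64, 32 - bound_log2_32);
    assert_eq!(bound_log2_32, 9);
}
```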
diff --git a/tfhe/src/core_crypto/gpu/mod.rs b/tfhe/src/core_crypto/gpu/mod.rs
index 9d8743cca..83bcde145 100644
--- a/tfhe/src/core_crypto/gpu/mod.rs
+++ b/tfhe/src/core_crypto/gpu/mod.rs
@@ -166,13 +166,13 @@ impl Drop for CudaStreams {
 /// [CudaStreams::synchronize] __must__ be called as soon as synchronization is
 /// required
 #[allow(clippy::too_many_arguments)]
-pub unsafe fn programmable_bootstrap_async<T: UnsignedInteger>(
+pub unsafe fn programmable_bootstrap_async<T: UnsignedInteger, KST: UnsignedInteger>(
     streams: &CudaStreams,
     lwe_array_out: &mut CudaVec<T>,
     lwe_out_indexes: &CudaVec<T>,
     test_vector: &CudaVec<T>,
     test_vector_indexes: &CudaVec<T>,
-    lwe_array_in: &CudaVec<T>,
+    lwe_array_in: &CudaVec<KST>,
     lwe_in_indexes: &CudaVec<T>,
     bootstrapping_key: &CudaVec<f64>,
     lwe_dimension: LweDimension,
@@ -193,45 +193,89 @@ pub unsafe fn programmable_bootstrap_async(
         PBSMSNoiseReductionType::Centered
     });

-    scratch_cuda_programmable_bootstrap_64(
-        streams.ptr[0],
-        streams.gpu_indexes[0].get(),
-        std::ptr::addr_of_mut!(pbs_buffer),
-        lwe_dimension.0 as u32,
-        glwe_dimension.0 as u32,
-        polynomial_size.0 as u32,
-        level.0 as u32,
-        num_samples,
-        true,
-        noise_reduction_type as u32,
-    );
+    if KST::BITS == 32 {
+        scratch_cuda_programmable_bootstrap_32_64(
+            streams.ptr[0],
+            streams.gpu_indexes[0].get(),
+            std::ptr::addr_of_mut!(pbs_buffer),
+            lwe_dimension.0 as u32,
+            glwe_dimension.0 as u32,
+            polynomial_size.0 as u32,
+            level.0 as u32,
+            num_samples,
+            true,
+            noise_reduction_type as u32,
+        );

-    cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
-        streams.ptr[0],
-        streams.gpu_indexes[0].get(),
-        lwe_array_out.as_mut_c_ptr(0),
-        lwe_out_indexes.as_c_ptr(0),
-        test_vector.as_c_ptr(0),
-        test_vector_indexes.as_c_ptr(0),
-        lwe_array_in.as_c_ptr(0),
-        lwe_in_indexes.as_c_ptr(0),
-        bootstrapping_key.as_c_ptr(0),
-        pbs_buffer,
-        lwe_dimension.0 as u32,
-        glwe_dimension.0 as u32,
-        polynomial_size.0 as u32,
-        base_log.0 as u32,
-        level.0 as u32,
-        num_samples,
-        num_many_lut,
-        lut_stride,
-    );
+        cuda_programmable_bootstrap_lwe_ciphertext_vector_32_64(
+            streams.ptr[0],
+            streams.gpu_indexes[0].get(),
+            lwe_array_out.as_mut_c_ptr(0),
+            lwe_out_indexes.as_c_ptr(0),
+            test_vector.as_c_ptr(0),
+            test_vector_indexes.as_c_ptr(0),
+            lwe_array_in.as_c_ptr(0),
+            lwe_in_indexes.as_c_ptr(0),
+            bootstrapping_key.as_c_ptr(0),
+            pbs_buffer,
+            lwe_dimension.0 as u32,
+            glwe_dimension.0 as u32,
+            polynomial_size.0 as u32,
+            base_log.0 as u32,
+            level.0 as u32,
+            num_samples,
+            num_many_lut,
+            lut_stride,
+        );

-    cleanup_cuda_programmable_bootstrap(
-        streams.ptr[0],
-        streams.gpu_indexes[0].get(),
-        std::ptr::addr_of_mut!(pbs_buffer),
-    );
+        cleanup_cuda_programmable_bootstrap(
+            streams.ptr[0],
+            streams.gpu_indexes[0].get(),
+            std::ptr::addr_of_mut!(pbs_buffer),
+        );
+    } else if KST::BITS == 64 {
+        scratch_cuda_programmable_bootstrap_64(
+            streams.ptr[0],
+            streams.gpu_indexes[0].get(),
+            std::ptr::addr_of_mut!(pbs_buffer),
+            lwe_dimension.0 as u32,
+            glwe_dimension.0 as u32,
+            polynomial_size.0 as u32,
+            level.0 as u32,
+            num_samples,
+            true,
+            noise_reduction_type as u32,
+        );
+
+        cuda_programmable_bootstrap_lwe_ciphertext_vector_64_64(
+            streams.ptr[0],
+            streams.gpu_indexes[0].get(),
+            lwe_array_out.as_mut_c_ptr(0),
+            lwe_out_indexes.as_c_ptr(0),
+            test_vector.as_c_ptr(0),
+            test_vector_indexes.as_c_ptr(0),
+            lwe_array_in.as_c_ptr(0),
+            lwe_in_indexes.as_c_ptr(0),
+            bootstrapping_key.as_c_ptr(0),
+            pbs_buffer,
+            lwe_dimension.0 as u32,
+            glwe_dimension.0 as u32,
+            polynomial_size.0 as u32,
+            base_log.0 as u32,
+            level.0 as u32,
+            num_samples,
+            num_many_lut,
+            lut_stride,
+        );
+
+        cleanup_cuda_programmable_bootstrap(
+            streams.ptr[0],
+            streams.gpu_indexes[0].get(),
+            std::ptr::addr_of_mut!(pbs_buffer),
+        );
+    } else {
+        panic!("Unsupported PBS input bitwidth {}", KST::BITS);
+    }
 }

 #[allow(clippy::too_many_arguments)]
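The dispatch above branches on `KST::BITS`, an associated constant, so the width of the keyswitched input selects the FFI entry point at monomorphization time and the untaken branch is dead code. A minimal stand-in for that pattern (the trait here is a stand-in for the crate's `UnsignedInteger`, and the returned strings simply echo the FFI symbols used above):

```rust
trait UnsignedIntegerLike {
    const BITS: u32;
}
impl UnsignedIntegerLike for u32 {
    const BITS: u32 = 32;
}
impl UnsignedIntegerLike for u64 {
    const BITS: u32 = 64;
}

// Same shape as the dispatch in programmable_bootstrap_async: the input
// width picks the entry point, everything else stays identical.
fn pbs_entry_point<KST: UnsignedIntegerLike>() -> &'static str {
    match KST::BITS {
        32 => "cuda_programmable_bootstrap_lwe_ciphertext_vector_32_64",
        64 => "cuda_programmable_bootstrap_lwe_ciphertext_vector_64_64",
        other => panic!("Unsupported PBS input bitwidth {other}"),
    }
}

fn main() {
    assert!(pbs_entry_point::<u32>().ends_with("_32_64"));
    assert!(pbs_entry_point::<u64>().ends_with("_64_64"));
}
```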
@@ -348,6 +392,7 @@ pub unsafe fn programmable_bootstrap_128_async(
 #[allow(clippy::too_many_arguments)]
 pub unsafe fn programmable_bootstrap_multi_bit_async<
     T: UnsignedInteger,
+    KST: UnsignedInteger,
     B: Any + UnsignedInteger,
 >(
     streams: &CudaStreams,
@@ -355,7 +400,7 @@ pub unsafe fn programmable_bootstrap_multi_bit_async<
     output_indexes: &CudaVec<T>,
     test_vector: &CudaVec<T>,
     test_vector_indexes: &CudaVec<T>,
-    lwe_array_in: &CudaVec<T>,
+    lwe_array_in: &CudaVec<KST>,
     input_indexes: &CudaVec<T>,
     bootstrapping_key: &CudaVec<B>,
     lwe_dimension: LweDimension,
@@ -406,42 +451,83 @@ pub unsafe fn programmable_bootstrap_multi_bit_async<
             std::ptr::addr_of_mut!(pbs_buffer),
         );
     } else if TypeId::of::<T>() == TypeId::of::<u64>() {
-        scratch_cuda_multi_bit_programmable_bootstrap_64(
-            streams.ptr[0],
-            streams.gpu_indexes[0].get(),
-            std::ptr::addr_of_mut!(pbs_buffer),
-            glwe_dimension.0 as u32,
-            polynomial_size.0 as u32,
-            level.0 as u32,
-            num_samples,
-            true,
-        );
-        cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
-            streams.ptr[0],
-            streams.gpu_indexes[0].get(),
-            lwe_array_out.as_mut_c_ptr(0),
-            output_indexes.as_c_ptr(0),
-            test_vector.as_c_ptr(0),
-            test_vector_indexes.as_c_ptr(0),
-            lwe_array_in.as_c_ptr(0),
-            input_indexes.as_c_ptr(0),
-            bootstrapping_key.as_c_ptr(0),
-            pbs_buffer,
-            lwe_dimension.0 as u32,
-            glwe_dimension.0 as u32,
-            polynomial_size.0 as u32,
-            grouping_factor.0 as u32,
-            base_log.0 as u32,
-            level.0 as u32,
-            num_samples,
-            num_many_lut,
-            lut_stride,
-        );
-        cleanup_cuda_multi_bit_programmable_bootstrap(
-            streams.ptr[0],
-            streams.gpu_indexes[0].get(),
-            std::ptr::addr_of_mut!(pbs_buffer),
-        );
+        if TypeId::of::<KST>() == TypeId::of::<u32>() {
+            scratch_cuda_multi_bit_programmable_bootstrap_32_64(
+                streams.ptr[0],
+                streams.gpu_indexes[0].get(),
+                std::ptr::addr_of_mut!(pbs_buffer),
+                glwe_dimension.0 as u32,
+                polynomial_size.0 as u32,
+                level.0 as u32,
+                num_samples,
+                true,
+            );
+            cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_32_64(
+                streams.ptr[0],
+                streams.gpu_indexes[0].get(),
+                lwe_array_out.as_mut_c_ptr(0),
+                output_indexes.as_c_ptr(0),
+                test_vector.as_c_ptr(0),
+                test_vector_indexes.as_c_ptr(0),
+                lwe_array_in.as_c_ptr(0),
+                input_indexes.as_c_ptr(0),
+                bootstrapping_key.as_c_ptr(0),
+                pbs_buffer,
+                lwe_dimension.0 as u32,
+                glwe_dimension.0 as u32,
+                polynomial_size.0 as u32,
+                grouping_factor.0 as u32,
+                base_log.0 as u32,
+                level.0 as u32,
+                num_samples,
+                num_many_lut,
+                lut_stride,
+            );
+            cleanup_cuda_multi_bit_programmable_bootstrap(
+                streams.ptr[0],
+                streams.gpu_indexes[0].get(),
+                std::ptr::addr_of_mut!(pbs_buffer),
+            );
+        } else if TypeId::of::<KST>() == TypeId::of::<u64>() {
+            scratch_cuda_multi_bit_programmable_bootstrap_64(
+                streams.ptr[0],
+                streams.gpu_indexes[0].get(),
+                std::ptr::addr_of_mut!(pbs_buffer),
+                glwe_dimension.0 as u32,
+                polynomial_size.0 as u32,
+                level.0 as u32,
+                num_samples,
+                true,
+            );
+            cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
+                streams.ptr[0],
+                streams.gpu_indexes[0].get(),
+                lwe_array_out.as_mut_c_ptr(0),
+                output_indexes.as_c_ptr(0),
+                test_vector.as_c_ptr(0),
+                test_vector_indexes.as_c_ptr(0),
+                lwe_array_in.as_c_ptr(0),
+                input_indexes.as_c_ptr(0),
+                bootstrapping_key.as_c_ptr(0),
+                pbs_buffer,
+                lwe_dimension.0 as u32,
+                glwe_dimension.0 as u32,
+                polynomial_size.0 as u32,
+                grouping_factor.0 as u32,
+                base_log.0 as u32,
+                level.0 as u32,
+                num_samples,
+                num_many_lut,
+                lut_stride,
+            );
+            cleanup_cuda_multi_bit_programmable_bootstrap(
+                streams.ptr[0],
+                streams.gpu_indexes[0].get(),
+                std::ptr::addr_of_mut!(pbs_buffer),
+            );
+        } else {
+            panic!("Unsupported MB PBS input torus size");
+        }
     } else {
         panic!("Unsupported torus size")
     }
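The multi-bit path reaches the same outcome with `TypeId` comparisons instead of an associated constant; both scalar types are inspected, with the output torus checked before the input torus. A self-contained model of that nested dispatch (the returned strings echo the FFI symbols above; the structure, not the strings, is the point):

```rust
use std::any::TypeId;

// Nested type dispatch, as in programmable_bootstrap_multi_bit_async:
// the output torus T is checked first, then the input torus KST.
fn multi_bit_entry_point<T: 'static, KST: 'static>() -> &'static str {
    if TypeId::of::<T>() == TypeId::of::<u64>() {
        if TypeId::of::<KST>() == TypeId::of::<u32>() {
            "cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_32_64"
        } else if TypeId::of::<KST>() == TypeId::of::<u64>() {
            "cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64"
        } else {
            panic!("Unsupported MB PBS input torus size")
        }
    } else {
        panic!("Unsupported torus size")
    }
}

fn main() {
    assert!(multi_bit_entry_point::<u64, u32>().ends_with("_32_64"));
    assert!(multi_bit_entry_point::<u64, u64>().ends_with("_64"));
}
```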