diff --git a/include/bootstrap.h b/include/bootstrap.h
index e157100ec..732fdccd4 100644
--- a/include/bootstrap.h
+++ b/include/bootstrap.h
@@ -45,24 +45,6 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
     uint32_t base_log, uint32_t level_count, uint32_t num_samples,
     uint32_t num_test_vectors, uint32_t lwe_idx, uint32_t max_shared_memory);
 
-void cuda_cmux_tree_32(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
-                       void *ggsw_in, void *lut_vector, uint32_t glwe_dimension,
-                       uint32_t polynomial_size, uint32_t base_log,
-                       uint32_t level_count, uint32_t r, uint32_t tau,
-                       uint32_t max_shared_memory);
-
-void cuda_cmux_tree_64(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
-                       void *ggsw_in, void *lut_vector, uint32_t glwe_dimension,
-                       uint32_t polynomial_size, uint32_t base_log,
-                       uint32_t level_count, uint32_t r, uint32_t tau,
-                       uint32_t max_shared_memory);
-
-void cuda_blind_rotate_and_sample_extraction_64(
-    void *v_stream, uint32_t gpu_index, void *lwe_out, void *ggsw_in,
-    void *lut_vector, uint32_t mbr_size, uint32_t tau, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t base_log, uint32_t l_gadget,
-    uint32_t max_shared_memory);
-
 void cuda_extract_bits_32(
     void *v_stream, uint32_t gpu_index, void *list_lwe_array_out,
     void *lwe_array_in, void *lwe_array_in_buffer,
diff --git a/include/vertical_packing.h b/include/vertical_packing.h
new file mode 100644
index 000000000..6a1824b0d
--- /dev/null
+++ b/include/vertical_packing.h
@@ -0,0 +1,48 @@
+#ifndef VERTICAL_PACKING_H
+#define VERTICAL_PACKING_H
+
+#include <cstdint>
+
+extern "C" {
+
+void scratch_cuda_cmux_tree_32(void *v_stream, uint32_t gpu_index,
+                               int8_t **cmux_tree_buffer,
+                               uint32_t glwe_dimension,
+                               uint32_t polynomial_size, uint32_t level_count,
+                               uint32_t r, uint32_t tau,
+                               uint32_t max_shared_memory,
+                               bool allocate_gpu_memory);
+
+void scratch_cuda_cmux_tree_64(void *v_stream, uint32_t gpu_index,
+                               int8_t **cmux_tree_buffer,
+                               uint32_t glwe_dimension,
+                               uint32_t polynomial_size, uint32_t level_count,
+                               uint32_t r, uint32_t tau,
+                               uint32_t max_shared_memory,
+                               bool allocate_gpu_memory);
+
+void cuda_cmux_tree_32(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
+                       void *ggsw_in, void *lut_vector,
+                       int8_t *cmux_tree_buffer, uint32_t glwe_dimension,
+                       uint32_t polynomial_size, uint32_t base_log,
+                       uint32_t level_count, uint32_t r, uint32_t tau,
+                       uint32_t max_shared_memory);
+
+void cuda_cmux_tree_64(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
+                       void *ggsw_in, void *lut_vector,
+                       int8_t *cmux_tree_buffer, uint32_t glwe_dimension,
+                       uint32_t polynomial_size, uint32_t base_log,
+                       uint32_t level_count, uint32_t r, uint32_t tau,
+                       uint32_t max_shared_memory);
+
+void cleanup_cuda_cmux_tree(void *v_stream, uint32_t gpu_index,
+                            int8_t **cmux_tree_buffer);
+
+void cuda_blind_rotate_and_sample_extraction_64(
+    void *v_stream, uint32_t gpu_index, void *lwe_out, void *ggsw_in,
+    void *lut_vector, uint32_t mbr_size, uint32_t tau, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t base_log, uint32_t l_gadget,
+    uint32_t max_shared_memory);
+}
+
+#endif // VERTICAL_PACKING_H
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 59053c4c2..95be8d92f 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -2,7 +2,8 @@ set(SOURCES
     ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/boolean_gates.h
     ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap.h
     ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/keyswitch.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/linear_algebra.h)
+    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/linear_algebra.h
+    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/vertical_packing.h)
 
 file(GLOB SOURCES
      "*.cu"
      "*.h"
diff --git a/src/crypto/ggsw.cuh b/src/crypto/ggsw.cuh
index aeb33c8cd..d5ca4e637 100644
--- a/src/crypto/ggsw.cuh
+++ b/src/crypto/ggsw.cuh
@@ -50,7 +50,7 @@ __global__ void device_batch_fft_ggsw_vector(double2 *dest, T *src,
  */
 template <typename T, class params>
 void batch_fft_ggsw_vector(cudaStream_t *stream, double2 *dest, T *src,
-                           uint32_t r, uint32_t glwe_dim,
+                           int8_t *d_mem, uint32_t r, uint32_t glwe_dim,
                            uint32_t polynomial_size, uint32_t level_count,
                            uint32_t gpu_index, uint32_t max_shared_memory) {
 
@@ -59,19 +59,15 @@ void batch_fft_ggsw_vector(cudaStream_t *stream, double2 *dest, T *src,
   int gridSize = r * (glwe_dim + 1) * (glwe_dim + 1) * level_count;
   int blockSize = polynomial_size / params::opt;
 
-  int8_t *d_mem;
   if (max_shared_memory < shared_memory_size) {
-    d_mem = (int8_t *)cuda_malloc_async(shared_memory_size, stream, gpu_index);
     device_batch_fft_ggsw_vector<T, params, NOSM>
         <<<gridSize, blockSize, 0, *stream>>>(dest, src, d_mem);
-    check_cuda_error(cudaGetLastError());
-    cuda_drop_async(d_mem, stream, gpu_index);
   } else {
     device_batch_fft_ggsw_vector<T, params, FULLSM>
         <<<gridSize, blockSize, shared_memory_size, *stream>>>(dest, src,
                                                                d_mem);
-    check_cuda_error(cudaGetLastError());
   }
+  check_cuda_error(cudaGetLastError());
 }
 
 #endif // CONCRETE_CORE_GGSW_CUH
diff --git a/src/vertical_packing.cu b/src/vertical_packing.cu
index 6c6a9fb2d..1a04028e0 100644
--- a/src/vertical_packing.cu
+++ b/src/vertical_packing.cu
@@ -1,11 +1,101 @@
 #include "vertical_packing.cuh"
+#include "vertical_packing.h"
+#include <cstdint>
+
+/*
+ * This scratch function allocates the necessary amount of data on the GPU for
+ * the Cmux tree on 32-bit inputs, into `cmux_tree_buffer`. It also configures
+ * SM options on the GPU in case FULLSM mode is going to be used.
+ */
+void scratch_cuda_cmux_tree_32(void *v_stream, uint32_t gpu_index,
+                               int8_t **cmux_tree_buffer,
+                               uint32_t glwe_dimension,
+                               uint32_t polynomial_size, uint32_t level_count,
+                               uint32_t r, uint32_t tau,
+                               uint32_t max_shared_memory,
+                               bool allocate_gpu_memory) {
+
+  switch (polynomial_size) {
+  case 512:
+    scratch_cmux_tree<uint32_t, Degree<512>>(
+        v_stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size,
+        level_count, r, tau, max_shared_memory, allocate_gpu_memory);
+    break;
+  case 1024:
+    scratch_cmux_tree<uint32_t, Degree<1024>>(
+        v_stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size,
+        level_count, r, tau, max_shared_memory, allocate_gpu_memory);
+    break;
+  case 2048:
+    scratch_cmux_tree<uint32_t, Degree<2048>>(
+        v_stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size,
+        level_count, r, tau, max_shared_memory, allocate_gpu_memory);
+    break;
+  case 4096:
+    scratch_cmux_tree<uint32_t, Degree<4096>>(
+        v_stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size,
+        level_count, r, tau, max_shared_memory, allocate_gpu_memory);
+    break;
+  case 8192:
+    scratch_cmux_tree<uint32_t, Degree<8192>>(
+        v_stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size,
+        level_count, r, tau, max_shared_memory, allocate_gpu_memory);
+    break;
+  default:
+    break;
+  }
+}
+
+/*
+ * This scratch function allocates the necessary amount of data on the GPU for
+ * the Cmux tree on 64-bit inputs, into `cmux_tree_buffer`. It also configures
+ * SM options on the GPU in case FULLSM mode is going to be used.
+ */
+void scratch_cuda_cmux_tree_64(void *v_stream, uint32_t gpu_index,
+                               int8_t **cmux_tree_buffer,
+                               uint32_t glwe_dimension,
+                               uint32_t polynomial_size, uint32_t level_count,
+                               uint32_t r, uint32_t tau,
+                               uint32_t max_shared_memory,
+                               bool allocate_gpu_memory) {
+  switch (polynomial_size) {
+  case 512:
+    scratch_cmux_tree<uint64_t, Degree<512>>(
+        v_stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size,
+        level_count, r, tau, max_shared_memory, allocate_gpu_memory);
+    break;
+  case 1024:
+    scratch_cmux_tree<uint64_t, Degree<1024>>(
+        v_stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size,
+        level_count, r, tau, max_shared_memory, allocate_gpu_memory);
+    break;
+  case 2048:
+    scratch_cmux_tree<uint64_t, Degree<2048>>(
+        v_stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size,
+        level_count, r, tau, max_shared_memory, allocate_gpu_memory);
+    break;
+  case 4096:
+    scratch_cmux_tree<uint64_t, Degree<4096>>(
+        v_stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size,
+        level_count, r, tau, max_shared_memory, allocate_gpu_memory);
+    break;
+  case 8192:
+    scratch_cmux_tree<uint64_t, Degree<8192>>(
+        v_stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size,
+        level_count, r, tau, max_shared_memory, allocate_gpu_memory);
+    break;
+  default:
+    break;
+  }
+}
 
 /*
  * Perform cmux tree on a batch of 32-bit input GGSW ciphertexts.
  * Check the equivalent function for 64-bit inputs for more details.
  */
 void cuda_cmux_tree_32(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
-                       void *ggsw_in, void *lut_vector, uint32_t glwe_dimension,
+                       void *ggsw_in, void *lut_vector,
+                       int8_t *cmux_tree_buffer, uint32_t glwe_dimension,
                        uint32_t polynomial_size, uint32_t base_log,
                        uint32_t level_count, uint32_t r, uint32_t tau,
                        uint32_t max_shared_memory) {
@@ -25,32 +115,32 @@ void cuda_cmux_tree_32(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
   case 512:
     host_cmux_tree<uint32_t, Degree<512>>(
         v_stream, gpu_index, (uint32_t *)glwe_array_out, (uint32_t *)ggsw_in,
-        (uint32_t *)lut_vector, glwe_dimension, polynomial_size, base_log,
-        level_count, r, tau, max_shared_memory);
+        (uint32_t *)lut_vector, cmux_tree_buffer, glwe_dimension,
+        polynomial_size, base_log, level_count, r, tau, max_shared_memory);
     break;
   case 1024:
     host_cmux_tree<uint32_t, Degree<1024>>(
         v_stream, gpu_index, (uint32_t *)glwe_array_out, (uint32_t *)ggsw_in,
-        (uint32_t *)lut_vector, glwe_dimension, polynomial_size, base_log,
-        level_count, r, tau, max_shared_memory);
+        (uint32_t *)lut_vector, cmux_tree_buffer, glwe_dimension,
+        polynomial_size, base_log, level_count, r, tau, max_shared_memory);
     break;
   case 2048:
     host_cmux_tree<uint32_t, Degree<2048>>(
         v_stream, gpu_index, (uint32_t *)glwe_array_out, (uint32_t *)ggsw_in,
-        (uint32_t *)lut_vector, glwe_dimension, polynomial_size, base_log,
-        level_count, r, tau, max_shared_memory);
+        (uint32_t *)lut_vector, cmux_tree_buffer, glwe_dimension,
+        polynomial_size, base_log, level_count, r, tau, max_shared_memory);
     break;
   case 4096:
     host_cmux_tree<uint32_t, Degree<4096>>(
         v_stream, gpu_index, (uint32_t *)glwe_array_out, (uint32_t *)ggsw_in,
-        (uint32_t *)lut_vector, glwe_dimension, polynomial_size, base_log,
-        level_count, r, tau, max_shared_memory);
+        (uint32_t *)lut_vector, cmux_tree_buffer, glwe_dimension,
+        polynomial_size, base_log, level_count, r, tau, max_shared_memory);
     break;
   case 8192:
     host_cmux_tree<uint32_t, Degree<8192>>(
         v_stream, gpu_index, (uint32_t *)glwe_array_out, (uint32_t *)ggsw_in,
-        (uint32_t *)lut_vector, glwe_dimension, polynomial_size, base_log,
-        level_count, r, tau, max_shared_memory);
+        (uint32_t *)lut_vector, cmux_tree_buffer, glwe_dimension,
+        polynomial_size, base_log, level_count, r, tau, max_shared_memory);
     break;
   default:
     break;
@@ -85,7 +175,8 @@ void cuda_cmux_tree_32(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
  * polynomial degree.
  */
 void cuda_cmux_tree_64(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
-                       void *ggsw_in, void *lut_vector, uint32_t glwe_dimension,
+                       void *ggsw_in, void *lut_vector,
+                       int8_t *cmux_tree_buffer, uint32_t glwe_dimension,
                        uint32_t polynomial_size, uint32_t base_log,
                        uint32_t level_count, uint32_t r, uint32_t tau,
                        uint32_t max_shared_memory) {
@@ -105,38 +196,49 @@ void cuda_cmux_tree_64(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
   case 512:
     host_cmux_tree<uint64_t, Degree<512>>(
         v_stream, gpu_index, (uint64_t *)glwe_array_out, (uint64_t *)ggsw_in,
-        (uint64_t *)lut_vector, glwe_dimension, polynomial_size, base_log,
-        level_count, r, tau, max_shared_memory);
+        (uint64_t *)lut_vector, cmux_tree_buffer, glwe_dimension,
+        polynomial_size, base_log, level_count, r, tau, max_shared_memory);
     break;
   case 1024:
     host_cmux_tree<uint64_t, Degree<1024>>(
         v_stream, gpu_index, (uint64_t *)glwe_array_out, (uint64_t *)ggsw_in,
-        (uint64_t *)lut_vector, glwe_dimension, polynomial_size, base_log,
-        level_count, r, tau, max_shared_memory);
+        (uint64_t *)lut_vector, cmux_tree_buffer, glwe_dimension,
+        polynomial_size, base_log, level_count, r, tau, max_shared_memory);
     break;
   case 2048:
     host_cmux_tree<uint64_t, Degree<2048>>(
         v_stream, gpu_index, (uint64_t *)glwe_array_out, (uint64_t *)ggsw_in,
-        (uint64_t *)lut_vector, glwe_dimension, polynomial_size, base_log,
-        level_count, r, tau, max_shared_memory);
+        (uint64_t *)lut_vector, cmux_tree_buffer, glwe_dimension,
+        polynomial_size, base_log, level_count, r, tau, max_shared_memory);
     break;
   case 4096:
     host_cmux_tree<uint64_t, Degree<4096>>(
         v_stream, gpu_index, (uint64_t *)glwe_array_out, (uint64_t *)ggsw_in,
-        (uint64_t *)lut_vector, glwe_dimension, polynomial_size, base_log,
-        level_count, r, tau, max_shared_memory);
+        (uint64_t *)lut_vector, cmux_tree_buffer, glwe_dimension,
+        polynomial_size, base_log, level_count, r, tau, max_shared_memory);
    break;
   case 8192:
     host_cmux_tree<uint64_t, Degree<8192>>(
         v_stream, gpu_index, (uint64_t *)glwe_array_out, (uint64_t *)ggsw_in,
-        (uint64_t *)lut_vector, glwe_dimension, polynomial_size, base_log,
-        level_count, r, tau, max_shared_memory);
+        (uint64_t *)lut_vector, cmux_tree_buffer, glwe_dimension,
+        polynomial_size, base_log, level_count, r, tau, max_shared_memory);
     break;
   default:
     break;
   }
 }
+
+/*
+ * This cleanup function frees the data for the Cmux tree held on the GPU in
+ * `cmux_tree_buffer`, for 32-bit or 64-bit inputs.
+ */
+void cleanup_cuda_cmux_tree(void *v_stream, uint32_t gpu_index,
+                            int8_t **cmux_tree_buffer) {
+  auto stream = static_cast<cudaStream_t *>(v_stream);
+  // Free memory
+  cuda_drop_async(*cmux_tree_buffer, stream, gpu_index);
+}
+
 /*
  * Performs blind rotation on batch of 64-bit input ggsw ciphertexts
  * - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
diff --git a/src/vertical_packing.cuh b/src/vertical_packing.cuh
index baa5d77cc..8f0190656 100644
--- a/src/vertical_packing.cuh
+++ b/src/vertical_packing.cuh
@@ -1,6 +1,7 @@
-#ifndef VERTICAL_PACKING_H
-#define VERTICAL_PACKING_H
+#ifndef VERTICAL_PACKING_CUH
+#define VERTICAL_PACKING_CUH
 
+#include "../include/vertical_packing.h"
 #include "bootstrap.h"
 #include "complex/operations.cuh"
 #include "crypto/gadget.cuh"
@@ -242,6 +243,71 @@ __global__ void device_batch_cmux(Torus *glwe_array_out, Torus *glwe_array_in,
       selected_memory, output_idx, input_idx1, input_idx2, glwe_dim,
       polynomial_size, base_log, level_count, ggsw_idx);
 }
+
+template <typename Torus>
+__host__ __device__ int
+get_memory_needed_per_block_cmux_tree(uint32_t polynomial_size) {
+  return sizeof(Torus) * polynomial_size +       // glwe_sub_mask
+         sizeof(Torus) * polynomial_size +       // glwe_sub_body
+         sizeof(double2) * polynomial_size / 2 + // mask_res_fft
+         sizeof(double2) * polynomial_size / 2 + // body_res_fft
+         sizeof(double2) * polynomial_size / 2;  // glwe_fft
+}
+
+template <typename Torus>
+__host__ __device__ int
+get_buffer_size_cmux_tree(uint32_t glwe_dimension, uint32_t polynomial_size,
+                          uint32_t level_count, uint32_t r, uint32_t tau,
+                          uint32_t max_shared_memory) {
+
+  int memory_needed_per_block =
+      get_memory_needed_per_block_cmux_tree<Torus>(polynomial_size);
+  int num_lut = (1 << r);
+  int ggsw_size = polynomial_size * (glwe_dimension + 1) *
+                  (glwe_dimension + 1) * level_count;
+  int glwe_size = (glwe_dimension + 1) * polynomial_size;
+  int device_mem = 0;
+  if (max_shared_memory < memory_needed_per_block) {
+    device_mem = memory_needed_per_block * (1 << (r - 1)) * tau;
+  }
+  if (max_shared_memory < polynomial_size * sizeof(double)) {
+    device_mem += polynomial_size * sizeof(double);
+  }
+  return r * ggsw_size * sizeof(double) +
+         num_lut * tau * glwe_size * sizeof(Torus) +
+         num_lut * tau * glwe_size * sizeof(Torus) + device_mem;
+}
+
+template <typename Torus, class params>
+__host__ void
+scratch_cmux_tree(void *v_stream, uint32_t gpu_index, int8_t **cmux_tree_buffer,
+                  uint32_t glwe_dimension, uint32_t polynomial_size,
+                  uint32_t level_count, uint32_t r, uint32_t tau,
+                  uint32_t max_shared_memory, bool allocate_gpu_memory) {
+  cudaSetDevice(gpu_index);
+  auto stream = static_cast<cudaStream_t *>(v_stream);
+
+  int memory_needed_per_block =
+      get_memory_needed_per_block_cmux_tree<Torus>(polynomial_size);
+  if (max_shared_memory >= memory_needed_per_block) {
+    check_cuda_error(cudaFuncSetAttribute(
+        device_batch_cmux<Torus, params, FULLSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, memory_needed_per_block));
+    check_cuda_error(
+        cudaFuncSetCacheConfig(device_batch_cmux<Torus, params, FULLSM>,
+                               cudaFuncCachePreferShared));
+  }
+
+  if (allocate_gpu_memory) {
+    int buffer_size = get_buffer_size_cmux_tree<Torus>(
+        glwe_dimension, polynomial_size, level_count, r, tau,
+        max_shared_memory);
+    *cmux_tree_buffer =
+        (int8_t *)cuda_malloc_async(buffer_size, stream, gpu_index);
+    check_cuda_error(cudaGetLastError());
+  }
+}
+
 /*
  * This kernel executes the CMUX tree used by the hybrid packing of the WoPBS.
  *
@@ -259,12 +325,12 @@ __global__ void device_batch_cmux(Torus *glwe_array_out, Torus *glwe_array_in,
  * - tau: The quantity of CMUX trees that should be executed
  */
 template <typename Torus, class params>
-__host__ void host_cmux_tree(void *v_stream, uint32_t gpu_index,
-                             Torus *glwe_array_out, Torus *ggsw_in,
-                             Torus *lut_vector, uint32_t glwe_dimension,
-                             uint32_t polynomial_size, uint32_t base_log,
-                             uint32_t level_count, uint32_t r, uint32_t tau,
-                             uint32_t max_shared_memory) {
+__host__ void
+host_cmux_tree(void *v_stream, uint32_t gpu_index, Torus *glwe_array_out,
+               Torus *ggsw_in, Torus *lut_vector, int8_t *cmux_tree_buffer,
+               uint32_t glwe_dimension, uint32_t polynomial_size,
+               uint32_t base_log, uint32_t level_count, uint32_t r,
+               uint32_t tau, uint32_t max_shared_memory) {
   cudaSetDevice(gpu_index);
   auto stream = static_cast<cudaStream_t *>(v_stream);
 
@@ -277,11 +343,7 @@ __host__ void host_cmux_tree(void *v_stream, uint32_t gpu_index,
   }
 
   int memory_needed_per_block =
-      sizeof(Torus) * polynomial_size +       // glwe_sub_mask
-      sizeof(Torus) * polynomial_size +       // glwe_sub_body
-      sizeof(double2) * polynomial_size / 2 + // mask_res_fft
-      sizeof(double2) * polynomial_size / 2 + // body_res_fft
-      sizeof(double2) * polynomial_size / 2;  // glwe_fft
+      get_memory_needed_per_block_cmux_tree<Torus>(polynomial_size);
 
   dim3 thds(polynomial_size / params::opt, 1, 1);
 
@@ -289,51 +351,45 @@ __host__ void host_cmux_tree(void *v_stream, uint32_t gpu_index,
   int ggsw_size = polynomial_size * (glwe_dimension + 1) *
                   (glwe_dimension + 1) * level_count;
 
-  double2 *d_ggsw_fft_in = (double2 *)cuda_malloc_async(
-      r * ggsw_size * sizeof(double), stream, gpu_index);
+  double2 *d_ggsw_fft_in = (double2 *)cmux_tree_buffer;
+  int8_t *d_mem_fft =
+      (int8_t *)d_ggsw_fft_in + (ptrdiff_t)(r * ggsw_size * sizeof(double));
 
   batch_fft_ggsw_vector<Torus, params>(
-      stream, d_ggsw_fft_in, ggsw_in, r, glwe_dimension, polynomial_size,
-      level_count, gpu_index, max_shared_memory);
+      stream, d_ggsw_fft_in, ggsw_in, d_mem_fft, r, glwe_dimension,
+      polynomial_size, level_count, gpu_index, max_shared_memory);
 
   //////////////////////
 
   // Allocate global memory in case parameters are too large
-  int8_t *d_mem;
-  if (max_shared_memory < memory_needed_per_block) {
-    d_mem = (int8_t *)cuda_malloc_async(
-        memory_needed_per_block * (1 << (r - 1)) * tau, stream, gpu_index);
-  } else {
-    check_cuda_error(cudaFuncSetAttribute(
-        device_batch_cmux<Torus, params, FULLSM>,
-        cudaFuncAttributeMaxDynamicSharedMemorySize, memory_needed_per_block));
-    check_cuda_error(
-        cudaFuncSetCacheConfig(device_batch_cmux<Torus, params, FULLSM>,
-                               cudaFuncCachePreferShared));
+  int8_t *d_buffer1 = d_mem_fft;
+  if (max_shared_memory < polynomial_size * sizeof(double)) {
+    d_buffer1 = d_mem_fft + (ptrdiff_t)(polynomial_size * sizeof(double));
   }
 
   // Allocate buffers
   int glwe_size = (glwe_dimension + 1) * polynomial_size;
-  Torus *d_buffer1 = (Torus *)cuda_malloc_async(
-      num_lut * tau * glwe_size * sizeof(Torus), stream, gpu_index);
-  Torus *d_buffer2 = (Torus *)cuda_malloc_async(
-      num_lut * tau * glwe_size * sizeof(Torus), stream, gpu_index);
+  int8_t *d_buffer2 =
+      d_buffer1 + (ptrdiff_t)(num_lut * tau * glwe_size * sizeof(Torus));
 
-  add_padding_to_lut_async<Torus, params>(d_buffer1, lut_vector, glwe_dimension,
-                                          num_lut * tau, stream);
+  add_padding_to_lut_async<Torus, params>(
+      (Torus *)d_buffer1, lut_vector, glwe_dimension, num_lut * tau, stream);
 
   Torus *output;
   // Run the cmux tree
   for (int layer_idx = 0; layer_idx < r; layer_idx++) {
-    output = (layer_idx % 2 ? d_buffer1 : d_buffer2);
-    Torus *input = (layer_idx % 2 ? d_buffer2 : d_buffer1);
+    output = (layer_idx % 2 ? (Torus *)d_buffer1 : (Torus *)d_buffer2);
+    Torus *input = (layer_idx % 2 ? (Torus *)d_buffer2 : (Torus *)d_buffer1);
 
     int num_cmuxes = (1 << (r - 1 - layer_idx));
     dim3 grid(num_cmuxes, tau, 1);
 
+    int8_t *d_mem =
+        d_buffer2 + (ptrdiff_t)(num_lut * tau * glwe_size * sizeof(Torus));
+
     // walks horizontally through the leaves
-    if (max_shared_memory < memory_needed_per_block)
+    if (max_shared_memory < memory_needed_per_block) {
       device_batch_cmux<Torus, params, NOSM>
          <<<grid, thds, 0, *stream>>>(output, input, d_ggsw_fft_in, d_mem,
                                       memory_needed_per_block,
@@ -341,7 +397,7 @@ __host__ void host_cmux_tree(void *v_stream, uint32_t gpu_index,
                                       polynomial_size, base_log, level_count,
                                       layer_idx, // r
                                       num_lut);
-    else
+    } else {
      device_batch_cmux<Torus, params, FULLSM>
          <<<grid, thds, memory_needed_per_block, *stream>>>(
              output, input, d_ggsw_fft_in, d_mem, memory_needed_per_block,
@@ -349,20 +405,15 @@ __host__ void host_cmux_tree(void *v_stream, uint32_t gpu_index,
              polynomial_size, base_log, level_count,
              layer_idx, // r
              num_lut);
+    }
     check_cuda_error(cudaGetLastError());
   }
 
-  for (int i = 0; i < tau; i++)
+  for (int i = 0; i < tau; i++) {
     check_cuda_error(cudaMemcpyAsync(
         glwe_array_out + i * glwe_size, output + i * num_lut * glwe_size,
         glwe_size * sizeof(Torus), cudaMemcpyDeviceToDevice, *stream));
-
-  // Free memory
-  cuda_drop_async(d_ggsw_fft_in, stream, gpu_index);
-  cuda_drop_async(d_buffer1, stream, gpu_index);
-  cuda_drop_async(d_buffer2, stream, gpu_index);
-  if (max_shared_memory < memory_needed_per_block)
-    cuda_drop_async(d_mem, stream, gpu_index);
+  }
 }
 
 /*
@@ -495,10 +546,13 @@ __host__ void host_blind_rotate_and_sample_extraction(
   double2 *d_ggsw_fft_in = (double2 *)cuda_malloc_async(
       mbr_size * ggsw_size * sizeof(double), stream, gpu_index);
+  int8_t *d_mem_fft = (int8_t *)cuda_malloc_async(
+      polynomial_size * sizeof(double), stream, gpu_index);
 
   batch_fft_ggsw_vector<Torus, params>(
-      stream, d_ggsw_fft_in, ggsw_in, mbr_size, glwe_dimension, polynomial_size,
-      level_count, gpu_index, max_shared_memory);
+      stream, d_ggsw_fft_in, ggsw_in, d_mem_fft, mbr_size, glwe_dimension,
+      polynomial_size, level_count, gpu_index, max_shared_memory);
   check_cuda_error(cudaGetLastError());
+  cuda_drop_async(d_mem_fft, stream, gpu_index);
 
   //
   dim3 thds(polynomial_size / params::opt, 1, 1);
@@ -525,4 +579,4 @@ __host__ void host_blind_rotate_and_sample_extraction(
   if (max_shared_memory < memory_needed_per_block)
     cuda_drop_async(d_mem, stream, gpu_index);
 }
-#endif // VERTICAL_PACKING_H
+#endif // VERTICAL_PACKING_CUH
diff --git a/src/wop_bootstrap.cuh b/src/wop_bootstrap.cuh
index 23464ace5..0cb9cb5c0 100644
--- a/src/wop_bootstrap.cuh
+++ b/src/wop_bootstrap.cuh
@@ -68,9 +68,13 @@ __host__ void scratch_circuit_bootstrap_vertical_packing(
   // allocate and initialize device pointers for circuit bootstrap and vertical
   // packing
   if (allocate_gpu_memory) {
-    int buffer_size = get_buffer_size_cbs_vp<Torus>(
-        glwe_dimension, lwe_dimension, polynomial_size, level_count_cbs,
-        number_of_inputs, tau);
+    int buffer_size =
+        get_buffer_size_cbs_vp<Torus>(glwe_dimension, lwe_dimension,
+                                      polynomial_size, level_count_cbs,
+                                      number_of_inputs, tau) +
+        get_buffer_size_cmux_tree<Torus>(glwe_dimension, polynomial_size,
+                                         level_count_cbs, r, tau,
+                                         max_shared_memory);
     *cbs_vp_buffer =
         (int8_t *)cuda_malloc_async(buffer_size, stream, gpu_index);
   }
@@ -88,6 +92,9 @@ __host__ void scratch_circuit_bootstrap_vertical_packing(
 
   uint32_t bits = sizeof(Torus) * 8;
   *cbs_delta_log = (bits - 1);
+  scratch_cmux_tree<Torus, params>(
+      v_stream, gpu_index, cbs_vp_buffer, glwe_dimension, polynomial_size,
+      level_count_cbs, r, tau, max_shared_memory, false);
 }
 
 /*
@@ -157,12 +164,15 @@ __host__ void host_circuit_bootstrap_vertical_packing(
   // split the vec of GGSW in two, the msb GGSW is for the CMux tree and the
   // lsb GGSW is for the last blind rotation.
   uint32_t r = number_of_inputs - params::log2_degree;
+  int8_t *cmux_tree_buffer =
+      (int8_t *)glwe_array_out +
+      tau * (glwe_dimension + 1) * polynomial_size * sizeof(Torus);
   // CMUX Tree
   // r = tau * p - log2(N)
   host_cmux_tree<Torus, params>(
-      v_stream, gpu_index, glwe_array_out, ggsw_out, lut_vector, glwe_dimension,
-      polynomial_size, base_log_cbs, level_count_cbs, r, tau,
-      max_shared_memory);
+      v_stream, gpu_index, glwe_array_out, ggsw_out, lut_vector,
+      cmux_tree_buffer, glwe_dimension, polynomial_size, base_log_cbs,
+      level_count_cbs, r, tau, max_shared_memory);
   check_cuda_error(cudaGetLastError());
 
   // Blind rotation + sample extraction
@@ -215,10 +225,14 @@ scratch_wop_pbs(void *v_stream, uint32_t gpu_index, int8_t **wop_pbs_buffer,
       number_of_inputs * number_of_bits_to_extract;
   uint32_t tau = number_of_inputs;
   uint32_t r = cbs_vp_number_of_inputs - params::log2_degree;
-  int buffer_size = get_buffer_size_cbs_vp<Torus>(
-                        glwe_dimension, lwe_dimension, polynomial_size,
-                        level_count_cbs, cbs_vp_number_of_inputs, tau) +
-                    wop_pbs_buffer_size;
+  int buffer_size =
+      get_buffer_size_cbs_vp<Torus>(glwe_dimension, lwe_dimension,
+                                    polynomial_size, level_count_cbs,
+                                    cbs_vp_number_of_inputs, tau) +
+      get_buffer_size_cmux_tree<Torus>(glwe_dimension, polynomial_size,
+                                       level_count_cbs, r, tau,
+                                       max_shared_memory) +
+      wop_pbs_buffer_size;
   *wop_pbs_buffer =
       (int8_t *)cuda_malloc_async(buffer_size, stream, gpu_index);
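
Usage note (not part of the patch): splitting the entry points into scratch/run/cleanup means callers now allocate the CMUX-tree workspace once and hand it to every call instead of letting the host function allocate and free it internally. The sketch below shows the intended call sequence on the 64-bit entry points declared in include/vertical_packing.h; the wrapper function name and the parameter values are illustrative placeholders, not values taken from the PR.

// Sketch only: hypothetical wrapper and parameter values for illustration.
#include "vertical_packing.h"
#include <cstdint>

void cmux_tree_64_example(void *v_stream, uint32_t gpu_index,
                          void *glwe_array_out, void *ggsw_in, void *lut_vector,
                          uint32_t max_shared_memory) {
  // Hypothetical parameters; real values come from the cryptographic setup.
  uint32_t glwe_dimension = 1, polynomial_size = 1024, base_log = 6,
           level_count = 3, r = 2, tau = 1;

  // 1. Allocate the device workspace once and configure shared-memory options.
  int8_t *cmux_tree_buffer = nullptr;
  scratch_cuda_cmux_tree_64(v_stream, gpu_index, &cmux_tree_buffer,
                            glwe_dimension, polynomial_size, level_count, r,
                            tau, max_shared_memory,
                            /*allocate_gpu_memory=*/true);

  // 2. Run the CMUX tree; the buffer can be reused across calls that share
  //    the same parameters.
  cuda_cmux_tree_64(v_stream, gpu_index, glwe_array_out, ggsw_in, lut_vector,
                    cmux_tree_buffer, glwe_dimension, polynomial_size, base_log,
                    level_count, r, tau, max_shared_memory);

  // 3. Release the workspace asynchronously on the same stream.
  cleanup_cuda_cmux_tree(v_stream, gpu_index, &cmux_tree_buffer);
}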