diff --git a/include/linear_algebra.h b/include/linear_algebra.h
index 4fd7ec80d..c47ea7599 100644
--- a/include/linear_algebra.h
+++ b/include/linear_algebra.h
@@ -35,6 +35,14 @@ void cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
     void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
     void *plaintext_array_in, uint32_t input_lwe_dimension,
     uint32_t input_lwe_ciphertext_count);
+void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32(
+    void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
+    void *cleartext_array_in, uint32_t input_lwe_dimension,
+    uint32_t input_lwe_ciphertext_count);
+void cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
+    void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
+    void *cleartext_array_in, uint32_t input_lwe_dimension,
+    uint32_t input_lwe_ciphertext_count);
 }

 #endif // CUDA_LINALG_H_
diff --git a/src/bootstrap_amortized.cuh b/src/bootstrap_amortized.cuh
index 641870f87..40d6414a3 100644
--- a/src/bootstrap_amortized.cuh
+++ b/src/bootstrap_amortized.cuh
@@ -10,11 +10,11 @@
 #include "cooperative_groups.h"

 #include "../include/helper_cuda.h"
-#include "device.h"
 #include "bootstrap.h"
 #include "complex/operations.cuh"
 #include "crypto/gadget.cuh"
 #include "crypto/torus.cuh"
+#include "device.h"
 #include "fft/bnsmfft.cuh"
 #include "fft/smfft.cuh"
 #include "fft/twiddles.cuh"
@@ -338,7 +338,8 @@ __host__ void host_bootstrap_amortized(
   // from one of three templates (no use, partial use or full use
   // of shared memory)
   if (max_shared_memory < SM_PART) {
-    d_mem = (char*) cuda_malloc_async(DM_FULL * input_lwe_ciphertext_count, *stream);
+    d_mem = (char *)cuda_malloc_async(DM_FULL * input_lwe_ciphertext_count,
+                                      *stream);
     device_bootstrap_amortized<Torus, params, NOSM><<<grid, thds, 0, *stream>>>(
         lwe_array_out, lut_vector, lut_vector_indexes, lwe_array_in,
         bootstrapping_key, d_mem, input_lwe_dimension, polynomial_size,
@@ -348,7 +349,8 @@
         cudaFuncAttributeMaxDynamicSharedMemorySize, SM_PART);
     cudaFuncSetCacheConfig(device_bootstrap_amortized<Torus, params, PARTIALSM>,
                           cudaFuncCachePreferShared);
-    d_mem = (char*) cuda_malloc_async(DM_PART * input_lwe_ciphertext_count, *stream);
+    d_mem = (char *)cuda_malloc_async(DM_PART * input_lwe_ciphertext_count,
+                                      *stream);
     device_bootstrap_amortized<Torus, params, PARTIALSM>
         <<<grid, thds, SM_PART, *stream>>>(
             lwe_array_out, lut_vector, lut_vector_indexes, lwe_array_in,
@@ -366,7 +368,7 @@
     checkCudaErrors(cudaFuncSetCacheConfig(
         device_bootstrap_amortized<Torus, params, FULLSM>,
         cudaFuncCachePreferShared));
-    d_mem = (char*) cuda_malloc_async(0, *stream);
+    d_mem = (char *)cuda_malloc_async(0, *stream);
     device_bootstrap_amortized<Torus, params, FULLSM>
         <<<grid, thds, SM_FULL, *stream>>>(
diff --git a/src/bootstrap_low_latency.cuh b/src/bootstrap_low_latency.cuh
index f7c127853..0610c1ab1 100644
--- a/src/bootstrap_low_latency.cuh
+++ b/src/bootstrap_low_latency.cuh
@@ -10,11 +10,11 @@
 #include "cooperative_groups.h"

 #include "../include/helper_cuda.h"
-#include "device.h"
 #include "bootstrap.h"
 #include "complex/operations.cuh"
 #include "crypto/gadget.cuh"
 #include "crypto/torus.cuh"
+#include "device.h"
 #include "fft/bnsmfft.cuh"
 #include "fft/smfft.cuh"
 #include "fft/twiddles.cuh"
@@ -262,8 +262,10 @@ host_bootstrap_low_latency(void *v_stream, Torus *lwe_array_out,
   int buffer_size_per_gpu =
       level_count * num_samples * polynomial_size / 2 * sizeof(double2);
-  double2 *mask_buffer_fft = (double2*) cuda_malloc_async(buffer_size_per_gpu, *stream);
-  double2 *body_buffer_fft = (double2*) cuda_malloc_async(buffer_size_per_gpu, *stream);
+  double2 *mask_buffer_fft =
+      (double2 *)cuda_malloc_async(buffer_size_per_gpu, *stream);
+  double2 *body_buffer_fft =
+      (double2 *)cuda_malloc_async(buffer_size_per_gpu, *stream);

   int bytes_needed = sizeof(int16_t) * polynomial_size + // accumulator_decomp
                      sizeof(Torus) * polynomial_size +   // accumulator
diff --git a/src/bootstrap_wop.cuh b/src/bootstrap_wop.cuh
index bca42d583..fcd8ea3dc 100644
--- a/src/bootstrap_wop.cuh
+++ b/src/bootstrap_wop.cuh
@@ -4,12 +4,12 @@
 #include "cooperative_groups.h"

 #include "../include/helper_cuda.h"
-#include "device.h"
 #include "bootstrap.h"
 #include "bootstrap_low_latency.cuh"
 #include "complex/operations.cuh"
 #include "crypto/ggsw.cuh"
 #include "crypto/torus.cuh"
+#include "device.h"
 #include "fft/bnsmfft.cuh"
 #include "fft/smfft.cuh"
 #include "fft/twiddles.cuh"
@@ -300,7 +300,8 @@ void host_cmux_tree(void *v_stream, Torus *glwe_array_out, Torus *ggsw_in,
   int ggsw_size = r * polynomial_size * (glwe_dimension + 1) *
                   (glwe_dimension + 1) * level_count;
-  double2 *d_ggsw_fft_in = (double2*) cuda_malloc_async(ggsw_size * sizeof(double), *stream);
+  double2 *d_ggsw_fft_in =
+      (double2 *)cuda_malloc_async(ggsw_size * sizeof(double), *stream);

   batch_fft_ggsw_vector(v_stream, d_ggsw_fft_in, ggsw_in, r, glwe_dimension,
@@ -311,7 +312,8 @@
   // Allocate global memory in case parameters are too large
   char *d_mem;
   if (max_shared_memory < memory_needed_per_block) {
-    d_mem = (char*) cuda_malloc_async(memory_needed_per_block * (1 << (r - 1)), *stream);
+    d_mem = (char *)cuda_malloc_async(memory_needed_per_block * (1 << (r - 1)),
+                                      *stream);
   } else {
     checkCudaErrors(cudaFuncSetAttribute(
         device_batch_cmux<Torus, STorus, params, FULLSM>,
@@ -324,8 +326,10 @@
   // Allocate buffers
   int glwe_size = (glwe_dimension + 1) * polynomial_size;
-  Torus *d_buffer1 = (Torus*) cuda_malloc_async(num_lut * glwe_size * sizeof(Torus), *stream);
-  Torus *d_buffer2 = (Torus*) cuda_malloc_async(num_lut * glwe_size * sizeof(Torus), *stream);
+  Torus *d_buffer1 =
+      (Torus *)cuda_malloc_async(num_lut * glwe_size * sizeof(Torus), *stream);
+  Torus *d_buffer2 =
+      (Torus *)cuda_malloc_async(num_lut * glwe_size * sizeof(Torus), *stream);

   checkCudaErrors(cudaMemcpyAsync(d_buffer1, lut_vector,
                                   num_lut * glwe_size * sizeof(Torus),
@@ -369,7 +373,7 @@
   // later.
   checkCudaErrors(cudaStreamSynchronize(*stream));

-// Free memory
+  // Free memory
   cuda_drop_async(d_ggsw_fft_in, *stream);
   cuda_drop_async(d_buffer1, *stream);
   cuda_drop_async(d_buffer2, *stream);
diff --git a/src/device.cu b/src/device.cu
index 97fd70651..886bcbd61 100644
--- a/src/device.cu
+++ b/src/device.cu
@@ -31,15 +31,16 @@ void *cuda_malloc(uint64_t size, uint32_t gpu_index) {
   return ptr;
 }

-/// Allocates a size-byte array at the device memory. Tries to do it asynchronously.
+/// Allocates a size-byte array at the device memory. Tries to do it
+/// asynchronously.
 void *cuda_malloc_async(uint64_t size, cudaStream_t stream) {
   void *ptr;
-  #if (CUDART_VERSION < 11020)
+#if (CUDART_VERSION < 11020)
   checkCudaErrors(cudaMalloc((void **)&ptr, size));
-  #else
+#else
   checkCudaErrors(cudaMallocAsync((void **)&ptr, size, stream));
-  #endif
+#endif
   return ptr;
 }
@@ -155,11 +156,11 @@ int cuda_drop(void *ptr, uint32_t gpu_index) {

 /// Drop a cuda array. Tries to do it asynchronously
 int cuda_drop_async(void *ptr, cudaStream_t stream) {
-  #if (CUDART_VERSION < 11020)
+#if (CUDART_VERSION < 11020)
   checkCudaErrors(cudaFree(ptr));
-  #else
+#else
   checkCudaErrors(cudaFreeAsync(ptr, stream));
-  #endif
+#endif
   return 0;
 }
diff --git a/src/multiplication.cu b/src/multiplication.cu
new file mode 100644
index 000000000..397c16255
--- /dev/null
+++ b/src/multiplication.cu
@@ -0,0 +1,24 @@
+#include "multiplication.cuh"
+
+void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32(
+    void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
+    void *cleartext_array_in, uint32_t input_lwe_dimension,
+    uint32_t input_lwe_ciphertext_count) {
+
+  host_cleartext_multiplication(
+      v_stream, gpu_index, static_cast<uint32_t *>(lwe_array_out),
+      static_cast<uint32_t *>(lwe_array_in),
+      static_cast<uint32_t *>(cleartext_array_in), input_lwe_dimension,
+      input_lwe_ciphertext_count);
+}
+void cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
+    void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
+    void *cleartext_array_in, uint32_t input_lwe_dimension,
+    uint32_t input_lwe_ciphertext_count) {
+
+  host_cleartext_multiplication(
+      v_stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
+      static_cast<uint64_t *>(lwe_array_in),
+      static_cast<uint64_t *>(cleartext_array_in), input_lwe_dimension,
+      input_lwe_ciphertext_count);
+}
diff --git a/src/multiplication.cuh b/src/multiplication.cuh
new file mode 100644
index 000000000..74db53fc8
--- /dev/null
+++ b/src/multiplication.cuh
@@ -0,0 +1,52 @@
+#ifndef CUDA_ADD_H
+#define CUDA_ADD_H
+
+#ifdef __CDT_PARSER__
+#undef __CUDA_RUNTIME_H__
+#include <cuda_runtime.h>
+#include <device_launch_parameters.h>
+#endif
+
+#include "linear_algebra.h"
+#include "utils/kernel_dimensions.cuh"
+
+template <typename T>
+__global__ void
+cleartext_multiplication(T *output, T *lwe_input, T *cleartext_input,
+                         uint32_t input_lwe_dimension, uint32_t num_entries) {
+
+  int tid = threadIdx.x;
+  if (tid < num_entries) {
+    int index = blockIdx.x * blockDim.x + tid;
+    int cleartext_index = index / (input_lwe_dimension + 1);
+    // Here we take advantage of the wrapping behaviour of uint
+    output[index] = lwe_input[index] * cleartext_input[cleartext_index];
+  }
+}
+
+template <typename T>
+__host__ void
+host_cleartext_multiplication(void *v_stream, uint32_t gpu_index, T *output,
+                              T *lwe_input, T *cleartext_input,
+                              uint32_t input_lwe_dimension,
+                              uint32_t input_lwe_ciphertext_count) {
+
+  cudaSetDevice(gpu_index);
+  // lwe_size includes the presence of the body
+  // whereas lwe_dimension is the number of elements in the mask
+  int lwe_size = input_lwe_dimension + 1;
+  // Create a 1-dimensional grid of threads
+  int num_blocks = 0, num_threads = 0;
+  int num_entries = input_lwe_ciphertext_count * lwe_size;
+  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
+  dim3 grid(num_blocks, 1, 1);
+  dim3 thds(num_threads, 1, 1);
+
+  auto stream = static_cast<cudaStream_t *>(v_stream);
+  cleartext_multiplication<<<grid, thds, 0, *stream>>>(
+      output, lwe_input, cleartext_input, input_lwe_dimension, num_entries);
+
+  cudaStreamSynchronize(*stream);
+}
+
+#endif // CUDA_ADD_H
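
Not part of the patch: a minimal host-side sketch of how the new 64-bit entry point might be driven, assuming the declarations above are visible (cuda_mult_lwe_ciphertext_vector_cleartext_vector_64 from linear_algebra.h, cuda_malloc_async / cuda_drop_async from the device helpers), and assuming the buffer layout the kernel indexes: input_lwe_dimension + 1 words per ciphertext and one cleartext per ciphertext. The function name mult_example and the include paths are illustrative only; error checking is omitted.

#include <cuda_runtime.h>
#include <cstdint>
#include <vector>
#include "device.h"          // assumed home of cuda_malloc_async / cuda_drop_async
#include "linear_algebra.h"  // declares cuda_mult_lwe_ciphertext_vector_cleartext_vector_64

// Multiplies each 64-bit LWE ciphertext in h_lwe_in by its own cleartext on the GPU.
void mult_example(const std::vector<uint64_t> &h_lwe_in,
                  const std::vector<uint64_t> &h_cleartexts,
                  std::vector<uint64_t> &h_lwe_out,
                  uint32_t lwe_dimension, uint32_t ct_count) {
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  uint32_t gpu_index = 0;

  uint64_t lwe_bytes =
      (uint64_t)ct_count * (lwe_dimension + 1) * sizeof(uint64_t);
  uint64_t cleartext_bytes = (uint64_t)ct_count * sizeof(uint64_t);

  // Device buffers obtained through the helper patched above (stream-ordered
  // when the CUDA runtime supports it)
  auto *d_in = (uint64_t *)cuda_malloc_async(lwe_bytes, stream);
  auto *d_out = (uint64_t *)cuda_malloc_async(lwe_bytes, stream);
  auto *d_clear = (uint64_t *)cuda_malloc_async(cleartext_bytes, stream);

  cudaMemcpyAsync(d_in, h_lwe_in.data(), lwe_bytes, cudaMemcpyHostToDevice,
                  stream);
  cudaMemcpyAsync(d_clear, h_cleartexts.data(), cleartext_bytes,
                  cudaMemcpyHostToDevice, stream);

  // v_stream is a type-erased cudaStream_t*, matching the cast performed in
  // host_cleartext_multiplication
  cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
      &stream, gpu_index, d_out, d_in, d_clear, lwe_dimension, ct_count);

  h_lwe_out.resize((size_t)ct_count * (lwe_dimension + 1));
  cudaMemcpyAsync(h_lwe_out.data(), d_out, lwe_bytes, cudaMemcpyDeviceToHost,
                  stream);
  cudaStreamSynchronize(stream);

  cuda_drop_async(d_in, stream);
  cuda_drop_async(d_out, stream);
  cuda_drop_async(d_clear, stream);
  cudaStreamDestroy(stream);
}

Since host_cleartext_multiplication ends with a cudaStreamSynchronize, the entry point blocks until the kernel has finished, so the device-to-host copy above could equally be issued as a plain synchronous cudaMemcpy.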