diff --git a/include/bootstrap.h b/include/bootstrap.h index 3ba371236..d1bbd7dae 100644 --- a/include/bootstrap.h +++ b/include/bootstrap.h @@ -5,7 +5,8 @@ extern "C" { -void cuda_initialize_twiddles(uint32_t polynomial_size, uint32_t gpu_index); +void cuda_initialize_twiddles(uint32_t polynomial_size, void *v_stream, + uint32_t gpu_index); void cuda_convert_lwe_bootstrap_key_32(void *dest, void *src, void *v_stream, uint32_t gpu_index, diff --git a/include/device.h b/include/device.h index a993aaf5e..e656a925a 100644 --- a/include/device.h +++ b/include/device.h @@ -33,4 +33,6 @@ int cuda_drop(void *ptr, uint32_t gpu_index); int cuda_drop_async(void *ptr, cudaStream_t *stream, uint32_t gpu_index); int cuda_get_max_shared_memory(uint32_t gpu_index); + +int cuda_synchronize_stream(void *v_stream); } diff --git a/include/keyswitch.h b/include/keyswitch.h index 42526040d..8edc3492e 100644 --- a/include/keyswitch.h +++ b/include/keyswitch.h @@ -15,23 +15,19 @@ void cuda_keyswitch_lwe_ciphertext_vector_64( void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count, uint32_t num_samples); -void cuda_fp_keyswitch_lwe_to_glwe_32(void *v_stream, void *glwe_array_out, - void *lwe_array_in, void *fp_ksk_array, - uint32_t input_lwe_dimension, - uint32_t output_glwe_dimension, - uint32_t output_polynomial_size, - uint32_t base_log, uint32_t level_count, - uint32_t number_of_input_lwe, - uint32_t number_of_keys); +void cuda_fp_keyswitch_lwe_to_glwe_32( + void *v_stream, uint32_t gpu_index, void *glwe_array_out, + void *lwe_array_in, void *fp_ksk_array, uint32_t input_lwe_dimension, + uint32_t output_glwe_dimension, uint32_t output_polynomial_size, + uint32_t base_log, uint32_t level_count, uint32_t number_of_input_lwe, + uint32_t number_of_keys); -void cuda_fp_keyswitch_lwe_to_glwe_64(void *v_stream, void *glwe_array_out, - void *lwe_array_in, void *fp_ksk_array, - uint32_t input_lwe_dimension, - uint32_t output_glwe_dimension, - uint32_t output_polynomial_size, - uint32_t base_log, uint32_t level_count, - uint32_t number_of_input_lwe, - uint32_t number_of_keys); +void cuda_fp_keyswitch_lwe_to_glwe_64( + void *v_stream, uint32_t gpu_index, void *glwe_array_out, + void *lwe_array_in, void *fp_ksk_array, uint32_t input_lwe_dimension, + uint32_t output_glwe_dimension, uint32_t output_polynomial_size, + uint32_t base_log, uint32_t level_count, uint32_t number_of_input_lwe, + uint32_t number_of_keys); } #endif // CNCRT_KS_H_ diff --git a/src/addition.cuh b/src/addition.cuh index c33a7e5eb..6bd698cc8 100644 --- a/src/addition.cuh +++ b/src/addition.cuh @@ -59,8 +59,6 @@ __host__ void host_addition(void *v_stream, uint32_t gpu_index, T *output, auto stream = static_cast(v_stream); addition<<>>(output, input_1, input_2, num_entries); checkCudaErrors(cudaGetLastError()); - - cudaStreamSynchronize(*stream); } template @@ -86,7 +84,5 @@ __host__ void host_addition_plaintext(void *v_stream, uint32_t gpu_index, plaintext_addition<<>>( output, lwe_input, plaintext_input, input_lwe_dimension, num_entries); checkCudaErrors(cudaGetLastError()); - - cudaStreamSynchronize(*stream); } #endif // CUDA_ADD_H diff --git a/src/bit_extraction.cuh b/src/bit_extraction.cuh index f8d6539a0..b26fa65fa 100644 --- a/src/bit_extraction.cuh +++ b/src/bit_extraction.cuh @@ -145,6 +145,7 @@ __host__ void host_extract_bits( uint32_t base_log_ksk, uint32_t level_count_ksk, uint32_t number_of_samples, uint32_t max_shared_memory) { + cudaSetDevice(gpu_index); auto stream = static_cast(v_stream); uint32_t ciphertext_n_bits = sizeof(Torus) * 8; diff --git a/src/bootstrap_amortized.cuh b/src/bootstrap_amortized.cuh index 067f2a9c5..24183b5cd 100644 --- a/src/bootstrap_amortized.cuh +++ b/src/bootstrap_amortized.cuh @@ -9,7 +9,6 @@ #include "cooperative_groups.h" -#include "../include/helper_cuda.h" #include "bootstrap.h" #include "complex/operations.cuh" #include "crypto/gadget.cuh" @@ -18,11 +17,11 @@ #include "fft/bnsmfft.cuh" #include "fft/smfft.cuh" #include "fft/twiddles.cuh" +#include "helper_cuda.h" #include "polynomial/functions.cuh" #include "polynomial/parameters.cuh" #include "polynomial/polynomial.cuh" #include "polynomial/polynomial_math.cuh" -#include "utils/memory.cuh" #include "utils/timer.cuh" template @@ -284,6 +283,7 @@ __host__ void host_bootstrap_amortized( uint32_t input_lwe_ciphertext_count, uint32_t num_lut_vectors, uint32_t lwe_idx, uint32_t max_shared_memory) { + cudaSetDevice(gpu_index); int SM_FULL = sizeof(Torus) * polynomial_size + // accumulator mask sizeof(Torus) * polynomial_size + // accumulator body sizeof(Torus) * polynomial_size + // accumulator mask rotated @@ -356,9 +356,6 @@ __host__ void host_bootstrap_amortized( } checkCudaErrors(cudaGetLastError()); - // Synchronize the streams before copying the result to lwe_array_out at the - // right place - cudaStreamSynchronize(*stream); cuda_drop_async(d_mem, stream, gpu_index); } diff --git a/src/bootstrap_low_latency.cuh b/src/bootstrap_low_latency.cuh index 4c2cca490..059812849 100644 --- a/src/bootstrap_low_latency.cuh +++ b/src/bootstrap_low_latency.cuh @@ -9,7 +9,6 @@ #include "cooperative_groups.h" -#include "../include/helper_cuda.h" #include "bootstrap.h" #include "complex/operations.cuh" #include "crypto/gadget.cuh" @@ -18,10 +17,10 @@ #include "fft/bnsmfft.cuh" #include "fft/smfft.cuh" #include "fft/twiddles.cuh" +#include "helper_cuda.h" #include "polynomial/parameters.cuh" #include "polynomial/polynomial.cuh" #include "polynomial/polynomial_math.cuh" -#include "utils/memory.cuh" #include "utils/timer.cuh" // Cooperative groups are used in the low latency PBS @@ -263,6 +262,7 @@ __host__ void host_bootstrap_low_latency( uint32_t input_lwe_ciphertext_count, uint32_t num_lut_vectors, uint32_t max_shared_memory) { + cudaSetDevice(gpu_index); auto stream = static_cast(v_stream); int buffer_size_per_gpu = level_count * input_lwe_ciphertext_count * @@ -346,7 +346,6 @@ __host__ void host_bootstrap_low_latency( checkCudaErrors(cudaGetLastError()); // Synchronize the streams before copying the result to lwe_array_out at the // right place - cudaStreamSynchronize(*stream); cuda_drop_async(mask_buffer_fft, stream, gpu_index); cuda_drop_async(body_buffer_fft, stream, gpu_index); cuda_drop_async(d_mem, stream, gpu_index); diff --git a/src/circuit_bootstrap.cuh b/src/circuit_bootstrap.cuh index 9b3aa36a6..af3194bf3 100644 --- a/src/circuit_bootstrap.cuh +++ b/src/circuit_bootstrap.cuh @@ -1,11 +1,11 @@ #ifndef CBS_H #define CBS_H -#include "../include/helper_cuda.h" #include "bit_extraction.cuh" #include "bootstrap.h" #include "bootstrap_amortized.cuh" #include "device.h" +#include "helper_cuda.h" #include "keyswitch.cuh" #include "polynomial/parameters.cuh" #include "utils/timer.cuh" @@ -113,6 +113,7 @@ __host__ void host_circuit_bootstrap( uint32_t level_bsk, uint32_t base_log_bsk, uint32_t level_pksk, uint32_t base_log_pksk, uint32_t level_cbs, uint32_t base_log_cbs, uint32_t number_of_samples, uint32_t max_shared_memory) { + cudaSetDevice(gpu_index); auto stream = static_cast(v_stream); uint32_t ciphertext_n_bits = sizeof(Torus) * 8; @@ -151,12 +152,12 @@ __host__ void host_circuit_bootstrap( dim3 copy_block(params::degree / params::opt, 1, 1); // Add q/4 to center the error while computing a negacyclic LUT // copy pbs result (glwe_dimension + 1) times to be an input of fp-ks - copy_add_lwe_cbs<<>>( + copy_add_lwe_cbs<<>>( lwe_array_in_fp_ks_buffer, lwe_array_out_pbs_buffer, ciphertext_n_bits, base_log_cbs, level_cbs); cuda_fp_keyswitch_lwe_to_glwe( - v_stream, ggsw_out, lwe_array_in_fp_ks_buffer, fp_ksk_array, + v_stream, gpu_index, ggsw_out, lwe_array_in_fp_ks_buffer, fp_ksk_array, polynomial_size, glwe_dimension, polynomial_size, base_log_pksk, level_pksk, pbs_count * (glwe_dimension + 1), glwe_dimension + 1); } diff --git a/src/crypto/bootstrapping_key.cuh b/src/crypto/bootstrapping_key.cuh index f26a4c2a7..6148add7e 100644 --- a/src/crypto/bootstrapping_key.cuh +++ b/src/crypto/bootstrapping_key.cuh @@ -38,7 +38,8 @@ __device__ T *get_ith_body_kth_block(T *ptr, int i, int k, int level, polynomial_size / 2]; } -void cuda_initialize_twiddles(uint32_t polynomial_size, uint32_t gpu_index) { +void cuda_initialize_twiddles(uint32_t polynomial_size, void *v_stream, + uint32_t gpu_index) { cudaSetDevice(gpu_index); int sw_size = polynomial_size / 2; short *sw1_h, *sw2_h; @@ -61,10 +62,11 @@ void cuda_initialize_twiddles(uint32_t polynomial_size, uint32_t gpu_index) { cnt++; } } - cudaMemcpyToSymbol(SW1, sw1_h, sw_size * sizeof(short), 0, - cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(SW2, sw2_h, sw_size * sizeof(short), 0, - cudaMemcpyHostToDevice); + auto stream = static_cast(v_stream); + cudaMemcpyToSymbolAsync(SW1, sw1_h, sw_size * sizeof(short), 0, + cudaMemcpyHostToDevice, *stream); + cudaMemcpyToSymbolAsync(SW2, sw2_h, sw_size * sizeof(short), 0, + cudaMemcpyHostToDevice, *stream); free(sw1_h); free(sw2_h); } @@ -91,8 +93,8 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream, int blockSize = polynomial_size / choose_opt(polynomial_size); double2 *h_bsk = (double2 *)malloc(buffer_size); - double2 *d_bsk; - cudaMalloc((void **)&d_bsk, buffer_size); + auto stream = static_cast(v_stream); + double2 *d_bsk = (double2 *)cuda_malloc_async(buffer_size, stream, gpu_index); // compress real bsk to complex and divide it on DOUBLE_MAX for (int i = 0; i < total_polynomials; i++) { @@ -110,9 +112,8 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream, } } - cudaMemcpy(d_bsk, h_bsk, buffer_size, cudaMemcpyHostToDevice); + cuda_memcpy_async_to_gpu(d_bsk, h_bsk, buffer_size, stream, gpu_index); - auto stream = static_cast(v_stream); double2 *buffer; switch (polynomial_size) { case 512: diff --git a/src/device.cu b/src/device.cu index d890a49ca..c31e33d63 100644 --- a/src/device.cu +++ b/src/device.cu @@ -187,3 +187,9 @@ int cuda_get_max_shared_memory(uint32_t gpu_index) { } return max_shared_memory; } + +int cuda_synchronize_stream(void *v_stream) { + auto stream = static_cast(v_stream); + cudaStreamSynchronize(*stream); + return 0; +} diff --git a/src/keyswitch.cu b/src/keyswitch.cu index 51bc397fb..ff21233a8 100644 --- a/src/keyswitch.cu +++ b/src/keyswitch.cu @@ -46,17 +46,15 @@ void cuda_keyswitch_lwe_ciphertext_vector_64( /* Perform functional packing keyswitch on a batch of 32 bits input LWE * ciphertexts. See the equivalent function on 64 bit inputs for more details. */ -void cuda_fp_keyswitch_lwe_to_glwe_32(void *v_stream, void *glwe_array_out, - void *lwe_array_in, void *fp_ksk_array, - uint32_t input_lwe_dimension, - uint32_t output_glwe_dimension, - uint32_t output_polynomial_size, - uint32_t base_log, uint32_t level_count, - uint32_t number_of_input_lwe, - uint32_t number_of_keys) { +void cuda_fp_keyswitch_lwe_to_glwe_32( + void *v_stream, uint32_t gpu_index, void *glwe_array_out, + void *lwe_array_in, void *fp_ksk_array, uint32_t input_lwe_dimension, + uint32_t output_glwe_dimension, uint32_t output_polynomial_size, + uint32_t base_log, uint32_t level_count, uint32_t number_of_input_lwe, + uint32_t number_of_keys) { cuda_fp_keyswitch_lwe_to_glwe( - v_stream, static_cast(glwe_array_out), + v_stream, gpu_index, static_cast(glwe_array_out), static_cast(lwe_array_in), static_cast(fp_ksk_array), input_lwe_dimension, output_glwe_dimension, output_polynomial_size, base_log, level_count, @@ -68,6 +66,7 @@ void cuda_fp_keyswitch_lwe_to_glwe_32(void *v_stream, void *glwe_array_out, * * - `v_stream` is a void pointer to the Cuda stream to be used in the kernel * launch + * - `gpu_index` is the index of the GPU to be used in the kernel launch * - `glwe_array_out`: output batch of keyswitched ciphertexts * - `lwe_array_in`: input batch of num_samples LWE ciphertexts, containing * lwe_dimension_in mask values + 1 body value @@ -83,17 +82,15 @@ void cuda_fp_keyswitch_lwe_to_glwe_32(void *v_stream, void *glwe_array_out, * This function calls a wrapper to a device kernel that performs the functional * packing keyswitch. */ -void cuda_fp_keyswitch_lwe_to_glwe_64(void *v_stream, void *glwe_array_out, - void *lwe_array_in, void *fp_ksk_array, - uint32_t input_lwe_dimension, - uint32_t output_glwe_dimension, - uint32_t output_polynomial_size, - uint32_t base_log, uint32_t level_count, - uint32_t number_of_input_lwe, - uint32_t number_of_keys) { +void cuda_fp_keyswitch_lwe_to_glwe_64( + void *v_stream, uint32_t gpu_index, void *glwe_array_out, + void *lwe_array_in, void *fp_ksk_array, uint32_t input_lwe_dimension, + uint32_t output_glwe_dimension, uint32_t output_polynomial_size, + uint32_t base_log, uint32_t level_count, uint32_t number_of_input_lwe, + uint32_t number_of_keys) { cuda_fp_keyswitch_lwe_to_glwe( - v_stream, static_cast(glwe_array_out), + v_stream, gpu_index, static_cast(glwe_array_out), static_cast(lwe_array_in), static_cast(fp_ksk_array), input_lwe_dimension, output_glwe_dimension, output_polynomial_size, base_log, level_count, diff --git a/src/keyswitch.cuh b/src/keyswitch.cuh index 1fc236432..c384c72c7 100644 --- a/src/keyswitch.cuh +++ b/src/keyswitch.cuh @@ -170,6 +170,7 @@ __host__ void cuda_keyswitch_lwe_ciphertext_vector( uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count, uint32_t num_samples) { + cudaSetDevice(gpu_index); constexpr int ideal_threads = 128; int lwe_dim = lwe_dimension_out + 1; @@ -190,7 +191,8 @@ __host__ void cuda_keyswitch_lwe_ciphertext_vector( int shared_mem = sizeof(Torus) * (lwe_dimension_out + 1); - cudaMemset(lwe_array_out, 0, sizeof(Torus) * lwe_size_after); + auto stream = static_cast(v_stream); + cudaMemsetAsync(lwe_array_out, 0, sizeof(Torus) * lwe_size_after, *stream); dim3 grid(num_samples, 1, 1); dim3 threads(ideal_threads, 1, 1); @@ -198,21 +200,20 @@ __host__ void cuda_keyswitch_lwe_ciphertext_vector( cudaFuncSetAttribute(keyswitch, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_mem); - auto stream = static_cast(v_stream); keyswitch<<>>( lwe_array_out, lwe_array_in, ksk, lwe_dimension_in, lwe_dimension_out, base_log, level_count, lwe_lower, lwe_upper, cutoff); checkCudaErrors(cudaGetLastError()); - - cudaStreamSynchronize(*stream); } template __host__ void cuda_fp_keyswitch_lwe_to_glwe( - void *v_stream, Torus *glwe_array_out, Torus *lwe_array_in, - Torus *fp_ksk_array, uint32_t lwe_dimension_in, uint32_t glwe_dimension, - uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, - uint32_t number_of_input_lwe, uint32_t number_of_keys) { + void *v_stream, uint32_t gpu_index, Torus *glwe_array_out, + Torus *lwe_array_in, Torus *fp_ksk_array, uint32_t lwe_dimension_in, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, + uint32_t level_count, uint32_t number_of_input_lwe, + uint32_t number_of_keys) { + cudaSetDevice(gpu_index); int threads = 256; int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size; dim3 blocks(glwe_accumulator_size / threads, number_of_input_lwe, 1); @@ -223,8 +224,6 @@ __host__ void cuda_fp_keyswitch_lwe_to_glwe( glwe_array_out, lwe_array_in, fp_ksk_array, lwe_dimension_in, glwe_dimension, polynomial_size, base_log, level_count, number_of_input_lwe, number_of_keys); - - cudaStreamSynchronize(*stream); } #endif diff --git a/src/multiplication.cuh b/src/multiplication.cuh index 7ff70f74b..3f770966e 100644 --- a/src/multiplication.cuh +++ b/src/multiplication.cuh @@ -47,8 +47,6 @@ host_cleartext_multiplication(void *v_stream, uint32_t gpu_index, T *output, cleartext_multiplication<<>>( output, lwe_input, cleartext_input, input_lwe_dimension, num_entries); checkCudaErrors(cudaGetLastError()); - - cudaStreamSynchronize(*stream); } #endif // CUDA_MULT_H diff --git a/src/negation.cuh b/src/negation.cuh index 301fddfa0..b6536f7be 100644 --- a/src/negation.cuh +++ b/src/negation.cuh @@ -41,8 +41,6 @@ __host__ void host_negation(void *v_stream, uint32_t gpu_index, T *output, auto stream = static_cast(v_stream); negation<<>>(output, input, num_entries); checkCudaErrors(cudaGetLastError()); - - cudaStreamSynchronize(*stream); } #endif // CUDA_NEGATE_H diff --git a/src/polynomial/functions.cuh b/src/polynomial/functions.cuh index fa14a98a7..bbc839390 100644 --- a/src/polynomial/functions.cuh +++ b/src/polynomial/functions.cuh @@ -1,6 +1,6 @@ #ifndef GPU_POLYNOMIAL_FUNCTIONS #define GPU_POLYNOMIAL_FUNCTIONS -#include "utils/memory.cuh" +#include "helper_cuda.h" #include "utils/timer.cuh" /* diff --git a/src/polynomial/polynomial.cuh b/src/polynomial/polynomial.cuh index bef200966..e5022e21d 100644 --- a/src/polynomial/polynomial.cuh +++ b/src/polynomial/polynomial.cuh @@ -5,8 +5,8 @@ #include "crypto/torus.cuh" #include "fft/bnsmfft.cuh" #include "fft/smfft.cuh" +#include "helper_cuda.h" #include "parameters.cuh" -#include "utils/memory.cuh" #include "utils/timer.cuh" #include #include @@ -32,67 +32,6 @@ template class Vector; template class Twiddles; -template class VectorPolynomial { -public: - T *m_data; - uint32_t m_num_polynomials; - - __device__ VectorPolynomial(T *data, uint32_t num_polynomials) - : m_data(data), m_num_polynomials(num_polynomials) {} - - __device__ VectorPolynomial get_chunk(int chunk_num, - int chunk_size) { - int pos = chunk_num * chunk_size; - T *ptr = &m_data[pos]; - - return VectorPolynomial(ptr, chunk_size / params::degree); - } - - __host__ VectorPolynomial() {} - - __host__ VectorPolynomial(DeviceMemory &dmem, uint32_t num_polynomials, - int device) - : m_num_polynomials(num_polynomials) { - dmem.get_allocation(&m_data, m_num_polynomials * params::degree, device); - } - - __host__ VectorPolynomial(DeviceMemory &dmem, T *source, - uint32_t num_polynomials, int device) - : m_num_polynomials(num_polynomials) { - dmem.get_allocation_and_copy_async( - &m_data, source, m_num_polynomials * params::degree, device); - } - - __host__ void copy_to_host(T *dest) { - cudaMemcpyAsync(dest, m_data, - sizeof(T) * m_num_polynomials * params::degree, - cudaMemcpyDeviceToHost); - } - - __device__ void copy_into(Polynomial &dest, - int polynomial_number = 0) { - int tid = threadIdx.x; - int begin = polynomial_number * params::degree; -#pragma unroll - for (int i = 0; i < params::opt; i++) { - dest.coefficients[tid] = m_data[tid + begin]; - tid = tid + params::degree / params::opt; - } - synchronize_threads_in_block(); - } - - __device__ void split_into_polynomials(Polynomial &first, - Polynomial &second) { - int tid = threadIdx.x; -#pragma unroll - for (int i = 0; i < params::opt; i++) { - first.coefficients[tid] = m_data[tid]; - second.coefficients[tid] = m_data[tid + params::degree]; - tid = tid + params::degree / params::opt; - } - } -}; - template class Polynomial { public: T *coefficients; @@ -104,18 +43,6 @@ public: __device__ Polynomial(char *memory, uint32_t degree) : coefficients((T *)memory), degree(degree) {} - __host__ Polynomial(DeviceMemory &dmem, uint32_t degree, int device) - : degree(degree) { - dmem.get_allocation(&this->coefficients, params::degree, device); - } - - __host__ Polynomial(DeviceMemory &dmem, T *source, uint32_t degree, - int device) - : degree(degree) { - dmem.get_allocation_and_copy_async(&this->coefficients, source, - params::degree, device); - } - __host__ void copy_to_host(T *dest) { cudaMemcpyAsync(dest, this->coefficients, sizeof(T) * params::degree, cudaMemcpyDeviceToHost); @@ -402,22 +329,6 @@ public: cudaMemcpyHostToDevice); } - __host__ Vector(DeviceMemory &dmem, T *source, uint32_t size_source, - int device) - : m_size(size_source) { - dmem.get_allocation_and_copy_async(&m_data, source, m_size, device); - } - - __host__ Vector(DeviceMemory &dmem, T *source, uint32_t allocation_size, - uint32_t copy_size, int device) - : m_size(allocation_size) { - if (copy_size > allocation_size) { - printf("warning: copying more than allocation"); - } - dmem.get_allocation_and_copy_async(&m_data, source, m_size, copy_size, - device); - } - __host__ void copy_to_host(T *dest) { cudaMemcpyAsync(dest, m_data, sizeof(T) * m_size, cudaMemcpyDeviceToHost); } diff --git a/src/utils/memory.cuh b/src/utils/memory.cuh deleted file mode 100644 index 0f8c7acd7..000000000 --- a/src/utils/memory.cuh +++ /dev/null @@ -1,77 +0,0 @@ -#ifndef CNCRT_SHMEM_H -#define CNCRT_SHMEM_H - -#include "helper_cuda.h" -#include -#include -#include -#include -#include -#include - -class DeviceMemory { -public: - std::vector> m_allocated; - std::mutex m_allocation_mtx; - std::atomic m_total_devices; - - DeviceMemory() : m_total_devices(1) {} - - __host__ void set_device(int device) { - if (device > m_total_devices) - m_total_devices = device + 1; - } - - template - __host__ void get_allocation(T **ptr, int elements, int device) { - T *res; - cudaMalloc((void **)&res, sizeof(T) * elements); - *ptr = res; - std::lock_guard lock(m_allocation_mtx); - m_allocated.push_back(std::make_tuple(res, device)); - } - - template - __host__ void get_allocation_and_copy_async(T **ptr, T *src, int elements, - int device) { - T *res; - cudaMalloc((void **)&res, sizeof(T) * elements); - cudaMemcpyAsync(res, src, sizeof(T) * elements, cudaMemcpyHostToDevice); - *ptr = res; - std::lock_guard lock(m_allocation_mtx); - m_allocated.push_back(std::make_tuple(res, device)); - } - - template - __host__ void get_allocation_and_copy_async(T **ptr, T *src, int allocation, - int elements, int device) { - T *res; - cudaMalloc((void **)&res, sizeof(T) * allocation); - cudaMemcpyAsync(res, src, sizeof(T) * elements, cudaMemcpyHostToDevice); - *ptr = res; - std::lock_guard lock(m_allocation_mtx); - m_allocated.push_back(std::make_tuple(res, device)); - } - - void free_all_from_device(int device) { - cudaSetDevice(device); - for (auto elem : m_allocated) { - auto dev = std::get<1>(elem); - if (dev == device) { - auto mem = std::get<0>(elem); - checkCudaErrors(cudaFree(mem)); - } - } - } - - __host__ ~DeviceMemory() { - for (auto elem : m_allocated) { - auto dev = std::get<1>(elem); - auto mem = std::get<0>(elem); - cudaSetDevice(dev); - checkCudaErrors(cudaFree(mem)); - } - } -}; - -#endif // CNCRT_SHMEM_H diff --git a/src/utils/timer.cuh b/src/utils/timer.cuh index e751a2836..f486e4ca6 100644 --- a/src/utils/timer.cuh +++ b/src/utils/timer.cuh @@ -1,6 +1,7 @@ #ifndef CNCRT_TIMER_H #define CNCRT_TIMER_H +#include #define synchronize_threads_in_block() __syncthreads() template class CudaMeasureExecution { diff --git a/src/vertical_packing.cuh b/src/vertical_packing.cuh index 9daeb58ac..7af9221bc 100644 --- a/src/vertical_packing.cuh +++ b/src/vertical_packing.cuh @@ -1,7 +1,6 @@ #ifndef VERTICAL_PACKING_H #define VERTICAL_PACKING_H -#include "../include/helper_cuda.h" #include "bootstrap.h" #include "complex/operations.cuh" #include "crypto/gadget.cuh" @@ -11,11 +10,11 @@ #include "fft/bnsmfft.cuh" #include "fft/smfft.cuh" #include "fft/twiddles.cuh" +#include "helper_cuda.h" #include "polynomial/functions.cuh" #include "polynomial/parameters.cuh" #include "polynomial/polynomial.cuh" #include "polynomial/polynomial_math.cuh" -#include "utils/memory.cuh" #include "utils/timer.cuh" template __device__ void fft(double2 *output) { @@ -266,11 +265,13 @@ __global__ void device_batch_cmux(Torus *glwe_array_out, Torus *glwe_array_in, * - tau: The quantity of CMUX trees that should be executed */ template -void host_cmux_tree(void *v_stream, uint32_t gpu_index, Torus *glwe_array_out, - Torus *ggsw_in, Torus *lut_vector, uint32_t glwe_dimension, - uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t r, uint32_t tau, - uint32_t max_shared_memory) { +__host__ void host_cmux_tree(void *v_stream, uint32_t gpu_index, + Torus *glwe_array_out, Torus *ggsw_in, + Torus *lut_vector, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t base_log, + uint32_t level_count, uint32_t r, uint32_t tau, + uint32_t max_shared_memory) { + cudaSetDevice(gpu_index); auto stream = static_cast(v_stream); int num_lut = (1 << r); @@ -278,12 +279,9 @@ void host_cmux_tree(void *v_stream, uint32_t gpu_index, Torus *glwe_array_out, // Simply copy the LUTs add_padding_to_lut_async(glwe_array_out, lut_vector, glwe_dimension, tau, stream); - checkCudaErrors(cudaStreamSynchronize(*stream)); return; } - cuda_initialize_twiddles(polynomial_size, 0); - int memory_needed_per_block = sizeof(Torus) * polynomial_size + // glwe_sub_mask sizeof(Torus) * polynomial_size + // glwe_sub_body @@ -365,11 +363,6 @@ void host_cmux_tree(void *v_stream, uint32_t gpu_index, Torus *glwe_array_out, glwe_array_out + i * glwe_size, output + i * num_lut * glwe_size, glwe_size * sizeof(Torus), cudaMemcpyDeviceToDevice, *stream)); - // We only need synchronization to assert that data is in glwe_array_out - // before returning. Memory release can be added to the stream and processed - // later. - checkCudaErrors(cudaStreamSynchronize(*stream)); - // Free memory cuda_drop_async(d_ggsw_fft_in, stream, gpu_index); cuda_drop_async(d_buffer1, stream, gpu_index); @@ -466,12 +459,13 @@ __global__ void device_blind_rotation_and_sample_extraction( } template -void host_blind_rotate_and_sample_extraction( +__host__ void host_blind_rotate_and_sample_extraction( void *v_stream, uint32_t gpu_index, Torus *lwe_out, Torus *ggsw_in, Torus *lut_vector, uint32_t mbr_size, uint32_t tau, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t l_gadget, uint32_t max_shared_memory) { + cudaSetDevice(gpu_index); assert(glwe_dimension == 1); // For larger k we will need to adjust the mask size auto stream = static_cast(v_stream); diff --git a/src/wop_bootstrap.cuh b/src/wop_bootstrap.cuh index 361b95c30..de5bced06 100644 --- a/src/wop_bootstrap.cuh +++ b/src/wop_bootstrap.cuh @@ -3,12 +3,11 @@ #include "cooperative_groups.h" -#include "../include/helper_cuda.h" #include "bit_extraction.cuh" #include "bootstrap.h" #include "circuit_bootstrap.cuh" +#include "helper_cuda.h" #include "utils/kernel_dimensions.cuh" -#include "utils/memory.cuh" #include "utils/timer.cuh" #include "vertical_packing.cuh" @@ -40,6 +39,7 @@ __host__ void host_circuit_bootstrap_vertical_packing( uint32_t level_count_cbs, uint32_t number_of_inputs, uint32_t tau, uint32_t max_shared_memory) { + cudaSetDevice(gpu_index); auto stream = static_cast(v_stream); // allocate and initialize device pointers for circuit bootstrap @@ -140,6 +140,7 @@ __host__ void host_wop_pbs( uint32_t number_of_bits_to_extract, uint32_t number_of_inputs, uint32_t max_shared_memory) { + cudaSetDevice(gpu_index); auto stream = static_cast(v_stream); // let mut h_lut_vector_indexes = vec![0 as u32; 1];