diff --git a/.github/workflows/concrete_cuda_benchmark.yml b/.github/workflows/concrete_cuda_benchmark.yml index 3070f8dcb..4edc2e9e6 100644 --- a/.github/workflows/concrete_cuda_benchmark.yml +++ b/.github/workflows/concrete_cuda_benchmark.yml @@ -80,8 +80,7 @@ jobs: - name: Benchmark concrete-cuda if: ${{ !cancelled() }} run: | - ${{ env.BENCHMARK_DIR }}/benchmark_concrete_cuda --benchmark_out=benchmarks_results.json - --benchmark_out_format=json + ${{ env.BENCHMARK_DIR }}/benchmark_concrete_cuda --benchmark_out=benchmarks_results.json --benchmark_out_format=json - name: Upload raw results artifact uses: actions/upload-artifact@v3 diff --git a/backends/concrete-cuda/implementation/include/bootstrap.h b/backends/concrete-cuda/implementation/include/bootstrap.h index c8e584000..d85170555 100644 --- a/backends/concrete-cuda/implementation/include/bootstrap.h +++ b/backends/concrete-cuda/implementation/include/bootstrap.h @@ -56,6 +56,12 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64( void cleanup_cuda_bootstrap_amortized(void *v_stream, uint32_t gpu_index, int8_t **pbs_buffer); +bool verify_cuda_bootstrap_low_latency_grid_size_64(int glwe_dimension, + int polynomial_size, + int level_count, + int num_samples, + uint32_t max_shared_memory); + void scratch_cuda_bootstrap_low_latency_32( void *v_stream, uint32_t gpu_index, int8_t **pbs_buffer, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, diff --git a/backends/concrete-cuda/implementation/src/bootstrap_low_latency.cu b/backends/concrete-cuda/implementation/src/bootstrap_low_latency.cu index c05caeb13..42e1e07dd 100644 --- a/backends/concrete-cuda/implementation/src/bootstrap_low_latency.cu +++ b/backends/concrete-cuda/implementation/src/bootstrap_low_latency.cu @@ -11,6 +11,55 @@ uint64_t get_buffer_size_bootstrap_low_latency_64( max_shared_memory); } +// The number of samples should be lower than 4 * SM/((k + 1) * l) (the +// factor 4 being related to the occupancy of 50%). 
+bool verify_cuda_bootstrap_low_latency_grid_size_64( + int glwe_dimension, int polynomial_size, int level_count, int num_samples, + uint32_t max_shared_memory) { + + int ret = false; + switch (polynomial_size) { + case 256: + ret = verify_cuda_bootstrap_low_latency_grid_size<uint64_t, Degree<256>>( + glwe_dimension, polynomial_size, level_count, num_samples, + max_shared_memory); + break; + case 512: + ret = verify_cuda_bootstrap_low_latency_grid_size<uint64_t, Degree<512>>( + glwe_dimension, polynomial_size, level_count, num_samples, + max_shared_memory); + break; + case 1024: + ret = verify_cuda_bootstrap_low_latency_grid_size<uint64_t, Degree<1024>>( + glwe_dimension, polynomial_size, level_count, num_samples, + max_shared_memory); + break; + case 2048: + ret = verify_cuda_bootstrap_low_latency_grid_size<uint64_t, Degree<2048>>( + glwe_dimension, polynomial_size, level_count, num_samples, + max_shared_memory); + break; + case 4096: + ret = verify_cuda_bootstrap_low_latency_grid_size<uint64_t, Degree<4096>>( + glwe_dimension, polynomial_size, level_count, num_samples, + max_shared_memory); + break; + case 8192: + ret = verify_cuda_bootstrap_low_latency_grid_size<uint64_t, Degree<8192>>( + glwe_dimension, polynomial_size, level_count, num_samples, + max_shared_memory); + break; + case 16384: + ret = verify_cuda_bootstrap_low_latency_grid_size<uint64_t, Degree<16384>>( + glwe_dimension, polynomial_size, level_count, num_samples, + max_shared_memory); + break; + default: + break; + } + + return ret; +} /* * Runs standard checks to validate the inputs */ @@ -25,16 +74,14 @@ void checks_fast_bootstrap_low_latency(int glwe_dimension, int level_count, polynomial_size == 4096 || polynomial_size == 8192 || polynomial_size == 16384)); // The number of samples should be lower than 4 * SM/((k + 1) * l) (the - // factor 4 being related to the occupancy of 50%). The only supported - // value for k is 1, so k + 1 = 2 for now. + // factor 4 being related to the occupancy of 50%). 
int number_of_sm = 0; cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0); - assert( - ("Error (GPU low latency PBS): the number of input LWEs must be lower " - "or equal to the number of streaming multiprocessors on the device " - "divided by 4 * " - "(k + 1) * level_count", - num_samples <= number_of_sm * 4. / (glwe_dimension + 1) / level_count)); + assert(("Error (GPU low latency PBS): the number of input LWEs must be lower " + "or equal to the number of streaming multiprocessors on the device " + "divided by 4 * (k + 1) * level_count", + verify_cuda_bootstrap_low_latency_grid_size_64( + glwe_dimension, polynomial_size, level_count, num_samples, cuda_get_max_shared_memory(0)))); } /* @@ -43,7 +90,6 @@ void checks_bootstrap_low_latency(int nbits, int glwe_dimension, int level_count, int base_log, int polynomial_size, int num_samples) { - assert(("Error (GPU low latency PBS): base log should be <= nbits", base_log <= nbits)); checks_fast_bootstrap_low_latency(glwe_dimension, level_count, diff --git a/backends/concrete-cuda/implementation/src/bootstrap_low_latency.cuh b/backends/concrete-cuda/implementation/src/bootstrap_low_latency.cuh index 17fb66a6a..9951c5913 100644 --- a/backends/concrete-cuda/implementation/src/bootstrap_low_latency.cuh +++ b/backends/concrete-cuda/implementation/src/bootstrap_low_latency.cuh @@ -398,4 +398,44 @@ __host__ void host_bootstrap_low_latency( check_cuda_error(cudaGetLastError()); } +// Verify if the grid size for the low latency kernel satisfies the cooperative +// group constraints +template <typename Torus, class params> +__host__ bool verify_cuda_bootstrap_low_latency_grid_size( + int glwe_dimension, int polynomial_size, int level_count, int num_samples, + uint32_t max_shared_memory) { + // Calculate the dimension of the kernel + uint64_t full_sm = + get_buffer_size_full_sm_bootstrap_low_latency<Torus>(polynomial_size); + + uint64_t partial_sm = + get_buffer_size_partial_sm_bootstrap_low_latency<Torus>(polynomial_size); + + 
int thds = polynomial_size / params::opt; + + // Get the maximum number of active blocks per streaming multiprocessors + int number_of_blocks = level_count * (glwe_dimension + 1) * num_samples; + int max_active_blocks_per_sm; + + if (max_shared_memory < partial_sm) { + cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks_per_sm, + (void *)device_bootstrap_low_latency<Torus, params, NOSM>, thds, 0); + } else if (max_shared_memory < full_sm) { + cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks_per_sm, + (void *)device_bootstrap_low_latency<Torus, params, PARTIALSM>, thds, + 0); + } else { + cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks_per_sm, + (void *)device_bootstrap_low_latency<Torus, params, FULLSM>, thds, 0); + } + + // Get the number of streaming multiprocessors + int number_of_sm = 0; + cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0); + return number_of_blocks <= max_active_blocks_per_sm * number_of_sm; +} + #endif // LOWLAT_PBS_H diff --git a/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_bit_extraction.cpp b/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_bit_extraction.cpp index ef288dbaf..32d4812ef 100644 --- a/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_bit_extraction.cpp +++ b/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_bit_extraction.cpp @@ -72,7 +72,7 @@ public: gpu_index); } - void TearDown() { + void TearDown(const ::benchmark::State &state) { bit_extraction_teardown(stream, csprng, lwe_sk_in, lwe_sk_out, d_fourier_bsk, d_ksk, plaintexts, d_lwe_ct_in_array, d_lwe_ct_out_array, bit_extract_buffer, gpu_index); diff --git a/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_bootstrap.cpp b/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_bootstrap.cpp index a527e5eb0..55c843fe7 100644 --- 
a/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_bootstrap.cpp +++ b/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_bootstrap.cpp @@ -42,6 +42,7 @@ protected: public: void SetUp(const ::benchmark::State &state) { + cudaDeviceSynchronize(); stream = cuda_create_stream(0); lwe_dimension = state.range(0); @@ -64,12 +65,13 @@ public: (lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint64_t)); } - void TearDown() { + void TearDown(const ::benchmark::State &state) { bootstrap_teardown(stream, csprng, lwe_sk_in_array, lwe_sk_out_array, d_fourier_bsk_array, plaintexts, d_lut_pbs_identity, d_lut_pbs_indexes, d_lwe_ct_in_array, d_lwe_ct_out_array, gpu_index); free(lwe_ct_array); + cudaDeviceSynchronize(); cudaDeviceReset(); } }; @@ -151,12 +153,12 @@ BENCHMARK_DEFINE_F(Bootstrap_u64, ConcreteCuda_LowLatencyPBS) uint64_t buffer_size = get_buffer_size_bootstrap_low_latency_64( glwe_dimension, polynomial_size, pbs_level, input_lwe_ciphertext_count, cuda_get_max_shared_memory(gpu_index)); + if (buffer_size > free) st.SkipWithError("Not enough free memory in the device. Skipping..."); - int number_of_sm = 0; - cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0); - if (input_lwe_ciphertext_count > - number_of_sm * 4 / (glwe_dimension + 1) / pbs_level) + if (!verify_cuda_bootstrap_low_latency_grid_size_64( + glwe_dimension, polynomial_size, pbs_level, + input_lwe_ciphertext_count, cuda_get_max_shared_memory(gpu_index))) st.SkipWithError( "Not enough SM on device to run this configuration. 
Skipping..."); diff --git a/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_circuit_bootstrap.cpp b/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_circuit_bootstrap.cpp index 750c87f5a..0efa08f3c 100644 --- a/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_circuit_bootstrap.cpp +++ b/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_circuit_bootstrap.cpp @@ -79,7 +79,7 @@ public: &delta, number_of_inputs, 1, 1, gpu_index); } - void TearDown() { + void TearDown(const ::benchmark::State &state) { circuit_bootstrap_teardown(stream, csprng, lwe_sk_in, lwe_sk_out, d_fourier_bsk, d_pksk, plaintexts, d_lwe_ct_in_array, d_lut_vector_indexes, diff --git a/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_cmux_tree.cpp b/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_cmux_tree.cpp index 75ebb8c1c..4e4d7b5d2 100644 --- a/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_cmux_tree.cpp +++ b/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_cmux_tree.cpp @@ -58,7 +58,7 @@ public: gpu_index); } - void TearDown() { + void TearDown(const ::benchmark::State &state) { cmux_tree_teardown(stream, &csprng, &glwe_sk, &d_lut_identity, &plaintexts, &d_ggsw_bit_array, &cmux_tree_buffer, &d_glwe_out, gpu_index); diff --git a/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_keyswitch.cpp b/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_keyswitch.cpp index fc9f38277..27ef4c080 100644 --- a/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_keyswitch.cpp +++ b/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_keyswitch.cpp @@ -54,7 +54,7 @@ public: &delta, number_of_inputs, 1, 1, gpu_index); } - void TearDown() { + void TearDown(const ::benchmark::State &state) { 
keyswitch_teardown(stream, csprng, lwe_sk_in_array, lwe_sk_out_array, d_ksk_array, plaintexts, d_lwe_in_ct_array, d_lwe_out_ct_array, gpu_index); @@ -77,6 +77,8 @@ BENCHMARK_DEFINE_F(Keyswitch_u64, ConcreteCuda_CopiesPlusKeyswitch) (benchmark::State &st) { uint64_t *lwe_in_ct = (uint64_t *)malloc( number_of_inputs * (input_lwe_dimension + 1) * sizeof(uint64_t)); + uint64_t *lwe_out_ct = (uint64_t *)malloc( + number_of_inputs * (output_lwe_dimension + 1) * sizeof(uint64_t)); void *v_stream = (void *)stream; for (auto _ : st) { cuda_memcpy_async_to_gpu(d_lwe_in_ct_array, lwe_in_ct, @@ -88,13 +90,14 @@ BENCHMARK_DEFINE_F(Keyswitch_u64, ConcreteCuda_CopiesPlusKeyswitch) stream, gpu_index, (void *)d_lwe_out_ct_array, (void *)d_lwe_in_ct_array, (void *)d_ksk_array, input_lwe_dimension, output_lwe_dimension, ksk_base_log, ksk_level, number_of_inputs); - cuda_memcpy_async_to_cpu(lwe_in_ct, d_lwe_out_ct_array, + cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct_array, number_of_inputs * (output_lwe_dimension + 1) * sizeof(uint64_t), stream, gpu_index); cuda_synchronize_stream(v_stream); } free(lwe_in_ct); + free(lwe_out_ct); } static void diff --git a/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_linear_algebra.cpp b/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_linear_algebra.cpp index d8f68da30..5f1ba10b8 100644 --- a/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_linear_algebra.cpp +++ b/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_linear_algebra.cpp @@ -54,7 +54,7 @@ public: noise_variance, payload_modulus, delta, num_samples, 1, 1, gpu_index); } - void TearDown() { + void TearDown(const ::benchmark::State &state) { linear_algebra_teardown( stream, &csprng, &lwe_sk_array, &d_lwe_in_1_ct, &d_lwe_in_2_ct, &d_lwe_out_ct, &lwe_in_1_ct, &lwe_in_2_ct, &lwe_out_ct, &plaintexts_1, diff --git 
a/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_wop_bootstrap.cpp b/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_wop_bootstrap.cpp index 28f67b2f2..9fcf7562b 100644 --- a/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_wop_bootstrap.cpp +++ b/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_wop_bootstrap.cpp @@ -88,6 +88,8 @@ public: // We keep the following for the benchmarks with copies lwe_ct_in_array = (uint64_t *)malloc( (glwe_dimension * polynomial_size + 1) * tau * sizeof(uint64_t)); + lwe_ct_out_array = (uint64_t *)malloc( + (glwe_dimension * polynomial_size + 1) * tau * sizeof(uint64_t)); for (int i = 0; i < tau; i++) { uint64_t plaintext = plaintexts[i]; uint64_t *lwe_ct_in = @@ -97,11 +99,9 @@ public: lwe_sk_in, lwe_ct_in, plaintext, glwe_dimension * polynomial_size, lwe_modular_variance, csprng, &CONCRETE_CSPRNG_VTABLE); } - lwe_ct_out_array = (uint64_t *)malloc( - (glwe_dimension * polynomial_size + 1) * tau * sizeof(uint64_t)); } - void TearDown() { + void TearDown(const ::benchmark::State &state) { wop_pbs_teardown(stream, csprng, lwe_sk_in, lwe_sk_out, d_ksk, d_fourier_bsk, d_pksk, plaintexts, d_lwe_ct_in_array, d_lut_vector, d_lwe_ct_out_array, wop_pbs_buffer, diff --git a/backends/concrete-cuda/implementation/test_and_benchmark/setup_and_teardown.cpp b/backends/concrete-cuda/implementation/test_and_benchmark/setup_and_teardown.cpp index 4af43829e..8e6d10940 100644 --- a/backends/concrete-cuda/implementation/test_and_benchmark/setup_and_teardown.cpp +++ b/backends/concrete-cuda/implementation/test_and_benchmark/setup_and_teardown.cpp @@ -15,7 +15,6 @@ void bootstrap_setup(cudaStream_t *stream, Csprng **csprng, int number_of_inputs, int repetitions, int samples, int gpu_index) { - void *v_stream = (void *)stream; *payload_modulus = message_modulus * carry_modulus; // Value of the shift we multiply our messages by *delta = 
((uint64_t)(1) << 63) / (uint64_t)(*payload_modulus); @@ -92,7 +91,7 @@ void bootstrap_setup(cudaStream_t *stream, Csprng **csprng, (lwe_dimension + 1) * sizeof(uint64_t), stream, gpu_index); - cuda_synchronize_stream(v_stream); + cuda_synchronize_stream(stream); free(lwe_ct_in_array); free(lut_pbs_identity); @@ -105,21 +104,21 @@ void bootstrap_teardown(cudaStream_t *stream, Csprng *csprng, uint64_t *d_lut_pbs_indexes, uint64_t *d_lwe_ct_in_array, uint64_t *d_lwe_ct_out_array, int gpu_index) { - void *v_stream = (void *)stream; - cuda_synchronize_stream(v_stream); + cuda_synchronize_stream(stream); concrete_cpu_destroy_concrete_csprng(csprng); free(csprng); free(lwe_sk_in_array); free(lwe_sk_out_array); free(plaintexts); + cuda_drop_async(d_fourier_bsk_array, stream, gpu_index); cuda_drop_async(d_lut_pbs_identity, stream, gpu_index); cuda_drop_async(d_lut_pbs_indexes, stream, gpu_index); cuda_drop_async(d_lwe_ct_in_array, stream, gpu_index); cuda_drop_async(d_lwe_ct_out_array, stream, gpu_index); + cuda_synchronize_stream(stream); cuda_destroy_stream(stream, gpu_index); - cudaDeviceSynchronize(); } void keyswitch_setup(cudaStream_t *stream, Csprng **csprng, @@ -137,7 +136,6 @@ void keyswitch_setup(cudaStream_t *stream, Csprng **csprng, // Value of the shift we multiply our messages by *delta = ((uint64_t)(1) << 63) / (uint64_t)(*payload_modulus); - void *v_stream = (void *)stream; // Create a Csprng *csprng = (Csprng *)aligned_alloc(CONCRETE_CSPRNG_ALIGN, CONCRETE_CSPRNG_SIZE); @@ -189,7 +187,7 @@ void keyswitch_setup(cudaStream_t *stream, Csprng **csprng, repetitions * samples * number_of_inputs * (input_lwe_dimension + 1) * sizeof(uint64_t), stream, gpu_index); - cuda_synchronize_stream(v_stream); + cuda_synchronize_stream(stream); free(lwe_ct_in_array); } @@ -198,18 +196,19 @@ void keyswitch_teardown(cudaStream_t *stream, Csprng *csprng, uint64_t *d_ksk_array, uint64_t *plaintexts, uint64_t *d_lwe_ct_in_array, uint64_t *d_lwe_ct_out_array, int gpu_index) { - 
void *v_stream = (void *)stream; - cuda_synchronize_stream(v_stream); + cuda_synchronize_stream(stream); + concrete_cpu_destroy_concrete_csprng(csprng); free(csprng); free(lwe_sk_in_array); free(lwe_sk_out_array); free(plaintexts); + cuda_drop_async(d_ksk_array, stream, gpu_index); cuda_drop_async(d_lwe_ct_in_array, stream, gpu_index); cuda_drop_async(d_lwe_ct_out_array, stream, gpu_index); + cuda_synchronize_stream(stream); cuda_destroy_stream(stream, gpu_index); - cudaDeviceSynchronize(); } void linear_algebra_setup(cudaStream_t *stream, Csprng **csprng, @@ -312,8 +311,7 @@ void linear_algebra_setup(cudaStream_t *stream, Csprng **csprng, sizeof(uint64_t), stream, gpu_index); - void *v_stream = (void *)stream; - cuda_synchronize_stream(v_stream); + cuda_synchronize_stream(stream); free(cleartext_2); } @@ -324,24 +322,25 @@ void linear_algebra_teardown(cudaStream_t *stream, Csprng **csprng, uint64_t **lwe_out_ct, uint64_t **plaintexts_1, uint64_t **plaintexts_2, uint64_t **d_plaintexts_2, uint64_t **d_cleartext_2, int gpu_index) { - void *v_stream = (void *)stream; - cuda_synchronize_stream(v_stream); + cuda_synchronize_stream(stream); + concrete_cpu_destroy_concrete_csprng(*csprng); free(*csprng); - cuda_drop_async(*d_lwe_in_1_ct, stream, gpu_index); - cuda_drop_async(*d_lwe_in_2_ct, stream, gpu_index); - cuda_drop_async(*d_plaintexts_2, stream, gpu_index); - cuda_drop_async(*d_cleartext_2, stream, gpu_index); - cuda_drop_async(*d_lwe_out_ct, stream, gpu_index); free(*lwe_out_ct); free(*lwe_sk_array); free(*plaintexts_1); free(*plaintexts_2); free(*lwe_in_1_ct); free(*lwe_in_2_ct); + + cuda_drop_async(*d_lwe_in_1_ct, stream, gpu_index); + cuda_drop_async(*d_lwe_in_2_ct, stream, gpu_index); + cuda_drop_async(*d_plaintexts_2, stream, gpu_index); + cuda_drop_async(*d_cleartext_2, stream, gpu_index); + cuda_drop_async(*d_lwe_out_ct, stream, gpu_index); + cuda_synchronize_stream(stream); cuda_destroy_stream(stream, gpu_index); - cudaDeviceSynchronize(); } void 
bit_extraction_setup( @@ -355,7 +354,6 @@ void bit_extraction_setup( int number_of_bits_of_message_including_padding, int number_of_bits_to_extract, int *delta_log, uint64_t *delta, int number_of_inputs, int repetitions, int samples, int gpu_index) { - void *v_stream = (void *)stream; *delta_log = 64 - number_of_bits_of_message_including_padding; *delta = (uint64_t)(1) << *delta_log; @@ -428,7 +426,7 @@ void bit_extraction_setup( pbs_level, number_of_inputs, cuda_get_max_shared_memory(gpu_index), true); - cuda_synchronize_stream(v_stream); + cuda_synchronize_stream(stream); free(lwe_ct_in_array); } @@ -439,20 +437,21 @@ void bit_extraction_teardown(cudaStream_t *stream, Csprng *csprng, uint64_t *plaintexts, uint64_t *d_lwe_ct_in_array, uint64_t *d_lwe_ct_out_array, int8_t *bit_extract_buffer, int gpu_index) { - void *v_stream = (void *)stream; - cuda_synchronize_stream(v_stream); + cuda_synchronize_stream(stream); + concrete_cpu_destroy_concrete_csprng(csprng); free(csprng); free(lwe_sk_in_array); free(lwe_sk_out_array); free(plaintexts); + cleanup_cuda_extract_bits(stream, gpu_index, &bit_extract_buffer); cuda_drop_async(d_fourier_bsk_array, stream, gpu_index); cuda_drop_async(d_ksk_array, stream, gpu_index); cuda_drop_async(d_lwe_ct_in_array, stream, gpu_index); cuda_drop_async(d_lwe_ct_out_array, stream, gpu_index); + cuda_synchronize_stream(stream); cuda_destroy_stream(stream, gpu_index); - cudaDeviceSynchronize(); } void circuit_bootstrap_setup( @@ -468,7 +467,6 @@ void circuit_bootstrap_setup( int *delta_log, uint64_t *delta, int number_of_inputs, int repetitions, int samples, int gpu_index) { - void *v_stream = (void *)stream; *delta_log = 60; *delta = (uint64_t)(1) << *delta_log; @@ -547,7 +545,7 @@ void circuit_bootstrap_setup( cuda_memcpy_async_to_gpu(*d_lut_vector_indexes, h_lut_vector_indexes, number_of_inputs * cbs_level * sizeof(uint64_t), stream, gpu_index); - cuda_synchronize_stream(v_stream); + cuda_synchronize_stream(stream); 
free(h_lut_vector_indexes); free(lwe_ct_in_array); } @@ -558,21 +556,23 @@ void circuit_bootstrap_teardown( uint64_t *d_pksk_array, uint64_t *plaintexts, uint64_t *d_lwe_ct_in_array, uint64_t *d_lut_vector_indexes, uint64_t *d_ggsw_ct_out_array, int8_t *cbs_buffer, int gpu_index) { - void *v_stream = (void *)stream; - cuda_synchronize_stream(v_stream); + + cuda_synchronize_stream(stream); + concrete_cpu_destroy_concrete_csprng(csprng); free(csprng); free(lwe_sk_in_array); free(lwe_sk_out_array); free(plaintexts); + cleanup_cuda_circuit_bootstrap(stream, gpu_index, &cbs_buffer); cuda_drop_async(d_fourier_bsk_array, stream, gpu_index); cuda_drop_async(d_pksk_array, stream, gpu_index); cuda_drop_async(d_lwe_ct_in_array, stream, gpu_index); cuda_drop_async(d_ggsw_ct_out_array, stream, gpu_index); cuda_drop_async(d_lut_vector_indexes, stream, gpu_index); + cuda_synchronize_stream(stream); cuda_destroy_stream(stream, gpu_index); - cudaDeviceSynchronize(); } void cmux_tree_setup(cudaStream_t *stream, Csprng **csprng, uint64_t **glwe_sk, @@ -583,7 +583,6 @@ void cmux_tree_setup(cudaStream_t *stream, Csprng **csprng, uint64_t **glwe_sk, double glwe_modular_variance, int r_lut, int tau, uint64_t delta_log, int repetitions, int samples, int gpu_index) { - void *v_stream = (void *)stream; int ggsw_size = polynomial_size * (glwe_dimension + 1) * (glwe_dimension + 1) * level_count; int glwe_size = (glwe_dimension + 1) * polynomial_size; @@ -647,7 +646,7 @@ void cmux_tree_setup(cudaStream_t *stream, Csprng **csprng, uint64_t **glwe_sk, scratch_cuda_cmux_tree_64(stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size, level_count, r_lut, tau, cuda_get_max_shared_memory(gpu_index), true); - cuda_synchronize_stream(v_stream); + cuda_synchronize_stream(stream); free(lut_cmux_tree_identity); free(ggsw_bit_array); } @@ -657,16 +656,18 @@ void cmux_tree_teardown(cudaStream_t *stream, Csprng **csprng, int8_t **cmux_tree_buffer, uint64_t **d_glwe_out, int gpu_index) { 
cuda_synchronize_stream(stream); + concrete_cpu_destroy_concrete_csprng(*csprng); free(*plaintexts); free(*csprng); free(*glwe_sk); + + cleanup_cuda_cmux_tree(stream, gpu_index, cmux_tree_buffer); cuda_drop_async(*d_lut_identity, stream, gpu_index); cuda_drop_async(*d_ggsw_bit_array, stream, gpu_index); cuda_drop_async(*d_glwe_out, stream, gpu_index); - cleanup_cuda_cmux_tree(stream, gpu_index, cmux_tree_buffer); + cuda_synchronize_stream(stream); cuda_destroy_stream(stream, gpu_index); - cudaDeviceSynchronize(); } void wop_pbs_setup(cudaStream_t *stream, Csprng **csprng, @@ -683,7 +684,6 @@ void wop_pbs_setup(cudaStream_t *stream, Csprng **csprng, int *delta_log_lut, uint64_t *delta, int tau, int repetitions, int samples, int gpu_index) { - void *v_stream = (void *)stream; int input_lwe_dimension = glwe_dimension * polynomial_size; *delta_log = 64 - p; *delta_log_lut = *delta_log; @@ -773,7 +773,7 @@ void wop_pbs_setup(cudaStream_t *stream, Csprng **csprng, cbs_level, pbs_level, p, p, tau, cuda_get_max_shared_memory(gpu_index), true); - cuda_synchronize_stream(v_stream); + cuda_synchronize_stream(stream); free(lwe_ct_in_array); free(big_lut); } @@ -785,23 +785,23 @@ void wop_pbs_teardown(cudaStream_t *stream, Csprng *csprng, uint64_t *d_lwe_ct_in_array, uint64_t *d_lut_vector, uint64_t *d_lwe_ct_out_array, int8_t *wop_pbs_buffer, int gpu_index) { - void *v_stream = (void *)stream; - cuda_synchronize_stream(v_stream); + cuda_synchronize_stream(stream); + concrete_cpu_destroy_concrete_csprng(csprng); free(csprng); free(lwe_sk_in_array); free(lwe_sk_out_array); free(plaintexts); - cleanup_cuda_circuit_bootstrap_vertical_packing(stream, gpu_index, - &wop_pbs_buffer); + + cleanup_cuda_wop_pbs(stream, gpu_index, &wop_pbs_buffer); cuda_drop_async(d_fourier_bsk_array, stream, gpu_index); cuda_drop_async(d_ksk_array, stream, gpu_index); cuda_drop_async(d_pksk_array, stream, gpu_index); cuda_drop_async(d_lwe_ct_in_array, stream, gpu_index); 
cuda_drop_async(d_lwe_ct_out_array, stream, gpu_index); cuda_drop_async(d_lut_vector, stream, gpu_index); + cuda_synchronize_stream(stream); cuda_destroy_stream(stream, gpu_index); - cudaDeviceSynchronize(); } void fft_setup(cudaStream_t *stream, double **_poly1, double **_poly2, @@ -816,7 +816,6 @@ void fft_setup(cudaStream_t *stream, double **_poly1, double **_poly2, auto &d_cpoly1 = *_d_cpoly1; auto &d_cpoly2 = *_d_cpoly2; - void *v_stream = (void *)stream; poly1 = (double *)malloc(polynomial_size * samples * sizeof(double)); poly2 = (double *)malloc(polynomial_size * samples * sizeof(double)); h_cpoly1 = (double2 *)malloc(polynomial_size / 2 * samples * sizeof(double2)); @@ -859,18 +858,21 @@ void fft_setup(cudaStream_t *stream, double **_poly1, double **_poly2, cuda_memcpy_async_to_gpu(d_cpoly2, h_cpoly2, polynomial_size / 2 * samples * sizeof(double2), stream, gpu_index); - cuda_synchronize_stream(v_stream); + cuda_synchronize_stream(stream); } void fft_teardown(cudaStream_t *stream, double *poly1, double *poly2, double2 *h_cpoly1, double2 *h_cpoly2, double2 *d_cpoly1, double2 *d_cpoly2, int gpu_index) { + cuda_synchronize_stream(stream); + free(poly1); free(poly2); free(h_cpoly1); free(h_cpoly2); + cuda_drop_async(d_cpoly1, stream, gpu_index); cuda_drop_async(d_cpoly2, stream, gpu_index); + cuda_synchronize_stream(stream); cuda_destroy_stream(stream, gpu_index); - cudaDeviceSynchronize(); }