mirror of
https://github.com/zama-ai/concrete.git
synced 2026-02-08 19:44:57 -05:00
fix(cuda): fix asynchronous behaviour for pbs and wop pbs
This commit is contained in:
@@ -5,7 +5,8 @@
|
||||
|
||||
extern "C" {
|
||||
|
||||
void cuda_initialize_twiddles(uint32_t polynomial_size, uint32_t gpu_index);
|
||||
void cuda_initialize_twiddles(uint32_t polynomial_size, void *v_stream,
|
||||
uint32_t gpu_index);
|
||||
|
||||
void cuda_convert_lwe_bootstrap_key_32(void *dest, void *src, void *v_stream,
|
||||
uint32_t gpu_index,
|
||||
|
||||
@@ -33,4 +33,6 @@ int cuda_drop(void *ptr, uint32_t gpu_index);
|
||||
int cuda_drop_async(void *ptr, cudaStream_t *stream, uint32_t gpu_index);
|
||||
|
||||
int cuda_get_max_shared_memory(uint32_t gpu_index);
|
||||
|
||||
int cuda_synchronize_stream(void *v_stream);
|
||||
}
|
||||
|
||||
@@ -15,23 +15,19 @@ void cuda_keyswitch_lwe_ciphertext_vector_64(
|
||||
void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples);
|
||||
|
||||
void cuda_fp_keyswitch_lwe_to_glwe_32(void *v_stream, void *glwe_array_out,
|
||||
void *lwe_array_in, void *fp_ksk_array,
|
||||
uint32_t input_lwe_dimension,
|
||||
uint32_t output_glwe_dimension,
|
||||
uint32_t output_polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count,
|
||||
uint32_t number_of_input_lwe,
|
||||
uint32_t number_of_keys);
|
||||
void cuda_fp_keyswitch_lwe_to_glwe_32(
|
||||
void *v_stream, uint32_t gpu_index, void *glwe_array_out,
|
||||
void *lwe_array_in, void *fp_ksk_array, uint32_t input_lwe_dimension,
|
||||
uint32_t output_glwe_dimension, uint32_t output_polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t number_of_input_lwe,
|
||||
uint32_t number_of_keys);
|
||||
|
||||
void cuda_fp_keyswitch_lwe_to_glwe_64(void *v_stream, void *glwe_array_out,
|
||||
void *lwe_array_in, void *fp_ksk_array,
|
||||
uint32_t input_lwe_dimension,
|
||||
uint32_t output_glwe_dimension,
|
||||
uint32_t output_polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count,
|
||||
uint32_t number_of_input_lwe,
|
||||
uint32_t number_of_keys);
|
||||
void cuda_fp_keyswitch_lwe_to_glwe_64(
|
||||
void *v_stream, uint32_t gpu_index, void *glwe_array_out,
|
||||
void *lwe_array_in, void *fp_ksk_array, uint32_t input_lwe_dimension,
|
||||
uint32_t output_glwe_dimension, uint32_t output_polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t number_of_input_lwe,
|
||||
uint32_t number_of_keys);
|
||||
}
|
||||
|
||||
#endif // CNCRT_KS_H_
|
||||
|
||||
@@ -59,8 +59,6 @@ __host__ void host_addition(void *v_stream, uint32_t gpu_index, T *output,
|
||||
auto stream = static_cast<cudaStream_t *>(v_stream);
|
||||
addition<<<grid, thds, 0, *stream>>>(output, input_1, input_2, num_entries);
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
|
||||
cudaStreamSynchronize(*stream);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
@@ -86,7 +84,5 @@ __host__ void host_addition_plaintext(void *v_stream, uint32_t gpu_index,
|
||||
plaintext_addition<<<grid, thds, 0, *stream>>>(
|
||||
output, lwe_input, plaintext_input, input_lwe_dimension, num_entries);
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
|
||||
cudaStreamSynchronize(*stream);
|
||||
}
|
||||
#endif // CUDA_ADD_H
|
||||
|
||||
@@ -145,6 +145,7 @@ __host__ void host_extract_bits(
|
||||
uint32_t base_log_ksk, uint32_t level_count_ksk, uint32_t number_of_samples,
|
||||
uint32_t max_shared_memory) {
|
||||
|
||||
cudaSetDevice(gpu_index);
|
||||
auto stream = static_cast<cudaStream_t *>(v_stream);
|
||||
uint32_t ciphertext_n_bits = sizeof(Torus) * 8;
|
||||
|
||||
|
||||
@@ -9,7 +9,6 @@
|
||||
|
||||
#include "cooperative_groups.h"
|
||||
|
||||
#include "../include/helper_cuda.h"
|
||||
#include "bootstrap.h"
|
||||
#include "complex/operations.cuh"
|
||||
#include "crypto/gadget.cuh"
|
||||
@@ -18,11 +17,11 @@
|
||||
#include "fft/bnsmfft.cuh"
|
||||
#include "fft/smfft.cuh"
|
||||
#include "fft/twiddles.cuh"
|
||||
#include "helper_cuda.h"
|
||||
#include "polynomial/functions.cuh"
|
||||
#include "polynomial/parameters.cuh"
|
||||
#include "polynomial/polynomial.cuh"
|
||||
#include "polynomial/polynomial_math.cuh"
|
||||
#include "utils/memory.cuh"
|
||||
#include "utils/timer.cuh"
|
||||
|
||||
template <typename Torus, class params, sharedMemDegree SMD>
|
||||
@@ -284,6 +283,7 @@ __host__ void host_bootstrap_amortized(
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t num_lut_vectors,
|
||||
uint32_t lwe_idx, uint32_t max_shared_memory) {
|
||||
|
||||
cudaSetDevice(gpu_index);
|
||||
int SM_FULL = sizeof(Torus) * polynomial_size + // accumulator mask
|
||||
sizeof(Torus) * polynomial_size + // accumulator body
|
||||
sizeof(Torus) * polynomial_size + // accumulator mask rotated
|
||||
@@ -356,9 +356,6 @@ __host__ void host_bootstrap_amortized(
|
||||
}
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
|
||||
// Synchronize the streams before copying the result to lwe_array_out at the
|
||||
// right place
|
||||
cudaStreamSynchronize(*stream);
|
||||
cuda_drop_async(d_mem, stream, gpu_index);
|
||||
}
|
||||
|
||||
|
||||
@@ -9,7 +9,6 @@
|
||||
|
||||
#include "cooperative_groups.h"
|
||||
|
||||
#include "../include/helper_cuda.h"
|
||||
#include "bootstrap.h"
|
||||
#include "complex/operations.cuh"
|
||||
#include "crypto/gadget.cuh"
|
||||
@@ -18,10 +17,10 @@
|
||||
#include "fft/bnsmfft.cuh"
|
||||
#include "fft/smfft.cuh"
|
||||
#include "fft/twiddles.cuh"
|
||||
#include "helper_cuda.h"
|
||||
#include "polynomial/parameters.cuh"
|
||||
#include "polynomial/polynomial.cuh"
|
||||
#include "polynomial/polynomial_math.cuh"
|
||||
#include "utils/memory.cuh"
|
||||
#include "utils/timer.cuh"
|
||||
|
||||
// Cooperative groups are used in the low latency PBS
|
||||
@@ -263,6 +262,7 @@ __host__ void host_bootstrap_low_latency(
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t num_lut_vectors,
|
||||
uint32_t max_shared_memory) {
|
||||
|
||||
cudaSetDevice(gpu_index);
|
||||
auto stream = static_cast<cudaStream_t *>(v_stream);
|
||||
|
||||
int buffer_size_per_gpu = level_count * input_lwe_ciphertext_count *
|
||||
@@ -346,7 +346,6 @@ __host__ void host_bootstrap_low_latency(
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
// Synchronize the streams before copying the result to lwe_array_out at the
|
||||
// right place
|
||||
cudaStreamSynchronize(*stream);
|
||||
cuda_drop_async(mask_buffer_fft, stream, gpu_index);
|
||||
cuda_drop_async(body_buffer_fft, stream, gpu_index);
|
||||
cuda_drop_async(d_mem, stream, gpu_index);
|
||||
|
||||
@@ -1,11 +1,11 @@
|
||||
#ifndef CBS_H
|
||||
#define CBS_H
|
||||
|
||||
#include "../include/helper_cuda.h"
|
||||
#include "bit_extraction.cuh"
|
||||
#include "bootstrap.h"
|
||||
#include "bootstrap_amortized.cuh"
|
||||
#include "device.h"
|
||||
#include "helper_cuda.h"
|
||||
#include "keyswitch.cuh"
|
||||
#include "polynomial/parameters.cuh"
|
||||
#include "utils/timer.cuh"
|
||||
@@ -113,6 +113,7 @@ __host__ void host_circuit_bootstrap(
|
||||
uint32_t level_bsk, uint32_t base_log_bsk, uint32_t level_pksk,
|
||||
uint32_t base_log_pksk, uint32_t level_cbs, uint32_t base_log_cbs,
|
||||
uint32_t number_of_samples, uint32_t max_shared_memory) {
|
||||
cudaSetDevice(gpu_index);
|
||||
auto stream = static_cast<cudaStream_t *>(v_stream);
|
||||
|
||||
uint32_t ciphertext_n_bits = sizeof(Torus) * 8;
|
||||
@@ -151,12 +152,12 @@ __host__ void host_circuit_bootstrap(
|
||||
dim3 copy_block(params::degree / params::opt, 1, 1);
|
||||
// Add q/4 to center the error while computing a negacyclic LUT
|
||||
// copy pbs result (glwe_dimension + 1) times to be an input of fp-ks
|
||||
copy_add_lwe_cbs<Torus, params><<<copy_grid, copy_block>>>(
|
||||
copy_add_lwe_cbs<Torus, params><<<copy_grid, copy_block, 0, *stream>>>(
|
||||
lwe_array_in_fp_ks_buffer, lwe_array_out_pbs_buffer, ciphertext_n_bits,
|
||||
base_log_cbs, level_cbs);
|
||||
|
||||
cuda_fp_keyswitch_lwe_to_glwe(
|
||||
v_stream, ggsw_out, lwe_array_in_fp_ks_buffer, fp_ksk_array,
|
||||
v_stream, gpu_index, ggsw_out, lwe_array_in_fp_ks_buffer, fp_ksk_array,
|
||||
polynomial_size, glwe_dimension, polynomial_size, base_log_pksk,
|
||||
level_pksk, pbs_count * (glwe_dimension + 1), glwe_dimension + 1);
|
||||
}
|
||||
|
||||
@@ -38,7 +38,8 @@ __device__ T *get_ith_body_kth_block(T *ptr, int i, int k, int level,
|
||||
polynomial_size / 2];
|
||||
}
|
||||
|
||||
void cuda_initialize_twiddles(uint32_t polynomial_size, uint32_t gpu_index) {
|
||||
void cuda_initialize_twiddles(uint32_t polynomial_size, void *v_stream,
|
||||
uint32_t gpu_index) {
|
||||
cudaSetDevice(gpu_index);
|
||||
int sw_size = polynomial_size / 2;
|
||||
short *sw1_h, *sw2_h;
|
||||
@@ -61,10 +62,11 @@ void cuda_initialize_twiddles(uint32_t polynomial_size, uint32_t gpu_index) {
|
||||
cnt++;
|
||||
}
|
||||
}
|
||||
cudaMemcpyToSymbol(SW1, sw1_h, sw_size * sizeof(short), 0,
|
||||
cudaMemcpyHostToDevice);
|
||||
cudaMemcpyToSymbol(SW2, sw2_h, sw_size * sizeof(short), 0,
|
||||
cudaMemcpyHostToDevice);
|
||||
auto stream = static_cast<cudaStream_t *>(v_stream);
|
||||
cudaMemcpyToSymbolAsync(SW1, sw1_h, sw_size * sizeof(short), 0,
|
||||
cudaMemcpyHostToDevice, *stream);
|
||||
cudaMemcpyToSymbolAsync(SW2, sw2_h, sw_size * sizeof(short), 0,
|
||||
cudaMemcpyHostToDevice, *stream);
|
||||
free(sw1_h);
|
||||
free(sw2_h);
|
||||
}
|
||||
@@ -91,8 +93,8 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
|
||||
int blockSize = polynomial_size / choose_opt(polynomial_size);
|
||||
|
||||
double2 *h_bsk = (double2 *)malloc(buffer_size);
|
||||
double2 *d_bsk;
|
||||
cudaMalloc((void **)&d_bsk, buffer_size);
|
||||
auto stream = static_cast<cudaStream_t *>(v_stream);
|
||||
double2 *d_bsk = (double2 *)cuda_malloc_async(buffer_size, stream, gpu_index);
|
||||
|
||||
// compress real bsk to complex and divide it on DOUBLE_MAX
|
||||
for (int i = 0; i < total_polynomials; i++) {
|
||||
@@ -110,9 +112,8 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
|
||||
}
|
||||
}
|
||||
|
||||
cudaMemcpy(d_bsk, h_bsk, buffer_size, cudaMemcpyHostToDevice);
|
||||
cuda_memcpy_async_to_gpu(d_bsk, h_bsk, buffer_size, stream, gpu_index);
|
||||
|
||||
auto stream = static_cast<cudaStream_t *>(v_stream);
|
||||
double2 *buffer;
|
||||
switch (polynomial_size) {
|
||||
case 512:
|
||||
|
||||
@@ -187,3 +187,9 @@ int cuda_get_max_shared_memory(uint32_t gpu_index) {
|
||||
}
|
||||
return max_shared_memory;
|
||||
}
|
||||
|
||||
int cuda_synchronize_stream(void *v_stream) {
|
||||
auto stream = static_cast<cudaStream_t *>(v_stream);
|
||||
cudaStreamSynchronize(*stream);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -46,17 +46,15 @@ void cuda_keyswitch_lwe_ciphertext_vector_64(
|
||||
/* Perform functional packing keyswitch on a batch of 32 bits input LWE
|
||||
* ciphertexts. See the equivalent function on 64 bit inputs for more details.
|
||||
*/
|
||||
void cuda_fp_keyswitch_lwe_to_glwe_32(void *v_stream, void *glwe_array_out,
|
||||
void *lwe_array_in, void *fp_ksk_array,
|
||||
uint32_t input_lwe_dimension,
|
||||
uint32_t output_glwe_dimension,
|
||||
uint32_t output_polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count,
|
||||
uint32_t number_of_input_lwe,
|
||||
uint32_t number_of_keys) {
|
||||
void cuda_fp_keyswitch_lwe_to_glwe_32(
|
||||
void *v_stream, uint32_t gpu_index, void *glwe_array_out,
|
||||
void *lwe_array_in, void *fp_ksk_array, uint32_t input_lwe_dimension,
|
||||
uint32_t output_glwe_dimension, uint32_t output_polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t number_of_input_lwe,
|
||||
uint32_t number_of_keys) {
|
||||
|
||||
cuda_fp_keyswitch_lwe_to_glwe(
|
||||
v_stream, static_cast<uint32_t *>(glwe_array_out),
|
||||
v_stream, gpu_index, static_cast<uint32_t *>(glwe_array_out),
|
||||
static_cast<uint32_t *>(lwe_array_in),
|
||||
static_cast<uint32_t *>(fp_ksk_array), input_lwe_dimension,
|
||||
output_glwe_dimension, output_polynomial_size, base_log, level_count,
|
||||
@@ -68,6 +66,7 @@ void cuda_fp_keyswitch_lwe_to_glwe_32(void *v_stream, void *glwe_array_out,
|
||||
*
|
||||
* - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
|
||||
* launch
|
||||
* - `gpu_index` is the index of the GPU to be used in the kernel launch
|
||||
* - `glwe_array_out`: output batch of keyswitched ciphertexts
|
||||
* - `lwe_array_in`: input batch of num_samples LWE ciphertexts, containing
|
||||
* lwe_dimension_in mask values + 1 body value
|
||||
@@ -83,17 +82,15 @@ void cuda_fp_keyswitch_lwe_to_glwe_32(void *v_stream, void *glwe_array_out,
|
||||
* This function calls a wrapper to a device kernel that performs the functional
|
||||
* packing keyswitch.
|
||||
*/
|
||||
void cuda_fp_keyswitch_lwe_to_glwe_64(void *v_stream, void *glwe_array_out,
|
||||
void *lwe_array_in, void *fp_ksk_array,
|
||||
uint32_t input_lwe_dimension,
|
||||
uint32_t output_glwe_dimension,
|
||||
uint32_t output_polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count,
|
||||
uint32_t number_of_input_lwe,
|
||||
uint32_t number_of_keys) {
|
||||
void cuda_fp_keyswitch_lwe_to_glwe_64(
|
||||
void *v_stream, uint32_t gpu_index, void *glwe_array_out,
|
||||
void *lwe_array_in, void *fp_ksk_array, uint32_t input_lwe_dimension,
|
||||
uint32_t output_glwe_dimension, uint32_t output_polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t number_of_input_lwe,
|
||||
uint32_t number_of_keys) {
|
||||
|
||||
cuda_fp_keyswitch_lwe_to_glwe(
|
||||
v_stream, static_cast<uint64_t *>(glwe_array_out),
|
||||
v_stream, gpu_index, static_cast<uint64_t *>(glwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_array_in),
|
||||
static_cast<uint64_t *>(fp_ksk_array), input_lwe_dimension,
|
||||
output_glwe_dimension, output_polynomial_size, base_log, level_count,
|
||||
|
||||
@@ -170,6 +170,7 @@ __host__ void cuda_keyswitch_lwe_ciphertext_vector(
|
||||
uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count,
|
||||
uint32_t num_samples) {
|
||||
|
||||
cudaSetDevice(gpu_index);
|
||||
constexpr int ideal_threads = 128;
|
||||
|
||||
int lwe_dim = lwe_dimension_out + 1;
|
||||
@@ -190,7 +191,8 @@ __host__ void cuda_keyswitch_lwe_ciphertext_vector(
|
||||
|
||||
int shared_mem = sizeof(Torus) * (lwe_dimension_out + 1);
|
||||
|
||||
cudaMemset(lwe_array_out, 0, sizeof(Torus) * lwe_size_after);
|
||||
auto stream = static_cast<cudaStream_t *>(v_stream);
|
||||
cudaMemsetAsync(lwe_array_out, 0, sizeof(Torus) * lwe_size_after, *stream);
|
||||
|
||||
dim3 grid(num_samples, 1, 1);
|
||||
dim3 threads(ideal_threads, 1, 1);
|
||||
@@ -198,21 +200,20 @@ __host__ void cuda_keyswitch_lwe_ciphertext_vector(
|
||||
cudaFuncSetAttribute(keyswitch<Torus>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_mem);
|
||||
|
||||
auto stream = static_cast<cudaStream_t *>(v_stream);
|
||||
keyswitch<<<grid, threads, shared_mem, *stream>>>(
|
||||
lwe_array_out, lwe_array_in, ksk, lwe_dimension_in, lwe_dimension_out,
|
||||
base_log, level_count, lwe_lower, lwe_upper, cutoff);
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
|
||||
cudaStreamSynchronize(*stream);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void cuda_fp_keyswitch_lwe_to_glwe(
|
||||
void *v_stream, Torus *glwe_array_out, Torus *lwe_array_in,
|
||||
Torus *fp_ksk_array, uint32_t lwe_dimension_in, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
|
||||
uint32_t number_of_input_lwe, uint32_t number_of_keys) {
|
||||
void *v_stream, uint32_t gpu_index, Torus *glwe_array_out,
|
||||
Torus *lwe_array_in, Torus *fp_ksk_array, uint32_t lwe_dimension_in,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t number_of_input_lwe,
|
||||
uint32_t number_of_keys) {
|
||||
cudaSetDevice(gpu_index);
|
||||
int threads = 256;
|
||||
int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size;
|
||||
dim3 blocks(glwe_accumulator_size / threads, number_of_input_lwe, 1);
|
||||
@@ -223,8 +224,6 @@ __host__ void cuda_fp_keyswitch_lwe_to_glwe(
|
||||
glwe_array_out, lwe_array_in, fp_ksk_array, lwe_dimension_in,
|
||||
glwe_dimension, polynomial_size, base_log, level_count,
|
||||
number_of_input_lwe, number_of_keys);
|
||||
|
||||
cudaStreamSynchronize(*stream);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@@ -47,8 +47,6 @@ host_cleartext_multiplication(void *v_stream, uint32_t gpu_index, T *output,
|
||||
cleartext_multiplication<<<grid, thds, 0, *stream>>>(
|
||||
output, lwe_input, cleartext_input, input_lwe_dimension, num_entries);
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
|
||||
cudaStreamSynchronize(*stream);
|
||||
}
|
||||
|
||||
#endif // CUDA_MULT_H
|
||||
|
||||
@@ -41,8 +41,6 @@ __host__ void host_negation(void *v_stream, uint32_t gpu_index, T *output,
|
||||
auto stream = static_cast<cudaStream_t *>(v_stream);
|
||||
negation<<<grid, thds, 0, *stream>>>(output, input, num_entries);
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
|
||||
cudaStreamSynchronize(*stream);
|
||||
}
|
||||
|
||||
#endif // CUDA_NEGATE_H
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
#ifndef GPU_POLYNOMIAL_FUNCTIONS
|
||||
#define GPU_POLYNOMIAL_FUNCTIONS
|
||||
#include "utils/memory.cuh"
|
||||
#include "helper_cuda.h"
|
||||
#include "utils/timer.cuh"
|
||||
|
||||
/*
|
||||
|
||||
@@ -5,8 +5,8 @@
|
||||
#include "crypto/torus.cuh"
|
||||
#include "fft/bnsmfft.cuh"
|
||||
#include "fft/smfft.cuh"
|
||||
#include "helper_cuda.h"
|
||||
#include "parameters.cuh"
|
||||
#include "utils/memory.cuh"
|
||||
#include "utils/timer.cuh"
|
||||
#include <cassert>
|
||||
#include <cstdint>
|
||||
@@ -32,67 +32,6 @@ template <typename T, class params> class Vector;
|
||||
|
||||
template <typename FT, class params> class Twiddles;
|
||||
|
||||
template <typename T, class params> class VectorPolynomial {
|
||||
public:
|
||||
T *m_data;
|
||||
uint32_t m_num_polynomials;
|
||||
|
||||
__device__ VectorPolynomial(T *data, uint32_t num_polynomials)
|
||||
: m_data(data), m_num_polynomials(num_polynomials) {}
|
||||
|
||||
__device__ VectorPolynomial<T, params> get_chunk(int chunk_num,
|
||||
int chunk_size) {
|
||||
int pos = chunk_num * chunk_size;
|
||||
T *ptr = &m_data[pos];
|
||||
|
||||
return VectorPolynomial<T, params>(ptr, chunk_size / params::degree);
|
||||
}
|
||||
|
||||
__host__ VectorPolynomial() {}
|
||||
|
||||
__host__ VectorPolynomial(DeviceMemory &dmem, uint32_t num_polynomials,
|
||||
int device)
|
||||
: m_num_polynomials(num_polynomials) {
|
||||
dmem.get_allocation(&m_data, m_num_polynomials * params::degree, device);
|
||||
}
|
||||
|
||||
__host__ VectorPolynomial(DeviceMemory &dmem, T *source,
|
||||
uint32_t num_polynomials, int device)
|
||||
: m_num_polynomials(num_polynomials) {
|
||||
dmem.get_allocation_and_copy_async(
|
||||
&m_data, source, m_num_polynomials * params::degree, device);
|
||||
}
|
||||
|
||||
__host__ void copy_to_host(T *dest) {
|
||||
cudaMemcpyAsync(dest, m_data,
|
||||
sizeof(T) * m_num_polynomials * params::degree,
|
||||
cudaMemcpyDeviceToHost);
|
||||
}
|
||||
|
||||
__device__ void copy_into(Polynomial<T, params> &dest,
|
||||
int polynomial_number = 0) {
|
||||
int tid = threadIdx.x;
|
||||
int begin = polynomial_number * params::degree;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt; i++) {
|
||||
dest.coefficients[tid] = m_data[tid + begin];
|
||||
tid = tid + params::degree / params::opt;
|
||||
}
|
||||
synchronize_threads_in_block();
|
||||
}
|
||||
|
||||
__device__ void split_into_polynomials(Polynomial<T, params> &first,
|
||||
Polynomial<T, params> &second) {
|
||||
int tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt; i++) {
|
||||
first.coefficients[tid] = m_data[tid];
|
||||
second.coefficients[tid] = m_data[tid + params::degree];
|
||||
tid = tid + params::degree / params::opt;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T, class params> class Polynomial {
|
||||
public:
|
||||
T *coefficients;
|
||||
@@ -104,18 +43,6 @@ public:
|
||||
__device__ Polynomial(char *memory, uint32_t degree)
|
||||
: coefficients((T *)memory), degree(degree) {}
|
||||
|
||||
__host__ Polynomial(DeviceMemory &dmem, uint32_t degree, int device)
|
||||
: degree(degree) {
|
||||
dmem.get_allocation(&this->coefficients, params::degree, device);
|
||||
}
|
||||
|
||||
__host__ Polynomial(DeviceMemory &dmem, T *source, uint32_t degree,
|
||||
int device)
|
||||
: degree(degree) {
|
||||
dmem.get_allocation_and_copy_async(&this->coefficients, source,
|
||||
params::degree, device);
|
||||
}
|
||||
|
||||
__host__ void copy_to_host(T *dest) {
|
||||
cudaMemcpyAsync(dest, this->coefficients, sizeof(T) * params::degree,
|
||||
cudaMemcpyDeviceToHost);
|
||||
@@ -402,22 +329,6 @@ public:
|
||||
cudaMemcpyHostToDevice);
|
||||
}
|
||||
|
||||
__host__ Vector(DeviceMemory &dmem, T *source, uint32_t size_source,
|
||||
int device)
|
||||
: m_size(size_source) {
|
||||
dmem.get_allocation_and_copy_async(&m_data, source, m_size, device);
|
||||
}
|
||||
|
||||
__host__ Vector(DeviceMemory &dmem, T *source, uint32_t allocation_size,
|
||||
uint32_t copy_size, int device)
|
||||
: m_size(allocation_size) {
|
||||
if (copy_size > allocation_size) {
|
||||
printf("warning: copying more than allocation");
|
||||
}
|
||||
dmem.get_allocation_and_copy_async(&m_data, source, m_size, copy_size,
|
||||
device);
|
||||
}
|
||||
|
||||
__host__ void copy_to_host(T *dest) {
|
||||
cudaMemcpyAsync(dest, m_data, sizeof(T) * m_size, cudaMemcpyDeviceToHost);
|
||||
}
|
||||
|
||||
@@ -1,77 +0,0 @@
|
||||
#ifndef CNCRT_SHMEM_H
|
||||
#define CNCRT_SHMEM_H
|
||||
|
||||
#include "helper_cuda.h"
|
||||
#include <atomic>
|
||||
#include <iostream>
|
||||
#include <mutex>
|
||||
#include <thread>
|
||||
#include <tuple>
|
||||
#include <vector>
|
||||
|
||||
class DeviceMemory {
|
||||
public:
|
||||
std::vector<std::tuple<void *, int>> m_allocated;
|
||||
std::mutex m_allocation_mtx;
|
||||
std::atomic<uint32_t> m_total_devices;
|
||||
|
||||
DeviceMemory() : m_total_devices(1) {}
|
||||
|
||||
__host__ void set_device(int device) {
|
||||
if (device > m_total_devices)
|
||||
m_total_devices = device + 1;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__host__ void get_allocation(T **ptr, int elements, int device) {
|
||||
T *res;
|
||||
cudaMalloc((void **)&res, sizeof(T) * elements);
|
||||
*ptr = res;
|
||||
std::lock_guard<std::mutex> lock(m_allocation_mtx);
|
||||
m_allocated.push_back(std::make_tuple(res, device));
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__host__ void get_allocation_and_copy_async(T **ptr, T *src, int elements,
|
||||
int device) {
|
||||
T *res;
|
||||
cudaMalloc((void **)&res, sizeof(T) * elements);
|
||||
cudaMemcpyAsync(res, src, sizeof(T) * elements, cudaMemcpyHostToDevice);
|
||||
*ptr = res;
|
||||
std::lock_guard<std::mutex> lock(m_allocation_mtx);
|
||||
m_allocated.push_back(std::make_tuple(res, device));
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__host__ void get_allocation_and_copy_async(T **ptr, T *src, int allocation,
|
||||
int elements, int device) {
|
||||
T *res;
|
||||
cudaMalloc((void **)&res, sizeof(T) * allocation);
|
||||
cudaMemcpyAsync(res, src, sizeof(T) * elements, cudaMemcpyHostToDevice);
|
||||
*ptr = res;
|
||||
std::lock_guard<std::mutex> lock(m_allocation_mtx);
|
||||
m_allocated.push_back(std::make_tuple(res, device));
|
||||
}
|
||||
|
||||
void free_all_from_device(int device) {
|
||||
cudaSetDevice(device);
|
||||
for (auto elem : m_allocated) {
|
||||
auto dev = std::get<1>(elem);
|
||||
if (dev == device) {
|
||||
auto mem = std::get<0>(elem);
|
||||
checkCudaErrors(cudaFree(mem));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__host__ ~DeviceMemory() {
|
||||
for (auto elem : m_allocated) {
|
||||
auto dev = std::get<1>(elem);
|
||||
auto mem = std::get<0>(elem);
|
||||
cudaSetDevice(dev);
|
||||
checkCudaErrors(cudaFree(mem));
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
#endif // CNCRT_SHMEM_H
|
||||
@@ -1,6 +1,7 @@
|
||||
#ifndef CNCRT_TIMER_H
|
||||
#define CNCRT_TIMER_H
|
||||
|
||||
#include <iostream>
|
||||
#define synchronize_threads_in_block() __syncthreads()
|
||||
|
||||
template <bool active> class CudaMeasureExecution {
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
#ifndef VERTICAL_PACKING_H
|
||||
#define VERTICAL_PACKING_H
|
||||
|
||||
#include "../include/helper_cuda.h"
|
||||
#include "bootstrap.h"
|
||||
#include "complex/operations.cuh"
|
||||
#include "crypto/gadget.cuh"
|
||||
@@ -11,11 +10,11 @@
|
||||
#include "fft/bnsmfft.cuh"
|
||||
#include "fft/smfft.cuh"
|
||||
#include "fft/twiddles.cuh"
|
||||
#include "helper_cuda.h"
|
||||
#include "polynomial/functions.cuh"
|
||||
#include "polynomial/parameters.cuh"
|
||||
#include "polynomial/polynomial.cuh"
|
||||
#include "polynomial/polynomial_math.cuh"
|
||||
#include "utils/memory.cuh"
|
||||
#include "utils/timer.cuh"
|
||||
|
||||
template <class params> __device__ void fft(double2 *output) {
|
||||
@@ -266,11 +265,13 @@ __global__ void device_batch_cmux(Torus *glwe_array_out, Torus *glwe_array_in,
|
||||
* - tau: The quantity of CMUX trees that should be executed
|
||||
*/
|
||||
template <typename Torus, typename STorus, class params>
|
||||
void host_cmux_tree(void *v_stream, uint32_t gpu_index, Torus *glwe_array_out,
|
||||
Torus *ggsw_in, Torus *lut_vector, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t r, uint32_t tau,
|
||||
uint32_t max_shared_memory) {
|
||||
__host__ void host_cmux_tree(void *v_stream, uint32_t gpu_index,
|
||||
Torus *glwe_array_out, Torus *ggsw_in,
|
||||
Torus *lut_vector, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t r, uint32_t tau,
|
||||
uint32_t max_shared_memory) {
|
||||
cudaSetDevice(gpu_index);
|
||||
auto stream = static_cast<cudaStream_t *>(v_stream);
|
||||
|
||||
int num_lut = (1 << r);
|
||||
@@ -278,12 +279,9 @@ void host_cmux_tree(void *v_stream, uint32_t gpu_index, Torus *glwe_array_out,
|
||||
// Simply copy the LUTs
|
||||
add_padding_to_lut_async<Torus, params>(glwe_array_out, lut_vector,
|
||||
glwe_dimension, tau, stream);
|
||||
checkCudaErrors(cudaStreamSynchronize(*stream));
|
||||
return;
|
||||
}
|
||||
|
||||
cuda_initialize_twiddles(polynomial_size, 0);
|
||||
|
||||
int memory_needed_per_block =
|
||||
sizeof(Torus) * polynomial_size + // glwe_sub_mask
|
||||
sizeof(Torus) * polynomial_size + // glwe_sub_body
|
||||
@@ -365,11 +363,6 @@ void host_cmux_tree(void *v_stream, uint32_t gpu_index, Torus *glwe_array_out,
|
||||
glwe_array_out + i * glwe_size, output + i * num_lut * glwe_size,
|
||||
glwe_size * sizeof(Torus), cudaMemcpyDeviceToDevice, *stream));
|
||||
|
||||
// We only need synchronization to assert that data is in glwe_array_out
|
||||
// before returning. Memory release can be added to the stream and processed
|
||||
// later.
|
||||
checkCudaErrors(cudaStreamSynchronize(*stream));
|
||||
|
||||
// Free memory
|
||||
cuda_drop_async(d_ggsw_fft_in, stream, gpu_index);
|
||||
cuda_drop_async(d_buffer1, stream, gpu_index);
|
||||
@@ -466,12 +459,13 @@ __global__ void device_blind_rotation_and_sample_extraction(
|
||||
}
|
||||
|
||||
template <typename Torus, typename STorus, class params>
|
||||
void host_blind_rotate_and_sample_extraction(
|
||||
__host__ void host_blind_rotate_and_sample_extraction(
|
||||
void *v_stream, uint32_t gpu_index, Torus *lwe_out, Torus *ggsw_in,
|
||||
Torus *lut_vector, uint32_t mbr_size, uint32_t tau, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log, uint32_t l_gadget,
|
||||
uint32_t max_shared_memory) {
|
||||
|
||||
cudaSetDevice(gpu_index);
|
||||
assert(glwe_dimension ==
|
||||
1); // For larger k we will need to adjust the mask size
|
||||
auto stream = static_cast<cudaStream_t *>(v_stream);
|
||||
|
||||
@@ -3,12 +3,11 @@
|
||||
|
||||
#include "cooperative_groups.h"
|
||||
|
||||
#include "../include/helper_cuda.h"
|
||||
#include "bit_extraction.cuh"
|
||||
#include "bootstrap.h"
|
||||
#include "circuit_bootstrap.cuh"
|
||||
#include "helper_cuda.h"
|
||||
#include "utils/kernel_dimensions.cuh"
|
||||
#include "utils/memory.cuh"
|
||||
#include "utils/timer.cuh"
|
||||
#include "vertical_packing.cuh"
|
||||
|
||||
@@ -40,6 +39,7 @@ __host__ void host_circuit_bootstrap_vertical_packing(
|
||||
uint32_t level_count_cbs, uint32_t number_of_inputs, uint32_t tau,
|
||||
uint32_t max_shared_memory) {
|
||||
|
||||
cudaSetDevice(gpu_index);
|
||||
auto stream = static_cast<cudaStream_t *>(v_stream);
|
||||
|
||||
// allocate and initialize device pointers for circuit bootstrap
|
||||
@@ -140,6 +140,7 @@ __host__ void host_wop_pbs(
|
||||
uint32_t number_of_bits_to_extract, uint32_t number_of_inputs,
|
||||
uint32_t max_shared_memory) {
|
||||
|
||||
cudaSetDevice(gpu_index);
|
||||
auto stream = static_cast<cudaStream_t *>(v_stream);
|
||||
|
||||
// let mut h_lut_vector_indexes = vec![0 as u32; 1];
|
||||
|
||||
Reference in New Issue
Block a user