fix(cuda): fix asynchronous behaviour for pbs and wop pbs

Author: Agnes Leroy
Committed by: Agnès Leroy
Date: 2023-01-23 15:55:57 +01:00
Parent: 2fcc5b2d0f
Commit: bd9cbbc7af
20 changed files with 81 additions and 259 deletions
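The changes below all follow one pattern: device work is enqueued on the caller-provided CUDA stream (kernel launches get an explicit stream argument; cudaMemcpy, cudaMemset and cudaMalloc become their stream-ordered async counterparts), the per-call cudaStreamSynchronize calls are removed, and a single cuda_synchronize_stream entry point is exposed for callers that need a host-side barrier. A hedged before/after sketch of the pattern — the kernel and wrapper are illustrative stand-ins, not code from this commit:

#include <cstdint>
#include <cuda_runtime.h>

// Illustrative stand-in kernel.
__global__ void axpy(float *y, const float *x, float a, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    y[i] += a * x[i];
}

// After this commit, a host wrapper only enqueues work on the stream:
void host_axpy(void *v_stream, uint32_t gpu_index, float *y, const float *x,
               float a, int n) {
  cudaSetDevice(gpu_index); // added throughout this commit
  auto stream = static_cast<cudaStream_t *>(v_stream);
  axpy<<<(n + 255) / 256, 256, 0, *stream>>>(y, x, a, n);
  // No trailing cudaStreamSynchronize(*stream) anymore; the caller decides
  // when (and whether) to block, via cuda_synchronize_stream(v_stream).
}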

View File

@@ -5,7 +5,8 @@
extern "C" {
void cuda_initialize_twiddles(uint32_t polynomial_size, uint32_t gpu_index);
void cuda_initialize_twiddles(uint32_t polynomial_size, void *v_stream,
uint32_t gpu_index);
void cuda_convert_lwe_bootstrap_key_32(void *dest, void *src, void *v_stream,
uint32_t gpu_index,

View File

@@ -33,4 +33,6 @@ int cuda_drop(void *ptr, uint32_t gpu_index);
 int cuda_drop_async(void *ptr, cudaStream_t *stream, uint32_t gpu_index);
 int cuda_get_max_shared_memory(uint32_t gpu_index);
+int cuda_synchronize_stream(void *v_stream);
 }

View File

@@ -15,23 +15,19 @@ void cuda_keyswitch_lwe_ciphertext_vector_64(
     void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
     uint32_t base_log, uint32_t level_count, uint32_t num_samples);
-void cuda_fp_keyswitch_lwe_to_glwe_32(void *v_stream, void *glwe_array_out,
-                                      void *lwe_array_in, void *fp_ksk_array,
-                                      uint32_t input_lwe_dimension,
-                                      uint32_t output_glwe_dimension,
-                                      uint32_t output_polynomial_size,
-                                      uint32_t base_log, uint32_t level_count,
-                                      uint32_t number_of_input_lwe,
-                                      uint32_t number_of_keys);
+void cuda_fp_keyswitch_lwe_to_glwe_32(
+    void *v_stream, uint32_t gpu_index, void *glwe_array_out,
+    void *lwe_array_in, void *fp_ksk_array, uint32_t input_lwe_dimension,
+    uint32_t output_glwe_dimension, uint32_t output_polynomial_size,
+    uint32_t base_log, uint32_t level_count, uint32_t number_of_input_lwe,
+    uint32_t number_of_keys);
-void cuda_fp_keyswitch_lwe_to_glwe_64(void *v_stream, void *glwe_array_out,
-                                      void *lwe_array_in, void *fp_ksk_array,
-                                      uint32_t input_lwe_dimension,
-                                      uint32_t output_glwe_dimension,
-                                      uint32_t output_polynomial_size,
-                                      uint32_t base_log, uint32_t level_count,
-                                      uint32_t number_of_input_lwe,
-                                      uint32_t number_of_keys);
+void cuda_fp_keyswitch_lwe_to_glwe_64(
+    void *v_stream, uint32_t gpu_index, void *glwe_array_out,
+    void *lwe_array_in, void *fp_ksk_array, uint32_t input_lwe_dimension,
+    uint32_t output_glwe_dimension, uint32_t output_polynomial_size,
+    uint32_t base_log, uint32_t level_count, uint32_t number_of_input_lwe,
+    uint32_t number_of_keys);
 }
 #endif // CNCRT_KS_H_

View File

@@ -59,8 +59,6 @@ __host__ void host_addition(void *v_stream, uint32_t gpu_index, T *output,
   auto stream = static_cast<cudaStream_t *>(v_stream);
   addition<<<grid, thds, 0, *stream>>>(output, input_1, input_2, num_entries);
   checkCudaErrors(cudaGetLastError());
-  cudaStreamSynchronize(*stream);
 }

 template <typename T>
@@ -86,7 +84,5 @@ __host__ void host_addition_plaintext(void *v_stream, uint32_t gpu_index,
   plaintext_addition<<<grid, thds, 0, *stream>>>(
       output, lwe_input, plaintext_input, input_lwe_dimension, num_entries);
   checkCudaErrors(cudaGetLastError());
-  cudaStreamSynchronize(*stream);
 }

 #endif // CUDA_ADD_H
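Removing the trailing synchronization is safe for device-side correctness because operations issued to the same stream execute in issue order; only the host stops blocking after every call. A minimal, self-contained demonstration of that guarantee (not from the repo):

#include <cstdio>
#include <cuda_runtime.h>

__global__ void fill(int *v, int x, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    v[i] = x;
}

__global__ void add_one(int *v, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    v[i] += 1; // guaranteed to see fill()'s writes: same stream
}

int main() {
  const int n = 1024;
  int *d_v, h_v0 = 0;
  cudaStream_t s;
  cudaStreamCreate(&s);
  cudaMalloc(&d_v, n * sizeof(int));
  fill<<<(n + 255) / 256, 256, 0, s>>>(d_v, 41, n);                    // 1st
  add_one<<<(n + 255) / 256, 256, 0, s>>>(d_v, n);                     // 2nd
  cudaMemcpyAsync(&h_v0, d_v, sizeof(int), cudaMemcpyDeviceToHost, s); // 3rd
  cudaStreamSynchronize(s); // single barrier, only because the host reads h_v0
  printf("%d\n", h_v0);     // prints 42
  cudaFree(d_v);
  cudaStreamDestroy(s);
  return 0;
}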

View File

@@ -145,6 +145,7 @@ __host__ void host_extract_bits(
     uint32_t base_log_ksk, uint32_t level_count_ksk, uint32_t number_of_samples,
     uint32_t max_shared_memory) {
+  cudaSetDevice(gpu_index);
   auto stream = static_cast<cudaStream_t *>(v_stream);
   uint32_t ciphertext_n_bits = sizeof(Torus) * 8;

View File

@@ -9,7 +9,6 @@
#include "cooperative_groups.h"
#include "../include/helper_cuda.h"
#include "bootstrap.h"
#include "complex/operations.cuh"
#include "crypto/gadget.cuh"
@@ -18,11 +17,11 @@
#include "fft/bnsmfft.cuh"
#include "fft/smfft.cuh"
#include "fft/twiddles.cuh"
#include "helper_cuda.h"
#include "polynomial/functions.cuh"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial.cuh"
#include "polynomial/polynomial_math.cuh"
#include "utils/memory.cuh"
#include "utils/timer.cuh"
template <typename Torus, class params, sharedMemDegree SMD>
@@ -284,6 +283,7 @@ __host__ void host_bootstrap_amortized(
     uint32_t input_lwe_ciphertext_count, uint32_t num_lut_vectors,
     uint32_t lwe_idx, uint32_t max_shared_memory) {
+  cudaSetDevice(gpu_index);
   int SM_FULL = sizeof(Torus) * polynomial_size + // accumulator mask
                 sizeof(Torus) * polynomial_size + // accumulator body
                 sizeof(Torus) * polynomial_size + // accumulator mask rotated
@@ -356,9 +356,6 @@ __host__ void host_bootstrap_amortized(
   }
   checkCudaErrors(cudaGetLastError());
-  // Synchronize the streams before copying the result to lwe_array_out at the
-  // right place
-  cudaStreamSynchronize(*stream);
   cuda_drop_async(d_mem, stream, gpu_index);
 }
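cuda_drop_async replaces the sync-then-free sequence: the release is enqueued on the stream, so the buffer remains valid until every previously issued kernel that uses it has completed. Its implementation is not part of this diff; the following is only a plausible sketch on CUDA 11.2+, where cudaFreeAsync provides exactly this stream-ordered semantics:

#include <cstdint>
#include <cuda_runtime.h>

// Hypothetical implementation; the real helper lives elsewhere in the repo.
int cuda_drop_async(void *ptr, cudaStream_t *stream, uint32_t gpu_index) {
  cudaSetDevice(gpu_index);
  // The free takes effect only after all prior work on *stream has completed.
  return cudaFreeAsync(ptr, *stream) == cudaSuccess ? 0 : -1;
}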

View File

@@ -9,7 +9,6 @@
#include "cooperative_groups.h"
#include "../include/helper_cuda.h"
#include "bootstrap.h"
#include "complex/operations.cuh"
#include "crypto/gadget.cuh"
@@ -18,10 +17,10 @@
#include "fft/bnsmfft.cuh"
#include "fft/smfft.cuh"
#include "fft/twiddles.cuh"
#include "helper_cuda.h"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial.cuh"
#include "polynomial/polynomial_math.cuh"
#include "utils/memory.cuh"
#include "utils/timer.cuh"
// Cooperative groups are used in the low latency PBS
@@ -263,6 +262,7 @@ __host__ void host_bootstrap_low_latency(
     uint32_t input_lwe_ciphertext_count, uint32_t num_lut_vectors,
     uint32_t max_shared_memory) {
+  cudaSetDevice(gpu_index);
   auto stream = static_cast<cudaStream_t *>(v_stream);
   int buffer_size_per_gpu = level_count * input_lwe_ciphertext_count *
@@ -346,7 +346,6 @@ __host__ void host_bootstrap_low_latency(
   checkCudaErrors(cudaGetLastError());
   // Synchronize the streams before copying the result to lwe_array_out at the
   // right place
-  cudaStreamSynchronize(*stream);
   cuda_drop_async(mask_buffer_fft, stream, gpu_index);
   cuda_drop_async(body_buffer_fft, stream, gpu_index);
   cuda_drop_async(d_mem, stream, gpu_index);

View File

@@ -1,11 +1,11 @@
 #ifndef CBS_H
 #define CBS_H
-#include "../include/helper_cuda.h"
 #include "bit_extraction.cuh"
 #include "bootstrap.h"
 #include "bootstrap_amortized.cuh"
 #include "device.h"
+#include "helper_cuda.h"
 #include "keyswitch.cuh"
 #include "polynomial/parameters.cuh"
 #include "utils/timer.cuh"
@@ -113,6 +113,7 @@ __host__ void host_circuit_bootstrap(
     uint32_t level_bsk, uint32_t base_log_bsk, uint32_t level_pksk,
     uint32_t base_log_pksk, uint32_t level_cbs, uint32_t base_log_cbs,
     uint32_t number_of_samples, uint32_t max_shared_memory) {
+  cudaSetDevice(gpu_index);
   auto stream = static_cast<cudaStream_t *>(v_stream);
   uint32_t ciphertext_n_bits = sizeof(Torus) * 8;
@@ -151,12 +152,12 @@ __host__ void host_circuit_bootstrap(
   dim3 copy_block(params::degree / params::opt, 1, 1);
   // Add q/4 to center the error while computing a negacyclic LUT
   // copy pbs result (glwe_dimension + 1) times to be an input of fp-ks
-  copy_add_lwe_cbs<Torus, params><<<copy_grid, copy_block>>>(
+  copy_add_lwe_cbs<Torus, params><<<copy_grid, copy_block, 0, *stream>>>(
       lwe_array_in_fp_ks_buffer, lwe_array_out_pbs_buffer, ciphertext_n_bits,
       base_log_cbs, level_cbs);
   cuda_fp_keyswitch_lwe_to_glwe(
-      v_stream, ggsw_out, lwe_array_in_fp_ks_buffer, fp_ksk_array,
+      v_stream, gpu_index, ggsw_out, lwe_array_in_fp_ks_buffer, fp_ksk_array,
       polynomial_size, glwe_dimension, polynomial_size, base_log_pksk,
       level_pksk, pbs_count * (glwe_dimension + 1), glwe_dimension + 1);
 }
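The copy_add_lwe_cbs fix is the crux of the async bug: a launch without an explicit stream argument goes to the legacy default stream, not to v_stream. At best that injects device-wide implicit synchronization; with non-blocking or per-thread default streams it loses ordering against the keyswitch that consumes the copy's output. A minimal illustration with a hypothetical kernel:

#include <cuda_runtime.h>

__global__ void touch(int *p) { atomicAdd(p, 1); }

void enqueue(cudaStream_t s, int *d_p) {
  touch<<<1, 1>>>(d_p);       // bug: legacy default stream, not s
  touch<<<1, 1, 0, s>>>(d_p); // fix: dynamic shared memory = 0, stream = s
}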

View File

@@ -38,7 +38,8 @@ __device__ T *get_ith_body_kth_block(T *ptr, int i, int k, int level,
                                       polynomial_size / 2];
 }

-void cuda_initialize_twiddles(uint32_t polynomial_size, uint32_t gpu_index) {
+void cuda_initialize_twiddles(uint32_t polynomial_size, void *v_stream,
+                              uint32_t gpu_index) {
   cudaSetDevice(gpu_index);
   int sw_size = polynomial_size / 2;
   short *sw1_h, *sw2_h;
@@ -61,10 +62,11 @@ void cuda_initialize_twiddles(uint32_t polynomial_size, uint32_t gpu_index) {
       cnt++;
     }
   }
-  cudaMemcpyToSymbol(SW1, sw1_h, sw_size * sizeof(short), 0,
-                     cudaMemcpyHostToDevice);
-  cudaMemcpyToSymbol(SW2, sw2_h, sw_size * sizeof(short), 0,
-                     cudaMemcpyHostToDevice);
+  auto stream = static_cast<cudaStream_t *>(v_stream);
+  cudaMemcpyToSymbolAsync(SW1, sw1_h, sw_size * sizeof(short), 0,
+                          cudaMemcpyHostToDevice, *stream);
+  cudaMemcpyToSymbolAsync(SW2, sw2_h, sw_size * sizeof(short), 0,
+                          cudaMemcpyHostToDevice, *stream);
   free(sw1_h);
   free(sw2_h);
 }
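One subtlety: sw1_h and sw2_h are pageable buffers freed immediately after the async copies. This is safe because, for pageable sources, the runtime stages the data through a pinned buffer and cudaMemcpyToSymbolAsync returns only once the source has been consumed — so the copy is not fully asynchronous. A sketch of the fully asynchronous variant with page-locked memory (the symbol capacity and helper name are assumptions for illustration):

#include <cstring>
#include <cuda_runtime.h>

__constant__ short SW1[2048]; // assumed capacity, for illustration only

void upload_sw1_async(const short *src, int sw_size, cudaStream_t stream) {
  short *pinned;
  cudaMallocHost(&pinned, sw_size * sizeof(short)); // page-locked staging
  memcpy(pinned, src, sw_size * sizeof(short));
  cudaMemcpyToSymbolAsync(SW1, pinned, sw_size * sizeof(short), 0,
                          cudaMemcpyHostToDevice, stream);
  // pinned must outlive the copy; release it with cudaFreeHost only after a
  // later synchronization point on this stream.
}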
@@ -91,8 +93,8 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
   int blockSize = polynomial_size / choose_opt(polynomial_size);
   double2 *h_bsk = (double2 *)malloc(buffer_size);
-  double2 *d_bsk;
-  cudaMalloc((void **)&d_bsk, buffer_size);
+  auto stream = static_cast<cudaStream_t *>(v_stream);
+  double2 *d_bsk = (double2 *)cuda_malloc_async(buffer_size, stream, gpu_index);

   // compress real bsk to complex and divide it on DOUBLE_MAX
   for (int i = 0; i < total_polynomials; i++) {
@@ -110,9 +112,8 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
     }
   }

-  cudaMemcpy(d_bsk, h_bsk, buffer_size, cudaMemcpyHostToDevice);
-  auto stream = static_cast<cudaStream_t *>(v_stream);
+  cuda_memcpy_async_to_gpu(d_bsk, h_bsk, buffer_size, stream, gpu_index);

   double2 *buffer;
   switch (polynomial_size) {
   case 512:

View File

@@ -187,3 +187,9 @@ int cuda_get_max_shared_memory(uint32_t gpu_index) {
   }
   return max_shared_memory;
 }
+
+int cuda_synchronize_stream(void *v_stream) {
+  auto stream = static_cast<cudaStream_t *>(v_stream);
+  cudaStreamSynchronize(*stream);
+  return 0;
+}

View File

@@ -46,17 +46,15 @@ void cuda_keyswitch_lwe_ciphertext_vector_64(
 /* Perform functional packing keyswitch on a batch of 32 bits input LWE
  * ciphertexts. See the equivalent function on 64 bit inputs for more details.
  */
-void cuda_fp_keyswitch_lwe_to_glwe_32(void *v_stream, void *glwe_array_out,
-                                      void *lwe_array_in, void *fp_ksk_array,
-                                      uint32_t input_lwe_dimension,
-                                      uint32_t output_glwe_dimension,
-                                      uint32_t output_polynomial_size,
-                                      uint32_t base_log, uint32_t level_count,
-                                      uint32_t number_of_input_lwe,
-                                      uint32_t number_of_keys) {
+void cuda_fp_keyswitch_lwe_to_glwe_32(
+    void *v_stream, uint32_t gpu_index, void *glwe_array_out,
+    void *lwe_array_in, void *fp_ksk_array, uint32_t input_lwe_dimension,
+    uint32_t output_glwe_dimension, uint32_t output_polynomial_size,
+    uint32_t base_log, uint32_t level_count, uint32_t number_of_input_lwe,
+    uint32_t number_of_keys) {
   cuda_fp_keyswitch_lwe_to_glwe(
-      v_stream, static_cast<uint32_t *>(glwe_array_out),
+      v_stream, gpu_index, static_cast<uint32_t *>(glwe_array_out),
       static_cast<uint32_t *>(lwe_array_in),
       static_cast<uint32_t *>(fp_ksk_array), input_lwe_dimension,
       output_glwe_dimension, output_polynomial_size, base_log, level_count,
@@ -68,6 +66,7 @@ void cuda_fp_keyswitch_lwe_to_glwe_32(void *v_stream, void *glwe_array_out,
  *
  * - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
  * launch
+ * - `gpu_index` is the index of the GPU to be used in the kernel launch
  * - `glwe_array_out`: output batch of keyswitched ciphertexts
  * - `lwe_array_in`: input batch of num_samples LWE ciphertexts, containing
  * lwe_dimension_in mask values + 1 body value
@@ -83,17 +82,15 @@ void cuda_fp_keyswitch_lwe_to_glwe_32(void *v_stream, void *glwe_array_out,
  * This function calls a wrapper to a device kernel that performs the functional
  * packing keyswitch.
  */
-void cuda_fp_keyswitch_lwe_to_glwe_64(void *v_stream, void *glwe_array_out,
-                                      void *lwe_array_in, void *fp_ksk_array,
-                                      uint32_t input_lwe_dimension,
-                                      uint32_t output_glwe_dimension,
-                                      uint32_t output_polynomial_size,
-                                      uint32_t base_log, uint32_t level_count,
-                                      uint32_t number_of_input_lwe,
-                                      uint32_t number_of_keys) {
+void cuda_fp_keyswitch_lwe_to_glwe_64(
+    void *v_stream, uint32_t gpu_index, void *glwe_array_out,
+    void *lwe_array_in, void *fp_ksk_array, uint32_t input_lwe_dimension,
+    uint32_t output_glwe_dimension, uint32_t output_polynomial_size,
+    uint32_t base_log, uint32_t level_count, uint32_t number_of_input_lwe,
+    uint32_t number_of_keys) {
   cuda_fp_keyswitch_lwe_to_glwe(
-      v_stream, static_cast<uint64_t *>(glwe_array_out),
+      v_stream, gpu_index, static_cast<uint64_t *>(glwe_array_out),
       static_cast<uint64_t *>(lwe_array_in),
       static_cast<uint64_t *>(fp_ksk_array), input_lwe_dimension,
       output_glwe_dimension, output_polynomial_size, base_log, level_count,

View File

@@ -170,6 +170,7 @@ __host__ void cuda_keyswitch_lwe_ciphertext_vector(
     uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count,
     uint32_t num_samples) {
+  cudaSetDevice(gpu_index);
   constexpr int ideal_threads = 128;

   int lwe_dim = lwe_dimension_out + 1;
@@ -190,7 +191,8 @@ __host__ void cuda_keyswitch_lwe_ciphertext_vector(
   int shared_mem = sizeof(Torus) * (lwe_dimension_out + 1);
-  cudaMemset(lwe_array_out, 0, sizeof(Torus) * lwe_size_after);
+  auto stream = static_cast<cudaStream_t *>(v_stream);
+  cudaMemsetAsync(lwe_array_out, 0, sizeof(Torus) * lwe_size_after, *stream);

   dim3 grid(num_samples, 1, 1);
   dim3 threads(ideal_threads, 1, 1);
@@ -198,21 +200,20 @@ __host__ void cuda_keyswitch_lwe_ciphertext_vector(
   cudaFuncSetAttribute(keyswitch<Torus>,
                        cudaFuncAttributeMaxDynamicSharedMemorySize, shared_mem);
-  auto stream = static_cast<cudaStream_t *>(v_stream);
   keyswitch<<<grid, threads, shared_mem, *stream>>>(
       lwe_array_out, lwe_array_in, ksk, lwe_dimension_in, lwe_dimension_out,
       base_log, level_count, lwe_lower, lwe_upper, cutoff);
   checkCudaErrors(cudaGetLastError());
-  cudaStreamSynchronize(*stream);
 }

 template <typename Torus>
 __host__ void cuda_fp_keyswitch_lwe_to_glwe(
-    void *v_stream, Torus *glwe_array_out, Torus *lwe_array_in,
-    Torus *fp_ksk_array, uint32_t lwe_dimension_in, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    uint32_t number_of_input_lwe, uint32_t number_of_keys) {
+    void *v_stream, uint32_t gpu_index, Torus *glwe_array_out,
+    Torus *lwe_array_in, Torus *fp_ksk_array, uint32_t lwe_dimension_in,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
+    uint32_t level_count, uint32_t number_of_input_lwe,
+    uint32_t number_of_keys) {
+  cudaSetDevice(gpu_index);
   int threads = 256;
   int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size;
   dim3 blocks(glwe_accumulator_size / threads, number_of_input_lwe, 1);
@@ -223,8 +224,6 @@ __host__ void cuda_fp_keyswitch_lwe_to_glwe(
       glwe_array_out, lwe_array_in, fp_ksk_array, lwe_dimension_in,
       glwe_dimension, polynomial_size, base_log, level_count,
       number_of_input_lwe, number_of_keys);
-  cudaStreamSynchronize(*stream);
 }

 #endif
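The cudaMemset → cudaMemsetAsync change matters for the same reason as the kernel-launch fix above: the synchronous memset runs on the default stream, so with a non-blocking user stream the keyswitch kernel could read lwe_array_out before it is zeroed. Enqueued on the same stream, the clear is guaranteed to precede the kernel. A self-contained sketch with an illustrative kernel:

#include <cuda_runtime.h>

__global__ void accumulate(int *out, const int *in, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    atomicAdd(&out[i % 32], in[i]);
}

void run(int *d_out, const int *d_in, int n, cudaStream_t s) {
  cudaMemsetAsync(d_out, 0, 32 * sizeof(int), s); // ordered before the kernel
  accumulate<<<(n + 255) / 256, 256, 0, s>>>(d_out, d_in, n);
}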

View File

@@ -47,8 +47,6 @@ host_cleartext_multiplication(void *v_stream, uint32_t gpu_index, T *output,
   cleartext_multiplication<<<grid, thds, 0, *stream>>>(
       output, lwe_input, cleartext_input, input_lwe_dimension, num_entries);
   checkCudaErrors(cudaGetLastError());
-  cudaStreamSynchronize(*stream);
 }

 #endif // CUDA_MULT_H

View File

@@ -41,8 +41,6 @@ __host__ void host_negation(void *v_stream, uint32_t gpu_index, T *output,
   auto stream = static_cast<cudaStream_t *>(v_stream);
   negation<<<grid, thds, 0, *stream>>>(output, input, num_entries);
   checkCudaErrors(cudaGetLastError());
-  cudaStreamSynchronize(*stream);
 }

 #endif // CUDA_NEGATE_H

View File

@@ -1,6 +1,6 @@
 #ifndef GPU_POLYNOMIAL_FUNCTIONS
 #define GPU_POLYNOMIAL_FUNCTIONS
-#include "utils/memory.cuh"
+#include "helper_cuda.h"
 #include "utils/timer.cuh"
/*

View File

@@ -5,8 +5,8 @@
#include "crypto/torus.cuh"
#include "fft/bnsmfft.cuh"
#include "fft/smfft.cuh"
#include "helper_cuda.h"
#include "parameters.cuh"
#include "utils/memory.cuh"
#include "utils/timer.cuh"
#include <cassert>
#include <cstdint>
@@ -32,67 +32,6 @@ template <typename T, class params> class Vector;
 template <typename FT, class params> class Twiddles;

-template <typename T, class params> class VectorPolynomial {
-public:
-  T *m_data;
-  uint32_t m_num_polynomials;
-
-  __device__ VectorPolynomial(T *data, uint32_t num_polynomials)
-      : m_data(data), m_num_polynomials(num_polynomials) {}
-
-  __device__ VectorPolynomial<T, params> get_chunk(int chunk_num,
-                                                   int chunk_size) {
-    int pos = chunk_num * chunk_size;
-    T *ptr = &m_data[pos];
-    return VectorPolynomial<T, params>(ptr, chunk_size / params::degree);
-  }
-
-  __host__ VectorPolynomial() {}
-
-  __host__ VectorPolynomial(DeviceMemory &dmem, uint32_t num_polynomials,
-                            int device)
-      : m_num_polynomials(num_polynomials) {
-    dmem.get_allocation(&m_data, m_num_polynomials * params::degree, device);
-  }
-
-  __host__ VectorPolynomial(DeviceMemory &dmem, T *source,
-                            uint32_t num_polynomials, int device)
-      : m_num_polynomials(num_polynomials) {
-    dmem.get_allocation_and_copy_async(
-        &m_data, source, m_num_polynomials * params::degree, device);
-  }
-
-  __host__ void copy_to_host(T *dest) {
-    cudaMemcpyAsync(dest, m_data,
-                    sizeof(T) * m_num_polynomials * params::degree,
-                    cudaMemcpyDeviceToHost);
-  }
-
-  __device__ void copy_into(Polynomial<T, params> &dest,
-                            int polynomial_number = 0) {
-    int tid = threadIdx.x;
-    int begin = polynomial_number * params::degree;
-#pragma unroll
-    for (int i = 0; i < params::opt; i++) {
-      dest.coefficients[tid] = m_data[tid + begin];
-      tid = tid + params::degree / params::opt;
-    }
-    synchronize_threads_in_block();
-  }
-
-  __device__ void split_into_polynomials(Polynomial<T, params> &first,
-                                         Polynomial<T, params> &second) {
-    int tid = threadIdx.x;
-#pragma unroll
-    for (int i = 0; i < params::opt; i++) {
-      first.coefficients[tid] = m_data[tid];
-      second.coefficients[tid] = m_data[tid + params::degree];
-      tid = tid + params::degree / params::opt;
-    }
-  }
-};

 template <typename T, class params> class Polynomial {
 public:
   T *coefficients;
@@ -104,18 +43,6 @@ public:
   __device__ Polynomial(char *memory, uint32_t degree)
       : coefficients((T *)memory), degree(degree) {}

-  __host__ Polynomial(DeviceMemory &dmem, uint32_t degree, int device)
-      : degree(degree) {
-    dmem.get_allocation(&this->coefficients, params::degree, device);
-  }
-
-  __host__ Polynomial(DeviceMemory &dmem, T *source, uint32_t degree,
-                      int device)
-      : degree(degree) {
-    dmem.get_allocation_and_copy_async(&this->coefficients, source,
-                                       params::degree, device);
-  }
-
   __host__ void copy_to_host(T *dest) {
     cudaMemcpyAsync(dest, this->coefficients, sizeof(T) * params::degree,
                     cudaMemcpyDeviceToHost);
@@ -402,22 +329,6 @@ public:
                cudaMemcpyHostToDevice);
   }

-  __host__ Vector(DeviceMemory &dmem, T *source, uint32_t size_source,
-                  int device)
-      : m_size(size_source) {
-    dmem.get_allocation_and_copy_async(&m_data, source, m_size, device);
-  }
-
-  __host__ Vector(DeviceMemory &dmem, T *source, uint32_t allocation_size,
-                  uint32_t copy_size, int device)
-      : m_size(allocation_size) {
-    if (copy_size > allocation_size) {
-      printf("warning: copying more than allocation");
-    }
-    dmem.get_allocation_and_copy_async(&m_data, source, m_size, copy_size,
-                                       device);
-  }
-
   __host__ void copy_to_host(T *dest) {
     cudaMemcpyAsync(dest, m_data, sizeof(T) * m_size, cudaMemcpyDeviceToHost);
   }

View File

@@ -1,77 +0,0 @@
-#ifndef CNCRT_SHMEM_H
-#define CNCRT_SHMEM_H
-
-#include "helper_cuda.h"
-#include <atomic>
-#include <iostream>
-#include <mutex>
-#include <thread>
-#include <tuple>
-#include <vector>
-
-class DeviceMemory {
-public:
-  std::vector<std::tuple<void *, int>> m_allocated;
-  std::mutex m_allocation_mtx;
-  std::atomic<uint32_t> m_total_devices;
-
-  DeviceMemory() : m_total_devices(1) {}
-
-  __host__ void set_device(int device) {
-    if (device > m_total_devices)
-      m_total_devices = device + 1;
-  }
-
-  template <typename T>
-  __host__ void get_allocation(T **ptr, int elements, int device) {
-    T *res;
-    cudaMalloc((void **)&res, sizeof(T) * elements);
-    *ptr = res;
-    std::lock_guard<std::mutex> lock(m_allocation_mtx);
-    m_allocated.push_back(std::make_tuple(res, device));
-  }
-
-  template <typename T>
-  __host__ void get_allocation_and_copy_async(T **ptr, T *src, int elements,
-                                              int device) {
-    T *res;
-    cudaMalloc((void **)&res, sizeof(T) * elements);
-    cudaMemcpyAsync(res, src, sizeof(T) * elements, cudaMemcpyHostToDevice);
-    *ptr = res;
-    std::lock_guard<std::mutex> lock(m_allocation_mtx);
-    m_allocated.push_back(std::make_tuple(res, device));
-  }
-
-  template <typename T>
-  __host__ void get_allocation_and_copy_async(T **ptr, T *src, int allocation,
-                                              int elements, int device) {
-    T *res;
-    cudaMalloc((void **)&res, sizeof(T) * allocation);
-    cudaMemcpyAsync(res, src, sizeof(T) * elements, cudaMemcpyHostToDevice);
-    *ptr = res;
-    std::lock_guard<std::mutex> lock(m_allocation_mtx);
-    m_allocated.push_back(std::make_tuple(res, device));
-  }
-
-  void free_all_from_device(int device) {
-    cudaSetDevice(device);
-    for (auto elem : m_allocated) {
-      auto dev = std::get<1>(elem);
-      if (dev == device) {
-        auto mem = std::get<0>(elem);
-        checkCudaErrors(cudaFree(mem));
-      }
-    }
-  }
-
-  __host__ ~DeviceMemory() {
-    for (auto elem : m_allocated) {
-      auto dev = std::get<1>(elem);
-      auto mem = std::get<0>(elem);
-      cudaSetDevice(dev);
-      checkCudaErrors(cudaFree(mem));
-    }
-  }
-};
-
-#endif // CNCRT_SHMEM_H
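The deleted DeviceMemory pool allocated with synchronous cudaMalloc (an implicit device-wide synchronization point) and only released memory in its destructor. It is superseded by the stream-ordered helpers used throughout this commit. Their implementations are not shown in the diff; the following is only a plausible sketch on CUDA 11.2+, pairing with the cuda_drop_async sketch earlier:

#include <cstdint>
#include <cuda_runtime.h>

// Hypothetical counterpart to the cuda_malloc_async calls in this commit.
void *cuda_malloc_async(size_t size, cudaStream_t *stream, uint32_t gpu_index) {
  cudaSetDevice(gpu_index);
  void *ptr = nullptr;
  cudaMallocAsync(&ptr, size, *stream); // allocation is ordered on the stream
  return ptr;
}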

View File

@@ -1,6 +1,7 @@
 #ifndef CNCRT_TIMER_H
 #define CNCRT_TIMER_H

 #include <iostream>

+#define synchronize_threads_in_block() __syncthreads()

 template <bool active> class CudaMeasureExecution {

View File

@@ -1,7 +1,6 @@
 #ifndef VERTICAL_PACKING_H
 #define VERTICAL_PACKING_H
-#include "../include/helper_cuda.h"
 #include "bootstrap.h"
 #include "complex/operations.cuh"
 #include "crypto/gadget.cuh"
@@ -11,11 +10,11 @@
#include "fft/bnsmfft.cuh"
#include "fft/smfft.cuh"
#include "fft/twiddles.cuh"
#include "helper_cuda.h"
#include "polynomial/functions.cuh"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial.cuh"
#include "polynomial/polynomial_math.cuh"
#include "utils/memory.cuh"
#include "utils/timer.cuh"
template <class params> __device__ void fft(double2 *output) {
@@ -266,11 +265,13 @@ __global__ void device_batch_cmux(Torus *glwe_array_out, Torus *glwe_array_in,
  * - tau: The quantity of CMUX trees that should be executed
  */
 template <typename Torus, typename STorus, class params>
-void host_cmux_tree(void *v_stream, uint32_t gpu_index, Torus *glwe_array_out,
-                    Torus *ggsw_in, Torus *lut_vector, uint32_t glwe_dimension,
-                    uint32_t polynomial_size, uint32_t base_log,
-                    uint32_t level_count, uint32_t r, uint32_t tau,
-                    uint32_t max_shared_memory) {
+__host__ void host_cmux_tree(void *v_stream, uint32_t gpu_index,
+                             Torus *glwe_array_out, Torus *ggsw_in,
+                             Torus *lut_vector, uint32_t glwe_dimension,
+                             uint32_t polynomial_size, uint32_t base_log,
+                             uint32_t level_count, uint32_t r, uint32_t tau,
+                             uint32_t max_shared_memory) {
+  cudaSetDevice(gpu_index);
   auto stream = static_cast<cudaStream_t *>(v_stream);
   int num_lut = (1 << r);
@@ -278,12 +279,9 @@ void host_cmux_tree(void *v_stream, uint32_t gpu_index, Torus *glwe_array_out,
     // Simply copy the LUTs
     add_padding_to_lut_async<Torus, params>(glwe_array_out, lut_vector,
                                             glwe_dimension, tau, stream);
-    checkCudaErrors(cudaStreamSynchronize(*stream));
     return;
   }
-
-  cuda_initialize_twiddles(polynomial_size, 0);

   int memory_needed_per_block =
       sizeof(Torus) * polynomial_size + // glwe_sub_mask
       sizeof(Torus) * polynomial_size + // glwe_sub_body
@@ -365,11 +363,6 @@ void host_cmux_tree(void *v_stream, uint32_t gpu_index, Torus *glwe_array_out,
         glwe_array_out + i * glwe_size, output + i * num_lut * glwe_size,
         glwe_size * sizeof(Torus), cudaMemcpyDeviceToDevice, *stream));

-  // We only need synchronization to assert that data is in glwe_array_out
-  // before returning. Memory release can be added to the stream and processed
-  // later.
-  checkCudaErrors(cudaStreamSynchronize(*stream));
   // Free memory
   cuda_drop_async(d_ggsw_fft_in, stream, gpu_index);
   cuda_drop_async(d_buffer1, stream, gpu_index);
@@ -466,12 +459,13 @@ __global__ void device_blind_rotation_and_sample_extraction(
 }

 template <typename Torus, typename STorus, class params>
-void host_blind_rotate_and_sample_extraction(
+__host__ void host_blind_rotate_and_sample_extraction(
     void *v_stream, uint32_t gpu_index, Torus *lwe_out, Torus *ggsw_in,
     Torus *lut_vector, uint32_t mbr_size, uint32_t tau, uint32_t glwe_dimension,
     uint32_t polynomial_size, uint32_t base_log, uint32_t l_gadget,
     uint32_t max_shared_memory) {
+  cudaSetDevice(gpu_index);
   assert(glwe_dimension ==
          1); // For larger k we will need to adjust the mask size
   auto stream = static_cast<cudaStream_t *>(v_stream);

View File

@@ -3,12 +3,11 @@
#include "cooperative_groups.h"
#include "../include/helper_cuda.h"
#include "bit_extraction.cuh"
#include "bootstrap.h"
#include "circuit_bootstrap.cuh"
#include "helper_cuda.h"
#include "utils/kernel_dimensions.cuh"
#include "utils/memory.cuh"
#include "utils/timer.cuh"
#include "vertical_packing.cuh"
@@ -40,6 +39,7 @@ __host__ void host_circuit_bootstrap_vertical_packing(
     uint32_t level_count_cbs, uint32_t number_of_inputs, uint32_t tau,
     uint32_t max_shared_memory) {
+  cudaSetDevice(gpu_index);
   auto stream = static_cast<cudaStream_t *>(v_stream);

   // allocate and initialize device pointers for circuit bootstrap
@@ -140,6 +140,7 @@ __host__ void host_wop_pbs(
     uint32_t number_of_bits_to_extract, uint32_t number_of_inputs,
     uint32_t max_shared_memory) {
+  cudaSetDevice(gpu_index);
   auto stream = static_cast<cudaStream_t *>(v_stream);

   // let mut h_lut_vector_indexes = vec![0 as u32; 1];