Mirror of https://github.com/zama-ai/tfhe-rs.git, synced 2026-01-10 07:08:03 -05:00
chore(gpu): encapsulate cudaSetDevice
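The change is mechanical: every raw `cudaSetDevice(...)` call, whether bare or wrapped in `check_cuda_error(...)`, is replaced by the new `cuda_set_device` helper, declared in the backend header (first hunk below) and defined alongside the other `cuda_*` utilities, so device selection and its error handling live in a single place. For orientation, here is a minimal, self-contained sketch of that pattern; the `check_cuda_error` macro below is only a stand-in for the repository's own error-handling macro (its abort-on-failure behaviour is taken from the diff context), and `cuda_malloc_sketch` is a hypothetical caller, not a function from the codebase.

// Minimal sketch of the encapsulation this commit applies (not the exact
// tfhe-rs sources). check_cuda_error is an assumed stand-in for the repo's
// macro; cuda_malloc_sketch is a hypothetical caller used for illustration.
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

#define check_cuda_error(ans)                                                  \
  do {                                                                         \
    cudaError_t code = (ans);                                                  \
    if (code != cudaSuccess) {                                                 \
      std::fprintf(stderr, "Cuda error %d: %s (%s:%d)\n",                      \
                   static_cast<int>(code), cudaGetErrorString(code), __FILE__, \
                   __LINE__);                                                  \
      std::abort();                                                            \
    }                                                                          \
  } while (0)

// The encapsulated device switch: one definition, checked once.
void cuda_set_device(uint32_t gpu_index) {
  check_cuda_error(cudaSetDevice(gpu_index));
}

// Example call site after the refactor: callers select the device through the
// helper instead of touching cudaSetDevice directly.
void *cuda_malloc_sketch(uint64_t size, uint32_t gpu_index) {
  cuda_set_device(gpu_index);
  void *ptr = nullptr;
  check_cuda_error(cudaMalloc(&ptr, size));
  return ptr;
}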
@@ -27,6 +27,8 @@ inline void cuda_error(cudaError_t code, const char *file, int line) {
 std::abort(); \
 }

+void cuda_set_device(uint32_t gpu_index);
+
 cudaEvent_t cuda_create_event(uint32_t gpu_index);

 void cuda_event_record(cudaEvent_t event, cudaStream_t stream,

@@ -189,7 +189,7 @@ template <typename Torus> struct int_radix_lut {
 active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
 cuda_synchronize_stream(streams[0], gpu_indexes[0]);
 for (uint i = 0; i < active_gpu_count; i++) {
-cudaSetDevice(i);
+cuda_set_device(i);
 int8_t *gpu_pbs_buffer;
 auto num_blocks_on_gpu =
 get_num_inputs_on_gpu(num_radix_blocks, i, active_gpu_count);
@@ -384,7 +384,7 @@ template <typename Torus> struct int_radix_lut {
 active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
 cuda_synchronize_stream(streams[0], gpu_indexes[0]);
 for (uint i = 0; i < active_gpu_count; i++) {
-cudaSetDevice(i);
+cuda_set_device(i);
 int8_t *gpu_pbs_buffer;
 auto num_blocks_on_gpu =
 get_num_inputs_on_gpu(num_radix_blocks, i, active_gpu_count);

@@ -114,7 +114,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
 uint32_t polynomial_size, uint32_t level_count,
 uint32_t input_lwe_ciphertext_count, uint32_t lwe_chunk_size,
 PBS_VARIANT pbs_variant, bool allocate_gpu_memory) {
-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);

 this->pbs_variant = pbs_variant;
 this->lwe_chunk_size = lwe_chunk_size;

@@ -77,7 +77,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
 uint32_t polynomial_size, uint32_t level_count,
 uint32_t input_lwe_ciphertext_count, PBS_VARIANT pbs_variant,
 bool allocate_gpu_memory) {
-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);
 this->pbs_variant = pbs_variant;

 auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);

@@ -11,7 +11,7 @@ void cuda_convert_lwe_ciphertext_vector_to_gpu(cudaStream_t stream,
 uint32_t gpu_index, T *dest,
 T *src, uint32_t number_of_cts,
 uint32_t lwe_dimension) {
-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);
 uint64_t size = number_of_cts * (lwe_dimension + 1) * sizeof(T);
 cuda_memcpy_async_to_gpu(dest, src, size, stream, gpu_index);
 }
@@ -21,7 +21,7 @@ void cuda_convert_lwe_ciphertext_vector_to_cpu(cudaStream_t stream,
 uint32_t gpu_index, T *dest,
 T *src, uint32_t number_of_cts,
 uint32_t lwe_dimension) {
-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);
 uint64_t size = number_of_cts * (lwe_dimension + 1) * sizeof(T);
 cuda_memcpy_async_to_cpu(dest, src, size, stream, gpu_index);
 }
@@ -55,7 +55,7 @@ __host__ void host_sample_extract(cudaStream_t stream, uint32_t gpu_index,
 Torus const *glwe_array_in,
 uint32_t const *nth_array, uint32_t num_nths,
 uint32_t glwe_dimension) {
-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);

 dim3 grid(num_nths);
 dim3 thds(params::degree / params::opt);

@@ -261,7 +261,7 @@ __host__ void host_fast_packing_keyswitch_lwe_list_to_glwe(

 // Optimization of packing keyswitch when packing many LWEs

-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);
 check_cuda_error(cudaGetLastError());

 int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size;

@@ -57,7 +57,7 @@ void batch_fft_ggsw_vector(cudaStream_t *streams, uint32_t *gpu_indexes,
 if (gpu_count != 1)
 PANIC("GPU error (batch_fft_ggsw_vector): multi-GPU execution is not "
 "supported yet.")
-cudaSetDevice(gpu_indexes[0]);
+cuda_set_device(gpu_indexes[0]);

 int shared_memory_size = sizeof(double) * polynomial_size;


@@ -105,7 +105,7 @@ __host__ void host_keyswitch_lwe_ciphertext_vector(
 uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count,
 uint32_t num_samples) {

-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);

 constexpr int num_threads_y = 32;
 int num_blocks, num_threads_x;
@@ -160,7 +160,7 @@ __host__ void scratch_packing_keyswitch_lwe_list_to_glwe(
 cudaStream_t stream, uint32_t gpu_index, int8_t **fp_ks_buffer,
 uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
 uint32_t num_lwes, bool allocate_gpu_memory) {
-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);

 int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size;


@@ -110,7 +110,7 @@ template <typename Torus>
 __host__ void host_modulus_switch_inplace(cudaStream_t stream,
 uint32_t gpu_index, Torus *array,
 int size, uint32_t log_modulus) {
-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);

 int num_threads = 0, num_blocks = 0;
 getNumBlocksAndThreads(size, 1024, num_blocks, num_threads);

@@ -2,8 +2,12 @@
 #include <cstdint>
 #include <cuda_runtime.h>

-cudaEvent_t cuda_create_event(uint32_t gpu_index) {
+void cuda_set_device(uint32_t gpu_index) {
 check_cuda_error(cudaSetDevice(gpu_index));
+}
+
+cudaEvent_t cuda_create_event(uint32_t gpu_index) {
+cuda_set_device(gpu_index);
 cudaEvent_t event;
 check_cuda_error(cudaEventCreate(&event));
 return event;
@@ -11,24 +15,24 @@ cudaEvent_t cuda_create_event(uint32_t gpu_index) {

 void cuda_event_record(cudaEvent_t event, cudaStream_t stream,
 uint32_t gpu_index) {
-check_cuda_error(cudaSetDevice(gpu_index));
+cuda_set_device(gpu_index);
 check_cuda_error(cudaEventRecord(event, stream));
 }

 void cuda_stream_wait_event(cudaStream_t stream, cudaEvent_t event,
 uint32_t gpu_index) {
-check_cuda_error(cudaSetDevice(gpu_index));
+cuda_set_device(gpu_index);
 check_cuda_error(cudaStreamWaitEvent(stream, event, 0));
 }

 void cuda_event_destroy(cudaEvent_t event, uint32_t gpu_index) {
-check_cuda_error(cudaSetDevice(gpu_index));
+cuda_set_device(gpu_index);
 check_cuda_error(cudaEventDestroy(event));
 }

 /// Unsafe function to create a CUDA stream, must check first that GPU exists
 cudaStream_t cuda_create_stream(uint32_t gpu_index) {
-check_cuda_error(cudaSetDevice(gpu_index));
+cuda_set_device(gpu_index);
 cudaStream_t stream;
 check_cuda_error(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
 return stream;
@@ -36,12 +40,12 @@ cudaStream_t cuda_create_stream(uint32_t gpu_index) {

 /// Unsafe function to destroy CUDA stream, must check first the GPU exists
 void cuda_destroy_stream(cudaStream_t stream, uint32_t gpu_index) {
-check_cuda_error(cudaSetDevice(gpu_index));
+cuda_set_device(gpu_index);
 check_cuda_error(cudaStreamDestroy(stream));
 }

 void cuda_synchronize_stream(cudaStream_t stream, uint32_t gpu_index) {
-check_cuda_error(cudaSetDevice(gpu_index));
+cuda_set_device(gpu_index);
 check_cuda_error(cudaStreamSynchronize(stream));
 }

@@ -59,7 +63,7 @@ uint32_t cuda_is_available() { return cudaSetDevice(0) == cudaSuccess; }
 /// or if there's not enough memory. A safe wrapper around it must call
 /// cuda_check_valid_malloc() first
 void *cuda_malloc(uint64_t size, uint32_t gpu_index) {
-check_cuda_error(cudaSetDevice(gpu_index));
+cuda_set_device(gpu_index);
 void *ptr;
 check_cuda_error(cudaMalloc((void **)&ptr, size));

@@ -70,7 +74,7 @@ void *cuda_malloc(uint64_t size, uint32_t gpu_index) {
 /// asynchronously.
 void *cuda_malloc_async(uint64_t size, cudaStream_t stream,
 uint32_t gpu_index) {
-check_cuda_error(cudaSetDevice(gpu_index));
+cuda_set_device(gpu_index);
 void *ptr;

 #ifndef CUDART_VERSION
@@ -93,7 +97,7 @@ void *cuda_malloc_async(uint64_t size, cudaStream_t stream,

 /// Check that allocation is valid
 void cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index) {
-check_cuda_error(cudaSetDevice(gpu_index));
+cuda_set_device(gpu_index);
 size_t total_mem, free_mem;
 check_cuda_error(cudaMemGetInfo(&free_mem, &total_mem));
 if (size > free_mem) {
@@ -141,7 +145,7 @@ void cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
 PANIC("Cuda error: invalid device pointer in async copy to GPU.")
 }

-check_cuda_error(cudaSetDevice(gpu_index));
+cuda_set_device(gpu_index);
 check_cuda_error(
 cudaMemcpyAsync(dest, src, size, cudaMemcpyHostToDevice, stream));
 }
@@ -161,7 +165,7 @@ void cuda_memcpy_async_gpu_to_gpu(void *dest, void const *src, uint64_t size,
 if (attr_src.type != cudaMemoryTypeDevice) {
 PANIC("Cuda error: invalid src device pointer in copy from GPU to GPU.")
 }
-check_cuda_error(cudaSetDevice(gpu_index));
+cuda_set_device(gpu_index);
 if (attr_src.device == attr_dest.device) {
 check_cuda_error(
 cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToDevice, stream));
@@ -186,7 +190,7 @@ void cuda_memcpy_gpu_to_gpu(void *dest, void *src, uint64_t size,
 if (attr_src.type != cudaMemoryTypeDevice) {
 PANIC("Cuda error: invalid src device pointer in copy from GPU to GPU.")
 }
-check_cuda_error(cudaSetDevice(gpu_index));
+cuda_set_device(gpu_index);
 if (attr_src.device == attr_dest.device) {
 check_cuda_error(cudaMemcpy(dest, src, size, cudaMemcpyDeviceToDevice));
 } else {
@@ -197,7 +201,7 @@ void cuda_memcpy_gpu_to_gpu(void *dest, void *src, uint64_t size,

 /// Synchronizes device
 void cuda_synchronize_device(uint32_t gpu_index) {
-check_cuda_error(cudaSetDevice(gpu_index));
+cuda_set_device(gpu_index);
 check_cuda_error(cudaDeviceSynchronize());
 }

@@ -210,7 +214,7 @@ void cuda_memset_async(void *dest, uint64_t val, uint64_t size,
 if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
 PANIC("Cuda error: invalid dest device pointer in cuda memset.")
 }
-check_cuda_error(cudaSetDevice(gpu_index));
+cuda_set_device(gpu_index);
 check_cuda_error(cudaMemsetAsync(dest, val, size, stream));
 }

@@ -230,7 +234,7 @@ void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
 if (attr.type != cudaMemoryTypeDevice) {
 PANIC("Cuda error: invalid dest device pointer in cuda set value.")
 }
-check_cuda_error(cudaSetDevice(gpu_index));
+cuda_set_device(gpu_index);
 int block_size = 256;
 int num_blocks = (n + block_size - 1) / block_size;

@@ -260,7 +264,7 @@ void cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
 PANIC("Cuda error: invalid src device pointer in copy to CPU async.")
 }

-check_cuda_error(cudaSetDevice(gpu_index));
+cuda_set_device(gpu_index);
 check_cuda_error(
 cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToHost, stream));
 }
@@ -274,14 +278,14 @@ int cuda_get_number_of_gpus() {

 /// Drop a cuda array
 void cuda_drop(void *ptr, uint32_t gpu_index) {
-check_cuda_error(cudaSetDevice(gpu_index));
+cuda_set_device(gpu_index);
 check_cuda_error(cudaFree(ptr));
 }

 /// Drop a cuda array asynchronously, if supported on the device
 void cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index) {

-check_cuda_error(cudaSetDevice(gpu_index));
+cuda_set_device(gpu_index);
 #ifndef CUDART_VERSION
 #error CUDART_VERSION Undefined!
 #elif (CUDART_VERSION >= 11020)

@@ -12,7 +12,7 @@ __host__ void zero_out_if(cudaStream_t const *streams,
 int_zero_out_if_buffer<Torus> *mem_ptr,
 int_radix_lut<Torus> *predicate, void *const *bsks,
 Torus *const *ksks, uint32_t num_radix_blocks) {
-cudaSetDevice(gpu_indexes[0]);
+cuda_set_device(gpu_indexes[0]);
 auto params = mem_ptr->params;

 // We can't use integer_radix_apply_bivariate_lookup_table_kb since the

@@ -38,7 +38,7 @@ __host__ void accumulate_all_blocks(cudaStream_t stream, uint32_t gpu_index,
 uint32_t lwe_dimension,
 uint32_t num_radix_blocks) {

-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);
 int num_blocks = 0, num_threads = 0;
 int num_entries = (lwe_dimension + 1);
 getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);

@@ -50,7 +50,7 @@ __host__ void host_pack(cudaStream_t stream, uint32_t gpu_index,
 if (array_in == array_out)
 PANIC("Cuda error: Input and output must be different");

-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);
 auto compression_params = mem_ptr->compression_params;

 auto log_modulus = mem_ptr->storage_log_modulus;
@@ -185,7 +185,7 @@ __host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,
 if (array_in == glwe_array_out)
 PANIC("Cuda error: Input and output must be different");

-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);

 auto compression_params = mem_ptr->compression_params;


@@ -78,7 +78,7 @@ host_radix_blocks_rotate_right(cudaStream_t const *streams,
 PANIC("Cuda error (blocks_rotate_right): the source and destination "
 "pointers should be different");
 }
-cudaSetDevice(gpu_indexes[0]);
+cuda_set_device(gpu_indexes[0]);
 radix_blocks_rotate_right<Torus><<<blocks_count, 1024, 0, streams[0]>>>(
 dst, src, value, blocks_count, lwe_size);
 check_cuda_error(cudaGetLastError());
@@ -96,7 +96,7 @@ host_radix_blocks_rotate_left(cudaStream_t const *streams,
 PANIC("Cuda error (blocks_rotate_left): the source and destination "
 "pointers should be different");
 }
-cudaSetDevice(gpu_indexes[0]);
+cuda_set_device(gpu_indexes[0]);
 radix_blocks_rotate_left<Torus><<<blocks_count, 1024, 0, streams[0]>>>(
 dst, src, value, blocks_count, lwe_size);
 check_cuda_error(cudaGetLastError());
@@ -125,7 +125,7 @@ __host__ void
 host_radix_blocks_reverse_inplace(cudaStream_t const *streams,
 uint32_t const *gpu_indexes, Torus *src,
 uint32_t blocks_count, uint32_t lwe_size) {
-cudaSetDevice(gpu_indexes[0]);
+cuda_set_device(gpu_indexes[0]);
 int num_blocks = blocks_count / 2, num_threads = 1024;
 radix_blocks_reverse_lwe_inplace<Torus>
 <<<num_blocks, num_threads, 0, streams[0]>>>(src, blocks_count, lwe_size);
@@ -163,7 +163,7 @@ template <typename Torus>
 __host__ void host_radix_cumulative_sum_in_groups(
 cudaStream_t stream, uint32_t gpu_index, Torus *dest, Torus *src,
 uint32_t radix_blocks_count, uint32_t lwe_size, uint32_t group_size) {
-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);
 // Each CUDA block is responsible for a single group
 int num_blocks = (radix_blocks_count + group_size - 1) / group_size,
 num_threads = 512;
@@ -219,7 +219,7 @@ __host__ void host_radix_split_simulators_and_grouping_pgns(
 cudaStream_t stream, uint32_t gpu_index, Torus *simulators,
 Torus *grouping_pgns, Torus *src, uint32_t radix_blocks_count,
 uint32_t lwe_size, uint32_t group_size, Torus delta) {
-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);
 // Each CUDA block is responsible for a single group
 int num_blocks = radix_blocks_count, num_threads = 512;
 radix_split_simulators_and_grouping_pgns<Torus>
@@ -255,7 +255,7 @@ __host__ void host_radix_sum_in_groups(cudaStream_t stream, uint32_t gpu_index,
 Torus *dest, Torus *src1, Torus *src2,
 uint32_t radix_blocks_count,
 uint32_t lwe_size, uint32_t group_size) {
-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);

 int num_blocks = radix_blocks_count, num_threads = 512;
 radix_sum_in_groups<Torus><<<num_blocks, num_threads, 0, stream>>>(
@@ -297,7 +297,7 @@ pack_bivariate_blocks(cudaStream_t const *streams, uint32_t const *gpu_indexes,
 uint32_t lwe_dimension, uint32_t shift,
 uint32_t num_radix_blocks) {

-cudaSetDevice(gpu_indexes[0]);
+cuda_set_device(gpu_indexes[0]);
 // Left message is shifted
 int num_blocks = 0, num_threads = 0;
 int num_entries = num_radix_blocks * (lwe_dimension + 1);
@@ -341,7 +341,7 @@ __host__ void pack_bivariate_blocks_with_single_block(
 Torus const *lwe_array_1, Torus const *lwe_2, Torus const *lwe_indexes_in,
 uint32_t lwe_dimension, uint32_t shift, uint32_t num_radix_blocks) {

-cudaSetDevice(gpu_indexes[0]);
+cuda_set_device(gpu_indexes[0]);
 // Left message is shifted
 int num_blocks = 0, num_threads = 0;
 int num_entries = num_radix_blocks * (lwe_dimension + 1);
@@ -1361,7 +1361,7 @@ __host__ void pack_blocks(cudaStream_t stream, uint32_t gpu_index,
 uint32_t factor) {
 if (num_radix_blocks == 0)
 return;
-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);
 int num_blocks = 0, num_threads = 0;
 int num_entries = (lwe_dimension + 1);
 getNumBlocksAndThreads(num_entries, 1024, num_blocks, num_threads);
@@ -1392,7 +1392,7 @@ create_trivial_radix(cudaStream_t stream, uint32_t gpu_index,
 uint32_t num_scalar_blocks, Torus message_modulus,
 Torus carry_modulus) {

-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);
 size_t radix_size = (lwe_dimension + 1) * num_radix_blocks;
 cuda_memset_async(lwe_array_out, 0, radix_size * sizeof(Torus), stream,
 gpu_index);

@@ -297,7 +297,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
 ch_amount++;
 dim3 add_grid(ch_amount, num_blocks, 1);

-cudaSetDevice(gpu_indexes[0]);
+cuda_set_device(gpu_indexes[0]);
 tree_add_chunks<Torus><<<add_grid, 512, 0, streams[0]>>>(
 new_blocks, old_blocks, min(r, chunk_size), big_lwe_size, num_blocks);

@@ -541,7 +541,7 @@ __host__ void host_integer_mult_radix_kb(
 dim3 grid(lsb_vector_block_count, 1, 1);
 dim3 thds(params::degree / params::opt, 1, 1);

-cudaSetDevice(gpu_indexes[0]);
+cuda_set_device(gpu_indexes[0]);
 all_shifted_lhs_rhs<Torus, params><<<grid, thds, 0, streams[0]>>>(
 radix_lwe_left, vector_result_lsb, vector_result_msb, radix_lwe_right,
 vector_lsb_rhs, vector_msb_rhs, num_blocks);
@@ -556,7 +556,7 @@ __host__ void host_integer_mult_radix_kb(
 vector_result_msb = &block_mul_res[lsb_vector_block_count *
 (polynomial_size * glwe_dimension + 1)];

-cudaSetDevice(gpu_indexes[0]);
+cuda_set_device(gpu_indexes[0]);
 fill_radix_from_lsb_msb<Torus, params>
 <<<num_blocks * num_blocks, params::degree / params::opt, 0,
 streams[0]>>>(vector_result_sb, vector_result_lsb, vector_result_msb,

@@ -59,7 +59,7 @@ __host__ void host_integer_radix_negation(
 uint32_t gpu_count, Torus *output, Torus const *input,
 uint32_t lwe_dimension, uint32_t input_lwe_ciphertext_count,
 uint64_t message_modulus, uint64_t carry_modulus) {
-cudaSetDevice(gpu_indexes[0]);
+cuda_set_device(gpu_indexes[0]);

 // lwe_size includes the presence of the body
 // whereas lwe_dimension is the number of elements in the mask

@@ -29,7 +29,7 @@ __host__ void host_integer_radix_scalar_addition_inplace(
 uint32_t gpu_count, Torus *lwe_array, Torus const *scalar_input,
 uint32_t lwe_dimension, uint32_t input_lwe_ciphertext_count,
 uint32_t message_modulus, uint32_t carry_modulus) {
-cudaSetDevice(gpu_indexes[0]);
+cuda_set_device(gpu_indexes[0]);

 // Create a 1-dimensional grid of threads
 int num_blocks = 0, num_threads = 0;
@@ -68,7 +68,7 @@ __host__ void host_integer_radix_add_scalar_one_inplace(
 uint32_t gpu_count, Torus *lwe_array, uint32_t lwe_dimension,
 uint32_t input_lwe_ciphertext_count, uint32_t message_modulus,
 uint32_t carry_modulus) {
-cudaSetDevice(gpu_indexes[0]);
+cuda_set_device(gpu_indexes[0]);

 // Create a 1-dimensional grid of threads
 int num_blocks = 0, num_threads = 0;
@@ -108,7 +108,7 @@ __host__ void host_integer_radix_scalar_subtraction_inplace(
 uint32_t gpu_count, Torus *lwe_array, Torus *scalar_input,
 uint32_t lwe_dimension, uint32_t input_lwe_ciphertext_count,
 uint32_t message_modulus, uint32_t carry_modulus) {
-cudaSetDevice(gpu_indexes[0]);
+cuda_set_device(gpu_indexes[0]);

 // Create a 1-dimensional grid of threads
 int num_blocks = 0, num_threads = 0;

@@ -127,7 +127,7 @@ __host__ void host_integer_small_scalar_mul_radix(
 uint32_t gpu_count, T *output_lwe_array, T *input_lwe_array, T scalar,
 uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) {

-cudaSetDevice(gpu_indexes[0]);
+cuda_set_device(gpu_indexes[0]);
 // lwe_size includes the presence of the body
 // whereas lwe_dimension is the number of elements in the mask
 int lwe_size = input_lwe_dimension + 1;

@@ -51,7 +51,7 @@ __host__ void host_addition_plaintext(cudaStream_t stream, uint32_t gpu_index,
 const uint32_t lwe_dimension,
 const uint32_t lwe_ciphertext_count) {

-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);
 int num_blocks = 0, num_threads = 0;
 int num_entries = lwe_ciphertext_count;
 getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
@@ -72,7 +72,7 @@ __host__ void host_addition_plaintext_scalar(
 const T plaintext_input, const uint32_t lwe_dimension,
 const uint32_t lwe_ciphertext_count) {

-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);
 int num_blocks = 0, num_threads = 0;
 int num_entries = lwe_ciphertext_count;
 getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
@@ -106,7 +106,7 @@ __host__ void host_addition(cudaStream_t stream, uint32_t gpu_index,
 CudaRadixCiphertextFFI const *input_1,
 CudaRadixCiphertextFFI const *input_2) {

-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);
 // lwe_size includes the presence of the body
 // whereas lwe_dimension is the number of elements in the mask
 int lwe_size = output->lwe_dimension + 1;
@@ -136,7 +136,7 @@ __host__ void legacy_host_addition(cudaStream_t stream, uint32_t gpu_index,
 const uint32_t input_lwe_dimension,
 const uint32_t input_lwe_ciphertext_count) {

-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);
 // lwe_size includes the presence of the body
 // whereas lwe_dimension is the number of elements in the mask
 int lwe_size = input_lwe_dimension + 1;
@@ -172,7 +172,7 @@ __host__ void host_pack_for_overflowing_ops(cudaStream_t stream,
 uint32_t input_lwe_ciphertext_count,
 uint32_t message_modulus) {

-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);
 // lwe_size includes the presence of the body
 // whereas lwe_dimension is the number of elements in the mask
 int lwe_size = input_lwe_dimension + 1;
@@ -210,7 +210,7 @@ __host__ void host_subtraction(cudaStream_t stream, uint32_t gpu_index,
 uint32_t input_lwe_dimension,
 uint32_t input_lwe_ciphertext_count) {

-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);
 // lwe_size includes the presence of the body
 // whereas lwe_dimension is the number of elements in the mask
 int lwe_size = input_lwe_dimension + 1;
@@ -248,7 +248,7 @@ __host__ void host_subtraction_plaintext(cudaStream_t stream,
 uint32_t input_lwe_dimension,
 uint32_t input_lwe_ciphertext_count) {

-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);
 int num_blocks = 0, num_threads = 0;
 int num_entries = input_lwe_ciphertext_count;
 getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
@@ -294,7 +294,7 @@ __host__ void host_unchecked_sub_with_correcting_term(
 uint32_t input_lwe_ciphertext_count, uint32_t message_modulus,
 uint32_t carry_modulus, uint32_t degree) {

-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);
 // lwe_size includes the presence of the body
 // whereas lwe_dimension is the number of elements in the mask
 int lwe_size = input_lwe_dimension + 1;

@@ -34,7 +34,7 @@ __host__ void host_cleartext_vec_multiplication(
 T const *cleartext_input, const uint32_t input_lwe_dimension,
 const uint32_t input_lwe_ciphertext_count) {

-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);
 // lwe_size includes the presence of the body
 // whereas lwe_dimension is the number of elements in the mask
 int lwe_size = input_lwe_dimension + 1;
@@ -70,7 +70,7 @@ host_cleartext_multiplication(cudaStream_t stream, uint32_t gpu_index,
 uint32_t input_lwe_dimension,
 uint32_t input_lwe_ciphertext_count) {

-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);
 // lwe_size includes the presence of the body
 // whereas lwe_dimension is the number of elements in the mask
 int lwe_size = input_lwe_dimension + 1;

@@ -26,7 +26,7 @@ __host__ void host_negation(cudaStream_t stream, uint32_t gpu_index, T *output,
 T const *input, const uint32_t input_lwe_dimension,
 const uint32_t input_lwe_ciphertext_count) {

-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);
 // lwe_size includes the presence of the body
 // whereas lwe_dimension is the number of elements in the mask
 int lwe_size = input_lwe_dimension + 1;

@@ -96,7 +96,7 @@ void cuda_fourier_polynomial_mul(void *stream_v, uint32_t gpu_index,
 uint32_t total_polynomials) {

 auto stream = static_cast<cudaStream_t>(stream_v);
-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);
 auto input1 = (double2 *)_input1;
 auto input2 = (double2 *)_input2;
 auto output = (double2 *)_output;

@@ -78,7 +78,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
 double2 *dest, ST const *src,
 uint32_t polynomial_size,
 uint32_t total_polynomials) {
-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);
 int shared_memory_size = sizeof(double) * polynomial_size;

 // Here the buffer size is the size of double2 times the number of polynomials

@@ -312,7 +312,7 @@ __host__ void host_programmable_bootstrap_amortized(
 uint64_t DM_FULL = SM_FULL;

 int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);

 // Create a 1-dimensional grid of threads
 // where each block handles 1 sample and each thread

@@ -247,7 +247,7 @@ __host__ void host_programmable_bootstrap_cg(
 polynomial_size);

 int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);

 uint64_t full_dm = full_sm;


@@ -215,7 +215,7 @@ __host__ void scratch_cg_multi_bit_programmable_bootstrap(
 uint32_t polynomial_size, uint32_t level_count,
 uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {

-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);

 uint64_t full_sm_keybundle =
 get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<Torus>(
@@ -298,7 +298,7 @@ __host__ void execute_cg_external_product_loop(
 uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
 uint32_t level_count, uint32_t lwe_offset, uint32_t num_many_lut,
 uint32_t lut_stride) {
-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);

 uint64_t full_sm =
 get_buffer_size_full_sm_cg_multibit_programmable_bootstrap<Torus>(

@@ -373,7 +373,7 @@ __host__ void execute_step_one(
 uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm) {

 int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);
 int thds = polynomial_size / params::opt;
 dim3 grid(level_count, glwe_dimension + 1, input_lwe_ciphertext_count);

@@ -415,7 +415,7 @@ __host__ void execute_step_two(
 uint32_t num_many_lut, uint32_t lut_stride) {

 int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);
 int thds = polynomial_size / params::opt;
 dim3 grid(input_lwe_ciphertext_count, glwe_dimension + 1);

@@ -456,7 +456,7 @@ __host__ void host_programmable_bootstrap(
 uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
 uint32_t level_count, uint32_t input_lwe_ciphertext_count,
 uint32_t num_many_lut, uint32_t lut_stride) {
-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);

 // With SM each block corresponds to either the mask or body, no need to
 // duplicate data for each

@@ -450,7 +450,7 @@ uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,

 int max_blocks_per_sm;
 int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);
 if (max_shared_memory < full_sm_keybundle)
 cudaOccupancyMaxActiveBlocksPerMultiprocessor(
 &max_blocks_per_sm,

@@ -388,7 +388,7 @@ __host__ void scratch_multi_bit_programmable_bootstrap(
 uint32_t polynomial_size, uint32_t level_count,
 uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {

-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);

 int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
 uint64_t full_sm_keybundle =
@@ -496,7 +496,7 @@ __host__ void execute_compute_keybundle(
 pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
 uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
 uint32_t grouping_factor, uint32_t level_count, uint32_t lwe_offset) {
-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);

 auto lwe_chunk_size = buffer->lwe_chunk_size;
 uint32_t chunk_size =
@@ -545,7 +545,7 @@ execute_step_one(cudaStream_t stream, uint32_t gpu_index,
 uint32_t lwe_dimension, uint32_t glwe_dimension,
 uint32_t polynomial_size, uint32_t base_log,
 uint32_t level_count, uint32_t j, uint32_t lwe_offset) {
-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);

 uint64_t full_sm_accumulate_step_one =
 get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one<Torus>(
@@ -601,7 +601,7 @@ execute_step_two(cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
 uint32_t polynomial_size, int32_t grouping_factor,
 uint32_t level_count, uint32_t j, uint32_t lwe_offset,
 uint32_t num_many_lut, uint32_t lut_stride) {
-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);

 auto lwe_chunk_size = buffer->lwe_chunk_size;
 uint64_t full_sm_accumulate_step_two =

@@ -201,7 +201,7 @@ __host__ void scratch_programmable_bootstrap_tbc(
 uint32_t polynomial_size, uint32_t level_count,
 uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {

-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);

 int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
 bool supports_dsm =
@@ -264,7 +264,7 @@ __host__ void host_programmable_bootstrap_tbc(
 uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
 uint32_t level_count, uint32_t input_lwe_ciphertext_count,
 uint32_t num_many_lut, uint32_t lut_stride) {
-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);

 int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
 auto supports_dsm =

@@ -204,7 +204,7 @@ __host__ void scratch_tbc_multi_bit_programmable_bootstrap(
 pbs_buffer<uint64_t, MULTI_BIT> **buffer, uint32_t glwe_dimension,
 uint32_t polynomial_size, uint32_t level_count,
 uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);

 int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
 bool supports_dsm =
@@ -301,7 +301,7 @@ __host__ void execute_tbc_external_product_loop(
 uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
 uint32_t level_count, uint32_t lwe_offset, uint32_t num_many_lut,
 uint32_t lut_stride) {
-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);

 auto lwe_chunk_size = buffer->lwe_chunk_size;

@@ -402,7 +402,7 @@ __host__ void host_tbc_multi_bit_programmable_bootstrap(
 uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
 uint32_t base_log, uint32_t level_count, uint32_t num_samples,
 uint32_t num_many_lut, uint32_t lut_stride) {
-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);

 auto lwe_chunk_size = buffer->lwe_chunk_size;
 for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);

@@ -21,7 +21,7 @@ int32_t cuda_setup_multi_gpu() {
 check_cuda_error(
 cudaDeviceCanAccessPeer(&has_peer_access_to_device_0, i, 0));
 if (has_peer_access_to_device_0) {
-check_cuda_error(cudaSetDevice(i));
+cuda_set_device(i);
 check_cuda_error(cudaDeviceEnablePeerAccess(0, 0));
 }
 num_used_gpus += 1;

@@ -140,7 +140,7 @@ void programmable_bootstrap_multibit_setup(
 DynamicDistribution glwe_noise_distribution, int pbs_base_log,
 int pbs_level, int message_modulus, int carry_modulus, int *payload_modulus,
 uint64_t *delta, int number_of_inputs, int repetitions, int samples) {
-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);

 *payload_modulus = message_modulus * carry_modulus;
 // Value of the shift we multiply our messages by
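The multi-GPU call sites above (for example the `int_radix_lut` constructor loop and `cuda_setup_multi_gpu`) now select the target device through the same helper inside their per-GPU loops. A hedged sketch of that calling pattern follows; `launch_work_for_gpu` and `run_on_all_gpus` are hypothetical names introduced for illustration, only `cuda_set_device` comes from the change above.

#include <cstdint>

// Declared in the backend header touched by this commit and defined in the
// device utilities (see the hunks above).
void cuda_set_device(uint32_t gpu_index);

// Hypothetical placeholder for the real per-GPU work (buffer allocation,
// kernel launches, ...) performed by the call sites in the diff.
static void launch_work_for_gpu(uint32_t /*gpu_index*/) {}

// Sketch of the per-GPU loop pattern: the device is switched once per
// iteration through the checked helper, and everything issued afterwards in
// that iteration targets the selected GPU.
void run_on_all_gpus(uint32_t active_gpu_count) {
  for (uint32_t i = 0; i < active_gpu_count; i++) {
    cuda_set_device(i);
    launch_work_for_gpu(i);
  }
}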