Compare commits

...

1 Commit

Author        SHA1        Message                                      Date
Agnes Leroy   651ff8fd4e  chore(gpu): change active gpu count logic    2025-07-16 18:24:01 +02:00
5 changed files with 30 additions and 30 deletions

View File

@@ -6,6 +6,7 @@
 extern std::mutex m;
 extern bool p2p_enabled;
+extern const int THRESHOLD_MULTI_GPU;
 extern "C" {
 int32_t cuda_setup_multi_gpu(int device_0_id);

View File

@@ -308,8 +308,9 @@ template <typename Torus> struct int_radix_lut {
     for (uint i = 0; i < active_gpu_count; i++) {
       cuda_set_device(i);
       int8_t *gpu_pbs_buffer;
-      auto num_blocks_on_gpu =
-          get_num_inputs_on_gpu(num_radix_blocks, i, active_gpu_count);
+      auto num_blocks_on_gpu = std::max(
+          THRESHOLD_MULTI_GPU,
+          get_num_inputs_on_gpu(num_radix_blocks, i, active_gpu_count));
       uint64_t size = 0;
       execute_scratch_pbs<Torus>(
@@ -524,8 +525,9 @@ template <typename Torus> struct int_radix_lut {
     for (uint i = 0; i < active_gpu_count; i++) {
       cuda_set_device(i);
       int8_t *gpu_pbs_buffer;
-      auto num_blocks_on_gpu =
-          get_num_inputs_on_gpu(num_radix_blocks, i, active_gpu_count);
+      auto num_blocks_on_gpu = std::max(
+          THRESHOLD_MULTI_GPU,
+          get_num_inputs_on_gpu(num_radix_blocks, i, active_gpu_count));
       uint64_t size = 0;
       execute_scratch_pbs<Torus>(
@@ -830,8 +832,9 @@ template <typename InputTorus> struct int_noise_squashing_lut {
     cuda_synchronize_stream(streams[0], gpu_indexes[0]);
     for (uint i = 0; i < active_gpu_count; i++) {
       cuda_set_device(i);
-      auto num_radix_blocks_on_gpu =
-          get_num_inputs_on_gpu(num_radix_blocks, i, active_gpu_count);
+      auto num_radix_blocks_on_gpu = std::max(
+          THRESHOLD_MULTI_GPU,
+          get_num_inputs_on_gpu(num_radix_blocks, i, active_gpu_count));
       int8_t *gpu_pbs_buffer;
       uint64_t size = 0;
       execute_scratch_pbs_128(streams[i], gpu_indexes[i], &gpu_pbs_buffer,
@@ -4957,7 +4960,6 @@ template <typename Torus> struct int_div_rem_memory {
   // sub streams
   cudaStream_t *sub_streams_1;
   cudaStream_t *sub_streams_2;
-  cudaStream_t *sub_streams_3;
   // temporary device buffers
   CudaRadixCiphertextFFI *positive_numerator;
@@ -4973,7 +4975,7 @@ template <typename Torus> struct int_div_rem_memory {
                     bool allocate_gpu_memory, uint64_t &size_tracker) {
     gpu_memory_allocated = allocate_gpu_memory;
-    this->active_gpu_count = get_active_gpu_count(2 * num_blocks, gpu_count);
+    this->active_gpu_count = get_active_gpu_count(num_blocks, gpu_count);
     this->params = params;
     this->is_signed = is_signed;
@@ -5038,16 +5040,11 @@ template <typename Torus> struct int_div_rem_memory {
         params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
     // init sub streams
-    sub_streams_1 =
-        (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
-    sub_streams_2 =
-        (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
-    sub_streams_3 =
-        (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
-    for (uint j = 0; j < active_gpu_count; j++) {
+    sub_streams_1 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
+    sub_streams_2 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
+    for (uint j = 0; j < gpu_count; j++) {
       sub_streams_1[j] = cuda_create_stream(gpu_indexes[j]);
       sub_streams_2[j] = cuda_create_stream(gpu_indexes[j]);
-      sub_streams_3[j] = cuda_create_stream(gpu_indexes[j]);
     }
     // init lookup tables
@@ -5111,14 +5108,12 @@ template <typename Torus> struct int_div_rem_memory {
     delete compare_signed_bits_lut;
     // release sub streams
-    for (uint i = 0; i < active_gpu_count; i++) {
+    for (uint i = 0; i < gpu_count; i++) {
       cuda_destroy_stream(sub_streams_1[i], gpu_indexes[i]);
       cuda_destroy_stream(sub_streams_2[i], gpu_indexes[i]);
-      cuda_destroy_stream(sub_streams_3[i], gpu_indexes[i]);
     }
     free(sub_streams_1);
     free(sub_streams_2);
-    free(sub_streams_3);
     // delete temporary buffers
     delete positive_numerator;
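Note on the sizing changes in the three scratch loops above: each per-GPU PBS scratch buffer is now sized for at least THRESHOLD_MULTI_GPU radix blocks, even when the split across active GPUs would hand that GPU fewer. A minimal sketch of the sizing rule, assuming an even-split get_num_inputs_on_gpu helper (the real helper's distribution may differ):

#include <algorithm>

const int THRESHOLD_MULTI_GPU = 12;

// Assumed even split with the remainder going to the first GPUs; illustrative only.
int get_num_inputs_on_gpu(int total_inputs, int gpu_index, int gpu_count) {
  int base = total_inputs / gpu_count;
  int remainder = total_inputs % gpu_count;
  return base + (gpu_index < remainder ? 1 : 0);
}

// Sizing rule used in the scratch loops of this patch: never size a per-GPU
// buffer for fewer than THRESHOLD_MULTI_GPU blocks.
int num_blocks_on_gpu(int num_radix_blocks, int gpu_index, int active_gpu_count) {
  return std::max(THRESHOLD_MULTI_GPU,
                  get_num_inputs_on_gpu(num_radix_blocks, gpu_index,
                                        active_gpu_count));
}

For example, 30 radix blocks split over 3 active GPUs give an even share of 10, but every scratch buffer is still sized for 12 blocks.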

View File

@@ -530,11 +530,13 @@ __host__ void host_integer_div_rem_kb(
   }
   host_integer_abs_kb<Torus>(
-      int_mem_ptr->sub_streams_1, gpu_indexes, gpu_count, positive_numerator,
-      bsks, ksks, ms_noise_reduction_key, int_mem_ptr->abs_mem_1, true);
-  host_integer_abs_kb<Torus>(
-      int_mem_ptr->sub_streams_2, gpu_indexes, gpu_count, positive_divisor,
-      bsks, ksks, ms_noise_reduction_key, int_mem_ptr->abs_mem_2, true);
+      int_mem_ptr->sub_streams_1, gpu_indexes, int_mem_ptr->active_gpu_count,
+      positive_numerator, bsks, ksks, ms_noise_reduction_key,
+      int_mem_ptr->abs_mem_1, true);
+  host_integer_abs_kb<Torus>(int_mem_ptr->sub_streams_2, gpu_indexes,
+                             int_mem_ptr->active_gpu_count, positive_divisor,
+                             bsks, ksks, ms_noise_reduction_key,
+                             int_mem_ptr->abs_mem_2, true);
   for (uint j = 0; j < int_mem_ptr->active_gpu_count; j++) {
     cuda_synchronize_stream(int_mem_ptr->sub_streams_1[j], gpu_indexes[j]);
     cuda_synchronize_stream(int_mem_ptr->sub_streams_2[j], gpu_indexes[j]);
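For context, the hunk above keeps the two absolute-value computations on separate sub-stream sets and then joins them per active GPU. A simplified sketch of that fork/join pattern, written against the plain CUDA runtime API rather than the repository's cuda_* wrappers, with the kernel work itself omitted:

#include <cuda_runtime.h>
#include <vector>

// Create one stream per active GPU for each of the two branches, fork the work,
// then join both branches on every active GPU before continuing.
void fork_join_two_branches(const std::vector<int> &gpu_indexes,
                            int active_gpu_count) {
  std::vector<cudaStream_t> sub_streams_1(active_gpu_count);
  std::vector<cudaStream_t> sub_streams_2(active_gpu_count);
  for (int j = 0; j < active_gpu_count; j++) {
    cudaSetDevice(gpu_indexes[j]);
    cudaStreamCreate(&sub_streams_1[j]);
    cudaStreamCreate(&sub_streams_2[j]);
  }
  // Fork: branch 1 (e.g. |numerator|) would be enqueued on sub_streams_1,
  //       branch 2 (e.g. |divisor|) on sub_streams_2, on each active GPU.
  // Join: wait for both branches on every active GPU.
  for (int j = 0; j < active_gpu_count; j++) {
    cudaSetDevice(gpu_indexes[j]);
    cudaStreamSynchronize(sub_streams_1[j]);
    cudaStreamSynchronize(sub_streams_2[j]);
    cudaStreamDestroy(sub_streams_1[j]);
    cudaStreamDestroy(sub_streams_2[j]);
  }
}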

View File

@@ -5,6 +5,7 @@
 std::mutex m;
 bool p2p_enabled = false;
+const int THRESHOLD_MULTI_GPU = 12;
 // Enable bidirectional p2p access between all available GPUs and device_0_id
 int32_t cuda_setup_multi_gpu(int device_0_id) {
@@ -39,10 +40,9 @@ int32_t cuda_setup_multi_gpu(int device_0_id) {
 }
 int get_active_gpu_count(int num_inputs, int gpu_count) {
-  int active_gpu_count = gpu_count;
-  if (gpu_count > num_inputs) {
-    active_gpu_count = num_inputs;
-  }
+  int ceil_div_inputs =
+      std::max(1, (num_inputs + THRESHOLD_MULTI_GPU - 1) / THRESHOLD_MULTI_GPU);
+  int active_gpu_count = std::min(ceil_div_inputs, gpu_count);
   return active_gpu_count;
 }
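The rewritten get_active_gpu_count above no longer clamps the GPU count to the number of inputs; it activates one GPU per chunk of THRESHOLD_MULTI_GPU inputs, capped by the number of available GPUs. A standalone sketch with illustrative values (the sample counts are not taken from the repository):

#include <algorithm>
#include <cstdio>

const int THRESHOLD_MULTI_GPU = 12;

int get_active_gpu_count(int num_inputs, int gpu_count) {
  // One GPU per chunk of THRESHOLD_MULTI_GPU inputs (ceiling division), at least one.
  int ceil_div_inputs =
      std::max(1, (num_inputs + THRESHOLD_MULTI_GPU - 1) / THRESHOLD_MULTI_GPU);
  return std::min(ceil_div_inputs, gpu_count);
}

int main() {
  // With 4 GPUs available: 8 inputs -> 1 GPU, 30 inputs -> 3 GPUs, 128 inputs -> 4 GPUs.
  printf("%d %d %d\n", get_active_gpu_count(8, 4), get_active_gpu_count(30, 4),
         get_active_gpu_count(128, 4));
  return 0;
}

Under the previous logic, 8 inputs on a 4-GPU machine would have been spread over all 4 GPUs; with the threshold of 12 they now stay on a single GPU.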

View File

@@ -51,7 +51,8 @@ void multi_gpu_alloc_lwe_async(cudaStream_t const *streams,
   dest.resize(gpu_count);
   for (uint i = 0; i < gpu_count; i++) {
     uint64_t size_tracker_on_gpu_i = 0;
-    auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count);
+    auto inputs_on_gpu = std::max(
+        THRESHOLD_MULTI_GPU, get_num_inputs_on_gpu(num_inputs, i, gpu_count));
     Torus *d_array = (Torus *)cuda_malloc_with_size_tracking_async(
         inputs_on_gpu * lwe_size * sizeof(Torus), streams[i], gpu_indexes[i],
         size_tracker_on_gpu_i, allocate_gpu_memory);
@@ -80,7 +81,8 @@ void multi_gpu_alloc_lwe_many_lut_output_async(
   dest.resize(gpu_count);
   for (uint i = 0; i < gpu_count; i++) {
     uint64_t size_tracker = 0;
-    auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count);
+    auto inputs_on_gpu = std::max(
+        THRESHOLD_MULTI_GPU, get_num_inputs_on_gpu(num_inputs, i, gpu_count));
     Torus *d_array = (Torus *)cuda_malloc_with_size_tracking_async(
         num_many_lut * inputs_on_gpu * lwe_size * sizeof(Torus), streams[i],
         gpu_indexes[i], size_tracker, allocate_gpu_memory);
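One consequence of the allocation floor above, shown with illustrative numbers (a 64-bit Torus and lwe_size = 2049, i.e. an LWE dimension of 2048 plus the body; neither value comes from this patch): a GPU whose even share would be 10 ciphertexts still gets a buffer sized for 12.

#include <algorithm>
#include <cstdint>
#include <cstdio>

const int THRESHOLD_MULTI_GPU = 12;

int main() {
  int even_share = 10; // hypothetical result of get_num_inputs_on_gpu
  int lwe_size = 2049; // illustrative LWE size
  int inputs_on_gpu = std::max(THRESHOLD_MULTI_GPU, even_share);
  unsigned long long bytes =
      (unsigned long long)inputs_on_gpu * lwe_size * sizeof(std::uint64_t);
  // Prints: per-GPU LWE buffer: 196704 bytes (sized for 12 inputs)
  printf("per-GPU LWE buffer: %llu bytes (sized for %d inputs)\n", bytes,
         inputs_on_gpu);
  return 0;
}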