Mirror of https://github.com/zama-ai/tfhe-rs.git, synced 2026-01-11 07:38:08 -05:00

Compare commits: 1 commit on mz/factori...al/debug_m

| Author | SHA1 | Date |
|---|---|---|
|  | 651ff8fd4e |  |
@@ -6,6 +6,7 @@
 
 extern std::mutex m;
 extern bool p2p_enabled;
+extern const int THRESHOLD_MULTI_GPU;
 
 extern "C" {
 int32_t cuda_setup_multi_gpu(int device_0_id);
@@ -308,8 +308,9 @@ template <typename Torus> struct int_radix_lut {
     for (uint i = 0; i < active_gpu_count; i++) {
       cuda_set_device(i);
       int8_t *gpu_pbs_buffer;
-      auto num_blocks_on_gpu =
-          get_num_inputs_on_gpu(num_radix_blocks, i, active_gpu_count);
+      auto num_blocks_on_gpu = std::max(
+          THRESHOLD_MULTI_GPU,
+          get_num_inputs_on_gpu(num_radix_blocks, i, active_gpu_count));
 
       uint64_t size = 0;
       execute_scratch_pbs<Torus>(
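Both `int_radix_lut` scratch-allocation sites (here and at line 524 below) now clamp the per-GPU block count from below, so each GPU's PBS scratch buffer is sized for at least `THRESHOLD_MULTI_GPU` blocks even when fewer are assigned to it. A minimal standalone sketch of the sizing change, using a hypothetical even-split `get_num_inputs_on_gpu` in place of the backend's helper:

```cpp
#include <algorithm>
#include <cstdio>

const int THRESHOLD_MULTI_GPU = 12; // value introduced by this commit

// Hypothetical stand-in for the backend's helper: splits num_inputs evenly
// across gpu_count GPUs, giving the remainder to the lowest-indexed GPUs.
int get_num_inputs_on_gpu(int num_inputs, int gpu_index, int gpu_count) {
  return num_inputs / gpu_count + (gpu_index < num_inputs % gpu_count ? 1 : 0);
}

int main() {
  int num_radix_blocks = 8, active_gpu_count = 2;
  for (int i = 0; i < active_gpu_count; i++) {
    int assigned = get_num_inputs_on_gpu(num_radix_blocks, i, active_gpu_count);
    int sized_for = std::max(THRESHOLD_MULTI_GPU, assigned);
    // Prints: gpu 0: assigned 4, buffer sized for 12 (likewise for gpu 1)
    printf("gpu %d: assigned %d, buffer sized for %d\n", i, assigned, sized_for);
  }
}
```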
@@ -524,8 +525,9 @@ template <typename Torus> struct int_radix_lut {
     for (uint i = 0; i < active_gpu_count; i++) {
      cuda_set_device(i);
      int8_t *gpu_pbs_buffer;
-      auto num_blocks_on_gpu =
-          get_num_inputs_on_gpu(num_radix_blocks, i, active_gpu_count);
+      auto num_blocks_on_gpu = std::max(
+          THRESHOLD_MULTI_GPU,
+          get_num_inputs_on_gpu(num_radix_blocks, i, active_gpu_count));
 
       uint64_t size = 0;
       execute_scratch_pbs<Torus>(
@@ -830,8 +832,9 @@ template <typename InputTorus> struct int_noise_squashing_lut {
     cuda_synchronize_stream(streams[0], gpu_indexes[0]);
     for (uint i = 0; i < active_gpu_count; i++) {
       cuda_set_device(i);
-      auto num_radix_blocks_on_gpu =
-          get_num_inputs_on_gpu(num_radix_blocks, i, active_gpu_count);
+      auto num_radix_blocks_on_gpu = std::max(
+          THRESHOLD_MULTI_GPU,
+          get_num_inputs_on_gpu(num_radix_blocks, i, active_gpu_count));
       int8_t *gpu_pbs_buffer;
       uint64_t size = 0;
       execute_scratch_pbs_128(streams[i], gpu_indexes[i], &gpu_pbs_buffer,
@@ -4957,7 +4960,6 @@ template <typename Torus> struct int_div_rem_memory {
   // sub streams
   cudaStream_t *sub_streams_1;
   cudaStream_t *sub_streams_2;
-  cudaStream_t *sub_streams_3;
 
   // temporary device buffers
   CudaRadixCiphertextFFI *positive_numerator;
@@ -4973,7 +4975,7 @@ template <typename Torus> struct int_div_rem_memory {
       bool allocate_gpu_memory, uint64_t &size_tracker) {
 
     gpu_memory_allocated = allocate_gpu_memory;
-    this->active_gpu_count = get_active_gpu_count(2 * num_blocks, gpu_count);
+    this->active_gpu_count = get_active_gpu_count(num_blocks, gpu_count);
     this->params = params;
     this->is_signed = is_signed;
 
@@ -5038,16 +5040,11 @@ template <typename Torus> struct int_div_rem_memory {
         params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
 
     // init sub streams
-    sub_streams_1 =
-        (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
-    sub_streams_2 =
-        (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
-    sub_streams_3 =
-        (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
-    for (uint j = 0; j < active_gpu_count; j++) {
+    sub_streams_1 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
+    sub_streams_2 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
+    for (uint j = 0; j < gpu_count; j++) {
       sub_streams_1[j] = cuda_create_stream(gpu_indexes[j]);
       sub_streams_2[j] = cuda_create_stream(gpu_indexes[j]);
-      sub_streams_3[j] = cuda_create_stream(gpu_indexes[j]);
     }
 
     // init lookup tables
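The constructor now creates sub-streams for all `gpu_count` GPUs rather than `active_gpu_count`, and the destructor (next hunk) is changed in lockstep, presumably because `active_gpu_count` is now workload-derived and no longer a stable bound for the stream set's lifetime; the unused `sub_streams_3` set is dropped entirely. A condensed sketch of the create/destroy pairing this keeps consistent, written against the raw CUDA runtime API rather than the backend's `cuda_create_stream`/`cuda_destroy_stream` wrappers:

```cpp
#include <cuda_runtime.h>
#include <vector>

// Streams are created over one fixed set of devices and destroyed over
// exactly the same set, each on its own device.
struct SubStreams {
  std::vector<cudaStream_t> streams;
  std::vector<int> devices;

  explicit SubStreams(const std::vector<int> &gpu_indexes) {
    for (int dev : gpu_indexes) {
      cudaSetDevice(dev);
      cudaStream_t s;
      cudaStreamCreate(&s);
      streams.push_back(s);
      devices.push_back(dev);
    }
  }

  ~SubStreams() {
    // Destroy only what was created; a mismatch between the creation and
    // destruction bounds would leak streams or free invalid handles.
    for (size_t j = 0; j < streams.size(); j++) {
      cudaSetDevice(devices[j]);
      cudaStreamDestroy(streams[j]);
    }
  }
};

int main() {
  SubStreams subs({0}); // single-GPU example
}
```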
@@ -5111,14 +5108,12 @@ template <typename Torus> struct int_div_rem_memory {
     delete compare_signed_bits_lut;
 
     // release sub streams
-    for (uint i = 0; i < active_gpu_count; i++) {
+    for (uint i = 0; i < gpu_count; i++) {
       cuda_destroy_stream(sub_streams_1[i], gpu_indexes[i]);
       cuda_destroy_stream(sub_streams_2[i], gpu_indexes[i]);
-      cuda_destroy_stream(sub_streams_3[i], gpu_indexes[i]);
     }
     free(sub_streams_1);
     free(sub_streams_2);
-    free(sub_streams_3);
 
     // delete temporary buffers
     delete positive_numerator;
@@ -530,11 +530,13 @@ __host__ void host_integer_div_rem_kb(
   }
 
   host_integer_abs_kb<Torus>(
-      int_mem_ptr->sub_streams_1, gpu_indexes, gpu_count, positive_numerator,
-      bsks, ksks, ms_noise_reduction_key, int_mem_ptr->abs_mem_1, true);
-  host_integer_abs_kb<Torus>(
-      int_mem_ptr->sub_streams_2, gpu_indexes, gpu_count, positive_divisor,
-      bsks, ksks, ms_noise_reduction_key, int_mem_ptr->abs_mem_2, true);
+      int_mem_ptr->sub_streams_1, gpu_indexes, int_mem_ptr->active_gpu_count,
+      positive_numerator, bsks, ksks, ms_noise_reduction_key,
+      int_mem_ptr->abs_mem_1, true);
+  host_integer_abs_kb<Torus>(int_mem_ptr->sub_streams_2, gpu_indexes,
+                             int_mem_ptr->active_gpu_count, positive_divisor,
+                             bsks, ksks, ms_noise_reduction_key,
+                             int_mem_ptr->abs_mem_2, true);
   for (uint j = 0; j < int_mem_ptr->active_gpu_count; j++) {
     cuda_synchronize_stream(int_mem_ptr->sub_streams_1[j], gpu_indexes[j]);
     cuda_synchronize_stream(int_mem_ptr->sub_streams_2[j], gpu_indexes[j]);
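The two `host_integer_abs_kb` calls run on separate sub-stream sets (`sub_streams_1` and `sub_streams_2`) so the absolute values of the numerator and divisor are computed concurrently, and they now receive `int_mem_ptr->active_gpu_count` instead of the full `gpu_count`; the loop that follows joins both stream sets before the division continues. A minimal fork-join illustration of that pattern with plain CUDA streams (the kernels below are placeholders, not library code; builds with nvcc):

```cpp
#include <cuda_runtime.h>

__global__ void work_a(float *x) { x[threadIdx.x] *= 2.0f; }
__global__ void work_b(float *y) { y[threadIdx.x] += 1.0f; }

int main() {
  float *a, *b;
  cudaMalloc(&a, 32 * sizeof(float));
  cudaMalloc(&b, 32 * sizeof(float));

  cudaStream_t s1, s2;
  cudaStreamCreate(&s1);
  cudaStreamCreate(&s2);

  // Fork: two independent computations, each on its own stream, like the
  // numerator/divisor abs calls on sub_streams_1 and sub_streams_2.
  work_a<<<1, 32, 0, s1>>>(a);
  work_b<<<1, 32, 0, s2>>>(b);

  // Join: wait on both streams before dependent work starts, mirroring the
  // cuda_synchronize_stream loop over active_gpu_count above.
  cudaStreamSynchronize(s1);
  cudaStreamSynchronize(s2);

  cudaStreamDestroy(s1);
  cudaStreamDestroy(s2);
  cudaFree(a);
  cudaFree(b);
}
```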
@@ -5,6 +5,7 @@
 
 std::mutex m;
 bool p2p_enabled = false;
+const int THRESHOLD_MULTI_GPU = 12;
 
 // Enable bidirectional p2p access between all available GPUs and device_0_id
 int32_t cuda_setup_multi_gpu(int device_0_id) {
@@ -39,10 +40,9 @@ int32_t cuda_setup_multi_gpu(int device_0_id) {
 }
 
 int get_active_gpu_count(int num_inputs, int gpu_count) {
-  int active_gpu_count = gpu_count;
-  if (gpu_count > num_inputs) {
-    active_gpu_count = num_inputs;
-  }
+  int ceil_div_inputs =
+      std::max(1, (num_inputs + THRESHOLD_MULTI_GPU - 1) / THRESHOLD_MULTI_GPU);
+  int active_gpu_count = std::min(ceil_div_inputs, gpu_count);
   return active_gpu_count;
 }
 
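`get_active_gpu_count` previously used every available GPU as soon as there was at least one input per GPU; it now brings in one additional GPU per `THRESHOLD_MULTI_GPU` (12) inputs, capped by `gpu_count`. This pairs with the `int_div_rem_memory` change above, which now sizes its GPU set from `num_blocks` rather than `2 * num_blocks`. A worked comparison of the two policies, transcribed from the removed and added lines:

```cpp
#include <algorithm>
#include <cstdio>

const int THRESHOLD_MULTI_GPU = 12;

// Old policy: as many GPUs as inputs, up to gpu_count.
int old_policy(int num_inputs, int gpu_count) {
  return std::min(gpu_count, num_inputs);
}

// New policy: one GPU per THRESHOLD_MULTI_GPU inputs, up to gpu_count.
int new_policy(int num_inputs, int gpu_count) {
  int ceil_div_inputs =
      std::max(1, (num_inputs + THRESHOLD_MULTI_GPU - 1) / THRESHOLD_MULTI_GPU);
  return std::min(ceil_div_inputs, gpu_count);
}

int main() {
  // With 8 GPUs available:
  // num_inputs  old  new
  //          4    4    1   (small workloads now stay on one GPU)
  //         12    8    1
  //         13    8    2
  //         40    8    4
  //        100    8    8   (large workloads still use every GPU)
  for (int n : {4, 12, 13, 40, 100})
    printf("%3d inputs -> old %d, new %d\n", n, old_policy(n, 8),
           new_policy(n, 8));
}
```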
@@ -51,7 +51,8 @@ void multi_gpu_alloc_lwe_async(cudaStream_t const *streams,
   dest.resize(gpu_count);
   for (uint i = 0; i < gpu_count; i++) {
     uint64_t size_tracker_on_gpu_i = 0;
-    auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count);
+    auto inputs_on_gpu = std::max(
+        THRESHOLD_MULTI_GPU, get_num_inputs_on_gpu(num_inputs, i, gpu_count));
     Torus *d_array = (Torus *)cuda_malloc_with_size_tracking_async(
         inputs_on_gpu * lwe_size * sizeof(Torus), streams[i], gpu_indexes[i],
         size_tracker_on_gpu_i, allocate_gpu_memory);
@@ -80,7 +81,8 @@ void multi_gpu_alloc_lwe_many_lut_output_async(
   dest.resize(gpu_count);
   for (uint i = 0; i < gpu_count; i++) {
     uint64_t size_tracker = 0;
-    auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count);
+    auto inputs_on_gpu = std::max(
+        THRESHOLD_MULTI_GPU, get_num_inputs_on_gpu(num_inputs, i, gpu_count));
     Torus *d_array = (Torus *)cuda_malloc_with_size_tracking_async(
         num_many_lut * inputs_on_gpu * lwe_size * sizeof(Torus), streams[i],
         gpu_indexes[i], size_tracker, allocate_gpu_memory);
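As with the scratch buffers in `int_radix_lut`, both multi-GPU LWE allocation helpers now reserve room for at least `THRESHOLD_MULTI_GPU` inputs per GPU, so the tracked allocation size grows for small workloads. A rough worked example of that effect with hypothetical sizes (the real `lwe_size` depends on the parameter set; neither value below is taken from the library):

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>

const int THRESHOLD_MULTI_GPU = 12;

int main() {
  // Hypothetical numbers: 4 inputs assigned to one GPU, lwe_size of 2048
  // 64-bit torus elements.
  int inputs_assigned = 4;
  uint64_t lwe_size = 2048;

  uint64_t before = inputs_assigned * lwe_size * sizeof(uint64_t);
  uint64_t after = (uint64_t)std::max(THRESHOLD_MULTI_GPU, inputs_assigned) *
                   lwe_size * sizeof(uint64_t);
  // Prints 65536 bytes before vs 196608 bytes after: the clamp
  // over-provisions GPUs that hold fewer than 12 inputs.
  printf("before: %llu bytes, after: %llu bytes\n",
         (unsigned long long)before, (unsigned long long)after);
}
```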