diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
index 736aff79b..6acdfac14 100644
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
@@ -756,18 +756,20 @@ template <typename Torus> struct int_radix_lut {
       CudaStreams streams, uint64_t max_num_radix_blocks,
       uint64_t &size_tracker, bool allocate_gpu_memory) {
     // We need to create the auxiliary array only in GPU 0
-    lwe_aligned_vec.resize(active_streams.count());
-    for (uint i = 0; i < active_streams.count(); i++) {
-      uint64_t size_tracker_on_array_i = 0;
-      auto inputs_on_gpu = std::max(
-          THRESHOLD_MULTI_GPU, get_num_inputs_on_gpu(max_num_radix_blocks, i,
-                                                     active_streams.count()));
-      Torus *d_array = (Torus *)cuda_malloc_with_size_tracking_async(
-          inputs_on_gpu * (params.big_lwe_dimension + 1) * sizeof(Torus),
-          streams.stream(0), streams.gpu_index(0), size_tracker_on_array_i,
-          allocate_gpu_memory);
-      lwe_aligned_vec[i] = d_array;
-      size_tracker += size_tracker_on_array_i;
+    if (active_streams.count() > 1) {
+      lwe_aligned_vec.resize(active_streams.count());
+      for (uint i = 0; i < active_streams.count(); i++) {
+        uint64_t size_tracker_on_array_i = 0;
+        auto inputs_on_gpu = std::max(
+            THRESHOLD_MULTI_GPU, get_num_inputs_on_gpu(max_num_radix_blocks, i,
+                                                       active_streams.count()));
+        Torus *d_array = (Torus *)cuda_malloc_with_size_tracking_async(
+            inputs_on_gpu * (params.big_lwe_dimension + 1) * sizeof(Torus),
+            streams.stream(0), streams.gpu_index(0), size_tracker_on_array_i,
+            allocate_gpu_memory);
+        lwe_aligned_vec[i] = d_array;
+        size_tracker += size_tracker_on_array_i;
+      }
     }
   }
 
@@ -1632,8 +1634,19 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
         luts_message_carry = new int_radix_lut<Torus>(
             streams, params, 2, pbs_count, true, size_tracker);
         allocated_luts_message_carry = true;
+        uint64_t message_modulus_bits =
+            (uint64_t)std::log2(params.message_modulus);
+        uint64_t carry_modulus_bits = (uint64_t)std::log2(params.carry_modulus);
+        uint64_t total_bits_per_block =
+            message_modulus_bits + carry_modulus_bits;
+        uint64_t denominator =
+            (uint64_t)std::ceil((pow(2, total_bits_per_block) - 1) /
+                                (pow(2, message_modulus_bits) - 1));
+
+        uint64_t upper_bound_num_blocks =
+            max_total_blocks_in_vec * 2 / denominator;
         luts_message_carry->allocate_lwe_vector_for_non_trivial_indexes(
-            streams, this->max_total_blocks_in_vec, size_tracker, true);
+            streams, upper_bound_num_blocks, size_tracker, true);
       }
     }
     if (allocated_luts_message_carry) {
@@ -1731,9 +1744,17 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
     this->current_blocks = current_blocks;
     this->small_lwe_vector = small_lwe_vector;
     this->luts_message_carry = reused_lut;
+
+    uint64_t message_modulus_bits = (uint64_t)std::log2(params.message_modulus);
+    uint64_t carry_modulus_bits = (uint64_t)std::log2(params.carry_modulus);
+    uint64_t total_bits_per_block = message_modulus_bits + carry_modulus_bits;
+    uint64_t denominator =
+        (uint64_t)std::ceil((pow(2, total_bits_per_block) - 1) /
+                            (pow(2, message_modulus_bits) - 1));
+
+    uint64_t upper_bound_num_blocks = max_total_blocks_in_vec * 2 / denominator;
     this->luts_message_carry->allocate_lwe_vector_for_non_trivial_indexes(
-        streams, this->max_total_blocks_in_vec, size_tracker,
-        allocate_gpu_memory);
+        streams, upper_bound_num_blocks, size_tracker, allocate_gpu_memory);
 
     setup_index_buffers(streams, size_tracker);
   }
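The three hunks above share one idea: instead of sizing the auxiliary LWE array for all `max_total_blocks_in_vec` blocks, the sum now allocates only for `upper_bound_num_blocks = 2 * N / denominator`, where `denominator = ceil((2^(message_bits + carry_bits) - 1) / (2^message_bits - 1))` is the largest number of blocks that can be accumulated into one before the carry space saturates (a block holds at most `2^total_bits - 1`, each addend contributes at most `2^message_bits - 1`). A minimal Rust sketch of that arithmetic, not part of the patch; the 2_2 parameters (`message_modulus = carry_modulus = 4`) and the standalone function are illustrative assumptions:

```rust
// Sketch of the upper-bound computation from the hunks above (assumed
// standalone form; the patch computes this inline in C++).
fn upper_bound_num_blocks(
    message_modulus: u64,
    carry_modulus: u64,
    max_total_blocks_in_vec: u64,
) -> u64 {
    let message_modulus_bits = message_modulus.ilog2() as u64;
    let carry_modulus_bits = carry_modulus.ilog2() as u64;
    let total_bits_per_block = message_modulus_bits + carry_modulus_bits;
    // ceil((2^total_bits - 1) / (2^message_bits - 1)): the most blocks that
    // can be summed into one before the carry space saturates.
    let numerator = (1u64 << total_bits_per_block) - 1;
    let denominator = numerator.div_ceil((1u64 << message_modulus_bits) - 1);
    max_total_blocks_in_vec * 2 / denominator
}

fn main() {
    // With 2_2 parameters (assumed): denominator = ceil(15 / 3) = 5, so a
    // 40-block sum allocates for 2 * 40 / 5 = 16 non-trivial blocks, not 40.
    assert_eq!(upper_bound_num_blocks(4, 4, 40), 16);
}
```

The factor of 2 reflects that each reduced group yields a message block and a carry block, so one reduction pass maps N blocks to at most 2N/denominator.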
diff --git a/tfhe-benchmark/src/utilities.rs b/tfhe-benchmark/src/utilities.rs
index 614e789ec..a8f9c8590 100644
--- a/tfhe-benchmark/src/utilities.rs
+++ b/tfhe-benchmark/src/utilities.rs
@@ -421,23 +421,32 @@ pub fn throughput_num_threads(num_block: usize, op_pbs_count: u64) -> u64 {
     let block_multiplicator = (ref_block_count as f64 / num_block as f64).ceil().min(1.0);
     // Some operations with a high serial workload (e.g. division) would yield an operation
     // loading value so low that the number of elements in the end wouldn't be meaningful.
-    let minimum_loading = if num_block < 64 { 0.2 } else { 0.01 };
+    let minimum_loading = if num_block < 64 { 1.0 } else { 0.015 };
 
     #[cfg(feature = "gpu")]
     {
         let num_sms_per_gpu = get_number_of_sms();
         let total_num_sm = num_sms_per_gpu * get_number_of_gpus();
-        let total_blocks_per_sm = 4u32; // Assume each SM can handle 4 blocks concurrently
-        let total_num_sm = total_blocks_per_sm * total_num_sm;
+        let total_blocks_per_sm = 4u64; // Assume each SM can handle 4 blocks concurrently
         let min_num_waves = 4u64; //Enforce at least 4 waves in the GPU
-        let elements_per_wave = total_num_sm as u64 / (num_block as u64);
-
+        let block_factor = ((2.0f64 * num_block as f64) / 4.0f64).ceil() as u64;
+        let elements_per_wave = total_blocks_per_sm * total_num_sm as u64 / block_factor;
+        // We need to enable the new load for pbs benches and for sizes larger than 16 blocks
+        // in demanding operations; for the rest of the operations we maintain a minimum of
+        // 200 elements.
+        let min_elements = if op_pbs_count == 1
+            || (op_pbs_count > (num_block * num_block) as u64 && num_block >= 16)
+        {
+            elements_per_wave * min_num_waves
+        } else {
+            200u64
+        };
         let operation_loading = ((total_num_sm as u64 / op_pbs_count) as f64).max(minimum_loading);
         let elements = (total_num_sm as f64 * block_multiplicator * operation_loading) as u64;
-        elements.min(elements_per_wave * min_num_waves) // This threshold is useful for operation
-                                                        // with both a small number of
-                                                        // block and low PBs count.
+        elements.min(min_elements) // This threshold is useful for operations
+                                   // with both a small number of blocks
+                                   // and a low PBS count.
     }
     #[cfg(feature = "hpu")]
     {
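On the benchmark side, the unconditional `elements_per_wave * min_num_waves` cap is replaced by `min_elements`: the wave-based cap now applies only to raw PBS benches (`op_pbs_count == 1`) and to PBS-heavy operations on at least 16 blocks, while everything else keeps a fixed 200-element cap. A standalone sketch of that selection, not part of the patch; the 132-SM figure (one H100) and the sample inputs are assumptions:

```rust
// Mirrors the new `min_elements` selection in `throughput_num_threads`
// (extracted into a free function for illustration).
fn min_elements(num_block: u64, op_pbs_count: u64, total_num_sm: u64) -> u64 {
    let total_blocks_per_sm = 4u64; // assume each SM runs 4 CUDA blocks
    let min_num_waves = 4u64; // enforce at least 4 waves in the GPU
    // CUDA blocks needed per element: roughly 2 * num_block / 4.
    let block_factor = ((2.0f64 * num_block as f64) / 4.0f64).ceil() as u64;
    let elements_per_wave = total_blocks_per_sm * total_num_sm / block_factor;
    if op_pbs_count == 1 || (op_pbs_count > num_block * num_block && num_block >= 16) {
        elements_per_wave * min_num_waves
    } else {
        200u64
    }
}

fn main() {
    let total_num_sm = 132u64; // single H100 (assumed)
    // Raw PBS bench: wave-based cap, 4 * 132 / 1 * 4 = 2112 elements.
    assert_eq!(min_elements(1, 1, total_num_sm), 2112);
    // PBS-heavy op on 32 blocks (2048 > 32 * 32): wave-based cap, 33 * 4.
    assert_eq!(min_elements(32, 2048, total_num_sm), 132);
    // 8-block op with moderate PBS count: fixed 200-element cap.
    assert_eq!(min_elements(8, 64, total_num_sm), 200);
}
```

Together with the raised `minimum_loading` values, this keeps throughput runs saturating the GPU for PBS-bound workloads without inflating the element count for small, serial operations.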