From 8282824338740aeadc8a8a6a465b3edacf04c722 Mon Sep 17 00:00:00 2001
From: Pedro Alves
Date: Mon, 21 Jul 2025 12:20:10 -0300
Subject: [PATCH] chore(gpu): remove h_lut_indexes from int_radix_lut

- That pointer is misleading and unnecessary
---
 .../cuda/include/integer/integer_utilities.h | 81 +++++++++++--------
 .../cuda/src/integer/comparison.cuh          | 11 +--
 .../cuda/src/integer/integer.cuh             | 35 ++++++--
 3 files changed, 81 insertions(+), 46 deletions(-)

diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
index 237b13a21..d65da137b 100644
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
@@ -298,7 +298,7 @@ template <typename Torus> struct int_radix_lut {
   // done at the moment
   std::vector<Torus *> lut_vec;
   std::vector<Torus *> lut_indexes_vec;
-  Torus *h_lut_indexes;
+
   // All tmp lwe arrays and index arrays for lwe contain the total
   // amount of blocks to be computed on, there is no split between GPUs
   // for the moment
@@ -441,7 +441,6 @@ template <typename Torus> struct int_radix_lut {
     create_zero_radix_ciphertext_async<Torus>(
         streams[0], gpu_indexes[0], tmp_lwe_before_ks, num_radix_blocks,
         params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
-    h_lut_indexes = (Torus *)(calloc(num_radix_blocks, sizeof(Torus)));
     degrees = (uint64_t *)malloc(num_luts * sizeof(uint64_t));
     max_degrees = (uint64_t *)malloc(num_luts * sizeof(uint64_t));
   }
@@ -533,7 +532,6 @@ template <typename Torus> struct int_radix_lut {
         streams[0], gpu_indexes[0], allocate_gpu_memory);
     memcpy(h_lwe_indexes_out, h_lwe_indexes_in,
            num_radix_blocks * sizeof(Torus));
-    h_lut_indexes = (Torus *)(calloc(num_radix_blocks, sizeof(Torus)));
     degrees = (uint64_t *)malloc(num_luts * sizeof(uint64_t));
     max_degrees = (uint64_t *)malloc(num_luts * sizeof(uint64_t));
   }
@@ -659,7 +657,6 @@ template <typename Torus> struct int_radix_lut {
     create_zero_radix_ciphertext_async<Torus>(
         streams[0], gpu_indexes[0], tmp_lwe_before_ks, num_radix_blocks,
         params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
-    h_lut_indexes = (Torus *)(calloc(num_radix_blocks, sizeof(Torus)));
     degrees = (uint64_t *)malloc(num_many_lut * num_luts * sizeof(uint64_t));
     max_degrees = (uint64_t *)malloc(num_luts * sizeof(uint64_t));
   }
@@ -682,11 +679,17 @@ template <typename Torus> struct int_radix_lut {
   // Return a pointer to idx-ith max degree
   uint64_t *get_max_degree(size_t idx) { return &max_degrees[idx]; }
 
-  // Return a pointer to idx-ith lut indexes at gpu_index's global memory
-  Torus *get_lut_indexes(uint32_t gpu_index, size_t ind) {
+  /* Return a pointer to the ind-th lut index in the target GPU's memory
+   *
+   * gpu_index_in_lut_array is the index of the target GPU within
+   * lut_indexes_vec. This MUST NOT be confused with the device ID.
+   */
+  Torus *get_lut_indexes(uint32_t gpu_index_in_lut_array, size_t ind) {
     if (!gpu_memory_allocated)
       return nullptr;
-    auto lut_indexes = lut_indexes_vec[gpu_index];
+    if (gpu_index_in_lut_array >= lut_indexes_vec.size())
+      PANIC("Cuda error: invalid lut_indexes index")
+    auto lut_indexes = lut_indexes_vec[gpu_index_in_lut_array];
     return &lut_indexes[ind];
   }
@@ -794,7 +797,6 @@ template <typename Torus> struct int_radix_lut {
       lwe_after_pbs_vec.clear();
       lwe_trivial_indexes_vec.clear();
     }
-    free(h_lut_indexes);
     free(degrees);
     free(max_degrees);
   }
@@ -1036,16 +1038,18 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
      * we have bits_per_blocks LUTs that should be used for all bits in all
      * blocks
      */
-    Torus *h_lut_indexes = lut->h_lut_indexes;
+    auto h_lut_indexes =
+        (Torus *)malloc(bits_per_block * num_radix_blocks * sizeof(Torus));
     for (int j = 0; j < num_radix_blocks; j++) {
       for (int i = 0; i < bits_per_block; i++)
         h_lut_indexes[i + j * bits_per_block] = i;
     }
     cuda_memcpy_with_size_tracking_async_to_gpu(
         lut->get_lut_indexes(0, 0), h_lut_indexes,
-        num_radix_blocks * bits_per_block * sizeof(Torus), streams[0],
+        bits_per_block * num_radix_blocks * sizeof(Torus), streams[0],
         gpu_indexes[0], allocate_gpu_memory);
     lut->broadcast_lut(streams, gpu_indexes);
+    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
 
     /**
      * the input indexes should take the first bits_per_block PBS to target
@@ -1073,6 +1077,7 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
                              h_lwe_indexes_out);
     cuda_synchronize_stream(streams[0], gpu_indexes[0]);
 
+    free(h_lut_indexes);
     free(h_lwe_indexes_in);
     free(h_lwe_indexes_out);
   }
@@ -1953,7 +1958,7 @@ template <typename Torus> struct int_shifted_blocks_and_states_memory {
 
     // Generate the indexes to switch between luts within the pbs
     uint64_t lut_indexes_size = num_radix_blocks * sizeof(Torus);
-    Torus *h_lut_indexes = luts_array_first_step->h_lut_indexes;
+    auto h_lut_indexes = static_cast<Torus *>(malloc(lut_indexes_size));
     for (int index = 0; index < num_radix_blocks; index++) {
       uint32_t grouping_index = index / grouping_size;
       bool is_in_first_grouping = (grouping_index == 0);
@@ -1980,7 +1985,10 @@ template <typename Torus> struct int_shifted_blocks_and_states_memory {
 
     // Do I need to do something else for the multi-gpu?
     luts_array_first_step->broadcast_lut(streams, gpu_indexes);
-  };
+
+    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+    free(h_lut_indexes);
+  }
 
   void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
                uint32_t gpu_count) {
@@ -2434,6 +2442,9 @@ template <typename Torus> struct int_sc_prop_memory {
       lut_overflow_flag_prep->broadcast_lut(streams, gpu_indexes);
     }
 
+    auto h_lut_indexes =
+        static_cast<Torus *>(calloc((num_radix_blocks + 1), sizeof(Torus)));
+
     // For the final cleanup in case of overflow or carry (it seems that I can)
     // It seems that this lut could be apply together with the other one but for
     // now we won't do it
@@ -2461,14 +2472,8 @@
         polynomial_size, message_modulus, carry_modulus, f_overflow_last,
         gpu_memory_allocated);
 
-    Torus *h_lut_indexes = lut_message_extract->h_lut_indexes;
-    for (int index = 0; index < num_radix_blocks + 1; index++) {
-      if (index < num_radix_blocks) {
-        h_lut_indexes[index] = 0;
-      } else {
-        h_lut_indexes[index] = 1;
-      }
-    }
+    h_lut_indexes[num_radix_blocks] = 1;
+
     cuda_memcpy_with_size_tracking_async_to_gpu(
         lut_message_extract->get_lut_indexes(0, 0), h_lut_indexes,
         (num_radix_blocks + 1) * sizeof(Torus), streams[0], gpu_indexes[0],
@@ -2487,7 +2492,6 @@
         polynomial_size, message_modulus, carry_modulus, f_carry_last,
         gpu_memory_allocated);
 
-    Torus *h_lut_indexes = lut_message_extract->h_lut_indexes;
     for (int index = 0; index < num_radix_blocks + 1; index++) {
       if (index < num_radix_blocks) {
         h_lut_indexes[index] = 0;
@@ -2501,6 +2505,8 @@
           allocate_gpu_memory);
     }
     lut_message_extract->broadcast_lut(streams, gpu_indexes);
+    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+    free(h_lut_indexes);
   };
 
   void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -2670,7 +2676,7 @@ template <typename Torus> struct int_shifted_blocks_and_borrow_states_memory {
 
     // Generate the indexes to switch between luts within the pbs
     uint64_t lut_indexes_size = num_radix_blocks * sizeof(Torus);
-    Torus *h_lut_indexes = luts_array_first_step->h_lut_indexes;
+    auto h_lut_indexes = static_cast<Torus *>(malloc(lut_indexes_size));
 
     for (int index = 0; index < num_radix_blocks; index++) {
       uint32_t grouping_index = index / grouping_size;
@@ -2690,13 +2696,15 @@
       }
     }
     // copy the indexes to the gpu
-    Torus *lut_indexes = luts_array_first_step->get_lut_indexes(0, 0);
+    Torus *d_lut_indexes = luts_array_first_step->get_lut_indexes(0, 0);
     cuda_memcpy_with_size_tracking_async_to_gpu(
-        lut_indexes, h_lut_indexes, lut_indexes_size, streams[0],
+        d_lut_indexes, h_lut_indexes, lut_indexes_size, streams[0],
         gpu_indexes[0], allocate_gpu_memory);
 
     // Do I need to do something else for the multi-gpu?
     luts_array_first_step->broadcast_lut(streams, gpu_indexes);
+    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+    free(h_lut_indexes);
   };
 
   // needed for the division to update the lut indexes
@@ -3551,14 +3559,13 @@ template <typename Torus> struct int_cmux_buffer {
         message_extract_lut->get_max_degree(0), params.glwe_dimension,
         params.polynomial_size, params.message_modulus, params.carry_modulus,
         message_extract_lut_f, gpu_memory_allocated);
-    Torus *h_lut_indexes = predicate_lut->h_lut_indexes;
-    for (int index = 0; index < 2 * num_radix_blocks; index++) {
-      if (index < num_radix_blocks) {
-        h_lut_indexes[index] = 0;
-      } else {
-        h_lut_indexes[index] = 1;
-      }
+
+    auto h_lut_indexes =
+        static_cast<Torus *>(calloc(2 * num_radix_blocks, sizeof(Torus)));
+    for (int index = num_radix_blocks; index < 2 * num_radix_blocks; index++) {
+      h_lut_indexes[index] = 1;
     }
+
     cuda_memcpy_with_size_tracking_async_to_gpu(
         predicate_lut->get_lut_indexes(0, 0), h_lut_indexes,
         2 * num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0],
@@ -3566,6 +3573,9 @@
 
     predicate_lut->broadcast_lut(streams, gpu_indexes);
     message_extract_lut->broadcast_lut(streams, gpu_indexes);
+
+    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+    free(h_lut_indexes);
   }
 
   void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -3599,6 +3609,7 @@ template <typename Torus> struct int_are_all_block_true_buffer {
   // of interest in are_all_block_true(), as with max_value (the maximum message
   // value).
   int_radix_lut<Torus> *is_max_value;
+  Torus *h_lut_indexes;
   bool gpu_memory_allocated;
 
   int_are_all_block_true_buffer(cudaStream_t const *streams,
@@ -3638,6 +3649,7 @@
         params.carry_modulus, is_max_value_f, gpu_memory_allocated);
 
     is_max_value->broadcast_lut(streams, gpu_indexes);
+    h_lut_indexes = static_cast<Torus *>(malloc(max_chunks * sizeof(Torus)));
   }
 
   void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -3650,6 +3662,9 @@
     delete is_max_value;
     delete tmp_out;
     delete tmp_block_accumulated;
+
+    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+    free(h_lut_indexes);
   }
 };
 
@@ -3914,7 +3929,6 @@ template <typename Torus> struct int_comparison_buffer {
 
   int_radix_params params;
 
-  //////////////////
   int_radix_lut<Torus> *identity_lut;
   std::function<Torus(Torus)> identity_lut_f;
 
@@ -4596,8 +4610,9 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
           scalars_for_overflow_sub[nb - 1], h_scalar, nb * sizeof(Torus),
          streams[0], gpu_indexes[0], allocate_gpu_memory);
     }
-    free(h_lut_indexes);
     free(h_scalar);
+    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+    free(h_lut_indexes);
   };
 
   void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
index d3dba45bc..83a2ff74b 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
@@ -85,6 +85,7 @@ __host__ void are_all_comparisons_block_true(
   uint32_t total_modulus = message_modulus * carry_modulus;
   uint32_t max_value = (total_modulus - 1) / (message_modulus - 1);
 
+  auto h_lut_indexes = are_all_block_true_buffer->h_lut_indexes;
   copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], tmp_out,
                                            0, num_radix_blocks, lwe_array_in,
                                            0, num_radix_blocks);
@@ -137,7 +138,6 @@ __host__ void are_all_comparisons_block_true(
         polynomial_size, message_modulus, carry_modulus,
         is_equal_to_num_blocks_lut_f, true);
-    Torus *h_lut_indexes = is_max_value_lut->h_lut_indexes;
     for (int index = 0; index < num_chunks; index++) {
       if (index == num_chunks - 1) {
         h_lut_indexes[index] = 1;
       }
@@ -161,12 +161,9 @@ __host__ void are_all_comparisons_block_true(
           ksks, ms_noise_reduction_key, lut, 1);
       // Reset max_value_lut_indexes before returning, otherwise if the lut is
       // reused the lut indexes will be wrong
-      memset(is_max_value_lut->h_lut_indexes, 0,
-             is_max_value_lut->num_blocks * sizeof(Torus));
-      cuda_memcpy_async_to_gpu(is_max_value_lut->get_lut_indexes(0, 0),
-                               is_max_value_lut->h_lut_indexes,
-                               is_max_value_lut->num_blocks * sizeof(Torus),
-                               streams[0], gpu_indexes[0]);
+      cuda_memset_async(is_max_value_lut->get_lut_indexes(0, 0), 0,
+                        is_max_value_lut->num_blocks * sizeof(Torus),
+                        streams[0], gpu_indexes[0]);
       is_max_value_lut->broadcast_lut(streams, gpu_indexes);
       reset_radix_ciphertext_blocks(lwe_array_out, 1);
       return;
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
index d7c61e06d..c6a616885 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -603,13 +603,20 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
       cuda_synchronize_stream(streams[i], gpu_indexes[i]);
     }
   }
+  auto h_lut_indexes =
+      static_cast<Torus *>(malloc(num_radix_blocks * sizeof(Torus)));
+  cuda_memcpy_async_to_cpu(h_lut_indexes, lut->get_lut_indexes(0, 0),
+                           num_radix_blocks * sizeof(Torus), streams[0],
+                           gpu_indexes[0]);
+  cuda_synchronize_stream(streams[0], gpu_indexes[0]);
   for (uint i = 0; i < num_radix_blocks; i++) {
-    auto degrees_index = lut->h_lut_indexes[i];
+    auto degrees_index = h_lut_indexes[i];
     lwe_array_out->degrees[i] = lut->degrees[degrees_index];
     lwe_array_out->noise_levels[i] = NoiseLevel::NOMINAL;
     CHECK_NOISE_LEVEL(lwe_array_out->noise_levels[i], params.message_modulus,
                       params.carry_modulus);
   }
+  free(h_lut_indexes);
   POP_RANGE()
 }
 
@@ -710,13 +717,20 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb(
       cuda_synchronize_stream(streams[i], gpu_indexes[i]);
     }
   }
+  auto h_lut_indexes =
+      static_cast<Torus *>(malloc(lut->num_blocks * sizeof(Torus)));
+  cuda_memcpy_async_to_cpu(h_lut_indexes, lut->get_lut_indexes(0, 0),
+                           lut->num_blocks * sizeof(Torus), streams[0],
+                           gpu_indexes[0]);
+  cuda_synchronize_stream(streams[0], gpu_indexes[0]);
   for (uint i = 0; i < lwe_array_out->num_radix_blocks; i++) {
-    auto degrees_index = lut->h_lut_indexes[i % lut->num_blocks];
+    auto degrees_index = h_lut_indexes[i % lut->num_blocks];
     lwe_array_out->degrees[i] = lut->degrees[degrees_index];
     lwe_array_out->noise_levels[i] = NoiseLevel::NOMINAL;
     CHECK_NOISE_LEVEL(lwe_array_out->noise_levels[i], params.message_modulus,
                       params.carry_modulus);
   }
+  free(h_lut_indexes);
   POP_RANGE()
 }
 
@@ -828,13 +842,20 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
       cuda_synchronize_stream(streams[i], gpu_indexes[i]);
     }
   }
+  auto h_lut_indexes =
+      static_cast<Torus *>(malloc(num_radix_blocks * sizeof(Torus)));
+  cuda_memcpy_async_to_cpu(h_lut_indexes, lut->get_lut_indexes(0, 0),
+                           num_radix_blocks * sizeof(Torus), streams[0],
+                           gpu_indexes[0]);
+  cuda_synchronize_stream(streams[0], gpu_indexes[0]);
   for (uint i = 0; i < num_radix_blocks; i++) {
-    auto degrees_index = lut->h_lut_indexes[i];
+    auto degrees_index = h_lut_indexes[i];
     lwe_array_out->degrees[i] = lut->degrees[degrees_index];
     lwe_array_out->noise_levels[i] = NoiseLevel::NOMINAL;
     CHECK_NOISE_LEVEL(lwe_array_out->noise_levels[i], params.message_modulus,
                       params.carry_modulus);
   }
+  free(h_lut_indexes);
   POP_RANGE()
 }
 
@@ -1462,8 +1483,10 @@ void host_full_propagate_inplace(
     void *const *bsks, uint32_t num_blocks) {
   auto params = mem_ptr->lut->params;
 
-  int big_lwe_size = (params.glwe_dimension * params.polynomial_size + 1);
-  int small_lwe_size = (params.small_lwe_dimension + 1);
+  Torus degrees_index;
+  cuda_memcpy_async_to_cpu(&degrees_index, mem_ptr->lut->get_lut_indexes(0, 0),
+                           sizeof(Torus), streams[0], gpu_indexes[0]);
+  cuda_synchronize_stream(streams[0], gpu_indexes[0]);
 
   // In the case of extracting a single LWE this parameters are dummy
   uint32_t num_many_lut = 1;
@@ -1496,7 +1519,7 @@ void host_full_propagate_inplace(
     copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
                                              &cur_input_block, 0, 1,
                                              mem_ptr->tmp_big_lwe_vector, 0, 1);
-    auto degrees_index = mem_ptr->lut->h_lut_indexes[0];
+
     input_blocks->degrees[i] = mem_ptr->lut->degrees[degrees_index];
     input_blocks->noise_levels[i] = NoiseLevel::NOMINAL;
     CHECK_NOISE_LEVEL(input_blocks->noise_levels[i], params.message_modulus,
                       params.carry_modulus);
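
Note (not part of the patch): after this change, every call site follows the
same staging pattern. It builds the LUT indexes in a temporary host buffer,
issues the asynchronous host-to-device copy into the lut's device-side index
array, broadcasts to the other GPUs, synchronizes the stream, and only then
frees the host buffer, because the async copy may still be reading from it.
Below is a minimal sketch of that pattern using only calls that appear in this
patch; the wrapper name stage_lut_indexes and its standalone framing are
hypothetical, and the declarations are assumed to come from
integer_utilities.h.

  // Sketch: stage LUT indexes from a temporary host buffer to the GPU.
  template <typename Torus>
  void stage_lut_indexes(cudaStream_t const *streams,
                         uint32_t const *gpu_indexes,
                         int_radix_lut<Torus> *lut, uint32_t num_blocks,
                         bool allocate_gpu_memory) {
    // Temporary host staging buffer; it no longer lives inside int_radix_lut.
    auto h_lut_indexes =
        static_cast<Torus *>(calloc(num_blocks, sizeof(Torus)));
    // Example index layout: first half of the blocks use LUT 0, the rest LUT 1.
    for (uint32_t index = num_blocks / 2; index < num_blocks; index++)
      h_lut_indexes[index] = 1;
    // Asynchronous H2D copy into the first GPU's device-side index array.
    cuda_memcpy_with_size_tracking_async_to_gpu(
        lut->get_lut_indexes(0, 0), h_lut_indexes,
        num_blocks * sizeof(Torus), streams[0], gpu_indexes[0],
        allocate_gpu_memory);
    // Propagate the indexes (and LUTs) to the other GPUs.
    lut->broadcast_lut(streams, gpu_indexes);
    // The copy is asynchronous: wait for it before releasing the host buffer.
    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
    free(h_lut_indexes);
  }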