chore(gpu): remove h_lut_indexes from int_radix_lut

- That pointer is misleading and unnecessary
This commit is contained in:
Pedro Alves
2025-07-21 12:20:10 -03:00
parent 7b7ad5bea0
commit 8282824338
3 changed files with 81 additions and 46 deletions

View File

@@ -298,7 +298,7 @@ template <typename Torus> struct int_radix_lut {
// done at the moment
std::vector<Torus *> lut_vec;
std::vector<Torus *> lut_indexes_vec;
Torus *h_lut_indexes;
// All tmp lwe arrays and index arrays for lwe contain the total
// amount of blocks to be computed on, there is no split between GPUs
// for the moment
@@ -441,7 +441,6 @@ template <typename Torus> struct int_radix_lut {
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], tmp_lwe_before_ks, num_radix_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
h_lut_indexes = (Torus *)(calloc(num_radix_blocks, sizeof(Torus)));
degrees = (uint64_t *)malloc(num_luts * sizeof(uint64_t));
max_degrees = (uint64_t *)malloc(num_luts * sizeof(uint64_t));
}
@@ -533,7 +532,6 @@ template <typename Torus> struct int_radix_lut {
streams[0], gpu_indexes[0], allocate_gpu_memory);
memcpy(h_lwe_indexes_out, h_lwe_indexes_in,
num_radix_blocks * sizeof(Torus));
h_lut_indexes = (Torus *)(calloc(num_radix_blocks, sizeof(Torus)));
degrees = (uint64_t *)malloc(num_luts * sizeof(uint64_t));
max_degrees = (uint64_t *)malloc(num_luts * sizeof(uint64_t));
}
@@ -659,7 +657,6 @@ template <typename Torus> struct int_radix_lut {
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], tmp_lwe_before_ks, num_radix_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
h_lut_indexes = (Torus *)(calloc(num_radix_blocks, sizeof(Torus)));
degrees = (uint64_t *)malloc(num_many_lut * num_luts * sizeof(uint64_t));
max_degrees = (uint64_t *)malloc(num_luts * sizeof(uint64_t));
}
@@ -682,11 +679,17 @@ template <typename Torus> struct int_radix_lut {
// Return a pointer to idx-ith max degree
uint64_t *get_max_degree(size_t idx) { return &max_degrees[idx]; }
// Return a pointer to idx-ith lut indexes at gpu_index's global memory
Torus *get_lut_indexes(uint32_t gpu_index, size_t ind) {
/* Return a pointer to idx-ith lut indexes at gpu_index's global memory
*
* gpu_index_in_lut_array is the index of the target GPU within
* lut_indexes_vec. This MUST NOT be confused with the device ID.
*/
Torus *get_lut_indexes(uint32_t gpu_index_in_lut_array, size_t ind) {
if (!gpu_memory_allocated)
return nullptr;
auto lut_indexes = lut_indexes_vec[gpu_index];
if (gpu_index_in_lut_array >= lut_indexes_vec.size())
PANIC("Cuda error: invalid lut_indexes index")
auto lut_indexes = lut_indexes_vec[gpu_index_in_lut_array];
return &lut_indexes[ind];
}
@@ -794,7 +797,6 @@ template <typename Torus> struct int_radix_lut {
lwe_after_pbs_vec.clear();
lwe_trivial_indexes_vec.clear();
}
free(h_lut_indexes);
free(degrees);
free(max_degrees);
}
@@ -1036,16 +1038,18 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
* we have bits_per_blocks LUTs that should be used for all bits in all
* blocks
*/
Torus *h_lut_indexes = lut->h_lut_indexes;
auto h_lut_indexes =
(Torus *)malloc(bits_per_block * num_radix_blocks * sizeof(Torus));
for (int j = 0; j < num_radix_blocks; j++) {
for (int i = 0; i < bits_per_block; i++)
h_lut_indexes[i + j * bits_per_block] = i;
}
cuda_memcpy_with_size_tracking_async_to_gpu(
lut->get_lut_indexes(0, 0), h_lut_indexes,
num_radix_blocks * bits_per_block * sizeof(Torus), streams[0],
bits_per_block * num_radix_blocks * sizeof(Torus), streams[0],
gpu_indexes[0], allocate_gpu_memory);
lut->broadcast_lut(streams, gpu_indexes);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
/**
* the input indexes should take the first bits_per_block PBS to target
@@ -1073,6 +1077,7 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
h_lwe_indexes_out);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
free(h_lut_indexes);
free(h_lwe_indexes_in);
free(h_lwe_indexes_out);
}
@@ -1953,7 +1958,7 @@ template <typename Torus> struct int_shifted_blocks_and_states_memory {
// Generate the indexes to switch between luts within the pbs
uint64_t lut_indexes_size = num_radix_blocks * sizeof(Torus);
Torus *h_lut_indexes = luts_array_first_step->h_lut_indexes;
auto h_lut_indexes = static_cast<Torus *>(malloc(lut_indexes_size));
for (int index = 0; index < num_radix_blocks; index++) {
uint32_t grouping_index = index / grouping_size;
bool is_in_first_grouping = (grouping_index == 0);
@@ -1980,7 +1985,10 @@ template <typename Torus> struct int_shifted_blocks_and_states_memory {
// Do I need to do something else for the multi-gpu?
luts_array_first_step->broadcast_lut(streams, gpu_indexes);
};
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
free(h_lut_indexes);
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
@@ -2434,6 +2442,9 @@ template <typename Torus> struct int_sc_prop_memory {
lut_overflow_flag_prep->broadcast_lut(streams, gpu_indexes);
}
auto h_lut_indexes =
static_cast<Torus *>(calloc((num_radix_blocks + 1), sizeof(Torus)));
// For the final cleanup in case of overflow or carry (it seems that I can)
// It seems that this lut could be apply together with the other one but for
// now we won't do it
@@ -2461,14 +2472,8 @@ template <typename Torus> struct int_sc_prop_memory {
polynomial_size, message_modulus, carry_modulus, f_overflow_last,
gpu_memory_allocated);
Torus *h_lut_indexes = lut_message_extract->h_lut_indexes;
for (int index = 0; index < num_radix_blocks + 1; index++) {
if (index < num_radix_blocks) {
h_lut_indexes[index] = 0;
} else {
h_lut_indexes[index] = 1;
}
}
h_lut_indexes[num_radix_blocks] = 1;
cuda_memcpy_with_size_tracking_async_to_gpu(
lut_message_extract->get_lut_indexes(0, 0), h_lut_indexes,
(num_radix_blocks + 1) * sizeof(Torus), streams[0], gpu_indexes[0],
@@ -2487,7 +2492,6 @@ template <typename Torus> struct int_sc_prop_memory {
polynomial_size, message_modulus, carry_modulus, f_carry_last,
gpu_memory_allocated);
Torus *h_lut_indexes = lut_message_extract->h_lut_indexes;
for (int index = 0; index < num_radix_blocks + 1; index++) {
if (index < num_radix_blocks) {
h_lut_indexes[index] = 0;
@@ -2501,6 +2505,8 @@ template <typename Torus> struct int_sc_prop_memory {
allocate_gpu_memory);
}
lut_message_extract->broadcast_lut(streams, gpu_indexes);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
free(h_lut_indexes);
};
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -2670,7 +2676,7 @@ template <typename Torus> struct int_shifted_blocks_and_borrow_states_memory {
// Generate the indexes to switch between luts within the pbs
uint64_t lut_indexes_size = num_radix_blocks * sizeof(Torus);
Torus *h_lut_indexes = luts_array_first_step->h_lut_indexes;
auto h_lut_indexes = static_cast<Torus *>(malloc(lut_indexes_size));
for (int index = 0; index < num_radix_blocks; index++) {
uint32_t grouping_index = index / grouping_size;
@@ -2690,13 +2696,15 @@ template <typename Torus> struct int_shifted_blocks_and_borrow_states_memory {
}
}
// copy the indexes to the gpu
Torus *lut_indexes = luts_array_first_step->get_lut_indexes(0, 0);
Torus *d_lut_indexes = luts_array_first_step->get_lut_indexes(0, 0);
cuda_memcpy_with_size_tracking_async_to_gpu(
lut_indexes, h_lut_indexes, lut_indexes_size, streams[0],
d_lut_indexes, h_lut_indexes, lut_indexes_size, streams[0],
gpu_indexes[0], allocate_gpu_memory);
// Do I need to do something else for the multi-gpu?
luts_array_first_step->broadcast_lut(streams, gpu_indexes);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
free(h_lut_indexes);
};
// needed for the division to update the lut indexes
@@ -3551,14 +3559,13 @@ template <typename Torus> struct int_cmux_buffer {
message_extract_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
message_extract_lut_f, gpu_memory_allocated);
Torus *h_lut_indexes = predicate_lut->h_lut_indexes;
for (int index = 0; index < 2 * num_radix_blocks; index++) {
if (index < num_radix_blocks) {
h_lut_indexes[index] = 0;
} else {
h_lut_indexes[index] = 1;
}
auto h_lut_indexes =
static_cast<Torus *>(calloc(2 * num_radix_blocks, sizeof(Torus)));
for (int index = num_radix_blocks; index < 2 * num_radix_blocks; index++) {
h_lut_indexes[index] = 1;
}
cuda_memcpy_with_size_tracking_async_to_gpu(
predicate_lut->get_lut_indexes(0, 0), h_lut_indexes,
2 * num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0],
@@ -3566,6 +3573,9 @@ template <typename Torus> struct int_cmux_buffer {
predicate_lut->broadcast_lut(streams, gpu_indexes);
message_extract_lut->broadcast_lut(streams, gpu_indexes);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
free(h_lut_indexes);
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -3599,6 +3609,7 @@ template <typename Torus> struct int_are_all_block_true_buffer {
// of interest in are_all_block_true(), as with max_value (the maximum message
// value).
int_radix_lut<Torus> *is_max_value;
Torus *h_lut_indexes;
bool gpu_memory_allocated;
int_are_all_block_true_buffer(cudaStream_t const *streams,
@@ -3638,6 +3649,7 @@ template <typename Torus> struct int_are_all_block_true_buffer {
params.carry_modulus, is_max_value_f, gpu_memory_allocated);
is_max_value->broadcast_lut(streams, gpu_indexes);
h_lut_indexes = static_cast<Torus *>(malloc(max_chunks * sizeof(Torus)));
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -3650,6 +3662,9 @@ template <typename Torus> struct int_are_all_block_true_buffer {
delete is_max_value;
delete tmp_out;
delete tmp_block_accumulated;
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
free(h_lut_indexes);
}
};
@@ -3914,7 +3929,6 @@ template <typename Torus> struct int_comparison_buffer {
int_radix_params params;
//////////////////
int_radix_lut<Torus> *identity_lut;
std::function<Torus(Torus)> identity_lut_f;
@@ -4596,8 +4610,9 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
scalars_for_overflow_sub[nb - 1], h_scalar, nb * sizeof(Torus),
streams[0], gpu_indexes[0], allocate_gpu_memory);
}
free(h_lut_indexes);
free(h_scalar);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
free(h_lut_indexes);
};
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,

View File

@@ -85,6 +85,7 @@ __host__ void are_all_comparisons_block_true(
uint32_t total_modulus = message_modulus * carry_modulus;
uint32_t max_value = (total_modulus - 1) / (message_modulus - 1);
auto h_lut_indexes = are_all_block_true_buffer->h_lut_indexes;
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], tmp_out,
0, num_radix_blocks, lwe_array_in, 0,
@@ -137,7 +138,6 @@ __host__ void are_all_comparisons_block_true(
polynomial_size, message_modulus, carry_modulus,
is_equal_to_num_blocks_lut_f, true);
Torus *h_lut_indexes = is_max_value_lut->h_lut_indexes;
for (int index = 0; index < num_chunks; index++) {
if (index == num_chunks - 1) {
h_lut_indexes[index] = 1;
@@ -161,12 +161,9 @@ __host__ void are_all_comparisons_block_true(
ksks, ms_noise_reduction_key, lut, 1);
// Reset max_value_lut_indexes before returning, otherwise if the lut is
// reused the lut indexes will be wrong
memset(is_max_value_lut->h_lut_indexes, 0,
is_max_value_lut->num_blocks * sizeof(Torus));
cuda_memcpy_async_to_gpu(is_max_value_lut->get_lut_indexes(0, 0),
is_max_value_lut->h_lut_indexes,
is_max_value_lut->num_blocks * sizeof(Torus),
streams[0], gpu_indexes[0]);
cuda_memset_async(is_max_value_lut->get_lut_indexes(0, 0), 0,
is_max_value_lut->num_blocks * sizeof(Torus),
streams[0], gpu_indexes[0]);
is_max_value_lut->broadcast_lut(streams, gpu_indexes);
reset_radix_ciphertext_blocks(lwe_array_out, 1);
return;

View File

@@ -603,13 +603,20 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
}
auto h_lut_indexes =
static_cast<Torus *>(malloc(num_radix_blocks * sizeof(Torus)));
cuda_memcpy_async_to_cpu(h_lut_indexes, lut->get_lut_indexes(0, 0),
num_radix_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (uint i = 0; i < num_radix_blocks; i++) {
auto degrees_index = lut->h_lut_indexes[i];
auto degrees_index = h_lut_indexes[i];
lwe_array_out->degrees[i] = lut->degrees[degrees_index];
lwe_array_out->noise_levels[i] = NoiseLevel::NOMINAL;
CHECK_NOISE_LEVEL(lwe_array_out->noise_levels[i], params.message_modulus,
params.carry_modulus);
}
free(h_lut_indexes);
POP_RANGE()
}
@@ -710,13 +717,20 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb(
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
}
auto h_lut_indexes =
static_cast<Torus *>(malloc(lut->num_blocks * sizeof(Torus)));
cuda_memcpy_async_to_cpu(h_lut_indexes, lut->get_lut_indexes(0, 0),
lut->num_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (uint i = 0; i < lwe_array_out->num_radix_blocks; i++) {
auto degrees_index = lut->h_lut_indexes[i % lut->num_blocks];
auto degrees_index = h_lut_indexes[i % lut->num_blocks];
lwe_array_out->degrees[i] = lut->degrees[degrees_index];
lwe_array_out->noise_levels[i] = NoiseLevel::NOMINAL;
CHECK_NOISE_LEVEL(lwe_array_out->noise_levels[i], params.message_modulus,
params.carry_modulus);
}
free(h_lut_indexes);
POP_RANGE()
}
@@ -828,13 +842,20 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
}
auto h_lut_indexes =
static_cast<Torus *>(malloc(num_radix_blocks * sizeof(Torus)));
cuda_memcpy_async_to_cpu(h_lut_indexes, lut->get_lut_indexes(0, 0),
num_radix_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (uint i = 0; i < num_radix_blocks; i++) {
auto degrees_index = lut->h_lut_indexes[i];
auto degrees_index = h_lut_indexes[i];
lwe_array_out->degrees[i] = lut->degrees[degrees_index];
lwe_array_out->noise_levels[i] = NoiseLevel::NOMINAL;
CHECK_NOISE_LEVEL(lwe_array_out->noise_levels[i], params.message_modulus,
params.carry_modulus);
}
free(h_lut_indexes);
POP_RANGE()
}
@@ -1462,8 +1483,10 @@ void host_full_propagate_inplace(
void *const *bsks, uint32_t num_blocks) {
auto params = mem_ptr->lut->params;
int big_lwe_size = (params.glwe_dimension * params.polynomial_size + 1);
int small_lwe_size = (params.small_lwe_dimension + 1);
Torus degrees_index;
cuda_memcpy_async_to_cpu(&degrees_index, mem_ptr->lut->get_lut_indexes(0, 0),
sizeof(Torus), streams[0], gpu_indexes[0]);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
// In the case of extracting a single LWE this parameters are dummy
uint32_t num_many_lut = 1;
@@ -1496,7 +1519,7 @@ void host_full_propagate_inplace(
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
&cur_input_block, 0, 1,
mem_ptr->tmp_big_lwe_vector, 0, 1);
auto degrees_index = mem_ptr->lut->h_lut_indexes[0];
input_blocks->degrees[i] = mem_ptr->lut->degrees[degrees_index];
input_blocks->noise_levels[i] = NoiseLevel::NOMINAL;
CHECK_NOISE_LEVEL(input_blocks->noise_levels[i], params.message_modulus,