mirror of
https://github.com/zama-ai/tfhe-rs.git
synced 2026-01-09 14:47:56 -05:00
chore(gpu): remove h_lut_indexes from int_radix_lut
- That pointer is misleading and unnecessary
This commit is contained in:
@@ -298,7 +298,7 @@ template <typename Torus> struct int_radix_lut {
|
||||
// done at the moment
|
||||
std::vector<Torus *> lut_vec;
|
||||
std::vector<Torus *> lut_indexes_vec;
|
||||
Torus *h_lut_indexes;
|
||||
|
||||
// All tmp lwe arrays and index arrays for lwe contain the total
|
||||
// amount of blocks to be computed on, there is no split between GPUs
|
||||
// for the moment
|
||||
@@ -441,7 +441,6 @@ template <typename Torus> struct int_radix_lut {
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], tmp_lwe_before_ks, num_radix_blocks,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
h_lut_indexes = (Torus *)(calloc(num_radix_blocks, sizeof(Torus)));
|
||||
degrees = (uint64_t *)malloc(num_luts * sizeof(uint64_t));
|
||||
max_degrees = (uint64_t *)malloc(num_luts * sizeof(uint64_t));
|
||||
}
|
||||
@@ -533,7 +532,6 @@ template <typename Torus> struct int_radix_lut {
|
||||
streams[0], gpu_indexes[0], allocate_gpu_memory);
|
||||
memcpy(h_lwe_indexes_out, h_lwe_indexes_in,
|
||||
num_radix_blocks * sizeof(Torus));
|
||||
h_lut_indexes = (Torus *)(calloc(num_radix_blocks, sizeof(Torus)));
|
||||
degrees = (uint64_t *)malloc(num_luts * sizeof(uint64_t));
|
||||
max_degrees = (uint64_t *)malloc(num_luts * sizeof(uint64_t));
|
||||
}
|
||||
@@ -659,7 +657,6 @@ template <typename Torus> struct int_radix_lut {
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], tmp_lwe_before_ks, num_radix_blocks,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
h_lut_indexes = (Torus *)(calloc(num_radix_blocks, sizeof(Torus)));
|
||||
degrees = (uint64_t *)malloc(num_many_lut * num_luts * sizeof(uint64_t));
|
||||
max_degrees = (uint64_t *)malloc(num_luts * sizeof(uint64_t));
|
||||
}
|
||||
@@ -682,11 +679,17 @@ template <typename Torus> struct int_radix_lut {
|
||||
// Return a pointer to idx-ith max degree
|
||||
uint64_t *get_max_degree(size_t idx) { return &max_degrees[idx]; }
|
||||
|
||||
// Return a pointer to idx-ith lut indexes at gpu_index's global memory
|
||||
Torus *get_lut_indexes(uint32_t gpu_index, size_t ind) {
|
||||
/* Return a pointer to idx-ith lut indexes at gpu_index's global memory
|
||||
*
|
||||
* gpu_index_in_lut_array is the index of the target GPU within
|
||||
* lut_indexes_vec. This MUST NOT be confused with the device ID.
|
||||
*/
|
||||
Torus *get_lut_indexes(uint32_t gpu_index_in_lut_array, size_t ind) {
|
||||
if (!gpu_memory_allocated)
|
||||
return nullptr;
|
||||
auto lut_indexes = lut_indexes_vec[gpu_index];
|
||||
if (gpu_index_in_lut_array >= lut_indexes_vec.size())
|
||||
PANIC("Cuda error: invalid lut_indexes index")
|
||||
auto lut_indexes = lut_indexes_vec[gpu_index_in_lut_array];
|
||||
return &lut_indexes[ind];
|
||||
}
|
||||
|
||||
@@ -794,7 +797,6 @@ template <typename Torus> struct int_radix_lut {
|
||||
lwe_after_pbs_vec.clear();
|
||||
lwe_trivial_indexes_vec.clear();
|
||||
}
|
||||
free(h_lut_indexes);
|
||||
free(degrees);
|
||||
free(max_degrees);
|
||||
}
|
||||
@@ -1036,16 +1038,18 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
|
||||
* we have bits_per_blocks LUTs that should be used for all bits in all
|
||||
* blocks
|
||||
*/
|
||||
Torus *h_lut_indexes = lut->h_lut_indexes;
|
||||
auto h_lut_indexes =
|
||||
(Torus *)malloc(bits_per_block * num_radix_blocks * sizeof(Torus));
|
||||
for (int j = 0; j < num_radix_blocks; j++) {
|
||||
for (int i = 0; i < bits_per_block; i++)
|
||||
h_lut_indexes[i + j * bits_per_block] = i;
|
||||
}
|
||||
cuda_memcpy_with_size_tracking_async_to_gpu(
|
||||
lut->get_lut_indexes(0, 0), h_lut_indexes,
|
||||
num_radix_blocks * bits_per_block * sizeof(Torus), streams[0],
|
||||
bits_per_block * num_radix_blocks * sizeof(Torus), streams[0],
|
||||
gpu_indexes[0], allocate_gpu_memory);
|
||||
lut->broadcast_lut(streams, gpu_indexes);
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
|
||||
/**
|
||||
* the input indexes should take the first bits_per_block PBS to target
|
||||
@@ -1073,6 +1077,7 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
|
||||
h_lwe_indexes_out);
|
||||
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
free(h_lut_indexes);
|
||||
free(h_lwe_indexes_in);
|
||||
free(h_lwe_indexes_out);
|
||||
}
|
||||
@@ -1953,7 +1958,7 @@ template <typename Torus> struct int_shifted_blocks_and_states_memory {
|
||||
// Generate the indexes to switch between luts within the pbs
|
||||
uint64_t lut_indexes_size = num_radix_blocks * sizeof(Torus);
|
||||
|
||||
Torus *h_lut_indexes = luts_array_first_step->h_lut_indexes;
|
||||
auto h_lut_indexes = static_cast<Torus *>(malloc(lut_indexes_size));
|
||||
for (int index = 0; index < num_radix_blocks; index++) {
|
||||
uint32_t grouping_index = index / grouping_size;
|
||||
bool is_in_first_grouping = (grouping_index == 0);
|
||||
@@ -1980,7 +1985,10 @@ template <typename Torus> struct int_shifted_blocks_and_states_memory {
|
||||
// Do I need to do something else for the multi-gpu?
|
||||
|
||||
luts_array_first_step->broadcast_lut(streams, gpu_indexes);
|
||||
};
|
||||
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
free(h_lut_indexes);
|
||||
}
|
||||
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count) {
|
||||
|
||||
@@ -2434,6 +2442,9 @@ template <typename Torus> struct int_sc_prop_memory {
|
||||
lut_overflow_flag_prep->broadcast_lut(streams, gpu_indexes);
|
||||
}
|
||||
|
||||
auto h_lut_indexes =
|
||||
static_cast<Torus *>(calloc((num_radix_blocks + 1), sizeof(Torus)));
|
||||
|
||||
// For the final cleanup in case of overflow or carry (it seems that I can)
|
||||
// It seems that this lut could be apply together with the other one but for
|
||||
// now we won't do it
|
||||
@@ -2461,14 +2472,8 @@ template <typename Torus> struct int_sc_prop_memory {
|
||||
polynomial_size, message_modulus, carry_modulus, f_overflow_last,
|
||||
gpu_memory_allocated);
|
||||
|
||||
Torus *h_lut_indexes = lut_message_extract->h_lut_indexes;
|
||||
for (int index = 0; index < num_radix_blocks + 1; index++) {
|
||||
if (index < num_radix_blocks) {
|
||||
h_lut_indexes[index] = 0;
|
||||
} else {
|
||||
h_lut_indexes[index] = 1;
|
||||
}
|
||||
}
|
||||
h_lut_indexes[num_radix_blocks] = 1;
|
||||
|
||||
cuda_memcpy_with_size_tracking_async_to_gpu(
|
||||
lut_message_extract->get_lut_indexes(0, 0), h_lut_indexes,
|
||||
(num_radix_blocks + 1) * sizeof(Torus), streams[0], gpu_indexes[0],
|
||||
@@ -2487,7 +2492,6 @@ template <typename Torus> struct int_sc_prop_memory {
|
||||
polynomial_size, message_modulus, carry_modulus, f_carry_last,
|
||||
gpu_memory_allocated);
|
||||
|
||||
Torus *h_lut_indexes = lut_message_extract->h_lut_indexes;
|
||||
for (int index = 0; index < num_radix_blocks + 1; index++) {
|
||||
if (index < num_radix_blocks) {
|
||||
h_lut_indexes[index] = 0;
|
||||
@@ -2501,6 +2505,8 @@ template <typename Torus> struct int_sc_prop_memory {
|
||||
allocate_gpu_memory);
|
||||
}
|
||||
lut_message_extract->broadcast_lut(streams, gpu_indexes);
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
free(h_lut_indexes);
|
||||
};
|
||||
|
||||
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
@@ -2670,7 +2676,7 @@ template <typename Torus> struct int_shifted_blocks_and_borrow_states_memory {
|
||||
|
||||
// Generate the indexes to switch between luts within the pbs
|
||||
uint64_t lut_indexes_size = num_radix_blocks * sizeof(Torus);
|
||||
Torus *h_lut_indexes = luts_array_first_step->h_lut_indexes;
|
||||
auto h_lut_indexes = static_cast<Torus *>(malloc(lut_indexes_size));
|
||||
|
||||
for (int index = 0; index < num_radix_blocks; index++) {
|
||||
uint32_t grouping_index = index / grouping_size;
|
||||
@@ -2690,13 +2696,15 @@ template <typename Torus> struct int_shifted_blocks_and_borrow_states_memory {
|
||||
}
|
||||
}
|
||||
// copy the indexes to the gpu
|
||||
Torus *lut_indexes = luts_array_first_step->get_lut_indexes(0, 0);
|
||||
Torus *d_lut_indexes = luts_array_first_step->get_lut_indexes(0, 0);
|
||||
cuda_memcpy_with_size_tracking_async_to_gpu(
|
||||
lut_indexes, h_lut_indexes, lut_indexes_size, streams[0],
|
||||
d_lut_indexes, h_lut_indexes, lut_indexes_size, streams[0],
|
||||
gpu_indexes[0], allocate_gpu_memory);
|
||||
// Do I need to do something else for the multi-gpu?
|
||||
|
||||
luts_array_first_step->broadcast_lut(streams, gpu_indexes);
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
free(h_lut_indexes);
|
||||
};
|
||||
|
||||
// needed for the division to update the lut indexes
|
||||
@@ -3551,14 +3559,13 @@ template <typename Torus> struct int_cmux_buffer {
|
||||
message_extract_lut->get_max_degree(0), params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
message_extract_lut_f, gpu_memory_allocated);
|
||||
Torus *h_lut_indexes = predicate_lut->h_lut_indexes;
|
||||
for (int index = 0; index < 2 * num_radix_blocks; index++) {
|
||||
if (index < num_radix_blocks) {
|
||||
h_lut_indexes[index] = 0;
|
||||
} else {
|
||||
h_lut_indexes[index] = 1;
|
||||
}
|
||||
|
||||
auto h_lut_indexes =
|
||||
static_cast<Torus *>(calloc(2 * num_radix_blocks, sizeof(Torus)));
|
||||
for (int index = num_radix_blocks; index < 2 * num_radix_blocks; index++) {
|
||||
h_lut_indexes[index] = 1;
|
||||
}
|
||||
|
||||
cuda_memcpy_with_size_tracking_async_to_gpu(
|
||||
predicate_lut->get_lut_indexes(0, 0), h_lut_indexes,
|
||||
2 * num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0],
|
||||
@@ -3566,6 +3573,9 @@ template <typename Torus> struct int_cmux_buffer {
|
||||
|
||||
predicate_lut->broadcast_lut(streams, gpu_indexes);
|
||||
message_extract_lut->broadcast_lut(streams, gpu_indexes);
|
||||
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
free(h_lut_indexes);
|
||||
}
|
||||
|
||||
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
@@ -3599,6 +3609,7 @@ template <typename Torus> struct int_are_all_block_true_buffer {
|
||||
// of interest in are_all_block_true(), as with max_value (the maximum message
|
||||
// value).
|
||||
int_radix_lut<Torus> *is_max_value;
|
||||
Torus *h_lut_indexes;
|
||||
bool gpu_memory_allocated;
|
||||
|
||||
int_are_all_block_true_buffer(cudaStream_t const *streams,
|
||||
@@ -3638,6 +3649,7 @@ template <typename Torus> struct int_are_all_block_true_buffer {
|
||||
params.carry_modulus, is_max_value_f, gpu_memory_allocated);
|
||||
|
||||
is_max_value->broadcast_lut(streams, gpu_indexes);
|
||||
h_lut_indexes = static_cast<Torus *>(malloc(max_chunks * sizeof(Torus)));
|
||||
}
|
||||
|
||||
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
@@ -3650,6 +3662,9 @@ template <typename Torus> struct int_are_all_block_true_buffer {
|
||||
delete is_max_value;
|
||||
delete tmp_out;
|
||||
delete tmp_block_accumulated;
|
||||
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
free(h_lut_indexes);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -3914,7 +3929,6 @@ template <typename Torus> struct int_comparison_buffer {
|
||||
|
||||
int_radix_params params;
|
||||
|
||||
//////////////////
|
||||
int_radix_lut<Torus> *identity_lut;
|
||||
std::function<Torus(Torus)> identity_lut_f;
|
||||
|
||||
@@ -4596,8 +4610,9 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
|
||||
scalars_for_overflow_sub[nb - 1], h_scalar, nb * sizeof(Torus),
|
||||
streams[0], gpu_indexes[0], allocate_gpu_memory);
|
||||
}
|
||||
free(h_lut_indexes);
|
||||
free(h_scalar);
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
free(h_lut_indexes);
|
||||
};
|
||||
|
||||
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
|
||||
@@ -85,6 +85,7 @@ __host__ void are_all_comparisons_block_true(
|
||||
|
||||
uint32_t total_modulus = message_modulus * carry_modulus;
|
||||
uint32_t max_value = (total_modulus - 1) / (message_modulus - 1);
|
||||
auto h_lut_indexes = are_all_block_true_buffer->h_lut_indexes;
|
||||
|
||||
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], tmp_out,
|
||||
0, num_radix_blocks, lwe_array_in, 0,
|
||||
@@ -137,7 +138,6 @@ __host__ void are_all_comparisons_block_true(
|
||||
polynomial_size, message_modulus, carry_modulus,
|
||||
is_equal_to_num_blocks_lut_f, true);
|
||||
|
||||
Torus *h_lut_indexes = is_max_value_lut->h_lut_indexes;
|
||||
for (int index = 0; index < num_chunks; index++) {
|
||||
if (index == num_chunks - 1) {
|
||||
h_lut_indexes[index] = 1;
|
||||
@@ -161,12 +161,9 @@ __host__ void are_all_comparisons_block_true(
|
||||
ksks, ms_noise_reduction_key, lut, 1);
|
||||
// Reset max_value_lut_indexes before returning, otherwise if the lut is
|
||||
// reused the lut indexes will be wrong
|
||||
memset(is_max_value_lut->h_lut_indexes, 0,
|
||||
is_max_value_lut->num_blocks * sizeof(Torus));
|
||||
cuda_memcpy_async_to_gpu(is_max_value_lut->get_lut_indexes(0, 0),
|
||||
is_max_value_lut->h_lut_indexes,
|
||||
is_max_value_lut->num_blocks * sizeof(Torus),
|
||||
streams[0], gpu_indexes[0]);
|
||||
cuda_memset_async(is_max_value_lut->get_lut_indexes(0, 0), 0,
|
||||
is_max_value_lut->num_blocks * sizeof(Torus),
|
||||
streams[0], gpu_indexes[0]);
|
||||
is_max_value_lut->broadcast_lut(streams, gpu_indexes);
|
||||
reset_radix_ciphertext_blocks(lwe_array_out, 1);
|
||||
return;
|
||||
|
||||
@@ -603,13 +603,20 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
|
||||
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
|
||||
}
|
||||
}
|
||||
auto h_lut_indexes =
|
||||
static_cast<Torus *>(malloc(num_radix_blocks * sizeof(Torus)));
|
||||
cuda_memcpy_async_to_cpu(h_lut_indexes, lut->get_lut_indexes(0, 0),
|
||||
num_radix_blocks * sizeof(Torus), streams[0],
|
||||
gpu_indexes[0]);
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
for (uint i = 0; i < num_radix_blocks; i++) {
|
||||
auto degrees_index = lut->h_lut_indexes[i];
|
||||
auto degrees_index = h_lut_indexes[i];
|
||||
lwe_array_out->degrees[i] = lut->degrees[degrees_index];
|
||||
lwe_array_out->noise_levels[i] = NoiseLevel::NOMINAL;
|
||||
CHECK_NOISE_LEVEL(lwe_array_out->noise_levels[i], params.message_modulus,
|
||||
params.carry_modulus);
|
||||
}
|
||||
free(h_lut_indexes);
|
||||
POP_RANGE()
|
||||
}
|
||||
|
||||
@@ -710,13 +717,20 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb(
|
||||
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
|
||||
}
|
||||
}
|
||||
auto h_lut_indexes =
|
||||
static_cast<Torus *>(malloc(lut->num_blocks * sizeof(Torus)));
|
||||
cuda_memcpy_async_to_cpu(h_lut_indexes, lut->get_lut_indexes(0, 0),
|
||||
lut->num_blocks * sizeof(Torus), streams[0],
|
||||
gpu_indexes[0]);
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
for (uint i = 0; i < lwe_array_out->num_radix_blocks; i++) {
|
||||
auto degrees_index = lut->h_lut_indexes[i % lut->num_blocks];
|
||||
auto degrees_index = h_lut_indexes[i % lut->num_blocks];
|
||||
lwe_array_out->degrees[i] = lut->degrees[degrees_index];
|
||||
lwe_array_out->noise_levels[i] = NoiseLevel::NOMINAL;
|
||||
CHECK_NOISE_LEVEL(lwe_array_out->noise_levels[i], params.message_modulus,
|
||||
params.carry_modulus);
|
||||
}
|
||||
free(h_lut_indexes);
|
||||
POP_RANGE()
|
||||
}
|
||||
|
||||
@@ -828,13 +842,20 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
|
||||
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
|
||||
}
|
||||
}
|
||||
auto h_lut_indexes =
|
||||
static_cast<Torus *>(malloc(num_radix_blocks * sizeof(Torus)));
|
||||
cuda_memcpy_async_to_cpu(h_lut_indexes, lut->get_lut_indexes(0, 0),
|
||||
num_radix_blocks * sizeof(Torus), streams[0],
|
||||
gpu_indexes[0]);
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
for (uint i = 0; i < num_radix_blocks; i++) {
|
||||
auto degrees_index = lut->h_lut_indexes[i];
|
||||
auto degrees_index = h_lut_indexes[i];
|
||||
lwe_array_out->degrees[i] = lut->degrees[degrees_index];
|
||||
lwe_array_out->noise_levels[i] = NoiseLevel::NOMINAL;
|
||||
CHECK_NOISE_LEVEL(lwe_array_out->noise_levels[i], params.message_modulus,
|
||||
params.carry_modulus);
|
||||
}
|
||||
free(h_lut_indexes);
|
||||
POP_RANGE()
|
||||
}
|
||||
|
||||
@@ -1462,8 +1483,10 @@ void host_full_propagate_inplace(
|
||||
void *const *bsks, uint32_t num_blocks) {
|
||||
auto params = mem_ptr->lut->params;
|
||||
|
||||
int big_lwe_size = (params.glwe_dimension * params.polynomial_size + 1);
|
||||
int small_lwe_size = (params.small_lwe_dimension + 1);
|
||||
Torus degrees_index;
|
||||
cuda_memcpy_async_to_cpu(°rees_index, mem_ptr->lut->get_lut_indexes(0, 0),
|
||||
sizeof(Torus), streams[0], gpu_indexes[0]);
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
|
||||
// In the case of extracting a single LWE this parameters are dummy
|
||||
uint32_t num_many_lut = 1;
|
||||
@@ -1496,7 +1519,7 @@ void host_full_propagate_inplace(
|
||||
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
|
||||
&cur_input_block, 0, 1,
|
||||
mem_ptr->tmp_big_lwe_vector, 0, 1);
|
||||
auto degrees_index = mem_ptr->lut->h_lut_indexes[0];
|
||||
|
||||
input_blocks->degrees[i] = mem_ptr->lut->degrees[degrees_index];
|
||||
input_blocks->noise_levels[i] = NoiseLevel::NOMINAL;
|
||||
CHECK_NOISE_LEVEL(input_blocks->noise_levels[i], params.message_modulus,
|
||||
|
||||
Reference in New Issue
Block a user