mirror of
https://github.com/zama-ai/tfhe-rs.git
synced 2026-01-09 14:47:56 -05:00
755 lines
32 KiB
Plaintext
755 lines
32 KiB
Plaintext
#ifndef CUDA_INTEGER_COMPARISON_OPS_CUH
|
|
#define CUDA_INTEGER_COMPARISON_OPS_CUH
|
|
|
|
#include "crypto/keyswitch.cuh"
|
|
#include "device.h"
|
|
#include "integer.cuh"
|
|
#include "integer/cmux.cuh"
|
|
#include "integer/integer_utilities.h"
|
|
#include "integer/negation.cuh"
|
|
#include "integer/scalar_addition.cuh"
|
|
#include "integer/subtraction.cuh"
|
|
#include "pbs/programmable_bootstrap_classic.cuh"
|
|
#include "pbs/programmable_bootstrap_multibit.cuh"
|
|
#include "types/complex/operations.cuh"
|
|
#include "utils/kernel_dimensions.cuh"
|
|
|
|
// lwe_dimension + 1 threads
|
|
// todo: This kernel MUST be refactored to a binary reduction
|
|
template <typename Torus>
|
|
__global__ void
|
|
device_accumulate_all_blocks(Torus *output, Torus const *input_block,
|
|
uint32_t lwe_dimension, uint32_t num_blocks) {
|
|
int idx = threadIdx.x + blockIdx.x * blockDim.x;
|
|
if (idx < lwe_dimension + 1) {
|
|
auto block = &input_block[idx];
|
|
|
|
Torus sum = block[0];
|
|
for (int i = 1; i < num_blocks; i++) {
|
|
sum += block[i * (lwe_dimension + 1)];
|
|
}
|
|
|
|
output[idx] = sum;
|
|
}
|
|
}
|
|
|
|
template <typename Torus>
|
|
__host__ void accumulate_all_blocks(cudaStream_t stream, uint32_t gpu_index,
|
|
Torus *output, Torus const *input,
|
|
uint32_t lwe_dimension,
|
|
uint32_t num_radix_blocks) {
|
|
|
|
cuda_set_device(gpu_index);
|
|
int num_blocks = 0, num_threads = 0;
|
|
int num_entries = (lwe_dimension + 1);
|
|
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
|
|
// Add all blocks and store in sum
|
|
device_accumulate_all_blocks<Torus><<<num_blocks, num_threads, 0, stream>>>(
|
|
output, input, lwe_dimension, num_radix_blocks);
|
|
check_cuda_error(cudaGetLastError());
|
|
}
|
|
|
|
/* This takes an array of lwe ciphertexts, where each is an encryption of
|
|
* either 0 or 1.
|
|
*
|
|
* It writes in lwe_array_out a single lwe ciphertext encrypting 1 if all input
|
|
* blocks are 1 otherwise the block encrypts 0
|
|
*
|
|
*/
|
|
template <typename Torus>
|
|
__host__ void are_all_comparisons_block_true(
|
|
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
|
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
|
|
CudaRadixCiphertextFFI const *lwe_array_in,
|
|
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
|
|
Torus *const *ksks,
|
|
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
|
uint32_t num_radix_blocks) {
|
|
|
|
if (lwe_array_out->lwe_dimension != lwe_array_in->lwe_dimension)
|
|
PANIC("Cuda error: input and output lwe dimensions must be the same")
|
|
if (lwe_array_in->num_radix_blocks < num_radix_blocks)
|
|
PANIC("Cuda error: input num radix blocks should not be lower "
|
|
"than the number of blocks to operate on")
|
|
|
|
auto params = mem_ptr->params;
|
|
auto big_lwe_dimension = params.big_lwe_dimension;
|
|
auto glwe_dimension = params.glwe_dimension;
|
|
auto polynomial_size = params.polynomial_size;
|
|
auto message_modulus = params.message_modulus;
|
|
auto carry_modulus = params.carry_modulus;
|
|
|
|
auto are_all_block_true_buffer =
|
|
mem_ptr->eq_buffer->are_all_block_true_buffer;
|
|
auto tmp_out = are_all_block_true_buffer->tmp_out;
|
|
|
|
uint32_t total_modulus = message_modulus * carry_modulus;
|
|
uint32_t max_value = (total_modulus - 1) / (message_modulus - 1);
|
|
auto h_lut_indexes = are_all_block_true_buffer->h_lut_indexes;
|
|
|
|
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], tmp_out,
|
|
0, num_radix_blocks, lwe_array_in, 0,
|
|
num_radix_blocks);
|
|
|
|
uint32_t remaining_blocks = num_radix_blocks;
|
|
|
|
while (remaining_blocks > 0) {
|
|
// Split in max_value chunks
|
|
int num_chunks = (remaining_blocks + max_value - 1) / max_value;
|
|
|
|
// Since all blocks encrypt either 0 or 1, we can sum max_value of them
|
|
// as in the worst case we will be adding `max_value` ones
|
|
auto input_blocks = (Torus *)tmp_out->ptr;
|
|
auto accumulator_ptr =
|
|
(Torus *)are_all_block_true_buffer->tmp_block_accumulated->ptr;
|
|
auto is_max_value_lut = are_all_block_true_buffer->is_max_value;
|
|
uint32_t chunk_lengths[num_chunks];
|
|
auto begin_remaining_blocks = remaining_blocks;
|
|
for (int i = 0; i < num_chunks; i++) {
|
|
uint32_t chunk_length =
|
|
std::min(max_value, begin_remaining_blocks - i * max_value);
|
|
chunk_lengths[i] = chunk_length;
|
|
accumulate_all_blocks<Torus>(streams[0], gpu_indexes[0], accumulator_ptr,
|
|
input_blocks, big_lwe_dimension,
|
|
chunk_length);
|
|
|
|
accumulator_ptr += (big_lwe_dimension + 1);
|
|
remaining_blocks -= (chunk_length - 1);
|
|
input_blocks += (big_lwe_dimension + 1) * chunk_length;
|
|
}
|
|
auto accumulator = are_all_block_true_buffer->tmp_block_accumulated;
|
|
|
|
// Selects a LUT
|
|
int_radix_lut<Torus> *lut;
|
|
if (are_all_block_true_buffer->op == COMPARISON_TYPE::NE) {
|
|
// is_non_zero_lut_buffer LUT
|
|
lut = mem_ptr->eq_buffer->is_non_zero_lut;
|
|
} else {
|
|
if (chunk_lengths[num_chunks - 1] != max_value) {
|
|
// LUT needs to be computed
|
|
uint32_t chunk_length = chunk_lengths[num_chunks - 1];
|
|
auto is_equal_to_num_blocks_lut_f = [chunk_length](Torus x) -> Torus {
|
|
return x == chunk_length;
|
|
};
|
|
generate_device_accumulator<Torus>(
|
|
streams[0], gpu_indexes[0], is_max_value_lut->get_lut(0, 1),
|
|
is_max_value_lut->get_degree(1),
|
|
is_max_value_lut->get_max_degree(1), glwe_dimension,
|
|
polynomial_size, message_modulus, carry_modulus,
|
|
is_equal_to_num_blocks_lut_f, true);
|
|
|
|
for (int index = 0; index < num_chunks; index++) {
|
|
if (index == num_chunks - 1) {
|
|
h_lut_indexes[index] = 1;
|
|
} else {
|
|
h_lut_indexes[index] = 0;
|
|
}
|
|
}
|
|
cuda_memcpy_async_to_gpu(is_max_value_lut->get_lut_indexes(0, 0),
|
|
h_lut_indexes, num_chunks * sizeof(Torus),
|
|
streams[0], gpu_indexes[0]);
|
|
is_max_value_lut->broadcast_lut(streams, gpu_indexes);
|
|
}
|
|
lut = is_max_value_lut;
|
|
}
|
|
|
|
// Applies the LUT
|
|
if (remaining_blocks == 1) {
|
|
// In the last iteration we copy the output to the final address
|
|
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
|
streams, gpu_indexes, gpu_count, lwe_array_out, accumulator, bsks,
|
|
ksks, ms_noise_reduction_key, lut, 1);
|
|
// Reset max_value_lut_indexes before returning, otherwise if the lut is
|
|
// reused the lut indexes will be wrong
|
|
cuda_memset_async(is_max_value_lut->get_lut_indexes(0, 0), 0,
|
|
is_max_value_lut->num_blocks * sizeof(Torus),
|
|
streams[0], gpu_indexes[0]);
|
|
is_max_value_lut->broadcast_lut(streams, gpu_indexes);
|
|
reset_radix_ciphertext_blocks(lwe_array_out, 1);
|
|
return;
|
|
} else {
|
|
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
|
streams, gpu_indexes, gpu_count, tmp_out, accumulator, bsks, ksks,
|
|
ms_noise_reduction_key, lut, num_chunks);
|
|
}
|
|
}
|
|
}
|
|
|
|
/* This takes an array of lwe ciphertexts, where each is an encryption of
|
|
* either 0 or 1.
|
|
*
|
|
* It writes in lwe_array_out a single lwe ciphertext encrypting 1 if at least
|
|
* one input ciphertext encrypts 1 otherwise encrypts 0
|
|
*/
|
|
template <typename Torus>
|
|
__host__ void is_at_least_one_comparisons_block_true(
|
|
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
|
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
|
|
CudaRadixCiphertextFFI const *lwe_array_in,
|
|
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
|
|
Torus *const *ksks,
|
|
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
|
uint32_t num_radix_blocks) {
|
|
|
|
if (lwe_array_out->lwe_dimension != lwe_array_in->lwe_dimension)
|
|
PANIC("Cuda error: input lwe dimensions must be the same")
|
|
|
|
if (lwe_array_in->num_radix_blocks < num_radix_blocks)
|
|
PANIC("Cuda error: input num radix blocks should not be lower "
|
|
"than the number of blocks to operate on")
|
|
auto params = mem_ptr->params;
|
|
auto big_lwe_dimension = params.big_lwe_dimension;
|
|
auto message_modulus = params.message_modulus;
|
|
auto carry_modulus = params.carry_modulus;
|
|
|
|
auto buffer = mem_ptr->eq_buffer->are_all_block_true_buffer;
|
|
|
|
uint32_t total_modulus = message_modulus * carry_modulus;
|
|
uint32_t max_value = (total_modulus - 1) / (message_modulus - 1);
|
|
|
|
copy_radix_ciphertext_slice_async<Torus>(
|
|
streams[0], gpu_indexes[0], mem_ptr->tmp_lwe_array_out, 0,
|
|
num_radix_blocks, lwe_array_in, 0, num_radix_blocks);
|
|
|
|
uint32_t remaining_blocks = num_radix_blocks;
|
|
while (remaining_blocks > 0) {
|
|
// Split in max_value chunks
|
|
int num_chunks = (remaining_blocks + max_value - 1) / max_value;
|
|
|
|
// Since all blocks encrypt either 0 or 1, we can sum max_value of them
|
|
// as in the worst case we will be adding `max_value` ones
|
|
auto input_blocks = (Torus *)mem_ptr->tmp_lwe_array_out->ptr;
|
|
auto accumulator = (Torus *)buffer->tmp_block_accumulated->ptr;
|
|
uint32_t chunk_lengths[num_chunks];
|
|
auto begin_remaining_blocks = remaining_blocks;
|
|
for (int i = 0; i < num_chunks; i++) {
|
|
uint32_t chunk_length =
|
|
std::min(max_value, begin_remaining_blocks - i * max_value);
|
|
chunk_lengths[i] = chunk_length;
|
|
accumulate_all_blocks<Torus>(streams[0], gpu_indexes[0], accumulator,
|
|
input_blocks, big_lwe_dimension,
|
|
chunk_length);
|
|
|
|
accumulator += (big_lwe_dimension + 1);
|
|
remaining_blocks -= (chunk_length - 1);
|
|
input_blocks += (big_lwe_dimension + 1) * chunk_length;
|
|
}
|
|
|
|
// Selects a LUT
|
|
int_radix_lut<Torus> *lut = mem_ptr->eq_buffer->is_non_zero_lut;
|
|
|
|
// Applies the LUT
|
|
if (remaining_blocks == 1) {
|
|
// In the last iteration we copy the output to the final address
|
|
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
|
streams, gpu_indexes, gpu_count, lwe_array_out,
|
|
buffer->tmp_block_accumulated, bsks, ksks, ms_noise_reduction_key,
|
|
lut, 1);
|
|
return;
|
|
} else {
|
|
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
|
streams, gpu_indexes, gpu_count, mem_ptr->tmp_lwe_array_out,
|
|
buffer->tmp_block_accumulated, bsks, ksks, ms_noise_reduction_key,
|
|
lut, num_chunks);
|
|
}
|
|
}
|
|
}
|
|
|
|
template <typename Torus>
|
|
__host__ void host_compare_blocks_with_zero(
|
|
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
|
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
|
|
CudaRadixCiphertextFFI const *lwe_array_in,
|
|
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
|
|
Torus *const *ksks,
|
|
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
|
int32_t num_radix_blocks, int_radix_lut<Torus> *zero_comparison) {
|
|
|
|
if (num_radix_blocks == 0)
|
|
return;
|
|
if (lwe_array_out->lwe_dimension != lwe_array_in->lwe_dimension)
|
|
PANIC("Cuda error: input lwe dimensions must be the same")
|
|
if (lwe_array_in->num_radix_blocks < num_radix_blocks)
|
|
PANIC("Cuda error: input num radix blocks should not be lower "
|
|
"than the number of blocks to operate on")
|
|
|
|
auto params = mem_ptr->params;
|
|
auto big_lwe_dimension = params.big_lwe_dimension;
|
|
auto message_modulus = params.message_modulus;
|
|
auto carry_modulus = params.carry_modulus;
|
|
|
|
// The idea is that we will sum chunks of blocks until carries are full
|
|
// then we compare the sum with 0.
|
|
//
|
|
// If all blocks were 0, the sum will be zero
|
|
// If at least one bock was not zero, the sum won't be zero
|
|
uint32_t total_modulus = message_modulus * carry_modulus;
|
|
uint32_t message_max = message_modulus - 1;
|
|
|
|
uint32_t num_elements_to_fill_carry = (total_modulus - 1) / message_max;
|
|
|
|
size_t big_lwe_size = big_lwe_dimension + 1;
|
|
int num_sum_blocks = 0;
|
|
// Accumulator
|
|
auto sum = lwe_array_out;
|
|
|
|
if (num_radix_blocks == 1) {
|
|
// Just copy
|
|
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], sum, 0,
|
|
1, lwe_array_in, 0, 1);
|
|
num_sum_blocks = 1;
|
|
} else {
|
|
uint32_t remainder_blocks = num_radix_blocks;
|
|
auto sum_i = (Torus *)sum->ptr;
|
|
auto chunk = (Torus *)lwe_array_in->ptr;
|
|
while (remainder_blocks > 1) {
|
|
uint32_t chunk_size =
|
|
std::min(remainder_blocks, num_elements_to_fill_carry);
|
|
|
|
accumulate_all_blocks<Torus>(streams[0], gpu_indexes[0], sum_i, chunk,
|
|
big_lwe_dimension, chunk_size);
|
|
|
|
num_sum_blocks++;
|
|
remainder_blocks -= (chunk_size - 1);
|
|
|
|
// Update operands
|
|
chunk += (chunk_size - 1) * big_lwe_size;
|
|
sum_i += big_lwe_size;
|
|
}
|
|
}
|
|
|
|
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
|
streams, gpu_indexes, gpu_count, lwe_array_out, sum, bsks, ksks,
|
|
ms_noise_reduction_key, zero_comparison, num_sum_blocks);
|
|
|
|
reset_radix_ciphertext_blocks(lwe_array_out, num_sum_blocks);
|
|
}
|
|
|
|
template <typename Torus>
|
|
__host__ void host_integer_radix_equality_check_kb(
|
|
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
|
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
|
|
CudaRadixCiphertextFFI const *lwe_array_1,
|
|
CudaRadixCiphertextFFI const *lwe_array_2,
|
|
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
|
|
Torus *const *ksks,
|
|
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
|
uint32_t num_radix_blocks) {
|
|
|
|
if (lwe_array_out->lwe_dimension != lwe_array_1->lwe_dimension ||
|
|
lwe_array_out->lwe_dimension != lwe_array_2->lwe_dimension)
|
|
PANIC("Cuda error: input lwe dimensions must be the same")
|
|
auto eq_buffer = mem_ptr->eq_buffer;
|
|
|
|
// Applies the LUT for the comparison operation
|
|
auto comparisons = mem_ptr->tmp_block_comparisons;
|
|
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
|
streams, gpu_indexes, gpu_count, comparisons, lwe_array_1, lwe_array_2,
|
|
bsks, ksks, ms_noise_reduction_key, eq_buffer->operator_lut,
|
|
num_radix_blocks, eq_buffer->operator_lut->params.message_modulus);
|
|
|
|
// This takes a Vec of blocks, where each block is either 0 or 1.
|
|
//
|
|
// It returns a block encrypting 1 if all input blocks are 1
|
|
// otherwise the block encrypts 0
|
|
are_all_comparisons_block_true<Torus>(
|
|
streams, gpu_indexes, gpu_count, lwe_array_out, comparisons, mem_ptr,
|
|
bsks, ksks, ms_noise_reduction_key, num_radix_blocks);
|
|
}
|
|
|
|
template <typename Torus>
|
|
__host__ void compare_radix_blocks_kb(
|
|
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
|
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
|
|
CudaRadixCiphertextFFI const *lwe_array_left,
|
|
CudaRadixCiphertextFFI const *lwe_array_right,
|
|
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
|
|
Torus *const *ksks,
|
|
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
|
uint32_t num_radix_blocks) {
|
|
|
|
if (lwe_array_out->lwe_dimension != lwe_array_left->lwe_dimension ||
|
|
lwe_array_out->lwe_dimension != lwe_array_right->lwe_dimension)
|
|
PANIC("Cuda error: input and output radix ciphertexts should have the same "
|
|
"lwe dimension")
|
|
auto params = mem_ptr->params;
|
|
auto big_lwe_dimension = params.big_lwe_dimension;
|
|
auto message_modulus = params.message_modulus;
|
|
auto carry_modulus = params.carry_modulus;
|
|
|
|
// When rhs > lhs, the subtraction will overflow, and the bit of padding will
|
|
// be set to 1
|
|
// meaning that the output of the pbs will be the negative (modulo message
|
|
// space)
|
|
//
|
|
// Example:
|
|
// lhs: 1, rhs: 3, message modulus: 4, carry modulus 4
|
|
// lhs - rhs = -2 % (4 * 4) = 14 = 1|1110 (padding_bit|b4b3b2b1)
|
|
// Since there was an overflow the bit of padding is 1 and not 0.
|
|
// When applying the LUT for an input value of 14 we would expect 1,
|
|
// but since the bit of padding is 1, we will get -1 modulus our message
|
|
// space, so (-1) % (4 * 4) = 15 = 1|1111 We then add one and get 0 = 0|0000
|
|
|
|
// Subtract
|
|
host_subtraction<Torus>(
|
|
streams[0], gpu_indexes[0], (Torus *)lwe_array_out->ptr,
|
|
(Torus *)lwe_array_left->ptr, (Torus *)lwe_array_right->ptr,
|
|
big_lwe_dimension, num_radix_blocks);
|
|
|
|
// Apply LUT to compare to 0
|
|
auto is_non_zero_lut = mem_ptr->eq_buffer->is_non_zero_lut;
|
|
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
|
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_out, bsks, ksks,
|
|
ms_noise_reduction_key, is_non_zero_lut, num_radix_blocks);
|
|
|
|
// Add one
|
|
// Here Lhs can have the following values: (-1) % (message modulus * carry
|
|
// modulus), 0, 1 So the output values after the addition will be: 0, 1, 2
|
|
host_integer_radix_add_scalar_one_inplace<Torus>(
|
|
streams, gpu_indexes, gpu_count, lwe_array_out, message_modulus,
|
|
carry_modulus);
|
|
}
|
|
|
|
// Reduces a vec containing shortint blocks that encrypts a sign
|
|
// (inferior, equal, superior) to one single shortint block containing the
|
|
// final sign
|
|
template <typename Torus>
|
|
__host__ void tree_sign_reduction(
|
|
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
|
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
|
|
CudaRadixCiphertextFFI *lwe_block_comparisons,
|
|
int_tree_sign_reduction_buffer<Torus> *tree_buffer,
|
|
std::function<Torus(Torus)> sign_handler_f, void *const *bsks,
|
|
Torus *const *ksks,
|
|
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
|
uint32_t num_radix_blocks) {
|
|
|
|
if (lwe_array_out->lwe_dimension != lwe_block_comparisons->lwe_dimension)
|
|
PANIC("Cuda error: input lwe dimensions must be the same")
|
|
if (lwe_block_comparisons->num_radix_blocks < num_radix_blocks)
|
|
PANIC("Cuda error: block comparisons num radix blocks should not be lower "
|
|
"than the number of blocks to operate on")
|
|
|
|
auto params = tree_buffer->params;
|
|
auto big_lwe_dimension = params.big_lwe_dimension;
|
|
auto glwe_dimension = params.glwe_dimension;
|
|
auto polynomial_size = params.polynomial_size;
|
|
auto message_modulus = params.message_modulus;
|
|
auto carry_modulus = params.carry_modulus;
|
|
|
|
// Tree reduction
|
|
// Reduces a vec containing shortint blocks that encrypts a sign
|
|
// (inferior, equal, superior) to one single shortint block containing the
|
|
// final sign
|
|
auto x = tree_buffer->tmp_x;
|
|
auto y = tree_buffer->tmp_y;
|
|
if (x != lwe_block_comparisons)
|
|
copy_radix_ciphertext_slice_async<Torus>(
|
|
streams[0], gpu_indexes[0], x, 0, num_radix_blocks,
|
|
lwe_block_comparisons, 0, num_radix_blocks);
|
|
|
|
uint32_t partial_block_count = num_radix_blocks;
|
|
|
|
auto inner_tree_leaf = tree_buffer->tree_inner_leaf_lut;
|
|
while (partial_block_count > 2) {
|
|
pack_blocks<Torus>(streams[0], gpu_indexes[0], y, x, partial_block_count,
|
|
message_modulus);
|
|
|
|
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
|
streams, gpu_indexes, gpu_count, x, y, bsks, ksks,
|
|
ms_noise_reduction_key, inner_tree_leaf, partial_block_count >> 1);
|
|
|
|
if ((partial_block_count % 2) != 0) {
|
|
partial_block_count >>= 1;
|
|
partial_block_count++;
|
|
|
|
copy_radix_ciphertext_slice_async<Torus>(
|
|
streams[0], gpu_indexes[0], x, partial_block_count - 1,
|
|
partial_block_count, y, partial_block_count - 1, partial_block_count);
|
|
} else {
|
|
partial_block_count >>= 1;
|
|
}
|
|
}
|
|
|
|
auto last_lut = tree_buffer->tree_last_leaf_lut;
|
|
auto block_selector_f = tree_buffer->block_selector_f;
|
|
std::function<Torus(Torus)> f;
|
|
auto num_bits_in_message = log2_int(params.message_modulus);
|
|
if (partial_block_count == 2) {
|
|
pack_blocks<Torus>(streams[0], gpu_indexes[0], y, x, partial_block_count,
|
|
message_modulus);
|
|
|
|
f = [block_selector_f, sign_handler_f, num_bits_in_message,
|
|
message_modulus](Torus x) -> Torus {
|
|
Torus msb = (x >> num_bits_in_message) & (message_modulus - 1);
|
|
Torus lsb = x & (message_modulus - 1);
|
|
|
|
Torus final_sign = block_selector_f(msb, lsb);
|
|
return sign_handler_f(final_sign);
|
|
};
|
|
} else {
|
|
// partial_block_count == 1
|
|
y = x;
|
|
f = sign_handler_f;
|
|
}
|
|
generate_device_accumulator<Torus>(
|
|
streams[0], gpu_indexes[0], last_lut->get_lut(0, 0),
|
|
last_lut->get_degree(0), last_lut->get_max_degree(0), glwe_dimension,
|
|
polynomial_size, message_modulus, carry_modulus, f, true);
|
|
last_lut->broadcast_lut(streams, gpu_indexes);
|
|
|
|
// Last leaf
|
|
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
|
streams, gpu_indexes, gpu_count, lwe_array_out, y, bsks, ksks,
|
|
ms_noise_reduction_key, last_lut, 1);
|
|
}
|
|
|
|
template <typename Torus>
|
|
__host__ void host_integer_radix_difference_check_kb(
|
|
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
|
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
|
|
CudaRadixCiphertextFFI const *lwe_array_left,
|
|
CudaRadixCiphertextFFI const *lwe_array_right,
|
|
int_comparison_buffer<Torus> *mem_ptr,
|
|
std::function<Torus(Torus)> reduction_lut_f, void *const *bsks,
|
|
Torus *const *ksks,
|
|
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
|
uint32_t num_radix_blocks) {
|
|
|
|
if (lwe_array_out->lwe_dimension != lwe_array_left->lwe_dimension ||
|
|
lwe_array_out->lwe_dimension != lwe_array_right->lwe_dimension)
|
|
PANIC("Cuda error: input lwe dimensions must be the same")
|
|
|
|
auto diff_buffer = mem_ptr->diff_buffer;
|
|
|
|
auto params = mem_ptr->params;
|
|
auto message_modulus = params.message_modulus;
|
|
auto carry_modulus = params.carry_modulus;
|
|
|
|
uint32_t packed_num_radix_blocks = num_radix_blocks;
|
|
CudaRadixCiphertextFFI lhs;
|
|
as_radix_ciphertext_slice<Torus>(&lhs, diff_buffer->tmp_packed, 0,
|
|
num_radix_blocks / 2);
|
|
CudaRadixCiphertextFFI rhs;
|
|
as_radix_ciphertext_slice<Torus>(&rhs, diff_buffer->tmp_packed,
|
|
num_radix_blocks / 2, num_radix_blocks);
|
|
if (carry_modulus >= message_modulus) {
|
|
// Packing is possible
|
|
// Pack inputs
|
|
// In case the ciphertext is signed, the sign block and the one before it
|
|
// are handled separately
|
|
if (mem_ptr->is_signed) {
|
|
packed_num_radix_blocks -= 2;
|
|
}
|
|
pack_blocks<Torus>(streams[0], gpu_indexes[0], &lhs, lwe_array_left,
|
|
packed_num_radix_blocks, message_modulus);
|
|
pack_blocks<Torus>(streams[0], gpu_indexes[0], &rhs, lwe_array_right,
|
|
packed_num_radix_blocks, message_modulus);
|
|
// From this point we have half number of blocks
|
|
packed_num_radix_blocks /= 2;
|
|
|
|
// Clean noise
|
|
auto identity_lut = mem_ptr->identity_lut;
|
|
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
|
streams, gpu_indexes, gpu_count, diff_buffer->tmp_packed,
|
|
diff_buffer->tmp_packed, bsks, ksks, ms_noise_reduction_key,
|
|
identity_lut, 2 * packed_num_radix_blocks);
|
|
} else {
|
|
as_radix_ciphertext_slice<Torus>(&lhs, lwe_array_left, 0,
|
|
lwe_array_left->num_radix_blocks);
|
|
as_radix_ciphertext_slice<Torus>(&rhs, lwe_array_right, 0,
|
|
lwe_array_right->num_radix_blocks);
|
|
}
|
|
|
|
// comparisons will be assigned
|
|
// - 0 if lhs < rhs
|
|
// - 1 if lhs == rhs
|
|
// - 2 if lhs > rhs
|
|
auto comparisons = mem_ptr->tmp_block_comparisons;
|
|
auto num_comparisons = 0;
|
|
if (!mem_ptr->is_signed) {
|
|
// Compare packed blocks, or simply the total number of radix blocks in the
|
|
// inputs
|
|
compare_radix_blocks_kb<Torus>(
|
|
streams, gpu_indexes, gpu_count, comparisons, &lhs, &rhs, mem_ptr, bsks,
|
|
ksks, ms_noise_reduction_key, packed_num_radix_blocks);
|
|
num_comparisons = packed_num_radix_blocks;
|
|
} else {
|
|
// Packing is possible
|
|
if (carry_modulus >= message_modulus) {
|
|
// Compare (num_radix_blocks - 2) / 2 packed blocks
|
|
compare_radix_blocks_kb<Torus>(
|
|
streams, gpu_indexes, gpu_count, comparisons, &lhs, &rhs, mem_ptr,
|
|
bsks, ksks, ms_noise_reduction_key, packed_num_radix_blocks);
|
|
|
|
// Compare the last block before the sign block separately
|
|
auto identity_lut = mem_ptr->identity_lut;
|
|
CudaRadixCiphertextFFI last_left_block_before_sign_block;
|
|
as_radix_ciphertext_slice<Torus>(
|
|
&last_left_block_before_sign_block, diff_buffer->tmp_packed,
|
|
packed_num_radix_blocks, packed_num_radix_blocks + 1);
|
|
CudaRadixCiphertextFFI shifted_lwe_array_left;
|
|
as_radix_ciphertext_slice<Torus>(&shifted_lwe_array_left, lwe_array_left,
|
|
num_radix_blocks - 2,
|
|
num_radix_blocks - 1);
|
|
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
|
streams, gpu_indexes, gpu_count, &last_left_block_before_sign_block,
|
|
&shifted_lwe_array_left, bsks, ksks, ms_noise_reduction_key,
|
|
identity_lut, 1);
|
|
|
|
CudaRadixCiphertextFFI last_right_block_before_sign_block;
|
|
as_radix_ciphertext_slice<Torus>(
|
|
&last_right_block_before_sign_block, diff_buffer->tmp_packed,
|
|
num_radix_blocks / 2 + packed_num_radix_blocks,
|
|
num_radix_blocks / 2 + packed_num_radix_blocks + 1);
|
|
CudaRadixCiphertextFFI shifted_lwe_array_right;
|
|
as_radix_ciphertext_slice<Torus>(&shifted_lwe_array_right,
|
|
lwe_array_right, num_radix_blocks - 2,
|
|
num_radix_blocks - 1);
|
|
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
|
streams, gpu_indexes, gpu_count, &last_right_block_before_sign_block,
|
|
&shifted_lwe_array_right, bsks, ksks, ms_noise_reduction_key,
|
|
identity_lut, 1);
|
|
|
|
CudaRadixCiphertextFFI shifted_comparisons;
|
|
as_radix_ciphertext_slice<Torus>(&shifted_comparisons, comparisons,
|
|
packed_num_radix_blocks,
|
|
packed_num_radix_blocks + 1);
|
|
compare_radix_blocks_kb<Torus>(
|
|
streams, gpu_indexes, gpu_count, &shifted_comparisons,
|
|
&last_left_block_before_sign_block,
|
|
&last_right_block_before_sign_block, mem_ptr, bsks, ksks,
|
|
ms_noise_reduction_key, 1);
|
|
|
|
// Compare the sign block separately
|
|
as_radix_ciphertext_slice<Torus>(&shifted_comparisons, comparisons,
|
|
packed_num_radix_blocks + 1,
|
|
packed_num_radix_blocks + 2);
|
|
CudaRadixCiphertextFFI last_left_block;
|
|
as_radix_ciphertext_slice<Torus>(&last_left_block, lwe_array_left,
|
|
num_radix_blocks - 1, num_radix_blocks);
|
|
CudaRadixCiphertextFFI last_right_block;
|
|
as_radix_ciphertext_slice<Torus>(&last_right_block, lwe_array_right,
|
|
num_radix_blocks - 1, num_radix_blocks);
|
|
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
|
streams, gpu_indexes, gpu_count, &shifted_comparisons,
|
|
&last_left_block, &last_right_block, bsks, ksks,
|
|
ms_noise_reduction_key, mem_ptr->signed_lut, 1,
|
|
mem_ptr->signed_lut->params.message_modulus);
|
|
num_comparisons = packed_num_radix_blocks + 2;
|
|
|
|
} else {
|
|
compare_radix_blocks_kb<Torus>(
|
|
streams, gpu_indexes, gpu_count, comparisons, lwe_array_left,
|
|
lwe_array_right, mem_ptr, bsks, ksks, ms_noise_reduction_key,
|
|
num_radix_blocks - 1);
|
|
// Compare the sign block separately
|
|
CudaRadixCiphertextFFI shifted_comparisons;
|
|
as_radix_ciphertext_slice<Torus>(&shifted_comparisons, comparisons,
|
|
num_radix_blocks - 1, num_radix_blocks);
|
|
CudaRadixCiphertextFFI last_left_block;
|
|
as_radix_ciphertext_slice<Torus>(&last_left_block, lwe_array_left,
|
|
num_radix_blocks - 1, num_radix_blocks);
|
|
CudaRadixCiphertextFFI last_right_block;
|
|
as_radix_ciphertext_slice<Torus>(&last_right_block, lwe_array_right,
|
|
num_radix_blocks - 1, num_radix_blocks);
|
|
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
|
streams, gpu_indexes, gpu_count, &shifted_comparisons,
|
|
&last_left_block, &last_right_block, bsks, ksks,
|
|
ms_noise_reduction_key, mem_ptr->signed_lut, 1,
|
|
mem_ptr->signed_lut->params.message_modulus);
|
|
num_comparisons = num_radix_blocks;
|
|
}
|
|
}
|
|
|
|
// Reduces a vec containing radix blocks that encrypts a sign
|
|
// (inferior, equal, superior) to one single radix block containing the
|
|
// final sign
|
|
tree_sign_reduction<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
|
|
comparisons, mem_ptr->diff_buffer->tree_buffer,
|
|
reduction_lut_f, bsks, ksks,
|
|
ms_noise_reduction_key, num_comparisons);
|
|
}
|
|
|
|
template <typename Torus>
|
|
__host__ uint64_t scratch_cuda_integer_radix_comparison_check_kb(
|
|
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
|
uint32_t gpu_count, int_comparison_buffer<Torus> **mem_ptr,
|
|
uint32_t num_radix_blocks, int_radix_params params, COMPARISON_TYPE op,
|
|
bool is_signed, bool allocate_gpu_memory) {
|
|
|
|
uint64_t size_tracker = 0;
|
|
*mem_ptr = new int_comparison_buffer<Torus>(
|
|
streams, gpu_indexes, gpu_count, op, params, num_radix_blocks, is_signed,
|
|
allocate_gpu_memory, size_tracker);
|
|
return size_tracker;
|
|
}
|
|
|
|
template <typename Torus>
|
|
__host__ void host_integer_radix_maxmin_kb(
|
|
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
|
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
|
|
CudaRadixCiphertextFFI const *lwe_array_left,
|
|
CudaRadixCiphertextFFI const *lwe_array_right,
|
|
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
|
|
Torus *const *ksks,
|
|
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
|
uint32_t num_radix_blocks) {
|
|
|
|
if (lwe_array_out->lwe_dimension != lwe_array_left->lwe_dimension ||
|
|
lwe_array_out->lwe_dimension != lwe_array_right->lwe_dimension)
|
|
PANIC("Cuda error: input and output lwe dimensions must be the same")
|
|
if (lwe_array_out->num_radix_blocks < num_radix_blocks ||
|
|
lwe_array_left->num_radix_blocks < num_radix_blocks ||
|
|
lwe_array_right->num_radix_blocks < num_radix_blocks)
|
|
PANIC("Cuda error: input and output num radix blocks should not be lower "
|
|
"than the number of blocks to operate on")
|
|
|
|
// Compute the sign
|
|
host_integer_radix_difference_check_kb<Torus>(
|
|
streams, gpu_indexes, gpu_count, mem_ptr->tmp_lwe_array_out,
|
|
lwe_array_left, lwe_array_right, mem_ptr, mem_ptr->identity_lut_f, bsks,
|
|
ksks, ms_noise_reduction_key, num_radix_blocks);
|
|
|
|
// Selector
|
|
host_integer_radix_cmux_kb<Torus>(
|
|
streams, gpu_indexes, gpu_count, lwe_array_out,
|
|
mem_ptr->tmp_lwe_array_out, lwe_array_left, lwe_array_right,
|
|
mem_ptr->cmux_buffer, bsks, ksks, ms_noise_reduction_key);
|
|
}
|
|
|
|
template <typename Torus>
|
|
__host__ void host_integer_are_all_comparisons_block_true_kb(
|
|
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
|
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
|
|
CudaRadixCiphertextFFI const *lwe_array_in,
|
|
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
|
|
Torus *const *ksks,
|
|
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
|
uint32_t num_radix_blocks) {
|
|
|
|
// It returns a block encrypting 1 if all input blocks are 1
|
|
// otherwise the block encrypts 0
|
|
are_all_comparisons_block_true<Torus>(
|
|
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, mem_ptr,
|
|
bsks, ksks, ms_noise_reduction_key, num_radix_blocks);
|
|
}
|
|
|
|
template <typename Torus>
|
|
__host__ void host_integer_is_at_least_one_comparisons_block_true_kb(
|
|
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
|
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
|
|
CudaRadixCiphertextFFI const *lwe_array_in,
|
|
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
|
|
Torus *const *ksks,
|
|
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
|
uint32_t num_radix_blocks) {
|
|
|
|
// It returns a block encrypting 1 if all input blocks are 1
|
|
// otherwise the block encrypts 0
|
|
is_at_least_one_comparisons_block_true<Torus>(
|
|
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, mem_ptr,
|
|
bsks, ksks, ms_noise_reduction_key, num_radix_blocks);
|
|
}
|
|
#endif
|