Compare commits

...

4 Commits

Author       SHA1        Message                                                    Date
Agnes Leroy  eb3b988380  Use internal streams                                       2025-12-05 15:28:07 +01:00
Agnes Leroy  ec19579c39  Wrap erc20 from backend to hl api                          2025-12-05 15:23:08 +01:00
Agnes Leroy  8ed3b4b59d  chore(gpu): reuse CPU LUT buffer to generate accumulators  2025-12-05 15:23:07 +01:00
Agnes Leroy  20daf182f0  Experiment with erc20 in the backend                       2025-12-05 15:23:07 +01:00
28 changed files with 992 additions and 171 deletions

View File

@@ -86,6 +86,7 @@ fn main() {
"cuda/include/integer/integer.h",
"cuda/include/integer/rerand.h",
"cuda/include/aes/aes.h",
"cuda/include/erc20/erc20.h",
"cuda/include/zk/zk.h",
"cuda/include/keyswitch/keyswitch.h",
"cuda/include/keyswitch/ks_enums.h",

View File

@@ -0,0 +1,20 @@
#pragma once
#include "../integer/integer.h"
extern "C" {
uint64_t scratch_cuda_erc20_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t lwe_ciphertext_count, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
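// In-place transfer on device ciphertexts: performs the encrypted funds
// check, masks the amount with it, then updates both balances.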
void cuda_erc20_assign_64(CudaStreamsFFI streams,
CudaRadixCiphertextFFI *from_amount,
CudaRadixCiphertextFFI *to_amount,
CudaRadixCiphertextFFI const *amount, int8_t *mem_ptr,
void *const *bsks, void *const *ksks);
void cleanup_cuda_erc20(CudaStreamsFFI streams, int8_t **mem_ptr_void);
}

View File

@@ -0,0 +1,82 @@
#pragma once
#include "../integer/integer_utilities.h"
#include "integer/comparison.h"
#include "integer/multiplication.h"
#include "integer/subtraction.h"
template <typename Torus> struct int_erc20_buffer {
int_radix_params params;
int_comparison_buffer<Torus> *diff_buffer;
int_mul_memory<Torus> *mul_buffer;
int_sc_prop_memory<Torus> *add_buffer;
int_sub_and_propagate<Torus> *sub_buffer;
CudaRadixCiphertextFFI *tmp_amount;
CudaRadixCiphertextFFI *has_enough_funds;
CudaStreams active_streams;
InternalCudaStreams internal_cuda_streams;
uint32_t num_internal_streams;
bool allocate_gpu_memory;
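// Single host-side LUT staging buffer, reused by every accumulator generated
// in the sub-buffers below and freed in release() after a final stream sync.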
Torus *preallocated_h_lut;
int_erc20_buffer(CudaStreams streams, int_radix_params params,
uint32_t num_radix_blocks, bool allocate_gpu_memory,
uint64_t &size_tracker) {
this->params = params;
this->allocate_gpu_memory = allocate_gpu_memory;
preallocated_h_lut = (Torus *)malloc(
(params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus));
diff_buffer = new int_comparison_buffer<Torus>(
streams, COMPARISON_TYPE::GT, params, num_radix_blocks, false,
allocate_gpu_memory, size_tracker, preallocated_h_lut);
mul_buffer = new int_mul_memory<Torus>(
streams, params, false, true, num_radix_blocks, allocate_gpu_memory,
size_tracker, preallocated_h_lut);
add_buffer = new int_sc_prop_memory<Torus>(
streams, params, num_radix_blocks, FLAG_NONE, allocate_gpu_memory,
size_tracker, preallocated_h_lut);
sub_buffer = new int_sub_and_propagate<Torus>(
streams, params, num_radix_blocks, FLAG_NONE, allocate_gpu_memory,
size_tracker, preallocated_h_lut);
tmp_amount = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), tmp_amount, num_radix_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
has_enough_funds = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), has_enough_funds, 1,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
active_streams = streams.active_gpu_subset(num_radix_blocks);
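// Two internal streams so the final addition and subtraction can run
// concurrently (see host_erc20_assign).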
num_internal_streams = 2;
this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
active_streams, num_internal_streams);
}
void release(CudaStreams streams) {
diff_buffer->release(streams);
delete diff_buffer;
diff_buffer = nullptr;
mul_buffer->release(streams);
delete mul_buffer;
mul_buffer = nullptr;
add_buffer->release(streams);
delete add_buffer;
add_buffer = nullptr;
sub_buffer->release(streams);
delete sub_buffer;
sub_buffer = nullptr;
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
tmp_amount, this->allocate_gpu_memory);
delete tmp_amount;
tmp_amount = nullptr;
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
has_enough_funds, this->allocate_gpu_memory);
delete has_enough_funds;
has_enough_funds = nullptr;
internal_cuda_streams.release(streams);
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
free(preallocated_h_lut);
}
};

View File

@@ -43,7 +43,8 @@ template <typename Torus> struct int_cmux_buffer {
int_cmux_buffer(CudaStreams streams,
std::function<Torus(Torus)> predicate_lut_f,
int_radix_params params, uint32_t num_radix_blocks,
bool allocate_gpu_memory, uint64_t &size_tracker) {
bool allocate_gpu_memory, uint64_t &size_tracker,
Torus *preallocated_h_lut = nullptr) {
gpu_memory_allocated = allocate_gpu_memory;
this->params = params;
@@ -88,20 +89,21 @@ template <typename Torus> struct int_cmux_buffer {
streams.stream(0), streams.gpu_index(0), predicate_lut->get_lut(0, 0),
predicate_lut->get_degree(0), predicate_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, inverted_lut_f, gpu_memory_allocated);
params.carry_modulus, inverted_lut_f, gpu_memory_allocated,
preallocated_h_lut);
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0), predicate_lut->get_lut(0, 1),
predicate_lut->get_degree(1), predicate_lut->get_max_degree(1),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_f, gpu_memory_allocated);
params.carry_modulus, lut_f, gpu_memory_allocated, preallocated_h_lut);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
message_extract_lut->get_lut(0, 0), message_extract_lut->get_degree(0),
message_extract_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
message_extract_lut_f, gpu_memory_allocated);
message_extract_lut_f, gpu_memory_allocated, preallocated_h_lut);
Torus *h_lut_indexes = predicate_lut->h_lut_indexes;
for (int index = 0; index < 2 * num_radix_blocks; index++) {
if (index < num_radix_blocks) {

View File

@@ -384,7 +384,8 @@ template <typename Torus> struct int_comparison_buffer {
int_comparison_buffer(CudaStreams streams, COMPARISON_TYPE op,
int_radix_params params, uint32_t num_radix_blocks,
bool is_signed, bool allocate_gpu_memory,
uint64_t &size_tracker) {
uint64_t &size_tracker,
Torus *preallocated_h_lut_from_elsewhere = nullptr) {
gpu_memory_allocated = allocate_gpu_memory;
this->params = params;
this->op = op;
@@ -426,7 +427,8 @@ template <typename Torus> struct int_comparison_buffer {
streams.stream(0), streams.gpu_index(0), identity_lut->get_lut(0, 0),
identity_lut->get_degree(0), identity_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, identity_lut_f, gpu_memory_allocated);
params.carry_modulus, identity_lut_f, gpu_memory_allocated,
preallocated_h_lut_from_elsewhere);
identity_lut->broadcast_lut(active_streams);
uint32_t total_modulus = params.message_modulus * params.carry_modulus;
@@ -441,7 +443,8 @@ template <typename Torus> struct int_comparison_buffer {
streams.stream(0), streams.gpu_index(0), is_zero_lut->get_lut(0, 0),
is_zero_lut->get_degree(0), is_zero_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, is_zero_f, gpu_memory_allocated);
params.carry_modulus, is_zero_f, gpu_memory_allocated,
preallocated_h_lut_from_elsewhere);
is_zero_lut->broadcast_lut(active_streams);
@@ -456,7 +459,8 @@ template <typename Torus> struct int_comparison_buffer {
else
return (x == IS_INFERIOR);
},
params, num_radix_blocks, allocate_gpu_memory, size_tracker);
params, num_radix_blocks, allocate_gpu_memory, size_tracker,
preallocated_h_lut_from_elsewhere);
case COMPARISON_TYPE::GT:
case COMPARISON_TYPE::GE:
case COMPARISON_TYPE::LT:

View File

@@ -77,7 +77,8 @@ void generate_device_accumulator_bivariate(
cudaStream_t stream, uint32_t gpu_index, Torus *acc_bivariate,
uint64_t *degree, uint64_t *max_degree, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t message_modulus, uint32_t carry_modulus,
std::function<Torus(Torus, Torus)> f, bool gpu_memory_allocated);
std::function<Torus(Torus, Torus)> f, bool gpu_memory_allocated,
Torus *preallocated_h_lut = nullptr);
template <typename Torus>
void generate_device_accumulator_bivariate_with_factor(
@@ -114,14 +115,16 @@ void generate_device_accumulator(
cudaStream_t stream, uint32_t gpu_index, Torus *acc, uint64_t *degree,
uint64_t *max_degree, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t message_modulus, uint32_t carry_modulus,
std::function<Torus(Torus)> f, bool gpu_memory_allocated);
std::function<Torus(Torus)> f, bool gpu_memory_allocated,
Torus *preallocated_h_lut = nullptr);
template <typename Torus>
void generate_many_lut_device_accumulator(
cudaStream_t stream, uint32_t gpu_index, Torus *acc, uint64_t *degrees,
uint64_t *max_degree, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t message_modulus, uint32_t carry_modulus,
std::vector<std::function<Torus(Torus)>> &f, bool gpu_memory_allocated);
std::vector<std::function<Torus(Torus)>> &f, bool gpu_memory_allocated,
Torus *preallocated_h_lut = nullptr);
struct radix_columns {
std::vector<uint32_t> columns_counter;
@@ -1160,6 +1163,7 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
bool mem_reuse = false;
bool allocated_luts_message_carry;
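// Optional caller-owned host LUT staging buffer; when nullptr, each LUT
// generation allocates and synchronously frees its own buffer.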
Torus *preallocated_h_lut;
void setup_index_buffers(CudaStreams streams, uint64_t &size_tracker) {
@@ -1206,7 +1210,8 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
}
void setup_lookup_tables(CudaStreams streams, uint32_t num_radix_in_vec,
const uint64_t *const degrees) {
const uint64_t *const degrees,
Torus *preallocated_h_lut = nullptr) {
uint32_t message_modulus = params.message_modulus;
bool _needs_processing = false;
radix_columns current_columns(degrees, num_blocks_in_radix,
@@ -1257,13 +1262,13 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
luts_message_carry->get_degree(0),
luts_message_carry->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, message_modulus, params.carry_modulus,
lut_f_message, gpu_memory_allocated);
lut_f_message, gpu_memory_allocated, preallocated_h_lut);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), carry_acc,
luts_message_carry->get_degree(1),
luts_message_carry->get_max_degree(1), params.glwe_dimension,
params.polynomial_size, message_modulus, params.carry_modulus,
lut_f_carry, gpu_memory_allocated);
lut_f_carry, gpu_memory_allocated, preallocated_h_lut);
auto active_gpu_count_mc = streams.active_gpu_subset(pbs_count);
luts_message_carry->broadcast_lut(active_gpu_count_mc);
}
@@ -1272,7 +1277,8 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
CudaStreams streams, int_radix_params params,
uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec,
bool reduce_degrees_for_single_carry_propagation,
bool allocate_gpu_memory, uint64_t &size_tracker) {
bool allocate_gpu_memory, uint64_t &size_tracker,
Torus *preallocated_h_lut = nullptr) {
this->params = params;
this->mem_reuse = false;
this->max_total_blocks_in_vec = num_blocks_in_radix * max_num_radix_in_vec;
@@ -1284,6 +1290,7 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
this->allocated_luts_message_carry = false;
this->reduce_degrees_for_single_carry_propagation =
reduce_degrees_for_single_carry_propagation;
this->preallocated_h_lut = preallocated_h_lut;
setup_index_buffers(streams, size_tracker);
// because setup_lookup_tables is called in the host function for
// sum_ciphertexts, to save memory
@@ -1318,7 +1325,8 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
CudaRadixCiphertextFFI *small_lwe_vector,
int_radix_lut<Torus> *reused_lut,
bool reduce_degrees_for_single_carry_propagation,
bool allocate_gpu_memory, uint64_t &size_tracker) {
bool allocate_gpu_memory, uint64_t &size_tracker,
Torus *preallocated_h_lut = nullptr) {
this->mem_reuse = true;
this->params = params;
this->max_total_blocks_in_vec = num_blocks_in_radix * max_num_radix_in_vec;
@@ -1334,6 +1342,7 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
this->current_blocks = current_blocks;
this->small_lwe_vector = small_lwe_vector;
this->luts_message_carry = reused_lut;
this->preallocated_h_lut = preallocated_h_lut;
uint64_t message_modulus_bits = (uint64_t)std::log2(params.message_modulus);
uint64_t carry_modulus_bits = (uint64_t)std::log2(params.carry_modulus);
@@ -1395,10 +1404,12 @@ template <typename Torus> struct int_seq_group_prop_memory {
int_radix_lut<Torus> *lut_sequential_algorithm;
uint32_t grouping_size;
bool gpu_memory_allocated;
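// Kept as a member so release() can free it after the final stream sync.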
Torus *h_seq_lut_indexes;
int_seq_group_prop_memory(CudaStreams streams, int_radix_params params,
uint32_t group_size, uint32_t big_lwe_size_bytes,
bool allocate_gpu_memory, uint64_t &size_tracker) {
bool allocate_gpu_memory, uint64_t &size_tracker,
Torus *preallocated_h_lut = nullptr) {
gpu_memory_allocated = allocate_gpu_memory;
auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
@@ -1413,7 +1424,7 @@ template <typename Torus> struct int_seq_group_prop_memory {
allocate_gpu_memory);
int num_seq_luts = grouping_size - 1;
Torus *h_seq_lut_indexes = (Torus *)malloc(num_seq_luts * sizeof(Torus));
h_seq_lut_indexes = (Torus *)malloc(num_seq_luts * sizeof(Torus));
lut_sequential_algorithm =
new int_radix_lut<Torus>(streams, params, num_seq_luts, num_seq_luts,
allocate_gpu_memory, size_tracker);
@@ -1427,7 +1438,7 @@ template <typename Torus> struct int_seq_group_prop_memory {
lut_sequential_algorithm->get_degree(index),
lut_sequential_algorithm->get_max_degree(index), glwe_dimension,
polynomial_size, message_modulus, carry_modulus, f_lut_sequential,
gpu_memory_allocated);
gpu_memory_allocated, preallocated_h_lut);
h_seq_lut_indexes[index] = index;
}
Torus *seq_lut_indexes = lut_sequential_algorithm->get_lut_indexes(0, 0);
@@ -1436,7 +1447,6 @@ template <typename Torus> struct int_seq_group_prop_memory {
streams.stream(0), streams.gpu_index(0), allocate_gpu_memory);
auto active_streams = streams.active_gpu_subset(num_seq_luts);
lut_sequential_algorithm->broadcast_lut(active_streams);
free(h_seq_lut_indexes);
};
void release(CudaStreams streams) {
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
@@ -1446,6 +1456,7 @@ template <typename Torus> struct int_seq_group_prop_memory {
delete group_resolved_carries;
delete lut_sequential_algorithm;
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
free(h_seq_lut_indexes);
};
};
@@ -1457,7 +1468,8 @@ template <typename Torus> struct int_hs_group_prop_memory {
int_hs_group_prop_memory(CudaStreams streams, int_radix_params params,
uint32_t num_groups, uint32_t big_lwe_size_bytes,
bool allocate_gpu_memory, uint64_t &size_tracker) {
bool allocate_gpu_memory, uint64_t &size_tracker,
Torus *preallocated_h_lut = nullptr) {
gpu_memory_allocated = allocate_gpu_memory;
auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
@@ -1487,7 +1499,7 @@ template <typename Torus> struct int_hs_group_prop_memory {
lut_hillis_steele->get_lut(0, 0), lut_hillis_steele->get_degree(0),
lut_hillis_steele->get_max_degree(0), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, f_lut_hillis_steele,
gpu_memory_allocated);
gpu_memory_allocated, preallocated_h_lut);
auto active_streams = streams.active_gpu_subset(num_groups);
lut_hillis_steele->broadcast_lut(active_streams);
};
@@ -1511,7 +1523,7 @@ template <typename Torus> struct int_shifted_blocks_and_states_memory {
int_shifted_blocks_and_states_memory(
CudaStreams streams, int_radix_params params, uint32_t num_radix_blocks,
uint32_t num_many_lut, uint32_t grouping_size, bool allocate_gpu_memory,
uint64_t &size_tracker) {
uint64_t &size_tracker, Torus *preallocated_h_lut = nullptr) {
gpu_memory_allocated = allocate_gpu_memory;
auto glwe_dimension = params.glwe_dimension;
@@ -1561,7 +1573,7 @@ template <typename Torus> struct int_shifted_blocks_and_states_memory {
streams.stream(0), streams.gpu_index(0), first_block_lut,
first_block_lut_degrees, first_block_lut_max_degree, glwe_dimension,
polynomial_size, message_modulus, carry_modulus, f_first_grouping_luts,
gpu_memory_allocated);
gpu_memory_allocated, preallocated_h_lut);
// luts for other blocks of the first grouping
for (int lut_id = 1; lut_id < grouping_size; lut_id++) {
@@ -1584,7 +1596,8 @@ template <typename Torus> struct int_shifted_blocks_and_states_memory {
generate_many_lut_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), lut, lut_degrees,
lut_max_degree, glwe_dimension, polynomial_size, message_modulus,
carry_modulus, f_grouping_luts, gpu_memory_allocated);
carry_modulus, f_grouping_luts, gpu_memory_allocated,
preallocated_h_lut);
}
// luts for the rest of groupings (except for the last block)
@@ -1610,7 +1623,8 @@ template <typename Torus> struct int_shifted_blocks_and_states_memory {
generate_many_lut_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), lut, lut_degrees,
lut_max_degree, glwe_dimension, polynomial_size, message_modulus,
carry_modulus, f_grouping_luts, gpu_memory_allocated);
carry_modulus, f_grouping_luts, gpu_memory_allocated,
preallocated_h_lut);
}
// For the last block we need to generate a new lut
@@ -1635,7 +1649,7 @@ template <typename Torus> struct int_shifted_blocks_and_states_memory {
streams.stream(0), streams.gpu_index(0), last_block_lut,
last_block_lut_degrees, last_block_lut_max_degree, glwe_dimension,
polynomial_size, message_modulus, carry_modulus, f_last_grouping_luts,
gpu_memory_allocated);
gpu_memory_allocated, preallocated_h_lut);
// Generate the indexes to switch between luts within the pbs
uint64_t lut_indexes_size = num_radix_blocks * sizeof(Torus);
@@ -1706,11 +1720,12 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
uint32_t group_size;
bool use_sequential_algorithm_to_resolve_group_carries;
bool gpu_memory_allocated;
Torus *h_second_lut_indexes;
int_prop_simu_group_carries_memory(
CudaStreams streams, int_radix_params params, uint32_t num_radix_blocks,
uint32_t grouping_size, uint32_t num_groups, bool allocate_gpu_memory,
uint64_t &size_tracker) {
uint64_t &size_tracker, Torus *preallocated_h_lut = nullptr) {
gpu_memory_allocated = allocate_gpu_memory;
auto glwe_dimension = params.glwe_dimension;
@@ -1803,7 +1818,8 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
luts_array_second_step->get_degree(lut_id),
luts_array_second_step->get_max_degree(lut_id), glwe_dimension,
polynomial_size, message_modulus, carry_modulus,
f_first_grouping_inner_propagation, gpu_memory_allocated);
f_first_grouping_inner_propagation, gpu_memory_allocated,
preallocated_h_lut);
}
auto f_first_grouping_outer_propagation =
@@ -1818,7 +1834,8 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
luts_array_second_step->get_degree(lut_id),
luts_array_second_step->get_max_degree(lut_id), glwe_dimension,
polynomial_size, message_modulus, carry_modulus,
f_first_grouping_outer_propagation, gpu_memory_allocated);
f_first_grouping_outer_propagation, gpu_memory_allocated,
preallocated_h_lut);
// for other groupings inner propagation
for (int index = 0; index < grouping_size; index++) {
@@ -1842,7 +1859,8 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
luts_array_second_step->get_degree(lut_id),
luts_array_second_step->get_max_degree(lut_id), glwe_dimension,
polynomial_size, message_modulus, carry_modulus,
f_other_groupings_inner_propagation, gpu_memory_allocated);
f_other_groupings_inner_propagation, gpu_memory_allocated,
preallocated_h_lut);
}
if (use_sequential_algorithm_to_resolve_group_carries) {
@@ -1864,7 +1882,7 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
luts_array_second_step->get_degree(lut_id),
luts_array_second_step->get_max_degree(lut_id), glwe_dimension,
polynomial_size, message_modulus, carry_modulus,
f_group_propagation, gpu_memory_allocated);
f_group_propagation, gpu_memory_allocated, preallocated_h_lut);
}
} else {
uint32_t lut_id = 2 * grouping_size;
@@ -1882,10 +1900,10 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
luts_array_second_step->get_degree(lut_id),
luts_array_second_step->get_max_degree(lut_id), glwe_dimension,
polynomial_size, message_modulus, carry_modulus, f_group_propagation,
gpu_memory_allocated);
gpu_memory_allocated, preallocated_h_lut);
}
Torus *h_second_lut_indexes = (Torus *)malloc(lut_indexes_size);
h_second_lut_indexes = (Torus *)malloc(lut_indexes_size);
for (int index = 0; index < num_radix_blocks; index++) {
uint32_t grouping_index = index / grouping_size;
@@ -1937,15 +1955,13 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
seq_group_prop_mem = new int_seq_group_prop_memory<Torus>(
streams, params, grouping_size, big_lwe_size_bytes,
allocate_gpu_memory, size_tracker);
allocate_gpu_memory, size_tracker, preallocated_h_lut);
} else {
hs_group_prop_mem = new int_hs_group_prop_memory<Torus>(
streams, params, num_groups, big_lwe_size_bytes, allocate_gpu_memory,
size_tracker);
size_tracker, preallocated_h_lut);
}
free(h_second_lut_indexes);
};
// needed for the division to update the lut indexes
@@ -1996,6 +2012,7 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
delete luts_array_second_step;
delete[] h_scalar_array_cum_sum;
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
free(h_second_lut_indexes);
};
};
@@ -2020,7 +2037,8 @@ template <typename Torus> struct int_sc_prop_memory {
int_sc_prop_memory(CudaStreams streams, int_radix_params params,
uint32_t num_radix_blocks, uint32_t requested_flag_in,
bool allocate_gpu_memory, uint64_t &size_tracker) {
bool allocate_gpu_memory, uint64_t &size_tracker,
Torus *preallocated_h_lut = nullptr) {
gpu_memory_allocated = allocate_gpu_memory;
this->params = params;
auto glwe_dimension = params.glwe_dimension;
@@ -2040,11 +2058,11 @@ template <typename Torus> struct int_sc_prop_memory {
shifted_blocks_state_mem = new int_shifted_blocks_and_states_memory<Torus>(
streams, params, num_radix_blocks, num_many_lut, grouping_size,
allocate_gpu_memory, size_tracker);
allocate_gpu_memory, size_tracker, preallocated_h_lut);
prop_simu_group_carries_mem = new int_prop_simu_group_carries_memory<Torus>(
streams, params, num_radix_blocks, grouping_size, num_groups,
allocate_gpu_memory, size_tracker);
allocate_gpu_memory, size_tracker, preallocated_h_lut);
// Step 3 elements
int num_luts_message_extract =
@@ -2061,8 +2079,8 @@ template <typename Torus> struct int_sc_prop_memory {
streams.stream(0), streams.gpu_index(0),
lut_message_extract->get_lut(0, 0), lut_message_extract->get_degree(0),
lut_message_extract->get_max_degree(0), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, f_message_extract,
gpu_memory_allocated);
message_modulus, carry_modulus, f_message_extract, gpu_memory_allocated,
preallocated_h_lut);
// This stores a single block that will be used to hold the overflow or
// carry results
@@ -2120,7 +2138,7 @@ template <typename Torus> struct int_sc_prop_memory {
lut_overflow_flag_prep->get_degree(0),
lut_overflow_flag_prep->get_max_degree(0), glwe_dimension,
polynomial_size, message_modulus, carry_modulus, f_overflow_fp,
gpu_memory_allocated);
gpu_memory_allocated, preallocated_h_lut);
auto active_streams = streams.active_gpu_subset(1);
lut_overflow_flag_prep->broadcast_lut(active_streams);
@@ -2152,7 +2170,7 @@ template <typename Torus> struct int_sc_prop_memory {
lut_message_extract->get_degree(1),
lut_message_extract->get_max_degree(1), glwe_dimension,
polynomial_size, message_modulus, carry_modulus, f_overflow_last,
gpu_memory_allocated);
gpu_memory_allocated, preallocated_h_lut);
Torus *h_lut_indexes = lut_message_extract->h_lut_indexes;
for (int index = 0; index < num_radix_blocks + 1; index++) {
@@ -2179,7 +2197,7 @@ template <typename Torus> struct int_sc_prop_memory {
lut_message_extract->get_degree(1),
lut_message_extract->get_max_degree(1), glwe_dimension,
polynomial_size, message_modulus, carry_modulus, f_carry_last,
gpu_memory_allocated);
gpu_memory_allocated, preallocated_h_lut);
Torus *h_lut_indexes = lut_message_extract->h_lut_indexes;
for (int index = 0; index < num_radix_blocks + 1; index++) {

View File

@@ -21,7 +21,7 @@ template <typename Torus> struct int_mul_memory {
int_mul_memory(CudaStreams streams, int_radix_params params,
bool const is_boolean_left, bool const is_boolean_right,
uint32_t num_radix_blocks, bool allocate_gpu_memory,
uint64_t &size_tracker) {
uint64_t &size_tracker, Torus *preallocated_h_lut = nullptr) {
gpu_memory_allocated = allocate_gpu_memory;
this->boolean_mul = is_boolean_left || is_boolean_right;
this->params = params;
@@ -43,7 +43,7 @@ template <typename Torus> struct int_mul_memory {
zero_out_predicate_lut->get_degree(0),
zero_out_predicate_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
zero_out_predicate_lut_f, gpu_memory_allocated);
zero_out_predicate_lut_f, gpu_memory_allocated, preallocated_h_lut);
auto active_streams = streams.active_gpu_subset(num_radix_blocks);
zero_out_predicate_lut->broadcast_lut(active_streams);

View File

@@ -142,14 +142,15 @@ template <typename Torus> struct int_sub_and_propagate {
int_sub_and_propagate(CudaStreams streams, const int_radix_params params,
uint32_t num_radix_blocks, uint32_t requested_flag_in,
bool allocate_gpu_memory, uint64_t &size_tracker) {
bool allocate_gpu_memory, uint64_t &size_tracker,
Torus *preallocated_h_lut = nullptr) {
this->params = params;
this->allocate_gpu_memory = allocate_gpu_memory;
this->sc_prop_mem = new int_sc_prop_memory<Torus>(
streams, params, num_radix_blocks, requested_flag_in,
allocate_gpu_memory, size_tracker);
allocate_gpu_memory, size_tracker, preallocated_h_lut);
this->neg_rhs_array = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(

View File

@@ -1,6 +1,5 @@
file(GLOB_RECURSE SOURCES "*.cu")
add_library(tfhe_cuda_backend STATIC ${SOURCES} pbs/programmable_bootstrap_multibit_128.cuh
pbs/programmable_bootstrap_multibit_128.cu)
add_library(tfhe_cuda_backend STATIC ${SOURCES})
set_target_properties(tfhe_cuda_backend PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(tfhe_cuda_backend PUBLIC cudart OpenMP::OpenMP_CXX)
target_include_directories(tfhe_cuda_backend PRIVATE .)

View File

@@ -0,0 +1,85 @@
#include "erc20/erc20.cuh"
uint64_t scratch_cuda_erc20_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t lwe_ciphertext_count, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type) {
PUSH_RANGE("scratch erc20")
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
message_modulus, carry_modulus, noise_reduction_type);
uint64_t ret = scratch_cuda_erc20<uint64_t>(
CudaStreams(streams), (int_erc20_buffer<uint64_t> **)mem_ptr,
lwe_ciphertext_count, params, allocate_gpu_memory);
POP_RANGE()
return ret;
}
void cuda_erc20_assign_64(CudaStreamsFFI streams,
CudaRadixCiphertextFFI *from_amount,
CudaRadixCiphertextFFI *to_amount,
CudaRadixCiphertextFFI const *amount, int8_t *mem_ptr,
void *const *bsks, void *const *ksks) {
PUSH_RANGE("erc20")
auto mem = reinterpret_cast<int_erc20_buffer<uint64_t> *>(mem_ptr);
switch (mem->params.polynomial_size) {
case 256:
host_erc20_assign<uint64_t, AmortizedDegree<256>>(
CudaStreams(streams), from_amount, to_amount, amount, mem, bsks,
(uint64_t **)(ksks));
break;
case 512:
host_erc20_assign<uint64_t, AmortizedDegree<512>>(
CudaStreams(streams), from_amount, to_amount, amount, mem, bsks,
(uint64_t **)(ksks));
break;
case 1024:
host_erc20_assign<uint64_t, AmortizedDegree<1024>>(
CudaStreams(streams), from_amount, to_amount, amount, mem, bsks,
(uint64_t **)(ksks));
break;
case 2048:
host_erc20_assign<uint64_t, AmortizedDegree<2048>>(
CudaStreams(streams), from_amount, to_amount, amount, mem, bsks,
(uint64_t **)(ksks));
break;
case 4096:
host_erc20_assign<uint64_t, AmortizedDegree<4096>>(
CudaStreams(streams), from_amount, to_amount, amount, mem, bsks,
(uint64_t **)(ksks));
break;
case 8192:
host_erc20_assign<uint64_t, AmortizedDegree<8192>>(
CudaStreams(streams), from_amount, to_amount, amount, mem, bsks,
(uint64_t **)(ksks));
break;
case 16384:
host_erc20_assign<uint64_t, AmortizedDegree<16384>>(
CudaStreams(streams), from_amount, to_amount, amount, mem, bsks,
(uint64_t **)(ksks));
break;
default:
PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
"Supported N's are powers of two in the interval [256..16384].")
}
POP_RANGE()
}
void cleanup_cuda_erc20(CudaStreamsFFI streams, int8_t **mem_ptr_void) {
PUSH_RANGE("cleanup erc20")
int_erc20_buffer<uint64_t> *mem_ptr =
(int_erc20_buffer<uint64_t> *)(*mem_ptr_void);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
*mem_ptr_void = nullptr;
POP_RANGE()
}

View File

@@ -0,0 +1,49 @@
#pragma once
#include "erc20/erc20.h"
#include "erc20/erc20_utilities.h"
#include "integer/comparison.cuh"
#include "integer/integer.cuh"
#include "integer/multiplication.cuh"
#include "integer/subtraction.cuh"
template <typename Torus, class params>
__host__ void host_erc20_assign(CudaStreams streams,
CudaRadixCiphertextFFI *from_amount,
CudaRadixCiphertextFFI *to_amount,
CudaRadixCiphertextFFI const *amount,
int_erc20_buffer<Torus> *mem_ptr,
void *const *bsks, Torus *const *ksks) {
auto num_radix_blocks = from_amount->num_radix_blocks;
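// Step 1: encrypted funds check, comparing from_amount against amount.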
host_difference_check<Torus>(streams, mem_ptr->has_enough_funds, from_amount,
amount, mem_ptr->diff_buffer,
mem_ptr->diff_buffer->diff_buffer->operator_f,
bsks, ksks, num_radix_blocks);
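// Step 2: mask the transferred amount with the (0 or 1) check result:
// tmp_amount = amount * has_enough_funds.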
host_integer_mult_radix<Torus, params>(
streams, mem_ptr->tmp_amount, amount, false, mem_ptr->has_enough_funds,
true, bsks, ksks, mem_ptr->mul_buffer, num_radix_blocks);
mem_ptr->internal_cuda_streams.internal_streams_wait_for_main_stream_0(
streams);
// internal stream 0: to_amount += masked amount
host_add_and_propagate_single_carry(
mem_ptr->internal_cuda_streams[0], to_amount, mem_ptr->tmp_amount,
nullptr, nullptr, mem_ptr->add_buffer, bsks, ksks, FLAG_NONE, 0);
// internal stream 1: from_amount -= masked amount
host_sub_and_propagate_single_carry(
mem_ptr->internal_cuda_streams[1], from_amount, mem_ptr->tmp_amount,
nullptr, nullptr, mem_ptr->sub_buffer, bsks, ksks, FLAG_NONE, 0);
mem_ptr->internal_cuda_streams.main_stream_0_wait_for_internal_streams(
streams);
}
template <typename Torus>
__host__ uint64_t scratch_cuda_erc20(CudaStreams streams,
int_erc20_buffer<Torus> **mem_ptr,
uint32_t num_radix_blocks,
int_radix_params params,
bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_erc20_buffer<Torus>(streams, params, num_radix_blocks,
allocate_gpu_memory, size_tracker);
return size_tracker;
}

View File

@@ -134,7 +134,7 @@ __host__ void are_all_comparisons_block_true(
auto is_equal_to_num_blocks_lut_f = [chunk_length](Torus x) -> Torus {
return x == chunk_length;
};
generate_device_accumulator_with_cpu_prealloc<Torus>(
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
is_max_value_lut->get_lut(0, 1), is_max_value_lut->get_degree(1),
is_max_value_lut->get_max_degree(1), glwe_dimension,
@@ -482,7 +482,7 @@ tree_sign_reduction(CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
y = x;
f = sign_handler_f;
}
generate_device_accumulator_with_cpu_prealloc<Torus>(
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), last_lut->get_lut(0, 0),
last_lut->get_degree(0), last_lut->get_max_degree(0), glwe_dimension,
polynomial_size, message_modulus, carry_modulus, f, true,

View File

@@ -9,7 +9,6 @@
#include "integer/scalar_addition.cuh"
#include "linearalgebra/addition.cuh"
#include "linearalgebra/negation.cuh"
#include "pbs/pbs_128_utilities.h"
#include "polynomial/functions.cuh"
#include "utils/helper.cuh"
#include "utils/helper_multi_gpu.cuh"
@@ -1040,26 +1039,41 @@ void generate_device_accumulator_bivariate(
cudaStream_t stream, uint32_t gpu_index, Torus *acc_bivariate,
uint64_t *degree, uint64_t *max_degree, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t message_modulus, uint32_t carry_modulus,
std::function<Torus(Torus, Torus)> f, bool gpu_memory_allocated) {
std::function<Torus(Torus, Torus)> f, bool gpu_memory_allocated,
Torus *preallocated_cpu_lut) {
PUSH_RANGE("gen bivar lut acc")
// host lut
Torus *h_lut =
(Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
*max_degree = message_modulus * carry_modulus - 1;
// fill bivariate accumulator
*degree = generate_lookup_table_bivariate<Torus>(
h_lut, glwe_dimension, polynomial_size, message_modulus, carry_modulus,
f);
if (preallocated_cpu_lut == nullptr) {
// host lut
Torus *h_lut =
(Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
*max_degree = message_modulus * carry_modulus - 1;
// fill bivariate accumulator
*degree = generate_lookup_table_bivariate<Torus>(
h_lut, glwe_dimension, polynomial_size, message_modulus, carry_modulus,
f);
// copy host lut and lut_indexes_vec to device
cuda_memcpy_with_size_tracking_async_to_gpu(
acc_bivariate, h_lut,
(glwe_dimension + 1) * polynomial_size * sizeof(Torus), stream, gpu_index,
gpu_memory_allocated);
// copy host lut and lut_indexes_vec to device
cuda_memcpy_with_size_tracking_async_to_gpu(
acc_bivariate, h_lut,
(glwe_dimension + 1) * polynomial_size * sizeof(Torus), stream,
gpu_index, gpu_memory_allocated);
cuda_synchronize_stream(stream, gpu_index);
free(h_lut);
cuda_synchronize_stream(stream, gpu_index);
free(h_lut);
} else {
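// Caller-owned buffer: no stream sync or free here; the caller must keep the
// buffer alive until it synchronizes the stream (e.g. in its release()).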
*max_degree = message_modulus * carry_modulus - 1;
// fill bivariate accumulator
*degree = generate_lookup_table_bivariate<Torus>(
preallocated_cpu_lut, glwe_dimension, polynomial_size, message_modulus,
carry_modulus, f);
// copy host lut and lut_indexes_vec to device
cuda_memcpy_with_size_tracking_async_to_gpu(
acc_bivariate, preallocated_cpu_lut,
(glwe_dimension + 1) * polynomial_size * sizeof(Torus), stream,
gpu_index, gpu_memory_allocated);
}
POP_RANGE()
}
@@ -1097,41 +1111,6 @@ void generate_device_accumulator_bivariate_with_factor(
cuda_synchronize_stream(stream, gpu_index);
free(h_lut);
}
/*
* generate bivariate accumulator for device pointer
* using preallocated host lut to avoid blocking the cpu thread
* with the stream synchronization (required to free the host lut).
* This enables concurrent execution of multiple streams when using
* a single cpu thread.
* stream - cuda stream
* acc - device pointer for bivariate accumulator
* ...
* f - wrapping function with two Torus inputs
* h_lut - preallocated host lut to be used
*
*/
template <typename Torus>
void generate_device_accumulator_bivariate_with_cpu_prealloc(
cudaStream_t stream, uint32_t gpu_index, Torus *acc_bivariate,
uint64_t *degree, uint64_t *max_degree, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t message_modulus, uint32_t carry_modulus,
std::function<Torus(Torus, Torus)> f, bool gpu_memory_allocated,
Torus *h_lut) {
PUSH_RANGE("gen bivar lut acc")
*max_degree = message_modulus * carry_modulus - 1;
// fill bivariate accumulator
*degree = generate_lookup_table_bivariate<Torus>(
h_lut, glwe_dimension, polynomial_size, message_modulus, carry_modulus,
f);
// copy host lut and lut_indexes_vec to device
cuda_memcpy_with_size_tracking_async_to_gpu(
acc_bivariate, h_lut,
(glwe_dimension + 1) * polynomial_size * sizeof(Torus), stream, gpu_index,
gpu_memory_allocated);
POP_RANGE()
}
template <typename Torus>
void generate_device_accumulator_with_encoding(
@@ -1190,33 +1169,6 @@ void generate_device_accumulator_with_encoding_with_cpu_prealloc(
*/
template <typename Torus>
void generate_device_accumulator(
cudaStream_t stream, uint32_t gpu_index, Torus *acc, uint64_t *degree,
uint64_t *max_degree, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t message_modulus, uint32_t carry_modulus,
std::function<Torus(Torus)> f, bool gpu_memory_allocated) {
PUSH_RANGE("gen lut acc")
generate_device_accumulator_with_encoding(
stream, gpu_index, acc, degree, max_degree, glwe_dimension,
polynomial_size, message_modulus, carry_modulus, message_modulus,
carry_modulus, f, gpu_memory_allocated);
POP_RANGE()
}
/*
* generate accumulator for device pointer using preallocated
* host lut to avoid blocking the cpu thread with the stream
* synchronization (required to free the host lut).
* This enables concurrent execution of multiple streams when using
* a single cpu thread.
* v_stream - cuda stream
* acc - device pointer for accumulator
* ...
* f - evaluating function with one Torus input
* h_lut - preallocated host lut to be used
*/
template <typename Torus>
void generate_device_accumulator_with_cpu_prealloc(
cudaStream_t stream, uint32_t gpu_index, Torus *acc, uint64_t *degree,
uint64_t *max_degree, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t message_modulus, uint32_t carry_modulus,
@@ -1224,12 +1176,19 @@ void generate_device_accumulator_with_cpu_prealloc(
Torus *preallocated_h_lut) {
PUSH_RANGE("gen lut acc")
generate_device_accumulator_with_encoding_with_cpu_prealloc(
stream, gpu_index, acc, degree, max_degree, glwe_dimension,
polynomial_size, message_modulus, carry_modulus, message_modulus,
carry_modulus, f, gpu_memory_allocated, preallocated_h_lut);
if (preallocated_h_lut != nullptr)
generate_device_accumulator_with_encoding_with_cpu_prealloc(
stream, gpu_index, acc, degree, max_degree, glwe_dimension,
polynomial_size, message_modulus, carry_modulus, message_modulus,
carry_modulus, f, gpu_memory_allocated, preallocated_h_lut);
else
generate_device_accumulator_with_encoding(
stream, gpu_index, acc, degree, max_degree, glwe_dimension,
polynomial_size, message_modulus, carry_modulus, message_modulus,
carry_modulus, f, gpu_memory_allocated);
POP_RANGE()
}
/*
* generate many lut accumulator for device pointer
* v_stream - cuda stream
@@ -1243,25 +1202,38 @@ void generate_many_lut_device_accumulator(
uint64_t *max_degree, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t message_modulus, uint32_t carry_modulus,
std::vector<std::function<Torus(Torus)>> &functions,
bool gpu_memory_allocated) {
bool gpu_memory_allocated, Torus *preallocated_h_lut) {
PUSH_RANGE("gen many lut acc")
// host lut
Torus *h_lut =
(Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
if (preallocated_h_lut == nullptr) {
// host lut
Torus *h_lut =
(Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
// fill accumulator
*max_degree = generate_many_lookup_table<Torus>(
h_lut, degrees, glwe_dimension, polynomial_size, message_modulus,
carry_modulus, functions);
// fill accumulator
*max_degree = generate_many_lookup_table<Torus>(
h_lut, degrees, glwe_dimension, polynomial_size, message_modulus,
carry_modulus, functions);
// copy host lut and lut_indexes_vec to device
cuda_memcpy_with_size_tracking_async_to_gpu(
acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
stream, gpu_index, gpu_memory_allocated);
// copy host lut and lut_indexes_vec to device
cuda_memcpy_with_size_tracking_async_to_gpu(
acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
stream, gpu_index, gpu_memory_allocated);
cuda_synchronize_stream(stream, gpu_index);
free(h_lut);
cuda_synchronize_stream(stream, gpu_index);
free(h_lut);
} else {
// fill accumulator
*max_degree = generate_many_lookup_table<Torus>(
preallocated_h_lut, degrees, glwe_dimension, polynomial_size,
message_modulus, carry_modulus, functions);
// copy host lut and lut_indexes_vec to device
cuda_memcpy_with_size_tracking_async_to_gpu(
acc, preallocated_h_lut,
(glwe_dimension + 1) * polynomial_size * sizeof(Torus), stream,
gpu_index, gpu_memory_allocated);
}
POP_RANGE()
}
@@ -1732,7 +1704,7 @@ reduce_signs(CudaStreams streams, CudaRadixCiphertextFFI *signs_array_out,
signs_array_in, 0, num_sign_blocks);
if (num_sign_blocks > 2) {
auto lut = diff_buffer->reduce_signs_lut;
generate_device_accumulator_with_cpu_prealloc<Torus>(
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
lut->get_degree(0), lut->get_max_degree(0), glwe_dimension,
polynomial_size, message_modulus, carry_modulus,
@@ -1763,7 +1735,7 @@ reduce_signs(CudaStreams streams, CudaRadixCiphertextFFI *signs_array_out,
};
auto lut = diff_buffer->reduce_signs_lut;
generate_device_accumulator_with_cpu_prealloc<Torus>(
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
lut->get_degree(0), lut->get_max_degree(0), glwe_dimension,
polynomial_size, message_modulus, carry_modulus, final_lut_f, true,
@@ -1783,7 +1755,7 @@ reduce_signs(CudaStreams streams, CudaRadixCiphertextFFI *signs_array_out,
};
auto lut = mem_ptr->diff_buffer->reduce_signs_lut;
generate_device_accumulator_with_cpu_prealloc<Torus>(
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
lut->get_degree(0), lut->get_max_degree(0), glwe_dimension,
polynomial_size, message_modulus, carry_modulus, final_lut_f, true,

View File

@@ -366,7 +366,8 @@ __host__ void host_integer_partial_sum_ciphertexts_vec(
const dim3 number_of_blocks_2d(num_radix_blocks, part_count, 1);
mem_ptr->setup_lookup_tables(streams, num_radix_in_vec,
current_blocks->degrees);
current_blocks->degrees,
mem_ptr->preallocated_h_lut);
while (needs_processing) {
auto luts_message_carry = mem_ptr->luts_message_carry;

View File

@@ -142,7 +142,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check(
};
auto lut = mem_ptr->diff_buffer->tree_buffer->tree_last_leaf_scalar_lut;
generate_device_accumulator_with_cpu_prealloc<Torus>(
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
lut->get_degree(0), lut->get_max_degree(0), glwe_dimension,
polynomial_size, message_modulus, carry_modulus, scalar_last_leaf_lut_f,
@@ -235,7 +235,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check(
};
auto lut = diff_buffer->tree_buffer->tree_last_leaf_scalar_lut;
generate_device_accumulator_bivariate_with_cpu_prealloc<Torus>(
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
lut->get_degree(0), lut->get_max_degree(0), glwe_dimension,
polynomial_size, message_modulus, carry_modulus,
@@ -269,7 +269,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check(
int_radix_lut<Torus> *one_block_lut =
new int_radix_lut<Torus>(streams, params, 1, 1, true, size);
generate_device_accumulator_with_cpu_prealloc<Torus>(
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), one_block_lut->get_lut(0, 0),
one_block_lut->get_degree(0), one_block_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
@@ -413,7 +413,7 @@ __host__ void integer_radix_signed_scalar_difference_check(
};
auto lut = mem_ptr->diff_buffer->tree_buffer->tree_last_leaf_scalar_lut;
generate_device_accumulator_bivariate_with_cpu_prealloc<Torus>(
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
lut->get_degree(0), lut->get_max_degree(0), glwe_dimension,
polynomial_size, message_modulus, carry_modulus,
@@ -515,7 +515,7 @@ __host__ void integer_radix_signed_scalar_difference_check(
};
auto signed_msb_lut = mem_ptr->signed_msb_lut;
generate_device_accumulator_bivariate_with_cpu_prealloc<Torus>(
generate_device_accumulator_bivariate<Torus>(
msb_streams.stream(0), streams.gpu_index(0),
signed_msb_lut->get_lut(0, 0), signed_msb_lut->get_degree(0),
signed_msb_lut->get_max_degree(0), params.glwe_dimension,
@@ -561,7 +561,7 @@ __host__ void integer_radix_signed_scalar_difference_check(
int_radix_lut<Torus> *one_block_lut =
new int_radix_lut<Torus>(streams, params, 1, 1, true, size);
generate_device_accumulator_with_cpu_prealloc<Torus>(
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), one_block_lut->get_lut(0, 0),
one_block_lut->get_degree(0), one_block_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,

View File

@@ -2503,6 +2503,41 @@ unsafe extern "C" {
mem_ptr_void: *mut *mut i8,
);
}
unsafe extern "C" {
pub fn scratch_cuda_erc20_64(
streams: CudaStreamsFFI,
mem_ptr: *mut *mut i8,
glwe_dimension: u32,
polynomial_size: u32,
big_lwe_dimension: u32,
small_lwe_dimension: u32,
ks_level: u32,
ks_base_log: u32,
pbs_level: u32,
pbs_base_log: u32,
grouping_factor: u32,
lwe_ciphertext_count: u32,
message_modulus: u32,
carry_modulus: u32,
pbs_type: PBS_TYPE,
allocate_gpu_memory: bool,
noise_reduction_type: PBS_MS_REDUCTION_T,
) -> u64;
}
unsafe extern "C" {
pub fn cuda_erc20_assign_64(
streams: CudaStreamsFFI,
from_amount: *mut CudaRadixCiphertextFFI,
to_amount: *mut CudaRadixCiphertextFFI,
amount: *const CudaRadixCiphertextFFI,
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
);
}
unsafe extern "C" {
pub fn cleanup_cuda_erc20(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8);
}
pub const KS_TYPE_BIG_TO_SMALL: KS_TYPE = 0;
pub const KS_TYPE_SMALL_TO_BIG: KS_TYPE = 1;
pub type KS_TYPE = ffi::c_uint;

View File

@@ -4,6 +4,7 @@
#include "cuda/include/integer/integer.h"
#include "cuda/include/integer/rerand.h"
#include "cuda/include/aes/aes.h"
#include "cuda/include/erc20/erc20.h"
#include "cuda/include/zk/zk.h"
#include "cuda/include/keyswitch/keyswitch.h"
#include "cuda/include/keyswitch/ks_enums.h"

View File

@@ -115,6 +115,12 @@ path = "benches/integer/bench.rs"
harness = false
required-features = ["integer", "pbs-stats", "internal-keycache"]
[[bench]]
name = "integer-erc20"
path = "benches/integer/erc20.rs"
harness = false
required-features = ["integer", "pbs-stats", "internal-keycache"]
[[bench]]
name = "integer-signed"
path = "benches/integer/signed_bench.rs"

View File

@@ -42,6 +42,19 @@ where
(new_from_amount, new_to_amount)
}
#[cfg(feature = "gpu")]
pub fn transfer_backend<FheType>(
from_amount: &FheType,
to_amount: &FheType,
amount: &FheType,
) -> (FheType, FheType)
where
FheType: FheErc20<Output = FheType>,
for<'a> &'a FheType: FheErc20<Output = FheType>,
{
from_amount.erc20(to_amount, amount)
}
/// Parallel variant of [`transfer_whitepaper`].
pub fn par_transfer_whitepaper<FheType>(
from_amount: &FheType,
@@ -965,6 +978,14 @@ fn main() {
"transfer::no_cmux",
transfer_no_cmux::<FheUint64>,
);
cuda_bench_transfer_throughput(
&mut group,
&cks,
bench_name,
"FheUint64",
"transfer::backend",
transfer_backend::<FheUint64>,
);
cuda_bench_transfer_throughput(
&mut group,
&cks,

View File

@@ -2,14 +2,17 @@
mod aes;
mod aes256;
mod erc20;
mod oprf;
mod rerand;
use benchmark::params::ParamsAndNumBlocksIter;
use benchmark::utilities::{
get_bench_type, throughput_num_threads, write_to_json, BenchmarkType, EnvConfig, OperatorType,
gen_random_u256, get_bench_type, throughput_num_threads, write_to_json, BenchmarkType,
EnvConfig, OperatorType,
};
use criterion::{criterion_group, Criterion, Throughput};
use rand::prelude::*;
use rayon::prelude::*;
@@ -26,13 +29,6 @@ use tfhe::{get_pbs_count, reset_pbs_count};
/// It must be as big as the largest bit size tested
type ScalarType = U256;
fn gen_random_u256(rng: &mut ThreadRng) -> U256 {
let clearlow = rng.gen::<u128>();
let clearhigh = rng.gen::<u128>();
tfhe::integer::U256::from((clearlow, clearhigh))
}
/// Base function to bench a server key function that is a binary operation, input ciphertexts will
/// contain non zero carries
fn bench_server_key_binary_function_dirty_inputs<F>(

View File

@@ -0,0 +1,144 @@
use benchmark::params_aliases::BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
use benchmark::utilities::{
cuda_local_keys, cuda_local_streams, gen_random_u256, get_bench_type, BenchmarkType,
};
use criterion::{Criterion, Throughput};
use rayon::prelude::*;
use rayon::ThreadPoolBuilder;
#[cfg(feature = "gpu")]
use tfhe::core_crypto::gpu::{get_number_of_gpus, CudaStreams};
#[cfg(feature = "gpu")]
use tfhe::integer::gpu::ciphertext::CudaUnsignedRadixCiphertext;
#[cfg(feature = "gpu")]
use tfhe::integer::gpu::CudaServerKey;
use tfhe::integer::keycache::KEY_CACHE;
use tfhe::integer::IntegerKeyKind;
use tfhe::keycache::NamedParam;
fn main() {
let mut criterion: Criterion<_> = (Criterion::default()).configure_from_args();
#[cfg(feature = "gpu")]
cuda_erc20(&mut criterion);
Criterion::default().configure_from_args().final_summary();
}
#[cfg(feature = "gpu")]
pub fn cuda_erc20(c: &mut Criterion) {
let bench_name = "integer::cuda::erc20";
let mut bench_group = c.benchmark_group(bench_name);
bench_group
.sample_size(15)
.measurement_time(std::time::Duration::from_secs(30));
let mut rng = rand::thread_rng();
let bench_id;
let param = BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
let param_name = param.name();
let num_block = 32;
match get_bench_type() {
BenchmarkType::Latency => {
let streams = CudaStreams::new_multi_gpu();
bench_id = format!("{bench_name}::{param_name}");
bench_group.bench_function(&bench_id, |b| {
let (cks, _cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
let gpu_sks = CudaServerKey::new(&cks, &streams);
let encrypt_values = || {
let ct_0 = cks.encrypt_radix(gen_random_u256(&mut rng), num_block);
let ct_1 = cks.encrypt_radix(gen_random_u256(&mut rng), num_block);
let ct_2 = cks.encrypt_radix(gen_random_u256(&mut rng), num_block);
let d_ctxt_0 =
CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct_0, &streams);
let d_ctxt_1 =
CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct_1, &streams);
let d_ctxt_2 =
CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct_2, &streams);
(d_ctxt_0, d_ctxt_1, d_ctxt_2)
};
b.iter_batched(
encrypt_values,
|(ct_0, ct_1, ct_2)| {
gpu_sks.erc20(&ct_0, &ct_1, &ct_2, &streams);
},
criterion::BatchSize::SmallInput,
)
});
}
BenchmarkType::Throughput => {
let (cks, _cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
let gpu_sks_vec = cuda_local_keys(&cks);
let gpu_count = get_number_of_gpus() as usize;
bench_id = format!("{bench_name}::throughput::{param_name}");
let elements = 800;
bench_group.throughput(Throughput::Elements(elements));
bench_group.bench_function(&bench_id, |b| {
let setup_encrypted_values = || {
let local_streams = cuda_local_streams(num_block, elements as usize);
let cts_0 = (0..elements)
.map(|i| {
let ct_0 = cks.encrypt_radix(gen_random_u256(&mut rng), num_block);
CudaUnsignedRadixCiphertext::from_radix_ciphertext(
&ct_0,
&local_streams[i as usize],
)
})
.collect::<Vec<_>>();
let cts_1 = (0..elements)
.map(|i| {
let ct_1 = cks.encrypt_radix(gen_random_u256(&mut rng), num_block);
CudaUnsignedRadixCiphertext::from_radix_ciphertext(
&ct_1,
&local_streams[i as usize],
)
})
.collect::<Vec<_>>();
let cts_2 = (0..elements)
.map(|i| {
let ct_2 = cks.encrypt_radix(gen_random_u256(&mut rng), num_block);
CudaUnsignedRadixCiphertext::from_radix_ciphertext(
&ct_2,
&local_streams[i as usize],
)
})
.collect::<Vec<_>>();
(cts_0, cts_1, cts_2, local_streams)
};
let pool = ThreadPoolBuilder::new().num_threads(32).build().unwrap();
b.iter_batched(
setup_encrypted_values,
|(cts_0, cts_1, cts_2, local_streams)| {
pool.install(|| {
cts_0
.par_iter()
.zip(cts_1.par_iter())
.zip(cts_2.par_iter())
.zip(local_streams.par_iter())
.enumerate()
.for_each(|(i, (((ct_0, ct_1), ct_2), local_stream))| {
gpu_sks_vec[i % gpu_count].erc20(
ct_0,
ct_1,
ct_2,
local_stream,
);
});
})
},
criterion::BatchSize::SmallInput,
);
});
}
};
bench_group.finish();
}

View File

@@ -1,3 +1,5 @@
use rand::prelude::ThreadRng;
use rand::Rng;
use serde::Serialize;
use std::path::PathBuf;
use std::sync::OnceLock;
@@ -791,3 +793,11 @@ mod cuda_utils {
#[cfg(feature = "gpu")]
pub use cuda_utils::*;
use tfhe::integer::U256;
pub fn gen_random_u256(rng: &mut ThreadRng) -> U256 {
let clearlow = rng.gen::<u128>();
let clearhigh = rng.gen::<u128>();
tfhe::integer::U256::from((clearlow, clearhigh))
}

View File

@@ -17,7 +17,7 @@ use crate::high_level_api::traits::{
RotateRightSizeOnGpu, ShlSizeOnGpu, ShrSizeOnGpu, SizeOnGpu, SubSizeOnGpu,
};
use crate::high_level_api::traits::{
DivRem, FheEq, FheMax, FheMin, FheOrd, RotateLeft, RotateLeftAssign, RotateRight,
DivRem, FheEq, FheErc20, FheMax, FheMin, FheOrd, RotateLeft, RotateLeftAssign, RotateRight,
RotateRightAssign,
};
#[cfg(feature = "gpu")]
@@ -3206,3 +3206,68 @@ where
})
}
}
#[cfg(feature = "gpu")]
impl<Id> FheErc20<Self> for FheUint<Id>
where
Id: FheUintId,
{
type Output = Self;
fn erc20(self, to: Self, amount: Self) -> (Self::Output, Self::Output) {
<Self as FheErc20<&Self>>::erc20(self, &to, &amount)
}
}
#[cfg(feature = "gpu")]
impl<Id> FheErc20<&Self> for FheUint<Id>
where
Id: FheUintId,
{
type Output = Self;
fn erc20(self, to: &Self, amount: &Self) -> (Self::Output, Self::Output) {
<&Self as FheErc20<&Self>>::erc20(&self, to, amount)
}
}
#[cfg(feature = "gpu")]
impl<Id> FheErc20<Self> for &FheUint<Id>
where
Id: FheUintId,
{
type Output = FheUint<Id>;
fn erc20(self, to: Self, amount: Self) -> (Self::Output, Self::Output) {
global_state::with_internal_keys(|key| match key {
InternalServerKey::Cpu(_cpu_key) => {
panic!("Erc20 is not supported on CPU");
}
#[cfg(feature = "gpu")]
InternalServerKey::Cuda(cuda_key) => {
let streams = &cuda_key.streams;
let inner_result = cuda_key.key.key.erc20(
&*self.ciphertext.on_gpu(streams),
&*to.ciphertext.on_gpu(streams),
&*amount.ciphertext.on_gpu(streams),
streams,
);
(
FheUint::<Id>::new(
inner_result.0,
cuda_key.tag.clone(),
ReRandomizationMetadata::default(),
),
FheUint::<Id>::new(
inner_result.1,
cuda_key.tag.clone(),
ReRandomizationMetadata::default(),
),
)
}
#[cfg(feature = "hpu")]
InternalServerKey::Hpu(_device) => {
panic!("Erc20 is not supported on HPU");
}
})
}
}

View File

@@ -27,7 +27,7 @@ pub use crate::high_level_api::strings::traits::*;
#[cfg(feature = "gpu")]
pub use crate::high_level_api::traits::{
AddSizeOnGpu, BitAndSizeOnGpu, BitNotSizeOnGpu, BitOrSizeOnGpu, BitXorSizeOnGpu,
DivRemSizeOnGpu, DivSizeOnGpu, FheEqSizeOnGpu, FheMaxSizeOnGpu, FheMinSizeOnGpu,
DivRemSizeOnGpu, DivSizeOnGpu, FheEqSizeOnGpu, FheErc20, FheMaxSizeOnGpu, FheMinSizeOnGpu,
FheOrdSizeOnGpu, IfThenElseSizeOnGpu, MulSizeOnGpu, NegSizeOnGpu, RemSizeOnGpu,
RotateLeftSizeOnGpu, RotateRightSizeOnGpu, ShlSizeOnGpu, ShrSizeOnGpu, SizeOnGpu, SubSizeOnGpu,
};

View File

@@ -355,3 +355,9 @@ pub trait FheEqSizeOnGpu<Rhs = Self> {
fn get_eq_size_on_gpu(&self, amount: Rhs) -> u64;
fn get_ne_size_on_gpu(&self, amount: Rhs) -> u64;
}
#[cfg(feature = "gpu")]
pub trait FheErc20<Rhs = Self> {
type Output;
fn erc20(self, to: Rhs, amount: Rhs) -> (Self::Output, Self::Output);
}
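
The trait mirrors the crate's other high-level operator traits. Below is a minimal usage sketch, not part of the diff: it assumes the `gpu` feature and the crate's usual GPU key setup, and the function name and clear values are placeholders.

use tfhe::prelude::*;
use tfhe::{set_server_key, ClientKey, CompressedServerKey, ConfigBuilder, FheUint64};

fn transfer_demo() {
    let config = ConfigBuilder::default().build();
    let client_key = ClientKey::generate(config);
    // A GPU server key selects the CUDA backend; the CPU and HPU paths panic
    // for erc20 (see the FheUint impls earlier in this diff).
    set_server_key(CompressedServerKey::new(&client_key).decompress_to_gpu());

    let from = FheUint64::encrypt(100u64, &client_key);
    let to = FheUint64::encrypt(50u64, &client_key);
    let amount = FheUint64::encrypt(30u64, &client_key);

    // Returns the updated (from, to) pair without mutating the inputs.
    let (new_from, new_to) = (&from).erc20(&to, &amount);

    // With sufficient funds, whitepaper transfer semantics give 70 and 80.
    let new_from_clear: u64 = new_from.decrypt(&client_key);
    let new_to_clear: u64 = new_to.decrypt(&client_key);
    assert_eq!((new_from_clear, new_to_clear), (70, 80));
}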

View File

@@ -10357,3 +10357,130 @@ pub(crate) unsafe fn cuda_backend_cast_to_signed<T: UnsignedInteger, B: Numeric>
update_noise_degree(output, &cuda_ffi_output);
}
#[allow(clippy::too_many_arguments)]
/// # Safety
///
/// - The data must not be moved or dropped while being used by the CUDA kernel.
/// - This function assumes exclusive access to the passed data; violating this may lead to
/// undefined behavior.
pub(crate) unsafe fn cuda_backend_erc20_assign<T: UnsignedInteger, B: Numeric>(
streams: &CudaStreams,
from_amount: &mut CudaRadixCiphertext,
to_amount: &mut CudaRadixCiphertext,
amount: &CudaRadixCiphertext,
bootstrapping_key: &CudaVec<B>,
keyswitch_key: &CudaVec<T>,
message_modulus: MessageModulus,
carry_modulus: CarryModulus,
glwe_dimension: GlweDimension,
polynomial_size: PolynomialSize,
big_lwe_dimension: LweDimension,
small_lwe_dimension: LweDimension,
ks_level: DecompositionLevelCount,
ks_base_log: DecompositionBaseLog,
pbs_level: DecompositionLevelCount,
pbs_base_log: DecompositionBaseLog,
num_blocks: u32,
pbs_type: PBSType,
grouping_factor: LweBskGroupingFactor,
ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>,
) {
assert_eq!(
streams.gpu_indexes[0],
from_amount.d_blocks.0.d_vec.gpu_index(0),
"GPU error: first stream is on GPU {}, first from_amount pointer is on GPU {}",
streams.gpu_indexes[0].get(),
from_amount.d_blocks.0.d_vec.gpu_index(0).get(),
);
assert_eq!(
streams.gpu_indexes[0],
to_amount.d_blocks.0.d_vec.gpu_index(0),
"GPU error: first stream is on GPU {}, first to_amount pointer is on GPU {}",
streams.gpu_indexes[0].get(),
to_amount.d_blocks.0.d_vec.gpu_index(0).get(),
);
assert_eq!(
streams.gpu_indexes[0],
amount.d_blocks.0.d_vec.gpu_index(0),
"GPU error: first stream is on GPU {}, first amount pointer is on GPU {}",
streams.gpu_indexes[0].get(),
amount.d_blocks.0.d_vec.gpu_index(0).get(),
);
assert_eq!(
streams.gpu_indexes[0],
bootstrapping_key.gpu_index(0),
"GPU error: first stream is on GPU {}, first bsk pointer is on GPU {}",
streams.gpu_indexes[0].get(),
bootstrapping_key.gpu_index(0).get(),
);
assert_eq!(
streams.gpu_indexes[0],
keyswitch_key.gpu_index(0),
"GPU error: first stream is on GPU {}, first ksk pointer is on GPU {}",
streams.gpu_indexes[0].get(),
keyswitch_key.gpu_index(0).get(),
);
let noise_reduction_type = resolve_ms_noise_reduction_config(ms_noise_reduction_configuration);
let mut mem_ptr: *mut i8 = std::ptr::null_mut();
let mut from_amount_degrees = from_amount.info.blocks.iter().map(|b| b.degree.0).collect();
let mut from_amount_noise_levels = from_amount
.info
.blocks
.iter()
.map(|b| b.noise_level.0)
.collect();
let mut cuda_ffi_from_amount = prepare_cuda_radix_ffi(
from_amount,
&mut from_amount_degrees,
&mut from_amount_noise_levels,
);
let mut amount_degrees = amount.info.blocks.iter().map(|b| b.degree.0).collect();
let mut amount_noise_levels = amount.info.blocks.iter().map(|b| b.noise_level.0).collect();
let cuda_ffi_amount =
prepare_cuda_radix_ffi(amount, &mut amount_degrees, &mut amount_noise_levels);
let mut to_amount_degrees = to_amount.info.blocks.iter().map(|b| b.degree.0).collect();
let mut to_amount_noise_levels = to_amount
.info
.blocks
.iter()
.map(|b| b.noise_level.0)
.collect();
let mut cuda_ffi_to_amount = prepare_cuda_radix_ffi(
to_amount,
&mut to_amount_degrees,
&mut to_amount_noise_levels,
);
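// The backend follows the usual scratch / run / cleanup pattern: the scratch
// call allocates the temporary GPU buffers behind `mem_ptr`, the assign call
// runs the transfer, and the cleanup call releases the buffers.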
scratch_cuda_erc20_64(
streams.ffi(),
std::ptr::addr_of_mut!(mem_ptr),
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
big_lwe_dimension.0 as u32,
small_lwe_dimension.0 as u32,
ks_level.0 as u32,
ks_base_log.0 as u32,
pbs_level.0 as u32,
pbs_base_log.0 as u32,
grouping_factor.0 as u32,
num_blocks,
message_modulus.0 as u32,
carry_modulus.0 as u32,
pbs_type as u32,
true,
noise_reduction_type as u32,
);
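// `from_amount` and `to_amount` are updated in place on the device.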
cuda_erc20_assign_64(
streams.ffi(),
&raw mut cuda_ffi_from_amount,
&raw mut cuda_ffi_to_amount,
&raw const cuda_ffi_amount,
mem_ptr,
bootstrapping_key.ptr.as_ptr(),
keyswitch_key.ptr.as_ptr(),
);
cleanup_cuda_erc20(streams.ffi(), std::ptr::addr_of_mut!(mem_ptr));
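// Copy the degree / noise-level metadata tracked on the FFI views back into
// the Rust-side ciphertexts.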
update_noise_degree(from_amount, &cuda_ffi_from_amount);
update_noise_degree(to_amount, &cuda_ffi_to_amount);
}

View File

@@ -0,0 +1,175 @@
use crate::core_crypto::gpu::CudaStreams;
use crate::core_crypto::prelude::LweBskGroupingFactor;
use crate::integer::gpu::ciphertext::CudaIntegerRadixCiphertext;
use crate::integer::gpu::server_key::{CudaBootstrappingKey, CudaServerKey};
use crate::integer::gpu::{cuda_backend_erc20_assign, PBSType};
impl CudaServerKey {
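/// In-place encrypted ERC20-style transfer, without carry propagation.
///
/// Updates `from_amount` and `to_amount` directly on the GPU. As with the
/// other `unchecked_*` entry points, inputs are assumed to already have
/// empty block carries.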
pub fn unchecked_erc20_assign<T>(
&self,
from_amount: &mut T,
to_amount: &mut T,
amount: &T,
streams: &CudaStreams,
) where
T: CudaIntegerRadixCiphertext,
{
let num_blocks = amount.as_ref().d_blocks.lwe_ciphertext_count().0 as u32;
unsafe {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
cuda_backend_erc20_assign(
streams,
from_amount.as_mut(),
to_amount.as_mut(),
amount.as_ref(),
&d_bsk.d_vec,
&self.key_switching_key.d_vec,
self.message_modulus,
self.carry_modulus,
d_bsk.glwe_dimension,
d_bsk.polynomial_size,
self.key_switching_key
.input_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key
.output_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count,
d_bsk.decomp_base_log,
num_blocks,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
cuda_backend_erc20_assign(
streams,
from_amount.as_mut(),
to_amount.as_mut(),
amount.as_ref(),
&d_multibit_bsk.d_vec,
&self.key_switching_key.d_vec,
self.message_modulus,
self.carry_modulus,
d_multibit_bsk.glwe_dimension,
d_multibit_bsk.polynomial_size,
self.key_switching_key
.input_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key
.output_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_multibit_bsk.decomp_level_count,
d_multibit_bsk.decomp_base_log,
num_blocks,
PBSType::MultiBit,
d_multibit_bsk.grouping_factor,
None,
);
}
}
}
}
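/// As [`Self::unchecked_erc20_assign`], but leaves the inputs untouched and
/// returns the updated balances as fresh ciphertexts.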
pub fn unchecked_erc20<T>(
&self,
from_amount: &T,
to_amount: &T,
amount: &T,
streams: &CudaStreams,
) -> (T, T)
where
T: CudaIntegerRadixCiphertext,
{
let mut from_amount = from_amount.duplicate(streams);
let mut to_amount = to_amount.duplicate(streams);
self.unchecked_erc20_assign(&mut from_amount, &mut to_amount, amount, streams);
(from_amount, to_amount)
}
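/// Encrypted ERC20-style transfer.
///
/// Propagates any pending carries in `from_amount` and `to_amount`, then
/// returns the updated `(from, to)` pair as fresh ciphertexts. Note that
/// `amount` is not propagated here and is expected to already have empty
/// block carries.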
pub fn erc20<T>(
&self,
from_amount: &T,
to_amount: &T,
amount: &T,
streams: &CudaStreams,
) -> (T, T)
where
T: CudaIntegerRadixCiphertext,
{
let mut tmp_from_amount;
let mut tmp_to_amount;
let (from_amount, to_amount) = match (
from_amount.block_carries_are_empty(),
to_amount.block_carries_are_empty(),
) {
(true, true) => (from_amount, to_amount),
(true, false) => {
tmp_to_amount = to_amount.duplicate(streams);
self.full_propagate_assign(&mut tmp_to_amount, streams);
(from_amount, &tmp_to_amount)
}
(false, true) => {
tmp_from_amount = from_amount.duplicate(streams);
self.full_propagate_assign(&mut tmp_from_amount, streams);
(&tmp_from_amount, to_amount)
}
(false, false) => {
tmp_to_amount = to_amount.duplicate(streams);
tmp_from_amount = from_amount.duplicate(streams);
self.full_propagate_assign(&mut tmp_from_amount, streams);
self.full_propagate_assign(&mut tmp_to_amount, streams);
(&tmp_from_amount, &tmp_to_amount)
}
};
self.unchecked_erc20(from_amount, to_amount, amount, streams)
}
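/// In-place variant of [`Self::erc20`]: propagates pending carries directly
/// on the caller's ciphertexts, then updates both balances.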
pub fn erc20_assign<T>(
&self,
from_amount: &mut T,
to_amount: &mut T,
amount: &T,
streams: &CudaStreams,
) where
T: CudaIntegerRadixCiphertext,
{
// Both balances are in-out parameters, so carries must be propagated in
// place on the caller's ciphertexts; propagating duplicated copies (as the
// non-assign path does) would silently drop the results.
if !from_amount.block_carries_are_empty() {
self.full_propagate_assign(from_amount, streams);
}
if !to_amount.block_carries_are_empty() {
self.full_propagate_assign(to_amount, streams);
}
self.unchecked_erc20_assign(from_amount, to_amount, amount, streams);
}
}
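
For completeness, a rough sketch of exercising `CudaServerKey::erc20` directly at the integer layer. The setup names (`CudaStreams::new_single_gpu`, `GpuIndex`, `gen_keys_radix_gpu`) are taken from the existing GPU integer API as assumptions, not from this diff, and the parameter constant is a placeholder; use whichever shortint parameter set your tfhe-rs version exposes.

use tfhe::core_crypto::gpu::vec::GpuIndex;
use tfhe::core_crypto::gpu::CudaStreams;
use tfhe::integer::gpu::ciphertext::CudaUnsignedRadixCiphertext;
use tfhe::integer::gpu::gen_keys_radix_gpu;
use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS; // placeholder parameter set

fn erc20_gpu_sketch() {
    let streams = CudaStreams::new_single_gpu(GpuIndex::new(0));
    // 32 radix blocks of 2-bit messages give 64-bit balances.
    let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, 32, &streams);

    let d_from =
        CudaUnsignedRadixCiphertext::from_radix_ciphertext(&cks.encrypt(100u64), &streams);
    let d_to = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&cks.encrypt(50u64), &streams);
    let d_amount =
        CudaUnsignedRadixCiphertext::from_radix_ciphertext(&cks.encrypt(30u64), &streams);

    // Checked entry point: propagates carries in the balances, then transfers.
    let (d_new_from, d_new_to) = sks.erc20(&d_from, &d_to, &d_amount, &streams);

    let new_from: u64 = cks.decrypt(&d_new_from.to_radix_ciphertext(&streams));
    let new_to: u64 = cks.decrypt(&d_new_to.to_radix_ciphertext(&streams));
    assert_eq!((new_from, new_to), (70, 80));
}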

View File

@@ -37,6 +37,7 @@ mod bitwise_op;
mod cmux;
mod comparison;
mod div_mod;
mod erc20;
mod even_odd;
mod ilog2;
mod mul;