mirror of
https://github.com/zama-ai/tfhe-rs.git
synced 2026-01-13 16:47:59 -05:00
Compare commits
4 Commits
pa/paralle
...
al/erc20_e
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
eb3b988380 | ||
|
|
ec19579c39 | ||
|
|
8ed3b4b59d | ||
|
|
20daf182f0 |
@@ -86,6 +86,7 @@ fn main() {
|
||||
"cuda/include/integer/integer.h",
|
||||
"cuda/include/integer/rerand.h",
|
||||
"cuda/include/aes/aes.h",
|
||||
"cuda/include/erc20/erc20.h",
|
||||
"cuda/include/zk/zk.h",
|
||||
"cuda/include/keyswitch/keyswitch.h",
|
||||
"cuda/include/keyswitch/ks_enums.h",
|
||||
|
||||
20
backends/tfhe-cuda-backend/cuda/include/erc20/erc20.h
Normal file
20
backends/tfhe-cuda-backend/cuda/include/erc20/erc20.h
Normal file
@@ -0,0 +1,20 @@
|
||||
#pragma once
|
||||
#include "../integer/integer.h"
|
||||
extern "C" {
|
||||
uint64_t scratch_cuda_erc20_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t lwe_ciphertext_count, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_erc20_assign_64(CudaStreamsFFI streams,
|
||||
CudaRadixCiphertextFFI *from_amount,
|
||||
CudaRadixCiphertextFFI *to_amount,
|
||||
CudaRadixCiphertextFFI const *amount, int8_t *mem_ptr,
|
||||
void *const *bsks, void *const *ksks);
|
||||
|
||||
void cleanup_cuda_erc20(CudaStreamsFFI streams, int8_t **mem_ptr_void);
|
||||
}
|
||||
@@ -0,0 +1,82 @@
|
||||
#pragma once
|
||||
#include "../integer/integer_utilities.h"
|
||||
#include "integer/comparison.h"
|
||||
#include "integer/multiplication.h"
|
||||
#include "integer/subtraction.h"
|
||||
|
||||
template <typename Torus> struct int_erc20_buffer {
|
||||
int_radix_params params;
|
||||
|
||||
int_comparison_buffer<Torus> *diff_buffer;
|
||||
int_mul_memory<Torus> *mul_buffer;
|
||||
int_sc_prop_memory<Torus> *add_buffer;
|
||||
int_sub_and_propagate<Torus> *sub_buffer;
|
||||
CudaRadixCiphertextFFI *tmp_amount;
|
||||
CudaRadixCiphertextFFI *has_enough_funds;
|
||||
CudaStreams active_streams;
|
||||
InternalCudaStreams internal_cuda_streams;
|
||||
uint32_t num_internal_streams;
|
||||
bool allocate_gpu_memory;
|
||||
Torus *preallocated_h_lut;
|
||||
|
||||
int_erc20_buffer(CudaStreams streams, int_radix_params params,
|
||||
uint32_t num_radix_blocks, bool allocate_gpu_memory,
|
||||
uint64_t &size_tracker) {
|
||||
this->params = params;
|
||||
this->allocate_gpu_memory = allocate_gpu_memory;
|
||||
preallocated_h_lut = (Torus *)malloc(
|
||||
(params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus));
|
||||
diff_buffer = new int_comparison_buffer<Torus>(
|
||||
streams, COMPARISON_TYPE::GT, params, num_radix_blocks, false,
|
||||
allocate_gpu_memory, size_tracker, preallocated_h_lut);
|
||||
mul_buffer = new int_mul_memory<Torus>(
|
||||
streams, params, false, true, num_radix_blocks, allocate_gpu_memory,
|
||||
size_tracker, preallocated_h_lut);
|
||||
add_buffer = new int_sc_prop_memory<Torus>(
|
||||
streams, params, num_radix_blocks, FLAG_NONE, allocate_gpu_memory,
|
||||
size_tracker, preallocated_h_lut);
|
||||
sub_buffer = new int_sub_and_propagate<Torus>(
|
||||
streams, params, num_radix_blocks, FLAG_NONE, allocate_gpu_memory,
|
||||
size_tracker, preallocated_h_lut);
|
||||
tmp_amount = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), tmp_amount, num_radix_blocks,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
has_enough_funds = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), has_enough_funds, 1,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
active_streams = streams.active_gpu_subset(num_radix_blocks);
|
||||
num_internal_streams = 2;
|
||||
this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
|
||||
active_streams, num_internal_streams);
|
||||
}
|
||||
|
||||
void release(CudaStreams streams) {
|
||||
diff_buffer->release(streams);
|
||||
delete diff_buffer;
|
||||
diff_buffer = nullptr;
|
||||
mul_buffer->release(streams);
|
||||
delete mul_buffer;
|
||||
mul_buffer = nullptr;
|
||||
add_buffer->release(streams);
|
||||
delete add_buffer;
|
||||
add_buffer = nullptr;
|
||||
sub_buffer->release(streams);
|
||||
delete sub_buffer;
|
||||
sub_buffer = nullptr;
|
||||
|
||||
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
||||
tmp_amount, this->allocate_gpu_memory);
|
||||
delete tmp_amount;
|
||||
tmp_amount = nullptr;
|
||||
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
||||
has_enough_funds, this->allocate_gpu_memory);
|
||||
delete has_enough_funds;
|
||||
has_enough_funds = nullptr;
|
||||
|
||||
internal_cuda_streams.release(streams);
|
||||
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
|
||||
free(preallocated_h_lut);
|
||||
}
|
||||
};
|
||||
@@ -43,7 +43,8 @@ template <typename Torus> struct int_cmux_buffer {
|
||||
int_cmux_buffer(CudaStreams streams,
|
||||
std::function<Torus(Torus)> predicate_lut_f,
|
||||
int_radix_params params, uint32_t num_radix_blocks,
|
||||
bool allocate_gpu_memory, uint64_t &size_tracker) {
|
||||
bool allocate_gpu_memory, uint64_t &size_tracker,
|
||||
Torus *preallocated_h_lut = nullptr) {
|
||||
gpu_memory_allocated = allocate_gpu_memory;
|
||||
|
||||
this->params = params;
|
||||
@@ -88,20 +89,21 @@ template <typename Torus> struct int_cmux_buffer {
|
||||
streams.stream(0), streams.gpu_index(0), predicate_lut->get_lut(0, 0),
|
||||
predicate_lut->get_degree(0), predicate_lut->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, inverted_lut_f, gpu_memory_allocated);
|
||||
params.carry_modulus, inverted_lut_f, gpu_memory_allocated,
|
||||
preallocated_h_lut);
|
||||
|
||||
generate_device_accumulator_bivariate<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), predicate_lut->get_lut(0, 1),
|
||||
predicate_lut->get_degree(1), predicate_lut->get_max_degree(1),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, lut_f, gpu_memory_allocated);
|
||||
params.carry_modulus, lut_f, gpu_memory_allocated, preallocated_h_lut);
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
message_extract_lut->get_lut(0, 0), message_extract_lut->get_degree(0),
|
||||
message_extract_lut->get_max_degree(0), params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
message_extract_lut_f, gpu_memory_allocated);
|
||||
message_extract_lut_f, gpu_memory_allocated, preallocated_h_lut);
|
||||
Torus *h_lut_indexes = predicate_lut->h_lut_indexes;
|
||||
for (int index = 0; index < 2 * num_radix_blocks; index++) {
|
||||
if (index < num_radix_blocks) {
|
||||
|
||||
@@ -384,7 +384,8 @@ template <typename Torus> struct int_comparison_buffer {
|
||||
int_comparison_buffer(CudaStreams streams, COMPARISON_TYPE op,
|
||||
int_radix_params params, uint32_t num_radix_blocks,
|
||||
bool is_signed, bool allocate_gpu_memory,
|
||||
uint64_t &size_tracker) {
|
||||
uint64_t &size_tracker,
|
||||
Torus *preallocated_h_lut_from_elsewhere = nullptr) {
|
||||
gpu_memory_allocated = allocate_gpu_memory;
|
||||
this->params = params;
|
||||
this->op = op;
|
||||
@@ -426,7 +427,8 @@ template <typename Torus> struct int_comparison_buffer {
|
||||
streams.stream(0), streams.gpu_index(0), identity_lut->get_lut(0, 0),
|
||||
identity_lut->get_degree(0), identity_lut->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, identity_lut_f, gpu_memory_allocated);
|
||||
params.carry_modulus, identity_lut_f, gpu_memory_allocated,
|
||||
preallocated_h_lut_from_elsewhere);
|
||||
identity_lut->broadcast_lut(active_streams);
|
||||
|
||||
uint32_t total_modulus = params.message_modulus * params.carry_modulus;
|
||||
@@ -441,7 +443,8 @@ template <typename Torus> struct int_comparison_buffer {
|
||||
streams.stream(0), streams.gpu_index(0), is_zero_lut->get_lut(0, 0),
|
||||
is_zero_lut->get_degree(0), is_zero_lut->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, is_zero_f, gpu_memory_allocated);
|
||||
params.carry_modulus, is_zero_f, gpu_memory_allocated,
|
||||
preallocated_h_lut_from_elsewhere);
|
||||
|
||||
is_zero_lut->broadcast_lut(active_streams);
|
||||
|
||||
@@ -456,7 +459,8 @@ template <typename Torus> struct int_comparison_buffer {
|
||||
else
|
||||
return (x == IS_INFERIOR);
|
||||
},
|
||||
params, num_radix_blocks, allocate_gpu_memory, size_tracker);
|
||||
params, num_radix_blocks, allocate_gpu_memory, size_tracker,
|
||||
preallocated_h_lut_from_elsewhere);
|
||||
case COMPARISON_TYPE::GT:
|
||||
case COMPARISON_TYPE::GE:
|
||||
case COMPARISON_TYPE::LT:
|
||||
|
||||
@@ -77,7 +77,8 @@ void generate_device_accumulator_bivariate(
|
||||
cudaStream_t stream, uint32_t gpu_index, Torus *acc_bivariate,
|
||||
uint64_t *degree, uint64_t *max_degree, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t message_modulus, uint32_t carry_modulus,
|
||||
std::function<Torus(Torus, Torus)> f, bool gpu_memory_allocated);
|
||||
std::function<Torus(Torus, Torus)> f, bool gpu_memory_allocated,
|
||||
Torus *preallocated_h_lut = nullptr);
|
||||
|
||||
template <typename Torus>
|
||||
void generate_device_accumulator_bivariate_with_factor(
|
||||
@@ -114,14 +115,16 @@ void generate_device_accumulator(
|
||||
cudaStream_t stream, uint32_t gpu_index, Torus *acc, uint64_t *degree,
|
||||
uint64_t *max_degree, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t message_modulus, uint32_t carry_modulus,
|
||||
std::function<Torus(Torus)> f, bool gpu_memory_allocated);
|
||||
std::function<Torus(Torus)> f, bool gpu_memory_allocated,
|
||||
Torus *preallocated_h_lut = nullptr);
|
||||
|
||||
template <typename Torus>
|
||||
void generate_many_lut_device_accumulator(
|
||||
cudaStream_t stream, uint32_t gpu_index, Torus *acc, uint64_t *degrees,
|
||||
uint64_t *max_degree, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t message_modulus, uint32_t carry_modulus,
|
||||
std::vector<std::function<Torus(Torus)>> &f, bool gpu_memory_allocated);
|
||||
std::vector<std::function<Torus(Torus)>> &f, bool gpu_memory_allocated,
|
||||
Torus *preallocated_h_lut = nullptr);
|
||||
|
||||
struct radix_columns {
|
||||
std::vector<uint32_t> columns_counter;
|
||||
@@ -1160,6 +1163,7 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
|
||||
|
||||
bool mem_reuse = false;
|
||||
bool allocated_luts_message_carry;
|
||||
Torus *preallocated_h_lut;
|
||||
|
||||
void setup_index_buffers(CudaStreams streams, uint64_t &size_tracker) {
|
||||
|
||||
@@ -1206,7 +1210,8 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
|
||||
}
|
||||
|
||||
void setup_lookup_tables(CudaStreams streams, uint32_t num_radix_in_vec,
|
||||
const uint64_t *const degrees) {
|
||||
const uint64_t *const degrees,
|
||||
Torus *preallocated_h_lut = nullptr) {
|
||||
uint32_t message_modulus = params.message_modulus;
|
||||
bool _needs_processing = false;
|
||||
radix_columns current_columns(degrees, num_blocks_in_radix,
|
||||
@@ -1257,13 +1262,13 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
|
||||
luts_message_carry->get_degree(0),
|
||||
luts_message_carry->get_max_degree(0), params.glwe_dimension,
|
||||
params.polynomial_size, message_modulus, params.carry_modulus,
|
||||
lut_f_message, gpu_memory_allocated);
|
||||
lut_f_message, gpu_memory_allocated, preallocated_h_lut);
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), carry_acc,
|
||||
luts_message_carry->get_degree(1),
|
||||
luts_message_carry->get_max_degree(1), params.glwe_dimension,
|
||||
params.polynomial_size, message_modulus, params.carry_modulus,
|
||||
lut_f_carry, gpu_memory_allocated);
|
||||
lut_f_carry, gpu_memory_allocated, preallocated_h_lut);
|
||||
auto active_gpu_count_mc = streams.active_gpu_subset(pbs_count);
|
||||
luts_message_carry->broadcast_lut(active_gpu_count_mc);
|
||||
}
|
||||
@@ -1272,7 +1277,8 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
|
||||
CudaStreams streams, int_radix_params params,
|
||||
uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec,
|
||||
bool reduce_degrees_for_single_carry_propagation,
|
||||
bool allocate_gpu_memory, uint64_t &size_tracker) {
|
||||
bool allocate_gpu_memory, uint64_t &size_tracker,
|
||||
Torus *preallocated_h_lut = nullptr) {
|
||||
this->params = params;
|
||||
this->mem_reuse = false;
|
||||
this->max_total_blocks_in_vec = num_blocks_in_radix * max_num_radix_in_vec;
|
||||
@@ -1284,6 +1290,7 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
|
||||
this->allocated_luts_message_carry = false;
|
||||
this->reduce_degrees_for_single_carry_propagation =
|
||||
reduce_degrees_for_single_carry_propagation;
|
||||
this->preallocated_h_lut = preallocated_h_lut;
|
||||
|
||||
setup_index_buffers(streams, size_tracker);
|
||||
// because we setup_lut in host function for sum_ciphertexts to save memory
|
||||
@@ -1318,7 +1325,8 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
|
||||
CudaRadixCiphertextFFI *small_lwe_vector,
|
||||
int_radix_lut<Torus> *reused_lut,
|
||||
bool reduce_degrees_for_single_carry_propagation,
|
||||
bool allocate_gpu_memory, uint64_t &size_tracker) {
|
||||
bool allocate_gpu_memory, uint64_t &size_tracker,
|
||||
Torus *preallocated_h_lut = nullptr) {
|
||||
this->mem_reuse = true;
|
||||
this->params = params;
|
||||
this->max_total_blocks_in_vec = num_blocks_in_radix * max_num_radix_in_vec;
|
||||
@@ -1334,6 +1342,7 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
|
||||
this->current_blocks = current_blocks;
|
||||
this->small_lwe_vector = small_lwe_vector;
|
||||
this->luts_message_carry = reused_lut;
|
||||
this->preallocated_h_lut = preallocated_h_lut;
|
||||
|
||||
uint64_t message_modulus_bits = (uint64_t)std::log2(params.message_modulus);
|
||||
uint64_t carry_modulus_bits = (uint64_t)std::log2(params.carry_modulus);
|
||||
@@ -1395,10 +1404,12 @@ template <typename Torus> struct int_seq_group_prop_memory {
|
||||
int_radix_lut<Torus> *lut_sequential_algorithm;
|
||||
uint32_t grouping_size;
|
||||
bool gpu_memory_allocated;
|
||||
Torus *h_seq_lut_indexes;
|
||||
|
||||
int_seq_group_prop_memory(CudaStreams streams, int_radix_params params,
|
||||
uint32_t group_size, uint32_t big_lwe_size_bytes,
|
||||
bool allocate_gpu_memory, uint64_t &size_tracker) {
|
||||
bool allocate_gpu_memory, uint64_t &size_tracker,
|
||||
Torus *preallocated_h_lut = nullptr) {
|
||||
gpu_memory_allocated = allocate_gpu_memory;
|
||||
auto glwe_dimension = params.glwe_dimension;
|
||||
auto polynomial_size = params.polynomial_size;
|
||||
@@ -1413,7 +1424,7 @@ template <typename Torus> struct int_seq_group_prop_memory {
|
||||
allocate_gpu_memory);
|
||||
|
||||
int num_seq_luts = grouping_size - 1;
|
||||
Torus *h_seq_lut_indexes = (Torus *)malloc(num_seq_luts * sizeof(Torus));
|
||||
h_seq_lut_indexes = (Torus *)malloc(num_seq_luts * sizeof(Torus));
|
||||
lut_sequential_algorithm =
|
||||
new int_radix_lut<Torus>(streams, params, num_seq_luts, num_seq_luts,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
@@ -1427,7 +1438,7 @@ template <typename Torus> struct int_seq_group_prop_memory {
|
||||
lut_sequential_algorithm->get_degree(index),
|
||||
lut_sequential_algorithm->get_max_degree(index), glwe_dimension,
|
||||
polynomial_size, message_modulus, carry_modulus, f_lut_sequential,
|
||||
gpu_memory_allocated);
|
||||
gpu_memory_allocated, preallocated_h_lut);
|
||||
h_seq_lut_indexes[index] = index;
|
||||
}
|
||||
Torus *seq_lut_indexes = lut_sequential_algorithm->get_lut_indexes(0, 0);
|
||||
@@ -1436,7 +1447,6 @@ template <typename Torus> struct int_seq_group_prop_memory {
|
||||
streams.stream(0), streams.gpu_index(0), allocate_gpu_memory);
|
||||
auto active_streams = streams.active_gpu_subset(num_seq_luts);
|
||||
lut_sequential_algorithm->broadcast_lut(active_streams);
|
||||
free(h_seq_lut_indexes);
|
||||
};
|
||||
void release(CudaStreams streams) {
|
||||
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
||||
@@ -1446,6 +1456,7 @@ template <typename Torus> struct int_seq_group_prop_memory {
|
||||
delete group_resolved_carries;
|
||||
delete lut_sequential_algorithm;
|
||||
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
|
||||
free(h_seq_lut_indexes);
|
||||
};
|
||||
};
|
||||
|
||||
@@ -1457,7 +1468,8 @@ template <typename Torus> struct int_hs_group_prop_memory {
|
||||
|
||||
int_hs_group_prop_memory(CudaStreams streams, int_radix_params params,
|
||||
uint32_t num_groups, uint32_t big_lwe_size_bytes,
|
||||
bool allocate_gpu_memory, uint64_t &size_tracker) {
|
||||
bool allocate_gpu_memory, uint64_t &size_tracker,
|
||||
Torus *preallocated_h_lut = nullptr) {
|
||||
gpu_memory_allocated = allocate_gpu_memory;
|
||||
auto glwe_dimension = params.glwe_dimension;
|
||||
auto polynomial_size = params.polynomial_size;
|
||||
@@ -1487,7 +1499,7 @@ template <typename Torus> struct int_hs_group_prop_memory {
|
||||
lut_hillis_steele->get_lut(0, 0), lut_hillis_steele->get_degree(0),
|
||||
lut_hillis_steele->get_max_degree(0), glwe_dimension, polynomial_size,
|
||||
message_modulus, carry_modulus, f_lut_hillis_steele,
|
||||
gpu_memory_allocated);
|
||||
gpu_memory_allocated, preallocated_h_lut);
|
||||
auto active_streams = streams.active_gpu_subset(num_groups);
|
||||
lut_hillis_steele->broadcast_lut(active_streams);
|
||||
};
|
||||
@@ -1511,7 +1523,7 @@ template <typename Torus> struct int_shifted_blocks_and_states_memory {
|
||||
int_shifted_blocks_and_states_memory(
|
||||
CudaStreams streams, int_radix_params params, uint32_t num_radix_blocks,
|
||||
uint32_t num_many_lut, uint32_t grouping_size, bool allocate_gpu_memory,
|
||||
uint64_t &size_tracker) {
|
||||
uint64_t &size_tracker, Torus *preallocated_h_lut = nullptr) {
|
||||
|
||||
gpu_memory_allocated = allocate_gpu_memory;
|
||||
auto glwe_dimension = params.glwe_dimension;
|
||||
@@ -1561,7 +1573,7 @@ template <typename Torus> struct int_shifted_blocks_and_states_memory {
|
||||
streams.stream(0), streams.gpu_index(0), first_block_lut,
|
||||
first_block_lut_degrees, first_block_lut_max_degree, glwe_dimension,
|
||||
polynomial_size, message_modulus, carry_modulus, f_first_grouping_luts,
|
||||
gpu_memory_allocated);
|
||||
gpu_memory_allocated, preallocated_h_lut);
|
||||
|
||||
// luts for other blocks of the first grouping
|
||||
for (int lut_id = 1; lut_id < grouping_size; lut_id++) {
|
||||
@@ -1584,7 +1596,8 @@ template <typename Torus> struct int_shifted_blocks_and_states_memory {
|
||||
generate_many_lut_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), lut, lut_degrees,
|
||||
lut_max_degree, glwe_dimension, polynomial_size, message_modulus,
|
||||
carry_modulus, f_grouping_luts, gpu_memory_allocated);
|
||||
carry_modulus, f_grouping_luts, gpu_memory_allocated,
|
||||
preallocated_h_lut);
|
||||
}
|
||||
|
||||
// luts for the rest of groupings (except for the last block)
|
||||
@@ -1610,7 +1623,8 @@ template <typename Torus> struct int_shifted_blocks_and_states_memory {
|
||||
generate_many_lut_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), lut, lut_degrees,
|
||||
lut_max_degree, glwe_dimension, polynomial_size, message_modulus,
|
||||
carry_modulus, f_grouping_luts, gpu_memory_allocated);
|
||||
carry_modulus, f_grouping_luts, gpu_memory_allocated,
|
||||
preallocated_h_lut);
|
||||
}
|
||||
|
||||
// For the last block we need to generate a new lut
|
||||
@@ -1635,7 +1649,7 @@ template <typename Torus> struct int_shifted_blocks_and_states_memory {
|
||||
streams.stream(0), streams.gpu_index(0), last_block_lut,
|
||||
last_block_lut_degrees, last_block_lut_max_degree, glwe_dimension,
|
||||
polynomial_size, message_modulus, carry_modulus, f_last_grouping_luts,
|
||||
gpu_memory_allocated);
|
||||
gpu_memory_allocated, preallocated_h_lut);
|
||||
|
||||
// Generate the indexes to switch between luts within the pbs
|
||||
uint64_t lut_indexes_size = num_radix_blocks * sizeof(Torus);
|
||||
@@ -1706,11 +1720,12 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
|
||||
uint32_t group_size;
|
||||
bool use_sequential_algorithm_to_resolve_group_carries;
|
||||
bool gpu_memory_allocated;
|
||||
Torus *h_second_lut_indexes;
|
||||
|
||||
int_prop_simu_group_carries_memory(
|
||||
CudaStreams streams, int_radix_params params, uint32_t num_radix_blocks,
|
||||
uint32_t grouping_size, uint32_t num_groups, bool allocate_gpu_memory,
|
||||
uint64_t &size_tracker) {
|
||||
uint64_t &size_tracker, Torus *preallocated_h_lut = nullptr) {
|
||||
|
||||
gpu_memory_allocated = allocate_gpu_memory;
|
||||
auto glwe_dimension = params.glwe_dimension;
|
||||
@@ -1803,7 +1818,8 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
|
||||
luts_array_second_step->get_degree(lut_id),
|
||||
luts_array_second_step->get_max_degree(lut_id), glwe_dimension,
|
||||
polynomial_size, message_modulus, carry_modulus,
|
||||
f_first_grouping_inner_propagation, gpu_memory_allocated);
|
||||
f_first_grouping_inner_propagation, gpu_memory_allocated,
|
||||
preallocated_h_lut);
|
||||
}
|
||||
|
||||
auto f_first_grouping_outer_propagation =
|
||||
@@ -1818,7 +1834,8 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
|
||||
luts_array_second_step->get_degree(lut_id),
|
||||
luts_array_second_step->get_max_degree(lut_id), glwe_dimension,
|
||||
polynomial_size, message_modulus, carry_modulus,
|
||||
f_first_grouping_outer_propagation, gpu_memory_allocated);
|
||||
f_first_grouping_outer_propagation, gpu_memory_allocated,
|
||||
preallocated_h_lut);
|
||||
|
||||
// for other groupings inner propagation
|
||||
for (int index = 0; index < grouping_size; index++) {
|
||||
@@ -1842,7 +1859,8 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
|
||||
luts_array_second_step->get_degree(lut_id),
|
||||
luts_array_second_step->get_max_degree(lut_id), glwe_dimension,
|
||||
polynomial_size, message_modulus, carry_modulus,
|
||||
f_other_groupings_inner_propagation, gpu_memory_allocated);
|
||||
f_other_groupings_inner_propagation, gpu_memory_allocated,
|
||||
preallocated_h_lut);
|
||||
}
|
||||
|
||||
if (use_sequential_algorithm_to_resolve_group_carries) {
|
||||
@@ -1864,7 +1882,7 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
|
||||
luts_array_second_step->get_degree(lut_id),
|
||||
luts_array_second_step->get_max_degree(lut_id), glwe_dimension,
|
||||
polynomial_size, message_modulus, carry_modulus,
|
||||
f_group_propagation, gpu_memory_allocated);
|
||||
f_group_propagation, gpu_memory_allocated, preallocated_h_lut);
|
||||
}
|
||||
} else {
|
||||
uint32_t lut_id = 2 * grouping_size;
|
||||
@@ -1882,10 +1900,10 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
|
||||
luts_array_second_step->get_degree(lut_id),
|
||||
luts_array_second_step->get_max_degree(lut_id), glwe_dimension,
|
||||
polynomial_size, message_modulus, carry_modulus, f_group_propagation,
|
||||
gpu_memory_allocated);
|
||||
gpu_memory_allocated, preallocated_h_lut);
|
||||
}
|
||||
|
||||
Torus *h_second_lut_indexes = (Torus *)malloc(lut_indexes_size);
|
||||
h_second_lut_indexes = (Torus *)malloc(lut_indexes_size);
|
||||
|
||||
for (int index = 0; index < num_radix_blocks; index++) {
|
||||
uint32_t grouping_index = index / grouping_size;
|
||||
@@ -1937,15 +1955,13 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
|
||||
|
||||
seq_group_prop_mem = new int_seq_group_prop_memory<Torus>(
|
||||
streams, params, grouping_size, big_lwe_size_bytes,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
allocate_gpu_memory, size_tracker, preallocated_h_lut);
|
||||
|
||||
} else {
|
||||
hs_group_prop_mem = new int_hs_group_prop_memory<Torus>(
|
||||
streams, params, num_groups, big_lwe_size_bytes, allocate_gpu_memory,
|
||||
size_tracker);
|
||||
size_tracker, preallocated_h_lut);
|
||||
}
|
||||
|
||||
free(h_second_lut_indexes);
|
||||
};
|
||||
|
||||
// needed for the division to update the lut indexes
|
||||
@@ -1996,6 +2012,7 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
|
||||
delete luts_array_second_step;
|
||||
delete[] h_scalar_array_cum_sum;
|
||||
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
|
||||
free(h_second_lut_indexes);
|
||||
};
|
||||
};
|
||||
|
||||
@@ -2020,7 +2037,8 @@ template <typename Torus> struct int_sc_prop_memory {
|
||||
|
||||
int_sc_prop_memory(CudaStreams streams, int_radix_params params,
|
||||
uint32_t num_radix_blocks, uint32_t requested_flag_in,
|
||||
bool allocate_gpu_memory, uint64_t &size_tracker) {
|
||||
bool allocate_gpu_memory, uint64_t &size_tracker,
|
||||
Torus *preallocated_h_lut = nullptr) {
|
||||
gpu_memory_allocated = allocate_gpu_memory;
|
||||
this->params = params;
|
||||
auto glwe_dimension = params.glwe_dimension;
|
||||
@@ -2040,11 +2058,11 @@ template <typename Torus> struct int_sc_prop_memory {
|
||||
|
||||
shifted_blocks_state_mem = new int_shifted_blocks_and_states_memory<Torus>(
|
||||
streams, params, num_radix_blocks, num_many_lut, grouping_size,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
allocate_gpu_memory, size_tracker, preallocated_h_lut);
|
||||
|
||||
prop_simu_group_carries_mem = new int_prop_simu_group_carries_memory<Torus>(
|
||||
streams, params, num_radix_blocks, grouping_size, num_groups,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
allocate_gpu_memory, size_tracker, preallocated_h_lut);
|
||||
|
||||
// Step 3 elements
|
||||
int num_luts_message_extract =
|
||||
@@ -2061,8 +2079,8 @@ template <typename Torus> struct int_sc_prop_memory {
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
lut_message_extract->get_lut(0, 0), lut_message_extract->get_degree(0),
|
||||
lut_message_extract->get_max_degree(0), glwe_dimension, polynomial_size,
|
||||
message_modulus, carry_modulus, f_message_extract,
|
||||
gpu_memory_allocated);
|
||||
message_modulus, carry_modulus, f_message_extract, gpu_memory_allocated,
|
||||
preallocated_h_lut);
|
||||
|
||||
// This store a single block that with be used to store the overflow or
|
||||
// carry results
|
||||
@@ -2120,7 +2138,7 @@ template <typename Torus> struct int_sc_prop_memory {
|
||||
lut_overflow_flag_prep->get_degree(0),
|
||||
lut_overflow_flag_prep->get_max_degree(0), glwe_dimension,
|
||||
polynomial_size, message_modulus, carry_modulus, f_overflow_fp,
|
||||
gpu_memory_allocated);
|
||||
gpu_memory_allocated, preallocated_h_lut);
|
||||
|
||||
auto active_streams = streams.active_gpu_subset(1);
|
||||
lut_overflow_flag_prep->broadcast_lut(active_streams);
|
||||
@@ -2152,7 +2170,7 @@ template <typename Torus> struct int_sc_prop_memory {
|
||||
lut_message_extract->get_degree(1),
|
||||
lut_message_extract->get_max_degree(1), glwe_dimension,
|
||||
polynomial_size, message_modulus, carry_modulus, f_overflow_last,
|
||||
gpu_memory_allocated);
|
||||
gpu_memory_allocated, preallocated_h_lut);
|
||||
|
||||
Torus *h_lut_indexes = lut_message_extract->h_lut_indexes;
|
||||
for (int index = 0; index < num_radix_blocks + 1; index++) {
|
||||
@@ -2179,7 +2197,7 @@ template <typename Torus> struct int_sc_prop_memory {
|
||||
lut_message_extract->get_degree(1),
|
||||
lut_message_extract->get_max_degree(1), glwe_dimension,
|
||||
polynomial_size, message_modulus, carry_modulus, f_carry_last,
|
||||
gpu_memory_allocated);
|
||||
gpu_memory_allocated, preallocated_h_lut);
|
||||
|
||||
Torus *h_lut_indexes = lut_message_extract->h_lut_indexes;
|
||||
for (int index = 0; index < num_radix_blocks + 1; index++) {
|
||||
|
||||
@@ -21,7 +21,7 @@ template <typename Torus> struct int_mul_memory {
|
||||
int_mul_memory(CudaStreams streams, int_radix_params params,
|
||||
bool const is_boolean_left, bool const is_boolean_right,
|
||||
uint32_t num_radix_blocks, bool allocate_gpu_memory,
|
||||
uint64_t &size_tracker) {
|
||||
uint64_t &size_tracker, Torus *preallocated_h_lut = nullptr) {
|
||||
gpu_memory_allocated = allocate_gpu_memory;
|
||||
this->boolean_mul = is_boolean_left || is_boolean_right;
|
||||
this->params = params;
|
||||
@@ -43,7 +43,7 @@ template <typename Torus> struct int_mul_memory {
|
||||
zero_out_predicate_lut->get_degree(0),
|
||||
zero_out_predicate_lut->get_max_degree(0), params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
zero_out_predicate_lut_f, gpu_memory_allocated);
|
||||
zero_out_predicate_lut_f, gpu_memory_allocated, preallocated_h_lut);
|
||||
|
||||
auto active_streams = streams.active_gpu_subset(num_radix_blocks);
|
||||
zero_out_predicate_lut->broadcast_lut(active_streams);
|
||||
|
||||
@@ -142,14 +142,15 @@ template <typename Torus> struct int_sub_and_propagate {
|
||||
|
||||
int_sub_and_propagate(CudaStreams streams, const int_radix_params params,
|
||||
uint32_t num_radix_blocks, uint32_t requested_flag_in,
|
||||
bool allocate_gpu_memory, uint64_t &size_tracker) {
|
||||
bool allocate_gpu_memory, uint64_t &size_tracker,
|
||||
Torus *preallocated_h_lut = nullptr) {
|
||||
|
||||
this->params = params;
|
||||
this->allocate_gpu_memory = allocate_gpu_memory;
|
||||
|
||||
this->sc_prop_mem = new int_sc_prop_memory<Torus>(
|
||||
streams, params, num_radix_blocks, requested_flag_in,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
allocate_gpu_memory, size_tracker, preallocated_h_lut);
|
||||
|
||||
this->neg_rhs_array = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
file(GLOB_RECURSE SOURCES "*.cu")
|
||||
add_library(tfhe_cuda_backend STATIC ${SOURCES} pbs/programmable_bootstrap_multibit_128.cuh
|
||||
pbs/programmable_bootstrap_multibit_128.cu)
|
||||
add_library(tfhe_cuda_backend STATIC ${SOURCES})
|
||||
set_target_properties(tfhe_cuda_backend PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON)
|
||||
target_link_libraries(tfhe_cuda_backend PUBLIC cudart OpenMP::OpenMP_CXX)
|
||||
target_include_directories(tfhe_cuda_backend PRIVATE .)
|
||||
|
||||
85
backends/tfhe-cuda-backend/cuda/src/erc20/erc20.cu
Normal file
85
backends/tfhe-cuda-backend/cuda/src/erc20/erc20.cu
Normal file
@@ -0,0 +1,85 @@
|
||||
#include "erc20/erc20.cuh"
|
||||
|
||||
uint64_t scratch_cuda_erc20_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t lwe_ciphertext_count, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type) {
|
||||
PUSH_RANGE("scratch erc20")
|
||||
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
|
||||
big_lwe_dimension, small_lwe_dimension, ks_level,
|
||||
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
|
||||
message_modulus, carry_modulus, noise_reduction_type);
|
||||
|
||||
std::function<uint64_t(uint64_t)> predicate_lut_f =
|
||||
[](uint64_t x) -> uint64_t { return x == 1; };
|
||||
|
||||
uint64_t ret = scratch_cuda_erc20<uint64_t>(
|
||||
CudaStreams(streams), (int_erc20_buffer<uint64_t> **)mem_ptr,
|
||||
lwe_ciphertext_count, params, allocate_gpu_memory);
|
||||
POP_RANGE()
|
||||
return ret;
|
||||
}
|
||||
|
||||
void cuda_erc20_assign_64(CudaStreamsFFI streams,
|
||||
CudaRadixCiphertextFFI *from_amount,
|
||||
CudaRadixCiphertextFFI *to_amount,
|
||||
CudaRadixCiphertextFFI const *amount, int8_t *mem_ptr,
|
||||
void *const *bsks, void *const *ksks) {
|
||||
PUSH_RANGE("erc20")
|
||||
auto mem = reinterpret_cast<int_erc20_buffer<uint64_t> *>(mem_ptr);
|
||||
switch (mem->params.polynomial_size) {
|
||||
case 256:
|
||||
host_erc20_assign<uint64_t, AmortizedDegree<256>>(
|
||||
CudaStreams(streams), from_amount, to_amount, amount, mem, bsks,
|
||||
(uint64_t **)(ksks));
|
||||
break;
|
||||
case 512:
|
||||
host_erc20_assign<uint64_t, AmortizedDegree<512>>(
|
||||
CudaStreams(streams), from_amount, to_amount, amount, mem, bsks,
|
||||
(uint64_t **)(ksks));
|
||||
break;
|
||||
case 1024:
|
||||
host_erc20_assign<uint64_t, AmortizedDegree<1024>>(
|
||||
CudaStreams(streams), from_amount, to_amount, amount, mem, bsks,
|
||||
(uint64_t **)(ksks));
|
||||
break;
|
||||
case 2048:
|
||||
host_erc20_assign<uint64_t, AmortizedDegree<2048>>(
|
||||
CudaStreams(streams), from_amount, to_amount, amount, mem, bsks,
|
||||
(uint64_t **)(ksks));
|
||||
break;
|
||||
case 4096:
|
||||
host_erc20_assign<uint64_t, AmortizedDegree<4096>>(
|
||||
CudaStreams(streams), from_amount, to_amount, amount, mem, bsks,
|
||||
(uint64_t **)(ksks));
|
||||
break;
|
||||
case 8192:
|
||||
host_erc20_assign<uint64_t, AmortizedDegree<8192>>(
|
||||
CudaStreams(streams), from_amount, to_amount, amount, mem, bsks,
|
||||
(uint64_t **)(ksks));
|
||||
break;
|
||||
case 16384:
|
||||
host_erc20_assign<uint64_t, AmortizedDegree<16384>>(
|
||||
CudaStreams(streams), from_amount, to_amount, amount, mem, bsks,
|
||||
(uint64_t **)(ksks));
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
|
||||
"Supported N's are powers of two in the interval [256..16384].")
|
||||
}
|
||||
POP_RANGE()
|
||||
}
|
||||
|
||||
void cleanup_cuda_erc20(CudaStreamsFFI streams, int8_t **mem_ptr_void) {
|
||||
PUSH_RANGE("cleanup erc20")
|
||||
int_erc20_buffer<uint64_t> *mem_ptr =
|
||||
(int_erc20_buffer<uint64_t> *)(*mem_ptr_void);
|
||||
mem_ptr->release(CudaStreams(streams));
|
||||
delete mem_ptr;
|
||||
*mem_ptr_void = nullptr;
|
||||
POP_RANGE()
|
||||
}
|
||||
49
backends/tfhe-cuda-backend/cuda/src/erc20/erc20.cuh
Normal file
49
backends/tfhe-cuda-backend/cuda/src/erc20/erc20.cuh
Normal file
@@ -0,0 +1,49 @@
|
||||
#pragma once
|
||||
#include "erc20/erc20.h"
|
||||
#include "erc20/erc20_utilities.h"
|
||||
#include "integer/comparison.cuh"
|
||||
#include "integer/integer.cuh"
|
||||
#include "integer/multiplication.cuh"
|
||||
#include "integer/subtraction.cuh"
|
||||
|
||||
template <typename Torus, class params>
|
||||
__host__ void host_erc20_assign(CudaStreams streams,
|
||||
CudaRadixCiphertextFFI *from_amount,
|
||||
CudaRadixCiphertextFFI *to_amount,
|
||||
CudaRadixCiphertextFFI const *amount,
|
||||
int_erc20_buffer<Torus> *mem_ptr,
|
||||
void *const *bsks, Torus *const *ksks) {
|
||||
auto num_radix_blocks = from_amount->num_radix_blocks;
|
||||
host_difference_check<Torus>(streams, mem_ptr->has_enough_funds, from_amount,
|
||||
amount, mem_ptr->diff_buffer,
|
||||
mem_ptr->diff_buffer->diff_buffer->operator_f,
|
||||
bsks, ksks, num_radix_blocks);
|
||||
host_integer_mult_radix<Torus, params>(
|
||||
streams, mem_ptr->tmp_amount, amount, false, mem_ptr->has_enough_funds,
|
||||
true, bsks, ksks, mem_ptr->mul_buffer, num_radix_blocks);
|
||||
|
||||
mem_ptr->internal_cuda_streams.internal_streams_wait_for_main_stream_0(
|
||||
streams);
|
||||
// stream1
|
||||
host_add_and_propagate_single_carry(
|
||||
mem_ptr->internal_cuda_streams[0], to_amount, mem_ptr->tmp_amount,
|
||||
nullptr, nullptr, mem_ptr->add_buffer, bsks, ksks, FLAG_NONE, 0);
|
||||
// stream2
|
||||
host_sub_and_propagate_single_carry(
|
||||
mem_ptr->internal_cuda_streams[1], to_amount, mem_ptr->tmp_amount,
|
||||
nullptr, nullptr, mem_ptr->sub_buffer, bsks, ksks, FLAG_NONE, 0);
|
||||
mem_ptr->internal_cuda_streams.main_stream_0_wait_for_internal_streams(
|
||||
streams);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ uint64_t scratch_cuda_erc20(CudaStreams streams,
|
||||
int_erc20_buffer<Torus> **mem_ptr,
|
||||
uint32_t num_radix_blocks,
|
||||
int_radix_params params,
|
||||
bool allocate_gpu_memory) {
|
||||
uint64_t size_tracker = 0;
|
||||
*mem_ptr = new int_erc20_buffer<Torus>(streams, params, num_radix_blocks,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
return size_tracker;
|
||||
}
|
||||
@@ -134,7 +134,7 @@ __host__ void are_all_comparisons_block_true(
|
||||
auto is_equal_to_num_blocks_lut_f = [chunk_length](Torus x) -> Torus {
|
||||
return x == chunk_length;
|
||||
};
|
||||
generate_device_accumulator_with_cpu_prealloc<Torus>(
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
is_max_value_lut->get_lut(0, 1), is_max_value_lut->get_degree(1),
|
||||
is_max_value_lut->get_max_degree(1), glwe_dimension,
|
||||
@@ -482,7 +482,7 @@ tree_sign_reduction(CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
y = x;
|
||||
f = sign_handler_f;
|
||||
}
|
||||
generate_device_accumulator_with_cpu_prealloc<Torus>(
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), last_lut->get_lut(0, 0),
|
||||
last_lut->get_degree(0), last_lut->get_max_degree(0), glwe_dimension,
|
||||
polynomial_size, message_modulus, carry_modulus, f, true,
|
||||
|
||||
@@ -9,7 +9,6 @@
|
||||
#include "integer/scalar_addition.cuh"
|
||||
#include "linearalgebra/addition.cuh"
|
||||
#include "linearalgebra/negation.cuh"
|
||||
#include "pbs/pbs_128_utilities.h"
|
||||
#include "polynomial/functions.cuh"
|
||||
#include "utils/helper.cuh"
|
||||
#include "utils/helper_multi_gpu.cuh"
|
||||
@@ -1040,26 +1039,41 @@ void generate_device_accumulator_bivariate(
|
||||
cudaStream_t stream, uint32_t gpu_index, Torus *acc_bivariate,
|
||||
uint64_t *degree, uint64_t *max_degree, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t message_modulus, uint32_t carry_modulus,
|
||||
std::function<Torus(Torus, Torus)> f, bool gpu_memory_allocated) {
|
||||
std::function<Torus(Torus, Torus)> f, bool gpu_memory_allocated,
|
||||
Torus *preallocated_cpu_lut) {
|
||||
PUSH_RANGE("gen bivar lut acc")
|
||||
|
||||
// host lut
|
||||
Torus *h_lut =
|
||||
(Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
|
||||
*max_degree = message_modulus * carry_modulus - 1;
|
||||
// fill bivariate accumulator
|
||||
*degree = generate_lookup_table_bivariate<Torus>(
|
||||
h_lut, glwe_dimension, polynomial_size, message_modulus, carry_modulus,
|
||||
f);
|
||||
if (preallocated_cpu_lut == nullptr) {
|
||||
// host lut
|
||||
Torus *h_lut =
|
||||
(Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
|
||||
*max_degree = message_modulus * carry_modulus - 1;
|
||||
// fill bivariate accumulator
|
||||
*degree = generate_lookup_table_bivariate<Torus>(
|
||||
h_lut, glwe_dimension, polynomial_size, message_modulus, carry_modulus,
|
||||
f);
|
||||
|
||||
// copy host lut and lut_indexes_vec to device
|
||||
cuda_memcpy_with_size_tracking_async_to_gpu(
|
||||
acc_bivariate, h_lut,
|
||||
(glwe_dimension + 1) * polynomial_size * sizeof(Torus), stream, gpu_index,
|
||||
gpu_memory_allocated);
|
||||
// copy host lut and lut_indexes_vec to device
|
||||
cuda_memcpy_with_size_tracking_async_to_gpu(
|
||||
acc_bivariate, h_lut,
|
||||
(glwe_dimension + 1) * polynomial_size * sizeof(Torus), stream,
|
||||
gpu_index, gpu_memory_allocated);
|
||||
|
||||
cuda_synchronize_stream(stream, gpu_index);
|
||||
free(h_lut);
|
||||
cuda_synchronize_stream(stream, gpu_index);
|
||||
free(h_lut);
|
||||
} else {
|
||||
*max_degree = message_modulus * carry_modulus - 1;
|
||||
// fill bivariate accumulator
|
||||
*degree = generate_lookup_table_bivariate<Torus>(
|
||||
preallocated_cpu_lut, glwe_dimension, polynomial_size, message_modulus,
|
||||
carry_modulus, f);
|
||||
|
||||
// copy host lut and lut_indexes_vec to device
|
||||
cuda_memcpy_with_size_tracking_async_to_gpu(
|
||||
acc_bivariate, preallocated_cpu_lut,
|
||||
(glwe_dimension + 1) * polynomial_size * sizeof(Torus), stream,
|
||||
gpu_index, gpu_memory_allocated);
|
||||
}
|
||||
POP_RANGE()
|
||||
}
|
||||
|
||||
@@ -1097,41 +1111,6 @@ void generate_device_accumulator_bivariate_with_factor(
|
||||
cuda_synchronize_stream(stream, gpu_index);
|
||||
free(h_lut);
|
||||
}
|
||||
/*
|
||||
* generate bivariate accumulator for device pointer
|
||||
* using preallocated host lut to avoid blocking the cpu thread
|
||||
* with the stream synchronization (required to free the host lut).
|
||||
* This enables concurrent execution of multiple streams when using
|
||||
* a single cpu thread.
|
||||
* stream - cuda stream
|
||||
* acc - device pointer for bivariate accumulator
|
||||
* ...
|
||||
* f - wrapping function with two Torus inputs
|
||||
* h_lut - preallocated host lut to be used
|
||||
*
|
||||
*/
|
||||
template <typename Torus>
|
||||
void generate_device_accumulator_bivariate_with_cpu_prealloc(
|
||||
cudaStream_t stream, uint32_t gpu_index, Torus *acc_bivariate,
|
||||
uint64_t *degree, uint64_t *max_degree, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t message_modulus, uint32_t carry_modulus,
|
||||
std::function<Torus(Torus, Torus)> f, bool gpu_memory_allocated,
|
||||
Torus *h_lut) {
|
||||
PUSH_RANGE("gen bivar lut acc")
|
||||
|
||||
*max_degree = message_modulus * carry_modulus - 1;
|
||||
// fill bivariate accumulator
|
||||
*degree = generate_lookup_table_bivariate<Torus>(
|
||||
h_lut, glwe_dimension, polynomial_size, message_modulus, carry_modulus,
|
||||
f);
|
||||
|
||||
// copy host lut and lut_indexes_vec to device
|
||||
cuda_memcpy_with_size_tracking_async_to_gpu(
|
||||
acc_bivariate, h_lut,
|
||||
(glwe_dimension + 1) * polynomial_size * sizeof(Torus), stream, gpu_index,
|
||||
gpu_memory_allocated);
|
||||
POP_RANGE()
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
void generate_device_accumulator_with_encoding(
|
||||
@@ -1190,33 +1169,6 @@ void generate_device_accumulator_with_encoding_with_cpu_prealloc(
|
||||
*/
|
||||
template <typename Torus>
|
||||
void generate_device_accumulator(
|
||||
cudaStream_t stream, uint32_t gpu_index, Torus *acc, uint64_t *degree,
|
||||
uint64_t *max_degree, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t message_modulus, uint32_t carry_modulus,
|
||||
std::function<Torus(Torus)> f, bool gpu_memory_allocated) {
|
||||
|
||||
PUSH_RANGE("gen lut acc")
|
||||
generate_device_accumulator_with_encoding(
|
||||
stream, gpu_index, acc, degree, max_degree, glwe_dimension,
|
||||
polynomial_size, message_modulus, carry_modulus, message_modulus,
|
||||
carry_modulus, f, gpu_memory_allocated);
|
||||
POP_RANGE()
|
||||
}
|
||||
|
||||
/*
|
||||
* generate accumulator for device pointer using preallocated
|
||||
* host lut to avoid blocking the cpu thread with the stream
|
||||
* synchronization (required to free the host lut).
|
||||
* This enables concurrent execution of multiple streams when using
|
||||
* a single cpu thread.
|
||||
* v_stream - cuda stream
|
||||
* acc - device pointer for accumulator
|
||||
* ...
|
||||
* f - evaluating function with one Torus input
|
||||
* h_lut - preallocated host lut to be used
|
||||
*/
|
||||
template <typename Torus>
|
||||
void generate_device_accumulator_with_cpu_prealloc(
|
||||
cudaStream_t stream, uint32_t gpu_index, Torus *acc, uint64_t *degree,
|
||||
uint64_t *max_degree, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t message_modulus, uint32_t carry_modulus,
|
||||
@@ -1224,12 +1176,19 @@ void generate_device_accumulator_with_cpu_prealloc(
|
||||
Torus *preallocated_h_lut) {
|
||||
|
||||
PUSH_RANGE("gen lut acc")
|
||||
generate_device_accumulator_with_encoding_with_cpu_prealloc(
|
||||
stream, gpu_index, acc, degree, max_degree, glwe_dimension,
|
||||
polynomial_size, message_modulus, carry_modulus, message_modulus,
|
||||
carry_modulus, f, gpu_memory_allocated, preallocated_h_lut);
|
||||
if (preallocated_h_lut != nullptr)
|
||||
generate_device_accumulator_with_encoding_with_cpu_prealloc(
|
||||
stream, gpu_index, acc, degree, max_degree, glwe_dimension,
|
||||
polynomial_size, message_modulus, carry_modulus, message_modulus,
|
||||
carry_modulus, f, gpu_memory_allocated, preallocated_h_lut);
|
||||
else
|
||||
generate_device_accumulator_with_encoding(
|
||||
stream, gpu_index, acc, degree, max_degree, glwe_dimension,
|
||||
polynomial_size, message_modulus, carry_modulus, message_modulus,
|
||||
carry_modulus, f, gpu_memory_allocated);
|
||||
POP_RANGE()
|
||||
}
|
||||
|
||||
/*
|
||||
* generate many lut accumulator for device pointer
|
||||
* v_stream - cuda stream
|
||||
@@ -1243,25 +1202,38 @@ void generate_many_lut_device_accumulator(
|
||||
uint64_t *max_degree, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t message_modulus, uint32_t carry_modulus,
|
||||
std::vector<std::function<Torus(Torus)>> &functions,
|
||||
bool gpu_memory_allocated) {
|
||||
bool gpu_memory_allocated, Torus *preallocated_h_lut) {
|
||||
|
||||
PUSH_RANGE("gen many lut acc")
|
||||
// host lut
|
||||
Torus *h_lut =
|
||||
(Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
|
||||
if (preallocated_h_lut == nullptr) {
|
||||
// host lut
|
||||
Torus *h_lut =
|
||||
(Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
|
||||
|
||||
// fill accumulator
|
||||
*max_degree = generate_many_lookup_table<Torus>(
|
||||
h_lut, degrees, glwe_dimension, polynomial_size, message_modulus,
|
||||
carry_modulus, functions);
|
||||
// fill accumulator
|
||||
*max_degree = generate_many_lookup_table<Torus>(
|
||||
h_lut, degrees, glwe_dimension, polynomial_size, message_modulus,
|
||||
carry_modulus, functions);
|
||||
|
||||
// copy host lut and lut_indexes_vec to device
|
||||
cuda_memcpy_with_size_tracking_async_to_gpu(
|
||||
acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
|
||||
stream, gpu_index, gpu_memory_allocated);
|
||||
// copy host lut and lut_indexes_vec to device
|
||||
cuda_memcpy_with_size_tracking_async_to_gpu(
|
||||
acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
|
||||
stream, gpu_index, gpu_memory_allocated);
|
||||
|
||||
cuda_synchronize_stream(stream, gpu_index);
|
||||
free(h_lut);
|
||||
cuda_synchronize_stream(stream, gpu_index);
|
||||
free(h_lut);
|
||||
} else {
|
||||
// fill accumulator
|
||||
*max_degree = generate_many_lookup_table<Torus>(
|
||||
preallocated_h_lut, degrees, glwe_dimension, polynomial_size,
|
||||
message_modulus, carry_modulus, functions);
|
||||
|
||||
// copy host lut and lut_indexes_vec to device
|
||||
cuda_memcpy_with_size_tracking_async_to_gpu(
|
||||
acc, preallocated_h_lut,
|
||||
(glwe_dimension + 1) * polynomial_size * sizeof(Torus), stream,
|
||||
gpu_index, gpu_memory_allocated);
|
||||
}
|
||||
POP_RANGE()
|
||||
}
|
||||
|
||||
@@ -1732,7 +1704,7 @@ reduce_signs(CudaStreams streams, CudaRadixCiphertextFFI *signs_array_out,
|
||||
signs_array_in, 0, num_sign_blocks);
|
||||
if (num_sign_blocks > 2) {
|
||||
auto lut = diff_buffer->reduce_signs_lut;
|
||||
generate_device_accumulator_with_cpu_prealloc<Torus>(
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
|
||||
lut->get_degree(0), lut->get_max_degree(0), glwe_dimension,
|
||||
polynomial_size, message_modulus, carry_modulus,
|
||||
@@ -1763,7 +1735,7 @@ reduce_signs(CudaStreams streams, CudaRadixCiphertextFFI *signs_array_out,
|
||||
};
|
||||
|
||||
auto lut = diff_buffer->reduce_signs_lut;
|
||||
generate_device_accumulator_with_cpu_prealloc<Torus>(
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
|
||||
lut->get_degree(0), lut->get_max_degree(0), glwe_dimension,
|
||||
polynomial_size, message_modulus, carry_modulus, final_lut_f, true,
|
||||
@@ -1783,7 +1755,7 @@ reduce_signs(CudaStreams streams, CudaRadixCiphertextFFI *signs_array_out,
|
||||
};
|
||||
|
||||
auto lut = mem_ptr->diff_buffer->reduce_signs_lut;
|
||||
generate_device_accumulator_with_cpu_prealloc<Torus>(
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
|
||||
lut->get_degree(0), lut->get_max_degree(0), glwe_dimension,
|
||||
polynomial_size, message_modulus, carry_modulus, final_lut_f, true,
|
||||
|
||||
@@ -366,7 +366,8 @@ __host__ void host_integer_partial_sum_ciphertexts_vec(
|
||||
const dim3 number_of_blocks_2d(num_radix_blocks, part_count, 1);
|
||||
|
||||
mem_ptr->setup_lookup_tables(streams, num_radix_in_vec,
|
||||
current_blocks->degrees);
|
||||
current_blocks->degrees,
|
||||
mem_ptr->preallocated_h_lut);
|
||||
|
||||
while (needs_processing) {
|
||||
auto luts_message_carry = mem_ptr->luts_message_carry;
|
||||
|
||||
@@ -142,7 +142,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check(
|
||||
};
|
||||
|
||||
auto lut = mem_ptr->diff_buffer->tree_buffer->tree_last_leaf_scalar_lut;
|
||||
generate_device_accumulator_with_cpu_prealloc<Torus>(
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
|
||||
lut->get_degree(0), lut->get_max_degree(0), glwe_dimension,
|
||||
polynomial_size, message_modulus, carry_modulus, scalar_last_leaf_lut_f,
|
||||
@@ -235,7 +235,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check(
|
||||
};
|
||||
|
||||
auto lut = diff_buffer->tree_buffer->tree_last_leaf_scalar_lut;
|
||||
generate_device_accumulator_bivariate_with_cpu_prealloc<Torus>(
|
||||
generate_device_accumulator_bivariate<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
|
||||
lut->get_degree(0), lut->get_max_degree(0), glwe_dimension,
|
||||
polynomial_size, message_modulus, carry_modulus,
|
||||
@@ -269,7 +269,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check(
|
||||
int_radix_lut<Torus> *one_block_lut =
|
||||
new int_radix_lut<Torus>(streams, params, 1, 1, true, size);
|
||||
|
||||
generate_device_accumulator_with_cpu_prealloc<Torus>(
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), one_block_lut->get_lut(0, 0),
|
||||
one_block_lut->get_degree(0), one_block_lut->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
@@ -413,7 +413,7 @@ __host__ void integer_radix_signed_scalar_difference_check(
|
||||
};
|
||||
|
||||
auto lut = mem_ptr->diff_buffer->tree_buffer->tree_last_leaf_scalar_lut;
|
||||
generate_device_accumulator_bivariate_with_cpu_prealloc<Torus>(
|
||||
generate_device_accumulator_bivariate<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
|
||||
lut->get_degree(0), lut->get_max_degree(0), glwe_dimension,
|
||||
polynomial_size, message_modulus, carry_modulus,
|
||||
@@ -515,7 +515,7 @@ __host__ void integer_radix_signed_scalar_difference_check(
|
||||
};
|
||||
|
||||
auto signed_msb_lut = mem_ptr->signed_msb_lut;
|
||||
generate_device_accumulator_bivariate_with_cpu_prealloc<Torus>(
|
||||
generate_device_accumulator_bivariate<Torus>(
|
||||
msb_streams.stream(0), streams.gpu_index(0),
|
||||
signed_msb_lut->get_lut(0, 0), signed_msb_lut->get_degree(0),
|
||||
signed_msb_lut->get_max_degree(0), params.glwe_dimension,
|
||||
@@ -561,7 +561,7 @@ __host__ void integer_radix_signed_scalar_difference_check(
|
||||
int_radix_lut<Torus> *one_block_lut =
|
||||
new int_radix_lut<Torus>(streams, params, 1, 1, true, size);
|
||||
|
||||
generate_device_accumulator_with_cpu_prealloc<Torus>(
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), one_block_lut->get_lut(0, 0),
|
||||
one_block_lut->get_degree(0), one_block_lut->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
|
||||
@@ -2503,6 +2503,41 @@ unsafe extern "C" {
|
||||
mem_ptr_void: *mut *mut i8,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn scratch_cuda_erc20_64(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr: *mut *mut i8,
|
||||
glwe_dimension: u32,
|
||||
polynomial_size: u32,
|
||||
big_lwe_dimension: u32,
|
||||
small_lwe_dimension: u32,
|
||||
ks_level: u32,
|
||||
ks_base_log: u32,
|
||||
pbs_level: u32,
|
||||
pbs_base_log: u32,
|
||||
grouping_factor: u32,
|
||||
lwe_ciphertext_count: u32,
|
||||
message_modulus: u32,
|
||||
carry_modulus: u32,
|
||||
pbs_type: PBS_TYPE,
|
||||
allocate_gpu_memory: bool,
|
||||
noise_reduction_type: PBS_MS_REDUCTION_T,
|
||||
) -> u64;
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cuda_erc20_assign_64(
|
||||
streams: CudaStreamsFFI,
|
||||
from_amount: *mut CudaRadixCiphertextFFI,
|
||||
to_amount: *mut CudaRadixCiphertextFFI,
|
||||
amount: *const CudaRadixCiphertextFFI,
|
||||
mem_ptr: *mut i8,
|
||||
bsks: *const *mut ffi::c_void,
|
||||
ksks: *const *mut ffi::c_void,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cleanup_cuda_erc20(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8);
|
||||
}
|
||||
pub const KS_TYPE_BIG_TO_SMALL: KS_TYPE = 0;
|
||||
pub const KS_TYPE_SMALL_TO_BIG: KS_TYPE = 1;
|
||||
pub type KS_TYPE = ffi::c_uint;
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
#include "cuda/include/integer/integer.h"
|
||||
#include "cuda/include/integer/rerand.h"
|
||||
#include "cuda/include/aes/aes.h"
|
||||
#include "cuda/include/erc20/erc20.h"
|
||||
#include "cuda/include/zk/zk.h"
|
||||
#include "cuda/include/keyswitch/keyswitch.h"
|
||||
#include "cuda/include/keyswitch/ks_enums.h"
|
||||
|
||||
@@ -115,6 +115,12 @@ path = "benches/integer/bench.rs"
|
||||
harness = false
|
||||
required-features = ["integer", "pbs-stats", "internal-keycache"]
|
||||
|
||||
[[bench]]
|
||||
name = "integer-erc20"
|
||||
path = "benches/integer/erc20.rs"
|
||||
harness = false
|
||||
required-features = ["integer", "pbs-stats", "internal-keycache"]
|
||||
|
||||
[[bench]]
|
||||
name = "integer-signed"
|
||||
path = "benches/integer/signed_bench.rs"
|
||||
|
||||
@@ -42,6 +42,19 @@ where
|
||||
(new_from_amount, new_to_amount)
|
||||
}
|
||||
|
||||
#[cfg(feature = "gpu")]
|
||||
pub fn transfer_backend<FheType>(
|
||||
from_amount: &FheType,
|
||||
to_amount: &FheType,
|
||||
amount: &FheType,
|
||||
) -> (FheType, FheType)
|
||||
where
|
||||
FheType: FheErc20<Output = FheType>,
|
||||
for<'a> &'a FheType: FheErc20<Output = FheType>,
|
||||
{
|
||||
from_amount.erc20(to_amount, amount)
|
||||
}
|
||||
|
||||
/// Parallel variant of [`transfer_whitepaper`].
|
||||
pub fn par_transfer_whitepaper<FheType>(
|
||||
from_amount: &FheType,
|
||||
@@ -965,6 +978,14 @@ fn main() {
|
||||
"transfer::no_cmux",
|
||||
transfer_no_cmux::<FheUint64>,
|
||||
);
|
||||
cuda_bench_transfer_throughput(
|
||||
&mut group,
|
||||
&cks,
|
||||
bench_name,
|
||||
"FheUint64",
|
||||
"transfer::backend",
|
||||
transfer_backend::<FheUint64>,
|
||||
);
|
||||
cuda_bench_transfer_throughput(
|
||||
&mut group,
|
||||
&cks,
|
||||
|
||||
@@ -2,14 +2,17 @@
|
||||
|
||||
mod aes;
|
||||
mod aes256;
|
||||
mod erc20;
|
||||
mod oprf;
|
||||
|
||||
mod rerand;
|
||||
|
||||
use benchmark::params::ParamsAndNumBlocksIter;
|
||||
use benchmark::utilities::{
|
||||
get_bench_type, throughput_num_threads, write_to_json, BenchmarkType, EnvConfig, OperatorType,
|
||||
gen_random_u256, get_bench_type, throughput_num_threads, write_to_json, BenchmarkType,
|
||||
EnvConfig, OperatorType,
|
||||
};
|
||||
|
||||
use criterion::{criterion_group, Criterion, Throughput};
|
||||
use rand::prelude::*;
|
||||
use rayon::prelude::*;
|
||||
@@ -26,13 +29,6 @@ use tfhe::{get_pbs_count, reset_pbs_count};
|
||||
/// It must be as big as the largest bit size tested
|
||||
type ScalarType = U256;
|
||||
|
||||
fn gen_random_u256(rng: &mut ThreadRng) -> U256 {
|
||||
let clearlow = rng.gen::<u128>();
|
||||
let clearhigh = rng.gen::<u128>();
|
||||
|
||||
tfhe::integer::U256::from((clearlow, clearhigh))
|
||||
}
|
||||
|
||||
/// Base function to bench a server key function that is a binary operation, input ciphertexts will
|
||||
/// contain non zero carries
|
||||
fn bench_server_key_binary_function_dirty_inputs<F>(
|
||||
|
||||
144
tfhe-benchmark/benches/integer/erc20.rs
Normal file
144
tfhe-benchmark/benches/integer/erc20.rs
Normal file
@@ -0,0 +1,144 @@
|
||||
use benchmark::params_aliases::BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
|
||||
use benchmark::utilities::{
|
||||
cuda_local_keys, cuda_local_streams, gen_random_u256, get_bench_type, BenchmarkType,
|
||||
};
|
||||
use criterion::{Criterion, Throughput};
|
||||
use rayon::prelude::*;
|
||||
use rayon::ThreadPoolBuilder;
|
||||
#[cfg(feature = "gpu")]
|
||||
use tfhe::core_crypto::gpu::{get_number_of_gpus, CudaStreams};
|
||||
#[cfg(feature = "gpu")]
|
||||
use tfhe::integer::gpu::ciphertext::CudaUnsignedRadixCiphertext;
|
||||
#[cfg(feature = "gpu")]
|
||||
use tfhe::integer::gpu::CudaServerKey;
|
||||
use tfhe::integer::keycache::KEY_CACHE;
|
||||
use tfhe::integer::IntegerKeyKind;
|
||||
use tfhe::keycache::NamedParam;
|
||||
|
||||
fn main() {
|
||||
let mut criterion: Criterion<_> = (Criterion::default()).configure_from_args();
|
||||
#[cfg(feature = "gpu")]
|
||||
cuda_erc20(&mut criterion);
|
||||
|
||||
Criterion::default().configure_from_args().final_summary();
|
||||
}
|
||||
|
||||
#[cfg(feature = "gpu")]
|
||||
pub fn cuda_erc20(c: &mut Criterion) {
|
||||
let bench_name = "integer::cuda::erc20";
|
||||
|
||||
let mut bench_group = c.benchmark_group(bench_name);
|
||||
bench_group
|
||||
.sample_size(15)
|
||||
.measurement_time(std::time::Duration::from_secs(30));
|
||||
let mut rng = rand::thread_rng();
|
||||
let bench_id;
|
||||
|
||||
let param = BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
|
||||
let param_name = param.name();
|
||||
let num_block = 32;
|
||||
|
||||
match get_bench_type() {
|
||||
BenchmarkType::Latency => {
|
||||
let streams = CudaStreams::new_multi_gpu();
|
||||
bench_id = format!("{bench_name}::{param_name}");
|
||||
|
||||
bench_group.bench_function(&bench_id, |b| {
|
||||
let (cks, _cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
|
||||
let gpu_sks = CudaServerKey::new(&cks, &streams);
|
||||
|
||||
let encrypt_values = || {
|
||||
let ct_0 = cks.encrypt_radix(gen_random_u256(&mut rng), num_block);
|
||||
let ct_1 = cks.encrypt_radix(gen_random_u256(&mut rng), num_block);
|
||||
let ct_2 = cks.encrypt_radix(gen_random_u256(&mut rng), num_block);
|
||||
let d_ctxt_0 =
|
||||
CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct_0, &streams);
|
||||
let d_ctxt_1 =
|
||||
CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct_1, &streams);
|
||||
let d_ctxt_2 =
|
||||
CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct_2, &streams);
|
||||
|
||||
(d_ctxt_0, d_ctxt_1, d_ctxt_2)
|
||||
};
|
||||
|
||||
b.iter_batched(
|
||||
encrypt_values,
|
||||
|(ct_0, ct_1, ct_2)| {
|
||||
gpu_sks.erc20(&ct_0, &ct_1, &ct_2, &streams);
|
||||
},
|
||||
criterion::BatchSize::SmallInput,
|
||||
)
|
||||
});
|
||||
}
|
||||
BenchmarkType::Throughput => {
|
||||
let (cks, _cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
|
||||
let gpu_sks_vec = cuda_local_keys(&cks);
|
||||
let gpu_count = get_number_of_gpus() as usize;
|
||||
|
||||
bench_id = format!("{bench_name}::throughput::{param_name}");
|
||||
let elements = 800;
|
||||
bench_group.throughput(Throughput::Elements(elements));
|
||||
bench_group.bench_function(&bench_id, |b| {
|
||||
let setup_encrypted_values = || {
|
||||
let local_streams = cuda_local_streams(num_block, elements as usize);
|
||||
let cts_0 = (0..elements)
|
||||
.map(|i| {
|
||||
let ct_0 = cks.encrypt_radix(gen_random_u256(&mut rng), num_block);
|
||||
CudaUnsignedRadixCiphertext::from_radix_ciphertext(
|
||||
&ct_0,
|
||||
&local_streams[i as usize],
|
||||
)
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
let cts_1 = (0..elements)
|
||||
.map(|i| {
|
||||
let ct_1 = cks.encrypt_radix(gen_random_u256(&mut rng), num_block);
|
||||
CudaUnsignedRadixCiphertext::from_radix_ciphertext(
|
||||
&ct_1,
|
||||
&local_streams[i as usize],
|
||||
)
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
let cts_2 = (0..elements)
|
||||
.map(|i| {
|
||||
let ct_2 = cks.encrypt_radix(gen_random_u256(&mut rng), num_block);
|
||||
CudaUnsignedRadixCiphertext::from_radix_ciphertext(
|
||||
&ct_2,
|
||||
&local_streams[i as usize],
|
||||
)
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
(cts_0, cts_1, cts_2, local_streams)
|
||||
};
|
||||
|
||||
let pool = ThreadPoolBuilder::new().num_threads(32).build().unwrap();
|
||||
|
||||
b.iter_batched(
|
||||
setup_encrypted_values,
|
||||
|(cts_0, cts_1, cts_2, local_streams)| {
|
||||
pool.install(|| {
|
||||
cts_0
|
||||
.par_iter()
|
||||
.zip(cts_1.par_iter())
|
||||
.zip(cts_2.par_iter())
|
||||
.zip(local_streams.par_iter())
|
||||
.enumerate()
|
||||
.for_each(|(i, (((ct_0, ct_1), ct_2), local_stream))| {
|
||||
gpu_sks_vec[i % gpu_count].erc20(
|
||||
ct_0,
|
||||
ct_1,
|
||||
ct_2,
|
||||
local_stream,
|
||||
);
|
||||
});
|
||||
})
|
||||
},
|
||||
criterion::BatchSize::SmallInput,
|
||||
);
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
bench_group.finish();
|
||||
}
|
||||
@@ -1,3 +1,5 @@
|
||||
use rand::prelude::ThreadRng;
|
||||
use rand::Rng;
|
||||
use serde::Serialize;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::OnceLock;
|
||||
@@ -791,3 +793,11 @@ mod cuda_utils {
|
||||
|
||||
#[cfg(feature = "gpu")]
|
||||
pub use cuda_utils::*;
|
||||
use tfhe::integer::U256;
|
||||
|
||||
pub fn gen_random_u256(rng: &mut ThreadRng) -> U256 {
|
||||
let clearlow = rng.gen::<u128>();
|
||||
let clearhigh = rng.gen::<u128>();
|
||||
|
||||
tfhe::integer::U256::from((clearlow, clearhigh))
|
||||
}
|
||||
|
||||
@@ -17,7 +17,7 @@ use crate::high_level_api::traits::{
|
||||
RotateRightSizeOnGpu, ShlSizeOnGpu, ShrSizeOnGpu, SizeOnGpu, SubSizeOnGpu,
|
||||
};
|
||||
use crate::high_level_api::traits::{
|
||||
DivRem, FheEq, FheMax, FheMin, FheOrd, RotateLeft, RotateLeftAssign, RotateRight,
|
||||
DivRem, FheEq, FheErc20, FheMax, FheMin, FheOrd, RotateLeft, RotateLeftAssign, RotateRight,
|
||||
RotateRightAssign,
|
||||
};
|
||||
#[cfg(feature = "gpu")]
|
||||
@@ -3206,3 +3206,68 @@ where
|
||||
})
|
||||
}
|
||||
}
|
||||
#[cfg(feature = "gpu")]
|
||||
impl<Id> FheErc20<Self> for FheUint<Id>
|
||||
where
|
||||
Id: FheUintId,
|
||||
{
|
||||
type Output = Self;
|
||||
|
||||
fn erc20(self, to: Self, amount: Self) -> (Self::Output, Self::Output) {
|
||||
<Self as FheErc20<&Self>>::erc20(self, &to, &amount)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(feature = "gpu")]
|
||||
impl<Id> FheErc20<&Self> for FheUint<Id>
|
||||
where
|
||||
Id: FheUintId,
|
||||
{
|
||||
type Output = Self;
|
||||
|
||||
fn erc20(self, to: &Self, amount: &Self) -> (Self::Output, Self::Output) {
|
||||
<&Self as FheErc20<&Self>>::erc20(&self, to, amount)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(feature = "gpu")]
impl<Id> FheErc20<Self> for &FheUint<Id>
where
    Id: FheUintId,
{
    type Output = FheUint<Id>;

    /// Runs the ERC20-style transfer on the active backend and returns the
    /// updated `(from, to)` balances as new ciphertexts.
    ///
    /// Only the CUDA backend implements this operation; the CPU and HPU
    /// backends panic.
    fn erc20(self, to: Self, amount: Self) -> (Self::Output, Self::Output) {
        global_state::with_internal_keys(|key| match key {
            InternalServerKey::Cpu(_cpu_key) => {
                panic!("Erc20 is not supported on CPU");
            }
            #[cfg(feature = "gpu")]
            InternalServerKey::Cuda(cuda_key) => {
                let streams = &cuda_key.streams;
                // The GPU integer backend computes both updated balances in
                // a single call.
                let (new_from, new_to) = cuda_key.key.key.erc20(
                    &*self.ciphertext.on_gpu(streams),
                    &*to.ciphertext.on_gpu(streams),
                    &*amount.ciphertext.on_gpu(streams),
                    streams,
                );
                let from_balance = FheUint::<Id>::new(
                    new_from,
                    cuda_key.tag.clone(),
                    ReRandomizationMetadata::default(),
                );
                let to_balance = FheUint::<Id>::new(
                    new_to,
                    cuda_key.tag.clone(),
                    ReRandomizationMetadata::default(),
                );
                (from_balance, to_balance)
            }
            #[cfg(feature = "hpu")]
            InternalServerKey::Hpu(_device) => {
                panic!("Erc20 is not supported on HPU");
            }
        })
    }
}
|
||||
|
||||
@@ -27,7 +27,7 @@ pub use crate::high_level_api::strings::traits::*;
|
||||
#[cfg(feature = "gpu")]
|
||||
pub use crate::high_level_api::traits::{
|
||||
AddSizeOnGpu, BitAndSizeOnGpu, BitNotSizeOnGpu, BitOrSizeOnGpu, BitXorSizeOnGpu,
|
||||
DivRemSizeOnGpu, DivSizeOnGpu, FheEqSizeOnGpu, FheMaxSizeOnGpu, FheMinSizeOnGpu,
|
||||
DivRemSizeOnGpu, DivSizeOnGpu, FheEqSizeOnGpu, FheErc20, FheMaxSizeOnGpu, FheMinSizeOnGpu,
|
||||
FheOrdSizeOnGpu, IfThenElseSizeOnGpu, MulSizeOnGpu, NegSizeOnGpu, RemSizeOnGpu,
|
||||
RotateLeftSizeOnGpu, RotateRightSizeOnGpu, ShlSizeOnGpu, ShrSizeOnGpu, SizeOnGpu, SubSizeOnGpu,
|
||||
};
|
||||
|
||||
@@ -355,3 +355,9 @@ pub trait FheEqSizeOnGpu<Rhs = Self> {
|
||||
fn get_eq_size_on_gpu(&self, amount: Rhs) -> u64;
|
||||
fn get_ne_size_on_gpu(&self, amount: Rhs) -> u64;
|
||||
}
|
||||
#[cfg(feature = "gpu")]
/// Encrypted ERC20-style transfer: moves `amount` from `self`'s balance to
/// `to`'s balance homomorphically. The GPU implementations in this file
/// return the updated `(from_balance, to_balance)` pair.
pub trait FheErc20<Rhs = Self> {
    /// Ciphertext type of the returned balances.
    type Output;

    /// Returns the updated `(from_balance, to_balance)` pair after
    /// transferring `amount` from `self` to `to`.
    fn erc20(self, to: Rhs, amount: Rhs) -> (Self::Output, Self::Output);
}
|
||||
|
||||
@@ -10357,3 +10357,130 @@ pub(crate) unsafe fn cuda_backend_cast_to_signed<T: UnsignedInteger, B: Numeric>
|
||||
|
||||
update_noise_degree(output, &cuda_ffi_output);
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
/// Performs the backend's in-place ERC20 transfer update on
/// `from_amount`/`to_amount` using `amount` (exact semantics are implemented
/// by the `cuda_erc20_assign_64` kernel), then refreshes the Rust-side
/// degree/noise metadata from the values the backend reports.
///
/// # Safety
///
/// - The data must not be moved or dropped while being used by the CUDA kernel.
/// - This function assumes exclusive access to the passed data; violating this may lead to
/// undefined behavior.
pub(crate) unsafe fn cuda_backend_erc20_assign<T: UnsignedInteger, B: Numeric>(
    streams: &CudaStreams,
    from_amount: &mut CudaRadixCiphertext,
    to_amount: &mut CudaRadixCiphertext,
    amount: &CudaRadixCiphertext,
    bootstrapping_key: &CudaVec<B>,
    keyswitch_key: &CudaVec<T>,
    message_modulus: MessageModulus,
    carry_modulus: CarryModulus,
    glwe_dimension: GlweDimension,
    polynomial_size: PolynomialSize,
    big_lwe_dimension: LweDimension,
    small_lwe_dimension: LweDimension,
    ks_level: DecompositionLevelCount,
    ks_base_log: DecompositionBaseLog,
    pbs_level: DecompositionLevelCount,
    pbs_base_log: DecompositionBaseLog,
    num_blocks: u32,
    pbs_type: PBSType,
    grouping_factor: LweBskGroupingFactor,
    ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>,
) {
    // All device buffers must live on the same GPU as the first stream; these
    // checks fail fast with a readable message instead of letting the CUDA
    // backend fault later.
    assert_eq!(
        streams.gpu_indexes[0],
        from_amount.d_blocks.0.d_vec.gpu_index(0),
        "GPU error: first stream is on GPU {}, first from_amount pointer is on GPU {}",
        streams.gpu_indexes[0].get(),
        from_amount.d_blocks.0.d_vec.gpu_index(0).get(),
    );
    assert_eq!(
        streams.gpu_indexes[0],
        to_amount.d_blocks.0.d_vec.gpu_index(0),
        "GPU error: first stream is on GPU {}, first to_amount pointer is on GPU {}",
        streams.gpu_indexes[0].get(),
        to_amount.d_blocks.0.d_vec.gpu_index(0).get(),
    );
    assert_eq!(
        streams.gpu_indexes[0],
        amount.d_blocks.0.d_vec.gpu_index(0),
        "GPU error: first stream is on GPU {}, first amount pointer is on GPU {}",
        streams.gpu_indexes[0].get(),
        amount.d_blocks.0.d_vec.gpu_index(0).get(),
    );
    assert_eq!(
        streams.gpu_indexes[0],
        bootstrapping_key.gpu_index(0),
        "GPU error: first stream is on GPU {}, first bsk pointer is on GPU {}",
        streams.gpu_indexes[0].get(),
        bootstrapping_key.gpu_index(0).get(),
    );
    assert_eq!(
        streams.gpu_indexes[0],
        keyswitch_key.gpu_index(0),
        "GPU error: first stream is on GPU {}, first ksk pointer is on GPU {}",
        streams.gpu_indexes[0].get(),
        keyswitch_key.gpu_index(0).get(),
    );
    let noise_reduction_type = resolve_ms_noise_reduction_config(ms_noise_reduction_configuration);

    let mut mem_ptr: *mut i8 = std::ptr::null_mut();
    // Degree/noise metadata is handed to the FFI layer as mutable buffers so
    // the backend can report post-operation values; the buffers must stay
    // alive until after the kernel calls below.
    let mut from_amount_degrees = from_amount.info.blocks.iter().map(|b| b.degree.0).collect();
    let mut from_amount_noise_levels = from_amount
        .info
        .blocks
        .iter()
        .map(|b| b.noise_level.0)
        .collect();
    let mut cuda_ffi_from_amount = prepare_cuda_radix_ffi(
        from_amount,
        &mut from_amount_degrees,
        &mut from_amount_noise_levels,
    );
    let mut amount_degrees = amount.info.blocks.iter().map(|b| b.degree.0).collect();
    let mut amount_noise_levels = amount.info.blocks.iter().map(|b| b.noise_level.0).collect();
    let cuda_ffi_amount =
        prepare_cuda_radix_ffi(amount, &mut amount_degrees, &mut amount_noise_levels);
    let mut to_amount_degrees = to_amount.info.blocks.iter().map(|b| b.degree.0).collect();
    let mut to_amount_noise_levels = to_amount
        .info
        .blocks
        .iter()
        .map(|b| b.noise_level.0)
        .collect();
    let mut cuda_ffi_to_amount = prepare_cuda_radix_ffi(
        to_amount,
        &mut to_amount_degrees,
        &mut to_amount_noise_levels,
    );
    // Scratch -> compute -> cleanup lifecycle: scratch allocates the
    // temporary GPU workspace (`true` = allocate GPU memory) into `mem_ptr`,
    // cleanup releases it.
    scratch_cuda_erc20_64(
        streams.ffi(),
        std::ptr::addr_of_mut!(mem_ptr),
        glwe_dimension.0 as u32,
        polynomial_size.0 as u32,
        big_lwe_dimension.0 as u32,
        small_lwe_dimension.0 as u32,
        ks_level.0 as u32,
        ks_base_log.0 as u32,
        pbs_level.0 as u32,
        pbs_base_log.0 as u32,
        grouping_factor.0 as u32,
        num_blocks,
        message_modulus.0 as u32,
        carry_modulus.0 as u32,
        pbs_type as u32,
        true,
        noise_reduction_type as u32,
    );
    cuda_erc20_assign_64(
        streams.ffi(),
        &raw mut cuda_ffi_from_amount,
        &raw mut cuda_ffi_to_amount,
        &raw const cuda_ffi_amount,
        mem_ptr,
        bootstrapping_key.ptr.as_ptr(),
        keyswitch_key.ptr.as_ptr(),
    );
    cleanup_cuda_erc20(streams.ffi(), std::ptr::addr_of_mut!(mem_ptr));
    // Copy the backend-reported degrees/noise back into the Rust-side
    // ciphertext metadata (only the two mutated operands).
    update_noise_degree(from_amount, &cuda_ffi_from_amount);
    update_noise_degree(to_amount, &cuda_ffi_to_amount);
}
|
||||
|
||||
tfhe/src/integer/gpu/server_key/radix/erc20.rs | 175 +++ (new file)
@@ -0,0 +1,175 @@
|
||||
use crate::core_crypto::gpu::CudaStreams;
|
||||
use crate::core_crypto::prelude::LweBskGroupingFactor;
|
||||
use crate::integer::gpu::ciphertext::CudaIntegerRadixCiphertext;
|
||||
use crate::integer::gpu::server_key::{CudaBootstrappingKey, CudaServerKey};
|
||||
use crate::integer::gpu::{cuda_backend_erc20_assign, PBSType};
|
||||
|
||||
impl CudaServerKey {
|
||||
pub fn unchecked_erc20_assign<T>(
|
||||
&self,
|
||||
from_amount: &mut T,
|
||||
to_amount: &mut T,
|
||||
amount: &T,
|
||||
streams: &CudaStreams,
|
||||
) where
|
||||
T: CudaIntegerRadixCiphertext,
|
||||
{
|
||||
let num_blocks = amount.as_ref().d_blocks.lwe_ciphertext_count().0 as u32;
|
||||
unsafe {
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
cuda_backend_erc20_assign(
|
||||
streams,
|
||||
from_amount.as_mut(),
|
||||
to_amount.as_mut(),
|
||||
amount.as_ref(),
|
||||
&d_bsk.d_vec,
|
||||
&self.key_switching_key.d_vec,
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
d_bsk.glwe_dimension,
|
||||
d_bsk.polynomial_size,
|
||||
self.key_switching_key
|
||||
.input_key_lwe_size()
|
||||
.to_lwe_dimension(),
|
||||
self.key_switching_key
|
||||
.output_key_lwe_size()
|
||||
.to_lwe_dimension(),
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
d_bsk.decomp_level_count,
|
||||
d_bsk.decomp_base_log,
|
||||
num_blocks,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
cuda_backend_erc20_assign(
|
||||
streams,
|
||||
from_amount.as_mut(),
|
||||
to_amount.as_mut(),
|
||||
amount.as_ref(),
|
||||
&d_multibit_bsk.d_vec,
|
||||
&self.key_switching_key.d_vec,
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
d_multibit_bsk.glwe_dimension,
|
||||
d_multibit_bsk.polynomial_size,
|
||||
self.key_switching_key
|
||||
.input_key_lwe_size()
|
||||
.to_lwe_dimension(),
|
||||
self.key_switching_key
|
||||
.output_key_lwe_size()
|
||||
.to_lwe_dimension(),
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
d_multibit_bsk.decomp_level_count,
|
||||
d_multibit_bsk.decomp_base_log,
|
||||
num_blocks,
|
||||
PBSType::MultiBit,
|
||||
d_multibit_bsk.grouping_factor,
|
||||
None,
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn unchecked_erc20<T>(
|
||||
&self,
|
||||
from_amount: &T,
|
||||
to_amount: &T,
|
||||
amount: &T,
|
||||
streams: &CudaStreams,
|
||||
) -> (T, T)
|
||||
where
|
||||
T: CudaIntegerRadixCiphertext,
|
||||
{
|
||||
let mut from_amount = from_amount.duplicate(streams);
|
||||
let mut to_amount = to_amount.duplicate(streams);
|
||||
|
||||
self.unchecked_erc20_assign(&mut from_amount, &mut to_amount, amount, streams);
|
||||
(from_amount, to_amount)
|
||||
}
|
||||
|
||||
pub fn erc20<T>(
|
||||
&self,
|
||||
from_amount: &T,
|
||||
to_amount: &T,
|
||||
amount: &T,
|
||||
streams: &CudaStreams,
|
||||
) -> (T, T)
|
||||
where
|
||||
T: CudaIntegerRadixCiphertext,
|
||||
{
|
||||
let mut tmp_from_amount;
|
||||
let mut tmp_to_amount;
|
||||
|
||||
let (from_amount, to_amount) = match (
|
||||
from_amount.block_carries_are_empty(),
|
||||
to_amount.block_carries_are_empty(),
|
||||
) {
|
||||
(true, true) => (from_amount, to_amount),
|
||||
(true, false) => {
|
||||
tmp_to_amount = to_amount.duplicate(streams);
|
||||
self.full_propagate_assign(&mut tmp_to_amount, streams);
|
||||
(from_amount, &tmp_to_amount)
|
||||
}
|
||||
(false, true) => {
|
||||
tmp_from_amount = from_amount.duplicate(streams);
|
||||
self.full_propagate_assign(&mut tmp_from_amount, streams);
|
||||
(&tmp_from_amount, to_amount)
|
||||
}
|
||||
(false, false) => {
|
||||
tmp_to_amount = to_amount.duplicate(streams);
|
||||
tmp_from_amount = from_amount.duplicate(streams);
|
||||
self.full_propagate_assign(&mut tmp_from_amount, streams);
|
||||
self.full_propagate_assign(&mut tmp_to_amount, streams);
|
||||
(&tmp_from_amount, &tmp_to_amount)
|
||||
}
|
||||
};
|
||||
|
||||
self.unchecked_erc20(from_amount, to_amount, amount, streams)
|
||||
}
|
||||
|
||||
pub fn erc20_assign<T>(
|
||||
&self,
|
||||
from_amount: &mut T,
|
||||
to_amount: &mut T,
|
||||
amount: &T,
|
||||
streams: &CudaStreams,
|
||||
) where
|
||||
T: CudaIntegerRadixCiphertext,
|
||||
{
|
||||
let mut tmp_from_amount;
|
||||
let mut tmp_to_amount;
|
||||
|
||||
let (from_amount, to_amount) = match (
|
||||
from_amount.block_carries_are_empty(),
|
||||
to_amount.block_carries_are_empty(),
|
||||
) {
|
||||
(true, true) => (from_amount, to_amount),
|
||||
(true, false) => {
|
||||
tmp_to_amount = to_amount.duplicate(streams);
|
||||
self.full_propagate_assign(&mut tmp_to_amount, streams);
|
||||
(from_amount, &mut tmp_to_amount)
|
||||
}
|
||||
(false, true) => {
|
||||
tmp_from_amount = from_amount.duplicate(streams);
|
||||
self.full_propagate_assign(&mut tmp_from_amount, streams);
|
||||
(&mut tmp_from_amount, to_amount)
|
||||
}
|
||||
(false, false) => {
|
||||
tmp_to_amount = to_amount.duplicate(streams);
|
||||
tmp_from_amount = from_amount.duplicate(streams);
|
||||
self.full_propagate_assign(&mut tmp_from_amount, streams);
|
||||
self.full_propagate_assign(&mut tmp_to_amount, streams);
|
||||
(&mut tmp_from_amount, &mut tmp_to_amount)
|
||||
}
|
||||
};
|
||||
|
||||
self.unchecked_erc20_assign(from_amount, to_amount, amount, streams);
|
||||
}
|
||||
}
|
||||
@@ -37,6 +37,7 @@ mod bitwise_op;
|
||||
mod cmux;
|
||||
mod comparison;
|
||||
mod div_mod;
|
||||
mod erc20;
|
||||
mod even_odd;
|
||||
mod ilog2;
|
||||
mod mul;
|
||||
|
||||
Reference in New Issue
Block a user