mirror of https://github.com/zama-ai/tfhe-rs.git
synced 2026-01-09 14:47:56 -05:00

chore(gpu): structure to encapsulate streams

committed by Andrei Stoian
parent 1a2643d1da
commit 1dcc3c8c89

Makefile: 5 changes

@@ -999,6 +999,11 @@ test_high_level_api_gpu: install_rs_build_toolchain install_cargo_nextest
	--test-threads=4 --features=integer,internal-keycache,gpu,zk-pok -p tfhe \
	-E "test(/high_level_api::.*gpu.*/)"

test_list_gpu: install_rs_build_toolchain install_cargo_nextest
	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) nextest list --cargo-profile $(CARGO_PROFILE) \
	--features=integer,internal-keycache,gpu,zk-pok -p tfhe \
	-E "test(/.*gpu.*/)"

test_high_level_api_hpu: install_rs_build_toolchain install_cargo_nextest
ifeq ($(HPU_CONFIG), v80)
	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) nextest run --cargo-profile $(CARGO_PROFILE) \

@@ -4,9 +4,7 @@
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cuda_runtime.h>
#include <vector>

extern "C" {

@@ -141,4 +139,5 @@ bool cuda_check_support_thread_block_clusters();
template <typename Torus>
void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
                          Torus *d_array, Torus value, Torus n);

#endif

@@ -4,6 +4,8 @@
#include <variant>
#include <vector>

#include "integer/integer.h"

extern std::mutex m;
extern bool p2p_enabled;
extern const int THRESHOLD_MULTI_GPU;
@@ -37,10 +39,149 @@ get_variant_element(const std::variant<std::vector<Torus>, Torus> &variant,
  }
}

int get_active_gpu_count(int num_inputs, int gpu_count);
uint32_t get_active_gpu_count(uint32_t num_inputs, uint32_t gpu_count);

int get_num_inputs_on_gpu(int total_num_inputs, int gpu_index, int gpu_count);

int get_gpu_offset(int total_num_inputs, int gpu_index, int gpu_count);

// A set of GPU streams and their associated GPUs.
// Can be constructed from the FFI struct CudaStreamsFFI, which is only used
// to pass the streams/GPUs across the Rust/C interface.
// This class should only be constructed from the FFI struct, through class
// methods, or through the copy constructor. It can also be constructed as an
// empty set.
struct CudaStreams {
private:
  cudaStream_t const *_streams;
  uint32_t const *_gpu_indexes;
  uint32_t _gpu_count;
  bool _owns_streams;

  // Prevent the construction of a CudaStreams from user code
  CudaStreams(cudaStream_t const *streams, uint32_t const *gpu_indexes,
              uint32_t gpu_count)
      : _streams(streams), _gpu_indexes(gpu_indexes), _gpu_count(gpu_count),
        _owns_streams(false) {}

public:
  // Construct an empty set. Invalid use of an empty set should raise an error
  // right away, through asserts or a nullptr dereference.
  CudaStreams()
      : _streams(nullptr), _gpu_indexes(nullptr), _gpu_count((uint32_t)-1),
        _owns_streams(false) {}

  // Returns a subset of this set as an active subset. An active subset is one
  // that is temporarily used to perform some computation.
  CudaStreams active_gpu_subset(int num_radix_blocks) {
    return CudaStreams(_streams, _gpu_indexes,
                       get_active_gpu_count(num_radix_blocks, _gpu_count));
  }

  // Returns a subset containing only the first GPU of this set. It is used
  // to create stream subsets for mono-GPU functions.
  CudaStreams subset_first_gpu() const {
    return CudaStreams(_streams, _gpu_indexes, 1);
  }

  // Synchronize all the streams in the set
  void synchronize() const {
    for (uint32_t i = 0; i < _gpu_count; i++) {
      cuda_synchronize_stream(_streams[i], _gpu_indexes[i]);
    }
  }

  cudaStream_t stream(uint32_t idx) const {
    PANIC_IF_FALSE(idx < _gpu_count, "Invalid GPU index");
    return _streams[idx];
  }
  uint32_t gpu_index(uint32_t idx) const {
    PANIC_IF_FALSE(idx < _gpu_count, "Invalid GPU index");
    return _gpu_indexes[idx];
  }
  uint32_t count() const { return _gpu_count; }

  // Construct from the Rust FFI stream set. Streams are created in Rust
  // using the bindings.
  CudaStreams(CudaStreamsFFI &ffi)
      : _streams((cudaStream_t *)ffi.streams), _gpu_indexes(ffi.gpu_indexes),
        _gpu_count(ffi.gpu_count), _owns_streams(false) {}

  // Create a new set of streams on the same GPUs as those of another stream
  // set. Can be used to parallelize computation by issuing kernels on
  // multiple streams on the same GPU.
  void create_on_same_gpus(const CudaStreams &other) {
    PANIC_IF_FALSE(_streams == nullptr,
                   "create_on_same_gpus called on a non-empty CudaStreams");

    cudaStream_t *new_streams = new cudaStream_t[other._gpu_count];

    // Size the clone from `other`: this instance is still empty at this point
    uint32_t *gpu_indexes_clone = new uint32_t[other._gpu_count];
    for (uint32_t i = 0; i < other._gpu_count; ++i) {
      new_streams[i] = cuda_create_stream(other._gpu_indexes[i]);
      gpu_indexes_clone[i] = other._gpu_indexes[i];
    }

    this->_streams = new_streams;
    this->_gpu_indexes = gpu_indexes_clone;
    this->_gpu_count = other._gpu_count;

    // Flag this instance as owning streams so that we can destroy
    // the streams when they aren't needed anymore
    this->_owns_streams = true;
  }

  // Copy constructor, setting the owning flag to false.
  // Only the initial instance of CudaStreams created with
  // create_on_same_gpus owns streams; all copies of it do not own the
  // streams.
  CudaStreams(const CudaStreams &src)
      : _streams(src._streams), _gpu_indexes(src._gpu_indexes),
        _gpu_count(src._gpu_count), _owns_streams(false) {}

  CudaStreams &operator=(CudaStreams const &other) {
    PANIC_IF_FALSE(this->_streams == nullptr ||
                       this->_streams == other._streams,
                   "Assigning an already initialized CudaStreams");
    this->_streams = other._streams;
    this->_gpu_indexes = other._gpu_indexes;
    this->_gpu_count = other._gpu_count;

    // Only the initial instance of CudaStreams created with
    // create_on_same_gpus owns streams; all copies of it do not own the
    // streams.
    this->_owns_streams = false;
    return *this;
  }

  // Destroy the streams if they were created by create_on_same_gpus.
  // We require the developer to call `release` on all instances of cloned
  // streams.
  void release() {
    // If this instance doesn't own streams, there's nothing to do
    // as the streams were created on the Rust side.
    if (_owns_streams) {
      for (uint32_t i = 0; i < _gpu_count; ++i) {
        cuda_destroy_stream(_streams[i], _gpu_indexes[i]);
      }
      delete[] _streams;
      _streams = nullptr;
      delete[] _gpu_indexes;
      _gpu_indexes = nullptr;
    }
  }

  // The destructor checks that streams created with create_on_same_gpus
  // were released manually with `release`.
  ~CudaStreams() {
    // Ensure streams are destroyed
    PANIC_IF_FALSE(!_owns_streams || _streams == nullptr,
                   "release (this=%p) was not called on a CudaStreams object "
                   "that is a clone of another one, %p",
                   this, this->_streams);
  }
};

#endif
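
Note: CudaStreamsFFI itself is not shown in this diff. From the converting
constructor above, it presumably carries the same three fields, roughly:

// Sketch only -- the actual definition lives elsewhere in the backend.
typedef struct {
  void *const *streams;        // cast to cudaStream_t const * on the C++ side
  uint32_t const *gpu_indexes; // one GPU index per stream
  uint32_t gpu_count;
} CudaStreamsFFI;

A typical call site then wraps the FFI value once and passes the class around
(hypothetical usage):

void example_entry_point(CudaStreamsFFI ffi, uint32_t num_radix_blocks) {
  CudaStreams streams(ffi); // borrows the Rust-created streams, owns nothing
  CudaStreams active = streams.active_gpu_subset(num_radix_blocks);
  active.synchronize(); // waits only on the GPUs actually in use
}

Temporary per-GPU streams for intra-GPU parallelism follow the
create_on_same_gpus/release pair (sketch):

CudaStreams local;                  // empty set
local.create_on_same_gpus(streams); // allocates fresh streams, owns them
/* ... launch work on `local` ... */
local.release();                    // mandatory: the destructor asserts this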

@@ -2,6 +2,7 @@
#define CUDA_INTEGER_COMPRESSION_H

#include "../../pbs/pbs_enums.h"
#include "../integer.h"

typedef struct {
  void *ptr;
@@ -25,77 +26,65 @@ typedef struct {

extern "C" {
uint64_t scratch_cuda_integer_compress_radix_ciphertext_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, uint32_t compression_glwe_dimension,
    uint32_t compression_polynomial_size, uint32_t lwe_dimension,
    uint32_t ks_level, uint32_t ks_base_log, uint32_t num_radix_blocks,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
    uint32_t lwe_per_glwe, bool allocate_gpu_memory);
    CudaStreamsFFI streams, int8_t **mem_ptr,
    uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
    PBS_TYPE pbs_type, uint32_t lwe_per_glwe, bool allocate_gpu_memory);

uint64_t scratch_cuda_integer_decompress_radix_ciphertext_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, uint32_t encryption_glwe_dimension,
    uint32_t encryption_polynomial_size, uint32_t compression_glwe_dimension,
    uint32_t compression_polynomial_size, uint32_t lwe_dimension,
    uint32_t pbs_level, uint32_t pbs_base_log,
    CudaStreamsFFI streams, int8_t **mem_ptr,
    uint32_t encryption_glwe_dimension, uint32_t encryption_polynomial_size,
    uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
    uint32_t lwe_dimension, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t num_blocks_to_decompress, uint32_t message_modulus,
    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type);

void cuda_integer_compress_radix_ciphertext_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    CudaPackedGlweCiphertextListFFI *glwe_array_out,
    CudaStreamsFFI streams, CudaPackedGlweCiphertextListFFI *glwe_array_out,
    CudaLweCiphertextListFFI const *lwe_array_in, void *const *fp_ksk,
    int8_t *mem_ptr);

void cuda_integer_decompress_radix_ciphertext_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    CudaLweCiphertextListFFI *lwe_array_out,
    CudaStreamsFFI streams, CudaLweCiphertextListFFI *lwe_array_out,
    CudaPackedGlweCiphertextListFFI const *glwe_in,
    uint32_t const *indexes_array, void *const *bsks, int8_t *mem_ptr);

void cleanup_cuda_integer_compress_radix_ciphertext_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void cleanup_cuda_integer_compress_radix_ciphertext_64(CudaStreamsFFI streams,
                                                       int8_t **mem_ptr_void);

void cleanup_cuda_integer_decompress_radix_ciphertext_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void cleanup_cuda_integer_decompress_radix_ciphertext_64(CudaStreamsFFI streams,
                                                         int8_t **mem_ptr_void);

uint64_t scratch_cuda_integer_compress_radix_ciphertext_128(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, uint32_t compression_glwe_dimension,
    uint32_t compression_polynomial_size, uint32_t lwe_dimension,
    uint32_t ks_level, uint32_t ks_base_log, uint32_t num_radix_blocks,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
    uint32_t lwe_per_glwe, bool allocate_gpu_memory);
    CudaStreamsFFI streams, int8_t **mem_ptr,
    uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
    PBS_TYPE pbs_type, uint32_t lwe_per_glwe, bool allocate_gpu_memory);

uint64_t scratch_cuda_integer_decompress_radix_ciphertext_128(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, uint32_t compression_glwe_dimension,
    uint32_t compression_polynomial_size, uint32_t lwe_dimension,
    uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
    bool allocate_gpu_memory);
    CudaStreamsFFI streams, int8_t **mem_ptr,
    uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
    uint32_t lwe_dimension, uint32_t num_radix_blocks, uint32_t message_modulus,
    uint32_t carry_modulus, bool allocate_gpu_memory);

void cuda_integer_compress_radix_ciphertext_128(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    CudaPackedGlweCiphertextListFFI *glwe_array_out,
    CudaStreamsFFI streams, CudaPackedGlweCiphertextListFFI *glwe_array_out,
    CudaLweCiphertextListFFI const *lwe_array_in, void *const *fp_ksk,
    int8_t *mem_ptr);

void cuda_integer_decompress_radix_ciphertext_128(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    CudaLweCiphertextListFFI *lwe_array_out,
    CudaStreamsFFI streams, CudaLweCiphertextListFFI *lwe_array_out,
    CudaPackedGlweCiphertextListFFI const *glwe_in,
    uint32_t const *indexes_array, int8_t *mem_ptr);

void cleanup_cuda_integer_compress_radix_ciphertext_128(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void cleanup_cuda_integer_compress_radix_ciphertext_128(CudaStreamsFFI streams,
                                                        int8_t **mem_ptr_void);

void cleanup_cuda_integer_decompress_radix_ciphertext_128(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr_void);
    CudaStreamsFFI streams, int8_t **mem_ptr_void);
}

#endif
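
All of these entry points now take the FFI struct by value and wrap it once
at the boundary. A minimal sketch of what a new body looks like, modeled on
the cleanup_cuda_integer_bitop change later in this commit (the concrete
buffer type here is an assumption):

void cleanup_cuda_integer_compress_radix_ciphertext_64(CudaStreamsFFI streams,
                                                       int8_t **mem_ptr_void) {
  auto *mem_ptr = (int_compression<uint64_t> *)(*mem_ptr_void);
  mem_ptr->release(CudaStreams(streams)); // FFI -> class, once, at the boundary
  delete mem_ptr;
  *mem_ptr_void = nullptr;
}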

@@ -12,8 +12,7 @@ template <typename Torus> struct int_compression {
  bool gpu_memory_allocated;
  uint32_t lwe_per_glwe;

  int_compression(cudaStream_t const *streams, uint32_t const *gpu_indexes,
                  uint32_t gpu_count, int_radix_params compression_params,
  int_compression(CudaStreams streams, int_radix_params compression_params,
                  uint32_t num_radix_blocks, uint32_t lwe_per_glwe,
                  bool allocate_gpu_memory, uint64_t &size_tracker) {
    gpu_memory_allocated = allocate_gpu_memory;
@@ -25,26 +24,29 @@ template <typename Torus> struct int_compression {
    tmp_lwe = static_cast<Torus *>(cuda_malloc_with_size_tracking_async(
        num_radix_blocks * (compression_params.small_lwe_dimension + 1) *
            sizeof(Torus),
        streams[0], gpu_indexes[0], size_tracker, allocate_gpu_memory));
        streams.stream(0), streams.gpu_index(0), size_tracker,
        allocate_gpu_memory));
    tmp_glwe_array_out =
        static_cast<Torus *>(cuda_malloc_with_size_tracking_async(
            lwe_per_glwe * glwe_accumulator_size * sizeof(Torus), streams[0],
            gpu_indexes[0], size_tracker, allocate_gpu_memory));
            lwe_per_glwe * glwe_accumulator_size * sizeof(Torus),
            streams.stream(0), streams.gpu_index(0), size_tracker,
            allocate_gpu_memory));

    size_tracker += scratch_packing_keyswitch_lwe_list_to_glwe<Torus>(
        streams[0], gpu_indexes[0], &fp_ks_buffer,
        streams.stream(0), streams.gpu_index(0), &fp_ks_buffer,
        compression_params.small_lwe_dimension,
        compression_params.glwe_dimension, compression_params.polynomial_size,
        num_radix_blocks, allocate_gpu_memory);
  }
  void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
               uint32_t gpu_count) {
    cuda_drop_with_size_tracking_async(tmp_lwe, streams[0], gpu_indexes[0],
  void release(CudaStreams streams) {
    cuda_drop_with_size_tracking_async(
        tmp_lwe, streams.stream(0), streams.gpu_index(0), gpu_memory_allocated);
    cuda_drop_with_size_tracking_async(tmp_glwe_array_out, streams.stream(0),
                                       streams.gpu_index(0),
                                       gpu_memory_allocated);
    cuda_drop_with_size_tracking_async(tmp_glwe_array_out, streams[0],
                                       gpu_indexes[0], gpu_memory_allocated);
    cleanup_packing_keyswitch_lwe_list_to_glwe(
        streams[0], gpu_indexes[0], &fp_ks_buffer, gpu_memory_allocated);
        streams.stream(0), streams.gpu_index(0), &fp_ks_buffer,
        gpu_memory_allocated);
  }
};

@@ -60,8 +62,7 @@ template <typename Torus> struct int_decompression {
  int_radix_lut<Torus> *decompression_rescale_lut;
  bool gpu_memory_allocated;

  int_decompression(cudaStream_t const *streams, uint32_t const *gpu_indexes,
                    uint32_t gpu_count, int_radix_params encryption_params,
  int_decompression(CudaStreams streams, int_radix_params encryption_params,
                    int_radix_params compression_params,
                    uint32_t num_blocks_to_decompress, bool allocate_gpu_memory,
                    uint64_t &size_tracker) {
@@ -78,19 +79,21 @@ template <typename Torus> struct int_decompression {

    tmp_extracted_glwe = (Torus *)cuda_malloc_with_size_tracking_async(
        num_blocks_to_decompress * glwe_accumulator_size * sizeof(Torus),
        streams[0], gpu_indexes[0], size_tracker, allocate_gpu_memory);
        streams.stream(0), streams.gpu_index(0), size_tracker,
        allocate_gpu_memory);
    tmp_indexes_array = (uint32_t *)cuda_malloc_with_size_tracking_async(
        num_blocks_to_decompress * sizeof(uint32_t), streams[0], gpu_indexes[0],
        size_tracker, allocate_gpu_memory);
        num_blocks_to_decompress * sizeof(uint32_t), streams.stream(0),
        streams.gpu_index(0), size_tracker, allocate_gpu_memory);
    tmp_extracted_lwe = (Torus *)cuda_malloc_with_size_tracking_async(
        num_blocks_to_decompress * lwe_accumulator_size * sizeof(Torus),
        streams[0], gpu_indexes[0], size_tracker, allocate_gpu_memory);
        streams.stream(0), streams.gpu_index(0), size_tracker,
        allocate_gpu_memory);

    // rescale is only needed on 64-bit decompression
    if constexpr (std::is_same_v<Torus, uint64_t>) {
      decompression_rescale_lut = new int_radix_lut<Torus>(
          streams, gpu_indexes, gpu_count, encryption_params, 1,
          num_blocks_to_decompress, allocate_gpu_memory, size_tracker);
          streams, encryption_params, 1, num_blocks_to_decompress,
          allocate_gpu_memory, size_tracker);

      // Rescale is done using an identity LUT
      // Here we do not divide by message_modulus
@@ -98,8 +101,8 @@ template <typename Torus> struct int_decompression {
      // space, we want to keep the original 2-bit value in the 4-bit space,
      // so we apply the identity and the encoding will rescale it for us.
      decompression_rescale_lut = new int_radix_lut<Torus>(
          streams, gpu_indexes, gpu_count, encryption_params, 1,
          num_blocks_to_decompress, allocate_gpu_memory, size_tracker);
          streams, encryption_params, 1, num_blocks_to_decompress,
          allocate_gpu_memory, size_tracker);
      auto decompression_rescale_f = [](Torus x) -> Torus { return x; };

      auto effective_compression_message_modulus =
@@ -107,7 +110,8 @@ template <typename Torus> struct int_decompression {
      auto effective_compression_carry_modulus = 1;

      generate_device_accumulator_with_encoding<Torus>(
          streams[0], gpu_indexes[0], decompression_rescale_lut->get_lut(0, 0),
          streams.stream(0), streams.gpu_index(0),
          decompression_rescale_lut->get_lut(0, 0),
          decompression_rescale_lut->get_degree(0),
          decompression_rescale_lut->get_max_degree(0),
          encryption_params.glwe_dimension, encryption_params.polynomial_size,
@@ -115,22 +119,22 @@ template <typename Torus> struct int_decompression {
          effective_compression_carry_modulus,
          encryption_params.message_modulus, encryption_params.carry_modulus,
          decompression_rescale_f, gpu_memory_allocated);
      auto active_gpu_count =
          get_active_gpu_count(num_blocks_to_decompress, gpu_count);
      decompression_rescale_lut->broadcast_lut(streams, gpu_indexes,
                                               active_gpu_count);
      auto active_streams = streams.active_gpu_subset(num_blocks_to_decompress);
      decompression_rescale_lut->broadcast_lut(active_streams);
    }
  }
  void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
               uint32_t gpu_count) {
    cuda_drop_with_size_tracking_async(tmp_extracted_glwe, streams[0],
                                       gpu_indexes[0], gpu_memory_allocated);
    cuda_drop_with_size_tracking_async(tmp_extracted_lwe, streams[0],
                                       gpu_indexes[0], gpu_memory_allocated);
    cuda_drop_with_size_tracking_async(tmp_indexes_array, streams[0],
                                       gpu_indexes[0], gpu_memory_allocated);
  void release(CudaStreams streams) {
    cuda_drop_with_size_tracking_async(tmp_extracted_glwe, streams.stream(0),
                                       streams.gpu_index(0),
                                       gpu_memory_allocated);
    cuda_drop_with_size_tracking_async(tmp_extracted_lwe, streams.stream(0),
                                       streams.gpu_index(0),
                                       gpu_memory_allocated);
    cuda_drop_with_size_tracking_async(tmp_indexes_array, streams.stream(0),
                                       streams.gpu_index(0),
                                       gpu_memory_allocated);
    if constexpr (std::is_same_v<Torus, uint64_t>) {
      decompression_rescale_lut->release(streams, gpu_indexes, gpu_count);
      decompression_rescale_lut->release(streams);
      delete decompression_rescale_lut;
      decompression_rescale_lut = nullptr;
    }

File diff suppressed because it is too large
File diff suppressed because it is too large

@@ -7,28 +7,25 @@

extern "C" {
uint64_t scratch_cuda_expand_without_verification_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension,
    uint32_t computing_ks_level, uint32_t computing_ks_base_log,
    uint32_t casting_input_dimension, uint32_t casting_output_dimension,
    uint32_t casting_ks_level, uint32_t casting_ks_base_log, uint32_t pbs_level,
    uint32_t pbs_base_log, uint32_t grouping_factor,
    const uint32_t *num_lwes_per_compact_list, const bool *is_boolean_array,
    uint32_t num_compact_lists, uint32_t message_modulus,
    uint32_t carry_modulus, PBS_TYPE pbs_type, KS_TYPE casting_key_type,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t computing_ks_level,
    uint32_t computing_ks_base_log, uint32_t casting_input_dimension,
    uint32_t casting_output_dimension, uint32_t casting_ks_level,
    uint32_t casting_ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, const uint32_t *num_lwes_per_compact_list,
    const bool *is_boolean_array, uint32_t num_compact_lists,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
    KS_TYPE casting_key_type, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type);

void cuda_expand_without_verification_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    void *lwe_array_out, const void *lwe_flattened_compact_array_in,
    int8_t *mem_ptr, void *const *bsks, void *const *computing_ksks,
    void *const *casting_keys,
    CudaStreamsFFI streams, void *lwe_array_out,
    const void *lwe_flattened_compact_array_in, int8_t *mem_ptr,
    void *const *bsks, void *const *computing_ksks, void *const *casting_keys,
    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);

void cleanup_expand_without_verification_64(void *const *streams,
                                            uint32_t const *gpu_indexes,
                                            uint32_t gpu_count,
void cleanup_expand_without_verification_64(CudaStreamsFFI streams,
                                            int8_t **mem_ptr_void);
}
#endif // ZK_H

@@ -113,8 +113,7 @@ template <typename Torus> struct zk_expand_mem {
  expand_job<Torus> *d_expand_jobs;
  expand_job<Torus> *h_expand_jobs;

  zk_expand_mem(cudaStream_t const *streams, uint32_t const *gpu_indexes,
                uint32_t gpu_count, int_radix_params computing_params,
  zk_expand_mem(CudaStreams streams, int_radix_params computing_params,
                int_radix_params casting_params, KS_TYPE casting_key_type,
                const uint32_t *num_lwes_per_compact_list,
                const bool *is_boolean_array, uint32_t num_compact_lists,
@@ -172,11 +171,10 @@ template <typename Torus> struct zk_expand_mem {
      params = computing_params;
    }
    message_and_carry_extract_luts = new int_radix_lut<Torus>(
        streams, gpu_indexes, gpu_count, params, 4, 2 * num_lwes,
        allocate_gpu_memory, size_tracker);
        streams, params, 4, 2 * num_lwes, allocate_gpu_memory, size_tracker);

    generate_device_accumulator<Torus>(
        streams[0], gpu_indexes[0],
        streams.stream(0), streams.gpu_index(0),
        message_and_carry_extract_luts->get_lut(0, 0),
        message_and_carry_extract_luts->get_degree(0),
        message_and_carry_extract_luts->get_max_degree(0),
@@ -184,7 +182,7 @@ template <typename Torus> struct zk_expand_mem {
        params.carry_modulus, message_extract_lut_f, gpu_memory_allocated);

    generate_device_accumulator<Torus>(
        streams[0], gpu_indexes[0],
        streams.stream(0), streams.gpu_index(0),
        message_and_carry_extract_luts->get_lut(0, 1),
        message_and_carry_extract_luts->get_degree(1),
        message_and_carry_extract_luts->get_max_degree(1),
@@ -192,7 +190,7 @@ template <typename Torus> struct zk_expand_mem {
        params.carry_modulus, carry_extract_lut_f, gpu_memory_allocated);

    generate_device_accumulator<Torus>(
        streams[0], gpu_indexes[0],
        streams.stream(0), streams.gpu_index(0),
        message_and_carry_extract_luts->get_lut(0, 2),
        message_and_carry_extract_luts->get_degree(2),
        message_and_carry_extract_luts->get_max_degree(2),
@@ -201,7 +199,7 @@ template <typename Torus> struct zk_expand_mem {
        gpu_memory_allocated);

    generate_device_accumulator<Torus>(
        streams[0], gpu_indexes[0],
        streams.stream(0), streams.gpu_index(0),
        message_and_carry_extract_luts->get_lut(0, 3),
        message_and_carry_extract_luts->get_degree(3),
        message_and_carry_extract_luts->get_max_degree(3),
@@ -226,8 +224,8 @@ template <typename Torus> struct zk_expand_mem {

    d_expand_jobs =
        static_cast<expand_job<Torus> *>(cuda_malloc_with_size_tracking_async(
            num_lwes * sizeof(expand_job<Torus>), streams[0], gpu_indexes[0],
            size_tracker, allocate_gpu_memory));
            num_lwes * sizeof(expand_job<Torus>), streams.stream(0),
            streams.gpu_index(0), size_tracker, allocate_gpu_memory));

    h_expand_jobs = static_cast<expand_job<Torus> *>(
        malloc(num_lwes * sizeof(expand_job<Torus>)));
@@ -284,50 +282,51 @@ template <typename Torus> struct zk_expand_mem {
    }

    message_and_carry_extract_luts->set_lwe_indexes(
        streams[0], gpu_indexes[0], h_indexes_in, h_indexes_out);
        streams.stream(0), streams.gpu_index(0), h_indexes_in, h_indexes_out);
    auto lut_indexes = message_and_carry_extract_luts->get_lut_indexes(0, 0);

    cuda_memcpy_with_size_tracking_async_to_gpu(
        lut_indexes, h_lut_indexes, num_packed_msgs * num_lwes * sizeof(Torus),
        streams[0], gpu_indexes[0], allocate_gpu_memory);
        streams.stream(0), streams.gpu_index(0), allocate_gpu_memory);

    auto active_gpu_count = get_active_gpu_count(2 * num_lwes, gpu_count);
    message_and_carry_extract_luts->broadcast_lut(streams, gpu_indexes,
                                                  active_gpu_count);
    auto active_streams = streams.active_gpu_subset(2 * num_lwes);
    message_and_carry_extract_luts->broadcast_lut(active_streams);

    message_and_carry_extract_luts->allocate_lwe_vector_for_non_trivial_indexes(
        streams, gpu_indexes, active_gpu_count, 2 * num_lwes, size_tracker,
        allocate_gpu_memory);
        active_streams, 2 * num_lwes, size_tracker, allocate_gpu_memory);
    // The expanded LWEs will always be on the casting key format
    tmp_expanded_lwes = (Torus *)cuda_malloc_with_size_tracking_async(
        num_lwes * (casting_params.big_lwe_dimension + 1) * sizeof(Torus),
        streams[0], gpu_indexes[0], size_tracker, allocate_gpu_memory);
        streams.stream(0), streams.gpu_index(0), size_tracker,
        allocate_gpu_memory);

    tmp_ksed_small_to_big_expanded_lwes =
        (Torus *)cuda_malloc_with_size_tracking_async(
            num_lwes * (casting_params.big_lwe_dimension + 1) * sizeof(Torus),
            streams[0], gpu_indexes[0], size_tracker, allocate_gpu_memory);
            streams.stream(0), streams.gpu_index(0), size_tracker,
            allocate_gpu_memory);

    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
    cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
    free(h_indexes_in);
    free(h_indexes_out);
    free(h_lut_indexes);
  }

  void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
               uint32_t gpu_count) {
  void release(CudaStreams streams) {

    message_and_carry_extract_luts->release(streams, gpu_indexes, gpu_count);
    message_and_carry_extract_luts->release(streams);
    delete message_and_carry_extract_luts;

    cuda_drop_with_size_tracking_async(tmp_expanded_lwes, streams[0],
                                       gpu_indexes[0], gpu_memory_allocated);
    cuda_drop_with_size_tracking_async(tmp_ksed_small_to_big_expanded_lwes,
                                       streams[0], gpu_indexes[0],
    cuda_drop_with_size_tracking_async(tmp_expanded_lwes, streams.stream(0),
                                       streams.gpu_index(0),
                                       gpu_memory_allocated);
    cuda_drop_with_size_tracking_async(d_expand_jobs, streams[0],
                                       gpu_indexes[0], gpu_memory_allocated);
    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
    cuda_drop_with_size_tracking_async(tmp_ksed_small_to_big_expanded_lwes,
                                       streams.stream(0), streams.gpu_index(0),
                                       gpu_memory_allocated);
    cuda_drop_with_size_tracking_async(d_expand_jobs, streams.stream(0),
                                       streams.gpu_index(0),
                                       gpu_memory_allocated);
    cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
    free(num_lwes_per_compact_list);
    free(h_expand_jobs);
  }

@@ -49,17 +49,16 @@ __global__ void device_batch_fft_ggsw_vector(double2 *dest, T *src,
 * global memory
 */
template <typename T, typename ST, class params>
void batch_fft_ggsw_vector(cudaStream_t *streams, uint32_t *gpu_indexes,
                           uint32_t gpu_count, double2 *dest, T *src,
void batch_fft_ggsw_vector(CudaStreams streams, double2 *dest, T *src,
                           int8_t *d_mem, uint32_t r, uint32_t glwe_dim,
                           uint32_t polynomial_size, uint32_t level_count,
                           uint32_t max_shared_memory) {
  PANIC_IF_FALSE(gpu_count == 1,
  PANIC_IF_FALSE(streams.count() == 1,
                 "GPU error (batch_fft_ggsw_vector): multi-GPU execution on %d "
                 "gpus is not supported yet.",
                 gpu_count);
                 streams.count());

  cuda_set_device(gpu_indexes[0]);
  cuda_set_device(streams.gpu_index(0));

  int shared_memory_size = sizeof(double) * polynomial_size;

@@ -68,11 +67,11 @@ void batch_fft_ggsw_vector(cudaStream_t *streams, uint32_t *gpu_indexes,

  if (max_shared_memory < shared_memory_size) {
    device_batch_fft_ggsw_vector<T, ST, params, NOSM>
        <<<gridSize, blockSize, 0, streams[0]>>>(dest, src, d_mem);
        <<<gridSize, blockSize, 0, streams.stream(0)>>>(dest, src, d_mem);
  } else {
    device_batch_fft_ggsw_vector<T, ST, params, FULLSM>
        <<<gridSize, blockSize, shared_memory_size, streams[0]>>>(dest, src,
                                                                  d_mem);
        <<<gridSize, blockSize, shared_memory_size, streams.stream(0)>>>(
            dest, src, d_mem);
  }
  check_cuda_error(cudaGetLastError());
}

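Since batch_fft_ggsw_vector still asserts a single-GPU set, a caller holding a
multi-GPU CudaStreams would hand it a mono-GPU view, e.g. (hypothetical call):

batch_fft_ggsw_vector<T, ST, params>(streams.subset_first_gpu(), dest, src,
                                     d_mem, r, glwe_dim, polynomial_size,
                                     level_count, max_shared_memory);
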
@@ -142,8 +142,7 @@ __host__ void host_keyswitch_lwe_ciphertext_vector(
}

template <typename Torus>
void execute_keyswitch_async(cudaStream_t const *streams,
                             uint32_t const *gpu_indexes, uint32_t gpu_count,
void execute_keyswitch_async(CudaStreams streams,
                             const LweArrayVariant<Torus> &lwe_array_out,
                             const LweArrayVariant<Torus> &lwe_output_indexes,
                             const LweArrayVariant<Torus> &lwe_array_in,
@@ -154,8 +153,9 @@ void execute_keyswitch_async(cudaStream_t const *streams,

  /// If the number of radix blocks is lower than the number of GPUs, not all
  /// GPUs will be active and there will be 1 input per GPU
  for (uint i = 0; i < gpu_count; i++) {
    int num_samples_on_gpu = get_num_inputs_on_gpu(num_samples, i, gpu_count);
  for (uint i = 0; i < streams.count(); i++) {
    int num_samples_on_gpu =
        get_num_inputs_on_gpu(num_samples, i, streams.count());

    Torus *current_lwe_array_out = get_variant_element(lwe_array_out, i);
    Torus *current_lwe_output_indexes =
@@ -166,7 +166,7 @@ void execute_keyswitch_async(cudaStream_t const *streams,

    // Compute Keyswitch
    host_keyswitch_lwe_ciphertext_vector<Torus>(
        streams[i], gpu_indexes[i], current_lwe_array_out,
        streams.stream(i), streams.gpu_index(i), current_lwe_array_out,
        current_lwe_output_indexes, current_lwe_array_in,
        current_lwe_input_indexes, ksks[i], lwe_dimension_in, lwe_dimension_out,
        base_log, level_count, num_samples_on_gpu);

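The partition helpers declared in the header are not part of this diff. A
plausible even-split implementation that is consistent with how they are used
above (an assumption, not necessarily the backend's actual code) would be:

uint32_t get_active_gpu_count(uint32_t num_inputs, uint32_t gpu_count) {
  // Never activate more GPUs than there are inputs to process
  return num_inputs < gpu_count ? num_inputs : gpu_count;
}

int get_num_inputs_on_gpu(int total_num_inputs, int gpu_index, int gpu_count) {
  // Even split; the first (total % count) GPUs take one extra input
  int base = total_num_inputs / gpu_count;
  return base + (gpu_index < total_num_inputs % gpu_count ? 1 : 0);
}
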
@@ -128,13 +128,6 @@ void cuda_synchronize_stream(cudaStream_t stream, uint32_t gpu_index) {
  check_cuda_error(cudaStreamSynchronize(stream));
}

void synchronize_streams(cudaStream_t const *streams,
                         uint32_t const *gpu_indexes, uint32_t gpu_count) {
  for (uint i = 0; i < gpu_count; i++) {
    cuda_synchronize_stream(streams[i], gpu_indexes[i]);
  }
}

// Determine if a CUDA device is available at runtime
uint32_t cuda_is_available() { return cudaSetDevice(0) == cudaSuccess; }

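The free helper disappears because the same loop now lives behind the class:
call sites migrate from synchronize_streams(streams, gpu_indexes, gpu_count)
to the method on the stream set:

streams.synchronize(); // same per-GPU cuda_synchronize_stream loop as before
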
@@ -1,13 +1,12 @@
#include "integer/abs.cuh"

uint64_t scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, bool is_signed, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
    PBS_TYPE pbs_type, bool allocate_gpu_memory,
    CudaStreamsFFI streams, int8_t **mem_ptr, bool is_signed,
    uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
@@ -16,31 +15,27 @@ uint64_t scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64(
                          message_modulus, carry_modulus, noise_reduction_type);

  return scratch_cuda_integer_abs_kb<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      (int_abs_buffer<uint64_t> **)mem_ptr, is_signed, num_blocks, params,
      allocate_gpu_memory);
      CudaStreams(streams), (int_abs_buffer<uint64_t> **)mem_ptr, is_signed,
      num_blocks, params, allocate_gpu_memory);
}

void cuda_integer_abs_inplace_radix_ciphertext_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    CudaRadixCiphertextFFI *ct, int8_t *mem_ptr, bool is_signed,
    void *const *bsks, void *const *ksks,
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *ct, int8_t *mem_ptr,
    bool is_signed, void *const *bsks, void *const *ksks,
    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {

  auto mem = (int_abs_buffer<uint64_t> *)mem_ptr;

  host_integer_abs_kb<uint64_t>((cudaStream_t *)(streams), gpu_indexes,
                                gpu_count, ct, bsks, (uint64_t **)(ksks),
                                ms_noise_reduction_key, mem, is_signed);
  host_integer_abs_kb<uint64_t>(CudaStreams(streams), ct, bsks,
                                (uint64_t **)(ksks), ms_noise_reduction_key,
                                mem, is_signed);
}

void cleanup_cuda_integer_abs_inplace(void *const *streams,
                                      uint32_t const *gpu_indexes,
                                      uint32_t gpu_count,
void cleanup_cuda_integer_abs_inplace(CudaStreamsFFI streams,
                                      int8_t **mem_ptr_void) {
  int_abs_buffer<uint64_t> *mem_ptr =
      (int_abs_buffer<uint64_t> *)(*mem_ptr_void);
  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
  mem_ptr->release(CudaStreams(streams));
  delete mem_ptr;
  *mem_ptr_void = nullptr;
}

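Taken together, the three abs entry points show the scratch/compute/cleanup
lifecycle through the new FFI type. A condensed sketch of a caller (every
parameter value below is a placeholder, not a recommended setting):

CudaStreamsFFI ffi; // filled in by the Rust bindings
int8_t *mem = nullptr;
uint64_t size = scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64(
    ffi, &mem, /*is_signed=*/true, glwe_dimension, polynomial_size,
    big_lwe_dimension, small_lwe_dimension, ks_level, ks_base_log, pbs_level,
    pbs_base_log, grouping_factor, num_blocks, message_modulus, carry_modulus,
    pbs_type, /*allocate_gpu_memory=*/true, noise_reduction_type);
cuda_integer_abs_inplace_radix_ciphertext_kb_64(
    ffi, ct, mem, /*is_signed=*/true, bsks, ksks, ms_noise_reduction_key);
cleanup_cuda_integer_abs_inplace(ffi, &mem);
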
@@ -18,14 +18,12 @@

template <typename Torus>
__host__ uint64_t scratch_cuda_integer_abs_kb(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, int_abs_buffer<Torus> **mem_ptr, bool is_signed,
    CudaStreams streams, int_abs_buffer<Torus> **mem_ptr, bool is_signed,
    uint32_t num_blocks, int_radix_params params, bool allocate_gpu_memory) {

  uint64_t size_tracker = 0;
  if (is_signed) {
    *mem_ptr = new int_abs_buffer<Torus>(streams, gpu_indexes, gpu_count,
                                         params, num_blocks,
    *mem_ptr = new int_abs_buffer<Torus>(streams, params, num_blocks,
                                         allocate_gpu_memory, size_tracker);
  }
  return size_tracker;
@@ -33,8 +31,7 @@ __host__ uint64_t scratch_cuda_integer_abs_kb(

template <typename Torus>
__host__ void host_integer_abs_kb(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, CudaRadixCiphertextFFI *ct, void *const *bsks,
    CudaStreams streams, CudaRadixCiphertextFFI *ct, void *const *bsks,
    uint64_t *const *ksks,
    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
    int_abs_buffer<uint64_t> *mem_ptr, bool is_signed) {
@@ -47,24 +44,24 @@ __host__ void host_integer_abs_kb(
      (31 - __builtin_clz(mem_ptr->params.message_modulus)) *
      ct->num_radix_blocks;

  copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], mask, ct);
  copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
                                     mask, ct);

  host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
      streams, gpu_indexes, gpu_count, mask, num_bits_in_ciphertext - 1,
      streams, mask, num_bits_in_ciphertext - 1,
      mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key);
  host_addition<Torus>(streams[0], gpu_indexes[0], ct, mask, ct,
  host_addition<Torus>(streams.stream(0), streams.gpu_index(0), ct, mask, ct,
                       ct->num_radix_blocks, mem_ptr->params.message_modulus,
                       mem_ptr->params.carry_modulus);

  uint32_t requested_flag = outputFlag::FLAG_NONE;
  uint32_t uses_carry = 0;
  host_propagate_single_carry<Torus>(
      streams, gpu_indexes, gpu_count, ct, nullptr, nullptr, mem_ptr->scp_mem,
      bsks, ksks, ms_noise_reduction_key, requested_flag, uses_carry);
      streams, ct, nullptr, nullptr, mem_ptr->scp_mem, bsks, ksks,
      ms_noise_reduction_key, requested_flag, uses_carry);

  host_integer_radix_bitop_kb<Torus>(streams, gpu_indexes, gpu_count, ct, mask,
                                     ct, mem_ptr->bitxor_mem, bsks, ksks,
                                     ms_noise_reduction_key);
  host_integer_radix_bitop_kb<Torus>(streams, ct, mask, ct, mem_ptr->bitxor_mem,
                                     bsks, ksks, ms_noise_reduction_key);
}

#endif // TFHE_RS_ABS_CUH

@@ -1,14 +1,13 @@
#include "integer/bitwise_ops.cuh"

uint64_t scratch_cuda_integer_radix_bitop_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t lwe_ciphertext_count,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
    BITOP_TYPE op_type, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {
    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
    uint32_t carry_modulus, PBS_TYPE pbs_type, BITOP_TYPE op_type,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          big_lwe_dimension, small_lwe_dimension, ks_level,
@@ -16,32 +15,28 @@ uint64_t scratch_cuda_integer_radix_bitop_kb_64(
                          message_modulus, carry_modulus, noise_reduction_type);

  return scratch_cuda_integer_radix_bitop_kb<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      (int_bitop_buffer<uint64_t> **)mem_ptr, lwe_ciphertext_count, params,
      op_type, allocate_gpu_memory);
      CudaStreams(streams), (int_bitop_buffer<uint64_t> **)mem_ptr,
      lwe_ciphertext_count, params, op_type, allocate_gpu_memory);
}

void cuda_bitop_integer_radix_ciphertext_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    CudaRadixCiphertextFFI *lwe_array_out,
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
    CudaRadixCiphertextFFI const *lwe_array_1,
    CudaRadixCiphertextFFI const *lwe_array_2, int8_t *mem_ptr,
    void *const *bsks, void *const *ksks,
    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {

  host_integer_radix_bitop_kb<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out,
      lwe_array_1, lwe_array_2, (int_bitop_buffer<uint64_t> *)mem_ptr, bsks,
      (uint64_t **)(ksks), ms_noise_reduction_key);
      CudaStreams(streams), lwe_array_out, lwe_array_1, lwe_array_2,
      (int_bitop_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
      ms_noise_reduction_key);
}

void cleanup_cuda_integer_bitop(void *const *streams,
                                uint32_t const *gpu_indexes, uint32_t gpu_count,
                                int8_t **mem_ptr_void) {
void cleanup_cuda_integer_bitop(CudaStreamsFFI streams, int8_t **mem_ptr_void) {

  int_bitop_buffer<uint64_t> *mem_ptr =
      (int_bitop_buffer<uint64_t> *)(*mem_ptr_void);
  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
  mem_ptr->release(CudaStreams(streams));
  delete mem_ptr;
  *mem_ptr_void = nullptr;
}

@@ -13,8 +13,7 @@

template <typename Torus>
__host__ void host_integer_radix_bitop_kb(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
    CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
    CudaRadixCiphertextFFI const *lwe_array_1,
    CudaRadixCiphertextFFI const *lwe_array_2, int_bitop_buffer<Torus> *mem_ptr,
    void *const *bsks, Torus *const *ksks,
@@ -46,8 +45,8 @@ __host__ void host_integer_radix_bitop_kb(
  }

  integer_radix_apply_bivariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_1, lwe_array_2,
      bsks, ksks, ms_noise_reduction_key, lut, lwe_array_out->num_radix_blocks,
      streams, lwe_array_out, lwe_array_1, lwe_array_2, bsks, ksks,
      ms_noise_reduction_key, lut, lwe_array_out->num_radix_blocks,
      lut->params.message_modulus);

  memcpy(lwe_array_out->degrees, degrees,
@@ -56,14 +55,12 @@ __host__ void host_integer_radix_bitop_kb(

template <typename Torus>
__host__ uint64_t scratch_cuda_integer_radix_bitop_kb(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, int_bitop_buffer<Torus> **mem_ptr,
    CudaStreams streams, int_bitop_buffer<Torus> **mem_ptr,
    uint32_t num_radix_blocks, int_radix_params params, BITOP_TYPE op,
    bool allocate_gpu_memory) {

  uint64_t size_tracker = 0;
  *mem_ptr = new int_bitop_buffer<Torus>(streams, gpu_indexes, gpu_count, op,
                                         params, num_radix_blocks,
  *mem_ptr = new int_bitop_buffer<Torus>(streams, op, params, num_radix_blocks,
                                         allocate_gpu_memory, size_tracker);
  return size_tracker;
}

@@ -2,28 +2,26 @@

void extend_radix_with_trivial_zero_blocks_msb_64(
    CudaRadixCiphertextFFI *output, CudaRadixCiphertextFFI const *input,
    void *const *streams, uint32_t const *gpu_indexes) {
    CudaStreamsFFI streams) {
  host_extend_radix_with_trivial_zero_blocks_msb<uint64_t>(
      output, input, (cudaStream_t *)streams, gpu_indexes);
      output, input, CudaStreams(streams));
}

void trim_radix_blocks_lsb_64(CudaRadixCiphertextFFI *output,
                              CudaRadixCiphertextFFI const *input,
                              void *const *streams,
                              uint32_t const *gpu_indexes) {
                              CudaStreamsFFI streams) {

  host_trim_radix_blocks_lsb<uint64_t>(output, input, (cudaStream_t *)streams,
                                       gpu_indexes);
  host_trim_radix_blocks_lsb<uint64_t>(output, input, CudaStreams(streams));
}

uint64_t scratch_cuda_extend_radix_with_sign_msb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_blocks, uint32_t num_additional_blocks,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t num_blocks,
    uint32_t num_additional_blocks, uint32_t message_modulus,
    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          glwe_dimension * polynomial_size, lwe_dimension,
@@ -32,34 +30,31 @@ uint64_t scratch_cuda_extend_radix_with_sign_msb_64(
                          noise_reduction_type);

  return scratch_extend_radix_with_sign_msb<uint64_t>(
      (cudaStream_t *)streams, gpu_indexes, gpu_count,
      CudaStreams(streams),
      (int_extend_radix_with_sign_msb_buffer<uint64_t> **)mem_ptr, params,
      num_blocks, num_additional_blocks, allocate_gpu_memory);
}

void cuda_extend_radix_with_sign_msb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    CudaRadixCiphertextFFI *output, CudaRadixCiphertextFFI const *input,
    int8_t *mem_ptr, uint32_t num_additional_blocks, void *const *bsks,
    void *const *ksks,
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *output,
    CudaRadixCiphertextFFI const *input, int8_t *mem_ptr,
    uint32_t num_additional_blocks, void *const *bsks, void *const *ksks,
    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
  PUSH_RANGE("cast")
  host_extend_radix_with_sign_msb<uint64_t>(
      (cudaStream_t *)streams, gpu_indexes, gpu_count, output, input,
      CudaStreams(streams), output, input,
      (int_extend_radix_with_sign_msb_buffer<uint64_t> *)mem_ptr,
      num_additional_blocks, bsks, (uint64_t **)ksks, ms_noise_reduction_key);
  POP_RANGE()
}

void cleanup_cuda_extend_radix_with_sign_msb_64(void *const *streams,
                                                uint32_t const *gpu_indexes,
                                                uint32_t gpu_count,
void cleanup_cuda_extend_radix_with_sign_msb_64(CudaStreamsFFI streams,
                                                int8_t **mem_ptr_void) {
  PUSH_RANGE("clean cast")
  int_extend_radix_with_sign_msb_buffer<uint64_t> *mem_ptr =
      (int_extend_radix_with_sign_msb_buffer<uint64_t> *)(*mem_ptr_void);

  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
  mem_ptr->release(CudaStreams(streams));
  POP_RANGE()
  delete mem_ptr;
  *mem_ptr_void = nullptr;

@@ -8,19 +8,18 @@
template <typename Torus>
__host__ void host_extend_radix_with_trivial_zero_blocks_msb(
    CudaRadixCiphertextFFI *output, CudaRadixCiphertextFFI const *input,
    cudaStream_t const *streams, uint32_t const *gpu_indexes) {
    CudaStreams streams) {
  PUSH_RANGE("extend only")
  copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], output,
                                           0, input->num_radix_blocks, input, 0,
                                           input->num_radix_blocks);
  copy_radix_ciphertext_slice_async<Torus>(
      streams.stream(0), streams.gpu_index(0), output, 0,
      input->num_radix_blocks, input, 0, input->num_radix_blocks);
  POP_RANGE()
}

template <typename Torus>
__host__ void host_trim_radix_blocks_lsb(CudaRadixCiphertextFFI *output,
                                         CudaRadixCiphertextFFI const *input,
                                         cudaStream_t const *streams,
                                         uint32_t const *gpu_indexes) {
                                         CudaStreams streams) {

  const uint32_t input_start_lwe_index =
      input->num_radix_blocks - output->num_radix_blocks;
@@ -31,30 +30,29 @@ __host__ void host_trim_radix_blocks_lsb(CudaRadixCiphertextFFI *output,
      input->num_radix_blocks, output->num_radix_blocks);

  copy_radix_ciphertext_slice_async<Torus>(
      streams[0], gpu_indexes[0], output, 0, output->num_radix_blocks, input,
      input_start_lwe_index, input->num_radix_blocks);
      streams.stream(0), streams.gpu_index(0), output, 0,
      output->num_radix_blocks, input, input_start_lwe_index,
      input->num_radix_blocks);
}

template <typename Torus>
__host__ uint64_t scratch_extend_radix_with_sign_msb(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, int_extend_radix_with_sign_msb_buffer<Torus> **mem_ptr,
    CudaStreams streams, int_extend_radix_with_sign_msb_buffer<Torus> **mem_ptr,
    const int_radix_params params, uint32_t num_radix_blocks,
    uint32_t num_additional_blocks, const bool allocate_gpu_memory) {
  PUSH_RANGE("scratch cast/extend")
  uint64_t size_tracker = 0;

  *mem_ptr = new int_extend_radix_with_sign_msb_buffer<Torus>(
      streams, gpu_indexes, gpu_count, params, num_radix_blocks,
      num_additional_blocks, allocate_gpu_memory, size_tracker);
      streams, params, num_radix_blocks, num_additional_blocks,
      allocate_gpu_memory, size_tracker);
  POP_RANGE()
  return size_tracker;
}

template <typename Torus>
__host__ void host_extend_radix_with_sign_msb(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, CudaRadixCiphertextFFI *output,
    CudaStreams streams, CudaRadixCiphertextFFI *output,
    CudaRadixCiphertextFFI const *input,
    int_extend_radix_with_sign_msb_buffer<Torus> *mem_ptr,
    uint32_t num_additional_blocks, void *const *bsks, Torus *const *ksks,
@@ -62,8 +60,8 @@ __host__ void host_extend_radix_with_sign_msb(

  if (num_additional_blocks == 0) {
    PUSH_RANGE("cast/extend no addblocks")
    copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], output,
                                       input);
    copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
                                       output, input);
    POP_RANGE()
    return;
  }
@@ -72,24 +70,24 @@ __host__ void host_extend_radix_with_sign_msb(

  PANIC_IF_FALSE(input_blocks > 0, "Cuda error: input blocks cannot be zero");

  copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], output,
                                           0, input_blocks, input, 0,
                                           input_blocks);
  copy_radix_ciphertext_slice_async<Torus>(
      streams.stream(0), streams.gpu_index(0), output, 0, input_blocks, input,
      0, input_blocks);

  copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
                                           mem_ptr->last_block, 0, 1, input,
  copy_radix_ciphertext_slice_async<Torus>(
      streams.stream(0), streams.gpu_index(0), mem_ptr->last_block, 0, 1, input,
      input_blocks - 1, input_blocks);

  host_apply_univariate_lut_kb(
      streams, gpu_indexes, gpu_count, mem_ptr->padding_block,
      mem_ptr->last_block, mem_ptr->lut, ksks, ms_noise_reduction_key, bsks);
  host_apply_univariate_lut_kb(streams, mem_ptr->padding_block,
                               mem_ptr->last_block, mem_ptr->lut, ksks,
                               ms_noise_reduction_key, bsks);

  for (uint32_t i = 0; i < num_additional_blocks; ++i) {
    uint32_t dst_block_idx = input_blocks + i;

    copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], output,
                                             dst_block_idx, dst_block_idx + 1,
                                             mem_ptr->padding_block, 0, 1);
    copy_radix_ciphertext_slice_async<Torus>(
        streams.stream(0), streams.gpu_index(0), output, dst_block_idx,
        dst_block_idx + 1, mem_ptr->padding_block, 0, 1);
  }
  POP_RANGE()
}

@@ -1,13 +1,13 @@
#include "integer/cmux.cuh"

uint64_t scratch_cuda_integer_radix_cmux_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t lwe_ciphertext_count,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {
  PUSH_RANGE("scratch cmux")
  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          big_lwe_dimension, small_lwe_dimension, ks_level,
@@ -18,16 +18,14 @@ uint64_t scratch_cuda_integer_radix_cmux_kb_64(
      [](uint64_t x) -> uint64_t { return x == 1; };

  uint64_t ret = scratch_cuda_integer_radix_cmux_kb<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      (int_cmux_buffer<uint64_t> **)mem_ptr, predicate_lut_f,
      lwe_ciphertext_count, params, allocate_gpu_memory);
      CudaStreams(streams), (int_cmux_buffer<uint64_t> **)mem_ptr,
      predicate_lut_f, lwe_ciphertext_count, params, allocate_gpu_memory);
  POP_RANGE()
  return ret;
}

void cuda_cmux_integer_radix_ciphertext_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    CudaRadixCiphertextFFI *lwe_array_out,
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
    CudaRadixCiphertextFFI const *lwe_condition,
    CudaRadixCiphertextFFI const *lwe_array_true,
    CudaRadixCiphertextFFI const *lwe_array_false, int8_t *mem_ptr,
@@ -35,21 +33,18 @@ void cuda_cmux_integer_radix_ciphertext_kb_64(
    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
  PUSH_RANGE("cmux")
  host_integer_radix_cmux_kb<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out,
      lwe_condition, lwe_array_true, lwe_array_false,
      (int_cmux_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
      ms_noise_reduction_key);
      CudaStreams(streams), lwe_array_out, lwe_condition, lwe_array_true,
      lwe_array_false, (int_cmux_buffer<uint64_t> *)mem_ptr, bsks,
      (uint64_t **)(ksks), ms_noise_reduction_key);
  POP_RANGE()
}

void cleanup_cuda_integer_radix_cmux(void *const *streams,
                                     uint32_t const *gpu_indexes,
                                     uint32_t gpu_count,
void cleanup_cuda_integer_radix_cmux(CudaStreamsFFI streams,
                                     int8_t **mem_ptr_void) {
  PUSH_RANGE("cleanup cmux")
  int_cmux_buffer<uint64_t> *mem_ptr =
      (int_cmux_buffer<uint64_t> *)(*mem_ptr_void);
  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
  mem_ptr->release(CudaStreams(streams));
  delete mem_ptr;
  *mem_ptr_void = nullptr;
  POP_RANGE()

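The cmux entry points above follow the scratch/compute/cleanup lifecycle used
across the backend. A hedged caller sketch; every value is supplied by the
caller, the declarations come from this commit's headers, and the compute call
is elided:

uint64_t cmux_lifecycle(CudaStreamsFFI streams, uint32_t glwe_dimension,
                        uint32_t polynomial_size, uint32_t big_lwe_dimension,
                        uint32_t small_lwe_dimension, uint32_t ks_level,
                        uint32_t ks_base_log, uint32_t pbs_level,
                        uint32_t pbs_base_log, uint32_t grouping_factor,
                        uint32_t lwe_ciphertext_count, uint32_t message_modulus,
                        uint32_t carry_modulus, PBS_TYPE pbs_type,
                        PBS_MS_REDUCTION_T noise_reduction_type) {
  int8_t *scratch = nullptr;
  // 1) Allocate the operation's device buffers; the return value tracks how
  //    much GPU memory was reserved.
  uint64_t size_tracker = scratch_cuda_integer_radix_cmux_kb_64(
      streams, &scratch, glwe_dimension, polynomial_size, big_lwe_dimension,
      small_lwe_dimension, ks_level, ks_base_log, pbs_level, pbs_base_log,
      grouping_factor, lwe_ciphertext_count, message_modulus, carry_modulus,
      pbs_type, /*allocate_gpu_memory=*/true, noise_reduction_type);
  // 2) Run cuda_cmux_integer_radix_ciphertext_kb_64 as many times as needed,
  //    reusing `scratch` between calls.
  // 3) Release the buffers; cleanup also nulls the caller's pointer.
  cleanup_cuda_integer_radix_cmux(streams, &scratch);
  return size_tracker;
}
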
@@ -6,8 +6,7 @@

template <typename Torus>
__host__ void
zero_out_if(cudaStream_t const *streams, uint32_t const *gpu_indexes,
            uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
zero_out_if(CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
            CudaRadixCiphertextFFI const *lwe_array_input,
            CudaRadixCiphertextFFI const *lwe_condition,
            int_zero_out_if_buffer<Torus> *mem_ptr,
@@ -27,26 +26,25 @@ zero_out_if(cudaStream_t const *streams, uint32_t const *gpu_indexes,
      "Cuda error: input and output radix ciphertexts must have the same "
      "lwe dimension");

  cuda_set_device(gpu_indexes[0]);
  cuda_set_device(streams.gpu_index(0));
  auto params = mem_ptr->params;

  // We can't use integer_radix_apply_bivariate_lookup_table_kb since the
  // second operand is not an array
  auto tmp_lwe_array_input = mem_ptr->tmp;
  host_pack_bivariate_blocks_with_single_block<Torus>(
      streams, gpu_indexes, gpu_count, tmp_lwe_array_input,
      predicate->lwe_indexes_in, lwe_array_input, lwe_condition,
      predicate->lwe_indexes_in, params.message_modulus, num_radix_blocks);
      streams, tmp_lwe_array_input, predicate->lwe_indexes_in, lwe_array_input,
      lwe_condition, predicate->lwe_indexes_in, params.message_modulus,
      num_radix_blocks);

  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array_out, tmp_lwe_array_input, bsks,
      ksks, ms_noise_reduction_key, predicate, num_radix_blocks);
      streams, lwe_array_out, tmp_lwe_array_input, bsks, ksks,
      ms_noise_reduction_key, predicate, num_radix_blocks);
}

template <typename Torus>
__host__ void host_integer_radix_cmux_kb(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
    CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
    CudaRadixCiphertextFFI const *lwe_condition,
    CudaRadixCiphertextFFI const *lwe_array_true,
    CudaRadixCiphertextFFI const *lwe_array_false,
@@ -62,18 +60,19 @@ __host__ void host_integer_radix_cmux_kb(
  auto params = mem_ptr->params;
  Torus lwe_size = params.big_lwe_dimension + 1;
  copy_radix_ciphertext_slice_async<Torus>(
      streams[0], gpu_indexes[0], mem_ptr->buffer_in, 0, num_radix_blocks,
      lwe_array_true, 0, num_radix_blocks);
      streams.stream(0), streams.gpu_index(0), mem_ptr->buffer_in, 0,
      num_radix_blocks, lwe_array_true, 0, num_radix_blocks);
  copy_radix_ciphertext_slice_async<Torus>(
      streams[0], gpu_indexes[0], mem_ptr->buffer_in, num_radix_blocks,
      2 * num_radix_blocks, lwe_array_false, 0, num_radix_blocks);
      streams.stream(0), streams.gpu_index(0), mem_ptr->buffer_in,
      num_radix_blocks, 2 * num_radix_blocks, lwe_array_false, 0,
      num_radix_blocks);
  for (uint i = 0; i < 2 * num_radix_blocks; i++) {
    copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
                                             mem_ptr->condition_array, i, i + 1,
                                             lwe_condition, 0, 1);
    copy_radix_ciphertext_slice_async<Torus>(
        streams.stream(0), streams.gpu_index(0), mem_ptr->condition_array, i,
        i + 1, lwe_condition, 0, 1);
  }
  integer_radix_apply_bivariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, mem_ptr->buffer_out, mem_ptr->buffer_in,
      streams, mem_ptr->buffer_out, mem_ptr->buffer_in,
      mem_ptr->condition_array, bsks, ksks, ms_noise_reduction_key,
      mem_ptr->predicate_lut, 2 * num_radix_blocks, params.message_modulus);

@@ -87,25 +86,24 @@ __host__ void host_integer_radix_cmux_kb(
  as_radix_ciphertext_slice<Torus>(&mem_false, mem_ptr->buffer_out,
                                   num_radix_blocks, 2 * num_radix_blocks);

  host_addition<Torus>(streams[0], gpu_indexes[0], &mem_true, &mem_true,
                       &mem_false, num_radix_blocks, params.message_modulus,
                       params.carry_modulus);
  host_addition<Torus>(streams.stream(0), streams.gpu_index(0), &mem_true,
                       &mem_true, &mem_false, num_radix_blocks,
                       params.message_modulus, params.carry_modulus);

  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array_out, &mem_true, bsks, ksks,
      ms_noise_reduction_key, mem_ptr->message_extract_lut, num_radix_blocks);
      streams, lwe_array_out, &mem_true, bsks, ksks, ms_noise_reduction_key,
      mem_ptr->message_extract_lut, num_radix_blocks);
}

template <typename Torus>
__host__ uint64_t scratch_cuda_integer_radix_cmux_kb(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, int_cmux_buffer<Torus> **mem_ptr,
    CudaStreams streams, int_cmux_buffer<Torus> **mem_ptr,
    std::function<Torus(Torus)> predicate_lut_f, uint32_t num_radix_blocks,
    int_radix_params params, bool allocate_gpu_memory) {
  uint64_t size_tracker = 0;
  *mem_ptr = new int_cmux_buffer<Torus>(
      streams, gpu_indexes, gpu_count, predicate_lut_f, params,
      num_radix_blocks, allocate_gpu_memory, size_tracker);
  *mem_ptr = new int_cmux_buffer<Torus>(streams, predicate_lut_f, params,
                                        num_radix_blocks, allocate_gpu_memory,
                                        size_tracker);
  return size_tracker;
}
#endif

@@ -1,14 +1,13 @@
#include "integer/comparison.cuh"

uint64_t scratch_cuda_integer_radix_comparison_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t num_radix_blocks,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
    COMPARISON_TYPE op_type, bool is_signed, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {
    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
    PBS_TYPE pbs_type, COMPARISON_TYPE op_type, bool is_signed,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
  PUSH_RANGE("scratch comparison")
  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          big_lwe_dimension, small_lwe_dimension, ks_level,
@@ -20,9 +19,8 @@ uint64_t scratch_cuda_integer_radix_comparison_kb_64(
  case EQ:
  case NE:
    size_tracker += scratch_cuda_integer_radix_comparison_check_kb<uint64_t>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        (int_comparison_buffer<uint64_t> **)mem_ptr, num_radix_blocks, params,
        op_type, false, allocate_gpu_memory);
        CudaStreams(streams), (int_comparison_buffer<uint64_t> **)mem_ptr,
        num_radix_blocks, params, op_type, false, allocate_gpu_memory);
    break;
  case GT:
  case GE:
@@ -31,9 +29,8 @@ uint64_t scratch_cuda_integer_radix_comparison_kb_64(
  case MAX:
  case MIN:
    size_tracker += scratch_cuda_integer_radix_comparison_check_kb<uint64_t>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        (int_comparison_buffer<uint64_t> **)mem_ptr, num_radix_blocks, params,
        op_type, is_signed, allocate_gpu_memory);
        CudaStreams(streams), (int_comparison_buffer<uint64_t> **)mem_ptr,
        num_radix_blocks, params, op_type, is_signed, allocate_gpu_memory);
    break;
  }
  POP_RANGE()
@@ -41,8 +38,7 @@ uint64_t scratch_cuda_integer_radix_comparison_kb_64(
}

void cuda_comparison_integer_radix_ciphertext_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    CudaRadixCiphertextFFI *lwe_array_out,
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
    CudaRadixCiphertextFFI const *lwe_array_1,
    CudaRadixCiphertextFFI const *lwe_array_2, int8_t *mem_ptr,
    void *const *bsks, void *const *ksks,
@@ -60,9 +56,8 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
  case EQ:
  case NE:
    host_integer_radix_equality_check_kb<uint64_t>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out,
        lwe_array_1, lwe_array_2, buffer, bsks, (uint64_t **)(ksks),
        ms_noise_reduction_key, num_radix_blocks);
        CudaStreams(streams), lwe_array_out, lwe_array_1, lwe_array_2, buffer,
        bsks, (uint64_t **)(ksks), ms_noise_reduction_key, num_radix_blocks);
    break;
  case GT:
  case GE:
@@ -72,18 +67,17 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
      PANIC("Cuda error (comparisons): the number of radix blocks has to be "
            "even.")
    host_integer_radix_difference_check_kb<uint64_t>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out,
        lwe_array_1, lwe_array_2, buffer, buffer->diff_buffer->operator_f, bsks,
        (uint64_t **)(ksks), ms_noise_reduction_key, num_radix_blocks);
        CudaStreams(streams), lwe_array_out, lwe_array_1, lwe_array_2, buffer,
        buffer->diff_buffer->operator_f, bsks, (uint64_t **)(ksks),
        ms_noise_reduction_key, num_radix_blocks);
    break;
  case MAX:
  case MIN:
    if (num_radix_blocks % 2 != 0)
      PANIC("Cuda error (max/min): the number of radix blocks has to be even.")
    host_integer_radix_maxmin_kb<uint64_t>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out,
        lwe_array_1, lwe_array_2, buffer, bsks, (uint64_t **)(ksks),
        ms_noise_reduction_key, num_radix_blocks);
        CudaStreams(streams), lwe_array_out, lwe_array_1, lwe_array_2, buffer,
        bsks, (uint64_t **)(ksks), ms_noise_reduction_key, num_radix_blocks);
    break;
  default:
    PANIC("Cuda error: integer operation not supported")
@@ -91,27 +85,25 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
  POP_RANGE()
}

void cleanup_cuda_integer_comparison(void *const *streams,
                                     uint32_t const *gpu_indexes,
                                     uint32_t gpu_count,
void cleanup_cuda_integer_comparison(CudaStreamsFFI streams,
                                     int8_t **mem_ptr_void) {
  PUSH_RANGE("cleanup comparison")
  int_comparison_buffer<uint64_t> *mem_ptr =
      (int_comparison_buffer<uint64_t> *)(*mem_ptr_void);
  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
  mem_ptr->release(CudaStreams(streams));
  delete mem_ptr;
  *mem_ptr_void = nullptr;
  POP_RANGE()
}

uint64_t scratch_cuda_integer_are_all_comparisons_block_true_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t num_radix_blocks,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
    PBS_TYPE pbs_type, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          big_lwe_dimension, small_lwe_dimension, ks_level,
@@ -119,14 +111,12 @@ uint64_t scratch_cuda_integer_are_all_comparisons_block_true_kb_64(
                          message_modulus, carry_modulus, noise_reduction_type);

  return scratch_cuda_integer_radix_comparison_check_kb<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      (int_comparison_buffer<uint64_t> **)mem_ptr, num_radix_blocks, params, EQ,
      false, allocate_gpu_memory);
      CudaStreams(streams), (int_comparison_buffer<uint64_t> **)mem_ptr,
      num_radix_blocks, params, EQ, false, allocate_gpu_memory);
}

void cuda_integer_are_all_comparisons_block_true_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    CudaRadixCiphertextFFI *lwe_array_out,
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
    CudaRadixCiphertextFFI const *lwe_array_in, int8_t *mem_ptr,
    void *const *bsks, void *const *ksks,
    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
@@ -136,30 +126,28 @@ void cuda_integer_are_all_comparisons_block_true_kb_64(
      (int_comparison_buffer<uint64_t> *)mem_ptr;

  host_integer_are_all_comparisons_block_true_kb<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out,
      lwe_array_in, buffer, bsks, (uint64_t **)(ksks), ms_noise_reduction_key,
      num_radix_blocks);
      CudaStreams(streams), lwe_array_out, lwe_array_in, buffer, bsks,
      (uint64_t **)(ksks), ms_noise_reduction_key, num_radix_blocks);
}

void cleanup_cuda_integer_are_all_comparisons_block_true(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr_void) {
    CudaStreamsFFI streams, int8_t **mem_ptr_void) {

  int_comparison_buffer<uint64_t> *mem_ptr =
      (int_comparison_buffer<uint64_t> *)(*mem_ptr_void);
  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
  mem_ptr->release(CudaStreams(streams));
  delete mem_ptr;
  *mem_ptr_void = nullptr;
}

uint64_t scratch_cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t num_radix_blocks,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
    PBS_TYPE pbs_type, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          big_lwe_dimension, small_lwe_dimension, ks_level,
@@ -167,14 +155,12 @@ uint64_t scratch_cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
                          message_modulus, carry_modulus, noise_reduction_type);

  return scratch_cuda_integer_radix_comparison_check_kb<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      (int_comparison_buffer<uint64_t> **)mem_ptr, num_radix_blocks, params, EQ,
      false, allocate_gpu_memory);
      CudaStreams(streams), (int_comparison_buffer<uint64_t> **)mem_ptr,
      num_radix_blocks, params, EQ, false, allocate_gpu_memory);
}

void cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    CudaRadixCiphertextFFI *lwe_array_out,
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
    CudaRadixCiphertextFFI const *lwe_array_in, int8_t *mem_ptr,
    void *const *bsks, void *const *ksks,
    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
@@ -184,18 +170,16 @@ void cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
      (int_comparison_buffer<uint64_t> *)mem_ptr;

  host_integer_is_at_least_one_comparisons_block_true_kb<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out,
      lwe_array_in, buffer, bsks, (uint64_t **)(ksks), ms_noise_reduction_key,
      num_radix_blocks);
      CudaStreams(streams), lwe_array_out, lwe_array_in, buffer, bsks,
      (uint64_t **)(ksks), ms_noise_reduction_key, num_radix_blocks);
}

void cleanup_cuda_integer_is_at_least_one_comparisons_block_true(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr_void) {
    CudaStreamsFFI streams, int8_t **mem_ptr_void) {

  int_comparison_buffer<uint64_t> *mem_ptr =
      (int_comparison_buffer<uint64_t> *)(*mem_ptr_void);
  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
  mem_ptr->release(CudaStreams(streams));
  delete mem_ptr;
  *mem_ptr_void = nullptr;
}

@@ -58,8 +58,7 @@ __host__ void accumulate_all_blocks(cudaStream_t stream, uint32_t gpu_index,
 */
template <typename Torus>
__host__ void are_all_comparisons_block_true(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
    CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
    CudaRadixCiphertextFFI const *lwe_array_in,
    int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
    Torus *const *ksks,
@@ -86,9 +85,9 @@ __host__ void are_all_comparisons_block_true(
  uint32_t total_modulus = message_modulus * carry_modulus;
  uint32_t max_value = (total_modulus - 1) / (message_modulus - 1);

  copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], tmp_out,
                                           0, num_radix_blocks, lwe_array_in, 0,
                                           num_radix_blocks);
  copy_radix_ciphertext_slice_async<Torus>(
      streams.stream(0), streams.gpu_index(0), tmp_out, 0, num_radix_blocks,
      lwe_array_in, 0, num_radix_blocks);

  uint32_t remaining_blocks = num_radix_blocks;

@@ -108,9 +107,9 @@ __host__ void are_all_comparisons_block_true(
      uint32_t chunk_length =
          std::min(max_value, begin_remaining_blocks - i * max_value);
      chunk_lengths[i] = chunk_length;
      accumulate_all_blocks<Torus>(streams[0], gpu_indexes[0], accumulator_ptr,
                                   input_blocks, big_lwe_dimension,
                                   chunk_length);
      accumulate_all_blocks<Torus>(streams.stream(0), streams.gpu_index(0),
                                   accumulator_ptr, input_blocks,
                                   big_lwe_dimension, chunk_length);

      accumulator_ptr += (big_lwe_dimension + 1);
      remaining_blocks -= (chunk_length - 1);
@@ -131,8 +130,8 @@ __host__ void are_all_comparisons_block_true(
          return x == chunk_length;
        };
        generate_device_accumulator_with_cpu_prealloc<Torus>(
            streams[0], gpu_indexes[0], is_max_value_lut->get_lut(0, 1),
            is_max_value_lut->get_degree(1),
            streams.stream(0), streams.gpu_index(0),
            is_max_value_lut->get_lut(0, 1), is_max_value_lut->get_degree(1),
            is_max_value_lut->get_max_degree(1), glwe_dimension,
            polynomial_size, message_modulus, carry_modulus,
            is_equal_to_num_blocks_lut_f, true,
@@ -148,9 +147,9 @@ __host__ void are_all_comparisons_block_true(
        }
        cuda_memcpy_async_to_gpu(is_max_value_lut->get_lut_indexes(0, 0),
                                 h_lut_indexes, num_chunks * sizeof(Torus),
                                 streams[0], gpu_indexes[0]);
        auto active_gpu_count = get_active_gpu_count(num_chunks, gpu_count);
        is_max_value_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
                                 streams.stream(0), streams.gpu_index(0));
        auto active_streams = streams.active_gpu_subset(num_chunks);
        is_max_value_lut->broadcast_lut(active_streams);
      }
      lut = is_max_value_lut;
    }
@@ -159,8 +158,8 @@ __host__ void are_all_comparisons_block_true(
    if (remaining_blocks == 1) {
      // In the last iteration we copy the output to the final address
      integer_radix_apply_univariate_lookup_table_kb<Torus>(
          streams, gpu_indexes, gpu_count, lwe_array_out, accumulator, bsks,
          ksks, ms_noise_reduction_key, lut, 1);
          streams, lwe_array_out, accumulator, bsks, ksks,
          ms_noise_reduction_key, lut, 1);
      // Reset max_value_lut_indexes before returning, otherwise if the lut is
      // reused the lut indexes will be wrong
      memset(is_max_value_lut->h_lut_indexes, 0,
@@ -168,17 +167,17 @@ __host__ void are_all_comparisons_block_true(
      cuda_memcpy_async_to_gpu(is_max_value_lut->get_lut_indexes(0, 0),
                               is_max_value_lut->h_lut_indexes,
                               is_max_value_lut->num_blocks * sizeof(Torus),
                               streams[0], gpu_indexes[0]);
                               streams.stream(0), streams.gpu_index(0));
      auto active_gpu_count_is_max =
          get_active_gpu_count(is_max_value_lut->num_blocks, gpu_count);
      is_max_value_lut->broadcast_lut(streams, gpu_indexes,
                                      active_gpu_count_is_max, false);
          streams.active_gpu_subset(is_max_value_lut->num_blocks);
      is_max_value_lut->broadcast_lut(active_gpu_count_is_max, false);

      reset_radix_ciphertext_blocks(lwe_array_out, 1);
      return;
    } else {
      integer_radix_apply_univariate_lookup_table_kb<Torus>(
          streams, gpu_indexes, gpu_count, tmp_out, accumulator, bsks, ksks,
          ms_noise_reduction_key, lut, num_chunks);
          streams, tmp_out, accumulator, bsks, ksks, ms_noise_reduction_key,
          lut, num_chunks);
    }
  }
}
@@ -191,8 +190,7 @@ __host__ void are_all_comparisons_block_true(
 */
template <typename Torus>
__host__ void is_at_least_one_comparisons_block_true(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
    CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
    CudaRadixCiphertextFFI const *lwe_array_in,
    int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
    Torus *const *ksks,
@@ -216,7 +214,7 @@ __host__ void is_at_least_one_comparisons_block_true(
  uint32_t max_value = (total_modulus - 1) / (message_modulus - 1);

  copy_radix_ciphertext_slice_async<Torus>(
      streams[0], gpu_indexes[0], mem_ptr->tmp_lwe_array_out, 0,
      streams.stream(0), streams.gpu_index(0), mem_ptr->tmp_lwe_array_out, 0,
      num_radix_blocks, lwe_array_in, 0, num_radix_blocks);

  uint32_t remaining_blocks = num_radix_blocks;
@@ -234,8 +232,8 @@ __host__ void is_at_least_one_comparisons_block_true(
      uint32_t chunk_length =
          std::min(max_value, begin_remaining_blocks - i * max_value);
      chunk_lengths[i] = chunk_length;
      accumulate_all_blocks<Torus>(streams[0], gpu_indexes[0], accumulator,
                                   input_blocks, big_lwe_dimension,
      accumulate_all_blocks<Torus>(streams.stream(0), streams.gpu_index(0),
                                   accumulator, input_blocks, big_lwe_dimension,
                                   chunk_length);

      accumulator += (big_lwe_dimension + 1);
@@ -250,23 +248,20 @@ __host__ void is_at_least_one_comparisons_block_true(
    if (remaining_blocks == 1) {
      // In the last iteration we copy the output to the final address
      integer_radix_apply_univariate_lookup_table_kb<Torus>(
          streams, gpu_indexes, gpu_count, lwe_array_out,
          buffer->tmp_block_accumulated, bsks, ksks, ms_noise_reduction_key,
          lut, 1);
          streams, lwe_array_out, buffer->tmp_block_accumulated, bsks, ksks,
          ms_noise_reduction_key, lut, 1);
      return;
    } else {
      integer_radix_apply_univariate_lookup_table_kb<Torus>(
          streams, gpu_indexes, gpu_count, mem_ptr->tmp_lwe_array_out,
          buffer->tmp_block_accumulated, bsks, ksks, ms_noise_reduction_key,
          lut, num_chunks);
          streams, mem_ptr->tmp_lwe_array_out, buffer->tmp_block_accumulated,
          bsks, ksks, ms_noise_reduction_key, lut, num_chunks);
    }
  }
}

template <typename Torus>
__host__ void host_compare_blocks_with_zero(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
    CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
    CudaRadixCiphertextFFI const *lwe_array_in,
    int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
    Torus *const *ksks,
@@ -303,8 +298,8 @@ __host__ void host_compare_blocks_with_zero(

  if (num_radix_blocks == 1) {
    // Just copy
    copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], sum, 0,
                                             1, lwe_array_in, 0, 1);
    copy_radix_ciphertext_slice_async<Torus>(
        streams.stream(0), streams.gpu_index(0), sum, 0, 1, lwe_array_in, 0, 1);
    num_sum_blocks = 1;
  } else {
    uint32_t remainder_blocks = num_radix_blocks;
@@ -314,8 +309,8 @@ __host__ void host_compare_blocks_with_zero(
      uint32_t chunk_size =
          std::min(remainder_blocks, num_elements_to_fill_carry);

      accumulate_all_blocks<Torus>(streams[0], gpu_indexes[0], sum_i, chunk,
                                   big_lwe_dimension, chunk_size);
      accumulate_all_blocks<Torus>(streams.stream(0), streams.gpu_index(0),
                                   sum_i, chunk, big_lwe_dimension, chunk_size);

      num_sum_blocks++;
      remainder_blocks -= (chunk_size - 1);
@@ -327,16 +322,15 @@ __host__ void host_compare_blocks_with_zero(
  }

  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array_out, sum, bsks, ksks,
      ms_noise_reduction_key, zero_comparison, num_sum_blocks);
      streams, lwe_array_out, sum, bsks, ksks, ms_noise_reduction_key,
      zero_comparison, num_sum_blocks);

  reset_radix_ciphertext_blocks(lwe_array_out, num_sum_blocks);
}

template <typename Torus>
__host__ void host_integer_radix_equality_check_kb(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
    CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
    CudaRadixCiphertextFFI const *lwe_array_1,
    CudaRadixCiphertextFFI const *lwe_array_2,
    int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
@@ -352,23 +346,22 @@ __host__ void host_integer_radix_equality_check_kb(
  // Applies the LUT for the comparison operation
  auto comparisons = mem_ptr->tmp_block_comparisons;
  integer_radix_apply_bivariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, comparisons, lwe_array_1, lwe_array_2,
      bsks, ksks, ms_noise_reduction_key, eq_buffer->operator_lut,
      num_radix_blocks, eq_buffer->operator_lut->params.message_modulus);
      streams, comparisons, lwe_array_1, lwe_array_2, bsks, ksks,
      ms_noise_reduction_key, eq_buffer->operator_lut, num_radix_blocks,
      eq_buffer->operator_lut->params.message_modulus);

  // This takes a Vec of blocks, where each block is either 0 or 1.
  //
  // It returns a block encrypting 1 if all input blocks are 1
  // otherwise the block encrypts 0
  are_all_comparisons_block_true<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array_out, comparisons, mem_ptr,
      bsks, ksks, ms_noise_reduction_key, num_radix_blocks);
      streams, lwe_array_out, comparisons, mem_ptr, bsks, ksks,
      ms_noise_reduction_key, num_radix_blocks);
}

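The reduction above has a direct cleartext analogue: sum at most max_value
boolean blocks at a time so the sum still fits in one block, test each sum
against its chunk length (the "is max value" LUT), and iterate until one block
remains. A self-contained sketch of that loop on plain integers:

#include <algorithm>
#include <cstdint>
#include <vector>

// Cleartext model of the chunked AND-reduction. max_value plays the role of
// (total_modulus - 1) / (message_modulus - 1): the largest number of 0/1
// blocks whose sum is still representable in one block.
uint32_t all_blocks_true(std::vector<uint32_t> blocks, uint32_t max_value) {
  while (blocks.size() > 1) {
    std::vector<uint32_t> next;
    for (size_t i = 0; i < blocks.size(); i += max_value) {
      size_t len = std::min<size_t>(max_value, blocks.size() - i);
      uint32_t sum = 0;
      for (size_t j = 0; j < len; j++)
        sum += blocks[i + j];             // accumulate_all_blocks
      next.push_back(sum == len ? 1 : 0); // LUT: "sum equals chunk length"
    }
    blocks = next;
  }
  return blocks[0];
}
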
template <typename Torus>
__host__ void compare_radix_blocks_kb(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
    CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
    CudaRadixCiphertextFFI const *lwe_array_left,
    CudaRadixCiphertextFFI const *lwe_array_right,
    int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
@@ -400,22 +393,21 @@ __host__ void compare_radix_blocks_kb(

  // Subtract
  host_subtraction<Torus>(
      streams[0], gpu_indexes[0], (Torus *)lwe_array_out->ptr,
      streams.stream(0), streams.gpu_index(0), (Torus *)lwe_array_out->ptr,
      (Torus *)lwe_array_left->ptr, (Torus *)lwe_array_right->ptr,
      big_lwe_dimension, num_radix_blocks);

  // Apply LUT to compare to 0
  auto is_non_zero_lut = mem_ptr->eq_buffer->is_non_zero_lut;
  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_out, bsks, ksks,
      ms_noise_reduction_key, is_non_zero_lut, num_radix_blocks);
      streams, lwe_array_out, lwe_array_out, bsks, ksks, ms_noise_reduction_key,
      is_non_zero_lut, num_radix_blocks);

  // Add one
  // Here Lhs can have the following values: (-1) % (message modulus * carry
  // modulus), 0, 1. So the output values after the addition will be: 0, 1, 2
  host_integer_radix_add_scalar_one_inplace<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array_out, message_modulus,
      carry_modulus);
      streams, lwe_array_out, message_modulus, carry_modulus);
}

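The add-one trick above relies on modular wrap-around: before the addition a
block holds a sign in {-1, 0, 1} modulo message_modulus * carry_modulus, and
adding 1 maps it to {0, 1, 2} (inferior, equal, superior). A quick cleartext
check, assuming the usual 2_2 parameters (message_modulus = carry_modulus = 4):

#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t total_modulus = 4 * 4; // message_modulus * carry_modulus
  // -1 is encoded as total_modulus - 1 = 15, so the +1 wraps it back to 0.
  for (int32_t sign : {-1, 0, 1}) {
    uint32_t encoded =
        (uint32_t)(sign + (int32_t)total_modulus) % total_modulus;
    uint32_t shifted = (encoded + 1) % total_modulus; // add scalar one
    printf("sign %2d -> encoded %2u -> +1 -> %u\n", sign, encoded, shifted);
  }
  // Prints 0, 1, 2: inferior, equal, superior.
}
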
// Reduces a vec containing shortint blocks that encrypts a sign
@@ -423,8 +415,7 @@ __host__ void compare_radix_blocks_kb(
// final sign
template <typename Torus>
__host__ void tree_sign_reduction(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
    CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
    CudaRadixCiphertextFFI *lwe_block_comparisons,
    int_tree_sign_reduction_buffer<Torus> *tree_buffer,
    std::function<Torus(Torus)> sign_handler_f, void *const *bsks,
@@ -453,26 +444,26 @@ __host__ void tree_sign_reduction(
  auto y = tree_buffer->tmp_y;
  if (x != lwe_block_comparisons)
    copy_radix_ciphertext_slice_async<Torus>(
        streams[0], gpu_indexes[0], x, 0, num_radix_blocks,
        streams.stream(0), streams.gpu_index(0), x, 0, num_radix_blocks,
        lwe_block_comparisons, 0, num_radix_blocks);

  uint32_t partial_block_count = num_radix_blocks;

  auto inner_tree_leaf = tree_buffer->tree_inner_leaf_lut;
  while (partial_block_count > 2) {
    pack_blocks<Torus>(streams[0], gpu_indexes[0], y, x, partial_block_count,
                       message_modulus);
    pack_blocks<Torus>(streams.stream(0), streams.gpu_index(0), y, x,
                       partial_block_count, message_modulus);

    integer_radix_apply_univariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, x, y, bsks, ksks,
        ms_noise_reduction_key, inner_tree_leaf, partial_block_count >> 1);
        streams, x, y, bsks, ksks, ms_noise_reduction_key, inner_tree_leaf,
        partial_block_count >> 1);

    if ((partial_block_count % 2) != 0) {
      partial_block_count >>= 1;
      partial_block_count++;

      copy_radix_ciphertext_slice_async<Torus>(
          streams[0], gpu_indexes[0], x, partial_block_count - 1,
          streams.stream(0), streams.gpu_index(0), x, partial_block_count - 1,
          partial_block_count, y, partial_block_count - 1, partial_block_count);
    } else {
      partial_block_count >>= 1;
@@ -484,8 +475,8 @@ __host__ void tree_sign_reduction(
  std::function<Torus(Torus)> f;
  auto num_bits_in_message = log2_int(params.message_modulus);
  if (partial_block_count == 2) {
    pack_blocks<Torus>(streams[0], gpu_indexes[0], y, x, partial_block_count,
                       message_modulus);
    pack_blocks<Torus>(streams.stream(0), streams.gpu_index(0), y, x,
                       partial_block_count, message_modulus);

    f = [block_selector_f, sign_handler_f, num_bits_in_message,
         message_modulus](Torus x) -> Torus {
@@ -501,24 +492,23 @@ __host__ void tree_sign_reduction(
    f = sign_handler_f;
  }
  generate_device_accumulator_with_cpu_prealloc<Torus>(
      streams[0], gpu_indexes[0], last_lut->get_lut(0, 0),
      streams.stream(0), streams.gpu_index(0), last_lut->get_lut(0, 0),
      last_lut->get_degree(0), last_lut->get_max_degree(0), glwe_dimension,
      polynomial_size, message_modulus, carry_modulus, f, true,
      tree_buffer->preallocated_h_lut);

  auto active_gpu_count = get_active_gpu_count(1, gpu_count);
  last_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
  auto active_streams = streams.active_gpu_subset(1);
  last_lut->broadcast_lut(active_streams);

  // Last leaf
  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array_out, y, bsks, ksks,
      ms_noise_reduction_key, last_lut, 1);
      streams, lwe_array_out, y, bsks, ksks, ms_noise_reduction_key, last_lut,
      1);
}

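Each round of the loop above packs two neighbouring sign blocks into one (the
more significant sign landing in the high bits) and merges them with a LUT
that keeps the more significant sign unless it is EQUAL. A cleartext model of
one round; the 0/1/2 encoding and the pack order are assumptions inferred from
the surrounding comments, not taken verbatim from this commit:

#include <cstdint>
#include <vector>

enum Sign : uint32_t { INFERIOR = 0, EQUAL = 1, SUPERIOR = 2 };

std::vector<uint32_t> reduce_round(const std::vector<uint32_t> &signs,
                                   uint32_t message_modulus = 4) {
  std::vector<uint32_t> out;
  size_t i = 0;
  for (; i + 1 < signs.size(); i += 2) {
    // pack_blocks: the more significant sign is shifted into the high bits
    uint32_t packed = signs[i + 1] * message_modulus + signs[i];
    uint32_t high = packed / message_modulus;
    uint32_t low = packed % message_modulus;
    out.push_back(high == EQUAL ? low : high); // inner tree leaf LUT
  }
  if (i < signs.size()) // odd count: last block is carried to the next round
    out.push_back(signs[i]);
  return out;
}
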
template <typename Torus>
__host__ void host_integer_radix_difference_check_kb(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
    CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
    CudaRadixCiphertextFFI const *lwe_array_left,
    CudaRadixCiphertextFFI const *lwe_array_right,
    int_comparison_buffer<Torus> *mem_ptr,
@@ -552,19 +542,20 @@ __host__ void host_integer_radix_difference_check_kb(
    if (mem_ptr->is_signed) {
      packed_num_radix_blocks -= 2;
    }
    pack_blocks<Torus>(streams[0], gpu_indexes[0], &lhs, lwe_array_left,
                       packed_num_radix_blocks, message_modulus);
    pack_blocks<Torus>(streams[0], gpu_indexes[0], &rhs, lwe_array_right,
                       packed_num_radix_blocks, message_modulus);
    pack_blocks<Torus>(streams.stream(0), streams.gpu_index(0), &lhs,
                       lwe_array_left, packed_num_radix_blocks,
                       message_modulus);
    pack_blocks<Torus>(streams.stream(0), streams.gpu_index(0), &rhs,
                       lwe_array_right, packed_num_radix_blocks,
                       message_modulus);
    // From this point we have half number of blocks
    packed_num_radix_blocks /= 2;

    // Clean noise
    auto identity_lut = mem_ptr->identity_lut;
    integer_radix_apply_univariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, diff_buffer->tmp_packed,
        diff_buffer->tmp_packed, bsks, ksks, ms_noise_reduction_key,
        identity_lut, 2 * packed_num_radix_blocks);
        streams, diff_buffer->tmp_packed, diff_buffer->tmp_packed, bsks, ksks,
        ms_noise_reduction_key, identity_lut, 2 * packed_num_radix_blocks);
  } else {
    as_radix_ciphertext_slice<Torus>(&lhs, lwe_array_left, 0,
                                     lwe_array_left->num_radix_blocks);
@@ -581,17 +572,17 @@ __host__ void host_integer_radix_difference_check_kb(
  if (!mem_ptr->is_signed) {
    // Compare packed blocks, or simply the total number of radix blocks in the
    // inputs
    compare_radix_blocks_kb<Torus>(
        streams, gpu_indexes, gpu_count, comparisons, &lhs, &rhs, mem_ptr, bsks,
        ksks, ms_noise_reduction_key, packed_num_radix_blocks);
    compare_radix_blocks_kb<Torus>(streams, comparisons, &lhs, &rhs, mem_ptr,
                                   bsks, ksks, ms_noise_reduction_key,
                                   packed_num_radix_blocks);
    num_comparisons = packed_num_radix_blocks;
  } else {
    // Packing is possible
    if (carry_modulus >= message_modulus) {
      // Compare (num_radix_blocks - 2) / 2 packed blocks
      compare_radix_blocks_kb<Torus>(
          streams, gpu_indexes, gpu_count, comparisons, &lhs, &rhs, mem_ptr,
          bsks, ksks, ms_noise_reduction_key, packed_num_radix_blocks);
      compare_radix_blocks_kb<Torus>(streams, comparisons, &lhs, &rhs, mem_ptr,
                                     bsks, ksks, ms_noise_reduction_key,
                                     packed_num_radix_blocks);

      // Compare the last block before the sign block separately
      auto identity_lut = mem_ptr->identity_lut;
@@ -604,9 +595,8 @@ __host__ void host_integer_radix_difference_check_kb(
                                       num_radix_blocks - 2,
                                       num_radix_blocks - 1);
      integer_radix_apply_univariate_lookup_table_kb<Torus>(
          streams, gpu_indexes, gpu_count, &last_left_block_before_sign_block,
          &shifted_lwe_array_left, bsks, ksks, ms_noise_reduction_key,
          identity_lut, 1);
          streams, &last_left_block_before_sign_block, &shifted_lwe_array_left,
          bsks, ksks, ms_noise_reduction_key, identity_lut, 1);

      CudaRadixCiphertextFFI last_right_block_before_sign_block;
      as_radix_ciphertext_slice<Torus>(
@@ -618,7 +608,7 @@ __host__ void host_integer_radix_difference_check_kb(
                                       lwe_array_right, num_radix_blocks - 2,
                                       num_radix_blocks - 1);
      integer_radix_apply_univariate_lookup_table_kb<Torus>(
          streams, gpu_indexes, gpu_count, &last_right_block_before_sign_block,
          streams, &last_right_block_before_sign_block,
          &shifted_lwe_array_right, bsks, ksks, ms_noise_reduction_key,
          identity_lut, 1);

@@ -627,8 +617,7 @@ __host__ void host_integer_radix_difference_check_kb(
                                       packed_num_radix_blocks,
                                       packed_num_radix_blocks + 1);
      compare_radix_blocks_kb<Torus>(
          streams, gpu_indexes, gpu_count, &shifted_comparisons,
          &last_left_block_before_sign_block,
          streams, &shifted_comparisons, &last_left_block_before_sign_block,
          &last_right_block_before_sign_block, mem_ptr, bsks, ksks,
          ms_noise_reduction_key, 1);

@@ -643,17 +632,15 @@ __host__ void host_integer_radix_difference_check_kb(
      as_radix_ciphertext_slice<Torus>(&last_right_block, lwe_array_right,
                                       num_radix_blocks - 1, num_radix_blocks);
      integer_radix_apply_bivariate_lookup_table_kb<Torus>(
          streams, gpu_indexes, gpu_count, &shifted_comparisons,
          &last_left_block, &last_right_block, bsks, ksks,
          ms_noise_reduction_key, mem_ptr->signed_lut, 1,
          streams, &shifted_comparisons, &last_left_block, &last_right_block,
          bsks, ksks, ms_noise_reduction_key, mem_ptr->signed_lut, 1,
          mem_ptr->signed_lut->params.message_modulus);
      num_comparisons = packed_num_radix_blocks + 2;

    } else {
      compare_radix_blocks_kb<Torus>(
          streams, gpu_indexes, gpu_count, comparisons, lwe_array_left,
          lwe_array_right, mem_ptr, bsks, ksks, ms_noise_reduction_key,
          num_radix_blocks - 1);
          streams, comparisons, lwe_array_left, lwe_array_right, mem_ptr, bsks,
          ksks, ms_noise_reduction_key, num_radix_blocks - 1);
      // Compare the sign block separately
      CudaRadixCiphertextFFI shifted_comparisons;
      as_radix_ciphertext_slice<Torus>(&shifted_comparisons, comparisons,
@@ -665,9 +652,8 @@ __host__ void host_integer_radix_difference_check_kb(
      as_radix_ciphertext_slice<Torus>(&last_right_block, lwe_array_right,
                                       num_radix_blocks - 1, num_radix_blocks);
      integer_radix_apply_bivariate_lookup_table_kb<Torus>(
          streams, gpu_indexes, gpu_count, &shifted_comparisons,
          &last_left_block, &last_right_block, bsks, ksks,
          ms_noise_reduction_key, mem_ptr->signed_lut, 1,
          streams, &shifted_comparisons, &last_left_block, &last_right_block,
          bsks, ksks, ms_noise_reduction_key, mem_ptr->signed_lut, 1,
          mem_ptr->signed_lut->params.message_modulus);
      num_comparisons = num_radix_blocks;
    }
@@ -676,30 +662,27 @@ __host__ void host_integer_radix_difference_check_kb(
  // Reduces a vec containing radix blocks that encrypts a sign
  // (inferior, equal, superior) to one single radix block containing the
  // final sign
  tree_sign_reduction<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
                             comparisons, mem_ptr->diff_buffer->tree_buffer,
                             reduction_lut_f, bsks, ksks,
                             ms_noise_reduction_key, num_comparisons);
  tree_sign_reduction<Torus>(
      streams, lwe_array_out, comparisons, mem_ptr->diff_buffer->tree_buffer,
      reduction_lut_f, bsks, ksks, ms_noise_reduction_key, num_comparisons);
}

template <typename Torus>
__host__ uint64_t scratch_cuda_integer_radix_comparison_check_kb(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, int_comparison_buffer<Torus> **mem_ptr,
    CudaStreams streams, int_comparison_buffer<Torus> **mem_ptr,
    uint32_t num_radix_blocks, int_radix_params params, COMPARISON_TYPE op,
    bool is_signed, bool allocate_gpu_memory) {

  uint64_t size_tracker = 0;
  *mem_ptr = new int_comparison_buffer<Torus>(
      streams, gpu_indexes, gpu_count, op, params, num_radix_blocks, is_signed,
      allocate_gpu_memory, size_tracker);
      streams, op, params, num_radix_blocks, is_signed, allocate_gpu_memory,
      size_tracker);
  return size_tracker;
}

template <typename Torus>
__host__ void host_integer_radix_maxmin_kb(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
    CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
    CudaRadixCiphertextFFI const *lwe_array_left,
    CudaRadixCiphertextFFI const *lwe_array_right,
    int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
@@ -718,21 +701,20 @@ __host__ void host_integer_radix_maxmin_kb(

  // Compute the sign
  host_integer_radix_difference_check_kb<Torus>(
      streams, gpu_indexes, gpu_count, mem_ptr->tmp_lwe_array_out,
      lwe_array_left, lwe_array_right, mem_ptr, mem_ptr->identity_lut_f, bsks,
      ksks, ms_noise_reduction_key, num_radix_blocks);
      streams, mem_ptr->tmp_lwe_array_out, lwe_array_left, lwe_array_right,
      mem_ptr, mem_ptr->identity_lut_f, bsks, ksks, ms_noise_reduction_key,
      num_radix_blocks);

  // Selector
  host_integer_radix_cmux_kb<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array_out,
      mem_ptr->tmp_lwe_array_out, lwe_array_left, lwe_array_right,
      mem_ptr->cmux_buffer, bsks, ksks, ms_noise_reduction_key);
  host_integer_radix_cmux_kb<Torus>(streams, lwe_array_out,
                                    mem_ptr->tmp_lwe_array_out, lwe_array_left,
                                    lwe_array_right, mem_ptr->cmux_buffer, bsks,
                                    ksks, ms_noise_reduction_key);
}

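host_integer_radix_maxmin_kb composes the two primitives above: the difference
check produces an encrypted sign and the cmux selects an operand with it. The
cleartext shape, for orientation:

#include <cstdint>

// In the homomorphic version both steps run on encrypted radix blocks:
// the comparison is host_integer_radix_difference_check_kb and the
// selection is host_integer_radix_cmux_kb.
uint64_t max_via_cmux(uint64_t left, uint64_t right) {
  bool sign = left > right;   // difference check
  return sign ? left : right; // cmux
}
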
template <typename Torus>
__host__ void host_integer_are_all_comparisons_block_true_kb(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
    CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
    CudaRadixCiphertextFFI const *lwe_array_in,
    int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
    Torus *const *ksks,
@@ -742,14 +724,13 @@ __host__ void host_integer_are_all_comparisons_block_true_kb(
  // It returns a block encrypting 1 if all input blocks are 1
  // otherwise the block encrypts 0
  are_all_comparisons_block_true<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, mem_ptr,
      bsks, ksks, ms_noise_reduction_key, num_radix_blocks);
      streams, lwe_array_out, lwe_array_in, mem_ptr, bsks, ksks,
      ms_noise_reduction_key, num_radix_blocks);
}

template <typename Torus>
__host__ void host_integer_is_at_least_one_comparisons_block_true_kb(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
    CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
    CudaRadixCiphertextFFI const *lwe_array_in,
    int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
    Torus *const *ksks,
@@ -759,7 +740,7 @@ __host__ void host_integer_is_at_least_one_comparisons_block_true_kb(
  // It returns a block encrypting 1 if at least one input block is 1,
  // otherwise the block encrypts 0
  is_at_least_one_comparisons_block_true<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, mem_ptr,
      bsks, ksks, ms_noise_reduction_key, num_radix_blocks);
      streams, lwe_array_out, lwe_array_in, mem_ptr, bsks, ksks,
      ms_noise_reduction_key, num_radix_blocks);
}
#endif

@@ -1,12 +1,11 @@
|
||||
#include "compression.cuh"
|
||||
|
||||
uint64_t scratch_cuda_integer_compress_radix_ciphertext_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
int8_t **mem_ptr, uint32_t compression_glwe_dimension,
|
||||
uint32_t compression_polynomial_size, uint32_t lwe_dimension,
|
||||
uint32_t ks_level, uint32_t ks_base_log, uint32_t num_radix_blocks,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
uint32_t lwe_per_glwe, bool allocate_gpu_memory) {
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr,
|
||||
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
|
||||
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
|
||||
PBS_TYPE pbs_type, uint32_t lwe_per_glwe, bool allocate_gpu_memory) {
|
||||
|
||||
int_radix_params compression_params(
|
||||
pbs_type, compression_glwe_dimension, compression_polynomial_size,
|
||||
@@ -15,16 +14,14 @@ uint64_t scratch_cuda_integer_compress_radix_ciphertext_64(
|
||||
carry_modulus, PBS_MS_REDUCTION_T::NO_REDUCTION);
|
||||
|
||||
return scratch_cuda_compress_integer_radix_ciphertext<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
(int_compression<uint64_t> **)mem_ptr, num_radix_blocks,
|
||||
compression_params, lwe_per_glwe, allocate_gpu_memory);
|
||||
CudaStreams(streams), (int_compression<uint64_t> **)mem_ptr,
|
||||
num_radix_blocks, compression_params, lwe_per_glwe, allocate_gpu_memory);
|
||||
}
|
||||
uint64_t scratch_cuda_integer_decompress_radix_ciphertext_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
int8_t **mem_ptr, uint32_t encryption_glwe_dimension,
|
||||
uint32_t encryption_polynomial_size, uint32_t compression_glwe_dimension,
|
||||
uint32_t compression_polynomial_size, uint32_t lwe_dimension,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr,
|
||||
uint32_t encryption_glwe_dimension, uint32_t encryption_polynomial_size,
|
||||
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
|
||||
uint32_t lwe_dimension, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t num_blocks_to_decompress, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type) {
|
||||
@@ -42,59 +39,53 @@ uint64_t scratch_cuda_integer_decompress_radix_ciphertext_64(
|
||||
noise_reduction_type);
|
||||
|
||||
return scratch_cuda_integer_decompress_radix_ciphertext<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
(int_decompression<uint64_t> **)mem_ptr, num_blocks_to_decompress,
|
||||
encryption_params, compression_params, allocate_gpu_memory);
|
||||
CudaStreams(streams), (int_decompression<uint64_t> **)mem_ptr,
|
||||
num_blocks_to_decompress, encryption_params, compression_params,
|
||||
allocate_gpu_memory);
|
||||
}
|
||||
void cuda_integer_compress_radix_ciphertext_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
CudaPackedGlweCiphertextListFFI *glwe_array_out,
|
||||
CudaStreamsFFI streams, CudaPackedGlweCiphertextListFFI *glwe_array_out,
|
||||
CudaLweCiphertextListFFI const *lwe_array_in, void *const *fp_ksk,
|
||||
int8_t *mem_ptr) {
|
||||
|
||||
host_integer_compress<uint64_t>((cudaStream_t *)(streams), gpu_indexes,
|
||||
gpu_count, glwe_array_out, lwe_array_in,
|
||||
(uint64_t *const *)(fp_ksk),
|
||||
host_integer_compress<uint64_t>(CudaStreams(streams), glwe_array_out,
|
||||
lwe_array_in, (uint64_t *const *)(fp_ksk),
|
||||
(int_compression<uint64_t> *)mem_ptr);
|
||||
}
|
||||
void cuda_integer_decompress_radix_ciphertext_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
CudaLweCiphertextListFFI *lwe_array_out,
|
||||
CudaStreamsFFI streams, CudaLweCiphertextListFFI *lwe_array_out,
|
||||
CudaPackedGlweCiphertextListFFI const *glwe_in,
|
||||
uint32_t const *indexes_array, void *const *bsks, int8_t *mem_ptr) {
|
||||
|
||||
host_integer_decompress<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out, glwe_in,
|
||||
indexes_array, bsks, (int_decompression<uint64_t> *)mem_ptr);
|
||||
host_integer_decompress<uint64_t>(CudaStreams(streams), lwe_array_out,
|
||||
glwe_in, indexes_array, bsks,
|
||||
(int_decompression<uint64_t> *)mem_ptr);
|
||||
}
|
||||
void cleanup_cuda_integer_compress_radix_ciphertext_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
void cleanup_cuda_integer_compress_radix_ciphertext_64(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void) {
|
||||
|
||||
int_compression<uint64_t> *mem_ptr =
|
||||
(int_compression<uint64_t> *)(*mem_ptr_void);
|
||||
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
|
||||
mem_ptr->release(CudaStreams(streams));
|
||||
delete mem_ptr;
|
||||
*mem_ptr_void = nullptr;
|
||||
}
|
||||
void cleanup_cuda_integer_decompress_radix_ciphertext_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
int8_t **mem_ptr_void) {
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr_void) {
|
||||
|
||||
int_decompression<uint64_t> *mem_ptr =
|
||||
(int_decompression<uint64_t> *)(*mem_ptr_void);
|
||||
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
|
||||
mem_ptr->release(CudaStreams(streams));
|
||||
delete mem_ptr;
|
||||
*mem_ptr_void = nullptr;
|
||||
}
|
||||
|
||||
uint64_t scratch_cuda_integer_compress_radix_ciphertext_128(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
int8_t **mem_ptr, uint32_t compression_glwe_dimension,
|
||||
uint32_t compression_polynomial_size, uint32_t lwe_dimension,
|
||||
uint32_t ks_level, uint32_t ks_base_log, uint32_t num_radix_blocks,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
uint32_t lwe_per_glwe, bool allocate_gpu_memory) {
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr,
|
||||
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
|
||||
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, uint32_t lwe_per_glwe, bool allocate_gpu_memory) {

int_radix_params compression_params(
pbs_type, compression_glwe_dimension, compression_polynomial_size,
@@ -103,16 +94,14 @@ uint64_t scratch_cuda_integer_compress_radix_ciphertext_128(
carry_modulus, PBS_MS_REDUCTION_T::NO_REDUCTION);

return scratch_cuda_compress_integer_radix_ciphertext<__uint128_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_compression<__uint128_t> **)mem_ptr, num_radix_blocks,
compression_params, lwe_per_glwe, allocate_gpu_memory);
CudaStreams(streams), (int_compression<__uint128_t> **)mem_ptr,
num_radix_blocks, compression_params, lwe_per_glwe, allocate_gpu_memory);
}
uint64_t scratch_cuda_integer_decompress_radix_ciphertext_128(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t compression_glwe_dimension,
uint32_t compression_polynomial_size, uint32_t lwe_dimension,
uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
bool allocate_gpu_memory) {
CudaStreamsFFI streams, int8_t **mem_ptr,
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
uint32_t lwe_dimension, uint32_t num_radix_blocks, uint32_t message_modulus,
uint32_t carry_modulus, bool allocate_gpu_memory) {

// 128-bit decompression doesn't run PBSs, so we don't need encryption_params
int_radix_params compression_params(
@@ -123,50 +112,45 @@ uint64_t scratch_cuda_integer_decompress_radix_ciphertext_128(
PBS_MS_REDUCTION_T::NO_REDUCTION);

return scratch_cuda_integer_decompress_radix_ciphertext<__uint128_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_decompression<__uint128_t> **)mem_ptr, num_radix_blocks,
compression_params, compression_params, allocate_gpu_memory);
CudaStreams(streams), (int_decompression<__uint128_t> **)mem_ptr,
num_radix_blocks, compression_params, compression_params,
allocate_gpu_memory);
}
void cuda_integer_compress_radix_ciphertext_128(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaPackedGlweCiphertextListFFI *glwe_array_out,
CudaStreamsFFI streams, CudaPackedGlweCiphertextListFFI *glwe_array_out,
CudaLweCiphertextListFFI const *lwe_array_in, void *const *fp_ksk,
int8_t *mem_ptr) {

host_integer_compress<__uint128_t>((cudaStream_t *)(streams), gpu_indexes,
gpu_count, glwe_array_out, lwe_array_in,
(__uint128_t *const *)(fp_ksk),
(int_compression<__uint128_t> *)mem_ptr);
host_integer_compress<__uint128_t>(
CudaStreams(streams), glwe_array_out, lwe_array_in,
(__uint128_t *const *)(fp_ksk), (int_compression<__uint128_t> *)mem_ptr);
}
void cuda_integer_decompress_radix_ciphertext_128(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaLweCiphertextListFFI *lwe_array_out,
CudaStreamsFFI streams, CudaLweCiphertextListFFI *lwe_array_out,
CudaPackedGlweCiphertextListFFI const *glwe_in,
uint32_t const *indexes_array, int8_t *mem_ptr) {

host_integer_decompress<__uint128_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out, glwe_in,
indexes_array, nullptr, (int_decompression<__uint128_t> *)mem_ptr);
CudaStreams(streams), lwe_array_out, glwe_in, indexes_array, nullptr,
(int_decompression<__uint128_t> *)mem_ptr);
}
void cleanup_cuda_integer_compress_radix_ciphertext_128(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void cleanup_cuda_integer_compress_radix_ciphertext_128(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {

int_compression<__uint128_t> *mem_ptr =
(int_compression<__uint128_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
mem_ptr->release(CudaStreams(streams));

delete mem_ptr;
*mem_ptr_void = nullptr;
}

void cleanup_cuda_integer_decompress_radix_ciphertext_128(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void) {
CudaStreamsFFI streams, int8_t **mem_ptr_void) {

int_decompression<__uint128_t> *mem_ptr =
(int_decompression<__uint128_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
mem_ptr->release(CudaStreams(streams));

delete mem_ptr;
*mem_ptr_void = nullptr;
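Every FFI entry point in this file now follows the same convention: the raw void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count triple becomes a single CudaStreamsFFI argument, wrapped into a CudaStreams exactly once at the language boundary. A minimal sketch of that pattern with a hypothetical entry point (my_op_64, host_my_op and my_buffer are illustrative names, not part of this commit):

// Hypothetical FFI entry point, sketching the convention used above:
// CudaStreamsFFI only ferries stream/GPU data across the Rust/C boundary,
// and the C++ side wraps it into CudaStreams before doing any work.
void my_op_64(CudaStreamsFFI streams, int8_t *mem_ptr) {
  host_my_op<uint64_t>(CudaStreams(streams),
                       (my_buffer<uint64_t> *)mem_ptr);
}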
@@ -80,8 +80,7 @@ __host__ void host_pack(cudaStream_t stream, uint32_t gpu_index,

template <typename Torus>
__host__ void
host_integer_compress(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count,
host_integer_compress(CudaStreams streams,
CudaPackedGlweCiphertextListFFI *glwe_array_out,
CudaLweCiphertextListFFI const *lwe_array_in,
Torus *const *fp_ksk, int_compression<Torus> *mem_ptr) {
@@ -98,7 +97,7 @@ host_integer_compress(cudaStream_t const *streams, uint32_t const *gpu_indexes,
if constexpr (std::is_same_v<Torus, uint64_t>) {
lwe_pksk_input = mem_ptr->tmp_lwe;
host_cleartext_multiplication<Torus>(
streams[0], gpu_indexes[0], lwe_pksk_input, lwe_array_in,
streams.stream(0), streams.gpu_index(0), lwe_pksk_input, lwe_array_in,
(uint64_t)compression_params.message_modulus);
}

@@ -115,7 +114,7 @@ host_integer_compress(cudaStream_t const *streams, uint32_t const *gpu_indexes,
cuda_memset_async(tmp_glwe_array_out, 0,
num_glwes * (compression_params.glwe_dimension + 1) *
compression_params.polynomial_size * sizeof(Torus),
streams[0], gpu_indexes[0]);
streams.stream(0), streams.gpu_index(0));
auto fp_ks_buffer = mem_ptr->fp_ks_buffer;
auto rem_lwes = glwe_array_out->total_lwe_bodies_count;

@@ -125,8 +124,8 @@ host_integer_compress(cudaStream_t const *streams, uint32_t const *gpu_indexes,
auto chunk_size = min(rem_lwes, glwe_array_out->lwe_per_glwe);

host_packing_keyswitch_lwe_list_to_glwe<Torus>(
streams[0], gpu_indexes[0], glwe_out, lwe_pksk_input, fp_ksk[0],
fp_ks_buffer, compression_params.small_lwe_dimension,
streams.stream(0), streams.gpu_index(0), glwe_out, lwe_pksk_input,
fp_ksk[0], fp_ks_buffer, compression_params.small_lwe_dimension,
compression_params.glwe_dimension, compression_params.polynomial_size,
compression_params.ks_base_log, compression_params.ks_level,
chunk_size);
@@ -141,11 +140,11 @@ host_integer_compress(cudaStream_t const *streams, uint32_t const *gpu_indexes,
compression_params.polynomial_size +
glwe_array_out->total_lwe_bodies_count;

host_modulus_switch_inplace<Torus>(streams[0], gpu_indexes[0],
host_modulus_switch_inplace<Torus>(streams.stream(0), streams.gpu_index(0),
tmp_glwe_array_out, size,
glwe_array_out->storage_log_modulus);

host_pack<Torus>(streams[0], gpu_indexes[0], glwe_array_out,
host_pack<Torus>(streams.stream(0), streams.gpu_index(0), glwe_array_out,
tmp_glwe_array_out, num_glwes, compression_params);
}
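The host-side code no longer indexes raw arrays (streams[0], gpu_indexes[0]) but calls accessors on the wrapper. A plausible shape for those accessors, inferred only from the call sites above (CudaStreamsView is a hypothetical stand-in; the real class carries more state than shown):

#include <cuda_runtime.h>
#include <cstdint>

// Sketch of the accessor surface the call sites above rely on.
struct CudaStreamsView {
  cudaStream_t const *streams;
  uint32_t const *gpu_indexes;
  uint32_t gpu_count;
  cudaStream_t stream(uint32_t i) const { return streams[i]; }
  uint32_t gpu_index(uint32_t i) const { return gpu_indexes[i]; }
  uint32_t count() const { return gpu_count; }
};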
@@ -247,8 +246,7 @@ __host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,

template <typename Torus>
__host__ void
host_integer_decompress(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
host_integer_decompress(CudaStreams streams,
CudaLweCiphertextListFFI *d_lwe_array_out,
CudaPackedGlweCiphertextListFFI const *d_packed_glwe_in,
uint32_t const *h_indexes_array, void *const *d_bsks,
@@ -262,7 +260,7 @@ host_integer_decompress(cudaStream_t const *streams,
auto d_indexes_array = h_mem_ptr->tmp_indexes_array;
cuda_memcpy_async_to_gpu(d_indexes_array, (void *)h_indexes_array,
num_blocks_to_decompress * sizeof(uint32_t),
streams[0], gpu_indexes[0]);
streams.stream(0), streams.gpu_index(0));

auto compression_params = h_mem_ptr->compression_params;
auto lwe_per_glwe = compression_params.polynomial_size;
@@ -276,7 +274,7 @@ host_integer_decompress(cudaStream_t const *streams,

auto current_glwe_index = h_indexes_array[0] / lwe_per_glwe;
auto extracted_glwe = h_mem_ptr->tmp_extracted_glwe;
host_extract<Torus>(streams[0], gpu_indexes[0], extracted_glwe,
host_extract<Torus>(streams.stream(0), streams.gpu_index(0), extracted_glwe,
d_packed_glwe_in, current_glwe_index);
glwe_vec.push_back(std::make_pair(1, extracted_glwe));
for (int i = 1; i < num_blocks_to_decompress; i++) {
@@ -285,8 +283,8 @@ host_integer_decompress(cudaStream_t const *streams,
extracted_glwe += glwe_accumulator_size;
current_glwe_index = glwe_index;
// Extracts a new GLWE
host_extract<Torus>(streams[0], gpu_indexes[0], extracted_glwe,
d_packed_glwe_in, glwe_index);
host_extract<Torus>(streams.stream(0), streams.gpu_index(0),
extracted_glwe, d_packed_glwe_in, glwe_index);
glwe_vec.push_back(std::make_pair(1, extracted_glwe));
} else {
// Updates the quantity
@@ -312,17 +310,17 @@ host_integer_decompress(cudaStream_t const *streams,
extracted_glwe = max_idx_and_glwe.second;

if constexpr (std::is_same_v<Torus, uint64_t>)
cuda_glwe_sample_extract_64(streams[0], gpu_indexes[0], extracted_lwe,
extracted_glwe, d_indexes_array_chunk,
num_lwes, compression_params.polynomial_size,
compression_params.glwe_dimension,
cuda_glwe_sample_extract_64(
streams.stream(0), streams.gpu_index(0), extracted_lwe,
extracted_glwe, d_indexes_array_chunk, num_lwes,
compression_params.polynomial_size, compression_params.glwe_dimension,
compression_params.polynomial_size);
else
// 128 bits
cuda_glwe_sample_extract_128(streams[0], gpu_indexes[0], extracted_lwe,
extracted_glwe, d_indexes_array_chunk,
num_lwes, compression_params.polynomial_size,
compression_params.glwe_dimension,
cuda_glwe_sample_extract_128(
streams.stream(0), streams.gpu_index(0), extracted_lwe,
extracted_glwe, d_indexes_array_chunk, num_lwes,
compression_params.polynomial_size, compression_params.glwe_dimension,
compression_params.polynomial_size);

d_indexes_array_chunk += num_lwes;
@@ -341,13 +339,12 @@ host_integer_decompress(cudaStream_t const *streams,
/// dimension to a big LWE dimension
auto encryption_params = h_mem_ptr->encryption_params;
auto lut = h_mem_ptr->decompression_rescale_lut;
auto active_gpu_count =
get_active_gpu_count(num_blocks_to_decompress, gpu_count);
if (active_gpu_count == 1) {
auto active_streams = streams.active_gpu_subset(num_blocks_to_decompress);
if (active_streams.count() == 1) {
execute_pbs_async<Torus, Torus>(
streams, gpu_indexes, active_gpu_count, (Torus *)d_lwe_array_out->ptr,
lut->lwe_indexes_out, lut->lut_vec, lut->lut_indexes_vec,
extracted_lwe, lut->lwe_indexes_in, d_bsks, nullptr, lut->buffer,
active_streams, (Torus *)d_lwe_array_out->ptr, lut->lwe_indexes_out,
lut->lut_vec, lut->lut_indexes_vec, extracted_lwe,
lut->lwe_indexes_in, d_bsks, nullptr, lut->buffer,
encryption_params.glwe_dimension,
compression_params.small_lwe_dimension,
encryption_params.polynomial_size, encryption_params.pbs_base_log,
@@ -363,25 +360,26 @@ host_integer_decompress(cudaStream_t const *streams,
lut->lwe_trivial_indexes_vec;

/// Make sure all data that should be on GPU 0 is indeed there
cuda_event_record(lut->event_scatter_in, streams[0], gpu_indexes[0]);
for (int j = 1; j < active_gpu_count; j++) {
cuda_stream_wait_event(streams[j], lut->event_scatter_in,
gpu_indexes[j]);
cuda_event_record(lut->event_scatter_in, streams.stream(0),
streams.gpu_index(0));
for (int j = 1; j < active_streams.count(); j++) {
cuda_stream_wait_event(streams.stream(j), lut->event_scatter_in,
streams.gpu_index(j));
}
/// With multiple GPUs we push to the vectors on each GPU then when we
/// gather data to GPU 0 we can copy back to the original indexing
multi_gpu_scatter_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
extracted_lwe, lut->lwe_indexes_in, lut->using_trivial_lwe_indexes,
lut->lwe_aligned_vec, lut->active_gpu_count, num_blocks_to_decompress,
active_streams, lwe_array_in_vec, extracted_lwe, lut->lwe_indexes_in,
lut->using_trivial_lwe_indexes, lut->lwe_aligned_vec,
lut->active_streams.count(), num_blocks_to_decompress,
compression_params.small_lwe_dimension + 1);

/// Apply PBS
execute_pbs_async<Torus, Torus>(
streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
lwe_array_in_vec, lwe_trivial_indexes_vec, d_bsks, nullptr,
lut->buffer, encryption_params.glwe_dimension,
active_streams, lwe_after_pbs_vec, lwe_trivial_indexes_vec,
lut->lut_vec, lut->lut_indexes_vec, lwe_array_in_vec,
lwe_trivial_indexes_vec, d_bsks, nullptr, lut->buffer,
encryption_params.glwe_dimension,
compression_params.small_lwe_dimension,
encryption_params.polynomial_size, encryption_params.pbs_base_log,
encryption_params.pbs_level, encryption_params.grouping_factor,
@@ -390,21 +388,21 @@ host_integer_decompress(cudaStream_t const *streams,

/// Copy data back to GPU 0 and release vecs
multi_gpu_gather_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, (Torus *)d_lwe_array_out->ptr,
lwe_after_pbs_vec, lut->lwe_indexes_out,
lut->using_trivial_lwe_indexes, lut->lwe_aligned_vec,
num_blocks_to_decompress, encryption_params.big_lwe_dimension + 1);
active_streams, (Torus *)d_lwe_array_out->ptr, lwe_after_pbs_vec,
lut->lwe_indexes_out, lut->using_trivial_lwe_indexes,
lut->lwe_aligned_vec, num_blocks_to_decompress,
encryption_params.big_lwe_dimension + 1);

/// Synchronize all GPUs
// other gpus record their events
for (int j = 1; j < active_gpu_count; j++) {
cuda_event_record(lut->event_scatter_out[j], streams[j],
gpu_indexes[j]);
for (int j = 1; j < active_streams.count(); j++) {
cuda_event_record(lut->event_scatter_out[j], active_streams.stream(j),
active_streams.gpu_index(j));
}
// GPU 0 waits for all
for (int j = 1; j < active_gpu_count; j++) {
cuda_stream_wait_event(streams[0], lut->event_scatter_out[j],
gpu_indexes[0]);
for (int j = 1; j < active_streams.count(); j++) {
cuda_stream_wait_event(streams.stream(0), lut->event_scatter_out[j],
streams.gpu_index(0));
}
}
} else {
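Stripped of the LUT plumbing, the multi-GPU branch above is a standard fork/join on events: GPU 0 records an event that every worker stream waits on before the scatter, and each worker records an event that GPU 0 waits on after the gather. A self-contained CUDA sketch of just that skeleton (ev_in, ev_out and n are illustrative placeholders):

#include <cuda_runtime.h>

// Fork/join skeleton mirroring the event usage above; streams[0] is the
// GPU-0 stream and streams[1..n-1] are the worker streams.
void fork_join_sketch(cudaStream_t *streams, cudaEvent_t ev_in,
                      cudaEvent_t *ev_out, int n) {
  // Fork: make GPU 0's pending work visible to every worker stream.
  cudaEventRecord(ev_in, streams[0]);
  for (int j = 1; j < n; j++)
    cudaStreamWaitEvent(streams[j], ev_in, 0);

  // ... scatter, per-GPU PBS and gather would be enqueued here ...

  // Join: GPU 0 resumes only after every worker stream has finished.
  for (int j = 1; j < n; j++)
    cudaEventRecord(ev_out[j], streams[j]);
  for (int j = 1; j < n; j++)
    cudaStreamWaitEvent(streams[0], ev_out[j], 0);
}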
@@ -415,29 +413,27 @@ host_integer_decompress(cudaStream_t const *streams,

template <typename Torus>
__host__ uint64_t scratch_cuda_compress_integer_radix_ciphertext(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_compression<Torus> **mem_ptr,
CudaStreams streams, int_compression<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params compression_params,
uint32_t lwe_per_glwe, bool allocate_gpu_memory) {

uint64_t size_tracker = 0;
*mem_ptr = new int_compression<Torus>(
streams, gpu_indexes, gpu_count, compression_params, num_radix_blocks,
lwe_per_glwe, allocate_gpu_memory, size_tracker);
*mem_ptr = new int_compression<Torus>(streams, compression_params,
num_radix_blocks, lwe_per_glwe,
allocate_gpu_memory, size_tracker);
return size_tracker;
}

template <typename Torus>
__host__ uint64_t scratch_cuda_integer_decompress_radix_ciphertext(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_decompression<Torus> **mem_ptr,
CudaStreams streams, int_decompression<Torus> **mem_ptr,
uint32_t num_blocks_to_decompress, int_radix_params encryption_params,
int_radix_params compression_params, bool allocate_gpu_memory) {

uint64_t size_tracker = 0;
*mem_ptr = new int_decompression<Torus>(
streams, gpu_indexes, gpu_count, encryption_params, compression_params,
num_blocks_to_decompress, allocate_gpu_memory, size_tracker);
streams, encryption_params, compression_params, num_blocks_to_decompress,
allocate_gpu_memory, size_tracker);
return size_tracker;
}
#endif
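Both scratch templates share one shape: allocate the per-operation buffer, thread size_tracker through the constructor, and return the tracked byte count. Read together with the cleanup functions earlier in the diff, the caller-side lifecycle is presumably the following (scratch_op, run_op and cleanup_op are placeholders for a concrete scratch/run/cleanup triple such as the compress entry points above):

uint64_t scratch_op(CudaStreamsFFI streams, int8_t **mem, bool allocate);
void run_op(CudaStreamsFFI streams, int8_t *mem);
void cleanup_op(CudaStreamsFFI streams, int8_t **mem);

// Hedged sketch of the scratch -> run -> cleanup flow.
void lifecycle_sketch(CudaStreamsFFI streams) {
  int8_t *mem = nullptr;
  // The returned size_tracker reports how much device memory the operation
  // needs; presumably allocate_gpu_memory = false turns this into a dry run.
  uint64_t bytes = scratch_op(streams, &mem, /*allocate=*/true);
  (void)bytes;
  run_op(streams, mem);      // the buffer can be reused across many calls
  cleanup_op(streams, &mem); // releases device memory and nulls the pointer
}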
@@ -1,13 +1,12 @@
#include "integer/div_rem.cuh"

uint64_t scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
bool is_signed, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, bool allocate_gpu_memory,
CudaStreamsFFI streams, bool is_signed, int8_t **mem_ptr,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type) {
PUSH_RANGE("scratch div")
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
@@ -16,16 +15,14 @@ uint64_t scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
message_modulus, carry_modulus, noise_reduction_type);

return scratch_cuda_integer_div_rem_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, is_signed,
(int_div_rem_memory<uint64_t> **)mem_ptr, num_blocks, params,
allocate_gpu_memory);
CudaStreams(streams), is_signed, (int_div_rem_memory<uint64_t> **)mem_ptr,
num_blocks, params, allocate_gpu_memory);
POP_RANGE()
}

void cuda_integer_div_rem_radix_ciphertext_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *quotient, CudaRadixCiphertextFFI *remainder,
CudaRadixCiphertextFFI const *numerator,
CudaStreamsFFI streams, CudaRadixCiphertextFFI *quotient,
CudaRadixCiphertextFFI *remainder, CudaRadixCiphertextFFI const *numerator,
CudaRadixCiphertextFFI const *divisor, bool is_signed, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
@@ -33,20 +30,18 @@ void cuda_integer_div_rem_radix_ciphertext_kb_64(
auto mem = (int_div_rem_memory<uint64_t> *)mem_ptr;

host_integer_div_rem_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, quotient, remainder,
numerator, divisor, is_signed, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key, mem);
CudaStreams(streams), quotient, remainder, numerator, divisor, is_signed,
bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem);
POP_RANGE()
}

void cleanup_cuda_integer_div_rem(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count, int8_t **mem_ptr_void) {
void cleanup_cuda_integer_div_rem(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
PUSH_RANGE("cleanup div")
int_div_rem_memory<uint64_t> *mem_ptr =
(int_div_rem_memory<uint64_t> *)(*mem_ptr_void);

mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
*mem_ptr_void = nullptr;
POP_RANGE()
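The div_rem.cuh hunks that follow refactor a bit-serial restoring division over radix ciphertexts. As a clear-integer reference for what the encrypted loop appears to compute (illustrative only, not code from this repository):

#include <cstdint>

// Bit-serial restoring division: shift in one numerator bit per step, try
// to subtract the divisor, and keep the result only when the subtraction
// does not underflow. The encrypted loop below follows the same scheme,
// with the underflow test provided by the overflow flag of
// host_integer_overflowing_sub. Assumes div != 0.
void restoring_div_rem(uint64_t num, uint64_t div, uint64_t &q, uint64_t &r) {
  q = 0;
  r = 0;
  for (int i = 63; i >= 0; i--) {
    r = (r << 1) | ((num >> i) & 1); // pull the next numerator bit
    if (r >= div) {                  // "subtraction did not overflow"
      r -= div;                      // keep the new remainder
      q |= uint64_t(1) << i;         // set the corresponding quotient bit
    }                                // else: restore the old remainder
  }
}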
@@ -21,21 +21,19 @@

template <typename Torus>
__host__ uint64_t scratch_cuda_integer_div_rem_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, bool is_signed, int_div_rem_memory<Torus> **mem_ptr,
CudaStreams streams, bool is_signed, int_div_rem_memory<Torus> **mem_ptr,
uint32_t num_blocks, int_radix_params params, bool allocate_gpu_memory) {

uint64_t size_tracker = 0;
*mem_ptr = new int_div_rem_memory<Torus>(streams, gpu_indexes, gpu_count,
params, is_signed, num_blocks,
*mem_ptr =
new int_div_rem_memory<Torus>(streams, params, is_signed, num_blocks,
allocate_gpu_memory, size_tracker);
return size_tracker;
}

template <typename Torus>
__host__ void host_unsigned_integer_div_rem_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *quotient,
CudaStreams streams, CudaRadixCiphertextFFI *quotient,
CudaRadixCiphertextFFI *remainder, CudaRadixCiphertextFFI const *numerator,
CudaRadixCiphertextFFI const *divisor, void *const *bsks,
uint64_t *const *ksks,
@@ -75,10 +73,10 @@ __host__ void host_unsigned_integer_div_rem_kb(
auto cleaned_merged_interesting_remainder =
mem_ptr->cleaned_merged_interesting_remainder;

copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
numerator_block_stack, numerator);
set_zero_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
quotient, 0, num_blocks);
set_zero_radix_ciphertext_slice_async<Torus>(
streams.stream(0), streams.gpu_index(0), quotient, 0, num_blocks);

for (int i = total_bits - 1; i >= 0; i--) {
uint32_t pos_in_block = i % num_bits_in_message;
@@ -97,17 +95,17 @@ __host__ void host_unsigned_integer_div_rem_kb(
(msb_bit_set + 1) / num_bits_in_message);

copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], interesting_remainder1, 0,
streams.stream(0), streams.gpu_index(0), interesting_remainder1, 0,
first_trivial_block, remainder1, 0, first_trivial_block);
copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], interesting_remainder2, 0,
streams.stream(0), streams.gpu_index(0), interesting_remainder2, 0,
first_trivial_block, remainder2, 0, first_trivial_block);
copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], interesting_divisor, 0, first_trivial_block,
divisor, 0, first_trivial_block);
streams.stream(0), streams.gpu_index(0), interesting_divisor, 0,
first_trivial_block, divisor, 0, first_trivial_block);
if ((msb_bit_set + 1) / num_bits_in_message < num_blocks)
copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], divisor_ms_blocks, 0,
streams.stream(0), streams.gpu_index(0), divisor_ms_blocks, 0,
num_blocks - (msb_bit_set + 1) / num_bits_in_message, divisor,
(msb_bit_set + 1) / num_bits_in_message, num_blocks);

@@ -116,9 +114,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
// msb_bit_set) the split versions share some bits they should not. So we do
// one PBS on the last block of the interesting_divisor, and first block of
// divisor_ms_blocks to trim out bits which should not be there
auto trim_last_interesting_divisor_bits = [&](cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count) {
auto trim_last_interesting_divisor_bits = [&](CudaStreams streams) {
if ((msb_bit_set + 1) % num_bits_in_message == 0) {
return;
}
@@ -149,14 +145,12 @@ __host__ void host_unsigned_integer_div_rem_kb(
interesting_divisor->num_radix_blocks - 1,
interesting_divisor->num_radix_blocks);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, &last_interesting_divisor_block,
streams, &last_interesting_divisor_block,
&last_interesting_divisor_block, bsks, ksks, ms_noise_reduction_key,
mem_ptr->masking_luts_1[shifted_mask], 1);
}; // trim_last_interesting_divisor_bits

auto trim_first_divisor_ms_bits = [&](cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count) {
auto trim_first_divisor_ms_bits = [&](CudaStreams streams) {
if (divisor_ms_blocks->num_radix_blocks == 0 ||
((msb_bit_set + 1) % num_bits_in_message) == 0) {
return;
@@ -178,9 +172,8 @@ __host__ void host_unsigned_integer_div_rem_kb(
shifted_mask = shifted_mask & full_message_mask;

integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, divisor_ms_blocks, divisor_ms_blocks,
bsks, ksks, ms_noise_reduction_key,
mem_ptr->masking_luts_2[shifted_mask], 1);
streams, divisor_ms_blocks, divisor_ms_blocks, bsks, ksks,
ms_noise_reduction_key, mem_ptr->masking_luts_2[shifted_mask], 1);
}; // trim_first_divisor_ms_bits

// This does
@@ -192,75 +185,64 @@ __host__ void host_unsigned_integer_div_rem_kb(
// However, to keep the remainder clean (noise wise), what we do is that we
// put the remainder block from which we need to extract the bit, as the LSB
// of the Remainder, so that left shifting will pull the bit we need.
auto left_shift_interesting_remainder1 = [&](cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count) {
pop_radix_ciphertext_block_async<Torus>(streams[0], gpu_indexes[0],
mem_ptr->numerator_block_1,
auto left_shift_interesting_remainder1 = [&](CudaStreams streams) {
pop_radix_ciphertext_block_async<Torus>(
streams.stream(0), streams.gpu_index(0), mem_ptr->numerator_block_1,
numerator_block_stack);
insert_block_in_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
mem_ptr->numerator_block_1,
insert_block_in_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), mem_ptr->numerator_block_1,
interesting_remainder1, 0);

host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, interesting_remainder1, 1,
mem_ptr->shift_mem_1, bsks, ksks, ms_noise_reduction_key,
interesting_remainder1->num_radix_blocks);
streams, interesting_remainder1, 1, mem_ptr->shift_mem_1, bsks, ksks,
ms_noise_reduction_key, interesting_remainder1->num_radix_blocks);

reset_radix_ciphertext_blocks(mem_ptr->tmp_radix,
interesting_remainder1->num_radix_blocks);
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
mem_ptr->tmp_radix,
copy_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), mem_ptr->tmp_radix,
interesting_remainder1);

host_radix_blocks_rotate_left<Torus>(
streams, gpu_indexes, gpu_count, interesting_remainder1,
mem_ptr->tmp_radix, 1, interesting_remainder1->num_radix_blocks);
streams, interesting_remainder1, mem_ptr->tmp_radix, 1,
interesting_remainder1->num_radix_blocks);

pop_radix_ciphertext_block_async<Torus>(streams[0], gpu_indexes[0],
mem_ptr->numerator_block_1,
pop_radix_ciphertext_block_async<Torus>(
streams.stream(0), streams.gpu_index(0), mem_ptr->numerator_block_1,
interesting_remainder1);

if (pos_in_block != 0) {
// We have not yet extracted all the bits from this numerator
// so, we put it back on the front so that it gets taken next
// iteration
push_block_to_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
mem_ptr->numerator_block_1,
push_block_to_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), mem_ptr->numerator_block_1,
numerator_block_stack);
}
}; // left_shift_interesting_remainder1

auto left_shift_interesting_remainder2 = [&](cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count) {
auto left_shift_interesting_remainder2 = [&](CudaStreams streams) {
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, interesting_remainder2, 1,
mem_ptr->shift_mem_2, bsks, ksks, ms_noise_reduction_key,
interesting_remainder2->num_radix_blocks);
streams, interesting_remainder2, 1, mem_ptr->shift_mem_2, bsks, ksks,
ms_noise_reduction_key, interesting_remainder2->num_radix_blocks);
}; // left_shift_interesting_remainder2

for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
}
streams.synchronize();

// interesting_divisor
trim_last_interesting_divisor_bits(mem_ptr->sub_streams_1, gpu_indexes,
gpu_count);
trim_last_interesting_divisor_bits(mem_ptr->sub_streams_1);
// divisor_ms_blocks
trim_first_divisor_ms_bits(mem_ptr->sub_streams_2, gpu_indexes, gpu_count);
trim_first_divisor_ms_bits(mem_ptr->sub_streams_2);
// interesting_remainder1
// numerator_block_stack
left_shift_interesting_remainder1(mem_ptr->sub_streams_3, gpu_indexes,
gpu_count);
left_shift_interesting_remainder1(mem_ptr->sub_streams_3);
// interesting_remainder2
left_shift_interesting_remainder2(mem_ptr->sub_streams_4, gpu_indexes,
gpu_count);
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_4[j], gpu_indexes[j]);
}
left_shift_interesting_remainder2(mem_ptr->sub_streams_4);

mem_ptr->sub_streams_1.synchronize();
mem_ptr->sub_streams_2.synchronize();
mem_ptr->sub_streams_3.synchronize();
mem_ptr->sub_streams_4.synchronize();
// if interesting_remainder1 != 0 -> interesting_remainder2 == 0
// if interesting_remainder1 == 0 -> interesting_remainder2 != 0
@@ -269,7 +251,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
auto merged_interesting_remainder = interesting_remainder1;

host_addition<Torus>(
streams[0], gpu_indexes[0], merged_interesting_remainder,
streams.stream(0), streams.gpu_index(0), merged_interesting_remainder,
merged_interesting_remainder, interesting_remainder2,
merged_interesting_remainder->num_radix_blocks,
radix_params.message_modulus, radix_params.carry_modulus);
@@ -280,7 +262,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
reset_radix_ciphertext_blocks(
cleaned_merged_interesting_remainder,
merged_interesting_remainder->num_radix_blocks);
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
cleaned_merged_interesting_remainder,
merged_interesting_remainder);

@@ -296,9 +278,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
// fills:
// `new_remainder` - radix ciphertext
// `subtraction_overflowed` - single ciphertext
auto do_overflowing_sub = [&](cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count) {
auto do_overflowing_sub = [&](CudaStreams streams) {
uint32_t compute_borrow = 1;
uint32_t uses_input_borrow = 0;
auto first_indexes =
@@ -311,40 +291,37 @@ __host__ void host_unsigned_integer_div_rem_kb(
mem_ptr->scalars_for_overflow_sub
[merged_interesting_remainder->num_radix_blocks - 1];
mem_ptr->overflow_sub_mem->update_lut_indexes(
streams, gpu_indexes, gpu_count, first_indexes, second_indexes,
scalar_indexes, merged_interesting_remainder->num_radix_blocks);
streams, first_indexes, second_indexes, scalar_indexes,
merged_interesting_remainder->num_radix_blocks);
host_integer_overflowing_sub<uint64_t>(
streams, gpu_indexes, gpu_count, new_remainder,
merged_interesting_remainder, interesting_divisor,
subtraction_overflowed, (const CudaRadixCiphertextFFI *)nullptr,
mem_ptr->overflow_sub_mem, bsks, ksks, ms_noise_reduction_key,
compute_borrow, uses_input_borrow);
streams, new_remainder, merged_interesting_remainder,
interesting_divisor, subtraction_overflowed,
(const CudaRadixCiphertextFFI *)nullptr, mem_ptr->overflow_sub_mem,
bsks, ksks, ms_noise_reduction_key, compute_borrow,
uses_input_borrow);
};

// fills:
// `at_least_one_upper_block_is_non_zero` - single ciphertext
auto check_divisor_upper_blocks = [&](cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count) {
auto check_divisor_upper_blocks = [&](CudaStreams streams) {
auto trivial_blocks = divisor_ms_blocks;
if (trivial_blocks->num_radix_blocks == 0) {
set_zero_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], at_least_one_upper_block_is_non_zero, 0,
1);
streams.stream(0), streams.gpu_index(0),
at_least_one_upper_block_is_non_zero, 0, 1);
} else {

// We could call unchecked_scalar_ne
// But we are in the special case where scalar == 0
// So we can skip some stuff
host_compare_blocks_with_zero<Torus>(
streams, gpu_indexes, gpu_count, mem_ptr->tmp_1, trivial_blocks,
mem_ptr->comparison_buffer, bsks, ksks, ms_noise_reduction_key,
streams, mem_ptr->tmp_1, trivial_blocks, mem_ptr->comparison_buffer,
bsks, ksks, ms_noise_reduction_key,
trivial_blocks->num_radix_blocks,
mem_ptr->comparison_buffer->eq_buffer->is_non_zero_lut);

is_at_least_one_comparisons_block_true<Torus>(
streams, gpu_indexes, gpu_count,
at_least_one_upper_block_is_non_zero, mem_ptr->tmp_1,
streams, at_least_one_upper_block_is_non_zero, mem_ptr->tmp_1,
mem_ptr->comparison_buffer, bsks, ksks, ms_noise_reduction_key,
mem_ptr->tmp_1->num_radix_blocks);
}
@@ -354,56 +331,47 @@ __host__ void host_unsigned_integer_div_rem_kb(
// so that it can be safely used in bivariate PBSes
// fills:
// `cleaned_merged_interesting_remainder` - radix ciphertext
auto create_clean_version_of_merged_remainder =
[&](cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
auto create_clean_version_of_merged_remainder = [&](CudaStreams streams) {
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count,
cleaned_merged_interesting_remainder,
streams, cleaned_merged_interesting_remainder,
cleaned_merged_interesting_remainder, bsks, ksks,
ms_noise_reduction_key, mem_ptr->message_extract_lut_1,
cleaned_merged_interesting_remainder->num_radix_blocks);
};

// phase 2
for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
}
streams.synchronize();
// new_remainder
// subtraction_overflowed
do_overflowing_sub(mem_ptr->sub_streams_1, gpu_indexes, gpu_count);
do_overflowing_sub(mem_ptr->sub_streams_1);
// at_least_one_upper_block_is_non_zero
check_divisor_upper_blocks(mem_ptr->sub_streams_2, gpu_indexes, gpu_count);
check_divisor_upper_blocks(mem_ptr->sub_streams_2);
// cleaned_merged_interesting_remainder
create_clean_version_of_merged_remainder(mem_ptr->sub_streams_3,
gpu_indexes, gpu_count);
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
}
create_clean_version_of_merged_remainder(mem_ptr->sub_streams_3);

mem_ptr->sub_streams_1.synchronize();
mem_ptr->sub_streams_2.synchronize();
mem_ptr->sub_streams_3.synchronize();

host_addition<Torus>(
streams[0], gpu_indexes[0], overflow_sum, subtraction_overflowed,
at_least_one_upper_block_is_non_zero, 1, radix_params.message_modulus,
radix_params.carry_modulus);
streams.stream(0), streams.gpu_index(0), overflow_sum,
subtraction_overflowed, at_least_one_upper_block_is_non_zero, 1,
radix_params.message_modulus, radix_params.carry_modulus);
auto message_modulus = radix_params.message_modulus;
int factor = (i) ? message_modulus - 1 : message_modulus - 2;
int factor_lut_id = (i) ? 1 : 0;
for (size_t k = 0;
k < cleaned_merged_interesting_remainder->num_radix_blocks; k++) {
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
overflow_sum_radix, k, k + 1,
copy_radix_ciphertext_slice_async<Torus>(
streams.stream(0), streams.gpu_index(0), overflow_sum_radix, k, k + 1,
overflow_sum, 0, 1);
}

auto conditionally_zero_out_merged_interesting_remainder =
[&](cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
[&](CudaStreams streams) {
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count,
cleaned_merged_interesting_remainder,
streams, cleaned_merged_interesting_remainder,
cleaned_merged_interesting_remainder, overflow_sum_radix, bsks,
ksks, ms_noise_reduction_key,
mem_ptr->zero_out_if_overflow_did_not_happen[factor_lut_id],
@@ -411,23 +379,20 @@ __host__ void host_unsigned_integer_div_rem_kb(
};

auto conditionally_zero_out_merged_new_remainder =
[&](cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
[&](CudaStreams streams) {
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, new_remainder, new_remainder,
overflow_sum_radix, bsks, ksks, ms_noise_reduction_key,
streams, new_remainder, new_remainder, overflow_sum_radix, bsks,
ksks, ms_noise_reduction_key,
mem_ptr->zero_out_if_overflow_happened[factor_lut_id],
new_remainder->num_radix_blocks, factor);
};

auto set_quotient_bit = [&](cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count) {
auto set_quotient_bit = [&](CudaStreams streams) {
uint32_t block_of_bit = i / num_bits_in_message;
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, mem_ptr->did_not_overflow,
subtraction_overflowed, at_least_one_upper_block_is_non_zero, bsks,
ksks, ms_noise_reduction_key,
streams, mem_ptr->did_not_overflow, subtraction_overflowed,
at_least_one_upper_block_is_non_zero, bsks, ksks,
ms_noise_reduction_key,
mem_ptr->merge_overflow_flags_luts[pos_in_block], 1,
mem_ptr->merge_overflow_flags_luts[pos_in_block]
->params.message_modulus);
@@ -435,28 +400,24 @@ __host__ void host_unsigned_integer_div_rem_kb(
CudaRadixCiphertextFFI quotient_block;
as_radix_ciphertext_slice<Torus>(&quotient_block, quotient, block_of_bit,
block_of_bit + 1);
host_addition<Torus>(streams[0], gpu_indexes[0], &quotient_block,
host_addition<Torus>(
streams.stream(0), streams.gpu_index(0), &quotient_block,
&quotient_block, mem_ptr->did_not_overflow, 1,
radix_params.message_modulus,
radix_params.carry_modulus);
radix_params.message_modulus, radix_params.carry_modulus);
};

for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
}
streams.synchronize();

// cleaned_merged_interesting_remainder
conditionally_zero_out_merged_interesting_remainder(mem_ptr->sub_streams_1,
gpu_indexes, gpu_count);
conditionally_zero_out_merged_interesting_remainder(mem_ptr->sub_streams_1);
// new_remainder
conditionally_zero_out_merged_new_remainder(mem_ptr->sub_streams_2,
gpu_indexes, gpu_count);
conditionally_zero_out_merged_new_remainder(mem_ptr->sub_streams_2);
// quotient
set_quotient_bit(mem_ptr->sub_streams_3, gpu_indexes, gpu_count);
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
}
set_quotient_bit(mem_ptr->sub_streams_3);

mem_ptr->sub_streams_1.synchronize();
mem_ptr->sub_streams_2.synchronize();
mem_ptr->sub_streams_3.synchronize();

if (first_trivial_block !=
cleaned_merged_interesting_remainder->num_radix_blocks)
@@ -467,11 +428,12 @@ __host__ void host_unsigned_integer_div_rem_kb(
"num blocks")

copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], remainder1, 0, first_trivial_block,
cleaned_merged_interesting_remainder, 0, first_trivial_block);
streams.stream(0), streams.gpu_index(0), remainder1, 0,
first_trivial_block, cleaned_merged_interesting_remainder, 0,
first_trivial_block);
copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], remainder2, 0, first_trivial_block,
new_remainder, 0, first_trivial_block);
streams.stream(0), streams.gpu_index(0), remainder2, 0,
first_trivial_block, new_remainder, 0, first_trivial_block);
}

if (remainder1->num_radix_blocks != remainder2->num_radix_blocks)
@@ -480,31 +442,27 @@ __host__ void host_unsigned_integer_div_rem_kb(

// Clean the quotient and remainder
// as even though they have no carries, they are not at nominal noise level
host_addition<Torus>(streams[0], gpu_indexes[0], remainder, remainder1,
remainder2, remainder1->num_radix_blocks,
host_addition<Torus>(streams.stream(0), streams.gpu_index(0), remainder,
remainder1, remainder2, remainder1->num_radix_blocks,
radix_params.message_modulus,
radix_params.carry_modulus);

for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
}
streams.synchronize();

integer_radix_apply_univariate_lookup_table_kb<Torus>(
mem_ptr->sub_streams_1, gpu_indexes, gpu_count, remainder, remainder,
bsks, ksks, ms_noise_reduction_key, mem_ptr->message_extract_lut_1,
num_blocks);
mem_ptr->sub_streams_1, remainder, remainder, bsks, ksks,
ms_noise_reduction_key, mem_ptr->message_extract_lut_1, num_blocks);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
mem_ptr->sub_streams_2, gpu_indexes, gpu_count, quotient, quotient, bsks,
ksks, ms_noise_reduction_key, mem_ptr->message_extract_lut_2, num_blocks);
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
}
mem_ptr->sub_streams_2, quotient, quotient, bsks, ksks,
ms_noise_reduction_key, mem_ptr->message_extract_lut_2, num_blocks);

mem_ptr->sub_streams_1.synchronize();
mem_ptr->sub_streams_2.synchronize();
}
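The repeated pattern in host_unsigned_integer_div_rem_kb is a fork/join over sub-stream sets: synchronize the parent streams, launch independent stages on mem_ptr->sub_streams_i, then synchronize each sub-stream set before the results are recombined. This commit collapses the per-GPU cuda_synchronize_stream loops into one synchronize() per set; schematically (fork_join, Buffer, do_work_a and do_work_b are illustrative, not from the codebase):

// Schematic of the fork/join discipline used above.
template <typename Buffer, typename FnA, typename FnB>
void fork_join(CudaStreams streams, Buffer *mem_ptr, FnA do_work_a,
               FnB do_work_b) {
  streams.synchronize();                // fork: parent work is complete
  do_work_a(mem_ptr->sub_streams_1);    // independent stage on set 1
  do_work_b(mem_ptr->sub_streams_2);    // independent stage on set 2
  mem_ptr->sub_streams_1.synchronize(); // join before recombining results
  mem_ptr->sub_streams_2.synchronize();
}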
template <typename Torus>
__host__ void host_integer_div_rem_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *quotient,
CudaStreams streams, CudaRadixCiphertextFFI *quotient,
CudaRadixCiphertextFFI *remainder, CudaRadixCiphertextFFI const *numerator,
CudaRadixCiphertextFFI const *divisor, bool is_signed, void *const *bsks,
uint64_t *const *ksks,
@@ -526,32 +484,27 @@ __host__ void host_integer_div_rem_kb(
// temporary memory
auto positive_numerator = int_mem_ptr->positive_numerator;
auto positive_divisor = int_mem_ptr->positive_divisor;
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
positive_numerator, numerator);
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
positive_divisor, divisor);

for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
}
streams.synchronize();

host_integer_abs_kb<Torus>(
int_mem_ptr->sub_streams_1, gpu_indexes, int_mem_ptr->active_gpu_count,
positive_numerator, bsks, ksks, ms_noise_reduction_key,
host_integer_abs_kb<Torus>(int_mem_ptr->sub_streams_1, positive_numerator,
bsks, ksks, ms_noise_reduction_key,
int_mem_ptr->abs_mem_1, true);
host_integer_abs_kb<Torus>(int_mem_ptr->sub_streams_2, gpu_indexes,
int_mem_ptr->active_gpu_count, positive_divisor,
host_integer_abs_kb<Torus>(int_mem_ptr->sub_streams_2, positive_divisor,
bsks, ksks, ms_noise_reduction_key,
int_mem_ptr->abs_mem_2, true);
for (uint j = 0; j < int_mem_ptr->active_gpu_count; j++) {
cuda_synchronize_stream(int_mem_ptr->sub_streams_1[j], gpu_indexes[j]);
cuda_synchronize_stream(int_mem_ptr->sub_streams_2[j], gpu_indexes[j]);
}

int_mem_ptr->sub_streams_1.synchronize();
int_mem_ptr->sub_streams_2.synchronize();

host_unsigned_integer_div_rem_kb<Torus>(
int_mem_ptr->sub_streams_1, gpu_indexes, gpu_count, quotient, remainder,
positive_numerator, positive_divisor, bsks, ksks,
ms_noise_reduction_key, int_mem_ptr->unsigned_mem);
int_mem_ptr->sub_streams_1, quotient, remainder, positive_numerator,
positive_divisor, bsks, ksks, ms_noise_reduction_key,
int_mem_ptr->unsigned_mem);

CudaRadixCiphertextFFI numerator_sign;
as_radix_ciphertext_slice<Torus>(&numerator_sign, numerator, num_blocks - 1,
@@ -560,59 +513,51 @@ __host__ void host_integer_div_rem_kb(
as_radix_ciphertext_slice<Torus>(&divisor_sign, divisor, num_blocks - 1,
num_blocks);
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
int_mem_ptr->sub_streams_2, gpu_indexes, gpu_count,
int_mem_ptr->sign_bits_are_different, &numerator_sign, &divisor_sign,
bsks, ksks, ms_noise_reduction_key,
int_mem_ptr->sub_streams_2, int_mem_ptr->sign_bits_are_different,
&numerator_sign, &divisor_sign, bsks, ksks, ms_noise_reduction_key,
int_mem_ptr->compare_signed_bits_lut, 1,
int_mem_ptr->compare_signed_bits_lut->params.message_modulus);

for (uint j = 0; j < int_mem_ptr->active_gpu_count; j++) {
cuda_synchronize_stream(int_mem_ptr->sub_streams_1[j], gpu_indexes[j]);
cuda_synchronize_stream(int_mem_ptr->sub_streams_2[j], gpu_indexes[j]);
}
int_mem_ptr->sub_streams_1.synchronize();
int_mem_ptr->sub_streams_2.synchronize();

host_integer_radix_negation<Torus>(int_mem_ptr->sub_streams_1, gpu_indexes,
gpu_count, int_mem_ptr->negated_quotient,
quotient, radix_params.message_modulus,
radix_params.carry_modulus, num_blocks);
host_integer_radix_negation<Torus>(
int_mem_ptr->sub_streams_1, int_mem_ptr->negated_quotient, quotient,
radix_params.message_modulus, radix_params.carry_modulus, num_blocks);

uint32_t requested_flag = outputFlag::FLAG_NONE;
uint32_t uses_carry = 0;
host_propagate_single_carry<Torus>(
int_mem_ptr->sub_streams_1, gpu_indexes, gpu_count,
int_mem_ptr->negated_quotient, nullptr, nullptr, int_mem_ptr->scp_mem_1,
bsks, ksks, ms_noise_reduction_key, requested_flag, uses_carry);

host_integer_radix_negation<Torus>(
int_mem_ptr->sub_streams_2, gpu_indexes, gpu_count,
int_mem_ptr->negated_remainder, remainder, radix_params.message_modulus,
radix_params.carry_modulus, num_blocks);

host_propagate_single_carry<Torus>(
int_mem_ptr->sub_streams_2, gpu_indexes, gpu_count,
int_mem_ptr->negated_remainder, nullptr, nullptr,
int_mem_ptr->scp_mem_2, bsks, ksks, ms_noise_reduction_key,
int_mem_ptr->sub_streams_1, int_mem_ptr->negated_quotient, nullptr,
nullptr, int_mem_ptr->scp_mem_1, bsks, ksks, ms_noise_reduction_key,
requested_flag, uses_carry);

host_integer_radix_cmux_kb<Torus>(
int_mem_ptr->sub_streams_1, gpu_indexes, gpu_count, quotient,
int_mem_ptr->sign_bits_are_different, int_mem_ptr->negated_quotient,
quotient, int_mem_ptr->cmux_quotient_mem, bsks, ksks,
ms_noise_reduction_key);
host_integer_radix_negation<Torus>(
int_mem_ptr->sub_streams_2, int_mem_ptr->negated_remainder, remainder,
radix_params.message_modulus, radix_params.carry_modulus, num_blocks);

host_propagate_single_carry<Torus>(
int_mem_ptr->sub_streams_2, int_mem_ptr->negated_remainder, nullptr,
nullptr, int_mem_ptr->scp_mem_2, bsks, ksks, ms_noise_reduction_key,
requested_flag, uses_carry);

host_integer_radix_cmux_kb<Torus>(int_mem_ptr->sub_streams_1, quotient,
int_mem_ptr->sign_bits_are_different,
int_mem_ptr->negated_quotient, quotient,
int_mem_ptr->cmux_quotient_mem, bsks,
ksks, ms_noise_reduction_key);

host_integer_radix_cmux_kb<Torus>(
int_mem_ptr->sub_streams_2, gpu_indexes, gpu_count, remainder,
&numerator_sign, int_mem_ptr->negated_remainder, remainder,
int_mem_ptr->sub_streams_2, remainder, &numerator_sign,
int_mem_ptr->negated_remainder, remainder,
int_mem_ptr->cmux_remainder_mem, bsks, ksks, ms_noise_reduction_key);

for (uint j = 0; j < int_mem_ptr->active_gpu_count; j++) {
cuda_synchronize_stream(int_mem_ptr->sub_streams_1[j], gpu_indexes[j]);
cuda_synchronize_stream(int_mem_ptr->sub_streams_2[j], gpu_indexes[j]);
}
int_mem_ptr->sub_streams_1.synchronize();
int_mem_ptr->sub_streams_2.synchronize();
} else {
host_unsigned_integer_div_rem_kb<Torus>(
streams, gpu_indexes, gpu_count, quotient, remainder, numerator,
divisor, bsks, ksks, ms_noise_reduction_key, int_mem_ptr->unsigned_mem);
streams, quotient, remainder, numerator, divisor, bsks, ksks,
ms_noise_reduction_key, int_mem_ptr->unsigned_mem);
}
}
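In the signed branch above, the two cmux calls enforce the usual truncated-division sign rules: the quotient is negated exactly when the operand signs differ, and the remainder takes the numerator's sign. Plain C++ integer division already follows this convention, so it serves as a clear-value reference (illustrative only):

#include <cstdio>

int main() {
  // Truncated division: quotient negated iff signs differ, remainder
  // follows the numerator, so (q * d + r == n) always holds.
  std::printf("%d %d\n", -7 / 2, -7 % 2);   // -3 -1
  std::printf("%d %d\n", 7 / -2, 7 % -2);   // -3 1
  std::printf("%d %d\n", -7 / -2, -7 % -2); // 3 -1
  return 0;
}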
@@ -1,13 +1,12 @@
#include "ilog2.cuh"

uint64_t scratch_integer_count_of_consecutive_bits_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t counter_num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, Direction direction,
BitValue bit_value, bool allocate_gpu_memory,
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t counter_num_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
Direction direction, BitValue bit_value, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type) {

int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
@@ -17,7 +16,7 @@ uint64_t scratch_integer_count_of_consecutive_bits_kb_64(
noise_reduction_type);

return scratch_integer_count_of_consecutive_bits<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, params,
CudaStreams(streams), params,
(int_count_of_consecutive_bits_buffer<uint64_t> **)mem_ptr, num_blocks,
counter_num_blocks, direction, bit_value, allocate_gpu_memory);
}
@@ -28,37 +27,35 @@ uint64_t scratch_integer_count_of_consecutive_bits_kb_64(
// stored in the output ciphertext.
//
void cuda_integer_count_of_consecutive_bits_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *output_ct, CudaRadixCiphertextFFI const *input_ct,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_ct,
CudaRadixCiphertextFFI const *input_ct, int8_t *mem_ptr, void *const *bsks,
void *const *ksks,
const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key) {

host_integer_count_of_consecutive_bits<uint64_t>(
(cudaStream_t *)streams, gpu_indexes, gpu_count, output_ct, input_ct,
CudaStreams(streams), output_ct, input_ct,
(int_count_of_consecutive_bits_buffer<uint64_t> *)mem_ptr, bsks,
(uint64_t **)ksks, ms_noise_reduction_key);
}

void cleanup_cuda_integer_count_of_consecutive_bits_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void) {
CudaStreamsFFI streams, int8_t **mem_ptr_void) {

int_count_of_consecutive_bits_buffer<uint64_t> *mem_ptr =
(int_count_of_consecutive_bits_buffer<uint64_t> *)(*mem_ptr_void);

mem_ptr->release((cudaStream_t *)streams, gpu_indexes, gpu_count);
mem_ptr->release(CudaStreams(streams));

delete mem_ptr;
*mem_ptr_void = nullptr;
}

uint64_t scratch_integer_ilog2_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
uint32_t input_num_blocks, uint32_t counter_num_blocks,
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, uint32_t input_num_blocks, uint32_t counter_num_blocks,
uint32_t num_bits_in_ciphertext, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type) {

@@ -69,9 +66,9 @@ uint64_t scratch_integer_ilog2_kb_64(
noise_reduction_type);

return scratch_integer_ilog2<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, params,
(int_ilog2_buffer<uint64_t> **)mem_ptr, input_num_blocks,
counter_num_blocks, num_bits_in_ciphertext, allocate_gpu_memory);
CudaStreams(streams), params, (int_ilog2_buffer<uint64_t> **)mem_ptr,
input_num_blocks, counter_num_blocks, num_bits_in_ciphertext,
allocate_gpu_memory);
}

// Computes the integer logarithm base 2 of an encrypted integer.
@@ -79,30 +76,27 @@ uint64_t scratch_integer_ilog2_kb_64(
// The result is stored in the output ciphertext.
//
void cuda_integer_ilog2_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *output_ct, CudaRadixCiphertextFFI const *input_ct,
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_ct,
CudaRadixCiphertextFFI const *input_ct,
CudaRadixCiphertextFFI const *trivial_ct_neg_n,
CudaRadixCiphertextFFI const *trivial_ct_2,
CudaRadixCiphertextFFI const *trivial_ct_m_minus_1_block, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key) {

host_integer_ilog2<uint64_t>((cudaStream_t *)streams, gpu_indexes, gpu_count,
output_ct, input_ct, trivial_ct_neg_n,
trivial_ct_2, trivial_ct_m_minus_1_block,
(int_ilog2_buffer<uint64_t> *)mem_ptr, bsks,
host_integer_ilog2<uint64_t>(
CudaStreams(streams), output_ct, input_ct, trivial_ct_neg_n, trivial_ct_2,
trivial_ct_m_minus_1_block, (int_ilog2_buffer<uint64_t> *)mem_ptr, bsks,
(uint64_t **)ksks, ms_noise_reduction_key);
}

void cleanup_cuda_integer_ilog2_kb_64(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
void cleanup_cuda_integer_ilog2_kb_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {

int_ilog2_buffer<uint64_t> *mem_ptr =
(int_ilog2_buffer<uint64_t> *)(*mem_ptr_void);

mem_ptr->release((cudaStream_t *)streams, gpu_indexes, gpu_count);
mem_ptr->release(CudaStreams(streams));

delete mem_ptr;
*mem_ptr_void = nullptr;
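The ilog2 entry points take both a consecutive-bits counter configuration and num_bits_in_ciphertext, which matches the classic identity floor(log2(n)) = (w - 1) - clz(n) for n > 0 of bit width w, where clz is a count of leading zero bits. A clear-value reference model (illustrative, not from the codebase):

#include <cassert>
#include <cstdint>

// floor(log2(n)) via a leading-zero count over a fixed width, the identity
// the encrypted pipeline above appears to rely on. Requires n != 0 and
// n < 2^width.
uint32_t ilog2_ref(uint64_t n, uint32_t width) {
  assert(n != 0 && width >= 1 && width <= 64);
  uint32_t clz = 0;
  for (int i = (int)width - 1; i >= 0 && ((n >> i) & 1) == 0; i--)
    clz++;                  // leading zeros within the given width
  return (width - 1) - clz; // floor(log2(n))
}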
@@ -7,31 +7,29 @@

template <typename Torus>
__host__ void host_integer_prepare_count_of_consecutive_bits(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, CudaRadixCiphertextFFI *ciphertext,
    CudaStreams streams, CudaRadixCiphertextFFI *ciphertext,
    int_prepare_count_of_consecutive_bits_buffer<Torus> *mem_ptr,
    void *const *bsks, Torus *const *ksks,
    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {

  auto tmp = mem_ptr->tmp_ct;

  host_apply_univariate_lut_kb<Torus>(streams, gpu_indexes, gpu_count, tmp,
                                      ciphertext, mem_ptr->univ_lut_mem, ksks,
  host_apply_univariate_lut_kb<Torus>(streams, tmp, ciphertext,
                                      mem_ptr->univ_lut_mem, ksks,
                                      ms_noise_reduction_key, bsks);

  if (mem_ptr->direction == Leading) {
    host_radix_blocks_reverse_inplace<Torus>(streams, gpu_indexes, tmp);
    host_radix_blocks_reverse_inplace<Torus>(streams, tmp);
  }

  host_compute_prefix_sum_hillis_steele<uint64_t>(
      streams, gpu_indexes, gpu_count, ciphertext, tmp, mem_ptr->biv_lut_mem,
      bsks, ksks, ms_noise_reduction_key, ciphertext->num_radix_blocks);
      streams, ciphertext, tmp, mem_ptr->biv_lut_mem, bsks, ksks,
      ms_noise_reduction_key, ciphertext->num_radix_blocks);
}
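For reference, on cleartext data the scan mirrored homomorphically by host_compute_prefix_sum_hillis_steele is the classic log-depth Hillis-Steele inclusive prefix sum. A minimal CPU sketch (illustration only; the real kernel combines encrypted radix blocks through a bivariate LUT rather than '+'):

#include <cstddef>
#include <utility>
#include <vector>

// Log-depth inclusive scan: after the pass with offset d, out[i] holds the
// combination of the last 2*d inputs ending at position i.
std::vector<int> hillis_steele_scan(std::vector<int> v) {
  for (std::size_t d = 1; d < v.size(); d *= 2) {
    std::vector<int> next = v;
    for (std::size_t i = d; i < v.size(); ++i)
      next[i] = v[i] + v[i - d]; // homomorphic version: one bivariate LUT
    v = std::move(next);
  }
  return v; // v[i] = sum of inputs 0..i
}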
template <typename Torus>
__host__ uint64_t scratch_integer_count_of_consecutive_bits(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, const int_radix_params params,
    CudaStreams streams, const int_radix_params params,
    int_count_of_consecutive_bits_buffer<Torus> **mem_ptr,
    uint32_t num_radix_blocks, uint32_t counter_num_blocks, Direction direction,
    BitValue bit_value, const bool allocate_gpu_memory) {
@@ -39,17 +37,15 @@ __host__ uint64_t scratch_integer_count_of_consecutive_bits(
  uint64_t size_tracker = 0;

  *mem_ptr = new int_count_of_consecutive_bits_buffer<Torus>(
      streams, gpu_indexes, gpu_count, params, num_radix_blocks,
      counter_num_blocks, direction, bit_value, allocate_gpu_memory,
      size_tracker);
      streams, params, num_radix_blocks, counter_num_blocks, direction,
      bit_value, allocate_gpu_memory, size_tracker);

  return size_tracker;
}

template <typename Torus>
__host__ void host_integer_count_of_consecutive_bits(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, CudaRadixCiphertextFFI *output_ct,
    CudaStreams streams, CudaRadixCiphertextFFI *output_ct,
    CudaRadixCiphertextFFI const *input_ct,
    int_count_of_consecutive_bits_buffer<Torus> *mem_ptr, void *const *bsks,
    Torus *const *ksks,
@@ -59,13 +55,13 @@ __host__ void host_integer_count_of_consecutive_bits(
  auto ct_prepared = mem_ptr->ct_prepared;
  auto counter_num_blocks = mem_ptr->counter_num_blocks;

  copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], ct_prepared,
                                     input_ct);
  copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
                                     ct_prepared, input_ct);

  // Prepare count of consecutive bits
  //
  host_integer_prepare_count_of_consecutive_bits(
      streams, gpu_indexes, gpu_count, ct_prepared, mem_ptr->prepare_mem, bsks,
  host_integer_prepare_count_of_consecutive_bits(streams, ct_prepared,
                                                 mem_ptr->prepare_mem, bsks,
                                                 ksks, ms_noise_reduction_key);

  // Perform addition and propagation of prepared cts
@@ -75,42 +71,40 @@ __host__ void host_integer_count_of_consecutive_bits(
  for (uint32_t i = 0; i < ct_prepared->num_radix_blocks; ++i) {
    uint32_t output_start_index = i * counter_num_blocks;
    copy_radix_ciphertext_slice_async<Torus>(
        streams[0], gpu_indexes[0], cts, output_start_index,
        streams.stream(0), streams.gpu_index(0), cts, output_start_index,
        output_start_index + 1, ct_prepared, i, i + 1);
  }

  host_integer_partial_sum_ciphertexts_vec_kb<Torus>(
      streams, gpu_indexes, gpu_count, output_ct, cts, bsks, ksks,
      ms_noise_reduction_key, mem_ptr->sum_mem, counter_num_blocks,
      ct_prepared->num_radix_blocks);
      streams, output_ct, cts, bsks, ksks, ms_noise_reduction_key,
      mem_ptr->sum_mem, counter_num_blocks, ct_prepared->num_radix_blocks);

  host_propagate_single_carry<Torus>(streams, gpu_indexes, gpu_count, output_ct,
                                     nullptr, nullptr, mem_ptr->propagate_mem,
                                     bsks, ksks, ms_noise_reduction_key, 0, 0);
  host_propagate_single_carry<Torus>(streams, output_ct, nullptr, nullptr,
                                     mem_ptr->propagate_mem, bsks, ksks,
                                     ms_noise_reduction_key, 0, 0);
}
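The copy loop above lays the per-block counts out for the column-wise partial sum: each one-block count is parked at the head of its own counter_num_blocks-wide, zero-padded slot. The cleartext picture of that layout (illustrative only, not backend code):

#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<uint64_t> spread_counts(const std::vector<uint64_t> &counts,
                                    uint32_t counter_num_blocks) {
  // One zero-padded slot of counter_num_blocks blocks per input block.
  std::vector<uint64_t> slots(counts.size() * counter_num_blocks, 0);
  for (std::size_t i = 0; i < counts.size(); ++i)
    slots[i * counter_num_blocks] = counts[i]; // head of slot i, rest stays 0
  return slots; // summing the slots column-wise yields one wide total
}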
template <typename Torus>
__host__ uint64_t scratch_integer_ilog2(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, const int_radix_params params,
    int_ilog2_buffer<Torus> **mem_ptr, uint32_t input_num_blocks,
    uint32_t counter_num_blocks, uint32_t num_bits_in_ciphertext,
__host__ uint64_t scratch_integer_ilog2(CudaStreams streams,
                                        const int_radix_params params,
                                        int_ilog2_buffer<Torus> **mem_ptr,
                                        uint32_t input_num_blocks,
                                        uint32_t counter_num_blocks,
                                        uint32_t num_bits_in_ciphertext,
                                        const bool allocate_gpu_memory) {

  uint64_t size_tracker = 0;

  *mem_ptr = new int_ilog2_buffer<Torus>(
      streams, gpu_indexes, gpu_count, params, input_num_blocks,
      counter_num_blocks, num_bits_in_ciphertext, allocate_gpu_memory,
      size_tracker);
      streams, params, input_num_blocks, counter_num_blocks,
      num_bits_in_ciphertext, allocate_gpu_memory, size_tracker);

  return size_tracker;
}

template <typename Torus>
__host__ void host_integer_ilog2(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, CudaRadixCiphertextFFI *output_ct,
    CudaStreams streams, CudaRadixCiphertextFFI *output_ct,
    CudaRadixCiphertextFFI const *input_ct,
    CudaRadixCiphertextFFI const *trivial_ct_neg_n,
    CudaRadixCiphertextFFI const *trivial_ct_2,
@@ -121,18 +115,18 @@ __host__ void host_integer_ilog2(
  // Prepare the input ciphertext by computing the number of consecutive
  // leading zeros for each of its blocks.
  //
  copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
  copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
                                     mem_ptr->ct_in_buffer, input_ct);
  host_integer_prepare_count_of_consecutive_bits<Torus>(
      streams, gpu_indexes, gpu_count, mem_ptr->ct_in_buffer,
      mem_ptr->prepare_mem, bsks, ksks, ms_noise_reduction_key);
      streams, mem_ptr->ct_in_buffer, mem_ptr->prepare_mem, bsks, ksks,
      ms_noise_reduction_key);

  // Build the input for the sum by taking each block's leading zero count
  // and placing it into a separate, zero-padded ct slot.
  //
  for (uint32_t i = 0; i < mem_ptr->input_num_blocks; ++i) {
    copy_radix_ciphertext_slice_async<Torus>(
        streams[0], gpu_indexes[0], mem_ptr->sum_input_cts,
        streams.stream(0), streams.gpu_index(0), mem_ptr->sum_input_cts,
        i * mem_ptr->counter_num_blocks, (i * mem_ptr->counter_num_blocks) + 1,
        mem_ptr->ct_in_buffer, i, i + 1);
  }
@@ -145,7 +139,7 @@ __host__ void host_integer_ilog2(
        "num blocks of trivial_ct_neg_n should be equal to counter_num_blocks");
  }
  copy_radix_ciphertext_slice_async<Torus>(
      streams[0], gpu_indexes[0], mem_ptr->sum_input_cts,
      streams.stream(0), streams.gpu_index(0), mem_ptr->sum_input_cts,
      mem_ptr->input_num_blocks * mem_ptr->counter_num_blocks,
      (mem_ptr->input_num_blocks + 1) * mem_ptr->counter_num_blocks,
      trivial_ct_neg_n, 0, trivial_ct_neg_n->num_radix_blocks);
@@ -153,34 +147,31 @@ __host__ void host_integer_ilog2(
  // Perform a partial sum of all the elements without carry propagation.
  //
  host_integer_partial_sum_ciphertexts_vec_kb<Torus>(
      streams, gpu_indexes, gpu_count, mem_ptr->sum_output_not_propagated,
      mem_ptr->sum_input_cts, bsks, ksks, ms_noise_reduction_key,
      mem_ptr->sum_mem, mem_ptr->counter_num_blocks,
      mem_ptr->input_num_blocks + 1);
      streams, mem_ptr->sum_output_not_propagated, mem_ptr->sum_input_cts, bsks,
      ksks, ms_noise_reduction_key, mem_ptr->sum_mem,
      mem_ptr->counter_num_blocks, mem_ptr->input_num_blocks + 1);

  // Apply luts to the partial sum.
  //
  host_apply_univariate_lut_kb<Torus>(
      streams, gpu_indexes, gpu_count, mem_ptr->message_blocks_not,
      mem_ptr->sum_output_not_propagated, mem_ptr->lut_message_not, ksks,
      ms_noise_reduction_key, bsks);
      streams, mem_ptr->message_blocks_not, mem_ptr->sum_output_not_propagated,
      mem_ptr->lut_message_not, ksks, ms_noise_reduction_key, bsks);
  host_apply_univariate_lut_kb<Torus>(
      streams, gpu_indexes, gpu_count, mem_ptr->carry_blocks_not,
      mem_ptr->sum_output_not_propagated, mem_ptr->lut_carry_not, ksks,
      ms_noise_reduction_key, bsks);
      streams, mem_ptr->carry_blocks_not, mem_ptr->sum_output_not_propagated,
      mem_ptr->lut_carry_not, ksks, ms_noise_reduction_key, bsks);

  // Left-shift the bitwise-negated carry blocks by one position.
  //
  copy_radix_ciphertext_slice_async<Torus>(
      streams[0], gpu_indexes[0], mem_ptr->rotated_carry_blocks, 1,
      streams.stream(0), streams.gpu_index(0), mem_ptr->rotated_carry_blocks, 1,
      mem_ptr->counter_num_blocks, mem_ptr->carry_blocks_not, 0,
      mem_ptr->counter_num_blocks - 1);

  // Insert a block of (mod - 1) at the least significant position.
  //
  copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
                                           mem_ptr->rotated_carry_blocks, 0, 1,
                                           trivial_ct_m_minus_1_block, 0, 1);
  copy_radix_ciphertext_slice_async<Torus>(
      streams.stream(0), streams.gpu_index(0), mem_ptr->rotated_carry_blocks, 0,
      1, trivial_ct_m_minus_1_block, 0, 1);

  // Update the degree metadata for the rotated carry blocks.
  //
@@ -190,28 +181,27 @@ __host__ void host_integer_ilog2(
  }

  copy_radix_ciphertext_slice_async<Torus>(
      streams[0], gpu_indexes[0], mem_ptr->sum_input_cts, 0,
      streams.stream(0), streams.gpu_index(0), mem_ptr->sum_input_cts, 0,
      mem_ptr->counter_num_blocks, mem_ptr->message_blocks_not, 0,
      mem_ptr->counter_num_blocks);

  copy_radix_ciphertext_slice_async<Torus>(
      streams[0], gpu_indexes[0], mem_ptr->sum_input_cts,
      streams.stream(0), streams.gpu_index(0), mem_ptr->sum_input_cts,
      mem_ptr->counter_num_blocks, 2 * mem_ptr->counter_num_blocks,
      mem_ptr->rotated_carry_blocks, 0, mem_ptr->counter_num_blocks);

  copy_radix_ciphertext_slice_async<Torus>(
      streams[0], gpu_indexes[0], mem_ptr->sum_input_cts,
      streams.stream(0), streams.gpu_index(0), mem_ptr->sum_input_cts,
      2 * mem_ptr->counter_num_blocks, 3 * mem_ptr->counter_num_blocks,
      trivial_ct_2, 0, mem_ptr->counter_num_blocks);

  host_integer_partial_sum_ciphertexts_vec_kb<Torus>(
      streams, gpu_indexes, gpu_count, output_ct, mem_ptr->sum_input_cts, bsks,
      ksks, ms_noise_reduction_key, mem_ptr->sum_mem,
      mem_ptr->counter_num_blocks, 3);
      streams, output_ct, mem_ptr->sum_input_cts, bsks, ksks,
      ms_noise_reduction_key, mem_ptr->sum_mem, mem_ptr->counter_num_blocks, 3);

  host_full_propagate_inplace<Torus>(
      streams, gpu_indexes, gpu_count, output_ct, mem_ptr->final_propagate_mem,
      ksks, ms_noise_reduction_key, bsks, mem_ptr->counter_num_blocks);
      streams, output_ct, mem_ptr->final_propagate_mem, ksks,
      ms_noise_reduction_key, bsks, mem_ptr->counter_num_blocks);
}

#endif
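Two cleartext identities underpin the arithmetic assembled above (my gloss on the routine, not text from the commit): ilog2 reduces to a leading-zero count, and per-block bitwise NOT is affine, which is what lets a subtraction be realized as a sum of NOT-ed message blocks, shifted NOT-ed carry blocks, and small trivial constants.

\[
  \operatorname{ilog2}(x) = N - 1 - \operatorname{clz}(x)
  \quad \text{for } x > 0 \text{ on } N \text{ bits},
\]
\[
  \lnot a = (B - 1) - a
  \quad \text{for a block } a \text{ with message modulus } B .
\]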
@@ -3,8 +3,8 @@
#include <linear_algebra.h>

void cuda_full_propagation_64_inplace(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    CudaRadixCiphertextFFI *input_blocks, int8_t *mem_ptr, void *const *ksks,
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *input_blocks,
    int8_t *mem_ptr, void *const *ksks,
    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
    void *const *bsks, uint32_t num_blocks) {

@@ -12,17 +12,17 @@ void cuda_full_propagation_64_inplace(
      (int_fullprop_buffer<uint64_t> *)mem_ptr;

  host_full_propagate_inplace<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count, input_blocks, buffer,
      (uint64_t **)(ksks), ms_noise_reduction_key, bsks, num_blocks);
      CudaStreams(streams), input_blocks, buffer, (uint64_t **)(ksks),
      ms_noise_reduction_key, bsks, num_blocks);
}

uint64_t scratch_cuda_full_propagation_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t message_modulus, uint32_t carry_modulus,
    PBS_TYPE pbs_type, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {
  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          glwe_dimension * polynomial_size, lwe_dimension,
                          ks_level, ks_base_log, pbs_level, pbs_base_log,
@@ -30,112 +30,105 @@ uint64_t scratch_cuda_full_propagation_64(
                          noise_reduction_type);

  return scratch_cuda_full_propagation<uint64_t>(
      (cudaStream_t *)streams, gpu_indexes, gpu_count,
      (int_fullprop_buffer<uint64_t> **)mem_ptr, params, allocate_gpu_memory);
      CudaStreams(streams), (int_fullprop_buffer<uint64_t> **)mem_ptr, params,
      allocate_gpu_memory);
}

void cleanup_cuda_full_propagation(void *const *streams,
                                   uint32_t const *gpu_indexes,
                                   uint32_t gpu_count, int8_t **mem_ptr_void) {
void cleanup_cuda_full_propagation(CudaStreamsFFI streams,
                                   int8_t **mem_ptr_void) {

  int_fullprop_buffer<uint64_t> *mem_ptr =
      (int_fullprop_buffer<uint64_t> *)(*mem_ptr_void);

  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
  mem_ptr->release(CudaStreams(streams));
  delete mem_ptr;
  *mem_ptr_void = nullptr;
}
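A detail worth noting about the scratch_* functions in this file: the size tracker is accumulated whether or not allocate_gpu_memory is set, so (under my reading of the pattern, an assumption not stated in this diff) a caller can dry-run scratch to budget device memory before committing to allocations. Toy version of that accounting convention, with placeholder names:

#include <cstdint>

uint64_t toy_scratch_tracked(uint32_t num_blocks, uint32_t block_bytes,
                             bool allocate_gpu_memory,
                             uint64_t &size_tracker) {
  uint64_t needed = uint64_t(num_blocks) * block_bytes;
  size_tracker += needed; // counted in both modes
  if (allocate_gpu_memory) {
    // the real buffers would cudaMalloc/cudaMallocAsync here
  }
  return size_tracker;
}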
uint64_t scratch_cuda_propagate_single_carry_kb_64_inplace(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
    uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t requested_flag,
    uint32_t uses_carry, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {
  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          big_lwe_dimension, small_lwe_dimension, ks_level,
                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
                          message_modulus, carry_modulus, noise_reduction_type);

  return scratch_cuda_propagate_single_carry_kb_inplace<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      (int_sc_prop_memory<uint64_t> **)mem_ptr, num_blocks, params,
      requested_flag, uses_carry, allocate_gpu_memory);
}

uint64_t scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
    uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t requested_flag,
    uint32_t uses_carry, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {
  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          big_lwe_dimension, small_lwe_dimension, ks_level,
                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
                          message_modulus, carry_modulus, noise_reduction_type);

  return scratch_cuda_propagate_single_carry_kb_inplace<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      (int_sc_prop_memory<uint64_t> **)mem_ptr, num_blocks, params,
      requested_flag, uses_carry, allocate_gpu_memory);
}

uint64_t scratch_cuda_integer_overflowing_sub_kb_64_inplace(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
    uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t compute_overflow,
    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
    PBS_TYPE pbs_type, uint32_t requested_flag, uint32_t uses_carry,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          big_lwe_dimension, small_lwe_dimension, ks_level,
                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
                          message_modulus, carry_modulus, noise_reduction_type);

  return scratch_cuda_propagate_single_carry_kb_inplace<uint64_t>(
      CudaStreams(streams), (int_sc_prop_memory<uint64_t> **)mem_ptr,
      num_blocks, params, requested_flag, uses_carry, allocate_gpu_memory);
}

uint64_t scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(
    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
    PBS_TYPE pbs_type, uint32_t requested_flag, uint32_t uses_carry,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          big_lwe_dimension, small_lwe_dimension, ks_level,
                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
                          message_modulus, carry_modulus, noise_reduction_type);

  return scratch_cuda_propagate_single_carry_kb_inplace<uint64_t>(
      CudaStreams(streams), (int_sc_prop_memory<uint64_t> **)mem_ptr,
      num_blocks, params, requested_flag, uses_carry, allocate_gpu_memory);
}

uint64_t scratch_cuda_integer_overflowing_sub_kb_64_inplace(
    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
    PBS_TYPE pbs_type, uint32_t compute_overflow, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {
  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          big_lwe_dimension, small_lwe_dimension, ks_level,
                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
                          message_modulus, carry_modulus, noise_reduction_type);

  return scratch_cuda_integer_overflowing_sub<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      (int_borrow_prop_memory<uint64_t> **)mem_ptr, num_blocks, params,
      compute_overflow, allocate_gpu_memory);
      CudaStreams(streams), (int_borrow_prop_memory<uint64_t> **)mem_ptr,
      num_blocks, params, compute_overflow, allocate_gpu_memory);
}
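Note the design in the scratch wrappers above: propagate and add-and-propagate both delegate to the same scratch_cuda_propagate_single_carry_kb_inplace, and all three rebuild an identical int_radix_params from the same scalar arguments. A hypothetical helper (not in the codebase) that would name that repetition:

#include <cstdint>

struct toy_radix_params { // stand-in for int_radix_params
  uint32_t glwe_dimension, polynomial_size, big_lwe_dimension,
      small_lwe_dimension, ks_level, ks_base_log, pbs_level, pbs_base_log,
      grouping_factor, message_modulus, carry_modulus;
};

// One place to keep the argument order straight instead of three.
toy_radix_params make_radix_params(
    uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension,
    uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
    uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t message_modulus, uint32_t carry_modulus) {
  return {glwe_dimension,      polynomial_size, big_lwe_dimension,
          small_lwe_dimension, ks_level,        ks_base_log,
          pbs_level,           pbs_base_log,    grouping_factor,
          message_modulus,     carry_modulus};
}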
void cuda_propagate_single_carry_kb_64_inplace(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    CudaRadixCiphertextFFI *lwe_array, CudaRadixCiphertextFFI *carry_out,
    const CudaRadixCiphertextFFI *carry_in, int8_t *mem_ptr, void *const *bsks,
    void *const *ksks,
    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
    uint32_t requested_flag, uint32_t uses_carry) {

  host_propagate_single_carry<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array, carry_out,
      carry_in, (int_sc_prop_memory<uint64_t> *)mem_ptr, bsks,
      (uint64_t **)(ksks), ms_noise_reduction_key, requested_flag, uses_carry);
}

void cuda_add_and_propagate_single_carry_kb_64_inplace(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    CudaRadixCiphertextFFI *lhs_array, const CudaRadixCiphertextFFI *rhs_array,
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array,
    CudaRadixCiphertextFFI *carry_out, const CudaRadixCiphertextFFI *carry_in,
    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
    uint32_t requested_flag, uint32_t uses_carry) {

  host_propagate_single_carry<uint64_t>(
      CudaStreams(streams), lwe_array, carry_out, carry_in,
      (int_sc_prop_memory<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
      ms_noise_reduction_key, requested_flag, uses_carry);
}

void cuda_add_and_propagate_single_carry_kb_64_inplace(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lhs_array,
    const CudaRadixCiphertextFFI *rhs_array, CudaRadixCiphertextFFI *carry_out,
    const CudaRadixCiphertextFFI *carry_in, int8_t *mem_ptr, void *const *bsks,
    void *const *ksks,
    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
    uint32_t requested_flag, uint32_t uses_carry) {

  host_add_and_propagate_single_carry<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count, lhs_array, rhs_array,
      carry_out, carry_in, (int_sc_prop_memory<uint64_t> *)mem_ptr, bsks,
      (uint64_t **)(ksks), ms_noise_reduction_key, requested_flag, uses_carry);
      CudaStreams(streams), lhs_array, rhs_array, carry_out, carry_in,
      (int_sc_prop_memory<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
      ms_noise_reduction_key, requested_flag, uses_carry);
}

void cuda_integer_overflowing_sub_kb_64_inplace(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    CudaRadixCiphertextFFI *lhs_array, const CudaRadixCiphertextFFI *rhs_array,
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lhs_array,
    const CudaRadixCiphertextFFI *rhs_array,
    CudaRadixCiphertextFFI *overflow_block,
    const CudaRadixCiphertextFFI *input_borrow, int8_t *mem_ptr,
    void *const *bsks, void *const *ksks,
@@ -143,57 +136,50 @@ void cuda_integer_overflowing_sub_kb_64_inplace(
    uint32_t compute_overflow, uint32_t uses_input_borrow) {
  PUSH_RANGE("overflow sub")
  host_integer_overflowing_sub<uint64_t>(
      (cudaStream_t const *)streams, gpu_indexes, gpu_count, lhs_array,
      lhs_array, rhs_array, overflow_block, input_borrow,
      (int_borrow_prop_memory<uint64_t> *)mem_ptr, bsks, (uint64_t **)ksks,
      ms_noise_reduction_key, compute_overflow, uses_input_borrow);
      CudaStreams(streams), lhs_array, lhs_array, rhs_array, overflow_block,
      input_borrow, (int_borrow_prop_memory<uint64_t> *)mem_ptr, bsks,
      (uint64_t **)ksks, ms_noise_reduction_key, compute_overflow,
      uses_input_borrow);
  POP_RANGE()
}

void cleanup_cuda_propagate_single_carry(void *const *streams,
                                         uint32_t const *gpu_indexes,
                                         uint32_t gpu_count,
void cleanup_cuda_propagate_single_carry(CudaStreamsFFI streams,
                                         int8_t **mem_ptr_void) {
  PUSH_RANGE("cleanup propagate sc")
  int_sc_prop_memory<uint64_t> *mem_ptr =
      (int_sc_prop_memory<uint64_t> *)(*mem_ptr_void);
  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
  mem_ptr->release(CudaStreams(streams));
  delete mem_ptr;
  *mem_ptr_void = nullptr;
  POP_RANGE()
}

void cleanup_cuda_add_and_propagate_single_carry(void *const *streams,
                                                 uint32_t const *gpu_indexes,
                                                 uint32_t gpu_count,
void cleanup_cuda_add_and_propagate_single_carry(CudaStreamsFFI streams,
                                                 int8_t **mem_ptr_void) {
  PUSH_RANGE("cleanup add & propagate sc")
  int_sc_prop_memory<uint64_t> *mem_ptr =
      (int_sc_prop_memory<uint64_t> *)(*mem_ptr_void);
  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
  mem_ptr->release(CudaStreams(streams));
  delete mem_ptr;
  *mem_ptr_void = nullptr;
  POP_RANGE()
}
void cleanup_cuda_integer_overflowing_sub(void *const *streams,
                                          uint32_t const *gpu_indexes,
                                          uint32_t gpu_count,
void cleanup_cuda_integer_overflowing_sub(CudaStreamsFFI streams,
                                          int8_t **mem_ptr_void) {
  PUSH_RANGE("cleanup overflow sub")
  int_borrow_prop_memory<uint64_t> *mem_ptr =
      (int_borrow_prop_memory<uint64_t> *)(*mem_ptr_void);
  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
  mem_ptr->release(CudaStreams(streams));
  delete mem_ptr;
  *mem_ptr_void = nullptr;
  POP_RANGE()
}
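The PUSH_RANGE/POP_RANGE pairs above bracket sections for a timeline profiler. A plausible definition in terms of NVTX, written here as an assumption for illustration (the real macros are defined elsewhere in the tree and may differ):

#include <nvtx3/nvToolsExt.h>

// Semicolons live inside the macros, matching the call sites above, which
// use PUSH_RANGE("...") / POP_RANGE() without a trailing ';'.
#define PUSH_RANGE(name) nvtxRangePushA(name);
#define POP_RANGE() nvtxRangePop();

void traced_section() {
  PUSH_RANGE("example range")
  // ... work here shows up as a named span in Nsight Systems ...
  POP_RANGE()
}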
uint64_t scratch_cuda_apply_univariate_lut_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t num_radix_blocks,
    CudaStreamsFFI streams, int8_t **mem_ptr, void const *input_lut,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
    uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_radix_blocks,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
    uint64_t lut_degree, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {
@@ -204,18 +190,16 @@ uint64_t scratch_cuda_apply_univariate_lut_kb_64(
                          noise_reduction_type);

  return scratch_cuda_apply_univariate_lut_kb<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      (int_radix_lut<uint64_t> **)mem_ptr,
      CudaStreams(streams), (int_radix_lut<uint64_t> **)mem_ptr,
      static_cast<const uint64_t *>(input_lut), num_radix_blocks, params,
      lut_degree, allocate_gpu_memory);
}

uint64_t scratch_cuda_apply_many_univariate_lut_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t num_radix_blocks,
    CudaStreamsFFI streams, int8_t **mem_ptr, void const *input_lut,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
    uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_radix_blocks,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
    uint32_t num_many_lut, uint64_t lut_degree, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {
@@ -226,58 +210,52 @@ uint64_t scratch_cuda_apply_many_univariate_lut_kb_64(
                          noise_reduction_type);

  return scratch_cuda_apply_many_univariate_lut_kb<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      (int_radix_lut<uint64_t> **)mem_ptr,
      CudaStreams(streams), (int_radix_lut<uint64_t> **)mem_ptr,
      static_cast<const uint64_t *>(input_lut), num_radix_blocks, params,
      num_many_lut, lut_degree, allocate_gpu_memory);
}

void cuda_apply_univariate_lut_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    CudaRadixCiphertextFFI *output_radix_lwe,
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
    CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
    void *const *ksks,
    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
    void *const *bsks) {

  host_apply_univariate_lut_kb<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count, output_radix_lwe,
      input_radix_lwe, (int_radix_lut<uint64_t> *)mem_ptr, (uint64_t **)(ksks),
      CudaStreams(streams), output_radix_lwe, input_radix_lwe,
      (int_radix_lut<uint64_t> *)mem_ptr, (uint64_t **)(ksks),
      ms_noise_reduction_key, bsks);
}

void cleanup_cuda_apply_univariate_lut_kb_64(void *const *streams,
                                             uint32_t const *gpu_indexes,
                                             uint32_t gpu_count,
void cleanup_cuda_apply_univariate_lut_kb_64(CudaStreamsFFI streams,
                                             int8_t **mem_ptr_void) {
  PUSH_RANGE("cleanup univar lut")
  int_radix_lut<uint64_t> *mem_ptr = (int_radix_lut<uint64_t> *)(*mem_ptr_void);
  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
  mem_ptr->release(CudaStreams(streams));
  delete mem_ptr;
  *mem_ptr_void = nullptr;
  POP_RANGE()
}
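Cleartext analogue of what the univariate-LUT entry points above compute: the programmable bootstrap evaluates a function on each radix block by table lookup (while also resetting noise). Illustration only, not backend code:

#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<uint64_t>
apply_univariate_lut(const std::vector<uint64_t> &blocks,
                     const std::vector<uint64_t> &lut) {
  std::vector<uint64_t> out(blocks.size());
  for (std::size_t i = 0; i < blocks.size(); ++i)
    out[i] = lut[blocks[i]]; // homomorphically: one keyswitch + PBS per block
  return out;
}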
void cuda_apply_many_univariate_lut_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    CudaRadixCiphertextFFI *output_radix_lwe,
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
    CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
    void *const *ksks,
    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
    void *const *bsks, uint32_t num_many_lut, uint32_t lut_stride) {

  host_apply_many_univariate_lut_kb<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count, output_radix_lwe,
      input_radix_lwe, (int_radix_lut<uint64_t> *)mem_ptr, (uint64_t **)(ksks),
      CudaStreams(streams), output_radix_lwe, input_radix_lwe,
      (int_radix_lut<uint64_t> *)mem_ptr, (uint64_t **)(ksks),
      ms_noise_reduction_key, bsks, num_many_lut, lut_stride);
}

uint64_t scratch_cuda_apply_bivariate_lut_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t num_radix_blocks,
    CudaStreamsFFI streams, int8_t **mem_ptr, void const *input_lut,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
    uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_radix_blocks,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
    uint64_t lut_degree, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {
@@ -288,15 +266,13 @@ uint64_t scratch_cuda_apply_bivariate_lut_kb_64(
                          noise_reduction_type);

  return scratch_cuda_apply_bivariate_lut_kb<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      (int_radix_lut<uint64_t> **)mem_ptr,
      CudaStreams(streams), (int_radix_lut<uint64_t> **)mem_ptr,
      static_cast<const uint64_t *>(input_lut), num_radix_blocks, params,
      lut_degree, allocate_gpu_memory);
}

void cuda_apply_bivariate_lut_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    CudaRadixCiphertextFFI *output_radix_lwe,
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
    CudaRadixCiphertextFFI const *input_radix_lwe_1,
    CudaRadixCiphertextFFI const *input_radix_lwe_2, int8_t *mem_ptr,
    void *const *ksks,
@@ -304,30 +280,27 @@ void cuda_apply_bivariate_lut_kb_64(
    void *const *bsks, uint32_t num_radix_blocks, uint32_t shift) {

  host_apply_bivariate_lut_kb<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count, output_radix_lwe,
      input_radix_lwe_1, input_radix_lwe_2, (int_radix_lut<uint64_t> *)mem_ptr,
      CudaStreams(streams), output_radix_lwe, input_radix_lwe_1,
      input_radix_lwe_2, (int_radix_lut<uint64_t> *)mem_ptr,
      (uint64_t **)(ksks), ms_noise_reduction_key, bsks, num_radix_blocks,
      shift);
}

void cleanup_cuda_apply_bivariate_lut_kb_64(void *const *streams,
                                            uint32_t const *gpu_indexes,
                                            uint32_t gpu_count,
void cleanup_cuda_apply_bivariate_lut_kb_64(CudaStreamsFFI streams,
                                            int8_t **mem_ptr_void) {
  PUSH_RANGE("cleanup bivar lut")
  int_radix_lut<uint64_t> *mem_ptr = (int_radix_lut<uint64_t> *)(*mem_ptr_void);
  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
  mem_ptr->release(CudaStreams(streams));
  delete mem_ptr;
  *mem_ptr_void = nullptr;
  POP_RANGE()
}

uint64_t scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t num_radix_blocks,
    CudaStreamsFFI streams, int8_t **mem_ptr, void const *input_lut,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
    uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_radix_blocks,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
    uint64_t lut_degree, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {
@@ -339,42 +312,36 @@ uint64_t scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(
                          noise_reduction_type);

  return scratch_cuda_apply_bivariate_lut_kb<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      (int_radix_lut<uint64_t> **)mem_ptr,
      CudaStreams(streams), (int_radix_lut<uint64_t> **)mem_ptr,
      static_cast<const uint64_t *>(input_lut), num_radix_blocks, params,
      lut_degree, allocate_gpu_memory);
}

void cuda_integer_compute_prefix_sum_hillis_steele_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    CudaRadixCiphertextFFI *output_radix_lwe,
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
    CudaRadixCiphertextFFI *generates_or_propagates, int8_t *mem_ptr,
    void *const *ksks,
    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
    void *const *bsks, uint32_t num_radix_blocks) {

  host_compute_prefix_sum_hillis_steele<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count, output_radix_lwe,
      generates_or_propagates, (int_radix_lut<uint64_t> *)mem_ptr, bsks,
      (uint64_t **)(ksks), ms_noise_reduction_key, num_radix_blocks);
      CudaStreams(streams), output_radix_lwe, generates_or_propagates,
      (int_radix_lut<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
      ms_noise_reduction_key, num_radix_blocks);
}

void cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr_void) {
    CudaStreamsFFI streams, int8_t **mem_ptr_void) {
  int_radix_lut<uint64_t> *mem_ptr = (int_radix_lut<uint64_t> *)(*mem_ptr_void);
  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
  mem_ptr->release(CudaStreams(streams));
  delete mem_ptr;
  *mem_ptr_void = nullptr;
}

void cuda_integer_reverse_blocks_64_inplace(void *const *streams,
                                            uint32_t const *gpu_indexes,
                                            uint32_t gpu_count,
void cuda_integer_reverse_blocks_64_inplace(CudaStreamsFFI streams,
                                            CudaRadixCiphertextFFI *lwe_array) {

  host_radix_blocks_reverse_inplace<uint64_t>((cudaStream_t *)(streams),
                                              gpu_indexes, lwe_array);
  host_radix_blocks_reverse_inplace<uint64_t>(CudaStreams(streams), lwe_array);
}

void reverseArray(uint64_t arr[], size_t n) {
@@ -395,30 +362,28 @@ void reverseArray(uint64_t arr[], size_t n) {
}

uint64_t scratch_cuda_apply_noise_squashing_mem(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int_radix_params params, int_noise_squashing_lut<uint64_t> **mem_ptr,
    uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t num_radix_blocks, uint32_t original_num_blocks,
    bool allocate_gpu_memory) {
    CudaStreamsFFI streams, int_radix_params params,
    int_noise_squashing_lut<uint64_t> **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t num_radix_blocks,
    uint32_t original_num_blocks, bool allocate_gpu_memory) {
  PUSH_RANGE("scratch noise squashing")
  uint64_t size_tracker = 0;
  *mem_ptr = new int_noise_squashing_lut<uint64_t>(
      (cudaStream_t *)streams, gpu_indexes, gpu_count, params, glwe_dimension,
      polynomial_size, num_radix_blocks, original_num_blocks,
      allocate_gpu_memory, size_tracker);
      CudaStreams(streams), params, glwe_dimension, polynomial_size,
      num_radix_blocks, original_num_blocks, allocate_gpu_memory, size_tracker);
  POP_RANGE()
  return size_tracker;
}

uint64_t scratch_cuda_apply_noise_squashing_kb(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t input_glwe_dimension,
    uint32_t input_polynomial_size, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_radix_blocks, uint32_t original_num_blocks,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t input_glwe_dimension, uint32_t input_polynomial_size,
    uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
    uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_radix_blocks,
    uint32_t original_num_blocks, uint32_t message_modulus,
    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {
  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          glwe_dimension * polynomial_size, lwe_dimension,
                          ks_level, ks_base_log, pbs_level, pbs_base_log,
@@ -426,15 +391,13 @@ uint64_t scratch_cuda_apply_noise_squashing_kb(
                          noise_reduction_type);

  return scratch_cuda_apply_noise_squashing_mem(
      streams, gpu_indexes, gpu_count, params,
      (int_noise_squashing_lut<uint64_t> **)mem_ptr, input_glwe_dimension,
      input_polynomial_size, num_radix_blocks, original_num_blocks,
      allocate_gpu_memory);
      streams, params, (int_noise_squashing_lut<uint64_t> **)mem_ptr,
      input_glwe_dimension, input_polynomial_size, num_radix_blocks,
      original_num_blocks, allocate_gpu_memory);
}

void cuda_apply_noise_squashing_kb(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    CudaRadixCiphertextFFI *output_radix_lwe,
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
    CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
    void *const *ksks,
    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
@@ -442,20 +405,18 @@ void cuda_apply_noise_squashing_kb(

  PUSH_RANGE("apply noise squashing")
  integer_radix_apply_noise_squashing_kb<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count, output_radix_lwe,
      input_radix_lwe, (int_noise_squashing_lut<uint64_t> *)mem_ptr, bsks,
      (uint64_t **)ksks, ms_noise_reduction_key);
      CudaStreams(streams), output_radix_lwe, input_radix_lwe,
      (int_noise_squashing_lut<uint64_t> *)mem_ptr, bsks, (uint64_t **)ksks,
      ms_noise_reduction_key);
  POP_RANGE()
}

void cleanup_cuda_apply_noise_squashing_kb(void *const *streams,
                                           uint32_t const *gpu_indexes,
                                           uint32_t gpu_count,
void cleanup_cuda_apply_noise_squashing_kb(CudaStreamsFFI streams,
                                           int8_t **mem_ptr_void) {
  PUSH_RANGE("cleanup noise squashing")
  int_noise_squashing_lut<uint64_t> *mem_ptr =
      (int_noise_squashing_lut<uint64_t> *)(*mem_ptr_void);
  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
  mem_ptr->release(CudaStreams(streams));
  delete mem_ptr;
  *mem_ptr_void = nullptr;
  POP_RANGE()
File diff suppressed because it is too large
@@ -66,13 +66,13 @@ void generate_ids_update_degrees(uint64_t *terms_degree, size_t *h_lwe_idx_in,
 * the integer radix multiplication in keyswitch->bootstrap order.
 */
uint64_t scratch_cuda_integer_mult_radix_ciphertext_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, bool const is_boolean_left, bool const is_boolean_right,
    uint32_t message_modulus, uint32_t carry_modulus, uint32_t glwe_dimension,
    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t pbs_base_log,
    uint32_t pbs_level, uint32_t ks_base_log, uint32_t ks_level,
    uint32_t grouping_factor, uint32_t num_radix_blocks, PBS_TYPE pbs_type,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
    CudaStreamsFFI streams, int8_t **mem_ptr, bool const is_boolean_left,
    bool const is_boolean_right, uint32_t message_modulus,
    uint32_t carry_modulus, uint32_t glwe_dimension, uint32_t lwe_dimension,
    uint32_t polynomial_size, uint32_t pbs_base_log, uint32_t pbs_level,
    uint32_t ks_base_log, uint32_t ks_level, uint32_t grouping_factor,
    uint32_t num_radix_blocks, PBS_TYPE pbs_type, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {
  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          polynomial_size * glwe_dimension, lwe_dimension,
                          ks_level, ks_base_log, pbs_level, pbs_base_log,
@@ -88,9 +88,9 @@ uint64_t scratch_cuda_integer_mult_radix_ciphertext_kb_64(
  case 8192:
  case 16384:
    return scratch_cuda_integer_mult_radix_ciphertext_kb<uint64_t>(
        (cudaStream_t const *)(streams), gpu_indexes, gpu_count,
        (int_mul_memory<uint64_t> **)mem_ptr, is_boolean_left, is_boolean_right,
        num_radix_blocks, params, allocate_gpu_memory);
        CudaStreams(streams), (int_mul_memory<uint64_t> **)mem_ptr,
        is_boolean_left, is_boolean_right, num_radix_blocks, params,
        allocate_gpu_memory);
  default:
    PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
          "Supported N's are powers of two in the interval [256..16384].")
@@ -125,8 +125,7 @@ uint64_t scratch_cuda_integer_mult_radix_ciphertext_kb_64(
 * - 'pbs_type' selects which PBS implementation should be used
 */
void cuda_integer_mult_radix_ciphertext_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    CudaRadixCiphertextFFI *radix_lwe_out,
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *radix_lwe_out,
    CudaRadixCiphertextFFI const *radix_lwe_left, bool const is_bool_left,
    CudaRadixCiphertextFFI const *radix_lwe_right, bool const is_bool_right,
    void *const *bsks, void *const *ksks,
@@ -136,52 +135,52 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
  switch (polynomial_size) {
  case 256:
    host_integer_mult_radix_kb<uint64_t, AmortizedDegree<256>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
        radix_lwe_left, is_bool_left, radix_lwe_right, is_bool_right, bsks,
        (uint64_t **)(ksks), ms_noise_reduction_key,
        (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
        CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
        radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
        ms_noise_reduction_key, (int_mul_memory<uint64_t> *)mem_ptr,
        num_blocks);
    break;
  case 512:
    host_integer_mult_radix_kb<uint64_t, AmortizedDegree<512>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
        radix_lwe_left, is_bool_left, radix_lwe_right, is_bool_right, bsks,
        (uint64_t **)(ksks), ms_noise_reduction_key,
        (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
        CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
        radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
        ms_noise_reduction_key, (int_mul_memory<uint64_t> *)mem_ptr,
        num_blocks);
    break;
  case 1024:
    host_integer_mult_radix_kb<uint64_t, AmortizedDegree<1024>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
        radix_lwe_left, is_bool_left, radix_lwe_right, is_bool_right, bsks,
        (uint64_t **)(ksks), ms_noise_reduction_key,
        (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
        CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
        radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
        ms_noise_reduction_key, (int_mul_memory<uint64_t> *)mem_ptr,
        num_blocks);
    break;
  case 2048:
    host_integer_mult_radix_kb<uint64_t, AmortizedDegree<2048>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
        radix_lwe_left, is_bool_left, radix_lwe_right, is_bool_right, bsks,
        (uint64_t **)(ksks), ms_noise_reduction_key,
        (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
        CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
        radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
        ms_noise_reduction_key, (int_mul_memory<uint64_t> *)mem_ptr,
        num_blocks);
    break;
  case 4096:
    host_integer_mult_radix_kb<uint64_t, AmortizedDegree<4096>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
        radix_lwe_left, is_bool_left, radix_lwe_right, is_bool_right, bsks,
        (uint64_t **)(ksks), ms_noise_reduction_key,
        (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
        CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
        radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
        ms_noise_reduction_key, (int_mul_memory<uint64_t> *)mem_ptr,
        num_blocks);
    break;
  case 8192:
    host_integer_mult_radix_kb<uint64_t, AmortizedDegree<8192>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
        radix_lwe_left, is_bool_left, radix_lwe_right, is_bool_right, bsks,
        (uint64_t **)(ksks), ms_noise_reduction_key,
        (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
        CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
        radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
        ms_noise_reduction_key, (int_mul_memory<uint64_t> *)mem_ptr,
        num_blocks);
    break;
  case 16384:
    host_integer_mult_radix_kb<uint64_t, AmortizedDegree<16384>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
        radix_lwe_left, is_bool_left, radix_lwe_right, is_bool_right, bsks,
        (uint64_t **)(ksks), ms_noise_reduction_key,
        (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
        CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
        radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
        ms_noise_reduction_key, (int_mul_memory<uint64_t> *)mem_ptr,
        num_blocks);
    break;
  default:
    PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
@@ -190,26 +189,24 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
  POP_RANGE()
}

void cleanup_cuda_integer_mult(void *const *streams,
                               uint32_t const *gpu_indexes, uint32_t gpu_count,
                               int8_t **mem_ptr_void) {
void cleanup_cuda_integer_mult(CudaStreamsFFI streams, int8_t **mem_ptr_void) {
  PUSH_RANGE("cleanup mul")
  int_mul_memory<uint64_t> *mem_ptr =
      (int_mul_memory<uint64_t> *)(*mem_ptr_void);

  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
  mem_ptr->release(CudaStreams(streams));
  delete mem_ptr;
  *mem_ptr_void = nullptr;
  POP_RANGE()
}
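The switch above converts the runtime polynomial_size into a compile-time template argument, so each branch instantiates kernels specialized for its FFT degree. The shape of that dispatch, reduced to a self-contained toy (the AmortizedDegree tags and real kernels are omitted):

#include <cstdint>
#include <cstdio>

template <uint32_t N> void run_specialized() {
  // In the real code this is host_integer_mult_radix_kb<uint64_t,
  // AmortizedDegree<N>>, compiled once per supported N.
  std::printf("running the N=%u instantiation\n", N);
}

void dispatch(uint32_t polynomial_size) {
  switch (polynomial_size) {
  case 256:
    run_specialized<256>();
    break;
  case 512:
    run_specialized<512>();
    break;
  case 1024:
    run_specialized<1024>();
    break;
  default:
    break; // the real code PANICs on unsupported sizes
  }
}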
uint64_t scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t grouping_factor, uint32_t num_blocks_in_radix,
|
||||
uint32_t max_num_radix_in_vec, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
bool reduce_degrees_for_single_carry_propagation, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type) {
|
||||
|
||||
@@ -219,15 +216,14 @@ uint64_t scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
|
||||
grouping_factor, message_modulus, carry_modulus,
|
||||
noise_reduction_type);
|
||||
return scratch_cuda_integer_partial_sum_ciphertexts_vec_kb<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
CudaStreams(streams),
|
||||
(int_sum_ciphertexts_vec_memory<uint64_t> **)mem_ptr, num_blocks_in_radix,
|
||||
max_num_radix_in_vec, reduce_degrees_for_single_carry_propagation, params,
|
||||
allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *radix_lwe_out,
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *radix_lwe_out,
|
||||
CudaRadixCiphertextFFI *radix_lwe_vec, int8_t *mem_ptr, void *const *bsks,
|
||||
void *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
|
||||
@@ -237,19 +233,18 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
|
||||
PANIC("Cuda error: input vector length should be a multiple of the "
|
||||
"output's number of radix blocks")
|
||||
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
|
||||
radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
|
||||
CudaStreams(streams), radix_lwe_out, radix_lwe_vec, bsks,
|
||||
(uint64_t **)(ksks), ms_noise_reduction_key, mem,
|
||||
radix_lwe_out->num_radix_blocks,
|
||||
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
|
||||
}
|
||||
|
||||
void cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
int8_t **mem_ptr_void) {
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr_void) {
|
||||
int_sum_ciphertexts_vec_memory<uint64_t> *mem_ptr =
|
||||
(int_sum_ciphertexts_vec_memory<uint64_t> *)(*mem_ptr_void);
|
||||
|
||||
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
|
||||
mem_ptr->release(CudaStreams(streams));
|
||||
delete mem_ptr;
|
||||
*mem_ptr_void = nullptr;
|
||||
}
|
||||
|
||||
@@ -274,24 +274,22 @@ __global__ void fill_radix_from_lsb_msb(Torus *result_blocks, Torus *lsb_blocks,
|
||||
|
||||
template <typename Torus>
|
||||
__host__ uint64_t scratch_cuda_integer_partial_sum_ciphertexts_vec_kb(
|
||||
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count, int_sum_ciphertexts_vec_memory<Torus> **mem_ptr,
|
||||
CudaStreams streams, int_sum_ciphertexts_vec_memory<Torus> **mem_ptr,
|
||||
uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec,
|
||||
bool reduce_degrees_for_single_carry_propagation, int_radix_params params,
|
||||
bool allocate_gpu_memory) {
|
||||
|
||||
uint64_t size_tracker = 0;
|
||||
*mem_ptr = new int_sum_ciphertexts_vec_memory<Torus>(
|
||||
streams, gpu_indexes, gpu_count, params, num_blocks_in_radix,
|
||||
max_num_radix_in_vec, reduce_degrees_for_single_carry_propagation,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
streams, params, num_blocks_in_radix, max_num_radix_in_vec,
|
||||
reduce_degrees_for_single_carry_propagation, allocate_gpu_memory,
|
||||
size_tracker);
|
||||
return size_tracker;
|
||||
}
|
||||
|
||||
template <typename Torus>
__host__ void host_integer_partial_sum_ciphertexts_vec_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *radix_lwe_out,
CudaStreams streams, CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI *terms, void *const *bsks, uint64_t *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
int_sum_ciphertexts_vec_memory<uint64_t> *mem_ptr,
@@ -335,9 +333,9 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
return;
}
if (num_radix_in_vec == 1) {
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
radix_lwe_out, 0, num_radix_blocks,
terms, 0, num_radix_blocks);
copy_radix_ciphertext_slice_async<Torus>(
streams.stream(0), streams.gpu_index(0), radix_lwe_out, 0,
num_radix_blocks, terms, 0, num_radix_blocks);
return;
}

@@ -345,24 +343,24 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
CudaRadixCiphertextFFI terms_slice;
as_radix_ciphertext_slice<Torus>(&terms_slice, terms, num_radix_blocks,
2 * num_radix_blocks);
host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out, terms,
&terms_slice, num_radix_blocks,
host_addition<Torus>(streams.stream(0), streams.gpu_index(0), radix_lwe_out,
terms, &terms_slice, num_radix_blocks,
mem_ptr->params.message_modulus,
mem_ptr->params.carry_modulus);
return;
}

if (current_blocks != terms) {
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
current_blocks, terms);
}

cuda_memcpy_async_to_gpu(d_degrees, current_blocks->degrees,
total_blocks_in_vec * sizeof(uint64_t), streams[0],
gpu_indexes[0]);
total_blocks_in_vec * sizeof(uint64_t),
streams.stream(0), streams.gpu_index(0));

cuda_set_device(gpu_indexes[0]);
radix_vec_to_columns<<<1, num_radix_blocks, 0, streams[0]>>>(
cuda_set_device(streams.gpu_index(0));
radix_vec_to_columns<<<1, num_radix_blocks, 0, streams.stream(0)>>>(
d_columns, d_columns_counter, d_degrees, num_radix_blocks,
num_radix_in_vec);

@@ -373,19 +371,20 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
int part_count = (big_lwe_size + number_of_threads - 1) / number_of_threads;
const dim3 number_of_blocks_2d(num_radix_blocks, part_count, 1);

mem_ptr->setup_lookup_tables(streams, gpu_indexes, gpu_count,
num_radix_in_vec, current_blocks->degrees);
mem_ptr->setup_lookup_tables(streams, num_radix_in_vec,
current_blocks->degrees);

while (needs_processing) {
auto luts_message_carry = mem_ptr->luts_message_carry;
auto d_pbs_indexes_in = mem_ptr->luts_message_carry->lwe_indexes_in;
auto d_pbs_indexes_out = mem_ptr->luts_message_carry->lwe_indexes_out;
calculate_chunks<Torus>
<<<number_of_blocks_2d, number_of_threads, 0, streams[0]>>>(
<<<number_of_blocks_2d, number_of_threads, 0, streams.stream(0)>>>(
(Torus *)(current_blocks->ptr), d_columns, d_columns_counter,
chunk_size, big_lwe_size);

prepare_new_columns_and_pbs_indexes<<<1, num_radix_blocks, 0, streams[0]>>>(
prepare_new_columns_and_pbs_indexes<<<1, num_radix_blocks, 0,
streams.stream(0)>>>(
d_new_columns, d_new_columns_counter, d_pbs_indexes_in,
d_pbs_indexes_out, luts_message_carry->get_lut_indexes(0, 0), d_columns,
d_columns_counter, chunk_size);
@@ -395,17 +394,18 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
current_columns.next_accumulation(total_ciphertexts, total_messages,
needs_processing);

auto active_gpu_count = get_active_gpu_count(total_ciphertexts, gpu_count);
if (active_gpu_count == 1) {
auto active_streams = streams.active_gpu_subset(total_ciphertexts);

if (active_streams.count() == 1) {
execute_keyswitch_async<Torus>(
streams, gpu_indexes, 1, (Torus *)small_lwe_vector->ptr,
streams.subset_first_gpu(), (Torus *)small_lwe_vector->ptr,
d_pbs_indexes_in, (Torus *)current_blocks->ptr, d_pbs_indexes_in,
ksks, big_lwe_dimension, small_lwe_dimension,
mem_ptr->params.ks_base_log, mem_ptr->params.ks_level,
total_messages);

execute_pbs_async<Torus, Torus>(
streams, gpu_indexes, 1, (Torus *)current_blocks->ptr,
streams.subset_first_gpu(), (Torus *)current_blocks->ptr,
d_pbs_indexes_out, luts_message_carry->lut_vec,
luts_message_carry->lut_indexes_vec, (Torus *)small_lwe_vector->ptr,
d_pbs_indexes_in, bsks, ms_noise_reduction_key,
@@ -417,21 +417,20 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
} else {

// we just need to broadcast the indexes
luts_message_carry->broadcast_lut(streams, gpu_indexes, active_gpu_count,
false);
luts_message_carry->broadcast_lut(active_streams, false);
luts_message_carry->using_trivial_lwe_indexes = false;

integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, current_blocks, current_blocks, bsks,
ksks, ms_noise_reduction_key, luts_message_carry, total_ciphertexts);
streams, current_blocks, current_blocks, bsks, ksks,
ms_noise_reduction_key, luts_message_carry, total_ciphertexts);
}
cuda_set_device(gpu_indexes[0]);
cuda_set_device(streams.gpu_index(0));
std::swap(d_columns, d_new_columns);
std::swap(d_columns_counter, d_new_columns_counter);
}

calculate_final_chunk_into_radix<Torus>
<<<number_of_blocks_2d, number_of_threads, 0, streams[0]>>>(
<<<number_of_blocks_2d, number_of_threads, 0, streams.stream(0)>>>(
(Torus *)(radix_lwe_out->ptr), (Torus *)(current_blocks->ptr),
d_columns, d_columns_counter, chunk_size, big_lwe_size);

@@ -440,26 +439,25 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
auto d_pbs_indexes_in = mem_ptr->luts_message_carry->lwe_indexes_in;
auto d_pbs_indexes_out = mem_ptr->luts_message_carry->lwe_indexes_out;
prepare_final_pbs_indexes<Torus>
<<<1, 2 * num_radix_blocks, 0, streams[0]>>>(
<<<1, 2 * num_radix_blocks, 0, streams.stream(0)>>>(
d_pbs_indexes_in, d_pbs_indexes_out,
luts_message_carry->get_lut_indexes(0, 0), num_radix_blocks);

set_zero_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], current_blocks, num_radix_blocks,
num_radix_blocks + 1);
streams.stream(0), streams.gpu_index(0), current_blocks,
num_radix_blocks, num_radix_blocks + 1);

auto active_gpu_count =
get_active_gpu_count(2 * num_radix_blocks, gpu_count);
auto active_streams = streams.active_gpu_subset(2 * num_radix_blocks);

if (active_gpu_count == 1) {
if (active_streams.count() == 1) {
execute_keyswitch_async<Torus>(
streams, gpu_indexes, 1, (Torus *)small_lwe_vector->ptr,
streams.subset_first_gpu(), (Torus *)small_lwe_vector->ptr,
d_pbs_indexes_in, (Torus *)radix_lwe_out->ptr, d_pbs_indexes_in, ksks,
big_lwe_dimension, small_lwe_dimension, mem_ptr->params.ks_base_log,
mem_ptr->params.ks_level, num_radix_blocks);

execute_pbs_async<Torus, Torus>(
streams, gpu_indexes, 1, (Torus *)current_blocks->ptr,
streams.subset_first_gpu(), (Torus *)current_blocks->ptr,
d_pbs_indexes_out, luts_message_carry->lut_vec,
luts_message_carry->lut_indexes_vec, (Torus *)small_lwe_vector->ptr,
d_pbs_indexes_in, bsks, ms_noise_reduction_key,
@@ -471,24 +469,22 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
} else {
uint32_t num_blocks_in_apply_lut = 2 * num_radix_blocks;
// we just need to broadcast the indexes
luts_message_carry->broadcast_lut(streams, gpu_indexes, active_gpu_count,
false);
luts_message_carry->broadcast_lut(active_streams, false);
luts_message_carry->using_trivial_lwe_indexes = false;

integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, active_gpu_count, current_blocks, radix_lwe_out,
bsks, ksks, ms_noise_reduction_key, luts_message_carry,
num_blocks_in_apply_lut);
active_streams, current_blocks, radix_lwe_out, bsks, ksks,
ms_noise_reduction_key, luts_message_carry, num_blocks_in_apply_lut);
}
calculate_final_degrees(radix_lwe_out->degrees, terms->degrees,
num_radix_blocks, num_radix_in_vec, chunk_size,
mem_ptr->params.message_modulus);
cuda_set_device(gpu_indexes[0]);
cuda_set_device(streams.gpu_index(0));
CudaRadixCiphertextFFI current_blocks_slice;
as_radix_ciphertext_slice<Torus>(&current_blocks_slice, current_blocks,
num_radix_blocks, 2 * num_radix_blocks);

host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out,
host_addition<Torus>(streams.stream(0), streams.gpu_index(0), radix_lwe_out,
current_blocks, &current_blocks_slice,
num_radix_blocks, mem_ptr->params.message_modulus,
mem_ptr->params.carry_modulus);
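
Note the gating pattern that recurs in this function: the old code computed get_active_gpu_count(...) and branched on it, while the new code asks the stream set for an active subset and branches on its size, passing streams.subset_first_gpu() down the single-GPU fast path. A hedged sketch of the two helpers, following the private constructor and the truncated active_gpu_subset visible in the header (subset_first_gpu is an assumption based on its call sites):

// Sketch: narrow the set to the GPUs actually needed for num_radix_blocks
// inputs, reusing the same underlying arrays without taking ownership.
CudaStreams active_gpu_subset(int num_radix_blocks) {
  return CudaStreams(_streams, _gpu_indexes,
                     get_active_gpu_count(num_radix_blocks, _gpu_count));
}
// Sketch: a one-GPU view used by the single-GPU fast paths above.
CudaStreams subset_first_gpu() const {
  return CudaStreams(_streams, _gpu_indexes, 1);
}
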
@@ -497,8 +493,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(

template <typename Torus, class params>
__host__ void host_integer_mult_radix_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *radix_lwe_out,
CudaStreams streams, CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI const *radix_lwe_left, bool const is_bool_left,
CudaRadixCiphertextFFI const *radix_lwe_right, bool const is_bool_right,
void *const *bsks, uint64_t *const *ksks,
@@ -519,18 +514,16 @@ __host__ void host_integer_mult_radix_kb(
int big_lwe_size = big_lwe_dimension + 1;

if (is_bool_right) {
zero_out_if<Torus>(streams, gpu_indexes, gpu_count, radix_lwe_out,
radix_lwe_left, radix_lwe_right, mem_ptr->zero_out_mem,
mem_ptr->zero_out_predicate_lut, bsks, ksks,
ms_noise_reduction_key, num_blocks);
zero_out_if<Torus>(streams, radix_lwe_out, radix_lwe_left, radix_lwe_right,
mem_ptr->zero_out_mem, mem_ptr->zero_out_predicate_lut,
bsks, ksks, ms_noise_reduction_key, num_blocks);
return;
}

if (is_bool_left) {
zero_out_if<Torus>(streams, gpu_indexes, gpu_count, radix_lwe_out,
radix_lwe_right, radix_lwe_left, mem_ptr->zero_out_mem,
mem_ptr->zero_out_predicate_lut, bsks, ksks,
ms_noise_reduction_key, num_blocks);
zero_out_if<Torus>(streams, radix_lwe_out, radix_lwe_right, radix_lwe_left,
mem_ptr->zero_out_mem, mem_ptr->zero_out_predicate_lut,
bsks, ksks, ms_noise_reduction_key, num_blocks);
return;
}

@@ -590,27 +583,27 @@ __host__ void host_integer_mult_radix_kb(
dim3 grid(lsb_vector_block_count, 1, 1);
dim3 thds(params::degree / params::opt, 1, 1);

cuda_set_device(gpu_indexes[0]);
all_shifted_lhs_rhs<Torus, params><<<grid, thds, 0, streams[0]>>>(
cuda_set_device(streams.gpu_index(0));
all_shifted_lhs_rhs<Torus, params><<<grid, thds, 0, streams.stream(0)>>>(
(Torus *)radix_lwe_left->ptr, (Torus *)vector_result_lsb->ptr,
(Torus *)vector_result_msb.ptr, (Torus *)radix_lwe_right->ptr,
(Torus *)vector_lsb_rhs->ptr, (Torus *)vector_msb_rhs.ptr, num_blocks);
check_cuda_error(cudaGetLastError());

integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, block_mul_res, block_mul_res,
vector_result_sb, bsks, ksks, ms_noise_reduction_key, luts_array,
total_block_count, luts_array->params.message_modulus);
streams, block_mul_res, block_mul_res, vector_result_sb, bsks, ksks,
ms_noise_reduction_key, luts_array, total_block_count,
luts_array->params.message_modulus);

vector_result_lsb = block_mul_res;
as_radix_ciphertext_slice<Torus>(&vector_result_msb, block_mul_res,
lsb_vector_block_count,
block_mul_res->num_radix_blocks);

cuda_set_device(gpu_indexes[0]);
cuda_set_device(streams.gpu_index(0));
fill_radix_from_lsb_msb<Torus, params>
<<<num_blocks * num_blocks, params::degree / params::opt, 0,
streams[0]>>>(
streams.stream(0)>>>(
(Torus *)vector_result_sb->ptr, (Torus *)vector_result_lsb->ptr,
(Torus *)vector_result_msb.ptr, big_lwe_size, num_blocks);
check_cuda_error(cudaGetLastError());
@@ -627,31 +620,29 @@ __host__ void host_integer_mult_radix_kb(
terms_degree_msb[i] = (b_id > r_id) ? message_modulus - 2 : 0;
}
host_integer_partial_sum_ciphertexts_vec_kb<Torus>(
streams, gpu_indexes, gpu_count, radix_lwe_out, vector_result_sb, bsks,
ksks, ms_noise_reduction_key, mem_ptr->sum_ciphertexts_mem, num_blocks,
streams, radix_lwe_out, vector_result_sb, bsks, ksks,
ms_noise_reduction_key, mem_ptr->sum_ciphertexts_mem, num_blocks,
2 * num_blocks);

auto scp_mem_ptr = mem_ptr->sc_prop_mem;
uint32_t requested_flag = outputFlag::FLAG_NONE;
uint32_t uses_carry = 0;
host_propagate_single_carry<Torus>(
streams, gpu_indexes, gpu_count, radix_lwe_out, nullptr, nullptr,
scp_mem_ptr, bsks, ksks, ms_noise_reduction_key, requested_flag,
uses_carry);
streams, radix_lwe_out, nullptr, nullptr, scp_mem_ptr, bsks, ksks,
ms_noise_reduction_key, requested_flag, uses_carry);
}

template <typename Torus>
__host__ uint64_t scratch_cuda_integer_mult_radix_ciphertext_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_mul_memory<Torus> **mem_ptr,
CudaStreams streams, int_mul_memory<Torus> **mem_ptr,
bool const is_boolean_left, bool const is_boolean_right,
uint32_t num_radix_blocks, int_radix_params params,
bool allocate_gpu_memory) {
PUSH_RANGE("scratch mul")
uint64_t size_tracker = 0;
*mem_ptr = new int_mul_memory<Torus>(
streams, gpu_indexes, gpu_count, params, is_boolean_left,
is_boolean_right, num_radix_blocks, allocate_gpu_memory, size_tracker);
*mem_ptr = new int_mul_memory<Torus>(streams, params, is_boolean_left,
is_boolean_right, num_radix_blocks,
allocate_gpu_memory, size_tracker);
POP_RANGE()
return size_tracker;
}

@@ -1,12 +1,11 @@
#include "integer/negation.cuh"

void cuda_negate_integer_radix_ciphertext_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *lwe_array_out,
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, uint32_t message_modulus,
uint32_t carry_modulus, uint32_t num_radix_blocks) {

host_integer_radix_negation<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out,
lwe_array_in, message_modulus, carry_modulus, num_radix_blocks);
host_integer_radix_negation<uint64_t>(CudaStreams(streams), lwe_array_out,
lwe_array_in, message_modulus,
carry_modulus, num_radix_blocks);
}

@@ -55,11 +55,10 @@ device_integer_radix_negation(Torus *output, Torus const *input,

template <typename Torus>
__host__ void host_integer_radix_negation(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, uint64_t message_modulus,
uint64_t carry_modulus, uint32_t num_radix_blocks) {
cuda_set_device(gpu_indexes[0]);
cuda_set_device(streams.gpu_index(0));

if (lwe_array_out->num_radix_blocks < num_radix_blocks ||
lwe_array_in->num_radix_blocks < num_radix_blocks)
@@ -86,7 +85,7 @@ __host__ void host_integer_radix_negation(
// this
uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);

device_integer_radix_negation<Torus><<<grid, thds, 0, streams[0]>>>(
device_integer_radix_negation<Torus><<<grid, thds, 0, streams.stream(0)>>>(
static_cast<Torus *>(lwe_array_out->ptr),
static_cast<Torus *>(lwe_array_in->ptr), num_radix_blocks, lwe_dimension,
message_modulus, delta);
@@ -114,24 +113,22 @@ __host__ void host_integer_radix_negation(

template <typename Torus>
__host__ uint64_t scratch_cuda_integer_overflowing_sub_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_overflowing_sub_memory<Torus> **mem_ptr,
CudaStreams streams, int_overflowing_sub_memory<Torus> **mem_ptr,
uint32_t num_blocks, int_radix_params params, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type) {

PUSH_RANGE("scratch overflowing sub")
uint64_t size_tracker = 0;
*mem_ptr = new int_overflowing_sub_memory<Torus>(
streams, gpu_indexes, gpu_count, params, num_blocks, allocate_gpu_memory,
noise_reduction_type, size_tracker);
streams, params, num_blocks, allocate_gpu_memory, noise_reduction_type,
size_tracker);
POP_RANGE()
return size_tracker;
}

template <typename Torus>
__host__ void host_integer_overflowing_sub(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *output,
CudaStreams streams, CudaRadixCiphertextFFI *output,
CudaRadixCiphertextFFI *input_left,
const CudaRadixCiphertextFFI *input_right,
CudaRadixCiphertextFFI *overflow_block,
@@ -162,13 +159,12 @@ __host__ void host_integer_overflowing_sub(
uint32_t grouping_size = num_bits_in_block;
uint32_t num_groups = (num_blocks + grouping_size - 1) / grouping_size;

auto stream = (cudaStream_t *)streams;
host_unchecked_sub_with_correcting_term<Torus>(
stream[0], gpu_indexes[0], output, input_left, input_right, num_blocks,
radix_params.message_modulus, radix_params.carry_modulus);
streams.stream(0), streams.gpu_index(0), output, input_left, input_right,
num_blocks, radix_params.message_modulus, radix_params.carry_modulus);

host_single_borrow_propagate<Torus>(
streams, gpu_indexes, gpu_count, output, overflow_block, input_borrow,
streams, output, overflow_block, input_borrow,
(int_borrow_prop_memory<Torus> *)mem_ptr, bsks, (Torus **)(ksks),
ms_noise_reduction_key, num_groups, compute_overflow, uses_input_borrow);
POP_RANGE()
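
From here on, every C entry point takes CudaStreamsFFI by value and immediately wraps it with CudaStreams(streams) before calling into the host templates. A sketch of the assumed FFI carrier, mirroring the three raw parameters it replaces at the Rust/C boundary (field names are illustrative, not the committed definition):

// Sketch only: plain-old-data carrier for the stream set at the FFI boundary;
// it stands in for the (void *const *streams, uint32_t const *gpu_indexes,
// uint32_t gpu_count) triplet of the old signatures.
struct CudaStreamsFFI {
  void *const *streams;
  uint32_t const *gpu_indexes;
  uint32_t gpu_count;
};
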
@@ -1,14 +1,14 @@
#include "integer/oprf.cuh"

uint64_t scratch_cuda_integer_grouped_oprf_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks_to_process, uint32_t num_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory, uint32_t message_bits_per_block,
uint32_t total_random_bits, PBS_MS_REDUCTION_T noise_reduction_type) {
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks_to_process,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, bool allocate_gpu_memory,
uint32_t message_bits_per_block, uint32_t total_random_bits,
PBS_MS_REDUCTION_T noise_reduction_type) {

int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
glwe_dimension * polynomial_size, lwe_dimension,
@@ -17,33 +17,30 @@ uint64_t scratch_cuda_integer_grouped_oprf_64(
noise_reduction_type);

return scratch_cuda_integer_grouped_oprf<uint64_t>(
(cudaStream_t *)streams, gpu_indexes, gpu_count,
(int_grouped_oprf_memory<uint64_t> **)mem_ptr, params,
num_blocks_to_process, num_blocks, message_bits_per_block,
CudaStreams(streams), (int_grouped_oprf_memory<uint64_t> **)mem_ptr,
params, num_blocks_to_process, num_blocks, message_bits_per_block,
total_random_bits, allocate_gpu_memory);
}

void cuda_integer_grouped_oprf_async_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *radix_lwe_out, const void *seeded_lwe_input,
uint32_t num_blocks_to_process, int8_t *mem, void *const *bsks,
CudaStreamsFFI streams, CudaRadixCiphertextFFI *radix_lwe_out,
const void *seeded_lwe_input, uint32_t num_blocks_to_process, int8_t *mem,
void *const *bsks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {

host_integer_grouped_oprf<uint64_t>(
(cudaStream_t *)streams, gpu_indexes, gpu_count, radix_lwe_out,
(const uint64_t *)seeded_lwe_input, num_blocks_to_process,
(int_grouped_oprf_memory<uint64_t> *)mem, bsks, ms_noise_reduction_key);
CudaStreams(streams), radix_lwe_out, (const uint64_t *)seeded_lwe_input,
num_blocks_to_process, (int_grouped_oprf_memory<uint64_t> *)mem, bsks,
ms_noise_reduction_key);
}

void cleanup_cuda_integer_grouped_oprf_64(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
void cleanup_cuda_integer_grouped_oprf_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {

int_grouped_oprf_memory<uint64_t> *mem_ptr =
(int_grouped_oprf_memory<uint64_t> *)(*mem_ptr_void);

mem_ptr->release((cudaStream_t *)streams, gpu_indexes, gpu_count);
mem_ptr->release(CudaStreams(streams));

delete mem_ptr;
*mem_ptr_void = nullptr;

@@ -6,37 +6,34 @@

template <typename Torus>
uint64_t scratch_cuda_integer_grouped_oprf(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_grouped_oprf_memory<Torus> **mem_ptr,
CudaStreams streams, int_grouped_oprf_memory<Torus> **mem_ptr,
int_radix_params params, uint32_t num_blocks_to_process,
uint32_t num_blocks, uint32_t message_bits_per_block,
uint64_t total_random_bits, bool allocate_gpu_memory) {
uint64_t size_tracker = 0;

*mem_ptr = new int_grouped_oprf_memory<Torus>(
streams, gpu_indexes, gpu_count, params, num_blocks_to_process,
num_blocks, message_bits_per_block, total_random_bits,
allocate_gpu_memory, size_tracker);
streams, params, num_blocks_to_process, num_blocks,
message_bits_per_block, total_random_bits, allocate_gpu_memory,
size_tracker);

return size_tracker;
}

template <typename Torus>
void host_integer_grouped_oprf(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *radix_lwe_out,
CudaStreams streams, CudaRadixCiphertextFFI *radix_lwe_out,
const Torus *seeded_lwe_input, uint32_t num_blocks_to_process,
int_grouped_oprf_memory<Torus> *mem_ptr, void *const *bsks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {

auto active_gpu_count =
get_active_gpu_count(num_blocks_to_process, gpu_count);
auto active_streams = streams.active_gpu_subset(num_blocks_to_process);
auto lut = mem_ptr->luts;

if (active_gpu_count == 1) {
if (active_streams.count() == 1) {
execute_pbs_async<Torus, Torus>(
streams, gpu_indexes, (uint32_t)1, (Torus *)(radix_lwe_out->ptr),
lut->lwe_indexes_out, lut->lut_vec, lut->lut_indexes_vec,
streams, (Torus *)(radix_lwe_out->ptr), lut->lwe_indexes_out,
lut->lut_vec, lut->lut_indexes_vec,
const_cast<Torus *>(seeded_lwe_input), lut->lwe_indexes_in, bsks,
ms_noise_reduction_key, lut->buffer, mem_ptr->params.glwe_dimension,
mem_ptr->params.small_lwe_dimension, mem_ptr->params.polynomial_size,
@@ -48,9 +45,11 @@ void host_integer_grouped_oprf(
std::vector<Torus *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;

cuda_event_record(lut->event_scatter_in, streams[0], gpu_indexes[0]);
for (int j = 1; j < active_gpu_count; j++) {
cuda_stream_wait_event(streams[j], lut->event_scatter_in, gpu_indexes[j]);
cuda_event_record(lut->event_scatter_in, streams.stream(0),
streams.gpu_index(0));
for (int j = 1; j < active_streams.count(); j++) {
cuda_stream_wait_event(streams.stream(j), lut->event_scatter_in,
streams.gpu_index(j));
}

if (!lut->using_trivial_lwe_indexes) {
@@ -58,35 +57,35 @@ void host_integer_grouped_oprf(
}

multi_gpu_scatter_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
seeded_lwe_input, lut->lwe_indexes_in, lut->using_trivial_lwe_indexes,
lut->lwe_aligned_vec, active_gpu_count, num_blocks_to_process,
active_streams, lwe_array_in_vec, seeded_lwe_input, lut->lwe_indexes_in,
lut->using_trivial_lwe_indexes, lut->lwe_aligned_vec,
active_streams.count(), num_blocks_to_process,
mem_ptr->params.small_lwe_dimension + 1);

execute_pbs_async<Torus, Torus>(
streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
lwe_array_in_vec, lwe_trivial_indexes_vec, bsks, ms_noise_reduction_key,
lut->buffer, mem_ptr->params.glwe_dimension,
mem_ptr->params.small_lwe_dimension, mem_ptr->params.polynomial_size,
mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
mem_ptr->params.grouping_factor, num_blocks_to_process,
mem_ptr->params.pbs_type, 1, 0);
active_streams, lwe_after_pbs_vec, lwe_trivial_indexes_vec,
lut->lut_vec, lut->lut_indexes_vec, lwe_array_in_vec,
lwe_trivial_indexes_vec, bsks, ms_noise_reduction_key, lut->buffer,
mem_ptr->params.glwe_dimension, mem_ptr->params.small_lwe_dimension,
mem_ptr->params.polynomial_size, mem_ptr->params.pbs_base_log,
mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor,
num_blocks_to_process, mem_ptr->params.pbs_type, 1, 0);

multi_gpu_gather_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, (Torus *)radix_lwe_out->ptr,
lwe_after_pbs_vec, lut->lwe_indexes_out, lut->using_trivial_lwe_indexes,
active_streams, (Torus *)radix_lwe_out->ptr, lwe_after_pbs_vec,
lut->lwe_indexes_out, lut->using_trivial_lwe_indexes,
lut->lwe_aligned_vec, num_blocks_to_process,
mem_ptr->params.big_lwe_dimension + 1);

// other gpus record their events
for (int j = 1; j < active_gpu_count; j++) {
cuda_event_record(lut->event_scatter_out[j], streams[j], gpu_indexes[j]);
for (int j = 1; j < active_streams.count(); j++) {
cuda_event_record(lut->event_scatter_out[j], streams.stream(j),
streams.gpu_index(j));
}
// GPU 0 waits for all
for (int j = 1; j < active_gpu_count; j++) {
cuda_stream_wait_event(streams[0], lut->event_scatter_out[j],
gpu_indexes[0]);
for (int j = 1; j < active_streams.count(); j++) {
cuda_stream_wait_event(streams.stream(0), lut->event_scatter_out[j],
streams.gpu_index(0));
}
}

@@ -96,9 +95,9 @@ void host_integer_grouped_oprf(
radix_lwe_out->noise_levels[i] = NoiseLevel::NOMINAL;
}

host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out, radix_lwe_out,
mem_ptr->plaintext_corrections, num_blocks_to_process,
mem_ptr->params.message_modulus,
host_addition<Torus>(streams.stream(0), streams.gpu_index(0), radix_lwe_out,
radix_lwe_out, mem_ptr->plaintext_corrections,
num_blocks_to_process, mem_ptr->params.message_modulus,
mem_ptr->params.carry_modulus);
}
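
The multi-GPU branch above keeps its fork/join shape under the new API: GPU 0 records an event that every worker stream waits on before the scatter, then each worker records its own completion event and GPU 0 waits on all of them after the gather, so no full device synchronization is needed. A condensed restatement of that ordering with the accessor API (a sketch of the calls already present above, not new functionality):

// Fork: GPU 0 publishes its pending work, workers order themselves after it.
cuda_event_record(lut->event_scatter_in, streams.stream(0), streams.gpu_index(0));
for (int j = 1; j < active_streams.count(); j++)
  cuda_stream_wait_event(streams.stream(j), lut->event_scatter_in,
                         streams.gpu_index(j));
// ... per-GPU scatter / PBS / gather work ...
// Join: workers publish completion, GPU 0 orders itself after all of them.
for (int j = 1; j < active_streams.count(); j++)
  cuda_event_record(lut->event_scatter_out[j], streams.stream(j),
                    streams.gpu_index(j));
for (int j = 1; j < active_streams.count(); j++)
  cuda_stream_wait_event(streams.stream(0), lut->event_scatter_out[j],
                         streams.gpu_index(0));
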
@@ -1,13 +1,12 @@
#include "integer/scalar_addition.cuh"

void cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *lwe_array, void const *scalar_input,
void const *h_scalar_input, uint32_t num_scalars, uint32_t message_modulus,
uint32_t carry_modulus) {
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array,
void const *scalar_input, void const *h_scalar_input, uint32_t num_scalars,
uint32_t message_modulus, uint32_t carry_modulus) {

host_integer_radix_scalar_addition_inplace<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array,
CudaStreams(streams), lwe_array,
static_cast<const uint64_t *>(scalar_input),
static_cast<const uint64_t *>(h_scalar_input), num_scalars,
message_modulus, carry_modulus);

@@ -7,6 +7,7 @@
#endif

#include "device.h"
#include "helper_multi_gpu.h"
#include "radix_ciphertext.cuh"
#include "utils/kernel_dimensions.cuh"
#include <stdio.h>
@@ -25,14 +26,13 @@ __global__ void device_integer_radix_scalar_addition_inplace(

template <typename Torus>
__host__ void host_integer_radix_scalar_addition_inplace(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array,
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array,
Torus const *scalar_input, Torus const *h_scalar_input,
uint32_t num_scalars, uint32_t message_modulus, uint32_t carry_modulus) {
if (lwe_array->num_radix_blocks < num_scalars)
PANIC("Cuda error: num scalars should be smaller or equal to input num "
"radix blocks")
cuda_set_device(gpu_indexes[0]);
cuda_set_device(streams.gpu_index(0));

// Create a 1-dimensional grid of threads
int num_blocks = 0, num_threads = 0;
@@ -47,9 +47,9 @@ __host__ void host_integer_radix_scalar_addition_inplace(
uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);

device_integer_radix_scalar_addition_inplace<Torus>
<<<grid, thds, 0, streams[0]>>>((Torus *)lwe_array->ptr, scalar_input,
num_scalars, lwe_array->lwe_dimension,
delta);
<<<grid, thds, 0, streams.stream(0)>>>((Torus *)lwe_array->ptr,
scalar_input, num_scalars,
lwe_array->lwe_dimension, delta);
check_cuda_error(cudaGetLastError());
for (uint i = 0; i < num_scalars; i++) {
lwe_array->degrees[i] = lwe_array->degrees[i] + h_scalar_input[i];
@@ -70,10 +70,9 @@ __global__ void device_integer_radix_add_scalar_one_inplace(

template <typename Torus>
__host__ void host_integer_radix_add_scalar_one_inplace(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array,
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array,
uint32_t message_modulus, uint32_t carry_modulus) {
cuda_set_device(gpu_indexes[0]);
cuda_set_device(streams.gpu_index(0));

// Create a 1-dimensional grid of threads
int num_blocks = 0, num_threads = 0;
@@ -88,7 +87,7 @@ __host__ void host_integer_radix_add_scalar_one_inplace(
uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);

device_integer_radix_add_scalar_one_inplace<Torus>
<<<grid, thds, 0, streams[0]>>>((Torus *)lwe_array->ptr,
<<<grid, thds, 0, streams.stream(0)>>>((Torus *)lwe_array->ptr,
lwe_array->num_radix_blocks,
lwe_array->lwe_dimension, delta);
check_cuda_error(cudaGetLastError());
@@ -113,11 +112,10 @@ __global__ void device_integer_radix_scalar_subtraction_inplace(

template <typename Torus>
__host__ void host_integer_radix_scalar_subtraction_inplace(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array, Torus *scalar_input,
CudaStreams streams, Torus *lwe_array, Torus *scalar_input,
uint32_t lwe_dimension, uint32_t input_lwe_ciphertext_count,
uint32_t message_modulus, uint32_t carry_modulus) {
cuda_set_device(gpu_indexes[0]);
cuda_set_device(streams.gpu_index(0));

// Create a 1-dimensional grid of threads
int num_blocks = 0, num_threads = 0;
@@ -132,9 +130,9 @@ __host__ void host_integer_radix_scalar_subtraction_inplace(
uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);

device_integer_radix_scalar_subtraction_inplace<Torus>
<<<grid, thds, 0, streams[0]>>>(lwe_array, scalar_input,
input_lwe_ciphertext_count, lwe_dimension,
delta);
<<<grid, thds, 0, streams.stream(0)>>>(lwe_array, scalar_input,
input_lwe_ciphertext_count,
lwe_dimension, delta);
check_cuda_error(cudaGetLastError());
}
#endif
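
A note on the delta computation repeated in the scalar kernels above: plaintexts are scaled into the most-significant bits of the 64-bit torus, and using 2^63 rather than 2^64 leaves the top bit free as the padding bit, hence delta = 2^63 / (message_modulus * carry_modulus). As a quick check for 2-bit message / 2-bit carry parameters:

// message_modulus = 4, carry_modulus = 4:
// delta = 2^63 / 16 = 2^59, i.e. a message m is encoded as m << 59.
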
@@ -1,16 +1,15 @@
#include "integer/scalar_bitops.cuh"

void cuda_scalar_bitop_integer_radix_ciphertext_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *lwe_array_out,
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_input, void const *clear_blocks,
void const *h_clear_blocks, uint32_t num_clear_blocks, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {

host_integer_radix_scalar_bitop_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out,
lwe_array_input, static_cast<const uint64_t *>(clear_blocks),
CudaStreams(streams), lwe_array_out, lwe_array_input,
static_cast<const uint64_t *>(clear_blocks),
static_cast<const uint64_t *>(h_clear_blocks), num_clear_blocks,
(int_bitop_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key);

@@ -6,8 +6,7 @@

template <typename Torus>
__host__ void host_integer_radix_scalar_bitop_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *output,
CudaStreams streams, CudaRadixCiphertextFFI *output,
CudaRadixCiphertextFFI const *input, Torus const *clear_blocks,
Torus const *h_clear_blocks, uint32_t num_clear_blocks,
int_bitop_buffer<Torus> *mem_ptr, void *const *bsks, Torus *const *ksks,
@@ -23,12 +22,12 @@ __host__ void host_integer_radix_scalar_bitop_kb(

if (num_clear_blocks == 0) {
if (op == SCALAR_BITAND) {
set_zero_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
output, 0, num_radix_blocks);
set_zero_radix_ciphertext_slice_async<Torus>(
streams.stream(0), streams.gpu_index(0), output, 0, num_radix_blocks);
} else {
if (input != output)
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], output,
input);
copy_radix_ciphertext_async<Torus>(streams.stream(0),
streams.gpu_index(0), output, input);
}
} else {
// We have all possible LUTs pre-computed and we use the decomposed scalar
@@ -45,19 +44,19 @@ __host__ void host_integer_radix_scalar_bitop_kb(
input->degrees, num_clear_blocks);
}
cuda_memcpy_async_gpu_to_gpu(lut->get_lut_indexes(0, 0), clear_blocks,
num_clear_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
auto active_gpu_count = get_active_gpu_count(num_clear_blocks, gpu_count);
lut->broadcast_lut(streams, gpu_indexes, active_gpu_count, false);
num_clear_blocks * sizeof(Torus),
streams.stream(0), streams.gpu_index(0));
auto active_streams = streams.active_gpu_subset(num_clear_blocks);
lut->broadcast_lut(active_streams, false);

integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, output, input, bsks, ksks,
ms_noise_reduction_key, lut, num_clear_blocks);
streams, output, input, bsks, ksks, ms_noise_reduction_key, lut,
num_clear_blocks);
memcpy(output->degrees, degrees, num_clear_blocks * sizeof(uint64_t));

if (op == SCALAR_BITAND && num_clear_blocks < num_radix_blocks) {
set_zero_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
output, num_clear_blocks,
set_zero_radix_ciphertext_slice_async<Torus>(
streams.stream(0), streams.gpu_index(0), output, num_clear_blocks,
num_radix_blocks);
}
}

@@ -32,8 +32,7 @@ std::pair<bool, bool> get_invert_flags(COMPARISON_TYPE compare) {
}

void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *lwe_array_out,
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, void const *scalar_blocks,
void const *h_scalar_blocks, int8_t *mem_ptr, void *const *bsks,
void *const *ksks,
@@ -50,9 +49,9 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
case EQ:
case NE:
host_integer_radix_scalar_equality_check_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out,
lwe_array_in, static_cast<const uint64_t *>(scalar_blocks), buffer,
bsks, (uint64_t **)(ksks), ms_noise_reduction_key, num_radix_blocks,
CudaStreams(streams), lwe_array_out, lwe_array_in,
static_cast<const uint64_t *>(scalar_blocks), buffer, bsks,
(uint64_t **)(ksks), ms_noise_reduction_key, num_radix_blocks,
num_scalar_blocks);
break;
case GT:
@@ -63,8 +62,8 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
PANIC("Cuda error (scalar comparisons): the number of radix blocks has "
"to be even or equal to 1.")
host_integer_radix_scalar_difference_check_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out,
lwe_array_in, static_cast<const uint64_t *>(scalar_blocks),
CudaStreams(streams), lwe_array_out, lwe_array_in,
static_cast<const uint64_t *>(scalar_blocks),
static_cast<const uint64_t *>(h_scalar_blocks), buffer,
buffer->diff_buffer->operator_f, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key, num_radix_blocks, num_scalar_blocks);
@@ -75,8 +74,8 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
PANIC("Cuda error (scalar max/min): the number of radix blocks has to be "
"even.")
host_integer_radix_scalar_maxmin_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out,
lwe_array_in, static_cast<const uint64_t *>(scalar_blocks),
CudaStreams(streams), lwe_array_out, lwe_array_in,
static_cast<const uint64_t *>(scalar_blocks),
static_cast<const uint64_t *>(h_scalar_blocks), buffer, bsks,
(uint64_t **)(ksks), ms_noise_reduction_key, num_radix_blocks,
num_scalar_blocks);

@@ -26,8 +26,7 @@ Torus is_x_less_than_y_given_input_borrow(Torus last_x_block,

template <typename Torus>
__host__ void scalar_compare_radix_blocks_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI *lwe_array_in, Torus *scalar_blocks,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks,
@@ -61,36 +60,33 @@ __host__ void scalar_compare_radix_blocks_kb(
// space, so (-1) % (4 * 4) = 15 = 1|1111 We then add one and get 0 = 0|0000

auto subtracted_blocks = mem_ptr->tmp_block_comparisons;
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
subtracted_blocks, lwe_array_in);
// Subtract
// Here we need the true lwe sub, not the one that comes from shortint.
host_integer_radix_scalar_subtraction_inplace<Torus>(
streams, gpu_indexes, gpu_count, (Torus *)subtracted_blocks->ptr,
scalar_blocks, big_lwe_dimension, num_radix_blocks, message_modulus,
carry_modulus);
streams, (Torus *)subtracted_blocks->ptr, scalar_blocks,
big_lwe_dimension, num_radix_blocks, message_modulus, carry_modulus);

// Apply LUT to compare to 0
auto sign_lut = mem_ptr->eq_buffer->is_non_zero_lut;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, subtracted_blocks, bsks,
ksks, ms_noise_reduction_key, sign_lut, num_radix_blocks);
streams, lwe_array_out, subtracted_blocks, bsks, ksks,
ms_noise_reduction_key, sign_lut, num_radix_blocks);

// FIXME: without this sync signed scalar eq tests fail, I don't understand
// the reason
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
// Add one
// Here Lhs can have the following values: (-1) % (message modulus * carry
// modulus), 0, 1 So the output values after the addition will be: 0, 1, 2
host_integer_radix_add_scalar_one_inplace<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, message_modulus,
carry_modulus);
streams, lwe_array_out, message_modulus, carry_modulus);
}

template <typename Torus>
__host__ void integer_radix_unsigned_scalar_difference_check_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, Torus const *scalar_blocks,
Torus const *h_scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
std::function<Torus(Torus)> sign_handler_f, void *const *bsks,
@@ -135,12 +131,11 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
// We only have to compare blocks with zero
// means scalar is zero
host_compare_blocks_with_zero<Torus>(
streams, gpu_indexes, gpu_count, mem_ptr->tmp_lwe_array_out,
lwe_array_in, mem_ptr, bsks, ksks, ms_noise_reduction_key,
num_radix_blocks, mem_ptr->is_zero_lut);
streams, mem_ptr->tmp_lwe_array_out, lwe_array_in, mem_ptr, bsks, ksks,
ms_noise_reduction_key, num_radix_blocks, mem_ptr->is_zero_lut);
are_all_comparisons_block_true<Torus>(
streams, gpu_indexes, gpu_count, mem_ptr->tmp_lwe_array_out,
mem_ptr->tmp_lwe_array_out, mem_ptr, bsks, ksks, ms_noise_reduction_key,
streams, mem_ptr->tmp_lwe_array_out, mem_ptr->tmp_lwe_array_out,
mem_ptr, bsks, ksks, ms_noise_reduction_key,
mem_ptr->tmp_lwe_array_out->num_radix_blocks);

auto scalar_last_leaf_lut_f = [sign_handler_f](Torus x) -> Torus {
@@ -151,16 +146,16 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(

auto lut = mem_ptr->diff_buffer->tree_buffer->tree_last_leaf_scalar_lut;
generate_device_accumulator_with_cpu_prealloc<Torus>(
streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
lut->get_max_degree(0), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, scalar_last_leaf_lut_f, true,
mem_ptr->diff_buffer->tree_buffer->preallocated_h_lut);
auto active_gpu_count = get_active_gpu_count(1, gpu_count);
lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
lut->get_degree(0), lut->get_max_degree(0), glwe_dimension,
polynomial_size, message_modulus, carry_modulus, scalar_last_leaf_lut_f,
true, mem_ptr->diff_buffer->tree_buffer->preallocated_h_lut);
auto active_streams = streams.active_gpu_subset(1);
lut->broadcast_lut(active_streams);

integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out,
mem_ptr->tmp_lwe_array_out, bsks, ksks, ms_noise_reduction_key, lut, 1);
streams, lwe_array_out, mem_ptr->tmp_lwe_array_out, bsks, ksks,
ms_noise_reduction_key, lut, 1);

} else if (num_scalar_blocks < num_radix_blocks) {
// We have to handle both part of the work described above
@@ -185,9 +180,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
auto lsb_streams = mem_ptr->lsb_streams;
auto msb_streams = mem_ptr->msb_streams;

for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
}
streams.synchronize();

//////////////
// lsb
@@ -196,9 +189,9 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
as_radix_ciphertext_slice<Torus>(&rhs, lhs, num_radix_blocks / 2,
lhs->num_radix_blocks);

pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
num_lsb_radix_blocks, message_modulus);
scalar_pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], &rhs,
pack_blocks<Torus>(lsb_streams.stream(0), streams.gpu_index(0), lhs,
lwe_array_in, num_lsb_radix_blocks, message_modulus);
scalar_pack_blocks<Torus>(lsb_streams.stream(0), streams.gpu_index(0), &rhs,
scalar_blocks, num_scalar_blocks,
message_modulus);

@@ -213,31 +206,26 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(

auto comparisons = mem_ptr->tmp_block_comparisons;
scalar_compare_radix_blocks_kb<Torus>(
lsb_streams, gpu_indexes, gpu_count, comparisons,
diff_buffer->tmp_packed, (Torus *)rhs.ptr, mem_ptr, bsks, ksks,
ms_noise_reduction_key, num_lsb_radix_blocks);
lsb_streams, comparisons, diff_buffer->tmp_packed, (Torus *)rhs.ptr,
mem_ptr, bsks, ksks, ms_noise_reduction_key, num_lsb_radix_blocks);

// Reduces a vec containing radix blocks that encrypts a sign
// (inferior, equal, superior) to one single radix block containing the
// final sign
tree_sign_reduction<Torus>(
lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out, comparisons,
mem_ptr->diff_buffer->tree_buffer, mem_ptr->identity_lut_f, bsks, ksks,
tree_sign_reduction<Torus>(lsb_streams, lwe_array_lsb_out, comparisons,
mem_ptr->diff_buffer->tree_buffer,
mem_ptr->identity_lut_f, bsks, ksks,
ms_noise_reduction_key, num_lsb_radix_blocks);
//////////////
// msb
host_compare_blocks_with_zero<Torus>(
msb_streams, gpu_indexes, gpu_count, &lwe_array_msb_out, &msb, mem_ptr,
bsks, ksks, ms_noise_reduction_key, num_msb_radix_blocks,
mem_ptr->is_zero_lut);
msb_streams, &lwe_array_msb_out, &msb, mem_ptr, bsks, ksks,
ms_noise_reduction_key, num_msb_radix_blocks, mem_ptr->is_zero_lut);
are_all_comparisons_block_true<Torus>(
msb_streams, gpu_indexes, gpu_count, &lwe_array_msb_out,
&lwe_array_msb_out, mem_ptr, bsks, ksks, ms_noise_reduction_key,
lwe_array_msb_out.num_radix_blocks);
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]);
cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]);
}
msb_streams, &lwe_array_msb_out, &lwe_array_msb_out, mem_ptr, bsks,
ksks, ms_noise_reduction_key, lwe_array_msb_out.num_radix_blocks);
lsb_streams.synchronize();
msb_streams.synchronize();

//////////////
// Reduce the two blocks into one final
@@ -252,17 +240,17 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(

auto lut = diff_buffer->tree_buffer->tree_last_leaf_scalar_lut;
generate_device_accumulator_bivariate_with_cpu_prealloc<Torus>(
streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
lut->get_max_degree(0), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, scalar_bivariate_last_leaf_lut_f, true,
streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
lut->get_degree(0), lut->get_max_degree(0), glwe_dimension,
polynomial_size, message_modulus, carry_modulus,
scalar_bivariate_last_leaf_lut_f, true,
mem_ptr->diff_buffer->tree_buffer->preallocated_h_lut);
auto active_gpu_count = get_active_gpu_count(1, gpu_count);
lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
auto active_streams = streams.active_gpu_subset(1);
lut->broadcast_lut(active_streams);

integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_lsb_out,
&lwe_array_msb_out, bsks, ksks, ms_noise_reduction_key, lut, 1,
lut->params.message_modulus);
streams, lwe_array_out, lwe_array_lsb_out, &lwe_array_msb_out, bsks,
ksks, ms_noise_reduction_key, lut, 1, lut->params.message_modulus);

} else {
if (num_radix_blocks == 1) {
@@ -282,22 +270,22 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
return (Torus)(invert_flags.second ^ overflowed);
};
uint64_t size = 0;
int_radix_lut<Torus> *one_block_lut = new int_radix_lut<Torus>(
streams, gpu_indexes, gpu_count, params, 1, 1, true, size);
int_radix_lut<Torus> *one_block_lut =
new int_radix_lut<Torus>(streams, params, 1, 1, true, size);

generate_device_accumulator_with_cpu_prealloc<Torus>(
streams[0], gpu_indexes[0], one_block_lut->get_lut(0, 0),
streams.stream(0), streams.gpu_index(0), one_block_lut->get_lut(0, 0),
one_block_lut->get_degree(0), one_block_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, one_block_lut_f, true,
mem_ptr->preallocated_h_lut);
auto active_gpu_count = get_active_gpu_count(1, gpu_count);
one_block_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
auto active_streams = streams.active_gpu_subset(1);
one_block_lut->broadcast_lut(active_streams);

integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, bsks,
ksks, ms_noise_reduction_key, one_block_lut, 1);
one_block_lut->release(streams, gpu_indexes, gpu_count);
streams, lwe_array_out, lwe_array_in, bsks, ksks,
ms_noise_reduction_key, one_block_lut, 1);
one_block_lut->release(streams);
delete one_block_lut;
} else {
// We only have to do the regular comparison
@@ -310,10 +298,11 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
as_radix_ciphertext_slice<Torus>(&rhs, lhs, num_radix_blocks / 2,
lhs->num_radix_blocks);

pack_blocks<Torus>(streams[0], gpu_indexes[0], lhs, lwe_array_in,
num_lsb_radix_blocks, message_modulus);
scalar_pack_blocks<Torus>(streams[0], gpu_indexes[0], &rhs, scalar_blocks,
num_scalar_blocks, message_modulus);
pack_blocks<Torus>(streams.stream(0), streams.gpu_index(0), lhs,
lwe_array_in, num_lsb_radix_blocks, message_modulus);
scalar_pack_blocks<Torus>(streams.stream(0), streams.gpu_index(0), &rhs,
scalar_blocks, num_scalar_blocks,
message_modulus);

// From this point we have half number of blocks
num_lsb_radix_blocks /= 2;
@@ -324,15 +313,14 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
// - 2 if lhs > rhs
auto comparisons = mem_ptr->tmp_lwe_array_out;
scalar_compare_radix_blocks_kb<Torus>(
streams, gpu_indexes, gpu_count, comparisons, diff_buffer->tmp_packed,
(Torus *)rhs.ptr, mem_ptr, bsks, ksks, ms_noise_reduction_key,
num_lsb_radix_blocks);
streams, comparisons, diff_buffer->tmp_packed, (Torus *)rhs.ptr,
mem_ptr, bsks, ksks, ms_noise_reduction_key, num_lsb_radix_blocks);

// Reduces a vec containing radix blocks that encrypts a sign
// (inferior, equal, superior) to one single radix block containing the
// final sign
tree_sign_reduction<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
comparisons, mem_ptr->diff_buffer->tree_buffer,
tree_sign_reduction<Torus>(streams, lwe_array_out, comparisons,
mem_ptr->diff_buffer->tree_buffer,
sign_handler_f, bsks, ksks,
ms_noise_reduction_key, num_lsb_radix_blocks);
}
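
streams.synchronize() above replaces the manual per-GPU loop that the old code spelled out (the removed form is still visible in the hunks). A sketch of the assumed implementation, equivalent to the loop it replaces:

// Sketch, assumed equivalent to the removed loop:
//   for (uint j = 0; j < gpu_count; j++)
//     cuda_synchronize_stream(streams[j], gpu_indexes[j]);
void synchronize() const {
  for (uint32_t j = 0; j < _gpu_count; j++)
    cuda_synchronize_stream(_streams[j], _gpu_indexes[j]);
}
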
@@ -341,8 +329,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void integer_radix_signed_scalar_difference_check_kb(
|
||||
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_in, Torus const *scalar_blocks,
|
||||
Torus const *h_scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
|
||||
std::function<Torus(Torus)> sign_handler_f, void *const *bsks,
|
||||
@@ -388,13 +375,11 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
|
||||
// means scalar is zero
|
||||
auto are_all_msb_zeros = mem_ptr->tmp_lwe_array_out;
|
||||
host_compare_blocks_with_zero<Torus>(
|
||||
streams, gpu_indexes, gpu_count, are_all_msb_zeros, lwe_array_in,
|
||||
mem_ptr, bsks, ksks, ms_noise_reduction_key, num_radix_blocks,
|
||||
mem_ptr->is_zero_lut);
|
||||
streams, are_all_msb_zeros, lwe_array_in, mem_ptr, bsks, ksks,
|
||||
ms_noise_reduction_key, num_radix_blocks, mem_ptr->is_zero_lut);
|
||||
are_all_comparisons_block_true<Torus>(
|
||||
streams, gpu_indexes, gpu_count, are_all_msb_zeros, are_all_msb_zeros,
|
||||
mem_ptr, bsks, ksks, ms_noise_reduction_key,
|
||||
are_all_msb_zeros->num_radix_blocks);
|
||||
streams, are_all_msb_zeros, are_all_msb_zeros, mem_ptr, bsks, ksks,
|
||||
ms_noise_reduction_key, are_all_msb_zeros->num_radix_blocks);
|
||||
CudaRadixCiphertextFFI sign_block;
|
||||
as_radix_ciphertext_slice<Torus>(&sign_block, lwe_array_in,
|
||||
num_radix_blocks - 1, num_radix_blocks);
|
||||
@@ -436,17 +421,17 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
|
||||
|
||||
auto lut = mem_ptr->diff_buffer->tree_buffer->tree_last_leaf_scalar_lut;
|
||||
generate_device_accumulator_bivariate_with_cpu_prealloc<Torus>(
|
||||
streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
|
||||
lut->get_max_degree(0), glwe_dimension, polynomial_size,
|
||||
message_modulus, carry_modulus, scalar_bivariate_last_leaf_lut_f, true,
|
||||
streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
|
||||
lut->get_degree(0), lut->get_max_degree(0), glwe_dimension,
|
||||
polynomial_size, message_modulus, carry_modulus,
|
||||
scalar_bivariate_last_leaf_lut_f, true,
|
||||
mem_ptr->diff_buffer->tree_buffer->preallocated_h_lut);
|
||||
auto active_gpu_count = get_active_gpu_count(1, gpu_count);
|
||||
lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
|
||||
auto active_streams = streams.active_gpu_subset(1);
|
||||
lut->broadcast_lut(active_streams);
|
||||
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array_out, are_all_msb_zeros,
|
||||
&sign_block, bsks, ksks, ms_noise_reduction_key, lut, 1,
|
||||
lut->params.message_modulus);
|
||||
streams, lwe_array_out, are_all_msb_zeros, &sign_block, bsks, ksks,
|
||||
ms_noise_reduction_key, lut, 1, lut->params.message_modulus);
|
||||
|
||||
} else if (num_scalar_blocks < num_radix_blocks) {
|
||||
// We have to handle both part of the work described above
|
||||
@@ -465,9 +450,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
|
||||
|
||||
auto lsb_streams = mem_ptr->lsb_streams;
|
||||
auto msb_streams = mem_ptr->msb_streams;
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
|
||||
}
|
||||
streams.synchronize();
|
||||
|
||||
//////////////
|
||||
// lsb
|
||||
@@ -476,9 +459,9 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
     as_radix_ciphertext_slice<Torus>(&rhs, lhs, num_radix_blocks / 2,
                                      lhs->num_radix_blocks);

-    pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
-                       num_lsb_radix_blocks, message_modulus);
-    scalar_pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], &rhs,
+    pack_blocks<Torus>(lsb_streams.stream(0), streams.gpu_index(0), lhs,
+                       lwe_array_in, num_lsb_radix_blocks, message_modulus);
+    scalar_pack_blocks<Torus>(lsb_streams.stream(0), streams.gpu_index(0), &rhs,
                               scalar_blocks, num_scalar_blocks,
                               message_modulus);

@@ -493,29 +476,26 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(

     auto comparisons = mem_ptr->tmp_block_comparisons;
     scalar_compare_radix_blocks_kb<Torus>(
-        lsb_streams, gpu_indexes, gpu_count, comparisons,
-        diff_buffer->tmp_packed, (Torus *)rhs.ptr, mem_ptr, bsks, ksks,
-        ms_noise_reduction_key, num_lsb_radix_blocks);
+        lsb_streams, comparisons, diff_buffer->tmp_packed, (Torus *)rhs.ptr,
+        mem_ptr, bsks, ksks, ms_noise_reduction_key, num_lsb_radix_blocks);

     // Reduces a vec containing radix blocks that encrypts a sign
     // (inferior, equal, superior) to one single radix block containing the
     // final sign
-    tree_sign_reduction<Torus>(
-        lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out, comparisons,
-        mem_ptr->diff_buffer->tree_buffer, mem_ptr->identity_lut_f, bsks, ksks,
+    tree_sign_reduction<Torus>(lsb_streams, lwe_array_lsb_out, comparisons,
+                               mem_ptr->diff_buffer->tree_buffer,
+                               mem_ptr->identity_lut_f, bsks, ksks,
                                ms_noise_reduction_key, num_lsb_radix_blocks);
     //////////////
     // msb
     // We remove the last block (which is the sign)
     auto are_all_msb_zeros = lwe_array_msb_out;
     host_compare_blocks_with_zero<Torus>(
-        msb_streams, gpu_indexes, gpu_count, &are_all_msb_zeros, &msb, mem_ptr,
-        bsks, ksks, ms_noise_reduction_key, num_msb_radix_blocks,
-        mem_ptr->is_zero_lut);
+        msb_streams, &are_all_msb_zeros, &msb, mem_ptr, bsks, ksks,
+        ms_noise_reduction_key, num_msb_radix_blocks, mem_ptr->is_zero_lut);
     are_all_comparisons_block_true<Torus>(
-        msb_streams, gpu_indexes, gpu_count, &are_all_msb_zeros,
-        &are_all_msb_zeros, mem_ptr, bsks, ksks, ms_noise_reduction_key,
-        are_all_msb_zeros.num_radix_blocks);
+        msb_streams, &are_all_msb_zeros, &are_all_msb_zeros, mem_ptr, bsks,
+        ksks, ms_noise_reduction_key, are_all_msb_zeros.num_radix_blocks);

     auto sign_bit_pos = (int)log2(message_modulus) - 1;

@@ -543,30 +523,28 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(

     auto signed_msb_lut = mem_ptr->signed_msb_lut;
     generate_device_accumulator_bivariate_with_cpu_prealloc<Torus>(
-        msb_streams[0], gpu_indexes[0], signed_msb_lut->get_lut(0, 0),
-        signed_msb_lut->get_degree(0), signed_msb_lut->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, lut_f, true, mem_ptr->preallocated_h_lut);
-    auto active_gpu_count = get_active_gpu_count(1, gpu_count);
-    signed_msb_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+        msb_streams.stream(0), streams.gpu_index(0),
+        signed_msb_lut->get_lut(0, 0), signed_msb_lut->get_degree(0),
+        signed_msb_lut->get_max_degree(0), params.glwe_dimension,
+        params.polynomial_size, params.message_modulus, params.carry_modulus,
+        lut_f, true, mem_ptr->preallocated_h_lut);
+    auto active_streams = streams.active_gpu_subset(1);
+    signed_msb_lut->broadcast_lut(active_streams);

     CudaRadixCiphertextFFI sign_block;
     as_radix_ciphertext_slice<Torus>(
         &sign_block, &msb, num_msb_radix_blocks - 1, num_msb_radix_blocks);
     integer_radix_apply_bivariate_lookup_table_kb<Torus>(
-        msb_streams, gpu_indexes, gpu_count, &lwe_array_msb_out, &sign_block,
-        &are_all_msb_zeros, bsks, ksks, ms_noise_reduction_key, signed_msb_lut,
-        1, signed_msb_lut->params.message_modulus);
-    for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
-      cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]);
-      cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]);
-    }
+        msb_streams, &lwe_array_msb_out, &sign_block, &are_all_msb_zeros, bsks,
+        ksks, ms_noise_reduction_key, signed_msb_lut, 1,
+        signed_msb_lut->params.message_modulus);
+    lsb_streams.synchronize();
+    msb_streams.synchronize();

     //////////////
     // Reduce the two blocks into one final
-    reduce_signs<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
-                        lwe_array_lsb_out, mem_ptr, sign_handler_f, bsks, ksks,
-                        ms_noise_reduction_key, 2);
+    reduce_signs<Torus>(streams, lwe_array_out, lwe_array_lsb_out, mem_ptr,
+                        sign_handler_f, bsks, ksks, ms_noise_reduction_key, 2);

   } else {
     if (num_radix_blocks == 1) {
@@ -588,22 +566,22 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
                             message_modulus);
       };
       uint64_t size = 0;
-      int_radix_lut<Torus> *one_block_lut = new int_radix_lut<Torus>(
-          streams, gpu_indexes, gpu_count, params, 1, 1, true, size);
+      int_radix_lut<Torus> *one_block_lut =
+          new int_radix_lut<Torus>(streams, params, 1, 1, true, size);

       generate_device_accumulator_with_cpu_prealloc<Torus>(
-          streams[0], gpu_indexes[0], one_block_lut->get_lut(0, 0),
+          streams.stream(0), streams.gpu_index(0), one_block_lut->get_lut(0, 0),
          one_block_lut->get_degree(0), one_block_lut->get_max_degree(0),
          params.glwe_dimension, params.polynomial_size, params.message_modulus,
          params.carry_modulus, one_block_lut_f, true,
          mem_ptr->preallocated_h_lut);
-      auto active_gpu_count = get_active_gpu_count(1, gpu_count);
-      one_block_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+      auto active_streams = streams.active_gpu_subset(1);
+      one_block_lut->broadcast_lut(active_streams);

       integer_radix_apply_univariate_lookup_table_kb<Torus>(
-          streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, bsks,
-          ksks, ms_noise_reduction_key, one_block_lut, 1);
-      one_block_lut->release(streams, gpu_indexes, gpu_count);
+          streams, lwe_array_out, lwe_array_in, bsks, ksks,
+          ms_noise_reduction_key, one_block_lut, 1);
+      one_block_lut->release(streams);
       delete one_block_lut;
     } else {
       // We only have to do the regular comparison
@@ -611,9 +589,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
       // total_num_radix_blocks == total_num_scalar_blocks
       uint32_t num_lsb_radix_blocks = num_radix_blocks;

-      for (uint j = 0; j < gpu_count; j++) {
-        cuda_synchronize_stream(streams[j], gpu_indexes[j]);
-      }
+      streams.synchronize();
       auto lsb_streams = mem_ptr->lsb_streams;
       auto msb_streams = mem_ptr->msb_streams;

@@ -627,10 +603,11 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
       as_radix_ciphertext_slice<Torus>(&rhs, lhs, num_radix_blocks / 2,
                                        lhs->num_radix_blocks);

-      pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
-                         num_lsb_radix_blocks - 1, message_modulus);
-      scalar_pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], &rhs,
-                                scalar_blocks, num_lsb_radix_blocks - 1,
+      pack_blocks<Torus>(lsb_streams.stream(0), streams.gpu_index(0), lhs,
+                         lwe_array_in, num_lsb_radix_blocks - 1,
+                         message_modulus);
+      scalar_pack_blocks<Torus>(lsb_streams.stream(0), streams.gpu_index(0),
+                                &rhs, scalar_blocks, num_lsb_radix_blocks - 1,
                                 message_modulus);

       // From this point we have half number of blocks
@@ -641,9 +618,9 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
       // - 1 if lhs == rhs
       // - 2 if lhs > rhs
       scalar_compare_radix_blocks_kb<Torus>(
-          lsb_streams, gpu_indexes, gpu_count, lwe_array_ct_out,
-          diff_buffer->tmp_packed, (Torus *)rhs.ptr, mem_ptr, bsks, ksks,
-          ms_noise_reduction_key, num_lsb_radix_blocks);
+          lsb_streams, lwe_array_ct_out, diff_buffer->tmp_packed,
+          (Torus *)rhs.ptr, mem_ptr, bsks, ksks, ms_noise_reduction_key,
+          num_lsb_radix_blocks);
       CudaRadixCiphertextFFI encrypted_sign_block;
       as_radix_ciphertext_slice<Torus>(&encrypted_sign_block, lwe_array_in,
                                        num_radix_blocks - 1, num_radix_blocks);
@@ -653,33 +630,30 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(

       auto trivial_sign_block = mem_ptr->tmp_trivial_sign_block;
       set_trivial_radix_ciphertext_async<Torus>(
-          msb_streams[0], gpu_indexes[0], trivial_sign_block, scalar_sign_block,
-          h_scalar_sign_block, 1, message_modulus, carry_modulus);
+          msb_streams.stream(0), streams.gpu_index(0), trivial_sign_block,
+          scalar_sign_block, h_scalar_sign_block, 1, message_modulus,
+          carry_modulus);

       integer_radix_apply_bivariate_lookup_table_kb<Torus>(
-          msb_streams, gpu_indexes, gpu_count, &lwe_array_sign_out,
-          &encrypted_sign_block, trivial_sign_block, bsks, ksks,
-          ms_noise_reduction_key, mem_ptr->signed_lut, 1,
-          mem_ptr->signed_lut->params.message_modulus);
-      for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
-        cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]);
-        cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]);
-      }
+          msb_streams, &lwe_array_sign_out, &encrypted_sign_block,
+          trivial_sign_block, bsks, ksks, ms_noise_reduction_key,
+          mem_ptr->signed_lut, 1, mem_ptr->signed_lut->params.message_modulus);
+      lsb_streams.synchronize();
+      msb_streams.synchronize();

       // Reduces a vec containing radix blocks that encrypts a sign
       // (inferior, equal, superior) to one single radix block containing the
       // final sign
-      reduce_signs<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
-                          lwe_array_ct_out, mem_ptr, sign_handler_f, bsks, ksks,
-                          ms_noise_reduction_key, num_lsb_radix_blocks + 1);
+      reduce_signs<Torus>(streams, lwe_array_out, lwe_array_ct_out, mem_ptr,
+                          sign_handler_f, bsks, ksks, ms_noise_reduction_key,
+                          num_lsb_radix_blocks + 1);
     }
   }
 }

 template <typename Torus>
 __host__ void host_integer_radix_scalar_difference_check_kb(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
+    CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
     CudaRadixCiphertextFFI const *lwe_array_in, Torus const *scalar_blocks,
     Torus const *h_scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
     std::function<Torus(Torus)> sign_handler_f, void *const *bsks,
@@ -696,21 +670,20 @@ __host__ void host_integer_radix_scalar_difference_check_kb(
   if (mem_ptr->is_signed) {
     // is signed and scalar is positive
     integer_radix_signed_scalar_difference_check_kb<Torus>(
-        streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in,
-        scalar_blocks, h_scalar_blocks, mem_ptr, sign_handler_f, bsks, ksks,
-        ms_noise_reduction_key, num_radix_blocks, num_scalar_blocks);
+        streams, lwe_array_out, lwe_array_in, scalar_blocks, h_scalar_blocks,
+        mem_ptr, sign_handler_f, bsks, ksks, ms_noise_reduction_key,
+        num_radix_blocks, num_scalar_blocks);
   } else {
     integer_radix_unsigned_scalar_difference_check_kb<Torus>(
-        streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in,
-        scalar_blocks, h_scalar_blocks, mem_ptr, sign_handler_f, bsks, ksks,
-        ms_noise_reduction_key, num_radix_blocks, num_scalar_blocks);
+        streams, lwe_array_out, lwe_array_in, scalar_blocks, h_scalar_blocks,
+        mem_ptr, sign_handler_f, bsks, ksks, ms_noise_reduction_key,
+        num_radix_blocks, num_scalar_blocks);
   }
 }

 template <typename Torus>
 __host__ void host_integer_radix_scalar_maxmin_kb(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
+    CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
     CudaRadixCiphertextFFI const *lwe_array_in, Torus const *scalar_blocks,
     Torus const *h_scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
     void *const *bsks, Torus *const *ksks,
@@ -732,9 +705,9 @@ __host__ void host_integer_radix_scalar_maxmin_kb(
   // - 2 if lhs > rhs
   auto sign = mem_ptr->tmp_lwe_array_out;
   host_integer_radix_scalar_difference_check_kb<Torus>(
-      streams, gpu_indexes, gpu_count, sign, lwe_array_in, scalar_blocks,
-      h_scalar_blocks, mem_ptr, mem_ptr->identity_lut_f, bsks, ksks,
-      ms_noise_reduction_key, num_radix_blocks, num_scalar_blocks);
+      streams, sign, lwe_array_in, scalar_blocks, h_scalar_blocks, mem_ptr,
+      mem_ptr->identity_lut_f, bsks, ksks, ms_noise_reduction_key,
+      num_radix_blocks, num_scalar_blocks);

   // There is no optimized CMUX for scalars, so we convert to a trivial
   // ciphertext
@@ -742,22 +715,21 @@ __host__ void host_integer_radix_scalar_maxmin_kb(
   auto lwe_array_right = mem_ptr->tmp_block_comparisons;

   set_trivial_radix_ciphertext_async<Torus>(
-      streams[0], gpu_indexes[0], lwe_array_right, scalar_blocks,
+      streams.stream(0), streams.gpu_index(0), lwe_array_right, scalar_blocks,
      h_scalar_blocks, num_scalar_blocks, params.message_modulus,
      params.carry_modulus);

   // Selector
   // CMUX for Max or Min
-  host_integer_radix_cmux_kb<Torus>(
-      streams, gpu_indexes, gpu_count, lwe_array_out,
-      mem_ptr->tmp_lwe_array_out, lwe_array_left, lwe_array_right,
-      mem_ptr->cmux_buffer, bsks, ksks, ms_noise_reduction_key);
+  host_integer_radix_cmux_kb<Torus>(streams, lwe_array_out,
+                                    mem_ptr->tmp_lwe_array_out, lwe_array_left,
+                                    lwe_array_right, mem_ptr->cmux_buffer, bsks,
+                                    ksks, ms_noise_reduction_key);
 }

 template <typename Torus>
 __host__ void host_integer_radix_scalar_equality_check_kb(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
+    CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
     CudaRadixCiphertextFFI const *lwe_array_in, Torus const *scalar_blocks,
     int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
     Torus *const *ksks,
@@ -797,9 +769,7 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
                                    num_halved_lsb_radix_blocks,
                                    lwe_array_in->num_radix_blocks);

-  for (uint j = 0; j < gpu_count; j++) {
-    cuda_synchronize_stream(streams[j], gpu_indexes[j]);
-  }
+  streams.synchronize();

   auto lsb_streams = mem_ptr->lsb_streams;
   auto msb_streams = mem_ptr->msb_streams;
@@ -811,33 +781,34 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
                                      num_halved_lsb_radix_blocks,
                                      packed_blocks->num_radix_blocks);
     if (num_lsb_radix_blocks > 1) {
-      pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], packed_blocks,
-                         lwe_array_in, num_lsb_radix_blocks, message_modulus);
-      scalar_pack_blocks(lsb_streams[0], gpu_indexes[0], &packed_scalar,
-                         scalar_blocks, num_scalar_blocks, message_modulus);
+      pack_blocks<Torus>(lsb_streams.stream(0), lsb_streams.gpu_index(0),
+                         packed_blocks, lwe_array_in, num_lsb_radix_blocks,
+                         message_modulus);
+      scalar_pack_blocks(lsb_streams.stream(0), streams.gpu_index(0),
+                         &packed_scalar, scalar_blocks, num_scalar_blocks,
+                         message_modulus);
       cuda_memcpy_async_gpu_to_gpu(
           scalar_comparison_luts->get_lut_indexes(0, 0), packed_scalar.ptr,
-          num_halved_scalar_blocks * sizeof(Torus), lsb_streams[0],
-          gpu_indexes[0]);
+          num_halved_scalar_blocks * sizeof(Torus), lsb_streams.stream(0),
+          lsb_streams.gpu_index(0));
     } else if (num_lsb_radix_blocks == 1) {
-      copy_radix_ciphertext_slice_async<Torus>(lsb_streams[0], gpu_indexes[0],
-                                               packed_blocks, 0, 1,
-                                               lwe_array_in, 0, 1);
+      copy_radix_ciphertext_slice_async<Torus>(
+          lsb_streams.stream(0), lsb_streams.gpu_index(0), packed_blocks, 0, 1,
+          lwe_array_in, 0, 1);
       cuda_memcpy_async_gpu_to_gpu(
           scalar_comparison_luts->get_lut_indexes(0, 0), scalar_blocks,
-          num_halved_scalar_blocks * sizeof(Torus), lsb_streams[0],
-          gpu_indexes[0]);
+          num_halved_scalar_blocks * sizeof(Torus), lsb_streams.stream(0),
+          lsb_streams.gpu_index(0));
     }
-    auto active_gpu_count =
-        get_active_gpu_count(num_halved_scalar_blocks, gpu_count);
+    auto active_streams =
+        lsb_streams.active_gpu_subset(num_halved_scalar_blocks);
     // We use false cause we only will broadcast the indexes
-    scalar_comparison_luts->broadcast_lut(lsb_streams, gpu_indexes,
-                                          active_gpu_count, false);
+    scalar_comparison_luts->broadcast_lut(active_streams, false);

     integer_radix_apply_univariate_lookup_table_kb<Torus>(
-        lsb_streams, gpu_indexes, gpu_count, mem_ptr->tmp_lwe_array_out,
-        mem_ptr->tmp_packed_input, bsks, ksks, ms_noise_reduction_key,
-        scalar_comparison_luts, num_halved_lsb_radix_blocks);
+        lsb_streams, mem_ptr->tmp_lwe_array_out, mem_ptr->tmp_packed_input,
+        bsks, ksks, ms_noise_reduction_key, scalar_comparison_luts,
+        num_halved_lsb_radix_blocks);
   }
   //////////////
   // msb_in
@@ -855,29 +826,27 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
   }

   host_compare_blocks_with_zero<Torus>(
-      msb_streams, gpu_indexes, gpu_count, &msb_out, &msb_in, mem_ptr, bsks,
-      ksks, ms_noise_reduction_key, num_msb_radix_blocks, msb_lut);
+      msb_streams, &msb_out, &msb_in, mem_ptr, bsks, ksks,
+      ms_noise_reduction_key, num_msb_radix_blocks, msb_lut);
   are_all_comparisons_block_true<Torus>(
-      msb_streams, gpu_indexes, gpu_count, &msb_out, &msb_out, mem_ptr, bsks,
-      ksks, ms_noise_reduction_key, msb_out.num_radix_blocks);
+      msb_streams, &msb_out, &msb_out, mem_ptr, bsks, ksks,
+      ms_noise_reduction_key, msb_out.num_radix_blocks);
   }

-  for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
-    cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]);
-    cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]);
-  }
+  lsb_streams.synchronize();
+  msb_streams.synchronize();

   switch (mem_ptr->op) {
   case COMPARISON_TYPE::EQ:
     are_all_comparisons_block_true<Torus>(
-        streams, gpu_indexes, gpu_count, lwe_array_out,
-        mem_ptr->tmp_lwe_array_out, mem_ptr, bsks, ksks, ms_noise_reduction_key,
+        streams, lwe_array_out, mem_ptr->tmp_lwe_array_out, mem_ptr, bsks, ksks,
+        ms_noise_reduction_key,
        num_halved_scalar_blocks + (num_msb_radix_blocks > 0));
     break;
   case COMPARISON_TYPE::NE:
     is_at_least_one_comparisons_block_true<Torus>(
-        streams, gpu_indexes, gpu_count, lwe_array_out,
-        mem_ptr->tmp_lwe_array_out, mem_ptr, bsks, ksks, ms_noise_reduction_key,
+        streams, lwe_array_out, mem_ptr->tmp_lwe_array_out, mem_ptr, bsks, ksks,
+        ms_noise_reduction_key,
        num_halved_scalar_blocks + (num_msb_radix_blocks > 0));
     break;
   default:
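Across the host functions above the change is mechanical: the (streams, gpu_indexes, gpu_count) triple collapses into one CudaStreams value, and per-GPU accesses go through stream(i)/gpu_index(i). A self-contained illustration of the same encapsulation idea, using hypothetical stand-in types so it compiles without CUDA (these are not the real tfhe-rs types):

    #include <cstdint>
    #include <cstdio>

    using FakeStream = int; // stand-in for cudaStream_t in this sketch

    // Plays the role of CudaStreams: a non-owning view over parallel arrays.
    struct StreamSet {
      const FakeStream *streams;
      const uint32_t *gpu_indexes;
      uint32_t gpu_count;
      FakeStream stream(uint32_t i) const { return streams[i]; }
      uint32_t gpu_index(uint32_t i) const { return gpu_indexes[i]; }
    };

    // Before: three parameters that must always travel together.
    void op_old(const FakeStream *streams, const uint32_t *gpu_indexes,
                uint32_t gpu_count) {
      for (uint32_t i = 0; i < gpu_count; i++)
        printf("old: stream %d on gpu %u\n", streams[i], gpu_indexes[i]);
    }

    // After: one parameter; call sites cannot pass mismatched triples.
    void op_new(StreamSet s) {
      for (uint32_t i = 0; i < s.gpu_count; i++)
        printf("new: stream %d on gpu %u\n", s.stream(i), s.gpu_index(i));
    }

    int main() {
      FakeStream streams[2] = {101, 102};
      uint32_t gpus[2] = {0, 1};
      op_old(streams, gpus, 2);
      op_new({streams, gpus, 2});
    }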
@@ -1,13 +1,13 @@
 #include "scalar_div.cuh"

 uint64_t scratch_cuda_integer_unsigned_scalar_div_radix_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, const CudaScalarDivisorFFI *scalar_divisor_ffi,
-    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
+    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type,
+    const CudaScalarDivisorFFI *scalar_divisor_ffi, bool allocate_gpu_memory,
+    PBS_MS_REDUCTION_T noise_reduction_type) {

   int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                           glwe_dimension * polynomial_size, lwe_dimension,
@@ -16,45 +16,43 @@ uint64_t scratch_cuda_integer_unsigned_scalar_div_radix_kb_64(
                           noise_reduction_type);

   return scratch_integer_unsigned_scalar_div_radix<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count, params,
+      CudaStreams(streams), params,
       (int_unsigned_scalar_div_mem<uint64_t> **)mem_ptr, num_blocks,
       scalar_divisor_ffi, allocate_gpu_memory);
 }

 void cuda_integer_unsigned_scalar_div_radix_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *numerator_ct, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks,
+    CudaStreamsFFI streams, CudaRadixCiphertextFFI *numerator_ct,
+    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
     const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key,
     const CudaScalarDivisorFFI *scalar_divisor_ffi) {

   host_integer_unsigned_scalar_div_radix<uint64_t>(
-      (cudaStream_t *)streams, gpu_indexes, gpu_count, numerator_ct,
+      CudaStreams(streams), numerator_ct,
       (int_unsigned_scalar_div_mem<uint64_t> *)mem_ptr, bsks, (uint64_t **)ksks,
       ms_noise_reduction_key, scalar_divisor_ffi);
 }

 void cleanup_cuda_integer_unsigned_scalar_div_radix_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr_void) {
+    CudaStreamsFFI streams, int8_t **mem_ptr_void) {

   int_unsigned_scalar_div_mem<uint64_t> *mem_ptr =
       (int_unsigned_scalar_div_mem<uint64_t> *)(*mem_ptr_void);

-  mem_ptr->release((cudaStream_t *)streams, gpu_indexes, gpu_count);
+  mem_ptr->release(CudaStreams(streams));

   delete mem_ptr;
   *mem_ptr_void = nullptr;
 }

 uint64_t scratch_cuda_integer_signed_scalar_div_radix_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, const CudaScalarDivisorFFI *scalar_divisor_ffi,
-    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
+    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type,
+    const CudaScalarDivisorFFI *scalar_divisor_ffi, bool allocate_gpu_memory,
+    PBS_MS_REDUCTION_T noise_reduction_type) {

   int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                           glwe_dimension * polynomial_size, lwe_dimension,
@@ -63,44 +61,42 @@ uint64_t scratch_cuda_integer_signed_scalar_div_radix_kb_64(
                           noise_reduction_type);

   return scratch_integer_signed_scalar_div_radix_kb<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count, params,
+      CudaStreams(streams), params,
       (int_signed_scalar_div_mem<uint64_t> **)mem_ptr, num_blocks,
       scalar_divisor_ffi, allocate_gpu_memory);
 }

 void cuda_integer_signed_scalar_div_radix_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *numerator_ct, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks,
+    CudaStreamsFFI streams, CudaRadixCiphertextFFI *numerator_ct,
+    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
     const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key,
     const CudaScalarDivisorFFI *scalar_divisor_ffi, uint32_t numerator_bits) {

   host_integer_signed_scalar_div_radix_kb<uint64_t>(
-      (cudaStream_t *)streams, gpu_indexes, gpu_count, numerator_ct,
+      CudaStreams(streams), numerator_ct,
       (int_signed_scalar_div_mem<uint64_t> *)mem_ptr, bsks, (uint64_t **)ksks,
       ms_noise_reduction_key, scalar_divisor_ffi, numerator_bits);
 }

-void cleanup_cuda_integer_signed_scalar_div_radix_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+void cleanup_cuda_integer_signed_scalar_div_radix_kb_64(CudaStreamsFFI streams,
                                                         int8_t **mem_ptr_void) {

   int_signed_scalar_div_mem<uint64_t> *mem_ptr =
       (int_signed_scalar_div_mem<uint64_t> *)(*mem_ptr_void);

-  mem_ptr->release((cudaStream_t *)streams, gpu_indexes, gpu_count);
+  mem_ptr->release(CudaStreams(streams));

   delete mem_ptr;
   *mem_ptr_void = nullptr;
 }

 uint64_t scratch_integer_unsigned_scalar_div_rem_radix_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, const CudaScalarDivisorFFI *scalar_divisor_ffi,
+    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type,
+    const CudaScalarDivisorFFI *scalar_divisor_ffi,
     uint32_t const active_bits_divisor, bool allocate_gpu_memory,
     PBS_MS_REDUCTION_T noise_reduction_type) {

@@ -111,15 +107,15 @@ uint64_t scratch_integer_unsigned_scalar_div_rem_radix_kb_64(
                           noise_reduction_type);

   return scratch_integer_unsigned_scalar_div_rem_radix<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count, params,
+      CudaStreams(streams), params,
       (int_unsigned_scalar_div_rem_buffer<uint64_t> **)mem_ptr, num_blocks,
       scalar_divisor_ffi, active_bits_divisor, allocate_gpu_memory);
 }

 void cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *quotient_ct, CudaRadixCiphertextFFI *remainder_ct,
-    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
+    CudaStreamsFFI streams, CudaRadixCiphertextFFI *quotient_ct,
+    CudaRadixCiphertextFFI *remainder_ct, int8_t *mem_ptr, void *const *bsks,
+    void *const *ksks,
     const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key,
     const CudaScalarDivisorFFI *scalar_divisor_ffi,
     uint64_t const *divisor_has_at_least_one_set,
@@ -128,33 +124,32 @@ void cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
     uint32_t num_clear_blocks) {

   host_integer_unsigned_scalar_div_rem_radix<uint64_t>(
-      (cudaStream_t *)streams, gpu_indexes, gpu_count, quotient_ct,
-      remainder_ct, (int_unsigned_scalar_div_rem_buffer<uint64_t> *)mem_ptr,
-      bsks, (uint64_t **)ksks, ms_noise_reduction_key, scalar_divisor_ffi,
+      CudaStreams(streams), quotient_ct, remainder_ct,
+      (int_unsigned_scalar_div_rem_buffer<uint64_t> *)mem_ptr, bsks,
+      (uint64_t **)ksks, ms_noise_reduction_key, scalar_divisor_ffi,
       divisor_has_at_least_one_set, decomposed_divisor, num_scalars_divisor,
       (uint64_t *)clear_blocks, (uint64_t *)h_clear_blocks, num_clear_blocks);
 }

 void cleanup_cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr_void) {
+    CudaStreamsFFI streams, int8_t **mem_ptr_void) {

   int_unsigned_scalar_div_rem_buffer<uint64_t> *mem_ptr =
       (int_unsigned_scalar_div_rem_buffer<uint64_t> *)(*mem_ptr_void);

-  mem_ptr->release((cudaStream_t *)streams, gpu_indexes, gpu_count);
+  mem_ptr->release(CudaStreams(streams));

   delete mem_ptr;
   *mem_ptr_void = nullptr;
 }

 uint64_t scratch_integer_signed_scalar_div_rem_radix_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, const CudaScalarDivisorFFI *scalar_divisor_ffi,
+    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type,
+    const CudaScalarDivisorFFI *scalar_divisor_ffi,
     uint32_t const active_bits_divisor, bool allocate_gpu_memory,
     PBS_MS_REDUCTION_T noise_reduction_type) {

@@ -165,15 +160,15 @@ uint64_t scratch_integer_signed_scalar_div_rem_radix_kb_64(
                           noise_reduction_type);

   return scratch_integer_signed_scalar_div_rem_radix<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count, params,
+      CudaStreams(streams), params,
       (int_signed_scalar_div_rem_buffer<uint64_t> **)mem_ptr, num_blocks,
       scalar_divisor_ffi, active_bits_divisor, allocate_gpu_memory);
 }

 void cuda_integer_signed_scalar_div_rem_radix_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *quotient_ct, CudaRadixCiphertextFFI *remainder_ct,
-    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
+    CudaStreamsFFI streams, CudaRadixCiphertextFFI *quotient_ct,
+    CudaRadixCiphertextFFI *remainder_ct, int8_t *mem_ptr, void *const *bsks,
+    void *const *ksks,
     CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
     const CudaScalarDivisorFFI *scalar_divisor_ffi,
     uint64_t const *divisor_has_at_least_one_set,
@@ -181,21 +176,20 @@ void cuda_integer_signed_scalar_div_rem_radix_kb_64(
     uint32_t numerator_bits) {

   host_integer_signed_scalar_div_rem_radix<uint64_t>(
-      (cudaStream_t *)streams, gpu_indexes, gpu_count, quotient_ct,
-      remainder_ct, (int_signed_scalar_div_rem_buffer<uint64_t> *)mem_ptr, bsks,
+      CudaStreams(streams), quotient_ct, remainder_ct,
+      (int_signed_scalar_div_rem_buffer<uint64_t> *)mem_ptr, bsks,
       (uint64_t **)ksks, ms_noise_reduction_key, scalar_divisor_ffi,
       divisor_has_at_least_one_set, decomposed_divisor, num_scalars_divisor,
       numerator_bits);
 }

 void cleanup_cuda_integer_signed_scalar_div_rem_radix_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr_void) {
+    CudaStreamsFFI streams, int8_t **mem_ptr_void) {

   int_signed_scalar_div_rem_buffer<uint64_t> *mem_ptr =
       (int_signed_scalar_div_rem_buffer<uint64_t> *)(*mem_ptr_void);

-  mem_ptr->release((cudaStream_t *)streams, gpu_indexes, gpu_count);
+  mem_ptr->release(CudaStreams(streams));

   delete mem_ptr;
   *mem_ptr_void = nullptr;
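Every operation in the file above keeps the same three-phase shape after the refactor: a scratch call that allocates the buffer and reports its size, the in-place operation, and a cleanup call. A hypothetical caller-side sketch of that lifecycle against the declarations shown above (this is not the real Rust binding; every handle arrives as a parameter so nothing is left undeclared, the values behind them are placeholders, and the backend headers are assumed to be on the include path):

    void unsigned_scalar_div_lifecycle(
        CudaStreamsFFI streams, CudaRadixCiphertextFFI *numerator_ct,
        void *const *bsks, void *const *ksks,
        const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key,
        const CudaScalarDivisorFFI *scalar_divisor_ffi, uint32_t glwe_dimension,
        uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
        uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
        uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
        uint32_t carry_modulus, PBS_TYPE pbs_type,
        PBS_MS_REDUCTION_T noise_reduction_type) {
      int8_t *mem = nullptr;

      // 1. Allocate the operation buffer; the return value tracks GPU memory.
      uint64_t size = scratch_cuda_integer_unsigned_scalar_div_radix_kb_64(
          streams, &mem, glwe_dimension, polynomial_size, lwe_dimension,
          ks_level, ks_base_log, pbs_level, pbs_base_log, grouping_factor,
          num_blocks, message_modulus, carry_modulus, pbs_type,
          scalar_divisor_ffi, /*allocate_gpu_memory=*/true,
          noise_reduction_type);
      (void)size;

      // 2. Run the in-place division on the ciphertext.
      cuda_integer_unsigned_scalar_div_radix_kb_64(
          streams, numerator_ct, mem, bsks, ksks, ms_noise_reduction_key,
          scalar_divisor_ffi);

      // 3. Release the buffer; the FFI struct is rewrapped as CudaStreams inside.
      cleanup_cuda_integer_unsigned_scalar_div_radix_kb_64(streams, &mem);
    }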
@@ -9,8 +9,7 @@

 template <typename Torus>
 __host__ uint64_t scratch_integer_unsigned_scalar_div_radix(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, const int_radix_params params,
+    CudaStreams streams, const int_radix_params params,
     int_unsigned_scalar_div_mem<Torus> **mem_ptr, uint32_t num_radix_blocks,
     const CudaScalarDivisorFFI *scalar_divisor_ffi,
     const bool allocate_gpu_memory) {
@@ -18,16 +17,15 @@ __host__ uint64_t scratch_integer_unsigned_scalar_div_radix(
   uint64_t size_tracker = 0;

   *mem_ptr = new int_unsigned_scalar_div_mem<Torus>(
-      streams, gpu_indexes, gpu_count, params, num_radix_blocks,
-      scalar_divisor_ffi, allocate_gpu_memory, size_tracker);
+      streams, params, num_radix_blocks, scalar_divisor_ffi,
+      allocate_gpu_memory, size_tracker);

   return size_tracker;
 }

 template <typename Torus>
 __host__ void host_integer_unsigned_scalar_div_radix(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, CudaRadixCiphertextFFI *numerator_ct,
+    CudaStreams streams, CudaRadixCiphertextFFI *numerator_ct,
     int_unsigned_scalar_div_mem<Torus> *mem_ptr, void *const *bsks,
     Torus *const *ksks,
     CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
@@ -39,15 +37,15 @@ __host__ void host_integer_unsigned_scalar_div_radix(

   if (scalar_divisor_ffi->is_divisor_pow2) {
     host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
-        streams, gpu_indexes, gpu_count, numerator_ct,
-        scalar_divisor_ffi->ilog2_divisor, mem_ptr->logical_scalar_shift_mem,
-        bsks, ksks, ms_noise_reduction_key, numerator_ct->num_radix_blocks);
+        streams, numerator_ct, scalar_divisor_ffi->ilog2_divisor,
+        mem_ptr->logical_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key,
+        numerator_ct->num_radix_blocks);
     return;
   }

   if (scalar_divisor_ffi->divisor_has_more_bits_than_numerator) {
-    copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], numerator_ct,
-                                       mem_ptr->tmp_ffi);
+    copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
+                                       numerator_ct, mem_ptr->tmp_ffi);
     return;
   }

@@ -63,32 +61,29 @@ __host__ void host_integer_unsigned_scalar_div_radix(

     CudaRadixCiphertextFFI *numerator_cpy = mem_ptr->tmp_ffi;

-    copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
+    copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
                                        numerator_cpy, numerator_ct);

     host_integer_radix_scalar_mul_high_kb<Torus>(
-        streams, gpu_indexes, gpu_count, numerator_cpy,
-        mem_ptr->scalar_mul_high_mem, ksks, ms_noise_reduction_key, bsks,
-        scalar_divisor_ffi);
+        streams, numerator_cpy, mem_ptr->scalar_mul_high_mem, ksks,
+        ms_noise_reduction_key, bsks, scalar_divisor_ffi);

     host_sub_and_propagate_single_carry<Torus>(
-        streams, gpu_indexes, gpu_count, numerator_ct, numerator_cpy, nullptr,
-        nullptr, mem_ptr->sub_and_propagate_mem, bsks, ksks,
-        ms_noise_reduction_key, FLAG_NONE, (uint32_t)0);
-
-    host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
-        streams, gpu_indexes, gpu_count, numerator_ct, (uint32_t)1,
-        mem_ptr->logical_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key,
-        numerator_ct->num_radix_blocks);
-
-    host_add_and_propagate_single_carry<Torus>(
-        streams, gpu_indexes, gpu_count, numerator_ct, numerator_cpy, nullptr,
-        nullptr, mem_ptr->scp_mem, bsks, ksks, ms_noise_reduction_key,
+        streams, numerator_ct, numerator_cpy, nullptr, nullptr,
+        mem_ptr->sub_and_propagate_mem, bsks, ksks, ms_noise_reduction_key,
        FLAG_NONE, (uint32_t)0);

     host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
-        streams, gpu_indexes, gpu_count, numerator_ct,
-        scalar_divisor_ffi->shift_post - (uint32_t)1,
+        streams, numerator_ct, (uint32_t)1, mem_ptr->logical_scalar_shift_mem,
+        bsks, ksks, ms_noise_reduction_key, numerator_ct->num_radix_blocks);
+
+    host_add_and_propagate_single_carry<Torus>(
+        streams, numerator_ct, numerator_cpy, nullptr, nullptr,
+        mem_ptr->scp_mem, bsks, ksks, ms_noise_reduction_key, FLAG_NONE,
+        (uint32_t)0);
+
+    host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
+        streams, numerator_ct, scalar_divisor_ffi->shift_post - (uint32_t)1,
        mem_ptr->logical_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key,
        numerator_ct->num_radix_blocks);

@@ -96,25 +91,23 @@ __host__ void host_integer_unsigned_scalar_div_radix(
   }

   host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
-      streams, gpu_indexes, gpu_count, numerator_ct,
-      scalar_divisor_ffi->shift_pre, mem_ptr->logical_scalar_shift_mem, bsks,
-      ksks, ms_noise_reduction_key, numerator_ct->num_radix_blocks);
+      streams, numerator_ct, scalar_divisor_ffi->shift_pre,
+      mem_ptr->logical_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key,
+      numerator_ct->num_radix_blocks);

   host_integer_radix_scalar_mul_high_kb<Torus>(
-      streams, gpu_indexes, gpu_count, numerator_ct,
-      mem_ptr->scalar_mul_high_mem, ksks, ms_noise_reduction_key, bsks,
-      scalar_divisor_ffi);
+      streams, numerator_ct, mem_ptr->scalar_mul_high_mem, ksks,
+      ms_noise_reduction_key, bsks, scalar_divisor_ffi);

   host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
-      streams, gpu_indexes, gpu_count, numerator_ct,
-      scalar_divisor_ffi->shift_post, mem_ptr->logical_scalar_shift_mem, bsks,
-      ksks, ms_noise_reduction_key, numerator_ct->num_radix_blocks);
+      streams, numerator_ct, scalar_divisor_ffi->shift_post,
+      mem_ptr->logical_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key,
+      numerator_ct->num_radix_blocks);
 }

 template <typename Torus>
 __host__ uint64_t scratch_integer_signed_scalar_div_radix_kb(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, int_radix_params params,
+    CudaStreams streams, int_radix_params params,
     int_signed_scalar_div_mem<Torus> **mem_ptr, uint32_t num_radix_blocks,
     const CudaScalarDivisorFFI *scalar_divisor_ffi,
     const bool allocate_gpu_memory) {
@@ -122,16 +115,15 @@ __host__ uint64_t scratch_integer_signed_scalar_div_radix_kb(
   uint64_t size_tracker = 0;

   *mem_ptr = new int_signed_scalar_div_mem<Torus>(
-      streams, gpu_indexes, gpu_count, params, num_radix_blocks,
-      scalar_divisor_ffi, allocate_gpu_memory, size_tracker);
+      streams, params, num_radix_blocks, scalar_divisor_ffi,
+      allocate_gpu_memory, size_tracker);

   return size_tracker;
 }

 template <typename Torus>
 __host__ void host_integer_signed_scalar_div_radix_kb(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, CudaRadixCiphertextFFI *numerator_ct,
+    CudaStreams streams, CudaRadixCiphertextFFI *numerator_ct,
     int_signed_scalar_div_mem<Torus> *mem_ptr, void *const *bsks,
     Torus *const *ksks,
     CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
@@ -142,19 +134,18 @@ __host__ void host_integer_signed_scalar_div_radix_kb(
       CudaRadixCiphertextFFI *tmp = mem_ptr->tmp_ffi;

       host_integer_radix_negation<Torus>(
-          streams, gpu_indexes, gpu_count, tmp, numerator_ct,
-          mem_ptr->params.message_modulus, mem_ptr->params.carry_modulus,
-          numerator_ct->num_radix_blocks);
+          streams, tmp, numerator_ct, mem_ptr->params.message_modulus,
+          mem_ptr->params.carry_modulus, numerator_ct->num_radix_blocks);

-      copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
-                                         numerator_ct, tmp);
+      copy_radix_ciphertext_async<Torus>(
+          streams.stream(0), streams.gpu_index(0), numerator_ct, tmp);
     }
     return;
   }

   if (scalar_divisor_ffi->chosen_multiplier_has_more_bits_than_numerator) {
     set_zero_radix_ciphertext_slice_async<Torus>(
-        streams[0], gpu_indexes[0], numerator_ct, 0,
+        streams.stream(0), streams.gpu_index(0), numerator_ct, 0,
         numerator_ct->num_radix_blocks);
     return;
   }
@@ -162,124 +153,114 @@ __host__ void host_integer_signed_scalar_div_radix_kb(
   CudaRadixCiphertextFFI *tmp = mem_ptr->tmp_ffi;

   if (scalar_divisor_ffi->is_divisor_pow2) {
-    copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], tmp,
-                                       numerator_ct);
+    copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
+                                       tmp, numerator_ct);

     host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
-        streams, gpu_indexes, gpu_count, tmp,
-        scalar_divisor_ffi->chosen_multiplier_num_bits - 1,
+        streams, tmp, scalar_divisor_ffi->chosen_multiplier_num_bits - 1,
         mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks,
         ms_noise_reduction_key);

     host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
-        streams, gpu_indexes, gpu_count, tmp,
+        streams, tmp,
         numerator_bits - scalar_divisor_ffi->chosen_multiplier_num_bits,
         mem_ptr->logical_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key,
         tmp->num_radix_blocks);

     host_add_and_propagate_single_carry<Torus>(
-        streams, gpu_indexes, gpu_count, tmp, numerator_ct, nullptr, nullptr,
-        mem_ptr->scp_mem, bsks, ksks, ms_noise_reduction_key, FLAG_NONE,
-        (uint32_t)0);
+        streams, tmp, numerator_ct, nullptr, nullptr, mem_ptr->scp_mem, bsks,
+        ksks, ms_noise_reduction_key, FLAG_NONE, (uint32_t)0);

     host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
-        streams, gpu_indexes, gpu_count, tmp,
-        scalar_divisor_ffi->chosen_multiplier_num_bits,
+        streams, tmp, scalar_divisor_ffi->chosen_multiplier_num_bits,
         mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks,
         ms_noise_reduction_key);

   } else if (!scalar_divisor_ffi->is_chosen_multiplier_geq_two_pow_numerator) {
-    copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], tmp,
-                                       numerator_ct);
+    copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
+                                       tmp, numerator_ct);

     host_integer_radix_signed_scalar_mul_high_kb<Torus>(
-        streams, gpu_indexes, gpu_count, tmp, mem_ptr->scalar_mul_high_mem,
-        ksks, scalar_divisor_ffi, ms_noise_reduction_key, bsks);
+        streams, tmp, mem_ptr->scalar_mul_high_mem, ksks, scalar_divisor_ffi,
+        ms_noise_reduction_key, bsks);

     host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
-        streams, gpu_indexes, gpu_count, tmp, scalar_divisor_ffi->shift_post,
+        streams, tmp, scalar_divisor_ffi->shift_post,
         mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks,
         ms_noise_reduction_key);

     CudaRadixCiphertextFFI *xsign = mem_ptr->xsign_ffi;
-    copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], xsign,
-                                       numerator_ct);
+    copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
+                                       xsign, numerator_ct);

     host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
-        streams, gpu_indexes, gpu_count, xsign, numerator_bits - 1,
+        streams, xsign, numerator_bits - 1,
         mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks,
         ms_noise_reduction_key);

     host_sub_and_propagate_single_carry<Torus>(
-        streams, gpu_indexes, gpu_count, tmp, xsign, nullptr, nullptr,
-        mem_ptr->sub_and_propagate_mem, bsks, ksks, ms_noise_reduction_key,
-        FLAG_NONE, (uint32_t)0);
+        streams, tmp, xsign, nullptr, nullptr, mem_ptr->sub_and_propagate_mem,
+        bsks, ksks, ms_noise_reduction_key, FLAG_NONE, (uint32_t)0);

   } else {

-    copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], tmp,
-                                       numerator_ct);
+    copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
+                                       tmp, numerator_ct);

     host_integer_radix_signed_scalar_mul_high_kb<Torus>(
-        streams, gpu_indexes, gpu_count, tmp, mem_ptr->scalar_mul_high_mem,
-        ksks, scalar_divisor_ffi, ms_noise_reduction_key, bsks);
+        streams, tmp, mem_ptr->scalar_mul_high_mem, ksks, scalar_divisor_ffi,
+        ms_noise_reduction_key, bsks);

     host_add_and_propagate_single_carry<Torus>(
-        streams, gpu_indexes, gpu_count, tmp, numerator_ct, nullptr, nullptr,
-        mem_ptr->scp_mem, bsks, ksks, ms_noise_reduction_key, FLAG_NONE,
-        (uint32_t)0);
+        streams, tmp, numerator_ct, nullptr, nullptr, mem_ptr->scp_mem, bsks,
+        ksks, ms_noise_reduction_key, FLAG_NONE, (uint32_t)0);

     host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
-        streams, gpu_indexes, gpu_count, tmp, scalar_divisor_ffi->shift_post,
+        streams, tmp, scalar_divisor_ffi->shift_post,
         mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks,
         ms_noise_reduction_key);

     CudaRadixCiphertextFFI *xsign = mem_ptr->xsign_ffi;
-    copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], xsign,
-                                       numerator_ct);
+    copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
+                                       xsign, numerator_ct);

     host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
-        streams, gpu_indexes, gpu_count, xsign, numerator_bits - 1,
+        streams, xsign, numerator_bits - 1,
         mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks,
         ms_noise_reduction_key);

     host_sub_and_propagate_single_carry<Torus>(
-        streams, gpu_indexes, gpu_count, tmp, xsign, nullptr, nullptr,
-        mem_ptr->sub_and_propagate_mem, bsks, ksks, ms_noise_reduction_key,
-        FLAG_NONE, (uint32_t)0);
+        streams, tmp, xsign, nullptr, nullptr, mem_ptr->sub_and_propagate_mem,
+        bsks, ksks, ms_noise_reduction_key, FLAG_NONE, (uint32_t)0);
   }

   if (scalar_divisor_ffi->is_divisor_negative) {
     host_integer_radix_negation<Torus>(
-        streams, gpu_indexes, gpu_count, numerator_ct, tmp,
-        mem_ptr->params.message_modulus, mem_ptr->params.carry_modulus,
-        numerator_ct->num_radix_blocks);
+        streams, numerator_ct, tmp, mem_ptr->params.message_modulus,
+        mem_ptr->params.carry_modulus, numerator_ct->num_radix_blocks);
   } else {
-    copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], numerator_ct,
-                                       tmp);
+    copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
+                                       numerator_ct, tmp);
   }
 }

 template <typename Torus>
 __host__ uint64_t scratch_integer_unsigned_scalar_div_rem_radix(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, const int_radix_params params,
+    CudaStreams streams, const int_radix_params params,
     int_unsigned_scalar_div_rem_buffer<Torus> **mem_ptr,
     uint32_t num_radix_blocks, const CudaScalarDivisorFFI *scalar_divisor_ffi,
     uint32_t const active_bits_divisor, const bool allocate_gpu_memory) {

   uint64_t size_tracker = 0;
   *mem_ptr = new int_unsigned_scalar_div_rem_buffer<Torus>(
-      streams, gpu_indexes, gpu_count, params, num_radix_blocks,
-      scalar_divisor_ffi, active_bits_divisor, allocate_gpu_memory,
-      size_tracker);
+      streams, params, num_radix_blocks, scalar_divisor_ffi,
+      active_bits_divisor, allocate_gpu_memory, size_tracker);
   return size_tracker;
 }

 template <typename Torus>
 __host__ void host_integer_unsigned_scalar_div_rem_radix(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, CudaRadixCiphertextFFI *quotient_ct,
+    CudaStreams streams, CudaRadixCiphertextFFI *quotient_ct,
     CudaRadixCiphertextFFI *remainder_ct,
     int_unsigned_scalar_div_rem_buffer<Torus> *mem_ptr, void *const *bsks,
     Torus *const *ksks,
@@ -291,32 +272,32 @@ __host__ void host_integer_unsigned_scalar_div_rem_radix(
     uint32_t num_clear_blocks) {

   auto numerator_ct = mem_ptr->numerator_ct;
-  copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], numerator_ct,
-                                     quotient_ct);
+  copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
+                                     numerator_ct, quotient_ct);

   host_integer_unsigned_scalar_div_radix(
-      streams, gpu_indexes, gpu_count, quotient_ct, mem_ptr->unsigned_div_mem,
-      bsks, ksks, ms_noise_reduction_key, scalar_divisor_ffi);
+      streams, quotient_ct, mem_ptr->unsigned_div_mem, bsks, ksks,
+      ms_noise_reduction_key, scalar_divisor_ffi);

   if (scalar_divisor_ffi->is_divisor_pow2) {

-    copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], remainder_ct,
-                                       numerator_ct);
-    host_integer_radix_scalar_bitop_kb(
-        streams, gpu_indexes, gpu_count, remainder_ct, remainder_ct,
-        clear_blocks, h_clear_blocks, num_clear_blocks, mem_ptr->bitop_mem,
+    copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
+                                       remainder_ct, numerator_ct);
+    host_integer_radix_scalar_bitop_kb(streams, remainder_ct, remainder_ct,
+                                       clear_blocks, h_clear_blocks,
+                                       num_clear_blocks, mem_ptr->bitop_mem,
                                        bsks, ksks, ms_noise_reduction_key);

   } else {
     if (!scalar_divisor_ffi->is_divisor_zero) {
-      copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
-                                         remainder_ct, quotient_ct);
+      copy_radix_ciphertext_async<Torus>(
+          streams.stream(0), streams.gpu_index(0), remainder_ct, quotient_ct);

       if (!scalar_divisor_ffi->is_abs_divisor_one &&
           remainder_ct->num_radix_blocks != 0) {

         host_integer_scalar_mul_radix<Torus>(
-            streams, gpu_indexes, gpu_count, remainder_ct, decomposed_divisor,
+            streams, remainder_ct, decomposed_divisor,
             divisor_has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks, ksks,
             ms_noise_reduction_key, mem_ptr->params.message_modulus,
             num_scalars_divisor);
@@ -324,19 +305,18 @@ __host__ void host_integer_unsigned_scalar_div_rem_radix(
       }

       host_sub_and_propagate_single_carry(
-          streams, gpu_indexes, gpu_count, numerator_ct, remainder_ct, nullptr,
-          nullptr, mem_ptr->sub_and_propagate_mem, bsks, ksks,
-          ms_noise_reduction_key, FLAG_NONE, (uint32_t)0);
+          streams, numerator_ct, remainder_ct, nullptr, nullptr,
+          mem_ptr->sub_and_propagate_mem, bsks, ksks, ms_noise_reduction_key,
+          FLAG_NONE, (uint32_t)0);

-      copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
-                                         remainder_ct, numerator_ct);
+      copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
+                                         remainder_ct, numerator_ct);
     }
   }

 template <typename Torus>
 __host__ uint64_t scratch_integer_signed_scalar_div_rem_radix(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, const int_radix_params params,
+    CudaStreams streams, const int_radix_params params,
     int_signed_scalar_div_rem_buffer<Torus> **mem_ptr,
     uint32_t num_radix_blocks, const CudaScalarDivisorFFI *scalar_divisor_ffi,
     uint32_t const active_bits_divisor, const bool allocate_gpu_memory) {
@@ -344,17 +324,15 @@ __host__ uint64_t scratch_integer_signed_scalar_div_rem_radix(
   uint64_t size_tracker = 0;

   *mem_ptr = new int_signed_scalar_div_rem_buffer<Torus>(
-      streams, gpu_indexes, gpu_count, params, num_radix_blocks,
-      scalar_divisor_ffi, active_bits_divisor, allocate_gpu_memory,
-      size_tracker);
+      streams, params, num_radix_blocks, scalar_divisor_ffi,
+      active_bits_divisor, allocate_gpu_memory, size_tracker);

   return size_tracker;
 }

 template <typename Torus>
 __host__ void host_integer_signed_scalar_div_rem_radix(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, CudaRadixCiphertextFFI *quotient_ct,
+    CudaStreams streams, CudaRadixCiphertextFFI *quotient_ct,
     CudaRadixCiphertextFFI *remainder_ct,
     int_signed_scalar_div_rem_buffer<Torus> *mem_ptr, void *const *bsks,
     Torus *const *ksks,
@@ -365,38 +343,37 @@ __host__ void host_integer_signed_scalar_div_rem_radix(
     uint32_t numerator_bits) {

   auto numerator_ct = mem_ptr->numerator_ct;
-  copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], numerator_ct,
-                                     quotient_ct);
+  copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
+                                     numerator_ct, quotient_ct);

   host_integer_signed_scalar_div_radix_kb(
-      streams, gpu_indexes, gpu_count, quotient_ct, mem_ptr->signed_div_mem,
-      bsks, ksks, ms_noise_reduction_key, scalar_divisor_ffi, numerator_bits);
+      streams, quotient_ct, mem_ptr->signed_div_mem, bsks, ksks,
+      ms_noise_reduction_key, scalar_divisor_ffi, numerator_bits);

   host_propagate_single_carry<Torus>(
-      streams, gpu_indexes, gpu_count, quotient_ct, nullptr, nullptr,
-      mem_ptr->scp_mem, bsks, ksks, ms_noise_reduction_key, FLAG_NONE,
-      (uint32_t)0);
+      streams, quotient_ct, nullptr, nullptr, mem_ptr->scp_mem, bsks, ksks,
+      ms_noise_reduction_key, FLAG_NONE, (uint32_t)0);

   if (!scalar_divisor_ffi->is_divisor_negative &&
       scalar_divisor_ffi->is_divisor_pow2) {
-    copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], remainder_ct,
-                                       quotient_ct);
+    copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
+                                       remainder_ct, quotient_ct);

     host_integer_radix_logical_scalar_shift_kb_inplace(
-        streams, gpu_indexes, gpu_count, remainder_ct,
-        scalar_divisor_ffi->ilog2_divisor, mem_ptr->logical_scalar_shift_mem,
-        bsks, ksks, ms_noise_reduction_key, remainder_ct->num_radix_blocks);
+        streams, remainder_ct, scalar_divisor_ffi->ilog2_divisor,
+        mem_ptr->logical_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key,
+        remainder_ct->num_radix_blocks);

   } else if (!scalar_divisor_ffi->is_divisor_zero) {
-    copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], remainder_ct,
-                                       quotient_ct);
+    copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
+                                       remainder_ct, quotient_ct);

     bool is_divisor_one = scalar_divisor_ffi->is_abs_divisor_one &&
                           !scalar_divisor_ffi->is_divisor_negative;

     if (!is_divisor_one && remainder_ct->num_radix_blocks != 0) {
       host_integer_scalar_mul_radix<Torus>(
-          streams, gpu_indexes, gpu_count, remainder_ct, decomposed_divisor,
+          streams, remainder_ct, decomposed_divisor,
           divisor_has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks, ksks,
           ms_noise_reduction_key, mem_ptr->params.message_modulus,
           num_scalars_divisor);
@@ -404,12 +381,12 @@ __host__ void host_integer_signed_scalar_div_rem_radix(
     }

     host_sub_and_propagate_single_carry(
-        streams, gpu_indexes, gpu_count, numerator_ct, remainder_ct, nullptr,
-        nullptr, mem_ptr->sub_and_propagate_mem, bsks, ksks,
-        ms_noise_reduction_key, FLAG_NONE, (uint32_t)0);
+        streams, numerator_ct, remainder_ct, nullptr, nullptr,
+        mem_ptr->sub_and_propagate_mem, bsks, ksks, ms_noise_reduction_key,
+        FLAG_NONE, (uint32_t)0);

-    copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], remainder_ct,
-                                       numerator_ct);
+    copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
+                                       remainder_ct, numerator_ct);
   }

 #endif
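The unsigned path above is the classic division-by-invariant-integer scheme: an optional shift_pre, a multiply-high by a precomputed constant, then shift_post, with a subtract/halve/add fix-up when the chosen multiplier does not fit the word (the mul-high, sub, shift-by-one, add, then shift_post - 1 sequence in host_integer_unsigned_scalar_div_radix). A plain-CPU sketch of that fix-up variant for a 32-bit divide by 7; the magic constant comes from the standard construction, not from this codebase:

    #include <cassert>
    #include <cstdint>

    // Upper 32 bits of a 32x32 -> 64-bit product: the role played on
    // ciphertexts by host_integer_radix_scalar_mul_high_kb.
    static uint32_t mulhi32(uint32_t a, uint32_t b) {
      return (uint32_t)(((uint64_t)a * b) >> 32);
    }

    // n / 7 without dividing. M is ceil(2^35 / 7) truncated to 32 bits; the
    // full multiplier needs 33 bits, so the quotient is recovered with the
    // fix-up t = mulhi(n, M); q = (((n - t) >> 1) + t) >> 2, mirroring the
    // sub / shift-by-one / add / final-shift steps in the GPU code.
    static uint32_t div7(uint32_t n) {
      const uint32_t M = 0x24924925u;
      uint32_t t = mulhi32(n, M);
      return (((n - t) >> 1) + t) >> 2;
    }

    int main() {
      for (uint64_t n = 0; n < (1u << 20); n++)
        assert(div7((uint32_t)n) == (uint32_t)(n / 7));
      assert(div7(0xFFFFFFFFu) == 0xFFFFFFFFu / 7);
      return 0;
    }

The signed variant in the same file follows the signed version of this identity: multiply-high, add the numerator back when the multiplier is negative, arithmetic shift by shift_post, then subtract the sign word xsign = n >> (numerator_bits - 1) so the quotient rounds toward zero.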
@@ -1,13 +1,12 @@
|
||||
#include "integer/scalar_mul.cuh"
|
||||
|
||||
uint64_t scratch_cuda_integer_scalar_mul_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
|
||||
PBS_TYPE pbs_type, uint32_t num_scalar_bits, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type) {
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t num_scalar_bits,
|
||||
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
|
||||
|
||||
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
|
||||
glwe_dimension * polynomial_size, lwe_dimension,
|
||||
@@ -16,36 +15,31 @@ uint64_t scratch_cuda_integer_scalar_mul_kb_64(
|
||||
noise_reduction_type);
|
||||
|
||||
return scratch_cuda_integer_radix_scalar_mul_kb<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
(int_scalar_mul_buffer<uint64_t> **)mem_ptr, num_blocks, params,
|
||||
num_scalar_bits, allocate_gpu_memory);
|
||||
CudaStreams(streams), (int_scalar_mul_buffer<uint64_t> **)mem_ptr,
|
||||
num_blocks, params, num_scalar_bits, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *lwe_array, uint64_t const *decomposed_scalar,
|
||||
uint64_t const *has_at_least_one_set, int8_t *mem, void *const *bsks,
|
||||
void *const *ksks,
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array,
|
||||
uint64_t const *decomposed_scalar, uint64_t const *has_at_least_one_set,
|
||||
int8_t *mem, void *const *bsks, void *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t polynomial_size, uint32_t message_modulus, uint32_t num_scalars) {
|
||||
|
||||
host_integer_scalar_mul_radix<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array,
|
||||
decomposed_scalar, has_at_least_one_set,
|
||||
CudaStreams(streams), lwe_array, decomposed_scalar, has_at_least_one_set,
|
||||
reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsks,
|
||||
(uint64_t **)(ksks), ms_noise_reduction_key, message_modulus,
|
||||
num_scalars);
|
||||
}
|
||||
|
||||
void cleanup_cuda_integer_radix_scalar_mul(void *const *streams,
|
||||
uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count,
|
||||
void cleanup_cuda_integer_radix_scalar_mul(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void) {
|
||||
|
||||
int_scalar_mul_buffer<uint64_t> *mem_ptr =
|
||||
(int_scalar_mul_buffer<uint64_t> *)(*mem_ptr_void);
|
||||
|
||||
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
|
||||
mem_ptr->release(CudaStreams(streams));
|
||||
delete mem_ptr;
|
||||
*mem_ptr_void = nullptr;
|
||||
}
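
Every C entry point in this commit follows the same pattern: it now receives a
single CudaStreamsFFI value and immediately wraps it in a CudaStreams before
calling into host code, replacing the old (streams, gpu_indexes, gpu_count)
triple. A toy sketch of such a non-owning wrapper, with illustrative names and
layout (the real types live in the backend headers and may differ):

#include <cstdint>
#include <cuda_runtime.h>

struct ToyStreamsFFI {
  void *const *streams;        // opaque stream handles coming from Rust
  uint32_t const *gpu_indexes; // one GPU index per stream
  uint32_t gpu_count;
};

class ToyStreams {
  cudaStream_t const *streams_;
  uint32_t const *gpu_indexes_;
  uint32_t gpu_count_;

public:
  // Non-owning: no stream is created or destroyed; the wrapper only groups
  // the three fields that used to travel as separate parameters.
  explicit ToyStreams(ToyStreamsFFI ffi)
      : streams_((cudaStream_t const *)ffi.streams),
        gpu_indexes_(ffi.gpu_indexes), gpu_count_(ffi.gpu_count) {}
  cudaStream_t stream(uint32_t i) const { return streams_[i]; }
  uint32_t gpu_index(uint32_t i) const { return gpu_indexes_[i]; }
  uint32_t count() const { return gpu_count_; }
};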

@@ -30,22 +30,20 @@ __global__ void device_small_scalar_radix_multiplication(T *output_lwe_array,

template <typename T>
__host__ uint64_t scratch_cuda_integer_radix_scalar_mul_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_scalar_mul_buffer<T> **mem_ptr,
CudaStreams streams, int_scalar_mul_buffer<T> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params,
uint32_t num_scalar_bits, bool allocate_gpu_memory) {

uint64_t size_tracker = 0;
*mem_ptr = new int_scalar_mul_buffer<T>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
num_scalar_bits, allocate_gpu_memory, true, size_tracker);
*mem_ptr = new int_scalar_mul_buffer<T>(streams, params, num_radix_blocks,
num_scalar_bits, allocate_gpu_memory,
true, size_tracker);
return size_tracker;
}
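
All the scratch functions share this shape: a size_tracker accumulates the
bytes each buffer would occupy and the scratch returns the total, so with
allocate_gpu_memory set to false the call is effectively a dry run that only
measures memory. A toy illustration of the convention (names are made up;
cudaMallocAsync is the stock CUDA runtime call):

struct toy_buffer {
  void *data = nullptr;
  toy_buffer(cudaStream_t stream, uint64_t bytes, bool allocate,
             uint64_t &size_tracker) {
    size_tracker += bytes; // always counted, so callers can pre-query needs
    if (allocate)
      cudaMallocAsync(&data, bytes, stream);
  }
};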

template <typename T>
__host__ void host_integer_scalar_mul_radix(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array,
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array,
T const *decomposed_scalar, T const *has_at_least_one_set,
int_scalar_mul_buffer<T> *mem, void *const *bsks, T *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
@@ -66,17 +64,17 @@ __host__ void host_integer_scalar_mul_radix(
shift_amount * num_radix_blocks,
preshifted_buffer->num_radix_blocks);
if (has_at_least_one_set[shift_amount] == 1) {
copy_radix_ciphertext_slice_async<T>(streams[0], gpu_indexes[0],
&shift_input, 0, num_radix_blocks,
lwe_array, 0, num_radix_blocks);
copy_radix_ciphertext_slice_async<T>(
streams.stream(0), streams.gpu_index(0), &shift_input, 0,
num_radix_blocks, lwe_array, 0, num_radix_blocks);
host_integer_radix_logical_scalar_shift_kb_inplace<T>(
streams, gpu_indexes, gpu_count, &shift_input, shift_amount,
mem->logical_scalar_shift_buffer, bsks, ksks, ms_noise_reduction_key,
num_radix_blocks);
streams, &shift_input, shift_amount, mem->logical_scalar_shift_buffer,
bsks, ksks, ms_noise_reduction_key, num_radix_blocks);
} else {
// create trivial assign for value = 0
set_zero_radix_ciphertext_slice_async<T>(
streams[0], gpu_indexes[0], &shift_input, 0, num_radix_blocks);
streams.stream(0), streams.gpu_index(0), &shift_input, 0,
num_radix_blocks);
}
}
size_t j = 0;
@@ -91,46 +89,46 @@ __host__ void host_integer_scalar_mul_radix(
as_radix_ciphertext_slice<T>(&block_shift_buffer, all_shifted_buffer,
j * num_radix_blocks,
all_shifted_buffer->num_radix_blocks);
host_radix_blocks_rotate_right<T>(
streams, gpu_indexes, gpu_count, &block_shift_buffer,
&preshifted_radix_ct, i / msg_bits, num_radix_blocks);
host_radix_blocks_rotate_right<T>(streams, &block_shift_buffer,
&preshifted_radix_ct, i / msg_bits,
num_radix_blocks);
// create trivial assign for value = 0
set_zero_radix_ciphertext_slice_async<T>(
streams[0], gpu_indexes[0], &block_shift_buffer, 0, i / msg_bits);
streams.stream(0), streams.gpu_index(0), &block_shift_buffer, 0,
i / msg_bits);
j++;
}
}
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));

if (mem->anticipated_buffers_drop) {
mem->release_buffers(streams, gpu_indexes, gpu_count);
mem->release_buffers(streams);
}

if (j == 0) {
// lwe array = 0
set_zero_radix_ciphertext_slice_async<T>(streams[0], gpu_indexes[0],
lwe_array, 0, num_radix_blocks);
set_zero_radix_ciphertext_slice_async<T>(streams.stream(0),
streams.gpu_index(0), lwe_array, 0,
num_radix_blocks);
} else {
host_integer_partial_sum_ciphertexts_vec_kb<T>(
streams, gpu_indexes, gpu_count, lwe_array, all_shifted_buffer, bsks,
ksks, ms_noise_reduction_key, mem->sum_ciphertexts_vec_mem,
num_radix_blocks, j);
streams, lwe_array, all_shifted_buffer, bsks, ksks,
ms_noise_reduction_key, mem->sum_ciphertexts_vec_mem, num_radix_blocks,
j);

auto scp_mem_ptr = mem->sc_prop_mem;
uint32_t requested_flag = outputFlag::FLAG_NONE;
uint32_t uses_carry = 0;
host_propagate_single_carry<T>(streams, gpu_indexes, gpu_count, lwe_array,
nullptr, nullptr, scp_mem_ptr, bsks, ksks,
ms_noise_reduction_key, requested_flag,
uses_carry);
host_propagate_single_carry<T>(
streams, lwe_array, nullptr, nullptr, scp_mem_ptr, bsks, ksks,
ms_noise_reduction_key, requested_flag, uses_carry);
}
}
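
Underneath the radix plumbing this is classic shift-and-add multiplication:
for every set bit of the scalar a copy of the ciphertext shifted to that bit
position is queued, the copies are summed, and carries are propagated once at
the end. A cleartext analogue of the same decomposition:

// Multiply by summing x << i over the scalar's set bits, mirroring the
// preshifted buffers plus the final partial-sum and carry-propagation pass.
uint64_t shift_and_add_mul(uint64_t x, uint64_t scalar) {
  uint64_t acc = 0;
  for (uint32_t i = 0; i < 64; i++)
    if ((scalar >> i) & 1)
      acc += x << i; // one "shifted copy" per set bit
  return acc;
}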

// Small scalar_mul is used in shift/rotate
template <typename T>
__host__ void host_integer_small_scalar_mul_radix(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *output_lwe_array,
CudaStreams streams, CudaRadixCiphertextFFI *output_lwe_array,
CudaRadixCiphertextFFI *input_lwe_array, T scalar,
const uint32_t message_modulus, const uint32_t carry_modulus) {

@@ -139,7 +137,7 @@ __host__ void host_integer_small_scalar_mul_radix(
if (output_lwe_array->lwe_dimension != input_lwe_array->lwe_dimension)
PANIC("Cuda error: input and output lwe_dimension must be the same")

cuda_set_device(gpu_indexes[0]);
cuda_set_device(streams.gpu_index(0));
auto lwe_dimension = input_lwe_array->lwe_dimension;
auto num_radix_blocks = input_lwe_array->num_radix_blocks;

@@ -153,7 +151,8 @@ __host__ void host_integer_small_scalar_mul_radix(
dim3 grid(num_blocks, 1, 1);
dim3 thds(num_threads, 1, 1);

device_small_scalar_radix_multiplication<<<grid, thds, 0, streams[0]>>>(
device_small_scalar_radix_multiplication<<<grid, thds, 0,
streams.stream(0)>>>(
(T *)output_lwe_array->ptr, (T *)input_lwe_array->ptr, scalar,
lwe_dimension, num_radix_blocks);
check_cuda_error(cudaGetLastError());
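
Small scalar multiplication relies on the linearity of LWE: multiplying every
torus coefficient of the ciphertext (mask and body alike) by a small cleartext
scalar multiplies the underlying plaintext by the same scalar. A plausible
shape for such a kernel, as an illustration rather than the backend's actual
implementation:

// One thread per torus coefficient; the (lwe_dimension + 1) factor accounts
// for the LWE body. Wrapping uint64_t arithmetic is torus arithmetic mod 2^64.
template <typename T>
__global__ void small_scalar_mul_sketch(T *out, const T *in, T scalar,
                                        uint32_t lwe_dimension,
                                        uint32_t num_radix_blocks) {
  uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x;
  uint32_t num_entries = num_radix_blocks * (lwe_dimension + 1);
  if (idx < num_entries)
    out[idx] = in[idx] * scalar;
}
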
@@ -169,22 +168,20 @@ __host__ void host_integer_small_scalar_mul_radix(

template <typename Torus>
__host__ void host_integer_radix_scalar_mul_high_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *ct,
CudaStreams streams, CudaRadixCiphertextFFI *ct,
int_scalar_mul_high_buffer<Torus> *mem_ptr, Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void *const *bsks, const CudaScalarDivisorFFI *scalar_divisor_ffi) {

if (scalar_divisor_ffi->is_chosen_multiplier_zero) {
set_zero_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], ct,
0, ct->num_radix_blocks);
set_zero_radix_ciphertext_slice_async<Torus>(
streams.stream(0), streams.gpu_index(0), ct, 0, ct->num_radix_blocks);
return;
}

CudaRadixCiphertextFFI *tmp_ffi = mem_ptr->tmp;

host_extend_radix_with_trivial_zero_blocks_msb<Torus>(tmp_ffi, ct, streams,
gpu_indexes);
host_extend_radix_with_trivial_zero_blocks_msb<Torus>(tmp_ffi, ct, streams);

if (scalar_divisor_ffi->active_bits != (uint32_t)0 &&
!scalar_divisor_ffi->is_abs_chosen_multiplier_one &&
@@ -192,16 +189,14 @@ __host__ void host_integer_radix_scalar_mul_high_kb(

if (scalar_divisor_ffi->is_chosen_multiplier_pow2) {
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, tmp_ffi,
scalar_divisor_ffi->ilog2_chosen_multiplier,
streams, tmp_ffi, scalar_divisor_ffi->ilog2_chosen_multiplier,
mem_ptr->logical_scalar_shift_mem, bsks, (uint64_t **)ksks,
ms_noise_reduction_key, tmp_ffi->num_radix_blocks);

} else {

host_integer_scalar_mul_radix<Torus>(
streams, gpu_indexes, gpu_count, tmp_ffi,
scalar_divisor_ffi->decomposed_chosen_multiplier,
streams, tmp_ffi, scalar_divisor_ffi->decomposed_chosen_multiplier,
scalar_divisor_ffi->chosen_multiplier_has_at_least_one_set,
mem_ptr->scalar_mul_mem, bsks, (uint64_t **)ksks,
ms_noise_reduction_key, mem_ptr->params.message_modulus,
@@ -209,29 +204,28 @@ __host__ void host_integer_radix_scalar_mul_high_kb(
}
}

host_trim_radix_blocks_lsb<Torus>(ct, tmp_ffi, streams, gpu_indexes);
host_trim_radix_blocks_lsb<Torus>(ct, tmp_ffi, streams);
}

template <typename Torus>
__host__ void host_integer_radix_signed_scalar_mul_high_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *ct,
CudaStreams streams, CudaRadixCiphertextFFI *ct,
int_signed_scalar_mul_high_buffer<Torus> *mem_ptr, Torus *const *ksks,
const CudaScalarDivisorFFI *scalar_divisor_ffi,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void *const *bsks) {

if (scalar_divisor_ffi->is_chosen_multiplier_zero) {
set_zero_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], ct,
0, ct->num_radix_blocks);
set_zero_radix_ciphertext_slice_async<Torus>(
streams.stream(0), streams.gpu_index(0), ct, 0, ct->num_radix_blocks);
return;
}

CudaRadixCiphertextFFI *tmp_ffi = mem_ptr->tmp;

host_extend_radix_with_sign_msb<Torus>(
streams, gpu_indexes, gpu_count, tmp_ffi, ct, mem_ptr->extend_radix_mem,
ct->num_radix_blocks, bsks, (uint64_t **)ksks, ms_noise_reduction_key);
streams, tmp_ffi, ct, mem_ptr->extend_radix_mem, ct->num_radix_blocks,
bsks, (uint64_t **)ksks, ms_noise_reduction_key);

if (scalar_divisor_ffi->active_bits != (uint32_t)0 &&
!scalar_divisor_ffi->is_abs_chosen_multiplier_one &&
@@ -239,14 +233,12 @@ __host__ void host_integer_radix_signed_scalar_mul_high_kb(

if (scalar_divisor_ffi->is_chosen_multiplier_pow2) {
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, tmp_ffi,
scalar_divisor_ffi->ilog2_chosen_multiplier,
streams, tmp_ffi, scalar_divisor_ffi->ilog2_chosen_multiplier,
mem_ptr->logical_scalar_shift_mem, bsks, (uint64_t **)ksks,
ms_noise_reduction_key, tmp_ffi->num_radix_blocks);
} else {
host_integer_scalar_mul_radix<Torus>(
streams, gpu_indexes, gpu_count, tmp_ffi,
scalar_divisor_ffi->decomposed_chosen_multiplier,
streams, tmp_ffi, scalar_divisor_ffi->decomposed_chosen_multiplier,
scalar_divisor_ffi->chosen_multiplier_has_at_least_one_set,
mem_ptr->scalar_mul_mem, bsks, (uint64_t **)ksks,
ms_noise_reduction_key, mem_ptr->params.message_modulus,
@@ -254,7 +246,7 @@ __host__ void host_integer_radix_signed_scalar_mul_high_kb(
}
}

host_trim_radix_blocks_lsb<Torus>(ct, tmp_ffi, streams, gpu_indexes);
host_trim_radix_blocks_lsb<Torus>(ct, tmp_ffi, streams);
}
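
Both mul-high variants use the same widen-multiply-trim recipe: extend the
ciphertext with extra most-significant blocks (trivial zeros when unsigned,
sign blocks when signed), multiply in the widened domain, then trim the
least-significant blocks so only the high half of the product remains. The
cleartext analogue on 64-bit values:

// Widen, multiply, keep the top half; the encrypted code does the same with
// radix blocks instead of a 128-bit temporary (requires __int128 support).
uint64_t mul_high_unsigned(uint64_t a, uint64_t b) {
  return (uint64_t)(((__uint128_t)a * b) >> 64); // zero-extend ~ trivial zero blocks
}

int64_t mul_high_signed(int64_t a, int64_t b) {
  return (int64_t)(((__int128)a * (__int128)b) >> 64); // sign-extend ~ sign blocks
}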

#endif

@@ -1,12 +1,12 @@
#include "scalar_rotate.cuh"

uint64_t scratch_cuda_integer_radix_scalar_rotate_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {

int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
@@ -15,32 +15,29 @@ uint64_t scratch_cuda_integer_radix_scalar_rotate_kb_64(
message_modulus, carry_modulus, noise_reduction_type);

return scratch_cuda_integer_radix_scalar_rotate_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
CudaStreams(streams),
(int_logical_scalar_shift_buffer<uint64_t> **)mem_ptr, num_blocks, params,
shift_type, allocate_gpu_memory);
}

void cuda_integer_radix_scalar_rotate_kb_64_inplace(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *lwe_array, uint32_t n, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array, uint32_t n,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {

host_integer_radix_scalar_rotate_kb_inplace<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array, n,
CudaStreams(streams), lwe_array, n,
(int_logical_scalar_shift_buffer<uint64_t> *)mem_ptr, bsks,
(uint64_t **)(ksks), ms_noise_reduction_key);
}

void cleanup_cuda_integer_radix_scalar_rotate(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
void cleanup_cuda_integer_radix_scalar_rotate(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {

int_logical_scalar_shift_buffer<uint64_t> *mem_ptr =
(int_logical_scalar_shift_buffer<uint64_t> *)(*mem_ptr_void);

mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
*mem_ptr_void = nullptr;
}

@@ -13,22 +13,20 @@

template <typename Torus>
__host__ uint64_t scratch_cuda_integer_radix_scalar_rotate_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_logical_scalar_shift_buffer<Torus> **mem_ptr,
CudaStreams streams, int_logical_scalar_shift_buffer<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params,
SHIFT_OR_ROTATE_TYPE shift_type, bool allocate_gpu_memory) {

uint64_t size_tracker = 0;
*mem_ptr = new int_logical_scalar_shift_buffer<Torus>(
streams, gpu_indexes, gpu_count, shift_type, params, num_radix_blocks,
allocate_gpu_memory, size_tracker);
streams, shift_type, params, num_radix_blocks, allocate_gpu_memory,
size_tracker);
return size_tracker;
}

template <typename Torus>
__host__ void host_integer_radix_scalar_rotate_kb_inplace(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array, uint32_t n,
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array, uint32_t n,
int_logical_scalar_shift_buffer<Torus> *mem, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
@@ -56,12 +54,11 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(
// one block is responsible to process single lwe ciphertext
if (mem->shift_type == LEFT_SHIFT) {
// rotate right as the blocks are from LSB to MSB
host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count,
rotated_buffer, lwe_array, rotations,
num_blocks);
host_radix_blocks_rotate_right<Torus>(streams, rotated_buffer, lwe_array,
rotations, num_blocks);

copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
lwe_array, 0, num_blocks,
copy_radix_ciphertext_slice_async<Torus>(
streams.stream(0), streams.gpu_index(0), lwe_array, 0, num_blocks,
rotated_buffer, 0, num_blocks);

if (shift_within_block == 0) {
@@ -70,25 +67,23 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(

auto receiver_blocks = lwe_array;
auto giver_blocks = rotated_buffer;
host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count,
giver_blocks, lwe_array, 1,
host_radix_blocks_rotate_right<Torus>(streams, giver_blocks, lwe_array, 1,
num_blocks);

auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];

integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array, receiver_blocks,
giver_blocks, bsks, ksks, ms_noise_reduction_key, lut_bivariate,
num_blocks, lut_bivariate->params.message_modulus);
streams, lwe_array, receiver_blocks, giver_blocks, bsks, ksks,
ms_noise_reduction_key, lut_bivariate, num_blocks,
lut_bivariate->params.message_modulus);

} else {
// rotate left as the blocks are from LSB to MSB
host_radix_blocks_rotate_left<Torus>(streams, gpu_indexes, gpu_count,
rotated_buffer, lwe_array, rotations,
num_blocks);
host_radix_blocks_rotate_left<Torus>(streams, rotated_buffer, lwe_array,
rotations, num_blocks);

copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
lwe_array, 0, num_blocks,
copy_radix_ciphertext_slice_async<Torus>(
streams.stream(0), streams.gpu_index(0), lwe_array, 0, num_blocks,
rotated_buffer, 0, num_blocks);

if (shift_within_block == 0) {
@@ -97,16 +92,15 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(

auto receiver_blocks = lwe_array;
auto giver_blocks = rotated_buffer;
host_radix_blocks_rotate_left<Torus>(streams, gpu_indexes, gpu_count,
giver_blocks, lwe_array, 1,
host_radix_blocks_rotate_left<Torus>(streams, giver_blocks, lwe_array, 1,
num_blocks);

auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];

integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array, receiver_blocks,
giver_blocks, bsks, ksks, ms_noise_reduction_key, lut_bivariate,
num_blocks, lut_bivariate->params.message_modulus);
streams, lwe_array, receiver_blocks, giver_blocks, bsks, ksks,
ms_noise_reduction_key, lut_bivariate, num_blocks,
lut_bivariate->params.message_modulus);
}
}
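
A scalar rotation by n bits splits into a whole-block part (n / bits_per_block)
and a sub-block part (n % bits_per_block): the block part is pure data
movement, and the leftover bits are handled by one bivariate PBS per block
that stitches each receiver block together with its giver neighbour. A
cleartext sketch of the same split on an array of small digits (illustrative
only; assumes 0 < bits < 32):

#include <cstdint>
#include <vector>

// Rotate a little-endian array of `bits`-wide digits left by n bits: rotate
// whole digits first, then combine each digit with its neighbour to realize
// the remaining sub-digit shift (the encrypted code does the pairing with a
// bivariate LUT evaluated under a PBS).
std::vector<uint32_t> rotate_left_digits(const std::vector<uint32_t> &x,
                                         uint32_t n, uint32_t bits) {
  const size_t num_blocks = x.size();
  const uint32_t mask = (1u << bits) - 1;
  const size_t rotations = (n / bits) % num_blocks;
  const uint32_t shift_within_block = n % bits;
  std::vector<uint32_t> out(num_blocks);
  for (size_t i = 0; i < num_blocks; i++) {
    uint32_t receiver = x[(i + num_blocks - rotations) % num_blocks];
    uint32_t giver = x[(i + num_blocks - rotations - 1) % num_blocks];
    out[i] = ((receiver << shift_within_block) |
              (giver >> (bits - shift_within_block))) &
             mask;
  }
  return out;
}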


@@ -1,12 +1,12 @@
#include "scalar_shifts.cuh"

uint64_t scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {

int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
@@ -15,7 +15,7 @@ uint64_t scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
message_modulus, carry_modulus, noise_reduction_type);

return scratch_cuda_integer_radix_logical_scalar_shift_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
CudaStreams(streams),
(int_logical_scalar_shift_buffer<uint64_t> **)mem_ptr, num_blocks, params,
shift_type, allocate_gpu_memory);
}
@@ -25,24 +25,23 @@ uint64_t scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
/// the application of a PBS onto the rotated blocks up to num_blocks -
/// rotations - 1 The remaining blocks are padded with zeros
void cuda_integer_radix_logical_scalar_shift_kb_64_inplace(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *lwe_array, uint32_t shift, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array, uint32_t shift,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {

host_integer_radix_logical_scalar_shift_kb_inplace<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array, shift,
CudaStreams(streams), lwe_array, shift,
(int_logical_scalar_shift_buffer<uint64_t> *)mem_ptr, bsks,
(uint64_t **)(ksks), ms_noise_reduction_key, lwe_array->num_radix_blocks);
}

uint64_t scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {

int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
@@ -51,7 +50,7 @@ uint64_t scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
message_modulus, carry_modulus, noise_reduction_type);

return scratch_cuda_integer_radix_arithmetic_scalar_shift_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
CudaStreams(streams),
(int_arithmetic_scalar_shift_buffer<uint64_t> **)mem_ptr, num_blocks,
params, shift_type, allocate_gpu_memory);
}
@@ -64,37 +63,34 @@ uint64_t scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
/// block, which is copied onto all remaining blocks instead of padding with
/// zeros as would be done in the logical shift.
void cuda_integer_radix_arithmetic_scalar_shift_kb_64_inplace(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *lwe_array, uint32_t shift, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array, uint32_t shift,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {

host_integer_radix_arithmetic_scalar_shift_kb_inplace<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array, shift,
CudaStreams(streams), lwe_array, shift,
(int_arithmetic_scalar_shift_buffer<uint64_t> *)mem_ptr, bsks,
(uint64_t **)(ksks), ms_noise_reduction_key);
}
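
As the doc comments above spell out, the only difference between the two
shifts is what fills the vacated most-significant blocks: zeros for the
logical shift, copies of the sign block for the arithmetic one. In cleartext
terms:

// Logical vs. arithmetic right shift by k (k < 64): the encrypted versions
// reproduce exactly this padding difference at radix-block granularity.
uint64_t logical_shift_right(uint64_t x, uint32_t k) {
  return x >> k; // vacated high bits become zeros
}

int64_t arithmetic_shift_right(int64_t x, uint32_t k) {
  return x >> k; // vacated high bits replicate the sign bit
}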

void cleanup_cuda_integer_radix_logical_scalar_shift(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void cleanup_cuda_integer_radix_logical_scalar_shift(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {

int_logical_scalar_shift_buffer<uint64_t> *mem_ptr =
(int_logical_scalar_shift_buffer<uint64_t> *)(*mem_ptr_void);

mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
*mem_ptr_void = nullptr;
}

void cleanup_cuda_integer_radix_arithmetic_scalar_shift(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void cleanup_cuda_integer_radix_arithmetic_scalar_shift(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {

int_arithmetic_scalar_shift_buffer<uint64_t> *mem_ptr =
(int_arithmetic_scalar_shift_buffer<uint64_t> *)(*mem_ptr_void);

mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
*mem_ptr_void = nullptr;
}

@@ -13,22 +13,20 @@

template <typename Torus>
__host__ uint64_t scratch_cuda_integer_radix_logical_scalar_shift_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_logical_scalar_shift_buffer<Torus> **mem_ptr,
CudaStreams streams, int_logical_scalar_shift_buffer<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params,
SHIFT_OR_ROTATE_TYPE shift_type, bool allocate_gpu_memory) {

uint64_t size_tracker = 0;
*mem_ptr = new int_logical_scalar_shift_buffer<Torus>(
streams, gpu_indexes, gpu_count, shift_type, params, num_radix_blocks,
allocate_gpu_memory, size_tracker);
streams, shift_type, params, num_radix_blocks, allocate_gpu_memory,
size_tracker);
return size_tracker;
}

template <typename Torus>
__host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array, uint32_t shift,
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array, uint32_t shift,
int_logical_scalar_shift_buffer<Torus> *mem, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
@@ -56,15 +54,14 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(

if (mem->shift_type == LEFT_SHIFT) {
// rotate right as the blocks are from LSB to MSB
host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count,
&rotated_buffer, lwe_array, rotations,
num_blocks);
host_radix_blocks_rotate_right<Torus>(streams, &rotated_buffer, lwe_array,
rotations, num_blocks);

// create trivial assign for value = 0
set_zero_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
&rotated_buffer, 0, rotations);
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
lwe_array, 0, num_blocks,
set_zero_radix_ciphertext_slice_async<Torus>(
streams.stream(0), streams.gpu_index(0), &rotated_buffer, 0, rotations);
copy_radix_ciphertext_slice_async<Torus>(
streams.stream(0), streams.gpu_index(0), lwe_array, 0, num_blocks,
&rotated_buffer, 0, num_blocks);

if (shift_within_block == 0 || rotations == num_blocks) {
@@ -83,24 +80,23 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
size_t partial_block_count = num_blocks - rotations;

integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, &partial_current_blocks,
&partial_current_blocks, &partial_previous_blocks, bsks, ksks,
ms_noise_reduction_key, lut_bivariate, partial_block_count,
streams, &partial_current_blocks, &partial_current_blocks,
&partial_previous_blocks, bsks, ksks, ms_noise_reduction_key,
lut_bivariate, partial_block_count,
lut_bivariate->params.message_modulus);

} else {
// right shift
host_radix_blocks_rotate_left<Torus>(streams, gpu_indexes, gpu_count,
&rotated_buffer, lwe_array, rotations,
num_blocks);
host_radix_blocks_rotate_left<Torus>(streams, &rotated_buffer, lwe_array,
rotations, num_blocks);

// rotate left as the blocks are from LSB to MSB
// create trivial assign for value = 0
set_zero_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], &rotated_buffer, num_blocks - rotations,
num_blocks);
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
lwe_array, 0, num_blocks,
streams.stream(0), streams.gpu_index(0), &rotated_buffer,
num_blocks - rotations, num_blocks);
copy_radix_ciphertext_slice_async<Torus>(
streams.stream(0), streams.gpu_index(0), lwe_array, 0, num_blocks,
&rotated_buffer, 0, num_blocks);

if (shift_within_block == 0 || rotations == num_blocks) {
@@ -116,31 +112,28 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
size_t partial_block_count = num_blocks - rotations;

integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, partial_current_blocks,
partial_current_blocks, &partial_next_blocks, bsks, ksks,
ms_noise_reduction_key, lut_bivariate, partial_block_count,
lut_bivariate->params.message_modulus);
streams, partial_current_blocks, partial_current_blocks,
&partial_next_blocks, bsks, ksks, ms_noise_reduction_key, lut_bivariate,
partial_block_count, lut_bivariate->params.message_modulus);
}
}
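
A worked instance of the split used throughout this file, assuming the usual
2_2 parameters (2 message bits per block): shifting left by 5 bits gives
rotations = 5 / 2 = 2 whole blocks, handled by the block rotation and zero
fill, and shift_within_block = 5 % 2 = 1 leftover bit, handled by one layer of
bivariate PBS over the remaining num_blocks - rotations blocks:

uint32_t bits_per_block = 2;                          // log2(message_modulus)
uint32_t shift = 5;
uint32_t rotations = shift / bits_per_block;          // = 2 whole blocks
uint32_t shift_within_block = shift % bits_per_block; // = 1 bit, via PBS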

template <typename Torus>
__host__ uint64_t scratch_cuda_integer_radix_arithmetic_scalar_shift_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_arithmetic_scalar_shift_buffer<Torus> **mem_ptr,
CudaStreams streams, int_arithmetic_scalar_shift_buffer<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params,
SHIFT_OR_ROTATE_TYPE shift_type, bool allocate_gpu_memory) {

uint64_t size_tracker = 0;
*mem_ptr = new int_arithmetic_scalar_shift_buffer<Torus>(
streams, gpu_indexes, gpu_count, shift_type, params, num_radix_blocks,
allocate_gpu_memory, size_tracker);
streams, shift_type, params, num_radix_blocks, allocate_gpu_memory,
size_tracker);
return size_tracker;
}

template <typename Torus>
__host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array, uint32_t shift,
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array, uint32_t shift,
int_arithmetic_scalar_shift_buffer<Torus> *mem, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
@@ -167,11 +160,10 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
num_blocks + 2, num_blocks + 3);

if (mem->shift_type == RIGHT_SHIFT) {
host_radix_blocks_rotate_left<Torus>(streams, gpu_indexes, gpu_count,
mem->tmp_rotated, lwe_array, rotations,
num_blocks);
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
lwe_array, 0, num_blocks,
host_radix_blocks_rotate_left<Torus>(streams, mem->tmp_rotated, lwe_array,
rotations, num_blocks);
copy_radix_ciphertext_slice_async<Torus>(
streams.stream(0), streams.gpu_index(0), lwe_array, 0, num_blocks,
mem->tmp_rotated, 0, num_blocks);

if (num_bits_in_block == 1) {
@@ -183,7 +175,7 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
// we can optimize things by not doing the pbs to extract this sign bit
for (uint i = 0; i < num_blocks; i++) {
copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], mem->tmp_rotated,
streams.stream(0), streams.gpu_index(0), mem->tmp_rotated,
num_blocks - rotations + i, num_blocks - rotations + i + 1,
mem->tmp_rotated, num_blocks - rotations - 1,
num_blocks - rotations);
@@ -201,8 +193,8 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
num_blocks - rotations - 1,
num_blocks - rotations);
copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], &last_block_copy, 0, 1, mem->tmp_rotated,
num_blocks - rotations - 1, num_blocks - rotations);
streams.stream(0), streams.gpu_index(0), &last_block_copy, 0, 1,
mem->tmp_rotated, num_blocks - rotations - 1, num_blocks - rotations);
if (shift_within_block != 0) {
auto partial_current_blocks = lwe_array;
CudaRadixCiphertextFFI partial_next_blocks;
@@ -212,42 +204,37 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];

integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, partial_current_blocks,
partial_current_blocks, &partial_next_blocks, bsks, ksks,
ms_noise_reduction_key, lut_bivariate, partial_block_count,
streams, partial_current_blocks, partial_current_blocks,
&partial_next_blocks, bsks, ksks, ms_noise_reduction_key,
lut_bivariate, partial_block_count,
lut_bivariate->params.message_modulus);
}
// Since our CPU threads will be working on different streams we shall
// Ensure the work in the main stream is completed
for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
}
streams.synchronize();
auto lut_univariate_padding_block =
mem->lut_buffers_univariate[num_bits_in_block - 1];
integer_radix_apply_univariate_lookup_table_kb<Torus>(
mem->local_streams_1, gpu_indexes, gpu_count, &padding_block,
&last_block_copy, bsks, ksks, ms_noise_reduction_key,
lut_univariate_padding_block, 1);
mem->local_streams_1, &padding_block, &last_block_copy, bsks, ksks,
ms_noise_reduction_key, lut_univariate_padding_block, 1);
// Replace blocks 'pulled' from the left with the correct padding
// block
for (uint i = 0; i < rotations; i++) {
copy_radix_ciphertext_slice_async<Torus>(
mem->local_streams_1[0], gpu_indexes[0], lwe_array,
num_blocks - rotations + i, num_blocks - rotations + i + 1,
&padding_block, 0, 1);
mem->local_streams_1.stream(0), mem->local_streams_1.gpu_index(0),
lwe_array, num_blocks - rotations + i,
num_blocks - rotations + i + 1, &padding_block, 0, 1);
}
if (shift_within_block != 0) {
auto lut_univariate_shift_last_block =
mem->lut_buffers_univariate[shift_within_block - 1];
integer_radix_apply_univariate_lookup_table_kb<Torus>(
mem->local_streams_2, gpu_indexes, gpu_count, &last_block,
&last_block_copy, bsks, ksks, ms_noise_reduction_key,
lut_univariate_shift_last_block, 1);
}
for (uint j = 0; j < mem->active_gpu_count; j++) {
cuda_synchronize_stream(mem->local_streams_1[j], gpu_indexes[j]);
cuda_synchronize_stream(mem->local_streams_2[j], gpu_indexes[j]);
mem->local_streams_2, &last_block, &last_block_copy, bsks, ksks,
ms_noise_reduction_key, lut_univariate_shift_last_block, 1);
}

mem->local_streams_1.synchronize();
mem->local_streams_2.synchronize();
}
} else {
PANIC("Cuda error (scalar shift): left scalar shift is never of the "

@@ -1,14 +1,13 @@
#include "shift_and_rotate.cuh"

uint64_t scratch_cuda_integer_radix_shift_and_rotate_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
bool is_signed, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type) {
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type, bool is_signed,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {

int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
@@ -16,31 +15,28 @@ uint64_t scratch_cuda_integer_radix_shift_and_rotate_kb_64(
message_modulus, carry_modulus, noise_reduction_type);

return scratch_cuda_integer_radix_shift_and_rotate_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_shift_and_rotate_buffer<uint64_t> **)mem_ptr, num_blocks, params,
shift_type, is_signed, allocate_gpu_memory);
CudaStreams(streams), (int_shift_and_rotate_buffer<uint64_t> **)mem_ptr,
num_blocks, params, shift_type, is_signed, allocate_gpu_memory);
}

void cuda_integer_radix_shift_and_rotate_kb_64_inplace(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *lwe_array, CudaRadixCiphertextFFI const *lwe_shift,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array,
CudaRadixCiphertextFFI const *lwe_shift, int8_t *mem_ptr, void *const *bsks,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {

host_integer_radix_shift_and_rotate_kb_inplace<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array, lwe_shift,
CudaStreams(streams), lwe_array, lwe_shift,
(int_shift_and_rotate_buffer<uint64_t> *)mem_ptr, bsks,
(uint64_t **)(ksks), ms_noise_reduction_key);
}

void cleanup_cuda_integer_radix_shift_and_rotate(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
void cleanup_cuda_integer_radix_shift_and_rotate(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
int_shift_and_rotate_buffer<uint64_t> *mem_ptr =
(int_shift_and_rotate_buffer<uint64_t> *)(*mem_ptr_void);

mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
*mem_ptr_void = nullptr;
}

@@ -14,26 +14,24 @@

template <typename Torus>
__host__ uint64_t scratch_cuda_integer_radix_shift_and_rotate_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_shift_and_rotate_buffer<Torus> **mem_ptr,
CudaStreams streams, int_shift_and_rotate_buffer<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params,
SHIFT_OR_ROTATE_TYPE shift_type, bool is_signed, bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_shift_and_rotate_buffer<Torus>(
streams, gpu_indexes, gpu_count, shift_type, is_signed, params,
num_radix_blocks, allocate_gpu_memory, size_tracker);
streams, shift_type, is_signed, params, num_radix_blocks,
allocate_gpu_memory, size_tracker);
return size_tracker;
}

template <typename Torus>
__host__ void host_integer_radix_shift_and_rotate_kb_inplace(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array,
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array,
CudaRadixCiphertextFFI const *lwe_shift,
int_shift_and_rotate_buffer<Torus> *mem, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
cuda_set_device(gpu_indexes[0]);
cuda_set_device(streams.gpu_index(0));

if (lwe_array->num_radix_blocks != lwe_shift->num_radix_blocks)
PANIC("Cuda error: lwe_shift and lwe_array num radix blocks must be "
@@ -58,8 +56,8 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(

// Extract all bits
auto bits = mem->tmp_bits;
extract_n_bits<Torus>(streams, gpu_indexes, gpu_count, bits, lwe_array, bsks,
ksks, ms_noise_reduction_key,
extract_n_bits<Torus>(streams, bits, lwe_array, bsks, ksks,
ms_noise_reduction_key,
num_radix_blocks * bits_per_block, num_radix_blocks,
mem->bit_extract_luts);

@@ -80,10 +78,9 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
// Extracts bits and put them in the bit index 2 (=> bit number 3)
// so that it is already aligned to the correct position of the cmux input
// and we reduce noise growth
extract_n_bits<Torus>(streams, gpu_indexes, gpu_count, shift_bits, lwe_shift,
bsks, ksks, ms_noise_reduction_key,
max_num_bits_that_tell_shift, num_radix_blocks,
mem->bit_extract_luts_with_offset_2);
extract_n_bits<Torus>(streams, shift_bits, lwe_shift, bsks, ksks,
ms_noise_reduction_key, max_num_bits_that_tell_shift,
num_radix_blocks, mem->bit_extract_luts_with_offset_2);

// If signed, do an "arithmetic shift" by padding with the sign bit
CudaRadixCiphertextFFI last_bit;
@@ -97,58 +94,54 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
auto mux_lut = mem->mux_lut;
auto mux_inputs = mem->tmp_mux_inputs;

copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], input_bits_a,
bits);
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
input_bits_a, bits);
for (int d = 0; d < max_num_bits_that_tell_shift; d++) {
CudaRadixCiphertextFFI shift_bit;
as_radix_ciphertext_slice<Torus>(&shift_bit, shift_bits, d, d + 1);

copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], input_bits_b,
input_bits_a);
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
input_bits_b, input_bits_a);
auto rotations = 1 << d;
switch (mem->shift_type) {
case LEFT_SHIFT:
// rotate right as the blocks are from LSB to MSB
if (input_bits_b->num_radix_blocks != total_nb_bits)
PANIC("Cuda error: incorrect number of blocks")
host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count,
rotated_input, input_bits_b,
rotations, total_nb_bits);
host_radix_blocks_rotate_right<Torus>(
streams, rotated_input, input_bits_b, rotations, total_nb_bits);

set_zero_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
rotated_input, 0, rotations);
set_zero_radix_ciphertext_slice_async<Torus>(
streams.stream(0), streams.gpu_index(0), rotated_input, 0, rotations);
break;
case RIGHT_SHIFT:
// rotate left as the blocks are from LSB to MSB
if (input_bits_b->num_radix_blocks != total_nb_bits)
PANIC("Cuda error: incorrect number of blocks")
host_radix_blocks_rotate_left<Torus>(streams, gpu_indexes, gpu_count,
rotated_input, input_bits_b,
host_radix_blocks_rotate_left<Torus>(streams, rotated_input, input_bits_b,
rotations, total_nb_bits);

if (mem->is_signed)
for (int i = 0; i < rotations; i++) {
copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], rotated_input,
streams.stream(0), streams.gpu_index(0), rotated_input,
total_nb_bits - rotations + i, total_nb_bits - rotations + i + 1,
&last_bit, 0, 1);
}
else {
set_zero_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], rotated_input,
streams.stream(0), streams.gpu_index(0), rotated_input,
total_nb_bits - rotations, total_nb_bits);
}
break;
case LEFT_ROTATE:
// rotate right as the blocks are from LSB to MSB
host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count,
rotated_input, input_bits_b,
rotations, total_nb_bits);
host_radix_blocks_rotate_right<Torus>(
streams, rotated_input, input_bits_b, rotations, total_nb_bits);
break;
case RIGHT_ROTATE:
// rotate left as the blocks are from LSB to MSB
host_radix_blocks_rotate_left<Torus>(streams, gpu_indexes, gpu_count,
rotated_input, input_bits_b,
host_radix_blocks_rotate_left<Torus>(streams, rotated_input, input_bits_b,
rotations, total_nb_bits);
break;
default:
@@ -158,20 +151,20 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
// host_pack bits into one block so that we have
// control_bit|b|a
host_pack_bivariate_blocks<Torus>(
streams, gpu_indexes, gpu_count, mux_inputs, mux_lut->lwe_indexes_out,
rotated_input, input_bits_a, mux_lut->lwe_indexes_in, 2, total_nb_bits,
streams, mux_inputs, mux_lut->lwe_indexes_out, rotated_input,
input_bits_a, mux_lut->lwe_indexes_in, 2, total_nb_bits,
mem->params.message_modulus, mem->params.carry_modulus);

// The shift bit is already properly aligned/positioned
host_add_the_same_block_to_all_blocks<Torus>(
streams[0], gpu_indexes[0], mux_inputs, mux_inputs, &shift_bit,
mem->params.message_modulus, mem->params.carry_modulus);
streams.stream(0), streams.gpu_index(0), mux_inputs, mux_inputs,
&shift_bit, mem->params.message_modulus, mem->params.carry_modulus);

// we have
// control_bit|b|a
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, input_bits_a, mux_inputs, bsks, ksks,
ms_noise_reduction_key, mux_lut, total_nb_bits);
streams, input_bits_a, mux_inputs, bsks, ksks, ms_noise_reduction_key,
mux_lut, total_nb_bits);
}

// Initializes the output
@@ -179,15 +172,15 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
for (int i = 0; i < num_radix_blocks; i++) {
auto last_bit_index = (bits_per_block - 1) + i * bits_per_block;
copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], lwe_array, i, i + 1, input_bits_a,
last_bit_index, last_bit_index + 1);
streams.stream(0), streams.gpu_index(0), lwe_array, i, i + 1,
input_bits_a, last_bit_index, last_bit_index + 1);
}

// Bitshift and add the other bits
for (int i = bits_per_block - 2; i >= 0; i--) {
host_integer_small_scalar_mul_radix<Torus>(
streams, gpu_indexes, gpu_count, lwe_array, lwe_array, 2,
mem->params.message_modulus, mem->params.carry_modulus);
host_integer_small_scalar_mul_radix<Torus>(streams, lwe_array, lwe_array, 2,
mem->params.message_modulus,
mem->params.carry_modulus);
for (int j = 0; j < num_radix_blocks; j++) {
CudaRadixCiphertextFFI block;
CudaRadixCiphertextFFI bit_to_add;
@@ -195,16 +188,16 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
as_radix_ciphertext_slice<Torus>(&bit_to_add, input_bits_a,
i + j * bits_per_block,
i + j * bits_per_block + 1);
host_addition<Torus>(streams[0], gpu_indexes[0], &block, &block,
&bit_to_add, 1, mem->params.message_modulus,
host_addition<Torus>(streams.stream(0), streams.gpu_index(0), &block,
&block, &bit_to_add, 1, mem->params.message_modulus,
mem->params.carry_modulus);
}

// To give back a clean ciphertext
auto cleaning_lut = mem->cleaning_lut;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array, lwe_array, bsks, ksks,
ms_noise_reduction_key, cleaning_lut, num_radix_blocks);
streams, lwe_array, lwe_array, bsks, ksks, ms_noise_reduction_key,
cleaning_lut, num_radix_blocks);
}
}
#endif
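
The loop over d above is an encrypted barrel shifter: stage d moves the data
by 2^d positions and a mux driven by the d-th bit of the encrypted shift
amount selects between the moved and unmoved version, so any shift amount
costs only max_num_bits_that_tell_shift mux layers. The cleartext skeleton it
mirrors (assumes num_shift_bits <= 5 so every shift stays below 32):

// Each stage conditionally applies a shift by 2^d; the encrypted version
// replaces the ternary with a PBS-evaluated mux on packed control_bit|b|a
// blocks.
uint32_t barrel_shift_left(uint32_t x, uint32_t shift_amount,
                           uint32_t num_shift_bits) {
  for (uint32_t d = 0; d < num_shift_bits; d++) {
    uint32_t moved = x << (1u << d);
    x = ((shift_amount >> d) & 1) ? moved : x;
  }
  return x;
}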
|
||||
|
||||
@@ -1,13 +1,13 @@
|
||||
#include "subtraction.cuh"
|
||||
|
||||
uint64_t scratch_cuda_sub_and_propagate_single_carry_kb_64_inplace(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t requested_flag,
|
||||
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
|
||||
PBS_TYPE pbs_type, uint32_t requested_flag, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type) {
|
||||
|
||||
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
|
||||
big_lwe_dimension, small_lwe_dimension, ks_level,
|
||||
@@ -15,35 +15,32 @@ uint64_t scratch_cuda_sub_and_propagate_single_carry_kb_64_inplace(
|
||||
message_modulus, carry_modulus, noise_reduction_type);
|
||||
|
||||
return scratch_cuda_sub_and_propagate_single_carry<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
(int_sub_and_propagate<uint64_t> **)mem_ptr, num_blocks, params,
|
||||
requested_flag, allocate_gpu_memory);
|
||||
CudaStreams(streams), (int_sub_and_propagate<uint64_t> **)mem_ptr,
|
||||
num_blocks, params, requested_flag, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void cuda_sub_and_propagate_single_carry_kb_64_inplace(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *lhs_array, const CudaRadixCiphertextFFI *rhs_array,
|
||||
CudaRadixCiphertextFFI *carry_out, const CudaRadixCiphertextFFI *carry_in,
|
||||
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lhs_array,
|
||||
const CudaRadixCiphertextFFI *rhs_array, CudaRadixCiphertextFFI *carry_out,
|
||||
const CudaRadixCiphertextFFI *carry_in, int8_t *mem_ptr, void *const *bsks,
|
||||
void *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t requested_flag, uint32_t uses_carry) {
|
||||
PUSH_RANGE("sub")
|
||||
host_sub_and_propagate_single_carry<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lhs_array, rhs_array,
|
||||
carry_out, carry_in, (int_sub_and_propagate<uint64_t> *)mem_ptr, bsks,
|
||||
(uint64_t **)(ksks), ms_noise_reduction_key, requested_flag, uses_carry);
|
||||
CudaStreams(streams), lhs_array, rhs_array, carry_out, carry_in,
|
||||
(int_sub_and_propagate<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
|
||||
ms_noise_reduction_key, requested_flag, uses_carry);
|
||||
POP_RANGE()
|
||||
}
|
||||
|
||||
void cleanup_cuda_sub_and_propagate_single_carry(void *const *streams,
|
||||
uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count,
|
||||
void cleanup_cuda_sub_and_propagate_single_carry(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void) {
|
||||
PUSH_RANGE("cleanup sub")
|
||||
int_sub_and_propagate<uint64_t> *mem_ptr =
|
||||
(int_sub_and_propagate<uint64_t> *)(*mem_ptr_void);
|
||||
|
||||
mem_ptr->release((cudaStream_t *)streams, gpu_indexes, gpu_count);
|
||||
mem_ptr->release(CudaStreams(streams));
|
||||
POP_RANGE()
|
||||
delete mem_ptr;
|
||||
*mem_ptr_void = nullptr;

@@ -14,24 +14,22 @@

template <typename Torus>
uint64_t scratch_cuda_sub_and_propagate_single_carry(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, int_sub_and_propagate<Torus> **mem_ptr,
    CudaStreams streams, int_sub_and_propagate<Torus> **mem_ptr,
    uint32_t num_radix_blocks, int_radix_params params, uint32_t requested_flag,
    bool allocate_gpu_memory) {
  PUSH_RANGE("scratch sub")
  uint64_t size_tracker = 0;

  *mem_ptr = new int_sub_and_propagate<Torus>(
      streams, gpu_indexes, gpu_count, params, num_radix_blocks, requested_flag,
      allocate_gpu_memory, size_tracker);
      streams, params, num_radix_blocks, requested_flag, allocate_gpu_memory,
      size_tracker);
  POP_RANGE()
  return size_tracker;
}

template <typename Torus>
void host_sub_and_propagate_single_carry(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, CudaRadixCiphertextFFI *lhs_array,
    CudaStreams streams, CudaRadixCiphertextFFI *lhs_array,
    const CudaRadixCiphertextFFI *rhs_array, CudaRadixCiphertextFFI *carry_out,
    const CudaRadixCiphertextFFI *input_carries,
    int_sub_and_propagate<Torus> *mem, void *const *bsks, Torus *const *ksks,
@@ -39,24 +37,22 @@ void host_sub_and_propagate_single_carry(
    uint32_t requested_flag, uint32_t uses_carry) {

  host_integer_radix_negation<Torus>(
      streams, gpu_indexes, gpu_count, mem->neg_rhs_array, rhs_array,
      mem->params.message_modulus, mem->params.carry_modulus,
      mem->neg_rhs_array->num_radix_blocks);
      streams, mem->neg_rhs_array, rhs_array, mem->params.message_modulus,
      mem->params.carry_modulus, mem->neg_rhs_array->num_radix_blocks);

  host_add_and_propagate_single_carry<Torus>(
      streams, gpu_indexes, gpu_count, lhs_array, mem->neg_rhs_array, carry_out,
      input_carries, mem->sc_prop_mem, bsks, ksks, ms_noise_reduction_key,
      requested_flag, uses_carry);
      streams, lhs_array, mem->neg_rhs_array, carry_out, input_carries,
      mem->sc_prop_mem, bsks, ksks, ms_noise_reduction_key, requested_flag,
      uses_carry);
}
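host_sub_and_propagate_single_carry reduces subtraction to negation followed by addition with carry propagation: lhs - rhs == lhs + (-rhs) modulo the radix range. A plain-integer analogue of that reduction (the real blocks are LWE ciphertexts; the numbers are only for illustration):

#include <cstdint>
#include <cstdio>

int main() {
  // With n radix blocks of message modulus B, values live modulo M = B^n.
  const uint64_t B = 4, n = 4;
  uint64_t M = 1;
  for (uint64_t k = 0; k < n; k++)
    M *= B; // M = 4^4 = 256
  uint64_t lhs = 200, rhs = 57;
  uint64_t neg_rhs = (M - rhs) % M;    // negation step
  uint64_t diff = (lhs + neg_rhs) % M; // addition + carry propagation
  std::printf("%llu\n", (unsigned long long)diff); // prints 143 == 200 - 57
}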

template <typename Torus>
__host__ void host_integer_radix_subtraction(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
    CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
    CudaRadixCiphertextFFI const *lwe_array_in_1,
    CudaRadixCiphertextFFI const *lwe_array_in_2, uint64_t message_modulus,
    uint64_t carry_modulus, uint32_t num_radix_blocks) {
  cuda_set_device(gpu_indexes[0]);
  cuda_set_device(streams.gpu_index(0));

  if (lwe_array_out->num_radix_blocks < num_radix_blocks ||
      lwe_array_in_1->num_radix_blocks < num_radix_blocks ||
@@ -69,11 +65,11 @@ __host__ void host_integer_radix_subtraction(
    PANIC("Cuda error: lwe_array_in and lwe_array_out lwe_dimension must be "
          "the same")

  host_integer_radix_negation<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in_2,
      message_modulus, carry_modulus, num_radix_blocks);
  host_addition<Torus>(streams[0], gpu_indexes[0], lwe_array_out, lwe_array_out,
                       lwe_array_in_1, num_radix_blocks, message_modulus,
                       carry_modulus);
  host_integer_radix_negation<Torus>(streams, lwe_array_out, lwe_array_in_2,
                                     message_modulus, carry_modulus,
                                     num_radix_blocks);
  host_addition<Torus>(streams.stream(0), streams.gpu_index(0), lwe_array_out,
                       lwe_array_out, lwe_array_in_1, num_radix_blocks,
                       message_modulus, carry_modulus);
}
#endif

@@ -205,8 +205,7 @@ __device__ void mul_ggsw_glwe_in_fourier_domain_2_2_params(

template <typename InputTorus, typename OutputTorus>
void execute_pbs_async(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, const LweArrayVariant<OutputTorus> &lwe_array_out,
    CudaStreams streams, const LweArrayVariant<OutputTorus> &lwe_array_out,
    const LweArrayVariant<InputTorus> &lwe_output_indexes,
    const std::vector<OutputTorus *> lut_vec,
    const std::vector<InputTorus *> lut_indexes_vec,
@@ -226,12 +225,12 @@ void execute_pbs_async(
  case MULTI_BIT:
    PANIC("Error: 32-bit multibit PBS is not supported.\n")
  case CLASSICAL:
    for (uint i = 0; i < gpu_count; i++) {
      int num_inputs_on_gpu =
          get_num_inputs_on_gpu(input_lwe_ciphertext_count, i, gpu_count);
    for (uint i = 0; i < streams.count(); i++) {
      int num_inputs_on_gpu = get_num_inputs_on_gpu(
          input_lwe_ciphertext_count, i, streams.count());

      int gpu_offset =
          get_gpu_offset(input_lwe_ciphertext_count, i, gpu_count);
          get_gpu_offset(input_lwe_ciphertext_count, i, streams.count());
      auto d_lut_vector_indexes =
          lut_indexes_vec[i] + (ptrdiff_t)(gpu_offset);

@@ -246,7 +245,7 @@ void execute_pbs_async(
          get_variant_element(lwe_input_indexes, i);

      cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
          streams[i], gpu_indexes[i], current_lwe_array_out,
          streams.stream(i), streams.gpu_index(i), current_lwe_array_out,
          current_lwe_output_indexes, lut_vec[i], d_lut_vector_indexes,
          current_lwe_array_in, current_lwe_input_indexes,
          bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension,
@@ -263,9 +262,9 @@ void execute_pbs_async(
  case MULTI_BIT:
    if (grouping_factor == 0)
      PANIC("Multi-bit PBS error: grouping factor should be > 0.")
    for (uint i = 0; i < gpu_count; i++) {
      int num_inputs_on_gpu =
          get_num_inputs_on_gpu(input_lwe_ciphertext_count, i, gpu_count);
    for (uint i = 0; i < streams.count(); i++) {
      int num_inputs_on_gpu = get_num_inputs_on_gpu(
          input_lwe_ciphertext_count, i, streams.count());

      // Use the macro to get the correct elements for the current iteration
      // Handles the case when the input/output are scattered through
@@ -278,12 +277,12 @@ void execute_pbs_async(
          get_variant_element(lwe_input_indexes, i);

      int gpu_offset =
          get_gpu_offset(input_lwe_ciphertext_count, i, gpu_count);
          get_gpu_offset(input_lwe_ciphertext_count, i, streams.count());
      auto d_lut_vector_indexes =
          lut_indexes_vec[i] + (ptrdiff_t)(gpu_offset);

      cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
          streams[i], gpu_indexes[i], current_lwe_array_out,
          streams.stream(i), streams.gpu_index(i), current_lwe_array_out,
          current_lwe_output_indexes, lut_vec[i], d_lut_vector_indexes,
          current_lwe_array_in, current_lwe_input_indexes,
          bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension,
@@ -292,9 +291,9 @@ void execute_pbs_async(
    }
    break;
  case CLASSICAL:
    for (uint i = 0; i < gpu_count; i++) {
      int num_inputs_on_gpu =
          get_num_inputs_on_gpu(input_lwe_ciphertext_count, i, gpu_count);
    for (uint i = 0; i < streams.count(); i++) {
      int num_inputs_on_gpu = get_num_inputs_on_gpu(
          input_lwe_ciphertext_count, i, streams.count());

      // Use the macro to get the correct elements for the current iteration
      // Handles the case when the input/output are scattered through
@@ -307,7 +306,7 @@ void execute_pbs_async(
          get_variant_element(lwe_input_indexes, i);

      int gpu_offset =
          get_gpu_offset(input_lwe_ciphertext_count, i, gpu_count);
          get_gpu_offset(input_lwe_ciphertext_count, i, streams.count());
      auto d_lut_vector_indexes =
          lut_indexes_vec[i] + (ptrdiff_t)(gpu_offset);

@@ -316,7 +315,7 @@ void execute_pbs_async(
          ms_noise_reduction_key->ptr != nullptr)
        zeros = ms_noise_reduction_key->ptr[i];
      cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
          streams[i], gpu_indexes[i], current_lwe_array_out,
          streams.stream(i), streams.gpu_index(i), current_lwe_array_out,
          current_lwe_output_indexes, lut_vec[i], d_lut_vector_indexes,
          current_lwe_array_in, current_lwe_input_indexes,
          bootstrapping_keys[i], ms_noise_reduction_key, zeros, pbs_buffer[i],
@@ -333,9 +332,9 @@ void execute_pbs_async(
  case MULTI_BIT:
    if (grouping_factor == 0)
      PANIC("Multi-bit PBS error: grouping factor should be > 0.")
    for (uint i = 0; i < gpu_count; i++) {
      int num_inputs_on_gpu =
          get_num_inputs_on_gpu(input_lwe_ciphertext_count, i, gpu_count);
    for (uint i = 0; i < streams.count(); i++) {
      int num_inputs_on_gpu = get_num_inputs_on_gpu(
          input_lwe_ciphertext_count, i, streams.count());

      // Use the macro to get the correct elements for the current iteration
      // Handles the case when the input/output are scattered through
@@ -348,7 +347,7 @@ void execute_pbs_async(
          get_variant_element(lwe_input_indexes, i);

      cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
          streams[i], gpu_indexes[i], current_lwe_array_out,
          streams.stream(i), streams.gpu_index(i), current_lwe_array_out,
          current_lwe_output_indexes, lut_vec[i], current_lwe_array_in,
          current_lwe_input_indexes, bootstrapping_keys[i], pbs_buffer[i],
          lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
@@ -356,9 +355,9 @@ void execute_pbs_async(
    }
    break;
  case CLASSICAL:
    for (uint i = 0; i < gpu_count; i++) {
      int num_inputs_on_gpu =
          get_num_inputs_on_gpu(input_lwe_ciphertext_count, i, gpu_count);
    for (uint i = 0; i < streams.count(); i++) {
      int num_inputs_on_gpu = get_num_inputs_on_gpu(
          input_lwe_ciphertext_count, i, streams.count());

      // Use the macro to get the correct elements for the current iteration
      // Handles the case when the input/output are scattered through
@@ -371,7 +370,7 @@ void execute_pbs_async(
          get_variant_element(lwe_input_indexes, i);

      int gpu_offset =
          get_gpu_offset(input_lwe_ciphertext_count, i, gpu_count);
          get_gpu_offset(input_lwe_ciphertext_count, i, streams.count());
      auto d_lut_vector_indexes =
          lut_indexes_vec[i] + (ptrdiff_t)(gpu_offset);

@@ -380,10 +379,11 @@ void execute_pbs_async(
          ms_noise_reduction_key->ptr != nullptr)
        zeros = ms_noise_reduction_key->ptr[i];
      cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
          streams[i], gpu_indexes[i], current_lwe_array_out, lut_vec[i],
          current_lwe_array_in, bootstrapping_keys[i], ms_noise_reduction_key,
          zeros, pbs_buffer[i], lwe_dimension, glwe_dimension,
          polynomial_size, base_log, level_count, num_inputs_on_gpu);
          streams.stream(i), streams.gpu_index(i), current_lwe_array_out,
          lut_vec[i], current_lwe_array_in, bootstrapping_keys[i],
          ms_noise_reduction_key, zeros, pbs_buffer[i], lwe_dimension,
          glwe_dimension, polynomial_size, base_log, level_count,
          num_inputs_on_gpu);
    }
    break;
  default:

@@ -39,10 +39,11 @@ int32_t cuda_setup_multi_gpu(int device_0_id) {
  return (int32_t)(num_used_gpus);
}

int get_active_gpu_count(int num_inputs, int gpu_count) {
  int ceil_div_inputs =
      std::max(1, (num_inputs + THRESHOLD_MULTI_GPU - 1) / THRESHOLD_MULTI_GPU);
  int active_gpu_count = std::min(ceil_div_inputs, gpu_count);
uint32_t get_active_gpu_count(uint32_t num_inputs, uint32_t gpu_count) {
  uint32_t ceil_div_inputs =
      std::max((uint32_t)1,
               (num_inputs + THRESHOLD_MULTI_GPU - 1) / THRESHOLD_MULTI_GPU);
  uint32_t active_gpu_count = std::min(ceil_div_inputs, gpu_count);
  return active_gpu_count;
}
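The change above is a signedness clean-up from int to uint32_t; the policy is unchanged: one GPU per THRESHOLD_MULTI_GPU inputs, rounded up, capped by the available GPU count. A worked example with a hypothetical threshold of 128 (the real constant is defined elsewhere in the backend):

#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t THRESHOLD_MULTI_GPU = 128; // assumed value, illustration only
  const uint32_t num_inputs = 300, gpu_count = 8;
  uint32_t ceil_div_inputs =
      std::max((uint32_t)1,
               (num_inputs + THRESHOLD_MULTI_GPU - 1) / THRESHOLD_MULTI_GPU);
  // ceil(300 / 128) = 3, so only 3 of the 8 GPUs are considered active
  std::printf("%u\n", std::min(ceil_div_inputs, gpu_count));
}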


@@ -5,19 +5,18 @@

/// Initialize same-size arrays on all active gpus
template <typename Torus>
void multi_gpu_alloc_array_async(cudaStream_t const *streams,
                                 uint32_t const *gpu_indexes,
                                 uint32_t gpu_count, std::vector<Torus *> &dest,
void multi_gpu_alloc_array_async(CudaStreams streams,
                                 std::vector<Torus *> &dest,
                                 uint32_t elements_per_gpu,
                                 uint64_t &size_tracker_on_gpu_0,
                                 bool allocate_gpu_memory) {

  dest.resize(gpu_count);
  for (uint i = 0; i < gpu_count; i++) {
  dest.resize(streams.count());
  for (uint i = 0; i < streams.count(); i++) {
    uint64_t size_tracker_on_gpu_i = 0;
    Torus *d_array = (Torus *)cuda_malloc_with_size_tracking_async(
        elements_per_gpu * sizeof(Torus), streams[i], gpu_indexes[i],
        size_tracker_on_gpu_i, allocate_gpu_memory);
        elements_per_gpu * sizeof(Torus), streams.stream(i),
        streams.gpu_index(i), size_tracker_on_gpu_i, allocate_gpu_memory);
    dest[i] = d_array;
    if (i == 0) {
      size_tracker_on_gpu_0 += size_tracker_on_gpu_i;
@@ -26,49 +25,46 @@ void multi_gpu_alloc_array_async(cudaStream_t const *streams,
}
/// Copy an array residing on one GPU to all active gpus
template <typename Torus>
void multi_gpu_copy_array_async(cudaStream_t const *streams,
                                uint32_t const *gpu_indexes, uint32_t gpu_count,
                                std::vector<Torus *> &dest, Torus const *src,
                                uint32_t elements_per_gpu,
void multi_gpu_copy_array_async(CudaStreams streams, std::vector<Torus *> &dest,
                                Torus const *src, uint32_t elements_per_gpu,
                                bool gpu_memory_allocated) {
  dest.resize(gpu_count);
  for (uint i = 0; i < gpu_count; i++) {
  dest.resize(streams.count());
  for (uint i = 0; i < streams.count(); i++) {
    cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
        dest[i], src, elements_per_gpu * sizeof(Torus), streams[i],
        gpu_indexes[i], gpu_memory_allocated);
        dest[i], src, elements_per_gpu * sizeof(Torus), streams.stream(i),
        streams.gpu_index(i), gpu_memory_allocated);
  }
}
/// Copy an array residing on one CPU to all active gpus
template <typename Torus>
void multi_gpu_copy_array_from_cpu_async(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, std::vector<Torus *> &dest, Torus const *h_src,
    uint32_t elements_per_gpu, bool gpu_memory_allocated) {
  dest.resize(gpu_count);
  for (uint i = 0; i < gpu_count; i++) {
void multi_gpu_copy_array_from_cpu_async(CudaStreams streams,
                                         std::vector<Torus *> &dest,
                                         Torus const *h_src,
                                         uint32_t elements_per_gpu,
                                         bool gpu_memory_allocated) {
  for (uint i = 0; i < streams.count(); i++) {
    cuda_memcpy_with_size_tracking_async_to_gpu(
        dest[i], h_src, elements_per_gpu * sizeof(Torus), streams[i],
        gpu_indexes[i], gpu_memory_allocated);
        dest[i], h_src, elements_per_gpu * sizeof(Torus), streams.stream(i),
        streams.gpu_index(i), gpu_memory_allocated);
  }
}
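multi_gpu_copy_array_from_cpu_async is a one-to-many host broadcast: the same host buffer is enqueued once per device, each copy on that device's own stream. A raw-CUDA analogue of the pattern (a sketch; the backend wrappers add size tracking and allocation flags on top):

#include <cuda_runtime.h>
#include <vector>

// Broadcast n elements from host memory h_src to one buffer per device.
template <typename T>
void broadcast_to_gpus(const std::vector<cudaStream_t> &streams,
                       const std::vector<int> &gpu_indexes,
                       std::vector<T *> &dest, const T *h_src, size_t n) {
  for (size_t i = 0; i < gpu_indexes.size(); i++) {
    cudaSetDevice(gpu_indexes[i]);
    // Truly asynchronous only if h_src is page-locked; otherwise the copy
    // degrades to a synchronous one.
    cudaMemcpyAsync(dest[i], h_src, n * sizeof(T), cudaMemcpyHostToDevice,
                    streams[i]);
  }
}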
/// Allocates the input/output vector for all devices
/// Initializes also the related indexing and initializes it to the trivial
/// index
template <typename Torus>
void multi_gpu_alloc_lwe_async(cudaStream_t const *streams,
                               uint32_t const *gpu_indexes, uint32_t gpu_count,
                               std::vector<Torus *> &dest, uint32_t num_inputs,
                               uint32_t lwe_size,
void multi_gpu_alloc_lwe_async(CudaStreams streams, std::vector<Torus *> &dest,
                               uint32_t num_inputs, uint32_t lwe_size,
                               uint64_t &size_tracker_on_gpu_0,
                               bool allocate_gpu_memory) {
  dest.resize(gpu_count);
  for (uint i = 0; i < gpu_count; i++) {
  dest.resize(streams.count());
  for (uint i = 0; i < streams.count(); i++) {
    uint64_t size_tracker_on_gpu_i = 0;
    auto inputs_on_gpu = std::max(
        THRESHOLD_MULTI_GPU, get_num_inputs_on_gpu(num_inputs, i, gpu_count));
    auto inputs_on_gpu =
        std::max(THRESHOLD_MULTI_GPU,
                 get_num_inputs_on_gpu(num_inputs, i, streams.count()));
    Torus *d_array = (Torus *)cuda_malloc_with_size_tracking_async(
        inputs_on_gpu * lwe_size * sizeof(Torus), streams[i], gpu_indexes[i],
        size_tracker_on_gpu_i, allocate_gpu_memory);
        inputs_on_gpu * lwe_size * sizeof(Torus), streams.stream(i),
        streams.gpu_index(i), size_tracker_on_gpu_i, allocate_gpu_memory);
    dest[i] = d_array;
    if (i == 0) {
      size_tracker_on_gpu_0 += size_tracker_on_gpu_i;
@@ -77,8 +73,7 @@ void multi_gpu_alloc_lwe_async(cudaStream_t const *streams,
}

template void multi_gpu_alloc_lwe_async<__uint128_t>(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, std::vector<__uint128_t *> &dest, uint32_t num_inputs,
    CudaStreams streams, std::vector<__uint128_t *> &dest, uint32_t num_inputs,
    uint32_t lwe_size, uint64_t &size_tracker_on_gpu_0,
    bool allocate_gpu_memory);
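The template void multi_gpu_alloc_lwe_async<__uint128_t>(...) line above is an explicit instantiation: the template body is compiled in this translation unit, so the __uint128_t variant has to be materialized here for other objects to link against it. The general pattern in miniature:

// lib.cu -- the definition is visible only in this translation unit
template <typename T> void f(T x) { (void)x; }
template void f<__uint128_t>(__uint128_t); // force-emit this specialization

// user.cpp -- sees only a declaration and links against the emitted symbol:
// template <typename T> void f(T x);
// f(__uint128_t{1});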

@@ -87,18 +82,20 @@ template void multi_gpu_alloc_lwe_async<__uint128_t>(
/// index
template <typename Torus>
void multi_gpu_alloc_lwe_many_lut_output_async(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, std::vector<Torus *> &dest, uint32_t num_inputs,
    CudaStreams streams, std::vector<Torus *> &dest, uint32_t num_inputs,
    uint32_t num_many_lut, uint32_t lwe_size, uint64_t &size_tracker_on_gpu_0,
    bool allocate_gpu_memory) {
  dest.resize(gpu_count);
  for (uint i = 0; i < gpu_count; i++) {

  dest.resize(streams.count());
  for (uint i = 0; i < streams.count(); i++) {
    uint64_t size_tracker = 0;
    auto inputs_on_gpu = std::max(
        THRESHOLD_MULTI_GPU, get_num_inputs_on_gpu(num_inputs, i, gpu_count));
    auto inputs_on_gpu =
        std::max(THRESHOLD_MULTI_GPU,
                 get_num_inputs_on_gpu(num_inputs, i, streams.count()));
    Torus *d_array = (Torus *)cuda_malloc_with_size_tracking_async(
        num_many_lut * inputs_on_gpu * lwe_size * sizeof(Torus), streams[i],
        gpu_indexes[i], size_tracker, allocate_gpu_memory);
        num_many_lut * inputs_on_gpu * lwe_size * sizeof(Torus),
        streams.stream(i), streams.gpu_index(i), size_tracker,
        allocate_gpu_memory);
    dest[i] = d_array;
    if (i == 0) {
      size_tracker_on_gpu_0 += size_tracker;
@@ -141,32 +138,30 @@ __global__ void realign_with_indexes(Torus *d_vector,
/// The output indexing is always the trivial one
/// num_inputs: total num of lwe in src
template <typename Torus>
void multi_gpu_scatter_lwe_async(cudaStream_t const *streams,
                                 uint32_t const *gpu_indexes,
                                 uint32_t gpu_count, std::vector<Torus *> &dest,
                                 Torus const *src, Torus const *d_src_indexes,
void multi_gpu_scatter_lwe_async(CudaStreams streams,
                                 std::vector<Torus *> &dest, Torus const *src,
                                 Torus const *d_src_indexes,
                                 bool is_trivial_index,
                                 std::vector<Torus *> &aligned_vec,
                                 uint32_t max_active_gpu_count,
                                 uint32_t num_inputs, uint32_t lwe_size) {

  if (max_active_gpu_count < gpu_count)
  if (max_active_gpu_count < streams.count())
    PANIC("Cuda error: number of gpus in scatter should be <= number of gpus "
          "used to create the lut")
  dest.resize(gpu_count);
  for (uint i = 0; i < gpu_count; i++) {
    auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count);
  for (uint i = 0; i < streams.count(); i++) {
    auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, streams.count());
    auto gpu_offset = 0;
    for (uint j = 0; j < i; j++) {
      gpu_offset += get_num_inputs_on_gpu(num_inputs, j, gpu_count);
      gpu_offset += get_num_inputs_on_gpu(num_inputs, j, streams.count());
    }

    if (is_trivial_index) {
      auto d_dest = dest[i];
      auto d_src = src + gpu_offset * lwe_size;
      cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
          d_dest, d_src, inputs_on_gpu * lwe_size * sizeof(Torus), streams[i],
          gpu_indexes[i], true);
          d_dest, d_src, inputs_on_gpu * lwe_size * sizeof(Torus),
          streams.stream(i), streams.gpu_index(i), true);

    } else {
      if (aligned_vec.size() == 0)
@@ -175,22 +170,24 @@ void multi_gpu_scatter_lwe_async(cudaStream_t const *streams,
      if (d_src_indexes == nullptr)
        PANIC("Cuda error: source indexes should be initialized!");

      cudaEvent_t temp_event2 = cuda_create_event(gpu_indexes[0]);
      cuda_set_device(gpu_indexes[0]);
      align_with_indexes<Torus><<<inputs_on_gpu, 1024, 0, streams[0]>>>(
      cudaEvent_t temp_event2 = cuda_create_event(streams.gpu_index(0));
      cuda_set_device(streams.gpu_index(0));
      align_with_indexes<Torus><<<inputs_on_gpu, 1024, 0, streams.stream(0)>>>(
          aligned_vec[i], (Torus *)src, (Torus *)d_src_indexes + gpu_offset,
          lwe_size);
      check_cuda_error(cudaGetLastError());
      cuda_event_record(temp_event2, streams[0], gpu_indexes[0]);
      cuda_stream_wait_event(streams[i], temp_event2, gpu_indexes[i]);
      cuda_event_record(temp_event2, streams.stream(0), streams.gpu_index(0));
      cuda_stream_wait_event(streams.stream(i), temp_event2,
                             streams.gpu_index(i));

      cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
          dest[i], aligned_vec[i], inputs_on_gpu * lwe_size * sizeof(Torus),
          streams[i], gpu_indexes[i], true);
          streams.stream(i), streams.gpu_index(i), true);

      cudaEvent_t temp_event = cuda_create_event(gpu_indexes[i]);
      cuda_event_record(temp_event, streams[i], gpu_indexes[i]);
      cuda_stream_wait_event(streams[0], temp_event, gpu_indexes[0]);
      cudaEvent_t temp_event = cuda_create_event(streams.gpu_index(i));
      cuda_event_record(temp_event, streams.stream(i), streams.gpu_index(i));
      cuda_stream_wait_event(streams.stream(0), temp_event,
                             streams.gpu_index(0));
    }
  }
}
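Both the scatter above and the gather below derive each GPU's slice from get_num_inputs_on_gpu plus a prefix-sum offset, exactly as in the inner j-loop. A standalone illustration of that partition arithmetic (the even-split body is an assumption for the sketch; the real helper may balance differently):

#include <cstdio>

// Assumed semantics: split as evenly as possible, earlier GPUs take extras.
int inputs_on_gpu(int total, int i, int gpu_count) {
  return total / gpu_count + (i < total % gpu_count ? 1 : 0);
}

int gpu_offset(int total, int i, int gpu_count) {
  int off = 0;
  for (int j = 0; j < i; j++) // same prefix-sum loop as in the code above
    off += inputs_on_gpu(total, j, gpu_count);
  return off;
}

int main() {
  // 10 inputs over 3 GPUs -> counts 4,3,3 at offsets 0,4,7
  for (int i = 0; i < 3; i++)
    std::printf("gpu %d: count=%d offset=%d\n", i, inputs_on_gpu(10, i, 3),
                gpu_offset(10, i, 3));
}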
@@ -199,18 +196,17 @@ void multi_gpu_scatter_lwe_async(cudaStream_t const *streams,
/// dest_indexes
/// The input indexing should be the trivial one
template <typename Torus>
void multi_gpu_gather_lwe_async(cudaStream_t const *streams,
                                uint32_t const *gpu_indexes, uint32_t gpu_count,
                                Torus *dest, const std::vector<Torus *> &src,
void multi_gpu_gather_lwe_async(CudaStreams streams, Torus *dest,
                                const std::vector<Torus *> &src,
                                Torus *d_dest_indexes, bool is_trivial_index,
                                std::vector<Torus *> &aligned_vec,
                                uint32_t num_inputs, uint32_t lwe_size) {

  for (uint i = 0; i < gpu_count; i++) {
    auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count);
  for (uint i = 0; i < streams.count(); i++) {
    auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, streams.count());
    auto gpu_offset = 0;
    for (uint j = 0; j < i; j++) {
      gpu_offset += get_num_inputs_on_gpu(num_inputs, j, gpu_count);
      gpu_offset += get_num_inputs_on_gpu(num_inputs, j, streams.count());
    }

    if (is_trivial_index) {
@@ -218,29 +214,33 @@ void multi_gpu_gather_lwe_async(cudaStream_t const *streams,
      auto d_src = src[i];

      cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
          d_dest, d_src, inputs_on_gpu * lwe_size * sizeof(Torus), streams[i],
          gpu_indexes[i], true);
          d_dest, d_src, inputs_on_gpu * lwe_size * sizeof(Torus),
          streams.stream(i), streams.gpu_index(i), true);
    } else {
      if (aligned_vec.size() == 0)
        PANIC("Cuda error: auxiliary arrays should be setup!");
      if (d_dest_indexes == nullptr)
        PANIC("Cuda error: destination indexes should be initialized!");

      cudaEvent_t temp_event2 = cuda_create_event(gpu_indexes[0]);
      cudaEvent_t temp_event2 = cuda_create_event(streams.gpu_index(0));

      cuda_event_record(temp_event2, streams[0], gpu_indexes[0]);
      cuda_stream_wait_event(streams[i], temp_event2, gpu_indexes[i]);
      cuda_event_record(temp_event2, streams.stream(0), streams.gpu_index(0));
      cuda_stream_wait_event(streams.stream(i), temp_event2,
                             streams.gpu_index(i));

      cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
          aligned_vec[i], src[i], inputs_on_gpu * lwe_size * sizeof(Torus),
          streams[i], gpu_indexes[i], true);
          streams.stream(i), streams.gpu_index(i), true);

      cudaEvent_t temp_event3 = cuda_create_event(gpu_indexes[i]);
      cuda_event_record(temp_event3, streams[i], gpu_indexes[i]);
      cuda_stream_wait_event(streams[0], temp_event3, gpu_indexes[0]);
      cuda_set_device(gpu_indexes[0]);
      realign_with_indexes<Torus><<<inputs_on_gpu, 1024, 0, streams[0]>>>(
          dest, aligned_vec[i], (Torus *)d_dest_indexes + gpu_offset, lwe_size);
      cudaEvent_t temp_event3 = cuda_create_event(streams.gpu_index(i));
      cuda_event_record(temp_event3, streams.stream(i), streams.gpu_index(i));
      cuda_stream_wait_event(streams.stream(0), temp_event3,
                             streams.gpu_index(0));
      cuda_set_device(streams.gpu_index(0));
      realign_with_indexes<Torus>
          <<<inputs_on_gpu, 1024, 0, streams.stream(0)>>>(
              dest, aligned_vec[i], (Torus *)d_dest_indexes + gpu_offset,
              lwe_size);
      check_cuda_error(cudaGetLastError());
    }
  }
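The non-trivial-index branches synchronize with short-lived events: a kernel on GPU 0's stream produces realigned data, stream i waits on an event before copying it, and a second event makes stream 0 wait before touching the next chunk. A raw-CUDA sketch of that fork/join idiom (kernels elided; the calls shown are the standard runtime API):

#include <cuda_runtime.h>

// Producer work on (gpu0, s0) must complete before consumer work on
// (gpu_i, si); neither wait blocks the host.
void fork_join(cudaStream_t s0, int gpu0, cudaStream_t si, int gpu_i) {
  cudaEvent_t ev;
  cudaSetDevice(gpu0);
  cudaEventCreateWithFlags(&ev, cudaEventDisableTiming);
  // ... enqueue producer kernel/copy on s0 here ...
  cudaEventRecord(ev, s0);        // mark the producer's position in s0
  cudaSetDevice(gpu_i);
  cudaStreamWaitEvent(si, ev, 0); // si stalls on-device until ev completes
  // ... enqueue consumer kernel/copy on si here ...
  cudaEventDestroy(ev); // safe once the record/wait are already enqueued
}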
@@ -250,18 +250,20 @@ void multi_gpu_gather_lwe_async(cudaStream_t const *streams,
/// dest_indexes
/// The input indexing should be the trivial one
template <typename Torus>
void multi_gpu_gather_many_lut_lwe_async(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, Torus *dest, const std::vector<Torus *> &src,
    Torus *h_dest_indexes, bool is_trivial_index, uint32_t num_inputs,
    uint32_t lwe_size, uint32_t num_many_lut) {
void multi_gpu_gather_many_lut_lwe_async(CudaStreams streams, Torus *dest,
                                         const std::vector<Torus *> &src,
                                         Torus *h_dest_indexes,
                                         bool is_trivial_index,
                                         uint32_t num_inputs, uint32_t lwe_size,
                                         uint32_t num_many_lut) {

  for (uint lut_id = 0; lut_id < num_many_lut; lut_id++) {
    for (uint i = 0; i < gpu_count; i++) {
      auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count);
    for (uint i = 0; i < streams.count(); i++) {
      auto inputs_on_gpu =
          get_num_inputs_on_gpu(num_inputs, i, streams.count());
      auto gpu_offset = 0;
      for (uint j = 0; j < i; j++) {
        gpu_offset += get_num_inputs_on_gpu(num_inputs, j, gpu_count);
        gpu_offset += get_num_inputs_on_gpu(num_inputs, j, streams.count());
      }

      if (is_trivial_index) {
@@ -270,8 +272,8 @@ void multi_gpu_gather_many_lut_lwe_async(
        auto d_src = src[i] + lut_id * inputs_on_gpu * lwe_size;

        cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
            d_dest, d_src, inputs_on_gpu * lwe_size * sizeof(Torus), streams[i],
            gpu_indexes[i], true);
            d_dest, d_src, inputs_on_gpu * lwe_size * sizeof(Torus),
            streams.stream(i), streams.gpu_index(i), true);
      } else {
        auto dest_indexes = h_dest_indexes + gpu_offset;

@@ -282,8 +284,8 @@ void multi_gpu_gather_many_lut_lwe_async(
              src[i] + j * lwe_size + lut_id * inputs_on_gpu * lwe_size;

          cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
              d_dest, d_src, lwe_size * sizeof(Torus), streams[i],
              gpu_indexes[i], true);
              d_dest, d_src, lwe_size * sizeof(Torus), streams.stream(i),
              streams.gpu_index(i), true);
        }
      }
    }
@@ -291,16 +293,13 @@ void multi_gpu_gather_many_lut_lwe_async(
}

template <typename Torus>
void multi_gpu_release_async(cudaStream_t const *streams,
                             uint32_t const *gpu_indexes,
                             std::vector<Torus *> &vec) {
void multi_gpu_release_async(CudaStreams streams, std::vector<Torus *> &vec) {

  for (uint i = 0; i < vec.size(); i++)
    cuda_drop_async(vec[i], streams[i], gpu_indexes[i]);
    cuda_drop_async(vec[i], streams.stream(i), streams.gpu_index(i));
}
template void
multi_gpu_release_async<__uint128_t>(cudaStream_t const *streams,
                                     uint32_t const *gpu_indexes,
multi_gpu_release_async<__uint128_t>(CudaStreams streams,
                                     std::vector<__uint128_t *> &vec);

#endif

@@ -1,17 +1,17 @@
#include "zk.cuh"

uint64_t scratch_cuda_expand_without_verification_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension,
    uint32_t computing_ks_level, uint32_t computing_ks_base_log,
    uint32_t casting_input_dimension, uint32_t casting_output_dimension,
    uint32_t casting_ks_level, uint32_t casting_ks_base_log, uint32_t pbs_level,
    uint32_t pbs_base_log, uint32_t grouping_factor,
    const uint32_t *num_lwes_per_compact_list, const bool *is_boolean_array,
    uint32_t num_compact_lists, uint32_t message_modulus,
    uint32_t carry_modulus, PBS_TYPE pbs_type, KS_TYPE casting_key_type,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t computing_ks_level,
    uint32_t computing_ks_base_log, uint32_t casting_input_dimension,
    uint32_t casting_output_dimension, uint32_t casting_ks_level,
    uint32_t casting_ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, const uint32_t *num_lwes_per_compact_list,
    const bool *is_boolean_array, uint32_t num_compact_lists,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
    KS_TYPE casting_key_type, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {

  // Since CUDA backend works with the concept of "big" and "small" key, instead
  // of "input" and "output", we need to do this or otherwise our PBS will throw
@@ -34,17 +34,16 @@ uint64_t scratch_cuda_expand_without_verification_64(
      noise_reduction_type);

  return scratch_cuda_expand_without_verification<uint64_t>(
      (cudaStream_t *)streams, gpu_indexes, gpu_count,
      CudaStreams(streams),
      reinterpret_cast<zk_expand_mem<uint64_t> **>(mem_ptr),
      num_lwes_per_compact_list, is_boolean_array, num_compact_lists,
      computing_params, casting_params, casting_key_type, allocate_gpu_memory);
}

void cuda_expand_without_verification_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    void *lwe_array_out, const void *lwe_flattened_compact_array_in,
    int8_t *mem_ptr, void *const *bsks, void *const *computing_ksks,
    void *const *casting_keys,
    CudaStreamsFFI streams, void *lwe_array_out,
    const void *lwe_flattened_compact_array_in, int8_t *mem_ptr,
    void *const *bsks, void *const *computing_ksks, void *const *casting_keys,
    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {

  auto expand_buffer = reinterpret_cast<zk_expand_mem<uint64_t> *>(mem_ptr);
@@ -52,56 +51,49 @@ void cuda_expand_without_verification_64(
  switch (expand_buffer->casting_params.big_lwe_dimension) {
  case 256:
    host_expand_without_verification<uint64_t, AmortizedDegree<256>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(lwe_array_out),
        streams, static_cast<uint64_t *>(lwe_array_out),
        static_cast<const uint64_t *>(lwe_flattened_compact_array_in),
        expand_buffer, (uint64_t **)casting_keys, bsks,
        (uint64_t **)(computing_ksks), ms_noise_reduction_key);
    break;
  case 512:
    host_expand_without_verification<uint64_t, AmortizedDegree<512>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(lwe_array_out),
        streams, static_cast<uint64_t *>(lwe_array_out),
        static_cast<const uint64_t *>(lwe_flattened_compact_array_in),
        expand_buffer, (uint64_t **)casting_keys, bsks,
        (uint64_t **)(computing_ksks), ms_noise_reduction_key);
    break;
  case 1024:
    host_expand_without_verification<uint64_t, AmortizedDegree<1024>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(lwe_array_out),
        streams, static_cast<uint64_t *>(lwe_array_out),
        static_cast<const uint64_t *>(lwe_flattened_compact_array_in),
        expand_buffer, (uint64_t **)casting_keys, bsks,
        (uint64_t **)(computing_ksks), ms_noise_reduction_key);
    break;
  case 2048:
    host_expand_without_verification<uint64_t, AmortizedDegree<2048>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(lwe_array_out),
        streams, static_cast<uint64_t *>(lwe_array_out),
        static_cast<const uint64_t *>(lwe_flattened_compact_array_in),
        expand_buffer, (uint64_t **)casting_keys, bsks,
        (uint64_t **)(computing_ksks), ms_noise_reduction_key);
    break;
  case 4096:
    host_expand_without_verification<uint64_t, AmortizedDegree<4096>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(lwe_array_out),
        streams, static_cast<uint64_t *>(lwe_array_out),
        static_cast<const uint64_t *>(lwe_flattened_compact_array_in),
        expand_buffer, (uint64_t **)casting_keys, bsks,
        (uint64_t **)(computing_ksks), ms_noise_reduction_key);
    break;
  case 8192:
    host_expand_without_verification<uint64_t, AmortizedDegree<8192>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(lwe_array_out),
        streams, static_cast<uint64_t *>(lwe_array_out),
        static_cast<const uint64_t *>(lwe_flattened_compact_array_in),
        expand_buffer, (uint64_t **)casting_keys, bsks,
        (uint64_t **)(computing_ksks), ms_noise_reduction_key);
    break;
  case 16384:
    host_expand_without_verification<uint64_t, AmortizedDegree<16384>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(lwe_array_out),
        streams, static_cast<uint64_t *>(lwe_array_out),
        static_cast<const uint64_t *>(lwe_flattened_compact_array_in),
        expand_buffer, (uint64_t **)casting_keys, bsks,
        (uint64_t **)(computing_ksks), ms_noise_reduction_key);
@@ -114,14 +106,12 @@ void cuda_expand_without_verification_64(
  }
}
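The switch over casting_params.big_lwe_dimension converts a runtime value into the compile-time AmortizedDegree<N> template argument, so every supported polynomial size gets a fully specialized code path. The same dispatch pattern in isolation:

#include <cstdint>
#include <cstdio>

template <uint32_t Degree> void run_specialized() {
  // Degree is a compile-time constant here, so loop bounds and shared
  // memory can be sized statically inside a real kernel.
  std::printf("degree %u path\n", Degree);
}

void dispatch(uint32_t degree) {
  switch (degree) {
  case 256:
    run_specialized<256>();
    break;
  case 512:
    run_specialized<512>();
    break;
  case 1024:
    run_specialized<1024>();
    break;
  default:
    std::printf("unsupported degree %u\n", degree);
  }
}

int main() { dispatch(1024); }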

void cleanup_expand_without_verification_64(void *const *streams,
                                            uint32_t const *gpu_indexes,
                                            uint32_t gpu_count,
void cleanup_expand_without_verification_64(CudaStreamsFFI streams,
                                            int8_t **mem_ptr_void) {

  zk_expand_mem<uint64_t> *mem_ptr =
      reinterpret_cast<zk_expand_mem<uint64_t> *>(*mem_ptr_void);
  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
  mem_ptr->release(CudaStreams(streams));
  delete mem_ptr;
  *mem_ptr_void = nullptr;
}

@@ -17,8 +17,7 @@

template <typename Torus, class params>
__host__ void host_expand_without_verification(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, Torus *lwe_array_out,
    CudaStreams streams, Torus *lwe_array_out,
    const Torus *lwe_flattened_compact_array_in, zk_expand_mem<Torus> *mem_ptr,
    Torus *const *casting_keys, void *const *bsks, Torus *const *compute_ksks,
    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
@@ -51,13 +50,13 @@ __host__ void host_expand_without_verification(
      output_index++;
    }
  }
  cuda_memcpy_with_size_tracking_async_to_gpu(d_expand_jobs, h_expand_jobs,
                                              compact_lwe_lists.total_num_lwes *
                                                  sizeof(expand_job<Torus>),
                                              streams[0], gpu_indexes[0], true);
  cuda_memcpy_with_size_tracking_async_to_gpu(
      d_expand_jobs, h_expand_jobs,
      compact_lwe_lists.total_num_lwes * sizeof(expand_job<Torus>),
      streams.stream(0), streams.gpu_index(0), true);

  host_lwe_expand<Torus, params>(streams[0], gpu_indexes[0], expanded_lwes,
                                 d_expand_jobs, num_lwes);
  host_lwe_expand<Torus, params>(streams.stream(0), streams.gpu_index(0),
                                 expanded_lwes, d_expand_jobs, num_lwes);

  auto ksks = casting_keys;
  auto lwe_array_input = expanded_lwes;
@@ -78,7 +77,7 @@ __host__ void host_expand_without_verification(

  // apply keyswitch to BIG
  execute_keyswitch_async<Torus>(
      streams, gpu_indexes, 1, ksed_small_to_big_expanded_lwes,
      streams.subset_first_gpu(), ksed_small_to_big_expanded_lwes,
      lwe_trivial_indexes_vec[0], expanded_lwes, lwe_trivial_indexes_vec[0],
      casting_keys, casting_input_dimension, casting_output_dimension,
      casting_ks_base_log, casting_ks_level, num_lwes);
@@ -91,20 +90,19 @@ __host__ void host_expand_without_verification(
  // Apply LUT
  cuda_memset_async(lwe_array_out, 0,
                    (lwe_dimension + 1) * num_lwes * 2 * sizeof(Torus),
                    streams[0], gpu_indexes[0]);
                    streams.stream(0), streams.gpu_index(0));
  auto output = new CudaRadixCiphertextFFI;
  into_radix_ciphertext(output, lwe_array_out, 2 * num_lwes, lwe_dimension);
  auto input = new CudaRadixCiphertextFFI;
  into_radix_ciphertext(input, lwe_array_input, 2 * num_lwes, lwe_dimension);
  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, output, input, bsks, ksks,
      ms_noise_reduction_key, message_and_carry_extract_luts, 2 * num_lwes);
      streams, output, input, bsks, ksks, ms_noise_reduction_key,
      message_and_carry_extract_luts, 2 * num_lwes);
}
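Note the keyswitch call: the old code passed gpu_count = 1 explicitly, while the new code states the same intent as streams.subset_first_gpu(). A hedged sketch of what that accessor plausibly returns, inferred purely from the replaced call site (the actual method may differ):

// Assumed shape, mirroring the old (streams, gpu_indexes, 1) arguments:
// a non-owning view restricted to the first stream/GPU pair.
// CudaStreams CudaStreams::subset_first_gpu() const {
//   return CudaStreams(_streams, _gpu_indexes, /*gpu_count=*/1);
// }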

template <typename Torus>
__host__ uint64_t scratch_cuda_expand_without_verification(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, zk_expand_mem<Torus> **mem_ptr,
    CudaStreams streams, zk_expand_mem<Torus> **mem_ptr,
    const uint32_t *num_lwes_per_compact_list, const bool *is_boolean_array,
    uint32_t num_compact_lists, int_radix_params computing_params,
    int_radix_params casting_params, KS_TYPE casting_key_type,
@@ -112,9 +110,9 @@ __host__ uint64_t scratch_cuda_expand_without_verification(

  uint64_t size_tracker = 0;
  *mem_ptr = new zk_expand_mem<Torus>(
      streams, gpu_indexes, gpu_count, computing_params, casting_params,
      casting_key_type, num_lwes_per_compact_list, is_boolean_array,
      num_compact_lists, allocate_gpu_memory, size_tracker);
      streams, computing_params, casting_params, casting_key_type,
      num_lwes_per_compact_list, is_boolean_array, num_compact_lists,
      allocate_gpu_memory, size_tracker);
  return size_tracker;
}

@@ -1,9 +1,9 @@
#include "device.h"
#include "pbs/pbs_utilities.h"
#include "pbs/programmable_bootstrap.h"
#include "utils.h"
#include "gtest/gtest.h"
#include <cstdint>
#include <device.h>
#include <functional>
#include <random>
#include <setup_and_teardown.h>

@@ -1,3 +1,4 @@
#include "device.h"
#include "helper_multi_gpu.h"
#include <cmath>
#include <cstdint>

@@ -1,3 +1,4 @@
#include "device.h"
#include <cmath>
#include <cstdint>
#include <cstdio>

File diff suppressed because it is too large
@@ -109,6 +109,14 @@ impl CudaStreams {
        // The cast here is safe as GpuIndex is repr(transparent)
        self.gpu_indexes.as_ptr().cast()
    }

    pub fn ffi(&self) -> CudaStreamsFFI {
        CudaStreamsFFI {
            streams: self.ptr.as_ptr(),
            gpu_indexes: self.gpu_indexes_ptr(),
            gpu_count: self.len() as u32,
        }
    }
}

impl Clone for CudaStreams {

File diff suppressed because it is too large