chore(gpu): structure to encapsulate streams

Andrei Stoian
2025-09-03 16:07:05 +02:00
committed by Andrei Stoian
parent 1a2643d1da
commit 1dcc3c8c89
65 changed files with 4923 additions and 6211 deletions
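The change applies one mechanical pattern across the FFI surface: entry points that previously took a raw (streams, gpu_indexes, gpu_count) triple now take a single CudaStreamsFFI value, which the C++ side wraps into the new CudaStreams helper right at the boundary. A minimal sketch of the pattern, using an illustrative function name (scratch_cuda_example_64 is not part of the change):

// Before: the stream set crossed the FFI as three separate arguments.
uint64_t scratch_cuda_example_64(void *const *streams,
                                 uint32_t const *gpu_indexes,
                                 uint32_t gpu_count, int8_t **mem_ptr);

// After: one CudaStreamsFFI value carries the same information; the C++
// implementation wraps it immediately, e.g. `auto s = CudaStreams(streams);`.
uint64_t scratch_cuda_example_64(CudaStreamsFFI streams, int8_t **mem_ptr);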

View File

@@ -999,6 +999,11 @@ test_high_level_api_gpu: install_rs_build_toolchain install_cargo_nextest
--test-threads=4 --features=integer,internal-keycache,gpu,zk-pok -p tfhe \
-E "test(/high_level_api::.*gpu.*/)"
test_list_gpu: install_rs_build_toolchain install_cargo_nextest
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) nextest list --cargo-profile $(CARGO_PROFILE) \
--features=integer,internal-keycache,gpu,zk-pok -p tfhe \
-E "test(/.*gpu.*/)"
test_high_level_api_hpu: install_rs_build_toolchain install_cargo_nextest
ifeq ($(HPU_CONFIG), v80)
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) nextest run --cargo-profile $(CARGO_PROFILE) \

View File

@@ -4,9 +4,7 @@
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cuda_runtime.h>
#include <vector>
extern "C" {
@@ -141,4 +139,5 @@ bool cuda_check_support_thread_block_clusters();
template <typename Torus>
void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
Torus *d_array, Torus value, Torus n);
#endif

View File

@@ -4,6 +4,8 @@
#include <variant>
#include <vector>
#include "integer/integer.h"
extern std::mutex m;
extern bool p2p_enabled;
extern const int THRESHOLD_MULTI_GPU;
@@ -37,10 +39,149 @@ get_variant_element(const std::variant<std::vector<Torus>, Torus> &variant,
}
}
int get_active_gpu_count(int num_inputs, int gpu_count);
uint32_t get_active_gpu_count(uint32_t num_inputs, uint32_t gpu_count);
int get_num_inputs_on_gpu(int total_num_inputs, int gpu_index, int gpu_count);
int get_gpu_offset(int total_num_inputs, int gpu_index, int gpu_count);
// A set of GPU streams and the GPUs they are associated with.
// Can be constructed from the FFI struct CudaStreamsFFI, which
// is only used to pass the streams/GPUs across the Rust/C interface.
// This class should only be constructed from the FFI struct,
// through class methods or through the copy constructor. The class
// can also be constructed as an empty set.
struct CudaStreams {
private:
cudaStream_t const *_streams;
uint32_t const *_gpu_indexes;
uint32_t _gpu_count;
bool _owns_streams;
// Prevent the construction of a CudaStreams instance from user code
CudaStreams(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count)
: _streams(streams), _gpu_indexes(gpu_indexes), _gpu_count(gpu_count),
_owns_streams(false) {}
public:
// Construct an empty set. Invalid use of an empty set should raise an error
// right away through asserts or because of a nullptr dereference.
CudaStreams()
: _streams(nullptr), _gpu_indexes(nullptr), _gpu_count((uint32_t)-1),
_owns_streams(false) {}
// Returns a subset of this set as an active subset. An active subset is one
// that is temporarily used to perform some computation.
CudaStreams active_gpu_subset(int num_radix_blocks) {
return CudaStreams(_streams, _gpu_indexes,
get_active_gpu_count(num_radix_blocks, _gpu_count));
}
// Returns a subset containing only the first GPU of this set. It
// is used to create a subset of streams for mono-GPU functions.
CudaStreams subset_first_gpu() const {
return CudaStreams(_streams, _gpu_indexes, 1);
}
// Synchronize all the streams in the set
void synchronize() const {
for (uint32_t i = 0; i < _gpu_count; i++) {
cuda_synchronize_stream(_streams[i], _gpu_indexes[i]);
}
}
cudaStream_t stream(uint32_t idx) const {
PANIC_IF_FALSE(idx < _gpu_count, "Invalid GPU index");
return _streams[idx];
}
uint32_t gpu_index(uint32_t idx) const {
PANIC_IF_FALSE(idx < _gpu_count, "Invalid GPU index");
return _gpu_indexes[idx];
}
uint32_t count() const { return _gpu_count; }
// Construct from the rust FFI stream set. Streams are created in rust
// using the bindings.
CudaStreams(CudaStreamsFFI &ffi)
: _streams((cudaStream_t *)ffi.streams), _gpu_indexes(ffi.gpu_indexes),
_gpu_count(ffi.gpu_count), _owns_streams(false) {}
// Creates a new set of streams on the same GPUs as those of the current
// stream set. Can be used to parallelize computation by issuing kernels on
// multiple streams on the same GPU.
void create_on_same_gpus(const CudaStreams &other) {
PANIC_IF_FALSE(_streams == nullptr,
"Cannot create streams on a non-empty CudaStreams");
cudaStream_t *new_streams = new cudaStream_t[other._gpu_count];
uint32_t *gpu_indexes_clone = new uint32_t[_gpu_count];
for (uint32_t i = 0; i < other._gpu_count; ++i) {
new_streams[i] = cuda_create_stream(other._gpu_indexes[i]);
gpu_indexes_clone[i] = other._gpu_indexes[i];
}
this->_streams = new_streams;
this->_gpu_indexes = gpu_indexes_clone;
this->_gpu_count = other._gpu_count;
// Flag this instance as owning streams so that we can destroy
// the streams when they aren't needed anymore
this->_owns_streams = true;
}
// Copy constructor, setting the ownership flag to false.
// Only the initial instance of CudaStreams that created streams with
// create_on_same_gpus owns them; all copies of it do not own the
// streams.
CudaStreams(const CudaStreams &src)
: _streams(src._streams), _gpu_indexes(src._gpu_indexes),
_gpu_count(src._gpu_count), _owns_streams(false) {}
CudaStreams &operator=(CudaStreams const &other) {
PANIC_IF_FALSE(this->_streams == nullptr ||
this->_streams == other._streams,
"Assigning an already initialized CudaStreams");
this->_streams = other._streams;
this->_gpu_indexes = other._gpu_indexes;
this->_gpu_count = other._gpu_count;
// Only the initial instance of CudaStreams that created streams with
// create_on_same_gpus owns them; all copies of it do not own the
// streams.
this->_owns_streams = false;
return *this;
}
// Destroys the streams if they were created by create_on_same_gpus.
// We require the developer to call `release` on all instances
// of cloned streams.
void release() {
// If this instance doesn't own streams, there's nothing to do
// as the streams were created on the Rust side.
if (_owns_streams) {
for (uint32_t i = 0; i < _gpu_count; ++i) {
cuda_destroy_stream(_streams[i], _gpu_indexes[i]);
}
delete[] _streams;
_streams = nullptr;
delete[] _gpu_indexes;
_gpu_indexes = nullptr;
}
}
// The destructor checks that streams created with create_on_same_gpus
// were released manually with `release`.
~CudaStreams() {
// Ensure streams are destroyed
PANIC_IF_FALSE(
!_owns_streams || _streams == nullptr,
"Destroy (this=%p) was not called on a CudaStreams object that "
"is a clone "
"of another one, %p",
this, this->_streams);
}
};
#endif
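A short usage sketch of the struct above, assuming a CudaStreamsFFI value received from the Rust bindings; the entry-point and variable names are illustrative, not part of the change:

void example_entry_point(CudaStreamsFFI ffi_streams, uint32_t num_radix_blocks) {
  // Non-owning view over the streams created on the Rust side.
  CudaStreams streams(ffi_streams);

  // Restrict work to the GPUs that will actually receive inputs.
  CudaStreams active_streams = streams.active_gpu_subset(num_radix_blocks);

  // Extra streams on the same GPUs for intra-GPU parallelism; this
  // instance owns them and must be released explicitly.
  CudaStreams extra_streams;
  extra_streams.create_on_same_gpus(active_streams);

  // ... issue kernels on active_streams.stream(i) / extra_streams.stream(i) ...

  extra_streams.synchronize();
  extra_streams.release(); // owning instances must be released before destruction
}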

View File

@@ -2,6 +2,7 @@
#define CUDA_INTEGER_COMPRESSION_H
#include "../../pbs/pbs_enums.h"
#include "../integer.h"
typedef struct {
void *ptr;
@@ -25,77 +26,65 @@ typedef struct {
extern "C" {
uint64_t scratch_cuda_integer_compress_radix_ciphertext_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t compression_glwe_dimension,
uint32_t compression_polynomial_size, uint32_t lwe_dimension,
uint32_t ks_level, uint32_t ks_base_log, uint32_t num_radix_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
uint32_t lwe_per_glwe, bool allocate_gpu_memory);
CudaStreamsFFI streams, int8_t **mem_ptr,
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, uint32_t lwe_per_glwe, bool allocate_gpu_memory);
uint64_t scratch_cuda_integer_decompress_radix_ciphertext_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t encryption_glwe_dimension,
uint32_t encryption_polynomial_size, uint32_t compression_glwe_dimension,
uint32_t compression_polynomial_size, uint32_t lwe_dimension,
uint32_t pbs_level, uint32_t pbs_base_log,
CudaStreamsFFI streams, int8_t **mem_ptr,
uint32_t encryption_glwe_dimension, uint32_t encryption_polynomial_size,
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
uint32_t lwe_dimension, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t num_blocks_to_decompress, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
void cuda_integer_compress_radix_ciphertext_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaPackedGlweCiphertextListFFI *glwe_array_out,
CudaStreamsFFI streams, CudaPackedGlweCiphertextListFFI *glwe_array_out,
CudaLweCiphertextListFFI const *lwe_array_in, void *const *fp_ksk,
int8_t *mem_ptr);
void cuda_integer_decompress_radix_ciphertext_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaLweCiphertextListFFI *lwe_array_out,
CudaStreamsFFI streams, CudaLweCiphertextListFFI *lwe_array_out,
CudaPackedGlweCiphertextListFFI const *glwe_in,
uint32_t const *indexes_array, void *const *bsks, int8_t *mem_ptr);
void cleanup_cuda_integer_compress_radix_ciphertext_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void cleanup_cuda_integer_compress_radix_ciphertext_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
void cleanup_cuda_integer_decompress_radix_ciphertext_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void cleanup_cuda_integer_decompress_radix_ciphertext_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
uint64_t scratch_cuda_integer_compress_radix_ciphertext_128(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t compression_glwe_dimension,
uint32_t compression_polynomial_size, uint32_t lwe_dimension,
uint32_t ks_level, uint32_t ks_base_log, uint32_t num_radix_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
uint32_t lwe_per_glwe, bool allocate_gpu_memory);
CudaStreamsFFI streams, int8_t **mem_ptr,
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, uint32_t lwe_per_glwe, bool allocate_gpu_memory);
uint64_t scratch_cuda_integer_decompress_radix_ciphertext_128(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t compression_glwe_dimension,
uint32_t compression_polynomial_size, uint32_t lwe_dimension,
uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
bool allocate_gpu_memory);
CudaStreamsFFI streams, int8_t **mem_ptr,
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
uint32_t lwe_dimension, uint32_t num_radix_blocks, uint32_t message_modulus,
uint32_t carry_modulus, bool allocate_gpu_memory);
void cuda_integer_compress_radix_ciphertext_128(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaPackedGlweCiphertextListFFI *glwe_array_out,
CudaStreamsFFI streams, CudaPackedGlweCiphertextListFFI *glwe_array_out,
CudaLweCiphertextListFFI const *lwe_array_in, void *const *fp_ksk,
int8_t *mem_ptr);
void cuda_integer_decompress_radix_ciphertext_128(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaLweCiphertextListFFI *lwe_array_out,
CudaStreamsFFI streams, CudaLweCiphertextListFFI *lwe_array_out,
CudaPackedGlweCiphertextListFFI const *glwe_in,
uint32_t const *indexes_array, int8_t *mem_ptr);
void cleanup_cuda_integer_compress_radix_ciphertext_128(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void cleanup_cuda_integer_compress_radix_ciphertext_128(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
void cleanup_cuda_integer_decompress_radix_ciphertext_128(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void);
CudaStreamsFFI streams, int8_t **mem_ptr_void);
}
#endif

View File

@@ -12,8 +12,7 @@ template <typename Torus> struct int_compression {
bool gpu_memory_allocated;
uint32_t lwe_per_glwe;
int_compression(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_radix_params compression_params,
int_compression(CudaStreams streams, int_radix_params compression_params,
uint32_t num_radix_blocks, uint32_t lwe_per_glwe,
bool allocate_gpu_memory, uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
@@ -25,26 +24,29 @@ template <typename Torus> struct int_compression {
tmp_lwe = static_cast<Torus *>(cuda_malloc_with_size_tracking_async(
num_radix_blocks * (compression_params.small_lwe_dimension + 1) *
sizeof(Torus),
streams[0], gpu_indexes[0], size_tracker, allocate_gpu_memory));
streams.stream(0), streams.gpu_index(0), size_tracker,
allocate_gpu_memory));
tmp_glwe_array_out =
static_cast<Torus *>(cuda_malloc_with_size_tracking_async(
lwe_per_glwe * glwe_accumulator_size * sizeof(Torus), streams[0],
gpu_indexes[0], size_tracker, allocate_gpu_memory));
lwe_per_glwe * glwe_accumulator_size * sizeof(Torus),
streams.stream(0), streams.gpu_index(0), size_tracker,
allocate_gpu_memory));
size_tracker += scratch_packing_keyswitch_lwe_list_to_glwe<Torus>(
streams[0], gpu_indexes[0], &fp_ks_buffer,
streams.stream(0), streams.gpu_index(0), &fp_ks_buffer,
compression_params.small_lwe_dimension,
compression_params.glwe_dimension, compression_params.polynomial_size,
num_radix_blocks, allocate_gpu_memory);
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
cuda_drop_with_size_tracking_async(tmp_lwe, streams[0], gpu_indexes[0],
void release(CudaStreams streams) {
cuda_drop_with_size_tracking_async(
tmp_lwe, streams.stream(0), streams.gpu_index(0), gpu_memory_allocated);
cuda_drop_with_size_tracking_async(tmp_glwe_array_out, streams.stream(0),
streams.gpu_index(0),
gpu_memory_allocated);
cuda_drop_with_size_tracking_async(tmp_glwe_array_out, streams[0],
gpu_indexes[0], gpu_memory_allocated);
cleanup_packing_keyswitch_lwe_list_to_glwe(
streams[0], gpu_indexes[0], &fp_ks_buffer, gpu_memory_allocated);
streams.stream(0), streams.gpu_index(0), &fp_ks_buffer,
gpu_memory_allocated);
}
};
@@ -60,8 +62,7 @@ template <typename Torus> struct int_decompression {
int_radix_lut<Torus> *decompression_rescale_lut;
bool gpu_memory_allocated;
int_decompression(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_radix_params encryption_params,
int_decompression(CudaStreams streams, int_radix_params encryption_params,
int_radix_params compression_params,
uint32_t num_blocks_to_decompress, bool allocate_gpu_memory,
uint64_t &size_tracker) {
@@ -78,19 +79,21 @@ template <typename Torus> struct int_decompression {
tmp_extracted_glwe = (Torus *)cuda_malloc_with_size_tracking_async(
num_blocks_to_decompress * glwe_accumulator_size * sizeof(Torus),
streams[0], gpu_indexes[0], size_tracker, allocate_gpu_memory);
streams.stream(0), streams.gpu_index(0), size_tracker,
allocate_gpu_memory);
tmp_indexes_array = (uint32_t *)cuda_malloc_with_size_tracking_async(
num_blocks_to_decompress * sizeof(uint32_t), streams[0], gpu_indexes[0],
size_tracker, allocate_gpu_memory);
num_blocks_to_decompress * sizeof(uint32_t), streams.stream(0),
streams.gpu_index(0), size_tracker, allocate_gpu_memory);
tmp_extracted_lwe = (Torus *)cuda_malloc_with_size_tracking_async(
num_blocks_to_decompress * lwe_accumulator_size * sizeof(Torus),
streams[0], gpu_indexes[0], size_tracker, allocate_gpu_memory);
streams.stream(0), streams.gpu_index(0), size_tracker,
allocate_gpu_memory);
// rescale is only needed on 64-bit decompression
if constexpr (std::is_same_v<Torus, uint64_t>) {
decompression_rescale_lut = new int_radix_lut<Torus>(
streams, gpu_indexes, gpu_count, encryption_params, 1,
num_blocks_to_decompress, allocate_gpu_memory, size_tracker);
streams, encryption_params, 1, num_blocks_to_decompress,
allocate_gpu_memory, size_tracker);
// Rescale is done using an identity LUT
// Here we do not divide by message_modulus
@@ -98,8 +101,8 @@ template <typename Torus> struct int_decompression {
// space, we want to keep the original 2-bit value in the 4-bit space,
// so we apply the identity and the encoding will rescale it for us.
decompression_rescale_lut = new int_radix_lut<Torus>(
streams, gpu_indexes, gpu_count, encryption_params, 1,
num_blocks_to_decompress, allocate_gpu_memory, size_tracker);
streams, encryption_params, 1, num_blocks_to_decompress,
allocate_gpu_memory, size_tracker);
auto decompression_rescale_f = [](Torus x) -> Torus { return x; };
auto effective_compression_message_modulus =
@@ -107,7 +110,8 @@ template <typename Torus> struct int_decompression {
auto effective_compression_carry_modulus = 1;
generate_device_accumulator_with_encoding<Torus>(
streams[0], gpu_indexes[0], decompression_rescale_lut->get_lut(0, 0),
streams.stream(0), streams.gpu_index(0),
decompression_rescale_lut->get_lut(0, 0),
decompression_rescale_lut->get_degree(0),
decompression_rescale_lut->get_max_degree(0),
encryption_params.glwe_dimension, encryption_params.polynomial_size,
@@ -115,22 +119,22 @@ template <typename Torus> struct int_decompression {
effective_compression_carry_modulus,
encryption_params.message_modulus, encryption_params.carry_modulus,
decompression_rescale_f, gpu_memory_allocated);
auto active_gpu_count =
get_active_gpu_count(num_blocks_to_decompress, gpu_count);
decompression_rescale_lut->broadcast_lut(streams, gpu_indexes,
active_gpu_count);
auto active_streams = streams.active_gpu_subset(num_blocks_to_decompress);
decompression_rescale_lut->broadcast_lut(active_streams);
}
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
cuda_drop_with_size_tracking_async(tmp_extracted_glwe, streams[0],
gpu_indexes[0], gpu_memory_allocated);
cuda_drop_with_size_tracking_async(tmp_extracted_lwe, streams[0],
gpu_indexes[0], gpu_memory_allocated);
cuda_drop_with_size_tracking_async(tmp_indexes_array, streams[0],
gpu_indexes[0], gpu_memory_allocated);
void release(CudaStreams streams) {
cuda_drop_with_size_tracking_async(tmp_extracted_glwe, streams.stream(0),
streams.gpu_index(0),
gpu_memory_allocated);
cuda_drop_with_size_tracking_async(tmp_extracted_lwe, streams.stream(0),
streams.gpu_index(0),
gpu_memory_allocated);
cuda_drop_with_size_tracking_async(tmp_indexes_array, streams.stream(0),
streams.gpu_index(0),
gpu_memory_allocated);
if constexpr (std::is_same_v<Torus, uint64_t>) {
decompression_rescale_lut->release(streams, gpu_indexes, gpu_count);
decompression_rescale_lut->release(streams);
delete decompression_rescale_lut;
decompression_rescale_lut = nullptr;
}

File diff suppressed because it is too large.

View File

@@ -7,28 +7,25 @@
extern "C" {
uint64_t scratch_cuda_expand_without_verification_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension,
uint32_t computing_ks_level, uint32_t computing_ks_base_log,
uint32_t casting_input_dimension, uint32_t casting_output_dimension,
uint32_t casting_ks_level, uint32_t casting_ks_base_log, uint32_t pbs_level,
uint32_t pbs_base_log, uint32_t grouping_factor,
const uint32_t *num_lwes_per_compact_list, const bool *is_boolean_array,
uint32_t num_compact_lists, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, KS_TYPE casting_key_type,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t computing_ks_level,
uint32_t computing_ks_base_log, uint32_t casting_input_dimension,
uint32_t casting_output_dimension, uint32_t casting_ks_level,
uint32_t casting_ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, const uint32_t *num_lwes_per_compact_list,
const bool *is_boolean_array, uint32_t num_compact_lists,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
KS_TYPE casting_key_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
void cuda_expand_without_verification_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, const void *lwe_flattened_compact_array_in,
int8_t *mem_ptr, void *const *bsks, void *const *computing_ksks,
void *const *casting_keys,
CudaStreamsFFI streams, void *lwe_array_out,
const void *lwe_flattened_compact_array_in, int8_t *mem_ptr,
void *const *bsks, void *const *computing_ksks, void *const *casting_keys,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);
void cleanup_expand_without_verification_64(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
void cleanup_expand_without_verification_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
}
#endif // ZK_H

View File

@@ -113,8 +113,7 @@ template <typename Torus> struct zk_expand_mem {
expand_job<Torus> *d_expand_jobs;
expand_job<Torus> *h_expand_jobs;
zk_expand_mem(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_radix_params computing_params,
zk_expand_mem(CudaStreams streams, int_radix_params computing_params,
int_radix_params casting_params, KS_TYPE casting_key_type,
const uint32_t *num_lwes_per_compact_list,
const bool *is_boolean_array, uint32_t num_compact_lists,
@@ -172,11 +171,10 @@ template <typename Torus> struct zk_expand_mem {
params = computing_params;
}
message_and_carry_extract_luts = new int_radix_lut<Torus>(
streams, gpu_indexes, gpu_count, params, 4, 2 * num_lwes,
allocate_gpu_memory, size_tracker);
streams, params, 4, 2 * num_lwes, allocate_gpu_memory, size_tracker);
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0],
streams.stream(0), streams.gpu_index(0),
message_and_carry_extract_luts->get_lut(0, 0),
message_and_carry_extract_luts->get_degree(0),
message_and_carry_extract_luts->get_max_degree(0),
@@ -184,7 +182,7 @@ template <typename Torus> struct zk_expand_mem {
params.carry_modulus, message_extract_lut_f, gpu_memory_allocated);
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0],
streams.stream(0), streams.gpu_index(0),
message_and_carry_extract_luts->get_lut(0, 1),
message_and_carry_extract_luts->get_degree(1),
message_and_carry_extract_luts->get_max_degree(1),
@@ -192,7 +190,7 @@ template <typename Torus> struct zk_expand_mem {
params.carry_modulus, carry_extract_lut_f, gpu_memory_allocated);
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0],
streams.stream(0), streams.gpu_index(0),
message_and_carry_extract_luts->get_lut(0, 2),
message_and_carry_extract_luts->get_degree(2),
message_and_carry_extract_luts->get_max_degree(2),
@@ -201,7 +199,7 @@ template <typename Torus> struct zk_expand_mem {
gpu_memory_allocated);
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0],
streams.stream(0), streams.gpu_index(0),
message_and_carry_extract_luts->get_lut(0, 3),
message_and_carry_extract_luts->get_degree(3),
message_and_carry_extract_luts->get_max_degree(3),
@@ -226,8 +224,8 @@ template <typename Torus> struct zk_expand_mem {
d_expand_jobs =
static_cast<expand_job<Torus> *>(cuda_malloc_with_size_tracking_async(
num_lwes * sizeof(expand_job<Torus>), streams[0], gpu_indexes[0],
size_tracker, allocate_gpu_memory));
num_lwes * sizeof(expand_job<Torus>), streams.stream(0),
streams.gpu_index(0), size_tracker, allocate_gpu_memory));
h_expand_jobs = static_cast<expand_job<Torus> *>(
malloc(num_lwes * sizeof(expand_job<Torus>)));
@@ -284,50 +282,51 @@ template <typename Torus> struct zk_expand_mem {
}
message_and_carry_extract_luts->set_lwe_indexes(
streams[0], gpu_indexes[0], h_indexes_in, h_indexes_out);
streams.stream(0), streams.gpu_index(0), h_indexes_in, h_indexes_out);
auto lut_indexes = message_and_carry_extract_luts->get_lut_indexes(0, 0);
cuda_memcpy_with_size_tracking_async_to_gpu(
lut_indexes, h_lut_indexes, num_packed_msgs * num_lwes * sizeof(Torus),
streams[0], gpu_indexes[0], allocate_gpu_memory);
streams.stream(0), streams.gpu_index(0), allocate_gpu_memory);
auto active_gpu_count = get_active_gpu_count(2 * num_lwes, gpu_count);
message_and_carry_extract_luts->broadcast_lut(streams, gpu_indexes,
active_gpu_count);
auto active_streams = streams.active_gpu_subset(2 * num_lwes);
message_and_carry_extract_luts->broadcast_lut(active_streams);
message_and_carry_extract_luts->allocate_lwe_vector_for_non_trivial_indexes(
streams, gpu_indexes, active_gpu_count, 2 * num_lwes, size_tracker,
allocate_gpu_memory);
active_streams, 2 * num_lwes, size_tracker, allocate_gpu_memory);
// The expanded LWEs will always be on the casting key format
tmp_expanded_lwes = (Torus *)cuda_malloc_with_size_tracking_async(
num_lwes * (casting_params.big_lwe_dimension + 1) * sizeof(Torus),
streams[0], gpu_indexes[0], size_tracker, allocate_gpu_memory);
streams.stream(0), streams.gpu_index(0), size_tracker,
allocate_gpu_memory);
tmp_ksed_small_to_big_expanded_lwes =
(Torus *)cuda_malloc_with_size_tracking_async(
num_lwes * (casting_params.big_lwe_dimension + 1) * sizeof(Torus),
streams[0], gpu_indexes[0], size_tracker, allocate_gpu_memory);
streams.stream(0), streams.gpu_index(0), size_tracker,
allocate_gpu_memory);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
free(h_indexes_in);
free(h_indexes_out);
free(h_lut_indexes);
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
void release(CudaStreams streams) {
message_and_carry_extract_luts->release(streams, gpu_indexes, gpu_count);
message_and_carry_extract_luts->release(streams);
delete message_and_carry_extract_luts;
cuda_drop_with_size_tracking_async(tmp_expanded_lwes, streams[0],
gpu_indexes[0], gpu_memory_allocated);
cuda_drop_with_size_tracking_async(tmp_ksed_small_to_big_expanded_lwes,
streams[0], gpu_indexes[0],
cuda_drop_with_size_tracking_async(tmp_expanded_lwes, streams.stream(0),
streams.gpu_index(0),
gpu_memory_allocated);
cuda_drop_with_size_tracking_async(d_expand_jobs, streams[0],
gpu_indexes[0], gpu_memory_allocated);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
cuda_drop_with_size_tracking_async(tmp_ksed_small_to_big_expanded_lwes,
streams.stream(0), streams.gpu_index(0),
gpu_memory_allocated);
cuda_drop_with_size_tracking_async(d_expand_jobs, streams.stream(0),
streams.gpu_index(0),
gpu_memory_allocated);
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
free(num_lwes_per_compact_list);
free(h_expand_jobs);
}

View File

@@ -49,17 +49,16 @@ __global__ void device_batch_fft_ggsw_vector(double2 *dest, T *src,
* global memory
*/
template <typename T, typename ST, class params>
void batch_fft_ggsw_vector(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, double2 *dest, T *src,
void batch_fft_ggsw_vector(CudaStreams streams, double2 *dest, T *src,
int8_t *d_mem, uint32_t r, uint32_t glwe_dim,
uint32_t polynomial_size, uint32_t level_count,
uint32_t max_shared_memory) {
PANIC_IF_FALSE(gpu_count == 1,
PANIC_IF_FALSE(streams.count() == 1,
"GPU error (batch_fft_ggsw_vector): multi-GPU execution on %d "
"gpus is not supported yet.",
gpu_count);
streams.count());
cuda_set_device(gpu_indexes[0]);
cuda_set_device(streams.gpu_index(0));
int shared_memory_size = sizeof(double) * polynomial_size;
@@ -68,11 +67,11 @@ void batch_fft_ggsw_vector(cudaStream_t *streams, uint32_t *gpu_indexes,
if (max_shared_memory < shared_memory_size) {
device_batch_fft_ggsw_vector<T, ST, params, NOSM>
<<<gridSize, blockSize, 0, streams[0]>>>(dest, src, d_mem);
<<<gridSize, blockSize, 0, streams.stream(0)>>>(dest, src, d_mem);
} else {
device_batch_fft_ggsw_vector<T, ST, params, FULLSM>
<<<gridSize, blockSize, shared_memory_size, streams[0]>>>(dest, src,
d_mem);
<<<gridSize, blockSize, shared_memory_size, streams.stream(0)>>>(
dest, src, d_mem);
}
check_cuda_error(cudaGetLastError());
}
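This routine asserts a single-GPU stream set; a caller holding a multi-GPU CudaStreams would presumably pass a one-GPU view, for example via the subset_first_gpu() helper introduced above (hypothetical call site):

// Hypothetical call site: restrict a multi-GPU set to its first GPU before
// invoking a mono-GPU routine such as batch_fft_ggsw_vector.
batch_fft_ggsw_vector<T, ST, params>(streams.subset_first_gpu(), dest, src,
                                     d_mem, r, glwe_dim, polynomial_size,
                                     level_count, max_shared_memory);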

View File

@@ -142,8 +142,7 @@ __host__ void host_keyswitch_lwe_ciphertext_vector(
}
template <typename Torus>
void execute_keyswitch_async(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
void execute_keyswitch_async(CudaStreams streams,
const LweArrayVariant<Torus> &lwe_array_out,
const LweArrayVariant<Torus> &lwe_output_indexes,
const LweArrayVariant<Torus> &lwe_array_in,
@@ -154,8 +153,9 @@ void execute_keyswitch_async(cudaStream_t const *streams,
/// If the number of radix blocks is lower than the number of GPUs, not all
/// GPUs will be active and there will be 1 input per GPU
for (uint i = 0; i < gpu_count; i++) {
int num_samples_on_gpu = get_num_inputs_on_gpu(num_samples, i, gpu_count);
for (uint i = 0; i < streams.count(); i++) {
int num_samples_on_gpu =
get_num_inputs_on_gpu(num_samples, i, streams.count());
Torus *current_lwe_array_out = get_variant_element(lwe_array_out, i);
Torus *current_lwe_output_indexes =
@@ -166,7 +166,7 @@ void execute_keyswitch_async(cudaStream_t const *streams,
// Compute Keyswitch
host_keyswitch_lwe_ciphertext_vector<Torus>(
streams[i], gpu_indexes[i], current_lwe_array_out,
streams.stream(i), streams.gpu_index(i), current_lwe_array_out,
current_lwe_output_indexes, current_lwe_array_in,
current_lwe_input_indexes, ksks[i], lwe_dimension_in, lwe_dimension_out,
base_log, level_count, num_samples_on_gpu);

View File

@@ -128,13 +128,6 @@ void cuda_synchronize_stream(cudaStream_t stream, uint32_t gpu_index) {
check_cuda_error(cudaStreamSynchronize(stream));
}
void synchronize_streams(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count) {
for (uint i = 0; i < gpu_count; i++) {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
}
// Determine if a CUDA device is available at runtime
uint32_t cuda_is_available() { return cudaSetDevice(0) == cudaSuccess; }
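With the free function synchronize_streams removed, callers presumably synchronize through the method on the new struct, which runs the same per-GPU loop; a minimal sketch (ffi_streams is illustrative):

// Previously: synchronize_streams(streams, gpu_indexes, gpu_count);
CudaStreams streams(ffi_streams); // non-owning view over the FFI streams
streams.synchronize();            // cuda_synchronize_stream on every stream/GPU pair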

View File

@@ -1,13 +1,12 @@
#include "integer/abs.cuh"
uint64_t scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, bool is_signed, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, bool allocate_gpu_memory,
CudaStreamsFFI streams, int8_t **mem_ptr, bool is_signed,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
@@ -16,31 +15,27 @@ uint64_t scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64(
message_modulus, carry_modulus, noise_reduction_type);
return scratch_cuda_integer_abs_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_abs_buffer<uint64_t> **)mem_ptr, is_signed, num_blocks, params,
allocate_gpu_memory);
CudaStreams(streams), (int_abs_buffer<uint64_t> **)mem_ptr, is_signed,
num_blocks, params, allocate_gpu_memory);
}
void cuda_integer_abs_inplace_radix_ciphertext_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *ct, int8_t *mem_ptr, bool is_signed,
void *const *bsks, void *const *ksks,
CudaStreamsFFI streams, CudaRadixCiphertextFFI *ct, int8_t *mem_ptr,
bool is_signed, void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
auto mem = (int_abs_buffer<uint64_t> *)mem_ptr;
host_integer_abs_kb<uint64_t>((cudaStream_t *)(streams), gpu_indexes,
gpu_count, ct, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key, mem, is_signed);
host_integer_abs_kb<uint64_t>(CudaStreams(streams), ct, bsks,
(uint64_t **)(ksks), ms_noise_reduction_key,
mem, is_signed);
}
void cleanup_cuda_integer_abs_inplace(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
void cleanup_cuda_integer_abs_inplace(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
int_abs_buffer<uint64_t> *mem_ptr =
(int_abs_buffer<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
*mem_ptr_void = nullptr;
}

View File

@@ -18,14 +18,12 @@
template <typename Torus>
__host__ uint64_t scratch_cuda_integer_abs_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_abs_buffer<Torus> **mem_ptr, bool is_signed,
CudaStreams streams, int_abs_buffer<Torus> **mem_ptr, bool is_signed,
uint32_t num_blocks, int_radix_params params, bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
if (is_signed) {
*mem_ptr = new int_abs_buffer<Torus>(streams, gpu_indexes, gpu_count,
params, num_blocks,
*mem_ptr = new int_abs_buffer<Torus>(streams, params, num_blocks,
allocate_gpu_memory, size_tracker);
}
return size_tracker;
@@ -33,8 +31,7 @@ __host__ uint64_t scratch_cuda_integer_abs_kb(
template <typename Torus>
__host__ void host_integer_abs_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *ct, void *const *bsks,
CudaStreams streams, CudaRadixCiphertextFFI *ct, void *const *bsks,
uint64_t *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
int_abs_buffer<uint64_t> *mem_ptr, bool is_signed) {
@@ -47,24 +44,24 @@ __host__ void host_integer_abs_kb(
(31 - __builtin_clz(mem_ptr->params.message_modulus)) *
ct->num_radix_blocks;
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], mask, ct);
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
mask, ct);
host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, mask, num_bits_in_ciphertext - 1,
streams, mask, num_bits_in_ciphertext - 1,
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key);
host_addition<Torus>(streams[0], gpu_indexes[0], ct, mask, ct,
host_addition<Torus>(streams.stream(0), streams.gpu_index(0), ct, mask, ct,
ct->num_radix_blocks, mem_ptr->params.message_modulus,
mem_ptr->params.carry_modulus);
uint32_t requested_flag = outputFlag::FLAG_NONE;
uint32_t uses_carry = 0;
host_propagate_single_carry<Torus>(
streams, gpu_indexes, gpu_count, ct, nullptr, nullptr, mem_ptr->scp_mem,
bsks, ksks, ms_noise_reduction_key, requested_flag, uses_carry);
streams, ct, nullptr, nullptr, mem_ptr->scp_mem, bsks, ksks,
ms_noise_reduction_key, requested_flag, uses_carry);
host_integer_radix_bitop_kb<Torus>(streams, gpu_indexes, gpu_count, ct, mask,
ct, mem_ptr->bitxor_mem, bsks, ksks,
ms_noise_reduction_key);
host_integer_radix_bitop_kb<Torus>(streams, ct, mask, ct, mem_ptr->bitxor_mem,
bsks, ksks, ms_noise_reduction_key);
}
#endif // TFHE_RS_ABS_CUH

View File

@@ -1,14 +1,13 @@
#include "integer/bitwise_ops.cuh"
uint64_t scratch_cuda_integer_radix_bitop_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t lwe_ciphertext_count,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
BITOP_TYPE op_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type) {
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t lwe_ciphertext_count, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, BITOP_TYPE op_type,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
@@ -16,32 +15,28 @@ uint64_t scratch_cuda_integer_radix_bitop_kb_64(
message_modulus, carry_modulus, noise_reduction_type);
return scratch_cuda_integer_radix_bitop_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_bitop_buffer<uint64_t> **)mem_ptr, lwe_ciphertext_count, params,
op_type, allocate_gpu_memory);
CudaStreams(streams), (int_bitop_buffer<uint64_t> **)mem_ptr,
lwe_ciphertext_count, params, op_type, allocate_gpu_memory);
}
void cuda_bitop_integer_radix_ciphertext_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *lwe_array_out,
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_1,
CudaRadixCiphertextFFI const *lwe_array_2, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
host_integer_radix_bitop_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out,
lwe_array_1, lwe_array_2, (int_bitop_buffer<uint64_t> *)mem_ptr, bsks,
(uint64_t **)(ksks), ms_noise_reduction_key);
CudaStreams(streams), lwe_array_out, lwe_array_1, lwe_array_2,
(int_bitop_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key);
}
void cleanup_cuda_integer_bitop(void *const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void) {
void cleanup_cuda_integer_bitop(CudaStreamsFFI streams, int8_t **mem_ptr_void) {
int_bitop_buffer<uint64_t> *mem_ptr =
(int_bitop_buffer<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
*mem_ptr_void = nullptr;
}

View File

@@ -13,8 +13,7 @@
template <typename Torus>
__host__ void host_integer_radix_bitop_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_1,
CudaRadixCiphertextFFI const *lwe_array_2, int_bitop_buffer<Torus> *mem_ptr,
void *const *bsks, Torus *const *ksks,
@@ -46,8 +45,8 @@ __host__ void host_integer_radix_bitop_kb(
}
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_1, lwe_array_2,
bsks, ksks, ms_noise_reduction_key, lut, lwe_array_out->num_radix_blocks,
streams, lwe_array_out, lwe_array_1, lwe_array_2, bsks, ksks,
ms_noise_reduction_key, lut, lwe_array_out->num_radix_blocks,
lut->params.message_modulus);
memcpy(lwe_array_out->degrees, degrees,
@@ -56,14 +55,12 @@ __host__ void host_integer_radix_bitop_kb(
template <typename Torus>
__host__ uint64_t scratch_cuda_integer_radix_bitop_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_bitop_buffer<Torus> **mem_ptr,
CudaStreams streams, int_bitop_buffer<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params, BITOP_TYPE op,
bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_bitop_buffer<Torus>(streams, gpu_indexes, gpu_count, op,
params, num_radix_blocks,
*mem_ptr = new int_bitop_buffer<Torus>(streams, op, params, num_radix_blocks,
allocate_gpu_memory, size_tracker);
return size_tracker;
}

View File

@@ -2,28 +2,26 @@
void extend_radix_with_trivial_zero_blocks_msb_64(
CudaRadixCiphertextFFI *output, CudaRadixCiphertextFFI const *input,
void *const *streams, uint32_t const *gpu_indexes) {
CudaStreamsFFI streams) {
host_extend_radix_with_trivial_zero_blocks_msb<uint64_t>(
output, input, (cudaStream_t *)streams, gpu_indexes);
output, input, CudaStreams(streams));
}
void trim_radix_blocks_lsb_64(CudaRadixCiphertextFFI *output,
CudaRadixCiphertextFFI const *input,
void *const *streams,
uint32_t const *gpu_indexes) {
CudaStreamsFFI streams) {
host_trim_radix_blocks_lsb<uint64_t>(output, input, (cudaStream_t *)streams,
gpu_indexes);
host_trim_radix_blocks_lsb<uint64_t>(output, input, CudaStreams(streams));
}
uint64_t scratch_cuda_extend_radix_with_sign_msb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t num_additional_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks,
uint32_t num_additional_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
glwe_dimension * polynomial_size, lwe_dimension,
@@ -32,34 +30,31 @@ uint64_t scratch_cuda_extend_radix_with_sign_msb_64(
noise_reduction_type);
return scratch_extend_radix_with_sign_msb<uint64_t>(
(cudaStream_t *)streams, gpu_indexes, gpu_count,
CudaStreams(streams),
(int_extend_radix_with_sign_msb_buffer<uint64_t> **)mem_ptr, params,
num_blocks, num_additional_blocks, allocate_gpu_memory);
}
void cuda_extend_radix_with_sign_msb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *output, CudaRadixCiphertextFFI const *input,
int8_t *mem_ptr, uint32_t num_additional_blocks, void *const *bsks,
void *const *ksks,
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output,
CudaRadixCiphertextFFI const *input, int8_t *mem_ptr,
uint32_t num_additional_blocks, void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
PUSH_RANGE("cast")
host_extend_radix_with_sign_msb<uint64_t>(
(cudaStream_t *)streams, gpu_indexes, gpu_count, output, input,
CudaStreams(streams), output, input,
(int_extend_radix_with_sign_msb_buffer<uint64_t> *)mem_ptr,
num_additional_blocks, bsks, (uint64_t **)ksks, ms_noise_reduction_key);
POP_RANGE()
}
void cleanup_cuda_extend_radix_with_sign_msb_64(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
void cleanup_cuda_extend_radix_with_sign_msb_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
PUSH_RANGE("clean cast")
int_extend_radix_with_sign_msb_buffer<uint64_t> *mem_ptr =
(int_extend_radix_with_sign_msb_buffer<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
mem_ptr->release(CudaStreams(streams));
POP_RANGE()
delete mem_ptr;
*mem_ptr_void = nullptr;

View File

@@ -8,19 +8,18 @@
template <typename Torus>
__host__ void host_extend_radix_with_trivial_zero_blocks_msb(
CudaRadixCiphertextFFI *output, CudaRadixCiphertextFFI const *input,
cudaStream_t const *streams, uint32_t const *gpu_indexes) {
CudaStreams streams) {
PUSH_RANGE("extend only")
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], output,
0, input->num_radix_blocks, input, 0,
input->num_radix_blocks);
copy_radix_ciphertext_slice_async<Torus>(
streams.stream(0), streams.gpu_index(0), output, 0,
input->num_radix_blocks, input, 0, input->num_radix_blocks);
POP_RANGE()
}
template <typename Torus>
__host__ void host_trim_radix_blocks_lsb(CudaRadixCiphertextFFI *output,
CudaRadixCiphertextFFI const *input,
cudaStream_t const *streams,
uint32_t const *gpu_indexes) {
CudaStreams streams) {
const uint32_t input_start_lwe_index =
input->num_radix_blocks - output->num_radix_blocks;
@@ -31,30 +30,29 @@ __host__ void host_trim_radix_blocks_lsb(CudaRadixCiphertextFFI *output,
input->num_radix_blocks, output->num_radix_blocks);
copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], output, 0, output->num_radix_blocks, input,
input_start_lwe_index, input->num_radix_blocks);
streams.stream(0), streams.gpu_index(0), output, 0,
output->num_radix_blocks, input, input_start_lwe_index,
input->num_radix_blocks);
}
template <typename Torus>
__host__ uint64_t scratch_extend_radix_with_sign_msb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_extend_radix_with_sign_msb_buffer<Torus> **mem_ptr,
CudaStreams streams, int_extend_radix_with_sign_msb_buffer<Torus> **mem_ptr,
const int_radix_params params, uint32_t num_radix_blocks,
uint32_t num_additional_blocks, const bool allocate_gpu_memory) {
PUSH_RANGE("scratch cast/extend")
uint64_t size_tracker = 0;
*mem_ptr = new int_extend_radix_with_sign_msb_buffer<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
num_additional_blocks, allocate_gpu_memory, size_tracker);
streams, params, num_radix_blocks, num_additional_blocks,
allocate_gpu_memory, size_tracker);
POP_RANGE()
return size_tracker;
}
template <typename Torus>
__host__ void host_extend_radix_with_sign_msb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *output,
CudaStreams streams, CudaRadixCiphertextFFI *output,
CudaRadixCiphertextFFI const *input,
int_extend_radix_with_sign_msb_buffer<Torus> *mem_ptr,
uint32_t num_additional_blocks, void *const *bsks, Torus *const *ksks,
@@ -62,8 +60,8 @@ __host__ void host_extend_radix_with_sign_msb(
if (num_additional_blocks == 0) {
PUSH_RANGE("cast/extend no addblocks")
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], output,
input);
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
output, input);
POP_RANGE()
return;
}
@@ -72,24 +70,24 @@ __host__ void host_extend_radix_with_sign_msb(
PANIC_IF_FALSE(input_blocks > 0, "Cuda error: input blocks cannot be zero");
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], output,
0, input_blocks, input, 0,
input_blocks);
copy_radix_ciphertext_slice_async<Torus>(
streams.stream(0), streams.gpu_index(0), output, 0, input_blocks, input,
0, input_blocks);
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
mem_ptr->last_block, 0, 1, input,
copy_radix_ciphertext_slice_async<Torus>(
streams.stream(0), streams.gpu_index(0), mem_ptr->last_block, 0, 1, input,
input_blocks - 1, input_blocks);
host_apply_univariate_lut_kb(
streams, gpu_indexes, gpu_count, mem_ptr->padding_block,
mem_ptr->last_block, mem_ptr->lut, ksks, ms_noise_reduction_key, bsks);
host_apply_univariate_lut_kb(streams, mem_ptr->padding_block,
mem_ptr->last_block, mem_ptr->lut, ksks,
ms_noise_reduction_key, bsks);
for (uint32_t i = 0; i < num_additional_blocks; ++i) {
uint32_t dst_block_idx = input_blocks + i;
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], output,
dst_block_idx, dst_block_idx + 1,
mem_ptr->padding_block, 0, 1);
copy_radix_ciphertext_slice_async<Torus>(
streams.stream(0), streams.gpu_index(0), output, dst_block_idx,
dst_block_idx + 1, mem_ptr->padding_block, 0, 1);
}
POP_RANGE()
}

View File

@@ -1,13 +1,13 @@
#include "integer/cmux.cuh"
uint64_t scratch_cuda_integer_radix_cmux_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t lwe_ciphertext_count,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t lwe_ciphertext_count, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type) {
PUSH_RANGE("scratch cmux")
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
@@ -18,16 +18,14 @@ uint64_t scratch_cuda_integer_radix_cmux_kb_64(
[](uint64_t x) -> uint64_t { return x == 1; };
uint64_t ret = scratch_cuda_integer_radix_cmux_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_cmux_buffer<uint64_t> **)mem_ptr, predicate_lut_f,
lwe_ciphertext_count, params, allocate_gpu_memory);
CudaStreams(streams), (int_cmux_buffer<uint64_t> **)mem_ptr,
predicate_lut_f, lwe_ciphertext_count, params, allocate_gpu_memory);
POP_RANGE()
return ret;
}
void cuda_cmux_integer_radix_ciphertext_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *lwe_array_out,
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_condition,
CudaRadixCiphertextFFI const *lwe_array_true,
CudaRadixCiphertextFFI const *lwe_array_false, int8_t *mem_ptr,
@@ -35,21 +33,18 @@ void cuda_cmux_integer_radix_ciphertext_kb_64(
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
PUSH_RANGE("cmux")
host_integer_radix_cmux_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out,
lwe_condition, lwe_array_true, lwe_array_false,
(int_cmux_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key);
CudaStreams(streams), lwe_array_out, lwe_condition, lwe_array_true,
lwe_array_false, (int_cmux_buffer<uint64_t> *)mem_ptr, bsks,
(uint64_t **)(ksks), ms_noise_reduction_key);
POP_RANGE()
}
void cleanup_cuda_integer_radix_cmux(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
void cleanup_cuda_integer_radix_cmux(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
PUSH_RANGE("cleanup cmux")
int_cmux_buffer<uint64_t> *mem_ptr =
(int_cmux_buffer<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
*mem_ptr_void = nullptr;
POP_RANGE()

View File

@@ -6,8 +6,7 @@
template <typename Torus>
__host__ void
zero_out_if(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
zero_out_if(CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_input,
CudaRadixCiphertextFFI const *lwe_condition,
int_zero_out_if_buffer<Torus> *mem_ptr,
@@ -27,26 +26,25 @@ zero_out_if(cudaStream_t const *streams, uint32_t const *gpu_indexes,
"Cuda error: input and output radix ciphertexts must have the same "
"lwe dimension");
cuda_set_device(gpu_indexes[0]);
cuda_set_device(streams.gpu_index(0));
auto params = mem_ptr->params;
// We can't use integer_radix_apply_bivariate_lookup_table_kb since the
// second operand is not an array
auto tmp_lwe_array_input = mem_ptr->tmp;
host_pack_bivariate_blocks_with_single_block<Torus>(
streams, gpu_indexes, gpu_count, tmp_lwe_array_input,
predicate->lwe_indexes_in, lwe_array_input, lwe_condition,
predicate->lwe_indexes_in, params.message_modulus, num_radix_blocks);
streams, tmp_lwe_array_input, predicate->lwe_indexes_in, lwe_array_input,
lwe_condition, predicate->lwe_indexes_in, params.message_modulus,
num_radix_blocks);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, tmp_lwe_array_input, bsks,
ksks, ms_noise_reduction_key, predicate, num_radix_blocks);
streams, lwe_array_out, tmp_lwe_array_input, bsks, ksks,
ms_noise_reduction_key, predicate, num_radix_blocks);
}
template <typename Torus>
__host__ void host_integer_radix_cmux_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_condition,
CudaRadixCiphertextFFI const *lwe_array_true,
CudaRadixCiphertextFFI const *lwe_array_false,
@@ -62,18 +60,19 @@ __host__ void host_integer_radix_cmux_kb(
auto params = mem_ptr->params;
Torus lwe_size = params.big_lwe_dimension + 1;
copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], mem_ptr->buffer_in, 0, num_radix_blocks,
lwe_array_true, 0, num_radix_blocks);
streams.stream(0), streams.gpu_index(0), mem_ptr->buffer_in, 0,
num_radix_blocks, lwe_array_true, 0, num_radix_blocks);
copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], mem_ptr->buffer_in, num_radix_blocks,
2 * num_radix_blocks, lwe_array_false, 0, num_radix_blocks);
streams.stream(0), streams.gpu_index(0), mem_ptr->buffer_in,
num_radix_blocks, 2 * num_radix_blocks, lwe_array_false, 0,
num_radix_blocks);
for (uint i = 0; i < 2 * num_radix_blocks; i++) {
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
mem_ptr->condition_array, i, i + 1,
lwe_condition, 0, 1);
copy_radix_ciphertext_slice_async<Torus>(
streams.stream(0), streams.gpu_index(0), mem_ptr->condition_array, i,
i + 1, lwe_condition, 0, 1);
}
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, mem_ptr->buffer_out, mem_ptr->buffer_in,
streams, mem_ptr->buffer_out, mem_ptr->buffer_in,
mem_ptr->condition_array, bsks, ksks, ms_noise_reduction_key,
mem_ptr->predicate_lut, 2 * num_radix_blocks, params.message_modulus);
@@ -87,25 +86,24 @@ __host__ void host_integer_radix_cmux_kb(
as_radix_ciphertext_slice<Torus>(&mem_false, mem_ptr->buffer_out,
num_radix_blocks, 2 * num_radix_blocks);
host_addition<Torus>(streams[0], gpu_indexes[0], &mem_true, &mem_true,
&mem_false, num_radix_blocks, params.message_modulus,
params.carry_modulus);
host_addition<Torus>(streams.stream(0), streams.gpu_index(0), &mem_true,
&mem_true, &mem_false, num_radix_blocks,
params.message_modulus, params.carry_modulus);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, &mem_true, bsks, ksks,
ms_noise_reduction_key, mem_ptr->message_extract_lut, num_radix_blocks);
streams, lwe_array_out, &mem_true, bsks, ksks, ms_noise_reduction_key,
mem_ptr->message_extract_lut, num_radix_blocks);
}
template <typename Torus>
__host__ uint64_t scratch_cuda_integer_radix_cmux_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_cmux_buffer<Torus> **mem_ptr,
CudaStreams streams, int_cmux_buffer<Torus> **mem_ptr,
std::function<Torus(Torus)> predicate_lut_f, uint32_t num_radix_blocks,
int_radix_params params, bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_cmux_buffer<Torus>(
streams, gpu_indexes, gpu_count, predicate_lut_f, params,
num_radix_blocks, allocate_gpu_memory, size_tracker);
*mem_ptr = new int_cmux_buffer<Torus>(streams, predicate_lut_f, params,
num_radix_blocks, allocate_gpu_memory,
size_tracker);
return size_tracker;
}
#endif

View File

@@ -1,14 +1,13 @@
#include "integer/comparison.cuh"
uint64_t scratch_cuda_integer_radix_comparison_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_radix_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
COMPARISON_TYPE op_type, bool is_signed, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type) {
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, COMPARISON_TYPE op_type, bool is_signed,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
PUSH_RANGE("scratch comparison")
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
@@ -20,9 +19,8 @@ uint64_t scratch_cuda_integer_radix_comparison_kb_64(
case EQ:
case NE:
size_tracker += scratch_cuda_integer_radix_comparison_check_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_comparison_buffer<uint64_t> **)mem_ptr, num_radix_blocks, params,
op_type, false, allocate_gpu_memory);
CudaStreams(streams), (int_comparison_buffer<uint64_t> **)mem_ptr,
num_radix_blocks, params, op_type, false, allocate_gpu_memory);
break;
case GT:
case GE:
@@ -31,9 +29,8 @@ uint64_t scratch_cuda_integer_radix_comparison_kb_64(
case MAX:
case MIN:
size_tracker += scratch_cuda_integer_radix_comparison_check_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_comparison_buffer<uint64_t> **)mem_ptr, num_radix_blocks, params,
op_type, is_signed, allocate_gpu_memory);
CudaStreams(streams), (int_comparison_buffer<uint64_t> **)mem_ptr,
num_radix_blocks, params, op_type, is_signed, allocate_gpu_memory);
break;
}
POP_RANGE()
@@ -41,8 +38,7 @@ uint64_t scratch_cuda_integer_radix_comparison_kb_64(
}
void cuda_comparison_integer_radix_ciphertext_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *lwe_array_out,
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_1,
CudaRadixCiphertextFFI const *lwe_array_2, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
@@ -60,9 +56,8 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
case EQ:
case NE:
host_integer_radix_equality_check_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out,
lwe_array_1, lwe_array_2, buffer, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key, num_radix_blocks);
CudaStreams(streams), lwe_array_out, lwe_array_1, lwe_array_2, buffer,
bsks, (uint64_t **)(ksks), ms_noise_reduction_key, num_radix_blocks);
break;
case GT:
case GE:
@@ -72,18 +67,17 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
PANIC("Cuda error (comparisons): the number of radix blocks has to be "
"even.")
host_integer_radix_difference_check_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out,
lwe_array_1, lwe_array_2, buffer, buffer->diff_buffer->operator_f, bsks,
(uint64_t **)(ksks), ms_noise_reduction_key, num_radix_blocks);
CudaStreams(streams), lwe_array_out, lwe_array_1, lwe_array_2, buffer,
buffer->diff_buffer->operator_f, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key, num_radix_blocks);
break;
case MAX:
case MIN:
if (num_radix_blocks % 2 != 0)
PANIC("Cuda error (max/min): the number of radix blocks has to be even.")
host_integer_radix_maxmin_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out,
lwe_array_1, lwe_array_2, buffer, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key, num_radix_blocks);
CudaStreams(streams), lwe_array_out, lwe_array_1, lwe_array_2, buffer,
bsks, (uint64_t **)(ksks), ms_noise_reduction_key, num_radix_blocks);
break;
default:
PANIC("Cuda error: integer operation not supported")
@@ -91,27 +85,25 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
POP_RANGE()
}
void cleanup_cuda_integer_comparison(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
void cleanup_cuda_integer_comparison(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
PUSH_RANGE("cleanup comparison")
int_comparison_buffer<uint64_t> *mem_ptr =
(int_comparison_buffer<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
*mem_ptr_void = nullptr;
POP_RANGE()
}
uint64_t scratch_cuda_integer_are_all_comparisons_block_true_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_radix_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
@@ -119,14 +111,12 @@ uint64_t scratch_cuda_integer_are_all_comparisons_block_true_kb_64(
message_modulus, carry_modulus, noise_reduction_type);
return scratch_cuda_integer_radix_comparison_check_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_comparison_buffer<uint64_t> **)mem_ptr, num_radix_blocks, params, EQ,
false, allocate_gpu_memory);
CudaStreams(streams), (int_comparison_buffer<uint64_t> **)mem_ptr,
num_radix_blocks, params, EQ, false, allocate_gpu_memory);
}
void cuda_integer_are_all_comparisons_block_true_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *lwe_array_out,
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
@@ -136,30 +126,28 @@ void cuda_integer_are_all_comparisons_block_true_kb_64(
(int_comparison_buffer<uint64_t> *)mem_ptr;
host_integer_are_all_comparisons_block_true_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out,
lwe_array_in, buffer, bsks, (uint64_t **)(ksks), ms_noise_reduction_key,
num_radix_blocks);
CudaStreams(streams), lwe_array_out, lwe_array_in, buffer, bsks,
(uint64_t **)(ksks), ms_noise_reduction_key, num_radix_blocks);
}
void cleanup_cuda_integer_are_all_comparisons_block_true(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void) {
CudaStreamsFFI streams, int8_t **mem_ptr_void) {
int_comparison_buffer<uint64_t> *mem_ptr =
(int_comparison_buffer<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
*mem_ptr_void = nullptr;
}
uint64_t scratch_cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_radix_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
@@ -167,14 +155,12 @@ uint64_t scratch_cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
message_modulus, carry_modulus, noise_reduction_type);
return scratch_cuda_integer_radix_comparison_check_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_comparison_buffer<uint64_t> **)mem_ptr, num_radix_blocks, params, EQ,
false, allocate_gpu_memory);
CudaStreams(streams), (int_comparison_buffer<uint64_t> **)mem_ptr,
num_radix_blocks, params, EQ, false, allocate_gpu_memory);
}
void cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *lwe_array_out,
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
@@ -184,18 +170,16 @@ void cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
(int_comparison_buffer<uint64_t> *)mem_ptr;
host_integer_is_at_least_one_comparisons_block_true_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out,
lwe_array_in, buffer, bsks, (uint64_t **)(ksks), ms_noise_reduction_key,
num_radix_blocks);
CudaStreams(streams), lwe_array_out, lwe_array_in, buffer, bsks,
(uint64_t **)(ksks), ms_noise_reduction_key, num_radix_blocks);
}
void cleanup_cuda_integer_is_at_least_one_comparisons_block_true(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void) {
CudaStreamsFFI streams, int8_t **mem_ptr_void) {
int_comparison_buffer<uint64_t> *mem_ptr =
(int_comparison_buffer<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
*mem_ptr_void = nullptr;
}
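At the C API the same change appears as a by-value CudaStreamsFFI parameter that is wrapped into a CudaStreams exactly once, at the boundary; buffers are likewise released through the wrapper. A hedged sketch of that shape; the entry-point and host-callee names are invented, while the conversion and cast calls mirror the functions above:

void example_comparison_op_64(CudaStreamsFFI streams,
                              CudaRadixCiphertextFFI *lwe_array_out,
                              CudaRadixCiphertextFFI const *lwe_array_in,
                              int8_t *mem_ptr, void *const *bsks,
                              void *const *ksks, uint32_t num_radix_blocks) {
  auto buffer = (int_comparison_buffer<uint64_t> *)mem_ptr;
  // The FFI struct is only a carrier across the Rust/C boundary; the typed
  // host code below it only ever sees the CudaStreams wrapper.
  host_example_comparison_op<uint64_t>(CudaStreams(streams), lwe_array_out,
                                       lwe_array_in, buffer, bsks,
                                       (uint64_t **)(ksks), num_radix_blocks);
}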

View File

@@ -58,8 +58,7 @@ __host__ void accumulate_all_blocks(cudaStream_t stream, uint32_t gpu_index,
*/
template <typename Torus>
__host__ void are_all_comparisons_block_true(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks,
@@ -86,9 +85,9 @@ __host__ void are_all_comparisons_block_true(
uint32_t total_modulus = message_modulus * carry_modulus;
uint32_t max_value = (total_modulus - 1) / (message_modulus - 1);
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], tmp_out,
0, num_radix_blocks, lwe_array_in, 0,
num_radix_blocks);
copy_radix_ciphertext_slice_async<Torus>(
streams.stream(0), streams.gpu_index(0), tmp_out, 0, num_radix_blocks,
lwe_array_in, 0, num_radix_blocks);
uint32_t remaining_blocks = num_radix_blocks;
@@ -108,9 +107,9 @@ __host__ void are_all_comparisons_block_true(
uint32_t chunk_length =
std::min(max_value, begin_remaining_blocks - i * max_value);
chunk_lengths[i] = chunk_length;
accumulate_all_blocks<Torus>(streams[0], gpu_indexes[0], accumulator_ptr,
input_blocks, big_lwe_dimension,
chunk_length);
accumulate_all_blocks<Torus>(streams.stream(0), streams.gpu_index(0),
accumulator_ptr, input_blocks,
big_lwe_dimension, chunk_length);
accumulator_ptr += (big_lwe_dimension + 1);
remaining_blocks -= (chunk_length - 1);
@@ -131,8 +130,8 @@ __host__ void are_all_comparisons_block_true(
return x == chunk_length;
};
generate_device_accumulator_with_cpu_prealloc<Torus>(
streams[0], gpu_indexes[0], is_max_value_lut->get_lut(0, 1),
is_max_value_lut->get_degree(1),
streams.stream(0), streams.gpu_index(0),
is_max_value_lut->get_lut(0, 1), is_max_value_lut->get_degree(1),
is_max_value_lut->get_max_degree(1), glwe_dimension,
polynomial_size, message_modulus, carry_modulus,
is_equal_to_num_blocks_lut_f, true,
@@ -148,9 +147,9 @@ __host__ void are_all_comparisons_block_true(
}
cuda_memcpy_async_to_gpu(is_max_value_lut->get_lut_indexes(0, 0),
h_lut_indexes, num_chunks * sizeof(Torus),
streams[0], gpu_indexes[0]);
auto active_gpu_count = get_active_gpu_count(num_chunks, gpu_count);
is_max_value_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
streams.stream(0), streams.gpu_index(0));
auto active_streams = streams.active_gpu_subset(num_chunks);
is_max_value_lut->broadcast_lut(active_streams);
}
lut = is_max_value_lut;
}
@@ -159,8 +158,8 @@ __host__ void are_all_comparisons_block_true(
if (remaining_blocks == 1) {
// In the last iteration we copy the output to the final address
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, accumulator, bsks,
ksks, ms_noise_reduction_key, lut, 1);
streams, lwe_array_out, accumulator, bsks, ksks,
ms_noise_reduction_key, lut, 1);
// Reset max_value_lut_indexes before returning, otherwise if the lut is
// reused the lut indexes will be wrong
memset(is_max_value_lut->h_lut_indexes, 0,
@@ -168,17 +167,17 @@ __host__ void are_all_comparisons_block_true(
cuda_memcpy_async_to_gpu(is_max_value_lut->get_lut_indexes(0, 0),
is_max_value_lut->h_lut_indexes,
is_max_value_lut->num_blocks * sizeof(Torus),
streams[0], gpu_indexes[0]);
streams.stream(0), streams.gpu_index(0));
auto active_gpu_count_is_max =
get_active_gpu_count(is_max_value_lut->num_blocks, gpu_count);
is_max_value_lut->broadcast_lut(streams, gpu_indexes,
active_gpu_count_is_max, false);
streams.active_gpu_subset(is_max_value_lut->num_blocks);
is_max_value_lut->broadcast_lut(active_gpu_count_is_max, false);
reset_radix_ciphertext_blocks(lwe_array_out, 1);
return;
} else {
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, tmp_out, accumulator, bsks, ksks,
ms_noise_reduction_key, lut, num_chunks);
streams, tmp_out, accumulator, bsks, ksks, ms_noise_reduction_key,
lut, num_chunks);
}
}
}
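For intuition on the chunk size used above: max_value is the number of boolean blocks that can be summed into one ciphertext before the carry space overflows. A worked instance under 2_2-style parameters (message_modulus = 4, carry_modulus = 4; an assumption for the sake of the example, not something this diff fixes):

// total_modulus = message_modulus * carry_modulus = 16
// max_value = (total_modulus - 1) / (message_modulus - 1) = 15 / 3 = 5
// so each accumulate_all_blocks call may fold up to 5 comparison blocks
// (each encrypting 0 or 1) into one block without losing information.
constexpr uint32_t message_modulus = 4, carry_modulus = 4;
constexpr uint32_t total_modulus = message_modulus * carry_modulus;
constexpr uint32_t max_value = (total_modulus - 1) / (message_modulus - 1);
static_assert(max_value == 5, "5 blocks per chunk under 2_2 parameters");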
@@ -191,8 +190,7 @@ __host__ void are_all_comparisons_block_true(
*/
template <typename Torus>
__host__ void is_at_least_one_comparisons_block_true(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks,
@@ -216,7 +214,7 @@ __host__ void is_at_least_one_comparisons_block_true(
uint32_t max_value = (total_modulus - 1) / (message_modulus - 1);
copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], mem_ptr->tmp_lwe_array_out, 0,
streams.stream(0), streams.gpu_index(0), mem_ptr->tmp_lwe_array_out, 0,
num_radix_blocks, lwe_array_in, 0, num_radix_blocks);
uint32_t remaining_blocks = num_radix_blocks;
@@ -234,8 +232,8 @@ __host__ void is_at_least_one_comparisons_block_true(
uint32_t chunk_length =
std::min(max_value, begin_remaining_blocks - i * max_value);
chunk_lengths[i] = chunk_length;
accumulate_all_blocks<Torus>(streams[0], gpu_indexes[0], accumulator,
input_blocks, big_lwe_dimension,
accumulate_all_blocks<Torus>(streams.stream(0), streams.gpu_index(0),
accumulator, input_blocks, big_lwe_dimension,
chunk_length);
accumulator += (big_lwe_dimension + 1);
@@ -250,23 +248,20 @@ __host__ void is_at_least_one_comparisons_block_true(
if (remaining_blocks == 1) {
// In the last iteration we copy the output to the final address
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out,
buffer->tmp_block_accumulated, bsks, ksks, ms_noise_reduction_key,
lut, 1);
streams, lwe_array_out, buffer->tmp_block_accumulated, bsks, ksks,
ms_noise_reduction_key, lut, 1);
return;
} else {
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, mem_ptr->tmp_lwe_array_out,
buffer->tmp_block_accumulated, bsks, ksks, ms_noise_reduction_key,
lut, num_chunks);
streams, mem_ptr->tmp_lwe_array_out, buffer->tmp_block_accumulated,
bsks, ksks, ms_noise_reduction_key, lut, num_chunks);
}
}
}
template <typename Torus>
__host__ void host_compare_blocks_with_zero(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks,
@@ -303,8 +298,8 @@ __host__ void host_compare_blocks_with_zero(
if (num_radix_blocks == 1) {
// Just copy
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], sum, 0,
1, lwe_array_in, 0, 1);
copy_radix_ciphertext_slice_async<Torus>(
streams.stream(0), streams.gpu_index(0), sum, 0, 1, lwe_array_in, 0, 1);
num_sum_blocks = 1;
} else {
uint32_t remainder_blocks = num_radix_blocks;
@@ -314,8 +309,8 @@ __host__ void host_compare_blocks_with_zero(
uint32_t chunk_size =
std::min(remainder_blocks, num_elements_to_fill_carry);
accumulate_all_blocks<Torus>(streams[0], gpu_indexes[0], sum_i, chunk,
big_lwe_dimension, chunk_size);
accumulate_all_blocks<Torus>(streams.stream(0), streams.gpu_index(0),
sum_i, chunk, big_lwe_dimension, chunk_size);
num_sum_blocks++;
remainder_blocks -= (chunk_size - 1);
@@ -327,16 +322,15 @@ __host__ void host_compare_blocks_with_zero(
}
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, sum, bsks, ksks,
ms_noise_reduction_key, zero_comparison, num_sum_blocks);
streams, lwe_array_out, sum, bsks, ksks, ms_noise_reduction_key,
zero_comparison, num_sum_blocks);
reset_radix_ciphertext_blocks(lwe_array_out, num_sum_blocks);
}
template <typename Torus>
__host__ void host_integer_radix_equality_check_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_1,
CudaRadixCiphertextFFI const *lwe_array_2,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
@@ -352,23 +346,22 @@ __host__ void host_integer_radix_equality_check_kb(
// Applies the LUT for the comparison operation
auto comparisons = mem_ptr->tmp_block_comparisons;
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, comparisons, lwe_array_1, lwe_array_2,
bsks, ksks, ms_noise_reduction_key, eq_buffer->operator_lut,
num_radix_blocks, eq_buffer->operator_lut->params.message_modulus);
streams, comparisons, lwe_array_1, lwe_array_2, bsks, ksks,
ms_noise_reduction_key, eq_buffer->operator_lut, num_radix_blocks,
eq_buffer->operator_lut->params.message_modulus);
// This takes a Vec of blocks, where each block is either 0 or 1.
//
// It returns a block encrypting 1 if all input blocks are 1
// otherwise the block encrypts 0
are_all_comparisons_block_true<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, comparisons, mem_ptr,
bsks, ksks, ms_noise_reduction_key, num_radix_blocks);
streams, lwe_array_out, comparisons, mem_ptr, bsks, ksks,
ms_noise_reduction_key, num_radix_blocks);
}
template <typename Torus>
__host__ void compare_radix_blocks_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_left,
CudaRadixCiphertextFFI const *lwe_array_right,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
@@ -400,22 +393,21 @@ __host__ void compare_radix_blocks_kb(
// Subtract
host_subtraction<Torus>(
streams[0], gpu_indexes[0], (Torus *)lwe_array_out->ptr,
streams.stream(0), streams.gpu_index(0), (Torus *)lwe_array_out->ptr,
(Torus *)lwe_array_left->ptr, (Torus *)lwe_array_right->ptr,
big_lwe_dimension, num_radix_blocks);
// Apply LUT to compare to 0
auto is_non_zero_lut = mem_ptr->eq_buffer->is_non_zero_lut;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_out, bsks, ksks,
ms_noise_reduction_key, is_non_zero_lut, num_radix_blocks);
streams, lwe_array_out, lwe_array_out, bsks, ksks, ms_noise_reduction_key,
is_non_zero_lut, num_radix_blocks);
// Add one
// Here Lhs can have the following values: (-1) % (message modulus * carry
// modulus), 0, 1. So the output values after the addition will be: 0, 1, 2
host_integer_radix_add_scalar_one_inplace<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, message_modulus,
carry_modulus);
streams, lwe_array_out, message_modulus, carry_modulus);
}
// Reduces a vec containing shortint blocks that encrypts a sign
@@ -423,8 +415,7 @@ __host__ void compare_radix_blocks_kb(
// final sign
template <typename Torus>
__host__ void tree_sign_reduction(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI *lwe_block_comparisons,
int_tree_sign_reduction_buffer<Torus> *tree_buffer,
std::function<Torus(Torus)> sign_handler_f, void *const *bsks,
@@ -453,26 +444,26 @@ __host__ void tree_sign_reduction(
auto y = tree_buffer->tmp_y;
if (x != lwe_block_comparisons)
copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], x, 0, num_radix_blocks,
streams.stream(0), streams.gpu_index(0), x, 0, num_radix_blocks,
lwe_block_comparisons, 0, num_radix_blocks);
uint32_t partial_block_count = num_radix_blocks;
auto inner_tree_leaf = tree_buffer->tree_inner_leaf_lut;
while (partial_block_count > 2) {
pack_blocks<Torus>(streams[0], gpu_indexes[0], y, x, partial_block_count,
message_modulus);
pack_blocks<Torus>(streams.stream(0), streams.gpu_index(0), y, x,
partial_block_count, message_modulus);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, x, y, bsks, ksks,
ms_noise_reduction_key, inner_tree_leaf, partial_block_count >> 1);
streams, x, y, bsks, ksks, ms_noise_reduction_key, inner_tree_leaf,
partial_block_count >> 1);
if ((partial_block_count % 2) != 0) {
partial_block_count >>= 1;
partial_block_count++;
copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], x, partial_block_count - 1,
streams.stream(0), streams.gpu_index(0), x, partial_block_count - 1,
partial_block_count, y, partial_block_count - 1, partial_block_count);
} else {
partial_block_count >>= 1;
@@ -484,8 +475,8 @@ __host__ void tree_sign_reduction(
std::function<Torus(Torus)> f;
auto num_bits_in_message = log2_int(params.message_modulus);
if (partial_block_count == 2) {
pack_blocks<Torus>(streams[0], gpu_indexes[0], y, x, partial_block_count,
message_modulus);
pack_blocks<Torus>(streams.stream(0), streams.gpu_index(0), y, x,
partial_block_count, message_modulus);
f = [block_selector_f, sign_handler_f, num_bits_in_message,
message_modulus](Torus x) -> Torus {
@@ -501,24 +492,23 @@ __host__ void tree_sign_reduction(
f = sign_handler_f;
}
generate_device_accumulator_with_cpu_prealloc<Torus>(
streams[0], gpu_indexes[0], last_lut->get_lut(0, 0),
streams.stream(0), streams.gpu_index(0), last_lut->get_lut(0, 0),
last_lut->get_degree(0), last_lut->get_max_degree(0), glwe_dimension,
polynomial_size, message_modulus, carry_modulus, f, true,
tree_buffer->preallocated_h_lut);
auto active_gpu_count = get_active_gpu_count(1, gpu_count);
last_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
auto active_streams = streams.active_gpu_subset(1);
last_lut->broadcast_lut(active_streams);
// Last leaf
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, y, bsks, ksks,
ms_noise_reduction_key, last_lut, 1);
streams, lwe_array_out, y, bsks, ksks, ms_noise_reduction_key, last_lut,
1);
}
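The loop above shrinks the vector of sign blocks roughly by half each round: blocks are packed in pairs, one PBS maps each packed pair back to a single sign block, and an odd leftover block is carried over unchanged. A small sketch of just that count arithmetic; next_partial_block_count is a hypothetical helper that mirrors the two branches in the loop:

// e.g. 5 -> 3 -> 2 sign blocks, at which point the final LUT takes over.
inline uint32_t next_partial_block_count(uint32_t partial_block_count) {
  uint32_t next = partial_block_count >> 1;
  if ((partial_block_count % 2) != 0)
    next += 1; // the odd leftover block is copied through as-is
  return next;
}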
template <typename Torus>
__host__ void host_integer_radix_difference_check_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_left,
CudaRadixCiphertextFFI const *lwe_array_right,
int_comparison_buffer<Torus> *mem_ptr,
@@ -552,19 +542,20 @@ __host__ void host_integer_radix_difference_check_kb(
if (mem_ptr->is_signed) {
packed_num_radix_blocks -= 2;
}
pack_blocks<Torus>(streams[0], gpu_indexes[0], &lhs, lwe_array_left,
packed_num_radix_blocks, message_modulus);
pack_blocks<Torus>(streams[0], gpu_indexes[0], &rhs, lwe_array_right,
packed_num_radix_blocks, message_modulus);
pack_blocks<Torus>(streams.stream(0), streams.gpu_index(0), &lhs,
lwe_array_left, packed_num_radix_blocks,
message_modulus);
pack_blocks<Torus>(streams.stream(0), streams.gpu_index(0), &rhs,
lwe_array_right, packed_num_radix_blocks,
message_modulus);
// From this point we have half the number of blocks
packed_num_radix_blocks /= 2;
// Clean noise
auto identity_lut = mem_ptr->identity_lut;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, diff_buffer->tmp_packed,
diff_buffer->tmp_packed, bsks, ksks, ms_noise_reduction_key,
identity_lut, 2 * packed_num_radix_blocks);
streams, diff_buffer->tmp_packed, diff_buffer->tmp_packed, bsks, ksks,
ms_noise_reduction_key, identity_lut, 2 * packed_num_radix_blocks);
} else {
as_radix_ciphertext_slice<Torus>(&lhs, lwe_array_left, 0,
lwe_array_left->num_radix_blocks);
@@ -581,17 +572,17 @@ __host__ void host_integer_radix_difference_check_kb(
if (!mem_ptr->is_signed) {
// Compare packed blocks, or simply the total number of radix blocks in the
// inputs
compare_radix_blocks_kb<Torus>(
streams, gpu_indexes, gpu_count, comparisons, &lhs, &rhs, mem_ptr, bsks,
ksks, ms_noise_reduction_key, packed_num_radix_blocks);
compare_radix_blocks_kb<Torus>(streams, comparisons, &lhs, &rhs, mem_ptr,
bsks, ksks, ms_noise_reduction_key,
packed_num_radix_blocks);
num_comparisons = packed_num_radix_blocks;
} else {
// Packing is possible
if (carry_modulus >= message_modulus) {
// Compare (num_radix_blocks - 2) / 2 packed blocks
compare_radix_blocks_kb<Torus>(
streams, gpu_indexes, gpu_count, comparisons, &lhs, &rhs, mem_ptr,
bsks, ksks, ms_noise_reduction_key, packed_num_radix_blocks);
compare_radix_blocks_kb<Torus>(streams, comparisons, &lhs, &rhs, mem_ptr,
bsks, ksks, ms_noise_reduction_key,
packed_num_radix_blocks);
// Compare the last block before the sign block separately
auto identity_lut = mem_ptr->identity_lut;
@@ -604,9 +595,8 @@ __host__ void host_integer_radix_difference_check_kb(
num_radix_blocks - 2,
num_radix_blocks - 1);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, &last_left_block_before_sign_block,
&shifted_lwe_array_left, bsks, ksks, ms_noise_reduction_key,
identity_lut, 1);
streams, &last_left_block_before_sign_block, &shifted_lwe_array_left,
bsks, ksks, ms_noise_reduction_key, identity_lut, 1);
CudaRadixCiphertextFFI last_right_block_before_sign_block;
as_radix_ciphertext_slice<Torus>(
@@ -618,7 +608,7 @@ __host__ void host_integer_radix_difference_check_kb(
lwe_array_right, num_radix_blocks - 2,
num_radix_blocks - 1);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, &last_right_block_before_sign_block,
streams, &last_right_block_before_sign_block,
&shifted_lwe_array_right, bsks, ksks, ms_noise_reduction_key,
identity_lut, 1);
@@ -627,8 +617,7 @@ __host__ void host_integer_radix_difference_check_kb(
packed_num_radix_blocks,
packed_num_radix_blocks + 1);
compare_radix_blocks_kb<Torus>(
streams, gpu_indexes, gpu_count, &shifted_comparisons,
&last_left_block_before_sign_block,
streams, &shifted_comparisons, &last_left_block_before_sign_block,
&last_right_block_before_sign_block, mem_ptr, bsks, ksks,
ms_noise_reduction_key, 1);
@@ -643,17 +632,15 @@ __host__ void host_integer_radix_difference_check_kb(
as_radix_ciphertext_slice<Torus>(&last_right_block, lwe_array_right,
num_radix_blocks - 1, num_radix_blocks);
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, &shifted_comparisons,
&last_left_block, &last_right_block, bsks, ksks,
ms_noise_reduction_key, mem_ptr->signed_lut, 1,
streams, &shifted_comparisons, &last_left_block, &last_right_block,
bsks, ksks, ms_noise_reduction_key, mem_ptr->signed_lut, 1,
mem_ptr->signed_lut->params.message_modulus);
num_comparisons = packed_num_radix_blocks + 2;
} else {
compare_radix_blocks_kb<Torus>(
streams, gpu_indexes, gpu_count, comparisons, lwe_array_left,
lwe_array_right, mem_ptr, bsks, ksks, ms_noise_reduction_key,
num_radix_blocks - 1);
streams, comparisons, lwe_array_left, lwe_array_right, mem_ptr, bsks,
ksks, ms_noise_reduction_key, num_radix_blocks - 1);
// Compare the sign block separately
CudaRadixCiphertextFFI shifted_comparisons;
as_radix_ciphertext_slice<Torus>(&shifted_comparisons, comparisons,
@@ -665,9 +652,8 @@ __host__ void host_integer_radix_difference_check_kb(
as_radix_ciphertext_slice<Torus>(&last_right_block, lwe_array_right,
num_radix_blocks - 1, num_radix_blocks);
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, &shifted_comparisons,
&last_left_block, &last_right_block, bsks, ksks,
ms_noise_reduction_key, mem_ptr->signed_lut, 1,
streams, &shifted_comparisons, &last_left_block, &last_right_block,
bsks, ksks, ms_noise_reduction_key, mem_ptr->signed_lut, 1,
mem_ptr->signed_lut->params.message_modulus);
num_comparisons = num_radix_blocks;
}
@@ -676,30 +662,27 @@ __host__ void host_integer_radix_difference_check_kb(
// Reduces a vec containing radix blocks that encrypts a sign
// (inferior, equal, superior) to one single radix block containing the
// final sign
tree_sign_reduction<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
comparisons, mem_ptr->diff_buffer->tree_buffer,
reduction_lut_f, bsks, ksks,
ms_noise_reduction_key, num_comparisons);
tree_sign_reduction<Torus>(
streams, lwe_array_out, comparisons, mem_ptr->diff_buffer->tree_buffer,
reduction_lut_f, bsks, ksks, ms_noise_reduction_key, num_comparisons);
}
template <typename Torus>
__host__ uint64_t scratch_cuda_integer_radix_comparison_check_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_comparison_buffer<Torus> **mem_ptr,
CudaStreams streams, int_comparison_buffer<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params, COMPARISON_TYPE op,
bool is_signed, bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_comparison_buffer<Torus>(
streams, gpu_indexes, gpu_count, op, params, num_radix_blocks, is_signed,
allocate_gpu_memory, size_tracker);
streams, op, params, num_radix_blocks, is_signed, allocate_gpu_memory,
size_tracker);
return size_tracker;
}
template <typename Torus>
__host__ void host_integer_radix_maxmin_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_left,
CudaRadixCiphertextFFI const *lwe_array_right,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
@@ -718,21 +701,20 @@ __host__ void host_integer_radix_maxmin_kb(
// Compute the sign
host_integer_radix_difference_check_kb<Torus>(
streams, gpu_indexes, gpu_count, mem_ptr->tmp_lwe_array_out,
lwe_array_left, lwe_array_right, mem_ptr, mem_ptr->identity_lut_f, bsks,
ksks, ms_noise_reduction_key, num_radix_blocks);
streams, mem_ptr->tmp_lwe_array_out, lwe_array_left, lwe_array_right,
mem_ptr, mem_ptr->identity_lut_f, bsks, ksks, ms_noise_reduction_key,
num_radix_blocks);
// Selector
host_integer_radix_cmux_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out,
mem_ptr->tmp_lwe_array_out, lwe_array_left, lwe_array_right,
mem_ptr->cmux_buffer, bsks, ksks, ms_noise_reduction_key);
host_integer_radix_cmux_kb<Torus>(streams, lwe_array_out,
mem_ptr->tmp_lwe_array_out, lwe_array_left,
lwe_array_right, mem_ptr->cmux_buffer, bsks,
ksks, ms_noise_reduction_key);
}
template <typename Torus>
__host__ void host_integer_are_all_comparisons_block_true_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks,
@@ -742,14 +724,13 @@ __host__ void host_integer_are_all_comparisons_block_true_kb(
// It returns a block encrypting 1 if all input blocks are 1
// otherwise the block encrypts 0
are_all_comparisons_block_true<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, mem_ptr,
bsks, ksks, ms_noise_reduction_key, num_radix_blocks);
streams, lwe_array_out, lwe_array_in, mem_ptr, bsks, ksks,
ms_noise_reduction_key, num_radix_blocks);
}
template <typename Torus>
__host__ void host_integer_is_at_least_one_comparisons_block_true_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks,
@@ -759,7 +740,7 @@ __host__ void host_integer_is_at_least_one_comparisons_block_true_kb(
// It returns a block encrypting 1 if at least one input block is 1,
// otherwise the block encrypts 0
is_at_least_one_comparisons_block_true<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, mem_ptr,
bsks, ksks, ms_noise_reduction_key, num_radix_blocks);
streams, lwe_array_out, lwe_array_in, mem_ptr, bsks, ksks,
ms_noise_reduction_key, num_radix_blocks);
}
#endif
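The other recurring rewrite in this file is around LUT broadcasting: instead of computing an active GPU count and threading it alongside the raw stream arrays, the caller asks the stream set for the subset the block count actually needs and broadcasts over that subset. A hedged sketch with the LUT type left generic; only active_gpu_subset() and broadcast_lut(active_streams) are taken from the hunks above:

template <typename Lut>
void refresh_and_broadcast(CudaStreams streams, Lut *lut,
                           uint32_t num_blocks) {
  // Before: auto active_gpu_count = get_active_gpu_count(num_blocks, gpu_count);
  //         lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
  auto active_streams = streams.active_gpu_subset(num_blocks);
  lut->broadcast_lut(active_streams);
}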

View File

@@ -1,12 +1,11 @@
#include "compression.cuh"
uint64_t scratch_cuda_integer_compress_radix_ciphertext_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t compression_glwe_dimension,
uint32_t compression_polynomial_size, uint32_t lwe_dimension,
uint32_t ks_level, uint32_t ks_base_log, uint32_t num_radix_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
uint32_t lwe_per_glwe, bool allocate_gpu_memory) {
CudaStreamsFFI streams, int8_t **mem_ptr,
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, uint32_t lwe_per_glwe, bool allocate_gpu_memory) {
int_radix_params compression_params(
pbs_type, compression_glwe_dimension, compression_polynomial_size,
@@ -15,16 +14,14 @@ uint64_t scratch_cuda_integer_compress_radix_ciphertext_64(
carry_modulus, PBS_MS_REDUCTION_T::NO_REDUCTION);
return scratch_cuda_compress_integer_radix_ciphertext<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_compression<uint64_t> **)mem_ptr, num_radix_blocks,
compression_params, lwe_per_glwe, allocate_gpu_memory);
CudaStreams(streams), (int_compression<uint64_t> **)mem_ptr,
num_radix_blocks, compression_params, lwe_per_glwe, allocate_gpu_memory);
}
uint64_t scratch_cuda_integer_decompress_radix_ciphertext_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t encryption_glwe_dimension,
uint32_t encryption_polynomial_size, uint32_t compression_glwe_dimension,
uint32_t compression_polynomial_size, uint32_t lwe_dimension,
uint32_t pbs_level, uint32_t pbs_base_log,
CudaStreamsFFI streams, int8_t **mem_ptr,
uint32_t encryption_glwe_dimension, uint32_t encryption_polynomial_size,
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
uint32_t lwe_dimension, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t num_blocks_to_decompress, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type) {
@@ -42,59 +39,53 @@ uint64_t scratch_cuda_integer_decompress_radix_ciphertext_64(
noise_reduction_type);
return scratch_cuda_integer_decompress_radix_ciphertext<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_decompression<uint64_t> **)mem_ptr, num_blocks_to_decompress,
encryption_params, compression_params, allocate_gpu_memory);
CudaStreams(streams), (int_decompression<uint64_t> **)mem_ptr,
num_blocks_to_decompress, encryption_params, compression_params,
allocate_gpu_memory);
}
void cuda_integer_compress_radix_ciphertext_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaPackedGlweCiphertextListFFI *glwe_array_out,
CudaStreamsFFI streams, CudaPackedGlweCiphertextListFFI *glwe_array_out,
CudaLweCiphertextListFFI const *lwe_array_in, void *const *fp_ksk,
int8_t *mem_ptr) {
host_integer_compress<uint64_t>((cudaStream_t *)(streams), gpu_indexes,
gpu_count, glwe_array_out, lwe_array_in,
(uint64_t *const *)(fp_ksk),
host_integer_compress<uint64_t>(CudaStreams(streams), glwe_array_out,
lwe_array_in, (uint64_t *const *)(fp_ksk),
(int_compression<uint64_t> *)mem_ptr);
}
void cuda_integer_decompress_radix_ciphertext_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaLweCiphertextListFFI *lwe_array_out,
CudaStreamsFFI streams, CudaLweCiphertextListFFI *lwe_array_out,
CudaPackedGlweCiphertextListFFI const *glwe_in,
uint32_t const *indexes_array, void *const *bsks, int8_t *mem_ptr) {
host_integer_decompress<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out, glwe_in,
indexes_array, bsks, (int_decompression<uint64_t> *)mem_ptr);
host_integer_decompress<uint64_t>(CudaStreams(streams), lwe_array_out,
glwe_in, indexes_array, bsks,
(int_decompression<uint64_t> *)mem_ptr);
}
void cleanup_cuda_integer_compress_radix_ciphertext_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void cleanup_cuda_integer_compress_radix_ciphertext_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
int_compression<uint64_t> *mem_ptr =
(int_compression<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
*mem_ptr_void = nullptr;
}
void cleanup_cuda_integer_decompress_radix_ciphertext_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void) {
CudaStreamsFFI streams, int8_t **mem_ptr_void) {
int_decompression<uint64_t> *mem_ptr =
(int_decompression<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
*mem_ptr_void = nullptr;
}
uint64_t scratch_cuda_integer_compress_radix_ciphertext_128(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t compression_glwe_dimension,
uint32_t compression_polynomial_size, uint32_t lwe_dimension,
uint32_t ks_level, uint32_t ks_base_log, uint32_t num_radix_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
uint32_t lwe_per_glwe, bool allocate_gpu_memory) {
CudaStreamsFFI streams, int8_t **mem_ptr,
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, uint32_t lwe_per_glwe, bool allocate_gpu_memory) {
int_radix_params compression_params(
pbs_type, compression_glwe_dimension, compression_polynomial_size,
@@ -103,16 +94,14 @@ uint64_t scratch_cuda_integer_compress_radix_ciphertext_128(
carry_modulus, PBS_MS_REDUCTION_T::NO_REDUCTION);
return scratch_cuda_compress_integer_radix_ciphertext<__uint128_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_compression<__uint128_t> **)mem_ptr, num_radix_blocks,
compression_params, lwe_per_glwe, allocate_gpu_memory);
CudaStreams(streams), (int_compression<__uint128_t> **)mem_ptr,
num_radix_blocks, compression_params, lwe_per_glwe, allocate_gpu_memory);
}
uint64_t scratch_cuda_integer_decompress_radix_ciphertext_128(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t compression_glwe_dimension,
uint32_t compression_polynomial_size, uint32_t lwe_dimension,
uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
bool allocate_gpu_memory) {
CudaStreamsFFI streams, int8_t **mem_ptr,
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
uint32_t lwe_dimension, uint32_t num_radix_blocks, uint32_t message_modulus,
uint32_t carry_modulus, bool allocate_gpu_memory) {
// 128-bit decompression doesn't run PBSs, so we don't need encryption_params
int_radix_params compression_params(
@@ -123,50 +112,45 @@ uint64_t scratch_cuda_integer_decompress_radix_ciphertext_128(
PBS_MS_REDUCTION_T::NO_REDUCTION);
return scratch_cuda_integer_decompress_radix_ciphertext<__uint128_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_decompression<__uint128_t> **)mem_ptr, num_radix_blocks,
compression_params, compression_params, allocate_gpu_memory);
CudaStreams(streams), (int_decompression<__uint128_t> **)mem_ptr,
num_radix_blocks, compression_params, compression_params,
allocate_gpu_memory);
}
void cuda_integer_compress_radix_ciphertext_128(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaPackedGlweCiphertextListFFI *glwe_array_out,
CudaStreamsFFI streams, CudaPackedGlweCiphertextListFFI *glwe_array_out,
CudaLweCiphertextListFFI const *lwe_array_in, void *const *fp_ksk,
int8_t *mem_ptr) {
host_integer_compress<__uint128_t>((cudaStream_t *)(streams), gpu_indexes,
gpu_count, glwe_array_out, lwe_array_in,
(__uint128_t *const *)(fp_ksk),
(int_compression<__uint128_t> *)mem_ptr);
host_integer_compress<__uint128_t>(
CudaStreams(streams), glwe_array_out, lwe_array_in,
(__uint128_t *const *)(fp_ksk), (int_compression<__uint128_t> *)mem_ptr);
}
void cuda_integer_decompress_radix_ciphertext_128(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaLweCiphertextListFFI *lwe_array_out,
CudaStreamsFFI streams, CudaLweCiphertextListFFI *lwe_array_out,
CudaPackedGlweCiphertextListFFI const *glwe_in,
uint32_t const *indexes_array, int8_t *mem_ptr) {
host_integer_decompress<__uint128_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out, glwe_in,
indexes_array, nullptr, (int_decompression<__uint128_t> *)mem_ptr);
CudaStreams(streams), lwe_array_out, glwe_in, indexes_array, nullptr,
(int_decompression<__uint128_t> *)mem_ptr);
}
void cleanup_cuda_integer_compress_radix_ciphertext_128(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void cleanup_cuda_integer_compress_radix_ciphertext_128(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
int_compression<__uint128_t> *mem_ptr =
(int_compression<__uint128_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
*mem_ptr_void = nullptr;
}
void cleanup_cuda_integer_decompress_radix_ciphertext_128(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void) {
CudaStreamsFFI streams, int8_t **mem_ptr_void) {
int_decompression<__uint128_t> *mem_ptr =
(int_decompression<__uint128_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
*mem_ptr_void = nullptr;

View File

@@ -80,8 +80,7 @@ __host__ void host_pack(cudaStream_t stream, uint32_t gpu_index,
template <typename Torus>
__host__ void
host_integer_compress(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count,
host_integer_compress(CudaStreams streams,
CudaPackedGlweCiphertextListFFI *glwe_array_out,
CudaLweCiphertextListFFI const *lwe_array_in,
Torus *const *fp_ksk, int_compression<Torus> *mem_ptr) {
@@ -98,7 +97,7 @@ host_integer_compress(cudaStream_t const *streams, uint32_t const *gpu_indexes,
if constexpr (std::is_same_v<Torus, uint64_t>) {
lwe_pksk_input = mem_ptr->tmp_lwe;
host_cleartext_multiplication<Torus>(
streams[0], gpu_indexes[0], lwe_pksk_input, lwe_array_in,
streams.stream(0), streams.gpu_index(0), lwe_pksk_input, lwe_array_in,
(uint64_t)compression_params.message_modulus);
}
@@ -115,7 +114,7 @@ host_integer_compress(cudaStream_t const *streams, uint32_t const *gpu_indexes,
cuda_memset_async(tmp_glwe_array_out, 0,
num_glwes * (compression_params.glwe_dimension + 1) *
compression_params.polynomial_size * sizeof(Torus),
streams[0], gpu_indexes[0]);
streams.stream(0), streams.gpu_index(0));
auto fp_ks_buffer = mem_ptr->fp_ks_buffer;
auto rem_lwes = glwe_array_out->total_lwe_bodies_count;
@@ -125,8 +124,8 @@ host_integer_compress(cudaStream_t const *streams, uint32_t const *gpu_indexes,
auto chunk_size = min(rem_lwes, glwe_array_out->lwe_per_glwe);
host_packing_keyswitch_lwe_list_to_glwe<Torus>(
streams[0], gpu_indexes[0], glwe_out, lwe_pksk_input, fp_ksk[0],
fp_ks_buffer, compression_params.small_lwe_dimension,
streams.stream(0), streams.gpu_index(0), glwe_out, lwe_pksk_input,
fp_ksk[0], fp_ks_buffer, compression_params.small_lwe_dimension,
compression_params.glwe_dimension, compression_params.polynomial_size,
compression_params.ks_base_log, compression_params.ks_level,
chunk_size);
@@ -141,11 +140,11 @@ host_integer_compress(cudaStream_t const *streams, uint32_t const *gpu_indexes,
compression_params.polynomial_size +
glwe_array_out->total_lwe_bodies_count;
host_modulus_switch_inplace<Torus>(streams[0], gpu_indexes[0],
host_modulus_switch_inplace<Torus>(streams.stream(0), streams.gpu_index(0),
tmp_glwe_array_out, size,
glwe_array_out->storage_log_modulus);
host_pack<Torus>(streams[0], gpu_indexes[0], glwe_array_out,
host_pack<Torus>(streams.stream(0), streams.gpu_index(0), glwe_array_out,
tmp_glwe_array_out, num_glwes, compression_params);
}
@@ -247,8 +246,7 @@ __host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,
template <typename Torus>
__host__ void
host_integer_decompress(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
host_integer_decompress(CudaStreams streams,
CudaLweCiphertextListFFI *d_lwe_array_out,
CudaPackedGlweCiphertextListFFI const *d_packed_glwe_in,
uint32_t const *h_indexes_array, void *const *d_bsks,
@@ -262,7 +260,7 @@ host_integer_decompress(cudaStream_t const *streams,
auto d_indexes_array = h_mem_ptr->tmp_indexes_array;
cuda_memcpy_async_to_gpu(d_indexes_array, (void *)h_indexes_array,
num_blocks_to_decompress * sizeof(uint32_t),
streams[0], gpu_indexes[0]);
streams.stream(0), streams.gpu_index(0));
auto compression_params = h_mem_ptr->compression_params;
auto lwe_per_glwe = compression_params.polynomial_size;
@@ -276,7 +274,7 @@ host_integer_decompress(cudaStream_t const *streams,
auto current_glwe_index = h_indexes_array[0] / lwe_per_glwe;
auto extracted_glwe = h_mem_ptr->tmp_extracted_glwe;
host_extract<Torus>(streams[0], gpu_indexes[0], extracted_glwe,
host_extract<Torus>(streams.stream(0), streams.gpu_index(0), extracted_glwe,
d_packed_glwe_in, current_glwe_index);
glwe_vec.push_back(std::make_pair(1, extracted_glwe));
for (int i = 1; i < num_blocks_to_decompress; i++) {
@@ -285,8 +283,8 @@ host_integer_decompress(cudaStream_t const *streams,
extracted_glwe += glwe_accumulator_size;
current_glwe_index = glwe_index;
// Extracts a new GLWE
host_extract<Torus>(streams[0], gpu_indexes[0], extracted_glwe,
d_packed_glwe_in, glwe_index);
host_extract<Torus>(streams.stream(0), streams.gpu_index(0),
extracted_glwe, d_packed_glwe_in, glwe_index);
glwe_vec.push_back(std::make_pair(1, extracted_glwe));
} else {
// Updates the quantity
@@ -312,17 +310,17 @@ host_integer_decompress(cudaStream_t const *streams,
extracted_glwe = max_idx_and_glwe.second;
if constexpr (std::is_same_v<Torus, uint64_t>)
cuda_glwe_sample_extract_64(streams[0], gpu_indexes[0], extracted_lwe,
extracted_glwe, d_indexes_array_chunk,
num_lwes, compression_params.polynomial_size,
compression_params.glwe_dimension,
cuda_glwe_sample_extract_64(
streams.stream(0), streams.gpu_index(0), extracted_lwe,
extracted_glwe, d_indexes_array_chunk, num_lwes,
compression_params.polynomial_size, compression_params.glwe_dimension,
compression_params.polynomial_size);
else
// 128 bits
cuda_glwe_sample_extract_128(streams[0], gpu_indexes[0], extracted_lwe,
extracted_glwe, d_indexes_array_chunk,
num_lwes, compression_params.polynomial_size,
compression_params.glwe_dimension,
cuda_glwe_sample_extract_128(
streams.stream(0), streams.gpu_index(0), extracted_lwe,
extracted_glwe, d_indexes_array_chunk, num_lwes,
compression_params.polynomial_size, compression_params.glwe_dimension,
compression_params.polynomial_size);
d_indexes_array_chunk += num_lwes;
@@ -341,13 +339,12 @@ host_integer_decompress(cudaStream_t const *streams,
/// dimension to a big LWE dimension
auto encryption_params = h_mem_ptr->encryption_params;
auto lut = h_mem_ptr->decompression_rescale_lut;
auto active_gpu_count =
get_active_gpu_count(num_blocks_to_decompress, gpu_count);
if (active_gpu_count == 1) {
auto active_streams = streams.active_gpu_subset(num_blocks_to_decompress);
if (active_streams.count() == 1) {
execute_pbs_async<Torus, Torus>(
streams, gpu_indexes, active_gpu_count, (Torus *)d_lwe_array_out->ptr,
lut->lwe_indexes_out, lut->lut_vec, lut->lut_indexes_vec,
extracted_lwe, lut->lwe_indexes_in, d_bsks, nullptr, lut->buffer,
active_streams, (Torus *)d_lwe_array_out->ptr, lut->lwe_indexes_out,
lut->lut_vec, lut->lut_indexes_vec, extracted_lwe,
lut->lwe_indexes_in, d_bsks, nullptr, lut->buffer,
encryption_params.glwe_dimension,
compression_params.small_lwe_dimension,
encryption_params.polynomial_size, encryption_params.pbs_base_log,
@@ -363,25 +360,26 @@ host_integer_decompress(cudaStream_t const *streams,
lut->lwe_trivial_indexes_vec;
/// Make sure all data that should be on GPU 0 is indeed there
cuda_event_record(lut->event_scatter_in, streams[0], gpu_indexes[0]);
for (int j = 1; j < active_gpu_count; j++) {
cuda_stream_wait_event(streams[j], lut->event_scatter_in,
gpu_indexes[j]);
cuda_event_record(lut->event_scatter_in, streams.stream(0),
streams.gpu_index(0));
for (int j = 1; j < active_streams.count(); j++) {
cuda_stream_wait_event(streams.stream(j), lut->event_scatter_in,
streams.gpu_index(j));
}
/// With multiple GPUs we push to the vectors on each GPU; then, when we
/// gather data to GPU 0, we can copy back to the original indexing
multi_gpu_scatter_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
extracted_lwe, lut->lwe_indexes_in, lut->using_trivial_lwe_indexes,
lut->lwe_aligned_vec, lut->active_gpu_count, num_blocks_to_decompress,
active_streams, lwe_array_in_vec, extracted_lwe, lut->lwe_indexes_in,
lut->using_trivial_lwe_indexes, lut->lwe_aligned_vec,
lut->active_streams.count(), num_blocks_to_decompress,
compression_params.small_lwe_dimension + 1);
/// Apply PBS
execute_pbs_async<Torus, Torus>(
streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
lwe_array_in_vec, lwe_trivial_indexes_vec, d_bsks, nullptr,
lut->buffer, encryption_params.glwe_dimension,
active_streams, lwe_after_pbs_vec, lwe_trivial_indexes_vec,
lut->lut_vec, lut->lut_indexes_vec, lwe_array_in_vec,
lwe_trivial_indexes_vec, d_bsks, nullptr, lut->buffer,
encryption_params.glwe_dimension,
compression_params.small_lwe_dimension,
encryption_params.polynomial_size, encryption_params.pbs_base_log,
encryption_params.pbs_level, encryption_params.grouping_factor,
@@ -390,21 +388,21 @@ host_integer_decompress(cudaStream_t const *streams,
/// Copy data back to GPU 0 and release vecs
multi_gpu_gather_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, (Torus *)d_lwe_array_out->ptr,
lwe_after_pbs_vec, lut->lwe_indexes_out,
lut->using_trivial_lwe_indexes, lut->lwe_aligned_vec,
num_blocks_to_decompress, encryption_params.big_lwe_dimension + 1);
active_streams, (Torus *)d_lwe_array_out->ptr, lwe_after_pbs_vec,
lut->lwe_indexes_out, lut->using_trivial_lwe_indexes,
lut->lwe_aligned_vec, num_blocks_to_decompress,
encryption_params.big_lwe_dimension + 1);
/// Synchronize all GPUs
// other gpus record their events
for (int j = 1; j < active_gpu_count; j++) {
cuda_event_record(lut->event_scatter_out[j], streams[j],
gpu_indexes[j]);
for (int j = 1; j < active_streams.count(); j++) {
cuda_event_record(lut->event_scatter_out[j], active_streams.stream(j),
active_streams.gpu_index(j));
}
// GPU 0 waits for all
for (int j = 1; j < active_gpu_count; j++) {
cuda_stream_wait_event(streams[0], lut->event_scatter_out[j],
gpu_indexes[0]);
for (int j = 1; j < active_streams.count(); j++) {
cuda_stream_wait_event(streams.stream(0), lut->event_scatter_out[j],
streams.gpu_index(0));
}
}
} else {
@@ -415,29 +413,27 @@ host_integer_decompress(cudaStream_t const *streams,
template <typename Torus>
__host__ uint64_t scratch_cuda_compress_integer_radix_ciphertext(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_compression<Torus> **mem_ptr,
CudaStreams streams, int_compression<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params compression_params,
uint32_t lwe_per_glwe, bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_compression<Torus>(
streams, gpu_indexes, gpu_count, compression_params, num_radix_blocks,
lwe_per_glwe, allocate_gpu_memory, size_tracker);
*mem_ptr = new int_compression<Torus>(streams, compression_params,
num_radix_blocks, lwe_per_glwe,
allocate_gpu_memory, size_tracker);
return size_tracker;
}
template <typename Torus>
__host__ uint64_t scratch_cuda_integer_decompress_radix_ciphertext(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_decompression<Torus> **mem_ptr,
CudaStreams streams, int_decompression<Torus> **mem_ptr,
uint32_t num_blocks_to_decompress, int_radix_params encryption_params,
int_radix_params compression_params, bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_decompression<Torus>(
streams, gpu_indexes, gpu_count, encryption_params, compression_params,
num_blocks_to_decompress, allocate_gpu_memory, size_tracker);
streams, encryption_params, compression_params, num_blocks_to_decompress,
allocate_gpu_memory, size_tracker);
return size_tracker;
}
#endif
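When decompression runs on more than one GPU, the code above brackets the scatter/PBS/gather with explicit events so that worker streams only start once GPU 0's inputs are visible, and GPU 0 only continues once every worker has written its results back. A reduced sketch of that fork/join; here the events are passed in as parameters, whereas above they live in the LUT buffer:

void fork_join_sketch(CudaStreams active_streams, cudaEvent_t scatter_in,
                      cudaEvent_t *scatter_out) {
  // GPU 0 publishes its inputs; every other active stream waits on them.
  cuda_event_record(scatter_in, active_streams.stream(0),
                    active_streams.gpu_index(0));
  for (uint32_t j = 1; j < active_streams.count(); j++)
    cuda_stream_wait_event(active_streams.stream(j), scatter_in,
                           active_streams.gpu_index(j));

  // ... multi_gpu_scatter_lwe_async, execute_pbs_async and
  //     multi_gpu_gather_lwe_async run here on the active subset ...

  // Each worker records completion; GPU 0 waits for all of them.
  for (uint32_t j = 1; j < active_streams.count(); j++)
    cuda_event_record(scatter_out[j], active_streams.stream(j),
                      active_streams.gpu_index(j));
  for (uint32_t j = 1; j < active_streams.count(); j++)
    cuda_stream_wait_event(active_streams.stream(0), scatter_out[j],
                           active_streams.gpu_index(0));
}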

View File

@@ -1,13 +1,12 @@
#include "integer/div_rem.cuh"
uint64_t scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
bool is_signed, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, bool allocate_gpu_memory,
CudaStreamsFFI streams, bool is_signed, int8_t **mem_ptr,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type) {
PUSH_RANGE("scratch div")
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
@@ -16,16 +15,14 @@ uint64_t scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
message_modulus, carry_modulus, noise_reduction_type);
return scratch_cuda_integer_div_rem_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, is_signed,
(int_div_rem_memory<uint64_t> **)mem_ptr, num_blocks, params,
allocate_gpu_memory);
CudaStreams(streams), is_signed, (int_div_rem_memory<uint64_t> **)mem_ptr,
num_blocks, params, allocate_gpu_memory);
POP_RANGE()
}
void cuda_integer_div_rem_radix_ciphertext_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *quotient, CudaRadixCiphertextFFI *remainder,
CudaRadixCiphertextFFI const *numerator,
CudaStreamsFFI streams, CudaRadixCiphertextFFI *quotient,
CudaRadixCiphertextFFI *remainder, CudaRadixCiphertextFFI const *numerator,
CudaRadixCiphertextFFI const *divisor, bool is_signed, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
@@ -33,20 +30,18 @@ void cuda_integer_div_rem_radix_ciphertext_kb_64(
auto mem = (int_div_rem_memory<uint64_t> *)mem_ptr;
host_integer_div_rem_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, quotient, remainder,
numerator, divisor, is_signed, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key, mem);
CudaStreams(streams), quotient, remainder, numerator, divisor, is_signed,
bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem);
POP_RANGE()
}
void cleanup_cuda_integer_div_rem(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count, int8_t **mem_ptr_void) {
void cleanup_cuda_integer_div_rem(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
PUSH_RANGE("cleanup div")
int_div_rem_memory<uint64_t> *mem_ptr =
(int_div_rem_memory<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
*mem_ptr_void = nullptr;
POP_RANGE()

View File

@@ -21,21 +21,19 @@
template <typename Torus>
__host__ uint64_t scratch_cuda_integer_div_rem_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, bool is_signed, int_div_rem_memory<Torus> **mem_ptr,
CudaStreams streams, bool is_signed, int_div_rem_memory<Torus> **mem_ptr,
uint32_t num_blocks, int_radix_params params, bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_div_rem_memory<Torus>(streams, gpu_indexes, gpu_count,
params, is_signed, num_blocks,
*mem_ptr =
new int_div_rem_memory<Torus>(streams, params, is_signed, num_blocks,
allocate_gpu_memory, size_tracker);
return size_tracker;
}
template <typename Torus>
__host__ void host_unsigned_integer_div_rem_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *quotient,
CudaStreams streams, CudaRadixCiphertextFFI *quotient,
CudaRadixCiphertextFFI *remainder, CudaRadixCiphertextFFI const *numerator,
CudaRadixCiphertextFFI const *divisor, void *const *bsks,
uint64_t *const *ksks,
@@ -75,10 +73,10 @@ __host__ void host_unsigned_integer_div_rem_kb(
auto cleaned_merged_interesting_remainder =
mem_ptr->cleaned_merged_interesting_remainder;
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
numerator_block_stack, numerator);
set_zero_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
quotient, 0, num_blocks);
set_zero_radix_ciphertext_slice_async<Torus>(
streams.stream(0), streams.gpu_index(0), quotient, 0, num_blocks);
for (int i = total_bits - 1; i >= 0; i--) {
uint32_t pos_in_block = i % num_bits_in_message;
@@ -97,17 +95,17 @@ __host__ void host_unsigned_integer_div_rem_kb(
(msb_bit_set + 1) / num_bits_in_message);
copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], interesting_remainder1, 0,
streams.stream(0), streams.gpu_index(0), interesting_remainder1, 0,
first_trivial_block, remainder1, 0, first_trivial_block);
copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], interesting_remainder2, 0,
streams.stream(0), streams.gpu_index(0), interesting_remainder2, 0,
first_trivial_block, remainder2, 0, first_trivial_block);
copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], interesting_divisor, 0, first_trivial_block,
divisor, 0, first_trivial_block);
streams.stream(0), streams.gpu_index(0), interesting_divisor, 0,
first_trivial_block, divisor, 0, first_trivial_block);
if ((msb_bit_set + 1) / num_bits_in_message < num_blocks)
copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], divisor_ms_blocks, 0,
streams.stream(0), streams.gpu_index(0), divisor_ms_blocks, 0,
num_blocks - (msb_bit_set + 1) / num_bits_in_message, divisor,
(msb_bit_set + 1) / num_bits_in_message, num_blocks);
@@ -116,9 +114,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
// msb_bit_set) the split versions share some bits they should not. So we do
// one PBS on the last block of the interesting_divisor, and first block of
// divisor_ms_blocks to trim out bits which should not be there
auto trim_last_interesting_divisor_bits = [&](cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count) {
auto trim_last_interesting_divisor_bits = [&](CudaStreams streams) {
if ((msb_bit_set + 1) % num_bits_in_message == 0) {
return;
}
@@ -149,14 +145,12 @@ __host__ void host_unsigned_integer_div_rem_kb(
interesting_divisor->num_radix_blocks - 1,
interesting_divisor->num_radix_blocks);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, &last_interesting_divisor_block,
streams, &last_interesting_divisor_block,
&last_interesting_divisor_block, bsks, ksks, ms_noise_reduction_key,
mem_ptr->masking_luts_1[shifted_mask], 1);
}; // trim_last_interesting_divisor_bits
auto trim_first_divisor_ms_bits = [&](cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count) {
auto trim_first_divisor_ms_bits = [&](CudaStreams streams) {
if (divisor_ms_blocks->num_radix_blocks == 0 ||
((msb_bit_set + 1) % num_bits_in_message) == 0) {
return;
@@ -178,9 +172,8 @@ __host__ void host_unsigned_integer_div_rem_kb(
shifted_mask = shifted_mask & full_message_mask;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, divisor_ms_blocks, divisor_ms_blocks,
bsks, ksks, ms_noise_reduction_key,
mem_ptr->masking_luts_2[shifted_mask], 1);
streams, divisor_ms_blocks, divisor_ms_blocks, bsks, ksks,
ms_noise_reduction_key, mem_ptr->masking_luts_2[shifted_mask], 1);
}; // trim_first_divisor_ms_bits
// This does
@@ -192,75 +185,64 @@ __host__ void host_unsigned_integer_div_rem_kb(
// However, to keep the remainder clean (noise-wise), we place the remainder
// block from which we need to extract the bit as the LSB of the remainder,
// so that left-shifting pulls in the bit we need.
auto left_shift_interesting_remainder1 = [&](cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count) {
pop_radix_ciphertext_block_async<Torus>(streams[0], gpu_indexes[0],
mem_ptr->numerator_block_1,
auto left_shift_interesting_remainder1 = [&](CudaStreams streams) {
pop_radix_ciphertext_block_async<Torus>(
streams.stream(0), streams.gpu_index(0), mem_ptr->numerator_block_1,
numerator_block_stack);
insert_block_in_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
mem_ptr->numerator_block_1,
insert_block_in_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), mem_ptr->numerator_block_1,
interesting_remainder1, 0);
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, interesting_remainder1, 1,
mem_ptr->shift_mem_1, bsks, ksks, ms_noise_reduction_key,
interesting_remainder1->num_radix_blocks);
streams, interesting_remainder1, 1, mem_ptr->shift_mem_1, bsks, ksks,
ms_noise_reduction_key, interesting_remainder1->num_radix_blocks);
reset_radix_ciphertext_blocks(mem_ptr->tmp_radix,
interesting_remainder1->num_radix_blocks);
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
mem_ptr->tmp_radix,
copy_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), mem_ptr->tmp_radix,
interesting_remainder1);
host_radix_blocks_rotate_left<Torus>(
streams, gpu_indexes, gpu_count, interesting_remainder1,
mem_ptr->tmp_radix, 1, interesting_remainder1->num_radix_blocks);
streams, interesting_remainder1, mem_ptr->tmp_radix, 1,
interesting_remainder1->num_radix_blocks);
pop_radix_ciphertext_block_async<Torus>(streams[0], gpu_indexes[0],
mem_ptr->numerator_block_1,
pop_radix_ciphertext_block_async<Torus>(
streams.stream(0), streams.gpu_index(0), mem_ptr->numerator_block_1,
interesting_remainder1);
if (pos_in_block != 0) {
// We have not yet extracted all the bits from this numerator
// so, we put it back on the front so that it gets taken next
// iteration
push_block_to_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
mem_ptr->numerator_block_1,
push_block_to_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), mem_ptr->numerator_block_1,
numerator_block_stack);
}
}; // left_shift_interesting_remainder1
auto left_shift_interesting_remainder2 = [&](cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count) {
auto left_shift_interesting_remainder2 = [&](CudaStreams streams) {
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, interesting_remainder2, 1,
mem_ptr->shift_mem_2, bsks, ksks, ms_noise_reduction_key,
interesting_remainder2->num_radix_blocks);
streams, interesting_remainder2, 1, mem_ptr->shift_mem_2, bsks, ksks,
ms_noise_reduction_key, interesting_remainder2->num_radix_blocks);
}; // left_shift_interesting_remainder2
for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
}
streams.synchronize();
// interesting_divisor
trim_last_interesting_divisor_bits(mem_ptr->sub_streams_1, gpu_indexes,
gpu_count);
trim_last_interesting_divisor_bits(mem_ptr->sub_streams_1);
// divisor_ms_blocks
trim_first_divisor_ms_bits(mem_ptr->sub_streams_2, gpu_indexes, gpu_count);
trim_first_divisor_ms_bits(mem_ptr->sub_streams_2);
// interesting_remainder1
// numerator_block_stack
left_shift_interesting_remainder1(mem_ptr->sub_streams_3, gpu_indexes,
gpu_count);
left_shift_interesting_remainder1(mem_ptr->sub_streams_3);
// interesting_remainder2
left_shift_interesting_remainder2(mem_ptr->sub_streams_4, gpu_indexes,
gpu_count);
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_4[j], gpu_indexes[j]);
}
left_shift_interesting_remainder2(mem_ptr->sub_streams_4);
mem_ptr->sub_streams_1.synchronize();
mem_ptr->sub_streams_2.synchronize();
mem_ptr->sub_streams_3.synchronize();
mem_ptr->sub_streams_4.synchronize();
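Each iteration above uses the same fork/join shape: the main stream set is synchronized, independent pieces of work (the trim and shift lambdas) are launched on separate sub-stream sets, and every sub-stream set is synchronized before the results are combined. A self-contained sketch of that shape with plain CUDA streams; the host callbacks stand in for the real kernels and all names are illustrative only:

// Fork/join sketch with raw CUDA streams: sync the main stream, launch
// independent work on two side streams, then join both before continuing.
#include <cuda_runtime.h>

static void CUDART_CB noop(void *) {}          // stand-in for enqueued kernels

void fork_join(cudaStream_t main_stream, cudaStream_t side_a,
               cudaStream_t side_b) {
  cudaStreamSynchronize(main_stream);          // inputs produced so far are visible

  cudaLaunchHostFunc(side_a, noop, nullptr);   // e.g. trim divisor bits
  cudaLaunchHostFunc(side_b, noop, nullptr);   // e.g. shift the remainder

  cudaStreamSynchronize(side_a);               // join before the dependent step
  cudaStreamSynchronize(side_b);
}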
// if interesting_remainder1 != 0 -> interesting_remainder2 == 0
// if interesting_remainder1 == 0 -> interesting_remainder2 != 0
@@ -269,7 +251,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
auto merged_interesting_remainder = interesting_remainder1;
host_addition<Torus>(
streams[0], gpu_indexes[0], merged_interesting_remainder,
streams.stream(0), streams.gpu_index(0), merged_interesting_remainder,
merged_interesting_remainder, interesting_remainder2,
merged_interesting_remainder->num_radix_blocks,
radix_params.message_modulus, radix_params.carry_modulus);
@@ -280,7 +262,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
reset_radix_ciphertext_blocks(
cleaned_merged_interesting_remainder,
merged_interesting_remainder->num_radix_blocks);
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
cleaned_merged_interesting_remainder,
merged_interesting_remainder);
@@ -296,9 +278,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
// fills:
// `new_remainder` - radix ciphertext
// `subtraction_overflowed` - single ciphertext
auto do_overflowing_sub = [&](cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count) {
auto do_overflowing_sub = [&](CudaStreams streams) {
uint32_t compute_borrow = 1;
uint32_t uses_input_borrow = 0;
auto first_indexes =
@@ -311,40 +291,37 @@ __host__ void host_unsigned_integer_div_rem_kb(
mem_ptr->scalars_for_overflow_sub
[merged_interesting_remainder->num_radix_blocks - 1];
mem_ptr->overflow_sub_mem->update_lut_indexes(
streams, gpu_indexes, gpu_count, first_indexes, second_indexes,
scalar_indexes, merged_interesting_remainder->num_radix_blocks);
streams, first_indexes, second_indexes, scalar_indexes,
merged_interesting_remainder->num_radix_blocks);
host_integer_overflowing_sub<uint64_t>(
streams, gpu_indexes, gpu_count, new_remainder,
merged_interesting_remainder, interesting_divisor,
subtraction_overflowed, (const CudaRadixCiphertextFFI *)nullptr,
mem_ptr->overflow_sub_mem, bsks, ksks, ms_noise_reduction_key,
compute_borrow, uses_input_borrow);
streams, new_remainder, merged_interesting_remainder,
interesting_divisor, subtraction_overflowed,
(const CudaRadixCiphertextFFI *)nullptr, mem_ptr->overflow_sub_mem,
bsks, ksks, ms_noise_reduction_key, compute_borrow,
uses_input_borrow);
};
// fills:
// `at_least_one_upper_block_is_non_zero` - single ciphertext
auto check_divisor_upper_blocks = [&](cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count) {
auto check_divisor_upper_blocks = [&](CudaStreams streams) {
auto trivial_blocks = divisor_ms_blocks;
if (trivial_blocks->num_radix_blocks == 0) {
set_zero_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], at_least_one_upper_block_is_non_zero, 0,
1);
streams.stream(0), streams.gpu_index(0),
at_least_one_upper_block_is_non_zero, 0, 1);
} else {
// We could call unchecked_scalar_ne
// But we are in the special case where scalar == 0
// So we can skip some stuff
host_compare_blocks_with_zero<Torus>(
streams, gpu_indexes, gpu_count, mem_ptr->tmp_1, trivial_blocks,
mem_ptr->comparison_buffer, bsks, ksks, ms_noise_reduction_key,
streams, mem_ptr->tmp_1, trivial_blocks, mem_ptr->comparison_buffer,
bsks, ksks, ms_noise_reduction_key,
trivial_blocks->num_radix_blocks,
mem_ptr->comparison_buffer->eq_buffer->is_non_zero_lut);
is_at_least_one_comparisons_block_true<Torus>(
streams, gpu_indexes, gpu_count,
at_least_one_upper_block_is_non_zero, mem_ptr->tmp_1,
streams, at_least_one_upper_block_is_non_zero, mem_ptr->tmp_1,
mem_ptr->comparison_buffer, bsks, ksks, ms_noise_reduction_key,
mem_ptr->tmp_1->num_radix_blocks);
}
@@ -354,56 +331,47 @@ __host__ void host_unsigned_integer_div_rem_kb(
// so that it can be safely used in bivariate PBSes
// fills:
// `cleaned_merged_interesting_remainder` - radix ciphertext
auto create_clean_version_of_merged_remainder =
[&](cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
auto create_clean_version_of_merged_remainder = [&](CudaStreams streams) {
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count,
cleaned_merged_interesting_remainder,
streams, cleaned_merged_interesting_remainder,
cleaned_merged_interesting_remainder, bsks, ksks,
ms_noise_reduction_key, mem_ptr->message_extract_lut_1,
cleaned_merged_interesting_remainder->num_radix_blocks);
};
// phase 2
for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
}
streams.synchronize();
// new_remainder
// subtraction_overflowed
do_overflowing_sub(mem_ptr->sub_streams_1, gpu_indexes, gpu_count);
do_overflowing_sub(mem_ptr->sub_streams_1);
// at_least_one_upper_block_is_non_zero
check_divisor_upper_blocks(mem_ptr->sub_streams_2, gpu_indexes, gpu_count);
check_divisor_upper_blocks(mem_ptr->sub_streams_2);
// cleaned_merged_interesting_remainder
create_clean_version_of_merged_remainder(mem_ptr->sub_streams_3,
gpu_indexes, gpu_count);
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
}
create_clean_version_of_merged_remainder(mem_ptr->sub_streams_3);
mem_ptr->sub_streams_1.synchronize();
mem_ptr->sub_streams_2.synchronize();
mem_ptr->sub_streams_3.synchronize();
host_addition<Torus>(
streams[0], gpu_indexes[0], overflow_sum, subtraction_overflowed,
at_least_one_upper_block_is_non_zero, 1, radix_params.message_modulus,
radix_params.carry_modulus);
streams.stream(0), streams.gpu_index(0), overflow_sum,
subtraction_overflowed, at_least_one_upper_block_is_non_zero, 1,
radix_params.message_modulus, radix_params.carry_modulus);
auto message_modulus = radix_params.message_modulus;
int factor = (i) ? message_modulus - 1 : message_modulus - 2;
int factor_lut_id = (i) ? 1 : 0;
for (size_t k = 0;
k < cleaned_merged_interesting_remainder->num_radix_blocks; k++) {
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
overflow_sum_radix, k, k + 1,
copy_radix_ciphertext_slice_async<Torus>(
streams.stream(0), streams.gpu_index(0), overflow_sum_radix, k, k + 1,
overflow_sum, 0, 1);
}
auto conditionally_zero_out_merged_interesting_remainder =
[&](cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
[&](CudaStreams streams) {
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count,
cleaned_merged_interesting_remainder,
streams, cleaned_merged_interesting_remainder,
cleaned_merged_interesting_remainder, overflow_sum_radix, bsks,
ksks, ms_noise_reduction_key,
mem_ptr->zero_out_if_overflow_did_not_happen[factor_lut_id],
@@ -411,23 +379,20 @@ __host__ void host_unsigned_integer_div_rem_kb(
};
auto conditionally_zero_out_merged_new_remainder =
[&](cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
[&](CudaStreams streams) {
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, new_remainder, new_remainder,
overflow_sum_radix, bsks, ksks, ms_noise_reduction_key,
streams, new_remainder, new_remainder, overflow_sum_radix, bsks,
ksks, ms_noise_reduction_key,
mem_ptr->zero_out_if_overflow_happened[factor_lut_id],
new_remainder->num_radix_blocks, factor);
};
auto set_quotient_bit = [&](cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count) {
auto set_quotient_bit = [&](CudaStreams streams) {
uint32_t block_of_bit = i / num_bits_in_message;
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, mem_ptr->did_not_overflow,
subtraction_overflowed, at_least_one_upper_block_is_non_zero, bsks,
ksks, ms_noise_reduction_key,
streams, mem_ptr->did_not_overflow, subtraction_overflowed,
at_least_one_upper_block_is_non_zero, bsks, ksks,
ms_noise_reduction_key,
mem_ptr->merge_overflow_flags_luts[pos_in_block], 1,
mem_ptr->merge_overflow_flags_luts[pos_in_block]
->params.message_modulus);
@@ -435,28 +400,24 @@ __host__ void host_unsigned_integer_div_rem_kb(
CudaRadixCiphertextFFI quotient_block;
as_radix_ciphertext_slice<Torus>(&quotient_block, quotient, block_of_bit,
block_of_bit + 1);
host_addition<Torus>(streams[0], gpu_indexes[0], &quotient_block,
host_addition<Torus>(
streams.stream(0), streams.gpu_index(0), &quotient_block,
&quotient_block, mem_ptr->did_not_overflow, 1,
radix_params.message_modulus,
radix_params.carry_modulus);
radix_params.message_modulus, radix_params.carry_modulus);
};
for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
}
streams.synchronize();
// cleaned_merged_interesting_remainder
conditionally_zero_out_merged_interesting_remainder(mem_ptr->sub_streams_1,
gpu_indexes, gpu_count);
conditionally_zero_out_merged_interesting_remainder(mem_ptr->sub_streams_1);
// new_remainder
conditionally_zero_out_merged_new_remainder(mem_ptr->sub_streams_2,
gpu_indexes, gpu_count);
conditionally_zero_out_merged_new_remainder(mem_ptr->sub_streams_2);
// quotient
set_quotient_bit(mem_ptr->sub_streams_3, gpu_indexes, gpu_count);
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
}
set_quotient_bit(mem_ptr->sub_streams_3);
mem_ptr->sub_streams_1.synchronize();
mem_ptr->sub_streams_2.synchronize();
mem_ptr->sub_streams_3.synchronize();
if (first_trivial_block !=
cleaned_merged_interesting_remainder->num_radix_blocks)
@@ -467,11 +428,12 @@ __host__ void host_unsigned_integer_div_rem_kb(
"num blocks")
copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], remainder1, 0, first_trivial_block,
cleaned_merged_interesting_remainder, 0, first_trivial_block);
streams.stream(0), streams.gpu_index(0), remainder1, 0,
first_trivial_block, cleaned_merged_interesting_remainder, 0,
first_trivial_block);
copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], remainder2, 0, first_trivial_block,
new_remainder, 0, first_trivial_block);
streams.stream(0), streams.gpu_index(0), remainder2, 0,
first_trivial_block, new_remainder, 0, first_trivial_block);
}
if (remainder1->num_radix_blocks != remainder2->num_radix_blocks)
@@ -480,31 +442,27 @@ __host__ void host_unsigned_integer_div_rem_kb(
// Clean the quotient and remainder
// as even though they have no carries, they are not at nominal noise level
host_addition<Torus>(streams[0], gpu_indexes[0], remainder, remainder1,
remainder2, remainder1->num_radix_blocks,
host_addition<Torus>(streams.stream(0), streams.gpu_index(0), remainder,
remainder1, remainder2, remainder1->num_radix_blocks,
radix_params.message_modulus,
radix_params.carry_modulus);
for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
}
streams.synchronize();
integer_radix_apply_univariate_lookup_table_kb<Torus>(
mem_ptr->sub_streams_1, gpu_indexes, gpu_count, remainder, remainder,
bsks, ksks, ms_noise_reduction_key, mem_ptr->message_extract_lut_1,
num_blocks);
mem_ptr->sub_streams_1, remainder, remainder, bsks, ksks,
ms_noise_reduction_key, mem_ptr->message_extract_lut_1, num_blocks);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
mem_ptr->sub_streams_2, gpu_indexes, gpu_count, quotient, quotient, bsks,
ksks, ms_noise_reduction_key, mem_ptr->message_extract_lut_2, num_blocks);
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
}
mem_ptr->sub_streams_2, quotient, quotient, bsks, ksks,
ms_noise_reduction_key, mem_ptr->message_extract_lut_2, num_blocks);
mem_ptr->sub_streams_1.synchronize();
mem_ptr->sub_streams_2.synchronize();
}
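The function above is the homomorphic counterpart of classic restoring (shift-and-subtract) division: for each bit position, from most significant to least, the running remainder is shifted left and the next numerator bit is pulled in, the divisor is subtracted with overflow detection, the subtraction is kept only when it did not underflow, and the matching quotient bit is set. For reference, the same loop on plaintext 64-bit values:

// Plaintext reference for the shift-and-subtract loop implemented
// homomorphically above.
#include <cassert>
#include <cstdint>

void restoring_div_rem(uint64_t numerator, uint64_t divisor,
                       uint64_t &quotient, uint64_t &remainder) {
  assert(divisor != 0);
  quotient = 0;
  remainder = 0;
  for (int i = 63; i >= 0; --i) {
    remainder = (remainder << 1) | ((numerator >> i) & 1); // pull in next bit
    if (remainder >= divisor) {     // homomorphically: overflowing sub + zero-out
      remainder -= divisor;
      quotient |= uint64_t(1) << i; // set this quotient bit
    }
  }
}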
template <typename Torus>
__host__ void host_integer_div_rem_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *quotient,
CudaStreams streams, CudaRadixCiphertextFFI *quotient,
CudaRadixCiphertextFFI *remainder, CudaRadixCiphertextFFI const *numerator,
CudaRadixCiphertextFFI const *divisor, bool is_signed, void *const *bsks,
uint64_t *const *ksks,
@@ -526,32 +484,27 @@ __host__ void host_integer_div_rem_kb(
// temporary memory
auto positive_numerator = int_mem_ptr->positive_numerator;
auto positive_divisor = int_mem_ptr->positive_divisor;
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
positive_numerator, numerator);
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
positive_divisor, divisor);
for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
}
streams.synchronize();
host_integer_abs_kb<Torus>(
int_mem_ptr->sub_streams_1, gpu_indexes, int_mem_ptr->active_gpu_count,
positive_numerator, bsks, ksks, ms_noise_reduction_key,
host_integer_abs_kb<Torus>(int_mem_ptr->sub_streams_1, positive_numerator,
bsks, ksks, ms_noise_reduction_key,
int_mem_ptr->abs_mem_1, true);
host_integer_abs_kb<Torus>(int_mem_ptr->sub_streams_2, gpu_indexes,
int_mem_ptr->active_gpu_count, positive_divisor,
host_integer_abs_kb<Torus>(int_mem_ptr->sub_streams_2, positive_divisor,
bsks, ksks, ms_noise_reduction_key,
int_mem_ptr->abs_mem_2, true);
for (uint j = 0; j < int_mem_ptr->active_gpu_count; j++) {
cuda_synchronize_stream(int_mem_ptr->sub_streams_1[j], gpu_indexes[j]);
cuda_synchronize_stream(int_mem_ptr->sub_streams_2[j], gpu_indexes[j]);
}
int_mem_ptr->sub_streams_1.synchronize();
int_mem_ptr->sub_streams_2.synchronize();
host_unsigned_integer_div_rem_kb<Torus>(
int_mem_ptr->sub_streams_1, gpu_indexes, gpu_count, quotient, remainder,
positive_numerator, positive_divisor, bsks, ksks,
ms_noise_reduction_key, int_mem_ptr->unsigned_mem);
int_mem_ptr->sub_streams_1, quotient, remainder, positive_numerator,
positive_divisor, bsks, ksks, ms_noise_reduction_key,
int_mem_ptr->unsigned_mem);
CudaRadixCiphertextFFI numerator_sign;
as_radix_ciphertext_slice<Torus>(&numerator_sign, numerator, num_blocks - 1,
@@ -560,59 +513,51 @@ __host__ void host_integer_div_rem_kb(
as_radix_ciphertext_slice<Torus>(&divisor_sign, divisor, num_blocks - 1,
num_blocks);
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
int_mem_ptr->sub_streams_2, gpu_indexes, gpu_count,
int_mem_ptr->sign_bits_are_different, &numerator_sign, &divisor_sign,
bsks, ksks, ms_noise_reduction_key,
int_mem_ptr->sub_streams_2, int_mem_ptr->sign_bits_are_different,
&numerator_sign, &divisor_sign, bsks, ksks, ms_noise_reduction_key,
int_mem_ptr->compare_signed_bits_lut, 1,
int_mem_ptr->compare_signed_bits_lut->params.message_modulus);
for (uint j = 0; j < int_mem_ptr->active_gpu_count; j++) {
cuda_synchronize_stream(int_mem_ptr->sub_streams_1[j], gpu_indexes[j]);
cuda_synchronize_stream(int_mem_ptr->sub_streams_2[j], gpu_indexes[j]);
}
int_mem_ptr->sub_streams_1.synchronize();
int_mem_ptr->sub_streams_2.synchronize();
host_integer_radix_negation<Torus>(int_mem_ptr->sub_streams_1, gpu_indexes,
gpu_count, int_mem_ptr->negated_quotient,
quotient, radix_params.message_modulus,
radix_params.carry_modulus, num_blocks);
host_integer_radix_negation<Torus>(
int_mem_ptr->sub_streams_1, int_mem_ptr->negated_quotient, quotient,
radix_params.message_modulus, radix_params.carry_modulus, num_blocks);
uint32_t requested_flag = outputFlag::FLAG_NONE;
uint32_t uses_carry = 0;
host_propagate_single_carry<Torus>(
int_mem_ptr->sub_streams_1, gpu_indexes, gpu_count,
int_mem_ptr->negated_quotient, nullptr, nullptr, int_mem_ptr->scp_mem_1,
bsks, ksks, ms_noise_reduction_key, requested_flag, uses_carry);
host_integer_radix_negation<Torus>(
int_mem_ptr->sub_streams_2, gpu_indexes, gpu_count,
int_mem_ptr->negated_remainder, remainder, radix_params.message_modulus,
radix_params.carry_modulus, num_blocks);
host_propagate_single_carry<Torus>(
int_mem_ptr->sub_streams_2, gpu_indexes, gpu_count,
int_mem_ptr->negated_remainder, nullptr, nullptr,
int_mem_ptr->scp_mem_2, bsks, ksks, ms_noise_reduction_key,
int_mem_ptr->sub_streams_1, int_mem_ptr->negated_quotient, nullptr,
nullptr, int_mem_ptr->scp_mem_1, bsks, ksks, ms_noise_reduction_key,
requested_flag, uses_carry);
host_integer_radix_cmux_kb<Torus>(
int_mem_ptr->sub_streams_1, gpu_indexes, gpu_count, quotient,
int_mem_ptr->sign_bits_are_different, int_mem_ptr->negated_quotient,
quotient, int_mem_ptr->cmux_quotient_mem, bsks, ksks,
ms_noise_reduction_key);
host_integer_radix_negation<Torus>(
int_mem_ptr->sub_streams_2, int_mem_ptr->negated_remainder, remainder,
radix_params.message_modulus, radix_params.carry_modulus, num_blocks);
host_propagate_single_carry<Torus>(
int_mem_ptr->sub_streams_2, int_mem_ptr->negated_remainder, nullptr,
nullptr, int_mem_ptr->scp_mem_2, bsks, ksks, ms_noise_reduction_key,
requested_flag, uses_carry);
host_integer_radix_cmux_kb<Torus>(int_mem_ptr->sub_streams_1, quotient,
int_mem_ptr->sign_bits_are_different,
int_mem_ptr->negated_quotient, quotient,
int_mem_ptr->cmux_quotient_mem, bsks,
ksks, ms_noise_reduction_key);
host_integer_radix_cmux_kb<Torus>(
int_mem_ptr->sub_streams_2, gpu_indexes, gpu_count, remainder,
&numerator_sign, int_mem_ptr->negated_remainder, remainder,
int_mem_ptr->sub_streams_2, remainder, &numerator_sign,
int_mem_ptr->negated_remainder, remainder,
int_mem_ptr->cmux_remainder_mem, bsks, ksks, ms_noise_reduction_key);
for (uint j = 0; j < int_mem_ptr->active_gpu_count; j++) {
cuda_synchronize_stream(int_mem_ptr->sub_streams_1[j], gpu_indexes[j]);
cuda_synchronize_stream(int_mem_ptr->sub_streams_2[j], gpu_indexes[j]);
}
int_mem_ptr->sub_streams_1.synchronize();
int_mem_ptr->sub_streams_2.synchronize();
} else {
host_unsigned_integer_div_rem_kb<Torus>(
streams, gpu_indexes, gpu_count, quotient, remainder, numerator,
divisor, bsks, ksks, ms_noise_reduction_key, int_mem_ptr->unsigned_mem);
streams, quotient, remainder, numerator, divisor, bsks, ksks,
ms_noise_reduction_key, int_mem_ptr->unsigned_mem);
}
}
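In the signed branch above, both operands are first replaced by their absolute values (on two sub-stream sets in parallel), the unsigned routine then divides them, and the signs are fixed afterwards: the quotient is negated when the operands' sign bits differ and the remainder when the numerator is negative, with the selection done by cmux. On plaintext this is the usual truncated-division rule:

// Plaintext counterpart of the signed fix-up above: divide absolute values,
// then conditionally negate quotient and remainder.
#include <cstdint>

void signed_div_rem(int64_t n, int64_t d, int64_t &q, int64_t &r) {
  uint64_t un = n < 0 ? uint64_t(0) - uint64_t(n) : uint64_t(n); // |n|
  uint64_t ud = d < 0 ? uint64_t(0) - uint64_t(d) : uint64_t(d); // |d|
  uint64_t uq = un / ud;
  uint64_t ur = un % ud;
  bool signs_differ = (n < 0) != (d < 0);
  q = signs_differ ? -int64_t(uq) : int64_t(uq); // quotient flips if signs differ
  r = (n < 0) ? -int64_t(ur) : int64_t(ur);      // remainder takes numerator's sign
}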

View File

@@ -1,13 +1,12 @@
#include "ilog2.cuh"
uint64_t scratch_integer_count_of_consecutive_bits_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t counter_num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, Direction direction,
BitValue bit_value, bool allocate_gpu_memory,
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t counter_num_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
Direction direction, BitValue bit_value, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
@@ -17,7 +16,7 @@ uint64_t scratch_integer_count_of_consecutive_bits_kb_64(
noise_reduction_type);
return scratch_integer_count_of_consecutive_bits<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, params,
CudaStreams(streams), params,
(int_count_of_consecutive_bits_buffer<uint64_t> **)mem_ptr, num_blocks,
counter_num_blocks, direction, bit_value, allocate_gpu_memory);
}
@@ -28,37 +27,35 @@ uint64_t scratch_integer_count_of_consecutive_bits_kb_64(
// stored in the output ciphertext.
//
void cuda_integer_count_of_consecutive_bits_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *output_ct, CudaRadixCiphertextFFI const *input_ct,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_ct,
CudaRadixCiphertextFFI const *input_ct, int8_t *mem_ptr, void *const *bsks,
void *const *ksks,
const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key) {
host_integer_count_of_consecutive_bits<uint64_t>(
(cudaStream_t *)streams, gpu_indexes, gpu_count, output_ct, input_ct,
CudaStreams(streams), output_ct, input_ct,
(int_count_of_consecutive_bits_buffer<uint64_t> *)mem_ptr, bsks,
(uint64_t **)ksks, ms_noise_reduction_key);
}
void cleanup_cuda_integer_count_of_consecutive_bits_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void) {
CudaStreamsFFI streams, int8_t **mem_ptr_void) {
int_count_of_consecutive_bits_buffer<uint64_t> *mem_ptr =
(int_count_of_consecutive_bits_buffer<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)streams, gpu_indexes, gpu_count);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
*mem_ptr_void = nullptr;
}
uint64_t scratch_integer_ilog2_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
uint32_t input_num_blocks, uint32_t counter_num_blocks,
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, uint32_t input_num_blocks, uint32_t counter_num_blocks,
uint32_t num_bits_in_ciphertext, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type) {
@@ -69,9 +66,9 @@ uint64_t scratch_integer_ilog2_kb_64(
noise_reduction_type);
return scratch_integer_ilog2<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, params,
(int_ilog2_buffer<uint64_t> **)mem_ptr, input_num_blocks,
counter_num_blocks, num_bits_in_ciphertext, allocate_gpu_memory);
CudaStreams(streams), params, (int_ilog2_buffer<uint64_t> **)mem_ptr,
input_num_blocks, counter_num_blocks, num_bits_in_ciphertext,
allocate_gpu_memory);
}
// Computes the integer logarithm base 2 of an encrypted integer.
@@ -79,30 +76,27 @@ uint64_t scratch_integer_ilog2_kb_64(
// The result is stored in the output ciphertext.
//
void cuda_integer_ilog2_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *output_ct, CudaRadixCiphertextFFI const *input_ct,
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_ct,
CudaRadixCiphertextFFI const *input_ct,
CudaRadixCiphertextFFI const *trivial_ct_neg_n,
CudaRadixCiphertextFFI const *trivial_ct_2,
CudaRadixCiphertextFFI const *trivial_ct_m_minus_1_block, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key) {
host_integer_ilog2<uint64_t>((cudaStream_t *)streams, gpu_indexes, gpu_count,
output_ct, input_ct, trivial_ct_neg_n,
trivial_ct_2, trivial_ct_m_minus_1_block,
(int_ilog2_buffer<uint64_t> *)mem_ptr, bsks,
host_integer_ilog2<uint64_t>(
CudaStreams(streams), output_ct, input_ct, trivial_ct_neg_n, trivial_ct_2,
trivial_ct_m_minus_1_block, (int_ilog2_buffer<uint64_t> *)mem_ptr, bsks,
(uint64_t **)ksks, ms_noise_reduction_key);
}
void cleanup_cuda_integer_ilog2_kb_64(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
void cleanup_cuda_integer_ilog2_kb_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
int_ilog2_buffer<uint64_t> *mem_ptr =
(int_ilog2_buffer<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)streams, gpu_indexes, gpu_count);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
*mem_ptr_void = nullptr;

View File

@@ -7,31 +7,29 @@
template <typename Torus>
__host__ void host_integer_prepare_count_of_consecutive_bits(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *ciphertext,
CudaStreams streams, CudaRadixCiphertextFFI *ciphertext,
int_prepare_count_of_consecutive_bits_buffer<Torus> *mem_ptr,
void *const *bsks, Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
auto tmp = mem_ptr->tmp_ct;
host_apply_univariate_lut_kb<Torus>(streams, gpu_indexes, gpu_count, tmp,
ciphertext, mem_ptr->univ_lut_mem, ksks,
host_apply_univariate_lut_kb<Torus>(streams, tmp, ciphertext,
mem_ptr->univ_lut_mem, ksks,
ms_noise_reduction_key, bsks);
if (mem_ptr->direction == Leading) {
host_radix_blocks_reverse_inplace<Torus>(streams, gpu_indexes, tmp);
host_radix_blocks_reverse_inplace<Torus>(streams, tmp);
}
host_compute_prefix_sum_hillis_steele<uint64_t>(
streams, gpu_indexes, gpu_count, ciphertext, tmp, mem_ptr->biv_lut_mem,
bsks, ksks, ms_noise_reduction_key, ciphertext->num_radix_blocks);
streams, ciphertext, tmp, mem_ptr->biv_lut_mem, bsks, ksks,
ms_noise_reduction_key, ciphertext->num_radix_blocks);
}
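The preparation step above applies a univariate lookup table to every block, reverses the block order when counting leading bits, and then runs a Hillis-Steele inclusive scan so that each block's value also accounts for all blocks before it. The scan takes about log2(n) passes, each combining element i with element i - stride; below is a plain CPU sketch with addition as the combining operator (the homomorphic version combines blocks through a bivariate lookup table instead):

// CPU sketch of the Hillis-Steele inclusive scan: log2(n) passes, each pass
// combines element i with element i - stride.
#include <cstddef>
#include <vector>

std::vector<int> hillis_steele_scan(std::vector<int> v) {
  for (std::size_t stride = 1; stride < v.size(); stride *= 2) {
    std::vector<int> next = v;          // double-buffer, as a GPU pass would
    for (std::size_t i = stride; i < v.size(); ++i)
      next[i] = v[i] + v[i - stride];   // combine with the earlier partial result
    v = std::move(next);
  }
  return v;                             // v[i] == sum of inputs 0..i
}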
template <typename Torus>
__host__ uint64_t scratch_integer_count_of_consecutive_bits(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, const int_radix_params params,
CudaStreams streams, const int_radix_params params,
int_count_of_consecutive_bits_buffer<Torus> **mem_ptr,
uint32_t num_radix_blocks, uint32_t counter_num_blocks, Direction direction,
BitValue bit_value, const bool allocate_gpu_memory) {
@@ -39,17 +37,15 @@ __host__ uint64_t scratch_integer_count_of_consecutive_bits(
uint64_t size_tracker = 0;
*mem_ptr = new int_count_of_consecutive_bits_buffer<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
counter_num_blocks, direction, bit_value, allocate_gpu_memory,
size_tracker);
streams, params, num_radix_blocks, counter_num_blocks, direction,
bit_value, allocate_gpu_memory, size_tracker);
return size_tracker;
}
template <typename Torus>
__host__ void host_integer_count_of_consecutive_bits(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *output_ct,
CudaStreams streams, CudaRadixCiphertextFFI *output_ct,
CudaRadixCiphertextFFI const *input_ct,
int_count_of_consecutive_bits_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks,
@@ -59,13 +55,13 @@ __host__ void host_integer_count_of_consecutive_bits(
auto ct_prepared = mem_ptr->ct_prepared;
auto counter_num_blocks = mem_ptr->counter_num_blocks;
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], ct_prepared,
input_ct);
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
ct_prepared, input_ct);
// Prepare count of consecutive bits
//
host_integer_prepare_count_of_consecutive_bits(
streams, gpu_indexes, gpu_count, ct_prepared, mem_ptr->prepare_mem, bsks,
host_integer_prepare_count_of_consecutive_bits(streams, ct_prepared,
mem_ptr->prepare_mem, bsks,
ksks, ms_noise_reduction_key);
// Perform addition and propagation of prepared cts
@@ -75,42 +71,40 @@ __host__ void host_integer_count_of_consecutive_bits(
for (uint32_t i = 0; i < ct_prepared->num_radix_blocks; ++i) {
uint32_t output_start_index = i * counter_num_blocks;
copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], cts, output_start_index,
streams.stream(0), streams.gpu_index(0), cts, output_start_index,
output_start_index + 1, ct_prepared, i, i + 1);
}
host_integer_partial_sum_ciphertexts_vec_kb<Torus>(
streams, gpu_indexes, gpu_count, output_ct, cts, bsks, ksks,
ms_noise_reduction_key, mem_ptr->sum_mem, counter_num_blocks,
ct_prepared->num_radix_blocks);
streams, output_ct, cts, bsks, ksks, ms_noise_reduction_key,
mem_ptr->sum_mem, counter_num_blocks, ct_prepared->num_radix_blocks);
host_propagate_single_carry<Torus>(streams, gpu_indexes, gpu_count, output_ct,
nullptr, nullptr, mem_ptr->propagate_mem,
bsks, ksks, ms_noise_reduction_key, 0, 0);
host_propagate_single_carry<Torus>(streams, output_ct, nullptr, nullptr,
mem_ptr->propagate_mem, bsks, ksks,
ms_noise_reduction_key, 0, 0);
}
template <typename Torus>
__host__ uint64_t scratch_integer_ilog2(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, const int_radix_params params,
int_ilog2_buffer<Torus> **mem_ptr, uint32_t input_num_blocks,
uint32_t counter_num_blocks, uint32_t num_bits_in_ciphertext,
__host__ uint64_t scratch_integer_ilog2(CudaStreams streams,
const int_radix_params params,
int_ilog2_buffer<Torus> **mem_ptr,
uint32_t input_num_blocks,
uint32_t counter_num_blocks,
uint32_t num_bits_in_ciphertext,
const bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_ilog2_buffer<Torus>(
streams, gpu_indexes, gpu_count, params, input_num_blocks,
counter_num_blocks, num_bits_in_ciphertext, allocate_gpu_memory,
size_tracker);
streams, params, input_num_blocks, counter_num_blocks,
num_bits_in_ciphertext, allocate_gpu_memory, size_tracker);
return size_tracker;
}
template <typename Torus>
__host__ void host_integer_ilog2(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *output_ct,
CudaStreams streams, CudaRadixCiphertextFFI *output_ct,
CudaRadixCiphertextFFI const *input_ct,
CudaRadixCiphertextFFI const *trivial_ct_neg_n,
CudaRadixCiphertextFFI const *trivial_ct_2,
@@ -121,18 +115,18 @@ __host__ void host_integer_ilog2(
// Prepare the input ciphertext by computing the number of consecutive
// leading zeros for each of its blocks.
//
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
mem_ptr->ct_in_buffer, input_ct);
host_integer_prepare_count_of_consecutive_bits<Torus>(
streams, gpu_indexes, gpu_count, mem_ptr->ct_in_buffer,
mem_ptr->prepare_mem, bsks, ksks, ms_noise_reduction_key);
streams, mem_ptr->ct_in_buffer, mem_ptr->prepare_mem, bsks, ksks,
ms_noise_reduction_key);
// Build the input for the sum by taking each block's leading zero count
// and placing it into a separate, zero-padded ct slot.
//
for (uint32_t i = 0; i < mem_ptr->input_num_blocks; ++i) {
copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], mem_ptr->sum_input_cts,
streams.stream(0), streams.gpu_index(0), mem_ptr->sum_input_cts,
i * mem_ptr->counter_num_blocks, (i * mem_ptr->counter_num_blocks) + 1,
mem_ptr->ct_in_buffer, i, i + 1);
}
@@ -145,7 +139,7 @@ __host__ void host_integer_ilog2(
"num blocks of trivial_ct_neg_n should be equal to counter_num_blocks");
}
copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], mem_ptr->sum_input_cts,
streams.stream(0), streams.gpu_index(0), mem_ptr->sum_input_cts,
mem_ptr->input_num_blocks * mem_ptr->counter_num_blocks,
(mem_ptr->input_num_blocks + 1) * mem_ptr->counter_num_blocks,
trivial_ct_neg_n, 0, trivial_ct_neg_n->num_radix_blocks);
@@ -153,34 +147,31 @@ __host__ void host_integer_ilog2(
// Perform a partial sum of all the elements without carry propagation.
//
host_integer_partial_sum_ciphertexts_vec_kb<Torus>(
streams, gpu_indexes, gpu_count, mem_ptr->sum_output_not_propagated,
mem_ptr->sum_input_cts, bsks, ksks, ms_noise_reduction_key,
mem_ptr->sum_mem, mem_ptr->counter_num_blocks,
mem_ptr->input_num_blocks + 1);
streams, mem_ptr->sum_output_not_propagated, mem_ptr->sum_input_cts, bsks,
ksks, ms_noise_reduction_key, mem_ptr->sum_mem,
mem_ptr->counter_num_blocks, mem_ptr->input_num_blocks + 1);
// Apply luts to the partial sum.
//
host_apply_univariate_lut_kb<Torus>(
streams, gpu_indexes, gpu_count, mem_ptr->message_blocks_not,
mem_ptr->sum_output_not_propagated, mem_ptr->lut_message_not, ksks,
ms_noise_reduction_key, bsks);
streams, mem_ptr->message_blocks_not, mem_ptr->sum_output_not_propagated,
mem_ptr->lut_message_not, ksks, ms_noise_reduction_key, bsks);
host_apply_univariate_lut_kb<Torus>(
streams, gpu_indexes, gpu_count, mem_ptr->carry_blocks_not,
mem_ptr->sum_output_not_propagated, mem_ptr->lut_carry_not, ksks,
ms_noise_reduction_key, bsks);
streams, mem_ptr->carry_blocks_not, mem_ptr->sum_output_not_propagated,
mem_ptr->lut_carry_not, ksks, ms_noise_reduction_key, bsks);
// Left-shift the bitwise-negated carry blocks by one position.
//
copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], mem_ptr->rotated_carry_blocks, 1,
streams.stream(0), streams.gpu_index(0), mem_ptr->rotated_carry_blocks, 1,
mem_ptr->counter_num_blocks, mem_ptr->carry_blocks_not, 0,
mem_ptr->counter_num_blocks - 1);
// Insert a block of (mod - 1) at the least significant position.
//
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
mem_ptr->rotated_carry_blocks, 0, 1,
trivial_ct_m_minus_1_block, 0, 1);
copy_radix_ciphertext_slice_async<Torus>(
streams.stream(0), streams.gpu_index(0), mem_ptr->rotated_carry_blocks, 0,
1, trivial_ct_m_minus_1_block, 0, 1);
// Update the degree metadata for the rotated carry blocks.
//
@@ -190,28 +181,27 @@ __host__ void host_integer_ilog2(
}
copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], mem_ptr->sum_input_cts, 0,
streams.stream(0), streams.gpu_index(0), mem_ptr->sum_input_cts, 0,
mem_ptr->counter_num_blocks, mem_ptr->message_blocks_not, 0,
mem_ptr->counter_num_blocks);
copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], mem_ptr->sum_input_cts,
streams.stream(0), streams.gpu_index(0), mem_ptr->sum_input_cts,
mem_ptr->counter_num_blocks, 2 * mem_ptr->counter_num_blocks,
mem_ptr->rotated_carry_blocks, 0, mem_ptr->counter_num_blocks);
copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], mem_ptr->sum_input_cts,
streams.stream(0), streams.gpu_index(0), mem_ptr->sum_input_cts,
2 * mem_ptr->counter_num_blocks, 3 * mem_ptr->counter_num_blocks,
trivial_ct_2, 0, mem_ptr->counter_num_blocks);
host_integer_partial_sum_ciphertexts_vec_kb<Torus>(
streams, gpu_indexes, gpu_count, output_ct, mem_ptr->sum_input_cts, bsks,
ksks, ms_noise_reduction_key, mem_ptr->sum_mem,
mem_ptr->counter_num_blocks, 3);
streams, output_ct, mem_ptr->sum_input_cts, bsks, ksks,
ms_noise_reduction_key, mem_ptr->sum_mem, mem_ptr->counter_num_blocks, 3);
host_full_propagate_inplace<Torus>(
streams, gpu_indexes, gpu_count, output_ct, mem_ptr->final_propagate_mem,
ksks, ms_noise_reduction_key, bsks, mem_ptr->counter_num_blocks);
streams, output_ct, mem_ptr->final_propagate_mem, ksks,
ms_noise_reduction_key, bsks, mem_ptr->counter_num_blocks);
}
#endif
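On plaintext, the integer base-2 logarithm that this file computes homomorphically is the bit width minus one, minus the leading-zero count: for a w-bit value x > 0, ilog2(x) = (w - 1) - clz(x). The encrypted version assembles the same quantity from per-block leading-zero counts and trivially encrypted constants. A plaintext reference:

// Plaintext reference: floor(log2(x)) of a 64-bit value as 63 minus the
// number of leading zero bits. Valid for x > 0.
#include <cassert>
#include <cstdint>

uint32_t ilog2_u64(uint64_t x) {
  assert(x != 0);
  uint32_t leading_zeros = 0;
  for (int i = 63; i >= 0 && ((x >> i) & 1) == 0; --i)
    ++leading_zeros;            // count zero bits from the MSB down
  return 63 - leading_zeros;    // == floor(log2(x))
}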

View File

@@ -3,8 +3,8 @@
#include <linear_algebra.h>
void cuda_full_propagation_64_inplace(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *input_blocks, int8_t *mem_ptr, void *const *ksks,
CudaStreamsFFI streams, CudaRadixCiphertextFFI *input_blocks,
int8_t *mem_ptr, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void *const *bsks, uint32_t num_blocks) {
@@ -12,17 +12,17 @@ void cuda_full_propagation_64_inplace(
(int_fullprop_buffer<uint64_t> *)mem_ptr;
host_full_propagate_inplace<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, input_blocks, buffer,
(uint64_t **)(ksks), ms_noise_reduction_key, bsks, num_blocks);
CudaStreams(streams), input_blocks, buffer, (uint64_t **)(ksks),
ms_noise_reduction_key, bsks, num_blocks);
}
uint64_t scratch_cuda_full_propagation_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
glwe_dimension * polynomial_size, lwe_dimension,
ks_level, ks_base_log, pbs_level, pbs_base_log,
@@ -30,112 +30,105 @@ uint64_t scratch_cuda_full_propagation_64(
noise_reduction_type);
return scratch_cuda_full_propagation<uint64_t>(
(cudaStream_t *)streams, gpu_indexes, gpu_count,
(int_fullprop_buffer<uint64_t> **)mem_ptr, params, allocate_gpu_memory);
CudaStreams(streams), (int_fullprop_buffer<uint64_t> **)mem_ptr, params,
allocate_gpu_memory);
}
void cleanup_cuda_full_propagation(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count, int8_t **mem_ptr_void) {
void cleanup_cuda_full_propagation(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
int_fullprop_buffer<uint64_t> *mem_ptr =
(int_fullprop_buffer<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
*mem_ptr_void = nullptr;
}
uint64_t scratch_cuda_propagate_single_carry_kb_64_inplace(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t requested_flag,
uint32_t uses_carry, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
message_modulus, carry_modulus, noise_reduction_type);
return scratch_cuda_propagate_single_carry_kb_inplace<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_sc_prop_memory<uint64_t> **)mem_ptr, num_blocks, params,
requested_flag, uses_carry, allocate_gpu_memory);
}
uint64_t scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t requested_flag,
uint32_t uses_carry, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
message_modulus, carry_modulus, noise_reduction_type);
return scratch_cuda_propagate_single_carry_kb_inplace<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_sc_prop_memory<uint64_t> **)mem_ptr, num_blocks, params,
requested_flag, uses_carry, allocate_gpu_memory);
}
uint64_t scratch_cuda_integer_overflowing_sub_kb_64_inplace(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t compute_overflow,
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, uint32_t requested_flag, uint32_t uses_carry,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
message_modulus, carry_modulus, noise_reduction_type);
return scratch_cuda_propagate_single_carry_kb_inplace<uint64_t>(
CudaStreams(streams), (int_sc_prop_memory<uint64_t> **)mem_ptr,
num_blocks, params, requested_flag, uses_carry, allocate_gpu_memory);
}
uint64_t scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, uint32_t requested_flag, uint32_t uses_carry,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
message_modulus, carry_modulus, noise_reduction_type);
return scratch_cuda_propagate_single_carry_kb_inplace<uint64_t>(
CudaStreams(streams), (int_sc_prop_memory<uint64_t> **)mem_ptr,
num_blocks, params, requested_flag, uses_carry, allocate_gpu_memory);
}
uint64_t scratch_cuda_integer_overflowing_sub_kb_64_inplace(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, uint32_t compute_overflow, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
message_modulus, carry_modulus, noise_reduction_type);
return scratch_cuda_integer_overflowing_sub<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_borrow_prop_memory<uint64_t> **)mem_ptr, num_blocks, params,
compute_overflow, allocate_gpu_memory);
CudaStreams(streams), (int_borrow_prop_memory<uint64_t> **)mem_ptr,
num_blocks, params, compute_overflow, allocate_gpu_memory);
}
void cuda_propagate_single_carry_kb_64_inplace(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *lwe_array, CudaRadixCiphertextFFI *carry_out,
const CudaRadixCiphertextFFI *carry_in, int8_t *mem_ptr, void *const *bsks,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t requested_flag, uint32_t uses_carry) {
host_propagate_single_carry<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array, carry_out,
carry_in, (int_sc_prop_memory<uint64_t> *)mem_ptr, bsks,
(uint64_t **)(ksks), ms_noise_reduction_key, requested_flag, uses_carry);
}
void cuda_add_and_propagate_single_carry_kb_64_inplace(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *lhs_array, const CudaRadixCiphertextFFI *rhs_array,
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array,
CudaRadixCiphertextFFI *carry_out, const CudaRadixCiphertextFFI *carry_in,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t requested_flag, uint32_t uses_carry) {
host_propagate_single_carry<uint64_t>(
CudaStreams(streams), lwe_array, carry_out, carry_in,
(int_sc_prop_memory<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key, requested_flag, uses_carry);
}
void cuda_add_and_propagate_single_carry_kb_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lhs_array,
const CudaRadixCiphertextFFI *rhs_array, CudaRadixCiphertextFFI *carry_out,
const CudaRadixCiphertextFFI *carry_in, int8_t *mem_ptr, void *const *bsks,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t requested_flag, uint32_t uses_carry) {
host_add_and_propagate_single_carry<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lhs_array, rhs_array,
carry_out, carry_in, (int_sc_prop_memory<uint64_t> *)mem_ptr, bsks,
(uint64_t **)(ksks), ms_noise_reduction_key, requested_flag, uses_carry);
CudaStreams(streams), lhs_array, rhs_array, carry_out, carry_in,
(int_sc_prop_memory<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key, requested_flag, uses_carry);
}
void cuda_integer_overflowing_sub_kb_64_inplace(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *lhs_array, const CudaRadixCiphertextFFI *rhs_array,
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lhs_array,
const CudaRadixCiphertextFFI *rhs_array,
CudaRadixCiphertextFFI *overflow_block,
const CudaRadixCiphertextFFI *input_borrow, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
@@ -143,57 +136,50 @@ void cuda_integer_overflowing_sub_kb_64_inplace(
uint32_t compute_overflow, uint32_t uses_input_borrow) {
PUSH_RANGE("overflow sub")
host_integer_overflowing_sub<uint64_t>(
(cudaStream_t const *)streams, gpu_indexes, gpu_count, lhs_array,
lhs_array, rhs_array, overflow_block, input_borrow,
(int_borrow_prop_memory<uint64_t> *)mem_ptr, bsks, (uint64_t **)ksks,
ms_noise_reduction_key, compute_overflow, uses_input_borrow);
CudaStreams(streams), lhs_array, lhs_array, rhs_array, overflow_block,
input_borrow, (int_borrow_prop_memory<uint64_t> *)mem_ptr, bsks,
(uint64_t **)ksks, ms_noise_reduction_key, compute_overflow,
uses_input_borrow);
POP_RANGE()
}
void cleanup_cuda_propagate_single_carry(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
void cleanup_cuda_propagate_single_carry(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
PUSH_RANGE("cleanup propagate sc")
int_sc_prop_memory<uint64_t> *mem_ptr =
(int_sc_prop_memory<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
*mem_ptr_void = nullptr;
POP_RANGE()
}
void cleanup_cuda_add_and_propagate_single_carry(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
void cleanup_cuda_add_and_propagate_single_carry(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
PUSH_RANGE("cleanup add & propagate sc")
int_sc_prop_memory<uint64_t> *mem_ptr =
(int_sc_prop_memory<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
*mem_ptr_void = nullptr;
POP_RANGE()
}
void cleanup_cuda_integer_overflowing_sub(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
void cleanup_cuda_integer_overflowing_sub(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
PUSH_RANGE("cleanup overflow sub")
int_borrow_prop_memory<uint64_t> *mem_ptr =
(int_borrow_prop_memory<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
*mem_ptr_void = nullptr;
POP_RANGE()
}
uint64_t scratch_cuda_apply_univariate_lut_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_radix_blocks,
CudaStreamsFFI streams, int8_t **mem_ptr, void const *input_lut,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_radix_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
uint64_t lut_degree, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type) {
@@ -204,18 +190,16 @@ uint64_t scratch_cuda_apply_univariate_lut_kb_64(
noise_reduction_type);
return scratch_cuda_apply_univariate_lut_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_radix_lut<uint64_t> **)mem_ptr,
CudaStreams(streams), (int_radix_lut<uint64_t> **)mem_ptr,
static_cast<const uint64_t *>(input_lut), num_radix_blocks, params,
lut_degree, allocate_gpu_memory);
}
uint64_t scratch_cuda_apply_many_univariate_lut_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_radix_blocks,
CudaStreamsFFI streams, int8_t **mem_ptr, void const *input_lut,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_radix_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
uint32_t num_many_lut, uint64_t lut_degree, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type) {
@@ -226,58 +210,52 @@ uint64_t scratch_cuda_apply_many_univariate_lut_kb_64(
noise_reduction_type);
return scratch_cuda_apply_many_univariate_lut_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_radix_lut<uint64_t> **)mem_ptr,
CudaStreams(streams), (int_radix_lut<uint64_t> **)mem_ptr,
static_cast<const uint64_t *>(input_lut), num_radix_blocks, params,
num_many_lut, lut_degree, allocate_gpu_memory);
}
void cuda_apply_univariate_lut_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *output_radix_lwe,
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void *const *bsks) {
host_apply_univariate_lut_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, output_radix_lwe,
input_radix_lwe, (int_radix_lut<uint64_t> *)mem_ptr, (uint64_t **)(ksks),
CudaStreams(streams), output_radix_lwe, input_radix_lwe,
(int_radix_lut<uint64_t> *)mem_ptr, (uint64_t **)(ksks),
ms_noise_reduction_key, bsks);
}
void cleanup_cuda_apply_univariate_lut_kb_64(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
void cleanup_cuda_apply_univariate_lut_kb_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
PUSH_RANGE("cleanup univar lut")
int_radix_lut<uint64_t> *mem_ptr = (int_radix_lut<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
*mem_ptr_void = nullptr;
POP_RANGE()
}
void cuda_apply_many_univariate_lut_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *output_radix_lwe,
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void *const *bsks, uint32_t num_many_lut, uint32_t lut_stride) {
host_apply_many_univariate_lut_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, output_radix_lwe,
input_radix_lwe, (int_radix_lut<uint64_t> *)mem_ptr, (uint64_t **)(ksks),
CudaStreams(streams), output_radix_lwe, input_radix_lwe,
(int_radix_lut<uint64_t> *)mem_ptr, (uint64_t **)(ksks),
ms_noise_reduction_key, bsks, num_many_lut, lut_stride);
}
uint64_t scratch_cuda_apply_bivariate_lut_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_radix_blocks,
CudaStreamsFFI streams, int8_t **mem_ptr, void const *input_lut,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_radix_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
uint64_t lut_degree, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type) {
@@ -288,15 +266,13 @@ uint64_t scratch_cuda_apply_bivariate_lut_kb_64(
noise_reduction_type);
return scratch_cuda_apply_bivariate_lut_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_radix_lut<uint64_t> **)mem_ptr,
CudaStreams(streams), (int_radix_lut<uint64_t> **)mem_ptr,
static_cast<const uint64_t *>(input_lut), num_radix_blocks, params,
lut_degree, allocate_gpu_memory);
}
void cuda_apply_bivariate_lut_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *output_radix_lwe,
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
CudaRadixCiphertextFFI const *input_radix_lwe_1,
CudaRadixCiphertextFFI const *input_radix_lwe_2, int8_t *mem_ptr,
void *const *ksks,
@@ -304,30 +280,27 @@ void cuda_apply_bivariate_lut_kb_64(
void *const *bsks, uint32_t num_radix_blocks, uint32_t shift) {
host_apply_bivariate_lut_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, output_radix_lwe,
input_radix_lwe_1, input_radix_lwe_2, (int_radix_lut<uint64_t> *)mem_ptr,
CudaStreams(streams), output_radix_lwe, input_radix_lwe_1,
input_radix_lwe_2, (int_radix_lut<uint64_t> *)mem_ptr,
(uint64_t **)(ksks), ms_noise_reduction_key, bsks, num_radix_blocks,
shift);
}
void cleanup_cuda_apply_bivariate_lut_kb_64(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
void cleanup_cuda_apply_bivariate_lut_kb_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
PUSH_RANGE("cleanup bivar lut")
int_radix_lut<uint64_t> *mem_ptr = (int_radix_lut<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
*mem_ptr_void = nullptr;
POP_RANGE()
}
uint64_t scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_radix_blocks,
CudaStreamsFFI streams, int8_t **mem_ptr, void const *input_lut,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_radix_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
uint64_t lut_degree, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type) {
@@ -339,42 +312,36 @@ uint64_t scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(
noise_reduction_type);
return scratch_cuda_apply_bivariate_lut_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_radix_lut<uint64_t> **)mem_ptr,
CudaStreams(streams), (int_radix_lut<uint64_t> **)mem_ptr,
static_cast<const uint64_t *>(input_lut), num_radix_blocks, params,
lut_degree, allocate_gpu_memory);
}
void cuda_integer_compute_prefix_sum_hillis_steele_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *output_radix_lwe,
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
CudaRadixCiphertextFFI *generates_or_propagates, int8_t *mem_ptr,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void *const *bsks, uint32_t num_radix_blocks) {
host_compute_prefix_sum_hillis_steele<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, output_radix_lwe,
generates_or_propagates, (int_radix_lut<uint64_t> *)mem_ptr, bsks,
(uint64_t **)(ksks), ms_noise_reduction_key, num_radix_blocks);
CudaStreams(streams), output_radix_lwe, generates_or_propagates,
(int_radix_lut<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key, num_radix_blocks);
}
void cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void) {
CudaStreamsFFI streams, int8_t **mem_ptr_void) {
int_radix_lut<uint64_t> *mem_ptr = (int_radix_lut<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
*mem_ptr_void = nullptr;
}
void cuda_integer_reverse_blocks_64_inplace(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
void cuda_integer_reverse_blocks_64_inplace(CudaStreamsFFI streams,
CudaRadixCiphertextFFI *lwe_array) {
host_radix_blocks_reverse_inplace<uint64_t>((cudaStream_t *)(streams),
gpu_indexes, lwe_array);
host_radix_blocks_reverse_inplace<uint64_t>(CudaStreams(streams), lwe_array);
}
void reverseArray(uint64_t arr[], size_t n) {
@@ -395,30 +362,28 @@ void reverseArray(uint64_t arr[], size_t n) {
}
uint64_t scratch_cuda_apply_noise_squashing_mem(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int_radix_params params, int_noise_squashing_lut<uint64_t> **mem_ptr,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t num_radix_blocks, uint32_t original_num_blocks,
bool allocate_gpu_memory) {
CudaStreamsFFI streams, int_radix_params params,
int_noise_squashing_lut<uint64_t> **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t num_radix_blocks,
uint32_t original_num_blocks, bool allocate_gpu_memory) {
PUSH_RANGE("scratch noise squashing")
uint64_t size_tracker = 0;
*mem_ptr = new int_noise_squashing_lut<uint64_t>(
(cudaStream_t *)streams, gpu_indexes, gpu_count, params, glwe_dimension,
polynomial_size, num_radix_blocks, original_num_blocks,
allocate_gpu_memory, size_tracker);
CudaStreams(streams), params, glwe_dimension, polynomial_size,
num_radix_blocks, original_num_blocks, allocate_gpu_memory, size_tracker);
POP_RANGE()
return size_tracker;
}
uint64_t scratch_cuda_apply_noise_squashing_kb(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t input_glwe_dimension,
uint32_t input_polynomial_size, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_radix_blocks, uint32_t original_num_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t input_glwe_dimension, uint32_t input_polynomial_size,
uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_radix_blocks,
uint32_t original_num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
glwe_dimension * polynomial_size, lwe_dimension,
ks_level, ks_base_log, pbs_level, pbs_base_log,
@@ -426,15 +391,13 @@ uint64_t scratch_cuda_apply_noise_squashing_kb(
noise_reduction_type);
return scratch_cuda_apply_noise_squashing_mem(
streams, gpu_indexes, gpu_count, params,
(int_noise_squashing_lut<uint64_t> **)mem_ptr, input_glwe_dimension,
input_polynomial_size, num_radix_blocks, original_num_blocks,
allocate_gpu_memory);
streams, params, (int_noise_squashing_lut<uint64_t> **)mem_ptr,
input_glwe_dimension, input_polynomial_size, num_radix_blocks,
original_num_blocks, allocate_gpu_memory);
}
void cuda_apply_noise_squashing_kb(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *output_radix_lwe,
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
@@ -442,20 +405,18 @@ void cuda_apply_noise_squashing_kb(
PUSH_RANGE("apply noise squashing")
integer_radix_apply_noise_squashing_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, output_radix_lwe,
input_radix_lwe, (int_noise_squashing_lut<uint64_t> *)mem_ptr, bsks,
(uint64_t **)ksks, ms_noise_reduction_key);
CudaStreams(streams), output_radix_lwe, input_radix_lwe,
(int_noise_squashing_lut<uint64_t> *)mem_ptr, bsks, (uint64_t **)ksks,
ms_noise_reduction_key);
POP_RANGE()
}
void cleanup_cuda_apply_noise_squashing_kb(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
void cleanup_cuda_apply_noise_squashing_kb(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
PUSH_RANGE("cleanup noise squashing")
int_noise_squashing_lut<uint64_t> *mem_ptr =
(int_noise_squashing_lut<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
*mem_ptr_void = nullptr;
POP_RANGE()

File diff suppressed because it is too large


@@ -66,13 +66,13 @@ void generate_ids_update_degrees(uint64_t *terms_degree, size_t *h_lwe_idx_in,
* the integer radix multiplication in keyswitch->bootstrap order.
*/
uint64_t scratch_cuda_integer_mult_radix_ciphertext_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, bool const is_boolean_left, bool const is_boolean_right,
uint32_t message_modulus, uint32_t carry_modulus, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t pbs_base_log,
uint32_t pbs_level, uint32_t ks_base_log, uint32_t ks_level,
uint32_t grouping_factor, uint32_t num_radix_blocks, PBS_TYPE pbs_type,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
CudaStreamsFFI streams, int8_t **mem_ptr, bool const is_boolean_left,
bool const is_boolean_right, uint32_t message_modulus,
uint32_t carry_modulus, uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t pbs_base_log, uint32_t pbs_level,
uint32_t ks_base_log, uint32_t ks_level, uint32_t grouping_factor,
uint32_t num_radix_blocks, PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
polynomial_size * glwe_dimension, lwe_dimension,
ks_level, ks_base_log, pbs_level, pbs_base_log,
@@ -88,9 +88,9 @@ uint64_t scratch_cuda_integer_mult_radix_ciphertext_kb_64(
case 8192:
case 16384:
return scratch_cuda_integer_mult_radix_ciphertext_kb<uint64_t>(
(cudaStream_t const *)(streams), gpu_indexes, gpu_count,
(int_mul_memory<uint64_t> **)mem_ptr, is_boolean_left, is_boolean_right,
num_radix_blocks, params, allocate_gpu_memory);
CudaStreams(streams), (int_mul_memory<uint64_t> **)mem_ptr,
is_boolean_left, is_boolean_right, num_radix_blocks, params,
allocate_gpu_memory);
default:
PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
"Supported N's are powers of two in the interval [256..16384].")
@@ -125,8 +125,7 @@ uint64_t scratch_cuda_integer_mult_radix_ciphertext_kb_64(
* - 'pbs_type' selects which PBS implementation should be used
*/
void cuda_integer_mult_radix_ciphertext_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *radix_lwe_out,
CudaStreamsFFI streams, CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI const *radix_lwe_left, bool const is_bool_left,
CudaRadixCiphertextFFI const *radix_lwe_right, bool const is_bool_right,
void *const *bsks, void *const *ksks,
@@ -136,52 +135,52 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
switch (polynomial_size) {
case 256:
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<256>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
radix_lwe_left, is_bool_left, radix_lwe_right, is_bool_right, bsks,
(uint64_t **)(ksks), ms_noise_reduction_key,
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key, (int_mul_memory<uint64_t> *)mem_ptr,
num_blocks);
break;
case 512:
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<512>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
radix_lwe_left, is_bool_left, radix_lwe_right, is_bool_right, bsks,
(uint64_t **)(ksks), ms_noise_reduction_key,
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key, (int_mul_memory<uint64_t> *)mem_ptr,
num_blocks);
break;
case 1024:
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<1024>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
radix_lwe_left, is_bool_left, radix_lwe_right, is_bool_right, bsks,
(uint64_t **)(ksks), ms_noise_reduction_key,
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key, (int_mul_memory<uint64_t> *)mem_ptr,
num_blocks);
break;
case 2048:
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<2048>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
radix_lwe_left, is_bool_left, radix_lwe_right, is_bool_right, bsks,
(uint64_t **)(ksks), ms_noise_reduction_key,
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key, (int_mul_memory<uint64_t> *)mem_ptr,
num_blocks);
break;
case 4096:
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<4096>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
radix_lwe_left, is_bool_left, radix_lwe_right, is_bool_right, bsks,
(uint64_t **)(ksks), ms_noise_reduction_key,
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key, (int_mul_memory<uint64_t> *)mem_ptr,
num_blocks);
break;
case 8192:
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<8192>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
radix_lwe_left, is_bool_left, radix_lwe_right, is_bool_right, bsks,
(uint64_t **)(ksks), ms_noise_reduction_key,
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key, (int_mul_memory<uint64_t> *)mem_ptr,
num_blocks);
break;
case 16384:
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<16384>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
radix_lwe_left, is_bool_left, radix_lwe_right, is_bool_right, bsks,
(uint64_t **)(ksks), ms_noise_reduction_key,
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key, (int_mul_memory<uint64_t> *)mem_ptr,
num_blocks);
break;
default:
PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
@@ -190,26 +189,24 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
POP_RANGE()
}
void cleanup_cuda_integer_mult(void *const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void) {
void cleanup_cuda_integer_mult(CudaStreamsFFI streams, int8_t **mem_ptr_void) {
PUSH_RANGE("cleanup mul")
int_mul_memory<uint64_t> *mem_ptr =
(int_mul_memory<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
*mem_ptr_void = nullptr;
POP_RANGE()
}
uint64_t scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks_in_radix,
uint32_t max_num_radix_in_vec, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type,
bool reduce_degrees_for_single_carry_propagation, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type) {
@@ -219,15 +216,14 @@ uint64_t scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
grouping_factor, message_modulus, carry_modulus,
noise_reduction_type);
return scratch_cuda_integer_partial_sum_ciphertexts_vec_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
CudaStreams(streams),
(int_sum_ciphertexts_vec_memory<uint64_t> **)mem_ptr, num_blocks_in_radix,
max_num_radix_in_vec, reduce_degrees_for_single_carry_propagation, params,
allocate_gpu_memory);
}
void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *radix_lwe_out,
CudaStreamsFFI streams, CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI *radix_lwe_vec, int8_t *mem_ptr, void *const *bsks,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
@@ -237,19 +233,18 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
PANIC("Cuda error: input vector length should be a multiple of the "
"output's number of radix blocks")
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
CudaStreams(streams), radix_lwe_out, radix_lwe_vec, bsks,
(uint64_t **)(ksks), ms_noise_reduction_key, mem,
radix_lwe_out->num_radix_blocks,
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
}
void cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void) {
CudaStreamsFFI streams, int8_t **mem_ptr_void) {
int_sum_ciphertexts_vec_memory<uint64_t> *mem_ptr =
(int_sum_ciphertexts_vec_memory<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
*mem_ptr_void = nullptr;
}


@@ -274,24 +274,22 @@ __global__ void fill_radix_from_lsb_msb(Torus *result_blocks, Torus *lsb_blocks,
template <typename Torus>
__host__ uint64_t scratch_cuda_integer_partial_sum_ciphertexts_vec_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_sum_ciphertexts_vec_memory<Torus> **mem_ptr,
CudaStreams streams, int_sum_ciphertexts_vec_memory<Torus> **mem_ptr,
uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec,
bool reduce_degrees_for_single_carry_propagation, int_radix_params params,
bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_sum_ciphertexts_vec_memory<Torus>(
streams, gpu_indexes, gpu_count, params, num_blocks_in_radix,
max_num_radix_in_vec, reduce_degrees_for_single_carry_propagation,
allocate_gpu_memory, size_tracker);
streams, params, num_blocks_in_radix, max_num_radix_in_vec,
reduce_degrees_for_single_carry_propagation, allocate_gpu_memory,
size_tracker);
return size_tracker;
}
template <typename Torus>
__host__ void host_integer_partial_sum_ciphertexts_vec_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *radix_lwe_out,
CudaStreams streams, CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI *terms, void *const *bsks, uint64_t *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
int_sum_ciphertexts_vec_memory<uint64_t> *mem_ptr,
@@ -335,9 +333,9 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
return;
}
if (num_radix_in_vec == 1) {
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
radix_lwe_out, 0, num_radix_blocks,
terms, 0, num_radix_blocks);
copy_radix_ciphertext_slice_async<Torus>(
streams.stream(0), streams.gpu_index(0), radix_lwe_out, 0,
num_radix_blocks, terms, 0, num_radix_blocks);
return;
}
@@ -345,24 +343,24 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
CudaRadixCiphertextFFI terms_slice;
as_radix_ciphertext_slice<Torus>(&terms_slice, terms, num_radix_blocks,
2 * num_radix_blocks);
host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out, terms,
&terms_slice, num_radix_blocks,
host_addition<Torus>(streams.stream(0), streams.gpu_index(0), radix_lwe_out,
terms, &terms_slice, num_radix_blocks,
mem_ptr->params.message_modulus,
mem_ptr->params.carry_modulus);
return;
}
if (current_blocks != terms) {
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
current_blocks, terms);
}
cuda_memcpy_async_to_gpu(d_degrees, current_blocks->degrees,
total_blocks_in_vec * sizeof(uint64_t), streams[0],
gpu_indexes[0]);
total_blocks_in_vec * sizeof(uint64_t),
streams.stream(0), streams.gpu_index(0));
cuda_set_device(gpu_indexes[0]);
radix_vec_to_columns<<<1, num_radix_blocks, 0, streams[0]>>>(
cuda_set_device(streams.gpu_index(0));
radix_vec_to_columns<<<1, num_radix_blocks, 0, streams.stream(0)>>>(
d_columns, d_columns_counter, d_degrees, num_radix_blocks,
num_radix_in_vec);
@@ -373,19 +371,20 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
int part_count = (big_lwe_size + number_of_threads - 1) / number_of_threads;
const dim3 number_of_blocks_2d(num_radix_blocks, part_count, 1);
mem_ptr->setup_lookup_tables(streams, gpu_indexes, gpu_count,
num_radix_in_vec, current_blocks->degrees);
mem_ptr->setup_lookup_tables(streams, num_radix_in_vec,
current_blocks->degrees);
while (needs_processing) {
auto luts_message_carry = mem_ptr->luts_message_carry;
auto d_pbs_indexes_in = mem_ptr->luts_message_carry->lwe_indexes_in;
auto d_pbs_indexes_out = mem_ptr->luts_message_carry->lwe_indexes_out;
calculate_chunks<Torus>
<<<number_of_blocks_2d, number_of_threads, 0, streams[0]>>>(
<<<number_of_blocks_2d, number_of_threads, 0, streams.stream(0)>>>(
(Torus *)(current_blocks->ptr), d_columns, d_columns_counter,
chunk_size, big_lwe_size);
prepare_new_columns_and_pbs_indexes<<<1, num_radix_blocks, 0, streams[0]>>>(
prepare_new_columns_and_pbs_indexes<<<1, num_radix_blocks, 0,
streams.stream(0)>>>(
d_new_columns, d_new_columns_counter, d_pbs_indexes_in,
d_pbs_indexes_out, luts_message_carry->get_lut_indexes(0, 0), d_columns,
d_columns_counter, chunk_size);
@@ -395,17 +394,18 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
current_columns.next_accumulation(total_ciphertexts, total_messages,
needs_processing);
auto active_gpu_count = get_active_gpu_count(total_ciphertexts, gpu_count);
if (active_gpu_count == 1) {
auto active_streams = streams.active_gpu_subset(total_ciphertexts);
if (active_streams.count() == 1) {
execute_keyswitch_async<Torus>(
streams, gpu_indexes, 1, (Torus *)small_lwe_vector->ptr,
streams.subset_first_gpu(), (Torus *)small_lwe_vector->ptr,
d_pbs_indexes_in, (Torus *)current_blocks->ptr, d_pbs_indexes_in,
ksks, big_lwe_dimension, small_lwe_dimension,
mem_ptr->params.ks_base_log, mem_ptr->params.ks_level,
total_messages);
execute_pbs_async<Torus, Torus>(
streams, gpu_indexes, 1, (Torus *)current_blocks->ptr,
streams.subset_first_gpu(), (Torus *)current_blocks->ptr,
d_pbs_indexes_out, luts_message_carry->lut_vec,
luts_message_carry->lut_indexes_vec, (Torus *)small_lwe_vector->ptr,
d_pbs_indexes_in, bsks, ms_noise_reduction_key,
@@ -417,21 +417,20 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
} else {
// we just need to broadcast the indexes
luts_message_carry->broadcast_lut(streams, gpu_indexes, active_gpu_count,
false);
luts_message_carry->broadcast_lut(active_streams, false);
luts_message_carry->using_trivial_lwe_indexes = false;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, current_blocks, current_blocks, bsks,
ksks, ms_noise_reduction_key, luts_message_carry, total_ciphertexts);
streams, current_blocks, current_blocks, bsks, ksks,
ms_noise_reduction_key, luts_message_carry, total_ciphertexts);
}
cuda_set_device(gpu_indexes[0]);
cuda_set_device(streams.gpu_index(0));
std::swap(d_columns, d_new_columns);
std::swap(d_columns_counter, d_new_columns_counter);
}
calculate_final_chunk_into_radix<Torus>
<<<number_of_blocks_2d, number_of_threads, 0, streams[0]>>>(
<<<number_of_blocks_2d, number_of_threads, 0, streams.stream(0)>>>(
(Torus *)(radix_lwe_out->ptr), (Torus *)(current_blocks->ptr),
d_columns, d_columns_counter, chunk_size, big_lwe_size);
@@ -440,26 +439,25 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
auto d_pbs_indexes_in = mem_ptr->luts_message_carry->lwe_indexes_in;
auto d_pbs_indexes_out = mem_ptr->luts_message_carry->lwe_indexes_out;
prepare_final_pbs_indexes<Torus>
<<<1, 2 * num_radix_blocks, 0, streams[0]>>>(
<<<1, 2 * num_radix_blocks, 0, streams.stream(0)>>>(
d_pbs_indexes_in, d_pbs_indexes_out,
luts_message_carry->get_lut_indexes(0, 0), num_radix_blocks);
set_zero_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], current_blocks, num_radix_blocks,
num_radix_blocks + 1);
streams.stream(0), streams.gpu_index(0), current_blocks,
num_radix_blocks, num_radix_blocks + 1);
auto active_gpu_count =
get_active_gpu_count(2 * num_radix_blocks, gpu_count);
auto active_streams = streams.active_gpu_subset(2 * num_radix_blocks);
if (active_gpu_count == 1) {
if (active_streams.count() == 1) {
execute_keyswitch_async<Torus>(
streams, gpu_indexes, 1, (Torus *)small_lwe_vector->ptr,
streams.subset_first_gpu(), (Torus *)small_lwe_vector->ptr,
d_pbs_indexes_in, (Torus *)radix_lwe_out->ptr, d_pbs_indexes_in, ksks,
big_lwe_dimension, small_lwe_dimension, mem_ptr->params.ks_base_log,
mem_ptr->params.ks_level, num_radix_blocks);
execute_pbs_async<Torus, Torus>(
streams, gpu_indexes, 1, (Torus *)current_blocks->ptr,
streams.subset_first_gpu(), (Torus *)current_blocks->ptr,
d_pbs_indexes_out, luts_message_carry->lut_vec,
luts_message_carry->lut_indexes_vec, (Torus *)small_lwe_vector->ptr,
d_pbs_indexes_in, bsks, ms_noise_reduction_key,
@@ -471,24 +469,22 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
} else {
uint32_t num_blocks_in_apply_lut = 2 * num_radix_blocks;
// we just need to broadcast the indexes
luts_message_carry->broadcast_lut(streams, gpu_indexes, active_gpu_count,
false);
luts_message_carry->broadcast_lut(active_streams, false);
luts_message_carry->using_trivial_lwe_indexes = false;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, active_gpu_count, current_blocks, radix_lwe_out,
bsks, ksks, ms_noise_reduction_key, luts_message_carry,
num_blocks_in_apply_lut);
active_streams, current_blocks, radix_lwe_out, bsks, ksks,
ms_noise_reduction_key, luts_message_carry, num_blocks_in_apply_lut);
}
calculate_final_degrees(radix_lwe_out->degrees, terms->degrees,
num_radix_blocks, num_radix_in_vec, chunk_size,
mem_ptr->params.message_modulus);
cuda_set_device(gpu_indexes[0]);
cuda_set_device(streams.gpu_index(0));
CudaRadixCiphertextFFI current_blocks_slice;
as_radix_ciphertext_slice<Torus>(&current_blocks_slice, current_blocks,
num_radix_blocks, 2 * num_radix_blocks);
host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out,
host_addition<Torus>(streams.stream(0), streams.gpu_index(0), radix_lwe_out,
current_blocks, &current_blocks_slice,
num_radix_blocks, mem_ptr->params.message_modulus,
mem_ptr->params.carry_modulus);
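
In the dispatch sites above, the explicit (streams, gpu_indexes, active_gpu_count) triples are replaced by subsets derived from the stream set itself. A rough sketch of that control flow, where run_mono_gpu and run_multi_gpu are hypothetical stand-ins for the keyswitch/PBS and apply-lookup-table paths:

template <typename Torus>
void dispatch_sketch(CudaStreams streams, uint32_t num_ciphertexts) {
  // Let the set decide how many GPUs are worth activating for this workload.
  CudaStreams active_streams = streams.active_gpu_subset(num_ciphertexts);
  if (active_streams.count() == 1) {
    // Mono-GPU path: restrict the set to its first stream/GPU pair.
    run_mono_gpu<Torus>(streams.subset_first_gpu(), num_ciphertexts);
  } else {
    // Multi-GPU path: the active subset carries its own stream/GPU pairs.
    run_multi_gpu<Torus>(active_streams, num_ciphertexts);
  }
}
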
@@ -497,8 +493,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
template <typename Torus, class params>
__host__ void host_integer_mult_radix_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *radix_lwe_out,
CudaStreams streams, CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI const *radix_lwe_left, bool const is_bool_left,
CudaRadixCiphertextFFI const *radix_lwe_right, bool const is_bool_right,
void *const *bsks, uint64_t *const *ksks,
@@ -519,18 +514,16 @@ __host__ void host_integer_mult_radix_kb(
int big_lwe_size = big_lwe_dimension + 1;
if (is_bool_right) {
zero_out_if<Torus>(streams, gpu_indexes, gpu_count, radix_lwe_out,
radix_lwe_left, radix_lwe_right, mem_ptr->zero_out_mem,
mem_ptr->zero_out_predicate_lut, bsks, ksks,
ms_noise_reduction_key, num_blocks);
zero_out_if<Torus>(streams, radix_lwe_out, radix_lwe_left, radix_lwe_right,
mem_ptr->zero_out_mem, mem_ptr->zero_out_predicate_lut,
bsks, ksks, ms_noise_reduction_key, num_blocks);
return;
}
if (is_bool_left) {
zero_out_if<Torus>(streams, gpu_indexes, gpu_count, radix_lwe_out,
radix_lwe_right, radix_lwe_left, mem_ptr->zero_out_mem,
mem_ptr->zero_out_predicate_lut, bsks, ksks,
ms_noise_reduction_key, num_blocks);
zero_out_if<Torus>(streams, radix_lwe_out, radix_lwe_right, radix_lwe_left,
mem_ptr->zero_out_mem, mem_ptr->zero_out_predicate_lut,
bsks, ksks, ms_noise_reduction_key, num_blocks);
return;
}
@@ -590,27 +583,27 @@ __host__ void host_integer_mult_radix_kb(
dim3 grid(lsb_vector_block_count, 1, 1);
dim3 thds(params::degree / params::opt, 1, 1);
cuda_set_device(gpu_indexes[0]);
all_shifted_lhs_rhs<Torus, params><<<grid, thds, 0, streams[0]>>>(
cuda_set_device(streams.gpu_index(0));
all_shifted_lhs_rhs<Torus, params><<<grid, thds, 0, streams.stream(0)>>>(
(Torus *)radix_lwe_left->ptr, (Torus *)vector_result_lsb->ptr,
(Torus *)vector_result_msb.ptr, (Torus *)radix_lwe_right->ptr,
(Torus *)vector_lsb_rhs->ptr, (Torus *)vector_msb_rhs.ptr, num_blocks);
check_cuda_error(cudaGetLastError());
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, block_mul_res, block_mul_res,
vector_result_sb, bsks, ksks, ms_noise_reduction_key, luts_array,
total_block_count, luts_array->params.message_modulus);
streams, block_mul_res, block_mul_res, vector_result_sb, bsks, ksks,
ms_noise_reduction_key, luts_array, total_block_count,
luts_array->params.message_modulus);
vector_result_lsb = block_mul_res;
as_radix_ciphertext_slice<Torus>(&vector_result_msb, block_mul_res,
lsb_vector_block_count,
block_mul_res->num_radix_blocks);
cuda_set_device(gpu_indexes[0]);
cuda_set_device(streams.gpu_index(0));
fill_radix_from_lsb_msb<Torus, params>
<<<num_blocks * num_blocks, params::degree / params::opt, 0,
streams[0]>>>(
streams.stream(0)>>>(
(Torus *)vector_result_sb->ptr, (Torus *)vector_result_lsb->ptr,
(Torus *)vector_result_msb.ptr, big_lwe_size, num_blocks);
check_cuda_error(cudaGetLastError());
@@ -627,31 +620,29 @@ __host__ void host_integer_mult_radix_kb(
terms_degree_msb[i] = (b_id > r_id) ? message_modulus - 2 : 0;
}
host_integer_partial_sum_ciphertexts_vec_kb<Torus>(
streams, gpu_indexes, gpu_count, radix_lwe_out, vector_result_sb, bsks,
ksks, ms_noise_reduction_key, mem_ptr->sum_ciphertexts_mem, num_blocks,
streams, radix_lwe_out, vector_result_sb, bsks, ksks,
ms_noise_reduction_key, mem_ptr->sum_ciphertexts_mem, num_blocks,
2 * num_blocks);
auto scp_mem_ptr = mem_ptr->sc_prop_mem;
uint32_t requested_flag = outputFlag::FLAG_NONE;
uint32_t uses_carry = 0;
host_propagate_single_carry<Torus>(
streams, gpu_indexes, gpu_count, radix_lwe_out, nullptr, nullptr,
scp_mem_ptr, bsks, ksks, ms_noise_reduction_key, requested_flag,
uses_carry);
streams, radix_lwe_out, nullptr, nullptr, scp_mem_ptr, bsks, ksks,
ms_noise_reduction_key, requested_flag, uses_carry);
}
template <typename Torus>
__host__ uint64_t scratch_cuda_integer_mult_radix_ciphertext_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_mul_memory<Torus> **mem_ptr,
CudaStreams streams, int_mul_memory<Torus> **mem_ptr,
bool const is_boolean_left, bool const is_boolean_right,
uint32_t num_radix_blocks, int_radix_params params,
bool allocate_gpu_memory) {
PUSH_RANGE("scratch mul")
uint64_t size_tracker = 0;
*mem_ptr = new int_mul_memory<Torus>(
streams, gpu_indexes, gpu_count, params, is_boolean_left,
is_boolean_right, num_radix_blocks, allocate_gpu_memory, size_tracker);
*mem_ptr = new int_mul_memory<Torus>(streams, params, is_boolean_left,
is_boolean_right, num_radix_blocks,
allocate_gpu_memory, size_tracker);
POP_RANGE()
return size_tracker;
}


@@ -1,12 +1,11 @@
#include "integer/negation.cuh"
void cuda_negate_integer_radix_ciphertext_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *lwe_array_out,
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, uint32_t message_modulus,
uint32_t carry_modulus, uint32_t num_radix_blocks) {
host_integer_radix_negation<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out,
lwe_array_in, message_modulus, carry_modulus, num_radix_blocks);
host_integer_radix_negation<uint64_t>(CudaStreams(streams), lwe_array_out,
lwe_array_in, message_modulus,
carry_modulus, num_radix_blocks);
}
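
Single-GPU host helpers such as the negation above now read the launch stream and device id through accessors instead of indexing raw arrays. A small sketch under the assumption of a trivial example_kernel (both names are illustrative, not part of the library):

template <typename Torus>
__global__ void example_kernel(Torus *data, uint32_t n) {
  uint32_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    data[i] = -data[i]; // placeholder per-element work
}

template <typename Torus>
__host__ void host_example_launch(CudaStreams streams, Torus *d_data,
                                  uint32_t n) {
  // Make the device that owns stream 0 current before launching on it.
  cuda_set_device(streams.gpu_index(0));
  uint32_t threads = 256;
  uint32_t blocks = (n + threads - 1) / threads;
  example_kernel<Torus><<<blocks, threads, 0, streams.stream(0)>>>(d_data, n);
  check_cuda_error(cudaGetLastError());
}
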


@@ -55,11 +55,10 @@ device_integer_radix_negation(Torus *output, Torus const *input,
template <typename Torus>
__host__ void host_integer_radix_negation(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, uint64_t message_modulus,
uint64_t carry_modulus, uint32_t num_radix_blocks) {
cuda_set_device(gpu_indexes[0]);
cuda_set_device(streams.gpu_index(0));
if (lwe_array_out->num_radix_blocks < num_radix_blocks ||
lwe_array_in->num_radix_blocks < num_radix_blocks)
@@ -86,7 +85,7 @@ __host__ void host_integer_radix_negation(
// this
uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
device_integer_radix_negation<Torus><<<grid, thds, 0, streams[0]>>>(
device_integer_radix_negation<Torus><<<grid, thds, 0, streams.stream(0)>>>(
static_cast<Torus *>(lwe_array_out->ptr),
static_cast<Torus *>(lwe_array_in->ptr), num_radix_blocks, lwe_dimension,
message_modulus, delta);
@@ -114,24 +113,22 @@ __host__ void host_integer_radix_negation(
template <typename Torus>
__host__ uint64_t scratch_cuda_integer_overflowing_sub_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_overflowing_sub_memory<Torus> **mem_ptr,
CudaStreams streams, int_overflowing_sub_memory<Torus> **mem_ptr,
uint32_t num_blocks, int_radix_params params, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type) {
PUSH_RANGE("scratch overflowing sub")
uint64_t size_tracker = 0;
*mem_ptr = new int_overflowing_sub_memory<Torus>(
streams, gpu_indexes, gpu_count, params, num_blocks, allocate_gpu_memory,
noise_reduction_type, size_tracker);
streams, params, num_blocks, allocate_gpu_memory, noise_reduction_type,
size_tracker);
POP_RANGE()
return size_tracker;
}
template <typename Torus>
__host__ void host_integer_overflowing_sub(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *output,
CudaStreams streams, CudaRadixCiphertextFFI *output,
CudaRadixCiphertextFFI *input_left,
const CudaRadixCiphertextFFI *input_right,
CudaRadixCiphertextFFI *overflow_block,
@@ -162,13 +159,12 @@ __host__ void host_integer_overflowing_sub(
uint32_t grouping_size = num_bits_in_block;
uint32_t num_groups = (num_blocks + grouping_size - 1) / grouping_size;
auto stream = (cudaStream_t *)streams;
host_unchecked_sub_with_correcting_term<Torus>(
stream[0], gpu_indexes[0], output, input_left, input_right, num_blocks,
radix_params.message_modulus, radix_params.carry_modulus);
streams.stream(0), streams.gpu_index(0), output, input_left, input_right,
num_blocks, radix_params.message_modulus, radix_params.carry_modulus);
host_single_borrow_propagate<Torus>(
streams, gpu_indexes, gpu_count, output, overflow_block, input_borrow,
streams, output, overflow_block, input_borrow,
(int_borrow_prop_memory<Torus> *)mem_ptr, bsks, (Torus **)(ksks),
ms_noise_reduction_key, num_groups, compute_overflow, uses_input_borrow);
POP_RANGE()


@@ -1,14 +1,14 @@
#include "integer/oprf.cuh"
uint64_t scratch_cuda_integer_grouped_oprf_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks_to_process, uint32_t num_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory, uint32_t message_bits_per_block,
uint32_t total_random_bits, PBS_MS_REDUCTION_T noise_reduction_type) {
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks_to_process,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, bool allocate_gpu_memory,
uint32_t message_bits_per_block, uint32_t total_random_bits,
PBS_MS_REDUCTION_T noise_reduction_type) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
glwe_dimension * polynomial_size, lwe_dimension,
@@ -17,33 +17,30 @@ uint64_t scratch_cuda_integer_grouped_oprf_64(
noise_reduction_type);
return scratch_cuda_integer_grouped_oprf<uint64_t>(
(cudaStream_t *)streams, gpu_indexes, gpu_count,
(int_grouped_oprf_memory<uint64_t> **)mem_ptr, params,
num_blocks_to_process, num_blocks, message_bits_per_block,
CudaStreams(streams), (int_grouped_oprf_memory<uint64_t> **)mem_ptr,
params, num_blocks_to_process, num_blocks, message_bits_per_block,
total_random_bits, allocate_gpu_memory);
}
void cuda_integer_grouped_oprf_async_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *radix_lwe_out, const void *seeded_lwe_input,
uint32_t num_blocks_to_process, int8_t *mem, void *const *bsks,
CudaStreamsFFI streams, CudaRadixCiphertextFFI *radix_lwe_out,
const void *seeded_lwe_input, uint32_t num_blocks_to_process, int8_t *mem,
void *const *bsks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
host_integer_grouped_oprf<uint64_t>(
(cudaStream_t *)streams, gpu_indexes, gpu_count, radix_lwe_out,
(const uint64_t *)seeded_lwe_input, num_blocks_to_process,
(int_grouped_oprf_memory<uint64_t> *)mem, bsks, ms_noise_reduction_key);
CudaStreams(streams), radix_lwe_out, (const uint64_t *)seeded_lwe_input,
num_blocks_to_process, (int_grouped_oprf_memory<uint64_t> *)mem, bsks,
ms_noise_reduction_key);
}
void cleanup_cuda_integer_grouped_oprf_64(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
void cleanup_cuda_integer_grouped_oprf_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
int_grouped_oprf_memory<uint64_t> *mem_ptr =
(int_grouped_oprf_memory<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)streams, gpu_indexes, gpu_count);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
*mem_ptr_void = nullptr;


@@ -6,37 +6,34 @@
template <typename Torus>
uint64_t scratch_cuda_integer_grouped_oprf(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_grouped_oprf_memory<Torus> **mem_ptr,
CudaStreams streams, int_grouped_oprf_memory<Torus> **mem_ptr,
int_radix_params params, uint32_t num_blocks_to_process,
uint32_t num_blocks, uint32_t message_bits_per_block,
uint64_t total_random_bits, bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_grouped_oprf_memory<Torus>(
streams, gpu_indexes, gpu_count, params, num_blocks_to_process,
num_blocks, message_bits_per_block, total_random_bits,
allocate_gpu_memory, size_tracker);
streams, params, num_blocks_to_process, num_blocks,
message_bits_per_block, total_random_bits, allocate_gpu_memory,
size_tracker);
return size_tracker;
}
template <typename Torus>
void host_integer_grouped_oprf(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *radix_lwe_out,
CudaStreams streams, CudaRadixCiphertextFFI *radix_lwe_out,
const Torus *seeded_lwe_input, uint32_t num_blocks_to_process,
int_grouped_oprf_memory<Torus> *mem_ptr, void *const *bsks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
auto active_gpu_count =
get_active_gpu_count(num_blocks_to_process, gpu_count);
auto active_streams = streams.active_gpu_subset(num_blocks_to_process);
auto lut = mem_ptr->luts;
if (active_gpu_count == 1) {
if (active_streams.count() == 1) {
execute_pbs_async<Torus, Torus>(
streams, gpu_indexes, (uint32_t)1, (Torus *)(radix_lwe_out->ptr),
lut->lwe_indexes_out, lut->lut_vec, lut->lut_indexes_vec,
streams, (Torus *)(radix_lwe_out->ptr), lut->lwe_indexes_out,
lut->lut_vec, lut->lut_indexes_vec,
const_cast<Torus *>(seeded_lwe_input), lut->lwe_indexes_in, bsks,
ms_noise_reduction_key, lut->buffer, mem_ptr->params.glwe_dimension,
mem_ptr->params.small_lwe_dimension, mem_ptr->params.polynomial_size,
@@ -48,9 +45,11 @@ void host_integer_grouped_oprf(
std::vector<Torus *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;
cuda_event_record(lut->event_scatter_in, streams[0], gpu_indexes[0]);
for (int j = 1; j < active_gpu_count; j++) {
cuda_stream_wait_event(streams[j], lut->event_scatter_in, gpu_indexes[j]);
cuda_event_record(lut->event_scatter_in, streams.stream(0),
streams.gpu_index(0));
for (int j = 1; j < active_streams.count(); j++) {
cuda_stream_wait_event(streams.stream(j), lut->event_scatter_in,
streams.gpu_index(j));
}
if (!lut->using_trivial_lwe_indexes) {
@@ -58,35 +57,35 @@ void host_integer_grouped_oprf(
}
multi_gpu_scatter_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
seeded_lwe_input, lut->lwe_indexes_in, lut->using_trivial_lwe_indexes,
lut->lwe_aligned_vec, active_gpu_count, num_blocks_to_process,
active_streams, lwe_array_in_vec, seeded_lwe_input, lut->lwe_indexes_in,
lut->using_trivial_lwe_indexes, lut->lwe_aligned_vec,
active_streams.count(), num_blocks_to_process,
mem_ptr->params.small_lwe_dimension + 1);
execute_pbs_async<Torus, Torus>(
streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
lwe_array_in_vec, lwe_trivial_indexes_vec, bsks, ms_noise_reduction_key,
lut->buffer, mem_ptr->params.glwe_dimension,
mem_ptr->params.small_lwe_dimension, mem_ptr->params.polynomial_size,
mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
mem_ptr->params.grouping_factor, num_blocks_to_process,
mem_ptr->params.pbs_type, 1, 0);
active_streams, lwe_after_pbs_vec, lwe_trivial_indexes_vec,
lut->lut_vec, lut->lut_indexes_vec, lwe_array_in_vec,
lwe_trivial_indexes_vec, bsks, ms_noise_reduction_key, lut->buffer,
mem_ptr->params.glwe_dimension, mem_ptr->params.small_lwe_dimension,
mem_ptr->params.polynomial_size, mem_ptr->params.pbs_base_log,
mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor,
num_blocks_to_process, mem_ptr->params.pbs_type, 1, 0);
multi_gpu_gather_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, (Torus *)radix_lwe_out->ptr,
lwe_after_pbs_vec, lut->lwe_indexes_out, lut->using_trivial_lwe_indexes,
active_streams, (Torus *)radix_lwe_out->ptr, lwe_after_pbs_vec,
lut->lwe_indexes_out, lut->using_trivial_lwe_indexes,
lut->lwe_aligned_vec, num_blocks_to_process,
mem_ptr->params.big_lwe_dimension + 1);
// other gpus record their events
for (int j = 1; j < active_gpu_count; j++) {
cuda_event_record(lut->event_scatter_out[j], streams[j], gpu_indexes[j]);
for (int j = 1; j < active_streams.count(); j++) {
cuda_event_record(lut->event_scatter_out[j], streams.stream(j),
streams.gpu_index(j));
}
// GPU 0 waits for all
for (int j = 1; j < active_gpu_count; j++) {
cuda_stream_wait_event(streams[0], lut->event_scatter_out[j],
gpu_indexes[0]);
for (int j = 1; j < active_streams.count(); j++) {
cuda_stream_wait_event(streams.stream(0), lut->event_scatter_out[j],
streams.gpu_index(0));
}
}
@@ -96,9 +95,9 @@ void host_integer_grouped_oprf(
radix_lwe_out->noise_levels[i] = NoiseLevel::NOMINAL;
}
host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out, radix_lwe_out,
mem_ptr->plaintext_corrections, num_blocks_to_process,
mem_ptr->params.message_modulus,
host_addition<Torus>(streams.stream(0), streams.gpu_index(0), radix_lwe_out,
radix_lwe_out, mem_ptr->plaintext_corrections,
num_blocks_to_process, mem_ptr->params.message_modulus,
mem_ptr->params.carry_modulus);
}
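
The multi-GPU branch of the OPRF above keeps the fork/join event pattern: GPU 0 records an event that every worker stream waits on before the scatter, and after the gather each worker records an event that GPU 0 waits on. A condensed sketch of just that synchronization, assuming the events were allocated elsewhere (as lut->event_scatter_in/out are in the real code):

void fork_join_sketch(CudaStreams active_streams, cudaEvent_t fork_event,
                      cudaEvent_t *join_events) {
  // Fork: worker streams wait until GPU 0 has reached this point.
  cuda_event_record(fork_event, active_streams.stream(0),
                    active_streams.gpu_index(0));
  for (uint32_t j = 1; j < active_streams.count(); j++)
    cuda_stream_wait_event(active_streams.stream(j), fork_event,
                           active_streams.gpu_index(j));

  // ... scatter, per-GPU keyswitch/PBS and gather would run here ...

  // Join: GPU 0 waits until every worker stream has finished its share.
  for (uint32_t j = 1; j < active_streams.count(); j++) {
    cuda_event_record(join_events[j], active_streams.stream(j),
                      active_streams.gpu_index(j));
    cuda_stream_wait_event(active_streams.stream(0), join_events[j],
                           active_streams.gpu_index(0));
  }
}
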


@@ -1,13 +1,12 @@
#include "integer/scalar_addition.cuh"
void cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *lwe_array, void const *scalar_input,
void const *h_scalar_input, uint32_t num_scalars, uint32_t message_modulus,
uint32_t carry_modulus) {
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array,
void const *scalar_input, void const *h_scalar_input, uint32_t num_scalars,
uint32_t message_modulus, uint32_t carry_modulus) {
host_integer_radix_scalar_addition_inplace<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array,
CudaStreams(streams), lwe_array,
static_cast<const uint64_t *>(scalar_input),
static_cast<const uint64_t *>(h_scalar_input), num_scalars,
message_modulus, carry_modulus);


@@ -7,6 +7,7 @@
#endif
#include "device.h"
#include "helper_multi_gpu.h"
#include "radix_ciphertext.cuh"
#include "utils/kernel_dimensions.cuh"
#include <stdio.h>
@@ -25,14 +26,13 @@ __global__ void device_integer_radix_scalar_addition_inplace(
template <typename Torus>
__host__ void host_integer_radix_scalar_addition_inplace(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array,
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array,
Torus const *scalar_input, Torus const *h_scalar_input,
uint32_t num_scalars, uint32_t message_modulus, uint32_t carry_modulus) {
if (lwe_array->num_radix_blocks < num_scalars)
PANIC("Cuda error: num scalars should be smaller or equal to input num "
"radix blocks")
cuda_set_device(gpu_indexes[0]);
cuda_set_device(streams.gpu_index(0));
// Create a 1-dimensional grid of threads
int num_blocks = 0, num_threads = 0;
@@ -47,9 +47,9 @@ __host__ void host_integer_radix_scalar_addition_inplace(
uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
device_integer_radix_scalar_addition_inplace<Torus>
<<<grid, thds, 0, streams[0]>>>((Torus *)lwe_array->ptr, scalar_input,
num_scalars, lwe_array->lwe_dimension,
delta);
<<<grid, thds, 0, streams.stream(0)>>>((Torus *)lwe_array->ptr,
scalar_input, num_scalars,
lwe_array->lwe_dimension, delta);
check_cuda_error(cudaGetLastError());
for (uint i = 0; i < num_scalars; i++) {
lwe_array->degrees[i] = lwe_array->degrees[i] + h_scalar_input[i];
@@ -70,10 +70,9 @@ __global__ void device_integer_radix_add_scalar_one_inplace(
template <typename Torus>
__host__ void host_integer_radix_add_scalar_one_inplace(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array,
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array,
uint32_t message_modulus, uint32_t carry_modulus) {
cuda_set_device(gpu_indexes[0]);
cuda_set_device(streams.gpu_index(0));
// Create a 1-dimensional grid of threads
int num_blocks = 0, num_threads = 0;
@@ -88,7 +87,7 @@ __host__ void host_integer_radix_add_scalar_one_inplace(
uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
device_integer_radix_add_scalar_one_inplace<Torus>
<<<grid, thds, 0, streams[0]>>>((Torus *)lwe_array->ptr,
<<<grid, thds, 0, streams.stream(0)>>>((Torus *)lwe_array->ptr,
lwe_array->num_radix_blocks,
lwe_array->lwe_dimension, delta);
check_cuda_error(cudaGetLastError());
@@ -113,11 +112,10 @@ __global__ void device_integer_radix_scalar_subtraction_inplace(
template <typename Torus>
__host__ void host_integer_radix_scalar_subtraction_inplace(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array, Torus *scalar_input,
CudaStreams streams, Torus *lwe_array, Torus *scalar_input,
uint32_t lwe_dimension, uint32_t input_lwe_ciphertext_count,
uint32_t message_modulus, uint32_t carry_modulus) {
cuda_set_device(gpu_indexes[0]);
cuda_set_device(streams.gpu_index(0));
// Create a 1-dimensional grid of threads
int num_blocks = 0, num_threads = 0;
@@ -132,9 +130,9 @@ __host__ void host_integer_radix_scalar_subtraction_inplace(
uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
device_integer_radix_scalar_subtraction_inplace<Torus>
<<<grid, thds, 0, streams[0]>>>(lwe_array, scalar_input,
input_lwe_ciphertext_count, lwe_dimension,
delta);
<<<grid, thds, 0, streams.stream(0)>>>(lwe_array, scalar_input,
input_lwe_ciphertext_count,
lwe_dimension, delta);
check_cuda_error(cudaGetLastError());
}
#endif


@@ -1,16 +1,15 @@
#include "integer/scalar_bitops.cuh"
void cuda_scalar_bitop_integer_radix_ciphertext_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *lwe_array_out,
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_input, void const *clear_blocks,
void const *h_clear_blocks, uint32_t num_clear_blocks, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
host_integer_radix_scalar_bitop_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out,
lwe_array_input, static_cast<const uint64_t *>(clear_blocks),
CudaStreams(streams), lwe_array_out, lwe_array_input,
static_cast<const uint64_t *>(clear_blocks),
static_cast<const uint64_t *>(h_clear_blocks), num_clear_blocks,
(int_bitop_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key);


@@ -6,8 +6,7 @@
template <typename Torus>
__host__ void host_integer_radix_scalar_bitop_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *output,
CudaStreams streams, CudaRadixCiphertextFFI *output,
CudaRadixCiphertextFFI const *input, Torus const *clear_blocks,
Torus const *h_clear_blocks, uint32_t num_clear_blocks,
int_bitop_buffer<Torus> *mem_ptr, void *const *bsks, Torus *const *ksks,
@@ -23,12 +22,12 @@ __host__ void host_integer_radix_scalar_bitop_kb(
if (num_clear_blocks == 0) {
if (op == SCALAR_BITAND) {
set_zero_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
output, 0, num_radix_blocks);
set_zero_radix_ciphertext_slice_async<Torus>(
streams.stream(0), streams.gpu_index(0), output, 0, num_radix_blocks);
} else {
if (input != output)
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], output,
input);
copy_radix_ciphertext_async<Torus>(streams.stream(0),
streams.gpu_index(0), output, input);
}
} else {
// We have all possible LUTs pre-computed and we use the decomposed scalar
@@ -45,19 +44,19 @@ __host__ void host_integer_radix_scalar_bitop_kb(
input->degrees, num_clear_blocks);
}
cuda_memcpy_async_gpu_to_gpu(lut->get_lut_indexes(0, 0), clear_blocks,
num_clear_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
auto active_gpu_count = get_active_gpu_count(num_clear_blocks, gpu_count);
lut->broadcast_lut(streams, gpu_indexes, active_gpu_count, false);
num_clear_blocks * sizeof(Torus),
streams.stream(0), streams.gpu_index(0));
auto active_streams = streams.active_gpu_subset(num_clear_blocks);
lut->broadcast_lut(active_streams, false);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, output, input, bsks, ksks,
ms_noise_reduction_key, lut, num_clear_blocks);
streams, output, input, bsks, ksks, ms_noise_reduction_key, lut,
num_clear_blocks);
memcpy(output->degrees, degrees, num_clear_blocks * sizeof(uint64_t));
if (op == SCALAR_BITAND && num_clear_blocks < num_radix_blocks) {
set_zero_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
output, num_clear_blocks,
set_zero_radix_ciphertext_slice_async<Torus>(
streams.stream(0), streams.gpu_index(0), output, num_clear_blocks,
num_radix_blocks);
}
}
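
The scalar bitop path above also shows the recurring LUT idiom: the per-block LUT selectors are uploaded to GPU 0, the LUT is broadcast to the active subset, and only then is the univariate lookup applied on the full stream set. A sketch of just the first two steps (the surrounding function and d_lut_indexes_src are illustrative; the helper calls are the ones used above):

template <typename Torus>
void upload_and_broadcast_sketch(CudaStreams streams, int_radix_lut<Torus> *lut,
                                 const Torus *d_lut_indexes_src,
                                 uint32_t num_blocks) {
  // Upload the per-block LUT selector values onto GPU 0.
  cuda_memcpy_async_gpu_to_gpu(lut->get_lut_indexes(0, 0), d_lut_indexes_src,
                               num_blocks * sizeof(Torus), streams.stream(0),
                               streams.gpu_index(0));
  // Make the LUT state visible on every GPU of the active subset.
  CudaStreams active_streams = streams.active_gpu_subset(num_blocks);
  lut->broadcast_lut(active_streams, false);
  // integer_radix_apply_univariate_lookup_table_kb<Torus>(...) would follow,
  // running on the full `streams` set as in the diff above.
}
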


@@ -32,8 +32,7 @@ std::pair<bool, bool> get_invert_flags(COMPARISON_TYPE compare) {
}
void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *lwe_array_out,
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, void const *scalar_blocks,
void const *h_scalar_blocks, int8_t *mem_ptr, void *const *bsks,
void *const *ksks,
@@ -50,9 +49,9 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
case EQ:
case NE:
host_integer_radix_scalar_equality_check_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out,
lwe_array_in, static_cast<const uint64_t *>(scalar_blocks), buffer,
bsks, (uint64_t **)(ksks), ms_noise_reduction_key, num_radix_blocks,
CudaStreams(streams), lwe_array_out, lwe_array_in,
static_cast<const uint64_t *>(scalar_blocks), buffer, bsks,
(uint64_t **)(ksks), ms_noise_reduction_key, num_radix_blocks,
num_scalar_blocks);
break;
case GT:
@@ -63,8 +62,8 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
PANIC("Cuda error (scalar comparisons): the number of radix blocks has "
"to be even or equal to 1.")
host_integer_radix_scalar_difference_check_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out,
lwe_array_in, static_cast<const uint64_t *>(scalar_blocks),
CudaStreams(streams), lwe_array_out, lwe_array_in,
static_cast<const uint64_t *>(scalar_blocks),
static_cast<const uint64_t *>(h_scalar_blocks), buffer,
buffer->diff_buffer->operator_f, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key, num_radix_blocks, num_scalar_blocks);
@@ -75,8 +74,8 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
PANIC("Cuda error (scalar max/min): the number of radix blocks has to be "
"even.")
host_integer_radix_scalar_maxmin_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out,
lwe_array_in, static_cast<const uint64_t *>(scalar_blocks),
CudaStreams(streams), lwe_array_out, lwe_array_in,
static_cast<const uint64_t *>(scalar_blocks),
static_cast<const uint64_t *>(h_scalar_blocks), buffer, bsks,
(uint64_t **)(ksks), ms_noise_reduction_key, num_radix_blocks,
num_scalar_blocks);

View File

@@ -26,8 +26,7 @@ Torus is_x_less_than_y_given_input_borrow(Torus last_x_block,
template <typename Torus>
__host__ void scalar_compare_radix_blocks_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI *lwe_array_in, Torus *scalar_blocks,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks,
@@ -61,36 +60,33 @@ __host__ void scalar_compare_radix_blocks_kb(
// space, so (-1) % (4 * 4) = 15 = 1|1111 We then add one and get 0 = 0|0000
auto subtracted_blocks = mem_ptr->tmp_block_comparisons;
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
subtracted_blocks, lwe_array_in);
// Subtract
// Here we need the true lwe sub, not the one that comes from shortint.
host_integer_radix_scalar_subtraction_inplace<Torus>(
streams, gpu_indexes, gpu_count, (Torus *)subtracted_blocks->ptr,
scalar_blocks, big_lwe_dimension, num_radix_blocks, message_modulus,
carry_modulus);
streams, (Torus *)subtracted_blocks->ptr, scalar_blocks,
big_lwe_dimension, num_radix_blocks, message_modulus, carry_modulus);
// Apply LUT to compare to 0
auto sign_lut = mem_ptr->eq_buffer->is_non_zero_lut;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, subtracted_blocks, bsks,
ksks, ms_noise_reduction_key, sign_lut, num_radix_blocks);
streams, lwe_array_out, subtracted_blocks, bsks, ksks,
ms_noise_reduction_key, sign_lut, num_radix_blocks);
// FIXME: without this sync signed scalar eq tests fail, I don't understand
// the reason
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
// Add one
// Here Lhs can have the following values: (-1) % (message modulus * carry
// modulus), 0, 1 So the output values after the addition will be: 0, 1, 2
host_integer_radix_add_scalar_one_inplace<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, message_modulus,
carry_modulus);
streams, lwe_array_out, message_modulus, carry_modulus);
}
template <typename Torus>
__host__ void integer_radix_unsigned_scalar_difference_check_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, Torus const *scalar_blocks,
Torus const *h_scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
std::function<Torus(Torus)> sign_handler_f, void *const *bsks,
@@ -135,12 +131,11 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
// We only have to compare blocks with zero
// means scalar is zero
host_compare_blocks_with_zero<Torus>(
streams, gpu_indexes, gpu_count, mem_ptr->tmp_lwe_array_out,
lwe_array_in, mem_ptr, bsks, ksks, ms_noise_reduction_key,
num_radix_blocks, mem_ptr->is_zero_lut);
streams, mem_ptr->tmp_lwe_array_out, lwe_array_in, mem_ptr, bsks, ksks,
ms_noise_reduction_key, num_radix_blocks, mem_ptr->is_zero_lut);
are_all_comparisons_block_true<Torus>(
streams, gpu_indexes, gpu_count, mem_ptr->tmp_lwe_array_out,
mem_ptr->tmp_lwe_array_out, mem_ptr, bsks, ksks, ms_noise_reduction_key,
streams, mem_ptr->tmp_lwe_array_out, mem_ptr->tmp_lwe_array_out,
mem_ptr, bsks, ksks, ms_noise_reduction_key,
mem_ptr->tmp_lwe_array_out->num_radix_blocks);
auto scalar_last_leaf_lut_f = [sign_handler_f](Torus x) -> Torus {
@@ -151,16 +146,16 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
auto lut = mem_ptr->diff_buffer->tree_buffer->tree_last_leaf_scalar_lut;
generate_device_accumulator_with_cpu_prealloc<Torus>(
streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
lut->get_max_degree(0), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, scalar_last_leaf_lut_f, true,
mem_ptr->diff_buffer->tree_buffer->preallocated_h_lut);
auto active_gpu_count = get_active_gpu_count(1, gpu_count);
lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
lut->get_degree(0), lut->get_max_degree(0), glwe_dimension,
polynomial_size, message_modulus, carry_modulus, scalar_last_leaf_lut_f,
true, mem_ptr->diff_buffer->tree_buffer->preallocated_h_lut);
auto active_streams = streams.active_gpu_subset(1);
lut->broadcast_lut(active_streams);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out,
mem_ptr->tmp_lwe_array_out, bsks, ksks, ms_noise_reduction_key, lut, 1);
streams, lwe_array_out, mem_ptr->tmp_lwe_array_out, bsks, ksks,
ms_noise_reduction_key, lut, 1);
} else if (num_scalar_blocks < num_radix_blocks) {
// We have to handle both part of the work described above
@@ -185,9 +180,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
auto lsb_streams = mem_ptr->lsb_streams;
auto msb_streams = mem_ptr->msb_streams;
for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
}
streams.synchronize();
//////////////
// lsb
@@ -196,9 +189,9 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
as_radix_ciphertext_slice<Torus>(&rhs, lhs, num_radix_blocks / 2,
lhs->num_radix_blocks);
pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
num_lsb_radix_blocks, message_modulus);
scalar_pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], &rhs,
pack_blocks<Torus>(lsb_streams.stream(0), streams.gpu_index(0), lhs,
lwe_array_in, num_lsb_radix_blocks, message_modulus);
scalar_pack_blocks<Torus>(lsb_streams.stream(0), streams.gpu_index(0), &rhs,
scalar_blocks, num_scalar_blocks,
message_modulus);
@@ -213,31 +206,26 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
auto comparisons = mem_ptr->tmp_block_comparisons;
scalar_compare_radix_blocks_kb<Torus>(
lsb_streams, gpu_indexes, gpu_count, comparisons,
diff_buffer->tmp_packed, (Torus *)rhs.ptr, mem_ptr, bsks, ksks,
ms_noise_reduction_key, num_lsb_radix_blocks);
lsb_streams, comparisons, diff_buffer->tmp_packed, (Torus *)rhs.ptr,
mem_ptr, bsks, ksks, ms_noise_reduction_key, num_lsb_radix_blocks);
// Reduces a vec containing radix blocks that encrypts a sign
// (inferior, equal, superior) to one single radix block containing the
// final sign
tree_sign_reduction<Torus>(
lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out, comparisons,
mem_ptr->diff_buffer->tree_buffer, mem_ptr->identity_lut_f, bsks, ksks,
tree_sign_reduction<Torus>(lsb_streams, lwe_array_lsb_out, comparisons,
mem_ptr->diff_buffer->tree_buffer,
mem_ptr->identity_lut_f, bsks, ksks,
ms_noise_reduction_key, num_lsb_radix_blocks);
//////////////
// msb
host_compare_blocks_with_zero<Torus>(
msb_streams, gpu_indexes, gpu_count, &lwe_array_msb_out, &msb, mem_ptr,
bsks, ksks, ms_noise_reduction_key, num_msb_radix_blocks,
mem_ptr->is_zero_lut);
msb_streams, &lwe_array_msb_out, &msb, mem_ptr, bsks, ksks,
ms_noise_reduction_key, num_msb_radix_blocks, mem_ptr->is_zero_lut);
are_all_comparisons_block_true<Torus>(
msb_streams, gpu_indexes, gpu_count, &lwe_array_msb_out,
&lwe_array_msb_out, mem_ptr, bsks, ksks, ms_noise_reduction_key,
lwe_array_msb_out.num_radix_blocks);
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]);
cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]);
}
msb_streams, &lwe_array_msb_out, &lwe_array_msb_out, mem_ptr, bsks,
ksks, ms_noise_reduction_key, lwe_array_msb_out.num_radix_blocks);
lsb_streams.synchronize();
msb_streams.synchronize();
//////////////
// Reduce the two blocks into one final
@@ -252,17 +240,17 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
auto lut = diff_buffer->tree_buffer->tree_last_leaf_scalar_lut;
generate_device_accumulator_bivariate_with_cpu_prealloc<Torus>(
streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
lut->get_max_degree(0), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, scalar_bivariate_last_leaf_lut_f, true,
streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
lut->get_degree(0), lut->get_max_degree(0), glwe_dimension,
polynomial_size, message_modulus, carry_modulus,
scalar_bivariate_last_leaf_lut_f, true,
mem_ptr->diff_buffer->tree_buffer->preallocated_h_lut);
auto active_gpu_count = get_active_gpu_count(1, gpu_count);
lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
auto active_streams = streams.active_gpu_subset(1);
lut->broadcast_lut(active_streams);
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_lsb_out,
&lwe_array_msb_out, bsks, ksks, ms_noise_reduction_key, lut, 1,
lut->params.message_modulus);
streams, lwe_array_out, lwe_array_lsb_out, &lwe_array_msb_out, bsks,
ksks, ms_noise_reduction_key, lut, 1, lut->params.message_modulus);
} else {
if (num_radix_blocks == 1) {
@@ -282,22 +270,22 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
return (Torus)(invert_flags.second ^ overflowed);
};
uint64_t size = 0;
int_radix_lut<Torus> *one_block_lut = new int_radix_lut<Torus>(
streams, gpu_indexes, gpu_count, params, 1, 1, true, size);
int_radix_lut<Torus> *one_block_lut =
new int_radix_lut<Torus>(streams, params, 1, 1, true, size);
generate_device_accumulator_with_cpu_prealloc<Torus>(
streams[0], gpu_indexes[0], one_block_lut->get_lut(0, 0),
streams.stream(0), streams.gpu_index(0), one_block_lut->get_lut(0, 0),
one_block_lut->get_degree(0), one_block_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, one_block_lut_f, true,
mem_ptr->preallocated_h_lut);
auto active_gpu_count = get_active_gpu_count(1, gpu_count);
one_block_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
auto active_streams = streams.active_gpu_subset(1);
one_block_lut->broadcast_lut(active_streams);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, bsks,
ksks, ms_noise_reduction_key, one_block_lut, 1);
one_block_lut->release(streams, gpu_indexes, gpu_count);
streams, lwe_array_out, lwe_array_in, bsks, ksks,
ms_noise_reduction_key, one_block_lut, 1);
one_block_lut->release(streams);
delete one_block_lut;
} else {
// We only have to do the regular comparison
@@ -310,10 +298,11 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
as_radix_ciphertext_slice<Torus>(&rhs, lhs, num_radix_blocks / 2,
lhs->num_radix_blocks);
pack_blocks<Torus>(streams[0], gpu_indexes[0], lhs, lwe_array_in,
num_lsb_radix_blocks, message_modulus);
scalar_pack_blocks<Torus>(streams[0], gpu_indexes[0], &rhs, scalar_blocks,
num_scalar_blocks, message_modulus);
pack_blocks<Torus>(streams.stream(0), streams.gpu_index(0), lhs,
lwe_array_in, num_lsb_radix_blocks, message_modulus);
scalar_pack_blocks<Torus>(streams.stream(0), streams.gpu_index(0), &rhs,
scalar_blocks, num_scalar_blocks,
message_modulus);
// From this point we have half number of blocks
num_lsb_radix_blocks /= 2;
@@ -324,15 +313,14 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
// - 2 if lhs > rhs
auto comparisons = mem_ptr->tmp_lwe_array_out;
scalar_compare_radix_blocks_kb<Torus>(
streams, gpu_indexes, gpu_count, comparisons, diff_buffer->tmp_packed,
(Torus *)rhs.ptr, mem_ptr, bsks, ksks, ms_noise_reduction_key,
num_lsb_radix_blocks);
streams, comparisons, diff_buffer->tmp_packed, (Torus *)rhs.ptr,
mem_ptr, bsks, ksks, ms_noise_reduction_key, num_lsb_radix_blocks);
// Reduces a vec containing radix blocks that encrypts a sign
// (inferior, equal, superior) to one single radix block containing the
// final sign
tree_sign_reduction<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
comparisons, mem_ptr->diff_buffer->tree_buffer,
tree_sign_reduction<Torus>(streams, lwe_array_out, comparisons,
mem_ptr->diff_buffer->tree_buffer,
sign_handler_f, bsks, ksks,
ms_noise_reduction_key, num_lsb_radix_blocks);
}
@@ -341,8 +329,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
template <typename Torus>
__host__ void integer_radix_signed_scalar_difference_check_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, Torus const *scalar_blocks,
Torus const *h_scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
std::function<Torus(Torus)> sign_handler_f, void *const *bsks,
@@ -388,13 +375,11 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
// means scalar is zero
auto are_all_msb_zeros = mem_ptr->tmp_lwe_array_out;
host_compare_blocks_with_zero<Torus>(
streams, gpu_indexes, gpu_count, are_all_msb_zeros, lwe_array_in,
mem_ptr, bsks, ksks, ms_noise_reduction_key, num_radix_blocks,
mem_ptr->is_zero_lut);
streams, are_all_msb_zeros, lwe_array_in, mem_ptr, bsks, ksks,
ms_noise_reduction_key, num_radix_blocks, mem_ptr->is_zero_lut);
are_all_comparisons_block_true<Torus>(
streams, gpu_indexes, gpu_count, are_all_msb_zeros, are_all_msb_zeros,
mem_ptr, bsks, ksks, ms_noise_reduction_key,
are_all_msb_zeros->num_radix_blocks);
streams, are_all_msb_zeros, are_all_msb_zeros, mem_ptr, bsks, ksks,
ms_noise_reduction_key, are_all_msb_zeros->num_radix_blocks);
CudaRadixCiphertextFFI sign_block;
as_radix_ciphertext_slice<Torus>(&sign_block, lwe_array_in,
num_radix_blocks - 1, num_radix_blocks);
@@ -436,17 +421,17 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
auto lut = mem_ptr->diff_buffer->tree_buffer->tree_last_leaf_scalar_lut;
generate_device_accumulator_bivariate_with_cpu_prealloc<Torus>(
streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
lut->get_max_degree(0), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, scalar_bivariate_last_leaf_lut_f, true,
streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
lut->get_degree(0), lut->get_max_degree(0), glwe_dimension,
polynomial_size, message_modulus, carry_modulus,
scalar_bivariate_last_leaf_lut_f, true,
mem_ptr->diff_buffer->tree_buffer->preallocated_h_lut);
auto active_gpu_count = get_active_gpu_count(1, gpu_count);
lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
auto active_streams = streams.active_gpu_subset(1);
lut->broadcast_lut(active_streams);
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, are_all_msb_zeros,
&sign_block, bsks, ksks, ms_noise_reduction_key, lut, 1,
lut->params.message_modulus);
streams, lwe_array_out, are_all_msb_zeros, &sign_block, bsks, ksks,
ms_noise_reduction_key, lut, 1, lut->params.message_modulus);
} else if (num_scalar_blocks < num_radix_blocks) {
// We have to handle both part of the work described above
@@ -465,9 +450,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
auto lsb_streams = mem_ptr->lsb_streams;
auto msb_streams = mem_ptr->msb_streams;
for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
}
streams.synchronize();
//////////////
// lsb
@@ -476,9 +459,9 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
as_radix_ciphertext_slice<Torus>(&rhs, lhs, num_radix_blocks / 2,
lhs->num_radix_blocks);
pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
num_lsb_radix_blocks, message_modulus);
scalar_pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], &rhs,
pack_blocks<Torus>(lsb_streams.stream(0), streams.gpu_index(0), lhs,
lwe_array_in, num_lsb_radix_blocks, message_modulus);
scalar_pack_blocks<Torus>(lsb_streams.stream(0), streams.gpu_index(0), &rhs,
scalar_blocks, num_scalar_blocks,
message_modulus);
@@ -493,29 +476,26 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
auto comparisons = mem_ptr->tmp_block_comparisons;
scalar_compare_radix_blocks_kb<Torus>(
lsb_streams, gpu_indexes, gpu_count, comparisons,
diff_buffer->tmp_packed, (Torus *)rhs.ptr, mem_ptr, bsks, ksks,
ms_noise_reduction_key, num_lsb_radix_blocks);
lsb_streams, comparisons, diff_buffer->tmp_packed, (Torus *)rhs.ptr,
mem_ptr, bsks, ksks, ms_noise_reduction_key, num_lsb_radix_blocks);
// Reduces a vec containing radix blocks that encrypts a sign
// (inferior, equal, superior) to one single radix block containing the
// final sign
tree_sign_reduction<Torus>(
lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out, comparisons,
mem_ptr->diff_buffer->tree_buffer, mem_ptr->identity_lut_f, bsks, ksks,
tree_sign_reduction<Torus>(lsb_streams, lwe_array_lsb_out, comparisons,
mem_ptr->diff_buffer->tree_buffer,
mem_ptr->identity_lut_f, bsks, ksks,
ms_noise_reduction_key, num_lsb_radix_blocks);
//////////////
// msb
// We remove the last block (which is the sign)
auto are_all_msb_zeros = lwe_array_msb_out;
host_compare_blocks_with_zero<Torus>(
msb_streams, gpu_indexes, gpu_count, &are_all_msb_zeros, &msb, mem_ptr,
bsks, ksks, ms_noise_reduction_key, num_msb_radix_blocks,
mem_ptr->is_zero_lut);
msb_streams, &are_all_msb_zeros, &msb, mem_ptr, bsks, ksks,
ms_noise_reduction_key, num_msb_radix_blocks, mem_ptr->is_zero_lut);
are_all_comparisons_block_true<Torus>(
msb_streams, gpu_indexes, gpu_count, &are_all_msb_zeros,
&are_all_msb_zeros, mem_ptr, bsks, ksks, ms_noise_reduction_key,
are_all_msb_zeros.num_radix_blocks);
msb_streams, &are_all_msb_zeros, &are_all_msb_zeros, mem_ptr, bsks,
ksks, ms_noise_reduction_key, are_all_msb_zeros.num_radix_blocks);
auto sign_bit_pos = (int)log2(message_modulus) - 1;
@@ -543,30 +523,28 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
auto signed_msb_lut = mem_ptr->signed_msb_lut;
generate_device_accumulator_bivariate_with_cpu_prealloc<Torus>(
msb_streams[0], gpu_indexes[0], signed_msb_lut->get_lut(0, 0),
signed_msb_lut->get_degree(0), signed_msb_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_f, true, mem_ptr->preallocated_h_lut);
auto active_gpu_count = get_active_gpu_count(1, gpu_count);
signed_msb_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
msb_streams.stream(0), streams.gpu_index(0),
signed_msb_lut->get_lut(0, 0), signed_msb_lut->get_degree(0),
signed_msb_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
lut_f, true, mem_ptr->preallocated_h_lut);
auto active_streams = streams.active_gpu_subset(1);
signed_msb_lut->broadcast_lut(active_streams);
CudaRadixCiphertextFFI sign_block;
as_radix_ciphertext_slice<Torus>(
&sign_block, &msb, num_msb_radix_blocks - 1, num_msb_radix_blocks);
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
msb_streams, gpu_indexes, gpu_count, &lwe_array_msb_out, &sign_block,
&are_all_msb_zeros, bsks, ksks, ms_noise_reduction_key, signed_msb_lut,
1, signed_msb_lut->params.message_modulus);
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]);
cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]);
}
msb_streams, &lwe_array_msb_out, &sign_block, &are_all_msb_zeros, bsks,
ksks, ms_noise_reduction_key, signed_msb_lut, 1,
signed_msb_lut->params.message_modulus);
lsb_streams.synchronize();
msb_streams.synchronize();
//////////////
// Reduce the two blocks into one final
reduce_signs<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
lwe_array_lsb_out, mem_ptr, sign_handler_f, bsks, ksks,
ms_noise_reduction_key, 2);
reduce_signs<Torus>(streams, lwe_array_out, lwe_array_lsb_out, mem_ptr,
sign_handler_f, bsks, ksks, ms_noise_reduction_key, 2);
} else {
if (num_radix_blocks == 1) {
@@ -588,22 +566,22 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
message_modulus);
};
uint64_t size = 0;
int_radix_lut<Torus> *one_block_lut = new int_radix_lut<Torus>(
streams, gpu_indexes, gpu_count, params, 1, 1, true, size);
int_radix_lut<Torus> *one_block_lut =
new int_radix_lut<Torus>(streams, params, 1, 1, true, size);
generate_device_accumulator_with_cpu_prealloc<Torus>(
streams[0], gpu_indexes[0], one_block_lut->get_lut(0, 0),
streams.stream(0), streams.gpu_index(0), one_block_lut->get_lut(0, 0),
one_block_lut->get_degree(0), one_block_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, one_block_lut_f, true,
mem_ptr->preallocated_h_lut);
auto active_gpu_count = get_active_gpu_count(1, gpu_count);
one_block_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
auto active_streams = streams.active_gpu_subset(1);
one_block_lut->broadcast_lut(active_streams);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, bsks,
ksks, ms_noise_reduction_key, one_block_lut, 1);
one_block_lut->release(streams, gpu_indexes, gpu_count);
streams, lwe_array_out, lwe_array_in, bsks, ksks,
ms_noise_reduction_key, one_block_lut, 1);
one_block_lut->release(streams);
delete one_block_lut;
} else {
// We only have to do the regular comparison
@@ -611,9 +589,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
// total_num_radix_blocks == total_num_scalar_blocks
uint32_t num_lsb_radix_blocks = num_radix_blocks;
for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
}
streams.synchronize();
auto lsb_streams = mem_ptr->lsb_streams;
auto msb_streams = mem_ptr->msb_streams;
@@ -627,10 +603,11 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
as_radix_ciphertext_slice<Torus>(&rhs, lhs, num_radix_blocks / 2,
lhs->num_radix_blocks);
pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
num_lsb_radix_blocks - 1, message_modulus);
scalar_pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], &rhs,
scalar_blocks, num_lsb_radix_blocks - 1,
pack_blocks<Torus>(lsb_streams.stream(0), streams.gpu_index(0), lhs,
lwe_array_in, num_lsb_radix_blocks - 1,
message_modulus);
scalar_pack_blocks<Torus>(lsb_streams.stream(0), streams.gpu_index(0),
&rhs, scalar_blocks, num_lsb_radix_blocks - 1,
message_modulus);
// From this point we have half number of blocks
@@ -641,9 +618,9 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
// - 1 if lhs == rhs
// - 2 if lhs > rhs
scalar_compare_radix_blocks_kb<Torus>(
lsb_streams, gpu_indexes, gpu_count, lwe_array_ct_out,
diff_buffer->tmp_packed, (Torus *)rhs.ptr, mem_ptr, bsks, ksks,
ms_noise_reduction_key, num_lsb_radix_blocks);
lsb_streams, lwe_array_ct_out, diff_buffer->tmp_packed,
(Torus *)rhs.ptr, mem_ptr, bsks, ksks, ms_noise_reduction_key,
num_lsb_radix_blocks);
CudaRadixCiphertextFFI encrypted_sign_block;
as_radix_ciphertext_slice<Torus>(&encrypted_sign_block, lwe_array_in,
num_radix_blocks - 1, num_radix_blocks);
@@ -653,33 +630,30 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
auto trivial_sign_block = mem_ptr->tmp_trivial_sign_block;
set_trivial_radix_ciphertext_async<Torus>(
msb_streams[0], gpu_indexes[0], trivial_sign_block, scalar_sign_block,
h_scalar_sign_block, 1, message_modulus, carry_modulus);
msb_streams.stream(0), streams.gpu_index(0), trivial_sign_block,
scalar_sign_block, h_scalar_sign_block, 1, message_modulus,
carry_modulus);
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
msb_streams, gpu_indexes, gpu_count, &lwe_array_sign_out,
&encrypted_sign_block, trivial_sign_block, bsks, ksks,
ms_noise_reduction_key, mem_ptr->signed_lut, 1,
mem_ptr->signed_lut->params.message_modulus);
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]);
cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]);
}
msb_streams, &lwe_array_sign_out, &encrypted_sign_block,
trivial_sign_block, bsks, ksks, ms_noise_reduction_key,
mem_ptr->signed_lut, 1, mem_ptr->signed_lut->params.message_modulus);
lsb_streams.synchronize();
msb_streams.synchronize();
// Reduces a vec containing radix blocks that encrypts a sign
// (inferior, equal, superior) to one single radix block containing the
// final sign
reduce_signs<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
lwe_array_ct_out, mem_ptr, sign_handler_f, bsks, ksks,
ms_noise_reduction_key, num_lsb_radix_blocks + 1);
reduce_signs<Torus>(streams, lwe_array_out, lwe_array_ct_out, mem_ptr,
sign_handler_f, bsks, ksks, ms_noise_reduction_key,
num_lsb_radix_blocks + 1);
}
}
}
template <typename Torus>
__host__ void host_integer_radix_scalar_difference_check_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, Torus const *scalar_blocks,
Torus const *h_scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
std::function<Torus(Torus)> sign_handler_f, void *const *bsks,
@@ -696,21 +670,20 @@ __host__ void host_integer_radix_scalar_difference_check_kb(
if (mem_ptr->is_signed) {
// is signed and scalar is positive
integer_radix_signed_scalar_difference_check_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in,
scalar_blocks, h_scalar_blocks, mem_ptr, sign_handler_f, bsks, ksks,
ms_noise_reduction_key, num_radix_blocks, num_scalar_blocks);
streams, lwe_array_out, lwe_array_in, scalar_blocks, h_scalar_blocks,
mem_ptr, sign_handler_f, bsks, ksks, ms_noise_reduction_key,
num_radix_blocks, num_scalar_blocks);
} else {
integer_radix_unsigned_scalar_difference_check_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in,
scalar_blocks, h_scalar_blocks, mem_ptr, sign_handler_f, bsks, ksks,
ms_noise_reduction_key, num_radix_blocks, num_scalar_blocks);
streams, lwe_array_out, lwe_array_in, scalar_blocks, h_scalar_blocks,
mem_ptr, sign_handler_f, bsks, ksks, ms_noise_reduction_key,
num_radix_blocks, num_scalar_blocks);
}
}
template <typename Torus>
__host__ void host_integer_radix_scalar_maxmin_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, Torus const *scalar_blocks,
Torus const *h_scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
void *const *bsks, Torus *const *ksks,
@@ -732,9 +705,9 @@ __host__ void host_integer_radix_scalar_maxmin_kb(
// - 2 if lhs > rhs
auto sign = mem_ptr->tmp_lwe_array_out;
host_integer_radix_scalar_difference_check_kb<Torus>(
streams, gpu_indexes, gpu_count, sign, lwe_array_in, scalar_blocks,
h_scalar_blocks, mem_ptr, mem_ptr->identity_lut_f, bsks, ksks,
ms_noise_reduction_key, num_radix_blocks, num_scalar_blocks);
streams, sign, lwe_array_in, scalar_blocks, h_scalar_blocks, mem_ptr,
mem_ptr->identity_lut_f, bsks, ksks, ms_noise_reduction_key,
num_radix_blocks, num_scalar_blocks);
// There is no optimized CMUX for scalars, so we convert to a trivial
// ciphertext
@@ -742,22 +715,21 @@ __host__ void host_integer_radix_scalar_maxmin_kb(
auto lwe_array_right = mem_ptr->tmp_block_comparisons;
set_trivial_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], lwe_array_right, scalar_blocks,
streams.stream(0), streams.gpu_index(0), lwe_array_right, scalar_blocks,
h_scalar_blocks, num_scalar_blocks, params.message_modulus,
params.carry_modulus);
// Selector
// CMUX for Max or Min
host_integer_radix_cmux_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out,
mem_ptr->tmp_lwe_array_out, lwe_array_left, lwe_array_right,
mem_ptr->cmux_buffer, bsks, ksks, ms_noise_reduction_key);
host_integer_radix_cmux_kb<Torus>(streams, lwe_array_out,
mem_ptr->tmp_lwe_array_out, lwe_array_left,
lwe_array_right, mem_ptr->cmux_buffer, bsks,
ksks, ms_noise_reduction_key);
}
template <typename Torus>
__host__ void host_integer_radix_scalar_equality_check_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, Torus const *scalar_blocks,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks,
@@ -797,9 +769,7 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
num_halved_lsb_radix_blocks,
lwe_array_in->num_radix_blocks);
for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
}
streams.synchronize();
auto lsb_streams = mem_ptr->lsb_streams;
auto msb_streams = mem_ptr->msb_streams;
@@ -811,33 +781,34 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
num_halved_lsb_radix_blocks,
packed_blocks->num_radix_blocks);
if (num_lsb_radix_blocks > 1) {
pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], packed_blocks,
lwe_array_in, num_lsb_radix_blocks, message_modulus);
scalar_pack_blocks(lsb_streams[0], gpu_indexes[0], &packed_scalar,
scalar_blocks, num_scalar_blocks, message_modulus);
pack_blocks<Torus>(lsb_streams.stream(0), lsb_streams.gpu_index(0),
packed_blocks, lwe_array_in, num_lsb_radix_blocks,
message_modulus);
scalar_pack_blocks(lsb_streams.stream(0), streams.gpu_index(0),
&packed_scalar, scalar_blocks, num_scalar_blocks,
message_modulus);
cuda_memcpy_async_gpu_to_gpu(
scalar_comparison_luts->get_lut_indexes(0, 0), packed_scalar.ptr,
num_halved_scalar_blocks * sizeof(Torus), lsb_streams[0],
gpu_indexes[0]);
num_halved_scalar_blocks * sizeof(Torus), lsb_streams.stream(0),
lsb_streams.gpu_index(0));
} else if (num_lsb_radix_blocks == 1) {
copy_radix_ciphertext_slice_async<Torus>(lsb_streams[0], gpu_indexes[0],
packed_blocks, 0, 1,
copy_radix_ciphertext_slice_async<Torus>(
lsb_streams.stream(0), lsb_streams.gpu_index(0), packed_blocks, 0, 1,
lwe_array_in, 0, 1);
cuda_memcpy_async_gpu_to_gpu(
scalar_comparison_luts->get_lut_indexes(0, 0), scalar_blocks,
num_halved_scalar_blocks * sizeof(Torus), lsb_streams[0],
gpu_indexes[0]);
num_halved_scalar_blocks * sizeof(Torus), lsb_streams.stream(0),
lsb_streams.gpu_index(0));
}
auto active_gpu_count =
get_active_gpu_count(num_halved_scalar_blocks, gpu_count);
auto active_streams =
lsb_streams.active_gpu_subset(num_halved_scalar_blocks);
// We pass false because we only broadcast the indexes
scalar_comparison_luts->broadcast_lut(lsb_streams, gpu_indexes,
active_gpu_count, false);
scalar_comparison_luts->broadcast_lut(active_streams, false);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
lsb_streams, gpu_indexes, gpu_count, mem_ptr->tmp_lwe_array_out,
mem_ptr->tmp_packed_input, bsks, ksks, ms_noise_reduction_key,
scalar_comparison_luts, num_halved_lsb_radix_blocks);
lsb_streams, mem_ptr->tmp_lwe_array_out, mem_ptr->tmp_packed_input,
bsks, ksks, ms_noise_reduction_key, scalar_comparison_luts,
num_halved_lsb_radix_blocks);
}
//////////////
// msb_in
@@ -855,29 +826,27 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
}
host_compare_blocks_with_zero<Torus>(
msb_streams, gpu_indexes, gpu_count, &msb_out, &msb_in, mem_ptr, bsks,
ksks, ms_noise_reduction_key, num_msb_radix_blocks, msb_lut);
msb_streams, &msb_out, &msb_in, mem_ptr, bsks, ksks,
ms_noise_reduction_key, num_msb_radix_blocks, msb_lut);
are_all_comparisons_block_true<Torus>(
msb_streams, gpu_indexes, gpu_count, &msb_out, &msb_out, mem_ptr, bsks,
ksks, ms_noise_reduction_key, msb_out.num_radix_blocks);
msb_streams, &msb_out, &msb_out, mem_ptr, bsks, ksks,
ms_noise_reduction_key, msb_out.num_radix_blocks);
}
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]);
cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]);
}
lsb_streams.synchronize();
msb_streams.synchronize();
switch (mem_ptr->op) {
case COMPARISON_TYPE::EQ:
are_all_comparisons_block_true<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out,
mem_ptr->tmp_lwe_array_out, mem_ptr, bsks, ksks, ms_noise_reduction_key,
streams, lwe_array_out, mem_ptr->tmp_lwe_array_out, mem_ptr, bsks, ksks,
ms_noise_reduction_key,
num_halved_scalar_blocks + (num_msb_radix_blocks > 0));
break;
case COMPARISON_TYPE::NE:
is_at_least_one_comparisons_block_true<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out,
mem_ptr->tmp_lwe_array_out, mem_ptr, bsks, ksks, ms_noise_reduction_key,
streams, lwe_array_out, mem_ptr->tmp_lwe_array_out, mem_ptr, bsks, ksks,
ms_noise_reduction_key,
num_halved_scalar_blocks + (num_msb_radix_blocks > 0));
break;
default:
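
The comparison code above repeatedly forks work onto mem_ptr->lsb_streams and mem_ptr->msb_streams and joins with a single synchronize() call instead of looping over gpu_count. A hedged, single-GPU sketch of that fork/join shape using plain CUDA runtime calls; the kernels are hypothetical no-op stand-ins:

#include <cstdint>
#include <cuda_runtime.h>

__global__ void lsb_work(uint64_t *) {}
__global__ void msb_work(uint64_t *) {}

void forked_compare(cudaStream_t parent, uint64_t *d_lsb, uint64_t *d_msb) {
  // Join point: prior work queued on the parent stream must finish before
  // the side streams start reading the same buffers (the role played by
  // streams.synchronize() above).
  cudaStreamSynchronize(parent);

  cudaStream_t lsb_stream, msb_stream;
  cudaStreamCreate(&lsb_stream);
  cudaStreamCreate(&msb_stream);

  // Fork: the two halves of the comparison run concurrently.
  lsb_work<<<1, 32, 0, lsb_stream>>>(d_lsb);
  msb_work<<<1, 32, 0, msb_stream>>>(d_msb);

  // Join again before the final sign reduction reads both results; this is
  // the single-stream analogue of lsb_streams.synchronize() and
  // msb_streams.synchronize().
  cudaStreamSynchronize(lsb_stream);
  cudaStreamSynchronize(msb_stream);

  cudaStreamDestroy(lsb_stream);
  cudaStreamDestroy(msb_stream);
}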

View File

@@ -1,13 +1,13 @@
#include "scalar_div.cuh"
uint64_t scratch_cuda_integer_unsigned_scalar_div_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, const CudaScalarDivisorFFI *scalar_divisor_ffi,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type,
const CudaScalarDivisorFFI *scalar_divisor_ffi, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
glwe_dimension * polynomial_size, lwe_dimension,
@@ -16,45 +16,43 @@ uint64_t scratch_cuda_integer_unsigned_scalar_div_radix_kb_64(
noise_reduction_type);
return scratch_integer_unsigned_scalar_div_radix<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, params,
CudaStreams(streams), params,
(int_unsigned_scalar_div_mem<uint64_t> **)mem_ptr, num_blocks,
scalar_divisor_ffi, allocate_gpu_memory);
}
void cuda_integer_unsigned_scalar_div_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *numerator_ct, int8_t *mem_ptr, void *const *bsks,
void *const *ksks,
CudaStreamsFFI streams, CudaRadixCiphertextFFI *numerator_ct,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key,
const CudaScalarDivisorFFI *scalar_divisor_ffi) {
host_integer_unsigned_scalar_div_radix<uint64_t>(
(cudaStream_t *)streams, gpu_indexes, gpu_count, numerator_ct,
CudaStreams(streams), numerator_ct,
(int_unsigned_scalar_div_mem<uint64_t> *)mem_ptr, bsks, (uint64_t **)ksks,
ms_noise_reduction_key, scalar_divisor_ffi);
}
void cleanup_cuda_integer_unsigned_scalar_div_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void) {
CudaStreamsFFI streams, int8_t **mem_ptr_void) {
int_unsigned_scalar_div_mem<uint64_t> *mem_ptr =
(int_unsigned_scalar_div_mem<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)streams, gpu_indexes, gpu_count);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
*mem_ptr_void = nullptr;
}
uint64_t scratch_cuda_integer_signed_scalar_div_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, const CudaScalarDivisorFFI *scalar_divisor_ffi,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type,
const CudaScalarDivisorFFI *scalar_divisor_ffi, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
glwe_dimension * polynomial_size, lwe_dimension,
@@ -63,44 +61,42 @@ uint64_t scratch_cuda_integer_signed_scalar_div_radix_kb_64(
noise_reduction_type);
return scratch_integer_signed_scalar_div_radix_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, params,
CudaStreams(streams), params,
(int_signed_scalar_div_mem<uint64_t> **)mem_ptr, num_blocks,
scalar_divisor_ffi, allocate_gpu_memory);
}
void cuda_integer_signed_scalar_div_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *numerator_ct, int8_t *mem_ptr, void *const *bsks,
void *const *ksks,
CudaStreamsFFI streams, CudaRadixCiphertextFFI *numerator_ct,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key,
const CudaScalarDivisorFFI *scalar_divisor_ffi, uint32_t numerator_bits) {
host_integer_signed_scalar_div_radix_kb<uint64_t>(
(cudaStream_t *)streams, gpu_indexes, gpu_count, numerator_ct,
CudaStreams(streams), numerator_ct,
(int_signed_scalar_div_mem<uint64_t> *)mem_ptr, bsks, (uint64_t **)ksks,
ms_noise_reduction_key, scalar_divisor_ffi, numerator_bits);
}
void cleanup_cuda_integer_signed_scalar_div_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void cleanup_cuda_integer_signed_scalar_div_radix_kb_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
int_signed_scalar_div_mem<uint64_t> *mem_ptr =
(int_signed_scalar_div_mem<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)streams, gpu_indexes, gpu_count);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
*mem_ptr_void = nullptr;
}
uint64_t scratch_integer_unsigned_scalar_div_rem_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, const CudaScalarDivisorFFI *scalar_divisor_ffi,
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type,
const CudaScalarDivisorFFI *scalar_divisor_ffi,
uint32_t const active_bits_divisor, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type) {
@@ -111,15 +107,15 @@ uint64_t scratch_integer_unsigned_scalar_div_rem_radix_kb_64(
noise_reduction_type);
return scratch_integer_unsigned_scalar_div_rem_radix<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, params,
CudaStreams(streams), params,
(int_unsigned_scalar_div_rem_buffer<uint64_t> **)mem_ptr, num_blocks,
scalar_divisor_ffi, active_bits_divisor, allocate_gpu_memory);
}
void cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *quotient_ct, CudaRadixCiphertextFFI *remainder_ct,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
CudaStreamsFFI streams, CudaRadixCiphertextFFI *quotient_ct,
CudaRadixCiphertextFFI *remainder_ct, int8_t *mem_ptr, void *const *bsks,
void *const *ksks,
const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key,
const CudaScalarDivisorFFI *scalar_divisor_ffi,
uint64_t const *divisor_has_at_least_one_set,
@@ -128,33 +124,32 @@ void cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
uint32_t num_clear_blocks) {
host_integer_unsigned_scalar_div_rem_radix<uint64_t>(
(cudaStream_t *)streams, gpu_indexes, gpu_count, quotient_ct,
remainder_ct, (int_unsigned_scalar_div_rem_buffer<uint64_t> *)mem_ptr,
bsks, (uint64_t **)ksks, ms_noise_reduction_key, scalar_divisor_ffi,
CudaStreams(streams), quotient_ct, remainder_ct,
(int_unsigned_scalar_div_rem_buffer<uint64_t> *)mem_ptr, bsks,
(uint64_t **)ksks, ms_noise_reduction_key, scalar_divisor_ffi,
divisor_has_at_least_one_set, decomposed_divisor, num_scalars_divisor,
(uint64_t *)clear_blocks, (uint64_t *)h_clear_blocks, num_clear_blocks);
}
void cleanup_cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void) {
CudaStreamsFFI streams, int8_t **mem_ptr_void) {
int_unsigned_scalar_div_rem_buffer<uint64_t> *mem_ptr =
(int_unsigned_scalar_div_rem_buffer<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)streams, gpu_indexes, gpu_count);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
*mem_ptr_void = nullptr;
}
uint64_t scratch_integer_signed_scalar_div_rem_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, const CudaScalarDivisorFFI *scalar_divisor_ffi,
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type,
const CudaScalarDivisorFFI *scalar_divisor_ffi,
uint32_t const active_bits_divisor, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type) {
@@ -165,15 +160,15 @@ uint64_t scratch_integer_signed_scalar_div_rem_radix_kb_64(
noise_reduction_type);
return scratch_integer_signed_scalar_div_rem_radix<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, params,
CudaStreams(streams), params,
(int_signed_scalar_div_rem_buffer<uint64_t> **)mem_ptr, num_blocks,
scalar_divisor_ffi, active_bits_divisor, allocate_gpu_memory);
}
void cuda_integer_signed_scalar_div_rem_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *quotient_ct, CudaRadixCiphertextFFI *remainder_ct,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
CudaStreamsFFI streams, CudaRadixCiphertextFFI *quotient_ct,
CudaRadixCiphertextFFI *remainder_ct, int8_t *mem_ptr, void *const *bsks,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
const CudaScalarDivisorFFI *scalar_divisor_ffi,
uint64_t const *divisor_has_at_least_one_set,
@@ -181,21 +176,20 @@ void cuda_integer_signed_scalar_div_rem_radix_kb_64(
uint32_t numerator_bits) {
host_integer_signed_scalar_div_rem_radix<uint64_t>(
(cudaStream_t *)streams, gpu_indexes, gpu_count, quotient_ct,
remainder_ct, (int_signed_scalar_div_rem_buffer<uint64_t> *)mem_ptr, bsks,
CudaStreams(streams), quotient_ct, remainder_ct,
(int_signed_scalar_div_rem_buffer<uint64_t> *)mem_ptr, bsks,
(uint64_t **)ksks, ms_noise_reduction_key, scalar_divisor_ffi,
divisor_has_at_least_one_set, decomposed_divisor, num_scalars_divisor,
numerator_bits);
}
void cleanup_cuda_integer_signed_scalar_div_rem_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void) {
CudaStreamsFFI streams, int8_t **mem_ptr_void) {
int_signed_scalar_div_rem_buffer<uint64_t> *mem_ptr =
(int_signed_scalar_div_rem_buffer<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)streams, gpu_indexes, gpu_count);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
*mem_ptr_void = nullptr;
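
The scalar-division entry points above follow the backend's scratch / compute / cleanup triple, with cleanup now calling mem_ptr->release(CudaStreams(streams)) before deleting the buffer and nulling the caller's pointer. A self-contained toy model of that lifecycle (all Toy* names are hypothetical):

#include <cstdint>
#include <cstdio>

struct ToyStreams {};

struct ToyScalarDivBuffer {
  uint64_t size_on_gpu;
  explicit ToyScalarDivBuffer(ToyStreams, uint32_t num_blocks)
      : size_on_gpu(num_blocks * 1024ull) {} // placeholder size accounting
  void release(ToyStreams) { std::puts("free temporary GPU buffers"); }
};

// scratch_*: allocate, hand back an opaque pointer and the tracked size.
uint64_t toy_scratch(ToyStreams streams, int8_t **mem_ptr, uint32_t num_blocks) {
  auto *buf = new ToyScalarDivBuffer(streams, num_blocks);
  *mem_ptr = (int8_t *)buf;
  return buf->size_on_gpu;
}

// cuda_*: reinterpret the opaque pointer and do the work with it.
void toy_compute(ToyStreams, int8_t *mem_ptr) {
  auto *buf = (ToyScalarDivBuffer *)mem_ptr;
  std::printf("compute using %llu bytes of scratch\n",
              (unsigned long long)buf->size_on_gpu);
}

// cleanup_*: release with the stream set, delete, and null the caller's slot.
void toy_cleanup(ToyStreams streams, int8_t **mem_ptr_void) {
  auto *buf = (ToyScalarDivBuffer *)(*mem_ptr_void);
  buf->release(streams);
  delete buf;
  *mem_ptr_void = nullptr;
}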

View File

@@ -9,8 +9,7 @@
template <typename Torus>
__host__ uint64_t scratch_integer_unsigned_scalar_div_radix(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, const int_radix_params params,
CudaStreams streams, const int_radix_params params,
int_unsigned_scalar_div_mem<Torus> **mem_ptr, uint32_t num_radix_blocks,
const CudaScalarDivisorFFI *scalar_divisor_ffi,
const bool allocate_gpu_memory) {
@@ -18,16 +17,15 @@ __host__ uint64_t scratch_integer_unsigned_scalar_div_radix(
uint64_t size_tracker = 0;
*mem_ptr = new int_unsigned_scalar_div_mem<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
scalar_divisor_ffi, allocate_gpu_memory, size_tracker);
streams, params, num_radix_blocks, scalar_divisor_ffi,
allocate_gpu_memory, size_tracker);
return size_tracker;
}
template <typename Torus>
__host__ void host_integer_unsigned_scalar_div_radix(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *numerator_ct,
CudaStreams streams, CudaRadixCiphertextFFI *numerator_ct,
int_unsigned_scalar_div_mem<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
@@ -39,15 +37,15 @@ __host__ void host_integer_unsigned_scalar_div_radix(
if (scalar_divisor_ffi->is_divisor_pow2) {
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, numerator_ct,
scalar_divisor_ffi->ilog2_divisor, mem_ptr->logical_scalar_shift_mem,
bsks, ksks, ms_noise_reduction_key, numerator_ct->num_radix_blocks);
streams, numerator_ct, scalar_divisor_ffi->ilog2_divisor,
mem_ptr->logical_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key,
numerator_ct->num_radix_blocks);
return;
}
if (scalar_divisor_ffi->divisor_has_more_bits_than_numerator) {
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], numerator_ct,
mem_ptr->tmp_ffi);
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
numerator_ct, mem_ptr->tmp_ffi);
return;
}
@@ -63,32 +61,29 @@ __host__ void host_integer_unsigned_scalar_div_radix(
CudaRadixCiphertextFFI *numerator_cpy = mem_ptr->tmp_ffi;
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
numerator_cpy, numerator_ct);
host_integer_radix_scalar_mul_high_kb<Torus>(
streams, gpu_indexes, gpu_count, numerator_cpy,
mem_ptr->scalar_mul_high_mem, ksks, ms_noise_reduction_key, bsks,
scalar_divisor_ffi);
streams, numerator_cpy, mem_ptr->scalar_mul_high_mem, ksks,
ms_noise_reduction_key, bsks, scalar_divisor_ffi);
host_sub_and_propagate_single_carry<Torus>(
streams, gpu_indexes, gpu_count, numerator_ct, numerator_cpy, nullptr,
nullptr, mem_ptr->sub_and_propagate_mem, bsks, ksks,
ms_noise_reduction_key, FLAG_NONE, (uint32_t)0);
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, numerator_ct, (uint32_t)1,
mem_ptr->logical_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key,
numerator_ct->num_radix_blocks);
host_add_and_propagate_single_carry<Torus>(
streams, gpu_indexes, gpu_count, numerator_ct, numerator_cpy, nullptr,
nullptr, mem_ptr->scp_mem, bsks, ksks, ms_noise_reduction_key,
streams, numerator_ct, numerator_cpy, nullptr, nullptr,
mem_ptr->sub_and_propagate_mem, bsks, ksks, ms_noise_reduction_key,
FLAG_NONE, (uint32_t)0);
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, numerator_ct,
scalar_divisor_ffi->shift_post - (uint32_t)1,
streams, numerator_ct, (uint32_t)1, mem_ptr->logical_scalar_shift_mem,
bsks, ksks, ms_noise_reduction_key, numerator_ct->num_radix_blocks);
host_add_and_propagate_single_carry<Torus>(
streams, numerator_ct, numerator_cpy, nullptr, nullptr,
mem_ptr->scp_mem, bsks, ksks, ms_noise_reduction_key, FLAG_NONE,
(uint32_t)0);
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, numerator_ct, scalar_divisor_ffi->shift_post - (uint32_t)1,
mem_ptr->logical_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key,
numerator_ct->num_radix_blocks);
@@ -96,25 +91,23 @@ __host__ void host_integer_unsigned_scalar_div_radix(
}
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, numerator_ct,
scalar_divisor_ffi->shift_pre, mem_ptr->logical_scalar_shift_mem, bsks,
ksks, ms_noise_reduction_key, numerator_ct->num_radix_blocks);
streams, numerator_ct, scalar_divisor_ffi->shift_pre,
mem_ptr->logical_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key,
numerator_ct->num_radix_blocks);
host_integer_radix_scalar_mul_high_kb<Torus>(
streams, gpu_indexes, gpu_count, numerator_ct,
mem_ptr->scalar_mul_high_mem, ksks, ms_noise_reduction_key, bsks,
scalar_divisor_ffi);
streams, numerator_ct, mem_ptr->scalar_mul_high_mem, ksks,
ms_noise_reduction_key, bsks, scalar_divisor_ffi);
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, numerator_ct,
scalar_divisor_ffi->shift_post, mem_ptr->logical_scalar_shift_mem, bsks,
ksks, ms_noise_reduction_key, numerator_ct->num_radix_blocks);
streams, numerator_ct, scalar_divisor_ffi->shift_post,
mem_ptr->logical_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key,
numerator_ct->num_radix_blocks);
}
template <typename Torus>
__host__ uint64_t scratch_integer_signed_scalar_div_radix_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_radix_params params,
CudaStreams streams, int_radix_params params,
int_signed_scalar_div_mem<Torus> **mem_ptr, uint32_t num_radix_blocks,
const CudaScalarDivisorFFI *scalar_divisor_ffi,
const bool allocate_gpu_memory) {
@@ -122,16 +115,15 @@ __host__ uint64_t scratch_integer_signed_scalar_div_radix_kb(
uint64_t size_tracker = 0;
*mem_ptr = new int_signed_scalar_div_mem<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
scalar_divisor_ffi, allocate_gpu_memory, size_tracker);
streams, params, num_radix_blocks, scalar_divisor_ffi,
allocate_gpu_memory, size_tracker);
return size_tracker;
}
template <typename Torus>
__host__ void host_integer_signed_scalar_div_radix_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *numerator_ct,
CudaStreams streams, CudaRadixCiphertextFFI *numerator_ct,
int_signed_scalar_div_mem<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
@@ -142,19 +134,18 @@ __host__ void host_integer_signed_scalar_div_radix_kb(
CudaRadixCiphertextFFI *tmp = mem_ptr->tmp_ffi;
host_integer_radix_negation<Torus>(
streams, gpu_indexes, gpu_count, tmp, numerator_ct,
mem_ptr->params.message_modulus, mem_ptr->params.carry_modulus,
numerator_ct->num_radix_blocks);
streams, tmp, numerator_ct, mem_ptr->params.message_modulus,
mem_ptr->params.carry_modulus, numerator_ct->num_radix_blocks);
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
numerator_ct, tmp);
copy_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), numerator_ct, tmp);
}
return;
}
if (scalar_divisor_ffi->chosen_multiplier_has_more_bits_than_numerator) {
set_zero_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], numerator_ct, 0,
streams.stream(0), streams.gpu_index(0), numerator_ct, 0,
numerator_ct->num_radix_blocks);
return;
}
@@ -162,124 +153,114 @@ __host__ void host_integer_signed_scalar_div_radix_kb(
CudaRadixCiphertextFFI *tmp = mem_ptr->tmp_ffi;
if (scalar_divisor_ffi->is_divisor_pow2) {
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], tmp,
numerator_ct);
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
tmp, numerator_ct);
host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, tmp,
scalar_divisor_ffi->chosen_multiplier_num_bits - 1,
streams, tmp, scalar_divisor_ffi->chosen_multiplier_num_bits - 1,
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks,
ms_noise_reduction_key);
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, tmp,
streams, tmp,
numerator_bits - scalar_divisor_ffi->chosen_multiplier_num_bits,
mem_ptr->logical_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key,
tmp->num_radix_blocks);
host_add_and_propagate_single_carry<Torus>(
streams, gpu_indexes, gpu_count, tmp, numerator_ct, nullptr, nullptr,
mem_ptr->scp_mem, bsks, ksks, ms_noise_reduction_key, FLAG_NONE,
(uint32_t)0);
streams, tmp, numerator_ct, nullptr, nullptr, mem_ptr->scp_mem, bsks,
ksks, ms_noise_reduction_key, FLAG_NONE, (uint32_t)0);
host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, tmp,
scalar_divisor_ffi->chosen_multiplier_num_bits,
streams, tmp, scalar_divisor_ffi->chosen_multiplier_num_bits,
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks,
ms_noise_reduction_key);
} else if (!scalar_divisor_ffi->is_chosen_multiplier_geq_two_pow_numerator) {
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], tmp,
numerator_ct);
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
tmp, numerator_ct);
host_integer_radix_signed_scalar_mul_high_kb<Torus>(
streams, gpu_indexes, gpu_count, tmp, mem_ptr->scalar_mul_high_mem,
ksks, scalar_divisor_ffi, ms_noise_reduction_key, bsks);
streams, tmp, mem_ptr->scalar_mul_high_mem, ksks, scalar_divisor_ffi,
ms_noise_reduction_key, bsks);
host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, tmp, scalar_divisor_ffi->shift_post,
streams, tmp, scalar_divisor_ffi->shift_post,
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks,
ms_noise_reduction_key);
CudaRadixCiphertextFFI *xsign = mem_ptr->xsign_ffi;
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], xsign,
numerator_ct);
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
xsign, numerator_ct);
host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, xsign, numerator_bits - 1,
streams, xsign, numerator_bits - 1,
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks,
ms_noise_reduction_key);
host_sub_and_propagate_single_carry<Torus>(
streams, gpu_indexes, gpu_count, tmp, xsign, nullptr, nullptr,
mem_ptr->sub_and_propagate_mem, bsks, ksks, ms_noise_reduction_key,
FLAG_NONE, (uint32_t)0);
streams, tmp, xsign, nullptr, nullptr, mem_ptr->sub_and_propagate_mem,
bsks, ksks, ms_noise_reduction_key, FLAG_NONE, (uint32_t)0);
} else {
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], tmp,
numerator_ct);
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
tmp, numerator_ct);
host_integer_radix_signed_scalar_mul_high_kb<Torus>(
streams, gpu_indexes, gpu_count, tmp, mem_ptr->scalar_mul_high_mem,
ksks, scalar_divisor_ffi, ms_noise_reduction_key, bsks);
streams, tmp, mem_ptr->scalar_mul_high_mem, ksks, scalar_divisor_ffi,
ms_noise_reduction_key, bsks);
host_add_and_propagate_single_carry<Torus>(
streams, gpu_indexes, gpu_count, tmp, numerator_ct, nullptr, nullptr,
mem_ptr->scp_mem, bsks, ksks, ms_noise_reduction_key, FLAG_NONE,
(uint32_t)0);
streams, tmp, numerator_ct, nullptr, nullptr, mem_ptr->scp_mem, bsks,
ksks, ms_noise_reduction_key, FLAG_NONE, (uint32_t)0);
host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, tmp, scalar_divisor_ffi->shift_post,
streams, tmp, scalar_divisor_ffi->shift_post,
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks,
ms_noise_reduction_key);
CudaRadixCiphertextFFI *xsign = mem_ptr->xsign_ffi;
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], xsign,
numerator_ct);
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
xsign, numerator_ct);
host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, xsign, numerator_bits - 1,
streams, xsign, numerator_bits - 1,
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks,
ms_noise_reduction_key);
host_sub_and_propagate_single_carry<Torus>(
streams, gpu_indexes, gpu_count, tmp, xsign, nullptr, nullptr,
mem_ptr->sub_and_propagate_mem, bsks, ksks, ms_noise_reduction_key,
FLAG_NONE, (uint32_t)0);
streams, tmp, xsign, nullptr, nullptr, mem_ptr->sub_and_propagate_mem,
bsks, ksks, ms_noise_reduction_key, FLAG_NONE, (uint32_t)0);
}
if (scalar_divisor_ffi->is_divisor_negative) {
host_integer_radix_negation<Torus>(
streams, gpu_indexes, gpu_count, numerator_ct, tmp,
mem_ptr->params.message_modulus, mem_ptr->params.carry_modulus,
numerator_ct->num_radix_blocks);
streams, numerator_ct, tmp, mem_ptr->params.message_modulus,
mem_ptr->params.carry_modulus, numerator_ct->num_radix_blocks);
} else {
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], numerator_ct,
tmp);
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
numerator_ct, tmp);
}
}
template <typename Torus>
__host__ uint64_t scratch_integer_unsigned_scalar_div_rem_radix(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, const int_radix_params params,
CudaStreams streams, const int_radix_params params,
int_unsigned_scalar_div_rem_buffer<Torus> **mem_ptr,
uint32_t num_radix_blocks, const CudaScalarDivisorFFI *scalar_divisor_ffi,
uint32_t const active_bits_divisor, const bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_unsigned_scalar_div_rem_buffer<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
scalar_divisor_ffi, active_bits_divisor, allocate_gpu_memory,
size_tracker);
streams, params, num_radix_blocks, scalar_divisor_ffi,
active_bits_divisor, allocate_gpu_memory, size_tracker);
return size_tracker;
}
template <typename Torus>
__host__ void host_integer_unsigned_scalar_div_rem_radix(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *quotient_ct,
CudaStreams streams, CudaRadixCiphertextFFI *quotient_ct,
CudaRadixCiphertextFFI *remainder_ct,
int_unsigned_scalar_div_rem_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks,
@@ -291,32 +272,32 @@ __host__ void host_integer_unsigned_scalar_div_rem_radix(
uint32_t num_clear_blocks) {
auto numerator_ct = mem_ptr->numerator_ct;
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], numerator_ct,
quotient_ct);
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
numerator_ct, quotient_ct);
host_integer_unsigned_scalar_div_radix(
streams, gpu_indexes, gpu_count, quotient_ct, mem_ptr->unsigned_div_mem,
bsks, ksks, ms_noise_reduction_key, scalar_divisor_ffi);
streams, quotient_ct, mem_ptr->unsigned_div_mem, bsks, ksks,
ms_noise_reduction_key, scalar_divisor_ffi);
if (scalar_divisor_ffi->is_divisor_pow2) {
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], remainder_ct,
numerator_ct);
host_integer_radix_scalar_bitop_kb(
streams, gpu_indexes, gpu_count, remainder_ct, remainder_ct,
clear_blocks, h_clear_blocks, num_clear_blocks, mem_ptr->bitop_mem,
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
remainder_ct, numerator_ct);
host_integer_radix_scalar_bitop_kb(streams, remainder_ct, remainder_ct,
clear_blocks, h_clear_blocks,
num_clear_blocks, mem_ptr->bitop_mem,
bsks, ksks, ms_noise_reduction_key);
} else {
if (!scalar_divisor_ffi->is_divisor_zero) {
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
remainder_ct, quotient_ct);
copy_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), remainder_ct, quotient_ct);
if (!scalar_divisor_ffi->is_abs_divisor_one &&
remainder_ct->num_radix_blocks != 0) {
host_integer_scalar_mul_radix<Torus>(
streams, gpu_indexes, gpu_count, remainder_ct, decomposed_divisor,
streams, remainder_ct, decomposed_divisor,
divisor_has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks, ksks,
ms_noise_reduction_key, mem_ptr->params.message_modulus,
num_scalars_divisor);
@@ -324,19 +305,18 @@ __host__ void host_integer_unsigned_scalar_div_rem_radix(
}
host_sub_and_propagate_single_carry(
streams, gpu_indexes, gpu_count, numerator_ct, remainder_ct, nullptr,
nullptr, mem_ptr->sub_and_propagate_mem, bsks, ksks,
ms_noise_reduction_key, FLAG_NONE, (uint32_t)0);
streams, numerator_ct, remainder_ct, nullptr, nullptr,
mem_ptr->sub_and_propagate_mem, bsks, ksks, ms_noise_reduction_key,
FLAG_NONE, (uint32_t)0);
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], remainder_ct,
numerator_ct);
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
remainder_ct, numerator_ct);
}
}
template <typename Torus>
__host__ uint64_t scratch_integer_signed_scalar_div_rem_radix(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, const int_radix_params params,
CudaStreams streams, const int_radix_params params,
int_signed_scalar_div_rem_buffer<Torus> **mem_ptr,
uint32_t num_radix_blocks, const CudaScalarDivisorFFI *scalar_divisor_ffi,
uint32_t const active_bits_divisor, const bool allocate_gpu_memory) {
@@ -344,17 +324,15 @@ __host__ uint64_t scratch_integer_signed_scalar_div_rem_radix(
uint64_t size_tracker = 0;
*mem_ptr = new int_signed_scalar_div_rem_buffer<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
scalar_divisor_ffi, active_bits_divisor, allocate_gpu_memory,
size_tracker);
streams, params, num_radix_blocks, scalar_divisor_ffi,
active_bits_divisor, allocate_gpu_memory, size_tracker);
return size_tracker;
}
template <typename Torus>
__host__ void host_integer_signed_scalar_div_rem_radix(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *quotient_ct,
CudaStreams streams, CudaRadixCiphertextFFI *quotient_ct,
CudaRadixCiphertextFFI *remainder_ct,
int_signed_scalar_div_rem_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks,
@@ -365,38 +343,37 @@ __host__ void host_integer_signed_scalar_div_rem_radix(
uint32_t numerator_bits) {
auto numerator_ct = mem_ptr->numerator_ct;
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], numerator_ct,
quotient_ct);
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
numerator_ct, quotient_ct);
host_integer_signed_scalar_div_radix_kb(
streams, gpu_indexes, gpu_count, quotient_ct, mem_ptr->signed_div_mem,
bsks, ksks, ms_noise_reduction_key, scalar_divisor_ffi, numerator_bits);
streams, quotient_ct, mem_ptr->signed_div_mem, bsks, ksks,
ms_noise_reduction_key, scalar_divisor_ffi, numerator_bits);
host_propagate_single_carry<Torus>(
streams, gpu_indexes, gpu_count, quotient_ct, nullptr, nullptr,
mem_ptr->scp_mem, bsks, ksks, ms_noise_reduction_key, FLAG_NONE,
(uint32_t)0);
streams, quotient_ct, nullptr, nullptr, mem_ptr->scp_mem, bsks, ksks,
ms_noise_reduction_key, FLAG_NONE, (uint32_t)0);
if (!scalar_divisor_ffi->is_divisor_negative &&
scalar_divisor_ffi->is_divisor_pow2) {
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], remainder_ct,
quotient_ct);
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
remainder_ct, quotient_ct);
host_integer_radix_logical_scalar_shift_kb_inplace(
streams, gpu_indexes, gpu_count, remainder_ct,
scalar_divisor_ffi->ilog2_divisor, mem_ptr->logical_scalar_shift_mem,
bsks, ksks, ms_noise_reduction_key, remainder_ct->num_radix_blocks);
streams, remainder_ct, scalar_divisor_ffi->ilog2_divisor,
mem_ptr->logical_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key,
remainder_ct->num_radix_blocks);
} else if (!scalar_divisor_ffi->is_divisor_zero) {
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], remainder_ct,
quotient_ct);
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
remainder_ct, quotient_ct);
bool is_divisor_one = scalar_divisor_ffi->is_abs_divisor_one &&
!scalar_divisor_ffi->is_divisor_negative;
if (!is_divisor_one && remainder_ct->num_radix_blocks != 0) {
host_integer_scalar_mul_radix<Torus>(
streams, gpu_indexes, gpu_count, remainder_ct, decomposed_divisor,
streams, remainder_ct, decomposed_divisor,
divisor_has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks, ksks,
ms_noise_reduction_key, mem_ptr->params.message_modulus,
num_scalars_divisor);
@@ -404,12 +381,12 @@ __host__ void host_integer_signed_scalar_div_rem_radix(
}
host_sub_and_propagate_single_carry(
streams, gpu_indexes, gpu_count, numerator_ct, remainder_ct, nullptr,
nullptr, mem_ptr->sub_and_propagate_mem, bsks, ksks,
ms_noise_reduction_key, FLAG_NONE, (uint32_t)0);
streams, numerator_ct, remainder_ct, nullptr, nullptr,
mem_ptr->sub_and_propagate_mem, bsks, ksks, ms_noise_reduction_key,
FLAG_NONE, (uint32_t)0);
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], remainder_ct,
numerator_ct);
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
remainder_ct, numerator_ct);
}
#endif
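
The first branch of the signed division above builds the quotient from an arithmetic shift, a logical shift, an addition of the numerator, and a final arithmetic shift — the standard correction that makes an arithmetic right shift round toward zero. A clear-data sketch of that idiom on int32_t (hypothetical helper, illustration only; assumes an arithmetic right shift on negative values and 1 <= l <= 31):

#include <cstdint>

// Round-toward-zero signed division by 2^l, written with the same
// shift/add/shift structure as the branch above. Not part of the backend.
static int32_t signed_div_pow2(int32_t n, uint32_t l) {
  int32_t sign_fill = n >> (l - 1);              // arithmetic: smears the sign bit
  uint32_t correction =
      (uint32_t)sign_fill >> (32 - l);           // logical: 0 if n >= 0, 2^l - 1 if n < 0
  return (n + (int32_t)correction) >> l;         // arithmetic: the division itself
}

For example, signed_div_pow2(-7, 1) yields -3 (round toward zero), where a bare arithmetic shift would give -4.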

View File

@@ -1,13 +1,12 @@
#include "integer/scalar_mul.cuh"
uint64_t scratch_cuda_integer_scalar_mul_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, uint32_t num_scalar_bits, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type) {
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t num_scalar_bits,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
glwe_dimension * polynomial_size, lwe_dimension,
@@ -16,36 +15,31 @@ uint64_t scratch_cuda_integer_scalar_mul_kb_64(
noise_reduction_type);
return scratch_cuda_integer_radix_scalar_mul_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_scalar_mul_buffer<uint64_t> **)mem_ptr, num_blocks, params,
num_scalar_bits, allocate_gpu_memory);
CudaStreams(streams), (int_scalar_mul_buffer<uint64_t> **)mem_ptr,
num_blocks, params, num_scalar_bits, allocate_gpu_memory);
}
void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *lwe_array, uint64_t const *decomposed_scalar,
uint64_t const *has_at_least_one_set, int8_t *mem, void *const *bsks,
void *const *ksks,
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array,
uint64_t const *decomposed_scalar, uint64_t const *has_at_least_one_set,
int8_t *mem, void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t polynomial_size, uint32_t message_modulus, uint32_t num_scalars) {
host_integer_scalar_mul_radix<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array,
decomposed_scalar, has_at_least_one_set,
CudaStreams(streams), lwe_array, decomposed_scalar, has_at_least_one_set,
reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsks,
(uint64_t **)(ksks), ms_noise_reduction_key, message_modulus,
num_scalars);
}
void cleanup_cuda_integer_radix_scalar_mul(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
void cleanup_cuda_integer_radix_scalar_mul(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
int_scalar_mul_buffer<uint64_t> *mem_ptr =
(int_scalar_mul_buffer<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
*mem_ptr_void = nullptr;
}
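
Every entry point in this file now receives a single CudaStreamsFFI value and wraps it in CudaStreams once, instead of threading the loose (streams, gpu_indexes, gpu_count) triple through. A toy sketch of that calling-convention change — the aggregate below is hypothetical and only stands in for the real CudaStreamsFFI, whose layout is defined elsewhere in the backend:

#include <cstdint>

// Hypothetical stand-in for CudaStreamsFFI: one aggregate at the C interface
// instead of three loose parameters.
struct ToyStreamsFFI {
  void *const *streams;        // type-erased cudaStream_t handles
  const uint32_t *gpu_indexes; // one GPU index per stream
  uint32_t gpu_count;
};

// Old shape:
//   uint64_t scratch_xxx(void *const *streams, uint32_t const *gpu_indexes,
//                        uint32_t gpu_count, int8_t **mem_ptr, ...);
// New shape:
//   uint64_t scratch_xxx(ToyStreamsFFI streams, int8_t **mem_ptr, ...);
static uint64_t toy_scratch(ToyStreamsFFI streams, int8_t **mem_ptr) {
  (void)streams; // the stream-set wrapper would be built from this once
  (void)mem_ptr;
  return 0;
}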

View File

@@ -30,22 +30,20 @@ __global__ void device_small_scalar_radix_multiplication(T *output_lwe_array,
template <typename T>
__host__ uint64_t scratch_cuda_integer_radix_scalar_mul_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_scalar_mul_buffer<T> **mem_ptr,
CudaStreams streams, int_scalar_mul_buffer<T> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params,
uint32_t num_scalar_bits, bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_scalar_mul_buffer<T>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
num_scalar_bits, allocate_gpu_memory, true, size_tracker);
*mem_ptr = new int_scalar_mul_buffer<T>(streams, params, num_radix_blocks,
num_scalar_bits, allocate_gpu_memory,
true, size_tracker);
return size_tracker;
}
template <typename T>
__host__ void host_integer_scalar_mul_radix(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array,
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array,
T const *decomposed_scalar, T const *has_at_least_one_set,
int_scalar_mul_buffer<T> *mem, void *const *bsks, T *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
@@ -66,17 +64,17 @@ __host__ void host_integer_scalar_mul_radix(
shift_amount * num_radix_blocks,
preshifted_buffer->num_radix_blocks);
if (has_at_least_one_set[shift_amount] == 1) {
copy_radix_ciphertext_slice_async<T>(streams[0], gpu_indexes[0],
&shift_input, 0, num_radix_blocks,
lwe_array, 0, num_radix_blocks);
copy_radix_ciphertext_slice_async<T>(
streams.stream(0), streams.gpu_index(0), &shift_input, 0,
num_radix_blocks, lwe_array, 0, num_radix_blocks);
host_integer_radix_logical_scalar_shift_kb_inplace<T>(
streams, gpu_indexes, gpu_count, &shift_input, shift_amount,
mem->logical_scalar_shift_buffer, bsks, ksks, ms_noise_reduction_key,
num_radix_blocks);
streams, &shift_input, shift_amount, mem->logical_scalar_shift_buffer,
bsks, ksks, ms_noise_reduction_key, num_radix_blocks);
} else {
// create trivial assign for value = 0
set_zero_radix_ciphertext_slice_async<T>(
streams[0], gpu_indexes[0], &shift_input, 0, num_radix_blocks);
streams.stream(0), streams.gpu_index(0), &shift_input, 0,
num_radix_blocks);
}
}
size_t j = 0;
@@ -91,46 +89,46 @@ __host__ void host_integer_scalar_mul_radix(
as_radix_ciphertext_slice<T>(&block_shift_buffer, all_shifted_buffer,
j * num_radix_blocks,
all_shifted_buffer->num_radix_blocks);
host_radix_blocks_rotate_right<T>(
streams, gpu_indexes, gpu_count, &block_shift_buffer,
&preshifted_radix_ct, i / msg_bits, num_radix_blocks);
host_radix_blocks_rotate_right<T>(streams, &block_shift_buffer,
&preshifted_radix_ct, i / msg_bits,
num_radix_blocks);
// create trivial assign for value = 0
set_zero_radix_ciphertext_slice_async<T>(
streams[0], gpu_indexes[0], &block_shift_buffer, 0, i / msg_bits);
streams.stream(0), streams.gpu_index(0), &block_shift_buffer, 0,
i / msg_bits);
j++;
}
}
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
if (mem->anticipated_buffers_drop) {
mem->release_buffers(streams, gpu_indexes, gpu_count);
mem->release_buffers(streams);
}
if (j == 0) {
// lwe array = 0
set_zero_radix_ciphertext_slice_async<T>(streams[0], gpu_indexes[0],
lwe_array, 0, num_radix_blocks);
set_zero_radix_ciphertext_slice_async<T>(streams.stream(0),
streams.gpu_index(0), lwe_array, 0,
num_radix_blocks);
} else {
host_integer_partial_sum_ciphertexts_vec_kb<T>(
streams, gpu_indexes, gpu_count, lwe_array, all_shifted_buffer, bsks,
ksks, ms_noise_reduction_key, mem->sum_ciphertexts_vec_mem,
num_radix_blocks, j);
streams, lwe_array, all_shifted_buffer, bsks, ksks,
ms_noise_reduction_key, mem->sum_ciphertexts_vec_mem, num_radix_blocks,
j);
auto scp_mem_ptr = mem->sc_prop_mem;
uint32_t requested_flag = outputFlag::FLAG_NONE;
uint32_t uses_carry = 0;
host_propagate_single_carry<T>(streams, gpu_indexes, gpu_count, lwe_array,
nullptr, nullptr, scp_mem_ptr, bsks, ksks,
ms_noise_reduction_key, requested_flag,
uses_carry);
host_propagate_single_carry<T>(
streams, lwe_array, nullptr, nullptr, scp_mem_ptr, bsks, ksks,
ms_noise_reduction_key, requested_flag, uses_carry);
}
}
// Small scalar_mul is used in shift/rotate
template <typename T>
__host__ void host_integer_small_scalar_mul_radix(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *output_lwe_array,
CudaStreams streams, CudaRadixCiphertextFFI *output_lwe_array,
CudaRadixCiphertextFFI *input_lwe_array, T scalar,
const uint32_t message_modulus, const uint32_t carry_modulus) {
@@ -139,7 +137,7 @@ __host__ void host_integer_small_scalar_mul_radix(
if (output_lwe_array->lwe_dimension != input_lwe_array->lwe_dimension)
PANIC("Cuda error: input and output lwe_dimension must be the same")
cuda_set_device(gpu_indexes[0]);
cuda_set_device(streams.gpu_index(0));
auto lwe_dimension = input_lwe_array->lwe_dimension;
auto num_radix_blocks = input_lwe_array->num_radix_blocks;
@@ -153,7 +151,8 @@ __host__ void host_integer_small_scalar_mul_radix(
dim3 grid(num_blocks, 1, 1);
dim3 thds(num_threads, 1, 1);
device_small_scalar_radix_multiplication<<<grid, thds, 0, streams[0]>>>(
device_small_scalar_radix_multiplication<<<grid, thds, 0,
streams.stream(0)>>>(
(T *)output_lwe_array->ptr, (T *)input_lwe_array->ptr, scalar,
lwe_dimension, num_radix_blocks);
check_cuda_error(cudaGetLastError());
@@ -169,22 +168,20 @@ __host__ void host_integer_small_scalar_mul_radix(
template <typename Torus>
__host__ void host_integer_radix_scalar_mul_high_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *ct,
CudaStreams streams, CudaRadixCiphertextFFI *ct,
int_scalar_mul_high_buffer<Torus> *mem_ptr, Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void *const *bsks, const CudaScalarDivisorFFI *scalar_divisor_ffi) {
if (scalar_divisor_ffi->is_chosen_multiplier_zero) {
set_zero_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], ct,
0, ct->num_radix_blocks);
set_zero_radix_ciphertext_slice_async<Torus>(
streams.stream(0), streams.gpu_index(0), ct, 0, ct->num_radix_blocks);
return;
}
CudaRadixCiphertextFFI *tmp_ffi = mem_ptr->tmp;
host_extend_radix_with_trivial_zero_blocks_msb<Torus>(tmp_ffi, ct, streams,
gpu_indexes);
host_extend_radix_with_trivial_zero_blocks_msb<Torus>(tmp_ffi, ct, streams);
if (scalar_divisor_ffi->active_bits != (uint32_t)0 &&
!scalar_divisor_ffi->is_abs_chosen_multiplier_one &&
@@ -192,16 +189,14 @@ __host__ void host_integer_radix_scalar_mul_high_kb(
if (scalar_divisor_ffi->is_chosen_multiplier_pow2) {
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, tmp_ffi,
scalar_divisor_ffi->ilog2_chosen_multiplier,
streams, tmp_ffi, scalar_divisor_ffi->ilog2_chosen_multiplier,
mem_ptr->logical_scalar_shift_mem, bsks, (uint64_t **)ksks,
ms_noise_reduction_key, tmp_ffi->num_radix_blocks);
} else {
host_integer_scalar_mul_radix<Torus>(
streams, gpu_indexes, gpu_count, tmp_ffi,
scalar_divisor_ffi->decomposed_chosen_multiplier,
streams, tmp_ffi, scalar_divisor_ffi->decomposed_chosen_multiplier,
scalar_divisor_ffi->chosen_multiplier_has_at_least_one_set,
mem_ptr->scalar_mul_mem, bsks, (uint64_t **)ksks,
ms_noise_reduction_key, mem_ptr->params.message_modulus,
@@ -209,29 +204,28 @@ __host__ void host_integer_radix_scalar_mul_high_kb(
}
}
host_trim_radix_blocks_lsb<Torus>(ct, tmp_ffi, streams, gpu_indexes);
host_trim_radix_blocks_lsb<Torus>(ct, tmp_ffi, streams);
}
template <typename Torus>
__host__ void host_integer_radix_signed_scalar_mul_high_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *ct,
CudaStreams streams, CudaRadixCiphertextFFI *ct,
int_signed_scalar_mul_high_buffer<Torus> *mem_ptr, Torus *const *ksks,
const CudaScalarDivisorFFI *scalar_divisor_ffi,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void *const *bsks) {
if (scalar_divisor_ffi->is_chosen_multiplier_zero) {
set_zero_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], ct,
0, ct->num_radix_blocks);
set_zero_radix_ciphertext_slice_async<Torus>(
streams.stream(0), streams.gpu_index(0), ct, 0, ct->num_radix_blocks);
return;
}
CudaRadixCiphertextFFI *tmp_ffi = mem_ptr->tmp;
host_extend_radix_with_sign_msb<Torus>(
streams, gpu_indexes, gpu_count, tmp_ffi, ct, mem_ptr->extend_radix_mem,
ct->num_radix_blocks, bsks, (uint64_t **)ksks, ms_noise_reduction_key);
streams, tmp_ffi, ct, mem_ptr->extend_radix_mem, ct->num_radix_blocks,
bsks, (uint64_t **)ksks, ms_noise_reduction_key);
if (scalar_divisor_ffi->active_bits != (uint32_t)0 &&
!scalar_divisor_ffi->is_abs_chosen_multiplier_one &&
@@ -239,14 +233,12 @@ __host__ void host_integer_radix_signed_scalar_mul_high_kb(
if (scalar_divisor_ffi->is_chosen_multiplier_pow2) {
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, tmp_ffi,
scalar_divisor_ffi->ilog2_chosen_multiplier,
streams, tmp_ffi, scalar_divisor_ffi->ilog2_chosen_multiplier,
mem_ptr->logical_scalar_shift_mem, bsks, (uint64_t **)ksks,
ms_noise_reduction_key, tmp_ffi->num_radix_blocks);
} else {
host_integer_scalar_mul_radix<Torus>(
streams, gpu_indexes, gpu_count, tmp_ffi,
scalar_divisor_ffi->decomposed_chosen_multiplier,
streams, tmp_ffi, scalar_divisor_ffi->decomposed_chosen_multiplier,
scalar_divisor_ffi->chosen_multiplier_has_at_least_one_set,
mem_ptr->scalar_mul_mem, bsks, (uint64_t **)ksks,
ms_noise_reduction_key, mem_ptr->params.message_modulus,
@@ -254,7 +246,7 @@ __host__ void host_integer_radix_signed_scalar_mul_high_kb(
}
}
host_trim_radix_blocks_lsb<Torus>(ct, tmp_ffi, streams, gpu_indexes);
host_trim_radix_blocks_lsb<Torus>(ct, tmp_ffi, streams);
}
#endif
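
host_integer_scalar_mul_radix above is a shift-and-add multiplier: it keeps pre-shifted copies of the ciphertext, rotates whole radix blocks for the coarse part of each shift, and sums every copy whose scalar bit is set before a single carry propagation. A clear-data analogue of that structure (hypothetical helper on plain integers; the kernel's block granularity and explicit carry propagation are implicit here):

#include <cstdint>
#include <vector>

// Shift-and-add scalar multiplication on clear data: one shifted copy per set
// scalar bit, all copies summed at the end.
static uint64_t scalar_mul_shift_add(uint64_t value, uint64_t scalar) {
  std::vector<uint64_t> shifted_copies;
  for (uint32_t bit = 0; bit < 64; ++bit) {
    if ((scalar >> bit) & 1)           // mirrors the has_at_least_one_set skip
      shifted_copies.push_back(value << bit);
  }
  uint64_t acc = 0;                    // partial-sum step
  for (uint64_t copy : shifted_copies)
    acc += copy;
  return acc;
}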

View File

@@ -1,12 +1,12 @@
#include "scalar_rotate.cuh"
uint64_t scratch_cuda_integer_radix_scalar_rotate_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
@@ -15,32 +15,29 @@ uint64_t scratch_cuda_integer_radix_scalar_rotate_kb_64(
message_modulus, carry_modulus, noise_reduction_type);
return scratch_cuda_integer_radix_scalar_rotate_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
CudaStreams(streams),
(int_logical_scalar_shift_buffer<uint64_t> **)mem_ptr, num_blocks, params,
shift_type, allocate_gpu_memory);
}
void cuda_integer_radix_scalar_rotate_kb_64_inplace(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *lwe_array, uint32_t n, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array, uint32_t n,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
host_integer_radix_scalar_rotate_kb_inplace<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array, n,
CudaStreams(streams), lwe_array, n,
(int_logical_scalar_shift_buffer<uint64_t> *)mem_ptr, bsks,
(uint64_t **)(ksks), ms_noise_reduction_key);
}
void cleanup_cuda_integer_radix_scalar_rotate(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
void cleanup_cuda_integer_radix_scalar_rotate(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
int_logical_scalar_shift_buffer<uint64_t> *mem_ptr =
(int_logical_scalar_shift_buffer<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
*mem_ptr_void = nullptr;
}

View File

@@ -13,22 +13,20 @@
template <typename Torus>
__host__ uint64_t scratch_cuda_integer_radix_scalar_rotate_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_logical_scalar_shift_buffer<Torus> **mem_ptr,
CudaStreams streams, int_logical_scalar_shift_buffer<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params,
SHIFT_OR_ROTATE_TYPE shift_type, bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_logical_scalar_shift_buffer<Torus>(
streams, gpu_indexes, gpu_count, shift_type, params, num_radix_blocks,
allocate_gpu_memory, size_tracker);
streams, shift_type, params, num_radix_blocks, allocate_gpu_memory,
size_tracker);
return size_tracker;
}
template <typename Torus>
__host__ void host_integer_radix_scalar_rotate_kb_inplace(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array, uint32_t n,
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array, uint32_t n,
int_logical_scalar_shift_buffer<Torus> *mem, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
@@ -56,12 +54,11 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(
  // one block is responsible for processing a single lwe ciphertext
if (mem->shift_type == LEFT_SHIFT) {
// rotate right as the blocks are from LSB to MSB
host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count,
rotated_buffer, lwe_array, rotations,
num_blocks);
host_radix_blocks_rotate_right<Torus>(streams, rotated_buffer, lwe_array,
rotations, num_blocks);
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
lwe_array, 0, num_blocks,
copy_radix_ciphertext_slice_async<Torus>(
streams.stream(0), streams.gpu_index(0), lwe_array, 0, num_blocks,
rotated_buffer, 0, num_blocks);
if (shift_within_block == 0) {
@@ -70,25 +67,23 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(
auto receiver_blocks = lwe_array;
auto giver_blocks = rotated_buffer;
host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count,
giver_blocks, lwe_array, 1,
host_radix_blocks_rotate_right<Torus>(streams, giver_blocks, lwe_array, 1,
num_blocks);
auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array, receiver_blocks,
giver_blocks, bsks, ksks, ms_noise_reduction_key, lut_bivariate,
num_blocks, lut_bivariate->params.message_modulus);
streams, lwe_array, receiver_blocks, giver_blocks, bsks, ksks,
ms_noise_reduction_key, lut_bivariate, num_blocks,
lut_bivariate->params.message_modulus);
} else {
// rotate left as the blocks are from LSB to MSB
host_radix_blocks_rotate_left<Torus>(streams, gpu_indexes, gpu_count,
rotated_buffer, lwe_array, rotations,
num_blocks);
host_radix_blocks_rotate_left<Torus>(streams, rotated_buffer, lwe_array,
rotations, num_blocks);
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
lwe_array, 0, num_blocks,
copy_radix_ciphertext_slice_async<Torus>(
streams.stream(0), streams.gpu_index(0), lwe_array, 0, num_blocks,
rotated_buffer, 0, num_blocks);
if (shift_within_block == 0) {
@@ -97,16 +92,15 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(
auto receiver_blocks = lwe_array;
auto giver_blocks = rotated_buffer;
host_radix_blocks_rotate_left<Torus>(streams, gpu_indexes, gpu_count,
giver_blocks, lwe_array, 1,
host_radix_blocks_rotate_left<Torus>(streams, giver_blocks, lwe_array, 1,
num_blocks);
auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array, receiver_blocks,
giver_blocks, bsks, ksks, ms_noise_reduction_key, lut_bivariate,
num_blocks, lut_bivariate->params.message_modulus);
streams, lwe_array, receiver_blocks, giver_blocks, bsks, ksks,
ms_noise_reduction_key, lut_bivariate, num_blocks,
lut_bivariate->params.message_modulus);
}
}
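
The rotate above splits the shift amount into a whole-block part (a rotation of the radix blocks) and a within-block part (a bivariate lookup that merges each block with its neighbour). A clear-data sketch of that decomposition, assuming blocks are stored LSB-first and that the bivariate LUT combines receiver and giver blocks as written below (illustration only):

#include <cstdint>
#include <vector>

// Left bit-rotation of a value stored as LSB-first blocks of msg_bits bits.
static std::vector<uint8_t> rotate_left_blocks(const std::vector<uint8_t> &blocks,
                                               uint32_t n, uint32_t msg_bits) {
  const uint32_t num_blocks = (uint32_t)blocks.size();
  const uint8_t mask = (uint8_t)((1u << msg_bits) - 1);
  const uint32_t rotations = (n / msg_bits) % num_blocks;
  const uint32_t shift = n % msg_bits;

  // Whole-block part: content moves toward higher block indexes.
  std::vector<uint8_t> rotated(num_blocks);
  for (uint32_t i = 0; i < num_blocks; ++i)
    rotated[(i + rotations) % num_blocks] = blocks[i];
  if (shift == 0)
    return rotated;

  // Within-block part: each receiver block takes the high bits of its giver
  // (the next block toward the LSB), mirroring the role of the bivariate LUT.
  std::vector<uint8_t> out(num_blocks);
  for (uint32_t i = 0; i < num_blocks; ++i) {
    const uint8_t receiver = rotated[i];
    const uint8_t giver = rotated[(i + num_blocks - 1) % num_blocks];
    out[i] = (uint8_t)(((receiver << shift) | (giver >> (msg_bits - shift))) & mask);
  }
  return out;
}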

View File

@@ -1,12 +1,12 @@
#include "scalar_shifts.cuh"
uint64_t scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
@@ -15,7 +15,7 @@ uint64_t scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
message_modulus, carry_modulus, noise_reduction_type);
return scratch_cuda_integer_radix_logical_scalar_shift_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
CudaStreams(streams),
(int_logical_scalar_shift_buffer<uint64_t> **)mem_ptr, num_blocks, params,
shift_type, allocate_gpu_memory);
}
@@ -25,24 +25,23 @@ uint64_t scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
/// the application of a PBS onto the rotated blocks up to num_blocks -
/// rotations - 1. The remaining blocks are padded with zeros
void cuda_integer_radix_logical_scalar_shift_kb_64_inplace(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *lwe_array, uint32_t shift, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array, uint32_t shift,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
host_integer_radix_logical_scalar_shift_kb_inplace<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array, shift,
CudaStreams(streams), lwe_array, shift,
(int_logical_scalar_shift_buffer<uint64_t> *)mem_ptr, bsks,
(uint64_t **)(ksks), ms_noise_reduction_key, lwe_array->num_radix_blocks);
}
uint64_t scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
@@ -51,7 +50,7 @@ uint64_t scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
message_modulus, carry_modulus, noise_reduction_type);
return scratch_cuda_integer_radix_arithmetic_scalar_shift_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
CudaStreams(streams),
(int_arithmetic_scalar_shift_buffer<uint64_t> **)mem_ptr, num_blocks,
params, shift_type, allocate_gpu_memory);
}
@@ -64,37 +63,34 @@ uint64_t scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
/// block, which is copied onto all remaining blocks instead of padding with
/// zeros as would be done in the logical shift.
void cuda_integer_radix_arithmetic_scalar_shift_kb_64_inplace(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *lwe_array, uint32_t shift, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array, uint32_t shift,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
host_integer_radix_arithmetic_scalar_shift_kb_inplace<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array, shift,
CudaStreams(streams), lwe_array, shift,
(int_arithmetic_scalar_shift_buffer<uint64_t> *)mem_ptr, bsks,
(uint64_t **)(ksks), ms_noise_reduction_key);
}
void cleanup_cuda_integer_radix_logical_scalar_shift(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void cleanup_cuda_integer_radix_logical_scalar_shift(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
int_logical_scalar_shift_buffer<uint64_t> *mem_ptr =
(int_logical_scalar_shift_buffer<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
*mem_ptr_void = nullptr;
}
void cleanup_cuda_integer_radix_arithmetic_scalar_shift(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void cleanup_cuda_integer_radix_arithmetic_scalar_shift(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
int_arithmetic_scalar_shift_buffer<uint64_t> *mem_ptr =
(int_arithmetic_scalar_shift_buffer<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
*mem_ptr_void = nullptr;
}
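
As the comment above notes, the arithmetic shift differs from the logical one only in what fills the vacated most-significant positions: a sign block instead of zeros. A clear-data sketch of the whole-block part of both behaviours on an LSB-first block array (hypothetical helper; the within-block LUTs of the kernel are out of scope here):

#include <cstdint>
#include <vector>

// Right shift by a whole number of blocks. Logical shift pads the top with
// zeros; arithmetic shift pads with a sign-extension block, mirroring the role
// of the univariate padding-block LUT described above.
static std::vector<uint8_t> shift_right_blocks(const std::vector<uint8_t> &blocks,
                                               uint32_t rotations, uint32_t msg_bits,
                                               bool arithmetic) {
  const size_t n = blocks.size();
  const uint8_t mask = (uint8_t)((1u << msg_bits) - 1);
  const uint8_t sign = (uint8_t)((blocks[n - 1] >> (msg_bits - 1)) & 1);
  const uint8_t pad = (arithmetic && sign) ? mask : 0;
  std::vector<uint8_t> out(n, pad);                // vacated blocks get the padding
  for (size_t i = 0; i + rotations < n; ++i)
    out[i] = blocks[i + rotations];                // surviving blocks move toward the LSB
  return out;
}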

View File

@@ -13,22 +13,20 @@
template <typename Torus>
__host__ uint64_t scratch_cuda_integer_radix_logical_scalar_shift_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_logical_scalar_shift_buffer<Torus> **mem_ptr,
CudaStreams streams, int_logical_scalar_shift_buffer<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params,
SHIFT_OR_ROTATE_TYPE shift_type, bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_logical_scalar_shift_buffer<Torus>(
streams, gpu_indexes, gpu_count, shift_type, params, num_radix_blocks,
allocate_gpu_memory, size_tracker);
streams, shift_type, params, num_radix_blocks, allocate_gpu_memory,
size_tracker);
return size_tracker;
}
template <typename Torus>
__host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array, uint32_t shift,
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array, uint32_t shift,
int_logical_scalar_shift_buffer<Torus> *mem, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
@@ -56,15 +54,14 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
if (mem->shift_type == LEFT_SHIFT) {
// rotate right as the blocks are from LSB to MSB
host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count,
&rotated_buffer, lwe_array, rotations,
num_blocks);
host_radix_blocks_rotate_right<Torus>(streams, &rotated_buffer, lwe_array,
rotations, num_blocks);
// create trivial assign for value = 0
set_zero_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
&rotated_buffer, 0, rotations);
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
lwe_array, 0, num_blocks,
set_zero_radix_ciphertext_slice_async<Torus>(
streams.stream(0), streams.gpu_index(0), &rotated_buffer, 0, rotations);
copy_radix_ciphertext_slice_async<Torus>(
streams.stream(0), streams.gpu_index(0), lwe_array, 0, num_blocks,
&rotated_buffer, 0, num_blocks);
if (shift_within_block == 0 || rotations == num_blocks) {
@@ -83,24 +80,23 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
size_t partial_block_count = num_blocks - rotations;
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, &partial_current_blocks,
&partial_current_blocks, &partial_previous_blocks, bsks, ksks,
ms_noise_reduction_key, lut_bivariate, partial_block_count,
streams, &partial_current_blocks, &partial_current_blocks,
&partial_previous_blocks, bsks, ksks, ms_noise_reduction_key,
lut_bivariate, partial_block_count,
lut_bivariate->params.message_modulus);
} else {
// right shift
host_radix_blocks_rotate_left<Torus>(streams, gpu_indexes, gpu_count,
&rotated_buffer, lwe_array, rotations,
num_blocks);
host_radix_blocks_rotate_left<Torus>(streams, &rotated_buffer, lwe_array,
rotations, num_blocks);
// rotate left as the blocks are from LSB to MSB
// create trivial assign for value = 0
set_zero_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], &rotated_buffer, num_blocks - rotations,
num_blocks);
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
lwe_array, 0, num_blocks,
streams.stream(0), streams.gpu_index(0), &rotated_buffer,
num_blocks - rotations, num_blocks);
copy_radix_ciphertext_slice_async<Torus>(
streams.stream(0), streams.gpu_index(0), lwe_array, 0, num_blocks,
&rotated_buffer, 0, num_blocks);
if (shift_within_block == 0 || rotations == num_blocks) {
@@ -116,31 +112,28 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
size_t partial_block_count = num_blocks - rotations;
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, partial_current_blocks,
partial_current_blocks, &partial_next_blocks, bsks, ksks,
ms_noise_reduction_key, lut_bivariate, partial_block_count,
lut_bivariate->params.message_modulus);
streams, partial_current_blocks, partial_current_blocks,
&partial_next_blocks, bsks, ksks, ms_noise_reduction_key, lut_bivariate,
partial_block_count, lut_bivariate->params.message_modulus);
}
}
template <typename Torus>
__host__ uint64_t scratch_cuda_integer_radix_arithmetic_scalar_shift_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_arithmetic_scalar_shift_buffer<Torus> **mem_ptr,
CudaStreams streams, int_arithmetic_scalar_shift_buffer<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params,
SHIFT_OR_ROTATE_TYPE shift_type, bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_arithmetic_scalar_shift_buffer<Torus>(
streams, gpu_indexes, gpu_count, shift_type, params, num_radix_blocks,
allocate_gpu_memory, size_tracker);
streams, shift_type, params, num_radix_blocks, allocate_gpu_memory,
size_tracker);
return size_tracker;
}
template <typename Torus>
__host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array, uint32_t shift,
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array, uint32_t shift,
int_arithmetic_scalar_shift_buffer<Torus> *mem, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
@@ -167,11 +160,10 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
num_blocks + 2, num_blocks + 3);
if (mem->shift_type == RIGHT_SHIFT) {
host_radix_blocks_rotate_left<Torus>(streams, gpu_indexes, gpu_count,
mem->tmp_rotated, lwe_array, rotations,
num_blocks);
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
lwe_array, 0, num_blocks,
host_radix_blocks_rotate_left<Torus>(streams, mem->tmp_rotated, lwe_array,
rotations, num_blocks);
copy_radix_ciphertext_slice_async<Torus>(
streams.stream(0), streams.gpu_index(0), lwe_array, 0, num_blocks,
mem->tmp_rotated, 0, num_blocks);
if (num_bits_in_block == 1) {
@@ -183,7 +175,7 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
// we can optimize things by not doing the pbs to extract this sign bit
for (uint i = 0; i < num_blocks; i++) {
copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], mem->tmp_rotated,
streams.stream(0), streams.gpu_index(0), mem->tmp_rotated,
num_blocks - rotations + i, num_blocks - rotations + i + 1,
mem->tmp_rotated, num_blocks - rotations - 1,
num_blocks - rotations);
@@ -201,8 +193,8 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
num_blocks - rotations - 1,
num_blocks - rotations);
copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], &last_block_copy, 0, 1, mem->tmp_rotated,
num_blocks - rotations - 1, num_blocks - rotations);
streams.stream(0), streams.gpu_index(0), &last_block_copy, 0, 1,
mem->tmp_rotated, num_blocks - rotations - 1, num_blocks - rotations);
if (shift_within_block != 0) {
auto partial_current_blocks = lwe_array;
CudaRadixCiphertextFFI partial_next_blocks;
@@ -212,42 +204,37 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, partial_current_blocks,
partial_current_blocks, &partial_next_blocks, bsks, ksks,
ms_noise_reduction_key, lut_bivariate, partial_block_count,
streams, partial_current_blocks, partial_current_blocks,
&partial_next_blocks, bsks, ksks, ms_noise_reduction_key,
lut_bivariate, partial_block_count,
lut_bivariate->params.message_modulus);
}
      // Since our CPU threads will be working on different streams, we shall
      // ensure the work in the main stream is completed
for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
}
streams.synchronize();
auto lut_univariate_padding_block =
mem->lut_buffers_univariate[num_bits_in_block - 1];
integer_radix_apply_univariate_lookup_table_kb<Torus>(
mem->local_streams_1, gpu_indexes, gpu_count, &padding_block,
&last_block_copy, bsks, ksks, ms_noise_reduction_key,
lut_univariate_padding_block, 1);
mem->local_streams_1, &padding_block, &last_block_copy, bsks, ksks,
ms_noise_reduction_key, lut_univariate_padding_block, 1);
// Replace blocks 'pulled' from the left with the correct padding
// block
for (uint i = 0; i < rotations; i++) {
copy_radix_ciphertext_slice_async<Torus>(
mem->local_streams_1[0], gpu_indexes[0], lwe_array,
num_blocks - rotations + i, num_blocks - rotations + i + 1,
&padding_block, 0, 1);
mem->local_streams_1.stream(0), mem->local_streams_1.gpu_index(0),
lwe_array, num_blocks - rotations + i,
num_blocks - rotations + i + 1, &padding_block, 0, 1);
}
if (shift_within_block != 0) {
auto lut_univariate_shift_last_block =
mem->lut_buffers_univariate[shift_within_block - 1];
integer_radix_apply_univariate_lookup_table_kb<Torus>(
mem->local_streams_2, gpu_indexes, gpu_count, &last_block,
&last_block_copy, bsks, ksks, ms_noise_reduction_key,
lut_univariate_shift_last_block, 1);
}
for (uint j = 0; j < mem->active_gpu_count; j++) {
cuda_synchronize_stream(mem->local_streams_1[j], gpu_indexes[j]);
cuda_synchronize_stream(mem->local_streams_2[j], gpu_indexes[j]);
mem->local_streams_2, &last_block, &last_block_copy, bsks, ksks,
ms_noise_reduction_key, lut_univariate_shift_last_block, 1);
}
mem->local_streams_1.synchronize();
mem->local_streams_2.synchronize();
}
} else {
PANIC("Cuda error (scalar shift): left scalar shift is never of the "

View File

@@ -1,14 +1,13 @@
#include "shift_and_rotate.cuh"
uint64_t scratch_cuda_integer_radix_shift_and_rotate_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
bool is_signed, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type) {
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type, bool is_signed,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
@@ -16,31 +15,28 @@ uint64_t scratch_cuda_integer_radix_shift_and_rotate_kb_64(
message_modulus, carry_modulus, noise_reduction_type);
return scratch_cuda_integer_radix_shift_and_rotate_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_shift_and_rotate_buffer<uint64_t> **)mem_ptr, num_blocks, params,
shift_type, is_signed, allocate_gpu_memory);
CudaStreams(streams), (int_shift_and_rotate_buffer<uint64_t> **)mem_ptr,
num_blocks, params, shift_type, is_signed, allocate_gpu_memory);
}
void cuda_integer_radix_shift_and_rotate_kb_64_inplace(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *lwe_array, CudaRadixCiphertextFFI const *lwe_shift,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array,
CudaRadixCiphertextFFI const *lwe_shift, int8_t *mem_ptr, void *const *bsks,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
host_integer_radix_shift_and_rotate_kb_inplace<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array, lwe_shift,
CudaStreams(streams), lwe_array, lwe_shift,
(int_shift_and_rotate_buffer<uint64_t> *)mem_ptr, bsks,
(uint64_t **)(ksks), ms_noise_reduction_key);
}
void cleanup_cuda_integer_radix_shift_and_rotate(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
void cleanup_cuda_integer_radix_shift_and_rotate(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
int_shift_and_rotate_buffer<uint64_t> *mem_ptr =
(int_shift_and_rotate_buffer<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
*mem_ptr_void = nullptr;
}

View File

@@ -14,26 +14,24 @@
template <typename Torus>
__host__ uint64_t scratch_cuda_integer_radix_shift_and_rotate_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_shift_and_rotate_buffer<Torus> **mem_ptr,
CudaStreams streams, int_shift_and_rotate_buffer<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params,
SHIFT_OR_ROTATE_TYPE shift_type, bool is_signed, bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_shift_and_rotate_buffer<Torus>(
streams, gpu_indexes, gpu_count, shift_type, is_signed, params,
num_radix_blocks, allocate_gpu_memory, size_tracker);
streams, shift_type, is_signed, params, num_radix_blocks,
allocate_gpu_memory, size_tracker);
return size_tracker;
}
template <typename Torus>
__host__ void host_integer_radix_shift_and_rotate_kb_inplace(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array,
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array,
CudaRadixCiphertextFFI const *lwe_shift,
int_shift_and_rotate_buffer<Torus> *mem, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
cuda_set_device(gpu_indexes[0]);
cuda_set_device(streams.gpu_index(0));
if (lwe_array->num_radix_blocks != lwe_shift->num_radix_blocks)
PANIC("Cuda error: lwe_shift and lwe_array num radix blocks must be "
@@ -58,8 +56,8 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
// Extract all bits
auto bits = mem->tmp_bits;
extract_n_bits<Torus>(streams, gpu_indexes, gpu_count, bits, lwe_array, bsks,
ksks, ms_noise_reduction_key,
extract_n_bits<Torus>(streams, bits, lwe_array, bsks, ksks,
ms_noise_reduction_key,
num_radix_blocks * bits_per_block, num_radix_blocks,
mem->bit_extract_luts);
@@ -80,10 +78,9 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
// Extracts bits and put them in the bit index 2 (=> bit number 3)
// so that it is already aligned to the correct position of the cmux input
// and we reduce noise growth
extract_n_bits<Torus>(streams, gpu_indexes, gpu_count, shift_bits, lwe_shift,
bsks, ksks, ms_noise_reduction_key,
max_num_bits_that_tell_shift, num_radix_blocks,
mem->bit_extract_luts_with_offset_2);
extract_n_bits<Torus>(streams, shift_bits, lwe_shift, bsks, ksks,
ms_noise_reduction_key, max_num_bits_that_tell_shift,
num_radix_blocks, mem->bit_extract_luts_with_offset_2);
// If signed, do an "arithmetic shift" by padding with the sign bit
CudaRadixCiphertextFFI last_bit;
@@ -97,58 +94,54 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
auto mux_lut = mem->mux_lut;
auto mux_inputs = mem->tmp_mux_inputs;
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], input_bits_a,
bits);
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
input_bits_a, bits);
for (int d = 0; d < max_num_bits_that_tell_shift; d++) {
CudaRadixCiphertextFFI shift_bit;
as_radix_ciphertext_slice<Torus>(&shift_bit, shift_bits, d, d + 1);
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], input_bits_b,
input_bits_a);
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
input_bits_b, input_bits_a);
auto rotations = 1 << d;
switch (mem->shift_type) {
case LEFT_SHIFT:
// rotate right as the blocks are from LSB to MSB
if (input_bits_b->num_radix_blocks != total_nb_bits)
PANIC("Cuda error: incorrect number of blocks")
host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count,
rotated_input, input_bits_b,
rotations, total_nb_bits);
host_radix_blocks_rotate_right<Torus>(
streams, rotated_input, input_bits_b, rotations, total_nb_bits);
set_zero_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
rotated_input, 0, rotations);
set_zero_radix_ciphertext_slice_async<Torus>(
streams.stream(0), streams.gpu_index(0), rotated_input, 0, rotations);
break;
case RIGHT_SHIFT:
// rotate left as the blocks are from LSB to MSB
if (input_bits_b->num_radix_blocks != total_nb_bits)
PANIC("Cuda error: incorrect number of blocks")
host_radix_blocks_rotate_left<Torus>(streams, gpu_indexes, gpu_count,
rotated_input, input_bits_b,
host_radix_blocks_rotate_left<Torus>(streams, rotated_input, input_bits_b,
rotations, total_nb_bits);
if (mem->is_signed)
for (int i = 0; i < rotations; i++) {
copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], rotated_input,
streams.stream(0), streams.gpu_index(0), rotated_input,
total_nb_bits - rotations + i, total_nb_bits - rotations + i + 1,
&last_bit, 0, 1);
}
else {
set_zero_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], rotated_input,
streams.stream(0), streams.gpu_index(0), rotated_input,
total_nb_bits - rotations, total_nb_bits);
}
break;
case LEFT_ROTATE:
// rotate right as the blocks are from LSB to MSB
host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count,
rotated_input, input_bits_b,
rotations, total_nb_bits);
host_radix_blocks_rotate_right<Torus>(
streams, rotated_input, input_bits_b, rotations, total_nb_bits);
break;
case RIGHT_ROTATE:
// rotate left as the blocks are from LSB to MSB
host_radix_blocks_rotate_left<Torus>(streams, gpu_indexes, gpu_count,
rotated_input, input_bits_b,
host_radix_blocks_rotate_left<Torus>(streams, rotated_input, input_bits_b,
rotations, total_nb_bits);
break;
default:
@@ -158,20 +151,20 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
// host_pack bits into one block so that we have
// control_bit|b|a
host_pack_bivariate_blocks<Torus>(
streams, gpu_indexes, gpu_count, mux_inputs, mux_lut->lwe_indexes_out,
rotated_input, input_bits_a, mux_lut->lwe_indexes_in, 2, total_nb_bits,
streams, mux_inputs, mux_lut->lwe_indexes_out, rotated_input,
input_bits_a, mux_lut->lwe_indexes_in, 2, total_nb_bits,
mem->params.message_modulus, mem->params.carry_modulus);
// The shift bit is already properly aligned/positioned
host_add_the_same_block_to_all_blocks<Torus>(
streams[0], gpu_indexes[0], mux_inputs, mux_inputs, &shift_bit,
mem->params.message_modulus, mem->params.carry_modulus);
streams.stream(0), streams.gpu_index(0), mux_inputs, mux_inputs,
&shift_bit, mem->params.message_modulus, mem->params.carry_modulus);
// we have
// control_bit|b|a
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, input_bits_a, mux_inputs, bsks, ksks,
ms_noise_reduction_key, mux_lut, total_nb_bits);
streams, input_bits_a, mux_inputs, bsks, ksks, ms_noise_reduction_key,
mux_lut, total_nb_bits);
}
// Initializes the output
@@ -179,15 +172,15 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
for (int i = 0; i < num_radix_blocks; i++) {
auto last_bit_index = (bits_per_block - 1) + i * bits_per_block;
copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], lwe_array, i, i + 1, input_bits_a,
last_bit_index, last_bit_index + 1);
streams.stream(0), streams.gpu_index(0), lwe_array, i, i + 1,
input_bits_a, last_bit_index, last_bit_index + 1);
}
// Bitshift and add the other bits
for (int i = bits_per_block - 2; i >= 0; i--) {
host_integer_small_scalar_mul_radix<Torus>(
streams, gpu_indexes, gpu_count, lwe_array, lwe_array, 2,
mem->params.message_modulus, mem->params.carry_modulus);
host_integer_small_scalar_mul_radix<Torus>(streams, lwe_array, lwe_array, 2,
mem->params.message_modulus,
mem->params.carry_modulus);
for (int j = 0; j < num_radix_blocks; j++) {
CudaRadixCiphertextFFI block;
CudaRadixCiphertextFFI bit_to_add;
@@ -195,16 +188,16 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
as_radix_ciphertext_slice<Torus>(&bit_to_add, input_bits_a,
i + j * bits_per_block,
i + j * bits_per_block + 1);
host_addition<Torus>(streams[0], gpu_indexes[0], &block, &block,
&bit_to_add, 1, mem->params.message_modulus,
host_addition<Torus>(streams.stream(0), streams.gpu_index(0), &block,
&block, &bit_to_add, 1, mem->params.message_modulus,
mem->params.carry_modulus);
}
// To give back a clean ciphertext
auto cleaning_lut = mem->cleaning_lut;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array, lwe_array, bsks, ksks,
ms_noise_reduction_key, cleaning_lut, num_radix_blocks);
streams, lwe_array, lwe_array, bsks, ksks, ms_noise_reduction_key,
cleaning_lut, num_radix_blocks);
}
}
#endif
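
With an encrypted shift amount, the routine above decomposes both operands into bits and builds a barrel shifter: stage d uses one extracted shift bit to mux between the current bit vector and the same vector rotated by 2^d, and the bits are then repacked into radix blocks. A clear-data analogue of that control structure (hypothetical helper; rotate-left only, and the mux is an ordinary conditional here):

#include <cstdint>

// Barrel rotate-left: each bit of `shift` selects between the running value
// and that value rotated by a power of two, as the mux LUT does above.
static uint32_t rotate_left_barrel(uint32_t value, uint32_t shift, uint32_t width) {
  const uint32_t mask = (width == 32) ? 0xFFFFFFFFu : ((1u << width) - 1);
  value &= mask;
  for (uint32_t d = 0; (1u << d) < width; ++d) {
    const uint32_t r = 1u << d;
    const uint32_t rotated = ((value << r) | (value >> (width - r))) & mask;
    value = ((shift >> d) & 1) ? rotated : value; // one shift bit controls each stage
  }
  return value;
}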

View File

@@ -1,13 +1,13 @@
#include "subtraction.cuh"
uint64_t scratch_cuda_sub_and_propagate_single_carry_kb_64_inplace(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t requested_flag,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, uint32_t requested_flag, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
@@ -15,35 +15,32 @@ uint64_t scratch_cuda_sub_and_propagate_single_carry_kb_64_inplace(
message_modulus, carry_modulus, noise_reduction_type);
return scratch_cuda_sub_and_propagate_single_carry<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_sub_and_propagate<uint64_t> **)mem_ptr, num_blocks, params,
requested_flag, allocate_gpu_memory);
CudaStreams(streams), (int_sub_and_propagate<uint64_t> **)mem_ptr,
num_blocks, params, requested_flag, allocate_gpu_memory);
}
void cuda_sub_and_propagate_single_carry_kb_64_inplace(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *lhs_array, const CudaRadixCiphertextFFI *rhs_array,
CudaRadixCiphertextFFI *carry_out, const CudaRadixCiphertextFFI *carry_in,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lhs_array,
const CudaRadixCiphertextFFI *rhs_array, CudaRadixCiphertextFFI *carry_out,
const CudaRadixCiphertextFFI *carry_in, int8_t *mem_ptr, void *const *bsks,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t requested_flag, uint32_t uses_carry) {
PUSH_RANGE("sub")
host_sub_and_propagate_single_carry<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lhs_array, rhs_array,
carry_out, carry_in, (int_sub_and_propagate<uint64_t> *)mem_ptr, bsks,
(uint64_t **)(ksks), ms_noise_reduction_key, requested_flag, uses_carry);
CudaStreams(streams), lhs_array, rhs_array, carry_out, carry_in,
(int_sub_and_propagate<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key, requested_flag, uses_carry);
POP_RANGE()
}
void cleanup_cuda_sub_and_propagate_single_carry(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
void cleanup_cuda_sub_and_propagate_single_carry(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
PUSH_RANGE("cleanup sub")
int_sub_and_propagate<uint64_t> *mem_ptr =
(int_sub_and_propagate<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)streams, gpu_indexes, gpu_count);
mem_ptr->release(CudaStreams(streams));
POP_RANGE()
delete mem_ptr;
*mem_ptr_void = nullptr;
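The new entry points above take the streams as a single CudaStreamsFFI value and immediately wrap it in CudaStreams. The exact declaration of CudaStreamsFFI is not part of these hunks; the sketch below is an assumption based on the three fields the Rust side fills in later in this diff (streams, gpu_indexes, gpu_count), with the pointer types guessed from the old parameter list.
// Assumed layout of the FFI carrier struct (hypothetical; not shown in these hunks)
struct CudaStreamsFFI {
  void *const *streams;        // one opaque cudaStream_t handle per GPU
  uint32_t const *gpu_indexes; // device ordinal backing each stream
  uint32_t gpu_count;          // number of valid entries in both arrays
};
// e.g. cuda_sub_and_propagate_single_carry_kb_64_inplace(streams_ffi, ...) builds
// CudaStreams(streams_ffi) once and passes that object to the host_* template.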

View File

@@ -14,24 +14,22 @@
template <typename Torus>
uint64_t scratch_cuda_sub_and_propagate_single_carry(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_sub_and_propagate<Torus> **mem_ptr,
CudaStreams streams, int_sub_and_propagate<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params, uint32_t requested_flag,
bool allocate_gpu_memory) {
PUSH_RANGE("scratch sub")
uint64_t size_tracker = 0;
*mem_ptr = new int_sub_and_propagate<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks, requested_flag,
allocate_gpu_memory, size_tracker);
streams, params, num_radix_blocks, requested_flag, allocate_gpu_memory,
size_tracker);
POP_RANGE()
return size_tracker;
}
template <typename Torus>
void host_sub_and_propagate_single_carry(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *lhs_array,
CudaStreams streams, CudaRadixCiphertextFFI *lhs_array,
const CudaRadixCiphertextFFI *rhs_array, CudaRadixCiphertextFFI *carry_out,
const CudaRadixCiphertextFFI *input_carries,
int_sub_and_propagate<Torus> *mem, void *const *bsks, Torus *const *ksks,
@@ -39,24 +37,22 @@ void host_sub_and_propagate_single_carry(
uint32_t requested_flag, uint32_t uses_carry) {
host_integer_radix_negation<Torus>(
streams, gpu_indexes, gpu_count, mem->neg_rhs_array, rhs_array,
mem->params.message_modulus, mem->params.carry_modulus,
mem->neg_rhs_array->num_radix_blocks);
streams, mem->neg_rhs_array, rhs_array, mem->params.message_modulus,
mem->params.carry_modulus, mem->neg_rhs_array->num_radix_blocks);
host_add_and_propagate_single_carry<Torus>(
streams, gpu_indexes, gpu_count, lhs_array, mem->neg_rhs_array, carry_out,
input_carries, mem->sc_prop_mem, bsks, ksks, ms_noise_reduction_key,
requested_flag, uses_carry);
streams, lhs_array, mem->neg_rhs_array, carry_out, input_carries,
mem->sc_prop_mem, bsks, ksks, ms_noise_reduction_key, requested_flag,
uses_carry);
}
template <typename Torus>
__host__ void host_integer_radix_subtraction(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in_1,
CudaRadixCiphertextFFI const *lwe_array_in_2, uint64_t message_modulus,
uint64_t carry_modulus, uint32_t num_radix_blocks) {
cuda_set_device(gpu_indexes[0]);
cuda_set_device(streams.gpu_index(0));
if (lwe_array_out->num_radix_blocks < num_radix_blocks ||
lwe_array_in_1->num_radix_blocks < num_radix_blocks ||
@@ -69,11 +65,11 @@ __host__ void host_integer_radix_subtraction(
PANIC("Cuda error: lwe_array_in and lwe_array_out lwe_dimension must be "
"the same")
host_integer_radix_negation<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in_2,
message_modulus, carry_modulus, num_radix_blocks);
host_addition<Torus>(streams[0], gpu_indexes[0], lwe_array_out, lwe_array_out,
lwe_array_in_1, num_radix_blocks, message_modulus,
carry_modulus);
host_integer_radix_negation<Torus>(streams, lwe_array_out, lwe_array_in_2,
message_modulus, carry_modulus,
num_radix_blocks);
host_addition<Torus>(streams.stream(0), streams.gpu_index(0), lwe_array_out,
lwe_array_out, lwe_array_in_1, num_radix_blocks,
message_modulus, carry_modulus);
}
#endif

View File

@@ -205,8 +205,7 @@ __device__ void mul_ggsw_glwe_in_fourier_domain_2_2_params(
template <typename InputTorus, typename OutputTorus>
void execute_pbs_async(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, const LweArrayVariant<OutputTorus> &lwe_array_out,
CudaStreams streams, const LweArrayVariant<OutputTorus> &lwe_array_out,
const LweArrayVariant<InputTorus> &lwe_output_indexes,
const std::vector<OutputTorus *> lut_vec,
const std::vector<InputTorus *> lut_indexes_vec,
@@ -226,12 +225,12 @@ void execute_pbs_async(
case MULTI_BIT:
PANIC("Error: 32-bit multibit PBS is not supported.\n")
case CLASSICAL:
for (uint i = 0; i < gpu_count; i++) {
int num_inputs_on_gpu =
get_num_inputs_on_gpu(input_lwe_ciphertext_count, i, gpu_count);
for (uint i = 0; i < streams.count(); i++) {
int num_inputs_on_gpu = get_num_inputs_on_gpu(
input_lwe_ciphertext_count, i, streams.count());
int gpu_offset =
get_gpu_offset(input_lwe_ciphertext_count, i, gpu_count);
get_gpu_offset(input_lwe_ciphertext_count, i, streams.count());
auto d_lut_vector_indexes =
lut_indexes_vec[i] + (ptrdiff_t)(gpu_offset);
@@ -246,7 +245,7 @@ void execute_pbs_async(
get_variant_element(lwe_input_indexes, i);
cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
streams[i], gpu_indexes[i], current_lwe_array_out,
streams.stream(i), streams.gpu_index(i), current_lwe_array_out,
current_lwe_output_indexes, lut_vec[i], d_lut_vector_indexes,
current_lwe_array_in, current_lwe_input_indexes,
bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension,
@@ -263,9 +262,9 @@ void execute_pbs_async(
case MULTI_BIT:
if (grouping_factor == 0)
PANIC("Multi-bit PBS error: grouping factor should be > 0.")
for (uint i = 0; i < gpu_count; i++) {
int num_inputs_on_gpu =
get_num_inputs_on_gpu(input_lwe_ciphertext_count, i, gpu_count);
for (uint i = 0; i < streams.count(); i++) {
int num_inputs_on_gpu = get_num_inputs_on_gpu(
input_lwe_ciphertext_count, i, streams.count());
// Use the macro to get the correct elements for the current iteration
// Handles the case when the input/output are scattered through
@@ -278,12 +277,12 @@ void execute_pbs_async(
get_variant_element(lwe_input_indexes, i);
int gpu_offset =
get_gpu_offset(input_lwe_ciphertext_count, i, gpu_count);
get_gpu_offset(input_lwe_ciphertext_count, i, streams.count());
auto d_lut_vector_indexes =
lut_indexes_vec[i] + (ptrdiff_t)(gpu_offset);
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
streams[i], gpu_indexes[i], current_lwe_array_out,
streams.stream(i), streams.gpu_index(i), current_lwe_array_out,
current_lwe_output_indexes, lut_vec[i], d_lut_vector_indexes,
current_lwe_array_in, current_lwe_input_indexes,
bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension,
@@ -292,9 +291,9 @@ void execute_pbs_async(
}
break;
case CLASSICAL:
for (uint i = 0; i < gpu_count; i++) {
int num_inputs_on_gpu =
get_num_inputs_on_gpu(input_lwe_ciphertext_count, i, gpu_count);
for (uint i = 0; i < streams.count(); i++) {
int num_inputs_on_gpu = get_num_inputs_on_gpu(
input_lwe_ciphertext_count, i, streams.count());
// Use the macro to get the correct elements for the current iteration
// Handles the case when the input/output are scattered through
@@ -307,7 +306,7 @@ void execute_pbs_async(
get_variant_element(lwe_input_indexes, i);
int gpu_offset =
get_gpu_offset(input_lwe_ciphertext_count, i, gpu_count);
get_gpu_offset(input_lwe_ciphertext_count, i, streams.count());
auto d_lut_vector_indexes =
lut_indexes_vec[i] + (ptrdiff_t)(gpu_offset);
@@ -316,7 +315,7 @@ void execute_pbs_async(
ms_noise_reduction_key->ptr != nullptr)
zeros = ms_noise_reduction_key->ptr[i];
cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
streams[i], gpu_indexes[i], current_lwe_array_out,
streams.stream(i), streams.gpu_index(i), current_lwe_array_out,
current_lwe_output_indexes, lut_vec[i], d_lut_vector_indexes,
current_lwe_array_in, current_lwe_input_indexes,
bootstrapping_keys[i], ms_noise_reduction_key, zeros, pbs_buffer[i],
@@ -333,9 +332,9 @@ void execute_pbs_async(
case MULTI_BIT:
if (grouping_factor == 0)
PANIC("Multi-bit PBS error: grouping factor should be > 0.")
for (uint i = 0; i < gpu_count; i++) {
int num_inputs_on_gpu =
get_num_inputs_on_gpu(input_lwe_ciphertext_count, i, gpu_count);
for (uint i = 0; i < streams.count(); i++) {
int num_inputs_on_gpu = get_num_inputs_on_gpu(
input_lwe_ciphertext_count, i, streams.count());
// Use the macro to get the correct elements for the current iteration
// Handles the case when the input/output are scattered through
@@ -348,7 +347,7 @@ void execute_pbs_async(
get_variant_element(lwe_input_indexes, i);
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
streams[i], gpu_indexes[i], current_lwe_array_out,
streams.stream(i), streams.gpu_index(i), current_lwe_array_out,
current_lwe_output_indexes, lut_vec[i], current_lwe_array_in,
current_lwe_input_indexes, bootstrapping_keys[i], pbs_buffer[i],
lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
@@ -356,9 +355,9 @@ void execute_pbs_async(
}
break;
case CLASSICAL:
for (uint i = 0; i < gpu_count; i++) {
int num_inputs_on_gpu =
get_num_inputs_on_gpu(input_lwe_ciphertext_count, i, gpu_count);
for (uint i = 0; i < streams.count(); i++) {
int num_inputs_on_gpu = get_num_inputs_on_gpu(
input_lwe_ciphertext_count, i, streams.count());
// Use the macro to get the correct elements for the current iteration
// Handles the case when the input/output are scattered through
@@ -371,7 +370,7 @@ void execute_pbs_async(
get_variant_element(lwe_input_indexes, i);
int gpu_offset =
get_gpu_offset(input_lwe_ciphertext_count, i, gpu_count);
get_gpu_offset(input_lwe_ciphertext_count, i, streams.count());
auto d_lut_vector_indexes =
lut_indexes_vec[i] + (ptrdiff_t)(gpu_offset);
@@ -380,10 +379,11 @@ void execute_pbs_async(
ms_noise_reduction_key->ptr != nullptr)
zeros = ms_noise_reduction_key->ptr[i];
cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
streams[i], gpu_indexes[i], current_lwe_array_out, lut_vec[i],
current_lwe_array_in, bootstrapping_keys[i], ms_noise_reduction_key,
zeros, pbs_buffer[i], lwe_dimension, glwe_dimension,
polynomial_size, base_log, level_count, num_inputs_on_gpu);
streams.stream(i), streams.gpu_index(i), current_lwe_array_out,
lut_vec[i], current_lwe_array_in, bootstrapping_keys[i],
ms_noise_reduction_key, zeros, pbs_buffer[i], lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count,
num_inputs_on_gpu);
}
break;
default:

View File

@@ -39,10 +39,11 @@ int32_t cuda_setup_multi_gpu(int device_0_id) {
return (int32_t)(num_used_gpus);
}
int get_active_gpu_count(int num_inputs, int gpu_count) {
int ceil_div_inputs =
std::max(1, (num_inputs + THRESHOLD_MULTI_GPU - 1) / THRESHOLD_MULTI_GPU);
int active_gpu_count = std::min(ceil_div_inputs, gpu_count);
uint32_t get_active_gpu_count(uint32_t num_inputs, uint32_t gpu_count) {
uint32_t ceil_div_inputs =
std::max((uint32_t)1,
(num_inputs + THRESHOLD_MULTI_GPU - 1) / THRESHOLD_MULTI_GPU);
uint32_t active_gpu_count = std::min(ceil_div_inputs, gpu_count);
return active_gpu_count;
}
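The new helper keeps the same rule as before: one GPU per THRESHOLD_MULTI_GPU inputs, rounded up, and never more than the GPUs available. A small stand-alone illustration follows; the real THRESHOLD_MULTI_GPU value is not shown in this hunk, so 128 below is a placeholder.
#include <algorithm>
#include <cstdint>
static uint32_t active_gpus(uint32_t num_inputs, uint32_t gpu_count) {
  const uint32_t threshold = 128; // placeholder for THRESHOLD_MULTI_GPU
  uint32_t needed = std::max<uint32_t>(1, (num_inputs + threshold - 1) / threshold);
  return std::min(needed, gpu_count);
}
// active_gpus(1, 8)    == 1  -> tiny workloads stay on a single GPU
// active_gpus(300, 8)  == 3  -> ceil(300 / 128) GPUs are activated
// active_gpus(4096, 8) == 8  -> capped at the available GPU count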

View File

@@ -5,19 +5,18 @@
/// Initialize same-size arrays on all active gpus
template <typename Torus>
void multi_gpu_alloc_array_async(cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count, std::vector<Torus *> &dest,
void multi_gpu_alloc_array_async(CudaStreams streams,
std::vector<Torus *> &dest,
uint32_t elements_per_gpu,
uint64_t &size_tracker_on_gpu_0,
bool allocate_gpu_memory) {
dest.resize(gpu_count);
for (uint i = 0; i < gpu_count; i++) {
dest.resize(streams.count());
for (uint i = 0; i < streams.count(); i++) {
uint64_t size_tracker_on_gpu_i = 0;
Torus *d_array = (Torus *)cuda_malloc_with_size_tracking_async(
elements_per_gpu * sizeof(Torus), streams[i], gpu_indexes[i],
size_tracker_on_gpu_i, allocate_gpu_memory);
elements_per_gpu * sizeof(Torus), streams.stream(i),
streams.gpu_index(i), size_tracker_on_gpu_i, allocate_gpu_memory);
dest[i] = d_array;
if (i == 0) {
size_tracker_on_gpu_0 += size_tracker_on_gpu_i;
@@ -26,49 +25,46 @@ void multi_gpu_alloc_array_async(cudaStream_t const *streams,
}
/// Copy an array residing on one GPU to all active gpus
template <typename Torus>
void multi_gpu_copy_array_async(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
std::vector<Torus *> &dest, Torus const *src,
uint32_t elements_per_gpu,
void multi_gpu_copy_array_async(CudaStreams streams, std::vector<Torus *> &dest,
Torus const *src, uint32_t elements_per_gpu,
bool gpu_memory_allocated) {
dest.resize(gpu_count);
for (uint i = 0; i < gpu_count; i++) {
dest.resize(streams.count());
for (uint i = 0; i < streams.count(); i++) {
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
dest[i], src, elements_per_gpu * sizeof(Torus), streams[i],
gpu_indexes[i], gpu_memory_allocated);
dest[i], src, elements_per_gpu * sizeof(Torus), streams.stream(i),
streams.gpu_index(i), gpu_memory_allocated);
}
}
/// Copy an array residing on one CPU to all active gpus
template <typename Torus>
void multi_gpu_copy_array_from_cpu_async(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, std::vector<Torus *> &dest, Torus const *h_src,
uint32_t elements_per_gpu, bool gpu_memory_allocated) {
dest.resize(gpu_count);
for (uint i = 0; i < gpu_count; i++) {
void multi_gpu_copy_array_from_cpu_async(CudaStreams streams,
std::vector<Torus *> &dest,
Torus const *h_src,
uint32_t elements_per_gpu,
bool gpu_memory_allocated) {
for (uint i = 0; i < streams.count(); i++) {
cuda_memcpy_with_size_tracking_async_to_gpu(
dest[i], h_src, elements_per_gpu * sizeof(Torus), streams[i],
gpu_indexes[i], gpu_memory_allocated);
dest[i], h_src, elements_per_gpu * sizeof(Torus), streams.stream(i),
streams.gpu_index(i), gpu_memory_allocated);
}
}
/// Allocates the input/output vector for all devices
/// Also initializes the related indexing to the trivial index
template <typename Torus>
void multi_gpu_alloc_lwe_async(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
std::vector<Torus *> &dest, uint32_t num_inputs,
uint32_t lwe_size,
void multi_gpu_alloc_lwe_async(CudaStreams streams, std::vector<Torus *> &dest,
uint32_t num_inputs, uint32_t lwe_size,
uint64_t &size_tracker_on_gpu_0,
bool allocate_gpu_memory) {
dest.resize(gpu_count);
for (uint i = 0; i < gpu_count; i++) {
dest.resize(streams.count());
for (uint i = 0; i < streams.count(); i++) {
uint64_t size_tracker_on_gpu_i = 0;
auto inputs_on_gpu = std::max(
THRESHOLD_MULTI_GPU, get_num_inputs_on_gpu(num_inputs, i, gpu_count));
auto inputs_on_gpu =
std::max(THRESHOLD_MULTI_GPU,
get_num_inputs_on_gpu(num_inputs, i, streams.count()));
Torus *d_array = (Torus *)cuda_malloc_with_size_tracking_async(
inputs_on_gpu * lwe_size * sizeof(Torus), streams[i], gpu_indexes[i],
size_tracker_on_gpu_i, allocate_gpu_memory);
inputs_on_gpu * lwe_size * sizeof(Torus), streams.stream(i),
streams.gpu_index(i), size_tracker_on_gpu_i, allocate_gpu_memory);
dest[i] = d_array;
if (i == 0) {
size_tracker_on_gpu_0 += size_tracker_on_gpu_i;
@@ -77,8 +73,7 @@ void multi_gpu_alloc_lwe_async(cudaStream_t const *streams,
}
template void multi_gpu_alloc_lwe_async<__uint128_t>(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, std::vector<__uint128_t *> &dest, uint32_t num_inputs,
CudaStreams streams, std::vector<__uint128_t *> &dest, uint32_t num_inputs,
uint32_t lwe_size, uint64_t &size_tracker_on_gpu_0,
bool allocate_gpu_memory);
@@ -87,18 +82,20 @@ template void multi_gpu_alloc_lwe_async<__uint128_t>(
/// index
template <typename Torus>
void multi_gpu_alloc_lwe_many_lut_output_async(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, std::vector<Torus *> &dest, uint32_t num_inputs,
CudaStreams streams, std::vector<Torus *> &dest, uint32_t num_inputs,
uint32_t num_many_lut, uint32_t lwe_size, uint64_t &size_tracker_on_gpu_0,
bool allocate_gpu_memory) {
dest.resize(gpu_count);
for (uint i = 0; i < gpu_count; i++) {
dest.resize(streams.count());
for (uint i = 0; i < streams.count(); i++) {
uint64_t size_tracker = 0;
auto inputs_on_gpu = std::max(
THRESHOLD_MULTI_GPU, get_num_inputs_on_gpu(num_inputs, i, gpu_count));
auto inputs_on_gpu =
std::max(THRESHOLD_MULTI_GPU,
get_num_inputs_on_gpu(num_inputs, i, streams.count()));
Torus *d_array = (Torus *)cuda_malloc_with_size_tracking_async(
num_many_lut * inputs_on_gpu * lwe_size * sizeof(Torus), streams[i],
gpu_indexes[i], size_tracker, allocate_gpu_memory);
num_many_lut * inputs_on_gpu * lwe_size * sizeof(Torus),
streams.stream(i), streams.gpu_index(i), size_tracker,
allocate_gpu_memory);
dest[i] = d_array;
if (i == 0) {
size_tracker_on_gpu_0 += size_tracker;
@@ -141,32 +138,30 @@ __global__ void realign_with_indexes(Torus *d_vector,
/// The output indexing is always the trivial one
/// num_inputs: total num of lwe in src
template <typename Torus>
void multi_gpu_scatter_lwe_async(cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count, std::vector<Torus *> &dest,
Torus const *src, Torus const *d_src_indexes,
void multi_gpu_scatter_lwe_async(CudaStreams streams,
std::vector<Torus *> &dest, Torus const *src,
Torus const *d_src_indexes,
bool is_trivial_index,
std::vector<Torus *> &aligned_vec,
uint32_t max_active_gpu_count,
uint32_t num_inputs, uint32_t lwe_size) {
if (max_active_gpu_count < gpu_count)
if (max_active_gpu_count < streams.count())
PANIC("Cuda error: number of gpus in scatter should be <= number of gpus "
"used to create the lut")
dest.resize(gpu_count);
for (uint i = 0; i < gpu_count; i++) {
auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count);
for (uint i = 0; i < streams.count(); i++) {
auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, streams.count());
auto gpu_offset = 0;
for (uint j = 0; j < i; j++) {
gpu_offset += get_num_inputs_on_gpu(num_inputs, j, gpu_count);
gpu_offset += get_num_inputs_on_gpu(num_inputs, j, streams.count());
}
if (is_trivial_index) {
auto d_dest = dest[i];
auto d_src = src + gpu_offset * lwe_size;
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
d_dest, d_src, inputs_on_gpu * lwe_size * sizeof(Torus), streams[i],
gpu_indexes[i], true);
d_dest, d_src, inputs_on_gpu * lwe_size * sizeof(Torus),
streams.stream(i), streams.gpu_index(i), true);
} else {
if (aligned_vec.size() == 0)
@@ -175,22 +170,24 @@ void multi_gpu_scatter_lwe_async(cudaStream_t const *streams,
if (d_src_indexes == nullptr)
PANIC("Cuda error: source indexes should be initialized!");
cudaEvent_t temp_event2 = cuda_create_event(gpu_indexes[0]);
cuda_set_device(gpu_indexes[0]);
align_with_indexes<Torus><<<inputs_on_gpu, 1024, 0, streams[0]>>>(
cudaEvent_t temp_event2 = cuda_create_event(streams.gpu_index(0));
cuda_set_device(streams.gpu_index(0));
align_with_indexes<Torus><<<inputs_on_gpu, 1024, 0, streams.stream(0)>>>(
aligned_vec[i], (Torus *)src, (Torus *)d_src_indexes + gpu_offset,
lwe_size);
check_cuda_error(cudaGetLastError());
cuda_event_record(temp_event2, streams[0], gpu_indexes[0]);
cuda_stream_wait_event(streams[i], temp_event2, gpu_indexes[i]);
cuda_event_record(temp_event2, streams.stream(0), streams.gpu_index(0));
cuda_stream_wait_event(streams.stream(i), temp_event2,
streams.gpu_index(i));
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
dest[i], aligned_vec[i], inputs_on_gpu * lwe_size * sizeof(Torus),
streams[i], gpu_indexes[i], true);
streams.stream(i), streams.gpu_index(i), true);
cudaEvent_t temp_event = cuda_create_event(gpu_indexes[i]);
cuda_event_record(temp_event, streams[i], gpu_indexes[i]);
cuda_stream_wait_event(streams[0], temp_event, gpu_indexes[0]);
cudaEvent_t temp_event = cuda_create_event(streams.gpu_index(i));
cuda_event_record(temp_event, streams.stream(i), streams.gpu_index(i));
cuda_stream_wait_event(streams.stream(0), temp_event,
streams.gpu_index(0));
}
}
}
@@ -199,18 +196,17 @@ void multi_gpu_scatter_lwe_async(cudaStream_t const *streams,
/// dest_indexes
/// The input indexing should be the trivial one
template <typename Torus>
void multi_gpu_gather_lwe_async(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
Torus *dest, const std::vector<Torus *> &src,
void multi_gpu_gather_lwe_async(CudaStreams streams, Torus *dest,
const std::vector<Torus *> &src,
Torus *d_dest_indexes, bool is_trivial_index,
std::vector<Torus *> &aligned_vec,
uint32_t num_inputs, uint32_t lwe_size) {
for (uint i = 0; i < gpu_count; i++) {
auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count);
for (uint i = 0; i < streams.count(); i++) {
auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, streams.count());
auto gpu_offset = 0;
for (uint j = 0; j < i; j++) {
gpu_offset += get_num_inputs_on_gpu(num_inputs, j, gpu_count);
gpu_offset += get_num_inputs_on_gpu(num_inputs, j, streams.count());
}
if (is_trivial_index) {
@@ -218,29 +214,33 @@ void multi_gpu_gather_lwe_async(cudaStream_t const *streams,
auto d_src = src[i];
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
d_dest, d_src, inputs_on_gpu * lwe_size * sizeof(Torus), streams[i],
gpu_indexes[i], true);
d_dest, d_src, inputs_on_gpu * lwe_size * sizeof(Torus),
streams.stream(i), streams.gpu_index(i), true);
} else {
if (aligned_vec.size() == 0)
PANIC("Cuda error: auxiliary arrays should be setup!");
if (d_dest_indexes == nullptr)
PANIC("Cuda error: destination indexes should be initialized!");
cudaEvent_t temp_event2 = cuda_create_event(gpu_indexes[0]);
cudaEvent_t temp_event2 = cuda_create_event(streams.gpu_index(0));
cuda_event_record(temp_event2, streams[0], gpu_indexes[0]);
cuda_stream_wait_event(streams[i], temp_event2, gpu_indexes[i]);
cuda_event_record(temp_event2, streams.stream(0), streams.gpu_index(0));
cuda_stream_wait_event(streams.stream(i), temp_event2,
streams.gpu_index(i));
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
aligned_vec[i], src[i], inputs_on_gpu * lwe_size * sizeof(Torus),
streams[i], gpu_indexes[i], true);
streams.stream(i), streams.gpu_index(i), true);
cudaEvent_t temp_event3 = cuda_create_event(gpu_indexes[i]);
cuda_event_record(temp_event3, streams[i], gpu_indexes[i]);
cuda_stream_wait_event(streams[0], temp_event3, gpu_indexes[0]);
cuda_set_device(gpu_indexes[0]);
realign_with_indexes<Torus><<<inputs_on_gpu, 1024, 0, streams[0]>>>(
dest, aligned_vec[i], (Torus *)d_dest_indexes + gpu_offset, lwe_size);
cudaEvent_t temp_event3 = cuda_create_event(streams.gpu_index(i));
cuda_event_record(temp_event3, streams.stream(i), streams.gpu_index(i));
cuda_stream_wait_event(streams.stream(0), temp_event3,
streams.gpu_index(0));
cuda_set_device(streams.gpu_index(0));
realign_with_indexes<Torus>
<<<inputs_on_gpu, 1024, 0, streams.stream(0)>>>(
dest, aligned_vec[i], (Torus *)d_dest_indexes + gpu_offset,
lwe_size);
check_cuda_error(cudaGetLastError());
}
}
@@ -250,18 +250,20 @@ void multi_gpu_gather_lwe_async(cudaStream_t const *streams,
/// dest_indexes
/// The input indexing should be the trivial one
template <typename Torus>
void multi_gpu_gather_many_lut_lwe_async(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *dest, const std::vector<Torus *> &src,
Torus *h_dest_indexes, bool is_trivial_index, uint32_t num_inputs,
uint32_t lwe_size, uint32_t num_many_lut) {
void multi_gpu_gather_many_lut_lwe_async(CudaStreams streams, Torus *dest,
const std::vector<Torus *> &src,
Torus *h_dest_indexes,
bool is_trivial_index,
uint32_t num_inputs, uint32_t lwe_size,
uint32_t num_many_lut) {
for (uint lut_id = 0; lut_id < num_many_lut; lut_id++) {
for (uint i = 0; i < gpu_count; i++) {
auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count);
for (uint i = 0; i < streams.count(); i++) {
auto inputs_on_gpu =
get_num_inputs_on_gpu(num_inputs, i, streams.count());
auto gpu_offset = 0;
for (uint j = 0; j < i; j++) {
gpu_offset += get_num_inputs_on_gpu(num_inputs, j, gpu_count);
gpu_offset += get_num_inputs_on_gpu(num_inputs, j, streams.count());
}
if (is_trivial_index) {
@@ -270,8 +272,8 @@ void multi_gpu_gather_many_lut_lwe_async(
auto d_src = src[i] + lut_id * inputs_on_gpu * lwe_size;
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
d_dest, d_src, inputs_on_gpu * lwe_size * sizeof(Torus), streams[i],
gpu_indexes[i], true);
d_dest, d_src, inputs_on_gpu * lwe_size * sizeof(Torus),
streams.stream(i), streams.gpu_index(i), true);
} else {
auto dest_indexes = h_dest_indexes + gpu_offset;
@@ -282,8 +284,8 @@ void multi_gpu_gather_many_lut_lwe_async(
src[i] + j * lwe_size + lut_id * inputs_on_gpu * lwe_size;
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
d_dest, d_src, lwe_size * sizeof(Torus), streams[i],
gpu_indexes[i], true);
d_dest, d_src, lwe_size * sizeof(Torus), streams.stream(i),
streams.gpu_index(i), true);
}
}
}
@@ -291,16 +293,13 @@ void multi_gpu_gather_many_lut_lwe_async(
}
template <typename Torus>
void multi_gpu_release_async(cudaStream_t const *streams,
uint32_t const *gpu_indexes,
std::vector<Torus *> &vec) {
void multi_gpu_release_async(CudaStreams streams, std::vector<Torus *> &vec) {
for (uint i = 0; i < vec.size(); i++)
cuda_drop_async(vec[i], streams[i], gpu_indexes[i]);
cuda_drop_async(vec[i], streams.stream(i), streams.gpu_index(i));
}
template void
multi_gpu_release_async<__uint128_t>(cudaStream_t const *streams,
uint32_t const *gpu_indexes,
multi_gpu_release_async<__uint128_t>(CudaStreams streams,
std::vector<__uint128_t *> &vec);
#endif
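A hedged usage sketch of the helpers above from a hypothetical caller: the surrounding function and buffer names are illustrative only, while the helper signatures and the active_gpu_subset/synchronize methods follow the declarations visible in this commit.
// Hypothetical call site (illustrative only; assumes the backend headers are included)
void example_multi_gpu_roundtrip(CudaStreams streams, uint32_t num_radix_blocks,
                                 uint32_t num_inputs, uint32_t lwe_size) {
  // restrict work to the GPUs actually needed for this many blocks
  CudaStreams active = streams.active_gpu_subset(num_radix_blocks);
  std::vector<uint64_t *> lwe_per_gpu;
  uint64_t size_tracker = 0;
  multi_gpu_alloc_lwe_async<uint64_t>(active, lwe_per_gpu, num_inputs, lwe_size,
                                      size_tracker, /*allocate_gpu_memory=*/true);
  // ... scatter inputs, launch the per-GPU kernels, gather the results ...
  multi_gpu_release_async<uint64_t>(active, lwe_per_gpu);
  active.synchronize();
}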

View File

@@ -1,17 +1,17 @@
#include "zk.cuh"
uint64_t scratch_cuda_expand_without_verification_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension,
uint32_t computing_ks_level, uint32_t computing_ks_base_log,
uint32_t casting_input_dimension, uint32_t casting_output_dimension,
uint32_t casting_ks_level, uint32_t casting_ks_base_log, uint32_t pbs_level,
uint32_t pbs_base_log, uint32_t grouping_factor,
const uint32_t *num_lwes_per_compact_list, const bool *is_boolean_array,
uint32_t num_compact_lists, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, KS_TYPE casting_key_type,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t computing_ks_level,
uint32_t computing_ks_base_log, uint32_t casting_input_dimension,
uint32_t casting_output_dimension, uint32_t casting_ks_level,
uint32_t casting_ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, const uint32_t *num_lwes_per_compact_list,
const bool *is_boolean_array, uint32_t num_compact_lists,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
KS_TYPE casting_key_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type) {
// Since the CUDA backend works with the concept of "big" and "small" keys, instead
// of "input" and "output", we need to do this, otherwise our PBS will throw
@@ -34,17 +34,16 @@ uint64_t scratch_cuda_expand_without_verification_64(
noise_reduction_type);
return scratch_cuda_expand_without_verification<uint64_t>(
(cudaStream_t *)streams, gpu_indexes, gpu_count,
CudaStreams(streams),
reinterpret_cast<zk_expand_mem<uint64_t> **>(mem_ptr),
num_lwes_per_compact_list, is_boolean_array, num_compact_lists,
computing_params, casting_params, casting_key_type, allocate_gpu_memory);
}
void cuda_expand_without_verification_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, const void *lwe_flattened_compact_array_in,
int8_t *mem_ptr, void *const *bsks, void *const *computing_ksks,
void *const *casting_keys,
CudaStreamsFFI streams, void *lwe_array_out,
const void *lwe_flattened_compact_array_in, int8_t *mem_ptr,
void *const *bsks, void *const *computing_ksks, void *const *casting_keys,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
auto expand_buffer = reinterpret_cast<zk_expand_mem<uint64_t> *>(mem_ptr);
@@ -52,56 +51,49 @@ void cuda_expand_without_verification_64(
switch (expand_buffer->casting_params.big_lwe_dimension) {
case 256:
host_expand_without_verification<uint64_t, AmortizedDegree<256>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array_out),
streams, static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_flattened_compact_array_in),
expand_buffer, (uint64_t **)casting_keys, bsks,
(uint64_t **)(computing_ksks), ms_noise_reduction_key);
break;
case 512:
host_expand_without_verification<uint64_t, AmortizedDegree<512>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array_out),
streams, static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_flattened_compact_array_in),
expand_buffer, (uint64_t **)casting_keys, bsks,
(uint64_t **)(computing_ksks), ms_noise_reduction_key);
break;
case 1024:
host_expand_without_verification<uint64_t, AmortizedDegree<1024>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array_out),
streams, static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_flattened_compact_array_in),
expand_buffer, (uint64_t **)casting_keys, bsks,
(uint64_t **)(computing_ksks), ms_noise_reduction_key);
break;
case 2048:
host_expand_without_verification<uint64_t, AmortizedDegree<2048>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array_out),
streams, static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_flattened_compact_array_in),
expand_buffer, (uint64_t **)casting_keys, bsks,
(uint64_t **)(computing_ksks), ms_noise_reduction_key);
break;
case 4096:
host_expand_without_verification<uint64_t, AmortizedDegree<4096>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array_out),
streams, static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_flattened_compact_array_in),
expand_buffer, (uint64_t **)casting_keys, bsks,
(uint64_t **)(computing_ksks), ms_noise_reduction_key);
break;
case 8192:
host_expand_without_verification<uint64_t, AmortizedDegree<8192>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array_out),
streams, static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_flattened_compact_array_in),
expand_buffer, (uint64_t **)casting_keys, bsks,
(uint64_t **)(computing_ksks), ms_noise_reduction_key);
break;
case 16384:
host_expand_without_verification<uint64_t, AmortizedDegree<16384>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array_out),
streams, static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_flattened_compact_array_in),
expand_buffer, (uint64_t **)casting_keys, bsks,
(uint64_t **)(computing_ksks), ms_noise_reduction_key);
@@ -114,14 +106,12 @@ void cuda_expand_without_verification_64(
}
}
void cleanup_expand_without_verification_64(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
void cleanup_expand_without_verification_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
zk_expand_mem<uint64_t> *mem_ptr =
reinterpret_cast<zk_expand_mem<uint64_t> *>(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
*mem_ptr_void = nullptr;
}

View File

@@ -17,8 +17,7 @@
template <typename Torus, class params>
__host__ void host_expand_without_verification(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out,
CudaStreams streams, Torus *lwe_array_out,
const Torus *lwe_flattened_compact_array_in, zk_expand_mem<Torus> *mem_ptr,
Torus *const *casting_keys, void *const *bsks, Torus *const *compute_ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
@@ -51,13 +50,13 @@ __host__ void host_expand_without_verification(
output_index++;
}
}
cuda_memcpy_with_size_tracking_async_to_gpu(d_expand_jobs, h_expand_jobs,
compact_lwe_lists.total_num_lwes *
sizeof(expand_job<Torus>),
streams[0], gpu_indexes[0], true);
cuda_memcpy_with_size_tracking_async_to_gpu(
d_expand_jobs, h_expand_jobs,
compact_lwe_lists.total_num_lwes * sizeof(expand_job<Torus>),
streams.stream(0), streams.gpu_index(0), true);
host_lwe_expand<Torus, params>(streams[0], gpu_indexes[0], expanded_lwes,
d_expand_jobs, num_lwes);
host_lwe_expand<Torus, params>(streams.stream(0), streams.gpu_index(0),
expanded_lwes, d_expand_jobs, num_lwes);
auto ksks = casting_keys;
auto lwe_array_input = expanded_lwes;
@@ -78,7 +77,7 @@ __host__ void host_expand_without_verification(
// apply keyswitch to BIG
execute_keyswitch_async<Torus>(
streams, gpu_indexes, 1, ksed_small_to_big_expanded_lwes,
streams.subset_first_gpu(), ksed_small_to_big_expanded_lwes,
lwe_trivial_indexes_vec[0], expanded_lwes, lwe_trivial_indexes_vec[0],
casting_keys, casting_input_dimension, casting_output_dimension,
casting_ks_base_log, casting_ks_level, num_lwes);
@@ -91,20 +90,19 @@ __host__ void host_expand_without_verification(
// Apply LUT
cuda_memset_async(lwe_array_out, 0,
(lwe_dimension + 1) * num_lwes * 2 * sizeof(Torus),
streams[0], gpu_indexes[0]);
streams.stream(0), streams.gpu_index(0));
auto output = new CudaRadixCiphertextFFI;
into_radix_ciphertext(output, lwe_array_out, 2 * num_lwes, lwe_dimension);
auto input = new CudaRadixCiphertextFFI;
into_radix_ciphertext(input, lwe_array_input, 2 * num_lwes, lwe_dimension);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, output, input, bsks, ksks,
ms_noise_reduction_key, message_and_carry_extract_luts, 2 * num_lwes);
streams, output, input, bsks, ksks, ms_noise_reduction_key,
message_and_carry_extract_luts, 2 * num_lwes);
}
template <typename Torus>
__host__ uint64_t scratch_cuda_expand_without_verification(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, zk_expand_mem<Torus> **mem_ptr,
CudaStreams streams, zk_expand_mem<Torus> **mem_ptr,
const uint32_t *num_lwes_per_compact_list, const bool *is_boolean_array,
uint32_t num_compact_lists, int_radix_params computing_params,
int_radix_params casting_params, KS_TYPE casting_key_type,
@@ -112,9 +110,9 @@ __host__ uint64_t scratch_cuda_expand_without_verification(
uint64_t size_tracker = 0;
*mem_ptr = new zk_expand_mem<Torus>(
streams, gpu_indexes, gpu_count, computing_params, casting_params,
casting_key_type, num_lwes_per_compact_list, is_boolean_array,
num_compact_lists, allocate_gpu_memory, size_tracker);
streams, computing_params, casting_params, casting_key_type,
num_lwes_per_compact_list, is_boolean_array, num_compact_lists,
allocate_gpu_memory, size_tracker);
return size_tracker;
}

View File

@@ -1,9 +1,9 @@
#include "device.h"
#include "pbs/pbs_utilities.h"
#include "pbs/programmable_bootstrap.h"
#include "utils.h"
#include "gtest/gtest.h"
#include <cstdint>
#include <device.h>
#include <functional>
#include <random>
#include <setup_and_teardown.h>

View File

@@ -1,3 +1,4 @@
#include "device.h"
#include "helper_multi_gpu.h"
#include <cmath>
#include <cstdint>

View File

@@ -1,3 +1,4 @@
#include "device.h"
#include <cmath>
#include <cstdint>
#include <cstdio>

File diff suppressed because it is too large

View File

@@ -109,6 +109,14 @@ impl CudaStreams {
// The cast here is safe as GpuIndex is repr(transparent)
self.gpu_indexes.as_ptr().cast()
}
pub fn ffi(&self) -> CudaStreamsFFI {
CudaStreamsFFI {
streams: self.ptr.as_ptr(),
gpu_indexes: self.gpu_indexes_ptr(),
gpu_count: self.len() as u32,
}
}
}
impl Clone for CudaStreams {

File diff suppressed because it is too large