perf(backend-cuda): Update cuda backend to the latest tfhe-rs version (0.10.0)

Bourgerie Quentin
2024-09-24 15:05:51 +00:00
committed by Quentin Bourgerie
parent 9a85d33c5b
commit ccf491e0a1
100 changed files with 183 additions and 27993 deletions

.gitmodules

@@ -6,3 +6,6 @@
[submodule "lattice-estimator"]
path = third_party/lattice-estimator
url = https://github.com/malb/lattice-estimator
[submodule "third_party/tfhe-rs"]
path = third_party/tfhe-rs
url = https://github.com/zama-ai/tfhe-rs.git


@@ -4,6 +4,7 @@ autofix: false
# list of paths to ignore, uses gitignore syntaxes (executes before any rule)
ignore:
- compilers/concrete-compiler/llvm-project
- backends/concrete-cuda/implementation
rules:
# checks if file ends in a newline character


@@ -0,0 +1 @@
../../third_party/tfhe-rs/backends/tfhe-cuda-backend/cuda/


@@ -1,86 +0,0 @@
cmake_minimum_required(VERSION 3.24 FATAL_ERROR)
project(concrete_cuda LANGUAGES CXX CUDA)
# Check that a CUDA toolchain meeting the minimum supported version is available; if not, fail the configuration.
set(MINIMUM_SUPPORTED_CUDA_VERSION 10.0)
include(CheckLanguage)
# See if CUDA is available
check_language(CUDA)
# If so, enable CUDA to check the version.
if(CMAKE_CUDA_COMPILER)
enable_language(CUDA)
endif()
# If CUDA is not available, or the minimum version is too low do not build
if(NOT CMAKE_CUDA_COMPILER)
message(FATAL_ERROR "Cuda compiler not found.")
endif()
if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS ${MINIMUM_SUPPORTED_CUDA_VERSION})
message(FATAL_ERROR "CUDA ${MINIMUM_SUPPORTED_CUDA_VERSION} or greater is required for compilation.")
endif()
# Get CUDA compute capability
set(OUTPUTFILE ${CMAKE_CURRENT_SOURCE_DIR}/cuda_script) # No suffix required
set(CUDAFILE ${CMAKE_CURRENT_SOURCE_DIR}/check_cuda.cu)
execute_process(COMMAND nvcc -lcuda ${CUDAFILE} -o ${OUTPUTFILE})
execute_process(
COMMAND ${OUTPUTFILE}
RESULT_VARIABLE CUDA_RETURN_CODE
OUTPUT_VARIABLE ARCH)
file(REMOVE ${OUTPUTFILE})
if(${CUDA_RETURN_CODE} EQUAL 0)
set(CUDA_SUCCESS "TRUE")
else()
set(CUDA_SUCCESS "FALSE")
endif()
if(${CUDA_SUCCESS})
message(STATUS "CUDA Architecture: ${ARCH}")
message(STATUS "CUDA Version: ${CUDA_VERSION_STRING}")
message(STATUS "CUDA Path: ${CUDA_TOOLKIT_ROOT_DIR}")
message(STATUS "CUDA Libraries: ${CUDA_LIBRARIES}")
message(STATUS "CUDA Performance Primitives: ${CUDA_npp_LIBRARY}")
set(CUDA_NVCC_FLAGS "${ARCH}")
# add_definitions(-DGPU) #You may not require this
else()
message(WARNING ${ARCH})
endif()
if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release)
endif()
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g")
if(NOT CUDA_NVCC_FLAGS)
set(CUDA_NVCC_FLAGS -arch=sm_70)
endif()
# In production, use -arch=sm_70; add --ptxas-options=-v to see register spills and -lineinfo for better debugging
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -ccbin ${CMAKE_CXX_COMPILER} -O3 ${CUDA_NVCC_FLAGS} \
-std=c++17 --no-exceptions --expt-relaxed-constexpr -rdc=true --use_fast_math -Xcompiler -fPIC")
set(INCLUDE_DIR include)
add_subdirectory(src)
add_subdirectory(test_and_benchmark)
target_include_directories(concrete_cuda PRIVATE ${INCLUDE_DIR})
# This is required for rust cargo build
install(TARGETS concrete_cuda DESTINATION .)
install(TARGETS concrete_cuda DESTINATION lib)
# Define a function to add a lint target.
find_file(CPPLINT NAMES cpplint cpplint.exe)
if(CPPLINT)
# Add a custom target to lint all child projects. Dependencies are specified in child projects.
add_custom_target(all_lint)
# Don't trigger this target on ALL_BUILD or Visual Studio 'Rebuild Solution'
set_target_properties(all_lint PROPERTIES EXCLUDE_FROM_ALL TRUE)
# set_target_properties(all_lint PROPERTIES EXCLUDE_FROM_DEFAULT_BUILD TRUE)
endif()
enable_testing()


@@ -1,3 +0,0 @@
set noparent
linelength=240
filter=-legal/copyright,-readability/todo,-runtime/references,-build/c++17


@@ -1,53 +0,0 @@
# Concrete Cuda
## Introduction
Concrete-cuda holds the code for GPU acceleration of Zama's variant of TFHE.
It is one of the backends of the Concrete Compiler.
It implements CUDA/C++ functions to perform homomorphic operations on LWE ciphertexts.
It provides functions to allocate memory on the GPU, to copy data back
and forth between the CPU and the GPU, to create and destroy Cuda streams, etc.:
- `cuda_create_stream`, `cuda_destroy_stream`
- `cuda_malloc`, `cuda_check_valid_malloc`
- `cuda_memcpy_async_to_cpu`, `cuda_memcpy_async_to_gpu`
- `cuda_get_number_of_gpus`
- `cuda_synchronize_device`
The cryptographic operations it provides are:
- an amortized implementation of the TFHE programmable bootstrap: `cuda_bootstrap_amortized_lwe_ciphertext_vector_32` and `cuda_bootstrap_amortized_lwe_ciphertext_vector_64`
- a low-latency implementation of the TFHE programmable bootstrap: `cuda_bootstrap_low_latency_lwe_ciphertext_vector_32` and `cuda_bootstrap_low_latency_lwe_ciphertext_vector_64`
- the keyswitch: `cuda_keyswitch_lwe_ciphertext_vector_32` and `cuda_keyswitch_lwe_ciphertext_vector_64`
- the larger precision programmable bootstrap (wop PBS, which supports up to 16 bits of message while the classical PBS only supports up to 8 bits of message) and its sub-components: `cuda_wop_pbs_64`, `cuda_extract_bits_64`, `cuda_circuit_bootstrap_64`, `cuda_cmux_tree_64`, `cuda_blind_rotation_sample_extraction_64`
- acceleration for leveled operations: `cuda_negate_lwe_ciphertext_vector_64`, `cuda_add_lwe_ciphertext_vector_64`, `cuda_add_lwe_ciphertext_vector_plaintext_vector_64`, `cuda_mult_lwe_ciphertext_vector_cleartext_vector`.
## Dependencies
**Disclaimer**: Compilation on Windows/Mac is not supported yet. Only Nvidia GPUs are supported.
<!-- markdown-link-check-disable-next-line -->
- nvidia driver - for example, if you're running Ubuntu 20.04 check this [page](https://linuxconfig.org/how-to-install-the-nvidia-drivers-on-ubuntu-20-04-focal-fossa-linux) for installation
- [nvcc](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html) >= 10.0
- [gcc](https://gcc.gnu.org/) >= 8.0 - check this [page](https://gist.github.com/ax3l/9489132) for more details about nvcc/gcc compatible versions
- [cmake](https://cmake.org/) >= 3.24
## Build
The CUDA project in `concrete-cuda` can be compiled independently of the rest of
Concrete as follows:
```
git clone git@github.com:zama-ai/concrete
cd concrete/backends/concrete-cuda/implementation
mkdir build
cd build
cmake ..
make
```
The compute capability is detected automatically (from the properties of the first GPU) and set accordingly.
## Links
- [TFHE](https://eprint.iacr.org/2018/421.pdf)
## License
This software is distributed under the BSD-3-Clause-Clear license. If you have any questions,
please contact us at `hello@zama.ai`.
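For context, a minimal sketch of how the utility API listed in the introduction above was typically driven from host code, assuming the backend's `device.h` header removed later in this diff is included (the buffer contents and sizes are illustrative, and error codes are ignored for brevity):
```
#include <cstdint>
#include <vector>
#include "device.h" // GPU memory/stream helpers from this backend

int main() {
  if (cuda_get_number_of_gpus() < 1)
    return 1; // no usable CUDA device
  uint32_t gpu_index = 0;
  cudaStream_t *stream = cuda_create_stream(gpu_index);
  // Illustrative host buffer of 1024 64-bit words.
  std::vector<uint64_t> data(1024, 0);
  uint64_t size = data.size() * sizeof(uint64_t);
  void *d_data = cuda_malloc(size, gpu_index);
  cuda_memcpy_async_to_gpu(d_data, data.data(), size, stream, gpu_index);
  // ... homomorphic operations on d_data would be launched here ...
  cuda_memcpy_async_to_cpu(data.data(), d_data, size, stream, gpu_index);
  cuda_synchronize_device(gpu_index);
  cuda_drop(d_data, gpu_index);
  cuda_destroy_stream(stream, gpu_index);
  return 0;
}
```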


@@ -1,22 +0,0 @@
#include <stdio.h>
int main(int argc, char **argv) {
cudaDeviceProp dP;
float min_cc = 3.0;
int rc = cudaGetDeviceProperties(&dP, 0);
if (rc != cudaSuccess) {
cudaError_t error = cudaGetLastError();
printf("CUDA error: %s", cudaGetErrorString(error));
return rc; /* Failure */
}
if ((dP.major + (dP.minor / 10.0)) < min_cc) {
printf("Min Compute Capability of %2.1f required: %d.%d found\n Not "
"Building CUDA Code",
min_cc, dP.major, dP.minor);
return 1; /* Failure */
} else {
printf("-arch=sm_%d%d", dP.major, dP.minor);
return 0; /* Success */
}
}


@@ -1,7 +0,0 @@
#!/bin/bash
find ./{include,src,test_and_benchmark} -iregex '^.*\.\(cpp\|cu\|h\|cuh\)$' -print | xargs clang-format-11 -i -style='file'
cmake-format -i CMakeLists.txt -c ../../../compilers/concrete-compiler/compiler/.cmake-format-config.py
find ./{include,src,test_and_benchmark} -type f -name "CMakeLists.txt" | xargs -I % sh -c 'cmake-format -i % -c ../../../compilers/concrete-compiler/compiler/.cmake-format-config.py'


@@ -1,48 +0,0 @@
#ifndef CUDA_BIT_EXTRACT_H
#define CUDA_BIT_EXTRACT_H
#include <cstdint>
extern "C" {
void scratch_cuda_extract_bits_32(
void *v_stream, uint32_t gpu_index, int8_t **bit_extract_buffer,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t crt_decomposition_size,
uint32_t max_shared_memory, bool allocate_gpu_memory);
void scratch_cuda_extract_bits_64(
void *v_stream, uint32_t gpu_index, int8_t **bit_extract_buffer,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t crt_decomposition_size,
uint32_t max_shared_memory, bool allocate_gpu_memory);
void cuda_extract_bits_32(void *v_stream, uint32_t gpu_index,
void *list_lwe_array_out, void *lwe_array_in,
int8_t *bit_extract_buffer, void *ksk,
void *fourier_bsk, uint32_t *number_of_bits_array,
uint32_t *delta_log_array, uint32_t lwe_dimension_in,
uint32_t lwe_dimension_out, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log_bsk,
uint32_t level_count_bsk, uint32_t base_log_ksk,
uint32_t level_count_ksk,
uint32_t crt_decomposition_size,
uint32_t max_shared_memory);
void cuda_extract_bits_64(void *v_stream, uint32_t gpu_index,
void *list_lwe_array_out, void *lwe_array_in,
int8_t *bit_extract_buffer, void *ksk,
void *fourier_bsk, uint32_t *number_of_bits_array,
uint32_t *delta_log_array, uint32_t lwe_dimension_in,
uint32_t lwe_dimension_out, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log_bsk,
uint32_t level_count_bsk, uint32_t base_log_ksk,
uint32_t level_count_ksk,
uint32_t crt_decomposition_size,
uint32_t max_shared_memory);
void cleanup_cuda_extract_bits(void *v_stream, uint32_t gpu_index,
int8_t **bit_extract_buffer);
}
#endif // CUDA_BIT_EXTRACT_H


@@ -1,74 +0,0 @@
#ifndef CUDA_BOOLEAN_GATES_H
#define CUDA_BOOLEAN_GATES_H
#include <cstdint>
extern "C" {
void cuda_boolean_not_32(void *v_stream, uint32_t gpu_index,
void *lwe_array_out, void *lwe_array_in,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
void cuda_boolean_and_32(void *v_stream, uint32_t gpu_index,
void *lwe_array_out, void *lwe_array_in_1,
void *lwe_array_in_2, void *bootstrapping_key,
void *ksk, uint32_t input_lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t pbs_base_log, uint32_t pbs_level_count,
uint32_t ks_base_log, uint32_t ks_level_count,
uint32_t input_lwe_ciphertext_count,
uint32_t max_shared_memory);
void cuda_boolean_nand_32(void *v_stream, uint32_t gpu_index,
void *lwe_array_out, void *lwe_array_in_1,
void *lwe_array_in_2, void *bootstrapping_key,
void *ksk, uint32_t input_lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t pbs_base_log, uint32_t pbs_level_count,
uint32_t ks_base_log, uint32_t ks_level_count,
uint32_t input_lwe_ciphertext_count,
uint32_t max_shared_memory);
void cuda_boolean_nor_32(void *v_stream, uint32_t gpu_index,
void *lwe_array_out, void *lwe_array_in_1,
void *lwe_array_in_2, void *bootstrapping_key,
void *ksk, uint32_t input_lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t pbs_base_log, uint32_t pbs_level_count,
uint32_t ks_base_log, uint32_t ks_level_count,
uint32_t input_lwe_ciphertext_count,
uint32_t max_shared_memory);
void cuda_boolean_or_32(void *v_stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_array_in_1, void *lwe_array_in_2,
void *bootstrapping_key, void *ksk,
uint32_t input_lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t pbs_base_log,
uint32_t pbs_level_count, uint32_t ks_base_log,
uint32_t ks_level_count,
uint32_t input_lwe_ciphertext_count,
uint32_t max_shared_memory);
void cuda_boolean_xor_32(void *v_stream, uint32_t gpu_index,
void *lwe_array_out, void *lwe_array_in_1,
void *lwe_array_in_2, void *bootstrapping_key,
void *ksk, uint32_t input_lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t pbs_base_log, uint32_t pbs_level_count,
uint32_t ks_base_log, uint32_t ks_level_count,
uint32_t input_lwe_ciphertext_count,
uint32_t max_shared_memory);
void cuda_boolean_xnor_32(void *v_stream, uint32_t gpu_index,
void *lwe_array_out, void *lwe_array_in_1,
void *lwe_array_in_2, void *bootstrapping_key,
void *ksk, uint32_t input_lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t pbs_base_log, uint32_t pbs_level_count,
uint32_t ks_base_log, uint32_t ks_level_count,
uint32_t input_lwe_ciphertext_count,
uint32_t max_shared_memory);
}
#endif // CUDA_BOOLEAN_GATES_H


@@ -1,184 +0,0 @@
#ifndef CUDA_BOOTSTRAP_H
#define CUDA_BOOTSTRAP_H
#include <cstdint>
extern "C" {
void cuda_fourier_polynomial_mul(void *input1, void *input2, void *output,
void *v_stream, uint32_t gpu_index,
uint32_t polynomial_size,
uint32_t total_polynomials);
void cuda_convert_lwe_bootstrap_key_32(void *dest, void *src, void *v_stream,
uint32_t gpu_index,
uint32_t input_lwe_dim,
uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size);
void cuda_convert_lwe_bootstrap_key_64(void *dest, void *src, void *v_stream,
uint32_t gpu_index,
uint32_t input_lwe_dim,
uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size);
void scratch_cuda_bootstrap_amortized_32(void *v_stream, uint32_t gpu_index,
int8_t **pbs_buffer,
uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t input_lwe_ciphertext_count,
uint32_t max_shared_memory,
bool allocate_gpu_memory);
void scratch_cuda_bootstrap_amortized_64(void *v_stream, uint32_t gpu_index,
int8_t **pbs_buffer,
uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t input_lwe_ciphertext_count,
uint32_t max_shared_memory,
bool allocate_gpu_memory);
void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lut_vector,
void *lut_vector_indexes, void *lwe_array_in, void *bootstrapping_key,
int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
uint32_t max_shared_memory);
void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lut_vector,
void *lut_vector_indexes, void *lwe_array_in, void *bootstrapping_key,
int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
uint32_t max_shared_memory);
void cleanup_cuda_bootstrap_amortized(void *v_stream, uint32_t gpu_index,
int8_t **pbs_buffer);
void scratch_cuda_bootstrap_low_latency_32(
void *v_stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);
void scratch_cuda_bootstrap_low_latency_64(
void *v_stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);
void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lut_vector,
void *lut_vector_indexes, void *lwe_array_in, void *bootstrapping_key,
int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
uint32_t max_shared_memory);
void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lut_vector,
void *lut_vector_indexes, void *lwe_array_in, void *bootstrapping_key,
int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
uint32_t max_shared_memory);
void cleanup_cuda_bootstrap_low_latency(void *v_stream, uint32_t gpu_index,
int8_t **pbs_buffer);
void scratch_cuda_circuit_bootstrap_vertical_packing_32(
void *v_stream, uint32_t gpu_index, int8_t **cbs_vp_buffer,
uint32_t *cbs_delta_log, uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t level_count_cbs,
uint32_t number_of_inputs, uint32_t tau, uint32_t max_shared_memory,
bool allocate_gpu_memory);
void scratch_cuda_circuit_bootstrap_vertical_packing_64(
void *v_stream, uint32_t gpu_index, int8_t **cbs_vp_buffer,
uint32_t *cbs_delta_log, uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t level_count_cbs,
uint32_t number_of_inputs, uint32_t tau, uint32_t max_shared_memory,
bool allocate_gpu_memory);
void scratch_cuda_wop_pbs_32(void *v_stream, uint32_t gpu_index,
int8_t **wop_pbs_buffer, uint32_t *delta_log_array,
uint32_t *cbs_delta_log, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t level_count_cbs, uint32_t level_count_bsk,
uint32_t *number_of_bits_to_extract_array,
uint32_t crt_decomposition_size,
uint32_t max_shared_memory,
bool allocate_gpu_memory);
void scratch_cuda_wop_pbs_64(void *v_stream, uint32_t gpu_index,
int8_t **wop_pbs_buffer, uint32_t *delta_log_array,
uint32_t *cbs_delta_log, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t level_count_cbs, uint32_t level_count_bsk,
uint32_t *number_of_bits_to_extract_array,
uint32_t crt_decomposition_size,
uint32_t max_shared_memory,
bool allocate_gpu_memory);
void cuda_circuit_bootstrap_vertical_packing_64(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
void *fourier_bsk, void *cbs_fpksk, void *lut_vector, int8_t *cbs_vp_buffer,
uint32_t cbs_delta_log, uint32_t polynomial_size, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t level_count_bsk, uint32_t base_log_bsk,
uint32_t level_count_pksk, uint32_t base_log_pksk, uint32_t level_count_cbs,
uint32_t base_log_cbs, uint32_t number_of_inputs, uint32_t lut_number,
uint32_t max_shared_memory);
void cuda_wop_pbs_64(void *v_stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_array_in, void *lut_vector, void *fourier_bsk,
void *ksk, void *cbs_fpksk, int8_t *wop_pbs_buffer,
uint32_t cbs_delta_log, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t base_log_bsk, uint32_t level_count_bsk,
uint32_t base_log_ksk, uint32_t level_count_ksk,
uint32_t base_log_pksk, uint32_t level_count_pksk,
uint32_t base_log_cbs, uint32_t level_count_cbs,
uint32_t *number_of_bits_to_extract_array,
uint32_t *delta_log_array, uint32_t crt_decomposition_size,
uint32_t max_shared_memory);
void cleanup_cuda_wop_pbs(void *v_stream, uint32_t gpu_index,
int8_t **wop_pbs_buffer);
void cleanup_cuda_circuit_bootstrap_vertical_packing(void *v_stream,
uint32_t gpu_index,
int8_t **cbs_vp_buffer);
uint64_t get_buffer_size_bootstrap_amortized_64(
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory);
uint64_t get_buffer_size_bootstrap_low_latency_64(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory);
}
#ifdef __CUDACC__
__device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
int glwe_dimension,
uint32_t level_count);
template <typename T>
__device__ T *get_ith_mask_kth_block(T *ptr, int i, int k, int level,
uint32_t polynomial_size,
int glwe_dimension, uint32_t level_count);
template <typename T>
__device__ T *get_ith_body_kth_block(T *ptr, int i, int k, int level,
uint32_t polynomial_size,
int glwe_dimension, uint32_t level_count);
template <typename T>
__device__ T *get_multi_bit_ith_lwe_gth_group_kth_block(
T *ptr, int g, int i, int k, int level, uint32_t grouping_factor,
uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t level_count);
#endif
#endif // CUDA_BOOTSTRAP_H


@@ -1,42 +0,0 @@
#ifndef CUDA_MULTI_BIT_H
#define CUDA_MULTI_BIT_H
#include <cstdint>
extern "C" {
void cuda_convert_lwe_multi_bit_bootstrap_key_64(
void *dest, void *src, void *v_stream, uint32_t gpu_index,
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size, uint32_t grouping_factor);
void cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lut_vector,
void *lut_vector_indexes, void *lwe_array_in, void *bootstrapping_key,
int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_lut_vectors,
uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t chunk_size = 0);
void scratch_cuda_multi_bit_pbs_64(
void *v_stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory, uint32_t chunk_size = 0);
void cleanup_cuda_multi_bit_pbs(void *v_stream, uint32_t gpu_index,
int8_t **pbs_buffer);
}
#ifdef __CUDACC__
__host__ uint32_t get_lwe_chunk_size(uint32_t lwe_dimension,
uint32_t level_count,
uint32_t glwe_dimension,
uint32_t num_samples);
__host__ uint64_t get_max_buffer_size_multibit_bootstrap(
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t max_input_lwe_ciphertext_count);
#endif
#endif // CUDA_MULTI_BIT_H


@@ -1,18 +0,0 @@
#ifndef CUDA_CIPHERTEXT_H
#define CUDA_CIPHERTEXT_H
#include <cstdint>
extern "C" {
void cuda_convert_lwe_ciphertext_vector_to_gpu_64(void *dest, void *src,
void *v_stream,
uint32_t gpu_index,
uint32_t number_of_cts,
uint32_t lwe_dimension);
void cuda_convert_lwe_ciphertext_vector_to_cpu_64(void *dest, void *src,
void *v_stream,
uint32_t gpu_index,
uint32_t number_of_cts,
uint32_t lwe_dimension);
};
#endif


@@ -1,44 +0,0 @@
#ifndef CUDA_CIRCUIT_BOOTSTRAP_H
#define CUDA_CIRCUIT_BOOTSTRAP_H
#include <cstdint>
extern "C" {
void scratch_cuda_circuit_bootstrap_32(
void *v_stream, uint32_t gpu_index, int8_t **cbs_buffer,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t level_count_bsk, uint32_t level_count_cbs,
uint32_t number_of_inputs, uint32_t max_shared_memory,
bool allocate_gpu_memory);
void scratch_cuda_circuit_bootstrap_64(
void *v_stream, uint32_t gpu_index, int8_t **cbs_buffer,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t level_count_bsk, uint32_t level_count_cbs,
uint32_t number_of_inputs, uint32_t max_shared_memory,
bool allocate_gpu_memory);
void cuda_circuit_bootstrap_32(
void *v_stream, uint32_t gpu_index, void *ggsw_out, void *lwe_array_in,
void *fourier_bsk, void *fp_ksk_array, void *lut_vector_indexes,
int8_t *cbs_buffer, uint32_t delta_log, uint32_t polynomial_size,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t level_bsk,
uint32_t base_log_bsk, uint32_t level_pksk, uint32_t base_log_pksk,
uint32_t level_cbs, uint32_t base_log_cbs, uint32_t number_of_inputs,
uint32_t max_shared_memory);
void cuda_circuit_bootstrap_64(
void *v_stream, uint32_t gpu_index, void *ggsw_out, void *lwe_array_in,
void *fourier_bsk, void *fp_ksk_array, void *lut_vector_indexes,
int8_t *cbs_buffer, uint32_t delta_log, uint32_t polynomial_size,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t level_bsk,
uint32_t base_log_bsk, uint32_t level_pksk, uint32_t base_log_pksk,
uint32_t level_cbs, uint32_t base_log_cbs, uint32_t number_of_inputs,
uint32_t max_shared_memory);
void cleanup_cuda_circuit_bootstrap(void *v_stream, uint32_t gpu_index,
int8_t **cbs_buffer);
}
#endif // CUDA_CIRCUIT_BOOTSTRAP_H


@@ -1,65 +0,0 @@
#ifndef DEVICE_H
#define DEVICE_H
#pragma once
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cuda_runtime.h>
extern "C" {
cudaStream_t *cuda_create_stream(uint32_t gpu_index);
int cuda_destroy_stream(cudaStream_t *stream, uint32_t gpu_index);
void *cuda_malloc(uint64_t size, uint32_t gpu_index);
void *cuda_malloc_async(uint64_t size, cudaStream_t *stream,
uint32_t gpu_index);
int cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index);
int cuda_check_support_cooperative_groups();
int cuda_memcpy_to_cpu(void *dest, const void *src, uint64_t size,
uint32_t gpu_index);
int cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
cudaStream_t *stream, uint32_t gpu_index);
int cuda_memcpy_to_gpu(void *dest, void *src, uint64_t size,
uint32_t gpu_index);
int cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
cudaStream_t *stream, uint32_t gpu_index);
int cuda_memset_async(void *dest, uint64_t val, uint64_t size,
cudaStream_t *stream, uint32_t gpu_index);
int cuda_get_number_of_gpus();
int cuda_synchronize_device(uint32_t gpu_index);
int cuda_drop(void *ptr, uint32_t gpu_index);
int cuda_drop_async(void *ptr, cudaStream_t *stream, uint32_t gpu_index);
int cuda_get_max_shared_memory(uint32_t gpu_index);
int cuda_synchronize_stream(void *v_stream);
#define check_cuda_error(ans) \
{ cuda_error((ans), __FILE__, __LINE__); }
inline void cuda_error(cudaError_t code, const char *file, int line,
bool abort = true) {
if (code != cudaSuccess) {
fprintf(stderr, "Cuda error: %s %s %d\n", cudaGetErrorString(code), file,
line);
if (abort)
exit(code);
}
}
}
#endif
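A short, hedged usage sketch of the asynchronous helpers and the `check_cuda_error` macro declared above; the function name and buffer size below are made up, and raw runtime calls such as `cudaStreamSynchronize` are wrapped with the macro, which prints the error string and exits on failure:
```
#include "device.h"

// Illustrative only: allocate a scratch buffer asynchronously, zero it,
// synchronize, then release it.
void roundtrip_example(uint32_t gpu_index) {
  cudaStream_t *stream = cuda_create_stream(gpu_index);
  uint64_t size = 4096; // arbitrary size for the sketch
  // Best-effort check that the device has enough free memory for this size.
  cuda_check_valid_malloc(size, gpu_index);
  void *buffer = cuda_malloc_async(size, stream, gpu_index);
  cuda_memset_async(buffer, 0, size, stream, gpu_index);
  // Raw CUDA runtime calls can be wrapped with check_cuda_error, which
  // prints the error string and exits on failure.
  check_cuda_error(cudaStreamSynchronize(*stream));
  cuda_drop_async(buffer, stream, gpu_index);
  cuda_destroy_stream(stream, gpu_index);
}
```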


@@ -1,100 +0,0 @@
#include "cuComplex.h"
#include "thrust/complex.h"
#include <iostream>
#include <string>
#include <type_traits>
#define PRINT_VARS
#ifdef PRINT_VARS
#define PRINT_DEBUG_5(var, begin, end, step, cond) \
_print_debug(var, #var, begin, end, step, cond, "", false)
#define PRINT_DEBUG_6(var, begin, end, step, cond, text) \
_print_debug(var, #var, begin, end, step, cond, text, true)
#define CAT(A, B) A##B
#define PRINT_SELECT(NAME, NUM) CAT(NAME##_, NUM)
#define GET_COUNT(_1, _2, _3, _4, _5, _6, COUNT, ...) COUNT
#define VA_SIZE(...) GET_COUNT(__VA_ARGS__, 6, 5, 4, 3, 2, 1)
#define PRINT_DEBUG(...) \
PRINT_SELECT(PRINT_DEBUG, VA_SIZE(__VA_ARGS__))(__VA_ARGS__)
#else
#define PRINT_DEBUG(...)
#endif
template <typename T>
__device__ typename std::enable_if<std::is_unsigned<T>::value, void>::type
_print_debug(T *var, const char *var_name, int start, int end, int step,
bool cond, const char *text, bool has_text) {
__syncthreads();
if (cond) {
if (has_text)
printf("%s\n", text);
for (int i = start; i < end; i += step) {
printf("%s[%u]: %u\n", var_name, i, var[i]);
}
}
__syncthreads();
}
template <typename T>
__device__ typename std::enable_if<std::is_signed<T>::value, void>::type
_print_debug(T *var, const char *var_name, int start, int end, int step,
bool cond, const char *text, bool has_text) {
__syncthreads();
if (cond) {
if (has_text)
printf("%s\n", text);
for (int i = start; i < end; i += step) {
printf("%s[%u]: %d\n", var_name, i, var[i]);
}
}
__syncthreads();
}
template <typename T>
__device__ typename std::enable_if<std::is_floating_point<T>::value, void>::type
_print_debug(T *var, const char *var_name, int start, int end, int step,
bool cond, const char *text, bool has_text) {
__syncthreads();
if (cond) {
if (has_text)
printf("%s\n", text);
for (int i = start; i < end; i += step) {
printf("%s[%u]: %.15f\n", var_name, i, var[i]);
}
}
__syncthreads();
}
template <typename T>
__device__
typename std::enable_if<std::is_same<T, thrust::complex<double>>::value,
void>::type
_print_debug(T *var, const char *var_name, int start, int end, int step,
bool cond, const char *text, bool has_text) {
__syncthreads();
if (cond) {
if (has_text)
printf("%s\n", text);
for (int i = start; i < end; i += step) {
printf("%s[%u]: %.15f , %.15f\n", var_name, i, var[i].real(),
var[i].imag());
}
}
__syncthreads();
}
template <typename T>
__device__
typename std::enable_if<std::is_same<T, cuDoubleComplex>::value, void>::type
_print_debug(T *var, const char *var_name, int start, int end, int step,
bool cond, const char *text, bool has_text) {
__syncthreads();
if (cond) {
if (has_text)
printf("%s\n", text);
for (int i = start; i < end; i += step) {
printf("%s[%u]: %.15f , %.15f\n", var_name, i, var[i].x, var[i].y);
}
}
__syncthreads();
}
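A brief, hypothetical usage sketch for the `PRINT_DEBUG` dispatch above (kernel and buffer names are invented): the 5-argument form prints `var[begin..end)` with the given step whenever the condition holds, and the 6-argument form prints a label first.
```
// Hypothetical kernel showing both PRINT_DEBUG forms (names are invented).
__global__ void debug_demo(unsigned int *mask, double *fft_re) {
  // 5-argument form: variable, begin, end, step, condition.
  PRINT_DEBUG(mask, 0, 8, 1, threadIdx.x == 0 && blockIdx.x == 0);
  // 6-argument form: same arguments plus a label printed before the values.
  PRINT_DEBUG(fft_re, 0, 4, 1, threadIdx.x == 0, "fft real part:");
}
```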


@@ -1,33 +0,0 @@
#ifndef CNCRT_KS_H_
#define CNCRT_KS_H_
#include <cstdint>
extern "C" {
void cuda_keyswitch_lwe_ciphertext_vector_32(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
uint32_t base_log, uint32_t level_count, uint32_t num_samples);
void cuda_keyswitch_lwe_ciphertext_vector_64(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
uint32_t base_log, uint32_t level_count, uint32_t num_samples);
void cuda_fp_keyswitch_lwe_to_glwe_32(
void *v_stream, uint32_t gpu_index, void *glwe_array_out,
void *lwe_array_in, void *fp_ksk_array, uint32_t input_lwe_dimension,
uint32_t output_glwe_dimension, uint32_t output_polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t number_of_input_lwe,
uint32_t number_of_keys);
void cuda_fp_keyswitch_lwe_to_glwe_64(
void *v_stream, uint32_t gpu_index, void *glwe_array_out,
void *lwe_array_in, void *fp_ksk_array, uint32_t input_lwe_dimension,
uint32_t output_glwe_dimension, uint32_t output_polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t number_of_input_lwe,
uint32_t number_of_keys);
}
#endif // CNCRT_KS_H_


@@ -1,89 +0,0 @@
#ifndef CUDA_LINALG_H_
#define CUDA_LINALG_H_
#include <cstdint>
extern "C" {
// Three types of pbs are available for integer multiplication
enum PBS_TYPE { MULTI_BIT = 0, LOW_LAT = 1, AMORTIZED = 2 };
void cuda_negate_lwe_ciphertext_vector_32(void *v_stream, uint32_t gpu_index,
void *lwe_array_out,
void *lwe_array_in,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
void cuda_negate_lwe_ciphertext_vector_64(void *v_stream, uint32_t gpu_index,
void *lwe_array_out,
void *lwe_array_in,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
void cuda_add_lwe_ciphertext_vector_32(void *v_stream, uint32_t gpu_index,
void *lwe_array_out,
void *lwe_array_in_1,
void *lwe_array_in_2,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
void cuda_add_lwe_ciphertext_vector_64(void *v_stream, uint32_t gpu_index,
void *lwe_array_out,
void *lwe_array_in_1,
void *lwe_array_in_2,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
void cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
void *plaintext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
void cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
void *plaintext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
void *cleartext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
void cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
void *cleartext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
void *v_stream, uint32_t gpu_index, void *mem_ptr, uint32_t message_modulus,
uint32_t carry_modulus, uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t pbs_base_log, uint32_t pbs_level,
uint32_t ks_base_log, uint32_t ks_level, uint32_t num_blocks,
PBS_TYPE pbs_type, uint32_t max_shared_memory, bool allocate_gpu_memory);
void cuda_integer_mult_radix_ciphertext_kb_64(
void *v_stream, uint32_t gpu_index, void *radix_lwe_out,
void *radix_lwe_left, void *radix_lwe_right, uint32_t *ct_degree_out,
uint32_t *ct_degree_left, uint32_t *ct_degree_right, void *bsk, void *ksk,
void *mem_ptr, uint32_t message_modulus, uint32_t carry_modulus,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t pbs_base_log, uint32_t pbs_level, uint32_t ks_base_log,
uint32_t ks_level, uint32_t num_blocks, PBS_TYPE pbs_type,
uint32_t max_shared_memory);
void scratch_cuda_integer_mult_radix_ciphertext_kb_64_multi_gpu(
void *mem_ptr, void *bsk, void *ksk, uint32_t message_modulus,
uint32_t carry_modulus, uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t pbs_base_log, uint32_t pbs_level,
uint32_t ks_base_log, uint32_t ks_level, uint32_t num_blocks,
PBS_TYPE pbs_type, uint32_t max_shared_memory, bool allocate_gpu_memory);
void cuda_integer_mult_radix_ciphertext_kb_64_multi_gpu(
void *radix_lwe_out, void *radix_lwe_left, void *radix_lwe_right,
uint32_t *ct_degree_out, uint32_t *ct_degree_left,
uint32_t *ct_degree_right, void *bsk, void *ksk, void *mem_ptr,
uint32_t message_modulus, uint32_t carry_modulus, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t pbs_base_log,
uint32_t pbs_level, uint32_t ks_base_log, uint32_t ks_level,
uint32_t num_blocks, PBS_TYPE pbs_type, uint32_t max_shared_memory);
}
#endif // CUDA_LINALG_H_


@@ -1,63 +0,0 @@
#ifndef VERTICAL_PACKING_H
#define VERTICAL_PACKING_H
#include <cstdint>
extern "C" {
void scratch_cuda_cmux_tree_32(void *v_stream, uint32_t gpu_index,
int8_t **cmux_tree_buffer,
uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t lut_vector_size, uint32_t tau,
uint32_t max_shared_memory,
bool allocate_gpu_memory);
void scratch_cuda_cmux_tree_64(void *v_stream, uint32_t gpu_index,
int8_t **cmux_tree_buffer,
uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t lut_vector_size, uint32_t tau,
uint32_t max_shared_memory,
bool allocate_gpu_memory);
void cuda_cmux_tree_32(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
void *ggsw_in, void *lut_vector,
int8_t *cmux_tree_buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t lut_vector_size,
uint32_t tau, uint32_t max_shared_memory);
void cuda_cmux_tree_64(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
void *ggsw_in, void *lut_vector,
int8_t *cmux_tree_buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t lut_vector_size,
uint32_t tau, uint32_t max_shared_memory);
void cleanup_cuda_cmux_tree(void *v_stream, uint32_t gpu_index,
int8_t **cmux_tree_buffer);
void scratch_cuda_blind_rotation_sample_extraction_32(
void *v_stream, uint32_t gpu_index, int8_t **br_se_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t mbr_size, uint32_t tau, uint32_t max_shared_memory,
bool allocate_gpu_memory);
void scratch_cuda_blind_rotation_sample_extraction_64(
void *v_stream, uint32_t gpu_index, int8_t **br_se_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t mbr_size, uint32_t tau, uint32_t max_shared_memory,
bool allocate_gpu_memory);
void cuda_blind_rotate_and_sample_extraction_64(
void *v_stream, uint32_t gpu_index, void *lwe_out, void *ggsw_in,
void *lut_vector, int8_t *br_se_buffer, uint32_t mbr_size, uint32_t tau,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t l_gadget, uint32_t max_shared_memory);
void cleanup_cuda_blind_rotation_sample_extraction(void *v_stream,
uint32_t gpu_index,
int8_t **br_se_buffer);
}
#endif // VERTICAL_PACKING_H


@@ -1,13 +0,0 @@
set(SOURCES
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/boolean_gates.h ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/keyswitch.h ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/linear_algebra.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/vertical_packing.h ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/circuit_bootstrap.h)
file(GLOB SOURCES "*.cu" "*.h" "fft/*.cu")
add_library(concrete_cuda STATIC ${SOURCES})
set_target_properties(
concrete_cuda
PROPERTIES CUDA_SEPARABLE_COMPILATION ON
CUDA_RESOLVE_DEVICE_SYMBOLS ON
CUDA_ARCHITECTURES native)
target_link_libraries(concrete_cuda PUBLIC cudart)
target_include_directories(concrete_cuda PRIVATE .)


@@ -1,111 +0,0 @@
#include "addition.cuh"
/*
* Perform the addition of two u32 input LWE ciphertext vectors.
* See the equivalent operation on u64 ciphertexts for more details.
*/
void cuda_add_lwe_ciphertext_vector_32(void *v_stream, uint32_t gpu_index,
void *lwe_array_out,
void *lwe_array_in_1,
void *lwe_array_in_2,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
host_addition(v_stream, gpu_index, static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_array_in_1),
static_cast<uint32_t *>(lwe_array_in_2), input_lwe_dimension,
input_lwe_ciphertext_count);
}
/*
* Perform the addition of two u64 input LWE ciphertext vectors.
* - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
* launch
* - `gpu_index` is the index of the GPU to be used in the kernel launch
* - `lwe_array_out` is an array of size
* `(input_lwe_dimension + 1) * input_lwe_ciphertext_count` that should have
* been allocated on the GPU before calling this function, and that will hold
* the result of the computation.
* - `lwe_array_in_1` is the first LWE ciphertext vector used as input, it
* should have been allocated and initialized before calling this function. It
* has the same size as the output array.
* - `lwe_array_in_2` is the second LWE ciphertext vector used as input, it
* should have been allocated and initialized before calling this function. It
* has the same size as the output array.
* - `input_lwe_dimension` is the number of mask elements in the two input and
* in the output ciphertext vectors
* - `input_lwe_ciphertext_count` is the number of ciphertexts contained in each
* input LWE ciphertext vector, as well as in the output.
*
* Each element (mask element or body) of the input LWE ciphertext vector 1 is
* added to the corresponding element in the input LWE ciphertext 2. The result
* is stored in the output LWE ciphertext vector. The two input LWE ciphertext
* vectors are left unchanged. This function is a wrapper to a device function
* that performs the operation on the GPU.
*/
void cuda_add_lwe_ciphertext_vector_64(void *v_stream, uint32_t gpu_index,
void *lwe_array_out,
void *lwe_array_in_1,
void *lwe_array_in_2,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
host_addition(v_stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in_1),
static_cast<uint64_t *>(lwe_array_in_2), input_lwe_dimension,
input_lwe_ciphertext_count);
}
/*
* Perform the addition of a u32 input LWE ciphertext vector with a u32
* plaintext vector. See the equivalent operation on u64 data for more details.
*/
void cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
void *plaintext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
host_addition_plaintext(v_stream, gpu_index,
static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_array_in),
static_cast<uint32_t *>(plaintext_array_in),
input_lwe_dimension, input_lwe_ciphertext_count);
}
/*
* Perform the addition of a u64 input LWE ciphertext vector with a u64 input
* plaintext vector.
* - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
* launch
* - `gpu_index` is the index of the GPU to be used in the kernel launch
* - `lwe_array_out` is an array of size
* `(input_lwe_dimension + 1) * input_lwe_ciphertext_count` that should have
* been allocated on the GPU before calling this function, and that will hold
* the result of the computation.
* - `lwe_array_in` is the LWE ciphertext vector used as input, it should have
* been allocated and initialized before calling this function. It has the same
* size as the output array.
* - `plaintext_array_in` is the plaintext vector used as input, it should have
* been allocated and initialized before calling this function. It should be of
* size `input_lwe_ciphertext_count`.
* - `input_lwe_dimension` is the number of mask elements in the input and
* output LWE ciphertext vectors
* - `input_lwe_ciphertext_count` is the number of ciphertexts contained in the
* input LWE ciphertext vector, as well as in the output. It is also the number
* of plaintexts in the input plaintext vector.
*
* Each plaintext of the input plaintext vector is added to the body of the
* corresponding LWE ciphertext in the LWE ciphertext vector. The result of the
* operation is stored in the output LWE ciphertext vector. The two input
* vectors are unchanged. This function is a wrapper to a device function that
* performs the operation on the GPU.
*/
void cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
void *plaintext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
host_addition_plaintext(v_stream, gpu_index,
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(plaintext_array_in),
input_lwe_dimension, input_lwe_ciphertext_count);
}
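To make the calling convention documented above concrete, here is a hedged sketch of a single 64-bit ciphertext-vector addition; the wrapper name and the assumption that all buffers already live on the GPU are illustrative, and the declarations come from `device.h` and `linear_algebra.h` elsewhere in this diff:
```
#include "device.h"
#include "linear_algebra.h"

// Illustrative wrapper: add two vectors of `count` LWE ciphertexts of
// dimension `lwe_dim`; all three buffers are assumed to live on the GPU.
void add_example(void *d_out, void *d_in_1, void *d_in_2, uint32_t lwe_dim,
                 uint32_t count, uint32_t gpu_index) {
  cudaStream_t *stream = cuda_create_stream(gpu_index);
  // Each ciphertext stores lwe_dim mask elements plus one body element.
  cuda_add_lwe_ciphertext_vector_64(stream, gpu_index, d_out, d_in_1, d_in_2,
                                    lwe_dim, count);
  cuda_synchronize_stream(stream);
  cuda_destroy_stream(stream, gpu_index);
}
```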


@@ -1,87 +0,0 @@
#ifndef CUDA_ADD_H
#define CUDA_ADD_H
#ifdef __CDT_PARSER__
#undef __CUDA_RUNTIME_H__
#include <cuda_runtime.h>
#endif
#include "device.h"
#include "linear_algebra.h"
#include "utils/kernel_dimensions.cuh"
#include <stdio.h>
template <typename T>
__global__ void addition(T *output, T *input_1, T *input_2,
uint32_t num_entries) {
int tid = threadIdx.x;
int index = blockIdx.x * blockDim.x + tid;
if (index < num_entries) {
// Here we take advantage of the wrapping behaviour of uint
output[index] = input_1[index] + input_2[index];
}
}
template <typename T>
__global__ void plaintext_addition(T *output, T *lwe_input, T *plaintext_input,
uint32_t input_lwe_dimension,
uint32_t num_entries) {
int tid = threadIdx.x;
int plaintext_index = blockIdx.x * blockDim.x + tid;
if (plaintext_index < num_entries) {
int index =
plaintext_index * (input_lwe_dimension + 1) + input_lwe_dimension;
// Here we take advantage of the wrapping behaviour of uint
output[index] = lwe_input[index] + plaintext_input[plaintext_index];
}
}
template <typename T>
__host__ void host_addition(void *v_stream, uint32_t gpu_index, T *output,
T *input_1, T *input_2,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
cudaSetDevice(gpu_index);
// lwe_size includes the presence of the body
// whereas lwe_dimension is the number of elements in the mask
int lwe_size = input_lwe_dimension + 1;
// Create a 1-dimensional grid of threads
int num_blocks = 0, num_threads = 0;
int num_entries = input_lwe_ciphertext_count * lwe_size;
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
dim3 grid(num_blocks, 1, 1);
dim3 thds(num_threads, 1, 1);
auto stream = static_cast<cudaStream_t *>(v_stream);
addition<<<grid, thds, 0, *stream>>>(output, input_1, input_2, num_entries);
check_cuda_error(cudaGetLastError());
}
template <typename T>
__host__ void host_addition_plaintext(void *v_stream, uint32_t gpu_index,
T *output, T *lwe_input,
T *plaintext_input,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
cudaSetDevice(gpu_index);
int num_blocks = 0, num_threads = 0;
int num_entries = input_lwe_ciphertext_count;
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
dim3 grid(num_blocks, 1, 1);
dim3 thds(num_threads, 1, 1);
auto stream = static_cast<cudaStream_t *>(v_stream);
check_cuda_error(cudaMemcpyAsync(output, lwe_input,
(input_lwe_dimension + 1) *
input_lwe_ciphertext_count * sizeof(T),
cudaMemcpyDeviceToDevice, *stream));
plaintext_addition<<<grid, thds, 0, *stream>>>(
output, lwe_input, plaintext_input, input_lwe_dimension, num_entries);
check_cuda_error(cudaGetLastError());
}
#endif // CUDA_ADD_H


@@ -1,356 +0,0 @@
#include "bit_extraction.cuh"
/*
* Runs standard checks to validate the inputs
*/
void checks_fast_extract_bits(int glwe_dimension, int polynomial_size,
int level_count_bsk, int crt_decomposition_size) {
assert(("Error (GPU extract bits): polynomial_size should be one of "
"256, 512, 1024, 2048, 4096, 8192",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
// The number of samples should be lower than the number of streaming
// multiprocessors divided by (4 * (k + 1) * l), the factor 4 being related
// to the 50% occupancy.
int number_of_sm = 0;
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
assert(("Error (GPU extract bits): the number of input LWEs must be lower or "
"equal to the number of streaming multiprocessors on the device "
"divided by 4 * (k + 1) "
"level_count_bsk",
crt_decomposition_size <=
number_of_sm / 4. / (glwe_dimension + 1) / level_count_bsk));
}
/*
* Runs standard checks to validate the inputs
*/
void checks_extract_bits(int nbits, int glwe_dimension, int polynomial_size,
int base_log_bsk, int level_count_bsk,
int crt_decomposition_size) {
assert(("Error (GPU extract bits): base log should be <= nbits",
base_log_bsk <= nbits));
checks_fast_extract_bits(glwe_dimension, polynomial_size, level_count_bsk,
crt_decomposition_size);
}
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the bit extraction on 32-bit inputs, into `bit_extract_buffer`. It also
* configures SM options on the GPU in case FULLSM mode is going to be used.
*/
void scratch_cuda_extract_bits_32(
void *v_stream, uint32_t gpu_index, int8_t **bit_extract_buffer,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t crt_decomposition_size,
uint32_t max_shared_memory, bool allocate_gpu_memory) {
checks_fast_extract_bits(glwe_dimension, polynomial_size, level_count,
crt_decomposition_size);
switch (polynomial_size) {
case 256:
scratch_extract_bits<uint32_t, int32_t, Degree<256>>(
v_stream, gpu_index, bit_extract_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_count, crt_decomposition_size, max_shared_memory,
allocate_gpu_memory);
break;
case 512:
scratch_extract_bits<uint32_t, int32_t, Degree<512>>(
v_stream, gpu_index, bit_extract_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_count, crt_decomposition_size, max_shared_memory,
allocate_gpu_memory);
break;
case 1024:
scratch_extract_bits<uint32_t, int32_t, Degree<1024>>(
v_stream, gpu_index, bit_extract_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_count, crt_decomposition_size, max_shared_memory,
allocate_gpu_memory);
break;
case 2048:
scratch_extract_bits<uint32_t, int32_t, Degree<2048>>(
v_stream, gpu_index, bit_extract_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_count, crt_decomposition_size, max_shared_memory,
allocate_gpu_memory);
break;
case 4096:
scratch_extract_bits<uint32_t, int32_t, Degree<4096>>(
v_stream, gpu_index, bit_extract_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_count, crt_decomposition_size, max_shared_memory,
allocate_gpu_memory);
break;
case 8192:
scratch_extract_bits<uint32_t, int32_t, Degree<8192>>(
v_stream, gpu_index, bit_extract_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_count, crt_decomposition_size, max_shared_memory,
allocate_gpu_memory);
break;
default:
break;
}
}
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the bit extraction on 64-bit inputs, into `bit_extract_buffer`. It also
* configures SM options on the GPU in case FULLSM mode is going to be used.
*/
void scratch_cuda_extract_bits_64(
void *v_stream, uint32_t gpu_index, int8_t **bit_extract_buffer,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t crt_decomposition_size,
uint32_t max_shared_memory, bool allocate_gpu_memory) {
checks_fast_extract_bits(glwe_dimension, polynomial_size, level_count,
crt_decomposition_size);
switch (polynomial_size) {
case 256:
scratch_extract_bits<uint64_t, int64_t, Degree<256>>(
v_stream, gpu_index, bit_extract_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_count, crt_decomposition_size, max_shared_memory,
allocate_gpu_memory);
break;
case 512:
scratch_extract_bits<uint64_t, int64_t, Degree<512>>(
v_stream, gpu_index, bit_extract_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_count, crt_decomposition_size, max_shared_memory,
allocate_gpu_memory);
break;
case 1024:
scratch_extract_bits<uint64_t, int64_t, Degree<1024>>(
v_stream, gpu_index, bit_extract_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_count, crt_decomposition_size, max_shared_memory,
allocate_gpu_memory);
break;
case 2048:
scratch_extract_bits<uint64_t, int64_t, Degree<2048>>(
v_stream, gpu_index, bit_extract_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_count, crt_decomposition_size, max_shared_memory,
allocate_gpu_memory);
break;
case 4096:
scratch_extract_bits<uint64_t, int64_t, Degree<4096>>(
v_stream, gpu_index, bit_extract_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_count, crt_decomposition_size, max_shared_memory,
allocate_gpu_memory);
break;
case 8192:
scratch_extract_bits<uint64_t, int64_t, Degree<8192>>(
v_stream, gpu_index, bit_extract_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_count, crt_decomposition_size, max_shared_memory,
allocate_gpu_memory);
break;
default:
break;
}
}
/* Perform bit extraction on a batch of 32-bit LWE ciphertexts.
* See the corresponding function on 64-bit LWE ciphertexts for more details.
*/
void cuda_extract_bits_32(void *v_stream, uint32_t gpu_index,
void *list_lwe_array_out, void *lwe_array_in,
int8_t *bit_extract_buffer, void *ksk,
void *fourier_bsk, uint32_t *number_of_bits_array,
uint32_t *delta_log_array, uint32_t lwe_dimension_in,
uint32_t lwe_dimension_out, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log_bsk,
uint32_t level_count_bsk, uint32_t base_log_ksk,
uint32_t level_count_ksk,
uint32_t crt_decomposition_size,
uint32_t max_shared_memory) {
checks_extract_bits(32, glwe_dimension, polynomial_size, base_log_bsk,
level_count_bsk, crt_decomposition_size);
switch (polynomial_size) {
case 256:
host_extract_bits<uint32_t, Degree<256>>(
v_stream, gpu_index, (uint32_t *)list_lwe_array_out,
(uint32_t *)lwe_array_in, bit_extract_buffer, (uint32_t *)ksk,
(double2 *)fourier_bsk, number_of_bits_array, delta_log_array,
lwe_dimension_in, lwe_dimension_out, glwe_dimension, polynomial_size,
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
crt_decomposition_size, max_shared_memory);
break;
case 512:
host_extract_bits<uint32_t, Degree<512>>(
v_stream, gpu_index, (uint32_t *)list_lwe_array_out,
(uint32_t *)lwe_array_in, bit_extract_buffer, (uint32_t *)ksk,
(double2 *)fourier_bsk, number_of_bits_array, delta_log_array,
lwe_dimension_in, lwe_dimension_out, glwe_dimension, polynomial_size,
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
crt_decomposition_size, max_shared_memory);
break;
case 1024:
host_extract_bits<uint32_t, Degree<1024>>(
v_stream, gpu_index, (uint32_t *)list_lwe_array_out,
(uint32_t *)lwe_array_in, bit_extract_buffer, (uint32_t *)ksk,
(double2 *)fourier_bsk, number_of_bits_array, delta_log_array,
lwe_dimension_in, lwe_dimension_out, glwe_dimension, polynomial_size,
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
crt_decomposition_size, max_shared_memory);
break;
case 2048:
host_extract_bits<uint32_t, Degree<2048>>(
v_stream, gpu_index, (uint32_t *)list_lwe_array_out,
(uint32_t *)lwe_array_in, bit_extract_buffer, (uint32_t *)ksk,
(double2 *)fourier_bsk, number_of_bits_array, delta_log_array,
lwe_dimension_in, lwe_dimension_out, glwe_dimension, polynomial_size,
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
crt_decomposition_size, max_shared_memory);
break;
case 4096:
host_extract_bits<uint32_t, Degree<4096>>(
v_stream, gpu_index, (uint32_t *)list_lwe_array_out,
(uint32_t *)lwe_array_in, bit_extract_buffer, (uint32_t *)ksk,
(double2 *)fourier_bsk, number_of_bits_array, delta_log_array,
lwe_dimension_in, lwe_dimension_out, glwe_dimension, polynomial_size,
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
crt_decomposition_size, max_shared_memory);
break;
case 8192:
host_extract_bits<uint32_t, Degree<8192>>(
v_stream, gpu_index, (uint32_t *)list_lwe_array_out,
(uint32_t *)lwe_array_in, bit_extract_buffer, (uint32_t *)ksk,
(double2 *)fourier_bsk, number_of_bits_array, delta_log_array,
lwe_dimension_in, lwe_dimension_out, glwe_dimension, polynomial_size,
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
crt_decomposition_size, max_shared_memory);
break;
default:
break;
}
}
/* Perform bit extraction on a batch of 64-bit LWE ciphertexts.
* - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
* launch
* - `gpu_index` is the index of the GPU to be used in the kernel launch
* - 'number_of_bits' bits are extracted from each ciphertext, starting at bit
* number 'delta_log' (0-indexed, inclusive).
* Output bits are ordered from MSB to LSB. Every extracted bit is represented
* as an LWE ciphertext, containing the encryption of the bit scaled by q/2.
* - 'list_lwe_array_out' output batch of LWE ciphertexts, one per extracted bit
* of every input ciphertext
* - 'lwe_array_in' batch of input LWE ciphertexts, of size
* ('lwe_dimension_in' + 1) * crt_decomposition_size * sizeof(u64)
* The following 5 parameters are not actual inputs of the function but working
* buffers used during the computation; they are allocated once and can be
* reused across different calls of the extract_bit function.
* - 'lwe_array_in_buffer' same size as 'lwe_array_in'
* - 'lwe_array_in_shifted_buffer' same size as 'lwe_array_in'
* - 'lwe_array_out_ks_buffer' with size:
* ('lwe_dimension_out' + 1) * crt_decomposition_size * sizeof(u64)
* - 'lwe_array_out_pbs_buffer' same size as 'lwe_array_in'
* - 'lut_pbs' with size:
* (glwe_dimension + 1) * (lwe_dimension_in + 1) * sizeof(u64)
* The other inputs are:
* - 'lut_vector_indexes' stores the index corresponding to which test
* vector to use
* - 'ksk' keyswitch key
* - 'fourier_bsk' complex compressed bsk in fourier domain
* - 'lwe_dimension_in' input LWE ciphertext dimension, supported input
* dimensions are: {256, 512, 1024, 2048, 4096, 8192}
* - 'lwe_dimension_out' output LWE ciphertext dimension
* - 'glwe_dimension' GLWE dimension, only glwe_dimension = 1 is supported
* for now
* - 'base_log_bsk' base_log for bootstrapping
* - 'level_count_bsk' decomposition level count for bootstrapping
* - 'base_log_ksk' base_log for keyswitch
* - 'level_count_ksk' decomposition level for keyswitch
* - 'crt_decomposition_size' number of input LWE ciphertexts
* - 'max_shared_memory' maximum amount of shared memory to be used inside
* device functions
*
* This function will call corresponding template of wrapper host function which
* will manage the calls of device functions.
*/
void cuda_extract_bits_64(void *v_stream, uint32_t gpu_index,
void *list_lwe_array_out, void *lwe_array_in,
int8_t *bit_extract_buffer, void *ksk,
void *fourier_bsk, uint32_t *number_of_bits_array,
uint32_t *delta_log_array, uint32_t lwe_dimension_in,
uint32_t lwe_dimension_out, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log_bsk,
uint32_t level_count_bsk, uint32_t base_log_ksk,
uint32_t level_count_ksk,
uint32_t crt_decomposition_size,
uint32_t max_shared_memory) {
checks_extract_bits(64, glwe_dimension, polynomial_size, base_log_bsk,
level_count_bsk, crt_decomposition_size);
switch (polynomial_size) {
case 256:
host_extract_bits<uint64_t, Degree<256>>(
v_stream, gpu_index, (uint64_t *)list_lwe_array_out,
(uint64_t *)lwe_array_in, bit_extract_buffer, (uint64_t *)ksk,
(double2 *)fourier_bsk, number_of_bits_array, delta_log_array,
lwe_dimension_in, lwe_dimension_out, glwe_dimension, polynomial_size,
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
crt_decomposition_size, max_shared_memory);
break;
case 512:
host_extract_bits<uint64_t, Degree<512>>(
v_stream, gpu_index, (uint64_t *)list_lwe_array_out,
(uint64_t *)lwe_array_in, bit_extract_buffer, (uint64_t *)ksk,
(double2 *)fourier_bsk, number_of_bits_array, delta_log_array,
lwe_dimension_in, lwe_dimension_out, glwe_dimension, polynomial_size,
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
crt_decomposition_size, max_shared_memory);
break;
case 1024:
host_extract_bits<uint64_t, Degree<1024>>(
v_stream, gpu_index, (uint64_t *)list_lwe_array_out,
(uint64_t *)lwe_array_in, bit_extract_buffer, (uint64_t *)ksk,
(double2 *)fourier_bsk, number_of_bits_array, delta_log_array,
lwe_dimension_in, lwe_dimension_out, glwe_dimension, polynomial_size,
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
crt_decomposition_size, max_shared_memory);
break;
case 2048:
host_extract_bits<uint64_t, Degree<2048>>(
v_stream, gpu_index, (uint64_t *)list_lwe_array_out,
(uint64_t *)lwe_array_in, bit_extract_buffer, (uint64_t *)ksk,
(double2 *)fourier_bsk, number_of_bits_array, delta_log_array,
lwe_dimension_in, lwe_dimension_out, glwe_dimension, polynomial_size,
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
crt_decomposition_size, max_shared_memory);
break;
case 4096:
host_extract_bits<uint64_t, Degree<4096>>(
v_stream, gpu_index, (uint64_t *)list_lwe_array_out,
(uint64_t *)lwe_array_in, bit_extract_buffer, (uint64_t *)ksk,
(double2 *)fourier_bsk, number_of_bits_array, delta_log_array,
lwe_dimension_in, lwe_dimension_out, glwe_dimension, polynomial_size,
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
crt_decomposition_size, max_shared_memory);
break;
case 8192:
host_extract_bits<uint64_t, Degree<8192>>(
v_stream, gpu_index, (uint64_t *)list_lwe_array_out,
(uint64_t *)lwe_array_in, bit_extract_buffer, (uint64_t *)ksk,
(double2 *)fourier_bsk, number_of_bits_array, delta_log_array,
lwe_dimension_in, lwe_dimension_out, glwe_dimension, polynomial_size,
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
crt_decomposition_size, max_shared_memory);
break;
default:
break;
}
}
/*
* This cleanup function frees the GPU memory used for bit extraction, held in
* 'bit_extract_buffer', for 32-bit or 64-bit inputs.
*/
void cleanup_cuda_extract_bits(void *v_stream, uint32_t gpu_index,
int8_t **bit_extract_buffer) {
auto stream = static_cast<cudaStream_t *>(v_stream);
// Free memory
cuda_drop_async(*bit_extract_buffer, stream, gpu_index);
}
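/*
 * Illustrative host-side call sequence (a documentation sketch, not part of
 * the original sources). Device pointers d_out, d_in, d_ksk and d_fourier_bsk
 * are assumed to be already allocated and populated, bit_extract_buffer is
 * assumed to have been allocated by the corresponding scratch function
 * (scratch_extract_bits instantiated on uint64_t), and the parameter values
 * below are placeholders.
 */
static void example_extract_bits_usage(uint32_t gpu_index, void *d_out,
                                       void *d_in, int8_t *bit_extract_buffer,
                                       void *d_ksk, void *d_fourier_bsk,
                                       uint32_t max_shared_memory) {
  // Hypothetical parameter set, for illustration only.
  uint32_t glwe_dimension = 1, polynomial_size = 1024;
  uint32_t lwe_dimension_in = glwe_dimension * polynomial_size;
  uint32_t lwe_dimension_out = 600;
  uint32_t base_log_bsk = 10, level_count_bsk = 2;
  uint32_t base_log_ksk = 4, level_count_ksk = 3;
  uint32_t crt_decomposition_size = 2;
  uint32_t number_of_bits_array[] = {4, 4}; // bits to extract per input
  uint32_t delta_log_array[] = {60, 60};    // position of the message MSB

  cudaStream_t *stream = cuda_create_stream(gpu_index);
  // Extract 4 bits from each of the 2 input ciphertexts; the results land in
  // d_out as number_of_bits LWE ciphertexts per input, ordered MSB first.
  cuda_extract_bits_64(stream, gpu_index, d_out, d_in, bit_extract_buffer,
                       d_ksk, d_fourier_bsk, number_of_bits_array,
                       delta_log_array, lwe_dimension_in, lwe_dimension_out,
                       glwe_dimension, polynomial_size, base_log_bsk,
                       level_count_bsk, base_log_ksk, level_count_ksk,
                       crt_decomposition_size, max_shared_memory);
  // Release the temporary buffer once it is no longer needed.
  cleanup_cuda_extract_bits(stream, gpu_index, &bit_extract_buffer);
  cudaStreamSynchronize(*stream);
  cuda_destroy_stream(stream, gpu_index);
}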


@@ -1,347 +0,0 @@
#ifndef BIT_EXTRACT_CUH
#define BIT_EXTRACT_CUH
#include "bit_extraction.h"
#include "bootstrap_fast_low_latency.cuh"
#include "device.h"
#include "keyswitch.cuh"
#include "polynomial/parameters.cuh"
#include "utils/timer.cuh"
/*
* Copies a batch of LWE ciphertexts into a destination buffer, multiplying
* every coefficient by 'value'; works for ciphertexts with sizes supported by
* params::degree
*
* Each x-block handles a params::degree-chunk of src
*/
template <typename Torus, class params>
__global__ void copy_and_shift_lwe(Torus *dst_shift, Torus *src, Torus value,
uint32_t glwe_dimension) {
int tid = threadIdx.x;
auto cur_dst_shift = &dst_shift[blockIdx.x * params::degree];
auto cur_src = &src[blockIdx.x * params::degree];
#pragma unroll
for (int i = 0; i < params::opt; i++) {
cur_dst_shift[tid] = cur_src[tid] * value;
tid += params::degree / params::opt;
}
if (threadIdx.x == 0 && blockIdx.x == 0) {
cur_dst_shift[glwe_dimension * params::degree] =
cur_src[glwe_dimension * params::degree] * value;
}
}
/*
* Copies a batch of LWE ciphertexts to another buffer when the size is not supported by
* params::degree
*/
template <typename Torus>
__global__ void copy_small_lwe(Torus *dst, Torus *src, uint32_t small_lwe_size,
uint32_t number_of_bits, uint32_t lwe_id) {
size_t blockId = blockIdx.x;
size_t threads_per_block = blockDim.x;
size_t opt = small_lwe_size / threads_per_block;
size_t rem = small_lwe_size & (threads_per_block - 1);
auto cur_lwe_list = &dst[blockId * small_lwe_size * number_of_bits];
auto cur_dst = &cur_lwe_list[lwe_id * small_lwe_size];
auto cur_src = &src[blockId * small_lwe_size];
size_t tid = threadIdx.x;
for (int i = 0; i < opt; i++) {
cur_dst[tid] = cur_src[tid];
tid += threads_per_block;
}
if (threadIdx.x < rem)
cur_dst[tid] = cur_src[tid];
}
/*
* Wrapping-adds 'value' to the body of each ciphertext;
* should be called with blockDim.x = 1;
* blockIdx.x refers to the id of the ciphertext.
* NOTE: check whether putting this functionality in copy_small_lwe or
* fill_lut_body_for_current_bit is faster
*/
template <typename Torus>
__global__ void add_to_body(Torus *lwe, size_t lwe_dimension, Torus value) {
lwe[blockIdx.x * (lwe_dimension + 1) + lwe_dimension] += value;
}
/*
* Add alpha, where alpha = delta*2^{bit_idx-1}, to end up with an encryption
* of 0 if the extracted bit was 0, and of 1 otherwise.
* Remove the extracted bit from the state LWE to get a 0 at the extracted bit
* location.
* Shift onto the padding bit for the next iteration, which is why the
* multiplier 1ll << (ciphertext_n_bits - delta_log - bit_idx - 2) is used
* instead of 1ll << (ciphertext_n_bits - delta_log - bit_idx - 1)
*/
template <typename Torus, class params>
__global__ void add_sub_and_mul_lwe(Torus *shifted_lwe, Torus *state_lwe,
Torus *pbs_lwe_array_out, Torus add_value,
Torus mul_value, uint32_t glwe_dimension) {
size_t tid = threadIdx.x;
size_t blockId = blockIdx.x;
auto cur_shifted_lwe =
&shifted_lwe[blockId * (glwe_dimension * params::degree + 1)];
auto cur_state_lwe =
&state_lwe[blockId * (glwe_dimension * params::degree + 1)];
auto cur_pbs_lwe_array_out =
&pbs_lwe_array_out[blockId * (glwe_dimension * params::degree + 1)];
#pragma unroll
for (int i = 0; i < glwe_dimension * params::opt; i++) {
cur_shifted_lwe[tid] = cur_state_lwe[tid] -= cur_pbs_lwe_array_out[tid];
cur_shifted_lwe[tid] *= mul_value;
tid += params::degree / params::opt;
}
if (threadIdx.x == 0) {
cur_shifted_lwe[glwe_dimension * params::degree] =
cur_state_lwe[glwe_dimension * params::degree] -=
(cur_pbs_lwe_array_out[glwe_dimension * params::degree] + add_value);
cur_shifted_lwe[glwe_dimension * params::degree] *= mul_value;
}
}
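/*
 * Worked example (illustrative values added for documentation): for a 64-bit
 * Torus with delta_log = 60 and bit_idx = 0, the host passes
 *   add_value = 1ll << (delta_log - 1 + bit_idx)                     = 1 << 59
 *   mul_value = 1ll << (ciphertext_n_bits - delta_log - bit_idx - 2) = 1 << 2
 * so the bit that was just extracted is removed from the state LWE and the
 * next bit is moved up to the padding-bit position for the next iteration.
 */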
/*
* Fill the lut (body only) for the current bit; this is equivalent to a
* trivial encryption, as the mask is all 0s.
* blockIdx.x refers to the id of the lut vector
*/
template <typename Torus, class params>
__global__ void fill_lut_body_for_current_bit(Torus *lut, Torus value,
uint32_t glwe_dimension) {
Torus *cur_poly = &lut[(blockIdx.x * (glwe_dimension + 1) + glwe_dimension) *
params::degree];
size_t tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt; i++) {
cur_poly[tid] = value;
tid += params::degree / params::opt;
}
}
template <typename Torus>
__host__ __device__ uint64_t get_buffer_size_extract_bits(
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t crt_decomposition_size) {
uint64_t buffer_size =
sizeof(Torus) // lut_vector_indexes
+ ((glwe_dimension + 1) * polynomial_size) * sizeof(Torus) // lut_pbs
+ (glwe_dimension * polynomial_size + 1) *
sizeof(Torus) // lwe_array_in_buffer
+ (glwe_dimension * polynomial_size + 1) *
sizeof(Torus) // lwe_array_in_shifted_buffer
+ (lwe_dimension + 1) * sizeof(Torus) // lwe_array_out_ks_buffer
+ (glwe_dimension * polynomial_size + 1) *
sizeof(Torus); // lwe_array_out_pbs_buffer
buffer_size =
(buffer_size + buffer_size % sizeof(double2)) * crt_decomposition_size;
return buffer_size;
}
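/*
 * Worked example (illustrative parameters added for documentation): with
 * Torus = uint64_t (8 bytes), glwe_dimension = 1, polynomial_size = 1024,
 * lwe_dimension = 600 and crt_decomposition_size = 1:
 *   lut_vector_indexes                                       8
 *   lut_pbs                      (1 + 1) * 1024 * 8    = 16384
 *   lwe_array_in_buffer          (1 * 1024 + 1) * 8    =  8200
 *   lwe_array_in_shifted_buffer  (1 * 1024 + 1) * 8    =  8200
 *   lwe_array_out_ks_buffer      (600 + 1) * 8         =  4808
 *   lwe_array_out_pbs_buffer     (1 * 1024 + 1) * 8    =  8200
 * for a total of 45800 bytes, padded by 45800 % sizeof(double2) = 8 to
 * 45808 bytes per input ciphertext.
 */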
template <typename Torus, typename STorus, typename params>
__host__ void
scratch_extract_bits(void *v_stream, uint32_t gpu_index,
int8_t **bit_extract_buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t crt_decomposition_size,
uint32_t max_shared_memory, bool allocate_gpu_memory) {
cudaSetDevice(gpu_index);
auto stream = static_cast<cudaStream_t *>(v_stream);
uint64_t buffer_size = get_buffer_size_extract_bits<Torus>(
glwe_dimension, lwe_dimension, polynomial_size,
crt_decomposition_size) +
get_buffer_size_bootstrap_fast_low_latency<Torus>(
glwe_dimension, polynomial_size, level_count,
crt_decomposition_size, max_shared_memory);
// allocate and initialize device pointers for bit extraction
if (allocate_gpu_memory) {
*bit_extract_buffer =
(int8_t *)cuda_malloc_async(buffer_size, stream, gpu_index);
}
  // lut_vector_indexes is the last buffer in the bit_extract_buffer.
  // It is hard-set to 0: only one LUT is given as input and it is the same
  // for all LWE inputs. For simplicity we initialize the whole buffer to 0.
check_cuda_error(
cudaMemsetAsync(*bit_extract_buffer, 0, buffer_size, *stream));
scratch_bootstrap_fast_low_latency<Torus, STorus, params>(
v_stream, gpu_index, bit_extract_buffer, glwe_dimension, polynomial_size,
level_count, crt_decomposition_size, max_shared_memory, false);
}
/*
* Host function for CUDA bit extraction on a single ciphertext.
* It executes the device functions in a specific order and manages
* parallelism.
*/
template <typename Torus, class params>
__host__ void host_single_ciphertext_extract_bits(
void *v_stream, uint32_t gpu_index, Torus *list_lwe_array_out,
Torus *lwe_array_in, int8_t *bit_extract_buffer, Torus *ksk,
double2 *fourier_bsk, uint32_t number_of_bits, uint32_t delta_log,
uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log_bsk,
uint32_t level_count_bsk, uint32_t base_log_ksk, uint32_t level_count_ksk,
uint32_t max_shared_memory) {
cudaSetDevice(gpu_index);
auto stream = static_cast<cudaStream_t *>(v_stream);
uint32_t ciphertext_n_bits = sizeof(Torus) * 8;
int threads = params::degree / params::opt;
// Always define the PBS buffer first, because it has the strongest memory
// alignment requirement (16 bytes for double2)
int8_t *pbs_buffer = (int8_t *)bit_extract_buffer;
Torus *lut_pbs =
(Torus *)pbs_buffer +
(ptrdiff_t)(get_buffer_size_bootstrap_fast_low_latency<Torus>(
glwe_dimension, polynomial_size, level_count_bsk, 1,
max_shared_memory) /
sizeof(Torus));
Torus *lwe_array_in_buffer =
(Torus *)lut_pbs + (ptrdiff_t)((glwe_dimension + 1) * polynomial_size);
Torus *lwe_array_in_shifted_buffer =
(Torus *)lwe_array_in_buffer +
(ptrdiff_t)(glwe_dimension * polynomial_size + 1);
Torus *lwe_array_out_ks_buffer =
(Torus *)lwe_array_in_shifted_buffer +
(ptrdiff_t)(glwe_dimension * polynomial_size + 1);
Torus *lwe_array_out_pbs_buffer =
(Torus *)lwe_array_out_ks_buffer + (ptrdiff_t)(lwe_dimension_out + 1);
// lut_vector_indexes is the last array in the bit_extract buffer
Torus *lut_vector_indexes =
(Torus *)lwe_array_out_pbs_buffer +
(ptrdiff_t)((glwe_dimension * polynomial_size + 1));
// shift lwe on padding bit and copy in new buffer
check_cuda_error(
cudaMemcpyAsync(lwe_array_in_buffer, lwe_array_in,
(glwe_dimension * polynomial_size + 1) * sizeof(Torus),
cudaMemcpyDeviceToDevice, *stream));
copy_and_shift_lwe<Torus, params><<<glwe_dimension, threads, 0, *stream>>>(
lwe_array_in_shifted_buffer, lwe_array_in,
(Torus)(1ll << (ciphertext_n_bits - delta_log - 1)), glwe_dimension);
check_cuda_error(cudaGetLastError());
for (int bit_idx = 0; bit_idx < number_of_bits; bit_idx++) {
cuda_keyswitch_lwe_ciphertext_vector(
v_stream, gpu_index, lwe_array_out_ks_buffer,
lwe_array_in_shifted_buffer, ksk, lwe_dimension_in, lwe_dimension_out,
base_log_ksk, level_count_ksk, 1);
copy_small_lwe<<<1, 256, 0, *stream>>>(
list_lwe_array_out, lwe_array_out_ks_buffer, lwe_dimension_out + 1,
number_of_bits, number_of_bits - bit_idx - 1);
check_cuda_error(cudaGetLastError());
if (bit_idx == number_of_bits - 1) {
break;
}
// Add q/4 to center the error while computing a negacyclic LUT
add_to_body<Torus>
<<<1, 1, 0, *stream>>>(lwe_array_out_ks_buffer, lwe_dimension_out,
(Torus)(1ll << (ciphertext_n_bits - 2)));
check_cuda_error(cudaGetLastError());
    // Fill the lut for the current bit (equivalent to a trivial encryption as
    // the mask is 0s). The LUT is filled with -alpha in each coefficient,
    // where alpha = delta*2^{bit_idx-1}
fill_lut_body_for_current_bit<Torus, params><<<1, threads, 0, *stream>>>(
        lut_pbs, (Torus)(0ll - (1ll << (delta_log - 1 + bit_idx))),
glwe_dimension);
check_cuda_error(cudaGetLastError());
host_bootstrap_fast_low_latency<Torus, params>(
v_stream, gpu_index, lwe_array_out_pbs_buffer, lut_pbs,
lut_vector_indexes, lwe_array_out_ks_buffer, fourier_bsk, pbs_buffer,
glwe_dimension, lwe_dimension_out, polynomial_size, base_log_bsk,
level_count_bsk, 1, 1, max_shared_memory);
// Add alpha where alpha = delta*2^{bit_idx-1} to end up with an encryption
// of 0 if the extracted bit was 0 and 1 in the other case
add_sub_and_mul_lwe<Torus, params><<<1, threads, 0, *stream>>>(
lwe_array_in_shifted_buffer, lwe_array_in_buffer,
lwe_array_out_pbs_buffer, (Torus)(1ll << (delta_log - 1 + bit_idx)),
(Torus)(1ll << (ciphertext_n_bits - delta_log - bit_idx - 2)),
glwe_dimension);
check_cuda_error(cudaGetLastError());
}
}
/*
* Host function for CUDA bit extraction on a batch of ciphertexts.
* It executes the device functions in a specific order and manages
* parallelism.
*/
template <typename Torus, class params>
__host__ void
host_extract_bits(void *v_stream, uint32_t gpu_index, Torus *list_lwe_array_out,
Torus *lwe_array_in, int8_t *bit_extract_buffer, Torus *ksk,
double2 *fourier_bsk, uint32_t *number_of_bits_array,
uint32_t *delta_log_array, uint32_t lwe_dimension_in,
uint32_t lwe_dimension_out, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log_bsk,
uint32_t level_count_bsk, uint32_t base_log_ksk,
uint32_t level_count_ksk, uint32_t crt_decomposition_size,
uint32_t max_shared_memory) {
cudaSetDevice(gpu_index);
auto stream = static_cast<cudaStream_t *>(v_stream);
cudaStream_t *sub_streams[crt_decomposition_size];
for (int i = 0; i < crt_decomposition_size; i++) {
sub_streams[i] = cuda_create_stream(gpu_index);
}
  uint64_t bit_extract_buffer_size =
get_buffer_size_extract_bits<Torus>(glwe_dimension, lwe_dimension_out,
polynomial_size, 1) +
get_buffer_size_bootstrap_fast_low_latency<Torus>(
glwe_dimension, polynomial_size, level_count_bsk, 1,
max_shared_memory);
int cur_total_lwe = 0;
for (int i = 0; i < crt_decomposition_size; i++) {
uint32_t number_of_bits = number_of_bits_array[i];
auto cur_input_lwe = &lwe_array_in[i * (lwe_dimension_in + 1)];
auto cur_output_lwe_array =
&list_lwe_array_out[cur_total_lwe * (lwe_dimension_out + 1)];
auto cur_bit_extract_buffer =
&bit_extract_buffer[i * bit_extract_buffer_size];
host_single_ciphertext_extract_bits<Torus, params>(
(void *)sub_streams[i], gpu_index, cur_output_lwe_array, cur_input_lwe,
cur_bit_extract_buffer, ksk, fourier_bsk, number_of_bits,
delta_log_array[i], lwe_dimension_in, lwe_dimension_out, glwe_dimension,
polynomial_size, base_log_bsk, level_count_bsk, base_log_ksk,
level_count_ksk, max_shared_memory);
cur_total_lwe += number_of_bits_array[i];
}
cudaEvent_t event;
cudaEventCreate(&event);
for (int i = 0; i < crt_decomposition_size; i++) {
cudaEventRecord(event, *(sub_streams[i]));
cudaStreamWaitEvent(*stream, event, 0);
}
for (int i = 0; i < crt_decomposition_size; i++) {
cuda_destroy_stream((sub_streams[i]), gpu_index);
}
cudaEventDestroy(event);
}
#endif // BIT_EXTRACT_CUH


@@ -1,686 +0,0 @@
#ifndef CUDA_BOOLEAN_GATES_CU
#define CUDA_BOOLEAN_GATES_CU
#include "bootstrap.h"
#include "device.h"
#include "keyswitch.h"
#include "linear_algebra.h"
constexpr uint32_t PLAINTEXT_TRUE{1 << (32 - 3)};
constexpr uint32_t PLAINTEXT_FALSE{static_cast<uint32_t>(7 << (32 - 3))};
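/*
 * Encoding note (worked reasoning added for documentation; it follows from
 * the two constants above and the standard TFHE boolean encoding). Booleans
 * live on the top bits of the 32-bit torus: TRUE = q/8 (1 << 29) and
 * FALSE = 7q/8 = -q/8. For the AND gate below, the sum
 * ct_left + ct_right + FALSE lands in the positive half of the torus (q/8)
 * only when both inputs are TRUE, and in the negative half otherwise
 * (-q/8 or -3q/8). The PBS, whose test polynomial has PLAINTEXT_TRUE in every
 * body coefficient and a zero mask, maps the positive half to PLAINTEXT_TRUE
 * and, by negacyclicity, the negative half to -PLAINTEXT_TRUE =
 * PLAINTEXT_FALSE.
 */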
extern "C" void cuda_boolean_not_32(void *v_stream, uint32_t gpu_index,
void *lwe_array_out, void *lwe_array_in,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
cuda_negate_lwe_ciphertext_vector_32(v_stream, gpu_index, lwe_array_out,
lwe_array_in, input_lwe_dimension,
input_lwe_ciphertext_count);
}
extern "C" void cuda_boolean_and_32(
void *v_stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_array_in_1, void *lwe_array_in_2, void *bootstrapping_key,
void *ksk, uint32_t input_lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t pbs_base_log, uint32_t pbs_level_count,
uint32_t ks_base_log, uint32_t ks_level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
auto stream = static_cast<cudaStream_t *>(v_stream);
uint32_t *lwe_buffer_1 = (uint32_t *)cuda_malloc_async(
(input_lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint32_t),
stream, gpu_index);
// 1. Add the two ciphertexts
cuda_add_lwe_ciphertext_vector_32(
v_stream, gpu_index, lwe_buffer_1, lwe_array_in_1, lwe_array_in_2,
input_lwe_dimension, input_lwe_ciphertext_count);
// 2. Add "false" plaintext, where "false" is 7 << (32 - 3)
uint32_t *h_false_plaintext_array =
(uint32_t *)malloc(input_lwe_ciphertext_count * sizeof(uint32_t));
for (uint index = 0; index < input_lwe_ciphertext_count; index++) {
h_false_plaintext_array[index] = PLAINTEXT_FALSE;
}
uint32_t *false_plaintext_array = (uint32_t *)cuda_malloc_async(
input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index);
cuda_memcpy_async_to_gpu(false_plaintext_array, h_false_plaintext_array,
input_lwe_ciphertext_count * sizeof(uint32_t),
stream, gpu_index);
check_cuda_error(cudaGetLastError());
uint32_t *lwe_buffer_2 = (uint32_t *)cuda_malloc_async(
(input_lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint32_t),
stream, gpu_index);
cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
v_stream, gpu_index, lwe_buffer_2, lwe_buffer_1, false_plaintext_array,
input_lwe_dimension, input_lwe_ciphertext_count);
cuda_drop_async(lwe_buffer_1, stream, gpu_index);
cuda_drop_async(false_plaintext_array, stream, gpu_index);
free(h_false_plaintext_array);
// 3. Compute a PBS with the LUT created below
uint32_t *h_pbs_lut = (uint32_t *)malloc((glwe_dimension + 1) *
polynomial_size * sizeof(uint32_t));
for (uint index = 0; index < (glwe_dimension + 1) * polynomial_size;
index++) {
h_pbs_lut[index] =
index < (glwe_dimension * polynomial_size) ? 0 : PLAINTEXT_TRUE;
}
uint32_t *pbs_lut = (uint32_t *)cuda_malloc_async(
(glwe_dimension + 1) * polynomial_size * sizeof(uint32_t), stream,
gpu_index);
cuda_memcpy_async_to_gpu(pbs_lut, h_pbs_lut,
(glwe_dimension + 1) * polynomial_size *
sizeof(uint32_t),
stream, gpu_index);
check_cuda_error(cudaGetLastError());
uint32_t *h_pbs_lut_indexes =
(uint32_t *)malloc(input_lwe_ciphertext_count * sizeof(uint32_t));
for (uint index = 0; index < input_lwe_ciphertext_count; index++) {
h_pbs_lut_indexes[index] = 0;
}
uint32_t *pbs_lut_indexes = (uint32_t *)cuda_malloc_async(
input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index);
cuda_memcpy_async_to_gpu(pbs_lut_indexes, h_pbs_lut_indexes,
input_lwe_ciphertext_count * sizeof(uint32_t),
stream, gpu_index);
check_cuda_error(cudaGetLastError());
uint32_t *lwe_pbs_buffer = (uint32_t *)cuda_malloc_async(
(glwe_dimension * polynomial_size + 1) * input_lwe_ciphertext_count *
sizeof(uint32_t),
stream, gpu_index);
check_cuda_error(cudaGetLastError());
int8_t *pbs_buffer = nullptr;
scratch_cuda_bootstrap_low_latency_32(
v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
pbs_level_count, input_lwe_ciphertext_count, max_shared_memory, true);
cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes,
lwe_buffer_2, bootstrapping_key, pbs_buffer, input_lwe_dimension,
glwe_dimension, polynomial_size, pbs_base_log, pbs_level_count,
input_lwe_ciphertext_count, 1, 0, max_shared_memory);
cleanup_cuda_bootstrap_low_latency(v_stream, gpu_index, &pbs_buffer);
check_cuda_error(cudaGetLastError());
cuda_drop_async(lwe_buffer_2, stream, gpu_index);
cuda_drop_async(pbs_lut, stream, gpu_index);
cuda_drop_async(pbs_lut_indexes, stream, gpu_index);
free(h_pbs_lut);
free(h_pbs_lut_indexes);
cuda_keyswitch_lwe_ciphertext_vector_32(
v_stream, gpu_index, lwe_array_out, lwe_pbs_buffer, ksk,
glwe_dimension * polynomial_size, input_lwe_dimension, ks_base_log,
ks_level_count, input_lwe_ciphertext_count);
cuda_drop_async(lwe_pbs_buffer, stream, gpu_index);
}
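/*
 * Illustrative host-side usage of the AND gate (a documentation sketch, not
 * part of the original sources). Device pointers d_out, d_in_1, d_in_2,
 * d_bsk and d_ksk are assumed to be already allocated and populated on the
 * GPU, and the parameter values below are placeholders.
 */
static void example_boolean_and_usage(uint32_t gpu_index, void *d_out,
                                      void *d_in_1, void *d_in_2, void *d_bsk,
                                      void *d_ksk, uint32_t num_cts,
                                      uint32_t max_shared_memory) {
  cudaStream_t *stream = cuda_create_stream(gpu_index);
  // Hypothetical boolean parameter set, for illustration only.
  uint32_t lwe_dimension = 600, glwe_dimension = 2, polynomial_size = 512;
  uint32_t pbs_base_log = 18, pbs_level_count = 1;
  uint32_t ks_base_log = 4, ks_level_count = 3;
  // Computes d_out[i] = d_in_1[i] AND d_in_2[i] for the whole batch.
  cuda_boolean_and_32(stream, gpu_index, d_out, d_in_1, d_in_2, d_bsk, d_ksk,
                      lwe_dimension, glwe_dimension, polynomial_size,
                      pbs_base_log, pbs_level_count, ks_base_log,
                      ks_level_count, num_cts, max_shared_memory);
  cudaStreamSynchronize(*stream);
  cuda_destroy_stream(stream, gpu_index);
}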
extern "C" void cuda_boolean_nand_32(
void *v_stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_array_in_1, void *lwe_array_in_2, void *bootstrapping_key,
void *ksk, uint32_t input_lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t pbs_base_log, uint32_t pbs_level_count,
uint32_t ks_base_log, uint32_t ks_level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
auto stream = static_cast<cudaStream_t *>(v_stream);
uint32_t *lwe_buffer_1 = (uint32_t *)cuda_malloc_async(
(input_lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint32_t),
stream, gpu_index);
// 1. Add the two ciphertexts
cuda_add_lwe_ciphertext_vector_32(
v_stream, gpu_index, lwe_buffer_1, lwe_array_in_1, lwe_array_in_2,
input_lwe_dimension, input_lwe_ciphertext_count);
// 2. Negate ciphertext
uint32_t *lwe_buffer_2 = (uint32_t *)cuda_malloc_async(
(input_lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint32_t),
stream, gpu_index);
cuda_negate_lwe_ciphertext_vector_32(v_stream, gpu_index, lwe_buffer_2,
lwe_buffer_1, input_lwe_dimension,
input_lwe_ciphertext_count);
cuda_drop_async(lwe_buffer_1, stream, gpu_index);
// 3. Add "true" plaintext, where "true" is 1 << (32 - 3)
uint32_t *h_true_plaintext_array =
(uint32_t *)malloc(input_lwe_ciphertext_count * sizeof(uint32_t));
for (uint index = 0; index < input_lwe_ciphertext_count; index++) {
h_true_plaintext_array[index] = PLAINTEXT_TRUE;
}
uint32_t *true_plaintext_array = (uint32_t *)cuda_malloc_async(
input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index);
cuda_memcpy_async_to_gpu(true_plaintext_array, h_true_plaintext_array,
input_lwe_ciphertext_count * sizeof(uint32_t),
stream, gpu_index);
check_cuda_error(cudaGetLastError());
uint32_t *lwe_buffer_3 = (uint32_t *)cuda_malloc_async(
(input_lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint32_t),
stream, gpu_index);
cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
v_stream, gpu_index, lwe_buffer_3, lwe_buffer_2, true_plaintext_array,
input_lwe_dimension, input_lwe_ciphertext_count);
cuda_drop_async(lwe_buffer_2, stream, gpu_index);
cuda_drop_async(true_plaintext_array, stream, gpu_index);
free(h_true_plaintext_array);
  // 4. Compute a PBS with the LUT created below
uint32_t *h_pbs_lut = (uint32_t *)malloc((glwe_dimension + 1) *
polynomial_size * sizeof(uint32_t));
for (uint index = 0; index < (glwe_dimension + 1) * polynomial_size;
index++) {
h_pbs_lut[index] =
index < (glwe_dimension * polynomial_size) ? 0 : PLAINTEXT_TRUE;
}
uint32_t *pbs_lut = (uint32_t *)cuda_malloc_async(
(glwe_dimension + 1) * polynomial_size * sizeof(uint32_t), stream,
gpu_index);
cuda_memcpy_async_to_gpu(pbs_lut, h_pbs_lut,
(glwe_dimension + 1) * polynomial_size *
sizeof(uint32_t),
stream, gpu_index);
check_cuda_error(cudaGetLastError());
uint32_t *h_pbs_lut_indexes =
(uint32_t *)malloc(input_lwe_ciphertext_count * sizeof(uint32_t));
for (uint index = 0; index < input_lwe_ciphertext_count; index++) {
h_pbs_lut_indexes[index] = 0;
}
uint32_t *pbs_lut_indexes = (uint32_t *)cuda_malloc_async(
input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index);
cuda_memcpy_async_to_gpu(pbs_lut_indexes, h_pbs_lut_indexes,
input_lwe_ciphertext_count * sizeof(uint32_t),
stream, gpu_index);
check_cuda_error(cudaGetLastError());
uint32_t *lwe_pbs_buffer = (uint32_t *)cuda_malloc_async(
(glwe_dimension * polynomial_size + 1) * input_lwe_ciphertext_count *
sizeof(uint32_t),
stream, gpu_index);
check_cuda_error(cudaGetLastError());
int8_t *pbs_buffer = nullptr;
scratch_cuda_bootstrap_low_latency_32(
v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
pbs_level_count, input_lwe_ciphertext_count, max_shared_memory, true);
cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes,
lwe_buffer_3, bootstrapping_key, pbs_buffer, input_lwe_dimension,
glwe_dimension, polynomial_size, pbs_base_log, pbs_level_count,
input_lwe_ciphertext_count, 1, 0, max_shared_memory);
cleanup_cuda_bootstrap_low_latency(v_stream, gpu_index, &pbs_buffer);
check_cuda_error(cudaGetLastError());
cuda_drop_async(lwe_buffer_3, stream, gpu_index);
cuda_drop_async(pbs_lut, stream, gpu_index);
cuda_drop_async(pbs_lut_indexes, stream, gpu_index);
free(h_pbs_lut);
free(h_pbs_lut_indexes);
cuda_keyswitch_lwe_ciphertext_vector_32(
v_stream, gpu_index, lwe_array_out, lwe_pbs_buffer, ksk,
glwe_dimension * polynomial_size, input_lwe_dimension, ks_base_log,
ks_level_count, input_lwe_ciphertext_count);
cuda_drop_async(lwe_pbs_buffer, stream, gpu_index);
}
extern "C" void cuda_boolean_nor_32(
void *v_stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_array_in_1, void *lwe_array_in_2, void *bootstrapping_key,
void *ksk, uint32_t input_lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t pbs_base_log, uint32_t pbs_level_count,
uint32_t ks_base_log, uint32_t ks_level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
auto stream = static_cast<cudaStream_t *>(v_stream);
uint32_t *lwe_buffer_1 = (uint32_t *)cuda_malloc_async(
(input_lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint32_t),
stream, gpu_index);
// 1. Add the two ciphertexts
cuda_add_lwe_ciphertext_vector_32(
v_stream, gpu_index, lwe_buffer_1, lwe_array_in_1, lwe_array_in_2,
input_lwe_dimension, input_lwe_ciphertext_count);
// 2. Negate ciphertext
uint32_t *lwe_buffer_2 = (uint32_t *)cuda_malloc_async(
(input_lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint32_t),
stream, gpu_index);
cuda_negate_lwe_ciphertext_vector_32(v_stream, gpu_index, lwe_buffer_2,
lwe_buffer_1, input_lwe_dimension,
input_lwe_ciphertext_count);
cuda_drop_async(lwe_buffer_1, stream, gpu_index);
// 3. Add "false" plaintext
uint32_t *h_false_plaintext_array =
(uint32_t *)malloc(input_lwe_ciphertext_count * sizeof(uint32_t));
for (uint index = 0; index < input_lwe_ciphertext_count; index++) {
h_false_plaintext_array[index] = PLAINTEXT_FALSE;
}
uint32_t *false_plaintext_array = (uint32_t *)cuda_malloc_async(
input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index);
cuda_memcpy_async_to_gpu(false_plaintext_array, h_false_plaintext_array,
input_lwe_ciphertext_count * sizeof(uint32_t),
stream, gpu_index);
check_cuda_error(cudaGetLastError());
uint32_t *lwe_buffer_3 = (uint32_t *)cuda_malloc_async(
(input_lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint32_t),
stream, gpu_index);
cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
v_stream, gpu_index, lwe_buffer_3, lwe_buffer_2, false_plaintext_array,
input_lwe_dimension, input_lwe_ciphertext_count);
cuda_drop_async(lwe_buffer_2, stream, gpu_index);
cuda_drop_async(false_plaintext_array, stream, gpu_index);
free(h_false_plaintext_array);
  // 4. Compute a PBS with the LUT created below
uint32_t *h_pbs_lut = (uint32_t *)malloc((glwe_dimension + 1) *
polynomial_size * sizeof(uint32_t));
for (uint index = 0; index < (glwe_dimension + 1) * polynomial_size;
index++) {
h_pbs_lut[index] =
index < (glwe_dimension * polynomial_size) ? 0 : PLAINTEXT_TRUE;
}
uint32_t *pbs_lut = (uint32_t *)cuda_malloc_async(
(glwe_dimension + 1) * polynomial_size * sizeof(uint32_t), stream,
gpu_index);
cuda_memcpy_async_to_gpu(pbs_lut, h_pbs_lut,
(glwe_dimension + 1) * polynomial_size *
sizeof(uint32_t),
stream, gpu_index);
check_cuda_error(cudaGetLastError());
uint32_t *h_pbs_lut_indexes =
(uint32_t *)malloc(input_lwe_ciphertext_count * sizeof(uint32_t));
for (uint index = 0; index < input_lwe_ciphertext_count; index++) {
h_pbs_lut_indexes[index] = 0;
}
uint32_t *pbs_lut_indexes = (uint32_t *)cuda_malloc_async(
input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index);
cuda_memcpy_async_to_gpu(pbs_lut_indexes, h_pbs_lut_indexes,
input_lwe_ciphertext_count * sizeof(uint32_t),
stream, gpu_index);
check_cuda_error(cudaGetLastError());
uint32_t *lwe_pbs_buffer = (uint32_t *)cuda_malloc_async(
(glwe_dimension * polynomial_size + 1) * input_lwe_ciphertext_count *
sizeof(uint32_t),
stream, gpu_index);
check_cuda_error(cudaGetLastError());
int8_t *pbs_buffer = nullptr;
scratch_cuda_bootstrap_low_latency_32(
v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
pbs_level_count, input_lwe_ciphertext_count, max_shared_memory, true);
cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes,
lwe_buffer_3, bootstrapping_key, pbs_buffer, input_lwe_dimension,
glwe_dimension, polynomial_size, pbs_base_log, pbs_level_count,
input_lwe_ciphertext_count, 1, 0, max_shared_memory);
cleanup_cuda_bootstrap_low_latency(v_stream, gpu_index, &pbs_buffer);
check_cuda_error(cudaGetLastError());
cuda_drop_async(lwe_buffer_3, stream, gpu_index);
cuda_drop_async(pbs_lut, stream, gpu_index);
cuda_drop_async(pbs_lut_indexes, stream, gpu_index);
free(h_pbs_lut);
free(h_pbs_lut_indexes);
cuda_keyswitch_lwe_ciphertext_vector_32(
v_stream, gpu_index, lwe_array_out, lwe_pbs_buffer, ksk,
glwe_dimension * polynomial_size, input_lwe_dimension, ks_base_log,
ks_level_count, input_lwe_ciphertext_count);
cuda_drop_async(lwe_pbs_buffer, stream, gpu_index);
}
extern "C" void cuda_boolean_or_32(
void *v_stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_array_in_1, void *lwe_array_in_2, void *bootstrapping_key,
void *ksk, uint32_t input_lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t pbs_base_log, uint32_t pbs_level_count,
uint32_t ks_base_log, uint32_t ks_level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
auto stream = static_cast<cudaStream_t *>(v_stream);
uint32_t *lwe_buffer_1 = (uint32_t *)cuda_malloc_async(
(input_lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint32_t),
stream, gpu_index);
// 1. Add the two ciphertexts
cuda_add_lwe_ciphertext_vector_32(
v_stream, gpu_index, lwe_buffer_1, lwe_array_in_1, lwe_array_in_2,
input_lwe_dimension, input_lwe_ciphertext_count);
// 2. Add "true" plaintext
uint32_t *h_true_plaintext_array =
(uint32_t *)malloc(input_lwe_ciphertext_count * sizeof(uint32_t));
for (uint index = 0; index < input_lwe_ciphertext_count; index++) {
h_true_plaintext_array[index] = PLAINTEXT_TRUE;
}
uint32_t *true_plaintext_array = (uint32_t *)cuda_malloc_async(
input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index);
cuda_memcpy_async_to_gpu(true_plaintext_array, h_true_plaintext_array,
input_lwe_ciphertext_count * sizeof(uint32_t),
stream, gpu_index);
check_cuda_error(cudaGetLastError());
uint32_t *lwe_buffer_2 = (uint32_t *)cuda_malloc_async(
(input_lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint32_t),
stream, gpu_index);
cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
v_stream, gpu_index, lwe_buffer_2, lwe_buffer_1, true_plaintext_array,
input_lwe_dimension, input_lwe_ciphertext_count);
cuda_drop_async(lwe_buffer_1, stream, gpu_index);
cuda_drop_async(true_plaintext_array, stream, gpu_index);
free(h_true_plaintext_array);
// 3. Compute a PBS with the LUT created below
uint32_t *h_pbs_lut = (uint32_t *)malloc((glwe_dimension + 1) *
polynomial_size * sizeof(uint32_t));
for (uint index = 0; index < (glwe_dimension + 1) * polynomial_size;
index++) {
h_pbs_lut[index] =
index < (glwe_dimension * polynomial_size) ? 0 : PLAINTEXT_TRUE;
}
uint32_t *pbs_lut = (uint32_t *)cuda_malloc_async(
(glwe_dimension + 1) * polynomial_size * sizeof(uint32_t), stream,
gpu_index);
cuda_memcpy_async_to_gpu(pbs_lut, h_pbs_lut,
(glwe_dimension + 1) * polynomial_size *
sizeof(uint32_t),
stream, gpu_index);
check_cuda_error(cudaGetLastError());
uint32_t *h_pbs_lut_indexes =
(uint32_t *)malloc(input_lwe_ciphertext_count * sizeof(uint32_t));
for (uint index = 0; index < input_lwe_ciphertext_count; index++) {
h_pbs_lut_indexes[index] = 0;
}
uint32_t *pbs_lut_indexes = (uint32_t *)cuda_malloc_async(
input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index);
cuda_memcpy_async_to_gpu(pbs_lut_indexes, h_pbs_lut_indexes,
input_lwe_ciphertext_count * sizeof(uint32_t),
stream, gpu_index);
check_cuda_error(cudaGetLastError());
uint32_t *lwe_pbs_buffer = (uint32_t *)cuda_malloc_async(
(glwe_dimension * polynomial_size + 1) * input_lwe_ciphertext_count *
sizeof(uint32_t),
stream, gpu_index);
check_cuda_error(cudaGetLastError());
int8_t *pbs_buffer = nullptr;
scratch_cuda_bootstrap_low_latency_32(
v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
pbs_level_count, input_lwe_ciphertext_count, max_shared_memory, true);
cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes,
lwe_buffer_2, bootstrapping_key, pbs_buffer, input_lwe_dimension,
glwe_dimension, polynomial_size, pbs_base_log, pbs_level_count,
input_lwe_ciphertext_count, 1, 0, max_shared_memory);
cleanup_cuda_bootstrap_low_latency(v_stream, gpu_index, &pbs_buffer);
check_cuda_error(cudaGetLastError());
cuda_drop_async(lwe_buffer_2, stream, gpu_index);
cuda_drop_async(pbs_lut, stream, gpu_index);
cuda_drop_async(pbs_lut_indexes, stream, gpu_index);
free(h_pbs_lut);
free(h_pbs_lut_indexes);
cuda_keyswitch_lwe_ciphertext_vector_32(
v_stream, gpu_index, lwe_array_out, lwe_pbs_buffer, ksk,
glwe_dimension * polynomial_size, input_lwe_dimension, ks_base_log,
ks_level_count, input_lwe_ciphertext_count);
cuda_drop_async(lwe_pbs_buffer, stream, gpu_index);
}
extern "C" void cuda_boolean_xor_32(
void *v_stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_array_in_1, void *lwe_array_in_2, void *bootstrapping_key,
void *ksk, uint32_t input_lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t pbs_base_log, uint32_t pbs_level_count,
uint32_t ks_base_log, uint32_t ks_level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
auto stream = static_cast<cudaStream_t *>(v_stream);
uint32_t *lwe_buffer_1 = (uint32_t *)cuda_malloc_async(
(input_lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint32_t),
stream, gpu_index);
// 1. Add the two ciphertexts
cuda_add_lwe_ciphertext_vector_32(
v_stream, gpu_index, lwe_buffer_1, lwe_array_in_1, lwe_array_in_2,
input_lwe_dimension, input_lwe_ciphertext_count);
// 2. Add "true" plaintext
uint32_t *h_true_plaintext_array =
(uint32_t *)malloc(input_lwe_ciphertext_count * sizeof(uint32_t));
for (uint index = 0; index < input_lwe_ciphertext_count; index++) {
h_true_plaintext_array[index] = PLAINTEXT_TRUE;
}
uint32_t *true_plaintext_array = (uint32_t *)cuda_malloc_async(
input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index);
cuda_memcpy_async_to_gpu(true_plaintext_array, h_true_plaintext_array,
input_lwe_ciphertext_count * sizeof(uint32_t),
stream, gpu_index);
check_cuda_error(cudaGetLastError());
uint32_t *lwe_buffer_2 = (uint32_t *)cuda_malloc_async(
(input_lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint32_t),
stream, gpu_index);
cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
v_stream, gpu_index, lwe_buffer_2, lwe_buffer_1, true_plaintext_array,
input_lwe_dimension, input_lwe_ciphertext_count);
cuda_drop_async(lwe_buffer_1, stream, gpu_index);
cuda_drop_async(true_plaintext_array, stream, gpu_index);
free(h_true_plaintext_array);
// 3. Multiply by 2
uint32_t *h_cleartext_array =
(uint32_t *)malloc(input_lwe_ciphertext_count * sizeof(uint32_t));
for (uint index = 0; index < input_lwe_ciphertext_count; index++) {
h_cleartext_array[index] = 2;
}
uint32_t *cleartext_array = (uint32_t *)cuda_malloc_async(
input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index);
cuda_memcpy_async_to_gpu(cleartext_array, h_cleartext_array,
input_lwe_ciphertext_count * sizeof(uint32_t),
stream, gpu_index);
check_cuda_error(cudaGetLastError());
uint32_t *lwe_buffer_3 = (uint32_t *)cuda_malloc_async(
(input_lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint32_t),
stream, gpu_index);
cuda_mult_lwe_ciphertext_vector_cleartext_vector_32(
v_stream, gpu_index, lwe_buffer_3, lwe_buffer_2, cleartext_array,
input_lwe_dimension, input_lwe_ciphertext_count);
cuda_drop_async(lwe_buffer_2, stream, gpu_index);
// 4. Compute a PBS with the LUT created below
uint32_t *h_pbs_lut = (uint32_t *)malloc((glwe_dimension + 1) *
polynomial_size * sizeof(uint32_t));
for (uint index = 0; index < (glwe_dimension + 1) * polynomial_size;
index++) {
h_pbs_lut[index] =
index < (glwe_dimension * polynomial_size) ? 0 : PLAINTEXT_TRUE;
}
uint32_t *pbs_lut = (uint32_t *)cuda_malloc_async(
(glwe_dimension + 1) * polynomial_size * sizeof(uint32_t), stream,
gpu_index);
cuda_memcpy_async_to_gpu(pbs_lut, h_pbs_lut,
(glwe_dimension + 1) * polynomial_size *
sizeof(uint32_t),
stream, gpu_index);
check_cuda_error(cudaGetLastError());
uint32_t *h_pbs_lut_indexes =
(uint32_t *)malloc(input_lwe_ciphertext_count * sizeof(uint32_t));
for (uint index = 0; index < input_lwe_ciphertext_count; index++) {
h_pbs_lut_indexes[index] = 0;
}
uint32_t *pbs_lut_indexes = (uint32_t *)cuda_malloc_async(
input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index);
cuda_memcpy_async_to_gpu(pbs_lut_indexes, h_pbs_lut_indexes,
input_lwe_ciphertext_count * sizeof(uint32_t),
stream, gpu_index);
check_cuda_error(cudaGetLastError());
uint32_t *lwe_pbs_buffer = (uint32_t *)cuda_malloc_async(
(glwe_dimension * polynomial_size + 1) * input_lwe_ciphertext_count *
sizeof(uint32_t),
stream, gpu_index);
check_cuda_error(cudaGetLastError());
int8_t *pbs_buffer = nullptr;
scratch_cuda_bootstrap_low_latency_32(
v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
pbs_level_count, input_lwe_ciphertext_count, max_shared_memory, true);
cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes,
lwe_buffer_3, bootstrapping_key, pbs_buffer, input_lwe_dimension,
glwe_dimension, polynomial_size, pbs_base_log, pbs_level_count,
input_lwe_ciphertext_count, 1, 0, max_shared_memory);
cleanup_cuda_bootstrap_low_latency(v_stream, gpu_index, &pbs_buffer);
check_cuda_error(cudaGetLastError());
cuda_drop_async(lwe_buffer_3, stream, gpu_index);
cuda_drop_async(pbs_lut, stream, gpu_index);
cuda_drop_async(pbs_lut_indexes, stream, gpu_index);
free(h_pbs_lut);
free(h_pbs_lut_indexes);
cuda_keyswitch_lwe_ciphertext_vector_32(
v_stream, gpu_index, lwe_array_out, lwe_pbs_buffer, ksk,
glwe_dimension * polynomial_size, input_lwe_dimension, ks_base_log,
ks_level_count, input_lwe_ciphertext_count);
cuda_drop_async(lwe_pbs_buffer, stream, gpu_index);
}
extern "C" void cuda_boolean_xnor_32(
void *v_stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_array_in_1, void *lwe_array_in_2, void *bootstrapping_key,
void *ksk, uint32_t input_lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t pbs_base_log, uint32_t pbs_level_count,
uint32_t ks_base_log, uint32_t ks_level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
auto stream = static_cast<cudaStream_t *>(v_stream);
uint32_t *lwe_buffer_1 = (uint32_t *)cuda_malloc_async(
(input_lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint32_t),
stream, gpu_index);
// 1. Add the two ciphertexts
cuda_add_lwe_ciphertext_vector_32(
v_stream, gpu_index, lwe_buffer_1, lwe_array_in_1, lwe_array_in_2,
input_lwe_dimension, input_lwe_ciphertext_count);
// 2. Add "true" plaintext
uint32_t *h_true_plaintext_array =
(uint32_t *)malloc(input_lwe_ciphertext_count * sizeof(uint32_t));
for (uint index = 0; index < input_lwe_ciphertext_count; index++) {
h_true_plaintext_array[index] = PLAINTEXT_TRUE;
}
uint32_t *true_plaintext_array = (uint32_t *)cuda_malloc_async(
input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index);
cuda_memcpy_async_to_gpu(true_plaintext_array, h_true_plaintext_array,
input_lwe_ciphertext_count * sizeof(uint32_t),
stream, gpu_index);
check_cuda_error(cudaGetLastError());
uint32_t *lwe_buffer_2 = (uint32_t *)cuda_malloc_async(
(input_lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint32_t),
stream, gpu_index);
cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
v_stream, gpu_index, lwe_buffer_2, lwe_buffer_1, true_plaintext_array,
input_lwe_dimension, input_lwe_ciphertext_count);
cuda_drop_async(lwe_buffer_1, stream, gpu_index);
cuda_drop_async(true_plaintext_array, stream, gpu_index);
free(h_true_plaintext_array);
// 3. Negate ciphertext
uint32_t *lwe_buffer_3 = (uint32_t *)cuda_malloc_async(
(input_lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint32_t),
stream, gpu_index);
cuda_negate_lwe_ciphertext_vector_32(v_stream, gpu_index, lwe_buffer_3,
lwe_buffer_2, input_lwe_dimension,
input_lwe_ciphertext_count);
cuda_drop_async(lwe_buffer_2, stream, gpu_index);
// 4. Multiply by 2
uint32_t *h_cleartext_array =
(uint32_t *)malloc(input_lwe_ciphertext_count * sizeof(uint32_t));
for (uint index = 0; index < input_lwe_ciphertext_count; index++) {
h_cleartext_array[index] = 2;
}
uint32_t *cleartext_array = (uint32_t *)cuda_malloc_async(
input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index);
cuda_memcpy_async_to_gpu(cleartext_array, h_cleartext_array,
input_lwe_ciphertext_count * sizeof(uint32_t),
stream, gpu_index);
check_cuda_error(cudaGetLastError());
uint32_t *lwe_buffer_4 = (uint32_t *)cuda_malloc_async(
(input_lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint32_t),
stream, gpu_index);
cuda_mult_lwe_ciphertext_vector_cleartext_vector_32(
v_stream, gpu_index, lwe_buffer_4, lwe_buffer_3, cleartext_array,
input_lwe_dimension, input_lwe_ciphertext_count);
cuda_drop_async(lwe_buffer_3, stream, gpu_index);
// 5. Compute a PBS with the LUT created below
uint32_t *h_pbs_lut = (uint32_t *)malloc((glwe_dimension + 1) *
polynomial_size * sizeof(uint32_t));
for (uint index = 0; index < (glwe_dimension + 1) * polynomial_size;
index++) {
h_pbs_lut[index] =
index < (glwe_dimension * polynomial_size) ? 0 : PLAINTEXT_TRUE;
}
uint32_t *pbs_lut = (uint32_t *)cuda_malloc_async(
(glwe_dimension + 1) * polynomial_size * sizeof(uint32_t), stream,
gpu_index);
cuda_memcpy_async_to_gpu(pbs_lut, h_pbs_lut,
(glwe_dimension + 1) * polynomial_size *
sizeof(uint32_t),
stream, gpu_index);
check_cuda_error(cudaGetLastError());
uint32_t *h_pbs_lut_indexes =
(uint32_t *)malloc(input_lwe_ciphertext_count * sizeof(uint32_t));
for (uint index = 0; index < input_lwe_ciphertext_count; index++) {
h_pbs_lut_indexes[index] = 0;
}
uint32_t *pbs_lut_indexes = (uint32_t *)cuda_malloc_async(
input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index);
cuda_memcpy_async_to_gpu(pbs_lut_indexes, h_pbs_lut_indexes,
input_lwe_ciphertext_count * sizeof(uint32_t),
stream, gpu_index);
check_cuda_error(cudaGetLastError());
uint32_t *lwe_pbs_buffer = (uint32_t *)cuda_malloc_async(
(glwe_dimension * polynomial_size + 1) * input_lwe_ciphertext_count *
sizeof(uint32_t),
stream, gpu_index);
check_cuda_error(cudaGetLastError());
int8_t *pbs_buffer = nullptr;
scratch_cuda_bootstrap_low_latency_32(
v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
pbs_level_count, input_lwe_ciphertext_count, max_shared_memory, true);
cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes,
lwe_buffer_4, bootstrapping_key, pbs_buffer, input_lwe_dimension,
glwe_dimension, polynomial_size, pbs_base_log, pbs_level_count,
input_lwe_ciphertext_count, 1, 0, max_shared_memory);
cleanup_cuda_bootstrap_low_latency(v_stream, gpu_index, &pbs_buffer);
check_cuda_error(cudaGetLastError());
cuda_drop_async(lwe_buffer_4, stream, gpu_index);
cuda_drop_async(pbs_lut, stream, gpu_index);
cuda_drop_async(pbs_lut_indexes, stream, gpu_index);
free(h_pbs_lut);
free(h_pbs_lut_indexes);
cuda_keyswitch_lwe_ciphertext_vector_32(
v_stream, gpu_index, lwe_array_out, lwe_pbs_buffer, ksk,
glwe_dimension * polynomial_size, input_lwe_dimension, ks_base_log,
ks_level_count, input_lwe_ciphertext_count);
cuda_drop_async(lwe_pbs_buffer, stream, gpu_index);
}
#endif // CUDA_BOOLEAN_GATES_CU


@@ -1 +0,0 @@
#include "crypto/bootstrapping_key.cuh"


@@ -1,378 +0,0 @@
#include <err.h>
#include "bootstrap_amortized.cuh"
/*
* Returns the buffer size for 64-bit executions
*/
uint64_t get_buffer_size_bootstrap_amortized_64(
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
return get_buffer_size_bootstrap_amortized<uint64_t>(
glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
max_shared_memory);
}
/*
* Runs standard checks to validate the inputs
*/
void checks_fast_bootstrap_amortized(int polynomial_size) {
assert(
("Error (GPU amortized PBS): polynomial size should be one of 256, 512, "
"1024, 2048, 4096, 8192, 16384",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192 ||
polynomial_size == 16384));
}
/*
* Runs standard checks to validate the inputs
*/
void checks_bootstrap_amortized(int nbits, int base_log, int polynomial_size) {
assert(("Error (GPU amortized PBS): base log should be <= nbits",
base_log <= nbits));
checks_fast_bootstrap_amortized(polynomial_size);
}
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the amortized PBS on 32-bit inputs, into `pbs_buffer`. It also
* configures SM options on the GPU in case FULLSM or PARTIALSM mode is going to
* be used.
*/
void scratch_cuda_bootstrap_amortized_32(void *v_stream, uint32_t gpu_index,
int8_t **pbs_buffer,
uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t input_lwe_ciphertext_count,
uint32_t max_shared_memory,
bool allocate_gpu_memory) {
checks_fast_bootstrap_amortized(polynomial_size);
switch (polynomial_size) {
case 256:
scratch_bootstrap_amortized<uint32_t, int32_t, AmortizedDegree<256>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 512:
scratch_bootstrap_amortized<uint32_t, int32_t, AmortizedDegree<512>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 1024:
scratch_bootstrap_amortized<uint32_t, int32_t, AmortizedDegree<1024>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 2048:
scratch_bootstrap_amortized<uint32_t, int32_t, AmortizedDegree<2048>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 4096:
scratch_bootstrap_amortized<uint32_t, int32_t, AmortizedDegree<4096>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 8192:
scratch_bootstrap_amortized<uint32_t, int32_t, AmortizedDegree<8192>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 16384:
scratch_bootstrap_amortized<uint32_t, int32_t, AmortizedDegree<16384>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
default:
errx(EXIT_FAILURE, "polynomial size %u is not supported. Supported values "
"are: 256, 512, 1024, 2048, 4096, 8192, 16384.", polynomial_size);
break;
}
}
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the amortized PBS on 64-bit inputs, into `pbs_buffer`. It also
* configures SM options on the GPU in case FULLSM or PARTIALSM mode is going to
* be used.
*/
void scratch_cuda_bootstrap_amortized_64(void *v_stream, uint32_t gpu_index,
int8_t **pbs_buffer,
uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t input_lwe_ciphertext_count,
uint32_t max_shared_memory,
bool allocate_gpu_memory) {
checks_fast_bootstrap_amortized(polynomial_size);
switch (polynomial_size) {
case 256:
scratch_bootstrap_amortized<uint64_t, int64_t, AmortizedDegree<256>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 512:
scratch_bootstrap_amortized<uint64_t, int64_t, AmortizedDegree<512>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 1024:
scratch_bootstrap_amortized<uint64_t, int64_t, AmortizedDegree<1024>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 2048:
scratch_bootstrap_amortized<uint64_t, int64_t, AmortizedDegree<2048>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 4096:
scratch_bootstrap_amortized<uint64_t, int64_t, AmortizedDegree<4096>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 8192:
scratch_bootstrap_amortized<uint64_t, int64_t, AmortizedDegree<8192>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 16384:
scratch_bootstrap_amortized<uint64_t, int64_t, AmortizedDegree<16384>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
default:
errx(EXIT_FAILURE, "polynomial size %u is not supported. Supported values "
"are: 256, 512, 1024, 2048, 4096, 8192, 16384.", polynomial_size);
break;
}
}
/* Perform the programmable bootstrapping on a batch of input u32 LWE
* ciphertexts. See the corresponding 64-bit operation for more details.
*/
void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lut_vector,
void *lut_vector_indexes, void *lwe_array_in, void *bootstrapping_key,
int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
uint32_t max_shared_memory) {
checks_bootstrap_amortized(32, base_log, polynomial_size);
switch (polynomial_size) {
case 256:
host_bootstrap_amortized<uint32_t, AmortizedDegree<256>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
case 512:
host_bootstrap_amortized<uint32_t, AmortizedDegree<512>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
case 1024:
host_bootstrap_amortized<uint32_t, AmortizedDegree<1024>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
case 2048:
host_bootstrap_amortized<uint32_t, AmortizedDegree<2048>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
case 4096:
host_bootstrap_amortized<uint32_t, AmortizedDegree<4096>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
case 8192:
host_bootstrap_amortized<uint32_t, AmortizedDegree<8192>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
case 16384:
host_bootstrap_amortized<uint32_t, AmortizedDegree<16384>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
default:
errx(EXIT_FAILURE, "polynomial size %u is not supported. Supported values "
"are: 256, 512, 1024, 2048, 4096, 8192, 16384.", polynomial_size);
break;
}
}
/* Perform the programmable bootstrapping on a batch of input u64 LWE
* ciphertexts. This function performs best for large numbers of inputs (> 10).
* - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
* launch
* - `gpu_index` is the index of the GPU to be used in the kernel launch
* - lwe_array_out: output batch of num_samples bootstrapped ciphertexts c =
* (a0,..an-1,b) where n is the LWE dimension
* - lut_vector: should hold as many test vectors of size polynomial_size
* as there are input ciphertexts, but actually holds
* num_lut_vectors vectors to reduce memory usage
* - lut_vector_indexes: stores the index corresponding to
* which test vector of lut_vector to use for each LWE input in
* lwe_array_in
* - lwe_array_in: input batch of num_samples LWE ciphertexts, containing n
* mask values + 1 body value
* - bootstrapping_key: GGSW encryption of the LWE secret key sk1
* under secret key sk2
* bsk = Z + sk1 H
* where H is the gadget matrix and Z is a matrix (k+1).l
* containing GLWE encryptions of 0 under sk2.
* bsk is thus a tensor of size (k+1)^2.l.N.n
* where l is the number of decomposition levels and
* k is the GLWE dimension, N is the polynomial size for
* GLWE. The polynomial size for GLWE and the test vector
* are the same because they have to be in the same ring
* to be multiplied.
* - input_lwe_dimension: size of the Torus vector used to encrypt the input
* LWE ciphertexts - referred to as n above (~ 600)
* - polynomial_size: size of the test polynomial (test vector) and size of the
* GLWE polynomials (~1024) (where `size` refers to the polynomial degree + 1).
* - base_log: log of the base used for the gadget matrix - B = 2^base_log (~8)
* - level_count: number of decomposition levels in the gadget matrix (~4)
* - num_samples: number of encrypted input messages
* - num_lut_vectors: parameter to set the actual number of test vectors to be
* used
* - lwe_idx: the index of the LWE input to consider for the GPU of index
* gpu_index. In case of multi-GPU computing, it is assumed that only a part of
* the input LWE array is copied to each GPU, but the whole LUT array is copied
* (because the case when the number of LUTs is smaller than the number of input
* LWEs is not trivial to take into account in the data repartition on the
* GPUs). `lwe_idx` is used to determine which LUT to consider for a given LWE
* input in the LUT array `lut_vector`.
* - 'max_shared_memory' maximum amount of shared memory to be used inside
* device functions
*
* This function calls a wrapper to a device kernel that performs the
* bootstrapping:
* - the kernel is templatized based on integer discretization and
* polynomial degree
* - num_samples blocks of threads are launched, where each thread is going
* to handle one or more polynomial coefficients at each stage:
* - perform the blind rotation
* - round the result
* - decompose into level_count levels, then for each level:
* - switch to the FFT domain
* - multiply with the bootstrapping key
* - come back to the coefficients representation
* - between each stage a synchronization of the threads is necessary
* - in case the device has enough shared memory, temporary arrays used for
* the different stages (accumulators) are stored into the shared memory
* - the accumulators serve to combine the results for all decomposition
* levels
* - the constant memory (64K) is used for storing the roots of unity
* values for the FFT
*/
void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lut_vector,
void *lut_vector_indexes, void *lwe_array_in, void *bootstrapping_key,
int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
uint32_t max_shared_memory) {
checks_bootstrap_amortized(64, base_log, polynomial_size);
switch (polynomial_size) {
case 256:
host_bootstrap_amortized<uint64_t, AmortizedDegree<256>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
case 512:
host_bootstrap_amortized<uint64_t, AmortizedDegree<512>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
case 1024:
host_bootstrap_amortized<uint64_t, AmortizedDegree<1024>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
case 2048:
host_bootstrap_amortized<uint64_t, AmortizedDegree<2048>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
case 4096:
host_bootstrap_amortized<uint64_t, AmortizedDegree<4096>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
case 8192:
host_bootstrap_amortized<uint64_t, AmortizedDegree<8192>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
case 16384:
host_bootstrap_amortized<uint64_t, AmortizedDegree<16384>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
default:
errx(EXIT_FAILURE, "polynomial size %u is not supported. Supported values "
"are: 256, 512, 1024, 2048, 4096, 8192, 16384.", polynomial_size);
break;
}
}
/*
* This cleanup function frees the data for the amortized PBS on GPU in
* pbs_buffer for 32 or 64 bits inputs.
*/
void cleanup_cuda_bootstrap_amortized(void *v_stream, uint32_t gpu_index,
int8_t **pbs_buffer) {
auto stream = static_cast<cudaStream_t *>(v_stream);
// Free memory
cuda_drop_async(*pbs_buffer, stream, gpu_index);
}
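/*
 * A minimal usage sketch of the API above. The scratch function name and the
 * numeric parameters below are illustrative assumptions chosen to match the
 * orders of magnitude quoted in the comments (n ~ 600, N ~ 1024,
 * base_log ~ 8, level_count ~ 4):
 *
 *   int8_t *pbs_buffer = nullptr;
 *   scratch_cuda_bootstrap_amortized_64(stream, gpu_index, &pbs_buffer,
 *                                       1, 1024, num_samples,
 *                                       max_shared_memory, true);
 *   cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
 *       stream, gpu_index, lwe_out, lut, lut_indexes, lwe_in, bsk, pbs_buffer,
 *       600, 1, 1024, 8, 4, num_samples, 1, 0, max_shared_memory);
 *   cleanup_cuda_bootstrap_amortized(stream, gpu_index, &pbs_buffer);
 */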

View File

@@ -1,363 +0,0 @@
#ifdef __CDT_PARSER__
#undef __CUDA_RUNTIME_H__
#include <cuda_runtime.h>
#endif
#ifndef CNCRT_AMORTIZED_PBS_H
#define CNCRT_AMORTIZED_PBS_H
#include "bootstrap.h"
#include "complex/operations.cuh"
#include "crypto/gadget.cuh"
#include "crypto/torus.cuh"
#include "device.h"
#include "fft/bnsmfft.cuh"
#include "fft/twiddles.cuh"
#include "polynomial/functions.cuh"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial.cuh"
#include "polynomial/polynomial_math.cuh"
#include "utils/timer.cuh"
template <typename Torus, class params, sharedMemDegree SMD>
/*
* Kernel launched by host_bootstrap_amortized
*
* Uses shared memory to increase performance
* - lwe_array_out: output batch of num_samples bootstrapped ciphertexts c =
* (a0,..an-1,b) where n is the LWE dimension
* - lut_vector: should hold as many test vectors of size polynomial_size
* as there are input ciphertexts, but actually holds
* num_lut_vectors vectors to reduce memory usage
* - lut_vector_indexes: stores the index corresponding to which test vector
* to use for each sample in lut_vector
* - lwe_array_in: input batch of num_samples LWE ciphertexts, containing n
* mask values + 1 body value
* - bootstrapping_key: RGSW encryption of the LWE secret key sk1 under secret
* key sk2
* - device_mem: pointer to the device's global memory in case we use it (SMD
* == NOSM or PARTIALSM)
* - lwe_dimension: size of the Torus vector used to encrypt the input
* LWE ciphertexts - referred to as n above (~ 600)
* - polynomial_size: size of the test polynomial (test vector) and size of the
* GLWE polynomial (~1024)
* - base_log: log base used for the gadget matrix - B = 2^base_log (~8)
* - level_count: number of decomposition levels in the gadget matrix (~4)
 * - lwe_idx: index of the first sample handled by the current GPU, i.e. the
 * number of samples per GPU times the GPU index (useful for multi-GPU
 * computations)
* - device_memory_size_per_sample: amount of global memory to allocate if SMD
* is not FULLSM
*/
__global__ void device_bootstrap_amortized(
Torus *lwe_array_out, Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, double2 *bootstrapping_key, int8_t *device_mem,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t lwe_idx,
size_t device_memory_size_per_sample) {
// We use shared memory for the polynomials that are used often during the
// bootstrap, since shared memory is kept in L1 cache and accessing it is
// much faster than global memory
extern __shared__ int8_t sharedmem[];
int8_t *selected_memory;
if constexpr (SMD == FULLSM)
selected_memory = sharedmem;
else
selected_memory = &device_mem[blockIdx.x * device_memory_size_per_sample];
  // The accumulator is a GLWE ciphertext: it holds glwe_dimension mask
  // polynomials plus one body polynomial.
Torus *accumulator = (Torus *)selected_memory;
Torus *accumulator_rotated =
(Torus *)accumulator +
(ptrdiff_t)((glwe_dimension + 1) * polynomial_size);
double2 *res_fft =
(double2 *)accumulator_rotated + (glwe_dimension + 1) * polynomial_size /
(sizeof(double2) / sizeof(Torus));
double2 *accumulator_fft = (double2 *)sharedmem;
if constexpr (SMD != PARTIALSM)
accumulator_fft = (double2 *)res_fft +
(ptrdiff_t)((glwe_dimension + 1) * polynomial_size / 2);
auto block_lwe_array_in = &lwe_array_in[blockIdx.x * (lwe_dimension + 1)];
Torus *block_lut_vector =
&lut_vector[lut_vector_indexes[lwe_idx + blockIdx.x] * params::degree *
(glwe_dimension + 1)];
// Put "b", the body, in [0, 2N[
Torus b_hat = 0;
rescale_torus_element(block_lwe_array_in[lwe_dimension], b_hat,
2 * params::degree); // 2 * params::log2_degree + 1);
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, block_lut_vector, b_hat, false, glwe_dimension + 1);
// Loop over all the mask elements of the sample to accumulate
// (X^a_i-1) multiplication, decomposition of the resulting polynomial
// into level_count polynomials, and performing polynomial multiplication
// via an FFT with the RGSW encrypted secret key
for (int iteration = 0; iteration < lwe_dimension; iteration++) {
synchronize_threads_in_block();
// Put "a" in [0, 2N[ instead of Zq
Torus a_hat = 0;
rescale_torus_element(block_lwe_array_in[iteration], a_hat,
2 * params::degree); // 2 * params::log2_degree + 1);
    // Perform ACC * (X^a_hat - 1)
multiply_by_monomial_negacyclic_and_sub_polynomial<
Torus, params::opt, params::degree / params::opt>(
accumulator, accumulator_rotated, a_hat, glwe_dimension + 1);
synchronize_threads_in_block();
// Perform a rounding to increase the accuracy of the
// bootstrapped ciphertext
round_to_closest_multiple_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator_rotated, base_log, level_count, glwe_dimension + 1);
// Initialize the polynomial multiplication via FFT arrays
// The polynomial multiplications happens at the block level
// and each thread handles two or more coefficients
int pos = threadIdx.x;
for (int i = 0; i < (glwe_dimension + 1); i++)
for (int j = 0; j < params::opt / 2; j++) {
res_fft[pos].x = 0;
res_fft[pos].y = 0;
pos += params::degree / params::opt;
}
GadgetMatrix<Torus, params> gadget(base_log, level_count,
accumulator_rotated, glwe_dimension + 1);
// Now that the rotation is done, decompose the resulting polynomial
// coefficients so as to multiply each decomposed level with the
// corresponding part of the bootstrapping key
for (int level = level_count - 1; level >= 0; level--) {
for (int i = 0; i < (glwe_dimension + 1); i++) {
gadget.decompose_and_compress_next_polynomial(accumulator_fft, i);
// Switch to the FFT space
NSMFFT_direct<HalfDegree<params>>(accumulator_fft);
// Get the bootstrapping key piece necessary for the multiplication
// It is already in the Fourier domain
auto bsk_slice = get_ith_mask_kth_block(bootstrapping_key, iteration, i,
level, polynomial_size,
glwe_dimension, level_count);
// Perform the coefficient-wise product with the two pieces of
// bootstrapping key
for (int j = 0; j < (glwe_dimension + 1); j++) {
auto bsk_poly = bsk_slice + j * params::degree / 2;
auto res_fft_poly = res_fft + j * params::degree / 2;
polynomial_product_accumulate_in_fourier_domain<params, double2>(
res_fft_poly, accumulator_fft, bsk_poly);
}
}
synchronize_threads_in_block();
}
// Come back to the coefficient representation
if constexpr (SMD == FULLSM || SMD == NOSM) {
synchronize_threads_in_block();
for (int i = 0; i < (glwe_dimension + 1); i++) {
auto res_fft_slice = res_fft + i * params::degree / 2;
NSMFFT_inverse<HalfDegree<params>>(res_fft_slice);
}
synchronize_threads_in_block();
for (int i = 0; i < (glwe_dimension + 1); i++) {
auto accumulator_slice = accumulator + i * params::degree;
auto res_fft_slice = res_fft + i * params::degree / 2;
add_to_torus<Torus, params>(res_fft_slice, accumulator_slice);
}
synchronize_threads_in_block();
} else {
#pragma unroll
for (int i = 0; i < (glwe_dimension + 1); i++) {
auto accumulator_slice = accumulator + i * params::degree;
auto res_fft_slice = res_fft + i * params::degree / 2;
int tid = threadIdx.x;
for (int j = 0; j < params::opt / 2; j++) {
accumulator_fft[tid] = res_fft_slice[tid];
tid = tid + params::degree / params::opt;
}
synchronize_threads_in_block();
NSMFFT_inverse<HalfDegree<params>>(accumulator_fft);
synchronize_threads_in_block();
add_to_torus<Torus, params>(accumulator_fft, accumulator_slice);
}
synchronize_threads_in_block();
}
}
auto block_lwe_array_out =
&lwe_array_out[blockIdx.x * (glwe_dimension * polynomial_size + 1)];
// The blind rotation for this block is over
// Now we can perform the sample extraction: for the body it's just
// the resulting constant coefficient of the accumulator
// For the mask it's more complicated
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator,
glwe_dimension);
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator,
glwe_dimension);
}
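/*
 * Worked example of the rescaling step used above (a sketch assuming
 * Torus = uint64_t and N = params::degree = 1024, so 2N = 2048):
 * rescale_torus_element is expected to map a torus element t to
 * round(t * 2N / 2^64), i.e. to keep its top log2(2N) = 11 bits with
 * rounding. For instance t = 2^63 (the encoding of 1/2) maps to
 * b_hat = 1024 = N, and dividing the accumulator by X^N in the negacyclic
 * ring Z[X]/(X^N + 1) negates the test polynomial.
 */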
template <typename Torus>
__host__ __device__ uint64_t get_buffer_size_full_sm_bootstrap_amortized(
uint32_t polynomial_size, uint32_t glwe_dimension) {
return sizeof(Torus) * polynomial_size * (glwe_dimension + 1) + // accumulator
sizeof(Torus) * polynomial_size *
(glwe_dimension + 1) + // accumulator rotated
sizeof(double2) * polynomial_size / 2 + // accumulator fft
sizeof(double2) * polynomial_size / 2 *
(glwe_dimension + 1); // res fft
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_partial_sm_bootstrap_amortized(uint32_t polynomial_size) {
return sizeof(double2) * polynomial_size / 2; // accumulator fft
}
template <typename Torus>
__host__ __device__ uint64_t get_buffer_size_bootstrap_amortized(
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
uint64_t full_sm = get_buffer_size_full_sm_bootstrap_amortized<Torus>(
polynomial_size, glwe_dimension);
uint64_t partial_sm =
get_buffer_size_partial_sm_bootstrap_amortized<Torus>(polynomial_size);
uint64_t partial_dm = full_sm - partial_sm;
uint64_t full_dm = full_sm;
uint64_t device_mem = 0;
if (max_shared_memory < partial_sm) {
device_mem = full_dm * input_lwe_ciphertext_count;
} else if (max_shared_memory < full_sm) {
device_mem = partial_dm * input_lwe_ciphertext_count;
}
return device_mem + device_mem % sizeof(double2);
}
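/*
 * Worked example for the three buffer sizes above, assuming Torus = uint64_t,
 * polynomial_size N = 1024 and glwe_dimension k = 1:
 *   full_sm    = 8*1024*2 + 8*1024*2 + 16*512 + 16*512*2 = 57344 B (56 KB)
 *   partial_sm = 16*512                                  =  8192 B  (8 KB)
 * On a device exposing only 48 KB of shared memory per block, partial_sm
 * fits but full_sm does not, so partial_dm = 57344 - 8192 = 49152 B of
 * global memory is reserved per input ciphertext.
 */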
template <typename Torus, typename STorus, typename params>
__host__ void scratch_bootstrap_amortized(void *v_stream, uint32_t gpu_index,
int8_t **pbs_buffer,
uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t input_lwe_ciphertext_count,
uint32_t max_shared_memory,
bool allocate_gpu_memory) {
cudaSetDevice(gpu_index);
auto stream = static_cast<cudaStream_t *>(v_stream);
uint64_t full_sm = get_buffer_size_full_sm_bootstrap_amortized<Torus>(
polynomial_size, glwe_dimension);
uint64_t partial_sm =
get_buffer_size_partial_sm_bootstrap_amortized<Torus>(polynomial_size);
if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) {
cudaFuncSetAttribute(device_bootstrap_amortized<Torus, params, PARTIALSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
partial_sm);
cudaFuncSetCacheConfig(device_bootstrap_amortized<Torus, params, PARTIALSM>,
cudaFuncCachePreferShared);
} else if (max_shared_memory >= partial_sm) {
check_cuda_error(cudaFuncSetAttribute(
device_bootstrap_amortized<Torus, params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm));
check_cuda_error(cudaFuncSetCacheConfig(
device_bootstrap_amortized<Torus, params, FULLSM>,
cudaFuncCachePreferShared));
}
if (allocate_gpu_memory) {
uint64_t buffer_size = get_buffer_size_bootstrap_amortized<Torus>(
glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
max_shared_memory);
*pbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream, gpu_index);
check_cuda_error(cudaGetLastError());
}
}
template <typename Torus, class params>
__host__ void host_bootstrap_amortized(
void *v_stream, uint32_t gpu_index, Torus *lwe_array_out, Torus *lut_vector,
Torus *lut_vector_indexes, Torus *lwe_array_in, double2 *bootstrapping_key,
int8_t *pbs_buffer, uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t num_lut_vectors,
uint32_t lwe_idx, uint32_t max_shared_memory) {
cudaSetDevice(gpu_index);
uint64_t SM_FULL = get_buffer_size_full_sm_bootstrap_amortized<Torus>(
polynomial_size, glwe_dimension);
uint64_t SM_PART =
get_buffer_size_partial_sm_bootstrap_amortized<Torus>(polynomial_size);
uint64_t DM_PART = SM_FULL - SM_PART;
uint64_t DM_FULL = SM_FULL;
auto stream = static_cast<cudaStream_t *>(v_stream);
// Create a 1-dimensional grid of threads
// where each block handles 1 sample and each thread
// handles opt polynomial coefficients
// (actually opt/2 coefficients since we compress the real polynomial into a
// complex)
dim3 grid(input_lwe_ciphertext_count, 1, 1);
dim3 thds(polynomial_size / params::opt, 1, 1);
// Launch the kernel using polynomial_size/opt threads
// where each thread computes opt polynomial coefficients
// Depending on the required amount of shared memory, choose
// from one of three templates (no use, partial use or full use
// of shared memory)
if (max_shared_memory < SM_PART) {
device_bootstrap_amortized<Torus, params, NOSM><<<grid, thds, 0, *stream>>>(
lwe_array_out, lut_vector, lut_vector_indexes, lwe_array_in,
bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, lwe_idx, DM_FULL);
} else if (max_shared_memory < SM_FULL) {
device_bootstrap_amortized<Torus, params, PARTIALSM>
<<<grid, thds, SM_PART, *stream>>>(
lwe_array_out, lut_vector, lut_vector_indexes, lwe_array_in,
bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, lwe_idx, DM_PART);
} else {
// For devices with compute capability 7.x a single thread block can
// address the full capacity of shared memory. Shared memory on the
// device then has to be allocated dynamically.
// For lower compute capabilities, this call
// just does nothing and the amount of shared memory used is 48 KB
device_bootstrap_amortized<Torus, params, FULLSM>
<<<grid, thds, SM_FULL, *stream>>>(
lwe_array_out, lut_vector, lut_vector_indexes, lwe_array_in,
bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, lwe_idx, 0);
}
check_cuda_error(cudaGetLastError());
}
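/*
 * Example launch geometry for the kernel calls above (assuming N = 1024 and
 * an illustrative params::opt = 8): grid = (input_lwe_ciphertext_count, 1, 1)
 * and thds = 1024 / 8 = 128 threads per block, each thread handling 8 torus
 * coefficients, i.e. 4 complex values once the real polynomial is compressed.
 */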
template <typename Torus, class params>
int cuda_get_pbs_per_gpu(int polynomial_size) {
int blocks_per_sm = 0;
int num_threads = polynomial_size / params::opt;
  cudaDeviceProp device_properties;
  cudaGetDeviceProperties(&device_properties, 0);
  // Query the occupancy of the FULLSM kernel variant on device 0
  cudaOccupancyMaxActiveBlocksPerMultiprocessor(
      &blocks_per_sm, device_bootstrap_amortized<Torus, params, FULLSM>,
      num_threads, 0);
return device_properties.multiProcessorCount * blocks_per_sm;
}
#endif // CNCRT_AMORTIZED_PBS_H

View File

@@ -1,452 +0,0 @@
#ifdef __CDT_PARSER__
#undef __CUDA_RUNTIME_H__
#include <cuda_runtime.h>
#endif
#ifndef LOWLAT_FAST_PBS_H
#define LOWLAT_FAST_PBS_H
#include "cooperative_groups.h"
#include "bootstrap.h"
#include "complex/operations.cuh"
#include "crypto/gadget.cuh"
#include "crypto/torus.cuh"
#include "device.h"
#include "fft/bnsmfft.cuh"
#include "fft/twiddles.cuh"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial.cuh"
#include "polynomial/polynomial_math.cuh"
#include "utils/timer.cuh"
// Cooperative groups are used in the low latency PBS
using namespace cooperative_groups;
namespace cg = cooperative_groups;
template <typename Torus, class params>
__device__ void mul_ggsw_glwe(Torus *accumulator, double2 *fft,
double2 *join_buffer, double2 *bootstrapping_key,
int polynomial_size, uint32_t glwe_dimension,
int level_count, int iteration,
grid_group &grid) {
// Switch to the FFT space
NSMFFT_direct<HalfDegree<params>>(fft);
synchronize_threads_in_block();
// Get the pieces of the bootstrapping key that will be needed for the
// external product; blockIdx.x is the ID of the block that's executing
// this function, so we end up getting the lines of the bootstrapping key
// needed to perform the external product in this block (corresponding to
// the same decomposition level)
auto bsk_slice = get_ith_mask_kth_block(
bootstrapping_key, iteration, blockIdx.y, blockIdx.x, polynomial_size,
glwe_dimension, level_count);
// Selects all GLWEs in a particular decomposition level
auto level_join_buffer =
join_buffer + blockIdx.x * (glwe_dimension + 1) * params::degree / 2;
// Perform the matrix multiplication between the GGSW and the GLWE,
// each block operating on a single level for mask and body
// The first product is used to initialize level_join_buffer
auto bsk_poly = bsk_slice + blockIdx.y * params::degree / 2;
auto buffer_slice = level_join_buffer + blockIdx.y * params::degree / 2;
int tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
buffer_slice[tid] = fft[tid] * bsk_poly[tid];
tid += params::degree / params::opt;
}
grid.sync();
// Continues multiplying fft by every polynomial in that particular bsk level
// Each y-block accumulates in a different polynomial at each iteration
for (int j = 1; j < (glwe_dimension + 1); j++) {
int idx = (j + blockIdx.y) % (glwe_dimension + 1);
auto bsk_poly = bsk_slice + idx * params::degree / 2;
auto buffer_slice = level_join_buffer + idx * params::degree / 2;
int tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
buffer_slice[tid] += fft[tid] * bsk_poly[tid];
tid += params::degree / params::opt;
}
grid.sync();
}
// -----------------------------------------------------------------
// All blocks are synchronized here; after this sync, level_join_buffer has
// the values needed from every other block
auto src_acc = join_buffer + blockIdx.y * params::degree / 2;
// copy first product into fft buffer
tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
fft[tid] = src_acc[tid];
tid += params::degree / params::opt;
}
synchronize_threads_in_block();
// accumulate rest of the products into fft buffer
for (int l = 1; l < gridDim.x; l++) {
auto cur_src_acc = &src_acc[l * (glwe_dimension + 1) * params::degree / 2];
tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
fft[tid] += cur_src_acc[tid];
tid += params::degree / params::opt;
}
}
synchronize_threads_in_block();
// Perform the inverse FFT on the result of the GGSW x GLWE and add to the
// accumulator
NSMFFT_inverse<HalfDegree<params>>(fft);
synchronize_threads_in_block();
add_to_torus<Torus, params>(fft, accumulator);
__syncthreads();
}
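/*
 * Worked example of the join buffer layout used above (illustrative
 * parameters: glwe_dimension k = 1, level_count = 2, N = params::degree =
 * 1024): the grid is (level_count, k + 1, num_samples) = (2, 2, num_samples),
 * and each sample owns level_count * (k + 1) * N / 2 = 2048 double2 values in
 * join_buffer. Each (blockIdx.x, blockIdx.y) pair writes its own N/2 slice
 * before grid.sync(), after which every block reads and accumulates all
 * level_count * (k + 1) slices into its fft buffer.
 */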
template <typename Torus, class params, sharedMemDegree SMD>
/*
* Kernel launched by the low latency version of the
* bootstrapping, that uses cooperative groups
*
 * - lwe_array_out: vector of output LWE ciphertexts, with length
 * (glwe_dimension * polynomial_size + 1) * num_samples
 * - lut_vector: vector of look up tables with
 * length (glwe_dimension + 1) * polynomial_size * num_samples
 * - lut_vector_indexes: mapping between lwe_array_in and lut_vector
 * - lwe_array_in: vector of LWE inputs with length (lwe_dimension + 1) *
 * num_samples
*
* Each y-block computes one element of the lwe_array_out.
*/
__global__ void device_bootstrap_fast_low_latency(
Torus *lwe_array_out, Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, double2 *bootstrapping_key, double2 *join_buffer,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, int8_t *device_mem,
uint64_t device_memory_size_per_block) {
grid_group grid = this_grid();
// We use shared memory for the polynomials that are used often during the
// bootstrap, since shared memory is kept in L1 cache and accessing it is
// much faster than global memory
extern __shared__ int8_t sharedmem[];
int8_t *selected_memory;
uint32_t glwe_dimension = gridDim.y - 1;
if constexpr (SMD == FULLSM) {
selected_memory = sharedmem;
} else {
int block_index = blockIdx.x + blockIdx.y * gridDim.x +
blockIdx.z * gridDim.x * gridDim.y;
selected_memory = &device_mem[block_index * device_memory_size_per_block];
}
// We always compute the pointer with most restrictive alignment to avoid
// alignment issues
double2 *accumulator_fft = (double2 *)selected_memory;
Torus *accumulator =
(Torus *)accumulator_fft +
(ptrdiff_t)(sizeof(double2) * polynomial_size / 2 / sizeof(Torus));
Torus *accumulator_rotated =
(Torus *)accumulator + (ptrdiff_t)polynomial_size;
if constexpr (SMD == PARTIALSM)
accumulator_fft = (double2 *)sharedmem;
// The third dimension of the block is used to determine on which ciphertext
// this block is operating, in the case of batch bootstraps
Torus *block_lwe_array_in = &lwe_array_in[blockIdx.z * (lwe_dimension + 1)];
Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] *
params::degree * (glwe_dimension + 1)];
double2 *block_join_buffer =
&join_buffer[blockIdx.z * level_count * (glwe_dimension + 1) *
params::degree / 2];
  // Since the space in L1 cache is small, the rotated accumulator and the fft
  // accumulator share the same memory location: the rotated array is no
  // longer in use by the time the fft is performed
// Put "b" in [0, 2N[
Torus b_hat = 0;
rescale_torus_element(block_lwe_array_in[lwe_dimension], b_hat,
2 * params::degree);
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
false);
for (int i = 0; i < lwe_dimension; i++) {
synchronize_threads_in_block();
// Put "a" in [0, 2N[
Torus a_hat = 0;
rescale_torus_element(block_lwe_array_in[i], a_hat,
2 * params::degree); // 2 * params::log2_degree + 1);
    // Perform ACC * (X^a_hat - 1)
multiply_by_monomial_negacyclic_and_sub_polynomial<
Torus, params::opt, params::degree / params::opt>(
accumulator, accumulator_rotated, a_hat);
// Perform a rounding to increase the accuracy of the
// bootstrapped ciphertext
round_to_closest_multiple_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator_rotated, base_log, level_count);
synchronize_threads_in_block();
// Decompose the accumulator. Each block gets one level of the
// decomposition, for the mask and the body (so block 0 will have the
// accumulator decomposed at level 0, 1 at 1, etc.)
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count,
accumulator_rotated);
gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
// We are using the same memory space for accumulator_fft and
// accumulator_rotated, so we need to synchronize here to make sure they
// don't modify the same memory space at the same time
synchronize_threads_in_block();
// Perform G^-1(ACC) * GGSW -> GLWE
mul_ggsw_glwe<Torus, params>(
accumulator, accumulator_fft, block_join_buffer, bootstrapping_key,
polynomial_size, glwe_dimension, level_count, i, grid);
synchronize_threads_in_block();
}
auto block_lwe_array_out =
&lwe_array_out[blockIdx.z * (glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
if (blockIdx.x == 0 && blockIdx.y < glwe_dimension) {
// Perform a sample extract. At this point, all blocks have the result, but
// we do the computation at block 0 to avoid waiting for extra blocks, in
// case they're not synchronized
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
} else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
}
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_bootstrap_fast_low_latency(uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size + // accumulator_rotated
sizeof(Torus) * polynomial_size + // accumulator
sizeof(double2) * polynomial_size / 2; // accumulator fft
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_partial_sm_bootstrap_fast_low_latency(
uint32_t polynomial_size) {
return sizeof(double2) * polynomial_size / 2; // accumulator fft mask & body
}
template <typename Torus>
__host__ __device__ uint64_t get_buffer_size_bootstrap_fast_low_latency(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
uint64_t full_sm = get_buffer_size_full_sm_bootstrap_fast_low_latency<Torus>(
polynomial_size);
uint64_t partial_sm =
get_buffer_size_partial_sm_bootstrap_fast_low_latency<Torus>(
polynomial_size);
uint64_t partial_dm = full_sm - partial_sm;
uint64_t full_dm = full_sm;
uint64_t device_mem = 0;
if (max_shared_memory < partial_sm) {
device_mem = full_dm * input_lwe_ciphertext_count * level_count *
(glwe_dimension + 1);
} else if (max_shared_memory < full_sm) {
device_mem = partial_dm * input_lwe_ciphertext_count * level_count *
(glwe_dimension + 1);
}
uint64_t buffer_size = device_mem + (glwe_dimension + 1) * level_count *
input_lwe_ciphertext_count *
polynomial_size / 2 * sizeof(double2);
return buffer_size + buffer_size % sizeof(double2);
}
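/*
 * Worked example for the sizes above, assuming Torus = uint64_t, N = 1024,
 * glwe_dimension k = 1 and level_count = 2:
 *   full_sm    = 8*1024 + 8*1024 + 16*512 = 24576 B (24 KB)
 *   partial_sm = 16*512                   =  8192 B  (8 KB)
 * and the join buffer adds (k + 1) * level_count * N/2 * 16 = 32768 B per
 * input ciphertext. When max_shared_memory >= 24576, no extra per-block
 * device scratch is needed and only the join buffer is allocated.
 */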
template <typename Torus, typename STorus, typename params>
__host__ void scratch_bootstrap_fast_low_latency(
void *v_stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
cudaSetDevice(gpu_index);
auto stream = static_cast<cudaStream_t *>(v_stream);
uint64_t full_sm = get_buffer_size_full_sm_bootstrap_fast_low_latency<Torus>(
polynomial_size);
uint64_t partial_sm =
get_buffer_size_partial_sm_bootstrap_fast_low_latency<Torus>(
polynomial_size);
if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) {
check_cuda_error(cudaFuncSetAttribute(
device_bootstrap_fast_low_latency<Torus, params, PARTIALSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
cudaFuncSetCacheConfig(
device_bootstrap_fast_low_latency<Torus, params, PARTIALSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else if (max_shared_memory >= partial_sm) {
check_cuda_error(cudaFuncSetAttribute(
device_bootstrap_fast_low_latency<Torus, params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm));
cudaFuncSetCacheConfig(
device_bootstrap_fast_low_latency<Torus, params, FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
}
if (allocate_gpu_memory) {
uint64_t buffer_size = get_buffer_size_bootstrap_fast_low_latency<Torus>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory);
*pbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream, gpu_index);
check_cuda_error(cudaGetLastError());
}
}
/*
* Host wrapper to the low latency version
* of bootstrapping
*/
template <typename Torus, class params>
__host__ void host_bootstrap_fast_low_latency(
void *v_stream, uint32_t gpu_index, Torus *lwe_array_out, Torus *lut_vector,
Torus *lut_vector_indexes, Torus *lwe_array_in, double2 *bootstrapping_key,
int8_t *pbs_buffer, uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t num_lut_vectors,
uint32_t max_shared_memory) {
cudaSetDevice(gpu_index);
auto stream = static_cast<cudaStream_t *>(v_stream);
// With SM each block corresponds to either the mask or body, no need to
// duplicate data for each
uint64_t full_sm = get_buffer_size_full_sm_bootstrap_fast_low_latency<Torus>(
polynomial_size);
uint64_t partial_sm =
get_buffer_size_partial_sm_bootstrap_fast_low_latency<Torus>(
polynomial_size);
uint64_t full_dm = full_sm;
uint64_t partial_dm = full_dm - partial_sm;
int8_t *d_mem = pbs_buffer;
double2 *buffer_fft =
(double2 *)d_mem +
(ptrdiff_t)(get_buffer_size_bootstrap_fast_low_latency<Torus>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory) /
sizeof(double2) -
(glwe_dimension + 1) * level_count *
input_lwe_ciphertext_count * polynomial_size / 2);
int thds = polynomial_size / params::opt;
dim3 grid(level_count, glwe_dimension + 1, input_lwe_ciphertext_count);
void *kernel_args[12];
kernel_args[0] = &lwe_array_out;
kernel_args[1] = &lut_vector;
kernel_args[2] = &lut_vector_indexes;
kernel_args[3] = &lwe_array_in;
kernel_args[4] = &bootstrapping_key;
kernel_args[5] = &buffer_fft;
kernel_args[6] = &lwe_dimension;
kernel_args[7] = &polynomial_size;
kernel_args[8] = &base_log;
kernel_args[9] = &level_count;
kernel_args[10] = &d_mem;
if (max_shared_memory < partial_sm) {
kernel_args[11] = &full_dm;
check_cuda_error(cudaLaunchCooperativeKernel(
(void *)device_bootstrap_fast_low_latency<Torus, params, NOSM>, grid,
thds, (void **)kernel_args, 0, *stream));
} else if (max_shared_memory < full_sm) {
kernel_args[11] = &partial_dm;
check_cuda_error(cudaLaunchCooperativeKernel(
(void *)device_bootstrap_fast_low_latency<Torus, params, PARTIALSM>,
grid, thds, (void **)kernel_args, partial_sm, *stream));
} else {
int no_dm = 0;
kernel_args[11] = &no_dm;
check_cuda_error(cudaLaunchCooperativeKernel(
(void *)device_bootstrap_fast_low_latency<Torus, params, FULLSM>, grid,
thds, (void **)kernel_args, full_sm, *stream));
}
check_cuda_error(cudaGetLastError());
}
// Verify if the grid size for the low latency kernel satisfies the cooperative
// group constraints
template <typename Torus, class params>
__host__ bool verify_cuda_bootstrap_fast_low_latency_grid_size(
int glwe_dimension, int level_count, int num_samples,
uint32_t max_shared_memory) {
// If Cooperative Groups is not supported, no need to check anything else
if (!cuda_check_support_cooperative_groups())
return false;
// Calculate the dimension of the kernel
uint64_t full_sm =
get_buffer_size_full_sm_bootstrap_fast_low_latency<Torus>(params::degree);
uint64_t partial_sm =
get_buffer_size_partial_sm_bootstrap_fast_low_latency<Torus>(
params::degree);
int thds = params::degree / params::opt;
// Get the maximum number of active blocks per streaming multiprocessors
int number_of_blocks = level_count * (glwe_dimension + 1) * num_samples;
int max_active_blocks_per_sm;
if (max_shared_memory < partial_sm) {
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_active_blocks_per_sm,
(void *)device_bootstrap_fast_low_latency<Torus, params, NOSM>, thds,
0);
} else if (max_shared_memory < full_sm) {
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_active_blocks_per_sm,
(void *)device_bootstrap_fast_low_latency<Torus, params, PARTIALSM>,
thds, 0);
} else {
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_active_blocks_per_sm,
(void *)device_bootstrap_fast_low_latency<Torus, params, FULLSM>, thds,
0);
}
// Get the number of streaming multiprocessors
int number_of_sm = 0;
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
return number_of_blocks <= max_active_blocks_per_sm * number_of_sm;
}
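/*
 * Worked example of the constraint above (hypothetical numbers): with
 * level_count = 2, glwe_dimension = 1 and num_samples = 100, the cooperative
 * launch needs 2 * 2 * 100 = 400 resident blocks. On a device with 108 SMs
 * and max_active_blocks_per_sm = 4 the budget is 432 blocks, so the fast
 * path is taken; with num_samples = 200 (800 blocks) it is not, and the
 * dispatching wrappers fall back to the non-fast kernel.
 */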
#endif // LOWLAT_FAST_PBS_H

View File

@@ -1,313 +0,0 @@
#ifndef FASTMULTIBIT_PBS_H
#define FASTMULTIBIT_PBS_H
#include "bootstrap.h"
#include "bootstrap_multibit.cuh"
#include "bootstrap_multibit.h"
#include "complex/operations.cuh"
#include "cooperative_groups.h"
#include "crypto/gadget.cuh"
#include "crypto/ggsw.cuh"
#include "crypto/torus.cuh"
#include "device.h"
#include "fft/bnsmfft.cuh"
#include "fft/twiddles.cuh"
#include "polynomial/functions.cuh"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial.cuh"
#include "polynomial/polynomial_math.cuh"
#include "utils/timer.cuh"
#include <vector>
template <typename Torus, class params>
__global__ void device_multi_bit_bootstrap_fast_accumulate(
Torus *lwe_array_out, Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, double2 *keybundle_array, double2 *join_buffer,
Torus *global_accumulator, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t grouping_factor, uint32_t lwe_offset, uint32_t lwe_chunk_size,
uint32_t keybundle_size_per_input) {
grid_group grid = this_grid();
// We use shared memory for the polynomials that are used often during the
// bootstrap, since shared memory is kept in L1 cache and accessing it is
// much faster than global memory
extern __shared__ int8_t sharedmem[];
int8_t *selected_memory;
selected_memory = sharedmem;
// We always compute the pointer with most restrictive alignment to avoid
// alignment issues
double2 *accumulator_fft = (double2 *)selected_memory;
Torus *accumulator =
(Torus *)accumulator_fft +
(ptrdiff_t)(sizeof(double2) * polynomial_size / 2 / sizeof(Torus));
// The third dimension of the block is used to determine on which ciphertext
// this block is operating, in the case of batch bootstraps
Torus *block_lwe_array_in = &lwe_array_in[blockIdx.z * (lwe_dimension + 1)];
Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] *
params::degree * (glwe_dimension + 1)];
double2 *block_join_buffer =
&join_buffer[blockIdx.z * level_count * (glwe_dimension + 1) *
params::degree / 2];
Torus *global_slice =
global_accumulator +
(blockIdx.y + blockIdx.z * (glwe_dimension + 1)) * params::degree;
double2 *keybundle = keybundle_array +
// select the input
blockIdx.z * keybundle_size_per_input;
if (lwe_offset == 0) {
// Put "b" in [0, 2N[
Torus b_hat = 0;
rescale_torus_element(block_lwe_array_in[lwe_dimension], b_hat,
2 * params::degree);
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
false);
} else {
// Load the accumulator calculated in previous iterations
copy_polynomial<Torus, params::opt, params::degree / params::opt>(
global_slice, accumulator);
}
for (int i = 0; (i + lwe_offset) < lwe_dimension && i < lwe_chunk_size; i++) {
// Decompose the accumulator. Each block gets one level of the
// decomposition, for the mask and the body (so block 0 will have the
// accumulator decomposed at level 0, 1 at 1, etc.)
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator);
gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
    // Synchronize to make sure the decomposition written into accumulator_fft
    // is complete before it is consumed by the external product
synchronize_threads_in_block();
// Perform G^-1(ACC) * GGSW -> GLWE
mul_ggsw_glwe<Torus, params>(accumulator, accumulator_fft,
block_join_buffer, keybundle, polynomial_size,
glwe_dimension, level_count, i, grid);
synchronize_threads_in_block();
}
if (lwe_offset + lwe_chunk_size >= (lwe_dimension / grouping_factor)) {
auto block_lwe_array_out =
&lwe_array_out[blockIdx.z * (glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
if (blockIdx.x == 0 && blockIdx.y < glwe_dimension) {
// Perform a sample extract. At this point, all blocks have the result,
// but we do the computation at block 0 to avoid waiting for extra blocks,
// in case they're not synchronized
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
} else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
}
} else {
      // Store the accumulator to global memory so that the next chunk
      // iteration can resume from it
copy_polynomial<Torus, params::opt, params::degree / params::opt>(
accumulator, global_slice);
}
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_fast_multibit_bootstrap(uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size * 2; // accumulator
}
template <typename Torus>
__host__ __device__ uint64_t get_buffer_size_fast_multibit_bootstrap(
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
uint32_t grouping_factor, uint32_t lwe_chunk_size,
uint32_t max_shared_memory) {
uint64_t buffer_size = 0;
buffer_size += input_lwe_ciphertext_count * lwe_chunk_size * level_count *
(glwe_dimension + 1) * (glwe_dimension + 1) *
(polynomial_size / 2) * sizeof(double2); // keybundle fft
buffer_size += input_lwe_ciphertext_count * (glwe_dimension + 1) *
level_count * (polynomial_size / 2) *
sizeof(double2); // join buffer
buffer_size += input_lwe_ciphertext_count * (glwe_dimension + 1) *
polynomial_size * sizeof(Torus); // global_accumulator
return buffer_size + buffer_size % sizeof(double2);
}
template <typename Torus, typename STorus, typename params>
__host__ void scratch_fast_multi_bit_pbs(
void *v_stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
uint32_t grouping_factor, uint32_t max_shared_memory,
bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0) {
cudaSetDevice(gpu_index);
auto stream = static_cast<cudaStream_t *>(v_stream);
uint64_t full_sm_keybundle =
get_buffer_size_full_sm_multibit_bootstrap_keybundle<Torus>(
polynomial_size);
uint64_t full_sm_accumulate =
get_buffer_size_full_sm_fast_multibit_bootstrap<Torus>(polynomial_size);
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_bootstrap_keybundle<Torus, params>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_keybundle));
cudaFuncSetCacheConfig(device_multi_bit_bootstrap_keybundle<Torus, params>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_bootstrap_fast_accumulate<Torus, params>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_accumulate));
cudaFuncSetCacheConfig(
device_multi_bit_bootstrap_fast_accumulate<Torus, params>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
if (allocate_gpu_memory) {
if (!lwe_chunk_size)
lwe_chunk_size =
get_lwe_chunk_size(lwe_dimension, level_count, glwe_dimension,
input_lwe_ciphertext_count);
uint64_t buffer_size = get_buffer_size_fast_multibit_bootstrap<Torus>(
lwe_dimension, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, grouping_factor, lwe_chunk_size,
max_shared_memory);
*pbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream, gpu_index);
check_cuda_error(cudaGetLastError());
}
}
template <typename Torus, typename STorus, class params>
__host__ void host_fast_multi_bit_pbs(
void *v_stream, uint32_t gpu_index, Torus *lwe_array_out, Torus *lut_vector,
Torus *lut_vector_indexes, Torus *lwe_array_in, uint64_t *bootstrapping_key,
int8_t *pbs_buffer, uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_lut_vectors,
uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0) {
cudaSetDevice(gpu_index);
auto stream = static_cast<cudaStream_t *>(v_stream);
if (!lwe_chunk_size)
lwe_chunk_size = get_lwe_chunk_size(lwe_dimension, level_count,
glwe_dimension, num_samples);
//
double2 *keybundle_fft = (double2 *)pbs_buffer;
double2 *buffer_fft = (double2 *)keybundle_fft +
num_samples * lwe_chunk_size * level_count *
(glwe_dimension + 1) * (glwe_dimension + 1) *
(polynomial_size / 2);
Torus *global_accumulator =
(Torus *)buffer_fft +
(ptrdiff_t)(sizeof(double2) * num_samples * (glwe_dimension + 1) *
level_count * (polynomial_size / 2) / sizeof(Torus));
//
uint64_t full_sm_keybundle =
get_buffer_size_full_sm_multibit_bootstrap_keybundle<Torus>(
polynomial_size);
uint64_t full_sm_accumulate =
get_buffer_size_full_sm_fast_multibit_bootstrap<Torus>(polynomial_size);
uint32_t keybundle_size_per_input =
lwe_chunk_size * level_count * (glwe_dimension + 1) *
(glwe_dimension + 1) * (polynomial_size / 2);
//
void *kernel_args[16];
kernel_args[0] = &lwe_array_out;
kernel_args[1] = &lut_vector;
kernel_args[2] = &lut_vector_indexes;
kernel_args[3] = &lwe_array_in;
kernel_args[4] = &keybundle_fft;
kernel_args[5] = &buffer_fft;
kernel_args[6] = &global_accumulator;
kernel_args[7] = &lwe_dimension;
kernel_args[8] = &glwe_dimension;
kernel_args[9] = &polynomial_size;
kernel_args[10] = &base_log;
kernel_args[11] = &level_count;
kernel_args[12] = &grouping_factor;
kernel_args[15] = &keybundle_size_per_input;
//
dim3 grid_accumulate(level_count, glwe_dimension + 1, num_samples);
dim3 thds(polynomial_size / params::opt, 1, 1);
for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
lwe_offset += lwe_chunk_size) {
uint32_t chunk_size = std::min(
lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
// Compute a keybundle
dim3 grid_keybundle(num_samples * chunk_size,
(glwe_dimension + 1) * (glwe_dimension + 1),
level_count);
device_multi_bit_bootstrap_keybundle<Torus, params>
<<<grid_keybundle, thds, full_sm_keybundle, *stream>>>(
lwe_array_in, keybundle_fft, bootstrapping_key, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, base_log,
level_count, lwe_offset, chunk_size, keybundle_size_per_input);
check_cuda_error(cudaGetLastError());
kernel_args[13] = &lwe_offset;
kernel_args[14] = &chunk_size;
check_cuda_error(cudaLaunchCooperativeKernel(
(void *)device_multi_bit_bootstrap_fast_accumulate<Torus, params>,
grid_accumulate, thds, (void **)kernel_args, full_sm_accumulate,
*stream));
}
}
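/*
 * Worked example of the chunked loop above (illustrative numbers): with
 * lwe_dimension = 768 and grouping_factor = 2 there are 768 / 2 = 384
 * keybundle positions; if get_lwe_chunk_size returns lwe_chunk_size = 100,
 * the loop runs with lwe_offset = 0, 100, 200, 300 and chunk_size =
 * 100, 100, 100, 84, launching one keybundle kernel and one cooperative
 * accumulation kernel per chunk.
 */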
// Verify if the grid size for the multi-bit kernel satisfies the cooperative
// group constraints
template <typename Torus, class params>
__host__ bool
verify_cuda_bootstrap_fast_multi_bit_grid_size(int glwe_dimension,
int level_count, int num_samples,
uint32_t max_shared_memory) {
// If Cooperative Groups is not supported, no need to check anything else
if (!cuda_check_support_cooperative_groups())
return false;
// Calculate the dimension of the kernel
uint64_t full_sm =
get_buffer_size_full_sm_fast_multibit_bootstrap<Torus>(params::degree);
int thds = params::degree / params::opt;
// Get the maximum number of active blocks per streaming multiprocessors
int number_of_blocks = level_count * (glwe_dimension + 1) * num_samples;
int max_active_blocks_per_sm;
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_active_blocks_per_sm,
(void *)device_multi_bit_bootstrap_fast_accumulate<Torus, params>, thds,
full_sm);
// Get the number of streaming multiprocessors
int number_of_sm = 0;
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
return number_of_blocks <= max_active_blocks_per_sm * number_of_sm;
}
#endif // FASTMULTIBIT_PBS_H

View File

@@ -1,773 +0,0 @@
#include <err.h>
#include "bootstrap_fast_low_latency.cuh"
#include "bootstrap_low_latency.cuh"
/*
* Returns the buffer size for 64 bits executions
*/
uint64_t get_buffer_size_bootstrap_low_latency_64(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
switch (polynomial_size) {
case 256:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
AmortizedDegree<256>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory))
return get_buffer_size_bootstrap_fast_low_latency<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory);
else
return get_buffer_size_bootstrap_low_latency<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory);
break;
case 512:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
AmortizedDegree<512>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory))
return get_buffer_size_bootstrap_fast_low_latency<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory);
else
return get_buffer_size_bootstrap_low_latency<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory);
break;
case 1024:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
AmortizedDegree<1024>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory))
return get_buffer_size_bootstrap_fast_low_latency<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory);
else
return get_buffer_size_bootstrap_low_latency<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory);
break;
case 2048:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
AmortizedDegree<2048>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory))
return get_buffer_size_bootstrap_fast_low_latency<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory);
else
return get_buffer_size_bootstrap_low_latency<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory);
break;
case 4096:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
AmortizedDegree<4096>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory))
return get_buffer_size_bootstrap_fast_low_latency<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory);
else
return get_buffer_size_bootstrap_low_latency<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory);
break;
case 8192:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
AmortizedDegree<8192>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory))
return get_buffer_size_bootstrap_fast_low_latency<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory);
else
return get_buffer_size_bootstrap_low_latency<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory);
break;
case 16384:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<
uint64_t, AmortizedDegree<16384>>(glwe_dimension, level_count,
input_lwe_ciphertext_count,
max_shared_memory))
return get_buffer_size_bootstrap_fast_low_latency<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory);
else
return get_buffer_size_bootstrap_low_latency<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory);
break;
default:
errx(EXIT_FAILURE, "polynomial size %u is not supported. Supported values "
"are: 256, 512, 1024, 2048, 4096, 8192, 16384.", polynomial_size);
return 0;
break;
}
}
/*
* Runs standard checks to validate the inputs
*/
void checks_fast_bootstrap_low_latency(int glwe_dimension, int level_count,
int polynomial_size, int num_samples) {
assert((
"Error (GPU low latency PBS): polynomial size should be one of 256, 512, "
"1024, 2048, 4096, 8192, 16384",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192 ||
polynomial_size == 16384));
}
/*
* Runs standard checks to validate the inputs
*/
void checks_bootstrap_low_latency(int nbits, int glwe_dimension,
int level_count, int base_log,
int polynomial_size, int num_samples) {
assert(("Error (GPU low latency PBS): base log should be <= nbits",
base_log <= nbits));
checks_fast_bootstrap_low_latency(glwe_dimension, level_count,
polynomial_size, num_samples);
}
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the low latency PBS on 32 bits inputs, into `pbs_buffer`. It also
* configures SM options on the GPU in case FULLSM or PARTIALSM mode is going to
* be used.
*/
void scratch_cuda_bootstrap_low_latency_32(
void *v_stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
checks_fast_bootstrap_low_latency(
glwe_dimension, level_count, polynomial_size, input_lwe_ciphertext_count);
switch (polynomial_size) {
case 256:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint32_t,
AmortizedDegree<256>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory))
scratch_bootstrap_fast_low_latency<uint32_t, int32_t,
AmortizedDegree<256>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, max_shared_memory,
allocate_gpu_memory);
else
scratch_bootstrap_low_latency<uint32_t, int32_t, Degree<256>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, max_shared_memory,
allocate_gpu_memory);
break;
case 512:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint32_t,
AmortizedDegree<512>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory))
scratch_bootstrap_fast_low_latency<uint32_t, int32_t,
AmortizedDegree<512>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, max_shared_memory,
allocate_gpu_memory);
else
scratch_bootstrap_low_latency<uint32_t, int32_t, Degree<512>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, max_shared_memory,
allocate_gpu_memory);
    break;
  case 1024:
    if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint32_t,
                                                         AmortizedDegree<1024>>(
            glwe_dimension, level_count, input_lwe_ciphertext_count,
            max_shared_memory))
      scratch_bootstrap_fast_low_latency<uint32_t, int32_t,
                                         AmortizedDegree<1024>>(
          v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
          level_count, input_lwe_ciphertext_count, max_shared_memory,
          allocate_gpu_memory);
    else
      scratch_bootstrap_low_latency<uint32_t, int32_t, Degree<1024>>(
          v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
          level_count, input_lwe_ciphertext_count, max_shared_memory,
          allocate_gpu_memory);
    break;
  case 2048:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint32_t,
AmortizedDegree<2048>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory))
scratch_bootstrap_fast_low_latency<uint32_t, int32_t,
AmortizedDegree<2048>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, max_shared_memory,
allocate_gpu_memory);
else
scratch_bootstrap_low_latency<uint32_t, int32_t, Degree<2048>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, max_shared_memory,
allocate_gpu_memory);
break;
case 4096:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint32_t,
AmortizedDegree<4096>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory))
scratch_bootstrap_fast_low_latency<uint32_t, int32_t,
AmortizedDegree<4096>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, max_shared_memory,
allocate_gpu_memory);
else
scratch_bootstrap_low_latency<uint32_t, int32_t, Degree<4096>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, max_shared_memory,
allocate_gpu_memory);
break;
case 8192:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint32_t,
AmortizedDegree<8192>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory))
scratch_bootstrap_fast_low_latency<uint32_t, int32_t,
AmortizedDegree<8192>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, max_shared_memory,
allocate_gpu_memory);
else
scratch_bootstrap_low_latency<uint32_t, int32_t, Degree<8192>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, max_shared_memory,
allocate_gpu_memory);
break;
case 16384:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<
uint32_t, AmortizedDegree<16384>>(glwe_dimension, level_count,
input_lwe_ciphertext_count,
max_shared_memory))
scratch_bootstrap_fast_low_latency<uint32_t, int32_t,
AmortizedDegree<16384>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, max_shared_memory,
allocate_gpu_memory);
else
scratch_bootstrap_low_latency<uint32_t, int32_t, Degree<16384>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, max_shared_memory,
allocate_gpu_memory);
break;
default:
errx(EXIT_FAILURE, "polynomial size %u is not supported. Supported values "
"are: 256, 512, 1024, 2048, 4096, 8192, 16384.", polynomial_size);
break;
}
}
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the low_latency PBS on 64 bits inputs, into `pbs_buffer`. It also
* configures SM options on the GPU in case FULLSM or PARTIALSM mode is going to
* be used.
*/
void scratch_cuda_bootstrap_low_latency_64(
void *v_stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
checks_fast_bootstrap_low_latency(
glwe_dimension, level_count, polynomial_size, input_lwe_ciphertext_count);
switch (polynomial_size) {
case 256:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
AmortizedDegree<256>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory))
scratch_bootstrap_fast_low_latency<uint64_t, int64_t,
AmortizedDegree<256>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, max_shared_memory,
allocate_gpu_memory);
else
scratch_bootstrap_low_latency<uint64_t, int64_t, Degree<256>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, max_shared_memory,
allocate_gpu_memory);
break;
case 512:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
AmortizedDegree<512>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory))
scratch_bootstrap_fast_low_latency<uint64_t, int64_t,
AmortizedDegree<512>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, max_shared_memory,
allocate_gpu_memory);
else
scratch_bootstrap_low_latency<uint64_t, int64_t, Degree<512>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, max_shared_memory,
allocate_gpu_memory);
break;
case 1024:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
AmortizedDegree<1024>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory))
scratch_bootstrap_fast_low_latency<uint64_t, int64_t,
AmortizedDegree<1024>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, max_shared_memory,
allocate_gpu_memory);
else
scratch_bootstrap_low_latency<uint64_t, int64_t, Degree<1024>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, max_shared_memory,
allocate_gpu_memory);
break;
case 2048:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
AmortizedDegree<2048>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory))
scratch_bootstrap_fast_low_latency<uint64_t, int64_t,
AmortizedDegree<2048>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, max_shared_memory,
allocate_gpu_memory);
else
scratch_bootstrap_low_latency<uint64_t, int64_t, Degree<2048>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, max_shared_memory,
allocate_gpu_memory);
break;
case 4096:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
AmortizedDegree<4096>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory))
scratch_bootstrap_fast_low_latency<uint64_t, int64_t,
AmortizedDegree<4096>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, max_shared_memory,
allocate_gpu_memory);
else
scratch_bootstrap_low_latency<uint64_t, int64_t, Degree<4096>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, max_shared_memory,
allocate_gpu_memory);
break;
case 8192:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
AmortizedDegree<8192>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory))
scratch_bootstrap_fast_low_latency<uint64_t, int64_t,
AmortizedDegree<8192>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, max_shared_memory,
allocate_gpu_memory);
else
scratch_bootstrap_low_latency<uint64_t, int64_t, Degree<8192>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, max_shared_memory,
allocate_gpu_memory);
break;
case 16384:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<
uint64_t, AmortizedDegree<16384>>(glwe_dimension, level_count,
input_lwe_ciphertext_count,
max_shared_memory))
scratch_bootstrap_fast_low_latency<uint64_t, int64_t,
AmortizedDegree<16384>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, max_shared_memory,
allocate_gpu_memory);
else
scratch_bootstrap_low_latency<uint64_t, int64_t, Degree<16384>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, max_shared_memory,
allocate_gpu_memory);
break;
default:
errx(EXIT_FAILURE, "polynomial size %u is not supported. Supported values "
"are: 256, 512, 1024, 2048, 4096, 8192, 16384.", polynomial_size);
break;
}
}
/* Perform bootstrapping on a batch of input u32 LWE ciphertexts.
* This function performs best for small numbers of inputs. Beyond a certain
* number of inputs (the exact number depends on the cryptographic parameters),
* the kernel cannot be launched and it is necessary to split the kernel call
* into several calls on smaller batches of inputs. For more details on this
 * operation, refer to the equivalent u64 operation.
*/
void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lut_vector,
void *lut_vector_indexes, void *lwe_array_in, void *bootstrapping_key,
int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
uint32_t max_shared_memory) {
checks_bootstrap_low_latency(32, glwe_dimension, level_count, base_log,
polynomial_size, num_samples);
switch (polynomial_size) {
case 256:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint32_t,
AmortizedDegree<256>>(
glwe_dimension, level_count, num_samples, max_shared_memory))
host_bootstrap_fast_low_latency<uint32_t, AmortizedDegree<256>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out,
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
(uint32_t *)lwe_array_in, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_lut_vectors, max_shared_memory);
else
host_bootstrap_low_latency<uint32_t, Degree<256>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out,
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
(uint32_t *)lwe_array_in, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_lut_vectors, max_shared_memory);
break;
case 512:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint32_t,
AmortizedDegree<512>>(
glwe_dimension, level_count, num_samples, max_shared_memory))
host_bootstrap_fast_low_latency<uint32_t, AmortizedDegree<512>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out,
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
(uint32_t *)lwe_array_in, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_lut_vectors, max_shared_memory);
else
host_bootstrap_low_latency<uint32_t, Degree<512>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out,
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
(uint32_t *)lwe_array_in, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_lut_vectors, max_shared_memory);
break;
case 1024:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint32_t,
AmortizedDegree<1024>>(
glwe_dimension, level_count, num_samples, max_shared_memory))
host_bootstrap_fast_low_latency<uint32_t, AmortizedDegree<1024>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out,
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
(uint32_t *)lwe_array_in, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_lut_vectors, max_shared_memory);
else
host_bootstrap_low_latency<uint32_t, Degree<1024>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out,
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
(uint32_t *)lwe_array_in, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_lut_vectors, max_shared_memory);
break;
case 2048:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint32_t,
AmortizedDegree<2048>>(
glwe_dimension, level_count, num_samples, max_shared_memory))
host_bootstrap_fast_low_latency<uint32_t, AmortizedDegree<2048>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out,
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
(uint32_t *)lwe_array_in, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_lut_vectors, max_shared_memory);
else
host_bootstrap_low_latency<uint32_t, Degree<2048>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out,
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
(uint32_t *)lwe_array_in, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_lut_vectors, max_shared_memory);
break;
case 4096:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint32_t,
AmortizedDegree<4096>>(
glwe_dimension, level_count, num_samples, max_shared_memory))
host_bootstrap_fast_low_latency<uint32_t, AmortizedDegree<4096>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out,
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
(uint32_t *)lwe_array_in, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_lut_vectors, max_shared_memory);
else
host_bootstrap_low_latency<uint32_t, Degree<4096>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out,
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
(uint32_t *)lwe_array_in, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_lut_vectors, max_shared_memory);
break;
case 8192:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint32_t,
AmortizedDegree<8192>>(
glwe_dimension, level_count, num_samples, max_shared_memory))
host_bootstrap_fast_low_latency<uint32_t, AmortizedDegree<8192>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out,
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
(uint32_t *)lwe_array_in, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_lut_vectors, max_shared_memory);
else
host_bootstrap_low_latency<uint32_t, Degree<8192>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out,
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
(uint32_t *)lwe_array_in, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_lut_vectors, max_shared_memory);
break;
case 16384:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<
uint32_t, AmortizedDegree<16384>>(glwe_dimension, level_count,
num_samples, max_shared_memory))
host_bootstrap_fast_low_latency<uint32_t, AmortizedDegree<16384>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out,
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
(uint32_t *)lwe_array_in, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_lut_vectors, max_shared_memory);
else
host_bootstrap_low_latency<uint32_t, Degree<16384>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out,
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
(uint32_t *)lwe_array_in, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_lut_vectors, max_shared_memory);
break;
default:
errx(EXIT_FAILURE, "polynomial size %u is not supported. Supported values "
"are: 256, 512, 1024, 2048, 4096, 8192, 16384.", polynomial_size);
break;
}
}
/* Perform bootstrapping on a batch of input u64 LWE ciphertexts.
* This function performs best for small numbers of inputs. Beyond a certain
* number of inputs (the exact number depends on the cryptographic parameters),
* the kernel cannot be launched and it is necessary to split the kernel call
* into several calls on smaller batches of inputs.
*
* - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
* launch
* - `gpu_index` is the index of the GPU to be used in the kernel launch
* - lwe_array_out: output batch of num_samples bootstrapped ciphertexts c =
* (a0,..an-1,b) where n is the LWE dimension
* - lut_vector: should hold as many test vectors of size polynomial_size
* as there are input ciphertexts, but actually holds
* num_lut_vectors vectors to reduce memory usage
* - lut_vector_indexes: stores the index corresponding to
* which test vector to use for each sample in
* lut_vector
* - lwe_array_in: input batch of num_samples LWE ciphertexts, containing n
* mask values + 1 body value
* - bootstrapping_key: GGSW encryption of the LWE secret key sk1
* under secret key sk2
* bsk = Z + sk1 H
 *                       where H is the gadget matrix and Z is a matrix with
 *                       (k+1)*l rows containing GLWE encryptions of 0 under sk2.
 *                       bsk is thus a tensor of size (k+1)^2 * l * N * n,
* where l is the number of decomposition levels and
* k is the GLWE dimension, N is the polynomial size for
* GLWE. The polynomial size for GLWE and the test vector
* are the same because they have to be in the same ring
* to be multiplied.
* - lwe_dimension: size of the Torus vector used to encrypt the input
* LWE ciphertexts - referred to as n above (~ 600)
* - glwe_dimension: size of the polynomial vector used to encrypt the LUT
* GLWE ciphertexts - referred to as k above. Only the value 1 is supported for
* this parameter.
* - polynomial_size: size of the test polynomial (test vector) and size of the
* GLWE polynomial (~1024)
* - base_log: log base used for the gadget matrix - B = 2^base_log (~8)
* - level_count: number of decomposition levels in the gadget matrix (~4)
* - num_samples: number of encrypted input messages
* - num_lut_vectors: parameter to set the actual number of test vectors to be
* used
 * - lwe_idx: the index of the LWE input to consider for the GPU of index
 * gpu_index. In multi-GPU computing, it is assumed that only a part of the
 * input LWE array is copied to each GPU, while the whole LUT array is copied
 * (handling the case where the number of LUTs is smaller than the number of
 * input LWEs is not trivial when distributing the data across the GPUs).
 * `lwe_idx` is used to determine which LUT of `lut_vector` to use for a given
 * LWE input.
 * - `max_shared_memory`: maximum amount of shared memory to be used inside
 * device functions
*
* This function calls a wrapper to a device kernel that performs the
* bootstrapping:
* - the kernel is templatized based on integer discretization and
* polynomial degree
* - num_samples * level_count * (glwe_dimension + 1) blocks of threads are
* launched, where each thread is going to handle one or more polynomial
* coefficients at each stage, for a given level of decomposition, either for
* the LUT mask or its body:
* - perform the blind rotation
* - round the result
* - get the decomposition for the current level
* - switch to the FFT domain
* - multiply with the bootstrapping key
* - come back to the coefficients representation
* - between each stage a synchronization of the threads is necessary (some
* synchronizations happen at the block level, some happen between blocks, using
* cooperative groups).
* - in case the device has enough shared memory, temporary arrays used for
* the different stages (accumulators) are stored into the shared memory
* - the accumulators serve to combine the results for all decomposition
* levels
 *   - the constant memory (64K) is used for storing the roots of unity
 *     values for the FFT
*/
void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lut_vector,
void *lut_vector_indexes, void *lwe_array_in, void *bootstrapping_key,
int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
uint32_t max_shared_memory) {
checks_bootstrap_low_latency(64, glwe_dimension, level_count, base_log,
polynomial_size, num_samples);
switch (polynomial_size) {
case 256:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
AmortizedDegree<256>>(
glwe_dimension, level_count, num_samples, max_shared_memory))
host_bootstrap_fast_low_latency<uint64_t, AmortizedDegree<256>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_lut_vectors, max_shared_memory);
else
host_bootstrap_low_latency<uint64_t, Degree<256>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_lut_vectors, max_shared_memory);
break;
case 512:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
AmortizedDegree<512>>(
glwe_dimension, level_count, num_samples, max_shared_memory))
host_bootstrap_fast_low_latency<uint64_t, AmortizedDegree<512>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_lut_vectors, max_shared_memory);
else
host_bootstrap_low_latency<uint64_t, Degree<512>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_lut_vectors, max_shared_memory);
break;
case 1024:
  if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
                                                       AmortizedDegree<1024>>(
glwe_dimension, level_count, num_samples, max_shared_memory))
host_bootstrap_fast_low_latency<uint64_t, AmortizedDegree<1024>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_lut_vectors, max_shared_memory);
else
host_bootstrap_low_latency<uint64_t, Degree<1024>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_lut_vectors, max_shared_memory);
break;
case 2048:
  if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
                                                       AmortizedDegree<2048>>(
glwe_dimension, level_count, num_samples, max_shared_memory))
host_bootstrap_fast_low_latency<uint64_t, AmortizedDegree<2048>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_lut_vectors, max_shared_memory);
else
host_bootstrap_low_latency<uint64_t, Degree<2048>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_lut_vectors, max_shared_memory);
break;
case 4096:
  if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
                                                       AmortizedDegree<4096>>(
glwe_dimension, level_count, num_samples, max_shared_memory))
host_bootstrap_fast_low_latency<uint64_t, AmortizedDegree<4096>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_lut_vectors, max_shared_memory);
else
host_bootstrap_low_latency<uint64_t, Degree<4096>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_lut_vectors, max_shared_memory);
break;
case 8192:
  if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
                                                       AmortizedDegree<8192>>(
glwe_dimension, level_count, num_samples, max_shared_memory))
host_bootstrap_fast_low_latency<uint64_t, AmortizedDegree<8192>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_lut_vectors, max_shared_memory);
else
host_bootstrap_low_latency<uint64_t, Degree<8192>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_lut_vectors, max_shared_memory);
break;
case 16384:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<
uint64_t, AmortizedDegree<16384>>(glwe_dimension, level_count,
num_samples, max_shared_memory))
host_bootstrap_fast_low_latency<uint64_t, AmortizedDegree<16384>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_lut_vectors, max_shared_memory);
else
host_bootstrap_low_latency<uint64_t, Degree<16384>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_lut_vectors, max_shared_memory);
break;
default:
errx(EXIT_FAILURE, "polynomial size %u is not supported. Supported values "
"are: 256, 512, 1024, 2048, 4096, 8192, 16384.", polynomial_size);
break;
}
}
/*
* This cleanup function frees the data for the low latency PBS on GPU in
* pbs_buffer for 32 or 64 bits inputs.
*/
void cleanup_cuda_bootstrap_low_latency(void *v_stream, uint32_t gpu_index,
int8_t **pbs_buffer) {
auto stream = static_cast<cudaStream_t *>(v_stream);
// Free memory
cuda_drop_async(*pbs_buffer, stream, gpu_index);
}
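/*
 * Illustrative usage sketch: the low latency PBS entry points above are driven
 * as scratch -> bootstrap -> cleanup on a single CUDA stream. The device
 * arrays (names prefixed with d_, including the bootstrapping key already
 * converted to the Fourier domain) and the parameter values are placeholders,
 * assumed to be allocated and filled by the caller.
 *
 *   cudaStream_t stream;
 *   cudaStreamCreate(&stream);
 *   uint32_t gpu_index = 0;
 *   int8_t *pbs_buffer = nullptr;
 *   // Allocate the temporary buffer and configure the kernel attributes once.
 *   scratch_cuda_bootstrap_low_latency_64(
 *       &stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
 *       level_count, num_samples, max_shared_memory, true);
 *   // Bootstrap the whole batch of num_samples input LWE ciphertexts.
 *   cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
 *       &stream, gpu_index, d_lwe_array_out, d_lut_vector,
 *       d_lut_vector_indexes, d_lwe_array_in, d_fourier_bsk, pbs_buffer,
 *       lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count,
 *       num_samples, num_lut_vectors, 0, max_shared_memory);
 *   // Release the temporary buffer (asynchronously, on the same stream).
 *   cleanup_cuda_bootstrap_low_latency(&stream, gpu_index, &pbs_buffer);
 *   cudaStreamSynchronize(stream);
 */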

View File

@@ -1,490 +0,0 @@
#ifdef __CDT_PARSER__
#undef __CUDA_RUNTIME_H__
#include <cuda_runtime.h>
#endif
#ifndef LOWLAT_PBS_H
#define LOWLAT_PBS_H
#include "bootstrap.h"
#include "complex/operations.cuh"
#include "crypto/gadget.cuh"
#include "crypto/torus.cuh"
#include "device.h"
#include "fft/bnsmfft.cuh"
#include "fft/twiddles.cuh"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial.cuh"
#include "polynomial/polynomial_math.cuh"
#include "utils/timer.cuh"
template <typename Torus, class params, sharedMemDegree SMD>
__global__ void device_bootstrap_low_latency_step_one(
Torus *lwe_array_out, Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, double2 *bootstrapping_key, Torus *global_accumulator,
double2 *global_accumulator_fft, uint32_t lwe_iteration,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, int8_t *device_mem,
uint64_t device_memory_size_per_block) {
// We use shared memory for the polynomials that are used often during the
// bootstrap, since shared memory is kept in L1 cache and accessing it is
// much faster than global memory
extern __shared__ int8_t sharedmem[];
int8_t *selected_memory;
uint32_t glwe_dimension = gridDim.y - 1;
if constexpr (SMD == FULLSM) {
selected_memory = sharedmem;
} else {
int block_index = blockIdx.x + blockIdx.y * gridDim.x +
blockIdx.z * gridDim.x * gridDim.y;
selected_memory = &device_mem[block_index * device_memory_size_per_block];
}
Torus *accumulator = (Torus *)selected_memory;
double2 *accumulator_fft =
(double2 *)accumulator +
(ptrdiff_t)(sizeof(Torus) * polynomial_size / sizeof(double2));
if constexpr (SMD == PARTIALSM)
accumulator_fft = (double2 *)sharedmem;
  // The third dimension of the grid determines which ciphertext this block is
  // operating on, in the case of batch bootstraps
Torus *block_lwe_array_in = &lwe_array_in[blockIdx.z * (lwe_dimension + 1)];
Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] *
params::degree * (glwe_dimension + 1)];
Torus *global_slice =
global_accumulator +
(blockIdx.y + blockIdx.z * (glwe_dimension + 1)) * params::degree;
double2 *global_fft_slice =
global_accumulator_fft +
(blockIdx.y + blockIdx.x * (glwe_dimension + 1) +
blockIdx.z * level_count * (glwe_dimension + 1)) *
(polynomial_size / 2);
if (lwe_iteration == 0) {
// First iteration
// Put "b" in [0, 2N[
Torus b_hat = 0;
rescale_torus_element(block_lwe_array_in[lwe_dimension], b_hat,
2 * params::degree);
// The y-dimension is used to select the element of the GLWE this block will
// compute
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
false);
// Persist
int tid = threadIdx.x;
for (int i = 0; i < params::opt; i++) {
global_slice[tid] = accumulator[tid];
tid += params::degree / params::opt;
}
}
// Put "a" in [0, 2N[
Torus a_hat = 0;
rescale_torus_element(block_lwe_array_in[lwe_iteration], a_hat,
2 * params::degree); // 2 * params::log2_degree + 1);
synchronize_threads_in_block();
  // Perform ACC * (X^a_hat - 1)
multiply_by_monomial_negacyclic_and_sub_polynomial<
Torus, params::opt, params::degree / params::opt>(global_slice,
accumulator, a_hat);
// Perform a rounding to increase the accuracy of the
// bootstrapped ciphertext
round_to_closest_multiple_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, base_log, level_count);
synchronize_threads_in_block();
// Decompose the accumulator. Each block gets one level of the
// decomposition, for the mask and the body (so block 0 will have the
// accumulator decomposed at level 0, 1 at 1, etc.)
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator);
gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
// We are using the same memory space for accumulator_fft and
// accumulator_rotated, so we need to synchronize here to make sure they
// don't modify the same memory space at the same time
// Switch to the FFT space
NSMFFT_direct<HalfDegree<params>>(accumulator_fft);
int tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
global_fft_slice[tid] = accumulator_fft[tid];
tid += params::degree / params::opt;
}
}
template <typename Torus, class params, sharedMemDegree SMD>
__global__ void device_bootstrap_low_latency_step_two(
Torus *lwe_array_out, Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, double2 *bootstrapping_key, Torus *global_accumulator,
double2 *global_accumulator_fft, uint32_t lwe_iteration,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, int8_t *device_mem,
uint64_t device_memory_size_per_block) {
// We use shared memory for the polynomials that are used often during the
// bootstrap, since shared memory is kept in L1 cache and accessing it is
// much faster than global memory
extern __shared__ int8_t sharedmem[];
int8_t *selected_memory;
uint32_t glwe_dimension = gridDim.y - 1;
if constexpr (SMD == FULLSM) {
selected_memory = sharedmem;
} else {
int block_index = blockIdx.x + blockIdx.y * gridDim.x +
blockIdx.z * gridDim.x * gridDim.y;
selected_memory = &device_mem[block_index * device_memory_size_per_block];
}
  // We compute the pointer with the most restrictive alignment (double2)
  // first, to avoid alignment issues
double2 *accumulator_fft = (double2 *)selected_memory;
Torus *accumulator =
(Torus *)accumulator_fft +
(ptrdiff_t)(sizeof(double2) * params::degree / 2 / sizeof(Torus));
if constexpr (SMD == PARTIALSM)
accumulator_fft = (double2 *)sharedmem;
for (int level = 0; level < level_count; level++) {
double2 *global_fft_slice = global_accumulator_fft +
(level + blockIdx.x * level_count) *
(glwe_dimension + 1) * (params::degree / 2);
for (int j = 0; j < (glwe_dimension + 1); j++) {
double2 *fft = global_fft_slice + j * params::degree / 2;
// Get the bootstrapping key piece necessary for the multiplication
// It is already in the Fourier domain
auto bsk_slice =
get_ith_mask_kth_block(bootstrapping_key, lwe_iteration, j, level,
polynomial_size, glwe_dimension, level_count);
auto bsk_poly = bsk_slice + blockIdx.y * params::degree / 2;
polynomial_product_accumulate_in_fourier_domain<params, double2>(
accumulator_fft, fft, bsk_poly, !level && !j);
}
}
Torus *global_slice =
global_accumulator +
(blockIdx.y + blockIdx.x * (glwe_dimension + 1)) * params::degree;
// Load the persisted accumulator
int tid = threadIdx.x;
for (int i = 0; i < params::opt; i++) {
accumulator[tid] = global_slice[tid];
tid += params::degree / params::opt;
}
// Perform the inverse FFT on the result of the GGSW x GLWE and add to the
// accumulator
NSMFFT_inverse<HalfDegree<params>>(accumulator_fft);
add_to_torus<Torus, params>(accumulator_fft, accumulator);
if (lwe_iteration + 1 == lwe_dimension) {
// Last iteration
auto block_lwe_array_out =
&lwe_array_out[blockIdx.x * (glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
if (blockIdx.y < glwe_dimension) {
// Perform a sample extract. At this point, all blocks have the result,
// but we do the computation at block 0 to avoid waiting for extra blocks,
// in case they're not synchronized
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
} else if (blockIdx.y == glwe_dimension) {
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
}
} else {
// Persist the updated accumulator
tid = threadIdx.x;
for (int i = 0; i < params::opt; i++) {
global_slice[tid] = accumulator[tid];
tid += params::degree / params::opt;
}
}
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_bootstrap_low_latency_step_one(
uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size + // accumulator_rotated
sizeof(double2) * polynomial_size / 2; // accumulator fft
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_bootstrap_low_latency_step_two(
uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size + // accumulator
sizeof(double2) * polynomial_size / 2; // accumulator fft
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_partial_sm_bootstrap_low_latency(uint32_t polynomial_size) {
return sizeof(double2) * polynomial_size / 2; // accumulator fft
}
template <typename Torus>
__host__ __device__ uint64_t get_buffer_size_bootstrap_low_latency(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
uint64_t full_sm_step_one =
get_buffer_size_full_sm_bootstrap_low_latency_step_one<Torus>(
polynomial_size);
uint64_t full_sm_step_two =
get_buffer_size_full_sm_bootstrap_low_latency_step_two<Torus>(
polynomial_size);
uint64_t partial_sm =
get_buffer_size_partial_sm_bootstrap_low_latency<Torus>(polynomial_size);
uint64_t partial_dm_step_one = full_sm_step_one - partial_sm;
uint64_t partial_dm_step_two = full_sm_step_two - partial_sm;
uint64_t full_dm = full_sm_step_one;
uint64_t device_mem = 0;
if (max_shared_memory < partial_sm) {
device_mem = full_dm * input_lwe_ciphertext_count * level_count *
(glwe_dimension + 1);
} else if (max_shared_memory < full_sm_step_two) {
device_mem = (partial_dm_step_two + partial_dm_step_one * level_count) *
input_lwe_ciphertext_count * (glwe_dimension + 1);
} else if (max_shared_memory < full_sm_step_one) {
device_mem = partial_dm_step_one * input_lwe_ciphertext_count *
level_count * (glwe_dimension + 1);
}
  // Otherwise, both kernels run entirely in shared memory
uint64_t buffer_size = device_mem +
// global_accumulator_fft
(glwe_dimension + 1) * level_count *
input_lwe_ciphertext_count *
(polynomial_size / 2) * sizeof(double2) +
// global_accumulator
(glwe_dimension + 1) * input_lwe_ciphertext_count *
polynomial_size * sizeof(Torus);
return buffer_size + buffer_size % sizeof(double2);
}
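/*
 * Worked example (illustrative): with Torus = uint64_t, glwe_dimension = 1,
 * polynomial_size = 1024, level_count = 1, a single input ciphertext and
 * enough shared memory for both kernels to run fully in shared memory
 * (device_mem = 0):
 *   global_accumulator_fft: (1+1) * 1 * 1 * (1024/2) * sizeof(double2) = 16 KB
 *   global_accumulator:     (1+1) * 1 * 1024 * sizeof(uint64_t)        = 16 KB
 * i.e. a 32 KB buffer, already a multiple of sizeof(double2), so the final
 * rounding term adds no padding.
 */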
template <typename Torus, typename STorus, typename params>
__host__ void scratch_bootstrap_low_latency(
void *v_stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
cudaSetDevice(gpu_index);
auto stream = static_cast<cudaStream_t *>(v_stream);
uint64_t full_sm_step_one =
get_buffer_size_full_sm_bootstrap_low_latency_step_one<Torus>(
polynomial_size);
uint64_t full_sm_step_two =
get_buffer_size_full_sm_bootstrap_low_latency_step_two<Torus>(
polynomial_size);
uint64_t partial_sm =
get_buffer_size_partial_sm_bootstrap_low_latency<Torus>(polynomial_size);
// Configure step one
if (max_shared_memory >= partial_sm && max_shared_memory < full_sm_step_one) {
check_cuda_error(cudaFuncSetAttribute(
device_bootstrap_low_latency_step_one<Torus, params, PARTIALSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
cudaFuncSetCacheConfig(
device_bootstrap_low_latency_step_one<Torus, params, PARTIALSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else if (max_shared_memory >= partial_sm) {
check_cuda_error(cudaFuncSetAttribute(
device_bootstrap_low_latency_step_one<Torus, params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_step_one));
cudaFuncSetCacheConfig(
device_bootstrap_low_latency_step_one<Torus, params, FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
}
// Configure step two
if (max_shared_memory >= partial_sm && max_shared_memory < full_sm_step_two) {
check_cuda_error(cudaFuncSetAttribute(
device_bootstrap_low_latency_step_two<Torus, params, PARTIALSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
cudaFuncSetCacheConfig(
device_bootstrap_low_latency_step_two<Torus, params, PARTIALSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else if (max_shared_memory >= partial_sm) {
check_cuda_error(cudaFuncSetAttribute(
device_bootstrap_low_latency_step_two<Torus, params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_step_two));
cudaFuncSetCacheConfig(
device_bootstrap_low_latency_step_two<Torus, params, FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
}
if (allocate_gpu_memory) {
uint64_t buffer_size = get_buffer_size_bootstrap_low_latency<Torus>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory);
*pbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream, gpu_index);
check_cuda_error(cudaGetLastError());
}
}
template <typename Torus, class params>
__host__ void execute_low_latency_step_one(
void *v_stream, Torus *lwe_array_out, Torus *lut_vector,
Torus *lut_vector_indexes, Torus *lwe_array_in, double2 *bootstrapping_key,
Torus *global_accumulator, double2 *global_accumulator_fft,
uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, int8_t *d_mem, uint32_t max_shared_memory,
int lwe_iteration, uint64_t partial_sm, uint64_t partial_dm,
uint64_t full_sm, uint64_t full_dm) {
int thds = polynomial_size / params::opt;
dim3 grid(level_count, glwe_dimension + 1, input_lwe_ciphertext_count);
auto stream = static_cast<cudaStream_t *>(v_stream);
if (max_shared_memory < partial_sm) {
device_bootstrap_low_latency_step_one<Torus, params, NOSM>
<<<grid, thds, 0, *stream>>>(
lwe_array_out, lut_vector, lut_vector_indexes, lwe_array_in,
bootstrapping_key, global_accumulator, global_accumulator_fft,
lwe_iteration, lwe_dimension, polynomial_size, base_log,
level_count, d_mem, full_dm);
} else if (max_shared_memory < full_sm) {
device_bootstrap_low_latency_step_one<Torus, params, PARTIALSM>
<<<grid, thds, partial_sm, *stream>>>(
lwe_array_out, lut_vector, lut_vector_indexes, lwe_array_in,
bootstrapping_key, global_accumulator, global_accumulator_fft,
lwe_iteration, lwe_dimension, polynomial_size, base_log,
level_count, d_mem, partial_dm);
} else {
device_bootstrap_low_latency_step_one<Torus, params, FULLSM>
<<<grid, thds, full_sm, *stream>>>(
lwe_array_out, lut_vector, lut_vector_indexes, lwe_array_in,
bootstrapping_key, global_accumulator, global_accumulator_fft,
lwe_iteration, lwe_dimension, polynomial_size, base_log,
level_count, d_mem, 0);
}
check_cuda_error(cudaGetLastError());
}
template <typename Torus, class params>
__host__ void execute_low_latency_step_two(
void *v_stream, Torus *lwe_array_out, Torus *lut_vector,
Torus *lut_vector_indexes, Torus *lwe_array_in, double2 *bootstrapping_key,
Torus *global_accumulator, double2 *global_accumulator_fft,
uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, int8_t *d_mem, uint32_t max_shared_memory,
int lwe_iteration, uint64_t partial_sm, uint64_t partial_dm,
uint64_t full_sm, uint64_t full_dm) {
int thds = polynomial_size / params::opt;
dim3 grid(input_lwe_ciphertext_count, glwe_dimension + 1);
auto stream = static_cast<cudaStream_t *>(v_stream);
if (max_shared_memory < partial_sm) {
device_bootstrap_low_latency_step_two<Torus, params, NOSM>
<<<grid, thds, 0, *stream>>>(
lwe_array_out, lut_vector, lut_vector_indexes, lwe_array_in,
bootstrapping_key, global_accumulator, global_accumulator_fft,
lwe_iteration, lwe_dimension, polynomial_size, base_log,
level_count, d_mem, full_dm);
} else if (max_shared_memory < full_sm) {
device_bootstrap_low_latency_step_two<Torus, params, PARTIALSM>
<<<grid, thds, partial_sm, *stream>>>(
lwe_array_out, lut_vector, lut_vector_indexes, lwe_array_in,
bootstrapping_key, global_accumulator, global_accumulator_fft,
lwe_iteration, lwe_dimension, polynomial_size, base_log,
level_count, d_mem, partial_dm);
} else {
device_bootstrap_low_latency_step_two<Torus, params, FULLSM>
<<<grid, thds, full_sm, *stream>>>(
lwe_array_out, lut_vector, lut_vector_indexes, lwe_array_in,
bootstrapping_key, global_accumulator, global_accumulator_fft,
lwe_iteration, lwe_dimension, polynomial_size, base_log,
level_count, d_mem, 0);
}
check_cuda_error(cudaGetLastError());
}
/*
* Host wrapper to the low latency version
* of bootstrapping
*/
template <typename Torus, class params>
__host__ void host_bootstrap_low_latency(
void *v_stream, uint32_t gpu_index, Torus *lwe_array_out, Torus *lut_vector,
Torus *lut_vector_indexes, Torus *lwe_array_in, double2 *bootstrapping_key,
int8_t *pbs_buffer, uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t num_lut_vectors,
uint32_t max_shared_memory) {
cudaSetDevice(gpu_index);
  // With shared memory, each block corresponds to either the mask or the body,
  // so there is no need to duplicate data for each
uint64_t full_sm_step_one =
get_buffer_size_full_sm_bootstrap_low_latency_step_one<Torus>(
polynomial_size);
uint64_t full_sm_step_two =
get_buffer_size_full_sm_bootstrap_low_latency_step_two<Torus>(
polynomial_size);
uint64_t partial_sm =
get_buffer_size_partial_sm_bootstrap_low_latency<Torus>(polynomial_size);
uint64_t partial_dm_step_one = full_sm_step_one - partial_sm;
uint64_t partial_dm_step_two = full_sm_step_two - partial_sm;
uint64_t full_dm_step_one = full_sm_step_one;
uint64_t full_dm_step_two = full_sm_step_two;
double2 *global_accumulator_fft = (double2 *)pbs_buffer;
Torus *global_accumulator =
(Torus *)global_accumulator_fft +
(ptrdiff_t)(sizeof(double2) * (glwe_dimension + 1) * level_count *
input_lwe_ciphertext_count * (polynomial_size / 2) /
sizeof(Torus));
int8_t *d_mem = (int8_t *)global_accumulator +
(ptrdiff_t)(sizeof(Torus) * (glwe_dimension + 1) *
input_lwe_ciphertext_count * polynomial_size /
sizeof(int8_t));
for (int i = 0; i < lwe_dimension; i++) {
execute_low_latency_step_one<Torus, params>(
v_stream, lwe_array_out, lut_vector, lut_vector_indexes, lwe_array_in,
bootstrapping_key, global_accumulator, global_accumulator_fft,
input_lwe_ciphertext_count, lwe_dimension, glwe_dimension,
polynomial_size, base_log, level_count, d_mem, max_shared_memory, i,
partial_sm, partial_dm_step_one, full_sm_step_one, full_dm_step_one);
execute_low_latency_step_two<Torus, params>(
v_stream, lwe_array_out, lut_vector, lut_vector_indexes, lwe_array_in,
bootstrapping_key, global_accumulator, global_accumulator_fft,
input_lwe_ciphertext_count, lwe_dimension, glwe_dimension,
polynomial_size, base_log, level_count, d_mem, max_shared_memory, i,
partial_sm, partial_dm_step_two, full_sm_step_two, full_dm_step_two);
}
}
#endif // LOWLAT_PBS_H

View File

@@ -1,399 +0,0 @@
#include <err.h>
#include "bootstrap_fast_multibit.cuh"
#include "bootstrap_multibit.cuh"
#include "bootstrap_multibit.h"
#include "polynomial/parameters.cuh"
void checks_multi_bit_pbs(int polynomial_size) {
assert(
("Error (GPU multi-bit PBS): polynomial size should be one of 256, 512, "
"1024, 2048, 4096, 8192, 16384",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192 ||
polynomial_size == 16384));
}
void cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lut_vector,
void *lut_vector_indexes, void *lwe_array_in, void *bootstrapping_key,
int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_lut_vectors,
uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t lwe_chunk_size) {
checks_multi_bit_pbs(polynomial_size);
switch (polynomial_size) {
case 256:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<256>>(
glwe_dimension, level_count, num_samples, max_shared_memory)) {
host_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<256>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (uint64_t *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
max_shared_memory, lwe_chunk_size);
} else {
host_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<256>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (uint64_t *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
max_shared_memory, lwe_chunk_size);
}
break;
case 512:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<512>>(
glwe_dimension, level_count, num_samples, max_shared_memory)) {
host_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<512>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (uint64_t *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
max_shared_memory, lwe_chunk_size);
} else {
host_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<512>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (uint64_t *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
max_shared_memory, lwe_chunk_size);
}
break;
case 1024:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<1024>>(
glwe_dimension, level_count, num_samples, max_shared_memory)) {
host_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<1024>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (uint64_t *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
max_shared_memory, lwe_chunk_size);
} else {
host_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<1024>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (uint64_t *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
max_shared_memory, lwe_chunk_size);
}
break;
case 2048:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<2048>>(
glwe_dimension, level_count, num_samples, max_shared_memory)) {
host_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<2048>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (uint64_t *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
max_shared_memory, lwe_chunk_size);
} else {
host_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<2048>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (uint64_t *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
max_shared_memory, lwe_chunk_size);
}
break;
case 4096:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<4096>>(
glwe_dimension, level_count, num_samples, max_shared_memory)) {
host_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<4096>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (uint64_t *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
max_shared_memory, lwe_chunk_size);
} else {
host_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<4096>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (uint64_t *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
max_shared_memory, lwe_chunk_size);
}
break;
case 8192:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<8192>>(
glwe_dimension, level_count, num_samples, max_shared_memory)) {
host_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<8192>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (uint64_t *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
max_shared_memory, lwe_chunk_size);
} else {
host_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<8192>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (uint64_t *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
max_shared_memory, lwe_chunk_size);
}
break;
case 16384:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<16384>>(
glwe_dimension, level_count, num_samples, max_shared_memory)) {
host_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<16384>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (uint64_t *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
max_shared_memory, lwe_chunk_size);
} else {
host_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<16384>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (uint64_t *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
max_shared_memory, lwe_chunk_size);
}
break;
default:
errx(EXIT_FAILURE, "polynomial size %u is not supported. Supported values "
"are: 256, 512, 1024, 2048, 4096, 8192, 16384.", polynomial_size);
break;
}
}
void scratch_cuda_multi_bit_pbs_64(
void *v_stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory, uint32_t lwe_chunk_size) {
switch (polynomial_size) {
case 256:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<256>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory)) {
scratch_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<256>>(
v_stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
grouping_factor, max_shared_memory, allocate_gpu_memory,
lwe_chunk_size);
} else {
scratch_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<256>>(
v_stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
grouping_factor, max_shared_memory, allocate_gpu_memory,
lwe_chunk_size);
}
break;
case 512:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<512>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory)) {
scratch_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<512>>(
v_stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
grouping_factor, max_shared_memory, allocate_gpu_memory,
lwe_chunk_size);
} else {
scratch_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<512>>(
v_stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
grouping_factor, max_shared_memory, allocate_gpu_memory,
lwe_chunk_size);
}
break;
case 1024:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<1024>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory)) {
scratch_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<1024>>(
v_stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
grouping_factor, max_shared_memory, allocate_gpu_memory,
lwe_chunk_size);
} else {
scratch_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<1024>>(
v_stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
grouping_factor, max_shared_memory, allocate_gpu_memory,
lwe_chunk_size);
}
break;
case 2048:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<2048>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory)) {
scratch_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<2048>>(
v_stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
grouping_factor, max_shared_memory, allocate_gpu_memory,
lwe_chunk_size);
} else {
scratch_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<2048>>(
v_stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
grouping_factor, max_shared_memory, allocate_gpu_memory,
lwe_chunk_size);
}
break;
case 4096:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<4096>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory)) {
scratch_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<4096>>(
v_stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
grouping_factor, max_shared_memory, allocate_gpu_memory,
lwe_chunk_size);
} else {
scratch_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<4096>>(
v_stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
grouping_factor, max_shared_memory, allocate_gpu_memory,
lwe_chunk_size);
}
break;
case 8192:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<8192>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory)) {
scratch_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<8192>>(
v_stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
grouping_factor, max_shared_memory, allocate_gpu_memory,
lwe_chunk_size);
} else {
scratch_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<8192>>(
v_stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
grouping_factor, max_shared_memory, allocate_gpu_memory,
lwe_chunk_size);
}
break;
case 16384:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<16384>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory)) {
scratch_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<16384>>(
v_stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
grouping_factor, max_shared_memory, allocate_gpu_memory,
lwe_chunk_size);
} else {
scratch_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<16384>>(
v_stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
grouping_factor, max_shared_memory, allocate_gpu_memory,
lwe_chunk_size);
}
break;
default:
errx(EXIT_FAILURE, "polynomial size %u is not supported. Supported values "
"are: 256, 512, 1024, 2048, 4096, 8192, 16384.", polynomial_size);
break;
}
}
void cleanup_cuda_multi_bit_pbs(void *v_stream, uint32_t gpu_index,
int8_t **pbs_buffer) {
auto stream = static_cast<cudaStream_t *>(v_stream);
// Free memory
cuda_drop_async(*pbs_buffer, stream, gpu_index);
}
__host__ uint32_t get_lwe_chunk_size(uint32_t lwe_dimension,
uint32_t level_count,
uint32_t glwe_dimension,
uint32_t num_samples) {
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, 0); // Assuming device 0
const char *v100Name = "V100"; // Known name of V100 GPU
  const char *a100Name = "A100"; // Known name of A100 GPU
if (std::strstr(deviceProp.name, v100Name) != nullptr) {
// Tesla V100
if (num_samples < 16)
return 80 / num_samples;
else if (num_samples == 16)
return 40;
else if (num_samples < 1024)
return 20;
else if (num_samples < 8192)
return 10;
} else if (std::strstr(deviceProp.name, a100Name) != nullptr) {
// Tesla A100
if (num_samples < 4)
return 11;
else if (num_samples < 8)
return 6;
else if (num_samples < 16)
return 13;
else if (num_samples < 64)
return 19;
else if (num_samples < 128)
return 1;
else if (num_samples < 512)
return 19;
else if (num_samples < 1024)
return 17;
else if (num_samples < 8192)
return 19;
else if (num_samples < 16384)
return 12;
else
return 9;
}
// Generic case
return 1;
}
// Returns the maximum buffer size required to execute batches up to
// max_input_lwe_ciphertext_count
__host__ uint64_t get_max_buffer_size_multibit_bootstrap(
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t max_input_lwe_ciphertext_count) {
uint64_t max_buffer_size = 0;
for (uint32_t input_lwe_ciphertext_count = 1;
input_lwe_ciphertext_count <= max_input_lwe_ciphertext_count;
input_lwe_ciphertext_count++) {
max_buffer_size = std::max(
max_buffer_size,
get_buffer_size_multibit_bootstrap<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count,
get_lwe_chunk_size(lwe_dimension, level_count, glwe_dimension,
input_lwe_ciphertext_count)));
}
return max_buffer_size;
}
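/*
 * Illustrative usage sketch: the multi-bit PBS follows the same
 * scratch -> bootstrap -> cleanup pattern as the low latency PBS, with a
 * grouping_factor and an lwe_chunk_size tuned per device. The device arrays
 * (names prefixed with d_) and the parameter values are placeholders, assumed
 * to be allocated and filled by the caller.
 *
 *   uint32_t chunk_size = get_lwe_chunk_size(lwe_dimension, level_count,
 *                                            glwe_dimension, num_samples);
 *   int8_t *pbs_buffer = nullptr;
 *   scratch_cuda_multi_bit_pbs_64(&stream, gpu_index, &pbs_buffer,
 *                                 lwe_dimension, glwe_dimension,
 *                                 polynomial_size, level_count,
 *                                 grouping_factor, num_samples,
 *                                 max_shared_memory, true, chunk_size);
 *   cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
 *       &stream, gpu_index, d_lwe_array_out, d_lut_vector,
 *       d_lut_vector_indexes, d_lwe_array_in, d_multi_bit_bsk, pbs_buffer,
 *       lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
 *       base_log, level_count, num_samples, num_lut_vectors, 0,
 *       max_shared_memory, chunk_size);
 *   cleanup_cuda_multi_bit_pbs(&stream, gpu_index, &pbs_buffer);
 */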

View File

@@ -1,467 +0,0 @@
#ifndef MULTIBIT_PBS_H
#define MULTIBIT_PBS_H
#include "bootstrap.h"
#include "bootstrap_fast_low_latency.cuh"
#include "bootstrap_multibit.h"
#include "complex/operations.cuh"
#include "cooperative_groups.h"
#include "crypto/gadget.cuh"
#include "crypto/ggsw.cuh"
#include "crypto/torus.cuh"
#include "device.h"
#include "fft/bnsmfft.cuh"
#include "fft/twiddles.cuh"
#include "polynomial/functions.cuh"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial.cuh"
#include "polynomial/polynomial_math.cuh"
#include "utils/timer.cuh"
#include <vector>
template <typename Torus, class params>
__device__ Torus calculates_monomial_degree(Torus *lwe_array_group,
uint32_t ggsw_idx,
uint32_t grouping_factor) {
Torus x = 0;
for (int i = 0; i < grouping_factor; i++) {
uint32_t mask_position = grouping_factor - (i + 1);
int selection_bit = (ggsw_idx >> mask_position) & 1;
x += selection_bit * lwe_array_group[i];
}
return rescale_torus_element(
x, 2 * params::degree); // 2 * params::log2_degree + 1);
}
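/*
 * Example (illustrative): with grouping_factor = 3 and ggsw_idx = 0b101, the
 * selection bits pick lwe_array_group[0] and lwe_array_group[2], so
 * x = lwe_array_group[0] + lwe_array_group[2] before rescaling to [0, 2N[.
 */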
template <typename Torus, class params>
__global__ void device_multi_bit_bootstrap_keybundle(
Torus *lwe_array_in, double2 *keybundle_array, Torus *bootstrapping_key,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
uint32_t lwe_offset, uint32_t lwe_chunk_size,
uint32_t keybundle_size_per_input) {
extern __shared__ int8_t sharedmem[];
int8_t *selected_memory = sharedmem;
// Ids
uint32_t level_id = blockIdx.z;
uint32_t glwe_id = blockIdx.y / (glwe_dimension + 1);
uint32_t poly_id = blockIdx.y % (glwe_dimension + 1);
uint32_t lwe_iteration = (blockIdx.x % lwe_chunk_size + lwe_offset);
uint32_t input_idx = blockIdx.x / lwe_chunk_size;
if (lwe_iteration < (lwe_dimension / grouping_factor)) {
//
Torus *accumulator = (Torus *)selected_memory;
Torus *block_lwe_array_in = &lwe_array_in[input_idx * (lwe_dimension + 1)];
double2 *keybundle = keybundle_array +
// select the input
input_idx * keybundle_size_per_input;
////////////////////////////////////////////////////////////
// Computes all keybundles
uint32_t rev_lwe_iteration =
((lwe_dimension / grouping_factor) - lwe_iteration - 1);
// ////////////////////////////////
// Keygen guarantees the first term is a constant term of the polynomial, no
// polynomial multiplication required
Torus *bsk_slice = get_multi_bit_ith_lwe_gth_group_kth_block(
bootstrapping_key, 0, rev_lwe_iteration, glwe_id, level_id,
grouping_factor, 2 * polynomial_size, glwe_dimension, level_count);
Torus *bsk_poly = bsk_slice + poly_id * params::degree;
copy_polynomial<Torus, params::opt, params::degree / params::opt>(
bsk_poly, accumulator);
// Accumulate the other terms
for (int g = 1; g < (1 << grouping_factor); g++) {
Torus *bsk_slice = get_multi_bit_ith_lwe_gth_group_kth_block(
bootstrapping_key, g, rev_lwe_iteration, glwe_id, level_id,
grouping_factor, 2 * polynomial_size, glwe_dimension, level_count);
Torus *bsk_poly = bsk_slice + poly_id * params::degree;
// Calculates the monomial degree
Torus *lwe_array_group =
block_lwe_array_in + rev_lwe_iteration * grouping_factor;
uint32_t monomial_degree = calculates_monomial_degree<Torus, params>(
lwe_array_group, g, grouping_factor);
synchronize_threads_in_block();
// Multiply by the bsk element
polynomial_product_accumulate_by_monomial<Torus, params>(
accumulator, bsk_poly, monomial_degree, false);
}
synchronize_threads_in_block();
double2 *fft = (double2 *)sharedmem;
// Move accumulator to local memory
double2 temp[params::opt / 2];
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
temp[i].x = __ll2double_rn((int64_t)accumulator[tid]);
temp[i].y =
__ll2double_rn((int64_t)accumulator[tid + params::degree / 2]);
temp[i].x /= (double)std::numeric_limits<Torus>::max();
temp[i].y /= (double)std::numeric_limits<Torus>::max();
tid += params::degree / params::opt;
}
synchronize_threads_in_block();
// Move from local memory back to shared memory but as complex
tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
fft[tid] = temp[i];
tid += params::degree / params::opt;
}
synchronize_threads_in_block();
NSMFFT_direct<HalfDegree<params>>(fft);
// lwe iteration
auto keybundle_out = get_ith_mask_kth_block(
keybundle, blockIdx.x % lwe_chunk_size, glwe_id, level_id,
polynomial_size, glwe_dimension, level_count);
auto keybundle_poly = keybundle_out + poly_id * params::degree / 2;
copy_polynomial<double2, params::opt / 2, params::degree / params::opt>(
fft, keybundle_poly);
}
}
template <typename Torus, class params>
__global__ void device_multi_bit_bootstrap_accumulate_step_one(
Torus *lwe_array_in, Torus *lut_vector, Torus *lut_vector_indexes,
Torus *global_accumulator, double2 *global_accumulator_fft,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t lwe_iteration) {
// We use shared memory for the polynomials that are used often during the
// bootstrap, since shared memory is on-chip (it shares hardware with the L1
// cache) and accessing it is much faster than global memory
extern __shared__ int8_t sharedmem[];
int8_t *selected_memory;
selected_memory = sharedmem;
Torus *accumulator = (Torus *)selected_memory;
double2 *accumulator_fft =
(double2 *)accumulator +
(ptrdiff_t)(sizeof(Torus) * polynomial_size / sizeof(double2));
Torus *block_lwe_array_in = &lwe_array_in[blockIdx.z * (lwe_dimension + 1)];
Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] *
params::degree * (glwe_dimension + 1)];
Torus *global_slice =
global_accumulator +
(blockIdx.y + blockIdx.z * (glwe_dimension + 1)) * params::degree;
double2 *global_fft_slice =
global_accumulator_fft +
(blockIdx.y + blockIdx.x * (glwe_dimension + 1) +
blockIdx.z * level_count * (glwe_dimension + 1)) *
(polynomial_size / 2);
if (lwe_iteration == 0) {
// First iteration
////////////////////////////////////////////////////////////
// Initializes the accumulator with the body of LWE
// Put "b" in [0, 2N[
Torus b_hat = 0;
rescale_torus_element(block_lwe_array_in[lwe_dimension], b_hat,
2 * params::degree);
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
false);
// Persist
copy_polynomial<Torus, params::opt, params::degree / params::opt>(
accumulator, global_slice);
} else {
// Load the accumulator calculated in previous iterations
copy_polynomial<Torus, params::opt, params::degree / params::opt>(
global_slice, accumulator);
}
// Decompose the accumulator. Each block gets one level of the
// decomposition, for the mask and the body (so block 0 will have the
// accumulator decomposed at level 0, 1 at 1, etc.)
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator);
gadget_acc.decompose_and_compress_next_polynomial(accumulator_fft,
blockIdx.x);
// We are using the same memory space for accumulator_fft and
// accumulator_rotated, so we need to synchronize here to make sure they
// don't modify the same memory space at the same time
// Switch to the FFT space
NSMFFT_direct<HalfDegree<params>>(accumulator_fft);
copy_polynomial<double2, params::opt / 2, params::degree / params::opt>(
accumulator_fft, global_fft_slice);
}
template <typename Torus, class params>
__global__ void device_multi_bit_bootstrap_accumulate_step_two(
Torus *lwe_array_out, double2 *keybundle_array, Torus *global_accumulator,
double2 *global_accumulator_fft, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t grouping_factor, uint32_t iteration, uint32_t lwe_offset,
uint32_t lwe_chunk_size) {
// We use shared memory for the polynomials that are used often during the
// bootstrap, since shared memory is on-chip (it shares hardware with the L1
// cache) and accessing it is much faster than global memory
extern __shared__ int8_t sharedmem[];
int8_t *selected_memory;
selected_memory = sharedmem;
double2 *accumulator_fft = (double2 *)selected_memory;
double2 *keybundle = keybundle_array +
// select the input
blockIdx.x * lwe_chunk_size * level_count *
(glwe_dimension + 1) * (glwe_dimension + 1) *
(polynomial_size / 2);
double2 *global_accumulator_fft_input =
global_accumulator_fft +
blockIdx.x * level_count * (glwe_dimension + 1) * (polynomial_size / 2);
for (int level = 0; level < level_count; level++) {
double2 *global_fft_slice =
global_accumulator_fft_input +
level * (glwe_dimension + 1) * (polynomial_size / 2);
for (int j = 0; j < (glwe_dimension + 1); j++) {
double2 *fft = global_fft_slice + j * params::degree / 2;
// Get the bootstrapping key piece necessary for the multiplication
// It is already in the Fourier domain
auto bsk_slice =
get_ith_mask_kth_block(keybundle, iteration, j, level,
polynomial_size, glwe_dimension, level_count);
auto bsk_poly = bsk_slice + blockIdx.y * params::degree / 2;
polynomial_product_accumulate_in_fourier_domain<params, double2>(
accumulator_fft, fft, bsk_poly, !level && !j);
}
}
// Perform the inverse FFT on the result of the GGSW x GLWE and add to the
// accumulator
NSMFFT_inverse<HalfDegree<params>>(accumulator_fft);
Torus *global_slice =
global_accumulator +
(blockIdx.y + blockIdx.x * (glwe_dimension + 1)) * params::degree;
add_to_torus<Torus, params>(accumulator_fft, global_slice, true);
synchronize_threads_in_block();
uint32_t lwe_iteration = iteration + lwe_offset;
if (lwe_iteration + 1 == (lwe_dimension / grouping_factor)) {
// Last iteration
auto block_lwe_array_out =
&lwe_array_out[blockIdx.x * (glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
if (blockIdx.y < glwe_dimension) {
// Perform a sample extract. At this point, all blocks have the result,
// but we do the computation at block 0 to avoid waiting for extra blocks,
// in case they're not synchronized
sample_extract_mask<Torus, params>(block_lwe_array_out, global_slice);
} else if (blockIdx.y == glwe_dimension) {
sample_extract_body<Torus, params>(block_lwe_array_out, global_slice, 0);
}
}
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_multibit_bootstrap_keybundle(uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size; // accumulator
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_multibit_bootstrap_step_one(uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size * 2; // accumulator
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_multibit_bootstrap_step_two(uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size; // accumulator
}
template <typename Torus>
__host__ __device__ uint64_t get_buffer_size_multibit_bootstrap(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t lwe_chunk_size) {
uint64_t buffer_size = 0;
buffer_size += input_lwe_ciphertext_count * lwe_chunk_size * level_count *
(glwe_dimension + 1) * (glwe_dimension + 1) *
(polynomial_size / 2) * sizeof(double2); // keybundle fft
buffer_size += input_lwe_ciphertext_count * (glwe_dimension + 1) *
level_count * (polynomial_size / 2) *
sizeof(double2); // global_accumulator_fft
buffer_size += input_lwe_ciphertext_count * (glwe_dimension + 1) *
polynomial_size * sizeof(Torus); // global_accumulator
return buffer_size + buffer_size % sizeof(double2);
}
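// Worked example (illustrative parameters, not taken from this file): with
// Torus = uint64_t, glwe_dimension = 1, polynomial_size = 1024,
// level_count = 1, input_lwe_ciphertext_count = 1 and lwe_chunk_size = 1,
// the formula above gives
//   keybundle fft         : 1 * 1 * 1 * 2 * 2 * 512 * 16 B = 32 KiB
//   global_accumulator_fft: 1 * 2 * 1 * 512 * 16 B         = 16 KiB
//   global_accumulator    : 1 * 2 * 1024 * 8 B             = 16 KiB
// for a total of 64 KiB; the trailing `buffer_size % sizeof(double2)` term
// only adds slack when the sum is not already a multiple of 16 bytes.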
template <typename Torus, typename STorus, typename params>
__host__ void
scratch_multi_bit_pbs(void *v_stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count,
uint32_t grouping_factor, uint32_t max_shared_memory,
bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0) {
cudaSetDevice(gpu_index);
auto stream = static_cast<cudaStream_t *>(v_stream);
uint64_t full_sm_keybundle =
get_buffer_size_full_sm_multibit_bootstrap_keybundle<Torus>(
polynomial_size);
uint64_t full_sm_accumulate_step_one =
get_buffer_size_full_sm_multibit_bootstrap_step_one<Torus>(
polynomial_size);
uint64_t full_sm_accumulate_step_two =
get_buffer_size_full_sm_multibit_bootstrap_step_two<Torus>(
polynomial_size);
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_bootstrap_keybundle<Torus, params>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_keybundle));
cudaFuncSetCacheConfig(device_multi_bit_bootstrap_keybundle<Torus, params>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_bootstrap_accumulate_step_one<Torus, params>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
full_sm_accumulate_step_one));
cudaFuncSetCacheConfig(
device_multi_bit_bootstrap_accumulate_step_one<Torus, params>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_bootstrap_accumulate_step_two<Torus, params>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
full_sm_accumulate_step_two));
cudaFuncSetCacheConfig(
device_multi_bit_bootstrap_accumulate_step_two<Torus, params>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
if (allocate_gpu_memory) {
if (!lwe_chunk_size)
lwe_chunk_size =
get_lwe_chunk_size(lwe_dimension, level_count, glwe_dimension,
input_lwe_ciphertext_count);
uint64_t buffer_size = get_buffer_size_multibit_bootstrap<Torus>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, lwe_chunk_size);
*pbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream, gpu_index);
check_cuda_error(cudaGetLastError());
}
}
template <typename Torus, typename STorus, class params>
__host__ void host_multi_bit_pbs(
void *v_stream, uint32_t gpu_index, Torus *lwe_array_out, Torus *lut_vector,
Torus *lut_vector_indexes, Torus *lwe_array_in, uint64_t *bootstrapping_key,
int8_t *pbs_buffer, uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_lut_vectors,
uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0) {
cudaSetDevice(gpu_index);
auto stream = static_cast<cudaStream_t *>(v_stream);
// If a chunk size is not passed to this function, select one.
if (!lwe_chunk_size)
lwe_chunk_size = get_lwe_chunk_size(lwe_dimension, level_count,
glwe_dimension, num_samples);
//
double2 *keybundle_fft = (double2 *)pbs_buffer;
double2 *global_accumulator_fft =
(double2 *)keybundle_fft +
num_samples * lwe_chunk_size * level_count * (glwe_dimension + 1) *
(glwe_dimension + 1) * (polynomial_size / 2);
Torus *global_accumulator =
(Torus *)global_accumulator_fft +
(ptrdiff_t)(sizeof(double2) * num_samples * (glwe_dimension + 1) *
level_count * (polynomial_size / 2) / sizeof(Torus));
//
uint64_t full_sm_keybundle =
get_buffer_size_full_sm_multibit_bootstrap_keybundle<Torus>(
polynomial_size);
uint64_t full_sm_accumulate_step_one =
get_buffer_size_full_sm_multibit_bootstrap_step_one<Torus>(
polynomial_size);
uint64_t full_sm_accumulate_step_two =
get_buffer_size_full_sm_multibit_bootstrap_step_two<Torus>(
polynomial_size);
uint32_t keybundle_size_per_input =
lwe_chunk_size * level_count * (glwe_dimension + 1) *
(glwe_dimension + 1) * (polynomial_size / 2);
//
dim3 grid_accumulate_step_one(level_count, glwe_dimension + 1, num_samples);
dim3 grid_accumulate_step_two(num_samples, glwe_dimension + 1);
dim3 thds(polynomial_size / params::opt, 1, 1);
for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
lwe_offset += lwe_chunk_size) {
uint32_t chunk_size = std::min(
lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
// Compute a keybundle
dim3 grid_keybundle(num_samples * chunk_size,
(glwe_dimension + 1) * (glwe_dimension + 1),
level_count);
device_multi_bit_bootstrap_keybundle<Torus, params>
<<<grid_keybundle, thds, full_sm_keybundle, *stream>>>(
lwe_array_in, keybundle_fft, bootstrapping_key, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, base_log,
level_count, lwe_offset, chunk_size, keybundle_size_per_input);
check_cuda_error(cudaGetLastError());
// Accumulate
for (int j = 0; j < chunk_size; j++) {
device_multi_bit_bootstrap_accumulate_step_one<Torus, params>
<<<grid_accumulate_step_one, thds, full_sm_accumulate_step_one,
*stream>>>(lwe_array_in, lut_vector, lut_vector_indexes,
global_accumulator, global_accumulator_fft,
lwe_dimension, glwe_dimension, polynomial_size,
base_log, level_count, j + lwe_offset);
check_cuda_error(cudaGetLastError());
device_multi_bit_bootstrap_accumulate_step_two<Torus, params>
<<<grid_accumulate_step_two, thds, full_sm_accumulate_step_two,
*stream>>>(lwe_array_out, keybundle_fft, global_accumulator,
global_accumulator_fft, lwe_dimension, glwe_dimension,
polynomial_size, level_count, grouping_factor, j,
lwe_offset, lwe_chunk_size);
check_cuda_error(cudaGetLastError());
}
}
}
#endif // MULTIBIT_PBS_H

View File

@@ -1 +0,0 @@
#include "crypto/ciphertext.cuh"

View File

@@ -1,329 +0,0 @@
#include "circuit_bootstrap.cuh"
#include "circuit_bootstrap.h"
/*
* Runs standard checks to validate the inputs
*/
void checks_fast_circuit_bootstrap(int polynomial_size) {
assert(("Error (GPU circuit bootstrap): polynomial_size should be one of "
"256, 512, 1024, 2048, 4096, 8192",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
}
/*
* Runs standard checks to validate the inputs
*/
void checks_circuit_bootstrap(int glwe_dimension, int polynomial_size,
int level_bsk, int number_of_inputs) {
// The number of samples should be lower than the number of streaming
// multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
// to the occupancy of 50%). The only supported value for k is 1, so
// k + 1 = 2 for now.
int number_of_sm = 0;
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
assert(("Error (GPU extract bits): the number of input LWEs must be lower or "
"equal to the "
"number of streaming multiprocessors on the device divided by 4 * "
"(k + 1) * level_count_bsk",
number_of_inputs <=
number_of_sm / 4. / (glwe_dimension + 1) / level_bsk));
checks_fast_circuit_bootstrap(polynomial_size);
}
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the circuit bootstrap on 32 bits inputs, into `cbs_buffer`. It also
* configures SM options on the GPU in case FULLSM mode is going to be used.
*/
void scratch_cuda_circuit_bootstrap_32(
void *v_stream, uint32_t gpu_index, int8_t **cbs_buffer,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t level_bsk, uint32_t level_count_cbs, uint32_t number_of_inputs,
uint32_t max_shared_memory, bool allocate_gpu_memory) {
checks_fast_circuit_bootstrap(polynomial_size);
switch (polynomial_size) {
case 256:
scratch_circuit_bootstrap<uint32_t, int32_t, Degree<256>>(
v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_bsk, level_count_cbs, number_of_inputs,
max_shared_memory, allocate_gpu_memory);
break;
case 512:
scratch_circuit_bootstrap<uint32_t, int32_t, Degree<512>>(
v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_bsk, level_count_cbs, number_of_inputs,
max_shared_memory, allocate_gpu_memory);
break;
case 1024:
scratch_circuit_bootstrap<uint32_t, int32_t, Degree<1024>>(
v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_bsk, level_count_cbs, number_of_inputs,
max_shared_memory, allocate_gpu_memory);
break;
case 2048:
scratch_circuit_bootstrap<uint32_t, int32_t, Degree<2048>>(
v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_bsk, level_count_cbs, number_of_inputs,
max_shared_memory, allocate_gpu_memory);
break;
case 4096:
scratch_circuit_bootstrap<uint32_t, int32_t, Degree<4096>>(
v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_bsk, level_count_cbs, number_of_inputs,
max_shared_memory, allocate_gpu_memory);
break;
case 8192:
scratch_circuit_bootstrap<uint32_t, int32_t, Degree<8192>>(
v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_bsk, level_count_cbs, number_of_inputs,
max_shared_memory, allocate_gpu_memory);
break;
default:
break;
}
}
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the circuit bootstrap on 64 bits inputs, into `cbs_buffer`. It also
* configures SM options on the GPU in case FULLSM mode is going to be used.
*/
void scratch_cuda_circuit_bootstrap_64(
void *v_stream, uint32_t gpu_index, int8_t **cbs_buffer,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t level_bsk, uint32_t level_count_cbs, uint32_t number_of_inputs,
uint32_t max_shared_memory, bool allocate_gpu_memory) {
checks_fast_circuit_bootstrap(polynomial_size);
switch (polynomial_size) {
case 256:
scratch_circuit_bootstrap<uint64_t, int64_t, Degree<256>>(
v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_bsk, level_count_cbs, number_of_inputs,
max_shared_memory, allocate_gpu_memory);
break;
case 512:
scratch_circuit_bootstrap<uint64_t, int64_t, Degree<512>>(
v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_bsk, level_count_cbs, number_of_inputs,
max_shared_memory, allocate_gpu_memory);
break;
case 1024:
scratch_circuit_bootstrap<uint64_t, int64_t, Degree<1024>>(
v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_bsk, level_count_cbs, number_of_inputs,
max_shared_memory, allocate_gpu_memory);
break;
case 2048:
scratch_circuit_bootstrap<uint64_t, int64_t, Degree<2048>>(
v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_bsk, level_count_cbs, number_of_inputs,
max_shared_memory, allocate_gpu_memory);
break;
case 4096:
scratch_circuit_bootstrap<uint64_t, int64_t, Degree<4096>>(
v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_bsk, level_count_cbs, number_of_inputs,
max_shared_memory, allocate_gpu_memory);
break;
case 8192:
scratch_circuit_bootstrap<uint64_t, int64_t, Degree<8192>>(
v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_bsk, level_count_cbs, number_of_inputs,
max_shared_memory, allocate_gpu_memory);
break;
default:
break;
}
}
/*
* Perform circuit bootstrapping on a batch of 32-bit LWE ciphertexts.
* Refer to the equivalent 64-bit operation for more details.
*/
void cuda_circuit_bootstrap_32(
void *v_stream, uint32_t gpu_index, void *ggsw_out, void *lwe_array_in,
void *fourier_bsk, void *fp_ksk_array, void *lut_vector_indexes,
int8_t *cbs_buffer, uint32_t delta_log, uint32_t polynomial_size,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t level_bsk,
uint32_t base_log_bsk, uint32_t level_pksk, uint32_t base_log_pksk,
uint32_t level_cbs, uint32_t base_log_cbs, uint32_t number_of_inputs,
uint32_t max_shared_memory) {
checks_circuit_bootstrap(glwe_dimension, polynomial_size, level_bsk,
number_of_inputs);
switch (polynomial_size) {
case 256:
host_circuit_bootstrap<uint32_t, Degree<256>>(
v_stream, gpu_index, (uint32_t *)ggsw_out, (uint32_t *)lwe_array_in,
(double2 *)fourier_bsk, (uint32_t *)fp_ksk_array,
(uint32_t *)lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk,
base_log_pksk, level_cbs, base_log_cbs, number_of_inputs,
max_shared_memory);
break;
case 512:
host_circuit_bootstrap<uint32_t, Degree<512>>(
v_stream, gpu_index, (uint32_t *)ggsw_out, (uint32_t *)lwe_array_in,
(double2 *)fourier_bsk, (uint32_t *)fp_ksk_array,
(uint32_t *)lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk,
base_log_pksk, level_cbs, base_log_cbs, number_of_inputs,
max_shared_memory);
break;
case 1024:
host_circuit_bootstrap<uint32_t, Degree<1024>>(
v_stream, gpu_index, (uint32_t *)ggsw_out, (uint32_t *)lwe_array_in,
(double2 *)fourier_bsk, (uint32_t *)fp_ksk_array,
(uint32_t *)lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk,
base_log_pksk, level_cbs, base_log_cbs, number_of_inputs,
max_shared_memory);
break;
case 2048:
host_circuit_bootstrap<uint32_t, Degree<2048>>(
v_stream, gpu_index, (uint32_t *)ggsw_out, (uint32_t *)lwe_array_in,
(double2 *)fourier_bsk, (uint32_t *)fp_ksk_array,
(uint32_t *)lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk,
base_log_pksk, level_cbs, base_log_cbs, number_of_inputs,
max_shared_memory);
break;
case 4096:
host_circuit_bootstrap<uint32_t, Degree<4096>>(
v_stream, gpu_index, (uint32_t *)ggsw_out, (uint32_t *)lwe_array_in,
(double2 *)fourier_bsk, (uint32_t *)fp_ksk_array,
(uint32_t *)lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk,
base_log_pksk, level_cbs, base_log_cbs, number_of_inputs,
max_shared_memory);
break;
case 8192:
host_circuit_bootstrap<uint32_t, Degree<8192>>(
v_stream, gpu_index, (uint32_t *)ggsw_out, (uint32_t *)lwe_array_in,
(double2 *)fourier_bsk, (uint32_t *)fp_ksk_array,
(uint32_t *)lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk,
base_log_pksk, level_cbs, base_log_cbs, number_of_inputs,
max_shared_memory);
break;
default:
break;
}
}
/*
* Perform circuit bootstrapping on a batch of 64 bit input LWE ciphertexts.
* - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
* launch
* - `gpu_index` is the index of the GPU to be used in the kernel launch
* - 'ggsw_out' output batch of ggsw with size:
* 'number_of_inputs' * 'level_cbs' * ('glwe_dimension' + 1)^2 *
* polynomial_size * sizeof(u64)
* - 'lwe_array_in' input batch of lwe ciphertexts, with size:
* 'number_of_inputs' * ('lwe_dimension' + 1) * sizeof(u64)
* - 'fourier_bsk' bootstrapping key in fourier domain with size:
* 'lwe_dimension' * 'level_bsk' * ('glwe_dimension' + 1)^2 *
* 'polynomial_size' / 2 * sizeof(double2)
* - 'fp_ksk_array' batch of fp-keyswitch keys with size:
* ('polynomial_size' + 1) * 'level_pksk' * ('glwe_dimension' + 1)^2 *
* 'polynomial_size' * sizeof(u64)
* - 'cbs_buffer': scratch buffer used during the computation. It is not a
* real input of the function, only pre-allocated working memory: it can be
* allocated once and reused across several calls to the circuit bootstrap
* function
*
* This function calls a wrapper to a device kernel that performs the
* circuit bootstrap. The kernel is templatized based on integer discretization
* and polynomial degree.
*/
void cuda_circuit_bootstrap_64(
void *v_stream, uint32_t gpu_index, void *ggsw_out, void *lwe_array_in,
void *fourier_bsk, void *fp_ksk_array, void *lut_vector_indexes,
int8_t *cbs_buffer, uint32_t delta_log, uint32_t polynomial_size,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t level_bsk,
uint32_t base_log_bsk, uint32_t level_pksk, uint32_t base_log_pksk,
uint32_t level_cbs, uint32_t base_log_cbs, uint32_t number_of_inputs,
uint32_t max_shared_memory) {
checks_circuit_bootstrap(glwe_dimension, polynomial_size, level_bsk,
number_of_inputs);
switch (polynomial_size) {
case 256:
host_circuit_bootstrap<uint64_t, Degree<256>>(
v_stream, gpu_index, (uint64_t *)ggsw_out, (uint64_t *)lwe_array_in,
(double2 *)fourier_bsk, (uint64_t *)fp_ksk_array,
(uint64_t *)lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk,
base_log_pksk, level_cbs, base_log_cbs, number_of_inputs,
max_shared_memory);
break;
case 512:
host_circuit_bootstrap<uint64_t, Degree<512>>(
v_stream, gpu_index, (uint64_t *)ggsw_out, (uint64_t *)lwe_array_in,
(double2 *)fourier_bsk, (uint64_t *)fp_ksk_array,
(uint64_t *)lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk,
base_log_pksk, level_cbs, base_log_cbs, number_of_inputs,
max_shared_memory);
break;
case 1024:
host_circuit_bootstrap<uint64_t, Degree<1024>>(
v_stream, gpu_index, (uint64_t *)ggsw_out, (uint64_t *)lwe_array_in,
(double2 *)fourier_bsk, (uint64_t *)fp_ksk_array,
(uint64_t *)lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk,
base_log_pksk, level_cbs, base_log_cbs, number_of_inputs,
max_shared_memory);
break;
case 2048:
host_circuit_bootstrap<uint64_t, Degree<2048>>(
v_stream, gpu_index, (uint64_t *)ggsw_out, (uint64_t *)lwe_array_in,
(double2 *)fourier_bsk, (uint64_t *)fp_ksk_array,
(uint64_t *)lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk,
base_log_pksk, level_cbs, base_log_cbs, number_of_inputs,
max_shared_memory);
break;
case 4096:
host_circuit_bootstrap<uint64_t, Degree<4096>>(
v_stream, gpu_index, (uint64_t *)ggsw_out, (uint64_t *)lwe_array_in,
(double2 *)fourier_bsk, (uint64_t *)fp_ksk_array,
(uint64_t *)lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk,
base_log_pksk, level_cbs, base_log_cbs, number_of_inputs,
max_shared_memory);
break;
case 8192:
host_circuit_bootstrap<uint64_t, Degree<8192>>(
v_stream, gpu_index, (uint64_t *)ggsw_out, (uint64_t *)lwe_array_in,
(double2 *)fourier_bsk, (uint64_t *)fp_ksk_array,
(uint64_t *)lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk,
base_log_pksk, level_cbs, base_log_cbs, number_of_inputs,
max_shared_memory);
break;
default:
break;
}
}
/*
* This cleanup function frees the data for the circuit bootstrap on GPU in
* cbs_buffer for 32 or 64 bits inputs.
*/
void cleanup_cuda_circuit_bootstrap(void *v_stream, uint32_t gpu_index,
int8_t **cbs_buffer) {
auto stream = static_cast<cudaStream_t *>(v_stream);
// Free memory
cuda_drop_async(*cbs_buffer, stream, gpu_index);
}
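/*
 * Minimal usage sketch (not part of the original file; the stream setup, the
 * device pointers d_ggsw_out, d_lwe_in, d_bsk, d_fp_ksk, d_lut_indexes and
 * the parameter values are assumptions for illustration only). The three
 * entry points above are meant to be used in a scratch / compute / cleanup
 * sequence:
 *
 *   cudaStream_t stream;
 *   cudaStreamCreate(&stream);
 *   int8_t *cbs_buffer = nullptr;
 *   scratch_cuda_circuit_bootstrap_64(&stream, 0, &cbs_buffer, glwe_dimension,
 *                                     lwe_dimension, polynomial_size,
 *                                     level_bsk, level_cbs, number_of_inputs,
 *                                     max_shared_memory, true);
 *   cuda_circuit_bootstrap_64(&stream, 0, d_ggsw_out, d_lwe_in, d_bsk,
 *                             d_fp_ksk, d_lut_indexes, cbs_buffer, delta_log,
 *                             polynomial_size, glwe_dimension, lwe_dimension,
 *                             level_bsk, base_log_bsk, level_pksk,
 *                             base_log_pksk, level_cbs, base_log_cbs,
 *                             number_of_inputs, max_shared_memory);
 *   cleanup_cuda_circuit_bootstrap(&stream, 0, &cbs_buffer);
 *   cudaStreamSynchronize(stream);
 */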

View File

@@ -1,239 +0,0 @@
#ifndef CBS_CUH
#define CBS_CUH
#include "bit_extraction.cuh"
#include "bootstrap_low_latency.cuh"
#include "device.h"
#include "keyswitch.cuh"
#include "polynomial/parameters.cuh"
#include "utils/timer.cuh"
/*
* Scalar multiplication by `value` for a batch of LWE ciphertexts.
* Works for any LWE input size.
* blockIdx.y refers to the input ciphertext id, blockIdx.x to the cbs level
*/
template <typename Torus, class params>
__global__ void shift_lwe_cbs(Torus *dst_shift, Torus *src, Torus value,
size_t lwe_size) {
size_t blockId = blockIdx.y * gridDim.x + blockIdx.x;
size_t threads_per_block = blockDim.x;
size_t opt = lwe_size / threads_per_block;
size_t rem = lwe_size & (threads_per_block - 1);
auto cur_dst = &dst_shift[blockId * lwe_size];
auto cur_src = &src[blockIdx.y * lwe_size];
size_t tid = threadIdx.x;
for (size_t i = 0; i < opt; i++) {
cur_dst[tid] = cur_src[tid] * value;
tid += threads_per_block;
}
if (threadIdx.x < rem)
cur_dst[tid] = cur_src[tid] * value;
}
/*
* Fill the lut; this is equivalent to a trivial encryption since the mask is all 0s.
* The LUT is filled with -alpha in each coefficient where
* alpha = 2^{log(q) - 1 - base_log * level}
* blockIdx.x refers to lut id
* The value is not passed as a parameter; it is computed inside the kernel
* because it depends on the lut id.
*/
template <typename Torus, class params>
__global__ void fill_lut_body_for_cbs(Torus *lut, uint32_t ciphertext_n_bits,
uint32_t base_log_cbs,
uint32_t glwe_dimension) {
Torus *cur_body = &lut[(blockIdx.x * (glwe_dimension + 1) + glwe_dimension) *
params::degree];
size_t tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt; i++) {
cur_body[tid] =
0ll -
(1ll << (ciphertext_n_bits - 1 - base_log_cbs * (blockIdx.x + 1)));
tid += params::degree / params::opt;
}
}
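// Worked example (illustrative values): for a 64-bit Torus
// (ciphertext_n_bits = 64), base_log_cbs = 10 and blockIdx.x = 0 (first CBS
// level), every body coefficient is set to
//   -(1 << (64 - 1 - 10 * 1)) = -2^53,
// i.e. -alpha with alpha = 2^{log2(q) - 1 - base_log * level}.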
/*
* Copy the pbs result (glwe_dimension + 1) times to build the fp-ks input.
* Each input ciphertext from lwe_src is copied (glwe_dimension + 1)
* times into lwe_dst, and then a value is added to the body.
* blockIdx.x refers to destination lwe ciphertext id: 'dst_lwe_id'
* 'src_lwe_id' = 'dst_lwe_id' / (glwe_dimension + 1)
*
* example: glwe_dimension = 1
* src_0 ... src_n
* / \ / \
* / \ / \
* dst_0 dst_1 dst_2n dst_2n+1
*/
template <typename Torus, class params>
__global__ void copy_add_lwe_cbs(Torus *lwe_dst, Torus *lwe_src,
uint32_t ciphertext_n_bits,
uint32_t base_log_cbs, uint32_t level_cbs,
uint32_t glwe_dimension) {
size_t tid = threadIdx.x;
size_t src_lwe_id = blockIdx.x / (glwe_dimension + 1);
size_t dst_lwe_id = blockIdx.x;
size_t cur_cbs_level = src_lwe_id % level_cbs + 1;
auto cur_src = &lwe_src[src_lwe_id * (glwe_dimension * params::degree + 1)];
auto cur_dst = &lwe_dst[dst_lwe_id * (glwe_dimension * params::degree + 1)];
auto cur_src_slice = cur_src + blockIdx.y * params::degree;
auto cur_dst_slice = cur_dst + blockIdx.y * params::degree;
#pragma unroll
for (int i = 0; i < params::opt; i++) {
cur_dst_slice[tid] = cur_src_slice[tid];
tid += params::degree / params::opt;
}
Torus val = 1ll << (ciphertext_n_bits - 1 - base_log_cbs * cur_cbs_level);
if (threadIdx.x == 0 && blockIdx.y == 0) {
cur_dst[glwe_dimension * params::degree] =
cur_src[glwe_dimension * params::degree] + val;
}
}
template <typename Torus>
__host__ __device__ uint64_t get_buffer_size_cbs(uint32_t glwe_dimension,
uint32_t lwe_dimension,
uint32_t polynomial_size,
uint32_t level_count_cbs,
uint32_t number_of_inputs) {
uint64_t buffer_size =
number_of_inputs * level_count_cbs * (glwe_dimension + 1) *
(glwe_dimension * polynomial_size + 1) *
sizeof(Torus) + // lwe_array_in_fp_ks_buffer
number_of_inputs * level_count_cbs *
(glwe_dimension * polynomial_size + 1) *
sizeof(Torus) + // lwe_array_out_pbs_buffer
number_of_inputs * level_count_cbs * (lwe_dimension + 1) *
sizeof(Torus) + // lwe_array_in_shifted_buffer
level_count_cbs * (glwe_dimension + 1) * polynomial_size *
sizeof(Torus); // lut_vector_cbs
return buffer_size + buffer_size % sizeof(double2);
}
template <typename Torus, typename STorus, typename params>
__host__ void scratch_circuit_bootstrap(
void *v_stream, uint32_t gpu_index, int8_t **cbs_buffer,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t level_bsk, uint32_t level_count_cbs, uint32_t number_of_inputs,
uint32_t max_shared_memory, bool allocate_gpu_memory) {
cudaSetDevice(gpu_index);
auto stream = static_cast<cudaStream_t *>(v_stream);
int pbs_count = number_of_inputs * level_count_cbs;
// allocate and initialize device pointers for circuit bootstrap
if (allocate_gpu_memory) {
uint64_t buffer_size = get_buffer_size_cbs<Torus>(
glwe_dimension, lwe_dimension, polynomial_size,
level_count_cbs, number_of_inputs) +
get_buffer_size_bootstrap_low_latency<Torus>(
glwe_dimension, polynomial_size, level_bsk,
pbs_count, max_shared_memory);
*cbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream, gpu_index);
}
scratch_bootstrap_low_latency<Torus, STorus, params>(
v_stream, gpu_index, cbs_buffer, glwe_dimension, polynomial_size,
level_bsk, pbs_count, max_shared_memory, false);
}
/*
* Host function for cuda circuit bootstrap.
* It executes device functions in specific order and manages
* parallelism
*/
template <typename Torus, class params>
__host__ void host_circuit_bootstrap(
void *v_stream, uint32_t gpu_index, Torus *ggsw_out, Torus *lwe_array_in,
double2 *fourier_bsk, Torus *fp_ksk_array, Torus *lut_vector_indexes,
int8_t *cbs_buffer, uint32_t delta_log, uint32_t polynomial_size,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t level_bsk,
uint32_t base_log_bsk, uint32_t level_pksk, uint32_t base_log_pksk,
uint32_t level_cbs, uint32_t base_log_cbs, uint32_t number_of_inputs,
uint32_t max_shared_memory) {
cudaSetDevice(gpu_index);
auto stream = static_cast<cudaStream_t *>(v_stream);
uint32_t ciphertext_n_bits = sizeof(Torus) * 8;
uint32_t lwe_size = lwe_dimension + 1;
int pbs_count = number_of_inputs * level_cbs;
dim3 blocks(level_cbs, number_of_inputs, 1);
int threads = 256;
// Always define the PBS buffer first, because it has the strongest memory
// alignment requirement (16 bytes for double2)
int8_t *pbs_buffer = (int8_t *)cbs_buffer;
Torus *lwe_array_out_pbs_buffer =
(Torus *)pbs_buffer +
(ptrdiff_t)(get_buffer_size_bootstrap_low_latency<Torus>(
glwe_dimension, polynomial_size, level_bsk, pbs_count,
max_shared_memory) /
sizeof(Torus));
Torus *lwe_array_in_shifted_buffer =
lwe_array_out_pbs_buffer +
(ptrdiff_t)(number_of_inputs * level_cbs *
(glwe_dimension * polynomial_size + 1));
Torus *lut_vector =
lwe_array_in_shifted_buffer +
(ptrdiff_t)(number_of_inputs * level_cbs * (lwe_dimension + 1));
Torus *lwe_array_in_fp_ks_buffer =
lut_vector +
(ptrdiff_t)(level_cbs * (glwe_dimension + 1) * polynomial_size);
// Shift the message LSB onto the padding bit; at this point we expect
// messages carrying only 1 bit of information
shift_lwe_cbs<Torus, params><<<blocks, threads, 0, *stream>>>(
lwe_array_in_shifted_buffer, lwe_array_in,
1LL << (ciphertext_n_bits - delta_log - 1), lwe_size);
// Add q/4 to center the error while computing a negacyclic LUT
add_to_body<Torus>
<<<pbs_count, 1, 0, *stream>>>(lwe_array_in_shifted_buffer, lwe_dimension,
1ll << (ciphertext_n_bits - 2));
// Fill lut (equivalent to trivial encryption as mask is 0s)
// The LUT is filled with -alpha in each coefficient where
// alpha = 2^{log(q) - 1 - base_log * level}
check_cuda_error(cudaMemsetAsync(lut_vector, 0,
level_cbs * (glwe_dimension + 1) *
polynomial_size * sizeof(Torus),
*stream));
fill_lut_body_for_cbs<Torus, params>
<<<level_cbs, params::degree / params::opt, 0, *stream>>>(
lut_vector, ciphertext_n_bits, base_log_cbs, glwe_dimension);
// Applying a negacyclic LUT on a ciphertext with one bit of message in the
// MSB and no bit of padding
host_bootstrap_low_latency<Torus, params>(
v_stream, gpu_index, lwe_array_out_pbs_buffer, lut_vector,
lut_vector_indexes, lwe_array_in_shifted_buffer, fourier_bsk, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log_bsk, level_bsk,
pbs_count, level_cbs, max_shared_memory);
dim3 copy_grid(pbs_count * (glwe_dimension + 1), glwe_dimension, 1);
dim3 copy_block(params::degree / params::opt, 1, 1);
// Copy the pbs result (glwe_dimension + 1) times to build the fp-ks input,
// adding the appropriate alpha to each body
copy_add_lwe_cbs<Torus, params><<<copy_grid, copy_block, 0, *stream>>>(
lwe_array_in_fp_ks_buffer, lwe_array_out_pbs_buffer, ciphertext_n_bits,
base_log_cbs, level_cbs, glwe_dimension);
cuda_fp_keyswitch_lwe_to_glwe(
v_stream, gpu_index, ggsw_out, lwe_array_in_fp_ks_buffer, fp_ksk_array,
glwe_dimension * polynomial_size, glwe_dimension, polynomial_size,
base_log_pksk, level_pksk, pbs_count * (glwe_dimension + 1),
glwe_dimension + 1);
}
#endif // CBS_CUH

View File

@@ -1,138 +0,0 @@
#ifndef GPU_BOOTSTRAP_COMMON_CUH
#define GPU_BOOTSTRAP_COMMON_CUH
#include <cassert>
#include <cstdint>
#include <cstdio>
#define SNT 1
#define dPI 6.283185307179586231995926937088
using sTorus = int32_t;
// using Torus = uint32_t;
using u32 = uint32_t;
using i32 = int32_t;
//--------------------------------------------------
// Basic double2 operations
__device__ inline double2 conjugate(const double2 num) {
double2 res;
res.x = num.x;
res.y = -num.y;
return res;
}
__device__ inline void operator+=(double2 &lh, const double2 rh) {
lh.x += rh.x;
lh.y += rh.y;
}
__device__ inline void operator-=(double2 &lh, const double2 rh) {
lh.x -= rh.x;
lh.y -= rh.y;
}
__device__ inline double2 operator+(const double2 a, const double2 b) {
double2 res;
res.x = a.x + b.x;
res.y = a.y + b.y;
return res;
}
__device__ inline double2 operator-(const double2 a, const double2 b) {
double2 res;
res.x = a.x - b.x;
res.y = a.y - b.y;
return res;
}
__device__ inline double2 operator*(const double2 a, const double2 b) {
double xx = a.x * b.x;
double xy = a.x * b.y;
double yx = a.y * b.x;
double yy = a.y * b.y;
double2 res;
// asm volatile("fma.rn.f64 %0, %1, %2, %3;": "=d"(res.x) : "d"(a.x),
// "d"(b.x), "d"(yy));
res.x = xx - yy;
res.y = xy + yx;
return res;
}
__device__ inline double2 operator*(const double2 a, double b) {
double2 res;
res.x = a.x * b;
res.y = a.y * b;
return res;
}
__device__ inline void operator*=(double2 &a, const double2 b) {
double tmp = a.x;
a.x *= b.x;
a.x -= a.y * b.y;
a.y *= b.x;
a.y += b.y * tmp;
}
__device__ inline double2 operator*(double a, double2 b) {
double2 res;
res.x = b.x * a;
res.y = b.y * a;
return res;
}
template <typename T> __global__ void print_debug_kernel(T *src, int N) {
for (int i = 0; i < N; i++) {
printf("%lu, ", src[i]);
}
}
template <typename T> void print_debug(const char *name, T *src, int N) {
printf("%s: ", name);
cudaDeviceSynchronize();
print_debug_kernel<<<1, 1>>>(src, N);
cudaDeviceSynchronize();
printf("\n");
}
template <typename Torus> struct int_mul_memory {
Torus *vector_result_sb;
Torus *block_mul_res;
Torus *small_lwe_vector;
Torus *lwe_pbs_out_array;
Torus *test_vector_array;
Torus *message_acc;
Torus *carry_acc;
Torus *test_vector_indexes;
Torus *tvi_message;
Torus *tvi_carry;
int8_t *pbs_buffer;
int p2p_gpu_count = 0;
cudaStream_t *streams[32];
int8_t *pbs_buffer_multi_gpu[32];
Torus *pbs_input_multi_gpu[32];
Torus *pbs_output_multi_gpu[32];
Torus *test_vector_multi_gpu[32];
Torus *tvi_lsb_multi_gpu[32];
Torus *tvi_msb_multi_gpu[32];
Torus *tvi_message_multi_gpu[32];
Torus *tvi_carry_multi_gpu[32];
Torus *bsk_multi_gpu[32];
Torus *ksk_multi_gpu[32];
Torus *device_to_device_buffer[8];
bool IsAppBuiltAs64() { return sizeof(void *) == 8; }
};
#endif

View File

@@ -1,498 +0,0 @@
#ifndef CNCRT_BSK_H
#define CNCRT_BSK_H
#include "bootstrap.h"
#include "bootstrap_multibit.h"
#include "device.h"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial.cuh"
#include <atomic>
#include <cstdint>
#include <err.h>
__device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
int glwe_dimension,
uint32_t level_count) {
return i * polynomial_size / 2 * (glwe_dimension + 1) * (glwe_dimension + 1) *
level_count;
}
////////////////////////////////////////////////
template <typename T>
__device__ T *get_ith_mask_kth_block(T *ptr, int i, int k, int level,
uint32_t polynomial_size,
int glwe_dimension, uint32_t level_count) {
return &ptr[get_start_ith_ggsw(i, polynomial_size, glwe_dimension,
level_count) +
level * polynomial_size / 2 * (glwe_dimension + 1) *
(glwe_dimension + 1) +
k * polynomial_size / 2 * (glwe_dimension + 1)];
}
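// Worked example (illustrative parameters): with polynomial_size = 1024
// (N / 2 = 512 complex coefficients per polynomial), glwe_dimension = 1 and
// level_count = 2, one GGSW occupies 512 * 2 * 2 * 2 = 4096 double2 values,
// so get_ith_mask_kth_block(ptr, 1, 0, 1, ...) points at offset
//   4096 + 1 * 512 * 4 + 0 * 512 * 2 = 6144.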
template <typename T>
__device__ T *get_ith_body_kth_block(T *ptr, int i, int k, int level,
uint32_t polynomial_size,
int glwe_dimension, uint32_t level_count) {
return &ptr[get_start_ith_ggsw(i, polynomial_size, glwe_dimension,
level_count) +
level * polynomial_size / 2 * (glwe_dimension + 1) *
(glwe_dimension + 1) +
k * polynomial_size / 2 * (glwe_dimension + 1) +
glwe_dimension * polynomial_size / 2];
}
////////////////////////////////////////////////
__device__ inline int get_start_ith_lwe(uint32_t i, uint32_t grouping_factor,
uint32_t polynomial_size,
uint32_t glwe_dimension,
uint32_t level_count) {
return i * (1 << grouping_factor) * polynomial_size / 2 *
(glwe_dimension + 1) * (glwe_dimension + 1) * level_count;
}
template <typename T>
__device__ T *get_multi_bit_ith_lwe_gth_group_kth_block(
T *ptr, int g, int i, int k, int level, uint32_t grouping_factor,
uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t level_count) {
T *ptr_group = ptr + get_start_ith_lwe(i, grouping_factor, polynomial_size,
glwe_dimension, level_count);
return get_ith_mask_kth_block(ptr_group, g, k, level, polynomial_size,
glwe_dimension, level_count);
}
////////////////////////////////////////////////
template <typename T, typename ST>
void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
uint32_t gpu_index, uint32_t input_lwe_dim,
uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size,
uint32_t total_polynomials) {
cudaSetDevice(gpu_index);
int shared_memory_size = sizeof(double) * polynomial_size;
// Here the buffer size is the size of double2 times the number of polynomials
// times the polynomial size over 2 because the polynomials are compressed
// into the complex domain to perform the FFT
size_t buffer_size =
total_polynomials * polynomial_size / 2 * sizeof(double2);
int gridSize = total_polynomials;
int blockSize = polynomial_size / choose_opt_amortized(polynomial_size);
double2 *h_bsk = (double2 *)malloc(buffer_size);
auto stream = static_cast<cudaStream_t *>(v_stream);
double2 *d_bsk = (double2 *)cuda_malloc_async(buffer_size, stream, gpu_index);
// Compress the real bsk into the complex domain and normalize by the maximum value of the Torus type
for (int i = 0; i < total_polynomials; i++) {
int complex_current_poly_idx = i * polynomial_size / 2;
int torus_current_poly_idx = i * polynomial_size;
for (int j = 0; j < polynomial_size / 2; j++) {
h_bsk[complex_current_poly_idx + j].x = src[torus_current_poly_idx + j];
h_bsk[complex_current_poly_idx + j].y =
src[torus_current_poly_idx + j + polynomial_size / 2];
h_bsk[complex_current_poly_idx + j].x /=
(double)std::numeric_limits<T>::max();
h_bsk[complex_current_poly_idx + j].y /=
(double)std::numeric_limits<T>::max();
}
}
cuda_memcpy_async_to_gpu(d_bsk, h_bsk, buffer_size, stream, gpu_index);
double2 *buffer;
switch (polynomial_size) {
case 256:
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_NSMFFT<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>,
cudaFuncCachePreferShared));
batch_NSMFFT<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest,
buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
batch_NSMFFT<FFTDegree<AmortizedDegree<256>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, *stream>>>(d_bsk, dest, buffer);
}
break;
case 512:
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_NSMFFT<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>,
cudaFuncCachePreferShared));
batch_NSMFFT<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest,
buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
batch_NSMFFT<FFTDegree<AmortizedDegree<512>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, *stream>>>(d_bsk, dest, buffer);
}
break;
case 1024:
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_NSMFFT<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>,
cudaFuncCachePreferShared));
batch_NSMFFT<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest,
buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
batch_NSMFFT<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, *stream>>>(d_bsk, dest, buffer);
}
break;
case 2048:
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_NSMFFT<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>,
cudaFuncCachePreferShared));
batch_NSMFFT<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest,
buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
batch_NSMFFT<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, *stream>>>(d_bsk, dest, buffer);
}
break;
case 4096:
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_NSMFFT<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>,
cudaFuncCachePreferShared));
batch_NSMFFT<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest,
buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
batch_NSMFFT<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, *stream>>>(d_bsk, dest, buffer);
}
break;
case 8192:
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_NSMFFT<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>,
cudaFuncCachePreferShared));
batch_NSMFFT<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest,
buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
batch_NSMFFT<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, *stream>>>(d_bsk, dest, buffer);
}
break;
case 16384:
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_NSMFFT<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, FULLSM>,
cudaFuncCachePreferShared));
batch_NSMFFT<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest,
buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
batch_NSMFFT<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, *stream>>>(d_bsk, dest, buffer);
}
break;
default:
errx(EXIT_FAILURE, "polynomial size %u is not supported. Supported values "
"are: 256, 512, 1024, 2048, 4096, 8192, 16384.", polynomial_size);
break;
}
cuda_drop_async(d_bsk, stream, gpu_index);
cuda_drop_async(buffer, stream, gpu_index);
free(h_bsk);
}
void cuda_convert_lwe_bootstrap_key_32(void *dest, void *src, void *v_stream,
uint32_t gpu_index,
uint32_t input_lwe_dim,
uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size) {
uint32_t total_polynomials =
input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * level_count;
cuda_convert_lwe_bootstrap_key<uint32_t, int32_t>(
(double2 *)dest, (int32_t *)src, v_stream, gpu_index, input_lwe_dim,
glwe_dim, level_count, polynomial_size, total_polynomials);
}
void cuda_convert_lwe_bootstrap_key_64(void *dest, void *src, void *v_stream,
uint32_t gpu_index,
uint32_t input_lwe_dim,
uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size) {
uint32_t total_polynomials =
input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * level_count;
cuda_convert_lwe_bootstrap_key<uint64_t, int64_t>(
(double2 *)dest, (int64_t *)src, v_stream, gpu_index, input_lwe_dim,
glwe_dim, level_count, polynomial_size, total_polynomials);
}
void cuda_convert_lwe_multi_bit_bootstrap_key_64(
void *dest, void *src, void *v_stream, uint32_t gpu_index,
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size, uint32_t grouping_factor) {
uint32_t total_polynomials = input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) *
level_count * (1 << grouping_factor) /
grouping_factor;
size_t buffer_size = total_polynomials * polynomial_size * sizeof(uint64_t);
cuda_memcpy_async_to_gpu((uint64_t *)dest, (uint64_t *)src, buffer_size,
(cudaStream_t *)v_stream, gpu_index);
}
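// Worked example (illustrative parameters): with input_lwe_dim = 768,
// glwe_dim = 1, level_count = 1 and grouping_factor = 3, the key holds
// 768 * 2 * 2 * 1 * 2^3 / 3 = 8192 polynomials; for polynomial_size = 1024
// and 64-bit coefficients the copy above transfers 8192 * 1024 * 8 B = 64 MiB.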
void cuda_fourier_polynomial_mul(void *_input1, void *_input2, void *_output,
void *v_stream, uint32_t gpu_index,
uint32_t polynomial_size,
uint32_t total_polynomials) {
auto stream = static_cast<cudaStream_t *>(v_stream);
auto input1 = (double2 *)_input1;
auto input2 = (double2 *)_input2;
auto output = (double2 *)_output;
size_t shared_memory_size = sizeof(double2) * polynomial_size / 2;
int gridSize = total_polynomials;
int blockSize = polynomial_size / choose_opt_amortized(polynomial_size);
double2 *buffer;
switch (polynomial_size) {
case 256:
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>,
FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>,
FULLSM>,
cudaFuncCachePreferShared));
batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(
input1, input2, output, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, *stream>>>(input1, input2, output, buffer);
}
break;
case 512:
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<521>, ForwardFFT>,
FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>,
FULLSM>,
cudaFuncCachePreferShared));
batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(
input1, input2, output, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, *stream>>>(input1, input2, output, buffer);
}
break;
case 1024:
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>,
FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>,
FULLSM>,
cudaFuncCachePreferShared));
batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(
input1, input2, output, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, *stream>>>(input1, input2, output, buffer);
}
break;
case 2048:
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>,
FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>,
FULLSM>,
cudaFuncCachePreferShared));
batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(
input1, input2, output, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, *stream>>>(input1, input2, output, buffer);
}
break;
case 4096:
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>,
FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>,
FULLSM>,
cudaFuncCachePreferShared));
batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(
input1, input2, output, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, *stream>>>(input1, input2, output, buffer);
}
break;
case 8192:
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>,
FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>,
FULLSM>,
cudaFuncCachePreferShared));
batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(
input1, input2, output, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, *stream>>>(input1, input2, output, buffer);
}
break;
case 16384:
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>,
FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>,
FULLSM>,
cudaFuncCachePreferShared));
batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>,
FULLSM>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(
input1, input2, output, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, *stream>>>(input1, input2, output, buffer);
}
break;
default:
errx(EXIT_FAILURE, "polynomial size %u is not supported. Supported values "
"are: 256, 512, 1024, 2048, 4096, 8192, 16384.", polynomial_size);
break;
}
cuda_drop_async(buffer, stream, gpu_index);
}
// We need these explicit instantiations so the compiler generates code for these template functions
template __device__ uint64_t *get_ith_mask_kth_block(uint64_t *ptr, int i,
int k, int level,
uint32_t polynomial_size,
int glwe_dimension,
uint32_t level_count);
template __device__ uint32_t *get_ith_mask_kth_block(uint32_t *ptr, int i,
int k, int level,
uint32_t polynomial_size,
int glwe_dimension,
uint32_t level_count);
template __device__ double2 *get_ith_mask_kth_block(double2 *ptr, int i, int k,
int level,
uint32_t polynomial_size,
int glwe_dimension,
uint32_t level_count);
template __device__ uint64_t *get_ith_body_kth_block(uint64_t *ptr, int i,
int k, int level,
uint32_t polynomial_size,
int glwe_dimension,
uint32_t level_count);
template __device__ uint32_t *get_ith_body_kth_block(uint32_t *ptr, int i,
int k, int level,
uint32_t polynomial_size,
int glwe_dimension,
uint32_t level_count);
template __device__ double2 *get_ith_body_kth_block(double2 *ptr, int i, int k,
int level,
uint32_t polynomial_size,
int glwe_dimension,
uint32_t level_count);
template __device__ uint64_t *get_multi_bit_ith_lwe_gth_group_kth_block(
uint64_t *ptr, int g, int i, int k, int level, uint32_t grouping_factor,
uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t level_count);
template __device__ double2 *get_multi_bit_ith_lwe_gth_group_kth_block(
double2 *ptr, int g, int i, int k, int level, uint32_t grouping_factor,
uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t level_count);
#endif // CNCRT_BSK_H

View File

@@ -1,50 +0,0 @@
#ifndef CIPHERTEXT_H
#define CIPHERTEXT_H
#include "ciphertext.h"
#include "device.h"
#include <cstdint>
template <typename T>
void cuda_convert_lwe_ciphertext_vector_to_gpu(T *dest, T *src, void *v_stream,
uint32_t gpu_index,
uint32_t number_of_cts,
uint32_t lwe_dimension) {
cudaSetDevice(gpu_index);
cudaStream_t *stream = static_cast<cudaStream_t *>(v_stream);
uint64_t size = number_of_cts * (lwe_dimension + 1) * sizeof(T);
cuda_memcpy_async_to_gpu(dest, src, size, stream, gpu_index);
}
void cuda_convert_lwe_ciphertext_vector_to_gpu_64(void *dest, void *src,
void *v_stream,
uint32_t gpu_index,
uint32_t number_of_cts,
uint32_t lwe_dimension) {
cuda_convert_lwe_ciphertext_vector_to_gpu<uint64_t>(
(uint64_t *)dest, (uint64_t *)src, v_stream, gpu_index, number_of_cts,
lwe_dimension);
}
template <typename T>
void cuda_convert_lwe_ciphertext_vector_to_cpu(T *dest, T *src, void *v_stream,
uint32_t gpu_index,
uint32_t number_of_cts,
uint32_t lwe_dimension) {
cudaSetDevice(gpu_index);
cudaStream_t *stream = static_cast<cudaStream_t *>(v_stream);
uint64_t size = number_of_cts * (lwe_dimension + 1) * sizeof(T);
cuda_memcpy_async_to_cpu(dest, src, size, stream, gpu_index);
}
void cuda_convert_lwe_ciphertext_vector_to_cpu_64(void *dest, void *src,
void *v_stream,
uint32_t gpu_index,
uint32_t number_of_cts,
uint32_t lwe_dimension) {
cuda_convert_lwe_ciphertext_vector_to_cpu<uint64_t>(
(uint64_t *)dest, (uint64_t *)src, v_stream, gpu_index, number_of_cts,
lwe_dimension);
}
#endif
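Below is a minimal host-side round trip, sketching how these conversion entry points are meant to be driven; the sizes and parameter values are illustrative assumptions, and the helper declarations are the ones from device.h above.
// Sketch only: assumes the declarations from ciphertext.h / device.h.
#include "ciphertext.h"
#include "device.h"
#include <cstdint>
#include <vector>
void example_lwe_round_trip() {
  uint32_t gpu_index = 0;
  uint32_t number_of_cts = 16;
  uint32_t lwe_dimension = 512; // illustrative value
  uint64_t size = (uint64_t)number_of_cts * (lwe_dimension + 1);
  cudaStream_t *stream = cuda_create_stream(gpu_index);
  std::vector<uint64_t> h_in(size, 0), h_out(size, 0);
  // Each ciphertext occupies (lwe_dimension + 1) words: the mask plus the body.
  uint64_t *d_ct = (uint64_t *)cuda_malloc(size * sizeof(uint64_t), gpu_index);
  cuda_convert_lwe_ciphertext_vector_to_gpu_64(d_ct, h_in.data(), stream,
                                               gpu_index, number_of_cts,
                                               lwe_dimension);
  cuda_convert_lwe_ciphertext_vector_to_cpu_64(h_out.data(), d_ct, stream,
                                               gpu_index, number_of_cts,
                                               lwe_dimension);
  cuda_synchronize_stream(stream);
  cuda_drop(d_ct, gpu_index);
  cuda_destroy_stream(stream, gpu_index);
}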

View File

@@ -1,131 +0,0 @@
#ifndef CNCRT_CRYPTO_H
#define CNCRT_CRYPTO_H
#include "polynomial/polynomial.cuh"
#include <cstdint>
/**
* GadgetMatrix implements the iterator design pattern to decompose a set of
* num_poly consecutive polynomials with degree params::degree. A total of
 * level_count levels is expected, and each call to decompose_and_compress_next()
 * writes the next decomposition level to the result. It is also possible to
 * advance by an arbitrary number of levels using decompose_and_compress_level().
*
* This class always decomposes the entire set of num_poly polynomials.
* By default, it works on a single polynomial.
*/
#pragma once
template <typename T, class params> class GadgetMatrix {
private:
uint32_t level_count;
uint32_t base_log;
uint32_t mask;
uint32_t halfbg;
uint32_t num_poly;
T offset;
int current_level;
T mask_mod_b;
T *state;
public:
__device__ GadgetMatrix(uint32_t base_log, uint32_t level_count, T *state,
uint32_t num_poly = 1)
: base_log(base_log), level_count(level_count), num_poly(num_poly),
state(state) {
mask_mod_b = (1ll << base_log) - 1ll;
current_level = level_count;
int tid = threadIdx.x;
for (int i = 0; i < num_poly * params::opt; i++) {
state[tid] >>= (sizeof(T) * 8 - base_log * level_count);
tid += params::degree / params::opt;
}
synchronize_threads_in_block();
}
// Decomposes all polynomials at once
__device__ void decompose_and_compress_next(double2 *result) {
for (int j = 0; j < num_poly; j++) {
auto result_slice = result + j * params::degree / 2;
decompose_and_compress_next_polynomial(result_slice, j);
}
}
// Decomposes a single polynomial
__device__ void decompose_and_compress_next_polynomial(double2 *result,
int j) {
if (j == 0)
current_level -= 1;
int tid = threadIdx.x;
auto state_slice = state + j * params::degree;
for (int i = 0; i < params::opt / 2; i++) {
T res_re = state_slice[tid] & mask_mod_b;
T res_im = state_slice[tid + params::degree / 2] & mask_mod_b;
state_slice[tid] >>= base_log;
state_slice[tid + params::degree / 2] >>= base_log;
T carry_re = ((res_re - 1ll) | state_slice[tid]) & res_re;
T carry_im =
((res_im - 1ll) | state_slice[tid + params::degree / 2]) & res_im;
carry_re >>= (base_log - 1);
carry_im >>= (base_log - 1);
state_slice[tid] += carry_re;
state_slice[tid + params::degree / 2] += carry_im;
res_re -= carry_re << base_log;
res_im -= carry_im << base_log;
result[tid].x = (int32_t)res_re;
result[tid].y = (int32_t)res_im;
tid += params::degree / params::opt;
}
synchronize_threads_in_block();
}
__device__ void decompose_and_compress_level(double2 *result, int level) {
for (int i = 0; i < level_count - level; i++)
decompose_and_compress_next(result);
}
};
template <typename T> class GadgetMatrixSingle {
private:
uint32_t level_count;
uint32_t base_log;
uint32_t mask;
uint32_t halfbg;
T offset;
public:
__device__ GadgetMatrixSingle(uint32_t base_log, uint32_t level_count)
: base_log(base_log), level_count(level_count) {
uint32_t bg = 1 << base_log;
this->halfbg = bg / 2;
this->mask = bg - 1;
T temp = 0;
for (int i = 0; i < this->level_count; i++) {
temp += 1ULL << (sizeof(T) * 8 - (i + 1) * this->base_log);
}
this->offset = temp * this->halfbg;
}
__device__ T decompose_one_level_single(T element, uint32_t level) {
T s = element + this->offset;
uint32_t decal = (sizeof(T) * 8 - (level + 1) * this->base_log);
T temp1 = (s >> decal) & this->mask;
return (T)(temp1 - this->halfbg);
}
};
template <typename Torus>
__device__ Torus decompose_one(Torus &state, Torus mask_mod_b, int base_log) {
Torus res = state & mask_mod_b;
state >>= base_log;
Torus carry = ((res - 1ll) | state) & res;
carry >>= base_log - 1;
state += carry;
res -= carry << base_log;
return res;
}
#endif // CNCRT_CRYPTO_H
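For intuition, here is a host-side sketch (not part of the GPU build) of the same balanced signed decomposition performed by decompose_one above, together with a reconstruction check; the gadget parameters and the input value are illustrative assumptions.
// Host-side sketch: illustrates the balanced digit decomposition and checks
// that the digits reconstruct the truncated input modulo 2^64.
#include <cassert>
#include <cstdint>
static uint64_t decompose_one_host(uint64_t &state, uint64_t mask_mod_b,
                                   int base_log) {
  uint64_t res = state & mask_mod_b;
  state >>= base_log;
  uint64_t carry = ((res - 1ull) | state) & res;
  carry >>= base_log - 1;
  state += carry;
  res -= carry << base_log;
  return res; // digit in [-beta/2, beta/2), stored as a wrapped uint64_t
}
void example_decomposition() {
  const int base_log = 8, level_count = 3; // illustrative gadget parameters
  const uint64_t mask_mod_b = (1ull << base_log) - 1ull;
  uint64_t a = 0x123456789abcdef0ull;
  // Keep only the top base_log * level_count bits (closest-multiple rounding
  // is omitted here for brevity; see round_to_closest_multiple in torus.cuh).
  uint64_t state = a >> (64 - base_log * level_count);
  uint64_t rounded = state << (64 - base_log * level_count);
  // Digits come out least-significant first: digit j has weight
  // q / beta^(level_count - j).
  uint64_t acc = 0;
  for (int j = 0; j < level_count; j++) {
    uint64_t d = decompose_one_host(state, mask_mod_b, base_log);
    acc += d << (64 - base_log * (level_count - j));
  }
  assert(acc == rounded); // reconstruction holds modulo 2^64
}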

View File

@@ -1,73 +0,0 @@
#ifndef CNCRT_GGSW_CUH
#define CNCRT_GGSW_CUH
#include "device.h"
#include "polynomial/parameters.cuh"
template <typename T, typename ST, class params, sharedMemDegree SMD>
__global__ void device_batch_fft_ggsw_vector(double2 *dest, T *src,
int8_t *device_mem) {
extern __shared__ int8_t sharedmem[];
double2 *selected_memory;
if constexpr (SMD == FULLSM)
selected_memory = (double2 *)sharedmem;
else
    selected_memory = (double2 *)&device_mem[blockIdx.x * params::degree];
// Compression
int offset = blockIdx.x * blockDim.x;
int tid = threadIdx.x;
int log_2_opt = params::opt >> 1;
#pragma unroll
for (int i = 0; i < log_2_opt; i++) {
ST x = src[(tid) + params::opt * offset];
ST y = src[(tid + params::degree / 2) + params::opt * offset];
selected_memory[tid].x = x / (double)std::numeric_limits<T>::max();
selected_memory[tid].y = y / (double)std::numeric_limits<T>::max();
tid += params::degree / params::opt;
}
synchronize_threads_in_block();
// Switch to the FFT space
NSMFFT_direct<HalfDegree<params>>(selected_memory);
synchronize_threads_in_block();
// Write the output to global memory
tid = threadIdx.x;
#pragma unroll
for (int j = 0; j < log_2_opt; j++) {
dest[tid + (params::opt >> 1) * offset] = selected_memory[tid];
tid += params::degree / params::opt;
}
}
/**
 * Applies the FFT transform to a sequence of GGSW ciphertexts already in
 * global memory
*/
template <typename T, typename ST, class params>
void batch_fft_ggsw_vector(cudaStream_t *stream, double2 *dest, T *src,
int8_t *d_mem, uint32_t r, uint32_t glwe_dim,
uint32_t polynomial_size, uint32_t level_count,
uint32_t gpu_index, uint32_t max_shared_memory) {
int shared_memory_size = sizeof(double) * polynomial_size;
int gridSize = r * (glwe_dim + 1) * (glwe_dim + 1) * level_count;
int blockSize = polynomial_size / params::opt;
if (max_shared_memory < shared_memory_size) {
device_batch_fft_ggsw_vector<T, ST, params, NOSM>
<<<gridSize, blockSize, 0, *stream>>>(dest, src, d_mem);
} else {
device_batch_fft_ggsw_vector<T, ST, params, FULLSM>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(dest, src,
d_mem);
}
check_cuda_error(cudaGetLastError());
}
#endif // CNCRT_GGSW_CUH

View File

@@ -1,74 +0,0 @@
#ifndef CNCRT_TORUS_H
#define CNCRT_TORUS_H
#include "types/int128.cuh"
#include <limits>
template <typename T>
__device__ inline void typecast_double_to_torus(double x, T &r) {
r = T(x);
}
template <>
__device__ inline void typecast_double_to_torus<uint32_t>(double x,
uint32_t &r) {
r = __double2uint_rn(x);
}
template <>
__device__ inline void typecast_double_to_torus<uint64_t>(double x,
uint64_t &r) {
  // The ull intrinsic does not behave in the same way on all architectures;
  // on some platforms this causes the cmux tree test to fail, hence the
  // intrinsic is not used here.
uint128 nnnn = make_uint128_from_float(x);
uint64_t lll = nnnn.lo_;
r = lll;
}
template <typename T>
__device__ inline T round_to_closest_multiple(T x, uint32_t base_log,
uint32_t level_count) {
T shift = sizeof(T) * 8 - level_count * base_log;
T mask = 1ll << (shift - 1);
T b = (x & mask) >> (shift - 1);
T res = x >> shift;
res += b;
res <<= shift;
return res;
}
template <typename T>
__device__ __forceinline__ void rescale_torus_element(T element, T &output,
uint32_t log_shift) {
output =
round((double)element / (double(std::numeric_limits<T>::max()) + 1.0) *
(double)log_shift);
}
template <typename T>
__device__ __forceinline__ T rescale_torus_element(T element,
uint32_t log_shift) {
return round((double)element / (double(std::numeric_limits<T>::max()) + 1.0) *
(double)log_shift);
}
template <>
__device__ __forceinline__ void
rescale_torus_element<uint32_t>(uint32_t element, uint32_t &output,
uint32_t log_shift) {
output =
round(__uint2double_rn(element) /
(__uint2double_rn(std::numeric_limits<uint32_t>::max()) + 1.0) *
__uint2double_rn(log_shift));
}
template <>
__device__ __forceinline__ void
rescale_torus_element<uint64_t>(uint64_t element, uint64_t &output,
uint32_t log_shift) {
output = round(__ull2double_rn(element) /
(__ull2double_rn(std::numeric_limits<uint64_t>::max()) + 1.0) *
__uint2double_rn(log_shift));
}
#endif // CNCRT_TORUS_H
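As a quick illustration of round_to_closest_multiple, here is a host-side sketch with illustrative parameter values: the result is the multiple of 2^(bits - level_count * base_log) closest to the input, which is the starting point the decomposition expects.
// Host-side sketch mirroring round_to_closest_multiple<uint64_t> above.
#include <cassert>
#include <cstdint>
static uint64_t round_to_closest_multiple_host(uint64_t x, uint32_t base_log,
                                               uint32_t level_count) {
  uint64_t shift = 64 - (uint64_t)level_count * base_log;
  uint64_t mask = 1ull << (shift - 1);    // bit just below the kept window
  uint64_t b = (x & mask) >> (shift - 1); // 1 if we must round up
  return ((x >> shift) + b) << shift;
}
void example_rounding() {
  // With base_log = 8 and level_count = 3 the step is 2^40.
  uint64_t step = 1ull << 40;
  uint64_t x = 5 * step + (step / 2);     // exactly halfway
  assert(round_to_closest_multiple_host(x, 8, 3) == 6 * step); // ties round up
  uint64_t y = 5 * step + (step / 2) - 1; // just below halfway
  assert(round_to_closest_multiple_host(y, 8, 3) == 5 * step);
}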

View File

@@ -1,243 +0,0 @@
#include "device.h"
#include <cstdint>
#include <cuda_runtime.h>
/// Unsafe function to create a CUDA stream, must first check that the GPU exists
cudaStream_t *cuda_create_stream(uint32_t gpu_index) {
cudaSetDevice(gpu_index);
cudaStream_t *stream = new cudaStream_t;
cudaStreamCreate(stream);
return stream;
}
/// Unsafe function to destroy a CUDA stream, must first check that the GPU exists
int cuda_destroy_stream(cudaStream_t *stream, uint32_t gpu_index) {
cudaSetDevice(gpu_index);
cudaStreamDestroy(*stream);
return 0;
}
/// Unsafe function that will try to allocate even if gpu_index is invalid
/// or if there's not enough memory. A safe wrapper around it must call
/// cuda_check_valid_malloc() first
void *cuda_malloc(uint64_t size, uint32_t gpu_index) {
cudaSetDevice(gpu_index);
void *ptr;
cudaMalloc((void **)&ptr, size);
check_cuda_error(cudaGetLastError());
return ptr;
}
/// Allocates a size-byte array in device memory. Tries to do it
/// asynchronously.
void *cuda_malloc_async(uint64_t size, cudaStream_t *stream,
uint32_t gpu_index) {
cudaSetDevice(gpu_index);
void *ptr;
#ifndef CUDART_VERSION
#error CUDART_VERSION Undefined!
#elif (CUDART_VERSION >= 11020)
int support_async_alloc;
check_cuda_error(cudaDeviceGetAttribute(
&support_async_alloc, cudaDevAttrMemoryPoolsSupported, gpu_index));
if (support_async_alloc) {
check_cuda_error(cudaMallocAsync((void **)&ptr, size, *stream));
} else {
check_cuda_error(cudaMalloc((void **)&ptr, size));
}
#else
check_cuda_error(cudaMalloc((void **)&ptr, size));
#endif
return ptr;
}
/// Checks that allocation is valid
/// 0: valid
/// -1: invalid, not enough memory in device
/// -2: invalid, gpu index doesn't exist
int cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index) {
if (gpu_index >= cuda_get_number_of_gpus()) {
// error code: invalid gpu_index
return -2;
}
cudaSetDevice(gpu_index);
size_t total_mem, free_mem;
cudaMemGetInfo(&free_mem, &total_mem);
if (size > free_mem) {
// error code: not enough memory
return -1;
}
return 0;
}
/// Returns
/// -> 0 if Cooperative Groups is not supported.
/// -> 1 otherwise
int cuda_check_support_cooperative_groups() {
int cooperative_groups_supported = 0;
cudaDeviceGetAttribute(&cooperative_groups_supported,
cudaDevAttrCooperativeLaunch, 0);
return cooperative_groups_supported > 0;
}
/// Tries to copy memory to the GPU asynchronously
/// 0: success
/// -1: error, invalid device pointer
/// -2: error, gpu index doesn't exist
/// -3: error, zero copy size
int cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
cudaStream_t *stream, uint32_t gpu_index) {
if (size == 0) {
// error code: zero copy size
return -3;
}
if (gpu_index >= cuda_get_number_of_gpus()) {
// error code: invalid gpu_index
return -2;
}
cudaPointerAttributes attr;
cudaPointerGetAttributes(&attr, dest);
if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
// error code: invalid device pointer
return -1;
}
cudaSetDevice(gpu_index);
check_cuda_error(
cudaMemcpyAsync(dest, src, size, cudaMemcpyHostToDevice, *stream));
return 0;
}
/// Synchronizes device
/// 0: success
/// -2: error, gpu index doesn't exist
int cuda_synchronize_device(uint32_t gpu_index) {
if (gpu_index >= cuda_get_number_of_gpus()) {
// error code: invalid gpu_index
return -2;
}
cudaSetDevice(gpu_index);
cudaDeviceSynchronize();
return 0;
}
int cuda_memset_async(void *dest, uint64_t val, uint64_t size,
cudaStream_t *stream, uint32_t gpu_index) {
if (size == 0) {
// error code: zero copy size
return -3;
}
if (gpu_index >= cuda_get_number_of_gpus()) {
// error code: invalid gpu_index
return -2;
}
cudaPointerAttributes attr;
cudaPointerGetAttributes(&attr, dest);
if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
// error code: invalid device pointer
return -1;
}
cudaSetDevice(gpu_index);
cudaMemsetAsync(dest, val, size, *stream);
return 0;
}
/// Tries to copy memory back to the CPU asynchronously
/// 0: success
/// -1: error, invalid device pointer
/// -2: error, gpu index doesn't exist
/// -3: error, zero copy size
int cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
cudaStream_t *stream, uint32_t gpu_index) {
if (size == 0) {
// error code: zero copy size
return -3;
}
if (gpu_index >= cuda_get_number_of_gpus()) {
// error code: invalid gpu_index
return -2;
}
cudaPointerAttributes attr;
cudaPointerGetAttributes(&attr, src);
if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
// error code: invalid device pointer
return -1;
}
cudaSetDevice(gpu_index);
check_cuda_error(
cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToHost, *stream));
return 0;
}
/// Return number of GPUs available
int cuda_get_number_of_gpus() {
int num_gpus;
cudaGetDeviceCount(&num_gpus);
return num_gpus;
}
/// Drop a cuda array
int cuda_drop(void *ptr, uint32_t gpu_index) {
if (gpu_index >= cuda_get_number_of_gpus()) {
// error code: invalid gpu_index
return -2;
}
cudaSetDevice(gpu_index);
check_cuda_error(cudaFree(ptr));
return 0;
}
/// Drop a cuda array. Tries to do it asynchronously
int cuda_drop_async(void *ptr, cudaStream_t *stream, uint32_t gpu_index) {
cudaSetDevice(gpu_index);
#ifndef CUDART_VERSION
#error CUDART_VERSION Undefined!
#elif (CUDART_VERSION >= 11020)
int support_async_alloc;
check_cuda_error(cudaDeviceGetAttribute(
&support_async_alloc, cudaDevAttrMemoryPoolsSupported, gpu_index));
if (support_async_alloc) {
check_cuda_error(cudaFreeAsync(ptr, *stream));
} else {
check_cuda_error(cudaFree(ptr));
}
#else
check_cuda_error(cudaFree(ptr));
#endif
return 0;
}
/// Get the maximum size for the shared memory
int cuda_get_max_shared_memory(uint32_t gpu_index) {
if (gpu_index >= cuda_get_number_of_gpus()) {
// error code: invalid gpu_index
return -2;
}
cudaSetDevice(gpu_index);
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, gpu_index);
int max_shared_memory = 0;
if (prop.major >= 6) {
max_shared_memory = prop.sharedMemPerMultiprocessor;
} else {
max_shared_memory = prop.sharedMemPerBlock;
}
return max_shared_memory;
}
int cuda_synchronize_stream(void *v_stream) {
auto stream = static_cast<cudaStream_t *>(v_stream);
cudaStreamSynchronize(*stream);
return 0;
}
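A hedged usage sketch for these helpers, checking the negative error codes documented above; the buffer size is an illustrative assumption.
// Sketch only: exercises the error-code conventions of the helpers above.
#include "device.h"
#include <cstdint>
#include <cstdio>
int example_device_usage(uint32_t gpu_index) {
  uint64_t size = 1 << 20; // 1 MiB, illustrative
  if (cuda_check_valid_malloc(size, gpu_index) != 0) {
    fprintf(stderr, "invalid gpu index or not enough device memory\n");
    return -1;
  }
  cudaStream_t *stream = cuda_create_stream(gpu_index);
  void *d_buf = cuda_malloc_async(size, stream, gpu_index);
  if (cuda_memset_async(d_buf, 0, size, stream, gpu_index) != 0) {
    fprintf(stderr, "memset failed (bad pointer, gpu index or size)\n");
  }
  cuda_drop_async(d_buf, stream, gpu_index);
  cuda_synchronize_stream(stream);
  cuda_destroy_stream(stream, gpu_index);
  return 0;
}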

View File

@@ -1,739 +0,0 @@
#ifndef GPU_BOOTSTRAP_FFT_CUH
#define GPU_BOOTSTRAP_FFT_CUH
#include "complex/operations.cuh"
#include "polynomial/functions.cuh"
#include "polynomial/parameters.cuh"
#include "twiddles.cuh"
/*
* Direct negacyclic FFT:
* - before the FFT the N real coefficients are stored into a
* N/2 sized complex with the even coefficients in the real part
* and the odd coefficients in the imaginary part. This is referred to
* as the half-size FFT
* - when calling BNSMFFT_direct for the forward negacyclic FFT of PBS,
* opt is divided by 2 because the butterfly pattern is always applied
* between pairs of coefficients
* - instead of twisting each coefficient A_j before the FFT by
* multiplying by the w^j roots of unity (aka twiddles, w=exp(-i pi /N)),
* the FFT is modified, and for each level k of the FFT the twiddle:
* w_j,k = exp(-i pi j/2^k)
* is replaced with:
* \zeta_j,k = exp(-i pi (2j-1)/2^k)
*/
template <class params> __device__ void NSMFFT_direct(double2 *A) {
  /* We don't perform bit reversal here, since the twiddles are already stored
   * in bit-reversed order.
   * Each thread is always in charge of "opt/2" pairs of coefficients,
   * which is why we always loop through N/2 by N/opt strides.
   * The pragma unroll instruction tells the compiler to unroll the
   * full loop, which should increase performance.
   */
size_t tid = threadIdx.x;
size_t twid_id;
size_t i1, i2;
double2 u, v, w;
// level 1
  // we don't perform a full complex multiplication on level 1 since we have
  // only one twiddle; its real and imaginary parts are equal, so we can
  // multiply with simpler operations
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
i1 = tid;
i2 = tid + params::degree / 2;
u = A[i1];
v.x = (A[i2].x - A[i2].y) * 0.707106781186547461715008466854;
v.y = (A[i2].x + A[i2].y) * 0.707106781186547461715008466854;
A[i1].x += v.x;
A[i1].y += v.y;
A[i2].x = u.x - v.x;
A[i2].y = u.y - v.y;
tid += params::degree / params::opt;
}
__syncthreads();
// level 2
  // from this level on there is more than one twiddle and none of them has
  // equal real and imaginary parts, so a full complex multiplication is needed.
  // For each level, params::degree / 2^level is the number of coefficients
  // inside each chunk of that level.
//
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 4);
i1 = 2 * (params::degree / 4) * twid_id + (tid & (params::degree / 4 - 1));
i2 = i1 + params::degree / 4;
w = negtwiddles[twid_id + 2];
u = A[i1];
v.x = A[i2].x * w.x - A[i2].y * w.y;
v.y = A[i2].y * w.x + A[i2].x * w.y;
A[i1].x += v.x;
A[i1].y += v.y;
A[i2].x = u.x - v.x;
A[i2].y = u.y - v.y;
tid += params::degree / params::opt;
}
__syncthreads();
// level 3
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 8);
i1 = 2 * (params::degree / 8) * twid_id + (tid & (params::degree / 8 - 1));
i2 = i1 + params::degree / 8;
w = negtwiddles[twid_id + 4];
u = A[i1];
v.x = A[i2].x * w.x - A[i2].y * w.y;
v.y = A[i2].y * w.x + A[i2].x * w.y;
A[i1].x += v.x;
A[i1].y += v.y;
A[i2].x = u.x - v.x;
A[i2].y = u.y - v.y;
tid += params::degree / params::opt;
}
__syncthreads();
// level 4
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 16);
i1 =
2 * (params::degree / 16) * twid_id + (tid & (params::degree / 16 - 1));
i2 = i1 + params::degree / 16;
w = negtwiddles[twid_id + 8];
u = A[i1];
v.x = A[i2].x * w.x - A[i2].y * w.y;
v.y = A[i2].y * w.x + A[i2].x * w.y;
A[i1].x += v.x;
A[i1].y += v.y;
A[i2].x = u.x - v.x;
A[i2].y = u.y - v.y;
tid += params::degree / params::opt;
}
__syncthreads();
// level 5
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 32);
i1 =
2 * (params::degree / 32) * twid_id + (tid & (params::degree / 32 - 1));
i2 = i1 + params::degree / 32;
w = negtwiddles[twid_id + 16];
u = A[i1];
v.x = A[i2].x * w.x - A[i2].y * w.y;
v.y = A[i2].y * w.x + A[i2].x * w.y;
A[i1].x += v.x;
A[i1].y += v.y;
A[i2].x = u.x - v.x;
A[i2].y = u.y - v.y;
tid += params::degree / params::opt;
}
__syncthreads();
// level 6
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 64);
i1 =
2 * (params::degree / 64) * twid_id + (tid & (params::degree / 64 - 1));
i2 = i1 + params::degree / 64;
w = negtwiddles[twid_id + 32];
u = A[i1];
v.x = A[i2].x * w.x - A[i2].y * w.y;
v.y = A[i2].y * w.x + A[i2].x * w.y;
A[i1].x += v.x;
A[i1].y += v.y;
A[i2].x = u.x - v.x;
A[i2].y = u.y - v.y;
tid += params::degree / params::opt;
}
__syncthreads();
// level 7
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 128);
i1 = 2 * (params::degree / 128) * twid_id +
(tid & (params::degree / 128 - 1));
i2 = i1 + params::degree / 128;
w = negtwiddles[twid_id + 64];
u = A[i1];
v.x = A[i2].x * w.x - A[i2].y * w.y;
v.y = A[i2].y * w.x + A[i2].x * w.y;
A[i1].x += v.x;
A[i1].y += v.y;
A[i2].x = u.x - v.x;
A[i2].y = u.y - v.y;
tid += params::degree / params::opt;
}
__syncthreads();
  // From level 8 on, we need to check the size of params::degree: the minimum
  // supported actual polynomial size is 256, which is halved to a compressed
  // size of 128, so the first 7 butterfly levels are always needed. Since the
  // butterfly levels are hardcoded, from here we must check whether the
  // polynomial size is big enough to require each specific level.
if constexpr (params::degree >= 256) {
// level 8
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 256);
i1 = 2 * (params::degree / 256) * twid_id +
(tid & (params::degree / 256 - 1));
i2 = i1 + params::degree / 256;
w = negtwiddles[twid_id + 128];
u = A[i1];
v.x = A[i2].x * w.x - A[i2].y * w.y;
v.y = A[i2].y * w.x + A[i2].x * w.y;
A[i1].x += v.x;
A[i1].y += v.y;
A[i2].x = u.x - v.x;
A[i2].y = u.y - v.y;
tid += params::degree / params::opt;
}
__syncthreads();
}
if constexpr (params::degree >= 512) {
// level 9
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 512);
i1 = 2 * (params::degree / 512) * twid_id +
(tid & (params::degree / 512 - 1));
i2 = i1 + params::degree / 512;
w = negtwiddles[twid_id + 256];
u = A[i1];
v.x = A[i2].x * w.x - A[i2].y * w.y;
v.y = A[i2].y * w.x + A[i2].x * w.y;
A[i1].x += v.x;
A[i1].y += v.y;
A[i2].x = u.x - v.x;
A[i2].y = u.y - v.y;
tid += params::degree / params::opt;
}
__syncthreads();
}
if constexpr (params::degree >= 1024) {
// level 10
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 1024);
i1 = 2 * (params::degree / 1024) * twid_id +
(tid & (params::degree / 1024 - 1));
i2 = i1 + params::degree / 1024;
w = negtwiddles[twid_id + 512];
u = A[i1];
v.x = A[i2].x * w.x - A[i2].y * w.y;
v.y = A[i2].y * w.x + A[i2].x * w.y;
A[i1].x += v.x;
A[i1].y += v.y;
A[i2].x = u.x - v.x;
A[i2].y = u.y - v.y;
tid += params::degree / params::opt;
}
__syncthreads();
}
if constexpr (params::degree >= 2048) {
// level 11
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 2048);
i1 = 2 * (params::degree / 2048) * twid_id +
(tid & (params::degree / 2048 - 1));
i2 = i1 + params::degree / 2048;
w = negtwiddles[twid_id + 1024];
u = A[i1];
v.x = A[i2].x * w.x - A[i2].y * w.y;
v.y = A[i2].y * w.x + A[i2].x * w.y;
A[i1].x += v.x;
A[i1].y += v.y;
A[i2].x = u.x - v.x;
A[i2].y = u.y - v.y;
tid += params::degree / params::opt;
}
__syncthreads();
}
if constexpr (params::degree >= 4096) {
// level 12
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 4096);
i1 = 2 * (params::degree / 4096) * twid_id +
(tid & (params::degree / 4096 - 1));
i2 = i1 + params::degree / 4096;
w = negtwiddles[twid_id + 2048];
u = A[i1];
v.x = A[i2].x * w.x - A[i2].y * w.y;
v.y = A[i2].y * w.x + A[i2].x * w.y;
A[i1].x += v.x;
A[i1].y += v.y;
A[i2].x = u.x - v.x;
A[i2].y = u.y - v.y;
tid += params::degree / params::opt;
}
__syncthreads();
}
  // A compressed size of 8192 corresponds to an actual polynomial size of
  // 16384. From this size on, the twiddles can't fit in constant memory,
  // so the butterfly operation reads them from device memory.
if constexpr (params::degree >= 8192) {
// level 13
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 8192);
i1 = 2 * (params::degree / 8192) * twid_id +
(tid & (params::degree / 8192 - 1));
i2 = i1 + params::degree / 8192;
w = negtwiddles13[twid_id];
u = A[i1];
v.x = A[i2].x * w.x - A[i2].y * w.y;
v.y = A[i2].y * w.x + A[i2].x * w.y;
A[i1].x += v.x;
A[i1].y += v.y;
A[i2].x = u.x - v.x;
A[i2].y = u.y - v.y;
tid += params::degree / params::opt;
}
__syncthreads();
}
}
/*
* negacyclic inverse fft
*/
template <class params> __device__ void NSMFFT_inverse(double2 *A) {
  /* We don't perform bit reversal here, since the twiddles are already stored
   * in bit-reversed order.
   * Each thread is always in charge of "opt/2" pairs of coefficients,
   * which is why we always loop through N/2 by N/opt strides.
   * The pragma unroll instruction tells the compiler to unroll the
   * full loop, which should increase performance.
   */
size_t tid = threadIdx.x;
size_t twid_id;
size_t i1, i2;
double2 u, w;
// divide input by compressed polynomial size
tid = threadIdx.x;
for (size_t i = 0; i < params::opt; ++i) {
A[tid].x *= 1. / params::degree;
A[tid].y *= 1. / params::degree;
tid += params::degree / params::opt;
}
__syncthreads();
  // None of the twiddles have equal real and imaginary parts, so a full
  // complex multiplication has to be done, and there is more than one twiddle
  // per level. The mapping in the backward fft is reversed: the butterfly
  // operation starts from the last level.
  // A compressed size of 8192 corresponds to an actual polynomial size of
  // 16384; the twiddles for this size can't fit in constant memory, so the
  // butterfly operation for this level fetches them from device memory.
if constexpr (params::degree >= 8192) {
// level 13
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 8192);
i1 = 2 * (params::degree / 8192) * twid_id +
(tid & (params::degree / 8192 - 1));
i2 = i1 + params::degree / 8192;
w = negtwiddles13[twid_id];
u.x = A[i1].x - A[i2].x;
u.y = A[i1].y - A[i2].y;
A[i1].x += A[i2].x;
A[i1].y += A[i2].y;
A[i2].x = u.x * w.x + u.y * w.y;
A[i2].y = u.y * w.x - u.x * w.y;
tid += params::degree / params::opt;
}
__syncthreads();
}
if constexpr (params::degree >= 4096) {
// level 12
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 4096);
i1 = 2 * (params::degree / 4096) * twid_id +
(tid & (params::degree / 4096 - 1));
i2 = i1 + params::degree / 4096;
w = negtwiddles[twid_id + 2048];
u.x = A[i1].x - A[i2].x;
u.y = A[i1].y - A[i2].y;
A[i1].x += A[i2].x;
A[i1].y += A[i2].y;
A[i2].x = u.x * w.x + u.y * w.y;
A[i2].y = u.y * w.x - u.x * w.y;
tid += params::degree / params::opt;
}
__syncthreads();
}
if constexpr (params::degree >= 2048) {
// level 11
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 2048);
i1 = 2 * (params::degree / 2048) * twid_id +
(tid & (params::degree / 2048 - 1));
i2 = i1 + params::degree / 2048;
w = negtwiddles[twid_id + 1024];
u.x = A[i1].x - A[i2].x;
u.y = A[i1].y - A[i2].y;
A[i1].x += A[i2].x;
A[i1].y += A[i2].y;
A[i2].x = u.x * w.x + u.y * w.y;
A[i2].y = u.y * w.x - u.x * w.y;
tid += params::degree / params::opt;
}
__syncthreads();
}
if constexpr (params::degree >= 1024) {
// level 10
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 1024);
i1 = 2 * (params::degree / 1024) * twid_id +
(tid & (params::degree / 1024 - 1));
i2 = i1 + params::degree / 1024;
w = negtwiddles[twid_id + 512];
u.x = A[i1].x - A[i2].x;
u.y = A[i1].y - A[i2].y;
A[i1].x += A[i2].x;
A[i1].y += A[i2].y;
A[i2].x = u.x * w.x + u.y * w.y;
A[i2].y = u.y * w.x - u.x * w.y;
tid += params::degree / params::opt;
}
__syncthreads();
}
if constexpr (params::degree >= 512) {
// level 9
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 512);
i1 = 2 * (params::degree / 512) * twid_id +
(tid & (params::degree / 512 - 1));
i2 = i1 + params::degree / 512;
w = negtwiddles[twid_id + 256];
u.x = A[i1].x - A[i2].x;
u.y = A[i1].y - A[i2].y;
A[i1].x += A[i2].x;
A[i1].y += A[i2].y;
A[i2].x = u.x * w.x + u.y * w.y;
A[i2].y = u.y * w.x - u.x * w.y;
tid += params::degree / params::opt;
}
__syncthreads();
}
if constexpr (params::degree >= 256) {
// level 8
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 256);
i1 = 2 * (params::degree / 256) * twid_id +
(tid & (params::degree / 256 - 1));
i2 = i1 + params::degree / 256;
w = negtwiddles[twid_id + 128];
u.x = A[i1].x - A[i2].x;
u.y = A[i1].y - A[i2].y;
A[i1].x += A[i2].x;
A[i1].y += A[i2].y;
A[i2].x = u.x * w.x + u.y * w.y;
A[i2].y = u.y * w.x - u.x * w.y;
tid += params::degree / params::opt;
}
__syncthreads();
}
  // Below level 8, we don't need to check the size of params::degree: the
  // minimum supported actual polynomial size is 256, which is halved to a
  // compressed size of 128, so the last 7 butterfly levels are always needed.
  // Since the butterfly levels are hardcoded, we don't need to check whether
  // the polynomial size is big enough to require these levels.
// level 7
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 128);
i1 = 2 * (params::degree / 128) * twid_id +
(tid & (params::degree / 128 - 1));
i2 = i1 + params::degree / 128;
w = negtwiddles[twid_id + 64];
u.x = A[i1].x - A[i2].x;
u.y = A[i1].y - A[i2].y;
A[i1].x += A[i2].x;
A[i1].y += A[i2].y;
A[i2].x = u.x * w.x + u.y * w.y;
A[i2].y = u.y * w.x - u.x * w.y;
tid += params::degree / params::opt;
}
__syncthreads();
// level 6
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 64);
i1 =
2 * (params::degree / 64) * twid_id + (tid & (params::degree / 64 - 1));
i2 = i1 + params::degree / 64;
w = negtwiddles[twid_id + 32];
u.x = A[i1].x - A[i2].x;
u.y = A[i1].y - A[i2].y;
A[i1].x += A[i2].x;
A[i1].y += A[i2].y;
A[i2].x = u.x * w.x + u.y * w.y;
A[i2].y = u.y * w.x - u.x * w.y;
tid += params::degree / params::opt;
}
__syncthreads();
// level 5
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 32);
i1 =
2 * (params::degree / 32) * twid_id + (tid & (params::degree / 32 - 1));
i2 = i1 + params::degree / 32;
w = negtwiddles[twid_id + 16];
u.x = A[i1].x - A[i2].x;
u.y = A[i1].y - A[i2].y;
A[i1].x += A[i2].x;
A[i1].y += A[i2].y;
A[i2].x = u.x * w.x + u.y * w.y;
A[i2].y = u.y * w.x - u.x * w.y;
tid += params::degree / params::opt;
}
__syncthreads();
// level 4
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 16);
i1 =
2 * (params::degree / 16) * twid_id + (tid & (params::degree / 16 - 1));
i2 = i1 + params::degree / 16;
w = negtwiddles[twid_id + 8];
u.x = A[i1].x - A[i2].x;
u.y = A[i1].y - A[i2].y;
A[i1].x += A[i2].x;
A[i1].y += A[i2].y;
A[i2].x = u.x * w.x + u.y * w.y;
A[i2].y = u.y * w.x - u.x * w.y;
tid += params::degree / params::opt;
}
__syncthreads();
// level 3
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 8);
i1 = 2 * (params::degree / 8) * twid_id + (tid & (params::degree / 8 - 1));
i2 = i1 + params::degree / 8;
w = negtwiddles[twid_id + 4];
u.x = A[i1].x - A[i2].x;
u.y = A[i1].y - A[i2].y;
A[i1].x += A[i2].x;
A[i1].y += A[i2].y;
A[i2].x = u.x * w.x + u.y * w.y;
A[i2].y = u.y * w.x - u.x * w.y;
tid += params::degree / params::opt;
}
__syncthreads();
// level 2
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 4);
i1 = 2 * (params::degree / 4) * twid_id + (tid & (params::degree / 4 - 1));
i2 = i1 + params::degree / 4;
w = negtwiddles[twid_id + 2];
u.x = A[i1].x - A[i2].x;
u.y = A[i1].y - A[i2].y;
A[i1].x += A[i2].x;
A[i1].y += A[i2].y;
A[i2].x = u.x * w.x + u.y * w.y;
A[i2].y = u.y * w.x - u.x * w.y;
tid += params::degree / params::opt;
}
__syncthreads();
// level 1
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 2);
i1 = 2 * (params::degree / 2) * twid_id + (tid & (params::degree / 2 - 1));
i2 = i1 + params::degree / 2;
w = negtwiddles[twid_id + 1];
u.x = A[i1].x - A[i2].x;
u.y = A[i1].y - A[i2].y;
A[i1].x += A[i2].x;
A[i1].y += A[i2].y;
A[i2].x = u.x * w.x + u.y * w.y;
A[i2].y = u.y * w.x - u.x * w.y;
tid += params::degree / params::opt;
}
__syncthreads();
}
/*
 * global batch fft
 * does the fft in half size; unrolling the half-size fft results in
 * half size + 1 elements. This function must be called with the actual
 * degree, and takes already compressed data as input.
*/
template <class params, sharedMemDegree SMD>
__global__ void batch_NSMFFT(double2 *d_input, double2 *d_output,
double2 *buffer) {
extern __shared__ double2 sharedMemoryFFT[];
double2 *fft = (SMD == NOSM) ? &buffer[blockIdx.x * params::degree / 2]
: sharedMemoryFFT;
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
fft[tid] = d_input[blockIdx.x * (params::degree / 2) + tid];
tid = tid + params::degree / params::opt;
}
__syncthreads();
NSMFFT_direct<HalfDegree<params>>(fft);
__syncthreads();
tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
d_output[blockIdx.x * (params::degree / 2) + tid] = fft[tid];
tid = tid + params::degree / params::opt;
}
}
/*
* global batch polynomial multiplication
* only used for fft tests
* d_input1 and d_output must not have the same pointer
* d_input1 can be modified inside the function
*/
template <class params, sharedMemDegree SMD>
__global__ void batch_polynomial_mul(double2 *d_input1, double2 *d_input2,
double2 *d_output, double2 *buffer) {
extern __shared__ double2 sharedMemoryFFT[];
double2 *fft = (SMD == NOSM) ? &buffer[blockIdx.x * params::degree / 2]
: sharedMemoryFFT;
  // Move the first polynomial into shared memory (if possible; otherwise it
  // is placed in the device buffer)
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
fft[tid] = d_input1[blockIdx.x * (params::degree / 2) + tid];
tid = tid + params::degree / params::opt;
}
// Perform direct negacyclic fourier transform
__syncthreads();
NSMFFT_direct<HalfDegree<params>>(fft);
__syncthreads();
// Put the result of direct fft inside input1
tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
d_input1[blockIdx.x * (params::degree / 2) + tid] = fft[tid];
tid = tid + params::degree / params::opt;
}
__syncthreads();
  // Move the second polynomial into shared memory (if possible; otherwise it
  // is placed in the device buffer)
tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
fft[tid] = d_input2[blockIdx.x * (params::degree / 2) + tid];
tid = tid + params::degree / params::opt;
}
// Perform direct negacyclic fourier transform on the second polynomial
__syncthreads();
NSMFFT_direct<HalfDegree<params>>(fft);
__syncthreads();
// calculate pointwise multiplication inside fft buffer
tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
fft[tid] *= d_input1[blockIdx.x * (params::degree / 2) + tid];
tid = tid + params::degree / params::opt;
}
// Perform backward negacyclic fourier transform
__syncthreads();
NSMFFT_inverse<HalfDegree<params>>(fft);
__syncthreads();
// copy results in output buffer
tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
d_output[blockIdx.x * (params::degree / 2) + tid] = fft[tid];
tid = tid + params::degree / params::opt;
}
}
#endif // GPU_BOOTSTRAP_FFT_CUH
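For reference, a plain host-side negacyclic schoolbook product, the kind of oracle batch_polynomial_mul can be compared against in the FFT tests; the coefficient type and the equal-size assumption are illustrative choices.
// Host-side oracle sketch: c = a * b mod (X^N + 1), with wrap-around terms
// negated. This is the product the half-size negacyclic FFT above computes.
#include <cstdint>
#include <vector>
std::vector<int64_t> negacyclic_mul_reference(const std::vector<int64_t> &a,
                                              const std::vector<int64_t> &b) {
  size_t n = a.size(); // assumes a.size() == b.size() == N, a power of two
  std::vector<int64_t> c(n, 0);
  for (size_t i = 0; i < n; i++) {
    for (size_t j = 0; j < n; j++) {
      size_t k = i + j;
      if (k < n)
        c[k] += a[i] * b[j];     // regular term
      else
        c[k - n] -= a[i] * b[j]; // X^N = -1, so wrapped terms flip sign
    }
  }
  return c;
}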

File diff suppressed because it is too large

View File

@@ -1,14 +0,0 @@
#ifndef GPU_BOOTSTRAP_TWIDDLES_CUH
#define GPU_BOOTSTRAP_TWIDDLES_CUH
/*
 * 'negtwiddles' are stored in constant memory for faster access times.
 * Because of its limited size, only the twiddles for polynomial sizes up to
 * 2^12 can be stored there; the twiddles for 2^13 are stored in device
 * memory as 'negtwiddles13'.
*/
extern __constant__ double2 negtwiddles[4096];
extern __device__ double2 negtwiddles13[4096];
#endif

View File

@@ -1,98 +0,0 @@
#include "keyswitch.cuh"
#include "keyswitch.h"
#include "polynomial/parameters.cuh"
#include <cstdint>
/* Perform keyswitch on a batch of 32 bits input LWE ciphertexts.
 * See the equivalent operation on 64 bits for more details.
*/
void cuda_keyswitch_lwe_ciphertext_vector_32(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
cuda_keyswitch_lwe_ciphertext_vector(
v_stream, gpu_index, static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_array_in), static_cast<uint32_t *>(ksk),
lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples);
}
/* Perform keyswitch on a batch of 64 bits input LWE ciphertexts.
*
* - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
* launch
* - `gpu_index` is the index of the GPU to be used in the kernel launch
* - lwe_array_out: output batch of num_samples keyswitched ciphertexts c =
* (a0,..an-1,b) where n is the output LWE dimension (lwe_dimension_out)
* - lwe_array_in: input batch of num_samples LWE ciphertexts, containing
* lwe_dimension_in mask values + 1 body value
* - ksk: the keyswitch key to be used in the operation
* - base log: the log of the base used in the decomposition (should be the one
* used to create the ksk)
*
* This function calls a wrapper to a device kernel that performs the keyswitch
* - num_samples blocks of threads are launched
*/
void cuda_keyswitch_lwe_ciphertext_vector_64(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
cuda_keyswitch_lwe_ciphertext_vector(
v_stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in), static_cast<uint64_t *>(ksk),
lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples);
}
/* Perform functional packing keyswitch on a batch of 32 bits input LWE
* ciphertexts. See the equivalent function on 64 bit inputs for more details.
*/
void cuda_fp_keyswitch_lwe_to_glwe_32(
void *v_stream, uint32_t gpu_index, void *glwe_array_out,
void *lwe_array_in, void *fp_ksk_array, uint32_t input_lwe_dimension,
uint32_t output_glwe_dimension, uint32_t output_polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t number_of_input_lwe,
uint32_t number_of_keys) {
cuda_fp_keyswitch_lwe_to_glwe(
v_stream, gpu_index, static_cast<uint32_t *>(glwe_array_out),
static_cast<uint32_t *>(lwe_array_in),
static_cast<uint32_t *>(fp_ksk_array), input_lwe_dimension,
output_glwe_dimension, output_polynomial_size, base_log, level_count,
number_of_input_lwe, number_of_keys);
}
/* Perform functional packing keyswitch on a batch of 64 bits input LWE
* ciphertexts.
*
* - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
* launch
* - `gpu_index` is the index of the GPU to be used in the kernel launch
* - `glwe_array_out`: output batch of keyswitched ciphertexts
* - `lwe_array_in`: input batch of num_samples LWE ciphertexts, containing
* lwe_dimension_in mask values + 1 body value
* - `fp_ksk_array`: the functional packing keyswitch keys to be used in the
* operation
* - `base log`: the log of the base used in the decomposition (should be the
* one used to create the ksk)
* - `level_count`: the number of levels used in the decomposition (should be
* the one used to create the fp_ksks).
* - `number_of_input_lwe`: the number of inputs
* - `number_of_keys`: the number of fp_ksks
*
* This function calls a wrapper to a device kernel that performs the functional
* packing keyswitch.
*/
void cuda_fp_keyswitch_lwe_to_glwe_64(
void *v_stream, uint32_t gpu_index, void *glwe_array_out,
void *lwe_array_in, void *fp_ksk_array, uint32_t input_lwe_dimension,
uint32_t output_glwe_dimension, uint32_t output_polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t number_of_input_lwe,
uint32_t number_of_keys) {
cuda_fp_keyswitch_lwe_to_glwe(
v_stream, gpu_index, static_cast<uint64_t *>(glwe_array_out),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(fp_ksk_array), input_lwe_dimension,
output_glwe_dimension, output_polynomial_size, base_log, level_count,
number_of_input_lwe, number_of_keys);
}
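A minimal host-side driver sketch for the 64-bit keyswitch entry point above; the dimensions, decomposition parameters and the already-initialized device pointers are assumptions made for illustration.
// Sketch only: wiring for cuda_keyswitch_lwe_ciphertext_vector_64. The ksk,
// input and output buffers are assumed to be device pointers filled elsewhere.
#include "device.h"
#include "keyswitch.h"
#include <cstdint>
void example_keyswitch(uint32_t gpu_index, void *d_lwe_in, void *d_lwe_out,
                       void *d_ksk) {
  // Illustrative parameters: 64-bit torus, base 2^2, 5 decomposition levels.
  uint32_t lwe_dimension_in = 2048, lwe_dimension_out = 742;
  uint32_t base_log = 2, level_count = 5, num_samples = 8;
  cudaStream_t *stream = cuda_create_stream(gpu_index);
  // d_lwe_in holds num_samples * (lwe_dimension_in + 1) words,
  // d_lwe_out must hold num_samples * (lwe_dimension_out + 1) words.
  cuda_keyswitch_lwe_ciphertext_vector_64(
      stream, gpu_index, d_lwe_out, d_lwe_in, d_ksk, lwe_dimension_in,
      lwe_dimension_out, base_log, level_count, num_samples);
  cuda_synchronize_stream(stream);
  cuda_destroy_stream(stream, gpu_index);
}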

View File

@@ -1,228 +0,0 @@
#ifndef CNCRT_KS_H
#define CNCRT_KS_H
#include "crypto/gadget.cuh"
#include "crypto/torus.cuh"
#include "polynomial/polynomial.cuh"
#include <thread>
#include <vector>
template <typename Torus>
__device__ Torus *get_ith_block(Torus *ksk, int i, int level,
uint32_t lwe_dimension_out,
uint32_t level_count) {
int pos = i * level_count * (lwe_dimension_out + 1) +
level * (lwe_dimension_out + 1);
Torus *ptr = &ksk[pos];
return ptr;
}
// blockIdx.y indexes a single lwe ciphertext
// blockIdx.x indexes a chunk of that lwe ciphertext,
// with chunk_count = glwe_size * polynomial_size / threads.
// Each thread is responsible for only lwe_size multiplications.
template <typename Torus>
__global__ void
fp_keyswitch(Torus *glwe_array_out, Torus *lwe_array_in, Torus *fp_ksk_array,
uint32_t lwe_dimension_in, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t number_of_input_lwe, uint32_t number_of_keys) {
size_t tid = threadIdx.x;
size_t glwe_size = (glwe_dimension + 1);
size_t lwe_size = (lwe_dimension_in + 1);
// number of coefficients in a single fp-ksk
size_t ksk_size = lwe_size * level_count * glwe_size * polynomial_size;
// number of coefficients inside fp-ksk block for each lwe_input coefficient
size_t ksk_block_size = glwe_size * polynomial_size * level_count;
size_t ciphertext_id = blockIdx.y;
// number of coefficients processed inside single block
size_t coef_per_block = blockDim.x;
size_t chunk_id = blockIdx.x;
size_t ksk_id = ciphertext_id % number_of_keys;
extern __shared__ int8_t sharedmem[];
// result accumulator, shared memory is used because of frequent access
Torus *local_glwe_chunk = (Torus *)sharedmem;
// current input lwe ciphertext
auto cur_input_lwe = &lwe_array_in[ciphertext_id * lwe_size];
// current output glwe ciphertext
auto cur_output_glwe =
&glwe_array_out[ciphertext_id * glwe_size * polynomial_size];
// current out glwe chunk, will be processed inside single block
auto cur_glwe_chunk = &cur_output_glwe[chunk_id * coef_per_block];
// fp key used for current ciphertext
auto cur_ksk = &fp_ksk_array[ksk_id * ksk_size];
// set shared mem accumulator to 0
local_glwe_chunk[tid] = 0;
// iterate through each coefficient of input lwe
for (size_t i = 0; i <= lwe_dimension_in; i++) {
Torus a_i =
round_to_closest_multiple(cur_input_lwe[i], base_log, level_count);
Torus state = a_i >> (sizeof(Torus) * 8 - base_log * level_count);
Torus mod_b_mask = (1ll << base_log) - 1ll;
// block of key for current lwe coefficient (cur_input_lwe[i])
auto ksk_block = &cur_ksk[i * ksk_block_size];
// iterate through levels, calculating decomposition in reverse order
for (size_t j = 0; j < level_count; j++) {
auto ksk_glwe =
&ksk_block[(level_count - j - 1) * glwe_size * polynomial_size];
auto ksk_glwe_chunk = &ksk_glwe[chunk_id * coef_per_block];
Torus decomposed = decompose_one<Torus>(state, mod_b_mask, base_log);
local_glwe_chunk[tid] -= decomposed * ksk_glwe_chunk[tid];
}
}
cur_glwe_chunk[tid] = local_glwe_chunk[tid];
}
/*
* keyswitch kernel
* Each thread handles a piece of the following equation:
 * $$GLWE_{s2}(\Delta m + e) = (0,0,..,0,b) - \sum_{i=0}^{k-1} \langle Dec(a_i),
 * (GLWE_{s2}(s1_i q/\beta),..,GLWE_{s2}(s1_i q/\beta^l)) \rangle$$
 * where k is the dimension of the GLWE ciphertext. If the polynomial dimension
 * in GLWE is > 1, this equation is solved for each polynomial coefficient.
 * Dec denotes the decomposition with base \beta and l levels, and the inner
 * product is taken between the decomposition of a_i and the l GLWE encryptions
 * of s1_i q/\beta^j, with j in [1,l]. We obtain a GLWE encryption of \Delta m
 * (with \Delta the scaling factor) under key s2 instead of s1, with an
 * increased noise.
*
*/
template <typename Torus>
__global__ void keyswitch(Torus *lwe_array_out, Torus *lwe_array_in, Torus *ksk,
uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
uint32_t base_log, uint32_t level_count,
int lwe_lower, int lwe_upper, int cutoff) {
int tid = threadIdx.x;
extern __shared__ int8_t sharedmem[];
Torus *local_lwe_array_out = (Torus *)sharedmem;
auto block_lwe_array_in =
get_chunk(lwe_array_in, blockIdx.x, lwe_dimension_in + 1);
auto block_lwe_array_out =
get_chunk(lwe_array_out, blockIdx.x, lwe_dimension_out + 1);
auto gadget = GadgetMatrixSingle<Torus>(base_log, level_count);
int lwe_part_per_thd;
if (tid < cutoff) {
lwe_part_per_thd = lwe_upper;
} else {
lwe_part_per_thd = lwe_lower;
}
__syncthreads();
for (int k = 0; k < lwe_part_per_thd; k++) {
int idx = tid + k * blockDim.x;
local_lwe_array_out[idx] = 0;
}
if (tid == 0) {
local_lwe_array_out[lwe_dimension_out] =
block_lwe_array_in[lwe_dimension_in];
}
for (int i = 0; i < lwe_dimension_in; i++) {
__syncthreads();
Torus a_i =
round_to_closest_multiple(block_lwe_array_in[i], base_log, level_count);
Torus state = a_i >> (sizeof(Torus) * 8 - base_log * level_count);
Torus mask_mod_b = (1ll << base_log) - 1ll;
for (int j = 0; j < level_count; j++) {
auto ksk_block = get_ith_block(ksk, i, j, lwe_dimension_out, level_count);
Torus decomposed = decompose_one<Torus>(state, mask_mod_b, base_log);
for (int k = 0; k < lwe_part_per_thd; k++) {
int idx = tid + k * blockDim.x;
local_lwe_array_out[idx] -= (Torus)ksk_block[idx] * decomposed;
}
}
}
for (int k = 0; k < lwe_part_per_thd; k++) {
int idx = tid + k * blockDim.x;
block_lwe_array_out[idx] = local_lwe_array_out[idx];
}
}
/// Assumes lwe_array_in is already on the GPU
template <typename Torus>
__host__ void cuda_keyswitch_lwe_ciphertext_vector(
void *v_stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_array_in, Torus *ksk, uint32_t lwe_dimension_in,
uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count,
uint32_t num_samples) {
cudaSetDevice(gpu_index);
constexpr int ideal_threads = 128;
int lwe_dim = lwe_dimension_out + 1;
int lwe_lower, lwe_upper, cutoff;
if (lwe_dim % ideal_threads == 0) {
lwe_lower = lwe_dim / ideal_threads;
lwe_upper = lwe_dim / ideal_threads;
cutoff = 0;
} else {
int y =
ceil((double)lwe_dim / (double)ideal_threads) * ideal_threads - lwe_dim;
cutoff = ideal_threads - y;
lwe_lower = lwe_dim / ideal_threads;
lwe_upper = (int)ceil((double)lwe_dim / (double)ideal_threads);
}
int lwe_size_after = (lwe_dimension_out + 1) * num_samples;
int shared_mem = sizeof(Torus) * (lwe_dimension_out + 1);
auto stream = static_cast<cudaStream_t *>(v_stream);
cudaMemsetAsync(lwe_array_out, 0, sizeof(Torus) * lwe_size_after, *stream);
dim3 grid(num_samples, 1, 1);
dim3 threads(ideal_threads, 1, 1);
cudaFuncSetAttribute(keyswitch<Torus>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_mem);
keyswitch<<<grid, threads, shared_mem, *stream>>>(
lwe_array_out, lwe_array_in, ksk, lwe_dimension_in, lwe_dimension_out,
base_log, level_count, lwe_lower, lwe_upper, cutoff);
check_cuda_error(cudaGetLastError());
}
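// Worked example of the thread partitioning above (illustrative numbers):
// with lwe_dimension_out + 1 = 513 and ideal_threads = 128, we get
// y = ceil(513 / 128) * 128 - 513 = 127, so cutoff = 1, lwe_lower = 4 and
// lwe_upper = 5: one thread handles 5 output coefficients and the remaining
// 127 threads handle 4 each, covering 5 + 127 * 4 = 513 coefficients.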
template <typename Torus>
__host__ void cuda_fp_keyswitch_lwe_to_glwe(
void *v_stream, uint32_t gpu_index, Torus *glwe_array_out,
Torus *lwe_array_in, Torus *fp_ksk_array, uint32_t lwe_dimension_in,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t number_of_input_lwe,
uint32_t number_of_keys) {
cudaSetDevice(gpu_index);
int threads = 256;
int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size;
dim3 blocks(glwe_accumulator_size / threads, number_of_input_lwe, 1);
int shared_mem = sizeof(Torus) * threads;
auto stream = static_cast<cudaStream_t *>(v_stream);
fp_keyswitch<<<blocks, threads, shared_mem, *stream>>>(
glwe_array_out, lwe_array_in, fp_ksk_array, lwe_dimension_in,
glwe_dimension, polynomial_size, base_log, level_count,
number_of_input_lwe, number_of_keys);
}
#endif

View File

@@ -1,183 +0,0 @@
#include "multiplication.cuh"
/*
* Perform the multiplication of a u32 input LWE ciphertext vector with a u32
* cleartext vector. See the equivalent operation on u64 data for more details.
*/
void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
void *cleartext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
host_cleartext_multiplication(
v_stream, gpu_index, static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_array_in),
static_cast<uint32_t *>(cleartext_array_in), input_lwe_dimension,
input_lwe_ciphertext_count);
}
/*
* Perform the multiplication of a u64 input LWE ciphertext vector with a u64
* input cleartext vector.
* - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
* launch
* - `gpu_index` is the index of the GPU to be used in the kernel launch
* - `lwe_array_out` is an array of size
* `(input_lwe_dimension + 1) * input_lwe_ciphertext_count` that should have
* been allocated on the GPU before calling this function, and that will hold
* the result of the computation.
* - `lwe_array_in` is the LWE ciphertext vector used as input, it should have
* been allocated and initialized before calling this function. It has the same
* size as the output array.
* - `cleartext_array_in` is the cleartext vector used as input, it should have
* been allocated and initialized before calling this function. It should be of
* size `input_lwe_ciphertext_count`.
* - `input_lwe_dimension` is the number of mask elements in the input and
* output LWE ciphertext vectors
* - `input_lwe_ciphertext_count` is the number of ciphertexts contained in the
* input LWE ciphertext vector, as well as in the output. It is also the number
* of cleartexts in the input cleartext vector.
*
* Each cleartext of the input cleartext vector is multiplied to the mask and
* body of the corresponding LWE ciphertext in the LWE ciphertext vector. The
* result of the operation is stored in the output LWE ciphertext vector. The
* two input vectors are unchanged. This function is a wrapper to a device
* function that performs the operation on the GPU.
*/
void cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
void *cleartext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
host_cleartext_multiplication(
v_stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(cleartext_array_in), input_lwe_dimension,
input_lwe_ciphertext_count);
}
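// A short host-side sketch of driving the 64-bit entry point documented
// above. The device buffers are assumed to be allocated and filled elsewhere,
// the parameter values are illustrative, and the declaring header
// (linear_algebra.h) is an assumption.
#include "device.h"
#include "linear_algebra.h"
#include <cstdint>
void example_cleartext_mult(uint32_t gpu_index, void *d_lwe_in,
                            void *d_cleartexts, void *d_lwe_out) {
  uint32_t lwe_dimension = 742;  // illustrative
  uint32_t ciphertext_count = 8; // illustrative
  cudaStream_t *stream = cuda_create_stream(gpu_index);
  // d_lwe_in / d_lwe_out hold ciphertext_count * (lwe_dimension + 1) words,
  // d_cleartexts holds ciphertext_count words.
  cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
      stream, gpu_index, d_lwe_out, d_lwe_in, d_cleartexts, lwe_dimension,
      ciphertext_count);
  cuda_synchronize_stream(stream);
  cuda_destroy_stream(stream, gpu_index);
}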
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the integer radix multiplication in keyswitch->bootstrap order.
*/
void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
void *v_stream, uint32_t gpu_index, void *mem_ptr, uint32_t message_modulus,
uint32_t carry_modulus, uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t pbs_base_log, uint32_t pbs_level,
uint32_t ks_base_log, uint32_t ks_level, uint32_t num_blocks,
PBS_TYPE pbs_type, uint32_t max_shared_memory, bool allocate_gpu_memory) {
switch (polynomial_size) {
case 2048:
scratch_cuda_integer_mult_radix_ciphertext_kb<uint64_t, Degree<2048>>(
v_stream, gpu_index, (int_mul_memory<uint64_t> *)mem_ptr,
message_modulus, carry_modulus, glwe_dimension, lwe_dimension,
polynomial_size, pbs_base_log, pbs_level, ks_base_log, ks_level,
num_blocks, pbs_type, max_shared_memory, allocate_gpu_memory);
break;
default:
break;
}
}
/*
 * Computes a multiplication between two 64 bit radix lwe ciphertexts
 * encrypting integer values. The keyswitch -> bootstrap pattern is used. The
 * function works on a single pair of radix ciphertexts; 'v_stream' can be
 * used for parallelization.
* - 'v_stream' is a void pointer to the Cuda stream to be used in the kernel
* launch
* - 'gpu_index' is the index of the GPU to be used in the kernel launch
* - 'radix_lwe_out' is 64 bit radix big lwe ciphertext, product of
* multiplication
* - 'radix_lwe_left' left radix big lwe ciphertext
* - 'radix_lwe_right' right radix big lwe ciphertext
* - 'ct_degree_out' degree for each lwe ciphertext block for out
* RadixCiphertext
* - 'ct_degree_left' degree for each lwe ciphertext block for left
* RadixCiphertext
* - 'ct_degree_right' degree for each lwe ciphertext block for right
* RadixCiphertext
* - 'bsk' bootstrapping key in fourier domain
* - 'ksk' keyswitching key
* - 'mem_ptr'
* - 'message_modulus' message_modulus
* - 'carry_modulus' carry_modulus
* - 'glwe_dimension' glwe_dimension
* - 'lwe_dimension' is the dimension of small lwe ciphertext
* - 'polynomial_size' polynomial size
* - 'pbs_base_log' base log used in the pbs
* - 'pbs_level' decomposition level count used in the pbs
* - 'ks_level' decomposition level count used in the keyswitch
* - 'num_blocks' is the number of big lwe ciphertext blocks inside radix
* ciphertext
* - 'pbs_type' selects which PBS implementation should be used
* - 'max_shared_memory' maximum shared memory per cuda block
*/
void cuda_integer_mult_radix_ciphertext_kb_64(
void *v_stream, uint32_t gpu_index, void *radix_lwe_out,
void *radix_lwe_left, void *radix_lwe_right, uint32_t *ct_degree_out,
uint32_t *ct_degree_left, uint32_t *ct_degree_right, void *bsk, void *ksk,
void *mem_ptr, uint32_t message_modulus, uint32_t carry_modulus,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t pbs_base_log, uint32_t pbs_level, uint32_t ks_base_log,
uint32_t ks_level, uint32_t num_blocks, PBS_TYPE pbs_type,
uint32_t max_shared_memory) {
switch (polynomial_size) {
case 2048:
host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<2048>>(
v_stream, gpu_index, (uint64_t *)radix_lwe_out,
(uint64_t *)radix_lwe_left, (uint64_t *)radix_lwe_right, ct_degree_out,
ct_degree_left, ct_degree_right, bsk, (uint64_t *)ksk,
(int_mul_memory<uint64_t> *)mem_ptr, message_modulus, carry_modulus,
glwe_dimension, lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
ks_base_log, ks_level, num_blocks, pbs_type, max_shared_memory);
break;
default:
break;
}
}
void scratch_cuda_integer_mult_radix_ciphertext_kb_64_multi_gpu(
void *mem_ptr, void *bsk, void *ksk, uint32_t message_modulus,
uint32_t carry_modulus, uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t pbs_base_log, uint32_t pbs_level,
uint32_t ks_base_log, uint32_t ks_level, uint32_t num_blocks,
PBS_TYPE pbs_type, uint32_t max_shared_memory, bool allocate_gpu_memory) {
switch (polynomial_size) {
case 2048:
scratch_cuda_integer_mult_radix_ciphertext_kb_multi_gpu<uint64_t,
Degree<2048>>(
(int_mul_memory<uint64_t> *)mem_ptr, (uint64_t *)bsk, (uint64_t *)ksk,
message_modulus, carry_modulus, glwe_dimension, lwe_dimension,
polynomial_size, pbs_base_log, pbs_level, ks_base_log, ks_level,
num_blocks, pbs_type, max_shared_memory, allocate_gpu_memory);
break;
default:
break;
}
}
void cuda_integer_mult_radix_ciphertext_kb_64_multi_gpu(
void *radix_lwe_out, void *radix_lwe_left, void *radix_lwe_right,
uint32_t *ct_degree_out, uint32_t *ct_degree_left,
uint32_t *ct_degree_right, void *bsk, void *ksk, void *mem_ptr,
uint32_t message_modulus, uint32_t carry_modulus, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t pbs_base_log,
uint32_t pbs_level, uint32_t ks_base_log, uint32_t ks_level,
uint32_t num_blocks, PBS_TYPE pbs_type, uint32_t max_shared_memory) {
switch (polynomial_size) {
case 2048:
host_integer_mult_radix_kb_multi_gpu<uint64_t, int64_t, Degree<2048>>(
(uint64_t *)radix_lwe_out, (uint64_t *)radix_lwe_left,
(uint64_t *)radix_lwe_right, ct_degree_out, ct_degree_left,
ct_degree_right, (uint64_t *)bsk, (uint64_t *)ksk,
(int_mul_memory<uint64_t> *)mem_ptr, message_modulus, carry_modulus,
glwe_dimension, lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
ks_base_log, ks_level, num_blocks, max_shared_memory);
break;
default:
break;
}
}

File diff suppressed because it is too large

View File

@@ -1,49 +0,0 @@
#include "negation.cuh"
/*
* Perform the negation of a u32 input LWE ciphertext vector.
* See the equivalent operation on u64 ciphertexts for more details.
*/
void cuda_negate_lwe_ciphertext_vector_32(void *v_stream, uint32_t gpu_index,
void *lwe_array_out,
void *lwe_array_in,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
host_negation(v_stream, gpu_index, static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_array_in), input_lwe_dimension,
input_lwe_ciphertext_count);
}
/*
* Perform the negation of a u64 input LWE ciphertext vector.
* - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
* launch
* - `gpu_index` is the index of the GPU to be used in the kernel launch
* - `lwe_array_out` is an array of size
* `(input_lwe_dimension + 1) * input_lwe_ciphertext_count` that should have
* been allocated on the GPU before calling this function, and that will hold
* the result of the computation.
* - `lwe_array_in` is the LWE ciphertext vector used as input, it should have
* been allocated and initialized before calling this function. It has the same
* size as the output array.
* - `input_lwe_dimension` is the number of mask elements in the two input and
* in the output ciphertext vectors
* - `input_lwe_ciphertext_count` is the number of ciphertexts contained in each
* input LWE ciphertext vector, as well as in the output.
*
* Each element (mask element or body) of the input LWE ciphertext vector is
* negated. The result is stored in the output LWE ciphertext vector. The input
* LWE ciphertext vector is left unchanged. This function is a wrapper to a
* device function that performs the operation on the GPU.
*/
void cuda_negate_lwe_ciphertext_vector_64(void *v_stream, uint32_t gpu_index,
void *lwe_array_out,
void *lwe_array_in,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
host_negation(v_stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in), input_lwe_dimension,
input_lwe_ciphertext_count);
}
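// Illustrative usage sketch (not part of the original file): negate a vector
// of `count` u64 LWE ciphertexts of dimension `lwe_dim`. The host buffers
// (h_in/h_out) and the parameter values are placeholders supplied by the
// caller; error checking is omitted for brevity.
void example_negate_u64(uint64_t const *h_in, uint64_t *h_out,
                        uint32_t lwe_dim, uint32_t count) {
  size_t bytes = sizeof(uint64_t) * (lwe_dim + 1) * count;
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  void *d_in, *d_out;
  cudaMalloc(&d_in, bytes);
  cudaMalloc(&d_out, bytes);
  cudaMemcpyAsync(d_in, h_in, bytes, cudaMemcpyHostToDevice, stream);
  cuda_negate_lwe_ciphertext_vector_64(&stream, /*gpu_index=*/0, d_out, d_in,
                                       lwe_dim, count);
  cudaMemcpyAsync(h_out, d_out, bytes, cudaMemcpyDeviceToHost, stream);
  cudaStreamSynchronize(stream);
  cudaFree(d_in);
  cudaFree(d_out);
  cudaStreamDestroy(stream);
}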

View File

@@ -1,45 +0,0 @@
#ifndef CUDA_NEGATE_H
#define CUDA_NEGATE_H
#ifdef __CDT_PARSER__
#undef __CUDA_RUNTIME_H__
#include <cuda_runtime.h>
#endif
#include "device.h"
#include "linear_algebra.h"
#include "utils/kernel_dimensions.cuh"
template <typename T>
__global__ void negation(T *output, T *input, uint32_t num_entries) {
int tid = threadIdx.x;
int index = blockIdx.x * blockDim.x + tid;
if (index < num_entries) {
// Here we take advantage of the wrapping behaviour of uint
output[index] = -input[index];
}
}
template <typename T>
__host__ void host_negation(void *v_stream, uint32_t gpu_index, T *output,
T *input, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
cudaSetDevice(gpu_index);
// lwe_size includes the presence of the body
// whereas lwe_dimension is the number of elements in the mask
int lwe_size = input_lwe_dimension + 1;
// Create a 1-dimensional grid of threads
int num_blocks = 0, num_threads = 0;
int num_entries = input_lwe_ciphertext_count * lwe_size;
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
dim3 grid(num_blocks, 1, 1);
dim3 thds(num_threads, 1, 1);
auto stream = static_cast<cudaStream_t *>(v_stream);
negation<<<grid, thds, 0, *stream>>>(output, input, num_entries);
check_cuda_error(cudaGetLastError());
}
#endif // CUDA_NEGATE_H

View File

@@ -1,304 +0,0 @@
#ifndef GPU_POLYNOMIAL_FUNCTIONS
#define GPU_POLYNOMIAL_FUNCTIONS
#include "device.h"
#include "utils/timer.cuh"
// Return A if C == 0 and B if C == 1
#define SEL(A, B, C) ((-(C) & ((A) ^ (B))) ^ (A))
/*
 * Compresses the decomposed buffer into a half-size complex buffer for the fft
*/
template <class params>
__device__ void real_to_complex_compressed(int16_t *src, double2 *dst) {
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
dst[tid].x = __int2double_rn(src[2 * tid]);
dst[tid].y = __int2double_rn(src[2 * tid + 1]);
tid += params::degree / params::opt;
}
}
/*
* copy source polynomial to specific slice of batched polynomials
* used only in low latency version
*/
template <typename T, class params>
__device__ void copy_into_ith_polynomial_low_lat(T *source, T *dst, int i) {
int tid = threadIdx.x;
int begin = i * (params::degree / 2 + 1);
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
dst[tid + begin] = source[tid];
tid = tid + params::degree / params::opt;
}
if (threadIdx.x == 0) {
dst[params::degree / 2 + begin] = source[params::degree / 2];
}
}
template <typename T, int elems_per_thread, int block_size>
__device__ void copy_polynomial(T *source, T *dst) {
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < elems_per_thread; i++) {
dst[tid] = source[tid];
tid = tid + block_size;
}
}
/*
* accumulates source polynomial into specific slice of batched polynomial
* used only in low latency version
*/
template <typename T, class params>
__device__ void add_polynomial_inplace_low_lat(T *source, T *dst, int p_id) {
int tid = threadIdx.x;
int begin = p_id * (params::degree / 2 + 1);
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
dst[tid] += source[tid + begin];
tid = tid + params::degree / params::opt;
}
if (threadIdx.x == 0) {
dst[params::degree / 2] += source[params::degree / 2 + begin];
}
}
/*
* Receives num_poly concatenated polynomials of type T. For each:
*
 * Performs acc = input * X^{-j} (i.e. divides input by the monomial X^j in
 * the negacyclic ring) if zeroAcc = false
* Performs acc = 0 if zeroAcc
* takes single buffer and calculates inplace.
*
* By default, it works on a single polynomial.
*/
template <typename T, int elems_per_thread, int block_size>
__device__ void divide_by_monomial_negacyclic_inplace(T *accumulator, T *input,
uint32_t j, bool zeroAcc,
uint32_t num_poly = 1) {
constexpr int degree = block_size * elems_per_thread;
for (int z = 0; z < num_poly; z++) {
T *accumulator_slice = (T *)accumulator + (ptrdiff_t)(z * degree);
T *input_slice = (T *)input + (ptrdiff_t)(z * degree);
int tid = threadIdx.x;
if (zeroAcc) {
for (int i = 0; i < elems_per_thread; i++) {
accumulator_slice[tid] = 0;
tid += block_size;
}
} else {
tid = threadIdx.x;
for (int i = 0; i < elems_per_thread; i++) {
if (j < degree) {
// if (tid < degree - j)
// accumulator_slice[tid] = input_slice[tid + j];
// else
// accumulator_slice[tid] = -input_slice[tid - degree + j];
int x = tid + j - SEL(degree, 0, tid < degree - j);
accumulator_slice[tid] =
SEL(-1, 1, tid < degree - j) * input_slice[x];
} else {
int32_t jj = j - degree;
// if (tid < degree - jj)
// accumulator_slice[tid] = -input_slice[tid + jj];
// else
// accumulator_slice[tid] = input_slice[tid - degree + jj];
int x = tid + jj - SEL(degree, 0, tid < degree - jj);
accumulator_slice[tid] =
SEL(1, -1, tid < degree - jj) * input_slice[x];
}
tid += block_size;
}
}
}
}
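// Worked example (added for clarity, not in the original file): with
// degree = 4 and input = [a0, a1, a2, a3], dividing by X (j = 1) yields
// acc = [a1, a2, a3, -a0]: coefficients shift down by j and the ones that
// wrap around pick up a minus sign, because X^degree = -1 in the negacyclic
// ring Z[X]/(X^degree + 1). For j in [degree, 2*degree) the same rotation is
// applied with all signs flipped (the jj = j - degree branch above).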
/*
* Receives num_poly concatenated polynomials of type T. For each:
*
 * Performs result_acc = acc * X^j - acc (i.e. result_acc = acc * (X^j - 1))
* takes single buffer as input and returns a single rotated buffer
*
* By default, it works on a single polynomial.
*/
template <typename T, int elems_per_thread, int block_size>
__device__ void multiply_by_monomial_negacyclic_and_sub_polynomial(
T *acc, T *result_acc, uint32_t j, uint32_t num_poly = 1) {
constexpr int degree = block_size * elems_per_thread;
for (int z = 0; z < num_poly; z++) {
T *acc_slice = (T *)acc + (ptrdiff_t)(z * degree);
T *result_acc_slice = (T *)result_acc + (ptrdiff_t)(z * degree);
int tid = threadIdx.x;
for (int i = 0; i < elems_per_thread; i++) {
if (j < degree) {
// if (tid < j)
// result_acc_slice[tid] = -acc_slice[tid - j + degree]-acc_slice[tid];
// else
// result_acc_slice[tid] = acc_slice[tid - j] - acc_slice[tid];
int x = tid - j + SEL(0, degree, tid < j);
result_acc_slice[tid] =
SEL(1, -1, tid < j) * acc_slice[x] - acc_slice[tid];
} else {
int32_t jj = j - degree;
// if (tid < jj)
// result_acc_slice[tid] = acc_slice[tid - jj + degree]-acc_slice[tid];
// else
// result_acc_slice[tid] = -acc_slice[tid - jj] - acc_slice[tid];
int x = tid - jj + SEL(0, degree, tid < jj);
result_acc_slice[tid] =
SEL(-1, 1, tid < jj) * acc_slice[x] - acc_slice[tid];
}
tid += block_size;
}
}
}
/*
* Receives num_poly concatenated polynomials of type T. For each performs a
* rounding to increase accuracy of the PBS. Calculates inplace.
*
* By default, it works on a single polynomial.
*/
template <typename T, int elems_per_thread, int block_size>
__device__ void round_to_closest_multiple_inplace(T *rotated_acc, int base_log,
int level_count,
uint32_t num_poly = 1) {
constexpr int degree = block_size * elems_per_thread;
for (int z = 0; z < num_poly; z++) {
T *rotated_acc_slice = (T *)rotated_acc + (ptrdiff_t)(z * degree);
int tid = threadIdx.x;
for (int i = 0; i < elems_per_thread; i++) {
T x_acc = rotated_acc_slice[tid];
T shift = sizeof(T) * 8 - level_count * base_log;
T mask = 1ll << (shift - 1);
T b_acc = (x_acc & mask) >> (shift - 1);
T res_acc = x_acc >> shift;
res_acc += b_acc;
res_acc <<= shift;
rotated_acc_slice[tid] = res_acc;
tid = tid + block_size;
}
}
}
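// Worked example (added for clarity, not in the original file): with
// T = uint64_t, base_log = 8 and level_count = 4, shift = 64 - 32 = 32.
// Each coefficient keeps its 32 most significant bits, rounded to the nearest
// multiple of 2^32: the bit just below the cut (mask = 2^31) is the rounding
// bit b, the value is shifted down, b is added, and the result is shifted
// back up. E.g. x = 0x0000000180000000 rounds up to 0x0000000200000000,
// while x = 0x000000017FFFFFFF rounds down to 0x0000000100000000.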
template <typename Torus, class params>
__device__ void add_to_torus(double2 *m_values, Torus *result,
bool init_torus = false) {
Torus mx = (sizeof(Torus) == 4) ? UINT32_MAX : UINT64_MAX;
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
double v1 = m_values[tid].x;
double v2 = m_values[tid].y;
double frac = v1 - floor(v1);
frac *= mx;
double carry = frac - floor(frac);
frac += (carry >= 0.5);
Torus V1 = 0;
typecast_double_to_torus<Torus>(frac, V1);
frac = v2 - floor(v2);
frac *= mx;
    // take the fractional part of the scaled value, as in the v1 branch above
    carry = frac - floor(frac);
frac += (carry >= 0.5);
Torus V2 = 0;
typecast_double_to_torus<Torus>(frac, V2);
if (init_torus) {
result[tid] = V1;
result[tid + params::degree / 2] = V2;
} else {
result[tid] += V1;
result[tid + params::degree / 2] += V2;
}
tid = tid + params::degree / params::opt;
}
}
// Extracts the body of a GLWE.
// k is the offset to find the body element / polynomial in the lwe_array_out /
// accumulator
template <typename Torus, class params>
__device__ void sample_extract_body(Torus *lwe_array_out, Torus *accumulator,
uint32_t k) {
// Set first coefficient of the accumulator as the body of the LWE sample
lwe_array_out[k * params::degree] = accumulator[k * params::degree];
}
// Extracts the mask from num_poly polynomials individually
template <typename Torus, class params>
__device__ void sample_extract_mask(Torus *lwe_array_out, Torus *accumulator,
uint32_t num_poly = 1) {
for (int z = 0; z < num_poly; z++) {
Torus *lwe_array_out_slice =
(Torus *)lwe_array_out + (ptrdiff_t)(z * params::degree);
Torus *accumulator_slice =
(Torus *)accumulator + (ptrdiff_t)(z * params::degree);
// Set ACC = -ACC
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt; i++) {
accumulator_slice[tid] = -accumulator_slice[tid];
tid = tid + params::degree / params::opt;
}
synchronize_threads_in_block();
// Reverse the accumulator
tid = threadIdx.x;
Torus result[params::opt];
#pragma unroll
for (int i = 0; i < params::opt; i++) {
result[i] = accumulator_slice[params::degree - tid - 1];
tid = tid + params::degree / params::opt;
}
synchronize_threads_in_block();
tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt; i++) {
accumulator_slice[tid] = result[i];
tid = tid + params::degree / params::opt;
}
synchronize_threads_in_block();
// Perform ACC * X
// (equivalent to multiply_by_monomial_negacyclic_inplace(1))
tid = threadIdx.x;
    // Reuse the result[] registers declared above for the rotated values
for (int i = 0; i < params::opt; i++) {
// if (tid < 1)
// result[i] = -accumulator_slice[tid - 1 + params::degree];
// else
// result[i] = accumulator_slice[tid - 1];
int x = tid - 1 + SEL(0, params::degree, tid < 1);
result[i] = SEL(1, -1, tid < 1) * accumulator_slice[x];
tid += params::degree / params::opt;
}
synchronize_threads_in_block();
tid = threadIdx.x;
for (int i = 0; i < params::opt; i++) {
accumulator_slice[tid] = result[i];
tid += params::degree / params::opt;
}
synchronize_threads_in_block();
// Copy to the mask of the LWE sample
tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt; i++) {
lwe_array_out_slice[tid] = accumulator_slice[tid];
tid = tid + params::degree / params::opt;
}
}
}
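// Worked example (added for clarity, not in the original file): extracting
// the constant coefficient from a single mask polynomial [a0, a1, a2, a3]
// (degree 4). Negation gives [-a0, -a1, -a2, -a3], reversal gives
// [-a3, -a2, -a1, -a0], and the final negacyclic multiplication by X gives
// [a0, -a3, -a2, -a1], which is the LWE mask expected from sample extraction
// at index 0 (mask_j = a_0 for j = 0 and -a_{N-j} otherwise).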
#endif

View File

@@ -1,106 +0,0 @@
#ifndef CNCRT_PARAMETERS_H
#define CNCRT_PARAMETERS_H
constexpr int log2(int n) { return (n <= 2) ? 1 : 1 + log2(n / 2); }
constexpr int choose_opt_amortized(int degree) {
if (degree <= 1024)
return 4;
else if (degree == 2048)
return 8;
else if (degree == 4096)
return 16;
else if (degree == 8192)
return 32;
else
return 64;
}
constexpr int choose_opt(int degree) {
if (degree <= 1024)
return 4;
else if (degree == 2048)
return 4;
else if (degree == 4096)
return 4;
else if (degree == 8192)
return 8;
else if (degree == 16384)
return 16;
else
return 64;
}
template <class params> class HalfDegree {
public:
constexpr static int degree = params::degree / 2;
constexpr static int opt = params::opt / 2;
constexpr static int log2_degree = params::log2_degree - 1;
constexpr static int quarter = params::quarter / 2;
constexpr static int half = params::half / 2;
constexpr static int three_quarters = quarter + half;
constexpr static int warp = 32;
constexpr static int fft_sm_required = degree + degree / warp;
};
template <int N> class Degree {
public:
constexpr static int degree = N;
constexpr static int opt = choose_opt(N);
constexpr static int log2_degree = log2(N);
constexpr static int quarter = N / 4;
constexpr static int half = N / 2;
constexpr static int three_quarters = half + quarter;
constexpr static int warp = 32;
constexpr static int fft_sm_required = N + 32;
};
template <int N> class AmortizedDegree {
public:
constexpr static int degree = N;
constexpr static int opt = choose_opt_amortized(N);
constexpr static int log2_degree = log2(N);
constexpr static int quarter = N / 4;
constexpr static int half = N / 2;
constexpr static int three_quarters = half + quarter;
constexpr static int warp = 32;
constexpr static int fft_sm_required = N + 32;
};
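// Illustrative compile-time check (not part of the original file): what the
// parameter classes expand to for the 2048 case used throughout this backend.
static_assert(Degree<2048>::opt == 4, "choose_opt(2048) == 4");
static_assert(Degree<2048>::log2_degree == 11, "log2(2048) == 11");
static_assert(Degree<2048>::half == 1024 && Degree<2048>::quarter == 512 &&
                  Degree<2048>::three_quarters == 1536,
              "fractional sizes of the 2048 case");
static_assert(AmortizedDegree<2048>::opt == 8,
              "the amortized variant processes more coefficients per thread");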
enum sharedMemDegree {
NOSM = 0,
PARTIALSM = 1,
FULLSM = 2
};
class ForwardFFT {
public:
constexpr static int direction = 0;
};
class BackwardFFT {
public:
constexpr static int direction = 1;
};
class ReorderFFT {
constexpr static int reorder = 1;
};
class NoReorderFFT {
constexpr static int reorder = 0;
};
template <class params, class direction, class reorder = ReorderFFT>
class FFTDegree : public params {
public:
constexpr static int fft_direction = direction::direction;
constexpr static int fft_reorder = reorder::reorder;
};
template <int N, class direction, class reorder = ReorderFFT>
class FFTParams : public Degree<N> {
public:
constexpr static int fft_direction = direction::direction;
constexpr static int fft_reorder = reorder::reorder;
};
#endif // CNCRT_PARAMETERS_H

View File

@@ -1,259 +0,0 @@
#ifndef CNCRT_POLYNOMIAL_H
#define CNCRT_POLYNOMIAL_H
#include "complex/operations.cuh"
#include "crypto/torus.cuh"
#include "device.h"
#include "fft/bnsmfft.cuh"
#include "parameters.cuh"
#include "utils/timer.cuh"
#include <cassert>
#include <cstdint>
#define PI 3.141592653589793238462643383279502884197
template <typename T>
__device__ T *get_chunk(T *data, int chunk_num, int chunk_size) {
int pos = chunk_num * chunk_size;
T *ptr = &data[pos];
return ptr;
}
class ExtraMemory {
public:
uint32_t m_size;
__device__ ExtraMemory(uint32_t size) : m_size(size) {}
};
template <typename T, class params> class Polynomial;
template <typename T, class params> class Vector;
template <typename FT, class params> class Twiddles;
template <typename T, class params> class Polynomial {
public:
T *coefficients;
uint32_t degree;
__device__ Polynomial(T *coefficients, uint32_t degree)
: coefficients(coefficients), degree(degree) {}
__device__ Polynomial(int8_t *memory, uint32_t degree)
: coefficients((T *)memory), degree(degree) {}
__host__ void copy_to_host(T *dest) {
cudaMemcpyAsync(dest, this->coefficients, sizeof(T) * params::degree,
cudaMemcpyDeviceToHost);
}
__device__ T get_coefficient(int i) { return this->coefficients[i]; }
__device__ int8_t *reuse_memory() { return (int8_t *)coefficients; }
__device__ void copy_coefficients_from(Polynomial<T, params> &source,
int begin_dest = 0,
int begin_src = 0) {
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt; i++) {
this->coefficients[tid + begin_dest] = source.coefficients[tid];
tid = tid + params::degree / params::opt;
}
}
__device__ void fill_with(T value) {
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt; i++) {
coefficients[tid] = value;
tid += params::degree / params::opt;
}
}
__device__ void round_to_closest_multiple_inplace(uint32_t base_log,
uint32_t level_count) {
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt; i++) {
T x = coefficients[tid];
T shift = sizeof(T) * 8 - level_count * base_log;
T mask = 1ll << (shift - 1);
T b = (x & mask) >> (shift - 1);
T res = x >> shift;
res += b;
res <<= shift;
coefficients[tid] = res;
tid = tid + params::degree / params::opt;
}
}
__device__ void multiply_by_scalar_inplace(T scalar) {
int tid = threadIdx.x;
const int grid_dim = blockDim.x;
const int slices = params::degree / grid_dim;
const int jump = grid_dim;
for (int i = 0; i < slices; i++) {
this->coefficients[tid] *= scalar;
tid += jump;
}
}
__device__ void add_scalar_inplace(T scalar) {
int tid = threadIdx.x;
const int grid_dim = blockDim.x;
const int slices = params::degree / grid_dim;
const int jump = grid_dim;
for (int i = 0; i < slices; i++) {
this->coefficients[tid] += scalar;
tid += jump;
}
}
__device__ void sub_scalar_inplace(T scalar) {
int tid = threadIdx.x;
const int grid_dim = blockDim.x;
const int slices = params::degree / grid_dim;
const int jump = grid_dim;
for (int i = 0; i < slices; i++) {
this->coefficients[tid] -= scalar;
tid += jump;
}
}
__device__ void sub_polynomial_inplace(Polynomial<T, params> &rhs) {
int tid = threadIdx.x;
const int grid_dim = blockDim.x;
const int slices = params::degree / grid_dim;
const int jump = grid_dim;
for (int i = 0; i < slices; i++) {
this->coefficients[tid] -= rhs.coefficients[tid];
tid += jump;
}
}
__device__ void negate_inplace() {
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt; i++) {
coefficients[tid] = -coefficients[tid];
tid = tid + params::degree / params::opt;
}
synchronize_threads_in_block();
}
__device__ void copy_into(Vector<T, params> &vec) {
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt; i++) {
vec.m_data[tid] = coefficients[tid];
tid = tid + params::degree / params::opt;
}
}
__device__ void copy_reversed_into(Vector<T, params> &vec) {
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt; i++) {
vec.m_data[tid] = coefficients[params::degree - tid - 1];
tid = tid + params::degree / params::opt;
}
}
__device__ void reverse_inplace() {
int tid = threadIdx.x;
T result[params::opt];
#pragma unroll
for (int i = 0; i < params::opt; i++) {
result[i] = coefficients[params::degree - tid - 1];
tid = tid + params::degree / params::opt;
}
synchronize_threads_in_block();
tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt; i++) {
coefficients[tid] = result[i];
tid = tid + params::degree / params::opt;
}
synchronize_threads_in_block();
}
};
template <typename T, class params> class Vector {
public:
T *m_data;
uint32_t m_size;
__device__ Vector(T *elements, uint32_t size)
: m_data(elements), m_size(size) {}
__host__ Vector() {}
__device__ T &operator[](int i) { return m_data[i]; }
__device__ Vector<T, params> get_chunk(int chunk_num, int chunk_size) {
int pos = chunk_num * chunk_size;
T *ptr = &m_data[pos];
return Vector<T, params>(ptr, chunk_size);
}
__host__ void copy_to_device(T *source, uint32_t elements) {
cudaMemcpyAsync(m_data, source, sizeof(T) * elements,
cudaMemcpyHostToDevice);
}
__host__ void copy_to_host(T *dest) {
cudaMemcpyAsync(dest, m_data, sizeof(T) * m_size, cudaMemcpyDeviceToHost);
}
__host__ void copy_to_host(T *dest, int elements) {
cudaMemcpyAsync(dest, m_data, sizeof(T) * elements, cudaMemcpyDeviceToHost);
}
__device__ T get_ith_element(int i) { return m_data[i]; }
__device__ T get_last_element() { return m_data[m_size - 1]; }
__device__ void set_last_element(T elem) { m_data[m_size - 1] = elem; }
__device__ void operator-=(const Vector<T, params> &rhs) {
    assert(m_size == rhs.m_size);
int tid = threadIdx.x;
int pos = tid;
int total = m_size / blockDim.x + 1;
for (int i = 0; i < total; i++) {
if (pos < m_size)
m_data[pos] -= rhs.m_data[pos];
pos += blockDim.x;
}
}
__device__ void operator*=(const T &rhs) {
int tid = threadIdx.x;
int pos = tid;
int total = m_size / blockDim.x + 1;
for (int i = 0; i < total; i++) {
if (pos < m_size)
m_data[pos] *= rhs;
pos += blockDim.x;
}
}
};
template <typename FT, class params> class Twiddles {
public:
Vector<FT, params> twiddles2, twiddles3, twiddles4, twiddles5, twiddles6,
twiddles7, twiddles8, twiddles9, twiddles10;
__device__
Twiddles(Vector<FT, params> &twiddles2, Vector<FT, params> &twiddles3,
Vector<FT, params> &twiddles4, Vector<FT, params> &twiddles5,
Vector<FT, params> &twiddles6, Vector<FT, params> &twiddles7,
Vector<FT, params> &twiddles8, Vector<FT, params> &twiddles9,
Vector<FT, params> &twiddles10)
: twiddles2(twiddles2), twiddles3(twiddles3), twiddles4(twiddles4),
twiddles5(twiddles5), twiddles6(twiddles6), twiddles7(twiddles7),
twiddles8(twiddles8), twiddles9(twiddles9), twiddles10(twiddles10) {}
};
#endif // CNCRT_POLYNOMIAL_H

View File

@@ -1,76 +0,0 @@
#ifndef CNCRT_POLYNOMIAL_MATH_H
#define CNCRT_POLYNOMIAL_MATH_H
#include "crypto/torus.cuh"
#include "parameters.cuh"
#include "polynomial.cuh"
template <typename FT, class params>
__device__ void sub_polynomial(FT *result, FT *first, FT *second) {
int tid = threadIdx.x;
for (int i = 0; i < params::opt; i++) {
result[tid] = first[tid] - second[tid];
tid += params::degree / params::opt;
}
}
template <class params, typename T>
__device__ void polynomial_product_in_fourier_domain(T *result, T *first,
T *second) {
int tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
result[tid] = first[tid] * second[tid];
tid += params::degree / params::opt;
}
if (threadIdx.x == 0) {
result[params::degree / 2] =
first[params::degree / 2] * second[params::degree / 2];
}
}
// Computes result += first * second
// If init_accumulator is set, assumes that result was not initialized and does
// that with the outcome of first * second
template <class params, typename T>
__device__ void
polynomial_product_accumulate_in_fourier_domain(T *result, T *first, T *second,
bool init_accumulator = false) {
int tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
if (init_accumulator)
result[tid] = first[tid] * second[tid];
else
result[tid] += first[tid] * second[tid];
tid += params::degree / params::opt;
}
}
// If init_accumulator is set, assumes that result was not initialized and does
// that with the outcome of first * second
template <typename T, class params>
__device__ void
polynomial_product_accumulate_by_monomial(T *result, T *poly,
uint64_t monomial_degree,
bool init_accumulator = false) {
// monomial_degree \in [0, 2 * params::degree)
int full_cycles_count = monomial_degree / params::degree;
int remainder_degrees = monomial_degree % params::degree;
int pos = threadIdx.x;
for (int i = 0; i < params::opt; i++) {
T element = poly[pos];
int new_pos = (pos + monomial_degree) % params::degree;
T x = SEL(element, -element, full_cycles_count % 2); // monomial coefficient
x = SEL(-x, x, new_pos >= remainder_degrees);
if (init_accumulator)
result[new_pos] = x;
else
result[new_pos] += x;
pos += params::degree / params::opt;
}
}
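// Worked example (added for clarity, not in the original file): with
// params::degree = 4 and monomial_degree = 5, full_cycles_count = 1 and
// remainder_degrees = 1, so poly * X^5 = -(poly * X) in the negacyclic ring.
// Each coefficient at position pos moves to (pos + 5) % 4; it is negated once
// because the full X^4 cycle contributes a factor of -1, and negated a second
// time when the remaining rotation wraps it around (new_pos <
// remainder_degrees), e.g. [a0, a1, a2, a3] becomes [a3, -a0, -a1, -a2].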
#endif // CNCRT_POLYNOMIAL_MATH_H

View File

@@ -1 +0,0 @@
#include "polynomial.cuh"

View File

@@ -1,76 +0,0 @@
#ifndef CNCRT_INT128_H
#define CNCRT_INT128_H
// abseil's int128 type
// licensed under Apache license
class uint128 {
public:
__device__ uint128(uint64_t high, uint64_t low) : hi_(high), lo_(low) {}
uint64_t lo_;
uint64_t hi_;
};
class int128 {
public:
int128() = default;
__device__ operator unsigned long long() const {
return static_cast<unsigned long long>(lo_);
}
__device__ int128(int64_t high, uint64_t low) : hi_(high), lo_(low) {}
uint64_t lo_;
int64_t hi_;
};
__device__ inline uint128 make_uint128(uint64_t high, uint64_t low) {
return uint128(high, low);
}
template <typename T> __device__ uint128 make_uint128_from_float(T v) {
if (v >= ldexp(static_cast<T>(1), 64)) {
uint64_t hi = static_cast<uint64_t>(ldexp(v, -64));
uint64_t lo = static_cast<uint64_t>(v - ldexp(static_cast<T>(hi), 64));
return make_uint128(hi, lo);
}
return make_uint128(0, static_cast<uint64_t>(v));
}
__device__ inline int128 make_int128(int64_t high, uint64_t low) {
return int128(high, low);
}
__device__ inline int64_t bitcast_to_signed(uint64_t v) {
return v & (uint64_t{1} << 63) ? ~static_cast<int64_t>(~v)
: static_cast<int64_t>(v);
}
__device__ inline uint64_t uint128_high64(uint128 v) { return v.hi_; }
__device__ inline uint64_t uint128_low64(uint128 v) { return v.lo_; }
__device__ __forceinline__ uint128 operator-(uint128 val) {
uint64_t hi = ~uint128_high64(val);
uint64_t lo = ~uint128_low64(val) + 1;
if (lo == 0)
++hi; // carry
return make_uint128(hi, lo);
}
template <typename T> __device__ int128 make_int128_from_float(T v) {
// We must convert the absolute value and then negate as needed, because
// floating point types are typically sign-magnitude. Otherwise, the
// difference between the high and low 64 bits when interpreted as two's
// complement overwhelms the precision of the mantissa.
uint128 result =
v < 0 ? -make_uint128_from_float(-v) : make_uint128_from_float(v);
return make_int128(bitcast_to_signed(uint128_high64(result)),
uint128_low64(result));
}
#endif

View File

@@ -1,21 +0,0 @@
#ifndef KERNEL_DIMENSIONS_H
#define KERNEL_DIMENSIONS_H
inline int nextPow2(int x) {
--x;
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
return ++x;
}
inline void getNumBlocksAndThreads(const int n, const int maxBlockSize,
int &blocks, int &threads) {
threads =
(n < maxBlockSize * 2) ? max(128, nextPow2((n + 1) / 2)) : maxBlockSize;
blocks = (n + threads - 1) / threads;
}
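// Worked example (added for clarity, not in the original file): for
// n = 3000 entries and maxBlockSize = 512, n >= 2 * maxBlockSize, so
// threads = 512 and blocks = ceil(3000 / 512) = 6. For a small n = 100,
// nextPow2((100 + 1) / 2) = nextPow2(50) = 64, threads = max(128, 64) = 128
// and blocks = 1.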
#endif // KERNEL_DIMENSIONS_H

View File

@@ -1,30 +0,0 @@
#ifndef CNCRT_TIMER_H
#define CNCRT_TIMER_H
#include <iostream>
#define synchronize_threads_in_block() __syncthreads()
template <bool active> class CudaMeasureExecution {
public:
cudaEvent_t m_start, m_stop;
__host__ CudaMeasureExecution() {
if constexpr (active) {
cudaEventCreate(&m_start);
cudaEventCreate(&m_stop);
cudaEventRecord(m_start);
}
}
__host__ ~CudaMeasureExecution() {
if constexpr (active) {
float ms;
cudaEventRecord(m_stop);
cudaEventSynchronize(m_stop);
cudaEventElapsedTime(&ms, m_start, m_stop);
std::cout << "Execution took " << ms << "ms" << std::endl;
}
}
};
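// Illustrative usage sketch (not part of the original file): the class is an
// RAII guard, so wrapping a region in a scope prints its execution time when
// the guard is destroyed, and instantiating it with <false> compiles the
// measurement away. `some_kernel`, `grid` and `block` are placeholders.
//
//   {
//     CudaMeasureExecution<true> measure; // records the start event
//     some_kernel<<<grid, block>>>(/* args */);
//   } // records the stop event, synchronizes on it and prints the elapsed ms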
#endif // CNCRT_TIMER_H

View File

@@ -1,482 +0,0 @@
#include "vertical_packing.cuh"
#include "vertical_packing.h"
#include <cassert>
/*
* Runs standard checks to validate the inputs
*/
void checks_fast_cmux_tree(int polynomial_size) {
assert((
"Error (GPU Cmux tree): polynomial size should be one of 256, 512, 1024, "
"2048, 4096, 8192",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
}
/*
* Runs standard checks to validate the inputs
*/
void checks_cmux_tree(int nbits, int polynomial_size, int base_log) {
assert(("Error (GPU Cmux tree): base log should be <= nbits",
base_log <= nbits));
checks_fast_cmux_tree(polynomial_size);
}
/*
* Runs standard checks to validate the inputs
*/
void checks_blind_rotation_and_sample_extraction(int polynomial_size) {
assert(("Error (GPU Blind rotation + sample extraction): polynomial size "
"should be one of 256, 512, 1024, 2048, 4096, 8192",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
}
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the Cmux tree on 32 bits inputs, into `cmux_tree_buffer`. It also configures
* SM options on the GPU in case FULLSM mode is going to be used.
*/
void scratch_cuda_cmux_tree_32(void *v_stream, uint32_t gpu_index,
int8_t **cmux_tree_buffer,
uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t lut_vector_size, uint32_t tau,
uint32_t max_shared_memory,
bool allocate_gpu_memory) {
checks_fast_cmux_tree(polynomial_size);
switch (polynomial_size) {
case 256:
scratch_cmux_tree<uint32_t, int32_t, Degree<256>>(
v_stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size,
level_count, lut_vector_size, tau, max_shared_memory,
allocate_gpu_memory);
break;
case 512:
scratch_cmux_tree<uint32_t, int32_t, Degree<512>>(
v_stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size,
level_count, lut_vector_size, tau, max_shared_memory,
allocate_gpu_memory);
break;
case 1024:
scratch_cmux_tree<uint32_t, int32_t, Degree<1024>>(
v_stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size,
level_count, lut_vector_size, tau, max_shared_memory,
allocate_gpu_memory);
break;
case 2048:
scratch_cmux_tree<uint32_t, int32_t, Degree<2048>>(
v_stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size,
level_count, lut_vector_size, tau, max_shared_memory,
allocate_gpu_memory);
break;
case 4096:
scratch_cmux_tree<uint32_t, int32_t, Degree<4096>>(
v_stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size,
level_count, lut_vector_size, tau, max_shared_memory,
allocate_gpu_memory);
break;
case 8192:
scratch_cmux_tree<uint32_t, int32_t, Degree<8192>>(
v_stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size,
level_count, lut_vector_size, tau, max_shared_memory,
allocate_gpu_memory);
break;
default:
break;
}
}
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the Cmux tree on 64 bits inputs, into `cmux_tree_buffer`. It also configures
* SM options on the GPU in case FULLSM mode is going to be used.
*/
void scratch_cuda_cmux_tree_64(void *v_stream, uint32_t gpu_index,
int8_t **cmux_tree_buffer,
uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t lut_vector_size, uint32_t tau,
uint32_t max_shared_memory,
bool allocate_gpu_memory) {
checks_fast_cmux_tree(polynomial_size);
switch (polynomial_size) {
case 256:
scratch_cmux_tree<uint64_t, int64_t, Degree<256>>(
v_stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size,
level_count, lut_vector_size, tau, max_shared_memory,
allocate_gpu_memory);
break;
case 512:
scratch_cmux_tree<uint64_t, int64_t, Degree<512>>(
v_stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size,
level_count, lut_vector_size, tau, max_shared_memory,
allocate_gpu_memory);
break;
case 1024:
scratch_cmux_tree<uint64_t, int64_t, Degree<1024>>(
v_stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size,
level_count, lut_vector_size, tau, max_shared_memory,
allocate_gpu_memory);
break;
case 2048:
scratch_cmux_tree<uint64_t, int64_t, Degree<2048>>(
v_stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size,
level_count, lut_vector_size, tau, max_shared_memory,
allocate_gpu_memory);
break;
case 4096:
scratch_cmux_tree<uint64_t, int64_t, Degree<4096>>(
v_stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size,
level_count, lut_vector_size, tau, max_shared_memory,
allocate_gpu_memory);
break;
case 8192:
scratch_cmux_tree<uint64_t, int64_t, Degree<8192>>(
v_stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size,
level_count, lut_vector_size, tau, max_shared_memory,
allocate_gpu_memory);
break;
default:
break;
}
}
/*
* Perform cmux tree on a batch of 32-bit input GGSW ciphertexts.
* Check the equivalent function for 64-bit inputs for more details.
*/
void cuda_cmux_tree_32(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
void *ggsw_in, void *lut_vector,
int8_t *cmux_tree_buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t lut_vector_size,
uint32_t tau, uint32_t max_shared_memory) {
checks_cmux_tree(32, polynomial_size, base_log);
switch (polynomial_size) {
case 256:
host_cmux_tree<uint32_t, int32_t, Degree<256>>(
v_stream, gpu_index, (uint32_t *)glwe_array_out, (uint32_t *)ggsw_in,
(uint32_t *)lut_vector, cmux_tree_buffer, glwe_dimension,
polynomial_size, base_log, level_count, lut_vector_size, tau,
max_shared_memory);
break;
case 512:
host_cmux_tree<uint32_t, int32_t, Degree<512>>(
v_stream, gpu_index, (uint32_t *)glwe_array_out, (uint32_t *)ggsw_in,
(uint32_t *)lut_vector, cmux_tree_buffer, glwe_dimension,
polynomial_size, base_log, level_count, lut_vector_size, tau,
max_shared_memory);
break;
case 1024:
host_cmux_tree<uint32_t, int32_t, Degree<1024>>(
v_stream, gpu_index, (uint32_t *)glwe_array_out, (uint32_t *)ggsw_in,
(uint32_t *)lut_vector, cmux_tree_buffer, glwe_dimension,
polynomial_size, base_log, level_count, lut_vector_size, tau,
max_shared_memory);
break;
case 2048:
host_cmux_tree<uint32_t, int32_t, Degree<2048>>(
v_stream, gpu_index, (uint32_t *)glwe_array_out, (uint32_t *)ggsw_in,
(uint32_t *)lut_vector, cmux_tree_buffer, glwe_dimension,
polynomial_size, base_log, level_count, lut_vector_size, tau,
max_shared_memory);
break;
case 4096:
host_cmux_tree<uint32_t, int32_t, Degree<4096>>(
v_stream, gpu_index, (uint32_t *)glwe_array_out, (uint32_t *)ggsw_in,
(uint32_t *)lut_vector, cmux_tree_buffer, glwe_dimension,
polynomial_size, base_log, level_count, lut_vector_size, tau,
max_shared_memory);
break;
case 8192:
host_cmux_tree<uint32_t, int32_t, Degree<8192>>(
v_stream, gpu_index, (uint32_t *)glwe_array_out, (uint32_t *)ggsw_in,
(uint32_t *)lut_vector, cmux_tree_buffer, glwe_dimension,
polynomial_size, base_log, level_count, lut_vector_size, tau,
max_shared_memory);
break;
default:
break;
}
}
/*
* Perform Cmux tree on a batch of 64-bit input GGSW ciphertexts
* - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
* launch
* - `gpu_index` is the index of the GPU to be used in the kernel launch
 * - 'glwe_array_out' output batch of GLWE buffers for the Cmux tree; 'tau'
 * GLWE ciphertexts will be output by the function
* - 'ggsw_in' batch of input GGSW ciphertexts, function expects 'r' GGSW
* ciphertexts as input.
 * - 'lut_vector' batch of test vectors (LUTs); there should be 2^r LUTs
 * inside the 'lut_vector' parameter
* - 'glwe_dimension' GLWE dimension, supported values: {1}
* - 'polynomial_size' size of the test polynomial, supported values: {512,
* 1024, 2048, 4096, 8192}
* - 'base_log' base log parameter for cmux block
* - 'level_count' decomposition level for cmux block
* - 'lut_vector_size' number of elements in lut_vector
 * - 'tau' number of input LWE ciphertexts that were used to generate the GGSW
 * ciphertexts stored in 'ggsw_in'; it is also the number of output GLWE
 * ciphertexts
* - 'max_shared_memory' maximum shared memory amount to be used for cmux
* kernel
*
* This function calls a wrapper to a device kernel that performs the
* Cmux tree. The kernel is templatized based on integer discretization and
* polynomial degree.
*/
void cuda_cmux_tree_64(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
void *ggsw_in, void *lut_vector,
int8_t *cmux_tree_buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t lut_vector_size,
uint32_t tau, uint32_t max_shared_memory) {
checks_cmux_tree(64, polynomial_size, base_log);
switch (polynomial_size) {
case 256:
host_cmux_tree<uint64_t, int64_t, Degree<256>>(
v_stream, gpu_index, (uint64_t *)glwe_array_out, (uint64_t *)ggsw_in,
(uint64_t *)lut_vector, cmux_tree_buffer, glwe_dimension,
polynomial_size, base_log, level_count, lut_vector_size, tau,
max_shared_memory);
break;
case 512:
host_cmux_tree<uint64_t, int64_t, Degree<512>>(
v_stream, gpu_index, (uint64_t *)glwe_array_out, (uint64_t *)ggsw_in,
(uint64_t *)lut_vector, cmux_tree_buffer, glwe_dimension,
polynomial_size, base_log, level_count, lut_vector_size, tau,
max_shared_memory);
break;
case 1024:
host_cmux_tree<uint64_t, int64_t, Degree<1024>>(
v_stream, gpu_index, (uint64_t *)glwe_array_out, (uint64_t *)ggsw_in,
(uint64_t *)lut_vector, cmux_tree_buffer, glwe_dimension,
polynomial_size, base_log, level_count, lut_vector_size, tau,
max_shared_memory);
break;
case 2048:
host_cmux_tree<uint64_t, int64_t, Degree<2048>>(
v_stream, gpu_index, (uint64_t *)glwe_array_out, (uint64_t *)ggsw_in,
(uint64_t *)lut_vector, cmux_tree_buffer, glwe_dimension,
polynomial_size, base_log, level_count, lut_vector_size, tau,
max_shared_memory);
break;
case 4096:
host_cmux_tree<uint64_t, int64_t, Degree<4096>>(
v_stream, gpu_index, (uint64_t *)glwe_array_out, (uint64_t *)ggsw_in,
(uint64_t *)lut_vector, cmux_tree_buffer, glwe_dimension,
polynomial_size, base_log, level_count, lut_vector_size, tau,
max_shared_memory);
break;
case 8192:
host_cmux_tree<uint64_t, int64_t, Degree<8192>>(
v_stream, gpu_index, (uint64_t *)glwe_array_out, (uint64_t *)ggsw_in,
(uint64_t *)lut_vector, cmux_tree_buffer, glwe_dimension,
polynomial_size, base_log, level_count, lut_vector_size, tau,
max_shared_memory);
break;
default:
break;
}
}
/*
* This cleanup function frees the data for the Cmux tree on GPU in
* cmux_tree_buffer for 32 or 64 bits inputs.
*/
void cleanup_cuda_cmux_tree(void *v_stream, uint32_t gpu_index,
int8_t **cmux_tree_buffer) {
auto stream = static_cast<cudaStream_t *>(v_stream);
// Free memory
cuda_drop_async(*cmux_tree_buffer, stream, gpu_index);
}
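// Illustrative usage sketch (not part of the original file): the expected
// call sequence for the 64-bit Cmux tree defined above. All device buffers
// and parameter values are placeholders supplied by the caller.
void example_cmux_tree_64(void *d_glwe_out, void *d_ggsw_in, void *d_lut,
                          uint32_t glwe_dimension, uint32_t polynomial_size,
                          uint32_t base_log, uint32_t level_count,
                          uint32_t lut_vector_size, uint32_t tau,
                          uint32_t max_shared_memory) {
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  int8_t *cmux_tree_buffer = nullptr;
  scratch_cuda_cmux_tree_64(&stream, /*gpu_index=*/0, &cmux_tree_buffer,
                            glwe_dimension, polynomial_size, level_count,
                            lut_vector_size, tau, max_shared_memory,
                            /*allocate_gpu_memory=*/true);
  cuda_cmux_tree_64(&stream, /*gpu_index=*/0, d_glwe_out, d_ggsw_in, d_lut,
                    cmux_tree_buffer, glwe_dimension, polynomial_size,
                    base_log, level_count, lut_vector_size, tau,
                    max_shared_memory);
  cleanup_cuda_cmux_tree(&stream, /*gpu_index=*/0, &cmux_tree_buffer);
  cudaStreamSynchronize(stream);
  cudaStreamDestroy(stream);
}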
/*
 * This scratch function allocates the necessary amount of data on the GPU for
 * the blind rotation and sample extraction on 32 bits inputs, into
 * `br_se_buffer`. It also configures SM options on the GPU in case FULLSM
 * mode is going to be used.
*/
void scratch_cuda_blind_rotation_sample_extraction_32(
void *v_stream, uint32_t gpu_index, int8_t **br_se_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t mbr_size, uint32_t tau, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
checks_blind_rotation_and_sample_extraction(polynomial_size);
switch (polynomial_size) {
case 256:
scratch_blind_rotation_sample_extraction<uint32_t, int32_t, Degree<256>>(
v_stream, gpu_index, br_se_buffer, glwe_dimension, polynomial_size,
level_count, mbr_size, tau, max_shared_memory, allocate_gpu_memory);
break;
case 512:
scratch_blind_rotation_sample_extraction<uint32_t, int32_t, Degree<512>>(
v_stream, gpu_index, br_se_buffer, glwe_dimension, polynomial_size,
level_count, mbr_size, tau, max_shared_memory, allocate_gpu_memory);
break;
case 1024:
scratch_blind_rotation_sample_extraction<uint32_t, int32_t, Degree<1024>>(
v_stream, gpu_index, br_se_buffer, glwe_dimension, polynomial_size,
level_count, mbr_size, tau, max_shared_memory, allocate_gpu_memory);
break;
case 2048:
scratch_blind_rotation_sample_extraction<uint32_t, int32_t, Degree<2048>>(
v_stream, gpu_index, br_se_buffer, glwe_dimension, polynomial_size,
level_count, mbr_size, tau, max_shared_memory, allocate_gpu_memory);
break;
case 4096:
scratch_blind_rotation_sample_extraction<uint32_t, int32_t, Degree<4096>>(
v_stream, gpu_index, br_se_buffer, glwe_dimension, polynomial_size,
level_count, mbr_size, tau, max_shared_memory, allocate_gpu_memory);
break;
case 8192:
scratch_blind_rotation_sample_extraction<uint32_t, int32_t, Degree<8192>>(
v_stream, gpu_index, br_se_buffer, glwe_dimension, polynomial_size,
level_count, mbr_size, tau, max_shared_memory, allocate_gpu_memory);
break;
default:
break;
}
}
/*
 * This scratch function allocates the necessary amount of data on the GPU for
 * the blind rotation and sample extraction on 64 bits inputs, into
 * `br_se_buffer`. It also configures SM options on the GPU in case FULLSM
 * mode is going to be used.
*/
void scratch_cuda_blind_rotation_sample_extraction_64(
void *v_stream, uint32_t gpu_index, int8_t **br_se_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t mbr_size, uint32_t tau, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
checks_blind_rotation_and_sample_extraction(polynomial_size);
switch (polynomial_size) {
case 256:
scratch_blind_rotation_sample_extraction<uint64_t, int64_t, Degree<256>>(
v_stream, gpu_index, br_se_buffer, glwe_dimension, polynomial_size,
level_count, mbr_size, tau, max_shared_memory, allocate_gpu_memory);
break;
case 512:
scratch_blind_rotation_sample_extraction<uint64_t, int64_t, Degree<512>>(
v_stream, gpu_index, br_se_buffer, glwe_dimension, polynomial_size,
level_count, mbr_size, tau, max_shared_memory, allocate_gpu_memory);
break;
case 1024:
scratch_blind_rotation_sample_extraction<uint64_t, int64_t, Degree<1024>>(
v_stream, gpu_index, br_se_buffer, glwe_dimension, polynomial_size,
level_count, mbr_size, tau, max_shared_memory, allocate_gpu_memory);
break;
case 2048:
scratch_blind_rotation_sample_extraction<uint64_t, int64_t, Degree<2048>>(
v_stream, gpu_index, br_se_buffer, glwe_dimension, polynomial_size,
level_count, mbr_size, tau, max_shared_memory, allocate_gpu_memory);
break;
case 4096:
scratch_blind_rotation_sample_extraction<uint64_t, int64_t, Degree<4096>>(
v_stream, gpu_index, br_se_buffer, glwe_dimension, polynomial_size,
level_count, mbr_size, tau, max_shared_memory, allocate_gpu_memory);
break;
case 8192:
scratch_blind_rotation_sample_extraction<uint64_t, int64_t, Degree<8192>>(
v_stream, gpu_index, br_se_buffer, glwe_dimension, polynomial_size,
level_count, mbr_size, tau, max_shared_memory, allocate_gpu_memory);
break;
default:
break;
}
}
/*
* Performs blind rotation on batch of 64-bit input ggsw ciphertexts
* - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
* launch
* - `gpu_index` is the index of the GPU to be used in the kernel launch
* - 'lwe_out' batch of output lwe ciphertexts, there should be 'tau'
* ciphertexts inside 'lwe_out'
* - 'ggsw_in' batch of input ggsw ciphertexts, function expects 'mbr_size'
* ggsw ciphertexts inside 'ggsw_in'
* - 'lut_vector' list of test vectors, function expects 'tau' test vectors
* inside 'lut_vector' parameter
* - 'glwe_dimension' glwe dimension, supported values : {1}
* - 'polynomial_size' size of test polynomial supported sizes: {512, 1024,
* 2048, 4096, 8192}
* - 'base_log' base log parameter
* - 'l_gadget' decomposition level
 * - 'max_shared_memory' maximum amount of shared memory to be used in
 * device functions (kernels)
*
* This function calls a wrapper to a device kernel that performs the
* blind rotation and sample extraction. The kernel is templatized based on
* integer discretization and polynomial degree.
*/
void cuda_blind_rotate_and_sample_extraction_64(
void *v_stream, uint32_t gpu_index, void *lwe_out, void *ggsw_in,
void *lut_vector, int8_t *br_se_buffer, uint32_t mbr_size, uint32_t tau,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t l_gadget, uint32_t max_shared_memory) {
checks_blind_rotation_and_sample_extraction(polynomial_size);
switch (polynomial_size) {
case 256:
host_blind_rotate_and_sample_extraction<uint64_t, int64_t, Degree<256>>(
v_stream, gpu_index, (uint64_t *)lwe_out, (uint64_t *)ggsw_in,
(uint64_t *)lut_vector, br_se_buffer, mbr_size, tau, glwe_dimension,
polynomial_size, base_log, l_gadget, max_shared_memory);
break;
case 512:
host_blind_rotate_and_sample_extraction<uint64_t, int64_t, Degree<512>>(
v_stream, gpu_index, (uint64_t *)lwe_out, (uint64_t *)ggsw_in,
(uint64_t *)lut_vector, br_se_buffer, mbr_size, tau, glwe_dimension,
polynomial_size, base_log, l_gadget, max_shared_memory);
break;
case 1024:
host_blind_rotate_and_sample_extraction<uint64_t, int64_t, Degree<1024>>(
v_stream, gpu_index, (uint64_t *)lwe_out, (uint64_t *)ggsw_in,
(uint64_t *)lut_vector, br_se_buffer, mbr_size, tau, glwe_dimension,
polynomial_size, base_log, l_gadget, max_shared_memory);
break;
case 2048:
host_blind_rotate_and_sample_extraction<uint64_t, int64_t, Degree<2048>>(
v_stream, gpu_index, (uint64_t *)lwe_out, (uint64_t *)ggsw_in,
(uint64_t *)lut_vector, br_se_buffer, mbr_size, tau, glwe_dimension,
polynomial_size, base_log, l_gadget, max_shared_memory);
break;
case 4096:
host_blind_rotate_and_sample_extraction<uint64_t, int64_t, Degree<4096>>(
v_stream, gpu_index, (uint64_t *)lwe_out, (uint64_t *)ggsw_in,
(uint64_t *)lut_vector, br_se_buffer, mbr_size, tau, glwe_dimension,
polynomial_size, base_log, l_gadget, max_shared_memory);
break;
case 8192:
host_blind_rotate_and_sample_extraction<uint64_t, int64_t, Degree<8192>>(
v_stream, gpu_index, (uint64_t *)lwe_out, (uint64_t *)ggsw_in,
(uint64_t *)lut_vector, br_se_buffer, mbr_size, tau, glwe_dimension,
polynomial_size, base_log, l_gadget, max_shared_memory);
break;
}
}
/*
* This cleanup function frees the data for the blind rotation and sample
* extraction on GPU in br_se_buffer for 32 or 64 bits inputs.
*/
void cleanup_cuda_blind_rotation_sample_extraction(void *v_stream,
uint32_t gpu_index,
int8_t **br_se_buffer) {
auto stream = static_cast<cudaStream_t *>(v_stream);
// Free memory
cuda_drop_async(*br_se_buffer, stream, gpu_index);
}

View File

@@ -1,615 +0,0 @@
#ifndef VERTICAL_PACKING_CUH
#define VERTICAL_PACKING_CUH
#include "bootstrap.h"
#include "complex/operations.cuh"
#include "crypto/gadget.cuh"
#include "crypto/ggsw.cuh"
#include "crypto/torus.cuh"
#include "device.h"
#include "fft/bnsmfft.cuh"
#include "fft/twiddles.cuh"
#include "polynomial/functions.cuh"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial.cuh"
#include "polynomial/polynomial_math.cuh"
#include "utils/timer.cuh"
/*
* Receives an array of GLWE ciphertexts and two indexes to ciphertexts in this
 * array, and an array of GGSW ciphertexts with an index to one ciphertext in
 * it. Computes a CMUX with these operands and writes the output to a particular
* index of glwe_array_out.
*
* This function needs polynomial_size threads per block.
*
* - glwe_array_out: An array where the result should be written to.
* - glwe_array_in: An array where the GLWE inputs are stored.
* - ggsw_in: An array where the GGSW input is stored. In the fourier domain.
* - selected_memory: An array to be used for the accumulators. Can be in the
* shared memory or global memory.
* - output_idx: The index of the output where the glwe ciphertext should be
* written.
* - input_idx1: The index of the first glwe ciphertext we will use.
* - input_idx2: The index of the second glwe ciphertext we will use.
* - glwe_dim: This is k.
* - polynomial_size: size of the polynomials. This is N.
* - base_log: log base used for the gadget matrix - B = 2^base_log (~8)
* - level_count: number of decomposition levels in the gadget matrix (~4)
* - ggsw_idx: The index of the GGSW we will use.
*/
template <typename Torus, typename STorus, class params>
__device__ void
cmux(Torus *glwe_array_out, Torus *glwe_array_in, double2 *ggsw_in,
int8_t *selected_memory, uint32_t output_idx, uint32_t input_idx1,
uint32_t input_idx2, uint32_t glwe_dim, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t ggsw_idx) {
// Define glwe_sub
Torus *glwe_sub = (Torus *)selected_memory;
double2 *res_fft =
(double2 *)glwe_sub +
(glwe_dim + 1) * polynomial_size / (sizeof(double2) / sizeof(Torus));
double2 *glwe_fft =
(double2 *)res_fft + (ptrdiff_t)((glwe_dim + 1) * polynomial_size / 2);
/////////////////////////////////////
// glwe2-glwe1
// Gets the pointers for the global memory
auto m0 = &glwe_array_in[input_idx1 * (glwe_dim + 1) * polynomial_size];
auto m1 = &glwe_array_in[input_idx2 * (glwe_dim + 1) * polynomial_size];
// Subtraction: m1-m0
for (int i = 0; i < (glwe_dim + 1); i++) {
auto glwe_sub_slice = glwe_sub + i * params::degree;
auto m0_slice = m0 + i * params::degree;
auto m1_slice = m1 + i * params::degree;
sub_polynomial<Torus, params>(glwe_sub_slice, m1_slice, m0_slice);
}
// Initialize the polynomial multiplication via FFT arrays
// The polynomial multiplications happens at the block level
// and each thread handles two or more coefficients
int pos = threadIdx.x;
for (int i = 0; i < (glwe_dim + 1); i++)
for (int j = 0; j < params::opt / 2; j++) {
res_fft[pos].x = 0;
res_fft[pos].y = 0;
pos += params::degree / params::opt;
}
synchronize_threads_in_block();
GadgetMatrix<Torus, params> gadget(base_log, level_count, glwe_sub,
glwe_dim + 1);
// Subtract each glwe operand, decompose the resulting
// polynomial coefficients to multiply each decomposed level
// with the corresponding part of the LUT
for (int level = level_count - 1; level >= 0; level--) {
// Decomposition
for (int i = 0; i < (glwe_dim + 1); i++) {
gadget.decompose_and_compress_next_polynomial(glwe_fft, i);
// First, perform the polynomial multiplication
NSMFFT_direct<HalfDegree<params>>(glwe_fft);
// External product and accumulate
// Get the piece necessary for the multiplication
auto bsk_slice = get_ith_mask_kth_block(
ggsw_in, ggsw_idx, i, level, polynomial_size, glwe_dim, level_count);
// Perform the coefficient-wise product
for (int j = 0; j < (glwe_dim + 1); j++) {
auto bsk_poly = bsk_slice + j * params::degree / 2;
auto res_fft_poly = res_fft + j * params::degree / 2;
polynomial_product_accumulate_in_fourier_domain<params, double2>(
res_fft_poly, glwe_fft, bsk_poly);
}
}
synchronize_threads_in_block();
}
// IFFT
synchronize_threads_in_block();
for (int i = 0; i < (glwe_dim + 1); i++) {
auto res_fft_slice = res_fft + i * params::degree / 2;
NSMFFT_inverse<HalfDegree<params>>(res_fft_slice);
}
synchronize_threads_in_block();
// Write the output
Torus *mb = &glwe_array_out[output_idx * (glwe_dim + 1) * polynomial_size];
int tid = threadIdx.x;
for (int i = 0; i < (glwe_dim + 1); i++)
for (int j = 0; j < params::opt; j++) {
mb[tid] = m0[tid];
tid += params::degree / params::opt;
}
for (int i = 0; i < (glwe_dim + 1); i++) {
auto res_fft_slice = res_fft + i * params::degree / 2;
auto mb_slice = mb + i * params::degree;
add_to_torus<Torus, params>(res_fft_slice, mb_slice);
}
}
// Converts an array of plaintexts to trivially encrypted GLWEs.
template <typename Torus, class params>
__host__ void
plaintext_to_glwe_array(Torus *lut_out, Torus *lut_in, uint32_t glwe_dimension,
uint32_t lut_vector_size, uint32_t number_of_trees,
cudaStream_t *stream) {
int r = log2(lut_vector_size) - params::log2_degree;
/*
* r < 0: No CMUX tree is needed, but the LUT is not big enough (i.e. has less
* than N elements).
*
* r == 0: No CMUX tree is needed and the LUT has exactly N
* elements.
*
* r > 0: CMUX tree is needed, so LUT is split in smaller LUTs of
* size lut_vector_size / num_lut.
*
* if r <= 0 we simply copy the LUT to lut_out, adding zeroes to the highest
* positions if needed.
*/
int num_lut = std::max(1, 1 << r);
check_cuda_error(cudaMemsetAsync(lut_out, 0,
num_lut * number_of_trees *
(glwe_dimension + 1) * params::degree *
sizeof(Torus),
*stream));
uint32_t small_lut_size = lut_vector_size / num_lut;
for (uint32_t i = 0; i < number_of_trees * num_lut; i++)
check_cuda_error(cudaMemcpyAsync(
lut_out + ((glwe_dimension + 1) * i + glwe_dimension) * params::degree,
lut_in + i * small_lut_size, small_lut_size * sizeof(Torus),
cudaMemcpyDeviceToDevice, *stream));
}
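// Worked example (added for clarity, not in the original file): with
// params::degree = 2048 (log2_degree = 11) and lut_vector_size = 8192,
// r = 13 - 11 = 2, so num_lut = 4 and each tree consumes 4 small LUTs of
// small_lut_size = 8192 / 4 = 2048 entries, each copied into the body
// polynomial of a trivial GLWE. With lut_vector_size <= 2048 (r <= 0),
// num_lut = 1 and the (possibly zero-padded) LUT is copied as a single
// trivial GLWE per tree.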
/**
* Computes several CMUXes using an array of GLWE ciphertexts and a single GGSW
* ciphertext. The GLWE ciphertexts are picked two-by-two in sequence. Each
* thread block computes a single CMUX.
*
* - glwe_array_out: An array where the result should be written to.
* - glwe_array_in: An array where the GLWE inputs are stored.
* - ggsw_in: An array where the GGSW input is stored. In the fourier domain.
 * - device_mem: A pointer to global memory, used in case the shared memory is
 * not big enough to store the accumulators.
* - device_memory_size_per_block: Memory size needed to store all accumulators
* for a single block.
* - glwe_dim: This is k.
* - polynomial_size: size of the polynomials. This is N.
* - base_log: log base used for the gadget matrix - B = 2^base_log (~8)
* - level_count: number of decomposition levels in the gadget matrix (~4)
* - ggsw_idx: The index of the GGSW we will use.
*/
template <typename Torus, typename STorus, class params, sharedMemDegree SMD>
__global__ void device_batch_cmux(Torus *glwe_array_out, Torus *glwe_array_in,
double2 *ggsw_in, int8_t *device_mem,
size_t device_memory_size_per_block,
uint32_t glwe_dim, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count,
uint32_t ggsw_idx, uint32_t num_lut) {
// We are running gridDim.y cmux trees in parallel
int tree_idx = blockIdx.y;
int tree_offset = tree_idx * num_lut * (glwe_dim + 1) * polynomial_size;
auto block_glwe_array_out = glwe_array_out + tree_offset;
auto block_glwe_array_in = glwe_array_in + tree_offset;
// The x-axis handles a single cmux tree. Each block computes one cmux.
int cmux_idx = blockIdx.x;
int output_idx = cmux_idx;
int input_idx1 = (cmux_idx << 1);
int input_idx2 = (cmux_idx << 1) + 1;
// We use shared memory for intermediate result
extern __shared__ int8_t sharedmem[];
int8_t *selected_memory;
if constexpr (SMD == FULLSM)
selected_memory = sharedmem;
else
selected_memory = &device_mem[(blockIdx.x + blockIdx.y * gridDim.x) *
device_memory_size_per_block];
cmux<Torus, STorus, params>(block_glwe_array_out, block_glwe_array_in,
ggsw_in, selected_memory, output_idx, input_idx1,
input_idx2, glwe_dim, polynomial_size, base_log,
level_count, ggsw_idx);
}
template <typename Torus>
__host__ __device__ uint64_t get_memory_needed_per_block_cmux_tree(
uint32_t glwe_dimension, uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size * (glwe_dimension + 1) + // glwe_sub
sizeof(double2) * polynomial_size / 2 *
(glwe_dimension + 1) + // res_fft
sizeof(double2) * polynomial_size / 2; // glwe_fft
}
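// Worked example (added for clarity, not in the original file): for
// Torus = uint64_t, glwe_dimension = 1 and polynomial_size = 2048,
// glwe_sub needs 8 * 2048 * 2 = 32768 bytes, res_fft needs
// 16 * 1024 * 2 = 32768 bytes and glwe_fft needs 16 * 1024 = 16384 bytes,
// i.e. 80 KB per block, which is why the FULLSM path raises the dynamic
// shared memory limit with cudaFuncSetAttribute in the scratch function below.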
template <typename Torus, typename params>
__host__ __device__ uint64_t get_buffer_size_cmux_tree(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t lut_vector_size, uint32_t tau, uint32_t max_shared_memory) {
int r = log2(lut_vector_size) - params::log2_degree;
if (r <= 0)
// A cmux tree is not needed
return 0;
uint64_t memory_needed_per_block =
get_memory_needed_per_block_cmux_tree<Torus>(glwe_dimension,
polynomial_size);
uint64_t num_lut = 1 << r;
uint64_t ggsw_size = polynomial_size * (glwe_dimension + 1) *
(glwe_dimension + 1) * level_count;
uint64_t glwe_size = (glwe_dimension + 1) * polynomial_size;
uint64_t device_mem = 0;
if (max_shared_memory < memory_needed_per_block) {
device_mem = memory_needed_per_block * (1 << (r - 1)) * tau;
}
if (max_shared_memory < polynomial_size * sizeof(double)) {
device_mem += polynomial_size * sizeof(double);
}
uint64_t buffer_size =
r * ggsw_size * sizeof(double) + // d_ggsw_fft_in
num_lut * tau * glwe_size * sizeof(Torus) + // d_buffer1
num_lut * tau * glwe_size * sizeof(Torus) + // d_buffer2
device_mem; // d_mem
return buffer_size + buffer_size % sizeof(double2);
}
template <typename Torus, typename STorus, typename params>
__host__ void
scratch_cmux_tree(void *v_stream, uint32_t gpu_index, int8_t **cmux_tree_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t lut_vector_size, uint32_t tau,
uint32_t max_shared_memory, bool allocate_gpu_memory) {
cudaSetDevice(gpu_index);
auto stream = static_cast<cudaStream_t *>(v_stream);
uint64_t memory_needed_per_block =
get_memory_needed_per_block_cmux_tree<Torus>(glwe_dimension,
polynomial_size);
if (max_shared_memory >= memory_needed_per_block) {
check_cuda_error(cudaFuncSetAttribute(
device_batch_cmux<Torus, STorus, params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, memory_needed_per_block));
check_cuda_error(
cudaFuncSetCacheConfig(device_batch_cmux<Torus, STorus, params, FULLSM>,
cudaFuncCachePreferShared));
}
if (allocate_gpu_memory) {
uint64_t buffer_size = get_buffer_size_cmux_tree<Torus, params>(
glwe_dimension, polynomial_size, level_count, lut_vector_size, tau,
max_shared_memory);
*cmux_tree_buffer =
(int8_t *)cuda_malloc_async(buffer_size, stream, gpu_index);
check_cuda_error(cudaGetLastError());
}
}
/*
 * This function launches the CMUX tree used by the hybrid packing of the
 * WoPBS.
 *
 * Uses shared memory for intermediate results when enough is available, and
 * global device memory otherwise.
*
* - v_stream: The CUDA stream that should be used.
* - glwe_array_out: A device array for the output GLWE ciphertext.
* - ggsw_in: A device array for the GGSW ciphertexts used in each layer.
* - lut_vector: A device array of cleartexts.
* - polynomial_size: size of the polynomials. This is N.
* - base_log: log base used for the gadget matrix - B = 2^base_log (~8)
* - level_count: number of decomposition levels in the gadget matrix (~4)
* - lut_vector_size: Number of elements in lut_vector
* - tau: The quantity of CMUX trees that should be executed
*/
template <typename Torus, typename STorus, class params>
__host__ void host_cmux_tree(void *v_stream, uint32_t gpu_index,
Torus *glwe_array_out, Torus *ggsw_in,
Torus *lut_vector, int8_t *cmux_tree_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count,
uint32_t lut_vector_size, uint32_t tau,
uint32_t max_shared_memory) {
cudaSetDevice(gpu_index);
auto stream = static_cast<cudaStream_t *>(v_stream);
if (lut_vector_size <= params::degree) {
// The LUT itself is the result
plaintext_to_glwe_array<Torus, params>(glwe_array_out, lut_vector,
glwe_dimension, lut_vector_size, tau,
stream);
return;
}
// r = tau * p - log2(N)
uint32_t r = log2(lut_vector_size) - params::log2_degree;
uint32_t num_lut = 1 << r;
uint64_t memory_needed_per_block =
get_memory_needed_per_block_cmux_tree<Torus>(glwe_dimension,
polynomial_size);
dim3 thds(polynomial_size / params::opt, 1, 1);
//////////////////////
int ggsw_size = polynomial_size * (glwe_dimension + 1) *
(glwe_dimension + 1) * level_count;
int glwe_size = (glwe_dimension + 1) * polynomial_size;
// Define the buffers
  // Always define the buffers with the strongest memory alignment constraints
  // first; d_buffer1 and d_buffer2 are only aligned on Torus, so they're
  // defined last
double2 *d_ggsw_fft_in = (double2 *)cmux_tree_buffer;
int8_t *d_mem =
(int8_t *)d_ggsw_fft_in + (ptrdiff_t)(r * ggsw_size * sizeof(double));
int8_t *d_mem_fft = d_mem;
if (max_shared_memory < memory_needed_per_block) {
d_mem_fft =
d_mem + (ptrdiff_t)(memory_needed_per_block * num_lut / 2 * tau);
}
int8_t *d_buffer1 = d_mem_fft;
if (max_shared_memory < polynomial_size * sizeof(double)) {
d_buffer1 = d_mem_fft + (ptrdiff_t)(polynomial_size * sizeof(double));
}
int8_t *d_buffer2 =
d_buffer1 + (ptrdiff_t)(num_lut * tau * glwe_size * sizeof(Torus));
//////////////////////
batch_fft_ggsw_vector<Torus, STorus, params>(
stream, d_ggsw_fft_in, ggsw_in, d_mem_fft, r, glwe_dimension,
polynomial_size, level_count, gpu_index, max_shared_memory);
plaintext_to_glwe_array<Torus, params>((Torus *)d_buffer1, lut_vector,
glwe_dimension, lut_vector_size, tau,
stream);
Torus *output;
// Run the cmux tree
for (int layer_idx = 0; layer_idx < r; layer_idx++) {
output = (layer_idx % 2 ? (Torus *)d_buffer1 : (Torus *)d_buffer2);
Torus *input = (layer_idx % 2 ? (Torus *)d_buffer2 : (Torus *)d_buffer1);
int num_cmuxes = (1 << (r - 1 - layer_idx));
dim3 grid(num_cmuxes, tau, 1);
// walks horizontally through the leaves
if (max_shared_memory < memory_needed_per_block) {
device_batch_cmux<Torus, STorus, params, NOSM>
<<<grid, thds, 0, *stream>>>(output, input, d_ggsw_fft_in, d_mem,
memory_needed_per_block,
glwe_dimension, // k
polynomial_size, base_log, level_count,
layer_idx, // r
num_lut);
} else {
device_batch_cmux<Torus, STorus, params, FULLSM>
<<<grid, thds, memory_needed_per_block, *stream>>>(
output, input, d_ggsw_fft_in, d_mem, memory_needed_per_block,
glwe_dimension, // k
polynomial_size, base_log, level_count,
layer_idx, // r
num_lut);
}
check_cuda_error(cudaGetLastError());
}
for (int i = 0; i < tau; i++) {
check_cuda_error(cudaMemcpyAsync(
glwe_array_out + i * glwe_size, output + i * num_lut * glwe_size,
glwe_size * sizeof(Torus), cudaMemcpyDeviceToDevice, *stream));
}
}
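/*
 * Worked example (illustrative numbers, added for clarity): with
 * lut_vector_size = 4096 and polynomial_size N = 1024 (params::log2_degree =
 * 10), r = log2(4096) - 10 = 2 and num_lut = 2^r = 4, so each of the tau
 * trees starts from 4 GLWE leaves stored in d_buffer1. Layer 0 launches
 * 2^(r-1-0) = 2 CMUXes per tree (d_buffer1 -> d_buffer2), layer 1 launches a
 * single CMUX per tree (d_buffer2 -> d_buffer1), and the resulting GLWE of
 * each tree is finally copied from d_buffer1 to glwe_array_out.
 */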
/*
* Receives "tau" GLWE ciphertexts as LUTs and "mbr_size" GGSWs. Each block
* computes the blind rotation loop + sample extraction for a single LUT.
* Writes the lwe output to lwe_out.
*
* This function needs polynomial_size/params::opt threads per block and tau
* blocks
*
* - lwe_out: An array of lwe ciphertexts. The outcome is written here.
* - glwe_in: An array of "tau" GLWE ciphertexts. These are the LUTs.
* - ggsw_in: An array of "mbr_size" GGSWs in the fourier domain.
* - mbr_size: The number of GGSWs.
* - glwe_dim: This is k.
* - polynomial_size: size of the polynomials. This is N.
* - base_log: log base used for the gadget matrix - B = 2^base_log (~8)
* - level_count: number of decomposition levels in the gadget matrix (~4)
* - device_memory_size_per_sample: Amount of (shared/global) memory used for
* the accumulators.
* - device_mem: An array to be used for the accumulators. Can be in the shared
* memory or global memory.
*/
template <typename Torus, typename STorus, class params, sharedMemDegree SMD>
__global__ void device_blind_rotation_and_sample_extraction(
Torus *lwe_out, Torus *glwe_in, double2 *ggsw_in, // m^BR
uint32_t mbr_size, uint32_t glwe_dim, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count,
size_t device_memory_size_per_sample, int8_t *device_mem) {
  // We use shared or global memory for the intermediate results, depending on SMD
extern __shared__ int8_t sharedmem[];
int8_t *selected_memory;
if constexpr (SMD == FULLSM)
selected_memory = sharedmem;
else
selected_memory = &device_mem[blockIdx.x * device_memory_size_per_sample];
Torus *accumulator_c0 = (Torus *)selected_memory;
Torus *accumulator_c1 =
(Torus *)accumulator_c0 + (glwe_dim + 1) * polynomial_size;
int8_t *cmux_memory =
(int8_t *)(accumulator_c1 + (glwe_dim + 1) * polynomial_size);
// Input LUT
auto mi = &glwe_in[blockIdx.x * (glwe_dim + 1) * polynomial_size];
int tid = threadIdx.x;
for (int i = 0; i < (glwe_dim + 1); i++)
for (int j = 0; j < params::opt; j++) {
accumulator_c0[tid] = mi[tid];
tid += params::degree / params::opt;
}
int monomial_degree = 0;
for (int i = mbr_size - 1; i >= 0; i--) {
synchronize_threads_in_block();
// Compute x^ai * ACC
// Mask and Body
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator_c1, accumulator_c0, (1 << monomial_degree), false,
(glwe_dim + 1));
monomial_degree += 1;
// ACC = CMUX ( Ci, x^ai * ACC, ACC )
synchronize_threads_in_block();
cmux<Torus, STorus, params>(accumulator_c0, accumulator_c0, ggsw_in,
cmux_memory, 0, 0, 1, glwe_dim, polynomial_size,
base_log, level_count, i);
}
synchronize_threads_in_block();
// Write the output
auto block_lwe_out = &lwe_out[blockIdx.x * (glwe_dim * polynomial_size + 1)];
// The blind rotation for this block is over
// Now we can perform the sample extraction: for the body it's just
// the resulting constant coefficient of the accumulator
// For the mask it's more complicated
sample_extract_mask<Torus, params>(block_lwe_out, accumulator_c0, glwe_dim);
sample_extract_body<Torus, params>(block_lwe_out, accumulator_c0, glwe_dim);
}
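/*
 * Note on the rotation amounts (comment added for clarity): at the iteration
 * where the GGSW index is i (i goes from mbr_size - 1 down to 0),
 * monomial_degree equals mbr_size - 1 - i, so the candidate accumulator is
 * ACC * X^(-2^(mbr_size - 1 - i)) and the i-th GGSW selects between the
 * rotated and the non-rotated accumulator. After the mbr_size CMUXes, the
 * constant coefficient of the accumulator holds the LUT coefficient selected
 * by the bits encrypted in ggsw_in, which is what the final sample extraction
 * writes into block_lwe_out.
 */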
template <typename Torus>
__host__ __device__ uint64_t
get_memory_needed_per_block_blind_rotation_sample_extraction(
uint32_t glwe_dimension, uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size *
(glwe_dimension + 1) + // accumulator_c0
sizeof(Torus) * polynomial_size *
(glwe_dimension + 1) + // accumulator_c1
+get_memory_needed_per_block_cmux_tree<Torus>(glwe_dimension,
polynomial_size);
}
template <typename Torus>
__host__ __device__ uint64_t get_buffer_size_blind_rotation_sample_extraction(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t mbr_size, uint32_t tau, uint32_t max_shared_memory) {
uint64_t memory_needed_per_block =
get_memory_needed_per_block_blind_rotation_sample_extraction<Torus>(
glwe_dimension, polynomial_size);
uint64_t device_mem = 0;
if (max_shared_memory < memory_needed_per_block) {
device_mem = memory_needed_per_block * tau;
}
if (max_shared_memory < polynomial_size * sizeof(double)) {
device_mem += polynomial_size * sizeof(double);
}
int ggsw_size = polynomial_size * (glwe_dimension + 1) *
(glwe_dimension + 1) * level_count;
uint64_t buffer_size = mbr_size * ggsw_size * sizeof(double) // d_ggsw_fft_in
+ device_mem;
return buffer_size + buffer_size % sizeof(double2);
}
template <typename Torus, typename STorus, typename params>
__host__ void scratch_blind_rotation_sample_extraction(
void *v_stream, uint32_t gpu_index, int8_t **br_se_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t mbr_size, uint32_t tau, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
cudaSetDevice(gpu_index);
auto stream = static_cast<cudaStream_t *>(v_stream);
uint64_t memory_needed_per_block =
get_memory_needed_per_block_blind_rotation_sample_extraction<Torus>(
glwe_dimension, polynomial_size);
if (max_shared_memory >= memory_needed_per_block) {
check_cuda_error(cudaFuncSetAttribute(
device_blind_rotation_and_sample_extraction<Torus, STorus, params,
FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, memory_needed_per_block));
check_cuda_error(cudaFuncSetCacheConfig(
device_blind_rotation_and_sample_extraction<Torus, STorus, params,
FULLSM>,
cudaFuncCachePreferShared));
}
if (allocate_gpu_memory) {
uint64_t buffer_size =
get_buffer_size_blind_rotation_sample_extraction<Torus>(
glwe_dimension, polynomial_size, level_count, mbr_size, tau,
max_shared_memory);
*br_se_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream, gpu_index);
check_cuda_error(cudaGetLastError());
}
}
template <typename Torus, typename STorus, class params>
__host__ void host_blind_rotate_and_sample_extraction(
void *v_stream, uint32_t gpu_index, Torus *lwe_out, Torus *ggsw_in,
Torus *lut_vector, int8_t *br_se_buffer, uint32_t mbr_size, uint32_t tau,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t max_shared_memory) {
cudaSetDevice(gpu_index);
auto stream = static_cast<cudaStream_t *>(v_stream);
uint64_t memory_needed_per_block =
get_memory_needed_per_block_blind_rotation_sample_extraction<Torus>(
glwe_dimension, polynomial_size);
// Prepare the buffers
// Here all the buffers have double2 alignment
int ggsw_size = polynomial_size * (glwe_dimension + 1) *
(glwe_dimension + 1) * level_count;
double2 *d_ggsw_fft_in = (double2 *)br_se_buffer;
int8_t *d_mem_fft = (int8_t *)d_ggsw_fft_in +
(ptrdiff_t)(mbr_size * ggsw_size * sizeof(double));
int8_t *d_mem = d_mem_fft;
if (max_shared_memory < polynomial_size * sizeof(double)) {
d_mem = d_mem_fft + (ptrdiff_t)(polynomial_size * sizeof(double));
}
// Apply the FFT on m^br
batch_fft_ggsw_vector<Torus, STorus, params>(
stream, d_ggsw_fft_in, ggsw_in, d_mem_fft, mbr_size, glwe_dimension,
polynomial_size, level_count, gpu_index, max_shared_memory);
check_cuda_error(cudaGetLastError());
dim3 thds(polynomial_size / params::opt, 1, 1);
dim3 grid(tau, 1, 1);
if (max_shared_memory < memory_needed_per_block)
device_blind_rotation_and_sample_extraction<Torus, STorus, params, NOSM>
<<<grid, thds, 0, *stream>>>(lwe_out, lut_vector, d_ggsw_fft_in,
mbr_size,
glwe_dimension, // k
polynomial_size, base_log, level_count,
memory_needed_per_block, d_mem);
else
device_blind_rotation_and_sample_extraction<Torus, STorus, params, FULLSM>
<<<grid, thds, memory_needed_per_block, *stream>>>(
lwe_out, lut_vector, d_ggsw_fft_in, mbr_size,
glwe_dimension, // k
polynomial_size, base_log, level_count, memory_needed_per_block,
d_mem);
check_cuda_error(cudaGetLastError());
}
#endif // VERTICAL_PACKING_CUH

View File

@@ -1,559 +0,0 @@
#include "wop_bootstrap.cuh"
#include <cmath>
/*
* Runs standard checks to validate the inputs
*/
void checks_wop_pbs(int glwe_dimension, int polynomial_size,
int level_count_bsk, int crt_decomposition_size,
uint32_t *number_of_bits_to_extract_array) {
int total_bits_to_extract = 0;
for (int i = 0; i < crt_decomposition_size; i++) {
total_bits_to_extract += number_of_bits_to_extract_array[i];
}
assert(("Error (GPU WOP PBS): polynomial_size should be one of "
"256, 512, 1024, 2048, 4096, 8192",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
// The number of inputs should be lower than the number of streaming
// multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
// to the occupancy of 50%). The only supported value for k is 1, so
// k + 1 = 2 for now.
int number_of_sm = 0;
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
assert(("Error (GPU WOP PBS): the number of input LWEs must be lower or "
"equal to the "
"number of streaming multiprocessors on the device divided by 4 * (k "
"+ 1) * "
"level_count_bsk",
crt_decomposition_size <=
number_of_sm / 4. / (glwe_dimension + 1) / level_count_bsk));
assert(("Error (GPU WOP PBS): the number of inputs x the number of extracted "
"bits should be "
"larger than log2 of the polynomial size",
total_bits_to_extract >= log2(polynomial_size)));
}
void checks_fast_circuit_bootstrap_vertical_packing(int polynomial_size) {
assert(("Error (GPU circuit bootstrap): polynomial_size should be one of "
"256, 512, 1024, 2048, 4096, 8192",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
}
void checks_circuit_bootstrap_vertical_packing(int glwe_dimension,
int polynomial_size,
int number_of_inputs,
int level_count_bsk) {
// The number of inputs should be lower than the number of streaming
// multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
// to the occupancy of 50%). The only supported value for k is 1, so
// k + 1 = 2 for now.
int number_of_sm = 0;
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
assert(("Error (GPU extract bits): the number of input LWEs must be lower or "
"equal to the "
"number of streaming multiprocessors on the device divided by 4 * (k "
"+ 1) "
"level_count_bsk",
number_of_inputs <=
number_of_sm / 4. / (glwe_dimension + 1) / level_count_bsk));
checks_fast_circuit_bootstrap_vertical_packing(polynomial_size);
}
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the circuit bootstrap and vertical packing on 32 bits inputs, into
* `cbs_vp_buffer`. It also fills the value of delta_log to be used in the
* circuit bootstrap.
*/
void scratch_cuda_circuit_bootstrap_vertical_packing_32(
void *v_stream, uint32_t gpu_index, int8_t **cbs_vp_buffer,
uint32_t *cbs_delta_log, uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t level_bsk, uint32_t level_count_cbs,
uint32_t number_of_inputs, uint32_t tau, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
checks_fast_circuit_bootstrap_vertical_packing(polynomial_size);
switch (polynomial_size) {
case 256:
scratch_circuit_bootstrap_vertical_packing<uint32_t, int32_t, Degree<256>>(
v_stream, gpu_index, cbs_vp_buffer, cbs_delta_log, glwe_dimension,
lwe_dimension, polynomial_size, level_bsk, level_count_cbs,
number_of_inputs, tau, max_shared_memory, allocate_gpu_memory);
break;
case 512:
scratch_circuit_bootstrap_vertical_packing<uint32_t, int32_t, Degree<512>>(
v_stream, gpu_index, cbs_vp_buffer, cbs_delta_log, glwe_dimension,
lwe_dimension, polynomial_size, level_bsk, level_count_cbs,
number_of_inputs, tau, max_shared_memory, allocate_gpu_memory);
break;
case 1024:
scratch_circuit_bootstrap_vertical_packing<uint32_t, int32_t, Degree<1024>>(
v_stream, gpu_index, cbs_vp_buffer, cbs_delta_log, glwe_dimension,
lwe_dimension, polynomial_size, level_bsk, level_count_cbs,
number_of_inputs, tau, max_shared_memory, allocate_gpu_memory);
break;
case 2048:
scratch_circuit_bootstrap_vertical_packing<uint32_t, int32_t, Degree<2048>>(
v_stream, gpu_index, cbs_vp_buffer, cbs_delta_log, glwe_dimension,
lwe_dimension, polynomial_size, level_bsk, level_count_cbs,
number_of_inputs, tau, max_shared_memory, allocate_gpu_memory);
break;
case 4096:
scratch_circuit_bootstrap_vertical_packing<uint32_t, int32_t, Degree<4096>>(
v_stream, gpu_index, cbs_vp_buffer, cbs_delta_log, glwe_dimension,
lwe_dimension, polynomial_size, level_bsk, level_count_cbs,
number_of_inputs, tau, max_shared_memory, allocate_gpu_memory);
break;
case 8192:
scratch_circuit_bootstrap_vertical_packing<uint32_t, int32_t, Degree<8192>>(
v_stream, gpu_index, cbs_vp_buffer, cbs_delta_log, glwe_dimension,
lwe_dimension, polynomial_size, level_bsk, level_count_cbs,
number_of_inputs, tau, max_shared_memory, allocate_gpu_memory);
break;
default:
break;
}
}
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the circuit bootstrap and vertical packing on 64 bits inputs, into
* `cbs_vp_buffer`. It also fills the value of delta_log to be used in the
* circuit bootstrap.
*/
void scratch_cuda_circuit_bootstrap_vertical_packing_64(
void *v_stream, uint32_t gpu_index, int8_t **cbs_vp_buffer,
uint32_t *cbs_delta_log, uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t level_bsk, uint32_t level_count_cbs,
uint32_t number_of_inputs, uint32_t tau, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
checks_fast_circuit_bootstrap_vertical_packing(polynomial_size);
switch (polynomial_size) {
case 256:
scratch_circuit_bootstrap_vertical_packing<uint64_t, int64_t, Degree<256>>(
v_stream, gpu_index, cbs_vp_buffer, cbs_delta_log, glwe_dimension,
lwe_dimension, polynomial_size, level_bsk, level_count_cbs,
number_of_inputs, tau, max_shared_memory, allocate_gpu_memory);
break;
case 512:
scratch_circuit_bootstrap_vertical_packing<uint64_t, int64_t, Degree<512>>(
v_stream, gpu_index, cbs_vp_buffer, cbs_delta_log, glwe_dimension,
lwe_dimension, polynomial_size, level_bsk, level_count_cbs,
number_of_inputs, tau, max_shared_memory, allocate_gpu_memory);
break;
case 1024:
scratch_circuit_bootstrap_vertical_packing<uint64_t, int64_t, Degree<1024>>(
v_stream, gpu_index, cbs_vp_buffer, cbs_delta_log, glwe_dimension,
lwe_dimension, polynomial_size, level_bsk, level_count_cbs,
number_of_inputs, tau, max_shared_memory, allocate_gpu_memory);
break;
case 2048:
scratch_circuit_bootstrap_vertical_packing<uint64_t, int64_t, Degree<2048>>(
v_stream, gpu_index, cbs_vp_buffer, cbs_delta_log, glwe_dimension,
lwe_dimension, polynomial_size, level_bsk, level_count_cbs,
number_of_inputs, tau, max_shared_memory, allocate_gpu_memory);
break;
case 4096:
scratch_circuit_bootstrap_vertical_packing<uint64_t, int64_t, Degree<4096>>(
v_stream, gpu_index, cbs_vp_buffer, cbs_delta_log, glwe_dimension,
lwe_dimension, polynomial_size, level_bsk, level_count_cbs,
number_of_inputs, tau, max_shared_memory, allocate_gpu_memory);
break;
case 8192:
scratch_circuit_bootstrap_vertical_packing<uint64_t, int64_t, Degree<8192>>(
v_stream, gpu_index, cbs_vp_buffer, cbs_delta_log, glwe_dimension,
lwe_dimension, polynomial_size, level_bsk, level_count_cbs,
number_of_inputs, tau, max_shared_memory, allocate_gpu_memory);
break;
default:
break;
}
}
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the wop PBS on 32 bits inputs, into `wop_pbs_buffer`. It also fills the value
* of delta_log and cbs_delta_log to be used in the bit extract and circuit
* bootstrap.
*/
void scratch_cuda_wop_pbs_32(void *v_stream, uint32_t gpu_index,
int8_t **wop_pbs_buffer, uint32_t *delta_log_array,
uint32_t *cbs_delta_log, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t level_count_cbs, uint32_t level_count_bsk,
uint32_t *number_of_bits_to_extract_array,
uint32_t crt_decomposition_size,
uint32_t max_shared_memory,
bool allocate_gpu_memory) {
checks_wop_pbs(glwe_dimension, polynomial_size, level_count_bsk,
crt_decomposition_size, number_of_bits_to_extract_array);
switch (polynomial_size) {
case 256:
scratch_wop_pbs<uint32_t, int32_t, Degree<256>>(
v_stream, gpu_index, wop_pbs_buffer, delta_log_array, cbs_delta_log,
glwe_dimension, lwe_dimension, polynomial_size, level_count_cbs,
level_count_bsk, number_of_bits_to_extract_array,
crt_decomposition_size, max_shared_memory, allocate_gpu_memory);
break;
case 512:
scratch_wop_pbs<uint32_t, int32_t, Degree<512>>(
v_stream, gpu_index, wop_pbs_buffer, delta_log_array, cbs_delta_log,
glwe_dimension, lwe_dimension, polynomial_size, level_count_cbs,
level_count_bsk, number_of_bits_to_extract_array,
crt_decomposition_size, max_shared_memory, allocate_gpu_memory);
break;
case 1024:
scratch_wop_pbs<uint32_t, int32_t, Degree<1024>>(
v_stream, gpu_index, wop_pbs_buffer, delta_log_array, cbs_delta_log,
glwe_dimension, lwe_dimension, polynomial_size, level_count_cbs,
level_count_bsk, number_of_bits_to_extract_array,
crt_decomposition_size, max_shared_memory, allocate_gpu_memory);
break;
case 2048:
scratch_wop_pbs<uint32_t, int32_t, Degree<2048>>(
v_stream, gpu_index, wop_pbs_buffer, delta_log_array, cbs_delta_log,
glwe_dimension, lwe_dimension, polynomial_size, level_count_cbs,
level_count_bsk, number_of_bits_to_extract_array,
crt_decomposition_size, max_shared_memory, allocate_gpu_memory);
break;
case 4096:
scratch_wop_pbs<uint32_t, int32_t, Degree<4096>>(
v_stream, gpu_index, wop_pbs_buffer, delta_log_array, cbs_delta_log,
glwe_dimension, lwe_dimension, polynomial_size, level_count_cbs,
level_count_bsk, number_of_bits_to_extract_array,
crt_decomposition_size, max_shared_memory, allocate_gpu_memory);
break;
case 8192:
scratch_wop_pbs<uint32_t, int32_t, Degree<8192>>(
v_stream, gpu_index, wop_pbs_buffer, delta_log_array, cbs_delta_log,
glwe_dimension, lwe_dimension, polynomial_size, level_count_cbs,
level_count_bsk, number_of_bits_to_extract_array,
crt_decomposition_size, max_shared_memory, allocate_gpu_memory);
break;
default:
break;
}
}
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the wop PBS on 64 bits inputs, into `wop_pbs_buffer`. It also fills the value
* of delta_log and cbs_delta_log to be used in the bit extract and circuit
* bootstrap.
*/
void scratch_cuda_wop_pbs_64(void *v_stream, uint32_t gpu_index,
int8_t **wop_pbs_buffer, uint32_t *delta_log_array,
uint32_t *cbs_delta_log, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t level_count_cbs, uint32_t level_count_bsk,
uint32_t *number_of_bits_to_extract_array,
uint32_t crt_decomposition_size,
uint32_t max_shared_memory,
bool allocate_gpu_memory) {
checks_wop_pbs(glwe_dimension, polynomial_size, level_count_bsk,
crt_decomposition_size, number_of_bits_to_extract_array);
switch (polynomial_size) {
case 256:
scratch_wop_pbs<uint64_t, int64_t, Degree<256>>(
v_stream, gpu_index, wop_pbs_buffer, delta_log_array, cbs_delta_log,
glwe_dimension, lwe_dimension, polynomial_size, level_count_cbs,
level_count_bsk, number_of_bits_to_extract_array,
crt_decomposition_size, max_shared_memory, allocate_gpu_memory);
break;
case 512:
scratch_wop_pbs<uint64_t, int64_t, Degree<512>>(
v_stream, gpu_index, wop_pbs_buffer, delta_log_array, cbs_delta_log,
glwe_dimension, lwe_dimension, polynomial_size, level_count_cbs,
level_count_bsk, number_of_bits_to_extract_array,
crt_decomposition_size, max_shared_memory, allocate_gpu_memory);
break;
case 1024:
scratch_wop_pbs<uint64_t, int64_t, Degree<1024>>(
v_stream, gpu_index, wop_pbs_buffer, delta_log_array, cbs_delta_log,
glwe_dimension, lwe_dimension, polynomial_size, level_count_cbs,
level_count_bsk, number_of_bits_to_extract_array,
crt_decomposition_size, max_shared_memory, allocate_gpu_memory);
break;
case 2048:
scratch_wop_pbs<uint64_t, int64_t, Degree<2048>>(
v_stream, gpu_index, wop_pbs_buffer, delta_log_array, cbs_delta_log,
glwe_dimension, lwe_dimension, polynomial_size, level_count_cbs,
level_count_bsk, number_of_bits_to_extract_array,
crt_decomposition_size, max_shared_memory, allocate_gpu_memory);
break;
case 4096:
scratch_wop_pbs<uint64_t, int64_t, Degree<4096>>(
v_stream, gpu_index, wop_pbs_buffer, delta_log_array, cbs_delta_log,
glwe_dimension, lwe_dimension, polynomial_size, level_count_cbs,
level_count_bsk, number_of_bits_to_extract_array,
crt_decomposition_size, max_shared_memory, allocate_gpu_memory);
break;
case 8192:
scratch_wop_pbs<uint64_t, int64_t, Degree<8192>>(
v_stream, gpu_index, wop_pbs_buffer, delta_log_array, cbs_delta_log,
glwe_dimension, lwe_dimension, polynomial_size, level_count_cbs,
level_count_bsk, number_of_bits_to_extract_array,
crt_decomposition_size, max_shared_memory, allocate_gpu_memory);
break;
default:
break;
}
}
/*
 * Entry point for the CUDA circuit bootstrap + vertical packing on batches of
 * 64-bit input LWE ciphertexts.
* - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
* launch
* - `gpu_index` is the index of the GPU to be used in the kernel launch
* - 'lwe_array_out' list of output lwe ciphertexts
* - 'lwe_array_in' list of input lwe_ciphertexts
* - 'fourier_bsk' bootstrapping key in fourier domain, expected half size
* compressed complex key.
* - 'cbs_fpksk' list of private functional packing keyswitch keys
* - 'lut_vector' list of test vectors
* - 'cbs_vp_buffer' a pre-allocated array to store intermediate results
* - 'polynomial_size' size of the test polynomial, supported sizes:
* {256, 512, 1024, 2048, 4096, 8192}
* - 'glwe_dimension' supported dimensions: {1}
* - 'lwe_dimension' dimension of input LWE ciphertexts
* - 'level_count_bsk' decomposition level for bootstrapping
* - 'base_log_bsk' base log parameter for bootstrapping
* - 'level_count_pksk' decomposition level for fp-keyswitch
* - 'base_log_pksk' base log parameter for fp-keyswitch
* - 'level_count_cbs' level of circuit bootstrap
* - 'base_log_cbs' base log parameter for circuit bootstrap
* - 'number_of_inputs' number of input LWE ciphertexts
* - 'lut_number' number of LUTs given as input
* - 'max_shared_memory' maximum shared memory amount to be used in
* bootstrapping kernel
*
*/
void cuda_circuit_bootstrap_vertical_packing_64(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
void *fourier_bsk, void *cbs_fpksk, void *lut_vector, int8_t *cbs_vp_buffer,
uint32_t cbs_delta_log, uint32_t polynomial_size, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t level_count_bsk, uint32_t base_log_bsk,
uint32_t level_count_pksk, uint32_t base_log_pksk, uint32_t level_count_cbs,
uint32_t base_log_cbs, uint32_t number_of_inputs, uint32_t lut_number,
uint32_t max_shared_memory) {
checks_circuit_bootstrap_vertical_packing(glwe_dimension, polynomial_size,
number_of_inputs, level_count_bsk);
switch (polynomial_size) {
case 256:
host_circuit_bootstrap_vertical_packing<uint64_t, int64_t, Degree<256>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lwe_array_in, (uint64_t *)lut_vector,
(double2 *)fourier_bsk, (uint64_t *)cbs_fpksk, cbs_vp_buffer,
cbs_delta_log, glwe_dimension, lwe_dimension, polynomial_size,
base_log_bsk, level_count_bsk, base_log_pksk, level_count_pksk,
base_log_cbs, level_count_cbs, number_of_inputs, lut_number,
max_shared_memory);
break;
case 512:
host_circuit_bootstrap_vertical_packing<uint64_t, int64_t, Degree<512>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lwe_array_in, (uint64_t *)lut_vector,
(double2 *)fourier_bsk, (uint64_t *)cbs_fpksk, cbs_vp_buffer,
cbs_delta_log, glwe_dimension, lwe_dimension, polynomial_size,
base_log_bsk, level_count_bsk, base_log_pksk, level_count_pksk,
base_log_cbs, level_count_cbs, number_of_inputs, lut_number,
max_shared_memory);
break;
case 1024:
host_circuit_bootstrap_vertical_packing<uint64_t, int64_t, Degree<1024>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lwe_array_in, (uint64_t *)lut_vector,
(double2 *)fourier_bsk, (uint64_t *)cbs_fpksk, cbs_vp_buffer,
cbs_delta_log, glwe_dimension, lwe_dimension, polynomial_size,
base_log_bsk, level_count_bsk, base_log_pksk, level_count_pksk,
base_log_cbs, level_count_cbs, number_of_inputs, lut_number,
max_shared_memory);
break;
case 2048:
host_circuit_bootstrap_vertical_packing<uint64_t, int64_t, Degree<2048>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lwe_array_in, (uint64_t *)lut_vector,
(double2 *)fourier_bsk, (uint64_t *)cbs_fpksk, cbs_vp_buffer,
cbs_delta_log, glwe_dimension, lwe_dimension, polynomial_size,
base_log_bsk, level_count_bsk, base_log_pksk, level_count_pksk,
base_log_cbs, level_count_cbs, number_of_inputs, lut_number,
max_shared_memory);
break;
case 4096:
host_circuit_bootstrap_vertical_packing<uint64_t, int64_t, Degree<4096>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lwe_array_in, (uint64_t *)lut_vector,
(double2 *)fourier_bsk, (uint64_t *)cbs_fpksk, cbs_vp_buffer,
cbs_delta_log, glwe_dimension, lwe_dimension, polynomial_size,
base_log_bsk, level_count_bsk, base_log_pksk, level_count_pksk,
base_log_cbs, level_count_cbs, number_of_inputs, lut_number,
max_shared_memory);
break;
case 8192:
host_circuit_bootstrap_vertical_packing<uint64_t, int64_t, Degree<8192>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lwe_array_in, (uint64_t *)lut_vector,
(double2 *)fourier_bsk, (uint64_t *)cbs_fpksk, cbs_vp_buffer,
cbs_delta_log, glwe_dimension, lwe_dimension, polynomial_size,
base_log_bsk, level_count_bsk, base_log_pksk, level_count_pksk,
base_log_cbs, level_count_cbs, number_of_inputs, lut_number,
max_shared_memory);
break;
default:
break;
}
}
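/*
 * Illustrative usage sketch (added as an example; not part of the original
 * API). It shows how the scratch and run entry points above are meant to be
 * chained: `tau` is passed as `lut_number`, all other parameters and the
 * device pointers are assumed to be prepared by the caller, and the buffer
 * must eventually be released with
 * cleanup_cuda_circuit_bootstrap_vertical_packing (defined further below).
 */
static void example_cuda_circuit_bootstrap_vertical_packing_64(
    void *v_stream, uint32_t gpu_index, void *d_lwe_array_out,
    void *d_lwe_array_in, void *d_fourier_bsk, void *d_cbs_fpksk,
    void *d_lut_vector, uint32_t glwe_dimension, uint32_t lwe_dimension,
    uint32_t polynomial_size, uint32_t base_log_bsk, uint32_t level_count_bsk,
    uint32_t base_log_pksk, uint32_t level_count_pksk, uint32_t base_log_cbs,
    uint32_t level_count_cbs, uint32_t number_of_inputs, uint32_t tau,
    uint32_t max_shared_memory) {
  int8_t *cbs_vp_buffer = nullptr;
  uint32_t cbs_delta_log = 0;
  // Allocate the intermediate buffer and get the delta_log to use in the CBS
  scratch_cuda_circuit_bootstrap_vertical_packing_64(
      v_stream, gpu_index, &cbs_vp_buffer, &cbs_delta_log, glwe_dimension,
      lwe_dimension, polynomial_size, level_count_bsk, level_count_cbs,
      number_of_inputs, tau, max_shared_memory, true);
  // Circuit bootstrap the whole batch, then apply the vertical packing
  cuda_circuit_bootstrap_vertical_packing_64(
      v_stream, gpu_index, d_lwe_array_out, d_lwe_array_in, d_fourier_bsk,
      d_cbs_fpksk, d_lut_vector, cbs_vp_buffer, cbs_delta_log, polynomial_size,
      glwe_dimension, lwe_dimension, level_count_bsk, base_log_bsk,
      level_count_pksk, base_log_pksk, level_count_cbs, base_log_cbs,
      number_of_inputs, tau, max_shared_memory);
}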
/*
 * Entry point for the full programmable bootstrap without padding (WoP-PBS) on
 * 64-bit input LWE ciphertexts.
* - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
* launch
* - `gpu_index` is the index of the GPU to be used in the kernel launch
* - 'lwe_array_out' list of output lwe ciphertexts
* - 'lwe_array_in' list of input lwe_ciphertexts
* - 'lut_vector' list of test vectors
* - 'fourier_bsk' bootstrapping key in fourier domain, expected half size
* compressed complex key.
* - 'ksk' keyswitch key to use inside extract bits block
* - 'cbs_fpksk' list of fp-keyswitch keys
* - 'wop_pbs_buffer' a pre-allocated array to store intermediate results
* - 'glwe_dimension' supported dimensions: {1}
* - 'lwe_dimension' dimension of input lwe ciphertexts
* - 'polynomial_size' size of the test polynomial, supported sizes:
* {256, 512, 1024, 2048, 4096, 8192}
* - 'base_log_bsk' base log parameter for bootstrapping
* - 'level_count_bsk' decomposition level for bootstrapping
* - 'base_log_ksk' base log parameter for keyswitch
* - 'level_count_ksk' decomposition level for keyswitch
* - 'base_log_pksk' base log parameter for fp-keyswitch
* - 'level_count_pksk' decomposition level for fp-keyswitch
* - 'base_log_cbs' base log parameter for circuit bootstrap
* - 'level_count_cbs' level of circuit bootstrap
* - 'number_of_bits_of_message_including_padding' number of bits to extract
* from each input lwe ciphertext including padding bit
* - 'number_of_bits_to_extract' number of bits to extract
* from each input lwe ciphertext without padding bit
* - 'crt_decomposition_size' number of input lwe ciphertexts
* - 'max_shared_memory' maximum shared memory amount to be used in
* bootstrapping kernel
*
*/
void cuda_wop_pbs_64(void *v_stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_array_in, void *lut_vector, void *fourier_bsk,
void *ksk, void *cbs_fpksk, int8_t *wop_pbs_buffer,
uint32_t cbs_delta_log, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t base_log_bsk, uint32_t level_count_bsk,
uint32_t base_log_ksk, uint32_t level_count_ksk,
uint32_t base_log_pksk, uint32_t level_count_pksk,
uint32_t base_log_cbs, uint32_t level_count_cbs,
uint32_t *number_of_bits_to_extract_array,
uint32_t *delta_log_array, uint32_t crt_decomposition_size,
uint32_t max_shared_memory) {
checks_wop_pbs(glwe_dimension, polynomial_size, level_count_bsk,
crt_decomposition_size, number_of_bits_to_extract_array);
switch (polynomial_size) {
case 256:
host_wop_pbs<uint64_t, int64_t, Degree<256>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lwe_array_in, (uint64_t *)lut_vector,
(double2 *)fourier_bsk, (uint64_t *)ksk, (uint64_t *)cbs_fpksk,
wop_pbs_buffer, cbs_delta_log, glwe_dimension, lwe_dimension,
polynomial_size, base_log_bsk, level_count_bsk, base_log_ksk,
level_count_ksk, base_log_pksk, level_count_pksk, base_log_cbs,
level_count_cbs, number_of_bits_to_extract_array, delta_log_array,
crt_decomposition_size, max_shared_memory);
break;
case 512:
host_wop_pbs<uint64_t, int64_t, Degree<512>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lwe_array_in, (uint64_t *)lut_vector,
(double2 *)fourier_bsk, (uint64_t *)ksk, (uint64_t *)cbs_fpksk,
wop_pbs_buffer, cbs_delta_log, glwe_dimension, lwe_dimension,
polynomial_size, base_log_bsk, level_count_bsk, base_log_ksk,
level_count_ksk, base_log_pksk, level_count_pksk, base_log_cbs,
level_count_cbs, number_of_bits_to_extract_array, delta_log_array,
crt_decomposition_size, max_shared_memory);
break;
case 1024:
host_wop_pbs<uint64_t, int64_t, Degree<1024>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lwe_array_in, (uint64_t *)lut_vector,
(double2 *)fourier_bsk, (uint64_t *)ksk, (uint64_t *)cbs_fpksk,
wop_pbs_buffer, cbs_delta_log, glwe_dimension, lwe_dimension,
polynomial_size, base_log_bsk, level_count_bsk, base_log_ksk,
level_count_ksk, base_log_pksk, level_count_pksk, base_log_cbs,
level_count_cbs, number_of_bits_to_extract_array, delta_log_array,
crt_decomposition_size, max_shared_memory);
break;
case 2048:
host_wop_pbs<uint64_t, int64_t, Degree<2048>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lwe_array_in, (uint64_t *)lut_vector,
(double2 *)fourier_bsk, (uint64_t *)ksk, (uint64_t *)cbs_fpksk,
wop_pbs_buffer, cbs_delta_log, glwe_dimension, lwe_dimension,
polynomial_size, base_log_bsk, level_count_bsk, base_log_ksk,
level_count_ksk, base_log_pksk, level_count_pksk, base_log_cbs,
level_count_cbs, number_of_bits_to_extract_array, delta_log_array,
crt_decomposition_size, max_shared_memory);
break;
case 4096:
host_wop_pbs<uint64_t, int64_t, Degree<4096>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lwe_array_in, (uint64_t *)lut_vector,
(double2 *)fourier_bsk, (uint64_t *)ksk, (uint64_t *)cbs_fpksk,
wop_pbs_buffer, cbs_delta_log, glwe_dimension, lwe_dimension,
polynomial_size, base_log_bsk, level_count_bsk, base_log_ksk,
level_count_ksk, base_log_pksk, level_count_pksk, base_log_cbs,
level_count_cbs, number_of_bits_to_extract_array, delta_log_array,
crt_decomposition_size, max_shared_memory);
break;
case 8192:
host_wop_pbs<uint64_t, int64_t, Degree<8192>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lwe_array_in, (uint64_t *)lut_vector,
(double2 *)fourier_bsk, (uint64_t *)ksk, (uint64_t *)cbs_fpksk,
wop_pbs_buffer, cbs_delta_log, glwe_dimension, lwe_dimension,
polynomial_size, base_log_bsk, level_count_bsk, base_log_ksk,
level_count_ksk, base_log_pksk, level_count_pksk, base_log_cbs,
level_count_cbs, number_of_bits_to_extract_array, delta_log_array,
crt_decomposition_size, max_shared_memory);
break;
default:
break;
}
}
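/*
 * Illustrative usage sketch (added as an example; not part of the original
 * API). It chains the scratch and run entry points of the WoP-PBS: the key
 * material, the bits-to-extract array and the cryptographic parameters are
 * assumed to be prepared by the caller, delta_log_array and cbs_delta_log are
 * filled by the scratch call, and the buffer must eventually be released with
 * cleanup_cuda_wop_pbs (defined below).
 */
static void example_cuda_wop_pbs_64(
    void *v_stream, uint32_t gpu_index, void *d_lwe_array_out,
    void *d_lwe_array_in, void *d_lut_vector, void *d_fourier_bsk, void *d_ksk,
    void *d_cbs_fpksk, uint32_t glwe_dimension, uint32_t lwe_dimension,
    uint32_t polynomial_size, uint32_t base_log_bsk, uint32_t level_count_bsk,
    uint32_t base_log_ksk, uint32_t level_count_ksk, uint32_t base_log_pksk,
    uint32_t level_count_pksk, uint32_t base_log_cbs, uint32_t level_count_cbs,
    uint32_t *number_of_bits_to_extract_array, uint32_t *delta_log_array,
    uint32_t crt_decomposition_size, uint32_t max_shared_memory) {
  int8_t *wop_pbs_buffer = nullptr;
  uint32_t cbs_delta_log = 0;
  // Allocate the intermediate buffer and compute the delta_log values
  scratch_cuda_wop_pbs_64(v_stream, gpu_index, &wop_pbs_buffer,
                          delta_log_array, &cbs_delta_log, glwe_dimension,
                          lwe_dimension, polynomial_size, level_count_cbs,
                          level_count_bsk, number_of_bits_to_extract_array,
                          crt_decomposition_size, max_shared_memory, true);
  // Bit extraction + circuit bootstrap + vertical packing on the whole batch
  cuda_wop_pbs_64(v_stream, gpu_index, d_lwe_array_out, d_lwe_array_in,
                  d_lut_vector, d_fourier_bsk, d_ksk, d_cbs_fpksk,
                  wop_pbs_buffer, cbs_delta_log, glwe_dimension, lwe_dimension,
                  polynomial_size, base_log_bsk, level_count_bsk, base_log_ksk,
                  level_count_ksk, base_log_pksk, level_count_pksk,
                  base_log_cbs, level_count_cbs,
                  number_of_bits_to_extract_array, delta_log_array,
                  crt_decomposition_size, max_shared_memory);
}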
/*
* This cleanup function frees the data for the wop PBS on GPU in wop_pbs_buffer
* for 32 or 64 bits inputs.
*/
void cleanup_cuda_wop_pbs(void *v_stream, uint32_t gpu_index,
int8_t **wop_pbs_buffer) {
auto stream = static_cast<cudaStream_t *>(v_stream);
// Free memory
cuda_drop_async(*wop_pbs_buffer, stream, gpu_index);
}
/*
* This cleanup function frees the data for the circuit bootstrap and vertical
* packing on GPU in cbs_vp_buffer for 32 or 64 bits inputs.
*/
void cleanup_cuda_circuit_bootstrap_vertical_packing(void *v_stream,
uint32_t gpu_index,
int8_t **cbs_vp_buffer) {
auto stream = static_cast<cudaStream_t *>(v_stream);
// Free memory
cuda_drop_async(*cbs_vp_buffer, stream, gpu_index);
}

View File

@@ -1,322 +0,0 @@
#ifndef WOP_PBS_H
#define WOP_PBS_H
#include "bit_extraction.cuh"
#include "bootstrap.h"
#include "circuit_bootstrap.cuh"
#include "device.h"
#include "utils/kernel_dimensions.cuh"
#include "utils/timer.cuh"
#include "vertical_packing.cuh"
template <typename Torus, class params>
__global__ void device_build_lut(Torus *lut_out, Torus *lut_in,
uint32_t glwe_dimension, uint32_t lut_number) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index < glwe_dimension * params::degree * lut_number) {
int lut_index = index / (glwe_dimension * params::degree);
for (int j = 0; j < glwe_dimension; j++) {
lut_out[index + lut_index * (glwe_dimension + 1) * params::degree +
j * params::degree] = 0;
}
lut_out[index + lut_index * (glwe_dimension + 1) * params::degree +
glwe_dimension * params::degree] = lut_in[index];
}
}
template <typename Torus>
__host__ __device__ uint64_t get_buffer_size_cbs_vp(uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t level_count_cbs,
uint32_t tau,
uint32_t number_of_inputs) {
int ggsw_size = level_count_cbs * (glwe_dimension + 1) *
(glwe_dimension + 1) * polynomial_size;
uint64_t buffer_size =
number_of_inputs * level_count_cbs * sizeof(Torus) + // lut_vector_indexes
number_of_inputs * ggsw_size * sizeof(Torus) + // ggsw_out_cbs
tau * (glwe_dimension + 1) * polynomial_size *
sizeof(Torus); // glwe_array_out_cmux_tree
return buffer_size + buffer_size % sizeof(double2);
}
template <typename Torus, typename STorus, typename params>
__host__ void scratch_circuit_bootstrap_vertical_packing(
void *v_stream, uint32_t gpu_index, int8_t **cbs_vp_buffer,
uint32_t *cbs_delta_log, uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t level_bsk, uint32_t level_count_cbs,
uint32_t number_of_inputs, uint32_t tau, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
cudaSetDevice(gpu_index);
auto stream = static_cast<cudaStream_t *>(v_stream);
// Allocate lut vector indexes on the CPU first to avoid blocking the stream
Torus *h_lut_vector_indexes =
(Torus *)malloc(number_of_inputs * level_count_cbs * sizeof(Torus));
uint32_t mbr_size = std::min(params::log2_degree, (int)number_of_inputs);
uint64_t buffer_size =
get_buffer_size_cbs_vp<Torus>(glwe_dimension, polynomial_size,
level_count_cbs, tau, number_of_inputs) +
get_buffer_size_cbs<Torus>(glwe_dimension, lwe_dimension, polynomial_size,
level_count_cbs, number_of_inputs) +
get_buffer_size_bootstrap_low_latency<Torus>(
glwe_dimension, polynomial_size, level_bsk,
number_of_inputs * level_count_cbs, max_shared_memory) +
get_buffer_size_cmux_tree<Torus, params>(
glwe_dimension, polynomial_size, level_count_cbs,
1 << number_of_inputs, tau, max_shared_memory) +
get_buffer_size_blind_rotation_sample_extraction<Torus>(
glwe_dimension, polynomial_size, level_count_cbs, mbr_size, tau,
max_shared_memory);
// allocate device pointer for circuit bootstrap and vertical
// packing
if (allocate_gpu_memory) {
*cbs_vp_buffer =
(int8_t *)cuda_malloc_async(buffer_size, stream, gpu_index);
}
// indexes of lut vectors for cbs
for (uint index = 0; index < level_count_cbs * number_of_inputs; index++) {
h_lut_vector_indexes[index] = index % level_count_cbs;
}
// lut_vector_indexes is the last buffer in the cbs_vp_buffer
uint64_t lut_vector_indexes_size =
number_of_inputs * level_count_cbs * sizeof(Torus);
int8_t *d_lut_vector_indexes =
(int8_t *)*cbs_vp_buffer +
(ptrdiff_t)(buffer_size - lut_vector_indexes_size);
cuda_memcpy_async_to_gpu((Torus *)d_lut_vector_indexes, h_lut_vector_indexes,
lut_vector_indexes_size, stream, gpu_index);
check_cuda_error(cudaStreamSynchronize(*stream));
free(h_lut_vector_indexes);
check_cuda_error(cudaGetLastError());
uint32_t bits = sizeof(Torus) * 8;
*cbs_delta_log = (bits - 1);
scratch_circuit_bootstrap<Torus, STorus, params>(
v_stream, gpu_index, cbs_vp_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_bsk, level_count_cbs, number_of_inputs,
max_shared_memory, false);
scratch_cmux_tree<Torus, STorus, params>(
v_stream, gpu_index, cbs_vp_buffer, glwe_dimension, polynomial_size,
level_count_cbs, number_of_inputs, tau, max_shared_memory, false);
scratch_blind_rotation_sample_extraction<Torus, STorus, params>(
v_stream, gpu_index, cbs_vp_buffer, glwe_dimension, polynomial_size,
level_count_cbs, mbr_size, tau, max_shared_memory, false);
}
// number_of_inputs is the total number of LWE ciphertexts passed to CBS + VP,
// i.e. tau * p where tau is the number of LUTs (the original number of LWEs
// before bit extraction) and p is the number of extracted bits
template <typename Torus, typename STorus, class params>
__host__ void host_circuit_bootstrap_vertical_packing(
void *v_stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_array_in, Torus *lut_vector, double2 *fourier_bsk,
Torus *cbs_fpksk, int8_t *cbs_vp_buffer, uint32_t cbs_delta_log,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t base_log_bsk, uint32_t level_count_bsk, uint32_t base_log_pksk,
uint32_t level_count_pksk, uint32_t base_log_cbs, uint32_t level_count_cbs,
uint32_t number_of_inputs, uint32_t tau, uint32_t max_shared_memory) {
// Define the buffers
  // Always define the buffers with the strongest memory alignment requirement
  // first. Here the only requirement is that lut_vector_indexes should be
  // defined last, since all the other buffers are aligned on double2 (every
  // buffer whose size is a multiple of polynomial_size * sizeof(Torus) is
  // aligned on double2)
int ggsw_size = level_count_cbs * (glwe_dimension + 1) *
(glwe_dimension + 1) * polynomial_size;
int8_t *cbs_buffer = (int8_t *)cbs_vp_buffer;
int8_t *ggsw_out_cbs =
cbs_buffer +
(ptrdiff_t)(get_buffer_size_cbs<Torus>(glwe_dimension, lwe_dimension,
polynomial_size, level_count_cbs,
number_of_inputs) +
get_buffer_size_bootstrap_low_latency<Torus>(
glwe_dimension, polynomial_size, level_count_bsk,
number_of_inputs * level_count_cbs, max_shared_memory));
// number_of_inputs = tau * p is the total number of GGSWs
// split the vec of GGSW in two, the msb GGSW is for the CMux tree and the
// lsb GGSW is for the last blind rotation.
uint32_t mbr_size = std::min(params::log2_degree, (int)number_of_inputs);
int8_t *cmux_tree_buffer =
ggsw_out_cbs + (ptrdiff_t)(number_of_inputs * ggsw_size * sizeof(Torus));
int8_t *glwe_array_out_cmux_tree =
cmux_tree_buffer + (ptrdiff_t)(get_buffer_size_cmux_tree<Torus, params>(
glwe_dimension, polynomial_size, level_count_cbs,
1 << number_of_inputs, tau, max_shared_memory));
int8_t *br_se_buffer =
glwe_array_out_cmux_tree +
(ptrdiff_t)(tau * (glwe_dimension + 1) * polynomial_size * sizeof(Torus));
Torus *lut_vector_indexes =
(Torus *)br_se_buffer +
(ptrdiff_t)(get_buffer_size_blind_rotation_sample_extraction<Torus>(
glwe_dimension, polynomial_size, level_count_cbs,
mbr_size, tau, max_shared_memory) /
sizeof(Torus));
// Circuit bootstrap
host_circuit_bootstrap<Torus, params>(
v_stream, gpu_index, (Torus *)ggsw_out_cbs, lwe_array_in, fourier_bsk,
cbs_fpksk, lut_vector_indexes, cbs_buffer, cbs_delta_log, polynomial_size,
glwe_dimension, lwe_dimension, level_count_bsk, base_log_bsk,
level_count_pksk, base_log_pksk, level_count_cbs, base_log_cbs,
number_of_inputs, max_shared_memory);
check_cuda_error(cudaGetLastError());
// CMUX Tree
uint64_t lut_vector_size = (1 << number_of_inputs);
host_cmux_tree<Torus, STorus, params>(
v_stream, gpu_index, (Torus *)glwe_array_out_cmux_tree,
(Torus *)ggsw_out_cbs, lut_vector, cmux_tree_buffer, glwe_dimension,
polynomial_size, base_log_cbs, level_count_cbs, lut_vector_size, tau,
max_shared_memory);
check_cuda_error(cudaGetLastError());
// Blind rotation + sample extraction
// mbr = tau * p - r = log2(N)
// br_ggsw is a pointer to a sub-part of the ggsw_out_cbs buffer, for the
// blind rotation
uint32_t cmux_ggsw_len =
max(0, (int)number_of_inputs - (int)params::log2_degree);
Torus *br_ggsw =
(Torus *)ggsw_out_cbs +
(ptrdiff_t)(cmux_ggsw_len * level_count_cbs * (glwe_dimension + 1) *
(glwe_dimension + 1) * polynomial_size);
host_blind_rotate_and_sample_extraction<Torus, STorus, params>(
v_stream, gpu_index, lwe_array_out, br_ggsw,
(Torus *)glwe_array_out_cmux_tree, br_se_buffer, mbr_size, tau,
glwe_dimension, polynomial_size, base_log_cbs, level_count_cbs,
max_shared_memory);
}
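/*
 * Worked example (illustrative numbers, added for clarity): for tau = 2 LUTs
 * with p = 6 extracted bits each and N = 1024 (params::log2_degree = 10),
 * number_of_inputs = 12 GGSWs come out of the circuit bootstrap. The
 * cmux_ggsw_len = max(0, 12 - 10) = 2 most significant GGSWs drive the CMUX
 * tree (r = 2 layers over 2^r = 4 GLWE leaves per LUT), and the remaining
 * mbr_size = min(10, 12) = 10 GGSWs (br_ggsw) drive the final blind rotation
 * and sample extraction that produce the tau output LWEs.
 */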
template <typename Torus>
__host__ __device__ uint64_t get_buffer_size_wop_pbs(
uint32_t lwe_dimension, uint32_t total_bits_of_crt_decomposition) {
uint64_t buffer_size = (lwe_dimension + 1) *
(total_bits_of_crt_decomposition) *
sizeof(Torus); // lwe_array_out_bit_extract
return buffer_size + buffer_size % sizeof(double2);
}
template <typename Torus, typename STorus, typename params>
__host__ void scratch_wop_pbs(
void *v_stream, uint32_t gpu_index, int8_t **wop_pbs_buffer,
uint32_t *delta_log_array, uint32_t *cbs_delta_log, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t level_count_cbs,
uint32_t level_count_bsk, uint32_t *number_of_bits_to_extract_array,
uint32_t crt_decomposition_size, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
cudaSetDevice(gpu_index);
auto stream = static_cast<cudaStream_t *>(v_stream);
uint32_t ciphertext_total_bits_count = sizeof(Torus) * 8;
int total_bits_to_extract = 0;
for (int i = 0; i < crt_decomposition_size; i++) {
total_bits_to_extract += number_of_bits_to_extract_array[i];
delta_log_array[i] =
ciphertext_total_bits_count - number_of_bits_to_extract_array[i];
}
uint64_t bit_extract_buffer_size =
get_buffer_size_extract_bits<Torus>(glwe_dimension, lwe_dimension,
polynomial_size,
crt_decomposition_size) +
get_buffer_size_bootstrap_fast_low_latency<Torus>(
glwe_dimension, polynomial_size, level_count_bsk,
crt_decomposition_size, max_shared_memory);
uint32_t cbs_vp_number_of_inputs = total_bits_to_extract;
uint32_t mbr_size =
std::min(params::log2_degree, (int)(total_bits_to_extract));
if (allocate_gpu_memory) {
uint64_t buffer_size =
bit_extract_buffer_size +
get_buffer_size_wop_pbs<Torus>(lwe_dimension, total_bits_to_extract) +
get_buffer_size_cbs_vp<Torus>(glwe_dimension, polynomial_size,
level_count_cbs, crt_decomposition_size,
cbs_vp_number_of_inputs) +
get_buffer_size_cbs<Torus>(glwe_dimension, lwe_dimension,
polynomial_size, level_count_cbs,
cbs_vp_number_of_inputs) +
get_buffer_size_bootstrap_low_latency<Torus>(
glwe_dimension, polynomial_size, level_count_bsk,
cbs_vp_number_of_inputs * level_count_cbs, max_shared_memory) +
get_buffer_size_cmux_tree<Torus, params>(
glwe_dimension, polynomial_size, level_count_cbs,
(1 << cbs_vp_number_of_inputs), crt_decomposition_size,
max_shared_memory) +
get_buffer_size_blind_rotation_sample_extraction<Torus>(
glwe_dimension, polynomial_size, level_count_cbs, mbr_size,
crt_decomposition_size, max_shared_memory);
*wop_pbs_buffer =
(int8_t *)cuda_malloc_async(buffer_size, stream, gpu_index);
}
int8_t *bit_extract_buffer =
(int8_t *)*wop_pbs_buffer + (ptrdiff_t)(get_buffer_size_wop_pbs<Torus>(
lwe_dimension, total_bits_to_extract));
scratch_extract_bits<Torus, STorus, params>(
v_stream, gpu_index, &bit_extract_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_count_bsk, crt_decomposition_size,
max_shared_memory, false);
int8_t *cbs_vp_buffer =
bit_extract_buffer + (ptrdiff_t)bit_extract_buffer_size;
scratch_circuit_bootstrap_vertical_packing<Torus, STorus, params>(
v_stream, gpu_index, &cbs_vp_buffer, cbs_delta_log, glwe_dimension,
lwe_dimension, polynomial_size, level_count_bsk, level_count_cbs,
total_bits_to_extract, crt_decomposition_size, max_shared_memory, false);
}
template <typename Torus, typename STorus, class params>
__host__ void host_wop_pbs(
void *v_stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_array_in, Torus *lut_vector, double2 *fourier_bsk, Torus *ksk,
Torus *cbs_fpksk, int8_t *wop_pbs_buffer, uint32_t cbs_delta_log,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t base_log_bsk, uint32_t level_count_bsk, uint32_t base_log_ksk,
uint32_t level_count_ksk, uint32_t base_log_pksk, uint32_t level_count_pksk,
uint32_t base_log_cbs, uint32_t level_count_cbs,
uint32_t *number_of_bits_to_extract_array, uint32_t *delta_log_array,
uint32_t crt_decomposition_size, uint32_t max_shared_memory) {
int total_bits_to_extract = 0;
for (int i = 0; i < crt_decomposition_size; i++) {
total_bits_to_extract += number_of_bits_to_extract_array[i];
}
int8_t *bit_extract_buffer = wop_pbs_buffer;
int8_t *lwe_array_out_bit_extract =
bit_extract_buffer +
(ptrdiff_t)(get_buffer_size_extract_bits<Torus>(
glwe_dimension, lwe_dimension, polynomial_size,
crt_decomposition_size) +
get_buffer_size_bootstrap_fast_low_latency<Torus>(
glwe_dimension, polynomial_size, level_count_bsk,
crt_decomposition_size, max_shared_memory));
host_extract_bits<Torus, params>(
v_stream, gpu_index, (Torus *)lwe_array_out_bit_extract, lwe_array_in,
bit_extract_buffer, ksk, fourier_bsk, number_of_bits_to_extract_array,
delta_log_array, glwe_dimension * polynomial_size, lwe_dimension,
glwe_dimension, polynomial_size, base_log_bsk, level_count_bsk,
base_log_ksk, level_count_ksk, crt_decomposition_size, max_shared_memory);
check_cuda_error(cudaGetLastError());
int8_t *cbs_vp_buffer =
lwe_array_out_bit_extract + (ptrdiff_t)(get_buffer_size_wop_pbs<Torus>(
lwe_dimension, total_bits_to_extract));
host_circuit_bootstrap_vertical_packing<Torus, STorus, params>(
v_stream, gpu_index, lwe_array_out, (Torus *)lwe_array_out_bit_extract,
lut_vector, fourier_bsk, cbs_fpksk, cbs_vp_buffer, cbs_delta_log,
glwe_dimension, lwe_dimension, polynomial_size, base_log_bsk,
level_count_bsk, base_log_pksk, level_count_pksk, base_log_cbs,
level_count_cbs, total_bits_to_extract, crt_decomposition_size,
max_shared_memory);
check_cuda_error(cudaGetLastError());
}
#endif // WOP_PBS_H

View File

@@ -1,11 +0,0 @@
option(CONCRETE_CUDA_BUILD_TESTS "Build the test tool" ON)
option(CONCRETE_CUDA_BUILD_BENCHMARKS "Build the benchmark tool" ON)
if(CONCRETE_CUDA_BUILD_TESTS)
message(STATUS "Building with Concrete CUDA test tool")
add_subdirectory(test)
endif()
if(CONCRETE_CUDA_BUILD_BENCHMARKS)
message(STATUS "Building with Concrete CUDA benchmark tool")
add_subdirectory(benchmark)
endif()

View File

@@ -1,93 +0,0 @@
find_package(CUDA REQUIRED)
find_package(CUDAToolkit REQUIRED)
if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release)
endif()
# Disable the Google Benchmark requirement on Google Test
set(BENCHMARK_ENABLE_GTEST_TESTS OFF)
set(BENCHMARK_ENABLE_TESTING OFF)
include(FetchContent)
FetchContent_Declare(
googlebenchmark
GIT_REPOSITORY https://github.com/google/benchmark.git
GIT_TAG v1.7.1)
FetchContent_MakeAvailable(googlebenchmark)
# Enable ExternalProject CMake module
include(ExternalProject)
set(CONCRETE_CPU_BINARY_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../../../concrete-cpu/implementation/target/release")
set(CONCRETE_CPU_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../../../concrete-cpu/implementation")
set(CONCRETE_CUDA_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../")
# Enable ExternalProject CMake module
include(ExternalProject)
if(NOT TARGET concrete_cpu)
ExternalProject_Add(
concrete_cpu
SOURCE_DIR ${CONCRETE_CPU_SOURCE_DIR}
DOWNLOAD_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_COMMAND cargo +nightly build --release --features=nightly
COMMAND cargo +nightly build --release --features=nightly
BINARY_DIR ${CONCRETE_CPU_BINARY_DIR}
BUILD_ALWAYS true
INSTALL_COMMAND ""
LOG_BUILD ON)
endif()
set(TFHE_RS_SOURCE_DIR "${CMAKE_BINARY_DIR}/tfhe-rs")
set(TFHE_RS_BINARY_DIR "${TFHE_RS_SOURCE_DIR}/target/release")
if(NOT TARGET tfhe-rs)
ExternalProject_Add(
tfhe-rs
GIT_REPOSITORY https://github.com/zama-ai/tfhe-rs.git
GIT_TAG main
SOURCE_DIR ${TFHE_RS_SOURCE_DIR}
BUILD_IN_SOURCE 1
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_COMMAND make build_c_api
INSTALL_COMMAND ""
LOG_BUILD ON)
endif()
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../include)
include_directories(${CONCRETE_CPU_SOURCE_DIR}/include)
include_directories(${CONCRETE_CUDA_SOURCE_DIR}/include)
include_directories(${TFHE_RS_BINARY_DIR})
include_directories("${CUDA_INCLUDE_DIRS}" "${CMAKE_CURRENT_SOURCE_DIR}")
find_package(OpenMP REQUIRED)
# Add the OpenMP flag to the compiler flags
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
add_library(concrete_cpu_lib STATIC IMPORTED)
add_dependencies(concrete_cpu_lib concrete_cpu)
set_target_properties(concrete_cpu_lib PROPERTIES IMPORTED_LOCATION ${CONCRETE_CPU_BINARY_DIR}/libconcrete_cpu.a)
add_library(tfhe_rs_lib STATIC IMPORTED)
add_dependencies(tfhe_rs_lib tfhe-rs)
set_target_properties(tfhe_rs_lib PROPERTIES IMPORTED_LOCATION ${TFHE_RS_BINARY_DIR}/libtfhe.a)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,--no-as-needed,--allow-multiple-definition -ldl")
set(BINARY benchmark_concrete_cuda)
file(
GLOB_RECURSE BENCH_SOURCES
LIST_DIRECTORIES false
benchmark*.cpp main.cpp)
add_executable(${BINARY} ${BENCH_SOURCES} ../utils.cpp ../setup_and_teardown.cpp)
set_target_properties(benchmark_concrete_cuda PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(
benchmark_concrete_cuda
PUBLIC benchmark::benchmark concrete_cpu_lib tfhe_rs_lib concrete_cuda OpenMP::OpenMP_CXX
PRIVATE CUDA::cudart)

View File

@@ -1,106 +0,0 @@
# benchmark_concrete_cuda
This benchmark tool is built on top of the Google Benchmark library. It measures the performance of the concrete framework's CUDA-accelerated functions and helps identify potential bottlenecks.
The output format can be adjusted according to the user's interest.
Each benchmark is executed once and targets a single function. Internally, for each benchmark, the tool repeats the targeted function as many times as needed to report an execution time with sufficient reliability. At this point, the variation we've observed in the benchmarked functions is relatively small, so we chose not to repeat benchmarks by default. However, this can also be tuned by the user if needed.
## How to Compile
The first step in compiling code with CMake is to create a build directory. This directory will
contain all the files generated during the build process, such as object files and executables.
We recommend creating this directory outside of the source directory, but inside the
implementation folder, to keep the source directory clean.
```bash
$ cd concrete/backends/concrete-cuda/implementation
$ mkdir build
$ cd build
```
Run CMake to generate the build files and then use make to compile the project.
```bash
$ cmake ..
$ make
```
The binary can be found in `concrete/backends/concrete-cuda/implementation/build/test_and_benchmark/benchmark`.
## How to Run Benchmarks
To run benchmarks, you can simply execute the `benchmark_concrete_cuda` executable with no arguments:
```bash
$ test_and_benchmark/benchmark/benchmark_concrete_cuda
```
This will run all the available benchmarks.
## Output format
The reports are printed to the standard output if you don't pass any arguments. However, Google Benchmark has extended documentation on how to export them in other formats and to files, e.g., `--benchmark_format=json` will print everything in JSON.
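For instance, the first command below prints the whole report as JSON on the standard output, while the second one uses the standard `--benchmark_out` flags of Google Benchmark to write it to a `results.json` file (the binary path assumes the build layout described above):
```bash
$ test_and_benchmark/benchmark/benchmark_concrete_cuda --benchmark_format=json
$ test_and_benchmark/benchmark/benchmark_concrete_cuda --benchmark_out=results.json --benchmark_out_format=json
```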
## How to Filter Benchmarks
You can filter benchmarks by specifying a regular expression as an argument. Only benchmarks whose name matches the regular expression will be executed.
For example, to run only benchmarks whose name contains the word "Bootstrap", you can execute:
```bash
$ test_and_benchmark/benchmark/benchmark_concrete_cuda --benchmark_filter=Bootstrap
```
The parameter `--benchmark_list_tests` can be used to list all the available benchmarks.
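For example, to list every available benchmark without running anything:
```bash
$ test_and_benchmark/benchmark/benchmark_concrete_cuda --benchmark_list_tests=true
```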
## How to Set the Time Unit
By default, benchmarks are reported in seconds. However, you can change the time unit to one of the following:
* `ns` (nanoseconds)
* `us` (microseconds)
* `ms` (milliseconds)
* `s` (seconds)
To set the time unit, use the `--benchmark_time_unit` option followed by the desired time unit:
```bash
$ test_and_benchmark/benchmark/benchmark_concrete_cuda --benchmark_time_unit=us
```
## How to Set the Number of Iterations
By default, each benchmark is executed for a number of iterations that is automatically determined by the Google Benchmark library.
However, you can increase the minimum time used for each measurement to increase the number of
iterations by using `--benchmark_min_time`. For instance:
```bash
$ test_and_benchmark/benchmark/benchmark_concrete_cuda --benchmark_min_time=10
```
will force the tool to keep iterating each benchmark for at least 10 seconds.
## Statistics about the benchmarks
By default, each benchmark is executed only once. However, if you use
`--benchmark_repetitions` you can repeat each benchmark several times and compute the mean,
median, and standard deviation across the runs.
```bash
$ test_and_benchmark/benchmark/benchmark_concrete_cuda --benchmark_repetitions=10
```
Doing this, the execution time of each run will be reported. If you prefer, you can use
`--benchmark_report_aggregates_only=true` to report only the statistical data, or
`--benchmark_display_aggregates_only=true`, which displays only the statistical data on the
standard output while still reporting every run in the output file.
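For example, to repeat each benchmark 10 times and display only the aggregated statistics:
```bash
$ test_and_benchmark/benchmark/benchmark_concrete_cuda --benchmark_repetitions=10 --benchmark_report_aggregates_only=true
```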
## Known issues
When results are displayed on the standard output, the throughput unit is labeled as "number of operations per second". This is a quirk in the way Google Benchmark presents user-defined counters: the value is actually computed per dollar (using an AWS cost per second), so the correct unit is "operations per dollar".
## Conclusion
With these options, you can easily run and filter benchmarks, set the time unit, and control the number of iterations and repetitions of `benchmark_concrete_cuda`. If you have any questions or issues, please feel free to contact us.
To learn more about the Google Benchmark library, please refer to the [official user guide](https://github.com/google/benchmark/blob/main/docs/user_guide.md).

View File

@@ -1,141 +0,0 @@
#include <benchmark/benchmark.h>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <setup_and_teardown.h>
#include <vector>
using namespace std;
const unsigned MAX_INPUTS = 4;
const unsigned SAMPLES = 1;
typedef struct {
int lwe_dimension;
int glwe_dimension;
int polynomial_size;
int pbs_base_log;
int pbs_level;
int ks_base_log;
int ks_level;
int number_of_inputs;
int number_of_bits_of_message_including_padding_0;
int number_of_bits_of_message_including_padding_1;
int number_of_bits_of_message_including_padding_2;
int number_of_bits_of_message_including_padding_3;
int number_of_bits_to_extract_0;
int number_of_bits_to_extract_1;
int number_of_bits_to_extract_2;
int number_of_bits_to_extract_3;
} BitExtractionBenchmarkParams;
class BitExtraction_u64 : public benchmark::Fixture {
protected:
int lwe_dimension;
int glwe_dimension;
int polynomial_size;
double lwe_modular_variance = 7.52316384526264e-37;
double glwe_modular_variance = 7.52316384526264e-37;
int pbs_base_log;
int pbs_level;
int ks_base_log;
int ks_level;
uint32_t number_of_bits_of_message_including_padding_array[MAX_INPUTS];
uint32_t number_of_bits_to_extract_array[MAX_INPUTS];
int number_of_inputs;
uint64_t delta_array[MAX_INPUTS];
uint32_t delta_log_array[MAX_INPUTS];
Csprng *csprng;
cudaStream_t *stream_array[SAMPLES];
int gpu_index = 0;
uint64_t *plaintexts;
double *d_fourier_bsk;
uint64_t *d_ksk;
uint64_t *d_lwe_ct_in_array;
uint64_t *d_lwe_ct_out_array;
int8_t *bit_extract_buffer_array[SAMPLES];
uint64_t *lwe_sk_in;
uint64_t *lwe_sk_out;
public:
// Test arithmetic functions
void SetUp(const ::benchmark::State &state) {
for (size_t i = 0; i < SAMPLES; i++) {
stream_array[i] = cuda_create_stream(0);
}
// TestParams
lwe_dimension = state.range(0);
glwe_dimension = state.range(1);
polynomial_size = state.range(2);
pbs_base_log = state.range(3);
pbs_level = state.range(4);
ks_base_log = state.range(5);
ks_level = state.range(6);
number_of_inputs = state.range(7);
for (int i = 0; i < number_of_inputs; i++) {
number_of_bits_of_message_including_padding_array[i] = state.range(8 + i);
number_of_bits_to_extract_array[i] = state.range(12 + i);
}
bit_extraction_setup(
stream_array, &csprng, &lwe_sk_in, &lwe_sk_out, &d_fourier_bsk, &d_ksk,
&plaintexts, &d_lwe_ct_in_array, &d_lwe_ct_out_array,
bit_extract_buffer_array, lwe_dimension, glwe_dimension,
polynomial_size, lwe_modular_variance, glwe_modular_variance,
ks_base_log, ks_level, pbs_base_log, pbs_level,
number_of_bits_of_message_including_padding_array,
number_of_bits_to_extract_array, delta_log_array, delta_array,
number_of_inputs, 1, 1, gpu_index);
}
void TearDown(const ::benchmark::State &state) {
bit_extraction_teardown(stream_array, csprng, lwe_sk_in, lwe_sk_out,
d_fourier_bsk, d_ksk, plaintexts, d_lwe_ct_in_array,
d_lwe_ct_out_array, bit_extract_buffer_array,
SAMPLES, gpu_index);
}
};
BENCHMARK_DEFINE_F(BitExtraction_u64, ConcreteCuda_BitExtraction)
(benchmark::State &st) {
for (auto _ : st) {
// Execute bit extract
cuda_extract_bits_64(
stream_array[0], gpu_index, (void *)d_lwe_ct_out_array,
(void *)d_lwe_ct_in_array, bit_extract_buffer_array[0], (void *)d_ksk,
(void *)d_fourier_bsk, number_of_bits_to_extract_array, delta_log_array,
glwe_dimension * polynomial_size, lwe_dimension, glwe_dimension,
polynomial_size, pbs_base_log, pbs_level, ks_base_log, ks_level,
number_of_inputs, cuda_get_max_shared_memory(gpu_index));
cuda_synchronize_stream((void *)stream_array[0]);
}
st.counters["Throughput"] =
benchmark::Counter(number_of_inputs / get_aws_cost_per_second(),
benchmark::Counter::kIsIterationInvariantRate);
}
static void
BitExtractionBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
// Define the parameters to benchmark
std::vector<BitExtractionBenchmarkParams> params = {
(BitExtractionBenchmarkParams){585, 1, 1024, 10, 2, 4, 7, 4, 3, 4, 3, 3,
3, 4, 3, 3}};
// Add to the list of parameters to benchmark
for (auto x : params)
b->Args({x.lwe_dimension, x.glwe_dimension, x.polynomial_size,
x.pbs_base_log, x.pbs_level, x.ks_base_log, x.ks_level,
x.number_of_inputs,
x.number_of_bits_of_message_including_padding_0,
x.number_of_bits_of_message_including_padding_1,
x.number_of_bits_of_message_including_padding_2,
x.number_of_bits_of_message_including_padding_3,
x.number_of_bits_to_extract_0, x.number_of_bits_to_extract_1,
x.number_of_bits_to_extract_2, x.number_of_bits_to_extract_3});
}
BENCHMARK_REGISTER_F(BitExtraction_u64, ConcreteCuda_BitExtraction)
->Apply(BitExtractionBenchmarkGenerateParams);

View File

@@ -1,122 +0,0 @@
#include <benchmark/benchmark.h>
#include <cstdint>
#include <setup_and_teardown.h>
#include <stdio.h>
#include <stdlib.h>
typedef struct {
int lwe_dimension;
int glwe_dimension;
int polynomial_size;
int pbs_base_log;
int pbs_level;
int pksk_base_log;
int pksk_level;
int cbs_base_log;
int cbs_level;
int number_of_inputs;
} CircuitBootstrapBenchmarkParams;
class CircuitBootstrap_u64 : public benchmark::Fixture {
protected:
int lwe_dimension;
int glwe_dimension;
int polynomial_size;
double lwe_modular_variance = 7.52316384526264e-37;
double glwe_modular_variance = 7.52316384526264e-37;
int pbs_base_log;
int pbs_level;
int pksk_base_log;
int pksk_level;
int cbs_base_log;
int cbs_level;
int number_of_inputs;
int number_of_bits_of_message_including_padding;
int ggsw_size;
uint64_t delta;
int delta_log;
Csprng *csprng;
cudaStream_t *stream;
int gpu_index = 0;
uint64_t *lwe_sk_in;
uint64_t *lwe_sk_out;
uint64_t *plaintexts;
double *d_fourier_bsk;
uint64_t *d_pksk;
uint64_t *d_lwe_ct_in_array;
uint64_t *d_ggsw_ct_out_array;
uint64_t *d_lut_vector_indexes;
int8_t *cbs_buffer;
public:
// Test arithmetic functions
void SetUp(const ::benchmark::State &state) {
stream = cuda_create_stream(0);
// TestParams
lwe_dimension = state.range(0);
glwe_dimension = state.range(1);
polynomial_size = state.range(2);
pbs_base_log = state.range(3);
pbs_level = state.range(4);
pksk_base_log = state.range(5);
pksk_level = state.range(6);
cbs_base_log = state.range(7);
cbs_level = state.range(8);
number_of_inputs = state.range(9);
// We generate binary messages
number_of_bits_of_message_including_padding = 2;
ggsw_size = cbs_level * (glwe_dimension + 1) * (glwe_dimension + 1) *
polynomial_size;
circuit_bootstrap_setup(
stream, &csprng, &lwe_sk_in, &lwe_sk_out, &d_fourier_bsk, &d_pksk,
&plaintexts, &d_lwe_ct_in_array, &d_ggsw_ct_out_array,
&d_lut_vector_indexes, &cbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, lwe_modular_variance, glwe_modular_variance,
pksk_base_log, pksk_level, pbs_base_log, pbs_level, cbs_level,
number_of_bits_of_message_including_padding, ggsw_size, &delta_log,
&delta, number_of_inputs, 1, 1, gpu_index);
}
void TearDown(const ::benchmark::State &state) {
circuit_bootstrap_teardown(stream, csprng, lwe_sk_in, lwe_sk_out,
d_fourier_bsk, d_pksk, plaintexts,
d_lwe_ct_in_array, d_lut_vector_indexes,
d_ggsw_ct_out_array, cbs_buffer, gpu_index);
}
};
BENCHMARK_DEFINE_F(CircuitBootstrap_u64, ConcreteCuda_CircuitBootstrap)
(benchmark::State &st) {
for (auto _ : st) {
// Execute circuit bootstrap
cuda_circuit_bootstrap_64(
stream, gpu_index, (void *)d_ggsw_ct_out_array,
(void *)d_lwe_ct_in_array, (void *)d_fourier_bsk, (void *)d_pksk,
(void *)d_lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
glwe_dimension, lwe_dimension, pbs_level, pbs_base_log, pksk_level,
pksk_base_log, cbs_level, cbs_base_log, number_of_inputs,
cuda_get_max_shared_memory(gpu_index));
cuda_synchronize_stream(stream);
}
st.counters["Throughput"] =
benchmark::Counter(number_of_inputs / get_aws_cost_per_second(),
benchmark::Counter::kIsIterationInvariantRate);
}
static void
CircuitBootstrapBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
// Define the parameters to benchmark
std::vector<CircuitBootstrapBenchmarkParams> params = {
(CircuitBootstrapBenchmarkParams){10, 2, 512, 11, 2, 15, 2, 10, 1, 100}};
// Add to the list of parameters to benchmark
for (auto x : params)
b->Args({x.lwe_dimension, x.glwe_dimension, x.polynomial_size,
x.pbs_base_log, x.pbs_level, x.pksk_base_log, x.pksk_level,
x.cbs_base_log, x.cbs_level, x.number_of_inputs});
}
BENCHMARK_REGISTER_F(CircuitBootstrap_u64, ConcreteCuda_CircuitBootstrap)
->Apply(CircuitBootstrapBenchmarkGenerateParams);

View File

@@ -1,226 +0,0 @@
#include <benchmark/benchmark.h>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <omp.h>
#include <setup_and_teardown.h>
typedef struct {
int lwe_dimension;
int glwe_dimension;
int polynomial_size;
int pbs_base_log;
int pbs_level;
int input_lwe_ciphertext_count;
} BootstrapBenchmarkParams;
class Bootstrap_u64 : public benchmark::Fixture {
protected:
int lwe_dimension;
int glwe_dimension;
int polynomial_size;
int input_lwe_ciphertext_count;
double lwe_modular_variance = 0.000007069849454709433;
double glwe_modular_variance = 0.00000000000000029403601535432533;
int pbs_base_log;
int pbs_level;
int message_modulus = 4;
int carry_modulus = 4;
int payload_modulus;
uint64_t delta;
std::vector<double *> d_fourier_bsk_array;
std::vector<uint64_t *> d_lut_pbs_identity;
std::vector<uint64_t *> d_lut_pbs_indexes;
std::vector<uint64_t *> d_lwe_ct_in_array;
std::vector<uint64_t *> d_lwe_ct_out_array;
uint64_t *lwe_ct_array;
uint64_t *lwe_sk_in_array;
uint64_t *lwe_sk_out_array;
uint64_t *plaintexts;
Csprng *csprng;
std::vector<int8_t *> pbs_buffer;
int num_gpus;
std::vector<cudaStream_t *> streams;
std::vector<int> input_lwe_ciphertext_count_per_gpu;
public:
void SetUp(const ::benchmark::State &state) {
lwe_dimension = state.range(0);
glwe_dimension = state.range(1);
polynomial_size = state.range(2);
pbs_base_log = state.range(3);
pbs_level = state.range(4);
input_lwe_ciphertext_count = state.range(5);
num_gpus = std::min(cuda_get_number_of_gpus(), input_lwe_ciphertext_count);
for (int gpu_index = 0; gpu_index < num_gpus; gpu_index++) {
cudaSetDevice(gpu_index);
cudaStream_t *stream = cuda_create_stream(gpu_index);
streams.push_back(stream);
int input_lwe_ciphertext_count_on_gpu = number_of_inputs_on_gpu(
gpu_index, input_lwe_ciphertext_count, num_gpus);
double *d_fourier_bsk_array_per_gpu;
uint64_t *d_lut_pbs_identity_per_gpu;
uint64_t *d_lut_pbs_indexes_per_gpu;
uint64_t *d_lwe_ct_in_array_per_gpu;
uint64_t *d_lwe_ct_out_array_per_gpu;
int8_t *pbs_buffer_per_gpu;
bootstrap_classical_setup(
stream, &csprng, &lwe_sk_in_array, &lwe_sk_out_array,
&d_fourier_bsk_array_per_gpu, &plaintexts,
&d_lut_pbs_identity_per_gpu, &d_lut_pbs_indexes_per_gpu,
&d_lwe_ct_in_array_per_gpu, &d_lwe_ct_out_array_per_gpu,
lwe_dimension, glwe_dimension, polynomial_size, lwe_modular_variance,
glwe_modular_variance, pbs_base_log, pbs_level, message_modulus,
carry_modulus, &payload_modulus, &delta,
input_lwe_ciphertext_count_on_gpu, 1, 1, gpu_index);
size_t free, total;
cudaMemGetInfo(&free, &total);
uint64_t buffer_size = get_buffer_size_bootstrap_low_latency_64(
glwe_dimension, polynomial_size, pbs_level,
input_lwe_ciphertext_count_on_gpu,
cuda_get_max_shared_memory(gpu_index));
assert(buffer_size <= free); // the scratch buffer must fit in the available device memory
scratch_cuda_bootstrap_low_latency_64(
stream, gpu_index, &pbs_buffer_per_gpu, glwe_dimension,
polynomial_size, pbs_level, input_lwe_ciphertext_count_on_gpu,
cuda_get_max_shared_memory(gpu_index), true);
d_fourier_bsk_array.push_back(d_fourier_bsk_array_per_gpu);
d_lut_pbs_identity.push_back(d_lut_pbs_identity_per_gpu);
d_lut_pbs_indexes.push_back(d_lut_pbs_indexes_per_gpu);
d_lwe_ct_in_array.push_back(d_lwe_ct_in_array_per_gpu);
d_lwe_ct_out_array.push_back(d_lwe_ct_out_array_per_gpu);
pbs_buffer.push_back(pbs_buffer_per_gpu);
input_lwe_ciphertext_count_per_gpu.push_back(
input_lwe_ciphertext_count_on_gpu);
}
// We keep the following for the benchmarks with copies
lwe_ct_array = (uint64_t *)malloc(
(lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint64_t));
}
void TearDown(const ::benchmark::State &state) {
concrete_cpu_destroy_concrete_csprng(csprng);
free(csprng);
free(lwe_sk_in_array);
free(lwe_sk_out_array);
free(plaintexts);
for (int gpu_index = 0; gpu_index < num_gpus; gpu_index++) {
cudaSetDevice(gpu_index);
cleanup_cuda_bootstrap_low_latency(streams[gpu_index], gpu_index,
&pbs_buffer[gpu_index]);
cuda_drop_async(d_fourier_bsk_array[gpu_index], streams[gpu_index],
gpu_index);
cuda_drop_async(d_lut_pbs_identity[gpu_index], streams[gpu_index],
gpu_index);
cuda_drop_async(d_lut_pbs_indexes[gpu_index], streams[gpu_index],
gpu_index);
cuda_drop_async(d_lwe_ct_in_array[gpu_index], streams[gpu_index],
gpu_index);
cuda_drop_async(d_lwe_ct_out_array[gpu_index], streams[gpu_index],
gpu_index);
cuda_synchronize_stream(streams[gpu_index]);
cuda_destroy_stream(streams[gpu_index], gpu_index);
}
d_fourier_bsk_array.clear();
d_lut_pbs_identity.clear();
d_lut_pbs_indexes.clear();
d_lwe_ct_in_array.clear();
d_lwe_ct_out_array.clear();
pbs_buffer.clear();
input_lwe_ciphertext_count_per_gpu.clear();
streams.clear();
cudaDeviceReset();
}
};
BENCHMARK_DEFINE_F(Bootstrap_u64, ConcreteCuda_LowLatencyPBS)
(benchmark::State &st) {
for (auto _ : st) {
#pragma omp parallel for
for (int gpu_index = 0; gpu_index < num_gpus; gpu_index++) {
// Execute PBS
cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
streams[gpu_index], gpu_index, (void *)d_lwe_ct_out_array[gpu_index],
(void *)d_lut_pbs_identity[gpu_index],
(void *)d_lut_pbs_indexes[gpu_index],
(void *)d_lwe_ct_in_array[gpu_index],
(void *)d_fourier_bsk_array[gpu_index], pbs_buffer[gpu_index],
lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log,
pbs_level, input_lwe_ciphertext_count_per_gpu[gpu_index], 1, 0,
cuda_get_max_shared_memory(gpu_index));
}
for (int gpu_index = 0; gpu_index < num_gpus; gpu_index++) {
cudaSetDevice(gpu_index);
cuda_synchronize_stream(streams[gpu_index]);
}
}
st.counters["Throughput"] =
benchmark::Counter(input_lwe_ciphertext_count / get_aws_cost_per_second(),
benchmark::Counter::kIsIterationInvariantRate);
}
static void
BootstrapBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
// Define the parameters to benchmark
// lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log, pbs_level,
// input_lwe_ciphertext_count
std::vector<BootstrapBenchmarkParams> params = {
// BOOLEAN_DEFAULT_PARAMETERS
(BootstrapBenchmarkParams){777, 3, 512, 18, 1, 1},
(BootstrapBenchmarkParams){777, 3, 512, 18, 1, 1000},
// BOOLEAN_TFHE_LIB_PARAMETERS
(BootstrapBenchmarkParams){830, 2, 1024, 23, 1, 1},
(BootstrapBenchmarkParams){830, 2, 1024, 23, 1, 1000},
// SHORTINT_PARAM_MESSAGE_1_CARRY_0
(BootstrapBenchmarkParams){678, 5, 256, 15, 1, 1},
(BootstrapBenchmarkParams){678, 5, 256, 15, 1, 1000},
// SHORTINT_PARAM_MESSAGE_1_CARRY_1
(BootstrapBenchmarkParams){684, 3, 512, 18, 1, 1},
(BootstrapBenchmarkParams){684, 3, 512, 18, 1, 1000},
// SHORTINT_PARAM_MESSAGE_2_CARRY_0
(BootstrapBenchmarkParams){656, 2, 512, 8, 2, 1},
(BootstrapBenchmarkParams){656, 2, 512, 8, 2, 1000},
// SHORTINT_PARAM_MESSAGE_1_CARRY_2
// SHORTINT_PARAM_MESSAGE_2_CARRY_1
// SHORTINT_PARAM_MESSAGE_3_CARRY_0
(BootstrapBenchmarkParams){742, 2, 1024, 23, 1, 1},
(BootstrapBenchmarkParams){742, 2, 1024, 23, 1, 1000},
// SHORTINT_PARAM_MESSAGE_1_CARRY_3
// SHORTINT_PARAM_MESSAGE_2_CARRY_2
// SHORTINT_PARAM_MESSAGE_3_CARRY_1
// SHORTINT_PARAM_MESSAGE_4_CARRY_0
(BootstrapBenchmarkParams){745, 1, 2048, 23, 1, 1},
(BootstrapBenchmarkParams){745, 1, 2048, 23, 1, 1000},
// SHORTINT_PARAM_MESSAGE_5_CARRY_0
// SHORTINT_PARAM_MESSAGE_3_CARRY_2
(BootstrapBenchmarkParams){807, 1, 4096, 22, 1, 1},
(BootstrapBenchmarkParams){807, 1, 4096, 22, 1, 1000},
// SHORTINT_PARAM_MESSAGE_6_CARRY_0
(BootstrapBenchmarkParams){915, 1, 8192, 22, 1, 1},
(BootstrapBenchmarkParams){915, 1, 8192, 22, 1, 100},
// SHORTINT_PARAM_MESSAGE_3_CARRY_3
//(BootstrapBenchmarkParams){864, 1, 8192, 15, 2, 100},
// SHORTINT_PARAM_MESSAGE_4_CARRY_3
// SHORTINT_PARAM_MESSAGE_7_CARRY_0
(BootstrapBenchmarkParams){930, 1, 16384, 15, 2, 1},
(BootstrapBenchmarkParams){930, 1, 16384, 15, 2, 100},
};
// Add to the list of parameters to benchmark
for (auto x : params) {
b->Args({x.lwe_dimension, x.glwe_dimension, x.polynomial_size,
x.pbs_base_log, x.pbs_level, x.input_lwe_ciphertext_count});
}
}
BENCHMARK_REGISTER_F(Bootstrap_u64, ConcreteCuda_LowLatencyPBS)
->Apply(BootstrapBenchmarkGenerateParams);

View File

@@ -1,96 +0,0 @@
#include <benchmark/benchmark.h>
#include <cstdint>
#include <functional>
#include <setup_and_teardown.h>
#include <stdio.h>
#include <stdlib.h>
typedef struct {
int glwe_dimension;
int polynomial_size;
int p;
int tau;
int base_log;
int level_count;
} CMUXTreeBenchmarkParams;
class CMUXTree_u64 : public benchmark::Fixture {
protected:
int glwe_dimension;
int polynomial_size;
int p;
int tau;
double glwe_modular_variance = 0.00000000000000029403601535432533;
int base_log;
int level_count;
uint64_t delta;
uint32_t delta_log = 60;
Csprng *csprng;
cudaStream_t *stream;
int gpu_index = 0;
uint64_t *d_lut_identity;
uint64_t *d_ggsw_bit_array;
uint64_t *plaintexts;
uint64_t *d_glwe_out;
uint64_t *glwe_sk;
int8_t *cmux_tree_buffer = nullptr;
public:
// Test arithmetic functions
void SetUp(const ::benchmark::State &state) {
stream = cuda_create_stream(0);
// TestParams
glwe_dimension = state.range(0);
polynomial_size = state.range(1);
p = state.range(2);
tau = state.range(3);
base_log = state.range(4);
level_count = state.range(5);
cmux_tree_setup(stream, &csprng, &glwe_sk, &d_lut_identity, &plaintexts,
&d_ggsw_bit_array, &cmux_tree_buffer, &d_glwe_out,
glwe_dimension, polynomial_size, base_log, level_count,
glwe_modular_variance, p, tau, &delta_log, 1, 1, gpu_index);
// Value of the shift we multiply our messages by
delta = ((uint64_t)(1) << delta_log);
}
void TearDown(const ::benchmark::State &state) {
cmux_tree_teardown(stream, &csprng, &glwe_sk, &d_lut_identity, &plaintexts,
&d_ggsw_bit_array, &cmux_tree_buffer, &d_glwe_out,
gpu_index);
}
};
BENCHMARK_DEFINE_F(CMUXTree_u64, ConcreteCuda_CMUXTree)(benchmark::State &st) {
for (auto _ : st) {
// Execute scratch/CMUX tree/cleanup
cuda_cmux_tree_64(stream, gpu_index, (void *)d_glwe_out,
(void *)d_ggsw_bit_array, (void *)d_lut_identity,
cmux_tree_buffer, glwe_dimension, polynomial_size,
base_log, level_count, (1 << (tau * p)), tau,
cuda_get_max_shared_memory(gpu_index));
cuda_synchronize_stream(stream);
}
st.counters["Throughput"] =
benchmark::Counter(tau * p / get_aws_cost_per_second(),
benchmark::Counter::kIsIterationInvariantRate);
}
static void CMUXTreeBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
// Define the parameters to benchmark
std::vector<CMUXTreeBenchmarkParams> params = {
// glwe_dimension, polynomial_size, p, tau, base_log, level_count,
(CMUXTreeBenchmarkParams){2, 256, 10, 4, 6, 3},
};
// Add to the list of parameters to benchmark
for (auto x : params)
b->Args({x.glwe_dimension, x.polynomial_size, x.p, x.tau, x.base_log,
x.level_count});
}
BENCHMARK_REGISTER_F(CMUXTree_u64, ConcreteCuda_CMUXTree)
->Apply(CMUXTreeBenchmarkGenerateParams);

View File

@@ -1,313 +0,0 @@
#include <benchmark/benchmark.h>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <setup_and_teardown.h>
#include <omp.h>
const bool USE_MULTI_GPU = false;
typedef struct {
int lwe_dimension;
int glwe_dimension;
int polynomial_size;
double lwe_modular_variance;
double glwe_modular_variance;
int pbs_base_log;
int pbs_level;
int ksk_base_log;
int ksk_level;
int total_message_bits;
int number_of_blocks;
int message_modulus;
int carry_modulus;
PBS_TYPE pbs_type;
} IntegerMultiplicationBenchmarkParams;
class IntegerMultiplication_u64 : public benchmark::Fixture {
protected:
int lwe_dimension;
int glwe_dimension;
int polynomial_size;
double lwe_modular_variance = 4.478453795193731e-11;
double glwe_modular_variance = 8.645717832544903e-32;
int pbs_base_log;
int pbs_level;
int ksk_base_log;
int ksk_level;
int message_modulus;
int carry_modulus;
int total_message_bits;
int number_of_blocks;
int payload_modulus;
PBS_TYPE pbs_type;
uint64_t delta;
std::vector<void *> d_bsk_array;
std::vector<uint64_t *> d_ksk_array;
std::vector<uint64_t *> d_lwe_ct_in_array_1;
std::vector<uint64_t *> d_lwe_ct_in_array_2;
std::vector<uint64_t *> d_lwe_ct_out_array;
uint64_t *lwe_sk_in;
uint64_t *lwe_sk_out;
uint64_t *plaintexts_1;
uint64_t *plaintexts_2;
std::vector<int_mul_memory<uint64_t> *> mem_ptr_array;
Csprng *csprng;
int max_gpus_to_use;
int operations_per_gpu;
int num_gpus;
public:
void SetUp(const ::benchmark::State &state) {
cudaDeviceSynchronize();
lwe_dimension = state.range(0);
glwe_dimension = state.range(1);
polynomial_size = state.range(2);
lwe_modular_variance = state.range(3);
glwe_modular_variance = state.range(4);
pbs_base_log = state.range(5);
pbs_level = state.range(6);
ksk_base_log = state.range(7);
ksk_level = state.range(8);
total_message_bits = state.range(9);
number_of_blocks = state.range(10);
message_modulus = state.range(11);
carry_modulus = state.range(12);
int pbs_type_int = state.range(13);
max_gpus_to_use = state.range(14);
operations_per_gpu = state.range(15);
pbs_type = static_cast<PBS_TYPE>(pbs_type_int);
num_gpus = std::min(cuda_get_number_of_gpus(), max_gpus_to_use);
for (int device = 0; device < num_gpus; device++) {
cudaSetDevice(device);
cudaStream_t *stream = cuda_create_stream(device);
void *d_bsk_array_per_gpu;
uint64_t *d_ksk_array_per_gpu;
uint64_t *d_lwe_ct_in_array_1_per_gpu;
uint64_t *d_lwe_ct_in_array_2_per_gpu;
uint64_t *d_lwe_ct_out_array_per_gpu;
int_mul_memory<uint64_t> *mem_ptr_per_gpu = new int_mul_memory<uint64_t>;
integer_multiplication_setup(
stream, &csprng, &lwe_sk_in, &lwe_sk_out,
&d_bsk_array_per_gpu, &d_ksk_array_per_gpu,
&plaintexts_1, &plaintexts_2, &d_lwe_ct_in_array_1_per_gpu,
&d_lwe_ct_in_array_2_per_gpu, &d_lwe_ct_out_array_per_gpu,
mem_ptr_per_gpu, lwe_dimension, glwe_dimension, polynomial_size,
lwe_modular_variance, glwe_modular_variance, pbs_base_log, pbs_level,
ksk_base_log, ksk_level, total_message_bits, number_of_blocks,
message_modulus, carry_modulus, &delta, 1, 1, pbs_type, device);
if (USE_MULTI_GPU) {
scratch_cuda_integer_mult_radix_ciphertext_kb_64_multi_gpu(
mem_ptr_per_gpu, d_bsk_array_per_gpu, d_ksk_array_per_gpu,
message_modulus, carry_modulus, glwe_dimension, lwe_dimension,
polynomial_size, pbs_base_log, pbs_level, ksk_base_log, ksk_level,
number_of_blocks, pbs_type, cuda_get_max_shared_memory(device),
true);
} else {
scratch_cuda_integer_mult_radix_ciphertext_kb_64(
stream, device, (void *)mem_ptr_per_gpu, message_modulus,
carry_modulus, glwe_dimension, lwe_dimension, polynomial_size,
pbs_base_log, pbs_level, ksk_base_log, ksk_level, number_of_blocks,
pbs_type, cuda_get_max_shared_memory(device), true);
}
d_bsk_array.push_back(d_bsk_array_per_gpu);
d_ksk_array.push_back(d_ksk_array_per_gpu);
d_lwe_ct_in_array_1.push_back(d_lwe_ct_in_array_1_per_gpu);
d_lwe_ct_in_array_2.push_back(d_lwe_ct_in_array_2_per_gpu);
d_lwe_ct_out_array.push_back(d_lwe_ct_out_array_per_gpu);
mem_ptr_array.push_back(mem_ptr_per_gpu);
cuda_synchronize_stream(stream);
cuda_destroy_stream(stream, device);
}
}
void TearDown(const ::benchmark::State &state) {
cudaDeviceSynchronize();
concrete_cpu_destroy_concrete_csprng(csprng);
free(csprng);
free(lwe_sk_in);
free(lwe_sk_out);
free(plaintexts_1);
free(plaintexts_2);
for (int device = 0; device < num_gpus; device++) {
cudaSetDevice(device);
cudaStream_t *stream = cuda_create_stream(device);
cuda_drop_async(d_bsk_array[device], stream, device);
cuda_drop_async(d_ksk_array[device], stream, device);
cuda_drop_async(d_lwe_ct_in_array_1[device], stream, device);
cuda_drop_async(d_lwe_ct_in_array_2[device], stream, device);
cuda_drop_async(d_lwe_ct_out_array[device], stream, device);
int_mul_memory<uint64_t> *mem_ptr = mem_ptr_array[device];
cuda_drop_async(mem_ptr->vector_result_sb, stream, 0);
cuda_drop_async(mem_ptr->block_mul_res, stream, 0);
cuda_drop_async(mem_ptr->small_lwe_vector, stream, 0);
cuda_drop_async(mem_ptr->lwe_pbs_out_array, stream, 0);
cuda_drop_async(mem_ptr->test_vector_array, stream, 0);
cuda_drop_async(mem_ptr->message_acc, stream, 0);
cuda_drop_async(mem_ptr->carry_acc, stream, 0);
cuda_drop_async(mem_ptr->test_vector_indexes, stream, 0);
cuda_drop_async(mem_ptr->tvi_message, stream, 0);
cuda_drop_async(mem_ptr->tvi_carry, stream, 0);
cuda_drop_async(mem_ptr->pbs_buffer, stream, 0);
for (int i = 0; i < mem_ptr->p2p_gpu_count; i++) {
cuda_drop_async(mem_ptr->device_to_device_buffer[i], mem_ptr->streams[i],
i);
cuda_drop_async(mem_ptr->pbs_buffer_multi_gpu[i], mem_ptr->streams[i], i);
cuda_drop_async(mem_ptr->pbs_input_multi_gpu[i], mem_ptr->streams[i], i);
cuda_drop_async(mem_ptr->pbs_output_multi_gpu[i], mem_ptr->streams[i], i);
cuda_drop_async(mem_ptr->test_vector_multi_gpu[i], mem_ptr->streams[i], i);
cuda_drop_async(mem_ptr->tvi_lsb_multi_gpu[i], mem_ptr->streams[i], i);
cuda_drop_async(mem_ptr->tvi_msb_multi_gpu[i], mem_ptr->streams[i], i);
cuda_drop_async(mem_ptr->tvi_message_multi_gpu[i], mem_ptr->streams[i], i);
cuda_drop_async(mem_ptr->tvi_carry_multi_gpu[i], mem_ptr->streams[i], i);
if (i) {
cuda_drop_async(mem_ptr->bsk_multi_gpu[i], mem_ptr->streams[i], i);
cuda_drop_async(mem_ptr->ksk_multi_gpu[i], mem_ptr->streams[i], i);
}
cuda_destroy_stream(mem_ptr->streams[i], i);
}
cuda_synchronize_stream(stream);
cuda_destroy_stream(stream, device);
}
d_bsk_array.clear();
d_ksk_array.clear();
d_lwe_ct_in_array_1.clear();
d_lwe_ct_in_array_2.clear();
d_lwe_ct_out_array.clear();
mem_ptr_array.clear();
cudaDeviceReset();
}
};
BENCHMARK_DEFINE_F(IntegerMultiplication_u64,
ConcreteCuda_IntegerMultiplication)
(benchmark::State &st) {
int8_t *mult_buffer;
uint32_t ct_degree_out = 0;
uint32_t ct_degree_left = 0;
uint32_t ct_degree_right = 0;
omp_set_nested(true);
for (auto _ : st) {
// Execute multiplication
#pragma omp parallel for num_threads(num_gpus)
for (int device = 0; device < num_gpus; device++) {
cudaSetDevice(device);
auto d_lwe_ct_out = d_lwe_ct_out_array[device];
auto d_lwe_ct_in_1 = d_lwe_ct_in_array_1[device];
auto d_lwe_ct_in_2 = d_lwe_ct_in_array_2[device];
auto d_bsk = d_bsk_array[device];
auto d_ksk = d_ksk_array[device];
auto mem_ptr = mem_ptr_array[device];
#pragma omp parallel for num_threads(operations_per_gpu)
for (int i = 0; i < operations_per_gpu; i++) {
cudaStream_t *stream = cuda_create_stream(device);
if (USE_MULTI_GPU) {
cuda_integer_mult_radix_ciphertext_kb_64_multi_gpu(
(void *)d_lwe_ct_out, (void *)d_lwe_ct_in_1,
(void *)d_lwe_ct_in_2, &ct_degree_out, &ct_degree_left,
&ct_degree_right, d_bsk, d_ksk, (void *)mem_ptr, message_modulus,
carry_modulus, glwe_dimension, lwe_dimension, polynomial_size,
pbs_base_log, pbs_level, ksk_base_log, ksk_level,
number_of_blocks, pbs_type, cuda_get_max_shared_memory(device));
} else {
cuda_integer_mult_radix_ciphertext_kb_64(
stream, device, (void *)d_lwe_ct_out, (void *)d_lwe_ct_in_1,
(void *)d_lwe_ct_in_2, &ct_degree_out, &ct_degree_left,
&ct_degree_right, d_bsk, d_ksk, (void *)mem_ptr, message_modulus,
carry_modulus, glwe_dimension, lwe_dimension, polynomial_size,
pbs_base_log, pbs_level, ksk_base_log, ksk_level,
number_of_blocks, pbs_type, cuda_get_max_shared_memory(device));
}
cuda_synchronize_stream(stream);
cuda_destroy_stream(stream, device);
}
}
}
}
static void IntegerMultiplicationBenchmarkGenerateParams(
benchmark::internal::Benchmark *b) {
// Define the parameters to benchmark
std::vector<IntegerMultiplicationBenchmarkParams> params = {
(IntegerMultiplicationBenchmarkParams){
744, 1, 2048, 4.478453795193731e-11, 8.645717832544903e-32, 23, 1, 3,
5, 8, 4, 4, 4, LOW_LAT},
(IntegerMultiplicationBenchmarkParams){
744, 1, 2048, 4.478453795193731e-11, 8.645717832544903e-32, 23, 1, 3,
5, 16, 8, 4, 4, LOW_LAT},
(IntegerMultiplicationBenchmarkParams){
744, 1, 2048, 4.478453795193731e-11, 8.645717832544903e-32, 23, 1, 3,
5, 32, 16, 4, 4, LOW_LAT},
(IntegerMultiplicationBenchmarkParams){
744, 1, 2048, 4.478453795193731e-11, 8.645717832544903e-32, 23, 1, 3,
5, 40, 20, 4, 4, LOW_LAT},
(IntegerMultiplicationBenchmarkParams){
744, 1, 2048, 4.478453795193731e-11, 8.645717832544903e-32, 23, 1, 3,
5, 64, 32, 4, 4, LOW_LAT},
(IntegerMultiplicationBenchmarkParams){
744, 1, 2048, 4.478453795193731e-11, 8.645717832544903e-32, 23, 1, 3,
5, 128, 64, 4, 4, LOW_LAT},
(IntegerMultiplicationBenchmarkParams){
744, 1, 2048, 4.478453795193731e-11, 8.645717832544903e-32, 23, 1, 3,
5, 256, 128, 4, 4, LOW_LAT},
(IntegerMultiplicationBenchmarkParams){
744, 1, 2048, 4.478453795193731e-11, 8.645717832544903e-32, 23, 1, 3,
5, 8, 4, 4, 4, MULTI_BIT},
(IntegerMultiplicationBenchmarkParams){
744, 1, 2048, 4.478453795193731e-11, 8.645717832544903e-32, 23, 1, 3,
5, 16, 8, 4, 4, MULTI_BIT},
(IntegerMultiplicationBenchmarkParams){
744, 1, 2048, 4.478453795193731e-11, 8.645717832544903e-32, 23, 1, 3,
5, 32, 16, 4, 4, MULTI_BIT},
(IntegerMultiplicationBenchmarkParams){
744, 1, 2048, 4.478453795193731e-11, 8.645717832544903e-32, 23, 1, 3,
5, 40, 20, 4, 4, MULTI_BIT},
(IntegerMultiplicationBenchmarkParams){
744, 1, 2048, 4.478453795193731e-11, 8.645717832544903e-32, 23, 1, 3,
5, 64, 32, 4, 4, MULTI_BIT},
(IntegerMultiplicationBenchmarkParams){
744, 1, 2048, 4.478453795193731e-11, 8.645717832544903e-32, 23, 1, 3,
5, 128, 64, 4, 4, MULTI_BIT},
(IntegerMultiplicationBenchmarkParams){
744, 1, 2048, 4.478453795193731e-11, 8.645717832544903e-32, 23, 1, 3,
5, 256, 128, 4, 4, MULTI_BIT},
};
int max_gpus_to_use = 8;
// Add to the list of parameters to benchmark
for(int operations_per_gpu = 1; operations_per_gpu < 10; operations_per_gpu++)
for (auto x : params) {
b->Args({x.lwe_dimension, x.glwe_dimension, x.polynomial_size,
x.lwe_modular_variance, x.glwe_modular_variance, x.pbs_base_log,
x.pbs_level, x.ksk_base_log, x.ksk_level, x.total_message_bits,
x.number_of_blocks, x.message_modulus, x.carry_modulus,
x.pbs_type, max_gpus_to_use, operations_per_gpu});
}
}
BENCHMARK_REGISTER_F(IntegerMultiplication_u64,
ConcreteCuda_IntegerMultiplication)
->Apply(IntegerMultiplicationBenchmarkGenerateParams);

View File

@@ -1,127 +0,0 @@
#include <benchmark/benchmark.h>
#include <cstdint>
#include <setup_and_teardown.h>
#include <stdio.h>
#include <stdlib.h>
typedef struct {
int input_lwe_dimension;
int output_lwe_dimension;
int ksk_base_log;
int ksk_level;
int number_of_inputs;
} KeyswitchBenchmarkParams;
class Keyswitch_u64 : public benchmark::Fixture {
protected:
int input_lwe_dimension;
int output_lwe_dimension;
double noise_variance = 2.9802322387695312e-08;
int ksk_base_log;
int ksk_level;
int message_modulus = 4;
int carry_modulus = 4;
int payload_modulus;
int number_of_inputs;
uint64_t delta;
Csprng *csprng;
cudaStream_t *stream;
int gpu_index = 0;
uint64_t *plaintexts;
uint64_t *d_ksk_array;
uint64_t *d_lwe_out_ct_array;
uint64_t *d_lwe_in_ct_array;
uint64_t *lwe_sk_in_array;
uint64_t *lwe_sk_out_array;
public:
// Test arithmetic functions
void SetUp(const ::benchmark::State &state) {
stream = cuda_create_stream(0);
// TestParams
input_lwe_dimension = state.range(0);
output_lwe_dimension = state.range(1);
ksk_base_log = state.range(2);
ksk_level = state.range(3);
number_of_inputs = state.range(4);
keyswitch_setup(stream, &csprng, &lwe_sk_in_array, &lwe_sk_out_array,
&d_ksk_array, &plaintexts, &d_lwe_in_ct_array,
&d_lwe_out_ct_array, input_lwe_dimension,
output_lwe_dimension, noise_variance, ksk_base_log,
ksk_level, message_modulus, carry_modulus, &payload_modulus,
&delta, number_of_inputs, 1, 1, gpu_index);
}
void TearDown(const ::benchmark::State &state) {
keyswitch_teardown(stream, csprng, lwe_sk_in_array, lwe_sk_out_array,
d_ksk_array, plaintexts, d_lwe_in_ct_array,
d_lwe_out_ct_array, gpu_index);
}
};
BENCHMARK_DEFINE_F(Keyswitch_u64, ConcreteCuda_Keyswitch)
(benchmark::State &st) {
for (auto _ : st) {
// Execute keyswitch
cuda_keyswitch_lwe_ciphertext_vector_64(
stream, gpu_index, (void *)d_lwe_out_ct_array,
(void *)d_lwe_in_ct_array, (void *)d_ksk_array, input_lwe_dimension,
output_lwe_dimension, ksk_base_log, ksk_level, number_of_inputs);
cuda_synchronize_stream(stream);
}
st.counters["Throughput"] =
benchmark::Counter(number_of_inputs / get_aws_cost_per_second(),
benchmark::Counter::kIsIterationInvariantRate);
}
BENCHMARK_DEFINE_F(Keyswitch_u64, ConcreteCuda_CopiesPlusKeyswitch)
(benchmark::State &st) {
uint64_t *lwe_in_ct = (uint64_t *)malloc(
number_of_inputs * (input_lwe_dimension + 1) * sizeof(uint64_t));
uint64_t *lwe_out_ct = (uint64_t *)malloc(
number_of_inputs * (output_lwe_dimension + 1) * sizeof(uint64_t));
void *v_stream = (void *)stream;
for (auto _ : st) {
cuda_memcpy_async_to_gpu(d_lwe_in_ct_array, lwe_in_ct,
number_of_inputs * (input_lwe_dimension + 1) *
sizeof(uint64_t),
stream, gpu_index);
// Execute keyswitch
cuda_keyswitch_lwe_ciphertext_vector_64(
stream, gpu_index, (void *)d_lwe_out_ct_array,
(void *)d_lwe_in_ct_array, (void *)d_ksk_array, input_lwe_dimension,
output_lwe_dimension, ksk_base_log, ksk_level, number_of_inputs);
cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct_array,
number_of_inputs * (output_lwe_dimension + 1) *
sizeof(uint64_t),
stream, gpu_index);
cuda_synchronize_stream(v_stream);
}
st.counters["Throughput"] =
benchmark::Counter(number_of_inputs / get_aws_cost_per_second(),
benchmark::Counter::kIsIterationInvariantRate);
free(lwe_in_ct);
free(lwe_out_ct);
}
static void
KeyswitchBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
// Define the parameters to benchmark
// na, nb, base_log, level, number_of_inputs
std::vector<KeyswitchBenchmarkParams> params = {
(KeyswitchBenchmarkParams){600, 1024, 3, 8, 1000},
};
// Add to the list of parameters to benchmark
for (auto x : params)
b->Args({x.input_lwe_dimension, x.output_lwe_dimension, x.ksk_base_log,
x.ksk_level, x.number_of_inputs});
}
BENCHMARK_REGISTER_F(Keyswitch_u64, ConcreteCuda_Keyswitch)
->Apply(KeyswitchBenchmarkGenerateParams);
BENCHMARK_REGISTER_F(Keyswitch_u64, ConcreteCuda_CopiesPlusKeyswitch)
->Apply(KeyswitchBenchmarkGenerateParams);

View File

@@ -1,254 +0,0 @@
#include <benchmark/benchmark.h>
#include <cstdint>
#include <setup_and_teardown.h>
#include <stdio.h>
#include <stdlib.h>
typedef struct {
int lwe_dimension;
int input_lwe_ciphertext_count;
} LinearAlgebraBenchmarkParams;
class LinearAlgebra_u64 : public benchmark::Fixture {
protected:
int lwe_dimension;
double noise_variance = 2.9802322387695312e-08;
int ksk_base_log;
int ksk_level;
int message_modulus = 4;
int carry_modulus = 4;
int num_samples;
uint64_t delta;
Csprng *csprng;
cudaStream_t *stream;
int gpu_index = 0;
uint64_t *d_lwe_in_1_ct;
uint64_t *d_lwe_in_2_ct;
uint64_t *d_lwe_out_ct;
uint64_t *plaintexts_1;
uint64_t *plaintexts_2;
uint64_t *d_plaintext_2;
uint64_t *d_cleartext;
uint64_t *lwe_in_1_ct;
uint64_t *lwe_in_2_ct;
uint64_t *lwe_out_ct;
uint64_t *lwe_sk_array;
public:
// Test arithmetic functions
void SetUp(const ::benchmark::State &state) {
stream = cuda_create_stream(0);
// TestParams
lwe_dimension = state.range(0);
num_samples = state.range(1);
int payload_modulus = message_modulus * carry_modulus;
// Value of the shift we multiply our messages by
delta = ((uint64_t)(1) << 63) / (uint64_t)(payload_modulus);
linear_algebra_setup(
stream, &csprng, &lwe_sk_array, &d_lwe_in_1_ct, &d_lwe_in_2_ct,
&d_lwe_out_ct, &lwe_in_1_ct, &lwe_in_2_ct, &lwe_out_ct, &plaintexts_1,
&plaintexts_2, &d_plaintext_2, &d_cleartext, lwe_dimension,
noise_variance, payload_modulus, delta, num_samples, 1, 1, gpu_index);
}
void TearDown(const ::benchmark::State &state) {
linear_algebra_teardown(
stream, &csprng, &lwe_sk_array, &d_lwe_in_1_ct, &d_lwe_in_2_ct,
&d_lwe_out_ct, &lwe_in_1_ct, &lwe_in_2_ct, &lwe_out_ct, &plaintexts_1,
&plaintexts_2, &d_plaintext_2, &d_cleartext, gpu_index);
}
};
BENCHMARK_DEFINE_F(LinearAlgebra_u64, ConcreteCuda_Addition)
(benchmark::State &st) {
// Execute addition
for (auto _ : st) {
cuda_add_lwe_ciphertext_vector_64(
stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_in_1_ct,
(void *)d_lwe_in_2_ct, lwe_dimension, num_samples);
cuda_synchronize_stream(stream);
}
st.counters["Throughput"] =
benchmark::Counter(num_samples / get_aws_cost_per_second(),
benchmark::Counter::kIsIterationInvariantRate);
}
BENCHMARK_DEFINE_F(LinearAlgebra_u64, ConcreteCuda_CopiesPlusAddition)
(benchmark::State &st) {
// Execute addition
for (auto _ : st) {
cuda_memcpy_async_to_gpu(d_lwe_in_1_ct, lwe_in_1_ct,
num_samples * (lwe_dimension + 1) *
sizeof(uint64_t),
stream, gpu_index);
cuda_memcpy_async_to_gpu(d_lwe_in_2_ct, lwe_in_2_ct,
num_samples * (lwe_dimension + 1) *
sizeof(uint64_t),
stream, gpu_index);
cuda_add_lwe_ciphertext_vector_64(
stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_in_1_ct,
(void *)d_lwe_in_2_ct, lwe_dimension, num_samples);
cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct,
num_samples * (lwe_dimension + 1) *
sizeof(uint64_t),
stream, gpu_index);
cuda_synchronize_stream(stream);
}
st.counters["Throughput"] =
benchmark::Counter(num_samples / get_aws_cost_per_second(),
benchmark::Counter::kIsIterationInvariantRate);
}
BENCHMARK_DEFINE_F(LinearAlgebra_u64, ConcreteCuda_PlaintextAddition)
(benchmark::State &st) {
for (auto _ : st) {
// Execute addition
cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_in_1_ct,
(void *)d_plaintext_2, lwe_dimension, num_samples);
cuda_synchronize_stream(stream);
}
st.counters["Throughput"] =
benchmark::Counter(num_samples / get_aws_cost_per_second(),
benchmark::Counter::kIsIterationInvariantRate);
}
BENCHMARK_DEFINE_F(LinearAlgebra_u64, ConcreteCuda_CopiesPlusPlaintextAddition)
(benchmark::State &st) {
for (auto _ : st) {
cuda_memcpy_async_to_gpu(d_lwe_in_1_ct, lwe_in_1_ct,
num_samples * (lwe_dimension + 1) *
sizeof(uint64_t),
stream, gpu_index);
cuda_memcpy_async_to_gpu(d_plaintext_2, plaintexts_2,
num_samples * sizeof(uint64_t), stream, gpu_index);
// Execute addition
cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_in_1_ct,
(void *)d_plaintext_2, lwe_dimension, num_samples);
cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct,
num_samples * (lwe_dimension + 1) *
sizeof(uint64_t),
stream, gpu_index);
cuda_synchronize_stream(stream);
}
st.counters["Throughput"] =
benchmark::Counter(num_samples / get_aws_cost_per_second(),
benchmark::Counter::kIsIterationInvariantRate);
}
BENCHMARK_DEFINE_F(LinearAlgebra_u64, ConcreteCuda_CleartextMultiplication)
(benchmark::State &st) {
for (auto _ : st) {
// Execute addition
cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_in_1_ct,
(void *)d_cleartext, lwe_dimension, num_samples);
cuda_synchronize_stream(stream);
}
st.counters["Throughput"] =
benchmark::Counter(num_samples / get_aws_cost_per_second(),
benchmark::Counter::kIsIterationInvariantRate);
}
BENCHMARK_DEFINE_F(LinearAlgebra_u64,
ConcreteCuda_CopiesPlusCleartextMultiplication)
(benchmark::State &st) {
for (auto _ : st) {
cuda_memcpy_async_to_gpu(d_lwe_in_1_ct, lwe_in_1_ct,
num_samples * (lwe_dimension + 1) *
sizeof(uint64_t),
stream, gpu_index);
cuda_memcpy_async_to_gpu(d_cleartext, plaintexts_2,
num_samples * sizeof(uint64_t), stream, gpu_index);
// Execute addition
cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_in_1_ct,
(void *)d_cleartext, lwe_dimension, num_samples);
cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct,
num_samples * (lwe_dimension + 1) *
sizeof(uint64_t),
stream, gpu_index);
cuda_synchronize_stream(stream);
}
st.counters["Throughput"] =
benchmark::Counter(num_samples / get_aws_cost_per_second(),
benchmark::Counter::kIsIterationInvariantRate);
}
BENCHMARK_DEFINE_F(LinearAlgebra_u64, ConcreteCuda_Negation)
(benchmark::State &st) {
for (auto _ : st) {
// Execute addition
cuda_negate_lwe_ciphertext_vector_64(
stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_in_1_ct,
lwe_dimension, num_samples);
cuda_synchronize_stream(stream);
}
st.counters["Throughput"] =
benchmark::Counter(num_samples / get_aws_cost_per_second(),
benchmark::Counter::kIsIterationInvariantRate);
}
BENCHMARK_DEFINE_F(LinearAlgebra_u64, ConcreteCuda_CopiesPlusNegation)
(benchmark::State &st) {
for (auto _ : st) {
cuda_memcpy_async_to_gpu(d_lwe_in_1_ct, lwe_in_1_ct,
num_samples * (lwe_dimension + 1) *
sizeof(uint64_t),
stream, gpu_index);
// Execute addition
cuda_negate_lwe_ciphertext_vector_64(
stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_in_1_ct,
lwe_dimension, num_samples);
cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct,
num_samples * (lwe_dimension + 1) *
sizeof(uint64_t),
stream, gpu_index);
cuda_synchronize_stream(stream);
}
st.counters["Throughput"] =
benchmark::Counter(num_samples / get_aws_cost_per_second(),
benchmark::Counter::kIsIterationInvariantRate);
}
static void
LinearAlgebraBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
// Define the parameters to benchmark
// n, input_lwe_ciphertext_count
std::vector<LinearAlgebraBenchmarkParams> params = {
(LinearAlgebraBenchmarkParams){600, 100},
};
// Add to the list of parameters to benchmark
for (auto x : params)
b->Args({x.lwe_dimension, x.input_lwe_ciphertext_count});
}
BENCHMARK_REGISTER_F(LinearAlgebra_u64, ConcreteCuda_Addition)
->Apply(LinearAlgebraBenchmarkGenerateParams);
BENCHMARK_REGISTER_F(LinearAlgebra_u64, ConcreteCuda_CopiesPlusAddition)
->Apply(LinearAlgebraBenchmarkGenerateParams);
BENCHMARK_REGISTER_F(LinearAlgebra_u64, ConcreteCuda_PlaintextAddition)
->Apply(LinearAlgebraBenchmarkGenerateParams);
BENCHMARK_REGISTER_F(LinearAlgebra_u64,
ConcreteCuda_CopiesPlusPlaintextAddition)
->Apply(LinearAlgebraBenchmarkGenerateParams);
BENCHMARK_REGISTER_F(LinearAlgebra_u64, ConcreteCuda_CleartextMultiplication)
->Apply(LinearAlgebraBenchmarkGenerateParams);
BENCHMARK_REGISTER_F(LinearAlgebra_u64,
ConcreteCuda_CopiesPlusCleartextMultiplication)
->Apply(LinearAlgebraBenchmarkGenerateParams);
BENCHMARK_REGISTER_F(LinearAlgebra_u64, ConcreteCuda_Negation)
->Apply(LinearAlgebraBenchmarkGenerateParams);
BENCHMARK_REGISTER_F(LinearAlgebra_u64, ConcreteCuda_CopiesPlusNegation)
->Apply(LinearAlgebraBenchmarkGenerateParams);

View File

@@ -1,183 +0,0 @@
#include <benchmark/benchmark.h>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <omp.h>
#include <setup_and_teardown.h>
typedef struct {
int lwe_dimension;
int glwe_dimension;
int polynomial_size;
int pbs_base_log;
int pbs_level;
int input_lwe_ciphertext_count;
int grouping_factor;
int chunk_size;
} MultiBitPBSBenchmarkParams;
class MultiBitBootstrap_u64 : public benchmark::Fixture {
protected:
int lwe_dimension;
int glwe_dimension;
int polynomial_size;
int input_lwe_ciphertext_count;
int input_lwe_ciphertext_count_per_gpu;
int grouping_factor;
double lwe_modular_variance = 0.000007069849454709433;
double glwe_modular_variance = 0.00000000000000029403601535432533;
int pbs_base_log;
int pbs_level;
int message_modulus = 4;
int carry_modulus = 4;
int payload_modulus;
uint64_t delta;
std::vector<uint64_t *> d_bsk_array;
std::vector<uint64_t *> d_lut_pbs_identity;
std::vector<uint64_t *> d_lut_pbs_indexes;
std::vector<uint64_t *> d_lwe_ct_in_array;
std::vector<uint64_t *> d_lwe_ct_out_array;
uint64_t *lwe_sk_in_array;
uint64_t *lwe_sk_out_array;
uint64_t *plaintexts;
Csprng *csprng;
std::vector<int8_t *> pbs_buffer;
int chunk_size;
int num_gpus;
std::vector<cudaStream_t *> streams;
public:
void SetUp(const ::benchmark::State &state) {
lwe_dimension = state.range(0);
glwe_dimension = state.range(1);
polynomial_size = state.range(2);
pbs_base_log = state.range(3);
pbs_level = state.range(4);
input_lwe_ciphertext_count = state.range(5);
grouping_factor = state.range(6);
chunk_size = state.range(7);
num_gpus = std::min(cuda_get_number_of_gpus(), input_lwe_ciphertext_count);
assert(input_lwe_ciphertext_count % num_gpus == 0);
input_lwe_ciphertext_count_per_gpu =
std::max(1, input_lwe_ciphertext_count / num_gpus);
// Create streams
for (int device = 0; device < num_gpus; device++) {
cudaSetDevice(device);
cudaStream_t *stream = cuda_create_stream(device);
streams.push_back(stream);
uint64_t *d_bsk_array_per_gpu;
uint64_t *d_lut_pbs_identity_per_gpu;
uint64_t *d_lut_pbs_indexes_per_gpu;
uint64_t *d_lwe_ct_in_array_per_gpu;
uint64_t *d_lwe_ct_out_array_per_gpu;
int8_t *pbs_buffer_per_gpu;
bootstrap_multibit_setup(
stream, &csprng, &lwe_sk_in_array, &lwe_sk_out_array,
&d_bsk_array_per_gpu, &plaintexts, &d_lut_pbs_identity_per_gpu,
&d_lut_pbs_indexes_per_gpu, &d_lwe_ct_in_array_per_gpu,
&d_lwe_ct_out_array_per_gpu, &pbs_buffer_per_gpu, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor,
lwe_modular_variance, glwe_modular_variance, pbs_base_log, pbs_level,
message_modulus, carry_modulus, &payload_modulus, &delta,
input_lwe_ciphertext_count_per_gpu, 1, 1, device, chunk_size);
d_bsk_array.push_back(d_bsk_array_per_gpu);
d_lut_pbs_identity.push_back(d_lut_pbs_identity_per_gpu);
d_lut_pbs_indexes.push_back(d_lut_pbs_indexes_per_gpu);
d_lwe_ct_in_array.push_back(d_lwe_ct_in_array_per_gpu);
d_lwe_ct_out_array.push_back(d_lwe_ct_out_array_per_gpu);
pbs_buffer.push_back(pbs_buffer_per_gpu);
}
}
void TearDown(const ::benchmark::State &state) {
concrete_cpu_destroy_concrete_csprng(csprng);
free(csprng);
free(lwe_sk_in_array);
free(lwe_sk_out_array);
free(plaintexts);
for (int device = 0; device < num_gpus; device++) {
cudaSetDevice(device);
cleanup_cuda_multi_bit_pbs(streams[device], device, &pbs_buffer[device]);
cuda_drop_async(d_bsk_array[device], streams[device], device);
cuda_drop_async(d_lut_pbs_identity[device], streams[device], device);
cuda_drop_async(d_lut_pbs_indexes[device], streams[device], device);
cuda_drop_async(d_lwe_ct_in_array[device], streams[device], device);
cuda_drop_async(d_lwe_ct_out_array[device], streams[device], device);
cuda_synchronize_stream(streams[device]);
cuda_destroy_stream(streams[device], device);
}
d_bsk_array.clear();
d_lut_pbs_identity.clear();
d_lut_pbs_indexes.clear();
d_lwe_ct_in_array.clear();
d_lwe_ct_out_array.clear();
pbs_buffer.clear();
streams.clear();
cudaDeviceReset();
}
};
BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, ConcreteCuda_MultiBit)
(benchmark::State &st) {
for (auto _ : st) {
#pragma omp parallel for num_threads(num_gpus)
for (int device = 0; device < num_gpus; device++) {
cudaSetDevice(device);
// Execute PBS
cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
streams[device], device, (void *)d_lwe_ct_out_array[device],
(void *)d_lut_pbs_identity[device], (void *)d_lut_pbs_indexes[device],
(void *)d_lwe_ct_in_array[device], (void *)d_bsk_array[device],
pbs_buffer[device], lwe_dimension, glwe_dimension, polynomial_size,
grouping_factor, pbs_base_log, pbs_level,
input_lwe_ciphertext_count_per_gpu, 1, 0,
cuda_get_max_shared_memory(device), chunk_size);
}
for (int device = 0; device < num_gpus; device++) {
cudaSetDevice(device);
cuda_synchronize_stream(streams[device]);
}
}
st.counters["Throughput"] =
benchmark::Counter(input_lwe_ciphertext_count / get_aws_cost_per_second(),
benchmark::Counter::kIsIterationInvariantRate);
}
static void
MultiBitPBSBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
// Define the parameters to benchmark
// lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log, pbs_level,
// input_lwe_ciphertext_count
std::vector<MultiBitPBSBenchmarkParams> params = {
// 4_bits_multi_bit_group_2
(MultiBitPBSBenchmarkParams){818, 1, 2048, 22, 1, 1, 2},
// 4_bits_multi_bit_group_3
(MultiBitPBSBenchmarkParams){888, 1, 2048, 21, 1, 1, 3},
(MultiBitPBSBenchmarkParams){742, 1, 2048, 23, 1, 1, 2},
(MultiBitPBSBenchmarkParams){744, 1, 2048, 23, 1, 1, 3},
};
// Add to the list of parameters to benchmark
for (auto x : params) {
for (int input_lwe_ciphertext_count = 1;
input_lwe_ciphertext_count <= 16384; input_lwe_ciphertext_count *= 2)
b->Args({x.lwe_dimension, x.glwe_dimension, x.polynomial_size,
x.pbs_base_log, x.pbs_level, input_lwe_ciphertext_count,
x.grouping_factor, 0});
}
}
BENCHMARK_REGISTER_F(MultiBitBootstrap_u64, ConcreteCuda_MultiBit)
->Apply(MultiBitPBSBenchmarkGenerateParams);

View File

@@ -1,184 +0,0 @@
#include <benchmark/benchmark.h>
#include <cstdint>
#include <setup_and_teardown.h>
#include <stdio.h>
#include <stdlib.h>
const unsigned MAX_TAU = 4;
typedef struct {
int lwe_dimension;
int glwe_dimension;
int polynomial_size;
int pbs_base_log;
int pbs_level;
int ks_base_log;
int ks_level;
int pksk_base_log;
int pksk_level;
int cbs_base_log;
int cbs_level;
int tau;
int p;
} WopPBSBenchmarkParams;
class WopPBS_u64 : public benchmark::Fixture {
protected:
int lwe_dimension;
int glwe_dimension;
int polynomial_size;
double lwe_modular_variance = 7.52316384526264e-37;
double glwe_modular_variance = 7.52316384526264e-37;
int pbs_base_log;
int pbs_level;
int ks_base_log;
int ks_level;
int pksk_base_log;
int pksk_level;
int cbs_base_log;
int cbs_level;
int tau;
uint32_t p_array[MAX_TAU];
int input_lwe_dimension;
uint64_t delta_array[MAX_TAU];
int cbs_delta_log;
uint32_t delta_log_array[MAX_TAU];
int delta_log_lut;
Csprng *csprng;
cudaStream_t *stream;
int gpu_index = 0;
uint64_t *plaintexts;
double *d_fourier_bsk;
uint64_t *lwe_sk_in;
uint64_t *lwe_sk_out;
uint64_t *d_ksk;
uint64_t *d_pksk;
uint64_t *d_lwe_ct_in_array;
uint64_t *d_lwe_ct_out_array;
uint64_t *d_lut_vector;
int8_t *wop_pbs_buffer;
uint64_t *lwe_ct_in_array;
uint64_t *lwe_ct_out_array;
public:
// Test arithmetic functions
void SetUp(const ::benchmark::State &state) {
stream = cuda_create_stream(0);
// TestParams
lwe_dimension = state.range(0);
glwe_dimension = state.range(1);
polynomial_size = state.range(2);
pbs_base_log = state.range(3);
pbs_level = state.range(4);
ks_base_log = state.range(5);
ks_level = state.range(6);
pksk_base_log = state.range(7);
pksk_level = state.range(8);
cbs_base_log = state.range(9);
cbs_level = state.range(10);
tau = state.range(11);
p_array[0] = state.range(12);
wop_pbs_setup(stream, &csprng, &lwe_sk_in, &lwe_sk_out, &d_ksk,
&d_fourier_bsk, &d_pksk, &plaintexts, &d_lwe_ct_in_array,
&d_lwe_ct_out_array, &d_lut_vector, &wop_pbs_buffer,
lwe_dimension, glwe_dimension, polynomial_size,
lwe_modular_variance, glwe_modular_variance, ks_base_log,
ks_level, pksk_base_log, pksk_level, pbs_base_log, pbs_level,
cbs_level, p_array, delta_log_array, &cbs_delta_log,
delta_array, tau, 1, 1, gpu_index);
// We keep the following for the benchmarks with copies
lwe_ct_in_array = (uint64_t *)malloc(
(glwe_dimension * polynomial_size + 1) * tau * sizeof(uint64_t));
lwe_ct_out_array = (uint64_t *)malloc(
(glwe_dimension * polynomial_size + 1) * tau * sizeof(uint64_t));
for (int i = 0; i < tau; i++) {
uint64_t plaintext = plaintexts[i];
uint64_t *lwe_ct_in =
lwe_ct_in_array +
(ptrdiff_t)(i * (glwe_dimension * polynomial_size + 1));
concrete_cpu_encrypt_lwe_ciphertext_u64(
lwe_sk_in, lwe_ct_in, plaintext, glwe_dimension * polynomial_size,
lwe_modular_variance, csprng, &CONCRETE_CSPRNG_VTABLE);
}
}
void TearDown(const ::benchmark::State &state) {
wop_pbs_teardown(stream, csprng, lwe_sk_in, lwe_sk_out, d_ksk,
d_fourier_bsk, d_pksk, plaintexts, d_lwe_ct_in_array,
d_lut_vector, d_lwe_ct_out_array, wop_pbs_buffer,
gpu_index);
free(lwe_ct_in_array);
free(lwe_ct_out_array);
}
};
BENCHMARK_DEFINE_F(WopPBS_u64, ConcreteCuda_WopPBS)(benchmark::State &st) {
for (auto _ : st) {
// Execute wop pbs
cuda_wop_pbs_64(stream, gpu_index, (void *)d_lwe_ct_out_array,
(void *)d_lwe_ct_in_array, (void *)d_lut_vector,
(void *)d_fourier_bsk, (void *)d_ksk, (void *)d_pksk,
wop_pbs_buffer, cbs_delta_log, glwe_dimension,
lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
ks_base_log, ks_level, pksk_base_log, pksk_level,
cbs_base_log, cbs_level, p_array, delta_log_array, tau,
cuda_get_max_shared_memory(gpu_index));
cuda_synchronize_stream(stream);
}
st.counters["Throughput"] =
benchmark::Counter(tau * p_array[0] / get_aws_cost_per_second(),
benchmark::Counter::kIsIterationInvariantRate);
}
BENCHMARK_DEFINE_F(WopPBS_u64, ConcreteCuda_CopiesPlusWopPBS)
(benchmark::State &st) {
for (auto _ : st) {
cuda_memcpy_async_to_gpu(d_lwe_ct_in_array, lwe_ct_in_array,
(input_lwe_dimension + 1) * tau * sizeof(uint64_t),
stream, gpu_index);
// Execute wop pbs
cuda_wop_pbs_64(stream, gpu_index, (void *)d_lwe_ct_out_array,
(void *)d_lwe_ct_in_array, (void *)d_lut_vector,
(void *)d_fourier_bsk, (void *)d_ksk, (void *)d_pksk,
wop_pbs_buffer, cbs_delta_log, glwe_dimension,
lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
ks_base_log, ks_level, pksk_base_log, pksk_level,
cbs_base_log, cbs_level, p_array, delta_log_array, tau,
cuda_get_max_shared_memory(gpu_index));
cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array,
(input_lwe_dimension + 1) * tau * sizeof(uint64_t),
stream, gpu_index);
cuda_synchronize_stream(stream);
}
st.counters["Throughput"] =
benchmark::Counter(tau * p_array[0] / get_aws_cost_per_second(),
benchmark::Counter::kIsIterationInvariantRate);
}
static void WopPBSBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
// Define the parameters to benchmark
// n, k, N, lwe_variance, glwe_variance, pbs_base_log, pbs_level,
// ks_base_log, ks_level, tau, p
std::vector<WopPBSBenchmarkParams> params = {
(WopPBSBenchmarkParams){481, 2, 512, 4, 9, 1, 9, 4, 9, 6, 4, 1, 10},
//// INTEGER_PARAM_MESSAGE_4_CARRY_4_16_BITS
//(WopPBSBenchmarkParams){481, 1, 2048, 9, 4, 1, 9, 9, 4, 6, 4, 1, 8},
//// INTEGER_PARAM_MESSAGE_2_CARRY_2_16_BITS
//(WopPBSBenchmarkParams){493, 1, 2048, 16, 2, 2, 5, 16, 2, 6, 4, 1, 4},
};
// Add to the list of parameters to benchmark
for (auto x : params)
b->Args({x.lwe_dimension, x.glwe_dimension, x.polynomial_size,
x.pbs_base_log, x.pbs_level, x.ks_base_log, x.ks_level,
x.pksk_base_log, x.pksk_level, x.cbs_base_log, x.cbs_level, x.tau,
x.p});
}
BENCHMARK_REGISTER_F(WopPBS_u64, ConcreteCuda_WopPBS)
->Apply(WopPBSBenchmarkGenerateParams);
BENCHMARK_REGISTER_F(WopPBS_u64, ConcreteCuda_CopiesPlusWopPBS)
->Apply(WopPBSBenchmarkGenerateParams);

View File

@@ -1,3 +0,0 @@
#include <benchmark/benchmark.h>
BENCHMARK_MAIN();

View File

@@ -1,228 +0,0 @@
#ifndef SETUP_AND_TEARDOWN_H
#define SETUP_AND_TEARDOWN_H
#include <bit_extraction.h>
#include <bootstrap.h>
#include <bootstrap_multibit.h>
#include <circuit_bootstrap.h>
#include <concrete-cpu.h>
#include <device.h>
#include <keyswitch.h>
#include <linear_algebra.h>
#include <utils.h>
#include <vertical_packing.h>
template <typename Torus> struct int_mul_memory {
Torus *vector_result_sb;
Torus *block_mul_res;
Torus *small_lwe_vector;
Torus *lwe_pbs_out_array;
Torus *test_vector_array;
Torus *message_acc;
Torus *carry_acc;
Torus *test_vector_indexes;
Torus *tvi_message;
Torus *tvi_carry;
int8_t *pbs_buffer;
int p2p_gpu_count = 0;
cudaStream_t *streams[32];
int8_t *pbs_buffer_multi_gpu[32];
Torus *pbs_input_multi_gpu[32];
Torus *pbs_output_multi_gpu[32];
Torus *test_vector_multi_gpu[32];
Torus *tvi_lsb_multi_gpu[32];
Torus *tvi_msb_multi_gpu[32];
Torus *tvi_message_multi_gpu[32];
Torus *tvi_carry_multi_gpu[32];
Torus *bsk_multi_gpu[32];
Torus *ksk_multi_gpu[32];
Torus *device_to_device_buffer[8];
bool IsAppBuiltAs64() { return sizeof(void *) == 8; }
};
void bootstrap_classical_setup(
cudaStream_t *stream, Csprng **csprng, uint64_t **lwe_sk_in_array,
uint64_t **lwe_sk_out_array, double **d_fourier_bsk_array,
uint64_t **plaintexts, uint64_t **d_lut_pbs_identity,
uint64_t **d_lut_pbs_indexes, uint64_t **d_lwe_ct_in_array,
uint64_t **d_lwe_ct_out_array, int lwe_dimension, int glwe_dimension,
int polynomial_size, double lwe_modular_variance,
double glwe_modular_variance, int pbs_base_log, int pbs_level,
int message_modulus, int carry_modulus, int *payload_modulus,
uint64_t *delta, int number_of_inputs, int repetitions, int samples,
int gpu_index);
void bootstrap_classical_teardown(
cudaStream_t *stream, Csprng *csprng, uint64_t *lwe_sk_in_array,
uint64_t *lwe_sk_out_array, double *d_fourier_bsk_array,
uint64_t *plaintexts, uint64_t *d_lut_pbs_identity,
uint64_t *d_lut_pbs_indexes, uint64_t *d_lwe_ct_in_array,
uint64_t *d_lwe_ct_out_array, int gpu_index);
void bootstrap_multibit_setup(
cudaStream_t *stream, Csprng **csprng, uint64_t **lwe_sk_in_array,
uint64_t **lwe_sk_out_array, uint64_t **d_bsk_array, uint64_t **plaintexts,
uint64_t **d_lut_pbs_identity, uint64_t **d_lut_pbs_indexes,
uint64_t **d_lwe_ct_in_array, uint64_t **d_lwe_ct_out_array,
int8_t **pbs_buffer, int lwe_dimension, int glwe_dimension,
int polynomial_size, int grouping_factor, double lwe_modular_variance,
double glwe_modular_variance, int pbs_base_log, int pbs_level,
int message_modulus, int carry_modulus, int *payload_modulus,
uint64_t *delta, int number_of_inputs, int repetitions, int samples,
int gpu_index, int chunk_size = 0);
void bootstrap_multibit_teardown(
cudaStream_t *stream, Csprng *csprng, uint64_t *lwe_sk_in_array,
uint64_t *lwe_sk_out_array, uint64_t *d_bsk_array, uint64_t *plaintexts,
uint64_t *d_lut_pbs_identity, uint64_t *d_lut_pbs_indexes,
uint64_t *d_lwe_ct_in_array, uint64_t *d_lwe_ct_out_array,
int8_t **pbs_buffer, int gpu_index);
void keyswitch_setup(cudaStream_t *stream, Csprng **csprng,
uint64_t **lwe_sk_in_array, uint64_t **lwe_sk_out_array,
uint64_t **d_ksk_array, uint64_t **plaintexts,
uint64_t **d_lwe_ct_in_array,
uint64_t **d_lwe_ct_out_array, int input_lwe_dimension,
int output_lwe_dimension, double lwe_modular_variance,
int ksk_base_log, int ksk_level, int message_modulus,
int carry_modulus, int *payload_modulus, uint64_t *delta,
int number_of_inputs, int repetitions, int samples,
int gpu_index);
void keyswitch_teardown(cudaStream_t *stream, Csprng *csprng,
uint64_t *lwe_sk_in_array, uint64_t *lwe_sk_out_array,
uint64_t *d_ksk_array, uint64_t *plaintexts,
uint64_t *d_lwe_ct_in_array,
uint64_t *d_lwe_ct_out_array, int gpu_index);
void bit_extraction_setup(
cudaStream_t **stream, Csprng **csprng, uint64_t **lwe_sk_in_array,
uint64_t **lwe_sk_out_array, double **d_fourier_bsk_array,
uint64_t **d_ksk_array, uint64_t **plaintexts, uint64_t **d_lwe_ct_in_array,
uint64_t **d_lwe_ct_out_array, int8_t **bit_extract_buffer_array,
int lwe_dimension, int glwe_dimension, int polynomial_size,
double lwe_modular_variance, double glwe_modular_variance, int ks_base_log,
int ks_level, int pbs_base_log, int pbs_level,
uint32_t *number_of_bits_of_message_including_padding_array,
uint32_t *number_of_bits_to_extract_array, uint32_t *delta_log_array,
uint64_t *delta_array, int crt_decomposition_size, int repetitions,
int samples, int gpu_index);
void bit_extraction_teardown(cudaStream_t **stream, Csprng *csprng,
uint64_t *lwe_sk_in_array,
uint64_t *lwe_sk_out_array,
double *d_fourier_bsk_array, uint64_t *d_ksk_array,
uint64_t *plaintexts, uint64_t *d_lwe_ct_in_array,
uint64_t *d_lwe_ct_out_array,
int8_t **bit_extract_buffer_array, int samples,
int gpu_index);
void circuit_bootstrap_setup(
cudaStream_t *stream, Csprng **csprng, uint64_t **lwe_sk_in_array,
uint64_t **lwe_sk_out_array, double **d_fourier_bsk_array,
uint64_t **d_pksk_array, uint64_t **plaintexts,
uint64_t **d_lwe_ct_in_array, uint64_t **d_ggsw_ct_out_array,
uint64_t **d_lut_vector_indexes, int8_t **cbs_buffer, int lwe_dimension,
int glwe_dimension, int polynomial_size, double lwe_modular_variance,
double glwe_modular_variance, int pksk_base_log, int pksk_level,
int pbs_base_log, int pbs_level, int cbs_level,
int number_of_bits_of_message_including_padding, int ggsw_size,
int *delta_log, uint64_t *delta, int number_of_inputs, int repetitions,
int samples, int gpu_index);
void circuit_bootstrap_teardown(
cudaStream_t *stream, Csprng *csprng, uint64_t *lwe_sk_in_array,
uint64_t *lwe_sk_out_array, double *d_fourier_bsk_array,
uint64_t *d_pksk_array, uint64_t *plaintexts, uint64_t *d_lwe_ct_in_array,
uint64_t *d_lut_vector_indexes, uint64_t *d_ggsw_ct_out_array,
int8_t *cbs_buffer, int gpu_index);
void cmux_tree_setup(cudaStream_t *stream, Csprng **csprng, uint64_t **glwe_sk,
uint64_t **d_lut_identity, uint64_t **plaintexts,
uint64_t **d_ggsw_bit_array, int8_t **cmux_tree_buffer,
uint64_t **d_glwe_out, int glwe_dimension,
int polynomial_size, int base_log, int level_count,
double glwe_modular_variance, int r_lut, int tau,
uint32_t *delta_log, int repetitions, int samples,
int gpu_index);
void cmux_tree_teardown(cudaStream_t *stream, Csprng **csprng,
uint64_t **glwe_sk, uint64_t **d_lut_identity,
uint64_t **plaintexts, uint64_t **d_ggsw_bit_array,
int8_t **cmux_tree_buffer, uint64_t **d_glwe_out,
int gpu_index);
void wop_pbs_setup(cudaStream_t *stream, Csprng **csprng,
uint64_t **lwe_sk_in_array, uint64_t **lwe_sk_out_array,
uint64_t **d_ksk_array, double **d_fourier_bsk_array,
uint64_t **d_pksk_array, uint64_t **plaintexts,
uint64_t **d_lwe_ct_in_array, uint64_t **d_lwe_ct_out_array,
uint64_t **d_lut_vector, int8_t **wop_pbs_buffer,
int lwe_dimension, int glwe_dimension, int polynomial_size,
double lwe_modular_variance, double glwe_modular_variance,
int ks_base_log, int ks_level, int pksk_base_log,
int pksk_level, int pbs_base_log, int pbs_level,
int cbs_level, uint32_t *p_array, uint32_t *delta_log_array,
int *cbs_delta_log, uint64_t *delta_array, int tau,
int repetitions, int samples, int gpu_index);
void wop_pbs_teardown(cudaStream_t *stream, Csprng *csprng,
uint64_t *lwe_sk_in_array, uint64_t *lwe_sk_out_array,
uint64_t *d_ksk_array, double *d_fourier_bsk_array,
uint64_t *d_pksk_array, uint64_t *plaintexts,
uint64_t *d_lwe_ct_in_array, uint64_t *d_lut_vector,
uint64_t *d_lwe_ct_out_array, int8_t *wop_pbs_buffer,
int gpu_index);
void linear_algebra_setup(cudaStream_t *stream, Csprng **csprng,
uint64_t **lwe_sk_array, uint64_t **d_lwe_in_1_ct,
uint64_t **d_lwe_in_2_ct, uint64_t **d_lwe_out_ct,
uint64_t **lwe_in_1_ct, uint64_t **lwe_in_2_ct,
uint64_t **lwe_out_ct, uint64_t **plaintexts_1,
uint64_t **plaintexts_2, uint64_t **d_plaintext_2,
uint64_t **d_plaintexts_2_mul, int lwe_dimension,
double noise_variance, int payload_modulus,
uint64_t delta, int number_of_inputs, int repetitions,
int samples, int gpu_index);
void linear_algebra_teardown(cudaStream_t *stream, Csprng **csprng,
uint64_t **lwe_sk_array, uint64_t **d_lwe_in_1_ct,
uint64_t **d_lwe_in_2_ct, uint64_t **d_lwe_out_ct,
uint64_t **lwe_in_1_ct, uint64_t **lwe_in_2_ct,
uint64_t **lwe_out_ct, uint64_t **plaintexts_1,
uint64_t **plaintexts_2, uint64_t **d_plaintext_2,
uint64_t **d_plaintexts_2_mul, int gpu_index);
void fft_setup(cudaStream_t *stream, double **poly1, double **poly2,
double2 **h_cpoly1, double2 **h_cpoly2, double2 **d_cpoly1,
double2 **d_cpoly2, size_t polynomial_size, int samples,
int gpu_index);
void fft_teardown(cudaStream_t *stream, double *poly1, double *poly2,
double2 *h_cpoly1, double2 *h_cpoly2, double2 *d_cpoly1,
double2 *d_cpoly2, int gpu_index);
void integer_multiplication_setup(
cudaStream_t *stream, Csprng **csprng, uint64_t **lwe_sk_in_array,
uint64_t **lwe_sk_out_array, void **d_bsk_array, uint64_t **d_ksk_array,
uint64_t **plaintexts_1, uint64_t **plaintexts_2,
uint64_t **d_lwe_ct_in_array_1, uint64_t **d_lwe_ct_in_array_2,
uint64_t **d_lwe_ct_out_array, int_mul_memory<uint64_t> *mem_ptr,
int lwe_dimension, int glwe_dimension, int polynomial_size,
double lwe_modular_variance, double glwe_modular_variance, int pbs_base_log,
int pbs_level, int ksk_base_log, int ksk_level, int total_message_bits,
int number_of_blocks, int message_modulus, int carry_modulus,
uint64_t *delta, int repetitions, int samples, PBS_TYPE pbs_type, int gpu_index);
void integer_multiplication_teardown(
cudaStream_t *stream, Csprng *csprng, uint64_t *lwe_sk_in_array,
uint64_t *lwe_sk_out_array, void *d_bsk_array, uint64_t *d_ksk_array,
uint64_t *plaintexts_1, uint64_t *plaintexts_2,
uint64_t *d_lwe_ct_in_array_1, uint64_t *d_lwe_ct_in_array_2,
uint64_t *d_lwe_ct_out_array, int_mul_memory<uint64_t> *mem_ptr);
#endif // SETUP_AND_TEARDOWN_H

View File

@@ -1,85 +0,0 @@
#ifndef UTILS_H
#define UTILS_H
#include <concrete-cpu.h>
#include <device.h>
#include <functional>
#include <tfhe.h>
// This is the price per hour of a p3.2xlarge instance on Amazon AWS
#define AWS_VM_COST_PER_HOUR (double)3.06
double get_aws_cost_per_second();
uint64_t *generate_plaintexts(uint64_t payload_modulus, uint64_t delta,
int number_of_inputs, const unsigned repetitions,
const unsigned samples);
uint64_t *generate_plaintexts_bit_extract(uint64_t *payload_modulus,
uint64_t *delta,
int crt_decomposition_size,
const unsigned repetitions,
const unsigned samples);
uint64_t *generate_identity_lut_pbs(int polynomial_size, int glwe_dimension,
int message_modulus, int carry_modulus,
std::function<uint64_t(uint64_t)> func);
uint64_t *generate_identity_lut_cmux_tree(int polynomial_size, int num_lut,
int tau, int delta_log);
void generate_lwe_secret_keys(uint64_t **lwe_sk_array, int lwe_dimension,
Csprng *csprng, const unsigned repetitions);
void generate_glwe_secret_keys(uint64_t **glwe_sk_array, int glwe_dimension,
int polynomial_size, Csprng *csprng,
const unsigned repetitions);
void generate_lwe_bootstrap_keys(
cudaStream_t *stream, int gpu_index, double **d_fourier_bsk_array,
uint64_t *lwe_sk_in_array, uint64_t *lwe_sk_out_array, int lwe_dimension,
int glwe_dimension, int polynomial_size, int pbs_level, int pbs_base_log,
Csprng *csprng, double variance, const unsigned repetitions);
void generate_lwe_multi_bit_pbs_keys(
cudaStream_t *stream, int gpu_index, uint64_t **d_bsk_array,
uint64_t *lwe_sk_in_array, uint64_t *lwe_sk_out_array, int lwe_dimension,
int glwe_dimension, int polynomial_size, int pbs_level, int pbs_base_log,
int grouping_factor, Csprng *csprng, double variance,
const unsigned repetitions);
void generate_lwe_keyswitch_keys(cudaStream_t *stream, int gpu_index,
uint64_t **d_ksk_array,
uint64_t *lwe_sk_in_array,
uint64_t *lwe_sk_out_array,
int input_lwe_dimension,
int output_lwe_dimension, int ksk_level,
int ksk_base_log, Csprng *csprng,
double variance, const unsigned repetitions);
void generate_lwe_private_functional_keyswitch_key_lists(
cudaStream_t *stream, int gpu_index, uint64_t **d_pksk_array,
uint64_t *lwe_sk_in_array, uint64_t *lwe_sk_out_array,
int input_lwe_dimension, int output_glwe_dimension,
int output_polynomial_size, int pksk_level, int pksk_base_log,
Csprng *csprng, double variance, const unsigned repetitions);
uint64_t closest_representable(uint64_t input, int level_count, int base_log);
uint64_t *bit_decompose_value(uint64_t value, int r);
uint64_t number_of_inputs_on_gpu(uint64_t gpu_index,
uint64_t lwe_ciphertext_count,
uint64_t number_of_gpus);
void encrypt_integer_u64_blocks(uint64_t **ct, uint64_t *lwe_sk,
uint64_t *message_blocks, int lwe_dimension,
int num_blocks, Csprng *csprng,
double variance);
void decrypt_integer_u64_blocks(uint64_t *ct, uint64_t *lwe_sk,
uint64_t **message_blocks, int lwe_dimension,
int num_blocks, uint64_t delta,
int message_modulus);
#endif

View File

@@ -1,87 +0,0 @@
find_package(CUDA REQUIRED)
find_package(CUDAToolkit REQUIRED)
include(FetchContent)
FetchContent_Declare(googletest
URL https://github.com/google/googletest/archive/03597a01ee50ed33e9dfd640b249b4be3799d395.zip)
# For Windows: Prevent overriding the parent project's compiler/linker settings
set(gtest_force_shared_crt
ON
CACHE BOOL "" FORCE)
FetchContent_MakeAvailable(googletest)
set(CONCRETE_CPU_BINARY_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../../../concrete-cpu/implementation/target/release")
set(CONCRETE_CPU_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../../../concrete-cpu/implementation")
set(CONCRETE_CUDA_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../")
# Enable ExternalProject CMake module
include(ExternalProject)
if(NOT TARGET concrete_cpu)
ExternalProject_Add(
concrete_cpu
SOURCE_DIR ${CONCRETE_CPU_SOURCE_DIR}
DOWNLOAD_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_COMMAND cargo +nightly build --release --features=nightly
COMMAND cargo +nightly build --release --features=nightly
BINARY_DIR ${CONCRETE_CPU_BINARY_DIR}
BUILD_ALWAYS true
INSTALL_COMMAND ""
LOG_BUILD ON)
endif()
set(TFHE_RS_SOURCE_DIR "${CMAKE_BINARY_DIR}/tfhe-rs")
set(TFHE_RS_BINARY_DIR "${TFHE_RS_SOURCE_DIR}/target/release")
if(NOT TARGET tfhe-rs)
ExternalProject_Add(
tfhe-rs
GIT_REPOSITORY https://github.com/zama-ai/tfhe-rs.git
GIT_TAG main
SOURCE_DIR ${TFHE_RS_SOURCE_DIR}
BUILD_IN_SOURCE 1
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_COMMAND make build_c_api
INSTALL_COMMAND ""
LOG_BUILD ON)
endif()
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../include)
include_directories(${CONCRETE_CPU_SOURCE_DIR}/include)
include_directories(${CONCRETE_CUDA_SOURCE_DIR}/include)
include_directories(${TFHE_RS_BINARY_DIR})
include_directories("${CUDA_INCLUDE_DIRS}" "${CMAKE_CURRENT_SOURCE_DIR}")
add_library(concrete_cpu_lib STATIC IMPORTED)
add_dependencies(concrete_cpu_lib concrete_cpu)
set_target_properties(concrete_cpu_lib PROPERTIES IMPORTED_LOCATION ${CONCRETE_CPU_BINARY_DIR}/libconcrete_cpu.a)
add_library(tfhe_rs_lib STATIC IMPORTED)
add_dependencies(tfhe_rs_lib tfhe-rs)
set_target_properties(tfhe_rs_lib PROPERTIES IMPORTED_LOCATION ${TFHE_RS_BINARY_DIR}/libtfhe.a)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,--no-as-needed,--allow-multiple-definition -ldl")
set(BINARY test_concrete_cuda)
file(
GLOB_RECURSE TEST_SOURCES
LIST_DIRECTORIES false
test_*.cpp)
add_executable(${BINARY} ${TEST_SOURCES} ../utils.cpp ../setup_and_teardown.cpp)
add_test(NAME ${BINARY} COMMAND ${BINARY})
set_target_properties(
test_concrete_cuda
PROPERTIES CUDA_SEPARABLE_COMPILATION ON
CUDA_RESOLVE_DEVICE_SYMBOLS ON
CUDA_ARCHITECTURES native)
target_link_libraries(test_concrete_cuda PUBLIC GTest::gtest_main concrete_cpu_lib tfhe_rs_lib concrete_cuda cudart)
include(GoogleTest)
gtest_discover_tests(test_concrete_cuda)

View File

@@ -1,60 +0,0 @@
# test_concrete_cuda
This test tool is built on top of the GoogleTest library. It checks the correctness of the CUDA-accelerated functions of the Concrete framework and helps identify arithmetic flaws.
The output format can be adjusted according to the user's needs.
Each test case executes a particular function and verifies its result against the expected behavior. This is repeated for multiple encryption keys and for several samples per key; both counts can be changed through the `REPETITIONS` and `SAMPLES` variables defined at the beginning of each test file, as shown below.
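As an illustration, these constants sit at the top of every test source file; the values below are taken from the bit-extraction test and differ from one file to another.
```cpp
// Number of independent key generations, and number of samples per key.
// Increasing these makes the tests more thorough at the cost of runtime.
const unsigned REPETITIONS = 2;
const unsigned SAMPLES = 10;
```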
## How to Compile
The first step in compiling code with CMake is to create a build directory. This directory will
contain all the files generated during the build process, such as object files and executables.
We recommend creating this directory inside the implementation folder, separate from the source
files, to keep the source tree clean.
```bash
$ cd concrete/backends/concrete-cuda/implementation
$ mkdir build
$ cd build
```
Run CMake to generate the build files and then use make to compile the project.
```bash
$ cmake ..
$ make
```
The binary will be found in
`concrete/backends/concrete-cuda/implementation/build/test_and_benchmark/test`.
## How to Run Tests
To run tests, you can simply execute the `test_concrete_cuda` executable with no arguments:
```bash
$ test_and_benchmark/test/test_concrete_cuda
```
This will run all the available tests.
## How to Filter Tests
You can select a subset of tests by passing a filter on the test names as an argument. Only
tests whose full name matches the filter will be executed.
For example, to run only tests whose name starts with the word "Bootstrap", you can execute:
```bash
$ test_and_benchmark/test/test_concrete_cuda --gtest_filter=Bootstrap*
```
The `--gtest_list_tests` flag can be used to list all the available tests, and a more detailed
description of how to select a subset of tests can be found in the
[GoogleTest documentation](http://google.github.io/googletest/advanced.html#running-a-subset-of-the-tests).
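For example, listing every registered test before choosing a filter can be done with the same binary:
```bash
$ test_and_benchmark/test/test_concrete_cuda --gtest_list_tests
```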
## Conclusion
With these options, you can easily verify the correctness of concrete-cuda's implementations. If
you have any questions or issues, please feel free to contact us.
To learn more about the GoogleTest library, please refer to the [official user guide](http://google.github.io/googletest/).

View File

@@ -1,243 +0,0 @@
#include <cstdint>
#include <gtest/gtest.h>
#include <setup_and_teardown.h>
#include <stdio.h>
#include <stdlib.h>
const unsigned REPETITIONS = 2;
const unsigned MAX_INPUTS = 4;
const unsigned SAMPLES = 10;
typedef struct {
int lwe_dimension;
int glwe_dimension;
int polynomial_size;
double lwe_modular_variance;
double glwe_modular_variance;
int pbs_base_log;
int pbs_level;
int ks_base_log;
int ks_level;
uint32_t number_of_bits_of_message_including_padding_array[MAX_INPUTS];
uint32_t number_of_bits_to_extract_array[MAX_INPUTS];
int number_of_inputs;
} BitExtractionTestParams;
class BitExtractionTestPrimitives_u64
: public ::testing::TestWithParam<BitExtractionTestParams> {
protected:
int lwe_dimension;
int glwe_dimension;
int polynomial_size;
double lwe_modular_variance;
double glwe_modular_variance;
int pbs_base_log;
int pbs_level;
int ks_base_log;
int ks_level;
uint32_t number_of_bits_of_message_including_padding_array[MAX_INPUTS];
uint32_t number_of_bits_to_extract_array[MAX_INPUTS];
int number_of_inputs;
uint64_t delta_array[MAX_INPUTS];
uint32_t delta_log_array[MAX_INPUTS];
Csprng *csprng;
cudaStream_t *stream_array[SAMPLES];
int gpu_index = 0;
uint64_t *lwe_sk_in_array;
uint64_t *lwe_sk_out_array;
uint64_t *lwe_ct_in_array;
uint64_t *plaintexts;
double *d_fourier_bsk_array;
uint64_t *d_ksk_array;
uint64_t *d_lwe_ct_in_array;
uint64_t *d_lwe_ct_out_array;
int8_t *bit_extract_buffer_array[SAMPLES];
int input_lwe_dimension;
int output_lwe_dimension;
public:
// Test arithmetic functions
void SetUp() {
for (size_t i = 0; i < SAMPLES; i++) {
stream_array[i] = cuda_create_stream(0);
}
// TestParams
lwe_dimension = (int)GetParam().lwe_dimension;
glwe_dimension = (int)GetParam().glwe_dimension;
polynomial_size = (int)GetParam().polynomial_size;
lwe_modular_variance = (double)GetParam().lwe_modular_variance;
glwe_modular_variance = (double)GetParam().glwe_modular_variance;
pbs_base_log = (int)GetParam().pbs_base_log;
pbs_level = (int)GetParam().pbs_level;
ks_base_log = (int)GetParam().ks_base_log;
ks_level = (int)GetParam().ks_level;
for (size_t i = 0; i < MAX_INPUTS; i++) {
number_of_bits_to_extract_array[i] =
(int)GetParam().number_of_bits_to_extract_array[i];
number_of_bits_of_message_including_padding_array[i] =
(int)GetParam().number_of_bits_of_message_including_padding_array[i];
}
number_of_inputs = (int)GetParam().number_of_inputs;
input_lwe_dimension = glwe_dimension * polynomial_size;
output_lwe_dimension = lwe_dimension;
bit_extraction_setup(
stream_array, &csprng, &lwe_sk_in_array, &lwe_sk_out_array,
&d_fourier_bsk_array, &d_ksk_array, &plaintexts, &d_lwe_ct_in_array,
&d_lwe_ct_out_array, bit_extract_buffer_array, lwe_dimension,
glwe_dimension, polynomial_size, lwe_modular_variance,
glwe_modular_variance, ks_base_log, ks_level, pbs_base_log, pbs_level,
number_of_bits_of_message_including_padding_array,
number_of_bits_to_extract_array, delta_log_array, delta_array,
number_of_inputs, REPETITIONS, SAMPLES, gpu_index);
}
void TearDown() {
bit_extraction_teardown(stream_array, csprng, lwe_sk_in_array,
lwe_sk_out_array, d_fourier_bsk_array, d_ksk_array,
plaintexts, d_lwe_ct_in_array, d_lwe_ct_out_array,
bit_extract_buffer_array, SAMPLES, gpu_index);
}
};
TEST_P(BitExtractionTestPrimitives_u64, bit_extraction) {
int total_bits_to_extract = 0;
for (int i = 0; i < number_of_inputs; i++) {
total_bits_to_extract += number_of_bits_to_extract_array[i];
}
uint64_t *lwe_ct_out_array =
(uint64_t *)malloc((output_lwe_dimension + 1) * total_bits_to_extract *
SAMPLES * sizeof(uint64_t));
int bsk_size = (glwe_dimension + 1) * (glwe_dimension + 1) * pbs_level *
polynomial_size * (output_lwe_dimension + 1);
int ksk_size = ks_level * input_lwe_dimension * (output_lwe_dimension + 1);
for (uint r = 0; r < REPETITIONS; r++) {
double *d_fourier_bsk = d_fourier_bsk_array + (ptrdiff_t)(bsk_size * r);
uint64_t *d_ksk = d_ksk_array + (ptrdiff_t)(ksk_size * r);
uint64_t *lwe_sk_out =
lwe_sk_out_array + (ptrdiff_t)(r * output_lwe_dimension);
auto d_cur_rep_ct_lwe_in_array =
&d_lwe_ct_in_array[r * SAMPLES * number_of_inputs *
(input_lwe_dimension + 1)];
for (uint s = 0; s < SAMPLES; s++) {
auto d_cur_sample_ct_lwe_in_array =
&d_cur_rep_ct_lwe_in_array[s * number_of_inputs *
(input_lwe_dimension + 1)];
auto d_cur_sample_ct_lwe_out_array =
&d_lwe_ct_out_array[s * total_bits_to_extract *
(output_lwe_dimension + 1)];
// Execute bit extract
auto cur_sample_ct_lwe_out_array =
&lwe_ct_out_array[s * total_bits_to_extract *
(output_lwe_dimension + 1)];
cuda_extract_bits_64(
stream_array[s], gpu_index, (void *)d_cur_sample_ct_lwe_out_array,
(void *)d_cur_sample_ct_lwe_in_array, bit_extract_buffer_array[s],
(void *)d_ksk, (void *)d_fourier_bsk, number_of_bits_to_extract_array,
delta_log_array, input_lwe_dimension, output_lwe_dimension,
glwe_dimension, polynomial_size, pbs_base_log, pbs_level, ks_base_log,
ks_level, number_of_inputs, cuda_get_max_shared_memory(gpu_index));
// Copy result back
cuda_memcpy_async_to_cpu(
cur_sample_ct_lwe_out_array, d_cur_sample_ct_lwe_out_array,
(output_lwe_dimension + 1) * total_bits_to_extract * sizeof(uint64_t),
stream_array[s], gpu_index);
}
for (size_t s = 0; s < SAMPLES; s++) {
void *v_stream = (void *)stream_array[s];
cuda_synchronize_stream(v_stream);
}
cudaDeviceSynchronize();
for (size_t s = 0; s < SAMPLES; s++) {
auto cur_sample_result_array =
&lwe_ct_out_array[s * total_bits_to_extract *
(output_lwe_dimension + 1)];
int cur_total_bits = 0;
for (int j = 0; j < number_of_inputs; j++) {
auto cur_input_result_array =
&cur_sample_result_array[cur_total_bits *
(output_lwe_dimension + 1)];
cur_total_bits += number_of_bits_to_extract_array[j];
uint64_t plaintext = plaintexts[r * SAMPLES * number_of_inputs +
s * number_of_inputs + j];
for (size_t i = 0; i < number_of_bits_to_extract_array[j]; i++) {
auto result_ct =
&cur_input_result_array[(number_of_bits_to_extract_array[j] - 1 -
i) *
(output_lwe_dimension + 1)];
uint64_t decrypted_message = 0;
concrete_cpu_decrypt_lwe_ciphertext_u64(
lwe_sk_out, result_ct, output_lwe_dimension, &decrypted_message);
// Round after decryption
uint64_t decrypted_rounded =
closest_representable(decrypted_message, 1, 1);
// Bring back the extracted bit found in the MSB in the LSB
uint64_t decrypted_extract_bit = decrypted_rounded >> 63;
uint64_t expected =
((plaintext >> delta_log_array[j]) >> i) & (uint64_t)(1);
EXPECT_EQ(decrypted_extract_bit, expected);
}
}
}
}
}
// Defines the parameter sets for which bit extraction will be tested.
// Each test is executed once per parameter set listed below.
::testing::internal::ParamGenerator<BitExtractionTestParams>
bit_extract_params_u64 = ::testing::Values(
// n, k, N, lwe_variance, glwe_variance, pbs_base_log, pbs_level,
// ks_base_log, ks_level, number_of_message_bits,
// number_of_bits_to_extract, number_of_inputs
(BitExtractionTestParams){585,
1,
1024,
7.52316384526264e-37,
7.52316384526264e-37,
10,
2,
4,
7,
{5, 4, 4, 3},
{5, 4, 4, 3},
4});
std::string
printParamName(::testing::TestParamInfo<BitExtractionTestParams> p) {
BitExtractionTestParams params = p.param;
return "n_" + std::to_string(params.lwe_dimension) + "_k_" +
std::to_string(params.glwe_dimension) + "_N_" +
std::to_string(params.polynomial_size) + "_pbs_base_log_" +
std::to_string(params.pbs_base_log) + "_pbs_level_" +
std::to_string(params.pbs_level) + "_ks_base_log_" +
std::to_string(params.ks_base_log) + "_ks_level_" +
std::to_string(params.ks_level) + "_number_of_message_bits_" +
std::to_string(
params.number_of_bits_of_message_including_padding_array[0]) +
"_" +
std::to_string(
params.number_of_bits_of_message_including_padding_array[1]) +
"_" +
std::to_string(
params.number_of_bits_of_message_including_padding_array[2]) +
"_" +
std::to_string(
params.number_of_bits_of_message_including_padding_array[3]) +
"_number_of_bits_to_extract_" +
std::to_string(params.number_of_bits_to_extract_array[0]) + "_" +
std::to_string(params.number_of_bits_to_extract_array[1]) + "_" +
std::to_string(params.number_of_bits_to_extract_array[2]) + "_" +
std::to_string(params.number_of_bits_to_extract_array[3]) +
"_number_of_inputs_" + std::to_string(params.number_of_inputs);
}
INSTANTIATE_TEST_CASE_P(BitExtractionInstantiation,
BitExtractionTestPrimitives_u64, bit_extract_params_u64,
printParamName);

View File

@@ -1,220 +0,0 @@
#include <cstdint>
#include <gtest/gtest.h>
#include <setup_and_teardown.h>
#include <stdio.h>
#include <stdlib.h>
const unsigned REPETITIONS = 2;
const unsigned SAMPLES = 50;
typedef struct {
int lwe_dimension;
int glwe_dimension;
int polynomial_size;
double lwe_modular_variance;
double glwe_modular_variance;
int pbs_base_log;
int pbs_level;
int pksk_base_log;
int pksk_level;
int cbs_base_log;
int cbs_level;
int number_of_inputs;
} CircuitBootstrapTestParams;
class CircuitBootstrapTestPrimitives_u64
: public ::testing::TestWithParam<CircuitBootstrapTestParams> {
protected:
int lwe_dimension;
int glwe_dimension;
int polynomial_size;
double lwe_modular_variance;
double glwe_modular_variance;
int pbs_base_log;
int pbs_level;
int pksk_base_log;
int pksk_level;
int cbs_base_log;
int cbs_level;
int number_of_inputs;
int number_of_bits_of_message_including_padding;
int ggsw_size;
uint64_t delta;
int delta_log;
Csprng *csprng;
cudaStream_t *stream;
int gpu_index = 0;
uint64_t *lwe_sk_in_array;
uint64_t *lwe_sk_out_array;
uint64_t *plaintexts;
double *d_fourier_bsk_array;
uint64_t *d_pksk_array;
uint64_t *d_lwe_ct_in_array;
uint64_t *d_ggsw_ct_out_array;
uint64_t *d_lut_vector_indexes;
int8_t *cbs_buffer;
public:
// Test arithmetic functions
void SetUp() {
stream = cuda_create_stream(0);
// TestParams
lwe_dimension = (int)GetParam().lwe_dimension;
glwe_dimension = (int)GetParam().glwe_dimension;
polynomial_size = (int)GetParam().polynomial_size;
lwe_modular_variance = (double)GetParam().lwe_modular_variance;
glwe_modular_variance = (double)GetParam().glwe_modular_variance;
pbs_base_log = (int)GetParam().pbs_base_log;
pbs_level = (int)GetParam().pbs_level;
pksk_base_log = (int)GetParam().pksk_base_log;
pksk_level = (int)GetParam().pksk_level;
cbs_base_log = (int)GetParam().cbs_base_log;
cbs_level = (int)GetParam().cbs_level;
number_of_inputs = (int)GetParam().number_of_inputs;
// We generate binary messages
number_of_bits_of_message_including_padding = 2;
ggsw_size = cbs_level * (glwe_dimension + 1) * (glwe_dimension + 1) *
polynomial_size;
circuit_bootstrap_setup(
stream, &csprng, &lwe_sk_in_array, &lwe_sk_out_array,
&d_fourier_bsk_array, &d_pksk_array, &plaintexts, &d_lwe_ct_in_array,
&d_ggsw_ct_out_array, &d_lut_vector_indexes, &cbs_buffer, lwe_dimension,
glwe_dimension, polynomial_size, lwe_modular_variance,
glwe_modular_variance, pksk_base_log, pksk_level, pbs_base_log,
pbs_level, cbs_level, number_of_bits_of_message_including_padding,
ggsw_size, &delta_log, &delta, number_of_inputs, REPETITIONS, SAMPLES,
gpu_index);
}
void TearDown() {
circuit_bootstrap_teardown(
stream, csprng, lwe_sk_in_array, lwe_sk_out_array, d_fourier_bsk_array,
d_pksk_array, plaintexts, d_lwe_ct_in_array, d_lut_vector_indexes,
d_ggsw_ct_out_array, cbs_buffer, gpu_index);
}
};
TEST_P(CircuitBootstrapTestPrimitives_u64, circuit_bootstrap) {
void *v_stream = (void *)stream;
uint64_t *ggsw_ct_out = (uint64_t *)malloc(ggsw_size * sizeof(uint64_t));
for (uint r = 0; r < REPETITIONS; r++) {
int bsk_size = (glwe_dimension + 1) * (glwe_dimension + 1) * pbs_level *
polynomial_size * (lwe_dimension + 1);
double *d_fourier_bsk = d_fourier_bsk_array + (ptrdiff_t)(bsk_size * r);
int pksk_list_size = pksk_level * (glwe_dimension + 1) * polynomial_size *
(glwe_dimension * polynomial_size + 1) *
(glwe_dimension + 1);
uint64_t *d_pksk_list = d_pksk_array + (ptrdiff_t)(pksk_list_size * r);
uint64_t *lwe_sk_out =
lwe_sk_out_array + (ptrdiff_t)(r * glwe_dimension * polynomial_size);
for (uint s = 0; s < SAMPLES; s++) {
uint64_t *d_lwe_ct_in =
d_lwe_ct_in_array +
(ptrdiff_t)((r * SAMPLES * number_of_inputs + s * number_of_inputs) *
(lwe_dimension + 1));
// Execute circuit bootstrap
cuda_circuit_bootstrap_64(
stream, gpu_index, (void *)d_ggsw_ct_out_array, (void *)d_lwe_ct_in,
(void *)d_fourier_bsk, (void *)d_pksk_list,
(void *)d_lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
glwe_dimension, lwe_dimension, pbs_level, pbs_base_log, pksk_level,
pksk_base_log, cbs_level, cbs_base_log, number_of_inputs,
cuda_get_max_shared_memory(gpu_index));
for (int i = 0; i < number_of_inputs; i++) {
uint64_t plaintext = plaintexts[r * SAMPLES * number_of_inputs +
s * number_of_inputs + i];
uint64_t *decrypted =
(uint64_t *)malloc(polynomial_size * (glwe_dimension + 1) *
cbs_level * sizeof(uint64_t));
// Copy result back
cuda_memcpy_async_to_cpu(
ggsw_ct_out, d_ggsw_ct_out_array + i * ggsw_size,
ggsw_size * sizeof(uint64_t), stream, gpu_index);
cuda_synchronize_stream(v_stream);
uint64_t multiplying_factor = -(plaintext >> delta_log);
for (int l = 1; l < cbs_level + 1; l++) {
for (int j = 0; j < glwe_dimension; j++) {
uint64_t *res = decrypted + (ptrdiff_t)((l - 1) * polynomial_size *
(glwe_dimension + 1) +
j * polynomial_size);
uint64_t *glwe_ct_out =
ggsw_ct_out +
(ptrdiff_t)((l - 1) * polynomial_size * (glwe_dimension + 1) *
(glwe_dimension + 1) +
j * polynomial_size * (glwe_dimension + 1));
concrete_cpu_decrypt_glwe_ciphertext_u64(
lwe_sk_out, res, glwe_ct_out, glwe_dimension, polynomial_size);
for (int k = 0; k < polynomial_size; k++) {
uint64_t expected_decryption =
lwe_sk_out[j * polynomial_size + k] * multiplying_factor;
expected_decryption >>= (64 - cbs_base_log * l);
uint64_t decoded_plaintext =
closest_representable(res[k], l, cbs_base_log) >>
(64 - cbs_base_log * l);
EXPECT_EQ(expected_decryption, decoded_plaintext);
}
}
}
// Check last glwe on last level
uint64_t *res =
decrypted + (ptrdiff_t)((cbs_level - 1) * polynomial_size *
(glwe_dimension + 1) +
glwe_dimension * polynomial_size);
uint64_t *glwe_ct_out =
ggsw_ct_out +
(ptrdiff_t)((cbs_level - 1) * polynomial_size *
(glwe_dimension + 1) * (glwe_dimension + 1) +
glwe_dimension * polynomial_size *
(glwe_dimension + 1));
concrete_cpu_decrypt_glwe_ciphertext_u64(
lwe_sk_out, res, glwe_ct_out, glwe_dimension, polynomial_size);
for (int k = 0; k < polynomial_size; k++) {
uint64_t expected_decryption = (k == 0) ? plaintext / delta : 0;
uint64_t decoded_plaintext =
closest_representable(res[k], cbs_level, cbs_base_log) >>
(64 - cbs_base_log * cbs_level);
EXPECT_EQ(expected_decryption, decoded_plaintext);
}
free(decrypted);
}
}
}
free(ggsw_ct_out);
}
// Defines the parameter sets for which circuit bootstrapping will be tested.
// Each test is executed once per parameter set listed below.
::testing::internal::ParamGenerator<CircuitBootstrapTestParams> cbs_params_u64 =
::testing::Values(
// n, k, N, lwe_variance, glwe_variance, pbs_base_log, pbs_level,
// pksk_base_log, pksk_level, cbs_base_log, cbs_level, number_of_inputs
(CircuitBootstrapTestParams){10, 2, 512, 7.52316384526264e-37,
7.52316384526264e-37, 11, 2, 15, 2, 10, 1,
10});
std::string
printParamName(::testing::TestParamInfo<CircuitBootstrapTestParams> p) {
CircuitBootstrapTestParams params = p.param;
return "n_" + std::to_string(params.lwe_dimension) + "_k_" +
std::to_string(params.glwe_dimension) + "_N_" +
std::to_string(params.polynomial_size) + "_pbs_base_log_" +
std::to_string(params.pbs_base_log) + "_pbs_level_" +
std::to_string(params.pbs_level) + "_pksk_base_log_" +
std::to_string(params.pksk_base_log) + "_pksk_level_" +
std::to_string(params.pksk_level) + "_cbs_base_log_" +
std::to_string(params.cbs_base_log) + "_cbs_level_" +
std::to_string(params.cbs_level);
}
INSTANTIATE_TEST_CASE_P(CircuitBootstrapInstantiation,
CircuitBootstrapTestPrimitives_u64, cbs_params_u64,
printParamName);

View File

@@ -1,339 +0,0 @@
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <functional>
#include <gtest/gtest.h>
#include <setup_and_teardown.h>
#include <utils.h>
typedef struct {
int lwe_dimension;
int glwe_dimension;
int polynomial_size;
double lwe_modular_variance;
double glwe_modular_variance;
int pbs_base_log;
int pbs_level;
int message_modulus;
int carry_modulus;
int number_of_inputs;
int repetitions;
int samples;
} ClassicalBootstrapTestParams;
class ClassicalBootstrapTestPrimitives_u64
: public ::testing::TestWithParam<ClassicalBootstrapTestParams> {
protected:
int lwe_dimension;
int glwe_dimension;
int polynomial_size;
double lwe_modular_variance;
double glwe_modular_variance;
int pbs_base_log;
int pbs_level;
int message_modulus;
int carry_modulus;
int payload_modulus;
int number_of_inputs;
int repetitions;
int samples;
uint64_t delta;
Csprng *csprng;
cudaStream_t *stream;
int gpu_index = 0;
uint64_t *lwe_sk_in_array;
uint64_t *lwe_sk_out_array;
uint64_t *plaintexts;
double *d_fourier_bsk_array;
uint64_t *d_lut_pbs_identity;
uint64_t *d_lut_pbs_indexes;
uint64_t *d_lwe_ct_in_array;
uint64_t *d_lwe_ct_out_array;
uint64_t *lwe_ct_out_array;
public:
// Test arithmetic functions
void SetUp() {
stream = cuda_create_stream(0);
// TestParams
lwe_dimension = (int)GetParam().lwe_dimension;
glwe_dimension = (int)GetParam().glwe_dimension;
polynomial_size = (int)GetParam().polynomial_size;
lwe_modular_variance = (double)GetParam().lwe_modular_variance;
glwe_modular_variance = (double)GetParam().glwe_modular_variance;
pbs_base_log = (int)GetParam().pbs_base_log;
pbs_level = (int)GetParam().pbs_level;
message_modulus = (int)GetParam().message_modulus;
carry_modulus = (int)GetParam().carry_modulus;
number_of_inputs = (int)GetParam().number_of_inputs;
repetitions = (int)GetParam().repetitions;
samples = (int)GetParam().samples;
bootstrap_classical_setup(
stream, &csprng, &lwe_sk_in_array, &lwe_sk_out_array,
&d_fourier_bsk_array, &plaintexts, &d_lut_pbs_identity,
&d_lut_pbs_indexes, &d_lwe_ct_in_array, &d_lwe_ct_out_array,
lwe_dimension, glwe_dimension, polynomial_size, lwe_modular_variance,
glwe_modular_variance, pbs_base_log, pbs_level, message_modulus,
carry_modulus, &payload_modulus, &delta, number_of_inputs, repetitions,
samples, gpu_index);
lwe_ct_out_array =
(uint64_t *)malloc((glwe_dimension * polynomial_size + 1) *
number_of_inputs * sizeof(uint64_t));
}
void TearDown() {
free(lwe_ct_out_array);
bootstrap_classical_teardown(
stream, csprng, lwe_sk_in_array, lwe_sk_out_array, d_fourier_bsk_array,
plaintexts, d_lut_pbs_identity, d_lut_pbs_indexes, d_lwe_ct_in_array,
d_lwe_ct_out_array, gpu_index);
}
};
TEST_P(ClassicalBootstrapTestPrimitives_u64, amortized_bootstrap) {
int8_t *pbs_buffer;
scratch_cuda_bootstrap_amortized_64(
stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
number_of_inputs, cuda_get_max_shared_memory(gpu_index), true);
int bsk_size = (glwe_dimension + 1) * (glwe_dimension + 1) * pbs_level *
polynomial_size * (lwe_dimension + 1);
// Here execute the PBS
for (int r = 0; r < repetitions; r++) {
double *d_fourier_bsk = d_fourier_bsk_array + (ptrdiff_t)(bsk_size * r);
uint64_t *lwe_sk_out =
lwe_sk_out_array + (ptrdiff_t)(r * glwe_dimension * polynomial_size);
for (int s = 0; s < samples; s++) {
uint64_t *d_lwe_ct_in =
d_lwe_ct_in_array +
(ptrdiff_t)((r * samples * number_of_inputs + s * number_of_inputs) *
(lwe_dimension + 1));
// Execute PBS
cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
stream, gpu_index, (void *)d_lwe_ct_out_array,
(void *)d_lut_pbs_identity, (void *)d_lut_pbs_indexes,
(void *)d_lwe_ct_in, (void *)d_fourier_bsk, pbs_buffer, lwe_dimension,
glwe_dimension, polynomial_size, pbs_base_log, pbs_level,
number_of_inputs, 1, 0, cuda_get_max_shared_memory(gpu_index));
// Copy result back
cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array,
(glwe_dimension * polynomial_size + 1) *
number_of_inputs * sizeof(uint64_t),
stream, gpu_index);
for (int j = 0; j < number_of_inputs; j++) {
uint64_t *result =
lwe_ct_out_array +
(ptrdiff_t)(j * (glwe_dimension * polynomial_size + 1));
uint64_t plaintext = plaintexts[r * samples * number_of_inputs +
s * number_of_inputs + j];
uint64_t decrypted = 0;
concrete_cpu_decrypt_lwe_ciphertext_u64(
lwe_sk_out, result, glwe_dimension * polynomial_size, &decrypted);
EXPECT_NE(decrypted, plaintext);
// let err = (decrypted >= plaintext) ? decrypted - plaintext : plaintext - decrypted;
// error_sample_vec.push(err);
// The bit before the message
uint64_t rounding_bit = delta >> 1;
// Compute the rounding bit
uint64_t rounding = (decrypted & rounding_bit) << 1;
uint64_t decoded = (decrypted + rounding) / delta;
EXPECT_EQ(decoded, plaintext / delta)
<< "Repetition: " << r << ", sample: " << s;
}
}
}
cleanup_cuda_bootstrap_amortized(stream, gpu_index, &pbs_buffer);
}
TEST_P(ClassicalBootstrapTestPrimitives_u64, low_latency_bootstrap) {
int8_t *pbs_buffer;
scratch_cuda_bootstrap_low_latency_64(
stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
pbs_level, number_of_inputs, cuda_get_max_shared_memory(gpu_index), true);
int number_of_sm = 0;
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
int bsk_size = (glwe_dimension + 1) * (glwe_dimension + 1) * pbs_level *
polynomial_size * (lwe_dimension + 1);
// Here execute the PBS
for (int r = 0; r < repetitions; r++) {
double *d_fourier_bsk = d_fourier_bsk_array + (ptrdiff_t)(bsk_size * r);
uint64_t *lwe_sk_out =
lwe_sk_out_array + (ptrdiff_t)(r * glwe_dimension * polynomial_size);
for (int s = 0; s < samples; s++) {
uint64_t *d_lwe_ct_in =
d_lwe_ct_in_array +
(ptrdiff_t)((r * samples * number_of_inputs + s * number_of_inputs) *
(lwe_dimension + 1));
// Execute PBS
cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
stream, gpu_index, (void *)d_lwe_ct_out_array,
(void *)d_lut_pbs_identity, (void *)d_lut_pbs_indexes,
(void *)d_lwe_ct_in, (void *)d_fourier_bsk, pbs_buffer, lwe_dimension,
glwe_dimension, polynomial_size, pbs_base_log, pbs_level,
number_of_inputs, 1, 0, cuda_get_max_shared_memory(gpu_index));
// Copy result back
cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array,
(glwe_dimension * polynomial_size + 1) *
number_of_inputs * sizeof(uint64_t),
stream, gpu_index);
for (int j = 0; j < number_of_inputs; j++) {
uint64_t *result =
lwe_ct_out_array +
(ptrdiff_t)(j * (glwe_dimension * polynomial_size + 1));
uint64_t plaintext = plaintexts[r * samples * number_of_inputs +
s * number_of_inputs + j];
uint64_t decrypted = 0;
concrete_cpu_decrypt_lwe_ciphertext_u64(
lwe_sk_out, result, glwe_dimension * polynomial_size, &decrypted);
EXPECT_NE(decrypted, plaintext);
// let err = (decrypted >= plaintext) ? decrypted - plaintext : plaintext - decrypted;
// error_sample_vec.push(err);
// The bit before the message
uint64_t rounding_bit = delta >> 1;
// Compute the rounding bit
uint64_t rounding = (decrypted & rounding_bit) << 1;
uint64_t decoded = (decrypted + rounding) / delta;
EXPECT_EQ(decoded, plaintext / delta);
}
}
}
cleanup_cuda_bootstrap_low_latency(stream, gpu_index, &pbs_buffer);
}
// Defines the parameter sets for which the PBS will be tested.
// Each test is executed once per parameter set listed below.
::testing::internal::ParamGenerator<ClassicalBootstrapTestParams>
pbs_params_u64 = ::testing::Values(
// n, k, N, lwe_variance, glwe_variance, pbs_base_log, pbs_level,
// message_modulus, carry_modulus, number_of_inputs, repetitions,
// samples
// BOOLEAN_DEFAULT_PARAMETERS
(ClassicalBootstrapTestParams){777, 3, 512, 1.3880686109937e-11,
1.1919984450689246e-23, 18, 1, 2, 2, 2,
2, 40},
// BOOLEAN_TFHE_LIB_PARAMETERS
(ClassicalBootstrapTestParams){830, 2, 1024, 1.994564705573226e-12,
8.645717832544903e-32, 23, 1, 2, 2, 2, 2,
40},
// SHORTINT_PARAM_MESSAGE_1_CARRY_0
(ClassicalBootstrapTestParams){678, 5, 256, 5.203010004723453e-10,
1.3996292326131784e-19, 15, 1, 2, 1, 2,
2, 40},
// SHORTINT_PARAM_MESSAGE_1_CARRY_1
(ClassicalBootstrapTestParams){684, 3, 512, 4.177054989616946e-10,
1.1919984450689246e-23, 18, 1, 2, 2, 2,
2, 40},
// SHORTINT_PARAM_MESSAGE_2_CARRY_0
(ClassicalBootstrapTestParams){656, 2, 512, 1.1641198952558192e-09,
1.6434266310406663e-15, 8, 2, 4, 1, 2, 2,
40},
// SHORTINT_PARAM_MESSAGE_1_CARRY_2
// SHORTINT_PARAM_MESSAGE_2_CARRY_1
// SHORTINT_PARAM_MESSAGE_3_CARRY_0
(ClassicalBootstrapTestParams){742, 2, 1024, 4.998277131225527e-11,
8.645717832544903e-32, 23, 1, 2, 4, 2, 2,
40},
// SHORTINT_PARAM_MESSAGE_1_CARRY_3
// SHORTINT_PARAM_MESSAGE_2_CARRY_2
// SHORTINT_PARAM_MESSAGE_3_CARRY_1
// SHORTINT_PARAM_MESSAGE_4_CARRY_0
(ClassicalBootstrapTestParams){745, 1, 2048, 4.478453795193731e-11,
8.645717832544903e-32, 23, 1, 2, 8, 2, 2,
40},
// SHORTINT_PARAM_MESSAGE_5_CARRY_0
// SHORTINT_PARAM_MESSAGE_3_CARRY_2
(ClassicalBootstrapTestParams){807, 1, 4096, 4.629015039118823e-12,
4.70197740328915e-38, 22, 1, 32, 1, 2, 1,
40},
// SHORTINT_PARAM_MESSAGE_6_CARRY_0
(ClassicalBootstrapTestParams){915, 1, 8192, 8.883173851180252e-14,
4.70197740328915e-38, 22, 1, 64, 1, 2, 1,
5},
// SHORTINT_PARAM_MESSAGE_3_CARRY_3
(ClassicalBootstrapTestParams){864, 1, 8192, 1.5843564961097632e-15,
4.70197740328915e-38, 15, 2, 8, 8, 2, 1,
5},
// SHORTINT_PARAM_MESSAGE_4_CARRY_3
// SHORTINT_PARAM_MESSAGE_7_CARRY_0
(ClassicalBootstrapTestParams){930, 1, 16384, 5.129877458078009e-14,
4.70197740328915e-38, 15, 2, 128, 1, 2,
1, 5},
// BOOLEAN_DEFAULT_PARAMETERS
(ClassicalBootstrapTestParams){777, 3, 512, 1.3880686109937e-11,
1.1919984450689246e-23, 18, 1, 2, 2, 100,
2, 40},
// BOOLEAN_TFHE_LIB_PARAMETERS
(ClassicalBootstrapTestParams){830, 2, 1024, 1.994564705573226e-12,
8.645717832544903e-32, 23, 1, 2, 2, 100,
2, 40},
// SHORTINT_PARAM_MESSAGE_1_CARRY_0
(ClassicalBootstrapTestParams){678, 5, 256, 5.203010004723453e-10,
1.3996292326131784e-19, 15, 1, 2, 1, 100,
2, 40},
// SHORTINT_PARAM_MESSAGE_1_CARRY_1
(ClassicalBootstrapTestParams){684, 3, 512, 4.177054989616946e-10,
1.1919984450689246e-23, 18, 1, 2, 2, 100,
2, 40},
// SHORTINT_PARAM_MESSAGE_2_CARRY_0
(ClassicalBootstrapTestParams){656, 2, 512, 1.1641198952558192e-09,
1.6434266310406663e-15, 8, 2, 4, 1, 100,
2, 40},
// SHORTINT_PARAM_MESSAGE_1_CARRY_2
// SHORTINT_PARAM_MESSAGE_2_CARRY_1
// SHORTINT_PARAM_MESSAGE_3_CARRY_0
(ClassicalBootstrapTestParams){742, 2, 1024, 4.998277131225527e-11,
8.645717832544903e-32, 23, 1, 2, 4, 100,
2, 40},
// SHORTINT_PARAM_MESSAGE_1_CARRY_3
// SHORTINT_PARAM_MESSAGE_2_CARRY_2
// SHORTINT_PARAM_MESSAGE_3_CARRY_1
// SHORTINT_PARAM_MESSAGE_4_CARRY_0
(ClassicalBootstrapTestParams){745, 1, 2048, 4.478453795193731e-11,
8.645717832544903e-32, 23, 1, 2, 8, 100,
2, 40},
// SHORTINT_PARAM_MESSAGE_5_CARRY_0
// SHORTINT_PARAM_MESSAGE_3_CARRY_2
(ClassicalBootstrapTestParams){807, 1, 4096, 4.629015039118823e-12,
4.70197740328915e-38, 22, 1, 32, 1, 100,
1, 40},
// SHORTINT_PARAM_MESSAGE_6_CARRY_0
(ClassicalBootstrapTestParams){915, 1, 8192, 8.883173851180252e-14,
4.70197740328915e-38, 22, 1, 64, 1, 100,
1, 5},
// SHORTINT_PARAM_MESSAGE_3_CARRY_3
(ClassicalBootstrapTestParams){864, 1, 8192, 1.5843564961097632e-15,
4.70197740328915e-38, 15, 2, 8, 8, 100,
1, 5},
// SHORTINT_PARAM_MESSAGE_4_CARRY_3
// SHORTINT_PARAM_MESSAGE_7_CARRY_0
(ClassicalBootstrapTestParams){930, 1, 16384, 5.129877458078009e-14,
4.70197740328915e-38, 15, 2, 128, 1, 100,
1, 5});
std::string
printParamName(::testing::TestParamInfo<ClassicalBootstrapTestParams> p) {
ClassicalBootstrapTestParams params = p.param;
return "n_" + std::to_string(params.lwe_dimension) + "_k_" +
std::to_string(params.glwe_dimension) + "_N_" +
std::to_string(params.polynomial_size) + "_pbs_base_log_" +
std::to_string(params.pbs_base_log) + "_pbs_level_" +
std::to_string(params.pbs_level) + "_number_of_inputs_" +
std::to_string(params.number_of_inputs);
}
INSTANTIATE_TEST_CASE_P(ClassicalBootstrapInstantiation,
ClassicalBootstrapTestPrimitives_u64, pbs_params_u64,
printParamName);

View File

@@ -1,149 +0,0 @@
#include <cmath>
#include <cstdint>
#include <functional>
#include <gtest/gtest.h>
#include <setup_and_teardown.h>
#include <stdlib.h>
const unsigned REPETITIONS = 5;
const unsigned SAMPLES = 50;
typedef struct {
int glwe_dimension;
int polynomial_size;
int p; // number_of_bits_to_extract
int tau;
double glwe_modular_variance;
int base_log;
int level_count;
} CMUXTreeTestParams;
class CMUXTreeTestPrimitives_u64
: public ::testing::TestWithParam<CMUXTreeTestParams> {
protected:
int glwe_dimension;
int polynomial_size;
int p;
int tau;
double glwe_modular_variance;
int base_log;
int level_count;
uint64_t delta;
uint32_t delta_log;
Csprng *csprng;
uint64_t *plaintexts;
cudaStream_t *stream;
int gpu_index = 0;
uint64_t *glwe_sk;
uint64_t *d_lut_identity;
int8_t *cmux_tree_buffer = nullptr;
uint64_t *d_ggsw_bit_array;
uint64_t *d_glwe_out;
uint64_t *glwe_out;
public:
// Test arithmetic functions
void SetUp() {
stream = cuda_create_stream(0);
// TestParams
glwe_dimension = (int)GetParam().glwe_dimension;
polynomial_size = (int)GetParam().polynomial_size;
p = (int)GetParam().p;
tau = (int)GetParam().tau;
glwe_modular_variance = (double)GetParam().glwe_modular_variance;
base_log = (int)GetParam().base_log;
level_count = (int)GetParam().level_count;
cmux_tree_setup(stream, &csprng, &glwe_sk, &d_lut_identity, &plaintexts,
&d_ggsw_bit_array, &cmux_tree_buffer, &d_glwe_out,
glwe_dimension, polynomial_size, base_log, level_count,
glwe_modular_variance, p, tau, &delta_log, REPETITIONS,
SAMPLES, gpu_index);
// Value of the shift we multiply our messages by
delta = ((uint64_t)(1) << delta_log);
glwe_out = (uint64_t *)malloc(tau * (glwe_dimension + 1) * polynomial_size *
sizeof(uint64_t));
}
void TearDown() {
free(glwe_out);
cmux_tree_teardown(stream, &csprng, &glwe_sk, &d_lut_identity, &plaintexts,
&d_ggsw_bit_array, &cmux_tree_buffer, &d_glwe_out,
gpu_index);
}
};
TEST_P(CMUXTreeTestPrimitives_u64, cmux_tree) {
int ggsw_size = polynomial_size * (glwe_dimension + 1) *
(glwe_dimension + 1) * level_count;
int glwe_size = (glwe_dimension + 1) * polynomial_size;
uint32_t r_lut = 1;
if (tau * p > log2(polynomial_size)) {
r_lut = tau * p - log2(polynomial_size);
}
// Here execute the PBS
for (uint r = 0; r < REPETITIONS; r++) {
for (uint s = 0; s < SAMPLES; s++) {
uint64_t witness = plaintexts[r * SAMPLES + s];
uint64_t *d_ggsw_bit_array_slice =
d_ggsw_bit_array +
(ptrdiff_t)((r * SAMPLES * r_lut + s * r_lut) * ggsw_size);
// Execute CMUX tree
cuda_cmux_tree_64(stream, gpu_index, (void *)d_glwe_out,
(void *)d_ggsw_bit_array_slice, (void *)d_lut_identity,
cmux_tree_buffer, glwe_dimension, polynomial_size,
base_log, level_count, (1 << (tau * p)), tau,
cuda_get_max_shared_memory(gpu_index));
// Copy result back
cuda_memcpy_async_to_cpu(glwe_out, d_glwe_out,
tau * glwe_size * sizeof(uint64_t), stream,
gpu_index);
cuda_synchronize_stream(stream);
for (int tree = 0; tree < tau; tree++) {
uint64_t *result = glwe_out + tree * glwe_size;
uint64_t *decrypted =
(uint64_t *)malloc(polynomial_size * sizeof(uint64_t));
concrete_cpu_decrypt_glwe_ciphertext_u64(
glwe_sk, decrypted, result, glwe_dimension, polynomial_size);
// The bit before the message
uint64_t rounding_bit = delta >> 1;
// Compute the rounding bit
uint64_t rounding = (decrypted[0] & rounding_bit) << 1;
uint64_t decoded = (decrypted[0] + rounding) / delta;
EXPECT_EQ(decoded, witness % (1 << p))
<< "Repetition: " << r << ", sample: " << s << ", tree: " << tree;
free(decrypted);
}
}
}
cuda_synchronize_stream(stream);
}
// Defines the parameter sets for which the CMUX tree will be tested.
// Each test is executed once per parameter set listed below.
::testing::internal::ParamGenerator<CMUXTreeTestParams> cmux_tree_params_u64 =
::testing::Values(
// k, N, p, tau, glwe_variance, base_log, level_count
(CMUXTreeTestParams){2, 256, 3, 4, 2.9403601535432533e-16, 6, 3},
(CMUXTreeTestParams){2, 512, 4, 2, 2.9403601535432533e-16, 6, 3},
(CMUXTreeTestParams){1, 1024, 11, 1, 2.9403601535432533e-16, 6, 3});
std::string printParamName(::testing::TestParamInfo<CMUXTreeTestParams> p) {
CMUXTreeTestParams params = p.param;
return "k_" + std::to_string(params.glwe_dimension) + "_N_" +
std::to_string(params.polynomial_size) + "_tau_" +
std::to_string(params.tau) + "_p_" + std::to_string(params.p) +
"_base_log_" + std::to_string(params.base_log) + "_level_count_" +
std::to_string(params.level_count);
}
INSTANTIATE_TEST_CASE_P(CMUXTreeInstantiation, CMUXTreeTestPrimitives_u64,
cmux_tree_params_u64, printParamName);

View File

@@ -1,129 +0,0 @@
#include "concrete-cpu.h"
#include "utils.h"
#include "gtest/gtest.h"
#include <bootstrap.h>
#include <cstdint>
#include <device.h>
#include <functional>
#include <random>
#include <setup_and_teardown.h>
#include <stdio.h>
#include <stdlib.h>
typedef struct {
size_t polynomial_size;
int samples;
} FourierTransformTestParams;
class FourierTransformTestPrimitives_u64
: public ::testing::TestWithParam<FourierTransformTestParams> {
protected:
size_t polynomial_size;
int samples;
cudaStream_t *stream;
int gpu_index = 0;
double *poly1;
double *poly2; // will be used as extracted result for cuda mult
double *poly_exp_result;
double2 *h_cpoly1;
double2 *h_cpoly2; // will be used as a result poly
double2 *d_cpoly1;
double2 *d_cpoly2; // will be used as a result poly
public:
void SetUp() {
stream = cuda_create_stream(0);
// get test params
polynomial_size = (int)GetParam().polynomial_size;
samples = (int)GetParam().samples;
fft_setup(stream, &poly1, &poly2, &h_cpoly1, &h_cpoly2, &d_cpoly1,
&d_cpoly2, polynomial_size, samples, gpu_index);
// allocate memory
poly_exp_result =
(double *)malloc(polynomial_size * 2 * samples * sizeof(double));
memset(poly_exp_result, 0, polynomial_size * 2 * samples * sizeof(double));
// execute school book multiplication
for (size_t p = 0; p < (size_t)samples; p++) {
auto left = &poly1[p * polynomial_size];
auto right = &poly2[p * polynomial_size];
auto res = &poly_exp_result[p * polynomial_size * 2];
// multiplication
for (std::size_t i = 0; i < polynomial_size; ++i) {
for (std::size_t j = 0; j < polynomial_size; ++j) {
res[i + j] += left[i] * right[j];
}
}
// make result negacyclic
for (size_t i = 0; i < polynomial_size; i++) {
res[i] = res[i] - res[i + polynomial_size];
}
}
}
void TearDown() {
fft_teardown(stream, poly1, poly2, h_cpoly1, h_cpoly2, d_cpoly1, d_cpoly2,
gpu_index);
free(poly_exp_result);
}
};
TEST_P(FourierTransformTestPrimitives_u64, cuda_fft_mult) {
int r = 0;
auto cur_input1 = &d_cpoly1[r * polynomial_size / 2 * samples];
auto cur_input2 = &d_cpoly2[r * polynomial_size / 2 * samples];
auto cur_h_c_res = &h_cpoly2[r * polynomial_size / 2 * samples];
auto cur_poly2 = &poly2[r * polynomial_size * samples];
auto cur_expected = &poly_exp_result[r * polynomial_size * 2 * samples];
cuda_fourier_polynomial_mul(cur_input1, cur_input2, cur_input2, stream, 0,
polynomial_size, samples);
cuda_memcpy_async_to_cpu(cur_h_c_res, cur_input2,
polynomial_size / 2 * samples * sizeof(double2),
stream, gpu_index);
cuda_synchronize_stream(stream);
for (int p = 0; p < samples; p++) {
for (size_t i = 0; i < (size_t)polynomial_size / 2; i++) {
cur_poly2[p * polynomial_size + i] =
cur_h_c_res[p * polynomial_size / 2 + i].x;
cur_poly2[p * polynomial_size + i + polynomial_size / 2] =
cur_h_c_res[p * polynomial_size / 2 + i].y;
}
}
for (size_t p = 0; p < (size_t)samples; p++) {
for (size_t i = 0; i < (size_t)polynomial_size; i++) {
EXPECT_NEAR(cur_poly2[p * polynomial_size + i],
cur_expected[p * 2 * polynomial_size + i], 1e-9);
}
}
}
::testing::internal::ParamGenerator<FourierTransformTestParams> fft_params_u64 =
::testing::Values((FourierTransformTestParams){256, 100},
(FourierTransformTestParams){512, 100},
(FourierTransformTestParams){1024, 100},
(FourierTransformTestParams){2048, 100},
(FourierTransformTestParams){4096, 100},
(FourierTransformTestParams){8192, 50},
(FourierTransformTestParams){16384, 10});
std::string
printParamName(::testing::TestParamInfo<FourierTransformTestParams> p) {
FourierTransformTestParams params = p.param;
return "N_" + std::to_string(params.polynomial_size) + "_samples_" +
std::to_string(params.samples);
}
INSTANTIATE_TEST_CASE_P(fftInstantiation, FourierTransformTestPrimitives_u64,
fft_params_u64, printParamName);

View File

@@ -1,266 +0,0 @@
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <functional>
#include <gtest/gtest.h>
#include <setup_and_teardown.h>
#include <utils.h>
const bool USE_MULTI_GPU = false;
typedef struct {
int lwe_dimension;
int glwe_dimension;
int polynomial_size;
double lwe_modular_variance;
double glwe_modular_variance;
int pbs_base_log;
int pbs_level;
int ksk_base_log;
int ksk_level;
int total_message_bits;
int number_of_blocks;
int message_modulus;
int carry_modulus;
int repetitions;
int samples;
PBS_TYPE pbs_type;
} IntegerMultiplicationTestParams;
class IntegerMultiplicationTestPrimitives_u64
: public ::testing::TestWithParam<IntegerMultiplicationTestParams> {
protected:
int lwe_dimension;
int glwe_dimension;
int polynomial_size;
double lwe_modular_variance;
double glwe_modular_variance;
int pbs_base_log;
int pbs_level;
int ksk_base_log;
int ksk_level;
int total_message_bits;
int number_of_blocks;
int message_modulus;
int carry_modulus;
int repetitions;
int samples;
PBS_TYPE pbs_type;
Csprng *csprng;
cudaStream_t *stream;
int gpu_index = 0;
uint64_t delta;
uint64_t *lwe_sk_in_array;
uint64_t *lwe_sk_out_array;
uint64_t *plaintexts_1;
uint64_t *plaintexts_2;
uint64_t *expected;
void *d_bsk_array;
uint64_t *d_ksk_array;
uint64_t *d_lwe_ct_in_array_1;
uint64_t *d_lwe_ct_in_array_2;
uint64_t *d_lwe_ct_out_array;
int_mul_memory<uint64_t> *mem_ptr;
public:
// Test arithmetic functions
void SetUp() {
// TestParams
lwe_dimension = (int)GetParam().lwe_dimension;
glwe_dimension = (int)GetParam().glwe_dimension;
polynomial_size = (int)GetParam().polynomial_size;
lwe_modular_variance = (double)GetParam().lwe_modular_variance;
glwe_modular_variance = (double)GetParam().glwe_modular_variance;
pbs_base_log = (int)GetParam().pbs_base_log;
pbs_level = (int)GetParam().pbs_level;
ksk_base_log = (int)GetParam().ksk_base_log;
ksk_level = (int)GetParam().ksk_level;
total_message_bits = (int)GetParam().total_message_bits;
number_of_blocks = (int)GetParam().number_of_blocks;
message_modulus = (int)GetParam().message_modulus;
carry_modulus = (int)GetParam().carry_modulus;
repetitions = (int)GetParam().repetitions;
samples = (int)GetParam().samples;
pbs_type = (PBS_TYPE)GetParam().pbs_type;
mem_ptr = new int_mul_memory<uint64_t>;
stream = cuda_create_stream(gpu_index);
integer_multiplication_setup(
stream, &csprng, &lwe_sk_in_array, &lwe_sk_out_array, &d_bsk_array,
&d_ksk_array, &plaintexts_1, &plaintexts_2, &d_lwe_ct_in_array_1,
&d_lwe_ct_in_array_2, &d_lwe_ct_out_array, mem_ptr, lwe_dimension,
glwe_dimension, polynomial_size, lwe_modular_variance,
glwe_modular_variance, pbs_base_log, pbs_level, ksk_base_log, ksk_level,
total_message_bits, number_of_blocks, message_modulus, carry_modulus,
&delta, repetitions, samples, pbs_type, gpu_index);
expected = (uint64_t *)malloc(repetitions * samples * number_of_blocks *
sizeof(uint64_t));
for (int r = 0; r < repetitions; r++) {
for (int s = 0; s < samples; s++) {
uint64_t message_1 = 0;
uint64_t message_2 = 0;
for (int i = 0; i < number_of_blocks; i++) {
message_1 += std::pow(message_modulus, i) *
plaintexts_1[r * samples * number_of_blocks +
s * number_of_blocks + i] /
delta;
message_2 += std::pow(message_modulus, i) *
plaintexts_2[r * samples * number_of_blocks +
s * number_of_blocks + i] /
delta;
}
uint64_t expected_result =
(message_1 * message_2) % ((uint64_t)1 << total_message_bits);
for (int i = number_of_blocks - 1; i >= 0; i--) {
uint64_t coef = expected_result / std::pow(message_modulus, i);
// Index per (repetition, sample) pair so earlier results are not overwritten
expected[r * samples * number_of_blocks + s * number_of_blocks + i] = coef;
expected_result -= coef * std::pow(message_modulus, i);
}
}
}
}
void TearDown() {
free(expected);
integer_multiplication_teardown(
stream, csprng, lwe_sk_in_array, lwe_sk_out_array, d_bsk_array,
d_ksk_array, plaintexts_1, plaintexts_2, d_lwe_ct_in_array_1,
d_lwe_ct_in_array_2, d_lwe_ct_out_array, mem_ptr);
cuda_synchronize_stream(stream);
cuda_destroy_stream(stream, gpu_index);
}
};
TEST_P(IntegerMultiplicationTestPrimitives_u64, integer_multiplication) {
int bsk_size = (glwe_dimension + 1) * (glwe_dimension + 1) * pbs_level *
polynomial_size * (lwe_dimension + 1);
int ksk_size =
ksk_level * (lwe_dimension + 1) * glwe_dimension * polynomial_size;
uint64_t *lwe_ct_out_array =
(uint64_t *)malloc((glwe_dimension * polynomial_size + 1) *
number_of_blocks * sizeof(uint64_t));
uint64_t *decrypted = (uint64_t *)malloc(number_of_blocks * sizeof(uint64_t));
for (int r = 0; r < repetitions; r++) {
void *d_bsk = d_bsk_array + (ptrdiff_t)(bsk_size * r);
uint64_t *d_ksk = d_ksk_array + (ptrdiff_t)(ksk_size * r);
uint64_t *lwe_sk =
lwe_sk_in_array + (ptrdiff_t)(glwe_dimension * polynomial_size * r);
for (int s = 0; s < samples; s++) {
uint64_t *d_lwe_ct_in_1 =
d_lwe_ct_in_array_1 +
(ptrdiff_t)((r * samples * number_of_blocks + s * number_of_blocks) *
(glwe_dimension * polynomial_size + 1));
uint64_t *d_lwe_ct_in_2 =
d_lwe_ct_in_array_2 +
(ptrdiff_t)((r * samples * number_of_blocks + s * number_of_blocks) *
(glwe_dimension * polynomial_size + 1));
uint32_t ct_degree_out = 0;
uint32_t ct_degree_left = 0;
uint32_t ct_degree_right = 0;
int8_t *mult_buffer = NULL;
// Execute integer mult
if (USE_MULTI_GPU) {
scratch_cuda_integer_mult_radix_ciphertext_kb_64_multi_gpu(
mem_ptr, d_bsk, d_ksk, message_modulus, carry_modulus,
glwe_dimension, lwe_dimension, polynomial_size, pbs_base_log,
pbs_level, ksk_base_log, ksk_level, number_of_blocks, pbs_type,
cuda_get_max_shared_memory(gpu_index), true);
cuda_integer_mult_radix_ciphertext_kb_64_multi_gpu(
d_lwe_ct_out_array, d_lwe_ct_in_1, d_lwe_ct_in_2, &ct_degree_out,
&ct_degree_left, &ct_degree_right, d_bsk, d_ksk, (void *)mem_ptr,
message_modulus, carry_modulus, glwe_dimension, lwe_dimension,
polynomial_size, pbs_base_log, pbs_level, ksk_base_log, ksk_level,
number_of_blocks, pbs_type, cuda_get_max_shared_memory(gpu_index));
} else {
scratch_cuda_integer_mult_radix_ciphertext_kb_64(
stream, gpu_index, (void *)mem_ptr, message_modulus, carry_modulus,
glwe_dimension, lwe_dimension, polynomial_size, pbs_base_log,
pbs_level, ksk_base_log, ksk_level, number_of_blocks, pbs_type,
cuda_get_max_shared_memory(gpu_index), true);
cuda_integer_mult_radix_ciphertext_kb_64(
stream, gpu_index, d_lwe_ct_out_array, d_lwe_ct_in_1, d_lwe_ct_in_2,
&ct_degree_out, &ct_degree_left, &ct_degree_right, d_bsk, d_ksk,
(void *)mem_ptr, message_modulus, carry_modulus, glwe_dimension,
lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
ksk_base_log, ksk_level, number_of_blocks, pbs_type,
cuda_get_max_shared_memory(gpu_index));
}
cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array,
(glwe_dimension * polynomial_size + 1) *
number_of_blocks * sizeof(uint64_t),
stream, gpu_index);
// Wait for the asynchronous copy to finish before decrypting on the host
cuda_synchronize_stream(stream);
// Process result
decrypt_integer_u64_blocks(lwe_ct_out_array, lwe_sk, &decrypted,
glwe_dimension * polynomial_size,
number_of_blocks, delta, message_modulus);
for (int i = 0; i < number_of_blocks; i++) {
ASSERT_EQ(decrypted[i], expected[r * samples * number_of_blocks +
s * number_of_blocks + i])
<< "Repetition: " << r << ", sample: " << s;
}
}
}
free(lwe_ct_out_array);
free(decrypted);
}
// Defines the parameter sets for which the integer multiplication will be
// tested.
::testing::internal::ParamGenerator<IntegerMultiplicationTestParams>
integer_mult_params_u64 = ::testing::Values(
// n, k, N, lwe_variance, glwe_variance, pbs_base_log, pbs_level,
// ksk_base_log, ksk_level,
// total_message_bits, number_of_blocks, message_modulus,
// carry_modulus, repetitions, samples
// SHORTINT_PARAM_MESSAGE_2_CARRY_2
// The total number of bits of message should not exceed 64 to be
// able to use a uint64_t representation for the result calculation
// in clear
(IntegerMultiplicationTestParams){744, 1, 2048, 4.478453795193731e-11,
8.645717832544903e-32, 23, 1, 3, 5, 4,
2, 4, 4, 1, 1, MULTI_BIT},
(IntegerMultiplicationTestParams){744, 1, 2048, 4.478453795193731e-11,
8.645717832544903e-32, 23, 1, 3, 5, 4,
2, 4, 4, 1, 1, LOW_LAT},
(IntegerMultiplicationTestParams){744, 1, 2048, 4.478453795193731e-11,
8.645717832544903e-32, 23, 1, 3, 5, 4,
2, 4, 4, 1, 1, AMORTIZED});
std::string
printParamName(::testing::TestParamInfo<IntegerMultiplicationTestParams> p) {
IntegerMultiplicationTestParams params = p.param;
const char *pbs_type;
switch (params.pbs_type) {
case 0:
pbs_type = "MULTIBIT";
break;
case 1:
pbs_type = "LOW_LAT";
break;
case 2:
pbs_type = "AMORTIZED";
break;
default:
pbs_type = "Unknown";
}
return "n_" + std::to_string(params.lwe_dimension) + "_k_" +
std::to_string(params.glwe_dimension) + "_N_" +
std::to_string(params.polynomial_size) + "_pbs_base_log_" +
std::to_string(params.pbs_base_log) + "_pbs_level_" +
std::to_string(params.pbs_level) + "_number_of_blocks_" +
std::to_string(params.number_of_blocks) + "_message_modulus_" +
std::to_string(params.message_modulus) + "_carry_modulus_" +
std::to_string(params.carry_modulus) + "_" +
pbs_type;
}
INSTANTIATE_TEST_CASE_P(IntegerMultiplicationInstantiation,
IntegerMultiplicationTestPrimitives_u64,
integer_mult_params_u64, printParamName);

View File

@@ -1,149 +0,0 @@
#include <cstdint>
#include <gtest/gtest.h>
#include <setup_and_teardown.h>
#include <stdio.h>
#include <stdlib.h>
const unsigned REPETITIONS = 2;
const unsigned SAMPLES = 50;
typedef struct {
int input_lwe_dimension;
int output_lwe_dimension;
double noise_variance;
int ksk_base_log;
int ksk_level;
int message_modulus;
int carry_modulus;
int number_of_inputs;
} KeyswitchTestParams;
class KeyswitchTestPrimitives_u64
: public ::testing::TestWithParam<KeyswitchTestParams> {
protected:
int input_lwe_dimension;
int output_lwe_dimension;
double noise_variance;
int ksk_base_log;
int ksk_level;
int message_modulus;
int carry_modulus;
int number_of_inputs;
int payload_modulus;
uint64_t delta;
Csprng *csprng;
cudaStream_t *stream;
int gpu_index = 0;
uint64_t *lwe_sk_in_array;
uint64_t *lwe_sk_out_array;
uint64_t *plaintexts;
uint64_t *d_ksk_array;
uint64_t *d_lwe_ct_out_array;
uint64_t *d_lwe_ct_in_array;
uint64_t *lwe_in_ct;
uint64_t *lwe_out_ct;
public:
// Test arithmetic functions
void SetUp() {
stream = cuda_create_stream(0);
// TestParams
input_lwe_dimension = (int)GetParam().input_lwe_dimension;
output_lwe_dimension = (int)GetParam().output_lwe_dimension;
noise_variance = (double)GetParam().noise_variance;
ksk_base_log = (int)GetParam().ksk_base_log;
ksk_level = (int)GetParam().ksk_level;
message_modulus = (int)GetParam().message_modulus;
carry_modulus = (int)GetParam().carry_modulus;
number_of_inputs = (int)GetParam().number_of_inputs;
keyswitch_setup(stream, &csprng, &lwe_sk_in_array, &lwe_sk_out_array,
&d_ksk_array, &plaintexts, &d_lwe_ct_in_array,
&d_lwe_ct_out_array, input_lwe_dimension,
output_lwe_dimension, noise_variance, ksk_base_log,
ksk_level, message_modulus, carry_modulus, &payload_modulus,
&delta, number_of_inputs, REPETITIONS, SAMPLES, gpu_index);
}
void TearDown() {
keyswitch_teardown(stream, csprng, lwe_sk_in_array, lwe_sk_out_array,
d_ksk_array, plaintexts, d_lwe_ct_in_array,
d_lwe_ct_out_array, gpu_index);
}
};
TEST_P(KeyswitchTestPrimitives_u64, keyswitch) {
uint64_t *lwe_out_ct = (uint64_t *)malloc(
(output_lwe_dimension + 1) * number_of_inputs * sizeof(uint64_t));
for (uint r = 0; r < REPETITIONS; r++) {
uint64_t *lwe_out_sk =
lwe_sk_out_array + (ptrdiff_t)(r * output_lwe_dimension);
int ksk_size = ksk_level * (output_lwe_dimension + 1) * input_lwe_dimension;
uint64_t *d_ksk = d_ksk_array + (ptrdiff_t)(ksk_size * r);
for (uint s = 0; s < SAMPLES; s++) {
uint64_t *d_lwe_ct_in =
d_lwe_ct_in_array +
(ptrdiff_t)((r * SAMPLES * number_of_inputs + s * number_of_inputs) *
(input_lwe_dimension + 1));
// Execute keyswitch
cuda_keyswitch_lwe_ciphertext_vector_64(
stream, gpu_index, (void *)d_lwe_ct_out_array, (void *)d_lwe_ct_in,
(void *)d_ksk, input_lwe_dimension, output_lwe_dimension,
ksk_base_log, ksk_level, number_of_inputs);
// Copy result back
cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_ct_out_array,
number_of_inputs * (output_lwe_dimension + 1) *
sizeof(uint64_t),
stream, gpu_index);
// Wait for the asynchronous copy to finish before decrypting on the host
cuda_synchronize_stream(stream);
for (int i = 0; i < number_of_inputs; i++) {
uint64_t plaintext = plaintexts[r * SAMPLES * number_of_inputs +
s * number_of_inputs + i];
uint64_t decrypted = 0;
concrete_cpu_decrypt_lwe_ciphertext_u64(
lwe_out_sk, lwe_out_ct + i * (output_lwe_dimension + 1),
output_lwe_dimension, &decrypted);
// The raw decryption still carries noise, so it must differ from the exact
// plaintext
EXPECT_NE(decrypted, plaintext);
// The bit before the message
uint64_t rounding_bit = delta >> 1;
// Compute the rounding bit
uint64_t rounding = (decrypted & rounding_bit) << 1;
uint64_t decoded = (decrypted + rounding) / delta;
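// Illustrative example (values chosen for this sketch, not from the
// original test): with delta = 1 << 60, a noisy decryption of
// 3 * delta + e with |e| < delta / 2 has its noise cleared by the rounding
// step above and decodes back to 3.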
EXPECT_EQ(decoded, plaintext / delta);
}
}
}
free(lwe_out_ct);
}
// Defines the parameter sets for which the keyswitch will be tested.
::testing::internal::ParamGenerator<KeyswitchTestParams> ksk_params_u64 =
::testing::Values(
// n, k*N, noise_variance, ks_base_log, ks_level,
// message_modulus, carry_modulus, number_of_inputs
(KeyswitchTestParams){567, 1280, 2.9802322387695312e-18, 3, 3, 2, 1,
10},
(KeyswitchTestParams){694, 1536, 2.9802322387695312e-18, 4, 3, 2, 1,
10},
(KeyswitchTestParams){769, 2048, 2.9802322387695312e-18, 4, 3, 2, 1,
10},
(KeyswitchTestParams){754, 2048, 2.9802322387695312e-18, 3, 5, 2, 1,
10},
(KeyswitchTestParams){847, 4096, 2.9802322387695312e-18, 4, 4, 2, 1,
10},
(KeyswitchTestParams){881, 8192, 2.9802322387695312e-18, 3, 6, 2, 1,
10});
std::string printParamName(::testing::TestParamInfo<KeyswitchTestParams> p) {
KeyswitchTestParams params = p.param;
return "na_" + std::to_string(params.input_lwe_dimension) + "_nb_" +
std::to_string(params.output_lwe_dimension) + "_baselog_" +
std::to_string(params.ksk_base_log) + "_ksk_level_" +
std::to_string(params.ksk_level);
}
INSTANTIATE_TEST_CASE_P(KeyswitchInstantiation, KeyswitchTestPrimitives_u64,
ksk_params_u64, printParamName);

View File

@@ -1,269 +0,0 @@
#include <cstdint>
#include <gtest/gtest.h>
#include <setup_and_teardown.h>
#include <stdio.h>
#include <stdlib.h>
const unsigned REPETITIONS = 5;
const unsigned SAMPLES = 100;
typedef struct {
int lwe_dimension;
double noise_variance;
int message_modulus;
int carry_modulus;
int number_of_inputs;
} LinearAlgebraTestParams;
class LinearAlgebraTestPrimitives_u64
: public ::testing::TestWithParam<LinearAlgebraTestParams> {
protected:
int lwe_dimension;
double noise_variance;
int message_modulus;
int carry_modulus;
int number_of_inputs;
int payload_modulus;
uint64_t delta;
Csprng *csprng;
cudaStream_t *stream;
int gpu_index = 0;
uint64_t *lwe_sk_array;
uint64_t *d_lwe_in_1_ct;
uint64_t *d_lwe_in_2_ct;
uint64_t *d_plaintext_2;
uint64_t *d_cleartext;
uint64_t *d_lwe_out_ct;
uint64_t *lwe_in_1_ct;
uint64_t *lwe_in_2_ct;
uint64_t *lwe_out_ct;
uint64_t *plaintexts_1;
uint64_t *plaintexts_2;
int num_samples;
public:
// Test arithmetic functions
void SetUp() {
stream = cuda_create_stream(0);
// TestParams
lwe_dimension = (int)GetParam().lwe_dimension;
noise_variance = (double)GetParam().noise_variance;
message_modulus = (int)GetParam().message_modulus;
carry_modulus = (int)GetParam().carry_modulus;
number_of_inputs = (int)GetParam().number_of_inputs;
payload_modulus = message_modulus * carry_modulus;
// Value of the shift we multiply our messages by
// In this test we use a smaller delta to avoid an overflow during
// multiplication
delta =
((uint64_t)(1) << 63) / (uint64_t)(payload_modulus * payload_modulus);
linear_algebra_setup(stream, &csprng, &lwe_sk_array, &d_lwe_in_1_ct,
&d_lwe_in_2_ct, &d_lwe_out_ct, &lwe_in_1_ct,
&lwe_in_2_ct, &lwe_out_ct, &plaintexts_1,
&plaintexts_2, &d_plaintext_2, &d_cleartext,
lwe_dimension, noise_variance, payload_modulus, delta,
number_of_inputs, REPETITIONS, SAMPLES, gpu_index);
}
void TearDown() {
linear_algebra_teardown(
stream, &csprng, &lwe_sk_array, &d_lwe_in_1_ct, &d_lwe_in_2_ct,
&d_lwe_out_ct, &lwe_in_1_ct, &lwe_in_2_ct, &lwe_out_ct, &plaintexts_1,
&plaintexts_2, &d_plaintext_2, &d_cleartext, gpu_index);
}
};
TEST_P(LinearAlgebraTestPrimitives_u64, addition) {
void *v_stream = (void *)stream;
// Here execute the addition
for (uint r = 0; r < REPETITIONS; r++) {
uint64_t *lwe_sk = lwe_sk_array + (ptrdiff_t)(r * lwe_dimension);
for (uint s = 0; s < SAMPLES; s++) {
uint64_t *d_lwe_1_in =
d_lwe_in_1_ct +
(ptrdiff_t)((r * SAMPLES * number_of_inputs + s * number_of_inputs) *
(lwe_dimension + 1));
uint64_t *d_lwe_2_in =
d_lwe_in_2_ct +
(ptrdiff_t)((r * SAMPLES * number_of_inputs + s * number_of_inputs) *
(lwe_dimension + 1));
// Execute addition
cuda_add_lwe_ciphertext_vector_64(stream, gpu_index, (void *)d_lwe_out_ct,
(void *)d_lwe_1_in, (void *)d_lwe_2_in,
lwe_dimension, number_of_inputs);
// Copy result back
cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct,
number_of_inputs * (lwe_dimension + 1) *
sizeof(uint64_t),
stream, gpu_index);
cuda_synchronize_stream(v_stream);
for (int i = 0; i < number_of_inputs; i++) {
uint64_t plaintext_1 = plaintexts_1[r * SAMPLES * number_of_inputs +
s * number_of_inputs + i];
uint64_t plaintext_2 = plaintexts_2[r * SAMPLES * number_of_inputs +
s * number_of_inputs + i];
uint64_t decrypted = 0;
concrete_cpu_decrypt_lwe_ciphertext_u64(
lwe_sk, lwe_out_ct + i * (lwe_dimension + 1), lwe_dimension,
&decrypted);
// The bit before the message
uint64_t rounding_bit = delta >> 1;
// Compute the rounding bit
uint64_t rounding = (decrypted & rounding_bit) << 1;
uint64_t decoded = (decrypted + rounding) / delta;
EXPECT_EQ(decoded, (plaintext_1 + plaintext_2) / delta)
<< "Repetition: " << r << ", sample: " << s;
}
}
}
}
TEST_P(LinearAlgebraTestPrimitives_u64, plaintext_addition) {
// Here execute the plaintext addition
for (uint r = 0; r < REPETITIONS; r++) {
uint64_t *lwe_sk = lwe_sk_array + (ptrdiff_t)(r * lwe_dimension);
for (uint s = 0; s < SAMPLES; s++) {
uint64_t *d_lwe_1_slice =
d_lwe_in_1_ct +
(ptrdiff_t)((r * SAMPLES * number_of_inputs + s * number_of_inputs) *
(lwe_dimension + 1));
uint64_t *d_plaintext_2_in =
d_plaintext_2 +
(ptrdiff_t)((r * SAMPLES * number_of_inputs + s * number_of_inputs));
// Execute addition
cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_1_slice,
(void *)d_plaintext_2_in, lwe_dimension, number_of_inputs);
// Copy result back
cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct,
number_of_inputs * (lwe_dimension + 1) *
sizeof(uint64_t),
stream, gpu_index);
// Wait for the asynchronous copy to finish before decrypting on the host
cuda_synchronize_stream(stream);
for (int i = 0; i < number_of_inputs; i++) {
uint64_t plaintext_1 = plaintexts_1[r * SAMPLES * number_of_inputs +
s * number_of_inputs + i];
uint64_t plaintext_2 = plaintexts_2[r * SAMPLES * number_of_inputs +
s * number_of_inputs + i];
uint64_t decrypted = 0;
concrete_cpu_decrypt_lwe_ciphertext_u64(
lwe_sk, lwe_out_ct + i * (lwe_dimension + 1), lwe_dimension,
&decrypted);
// The bit before the message
uint64_t rounding_bit = delta >> 1;
// Compute the rounding bit
uint64_t rounding = (decrypted & rounding_bit) << 1;
uint64_t decoded = (decrypted + rounding) / delta;
EXPECT_EQ(decoded, (plaintext_1 + plaintext_2) / delta)
<< "Repetition: " << r << ", sample: " << s << " i: " << i << ") "
<< plaintext_1 / delta << " + " << plaintext_2 / delta;
}
}
}
}
TEST_P(LinearAlgebraTestPrimitives_u64, cleartext_multiplication) {
void *v_stream = (void *)stream;
// Here execute the cleartext multiplication
for (uint r = 0; r < REPETITIONS; r++) {
uint64_t *lwe_sk = lwe_sk_array + (ptrdiff_t)(r * lwe_dimension);
for (uint s = 0; s < SAMPLES; s++) {
uint64_t *d_lwe_1_slice =
d_lwe_in_1_ct +
(ptrdiff_t)((r * SAMPLES * number_of_inputs + s * number_of_inputs) *
(lwe_dimension + 1));
uint64_t *d_cleartext_in =
d_cleartext +
(ptrdiff_t)((r * SAMPLES * number_of_inputs + s * number_of_inputs));
// Execute cleartext multiplication
cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_1_slice,
(void *)d_cleartext_in, lwe_dimension, number_of_inputs);
// Copy result back
cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct,
number_of_inputs * (lwe_dimension + 1) *
sizeof(uint64_t),
stream, gpu_index);
cuda_synchronize_stream(v_stream);
for (int i = 0; i < number_of_inputs; i++) {
uint64_t cleartext_1 = plaintexts_1[r * SAMPLES * number_of_inputs +
s * number_of_inputs + i] /
delta;
uint64_t cleartext_2 = plaintexts_2[r * SAMPLES * number_of_inputs +
s * number_of_inputs + i] /
delta;
uint64_t decrypted = 0;
concrete_cpu_decrypt_lwe_ciphertext_u64(
lwe_sk, lwe_out_ct + i * (lwe_dimension + 1), lwe_dimension,
&decrypted);
// The bit before the message
uint64_t rounding_bit = delta >> 1;
// Compute the rounding bit
uint64_t rounding = (decrypted & rounding_bit) << 1;
uint64_t decoded = (decrypted + rounding) / delta;
EXPECT_EQ(decoded, cleartext_1 * cleartext_2)
<< "Repetition: " << r << ", sample: " << s << " i: " << i
<< ", decrypted: " << decrypted;
}
}
}
}
TEST_P(LinearAlgebraTestPrimitives_u64, negate) {
// Here execute the negation
for (uint r = 0; r < REPETITIONS; r++) {
uint64_t *lwe_sk = lwe_sk_array + (ptrdiff_t)(r * lwe_dimension);
for (uint s = 0; s < SAMPLES; s++) {
uint64_t *d_lwe_1_slice =
d_lwe_in_1_ct +
(ptrdiff_t)((r * SAMPLES * number_of_inputs + s * number_of_inputs) *
(lwe_dimension + 1));
// Execute negate
cuda_negate_lwe_ciphertext_vector_64(
stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_1_slice,
lwe_dimension, number_of_inputs);
// Copy result back
cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct,
number_of_inputs * (lwe_dimension + 1) *
sizeof(uint64_t),
stream, gpu_index);
// Wait for the asynchronous copy to finish before decrypting on the host
cuda_synchronize_stream(stream);
for (int i = 0; i < number_of_inputs; i++) {
uint64_t plaintext = plaintexts_1[r * SAMPLES * number_of_inputs +
s * number_of_inputs + i];
uint64_t decrypted = 0;
concrete_cpu_decrypt_lwe_ciphertext_u64(
lwe_sk, lwe_out_ct + i * (lwe_dimension + 1), lwe_dimension,
&decrypted);
// The bit before the message
uint64_t rounding_bit = delta >> 1;
// Compute the rounding bit
uint64_t rounding = (decrypted & rounding_bit) << 1;
uint64_t decoded = (decrypted + rounding) / delta;
EXPECT_EQ(decoded, -plaintext / delta)
<< "Repetition: " << r << ", sample: " << s << " i: " << i;
}
}
}
}
// Defines the parameter sets for which the linear algebra operations will be
// tested.
::testing::internal::ParamGenerator<LinearAlgebraTestParams>
linear_algebra_params_u64 = ::testing::Values(
// n, lwe_noise_variance, message_modulus, carry_modulus, number_of_inputs
(LinearAlgebraTestParams){600, 7.52316384526264e-37, 2, 2, 10});
std::string
printParamName(::testing::TestParamInfo<LinearAlgebraTestParams> p) {
LinearAlgebraTestParams params = p.param;
return "n_" + std::to_string(params.lwe_dimension);
}
INSTANTIATE_TEST_CASE_P(LinearAlgebraInstantiation,
LinearAlgebraTestPrimitives_u64,
linear_algebra_params_u64, printParamName);

View File

@@ -1,211 +0,0 @@
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <functional>
#include <gtest/gtest.h>
#include <setup_and_teardown.h>
#include <utils.h>
typedef struct {
int lwe_dimension;
int glwe_dimension;
int polynomial_size;
double lwe_modular_variance;
double glwe_modular_variance;
int pbs_base_log;
int pbs_level;
int message_modulus;
int carry_modulus;
int number_of_inputs;
int grouping_factor;
int repetitions;
int samples;
} MultiBitBootstrapTestParams;
class MultiBitBootstrapTestPrimitives_u64
: public ::testing::TestWithParam<MultiBitBootstrapTestParams> {
protected:
int lwe_dimension;
int glwe_dimension;
int polynomial_size;
double lwe_modular_variance;
double glwe_modular_variance;
int pbs_base_log;
int pbs_level;
int message_modulus;
int carry_modulus;
int payload_modulus;
int number_of_inputs;
int grouping_factor;
uint64_t delta;
Csprng *csprng;
cudaStream_t *stream;
int gpu_index = 0;
uint64_t *lwe_sk_in_array;
uint64_t *lwe_sk_out_array;
uint64_t *plaintexts;
uint64_t *d_bsk_array;
uint64_t *d_lut_pbs_identity;
uint64_t *d_lut_pbs_indexes;
uint64_t *d_lwe_ct_in_array;
uint64_t *d_lwe_ct_out_array;
uint64_t *lwe_ct_out_array;
int8_t *pbs_buffer;
int repetitions;
int samples;
public:
void SetUp() {
stream = cuda_create_stream(0);
// TestParams
lwe_dimension = (int)GetParam().lwe_dimension;
glwe_dimension = (int)GetParam().glwe_dimension;
polynomial_size = (int)GetParam().polynomial_size;
grouping_factor = (int)GetParam().grouping_factor;
lwe_modular_variance = (double)GetParam().lwe_modular_variance;
glwe_modular_variance = (double)GetParam().glwe_modular_variance;
pbs_base_log = (int)GetParam().pbs_base_log;
pbs_level = (int)GetParam().pbs_level;
message_modulus = (int)GetParam().message_modulus;
carry_modulus = (int)GetParam().carry_modulus;
number_of_inputs = (int)GetParam().number_of_inputs;
repetitions = (int)GetParam().repetitions;
samples = (int)GetParam().samples;
bootstrap_multibit_setup(
stream, &csprng, &lwe_sk_in_array, &lwe_sk_out_array, &d_bsk_array,
&plaintexts, &d_lut_pbs_identity, &d_lut_pbs_indexes,
&d_lwe_ct_in_array, &d_lwe_ct_out_array, &pbs_buffer, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, lwe_modular_variance,
glwe_modular_variance, pbs_base_log, pbs_level, message_modulus,
carry_modulus, &payload_modulus, &delta, number_of_inputs, repetitions,
samples, gpu_index);
lwe_ct_out_array =
(uint64_t *)malloc((glwe_dimension * polynomial_size + 1) *
number_of_inputs * sizeof(uint64_t));
}
void TearDown() {
free(lwe_ct_out_array);
bootstrap_multibit_teardown(
stream, csprng, lwe_sk_in_array, lwe_sk_out_array, d_bsk_array,
plaintexts, d_lut_pbs_identity, d_lut_pbs_indexes, d_lwe_ct_in_array,
d_lwe_ct_out_array, &pbs_buffer, gpu_index);
}
};
TEST_P(MultiBitBootstrapTestPrimitives_u64, multi_bit_pbs) {
int bsk_size = (lwe_dimension / grouping_factor) * pbs_level *
(glwe_dimension + 1) * (glwe_dimension + 1) * polynomial_size *
(1 << grouping_factor);
for (int r = 0; r < repetitions; r++) {
uint64_t *d_bsk = d_bsk_array + (ptrdiff_t)(bsk_size * r);
uint64_t *lwe_sk_out =
lwe_sk_out_array + (ptrdiff_t)(r * glwe_dimension * polynomial_size);
for (int s = 0; s < samples; s++) {
uint64_t *d_lwe_ct_in =
d_lwe_ct_in_array +
(ptrdiff_t)((r * samples * number_of_inputs + s * number_of_inputs) *
(lwe_dimension + 1));
// Execute PBS
cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
stream, gpu_index, (void *)d_lwe_ct_out_array,
(void *)d_lut_pbs_identity, (void *)d_lut_pbs_indexes,
(void *)d_lwe_ct_in, (void *)d_bsk, pbs_buffer, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, pbs_base_log,
pbs_level, number_of_inputs, 1, 0,
cuda_get_max_shared_memory(gpu_index));
// Copy result to the host memory
cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array,
(glwe_dimension * polynomial_size + 1) *
number_of_inputs * sizeof(uint64_t),
stream, gpu_index);
// Wait for the asynchronous copy to finish before decrypting on the host
cuda_synchronize_stream(stream);
for (int j = 0; j < number_of_inputs; j++) {
uint64_t *result =
lwe_ct_out_array +
(ptrdiff_t)(j * (glwe_dimension * polynomial_size + 1));
uint64_t plaintext = plaintexts[r * samples * number_of_inputs +
s * number_of_inputs + j];
uint64_t decrypted = 0;
concrete_cpu_decrypt_lwe_ciphertext_u64(
lwe_sk_out, result, glwe_dimension * polynomial_size, &decrypted);
EXPECT_NE(decrypted, plaintext)
<< "Repetition: " << r << ", sample: " << s << ", input: " << j;
// The bit before the message
uint64_t rounding_bit = delta >> 1;
// Compute the rounding bit
uint64_t rounding = (decrypted & rounding_bit) << 1;
uint64_t decoded = (decrypted + rounding) / delta;
EXPECT_EQ(decoded, plaintext / delta)
<< "Repetition: " << r << ", sample: " << s << ", input: " << j;
}
}
}
// cleanup_cuda_multi_bit_pbs(stream, gpu_index, &pbs_buffer);
}
// Defines the parameter sets for which the multi-bit PBS will be tested.
::testing::internal::ParamGenerator<MultiBitBootstrapTestParams>
multipbs_params_u64 = ::testing::Values(
// fast test
(MultiBitBootstrapTestParams){16, 1, 256, 1.3880686109937e-11,
1.1919984450689246e-23, 23, 1, 2, 2, 1, 2,
1, 2},
(MultiBitBootstrapTestParams){16, 1, 256, 1.3880686109937e-11,
1.1919984450689246e-23, 23, 1, 2, 2, 128,
2, 1, 2},
// 4_bits_multi_bit_group_2
(MultiBitBootstrapTestParams){818, 1, 2048, 1.3880686109937e-11,
1.1919984450689246e-23, 22, 1, 2, 2, 1, 2,
1, 1},
(MultiBitBootstrapTestParams){818, 1, 2048, 1.3880686109937e-15,
1.1919984450689246e-24, 22, 1, 2, 2, 128,
2, 1, 1},
// 4_bits_multi_bit_group_3
(MultiBitBootstrapTestParams){888, 1, 2048, 4.9571231961752025e-12,
9.9409770026944e-32, 21, 1, 2, 2, 1, 3, 1,
1},
(MultiBitBootstrapTestParams){888, 1, 2048, 4.9571231961752025e-12,
9.9409770026944e-32, 21, 1, 2, 2, 128, 3,
1, 1},
(MultiBitBootstrapTestParams){742, 1, 2048, 4.9571231961752025e-12,
9.9409770026944e-32, 23, 1, 2, 2, 128, 2,
1, 1},
(MultiBitBootstrapTestParams){744, 1, 2048, 4.9571231961752025e-12,
9.9409770026944e-32, 23, 1, 2, 2, 1, 3, 1,
1},
(MultiBitBootstrapTestParams){744, 1, 2048, 4.9571231961752025e-12,
9.9409770026944e-32, 23, 1, 2, 2, 5, 3, 1,
1},
(MultiBitBootstrapTestParams){744, 1, 2048, 4.9571231961752025e-12,
9.9409770026944e-32, 23, 1, 2, 2, 128, 3,
1, 1});
std::string
printParamName(::testing::TestParamInfo<MultiBitBootstrapTestParams> p) {
MultiBitBootstrapTestParams params = p.param;
return "n_" + std::to_string(params.lwe_dimension) + "_k_" +
std::to_string(params.glwe_dimension) + "_N_" +
std::to_string(params.polynomial_size) + "_pbs_base_log_" +
std::to_string(params.pbs_base_log) + "_pbs_level_" +
std::to_string(params.pbs_level) + "_grouping_factor_" +
std::to_string(params.grouping_factor) + "_number_of_inputs_" +
std::to_string(params.number_of_inputs);
}
INSTANTIATE_TEST_CASE_P(MultiBitBootstrapInstantiation,
MultiBitBootstrapTestPrimitives_u64,
multipbs_params_u64, printParamName);

View File

@@ -1,256 +0,0 @@
#include <cstdint>
#include <gtest/gtest.h>
#include <setup_and_teardown.h>
#include <stdio.h>
#include <stdlib.h>
const unsigned REPETITIONS = 2;
const unsigned SAMPLES = 10;
const unsigned MAX_TAU = 4;
typedef struct {
int lwe_dimension;
int glwe_dimension;
int polynomial_size;
double lwe_modular_variance;
double glwe_modular_variance;
int pbs_base_log;
int pbs_level;
int ks_base_log;
int ks_level;
int pksk_base_log;
int pksk_level;
int cbs_base_log;
int cbs_level;
int tau;
int p_array[MAX_TAU];
} WopBootstrapTestParams;
class WopBootstrapTestPrimitives_u64
: public ::testing::TestWithParam<WopBootstrapTestParams> {
protected:
int lwe_dimension;
int glwe_dimension;
int polynomial_size;
double lwe_modular_variance;
double glwe_modular_variance;
int pbs_base_log;
int pbs_level;
int ks_base_log;
int ks_level;
int pksk_base_log;
int pksk_level;
int cbs_base_log;
int cbs_level;
int tau;
uint32_t p_array[MAX_TAU];
uint64_t delta_array[MAX_TAU];
int cbs_delta_log;
uint32_t delta_log_array[MAX_TAU];
Csprng *csprng;
cudaStream_t *stream;
int gpu_index = 0;
uint64_t *lwe_sk_in_array;
uint64_t *lwe_sk_out_array;
uint64_t *lwe_in_ct_array;
uint64_t *lwe_out_ct_array;
uint64_t *plaintexts;
double *d_fourier_bsk_array;
uint64_t *d_ksk_array;
uint64_t *d_pksk_array;
uint64_t *d_lwe_ct_in_array;
uint64_t *d_lwe_ct_out_array;
uint64_t *d_lut_vector;
int8_t *wop_pbs_buffer;
int input_lwe_dimension;
public:
// Test arithmetic functions
void SetUp() {
stream = cuda_create_stream(0);
// TestParams
lwe_dimension = (int)GetParam().lwe_dimension;
glwe_dimension = (int)GetParam().glwe_dimension;
polynomial_size = (int)GetParam().polynomial_size;
lwe_modular_variance = (double)GetParam().lwe_modular_variance;
glwe_modular_variance = (double)GetParam().glwe_modular_variance;
pbs_base_log = (int)GetParam().pbs_base_log;
pbs_level = (int)GetParam().pbs_level;
ks_base_log = (int)GetParam().ks_base_log;
ks_level = (int)GetParam().ks_level;
pksk_base_log = (int)GetParam().pksk_base_log;
pksk_level = (int)GetParam().pksk_level;
cbs_base_log = (int)GetParam().cbs_base_log;
cbs_level = (int)GetParam().cbs_level;
tau = (int)GetParam().tau;
for (int i = 0; i < tau; i++) {
p_array[i] = (int)GetParam().p_array[i];
}
input_lwe_dimension = glwe_dimension * polynomial_size;
wop_pbs_setup(
stream, &csprng, &lwe_sk_in_array, &lwe_sk_out_array, &d_ksk_array,
&d_fourier_bsk_array, &d_pksk_array, &plaintexts, &d_lwe_ct_in_array,
&d_lwe_ct_out_array, &d_lut_vector, &wop_pbs_buffer, lwe_dimension,
glwe_dimension, polynomial_size, lwe_modular_variance,
glwe_modular_variance, ks_base_log, ks_level, pksk_base_log, pksk_level,
pbs_base_log, pbs_level, cbs_level, p_array, delta_log_array,
&cbs_delta_log, delta_array, tau, REPETITIONS, SAMPLES, gpu_index);
}
void TearDown() {
wop_pbs_teardown(stream, csprng, lwe_sk_in_array, lwe_sk_out_array,
d_ksk_array, d_fourier_bsk_array, d_pksk_array, plaintexts,
d_lwe_ct_in_array, d_lut_vector, d_lwe_ct_out_array,
wop_pbs_buffer, gpu_index);
}
};
TEST_P(WopBootstrapTestPrimitives_u64, wop_pbs) {
void *v_stream = (void *)stream;
uint64_t *lwe_out_ct_array =
(uint64_t *)malloc((input_lwe_dimension + 1) * tau * sizeof(uint64_t));
int bsk_size = (glwe_dimension + 1) * (glwe_dimension + 1) * pbs_level *
polynomial_size * (lwe_dimension + 1);
int ksk_size =
ks_level * (lwe_dimension + 1) * glwe_dimension * polynomial_size;
int pksk_list_size = pksk_level * (glwe_dimension + 1) * polynomial_size *
(glwe_dimension * polynomial_size + 1) *
(glwe_dimension + 1);
for (uint r = 0; r < REPETITIONS; r++) {
double *d_fourier_bsk = d_fourier_bsk_array + (ptrdiff_t)(bsk_size * r);
uint64_t *d_ksk = d_ksk_array + (ptrdiff_t)(ksk_size * r);
uint64_t *d_pksk_list = d_pksk_array + (ptrdiff_t)(pksk_list_size * r);
uint64_t *lwe_sk_in =
lwe_sk_in_array + (ptrdiff_t)(input_lwe_dimension * r);
for (uint s = 0; s < SAMPLES; s++) {
uint64_t *d_lwe_ct_in =
d_lwe_ct_in_array + (ptrdiff_t)((r * SAMPLES * tau + s * tau) *
(input_lwe_dimension + 1));
// Execute wop pbs
cuda_wop_pbs_64(
stream, gpu_index, (void *)d_lwe_ct_out_array, (void *)d_lwe_ct_in,
(void *)d_lut_vector, (void *)d_fourier_bsk, (void *)d_ksk,
(void *)d_pksk_list, wop_pbs_buffer, cbs_delta_log, glwe_dimension,
lwe_dimension, polynomial_size, pbs_base_log, pbs_level, ks_base_log,
ks_level, pksk_base_log, pksk_level, cbs_base_log, cbs_level, p_array,
delta_log_array, tau, cuda_get_max_shared_memory(gpu_index));
//// Copy result back
cuda_memcpy_async_to_cpu(lwe_out_ct_array, d_lwe_ct_out_array,
(input_lwe_dimension + 1) * tau *
sizeof(uint64_t),
stream, gpu_index);
cuda_synchronize_stream(v_stream);
for (int i = 0; i < tau; i++) {
uint64_t plaintext = plaintexts[r * SAMPLES * tau + s * tau + i];
uint64_t *result_ct =
lwe_out_ct_array + (ptrdiff_t)(i * (input_lwe_dimension + 1));
uint64_t decrypted_message = 0;
concrete_cpu_decrypt_lwe_ciphertext_u64(
lwe_sk_in, result_ct, input_lwe_dimension, &decrypted_message);
// Round after decryption
uint64_t decrypted =
closest_representable(decrypted_message, 1, p_array[i]) >>
delta_log_array[i];
uint64_t expected = plaintext >> delta_log_array[i];
EXPECT_EQ(decrypted, expected)
<< " failed at tau " << i << ", repetition " << r << ", sample " << s;
}
}
}
}
// Defines the parameter sets for which the WoP-PBS will be tested.
::testing::internal::ParamGenerator<WopBootstrapTestParams> wop_pbs_params_u64 =
::testing::Values(
// lwe_dimension, glwe_dimension, polynomial_size, lwe_modular_variance,
// glwe_modular_variance, pbs_base_log, pbs_level, ks_base_log,
// ks_level, pksk_base_log, pksk_level, cbs_base_log, cbs_level, tau, p
(WopBootstrapTestParams){481,
2,
512,
7.52316384526264e-37,
7.52316384526264e-37,
4,
9,
1,
9,
4,
9,
6,
4,
1,
{11}}, // Full Wop-PBS
(WopBootstrapTestParams){481,
2,
512,
7.52316384526264e-37,
7.52316384526264e-37,
4,
9,
1,
9,
4,
9,
6,
4,
1,
{9}}, // No CMUX tree
(WopBootstrapTestParams){481,
1,
1024,
7.52316384526264e-37,
7.52316384526264e-37,
4,
9,
1,
9,
4,
9,
6,
4,
1,
{9}});
std::string printParamName(::testing::TestParamInfo<WopBootstrapTestParams> p) {
WopBootstrapTestParams params = p.param;
uint32_t lut_vector_size = (1 << (params.p_array[0] * params.tau));
std::string message = "Unknown_parameter_set";
if ((uint32_t)params.polynomial_size < lut_vector_size) {
// The LUT does not fit in a single polynomial: the CMUX tree is needed
// (full WoP-PBS).
message = "wop_pbs_full_n_" + std::to_string(params.lwe_dimension) + "_k_" +
std::to_string(params.glwe_dimension) + "_N_" +
std::to_string(params.polynomial_size) + "_tau_" +
std::to_string(params.tau) + "_p_" +
std::to_string(params.p_array[0]);
} else if ((uint32_t)params.polynomial_size == lut_vector_size) {
// the VP skips the cmux tree.
message =
"wop_pbs_without_cmux_tree_n_" + std::to_string(params.lwe_dimension) +
"_k_" + std::to_string(params.glwe_dimension) + "_N_" +
std::to_string(params.polynomial_size) + "_tau_" +
std::to_string(params.tau) + "_p_" + std::to_string(params.p_array[0]);
} else {
// the VP skips the cmux tree and expands the lut.
message = "wop_pbs_expanded_lut_n_" + std::to_string(params.lwe_dimension) +
"_k_" + std::to_string(params.glwe_dimension) + "_N_" +
std::to_string(params.polynomial_size) + "_tau_" +
std::to_string(params.tau) + "_p_" +
std::to_string(params.p_array[0]);
}
return message;
}
INSTANTIATE_TEST_CASE_P(WopBootstrapInstantiation,
WopBootstrapTestPrimitives_u64, wop_pbs_params_u64,
printParamName);

View File

@@ -1,410 +0,0 @@
#include <algorithm>
#include <bootstrap.h>
#include <bootstrap_multibit.h>
#include <cmath>
#include <concrete-cpu.h>
#include <cstdint>
#include <cstdlib>
#include <device.h>
#include <functional>
#include <random>
#include <utils.h>
double get_aws_cost_per_second() { return AWS_VM_COST_PER_HOUR / 3600; }
// For each sample and repetition, create a plaintext
// The payload_modulus is the message modulus times the carry modulus
// (so the total message modulus)
uint64_t *generate_plaintexts(uint64_t payload_modulus, uint64_t delta,
int number_of_inputs, const unsigned repetitions,
const unsigned samples) {
uint64_t *plaintext_array = (uint64_t *)malloc(
repetitions * samples * number_of_inputs * sizeof(uint64_t));
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_int_distribution<unsigned long long> dis(
std::numeric_limits<std::uint64_t>::min(),
std::numeric_limits<std::uint64_t>::max());
for (uint r = 0; r < repetitions; r++) {
for (uint s = 0; s < samples; s++) {
for (int i = 0; i < number_of_inputs; i++) {
plaintext_array[r * samples * number_of_inputs + s * number_of_inputs +
i] = (dis(gen) % payload_modulus) * delta;
}
}
}
return plaintext_array;
}
// For each sample and repetition, create a plaintext per CRT block for bit
// extract. The payload_modulus is the message modulus times the carry modulus
// (so the total message modulus)
uint64_t *generate_plaintexts_bit_extract(uint64_t *payload_modulus,
uint64_t *delta,
int crt_decomposition_size,
const unsigned repetitions,
const unsigned samples) {
uint64_t *plaintext_array = (uint64_t *)malloc(
repetitions * samples * crt_decomposition_size * sizeof(uint64_t));
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_int_distribution<unsigned long long> dis(
std::numeric_limits<std::uint64_t>::min(),
std::numeric_limits<std::uint64_t>::max());
for (size_t i = 0; i < crt_decomposition_size * repetitions * samples; i++) {
plaintext_array[i] =
(dis(gen) % payload_modulus[i % crt_decomposition_size]) *
delta[i % crt_decomposition_size];
}
return plaintext_array;
}
// Decompose value in r bits
// Bit decomposition of the value, from LSB to MSB
uint64_t *bit_decompose_value(uint64_t value, int r) {
uint64_t *bit_array = (uint64_t *)malloc(r * sizeof(uint64_t));
uint64_t x = value;
for (int i = 0; i < r; i++) {
bit_array[i] = x & 1;
x >>= 1;
}
return bit_array;
}
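// Illustrative example (not part of the original helper): decomposing the
// value 0b1011 (11) with r = 4 yields the array {1, 1, 0, 1}, i.e. least
// significant bit first.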
uint64_t *generate_identity_lut_pbs(int polynomial_size, int glwe_dimension,
int message_modulus, int carry_modulus,
std::function<uint64_t(uint64_t)> func) {
// Modulus of the msg contained in the msg bits and operations buffer
uint64_t modulus_sup = message_modulus * carry_modulus;
// N / modulus_sup = size of the box assigned to each message value
uint64_t box_size = polynomial_size / modulus_sup;
// Value of the shift we multiply our messages by
uint64_t delta = ((uint64_t)1 << 63) / (uint64_t)(modulus_sup);
// Create the plaintext lut_pbs
uint64_t *plaintext_lut_pbs =
(uint64_t *)malloc(polynomial_size * sizeof(uint64_t));
// Fill each box of the plaintext_lut_pbs with func applied to the
// corresponding message value
for (uint64_t i = 0; i < modulus_sup; i++) {
uint64_t index = i * box_size;
for (uint64_t j = index; j < index + box_size; j++) {
plaintext_lut_pbs[j] = func(i) * delta;
}
}
uint64_t half_box_size = box_size / 2;
// Negate the first half_box_size coefficients
for (uint64_t i = 0; i < half_box_size; i++) {
plaintext_lut_pbs[i] = -plaintext_lut_pbs[i];
}
// Rotate the plaintext_lut_pbs
std::rotate(plaintext_lut_pbs, plaintext_lut_pbs + half_box_size,
plaintext_lut_pbs + polynomial_size);
// Create the GLWE lut_pbs
uint64_t *lut_pbs = (uint64_t *)malloc(
polynomial_size * (glwe_dimension + 1) * sizeof(uint64_t));
for (int i = 0; i < polynomial_size * glwe_dimension; i++) {
lut_pbs[i] = 0;
}
for (int i = 0; i < polynomial_size; i++) {
int glwe_index = glwe_dimension * polynomial_size + i;
lut_pbs[glwe_index] = plaintext_lut_pbs[i];
}
free(plaintext_lut_pbs);
return lut_pbs;
}
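// Illustrative sketch (example values, not from the original code): with
// message_modulus = 2, carry_modulus = 2 and polynomial_size = 2048, each of
// the 4 possible message values owns a box of 512 coefficients. Negating the
// first 256 coefficients and rotating the polynomial by 256 centers each box
// on its message value, so that a noisy phase close to i * delta still lands
// in the box encoding func(i) despite the negacyclic rotation done by the PBS.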
uint64_t *generate_identity_lut_cmux_tree(int polynomial_size, int lut_size,
int tau, int delta_log) {
int r = 1;
if (log2(lut_size) > log2(polynomial_size)) {
r = log2(lut_size) - log2(polynomial_size);
}
uint64_t num_lut = (1 << r);
// Create the plaintext lut_pbs
uint64_t *plaintext_lut_cmux_tree =
(uint64_t *)malloc(num_lut * tau * polynomial_size * sizeof(uint64_t));
// Fill each LUT of the CMUX tree with a distinct constant test polynomial
for (int tree = 0; tree < tau; tree++)
for (uint64_t i = 0; i < num_lut; i++) {
uint64_t *plaintext_lut_slice = plaintext_lut_cmux_tree +
i * polynomial_size +
tree * num_lut * polynomial_size;
uint64_t coeff =
(((uint64_t)(i + tree * num_lut) % (1 << (64 - delta_log))))
<< delta_log;
for (int p = 0; p < polynomial_size; p++)
plaintext_lut_slice[p] = coeff;
}
return plaintext_lut_cmux_tree;
}
// Generate repetitions LWE secret keys
void generate_lwe_secret_keys(uint64_t **lwe_sk_array, int lwe_dimension,
Csprng *csprng, const unsigned repetitions) {
*lwe_sk_array =
(uint64_t *)malloc(lwe_dimension * repetitions * sizeof(uint64_t));
int shift = 0;
for (uint r = 0; r < repetitions; r++) {
// Generate the lwe secret key for each repetition
concrete_cpu_init_secret_key_u64(*lwe_sk_array + (ptrdiff_t)(shift),
lwe_dimension, csprng,
&CONCRETE_CSPRNG_VTABLE);
shift += lwe_dimension;
}
}
// Generate repetitions GLWE secret keys
void generate_glwe_secret_keys(uint64_t **glwe_sk_array, int glwe_dimension,
int polynomial_size, Csprng *csprng,
const unsigned repetitions) {
int glwe_sk_array_size = glwe_dimension * polynomial_size * repetitions;
*glwe_sk_array = (uint64_t *)malloc(glwe_sk_array_size * sizeof(uint64_t));
int shift = 0;
for (uint r = 0; r < repetitions; r++) {
// Generate the glwe secret key for each repetition
concrete_cpu_init_secret_key_u64(*glwe_sk_array + (ptrdiff_t)(shift),
glwe_dimension * polynomial_size, csprng,
&CONCRETE_CSPRNG_VTABLE);
shift += glwe_dimension * polynomial_size;
}
}
// Generate repetitions LWE bootstrap keys
void generate_lwe_bootstrap_keys(
cudaStream_t *stream, int gpu_index, double **d_fourier_bsk_array,
uint64_t *lwe_sk_in_array, uint64_t *lwe_sk_out_array, int lwe_dimension,
int glwe_dimension, int polynomial_size, int pbs_level, int pbs_base_log,
Csprng *csprng, double variance, const unsigned repetitions) {
void *v_stream = (void *)stream;
int bsk_size = (glwe_dimension + 1) * (glwe_dimension + 1) * pbs_level *
polynomial_size * (lwe_dimension + 1);
int bsk_array_size = bsk_size * repetitions;
uint64_t *bsk_array = (uint64_t *)malloc(bsk_array_size * sizeof(uint64_t));
*d_fourier_bsk_array = (double *)cuda_malloc_async(
bsk_array_size * sizeof(double), stream, gpu_index);
int shift_in = 0;
int shift_out = 0;
int shift_bsk = 0;
for (uint r = 0; r < repetitions; r++) {
// Generate the bootstrap key for each repetition
concrete_cpu_init_lwe_bootstrap_key_u64(
bsk_array + (ptrdiff_t)(shift_bsk),
lwe_sk_in_array + (ptrdiff_t)(shift_in),
lwe_sk_out_array + (ptrdiff_t)(shift_out), lwe_dimension,
polynomial_size, glwe_dimension, pbs_level, pbs_base_log, variance,
Parallelism(1), csprng, &CONCRETE_CSPRNG_VTABLE);
double *d_fourier_bsk = *d_fourier_bsk_array + (ptrdiff_t)(shift_bsk);
uint64_t *bsk = bsk_array + (ptrdiff_t)(shift_bsk);
cuda_synchronize_stream(v_stream);
cuda_convert_lwe_bootstrap_key_64(
(void *)(d_fourier_bsk), (void *)(bsk), v_stream, gpu_index,
lwe_dimension, glwe_dimension, pbs_level, polynomial_size);
shift_in += lwe_dimension;
shift_out += glwe_dimension * polynomial_size;
shift_bsk += bsk_size;
}
cuda_synchronize_stream(v_stream);
free(bsk_array);
}
void generate_lwe_multi_bit_pbs_keys(
cudaStream_t *stream, int gpu_index, uint64_t **d_bsk_array,
uint64_t *lwe_sk_in_array, uint64_t *lwe_sk_out_array, int lwe_dimension,
int glwe_dimension, int polynomial_size, int grouping_factor, int pbs_level,
int pbs_base_log, Csprng *csprng, double variance,
const unsigned repetitions) {
void *v_stream = (void *)stream;
int bsk_size = lwe_dimension * pbs_level * (glwe_dimension + 1) *
(glwe_dimension + 1) * polynomial_size *
(1 << grouping_factor) / grouping_factor;
int bsk_array_size = bsk_size * repetitions;
uint64_t *bsk_array = (uint64_t *)malloc(bsk_array_size * sizeof(uint64_t));
*d_bsk_array = (uint64_t *)cuda_malloc_async(
bsk_array_size * sizeof(uint64_t), stream, gpu_index);
int shift_in = 0;
int shift_out = 0;
int shift_bsk = 0;
for (uint r = 0; r < repetitions; r++) {
// Generate the multi-bit bootstrap key for each repetition
core_crypto_par_generate_lwe_multi_bit_bootstrapping_key(
lwe_sk_in_array + (ptrdiff_t)(shift_in), lwe_dimension,
lwe_sk_out_array + (ptrdiff_t)(shift_out), glwe_dimension,
polynomial_size, bsk_array + (ptrdiff_t)(shift_bsk), pbs_base_log,
pbs_level, grouping_factor, sqrt(variance), 0, 0);
uint64_t *d_bsk = *d_bsk_array + (ptrdiff_t)(shift_bsk);
uint64_t *bsk = bsk_array + (ptrdiff_t)(shift_bsk);
cuda_convert_lwe_multi_bit_bootstrap_key_64(
d_bsk, bsk, stream, gpu_index, lwe_dimension, glwe_dimension, pbs_level,
polynomial_size, grouping_factor);
shift_in += lwe_dimension;
shift_out += glwe_dimension * polynomial_size;
shift_bsk += bsk_size;
}
cuda_synchronize_stream(v_stream);
free(bsk_array);
}
// Generate repetitions keyswitch keys
void generate_lwe_keyswitch_keys(cudaStream_t *stream, int gpu_index,
uint64_t **d_ksk_array,
uint64_t *lwe_sk_in_array,
uint64_t *lwe_sk_out_array,
int input_lwe_dimension,
int output_lwe_dimension, int ksk_level,
int ksk_base_log, Csprng *csprng,
double variance, const unsigned repetitions) {
int ksk_size = ksk_level * (output_lwe_dimension + 1) * input_lwe_dimension;
int ksk_array_size = ksk_size * repetitions;
uint64_t *ksk_array = (uint64_t *)malloc(ksk_array_size * sizeof(uint64_t));
*d_ksk_array = (uint64_t *)cuda_malloc_async(
ksk_array_size * sizeof(uint64_t), stream, gpu_index);
int shift_in = 0;
int shift_out = 0;
int shift_ksk = 0;
for (uint r = 0; r < repetitions; r++) {
// Generate the keyswitch key for each repetition
concrete_cpu_init_lwe_keyswitch_key_u64(
ksk_array + (ptrdiff_t)(shift_ksk),
lwe_sk_in_array + (ptrdiff_t)(shift_in),
lwe_sk_out_array + (ptrdiff_t)(shift_out), input_lwe_dimension,
output_lwe_dimension, ksk_level, ksk_base_log, variance, csprng,
&CONCRETE_CSPRNG_VTABLE);
uint64_t *d_ksk = *d_ksk_array + (ptrdiff_t)(shift_ksk);
uint64_t *ksk = ksk_array + (ptrdiff_t)(shift_ksk);
cuda_memcpy_async_to_gpu(d_ksk, ksk, ksk_size * sizeof(uint64_t), stream,
gpu_index);
shift_in += input_lwe_dimension;
shift_out += output_lwe_dimension;
shift_ksk += ksk_size;
}
cuda_synchronize_stream(stream);
free(ksk_array);
}
// Generate repetitions private functional keyswitch key lists (with (k + 1)
// keys each)
void generate_lwe_private_functional_keyswitch_key_lists(
cudaStream_t *stream, int gpu_index, uint64_t **d_pksk_array,
uint64_t *lwe_sk_in_array, uint64_t *lwe_sk_out_array,
int input_lwe_dimension, int output_glwe_dimension,
int output_polynomial_size, int pksk_level, int pksk_base_log,
Csprng *csprng, double variance, const unsigned repetitions) {
int pksk_list_size = pksk_level * (output_glwe_dimension + 1) *
output_polynomial_size * (input_lwe_dimension + 1) *
(output_glwe_dimension + 1);
int pksk_array_size = pksk_list_size * repetitions;
uint64_t *pksk_array = (uint64_t *)malloc(pksk_array_size * sizeof(uint64_t));
*d_pksk_array = (uint64_t *)cuda_malloc_async(
pksk_array_size * sizeof(uint64_t), stream, gpu_index);
int shift_in = 0;
int shift_out = 0;
int shift_pksk_list = 0;
for (uint r = 0; r < repetitions; r++) {
// Generate the (k + 1) private functional keyswitch keys for each
// repetition
concrete_cpu_init_lwe_circuit_bootstrap_private_functional_packing_keyswitch_keys_u64(
pksk_array + (ptrdiff_t)(shift_pksk_list),
lwe_sk_in_array + (ptrdiff_t)(shift_in),
lwe_sk_out_array + (ptrdiff_t)(shift_out), input_lwe_dimension,
output_polynomial_size, output_glwe_dimension, pksk_level,
pksk_base_log, variance, Parallelism(1), csprng,
&CONCRETE_CSPRNG_VTABLE);
uint64_t *d_pksk_list = *d_pksk_array + (ptrdiff_t)(shift_pksk_list);
uint64_t *pksk_list = pksk_array + (ptrdiff_t)(shift_pksk_list);
cuda_memcpy_async_to_gpu(d_pksk_list, pksk_list,
pksk_list_size * sizeof(uint64_t), stream,
gpu_index);
shift_in += input_lwe_dimension;
shift_out += output_glwe_dimension * output_polynomial_size;
shift_pksk_list += pksk_list_size;
}
cuda_synchronize_stream(stream);
free(pksk_array);
}
// The closest number representable by the decomposition can be computed by
// performing the rounding at the appropriate bit.
uint64_t closest_representable(uint64_t input, int level_count, int base_log) {
// Compute the number of least significant bits which can not be represented
// by the decomposition
int non_rep_bit_count = 64 - (level_count * base_log);
// Generate a mask which captures the most significant non-representable bit
uint64_t one = 1;
uint64_t non_rep_mask = one << (non_rep_bit_count - 1);
// Retrieve the non representable bits
uint64_t non_rep_bits = input & non_rep_mask;
// Extract the msb of the non representable bits to perform the rounding
uint64_t non_rep_msb = non_rep_bits >> (non_rep_bit_count - 1);
// Remove the non-representable bits and perform the rounding
uint64_t res = input >> non_rep_bit_count;
res += non_rep_msb;
return res << non_rep_bit_count;
}
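// Illustrative example (values chosen for this sketch): with level_count = 2
// and base_log = 4, the 56 least significant bits are not representable; an
// input of (5UL << 56) + (1UL << 55) has its top non-representable bit set
// and therefore rounds up to 6UL << 56.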
uint64_t number_of_inputs_on_gpu(uint64_t gpu_index,
uint64_t lwe_ciphertext_count,
uint64_t number_of_gpus) {
uint64_t samples_per_gpu = lwe_ciphertext_count / number_of_gpus;
uint64_t samples = samples_per_gpu;
// We add the remainder of the integer division lwe_count/num_gpus to the load
// of the last GPU
if (gpu_index == number_of_gpus - 1) {
samples += lwe_ciphertext_count % number_of_gpus;
}
return samples;
}
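// Illustrative example (not from the original helper): with 100 ciphertexts
// spread over 3 GPUs, GPUs 0 and 1 each receive 33 samples and the last GPU
// receives 33 + 1 = 34.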
// See tfhe-rs for more explanations
// tfhe/src/integer/encryption.rs:152
void encrypt_integer_u64_blocks(uint64_t **ct, uint64_t *lwe_sk,
uint64_t *message_blocks, int lwe_dimension,
int num_blocks, Csprng *csprng,
double variance) {
for (int i = 0; i < num_blocks; i++) {
concrete_cpu_encrypt_lwe_ciphertext_u64(
lwe_sk, *ct + (ptrdiff_t)(i * (lwe_dimension + 1)), message_blocks[i],
lwe_dimension, variance, csprng, &CONCRETE_CSPRNG_VTABLE);
}
}
void decrypt_integer_u64_blocks(uint64_t *ct, uint64_t *lwe_sk,
uint64_t **message_blocks, int lwe_dimension,
int num_blocks, uint64_t delta,
int message_modulus) {
uint64_t rounding_bit = delta >> 1;
for (int i = 0; i < num_blocks; i++) {
uint64_t decrypted_u64 = 0;
concrete_cpu_decrypt_lwe_ciphertext_u64(
lwe_sk, ct + (ptrdiff_t)((lwe_dimension + 1) * i), lwe_dimension,
&decrypted_u64);
uint64_t rounding = (decrypted_u64 & rounding_bit) << 1;
uint64_t block_value =
((decrypted_u64 + rounding) / delta) % message_modulus;
(*message_blocks)[i] = block_value;
}
}
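// Illustrative example (values chosen for this sketch): with
// message_modulus = 4 and two blocks decoding to {3, 2} (least significant
// block first), the represented clear integer is 3 + 2 * 4 = 11, matching
// the radix reconstruction done in the integer multiplication test.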

View File

@@ -7,10 +7,10 @@
#define CONCRETELANG_GPUDFG_HPP
#ifdef CONCRETELANG_CUDA_SUPPORT
#include "bootstrap.h"
#include "device.h"
#include "keyswitch.h"
#include "linear_algebra.h"
#include "programmable_bootstrap.h"
#endif

View File

@@ -19,9 +19,9 @@
using ::concretelang::keysets::ServerKeyset;
#ifdef CONCRETELANG_CUDA_SUPPORT
#include "bootstrap.h"
#include "device.h"
#include "keyswitch.h"
#include "programmable_bootstrap.h"
#endif
namespace mlir {
@@ -102,14 +102,14 @@ public:
size_t bsk_gpu_buffer_size = bsk_buffer_len * sizeof(double);
void *bsk_gpu_tmp =
cuda_malloc_async(bsk_gpu_buffer_size, (cudaStream_t *)stream, gpu_idx);
cuda_convert_lwe_bootstrap_key_64(
bsk_gpu_tmp, const_cast<uint64_t *>(bsk.getBuffer().data()),
(cudaStream_t *)stream, gpu_idx, input_lwe_dim, glwe_dim, level,
poly_size);
cuda_malloc_async(bsk_gpu_buffer_size, (cudaStream_t)stream, gpu_idx);
cuda_convert_lwe_programmable_bootstrap_key_64(
(cudaStream_t)stream, gpu_idx, bsk_gpu_tmp,
const_cast<uint64_t *>(bsk.getBuffer().data()), input_lwe_dim, glwe_dim,
level, poly_size);
// Synchronization here is not optional as it works with mutex to
// prevent other GPU streams from reading partially copied keys.
cudaStreamSynchronize(*(cudaStream_t *)stream);
cudaStreamSynchronize((cudaStream_t)stream);
bsk_gpu[gpu_idx][bsk_idx] = bsk_gpu_tmp;
return bsk_gpu[gpu_idx][bsk_idx];
}
@@ -132,14 +132,14 @@ public:
size_t ksk_buffer_size = sizeof(uint64_t) * ksk.getBuffer().size();
void *ksk_gpu_tmp =
cuda_malloc_async(ksk_buffer_size, (cudaStream_t *)stream, gpu_idx);
cuda_malloc_async(ksk_buffer_size, (cudaStream_t)stream, gpu_idx);
cuda_memcpy_async_to_gpu(ksk_gpu_tmp,
const_cast<uint64_t *>(ksk.getBuffer().data()),
ksk_buffer_size, (cudaStream_t *)stream, gpu_idx);
ksk_buffer_size, (cudaStream_t)stream, gpu_idx);
// Synchronization here is not optional as it works with mutex to
// prevent other GPU streams from reading partially copied keys.
cudaStreamSynchronize(*(cudaStream_t *)stream);
cudaStreamSynchronize((cudaStream_t)stream);
ksk_gpu[gpu_idx][ksk_idx] = ksk_gpu_tmp;
return ksk_gpu[gpu_idx][ksk_idx];
}

View File

@@ -34,7 +34,7 @@ if(CONCRETELANG_DATAFLOW_EXECUTION_ENABLED)
endif()
if(CONCRETELANG_CUDA_SUPPORT)
target_link_libraries(ConcretelangRuntime LINK_PUBLIC concrete_cuda)
target_link_libraries(ConcretelangRuntime LINK_PUBLIC tfhe_cuda_backend)
endif()
if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
@@ -70,7 +70,7 @@ target_link_libraries(
$<TARGET_OBJECTS:MLIRSparseTensorRuntime>)
if(CONCRETELANG_CUDA_SUPPORT)
install(TARGETS ConcretelangRuntime omp concrete_cuda EXPORT ConcretelangRuntime)
install(TARGETS ConcretelangRuntime omp tfhe_cuda_backend EXPORT ConcretelangRuntime)
else()
install(TARGETS ConcretelangRuntime omp EXPORT ConcretelangRuntime)
endif()

View File

@@ -31,6 +31,16 @@ namespace concretelang {
namespace gpu_dfg {
namespace {
void *alloc_and_memcpy_async_to_gpu(uint64_t *buf_ptr, uint64_t buf_offset,
uint64_t buf_size, uint32_t gpu_idx,
void *stream) {
size_t buf_size_ = buf_size * sizeof(uint64_t);
void *ct_gpu = cuda_malloc_async(buf_size_, (cudaStream_t)stream, gpu_idx);
cuda_memcpy_async_to_gpu(ct_gpu, buf_ptr + buf_offset, buf_size_,
(cudaStream_t)stream, gpu_idx);
return ct_gpu;
}
#if CONCRETELANG_TIMING_ENABLED
static struct timespec init_timer, blocking_get_timer, acc1, acc2;
#endif
@@ -107,22 +117,25 @@ struct Dependence;
// is required.
struct PBS_buffer {
PBS_buffer(void *stream, uint32_t gpu_idx, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t input_lwe_ciphertext_count)
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count)
: max_pbs_buffer_samples(input_lwe_ciphertext_count),
glwe_dim(glwe_dimension), poly_size(polynomial_size),
gpu_stream(stream), gpu_index(gpu_idx) {
scratch_cuda_bootstrap_amortized_64(
gpu_stream, gpu_index, &pbs_buffer, glwe_dim, poly_size,
max_pbs_buffer_samples, cuda_get_max_shared_memory(gpu_index), true);
glwe_dim(glwe_dimension), _level_count(level_count),
poly_size(polynomial_size), gpu_stream(stream), gpu_index(gpu_idx) {
scratch_cuda_programmable_bootstrap_64(gpu_stream, gpu_index, &pbs_buffer,
glwe_dim, poly_size, _level_count,
max_pbs_buffer_samples, true);
}
~PBS_buffer() {
cleanup_cuda_bootstrap_amortized(gpu_stream, gpu_index, &pbs_buffer);
cleanup_cuda_programmable_bootstrap(gpu_stream, gpu_index, &pbs_buffer);
}
int8_t *get_pbs_buffer(void *stream, uint32_t gpu_idx,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count,
uint32_t input_lwe_ciphertext_count) {
assert(glwe_dimension <= glwe_dim);
assert(polynomial_size <= poly_size);
assert(level_count <= _level_count);
assert(input_lwe_ciphertext_count <= max_pbs_buffer_samples);
assert(stream == gpu_stream);
assert(gpu_idx == gpu_index);
@@ -134,6 +147,7 @@ struct PBS_buffer {
uint32_t max_pbs_buffer_samples;
uint32_t glwe_dim;
uint32_t poly_size;
uint32_t _level_count;
void *gpu_stream;
uint32_t gpu_index;
};
@@ -150,13 +164,14 @@ struct GPU_state {
if (pbs_buffer != nullptr)
delete pbs_buffer;
if (gpu_stream != nullptr)
cuda_destroy_stream((cudaStream_t *)gpu_stream, gpu_idx);
cuda_destroy_stream((cudaStream_t)gpu_stream, gpu_idx);
}
inline int8_t *get_pbs_buffer(uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count) {
if (pbs_buffer != nullptr && (pbs_buffer->glwe_dim != glwe_dimension ||
pbs_buffer->poly_size != polynomial_size ||
pbs_buffer->_level_count != level_count ||
pbs_buffer->get_max_pbs_buffer_samples() <
input_lwe_ciphertext_count)) {
delete pbs_buffer;
@@ -164,9 +179,10 @@ struct GPU_state {
}
if (pbs_buffer == nullptr)
pbs_buffer = new PBS_buffer(get_gpu_stream(), gpu_idx, glwe_dimension,
polynomial_size, input_lwe_ciphertext_count);
polynomial_size, level_count,
input_lwe_ciphertext_count);
return pbs_buffer->get_pbs_buffer(get_gpu_stream(), gpu_idx, glwe_dimension,
polynomial_size,
polynomial_size, level_count,
input_lwe_ciphertext_count);
}
inline void *get_gpu_stream() {
@@ -206,16 +222,17 @@ struct GPU_DFG {
to_free_list.clear();
}
inline int8_t *get_pbs_buffer(uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count) {
if (pbs_buffer == nullptr) {
int8_t *ret = gpus[gpu_idx].get_pbs_buffer(
glwe_dimension, polynomial_size, input_lwe_ciphertext_count);
int8_t *ret =
gpus[gpu_idx].get_pbs_buffer(glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count);
pbs_buffer = gpus[gpu_idx].pbs_buffer;
return ret;
}
return pbs_buffer->get_pbs_buffer(gpu_stream, gpu_idx, glwe_dimension,
polynomial_size,
polynomial_size, level_count,
input_lwe_ciphertext_count);
}
inline void *get_gpu_stream(int32_t loc) {
@@ -234,7 +251,7 @@ private:
struct Dependence;
static void sdfg_gpu_debug_print_mref(const char *c, MemRef2 m);
static MemRef2 sdfg_gpu_debug_dependence(Dependence *d, cudaStream_t *s);
static MemRef2 sdfg_gpu_debug_dependence(Dependence *d, cudaStream_t s);
static bool sdfg_gpu_debug_compare_memref(MemRef2 &a, MemRef2 &b,
char const *msg);
@@ -374,7 +391,7 @@ struct Dependence {
return;
cuda_drop_async(
chunks[chunk_id]->device_data,
(cudaStream_t *)dfg->get_gpu_stream(chunks[chunk_id]->location),
(cudaStream_t)dfg->get_gpu_stream(chunks[chunk_id]->location),
chunks[chunk_id]->location);
chunks[chunk_id]->device_data = nullptr;
}
@@ -385,8 +402,8 @@ struct Dependence {
data_offset +=
chunking_schedule[c] * host_data.sizes[1] * sizeof(uint64_t);
size_t csize = memref_get_data_size(chunks[chunk_id]->host_data);
cudaStream_t *s =
(cudaStream_t *)dfg->get_gpu_stream(chunks[chunk_id]->location);
cudaStream_t s =
(cudaStream_t)dfg->get_gpu_stream(chunks[chunk_id]->location);
cuda_memcpy_async_to_cpu(((char *)host_data.aligned) + data_offset,
chunks[chunk_id]->device_data, csize, s,
chunks[chunk_id]->location);
@@ -404,7 +421,7 @@ struct Dependence {
return;
cuda_drop_async(
chunks[chunk_id]->device_data,
(cudaStream_t *)dfg->get_gpu_stream(chunks[chunk_id]->location),
(cudaStream_t)dfg->get_gpu_stream(chunks[chunk_id]->location),
chunks[chunk_id]->location);
chunks[chunk_id]->device_data = nullptr;
chunks[chunk_id]->location =
@@ -412,8 +429,8 @@ struct Dependence {
}
inline void free_data(GPU_DFG *dfg, bool immediate = false) {
if (device_data != nullptr) {
cuda_drop_async(device_data,
(cudaStream_t *)dfg->get_gpu_stream(location), location);
cuda_drop_async(device_data, (cudaStream_t)dfg->get_gpu_stream(location),
location);
}
if (onHostReady && host_data.allocated != nullptr && hostAllocated) {
// As streams are not synchronized aside from the GET operation,
@@ -442,16 +459,16 @@ struct Dependence {
host_data.allocated = host_data.aligned = (uint64_t *)malloc(data_size);
hostAllocated = true;
}
cudaStream_t *s = (cudaStream_t *)dfg->get_gpu_stream(location);
cudaStream_t s = (cudaStream_t)dfg->get_gpu_stream(location);
cuda_memcpy_async_to_cpu(host_data.aligned, device_data, data_size, s,
location);
if (synchronize)
cudaStreamSynchronize(*s);
cudaStreamSynchronize(s);
onHostReady = true;
} else {
assert(onHostReady &&
"Device-to-device data transfers not supported yet.");
cudaStream_t *s = (cudaStream_t *)dfg->get_gpu_stream(loc);
cudaStream_t s = (cudaStream_t)dfg->get_gpu_stream(loc);
if (device_data != nullptr)
cuda_drop_async(device_data, s, location);
device_data = cuda_malloc_async(data_size, s, loc);
@@ -681,7 +698,7 @@ struct Stream {
// TODO: this could be improved
// Force deallocation with a synchronization point
for (size_t g = 0; g < num_devices; ++g)
cudaStreamSynchronize(*(cudaStream_t *)dfg->get_gpu_stream(g));
cudaStreamSynchronize((cudaStream_t)dfg->get_gpu_stream(g));
auto status = cudaMemGetInfo(&gpu_free_mem, &gpu_total_mem);
assert(status == cudaSuccess);
// TODO - for now assume each device on the system has roughly same
@@ -871,7 +888,7 @@ struct Stream {
iv->dep->free_chunk_device_data(c, dfg);
for (auto o : outputs)
o->dep->free_chunk_device_data(c, dfg);
cudaStreamSynchronize(*(cudaStream_t *)dfg->get_gpu_stream(dev));
cudaStreamSynchronize((cudaStream_t)dfg->get_gpu_stream(dev));
}
},
queue, dev));
@@ -886,7 +903,7 @@ struct Stream {
for (auto o : outputs)
o->dep->finalize_merged_dependence(dfg);
for (dev = 0; dev < num_devices; ++dev)
cudaStreamSynchronize(*(cudaStream_t *)dfg->get_gpu_stream(dev));
cudaStreamSynchronize((cudaStream_t)dfg->get_gpu_stream(dev));
// We will assume that only one subgraph is being processed per
// DFG at a time, so we can safely free these here.
dfg->free_stream_order_dependent_data();
@@ -1004,7 +1021,7 @@ make_process_2_1(void *dfg, void *sin1, void *sin2, void *sout,
}
[[maybe_unused]] static MemRef2 sdfg_gpu_debug_dependence(Dependence *d,
cudaStream_t *s) {
cudaStream_t s) {
if (d->onHostReady)
return d->host_data;
size_t data_size = memref_get_data_size(d->host_data);
@@ -1015,7 +1032,7 @@ make_process_2_1(void *dfg, void *sin1, void *sin2, void *sout,
{d->host_data.sizes[0], d->host_data.sizes[1]},
{d->host_data.strides[0], d->host_data.strides[1]}};
cuda_memcpy_async_to_cpu(data, d->device_data, data_size, s, d->location);
cudaStreamSynchronize(*s);
cudaStreamSynchronize(s);
return ret;
}
@@ -1064,17 +1081,28 @@ void memref_keyswitch_lwe_u64_process(Process *p, int32_t loc, int32_t chunk_id,
return dep;
} else {
// Schedule the keyswitch kernel on the GPU
cudaStream_t *s = (cudaStream_t *)p->dfg->get_gpu_stream(loc);
cudaStream_t s = (cudaStream_t)p->dfg->get_gpu_stream(loc);
void *ct0_gpu = d->device_data;
void *out_gpu = cuda_malloc_async(data_size, s, loc);
void *ksk_gpu = p->ctx.val->get_ksk_gpu(
p->level.val, p->input_lwe_dim.val, p->output_lwe_dim.val, loc, s,
p->sk_index.val);
// Initialize indexes
uint64_t *indexes = (uint64_t *)malloc(num_samples * sizeof(uint64_t));
for (uint32_t i = 0; i < num_samples; i++) {
indexes[i] = i;
}
void *indexes_gpu =
alloc_and_memcpy_async_to_gpu(indexes, 0, num_samples, loc, s);
cuda_keyswitch_lwe_ciphertext_vector_64(
s, loc, out_gpu, ct0_gpu, ksk_gpu, p->input_lwe_dim.val,
p->output_lwe_dim.val, p->base_log.val, p->level.val, num_samples);
s, loc, out_gpu, indexes_gpu, ct0_gpu, indexes_gpu, ksk_gpu,
p->input_lwe_dim.val, p->output_lwe_dim.val, p->base_log.val,
p->level.val, num_samples);
cuda_drop_async(indexes_gpu, s, loc);
Dependence *dep =
new Dependence(loc, out, out_gpu, false, false, d->chunk_id);
p->dfg->register_stream_order_dependent_allocation(indexes);
return dep;
}
};
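The 0.10.0 keyswitch entry point takes explicit per-sample index arrays for its output and input ciphertext vectors, which is why the scheduler above builds an identity mapping, uploads it, and passes the same device buffer twice. A hedged, standalone sketch of just that upload step; upload_identity_indexes is a name introduced here for illustration, and the real code inlines the loop and defers freeing the host buffer through the stream-order registry instead of synchronizing:
#include <cuda_runtime.h>
#include <cstdint>
#include <cstdlib>
// Builds 0..num_samples-1 on the host and copies it to the device on the given
// stream; the returned pointer is what the code above passes as both the
// input- and output-index argument of cuda_keyswitch_lwe_ciphertext_vector_64.
// Requires CUDA >= 11.2 for the stream-ordered allocator.
static void *upload_identity_indexes(uint32_t num_samples, cudaStream_t s) {
  uint64_t *host = (uint64_t *)malloc(num_samples * sizeof(uint64_t));
  for (uint32_t i = 0; i < num_samples; i++)
    host[i] = i;
  void *dev = nullptr;
  cudaMallocAsync(&dev, num_samples * sizeof(uint64_t), s);
  cudaMemcpyAsync(dev, host, num_samples * sizeof(uint64_t),
                  cudaMemcpyHostToDevice, s);
  cudaStreamSynchronize(s); // simplification: the scheduler defers this instead
  free(host);
  return dev;
}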
@@ -1108,7 +1136,7 @@ void memref_bootstrap_lwe_u64_process(Process *p, int32_t loc, int32_t chunk_id,
}
auto sched = [&](Dependence *d0, Dependence *d1, uint64_t *glwe_ct,
std::vector<size_t> &lut_indexes, cudaStream_t *s,
std::vector<size_t> &lut_indexes, cudaStream_t s,
int32_t loc) {
uint64_t num_samples = d0->host_data.sizes[0];
MemRef2 out = {out_ptr,
@@ -1168,20 +1196,29 @@ void memref_bootstrap_lwe_u64_process(Process *p, int32_t loc, int32_t chunk_id,
cuda_malloc_async(test_vector_idxes_size, s, loc);
cuda_memcpy_async_to_gpu(test_vector_idxes_gpu, (void *)test_vector_idxes,
test_vector_idxes_size, s, loc);
// Initialize indexes
uint64_t *indexes = (uint64_t *)malloc(num_samples * sizeof(uint64_t));
for (uint32_t i = 0; i < num_samples; i++) {
indexes[i] = i;
}
void *indexes_gpu =
alloc_and_memcpy_async_to_gpu(indexes, 0, num_samples, loc, s);
int8_t *pbs_buffer = p->dfg->gpus[loc].get_pbs_buffer(
p->glwe_dim.val, p->poly_size.val, num_samples);
p->glwe_dim.val, p->poly_size.val, p->level.val, num_samples);
void *ct0_gpu = d0->device_data;
void *out_gpu = cuda_malloc_async(data_size, s, loc);
void *fbsk_gpu = p->ctx.val->get_bsk_gpu(
p->input_lwe_dim.val, p->poly_size.val, p->level.val, p->glwe_dim.val,
loc, s, p->sk_index.val);
cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
s, loc, out_gpu, glwe_ct_gpu, test_vector_idxes_gpu, ct0_gpu,
fbsk_gpu, (int8_t *)pbs_buffer, p->input_lwe_dim.val, p->glwe_dim.val,
p->poly_size.val, p->base_log.val, p->level.val, num_samples,
lut_indexes.size(), lwe_idx, cuda_get_max_shared_memory(loc));
cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
s, loc, out_gpu, indexes_gpu, glwe_ct_gpu, test_vector_idxes_gpu,
ct0_gpu, indexes_gpu, fbsk_gpu, (int8_t *)pbs_buffer,
p->input_lwe_dim.val, p->glwe_dim.val, p->poly_size.val,
p->base_log.val, p->level.val, num_samples, 1, 1);
cuda_drop_async(test_vector_idxes_gpu, s, loc);
cuda_drop_async(glwe_ct_gpu, s, loc);
cuda_drop_async(indexes_gpu, s, loc);
Dependence *dep =
new Dependence(loc, out, out_gpu, false, false, d0->chunk_id);
// As streams are not synchronized, we can only free this vector
@@ -1189,6 +1226,7 @@ void memref_bootstrap_lwe_u64_process(Process *p, int32_t loc, int32_t chunk_id,
// this vector is no longer needed.
p->dfg->register_stream_order_dependent_allocation(test_vector_idxes);
p->dfg->register_stream_order_dependent_allocation(glwe_ct);
p->dfg->register_stream_order_dependent_allocation(indexes);
return dep;
}
};
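Because the index and test-vector buffers are consumed by asynchronous copies, they cannot be freed as soon as the kernel is queued; register_stream_order_dependent_allocation parks them until the DFG is known to be quiescent. A small sketch of that deferred-release idea, under the assumption stated in the comments above that one sub-graph runs per DFG at a time; the struct and member names are illustrative, not the actual GPU_DFG layout:
#include <cstdlib>
#include <vector>
struct StreamOrderedHostAllocations {
  std::vector<void *> pending;
  // Called right after handing a host buffer to an asynchronous copy.
  void defer(void *host_ptr) { pending.push_back(host_ptr); }
  // Called only once every stream that may still read these buffers has been
  // synchronized (free_stream_order_dependent_data plays this role above).
  void release_all() {
    for (void *p : pending)
      free(p);
    pending.clear();
  }
};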
@@ -1204,7 +1242,7 @@ void memref_bootstrap_lwe_u64_process(Process *p, int32_t loc, int32_t chunk_id,
lut_indexes.push_back(0);
}
cudaStream_t *cstream = (cudaStream_t *)p->dfg->get_gpu_stream(loc);
cudaStream_t cstream = (cudaStream_t)p->dfg->get_gpu_stream(loc);
Dependence *idep0 = p->input_streams[0]->get(loc, chunk_id);
if (p->output_streams[0]->need_new_gen(chunk_id))
p->output_streams[0]->put(
@@ -1214,7 +1252,7 @@ void memref_bootstrap_lwe_u64_process(Process *p, int32_t loc, int32_t chunk_id,
void memref_add_lwe_ciphertexts_u64_process(Process *p, int32_t loc,
int32_t chunk_id,
uint64_t *out_ptr) {
auto sched = [&](Dependence *d0, Dependence *d1, cudaStream_t *s,
auto sched = [&](Dependence *d0, Dependence *d1, cudaStream_t s,
int32_t loc) {
assert(d0->host_data.sizes[0] == d1->host_data.sizes[0]);
assert(d0->host_data.sizes[1] == d1->host_data.sizes[1]);
@@ -1257,14 +1295,14 @@ void memref_add_lwe_ciphertexts_u64_process(Process *p, int32_t loc,
Dependence *idep1 = p->input_streams[1]->get(loc, chunk_id);
if (p->output_streams[0]->need_new_gen(chunk_id))
p->output_streams[0]->put(
sched(idep0, idep1, (cudaStream_t *)p->dfg->get_gpu_stream(loc), loc),
sched(idep0, idep1, (cudaStream_t)p->dfg->get_gpu_stream(loc), loc),
chunk_id);
}
void memref_add_plaintext_lwe_ciphertext_u64_process(Process *p, int32_t loc,
int32_t chunk_id,
uint64_t *out_ptr) {
auto sched = [&](Dependence *d0, Dependence *d1, cudaStream_t *s,
auto sched = [&](Dependence *d0, Dependence *d1, cudaStream_t s,
int32_t loc) {
assert(d0->host_data.sizes[0] == d1->host_data.sizes[1] ||
d1->host_data.sizes[1] == 1);
@@ -1315,14 +1353,14 @@ void memref_add_plaintext_lwe_ciphertext_u64_process(Process *p, int32_t loc,
Dependence *idep1 = p->input_streams[1]->get(loc, chunk_id);
if (p->output_streams[0]->need_new_gen(chunk_id))
p->output_streams[0]->put(
sched(idep0, idep1, (cudaStream_t *)p->dfg->get_gpu_stream(loc), loc),
sched(idep0, idep1, (cudaStream_t)p->dfg->get_gpu_stream(loc), loc),
chunk_id);
}
void memref_mul_cleartext_lwe_ciphertext_u64_process(Process *p, int32_t loc,
int32_t chunk_id,
uint64_t *out_ptr) {
auto sched = [&](Dependence *d0, Dependence *d1, cudaStream_t *s,
auto sched = [&](Dependence *d0, Dependence *d1, cudaStream_t s,
int32_t loc) {
assert(d0->host_data.sizes[0] == d1->host_data.sizes[1] ||
d1->host_data.sizes[1] == 1);
@@ -1373,14 +1411,14 @@ void memref_mul_cleartext_lwe_ciphertext_u64_process(Process *p, int32_t loc,
Dependence *idep1 = p->input_streams[1]->get(loc, chunk_id);
if (p->output_streams[0]->need_new_gen(chunk_id))
p->output_streams[0]->put(
sched(idep0, idep1, (cudaStream_t *)p->dfg->get_gpu_stream(loc), loc),
sched(idep0, idep1, (cudaStream_t)p->dfg->get_gpu_stream(loc), loc),
chunk_id);
}
void memref_negate_lwe_ciphertext_u64_process(Process *p, int32_t loc,
int32_t chunk_id,
uint64_t *out_ptr) {
auto sched = [&](Dependence *d0, cudaStream_t *s, int32_t loc) {
auto sched = [&](Dependence *d0, cudaStream_t s, int32_t loc) {
uint64_t num_samples = d0->host_data.sizes[0];
MemRef2 out = {out_ptr,
out_ptr,
@@ -1415,8 +1453,7 @@ void memref_negate_lwe_ciphertext_u64_process(Process *p, int32_t loc,
Dependence *idep0 = p->input_streams[0]->get(loc, chunk_id);
if (p->output_streams[0]->need_new_gen(chunk_id))
p->output_streams[0]->put(
sched(idep0, (cudaStream_t *)p->dfg->get_gpu_stream(loc), loc),
chunk_id);
sched(idep0, (cudaStream_t)p->dfg->get_gpu_stream(loc), loc), chunk_id);
}
} // namespace

View File

@@ -45,9 +45,9 @@ void *alloc_and_memcpy_async_to_gpu(uint64_t *buf_ptr, uint64_t buf_offset,
uint64_t buf_size, uint32_t gpu_idx,
void *stream) {
size_t buf_size_ = buf_size * sizeof(uint64_t);
void *ct_gpu = cuda_malloc_async(buf_size_, (cudaStream_t *)stream, gpu_idx);
void *ct_gpu = cuda_malloc_async(buf_size_, (cudaStream_t)stream, gpu_idx);
cuda_memcpy_async_to_gpu(ct_gpu, buf_ptr + buf_offset, buf_size_,
(cudaStream_t *)stream, gpu_idx);
(cudaStream_t)stream, gpu_idx);
return ct_gpu;
}
@@ -55,7 +55,7 @@ void memcpy_async_to_cpu(uint64_t *buf_ptr, uint64_t buf_offset,
uint64_t buf_size, void *buf_gpu, uint32_t gpu_idx,
void *stream) {
cuda_memcpy_async_to_cpu(buf_ptr + buf_offset, buf_gpu,
buf_size * sizeof(uint64_t), (cudaStream_t *)stream,
buf_size * sizeof(uint64_t), (cudaStream_t)stream,
gpu_idx);
}
@@ -132,21 +132,31 @@ void memref_batched_keyswitch_lwe_cuda_u64(
// Move the input and output batch of ciphertexts to the GPU
// TODO: The allocation should be done by the compiler codegen
void *ct0_gpu = alloc_and_memcpy_async_to_gpu(
ct0_aligned, ct0_offset, ct0_batch_size, gpu_idx, (cudaStream_t *)stream);
ct0_aligned, ct0_offset, ct0_batch_size, gpu_idx, (cudaStream_t)stream);
// Initialize indexes
uint64_t *indexes = (uint64_t *)malloc(num_samples * sizeof(uint64_t));
for (uint32_t i = 0; i < num_samples; i++) {
indexes[i] = i;
}
// size is in uint64_t elements; the helper scales by sizeof(uint64_t) itself
void *indexes_gpu = alloc_and_memcpy_async_to_gpu(
indexes, 0, num_samples, gpu_idx, (cudaStream_t)stream);
void *out_gpu = cuda_malloc_async(out_batch_size * sizeof(uint64_t),
(cudaStream_t *)stream, gpu_idx);
(cudaStream_t)stream, gpu_idx);
// Run the keyswitch kernel on the GPU
cuda_keyswitch_lwe_ciphertext_vector_64(
stream, gpu_idx, out_gpu, ct0_gpu, ksk_gpu, input_lwe_dim, output_lwe_dim,
base_log, level, num_samples);
stream, gpu_idx, out_gpu, indexes_gpu, ct0_gpu, indexes_gpu, ksk_gpu,
input_lwe_dim, output_lwe_dim, base_log, level, num_samples);
// Copy the output batch of ciphertext back to CPU
memcpy_async_to_cpu(out_aligned, out_offset, out_batch_size, out_gpu, gpu_idx,
stream);
cuda_synchronize_device(gpu_idx);
// free memory that we allocated on gpu
cuda_drop(indexes_gpu, gpu_idx);
cuda_drop(ct0_gpu, gpu_idx);
cuda_drop(out_gpu, gpu_idx);
cuda_destroy_stream((cudaStream_t *)stream, gpu_idx);
cuda_destroy_stream((cudaStream_t)stream, gpu_idx);
free(indexes);
}
void memref_batched_bootstrap_lwe_cuda_u64(
@@ -178,9 +188,9 @@ void memref_batched_bootstrap_lwe_cuda_u64(
// Move the input and output batch of ciphertext to the GPU
// TODO: The allocation should be done by the compiler codegen
void *ct0_gpu = alloc_and_memcpy_async_to_gpu(
ct0_aligned, ct0_offset, ct0_batch_size, gpu_idx, (cudaStream_t *)stream);
ct0_aligned, ct0_offset, ct0_batch_size, gpu_idx, (cudaStream_t)stream);
void *out_gpu = cuda_malloc_async(out_batch_size * sizeof(uint64_t),
(cudaStream_t *)stream, gpu_idx);
(cudaStream_t)stream, gpu_idx);
// Construct the glwe accumulator (on CPU)
// TODO: Should be done outside of the bootstrap call, compile time if
// possible. Refactor in progress
@@ -198,41 +208,49 @@ void memref_batched_bootstrap_lwe_cuda_u64(
// Move the glwe accumulator to the GPU
void *glwe_ct_gpu = alloc_and_memcpy_async_to_gpu(
glwe_ct, 0, glwe_ct_size, gpu_idx, (cudaStream_t *)stream);
glwe_ct, 0, glwe_ct_size, gpu_idx, (cudaStream_t)stream);
// Move the test vector indexes to the GPU; they are all set to 0
uint32_t num_test_vectors = 1, lwe_idx = 0,
test_vector_idxes_size = num_samples * sizeof(uint64_t);
void *test_vector_idxes = malloc(test_vector_idxes_size);
memset(test_vector_idxes, 0, test_vector_idxes_size);
void *test_vector_idxes_gpu = cuda_malloc_async(
test_vector_idxes_size, (cudaStream_t *)stream, gpu_idx);
void *test_vector_idxes_gpu =
cuda_malloc_async(test_vector_idxes_size, (cudaStream_t)stream, gpu_idx);
cuda_memcpy_async_to_gpu(test_vector_idxes_gpu, test_vector_idxes,
test_vector_idxes_size, (cudaStream_t *)stream,
test_vector_idxes_size, (cudaStream_t)stream,
gpu_idx);
// Initialize indexes
uint64_t *indexes = (uint64_t *)malloc(num_samples * sizeof(uint64_t));
for (uint32_t i = 0; i < num_samples; i++) {
indexes[i] = i;
}
// size is in uint64_t elements; the helper scales by sizeof(uint64_t) itself
void *indexes_gpu = alloc_and_memcpy_async_to_gpu(
indexes, 0, num_samples, gpu_idx, (cudaStream_t)stream);
// Allocate PBS buffer on GPU
scratch_cuda_bootstrap_amortized_64(
stream, gpu_idx, &pbs_buffer, glwe_dim, poly_size, num_samples,
cuda_get_max_shared_memory(gpu_idx), true);
scratch_cuda_programmable_bootstrap_64(stream, gpu_idx, &pbs_buffer, glwe_dim,
poly_size, level, num_samples, true);
// Run the bootstrap kernel on the GPU
cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
stream, gpu_idx, out_gpu, glwe_ct_gpu, test_vector_idxes_gpu, ct0_gpu,
fbsk_gpu, pbs_buffer, input_lwe_dim, glwe_dim, poly_size, base_log, level,
num_samples, num_test_vectors, lwe_idx,
cuda_get_max_shared_memory(gpu_idx));
cleanup_cuda_bootstrap_amortized(stream, gpu_idx, &pbs_buffer);
cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
stream, gpu_idx, out_gpu, indexes_gpu, glwe_ct_gpu, test_vector_idxes_gpu,
ct0_gpu, indexes_gpu, fbsk_gpu, pbs_buffer, input_lwe_dim, glwe_dim,
poly_size, base_log, level, num_samples, 1, 1);
cleanup_cuda_programmable_bootstrap(stream, gpu_idx, &pbs_buffer);
// Copy the output batch of ciphertext back to CPU
memcpy_async_to_cpu(out_aligned, out_offset, out_batch_size, out_gpu, gpu_idx,
stream);
// free memory that we allocated on gpu
cuda_drop_async(ct0_gpu, (cudaStream_t *)stream, gpu_idx);
cuda_drop_async(out_gpu, (cudaStream_t *)stream, gpu_idx);
cuda_drop_async(glwe_ct_gpu, (cudaStream_t *)stream, gpu_idx);
cuda_drop_async(test_vector_idxes_gpu, (cudaStream_t *)stream, gpu_idx);
cudaStreamSynchronize(*(cudaStream_t *)stream);
cuda_drop_async(indexes_gpu, (cudaStream_t)stream, gpu_idx);
cuda_drop_async(ct0_gpu, (cudaStream_t)stream, gpu_idx);
cuda_drop_async(out_gpu, (cudaStream_t)stream, gpu_idx);
cuda_drop_async(glwe_ct_gpu, (cudaStream_t)stream, gpu_idx);
cuda_drop_async(test_vector_idxes_gpu, (cudaStream_t)stream, gpu_idx);
cudaStreamSynchronize((cudaStream_t)stream);
// Free the glwe accumulator (on CPU)
free(glwe_ct);
cuda_destroy_stream((cudaStream_t *)stream, gpu_idx);
free(indexes);
cuda_destroy_stream((cudaStream_t)stream, gpu_idx);
}
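The batched bootstrap now goes through the renamed programmable-bootstrap API: a scratch call sized by (glwe_dim, poly_size, level, num_samples), the launch itself with explicit output/input index arrays, and a matching cleanup. A condensed outline of that lifecycle, mirroring the argument order visible above; it assumes the tfhe-cuda-backend declarations this file already uses, takes every device pointer as a parameter, and passes the trailing 1, 1 verbatim (presumably the LUT count and LUT index):
#include <cstdint>
static void run_programmable_bootstrap_once(
    void *stream, uint32_t gpu_idx, void *out_gpu, void *indexes_gpu,
    void *glwe_ct_gpu, void *test_vector_idxes_gpu, void *ct0_gpu,
    void *fbsk_gpu, uint32_t input_lwe_dim, uint32_t glwe_dim,
    uint32_t poly_size, uint32_t base_log, uint32_t level,
    uint32_t num_samples) {
  int8_t *pbs_buffer = nullptr;
  // 1. Allocate the temporary buffer; the level count is now part of its sizing.
  scratch_cuda_programmable_bootstrap_64(stream, gpu_idx, &pbs_buffer, glwe_dim,
                                         poly_size, level, num_samples, true);
  // 2. Launch the bootstrap; the same identity index buffer is used for the
  //    output and input indexes, as in the calls above.
  cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
      stream, gpu_idx, out_gpu, indexes_gpu, glwe_ct_gpu, test_vector_idxes_gpu,
      ct0_gpu, indexes_gpu, fbsk_gpu, pbs_buffer, input_lwe_dim, glwe_dim,
      poly_size, base_log, level, num_samples, 1, 1);
  // 3. Release the temporary buffer.
  cleanup_cuda_programmable_bootstrap(stream, gpu_idx, &pbs_buffer);
}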
void memref_batched_mapped_bootstrap_lwe_cuda_u64(
@@ -268,9 +286,9 @@ void memref_batched_mapped_bootstrap_lwe_cuda_u64(
// Move the input and output batch of ciphertext to the GPU
// TODO: The allocation should be done by the compiler codegen
void *ct0_gpu = alloc_and_memcpy_async_to_gpu(
ct0_aligned, ct0_offset, ct0_batch_size, gpu_idx, (cudaStream_t *)stream);
ct0_aligned, ct0_offset, ct0_batch_size, gpu_idx, (cudaStream_t)stream);
void *out_gpu = cuda_malloc_async(out_batch_size * sizeof(uint64_t),
(cudaStream_t *)stream, gpu_idx);
(cudaStream_t)stream, gpu_idx);
// Construct the glwe accumulator (on CPU)
// TODO: Should be done outside of the bootstrap call, compile time if
// possible. Refactor in progress
@@ -291,7 +309,7 @@ void memref_batched_mapped_bootstrap_lwe_cuda_u64(
// Move the glwe accumulator to the GPU
void *glwe_ct_gpu = alloc_and_memcpy_async_to_gpu(
glwe_ct, 0, glwe_ct_size, gpu_idx, (cudaStream_t *)stream);
glwe_ct, 0, glwe_ct_size, gpu_idx, (cudaStream_t)stream);
// Move the test vector indexes to the GPU (zero by default)
uint32_t lwe_idx = 0, test_vector_idxes_size = num_samples * sizeof(uint64_t);
@@ -303,34 +321,43 @@ void memref_batched_mapped_bootstrap_lwe_cuda_u64(
for (size_t i = 0; i < num_lut_vectors; ++i)
test_vector_idxes[i] = i;
}
void *test_vector_idxes_gpu = cuda_malloc_async(
test_vector_idxes_size, (cudaStream_t *)stream, gpu_idx);
void *test_vector_idxes_gpu =
cuda_malloc_async(test_vector_idxes_size, (cudaStream_t)stream, gpu_idx);
cuda_memcpy_async_to_gpu(test_vector_idxes_gpu, (void *)test_vector_idxes,
test_vector_idxes_size, (cudaStream_t *)stream,
test_vector_idxes_size, (cudaStream_t)stream,
gpu_idx);
// Initialize indexes
uint64_t *indexes = (uint64_t *)malloc(num_samples * sizeof(uint64_t));
for (uint32_t i = 0; i < num_samples; i++) {
indexes[i] = i;
}
// size is in uint64_t elements; the helper scales by sizeof(uint64_t) itself
void *indexes_gpu = alloc_and_memcpy_async_to_gpu(
indexes, 0, num_samples, gpu_idx, (cudaStream_t)stream);
// Allocate PBS buffer on GPU
scratch_cuda_bootstrap_amortized_64(
stream, gpu_idx, &pbs_buffer, glwe_dim, poly_size, num_samples,
cuda_get_max_shared_memory(gpu_idx), true);
scratch_cuda_programmable_bootstrap_64(stream, gpu_idx, &pbs_buffer, glwe_dim,
poly_size, level, num_samples, true);
// Run the bootstrap kernel on the GPU
cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
stream, gpu_idx, out_gpu, glwe_ct_gpu, test_vector_idxes_gpu, ct0_gpu,
fbsk_gpu, pbs_buffer, input_lwe_dim, glwe_dim, poly_size, base_log, level,
num_samples, num_lut_vectors, lwe_idx,
cuda_get_max_shared_memory(gpu_idx));
cleanup_cuda_bootstrap_amortized(stream, gpu_idx, &pbs_buffer);
cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
stream, gpu_idx, out_gpu, indexes_gpu, glwe_ct_gpu, test_vector_idxes_gpu,
ct0_gpu, indexes_gpu, fbsk_gpu, pbs_buffer, input_lwe_dim, glwe_dim,
poly_size, base_log, level, num_samples, 1, 1);
cleanup_cuda_programmable_bootstrap(stream, gpu_idx, &pbs_buffer);
// Copy the output batch of ciphertext back to CPU
memcpy_async_to_cpu(out_aligned, out_offset, out_batch_size, out_gpu, gpu_idx,
stream);
// free memory that we allocated on gpu
cuda_drop_async(ct0_gpu, (cudaStream_t *)stream, gpu_idx);
cuda_drop_async(out_gpu, (cudaStream_t *)stream, gpu_idx);
cuda_drop_async(glwe_ct_gpu, (cudaStream_t *)stream, gpu_idx);
cuda_drop_async(test_vector_idxes_gpu, (cudaStream_t *)stream, gpu_idx);
cudaStreamSynchronize(*(cudaStream_t *)stream);
cuda_drop_async(indexes_gpu, (cudaStream_t)stream, gpu_idx);
cuda_drop_async(ct0_gpu, (cudaStream_t)stream, gpu_idx);
cuda_drop_async(out_gpu, (cudaStream_t)stream, gpu_idx);
cuda_drop_async(glwe_ct_gpu, (cudaStream_t)stream, gpu_idx);
cuda_drop_async(test_vector_idxes_gpu, (cudaStream_t)stream, gpu_idx);
cudaStreamSynchronize((cudaStream_t)stream);
// Free the glwe accumulator (on CPU)
free(indexes);
free(glwe_ct);
cuda_destroy_stream((cudaStream_t *)stream, gpu_idx);
cuda_destroy_stream((cudaStream_t)stream, gpu_idx);
}
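Both batched bootstrap variants finish with the same teardown order: queue the device frees on the stream, synchronize once, release the host staging buffers the asynchronous copies were still reading, and finally destroy the stream. A standalone illustration of that ordering with the plain CUDA runtime (CUDA >= 11.2 for the stream-ordered allocator); cudaFreeAsync and cudaStreamDestroy stand in for cuda_drop_async and cuda_destroy_stream:
#include <cuda_runtime.h>
#include <cstdint>
#include <cstdlib>
int main() {
  cudaStream_t s;
  cudaStreamCreate(&s);
  uint64_t *host = (uint64_t *)calloc(256, sizeof(uint64_t));
  void *dev = nullptr;
  cudaMallocAsync(&dev, 256 * sizeof(uint64_t), s);
  cudaMemcpyAsync(dev, host, 256 * sizeof(uint64_t), cudaMemcpyHostToDevice, s);
  cudaFreeAsync(dev, s);    // device frees are queued, not immediate
  cudaStreamSynchronize(s); // everything queued above has completed here
  free(host);               // the host buffer was in use until the sync
  cudaStreamDestroy(s);
  return 0;
}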
#endif

1
third_party/tfhe-rs vendored Submodule

Submodule third_party/tfhe-rs added at 35fdcdf1a6