Mirror of https://github.com/zama-ai/concrete.git (synced 2026-02-08 11:35:02 -05:00)
feat(cuda): introduce cuda acceleration for the pbs and keyswitch
- A new crate, concrete-cuda, is added to the repository. It contains CUDA implementations of the bootstrap and the keyswitch, together with a Rust wrapper to call them.
- A new backend_cuda is added to concrete-core, with dedicated entities whose memory lives on the GPU and engines that call the CUDA-accelerated functions.
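For orientation, here is a minimal host-side sketch (not part of this commit) of how the exported C API below fits together for a 64-bit programmable bootstrap. All buffer sizes and cryptographic parameters are placeholders chosen for illustration; in the real backend they are computed by the concrete-core engines, and the bootstrapping key is assumed to have already been converted to the GPU Fourier format with cuda_convert_lwe_bootstrap_key_64. The prototypes are copied from include/device.h and include/bootstrap.h further down.

// Hedged sketch, CUDA C++ host code. Prototypes copied from device.h / bootstrap.h.
#include <cstdint>

extern "C" {
void *cuda_create_stream(uint32_t gpu_index);
int cuda_destroy_stream(void *v_stream, uint32_t gpu_index);
void *cuda_malloc(uint64_t size, uint32_t gpu_index);
int cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size, void *v_stream, uint32_t gpu_index);
int cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size, void *v_stream, uint32_t gpu_index);
int cuda_synchronize_device(uint32_t gpu_index);
int cuda_drop(void *ptr, uint32_t gpu_index);
int cuda_get_max_shared_memory(uint32_t gpu_index);
void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
    void *v_stream, void *lwe_out, void *test_vector, void *test_vector_indexes,
    void *lwe_in, void *bootstrapping_key, uint32_t input_lwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t l_gadget,
    uint32_t num_samples, uint32_t num_test_vectors, uint32_t lwe_idx,
    uint32_t max_shared_memory);
}

// Bootstrap one 64-bit LWE ciphertext on GPU 0 (byte sizes supplied by the caller).
void bootstrap_one_sample(void *lwe_in_cpu, void *lwe_out_cpu, void *fourier_bsk_gpu,
                          void *lut_cpu, uint32_t *lut_index_cpu,
                          uint64_t in_bytes, uint64_t out_bytes, uint64_t lut_bytes) {
  const uint32_t gpu_index = 0;
  void *stream = cuda_create_stream(gpu_index);

  void *d_in = cuda_malloc(in_bytes, gpu_index);
  void *d_out = cuda_malloc(out_bytes, gpu_index);
  void *d_lut = cuda_malloc(lut_bytes, gpu_index);
  void *d_lut_index = cuda_malloc(sizeof(uint32_t), gpu_index);

  cuda_memcpy_async_to_gpu(d_in, lwe_in_cpu, in_bytes, stream, gpu_index);
  cuda_memcpy_async_to_gpu(d_lut, lut_cpu, lut_bytes, stream, gpu_index);
  cuda_memcpy_async_to_gpu(d_lut_index, lut_index_cpu, sizeof(uint32_t), stream, gpu_index);

  // Placeholder parameters: LWE dimension 630, N = 1024, base log 7, 3 levels.
  cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
      stream, d_out, d_lut, d_lut_index, d_in, fourier_bsk_gpu,
      630, 1024, 7, 3, /*num_samples=*/1, /*num_test_vectors=*/1, /*lwe_idx=*/0,
      cuda_get_max_shared_memory(gpu_index));

  cuda_memcpy_async_to_cpu(lwe_out_cpu, d_out, out_bytes, stream, gpu_index);
  cuda_synchronize_device(gpu_index);

  cuda_drop(d_in, gpu_index);
  cuda_drop(d_out, gpu_index);
  cuda_drop(d_lut, gpu_index);
  cuda_drop(d_lut_index, gpu_index);
  cuda_destroy_stream(stream, gpu_index);
}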
CMakeLists.txt (new file, 88 lines)
@@ -0,0 +1,88 @@
cmake_minimum_required(VERSION 3.8 FATAL_ERROR)
project(concrete_cuda LANGUAGES CXX CUDA)

include(CTest)

# See if the minimum CUDA version is available. If not, only enable documentation building.
set(MINIMUM_SUPPORTED_CUDA_VERSION 10.0)
include(CheckLanguage)
# See if CUDA is available
check_language(CUDA)
# If so, enable CUDA to check the version.
if (CMAKE_CUDA_COMPILER)
  enable_language(CUDA)
endif ()
# If CUDA is not available, or the minimum version is too low do not build
if (NOT CMAKE_CUDA_COMPILER)
  message(FATAL_ERROR "Cuda compiler not found.")
endif()

if (CMAKE_CUDA_COMPILER_VERSION VERSION_LESS ${MINIMUM_SUPPORTED_CUDA_VERSION})
  message(FATAL_ERROR "CUDA ${MINIMUM_SUPPORTED_CUDA_VERSION} or greater is required for compilation.")
endif()

#Get CUDA compute capability
set(OUTPUTFILE ${CMAKE_CURRENT_SOURCE_DIR}/cuda_script) # No suffix required
set(CUDAFILE ${CMAKE_CURRENT_SOURCE_DIR}/check_cuda.cu)
execute_process(COMMAND nvcc -lcuda ${CUDAFILE} -o ${OUTPUTFILE})
execute_process(COMMAND ${OUTPUTFILE}
                RESULT_VARIABLE CUDA_RETURN_CODE
                OUTPUT_VARIABLE ARCH)

if (${CUDA_RETURN_CODE} EQUAL 0)
  set(CUDA_SUCCESS "TRUE")
else ()
  set(CUDA_SUCCESS "FALSE")
endif ()

if (${CUDA_SUCCESS})
  message(STATUS "CUDA Architecture: ${ARCH}")
  message(STATUS "CUDA Version: ${CUDA_VERSION_STRING}")
  message(STATUS "CUDA Path: ${CUDA_TOOLKIT_ROOT_DIR}")
  message(STATUS "CUDA Libraries: ${CUDA_LIBRARIES}")
  message(STATUS "CUDA Performance Primitives: ${CUDA_npp_LIBRARY}")

  set(CUDA_NVCC_FLAGS "${ARCH}")
  #add_definitions(-DGPU) #You may not require this
else ()
  message(WARNING ${ARCH})
endif ()

if (NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Release)
endif ()

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g")
if (NOT CUDA_NVCC_FLAGS)
  set(CUDA_NVCC_FLAGS -arch=sm_70)
endif ()

# in production, should use -arch=sm_70
# --ptxas-options=-v to see register spills
# -lineinfo for better debugging
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -ccbin ${CMAKE_CXX_COMPILER} -O3 ${CUDA_NVCC_FLAGS} \
    -std=c++17 --no-exceptions --expt-relaxed-constexpr -rdc=true --use_fast_math -Xcompiler -fPIC")

set(INCLUDE_DIR include)

add_subdirectory(src)
add_subdirectory(parameters)
target_include_directories(concrete_cuda PRIVATE ${INCLUDE_DIR})

# This is required for rust cargo build
install(TARGETS concrete_cuda DESTINATION .)
install(TARGETS concrete_cuda DESTINATION lib)
install(TARGETS cuda_parameters DESTINATION .)
install(TARGETS cuda_parameters DESTINATION lib)

# Define a function to add a lint target.
find_file(CPPLINT NAMES cpplint cpplint.exe)
if (CPPLINT)
  # Add a custom target to lint all child projects. Dependencies are specified in child projects.
  add_custom_target(all_lint)
  # Don't trigger this target on ALL_BUILD or Visual Studio 'Rebuild Solution'
  set_target_properties(all_lint PROPERTIES EXCLUDE_FROM_ALL TRUE)
  # set_target_properties(all_lint PROPERTIES EXCLUDE_FROM_DEFAULT_BUILD TRUE)
endif ()
CPPLINT.cfg (new file, 3 lines)
@@ -0,0 +1,3 @@
set noparent
linelength=240
filter=-legal/copyright,-readability/todo,-runtime/references,-build/c++17
check_cuda.cu (new file, 22 lines)
@@ -0,0 +1,22 @@
#include <stdio.h>

int main(int argc, char **argv) {
  cudaDeviceProp dP;
  float min_cc = 3.0;

  int rc = cudaGetDeviceProperties(&dP, 0);
  if (rc != cudaSuccess) {
    cudaError_t error = cudaGetLastError();
    printf("CUDA error: %s", cudaGetErrorString(error));
    return rc; /* Failure */
  }
  if ((dP.major + (dP.minor / 10)) < min_cc) {
    printf("Min Compute Capability of %2.1f required: %d.%d found\n Not "
           "Building CUDA Code",
           min_cc, dP.major, dP.minor);
    return 1; /* Failure */
  } else {
    printf("-arch=sm_%d%d", dP.major, dP.minor);
    return 0; /* Success */
  }
}
include/bootstrap.h (new file, 94 lines)
@@ -0,0 +1,94 @@
#ifndef CUDA_BOOTSTRAP_H
#define CUDA_BOOTSTRAP_H

#include <cstdint>

extern "C" {

void cuda_initialize_twiddles(uint32_t polynomial_size, uint32_t gpu_index);

void cuda_convert_lwe_bootstrap_key_32(void *dest, void *src, void *v_stream,
                                       uint32_t gpu_index, uint32_t input_lwe_dim,
                                       uint32_t glwe_dim, uint32_t l_gadget,
                                       uint32_t polynomial_size);

void cuda_convert_lwe_bootstrap_key_64(void *dest, void *src, void *v_stream,
                                       uint32_t gpu_index, uint32_t input_lwe_dim,
                                       uint32_t glwe_dim, uint32_t l_gadget,
                                       uint32_t polynomial_size);

void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
    void *v_stream, void *lwe_out, void *test_vector, void *test_vector_indexes,
    void *lwe_in, void *bootstrapping_key, uint32_t input_lwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t l_gadget,
    uint32_t num_samples, uint32_t num_test_vectors, uint32_t lwe_idx,
    uint32_t max_shared_memory);

void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
    void *v_stream, void *lwe_out, void *test_vector, void *test_vector_indexes,
    void *lwe_in, void *bootstrapping_key, uint32_t input_lwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t l_gadget,
    uint32_t num_samples, uint32_t num_test_vectors, uint32_t lwe_idx,
    uint32_t max_shared_memory);

void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
    void *v_stream, void *lwe_out, void *test_vector, void *test_vector_indexes,
    void *lwe_in, void *bootstrapping_key, uint32_t input_lwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t l_gadget,
    uint32_t num_samples, uint32_t num_test_vectors, uint32_t lwe_idx,
    uint32_t max_shared_memory);

void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
    void *v_stream, void *lwe_out, void *test_vector, void *test_vector_indexes,
    void *lwe_in, void *bootstrapping_key, uint32_t input_lwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t l_gadget,
    uint32_t num_samples, uint32_t num_test_vectors, uint32_t lwe_idx,
    uint32_t max_shared_memory);
};

__device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
                                         int glwe_dimension, uint32_t l_gadget);

__device__ double2 *
get_ith_mask_kth_block(double2 *ptr, int i, int k, int level,
                       uint32_t polynomial_size, int glwe_dimension,
                       uint32_t l_gadget);

__device__ double2 *
get_ith_body_kth_block(double2 *ptr, int i, int k, int level,
                       uint32_t polynomial_size, int glwe_dimension,
                       uint32_t l_gadget);

#endif // CUDA_BOOTSTRAP_H
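The cuda_convert_lwe_bootstrap_key_{32,64} entry points turn a standard-domain bootstrapping key held on the CPU into the GPU-resident Fourier-domain key consumed by the bootstrap kernels, and cuda_initialize_twiddles prepares the FFT twiddle factors for a given polynomial size. A hedged preparation sketch follows (not part of this commit); the parameter values and the byte size of the converted key are placeholders that the calling engine would normally compute.

// Hedged sketch, compiled with nvcc (bootstrap.h also declares __device__ helpers).
#include <cstdint>
#include "device.h"
#include "bootstrap.h"

// Upload and convert a 64-bit bootstrapping key once, then reuse it for bootstraps.
void *prepare_bootstrap_key(void *bsk_cpu, uint64_t converted_bsk_bytes,
                            uint32_t gpu_index) {
  const uint32_t input_lwe_dim = 630, glwe_dim = 1, l_gadget = 3, polynomial_size = 1024;

  void *stream = cuda_create_stream(gpu_index);
  cuda_initialize_twiddles(polynomial_size, gpu_index);

  void *bsk_gpu = cuda_malloc(converted_bsk_bytes, gpu_index);
  cuda_convert_lwe_bootstrap_key_64(bsk_gpu, bsk_cpu, stream, gpu_index,
                                    input_lwe_dim, glwe_dim, l_gadget,
                                    polynomial_size);

  cuda_synchronize_device(gpu_index); // make sure the conversion has finished
  cuda_destroy_stream(stream, gpu_index);
  return bsk_gpu;                     // freed later with cuda_drop
}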
include/device.h (new file, 30 lines)
@@ -0,0 +1,30 @@
#include <cstdint>

extern "C" {
void *cuda_create_stream(uint32_t gpu_index);

int cuda_destroy_stream(void *v_stream, uint32_t gpu_index);

void *cuda_malloc(uint64_t size, uint32_t gpu_index);

int cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index);

int cuda_memcpy_to_cpu(void *dest, const void *src, uint64_t size,
                       uint32_t gpu_index);

int cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
                             void *v_stream, uint32_t gpu_index);

int cuda_memcpy_to_gpu(void *dest, void *src, uint64_t size,
                       uint32_t gpu_index);

int cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
                             void *v_stream, uint32_t gpu_index);

int cuda_get_number_of_gpus();

int cuda_synchronize_device(uint32_t gpu_index);

int cuda_drop(void *ptr, uint32_t gpu_index);

int cuda_get_max_shared_memory(uint32_t gpu_index);
}
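All of these entry points are plain C functions so that the Rust side of concrete-cuda can bind them directly. On the C++ side they can just as well be wrapped in small RAII owners; a hedged sketch, not part of this commit (in particular, the return-value convention assumed for cuda_check_valid_malloc is a guess):

#include <cstdint>
#include <stdexcept>
#include "device.h"

// Hedged sketch: RAII ownership of a CUDA stream and of a device allocation,
// built only on the C functions declared above.
class CudaStream {
public:
  explicit CudaStream(uint32_t gpu_index)
      : gpu_index_(gpu_index), stream_(cuda_create_stream(gpu_index)) {}
  ~CudaStream() { cuda_destroy_stream(stream_, gpu_index_); }
  void *get() const { return stream_; }

private:
  uint32_t gpu_index_;
  void *stream_;
};

class DeviceBuffer {
public:
  DeviceBuffer(uint64_t size, uint32_t gpu_index)
      : gpu_index_(gpu_index), ptr_(nullptr) {
    // Assumption: a non-zero return means the allocation would not fit on the GPU.
    if (cuda_check_valid_malloc(size, gpu_index) != 0)
      throw std::runtime_error("not enough free memory on the GPU");
    ptr_ = cuda_malloc(size, gpu_index);
  }
  ~DeviceBuffer() { cuda_drop(ptr_, gpu_index_); }
  void *get() const { return ptr_; }

private:
  uint32_t gpu_index_;
  void *ptr_;
};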
include/helper_cuda.h (new file, 1154 lines)
File diff suppressed because it is too large
include/helper_debug.cuh (new file, 100 lines)
@@ -0,0 +1,100 @@
#include "cuComplex.h"
#include "thrust/complex.h"
#include <iostream>
#include <string>
#include <type_traits>

#define PRINT_VARS
#ifdef PRINT_VARS
#define PRINT_DEBUG_5(var, begin, end, step, cond)                             \
  _print_debug(var, #var, begin, end, step, cond, "", false)
#define PRINT_DEBUG_6(var, begin, end, step, cond, text)                       \
  _print_debug(var, #var, begin, end, step, cond, text, true)
#define CAT(A, B) A##B
#define PRINT_SELECT(NAME, NUM) CAT(NAME##_, NUM)
#define GET_COUNT(_1, _2, _3, _4, _5, _6, COUNT, ...) COUNT
#define VA_SIZE(...) GET_COUNT(__VA_ARGS__, 6, 5, 4, 3, 2, 1)
#define PRINT_DEBUG(...)                                                       \
  PRINT_SELECT(PRINT_DEBUG, VA_SIZE(__VA_ARGS__))(__VA_ARGS__)
#else
#define PRINT_DEBUG(...)
#endif

template <typename T>
__device__ typename std::enable_if<std::is_unsigned<T>::value, void>::type
_print_debug(T *var, const char *var_name, int start, int end, int step,
             bool cond, const char *text, bool has_text) {
  __syncthreads();
  if (cond) {
    if (has_text)
      printf("%s\n", text);
    for (int i = start; i < end; i += step) {
      printf("%s[%u]: %u\n", var_name, i, var[i]);
    }
  }
  __syncthreads();
}

template <typename T>
__device__ typename std::enable_if<std::is_signed<T>::value, void>::type
_print_debug(T *var, const char *var_name, int start, int end, int step,
             bool cond, const char *text, bool has_text) {
  __syncthreads();
  if (cond) {
    if (has_text)
      printf("%s\n", text);
    for (int i = start; i < end; i += step) {
      printf("%s[%u]: %d\n", var_name, i, var[i]);
    }
  }
  __syncthreads();
}

template <typename T>
__device__ typename std::enable_if<std::is_floating_point<T>::value, void>::type
_print_debug(T *var, const char *var_name, int start, int end, int step,
             bool cond, const char *text, bool has_text) {
  __syncthreads();
  if (cond) {
    if (has_text)
      printf("%s\n", text);
    for (int i = start; i < end; i += step) {
      printf("%s[%u]: %.15f\n", var_name, i, var[i]);
    }
  }
  __syncthreads();
}

template <typename T>
__device__
    typename std::enable_if<std::is_same<T, thrust::complex<double>>::value,
                            void>::type
    _print_debug(T *var, const char *var_name, int start, int end, int step,
                 bool cond, const char *text, bool has_text) {
  __syncthreads();
  if (cond) {
    if (has_text)
      printf("%s\n", text);
    for (int i = start; i < end; i += step) {
      printf("%s[%u]: %.15f , %.15f\n", var_name, i, var[i].real(),
             var[i].imag());
    }
  }
  __syncthreads();
}

template <typename T>
__device__
    typename std::enable_if<std::is_same<T, cuDoubleComplex>::value, void>::type
    _print_debug(T *var, const char *var_name, int start, int end, int step,
                 bool cond, const char *text, bool has_text) {
  __syncthreads();
  if (cond) {
    if (has_text)
      printf("%s\n", text);
    for (int i = start; i < end; i += step) {
      printf("%s[%u]: %.15f , %.15f\n", var_name, i, var[i].x, var[i].y);
    }
  }
  __syncthreads();
}
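PRINT_DEBUG dispatches on the number of arguments (5 or 6) and, through the _print_debug overloads, on the element type of the array being dumped. A hedged usage sketch inside a device kernel; the kernel and variable names are illustrative only and not part of this commit:

#include <cstdint>
#include "helper_debug.cuh"

// Hedged sketch: dump a few array elements from a single thread of a single block.
__global__ void illustrative_kernel(uint32_t *accumulator, cuDoubleComplex *fft_buffer) {
  // 5-argument form: array, begin, end, step, condition
  PRINT_DEBUG(accumulator, 0, 8, 1, threadIdx.x == 0 && blockIdx.x == 0);
  // 6-argument form adds a label printed before the values
  PRINT_DEBUG(fft_buffer, 0, 4, 1, threadIdx.x == 0 && blockIdx.x == 0,
              "fft buffer after forward transform");
}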
include/helper_string.h (new file, 664 lines)
@@ -0,0 +1,664 @@
/**
 * Copyright 1993-2013 NVIDIA Corporation. All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */

// These are helper functions for the SDK samples (string parsing, timers, etc)
#ifndef STRING_HELPER_H
#define STRING_HELPER_H

#include <fstream>
#include <stdio.h>
#include <stdlib.h>
#include <string>

#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
#ifndef _CRT_SECURE_NO_DEPRECATE
#define _CRT_SECURE_NO_DEPRECATE
#endif
#ifndef STRCASECMP
#define STRCASECMP _stricmp
#endif
#ifndef STRNCASECMP
#define STRNCASECMP _strnicmp
#endif
#ifndef STRCPY
#define STRCPY(sFilePath, nLength, sPath) strcpy_s(sFilePath, nLength, sPath)
#endif

#ifndef FOPEN
#define FOPEN(fHandle, filename, mode) fopen_s(&fHandle, filename, mode)
#endif
#ifndef FOPEN_FAIL
#define FOPEN_FAIL(result) (result != 0)
#endif
#ifndef SSCANF
#define SSCANF sscanf_s
#endif
#ifndef SPRINTF
#define SPRINTF sprintf_s
#endif
#else // Linux Includes
#include <string.h>
#include <strings.h>

#ifndef STRCASECMP
#define STRCASECMP strcasecmp
#endif
#ifndef STRNCASECMP
#define STRNCASECMP strncasecmp
#endif
#ifndef STRCPY
#define STRCPY(sFilePath, nLength, sPath) strcpy(sFilePath, sPath)
#endif

#ifndef FOPEN
#define FOPEN(fHandle, filename, mode) (fHandle = fopen(filename, mode))
#endif
#ifndef FOPEN_FAIL
#define FOPEN_FAIL(result) (result == NULL)
#endif
#ifndef SSCANF
#define SSCANF sscanf
#endif
#ifndef SPRINTF
#define SPRINTF sprintf
#endif
#endif

#ifndef EXIT_WAIVED
#define EXIT_WAIVED 2
#endif

// CUDA Utility Helper Functions
inline int stringRemoveDelimiter(char delimiter, const char *string) {
  int string_start = 0;

  while (string[string_start] == delimiter) {
    string_start++;
  }

  if (string_start >= (int)strlen(string) - 1) {
    return 0;
  }

  return string_start;
}

inline int getFileExtension(char *filename, char **extension) {
  int string_length = (int)strlen(filename);

  while (filename[string_length--] != '.') {
    if (string_length == 0)
      break;
  }

  if (string_length > 0)
    string_length += 2;

  if (string_length == 0)
    *extension = NULL;
  else
    *extension = &filename[string_length];

  return string_length;
}

inline bool checkCmdLineFlag(const int argc, const char **argv,
                             const char *string_ref) {
  bool bFound = false;

  if (argc >= 1) {
    for (int i = 1; i < argc; i++) {
      int string_start = stringRemoveDelimiter('-', argv[i]);
      const char *string_argv = &argv[i][string_start];

      const char *equal_pos = strchr(string_argv, '=');
      int argv_length =
          (int)(equal_pos == 0 ? strlen(string_argv) : equal_pos - string_argv);

      int length = (int)strlen(string_ref);

      if (length == argv_length &&
          !STRNCASECMP(string_argv, string_ref, length)) {
        bFound = true;
        continue;
      }
    }
  }

  return bFound;
}

// This function wraps the CUDA Driver API into a template function
template <class T>
inline bool getCmdLineArgumentValue(const int argc, const char **argv,
                                    const char *string_ref, T *value) {
  bool bFound = false;

  if (argc >= 1) {
    for (int i = 1; i < argc; i++) {
      int string_start = stringRemoveDelimiter('-', argv[i]);
      const char *string_argv = &argv[i][string_start];
      int length = (int)strlen(string_ref);

      if (!STRNCASECMP(string_argv, string_ref, length)) {
        if (length + 1 <= (int)strlen(string_argv)) {
          int auto_inc = (string_argv[length] == '=') ? 1 : 0;
          *value = (T)atoi(&string_argv[length + auto_inc]);
        }

        bFound = true;
        i = argc;
      }
    }
  }

  return bFound;
}

inline int getCmdLineArgumentInt(const int argc, const char **argv,
                                 const char *string_ref) {
  bool bFound = false;
  int value = -1;

  if (argc >= 1) {
    for (int i = 1; i < argc; i++) {
      int string_start = stringRemoveDelimiter('-', argv[i]);
      const char *string_argv = &argv[i][string_start];
      int length = (int)strlen(string_ref);

      if (!STRNCASECMP(string_argv, string_ref, length)) {
        if (length + 1 <= (int)strlen(string_argv)) {
          int auto_inc = (string_argv[length] == '=') ? 1 : 0;
          value = atoi(&string_argv[length + auto_inc]);
        } else {
          value = 0;
        }

        bFound = true;
        continue;
      }
    }
  }

  if (bFound) {
    return value;
  } else {
    return 0;
  }
}

inline float getCmdLineArgumentFloat(const int argc, const char **argv,
                                     const char *string_ref) {
  bool bFound = false;
  float value = -1;

  if (argc >= 1) {
    for (int i = 1; i < argc; i++) {
      int string_start = stringRemoveDelimiter('-', argv[i]);
      const char *string_argv = &argv[i][string_start];
      int length = (int)strlen(string_ref);

      if (!STRNCASECMP(string_argv, string_ref, length)) {
        if (length + 1 <= (int)strlen(string_argv)) {
          int auto_inc = (string_argv[length] == '=') ? 1 : 0;
          value = (float)atof(&string_argv[length + auto_inc]);
        } else {
          value = 0.f;
        }

        bFound = true;
        continue;
      }
    }
  }

  if (bFound) {
    return value;
  } else {
    return 0;
  }
}

inline bool getCmdLineArgumentString(const int argc, const char **argv,
                                     const char *string_ref,
                                     char **string_retval) {
  bool bFound = false;

  if (argc >= 1) {
    for (int i = 1; i < argc; i++) {
      int string_start = stringRemoveDelimiter('-', argv[i]);
      char *string_argv = (char *)&argv[i][string_start];
      int length = (int)strlen(string_ref);

      if (!STRNCASECMP(string_argv, string_ref, length)) {
        *string_retval = &string_argv[length + 1];
        bFound = true;
        continue;
      }
    }
  }

  if (!bFound) {
    *string_retval = NULL;
  }

  return bFound;
}

//////////////////////////////////////////////////////////////////////////////
//! Find the path for a file assuming that
//! files are found in the searchPath.
//!
//! @return the path if succeeded, otherwise 0
//! @param filename         name of the file
//! @param executable_path  optional absolute path of the executable
//////////////////////////////////////////////////////////////////////////////
inline char *sdkFindFilePath(const char *filename,
                             const char *executable_path) {
  // <executable_name> defines a variable that is replaced with the name of the
  // executable

  // Typical relative search paths to locate needed companion files (e.g. sample
  // input data, or JIT source files) The origin for the relative search may be
  // the .exe file, a .bat file launching an .exe, a browser .exe launching the
  // .exe or .bat, etc
  const char *searchPath[] = {
      // same dir and local data/source subdirs
      "./", "./common/", "./common/data/", "./data/", "./src/",
      "./src/<executable_name>/data/", "./inc/",
      "./0_Simple/", "./1_Utilities/", "./2_Graphics/", "./3_Imaging/",
      "./4_Finance/", "./5_Simulations/", "./6_Advanced/",
      "./7_CUDALibraries/", "./8_Android/", "./samples/",

      // up 1 in tree
      "../", "../common/", "../common/data/", "../data/", "../src/", "../inc/",
      "../0_Simple/<executable_name>/data/",
      "../1_Utilities/<executable_name>/data/",
      "../2_Graphics/<executable_name>/data/",
      "../3_Imaging/<executable_name>/data/",
      "../4_Finance/<executable_name>/data/",
      "../5_Simulations/<executable_name>/data/",
      "../6_Advanced/<executable_name>/data/",
      "../7_CUDALibraries/<executable_name>/data/",
      "../8_Android/<executable_name>/data/",
      "../samples/<executable_name>/data/",

      // up 2 in tree
      "../../", "../../common/", "../../common/data/", "../../data/",
      "../../src/", "../../inc/",
      "../../sandbox/<executable_name>/data/",
      "../../0_Simple/<executable_name>/data/",
      "../../1_Utilities/<executable_name>/data/",
      "../../2_Graphics/<executable_name>/data/",
      "../../3_Imaging/<executable_name>/data/",
      "../../4_Finance/<executable_name>/data/",
      "../../5_Simulations/<executable_name>/data/",
      "../../6_Advanced/<executable_name>/data/",
      "../../7_CUDALibraries/<executable_name>/data/",
      "../../8_Android/<executable_name>/data/",
      "../../samples/<executable_name>/data/",

      // up 3 in tree
      "../../../",
      "../../../src/<executable_name>/",
      "../../../src/<executable_name>/data/",
      "../../../src/<executable_name>/src/",
      "../../../src/<executable_name>/inc/",
      "../../../sandbox/<executable_name>/",
      "../../../sandbox/<executable_name>/data/",
      "../../../sandbox/<executable_name>/src/",
      "../../../sandbox/<executable_name>/inc/",
      "../../../0_Simple/<executable_name>/data/",
      "../../../1_Utilities/<executable_name>/data/",
      "../../../2_Graphics/<executable_name>/data/",
      "../../../3_Imaging/<executable_name>/data/",
      "../../../4_Finance/<executable_name>/data/",
      "../../../5_Simulations/<executable_name>/data/",
      "../../../6_Advanced/<executable_name>/data/",
      "../../../7_CUDALibraries/<executable_name>/data/",
      "../../../8_Android/<executable_name>/data/",
      "../../../0_Simple/<executable_name>/",
      "../../../1_Utilities/<executable_name>/",
      "../../../2_Graphics/<executable_name>/",
      "../../../3_Imaging/<executable_name>/",
      "../../../4_Finance/<executable_name>/",
      "../../../5_Simulations/<executable_name>/",
      "../../../6_Advanced/<executable_name>/",
      "../../../7_CUDALibraries/<executable_name>/",
      "../../../8_Android/<executable_name>/",
      "../../../samples/<executable_name>/data/",
      "../../../common/", "../../../common/data/", "../../../data/",

      // up 4 in tree
      "../../../../",
      "../../../../src/<executable_name>/",
      "../../../../src/<executable_name>/data/",
      "../../../../src/<executable_name>/src/",
      "../../../../src/<executable_name>/inc/",
      "../../../../sandbox/<executable_name>/",
      "../../../../sandbox/<executable_name>/data/",
      "../../../../sandbox/<executable_name>/src/",
      "../../../../sandbox/<executable_name>/inc/",
      "../../../../0_Simple/<executable_name>/data/",
      "../../../../1_Utilities/<executable_name>/data/",
      "../../../../2_Graphics/<executable_name>/data/",
      "../../../../3_Imaging/<executable_name>/data/",
      "../../../../4_Finance/<executable_name>/data/",
      "../../../../5_Simulations/<executable_name>/data/",
      "../../../../6_Advanced/<executable_name>/data/",
      "../../../../7_CUDALibraries/<executable_name>/data/",
      "../../../../8_Android/<executable_name>/data/",
      "../../../../0_Simple/<executable_name>/",
      "../../../../1_Utilities/<executable_name>/",
      "../../../../2_Graphics/<executable_name>/",
      "../../../../3_Imaging/<executable_name>/",
      "../../../../4_Finance/<executable_name>/",
      "../../../../5_Simulations/<executable_name>/",
      "../../../../6_Advanced/<executable_name>/",
      "../../../../7_CUDALibraries/<executable_name>/",
      "../../../../8_Android/<executable_name>/",
      "../../../../samples/<executable_name>/data/",
      "../../../../common/", "../../../../common/data/", "../../../../data/",

      // up 5 in tree
      "../../../../../",
      "../../../../../src/<executable_name>/",
      "../../../../../src/<executable_name>/data/",
      "../../../../../src/<executable_name>/src/",
      "../../../../../src/<executable_name>/inc/",
      "../../../../../sandbox/<executable_name>/",
      "../../../../../sandbox/<executable_name>/data/",
      "../../../../../sandbox/<executable_name>/src/",
      "../../../../../sandbox/<executable_name>/inc/",
      "../../../../../0_Simple/<executable_name>/data/",
      "../../../../../1_Utilities/<executable_name>/data/",
      "../../../../../2_Graphics/<executable_name>/data/",
      "../../../../../3_Imaging/<executable_name>/data/",
      "../../../../../4_Finance/<executable_name>/data/",
      "../../../../../5_Simulations/<executable_name>/data/",
      "../../../../../6_Advanced/<executable_name>/data/",
      "../../../../../7_CUDALibraries/<executable_name>/data/",
      "../../../../../8_Android/<executable_name>/data/",
      "../../../../../samples/<executable_name>/data/",
      "../../../../../common/", "../../../../../common/data/",
  };

  // Extract the executable name
  std::string executable_name;

  if (executable_path != 0) {
    executable_name = std::string(executable_path);

#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
    // Windows path delimiter
    size_t delimiter_pos = executable_name.find_last_of('\\');
    executable_name.erase(0, delimiter_pos + 1);

    if (executable_name.rfind(".exe") != std::string::npos) {
      // we strip .exe, only if the .exe is found
      executable_name.resize(executable_name.size() - 4);
    }

#else
    // Linux & OSX path delimiter
    size_t delimiter_pos = executable_name.find_last_of('/');
    executable_name.erase(0, delimiter_pos + 1);
#endif
  }

  // Loop over all search paths and return the first hit
  for (unsigned int i = 0; i < sizeof(searchPath) / sizeof(char *); ++i) {
    std::string path(searchPath[i]);
    size_t executable_name_pos = path.find("<executable_name>");

    // If there is executable_name variable in the searchPath
    // replace it with the value
    if (executable_name_pos != std::string::npos) {
      if (executable_path != 0) {
        path.replace(executable_name_pos, strlen("<executable_name>"),
                     executable_name);
      } else {
        // Skip this path entry if no executable argument is given
        continue;
      }
    }

#ifdef _DEBUG
    printf("sdkFindFilePath <%s> in %s\n", filename, path.c_str());
#endif

    // Test if the file exists
    path.append(filename);
    FILE *fp;
    FOPEN(fp, path.c_str(), "rb");

    if (fp != NULL) {
      fclose(fp);
      // File found
      // returning an allocated array here for backwards compatibility reasons
      char *file_path = (char *)malloc(path.length() + 1);
      STRCPY(file_path, path.length() + 1, path.c_str());
      return file_path;
    }

    if (fp) {
      fclose(fp);
    }
  }

  // File not found
  return 0;
}

#endif
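These helpers come straight from the NVIDIA CUDA samples and are typically used by test and benchmark binaries for argument parsing. A hedged sketch of typical use (the flag names and the data file are illustrative, not something defined by this commit):

#include <cstdio>
#include <cstdlib>
#include "helper_string.h"

int main(int argc, char **argv) {
  // Flags are matched without their leading dashes, e.g. "-samples=100".
  if (checkCmdLineFlag(argc, (const char **)argv, "help")) {
    printf("usage: %s [-samples=N] [-device=N]\n", argv[0]);
    return 0;
  }
  int samples = getCmdLineArgumentInt(argc, (const char **)argv, "samples");
  int device = getCmdLineArgumentInt(argc, (const char **)argv, "device");
  printf("running %d samples on GPU %d\n", samples, device);

  // sdkFindFilePath walks the relative search paths above and returns a malloc'd string.
  char *path = sdkFindFilePath("test_vectors.bin", argv[0]);
  if (path != NULL) {
    printf("found companion data at %s\n", path);
    free(path);
  }
  return 0;
}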
include/keyswitch.h (new file, 24 lines)
@@ -0,0 +1,24 @@
#ifndef CNCRT_KS_H_
#define CNCRT_KS_H_

#include <cstdint>

extern "C" {

void cuda_keyswitch_lwe_ciphertext_vector_32(void *v_stream, void *lwe_out,
                                             void *lwe_in, void *ksk,
                                             uint32_t lwe_dimension_before,
                                             uint32_t lwe_dimension_after,
                                             uint32_t base_log, uint32_t l_gadget,
                                             uint32_t num_samples);

void cuda_keyswitch_lwe_ciphertext_vector_64(void *v_stream, void *lwe_out,
                                             void *lwe_in, void *ksk,
                                             uint32_t lwe_dimension_before,
                                             uint32_t lwe_dimension_after,
                                             uint32_t base_log, uint32_t l_gadget,
                                             uint32_t num_samples);

}

#endif // CNCRT_KS_H_
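A hedged sketch of a keyswitch call on data that already lives on the GPU (not part of this commit; the dimensions and decomposition parameters below are placeholders, and the stream helpers come from include/device.h):

#include <cstdint>
#include "device.h"
#include "keyswitch.h"

// Hedged sketch: keyswitch a batch of 64-bit LWE ciphertexts resident on gpu_index.
void keyswitch_batch(void *d_lwe_out, void *d_lwe_in, void *d_ksk,
                     uint32_t num_samples, uint32_t gpu_index) {
  void *stream = cuda_create_stream(gpu_index);

  cuda_keyswitch_lwe_ciphertext_vector_64(
      stream, d_lwe_out, d_lwe_in, d_ksk,
      /*lwe_dimension_before=*/1024, /*lwe_dimension_after=*/630,
      /*base_log=*/3, /*l_gadget=*/5, num_samples);

  cuda_synchronize_device(gpu_index);
  cuda_destroy_stream(stream, gpu_index);
}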
parameters/CMakeLists.txt (new file, 4 lines)
@@ -0,0 +1,4 @@
file(GLOB SOURCES
     "parameters.cpp")
add_library(cuda_parameters STATIC ${SOURCES})
set_target_properties(cuda_parameters PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
parameters/parameters.cpp (new file, 380 lines)
@@ -0,0 +1,380 @@
#include <iostream>
using namespace std;

const int NORM2_MAX = 31;
const int P_MAX = 7;

typedef struct V0Parameter {
  int k;
  int polynomialSize;
  int nSmall;
  int brLevel;
  int brLogBase;
  int ksLevel;
  int ksLogBase;

  V0Parameter(int k_, int polynomialSize_, int nSmall_, int brLevel_,
              int brLogBase_, int ksLevel_, int ksLogBase_) {
    k = k_;
    polynomialSize = polynomialSize_;
    nSmall = nSmall_;
    brLevel = brLevel_;
    brLogBase = brLogBase_;
    ksLevel = ksLevel_;
    ksLogBase = ksLogBase_;
  }

} V0Parameter;

typedef struct V0Variances {
  float logstdEncrypt;
  float logstdDecrypt;

  V0Variances(float logstdEncrypt_, float logstdDecrypt_) {
    logstdEncrypt = logstdEncrypt_;
    logstdDecrypt = logstdDecrypt_;
  }

} V0Variances;

V0Parameter parameters[NORM2_MAX][P_MAX] = {
{V0Parameter(1, 10, 472, 2, 8, 4, 2), V0Parameter(1, 10, 514, 2, 8, 5, 2),
|
||||
V0Parameter(1, 10, 564, 2, 8, 5, 2), V0Parameter(1, 10, 599, 3, 6, 6, 2),
|
||||
V0Parameter(1, 10, 686, 3, 6, 7, 2), V0Parameter(1, 11, 736, 1, 23, 5, 3),
|
||||
V0Parameter(1, 12, 830, 1, 23, 4, 4)},
|
||||
{V0Parameter(1, 10, 474, 2, 8, 4, 2), V0Parameter(1, 10, 519, 2, 8, 5, 2),
|
||||
V0Parameter(1, 10, 558, 3, 6, 5, 2), V0Parameter(1, 10, 610, 3, 6, 6, 2),
|
||||
V0Parameter(1, 11, 689, 1, 23, 4, 3), V0Parameter(1, 11, 736, 1, 23, 5, 3),
|
||||
V0Parameter(1, 12, 831, 1, 23, 4, 4)},
|
||||
{V0Parameter(1, 10, 479, 2, 8, 4, 2), V0Parameter(1, 10, 515, 3, 6, 5, 2),
|
||||
V0Parameter(1, 10, 569, 3, 6, 5, 2), V0Parameter(1, 11, 638, 1, 23, 4, 3),
|
||||
V0Parameter(1, 11, 689, 1, 23, 4, 3), V0Parameter(1, 11, 737, 1, 23, 5, 3),
|
||||
V0Parameter(1, 12, 840, 1, 23, 4, 4)},
|
||||
{V0Parameter(1, 10, 531, 2, 8, 5, 2), V0Parameter(1, 10, 523, 3, 6, 5, 2),
|
||||
V0Parameter(1, 11, 598, 1, 23, 4, 3), V0Parameter(1, 11, 639, 1, 23, 4, 3),
|
||||
V0Parameter(1, 11, 690, 1, 23, 4, 3), V0Parameter(1, 11, 739, 1, 23, 5, 3),
|
||||
V0Parameter(1, 12, 806, 2, 16, 5, 3)},
|
||||
{V0Parameter(1, 10, 483, 3, 6, 4, 2), V0Parameter(1, 11, 563, 1, 23, 3, 3),
|
||||
V0Parameter(1, 11, 598, 1, 23, 4, 3), V0Parameter(1, 11, 639, 1, 23, 4, 3),
|
||||
V0Parameter(1, 11, 691, 1, 23, 4, 3), V0Parameter(1, 11, 748, 1, 23, 5, 3),
|
||||
V0Parameter(1, 12, 806, 2, 15, 5, 3)},
|
||||
{V0Parameter(1, 11, 497, 1, 23, 4, 2), V0Parameter(1, 11, 563, 1, 23, 3, 3),
|
||||
V0Parameter(1, 11, 598, 1, 23, 4, 3), V0Parameter(1, 11, 640, 1, 23, 4, 3),
|
||||
V0Parameter(1, 11, 699, 1, 23, 4, 3), V0Parameter(1, 11, 736, 2, 15, 5, 3),
|
||||
V0Parameter(1, 12, 806, 2, 15, 5, 3)},
|
||||
{V0Parameter(1, 11, 497, 1, 23, 4, 2), V0Parameter(1, 11, 563, 1, 23, 3, 3),
|
||||
V0Parameter(1, 11, 599, 1, 23, 4, 3), V0Parameter(1, 11, 643, 1, 23, 4, 3),
|
||||
V0Parameter(1, 11, 721, 1, 23, 5, 3), V0Parameter(1, 11, 736, 2, 15, 5, 3),
|
||||
V0Parameter(1, 12, 806, 2, 15, 5, 3)},
|
||||
{V0Parameter(1, 11, 497, 1, 23, 4, 2), V0Parameter(1, 11, 564, 1, 23, 3, 3),
|
||||
V0Parameter(1, 11, 602, 1, 23, 4, 3), V0Parameter(1, 11, 671, 1, 23, 4, 3),
|
||||
V0Parameter(1, 11, 689, 2, 15, 4, 3), V0Parameter(1, 11, 736, 2, 15, 5, 3),
|
||||
V0Parameter(1, 12, 807, 2, 15, 5, 3)},
|
||||
{V0Parameter(1, 11, 498, 1, 23, 4, 2), V0Parameter(1, 11, 569, 1, 23, 3, 3),
|
||||
V0Parameter(1, 11, 622, 1, 23, 4, 3), V0Parameter(1, 11, 638, 2, 15, 4, 3),
|
||||
V0Parameter(1, 11, 689, 2, 16, 4, 3), V0Parameter(1, 11, 736, 2, 16, 5, 3),
|
||||
V0Parameter(1, 12, 809, 2, 15, 5, 3)},
|
||||
{V0Parameter(1, 11, 502, 1, 23, 4, 2), V0Parameter(1, 11, 555, 1, 23, 5, 2),
|
||||
V0Parameter(1, 11, 579, 2, 15, 5, 2), V0Parameter(1, 11, 638, 2, 15, 4, 3),
|
||||
V0Parameter(1, 11, 689, 2, 15, 4, 3), V0Parameter(1, 11, 737, 2, 15, 5, 3),
|
||||
V0Parameter(1, 12, 818, 2, 15, 5, 3)},
|
||||
{V0Parameter(1, 11, 537, 1, 23, 3, 3), V0Parameter(1, 11, 532, 2, 15, 5, 2),
|
||||
V0Parameter(1, 11, 579, 2, 15, 5, 2), V0Parameter(1, 11, 638, 2, 15, 4, 3),
|
||||
V0Parameter(1, 11, 690, 2, 15, 4, 3), V0Parameter(1, 11, 738, 2, 15, 5, 3),
|
||||
V0Parameter(1, 12, 832, 2, 15, 9, 2)},
|
||||
{V0Parameter(1, 11, 497, 2, 15, 4, 2), V0Parameter(1, 11, 532, 2, 15, 5, 2),
|
||||
V0Parameter(1, 11, 579, 2, 15, 5, 2), V0Parameter(1, 11, 639, 2, 15, 4, 3),
|
||||
V0Parameter(1, 11, 691, 2, 15, 4, 3), V0Parameter(1, 11, 743, 2, 16, 5, 3),
|
||||
V0Parameter(1, 12, 807, 3, 12, 5, 3)},
|
||||
{V0Parameter(1, 11, 497, 2, 15, 4, 2), V0Parameter(1, 11, 532, 2, 15, 5, 2),
|
||||
V0Parameter(1, 11, 579, 2, 16, 5, 2), V0Parameter(1, 11, 639, 2, 15, 4, 3),
|
||||
V0Parameter(1, 11, 695, 2, 16, 4, 3),
|
||||
V0Parameter(1, 11, 757, 2, 16, 16, 1),
|
||||
V0Parameter(1, 12, 811, 3, 12, 5, 3)},
|
||||
{V0Parameter(1, 11, 497, 2, 16, 4, 2), V0Parameter(1, 11, 533, 2, 15, 5, 2),
|
||||
V0Parameter(1, 11, 580, 2, 15, 5, 2), V0Parameter(1, 11, 641, 2, 16, 4, 3),
|
||||
V0Parameter(1, 11, 699, 2, 16, 5, 3), V0Parameter(1, 11, 737, 3, 12, 5, 3),
|
||||
V0Parameter(1, 12, 788, 3, 12, 8, 2)},
|
||||
{V0Parameter(1, 11, 497, 2, 16, 4, 2), V0Parameter(1, 11, 533, 2, 15, 5, 2),
|
||||
V0Parameter(1, 11, 583, 2, 16, 5, 2), V0Parameter(1, 11, 653, 2, 16, 4, 3),
|
||||
V0Parameter(1, 11, 665, 3, 12, 6, 2), V0Parameter(1, 11, 738, 3, 12, 5, 3),
|
||||
V0Parameter(1, 12, 775, 4, 9, 8, 2)},
|
||||
{V0Parameter(1, 11, 498, 2, 15, 4, 2), V0Parameter(1, 11, 535, 2, 16, 5, 2),
|
||||
V0Parameter(1, 11, 610, 2, 16, 4, 3), V0Parameter(1, 11, 614, 3, 12, 6, 2),
|
||||
V0Parameter(1, 11, 666, 3, 12, 6, 2), V0Parameter(1, 11, 747, 3, 12, 5, 3),
|
||||
V0Parameter(1, 12, 782, 4, 9, 8, 2)},
|
||||
{V0Parameter(1, 11, 500, 2, 16, 4, 2), V0Parameter(1, 11, 544, 2, 16, 5, 2),
|
||||
V0Parameter(1, 11, 580, 3, 12, 5, 2), V0Parameter(1, 11, 615, 3, 12, 6, 2),
|
||||
V0Parameter(1, 11, 661, 3, 12, 7, 2), V0Parameter(1, 11, 715, 4, 9, 7, 2),
|
||||
V0Parameter(1, 12, 778, 5, 8, 8, 2)},
|
||||
{V0Parameter(1, 11, 513, 2, 16, 4, 2), V0Parameter(1, 11, 533, 3, 12, 5, 2),
|
||||
V0Parameter(1, 11, 581, 3, 12, 5, 2), V0Parameter(1, 11, 618, 3, 12, 6, 2),
|
||||
V0Parameter(1, 11, 687, 3, 12, 7, 2), V0Parameter(1, 11, 726, 4, 9, 7, 2),
|
||||
V0Parameter(1, 12, 809, 5, 8, 8, 2)},
|
||||
{V0Parameter(1, 11, 497, 3, 12, 4, 2), V0Parameter(1, 11, 533, 3, 12, 5, 2),
|
||||
V0Parameter(1, 11, 585, 3, 12, 5, 2), V0Parameter(1, 11, 639, 3, 12, 6, 2),
|
||||
V0Parameter(1, 11, 662, 4, 9, 7, 2), V0Parameter(1, 11, 717, 5, 8, 7, 2),
|
||||
V0Parameter(1, 12, 820, 6, 7, 9, 2)},
|
||||
{V0Parameter(1, 11, 498, 3, 12, 4, 2), V0Parameter(1, 11, 536, 3, 12, 5, 2),
|
||||
V0Parameter(1, 11, 593, 3, 12, 6, 2), V0Parameter(1, 11, 619, 4, 9, 6, 2),
|
||||
V0Parameter(1, 11, 693, 4, 9, 7, 2), V0Parameter(1, 11, 737, 5, 8, 7, 2),
|
||||
V0Parameter(1, 12, 788, 8, 5, 8, 2)},
|
||||
{V0Parameter(1, 11, 502, 3, 12, 4, 2), V0Parameter(1, 11, 552, 3, 12, 5, 2),
|
||||
V0Parameter(1, 11, 585, 4, 9, 5, 2), V0Parameter(1, 11, 644, 4, 9, 6, 2),
|
||||
V0Parameter(1, 11, 665, 5, 8, 7, 2), V0Parameter(1, 11, 736, 6, 7, 8, 2),
|
||||
V0Parameter(1, 12, 786, 11, 4, 8, 2)},
|
||||
{V0Parameter(1, 11, 508, 3, 12, 5, 2), V0Parameter(1, 11, 536, 4, 9, 5, 2),
|
||||
V0Parameter(1, 11, 596, 4, 9, 6, 2), V0Parameter(1, 11, 621, 5, 8, 6, 2),
|
||||
V0Parameter(1, 11, 667, 6, 7, 7, 2), V0Parameter(1, 11, 746, 7, 6, 8, 2),
|
||||
V0Parameter(1, 12, 798, 14, 3, 9, 2)},
|
||||
{V0Parameter(1, 11, 502, 4, 9, 4, 2), V0Parameter(1, 11, 555, 4, 9, 5, 2),
|
||||
V0Parameter(1, 11, 580, 5, 8, 6, 2), V0Parameter(1, 11, 623, 6, 7, 6, 2),
|
||||
V0Parameter(1, 11, 669, 7, 6, 7, 2), V0Parameter(1, 11, 723, 11, 4, 7, 2),
|
||||
V0Parameter(1, 12, 814, 22, 2, 9, 2)},
|
||||
{V0Parameter(1, 11, 510, 4, 9, 5, 2), V0Parameter(1, 11, 539, 5, 8, 5, 2),
|
||||
V0Parameter(1, 11, 636, 5, 8, 6, 2), V0Parameter(1, 11, 625, 7, 6, 6, 2),
|
||||
V0Parameter(1, 11, 674, 9, 5, 7, 2), V0Parameter(1, 11, 735, 14, 3, 8, 2),
|
||||
V0Parameter(0, 0, 0, 0, 0, 0, 0)},
|
||||
{V0Parameter(1, 11, 498, 5, 8, 5, 2), V0Parameter(1, 11, 579, 5, 8, 6, 2),
|
||||
V0Parameter(1, 11, 583, 7, 6, 6, 2), V0Parameter(1, 11, 661, 8, 5, 7, 2),
|
||||
V0Parameter(1, 11, 681, 11, 4, 7, 2), V0Parameter(1, 11, 736, 22, 2, 8, 2),
|
||||
V0Parameter(0, 0, 0, 0, 0, 0, 0)},
|
||||
{V0Parameter(1, 11, 530, 5, 8, 5, 2), V0Parameter(1, 11, 541, 7, 6, 5, 2),
|
||||
V0Parameter(1, 11, 611, 8, 5, 6, 2), V0Parameter(1, 11, 635, 11, 4, 6, 2),
|
||||
V0Parameter(1, 11, 704, 15, 3, 7, 2), V0Parameter(0, 0, 0, 0, 0, 0, 0),
|
||||
V0Parameter(0, 0, 0, 0, 0, 0, 0)},
|
||||
{V0Parameter(1, 11, 565, 6, 7, 5, 2), V0Parameter(1, 11, 569, 8, 5, 5, 2),
|
||||
V0Parameter(1, 11, 590, 11, 4, 6, 2), V0Parameter(1, 11, 647, 15, 3, 7, 2),
|
||||
V0Parameter(1, 11, 679, 44, 1, 14, 1), V0Parameter(0, 0, 0, 0, 0, 0, 0),
|
||||
V0Parameter(0, 0, 0, 0, 0, 0, 0)},
|
||||
{V0Parameter(1, 11, 520, 8, 5, 5, 2), V0Parameter(1, 11, 549, 11, 4, 5, 2),
|
||||
V0Parameter(1, 11, 600, 15, 3, 6, 2),
|
||||
V0Parameter(1, 11, 628, 44, 1, 13, 1), V0Parameter(0, 0, 0, 0, 0, 0, 0),
|
||||
V0Parameter(0, 0, 0, 0, 0, 0, 0), V0Parameter(0, 0, 0, 0, 0, 0, 0)},
|
||||
{V0Parameter(1, 11, 506, 11, 4, 5, 2), V0Parameter(1, 11, 559, 15, 3, 5, 2),
|
||||
V0Parameter(1, 11, 584, 44, 1, 12, 1), V0Parameter(0, 0, 0, 0, 0, 0, 0),
|
||||
V0Parameter(0, 0, 0, 0, 0, 0, 0), V0Parameter(0, 0, 0, 0, 0, 0, 0),
|
||||
V0Parameter(0, 0, 0, 0, 0, 0, 0)},
|
||||
{V0Parameter(1, 11, 503, 15, 3, 9, 1),
|
||||
V0Parameter(1, 11, 594, 23, 2, 12, 1), V0Parameter(0, 0, 0, 0, 0, 0, 0),
|
||||
V0Parameter(0, 0, 0, 0, 0, 0, 0), V0Parameter(0, 0, 0, 0, 0, 0, 0),
|
||||
V0Parameter(0, 0, 0, 0, 0, 0, 0), V0Parameter(0, 0, 0, 0, 0, 0, 0)},
|
||||
{V0Parameter(1, 11, 545, 22, 2, 11, 1), V0Parameter(0, 0, 0, 0, 0, 0, 0),
|
||||
V0Parameter(0, 0, 0, 0, 0, 0, 0), V0Parameter(0, 0, 0, 0, 0, 0, 0),
|
||||
V0Parameter(0, 0, 0, 0, 0, 0, 0), V0Parameter(0, 0, 0, 0, 0, 0, 0),
|
||||
V0Parameter(0, 0, 0, 0, 0, 0, 0)}};
|
||||
|
||||
V0Variances variances[NORM2_MAX][P_MAX] = {
|
||||
{V0Variances(-7.186489389863581, -8.186489389863581),
|
||||
V0Variances(-7.124998639947563, -8.124998639947563),
|
||||
V0Variances(-7.058035238345106, -8.058035238345106),
|
||||
V0Variances(-8.771444355069676, -9.771444355069676),
|
||||
V0Variances(-8.673618068377664, -9.673618068377664),
|
||||
V0Variances(-14.282552902365403, -15.282552902365403),
|
||||
V0Variances(-13.46973411689919, -14.46973411689919)},
|
||||
{V0Variances(-6.183439290095372, -8.183439290095372),
|
||||
V0Variances(-6.118015550365563, -8.118015550365563),
|
||||
V0Variances(-7.822589795549476, -9.822589795549476),
|
||||
V0Variances(-7.758317735238947, -9.758317735238947),
|
||||
V0Variances(-13.330153794041763, -15.330153794041763),
|
||||
V0Variances(-13.282552902365403, -15.282552902365403),
|
||||
V0Variances(-12.468865546631157, -14.468865546631157)},
|
||||
{V0Variances(-5.175869991676414, -8.175869991676414),
|
||||
V0Variances(-6.8804361404287775, -9.880436140428777),
|
||||
V0Variances(-6.808508030310776, -9.808508030310776),
|
||||
V0Variances(-12.38562757351147, -15.38562757351147),
|
||||
V0Variances(-12.330153794041763, -15.330153794041763),
|
||||
V0Variances(-12.281573475846372, -15.281573475846372),
|
||||
V0Variances(-11.46109512118327, -14.46109512118327)},
|
||||
{V0Variances(-4.101526889142427, -8.101526889142427),
|
||||
V0Variances(-5.869316883340595, -9.869316883340595),
|
||||
V0Variances(-11.432333043294854, -15.432333043294854),
|
||||
V0Variances(-11.384497819920412, -15.384497819920412),
|
||||
V0Variances(-11.329107604561145, -15.329107604561145),
|
||||
V0Variances(-11.279618603320834, -15.279618603320834),
|
||||
V0Variances(-17.494100927447505, -21.494100927447505)},
|
||||
{V0Variances(-4.926710762046184, -9.926710762046184),
|
||||
V0Variances(-10.475838324353795, -15.475838324353795),
|
||||
V0Variances(-10.432333043294854, -15.432333043294854),
|
||||
V0Variances(-10.384497819920412, -15.384497819920412),
|
||||
V0Variances(-10.328062930199778, -15.328062930199778),
|
||||
V0Variances(-10.270886650450088, -15.270886650450088),
|
||||
V0Variances(-16.68961710493341, -21.68961710493341)},
|
||||
{V0Variances(-9.565782859612767, -15.565782859612767),
|
||||
V0Variances(-9.475838324353795, -15.475838324353795),
|
||||
V0Variances(-9.432333043294854, -15.432333043294854),
|
||||
V0Variances(-9.38336983295023, -15.38336983295023),
|
||||
V0Variances(-9.319759557706192, -15.319759557706192),
|
||||
V0Variances(-16.395716120060897, -22.395716120060897),
|
||||
V0Variances(-15.689617104933411, -21.68961710493341)},
|
||||
{V0Variances(-8.565782859612767, -15.565782859612767),
|
||||
V0Variances(-8.475838324353795, -15.475838324353795),
|
||||
V0Variances(-8.431127783999514, -15.431127783999514),
|
||||
V0Variances(-8.37999641672993, -15.37999641672993),
|
||||
V0Variances(-8.297406155773494, -15.297406155773494),
|
||||
V0Variances(-15.395716120060897, -22.395716120060897),
|
||||
V0Variances(-14.689617104933411, -21.68961710493341)},
|
||||
{V0Variances(-7.565782859612767, -15.565782859612767),
|
||||
V0Variances(-7.4745582041945084, -15.474558204194508),
|
||||
V0Variances(-7.427524042014056, -15.427524042014056),
|
||||
V0Variances(-7.349249402293815, -15.349249402293815),
|
||||
V0Variances(-14.443317011737257, -22.443317011737257),
|
||||
V0Variances(-14.395716120060897, -22.395716120060897),
|
||||
V0Variances(-13.688722687558503, -21.688722687558503)},
|
||||
{V0Variances(-6.564332914359866, -15.564332914359866),
|
||||
V0Variances(-6.4681914592406144, -15.468191459240614),
|
||||
V0Variances(-6.403948495328606, -15.403948495328606),
|
||||
V0Variances(-13.498790791206964, -22.498790791206964),
|
||||
V0Variances(-13.669055725537874, -22.669055725537874),
|
||||
V0Variances(-13.621454833861513, -22.621454833861513),
|
||||
V0Variances(-12.686937172982404, -21.686937172982404)},
|
||||
{V0Variances(-5.558562103418524, -15.558562103418524),
|
||||
V0Variances(-5.486161899775176, -15.486161899775176),
|
||||
V0Variances(-12.568787329094782, -22.568787329094782),
|
||||
V0Variances(-12.498790791206964, -22.498790791206964),
|
||||
V0Variances(-12.443317011737257, -22.443317011737257),
|
||||
V0Variances(-12.394736693541866, -22.394736693541866),
V0Variances(-11.678956602726522, -21.678956602726522)},
{V0Variances(-4.509944741401199, -15.509944741401199),
V0Variances(-11.629855880338809, -22.62985588033881),
V0Variances(-11.568787329094782, -22.568787329094782),
V0Variances(-11.498790791206964, -22.498790791206964),
V0Variances(-11.442270822256638, -22.44227082225664),
V0Variances(-11.393758595059204, -22.393758595059204),
V0Variances(-10.66671526012685, -21.66671526012685)},
{V0Variances(-10.678946077308261, -22.67894607730826),
V0Variances(-10.629855880338809, -22.62985588033881),
V0Variances(-10.568787329094782, -22.568787329094782),
V0Variances(-10.497661037615906, -22.497661037615906),
V0Variances(-10.441226147895271, -22.44122614789527),
V0Variances(-10.614626611620722, -22.61462661162072),
V0Variances(-13.208593433538631, -25.20859343353863)},
{V0Variances(-9.678946077308261, -22.67894607730826),
V0Variances(-9.629855880338809, -22.62985588033881),
V0Variances(-9.794526042895399, -22.7945260428954),
V0Variances(-9.497661037615906, -22.497661037615906),
V0Variances(-9.662801228084582, -22.662801228084582),
V0Variances(-9.601161066897156, -22.601161066897156),
V0Variances(-12.205026813068883, -25.205026813068883)},
{V0Variances(-8.904684791108878, -22.904684791108878),
V0Variances(-8.628501236709816, -22.628501236709816),
V0Variances(-8.567542553081935, -22.567542553081935),
V0Variances(-8.72114553858065, -22.72114553858065),
V0Variances(-8.658661489202302, -22.658661489202302),
V0Variances(-12.345258643192501, -26.3452586431925),
V0Variances(-11.225779955449333, -25.225779955449333)},
{V0Variances(-7.904684791108878, -22.904684791108878),
V0Variances(-7.628501236709816, -22.628501236709816),
V0Variances(-7.789559775289774, -22.789559775289774),
V0Variances(-7.707766221116806, -22.707766221116806),
V0Variances(-11.41941378254576, -26.41941378254576),
V0Variances(-11.34428054470984, -26.34428054470984),
V0Variances(-12.519191629935513, -27.519191629935513)},
{V0Variances(-6.67749613205536, -22.67749613205536),
V0Variances(-6.851538271245765, -22.851538271245765),
V0Variances(-6.756903095664896, -22.756903095664896),
V0Variances(-10.476971625054944, -26.476971625054944),
V0Variances(-10.418329864204402, -26.418329864204402),
V0Variances(-10.335536831345415, -26.335536831345415),
V0Variances(-11.512705481362637, -27.512705481362637)},
{V0Variances(-5.900343669558978, -22.90034366955898),
V0Variances(-5.839504391264846, -22.839504391264846),
V0Variances(-9.518064502732571, -26.51806450273257),
V0Variances(-9.475797747626736, -26.475797747626736),
V0Variances(-9.42376581698619, -26.42376581698619),
V0Variances(-11.303434156979073, -28.303434156979073),
V0Variances(-11.872866817284404, -28.872866817284404)},
{V0Variances(-4.88182830408649, -22.88182830408649),
V0Variances(-8.579023186360452, -26.579023186360452),
V0Variances(-8.51682187103777, -26.51682187103777),
V0Variances(-8.47228753378785, -26.47228753378785),
V0Variances(-8.395935903330987, -26.395935903330987),
V0Variances(-10.292421003814077, -28.292421003814077),
V0Variances(-10.844682043562514, -28.844682043562514)},
{V0Variances(-7.629468026958897, -26.629468026958897),
V0Variances(-7.579023186360452, -26.579023186360452),
V0Variances(-7.511872640504656, -26.511872640504656),
V0Variances(-7.4481829872665415, -26.44818298726654),
V0Variances(-9.358990169408344, -28.358990169408344),
V0Variances(-11.010736901553656, -30.010736901553656),
V0Variances(-10.711045072243067, -29.711045072243067)},
{V0Variances(-6.628018081705996, -26.628018081705996),
V0Variances(-6.5749744525111495, -26.57497445251115),
V0Variances(-6.502074900467036, -26.502074900467036),
V0Variances(-8.40743607320482, -28.40743607320482),
V0Variances(-8.325978101743345, -28.325978101743345),
V0Variances(-9.990891151357076, -29.990891151357076),
V0Variances(-11.223620949776631, -31.22362094977663)},
{V0Variances(-5.622247270764653, -26.622247270764653),
V0Variances(-5.5537568193509514, -26.55375681935095),
V0Variances(-7.4481874655765665, -28.448187465576567),
V0Variances(-7.378875433754644, -28.378875433754644),
V0Variances(-9.065046290710335, -30.065046290710335),
V0Variances(-9.877481950164196, -30.877481950164196),
V0Variances(-11.301978588237922, -32.30197858823793)},
{V0Variances(-4.613676704353956, -26.613676704353956),
V0Variances(-6.511289277583067, -28.511289277583067),
V0Variances(-6.434749612580873, -28.434749612580873),
V0Variances(-8.114426826794364, -30.114426826794364),
V0Variances(-8.948491452600415, -30.948491452600415),
V0Variances(-9.745555749635251, -31.74555574963525),
V0Variances(-10.928045032645045, -32.928045032645045)},
{V0Variances(-5.558562095836564, -28.558562095836564),
V0Variances(-5.486161892193216, -28.486161892193216),
V0Variances(-7.163697010897138, -30.16369701089714),
V0Variances(-7.997718751680708, -30.997718751680708),
V0Variances(-8.824140459442134, -31.824140459442134),
V0Variances(-10.447705621458134, -33.447705621458134),
V0Variances(-10.746667774832375, -33.746667774832375)},
{V0Variances(-4.547157154382525, -28.547157154382525),
V0Variances(-6.216580824528357, -30.216580824528357),
V0Variances(-6.097210078262428, -30.097210078262428),
V0Variances(-7.873215469988139, -31.87321546998814),
V0Variances(-8.652284365210448, -32.65228436521045),
V0Variances(-9.889772718943199, -33.8897727189432), V0Variances(0, 0)},
{V0Variances(-5.27365058987057, -30.27365058987057),
V0Variances(-5.164941786909992, -30.164941786909992),
V0Variances(-6.9233956231626195, -31.92339562316262),
V0Variances(-7.176178241418846, -32.17617824141885),
V0Variances(-8.490876045927656, -33.490876045927656),
V0Variances(-9.868458351961102, -34.86845835196111), V0Variances(0, 0)},
{V0Variances(-4.228727281179324, -30.228727281179324),
V0Variances(-5.977329267849463, -31.977329267849463),
V0Variances(-6.232917187263332, -32.232917187263325),
V0Variances(-7.541325149103926, -33.541325149103926),
V0Variances(-8.245745876084897, -34.2457458760849), V0Variances(0, 0),
V0Variances(0, 0)},
{V0Variances(-4.068209399541431, -31.06820939954143),
V0Variances(-5.284289051019407, -32.2842890510194),
V0Variances(-6.59434596780909, -33.59434596780909),
V0Variances(-7.306650734407292, -34.30665073440729),
V0Variances(-8.356857016386314, -35.356857016386314), V0Variances(0, 0),
V0Variances(0, 0)},
{V0Variances(-4.349247565658466, -32.349247565658466),
V0Variances(-5.646300370431092, -33.64630037043109),
V0Variances(-6.361052340155609, -34.36105234015561),
V0Variances(-7.4131805240628665, -35.41318052406287), V0Variances(0, 0),
V0Variances(0, 0), V0Variances(0, 0)},
{V0Variances(-4.705134752586538, -33.70513475258654),
V0Variances(-5.412109448981951, -34.41210944898195),
V0Variances(-6.465578619068673, -35.46557861906867), V0Variances(0, 0),
V0Variances(0, 0), V0Variances(0, 0), V0Variances(0, 0)},
{V0Variances(-4.488254390500785, -34.488254390500785),
V0Variances(-5.062180911982956, -35.062180911982956), V0Variances(0, 0),
V0Variances(0, 0), V0Variances(0, 0), V0Variances(0, 0),
V0Variances(0, 0)},
{V0Variances(-4.085183120157467, -35.08518312015747), V0Variances(0, 0),
V0Variances(0, 0), V0Variances(0, 0), V0Variances(0, 0), V0Variances(0, 0),
V0Variances(0, 0)}};

extern "C" V0Parameter *get_parameters(int norm, int p) {
  // - 1 is an offset as norm and p are in [1, ...] and not [0, ...]
  return &parameters[norm - 1][p - 1];
}

extern "C" V0Variances *get_variances(int norm, int p) {
  // - 1 is an offset as norm and p are in [1, ...] and not [0, ...]
  return &variances[norm - 1][p - 1];
}
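Both getters take 1-based indices, which is why the -1 offset appears above. A minimal, hypothetical usage sketch (the caller code below is illustrative only and not part of this commit):

void lookup_example(void) {
  // Look up the precomputed parameter set and noise variances for a
  // 2-norm index of 3 and a message precision of 4 bits (1-based on both axes).
  V0Parameter *param = get_parameters(3, 4);
  V0Variances *var = get_variances(3, 4);
  (void)param;
  (void)var;
}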
10
src/CMakeLists.txt
Normal file
@@ -0,0 +1,10 @@
set(SOURCES ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap.h
    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/keyswitch.h)
file(GLOB SOURCES
    "*.cu"
    "*.h"
    "fft/*.cu")
add_library(concrete_cuda STATIC ${SOURCES})
set_target_properties(concrete_cuda PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(concrete_cuda PUBLIC cudart)
target_include_directories(concrete_cuda PRIVATE .)
1
src/bootstrap.cu
Normal file
@@ -0,0 +1 @@
#include "crypto/bootstrapping_key.cuh"
174
src/bootstrap_amortized.cu
Normal file
@@ -0,0 +1,174 @@
|
||||
#include "bootstrap_amortized.cuh"
|
||||
|
||||
/* Perform bootstrapping on a batch of input LWE ciphertexts
|
||||
*
|
||||
* - lwe_out: output batch of num_samples bootstrapped ciphertexts c =
|
||||
* (a0,..an-1,b) where n is the LWE dimension
|
||||
* - lut_vector: should hold as many test vectors of size polynomial_size
|
||||
* as there are input ciphertexts, but actually holds
|
||||
* num_lut_vectors vectors to reduce memory usage
|
||||
* - lut_vector_indexes: stores the index corresponding to
|
||||
* which test vector to use for each sample in
|
||||
* lut_vector
|
||||
* - lwe_in: input batch of num_samples LWE ciphertexts, containing n
|
||||
* mask values + 1 body value
|
||||
* - bootstrapping_key: RGSW encryption of the LWE secret key sk1
|
||||
* under secret key sk2
|
||||
* bsk = Z + sk1 H
|
||||
* where H is the gadget matrix and Z is a matrix (k+1).l
|
||||
* containing GLWE encryptions of 0 under sk2.
|
||||
* bsk is thus a tensor of size (k+1)^2.l.N.n
|
||||
* where l is the number of decomposition levels and
|
||||
* k is the GLWE dimension, N is the polynomial size for
|
||||
* GLWE. The polynomial size for GLWE and the test vector
|
||||
* are the same because they have to be in the same ring
|
||||
* to be multiplied.
|
||||
* Note: it is necessary to generate (k+1).k.l.N.n
|
||||
* uniformly random coefficients for the zero encryptions
|
||||
* - input_lwe_dimension: size of the Torus vector used to encrypt the input
|
||||
* LWE ciphertexts - referred to as n above (~ 600)
|
||||
* - polynomial_size: size of the test polynomial (test vector) and size of the
|
||||
* GLWE polynomial (~1024)
|
||||
* - base_log: log base used for the gadget matrix - B = 2^base_log (~8)
|
||||
* - l_gadget: number of decomposition levels in the gadget matrix (~4)
|
||||
* - num_samples: number of encrypted input messages
|
||||
* - num_lut_vectors: parameter to set the actual number of test vectors to be
|
||||
* used
|
||||
 * - q: number of bits in the integer representation (32 or 64)
|
||||
*
|
||||
* This function calls a wrapper to a device kernel that performs the
|
||||
* bootstrapping:
|
||||
* - the kernel is templatized based on integer discretization and
|
||||
* polynomial degree
|
||||
* - num_samples blocks of threads are launched, where each thread is going
|
||||
* to handle one or more polynomial coefficients at each stage:
|
||||
* - perform the blind rotation
|
||||
* - round the result
|
||||
* - decompose into l_gadget levels, then for each level:
|
||||
* - switch to the FFT domain
|
||||
* - multiply with the bootstrapping key
|
||||
* - come back to the coefficients representation
|
||||
* - between each stage a synchronization of the threads is necessary
|
||||
* - in case the device has enough shared memory, temporary arrays used for
|
||||
* the different stages (accumulators) are stored into the shared memory
|
||||
* - the accumulators serve to combine the results for all decomposition
|
||||
* levels
|
||||
* - the constant memory (64K) is used for storing the roots of identity
|
||||
* values for the FFT
|
||||
*/
|
||||
|
||||
void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
|
||||
void *v_stream,
|
||||
void *lwe_out,
|
||||
void *lut_vector,
|
||||
void *lut_vector_indexes,
|
||||
void *lwe_in,
|
||||
void *bootstrapping_key,
|
||||
uint32_t input_lwe_dimension,
|
||||
uint32_t polynomial_size,
|
||||
uint32_t base_log,
|
||||
uint32_t l_gadget,
|
||||
uint32_t num_samples,
|
||||
uint32_t num_lut_vectors,
|
||||
uint32_t lwe_idx,
|
||||
uint32_t max_shared_memory) {
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 512:
|
||||
host_bootstrap_amortized<uint32_t, Degree<512>>(
|
||||
v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector,
|
||||
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in,
|
||||
(double2 *)bootstrapping_key, input_lwe_dimension, polynomial_size,
|
||||
base_log, l_gadget, num_samples,
|
||||
num_lut_vectors, lwe_idx, max_shared_memory);
|
||||
break;
|
||||
case 1024:
|
||||
host_bootstrap_amortized<uint32_t, Degree<1024>>(
|
||||
v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector,
|
||||
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in,
|
||||
(double2 *)bootstrapping_key, input_lwe_dimension, polynomial_size, base_log, l_gadget, num_samples,
|
||||
num_lut_vectors, lwe_idx, max_shared_memory);
|
||||
break;
|
||||
case 2048:
|
||||
host_bootstrap_amortized<uint32_t, Degree<2048>>(
|
||||
v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector,
|
||||
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in,
|
||||
(double2 *)bootstrapping_key, input_lwe_dimension, polynomial_size, base_log, l_gadget, num_samples,
|
||||
num_lut_vectors, lwe_idx, max_shared_memory);
|
||||
break;
|
||||
case 4096:
|
||||
host_bootstrap_amortized<uint32_t, Degree<4096>>(
|
||||
v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector,
|
||||
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in,
|
||||
(double2 *)bootstrapping_key, input_lwe_dimension, polynomial_size, base_log, l_gadget, num_samples,
|
||||
num_lut_vectors, lwe_idx, max_shared_memory);
|
||||
break;
|
||||
case 8192:
|
||||
host_bootstrap_amortized<uint32_t, Degree<8192>>(
|
||||
v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector,
|
||||
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in,
|
||||
(double2 *)bootstrapping_key, input_lwe_dimension, polynomial_size, base_log, l_gadget, num_samples,
|
||||
num_lut_vectors, lwe_idx, max_shared_memory);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
|
||||
void *v_stream,
|
||||
void *lwe_out,
|
||||
void *lut_vector,
|
||||
void *lut_vector_indexes,
|
||||
void *lwe_in,
|
||||
void *bootstrapping_key,
|
||||
uint32_t input_lwe_dimension,
|
||||
uint32_t polynomial_size,
|
||||
uint32_t base_log,
|
||||
uint32_t l_gadget,
|
||||
uint32_t num_samples,
|
||||
uint32_t num_lut_vectors,
|
||||
uint32_t lwe_idx,
|
||||
uint32_t max_shared_memory) {
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 512:
|
||||
host_bootstrap_amortized<uint64_t, Degree<512>>(
|
||||
v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector,
|
||||
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in,
|
||||
(double2 *)bootstrapping_key, input_lwe_dimension, polynomial_size,
|
||||
base_log, l_gadget, num_samples,
|
||||
num_lut_vectors, lwe_idx, max_shared_memory);
|
||||
break;
|
||||
case 1024:
|
||||
host_bootstrap_amortized<uint64_t, Degree<1024>>(
|
||||
v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector,
|
||||
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in,
|
||||
(double2 *)bootstrapping_key, input_lwe_dimension, polynomial_size, base_log, l_gadget, num_samples,
|
||||
num_lut_vectors, lwe_idx, max_shared_memory);
|
||||
break;
|
||||
case 2048:
|
||||
host_bootstrap_amortized<uint64_t, Degree<2048>>(
|
||||
v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector,
|
||||
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in,
|
||||
(double2 *)bootstrapping_key, input_lwe_dimension, polynomial_size, base_log, l_gadget, num_samples,
|
||||
num_lut_vectors, lwe_idx, max_shared_memory);
|
||||
break;
|
||||
case 4096:
|
||||
host_bootstrap_amortized<uint64_t, Degree<4096>>(
|
||||
v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector,
|
||||
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in,
|
||||
(double2 *)bootstrapping_key, input_lwe_dimension, polynomial_size, base_log, l_gadget, num_samples,
|
||||
num_lut_vectors, lwe_idx, max_shared_memory);
|
||||
break;
|
||||
case 8192:
|
||||
host_bootstrap_amortized<uint64_t, Degree<8192>>(
|
||||
v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector,
|
||||
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in,
|
||||
(double2 *)bootstrapping_key, input_lwe_dimension, polynomial_size, base_log, l_gadget, num_samples,
|
||||
num_lut_vectors, lwe_idx, max_shared_memory);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
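Note that both dispatchers above only handle polynomial sizes 512 through 8192; any other value falls through the default branch and silently performs no bootstrap. The sketch below shows, under stated assumptions, how a host-side caller might invoke the 64-bit entry point; the buffer allocation, the key conversion, and the concrete parameter values are illustrative assumptions, not something this file prescribes.

// Hedged usage sketch: all device pointers are assumed to have been allocated
// with cudaMalloc and filled beforehand (inputs, test vectors, indexes, and the
// bootstrapping key already converted to the Fourier domain).
void example_amortized_pbs_64(void *stream, void *d_lwe_out, void *d_lut_vector,
                              void *d_lut_vector_indexes, void *d_lwe_in,
                              void *d_fourier_bsk) {
  uint32_t lwe_dimension = 600;     // n, number of mask elements per input LWE
  uint32_t polynomial_size = 1024;  // N, degree of the GLWE/test polynomials
  uint32_t base_log = 8;            // gadget base B = 2^base_log
  uint32_t l_gadget = 4;            // decomposition levels
  uint32_t num_samples = 100;       // ciphertexts bootstrapped in one batch
  uint32_t num_lut_vectors = 1;     // one test vector shared by every sample

  cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
      stream, d_lwe_out, d_lut_vector, d_lut_vector_indexes, d_lwe_in,
      d_fourier_bsk, lwe_dimension, polynomial_size, base_log, l_gadget,
      num_samples, num_lut_vectors, /*lwe_idx=*/0,
      /*max_shared_memory=*/49152);
}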
|
||||
466
src/bootstrap_amortized.cuh
Normal file
@@ -0,0 +1,466 @@
|
||||
#ifdef __CDT_PARSER__
|
||||
#undef __CUDA_RUNTIME_H__
|
||||
#include <cuda_runtime.h>
|
||||
#include <helper_cuda.h>
|
||||
#endif
|
||||
|
||||
#ifndef CNCRT_AMORTIZED_PBS_H
|
||||
#define CNCRT_AMORTIZED_PBS_H
|
||||
|
||||
#include "cooperative_groups.h"
|
||||
|
||||
#include "../include/helper_cuda.h"
|
||||
#include "bootstrap.h"
|
||||
#include "complex/operations.cuh"
|
||||
//#include "crypto/bootstrapping_key.cuh"
|
||||
#include "crypto/gadget.cuh"
|
||||
#include "crypto/torus.cuh"
|
||||
#include "fft/bnsmfft.cuh"
|
||||
#include "fft/smfft.cuh"
|
||||
#include "fft/twiddles.cuh"
|
||||
#include "polynomial/functions.cuh"
|
||||
#include "polynomial/parameters.cuh"
|
||||
#include "polynomial/polynomial.cuh"
|
||||
#include "polynomial/polynomial_math.cuh"
|
||||
#include "utils/memory.cuh"
|
||||
#include "utils/timer.cuh"
|
||||
|
||||
template <typename Torus, class params, sharedMemDegree SMD>
|
||||
/*
|
||||
* Kernel launched by host_bootstrap_amortized
|
||||
*
|
||||
* Uses shared memory to increase performance
|
||||
* - lwe_out: output batch of num_samples bootstrapped ciphertexts c =
|
||||
* (a0,..an-1,b) where n is the LWE dimension
|
||||
* - lut_vector: should hold as many test vectors of size polynomial_size
|
||||
* as there are input ciphertexts, but actually holds
|
||||
* num_lut_vectors vectors to reduce memory usage
|
||||
* - lut_vector_indexes: stores the index corresponding to which test vector
|
||||
* to use for each sample in lut_vector
|
||||
* - lwe_in: input batch of num_samples LWE ciphertexts, containing n mask
|
||||
* values + 1 body value
|
||||
* - bootstrapping_key: RGSW encryption of the LWE secret key sk1 under secret
|
||||
* key sk2
|
||||
* - device_mem: pointer to the device's global memory in case we use it (SMD
|
||||
* == NOSM or PARTIALSM)
|
||||
* - lwe_mask_size: size of the Torus vector used to encrypt the input
|
||||
* LWE ciphertexts - referred to as n above (~ 600)
|
||||
* - polynomial_size: size of the test polynomial (test vector) and size of the
|
||||
* GLWE polynomial (~1024)
|
||||
* - base_log: log base used for the gadget matrix - B = 2^base_log (~8)
|
||||
* - l_gadget: number of decomposition levels in the gadget matrix (~4)
|
||||
* - gpu_num: index of the current GPU (useful for multi-GPU computations)
|
||||
* - lwe_idx: equal to the number of samples per gpu x gpu_num
|
||||
* - device_memory_size_per_sample: amount of global memory to allocate if SMD
|
||||
* is not FULLSM
|
||||
*/
|
||||
__global__ void device_bootstrap_amortized(
|
||||
Torus *lwe_out,
|
||||
Torus *lut_vector,
|
||||
uint32_t *lut_vector_indexes,
|
||||
Torus *lwe_in,
|
||||
double2 *bootstrapping_key,
|
||||
char *device_mem,
|
||||
uint32_t lwe_mask_size,
|
||||
uint32_t polynomial_size,
|
||||
uint32_t base_log,
|
||||
uint32_t l_gadget,
|
||||
uint32_t lwe_idx,
|
||||
size_t device_memory_size_per_sample) {
|
||||
// We use shared memory for the polynomials that are used often during the
|
||||
// bootstrap, since shared memory is kept in L1 cache and accessing it is
|
||||
// much faster than global memory
|
||||
extern __shared__ char sharedmem[];
|
||||
char *selected_memory;
|
||||
|
||||
if constexpr (SMD == FULLSM)
|
||||
selected_memory = sharedmem;
|
||||
else
|
||||
selected_memory = &device_mem[blockIdx.x * device_memory_size_per_sample];
|
||||
|
||||
// For GPU bootstrapping the RLWE dimension is hard-set to 1: there is only
// one mask polynomial and one body to handle. Also, since the decomposed
// polynomials take coefficients between -B/2 and B/2, they can be represented
// with only 16 bits, assuming the base B = 2^base_log does not exceed 2^16
|
||||
int16_t *accumulator_mask_decomposed = (int16_t *)selected_memory;
|
||||
// TODO (Agnes) why not the 16 bits representation here?
|
||||
int16_t *accumulator_body_decomposed =
|
||||
(int16_t *)accumulator_mask_decomposed + polynomial_size;
|
||||
Torus *accumulator_mask = (Torus *)accumulator_body_decomposed +
|
||||
polynomial_size / (sizeof(Torus) / sizeof(int16_t));
|
||||
Torus *accumulator_body =
|
||||
(Torus *)accumulator_mask + (ptrdiff_t)polynomial_size;
|
||||
Torus *accumulator_mask_rotated =
|
||||
(Torus *)accumulator_body + (ptrdiff_t)polynomial_size;
|
||||
Torus *accumulator_body_rotated =
|
||||
(Torus *)accumulator_mask_rotated + (ptrdiff_t)polynomial_size;
|
||||
double2 *mask_res_fft = (double2 *)accumulator_body_rotated +
|
||||
polynomial_size / (sizeof(double2) / sizeof(Torus));
|
||||
double2 *body_res_fft =
|
||||
(double2 *)mask_res_fft + (ptrdiff_t)polynomial_size / 2;
|
||||
double2 *accumulator_fft = (double2 *)sharedmem;
|
||||
if constexpr (SMD != PARTIALSM)
|
||||
accumulator_fft =
|
||||
(double2 *)body_res_fft + (ptrdiff_t)(polynomial_size / 2);
|
||||
|
||||
/*
|
||||
int dif0 = ((char*)accumulator_body_decomposed - (char*)selected_memory);
|
||||
int dif1 = ((char*)accumulator_mask - (char*)accumulator_body_decomposed);
|
||||
int dif2 = ((char*)accumulator_body - (char*)accumulator_mask);
|
||||
int dif3 = ((char*)accumulator_mask_rotated - (char*)accumulator_body);
|
||||
int dif4 = ((char*)accumulator_body_rotated -
|
||||
(char*)accumulator_mask_rotated); int dif5 = ((char*)mask_res_fft -
|
||||
(char*)accumulator_body_rotated); int dif6 = ((char*)body_res_fft -
|
||||
(char*)mask_res_fft); int dif7 = (SMD != PARTIALSM)? (char*)accumulator_fft -
|
||||
(char*)body_res_fft:0; if (threadIdx.x == 0 && blockIdx.x == 0) {
|
||||
printf("device and shared mem: %d %d %d %d %d %d %d %d\n ",dif0, dif1, dif2,
|
||||
dif3, dif4, dif5, dif6, dif7);
|
||||
}
|
||||
*/
|
||||
|
||||
auto block_lwe_in = &lwe_in[blockIdx.x * (lwe_mask_size + 1)];
|
||||
Torus *block_lut_vector =
|
||||
&lut_vector[lut_vector_indexes[lwe_idx + blockIdx.x] * params::degree * 2];
|
||||
|
||||
// TODO (Agnes) try to store the gadget matrix in const memory to see if
|
||||
// register use decreases Since all const mem is used for twiddles currently,
|
||||
// it would mean moving some of them to global memory instead
|
||||
GadgetMatrix<Torus, params> gadget(base_log, l_gadget);
|
||||
|
||||
// Put "b", the body, in [0, 2N[
|
||||
Torus b_hat = rescale_torus_element(
|
||||
block_lwe_in[lwe_mask_size],
|
||||
2 * params::degree); // 2 * params::log2_degree + 1);
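// (Illustrative note, not part of the original code: rescale_torus_element is
// assumed to map a Torus value t in [0, 2^bits) onto [0, 2N) roughly as
// round(t * 2N / 2^bits), so b_hat and the a_hat values below act as rotation
// amounts for the negacyclic multiplications by X^b_hat and X^a_hat.)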
|
||||
|
||||
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
accumulator_mask, block_lut_vector, b_hat, false);
|
||||
|
||||
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
accumulator_body, &block_lut_vector[params::degree], b_hat, false);
|
||||
|
||||
// Loop over all the mask elements of the sample to accumulate
|
||||
// (X^a_i-1) multiplication, decomposition of the resulting polynomial
|
||||
// into l_gadget polynomials, and performing polynomial multiplication
|
||||
// via an FFT with the RGSW encrypted secret key
|
||||
for (int iteration = 0; iteration < lwe_mask_size; iteration++) {
|
||||
// TODO make sure that following sync is necessary
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// Put "a" in [0, 2N[ instead of Zq
|
||||
Torus a_hat = rescale_torus_element(
|
||||
block_lwe_in[iteration],
|
||||
2 * params::degree); // 2 * params::log2_degree + 1);
|
||||
|
||||
// TODO (Agnes) why is there this if condition?
|
||||
if (a_hat == 0) {
|
||||
// todo(Joao): **cannot use this optimization**
|
||||
// the reason is that one of the input ciphertexts (blockIdx.z)
|
||||
// might skip an iteration while others don't, which as a result
|
||||
// will make that block not call the grid.sync(), causing a deadlock;
|
||||
// maybe it's a workaround to add grid.sync() here, but not sure if
|
||||
// there are any edge cases?
|
||||
|
||||
// continue
|
||||
}
|
||||
|
||||
// Perform ACC * (X^a_hat - 1)
|
||||
multiply_by_monomial_negacyclic_and_sub_polynomial<
|
||||
Torus, params::opt, params::degree / params::opt>(
|
||||
accumulator_mask, accumulator_mask_rotated, a_hat);
|
||||
|
||||
multiply_by_monomial_negacyclic_and_sub_polynomial<
|
||||
Torus, params::opt, params::degree / params::opt>(
|
||||
accumulator_body, accumulator_body_rotated, a_hat);
|
||||
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// Perform a rounding to increase the accuracy of the
|
||||
// bootstrapped ciphertext
|
||||
round_to_closest_multiple_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
accumulator_mask_rotated, base_log, l_gadget);
|
||||
|
||||
round_to_closest_multiple_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
accumulator_body_rotated, base_log, l_gadget);
|
||||
// Initialize the polynomial multiplication via FFT arrays
|
||||
// The polynomial multiplications happens at the block level
|
||||
// and each thread handles two or more coefficients
|
||||
int pos = threadIdx.x;
|
||||
for (int j = 0; j < params::opt / 2; j++) {
|
||||
mask_res_fft[pos].x = 0;
|
||||
mask_res_fft[pos].y = 0;
|
||||
body_res_fft[pos].x = 0;
|
||||
body_res_fft[pos].y = 0;
|
||||
pos += params::degree / params::opt;
|
||||
}
|
||||
|
||||
// Now that the rotation is done, decompose the resulting polynomial
|
||||
// coefficients so as to multiply each decomposed level with the
|
||||
// corresponding part of the bootstrapping key
|
||||
// TODO (Agnes) explain why we do that for the mask and body separately
|
||||
for (int decomp_level = 0; decomp_level < l_gadget; decomp_level++) {
|
||||
|
||||
gadget.decompose_one_level(accumulator_mask_decomposed,
|
||||
accumulator_mask_rotated, decomp_level);
|
||||
|
||||
gadget.decompose_one_level(accumulator_body_decomposed,
|
||||
accumulator_body_rotated, decomp_level);
|
||||
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// First, perform the polynomial multiplication for the mask
|
||||
|
||||
// Reduce the size of the FFT to be performed by storing
|
||||
// the real-valued polynomial into a complex polynomial
|
||||
real_to_complex_compressed<int16_t, params>(accumulator_mask_decomposed,
|
||||
accumulator_fft);
|
||||
|
||||
synchronize_threads_in_block();
|
||||
// Switch to the FFT space
|
||||
NSMFFT_direct<HalfDegree<params>>(accumulator_fft);
|
||||
synchronize_threads_in_block();
|
||||
|
||||
correction_direct_fft_inplace<params>(accumulator_fft);
|
||||
|
||||
// Get the bootstrapping key piece necessary for the multiplication
|
||||
// It is already in the Fourier domain
|
||||
// TODO (Agnes) Explain why for the mask polynomial multiplication
|
||||
// we need the bsk_body_slice and vice versa
|
||||
auto bsk_mask_slice = PolynomialFourier<double2, params>(
|
||||
get_ith_mask_kth_block(
|
||||
bootstrapping_key, iteration, 0, decomp_level,
|
||||
polynomial_size, 1, l_gadget));
|
||||
auto bsk_body_slice = PolynomialFourier<double2, params>(
|
||||
get_ith_body_kth_block(
|
||||
bootstrapping_key, iteration, 0, decomp_level,
|
||||
polynomial_size, 1, l_gadget));
|
||||
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// Perform the coefficient-wise product with the two pieces of
|
||||
// bootstrapping key TODO (Agnes) why two pieces?
|
||||
polynomial_product_accumulate_in_fourier_domain(
|
||||
mask_res_fft, accumulator_fft, bsk_mask_slice);
|
||||
polynomial_product_accumulate_in_fourier_domain(
|
||||
body_res_fft, accumulator_fft, bsk_body_slice);
|
||||
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// Now handle the polynomial multiplication for the body
|
||||
// in the same way
|
||||
real_to_complex_compressed<int16_t, params>(accumulator_body_decomposed,
|
||||
accumulator_fft);
|
||||
synchronize_threads_in_block();
|
||||
|
||||
NSMFFT_direct<HalfDegree<params>>(accumulator_fft);
|
||||
synchronize_threads_in_block();
|
||||
|
||||
correction_direct_fft_inplace<params>(accumulator_fft);
|
||||
|
||||
auto bsk_mask_slice_2 = PolynomialFourier<double2, params>(
|
||||
get_ith_mask_kth_block(bootstrapping_key, iteration, 1, decomp_level,
|
||||
polynomial_size, 1, l_gadget));
|
||||
auto bsk_body_slice_2 = PolynomialFourier<double2, params>(
|
||||
get_ith_body_kth_block(bootstrapping_key, iteration, 1, decomp_level,
|
||||
polynomial_size, 1, l_gadget));
|
||||
|
||||
synchronize_threads_in_block();
|
||||
|
||||
polynomial_product_accumulate_in_fourier_domain(
|
||||
mask_res_fft, accumulator_fft, bsk_mask_slice_2);
|
||||
polynomial_product_accumulate_in_fourier_domain(
|
||||
body_res_fft, accumulator_fft, bsk_body_slice_2);
|
||||
}
|
||||
|
||||
// Come back to the coefficient representation
|
||||
if constexpr (SMD == FULLSM || SMD == NOSM) {
|
||||
synchronize_threads_in_block();
|
||||
|
||||
correction_inverse_fft_inplace<params>(mask_res_fft);
|
||||
correction_inverse_fft_inplace<params>(body_res_fft);
|
||||
synchronize_threads_in_block();
|
||||
|
||||
NSMFFT_inverse<HalfDegree<params>>(mask_res_fft);
|
||||
NSMFFT_inverse<HalfDegree<params>>(body_res_fft);
|
||||
|
||||
synchronize_threads_in_block();
|
||||
|
||||
add_to_torus<Torus, params>(mask_res_fft, accumulator_mask);
|
||||
add_to_torus<Torus, params>(body_res_fft, accumulator_body);
|
||||
synchronize_threads_in_block();
|
||||
} else {
|
||||
int tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
accumulator_fft[tid] = mask_res_fft[tid];
|
||||
tid = tid + params::degree / params::opt;
|
||||
}
|
||||
synchronize_threads_in_block();
|
||||
|
||||
correction_inverse_fft_inplace<params>(accumulator_fft);
|
||||
synchronize_threads_in_block();
|
||||
|
||||
NSMFFT_inverse<HalfDegree<params>>(accumulator_fft);
|
||||
synchronize_threads_in_block();
|
||||
|
||||
add_to_torus<Torus, params>(accumulator_fft, accumulator_mask);
|
||||
synchronize_threads_in_block();
|
||||
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
accumulator_fft[tid] = body_res_fft[tid];
|
||||
tid = tid + params::degree / params::opt;
|
||||
}
|
||||
synchronize_threads_in_block();
|
||||
|
||||
correction_inverse_fft_inplace<params>(accumulator_fft);
|
||||
synchronize_threads_in_block();
|
||||
|
||||
NSMFFT_inverse<HalfDegree<params>>(accumulator_fft);
|
||||
synchronize_threads_in_block();
|
||||
|
||||
add_to_torus<Torus, params>(accumulator_fft, accumulator_body);
|
||||
synchronize_threads_in_block();
|
||||
}
|
||||
}
|
||||
|
||||
auto block_lwe_out = &lwe_out[blockIdx.x * (polynomial_size + 1)];
|
||||
|
||||
// The blind rotation for this block is over
|
||||
// Now we can perform the sample extraction: for the body it's just
|
||||
// the resulting constant coefficient of the accumulator
|
||||
// For the mask it's more complicated TODO (Agnes) explain why
|
||||
sample_extract_mask<Torus, params>(block_lwe_out, accumulator_mask);
|
||||
sample_extract_body<Torus, params>(block_lwe_out, accumulator_body);
|
||||
}
|
||||
|
||||
template <typename Torus, class params>
|
||||
__host__ void host_bootstrap_amortized(
|
||||
void *v_stream,
|
||||
Torus *lwe_out,
|
||||
Torus *lut_vector,
|
||||
uint32_t *lut_vector_indexes,
|
||||
Torus *lwe_in,
|
||||
double2 *bootstrapping_key,
|
||||
uint32_t input_lwe_dimension,
|
||||
uint32_t polynomial_size,
|
||||
uint32_t base_log,
|
||||
uint32_t l_gadget,
|
||||
uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t num_lut_vectors,
|
||||
uint32_t lwe_idx,
|
||||
uint32_t max_shared_memory) {
|
||||
|
||||
int SM_FULL = sizeof(Torus) * polynomial_size + // accumulator mask
|
||||
sizeof(Torus) * polynomial_size + // accumulator body
|
||||
sizeof(Torus) * polynomial_size + // accumulator mask rotated
|
||||
sizeof(Torus) * polynomial_size + // accumulator body rotated
|
||||
sizeof(int16_t) * polynomial_size + // accumulator_dec mask
|
||||
sizeof(int16_t) * polynomial_size + // accumulator_dec_body
|
||||
sizeof(double2) * polynomial_size / 2 + // accumulator fft mask
|
||||
sizeof(double2) * polynomial_size / 2 + // accumulator fft body
|
||||
sizeof(double2) * polynomial_size / 2; // calculate buffer fft
|
||||
|
||||
int SM_PART = sizeof(double2) * polynomial_size / 2; // calculate buffer fft
|
||||
|
||||
int DM_PART = SM_FULL - SM_PART;
|
||||
|
||||
int DM_FULL = SM_FULL;
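// Worked example of these sizes (illustrative assumption: Torus = uint64_t and
// polynomial_size = 1024):
//   4 * 8 * 1024  = 32768 bytes for the four Torus accumulators,
//   2 * 2 * 1024  =  4096 bytes for the two int16_t decomposition buffers,
//   3 * 16 * 512  = 24576 bytes for the three double2 FFT buffers,
// so SM_FULL is 61440 bytes (~60 KB), above the 48 KB static limit, while
// SM_PART alone is only 8192 bytes. This is why the FULLSM branch below has to
// raise cudaFuncAttributeMaxDynamicSharedMemorySize before launching.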
|
||||
|
||||
auto stream = static_cast<cudaStream_t *>(v_stream);
|
||||
|
||||
char *d_mem;
|
||||
|
||||
// Create a 1-dimensional grid of threads
|
||||
// where each block handles 1 sample and each thread
|
||||
// handles opt polynomial coefficients
|
||||
// (actually opt/2 coefficients since we compress the real polynomial into a
|
||||
// complex)
|
||||
// TODO (Agnes) Polynomial size / params::opt should be equal to 256 or 512
|
||||
// probably, maybe 1024 would be too big?
|
||||
// Or would it actually be good in our case to have the largest possible
|
||||
// number of threads per block since anyway few blocks will run
|
||||
// concurrently?
|
||||
dim3 grid(input_lwe_ciphertext_count, 1, 1);
|
||||
dim3 thds(polynomial_size / params::opt, 1, 1);
|
||||
|
||||
// Launch the kernel using polynomial_size/opt threads
|
||||
// where each thread computes opt polynomial coefficients
|
||||
// Depending on the required amount of shared memory, choose
|
||||
// from one of three templates (no use, partial use or full use
|
||||
// of shared memory)
|
||||
if (max_shared_memory < SM_PART) {
|
||||
checkCudaErrors(cudaMalloc((void **)&d_mem, DM_FULL * input_lwe_ciphertext_count));
|
||||
device_bootstrap_amortized<Torus, params, NOSM>
|
||||
<<<grid, thds, 0, *stream>>>(
|
||||
lwe_out, lut_vector, lut_vector_indexes, lwe_in,
|
||||
bootstrapping_key, d_mem,
|
||||
input_lwe_dimension, polynomial_size,
|
||||
base_log, l_gadget, lwe_idx, DM_FULL);
|
||||
} else if (max_shared_memory < SM_FULL) {
|
||||
cudaFuncSetAttribute(device_bootstrap_amortized<Torus, params, PARTIALSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize,
|
||||
SM_PART);
|
||||
cudaFuncSetCacheConfig(
|
||||
device_bootstrap_amortized<Torus, params, PARTIALSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
checkCudaErrors(cudaMalloc((void **)&d_mem, DM_PART * input_lwe_ciphertext_count));
|
||||
device_bootstrap_amortized<Torus, params, PARTIALSM>
|
||||
<<<grid, thds, SM_PART, *stream>>>(
|
||||
lwe_out, lut_vector, lut_vector_indexes,
|
||||
lwe_in, bootstrapping_key,
|
||||
d_mem, input_lwe_dimension, polynomial_size,
|
||||
base_log, l_gadget, lwe_idx,
|
||||
DM_PART);
|
||||
} else {
|
||||
// For devices with compute capability 7.x a single thread block can
|
||||
// address the full capacity of shared memory. Shared memory on the
|
||||
// device then has to be allocated dynamically.
|
||||
// For lower compute capabilities, this call
|
||||
// just does nothing and the amount of shared memory used is 48 KB
|
||||
checkCudaErrors(cudaFuncSetAttribute(
|
||||
device_bootstrap_amortized<Torus, params, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize,
|
||||
SM_FULL));
|
||||
// TODO (Agnes): is this necessary?
|
||||
checkCudaErrors(cudaFuncSetCacheConfig(
|
||||
device_bootstrap_amortized<Torus, params, FULLSM>,
|
||||
cudaFuncCachePreferShared));
|
||||
checkCudaErrors(cudaMalloc((void **)&d_mem, 0));
|
||||
|
||||
device_bootstrap_amortized<Torus, params, FULLSM>
|
||||
<<<grid, thds, SM_FULL, *stream>>>(
|
||||
lwe_out, lut_vector, lut_vector_indexes,
|
||||
lwe_in, bootstrapping_key,
|
||||
d_mem, input_lwe_dimension, polynomial_size,
|
||||
base_log, l_gadget, lwe_idx,
|
||||
0);
|
||||
}
|
||||
// Synchronize the streams before copying the result to lwe_out at the right
|
||||
// place
|
||||
cudaStreamSynchronize(*stream);
|
||||
cudaFree(d_mem);
|
||||
|
||||
}
|
||||
|
||||
template <typename Torus, class params>
|
||||
int cuda_get_pbs_per_gpu(int polynomial_size) {
|
||||
|
||||
int blocks_per_sm = 0;
|
||||
int num_threads = polynomial_size / params::opt;
|
||||
int num_devices = 0;
cudaGetDeviceCount(&num_devices); // cudaGetDeviceCount expects a pointer; the count is currently unused
|
||||
cudaDeviceProp device_properties;
|
||||
// FIXME: here we assume every device has same properties
|
||||
cudaGetDeviceProperties(&device_properties, 0);
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&blocks_per_sm, device_bootstrap_amortized<Torus, params>,
|
||||
num_threads, 0);
|
||||
|
||||
return device_properties.multiProcessorCount * blocks_per_sm;
|
||||
}
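The occupancy estimate above can be used by the caller to decide how many samples to submit per launch. A hedged sketch of such a chunking loop (the policy is an assumption, not something this header prescribes):

// Hypothetical helper: walk a batch in chunks sized to the occupancy estimate.
template <typename Torus, class params>
void for_each_pbs_chunk(uint32_t num_samples, uint32_t polynomial_size) {
  int capacity = cuda_get_pbs_per_gpu<Torus, params>(polynomial_size);
  if (capacity <= 0)
    capacity = 1; // fall back to one sample at a time
  for (uint32_t start = 0; start < num_samples; start += (uint32_t)capacity) {
    uint32_t remaining = num_samples - start;
    uint32_t chunk =
        remaining < (uint32_t)capacity ? remaining : (uint32_t)capacity;
    (void)chunk; // ... launch the amortized bootstrap on this sub-batch ...
  }
}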
|
||||
|
||||
#endif // CNCRT_AMORTIZED_PBS_H
|
||||
183
src/bootstrap_low_latency.cu
Normal file
@@ -0,0 +1,183 @@
|
||||
#include "bootstrap_low_latency.cuh"
|
||||
|
||||
/* Perform bootstrapping on a batch of input LWE ciphertexts
|
||||
*
|
||||
* - lwe_out: output batch of num_samples bootstrapped ciphertexts c =
|
||||
* (a0,..an-1,b) where n is the LWE dimension
|
||||
* - lut_vector: should hold as many test vectors of size polynomial_size
|
||||
* as there are input ciphertexts, but actually holds
|
||||
* num_lut_vectors vectors to reduce memory usage
|
||||
* - lut_vector_indexes: stores the index corresponding to
|
||||
* which test vector to use for each sample in
|
||||
* lut_vector
|
||||
* - lwe_in: input batch of num_samples LWE ciphertexts, containing n
|
||||
* mask values + 1 body value
|
||||
* - bootstrapping_key: RGSW encryption of the LWE secret key sk1
|
||||
* under secret key sk2
|
||||
* bsk = Z + sk1 H
|
||||
* where H is the gadget matrix and Z is a matrix (k+1).l
|
||||
* containing GLWE encryptions of 0 under sk2.
|
||||
* bsk is thus a tensor of size (k+1)^2.l.N.n
|
||||
* where l is the number of decomposition levels and
|
||||
* k is the GLWE dimension, N is the polynomial size for
|
||||
* GLWE. The polynomial size for GLWE and the test vector
|
||||
* are the same because they have to be in the same ring
|
||||
* to be multiplied.
|
||||
* Note: it is necessary to generate (k+1).k.l.N.n
|
||||
* uniformly random coefficients for the zero encryptions
|
||||
* - lwe_dimension: size of the Torus vector used to encrypt the input
|
||||
* LWE ciphertexts - referred to as n above (~ 600)
|
||||
* - polynomial_size: size of the test polynomial (test vector) and size of the
|
||||
* GLWE polynomial (~1024)
|
||||
* - base_log: log base used for the gadget matrix - B = 2^base_log (~8)
|
||||
* - l_gadget: number of decomposition levels in the gadget matrix (~4)
|
||||
* - num_samples: number of encrypted input messages
|
||||
* - num_lut_vectors: parameter to set the actual number of test vectors to be
|
||||
* used
|
||||
 * - q: number of bits in the integer representation (32 or 64)
|
||||
*
|
||||
* This function calls a wrapper to a device kernel that performs the
|
||||
* bootstrapping:
|
||||
* - the kernel is templatized based on integer discretization and
|
||||
* polynomial degree
|
||||
* - num_samples blocks of threads are launched, where each thread is going
|
||||
* to handle one or more polynomial coefficients at each stage:
|
||||
* - perform the blind rotation
|
||||
* - round the result
|
||||
* - decompose into l_gadget levels, then for each level:
|
||||
* - switch to the FFT domain
|
||||
* - multiply with the bootstrapping key
|
||||
* - come back to the coefficients representation
|
||||
* - between each stage a synchronization of the threads is necessary TODO
|
||||
* (Agnes) check this
|
||||
* - in case the device has enough shared memory, temporary arrays used for
|
||||
* the different stages (accumulators) are stored into the shared memory
|
||||
* - the accumulators serve to combine the results for all decomposition
|
||||
* levels TODO (Agnes) check this
|
||||
* - the constant memory (64K) is used for storing the roots of identity
|
||||
* values for the FFT
|
||||
*/
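/*
 * Unlike the amortized version, the low latency path spreads one external
 * product over 2 * l_gadget cooperating blocks per input ciphertext (one block
 * per decomposition level, for the mask and for the body), so the whole grid
 * must be resident on the device at once for grid.sync() to be valid. With
 * illustrative values l_gadget = 4 and num_samples = 10, the host wrapper
 * further down launches
 *     dim3 grid(4, 2, 10);                        // 80 cooperating blocks
 *     int  thds = polynomial_size / params::opt;  // threads per block
 * through cudaLaunchCooperativeKernel.
 */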
|
||||
void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
|
||||
void *v_stream,
|
||||
void *lwe_out,
|
||||
void *lut_vector,
|
||||
void *lut_vector_indexes,
|
||||
void *lwe_in,
|
||||
void *bootstrapping_key,
|
||||
uint32_t lwe_dimension,
|
||||
uint32_t polynomial_size,
|
||||
uint32_t base_log,
|
||||
uint32_t l_gadget,
|
||||
uint32_t num_samples,
|
||||
uint32_t num_lut_vectors,
|
||||
uint32_t lwe_idx,
|
||||
uint32_t max_shared_memory) {
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 512:
|
||||
host_bootstrap_low_latency<uint32_t, Degree<512>>(
|
||||
v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector,
|
||||
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in,
|
||||
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size,
|
||||
base_log, l_gadget, num_samples,
|
||||
num_lut_vectors);
|
||||
break;
|
||||
case 1024:
|
||||
host_bootstrap_low_latency<uint32_t, Degree<1024>>(
|
||||
v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector,
|
||||
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in,
|
||||
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size,
|
||||
base_log, l_gadget, num_samples,
|
||||
num_lut_vectors);
|
||||
break;
|
||||
case 2048:
|
||||
host_bootstrap_low_latency<uint32_t, Degree<2048>>(
|
||||
v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector,
|
||||
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in,
|
||||
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size,
|
||||
base_log, l_gadget, num_samples,
|
||||
num_lut_vectors);
|
||||
break;
|
||||
case 4096:
|
||||
host_bootstrap_low_latency<uint32_t, Degree<4096>>(
|
||||
v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector,
|
||||
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in,
|
||||
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size,
|
||||
base_log, l_gadget, num_samples,
|
||||
num_lut_vectors);
|
||||
break;
|
||||
case 8192:
|
||||
host_bootstrap_low_latency<uint32_t, Degree<8192>>(
|
||||
v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector,
|
||||
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in,
|
||||
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size,
|
||||
base_log, l_gadget, num_samples,
|
||||
num_lut_vectors);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
|
||||
void *v_stream,
|
||||
void *lwe_out,
|
||||
void *lut_vector,
|
||||
void *lut_vector_indexes,
|
||||
void *lwe_in,
|
||||
void *bootstrapping_key,
|
||||
uint32_t lwe_dimension,
|
||||
uint32_t polynomial_size,
|
||||
uint32_t base_log,
|
||||
uint32_t l_gadget,
|
||||
uint32_t num_samples,
|
||||
uint32_t num_lut_vectors,
|
||||
uint32_t lwe_idx,
|
||||
uint32_t max_shared_memory) {
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 512:
|
||||
host_bootstrap_low_latency<uint64_t, Degree<512>>(
|
||||
v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector,
|
||||
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in,
|
||||
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size,
|
||||
base_log, l_gadget, num_samples,
|
||||
num_lut_vectors);
|
||||
break;
|
||||
case 1024:
|
||||
host_bootstrap_low_latency<uint64_t, Degree<1024>>(
|
||||
v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector,
|
||||
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in,
|
||||
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size,
|
||||
base_log, l_gadget, num_samples,
|
||||
num_lut_vectors);
|
||||
break;
|
||||
case 2048:
|
||||
host_bootstrap_low_latency<uint64_t, Degree<2048>>(
|
||||
v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector,
|
||||
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in,
|
||||
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size,
|
||||
base_log, l_gadget, num_samples,
|
||||
num_lut_vectors);
|
||||
break;
|
||||
case 4096:
|
||||
host_bootstrap_low_latency<uint64_t, Degree<4096>>(
|
||||
v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector,
|
||||
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in,
|
||||
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size,
|
||||
base_log, l_gadget, num_samples,
|
||||
num_lut_vectors);
|
||||
break;
|
||||
case 8192:
|
||||
host_bootstrap_low_latency<uint64_t, Degree<8192>>(
|
||||
v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector,
|
||||
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in,
|
||||
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size,
|
||||
base_log, l_gadget, num_samples,
|
||||
num_lut_vectors);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
348
src/bootstrap_low_latency.cuh
Normal file
@@ -0,0 +1,348 @@
|
||||
#ifdef __CDT_PARSER__
|
||||
#undef __CUDA_RUNTIME_H__
|
||||
#include <cuda_runtime.h>
|
||||
#include <helper_cuda.h>
|
||||
#endif
|
||||
|
||||
#ifndef LOWLAT_PBS_H
|
||||
#define LOWLAT_PBS_H
|
||||
|
||||
#include "cooperative_groups.h"
|
||||
|
||||
#include "../include/helper_cuda.h"
|
||||
#include "bootstrap.h"
|
||||
#include "complex/operations.cuh"
|
||||
#include "crypto/gadget.cuh"
|
||||
#include "crypto/torus.cuh"
|
||||
#include "fft/bnsmfft.cuh"
|
||||
#include "fft/smfft.cuh"
|
||||
#include "fft/twiddles.cuh"
|
||||
#include "polynomial/parameters.cuh"
|
||||
#include "polynomial/polynomial.cuh"
|
||||
#include "polynomial/polynomial_math.cuh"
|
||||
#include "utils/memory.cuh"
|
||||
#include "utils/timer.cuh"
|
||||
|
||||
// Cooperative groups are used in the low latency
|
||||
// version of the bootstrapping
|
||||
using namespace cooperative_groups;
|
||||
namespace cg = cooperative_groups;
|
||||
|
||||
template <typename Torus, class params>
|
||||
__device__ void
|
||||
mul_trgsw_trlwe(Torus *accumulator,
|
||||
double2 *fft,
|
||||
int16_t *trlwe_decomposed,
|
||||
double2 *mask_join_buffer,
|
||||
double2 *body_join_buffer,
|
||||
double2 *bootstrapping_key,
|
||||
int polynomial_size, int l_gadget, int iteration, grid_group &grid) {
|
||||
|
||||
// Put the decomposed TRLWE sample in the Fourier domain
|
||||
real_to_complex_compressed<int16_t, params>(trlwe_decomposed,
|
||||
fft);
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// Switch to the FFT space
|
||||
NSMFFT_direct<HalfDegree<params>>(fft);
|
||||
synchronize_threads_in_block();
|
||||
|
||||
correction_direct_fft_inplace<params>(fft);
|
||||
synchronize_threads_in_block();
|
||||
|
||||
|
||||
|
||||
// Get the pieces of the bootstrapping key that will be needed for the
|
||||
// external product; blockIdx.x is the ID of the block that's executing
|
||||
// this function, so we end up getting the lines of the bootstrapping key
|
||||
// needed to perform the external product in this block (corresponding to
|
||||
// the same decomposition level)
|
||||
|
||||
// auto bsk_mask_slice = bootstrapping_key.get_ith_mask_kth_block(
|
||||
// gpu_num, iteration, blockIdx.y, blockIdx.x);
|
||||
// auto bsk_body_slice = bootstrapping_key.get_ith_body_kth_block(
|
||||
// gpu_num, iteration, blockIdx.y, blockIdx.x);
|
||||
|
||||
auto bsk_mask_slice = PolynomialFourier<double2, params>(
|
||||
get_ith_mask_kth_block(
|
||||
bootstrapping_key, iteration, blockIdx.y, blockIdx.x,
|
||||
polynomial_size, 1, l_gadget));
|
||||
auto bsk_body_slice = PolynomialFourier<double2, params>(
|
||||
get_ith_body_kth_block(
|
||||
bootstrapping_key, iteration, blockIdx.y, blockIdx.x,
|
||||
polynomial_size, 1, l_gadget));
|
||||
|
||||
// Perform the matrix multiplication between the RGSW and the TRLWE,
|
||||
// each block operating on a single level for mask and body
|
||||
|
||||
auto first_processed_bsk = (blockIdx.y == 0) ? bsk_mask_slice : bsk_body_slice;
|
||||
auto second_processed_bsk = (blockIdx.y == 0) ? bsk_body_slice : bsk_mask_slice;
|
||||
|
||||
auto first_processed_acc = (blockIdx.y == 0) ?
|
||||
&mask_join_buffer[params::degree / 2 * blockIdx.x] :
|
||||
&body_join_buffer[params::degree / 2 * blockIdx.x];
|
||||
auto second_processed_acc = (blockIdx.y == 0) ?
|
||||
&body_join_buffer[params::degree / 2 * blockIdx.x] :
|
||||
&mask_join_buffer[params::degree / 2 * blockIdx.x];
|
||||
|
||||
int tid = threadIdx.x;
|
||||
|
||||
//first product
|
||||
for(int i = 0; i < params::opt / 2; i++) {
|
||||
first_processed_acc[tid] = fft[tid] * first_processed_bsk.m_values[tid];
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
|
||||
grid.sync();
|
||||
tid = threadIdx.x;
|
||||
//second product
|
||||
for(int i = 0; i < params::opt / 2; i++) {
|
||||
second_processed_acc[tid] += fft[tid] * second_processed_bsk.m_values[tid];
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
|
||||
|
||||
// -----------------------------------------------------------------
|
||||
|
||||
// All blocks are synchronized here; after this sync, *_join_buffer has the
|
||||
// values needed from every other block
|
||||
grid.sync();
|
||||
|
||||
auto src_acc = (blockIdx.y == 0) ? mask_join_buffer : body_join_buffer;
|
||||
|
||||
// copy first product into fft buffer
|
||||
tid = threadIdx.x;
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
fft[tid] = src_acc[tid];
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// accumulate rest of the products into fft buffer
|
||||
for (int l = 1; l < gridDim.x; l++) {
|
||||
auto cur_src_acc = &src_acc[l * params::degree / 2];
|
||||
tid = threadIdx.x;
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
fft[tid] += cur_src_acc[tid];
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
}
|
||||
|
||||
synchronize_threads_in_block();
|
||||
|
||||
correction_inverse_fft_inplace<params>(fft);
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// Perform the inverse FFT on the result of the RGSWxTRLWE and add to the
|
||||
// accumulator
|
||||
NSMFFT_inverse<HalfDegree<params>>(fft);
|
||||
synchronize_threads_in_block();
|
||||
|
||||
add_to_torus<Torus, params>(fft, accumulator);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
template <typename Torus, class params>
|
||||
/*
|
||||
* Kernel launched by the low latency version of the
|
||||
* bootstrapping, that uses cooperative groups
|
||||
 * lwe_out - vector of output LWE ciphertexts, with length (polynomial_size+1)*num_samples
|
||||
* lut_vector - vector of look up tables with length polynomial_size * num_samples
|
||||
* lut_vector_indexes - mapping between lwe_in and lut_vector
|
||||
* lwe_in - vector of lwe inputs with length (lwe_mask_size + 1) * num_samples
|
||||
*
|
||||
*/
|
||||
__global__ void device_bootstrap_low_latency(
|
||||
Torus *lwe_out,
|
||||
Torus *lut_vector,
|
||||
Torus *lwe_in,
|
||||
double2 *bootstrapping_key,
|
||||
double2 *mask_join_buffer,
|
||||
double2 *body_join_buffer,
|
||||
uint32_t lwe_mask_size,
|
||||
uint32_t polynomial_size, uint32_t base_log, uint32_t l_gadget
|
||||
) {
|
||||
|
||||
grid_group grid = this_grid();
|
||||
|
||||
// We use shared memory for the polynomials that are used often during the
|
||||
// bootstrap, since shared memory is kept in L1 cache and accessing it is
|
||||
// much faster than global memory
|
||||
extern __shared__ char sharedmem[];
|
||||
|
||||
char* selected_memory = sharedmem;
|
||||
|
||||
int16_t *accumulator_decomposed = (int16_t *)selected_memory;
|
||||
Torus *accumulator = (Torus*)accumulator_decomposed +
|
||||
polynomial_size / (sizeof(Torus) / sizeof(int16_t));
|
||||
double2 *accumulator_fft = (double2*)accumulator +
|
||||
polynomial_size / (sizeof(double2) / sizeof(Torus));
|
||||
|
||||
// Reuse memory from accumulator_fft for accumulator_rotated
|
||||
Torus* accumulator_rotated = (Torus*)accumulator_fft;
|
||||
|
||||
// The third dimension of the block is used to determine on which ciphertext
|
||||
// this block is operating, in the case of batch bootstraps
|
||||
auto block_lwe_in = &lwe_in[blockIdx.z * (lwe_mask_size + 1)];
|
||||
|
||||
auto block_lut_vector =
|
||||
&lut_vector[blockIdx.z * params::degree * 2];
|
||||
|
||||
auto block_mask_join_buffer = &mask_join_buffer[blockIdx.z * l_gadget * params::degree / 2];
|
||||
auto block_body_join_buffer = &body_join_buffer[blockIdx.z * l_gadget * params::degree / 2];
|
||||
|
||||
// Since the space in L1 cache is small, we use the same memory location for
|
||||
// the rotated accumulator and the fft accumulator, since we know that the
|
||||
// rotated array is not in use anymore by the time we perform the fft
|
||||
|
||||
GadgetMatrix<Torus, params> gadget(base_log, l_gadget);
|
||||
|
||||
// Put "b" in [0, 2N[
|
||||
Torus b_hat = rescale_torus_element(
|
||||
block_lwe_in[lwe_mask_size],
|
||||
2 * params::degree);
|
||||
|
||||
if (blockIdx.y == 0) {
|
||||
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
accumulator, block_lut_vector, b_hat, false);
|
||||
}
|
||||
else {
|
||||
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
accumulator, &block_lut_vector[params::degree], b_hat, false);
|
||||
}
|
||||
|
||||
for (int i = 0; i < lwe_mask_size; i++) {
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// Put "a" in [0, 2N[
|
||||
Torus a_hat = rescale_torus_element(
|
||||
block_lwe_in[i],
|
||||
2 * params::degree); // 2 * params::log2_degree + 1);
|
||||
|
||||
if (a_hat == 0) {
|
||||
// todo(Joao): **cannot use this optimization**
|
||||
// the reason is that one of the input ciphertexts (blockIdx.z)
|
||||
// might skip an iteration while others don't, which as a result
|
||||
// will make that block not call the grid.sync(), causing a deadlock;
|
||||
// maybe it's a workaround to add grid.sync() here, but not sure if
|
||||
// there are any edge cases?
|
||||
|
||||
// continue
|
||||
}
|
||||
|
||||
// Perform ACC * (X^a_hat - 1)
|
||||
multiply_by_monomial_negacyclic_and_sub_polynomial<
|
||||
Torus, params::opt, params::degree / params::opt>(
|
||||
accumulator, accumulator_rotated, a_hat);
|
||||
|
||||
|
||||
// Perform a rounding to increase the accuracy of the
|
||||
// bootstrapped ciphertext
|
||||
round_to_closest_multiple_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
accumulator_rotated, base_log, l_gadget);
|
||||
|
||||
|
||||
|
||||
// Decompose the accumulator. Each block gets one level of the
|
||||
// decomposition, for the mask and the body (so block 0 will have the
|
||||
// accumulator decomposed at level 0, 1 at 1, etc.)
|
||||
gadget.decompose_one_level(accumulator_decomposed, accumulator_rotated,
|
||||
blockIdx.x);
|
||||
|
||||
// We are using the same memory space for accumulator_fft and
|
||||
// accumulator_rotated, so we need to synchronize here to make sure they
|
||||
// don't modify the same memory space at the same time
|
||||
synchronize_threads_in_block();
|
||||
// Perform G^-1(ACC) * RGSW -> TRLWE
|
||||
mul_trgsw_trlwe<Torus, params>(
|
||||
accumulator,
|
||||
accumulator_fft,
|
||||
accumulator_decomposed,
|
||||
block_mask_join_buffer,
|
||||
block_body_join_buffer,
|
||||
bootstrapping_key,
|
||||
polynomial_size, l_gadget, i, grid);
|
||||
}
|
||||
|
||||
auto block_lwe_out = &lwe_out[blockIdx.z * (polynomial_size + 1)];
|
||||
|
||||
if (blockIdx.x == 0 && blockIdx.y == 0) {
|
||||
// Perform a sample extract. At this point, all blocks have the result, but
|
||||
// we do the computation at block 0 to avoid waiting for extra blocks, in
|
||||
// case they're not synchronized
|
||||
sample_extract_mask<Torus, params>(block_lwe_out, accumulator);
|
||||
} else if (blockIdx.x == 0 && blockIdx.y == 1) {
|
||||
sample_extract_body<Torus, params>(block_lwe_out, accumulator);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Host wrapper to the low latency version
|
||||
* of bootstrapping
|
||||
*/
|
||||
template <typename Torus, class params>
|
||||
__host__ void host_bootstrap_low_latency(
|
||||
void *v_stream,
|
||||
Torus *lwe_out,
|
||||
Torus *lut_vector,
|
||||
uint32_t *lut_vector_indexes,
|
||||
Torus *lwe_in,
|
||||
double2 *bootstrapping_key,
|
||||
uint32_t lwe_mask_size,
|
||||
uint32_t polynomial_size,
|
||||
uint32_t base_log,
|
||||
uint32_t l_gadget,
|
||||
uint32_t num_samples,
|
||||
uint32_t num_lut_vectors) {
|
||||
auto stream = static_cast<cudaStream_t *>(v_stream);
|
||||
|
||||
int buffer_size_per_gpu = l_gadget * num_samples * polynomial_size / 2 * sizeof(double2);
|
||||
double2 *mask_buffer_fft;
|
||||
double2 *body_buffer_fft;
|
||||
checkCudaErrors(cudaMalloc((void **)&mask_buffer_fft, buffer_size_per_gpu));
|
||||
checkCudaErrors(cudaMalloc((void **)&body_buffer_fft, buffer_size_per_gpu));
|
||||
|
||||
|
||||
int bytes_needed =
|
||||
sizeof(int16_t) * polynomial_size + // accumulator_decomp
|
||||
sizeof(Torus) * polynomial_size + // accumulator
|
||||
sizeof(double2) * polynomial_size / 2; // accumulator fft
|
||||
|
||||
int thds = polynomial_size / params::opt;
|
||||
dim3 grid(l_gadget, 2, num_samples);
|
||||
|
||||
void *kernel_args[10];
|
||||
kernel_args[0] = &lwe_out;
|
||||
kernel_args[1] = &lut_vector;
|
||||
kernel_args[2] = &lwe_in;
|
||||
kernel_args[3] = &bootstrapping_key;
|
||||
kernel_args[4] = &mask_buffer_fft;
|
||||
kernel_args[5] = &body_buffer_fft;
|
||||
kernel_args[6] = &lwe_mask_size;
|
||||
kernel_args[7] = &polynomial_size;
|
||||
kernel_args[8] = &base_log;
|
||||
kernel_args[9] = &l_gadget;
|
||||
|
||||
checkCudaErrors(cudaFuncSetAttribute(device_bootstrap_low_latency<Torus,
|
||||
params>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize,
|
||||
bytes_needed));
|
||||
cudaFuncSetCacheConfig(device_bootstrap_low_latency<Torus, params>,
|
||||
cudaFuncCachePreferShared);
|
||||
|
||||
checkCudaErrors(cudaLaunchCooperativeKernel(
    (void *)device_bootstrap_low_latency<Torus, params>, grid, thds,
    (void **)kernel_args, bytes_needed, *stream));
|
||||
|
||||
// Synchronize the streams before copying the result to lwe_out at the right
|
||||
// place
|
||||
cudaStreamSynchronize(*stream);
|
||||
cudaFree(mask_buffer_fft);
|
||||
cudaFree(body_buffer_fft);
|
||||
}
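Since this wrapper depends on cudaLaunchCooperativeKernel and grid-wide synchronization, it can only run on devices that report cooperative-launch support. A small, hedged sketch of the check a caller might perform (not part of this commit):

// Returns true if device `gpu_index` can run grid-synchronizing kernels.
inline bool device_supports_cooperative_launch(int gpu_index) {
  int supported = 0;
  cudaDeviceGetAttribute(&supported, cudaDevAttrCooperativeLaunch, gpu_index);
  return supported != 0;
}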
|
||||
|
||||
#endif // LOWLAT_PBS_H
|
||||
87
src/complex/operations.cuh
Normal file
@@ -0,0 +1,87 @@
|
||||
#ifndef GPU_BOOTSTRAP_COMMON_CUH
|
||||
#define GPU_BOOTSTRAP_COMMON_CUH
|
||||
|
||||
#include <cassert>
|
||||
#include <cstdint>
|
||||
#include <cstdio>
|
||||
|
||||
#define SNT 1
|
||||
#define dPI 6.283185307179586231995926937088
|
||||
|
||||
using sTorus = int32_t;
// using Torus = uint32_t;
|
||||
using u32 = uint32_t;
|
||||
using i32 = int32_t;
|
||||
|
||||
//--------------------------------------------------
|
||||
// Basic double2 operations
|
||||
|
||||
__device__ inline double2 conjugate(const double2 num) {
|
||||
double2 res;
|
||||
res.x = num.x;
|
||||
res.y = -num.y;
|
||||
return res;
|
||||
}
|
||||
|
||||
__device__ inline void operator+=(double2 &lh, const double2 rh) {
|
||||
lh.x += rh.x;
|
||||
lh.y += rh.y;
|
||||
}
|
||||
|
||||
__device__ inline void operator-=(double2 &lh, const double2 rh) {
|
||||
lh.x -= rh.x;
|
||||
lh.y -= rh.y;
|
||||
}
|
||||
|
||||
__device__ inline double2 operator+(const double2 a, const double2 b) {
|
||||
double2 res;
|
||||
res.x = a.x + b.x;
|
||||
res.y = a.y + b.y;
|
||||
return res;
|
||||
}
|
||||
|
||||
__device__ inline double2 operator-(const double2 a, const double2 b) {
|
||||
double2 res;
|
||||
res.x = a.x - b.x;
|
||||
res.y = a.y - b.y;
|
||||
return res;
|
||||
}
|
||||
|
||||
__device__ inline double2 operator*(const double2 a, const double2 b) {
|
||||
double xx = a.x * b.x;
|
||||
double xy = a.x * b.y;
|
||||
double yx = a.y * b.x;
|
||||
double yy = a.y * b.y;
|
||||
|
||||
double2 res;
|
||||
// asm volatile("fma.rn.f64 %0, %1, %2, %3;": "=d"(res.x) : "d"(a.x),
|
||||
// "d"(b.x), "d"(yy));
|
||||
res.x = xx - yy;
|
||||
res.y = xy + yx;
|
||||
return res;
|
||||
}
|
||||
|
||||
__device__ inline double2 operator*(const double2 a, double b) {
|
||||
double2 res;
|
||||
res.x = a.x * b;
|
||||
res.y = a.y * b;
|
||||
return res;
|
||||
}
|
||||
|
||||
__device__ inline void operator*=(double2 &a, const double2 b) {
|
||||
double tmp = a.x;
|
||||
a.x *= b.x;
|
||||
a.x -= a.y * b.y;
|
||||
a.y *= b.x;
|
||||
a.y += b.y * tmp;
|
||||
}
|
||||
|
||||
__device__ inline double2 operator*(double a, double2 b) {
|
||||
double2 res;
|
||||
res.x = b.x * a;
|
||||
res.y = b.y * a;
|
||||
return res;
|
||||
}
|
||||
|
||||
#endif
|
||||
157
src/crypto/bootstrapping_key.cuh
Normal file
@@ -0,0 +1,157 @@
#ifndef CNCRT_BSK_H
#define CNCRT_BSK_H

#include "bootstrap.h"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial.cuh"
#include <atomic>
#include <cstdint>

__device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
                                         int glwe_dimension,
                                         uint32_t l_gadget) {
  return i * polynomial_size / 2 * (glwe_dimension + 1) * (glwe_dimension + 1) *
         l_gadget;
}

__device__ double2 *
get_ith_mask_kth_block(double2 *ptr, int i, int k, int level,
                       uint32_t polynomial_size, int glwe_dimension,
                       uint32_t l_gadget) {
  return &ptr[get_start_ith_ggsw(i, polynomial_size, glwe_dimension, l_gadget) +
              level * polynomial_size / 2 * (glwe_dimension + 1) *
                  (glwe_dimension + 1) +
              k * polynomial_size / 2 * (glwe_dimension + 1)];
}

__device__ double2 *
get_ith_body_kth_block(double2 *ptr, int i, int k, int level,
                       uint32_t polynomial_size, int glwe_dimension,
                       uint32_t l_gadget) {
  return &ptr[get_start_ith_ggsw(i, polynomial_size, glwe_dimension, l_gadget) +
              level * polynomial_size / 2 * (glwe_dimension + 1) *
                  (glwe_dimension + 1) +
              k * polynomial_size / 2 * (glwe_dimension + 1) +
              polynomial_size / 2];
}

void cuda_initialize_twiddles(uint32_t polynomial_size, uint32_t gpu_index) {
  cudaSetDevice(gpu_index);
  int sw_size = polynomial_size / 2;
  short *sw1_h, *sw2_h;

  sw1_h = (short *)malloc(sizeof(short) * sw_size);
  sw2_h = (short *)malloc(sizeof(short) * sw_size);

  memset(sw1_h, 0, sw_size * sizeof(short));
  memset(sw2_h, 0, sw_size * sizeof(short));
  int cnt = 0;
  for (int i = 1, j = 0; i < polynomial_size / 2; i++) {
    int bit = (polynomial_size / 2) >> 1;
    for (; j & bit; bit >>= 1)
      j ^= bit;
    j ^= bit;

    if (i < j) {
      sw1_h[cnt] = i;
      sw2_h[cnt] = j;
      cnt++;
    }
  }
  cudaMemcpyToSymbol(SW1, sw1_h, sw_size * sizeof(short), 0,
                     cudaMemcpyHostToDevice);
  cudaMemcpyToSymbol(SW2, sw2_h, sw_size * sizeof(short), 0,
                     cudaMemcpyHostToDevice);
  free(sw1_h);
  free(sw2_h);
}

template <typename T, typename ST>
void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
                                    uint32_t gpu_index, uint32_t input_lwe_dim,
                                    uint32_t glwe_dim, uint32_t l_gadget,
                                    uint32_t polynomial_size) {

  cudaSetDevice(gpu_index);
  int shared_memory_size = sizeof(double) * polynomial_size;

  int total_polynomials =
      input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * l_gadget;

  // Here the buffer size is the size of double2 times the number of
  // polynomials times the polynomial size over 2, because the polynomials are
  // compressed into the complex domain to perform the FFT
  size_t buffer_size =
      total_polynomials * polynomial_size / 2 * sizeof(double2);

  int gridSize = total_polynomials;
  int blockSize = polynomial_size / choose_opt(polynomial_size);
  // todo(Joao): let's use cudaMallocHost here,
  // since it allocates page-locked (pinned) memory, which allows
  // faster data copies
  double2 *h_bsk = (double2 *)malloc(buffer_size);
  double2 *d_bsk;
  cudaMalloc((void **)&d_bsk, buffer_size);

  // compress the real bsk into the complex domain and divide it by the
  // maximum value of the Torus type
  for (int i = 0; i < total_polynomials; i++) {
    int complex_current_poly_idx = i * polynomial_size / 2;
    int torus_current_poly_idx = i * polynomial_size;
    for (int j = 0; j < polynomial_size / 2; j++) {
      h_bsk[complex_current_poly_idx + j].x =
          src[torus_current_poly_idx + 2 * j];
      h_bsk[complex_current_poly_idx + j].y =
          src[torus_current_poly_idx + 2 * j + 1];
      h_bsk[complex_current_poly_idx + j].x /=
          (double)std::numeric_limits<T>::max();
      h_bsk[complex_current_poly_idx + j].y /=
          (double)std::numeric_limits<T>::max();
    }
  }

  cudaMemcpy(d_bsk, h_bsk, buffer_size, cudaMemcpyHostToDevice);

  auto stream = static_cast<cudaStream_t *>(v_stream);
  switch (polynomial_size) {
  // FIXME (Agnes): check if polynomial sizes are ok
  case 512:
    batch_NSMFFT<FFTDegree<Degree<512>, ForwardFFT>>
        <<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest);
    break;
  case 1024:
    batch_NSMFFT<FFTDegree<Degree<1024>, ForwardFFT>>
        <<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest);
    break;
  case 2048:
    batch_NSMFFT<FFTDegree<Degree<2048>, ForwardFFT>>
        <<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest);
    break;
  case 4096:
    batch_NSMFFT<FFTDegree<Degree<4096>, ForwardFFT>>
        <<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest);
    break;
  case 8192:
    batch_NSMFFT<FFTDegree<Degree<8192>, ForwardFFT>>
        <<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest);
    break;
  default:
    break;
  }

  cudaFree(d_bsk);
  free(h_bsk);
}

void cuda_convert_lwe_bootstrap_key_32(void *dest, void *src, void *v_stream,
                                       uint32_t gpu_index,
                                       uint32_t input_lwe_dim,
                                       uint32_t glwe_dim, uint32_t l_gadget,
                                       uint32_t polynomial_size) {
  cuda_convert_lwe_bootstrap_key<uint32_t, int32_t>(
      (double2 *)dest, (int32_t *)src, v_stream, gpu_index, input_lwe_dim,
      glwe_dim, l_gadget, polynomial_size);
}

void cuda_convert_lwe_bootstrap_key_64(void *dest, void *src, void *v_stream,
                                       uint32_t gpu_index,
                                       uint32_t input_lwe_dim,
                                       uint32_t glwe_dim, uint32_t l_gadget,
                                       uint32_t polynomial_size) {
  cuda_convert_lwe_bootstrap_key<uint64_t, int64_t>(
      (double2 *)dest, (int64_t *)src, v_stream, gpu_index, input_lwe_dim,
      glwe_dim, l_gadget, polynomial_size);
}

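// Illustrative usage sketch (added for exposition, not part of this commit):
// converting a standard-domain 64-bit bootstrapping key into the Fourier
// domain on the GPU with the stream and allocation helpers from src/device.cu.
// The sizes follow the total_polynomials computation above; the input pointer
// and parameters are placeholders.
void convert_bsk_64_example(int64_t *std_bsk, uint32_t input_lwe_dim,
                            uint32_t glwe_dim, uint32_t l_gadget,
                            uint32_t polynomial_size, uint32_t gpu_index) {
  void *stream = cuda_create_stream(gpu_index);
  uint64_t total_polynomials =
      (uint64_t)input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * l_gadget;
  uint64_t fourier_size =
      total_polynomials * polynomial_size / 2 * sizeof(double2);
  double2 *d_fourier_bsk = (double2 *)cuda_malloc(fourier_size, gpu_index);
  cuda_convert_lwe_bootstrap_key_64(d_fourier_bsk, std_bsk, stream, gpu_index,
                                    input_lwe_dim, glwe_dim, l_gadget,
                                    polynomial_size);
  cuda_synchronize_device(gpu_index);
  cuda_destroy_stream(stream, gpu_index);
}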
#endif // CNCRT_BSK_H
91
src/crypto/gadget.cuh
Normal file
@@ -0,0 +1,91 @@
#ifndef CNCRT_CRYPTO_H
#define CNCRT_CRYPTO_H

#include "polynomial/polynomial.cuh"
#include <cstdint>

template <typename T, class params> class GadgetMatrix {
private:
  uint32_t l_gadget;
  uint32_t base_log;
  uint32_t mask;
  uint32_t halfbg;
  T offset;

public:
  __device__ GadgetMatrix(uint32_t base_log, uint32_t l_gadget)
      : base_log(base_log), l_gadget(l_gadget) {
    uint32_t bg = 1 << base_log;
    this->halfbg = bg / 2;
    this->mask = bg - 1;
    T temp = 0;
    for (int i = 0; i < this->l_gadget; i++) {
      temp += 1ULL << (sizeof(T) * 8 - (i + 1) * this->base_log);
    }
    this->offset = temp * this->halfbg;
  }

  template <typename V, typename U>
  __device__ void decompose_one_level(Polynomial<V, params> &result,
                                      Polynomial<U, params> &polynomial,
                                      uint32_t level) {
    int tid = threadIdx.x;
    for (int i = 0; i < params::opt; i++) {
      T s = polynomial.coefficients[tid] + this->offset;
      uint32_t decal = (sizeof(T) * 8 - (level + 1) * this->base_log);
      T temp1 = (s >> decal) & this->mask;
      result.coefficients[tid] = (V)(temp1 - this->halfbg);
      tid += params::degree / params::opt;
    }
  }

  template <typename V, typename U>
  __device__ void decompose_one_level(V *result, U *polynomial,
                                      uint32_t level) {
    int tid = threadIdx.x;
    for (int i = 0; i < params::opt; i++) {
      T s = polynomial[tid] + this->offset;
      uint32_t decal = (sizeof(T) * 8 - (level + 1) * this->base_log);
      T temp1 = (s >> decal) & this->mask;
      result[tid] = (V)(temp1 - this->halfbg);
      tid += params::degree / params::opt;
    }
  }

  __device__ T decompose_one_level_single(T element, uint32_t level) {
    T s = element + this->offset;
    uint32_t decal = (sizeof(T) * 8 - (level + 1) * this->base_log);
    T temp1 = (s >> decal) & this->mask;
    return (T)(temp1 - this->halfbg);
  }
};

template <typename T> class GadgetMatrixSingle {
private:
  uint32_t l_gadget;
  uint32_t base_log;
  uint32_t mask;
  uint32_t halfbg;
  T offset;

public:
  __device__ GadgetMatrixSingle(uint32_t base_log, uint32_t l_gadget)
      : base_log(base_log), l_gadget(l_gadget) {
    uint32_t bg = 1 << base_log;
    this->halfbg = bg / 2;
    this->mask = bg - 1;
    T temp = 0;
    for (int i = 0; i < this->l_gadget; i++) {
      temp += 1ULL << (sizeof(T) * 8 - (i + 1) * this->base_log);
    }
    this->offset = temp * this->halfbg;
  }

  __device__ T decompose_one_level_single(T element, uint32_t level) {
    T s = element + this->offset;
    uint32_t decal = (sizeof(T) * 8 - (level + 1) * this->base_log);
    T temp1 = (s >> decal) & this->mask;
    return (T)(temp1 - this->halfbg);
  }
};

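// Worked sketch (added for exposition, not part of this commit): the same
// signed digit extraction as decompose_one_level_single, written as plain host
// code so the arithmetic can be checked outside a kernel. Recomposing the
// l_gadget digits returns the input with its low 64 - l_gadget * base_log bits
// cleared, so it is exact once the input has already been rounded with
// round_to_closest_multiple from crypto/torus.cuh.
inline uint64_t recompose_digits_example(uint64_t element, uint32_t base_log,
                                         uint32_t l_gadget) {
  uint64_t bg = 1ULL << base_log;
  uint64_t halfbg = bg / 2;
  uint64_t mask = bg - 1;
  uint64_t offset = 0;
  for (uint32_t i = 0; i < l_gadget; i++)
    offset += 1ULL << (64 - (i + 1) * base_log);
  offset *= halfbg;
  uint64_t acc = 0;
  for (uint32_t level = 0; level < l_gadget; level++) {
    uint32_t decal = 64 - (level + 1) * base_log;
    // same digit extraction as decompose_one_level_single above
    int64_t digit =
        (int64_t)(((element + offset) >> decal) & mask) - (int64_t)halfbg;
    acc += (uint64_t)digit << decal;
  }
  return acc;
}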
#endif // CNCRT_CRYPTO_H
45
src/crypto/torus.cuh
Normal file
@@ -0,0 +1,45 @@
#ifndef CNCRT_TORUS_H
#define CNCRT_TORUS_H

#include "types/int128.cuh"
#include <limits>

template <typename Torus>
__device__ inline Torus typecast_double_to_torus(double x) {
  if constexpr (sizeof(Torus) < 8) {
    // this simple cast works up to 32 bits, afterwards we must do it manually
    long long ret = x;
    return (Torus)ret;
  } else {
    int128 nnnn = make_int128_from_float(x);
    uint64_t lll = nnnn.lo_;
    return lll;
  }
  // nvcc doesn't see that the if/else above always returns and complains that
  // this function might return nothing, so we add this unreachable return
  return 0;
}

template <typename T>
__device__ inline T round_to_closest_multiple(T x, uint32_t base_log,
                                              uint32_t l_gadget) {
  T shift = sizeof(T) * 8 - l_gadget * base_log;
  T mask = 1ll << (shift - 1);
  T b = (x & mask) >> (shift - 1);
  T res = x >> shift;
  res += b;
  res <<= shift;
  return res;
}
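// Worked example (added for exposition, not in the original source): with
// T = uint8_t, base_log = 2 and l_gadget = 2, only the top 4 bits are kept and
// the result is the closest multiple of 2^4 = 16. For x = 0b10111001 (185):
// shift = 4, the rounding bit b = 1, so res = (185 >> 4) + 1 = 12 and the
// function returns 12 << 4 = 192, i.e. 185 rounded to the nearest multiple of 16.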

template <typename T>
__device__ __forceinline__ T rescale_torus_element(T element,
                                                   uint32_t log_shift) {
  // todo(Joao): not sure if this works
  // return element >> log_shift;
  return round((double)element / (double(std::numeric_limits<T>::max()) + 1.0) *
               (double)log_shift);
}

#endif // CNCRT_TORUS_H
147
src/device.cu
Normal file
@@ -0,0 +1,147 @@
#include "device.h"
|
||||
#include <cstdint>
|
||||
#include <cstdio>
|
||||
#include <cuda_runtime.h>
|
||||
#include <helper_cuda.h>
|
||||
|
||||
/// Unsafe function to create a CUDA stream, must check first that GPU exists
|
||||
void *cuda_create_stream(uint32_t gpu_index) {
|
||||
cudaSetDevice(gpu_index);
|
||||
cudaStream_t *stream = new cudaStream_t;
|
||||
cudaStreamCreate(stream);
|
||||
return stream;
|
||||
}
|
||||
|
||||
/// Unsafe function to destroy CUDA stream, must check first the GPU exists
|
||||
int cuda_destroy_stream(void *v_stream, uint32_t gpu_index) {
|
||||
cudaSetDevice(gpu_index);
|
||||
auto stream = static_cast<cudaStream_t *>(v_stream);
|
||||
cudaStreamDestroy(*stream);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/// Unsafe function that will try to allocate even if gpu_index is invalid
|
||||
/// or if there's not enough memory. A safe wrapper around it must call
|
||||
/// cuda_check_valid_malloc() first
|
||||
void *cuda_malloc(uint64_t size, uint32_t gpu_index) {
|
||||
cudaSetDevice(gpu_index);
|
||||
void *ptr;
|
||||
checkCudaErrors(cudaMalloc((void **)&ptr, size));
|
||||
|
||||
return ptr;
|
||||
}
|
||||
|
||||
/// Checks that allocation is valid
|
||||
/// 0: valid
|
||||
/// -1: invalid, not enough memory in device
|
||||
/// -2: invalid, gpu index doesn't exist
|
||||
int cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index) {
|
||||
|
||||
if (gpu_index >= cuda_get_number_of_gpus()) {
|
||||
// error code: invalid gpu_index
|
||||
return -2;
|
||||
}
|
||||
cudaSetDevice(gpu_index);
|
||||
size_t total_mem, free_mem;
|
||||
cudaMemGetInfo(&free_mem, &total_mem);
|
||||
if (size > free_mem) {
|
||||
// error code: not enough memory
|
||||
return -1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/// Tries to copy memory to the GPU asynchronously
|
||||
/// 0: success
|
||||
/// -1: error, invalid device pointer
|
||||
/// -2: error, gpu index doesn't exist
|
||||
int cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
|
||||
void *v_stream, uint32_t gpu_index) {
|
||||
if (gpu_index >= cuda_get_number_of_gpus()) {
|
||||
// error code: invalid gpu_index
|
||||
return -2;
|
||||
}
|
||||
cudaPointerAttributes attr;
|
||||
cudaPointerGetAttributes(&attr, dest);
|
||||
if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
|
||||
// error code: invalid device pointer
|
||||
return -1;
|
||||
}
|
||||
auto stream = static_cast<cudaStream_t *>(v_stream);
|
||||
cudaSetDevice(gpu_index);
|
||||
checkCudaErrors(cudaMemcpyAsync(dest, src, size, cudaMemcpyHostToDevice,
|
||||
*stream));
|
||||
return 0;
|
||||
}
|
||||
|
||||
/// Synchronizes device
|
||||
/// 0: success
|
||||
/// -2: error, gpu index doesn't exist
|
||||
int cuda_synchronize_device(uint32_t gpu_index) {
|
||||
if (gpu_index >= cuda_get_number_of_gpus()) {
|
||||
// error code: invalid gpu_index
|
||||
return -2;
|
||||
}
|
||||
cudaSetDevice(gpu_index);
|
||||
cudaDeviceSynchronize();
|
||||
return 0;
|
||||
}
|
||||
|
||||
/// Tries to copy memory to the GPU asynchronously
|
||||
/// 0: success
|
||||
/// -1: error, invalid device pointer
|
||||
/// -2: error, gpu index doesn't exist
|
||||
int cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
|
||||
void *v_stream, uint32_t gpu_index) {
|
||||
if (gpu_index >= cuda_get_number_of_gpus()) {
|
||||
// error code: invalid gpu_index
|
||||
return -2;
|
||||
}
|
||||
cudaPointerAttributes attr;
|
||||
cudaPointerGetAttributes(&attr, src);
|
||||
if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
|
||||
// error code: invalid device pointer
|
||||
return -1;
|
||||
}
|
||||
auto stream = static_cast<cudaStream_t *>(v_stream);
|
||||
cudaSetDevice(gpu_index);
|
||||
checkCudaErrors(cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToHost,
|
||||
*stream));
|
||||
return 0;
|
||||
}
|
||||
|
||||
/// Return number of GPUs available
|
||||
int cuda_get_number_of_gpus() {
|
||||
int num_gpus;
|
||||
cudaGetDeviceCount(&num_gpus);
|
||||
return num_gpus;
|
||||
}
|
||||
|
||||
/// Drop a cuda array
|
||||
int cuda_drop(void *ptr, uint32_t gpu_index) {
|
||||
if (gpu_index >= cuda_get_number_of_gpus()) {
|
||||
// error code: invalid gpu_index
|
||||
return -2;
|
||||
}
|
||||
cudaSetDevice(gpu_index);
|
||||
checkCudaErrors(cudaFree(ptr));
|
||||
return 0;
|
||||
}
|
||||
|
||||
/// Get the maximum size for the shared memory
|
||||
int cuda_get_max_shared_memory(uint32_t gpu_index) {
|
||||
if (gpu_index >= cuda_get_number_of_gpus()) {
|
||||
// error code: invalid gpu_index
|
||||
return -2;
|
||||
}
|
||||
cudaSetDevice(gpu_index);
|
||||
cudaDeviceProp prop;
|
||||
cudaGetDeviceProperties(&prop, gpu_index);
|
||||
int max_shared_memory = 0;
|
||||
if (prop.major > 7) {
|
||||
max_shared_memory = prop.sharedMemPerMultiprocessor;
|
||||
} else {
|
||||
max_shared_memory = prop.sharedMemPerBlock;
|
||||
}
|
||||
return max_shared_memory;
|
||||
}
|
||||
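// Illustrative sketch (added for exposition, not part of this commit): the
// intended calling pattern for the helpers above, with every call checked
// through the documented error codes. The gpu index and sizes are placeholders.
int copy_roundtrip_example(void *h_in, void *h_out, uint64_t size) {
  uint32_t gpu_index = 0;
  if (cuda_check_valid_malloc(size, gpu_index) != 0)
    return -1;
  void *stream = cuda_create_stream(gpu_index);
  void *d_buf = cuda_malloc(size, gpu_index);
  if (cuda_memcpy_async_to_gpu(d_buf, h_in, size, stream, gpu_index) != 0)
    return -1;
  if (cuda_memcpy_async_to_cpu(h_out, d_buf, size, stream, gpu_index) != 0)
    return -1;
  cuda_synchronize_device(gpu_index);
  cuda_drop(d_buf, gpu_index);
  cuda_destroy_stream(stream, gpu_index);
  return 0;
}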
753
src/fft/bnsmfft.cuh
Normal file
@@ -0,0 +1,753 @@
#ifndef GPU_BOOTSTRAP_FFT_1024_CUH
|
||||
#define GPU_BOOTSTRAP_FFT_1024_CUH
|
||||
|
||||
#include "complex/operations.cuh"
|
||||
#include "polynomial/functions.cuh"
|
||||
#include "polynomial/parameters.cuh"
|
||||
#include "twiddles.cuh"
|
||||
|
||||
/*
|
||||
* bit reverse
|
||||
* coefficient bits are reversed based on precalculated indexes
|
||||
* SW1 and SW2
|
||||
*/
|
||||
template <class params> __device__ void bit_reverse_inplace(double2 *A) {
|
||||
int tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
short sw1 = SW1[tid];
|
||||
short sw2 = SW2[tid];
|
||||
double2 tmp = A[sw1];
|
||||
A[sw1] = A[sw2];
|
||||
A[sw2] = tmp;
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* negacyclic twiddle
|
||||
* returns negacyclic twiddle based on degree and index
|
||||
* twiddles are precalculated inside negTwids{3..13} arrays
|
||||
*/
|
||||
template <int degree> __device__ double2 negacyclic_twiddle(int j) {
|
||||
double2 twid;
|
||||
switch (degree) {
|
||||
case 512:
|
||||
twid = negTwids9[j];
|
||||
break;
|
||||
case 1024:
|
||||
twid = negTwids10[j];
|
||||
break;
|
||||
case 2048:
|
||||
twid = negTwids11[j];
|
||||
break;
|
||||
case 4096:
|
||||
twid = negTwids12[j];
|
||||
break;
|
||||
  case 8192:
    twid = negTwids13[j];
    break;
default:
|
||||
twid.x = 0;
|
||||
twid.y = 0;
|
||||
break;
|
||||
}
|
||||
return twid;
|
||||
}
|
||||
|
||||
/*
 * Direct negacyclic FFT:
 * - before the FFT, the N real coefficients are stored into an
 *   N/2 sized complex vector, with the even coefficients in the real part
 *   and the odd coefficients in the imaginary part. This is referred to
 *   as the half-size FFT
 * - when calling NSMFFT_direct for the forward negacyclic FFT of the PBS,
 *   opt is divided by 2 because the butterfly pattern is always applied
 *   between pairs of coefficients
 * - instead of twisting each coefficient A_j before the FFT by
 *   multiplying by the w^j roots of unity (aka twiddles, w = exp(-i pi / N)),
 *   the FFT is modified, and for each level k of the FFT the twiddle:
 *     w_j,k = exp(-i pi j / 2^k)
 *   is replaced with:
 *     \zeta_j,k = exp(-i pi (2j-1) / 2^k)
 * - this technique also implies a correction of the
 *   complex values obtained after the FFT, which is done in the
 *   forward_negacyclic_fft_inplace function of bootstrap.cuh
 */
template <class params> __device__ void NSMFFT_direct(double2 *A) {
|
||||
/* First, reverse the bits of the input complex
|
||||
* The bit reversal for half-size FFT has been stored into the
|
||||
* SW1 and SW2 arrays beforehand
|
||||
* Each thread is always in charge of "opt/2" pairs of coefficients,
|
||||
* which is why we always loop through N/2 by N/opt strides
|
||||
* The pragma unroll instruction tells the compiler to unroll the
|
||||
* full loop, which should increase performance TODO (Agnes) check this
|
||||
*/
|
||||
bit_reverse_inplace<params>(A);
|
||||
__syncthreads();
|
||||
|
||||
// Now we go through all the levels of the FFT one by one
|
||||
// (instead of recursively)
|
||||
// first iteration: k=1, zeta=i for all coefficients
|
||||
int tid = threadIdx.x;
|
||||
int i1, i2;
|
||||
double2 u, v;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
// the butterfly pattern is applied to each pair
|
||||
// of coefficients, with a stride of 2
|
||||
i1 = tid << 1;
|
||||
i2 = i1 + 1;
|
||||
u = A[i1];
|
||||
// v = i*A[i2]
|
||||
v.y = A[i2].x;
|
||||
v.x = -A[i2].y;
|
||||
// A[i1] <- A[i1] + i*A[i2]
|
||||
// A[i2] <- A[i1] - i*A[i2]
|
||||
A[i1] += v;
|
||||
A[i2] = u - v;
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// second iteration: apply the butterfly pattern
|
||||
// between groups of 4 coefficients
|
||||
// k=2, \zeta=exp(i pi/4) for even coefficients and
|
||||
// exp(3 i pi / 4) for odd coefficients
|
||||
// TODO (Agnes) how does this work on the gpu? aren't we doing
|
||||
// a lot more computations than we should?
|
||||
tid = threadIdx.x;
|
||||
// odd = 0 for even coefficients, 1 for odd coefficients
|
||||
int odd = tid & 1;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
// the butterfly pattern is applied to each pair
|
||||
// of coefficients, with a stride of 2
|
||||
// i1=2*tid if tid is even and 2*tid-1 if it is odd
|
||||
i1 = (tid << 1) - odd;
|
||||
i2 = i1 + 2;
|
||||
|
||||
double a = A[i2].x;
|
||||
double b = A[i2].y;
|
||||
u = A[i1];
|
||||
|
||||
// \zeta_j,2 = exp(-i pi (2j-1)/4) -> j=0: exp(i pi/4) or j=1: exp(-i pi/4)
|
||||
// \zeta_even = sqrt(2)/2 + i * sqrt(2)/2 = sqrt(2)/2*(1+i)
|
||||
// \zeta_odd = sqrt(2)/2 - i * sqrt(2)/2 = sqrt(2)/2*(1-i)
|
||||
|
||||
// v_j = \zeta_j * (a+i*b)
|
||||
// v_even = sqrt(2)/2*((a-b)+i*(a+b))
|
||||
// v_odd = sqrt(2)/2*(a+b+i*(b-a))
|
||||
v.x =
|
||||
(odd) ? (-0.707106781186548) * (a + b) : (0.707106781186548) * (a - b);
|
||||
v.y = (odd) ? (0.707106781186548) * (a - b) : (0.707106781186548) * (a + b);
|
||||
|
||||
// v.x = (0.707106781186548 * odd) * (a + b) + (0.707106781186548 * (!odd))
|
||||
// * (a - b); v.y = (0.707106781186548 * odd) * (b - a) + (0.707106781186548
|
||||
// * (!odd)) * (a + b);
|
||||
|
||||
A[i1] = u + v;
|
||||
A[i2] = u - v;
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// third iteration
|
||||
// from k=3 on, we have to do the full complex multiplication
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
// the butterfly pattern is applied to each pair
|
||||
// of coefficients, with a stride of 4
|
||||
// rem is the remainder of tid/4. tid takes values:
|
||||
// 0, 1, 2, 3, 4, 5, 6, 7, ... N/4
|
||||
// then rem takes values:
|
||||
// 0, 1, 2, 3, 0, 1, 2, 3, ... N/4
|
||||
// and striding by 4 will allow us to cover all
|
||||
// the coefficients correctly
|
||||
int rem = tid & 3;
|
||||
i1 = (tid << 1) - rem;
|
||||
i2 = i1 + 4;
|
||||
|
||||
double2 w = negTwids3[rem];
|
||||
u = A[i1], v = A[i2] * w;
|
||||
|
||||
A[i1] = u + v;
|
||||
A[i2] = u - v;
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// 4_th iteration
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
// the butterfly pattern is applied to each pair
|
||||
// of coefficients, with a stride of 8
|
||||
// rem is the remainder of tid/8. tid takes values:
|
||||
// 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, ... N/4
|
||||
// then rem takes values:
|
||||
// 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, ... N/4
|
||||
// and striding by 8 will allow us to cover all
|
||||
// the coefficients correctly
|
||||
int rem = tid & 7;
|
||||
i1 = (tid << 1) - rem;
|
||||
i2 = i1 + 8;
|
||||
|
||||
double2 w = negTwids4[rem];
|
||||
u = A[i1], v = A[i2] * w;
|
||||
A[i1] = u + v;
|
||||
A[i2] = u - v;
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// 5_th iteration
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
// the butterfly pattern is applied to each pair
|
||||
// of coefficients, with a stride of 16
|
||||
// rem is the remainder of tid/16
|
||||
// and the same logic as for previous iterations applies
|
||||
int rem = tid & 15;
|
||||
i1 = (tid << 1) - rem;
|
||||
i2 = i1 + 16;
|
||||
double2 w = negTwids5[rem];
|
||||
u = A[i1], v = A[i2] * w;
|
||||
A[i1] = u + v;
|
||||
A[i2] = u - v;
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// 6_th iteration
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
// the butterfly pattern is applied to each pair
|
||||
// of coefficients, with a stride of 32
|
||||
// rem is the remainder of tid/32
|
||||
// and the same logic as for previous iterations applies
|
||||
int rem = tid & 31;
|
||||
i1 = (tid << 1) - rem;
|
||||
i2 = i1 + 32;
|
||||
double2 w = negTwids6[rem];
|
||||
u = A[i1], v = A[i2] * w;
|
||||
A[i1] = u + v;
|
||||
A[i2] = u - v;
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// 7_th iteration
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
// the butterfly pattern is applied to each pair
|
||||
// of coefficients, with a stride of 64
|
||||
// rem is the remainder of tid/64
|
||||
// and the same logic as for previous iterations applies
|
||||
int rem = tid & 63;
|
||||
i1 = (tid << 1) - rem;
|
||||
i2 = i1 + 64;
|
||||
double2 w = negTwids7[rem];
|
||||
u = A[i1], v = A[i2] * w;
|
||||
A[i1] = u + v;
|
||||
A[i2] = u - v;
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// 8_th iteration
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
// the butterfly pattern is applied to each pair
|
||||
// of coefficients, with a stride of 128
|
||||
// rem is the remainder of tid/128
|
||||
// and the same logic as for previous iterations applies
|
||||
int rem = tid & 127;
|
||||
i1 = (tid << 1) - rem;
|
||||
i2 = i1 + 128;
|
||||
double2 w = negTwids8[rem];
|
||||
u = A[i1], v = A[i2] * w;
|
||||
A[i1] = u + v;
|
||||
A[i2] = u - v;
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
if constexpr (params::log2_degree > 8) {
|
||||
// 9_th iteration
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
// the butterfly pattern is applied to each pair
|
||||
// of coefficients, with a stride of 256
|
||||
// rem is the remainder of tid/256
|
||||
// and the same logic as for previous iterations applies
|
||||
int rem = tid & 255;
|
||||
i1 = (tid << 1) - rem;
|
||||
i2 = i1 + 256;
|
||||
double2 w = negTwids9[rem];
|
||||
u = A[i1], v = A[i2] * w;
|
||||
A[i1] = u + v;
|
||||
A[i2] = u - v;
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
if constexpr (params::log2_degree > 9) {
|
||||
// 10_th iteration
|
||||
tid = threadIdx.x;
|
||||
//#pragma unroll
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
// the butterfly pattern is applied to each pair
|
||||
// of coefficients, with a stride of 512
|
||||
// rem is the remainder of tid/512
|
||||
// and the same logic as for previous iterations applies
|
||||
int rem = tid & 511;
|
||||
i1 = (tid << 1) - rem;
|
||||
i2 = i1 + 512;
|
||||
double2 w = negTwids10[rem];
|
||||
u = A[i1], v = A[i2] * w;
|
||||
A[i1] = u + v;
|
||||
A[i2] = u - v;
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if constexpr (params::log2_degree > 10) {
|
||||
// 11_th iteration
|
||||
tid = threadIdx.x;
|
||||
//#pragma unroll
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
// the butterfly pattern is applied to each pair
|
||||
// of coefficients, with a stride of 1024
|
||||
// rem is the remainder of tid/1024
|
||||
// and the same logic as for previous iterations applies
|
||||
int rem = tid & 1023;
|
||||
i1 = (tid << 1) - rem;
|
||||
i2 = i1 + 1024;
|
||||
double2 w = negTwids11[rem];
|
||||
u = A[i1], v = A[i2] * w;
|
||||
A[i1] = u + v;
|
||||
A[i2] = u - v;
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if constexpr (params::log2_degree > 11) {
|
||||
// 12_th iteration
|
||||
tid = threadIdx.x;
|
||||
//#pragma unroll
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
// the butterfly pattern is applied to each pair
|
||||
// of coefficients, with a stride of 2048
|
||||
// rem is the remainder of tid/2048
|
||||
// and the same logic as for previous iterations applies
|
||||
int rem = tid & 2047;
|
||||
i1 = (tid << 1) - rem;
|
||||
i2 = i1 + 2048;
|
||||
double2 w = negTwids12[rem];
|
||||
u = A[i1], v = A[i2] * w;
|
||||
A[i1] = u + v;
|
||||
A[i2] = u - v;
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
// Real polynomials handled should not exceed a degree of 8192
|
||||
}
|
||||
|
||||
/*
|
||||
* negacyclic inverse fft
|
||||
*/
|
||||
template <class params> __device__ void NSMFFT_inverse(double2 *A) {
|
||||
/* First, reverse the bits of the input complex
|
||||
* The bit reversal for half-size FFT has been stored into the
|
||||
* SW1 and SW2 arrays beforehand
|
||||
* Each thread is always in charge of "opt/2" pairs of coefficients,
|
||||
* which is why we always loop through N/2 by N/opt strides
|
||||
* The pragma unroll instruction tells the compiler to unroll the
|
||||
* full loop, which should increase performance TODO (Agnes) check this
|
||||
*/
|
||||
int tid;
|
||||
int i1, i2;
|
||||
double2 u, v;
|
||||
if constexpr (params::log2_degree > 11) {
|
||||
// 12_th iteration
|
||||
tid = threadIdx.x;
|
||||
//#pragma unroll
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
// the butterfly pattern is applied to each pair
|
||||
// of coefficients, with a stride of 2048
|
||||
// rem is the remainder of tid/2048
|
||||
// and the same logic as for previous iterations applies
|
||||
int rem = tid & 2047;
|
||||
i1 = (tid << 1) - rem;
|
||||
i2 = i1 + 2048;
|
||||
double2 w = conjugate(negTwids12[rem]);
|
||||
u = A[i1], v = A[i2];
|
||||
A[i1] = (u + v) * 0.5;
|
||||
A[i2] = (u - v) * w * 0.5;
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if constexpr (params::log2_degree > 10) {
|
||||
// 11_th iteration
|
||||
tid = threadIdx.x;
|
||||
//#pragma unroll
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
// the butterfly pattern is applied to each pair
|
||||
// of coefficients, with a stride of 1024
|
||||
// rem is the remainder of tid/1024
|
||||
// and the same logic as for previous iterations applies
|
||||
int rem = tid & 1023;
|
||||
i1 = (tid << 1) - rem;
|
||||
i2 = i1 + 1024;
|
||||
double2 w = conjugate(negTwids11[rem]);
|
||||
u = A[i1], v = A[i2];
|
||||
A[i1] = (u + v) * 0.5;
|
||||
A[i2] = (u - v) * w * 0.5;
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if constexpr (params::log2_degree > 9) {
|
||||
// 10_th iteration
|
||||
tid = threadIdx.x;
|
||||
//#pragma unroll
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
// the butterfly pattern is applied to each pair
|
||||
// of coefficients, with a stride of 512
|
||||
// rem is the remainder of tid/512
|
||||
// and the same logic as for previous iterations applies
|
||||
int rem = tid & 511;
|
||||
i1 = (tid << 1) - rem;
|
||||
i2 = i1 + 512;
|
||||
double2 w = conjugate(negTwids10[rem]);
|
||||
u = A[i1], v = A[i2];
|
||||
A[i1] = (u + v) * 0.5;
|
||||
A[i2] = (u - v) * w * 0.5;
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if constexpr (params::log2_degree > 8) {
|
||||
// 9_th iteration
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
// the butterfly pattern is applied to each pair
|
||||
// of coefficients, with a stride of 256
|
||||
// rem is the remainder of tid/256
|
||||
// and the same logic as for previous iterations applies
|
||||
int rem = tid & 255;
|
||||
i1 = (tid << 1) - rem;
|
||||
i2 = i1 + 256;
|
||||
double2 w = conjugate(negTwids9[rem]);
|
||||
u = A[i1], v = A[i2];
|
||||
A[i1] = (u + v) * 0.5;
|
||||
A[i2] = (u - v) * w * 0.5;
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
// 8_th iteration
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
// the butterfly pattern is applied to each pair
|
||||
// of coefficients, with a stride of 128
|
||||
// rem is the remainder of tid/128
|
||||
// and the same logic as for previous iterations applies
|
||||
int rem = tid & 127;
|
||||
i1 = (tid << 1) - rem;
|
||||
i2 = i1 + 128;
|
||||
double2 w = conjugate(negTwids8[rem]);
|
||||
u = A[i1], v = A[i2];
|
||||
A[i1] = (u + v) * 0.5;
|
||||
A[i2] = (u - v) * w * 0.5;
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// 7_th iteration
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
// the butterfly pattern is applied to each pair
|
||||
// of coefficients, with a stride of 64
|
||||
// rem is the remainder of tid/64
|
||||
// and the same logic as for previous iterations applies
|
||||
int rem = tid & 63;
|
||||
i1 = (tid << 1) - rem;
|
||||
i2 = i1 + 64;
|
||||
double2 w = conjugate(negTwids7[rem]);
|
||||
u = A[i1], v = A[i2];
|
||||
A[i1] = (u + v) * 0.5;
|
||||
A[i2] = (u - v) * w * 0.5;
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// 6_th iteration
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
// the butterfly pattern is applied to each pair
|
||||
// of coefficients, with a stride of 32
|
||||
// rem is the remainder of tid/32
|
||||
// and the same logic as for previous iterations applies
|
||||
int rem = tid & 31;
|
||||
i1 = (tid << 1) - rem;
|
||||
i2 = i1 + 32;
|
||||
double2 w = conjugate(negTwids6[rem]);
|
||||
u = A[i1], v = A[i2];
|
||||
A[i1] = (u + v) * 0.5;
|
||||
A[i2] = (u - v) * w * 0.5;
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// 5_th iteration
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
// the butterfly pattern is applied to each pair
|
||||
// of coefficients, with a stride of 16
|
||||
// rem is the remainder of tid/16
|
||||
// and the same logic as for previous iterations applies
|
||||
int rem = tid & 15;
|
||||
i1 = (tid << 1) - rem;
|
||||
i2 = i1 + 16;
|
||||
double2 w = conjugate(negTwids5[rem]);
|
||||
u = A[i1], v = A[i2];
|
||||
A[i1] = (u + v) * 0.5;
|
||||
A[i2] = (u - v) * w * 0.5;
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// 4_th iteration
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
// the butterfly pattern is applied to each pair
|
||||
// of coefficients, with a stride of 8
|
||||
// rem is the remainder of tid/8. tid takes values:
|
||||
// 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, ... N/4
|
||||
// then rem takes values:
|
||||
// 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, ... N/4
|
||||
// and striding by 8 will allow us to cover all
|
||||
// the coefficients correctly
|
||||
int rem = tid & 7;
|
||||
i1 = (tid << 1) - rem;
|
||||
i2 = i1 + 8;
|
||||
|
||||
double2 w = conjugate(negTwids4[rem]);
|
||||
u = A[i1], v = A[i2];
|
||||
A[i1] = (u + v) * 0.5;
|
||||
A[i2] = (u - v) * w * 0.5;
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// third iteration
|
||||
// from k=3 on, we have to do the full complex multiplication
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
// the butterfly pattern is applied to each pair
|
||||
// of coefficients, with a stride of 4
|
||||
// rem is the remainder of tid/4. tid takes values:
|
||||
// 0, 1, 2, 3, 4, 5, 6, 7, ... N/4
|
||||
// then rem takes values:
|
||||
// 0, 1, 2, 3, 0, 1, 2, 3, ... N/4
|
||||
// and striding by 4 will allow us to cover all
|
||||
// the coefficients correctly
|
||||
int rem = tid & 3;
|
||||
i1 = (tid << 1) - rem;
|
||||
i2 = i1 + 4;
|
||||
|
||||
double2 w = conjugate(negTwids3[rem]);
|
||||
u = A[i1], v = A[i2];
|
||||
A[i1] = (u + v) * 0.5;
|
||||
A[i2] = (u - v) * w * 0.5;
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// second iteration: apply the butterfly pattern
|
||||
// between groups of 4 coefficients
|
||||
// k=2, \zeta=exp(i pi/4) for even coefficients and
|
||||
// exp(3 i pi / 4) for odd coefficients
|
||||
// TODO (Agnes) how does this work on the gpu? aren't we doing
|
||||
// a lot more computations than we should?
|
||||
tid = threadIdx.x;
|
||||
// odd = 0 for even coefficients, 1 for odd coefficients
|
||||
int odd = tid & 1;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
// the butterfly pattern is applied to each pair
|
||||
// of coefficients, with a stride of 2
|
||||
// i1=2*tid if tid is even and 2*tid-1 if it is odd
|
||||
i1 = (tid << 1) - odd;
|
||||
i2 = i1 + 2;
|
||||
|
||||
// TODO(Beka) optimize twiddle multiplication
|
||||
double2 w;
|
||||
if (odd) {
|
||||
w.x = -0.707106781186547461715008466854;
|
||||
w.y = -0.707106781186547572737310929369;
|
||||
} else {
|
||||
w.x = 0.707106781186547461715008466854;
|
||||
w.y = -0.707106781186547572737310929369;
|
||||
}
|
||||
|
||||
u = A[i1], v = A[i2];
|
||||
A[i1] = (u + v) * 0.5;
|
||||
A[i2] = (u - v) * w * 0.5;
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// Now we go through all the levels of the FFT one by one
|
||||
// (instead of recursively)
|
||||
// first iteration: k=1, zeta=i for all coefficients
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
// the butterfly pattern is applied to each pair
|
||||
// of coefficients, with a stride of 2
|
||||
i1 = tid << 1;
|
||||
i2 = i1 + 1;
|
||||
// TODO(Beka) optimize twiddle multiplication
|
||||
double2 w = {0, -1};
|
||||
u = A[i1], v = A[i2];
|
||||
A[i1] = (u + v) * 0.5;
|
||||
A[i2] = (u - v) * w * 0.5;
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
bit_reverse_inplace<params>(A);
|
||||
__syncthreads();
|
||||
// Real polynomials handled should not exceed a degree of 8192
|
||||
}
|
||||
|
||||
/*
|
||||
* correction after direct fft
|
||||
* does not use extra shared memory for recovering
|
||||
* correction is done using registers.
|
||||
* based on Pascal's paper
|
||||
*/
|
||||
template <class params>
|
||||
__device__ void correction_direct_fft_inplace(double2 *x) {
|
||||
constexpr int threads = params::degree / params::opt;
|
||||
int tid = threadIdx.x;
|
||||
double2 left[params::opt / 4];
|
||||
double2 right[params::opt / 4];
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt / 4; i++) {
|
||||
left[i] = x[tid + i * threads];
|
||||
}
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt / 4; i++) {
|
||||
right[i] = x[params::degree / 2 - (tid + i * threads + 1)];
|
||||
}
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt / 4; i++) {
|
||||
double2 tw = negacyclic_twiddle<params::degree>(tid + i * threads);
|
||||
double add_RE = left[i].x + right[i].x;
|
||||
double sub_RE = left[i].x - right[i].x;
|
||||
double add_IM = left[i].y + right[i].y;
|
||||
double sub_IM = left[i].y - right[i].y;
|
||||
|
||||
double tmp1 = add_IM * tw.x + sub_RE * tw.y;
|
||||
double tmp2 = -sub_RE * tw.x + add_IM * tw.y;
|
||||
x[tid + i * threads].x = (add_RE + tmp1) * 0.5;
|
||||
x[tid + i * threads].y = (sub_IM + tmp2) * 0.5;
|
||||
x[params::degree / 2 - (tid + i * threads + 1)].x = (add_RE - tmp1) * 0.5;
|
||||
x[params::degree / 2 - (tid + i * threads + 1)].y = (-sub_IM + tmp2) * 0.5;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* correction before inverse fft
|
||||
* does not use extra shared memory for recovering
|
||||
* correction is done using registers.
|
||||
* based on Pascal's paper
|
||||
*/
|
||||
template <class params>
|
||||
__device__ void correction_inverse_fft_inplace(double2 *x) {
|
||||
constexpr int threads = params::degree / params::opt;
|
||||
int tid = threadIdx.x;
|
||||
double2 left[params::opt / 4];
|
||||
double2 right[params::opt / 4];
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt / 4; i++) {
|
||||
left[i] = x[tid + i * threads];
|
||||
}
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt / 4; i++) {
|
||||
right[i] = x[params::degree / 2 - (tid + i * threads + 1)];
|
||||
}
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt / 4; i++) {
|
||||
double2 tw = negacyclic_twiddle<params::degree>(tid + i * threads);
|
||||
double add_RE = left[i].x + right[i].x;
|
||||
double sub_RE = left[i].x - right[i].x;
|
||||
double add_IM = left[i].y + right[i].y;
|
||||
double sub_IM = left[i].y - right[i].y;
|
||||
|
||||
double tmp1 = add_IM * tw.x - sub_RE * tw.y;
|
||||
double tmp2 = sub_RE * tw.x + add_IM * tw.y;
|
||||
x[tid + i * threads].x = (add_RE - tmp1) * 0.5;
|
||||
x[tid + i * threads].y = (sub_IM + tmp2) * 0.5;
|
||||
x[params::degree / 2 - (tid + i * threads + 1)].x = (add_RE + tmp1) * 0.5;
|
||||
x[params::degree / 2 - (tid + i * threads + 1)].y = (-sub_IM + tmp2) * 0.5;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* global batch fft
|
||||
* does fft in half size
|
||||
* unrolling halfsize fft result in half size + 1 eleemnts
|
||||
* this function must be called with actual degree
|
||||
* function takes as input already compressed input
|
||||
*/
|
||||
template <class params>
|
||||
__global__ void batch_NSMFFT(double2 *d_input, double2 *d_output) {
|
||||
extern __shared__ double2 sharedMemoryFFT[];
|
||||
double2 *fft = sharedMemoryFFT;
|
||||
|
||||
int tid = threadIdx.x;
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
fft[tid] = d_input[blockIdx.x * (params::degree / 2) + tid];
|
||||
tid = tid + params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
NSMFFT_direct<HalfDegree<params>>(fft);
|
||||
__syncthreads();
|
||||
correction_direct_fft_inplace<params>(fft);
|
||||
__syncthreads();
|
||||
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
d_output[blockIdx.x * (params::degree / 2) + tid] = fft[tid];
|
||||
tid = tid + params::degree / params::opt;
|
||||
}
|
||||
}
|
||||
|
||||
#endif // GPU_BOOTSTRAP_FFT_1024_CUH
|
||||
402
src/fft/smfft.cuh
Normal file
@@ -0,0 +1,402 @@
/*
|
||||
#ifndef GPU_BOOTSTRAP_SMFFT_CUH
|
||||
#define GPU_BOOTSTRAP_SMFFT_CUH
|
||||
|
||||
#include "../complex/operations.cuh"
|
||||
#include "twiddles.cuh"
|
||||
|
||||
__device__ __inline__ double2 Get_W_value_inverse(int index) {
|
||||
double2 ctemp = _gTwiddles[index];
|
||||
ctemp.y = -ctemp.y;
|
||||
return (ctemp);
|
||||
}
|
||||
template <class params>
|
||||
__device__ double2 Get_after_inverse_fft_twiddle(int index) {
|
||||
double2 ctemp;
|
||||
switch (params::degree) {
|
||||
case 512:
|
||||
ctemp = INVERSE_TWIDDLES_512[index];
|
||||
break;
|
||||
case 1024:
|
||||
ctemp = gTwiddles1024[index];
|
||||
ctemp.x /= params::degree;
|
||||
ctemp.y /= -params::degree;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
return ctemp;
|
||||
}
|
||||
|
||||
__device__ __inline__ double shfl(double *value, int par) {
|
||||
#if (CUDART_VERSION >= 9000)
|
||||
return (__shfl_sync(0xffffffff, (*value), par));
|
||||
#else
|
||||
return (__shfl((*value), par));
|
||||
#endif
|
||||
}
|
||||
|
||||
__device__ __inline__ double shfl_xor(double *value, int par) {
|
||||
#if (CUDART_VERSION >= 9000)
|
||||
return (__shfl_xor_sync(0xffffffff, (*value), par));
|
||||
#else
|
||||
return (__shfl_xor((*value), par));
|
||||
#endif
|
||||
}
|
||||
|
||||
__device__ __inline__ double shfl_down(double *value, int par) {
|
||||
#if (CUDART_VERSION >= 9000)
|
||||
return (__shfl_down_sync(0xffffffff, (*value), par));
|
||||
#else
|
||||
return (__shfl_down((*value), par));
|
||||
#endif
|
||||
}
|
||||
|
||||
__device__ __inline__ void
|
||||
reorder_16_register(double2 *A_DFT_value, double2 *B_DFT_value,
|
||||
double2 *C_DFT_value, double2 *D_DFT_value, int *local_id) {
|
||||
double2 Af2temp, Bf2temp, Cf2temp, Df2temp;
|
||||
unsigned int target = (((unsigned int)__brev(((*local_id) & 15))) >> (28)) +
|
||||
16 * ((*local_id) >> 4);
|
||||
Af2temp.x = shfl(&(A_DFT_value->x), target);
|
||||
Af2temp.y = shfl(&(A_DFT_value->y), target);
|
||||
Bf2temp.x = shfl(&(B_DFT_value->x), target);
|
||||
Bf2temp.y = shfl(&(B_DFT_value->y), target);
|
||||
Cf2temp.x = shfl(&(C_DFT_value->x), target);
|
||||
Cf2temp.y = shfl(&(C_DFT_value->y), target);
|
||||
Df2temp.x = shfl(&(D_DFT_value->x), target);
|
||||
Df2temp.y = shfl(&(D_DFT_value->y), target);
|
||||
__syncwarp();
|
||||
(*A_DFT_value) = Af2temp;
|
||||
(*B_DFT_value) = Bf2temp;
|
||||
(*C_DFT_value) = Cf2temp;
|
||||
(*D_DFT_value) = Df2temp;
|
||||
}
|
||||
|
||||
__device__ __inline__ void reorder_32_register(double2 *A_DFT_value,
|
||||
double2 *B_DFT_value,
|
||||
double2 *C_DFT_value,
|
||||
double2 *D_DFT_value) {
|
||||
double2 Af2temp, Bf2temp, Cf2temp, Df2temp;
|
||||
unsigned int target = ((unsigned int)__brev(threadIdx.x)) >> (27);
|
||||
Af2temp.x = shfl(&(A_DFT_value->x), target);
|
||||
Af2temp.y = shfl(&(A_DFT_value->y), target);
|
||||
Bf2temp.x = shfl(&(B_DFT_value->x), target);
|
||||
Bf2temp.y = shfl(&(B_DFT_value->y), target);
|
||||
Cf2temp.x = shfl(&(C_DFT_value->x), target);
|
||||
Cf2temp.y = shfl(&(C_DFT_value->y), target);
|
||||
Df2temp.x = shfl(&(D_DFT_value->x), target);
|
||||
Df2temp.y = shfl(&(D_DFT_value->y), target);
|
||||
__syncwarp();
|
||||
(*A_DFT_value) = Af2temp;
|
||||
(*B_DFT_value) = Bf2temp;
|
||||
(*C_DFT_value) = Cf2temp;
|
||||
(*D_DFT_value) = Df2temp;
|
||||
}
|
||||
|
||||
template <class params>
|
||||
__device__ __inline__ void
|
||||
reorder_512(double2 *s_input, double2 *A_DFT_value, double2 *B_DFT_value,
|
||||
double2 *C_DFT_value, double2 *D_DFT_value) {
|
||||
int local_id = threadIdx.x & (params::warp - 1);
|
||||
int warp_id = threadIdx.x / params::warp;
|
||||
|
||||
// reorder elements within warp so we can save them in semi-transposed manner
|
||||
// into shared memory
|
||||
reorder_32_register(A_DFT_value, B_DFT_value, C_DFT_value, D_DFT_value);
|
||||
|
||||
// reorder elements within warp so we can save them in semi-transposed manner
|
||||
// into shared memory
|
||||
__syncthreads();
|
||||
unsigned int sm_store_pos =
|
||||
(local_id >> 1) + 16 * (local_id & 1) + warp_id * 132;
|
||||
s_input[sm_store_pos] = *A_DFT_value;
|
||||
s_input[sm_store_pos + 33] = *B_DFT_value;
|
||||
s_input[66 + sm_store_pos] = *C_DFT_value;
|
||||
s_input[66 + sm_store_pos + 33] = *D_DFT_value;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// Read shared memory to get reordered input
|
||||
unsigned int sm_read_pos = (local_id & 15) * 32 + local_id + warp_id * 4;
|
||||
__syncthreads();
|
||||
*A_DFT_value = s_input[sm_read_pos + 0];
|
||||
*B_DFT_value = s_input[sm_read_pos + 1];
|
||||
*C_DFT_value = s_input[sm_read_pos + 2];
|
||||
*D_DFT_value = s_input[sm_read_pos + 3];
|
||||
|
||||
__syncthreads();
|
||||
reorder_16_register(A_DFT_value, B_DFT_value, C_DFT_value, D_DFT_value,
|
||||
&local_id);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
template <class params>
|
||||
__device__ __inline__ void
|
||||
reorder_1024(double2 *s_input, double2 *A_DFT_value, double2 *B_DFT_value,
|
||||
double2 *C_DFT_value, double2 *D_DFT_value) {
|
||||
int local_id = threadIdx.x & (params::warp - 1);
|
||||
int warp_id = threadIdx.x / params::warp;
|
||||
|
||||
// reorder elements within params::warp so we can save them in semi-transposed
|
||||
// manner into shared memory
|
||||
reorder_32_register(A_DFT_value, B_DFT_value, C_DFT_value, D_DFT_value);
|
||||
|
||||
// reorder elements within params::warp so we can save them in semi-transposed
|
||||
// manner into shared memory
|
||||
__syncthreads();
|
||||
unsigned int sm_store_pos =
|
||||
(local_id >> 0) + 32 * (local_id & 0) + warp_id * 132;
|
||||
s_input[sm_store_pos] = *A_DFT_value;
|
||||
s_input[sm_store_pos + 33] = *B_DFT_value;
|
||||
s_input[66 + sm_store_pos] = *C_DFT_value;
|
||||
s_input[66 + sm_store_pos + 33] = *D_DFT_value;
|
||||
|
||||
// Read shared memory to get reordered input
|
||||
unsigned int sm_read_pos = (local_id & 31) * 32 + local_id + warp_id * 4;
|
||||
__syncthreads();
|
||||
*A_DFT_value = s_input[sm_read_pos + 0];
|
||||
*B_DFT_value = s_input[sm_read_pos + 1];
|
||||
*C_DFT_value = s_input[sm_read_pos + 2];
|
||||
*D_DFT_value = s_input[sm_read_pos + 3];
|
||||
|
||||
__syncthreads();
|
||||
reorder_32_register(A_DFT_value, B_DFT_value, C_DFT_value, D_DFT_value);
|
||||
}
|
||||
|
||||
__device__ bool printOnce = true;
|
||||
|
||||
template <class params> __device__ void do_SMFFT_CT_DIT(double2 *s_input) {
|
||||
double2 A_DFT_value, B_DFT_value, C_DFT_value, D_DFT_value;
|
||||
double2 W;
|
||||
double2 Aftemp, Bftemp, Cftemp, Dftemp;
|
||||
|
||||
int j, m_param;
|
||||
int parity, itemp;
|
||||
int A_read_index, B_read_index, C_read_index, D_read_index;
|
||||
int PoT, PoTp1, q;
|
||||
|
||||
int local_id = threadIdx.x & (params::warp - 1);
|
||||
int warp_id = threadIdx.x / params::warp;
|
||||
A_DFT_value = s_input[local_id + (warp_id << 2) * params::warp];
|
||||
B_DFT_value =
|
||||
s_input[local_id + (warp_id << 2) * params::warp + params::warp];
|
||||
C_DFT_value =
|
||||
s_input[local_id + (warp_id << 2) * params::warp + 2 * params::warp];
|
||||
D_DFT_value =
|
||||
s_input[local_id + (warp_id << 2) * params::warp + 3 * params::warp];
|
||||
|
||||
switch (params::log2_degree) {
|
||||
case 9:
|
||||
reorder_512<params>(s_input, &A_DFT_value, &B_DFT_value, &C_DFT_value,
|
||||
&D_DFT_value);
|
||||
break;
|
||||
case 10:
|
||||
reorder_1024<params>(s_input, &A_DFT_value, &B_DFT_value, &C_DFT_value,
|
||||
&D_DFT_value);
|
||||
break;
|
||||
// case 11:
|
||||
// reorder_2048<params, opt>(s_input, &A_DFT_value, &B_DFT_value,
|
||||
//&C_DFT_value, &D_DFT_value); break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
//----> FFT
|
||||
PoT = 1;
|
||||
PoTp1 = 2;
|
||||
|
||||
//--> First iteration
|
||||
itemp = local_id & 1;
|
||||
parity = (1 - itemp * 2);
|
||||
|
||||
A_DFT_value.x = parity * A_DFT_value.x + shfl_xor(&A_DFT_value.x, 1);
|
||||
A_DFT_value.y = parity * A_DFT_value.y + shfl_xor(&A_DFT_value.y, 1);
|
||||
B_DFT_value.x = parity * B_DFT_value.x + shfl_xor(&B_DFT_value.x, 1);
|
||||
B_DFT_value.y = parity * B_DFT_value.y + shfl_xor(&B_DFT_value.y, 1);
|
||||
C_DFT_value.x = parity * C_DFT_value.x + shfl_xor(&C_DFT_value.x, 1);
|
||||
C_DFT_value.y = parity * C_DFT_value.y + shfl_xor(&C_DFT_value.y, 1);
|
||||
D_DFT_value.x = parity * D_DFT_value.x + shfl_xor(&D_DFT_value.x, 1);
|
||||
D_DFT_value.y = parity * D_DFT_value.y + shfl_xor(&D_DFT_value.y, 1);
|
||||
|
||||
//--> Second through Fifth iteration (no synchronization)
|
||||
PoT = 2;
|
||||
PoTp1 = 4;
|
||||
for (q = 1; q < 5; q++) {
|
||||
m_param = (local_id & (PoTp1 - 1));
|
||||
itemp = m_param >> q;
|
||||
parity = ((itemp << 1) - 1);
|
||||
if (params::fft_direction)
|
||||
W = Get_W_value_inverse((q - 1) * 257 + itemp * m_param);
|
||||
else
|
||||
W = _gTwiddles[(q - 1) * 257 + itemp * m_param];
|
||||
Aftemp.x = W.x * A_DFT_value.x - W.y * A_DFT_value.y;
|
||||
Aftemp.y = W.x * A_DFT_value.y + W.y * A_DFT_value.x;
|
||||
Bftemp.x = W.x * B_DFT_value.x - W.y * B_DFT_value.y;
|
||||
Bftemp.y = W.x * B_DFT_value.y + W.y * B_DFT_value.x;
|
||||
Cftemp.x = W.x * C_DFT_value.x - W.y * C_DFT_value.y;
|
||||
Cftemp.y = W.x * C_DFT_value.y + W.y * C_DFT_value.x;
|
||||
Dftemp.x = W.x * D_DFT_value.x - W.y * D_DFT_value.y;
|
||||
Dftemp.y = W.x * D_DFT_value.y + W.y * D_DFT_value.x;
|
||||
|
||||
A_DFT_value.x = Aftemp.x + parity * shfl_xor(&Aftemp.x, PoT);
|
||||
A_DFT_value.y = Aftemp.y + parity * shfl_xor(&Aftemp.y, PoT);
|
||||
B_DFT_value.x = Bftemp.x + parity * shfl_xor(&Bftemp.x, PoT);
|
||||
B_DFT_value.y = Bftemp.y + parity * shfl_xor(&Bftemp.y, PoT);
|
||||
C_DFT_value.x = Cftemp.x + parity * shfl_xor(&Cftemp.x, PoT);
|
||||
C_DFT_value.y = Cftemp.y + parity * shfl_xor(&Cftemp.y, PoT);
|
||||
D_DFT_value.x = Dftemp.x + parity * shfl_xor(&Dftemp.x, PoT);
|
||||
D_DFT_value.y = Dftemp.y + parity * shfl_xor(&Dftemp.y, PoT);
|
||||
|
||||
PoT = PoT << 1;
|
||||
PoTp1 = PoTp1 << 1;
|
||||
}
|
||||
|
||||
itemp = local_id + (warp_id << 2) * params::warp;
|
||||
s_input[itemp] = A_DFT_value;
|
||||
s_input[itemp + params::warp] = B_DFT_value;
|
||||
s_input[itemp + 2 * params::warp] = C_DFT_value;
|
||||
s_input[itemp + 3 * params::warp] = D_DFT_value;
|
||||
|
||||
for (q = 5; q < (params::log2_degree - 1); q++) {
|
||||
__syncthreads();
|
||||
m_param = threadIdx.x & (PoT - 1);
|
||||
j = threadIdx.x >> q;
|
||||
|
||||
if (params::fft_direction)
|
||||
W = Get_W_value_inverse((q - 1) * 257 + m_param);
|
||||
else
|
||||
W = _gTwiddles[(q - 1) * 257 + m_param];
|
||||
|
||||
A_read_index = j * (PoTp1 << 1) + m_param;
|
||||
B_read_index = j * (PoTp1 << 1) + m_param + PoT;
|
||||
C_read_index = j * (PoTp1 << 1) + m_param + PoTp1;
|
||||
D_read_index = j * (PoTp1 << 1) + m_param + 3 * PoT;
|
||||
|
||||
Aftemp = s_input[A_read_index];
|
||||
Bftemp = s_input[B_read_index];
|
||||
A_DFT_value.x = Aftemp.x + W.x * Bftemp.x - W.y * Bftemp.y;
|
||||
A_DFT_value.y = Aftemp.y + W.x * Bftemp.y + W.y * Bftemp.x;
|
||||
B_DFT_value.x = Aftemp.x - W.x * Bftemp.x + W.y * Bftemp.y;
|
||||
B_DFT_value.y = Aftemp.y - W.x * Bftemp.y - W.y * Bftemp.x;
|
||||
|
||||
Cftemp = s_input[C_read_index];
|
||||
Dftemp = s_input[D_read_index];
|
||||
C_DFT_value.x = Cftemp.x + W.x * Dftemp.x - W.y * Dftemp.y;
|
||||
C_DFT_value.y = Cftemp.y + W.x * Dftemp.y + W.y * Dftemp.x;
|
||||
D_DFT_value.x = Cftemp.x - W.x * Dftemp.x + W.y * Dftemp.y;
|
||||
D_DFT_value.y = Cftemp.y - W.x * Dftemp.y - W.y * Dftemp.x;
|
||||
|
||||
s_input[A_read_index] = A_DFT_value;
|
||||
s_input[B_read_index] = B_DFT_value;
|
||||
s_input[C_read_index] = C_DFT_value;
|
||||
s_input[D_read_index] = D_DFT_value;
|
||||
|
||||
PoT = PoT << 1;
|
||||
PoTp1 = PoTp1 << 1;
|
||||
}
|
||||
|
||||
// last iteration
|
||||
if (params::log2_degree > 6) {
|
||||
__syncthreads();
|
||||
m_param = threadIdx.x;
|
||||
|
||||
if (params::fft_direction)
|
||||
W = Get_W_value_inverse((q - 1) * 257 + m_param);
|
||||
else
|
||||
W = _gTwiddles[(q - 1) * 257 + m_param];
|
||||
|
||||
A_read_index = m_param;
|
||||
B_read_index = m_param + PoT;
|
||||
C_read_index = m_param + (PoT >> 1);
|
||||
D_read_index = m_param + 3 * (PoT >> 1);
|
||||
|
||||
Aftemp = s_input[A_read_index];
|
||||
Bftemp = s_input[B_read_index];
|
||||
A_DFT_value.x = Aftemp.x + W.x * Bftemp.x - W.y * Bftemp.y;
|
||||
A_DFT_value.y = Aftemp.y + W.x * Bftemp.y + W.y * Bftemp.x;
|
||||
B_DFT_value.x = Aftemp.x - W.x * Bftemp.x + W.y * Bftemp.y;
|
||||
B_DFT_value.y = Aftemp.y - W.x * Bftemp.y - W.y * Bftemp.x;
|
||||
|
||||
Cftemp = s_input[C_read_index];
|
||||
Dftemp = s_input[D_read_index];
|
||||
C_DFT_value.x = Cftemp.x + W.y * Dftemp.x + W.x * Dftemp.y;
|
||||
C_DFT_value.y = Cftemp.y + W.y * Dftemp.y - W.x * Dftemp.x;
|
||||
D_DFT_value.x = Cftemp.x - W.y * Dftemp.x - W.x * Dftemp.y;
|
||||
D_DFT_value.y = Cftemp.y - W.y * Dftemp.y + W.x * Dftemp.x;
|
||||
|
||||
s_input[A_read_index] = A_DFT_value;
|
||||
s_input[B_read_index] = B_DFT_value;
|
||||
s_input[C_read_index] = C_DFT_value;
|
||||
s_input[D_read_index] = D_DFT_value;
|
||||
}
|
||||
}
|
||||
|
||||
template <class params>
|
||||
__global__ void SMFFT_DIT_external(double2 *d_input, double2 *d_output) {
|
||||
__syncthreads();
|
||||
|
||||
extern __shared__ double2 sharedmemBSK[];
|
||||
|
||||
double2 *s_input = sharedmemBSK;
|
||||
|
||||
int cTid = threadIdx.x * params::opt;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt; i++) {
|
||||
double2 tmp;
|
||||
switch (params::degree) {
|
||||
case 512:
|
||||
tmp = INVERSE_TWIDDLES_512[cTid];
|
||||
tmp.x *= params::degree;
|
||||
tmp.y *= -params::degree;
|
||||
break;
|
||||
case 1024:
|
||||
tmp = gTwiddles1024[cTid];
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
d_input[blockIdx.x * params::degree + cTid] *= tmp;
|
||||
cTid++;
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
s_input[threadIdx.x] = d_input[threadIdx.x + blockIdx.x * params::degree];
|
||||
s_input[threadIdx.x + params::quarter] =
|
||||
d_input[threadIdx.x + blockIdx.x * params::degree + params::quarter];
|
||||
s_input[threadIdx.x + params::half] =
|
||||
d_input[threadIdx.x + blockIdx.x * params::degree + params::half];
|
||||
s_input[threadIdx.x + params::three_quarters] =
|
||||
d_input[threadIdx.x + blockIdx.x * params::degree +
|
||||
params::three_quarters];
|
||||
|
||||
__syncthreads();
|
||||
|
||||
do_SMFFT_CT_DIT<params>(s_input);
|
||||
if (threadIdx.x == 0 && blockIdx.x == 0) {
|
||||
for (int i = 0; i < 1024; i++)
|
||||
printf("smfft[%u] %.10f %.10f\n", i, s_input[i].x, s_input[i].y);
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
|
||||
|
||||
__syncthreads();
|
||||
d_output[threadIdx.x + blockIdx.x * params::degree] = s_input[threadIdx.x];
|
||||
d_output[threadIdx.x + blockIdx.x * params::degree + params::quarter] =
|
||||
s_input[threadIdx.x + params::quarter];
|
||||
d_output[threadIdx.x + blockIdx.x * params::degree + params::half] =
|
||||
s_input[threadIdx.x + params::half];
|
||||
d_output[threadIdx.x + blockIdx.x * params::degree + params::three_quarters] =
|
||||
s_input[threadIdx.x + params::three_quarters];
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
#endif // GPU_BOOTSTRAP_SMFFT_CUH
|
||||
|
||||
*/
|
||||
8204 src/fft/twiddles.cu Normal file (diff suppressed because it is too large)
28 src/fft/twiddles.cuh Normal file
@@ -0,0 +1,28 @@
|
||||
|
||||
#ifndef GPU_BOOTSTRAP_TWIDDLES_CUH
|
||||
#define GPU_BOOTSTRAP_TWIDDLES_CUH
|
||||
|
||||
// TODO (Agnes) depending on the device architecture
|
||||
// can we make more of them __constant__?
|
||||
// Do we have to define them all regardless of the
|
||||
// polynomial degree and q values?
|
||||
|
||||
// TODO (Beka) make these two arrays dynamically sized,
// or find the exact maximum: for a polynomial of length 8192 it should
// be less than 2048
|
||||
extern __constant__ short SW1[2048];
|
||||
extern __constant__ short SW2[2048];
|
||||
|
||||
extern __constant__ double2 negTwids3[4];
|
||||
extern __constant__ double2 negTwids4[8];
|
||||
extern __constant__ double2 negTwids5[16];
|
||||
extern __constant__ double2 negTwids6[32];
|
||||
extern __constant__ double2 negTwids7[64];
|
||||
extern __constant__ double2 negTwids8[128];
|
||||
extern __constant__ double2 negTwids9[256];
|
||||
extern __constant__ double2 negTwids10[512];
|
||||
extern __constant__ double2 negTwids11[1024];
|
||||
extern __device__ double2 negTwids12[2048];
|
||||
extern __device__ double2 negTwids13[4096];
|
||||
|
||||
#endif
|
||||
55 src/keyswitch.cu Normal file
@@ -0,0 +1,55 @@
|
||||
#include "keyswitch.cuh"
|
||||
#include "keyswitch.h"
|
||||
#include "polynomial/parameters.cuh"
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
/* Perform keyswitch on a batch of input LWE ciphertexts for 32 bits
|
||||
*
|
||||
* - lwe_out: output batch of num_samples keyswitched ciphertexts c =
|
||||
* (a0,..an-1,b) where n is the LWE dimension
|
||||
* - lwe_in: input batch of num_samples LWE ciphertexts, containing n
|
||||
* mask values + 1 body value
|
||||
*
|
||||
* This function calls a wrapper to a device kernel that performs the keyswitch
|
||||
* - num_samples blocks of threads are launched
|
||||
*/
|
||||
void cuda_keyswitch_lwe_ciphertext_vector_32(void *v_stream, void *lwe_out, void *lwe_in,
|
||||
void *ksk,
|
||||
uint32_t lwe_dimension_before,
|
||||
uint32_t lwe_dimension_after,
|
||||
uint32_t base_log, uint32_t l_gadget,
|
||||
uint32_t num_samples) {
|
||||
cuda_keyswitch_lwe_ciphertext_vector(
|
||||
v_stream, static_cast<uint32_t *>(lwe_out), static_cast<uint32_t *>(lwe_in),
|
||||
static_cast<uint32_t*>(ksk),
|
||||
lwe_dimension_before, lwe_dimension_after,
|
||||
base_log, l_gadget,
|
||||
num_samples);
|
||||
}
|
||||
|
||||
/* Perform keyswitch on a batch of input LWE ciphertexts for 64 bits
|
||||
*
|
||||
* - lwe_out: output batch of num_samples keyswitched ciphertexts c =
|
||||
* (a0,..an-1,b) where n is the LWE dimension
|
||||
* - lwe_in: input batch of num_samples LWE ciphertexts, containing n
|
||||
* mask values + 1 body value
|
||||
*
|
||||
* This function calls a wrapper to a device kernel that performs the keyswitch
|
||||
* - num_samples blocks of threads are launched
|
||||
*/
|
||||
void cuda_keyswitch_lwe_ciphertext_vector_64(void *v_stream, void *lwe_out, void *lwe_in,
|
||||
void *ksk,
|
||||
uint32_t lwe_dimension_before,
|
||||
uint32_t lwe_dimension_after,
|
||||
uint32_t base_log, uint32_t l_gadget,
|
||||
uint32_t num_samples) {
|
||||
cuda_keyswitch_lwe_ciphertext_vector(
|
||||
v_stream, static_cast<uint64_t *>(lwe_out), static_cast<uint64_t *> (lwe_in),
|
||||
static_cast<uint64_t*>(ksk),
|
||||
lwe_dimension_before, lwe_dimension_after,
|
||||
base_log, l_gadget,
|
||||
num_samples);
|
||||
}
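A minimal host-side sketch of how the 64-bit wrapper above might be called (editor's illustration, not part of the diff; lwe_dimension_before, lwe_dimension_after, base_log, l_gadget and num_samples are placeholders to be provided by the caller). Buffer sizes follow the layout the kernel expects: lwe_in holds num_samples * (lwe_dimension_before + 1) words, lwe_out holds num_samples * (lwe_dimension_after + 1) words, and ksk holds lwe_dimension_before * l_gadget * (lwe_dimension_after + 1) words.

  cudaStream_t stream;
  cudaStreamCreate(&stream);
  uint64_t *d_lwe_in, *d_lwe_out, *d_ksk;
  cudaMalloc(&d_lwe_in, sizeof(uint64_t) * num_samples * (lwe_dimension_before + 1));
  cudaMalloc(&d_lwe_out, sizeof(uint64_t) * num_samples * (lwe_dimension_after + 1));
  cudaMalloc(&d_ksk, sizeof(uint64_t) * lwe_dimension_before * l_gadget * (lwe_dimension_after + 1));
  // ... copy the input ciphertexts and the keyswitch key to the device ...
  cuda_keyswitch_lwe_ciphertext_vector_64(&stream, d_lwe_out, d_lwe_in, d_ksk,
                                          lwe_dimension_before, lwe_dimension_after,
                                          base_log, l_gadget, num_samples);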
|
||||
|
||||
|
||||
146 src/keyswitch.cuh Normal file
@@ -0,0 +1,146 @@
|
||||
#ifndef CNCRT_KS_H
|
||||
#define CNCRT_KS_H
|
||||
|
||||
#include "crypto/gadget.cuh"
|
||||
#include "crypto/torus.cuh"
|
||||
#include "polynomial/polynomial.cuh"
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
|
||||
template <typename Torus>
|
||||
__device__ Torus *get_ith_block(Torus *ksk, int i, int level,
|
||||
uint32_t lwe_dimension_after,
|
||||
uint32_t l_gadget) {
|
||||
int pos = i * l_gadget * (lwe_dimension_after + 1) +
|
||||
level * (lwe_dimension_after + 1);
|
||||
Torus *ptr = &ksk[pos];
|
||||
return ptr;
|
||||
}
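// Editor's illustration (not part of the diff): with lwe_dimension_after = 500 and
// l_gadget = 2, the block for mask index i = 3 at decomposition level 1 starts at
// offset 3 * 2 * 501 + 1 * 501 = 3507 in the flattened ksk array.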
|
||||
|
||||
/*
|
||||
* keyswitch kernel
|
||||
* Each thread handles a piece of the following equation:
|
||||
* $$GLWE_{s2}(\Delta m + e) = (0,0,\dots,0,b) - \sum_{i=0}^{k-1} \langle Dec(a_i),
* (GLWE_{s2}(s1_i \cdot q/\beta), \dots, GLWE_{s2}(s1_i \cdot q/\beta^l)) \rangle$$
* where k is the dimension of the GLWE ciphertext. If the polynomial dimension
* in GLWE is > 1, this equation is solved for each polynomial coefficient. Here
* Dec denotes the decomposition with base beta and l levels, and the inner
* product is taken between the decomposition of a_i and the l GLWE encryptions
* of s1_i * q/beta^j, with j in [1,l]. We obtain a GLWE encryption of Delta.m
* (with Delta the scaling factor) under key s2 instead of s1, with increased
* noise.
*
|
||||
*/
|
||||
template <typename Torus>
|
||||
__global__ void keyswitch(Torus *lwe_out, Torus *lwe_in,
|
||||
Torus *ksk,
|
||||
uint32_t lwe_dimension_before,
|
||||
uint32_t lwe_dimension_after,
|
||||
uint32_t base_log,
|
||||
uint32_t l_gadget,
|
||||
int lwe_lower, int lwe_upper, int cutoff) {
|
||||
int tid = threadIdx.x;
|
||||
|
||||
extern __shared__ char sharedmem[];
|
||||
|
||||
Torus *local_lwe_out = (Torus *)sharedmem;
|
||||
|
||||
auto block_lwe_in =
|
||||
get_chunk(lwe_in, blockIdx.x, lwe_dimension_before + 1);
|
||||
auto block_lwe_out =
|
||||
get_chunk(lwe_out, blockIdx.x, lwe_dimension_after + 1);
|
||||
|
||||
auto gadget = GadgetMatrixSingle<Torus>(base_log, l_gadget);
|
||||
|
||||
int lwe_part_per_thd;
|
||||
if (tid < cutoff) {
|
||||
lwe_part_per_thd = lwe_upper;
|
||||
} else {
|
||||
lwe_part_per_thd = lwe_lower;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
for (int k = 0; k < lwe_part_per_thd; k++) {
|
||||
int idx = tid + k * blockDim.x;
|
||||
local_lwe_out[idx] = 0;
|
||||
}
|
||||
|
||||
if (tid == 0) {
|
||||
local_lwe_out[lwe_dimension_after] =
|
||||
block_lwe_in[lwe_dimension_before];
|
||||
}
|
||||
|
||||
for (int i = 0; i < lwe_dimension_before; i++) {
|
||||
|
||||
__syncthreads();
|
||||
|
||||
Torus a_i = round_to_closest_multiple(block_lwe_in[i], base_log,
|
||||
l_gadget);
|
||||
|
||||
for (int j = 0; j < l_gadget; j++) {
|
||||
auto ksk_block = get_ith_block(ksk, i, j, lwe_dimension_after, l_gadget);
|
||||
Torus decomposed = gadget.decompose_one_level_single(a_i, (uint32_t)j);
|
||||
for (int k = 0; k < lwe_part_per_thd; k++) {
|
||||
int idx = tid + k * blockDim.x;
|
||||
local_lwe_out[idx] -= (Torus)ksk_block[idx] * decomposed;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (int k = 0; k < lwe_part_per_thd; k++) {
|
||||
int idx = tid + k * blockDim.x;
|
||||
block_lwe_out[idx] = local_lwe_out[idx];
|
||||
}
|
||||
}
|
||||
|
||||
/// assumes lwe_in is already in GPU memory
|
||||
template <typename Torus>
|
||||
__host__ void cuda_keyswitch_lwe_ciphertext_vector(void *v_stream, Torus *lwe_out, Torus *lwe_in,
|
||||
Torus *ksk,
|
||||
uint32_t lwe_dimension_before,
|
||||
uint32_t lwe_dimension_after,
|
||||
uint32_t base_log,
|
||||
uint32_t l_gadget,
|
||||
uint32_t num_samples) {
|
||||
|
||||
constexpr int ideal_threads = 128;
|
||||
|
||||
int lwe_dim = lwe_dimension_after + 1;
|
||||
int lwe_lower, lwe_upper, cutoff;
|
||||
if (lwe_dim % ideal_threads == 0) {
|
||||
lwe_lower = lwe_dim / ideal_threads;
|
||||
lwe_upper = lwe_dim / ideal_threads;
|
||||
cutoff = 0;
|
||||
} else {
|
||||
int y =
|
||||
ceil((double)lwe_dim / (double)ideal_threads) * ideal_threads - lwe_dim;
|
||||
cutoff = ideal_threads - y;
|
||||
lwe_lower = lwe_dim / ideal_threads;
|
||||
lwe_upper = (int)ceil((double)lwe_dim / (double)ideal_threads);
|
||||
}
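  // Example with illustrative numbers (not from the diff): for lwe_dim = 513 and
  // ideal_threads = 128, lwe_lower = 4, lwe_upper = 5 and cutoff = 1, so one
  // thread handles 5 coefficients of the output and the other 127 threads handle
  // 4 each (1 * 5 + 127 * 4 = 513).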
|
||||
|
||||
// int lwe_size_before =
|
||||
// (lwe_dimension_before + 1) * num_samples;
|
||||
int lwe_size_after =
|
||||
(lwe_dimension_after + 1) * num_samples;
|
||||
|
||||
int shared_mem =
|
||||
sizeof(Torus) * (lwe_dimension_after + 1);
|
||||
|
||||
cudaMemset(lwe_out, 0, sizeof(Torus) * lwe_size_after);
|
||||
|
||||
dim3 grid(num_samples, 1, 1);
|
||||
dim3 threads(ideal_threads, 1, 1);
|
||||
|
||||
cudaFuncSetAttribute(keyswitch<Torus>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_mem);
|
||||
|
||||
auto stream = static_cast<cudaStream_t *>(v_stream);
|
||||
keyswitch<<<grid, threads, shared_mem, *stream>>>(
|
||||
lwe_out, lwe_in, ksk, lwe_dimension_before, lwe_dimension_after, base_log,
|
||||
l_gadget, lwe_lower, lwe_upper, cutoff);
|
||||
|
||||
cudaStreamSynchronize(*stream);
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
261 src/polynomial/functions.cuh Normal file
@@ -0,0 +1,261 @@
|
||||
#ifndef GPU_POLYNOMIAL_FUNCTIONS
|
||||
#define GPU_POLYNOMIAL_FUNCTIONS
|
||||
#include "utils/memory.cuh"
|
||||
#include "utils/timer.cuh"
|
||||
|
||||
/*
|
||||
* function compresses decomposed buffer into half size complex buffer for fft
|
||||
*/
|
||||
template <typename T, class params>
|
||||
__device__ void real_to_complex_compressed(T *src, double2 *dst) {
|
||||
int tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
dst[tid].x = (double)src[2 * tid];
|
||||
dst[tid].y = (double)src[2 * tid + 1];
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
}
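// Editor's illustration (not part of the diff): assuming the kernel is launched with
// params::degree / params::opt threads, as elsewhere in this crate, the packing above
// yields dst[k] = (src[2k], src[2k+1]) for k in [0, degree/2), i.e. 1024 real
// coefficients become 512 complex values.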
|
||||
|
||||
/*
|
||||
* copy source polynomial to specific slice of batched polynomials
|
||||
* used only in low latency version
|
||||
*/
|
||||
template <typename T, class params>
|
||||
__device__ void copy_into_ith_polynomial_low_lat(T *source, T *dst, int i) {
|
||||
int tid = threadIdx.x;
|
||||
int begin = i * (params::degree / 2 + 1);
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
dst[tid + begin] = source[tid];
|
||||
tid = tid + params::degree / params::opt;
|
||||
}
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
dst[params::degree / 2 + begin] = source[params::degree / 2];
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* accumulates source polynomial into specific slice of batched polynomial
|
||||
* used only in low latency version
|
||||
*/
|
||||
template <typename T, class params>
|
||||
__device__ void add_polynomial_inplace_low_lat(T *source, T *dst, int p_id) {
|
||||
int tid = threadIdx.x;
|
||||
int begin = p_id * (params::degree / 2 + 1);
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
dst[tid] += source[tid + begin];
|
||||
tid = tid + params::degree / params::opt;
|
||||
}
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
dst[params::degree / 2] += source[params::degree / 2 + begin];
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Performs acc = input / X^j (multiplication by X^{-j} in the negacyclic ring) if zeroAcc = false
|
||||
* Performs acc = 0 if zeroAcc
|
||||
* takes single buffer and calculates inplace.
|
||||
*/
|
||||
template <typename T, int elems_per_thread, int block_size>
|
||||
__device__ void divide_by_monomial_negacyclic_inplace(T *accumulator, T *input,
|
||||
uint32_t j,
|
||||
bool zeroAcc) {
|
||||
int tid = threadIdx.x;
|
||||
constexpr int degree = block_size * elems_per_thread;
|
||||
if (zeroAcc) {
|
||||
for (int i = 0; i < elems_per_thread; i++) {
|
||||
accumulator[tid] = 0;
|
||||
tid += block_size;
|
||||
}
|
||||
} else {
|
||||
tid = threadIdx.x;
|
||||
for (int i = 0; i < elems_per_thread; i++) {
|
||||
if (j < degree) {
|
||||
if (tid < degree - j) {
|
||||
accumulator[tid] = input[tid + j];
|
||||
} else {
|
||||
accumulator[tid] = -input[tid - degree + j];
|
||||
}
|
||||
} else {
|
||||
uint32_t jj = j - degree;
|
||||
if (tid < degree - jj) {
|
||||
accumulator[tid] = -input[tid + jj];
|
||||
} else {
|
||||
accumulator[tid] = input[tid - degree + jj];
|
||||
}
|
||||
}
|
||||
tid += block_size;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Performs result_acc = acc * X^j - acc, i.e. result_acc = acc * (X^j - 1)
|
||||
* takes single buffer as input returns single rotated buffer
|
||||
*/
|
||||
template <typename T, int elems_per_thread, int block_size>
|
||||
__device__ void
|
||||
multiply_by_monomial_negacyclic_and_sub_polynomial(T *acc, T *result_acc,
|
||||
uint32_t j) {
|
||||
int tid = threadIdx.x;
|
||||
constexpr int degree = block_size * elems_per_thread;
|
||||
for (int i = 0; i < elems_per_thread; i++) {
|
||||
if (j < degree) {
|
||||
if (tid < j) {
|
||||
result_acc[tid] = -acc[tid - j + degree] - acc[tid];
|
||||
} else {
|
||||
result_acc[tid] = acc[tid - j] - acc[tid];
|
||||
}
|
||||
} else {
|
||||
uint32_t jj = j - degree;
|
||||
if (tid < jj) {
|
||||
result_acc[tid] = acc[tid - jj + degree] - acc[tid];
|
||||
|
||||
} else {
|
||||
result_acc[tid] = -acc[tid - jj] - acc[tid];
|
||||
}
|
||||
}
|
||||
tid += block_size;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* performs a rounding to increase accuracy of the PBS
|
||||
* calculates inplace.
|
||||
*/
|
||||
template <typename T, int elems_per_thread, int block_size>
|
||||
__device__ void round_to_closest_multiple_inplace(T *rotated_acc, int base_log,
|
||||
int l_gadget) {
|
||||
int tid = threadIdx.x;
|
||||
for (int i = 0; i < elems_per_thread; i++) {
|
||||
|
||||
T x_acc = rotated_acc[tid];
|
||||
T shift = sizeof(T) * 8 - l_gadget * base_log;
|
||||
T mask = 1ll << (shift - 1);
|
||||
T b_acc = (x_acc & mask) >> (shift - 1);
|
||||
T res_acc = x_acc >> shift;
|
||||
res_acc += b_acc;
|
||||
res_acc <<= shift;
|
||||
rotated_acc[tid] = res_acc;
|
||||
tid = tid + block_size;
|
||||
}
|
||||
}
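A host-side sketch of the same rounding, for illustration only (not part of the diff; round_to_closest_multiple_host is a hypothetical helper). With T = uint64_t, base_log = 7 and l_gadget = 3, shift = 64 - 21 = 43, so every coefficient is rounded to the nearest multiple of 2^43.

  #include <cstdint>

  uint64_t round_to_closest_multiple_host(uint64_t x, uint32_t base_log, uint32_t l_gadget) {
    uint64_t shift = 64 - (uint64_t)l_gadget * base_log;
    uint64_t b = (x >> (shift - 1)) & 1;  // rounding bit just below the kept part
    uint64_t res = (x >> shift) + b;      // round half up
    return res << shift;                  // nearest multiple of 2^shift
  }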
|
||||
|
||||
template <typename Torus, class params>
|
||||
__device__ void add_to_torus(double2 *m_values, Torus *result) {
|
||||
Torus mx = (sizeof(Torus) == 4) ? UINT32_MAX : UINT64_MAX;
|
||||
int tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
// TODO (Beka) check if better memory access is possible
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
double v1 = m_values[tid].x;
|
||||
double v2 = m_values[tid].y;
|
||||
|
||||
double frac = v1 - floor(v1);
|
||||
frac *= mx;
|
||||
double carry = frac - floor(frac);
|
||||
double zero_point_five = 1. / 2.;
|
||||
if (carry >= zero_point_five)
|
||||
frac++;
|
||||
|
||||
|
||||
Torus V1 = typecast_double_to_torus<Torus>(frac);
|
||||
|
||||
frac = v2 - floor(v2);
|
||||
frac *= mx;
|
||||
carry = frac - floor(frac);
|
||||
if (carry >= zero_point_five)
|
||||
frac++;
|
||||
|
||||
Torus V2 = typecast_double_to_torus<Torus>(frac);
|
||||
|
||||
result[tid * 2] += V1;
|
||||
result[tid * 2 + 1] += V2;
|
||||
tid = tid + params::degree / params::opt;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus, class params>
|
||||
__device__ void sample_extract_body(Torus *lwe_out, Torus *accumulator) {
|
||||
// Set first coefficient of the accumulator as the body of the LWE sample
|
||||
// todo(Joao): not every thread needs to set it
|
||||
// if (threadIdx.x == 0)
|
||||
lwe_out[params::degree] = accumulator[0];
|
||||
}
|
||||
|
||||
template <typename Torus, class params>
|
||||
__device__ void sample_extract_mask(Torus *lwe_out, Torus *accumulator) {
|
||||
// Set ACC = -ACC
|
||||
// accumulator.negate_inplace();
|
||||
|
||||
int tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt; i++) {
|
||||
accumulator[tid] = -accumulator[tid];
|
||||
tid = tid + params::degree / params::opt;
|
||||
}
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// Reverse the accumulator
|
||||
// accumulator.reverse_inplace();
|
||||
|
||||
tid = threadIdx.x;
|
||||
Torus result[params::opt];
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt; i++) {
|
||||
result[i] = accumulator[params::degree - tid - 1];
|
||||
tid = tid + params::degree / params::opt;
|
||||
}
|
||||
synchronize_threads_in_block();
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt; i++) {
|
||||
accumulator[tid] = result[i];
|
||||
tid = tid + params::degree / params::opt;
|
||||
}
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// Perform ACC * X
|
||||
// accumulator.multiply_by_monomial_negacyclic_inplace(1);
|
||||
|
||||
tid = threadIdx.x;
|
||||
// reuse the result[] buffer declared above
|
||||
for (int i = 0; i < params::opt; i++) {
|
||||
if (1 < params::degree) {
|
||||
if (tid < 1)
|
||||
result[i] = -accumulator[tid - 1 + params::degree];
|
||||
else
|
||||
result[i] = accumulator[tid - 1];
|
||||
} else {
|
||||
uint32_t jj = 1 - (uint32_t)params::degree;
|
||||
if (tid < jj)
|
||||
result[i] = accumulator[tid - jj + params::degree];
|
||||
else
|
||||
result[i] = -accumulator[tid - jj];
|
||||
}
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
synchronize_threads_in_block();
|
||||
tid = threadIdx.x;
|
||||
for (int i = 0; i < params::opt; i++) {
|
||||
accumulator[tid] = result[i];
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// Copy to the mask of the LWE sample
|
||||
// accumulator.copy_into(lwe_out);
|
||||
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt; i++) {
|
||||
lwe_out[tid] = accumulator[tid];
|
||||
tid = tid + params::degree / params::opt;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
78 src/polynomial/parameters.cuh Normal file
@@ -0,0 +1,78 @@
|
||||
#ifndef CNCRT_PARAMETERS_H
|
||||
#define CNCRT_PARAMETERS_H
|
||||
|
||||
constexpr int log2(int n) { return (n <= 2) ? 1 : 1 + log2(n / 2); }
|
||||
|
||||
constexpr int choose_opt(int degree) {
|
||||
if (degree <= 1024)
|
||||
return 4;
|
||||
else if (degree == 2048)
|
||||
return 8;
|
||||
else if (degree == 4096)
|
||||
return 16;
|
||||
else
|
||||
return 32;
|
||||
}
|
||||
template <class params> class HalfDegree {
|
||||
public:
|
||||
constexpr static int degree = params::degree / 2;
|
||||
constexpr static int opt = params::opt / 2;
|
||||
constexpr static int log2_degree = params::log2_degree - 1;
|
||||
constexpr static int quarter = params::quarter / 2;
|
||||
constexpr static int half = params::half / 2;
|
||||
constexpr static int three_quarters = quarter + half;
|
||||
constexpr static int warp = 32;
|
||||
constexpr static int fft_sm_required = degree + degree / warp;
|
||||
};
|
||||
|
||||
template <int N> class Degree {
|
||||
public:
|
||||
constexpr static int degree = N;
|
||||
constexpr static int opt = choose_opt(N);
|
||||
constexpr static int log2_degree = log2(N);
|
||||
constexpr static int quarter = N / 4;
|
||||
constexpr static int half = N / 2;
|
||||
constexpr static int three_quarters = half + quarter;
|
||||
constexpr static int warp = 32;
|
||||
constexpr static int fft_sm_required = N + 32;
|
||||
};
|
||||
|
||||
enum sharedMemDegree {
|
||||
NOSM = 0,
|
||||
PARTIALSM = 1,
|
||||
FULLSM = 2
|
||||
|
||||
};
|
||||
|
||||
class ForwardFFT {
|
||||
public:
|
||||
constexpr static int direction = 0;
|
||||
};
|
||||
|
||||
class BackwardFFT {
|
||||
public:
|
||||
constexpr static int direction = 1;
|
||||
};
|
||||
|
||||
class ReorderFFT {
public:
  constexpr static int reorder = 1;
};
class NoReorderFFT {
public:
  constexpr static int reorder = 0;
};
|
||||
|
||||
template <class params, class direction, class reorder = ReorderFFT>
|
||||
class FFTDegree : public params {
|
||||
public:
|
||||
constexpr static int fft_direction = direction::direction;
|
||||
constexpr static int fft_reorder = reorder::reorder;
|
||||
};
|
||||
|
||||
template <int N, class direction, class reorder = ReorderFFT>
|
||||
class FFTParams : public Degree<N> {
|
||||
public:
|
||||
constexpr static int fft_direction = direction::direction;
|
||||
constexpr static int fft_reorder = reorder::reorder;
|
||||
};
|
||||
|
||||
#endif // CNCRT_PARAMETERS_H
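As an editor's illustration (not part of the diff), instantiating the Degree parameter class above for a degree-1024 polynomial gives the following compile-time constants:

  using P1024 = Degree<1024>;
  static_assert(P1024::degree == 1024, "");
  static_assert(P1024::opt == 4, "");            // choose_opt(1024)
  static_assert(P1024::log2_degree == 10, "");
  static_assert(P1024::quarter == 256 && P1024::half == 512, "");
  static_assert(P1024::fft_sm_required == 1024 + 32, "");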
|
||||
668 src/polynomial/polynomial.cuh Normal file
@@ -0,0 +1,668 @@
|
||||
#ifndef CNCRT_POLYNOMIAL_H
|
||||
#define CNCRT_POLYNOMIAL_H
|
||||
|
||||
#include "complex/operations.cuh"
|
||||
#include "crypto/torus.cuh"
|
||||
#include "fft/bnsmfft.cuh"
|
||||
#include "fft/smfft.cuh"
|
||||
#include "parameters.cuh"
|
||||
#include "utils/memory.cuh"
|
||||
#include "utils/timer.cuh"
|
||||
#include <cassert>
|
||||
#include <cstdint>
|
||||
|
||||
#define PI 3.141592653589793238462643383279502884197
|
||||
|
||||
template <typename T>
|
||||
__device__ T *get_chunk(T *data, int chunk_num, int chunk_size) {
|
||||
int pos = chunk_num * chunk_size;
|
||||
T *ptr = &data[pos];
|
||||
return ptr;
|
||||
}
|
||||
|
||||
class ExtraMemory {
|
||||
public:
|
||||
uint32_t m_size;
|
||||
__device__ ExtraMemory(uint32_t size) : m_size(size) {}
|
||||
};
|
||||
template <typename T, class params> class PolynomialFourier;
|
||||
|
||||
template <typename T, class params> class Polynomial;
|
||||
|
||||
template <typename T, class params> class Vector;
|
||||
|
||||
template <typename FT, class params> class Twiddles;
|
||||
|
||||
template <typename T, class params> class VectorPolynomial {
|
||||
public:
|
||||
T *m_data;
|
||||
uint32_t m_num_polynomials;
|
||||
|
||||
__device__ VectorPolynomial(T *data, uint32_t num_polynomials)
|
||||
: m_data(data), m_num_polynomials(num_polynomials) {}
|
||||
|
||||
__device__ VectorPolynomial(SharedMemory &shmem, uint32_t num_polynomials)
|
||||
: m_num_polynomials(num_polynomials) {
|
||||
shmem.get_allocation(&m_data, m_num_polynomials * params::degree);
|
||||
}
|
||||
|
||||
__device__ VectorPolynomial<T, params> get_chunk(int chunk_num,
|
||||
int chunk_size) {
|
||||
int pos = chunk_num * chunk_size;
|
||||
T *ptr = &m_data[pos];
|
||||
// todo(Joao): unsafe, user must pass chunk that has size multiple of
|
||||
// polynomial degree
|
||||
return VectorPolynomial<T, params>(ptr, chunk_size / params::degree);
|
||||
}
|
||||
|
||||
__host__ VectorPolynomial() {}
|
||||
|
||||
__host__ VectorPolynomial(DeviceMemory &dmem, uint32_t num_polynomials,
|
||||
int device)
|
||||
: m_num_polynomials(num_polynomials) {
|
||||
dmem.get_allocation(&m_data, m_num_polynomials * params::degree, device);
|
||||
}
|
||||
|
||||
__host__ VectorPolynomial(DeviceMemory &dmem, T *source,
|
||||
uint32_t num_polynomials, int device)
|
||||
: m_num_polynomials(num_polynomials) {
|
||||
dmem.get_allocation_and_copy_async(
|
||||
&m_data, source, m_num_polynomials * params::degree, device);
|
||||
}
|
||||
|
||||
__host__ void copy_to_host(T *dest) {
|
||||
cudaMemcpyAsync(dest, m_data,
|
||||
sizeof(T) * m_num_polynomials * params::degree,
|
||||
cudaMemcpyDeviceToHost);
|
||||
}
|
||||
|
||||
__device__ void copy_into(Polynomial<T, params> &dest,
|
||||
int polynomial_number = 0) {
|
||||
int tid = threadIdx.x;
|
||||
int begin = polynomial_number * params::degree;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt; i++) {
|
||||
dest.coefficients[tid] = m_data[tid + begin];
|
||||
tid = tid + params::degree / params::opt;
|
||||
}
|
||||
synchronize_threads_in_block();
|
||||
}
|
||||
|
||||
// todo(Joao): we need to make these APIs more clear, as it's confusing what's
|
||||
// being copied where
|
||||
__device__ void copy_into_ith_polynomial(PolynomialFourier<T, params> &source,
|
||||
int i) {
|
||||
int tid = threadIdx.x;
|
||||
int begin = i * (params::degree / 2 + 1);
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
this->m_data[tid + begin] = source.m_values[tid];
|
||||
tid = tid + params::degree / params::opt;
|
||||
}
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
this->m_data[params::degree / 2 + begin] =
|
||||
source.m_values[params::degree / 2];
|
||||
}
|
||||
}
|
||||
|
||||
__device__ void split_into_polynomials(Polynomial<T, params> &first,
|
||||
Polynomial<T, params> &second) {
|
||||
int tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt; i++) {
|
||||
first.coefficients[tid] = m_data[tid];
|
||||
second.coefficients[tid] = m_data[tid + params::degree];
|
||||
tid = tid + params::degree / params::opt;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T, class params> class PolynomialFourier {
|
||||
public:
|
||||
T *m_values;
|
||||
uint32_t degree;
|
||||
|
||||
__device__ __host__ PolynomialFourier(T *m_values) : m_values(m_values) {}
|
||||
|
||||
__device__ PolynomialFourier(SharedMemory &shmem) : degree(degree) {
|
||||
shmem.get_allocation(&this->m_values, params::degree);
|
||||
}
|
||||
|
||||
__device__ PolynomialFourier(SharedMemory &shmem, ExtraMemory extra_memory)
|
||||
: degree(degree) {
|
||||
shmem.get_allocation(&this->m_values, params::degree + extra_memory.m_size);
|
||||
}
|
||||
__device__ PolynomialFourier(SharedMemory &shmem, uint32_t degree)
|
||||
: degree(degree) {
|
||||
shmem.get_allocation(&this->m_values, degree);
|
||||
}
|
||||
|
||||
__host__ PolynomialFourier(DeviceMemory &dmem, int device) : degree(degree) {
|
||||
dmem.get_allocation(&this->m_values, params::degree, device);
|
||||
}
|
||||
|
||||
__device__ char *reuse_memory() { return (char *)m_values; }
|
||||
__device__ void copy_from(PolynomialFourier<T, params> &source, int begin) {
|
||||
int tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt; i++) {
|
||||
this->m_values[tid + begin] = source.m_values[tid];
|
||||
tid = tid + params::degree / params::opt;
|
||||
}
|
||||
}
|
||||
__device__ void fill_with(T value) {
|
||||
int tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt; i++) {
|
||||
m_values[tid] = value;
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
__device__ void add_polynomial_inplace(PolynomialFourier<T, params> &source,
|
||||
int begin) {
|
||||
int tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
this->m_values[tid] += source.m_values[tid + begin];
|
||||
tid = tid + params::degree / params::opt;
|
||||
}
|
||||
if (threadIdx.x == 0) {
|
||||
this->m_values[params::degree / 2] += source.m_values[params::degree / 2 +
|
||||
begin];
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
__device__ void swap_quarters_inplace() {
|
||||
int tid = threadIdx.x;
|
||||
int s1 = params::quarter;
|
||||
int s2 = params::three_quarters;
|
||||
|
||||
T tmp = m_values[s2 + tid];
|
||||
m_values[s2 + tid] = m_values[s1 + tid];
|
||||
m_values[s1 + tid] = tmp;
|
||||
}
|
||||
|
||||
__device__ void add_polynomial_inplace(VectorPolynomial<T, params> &source,
|
||||
int polynomial_number) {
|
||||
int tid = threadIdx.x;
|
||||
int begin = polynomial_number * (params::degree / 2 + 1);
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
this->m_values[tid] += source.m_data[tid + begin];
|
||||
tid = tid + params::degree / params::opt;
|
||||
}
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
this->m_values[params::degree / 2] +=
|
||||
source.m_data[params::degree / 2 + begin];
|
||||
}
|
||||
}
|
||||
|
||||
__device__ void
|
||||
forward_negacyclic_fft_inplace(PolynomialFourier<double2, params> &X) {
|
||||
// TODO function should be removed
|
||||
}
|
||||
|
||||
__device__ void inverse_negacyclic_fft_inplace() {
|
||||
// TODO function should be removed
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__device__ void add_to_torus(Polynomial<Torus, params> &result) {
|
||||
// TODO function should be removed
|
||||
}
|
||||
|
||||
__device__ T &operator[](int i) { return m_values[i]; }
|
||||
};
|
||||
|
||||
template <typename T, class params> class Polynomial {
|
||||
public:
|
||||
T *coefficients;
|
||||
uint32_t degree;
|
||||
|
||||
__device__ Polynomial(T *coefficients, uint32_t degree)
|
||||
: coefficients(coefficients), degree(degree) {}
|
||||
|
||||
__device__ Polynomial(char *memory, uint32_t degree)
|
||||
: coefficients((T *)memory), degree(degree) {}
|
||||
|
||||
__device__ Polynomial(SharedMemory &shmem, uint32_t degree) : degree(degree) {
|
||||
shmem.get_allocation(&this->coefficients, degree);
|
||||
}
|
||||
|
||||
__host__ Polynomial(DeviceMemory &dmem, uint32_t degree, int device)
|
||||
: degree(degree) {
|
||||
dmem.get_allocation(&this->coefficients, params::degree, device);
|
||||
}
|
||||
|
||||
__host__ Polynomial(DeviceMemory &dmem, T *source, uint32_t degree,
|
||||
int device)
|
||||
: degree(degree) {
|
||||
dmem.get_allocation_and_copy_async(&this->coefficients, source,
|
||||
params::degree, device);
|
||||
}
|
||||
|
||||
__host__ void copy_to_host(T *dest) {
|
||||
cudaMemcpyAsync(dest, this->coefficients, sizeof(T) * params::degree,
|
||||
cudaMemcpyDeviceToHost);
|
||||
}
|
||||
|
||||
__device__ T get_coefficient(int i) { return this->coefficients[i]; }
|
||||
|
||||
__device__ char *reuse_memory() { return (char *)coefficients; }
|
||||
|
||||
__device__ void copy_coefficients_from(Polynomial<T, params> &source,
|
||||
int begin_dest = 0,
|
||||
int begin_src = 0) {
|
||||
int tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt; i++) {
|
||||
this->coefficients[tid + begin_dest] = source.coefficients[tid];
|
||||
tid = tid + params::degree / params::opt;
|
||||
}
|
||||
}
|
||||
|
||||
__device__ void fill_with(T value) {
|
||||
int tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt; i++) {
|
||||
coefficients[tid] = value;
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
}
|
||||
|
||||
__device__ void multiply_by_monomial_negacyclic(Polynomial<T, params> &result,
|
||||
uint32_t j) {
|
||||
int tid = threadIdx.x;
|
||||
for (int i = 0; i < params::opt; i++) {
|
||||
if (j < params::degree) {
|
||||
if (tid < j)
|
||||
result.coefficients[tid] =
|
||||
-this->coefficients[tid - j + params::degree];
|
||||
else
|
||||
result.coefficients[tid] = this->coefficients[tid - j];
|
||||
} else {
|
||||
uint32_t jj = j - params::degree;
|
||||
if (tid < jj)
|
||||
result.coefficients[tid] =
|
||||
this->coefficients[tid - jj + params::degree];
|
||||
else
|
||||
result.coefficients[tid] = -this->coefficients[tid - jj];
|
||||
}
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
}
|
||||
|
||||
__device__ void multiply_by_monomial_negacyclic_inplace(uint32_t j) {
|
||||
int tid = threadIdx.x;
|
||||
T result[params::opt];
|
||||
for (int i = 0; i < params::opt; i++) {
|
||||
if (j < params::degree) {
|
||||
if (tid < j)
|
||||
result[i] = -this->coefficients[tid - j + params::degree];
|
||||
else
|
||||
result[i] = this->coefficients[tid - j];
|
||||
} else {
|
||||
uint32_t jj = j - params::degree;
|
||||
if (tid < jj)
|
||||
result[i] = this->coefficients[tid - jj + params::degree];
|
||||
else
|
||||
result[i] = -this->coefficients[tid - jj];
|
||||
}
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
synchronize_threads_in_block();
|
||||
tid = threadIdx.x;
|
||||
for (int i = 0; i < params::opt; i++) {
|
||||
coefficients[tid] = result[i];
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
synchronize_threads_in_block();
|
||||
}
|
||||
|
||||
__device__ void multiply_by_monomial_negacyclic_and_sub_polynomial(
|
||||
Polynomial<T, params> &result, uint32_t j) {
|
||||
int tid = threadIdx.x;
|
||||
for (int i = 0; i < params::opt; i++) {
|
||||
if (j < params::degree) {
|
||||
if (tid < j)
|
||||
result.coefficients[tid] =
|
||||
-this->coefficients[tid - j + params::degree] -
|
||||
this->coefficients[tid];
|
||||
else
|
||||
result.coefficients[tid] =
|
||||
this->coefficients[tid - j] - this->coefficients[tid];
|
||||
} else {
|
||||
uint32_t jj = j - params::degree;
|
||||
if (tid < jj)
|
||||
result.coefficients[tid] =
|
||||
this->coefficients[tid - jj + params::degree] -
|
||||
this->coefficients[tid];
|
||||
else
|
||||
result.coefficients[tid] =
|
||||
-this->coefficients[tid - jj] - this->coefficients[tid];
|
||||
}
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
}
|
||||
|
||||
__device__ void divide_by_monomial_negacyclic(Polynomial<T, params> &result,
|
||||
uint32_t j) {
|
||||
int tid = threadIdx.x;
|
||||
for (int i = 0; i < params::opt; i++) {
|
||||
if (j < params::degree) {
|
||||
if (tid < params::degree - j) {
|
||||
result.coefficients[tid] = this->coefficients[tid + j];
|
||||
} else {
|
||||
result.coefficients[tid] =
|
||||
-this->coefficients[tid - params::degree + j];
|
||||
}
|
||||
} else {
|
||||
uint32_t jj = j - params::degree;
|
||||
if (tid < params::degree - jj) {
|
||||
result.coefficients[tid] = -this->coefficients[tid + jj];
|
||||
} else {
|
||||
result.coefficients[tid] =
|
||||
this->coefficients[tid - params::degree + jj];
|
||||
}
|
||||
}
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
}
|
||||
|
||||
__device__ void divide_by_monomial_negacyclic_inplace(uint32_t j) {
|
||||
int tid = threadIdx.x;
|
||||
T result[params::opt];
|
||||
for (int i = 0; i < params::opt; i++) {
|
||||
if (j < params::degree) {
|
||||
if (tid < params::degree - j) {
|
||||
result[i] = this->coefficients[tid + j];
|
||||
} else {
|
||||
result[i] = -this->coefficients[tid - params::degree + j];
|
||||
}
|
||||
} else {
|
||||
uint32_t jj = j - params::degree;
|
||||
if (tid < params::degree - jj) {
|
||||
result[i] = -this->coefficients[tid + jj];
|
||||
} else {
|
||||
result[i] = this->coefficients[tid - params::degree + jj];
|
||||
}
|
||||
}
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
tid = threadIdx.x;
|
||||
for (int i = 0; i < params::opt; i++) {
|
||||
coefficients[tid] = result[i];
|
||||
tid = tid + params::degree / params::opt;
|
||||
}
|
||||
}
|
||||
|
||||
__device__ void round_to_closest_multiple_inplace(uint32_t base_log,
|
||||
uint32_t l_gadget) {
|
||||
int tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt; i++) {
|
||||
|
||||
T x = coefficients[tid];
|
||||
T shift = sizeof(T) * 8 - l_gadget * base_log;
|
||||
T mask = 1ll << (shift - 1);
|
||||
T b = (x & mask) >> (shift - 1);
|
||||
T res = x >> shift;
|
||||
res += b;
|
||||
res <<= shift;
|
||||
coefficients[tid] = res;
|
||||
tid = tid + params::degree / params::opt;
|
||||
}
|
||||
}
|
||||
|
||||
__device__ void
|
||||
to_complex_compressed(PolynomialFourier<double2, params> &dest) {
|
||||
|
||||
int tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
dest.m_values[tid].x = (double)coefficients[2 * tid];
|
||||
dest.m_values[tid].y = (double)coefficients[2 * tid + 1];
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
}
|
||||
|
||||
__device__ void to_complex(PolynomialFourier<double2, params> &dest) {
|
||||
int tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt; i++) {
|
||||
dest.m_values[tid].x = (double)coefficients[tid];
|
||||
dest.m_values[tid].y = 0.0;
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
}
|
||||
|
||||
__device__ void multiply_by_scalar_inplace(T scalar) {
|
||||
int tid = threadIdx.x;
|
||||
const int grid_dim = blockDim.x;
|
||||
const int slices = params::degree / grid_dim;
|
||||
const int jump = grid_dim;
|
||||
for (int i = 0; i < slices; i++) {
|
||||
this->coefficients[tid] *= scalar;
|
||||
tid += jump;
|
||||
}
|
||||
}
|
||||
|
||||
__device__ void add_scalar_inplace(T scalar) {
|
||||
int tid = threadIdx.x;
|
||||
const int grid_dim = blockDim.x;
|
||||
const int slices = params::degree / grid_dim;
|
||||
const int jump = grid_dim;
|
||||
for (int i = 0; i < slices; i++) {
|
||||
this->coefficients[tid] += scalar;
|
||||
tid += jump;
|
||||
}
|
||||
}
|
||||
|
||||
__device__ void sub_scalar_inplace(T scalar) {
|
||||
int tid = threadIdx.x;
|
||||
const int grid_dim = blockDim.x;
|
||||
const int slices = params::degree / grid_dim;
|
||||
const int jump = grid_dim;
|
||||
for (int i = 0; i < slices; i++) {
|
||||
this->coefficients[tid] -= scalar;
|
||||
tid += jump;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
__device__ void add_polynomial_inplace(Polynomial<T, params> &source,
|
||||
int begin) {
|
||||
int tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt; i++) {
|
||||
this->coefficients[tid] += source.coefficients[tid + begin];
|
||||
tid = tid + params::degree / params::opt;
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
__device__ void sub_polynomial_inplace(Polynomial<T, params> &rhs) {
|
||||
int tid = threadIdx.x;
|
||||
const int grid_dim = blockDim.x;
|
||||
const int slices = params::degree / grid_dim;
|
||||
const int jump = grid_dim;
|
||||
for (int i = 0; i < slices; i++) {
|
||||
this->coefficients[tid] -= rhs.coefficients[tid];
|
||||
tid += jump;
|
||||
}
|
||||
}
|
||||
|
||||
__device__ void negate_inplace() {
|
||||
int tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt; i++) {
|
||||
coefficients[tid] = -coefficients[tid];
|
||||
tid = tid + params::degree / params::opt;
|
||||
}
|
||||
synchronize_threads_in_block();
|
||||
}
|
||||
|
||||
__device__ void copy_into(Vector<T, params> &vec) {
|
||||
int tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt; i++) {
|
||||
vec.m_data[tid] = coefficients[tid];
|
||||
tid = tid + params::degree / params::opt;
|
||||
}
|
||||
}
|
||||
|
||||
__device__ void copy_reversed_into(Vector<T, params> &vec) {
|
||||
int tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt; i++) {
|
||||
vec.m_data[tid] = coefficients[params::degree - tid - 1];
|
||||
tid = tid + params::degree / params::opt;
|
||||
}
|
||||
}
|
||||
|
||||
__device__ void reverse_inplace() {
|
||||
int tid = threadIdx.x;
|
||||
T result[params::opt];
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt; i++) {
|
||||
result[i] = coefficients[params::degree - tid - 1];
|
||||
tid = tid + params::degree / params::opt;
|
||||
}
|
||||
synchronize_threads_in_block();
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt; i++) {
|
||||
coefficients[tid] = result[i];
|
||||
tid = tid + params::degree / params::opt;
|
||||
}
|
||||
synchronize_threads_in_block();
|
||||
}
|
||||
|
||||
template <typename FT>
|
||||
__device__ void
|
||||
forward_negacyclic_fft_half(PolynomialFourier<FT, params> &result) {
|
||||
// TODO function should be removed
|
||||
}
|
||||
};
|
||||
template <typename T, class params> class Vector {
|
||||
public:
|
||||
T *m_data;
|
||||
uint32_t m_size;
|
||||
|
||||
__device__ Vector(T *elements, uint32_t size)
|
||||
: m_data(elements), m_size(size) {}
|
||||
|
||||
template <typename V>
|
||||
__device__ Vector(SharedMemory &shmem, V src, int size) : m_size(size) {
|
||||
shmem.get_allocation(&m_data, m_size);
|
||||
int tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt && tid < m_size; i++) {
|
||||
if (tid > m_size)
|
||||
continue;
|
||||
m_data[tid] = src[tid];
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
}
|
||||
|
||||
__device__ Vector(SharedMemory &shmem, uint32_t size) : m_size(size) {
|
||||
shmem.get_allocation(&m_data, m_size);
|
||||
}
|
||||
|
||||
__host__ Vector() {}
|
||||
|
||||
__host__ Vector(DeviceMemory &dmem, uint32_t size, int device)
|
||||
: m_size(size) {
|
||||
dmem.get_allocation(&m_data, m_size, device);
|
||||
}
|
||||
|
||||
__device__ T &operator[](int i) { return m_data[i]; }
|
||||
|
||||
__device__ Vector<T, params> get_chunk(int chunk_num, int chunk_size) {
|
||||
int pos = chunk_num * chunk_size;
|
||||
T *ptr = &m_data[pos];
|
||||
return Vector<T, params>(ptr, chunk_size);
|
||||
}
|
||||
|
||||
__host__ void copy_to_device(T *source, uint32_t elements) {
|
||||
cudaMemcpyAsync(m_data, source, sizeof(T) * elements,
|
||||
cudaMemcpyHostToDevice);
|
||||
}
|
||||
|
||||
__host__ Vector(DeviceMemory &dmem, T *source, uint32_t size_source,
|
||||
int device)
|
||||
: m_size(size_source) {
|
||||
dmem.get_allocation_and_copy_async(&m_data, source, m_size, device);
|
||||
}
|
||||
|
||||
__host__ Vector(DeviceMemory &dmem, T *source, uint32_t allocation_size,
|
||||
uint32_t copy_size, int device)
|
||||
: m_size(allocation_size) {
|
||||
if (copy_size > allocation_size) {
|
||||
printf("warning: copying more than allocation");
|
||||
}
|
||||
dmem.get_allocation_and_copy_async(&m_data, source, m_size, copy_size,
|
||||
device);
|
||||
}
|
||||
|
||||
__host__ void copy_to_host(T *dest) {
|
||||
cudaMemcpyAsync(dest, m_data, sizeof(T) * m_size, cudaMemcpyDeviceToHost);
|
||||
}
|
||||
|
||||
__host__ void copy_to_host(T *dest, int elements) {
|
||||
cudaMemcpyAsync(dest, m_data, sizeof(T) * elements, cudaMemcpyDeviceToHost);
|
||||
}
|
||||
|
||||
__device__ T get_ith_element(int i) { return m_data[i]; }
|
||||
|
||||
__device__ T get_last_element() { return m_data[m_size - 1]; }
|
||||
|
||||
__device__ void set_last_element(T elem) { m_data[m_size - 1] = elem; }
|
||||
|
||||
// todo(Joao): let's do coalesced access here at some point
|
||||
__device__ void operator-=(const Vector<T, params> &rhs) {
|
||||
assert(m_size == rhs.m_size);
|
||||
int tid = threadIdx.x;
|
||||
int pos = tid;
|
||||
int total = m_size / blockDim.x + 1;
|
||||
for (int i = 0; i < total; i++) {
|
||||
if (pos < m_size)
|
||||
m_data[pos] -= rhs.m_data[pos];
|
||||
pos += blockDim.x;
|
||||
}
|
||||
}
|
||||
|
||||
__device__ void operator*=(const T &rhs) {
|
||||
int tid = threadIdx.x;
|
||||
int pos = tid;
|
||||
int total = m_size / blockDim.x + 1;
|
||||
for (int i = 0; i < total; i++) {
|
||||
if (pos < m_size)
|
||||
m_data[pos] *= rhs;
|
||||
pos += blockDim.x;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <typename FT, class params> class Twiddles {
|
||||
public:
|
||||
Vector<FT, params> twiddles2, twiddles3, twiddles4, twiddles5, twiddles6,
|
||||
twiddles7, twiddles8, twiddles9, twiddles10;
|
||||
|
||||
__device__
|
||||
Twiddles(Vector<FT, params> &twiddles2, Vector<FT, params> &twiddles3,
|
||||
Vector<FT, params> &twiddles4, Vector<FT, params> &twiddles5,
|
||||
Vector<FT, params> &twiddles6, Vector<FT, params> &twiddles7,
|
||||
Vector<FT, params> &twiddles8, Vector<FT, params> &twiddles9,
|
||||
Vector<FT, params> &twiddles10)
|
||||
: twiddles2(twiddles2), twiddles3(twiddles3), twiddles4(twiddles4),
|
||||
twiddles5(twiddles5), twiddles6(twiddles6), twiddles7(twiddles7),
|
||||
twiddles8(twiddles8), twiddles9(twiddles9), twiddles10(twiddles10) {}
|
||||
};
|
||||
|
||||
#endif // CNCRT_POLYNOMIAL_H
|
||||
65 src/polynomial/polynomial_math.cuh Normal file
@@ -0,0 +1,65 @@
|
||||
#ifndef CNCRT_POLYNOMIAL_MATH_H
|
||||
#define CNCRT_POLYNOMIAL_MATH_H
|
||||
|
||||
#include "crypto/torus.cuh"
|
||||
#include "parameters.cuh"
|
||||
#include "polynomial.cuh"
|
||||
|
||||
template <class params, typename FT>
|
||||
__device__ void polynomial_product_in_fourier_domain(FT *result, FT *first,
|
||||
FT *second) {
|
||||
int tid = threadIdx.x;
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
result[tid] = first[tid] * second[tid];
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
result[params::degree / 2] =
|
||||
first[params::degree / 2] * second[params::degree / 2];
|
||||
}
|
||||
}
|
||||
|
||||
template <class params, typename FT>
|
||||
__device__ void polynomial_product_in_fourier_domain(
|
||||
PolynomialFourier<FT, params> &result, PolynomialFourier<FT, params> &first,
|
||||
PolynomialFourier<FT, params> &second) {
|
||||
int tid = threadIdx.x;
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
result[tid] = first[tid] * second[tid];
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
result[params::degree / 2] =
|
||||
first[params::degree / 2] * second[params::degree / 2];
|
||||
}
|
||||
}
|
||||
|
||||
template <class params, typename FT>
|
||||
__device__ void polynomial_product_accumulate_in_fourier_domain(
|
||||
PolynomialFourier<FT, params> &result, PolynomialFourier<FT, params> &first,
|
||||
PolynomialFourier<FT, params> &second) {
|
||||
int tid = threadIdx.x;
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
result[tid] += first[tid] * second[tid];
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
result[params::degree / 2] +=
|
||||
first[params::degree / 2] * second[params::degree / 2];
|
||||
}
|
||||
}
|
||||
|
||||
template <class params, typename FT>
|
||||
__device__ void polynomial_product_accumulate_in_fourier_domain(
|
||||
FT *result, FT *first, PolynomialFourier<FT, params> &second) {
|
||||
int tid = threadIdx.x;
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
result[tid] += first[tid] * second.m_values[tid];
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
}
|
||||
|
||||
#endif // CNCRT_POLYNOMIAL_MATH_H
|
||||
1 src/polynomial/polynomial_twiddles.cuh Normal file
@@ -0,0 +1 @@
|
||||
#include "polynomial.cuh"
|
||||
76 src/types/int128.cuh Normal file
@@ -0,0 +1,76 @@
|
||||
#ifndef CNCRT_INT128_H
|
||||
#define CNCRT_INT128_H
|
||||
|
||||
// abseil's int128 type
|
||||
// licensed under Apache license
|
||||
|
||||
class uint128 {
|
||||
public:
|
||||
__device__ uint128(uint64_t high, uint64_t low) : hi_(high), lo_(low) {}
|
||||
|
||||
uint64_t lo_;
|
||||
uint64_t hi_;
|
||||
};
|
||||
|
||||
class int128 {
|
||||
public:
|
||||
int128() = default;
|
||||
|
||||
__device__ operator unsigned long long() const {
|
||||
return static_cast<unsigned long long>(lo_);
|
||||
}
|
||||
|
||||
__device__ int128(int64_t high, uint64_t low) : hi_(high), lo_(low) {}
|
||||
|
||||
uint64_t lo_;
|
||||
int64_t hi_;
|
||||
};
|
||||
|
||||
__device__ inline uint128 make_uint128(uint64_t high, uint64_t low) {
|
||||
return uint128(high, low);
|
||||
}
|
||||
|
||||
template <typename T> __device__ uint128 make_uint128_from_float(T v) {
|
||||
if (v >= ldexp(static_cast<T>(1), 64)) {
|
||||
uint64_t hi = static_cast<uint64_t>(ldexp(v, -64));
|
||||
uint64_t lo = static_cast<uint64_t>(v - ldexp(static_cast<T>(hi), 64));
|
||||
return make_uint128(hi, lo);
|
||||
}
|
||||
|
||||
return make_uint128(0, static_cast<uint64_t>(v));
|
||||
}
|
||||
|
||||
__device__ inline int128 make_int128(int64_t high, uint64_t low) {
|
||||
return int128(high, low);
|
||||
}
|
||||
|
||||
__device__ inline int64_t bitcast_to_signed(uint64_t v) {
|
||||
return v & (uint64_t{1} << 63) ? ~static_cast<int64_t>(~v)
|
||||
: static_cast<int64_t>(v);
|
||||
}
|
||||
|
||||
__device__ inline uint64_t uint128_high64(uint128 v) { return v.hi_; }
|
||||
__device__ inline uint64_t uint128_low64(uint128 v) { return v.lo_; }
|
||||
|
||||
__device__ __forceinline__ uint128 operator-(uint128 val) {
|
||||
uint64_t hi = ~uint128_high64(val);
|
||||
uint64_t lo = ~uint128_low64(val) + 1;
|
||||
if (lo == 0)
|
||||
++hi; // carry
|
||||
return make_uint128(hi, lo);
|
||||
}
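// Editor's illustration (not part of the diff): -uint128(0, 1) yields hi = ~0 and
// lo = ~1 + 1, i.e. both words all ones, which is 2^128 - 1: the two's-complement
// negation of 1 over 128 bits.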
|
||||
|
||||
template <typename T> __device__ int128 make_int128_from_float(T v) {
|
||||
|
||||
// We must convert the absolute value and then negate as needed, because
|
||||
// floating point types are typically sign-magnitude. Otherwise, the
|
||||
// difference between the high and low 64 bits when interpreted as two's
|
||||
// complement overwhelms the precision of the mantissa.
|
||||
uint128 result =
|
||||
v < 0 ? -make_uint128_from_float(-v) : make_uint128_from_float(v);
|
||||
|
||||
return make_int128(bitcast_to_signed(uint128_high64(result)),
|
||||
uint128_low64(result));
|
||||
}
|
||||
|
||||
#endif
|
||||
15 src/utils/kernel_dimensions.cuh Normal file
@@ -0,0 +1,15 @@
|
||||
int nextPow2(int x) {
|
||||
--x;
|
||||
x |= x >> 1;
|
||||
x |= x >> 2;
|
||||
x |= x >> 4;
|
||||
x |= x >> 8;
|
||||
x |= x >> 16;
|
||||
return ++x;
|
||||
}
|
||||
|
||||
void getNumBlocksAndThreads(const int n, const int maxBlockSize, int &blocks,
|
||||
int &threads) {
|
||||
threads = (n < maxBlockSize * 2) ? nextPow2((n + 1) / 2) : maxBlockSize;
|
||||
blocks = (n + threads - 1) / threads;
|
||||
}
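Usage sketch (editor's illustration, numbers not from the diff):

  int blocks, threads;
  getNumBlocksAndThreads(/*n=*/1500, /*maxBlockSize=*/512, blocks, threads);
  // 1500 >= 2 * 512, so threads = 512 and blocks = (1500 + 511) / 512 = 3
  getNumBlocksAndThreads(/*n=*/600, /*maxBlockSize=*/512, blocks, threads);
  // 600 < 2 * 512, so threads = nextPow2((600 + 1) / 2) = nextPow2(300) = 512
  // and blocks = (600 + 511) / 512 = 2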
|
||||
90 src/utils/memory.cuh Normal file
@@ -0,0 +1,90 @@
|
||||
#ifndef CNCRT_SHMEM_H
|
||||
#define CNCRT_SHMEM_H
|
||||
|
||||
#include "helper_cuda.h"
|
||||
#include <atomic>
|
||||
#include <iostream>
|
||||
#include <mutex>
|
||||
#include <thread>
|
||||
#include <tuple>
|
||||
#include <vector>
|
||||
|
||||
class SharedMemory {
|
||||
public:
|
||||
char *m_memory_block;
|
||||
int m_last_byte;
|
||||
|
||||
__device__ SharedMemory(char *ptr) : m_memory_block(ptr), m_last_byte(0) {}
|
||||
|
||||
template <typename T> __device__ void get_allocation(T **ptr, int elements) {
|
||||
*ptr = (T *)(&this->m_memory_block[m_last_byte]);
|
||||
this->m_last_byte += elements * sizeof(T);
|
||||
}
|
||||
};
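A usage sketch of the shared-memory allocator above (editor's illustration, not part of the diff): a kernel carves one dynamic shared-memory block into typed slices.

  __global__ void example_kernel(int degree) {
    extern __shared__ char sharedmem[];
    SharedMemory shmem(sharedmem);
    double2 *fft_buf;  // degree / 2 complex values
    uint64_t *acc;     // degree torus coefficients
    shmem.get_allocation(&fft_buf, degree / 2);
    shmem.get_allocation(&acc, degree);
    // ... the launch must request (degree / 2) * sizeof(double2) +
    // degree * sizeof(uint64_t) bytes of dynamic shared memory ...
  }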
|
||||
|
||||
class DeviceMemory {
|
||||
public:
|
||||
std::vector<std::tuple<void *, int>> m_allocated;
|
||||
std::mutex m_allocation_mtx;
|
||||
std::atomic<uint32_t> m_total_devices;
|
||||
|
||||
DeviceMemory() : m_total_devices(1) {}
|
||||
|
||||
__host__ void set_device(int device) {
|
||||
if (device > m_total_devices)
|
||||
m_total_devices = device + 1;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__host__ void get_allocation(T **ptr, int elements, int device) {
|
||||
T *res;
|
||||
cudaMalloc((void **)&res, sizeof(T) * elements);
|
||||
*ptr = res;
|
||||
std::lock_guard<std::mutex> lock(m_allocation_mtx);
|
||||
m_allocated.push_back(std::make_tuple(res, device));
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__host__ void get_allocation_and_copy_async(T **ptr, T *src, int elements,
|
||||
int device) {
|
||||
T *res;
|
||||
cudaMalloc((void **)&res, sizeof(T) * elements);
|
||||
cudaMemcpyAsync(res, src, sizeof(T) * elements, cudaMemcpyHostToDevice);
|
||||
*ptr = res;
|
||||
std::lock_guard<std::mutex> lock(m_allocation_mtx);
|
||||
m_allocated.push_back(std::make_tuple(res, device));
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__host__ void get_allocation_and_copy_async(T **ptr, T *src, int allocation,
|
||||
int elements, int device) {
|
||||
T *res;
|
||||
cudaMalloc((void **)&res, sizeof(T) * allocation);
|
||||
cudaMemcpyAsync(res, src, sizeof(T) * elements, cudaMemcpyHostToDevice);
|
||||
*ptr = res;
|
||||
std::lock_guard<std::mutex> lock(m_allocation_mtx);
|
||||
m_allocated.push_back(std::make_tuple(res, device));
|
||||
}
|
||||
|
||||
void free_all_from_device(int device) {
|
||||
cudaSetDevice(device);
|
||||
for (auto elem : m_allocated) {
|
||||
auto dev = std::get<1>(elem);
|
||||
if (dev == device) {
|
||||
auto mem = std::get<0>(elem);
|
||||
checkCudaErrors(cudaFree(mem));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__host__ ~DeviceMemory() {
|
||||
for (auto elem : m_allocated) {
|
||||
auto dev = std::get<1>(elem);
|
||||
auto mem = std::get<0>(elem);
|
||||
cudaSetDevice(dev);
|
||||
checkCudaErrors(cudaFree(mem));
|
||||
}
|
||||
}
|
||||
};
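A usage sketch of the tracking allocator above (editor's illustration, not part of the diff):

  DeviceMemory dmem;
  uint64_t *d_buf;
  dmem.get_allocation(&d_buf, 1 << 20, /*device=*/0);  // 2^20 elements on device 0
  // ... launch kernels that use d_buf ...
  dmem.free_all_from_device(0);  // or rely on ~DeviceMemory() to release everything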
|
||||
|
||||
#endif // CNCRT_SHMEM_H
|
||||
29 src/utils/timer.cuh Normal file
@@ -0,0 +1,29 @@
|
||||
#ifndef CNCRT_TIMER_H
|
||||
#define CNCRT_TIMER_H
|
||||
|
||||
#define synchronize_threads_in_block() __syncthreads()
|
||||
|
||||
template <bool active> class CudaMeasureExecution {
|
||||
public:
|
||||
cudaEvent_t m_start, m_stop;
|
||||
|
||||
__host__ CudaMeasureExecution() {
|
||||
if constexpr (active) {
|
||||
cudaEventCreate(&m_start);
|
||||
cudaEventCreate(&m_stop);
|
||||
cudaEventRecord(m_start);
|
||||
}
|
||||
}
|
||||
|
||||
__host__ ~CudaMeasureExecution() {
|
||||
if constexpr (active) {
|
||||
float ms;
|
||||
cudaEventRecord(m_stop);
|
||||
cudaEventSynchronize(m_stop);
|
||||
cudaEventElapsedTime(&ms, m_start, m_stop);
|
||||
std::cout << "Execution took " << ms << "ms" << std::endl;
|
||||
}
|
||||
}
|
||||
};
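Usage sketch (editor's illustration, not part of the diff; some_kernel is a hypothetical launch): the guard times whatever runs during its lifetime when the template flag is true.

  {
    CudaMeasureExecution<true> timer;      // records the start event
    some_kernel<<<grid, block>>>(d_data);  // hypothetical kernel launch
  }  // destructor records the stop event, synchronizes and prints the elapsed time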
|
||||
|
||||
#endif // CNCRT_TIMER_H
|
||||