feat(cuda): introduce CUDA acceleration for the PBS and keyswitch

- a new crate, concrete-cuda, is added to the repository; it contains
CUDA implementations of the bootstrap and keyswitch, together with a
Rust wrapper to call them
- a new backend_cuda is added to concrete-core, with dedicated entities
whose memory lives on the GPU and engines that call the
CUDA-accelerated functions
Agnes Leroy, 2021-10-20 09:34:57 +02:00 (committed by Agnès Leroy)
commit 64521f6747
37 changed files with 15143 additions and 0 deletions

CMakeLists.txt (new file, 88 lines)

@@ -0,0 +1,88 @@
cmake_minimum_required(VERSION 3.8 FATAL_ERROR)
project(concrete_cuda LANGUAGES CXX CUDA)
include(CTest)
# Check that the minimum supported CUDA version is available; if not, fail the configuration.
set(MINIMUM_SUPPORTED_CUDA_VERSION 10.0)
include(CheckLanguage)
# See if CUDA is available
check_language(CUDA)
# If so, enable CUDA to check the version.
if (CMAKE_CUDA_COMPILER)
enable_language(CUDA)
endif ()
# If CUDA is not available, or the minimum version is too low, do not build
if (NOT CMAKE_CUDA_COMPILER)
message(FATAL_ERROR "CUDA compiler not found.")
endif()
if (CMAKE_CUDA_COMPILER_VERSION VERSION_LESS ${MINIMUM_SUPPORTED_CUDA_VERSION})
message(FATAL_ERROR "CUDA ${MINIMUM_SUPPORTED_CUDA_VERSION} or greater is required for compilation.")
endif()
# Get the CUDA compute capability of the installed device
set(OUTPUTFILE ${CMAKE_CURRENT_SOURCE_DIR}/cuda_script) # No suffix required
set(CUDAFILE ${CMAKE_CURRENT_SOURCE_DIR}/check_cuda.cu)
execute_process(COMMAND nvcc -lcuda ${CUDAFILE} -o ${OUTPUTFILE})
execute_process(COMMAND ${OUTPUTFILE}
RESULT_VARIABLE CUDA_RETURN_CODE
OUTPUT_VARIABLE ARCH)
if (${CUDA_RETURN_CODE} EQUAL 0)
set(CUDA_SUCCESS "TRUE")
else ()
set(CUDA_SUCCESS "FALSE")
endif ()
if (${CUDA_SUCCESS})
message(STATUS "CUDA Architecture: ${ARCH}")
message(STATUS "CUDA Version: ${CUDA_VERSION_STRING}")
message(STATUS "CUDA Path: ${CUDA_TOOLKIT_ROOT_DIR}")
message(STATUS "CUDA Libraries: ${CUDA_LIBRARIES}")
message(STATUS "CUDA Performance Primitives: ${CUDA_npp_LIBRARY}")
set(CUDA_NVCC_FLAGS "${ARCH}")
#add_definitions(-DGPU) #You may not require this
else ()
message(WARNING ${ARCH})
endif ()
if (NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release)
endif ()
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g")
if (NOT CUDA_NVCC_FLAGS)
set(CUDA_NVCC_FLAGS -arch=sm_70)
endif ()
# in production, should use -arch=sm_70
# --ptxas-options=-v to see register spills
# -lineinfo for better debugging
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -ccbin ${CMAKE_CXX_COMPILER} -O3 ${CUDA_NVCC_FLAGS} \
-std=c++17 --no-exceptions --expt-relaxed-constexpr -rdc=true --use_fast_math -Xcompiler -fPIC")
set(INCLUDE_DIR include)
add_subdirectory(src)
add_subdirectory(parameters)
target_include_directories(concrete_cuda PRIVATE ${INCLUDE_DIR})
# This is required for rust cargo build
install(TARGETS concrete_cuda DESTINATION .)
install(TARGETS concrete_cuda DESTINATION lib)
install(TARGETS cuda_parameters DESTINATION .)
install(TARGETS cuda_parameters DESTINATION lib)
# Add a lint target when cpplint is available.
find_file(CPPLINT NAMES cpplint cpplint.exe)
if (CPPLINT)
# Add a custom target to lint all child projects. Dependencies are specified in child projects.
add_custom_target(all_lint)
# Don't trigger this target on ALL_BUILD or Visual Studio 'Rebuild Solution'
set_target_properties(all_lint PROPERTIES EXCLUDE_FROM_ALL TRUE)
# set_target_properties(all_lint PROPERTIES EXCLUDE_FROM_DEFAULT_BUILD TRUE)
endif ()

CPPLINT.cfg (new file, 3 lines)

@@ -0,0 +1,3 @@
set noparent
linelength=240
filter=-legal/copyright,-readability/todo,-runtime/references,-build/c++17

check_cuda.cu (new file, 22 lines)

@@ -0,0 +1,22 @@
#include <stdio.h>
int main(int argc, char **argv) {
cudaDeviceProp dP;
float min_cc = 3.0;
int rc = cudaGetDeviceProperties(&dP, 0);
if (rc != cudaSuccess) {
cudaError_t error = cudaGetLastError();
printf("CUDA error: %s", cudaGetErrorString(error));
return rc; /* Failure */
}
// divide minor as floating point so that e.g. capability 3.5 is not truncated to 3.0
if ((dP.major + (dP.minor / 10.0)) < min_cc) {
printf("Min Compute Capability of %2.1f required: %d.%d found\n Not "
"Building CUDA Code",
min_cc, dP.major, dP.minor);
return 1; /* Failure */
} else {
printf("-arch=sm_%d%d", dP.major, dP.minor);
return 0; /* Success */
}
}

include/bootstrap.h (new file, 94 lines)

@@ -0,0 +1,94 @@
#ifndef CUDA_BOOTSTRAP_H
#define CUDA_BOOTSTRAP_H
#include <cstdint>
extern "C" {
void cuda_initialize_twiddles(uint32_t polynomial_size, uint32_t gpu_index);
void cuda_convert_lwe_bootstrap_key_32(void *dest, void *src, void *v_stream,
uint32_t gpu_index, uint32_t input_lwe_dim, uint32_t glwe_dim,
uint32_t l_gadget, uint32_t polynomial_size);
void cuda_convert_lwe_bootstrap_key_64(void *dest, void *src, void *v_stream,
uint32_t gpu_index, uint32_t input_lwe_dim, uint32_t glwe_dim,
uint32_t l_gadget, uint32_t polynomial_size);
void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
void *v_stream,
void *lwe_out,
void *test_vector,
void *test_vector_indexes,
void *lwe_in,
void *bootstrapping_key,
uint32_t input_lwe_dimension,
uint32_t polynomial_size,
uint32_t base_log,
uint32_t l_gadget,
uint32_t num_samples,
uint32_t num_test_vectors,
uint32_t lwe_idx,
uint32_t max_shared_memory);
void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
void *v_stream,
void *lwe_out,
void *test_vector,
void *test_vector_indexes,
void *lwe_in,
void *bootstrapping_key,
uint32_t input_lwe_dimension,
uint32_t polynomial_size,
uint32_t base_log,
uint32_t l_gadget,
uint32_t num_samples,
uint32_t num_test_vectors,
uint32_t lwe_idx,
uint32_t max_shared_memory);
void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
void *v_stream,
void *lwe_out,
void *test_vector,
void *test_vector_indexes,
void *lwe_in,
void *bootstrapping_key,
uint32_t input_lwe_dimension,
uint32_t polynomial_size,
uint32_t base_log,
uint32_t l_gadget,
uint32_t num_samples,
uint32_t num_test_vectors,
uint32_t lwe_idx,
uint32_t max_shared_memory);
void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
void *v_stream,
void *lwe_out,
void *test_vector,
void *test_vector_indexes,
void *lwe_in,
void *bootstrapping_key,
uint32_t input_lwe_dimension,
uint32_t polynomial_size,
uint32_t base_log,
uint32_t l_gadget,
uint32_t num_samples,
uint32_t num_test_vectors,
uint32_t lwe_idx,
uint32_t max_shared_memory);
};
__device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
int glwe_dimension,
uint32_t l_gadget);
__device__ double2*
get_ith_mask_kth_block(double2* ptr, int i, int k, int level, uint32_t polynomial_size,
int glwe_dimension, uint32_t l_gadget);
__device__ double2*
get_ith_body_kth_block(double2 *ptr, int i, int k, int level, uint32_t polynomial_size,
int glwe_dimension, uint32_t l_gadget);
#endif // CUDA_BOOTSTRAP_H

include/device.h (new file, 30 lines)

@@ -0,0 +1,30 @@
#include <cstdint>
extern "C" {
void *cuda_create_stream(uint32_t gpu_index);
int cuda_destroy_stream(void *v_stream, uint32_t gpu_index);
void *cuda_malloc(uint64_t size, uint32_t gpu_index);
int cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index);
int cuda_memcpy_to_cpu(void *dest, const void *src, uint64_t size,
uint32_t gpu_index);
int cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
void *v_stream, uint32_t gpu_index);
int cuda_memcpy_to_gpu(void *dest, void *src, uint64_t size,
uint32_t gpu_index);
int cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
void *v_stream, uint32_t gpu_index);
int cuda_get_number_of_gpus();
int cuda_synchronize_device(uint32_t gpu_index);
int cuda_drop(void *ptr, uint32_t gpu_index);
int cuda_get_max_shared_memory(uint32_t gpu_index);
}
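Taken together with include/bootstrap.h above, these entry points give the whole host-side call sequence. Below is a minimal sketch, not part of this commit: the buffer sizes follow the layout described in the bootstrap_amortized.cu comment further down, the dimensions are illustrative assumptions, error codes are ignored, and the file must be compiled with nvcc since bootstrap.h declares __device__ helpers.

#include <cstdint>
#include "bootstrap.h"
#include "device.h"

// Bootstrap a batch of 64-bit LWE ciphertexts on GPU 0. Host buffers are
// assumed already laid out as described in bootstrap_amortized.cu: each
// input ciphertext holds n mask values + 1 body, and the bootstrapping
// key has already been converted to the Fourier domain with
// cuda_convert_lwe_bootstrap_key_64.
void bootstrap_on_gpu(uint64_t *lwe_in_h, uint64_t *lut_h,
                      uint32_t *lut_idx_h, double *bsk_fourier_h,
                      uint64_t *lwe_out_h, uint32_t n, uint32_t k,
                      uint32_t N, uint32_t base_log, uint32_t l_gadget,
                      uint32_t num_samples, uint32_t num_lut_vectors) {
  const uint32_t gpu = 0;
  void *stream = cuda_create_stream(gpu);
  uint64_t in_size = (uint64_t)num_samples * (n + 1) * sizeof(uint64_t);
  // The output LWE dimension after the PBS is assumed to be k * N.
  uint64_t out_size = (uint64_t)num_samples * (k * N + 1) * sizeof(uint64_t);
  uint64_t lut_size = (uint64_t)num_lut_vectors * N * sizeof(uint64_t);
  uint64_t idx_size = (uint64_t)num_samples * sizeof(uint32_t);
  // bsk is a tensor of (k+1)^2 * l_gadget * N * n double coefficients.
  uint64_t bsk_size =
      (uint64_t)(k + 1) * (k + 1) * l_gadget * N * n * sizeof(double);
  void *lwe_in_d = cuda_malloc(in_size, gpu);
  void *lwe_out_d = cuda_malloc(out_size, gpu);
  void *lut_d = cuda_malloc(lut_size, gpu);
  void *lut_idx_d = cuda_malloc(idx_size, gpu);
  void *bsk_d = cuda_malloc(bsk_size, gpu);
  cuda_memcpy_to_gpu(lwe_in_d, lwe_in_h, in_size, gpu);
  cuda_memcpy_to_gpu(lut_d, lut_h, lut_size, gpu);
  cuda_memcpy_to_gpu(lut_idx_d, lut_idx_h, idx_size, gpu);
  cuda_memcpy_to_gpu(bsk_d, bsk_fourier_h, bsk_size, gpu);
  cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
      stream, lwe_out_d, lut_d, lut_idx_d, lwe_in_d, bsk_d, n, N, base_log,
      l_gadget, num_samples, num_lut_vectors, /*lwe_idx=*/0,
      (uint32_t)cuda_get_max_shared_memory(gpu));
  cuda_synchronize_device(gpu);
  cuda_memcpy_to_cpu(lwe_out_h, lwe_out_d, out_size, gpu);
  cuda_drop(lwe_in_d, gpu);
  cuda_drop(lwe_out_d, gpu);
  cuda_drop(lut_d, gpu);
  cuda_drop(lut_idx_d, gpu);
  cuda_drop(bsk_d, gpu);
  cuda_destroy_stream(stream, gpu);
}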

include/helper_cuda.h (new file, 1154 lines)

File diff suppressed because it is too large

include/helper_debug.cuh (new file, 100 lines)

@@ -0,0 +1,100 @@
#include "cuComplex.h"
#include "thrust/complex.h"
#include <iostream>
#include <string>
#include <type_traits>
#define PRINT_VARS
#ifdef PRINT_VARS
#define PRINT_DEBUG_5(var, begin, end, step, cond) \
_print_debug(var, #var, begin, end, step, cond, "", false)
#define PRINT_DEBUG_6(var, begin, end, step, cond, text) \
_print_debug(var, #var, begin, end, step, cond, text, true)
#define CAT(A, B) A##B
#define PRINT_SELECT(NAME, NUM) CAT(NAME##_, NUM)
#define GET_COUNT(_1, _2, _3, _4, _5, _6, COUNT, ...) COUNT
#define VA_SIZE(...) GET_COUNT(__VA_ARGS__, 6, 5, 4, 3, 2, 1)
#define PRINT_DEBUG(...) \
PRINT_SELECT(PRINT_DEBUG, VA_SIZE(__VA_ARGS__))(__VA_ARGS__)
#else
#define PRINT_DEBUG(...)
#endif
template <typename T>
__device__ typename std::enable_if<std::is_unsigned<T>::value, void>::type
_print_debug(T *var, const char *var_name, int start, int end, int step,
bool cond, const char *text, bool has_text) {
__syncthreads();
if (cond) {
if (has_text)
printf("%s\n", text);
for (int i = start; i < end; i += step) {
printf("%s[%u]: %u\n", var_name, i, var[i]);
}
}
__syncthreads();
}
template <typename T>
__device__ typename std::enable_if<std::is_signed<T>::value, void>::type
_print_debug(T *var, const char *var_name, int start, int end, int step,
bool cond, const char *text, bool has_text) {
__syncthreads();
if (cond) {
if (has_text)
printf("%s\n", text);
for (int i = start; i < end; i += step) {
printf("%s[%u]: %d\n", var_name, i, var[i]);
}
}
__syncthreads();
}
template <typename T>
__device__ typename std::enable_if<std::is_floating_point<T>::value, void>::type
_print_debug(T *var, const char *var_name, int start, int end, int step,
bool cond, const char *text, bool has_text) {
__syncthreads();
if (cond) {
if (has_text)
printf("%s\n", text);
for (int i = start; i < end; i += step) {
printf("%s[%u]: %.15f\n", var_name, i, var[i]);
}
}
__syncthreads();
}
template <typename T>
__device__
typename std::enable_if<std::is_same<T, thrust::complex<double>>::value,
void>::type
_print_debug(T *var, const char *var_name, int start, int end, int step,
bool cond, const char *text, bool has_text) {
__syncthreads();
if (cond) {
if (has_text)
printf("%s\n", text);
for (int i = start; i < end; i += step) {
printf("%s[%u]: %.15f , %.15f\n", var_name, i, var[i].real(),
var[i].imag());
}
}
__syncthreads();
}
template <typename T>
__device__
typename std::enable_if<std::is_same<T, cuDoubleComplex>::value, void>::type
_print_debug(T *var, const char *var_name, int start, int end, int step,
bool cond, const char *text, bool has_text) {
__syncthreads();
if (cond) {
if (has_text)
printf("%s\n", text);
for (int i = start; i < end; i += step) {
printf("%s[%u]: %.15f , %.15f\n", var_name, i, var[i].x, var[i].y);
}
}
__syncthreads();
}
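The five-argument form of PRINT_DEBUG omits the label and the six-argument form prints one first; VA_SIZE and PRINT_SELECT pick between them at preprocessing time, and the enable_if overloads pick the right printf format per element type. A small hypothetical kernel showing the two call shapes (not part of this commit):

#include <cstdint>
#include "helper_debug.cuh"

__global__ void dump_accumulator(uint64_t *acc) {
  // Labelled, six-argument form: prints the text, then acc[0..8) in
  // steps of 1, only from thread 0 of block 0.
  PRINT_DEBUG(acc, 0, 8, 1, threadIdx.x == 0 && blockIdx.x == 0,
              "accumulator after blind rotation");
  // Unlabelled, five-argument form.
  PRINT_DEBUG(acc, 0, 8, 1, threadIdx.x == 0 && blockIdx.x == 0);
}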

include/helper_string.h (new file, 664 lines)

@@ -0,0 +1,664 @@
/**
* Copyright 1993-2013 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
// These are helper functions for the SDK samples (string parsing, timers, etc)
#ifndef STRING_HELPER_H
#define STRING_HELPER_H
#include <fstream>
#include <stdio.h>
#include <stdlib.h>
#include <string>
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
#ifndef _CRT_SECURE_NO_DEPRECATE
#define _CRT_SECURE_NO_DEPRECATE
#endif
#ifndef STRCASECMP
#define STRCASECMP _stricmp
#endif
#ifndef STRNCASECMP
#define STRNCASECMP _strnicmp
#endif
#ifndef STRCPY
#define STRCPY(sFilePath, nLength, sPath) strcpy_s(sFilePath, nLength, sPath)
#endif
#ifndef FOPEN
#define FOPEN(fHandle, filename, mode) fopen_s(&fHandle, filename, mode)
#endif
#ifndef FOPEN_FAIL
#define FOPEN_FAIL(result) (result != 0)
#endif
#ifndef SSCANF
#define SSCANF sscanf_s
#endif
#ifndef SPRINTF
#define SPRINTF sprintf_s
#endif
#else // Linux Includes
#include <string.h>
#include <strings.h>
#ifndef STRCASECMP
#define STRCASECMP strcasecmp
#endif
#ifndef STRNCASECMP
#define STRNCASECMP strncasecmp
#endif
#ifndef STRCPY
#define STRCPY(sFilePath, nLength, sPath) strcpy(sFilePath, sPath)
#endif
#ifndef FOPEN
#define FOPEN(fHandle, filename, mode) (fHandle = fopen(filename, mode))
#endif
#ifndef FOPEN_FAIL
#define FOPEN_FAIL(result) (result == NULL)
#endif
#ifndef SSCANF
#define SSCANF sscanf
#endif
#ifndef SPRINTF
#define SPRINTF sprintf
#endif
#endif
#ifndef EXIT_WAIVED
#define EXIT_WAIVED 2
#endif
// CUDA Utility Helper Functions
inline int stringRemoveDelimiter(char delimiter, const char *string) {
int string_start = 0;
while (string[string_start] == delimiter) {
string_start++;
}
if (string_start >= (int)strlen(string) - 1) {
return 0;
}
return string_start;
}
inline int getFileExtension(char *filename, char **extension) {
int string_length = (int)strlen(filename);
while (filename[string_length--] != '.') {
if (string_length == 0)
break;
}
if (string_length > 0)
string_length += 2;
if (string_length == 0)
*extension = NULL;
else
*extension = &filename[string_length];
return string_length;
}
inline bool checkCmdLineFlag(const int argc, const char **argv,
const char *string_ref) {
bool bFound = false;
if (argc >= 1) {
for (int i = 1; i < argc; i++) {
int string_start = stringRemoveDelimiter('-', argv[i]);
const char *string_argv = &argv[i][string_start];
const char *equal_pos = strchr(string_argv, '=');
int argv_length =
(int)(equal_pos == 0 ? strlen(string_argv) : equal_pos - string_argv);
int length = (int)strlen(string_ref);
if (length == argv_length &&
!STRNCASECMP(string_argv, string_ref, length)) {
bFound = true;
continue;
}
}
}
return bFound;
}
// Template helper that parses a command-line argument of the form
// -name=value into a numeric type
template <class T>
inline bool getCmdLineArgumentValue(const int argc, const char **argv,
const char *string_ref, T *value) {
bool bFound = false;
if (argc >= 1) {
for (int i = 1; i < argc; i++) {
int string_start = stringRemoveDelimiter('-', argv[i]);
const char *string_argv = &argv[i][string_start];
int length = (int)strlen(string_ref);
if (!STRNCASECMP(string_argv, string_ref, length)) {
if (length + 1 <= (int)strlen(string_argv)) {
int auto_inc = (string_argv[length] == '=') ? 1 : 0;
*value = (T)atoi(&string_argv[length + auto_inc]);
}
bFound = true;
i = argc;
}
}
}
return bFound;
}
inline int getCmdLineArgumentInt(const int argc, const char **argv,
const char *string_ref) {
bool bFound = false;
int value = -1;
if (argc >= 1) {
for (int i = 1; i < argc; i++) {
int string_start = stringRemoveDelimiter('-', argv[i]);
const char *string_argv = &argv[i][string_start];
int length = (int)strlen(string_ref);
if (!STRNCASECMP(string_argv, string_ref, length)) {
if (length + 1 <= (int)strlen(string_argv)) {
int auto_inc = (string_argv[length] == '=') ? 1 : 0;
value = atoi(&string_argv[length + auto_inc]);
} else {
value = 0;
}
bFound = true;
continue;
}
}
}
if (bFound) {
return value;
} else {
return 0;
}
}
inline float getCmdLineArgumentFloat(const int argc, const char **argv,
const char *string_ref) {
bool bFound = false;
float value = -1;
if (argc >= 1) {
for (int i = 1; i < argc; i++) {
int string_start = stringRemoveDelimiter('-', argv[i]);
const char *string_argv = &argv[i][string_start];
int length = (int)strlen(string_ref);
if (!STRNCASECMP(string_argv, string_ref, length)) {
if (length + 1 <= (int)strlen(string_argv)) {
int auto_inc = (string_argv[length] == '=') ? 1 : 0;
value = (float)atof(&string_argv[length + auto_inc]);
} else {
value = 0.f;
}
bFound = true;
continue;
}
}
}
if (bFound) {
return value;
} else {
return 0;
}
}
inline bool getCmdLineArgumentString(const int argc, const char **argv,
const char *string_ref,
char **string_retval) {
bool bFound = false;
if (argc >= 1) {
for (int i = 1; i < argc; i++) {
int string_start = stringRemoveDelimiter('-', argv[i]);
char *string_argv = (char *)&argv[i][string_start];
int length = (int)strlen(string_ref);
if (!STRNCASECMP(string_argv, string_ref, length)) {
*string_retval = &string_argv[length + 1];
bFound = true;
continue;
}
}
}
if (!bFound) {
*string_retval = NULL;
}
return bFound;
}
//////////////////////////////////////////////////////////////////////////////
//! Find the path for a file assuming that
//! files are found in the searchPath.
//!
//! @return the path if succeeded, otherwise 0
//! @param filename name of the file
//! @param executable_path optional absolute path of the executable
//////////////////////////////////////////////////////////////////////////////
inline char *sdkFindFilePath(const char *filename,
const char *executable_path) {
// <executable_name> defines a variable that is replaced with the name of the
// executable
// Typical relative search paths to locate needed companion files (e.g. sample
// input data, or JIT source files) The origin for the relative search may be
// the .exe file, a .bat file launching an .exe, a browser .exe launching the
// .exe or .bat, etc
const char *searchPath[] = {
"./", // same dir
"./common/", // "/common/" subdir
"./common/data/", // "/common/data/" subdir
"./data/", // "/data/" subdir
"./src/", // "/src/" subdir
"./src/<executable_name>/data/", // "/src/<executable_name>/data/" subdir
"./inc/", // "/inc/" subdir
"./0_Simple/", // "/0_Simple/" subdir
"./1_Utilities/", // "/1_Utilities/" subdir
"./2_Graphics/", // "/2_Graphics/" subdir
"./3_Imaging/", // "/3_Imaging/" subdir
"./4_Finance/", // "/4_Finance/" subdir
"./5_Simulations/", // "/5_Simulations/" subdir
"./6_Advanced/", // "/6_Advanced/" subdir
"./7_CUDALibraries/", // "/7_CUDALibraries/" subdir
"./8_Android/", // "/8_Android/" subdir
"./samples/", // "/samples/" subdir
"../", // up 1 in tree
"../common/", // up 1 in tree, "/common/" subdir
"../common/data/", // up 1 in tree, "/common/data/" subdir
"../data/", // up 1 in tree, "/data/" subdir
"../src/", // up 1 in tree, "/src/" subdir
"../inc/", // up 1 in tree, "/inc/" subdir
"../0_Simple/<executable_name>/data/", // up 1 in tree,
// "/0_Simple/<executable_name>/"
// subdir
"../1_Utilities/<executable_name>/data/", // up 1 in tree,
// "/1_Utilities/<executable_name>/"
// subdir
"../2_Graphics/<executable_name>/data/", // up 1 in tree,
// "/2_Graphics/<executable_name>/"
// subdir
"../3_Imaging/<executable_name>/data/", // up 1 in tree,
// "/3_Imaging/<executable_name>/"
// subdir
"../4_Finance/<executable_name>/data/", // up 1 in tree,
// "/4_Finance/<executable_name>/"
// subdir
"../5_Simulations/<executable_name>/data/", // up 1 in tree,
// "/5_Simulations/<executable_name>/"
// subdir
"../6_Advanced/<executable_name>/data/", // up 1 in tree,
// "/6_Advanced/<executable_name>/"
// subdir
"../7_CUDALibraries/<executable_name>/data/", // up 1 in tree,
// "/7_CUDALibraries/<executable_name>/"
// subdir
"../8_Android/<executable_name>/data/", // up 1 in tree,
// "/8_Android/<executable_name>/"
// subdir
"../samples/<executable_name>/data/", // up 1 in tree,
// "/samples/<executable_name>/"
// subdir
"../../", // up 2 in tree
"../../common/", // up 2 in tree, "/common/" subdir
"../../common/data/", // up 2 in tree, "/common/data/" subdir
"../../data/", // up 2 in tree, "/data/" subdir
"../../src/", // up 2 in tree, "/src/" subdir
"../../inc/", // up 2 in tree, "/inc/" subdir
"../../sandbox/<executable_name>/data/", // up 2 in tree,
// "/sandbox/<executable_name>/"
// subdir
"../../0_Simple/<executable_name>/data/", // up 2 in tree,
// "/0_Simple/<executable_name>/"
// subdir
"../../1_Utilities/<executable_name>/data/", // up 2 in tree,
// "/1_Utilities/<executable_name>/"
// subdir
"../../2_Graphics/<executable_name>/data/", // up 2 in tree,
// "/2_Graphics/<executable_name>/"
// subdir
"../../3_Imaging/<executable_name>/data/", // up 2 in tree,
// "/3_Imaging/<executable_name>/"
// subdir
"../../4_Finance/<executable_name>/data/", // up 2 in tree,
// "/4_Finance/<executable_name>/"
// subdir
"../../5_Simulations/<executable_name>/data/", // up 2 in tree,
// "/5_Simulations/<executable_name>/"
// subdir
"../../6_Advanced/<executable_name>/data/", // up 2 in tree,
// "/6_Advanced/<executable_name>/"
// subdir
"../../7_CUDALibraries/<executable_name>/data/", // up 2 in tree,
// "/7_CUDALibraries/<executable_name>/"
// subdir
"../../8_Android/<executable_name>/data/", // up 2 in tree,
// "/8_Android/<executable_name>/"
// subdir
"../../samples/<executable_name>/data/", // up 2 in tree,
// "/samples/<executable_name>/"
// subdir
"../../../", // up 3 in tree
"../../../src/<executable_name>/", // up 3 in tree,
// "/src/<executable_name>/" subdir
"../../../src/<executable_name>/data/", // up 3 in tree,
// "/src/<executable_name>/data/"
// subdir
"../../../src/<executable_name>/src/", // up 3 in tree,
// "/src/<executable_name>/src/"
// subdir
"../../../src/<executable_name>/inc/", // up 3 in tree,
// "/src/<executable_name>/inc/"
// subdir
"../../../sandbox/<executable_name>/", // up 3 in tree,
// "/sandbox/<executable_name>/"
// subdir
"../../../sandbox/<executable_name>/data/", // up 3 in tree,
// "/sandbox/<executable_name>/data/"
// subdir
"../../../sandbox/<executable_name>/src/", // up 3 in tree,
// "/sandbox/<executable_name>/src/"
// subdir
"../../../sandbox/<executable_name>/inc/", // up 3 in tree,
// "/sandbox/<executable_name>/inc/"
// subdir
"../../../0_Simple/<executable_name>/data/", // up 3 in tree,
// "/0_Simple/<executable_name>/"
// subdir
"../../../1_Utilities/<executable_name>/data/", // up 3 in tree,
// "/1_Utilities/<executable_name>/"
// subdir
"../../../2_Graphics/<executable_name>/data/", // up 3 in tree,
// "/2_Graphics/<executable_name>/"
// subdir
"../../../3_Imaging/<executable_name>/data/", // up 3 in tree,
// "/3_Imaging/<executable_name>/"
// subdir
"../../../4_Finance/<executable_name>/data/", // up 3 in tree,
// "/4_Finance/<executable_name>/"
// subdir
"../../../5_Simulations/<executable_name>/data/", // up 3 in tree,
// "/5_Simulations/<executable_name>/"
// subdir
"../../../6_Advanced/<executable_name>/data/", // up 3 in tree,
// "/6_Advanced/<executable_name>/"
// subdir
"../../../7_CUDALibraries/<executable_name>/data/", // up 3 in tree,
// "/7_CUDALibraries/<executable_name>/"
// subdir
"../../../8_Android/<executable_name>/data/", // up 3 in tree,
// "/8_Android/<executable_name>/"
// subdir
"../../../0_Simple/<executable_name>/", // up 3 in tree,
// "/0_Simple/<executable_name>/"
// subdir
"../../../1_Utilities/<executable_name>/", // up 3 in tree,
// "/1_Utilities/<executable_name>/"
// subdir
"../../../2_Graphics/<executable_name>/", // up 3 in tree,
// "/2_Graphics/<executable_name>/"
// subdir
"../../../3_Imaging/<executable_name>/", // up 3 in tree,
// "/3_Imaging/<executable_name>/"
// subdir
"../../../4_Finance/<executable_name>/", // up 3 in tree,
// "/4_Finance/<executable_name>/"
// subdir
"../../../5_Simulations/<executable_name>/", // up 3 in tree,
// "/5_Simulations/<executable_name>/"
// subdir
"../../../6_Advanced/<executable_name>/", // up 3 in tree,
// "/6_Advanced/<executable_name>/"
// subdir
"../../../7_CUDALibraries/<executable_name>/", // up 3 in tree,
// "/7_CUDALibraries/<executable_name>/"
// subdir
"../../../8_Android/<executable_name>/", // up 3 in tree,
// "/8_Android/<executable_name>/"
// subdir
"../../../samples/<executable_name>/data/", // up 3 in tree,
// "/samples/<executable_name>/"
// subdir
"../../../common/", // up 3 in tree, "../../../common/" subdir
"../../../common/data/", // up 3 in tree, "../../../common/data/" subdir
"../../../data/", // up 3 in tree, "../../../data/" subdir
"../../../../", // up 4 in tree
"../../../../src/<executable_name>/", // up 4 in tree,
// "/src/<executable_name>/" subdir
"../../../../src/<executable_name>/data/", // up 4 in tree,
// "/src/<executable_name>/data/"
// subdir
"../../../../src/<executable_name>/src/", // up 4 in tree,
// "/src/<executable_name>/src/"
// subdir
"../../../../src/<executable_name>/inc/", // up 4 in tree,
// "/src/<executable_name>/inc/"
// subdir
"../../../../sandbox/<executable_name>/", // up 4 in tree,
// "/sandbox/<executable_name>/"
// subdir
"../../../../sandbox/<executable_name>/data/", // up 4 in tree,
// "/sandbox/<executable_name>/data/"
// subdir
"../../../../sandbox/<executable_name>/src/", // up 4 in tree,
// "/sandbox/<executable_name>/src/"
// subdir
"../../../../sandbox/<executable_name>/inc/", // up 4 in tree,
// "/sandbox/<executable_name>/inc/"
// subdir
"../../../../0_Simple/<executable_name>/data/", // up 4 in tree,
// "/0_Simple/<executable_name>/"
// subdir
"../../../../1_Utilities/<executable_name>/data/", // up 4 in tree,
// "/1_Utilities/<executable_name>/"
// subdir
"../../../../2_Graphics/<executable_name>/data/", // up 4 in tree,
// "/2_Graphics/<executable_name>/"
// subdir
"../../../../3_Imaging/<executable_name>/data/", // up 4 in tree,
// "/3_Imaging/<executable_name>/"
// subdir
"../../../../4_Finance/<executable_name>/data/", // up 4 in tree,
// "/4_Finance/<executable_name>/"
// subdir
"../../../../5_Simulations/<executable_name>/data/", // up 4 in tree,
// "/5_Simulations/<executable_name>/"
// subdir
"../../../../6_Advanced/<executable_name>/data/", // up 4 in tree,
// "/6_Advanced/<executable_name>/"
// subdir
"../../../../7_CUDALibraries/<executable_name>/data/", // up 4 in tree,
// "/7_CUDALibraries/<executable_name>/"
// subdir
"../../../../8_Android/<executable_name>/data/", // up 4 in tree,
// "/8_Android/<executable_name>/"
// subdir
"../../../../0_Simple/<executable_name>/", // up 4 in tree,
// "/0_Simple/<executable_name>/"
// subdir
"../../../../1_Utilities/<executable_name>/", // up 4 in tree,
// "/1_Utilities/<executable_name>/"
// subdir
"../../../../2_Graphics/<executable_name>/", // up 4 in tree,
// "/2_Graphics/<executable_name>/"
// subdir
"../../../../3_Imaging/<executable_name>/", // up 4 in tree,
// "/3_Imaging/<executable_name>/"
// subdir
"../../../../4_Finance/<executable_name>/", // up 4 in tree,
// "/4_Finance/<executable_name>/"
// subdir
"../../../../5_Simulations/<executable_name>/", // up 4 in tree,
// "/5_Simulations/<executable_name>/"
// subdir
"../../../../6_Advanced/<executable_name>/", // up 4 in tree,
// "/6_Advanced/<executable_name>/"
// subdir
"../../../../7_CUDALibraries/<executable_name>/", // up 4 in tree,
// "/7_CUDALibraries/<executable_name>/"
// subdir
"../../../../8_Android/<executable_name>/", // up 4 in tree,
// "/8_Android/<executable_name>/"
// subdir
"../../../../samples/<executable_name>/data/", // up 4 in tree,
// "/samples/<executable_name>/"
// subdir
"../../../../common/", // up 4 in tree, "../../../common/" subdir
"../../../../common/data/", // up 4 in tree, "../../../common/data/"
// subdir
"../../../../data/", // up 4 in tree, "../../../data/" subdir
"../../../../../", // up 5 in tree
"../../../../../src/<executable_name>/", // up 5 in tree,
// "/src/<executable_name>/"
// subdir
"../../../../../src/<executable_name>/data/", // up 5 in tree,
// "/src/<executable_name>/data/"
// subdir
"../../../../../src/<executable_name>/src/", // up 5 in tree,
// "/src/<executable_name>/src/"
// subdir
"../../../../../src/<executable_name>/inc/", // up 5 in tree,
// "/src/<executable_name>/inc/"
// subdir
"../../../../../sandbox/<executable_name>/", // up 5 in tree,
// "/sandbox/<executable_name>/"
// subdir
"../../../../../sandbox/<executable_name>/data/", // up 5 in tree,
// "/sandbox/<executable_name>/data/"
// subdir
"../../../../../sandbox/<executable_name>/src/", // up 5 in tree,
// "/sandbox/<executable_name>/src/"
// subdir
"../../../../../sandbox/<executable_name>/inc/", // up 5 in tree,
// "/sandbox/<executable_name>/inc/"
// subdir
"../../../../../0_Simple/<executable_name>/data/", // up 5 in tree,
// "/0_Simple/<executable_name>/"
// subdir
"../../../../../1_Utilities/<executable_name>/data/", // up 5 in tree,
// "/1_Utilities/<executable_name>/"
// subdir
"../../../../../2_Graphics/<executable_name>/data/", // up 5 in tree,
// "/2_Graphics/<executable_name>/"
// subdir
"../../../../../3_Imaging/<executable_name>/data/", // up 5 in tree,
// "/3_Imaging/<executable_name>/"
// subdir
"../../../../../4_Finance/<executable_name>/data/", // up 5 in tree,
// "/4_Finance/<executable_name>/"
// subdir
"../../../../../5_Simulations/<executable_name>/data/", // up 5 in tree,
// "/5_Simulations/<executable_name>/"
// subdir
"../../../../../6_Advanced/<executable_name>/data/", // up 5 in tree,
// "/6_Advanced/<executable_name>/"
// subdir
"../../../../../7_CUDALibraries/<executable_name>/data/", // up 5 in tree,
// "/7_CUDALibraries/<executable_name>/"
// subdir
"../../../../../8_Android/<executable_name>/data/", // up 5 in tree,
// "/8_Android/<executable_name>/"
// subdir
"../../../../../samples/<executable_name>/data/", // up 5 in tree,
// "/samples/<executable_name>/"
// subdir
"../../../../../common/", // up 5 in tree, "../../../common/" subdir
"../../../../../common/data/", // up 5 in tree, "../../../common/data/"
// subdir
};
// Extract the executable name
std::string executable_name;
if (executable_path != 0) {
executable_name = std::string(executable_path);
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
// Windows path delimiter
size_t delimiter_pos = executable_name.find_last_of('\\');
executable_name.erase(0, delimiter_pos + 1);
if (executable_name.rfind(".exe") != std::string::npos) {
// we strip .exe, only if the .exe is found
executable_name.resize(executable_name.size() - 4);
}
#else
// Linux & OSX path delimiter
size_t delimiter_pos = executable_name.find_last_of('/');
executable_name.erase(0, delimiter_pos + 1);
#endif
}
// Loop over all search paths and return the first hit
for (unsigned int i = 0; i < sizeof(searchPath) / sizeof(char *); ++i) {
std::string path(searchPath[i]);
size_t executable_name_pos = path.find("<executable_name>");
// If there is executable_name variable in the searchPath
// replace it with the value
if (executable_name_pos != std::string::npos) {
if (executable_path != 0) {
path.replace(executable_name_pos, strlen("<executable_name>"),
executable_name);
} else {
// Skip this path entry if no executable argument is given
continue;
}
}
#ifdef _DEBUG
printf("sdkFindFilePath <%s> in %s\n", filename, path.c_str());
#endif
// Test if the file exists
path.append(filename);
FILE *fp;
FOPEN(fp, path.c_str(), "rb");
if (fp != NULL) {
fclose(fp);
// File found
// returning an allocated array here for backwards compatibility reasons
char *file_path = (char *)malloc(path.length() + 1);
STRCPY(file_path, path.length() + 1, path.c_str());
return file_path;
}
if (fp) {
fclose(fp);
}
}
// File not found
return 0;
}
#endif
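These helpers are meant to be called from a sample's main; a hypothetical usage sketch follows (this commit adds no such binary, and the file names are illustrative):

#include <cstdlib>
#include "helper_string.h"

int main(int argc, char **argv) {
  const char **cargv = (const char **)argv;
  // Flags may be spelled -bench or --bench; values as --samples=1000.
  if (checkCmdLineFlag(argc, cargv, "bench")) {
    int samples = getCmdLineArgumentInt(argc, cargv, "samples");
    printf("benchmarking with %d samples\n", samples);
  }
  // Walk the relative search paths for a companion data file; the
  // returned buffer is malloc'd and owned by the caller.
  char *path = sdkFindFilePath("input.data", argv[0]);
  if (path != NULL) {
    printf("found: %s\n", path);
    free(path);
  }
  return 0;
}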

include/keyswitch.h (new file, 24 lines)

@@ -0,0 +1,24 @@
#ifndef CNCRT_KS_H_
#define CNCRT_KS_H_
#include <cstdint>
extern "C" {
void cuda_keyswitch_lwe_ciphertext_vector_32(void *v_stream, void *lwe_out, void *lwe_in,
void *ksk,
uint32_t lwe_dimension_before,
uint32_t lwe_dimension_after,
uint32_t base_log, uint32_t l_gadget,
uint32_t num_samples);
void cuda_keyswitch_lwe_ciphertext_vector_64(void *v_stream, void *lwe_out, void *lwe_in,
void *ksk,
uint32_t lwe_dimension_before,
uint32_t lwe_dimension_after,
uint32_t base_log, uint32_t l_gadget,
uint32_t num_samples);
}
#endif // CNCRT_KS_H_
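The keyswitch side is symmetric; a minimal call sketch assuming the device buffers were prepared with the device.h API (the dimensions are illustrative assumptions, not part of this commit):

#include <cstdint>
#include "device.h"
#include "keyswitch.h"

void keyswitch_on_gpu(void *lwe_out_d, void *lwe_in_d, void *ksk_d,
                      uint32_t num_samples) {
  void *stream = cuda_create_stream(0);
  // Switch from an LWE key of dimension 1024 down to dimension 600.
  cuda_keyswitch_lwe_ciphertext_vector_64(
      stream, lwe_out_d, lwe_in_d, ksk_d,
      /*lwe_dimension_before=*/1024, /*lwe_dimension_after=*/600,
      /*base_log=*/3, /*l_gadget=*/5, num_samples);
  cuda_synchronize_device(0);
  cuda_destroy_stream(stream, 0);
}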

parameters/CMakeLists.txt (new file, 4 lines)

@@ -0,0 +1,4 @@
file(GLOB SOURCES
"parameters.cpp")
add_library(cuda_parameters STATIC ${SOURCES})
set_target_properties(cuda_parameters PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)

parameters/parameters.cpp (new file, 380 lines)

@@ -0,0 +1,380 @@
#include <iostream>
using namespace std;
const int NORM2_MAX = 31;
const int P_MAX = 7;
typedef struct V0Parameter {
int k;
int polynomialSize;
int nSmall;
int brLevel;
int brLogBase;
int ksLevel;
int ksLogBase;
V0Parameter(int k_, int polynomialSize_, int nSmall_, int brLevel_,
int brLogBase_, int ksLevel_, int ksLogBase_) {
k = k_;
polynomialSize = polynomialSize_;
nSmall = nSmall_;
brLevel = brLevel_;
brLogBase = brLogBase_;
ksLevel = ksLevel_;
ksLogBase = ksLogBase_;
}
} V0Parameter;
typedef struct V0Variances {
float logstdEncrypt;
float logstdDecrypt;
V0Variances(float logstdEncrypt_, float logstdDecrypt_) {
logstdEncrypt = logstdEncrypt_;
logstdDecrypt = logstdDecrypt_;
}
} V0Variances;
V0Parameter parameters[NORM2_MAX][P_MAX] = {
{V0Parameter(1, 10, 472, 2, 8, 4, 2), V0Parameter(1, 10, 514, 2, 8, 5, 2),
V0Parameter(1, 10, 564, 2, 8, 5, 2), V0Parameter(1, 10, 599, 3, 6, 6, 2),
V0Parameter(1, 10, 686, 3, 6, 7, 2), V0Parameter(1, 11, 736, 1, 23, 5, 3),
V0Parameter(1, 12, 830, 1, 23, 4, 4)},
{V0Parameter(1, 10, 474, 2, 8, 4, 2), V0Parameter(1, 10, 519, 2, 8, 5, 2),
V0Parameter(1, 10, 558, 3, 6, 5, 2), V0Parameter(1, 10, 610, 3, 6, 6, 2),
V0Parameter(1, 11, 689, 1, 23, 4, 3), V0Parameter(1, 11, 736, 1, 23, 5, 3),
V0Parameter(1, 12, 831, 1, 23, 4, 4)},
{V0Parameter(1, 10, 479, 2, 8, 4, 2), V0Parameter(1, 10, 515, 3, 6, 5, 2),
V0Parameter(1, 10, 569, 3, 6, 5, 2), V0Parameter(1, 11, 638, 1, 23, 4, 3),
V0Parameter(1, 11, 689, 1, 23, 4, 3), V0Parameter(1, 11, 737, 1, 23, 5, 3),
V0Parameter(1, 12, 840, 1, 23, 4, 4)},
{V0Parameter(1, 10, 531, 2, 8, 5, 2), V0Parameter(1, 10, 523, 3, 6, 5, 2),
V0Parameter(1, 11, 598, 1, 23, 4, 3), V0Parameter(1, 11, 639, 1, 23, 4, 3),
V0Parameter(1, 11, 690, 1, 23, 4, 3), V0Parameter(1, 11, 739, 1, 23, 5, 3),
V0Parameter(1, 12, 806, 2, 16, 5, 3)},
{V0Parameter(1, 10, 483, 3, 6, 4, 2), V0Parameter(1, 11, 563, 1, 23, 3, 3),
V0Parameter(1, 11, 598, 1, 23, 4, 3), V0Parameter(1, 11, 639, 1, 23, 4, 3),
V0Parameter(1, 11, 691, 1, 23, 4, 3), V0Parameter(1, 11, 748, 1, 23, 5, 3),
V0Parameter(1, 12, 806, 2, 15, 5, 3)},
{V0Parameter(1, 11, 497, 1, 23, 4, 2), V0Parameter(1, 11, 563, 1, 23, 3, 3),
V0Parameter(1, 11, 598, 1, 23, 4, 3), V0Parameter(1, 11, 640, 1, 23, 4, 3),
V0Parameter(1, 11, 699, 1, 23, 4, 3), V0Parameter(1, 11, 736, 2, 15, 5, 3),
V0Parameter(1, 12, 806, 2, 15, 5, 3)},
{V0Parameter(1, 11, 497, 1, 23, 4, 2), V0Parameter(1, 11, 563, 1, 23, 3, 3),
V0Parameter(1, 11, 599, 1, 23, 4, 3), V0Parameter(1, 11, 643, 1, 23, 4, 3),
V0Parameter(1, 11, 721, 1, 23, 5, 3), V0Parameter(1, 11, 736, 2, 15, 5, 3),
V0Parameter(1, 12, 806, 2, 15, 5, 3)},
{V0Parameter(1, 11, 497, 1, 23, 4, 2), V0Parameter(1, 11, 564, 1, 23, 3, 3),
V0Parameter(1, 11, 602, 1, 23, 4, 3), V0Parameter(1, 11, 671, 1, 23, 4, 3),
V0Parameter(1, 11, 689, 2, 15, 4, 3), V0Parameter(1, 11, 736, 2, 15, 5, 3),
V0Parameter(1, 12, 807, 2, 15, 5, 3)},
{V0Parameter(1, 11, 498, 1, 23, 4, 2), V0Parameter(1, 11, 569, 1, 23, 3, 3),
V0Parameter(1, 11, 622, 1, 23, 4, 3), V0Parameter(1, 11, 638, 2, 15, 4, 3),
V0Parameter(1, 11, 689, 2, 16, 4, 3), V0Parameter(1, 11, 736, 2, 16, 5, 3),
V0Parameter(1, 12, 809, 2, 15, 5, 3)},
{V0Parameter(1, 11, 502, 1, 23, 4, 2), V0Parameter(1, 11, 555, 1, 23, 5, 2),
V0Parameter(1, 11, 579, 2, 15, 5, 2), V0Parameter(1, 11, 638, 2, 15, 4, 3),
V0Parameter(1, 11, 689, 2, 15, 4, 3), V0Parameter(1, 11, 737, 2, 15, 5, 3),
V0Parameter(1, 12, 818, 2, 15, 5, 3)},
{V0Parameter(1, 11, 537, 1, 23, 3, 3), V0Parameter(1, 11, 532, 2, 15, 5, 2),
V0Parameter(1, 11, 579, 2, 15, 5, 2), V0Parameter(1, 11, 638, 2, 15, 4, 3),
V0Parameter(1, 11, 690, 2, 15, 4, 3), V0Parameter(1, 11, 738, 2, 15, 5, 3),
V0Parameter(1, 12, 832, 2, 15, 9, 2)},
{V0Parameter(1, 11, 497, 2, 15, 4, 2), V0Parameter(1, 11, 532, 2, 15, 5, 2),
V0Parameter(1, 11, 579, 2, 15, 5, 2), V0Parameter(1, 11, 639, 2, 15, 4, 3),
V0Parameter(1, 11, 691, 2, 15, 4, 3), V0Parameter(1, 11, 743, 2, 16, 5, 3),
V0Parameter(1, 12, 807, 3, 12, 5, 3)},
{V0Parameter(1, 11, 497, 2, 15, 4, 2), V0Parameter(1, 11, 532, 2, 15, 5, 2),
V0Parameter(1, 11, 579, 2, 16, 5, 2), V0Parameter(1, 11, 639, 2, 15, 4, 3),
V0Parameter(1, 11, 695, 2, 16, 4, 3),
V0Parameter(1, 11, 757, 2, 16, 16, 1),
V0Parameter(1, 12, 811, 3, 12, 5, 3)},
{V0Parameter(1, 11, 497, 2, 16, 4, 2), V0Parameter(1, 11, 533, 2, 15, 5, 2),
V0Parameter(1, 11, 580, 2, 15, 5, 2), V0Parameter(1, 11, 641, 2, 16, 4, 3),
V0Parameter(1, 11, 699, 2, 16, 5, 3), V0Parameter(1, 11, 737, 3, 12, 5, 3),
V0Parameter(1, 12, 788, 3, 12, 8, 2)},
{V0Parameter(1, 11, 497, 2, 16, 4, 2), V0Parameter(1, 11, 533, 2, 15, 5, 2),
V0Parameter(1, 11, 583, 2, 16, 5, 2), V0Parameter(1, 11, 653, 2, 16, 4, 3),
V0Parameter(1, 11, 665, 3, 12, 6, 2), V0Parameter(1, 11, 738, 3, 12, 5, 3),
V0Parameter(1, 12, 775, 4, 9, 8, 2)},
{V0Parameter(1, 11, 498, 2, 15, 4, 2), V0Parameter(1, 11, 535, 2, 16, 5, 2),
V0Parameter(1, 11, 610, 2, 16, 4, 3), V0Parameter(1, 11, 614, 3, 12, 6, 2),
V0Parameter(1, 11, 666, 3, 12, 6, 2), V0Parameter(1, 11, 747, 3, 12, 5, 3),
V0Parameter(1, 12, 782, 4, 9, 8, 2)},
{V0Parameter(1, 11, 500, 2, 16, 4, 2), V0Parameter(1, 11, 544, 2, 16, 5, 2),
V0Parameter(1, 11, 580, 3, 12, 5, 2), V0Parameter(1, 11, 615, 3, 12, 6, 2),
V0Parameter(1, 11, 661, 3, 12, 7, 2), V0Parameter(1, 11, 715, 4, 9, 7, 2),
V0Parameter(1, 12, 778, 5, 8, 8, 2)},
{V0Parameter(1, 11, 513, 2, 16, 4, 2), V0Parameter(1, 11, 533, 3, 12, 5, 2),
V0Parameter(1, 11, 581, 3, 12, 5, 2), V0Parameter(1, 11, 618, 3, 12, 6, 2),
V0Parameter(1, 11, 687, 3, 12, 7, 2), V0Parameter(1, 11, 726, 4, 9, 7, 2),
V0Parameter(1, 12, 809, 5, 8, 8, 2)},
{V0Parameter(1, 11, 497, 3, 12, 4, 2), V0Parameter(1, 11, 533, 3, 12, 5, 2),
V0Parameter(1, 11, 585, 3, 12, 5, 2), V0Parameter(1, 11, 639, 3, 12, 6, 2),
V0Parameter(1, 11, 662, 4, 9, 7, 2), V0Parameter(1, 11, 717, 5, 8, 7, 2),
V0Parameter(1, 12, 820, 6, 7, 9, 2)},
{V0Parameter(1, 11, 498, 3, 12, 4, 2), V0Parameter(1, 11, 536, 3, 12, 5, 2),
V0Parameter(1, 11, 593, 3, 12, 6, 2), V0Parameter(1, 11, 619, 4, 9, 6, 2),
V0Parameter(1, 11, 693, 4, 9, 7, 2), V0Parameter(1, 11, 737, 5, 8, 7, 2),
V0Parameter(1, 12, 788, 8, 5, 8, 2)},
{V0Parameter(1, 11, 502, 3, 12, 4, 2), V0Parameter(1, 11, 552, 3, 12, 5, 2),
V0Parameter(1, 11, 585, 4, 9, 5, 2), V0Parameter(1, 11, 644, 4, 9, 6, 2),
V0Parameter(1, 11, 665, 5, 8, 7, 2), V0Parameter(1, 11, 736, 6, 7, 8, 2),
V0Parameter(1, 12, 786, 11, 4, 8, 2)},
{V0Parameter(1, 11, 508, 3, 12, 5, 2), V0Parameter(1, 11, 536, 4, 9, 5, 2),
V0Parameter(1, 11, 596, 4, 9, 6, 2), V0Parameter(1, 11, 621, 5, 8, 6, 2),
V0Parameter(1, 11, 667, 6, 7, 7, 2), V0Parameter(1, 11, 746, 7, 6, 8, 2),
V0Parameter(1, 12, 798, 14, 3, 9, 2)},
{V0Parameter(1, 11, 502, 4, 9, 4, 2), V0Parameter(1, 11, 555, 4, 9, 5, 2),
V0Parameter(1, 11, 580, 5, 8, 6, 2), V0Parameter(1, 11, 623, 6, 7, 6, 2),
V0Parameter(1, 11, 669, 7, 6, 7, 2), V0Parameter(1, 11, 723, 11, 4, 7, 2),
V0Parameter(1, 12, 814, 22, 2, 9, 2)},
{V0Parameter(1, 11, 510, 4, 9, 5, 2), V0Parameter(1, 11, 539, 5, 8, 5, 2),
V0Parameter(1, 11, 636, 5, 8, 6, 2), V0Parameter(1, 11, 625, 7, 6, 6, 2),
V0Parameter(1, 11, 674, 9, 5, 7, 2), V0Parameter(1, 11, 735, 14, 3, 8, 2),
V0Parameter(0, 0, 0, 0, 0, 0, 0)},
{V0Parameter(1, 11, 498, 5, 8, 5, 2), V0Parameter(1, 11, 579, 5, 8, 6, 2),
V0Parameter(1, 11, 583, 7, 6, 6, 2), V0Parameter(1, 11, 661, 8, 5, 7, 2),
V0Parameter(1, 11, 681, 11, 4, 7, 2), V0Parameter(1, 11, 736, 22, 2, 8, 2),
V0Parameter(0, 0, 0, 0, 0, 0, 0)},
{V0Parameter(1, 11, 530, 5, 8, 5, 2), V0Parameter(1, 11, 541, 7, 6, 5, 2),
V0Parameter(1, 11, 611, 8, 5, 6, 2), V0Parameter(1, 11, 635, 11, 4, 6, 2),
V0Parameter(1, 11, 704, 15, 3, 7, 2), V0Parameter(0, 0, 0, 0, 0, 0, 0),
V0Parameter(0, 0, 0, 0, 0, 0, 0)},
{V0Parameter(1, 11, 565, 6, 7, 5, 2), V0Parameter(1, 11, 569, 8, 5, 5, 2),
V0Parameter(1, 11, 590, 11, 4, 6, 2), V0Parameter(1, 11, 647, 15, 3, 7, 2),
V0Parameter(1, 11, 679, 44, 1, 14, 1), V0Parameter(0, 0, 0, 0, 0, 0, 0),
V0Parameter(0, 0, 0, 0, 0, 0, 0)},
{V0Parameter(1, 11, 520, 8, 5, 5, 2), V0Parameter(1, 11, 549, 11, 4, 5, 2),
V0Parameter(1, 11, 600, 15, 3, 6, 2),
V0Parameter(1, 11, 628, 44, 1, 13, 1), V0Parameter(0, 0, 0, 0, 0, 0, 0),
V0Parameter(0, 0, 0, 0, 0, 0, 0), V0Parameter(0, 0, 0, 0, 0, 0, 0)},
{V0Parameter(1, 11, 506, 11, 4, 5, 2), V0Parameter(1, 11, 559, 15, 3, 5, 2),
V0Parameter(1, 11, 584, 44, 1, 12, 1), V0Parameter(0, 0, 0, 0, 0, 0, 0),
V0Parameter(0, 0, 0, 0, 0, 0, 0), V0Parameter(0, 0, 0, 0, 0, 0, 0),
V0Parameter(0, 0, 0, 0, 0, 0, 0)},
{V0Parameter(1, 11, 503, 15, 3, 9, 1),
V0Parameter(1, 11, 594, 23, 2, 12, 1), V0Parameter(0, 0, 0, 0, 0, 0, 0),
V0Parameter(0, 0, 0, 0, 0, 0, 0), V0Parameter(0, 0, 0, 0, 0, 0, 0),
V0Parameter(0, 0, 0, 0, 0, 0, 0), V0Parameter(0, 0, 0, 0, 0, 0, 0)},
{V0Parameter(1, 11, 545, 22, 2, 11, 1), V0Parameter(0, 0, 0, 0, 0, 0, 0),
V0Parameter(0, 0, 0, 0, 0, 0, 0), V0Parameter(0, 0, 0, 0, 0, 0, 0),
V0Parameter(0, 0, 0, 0, 0, 0, 0), V0Parameter(0, 0, 0, 0, 0, 0, 0),
V0Parameter(0, 0, 0, 0, 0, 0, 0)}};
V0Variances variances[NORM2_MAX][P_MAX] = {
{V0Variances(-7.186489389863581, -8.186489389863581),
V0Variances(-7.124998639947563, -8.124998639947563),
V0Variances(-7.058035238345106, -8.058035238345106),
V0Variances(-8.771444355069676, -9.771444355069676),
V0Variances(-8.673618068377664, -9.673618068377664),
V0Variances(-14.282552902365403, -15.282552902365403),
V0Variances(-13.46973411689919, -14.46973411689919)},
{V0Variances(-6.183439290095372, -8.183439290095372),
V0Variances(-6.118015550365563, -8.118015550365563),
V0Variances(-7.822589795549476, -9.822589795549476),
V0Variances(-7.758317735238947, -9.758317735238947),
V0Variances(-13.330153794041763, -15.330153794041763),
V0Variances(-13.282552902365403, -15.282552902365403),
V0Variances(-12.468865546631157, -14.468865546631157)},
{V0Variances(-5.175869991676414, -8.175869991676414),
V0Variances(-6.8804361404287775, -9.880436140428777),
V0Variances(-6.808508030310776, -9.808508030310776),
V0Variances(-12.38562757351147, -15.38562757351147),
V0Variances(-12.330153794041763, -15.330153794041763),
V0Variances(-12.281573475846372, -15.281573475846372),
V0Variances(-11.46109512118327, -14.46109512118327)},
{V0Variances(-4.101526889142427, -8.101526889142427),
V0Variances(-5.869316883340595, -9.869316883340595),
V0Variances(-11.432333043294854, -15.432333043294854),
V0Variances(-11.384497819920412, -15.384497819920412),
V0Variances(-11.329107604561145, -15.329107604561145),
V0Variances(-11.279618603320834, -15.279618603320834),
V0Variances(-17.494100927447505, -21.494100927447505)},
{V0Variances(-4.926710762046184, -9.926710762046184),
V0Variances(-10.475838324353795, -15.475838324353795),
V0Variances(-10.432333043294854, -15.432333043294854),
V0Variances(-10.384497819920412, -15.384497819920412),
V0Variances(-10.328062930199778, -15.328062930199778),
V0Variances(-10.270886650450088, -15.270886650450088),
V0Variances(-16.68961710493341, -21.68961710493341)},
{V0Variances(-9.565782859612767, -15.565782859612767),
V0Variances(-9.475838324353795, -15.475838324353795),
V0Variances(-9.432333043294854, -15.432333043294854),
V0Variances(-9.38336983295023, -15.38336983295023),
V0Variances(-9.319759557706192, -15.319759557706192),
V0Variances(-16.395716120060897, -22.395716120060897),
V0Variances(-15.689617104933411, -21.68961710493341)},
{V0Variances(-8.565782859612767, -15.565782859612767),
V0Variances(-8.475838324353795, -15.475838324353795),
V0Variances(-8.431127783999514, -15.431127783999514),
V0Variances(-8.37999641672993, -15.37999641672993),
V0Variances(-8.297406155773494, -15.297406155773494),
V0Variances(-15.395716120060897, -22.395716120060897),
V0Variances(-14.689617104933411, -21.68961710493341)},
{V0Variances(-7.565782859612767, -15.565782859612767),
V0Variances(-7.4745582041945084, -15.474558204194508),
V0Variances(-7.427524042014056, -15.427524042014056),
V0Variances(-7.349249402293815, -15.349249402293815),
V0Variances(-14.443317011737257, -22.443317011737257),
V0Variances(-14.395716120060897, -22.395716120060897),
V0Variances(-13.688722687558503, -21.688722687558503)},
{V0Variances(-6.564332914359866, -15.564332914359866),
V0Variances(-6.4681914592406144, -15.468191459240614),
V0Variances(-6.403948495328606, -15.403948495328606),
V0Variances(-13.498790791206964, -22.498790791206964),
V0Variances(-13.669055725537874, -22.669055725537874),
V0Variances(-13.621454833861513, -22.621454833861513),
V0Variances(-12.686937172982404, -21.686937172982404)},
{V0Variances(-5.558562103418524, -15.558562103418524),
V0Variances(-5.486161899775176, -15.486161899775176),
V0Variances(-12.568787329094782, -22.568787329094782),
V0Variances(-12.498790791206964, -22.498790791206964),
V0Variances(-12.443317011737257, -22.443317011737257),
V0Variances(-12.394736693541866, -22.394736693541866),
V0Variances(-11.678956602726522, -21.678956602726522)},
{V0Variances(-4.509944741401199, -15.509944741401199),
V0Variances(-11.629855880338809, -22.62985588033881),
V0Variances(-11.568787329094782, -22.568787329094782),
V0Variances(-11.498790791206964, -22.498790791206964),
V0Variances(-11.442270822256638, -22.44227082225664),
V0Variances(-11.393758595059204, -22.393758595059204),
V0Variances(-10.66671526012685, -21.66671526012685)},
{V0Variances(-10.678946077308261, -22.67894607730826),
V0Variances(-10.629855880338809, -22.62985588033881),
V0Variances(-10.568787329094782, -22.568787329094782),
V0Variances(-10.497661037615906, -22.497661037615906),
V0Variances(-10.441226147895271, -22.44122614789527),
V0Variances(-10.614626611620722, -22.61462661162072),
V0Variances(-13.208593433538631, -25.20859343353863)},
{V0Variances(-9.678946077308261, -22.67894607730826),
V0Variances(-9.629855880338809, -22.62985588033881),
V0Variances(-9.794526042895399, -22.7945260428954),
V0Variances(-9.497661037615906, -22.497661037615906),
V0Variances(-9.662801228084582, -22.662801228084582),
V0Variances(-9.601161066897156, -22.601161066897156),
V0Variances(-12.205026813068883, -25.205026813068883)},
{V0Variances(-8.904684791108878, -22.904684791108878),
V0Variances(-8.628501236709816, -22.628501236709816),
V0Variances(-8.567542553081935, -22.567542553081935),
V0Variances(-8.72114553858065, -22.72114553858065),
V0Variances(-8.658661489202302, -22.658661489202302),
V0Variances(-12.345258643192501, -26.3452586431925),
V0Variances(-11.225779955449333, -25.225779955449333)},
{V0Variances(-7.904684791108878, -22.904684791108878),
V0Variances(-7.628501236709816, -22.628501236709816),
V0Variances(-7.789559775289774, -22.789559775289774),
V0Variances(-7.707766221116806, -22.707766221116806),
V0Variances(-11.41941378254576, -26.41941378254576),
V0Variances(-11.34428054470984, -26.34428054470984),
V0Variances(-12.519191629935513, -27.519191629935513)},
{V0Variances(-6.67749613205536, -22.67749613205536),
V0Variances(-6.851538271245765, -22.851538271245765),
V0Variances(-6.756903095664896, -22.756903095664896),
V0Variances(-10.476971625054944, -26.476971625054944),
V0Variances(-10.418329864204402, -26.418329864204402),
V0Variances(-10.335536831345415, -26.335536831345415),
V0Variances(-11.512705481362637, -27.512705481362637)},
{V0Variances(-5.900343669558978, -22.90034366955898),
V0Variances(-5.839504391264846, -22.839504391264846),
V0Variances(-9.518064502732571, -26.51806450273257),
V0Variances(-9.475797747626736, -26.475797747626736),
V0Variances(-9.42376581698619, -26.42376581698619),
V0Variances(-11.303434156979073, -28.303434156979073),
V0Variances(-11.872866817284404, -28.872866817284404)},
{V0Variances(-4.88182830408649, -22.88182830408649),
V0Variances(-8.579023186360452, -26.579023186360452),
V0Variances(-8.51682187103777, -26.51682187103777),
V0Variances(-8.47228753378785, -26.47228753378785),
V0Variances(-8.395935903330987, -26.395935903330987),
V0Variances(-10.292421003814077, -28.292421003814077),
V0Variances(-10.844682043562514, -28.844682043562514)},
{V0Variances(-7.629468026958897, -26.629468026958897),
V0Variances(-7.579023186360452, -26.579023186360452),
V0Variances(-7.511872640504656, -26.511872640504656),
V0Variances(-7.4481829872665415, -26.44818298726654),
V0Variances(-9.358990169408344, -28.358990169408344),
V0Variances(-11.010736901553656, -30.010736901553656),
V0Variances(-10.711045072243067, -29.711045072243067)},
{V0Variances(-6.628018081705996, -26.628018081705996),
V0Variances(-6.5749744525111495, -26.57497445251115),
V0Variances(-6.502074900467036, -26.502074900467036),
V0Variances(-8.40743607320482, -28.40743607320482),
V0Variances(-8.325978101743345, -28.325978101743345),
V0Variances(-9.990891151357076, -29.990891151357076),
V0Variances(-11.223620949776631, -31.22362094977663)},
{V0Variances(-5.622247270764653, -26.622247270764653),
V0Variances(-5.5537568193509514, -26.55375681935095),
V0Variances(-7.4481874655765665, -28.448187465576567),
V0Variances(-7.378875433754644, -28.378875433754644),
V0Variances(-9.065046290710335, -30.065046290710335),
V0Variances(-9.877481950164196, -30.877481950164196),
V0Variances(-11.301978588237922, -32.30197858823793)},
{V0Variances(-4.613676704353956, -26.613676704353956),
V0Variances(-6.511289277583067, -28.511289277583067),
V0Variances(-6.434749612580873, -28.434749612580873),
V0Variances(-8.114426826794364, -30.114426826794364),
V0Variances(-8.948491452600415, -30.948491452600415),
V0Variances(-9.745555749635251, -31.74555574963525),
V0Variances(-10.928045032645045, -32.928045032645045)},
{V0Variances(-5.558562095836564, -28.558562095836564),
V0Variances(-5.486161892193216, -28.486161892193216),
V0Variances(-7.163697010897138, -30.16369701089714),
V0Variances(-7.997718751680708, -30.997718751680708),
V0Variances(-8.824140459442134, -31.824140459442134),
V0Variances(-10.447705621458134, -33.447705621458134),
V0Variances(-10.746667774832375, -33.746667774832375)},
{V0Variances(-4.547157154382525, -28.547157154382525),
V0Variances(-6.216580824528357, -30.216580824528357),
V0Variances(-6.097210078262428, -30.097210078262428),
V0Variances(-7.873215469988139, -31.87321546998814),
V0Variances(-8.652284365210448, -32.65228436521045),
V0Variances(-9.889772718943199, -33.8897727189432), V0Variances(0, 0)},
{V0Variances(-5.27365058987057, -30.27365058987057),
V0Variances(-5.164941786909992, -30.164941786909992),
V0Variances(-6.9233956231626195, -31.92339562316262),
V0Variances(-7.176178241418846, -32.17617824141885),
V0Variances(-8.490876045927656, -33.490876045927656),
V0Variances(-9.868458351961102, -34.86845835196111), V0Variances(0, 0)},
{V0Variances(-4.228727281179324, -30.228727281179324),
V0Variances(-5.977329267849463, -31.977329267849463),
V0Variances(-6.232917187263332, -32.232917187263325),
V0Variances(-7.541325149103926, -33.541325149103926),
V0Variances(-8.245745876084897, -34.2457458760849), V0Variances(0, 0),
V0Variances(0, 0)},
{V0Variances(-4.068209399541431, -31.06820939954143),
V0Variances(-5.284289051019407, -32.2842890510194),
V0Variances(-6.59434596780909, -33.59434596780909),
V0Variances(-7.306650734407292, -34.30665073440729),
V0Variances(-8.356857016386314, -35.356857016386314), V0Variances(0, 0),
V0Variances(0, 0)},
{V0Variances(-4.349247565658466, -32.349247565658466),
V0Variances(-5.646300370431092, -33.64630037043109),
V0Variances(-6.361052340155609, -34.36105234015561),
V0Variances(-7.4131805240628665, -35.41318052406287), V0Variances(0, 0),
V0Variances(0, 0), V0Variances(0, 0)},
{V0Variances(-4.705134752586538, -33.70513475258654),
V0Variances(-5.412109448981951, -34.41210944898195),
V0Variances(-6.465578619068673, -35.46557861906867), V0Variances(0, 0),
V0Variances(0, 0), V0Variances(0, 0), V0Variances(0, 0)},
{V0Variances(-4.488254390500785, -34.488254390500785),
V0Variances(-5.062180911982956, -35.062180911982956), V0Variances(0, 0),
V0Variances(0, 0), V0Variances(0, 0), V0Variances(0, 0),
V0Variances(0, 0)},
{V0Variances(-4.085183120157467, -35.08518312015747), V0Variances(0, 0),
V0Variances(0, 0), V0Variances(0, 0), V0Variances(0, 0), V0Variances(0, 0),
V0Variances(0, 0)}};
extern "C" V0Parameter *get_parameters(int norm, int p) {
// norm and p are 1-based, hence the - 1 offset
return &parameters[norm - 1][p - 1];
}
extern "C" V0Variances *get_variances(int norm, int p) {
// norm and p are 1-based, hence the - 1 offset
return &variances[norm - 1][p - 1];
}
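The lookup is 1-based in both arguments, as the offset comments note. A hedged usage sketch from the C++ side (the reading of all-zero entries as unavailable parameter sets, and of polynomialSize as log2(N), are assumptions based on the table contents, not stated by this commit):

// V0Parameter is redeclared here to match parameters.cpp, since the
// commit ships no header for it; the layout must stay in sync.
struct V0Parameter {
  int k;
  int polynomialSize;
  int nSmall;
  int brLevel;
  int brLogBase;
  int ksLevel;
  int ksLogBase;
};
extern "C" V0Parameter *get_parameters(int norm, int p);

int main() {
  // Both arguments are 1-based: 2-norm bound 10, 4 bits of precision.
  V0Parameter *param = get_parameters(10, 4);
  if (param->polynomialSize == 0)
    return 1; // all-zero entries appear to mark unavailable combinations
  // polynomialSize appears to store log2(N): the table holds 10-12,
  // matching the N ~ 1024-4096 polynomials used by the bootstrap.
  int N = 1 << param->polynomialSize;
  return N > 0 ? 0 : 1;
}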

src/CMakeLists.txt (new file, 10 lines)

@@ -0,0 +1,10 @@
set(HEADERS ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/keyswitch.h)
file(GLOB SOURCES
"*.cu"
"*.h"
"fft/*.cu")
add_library(concrete_cuda STATIC ${SOURCES} ${HEADERS})
set_target_properties(concrete_cuda PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(concrete_cuda PUBLIC cudart)
target_include_directories(concrete_cuda PRIVATE .)

src/bootstrap.cu (new file, 1 line)

@@ -0,0 +1 @@
#include "crypto/bootstrapping_key.cuh"

src/bootstrap_amortized.cu (new file, 174 lines)

@@ -0,0 +1,174 @@
#include "bootstrap_amortized.cuh"
/* Perform bootstrapping on a batch of input LWE ciphertexts
*
* - lwe_out: output batch of num_samples bootstrapped ciphertexts c =
* (a0,..an-1,b) where n is the LWE dimension
* - lut_vector: should hold as many test vectors of size polynomial_size
* as there are input ciphertexts, but actually holds
* num_lut_vectors vectors to reduce memory usage
* - lut_vector_indexes: stores the index corresponding to
* which test vector to use for each sample in
* lut_vector
* - lwe_in: input batch of num_samples LWE ciphertexts, containing n
* mask values + 1 body value
* - bootstrapping_key: RGSW encryption of the LWE secret key sk1
* under secret key sk2
* bsk = Z + sk1 H
* where H is the gadget matrix and Z is a matrix (k+1).l
* containing GLWE encryptions of 0 under sk2.
* bsk is thus a tensor of size (k+1)^2.l.N.n
* where l is the number of decomposition levels and
* k is the GLWE dimension, N is the polynomial size for
* GLWE. The polynomial size for GLWE and the test vector
* are the same because they have to be in the same ring
* to be multiplied.
* Note: it is necessary to generate (k+1).k.l.N.n
* uniformly random coefficients for the zero encryptions
* - input_lwe_dimension: size of the Torus vector used to encrypt the input
* LWE ciphertexts - referred to as n above (~ 600)
* - polynomial_size: size of the test polynomial (test vector) and size of the
* GLWE polynomial (~1024)
* - base_log: log base used for the gadget matrix - B = 2^base_log (~8)
* - l_gadget: number of decomposition levels in the gadget matrix (~4)
* - num_samples: number of encrypted input messages
* - num_lut_vectors: parameter to set the actual number of test vectors to be
* used
* - q: number of bits in the integer representation (32 or 64)
*
* This function calls a wrapper to a device kernel that performs the
* bootstrapping:
* - the kernel is templatized based on integer discretization and
* polynomial degree
* - num_samples blocks of threads are launched, where each thread is going
* to handle one or more polynomial coefficients at each stage:
* - perform the blind rotation
* - round the result
* - decompose into l_gadget levels, then for each level:
* - switch to the FFT domain
* - multiply with the bootstrapping key
* - come back to the coefficients representation
* - between each stage a synchronization of the threads is necessary
* - in case the device has enough shared memory, temporary arrays used for
* the different stages (accumulators) are stored into the shared memory
* - the accumulators serve to combine the results for all decomposition
* levels
* - the constant memory (64K) is used for storing the roots of unity
* values for the FFT
*/
void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
void *v_stream,
void *lwe_out,
void *lut_vector,
void *lut_vector_indexes,
void *lwe_in,
void *bootstrapping_key,
uint32_t input_lwe_dimension,
uint32_t polynomial_size,
uint32_t base_log,
uint32_t l_gadget,
uint32_t num_samples,
uint32_t num_lut_vectors,
uint32_t lwe_idx,
uint32_t max_shared_memory) {
switch (polynomial_size) {
case 512:
host_bootstrap_amortized<uint32_t, Degree<512>>(
v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in,
(double2 *)bootstrapping_key, input_lwe_dimension, polynomial_size,
base_log, l_gadget, num_samples,
num_lut_vectors, lwe_idx, max_shared_memory);
break;
case 1024:
host_bootstrap_amortized<uint32_t, Degree<1024>>(
v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in,
(double2 *)bootstrapping_key, input_lwe_dimension, polynomial_size, base_log, l_gadget, num_samples,
num_lut_vectors, lwe_idx, max_shared_memory);
break;
case 2048:
host_bootstrap_amortized<uint32_t, Degree<2048>>(
v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in,
(double2 *)bootstrapping_key, input_lwe_dimension, polynomial_size, base_log, l_gadget, num_samples,
num_lut_vectors, lwe_idx, max_shared_memory);
break;
case 4096:
host_bootstrap_amortized<uint32_t, Degree<4096>>(
v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in,
(double2 *)bootstrapping_key, input_lwe_dimension, polynomial_size, base_log, l_gadget, num_samples,
num_lut_vectors, lwe_idx, max_shared_memory);
break;
case 8192:
host_bootstrap_amortized<uint32_t, Degree<8192>>(
v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in,
(double2 *)bootstrapping_key, input_lwe_dimension, polynomial_size, base_log, l_gadget, num_samples,
num_lut_vectors, lwe_idx, max_shared_memory);
break;
default:
break;
}
}
void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
void *v_stream,
void *lwe_out,
void *lut_vector,
void *lut_vector_indexes,
void *lwe_in,
void *bootstrapping_key,
uint32_t input_lwe_dimension,
uint32_t polynomial_size,
uint32_t base_log,
uint32_t l_gadget,
uint32_t num_samples,
uint32_t num_lut_vectors,
uint32_t lwe_idx,
uint32_t max_shared_memory) {
switch (polynomial_size) {
case 512:
host_bootstrap_amortized<uint64_t, Degree<512>>(
v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in,
(double2 *)bootstrapping_key, input_lwe_dimension, polynomial_size,
base_log, l_gadget, num_samples,
num_lut_vectors, lwe_idx, max_shared_memory);
break;
case 1024:
host_bootstrap_amortized<uint64_t, Degree<1024>>(
v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in,
        (double2 *)bootstrapping_key, input_lwe_dimension, polynomial_size,
        base_log, l_gadget, num_samples,
num_lut_vectors, lwe_idx, max_shared_memory);
break;
case 2048:
host_bootstrap_amortized<uint64_t, Degree<2048>>(
v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in,
        (double2 *)bootstrapping_key, input_lwe_dimension, polynomial_size,
        base_log, l_gadget, num_samples,
num_lut_vectors, lwe_idx, max_shared_memory);
break;
case 4096:
host_bootstrap_amortized<uint64_t, Degree<4096>>(
v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in,
        (double2 *)bootstrapping_key, input_lwe_dimension, polynomial_size,
        base_log, l_gadget, num_samples,
num_lut_vectors, lwe_idx, max_shared_memory);
break;
case 8192:
host_bootstrap_amortized<uint64_t, Degree<8192>>(
v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in,
        (double2 *)bootstrapping_key, input_lwe_dimension, polynomial_size,
        base_log, l_gadget, num_samples,
num_lut_vectors, lwe_idx, max_shared_memory);
break;
default:
break;
}
}
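// Usage sketch (illustrative only, not an API added by this commit): driving
// the 64-bit amortized bootstrap from the host. The stream and shared-memory
// helpers are the ones defined in device.cu; every d_* pointer is assumed to
// be a device allocation sized as described in the comment above, and the
// parameter values are the typical orders of magnitude quoted there
// (n ~ 600, N = 1024, B = 2^8, l = 4).
//
// void example_amortized_pbs(void *d_lwe_out, void *d_lut_vector,
//                            void *d_lut_vector_indexes, void *d_lwe_in,
//                            void *d_bsk, uint32_t num_samples) {
//   void *stream = cuda_create_stream(0);
//   uint32_t max_sm = cuda_get_max_shared_memory(0);
//   cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
//       stream, d_lwe_out, d_lut_vector, d_lut_vector_indexes, d_lwe_in,
//       d_bsk, /*input_lwe_dimension=*/600, /*polynomial_size=*/1024,
//       /*base_log=*/8, /*l_gadget=*/4, num_samples,
//       /*num_lut_vectors=*/1, /*lwe_idx=*/0, max_sm);
//   cuda_destroy_stream(stream, 0);
// }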

466
src/bootstrap_amortized.cuh Normal file
View File

@@ -0,0 +1,466 @@
#ifdef __CDT_PARSER__
#undef __CUDA_RUNTIME_H__
#include <cuda_runtime.h>
#include <helper_cuda.h>
#endif
#ifndef CNCRT_AMORTIZED_PBS_H
#define CNCRT_AMORTIZED_PBS_H
#include "cooperative_groups.h"
#include "../include/helper_cuda.h"
#include "bootstrap.h"
#include "complex/operations.cuh"
//#include "crypto/bootstrapping_key.cuh"
#include "crypto/gadget.cuh"
#include "crypto/torus.cuh"
#include "fft/bnsmfft.cuh"
#include "fft/smfft.cuh"
#include "fft/twiddles.cuh"
#include "polynomial/functions.cuh"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial.cuh"
#include "polynomial/polynomial_math.cuh"
#include "utils/memory.cuh"
#include "utils/timer.cuh"
template <typename Torus, class params, sharedMemDegree SMD>
/*
* Kernel launched by host_bootstrap_amortized
*
* Uses shared memory to increase performance
* - lwe_out: output batch of num_samples bootstrapped ciphertexts c =
* (a0,..an-1,b) where n is the LWE dimension
* - lut_vector: should hold as many test vectors of size polynomial_size
* as there are input ciphertexts, but actually holds
* num_lut_vectors vectors to reduce memory usage
* - lut_vector_indexes: stores the index corresponding to which test vector
* to use for each sample in lut_vector
* - lwe_in: input batch of num_samples LWE ciphertexts, containing n mask
* values + 1 body value
* - bootstrapping_key: RGSW encryption of the LWE secret key sk1 under secret
* key sk2
* - device_mem: pointer to the device's global memory in case we use it (SMD
* == NOSM or PARTIALSM)
* - lwe_mask_size: size of the Torus vector used to encrypt the input
* LWE ciphertexts - referred to as n above (~ 600)
* - polynomial_size: size of the test polynomial (test vector) and size of the
* GLWE polynomial (~1024)
* - base_log: log base used for the gadget matrix - B = 2^base_log (~8)
* - l_gadget: number of decomposition levels in the gadget matrix (~4)
* - gpu_num: index of the current GPU (useful for multi-GPU computations)
* - lwe_idx: equal to the number of samples per gpu x gpu_num
* - device_memory_size_per_sample: amount of global memory to allocate if SMD
* is not FULLSM
*/
__global__ void device_bootstrap_amortized(
Torus *lwe_out,
Torus *lut_vector,
uint32_t *lut_vector_indexes,
Torus *lwe_in,
double2 *bootstrapping_key,
char *device_mem,
uint32_t lwe_mask_size,
uint32_t polynomial_size,
uint32_t base_log,
uint32_t l_gadget,
uint32_t lwe_idx,
size_t device_memory_size_per_sample) {
// We use shared memory for the polynomials that are used often during the
// bootstrap, since shared memory is kept in L1 cache and accessing it is
// much faster than global memory
extern __shared__ char sharedmem[];
char *selected_memory;
if constexpr (SMD == FULLSM)
selected_memory = sharedmem;
else
selected_memory = &device_mem[blockIdx.x * device_memory_size_per_sample];
  // For GPU bootstrapping the RLWE dimension is hard-set to 1: there is only
  // one mask polynomial and one body to handle. Also, since the decomposed
  // polynomials take coefficients between -B/2 and B/2, they can be
  // represented with only 16 bits, assuming the base log does not exceed 16
int16_t *accumulator_mask_decomposed = (int16_t *)selected_memory;
// TODO (Agnes) why not the 16 bits representation here?
int16_t *accumulator_body_decomposed =
(int16_t *)accumulator_mask_decomposed + polynomial_size;
Torus *accumulator_mask = (Torus *)accumulator_body_decomposed +
polynomial_size / (sizeof(Torus) / sizeof(int16_t));
Torus *accumulator_body =
(Torus *)accumulator_mask + (ptrdiff_t)polynomial_size;
Torus *accumulator_mask_rotated =
(Torus *)accumulator_body + (ptrdiff_t)polynomial_size;
Torus *accumulator_body_rotated =
(Torus *)accumulator_mask_rotated + (ptrdiff_t)polynomial_size;
double2 *mask_res_fft = (double2 *)accumulator_body_rotated +
polynomial_size / (sizeof(double2) / sizeof(Torus));
double2 *body_res_fft =
(double2 *)mask_res_fft + (ptrdiff_t)polynomial_size / 2;
double2 *accumulator_fft = (double2 *)sharedmem;
if constexpr (SMD != PARTIALSM)
accumulator_fft =
(double2 *)body_res_fft + (ptrdiff_t)(polynomial_size / 2);
/*
int dif0 = ((char*)accumulator_body_decomposed - (char*)selected_memory);
int dif1 = ((char*)accumulator_mask - (char*)accumulator_body_decomposed);
int dif2 = ((char*)accumulator_body - (char*)accumulator_mask);
int dif3 = ((char*)accumulator_mask_rotated - (char*)accumulator_body);
int dif4 = ((char*)accumulator_body_rotated -
(char*)accumulator_mask_rotated); int dif5 = ((char*)mask_res_fft -
(char*)accumulator_body_rotated); int dif6 = ((char*)body_res_fft -
(char*)mask_res_fft); int dif7 = (SMD != PARTIALSM)? (char*)accumulator_fft -
(char*)body_res_fft:0; if (threadIdx.x == 0 && blockIdx.x == 0) {
printf("device and shared mem: %d %d %d %d %d %d %d %d\n ",dif0, dif1, dif2,
dif3, dif4, dif5, dif6, dif7);
}
*/
auto block_lwe_in = &lwe_in[blockIdx.x * (lwe_mask_size + 1)];
Torus *block_lut_vector =
&lut_vector[lut_vector_indexes[lwe_idx + blockIdx.x] * params::degree * 2];
// TODO (Agnes) try to store the gadget matrix in const memory to see if
// register use decreases Since all const mem is used for twiddles currently,
// it would mean moving some of them to global memory instead
GadgetMatrix<Torus, params> gadget(base_log, l_gadget);
// Put "b", the body, in [0, 2N[
Torus b_hat = rescale_torus_element(
block_lwe_in[lwe_mask_size],
2 * params::degree); // 2 * params::log2_degree + 1);
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator_mask, block_lut_vector, b_hat, false);
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator_body, &block_lut_vector[params::degree], b_hat, false);
// Loop over all the mask elements of the sample to accumulate
// (X^a_i-1) multiplication, decomposition of the resulting polynomial
// into l_gadget polynomials, and performing polynomial multiplication
// via an FFT with the RGSW encrypted secret key
for (int iteration = 0; iteration < lwe_mask_size; iteration++) {
// TODO make sure that following sync is necessary
synchronize_threads_in_block();
// Put "a" in [0, 2N[ instead of Zq
Torus a_hat = rescale_torus_element(
block_lwe_in[iteration],
2 * params::degree); // 2 * params::log2_degree + 1);
// TODO (Agnes) why is there this if condition?
if (a_hat == 0) {
// todo(Joao): **cannot use this optimization**
// the reason is that one of the input ciphertexts (blockIdx.z)
// might skip an iteration while others don't, which as a result
// will make that block not call the grid.sync(), causing a deadlock;
// maybe it's a workaround to add grid.sync() here, but not sure if
// there are any edge cases?
// continue
}
    // Perform ACC * (X^â - 1)
multiply_by_monomial_negacyclic_and_sub_polynomial<
Torus, params::opt, params::degree / params::opt>(
accumulator_mask, accumulator_mask_rotated, a_hat);
multiply_by_monomial_negacyclic_and_sub_polynomial<
Torus, params::opt, params::degree / params::opt>(
accumulator_body, accumulator_body_rotated, a_hat);
synchronize_threads_in_block();
// Perform a rounding to increase the accuracy of the
// bootstrapped ciphertext
round_to_closest_multiple_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator_mask_rotated, base_log, l_gadget);
round_to_closest_multiple_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator_body_rotated, base_log, l_gadget);
// Initialize the polynomial multiplication via FFT arrays
// The polynomial multiplications happens at the block level
// and each thread handles two or more coefficients
int pos = threadIdx.x;
for (int j = 0; j < params::opt / 2; j++) {
mask_res_fft[pos].x = 0;
mask_res_fft[pos].y = 0;
body_res_fft[pos].x = 0;
body_res_fft[pos].y = 0;
pos += params::degree / params::opt;
}
// Now that the rotation is done, decompose the resulting polynomial
// coefficients so as to multiply each decomposed level with the
// corresponding part of the bootstrapping key
// TODO (Agnes) explain why we do that for the mask and body separately
for (int decomp_level = 0; decomp_level < l_gadget; decomp_level++) {
gadget.decompose_one_level(accumulator_mask_decomposed,
accumulator_mask_rotated, decomp_level);
gadget.decompose_one_level(accumulator_body_decomposed,
accumulator_body_rotated, decomp_level);
synchronize_threads_in_block();
// First, perform the polynomial multiplication for the mask
// Reduce the size of the FFT to be performed by storing
// the real-valued polynomial into a complex polynomial
real_to_complex_compressed<int16_t, params>(accumulator_mask_decomposed,
accumulator_fft);
synchronize_threads_in_block();
// Switch to the FFT space
NSMFFT_direct<HalfDegree<params>>(accumulator_fft);
synchronize_threads_in_block();
correction_direct_fft_inplace<params>(accumulator_fft);
// Get the bootstrapping key piece necessary for the multiplication
// It is already in the Fourier domain
// TODO (Agnes) Explain why for the mask polynomial multiplication
// we need the bsk_body_slice and vice versa
auto bsk_mask_slice = PolynomialFourier<double2, params>(
get_ith_mask_kth_block(
bootstrapping_key, iteration, 0, decomp_level,
polynomial_size, 1, l_gadget));
auto bsk_body_slice = PolynomialFourier<double2, params>(
get_ith_body_kth_block(
bootstrapping_key, iteration, 0, decomp_level,
polynomial_size, 1, l_gadget));
synchronize_threads_in_block();
// Perform the coefficient-wise product with the two pieces of
// bootstrapping key TODO (Agnes) why two pieces?
polynomial_product_accumulate_in_fourier_domain(
mask_res_fft, accumulator_fft, bsk_mask_slice);
polynomial_product_accumulate_in_fourier_domain(
body_res_fft, accumulator_fft, bsk_body_slice);
synchronize_threads_in_block();
// Now handle the polynomial multiplication for the body
// in the same way
real_to_complex_compressed<int16_t, params>(accumulator_body_decomposed,
accumulator_fft);
synchronize_threads_in_block();
NSMFFT_direct<HalfDegree<params>>(accumulator_fft);
synchronize_threads_in_block();
correction_direct_fft_inplace<params>(accumulator_fft);
auto bsk_mask_slice_2 = PolynomialFourier<double2, params>(
get_ith_mask_kth_block(bootstrapping_key, iteration, 1, decomp_level,
polynomial_size, 1, l_gadget));
auto bsk_body_slice_2 = PolynomialFourier<double2, params>(
get_ith_body_kth_block(bootstrapping_key, iteration, 1, decomp_level,
polynomial_size, 1, l_gadget));
synchronize_threads_in_block();
polynomial_product_accumulate_in_fourier_domain(
mask_res_fft, accumulator_fft, bsk_mask_slice_2);
polynomial_product_accumulate_in_fourier_domain(
body_res_fft, accumulator_fft, bsk_body_slice_2);
}
// Come back to the coefficient representation
if constexpr (SMD == FULLSM || SMD == NOSM) {
synchronize_threads_in_block();
correction_inverse_fft_inplace<params>(mask_res_fft);
correction_inverse_fft_inplace<params>(body_res_fft);
synchronize_threads_in_block();
NSMFFT_inverse<HalfDegree<params>>(mask_res_fft);
NSMFFT_inverse<HalfDegree<params>>(body_res_fft);
synchronize_threads_in_block();
add_to_torus<Torus, params>(mask_res_fft, accumulator_mask);
add_to_torus<Torus, params>(body_res_fft, accumulator_body);
synchronize_threads_in_block();
} else {
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
accumulator_fft[tid] = mask_res_fft[tid];
tid = tid + params::degree / params::opt;
}
synchronize_threads_in_block();
correction_inverse_fft_inplace<params>(accumulator_fft);
synchronize_threads_in_block();
NSMFFT_inverse<HalfDegree<params>>(accumulator_fft);
synchronize_threads_in_block();
add_to_torus<Torus, params>(accumulator_fft, accumulator_mask);
synchronize_threads_in_block();
tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
accumulator_fft[tid] = body_res_fft[tid];
tid = tid + params::degree / params::opt;
}
synchronize_threads_in_block();
correction_inverse_fft_inplace<params>(accumulator_fft);
synchronize_threads_in_block();
NSMFFT_inverse<HalfDegree<params>>(accumulator_fft);
synchronize_threads_in_block();
add_to_torus<Torus, params>(accumulator_fft, accumulator_body);
synchronize_threads_in_block();
}
}
auto block_lwe_out = &lwe_out[blockIdx.x * (polynomial_size + 1)];
// The blind rotation for this block is over
// Now we can perform the sample extraction: for the body it's just
// the resulting constant coefficient of the accumulator
// For the mask it's more complicated TODO (Agnes) explain why
sample_extract_mask<Torus, params>(block_lwe_out, accumulator_mask);
sample_extract_body<Torus, params>(block_lwe_out, accumulator_body);
}
template <typename Torus, class params>
__host__ void host_bootstrap_amortized(
void *v_stream,
Torus *lwe_out,
Torus *lut_vector,
uint32_t *lut_vector_indexes,
Torus *lwe_in,
double2 *bootstrapping_key,
uint32_t input_lwe_dimension,
uint32_t polynomial_size,
uint32_t base_log,
uint32_t l_gadget,
uint32_t input_lwe_ciphertext_count,
uint32_t num_lut_vectors,
uint32_t lwe_idx,
uint32_t max_shared_memory) {
int SM_FULL = sizeof(Torus) * polynomial_size + // accumulator mask
sizeof(Torus) * polynomial_size + // accumulator body
sizeof(Torus) * polynomial_size + // accumulator mask rotated
sizeof(Torus) * polynomial_size + // accumulator body rotated
sizeof(int16_t) * polynomial_size + // accumulator_dec mask
sizeof(int16_t) * polynomial_size + // accumulator_dec_body
sizeof(double2) * polynomial_size / 2 + // accumulator fft mask
sizeof(double2) * polynomial_size / 2 + // accumulator fft body
sizeof(double2) * polynomial_size / 2; // calculate buffer fft
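  // For example, with Torus = uint32_t and polynomial_size = 1024 this adds
  // up to 4*4096 + 2*2048 + 3*8192 = 45056 bytes (44 KB), which fits in the
  // default 48 KB of shared memory; with Torus = uint64_t the same
  // configuration needs 60 KB, hence the opt-in cudaFuncSetAttribute call in
  // the FULLSM branch below.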
int SM_PART = sizeof(double2) * polynomial_size / 2; // calculate buffer fft
int DM_PART = SM_FULL - SM_PART;
int DM_FULL = SM_FULL;
auto stream = static_cast<cudaStream_t *>(v_stream);
char *d_mem;
// Create a 1-dimensional grid of threads
// where each block handles 1 sample and each thread
// handles opt polynomial coefficients
// (actually opt/2 coefficients since we compress the real polynomial into a
// complex)
// TODO (Agnes) Polynomial size / params::opt should be equal to 256 or 512
// probably, maybe 1024 would be too big?
// Or would it actually be good in our case to have the largest possible
// number of threads per block since anyway few blocks will run
// concurrently?
dim3 grid(input_lwe_ciphertext_count, 1, 1);
dim3 thds(polynomial_size / params::opt, 1, 1);
// Launch the kernel using polynomial_size/opt threads
// where each thread computes opt polynomial coefficients
// Depending on the required amount of shared memory, choose
// from one of three templates (no use, partial use or full use
// of shared memory)
if (max_shared_memory < SM_PART) {
checkCudaErrors(cudaMalloc((void **)&d_mem, DM_FULL * input_lwe_ciphertext_count));
device_bootstrap_amortized<Torus, params, NOSM>
<<<grid, thds, 0, *stream>>>(
lwe_out, lut_vector, lut_vector_indexes, lwe_in,
bootstrapping_key, d_mem,
input_lwe_dimension, polynomial_size,
base_log, l_gadget, lwe_idx, DM_FULL);
} else if (max_shared_memory < SM_FULL) {
cudaFuncSetAttribute(device_bootstrap_amortized<Torus, params, PARTIALSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
SM_PART);
cudaFuncSetCacheConfig(
device_bootstrap_amortized<Torus, params, PARTIALSM>,
cudaFuncCachePreferShared);
checkCudaErrors(cudaMalloc((void **)&d_mem, DM_PART * input_lwe_ciphertext_count));
device_bootstrap_amortized<Torus, params, PARTIALSM>
<<<grid, thds, SM_PART, *stream>>>(
lwe_out, lut_vector, lut_vector_indexes,
lwe_in, bootstrapping_key,
d_mem, input_lwe_dimension, polynomial_size,
base_log, l_gadget, lwe_idx,
DM_PART);
} else {
// For devices with compute capability 7.x a single thread block can
// address the full capacity of shared memory. Shared memory on the
// device then has to be allocated dynamically.
// For lower compute capabilities, this call
// just does nothing and the amount of shared memory used is 48 KB
checkCudaErrors(cudaFuncSetAttribute(
device_bootstrap_amortized<Torus, params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
SM_FULL));
// TODO (Agnes): is this necessary?
checkCudaErrors(cudaFuncSetCacheConfig(
device_bootstrap_amortized<Torus, params, FULLSM>,
cudaFuncCachePreferShared));
checkCudaErrors(cudaMalloc((void **)&d_mem, 0));
device_bootstrap_amortized<Torus, params, FULLSM>
<<<grid, thds, SM_FULL, *stream>>>(
lwe_out, lut_vector, lut_vector_indexes,
lwe_in, bootstrapping_key,
d_mem, input_lwe_dimension, polynomial_size,
base_log, l_gadget, lwe_idx,
0);
}
// Synchronize the streams before copying the result to lwe_out at the right
// place
cudaStreamSynchronize(*stream);
cudaFree(d_mem);
}
template <typename Torus, class params>
int cuda_get_pbs_per_gpu(int polynomial_size) {
int blocks_per_sm = 0;
int num_threads = polynomial_size / params::opt;
cudaDeviceProp device_properties;
// FIXME: here we assume every device has same properties
cudaGetDeviceProperties(&device_properties, 0);
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
      &blocks_per_sm, device_bootstrap_amortized<Torus, params, FULLSM>,
num_threads, 0);
return device_properties.multiProcessorCount * blocks_per_sm;
}
#endif // CNCRT_AMORTIZED_PBS_H

View File

@@ -0,0 +1,183 @@
#include "bootstrap_low_latency.cuh"
/* Perform bootstrapping on a batch of input LWE ciphertexts
*
* - lwe_out: output batch of num_samples bootstrapped ciphertexts c =
* (a0,..an-1,b) where n is the LWE dimension
* - lut_vector: should hold as many test vectors of size polynomial_size
* as there are input ciphertexts, but actually holds
* num_lut_vectors vectors to reduce memory usage
* - lut_vector_indexes: stores the index corresponding to
* which test vector to use for each sample in
* lut_vector
* - lwe_in: input batch of num_samples LWE ciphertexts, containing n
* mask values + 1 body value
* - bootstrapping_key: RGSW encryption of the LWE secret key sk1
* under secret key sk2
* bsk = Z + sk1 H
* where H is the gadget matrix and Z is a matrix (k+1).l
* containing GLWE encryptions of 0 under sk2.
* bsk is thus a tensor of size (k+1)^2.l.N.n
* where l is the number of decomposition levels and
* k is the GLWE dimension, N is the polynomial size for
* GLWE. The polynomial size for GLWE and the test vector
* are the same because they have to be in the same ring
* to be multiplied.
* Note: it is necessary to generate (k+1).k.l.N.n
* uniformly random coefficients for the zero encryptions
* - lwe_dimension: size of the Torus vector used to encrypt the input
* LWE ciphertexts - referred to as n above (~ 600)
* - polynomial_size: size of the test polynomial (test vector) and size of the
* GLWE polynomial (~1024)
* - base_log: log base used for the gadget matrix - B = 2^base_log (~8)
* - l_gadget: number of decomposition levels in the gadget matrix (~4)
* - num_samples: number of encrypted input messages
* - num_lut_vectors: parameter to set the actual number of test vectors to be
* used
 * - q: number of bits in the integer representation (32 or 64)
*
* This function calls a wrapper to a device kernel that performs the
* bootstrapping:
* - the kernel is templatized based on integer discretization and
* polynomial degree
* - num_samples blocks of threads are launched, where each thread is going
* to handle one or more polynomial coefficients at each stage:
* - perform the blind rotation
* - round the result
* - decompose into l_gadget levels, then for each level:
* - switch to the FFT domain
* - multiply with the bootstrapping key
* - come back to the coefficients representation
* - between each stage a synchronization of the threads is necessary TODO
* (Agnes) check this
* - in case the device has enough shared memory, temporary arrays used for
* the different stages (accumulators) are stored into the shared memory
* - the accumulators serve to combine the results for all decomposition
* levels TODO (Agnes) check this
 * - the constant memory (64K) is used for storing the roots of unity
 * (twiddle factors) for the FFT
 * A note on the launch shape is given after the 64-bit wrapper below.
 */
void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
void *v_stream,
void *lwe_out,
void *lut_vector,
void *lut_vector_indexes,
void *lwe_in,
void *bootstrapping_key,
uint32_t lwe_dimension,
uint32_t polynomial_size,
uint32_t base_log,
uint32_t l_gadget,
uint32_t num_samples,
uint32_t num_lut_vectors,
uint32_t lwe_idx,
uint32_t max_shared_memory) {
switch (polynomial_size) {
case 512:
host_bootstrap_low_latency<uint32_t, Degree<512>>(
v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size,
base_log, l_gadget, num_samples,
num_lut_vectors);
break;
case 1024:
host_bootstrap_low_latency<uint32_t, Degree<1024>>(
v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size,
base_log, l_gadget, num_samples,
num_lut_vectors);
break;
case 2048:
host_bootstrap_low_latency<uint32_t, Degree<2048>>(
v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size,
base_log, l_gadget, num_samples,
num_lut_vectors);
break;
case 4096:
host_bootstrap_low_latency<uint32_t, Degree<4096>>(
v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size,
base_log, l_gadget, num_samples,
num_lut_vectors);
break;
case 8192:
host_bootstrap_low_latency<uint32_t, Degree<8192>>(
v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size,
base_log, l_gadget, num_samples,
num_lut_vectors);
break;
default:
break;
}
}
void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
void *v_stream,
void *lwe_out,
void *lut_vector,
void *lut_vector_indexes,
void *lwe_in,
void *bootstrapping_key,
uint32_t lwe_dimension,
uint32_t polynomial_size,
uint32_t base_log,
uint32_t l_gadget,
uint32_t num_samples,
uint32_t num_lut_vectors,
uint32_t lwe_idx,
uint32_t max_shared_memory) {
switch (polynomial_size) {
case 512:
host_bootstrap_low_latency<uint64_t, Degree<512>>(
v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size,
base_log, l_gadget, num_samples,
num_lut_vectors);
break;
case 1024:
host_bootstrap_low_latency<uint64_t, Degree<1024>>(
v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size,
base_log, l_gadget, num_samples,
num_lut_vectors);
break;
case 2048:
host_bootstrap_low_latency<uint64_t, Degree<2048>>(
v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size,
base_log, l_gadget, num_samples,
num_lut_vectors);
break;
case 4096:
host_bootstrap_low_latency<uint64_t, Degree<4096>>(
v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size,
base_log, l_gadget, num_samples,
num_lut_vectors);
break;
case 8192:
host_bootstrap_low_latency<uint64_t, Degree<8192>>(
v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size,
base_log, l_gadget, num_samples,
num_lut_vectors);
break;
default:
break;
}
}
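// Note on the launch shape (see host_bootstrap_low_latency): unlike the
// amortized version, which launches one block per input ciphertext, the low
// latency version launches a (l_gadget, 2, num_samples) grid of cooperating
// blocks, so each sample is spread over 2 * l_gadget blocks that meet at
// grid.sync(). The lwe_idx and max_shared_memory arguments are accepted for
// symmetry with the amortized wrappers but are not forwarded to the host
// function.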

View File

@@ -0,0 +1,348 @@
#ifdef __CDT_PARSER__
#undef __CUDA_RUNTIME_H__
#include <cuda_runtime.h>
#include <helper_cuda.h>
#endif
#ifndef LOWLAT_PBS_H
#define LOWLAT_PBS_H
#include "cooperative_groups.h"
#include "../include/helper_cuda.h"
#include "bootstrap.h"
#include "complex/operations.cuh"
#include "crypto/gadget.cuh"
#include "crypto/torus.cuh"
#include "fft/bnsmfft.cuh"
#include "fft/smfft.cuh"
#include "fft/twiddles.cuh"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial.cuh"
#include "polynomial/polynomial_math.cuh"
#include "utils/memory.cuh"
#include "utils/timer.cuh"
// Cooperative groups are used in the low latency
// version of the bootstrapping
using namespace cooperative_groups;
namespace cg = cooperative_groups;
template <typename Torus, class params>
__device__ void
mul_trgsw_trlwe(Torus *accumulator,
double2 *fft,
int16_t *trlwe_decomposed,
double2 *mask_join_buffer,
double2 *body_join_buffer,
double2 *bootstrapping_key,
int polynomial_size, int l_gadget, int iteration, grid_group &grid) {
// Put the decomposed TRLWE sample in the Fourier domain
real_to_complex_compressed<int16_t, params>(trlwe_decomposed,
fft);
synchronize_threads_in_block();
// Switch to the FFT space
NSMFFT_direct<HalfDegree<params>>(fft);
synchronize_threads_in_block();
correction_direct_fft_inplace<params>(fft);
synchronize_threads_in_block();
// Get the pieces of the bootstrapping key that will be needed for the
// external product; blockIdx.x is the ID of the block that's executing
// this function, so we end up getting the lines of the bootstrapping key
// needed to perform the external product in this block (corresponding to
// the same decomposition level)
// auto bsk_mask_slice = bootstrapping_key.get_ith_mask_kth_block(
// gpu_num, iteration, blockIdx.y, blockIdx.x);
// auto bsk_body_slice = bootstrapping_key.get_ith_body_kth_block(
// gpu_num, iteration, blockIdx.y, blockIdx.x);
auto bsk_mask_slice = PolynomialFourier<double2, params>(
get_ith_mask_kth_block(
bootstrapping_key, iteration, blockIdx.y, blockIdx.x,
polynomial_size, 1, l_gadget));
auto bsk_body_slice = PolynomialFourier<double2, params>(
get_ith_body_kth_block(
bootstrapping_key, iteration, blockIdx.y, blockIdx.x,
polynomial_size, 1, l_gadget));
// Perform the matrix multiplication between the RGSW and the TRLWE,
// each block operating on a single level for mask and body
auto first_processed_bsk = (blockIdx.y == 0) ? bsk_mask_slice : bsk_body_slice;
auto second_processed_bsk = (blockIdx.y == 0) ? bsk_body_slice : bsk_mask_slice;
auto first_processed_acc = (blockIdx.y == 0) ?
&mask_join_buffer[params::degree / 2 * blockIdx.x] :
&body_join_buffer[params::degree / 2 * blockIdx.x];
auto second_processed_acc = (blockIdx.y == 0) ?
&body_join_buffer[params::degree / 2 * blockIdx.x] :
&mask_join_buffer[params::degree / 2 * blockIdx.x];
int tid = threadIdx.x;
//first product
for(int i = 0; i < params::opt / 2; i++) {
first_processed_acc[tid] = fft[tid] * first_processed_bsk.m_values[tid];
tid += params::degree / params::opt;
}
grid.sync();
tid = threadIdx.x;
//second product
for(int i = 0; i < params::opt / 2; i++) {
second_processed_acc[tid] += fft[tid] * second_processed_bsk.m_values[tid];
tid += params::degree / params::opt;
}
// -----------------------------------------------------------------
// All blocks are synchronized here; after this sync, *_join_buffer has the
// values needed from every other block
grid.sync();
auto src_acc = (blockIdx.y == 0) ? mask_join_buffer : body_join_buffer;
// copy first product into fft buffer
tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
fft[tid] = src_acc[tid];
tid += params::degree / params::opt;
}
synchronize_threads_in_block();
// accumulate rest of the products into fft buffer
for (int l = 1; l < gridDim.x; l++) {
auto cur_src_acc = &src_acc[l * params::degree / 2];
tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
fft[tid] += cur_src_acc[tid];
tid += params::degree / params::opt;
}
}
synchronize_threads_in_block();
correction_inverse_fft_inplace<params>(fft);
synchronize_threads_in_block();
// Perform the inverse FFT on the result of the RGSWxTRLWE and add to the
// accumulator
NSMFFT_inverse<HalfDegree<params>>(fft);
synchronize_threads_in_block();
add_to_torus<Torus, params>(fft, accumulator);
__syncthreads();
}
template <typename Torus, class params>
/*
* Kernel launched by the low latency version of the
 * bootstrapping, which uses cooperative groups
 * lwe_out - vector of output LWE ciphertexts, with length
 * (polynomial_size + 1) * num_samples
 * lut_vector - vector of look-up tables, with length
 * 2 * polynomial_size * num_samples (one mask and one body half per sample)
 * lut_vector_indexes - mapping between lwe_in and lut_vector
 * lwe_in - vector of input LWE ciphertexts, with length
 * (lwe_mask_size + 1) * num_samples
*
*/
__global__ void device_bootstrap_low_latency(
Torus *lwe_out,
Torus *lut_vector,
Torus *lwe_in,
double2 *bootstrapping_key,
double2 *mask_join_buffer,
double2 *body_join_buffer,
uint32_t lwe_mask_size,
uint32_t polynomial_size, uint32_t base_log, uint32_t l_gadget
) {
grid_group grid = this_grid();
// We use shared memory for the polynomials that are used often during the
// bootstrap, since shared memory is kept in L1 cache and accessing it is
// much faster than global memory
extern __shared__ char sharedmem[];
char* selected_memory = sharedmem;
int16_t *accumulator_decomposed = (int16_t *)selected_memory;
Torus *accumulator = (Torus*)accumulator_decomposed +
polynomial_size / (sizeof(Torus) / sizeof(int16_t));
double2 *accumulator_fft = (double2*)accumulator +
polynomial_size / (sizeof(double2) / sizeof(Torus));
// Reuse memory from accumulator_fft for accumulator_rotated
Torus* accumulator_rotated = (Torus*)accumulator_fft;
  // The third dimension of the grid is used to determine which ciphertext
  // this block is operating on, in the case of batch bootstraps
auto block_lwe_in = &lwe_in[blockIdx.z * (lwe_mask_size + 1)];
auto block_lut_vector =
&lut_vector[blockIdx.z * params::degree * 2];
auto block_mask_join_buffer = &mask_join_buffer[blockIdx.z * l_gadget * params::degree / 2];
auto block_body_join_buffer = &body_join_buffer[blockIdx.z * l_gadget * params::degree / 2];
  // Since the space in L1 cache is small, we use the same memory location for
// the rotated accumulator and the fft accumulator, since we know that the
// rotated array is not in use anymore by the time we perform the fft
GadgetMatrix<Torus, params> gadget(base_log, l_gadget);
// Put "b" in [0, 2N[
Torus b_hat = rescale_torus_element(
block_lwe_in[lwe_mask_size],
2 * params::degree);
if (blockIdx.y == 0) {
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, block_lut_vector, b_hat, false);
}
else {
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, &block_lut_vector[params::degree], b_hat, false);
}
for (int i = 0; i < lwe_mask_size; i++) {
synchronize_threads_in_block();
// Put "a" in [0, 2N[
Torus a_hat = rescale_torus_element(
block_lwe_in[i],
2 * params::degree); // 2 * params::log2_degree + 1);
if (a_hat == 0) {
// todo(Joao): **cannot use this optimization**
// the reason is that one of the input ciphertexts (blockIdx.z)
// might skip an iteration while others don't, which as a result
// will make that block not call the grid.sync(), causing a deadlock;
// maybe it's a workaround to add grid.sync() here, but not sure if
// there are any edge cases?
// continue
}
    // Perform ACC * (X^â - 1)
multiply_by_monomial_negacyclic_and_sub_polynomial<
Torus, params::opt, params::degree / params::opt>(
accumulator, accumulator_rotated, a_hat);
// Perform a rounding to increase the accuracy of the
// bootstrapped ciphertext
round_to_closest_multiple_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator_rotated, base_log, l_gadget);
// Decompose the accumulator. Each block gets one level of the
// decomposition, for the mask and the body (so block 0 will have the
// accumulator decomposed at level 0, 1 at 1, etc.)
gadget.decompose_one_level(accumulator_decomposed, accumulator_rotated,
blockIdx.x);
// We are using the same memory space for accumulator_fft and
// accumulator_rotated, so we need to synchronize here to make sure they
// don't modify the same memory space at the same time
synchronize_threads_in_block();
// Perform G^-1(ACC) * RGSW -> TRLWE
mul_trgsw_trlwe<Torus, params>(
accumulator,
accumulator_fft,
accumulator_decomposed,
block_mask_join_buffer,
block_body_join_buffer,
bootstrapping_key,
polynomial_size, l_gadget, i, grid);
}
auto block_lwe_out = &lwe_out[blockIdx.z * (polynomial_size + 1)];
if (blockIdx.x == 0 && blockIdx.y == 0) {
// Perform a sample extract. At this point, all blocks have the result, but
// we do the computation at block 0 to avoid waiting for extra blocks, in
// case they're not synchronized
sample_extract_mask<Torus, params>(block_lwe_out, accumulator);
} else if (blockIdx.x == 0 && blockIdx.y == 1) {
sample_extract_body<Torus, params>(block_lwe_out, accumulator);
}
}
/*
* Host wrapper to the low latency version
* of bootstrapping
*/
template <typename Torus, class params>
__host__ void host_bootstrap_low_latency(
void *v_stream,
Torus *lwe_out,
Torus *lut_vector,
uint32_t *lut_vector_indexes,
Torus *lwe_in,
double2 *bootstrapping_key,
uint32_t lwe_mask_size,
uint32_t polynomial_size,
uint32_t base_log,
uint32_t l_gadget,
uint32_t num_samples,
uint32_t num_lut_vectors) {
auto stream = static_cast<cudaStream_t *>(v_stream);
int buffer_size_per_gpu = l_gadget * num_samples * polynomial_size / 2 * sizeof(double2);
double2 *mask_buffer_fft;
double2 *body_buffer_fft;
checkCudaErrors(cudaMalloc((void **)&mask_buffer_fft, buffer_size_per_gpu));
checkCudaErrors(cudaMalloc((void **)&body_buffer_fft, buffer_size_per_gpu));
int bytes_needed =
sizeof(int16_t) * polynomial_size + // accumulator_decomp
sizeof(Torus) * polynomial_size + // accumulator
sizeof(double2) * polynomial_size / 2; // accumulator fft
int thds = polynomial_size / params::opt;
dim3 grid(l_gadget, 2, num_samples);
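  // Cooperative launch constraint: grid.sync() requires all
  // 2 * l_gadget * num_samples blocks to be resident on the device
  // simultaneously, so num_samples must stay below the device's occupancy
  // bound (cuda_get_pbs_per_gpu in bootstrap_amortized.cuh gives a similar
  // per-device estimate).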
void *kernel_args[10];
kernel_args[0] = &lwe_out;
kernel_args[1] = &lut_vector;
kernel_args[2] = &lwe_in;
kernel_args[3] = &bootstrapping_key;
kernel_args[4] = &mask_buffer_fft;
kernel_args[5] = &body_buffer_fft;
kernel_args[6] = &lwe_mask_size;
kernel_args[7] = &polynomial_size;
kernel_args[8] = &base_log;
  kernel_args[9] = &l_gadget;
checkCudaErrors(cudaFuncSetAttribute(device_bootstrap_low_latency<Torus,
params>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
bytes_needed));
cudaFuncSetCacheConfig(device_bootstrap_low_latency<Torus, params>,
cudaFuncCachePreferShared);
  checkCudaErrors(cudaLaunchCooperativeKernel(
      (void *)device_bootstrap_low_latency<Torus, params>, grid, thds,
      (void **)kernel_args, bytes_needed, *stream));
// Synchronize the streams before copying the result to lwe_out at the right
// place
cudaStreamSynchronize(*stream);
cudaFree(mask_buffer_fft);
cudaFree(body_buffer_fft);
}
#endif // LOWLAT_PBS_H

View File

@@ -0,0 +1,87 @@
#ifndef GPU_BOOTSTRAP_COMMON_CUH
#define GPU_BOOTSTRAP_COMMON_CUH
#include <cassert>
#include <cstdint>
#include <cstdio>
#define SNT 1
#define dPI 6.283185307179586231995926937088
using sTorus = int32_t;
// using Torus = uint32_t;
using u32 = uint32_t;
using i32 = int32_t;
//--------------------------------------------------
// Basic double2 operations
__device__ inline double2 conjugate(const double2 num) {
double2 res;
res.x = num.x;
res.y = -num.y;
return res;
}
__device__ inline void operator+=(double2 &lh, const double2 rh) {
lh.x += rh.x;
lh.y += rh.y;
}
__device__ inline void operator-=(double2 &lh, const double2 rh) {
lh.x -= rh.x;
lh.y -= rh.y;
}
__device__ inline double2 operator+(const double2 a, const double2 b) {
double2 res;
res.x = a.x + b.x;
res.y = a.y + b.y;
return res;
}
__device__ inline double2 operator-(const double2 a, const double2 b) {
double2 res;
res.x = a.x - b.x;
res.y = a.y - b.y;
return res;
}
__device__ inline double2 operator*(const double2 a, const double2 b) {
double xx = a.x * b.x;
double xy = a.x * b.y;
double yx = a.y * b.x;
double yy = a.y * b.y;
double2 res;
// asm volatile("fma.rn.f64 %0, %1, %2, %3;": "=d"(res.x) : "d"(a.x),
// "d"(b.x), "d"(yy));
res.x = xx - yy;
res.y = xy + yx;
return res;
}
__device__ inline double2 operator*(const double2 a, double b) {
double2 res;
res.x = a.x * b;
res.y = a.y * b;
return res;
}
__device__ inline void operator*=(double2 &a, const double2 b) {
double tmp = a.x;
a.x *= b.x;
a.x -= a.y * b.y;
a.y *= b.x;
a.y += b.y * tmp;
}
__device__ inline double2 operator*(double a, double2 b) {
double2 res;
res.x = b.x * a;
res.y = b.y * a;
return res;
}
#endif

View File

@@ -0,0 +1,157 @@
#ifndef CNCRT_BSK_H
#define CNCRT_BSK_H
#include "bootstrap.h"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial.cuh"
#include <atomic>
#include <cstdint>
__device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
int glwe_dimension,
uint32_t l_gadget) {
return i * polynomial_size / 2 * (glwe_dimension + 1) * (glwe_dimension + 1) * l_gadget;
}
__device__ double2*
get_ith_mask_kth_block(double2* ptr, int i, int k, int level, uint32_t polynomial_size,
int glwe_dimension, uint32_t l_gadget) {
return &ptr[get_start_ith_ggsw(i, polynomial_size, glwe_dimension, l_gadget) +
level * polynomial_size / 2 * (glwe_dimension + 1) * (glwe_dimension + 1) +
k * polynomial_size / 2 * (glwe_dimension + 1)];
}
__device__ double2*
get_ith_body_kth_block(double2 *ptr, int i, int k, int level, uint32_t polynomial_size,
int glwe_dimension, uint32_t l_gadget) {
return &ptr[get_start_ith_ggsw(i, polynomial_size, glwe_dimension, l_gadget) +
level * polynomial_size / 2 * (glwe_dimension + 1) * (glwe_dimension + 1) +
k * polynomial_size / 2 * (glwe_dimension + 1) +
polynomial_size / 2];
}
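// Worked example of the indexing above (illustrative values): with
// polynomial_size N = 1024, glwe_dimension k = 1 and l_gadget l = 2, one
// GGSW occupies N/2 * (k+1)^2 * l = 4096 double2 coefficients. The mask of
// level 1, block k = 1 of GGSW i = 3 then starts at
// 3 * 4096 + 1 * 2048 + 1 * 1024 = 15360, and the matching body starts
// N/2 = 512 complex coefficients later.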
void cuda_initialize_twiddles(uint32_t polynomial_size, uint32_t gpu_index) {
cudaSetDevice(gpu_index);
int sw_size = polynomial_size / 2;
short *sw1_h, *sw2_h;
sw1_h = (short *)malloc(sizeof(short) * sw_size);
sw2_h = (short *)malloc(sizeof(short) * sw_size);
memset(sw1_h, 0, sw_size * sizeof(short));
memset(sw2_h, 0, sw_size * sizeof(short));
int cnt = 0;
for (int i = 1, j = 0; i < polynomial_size / 2; i++) {
int bit = (polynomial_size / 2) >> 1;
for (; j & bit; bit >>= 1)
j ^= bit;
j ^= bit;
if (i < j) {
sw1_h[cnt] = i;
sw2_h[cnt] = j;
cnt++;
}
}
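  // The loop above enumerates the index pairs to swap in order to
  // bit-reverse an array of size polynomial_size / 2: e.g. for
  // polynomial_size = 16 (half-size 8) the recorded pairs are (1, 4) and
  // (3, 6), i.e. 001 <-> 100 and 011 <-> 110.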
cudaMemcpyToSymbol(SW1, sw1_h, sw_size * sizeof(short), 0,
cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(SW2, sw2_h, sw_size * sizeof(short), 0,
cudaMemcpyHostToDevice);
free(sw1_h);
free(sw2_h);
}
template <typename T, typename ST>
void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
uint32_t gpu_index, uint32_t input_lwe_dim, uint32_t glwe_dim,
uint32_t l_gadget, uint32_t polynomial_size) {
cudaSetDevice(gpu_index);
int shared_memory_size = sizeof(double) * polynomial_size;
int total_polynomials =
input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) *
l_gadget;
// Here the buffer size is the size of double2 times the number of polynomials
// times the polynomial size over 2 because the polynomials are compressed
// into the complex domain to perform the FFT
  size_t buffer_size =
      total_polynomials * polynomial_size / 2 * sizeof(double2);
int gridSize = total_polynomials;
int blockSize = polynomial_size / choose_opt(polynomial_size);
  // todo(Joao): let's use cudaMallocHost here,
  // since it allocates page-locked (pinned) memory which allows
  // faster data copy
double2 *h_bsk = (double2 *)malloc(buffer_size);
double2 *d_bsk;
cudaMalloc((void **)&d_bsk, buffer_size);
  // Compress the real bsk into complex form and normalize it by the
  // maximum value of the Torus type T
for (int i = 0; i < total_polynomials; i++) {
int complex_current_poly_idx = i * polynomial_size / 2;
int torus_current_poly_idx = i * polynomial_size;
for (int j = 0; j < polynomial_size / 2; j++) {
h_bsk[complex_current_poly_idx + j].x =
src[torus_current_poly_idx + 2 * j];
h_bsk[complex_current_poly_idx + j].y =
src[torus_current_poly_idx + 2 * j + 1];
h_bsk[complex_current_poly_idx + j].x /=
(double)std::numeric_limits<T>::max();
h_bsk[complex_current_poly_idx + j].y /=
(double)std::numeric_limits<T>::max();
}
}
cudaMemcpy(d_bsk, h_bsk, buffer_size, cudaMemcpyHostToDevice);
auto stream = static_cast<cudaStream_t *>(v_stream);
switch (polynomial_size) {
// FIXME (Agnes): check if polynomial sizes are ok
case 512:
batch_NSMFFT<FFTDegree<Degree<512>, ForwardFFT>>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest);
break;
case 1024:
batch_NSMFFT<FFTDegree<Degree<1024>, ForwardFFT>>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest);
break;
case 2048:
batch_NSMFFT<FFTDegree<Degree<2048>, ForwardFFT>>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest);
break;
case 4096:
batch_NSMFFT<FFTDegree<Degree<4096>, ForwardFFT>>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest);
break;
case 8192:
batch_NSMFFT<FFTDegree<Degree<8192>, ForwardFFT>>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest);
break;
default:
break;
}
cudaFree(d_bsk);
free(h_bsk);
}
void cuda_convert_lwe_bootstrap_key_32(void *dest, void *src, void *v_stream,
uint32_t gpu_index, uint32_t input_lwe_dim, uint32_t glwe_dim,
uint32_t l_gadget, uint32_t polynomial_size) {
cuda_convert_lwe_bootstrap_key<uint32_t, int32_t>((double2 *)dest, (int32_t *)src,
v_stream, gpu_index, input_lwe_dim,
glwe_dim, l_gadget, polynomial_size);
}
void cuda_convert_lwe_bootstrap_key_64(void *dest, void *src, void *v_stream,
uint32_t gpu_index, uint32_t input_lwe_dim, uint32_t glwe_dim,
uint32_t l_gadget, uint32_t polynomial_size) {
cuda_convert_lwe_bootstrap_key<uint64_t, int64_t>((double2 *)dest, (int64_t *)src,
v_stream, gpu_index, input_lwe_dim,
glwe_dim, l_gadget, polynomial_size);
}
#endif // CNCRT_BSK_H

91
src/crypto/gadget.cuh Normal file
View File

@@ -0,0 +1,91 @@
#ifndef CNCRT_CRYPTO_H
#define CNCRT_CRYPTO_H
#include "polynomial/polynomial.cuh"
#include <cstdint>
template <typename T, class params> class GadgetMatrix {
private:
uint32_t l_gadget;
uint32_t base_log;
uint32_t mask;
uint32_t halfbg;
T offset;
public:
__device__ GadgetMatrix(uint32_t base_log, uint32_t l_gadget)
: base_log(base_log), l_gadget(l_gadget) {
uint32_t bg = 1 << base_log;
this->halfbg = bg / 2;
this->mask = bg - 1;
T temp = 0;
for (int i = 0; i < this->l_gadget; i++) {
temp += 1ULL << (sizeof(T) * 8 - (i + 1) * this->base_log);
}
this->offset = temp * this->halfbg;
}
template <typename V, typename U>
__device__ void decompose_one_level(Polynomial<V, params> &result,
Polynomial<U, params> &polynomial,
uint32_t level) {
int tid = threadIdx.x;
for (int i = 0; i < params::opt; i++) {
T s = polynomial.coefficients[tid] + this->offset;
uint32_t decal = (sizeof(T) * 8 - (level + 1) * this->base_log);
T temp1 = (s >> decal) & this->mask;
result.coefficients[tid] = (V)(temp1 - this->halfbg);
tid += params::degree / params::opt;
}
}
template <typename V, typename U>
__device__ void decompose_one_level(V *result, U *polynomial,
uint32_t level) {
int tid = threadIdx.x;
for (int i = 0; i < params::opt; i++) {
T s = polynomial[tid] + this->offset;
uint32_t decal = (sizeof(T) * 8 - (level + 1) * this->base_log);
T temp1 = (s >> decal) & this->mask;
result[tid] = (V)(temp1 - this->halfbg);
tid += params::degree / params::opt;
}
}
__device__ T decompose_one_level_single(T element, uint32_t level) {
T s = element + this->offset;
uint32_t decal = (sizeof(T) * 8 - (level + 1) * this->base_log);
T temp1 = (s >> decal) & this->mask;
return (T)(temp1 - this->halfbg);
}
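  // The decompose methods above all extract the same signed digit: with
  // w = sizeof(T) * 8 and B = 2^base_log,
  //   digit_level(x) = (((x + offset) >> (w - (level + 1) * base_log))
  //                     & (B - 1)) - B / 2
  // Adding `offset` beforehand recenters every digit into [-B/2, B/2) so
  // that the decomposition is balanced.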
};
template <typename T> class GadgetMatrixSingle {
private:
uint32_t l_gadget;
uint32_t base_log;
uint32_t mask;
uint32_t halfbg;
T offset;
public:
__device__ GadgetMatrixSingle(uint32_t base_log, uint32_t l_gadget)
: base_log(base_log), l_gadget(l_gadget) {
uint32_t bg = 1 << base_log;
this->halfbg = bg / 2;
this->mask = bg - 1;
T temp = 0;
for (int i = 0; i < this->l_gadget; i++) {
temp += 1ULL << (sizeof(T) * 8 - (i + 1) * this->base_log);
}
this->offset = temp * this->halfbg;
}
__device__ T decompose_one_level_single(T element, uint32_t level) {
T s = element + this->offset;
uint32_t decal = (sizeof(T) * 8 - (level + 1) * this->base_log);
T temp1 = (s >> decal) & this->mask;
return (T)(temp1 - this->halfbg);
}
};
#endif // CNCRT_CRYPTO_H

45
src/crypto/torus.cuh Normal file
View File

@@ -0,0 +1,45 @@
#ifndef CNCRT_TORUS_H
#define CNCRT_TORUS_H
#include "types/int128.cuh"
#include <limits>
template <typename Torus>
__device__ inline Torus typecast_double_to_torus(double x) {
if constexpr (sizeof(Torus) < 8) {
// this simple cast works up to 32 bits, afterwards we must do it manually
long long ret = x;
return (Torus)ret;
} else {
int128 nnnn = make_int128_from_float(x);
uint64_t lll = nnnn.lo_;
return lll;
}
// nvcc doesn't get it that the if {} else {} above should always return
// something, and complains that this function might return nothing, so we
// put this useless return here
return 0;
}
template <typename T>
__device__ inline T round_to_closest_multiple(T x, uint32_t base_log,
uint32_t l_gadget) {
T shift = sizeof(T) * 8 - l_gadget * base_log;
T mask = 1ll << (shift - 1);
T b = (x & mask) >> (shift - 1);
T res = x >> shift;
res += b;
res <<= shift;
return res;
}
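// Example: for T = uint32_t, base_log = 8 and l_gadget = 3, shift = 8 and
// the function rounds to the closest multiple of 2^8: x = 0x12345678 gives
// 0x12345600 (dropped byte 0x78 < 0x80 rounds down), while x = 0x123456F8
// gives 0x12345700 (dropped byte 0xF8 >= 0x80 rounds up).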
template <typename T>
__device__ __forceinline__ T rescale_torus_element(T element,
uint32_t log_shift) {
// todo(Joao): not sure if this works
// return element >> log_shift;
return round((double)element / (double(std::numeric_limits<T>::max()) + 1.0) *
(double)log_shift);
}
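// Example: for T = uint32_t and log_shift = 2 * 1024, the element 2^31
// (half a turn around the torus) maps to round(0.5 * 2048) = 1024. Note
// that despite its name, log_shift is the target modulus 2N itself, not
// its logarithm.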
#endif // CNCRT_TORUS_H

147
src/device.cu Normal file
View File

@@ -0,0 +1,147 @@
#include "device.h"
#include <cstdint>
#include <cstdio>
#include <cuda_runtime.h>
#include <helper_cuda.h>
/// Unsafe function to create a CUDA stream, must check first that GPU exists
void *cuda_create_stream(uint32_t gpu_index) {
cudaSetDevice(gpu_index);
cudaStream_t *stream = new cudaStream_t;
cudaStreamCreate(stream);
return stream;
}
/// Unsafe function to destroy CUDA stream, must check first the GPU exists
int cuda_destroy_stream(void *v_stream, uint32_t gpu_index) {
cudaSetDevice(gpu_index);
auto stream = static_cast<cudaStream_t *>(v_stream);
cudaStreamDestroy(*stream);
return 0;
}
/// Unsafe function that will try to allocate even if gpu_index is invalid
/// or if there's not enough memory. A safe wrapper around it must call
/// cuda_check_valid_malloc() first
void *cuda_malloc(uint64_t size, uint32_t gpu_index) {
cudaSetDevice(gpu_index);
void *ptr;
checkCudaErrors(cudaMalloc((void **)&ptr, size));
return ptr;
}
/// Checks that allocation is valid
/// 0: valid
/// -1: invalid, not enough memory in device
/// -2: invalid, gpu index doesn't exist
int cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index) {
if (gpu_index >= cuda_get_number_of_gpus()) {
// error code: invalid gpu_index
return -2;
}
cudaSetDevice(gpu_index);
size_t total_mem, free_mem;
cudaMemGetInfo(&free_mem, &total_mem);
if (size > free_mem) {
// error code: not enough memory
return -1;
}
return 0;
}
/// Tries to copy memory to the GPU asynchronously
/// 0: success
/// -1: error, invalid device pointer
/// -2: error, gpu index doesn't exist
int cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
void *v_stream, uint32_t gpu_index) {
if (gpu_index >= cuda_get_number_of_gpus()) {
// error code: invalid gpu_index
return -2;
}
cudaPointerAttributes attr;
cudaPointerGetAttributes(&attr, dest);
if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
// error code: invalid device pointer
return -1;
}
auto stream = static_cast<cudaStream_t *>(v_stream);
cudaSetDevice(gpu_index);
checkCudaErrors(cudaMemcpyAsync(dest, src, size, cudaMemcpyHostToDevice,
*stream));
return 0;
}
/// Synchronizes device
/// 0: success
/// -2: error, gpu index doesn't exist
int cuda_synchronize_device(uint32_t gpu_index) {
if (gpu_index >= cuda_get_number_of_gpus()) {
// error code: invalid gpu_index
return -2;
}
cudaSetDevice(gpu_index);
cudaDeviceSynchronize();
return 0;
}
/// Tries to copy memory back to the CPU asynchronously
/// 0: success
/// -1: error, invalid device pointer
/// -2: error, gpu index doesn't exist
int cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
void *v_stream, uint32_t gpu_index) {
if (gpu_index >= cuda_get_number_of_gpus()) {
// error code: invalid gpu_index
return -2;
}
cudaPointerAttributes attr;
cudaPointerGetAttributes(&attr, src);
if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
// error code: invalid device pointer
return -1;
}
auto stream = static_cast<cudaStream_t *>(v_stream);
cudaSetDevice(gpu_index);
checkCudaErrors(cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToHost,
*stream));
return 0;
}
/// Return number of GPUs available
int cuda_get_number_of_gpus() {
int num_gpus;
cudaGetDeviceCount(&num_gpus);
return num_gpus;
}
/// Drop a cuda array
int cuda_drop(void *ptr, uint32_t gpu_index) {
if (gpu_index >= cuda_get_number_of_gpus()) {
// error code: invalid gpu_index
return -2;
}
cudaSetDevice(gpu_index);
checkCudaErrors(cudaFree(ptr));
return 0;
}
/// Get the maximum size for the shared memory
int cuda_get_max_shared_memory(uint32_t gpu_index) {
if (gpu_index >= cuda_get_number_of_gpus()) {
// error code: invalid gpu_index
return -2;
}
cudaSetDevice(gpu_index);
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, gpu_index);
int max_shared_memory = 0;
if (prop.major > 7) {
max_shared_memory = prop.sharedMemPerMultiprocessor;
} else {
max_shared_memory = prop.sharedMemPerBlock;
}
return max_shared_memory;
}
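/// Usage sketch (illustrative, not part of the exported API): a checked
/// allocation helper built from the primitives above, which returns nullptr
/// instead of aborting when the requested size or the GPU index is invalid.
void *cuda_malloc_checked(uint64_t size, uint32_t gpu_index) {
  if (cuda_check_valid_malloc(size, gpu_index) != 0)
    return nullptr;
  return cuda_malloc(size, gpu_index);
}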

753
src/fft/bnsmfft.cuh Normal file
View File

@@ -0,0 +1,753 @@
#ifndef GPU_BOOTSTRAP_FFT_1024_CUH
#define GPU_BOOTSTRAP_FFT_1024_CUH
#include "complex/operations.cuh"
#include "polynomial/functions.cuh"
#include "polynomial/parameters.cuh"
#include "twiddles.cuh"
/*
* bit reverse
 * coefficients are swapped according to the precalculated bit-reversed
 * index pairs
* SW1 and SW2
*/
template <class params> __device__ void bit_reverse_inplace(double2 *A) {
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
short sw1 = SW1[tid];
short sw2 = SW2[tid];
double2 tmp = A[sw1];
A[sw1] = A[sw2];
A[sw2] = tmp;
tid += params::degree / params::opt;
}
}
/*
* negacyclic twiddle
* returns negacyclic twiddle based on degree and index
* twiddles are precalculated inside negTwids{3..13} arrays
*/
template <int degree> __device__ double2 negacyclic_twiddle(int j) {
double2 twid;
switch (degree) {
case 512:
twid = negTwids9[j];
break;
case 1024:
twid = negTwids10[j];
break;
case 2048:
twid = negTwids11[j];
break;
case 4096:
twid = negTwids12[j];
break;
case 8192:
    twid = negTwids13[j];
    break;
default:
twid.x = 0;
twid.y = 0;
break;
}
return twid;
}
/*
* Direct negacyclic FFT:
* - before the FFT the N real coefficients are stored into a
* N/2 sized complex with the even coefficients in the real part
* and the odd coefficients in the imaginary part. This is referred to
* as the half-size FFT
 * - when calling NSMFFT_direct for the forward negacyclic FFT of PBS,
* opt is divided by 2 because the butterfly pattern is always applied
* between pairs of coefficients
* - instead of twisting each coefficient A_j before the FFT by
* multiplying by the w^j roots of unity (aka twiddles, w=exp(-i pi /N)),
* the FFT is modified, and for each level k of the FFT the twiddle:
* w_j,k = exp(-i pi j/2^k)
* is replaced with:
* \zeta_j,k = exp(-i pi (2j-1)/2^k)
* - this technique also implies a correction of the
* complex obtained after the FFT, which is done in the
* forward_negacyclic_fft_inplace function of bootstrap.cuh
*/
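/*
 * Every level below applies the same radix-2 butterfly with the substituted
 * twiddle \zeta:
 *   A[i1] <- A[i1] + \zeta * A[i2]
 *   A[i2] <- A[i1] - \zeta * A[i2]   (using the old A[i1])
 * The first two levels hard-code the twiddles (\zeta = i, then
 * \zeta = (+-1 + i)/sqrt(2)) instead of reading the negTwids tables.
 */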
template <class params> __device__ void NSMFFT_direct(double2 *A) {
/* First, reverse the bits of the input complex
* The bit reversal for half-size FFT has been stored into the
* SW1 and SW2 arrays beforehand
* Each thread is always in charge of "opt/2" pairs of coefficients,
* which is why we always loop through N/2 by N/opt strides
* The pragma unroll instruction tells the compiler to unroll the
* full loop, which should increase performance TODO (Agnes) check this
*/
bit_reverse_inplace<params>(A);
__syncthreads();
// Now we go through all the levels of the FFT one by one
// (instead of recursively)
// first iteration: k=1, zeta=i for all coefficients
int tid = threadIdx.x;
int i1, i2;
double2 u, v;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
// the butterfly pattern is applied to each pair
// of coefficients, with a stride of 2
i1 = tid << 1;
i2 = i1 + 1;
u = A[i1];
// v = i*A[i2]
v.y = A[i2].x;
v.x = -A[i2].y;
// A[i1] <- A[i1] + i*A[i2]
// A[i2] <- A[i1] - i*A[i2]
A[i1] += v;
A[i2] = u - v;
tid += params::degree / params::opt;
}
__syncthreads();
// second iteration: apply the butterfly pattern
// between groups of 4 coefficients
// k=2, \zeta=exp(i pi/4) for even coefficients and
// exp(3 i pi / 4) for odd coefficients
// TODO (Agnes) how does this work on the gpu? aren't we doing
// a lot more computations than we should?
tid = threadIdx.x;
// odd = 0 for even coefficients, 1 for odd coefficients
int odd = tid & 1;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
// the butterfly pattern is applied to each pair
// of coefficients, with a stride of 2
// i1=2*tid if tid is even and 2*tid-1 if it is odd
i1 = (tid << 1) - odd;
i2 = i1 + 2;
double a = A[i2].x;
double b = A[i2].y;
u = A[i1];
// \zeta_j,2 = exp(-i pi (2j-1)/4) -> j=0: exp(i pi/4) or j=1: exp(-i pi/4)
// \zeta_even = sqrt(2)/2 + i * sqrt(2)/2 = sqrt(2)/2*(1+i)
// \zeta_odd = sqrt(2)/2 - i * sqrt(2)/2 = sqrt(2)/2*(1-i)
// v_j = \zeta_j * (a+i*b)
// v_even = sqrt(2)/2*((a-b)+i*(a+b))
// v_odd = sqrt(2)/2*(a+b+i*(b-a))
v.x =
(odd) ? (-0.707106781186548) * (a + b) : (0.707106781186548) * (a - b);
v.y = (odd) ? (0.707106781186548) * (a - b) : (0.707106781186548) * (a + b);
// v.x = (0.707106781186548 * odd) * (a + b) + (0.707106781186548 * (!odd))
// * (a - b); v.y = (0.707106781186548 * odd) * (b - a) + (0.707106781186548
// * (!odd)) * (a + b);
A[i1] = u + v;
A[i2] = u - v;
tid += params::degree / params::opt;
}
__syncthreads();
// third iteration
// from k=3 on, we have to do the full complex multiplication
tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
// the butterfly pattern is applied to each pair
// of coefficients, with a stride of 4
// rem is the remainder of tid/4. tid takes values:
// 0, 1, 2, 3, 4, 5, 6, 7, ... N/4
// then rem takes values:
    // 0, 1, 2, 3, 0, 1, 2, 3, ...
// and striding by 4 will allow us to cover all
// the coefficients correctly
int rem = tid & 3;
i1 = (tid << 1) - rem;
i2 = i1 + 4;
double2 w = negTwids3[rem];
u = A[i1], v = A[i2] * w;
A[i1] = u + v;
A[i2] = u - v;
tid += params::degree / params::opt;
}
__syncthreads();
// 4_th iteration
tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
// the butterfly pattern is applied to each pair
// of coefficients, with a stride of 8
// rem is the remainder of tid/8. tid takes values:
// 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, ... N/4
// then rem takes values:
    // 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, ...
// and striding by 8 will allow us to cover all
// the coefficients correctly
int rem = tid & 7;
i1 = (tid << 1) - rem;
i2 = i1 + 8;
double2 w = negTwids4[rem];
u = A[i1], v = A[i2] * w;
A[i1] = u + v;
A[i2] = u - v;
tid += params::degree / params::opt;
}
__syncthreads();
// 5th iteration
tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
// the butterfly pattern is applied to each pair
// of coefficients, with a stride of 16
// rem is the remainder of tid/16
// and the same logic as for previous iterations applies
int rem = tid & 15;
i1 = (tid << 1) - rem;
i2 = i1 + 16;
double2 w = negTwids5[rem];
u = A[i1], v = A[i2] * w;
A[i1] = u + v;
A[i2] = u - v;
tid += params::degree / params::opt;
}
__syncthreads();
// 6th iteration
tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
// the butterfly pattern is applied to each pair
// of coefficients, with a stride of 32
// rem is the remainder of tid/32
// and the same logic as for previous iterations applies
int rem = tid & 31;
i1 = (tid << 1) - rem;
i2 = i1 + 32;
double2 w = negTwids6[rem];
u = A[i1], v = A[i2] * w;
A[i1] = u + v;
A[i2] = u - v;
tid += params::degree / params::opt;
}
__syncthreads();
// 7th iteration
tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
// the butterfly pattern is applied to each pair
// of coefficients, with a stride of 64
// rem is the remainder of tid/64
// and the same logic as for previous iterations applies
int rem = tid & 63;
i1 = (tid << 1) - rem;
i2 = i1 + 64;
double2 w = negTwids7[rem];
u = A[i1], v = A[i2] * w;
A[i1] = u + v;
A[i2] = u - v;
tid += params::degree / params::opt;
}
__syncthreads();
// 8th iteration
tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
// the butterfly pattern is applied to each pair
// of coefficients, with a stride of 128
// rem is the remainder of tid/128
// and the same logic as for previous iterations applies
int rem = tid & 127;
i1 = (tid << 1) - rem;
i2 = i1 + 128;
double2 w = negTwids8[rem];
u = A[i1], v = A[i2] * w;
A[i1] = u + v;
A[i2] = u - v;
tid += params::degree / params::opt;
}
__syncthreads();
if constexpr (params::log2_degree > 8) {
// 9th iteration
tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
// the butterfly pattern is applied to each pair
// of coefficients, with a stride of 256
// rem is the remainder of tid/256
// and the same logic as for previous iterations applies
int rem = tid & 255;
i1 = (tid << 1) - rem;
i2 = i1 + 256;
double2 w = negTwids9[rem];
u = A[i1], v = A[i2] * w;
A[i1] = u + v;
A[i2] = u - v;
tid += params::degree / params::opt;
}
__syncthreads();
}
if constexpr (params::log2_degree > 9) {
// 10th iteration
tid = threadIdx.x;
//#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
// the butterfly pattern is applied to each pair
// of coefficients, with a stride of 512
// rem is the remainder of tid/512
// and the same logic as for previous iterations applies
int rem = tid & 511;
i1 = (tid << 1) - rem;
i2 = i1 + 512;
double2 w = negTwids10[rem];
u = A[i1], v = A[i2] * w;
A[i1] = u + v;
A[i2] = u - v;
tid += params::degree / params::opt;
}
__syncthreads();
}
if constexpr (params::log2_degree > 10) {
// 11th iteration
tid = threadIdx.x;
//#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
// the butterfly pattern is applied to each pair
// of coefficients, with a stride of 1024
// rem is the remainder of tid/1024
// and the same logic as for previous iterations applies
int rem = tid & 1023;
i1 = (tid << 1) - rem;
i2 = i1 + 1024;
double2 w = negTwids11[rem];
u = A[i1], v = A[i2] * w;
A[i1] = u + v;
A[i2] = u - v;
tid += params::degree / params::opt;
}
__syncthreads();
}
if constexpr (params::log2_degree > 11) {
// 12th iteration
tid = threadIdx.x;
//#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
// the butterfly pattern is applied to each pair
// of coefficients, with a stride of 2048
// rem is the remainder of tid/2048
// and the same logic as for previous iterations applies
int rem = tid & 2047;
i1 = (tid << 1) - rem;
i2 = i1 + 2048;
double2 w = negTwids12[rem];
u = A[i1], v = A[i2] * w;
A[i1] = u + v;
A[i2] = u - v;
tid += params::degree / params::opt;
}
__syncthreads();
}
// Real polynomials of degree greater than 8192 are not supported
}
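/* The same butterfly recurs at every stage above: for a stage with
 * stride s (a power of two), index t handles the pair
 * i1 = 2*t - (t & (s-1)), i2 = i1 + s and applies
 * A[i1] <- u + w*v, A[i2] <- u - w*v with w = negTwids<k>[t & (s-1)].
 * A minimal single-threaded host reference of one such stage, for
 * illustration only (the function name is not part of the library and
 * the twiddle table is assumed to be supplied by the caller):
 */
inline void butterfly_stage_host_ref(double2 *A, const double2 *twids, int n,
                                     int stride) {
  for (int t = 0; t < n / 2; t++) {
    int rem = t & (stride - 1);
    int i1 = (t << 1) - rem;
    int i2 = i1 + stride;
    double2 u = A[i1];
    double2 w = twids[rem];
    // complex product v = w * A[i2]
    double2 v = {w.x * A[i2].x - w.y * A[i2].y,
                 w.x * A[i2].y + w.y * A[i2].x};
    A[i1] = {u.x + v.x, u.y + v.y};
    A[i2] = {u.x - v.x, u.y - v.y};
  }
}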
/*
* negacyclic inverse fft
*/
template <class params> __device__ void NSMFFT_inverse(double2 *A) {
/* The inverse FFT applies the same butterfly stages as the direct
 * transform, but in reverse order, with conjugated twiddles, and with a
 * 0.5 scaling per stage so that the full transform carries the 1/N
 * normalization. The bit reversal (based on the precomputed SW1 and SW2
 * arrays) is applied at the end rather than at the beginning.
 * Each thread is always in charge of "opt/2" pairs of coefficients,
 * which is why we always loop through N/2 by N/opt strides
 * The pragma unroll instruction tells the compiler to unroll the
 * full loop, which should increase performance TODO (Agnes) check this
 */
int tid;
int i1, i2;
double2 u, v;
if constexpr (params::log2_degree > 11) {
// 12th iteration
tid = threadIdx.x;
//#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
// the butterfly pattern is applied to each pair
// of coefficients, with a stride of 2048
// rem is the remainder of tid/2048
// and the same logic as for previous iterations applies
int rem = tid & 2047;
i1 = (tid << 1) - rem;
i2 = i1 + 2048;
double2 w = conjugate(negTwids12[rem]);
u = A[i1], v = A[i2];
A[i1] = (u + v) * 0.5;
A[i2] = (u - v) * w * 0.5;
tid += params::degree / params::opt;
}
__syncthreads();
}
if constexpr (params::log2_degree > 10) {
// 11th iteration
tid = threadIdx.x;
//#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
// the butterfly pattern is applied to each pair
// of coefficients, with a stride of 1024
// rem is the remainder of tid/1024
// and the same logic as for previous iterations applies
int rem = tid & 1023;
i1 = (tid << 1) - rem;
i2 = i1 + 1024;
double2 w = conjugate(negTwids11[rem]);
u = A[i1], v = A[i2];
A[i1] = (u + v) * 0.5;
A[i2] = (u - v) * w * 0.5;
tid += params::degree / params::opt;
}
__syncthreads();
}
if constexpr (params::log2_degree > 9) {
// 10th iteration
tid = threadIdx.x;
//#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
// the butterfly pattern is applied to each pair
// of coefficients, with a stride of 512
// rem is the remainder of tid/512
// and the same logic as for previous iterations applies
int rem = tid & 511;
i1 = (tid << 1) - rem;
i2 = i1 + 512;
double2 w = conjugate(negTwids10[rem]);
u = A[i1], v = A[i2];
A[i1] = (u + v) * 0.5;
A[i2] = (u - v) * w * 0.5;
tid += params::degree / params::opt;
}
__syncthreads();
}
if constexpr (params::log2_degree > 8) {
// 9th iteration
tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
// the butterfly pattern is applied to each pair
// of coefficients, with a stride of 256
// rem is the remainder of tid/256
// and the same logic as for previous iterations applies
int rem = tid & 255;
i1 = (tid << 1) - rem;
i2 = i1 + 256;
double2 w = conjugate(negTwids9[rem]);
u = A[i1], v = A[i2];
A[i1] = (u + v) * 0.5;
A[i2] = (u - v) * w * 0.5;
tid += params::degree / params::opt;
}
__syncthreads();
}
// 8th iteration
tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
// the butterfly pattern is applied to each pair
// of coefficients, with a stride of 128
// rem is the remainder of tid/128
// and the same logic as for previous iterations applies
int rem = tid & 127;
i1 = (tid << 1) - rem;
i2 = i1 + 128;
double2 w = conjugate(negTwids8[rem]);
u = A[i1], v = A[i2];
A[i1] = (u + v) * 0.5;
A[i2] = (u - v) * w * 0.5;
tid += params::degree / params::opt;
}
__syncthreads();
// 7th iteration
tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
// the butterfly pattern is applied to each pair
// of coefficients, with a stride of 64
// rem is the remainder of tid/64
// and the same logic as for previous iterations applies
int rem = tid & 63;
i1 = (tid << 1) - rem;
i2 = i1 + 64;
double2 w = conjugate(negTwids7[rem]);
u = A[i1], v = A[i2];
A[i1] = (u + v) * 0.5;
A[i2] = (u - v) * w * 0.5;
tid += params::degree / params::opt;
}
__syncthreads();
// 6th iteration
tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
// the butterfly pattern is applied to each pair
// of coefficients, with a stride of 32
// rem is the remainder of tid/32
// and the same logic as for previous iterations applies
int rem = tid & 31;
i1 = (tid << 1) - rem;
i2 = i1 + 32;
double2 w = conjugate(negTwids6[rem]);
u = A[i1], v = A[i2];
A[i1] = (u + v) * 0.5;
A[i2] = (u - v) * w * 0.5;
tid += params::degree / params::opt;
}
__syncthreads();
// 5th iteration
tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
// the butterfly pattern is applied to each pair
// of coefficients, with a stride of 16
// rem is the remainder of tid/16
// and the same logic as for previous iterations applies
int rem = tid & 15;
i1 = (tid << 1) - rem;
i2 = i1 + 16;
double2 w = conjugate(negTwids5[rem]);
u = A[i1], v = A[i2];
A[i1] = (u + v) * 0.5;
A[i2] = (u - v) * w * 0.5;
tid += params::degree / params::opt;
}
__syncthreads();
// 4th iteration
tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
// the butterfly pattern is applied to each pair
// of coefficients, with a stride of 8
// rem is the remainder of tid/8. tid takes values:
// 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, ... N/4
// then rem takes values:
// 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, ... N/4
// and striding by 8 will allow us to cover all
// the coefficients correctly
int rem = tid & 7;
i1 = (tid << 1) - rem;
i2 = i1 + 8;
double2 w = conjugate(negTwids4[rem]);
u = A[i1], v = A[i2];
A[i1] = (u + v) * 0.5;
A[i2] = (u - v) * w * 0.5;
tid += params::degree / params::opt;
}
__syncthreads();
// third iteration
// from k=3 on, we have to do the full complex multiplication
tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
// the butterfly pattern is applied to each pair
// of coefficients, with a stride of 4
// rem is the remainder of tid/4. tid takes values:
// 0, 1, 2, 3, 4, 5, 6, 7, ... N/4
// then rem takes values:
// 0, 1, 2, 3, 0, 1, 2, 3, ... N/4
// and striding by 4 will allow us to cover all
// the coefficients correctly
int rem = tid & 3;
i1 = (tid << 1) - rem;
i2 = i1 + 4;
double2 w = conjugate(negTwids3[rem]);
u = A[i1], v = A[i2];
A[i1] = (u + v) * 0.5;
A[i2] = (u - v) * w * 0.5;
tid += params::degree / params::opt;
}
__syncthreads();
// second iteration: apply the butterfly pattern
// between groups of 4 coefficients
// k=2, conjugated twiddles: \zeta = exp(-i pi/4) for even coefficients
// and exp(-3 i pi / 4) for odd coefficients
// TODO (Agnes) how does this work on the gpu? aren't we doing
// a lot more computations than we should?
tid = threadIdx.x;
// odd = 0 for even coefficients, 1 for odd coefficients
int odd = tid & 1;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
// the butterfly pattern is applied to each pair
// of coefficients, with a stride of 2
// i1=2*tid if tid is even and 2*tid-1 if it is odd
i1 = (tid << 1) - odd;
i2 = i1 + 2;
// TODO(Beka) optimize twiddle multiplication
double2 w;
if (odd) {
w.x = -0.707106781186547461715008466854;
w.y = -0.707106781186547572737310929369;
} else {
w.x = 0.707106781186547461715008466854;
w.y = -0.707106781186547572737310929369;
}
u = A[i1], v = A[i2];
A[i1] = (u + v) * 0.5;
A[i2] = (u - v) * w * 0.5;
tid += params::degree / params::opt;
}
__syncthreads();
// last stage of the inverse: k=1, conjugated twiddle zeta = -i
// for all coefficients
tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
// the butterfly pattern is applied to each pair
// of coefficients, with a stride of 2
i1 = tid << 1;
i2 = i1 + 1;
// TODO(Beka) optimize twiddle multiplication
double2 w = {0, -1};
u = A[i1], v = A[i2];
A[i1] = (u + v) * 0.5;
A[i2] = (u - v) * w * 0.5;
tid += params::degree / params::opt;
}
__syncthreads();
bit_reverse_inplace<params>(A);
__syncthreads();
// Real polynomials of degree greater than 8192 are not supported
}
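/* Why each inverse stage halves and multiplies by the conjugate twiddle:
 * if the direct stage produced y1 = u + w*v and y2 = u - w*v with
 * |w| = 1, then 0.5*(y1 + y2) = u and 0.5*(y1 - y2)*conj(w) = v, which
 * is exactly the butterfly applied above (and the per-stage 0.5 factors
 * accumulate to the 1/N normalization). A minimal host sanity check,
 * illustrative only (w = i is chosen so the comparison is exact in
 * floating point):
 */
inline bool inverse_butterfly_roundtrip_ref() {
  double2 u = {0.25, -1.0}, v = {2.0, 0.5};
  double2 w = {0.0, 1.0}; // |w| = 1
  // direct butterfly: y1 = u + w*v, y2 = u - w*v
  double2 wv = {w.x * v.x - w.y * v.y, w.x * v.y + w.y * v.x};
  double2 y1 = {u.x + wv.x, u.y + wv.y};
  double2 y2 = {u.x - wv.x, u.y - wv.y};
  // inverse butterfly: u' = (y1 + y2)/2, v' = ((y1 - y2)/2) * conj(w)
  double2 u2 = {0.5 * (y1.x + y2.x), 0.5 * (y1.y + y2.y)};
  double2 d = {0.5 * (y1.x - y2.x), 0.5 * (y1.y - y2.y)};
  double2 v2 = {d.x * w.x + d.y * w.y, d.y * w.x - d.x * w.y};
  return u2.x == u.x && u2.y == u.y && v2.x == v.x && v2.y == v.y;
}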
/*
* correction after direct fft
* does not use extra shared memory for recovering
* correction is done using registers.
* based on Pascal's paper
*/
template <class params>
__device__ void correction_direct_fft_inplace(double2 *x) {
constexpr int threads = params::degree / params::opt;
int tid = threadIdx.x;
double2 left[params::opt / 4];
double2 right[params::opt / 4];
#pragma unroll
for (int i = 0; i < params::opt / 4; i++) {
left[i] = x[tid + i * threads];
}
#pragma unroll
for (int i = 0; i < params::opt / 4; i++) {
right[i] = x[params::degree / 2 - (tid + i * threads + 1)];
}
#pragma unroll
for (int i = 0; i < params::opt / 4; i++) {
double2 tw = negacyclic_twiddle<params::degree>(tid + i * threads);
double add_RE = left[i].x + right[i].x;
double sub_RE = left[i].x - right[i].x;
double add_IM = left[i].y + right[i].y;
double sub_IM = left[i].y - right[i].y;
double tmp1 = add_IM * tw.x + sub_RE * tw.y;
double tmp2 = -sub_RE * tw.x + add_IM * tw.y;
x[tid + i * threads].x = (add_RE + tmp1) * 0.5;
x[tid + i * threads].y = (sub_IM + tmp2) * 0.5;
x[params::degree / 2 - (tid + i * threads + 1)].x = (add_RE - tmp1) * 0.5;
x[params::degree / 2 - (tid + i * threads + 1)].y = (-sub_IM + tmp2) * 0.5;
}
}
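// Work accounting for the fold above: a block runs degree/opt threads
// and each folds opt/4 mirrored pairs (t, degree/2 - 1 - t), so
// (degree/opt) * (opt/4) * 2 = degree/2 slots are each touched exactly
// once, e.g. 256 * 1 * 2 = 512 for degree = 1024, opt = 4. Buffering
// both ends in registers (left[]/right[]) is what lets the fold run in
// place without an extra shared-memory scratch buffer.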
/*
* correction before inverse fft
* does not use extra shared memory for recovering
* correction is done using registers.
* based on Pascal's paper
*/
template <class params>
__device__ void correction_inverse_fft_inplace(double2 *x) {
constexpr int threads = params::degree / params::opt;
int tid = threadIdx.x;
double2 left[params::opt / 4];
double2 right[params::opt / 4];
#pragma unroll
for (int i = 0; i < params::opt / 4; i++) {
left[i] = x[tid + i * threads];
}
#pragma unroll
for (int i = 0; i < params::opt / 4; i++) {
right[i] = x[params::degree / 2 - (tid + i * threads + 1)];
}
#pragma unroll
for (int i = 0; i < params::opt / 4; i++) {
double2 tw = negacyclic_twiddle<params::degree>(tid + i * threads);
double add_RE = left[i].x + right[i].x;
double sub_RE = left[i].x - right[i].x;
double add_IM = left[i].y + right[i].y;
double sub_IM = left[i].y - right[i].y;
double tmp1 = add_IM * tw.x - sub_RE * tw.y;
double tmp2 = sub_RE * tw.x + add_IM * tw.y;
x[tid + i * threads].x = (add_RE - tmp1) * 0.5;
x[tid + i * threads].y = (sub_IM + tmp2) * 0.5;
x[params::degree / 2 - (tid + i * threads + 1)].x = (add_RE + tmp1) * 0.5;
x[params::degree / 2 - (tid + i * threads + 1)].y = (-sub_IM + tmp2) * 0.5;
}
}
/*
* global batch fft
* performs the fft at half size
* unrolling the half-size fft results in half size + 1 elements
* this function must be called with the actual degree
* it takes an already compressed input
*/
template <class params>
__global__ void batch_NSMFFT(double2 *d_input, double2 *d_output) {
extern __shared__ double2 sharedMemoryFFT[];
double2 *fft = sharedMemoryFFT;
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
fft[tid] = d_input[blockIdx.x * (params::degree / 2) + tid];
tid = tid + params::degree / params::opt;
}
__syncthreads();
NSMFFT_direct<HalfDegree<params>>(fft);
__syncthreads();
correction_direct_fft_inplace<params>(fft);
__syncthreads();
tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
d_output[blockIdx.x * (params::degree / 2) + tid] = fft[tid];
tid = tid + params::degree / params::opt;
}
}
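/* Launch sketch for the kernel above (illustrative: the helper name and
 * the Degree<1024> numbers are placeholders): one block per compressed
 * polynomial, degree/opt threads per block, and dynamic shared memory
 * holding the degree/2 complex values each block works on.
 */
template <class params>
void launch_batch_nsmfft_example(double2 *d_in, double2 *d_out,
                                 int num_polys, cudaStream_t stream) {
  dim3 grid(num_polys, 1, 1);
  dim3 block(params::degree / params::opt, 1, 1); // e.g. 256 for Degree<1024>
  size_t smem = (params::degree / 2) * sizeof(double2); // e.g. 8 KB
  batch_NSMFFT<params><<<grid, block, smem, stream>>>(d_in, d_out);
}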
#endif // GPU_BOOTSTRAP_FFT_1024_CUH

402
src/fft/smfft.cuh Normal file
View File

@@ -0,0 +1,402 @@
/*
#ifndef GPU_BOOTSTRAP_SMFFT_CUH
#define GPU_BOOTSTRAP_SMFFT_CUH
#include "../complex/operations.cuh"
#include "twiddles.cuh"
__device__ __inline__ double2 Get_W_value_inverse(int index) {
double2 ctemp = _gTwiddles[index];
ctemp.y = -ctemp.y;
return (ctemp);
}
template <class params>
__device__ double2 Get_after_inverse_fft_twiddle(int index) {
double2 ctemp;
switch (params::degree) {
case 512:
ctemp = INVERSE_TWIDDLES_512[index];
break;
case 1024:
ctemp = gTwiddles1024[index];
ctemp.x /= params::degree;
ctemp.y /= -params::degree;
break;
default:
break;
}
return ctemp;
}
__device__ __inline__ double shfl(double *value, int par) {
#if (CUDART_VERSION >= 9000)
return (__shfl_sync(0xffffffff, (*value), par));
#else
return (__shfl((*value), par));
#endif
}
__device__ __inline__ double shfl_xor(double *value, int par) {
#if (CUDART_VERSION >= 9000)
return (__shfl_xor_sync(0xffffffff, (*value), par));
#else
return (__shfl_xor((*value), par));
#endif
}
__device__ __inline__ double shfl_down(double *value, int par) {
#if (CUDART_VERSION >= 9000)
return (__shfl_down_sync(0xffffffff, (*value), par));
#else
return (__shfl_down((*value), par));
#endif
}
__device__ __inline__ void
reorder_16_register(double2 *A_DFT_value, double2 *B_DFT_value,
double2 *C_DFT_value, double2 *D_DFT_value, int *local_id) {
double2 Af2temp, Bf2temp, Cf2temp, Df2temp;
unsigned int target = (((unsigned int)__brev(((*local_id) & 15))) >> (28)) +
16 * ((*local_id) >> 4);
Af2temp.x = shfl(&(A_DFT_value->x), target);
Af2temp.y = shfl(&(A_DFT_value->y), target);
Bf2temp.x = shfl(&(B_DFT_value->x), target);
Bf2temp.y = shfl(&(B_DFT_value->y), target);
Cf2temp.x = shfl(&(C_DFT_value->x), target);
Cf2temp.y = shfl(&(C_DFT_value->y), target);
Df2temp.x = shfl(&(D_DFT_value->x), target);
Df2temp.y = shfl(&(D_DFT_value->y), target);
__syncwarp();
(*A_DFT_value) = Af2temp;
(*B_DFT_value) = Bf2temp;
(*C_DFT_value) = Cf2temp;
(*D_DFT_value) = Df2temp;
}
__device__ __inline__ void reorder_32_register(double2 *A_DFT_value,
double2 *B_DFT_value,
double2 *C_DFT_value,
double2 *D_DFT_value) {
double2 Af2temp, Bf2temp, Cf2temp, Df2temp;
unsigned int target = ((unsigned int)__brev(threadIdx.x)) >> (27);
Af2temp.x = shfl(&(A_DFT_value->x), target);
Af2temp.y = shfl(&(A_DFT_value->y), target);
Bf2temp.x = shfl(&(B_DFT_value->x), target);
Bf2temp.y = shfl(&(B_DFT_value->y), target);
Cf2temp.x = shfl(&(C_DFT_value->x), target);
Cf2temp.y = shfl(&(C_DFT_value->y), target);
Df2temp.x = shfl(&(D_DFT_value->x), target);
Df2temp.y = shfl(&(D_DFT_value->y), target);
__syncwarp();
(*A_DFT_value) = Af2temp;
(*B_DFT_value) = Bf2temp;
(*C_DFT_value) = Cf2temp;
(*D_DFT_value) = Df2temp;
}
template <class params>
__device__ __inline__ void
reorder_512(double2 *s_input, double2 *A_DFT_value, double2 *B_DFT_value,
double2 *C_DFT_value, double2 *D_DFT_value) {
int local_id = threadIdx.x & (params::warp - 1);
int warp_id = threadIdx.x / params::warp;
// reorder elements within warp so we can save them in semi-transposed manner
// into shared memory
reorder_32_register(A_DFT_value, B_DFT_value, C_DFT_value, D_DFT_value);
// reorder elements within warp so we can save them in semi-transposed manner
// into shared memory
__syncthreads();
unsigned int sm_store_pos =
(local_id >> 1) + 16 * (local_id & 1) + warp_id * 132;
s_input[sm_store_pos] = *A_DFT_value;
s_input[sm_store_pos + 33] = *B_DFT_value;
s_input[66 + sm_store_pos] = *C_DFT_value;
s_input[66 + sm_store_pos + 33] = *D_DFT_value;
__syncthreads();
// Read shared memory to get reordered input
unsigned int sm_read_pos = (local_id & 15) * 32 + local_id + warp_id * 4;
__syncthreads();
*A_DFT_value = s_input[sm_read_pos + 0];
*B_DFT_value = s_input[sm_read_pos + 1];
*C_DFT_value = s_input[sm_read_pos + 2];
*D_DFT_value = s_input[sm_read_pos + 3];
__syncthreads();
reorder_16_register(A_DFT_value, B_DFT_value, C_DFT_value, D_DFT_value,
&local_id);
__syncthreads();
}
template <class params>
__device__ __inline__ void
reorder_1024(double2 *s_input, double2 *A_DFT_value, double2 *B_DFT_value,
double2 *C_DFT_value, double2 *D_DFT_value) {
int local_id = threadIdx.x & (params::warp - 1);
int warp_id = threadIdx.x / params::warp;
// reorder elements within params::warp so we can save them in semi-transposed
// manner into shared memory
reorder_32_register(A_DFT_value, B_DFT_value, C_DFT_value, D_DFT_value);
// reorder elements within params::warp so we can save them in semi-transposed
// manner into shared memory
__syncthreads();
unsigned int sm_store_pos =
(local_id >> 0) + 32 * (local_id & 0) + warp_id * 132;
s_input[sm_store_pos] = *A_DFT_value;
s_input[sm_store_pos + 33] = *B_DFT_value;
s_input[66 + sm_store_pos] = *C_DFT_value;
s_input[66 + sm_store_pos + 33] = *D_DFT_value;
// Read shared memory to get reordered input
unsigned int sm_read_pos = (local_id & 31) * 32 + local_id + warp_id * 4;
__syncthreads();
*A_DFT_value = s_input[sm_read_pos + 0];
*B_DFT_value = s_input[sm_read_pos + 1];
*C_DFT_value = s_input[sm_read_pos + 2];
*D_DFT_value = s_input[sm_read_pos + 3];
__syncthreads();
reorder_32_register(A_DFT_value, B_DFT_value, C_DFT_value, D_DFT_value);
}
__device__ bool printOnce = true;
template <class params> __device__ void do_SMFFT_CT_DIT(double2 *s_input) {
double2 A_DFT_value, B_DFT_value, C_DFT_value, D_DFT_value;
double2 W;
double2 Aftemp, Bftemp, Cftemp, Dftemp;
int j, m_param;
int parity, itemp;
int A_read_index, B_read_index, C_read_index, D_read_index;
int PoT, PoTp1, q;
int local_id = threadIdx.x & (params::warp - 1);
int warp_id = threadIdx.x / params::warp;
A_DFT_value = s_input[local_id + (warp_id << 2) * params::warp];
B_DFT_value =
s_input[local_id + (warp_id << 2) * params::warp + params::warp];
C_DFT_value =
s_input[local_id + (warp_id << 2) * params::warp + 2 * params::warp];
D_DFT_value =
s_input[local_id + (warp_id << 2) * params::warp + 3 * params::warp];
switch (params::log2_degree) {
case 9:
reorder_512<params>(s_input, &A_DFT_value, &B_DFT_value, &C_DFT_value,
&D_DFT_value);
break;
case 10:
reorder_1024<params>(s_input, &A_DFT_value, &B_DFT_value, &C_DFT_value,
&D_DFT_value);
break;
// case 11:
// reorder_2048<params, opt>(s_input, &A_DFT_value, &B_DFT_value,
//&C_DFT_value, &D_DFT_value); break;
default:
break;
}
//----> FFT
PoT = 1;
PoTp1 = 2;
//--> First iteration
itemp = local_id & 1;
parity = (1 - itemp * 2);
A_DFT_value.x = parity * A_DFT_value.x + shfl_xor(&A_DFT_value.x, 1);
A_DFT_value.y = parity * A_DFT_value.y + shfl_xor(&A_DFT_value.y, 1);
B_DFT_value.x = parity * B_DFT_value.x + shfl_xor(&B_DFT_value.x, 1);
B_DFT_value.y = parity * B_DFT_value.y + shfl_xor(&B_DFT_value.y, 1);
C_DFT_value.x = parity * C_DFT_value.x + shfl_xor(&C_DFT_value.x, 1);
C_DFT_value.y = parity * C_DFT_value.y + shfl_xor(&C_DFT_value.y, 1);
D_DFT_value.x = parity * D_DFT_value.x + shfl_xor(&D_DFT_value.x, 1);
D_DFT_value.y = parity * D_DFT_value.y + shfl_xor(&D_DFT_value.y, 1);
//--> Second through Fifth iteration (no synchronization)
PoT = 2;
PoTp1 = 4;
for (q = 1; q < 5; q++) {
m_param = (local_id & (PoTp1 - 1));
itemp = m_param >> q;
parity = ((itemp << 1) - 1);
if (params::fft_direction)
W = Get_W_value_inverse((q - 1) * 257 + itemp * m_param);
else
W = _gTwiddles[(q - 1) * 257 + itemp * m_param];
Aftemp.x = W.x * A_DFT_value.x - W.y * A_DFT_value.y;
Aftemp.y = W.x * A_DFT_value.y + W.y * A_DFT_value.x;
Bftemp.x = W.x * B_DFT_value.x - W.y * B_DFT_value.y;
Bftemp.y = W.x * B_DFT_value.y + W.y * B_DFT_value.x;
Cftemp.x = W.x * C_DFT_value.x - W.y * C_DFT_value.y;
Cftemp.y = W.x * C_DFT_value.y + W.y * C_DFT_value.x;
Dftemp.x = W.x * D_DFT_value.x - W.y * D_DFT_value.y;
Dftemp.y = W.x * D_DFT_value.y + W.y * D_DFT_value.x;
A_DFT_value.x = Aftemp.x + parity * shfl_xor(&Aftemp.x, PoT);
A_DFT_value.y = Aftemp.y + parity * shfl_xor(&Aftemp.y, PoT);
B_DFT_value.x = Bftemp.x + parity * shfl_xor(&Bftemp.x, PoT);
B_DFT_value.y = Bftemp.y + parity * shfl_xor(&Bftemp.y, PoT);
C_DFT_value.x = Cftemp.x + parity * shfl_xor(&Cftemp.x, PoT);
C_DFT_value.y = Cftemp.y + parity * shfl_xor(&Cftemp.y, PoT);
D_DFT_value.x = Dftemp.x + parity * shfl_xor(&Dftemp.x, PoT);
D_DFT_value.y = Dftemp.y + parity * shfl_xor(&Dftemp.y, PoT);
PoT = PoT << 1;
PoTp1 = PoTp1 << 1;
}
itemp = local_id + (warp_id << 2) * params::warp;
s_input[itemp] = A_DFT_value;
s_input[itemp + params::warp] = B_DFT_value;
s_input[itemp + 2 * params::warp] = C_DFT_value;
s_input[itemp + 3 * params::warp] = D_DFT_value;
for (q = 5; q < (params::log2_degree - 1); q++) {
__syncthreads();
m_param = threadIdx.x & (PoT - 1);
j = threadIdx.x >> q;
if (params::fft_direction)
W = Get_W_value_inverse((q - 1) * 257 + m_param);
else
W = _gTwiddles[(q - 1) * 257 + m_param];
A_read_index = j * (PoTp1 << 1) + m_param;
B_read_index = j * (PoTp1 << 1) + m_param + PoT;
C_read_index = j * (PoTp1 << 1) + m_param + PoTp1;
D_read_index = j * (PoTp1 << 1) + m_param + 3 * PoT;
Aftemp = s_input[A_read_index];
Bftemp = s_input[B_read_index];
A_DFT_value.x = Aftemp.x + W.x * Bftemp.x - W.y * Bftemp.y;
A_DFT_value.y = Aftemp.y + W.x * Bftemp.y + W.y * Bftemp.x;
B_DFT_value.x = Aftemp.x - W.x * Bftemp.x + W.y * Bftemp.y;
B_DFT_value.y = Aftemp.y - W.x * Bftemp.y - W.y * Bftemp.x;
Cftemp = s_input[C_read_index];
Dftemp = s_input[D_read_index];
C_DFT_value.x = Cftemp.x + W.x * Dftemp.x - W.y * Dftemp.y;
C_DFT_value.y = Cftemp.y + W.x * Dftemp.y + W.y * Dftemp.x;
D_DFT_value.x = Cftemp.x - W.x * Dftemp.x + W.y * Dftemp.y;
D_DFT_value.y = Cftemp.y - W.x * Dftemp.y - W.y * Dftemp.x;
s_input[A_read_index] = A_DFT_value;
s_input[B_read_index] = B_DFT_value;
s_input[C_read_index] = C_DFT_value;
s_input[D_read_index] = D_DFT_value;
PoT = PoT << 1;
PoTp1 = PoTp1 << 1;
}
// last iteration
if (params::log2_degree > 6) {
__syncthreads();
m_param = threadIdx.x;
if (params::fft_direction)
W = Get_W_value_inverse((q - 1) * 257 + m_param);
else
W = _gTwiddles[(q - 1) * 257 + m_param];
A_read_index = m_param;
B_read_index = m_param + PoT;
C_read_index = m_param + (PoT >> 1);
D_read_index = m_param + 3 * (PoT >> 1);
Aftemp = s_input[A_read_index];
Bftemp = s_input[B_read_index];
A_DFT_value.x = Aftemp.x + W.x * Bftemp.x - W.y * Bftemp.y;
A_DFT_value.y = Aftemp.y + W.x * Bftemp.y + W.y * Bftemp.x;
B_DFT_value.x = Aftemp.x - W.x * Bftemp.x + W.y * Bftemp.y;
B_DFT_value.y = Aftemp.y - W.x * Bftemp.y - W.y * Bftemp.x;
Cftemp = s_input[C_read_index];
Dftemp = s_input[D_read_index];
C_DFT_value.x = Cftemp.x + W.y * Dftemp.x + W.x * Dftemp.y;
C_DFT_value.y = Cftemp.y + W.y * Dftemp.y - W.x * Dftemp.x;
D_DFT_value.x = Cftemp.x - W.y * Dftemp.x - W.x * Dftemp.y;
D_DFT_value.y = Cftemp.y - W.y * Dftemp.y + W.x * Dftemp.x;
s_input[A_read_index] = A_DFT_value;
s_input[B_read_index] = B_DFT_value;
s_input[C_read_index] = C_DFT_value;
s_input[D_read_index] = D_DFT_value;
}
}
template <class params>
__global__ void SMFFT_DIT_external(double2 *d_input, double2 *d_output) {
__syncthreads();
extern __shared__ double2 sharedmemBSK[];
double2 *s_input = sharedmemBSK;
int cTid = threadIdx.x * params::opt;
#pragma unroll
for (int i = 0; i < params::opt; i++) {
double2 tmp;
switch (params::degree) {
case 512:
tmp = INVERSE_TWIDDLES_512[cTid];
tmp.x *= params::degree;
tmp.y *= -params::degree;
break;
case 1024:
tmp = gTwiddles1024[cTid];
break;
default:
break;
}
d_input[blockIdx.x * params::degree + cTid] *= tmp;
cTid++;
}
__syncthreads();
s_input[threadIdx.x] = d_input[threadIdx.x + blockIdx.x * params::degree];
s_input[threadIdx.x + params::quarter] =
d_input[threadIdx.x + blockIdx.x * params::degree + params::quarter];
s_input[threadIdx.x + params::half] =
d_input[threadIdx.x + blockIdx.x * params::degree + params::half];
s_input[threadIdx.x + params::three_quarters] =
d_input[threadIdx.x + blockIdx.x * params::degree +
params::three_quarters];
__syncthreads();
do_SMFFT_CT_DIT<params>(s_input);
if (threadIdx.x == 0 && blockIdx.x == 0) {
for (int i = 0; i < 1024; i++)
printf("smfft[%u] %.10f %.10f\n", i, s_input[i].x, s_input[i].y);
}
__syncthreads();
__syncthreads();
d_output[threadIdx.x + blockIdx.x * params::degree] = s_input[threadIdx.x];
d_output[threadIdx.x + blockIdx.x * params::degree + params::quarter] =
s_input[threadIdx.x + params::quarter];
d_output[threadIdx.x + blockIdx.x * params::degree + params::half] =
s_input[threadIdx.x + params::half];
d_output[threadIdx.x + blockIdx.x * params::degree + params::three_quarters] =
s_input[threadIdx.x + params::three_quarters];
__syncthreads();
}
#endif // GPU_BOOTSTRAP_SMFFT_CUH
*/

8204
src/fft/twiddles.cu Normal file

File diff suppressed because it is too large

28
src/fft/twiddles.cuh Normal file
View File

@@ -0,0 +1,28 @@
#ifndef GPU_BOOTSTRAP_TWIDDLES_CUH
#define GPU_BOOTSTRAP_TWIDDLES_CUH
// TODO (Agnes) depending on the device architecture
// can we make more of them __constant__?
// Do we have to define them all regardless of the
// polynomial degree and q values?
// TODO (Beka) make those two arrays dynamically sized,
// or find the exact maximum; for a polynomial of length 8192
// it should be less than 2048
extern __constant__ short SW1[2048];
extern __constant__ short SW2[2048];
extern __constant__ double2 negTwids3[4];
extern __constant__ double2 negTwids4[8];
extern __constant__ double2 negTwids5[16];
extern __constant__ double2 negTwids6[32];
extern __constant__ double2 negTwids7[64];
extern __constant__ double2 negTwids8[128];
extern __constant__ double2 negTwids9[256];
extern __constant__ double2 negTwids10[512];
extern __constant__ double2 negTwids11[1024];
extern __device__ double2 negTwids12[2048];
extern __device__ double2 negTwids13[4096];
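// Stage k of the negacyclic FFT reads negTwids<k>[tid & (2^(k-1) - 1)],
// so table k needs exactly 2^(k-1) entries, which is what the sizes
// above encode. A compile-time restatement of that rule (illustrative):
constexpr int neg_twiddle_count(int stage) { return 1 << (stage - 1); }
static_assert(neg_twiddle_count(3) == 4 && neg_twiddle_count(12) == 2048,
              "negTwids<k> holds 2^(k-1) twiddles");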
#endif

55
src/keyswitch.cu Normal file
View File

@@ -0,0 +1,55 @@
#include "keyswitch.cuh"
#include "keyswitch.h"
#include "polynomial/parameters.cuh"
#include <cstdint>
/* Perform keyswitch on a batch of input LWE ciphertexts for 32 bits
*
* - lwe_out: output batch of num_samples keyswitched ciphertexts c =
* (a0,..an-1,b) where n is the LWE dimension
* - lwe_in: input batch of num_samples LWE ciphertexts, containing n
* mask values + 1 body value
*
* This function calls a wrapper to a device kernel that performs the keyswitch
* - num_samples blocks of threads are launched
*/
void cuda_keyswitch_lwe_ciphertext_vector_32(void *v_stream, void *lwe_out, void *lwe_in,
void *ksk,
uint32_t lwe_dimension_before,
uint32_t lwe_dimension_after,
uint32_t base_log, uint32_t l_gadget,
uint32_t num_samples) {
cuda_keyswitch_lwe_ciphertext_vector(
v_stream, static_cast<uint32_t *>(lwe_out), static_cast<uint32_t *>(lwe_in),
static_cast<uint32_t*>(ksk),
lwe_dimension_before, lwe_dimension_after,
base_log, l_gadget,
num_samples);
}
/* Perform keyswitch on a batch of input LWE ciphertexts for 64 bits
*
* - lwe_out: output batch of num_samples keyswitched ciphertexts c =
* (a0,..an-1,b) where n is the LWE dimension
* - lwe_in: input batch of num_samples LWE ciphertexts, containing n
* mask values + 1 body value
*
* This function calls a wrapper to a device kernel that performs the keyswitch
* - num_samples blocks of threads are launched
*/
void cuda_keyswitch_lwe_ciphertext_vector_64(void *v_stream, void *lwe_out, void *lwe_in,
void *ksk,
uint32_t lwe_dimension_before,
uint32_t lwe_dimension_after,
uint32_t base_log, uint32_t l_gadget,
uint32_t num_samples) {
cuda_keyswitch_lwe_ciphertext_vector(
v_stream, static_cast<uint64_t *>(lwe_out), static_cast<uint64_t *> (lwe_in),
static_cast<uint64_t*>(ksk),
lwe_dimension_before, lwe_dimension_after,
base_log, l_gadget,
num_samples);
}
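/* Usage sketch (illustrative: buffer allocation and the host-to-device
 * copies are assumed to be handled elsewhere, e.g. by the Rust wrapper,
 * and the parameter values below are placeholders):
 */
void example_keyswitch_64(void *d_lwe_out, void *d_lwe_in, void *d_ksk,
                          uint32_t num_samples) {
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  uint32_t lwe_dimension_before = 1024; // placeholder
  uint32_t lwe_dimension_after = 630;   // placeholder
  uint32_t base_log = 4, l_gadget = 3;  // placeholder decomposition
  cuda_keyswitch_lwe_ciphertext_vector_64(
      &stream, d_lwe_out, d_lwe_in, d_ksk, lwe_dimension_before,
      lwe_dimension_after, base_log, l_gadget, num_samples);
  // the wrapped call synchronizes the stream before returning
  cudaStreamDestroy(stream);
}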

146
src/keyswitch.cuh Normal file
View File

@@ -0,0 +1,146 @@
#ifndef CNCRT_KS_H
#define CNCRT_KS_H
#include "crypto/gadget.cuh"
#include "crypto/torus.cuh"
#include "polynomial/polynomial.cuh"
#include <thread>
#include <vector>
template <typename Torus>
__device__ Torus *get_ith_block(Torus *ksk, int i, int level,
uint32_t lwe_dimension_after,
uint32_t l_gadget) {
int pos = i * l_gadget * (lwe_dimension_after + 1) +
level * (lwe_dimension_after + 1);
Torus *ptr = &ksk[pos];
return ptr;
}
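// Layout assumed by get_ith_block: the key switching key is a flat array
// of lwe_dimension_before * l_gadget blocks of (lwe_dimension_after + 1)
// elements each, so block (i, level) starts at
// (i * l_gadget + level) * (lwe_dimension_after + 1). Compile-time check
// with placeholder sizes (illustrative):
constexpr int ksk_block_offset(int i, int level, int lwe_dimension_after,
                               int l_gadget) {
  return i * l_gadget * (lwe_dimension_after + 1) +
         level * (lwe_dimension_after + 1);
}
static_assert(ksk_block_offset(2, 1, 630, 3) == 7 * 631,
              "block (i, level) starts at (i*l_gadget + level)*(n_out+1)");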
/*
* keyswitch kernel
* Each thread handles a piece of the following equation:
* $$\mathrm{GLWE}_{s_2}(\Delta m + e) = (0, 0, \dots, 0, b)
*   - \sum_{i=0}^{k-1} \langle \mathrm{Dec}(a_i),
*     (\mathrm{GLWE}_{s_2}(s_{1,i}\, q/\beta), \dots,
*      \mathrm{GLWE}_{s_2}(s_{1,i}\, q/\beta^l)) \rangle$$
* where k is the dimension of the GLWE ciphertext. If the polynomial
* dimension in GLWE is > 1, this equation is solved for each polynomial
* coefficient. Dec denotes the decomposition with base beta and l levels;
* the inner product is taken between the decomposition of a_i and the l
* GLWE encryptions of s_{1,i} q/\beta^j, with j in [1, l]. We obtain a
* GLWE encryption of \Delta m (with \Delta the scaling factor) under key
* s_2 instead of s_1, with an increased noise.
*
*/
template <typename Torus>
__global__ void keyswitch(Torus *lwe_out, Torus *lwe_in,
Torus *ksk,
uint32_t lwe_dimension_before,
uint32_t lwe_dimension_after,
uint32_t base_log,
uint32_t l_gadget,
int lwe_lower, int lwe_upper, int cutoff) {
int tid = threadIdx.x;
extern __shared__ char sharedmem[];
Torus *local_lwe_out = (Torus *)sharedmem;
auto block_lwe_in =
get_chunk(lwe_in, blockIdx.x, lwe_dimension_before + 1);
auto block_lwe_out =
get_chunk(lwe_out, blockIdx.x, lwe_dimension_after + 1);
auto gadget = GadgetMatrixSingle<Torus>(base_log, l_gadget);
int lwe_part_per_thd;
if (tid < cutoff) {
lwe_part_per_thd = lwe_upper;
} else {
lwe_part_per_thd = lwe_lower;
}
__syncthreads();
for (int k = 0; k < lwe_part_per_thd; k++) {
int idx = tid + k * blockDim.x;
local_lwe_out[idx] = 0;
}
if (tid == 0) {
local_lwe_out[lwe_dimension_after] =
block_lwe_in[lwe_dimension_before];
}
for (int i = 0; i < lwe_dimension_before; i++) {
__syncthreads();
Torus a_i = round_to_closest_multiple(block_lwe_in[i], base_log,
l_gadget);
for (int j = 0; j < l_gadget; j++) {
auto ksk_block = get_ith_block(ksk, i, j, lwe_dimension_after, l_gadget);
Torus decomposed = gadget.decompose_one_level_single(a_i, (uint32_t)j);
for (int k = 0; k < lwe_part_per_thd; k++) {
int idx = tid + k * blockDim.x;
local_lwe_out[idx] -= (Torus)ksk_block[idx] * decomposed;
}
}
}
for (int k = 0; k < lwe_part_per_thd; k++) {
int idx = tid + k * blockDim.x;
block_lwe_out[idx] = local_lwe_out[idx];
}
}
/// assumes lwe_in is already on the GPU
template <typename Torus>
__host__ void cuda_keyswitch_lwe_ciphertext_vector(void *v_stream, Torus *lwe_out, Torus *lwe_in,
Torus *ksk,
uint32_t lwe_dimension_before,
uint32_t lwe_dimension_after,
uint32_t base_log,
uint32_t l_gadget,
uint32_t num_samples) {
constexpr int ideal_threads = 128;
int lwe_dim = lwe_dimension_after + 1;
int lwe_lower, lwe_upper, cutoff;
if (lwe_dim % ideal_threads == 0) {
lwe_lower = lwe_dim / ideal_threads;
lwe_upper = lwe_dim / ideal_threads;
cutoff = 0;
} else {
int y =
ceil((double)lwe_dim / (double)ideal_threads) * ideal_threads - lwe_dim;
cutoff = ideal_threads - y;
lwe_lower = lwe_dim / ideal_threads;
lwe_upper = (int)ceil((double)lwe_dim / (double)ideal_threads);
}
// int lwe_size_before =
// (lwe_dimension_before + 1) * num_samples;
int lwe_size_after =
(lwe_dimension_after + 1) * num_samples;
int shared_mem =
sizeof(Torus) * (lwe_dimension_after + 1);
cudaMemset(lwe_out, 0, sizeof(Torus) * lwe_size_after);
dim3 grid(num_samples, 1, 1);
dim3 threads(ideal_threads, 1, 1);
cudaFuncSetAttribute(keyswitch<Torus>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_mem);
auto stream = static_cast<cudaStream_t *>(v_stream);
keyswitch<<<grid, threads, shared_mem, *stream>>>(
lwe_out, lwe_in, ksk, lwe_dimension_before, lwe_dimension_after, base_log,
l_gadget, lwe_lower, lwe_upper, cutoff);
cudaStreamSynchronize(*stream);
}
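// Worked example of the partition computed above (sizes illustrative):
// for lwe_dimension_after = 600, lwe_dim = 601 and ideal_threads = 128,
// y = ceil(601/128)*128 - 601 = 39, so cutoff = 89, lwe_lower = 4 and
// lwe_upper = 5: the first 89 threads handle 5 elements each and the
// remaining 39 handle 4, covering the whole output ciphertext.
static_assert(89 * 5 + 39 * 4 == 601,
              "cutoff partition covers all lwe_dim elements");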
#endif

View File

@@ -0,0 +1,261 @@
#ifndef GPU_POLYNOMIAL_FUNCTIONS
#define GPU_POLYNOMIAL_FUNCTIONS
#include "utils/memory.cuh"
#include "utils/timer.cuh"
/*
* function compresses decomposed buffer into half size complex buffer for fft
*/
template <typename T, class params>
__device__ void real_to_complex_compressed(T *src, double2 *dst) {
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
dst[tid].x = (double)src[2 * tid];
dst[tid].y = (double)src[2 * tid + 1];
tid += params::degree / params::opt;
}
}
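/* The compression packs the real coefficients c_0..c_{n-1} into n/2
 * complex values (c_{2t} + i*c_{2t+1}); the correction step in
 * fft/bnsmfft.cuh then recovers the full negacyclic transform from the
 * half-size FFT. Single-threaded host equivalent (illustrative only):
 */
template <typename T>
void real_to_complex_compressed_host_ref(const T *src, double2 *dst, int n) {
  for (int t = 0; t < n / 2; t++) {
    dst[t].x = (double)src[2 * t];
    dst[t].y = (double)src[2 * t + 1];
  }
}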
/*
* copy source polynomial to specific slice of batched polynomials
* used only in low latency version
*/
template <typename T, class params>
__device__ void copy_into_ith_polynomial_low_lat(T *source, T *dst, int i) {
int tid = threadIdx.x;
int begin = i * (params::degree / 2 + 1);
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
dst[tid + begin] = source[tid];
tid = tid + params::degree / params::opt;
}
if (threadIdx.x == 0) {
dst[params::degree / 2 + begin] = source[params::degree / 2];
}
}
/*
* accumulates source polynomial into specific slice of batched polynomial
* used only in low latency version
*/
template <typename T, class params>
__device__ void add_polynomial_inplace_low_lat(T *source, T *dst, int p_id) {
int tid = threadIdx.x;
int begin = p_id * (params::degree / 2 + 1);
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
dst[tid] += source[tid + begin];
tid = tid + params::degree / params::opt;
}
if (threadIdx.x == 0) {
dst[params::degree / 2] += source[params::degree / 2 + begin];
}
}
/*
* Computes acc = input / X^j (i.e. multiplication by X^{-j} in the
* negacyclic ring) if zeroAcc = false
* Computes acc = 0 if zeroAcc = true; reads input, writes accumulator.
*/
template <typename T, int elems_per_thread, int block_size>
__device__ void divide_by_monomial_negacyclic_inplace(T *accumulator, T *input,
uint32_t j,
bool zeroAcc) {
int tid = threadIdx.x;
constexpr int degree = block_size * elems_per_thread;
if (zeroAcc) {
for (int i = 0; i < elems_per_thread; i++) {
accumulator[tid] = 0;
tid += block_size;
}
} else {
tid = threadIdx.x;
for (int i = 0; i < elems_per_thread; i++) {
if (j < degree) {
if (tid < degree - j) {
accumulator[tid] = input[tid + j];
} else {
accumulator[tid] = -input[tid - degree + j];
}
} else {
uint32_t jj = j - degree;
if (tid < degree - jj) {
accumulator[tid] = -input[tid + jj];
} else {
accumulator[tid] = input[tid - degree + jj];
}
}
tid += block_size;
}
}
}
/*
* Computes result_acc = acc * X^j - acc
* reads acc and writes the rotated difference into result_acc
*/
template <typename T, int elems_per_thread, int block_size>
__device__ void
multiply_by_monomial_negacyclic_and_sub_polynomial(T *acc, T *result_acc,
uint32_t j) {
int tid = threadIdx.x;
constexpr int degree = block_size * elems_per_thread;
for (int i = 0; i < elems_per_thread; i++) {
if (j < degree) {
if (tid < j) {
result_acc[tid] = -acc[tid - j + degree] - acc[tid];
} else {
result_acc[tid] = acc[tid - j] - acc[tid];
}
} else {
uint32_t jj = j - degree;
if (tid < jj) {
result_acc[tid] = acc[tid - jj + degree] - acc[tid];
} else {
result_acc[tid] = -acc[tid - jj] - acc[tid];
}
}
tid += block_size;
}
}
/*
* performs a rounding to increase accuracy of the PBS
* calculates inplace.
*/
template <typename T, int elems_per_thread, int block_size>
__device__ void round_to_closest_multiple_inplace(T *rotated_acc, int base_log,
int l_gadget) {
int tid = threadIdx.x;
for (int i = 0; i < elems_per_thread; i++) {
T x_acc = rotated_acc[tid];
T shift = sizeof(T) * 8 - l_gadget * base_log;
T mask = 1ll << (shift - 1);
T b_acc = (x_acc & mask) >> (shift - 1);
T res_acc = x_acc >> shift;
res_acc += b_acc;
res_acc <<= shift;
rotated_acc[tid] = res_acc;
tid = tid + block_size;
}
}
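// Worked example of the rounding above (T = 64-bit torus, base_log = 4,
// l_gadget = 3, all placeholder values): shift = 64 - 12 = 52, so each
// coefficient is rounded to the nearest multiple of 2^52 by adding the
// first dropped bit back in. Host reference (illustrative):
template <typename T>
constexpr T round_to_closest_multiple_ref(T x, int base_log, int l_gadget) {
  T shift = sizeof(T) * 8 - l_gadget * base_log;
  T b = (x >> (shift - 1)) & (T)1; // rounding bit
  return ((x >> shift) + b) << shift;
}
static_assert(round_to_closest_multiple_ref(
                  (5ull << 52) + (1ull << 51), 4, 3) == (6ull << 52),
              "rounds up when the dropped part is >= 2^(shift-1)");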
template <typename Torus, class params>
__device__ void add_to_torus(double2 *m_values, Torus *result) {
Torus mx = (sizeof(Torus) == 4) ? UINT32_MAX : UINT64_MAX;
int tid = threadIdx.x;
#pragma unroll
// TODO (Beka) check if better memory access is possible
for (int i = 0; i < params::opt / 2; i++) {
double v1 = m_values[tid].x;
double v2 = m_values[tid].y;
double frac = v1 - floor(v1);
frac *= mx;
double carry = frac - floor(frac);
double zero_point_five = 1. / 2.;
if (carry >= zero_point_five)
frac++;
Torus V1 = typecast_double_to_torus<Torus>(frac);
frac = v2 - floor(v2);
frac *= mx;
carry = frac - floor(frac);
if (carry >= zero_point_five)
frac++;
Torus V2 = typecast_double_to_torus<Torus>(frac);
result[tid * 2] += V1;
result[tid * 2 + 1] += V2;
tid = tid + params::degree / params::opt;
}
}
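// Host reference of the double -> torus mapping performed above
// (illustrative; assumes <cmath>/<cstdint> on the host, while the device
// version uses typecast_double_to_torus and an explicit carry): the
// fractional part of the value is scaled to the torus and rounded.
inline uint32_t double_to_torus32_ref(double v) {
  double frac = v - floor(v);    // fractional part in [0, 1)
  frac *= (double)UINT32_MAX;    // scale to the 32-bit torus
  return (uint32_t)(frac + 0.5); // round to nearest
}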
template <typename Torus, class params>
__device__ void sample_extract_body(Torus *lwe_out, Torus *accumulator) {
// Set first coefficient of the accumulator as the body of the LWE sample
// todo(Joao): not every thread needs to set it
// if (threadIdx.x == 0)
lwe_out[params::degree] = accumulator[0];
}
template <typename Torus, class params>
__device__ void sample_extract_mask(Torus *lwe_out, Torus *accumulator) {
// Set ACC = -ACC
// accumulator.negate_inplace();
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt; i++) {
accumulator[tid] = -accumulator[tid];
tid = tid + params::degree / params::opt;
}
synchronize_threads_in_block();
// Reverse the accumulator
// accumulator.reverse_inplace();
tid = threadIdx.x;
Torus result[params::opt];
#pragma unroll
for (int i = 0; i < params::opt; i++) {
result[i] = accumulator[params::degree - tid - 1];
tid = tid + params::degree / params::opt;
}
synchronize_threads_in_block();
tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt; i++) {
accumulator[tid] = result[i];
tid = tid + params::degree / params::opt;
}
synchronize_threads_in_block();
// Perform ACC * X
// accumulator.multiply_by_monomial_negacyclic_inplace(1);
tid = threadIdx.x;
// reuse the result[] register buffer declared above
for (int i = 0; i < params::opt; i++) {
if (1 < params::degree) {
if (tid < 1)
result[i] = -accumulator[tid - 1 + params::degree];
else
result[i] = accumulator[tid - 1];
} else {
uint32_t jj = 1 - (uint32_t)params::degree;
if (tid < jj)
result[i] = accumulator[tid - jj + params::degree];
else
result[i] = -accumulator[tid - jj];
}
tid += params::degree / params::opt;
}
synchronize_threads_in_block();
tid = threadIdx.x;
for (int i = 0; i < params::opt; i++) {
accumulator[tid] = result[i];
tid += params::degree / params::opt;
}
synchronize_threads_in_block();
// Copy to the mask of the LWE sample
// accumulator.copy_into(lwe_out);
tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt; i++) {
lwe_out[tid] = accumulator[tid];
tid = tid + params::degree / params::opt;
}
}
#endif

View File

@@ -0,0 +1,78 @@
#ifndef CNCRT_PARAMETERS_H
#define CNCRT_PARAMETERS_H
constexpr int log2(int n) { return (n <= 2) ? 1 : 1 + log2(n / 2); }
constexpr int choose_opt(int degree) {
if (degree <= 1024)
return 4;
else if (degree == 2048)
return 8;
else if (degree == 4096)
return 16;
else
return 32;
}
template <class params> class HalfDegree {
public:
constexpr static int degree = params::degree / 2;
constexpr static int opt = params::opt / 2;
constexpr static int log2_degree = params::log2_degree - 1;
constexpr static int quarter = params::quarter / 2;
constexpr static int half = params::half / 2;
constexpr static int three_quarters = quarter + half;
constexpr static int warp = 32;
constexpr static int fft_sm_required = degree + degree / warp;
};
template <int N> class Degree {
public:
constexpr static int degree = N;
constexpr static int opt = choose_opt(N);
constexpr static int log2_degree = log2(N);
constexpr static int quarter = N / 4;
constexpr static int half = N / 2;
constexpr static int three_quarters = half + quarter;
constexpr static int warp = 32;
constexpr static int fft_sm_required = N + 32;
};
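// Example instantiation (follows directly from the definitions above):
// Degree<1024> has opt = 4 and log2_degree = 10, so kernels specialized
// on it run degree/opt = 256 threads per block, each owning 4
// coefficients (2 coefficient pairs in the FFT loops).
static_assert(Degree<1024>::opt == 4, "choose_opt(1024) == 4");
static_assert(Degree<1024>::log2_degree == 10, "log2(1024) == 10");
static_assert(Degree<1024>::degree / Degree<1024>::opt == 256,
              "threads per block");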
enum sharedMemDegree {
NOSM = 0,
PARTIALSM = 1,
FULLSM = 2
};
class ForwardFFT {
public:
constexpr static int direction = 0;
};
class BackwardFFT {
public:
constexpr static int direction = 1;
};
class ReorderFFT {
public:
  constexpr static int reorder = 1;
};
class NoReorderFFT {
public:
  constexpr static int reorder = 0;
};
template <class params, class direction, class reorder = ReorderFFT>
class FFTDegree : public params {
public:
constexpr static int fft_direction = direction::direction;
constexpr static int fft_reorder = reorder::reorder;
};
template <int N, class direction, class reorder = ReorderFFT>
class FFTParams : public Degree<N> {
public:
constexpr static int fft_direction = direction::direction;
constexpr static int fft_reorder = reorder::reorder;
};
#endif // CNCRT_PARAMETERS_H

View File

@@ -0,0 +1,668 @@
#ifndef CNCRT_POLYNOMIAL_H
#define CNCRT_POLYNOMIAL_H
#include "complex/operations.cuh"
#include "crypto/torus.cuh"
#include "fft/bnsmfft.cuh"
#include "fft/smfft.cuh"
#include "parameters.cuh"
#include "utils/memory.cuh"
#include "utils/timer.cuh"
#include <cassert>
#include <cstdint>
#define PI 3.141592653589793238462643383279502884197
template <typename T>
__device__ T *get_chunk(T *data, int chunk_num, int chunk_size) {
int pos = chunk_num * chunk_size;
T *ptr = &data[pos];
return ptr;
}
class ExtraMemory {
public:
uint32_t m_size;
__device__ ExtraMemory(uint32_t size) : m_size(size) {}
};
template <typename T, class params> class PolynomialFourier;
template <typename T, class params> class Polynomial;
template <typename T, class params> class Vector;
template <typename FT, class params> class Twiddles;
template <typename T, class params> class VectorPolynomial {
public:
T *m_data;
uint32_t m_num_polynomials;
__device__ VectorPolynomial(T *data, uint32_t num_polynomials)
: m_data(data), m_num_polynomials(num_polynomials) {}
__device__ VectorPolynomial(SharedMemory &shmem, uint32_t num_polynomials)
: m_num_polynomials(num_polynomials) {
shmem.get_allocation(&m_data, m_num_polynomials * params::degree);
}
__device__ VectorPolynomial<T, params> get_chunk(int chunk_num,
int chunk_size) {
int pos = chunk_num * chunk_size;
T *ptr = &m_data[pos];
// todo(Joao): unsafe, user must pass chunk that has size multiple of
// polynomial degree
return VectorPolynomial<T, params>(ptr, chunk_size / params::degree);
}
__host__ VectorPolynomial() {}
__host__ VectorPolynomial(DeviceMemory &dmem, uint32_t num_polynomials,
int device)
: m_num_polynomials(num_polynomials) {
dmem.get_allocation(&m_data, m_num_polynomials * params::degree, device);
}
__host__ VectorPolynomial(DeviceMemory &dmem, T *source,
uint32_t num_polynomials, int device)
: m_num_polynomials(num_polynomials) {
dmem.get_allocation_and_copy_async(
&m_data, source, m_num_polynomials * params::degree, device);
}
__host__ void copy_to_host(T *dest) {
cudaMemcpyAsync(dest, m_data,
sizeof(T) * m_num_polynomials * params::degree,
cudaMemcpyDeviceToHost);
}
__device__ void copy_into(Polynomial<T, params> &dest,
int polynomial_number = 0) {
int tid = threadIdx.x;
int begin = polynomial_number * params::degree;
#pragma unroll
for (int i = 0; i < params::opt; i++) {
dest.coefficients[tid] = m_data[tid + begin];
tid = tid + params::degree / params::opt;
}
synchronize_threads_in_block();
}
// todo(Joao): we need to make these APIs more clear, as it's confusing what's
// being copied where
__device__ void copy_into_ith_polynomial(PolynomialFourier<T, params> &source,
int i) {
int tid = threadIdx.x;
int begin = i * (params::degree / 2 + 1);
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
this->m_data[tid + begin] = source.m_values[tid];
tid = tid + params::degree / params::opt;
}
if (threadIdx.x == 0) {
this->m_data[params::degree / 2 + begin] =
source.m_values[params::degree / 2];
}
}
__device__ void split_into_polynomials(Polynomial<T, params> &first,
Polynomial<T, params> &second) {
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt; i++) {
first.coefficients[tid] = m_data[tid];
second.coefficients[tid] = m_data[tid + params::degree];
tid = tid + params::degree / params::opt;
}
}
};
template <typename T, class params> class PolynomialFourier {
public:
T *m_values;
uint32_t degree;
__device__ __host__ PolynomialFourier(T *m_values) : m_values(m_values) {}
__device__ PolynomialFourier(SharedMemory &shmem) : degree(params::degree) {
shmem.get_allocation(&this->m_values, params::degree);
}
__device__ PolynomialFourier(SharedMemory &shmem, ExtraMemory extra_memory)
: degree(params::degree) {
shmem.get_allocation(&this->m_values, params::degree + extra_memory.m_size);
}
__device__ PolynomialFourier(SharedMemory &shmem, uint32_t degree)
: degree(degree) {
shmem.get_allocation(&this->m_values, degree);
}
__host__ PolynomialFourier(DeviceMemory &dmem, int device)
    : degree(params::degree) {
dmem.get_allocation(&this->m_values, params::degree, device);
}
__device__ char *reuse_memory() { return (char *)m_values; }
__device__ void copy_from(PolynomialFourier<T, params> &source, int begin) {
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt; i++) {
this->m_values[tid + begin] = source.m_values[tid];
tid = tid + params::degree / params::opt;
}
}
__device__ void fill_with(T value) {
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt; i++) {
m_values[tid] = value;
tid += params::degree / params::opt;
}
}
/*
__device__ void add_polynomial_inplace(PolynomialFourier<T, params> &source,
int begin) {
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
this->m_values[tid] += source.m_values[tid + begin];
tid = tid + params::degree / params::opt;
}
if (threadIdx.x == 0) {
this->m_values[params::degree / 2] += source.m_values[params::degree / 2 +
begin];
}
}
*/
__device__ void swap_quarters_inplace() {
int tid = threadIdx.x;
int s1 = params::quarter;
int s2 = params::three_quarters;
T tmp = m_values[s2 + tid];
m_values[s2 + tid] = m_values[s1 + tid];
m_values[s1 + tid] = tmp;
}
__device__ void add_polynomial_inplace(VectorPolynomial<T, params> &source,
int polynomial_number) {
int tid = threadIdx.x;
int begin = polynomial_number * (params::degree / 2 + 1);
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
this->m_values[tid] += source.m_data[tid + begin];
tid = tid + params::degree / params::opt;
}
if (threadIdx.x == 0) {
this->m_values[params::degree / 2] +=
source.m_data[params::degree / 2 + begin];
}
}
__device__ void
forward_negacyclic_fft_inplace(PolynomialFourier<double2, params> &X) {
// TODO function should be removed
}
__device__ void inverse_negacyclic_fft_inplace() {
// TODO function should be removed
}
template <typename Torus>
__device__ void add_to_torus(Polynomial<Torus, params> &result) {
// TODO function should be removed
}
__device__ T &operator[](int i) { return m_values[i]; }
};
template <typename T, class params> class Polynomial {
public:
T *coefficients;
uint32_t degree;
__device__ Polynomial(T *coefficients, uint32_t degree)
: coefficients(coefficients), degree(degree) {}
__device__ Polynomial(char *memory, uint32_t degree)
: coefficients((T *)memory), degree(degree) {}
__device__ Polynomial(SharedMemory &shmem, uint32_t degree) : degree(degree) {
shmem.get_allocation(&this->coefficients, degree);
}
__host__ Polynomial(DeviceMemory &dmem, uint32_t degree, int device)
: degree(degree) {
dmem.get_allocation(&this->coefficients, params::degree, device);
}
__host__ Polynomial(DeviceMemory &dmem, T *source, uint32_t degree,
int device)
: degree(degree) {
dmem.get_allocation_and_copy_async(&this->coefficients, source,
params::degree, device);
}
__host__ void copy_to_host(T *dest) {
cudaMemcpyAsync(dest, this->coefficients, sizeof(T) * params::degree,
cudaMemcpyDeviceToHost);
}
__device__ T get_coefficient(int i) { return this->coefficients[i]; }
__device__ char *reuse_memory() { return (char *)coefficients; }
__device__ void copy_coefficients_from(Polynomial<T, params> &source,
int begin_dest = 0,
int begin_src = 0) {
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt; i++) {
this->coefficients[tid + begin_dest] = source.coefficients[tid];
tid = tid + params::degree / params::opt;
}
}
__device__ void fill_with(T value) {
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt; i++) {
coefficients[tid] = value;
tid += params::degree / params::opt;
}
}
__device__ void multiply_by_monomial_negacyclic(Polynomial<T, params> &result,
uint32_t j) {
int tid = threadIdx.x;
for (int i = 0; i < params::opt; i++) {
if (j < params::degree) {
if (tid < j)
result.coefficients[tid] =
-this->coefficients[tid - j + params::degree];
else
result.coefficients[tid] = this->coefficients[tid - j];
} else {
uint32_t jj = j - params::degree;
if (tid < jj)
result.coefficients[tid] =
this->coefficients[tid - jj + params::degree];
else
result.coefficients[tid] = -this->coefficients[tid - jj];
}
tid += params::degree / params::opt;
}
}
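// Concretely, in Z[X]/(X^N + 1) we have X^N = -1, so for j < N the
// coefficient t of X^j * p is p[t - j] when t >= j and -p[t - j + N]
// when t < j; for j in [N, 2N) all signs flip. E.g. with N = 4:
// X * (a + b*X + c*X^2 + d*X^3) = -d + a*X + b*X^2 + c*X^3.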
__device__ void multiply_by_monomial_negacyclic_inplace(uint32_t j) {
int tid = threadIdx.x;
T result[params::opt];
for (int i = 0; i < params::opt; i++) {
if (j < params::degree) {
if (tid < j)
result[i] = -this->coefficients[tid - j + params::degree];
else
result[i] = this->coefficients[tid - j];
} else {
uint32_t jj = j - params::degree;
if (tid < jj)
result[i] = this->coefficients[tid - jj + params::degree];
else
result[i] = -this->coefficients[tid - jj];
}
tid += params::degree / params::opt;
}
synchronize_threads_in_block();
tid = threadIdx.x;
for (int i = 0; i < params::opt; i++) {
coefficients[tid] = result[i];
tid += params::degree / params::opt;
}
synchronize_threads_in_block();
}
__device__ void multiply_by_monomial_negacyclic_and_sub_polynomial(
Polynomial<T, params> &result, uint32_t j) {
int tid = threadIdx.x;
for (int i = 0; i < params::opt; i++) {
if (j < params::degree) {
if (tid < j)
result.coefficients[tid] =
-this->coefficients[tid - j + params::degree] -
this->coefficients[tid];
else
result.coefficients[tid] =
this->coefficients[tid - j] - this->coefficients[tid];
} else {
uint32_t jj = j - params::degree;
if (tid < jj)
result.coefficients[tid] =
this->coefficients[tid - jj + params::degree] -
this->coefficients[tid];
else
result.coefficients[tid] =
-this->coefficients[tid - jj] - this->coefficients[tid];
}
tid += params::degree / params::opt;
}
}
__device__ void divide_by_monomial_negacyclic(Polynomial<T, params> &result,
uint32_t j) {
int tid = threadIdx.x;
for (int i = 0; i < params::opt; i++) {
if (j < params::degree) {
if (tid < params::degree - j) {
result.coefficients[tid] = this->coefficients[tid + j];
} else {
result.coefficients[tid] =
-this->coefficients[tid - params::degree + j];
}
} else {
uint32_t jj = j - params::degree;
if (tid < params::degree - jj) {
result.coefficients[tid] = -this->coefficients[tid + jj];
} else {
result.coefficients[tid] =
this->coefficients[tid - params::degree + jj];
}
}
tid += params::degree / params::opt;
}
}
__device__ void divide_by_monomial_negacyclic_inplace(uint32_t j) {
int tid = threadIdx.x;
T result[params::opt];
for (int i = 0; i < params::opt; i++) {
if (j < params::degree) {
if (tid < params::degree - j) {
result[i] = this->coefficients[tid + j];
} else {
result[i] = -this->coefficients[tid - params::degree + j];
}
} else {
uint32_t jj = j - params::degree;
if (tid < params::degree - jj) {
result[i] = -this->coefficients[tid + jj];
} else {
result[i] = this->coefficients[tid - params::degree + jj];
}
}
tid += params::degree / params::opt;
}
tid = threadIdx.x;
for (int i = 0; i < params::opt; i++) {
coefficients[tid] = result[i];
tid = tid + params::degree / params::opt;
}
}
__device__ void round_to_closest_multiple_inplace(uint32_t base_log,
uint32_t l_gadget) {
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt; i++) {
T x = coefficients[tid];
T shift = sizeof(T) * 8 - l_gadget * base_log;
T mask = 1ll << (shift - 1);
T b = (x & mask) >> (shift - 1);
T res = x >> shift;
res += b;
res <<= shift;
coefficients[tid] = res;
tid = tid + params::degree / params::opt;
}
}
__device__ void
to_complex_compressed(PolynomialFourier<double2, params> &dest) {
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
dest.m_values[tid].x = (double)coefficients[2 * tid];
dest.m_values[tid].y = (double)coefficients[2 * tid + 1];
tid += params::degree / params::opt;
}
}
__device__ void to_complex(PolynomialFourier<double2, params> &dest) {
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt; i++) {
dest.m_values[tid].x = (double)coefficients[tid];
dest.m_values[tid].y = 0.0;
tid += params::degree / params::opt;
}
}
__device__ void multiply_by_scalar_inplace(T scalar) {
int tid = threadIdx.x;
const int grid_dim = blockDim.x;
const int slices = params::degree / grid_dim;
const int jump = grid_dim;
for (int i = 0; i < slices; i++) {
this->coefficients[tid] *= scalar;
tid += jump;
}
}
__device__ void add_scalar_inplace(T scalar) {
int tid = threadIdx.x;
const int grid_dim = blockDim.x;
const int slices = params::degree / grid_dim;
const int jump = grid_dim;
for (int i = 0; i < slices; i++) {
this->coefficients[tid] += scalar;
tid += jump;
}
}
__device__ void sub_scalar_inplace(T scalar) {
int tid = threadIdx.x;
const int grid_dim = blockDim.x;
const int slices = params::degree / grid_dim;
const int jump = grid_dim;
for (int i = 0; i < slices; i++) {
this->coefficients[tid] -= scalar;
tid += jump;
}
}
/*
__device__ void add_polynomial_inplace(Polynomial<T, params> &source,
int begin) {
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt; i++) {
this->coefficients[tid] += source.coefficients[tid + begin];
tid = tid + params::degree / params::opt;
}
}
*/
__device__ void sub_polynomial_inplace(Polynomial<T, params> &rhs) {
int tid = threadIdx.x;
const int grid_dim = blockDim.x;
const int slices = params::degree / grid_dim;
const int jump = grid_dim;
for (int i = 0; i < slices; i++) {
this->coefficients[tid] -= rhs.coefficients[tid];
tid += jump;
}
}
__device__ void negate_inplace() {
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt; i++) {
coefficients[tid] = -coefficients[tid];
tid = tid + params::degree / params::opt;
}
synchronize_threads_in_block();
}
__device__ void copy_into(Vector<T, params> &vec) {
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt; i++) {
vec.m_data[tid] = coefficients[tid];
tid = tid + params::degree / params::opt;
}
}
__device__ void copy_reversed_into(Vector<T, params> &vec) {
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt; i++) {
vec.m_data[tid] = coefficients[params::degree - tid - 1];
tid = tid + params::degree / params::opt;
}
}
__device__ void reverse_inplace() {
int tid = threadIdx.x;
T result[params::opt];
#pragma unroll
for (int i = 0; i < params::opt; i++) {
result[i] = coefficients[params::degree - tid - 1];
tid = tid + params::degree / params::opt;
}
synchronize_threads_in_block();
tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt; i++) {
coefficients[tid] = result[i];
tid = tid + params::degree / params::opt;
}
synchronize_threads_in_block();
}
template <typename FT>
__device__ void
forward_negacyclic_fft_half(PolynomialFourier<FT, params> &result) {
    // TODO: this function should be removed
}
};
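// A non-owning view over a contiguous array, with helpers to allocate it in
// shared or global memory and to move data between host and device.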
template <typename T, class params> class Vector {
public:
T *m_data;
uint32_t m_size;
__device__ Vector(T *elements, uint32_t size)
: m_data(elements), m_size(size) {}
template <typename V>
__device__ Vector(SharedMemory &shmem, V src, int size) : m_size(size) {
shmem.get_allocation(&m_data, m_size);
int tid = threadIdx.x;
#pragma unroll
    for (int i = 0; i < params::opt; i++) {
      if (tid < m_size)
        m_data[tid] = src[tid];
      tid += params::degree / params::opt;
    }
}
__device__ Vector(SharedMemory &shmem, uint32_t size) : m_size(size) {
shmem.get_allocation(&m_data, m_size);
}
__host__ Vector() {}
__host__ Vector(DeviceMemory &dmem, uint32_t size, int device)
: m_size(size) {
dmem.get_allocation(&m_data, m_size, device);
}
__device__ T &operator[](int i) { return m_data[i]; }
__device__ Vector<T, params> get_chunk(int chunk_num, int chunk_size) {
int pos = chunk_num * chunk_size;
T *ptr = &m_data[pos];
return Vector<T, params>(ptr, chunk_size);
}
__host__ void copy_to_device(T *source, uint32_t elements) {
cudaMemcpyAsync(m_data, source, sizeof(T) * elements,
cudaMemcpyHostToDevice);
}
__host__ Vector(DeviceMemory &dmem, T *source, uint32_t size_source,
int device)
: m_size(size_source) {
dmem.get_allocation_and_copy_async(&m_data, source, m_size, device);
}
__host__ Vector(DeviceMemory &dmem, T *source, uint32_t allocation_size,
uint32_t copy_size, int device)
: m_size(allocation_size) {
    if (copy_size > allocation_size) {
      printf("warning: copy size exceeds allocation size\n");
    }
dmem.get_allocation_and_copy_async(&m_data, source, m_size, copy_size,
device);
}
__host__ void copy_to_host(T *dest) {
cudaMemcpyAsync(dest, m_data, sizeof(T) * m_size, cudaMemcpyDeviceToHost);
}
__host__ void copy_to_host(T *dest, int elements) {
cudaMemcpyAsync(dest, m_data, sizeof(T) * elements, cudaMemcpyDeviceToHost);
}
__device__ T get_ith_element(int i) { return m_data[i]; }
__device__ T get_last_element() { return m_data[m_size - 1]; }
__device__ void set_last_element(T elem) { m_data[m_size - 1] = elem; }
// todo(Joao): let's do coalesced access here at some point
__device__ void operator-=(const Vector<T, params> &rhs) {
    assert(m_size == rhs.m_size);
int tid = threadIdx.x;
int pos = tid;
int total = m_size / blockDim.x + 1;
for (int i = 0; i < total; i++) {
if (pos < m_size)
m_data[pos] -= rhs.m_data[pos];
pos += blockDim.x;
}
}
__device__ void operator*=(const T &rhs) {
int tid = threadIdx.x;
int pos = tid;
int total = m_size / blockDim.x + 1;
for (int i = 0; i < total; i++) {
if (pos < m_size)
m_data[pos] *= rhs;
pos += blockDim.x;
}
}
};
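// Bundles the per-stage twiddle-factor tables used by the FFT (one vector per
// butterfly stage, for sizes 2^2 through 2^10).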
template <typename FT, class params> class Twiddles {
public:
Vector<FT, params> twiddles2, twiddles3, twiddles4, twiddles5, twiddles6,
twiddles7, twiddles8, twiddles9, twiddles10;
__device__
Twiddles(Vector<FT, params> &twiddles2, Vector<FT, params> &twiddles3,
Vector<FT, params> &twiddles4, Vector<FT, params> &twiddles5,
Vector<FT, params> &twiddles6, Vector<FT, params> &twiddles7,
Vector<FT, params> &twiddles8, Vector<FT, params> &twiddles9,
Vector<FT, params> &twiddles10)
: twiddles2(twiddles2), twiddles3(twiddles3), twiddles4(twiddles4),
twiddles5(twiddles5), twiddles6(twiddles6), twiddles7(twiddles7),
twiddles8(twiddles8), twiddles9(twiddles9), twiddles10(twiddles10) {}
};
#endif // CNCRT_POLYNOMIAL_H
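
For reference, a minimal host-side sketch of the rounding performed by
round_to_closest_multiple_inplace above, specialized to 64-bit coefficients
(the standalone helper and the sample values are illustrative, not part of
this patch):

#include <cstdint>
#include <cstdio>

// Round x to the closest multiple of 2^(64 - l_gadget * base_log),
// mirroring the device loop body above for a single coefficient.
uint64_t round_to_closest_multiple(uint64_t x, uint32_t base_log,
                                   uint32_t l_gadget) {
  uint64_t shift = 64 - (uint64_t)l_gadget * base_log;
  uint64_t b = (x >> (shift - 1)) & 1; // rounding bit just below the window
  return ((x >> shift) + b) << shift;  // round half up, then rescale
}

int main() {
  // With base_log = 8 and l_gadget = 2 the result is a multiple of 2^48.
  printf("%#llx\n", (unsigned long long)round_to_closest_multiple(
                        0x0123456789abcdefULL, 8, 2));
  return 0;
}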


@@ -0,0 +1,65 @@
#ifndef CNCRT_POLYNOMIAL_MATH_H
#define CNCRT_POLYNOMIAL_MATH_H
#include "crypto/torus.cuh"
#include "parameters.cuh"
#include "polynomial.cuh"
template <class params, typename FT>
__device__ void polynomial_product_in_fourier_domain(FT *result, FT *first,
FT *second) {
int tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
result[tid] = first[tid] * second[tid];
tid += params::degree / params::opt;
}
if (threadIdx.x == 0) {
result[params::degree / 2] =
first[params::degree / 2] * second[params::degree / 2];
}
}
template <class params, typename FT>
__device__ void polynomial_product_in_fourier_domain(
PolynomialFourier<FT, params> &result, PolynomialFourier<FT, params> &first,
PolynomialFourier<FT, params> &second) {
int tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
result[tid] = first[tid] * second[tid];
tid += params::degree / params::opt;
}
if (threadIdx.x == 0) {
result[params::degree / 2] =
first[params::degree / 2] * second[params::degree / 2];
}
}
template <class params, typename FT>
__device__ void polynomial_product_accumulate_in_fourier_domain(
PolynomialFourier<FT, params> &result, PolynomialFourier<FT, params> &first,
PolynomialFourier<FT, params> &second) {
int tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
result[tid] += first[tid] * second[tid];
tid += params::degree / params::opt;
}
if (threadIdx.x == 0) {
result[params::degree / 2] +=
first[params::degree / 2] * second[params::degree / 2];
}
}
template <class params, typename FT>
__device__ void polynomial_product_accumulate_in_fourier_domain(
FT *result, FT *first, PolynomialFourier<FT, params> &second) {
int tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
result[tid] += first[tid] * second.m_values[tid];
tid += params::degree / params::opt;
}
}
#endif // CNCRT_POLYNOMIAL_MATH_H
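
As a reference for what the Fourier-domain products above implement: a
pointwise product after the forward FFT corresponds to multiplication modulo
X^N + 1 on the coefficients, where high-degree terms wrap around with a sign
flip. A naive host-side check (hypothetical, not part of this patch):

#include <cstdint>

// out = a * b mod (X^n + 1), computed naively in O(n^2).
void negacyclic_product(const int64_t *a, const int64_t *b, int64_t *out,
                        int n) {
  for (int k = 0; k < n; k++)
    out[k] = 0;
  for (int i = 0; i < n; i++)
    for (int j = 0; j < n; j++) {
      int k = i + j;
      if (k < n)
        out[k] += a[i] * b[j];
      else
        out[k - n] -= a[i] * b[j]; // X^n = -1: wrap with a sign flip
    }
}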


@@ -0,0 +1 @@
#include "polynomial.cuh"

76
src/types/int128.cuh Normal file

@@ -0,0 +1,76 @@
#ifndef CNCRT_INT128_H
#define CNCRT_INT128_H
// abseil's int128 type
// licensed under Apache license
class uint128 {
public:
__device__ uint128(uint64_t high, uint64_t low) : hi_(high), lo_(low) {}
uint64_t lo_;
uint64_t hi_;
};
class int128 {
public:
int128() = default;
__device__ operator unsigned long long() const {
return static_cast<unsigned long long>(lo_);
}
__device__ int128(int64_t high, uint64_t low) : hi_(high), lo_(low) {}
uint64_t lo_;
int64_t hi_;
};
__device__ inline uint128 make_uint128(uint64_t high, uint64_t low) {
return uint128(high, low);
}
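// Splits a floating point value >= 2^64 into high and low 64-bit halves;
// smaller values go entirely into the low half.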
template <typename T> __device__ uint128 make_uint128_from_float(T v) {
if (v >= ldexp(static_cast<T>(1), 64)) {
uint64_t hi = static_cast<uint64_t>(ldexp(v, -64));
uint64_t lo = static_cast<uint64_t>(v - ldexp(static_cast<T>(hi), 64));
return make_uint128(hi, lo);
}
return make_uint128(0, static_cast<uint64_t>(v));
}
__device__ inline int128 make_int128(int64_t high, uint64_t low) {
return int128(high, low);
}
__device__ inline int64_t bitcast_to_signed(uint64_t v) {
return v & (uint64_t{1} << 63) ? ~static_cast<int64_t>(~v)
: static_cast<int64_t>(v);
}
__device__ inline uint64_t uint128_high64(uint128 v) { return v.hi_; }
__device__ inline uint64_t uint128_low64(uint128 v) { return v.lo_; }
__device__ __forceinline__ uint128 operator-(uint128 val) {
uint64_t hi = ~uint128_high64(val);
uint64_t lo = ~uint128_low64(val) + 1;
if (lo == 0)
++hi; // carry
return make_uint128(hi, lo);
}
template <typename T> __device__ int128 make_int128_from_float(T v) {
// We must convert the absolute value and then negate as needed, because
// floating point types are typically sign-magnitude. Otherwise, the
// difference between the high and low 64 bits when interpreted as two's
// complement overwhelms the precision of the mantissa.
uint128 result =
v < 0 ? -make_uint128_from_float(-v) : make_uint128_from_float(v);
return make_int128(bitcast_to_signed(uint128_high64(result)),
uint128_low64(result));
}
#endif
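
A sketch of how the type might be exercised from a kernel; the kernel, its
launch, and the include path are hypothetical, only make_int128_from_float
and the unsigned long long conversion come from the file above:

#include "types/int128.cuh"
#include <cstdint>

// Convert doubles to 64-bit torus values through the 128-bit intermediate,
// keeping the low 64 bits of the signed result.
__global__ void double_to_torus(const double *in, uint64_t *out, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    out[i] = static_cast<unsigned long long>(make_int128_from_float(in[i]));
}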


@@ -0,0 +1,15 @@
int nextPow2(int x) {
--x;
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
return ++x;
}
void getNumBlocksAndThreads(const int n, const int maxBlockSize, int &blocks,
int &threads) {
threads = (n < maxBlockSize * 2) ? nextPow2((n + 1) / 2) : maxBlockSize;
blocks = (n + threads - 1) / threads;
}
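
Usage sketch, with the results worked out from the code above:

int blocks, threads;
getNumBlocksAndThreads(/*n=*/3000, /*maxBlockSize=*/256, blocks, threads);
// n >= 2 * maxBlockSize: threads == 256, blocks == (3000 + 255) / 256 == 12
getNumBlocksAndThreads(/*n=*/100, /*maxBlockSize=*/256, blocks, threads);
// n < 2 * maxBlockSize: threads == nextPow2((100 + 1) / 2) == 64, blocks == 2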

90
src/utils/memory.cuh Normal file

@@ -0,0 +1,90 @@
#ifndef CNCRT_SHMEM_H
#define CNCRT_SHMEM_H
#include "helper_cuda.h"
#include <atomic>
#include <iostream>
#include <mutex>
#include <thread>
#include <tuple>
#include <vector>
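// Bump allocator over a block of (dynamic) shared memory: each call to
// get_allocation hands out the next `elements * sizeof(T)` bytes.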
class SharedMemory {
public:
char *m_memory_block;
int m_last_byte;
__device__ SharedMemory(char *ptr) : m_memory_block(ptr), m_last_byte(0) {}
template <typename T> __device__ void get_allocation(T **ptr, int elements) {
*ptr = (T *)(&this->m_memory_block[m_last_byte]);
this->m_last_byte += elements * sizeof(T);
}
};
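// Tracks global-memory allocations per device so they can be released in
// bulk, either explicitly or when the tracker is destroyed.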
class DeviceMemory {
public:
std::vector<std::tuple<void *, int>> m_allocated;
std::mutex m_allocation_mtx;
std::atomic<uint32_t> m_total_devices;
DeviceMemory() : m_total_devices(1) {}
  __host__ void set_device(int device) {
    if (device + 1 > (int)m_total_devices)
      m_total_devices = device + 1;
  }
template <typename T>
__host__ void get_allocation(T **ptr, int elements, int device) {
T *res;
cudaMalloc((void **)&res, sizeof(T) * elements);
*ptr = res;
std::lock_guard<std::mutex> lock(m_allocation_mtx);
m_allocated.push_back(std::make_tuple(res, device));
}
template <typename T>
__host__ void get_allocation_and_copy_async(T **ptr, T *src, int elements,
int device) {
T *res;
cudaMalloc((void **)&res, sizeof(T) * elements);
cudaMemcpyAsync(res, src, sizeof(T) * elements, cudaMemcpyHostToDevice);
*ptr = res;
std::lock_guard<std::mutex> lock(m_allocation_mtx);
m_allocated.push_back(std::make_tuple(res, device));
}
template <typename T>
__host__ void get_allocation_and_copy_async(T **ptr, T *src, int allocation,
int elements, int device) {
T *res;
cudaMalloc((void **)&res, sizeof(T) * allocation);
cudaMemcpyAsync(res, src, sizeof(T) * elements, cudaMemcpyHostToDevice);
*ptr = res;
std::lock_guard<std::mutex> lock(m_allocation_mtx);
m_allocated.push_back(std::make_tuple(res, device));
}
  void free_all_from_device(int device) {
    cudaSetDevice(device);
    std::lock_guard<std::mutex> lock(m_allocation_mtx);
    for (auto it = m_allocated.begin(); it != m_allocated.end();) {
      if (std::get<1>(*it) == device) {
        checkCudaErrors(cudaFree(std::get<0>(*it)));
        // erase the entry so the destructor does not free it a second time
        it = m_allocated.erase(it);
      } else {
        ++it;
      }
    }
  }
__host__ ~DeviceMemory() {
for (auto elem : m_allocated) {
auto dev = std::get<1>(elem);
auto mem = std::get<0>(elem);
cudaSetDevice(dev);
checkCudaErrors(cudaFree(mem));
}
}
};
#endif // CNCRT_SHMEM_H
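
A host-side usage sketch (buffer size and device index are illustrative):

DeviceMemory dmem;
uint64_t *d_buf;
dmem.get_allocation(&d_buf, 1024, /*device=*/0); // tracked cudaMalloc
// ... launch kernels that use d_buf ...
dmem.free_all_from_device(0); // or let ~DeviceMemory release everything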

29
src/utils/timer.cuh Normal file

@@ -0,0 +1,29 @@
#ifndef CNCRT_TIMER_H
#define CNCRT_TIMER_H
#include <iostream>
#define synchronize_threads_in_block() __syncthreads()
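// RAII helper: when `active` is true, records a CUDA event on construction
// and prints the elapsed time between construction and destruction.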
template <bool active> class CudaMeasureExecution {
public:
cudaEvent_t m_start, m_stop;
__host__ CudaMeasureExecution() {
if constexpr (active) {
cudaEventCreate(&m_start);
cudaEventCreate(&m_stop);
cudaEventRecord(m_start);
}
}
__host__ ~CudaMeasureExecution() {
if constexpr (active) {
float ms;
cudaEventRecord(m_stop);
cudaEventSynchronize(m_stop);
cudaEventElapsedTime(&ms, m_start, m_stop);
std::cout << "Execution took " << ms << "ms" << std::endl;
}
}
};
#endif // CNCRT_TIMER_H
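
A usage sketch: the timer is scope-based, so wrapping a launch measures its
GPU time when the template flag is on (my_kernel and its launch configuration
are placeholders):

{
  CudaMeasureExecution<true> timer; // records the start event
  my_kernel<<<blocks, threads>>>(/*...*/);
} // destructor records the stop event, synchronizes, and prints the time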