mirror of
https://github.com/zama-ai/concrete.git
synced 2026-02-08 11:35:02 -05:00
chore(cuda): refactor cuda errors, remove deprecated files
This commit is contained in:
@@ -1,5 +1,13 @@
|
||||
#ifndef DEVICE_H
|
||||
#define DEVICE_H
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <cuda_runtime.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
extern "C" {
|
||||
cudaStream_t *cuda_create_stream(uint32_t gpu_index);
|
||||
@@ -35,4 +43,17 @@ int cuda_drop_async(void *ptr, cudaStream_t *stream, uint32_t gpu_index);
|
||||
int cuda_get_max_shared_memory(uint32_t gpu_index);
|
||||
|
||||
int cuda_synchronize_stream(void *v_stream);
|
||||
|
||||
// Wraps a CUDA runtime call and reports failure with file/line context.
// Usage: check_cuda_error(cudaMemcpyAsync(...)); or, after a kernel launch,
// check_cuda_error(cudaGetLastError());
#define check_cuda_error(ans)                                                  \
  { cuda_error((ans), __FILE__, __LINE__); }

// Prints a diagnostic for a failed CUDA runtime call and, when `abort` is
// true (the default), terminates the process.
//
// Fix vs. previous version: `exit(code)` truncated the process status to its
// low 8 bits — cudaError_t values above 255 exist (e.g. the 6xx/7xx ranges),
// so a genuine failure could surface as a misleading or even zero exit
// status. We exit with EXIT_FAILURE instead; the full error text is already
// on stderr via cudaGetErrorString.
inline void cuda_error(cudaError_t code, const char *file, int line,
                       bool abort = true) {
  if (code != cudaSuccess) {
    fprintf(stderr, "Cuda error: %s %s %d\n", cudaGetErrorString(code), file,
            line);
    if (abort)
      exit(EXIT_FAILURE);
  }
}
|
||||
}
|
||||
#endif
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,664 +0,0 @@
|
||||
/**
|
||||
* Copyright 1993-2013 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
// These are helper functions for the SDK samples (string parsing, timers, etc)
|
||||
#ifndef STRING_HELPER_H
|
||||
#define STRING_HELPER_H
|
||||
|
||||
#include <fstream>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string>
|
||||
|
||||
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
|
||||
#ifndef _CRT_SECURE_NO_DEPRECATE
|
||||
#define _CRT_SECURE_NO_DEPRECATE
|
||||
#endif
|
||||
#ifndef STRCASECMP
|
||||
#define STRCASECMP _stricmp
|
||||
#endif
|
||||
#ifndef STRNCASECMP
|
||||
#define STRNCASECMP _strnicmp
|
||||
#endif
|
||||
#ifndef STRCPY
|
||||
#define STRCPY(sFilePath, nLength, sPath) strcpy_s(sFilePath, nLength, sPath)
|
||||
#endif
|
||||
|
||||
#ifndef FOPEN
|
||||
#define FOPEN(fHandle, filename, mode) fopen_s(&fHandle, filename, mode)
|
||||
#endif
|
||||
#ifndef FOPEN_FAIL
|
||||
#define FOPEN_FAIL(result) (result != 0)
|
||||
#endif
|
||||
#ifndef SSCANF
|
||||
#define SSCANF sscanf_s
|
||||
#endif
|
||||
#ifndef SPRINTF
|
||||
#define SPRINTF sprintf_s
|
||||
#endif
|
||||
#else // Linux Includes
|
||||
#include <string.h>
|
||||
#include <strings.h>
|
||||
|
||||
#ifndef STRCASECMP
|
||||
#define STRCASECMP strcasecmp
|
||||
#endif
|
||||
#ifndef STRNCASECMP
|
||||
#define STRNCASECMP strncasecmp
|
||||
#endif
|
||||
#ifndef STRCPY
|
||||
#define STRCPY(sFilePath, nLength, sPath) strcpy(sFilePath, sPath)
|
||||
#endif
|
||||
|
||||
#ifndef FOPEN
|
||||
#define FOPEN(fHandle, filename, mode) (fHandle = fopen(filename, mode))
|
||||
#endif
|
||||
#ifndef FOPEN_FAIL
|
||||
#define FOPEN_FAIL(result) (result == NULL)
|
||||
#endif
|
||||
#ifndef SSCANF
|
||||
#define SSCANF sscanf
|
||||
#endif
|
||||
#ifndef SPRINTF
|
||||
#define SPRINTF sprintf
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef EXIT_WAIVED
|
||||
#define EXIT_WAIVED 2
|
||||
#endif
|
||||
|
||||
// CUDA Utility Helper Functions
|
||||
// Returns the index of the first character after the leading run of
// `delimiter` characters in `string`. Keeps the original helper's quirk:
// when stripping the delimiters would leave fewer than two characters,
// the function returns 0 (i.e. the token is used unstripped).
inline int stringRemoveDelimiter(char delimiter, const char *string) {
  int offset = 0;

  for (; string[offset] == delimiter; ++offset) {
  }

  return (offset >= (int)strlen(string) - 1) ? 0 : offset;
}
|
||||
|
||||
// Locates the extension of `filename` (the text after the last '.') and
// stores a pointer to it in *extension, or NULL when none is found.
// Returns the index of the extension's first character (0 when absent).
// Quirks preserved from the original NVIDIA helper: the scan starts at the
// NUL terminator, the character at index 0 is never examined, and a dot at
// index 1 (e.g. "a.b") is reported as "no extension". Behavior is undefined
// for an empty string (the original reads before the buffer).
inline int getFileExtension(char *filename, char **extension) {
  int pos = (int)strlen(filename);

  // Walk backwards until a '.' has been consumed or the front is reached.
  for (;;) {
    if (filename[pos--] == '.')
      break;
    if (pos == 0)
      break;
  }

  if (pos > 0)
    pos += 2; // jump from just-before-the-dot to just-after-the-dot

  *extension = (pos == 0) ? NULL : &filename[pos];

  return pos;
}
|
||||
|
||||
// Reports whether `string_ref` appears among argv[1..] as a flag, either
// bare ("--ref") or with a value ("--ref=v"). Matching is case-insensitive,
// ignores leading '-' characters, and requires the full flag name to match
// (the name ends at '=' when a value is attached).
inline bool checkCmdLineFlag(const int argc, const char **argv,
                             const char *string_ref) {
  bool found = false;

  if (argc >= 1) {
    for (int i = 1; i < argc; i++) {
      // Strip leading '-' (inlined stringRemoveDelimiter, including its
      // quirk of keeping the whole token when stripping would leave < 2
      // characters).
      int start = 0;
      while (argv[i][start] == '-')
        start++;
      if (start >= (int)strlen(argv[i]) - 1)
        start = 0;
      const char *arg = &argv[i][start];

      const char *eq = strchr(arg, '=');
      int arg_len = (int)(eq == NULL ? strlen(arg) : (size_t)(eq - arg));
      int ref_len = (int)strlen(string_ref);

      if (arg_len == ref_len && !STRNCASECMP(arg, string_ref, ref_len)) {
        found = true;
      }
    }
  }

  return found;
}
|
||||
|
||||
// This function wraps the CUDA Driver API into a template function
|
||||
// Parses "--ref=N" from argv into *value (via atoi, cast to T), scanning
// argv[1..] and stopping at the first case-insensitive prefix match of
// `string_ref`. *value is left untouched when the flag is present without a
// value. Returns true iff the flag was found.
template <class T>
inline bool getCmdLineArgumentValue(const int argc, const char **argv,
                                    const char *string_ref, T *value) {
  bool found = false;

  if (argc >= 1) {
    // `!found` in the condition reproduces the original's `i = argc`
    // early exit: only the first matching argument is consulted.
    for (int i = 1; i < argc && !found; i++) {
      // Strip leading '-' (inlined stringRemoveDelimiter, quirk included).
      int start = 0;
      while (argv[i][start] == '-')
        start++;
      if (start >= (int)strlen(argv[i]) - 1)
        start = 0;
      const char *arg = &argv[i][start];
      int ref_len = (int)strlen(string_ref);

      if (!STRNCASECMP(arg, string_ref, ref_len)) {
        if (ref_len + 1 <= (int)strlen(arg)) {
          int skip = (arg[ref_len] == '=') ? 1 : 0;
          *value = (T)atoi(&arg[ref_len + skip]);
        }
        found = true;
      }
    }
  }

  return found;
}
|
||||
|
||||
// Returns the integer value attached to "--ref=N" in argv, 0 when the flag
// is present without a value, and 0 when the flag is absent. Matching is a
// case-insensitive prefix match after stripping leading '-'. All arguments
// are scanned, so the last matching argument wins (as in the original).
inline int getCmdLineArgumentInt(const int argc, const char **argv,
                                 const char *string_ref) {
  int result = 0;

  if (argc >= 1) {
    for (int i = 1; i < argc; i++) {
      // Strip leading '-' (inlined stringRemoveDelimiter, quirk included).
      int start = 0;
      while (argv[i][start] == '-')
        start++;
      if (start >= (int)strlen(argv[i]) - 1)
        start = 0;
      const char *arg = &argv[i][start];
      int ref_len = (int)strlen(string_ref);

      if (!STRNCASECMP(arg, string_ref, ref_len)) {
        if (ref_len + 1 <= (int)strlen(arg)) {
          int skip = (arg[ref_len] == '=') ? 1 : 0;
          result = atoi(&arg[ref_len + skip]);
        } else {
          result = 0; // flag without a value
        }
      }
    }
  }

  return result;
}
|
||||
|
||||
// Returns the float value attached to "--ref=X" in argv (parsed with atof),
// 0.f when the flag is present without a value, and 0 when absent.
// Matching is a case-insensitive prefix match after stripping leading '-';
// the last matching argument wins (as in the original).
inline float getCmdLineArgumentFloat(const int argc, const char **argv,
                                     const char *string_ref) {
  bool matched = false;
  float result = -1;

  if (argc >= 1) {
    for (int i = 1; i < argc; i++) {
      // Strip leading '-' (inlined stringRemoveDelimiter, quirk included).
      int start = 0;
      while (argv[i][start] == '-')
        start++;
      if (start >= (int)strlen(argv[i]) - 1)
        start = 0;
      const char *arg = &argv[i][start];
      int ref_len = (int)strlen(string_ref);

      if (!STRNCASECMP(arg, string_ref, ref_len)) {
        if (ref_len + 1 <= (int)strlen(arg)) {
          int skip = (arg[ref_len] == '=') ? 1 : 0;
          result = (float)atof(&arg[ref_len + skip]);
        } else {
          result = 0.f; // flag without a value
        }
        matched = true;
      }
    }
  }

  return matched ? result : 0;
}
|
||||
|
||||
// Extracts the string value of "--ref=value" into *string_retval (a pointer
// into argv's storage — nothing is copied). Sets *string_retval to NULL and
// returns false when the flag is absent; the last matching argument wins.
// NOTE(review): assumes the "name=value" form — the result pointer is placed
// one character past the flag name, so a bare "--ref" yields the text after
// where '=' would be; callers should pass flags with values.
inline bool getCmdLineArgumentString(const int argc, const char **argv,
                                     const char *string_ref,
                                     char **string_retval) {
  bool found = false;

  if (argc >= 1) {
    for (int i = 1; i < argc; i++) {
      // Strip leading '-' (inlined stringRemoveDelimiter, quirk included).
      int start = 0;
      while (argv[i][start] == '-')
        start++;
      if (start >= (int)strlen(argv[i]) - 1)
        start = 0;
      char *arg = (char *)&argv[i][start];
      int ref_len = (int)strlen(string_ref);

      if (!STRNCASECMP(arg, string_ref, ref_len)) {
        *string_retval = &arg[ref_len + 1];
        found = true;
      }
    }
  }

  if (!found) {
    *string_retval = NULL;
  }

  return found;
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Find the path for a file assuming that
|
||||
//! files are found in the searchPath.
|
||||
//!
|
||||
//! @return the path if succeeded, otherwise 0
|
||||
//! @param filename name of the file
|
||||
//! @param executable_path optional absolute path of the executable
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
inline char *sdkFindFilePath(const char *filename,
                             const char *executable_path) {
  // Searches a fixed list of relative locations for `filename` and returns
  // a malloc'd copy of the first path that opens successfully, or 0 when the
  // file is not found. The caller owns the returned buffer and must free()
  // it. The literal token "<executable_name>" inside an entry is replaced at
  // lookup time with the basename of `executable_path`; entries containing
  // it are skipped entirely when executable_path is 0.
  //
  // Typical relative search paths to locate needed companion files (sample
  // input data, JIT source files, ...), relative to the current working
  // directory, then up 1-5 levels of the SDK samples tree.
  const char *searchPath[] = {
      // same dir and its immediate subdirs
      "./",
      "./common/",
      "./common/data/",
      "./data/",
      "./src/",
      "./src/<executable_name>/data/",
      "./inc/",
      "./0_Simple/",
      "./1_Utilities/",
      "./2_Graphics/",
      "./3_Imaging/",
      "./4_Finance/",
      "./5_Simulations/",
      "./6_Advanced/",
      "./7_CUDALibraries/",
      "./8_Android/",
      "./samples/",

      // up 1 in tree
      "../",
      "../common/",
      "../common/data/",
      "../data/",
      "../src/",
      "../inc/",
      "../0_Simple/<executable_name>/data/",
      "../1_Utilities/<executable_name>/data/",
      "../2_Graphics/<executable_name>/data/",
      "../3_Imaging/<executable_name>/data/",
      "../4_Finance/<executable_name>/data/",
      "../5_Simulations/<executable_name>/data/",
      "../6_Advanced/<executable_name>/data/",
      "../7_CUDALibraries/<executable_name>/data/",
      "../8_Android/<executable_name>/data/",
      "../samples/<executable_name>/data/",

      // up 2 in tree
      "../../",
      "../../common/",
      "../../common/data/",
      "../../data/",
      "../../src/",
      "../../inc/",
      "../../sandbox/<executable_name>/data/",
      "../../0_Simple/<executable_name>/data/",
      "../../1_Utilities/<executable_name>/data/",
      "../../2_Graphics/<executable_name>/data/",
      "../../3_Imaging/<executable_name>/data/",
      "../../4_Finance/<executable_name>/data/",
      "../../5_Simulations/<executable_name>/data/",
      "../../6_Advanced/<executable_name>/data/",
      "../../7_CUDALibraries/<executable_name>/data/",
      "../../8_Android/<executable_name>/data/",
      "../../samples/<executable_name>/data/",

      // up 3 in tree
      "../../../",
      "../../../src/<executable_name>/",
      "../../../src/<executable_name>/data/",
      "../../../src/<executable_name>/src/",
      "../../../src/<executable_name>/inc/",
      "../../../sandbox/<executable_name>/",
      "../../../sandbox/<executable_name>/data/",
      "../../../sandbox/<executable_name>/src/",
      "../../../sandbox/<executable_name>/inc/",
      "../../../0_Simple/<executable_name>/data/",
      "../../../1_Utilities/<executable_name>/data/",
      "../../../2_Graphics/<executable_name>/data/",
      "../../../3_Imaging/<executable_name>/data/",
      "../../../4_Finance/<executable_name>/data/",
      "../../../5_Simulations/<executable_name>/data/",
      "../../../6_Advanced/<executable_name>/data/",
      "../../../7_CUDALibraries/<executable_name>/data/",
      "../../../8_Android/<executable_name>/data/",
      "../../../0_Simple/<executable_name>/",
      "../../../1_Utilities/<executable_name>/",
      "../../../2_Graphics/<executable_name>/",
      "../../../3_Imaging/<executable_name>/",
      "../../../4_Finance/<executable_name>/",
      "../../../5_Simulations/<executable_name>/",
      "../../../6_Advanced/<executable_name>/",
      "../../../7_CUDALibraries/<executable_name>/",
      "../../../8_Android/<executable_name>/",
      "../../../samples/<executable_name>/data/",
      "../../../common/",
      "../../../common/data/",
      "../../../data/",

      // up 4 in tree
      "../../../../",
      "../../../../src/<executable_name>/",
      "../../../../src/<executable_name>/data/",
      "../../../../src/<executable_name>/src/",
      "../../../../src/<executable_name>/inc/",
      "../../../../sandbox/<executable_name>/",
      "../../../../sandbox/<executable_name>/data/",
      "../../../../sandbox/<executable_name>/src/",
      "../../../../sandbox/<executable_name>/inc/",
      "../../../../0_Simple/<executable_name>/data/",
      "../../../../1_Utilities/<executable_name>/data/",
      "../../../../2_Graphics/<executable_name>/data/",
      "../../../../3_Imaging/<executable_name>/data/",
      "../../../../4_Finance/<executable_name>/data/",
      "../../../../5_Simulations/<executable_name>/data/",
      "../../../../6_Advanced/<executable_name>/data/",
      "../../../../7_CUDALibraries/<executable_name>/data/",
      "../../../../8_Android/<executable_name>/data/",
      "../../../../0_Simple/<executable_name>/",
      "../../../../1_Utilities/<executable_name>/",
      "../../../../2_Graphics/<executable_name>/",
      "../../../../3_Imaging/<executable_name>/",
      "../../../../4_Finance/<executable_name>/",
      "../../../../5_Simulations/<executable_name>/",
      "../../../../6_Advanced/<executable_name>/",
      "../../../../7_CUDALibraries/<executable_name>/",
      "../../../../8_Android/<executable_name>/",
      "../../../../samples/<executable_name>/data/",
      "../../../../common/",
      "../../../../common/data/",
      "../../../../data/",

      // up 5 in tree
      "../../../../../",
      "../../../../../src/<executable_name>/",
      "../../../../../src/<executable_name>/data/",
      "../../../../../src/<executable_name>/src/",
      "../../../../../src/<executable_name>/inc/",
      "../../../../../sandbox/<executable_name>/",
      "../../../../../sandbox/<executable_name>/data/",
      "../../../../../sandbox/<executable_name>/src/",
      "../../../../../sandbox/<executable_name>/inc/",
      "../../../../../0_Simple/<executable_name>/data/",
      "../../../../../1_Utilities/<executable_name>/data/",
      "../../../../../2_Graphics/<executable_name>/data/",
      "../../../../../3_Imaging/<executable_name>/data/",
      "../../../../../4_Finance/<executable_name>/data/",
      "../../../../../5_Simulations/<executable_name>/data/",
      "../../../../../6_Advanced/<executable_name>/data/",
      "../../../../../7_CUDALibraries/<executable_name>/data/",
      "../../../../../8_Android/<executable_name>/data/",
      "../../../../../samples/<executable_name>/data/",
      "../../../../../common/",
      "../../../../../common/data/",
  };

  // Extract the executable basename (used to expand "<executable_name>").
  std::string executable_name;

  if (executable_path != 0) {
    executable_name = std::string(executable_path);

#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
    // Windows path delimiter
    size_t delimiter_pos = executable_name.find_last_of('\\');
    executable_name.erase(0, delimiter_pos + 1);

    if (executable_name.rfind(".exe") != std::string::npos) {
      // we strip .exe, only if the .exe is found
      executable_name.resize(executable_name.size() - 4);
    }

#else
    // Linux & OSX path delimiter
    size_t delimiter_pos = executable_name.find_last_of('/');
    executable_name.erase(0, delimiter_pos + 1);
#endif
  }

  // Loop over all search paths and return the first hit.
  for (unsigned int i = 0; i < sizeof(searchPath) / sizeof(char *); ++i) {
    std::string path(searchPath[i]);
    size_t executable_name_pos = path.find("<executable_name>");

    // If there is an <executable_name> variable in this entry, substitute it.
    if (executable_name_pos != std::string::npos) {
      if (executable_path != 0) {
        path.replace(executable_name_pos, strlen("<executable_name>"),
                     executable_name);
      } else {
        // Skip this path entry if no executable argument is given
        continue;
      }
    }

#ifdef _DEBUG
    printf("sdkFindFilePath <%s> in %s\n", filename, path.c_str());
#endif

    // Test if the file exists by attempting to open it.
    path.append(filename);
    FILE *fp;
    FOPEN(fp, path.c_str(), "rb");

    if (fp != NULL) {
      fclose(fp);
      // File found.
      // Returning an allocated array here for backwards compatibility
      // reasons; the caller is responsible for free()ing it.
      char *file_path = (char *)malloc(path.length() + 1);
      STRCPY(file_path, path.length() + 1, path.c_str());
      return file_path;
    }

    // Unreachable after the return above; kept from the original.
    if (fp) {
      fclose(fp);
    }
  }

  // File not found
  return 0;
}
|
||||
|
||||
#endif
|
||||
@@ -4,10 +4,9 @@
|
||||
#ifdef __CDT_PARSER__
|
||||
#undef __CUDA_RUNTIME_H__
|
||||
#include <cuda_runtime.h>
|
||||
#include <helper_cuda.h>
|
||||
#endif
|
||||
|
||||
#include "../include/helper_cuda.h"
|
||||
#include "device.h"
|
||||
#include "linear_algebra.h"
|
||||
#include "utils/kernel_dimensions.cuh"
|
||||
#include <stdio.h>
|
||||
@@ -58,7 +57,7 @@ __host__ void host_addition(void *v_stream, uint32_t gpu_index, T *output,
|
||||
|
||||
auto stream = static_cast<cudaStream_t *>(v_stream);
|
||||
addition<<<grid, thds, 0, *stream>>>(output, input_1, input_2, num_entries);
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
@@ -77,12 +76,12 @@ __host__ void host_addition_plaintext(void *v_stream, uint32_t gpu_index,
|
||||
|
||||
auto stream = static_cast<cudaStream_t *>(v_stream);
|
||||
|
||||
checkCudaErrors(cudaMemcpyAsync(output, lwe_input,
|
||||
(input_lwe_dimension + 1) *
|
||||
input_lwe_ciphertext_count * sizeof(T),
|
||||
cudaMemcpyDeviceToDevice, *stream));
|
||||
check_cuda_error(cudaMemcpyAsync(output, lwe_input,
|
||||
(input_lwe_dimension + 1) *
|
||||
input_lwe_ciphertext_count * sizeof(T),
|
||||
cudaMemcpyDeviceToDevice, *stream));
|
||||
plaintext_addition<<<grid, thds, 0, *stream>>>(
|
||||
output, lwe_input, plaintext_input, input_lwe_dimension, num_entries);
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
#endif // CUDA_ADD_H
|
||||
|
||||
@@ -3,7 +3,6 @@
|
||||
|
||||
#include "cooperative_groups.h"
|
||||
|
||||
#include "../include/helper_cuda.h"
|
||||
#include "bootstrap.h"
|
||||
#include "bootstrap_low_latency.cuh"
|
||||
#include "device.h"
|
||||
@@ -156,7 +155,7 @@ __host__ void host_extract_bits(
|
||||
copy_and_shift_lwe<Torus, params><<<blocks, threads, 0, *stream>>>(
|
||||
lwe_array_in_buffer, lwe_array_in_shifted_buffer, lwe_array_in,
|
||||
1ll << (ciphertext_n_bits - delta_log - 1));
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
for (int bit_idx = 0; bit_idx < number_of_bits; bit_idx++) {
|
||||
cuda_keyswitch_lwe_ciphertext_vector(
|
||||
@@ -167,7 +166,7 @@ __host__ void host_extract_bits(
|
||||
copy_small_lwe<<<1, 256, 0, *stream>>>(
|
||||
list_lwe_array_out, lwe_array_out_ks_buffer, lwe_dimension_out + 1,
|
||||
number_of_bits, number_of_bits - bit_idx - 1);
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
if (bit_idx == number_of_bits - 1) {
|
||||
break;
|
||||
@@ -177,7 +176,7 @@ __host__ void host_extract_bits(
|
||||
add_to_body<Torus><<<1, 1, 0, *stream>>>(lwe_array_out_ks_buffer,
|
||||
lwe_dimension_out,
|
||||
1ll << (ciphertext_n_bits - 2));
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
// Fill lut for the current bit (equivalent to trivial encryption as mask is
|
||||
// 0s) The LUT is filled with -alpha in each coefficient where alpha =
|
||||
@@ -185,7 +184,7 @@ __host__ void host_extract_bits(
|
||||
fill_lut_body_for_current_bit<Torus, params>
|
||||
<<<blocks, threads, 0, *stream>>>(
|
||||
lut_pbs, 0ll - 1ll << (delta_log - 1 + bit_idx));
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
host_bootstrap_low_latency<Torus, params>(
|
||||
v_stream, gpu_index, lwe_array_out_pbs_buffer, lut_pbs,
|
||||
@@ -199,7 +198,7 @@ __host__ void host_extract_bits(
|
||||
lwe_array_in_shifted_buffer, lwe_array_in_buffer,
|
||||
lwe_array_out_pbs_buffer, 1ll << (delta_log - 1 + bit_idx),
|
||||
1ll << (ciphertext_n_bits - delta_log - bit_idx - 2));
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -3,7 +3,6 @@
|
||||
|
||||
#include "bootstrap.h"
|
||||
#include "device.h"
|
||||
#include "helper_cuda.h"
|
||||
#include "keyswitch.h"
|
||||
#include "linear_algebra.h"
|
||||
|
||||
@@ -48,7 +47,7 @@ extern "C" void cuda_boolean_and_32(
|
||||
cuda_memcpy_async_to_gpu(false_plaintext_array, h_false_plaintext_array,
|
||||
input_lwe_ciphertext_count * sizeof(uint32_t),
|
||||
stream, gpu_index);
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
uint32_t *lwe_buffer_2 = (uint32_t *)cuda_malloc_async(
|
||||
(input_lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint32_t),
|
||||
@@ -76,7 +75,7 @@ extern "C" void cuda_boolean_and_32(
|
||||
(glwe_dimension + 1) * polynomial_size *
|
||||
sizeof(uint32_t),
|
||||
stream, gpu_index);
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
check_cuda_error(cudaGetLastError());
|
||||
uint32_t *h_pbs_lut_indexes =
|
||||
(uint32_t *)malloc(input_lwe_ciphertext_count * sizeof(uint32_t));
|
||||
for (uint index = 0; index < input_lwe_ciphertext_count; index++) {
|
||||
@@ -87,19 +86,19 @@ extern "C" void cuda_boolean_and_32(
|
||||
cuda_memcpy_async_to_gpu(pbs_lut_indexes, h_pbs_lut_indexes,
|
||||
input_lwe_ciphertext_count * sizeof(uint32_t),
|
||||
stream, gpu_index);
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
check_cuda_error(cudaGetLastError());
|
||||
uint32_t *lwe_pbs_buffer = (uint32_t *)cuda_malloc_async(
|
||||
(glwe_dimension * polynomial_size + 1) * input_lwe_ciphertext_count *
|
||||
sizeof(uint32_t),
|
||||
stream, gpu_index);
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
|
||||
v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes,
|
||||
lwe_buffer_2, bootstrapping_key, input_lwe_dimension, glwe_dimension,
|
||||
polynomial_size, pbs_base_log, pbs_level_count,
|
||||
input_lwe_ciphertext_count, 1, 0, max_shared_memory);
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
cuda_drop_async(lwe_buffer_2, stream, gpu_index);
|
||||
cuda_drop_async(pbs_lut, stream, gpu_index);
|
||||
@@ -151,7 +150,7 @@ extern "C" void cuda_boolean_nand_32(
|
||||
cuda_memcpy_async_to_gpu(true_plaintext_array, h_true_plaintext_array,
|
||||
input_lwe_ciphertext_count * sizeof(uint32_t),
|
||||
stream, gpu_index);
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
uint32_t *lwe_buffer_3 = (uint32_t *)cuda_malloc_async(
|
||||
(input_lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint32_t),
|
||||
@@ -179,7 +178,7 @@ extern "C" void cuda_boolean_nand_32(
|
||||
(glwe_dimension + 1) * polynomial_size *
|
||||
sizeof(uint32_t),
|
||||
stream, gpu_index);
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
check_cuda_error(cudaGetLastError());
|
||||
uint32_t *h_pbs_lut_indexes =
|
||||
(uint32_t *)malloc(input_lwe_ciphertext_count * sizeof(uint32_t));
|
||||
for (uint index = 0; index < input_lwe_ciphertext_count; index++) {
|
||||
@@ -190,19 +189,19 @@ extern "C" void cuda_boolean_nand_32(
|
||||
cuda_memcpy_async_to_gpu(pbs_lut_indexes, h_pbs_lut_indexes,
|
||||
input_lwe_ciphertext_count * sizeof(uint32_t),
|
||||
stream, gpu_index);
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
check_cuda_error(cudaGetLastError());
|
||||
uint32_t *lwe_pbs_buffer = (uint32_t *)cuda_malloc_async(
|
||||
(glwe_dimension * polynomial_size + 1) * input_lwe_ciphertext_count *
|
||||
sizeof(uint32_t),
|
||||
stream, gpu_index);
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
|
||||
v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes,
|
||||
lwe_buffer_3, bootstrapping_key, input_lwe_dimension, glwe_dimension,
|
||||
polynomial_size, pbs_base_log, pbs_level_count,
|
||||
input_lwe_ciphertext_count, 1, 0, max_shared_memory);
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
cuda_drop_async(lwe_buffer_3, stream, gpu_index);
|
||||
cuda_drop_async(pbs_lut, stream, gpu_index);
|
||||
@@ -254,7 +253,7 @@ extern "C" void cuda_boolean_nor_32(
|
||||
cuda_memcpy_async_to_gpu(false_plaintext_array, h_false_plaintext_array,
|
||||
input_lwe_ciphertext_count * sizeof(uint32_t),
|
||||
stream, gpu_index);
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
uint32_t *lwe_buffer_3 = (uint32_t *)cuda_malloc_async(
|
||||
(input_lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint32_t),
|
||||
@@ -282,7 +281,7 @@ extern "C" void cuda_boolean_nor_32(
|
||||
(glwe_dimension + 1) * polynomial_size *
|
||||
sizeof(uint32_t),
|
||||
stream, gpu_index);
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
check_cuda_error(cudaGetLastError());
|
||||
uint32_t *h_pbs_lut_indexes =
|
||||
(uint32_t *)malloc(input_lwe_ciphertext_count * sizeof(uint32_t));
|
||||
for (uint index = 0; index < input_lwe_ciphertext_count; index++) {
|
||||
@@ -293,19 +292,19 @@ extern "C" void cuda_boolean_nor_32(
|
||||
cuda_memcpy_async_to_gpu(pbs_lut_indexes, h_pbs_lut_indexes,
|
||||
input_lwe_ciphertext_count * sizeof(uint32_t),
|
||||
stream, gpu_index);
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
check_cuda_error(cudaGetLastError());
|
||||
uint32_t *lwe_pbs_buffer = (uint32_t *)cuda_malloc_async(
|
||||
(glwe_dimension * polynomial_size + 1) * input_lwe_ciphertext_count *
|
||||
sizeof(uint32_t),
|
||||
stream, gpu_index);
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
|
||||
v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes,
|
||||
lwe_buffer_3, bootstrapping_key, input_lwe_dimension, glwe_dimension,
|
||||
polynomial_size, pbs_base_log, pbs_level_count,
|
||||
input_lwe_ciphertext_count, 1, 0, max_shared_memory);
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
cuda_drop_async(lwe_buffer_3, stream, gpu_index);
|
||||
cuda_drop_async(pbs_lut, stream, gpu_index);
|
||||
@@ -349,7 +348,7 @@ extern "C" void cuda_boolean_or_32(
|
||||
cuda_memcpy_async_to_gpu(true_plaintext_array, h_true_plaintext_array,
|
||||
input_lwe_ciphertext_count * sizeof(uint32_t),
|
||||
stream, gpu_index);
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
uint32_t *lwe_buffer_2 = (uint32_t *)cuda_malloc_async(
|
||||
(input_lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint32_t),
|
||||
@@ -377,7 +376,7 @@ extern "C" void cuda_boolean_or_32(
|
||||
(glwe_dimension + 1) * polynomial_size *
|
||||
sizeof(uint32_t),
|
||||
stream, gpu_index);
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
check_cuda_error(cudaGetLastError());
|
||||
uint32_t *h_pbs_lut_indexes =
|
||||
(uint32_t *)malloc(input_lwe_ciphertext_count * sizeof(uint32_t));
|
||||
for (uint index = 0; index < input_lwe_ciphertext_count; index++) {
|
||||
@@ -388,19 +387,19 @@ extern "C" void cuda_boolean_or_32(
|
||||
cuda_memcpy_async_to_gpu(pbs_lut_indexes, h_pbs_lut_indexes,
|
||||
input_lwe_ciphertext_count * sizeof(uint32_t),
|
||||
stream, gpu_index);
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
check_cuda_error(cudaGetLastError());
|
||||
uint32_t *lwe_pbs_buffer = (uint32_t *)cuda_malloc_async(
|
||||
(glwe_dimension * polynomial_size + 1) * input_lwe_ciphertext_count *
|
||||
sizeof(uint32_t),
|
||||
stream, gpu_index);
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
|
||||
v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes,
|
||||
lwe_buffer_2, bootstrapping_key, input_lwe_dimension, glwe_dimension,
|
||||
polynomial_size, pbs_base_log, pbs_level_count,
|
||||
input_lwe_ciphertext_count, 1, 0, max_shared_memory);
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
cuda_drop_async(lwe_buffer_2, stream, gpu_index);
|
||||
cuda_drop_async(pbs_lut, stream, gpu_index);
|
||||
@@ -444,7 +443,7 @@ extern "C" void cuda_boolean_xor_32(
|
||||
cuda_memcpy_async_to_gpu(true_plaintext_array, h_true_plaintext_array,
|
||||
input_lwe_ciphertext_count * sizeof(uint32_t),
|
||||
stream, gpu_index);
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
uint32_t *lwe_buffer_2 = (uint32_t *)cuda_malloc_async(
|
||||
(input_lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint32_t),
|
||||
@@ -468,7 +467,7 @@ extern "C" void cuda_boolean_xor_32(
|
||||
cuda_memcpy_async_to_gpu(cleartext_array, h_cleartext_array,
|
||||
input_lwe_ciphertext_count * sizeof(uint32_t),
|
||||
stream, gpu_index);
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
uint32_t *lwe_buffer_3 = (uint32_t *)cuda_malloc_async(
|
||||
(input_lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint32_t),
|
||||
@@ -493,7 +492,7 @@ extern "C" void cuda_boolean_xor_32(
|
||||
(glwe_dimension + 1) * polynomial_size *
|
||||
sizeof(uint32_t),
|
||||
stream, gpu_index);
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
check_cuda_error(cudaGetLastError());
|
||||
uint32_t *h_pbs_lut_indexes =
|
||||
(uint32_t *)malloc(input_lwe_ciphertext_count * sizeof(uint32_t));
|
||||
for (uint index = 0; index < input_lwe_ciphertext_count; index++) {
|
||||
@@ -504,19 +503,19 @@ extern "C" void cuda_boolean_xor_32(
|
||||
cuda_memcpy_async_to_gpu(pbs_lut_indexes, h_pbs_lut_indexes,
|
||||
input_lwe_ciphertext_count * sizeof(uint32_t),
|
||||
stream, gpu_index);
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
check_cuda_error(cudaGetLastError());
|
||||
uint32_t *lwe_pbs_buffer = (uint32_t *)cuda_malloc_async(
|
||||
(glwe_dimension * polynomial_size + 1) * input_lwe_ciphertext_count *
|
||||
sizeof(uint32_t),
|
||||
stream, gpu_index);
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
|
||||
v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes,
|
||||
lwe_buffer_3, bootstrapping_key, input_lwe_dimension, glwe_dimension,
|
||||
polynomial_size, pbs_base_log, pbs_level_count,
|
||||
input_lwe_ciphertext_count, 1, 0, max_shared_memory);
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
cuda_drop_async(lwe_buffer_3, stream, gpu_index);
|
||||
cuda_drop_async(pbs_lut, stream, gpu_index);
|
||||
@@ -560,7 +559,7 @@ extern "C" void cuda_boolean_xnor_32(
|
||||
cuda_memcpy_async_to_gpu(true_plaintext_array, h_true_plaintext_array,
|
||||
input_lwe_ciphertext_count * sizeof(uint32_t),
|
||||
stream, gpu_index);
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
uint32_t *lwe_buffer_2 = (uint32_t *)cuda_malloc_async(
|
||||
(input_lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint32_t),
|
||||
@@ -591,7 +590,7 @@ extern "C" void cuda_boolean_xnor_32(
|
||||
cuda_memcpy_async_to_gpu(cleartext_array, h_cleartext_array,
|
||||
input_lwe_ciphertext_count * sizeof(uint32_t),
|
||||
stream, gpu_index);
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
uint32_t *lwe_buffer_4 = (uint32_t *)cuda_malloc_async(
|
||||
(input_lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint32_t),
|
||||
@@ -616,7 +615,7 @@ extern "C" void cuda_boolean_xnor_32(
|
||||
(glwe_dimension + 1) * polynomial_size *
|
||||
sizeof(uint32_t),
|
||||
stream, gpu_index);
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
check_cuda_error(cudaGetLastError());
|
||||
uint32_t *h_pbs_lut_indexes =
|
||||
(uint32_t *)malloc(input_lwe_ciphertext_count * sizeof(uint32_t));
|
||||
for (uint index = 0; index < input_lwe_ciphertext_count; index++) {
|
||||
@@ -627,19 +626,19 @@ extern "C" void cuda_boolean_xnor_32(
|
||||
cuda_memcpy_async_to_gpu(pbs_lut_indexes, h_pbs_lut_indexes,
|
||||
input_lwe_ciphertext_count * sizeof(uint32_t),
|
||||
stream, gpu_index);
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
check_cuda_error(cudaGetLastError());
|
||||
uint32_t *lwe_pbs_buffer = (uint32_t *)cuda_malloc_async(
|
||||
(glwe_dimension * polynomial_size + 1) * input_lwe_ciphertext_count *
|
||||
sizeof(uint32_t),
|
||||
stream, gpu_index);
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
|
||||
v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes,
|
||||
lwe_buffer_4, bootstrapping_key, input_lwe_dimension, glwe_dimension,
|
||||
polynomial_size, pbs_base_log, pbs_level_count,
|
||||
input_lwe_ciphertext_count, 1, 0, max_shared_memory);
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
cuda_drop_async(lwe_buffer_4, stream, gpu_index);
|
||||
cuda_drop_async(pbs_lut, stream, gpu_index);
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
#ifdef __CDT_PARSER__
|
||||
#undef __CUDA_RUNTIME_H__
|
||||
#include <cuda_runtime.h>
|
||||
#include <helper_cuda.h>
|
||||
#endif
|
||||
|
||||
#ifndef CNCRT_AMORTIZED_PBS_H
|
||||
@@ -15,9 +14,7 @@
|
||||
#include "crypto/torus.cuh"
|
||||
#include "device.h"
|
||||
#include "fft/bnsmfft.cuh"
|
||||
#include "fft/smfft.cuh"
|
||||
#include "fft/twiddles.cuh"
|
||||
#include "helper_cuda.h"
|
||||
#include "polynomial/functions.cuh"
|
||||
#include "polynomial/parameters.cuh"
|
||||
#include "polynomial/polynomial.cuh"
|
||||
@@ -327,10 +324,10 @@ __host__ void host_bootstrap_amortized(
|
||||
// device then has to be allocated dynamically.
|
||||
// For lower compute capabilities, this call
|
||||
// just does nothing and the amount of shared memory used is 48 KB
|
||||
checkCudaErrors(cudaFuncSetAttribute(
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_bootstrap_amortized<Torus, params, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, SM_FULL));
|
||||
checkCudaErrors(cudaFuncSetCacheConfig(
|
||||
check_cuda_error(cudaFuncSetCacheConfig(
|
||||
device_bootstrap_amortized<Torus, params, FULLSM>,
|
||||
cudaFuncCachePreferShared));
|
||||
d_mem = (char *)cuda_malloc_async(0, stream, gpu_index);
|
||||
@@ -341,7 +338,7 @@ __host__ void host_bootstrap_amortized(
|
||||
bootstrapping_key, d_mem, input_lwe_dimension, polynomial_size,
|
||||
base_log, level_count, lwe_idx, 0);
|
||||
}
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
cuda_drop_async(d_mem, stream, gpu_index);
|
||||
}
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
#ifdef __CDT_PARSER__
|
||||
#undef __CUDA_RUNTIME_H__
|
||||
#include <cuda_runtime.h>
|
||||
#include <helper_cuda.h>
|
||||
#endif
|
||||
|
||||
#ifndef LOWLAT_PBS_H
|
||||
@@ -15,9 +14,7 @@
|
||||
#include "crypto/torus.cuh"
|
||||
#include "device.h"
|
||||
#include "fft/bnsmfft.cuh"
|
||||
#include "fft/smfft.cuh"
|
||||
#include "fft/twiddles.cuh"
|
||||
#include "helper_cuda.h"
|
||||
#include "polynomial/parameters.cuh"
|
||||
#include "polynomial/polynomial.cuh"
|
||||
#include "polynomial/polynomial_math.cuh"
|
||||
@@ -299,12 +296,12 @@ __host__ void host_bootstrap_low_latency(
|
||||
|
||||
if (max_shared_memory < SM_PART) {
|
||||
kernel_args[11] = &DM_FULL;
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
check_cuda_error(cudaGetLastError());
|
||||
d_mem = (char *)cuda_malloc_async(DM_FULL * input_lwe_ciphertext_count *
|
||||
level_count * 2,
|
||||
stream, gpu_index);
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
checkCudaErrors(cudaLaunchCooperativeKernel(
|
||||
check_cuda_error(cudaGetLastError());
|
||||
check_cuda_error(cudaLaunchCooperativeKernel(
|
||||
(void *)device_bootstrap_low_latency<Torus, params, NOSM>, grid, thds,
|
||||
(void **)kernel_args, 0, *stream));
|
||||
} else if (max_shared_memory < SM_FULL) {
|
||||
@@ -312,14 +309,14 @@ __host__ void host_bootstrap_low_latency(
|
||||
d_mem = (char *)cuda_malloc_async(DM_PART * input_lwe_ciphertext_count *
|
||||
level_count * 2,
|
||||
stream, gpu_index);
|
||||
checkCudaErrors(cudaFuncSetAttribute(
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_bootstrap_low_latency<Torus, params, PARTIALSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, SM_PART));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_bootstrap_low_latency<Torus, params, PARTIALSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
checkCudaErrors(cudaLaunchCooperativeKernel(
|
||||
check_cuda_error(cudaGetLastError());
|
||||
check_cuda_error(cudaLaunchCooperativeKernel(
|
||||
(void *)device_bootstrap_low_latency<Torus, params, PARTIALSM>, grid,
|
||||
thds, (void **)kernel_args, SM_PART, *stream));
|
||||
|
||||
@@ -327,17 +324,17 @@ __host__ void host_bootstrap_low_latency(
|
||||
int DM_NONE = 0;
|
||||
kernel_args[11] = &DM_NONE;
|
||||
d_mem = (char *)cuda_malloc_async(0, stream, gpu_index);
|
||||
checkCudaErrors(cudaFuncSetAttribute(
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_bootstrap_low_latency<Torus, params, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, SM_FULL));
|
||||
cudaFuncSetCacheConfig(device_bootstrap_low_latency<Torus, params, FULLSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
checkCudaErrors(cudaLaunchCooperativeKernel(
|
||||
check_cuda_error(cudaLaunchCooperativeKernel(
|
||||
(void *)device_bootstrap_low_latency<Torus, params, FULLSM>, grid, thds,
|
||||
(void **)kernel_args, SM_FULL, *stream));
|
||||
}
|
||||
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
check_cuda_error(cudaGetLastError());
|
||||
// Synchronize the streams before copying the result to lwe_array_out at the
|
||||
// right place
|
||||
cuda_drop_async(mask_buffer_fft, stream, gpu_index);
|
||||
|
||||
@@ -5,7 +5,6 @@
|
||||
#include "bootstrap.h"
|
||||
#include "bootstrap_amortized.cuh"
|
||||
#include "device.h"
|
||||
#include "helper_cuda.h"
|
||||
#include "keyswitch.cuh"
|
||||
#include "polynomial/parameters.cuh"
|
||||
#include "utils/timer.cuh"
|
||||
|
||||
@@ -85,10 +85,10 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
|
||||
case 512:
|
||||
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
|
||||
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
|
||||
checkCudaErrors(cudaFuncSetAttribute(
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
batch_NSMFFT<FFTDegree<Degree<512>, ForwardFFT>, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
|
||||
checkCudaErrors(cudaFuncSetCacheConfig(
|
||||
check_cuda_error(cudaFuncSetCacheConfig(
|
||||
batch_NSMFFT<FFTDegree<Degree<512>, ForwardFFT>, FULLSM>,
|
||||
cudaFuncCachePreferShared));
|
||||
batch_NSMFFT<FFTDegree<Degree<512>, ForwardFFT>, FULLSM>
|
||||
@@ -104,10 +104,10 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
|
||||
case 1024:
|
||||
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
|
||||
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
|
||||
checkCudaErrors(cudaFuncSetAttribute(
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
batch_NSMFFT<FFTDegree<Degree<1024>, ForwardFFT>, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
|
||||
checkCudaErrors(cudaFuncSetCacheConfig(
|
||||
check_cuda_error(cudaFuncSetCacheConfig(
|
||||
batch_NSMFFT<FFTDegree<Degree<1024>, ForwardFFT>, FULLSM>,
|
||||
cudaFuncCachePreferShared));
|
||||
batch_NSMFFT<FFTDegree<Degree<1024>, ForwardFFT>, FULLSM>
|
||||
@@ -123,10 +123,10 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
|
||||
case 2048:
|
||||
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
|
||||
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
|
||||
checkCudaErrors(cudaFuncSetAttribute(
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
batch_NSMFFT<FFTDegree<Degree<2048>, ForwardFFT>, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
|
||||
checkCudaErrors(cudaFuncSetCacheConfig(
|
||||
check_cuda_error(cudaFuncSetCacheConfig(
|
||||
batch_NSMFFT<FFTDegree<Degree<2048>, ForwardFFT>, FULLSM>,
|
||||
cudaFuncCachePreferShared));
|
||||
batch_NSMFFT<FFTDegree<Degree<2048>, ForwardFFT>, FULLSM>
|
||||
@@ -142,10 +142,10 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
|
||||
case 4096:
|
||||
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
|
||||
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
|
||||
checkCudaErrors(cudaFuncSetAttribute(
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
batch_NSMFFT<FFTDegree<Degree<4096>, ForwardFFT>, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
|
||||
checkCudaErrors(cudaFuncSetCacheConfig(
|
||||
check_cuda_error(cudaFuncSetCacheConfig(
|
||||
batch_NSMFFT<FFTDegree<Degree<4096>, ForwardFFT>, FULLSM>,
|
||||
cudaFuncCachePreferShared));
|
||||
batch_NSMFFT<FFTDegree<Degree<4096>, ForwardFFT>, FULLSM>
|
||||
@@ -161,10 +161,10 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
|
||||
case 8192:
|
||||
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
|
||||
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
|
||||
checkCudaErrors(cudaFuncSetAttribute(
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
batch_NSMFFT<FFTDegree<Degree<8192>, ForwardFFT>, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
|
||||
checkCudaErrors(cudaFuncSetCacheConfig(
|
||||
check_cuda_error(cudaFuncSetCacheConfig(
|
||||
batch_NSMFFT<FFTDegree<Degree<8192>, ForwardFFT>, FULLSM>,
|
||||
cudaFuncCachePreferShared));
|
||||
batch_NSMFFT<FFTDegree<Degree<8192>, ForwardFFT>, FULLSM>
|
||||
|
||||
@@ -64,13 +64,13 @@ void batch_fft_ggsw_vector(cudaStream_t *stream, double2 *dest, T *src,
|
||||
d_mem = (char *)cuda_malloc_async(shared_memory_size, stream, gpu_index);
|
||||
device_batch_fft_ggsw_vector<T, ST, params, NOSM>
|
||||
<<<gridSize, blockSize, 0, *stream>>>(dest, src, d_mem);
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
check_cuda_error(cudaGetLastError());
|
||||
cuda_drop_async(d_mem, stream, gpu_index);
|
||||
} else {
|
||||
device_batch_fft_ggsw_vector<T, ST, params, FULLSM>
|
||||
<<<gridSize, blockSize, shared_memory_size, *stream>>>(dest, src,
|
||||
d_mem);
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
#include <cstdint>
|
||||
#include <cstdio>
|
||||
#include <cuda_runtime.h>
|
||||
#include <helper_cuda.h>
|
||||
|
||||
/// Unsafe function to create a CUDA stream, must check first that GPU exists
|
||||
cudaStream_t *cuda_create_stream(uint32_t gpu_index) {
|
||||
@@ -25,7 +24,8 @@ int cuda_destroy_stream(cudaStream_t *stream, uint32_t gpu_index) {
|
||||
void *cuda_malloc(uint64_t size, uint32_t gpu_index) {
|
||||
cudaSetDevice(gpu_index);
|
||||
void *ptr;
|
||||
checkCudaErrors(cudaMalloc((void **)&ptr, size));
|
||||
cudaMalloc((void **)&ptr, size);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
return ptr;
|
||||
}
|
||||
@@ -37,13 +37,14 @@ void *cuda_malloc_async(uint64_t size, cudaStream_t *stream,
|
||||
void *ptr;
|
||||
|
||||
int support_async_alloc;
|
||||
checkCudaErrors(cudaDeviceGetAttribute(
|
||||
check_cuda_error(cudaDeviceGetAttribute(
|
||||
&support_async_alloc, cudaDevAttrMemoryPoolsSupported, gpu_index));
|
||||
|
||||
if (support_async_alloc)
|
||||
checkCudaErrors(cudaMallocAsync((void **)&ptr, size, *stream));
|
||||
else
|
||||
checkCudaErrors(cudaMalloc((void **)&ptr, size));
|
||||
if (support_async_alloc) {
|
||||
check_cuda_error(cudaMallocAsync((void **)&ptr, size, *stream));
|
||||
} else {
|
||||
check_cuda_error(cudaMalloc((void **)&ptr, size));
|
||||
}
|
||||
return ptr;
|
||||
}
|
||||
|
||||
@@ -91,7 +92,7 @@ int cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
|
||||
}
|
||||
|
||||
cudaSetDevice(gpu_index);
|
||||
checkCudaErrors(
|
||||
check_cuda_error(
|
||||
cudaMemcpyAsync(dest, src, size, cudaMemcpyHostToDevice, *stream));
|
||||
return 0;
|
||||
}
|
||||
@@ -133,7 +134,7 @@ int cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
|
||||
}
|
||||
|
||||
cudaSetDevice(gpu_index);
|
||||
checkCudaErrors(
|
||||
check_cuda_error(
|
||||
cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToHost, *stream));
|
||||
return 0;
|
||||
}
|
||||
@@ -152,7 +153,7 @@ int cuda_drop(void *ptr, uint32_t gpu_index) {
|
||||
return -2;
|
||||
}
|
||||
cudaSetDevice(gpu_index);
|
||||
checkCudaErrors(cudaFree(ptr));
|
||||
check_cuda_error(cudaFree(ptr));
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -160,13 +161,14 @@ int cuda_drop(void *ptr, uint32_t gpu_index) {
|
||||
int cuda_drop_async(void *ptr, cudaStream_t *stream, uint32_t gpu_index) {
|
||||
|
||||
int support_async_alloc;
|
||||
checkCudaErrors(cudaDeviceGetAttribute(
|
||||
check_cuda_error(cudaDeviceGetAttribute(
|
||||
&support_async_alloc, cudaDevAttrMemoryPoolsSupported, gpu_index));
|
||||
|
||||
if (support_async_alloc)
|
||||
checkCudaErrors(cudaFreeAsync(ptr, *stream));
|
||||
else
|
||||
checkCudaErrors(cudaFree(ptr));
|
||||
if (support_async_alloc) {
|
||||
check_cuda_error(cudaFreeAsync(ptr, *stream));
|
||||
} else {
|
||||
check_cuda_error(cudaFree(ptr));
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#ifndef GPU_BOOTSTRAP_FFT_1024_CUH
|
||||
#define GPU_BOOTSTRAP_FFT_1024_CUH
|
||||
#ifndef GPU_BOOTSTRAP_FFT_CUH
|
||||
#define GPU_BOOTSTRAP_FFT_CUH
|
||||
|
||||
#include "complex/operations.cuh"
|
||||
#include "polynomial/functions.cuh"
|
||||
@@ -21,9 +21,6 @@
|
||||
* w_j,k = exp(-i pi j/2^k)
|
||||
* is replaced with:
|
||||
* \zeta_j,k = exp(-i pi (2j-1)/2^k)
|
||||
* - this technique also implies a correction of the
|
||||
* complex obtained after the FFT, which is done in the
|
||||
* forward_negacyclic_fft_inplace function of bootstrap.cuh
|
||||
*/
|
||||
template <class params> __device__ void NSMFFT_direct(double2 *A) {
|
||||
|
||||
@@ -118,7 +115,7 @@ template <class params> __device__ void NSMFFT_inverse(double2 *A) {
|
||||
|
||||
// none of the twiddles have equal real and imag part, so
|
||||
// complete complex multiplication has to be done
|
||||
// here we have more than one twiddles
|
||||
// here we have more than one twiddle
|
||||
while (m > 1) {
|
||||
tid = threadIdx.x;
|
||||
m >>= 1;
|
||||
@@ -145,7 +142,7 @@ template <class params> __device__ void NSMFFT_inverse(double2 *A) {
|
||||
/*
|
||||
* global batch fft
|
||||
* does fft in half size
|
||||
* unrolling halfsize fft result in half size + 1 eleemnts
|
||||
* unrolling half size fft result in half size + 1 elements
|
||||
* this function must be called with actual degree
|
||||
* function takes as input already compressed input
|
||||
*/
|
||||
@@ -174,4 +171,4 @@ __global__ void batch_NSMFFT(double2 *d_input, double2 *d_output,
|
||||
}
|
||||
}
|
||||
|
||||
#endif // GPU_BOOTSTRAP_FFT_1024_CUH
|
||||
#endif // GPU_BOOTSTRAP_FFT_CUH
|
||||
|
||||
@@ -1,402 +0,0 @@
|
||||
/*
|
||||
#ifndef GPU_BOOTSTRAP_SMFFT_CUH
|
||||
#define GPU_BOOTSTRAP_SMFFT_CUH
|
||||
|
||||
#include "../complex/operations.cuh"
|
||||
#include "twiddles.cuh"
|
||||
|
||||
__device__ __inline__ double2 Get_W_value_inverse(int index) {
|
||||
double2 ctemp = _gTwiddles[index];
|
||||
ctemp.y = -ctemp.y;
|
||||
return (ctemp);
|
||||
}
|
||||
template <class params>
|
||||
__device__ double2 Get_after_inverse_fft_twiddle(int index) {
|
||||
double2 ctemp;
|
||||
switch (params::degree) {
|
||||
case 512:
|
||||
ctemp = INVERSE_TWIDDLES_512[index];
|
||||
break;
|
||||
case 1024:
|
||||
ctemp = gTwiddles1024[index];
|
||||
ctemp.x /= params::degree;
|
||||
ctemp.y /= -params::degree;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
return ctemp;
|
||||
}
|
||||
|
||||
__device__ __inline__ double shfl(double *value, int par) {
|
||||
#if (CUDART_VERSION >= 9000)
|
||||
return (__shfl_sync(0xffffffff, (*value), par));
|
||||
#else
|
||||
return (__shfl((*value), par));
|
||||
#endif
|
||||
}
|
||||
|
||||
__device__ __inline__ double shfl_xor(double *value, int par) {
|
||||
#if (CUDART_VERSION >= 9000)
|
||||
return (__shfl_xor_sync(0xffffffff, (*value), par));
|
||||
#else
|
||||
return (__shfl_xor((*value), par));
|
||||
#endif
|
||||
}
|
||||
|
||||
__device__ __inline__ double shfl_down(double *value, int par) {
|
||||
#if (CUDART_VERSION >= 9000)
|
||||
return (__shfl_down_sync(0xffffffff, (*value), par));
|
||||
#else
|
||||
return (__shfl_down((*value), par));
|
||||
#endif
|
||||
}
|
||||
|
||||
__device__ __inline__ void
|
||||
reorder_16_register(double2 *A_DFT_value, double2 *B_DFT_value,
|
||||
double2 *C_DFT_value, double2 *D_DFT_value, int *local_id) {
|
||||
double2 Af2temp, Bf2temp, Cf2temp, Df2temp;
|
||||
unsigned int target = (((unsigned int)__brev(((*local_id) & 15))) >> (28)) +
|
||||
16 * ((*local_id) >> 4);
|
||||
Af2temp.x = shfl(&(A_DFT_value->x), target);
|
||||
Af2temp.y = shfl(&(A_DFT_value->y), target);
|
||||
Bf2temp.x = shfl(&(B_DFT_value->x), target);
|
||||
Bf2temp.y = shfl(&(B_DFT_value->y), target);
|
||||
Cf2temp.x = shfl(&(C_DFT_value->x), target);
|
||||
Cf2temp.y = shfl(&(C_DFT_value->y), target);
|
||||
Df2temp.x = shfl(&(D_DFT_value->x), target);
|
||||
Df2temp.y = shfl(&(D_DFT_value->y), target);
|
||||
__syncwarp();
|
||||
(*A_DFT_value) = Af2temp;
|
||||
(*B_DFT_value) = Bf2temp;
|
||||
(*C_DFT_value) = Cf2temp;
|
||||
(*D_DFT_value) = Df2temp;
|
||||
}
|
||||
|
||||
__device__ __inline__ void reorder_32_register(double2 *A_DFT_value,
|
||||
double2 *B_DFT_value,
|
||||
double2 *C_DFT_value,
|
||||
double2 *D_DFT_value) {
|
||||
double2 Af2temp, Bf2temp, Cf2temp, Df2temp;
|
||||
unsigned int target = ((unsigned int)__brev(threadIdx.x)) >> (27);
|
||||
Af2temp.x = shfl(&(A_DFT_value->x), target);
|
||||
Af2temp.y = shfl(&(A_DFT_value->y), target);
|
||||
Bf2temp.x = shfl(&(B_DFT_value->x), target);
|
||||
Bf2temp.y = shfl(&(B_DFT_value->y), target);
|
||||
Cf2temp.x = shfl(&(C_DFT_value->x), target);
|
||||
Cf2temp.y = shfl(&(C_DFT_value->y), target);
|
||||
Df2temp.x = shfl(&(D_DFT_value->x), target);
|
||||
Df2temp.y = shfl(&(D_DFT_value->y), target);
|
||||
__syncwarp();
|
||||
(*A_DFT_value) = Af2temp;
|
||||
(*B_DFT_value) = Bf2temp;
|
||||
(*C_DFT_value) = Cf2temp;
|
||||
(*D_DFT_value) = Df2temp;
|
||||
}
|
||||
|
||||
template <class params>
|
||||
__device__ __inline__ void
|
||||
reorder_512(double2 *s_input, double2 *A_DFT_value, double2 *B_DFT_value,
|
||||
double2 *C_DFT_value, double2 *D_DFT_value) {
|
||||
int local_id = threadIdx.x & (params::warp - 1);
|
||||
int warp_id = threadIdx.x / params::warp;
|
||||
|
||||
// reorder elements within warp so we can save them in semi-transposed manner
|
||||
// into shared memory
|
||||
reorder_32_register(A_DFT_value, B_DFT_value, C_DFT_value, D_DFT_value);
|
||||
|
||||
// reorder elements within warp so we can save them in semi-transposed manner
|
||||
// into shared memory
|
||||
__syncthreads();
|
||||
unsigned int sm_store_pos =
|
||||
(local_id >> 1) + 16 * (local_id & 1) + warp_id * 132;
|
||||
s_input[sm_store_pos] = *A_DFT_value;
|
||||
s_input[sm_store_pos + 33] = *B_DFT_value;
|
||||
s_input[66 + sm_store_pos] = *C_DFT_value;
|
||||
s_input[66 + sm_store_pos + 33] = *D_DFT_value;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// Read shared memory to get reordered input
|
||||
unsigned int sm_read_pos = (local_id & 15) * 32 + local_id + warp_id * 4;
|
||||
__syncthreads();
|
||||
*A_DFT_value = s_input[sm_read_pos + 0];
|
||||
*B_DFT_value = s_input[sm_read_pos + 1];
|
||||
*C_DFT_value = s_input[sm_read_pos + 2];
|
||||
*D_DFT_value = s_input[sm_read_pos + 3];
|
||||
|
||||
__syncthreads();
|
||||
reorder_16_register(A_DFT_value, B_DFT_value, C_DFT_value, D_DFT_value,
|
||||
&local_id);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
template <class params>
|
||||
__device__ __inline__ void
|
||||
reorder_1024(double2 *s_input, double2 *A_DFT_value, double2 *B_DFT_value,
|
||||
double2 *C_DFT_value, double2 *D_DFT_value) {
|
||||
int local_id = threadIdx.x & (params::warp - 1);
|
||||
int warp_id = threadIdx.x / params::warp;
|
||||
|
||||
// reorder elements within params::warp so we can save them in semi-transposed
|
||||
// manner into shared memory
|
||||
reorder_32_register(A_DFT_value, B_DFT_value, C_DFT_value, D_DFT_value);
|
||||
|
||||
// reorder elements within params::warp so we can save them in semi-transposed
|
||||
// manner into shared memory
|
||||
__syncthreads();
|
||||
unsigned int sm_store_pos =
|
||||
(local_id >> 0) + 32 * (local_id & 0) + warp_id * 132;
|
||||
s_input[sm_store_pos] = *A_DFT_value;
|
||||
s_input[sm_store_pos + 33] = *B_DFT_value;
|
||||
s_input[66 + sm_store_pos] = *C_DFT_value;
|
||||
s_input[66 + sm_store_pos + 33] = *D_DFT_value;
|
||||
|
||||
// Read shared memory to get reordered input
|
||||
unsigned int sm_read_pos = (local_id & 31) * 32 + local_id + warp_id * 4;
|
||||
__syncthreads();
|
||||
*A_DFT_value = s_input[sm_read_pos + 0];
|
||||
*B_DFT_value = s_input[sm_read_pos + 1];
|
||||
*C_DFT_value = s_input[sm_read_pos + 2];
|
||||
*D_DFT_value = s_input[sm_read_pos + 3];
|
||||
|
||||
__syncthreads();
|
||||
reorder_32_register(A_DFT_value, B_DFT_value, C_DFT_value, D_DFT_value);
|
||||
}
|
||||
|
||||
__device__ bool printOnce = true;
|
||||
|
||||
template <class params> __device__ void do_SMFFT_CT_DIT(double2 *s_input) {
|
||||
double2 A_DFT_value, B_DFT_value, C_DFT_value, D_DFT_value;
|
||||
double2 W;
|
||||
double2 Aftemp, Bftemp, Cftemp, Dftemp;
|
||||
|
||||
int j, m_param;
|
||||
int parity, itemp;
|
||||
int A_read_index, B_read_index, C_read_index, D_read_index;
|
||||
int PoT, PoTp1, q;
|
||||
|
||||
int local_id = threadIdx.x & (params::warp - 1);
|
||||
int warp_id = threadIdx.x / params::warp;
|
||||
A_DFT_value = s_input[local_id + (warp_id << 2) * params::warp];
|
||||
B_DFT_value =
|
||||
s_input[local_id + (warp_id << 2) * params::warp + params::warp];
|
||||
C_DFT_value =
|
||||
s_input[local_id + (warp_id << 2) * params::warp + 2 * params::warp];
|
||||
D_DFT_value =
|
||||
s_input[local_id + (warp_id << 2) * params::warp + 3 * params::warp];
|
||||
|
||||
switch (params::log2_degree) {
|
||||
case 9:
|
||||
reorder_512<params>(s_input, &A_DFT_value, &B_DFT_value, &C_DFT_value,
|
||||
&D_DFT_value);
|
||||
break;
|
||||
case 10:
|
||||
reorder_1024<params>(s_input, &A_DFT_value, &B_DFT_value, &C_DFT_value,
|
||||
&D_DFT_value);
|
||||
break;
|
||||
// case 11:
|
||||
// reorder_2048<params, opt>(s_input, &A_DFT_value, &B_DFT_value,
|
||||
//&C_DFT_value, &D_DFT_value); break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
//----> FFT
|
||||
PoT = 1;
|
||||
PoTp1 = 2;
|
||||
|
||||
//--> First iteration
|
||||
itemp = local_id & 1;
|
||||
parity = (1 - itemp * 2);
|
||||
|
||||
A_DFT_value.x = parity * A_DFT_value.x + shfl_xor(&A_DFT_value.x, 1);
|
||||
A_DFT_value.y = parity * A_DFT_value.y + shfl_xor(&A_DFT_value.y, 1);
|
||||
B_DFT_value.x = parity * B_DFT_value.x + shfl_xor(&B_DFT_value.x, 1);
|
||||
B_DFT_value.y = parity * B_DFT_value.y + shfl_xor(&B_DFT_value.y, 1);
|
||||
C_DFT_value.x = parity * C_DFT_value.x + shfl_xor(&C_DFT_value.x, 1);
|
||||
C_DFT_value.y = parity * C_DFT_value.y + shfl_xor(&C_DFT_value.y, 1);
|
||||
D_DFT_value.x = parity * D_DFT_value.x + shfl_xor(&D_DFT_value.x, 1);
|
||||
D_DFT_value.y = parity * D_DFT_value.y + shfl_xor(&D_DFT_value.y, 1);
|
||||
|
||||
//--> Second through Fifth iteration (no synchronization)
|
||||
PoT = 2;
|
||||
PoTp1 = 4;
|
||||
for (q = 1; q < 5; q++) {
|
||||
m_param = (local_id & (PoTp1 - 1));
|
||||
itemp = m_param >> q;
|
||||
parity = ((itemp << 1) - 1);
|
||||
if (params::fft_direction)
|
||||
W = Get_W_value_inverse((q - 1) * 257 + itemp * m_param);
|
||||
else
|
||||
W = _gTwiddles[(q - 1) * 257 + itemp * m_param];
|
||||
Aftemp.x = W.x * A_DFT_value.x - W.y * A_DFT_value.y;
|
||||
Aftemp.y = W.x * A_DFT_value.y + W.y * A_DFT_value.x;
|
||||
Bftemp.x = W.x * B_DFT_value.x - W.y * B_DFT_value.y;
|
||||
Bftemp.y = W.x * B_DFT_value.y + W.y * B_DFT_value.x;
|
||||
Cftemp.x = W.x * C_DFT_value.x - W.y * C_DFT_value.y;
|
||||
Cftemp.y = W.x * C_DFT_value.y + W.y * C_DFT_value.x;
|
||||
Dftemp.x = W.x * D_DFT_value.x - W.y * D_DFT_value.y;
|
||||
Dftemp.y = W.x * D_DFT_value.y + W.y * D_DFT_value.x;
|
||||
|
||||
A_DFT_value.x = Aftemp.x + parity * shfl_xor(&Aftemp.x, PoT);
|
||||
A_DFT_value.y = Aftemp.y + parity * shfl_xor(&Aftemp.y, PoT);
|
||||
B_DFT_value.x = Bftemp.x + parity * shfl_xor(&Bftemp.x, PoT);
|
||||
B_DFT_value.y = Bftemp.y + parity * shfl_xor(&Bftemp.y, PoT);
|
||||
C_DFT_value.x = Cftemp.x + parity * shfl_xor(&Cftemp.x, PoT);
|
||||
C_DFT_value.y = Cftemp.y + parity * shfl_xor(&Cftemp.y, PoT);
|
||||
D_DFT_value.x = Dftemp.x + parity * shfl_xor(&Dftemp.x, PoT);
|
||||
D_DFT_value.y = Dftemp.y + parity * shfl_xor(&Dftemp.y, PoT);
|
||||
|
||||
PoT = PoT << 1;
|
||||
PoTp1 = PoTp1 << 1;
|
||||
}
|
||||
|
||||
itemp = local_id + (warp_id << 2) * params::warp;
|
||||
s_input[itemp] = A_DFT_value;
|
||||
s_input[itemp + params::warp] = B_DFT_value;
|
||||
s_input[itemp + 2 * params::warp] = C_DFT_value;
|
||||
s_input[itemp + 3 * params::warp] = D_DFT_value;
|
||||
|
||||
for (q = 5; q < (params::log2_degree - 1); q++) {
|
||||
__syncthreads();
|
||||
m_param = threadIdx.x & (PoT - 1);
|
||||
j = threadIdx.x >> q;
|
||||
|
||||
if (params::fft_direction)
|
||||
W = Get_W_value_inverse((q - 1) * 257 + m_param);
|
||||
else
|
||||
W = _gTwiddles[(q - 1) * 257 + m_param];
|
||||
|
||||
A_read_index = j * (PoTp1 << 1) + m_param;
|
||||
B_read_index = j * (PoTp1 << 1) + m_param + PoT;
|
||||
C_read_index = j * (PoTp1 << 1) + m_param + PoTp1;
|
||||
D_read_index = j * (PoTp1 << 1) + m_param + 3 * PoT;
|
||||
|
||||
Aftemp = s_input[A_read_index];
|
||||
Bftemp = s_input[B_read_index];
|
||||
A_DFT_value.x = Aftemp.x + W.x * Bftemp.x - W.y * Bftemp.y;
|
||||
A_DFT_value.y = Aftemp.y + W.x * Bftemp.y + W.y * Bftemp.x;
|
||||
B_DFT_value.x = Aftemp.x - W.x * Bftemp.x + W.y * Bftemp.y;
|
||||
B_DFT_value.y = Aftemp.y - W.x * Bftemp.y - W.y * Bftemp.x;
|
||||
|
||||
Cftemp = s_input[C_read_index];
|
||||
Dftemp = s_input[D_read_index];
|
||||
C_DFT_value.x = Cftemp.x + W.x * Dftemp.x - W.y * Dftemp.y;
|
||||
C_DFT_value.y = Cftemp.y + W.x * Dftemp.y + W.y * Dftemp.x;
|
||||
D_DFT_value.x = Cftemp.x - W.x * Dftemp.x + W.y * Dftemp.y;
|
||||
D_DFT_value.y = Cftemp.y - W.x * Dftemp.y - W.y * Dftemp.x;
|
||||
|
||||
s_input[A_read_index] = A_DFT_value;
|
||||
s_input[B_read_index] = B_DFT_value;
|
||||
s_input[C_read_index] = C_DFT_value;
|
||||
s_input[D_read_index] = D_DFT_value;
|
||||
|
||||
PoT = PoT << 1;
|
||||
PoTp1 = PoTp1 << 1;
|
||||
}
|
||||
|
||||
// last iteration
|
||||
if (params::log2_degree > 6) {
|
||||
__syncthreads();
|
||||
m_param = threadIdx.x;
|
||||
|
||||
if (params::fft_direction)
|
||||
W = Get_W_value_inverse((q - 1) * 257 + m_param);
|
||||
else
|
||||
W = _gTwiddles[(q - 1) * 257 + m_param];
|
||||
|
||||
A_read_index = m_param;
|
||||
B_read_index = m_param + PoT;
|
||||
C_read_index = m_param + (PoT >> 1);
|
||||
D_read_index = m_param + 3 * (PoT >> 1);
|
||||
|
||||
Aftemp = s_input[A_read_index];
|
||||
Bftemp = s_input[B_read_index];
|
||||
A_DFT_value.x = Aftemp.x + W.x * Bftemp.x - W.y * Bftemp.y;
|
||||
A_DFT_value.y = Aftemp.y + W.x * Bftemp.y + W.y * Bftemp.x;
|
||||
B_DFT_value.x = Aftemp.x - W.x * Bftemp.x + W.y * Bftemp.y;
|
||||
B_DFT_value.y = Aftemp.y - W.x * Bftemp.y - W.y * Bftemp.x;
|
||||
|
||||
Cftemp = s_input[C_read_index];
|
||||
Dftemp = s_input[D_read_index];
|
||||
C_DFT_value.x = Cftemp.x + W.y * Dftemp.x + W.x * Dftemp.y;
|
||||
C_DFT_value.y = Cftemp.y + W.y * Dftemp.y - W.x * Dftemp.x;
|
||||
D_DFT_value.x = Cftemp.x - W.y * Dftemp.x - W.x * Dftemp.y;
|
||||
D_DFT_value.y = Cftemp.y - W.y * Dftemp.y + W.x * Dftemp.x;
|
||||
|
||||
s_input[A_read_index] = A_DFT_value;
|
||||
s_input[B_read_index] = B_DFT_value;
|
||||
s_input[C_read_index] = C_DFT_value;
|
||||
s_input[D_read_index] = D_DFT_value;
|
||||
}
|
||||
}
|
||||
|
||||
template <class params>
|
||||
__global__ void SMFFT_DIT_external(double2 *d_input, double2 *d_output) {
|
||||
__syncthreads();
|
||||
|
||||
extern __shared__ double2 sharedmemBSK[];
|
||||
|
||||
double2 *s_input = sharedmemBSK;
|
||||
|
||||
int cTid = threadIdx.x * params::opt;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt; i++) {
|
||||
double2 tmp;
|
||||
switch (params::degree) {
|
||||
case 512:
|
||||
tmp = INVERSE_TWIDDLES_512[cTid];
|
||||
tmp.x *= params::degree;
|
||||
tmp.y *= -params::degree;
|
||||
break;
|
||||
case 1024:
|
||||
tmp = gTwiddles1024[cTid];
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
d_input[blockIdx.x * params::degree + cTid] *= tmp;
|
||||
cTid++;
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
s_input[threadIdx.x] = d_input[threadIdx.x + blockIdx.x * params::degree];
|
||||
s_input[threadIdx.x + params::quarter] =
|
||||
d_input[threadIdx.x + blockIdx.x * params::degree + params::quarter];
|
||||
s_input[threadIdx.x + params::half] =
|
||||
d_input[threadIdx.x + blockIdx.x * params::degree + params::half];
|
||||
s_input[threadIdx.x + params::three_quarters] =
|
||||
d_input[threadIdx.x + blockIdx.x * params::degree +
|
||||
params::three_quarters];
|
||||
|
||||
__syncthreads();
|
||||
|
||||
do_SMFFT_CT_DIT<params>(s_input);
|
||||
if (threadIdx.x == 0 && blockIdx.x == 0) {
|
||||
for (int i = 0; i < 1024; i++)
|
||||
printf("smfft[%u] %.10f %.10f\n", i, s_input[i].x, s_input[i].y);
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
|
||||
|
||||
__syncthreads();
|
||||
d_output[threadIdx.x + blockIdx.x * params::degree] = s_input[threadIdx.x];
|
||||
d_output[threadIdx.x + blockIdx.x * params::degree + params::quarter] =
|
||||
s_input[threadIdx.x + params::quarter];
|
||||
d_output[threadIdx.x + blockIdx.x * params::degree + params::half] =
|
||||
s_input[threadIdx.x + params::half];
|
||||
d_output[threadIdx.x + blockIdx.x * params::degree + params::three_quarters] =
|
||||
s_input[threadIdx.x + params::three_quarters];
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
#endif // GPU_BOOTSTRAP_SMFFT_CUH
|
||||
|
||||
*/
|
||||
@@ -203,7 +203,7 @@ __host__ void cuda_keyswitch_lwe_ciphertext_vector(
|
||||
keyswitch<<<grid, threads, shared_mem, *stream>>>(
|
||||
lwe_array_out, lwe_array_in, ksk, lwe_dimension_in, lwe_dimension_out,
|
||||
base_log, level_count, lwe_lower, lwe_upper, cutoff);
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
|
||||
@@ -4,10 +4,9 @@
|
||||
#ifdef __CDT_PARSER__
|
||||
#undef __CUDA_RUNTIME_H__
|
||||
#include <cuda_runtime.h>
|
||||
#include <helper_cuda.h>
|
||||
#endif
|
||||
|
||||
#include "../include/helper_cuda.h"
|
||||
#include "device.h"
|
||||
#include "linear_algebra.h"
|
||||
#include "utils/kernel_dimensions.cuh"
|
||||
|
||||
@@ -46,7 +45,7 @@ host_cleartext_multiplication(void *v_stream, uint32_t gpu_index, T *output,
|
||||
auto stream = static_cast<cudaStream_t *>(v_stream);
|
||||
cleartext_multiplication<<<grid, thds, 0, *stream>>>(
|
||||
output, lwe_input, cleartext_input, input_lwe_dimension, num_entries);
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
#endif // CUDA_MULT_H
|
||||
|
||||
@@ -4,10 +4,9 @@
|
||||
#ifdef __CDT_PARSER__
|
||||
#undef __CUDA_RUNTIME_H__
|
||||
#include <cuda_runtime.h>
|
||||
#include <helper_cuda.h>
|
||||
#endif
|
||||
|
||||
#include "../include/helper_cuda.h"
|
||||
#include "device.h"
|
||||
#include "linear_algebra.h"
|
||||
#include "utils/kernel_dimensions.cuh"
|
||||
|
||||
@@ -40,7 +39,7 @@ __host__ void host_negation(void *v_stream, uint32_t gpu_index, T *output,
|
||||
|
||||
auto stream = static_cast<cudaStream_t *>(v_stream);
|
||||
negation<<<grid, thds, 0, *stream>>>(output, input, num_entries);
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
#endif // CUDA_NEGATE_H
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
#ifndef GPU_POLYNOMIAL_FUNCTIONS
|
||||
#define GPU_POLYNOMIAL_FUNCTIONS
|
||||
#include "helper_cuda.h"
|
||||
#include "device.h"
|
||||
#include "utils/timer.cuh"
|
||||
|
||||
/*
|
||||
|
||||
@@ -3,9 +3,8 @@
|
||||
|
||||
#include "complex/operations.cuh"
|
||||
#include "crypto/torus.cuh"
|
||||
#include "device.h"
|
||||
#include "fft/bnsmfft.cuh"
|
||||
#include "fft/smfft.cuh"
|
||||
#include "helper_cuda.h"
|
||||
#include "parameters.cuh"
|
||||
#include "utils/timer.cuh"
|
||||
#include <cassert>
|
||||
|
||||
@@ -8,9 +8,7 @@
|
||||
#include "crypto/torus.cuh"
|
||||
#include "device.h"
|
||||
#include "fft/bnsmfft.cuh"
|
||||
#include "fft/smfft.cuh"
|
||||
#include "fft/twiddles.cuh"
|
||||
#include "helper_cuda.h"
|
||||
#include "polynomial/functions.cuh"
|
||||
#include "polynomial/parameters.cuh"
|
||||
#include "polynomial/polynomial.cuh"
|
||||
@@ -181,12 +179,12 @@ template <typename Torus, class params>
|
||||
__host__ void add_padding_to_lut_async(Torus *lut_out, Torus *lut_in,
|
||||
uint32_t glwe_dimension,
|
||||
uint32_t num_lut, cudaStream_t *stream) {
|
||||
checkCudaErrors(cudaMemsetAsync(lut_out, 0,
|
||||
num_lut * (glwe_dimension + 1) *
|
||||
params::degree * sizeof(Torus),
|
||||
*stream));
|
||||
check_cuda_error(cudaMemsetAsync(lut_out, 0,
|
||||
num_lut * (glwe_dimension + 1) *
|
||||
params::degree * sizeof(Torus),
|
||||
*stream));
|
||||
for (int i = 0; i < num_lut; i++)
|
||||
checkCudaErrors(cudaMemcpyAsync(
|
||||
check_cuda_error(cudaMemcpyAsync(
|
||||
lut_out + (2 * i + 1) * params::degree, lut_in + i * params::degree,
|
||||
params::degree * sizeof(Torus), cudaMemcpyDeviceToDevice, *stream));
|
||||
}
|
||||
@@ -304,10 +302,10 @@ __host__ void host_cmux_tree(void *v_stream, uint32_t gpu_index,
|
||||
d_mem = (char *)cuda_malloc_async(
|
||||
memory_needed_per_block * (1 << (r - 1)) * tau, stream, gpu_index);
|
||||
} else {
|
||||
checkCudaErrors(cudaFuncSetAttribute(
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_batch_cmux<Torus, STorus, params, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, memory_needed_per_block));
|
||||
checkCudaErrors(
|
||||
check_cuda_error(
|
||||
cudaFuncSetCacheConfig(device_batch_cmux<Torus, STorus, params, FULLSM>,
|
||||
cudaFuncCachePreferShared));
|
||||
}
|
||||
@@ -349,11 +347,11 @@ __host__ void host_cmux_tree(void *v_stream, uint32_t gpu_index,
|
||||
polynomial_size, base_log, level_count,
|
||||
layer_idx, // r
|
||||
num_lut);
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
for (int i = 0; i < tau; i++)
|
||||
checkCudaErrors(cudaMemcpyAsync(
|
||||
check_cuda_error(cudaMemcpyAsync(
|
||||
glwe_array_out + i * glwe_size, output + i * num_lut * glwe_size,
|
||||
glwe_size * sizeof(Torus), cudaMemcpyDeviceToDevice, *stream));
|
||||
|
||||
@@ -480,11 +478,11 @@ __host__ void host_blind_rotate_and_sample_extraction(
|
||||
d_mem = (char *)cuda_malloc_async(memory_needed_per_block * tau, stream,
|
||||
gpu_index);
|
||||
else {
|
||||
checkCudaErrors(cudaFuncSetAttribute(
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_blind_rotation_and_sample_extraction<Torus, STorus, params,
|
||||
FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, memory_needed_per_block));
|
||||
checkCudaErrors(cudaFuncSetCacheConfig(
|
||||
check_cuda_error(cudaFuncSetCacheConfig(
|
||||
device_blind_rotation_and_sample_extraction<Torus, STorus, params,
|
||||
FULLSM>,
|
||||
cudaFuncCachePreferShared));
|
||||
@@ -499,7 +497,7 @@ __host__ void host_blind_rotate_and_sample_extraction(
|
||||
batch_fft_ggsw_vector<Torus, STorus, params>(
|
||||
stream, d_ggsw_fft_in, ggsw_in, mbr_size, glwe_dimension, polynomial_size,
|
||||
l_gadget, gpu_index, max_shared_memory);
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
//
|
||||
dim3 thds(polynomial_size / params::opt, 1, 1);
|
||||
@@ -519,7 +517,7 @@ __host__ void host_blind_rotate_and_sample_extraction(
|
||||
glwe_dimension, // k
|
||||
polynomial_size, base_log, l_gadget, memory_needed_per_block,
|
||||
d_mem);
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
//
|
||||
cuda_drop_async(d_ggsw_fft_in, stream, gpu_index);
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
#include "bit_extraction.cuh"
|
||||
#include "bootstrap.h"
|
||||
#include "circuit_bootstrap.cuh"
|
||||
#include "helper_cuda.h"
|
||||
#include "device.h"
|
||||
#include "utils/kernel_dimensions.cuh"
|
||||
#include "utils/timer.cuh"
|
||||
#include "vertical_packing.cuh"
|
||||
@@ -77,7 +77,7 @@ __host__ void host_circuit_bootstrap_vertical_packing(
|
||||
cuda_memcpy_async_to_gpu(
|
||||
lut_vector_indexes, h_lut_vector_indexes,
|
||||
number_of_inputs * level_count_cbs * sizeof(uint32_t), stream, gpu_index);
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
uint32_t bits = sizeof(Torus) * 8;
|
||||
uint32_t delta_log = (bits - 1);
|
||||
@@ -89,7 +89,7 @@ __host__ void host_circuit_bootstrap_vertical_packing(
|
||||
polynomial_size, glwe_dimension, lwe_dimension, level_count_bsk,
|
||||
base_log_bsk, level_count_pksk, base_log_pksk, level_count_cbs,
|
||||
base_log_cbs, number_of_inputs, max_shared_memory);
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
// Free memory
|
||||
cuda_drop_async(lwe_array_in_fp_ks_buffer, stream, gpu_index);
|
||||
@@ -112,7 +112,7 @@ __host__ void host_circuit_bootstrap_vertical_packing(
|
||||
v_stream, gpu_index, glwe_array_out, ggsw_out, lut_vector, glwe_dimension,
|
||||
polynomial_size, base_log_cbs, level_count_cbs, r, tau,
|
||||
max_shared_memory);
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
// Blind rotation + sample extraction
|
||||
// mbr = tau * p - r = log2(N)
|
||||
@@ -151,7 +151,7 @@ __host__ void host_wop_pbs(
|
||||
(uint32_t *)cuda_malloc_async(sizeof(uint32_t), stream, gpu_index);
|
||||
cuda_memcpy_async_to_gpu(lut_vector_indexes, h_lut_vector_indexes,
|
||||
sizeof(uint32_t), stream, gpu_index);
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
check_cuda_error(cudaGetLastError());
|
||||
Torus *lut_pbs = (Torus *)cuda_malloc_async(
|
||||
(2 * polynomial_size) * sizeof(Torus), stream, gpu_index);
|
||||
Torus *lwe_array_in_buffer = (Torus *)cuda_malloc_async(
|
||||
@@ -176,7 +176,7 @@ __host__ void host_wop_pbs(
|
||||
number_of_bits_to_extract, delta_log, polynomial_size, lwe_dimension,
|
||||
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
|
||||
number_of_inputs, max_shared_memory);
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
check_cuda_error(cudaGetLastError());
|
||||
cuda_drop_async(lut_pbs, stream, gpu_index);
|
||||
cuda_drop_async(lut_vector_indexes, stream, gpu_index);
|
||||
cuda_drop_async(lwe_array_in_buffer, stream, gpu_index);
|
||||
@@ -192,7 +192,7 @@ __host__ void host_wop_pbs(
|
||||
number_of_inputs * number_of_bits_to_extract, number_of_inputs,
|
||||
max_shared_memory);
|
||||
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
check_cuda_error(cudaGetLastError());
|
||||
cuda_drop_async(lwe_array_out_bit_extract, stream, gpu_index);
|
||||
}
|
||||
#endif // WOP_PBS_H
|
||||
|
||||
Reference in New Issue
Block a user