chore(gpu): automatically generate rust bindings for cuda functions, except device.cu

Author: Agnes Leroy, 2024-10-09 17:05:09 +02:00
Committed by: Agnès Leroy
parent 416fb5a719
commit e698d18242
104 changed files with 3883 additions and 3111 deletions
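Context for the diffs below: bindgen drives libclang to parse the tfhe-cuda-backend headers at build time. That is why libclang-dev joins the dependency step of every CUDA workflow, why bindgen becomes a build-dependency of the crate, and why the C headers are split into bindgen-friendly C declarations and C++-only utility headers.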

View File

@@ -56,7 +56,7 @@ jobs:
- name: Install dependencies
run: |
sudo apt update
-sudo apt install -y checkinstall zlib1g-dev libssl-dev
+sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
cd cmake-${{ env.CMAKE_VERSION }}
@@ -64,6 +64,7 @@ jobs:
make -j"$(nproc)"
sudo make install
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
with:

View File

@@ -59,7 +59,7 @@ jobs:
- name: Install dependencies
run: |
sudo apt update
-sudo apt install -y checkinstall zlib1g-dev libssl-dev
+sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
cd cmake-${{ env.CMAKE_VERSION }}

View File

@@ -63,7 +63,7 @@ jobs:
- name: Install dependencies
run: |
sudo apt update
-sudo apt install -y checkinstall zlib1g-dev libssl-dev
+sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
cd cmake-${{ env.CMAKE_VERSION }}

View File

@@ -63,7 +63,7 @@ jobs:
- name: Install dependencies
run: |
sudo apt update
-sudo apt install -y checkinstall zlib1g-dev libssl-dev
+sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
cd cmake-${{ env.CMAKE_VERSION }}

View File

@@ -72,7 +72,7 @@ jobs:
- name: Install dependencies
run: |
sudo apt update
-sudo apt install -y checkinstall zlib1g-dev libssl-dev
+sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
cd cmake-${{ env.CMAKE_VERSION }}

View File

@@ -73,7 +73,7 @@ jobs:
- name: Install dependencies
run: |
sudo apt update
-sudo apt install -y checkinstall zlib1g-dev libssl-dev
+sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
cd cmake-${{ env.CMAKE_VERSION }}

View File

@@ -63,7 +63,7 @@ jobs:
- name: Install dependencies
run: |
sudo apt update
-sudo apt install -y checkinstall zlib1g-dev libssl-dev
+sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
cd cmake-${{ env.CMAKE_VERSION }}

View File

@@ -63,7 +63,7 @@ jobs:
- name: Install dependencies
run: |
sudo apt update
-sudo apt install -y checkinstall zlib1g-dev libssl-dev
+sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
cd cmake-${{ env.CMAKE_VERSION }}

View File

@@ -99,7 +99,7 @@ jobs:
- name: Install dependencies
run: |
sudo apt update
-sudo apt install -y checkinstall zlib1g-dev libssl-dev
+sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
cd cmake-${{ env.CMAKE_VERSION }}

View File

@@ -97,7 +97,7 @@ jobs:
- name: Install dependencies
run: |
sudo apt update
-sudo apt install -y checkinstall zlib1g-dev libssl-dev
+sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
cd cmake-${{ env.CMAKE_VERSION }}

View File

@@ -57,7 +57,7 @@ jobs:
- name: Install dependencies
run: |
sudo apt update
-sudo apt install -y checkinstall zlib1g-dev libssl-dev
+sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
cd cmake-${{ env.CMAKE_VERSION }}

View File

@@ -99,7 +99,7 @@ jobs:
- name: Install dependencies
run: |
sudo apt update
-sudo apt install -y checkinstall zlib1g-dev libssl-dev
+sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
cd cmake-${{ env.CMAKE_VERSION }}

View File

@@ -100,7 +100,7 @@ jobs:
- name: Install dependencies
run: |
sudo apt update
-sudo apt install -y checkinstall zlib1g-dev libssl-dev
+sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
cd cmake-${{ env.CMAKE_VERSION }}
@@ -108,6 +108,7 @@ jobs:
make -j"$(nproc)"
sudo make install
- name: Checkout tfhe-rs
uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871

View File

@@ -107,7 +107,7 @@ jobs:
- name: Install dependencies
run: |
sudo apt update
-sudo apt install -y checkinstall zlib1g-dev libssl-dev
+sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
cd cmake-${{ env.CMAKE_VERSION }}
@@ -115,6 +115,7 @@ jobs:
make -j"$(nproc)"
sudo make install
- name: Checkout tfhe-rs
uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
with:

View File

@@ -100,7 +100,7 @@ jobs:
- name: Install dependencies
run: |
sudo apt update
-sudo apt install -y checkinstall zlib1g-dev libssl-dev
+sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
cd cmake-${{ env.CMAKE_VERSION }}
@@ -108,6 +108,7 @@ jobs:
make -j"$(nproc)"
sudo make install
- name: Checkout tfhe-rs
uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871

View File

@@ -107,7 +107,7 @@ jobs:
- name: Install dependencies
run: |
sudo apt update
-sudo apt install -y checkinstall zlib1g-dev libssl-dev
+sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
cd cmake-${{ env.CMAKE_VERSION }}
@@ -115,6 +115,7 @@ jobs:
make -j"$(nproc)"
sudo make install
- name: Checkout tfhe-rs
uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871

View File

@@ -418,6 +418,14 @@ clippy_cuda_backend: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-p tfhe-cuda-backend -- --no-deps -D warnings
+.PHONY: check_rust_bindings_did_not_change # Check rust bindings are up to date for tfhe-cuda-backend
+check_rust_bindings_did_not_change:
+	cargo build -p tfhe-cuda-backend && \
+	git diff --quiet HEAD -- backends/tfhe-cuda-backend/src/bindings.rs || \
+	( echo "Generated bindings have changed! Please run 'git add backends/tfhe-cuda-backend/src/bindings.rs' \
+	and commit the changes." && exit 1 )
.PHONY: tfhe_lints # Run custom tfhe-rs lints
tfhe_lints: install_tfhe_lints
cd tfhe && RUSTFLAGS="$(RUSTFLAGS)" cargo tfhe-lints \
@@ -1257,7 +1265,7 @@ pcc: no_tfhe_typo no_dbg_log check_fmt check_typos lint_doc check_md_docs_are_te
clippy_all tfhe_lints check_compile_tests
.PHONY: pcc_gpu # pcc stands for pre commit checks for GPU compilation
-pcc_gpu: clippy_gpu clippy_cuda_backend check_compile_tests_benches_gpu
+pcc_gpu: clippy_gpu clippy_cuda_backend check_compile_tests_benches_gpu check_rust_bindings_did_not_change
.PHONY: fpcc # pcc stands for pre commit checks, the f stands for fast
fpcc: no_tfhe_typo no_dbg_log check_fmt check_typos lint_doc check_md_docs_are_tested clippy_fast \
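The new check works by rebuilding the crate (which regenerates src/bindings.rs from the headers) and then asking git diff --quiet whether the committed file changed; wiring it into pcc_gpu means a header change cannot pass the GPU pre-commit checks unless the regenerated bindings are committed alongside it.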

View File

@@ -14,3 +14,4 @@ keywords = ["fully", "homomorphic", "encryption", "fhe", "cryptography"]
[build-dependencies]
cmake = { version = "0.1" }
pkg-config = { version = "0.3" }
+bindgen = "0.70.1"

View File

@@ -1,5 +1,6 @@
-use std::env;
use std::path::PathBuf;
use std::process::Command;
+use std::{env, fs};
fn main() {
if let Ok(val) = env::var("DOCS_RS") {
@@ -26,6 +27,7 @@ fn main() {
println!("cargo::rerun-if-changed=cuda/tests_and_benchmarks");
println!("cargo::rerun-if-changed=cuda/CMakeLists.txt");
println!("cargo::rerun-if-changed=src");
if env::consts::OS == "linux" {
let output = Command::new("./get_os_name.sh").output().unwrap();
let distribution = String::from_utf8(output.stdout).unwrap();
@@ -35,6 +37,7 @@ fn main() {
Only Ubuntu is supported by tfhe-cuda-backend at this time. Build may fail\n"
);
}
let dest = cmake::build("cuda");
println!("cargo:rustc-link-search=native={}", dest.display());
println!("cargo:rustc-link-lib=static=tfhe_cuda_backend");
@@ -51,6 +54,37 @@ fn main() {
println!("cargo:rustc-link-lib=cudart");
println!("cargo:rustc-link-search=native=/usr/lib/x86_64-linux-gnu/");
println!("cargo:rustc-link-lib=stdc++");
+let header_path = "wrapper.h";
+println!("cargo:rerun-if-changed={}", header_path);
+let out_path = PathBuf::from("src").join("bindings.rs");
+// Check modification times
+let header_modified = fs::metadata(header_path).unwrap().modified().unwrap();
+let bindings_modified = if out_path.exists() {
+fs::metadata(&out_path).unwrap().modified().unwrap()
+} else {
+std::time::SystemTime::UNIX_EPOCH // If bindings file doesn't exist, consider it older
+};
+// Regenerate bindings only if header has been modified
+if header_modified > bindings_modified {
+let bindings = bindgen::Builder::default()
+.header(header_path)
+.clang_arg("-x")
+.clang_arg("c++")
+.clang_arg("-std=c++17")
+.clang_arg("-I/usr/include")
+.clang_arg("-I/usr/local/include")
+.ctypes_prefix("ffi")
+.raw_line("use crate::ffi;")
+.generate()
+.expect("Unable to generate bindings");
+bindings
+.write_to_file(&out_path)
+.expect("Couldn't write bindings!");
+}
} else {
panic!(
"Error: platform not supported, tfhe-cuda-backend not built (only Linux is supported)"

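Gating regeneration on modification times means a routine cargo build leaves src/bindings.rs untouched, and the working tree clean, unless wrapper.h actually changed. For reference, here is a sketch of the kind of item this bindgen invocation emits into src/bindings.rs, using the cuda_glwe_sample_extract_64 declaration from the ciphertext header below as an example; this is illustrative, not the verbatim generated file:

// Sketch of generated output. With ctypes_prefix("ffi") and the
// raw_line above, bindgen prefixes C types with `ffi` and preserves
// pointer constness from the headers.
use crate::ffi;

extern "C" {
    pub fn cuda_glwe_sample_extract_64(
        stream: *mut ffi::c_void,
        gpu_index: u32,
        lwe_array_out: *mut ffi::c_void,
        glwe_array_in: *const ffi::c_void, // void const *     -> *const
        nth_array: *const u32,             // uint32_t const * -> *const u32
        num_nths: u32,
        glwe_dimension: u32,
        polynomial_size: u32,
    );
}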
View File

@@ -1,25 +1,24 @@
#ifndef CUDA_CIPHERTEXT_H
#define CUDA_CIPHERTEXT_H
-#include "device.h"
-#include <cstdint>
+#include "stdint.h"
extern "C" {
void cuda_convert_lwe_ciphertext_vector_to_gpu_64(void *stream,
uint32_t gpu_index,
-void *dest, void *src,
+void *dest, void const *src,
uint32_t number_of_cts,
uint32_t lwe_dimension);
void cuda_convert_lwe_ciphertext_vector_to_cpu_64(void *stream,
uint32_t gpu_index,
-void *dest, void *src,
+void *dest, void const *src,
uint32_t number_of_cts,
uint32_t lwe_dimension);
void cuda_glwe_sample_extract_64(void *stream, uint32_t gpu_index,
-void *lwe_array_out, void *glwe_array_in,
-uint32_t *nth_array, uint32_t num_nths,
+void *lwe_array_out, void const *glwe_array_in,
+uint32_t const *nth_array, uint32_t num_nths,
uint32_t glwe_dimension,
uint32_t polynomial_size);
-};
+}
#endif
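The const qualifiers added here carry straight through to the generated Rust: bindgen emits *const ffi::c_void for void const * where a plain void * becomes *mut ffi::c_void, so read-only arguments such as src, glwe_array_in and nth_array no longer force callers to hand over mutable pointers.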

View File

@@ -42,7 +42,7 @@ void cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index);
void cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
cudaStream_t stream, uint32_t gpu_index);
-void cuda_memcpy_async_gpu_to_gpu(void *dest, void *src, uint64_t size,
+void cuda_memcpy_async_gpu_to_gpu(void *dest, void const *src, uint64_t size,
cudaStream_t stream, uint32_t gpu_index);
void cuda_memcpy_gpu_to_gpu(void *dest, void *src, uint64_t size,
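Per the commit title, the functions of device.cu are the one part of the API left out of the automatic binding generation, but the device.h signatures receive the same const corrections so the two surfaces stay consistent.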

View File

@@ -0,0 +1,45 @@
#ifndef CUDA_INTEGER_COMPRESSION_H
#define CUDA_INTEGER_COMPRESSION_H
#include "../../pbs/pbs_enums.h"
extern "C" {
void scratch_cuda_integer_compress_radix_ciphertext_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t compression_glwe_dimension,
uint32_t compression_polynomial_size, uint32_t lwe_dimension,
uint32_t ks_level, uint32_t ks_base_log, uint32_t num_radix_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
uint32_t lwe_per_glwe, uint32_t storage_log_modulus,
bool allocate_gpu_memory);
void scratch_cuda_integer_decompress_radix_ciphertext_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t encryption_glwe_dimension,
uint32_t encryption_polynomial_size, uint32_t compression_glwe_dimension,
uint32_t compression_polynomial_size, uint32_t lwe_dimension,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t num_radix_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
uint32_t storage_log_modulus, uint32_t body_count,
bool allocate_gpu_memory);
void cuda_integer_compress_radix_ciphertext_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *glwe_array_out, void const *lwe_array_in, void *const *fp_ksk,
uint32_t num_nths, int8_t *mem_ptr);
void cuda_integer_decompress_radix_ciphertext_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void const *glwe_in, uint32_t const *indexes_array,
uint32_t indexes_array_size, void *const *bsks, int8_t *mem_ptr);
void cleanup_cuda_integer_compress_radix_ciphertext_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void);
void cleanup_cuda_integer_decompress_radix_ciphertext_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void);
}
#endif
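This header split is what makes automatic generation practical: the extern "C" declarations now live in plain C-compatible headers like this one, which bindgen can parse, while the C++ templates and structs move to companion *_utilities.h headers (next file) that stay out of the generated binding surface.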

View File

@@ -1,46 +1,7 @@
-#ifndef CUDA_INTEGER_COMPRESSION_H
-#define CUDA_INTEGER_COMPRESSION_H
+#ifndef CUDA_INTEGER_COMPRESSION_UTILITIES_H
+#define CUDA_INTEGER_COMPRESSION_UTILITIES_H
-#include "integer.h"
-extern "C" {
-void scratch_cuda_integer_compress_radix_ciphertext_64(
-void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
-uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
-uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-PBS_TYPE pbs_type, uint32_t lwe_per_glwe, uint32_t storage_log_modulus,
-bool allocate_gpu_memory);
-void scratch_cuda_integer_decompress_radix_ciphertext_64(
-void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
-uint32_t encryption_glwe_dimension, uint32_t encryption_polynomial_size,
-uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
-uint32_t lwe_dimension, uint32_t pbs_level, uint32_t pbs_base_log,
-uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-PBS_TYPE pbs_type, uint32_t storage_log_modulus, uint32_t body_count,
-bool allocate_gpu_memory);
-void cuda_integer_compress_radix_ciphertext_64(
-void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
-void *glwe_array_out, void *lwe_array_in, void **fp_ksk, uint32_t num_nths,
-int8_t *mem_ptr);
-void cuda_integer_decompress_radix_ciphertext_64(
-void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
-void *lwe_array_out, void *glwe_in, uint32_t *indexes_array,
-uint32_t indexes_array_size, void **bsks, int8_t *mem_ptr);
-void cleanup_cuda_integer_compress_radix_ciphertext_64(void **streams,
-uint32_t *gpu_indexes,
-uint32_t gpu_count,
-int8_t **mem_ptr_void);
-void cleanup_cuda_integer_decompress_radix_ciphertext_64(void **streams,
-uint32_t *gpu_indexes,
-uint32_t gpu_count,
-int8_t **mem_ptr_void);
-}
+#include "../integer_utilities.h"
template <typename Torus> struct int_compression {
int_radix_params compression_params;
@@ -54,7 +15,7 @@ template <typename Torus> struct int_compression {
Torus *tmp_lwe;
Torus *tmp_glwe_array_out;
-int_compression(cudaStream_t *streams, uint32_t *gpu_indexes,
+int_compression(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_radix_params compression_params,
uint32_t num_radix_blocks, uint32_t lwe_per_glwe,
uint32_t storage_log_modulus, bool allocate_gpu_memory) {
@@ -81,7 +42,7 @@ template <typename Torus> struct int_compression {
num_radix_blocks, true);
}
}
-void release(cudaStream_t *streams, uint32_t *gpu_indexes,
+void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
cuda_drop_async(tmp_lwe, streams[0], gpu_indexes[0]);
cuda_drop_async(tmp_glwe_array_out, streams[0], gpu_indexes[0]);
@@ -105,7 +66,7 @@ template <typename Torus> struct int_decompression {
int_radix_lut<Torus> *carry_extract_lut;
-int_decompression(cudaStream_t *streams, uint32_t *gpu_indexes,
+int_decompression(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_radix_params encryption_params,
int_radix_params compression_params,
uint32_t num_radix_blocks, uint32_t body_count,
@@ -150,7 +111,7 @@ template <typename Torus> struct int_decompression {
carry_extract_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
}
}
-void release(cudaStream_t *streams, uint32_t *gpu_indexes,
+void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
cuda_drop_async(tmp_extracted_glwe, streams[0], gpu_indexes[0]);
cuda_drop_async(tmp_extracted_lwe, streams[0], gpu_indexes[0]);

View File

@@ -0,0 +1,421 @@
#ifndef CUDA_INTEGER_H
#define CUDA_INTEGER_H
#include "../pbs/pbs_enums.h"
#include <stdint.h>
enum OUTPUT_CARRY { NONE = 0, GENERATED = 1, PROPAGATED = 2 };
enum SHIFT_OR_ROTATE_TYPE {
LEFT_SHIFT = 0,
RIGHT_SHIFT = 1,
LEFT_ROTATE = 2,
RIGHT_ROTATE = 3
};
enum BITOP_TYPE {
BITAND = 0,
BITOR = 1,
BITXOR = 2,
SCALAR_BITAND = 3,
SCALAR_BITOR = 4,
SCALAR_BITXOR = 5,
};
enum COMPARISON_TYPE {
EQ = 0,
NE = 1,
GT = 2,
GE = 3,
LT = 4,
LE = 5,
MAX = 6,
MIN = 7,
};
enum CMP_ORDERING { IS_INFERIOR = 0, IS_EQUAL = 1, IS_SUPERIOR = 2 };
enum SIGNED_OPERATION { ADDITION = 1, SUBTRACTION = -1 };
extern "C" {
void scratch_cuda_apply_univariate_lut_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory);
void cuda_apply_univariate_lut_kb_64(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count, void *output_radix_lwe,
void const *input_radix_lwe,
int8_t *mem_ptr, void *const *ksks,
void *const *bsks, uint32_t num_blocks);
void cleanup_cuda_apply_univariate_lut_kb_64(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void);
void scratch_cuda_apply_bivariate_lut_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory);
void cuda_apply_bivariate_lut_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *output_radix_lwe, void const *input_radix_lwe_1,
void const *input_radix_lwe_2, int8_t *mem_ptr, void *const *ksks,
void *const *bsks, uint32_t num_blocks, uint32_t shift);
void cleanup_cuda_apply_bivariate_lut_kb_64(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void);
void cuda_apply_many_univariate_lut_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *output_radix_lwe, void const *input_radix_lwe, int8_t *mem_ptr,
void *const *ksks, void *const *bsks, uint32_t num_blocks,
uint32_t num_luts, uint32_t lut_stride);
void scratch_cuda_full_propagation_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory);
void cuda_full_propagation_64_inplace(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count, void *input_blocks,
int8_t *mem_ptr, void *const *ksks,
void *const *bsks, uint32_t num_blocks);
void cleanup_cuda_full_propagation(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count, int8_t **mem_ptr_void);
void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t message_modulus, uint32_t carry_modulus,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t pbs_base_log, uint32_t pbs_level, uint32_t ks_base_log,
uint32_t ks_level, uint32_t grouping_factor, uint32_t num_blocks,
PBS_TYPE pbs_type, bool allocate_gpu_memory);
void cuda_integer_mult_radix_ciphertext_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *radix_lwe_out, void const *radix_lwe_left,
void const *radix_lwe_right, void *const *bsks, void *const *ksks,
int8_t *mem_ptr, uint32_t polynomial_size, uint32_t num_blocks);
void cleanup_cuda_integer_mult(void *const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void);
void cuda_negate_integer_radix_ciphertext_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void const *lwe_array_in, uint32_t lwe_dimension,
uint32_t lwe_ciphertext_count, uint32_t message_modulus,
uint32_t carry_modulus);
void cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array, void const *scalar_input, uint32_t lwe_dimension,
uint32_t lwe_ciphertext_count, uint32_t message_modulus,
uint32_t carry_modulus);
void scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
bool allocate_gpu_memory);
void cuda_integer_radix_logical_scalar_shift_kb_64_inplace(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array, uint32_t shift, int8_t *mem_ptr, void *const *bsks,
void *const *ksks, uint32_t num_blocks);
void scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
bool allocate_gpu_memory);
void cuda_integer_radix_arithmetic_scalar_shift_kb_64_inplace(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array, uint32_t shift, int8_t *mem_ptr, void *const *bsks,
void *const *ksks, uint32_t num_blocks);
void cleanup_cuda_integer_radix_logical_scalar_shift(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void);
void cleanup_cuda_integer_radix_arithmetic_scalar_shift(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void);
void scratch_cuda_integer_radix_shift_and_rotate_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
bool is_signed, bool allocate_gpu_memory);
void cuda_integer_radix_shift_and_rotate_kb_64_inplace(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array, void const *lwe_shift, int8_t *mem_ptr, void *const *bsks,
void *const *ksks, uint32_t num_blocks);
void cleanup_cuda_integer_radix_shift_and_rotate(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void);
void scratch_cuda_integer_radix_comparison_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t lwe_ciphertext_count,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
COMPARISON_TYPE op_type, bool is_signed, bool allocate_gpu_memory);
void cuda_comparison_integer_radix_ciphertext_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void const *lwe_array_1, void const *lwe_array_2,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
uint32_t lwe_ciphertext_count);
void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void const *lwe_array_in, void const *scalar_blocks,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
uint32_t lwe_ciphertext_count, uint32_t num_scalar_blocks);
void cleanup_cuda_integer_comparison(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count, int8_t **mem_ptr_void);
void scratch_cuda_integer_radix_bitop_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t lwe_ciphertext_count,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
BITOP_TYPE op_type, bool allocate_gpu_memory);
void cuda_bitop_integer_radix_ciphertext_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void const *lwe_array_1, void const *lwe_array_2,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
uint32_t lwe_ciphertext_count);
void cuda_scalar_bitop_integer_radix_ciphertext_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void const *lwe_array_input, void const *clear_blocks,
uint32_t num_clear_blocks, int8_t *mem_ptr, void *const *bsks,
void *const *ksks, uint32_t lwe_ciphertext_count, BITOP_TYPE op);
void cleanup_cuda_integer_bitop(void *const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void);
void scratch_cuda_integer_radix_cmux_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t lwe_ciphertext_count,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory);
void cuda_cmux_integer_radix_ciphertext_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void const *lwe_condition, void const *lwe_array_true,
void const *lwe_array_false, int8_t *mem_ptr, void *const *bsks,
void *const *ksks, uint32_t lwe_ciphertext_count);
void cleanup_cuda_integer_radix_cmux(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count, int8_t **mem_ptr_void);
void scratch_cuda_integer_radix_scalar_rotate_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
bool allocate_gpu_memory);
void cuda_integer_radix_scalar_rotate_kb_64_inplace(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array, uint32_t n, int8_t *mem_ptr, void *const *bsks,
void *const *ksks, uint32_t num_blocks);
void cleanup_cuda_integer_radix_scalar_rotate(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void);
void scratch_cuda_propagate_single_carry_kb_64_inplace(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory);
void cuda_propagate_single_carry_kb_64_inplace(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array, void *carry_out, int8_t *mem_ptr, void *const *bsks,
void *const *ksks, uint32_t num_blocks);
void cuda_propagate_single_carry_get_input_carries_kb_64_inplace(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array, void *carry_out, void *input_carries, int8_t *mem_ptr,
void *const *bsks, void *const *ksks, uint32_t num_blocks);
void cleanup_cuda_propagate_single_carry(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void);
void scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory);
void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *radix_lwe_out, void *radix_lwe_vec, uint32_t num_radix_in_vec,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
uint32_t num_blocks_in_radix);
void cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void);
void scratch_cuda_integer_radix_overflowing_sub_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory);
void cuda_integer_radix_overflowing_sub_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *radix_lwe_out, void *radix_lwe_overflowed, void const *radix_lwe_left,
void const *radix_lwe_right, int8_t *mem_ptr, void *const *bsks,
void *const *ksks, uint32_t num_blocks_in_radix);
void cleanup_cuda_integer_radix_overflowing_sub(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void);
void scratch_cuda_integer_scalar_mul_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, bool allocate_gpu_memory);
void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array, uint64_t const *decomposed_scalar,
uint64_t const *has_at_least_one_set, int8_t *mem_ptr, void *const *bsks,
void *const *ksks, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t message_modulus, uint32_t num_blocks, uint32_t num_scalars);
void cleanup_cuda_integer_radix_scalar_mul(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void);
void scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory);
void cuda_integer_div_rem_radix_ciphertext_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *quotient, void *remainder, void const *numerator, void const *divisor,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
uint32_t num_blocks_in_radix);
void cleanup_cuda_integer_div_rem(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count, int8_t **mem_ptr_void);
void scratch_cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, int8_t signed_operation,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory);
void cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lhs, void const *rhs, void *overflowed, int8_t signed_operation,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
uint32_t num_blocks_in_radix);
void cleanup_signed_overflowing_add_or_sub(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void);
void scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_radix_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory);
void cuda_integer_compute_prefix_sum_hillis_steele_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *output_radix_lwe, void *generates_or_propagates, int8_t *mem_ptr,
void *const *ksks, void *const *bsks, uint32_t num_blocks, uint32_t shift);
void cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void);
void cuda_integer_reverse_blocks_64_inplace(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count, void *lwe_array,
uint32_t num_blocks,
uint32_t lwe_size);
} // extern C
#endif // CUDA_INTEGER_H
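As a minimal sketch of how the void *const * stream arrays in these declarations surface on the Rust side (the extern block below mirrors what bindgen would generate, using std::ffi::c_void for self-containment; real callers get the declaration from the generated bindings and valid streams from the CUDA runtime):

use std::ffi::c_void;

extern "C" {
    // `void *const *streams` maps to a const array of mutable stream handles.
    fn cuda_integer_reverse_blocks_64_inplace(
        streams: *const *mut c_void,
        gpu_indexes: *const u32,
        gpu_count: u32,
        lwe_array: *mut c_void,
        num_blocks: u32,
        lwe_size: u32,
    );
}

/// Safety: `streams` and `gpu_indexes` must hold valid CUDA streams/indexes,
/// and `lwe_array` must be a live device allocation of `num_blocks` blocks.
unsafe fn reverse_blocks_inplace(
    streams: &[*mut c_void],
    gpu_indexes: &[u32],
    lwe_array: *mut c_void,
    num_blocks: u32,
    lwe_size: u32,
) {
    cuda_integer_reverse_blocks_64_inplace(
        streams.as_ptr(),
        gpu_indexes.as_ptr(),
        streams.len() as u32,
        lwe_array,
        num_blocks,
        lwe_size,
    );
}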

View File

@@ -1,411 +1,13 @@
-#ifndef CUDA_INTEGER_H
-#define CUDA_INTEGER_H
+#ifndef CUDA_INTEGER_UTILITIES_H
+#define CUDA_INTEGER_UTILITIES_H
+#include "integer.h"
#include "keyswitch.h"
#include "pbs/programmable_bootstrap.cuh"
#include "programmable_bootstrap.h"
#include "programmable_bootstrap_multibit.h"
#include <cassert>
#include <cmath>
#include <functional>
enum OUTPUT_CARRY { NONE = 0, GENERATED = 1, PROPAGATED = 2 };
enum SHIFT_OR_ROTATE_TYPE {
LEFT_SHIFT = 0,
RIGHT_SHIFT = 1,
LEFT_ROTATE = 2,
RIGHT_ROTATE = 3
};
enum BITOP_TYPE {
BITAND = 0,
BITOR = 1,
BITXOR = 2,
SCALAR_BITAND = 3,
SCALAR_BITOR = 4,
SCALAR_BITXOR = 5,
};
enum COMPARISON_TYPE {
EQ = 0,
NE = 1,
GT = 2,
GE = 3,
LT = 4,
LE = 5,
MAX = 6,
MIN = 7,
};
enum CMP_ORDERING { IS_INFERIOR = 0, IS_EQUAL = 1, IS_SUPERIOR = 2 };
enum SIGNED_OPERATION { ADDITION = 1, SUBTRACTION = -1 };
extern "C" {
void scratch_cuda_apply_univariate_lut_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
void *input_lut, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory);
void cuda_apply_univariate_lut_kb_64(void **streams, uint32_t *gpu_indexes,
uint32_t gpu_count, void *output_radix_lwe,
void *input_radix_lwe, int8_t *mem_ptr,
void **ksks, void **bsks,
uint32_t num_blocks);
void cleanup_cuda_apply_univariate_lut_kb_64(void **streams,
uint32_t *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void);
void scratch_cuda_apply_bivariate_lut_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
void *input_lut, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory);
void cuda_apply_bivariate_lut_kb_64(void **streams, uint32_t *gpu_indexes,
uint32_t gpu_count, void *output_radix_lwe,
void *input_radix_lwe_1,
void *input_radix_lwe_2, int8_t *mem_ptr,
void **ksks, void **bsks,
uint32_t num_blocks, uint32_t shift);
void cleanup_cuda_apply_bivariate_lut_kb_64(void **streams,
uint32_t *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void);
void cuda_apply_many_univariate_lut_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *output_radix_lwe, void *input_radix_lwe, int8_t *mem_ptr, void **ksks,
void **bsks, uint32_t num_blocks, uint32_t num_luts, uint32_t lut_stride);
void scratch_cuda_full_propagation_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory);
void cuda_full_propagation_64_inplace(void **streams, uint32_t *gpu_indexes,
uint32_t gpu_count, void *input_blocks,
int8_t *mem_ptr, void **ksks, void **bsks,
uint32_t num_blocks);
void cleanup_cuda_full_propagation(void **streams, uint32_t *gpu_indexes,
uint32_t gpu_count, int8_t **mem_ptr_void);
void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t message_modulus, uint32_t carry_modulus, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t pbs_base_log,
uint32_t pbs_level, uint32_t ks_base_log, uint32_t ks_level,
uint32_t grouping_factor, uint32_t num_blocks, PBS_TYPE pbs_type,
bool allocate_gpu_memory);
void cuda_integer_mult_radix_ciphertext_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *radix_lwe_out, void *radix_lwe_left, void *radix_lwe_right,
void **bsks, void **ksks, int8_t *mem_ptr, uint32_t polynomial_size,
uint32_t num_blocks);
void cleanup_cuda_integer_mult(void **streams, uint32_t *gpu_indexes,
uint32_t gpu_count, int8_t **mem_ptr_void);
void cuda_negate_integer_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void *lwe_array_in, uint32_t lwe_dimension,
uint32_t lwe_ciphertext_count, uint32_t message_modulus,
uint32_t carry_modulus);
void cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
void *scalar_input, uint32_t lwe_dimension, uint32_t lwe_ciphertext_count,
uint32_t message_modulus, uint32_t carry_modulus);
void scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
bool allocate_gpu_memory);
void cuda_integer_radix_logical_scalar_shift_kb_64_inplace(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
uint32_t shift, int8_t *mem_ptr, void **bsks, void **ksks,
uint32_t num_blocks);
void scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
bool allocate_gpu_memory);
void cuda_integer_radix_arithmetic_scalar_shift_kb_64_inplace(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
uint32_t shift, int8_t *mem_ptr, void **bsks, void **ksks,
uint32_t num_blocks);
void cleanup_cuda_integer_radix_logical_scalar_shift(void **streams,
uint32_t *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void);
void cleanup_cuda_integer_radix_arithmetic_scalar_shift(void **streams,
uint32_t *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void);
void scratch_cuda_integer_radix_shift_and_rotate_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
bool is_signed, bool allocate_gpu_memory);
void cuda_integer_radix_shift_and_rotate_kb_64_inplace(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
void *lwe_shift, int8_t *mem_ptr, void **bsks, void **ksks,
uint32_t num_blocks);
void cleanup_cuda_integer_radix_shift_and_rotate(void **streams,
uint32_t *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void);
void scratch_cuda_integer_radix_comparison_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t lwe_ciphertext_count,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
COMPARISON_TYPE op_type, bool is_signed, bool allocate_gpu_memory);
void cuda_comparison_integer_radix_ciphertext_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void *lwe_array_1, void *lwe_array_2, int8_t *mem_ptr,
void **bsks, void **ksks, uint32_t lwe_ciphertext_count);
void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void *lwe_array_in, void *scalar_blocks,
int8_t *mem_ptr, void **bsks, void **ksks, uint32_t lwe_ciphertext_count,
uint32_t num_scalar_blocks);
void cleanup_cuda_integer_comparison(void **streams, uint32_t *gpu_indexes,
uint32_t gpu_count, int8_t **mem_ptr_void);
void scratch_cuda_integer_radix_bitop_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t lwe_ciphertext_count,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
BITOP_TYPE op_type, bool allocate_gpu_memory);
void cuda_bitop_integer_radix_ciphertext_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void *lwe_array_1, void *lwe_array_2, int8_t *mem_ptr,
void **bsks, void **ksks, uint32_t lwe_ciphertext_count);
void cuda_scalar_bitop_integer_radix_ciphertext_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void *lwe_array_input, void *clear_blocks,
uint32_t num_clear_blocks, int8_t *mem_ptr, void **bsks, void **ksks,
uint32_t lwe_ciphertext_count, BITOP_TYPE op);
void cleanup_cuda_integer_bitop(void **streams, uint32_t *gpu_indexes,
uint32_t gpu_count, int8_t **mem_ptr_void);
void scratch_cuda_integer_radix_cmux_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t lwe_ciphertext_count,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory);
void cuda_cmux_integer_radix_ciphertext_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void *lwe_condition, void *lwe_array_true,
void *lwe_array_false, int8_t *mem_ptr, void **bsks, void **ksks,
uint32_t lwe_ciphertext_count);
void cleanup_cuda_integer_radix_cmux(void **streams, uint32_t *gpu_indexes,
uint32_t gpu_count, int8_t **mem_ptr_void);
void scratch_cuda_integer_radix_scalar_rotate_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
bool allocate_gpu_memory);
void cuda_integer_radix_scalar_rotate_kb_64_inplace(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
uint32_t n, int8_t *mem_ptr, void **bsks, void **ksks, uint32_t num_blocks);
void cleanup_cuda_integer_radix_scalar_rotate(void **streams,
uint32_t *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void);
void scratch_cuda_propagate_single_carry_kb_64_inplace(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory);
void cuda_propagate_single_carry_kb_64_inplace(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
void *carry_out, int8_t *mem_ptr, void **bsks, void **ksks,
uint32_t num_blocks);
void cuda_propagate_single_carry_get_input_carries_kb_64_inplace(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
void *carry_out, void *input_carries, int8_t *mem_ptr, void **bsks,
void **ksks, uint32_t num_blocks);
void cleanup_cuda_propagate_single_carry(void **streams, uint32_t *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void);
void scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t lwe_dimension,
uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory);
void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *radix_lwe_out, void *radix_lwe_vec, uint32_t num_radix_in_vec,
int8_t *mem_ptr, void **bsks, void **ksks, uint32_t num_blocks_in_radix);
void cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void);
void scratch_cuda_integer_radix_overflowing_sub_kb_64(
void **stream, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory);
void cuda_integer_radix_overflowing_sub_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *radix_lwe_out, void *radix_lwe_overflowed, void *radix_lwe_left,
void *radix_lwe_right, int8_t *mem_ptr, void **bsks, void **ksks,
uint32_t num_blocks_in_radix);
void cleanup_cuda_integer_radix_overflowing_sub(void **streams,
uint32_t *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void);
void scratch_cuda_integer_scalar_mul_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t lwe_dimension,
uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory);
void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
uint64_t *decomposed_scalar, uint64_t *has_at_least_one_set,
int8_t *mem_ptr, void **bsks, void **ksks, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t message_modulus, uint32_t num_blocks,
uint32_t num_scalars);
void cleanup_cuda_integer_radix_scalar_mul(void **streams,
uint32_t *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void);
void scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory);
void cuda_integer_div_rem_radix_ciphertext_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *quotient,
void *remainder, void *numerator, void *divisor, int8_t *mem_ptr,
void **bsks, void **ksks, uint32_t num_blocks_in_radix);
void cleanup_cuda_integer_div_rem(void **streams, uint32_t *gpu_indexes,
uint32_t gpu_count, int8_t **mem_ptr_void);
void scratch_cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, int8_t signed_operation,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory);
void cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lhs,
void *rhs, void *overflowed, int8_t signed_operation, int8_t *mem_ptr,
void **bsks, void **ksks, uint32_t num_blocks_in_radix);
void cleanup_signed_overflowing_add_or_sub(void **streams,
uint32_t *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void);
void scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
void *input_lut, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, bool allocate_gpu_memory);
void cuda_integer_compute_prefix_sum_hillis_steele_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *output_radix_lwe, void *generates_or_propagates, int8_t *mem_ptr,
void **ksks, void **bsks, uint32_t num_blocks, uint32_t shift);
void cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void);
void cuda_integer_reverse_blocks_64_inplace(void **streams,
uint32_t *gpu_indexes,
uint32_t gpu_count, void *lwe_array,
uint32_t num_blocks,
uint32_t lwe_size);
} // extern C
template <typename Torus>
__global__ void radix_blocks_rotate_right(Torus *dst, Torus *src,
uint32_t value, uint32_t blocks_count,
@@ -532,7 +134,7 @@ template <typename Torus> struct int_radix_lut {
std::vector<Torus *> lwe_after_pbs_vec;
std::vector<Torus *> lwe_trivial_indexes_vec;
-int_radix_lut(cudaStream_t *streams, uint32_t *gpu_indexes,
+int_radix_lut(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_radix_params params, uint32_t num_luts,
uint32_t num_radix_blocks, bool allocate_gpu_memory) {
@@ -638,7 +240,7 @@ template <typename Torus> struct int_radix_lut {
}
// constructor to reuse memory
-int_radix_lut(cudaStream_t *streams, uint32_t *gpu_indexes,
+int_radix_lut(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_radix_params params, uint32_t num_luts,
uint32_t num_radix_blocks, int_radix_lut *base_lut_object) {
@@ -746,7 +348,7 @@ template <typename Torus> struct int_radix_lut {
}
// Broadcast luts from gpu src_gpu_idx to all active gpus
-void broadcast_lut(cudaStream_t *streams, uint32_t *gpu_indexes,
+void broadcast_lut(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t src_gpu_idx) {
Torus lut_size = (params.glwe_dimension + 1) * params.polynomial_size;
@@ -769,7 +371,7 @@ template <typename Torus> struct int_radix_lut {
}
}
-void release(cudaStream_t *streams, uint32_t *gpu_indexes,
+void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
for (uint i = 0; i < active_gpu_count; i++) {
cuda_drop_async(lut_vec[i], streams[i], gpu_indexes[i]);
@@ -824,10 +426,10 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
int_radix_lut<Torus> *lut;
// With offset
-int_bit_extract_luts_buffer(cudaStream_t *streams, uint32_t *gpu_indexes,
-uint32_t gpu_count, int_radix_params params,
-uint32_t bits_per_block, uint32_t final_offset,
-uint32_t num_radix_blocks,
+int_bit_extract_luts_buffer(cudaStream_t const *streams,
+uint32_t const *gpu_indexes, uint32_t gpu_count,
+int_radix_params params, uint32_t bits_per_block,
+uint32_t final_offset, uint32_t num_radix_blocks,
bool allocate_gpu_memory) {
this->params = params;
@@ -898,16 +500,16 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
}
// Without offset
int_bit_extract_luts_buffer(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, int_radix_params params,
uint32_t bits_per_block,
int_bit_extract_luts_buffer(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
int_radix_params params, uint32_t bits_per_block,
uint32_t num_radix_blocks,
bool allocate_gpu_memory)
: int_bit_extract_luts_buffer(streams, gpu_indexes, gpu_count, params,
bits_per_block, 0, num_radix_blocks,
allocate_gpu_memory) {}
void release(cudaStream_t *streams, uint32_t *gpu_indexes,
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
lut->release(streams, gpu_indexes, gpu_count);
delete (lut);
@@ -933,8 +535,8 @@ template <typename Torus> struct int_shift_and_rotate_buffer {
Torus offset;
int_shift_and_rotate_buffer(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count,
int_shift_and_rotate_buffer(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
SHIFT_OR_ROTATE_TYPE shift_type, bool is_signed,
int_radix_params params,
uint32_t num_radix_blocks,
@@ -1056,7 +658,7 @@ template <typename Torus> struct int_shift_and_rotate_buffer {
}
}
void release(cudaStream_t *streams, uint32_t *gpu_indexes,
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
cuda_drop_async(tmp_bits, streams[0], gpu_indexes[0]);
cuda_drop_async(tmp_shift_bits, streams[0], gpu_indexes[0]);
@@ -1085,7 +687,7 @@ template <typename Torus> struct int_fullprop_buffer {
Torus *tmp_small_lwe_vector;
Torus *tmp_big_lwe_vector;
int_fullprop_buffer(cudaStream_t *streams, uint32_t *gpu_indexes,
int_fullprop_buffer(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_radix_params params,
bool allocate_gpu_memory) {
this->params = params;
@@ -1142,7 +744,7 @@ template <typename Torus> struct int_fullprop_buffer {
}
}
void release(cudaStream_t *streams, uint32_t *gpu_indexes,
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
lut->release(streams, gpu_indexes, 1);
@@ -1165,7 +767,7 @@ template <typename Torus> struct int_sc_prop_memory {
int_radix_params params;
int_sc_prop_memory(cudaStream_t *streams, uint32_t *gpu_indexes,
int_sc_prop_memory(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_radix_params params,
uint32_t num_radix_blocks, bool allocate_gpu_memory) {
this->params = params;
@@ -1258,7 +860,7 @@ template <typename Torus> struct int_sc_prop_memory {
message_acc->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
}
void release(cudaStream_t *streams, uint32_t *gpu_indexes,
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
cuda_drop_async(generates_or_propagates, streams[0], gpu_indexes[0]);
cuda_drop_async(step_output, streams[0], gpu_indexes[0]);
@@ -1285,9 +887,9 @@ template <typename Torus> struct int_overflowing_sub_memory {
int_radix_params params;
int_overflowing_sub_memory(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, int_radix_params params,
uint32_t num_radix_blocks,
int_overflowing_sub_memory(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
int_radix_params params, uint32_t num_radix_blocks,
bool allocate_gpu_memory) {
this->params = params;
auto glwe_dimension = params.glwe_dimension;
@@ -1379,7 +981,7 @@ template <typename Torus> struct int_overflowing_sub_memory {
message_acc->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
}
void release(cudaStream_t *streams, uint32_t *gpu_indexes,
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
cuda_drop_async(generates_or_propagates, streams[0], gpu_indexes[0]);
cuda_drop_async(step_output, streams[0], gpu_indexes[0]);
@@ -1407,7 +1009,8 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
bool mem_reuse = false;
int_sum_ciphertexts_vec_memory(cudaStream_t *streams, uint32_t *gpu_indexes,
int_sum_ciphertexts_vec_memory(cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count, int_radix_params params,
uint32_t num_blocks_in_radix,
uint32_t max_num_radix_in_vec,
@@ -1460,7 +1063,8 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
streams[0], gpu_indexes[0]);
}
int_sum_ciphertexts_vec_memory(cudaStream_t *streams, uint32_t *gpu_indexes,
int_sum_ciphertexts_vec_memory(cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count, int_radix_params params,
uint32_t num_blocks_in_radix,
uint32_t max_num_radix_in_vec,
@@ -1496,7 +1100,7 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
streams[0], gpu_indexes[0]);
}
void release(cudaStream_t *streams, uint32_t *gpu_indexes,
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
cuda_drop_async(d_smart_copy_in, streams[0], gpu_indexes[0]);
cuda_drop_async(d_smart_copy_out, streams[0], gpu_indexes[0]);
@@ -1523,7 +1127,7 @@ template <typename Torus> struct int_mul_memory {
int_radix_params params;
int_mul_memory(cudaStream_t *streams, uint32_t *gpu_indexes,
int_mul_memory(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_radix_params params,
uint32_t num_radix_blocks, bool allocate_gpu_memory) {
this->params = params;
@@ -1597,7 +1201,7 @@ template <typename Torus> struct int_mul_memory {
small_lwe_vector);
}
void release(cudaStream_t *streams, uint32_t *gpu_indexes,
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
cuda_drop_async(vector_result_sb, streams[0], gpu_indexes[0]);
cuda_drop_async(block_mul_res, streams[0], gpu_indexes[0]);
@@ -1621,7 +1225,8 @@ template <typename Torus> struct int_logical_scalar_shift_buffer {
bool reuse_memory = false;
int_logical_scalar_shift_buffer(cudaStream_t *streams, uint32_t *gpu_indexes,
int_logical_scalar_shift_buffer(cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
SHIFT_OR_ROTATE_TYPE shift_type,
int_radix_params params,
@@ -1712,13 +1317,11 @@ template <typename Torus> struct int_logical_scalar_shift_buffer {
}
}
int_logical_scalar_shift_buffer(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count,
SHIFT_OR_ROTATE_TYPE shift_type,
int_radix_params params,
uint32_t num_radix_blocks,
bool allocate_gpu_memory,
Torus *pre_allocated_buffer) {
int_logical_scalar_shift_buffer(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, SHIFT_OR_ROTATE_TYPE shift_type,
int_radix_params params, uint32_t num_radix_blocks,
bool allocate_gpu_memory, Torus *pre_allocated_buffer) {
this->shift_type = shift_type;
this->params = params;
tmp_rotated = pre_allocated_buffer;
@@ -1800,7 +1403,7 @@ template <typename Torus> struct int_logical_scalar_shift_buffer {
}
}
}
void release(cudaStream_t *streams, uint32_t *gpu_indexes,
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
for (auto &buffer : lut_buffers_bivariate) {
buffer->release(streams, gpu_indexes, gpu_count);
@@ -1826,8 +1429,9 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
cudaStream_t *local_streams_2;
uint32_t active_gpu_count;
int_arithmetic_scalar_shift_buffer(cudaStream_t *streams,
uint32_t *gpu_indexes, uint32_t gpu_count,
int_arithmetic_scalar_shift_buffer(cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
SHIFT_OR_ROTATE_TYPE shift_type,
int_radix_params params,
uint32_t num_radix_blocks,
@@ -1971,7 +1575,7 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
}
}
void release(cudaStream_t *streams, uint32_t *gpu_indexes,
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
for (uint j = 0; j < active_gpu_count; j++) {
cuda_destroy_stream(local_streams_1[j], gpu_indexes[j]);
@@ -2004,9 +1608,10 @@ template <typename Torus> struct int_zero_out_if_buffer {
cudaStream_t *false_streams;
uint32_t active_gpu_count;
int_zero_out_if_buffer(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, int_radix_params params,
uint32_t num_radix_blocks, bool allocate_gpu_memory) {
int_zero_out_if_buffer(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
int_radix_params params, uint32_t num_radix_blocks,
bool allocate_gpu_memory) {
this->params = params;
active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
@@ -2025,7 +1630,7 @@ template <typename Torus> struct int_zero_out_if_buffer {
}
}
}
void release(cudaStream_t *streams, uint32_t *gpu_indexes,
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
cuda_drop_async(tmp, streams[0], gpu_indexes[0]);
for (uint j = 0; j < active_gpu_count; j++) {
@@ -2050,7 +1655,7 @@ template <typename Torus> struct int_cmux_buffer {
int_radix_params params;
int_cmux_buffer(cudaStream_t *streams, uint32_t *gpu_indexes,
int_cmux_buffer(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count,
std::function<Torus(Torus)> predicate_lut_f,
int_radix_params params, uint32_t num_radix_blocks,
@@ -2121,7 +1726,7 @@ template <typename Torus> struct int_cmux_buffer {
}
}
void release(cudaStream_t *streams, uint32_t *gpu_indexes,
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
predicate_lut->release(streams, gpu_indexes, gpu_count);
delete predicate_lut;
@@ -2152,9 +1757,9 @@ template <typename Torus> struct int_are_all_block_true_buffer {
// value).
std::unordered_map<int, int_radix_lut<Torus> *> is_equal_to_lut_map;
int_are_all_block_true_buffer(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, COMPARISON_TYPE op,
int_radix_params params,
int_are_all_block_true_buffer(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
COMPARISON_TYPE op, int_radix_params params,
uint32_t num_radix_blocks,
bool allocate_gpu_memory) {
this->params = params;
@@ -2174,7 +1779,7 @@ template <typename Torus> struct int_are_all_block_true_buffer {
}
}
void release(cudaStream_t *streams, uint32_t *gpu_indexes,
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
for (auto &lut : is_equal_to_lut_map) {
lut.second->release(streams, gpu_indexes, gpu_count);
@@ -2197,9 +1802,10 @@ template <typename Torus> struct int_comparison_eq_buffer {
int_are_all_block_true_buffer<Torus> *are_all_block_true_buffer;
int_comparison_eq_buffer(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, COMPARISON_TYPE op,
int_radix_params params, uint32_t num_radix_blocks,
int_comparison_eq_buffer(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
COMPARISON_TYPE op, int_radix_params params,
uint32_t num_radix_blocks,
bool allocate_gpu_memory) {
this->params = params;
this->op = op;
@@ -2272,7 +1878,7 @@ template <typename Torus> struct int_comparison_eq_buffer {
}
}
void release(cudaStream_t *streams, uint32_t *gpu_indexes,
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
operator_lut->release(streams, gpu_indexes, gpu_count);
delete operator_lut;
@@ -2298,7 +1904,8 @@ template <typename Torus> struct int_tree_sign_reduction_buffer {
Torus *tmp_x;
Torus *tmp_y;
int_tree_sign_reduction_buffer(cudaStream_t *streams, uint32_t *gpu_indexes,
int_tree_sign_reduction_buffer(cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
std::function<Torus(Torus)> operator_f,
int_radix_params params,
@@ -2340,7 +1947,7 @@ template <typename Torus> struct int_tree_sign_reduction_buffer {
}
}
void release(cudaStream_t *streams, uint32_t *gpu_indexes,
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
tree_inner_leaf_lut->release(streams, gpu_indexes, gpu_count);
delete tree_inner_leaf_lut;
@@ -2369,9 +1976,10 @@ template <typename Torus> struct int_comparison_diff_buffer {
Torus *tmp_signs_b;
int_radix_lut<Torus> *reduce_signs_lut;
int_comparison_diff_buffer(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, COMPARISON_TYPE op,
int_radix_params params, uint32_t num_radix_blocks,
int_comparison_diff_buffer(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
COMPARISON_TYPE op, int_radix_params params,
uint32_t num_radix_blocks,
bool allocate_gpu_memory) {
this->params = params;
this->op = op;
@@ -2415,7 +2023,7 @@ template <typename Torus> struct int_comparison_diff_buffer {
}
}
void release(cudaStream_t *streams, uint32_t *gpu_indexes,
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
tree_buffer->release(streams, gpu_indexes, gpu_count);
delete tree_buffer;
@@ -2463,10 +2071,11 @@ template <typename Torus> struct int_comparison_buffer {
cudaStream_t *msb_streams;
uint32_t active_gpu_count;
int_comparison_buffer(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, COMPARISON_TYPE op,
int_radix_params params, uint32_t num_radix_blocks,
bool is_signed, bool allocate_gpu_memory) {
int_comparison_buffer(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
COMPARISON_TYPE op, int_radix_params params,
uint32_t num_radix_blocks, bool is_signed,
bool allocate_gpu_memory) {
this->params = params;
this->op = op;
this->is_signed = is_signed;
@@ -2610,7 +2219,7 @@ template <typename Torus> struct int_comparison_buffer {
}
}
void release(cudaStream_t *streams, uint32_t *gpu_indexes,
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
switch (op) {
case COMPARISON_TYPE::MAX:
@@ -2701,8 +2310,9 @@ template <typename Torus> struct int_div_rem_memory {
// allocate and initialize if needed, temporary arrays used to calculate
// cuda integer div_rem operation
void init_temporary_buffers(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, uint32_t num_blocks) {
void init_temporary_buffers(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
uint32_t num_blocks) {
uint32_t big_lwe_size = params.big_lwe_dimension + 1;
// non boolean temporary arrays, with `num_blocks` blocks
@@ -2749,8 +2359,9 @@ template <typename Torus> struct int_div_rem_memory {
}
// initialize lookup tables for div_rem operation
void init_lookup_tables(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, uint32_t num_blocks) {
void init_lookup_tables(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
uint32_t num_blocks) {
uint32_t num_bits_in_message = 31 - __builtin_clz(params.message_modulus);
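    // Note (for m > 0): 31 - __builtin_clz(m) == floor(log2(m)), i.e. the
    // index of the highest set bit; e.g. message_modulus = 4 gives
    // __builtin_clz(4) = 29, so num_bits_in_message = 2.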
// create and generate masking_luts_1[] and masking_lut_2[]
@@ -2890,7 +2501,7 @@ template <typename Torus> struct int_div_rem_memory {
}
}
int_div_rem_memory(cudaStream_t *streams, uint32_t *gpu_indexes,
int_div_rem_memory(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_radix_params params,
uint32_t num_blocks, bool allocate_gpu_memory) {
active_gpu_count = get_active_gpu_count(2 * num_blocks, gpu_count);
@@ -2930,7 +2541,7 @@ template <typename Torus> struct int_div_rem_memory {
}
}
void release(cudaStream_t *streams, uint32_t *gpu_indexes,
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
uint32_t num_bits_in_message = 31 - __builtin_clz(params.message_modulus);
@@ -3033,9 +2644,9 @@ template <typename Torus> struct int_last_block_inner_propagate_memory {
int_radix_params params;
int_last_block_inner_propagate_memory(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
int_radix_params params, SIGNED_OPERATION op, uint32_t num_radix_blocks,
bool allocate_gpu_memory) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_radix_params params, SIGNED_OPERATION op,
uint32_t num_radix_blocks, bool allocate_gpu_memory) {
this->params = params;
auto message_modulus = params.message_modulus;
@@ -3100,7 +2711,7 @@ template <typename Torus> struct int_last_block_inner_propagate_memory {
gpu_indexes[0]);
}
void release(cudaStream_t *streams, uint32_t *gpu_indexes,
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
last_block_inner_propagation_lut->release(streams, gpu_indexes, gpu_count);
delete last_block_inner_propagation_lut;
@@ -3114,8 +2725,9 @@ template <typename Torus> struct int_resolve_signed_overflow_memory {
Torus *x;
int_resolve_signed_overflow_memory(cudaStream_t *streams,
uint32_t *gpu_indexes, uint32_t gpu_count,
int_resolve_signed_overflow_memory(cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int_radix_params params,
bool allocate_gpu_memory) {
@@ -3160,7 +2772,7 @@ template <typename Torus> struct int_resolve_signed_overflow_memory {
resolve_overflow_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
}
void release(cudaStream_t *streams, uint32_t *gpu_indexes,
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
resolve_overflow_lut->release(streams, gpu_indexes, gpu_count);
delete resolve_overflow_lut;
@@ -3190,7 +2802,8 @@ template <typename Torus> struct int_signed_overflowing_add_or_sub_memory {
// allocate temporary arrays used to calculate
// cuda integer signed overflowing add or sub
void allocate_temporary_buffers(cudaStream_t *streams, uint32_t *gpu_indexes,
void allocate_temporary_buffers(cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count, uint32_t num_blocks) {
uint32_t big_lwe_size = params.big_lwe_dimension + 1;
@@ -3210,9 +2823,9 @@ template <typename Torus> struct int_signed_overflowing_add_or_sub_memory {
// constructor without memory reuse
int_signed_overflowing_add_or_sub_memory(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
int_radix_params params, uint32_t num_blocks, SIGNED_OPERATION op,
bool allocate_gpu_memory) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_radix_params params, uint32_t num_blocks,
SIGNED_OPERATION op, bool allocate_gpu_memory) {
this->params = params;
active_gpu_count = get_active_gpu_count(num_blocks, gpu_count);
@@ -3241,7 +2854,7 @@ template <typename Torus> struct int_signed_overflowing_add_or_sub_memory {
streams, gpu_indexes, gpu_count, params, allocate_gpu_memory);
}
void release(cudaStream_t *streams, uint32_t *gpu_indexes,
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
// memory objects for other operations
scp_mem->release(streams, gpu_indexes, gpu_count);
@@ -3273,7 +2886,7 @@ template <typename Torus> struct int_bitop_buffer {
int_radix_params params;
int_radix_lut<Torus> *lut;
int_bitop_buffer(cudaStream_t *streams, uint32_t *gpu_indexes,
int_bitop_buffer(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, BITOP_TYPE op, int_radix_params params,
uint32_t num_radix_blocks, bool allocate_gpu_memory) {
@@ -3337,7 +2950,7 @@ template <typename Torus> struct int_bitop_buffer {
}
}
void release(cudaStream_t *streams, uint32_t *gpu_indexes,
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
lut->release(streams, gpu_indexes, gpu_count);
delete lut;
@@ -3351,9 +2964,10 @@ template <typename Torus> struct int_scalar_mul_buffer {
Torus *preshifted_buffer;
Torus *all_shifted_buffer;
int_scalar_mul_buffer(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, int_radix_params params,
uint32_t num_radix_blocks, bool allocate_gpu_memory) {
int_scalar_mul_buffer(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
int_radix_params params, uint32_t num_radix_blocks,
bool allocate_gpu_memory) {
this->params = params;
if (allocate_gpu_memory) {
@@ -3390,7 +3004,7 @@ template <typename Torus> struct int_scalar_mul_buffer {
}
}
void release(cudaStream_t *streams, uint32_t *gpu_indexes,
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
sum_ciphertexts_vec_mem->release(streams, gpu_indexes, gpu_count);
delete sum_ciphertexts_vec_mem;
@@ -3398,4 +3012,4 @@ template <typename Torus> struct int_scalar_mul_buffer {
}
};
#endif // CUDA_INTEGER_H
#endif // CUDA_INTEGER_UTILITIES_H
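The renamed guard reflects the file split applied across this diff: the extern "C" declarations move out to a plain C header that the binding generator can consume, while the templated buffer structs stay in this utilities header. The resulting include layout, with paths as they appear in other hunks of this diff:

#include "integer.h"                   // extern "C" surface, fed to the binding generator
#include "integer/integer_utilities.h" // C++ templates and buffers (this file)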

View File

@@ -1,21 +1,23 @@
#ifndef CNCRT_KS_H_
#define CNCRT_KS_H_
#include <cstdint>
#include <stdint.h>
extern "C" {
void cuda_keyswitch_lwe_ciphertext_vector_32(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
uint32_t base_log, uint32_t level_count, uint32_t num_samples);
void const *lwe_output_indexes, void const *lwe_array_in,
void const *lwe_input_indexes, void const *ksk, uint32_t lwe_dimension_in,
uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count,
uint32_t num_samples);
void cuda_keyswitch_lwe_ciphertext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
uint32_t base_log, uint32_t level_count, uint32_t num_samples);
void const *lwe_output_indexes, void const *lwe_array_in,
void const *lwe_input_indexes, void const *ksk, uint32_t lwe_dimension_in,
uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count,
uint32_t num_samples);
void scratch_packing_keyswitch_lwe_list_to_glwe_64(
void *stream, uint32_t gpu_index, int8_t **fp_ks_buffer,
@@ -23,10 +25,11 @@ void scratch_packing_keyswitch_lwe_list_to_glwe_64(
bool allocate_gpu_memory);
void cuda_packing_keyswitch_lwe_list_to_glwe_64(
void *stream, uint32_t gpu_index, void *glwe_array_out, void *lwe_array_in,
void *fp_ksk_array, int8_t *fp_ks_buffer, uint32_t input_lwe_dimension,
uint32_t output_glwe_dimension, uint32_t output_polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_lwes);
void *stream, uint32_t gpu_index, void *glwe_array_out,
void const *lwe_array_in, void const *fp_ksk_array, int8_t *fp_ks_buffer,
uint32_t input_lwe_dimension, uint32_t output_glwe_dimension,
uint32_t output_polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_lwes);
void cleanup_packing_keyswitch_lwe_list_to_glwe(void *stream,
uint32_t gpu_index,

View File

@@ -1,50 +1,48 @@
#ifndef CUDA_LINALG_H_
#define CUDA_LINALG_H_
#include "programmable_bootstrap.h"
#include <cstdint>
#include <device.h>
#include <stdint.h>
extern "C" {
void cuda_negate_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
void *lwe_array_out,
void *lwe_array_in,
void const *lwe_array_in,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
void cuda_negate_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
void *lwe_array_out,
void *lwe_array_in,
void const *lwe_array_in,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
void cuda_add_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
void *lwe_array_out,
void *lwe_array_in_1,
void *lwe_array_in_2,
void const *lwe_array_in_1,
void const *lwe_array_in_2,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
void cuda_add_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
void *lwe_array_out,
void *lwe_array_in_1,
void *lwe_array_in_2,
void const *lwe_array_in_1,
void const *lwe_array_in_2,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
void cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
void *plaintext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_array_in, void const *plaintext_array_in,
uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count);
void cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
void *plaintext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_array_in, void const *plaintext_array_in,
uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count);
void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32(
void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
void *cleartext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_array_in, void const *cleartext_array_in,
uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count);
void cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
void *cleartext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_array_in, void const *cleartext_array_in,
uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count);
}
#endif // CUDA_LINALG_H_
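Every linalg entry point now distinguishes one writable output from const inputs. A hedged call-site sketch; the wrapper name and device buffers are hypothetical, and each ciphertext is assumed to occupy input_lwe_dimension + 1 words:

#include "linear_algebra.h" // path as used elsewhere in this diff
#include <cstdint>

void add_ciphertext_vectors(void *stream, uint32_t gpu_index, uint64_t *d_out,
                            uint64_t const *d_lhs, uint64_t const *d_rhs,
                            uint32_t lwe_dimension, uint32_t count) {
  // d_out is written; d_lhs/d_rhs are only read, as the new signatures state.
  cuda_add_lwe_ciphertext_vector_64(stream, gpu_index, d_out, d_lhs, d_rhs,
                                    lwe_dimension, count);
}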

View File

@@ -0,0 +1,7 @@
#ifndef CUDA_PBS_ENUMS_H
#define CUDA_PBS_ENUMS_H
enum PBS_TYPE { MULTI_BIT = 0, CLASSICAL = 1 };
enum PBS_VARIANT { DEFAULT = 0, CG = 1, TBC = 2 };
#endif // CUDA_PBS_ENUMS_H
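This new pbs_enums.h carries only plain C enums, so the extern "C" API headers and the C++ utilities can share one definition (the duplicate copies in the old bootstrap header are deleted further down), and a binding generator can translate it directly. A trivial usage sketch; the helper is illustrative:

#include "pbs_enums.h"

const char *pbs_type_name(PBS_TYPE t) {
  switch (t) {
  case MULTI_BIT: return "multi-bit"; // variant that uses a grouping factor
  case CLASSICAL: return "classical";
  }
  return "unknown";
}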

View File

@@ -1,38 +1,7 @@
#ifndef CUDA_MULTI_BIT_H
#define CUDA_MULTI_BIT_H
#ifndef CUDA_MULTI_BIT_UTILITIES_H
#define CUDA_MULTI_BIT_UTILITIES_H
#include "programmable_bootstrap.h"
#include <cstdint>
extern "C" {
bool has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t num_samples);
void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
void *stream, uint32_t gpu_index, void *dest, void *src,
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size, uint32_t grouping_factor);
void scratch_cuda_multi_bit_programmable_bootstrap_64(
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
uint32_t lut_stride);
void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
uint32_t gpu_index,
int8_t **pbs_buffer);
}
#include "pbs_utilities.h"
template <typename Torus>
bool supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
@@ -53,8 +22,9 @@ void scratch_cuda_tbc_multi_bit_programmable_bootstrap(
template <typename Torus>
void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
@@ -70,8 +40,9 @@ void scratch_cuda_cg_multi_bit_programmable_bootstrap(
template <typename Torus>
void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
@@ -86,8 +57,9 @@ void scratch_cuda_multi_bit_programmable_bootstrap(
template <typename Torus>
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
@@ -121,6 +93,10 @@ template <typename Torus>
uint64_t get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap(
uint32_t polynomial_size);
template <typename Torus, class params>
uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
uint32_t polynomial_size);
template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
int8_t *d_mem_keybundle = NULL;
int8_t *d_mem_acc_step_one = NULL;
@@ -288,8 +264,4 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
}
};
template <typename Torus, class params>
uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
uint32_t polynomial_size);
#endif // CUDA_MULTI_BIT_H
#endif // CUDA_MULTI_BIT_UTILITIES_H

View File

@@ -1,87 +1,10 @@
#ifndef CUDA_BOOTSTRAP_H
#define CUDA_BOOTSTRAP_H
#ifndef CUDA_BOOTSTRAP_UTILITIES_H
#define CUDA_BOOTSTRAP_UTILITIES_H
#include "device.h"
#include <cstdint>
enum PBS_TYPE { MULTI_BIT = 0, CLASSICAL = 1 };
enum PBS_VARIANT { DEFAULT = 0, CG = 1, TBC = 2 };
extern "C" {
void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
void *input1, void *input2, void *output,
uint32_t polynomial_size,
uint32_t total_polynomials);
void cuda_convert_lwe_programmable_bootstrap_key_32(
void *stream, uint32_t gpu_index, void *dest, void *src,
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size);
void cuda_convert_lwe_programmable_bootstrap_key_64(
void *stream, uint32_t gpu_index, void *dest, void *src,
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size);
void scratch_cuda_programmable_bootstrap_amortized_32(
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
void scratch_cuda_programmable_bootstrap_amortized_64(
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples);
void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples);
void cleanup_cuda_programmable_bootstrap_amortized(void *stream,
uint32_t gpu_index,
int8_t **pbs_buffer);
void scratch_cuda_programmable_bootstrap_32(
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
void scratch_cuda_programmable_bootstrap_64(
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t lut_count, uint32_t lut_stride);
void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t lut_count, uint32_t lut_stride);
void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index,
int8_t **pbs_buffer);
}
#include "pbs_enums.h"
#include "vector_types.h"
#include <stdint.h>
template <typename Torus>
uint64_t get_buffer_size_full_sm_programmable_bootstrap_step_one(
@@ -327,8 +250,9 @@ bool has_support_to_cuda_programmable_bootstrap_cg(uint32_t glwe_dimension,
template <typename Torus>
void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
@@ -337,8 +261,9 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
template <typename Torus>
void cuda_programmable_bootstrap_lwe_ciphertext_vector(
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
@@ -348,8 +273,9 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
template <typename Torus>
void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
@@ -408,4 +334,4 @@ __device__ const T *get_multi_bit_ith_lwe_gth_group_kth_block(
#endif
#endif // CUDA_BOOTSTRAP_H
#endif // CUDA_BOOTSTRAP_UTILITIES_H

View File

@@ -0,0 +1,86 @@
#ifndef CUDA_BOOTSTRAP_H
#define CUDA_BOOTSTRAP_H
#include "pbs_enums.h"
#include <stdint.h>
extern "C" {
void cuda_fourier_polynomial_mul(void *stream, uint32_t gpu_index,
void const *input1, void const *input2,
void *output, uint32_t polynomial_size,
uint32_t total_polynomials);
void cuda_convert_lwe_programmable_bootstrap_key_32(
void *stream, uint32_t gpu_index, void *dest, void const *src,
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size);
void cuda_convert_lwe_programmable_bootstrap_key_64(
void *stream, uint32_t gpu_index, void *dest, void const *src,
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size);
void scratch_cuda_programmable_bootstrap_amortized_32(
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
void scratch_cuda_programmable_bootstrap_amortized_64(
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_output_indexes, void const *lut_vector,
void const *lut_vector_indexes, void const *lwe_array_in,
void const *lwe_input_indexes, void const *bootstrapping_key,
int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples);
void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_output_indexes, void const *lut_vector,
void const *lut_vector_indexes, void const *lwe_array_in,
void const *lwe_input_indexes, void const *bootstrapping_key,
int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples);
void cleanup_cuda_programmable_bootstrap_amortized(void *stream,
uint32_t gpu_index,
int8_t **pbs_buffer);
void scratch_cuda_programmable_bootstrap_32(
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
void scratch_cuda_programmable_bootstrap_64(
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_output_indexes, void const *lut_vector,
void const *lut_vector_indexes, void const *lwe_array_in,
void const *lwe_input_indexes, void const *bootstrapping_key,
int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t lut_count, uint32_t lut_stride);
void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_output_indexes, void const *lut_vector,
void const *lut_vector_indexes, void const *lwe_array_in,
void const *lwe_input_indexes, void const *bootstrapping_key,
int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t lut_count, uint32_t lut_stride);
void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index,
int8_t **pbs_buffer);
}
#endif // CUDA_BOOTSTRAP_H
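This header is the new pure-C surface for the classical PBS, and its declarations follow a scratch/run/cleanup lifecycle. A hedged sketch of that lifecycle; all device pointers and parameter values are assumed to come from the caller:

#include "pbs/programmable_bootstrap.h" // path as included elsewhere in this diff

void run_pbs_64(void *stream, uint32_t gpu_index, void *d_lwe_out,
                void const *d_out_idx, void const *d_lut, void const *d_lut_idx,
                void const *d_lwe_in, void const *d_in_idx, void const *d_bsk,
                uint32_t lwe_dimension, uint32_t glwe_dimension,
                uint32_t polynomial_size, uint32_t base_log,
                uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
                uint32_t lut_stride) {
  int8_t *buffer = nullptr;
  // 1) allocate scratch space once
  scratch_cuda_programmable_bootstrap_64(stream, gpu_index, &buffer,
                                         glwe_dimension, polynomial_size,
                                         level_count, num_samples,
                                         /*allocate_gpu_memory=*/true);
  // 2) bootstrap against that buffer
  cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
      stream, gpu_index, d_lwe_out, d_out_idx, d_lut, d_lut_idx, d_lwe_in,
      d_in_idx, d_bsk, buffer, lwe_dimension, glwe_dimension, polynomial_size,
      base_log, level_count, num_samples, lut_count, lut_stride);
  // 3) release the scratch space
  cleanup_cuda_programmable_bootstrap(stream, gpu_index, &buffer);
}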

View File

@@ -0,0 +1,38 @@
#ifndef CUDA_MULTI_BIT_H
#define CUDA_MULTI_BIT_H
#include "pbs_enums.h"
#include "stdint.h"
extern "C" {
bool has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t num_samples);
void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
void *stream, uint32_t gpu_index, void *dest, void const *src,
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size, uint32_t grouping_factor);
void scratch_cuda_multi_bit_programmable_bootstrap_64(
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_output_indexes, void const *lut_vector,
void const *lut_vector_indexes, void const *lwe_array_in,
void const *lwe_input_indexes, void const *bootstrapping_key,
int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
uint32_t lut_stride);
void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
uint32_t gpu_index,
int8_t **pbs_buffer);
}
#endif // CUDA_MULTI_BIT_H
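The multi-bit C API mirrors the classical one, with grouping_factor threaded through key conversion. A sketch pairing key conversion with the CG support probe; the wrapper and its buffers are hypothetical:

#include "pbs/programmable_bootstrap_multibit.h" // assumed header path

bool prepare_multi_bit_key(void *stream, uint32_t gpu_index, void *d_bsk,
                           void const *h_bsk, uint32_t input_lwe_dim,
                           uint32_t glwe_dim, uint32_t level_count,
                           uint32_t polynomial_size, uint32_t grouping_factor,
                           uint32_t num_samples) {
  // One-off key conversion into the GPU-side layout.
  cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
      stream, gpu_index, d_bsk, h_bsk, input_lwe_dim, glwe_dim, level_count,
      polynomial_size, grouping_factor);
  // Use the CG variant only where it is supported.
  return has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
      glwe_dim, polynomial_size, level_count, num_samples);
}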

View File

@@ -22,8 +22,8 @@ void cuda_convert_lwe_ciphertext_vector_to_cpu_64(void *stream,
}
void cuda_glwe_sample_extract_64(void *stream, uint32_t gpu_index,
void *lwe_array_out, void *glwe_array_in,
uint32_t *nth_array, uint32_t num_nths,
void *lwe_array_out, void const *glwe_array_in,
uint32_t const *nth_array, uint32_t num_nths,
uint32_t glwe_dimension,
uint32_t polynomial_size) {
@@ -31,43 +31,43 @@ void cuda_glwe_sample_extract_64(void *stream, uint32_t gpu_index,
case 256:
host_sample_extract<uint64_t, AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths,
(uint64_t const *)glwe_array_in, (uint32_t const *)nth_array, num_nths,
glwe_dimension);
break;
case 512:
host_sample_extract<uint64_t, AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths,
(uint64_t const *)glwe_array_in, (uint32_t const *)nth_array, num_nths,
glwe_dimension);
break;
case 1024:
host_sample_extract<uint64_t, AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths,
(uint64_t const *)glwe_array_in, (uint32_t const *)nth_array, num_nths,
glwe_dimension);
break;
case 2048:
host_sample_extract<uint64_t, AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths,
(uint64_t const *)glwe_array_in, (uint32_t const *)nth_array, num_nths,
glwe_dimension);
break;
case 4096:
host_sample_extract<uint64_t, AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths,
(uint64_t const *)glwe_array_in, (uint32_t const *)nth_array, num_nths,
glwe_dimension);
break;
case 8192:
host_sample_extract<uint64_t, AmortizedDegree<8192>>(
static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths,
(uint64_t const *)glwe_array_in, (uint32_t const *)nth_array, num_nths,
glwe_dimension);
break;
case 16384:
host_sample_extract<uint64_t, AmortizedDegree<16384>>(
static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths,
(uint64_t const *)glwe_array_in, (uint32_t const *)nth_array, num_nths,
glwe_dimension);
break;
default:

View File

@@ -27,8 +27,9 @@ void cuda_convert_lwe_ciphertext_vector_to_cpu(cudaStream_t stream,
}
template <typename Torus, class params>
__global__ void sample_extract(Torus *lwe_array_out, Torus *glwe_array_in,
uint32_t *nth_array, uint32_t glwe_dimension) {
__global__ void sample_extract(Torus *lwe_array_out, Torus const *glwe_array_in,
uint32_t const *nth_array,
uint32_t glwe_dimension) {
const int input_id = blockIdx.x;
@@ -50,8 +51,9 @@ __global__ void sample_extract(Torus *lwe_array_out, Torus *glwe_array_in,
template <typename Torus, class params>
__host__ void host_sample_extract(cudaStream_t stream, uint32_t gpu_index,
Torus *lwe_array_out, Torus *glwe_array_in,
uint32_t *nth_array, uint32_t num_nths,
Torus *lwe_array_out,
Torus const *glwe_array_in,
uint32_t const *nth_array, uint32_t num_nths,
uint32_t glwe_dimension) {
cudaSetDevice(gpu_index);

View File

@@ -37,16 +37,18 @@ void cuda_keyswitch_lwe_ciphertext_vector_32(
*/
void cuda_keyswitch_lwe_ciphertext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
void const *lwe_output_indexes, void const *lwe_array_in,
void const *lwe_input_indexes, void const *ksk, uint32_t lwe_dimension_in,
uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count,
uint32_t num_samples) {
host_keyswitch_lwe_ciphertext_vector<uint64_t>(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes), static_cast<uint64_t *>(ksk),
lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples);
static_cast<const uint64_t *>(lwe_output_indexes),
static_cast<const uint64_t *>(lwe_array_in),
static_cast<const uint64_t *>(lwe_input_indexes),
static_cast<const uint64_t *>(ksk), lwe_dimension_in, lwe_dimension_out,
base_log, level_count, num_samples);
}
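The wrapper now converts each const input with static_cast<const uint64_t *> rather than a plain C-style cast. static_cast refuses to drop const, so the read-only contract of the new signature is compiler-enforced; a minimal illustration:

#include <cstdint>

const uint64_t *as_const_u64(void const *p) {
  return static_cast<const uint64_t *>(p); // fine: const is preserved
  // return static_cast<uint64_t *>(p);    // rejected: would cast away const
}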
void scratch_packing_keyswitch_lwe_list_to_glwe_64(
@@ -61,18 +63,19 @@ void scratch_packing_keyswitch_lwe_list_to_glwe_64(
* ciphertexts.
*/
void cuda_packing_keyswitch_lwe_list_to_glwe_64(
void *stream, uint32_t gpu_index, void *glwe_array_out, void *lwe_array_in,
void *fp_ksk_array, int8_t *fp_ks_buffer, uint32_t input_lwe_dimension,
uint32_t output_glwe_dimension, uint32_t output_polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_lwes) {
void *stream, uint32_t gpu_index, void *glwe_array_out,
void const *lwe_array_in, void const *fp_ksk_array, int8_t *fp_ks_buffer,
uint32_t input_lwe_dimension, uint32_t output_glwe_dimension,
uint32_t output_polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_lwes) {
host_packing_keyswitch_lwe_list_to_glwe<uint64_t>(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(glwe_array_out),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(fp_ksk_array), fp_ks_buffer, input_lwe_dimension,
output_glwe_dimension, output_polynomial_size, base_log, level_count,
num_lwes);
static_cast<const uint64_t *>(lwe_array_in),
static_cast<const uint64_t *>(fp_ksk_array), fp_ks_buffer,
input_lwe_dimension, output_glwe_dimension, output_polynomial_size,
base_log, level_count, num_lwes);
}
void cleanup_packing_keyswitch_lwe_list_to_glwe(void *stream,

View File

@@ -101,9 +101,10 @@ keyswitch(Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
template <typename Torus>
__host__ void host_keyswitch_lwe_ciphertext_vector(
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_output_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
Torus *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
Torus const *lwe_output_indexes, Torus const *lwe_array_in,
Torus const *lwe_input_indexes, Torus const *ksk, uint32_t lwe_dimension_in,
uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count,
uint32_t num_samples) {
cudaSetDevice(gpu_index);
@@ -124,13 +125,13 @@ __host__ void host_keyswitch_lwe_ciphertext_vector(
}
template <typename Torus>
void execute_keyswitch_async(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count,
void execute_keyswitch_async(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
const LweArrayVariant<Torus> &lwe_array_out,
const LweArrayVariant<Torus> &lwe_output_indexes,
const LweArrayVariant<Torus> &lwe_array_in,
const LweArrayVariant<Torus> &lwe_input_indexes,
Torus **ksks, uint32_t lwe_dimension_in,
Torus *const *ksks, uint32_t lwe_dimension_in,
uint32_t lwe_dimension_out, uint32_t base_log,
uint32_t level_count, uint32_t num_samples) {
@@ -176,9 +177,9 @@ __host__ void scratch_packing_keyswitch_lwe_list_to_glwe(
// different thread blocks at the x-axis to work on that input.
template <typename Torus>
__device__ void packing_keyswitch_lwe_ciphertext_into_glwe_ciphertext(
Torus *glwe_out, Torus *lwe_in, Torus *fp_ksk, uint32_t lwe_dimension_in,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count) {
Torus *glwe_out, Torus const *lwe_in, Torus const *fp_ksk,
uint32_t lwe_dimension_in, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count) {
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
size_t glwe_size = (glwe_dimension + 1);
@@ -225,12 +226,11 @@ __device__ void packing_keyswitch_lwe_ciphertext_into_glwe_ciphertext(
// Assumes there are (glwe_dimension+1) * polynomial_size threads split through
// different thread blocks at the x-axis to work on that input.
template <typename Torus>
__global__ void
packing_keyswitch_lwe_list_to_glwe(Torus *glwe_array_out, Torus *lwe_array_in,
Torus *fp_ksk, uint32_t lwe_dimension_in,
uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, Torus *d_mem) {
__global__ void packing_keyswitch_lwe_list_to_glwe(
Torus *glwe_array_out, Torus const *lwe_array_in, Torus const *fp_ksk,
uint32_t lwe_dimension_in, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
Torus *d_mem) {
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
const int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size;
@@ -276,7 +276,7 @@ __global__ void accumulate_glwes(Torus *glwe_out, Torus *glwe_array_in,
template <typename Torus>
__host__ void host_packing_keyswitch_lwe_list_to_glwe(
cudaStream_t stream, uint32_t gpu_index, Torus *glwe_out,
Torus *lwe_array_in, Torus *fp_ksk_array, int8_t *fp_ks_buffer,
Torus const *lwe_array_in, Torus const *fp_ksk_array, int8_t *fp_ks_buffer,
uint32_t lwe_dimension_in, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_lwes) {

View File

@@ -113,7 +113,7 @@ void cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
}
/// Copy memory within a GPU asynchronously
void cuda_memcpy_async_gpu_to_gpu(void *dest, void *src, uint64_t size,
void cuda_memcpy_async_gpu_to_gpu(void *dest, void const *src, uint64_t size,
cudaStream_t stream, uint32_t gpu_index) {
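  // For reference: the runtime call this wrapper forwards to is itself
  // const-correct,
  //   cudaError_t cudaMemcpyAsync(void *dst, const void *src, size_t count,
  //                               cudaMemcpyKind kind, cudaStream_t stream);
  // so `void const *src` matches the contract all the way down.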
if (size == 0)
return;

View File

@@ -1,8 +1,8 @@
#include "integer/addition.cuh"
void scratch_cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t glwe_dimension, uint32_t polynomial_size,
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, int8_t signed_operation,
@@ -23,9 +23,10 @@ void scratch_cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64(
}
void cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lhs,
void *rhs, void *overflowed, int8_t signed_operation, int8_t *mem_ptr,
void **bsks, void **ksks, uint32_t num_blocks) {
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lhs, void const *rhs, void *overflowed, int8_t signed_operation,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
uint32_t num_blocks) {
auto mem = (int_signed_overflowing_add_or_sub_memory<uint64_t> *)mem_ptr;
SIGNED_OPERATION op = (signed_operation == 1) ? SIGNED_OPERATION::ADDITION
@@ -33,13 +34,13 @@ void cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64(
host_integer_signed_overflowing_add_or_sub_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lhs), static_cast<uint64_t *>(rhs),
static_cast<uint64_t *>(overflowed), op, bsks, (uint64_t **)(ksks), mem,
num_blocks);
static_cast<uint64_t *>(lhs), static_cast<uint64_t const *>(rhs),
static_cast<uint64_t *>(overflowed), op, bsks, (uint64_t *const *)(ksks),
mem, num_blocks);
}
void cleanup_signed_overflowing_add_or_sub(void **streams,
uint32_t *gpu_indexes,
void cleanup_signed_overflowing_add_or_sub(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void) {
int_signed_overflowing_add_or_sub_memory<uint64_t> *mem_ptr =

View File

@@ -3,13 +3,13 @@
#include "crypto/keyswitch.cuh"
#include "device.h"
#include "integer.h"
#include "integer/comparison.cuh"
#include "integer/integer.cuh"
#include "integer/integer_utilities.h"
#include "integer/negation.cuh"
#include "integer/scalar_shifts.cuh"
#include "linear_algebra.h"
#include "programmable_bootstrap.h"
#include "pbs/programmable_bootstrap.h"
#include "utils/helper.cuh"
#include "utils/kernel_dimensions.cuh"
#include <fstream>
@@ -20,10 +20,11 @@
template <typename Torus>
void host_resolve_signed_overflow(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *result, Torus *last_block_inner_propagation,
Torus *last_block_input_carry, Torus *last_block_output_carry,
int_resolve_signed_overflow_memory<Torus> *mem, void **bsks, Torus **ksks) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *result, Torus *last_block_inner_propagation,
Torus const *last_block_input_carry, Torus *last_block_output_carry,
int_resolve_signed_overflow_memory<Torus> *mem, void *const *bsks,
Torus *const *ksks) {
auto x = mem->x;
@@ -53,7 +54,8 @@ void host_resolve_signed_overflow(
template <typename Torus>
__host__ void scratch_cuda_integer_signed_overflowing_add_or_sub_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count,
int_signed_overflowing_add_or_sub_memory<Torus> **mem_ptr,
uint32_t num_blocks, SIGNED_OPERATION op, int_radix_params params,
bool allocate_gpu_memory) {
@@ -69,9 +71,9 @@ __host__ void scratch_cuda_integer_signed_overflowing_add_or_sub_kb(
*/
template <typename Torus>
__host__ void host_integer_signed_overflowing_add_or_sub_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lhs, Torus *rhs, Torus *overflowed, SIGNED_OPERATION op, void **bsks,
uint64_t **ksks,
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lhs, Torus const *rhs, Torus *overflowed,
SIGNED_OPERATION op, void *const *bsks, uint64_t *const *ksks,
int_signed_overflowing_add_or_sub_memory<uint64_t> *mem_ptr,
uint32_t num_blocks) {

View File

@@ -1,8 +1,8 @@
#include "integer/bitwise_ops.cuh"
void scratch_cuda_integer_radix_bitop_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t glwe_dimension, uint32_t polynomial_size,
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t lwe_ciphertext_count,
@@ -21,21 +21,23 @@ void scratch_cuda_integer_radix_bitop_kb_64(
}
void cuda_bitop_integer_radix_ciphertext_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void *lwe_array_1, void *lwe_array_2, int8_t *mem_ptr,
void **bsks, void **ksks, uint32_t lwe_ciphertext_count) {
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void const *lwe_array_1, void const *lwe_array_2,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
uint32_t lwe_ciphertext_count) {
host_integer_radix_bitop_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_1),
static_cast<uint64_t *>(lwe_array_2),
static_cast<const uint64_t *>(lwe_array_1),
static_cast<const uint64_t *>(lwe_array_2),
(int_bitop_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
lwe_ciphertext_count);
}
void cleanup_cuda_integer_bitop(void **streams, uint32_t *gpu_indexes,
uint32_t gpu_count, int8_t **mem_ptr_void) {
void cleanup_cuda_integer_bitop(void *const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void) {
int_bitop_buffer<uint64_t> *mem_ptr =
(int_bitop_buffer<uint64_t> *)(*mem_ptr_void);
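
The `_64` entry points keep a `void`-typed C ABI and cast to the concrete `uint64_t` instantiation before dispatching to the templated host function. A reduced sketch of that pattern, with `host_bitop` as a stand-in for `host_integer_radix_bitop_kb<uint64_t>`:

#include <cstdint>

template <typename Torus>
void host_bitop(Torus *out, Torus const *in1, Torus const *in2, uint32_t n) {
  for (uint32_t i = 0; i < n; ++i)
    out[i] = in1[i] & in2[i]; // stand-in for the real LUT-based bitop
}

void bitop_64(void *out, void const *in1, void const *in2, uint32_t n) {
  // static_cast suffices here: only the pointee type changes, and the
  // const added at the C boundary is preserved on the inputs.
  host_bitop<uint64_t>(static_cast<uint64_t *>(out),
                       static_cast<const uint64_t *>(in1),
                       static_cast<const uint64_t *>(in2), n);
}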


@@ -4,7 +4,7 @@
#include "crypto/keyswitch.cuh"
#include "device.h"
#include "integer.cuh"
#include "integer.h"
#include "integer/integer_utilities.h"
#include "pbs/programmable_bootstrap_classic.cuh"
#include "pbs/programmable_bootstrap_multibit.cuh"
#include "polynomial/functions.cuh"
@@ -12,12 +12,11 @@
#include <omp.h>
template <typename Torus>
__host__ void
host_integer_radix_bitop_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out,
Torus *lwe_array_1, Torus *lwe_array_2,
int_bitop_buffer<Torus> *mem_ptr, void **bsks,
Torus **ksks, uint32_t num_radix_blocks) {
__host__ void host_integer_radix_bitop_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_1,
Torus const *lwe_array_2, int_bitop_buffer<Torus> *mem_ptr,
void *const *bsks, Torus *const *ksks, uint32_t num_radix_blocks) {
auto lut = mem_ptr->lut;
@@ -28,9 +27,10 @@ host_integer_radix_bitop_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
template <typename Torus>
__host__ void scratch_cuda_integer_radix_bitop_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
int_bitop_buffer<Torus> **mem_ptr, uint32_t num_radix_blocks,
int_radix_params params, BITOP_TYPE op, bool allocate_gpu_memory) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_bitop_buffer<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params, BITOP_TYPE op,
bool allocate_gpu_memory) {
*mem_ptr =
new int_bitop_buffer<Torus>(streams, gpu_indexes, gpu_count, op, params,
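
The scratch_/cleanup_ pairs pass working buffers across the FFI as an opaque `int8_t **`: scratch allocates with `new` and writes the pointer out, cleanup casts it back and frees it. A minimal sketch of that ownership protocol (the `Buffer` type is illustrative; the real buffers also release device memory before deletion):

#include <cstdint>

struct Buffer {
  int data = 0; // stand-in for the device allocations held by the buffer
};

void scratch(int8_t **mem_ptr) {
  *mem_ptr = reinterpret_cast<int8_t *>(new Buffer); // ownership to caller
}

void cleanup(int8_t **mem_ptr) {
  Buffer *buffer = reinterpret_cast<Buffer *>(*mem_ptr);
  delete buffer;
  *mem_ptr = nullptr; // leave no dangling handle behind
}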


@@ -1,8 +1,8 @@
#include "integer/cmux.cuh"
void scratch_cuda_integer_radix_cmux_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t glwe_dimension, uint32_t polynomial_size,
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t lwe_ciphertext_count,
@@ -24,23 +24,24 @@ void scratch_cuda_integer_radix_cmux_kb_64(
}
void cuda_cmux_integer_radix_ciphertext_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void *lwe_condition, void *lwe_array_true,
void *lwe_array_false, int8_t *mem_ptr, void **bsks, void **ksks,
uint32_t lwe_ciphertext_count) {
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void const *lwe_condition, void const *lwe_array_true,
void const *lwe_array_false, int8_t *mem_ptr, void *const *bsks,
void *const *ksks, uint32_t lwe_ciphertext_count) {
host_integer_radix_cmux_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_condition),
static_cast<uint64_t *>(lwe_array_true),
static_cast<uint64_t *>(lwe_array_false),
static_cast<const uint64_t *>(lwe_condition),
static_cast<const uint64_t *>(lwe_array_true),
static_cast<const uint64_t *>(lwe_array_false),
(int_cmux_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
lwe_ciphertext_count);
}
void cleanup_cuda_integer_radix_cmux(void **streams, uint32_t *gpu_indexes,
void cleanup_cuda_integer_radix_cmux(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void) {
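
One detail worth noting in these shims: the keys arrive as `void *const *` but are forwarded as `(uint64_t **)(ksks)`. The C-style cast both reinterprets the pointee and drops the const added at the boundary, which a single `static_cast` cannot do; the assumption is that the templated callee never writes through the key pointers. A reduced illustration:

#include <cstdint>

void takes_mutable_keys(uint64_t **ksks) { (void)ksks; }

void shim(void *const *ksks) {
  // C-style cast == reinterpret_cast + const_cast in one step; benign
  // only because the callee treats the keys as read-only.
  takes_mutable_keys((uint64_t **)(ksks));
}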


@@ -4,12 +4,13 @@
#include "integer.cuh"
template <typename Torus>
__host__ void zero_out_if(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out,
Torus *lwe_array_input, Torus *lwe_condition,
__host__ void zero_out_if(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus const *lwe_array_input,
Torus const *lwe_condition,
int_zero_out_if_buffer<Torus> *mem_ptr,
int_radix_lut<Torus> *predicate, void **bsks,
Torus **ksks, uint32_t num_radix_blocks) {
int_radix_lut<Torus> *predicate, void *const *bsks,
Torus *const *ksks, uint32_t num_radix_blocks) {
cudaSetDevice(gpu_indexes[0]);
auto params = mem_ptr->params;
@@ -42,10 +43,11 @@ __host__ void zero_out_if(cudaStream_t *streams, uint32_t *gpu_indexes,
template <typename Torus>
__host__ void host_integer_radix_cmux_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_condition, Torus *lwe_array_true,
Torus *lwe_array_false, int_cmux_buffer<Torus> *mem_ptr, void **bsks,
Torus **ksks, uint32_t num_radix_blocks) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_condition,
Torus const *lwe_array_true, Torus const *lwe_array_false,
int_cmux_buffer<Torus> *mem_ptr, void *const *bsks, Torus *const *ksks,
uint32_t num_radix_blocks) {
auto params = mem_ptr->params;
@@ -89,8 +91,8 @@ __host__ void host_integer_radix_cmux_kb(
template <typename Torus>
__host__ void scratch_cuda_integer_radix_cmux_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
int_cmux_buffer<Torus> **mem_ptr,
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_cmux_buffer<Torus> **mem_ptr,
std::function<Torus(Torus)> predicate_lut_f, uint32_t num_radix_blocks,
int_radix_params params, bool allocate_gpu_memory) {
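
The cmux scratch function receives the selection logic as a `std::function<Torus(Torus)> predicate_lut_f` and tabulates it into a lookup table. A sketch of just the tabulation step (the real LUTs are then encoded into GLWE accumulators, which is omitted here):

#include <cstdint>
#include <functional>
#include <vector>

template <typename Torus>
std::vector<Torus> tabulate(std::function<Torus(Torus)> f,
                            uint32_t message_modulus) {
  std::vector<Torus> lut(message_modulus);
  for (uint32_t x = 0; x < message_modulus; ++x)
    lut[x] = f(static_cast<Torus>(x)); // evaluate once, reuse for all blocks
  return lut;
}

// e.g. a condition predicate: any nonzero message selects the "true" branch
// auto lut = tabulate<uint64_t>([](uint64_t x) { return x != 0; }, 4);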


@@ -1,8 +1,8 @@
#include "integer/comparison.cuh"
void scratch_cuda_integer_radix_comparison_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t glwe_dimension, uint32_t polynomial_size,
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_radix_blocks,
@@ -37,9 +37,10 @@ void scratch_cuda_integer_radix_comparison_kb_64(
}
void cuda_comparison_integer_radix_ciphertext_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void *lwe_array_1, void *lwe_array_2, int8_t *mem_ptr,
void **bsks, void **ksks, uint32_t num_radix_blocks) {
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void const *lwe_array_1, void const *lwe_array_2,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
uint32_t num_radix_blocks) {
int_comparison_buffer<uint64_t> *buffer =
(int_comparison_buffer<uint64_t> *)mem_ptr;
@@ -49,9 +50,9 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
host_integer_radix_equality_check_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_1),
static_cast<uint64_t *>(lwe_array_2), buffer, bsks, (uint64_t **)(ksks),
num_radix_blocks);
static_cast<const uint64_t *>(lwe_array_1),
static_cast<const uint64_t *>(lwe_array_2), buffer, bsks,
(uint64_t **)(ksks), num_radix_blocks);
break;
case GT:
case GE:
@@ -60,8 +61,8 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
host_integer_radix_difference_check_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_1),
static_cast<uint64_t *>(lwe_array_2), buffer,
static_cast<const uint64_t *>(lwe_array_1),
static_cast<const uint64_t *>(lwe_array_2), buffer,
buffer->diff_buffer->operator_f, bsks, (uint64_t **)(ksks),
num_radix_blocks);
break;
@@ -70,16 +71,17 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
host_integer_radix_maxmin_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_1),
static_cast<uint64_t *>(lwe_array_2), buffer, bsks, (uint64_t **)(ksks),
num_radix_blocks);
static_cast<const uint64_t *>(lwe_array_1),
static_cast<const uint64_t *>(lwe_array_2), buffer, bsks,
(uint64_t **)(ksks), num_radix_blocks);
break;
default:
PANIC("Cuda error: integer operation not supported")
}
}
void cleanup_cuda_integer_comparison(void **streams, uint32_t *gpu_indexes,
void cleanup_cuda_integer_comparison(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void) {
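
The equality path above composes two helpers: blocks are compared pairwise, then are_all_comparisons_block_true folds the per-block flags into a single result. On clear values the structure reduces to accumulating the flags and testing against the block count, roughly as follows (a hedged analogue, not the homomorphic code):

#include <cstdint>
#include <vector>

bool radix_equal(const std::vector<uint64_t> &a,
                 const std::vector<uint64_t> &b) {
  uint64_t true_blocks = 0;
  for (size_t i = 0; i < a.size(); ++i)
    true_blocks += (a[i] == b[i]); // per-block comparison flag
  // "all comparisons true" <=> the accumulated flags reach the block count
  return true_blocks == a.size();
}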


@@ -4,8 +4,8 @@
#include "crypto/keyswitch.cuh"
#include "device.h"
#include "integer.cuh"
#include "integer.h"
#include "integer/cmux.cuh"
#include "integer/integer_utilities.h"
#include "integer/negation.cuh"
#include "integer/scalar_addition.cuh"
#include "pbs/programmable_bootstrap_classic.cuh"
@@ -16,9 +16,9 @@
// lwe_dimension + 1 threads
// todo: This kernel MUST be refactored to a binary reduction
template <typename Torus>
__global__ void device_accumulate_all_blocks(Torus *output, Torus *input_block,
uint32_t lwe_dimension,
uint32_t num_blocks) {
__global__ void
device_accumulate_all_blocks(Torus *output, Torus const *input_block,
uint32_t lwe_dimension, uint32_t num_blocks) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < lwe_dimension + 1) {
auto block = &input_block[idx];
@@ -34,7 +34,7 @@ __global__ void device_accumulate_all_blocks(Torus *output, Torus *input_block,
template <typename Torus>
__host__ void accumulate_all_blocks(cudaStream_t stream, uint32_t gpu_index,
Torus *output, Torus *input,
Torus *output, Torus const *input,
uint32_t lwe_dimension,
uint32_t num_radix_blocks) {
@@ -57,10 +57,10 @@ __host__ void accumulate_all_blocks(cudaStream_t stream, uint32_t gpu_index,
*/
template <typename Torus>
__host__ void are_all_comparisons_block_true(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_in,
int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
uint32_t num_radix_blocks) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out, Torus *lwe_array_in,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks, uint32_t num_radix_blocks) {
auto params = mem_ptr->params;
auto big_lwe_dimension = params.big_lwe_dimension;
@@ -159,10 +159,10 @@ __host__ void are_all_comparisons_block_true(
*/
template <typename Torus>
__host__ void is_at_least_one_comparisons_block_true(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_in,
int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
uint32_t num_radix_blocks) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out, Torus *lwe_array_in,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks, uint32_t num_radix_blocks) {
auto params = mem_ptr->params;
auto big_lwe_dimension = params.big_lwe_dimension;
@@ -239,10 +239,11 @@ __host__ void is_at_least_one_comparisons_block_true(
// are_all_comparisons_block_true
template <typename Torus>
__host__ void host_compare_with_zero_equality(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_in,
int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
int32_t num_radix_blocks, int_radix_lut<Torus> *zero_comparison) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_in,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks, int32_t num_radix_blocks,
int_radix_lut<Torus> *zero_comparison) {
auto params = mem_ptr->params;
auto big_lwe_dimension = params.big_lwe_dimension;
@@ -301,10 +302,10 @@ __host__ void host_compare_with_zero_equality(
template <typename Torus>
__host__ void host_integer_radix_equality_check_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_1, Torus *lwe_array_2,
int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
uint32_t num_radix_blocks) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_1,
Torus const *lwe_array_2, int_comparison_buffer<Torus> *mem_ptr,
void *const *bsks, Torus *const *ksks, uint32_t num_radix_blocks) {
auto eq_buffer = mem_ptr->eq_buffer;
@@ -325,12 +326,11 @@ __host__ void host_integer_radix_equality_check_kb(
}
template <typename Torus>
__host__ void
compare_radix_blocks_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out,
Torus *lwe_array_left, Torus *lwe_array_right,
int_comparison_buffer<Torus> *mem_ptr, void **bsks,
Torus **ksks, uint32_t num_radix_blocks) {
__host__ void compare_radix_blocks_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_left,
Torus const *lwe_array_right, int_comparison_buffer<Torus> *mem_ptr,
void *const *bsks, Torus *const *ksks, uint32_t num_radix_blocks) {
auto params = mem_ptr->params;
auto big_lwe_dimension = params.big_lwe_dimension;
@@ -374,13 +374,12 @@ compare_radix_blocks_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
// (inferior, equal, superior) to one single shortint block containing the
// final sign
template <typename Torus>
__host__ void
tree_sign_reduction(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out,
Torus *lwe_block_comparisons,
int_tree_sign_reduction_buffer<Torus> *tree_buffer,
std::function<Torus(Torus)> sign_handler_f, void **bsks,
Torus **ksks, uint32_t num_radix_blocks) {
__host__ void tree_sign_reduction(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out, Torus *lwe_block_comparisons,
int_tree_sign_reduction_buffer<Torus> *tree_buffer,
std::function<Torus(Torus)> sign_handler_f, void *const *bsks,
Torus *const *ksks, uint32_t num_radix_blocks) {
auto params = tree_buffer->params;
auto big_lwe_dimension = params.big_lwe_dimension;
@@ -462,11 +461,11 @@ tree_sign_reduction(cudaStream_t *streams, uint32_t *gpu_indexes,
template <typename Torus>
__host__ void host_integer_radix_difference_check_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_left, Torus *lwe_array_right,
int_comparison_buffer<Torus> *mem_ptr,
std::function<Torus(Torus)> reduction_lut_f, void **bsks, Torus **ksks,
uint32_t num_radix_blocks) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_left,
Torus const *lwe_array_right, int_comparison_buffer<Torus> *mem_ptr,
std::function<Torus(Torus)> reduction_lut_f, void *const *bsks,
Torus *const *ksks, uint32_t num_radix_blocks) {
auto diff_buffer = mem_ptr->diff_buffer;
@@ -477,8 +476,8 @@ __host__ void host_integer_radix_difference_check_kb(
auto carry_modulus = params.carry_modulus;
uint32_t packed_num_radix_blocks = num_radix_blocks;
auto lhs = lwe_array_left;
auto rhs = lwe_array_right;
Torus *lhs = (Torus *)lwe_array_left;
Torus *rhs = (Torus *)lwe_array_right;
if (carry_modulus >= message_modulus) {
// Packing is possible
// Pack inputs
@@ -586,10 +585,10 @@ __host__ void host_integer_radix_difference_check_kb(
template <typename Torus>
__host__ void scratch_cuda_integer_radix_comparison_check_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
int_comparison_buffer<Torus> **mem_ptr, uint32_t num_radix_blocks,
int_radix_params params, COMPARISON_TYPE op, bool is_signed,
bool allocate_gpu_memory) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_comparison_buffer<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params, COMPARISON_TYPE op,
bool is_signed, bool allocate_gpu_memory) {
*mem_ptr = new int_comparison_buffer<Torus>(streams, gpu_indexes, gpu_count,
op, params, num_radix_blocks,
@@ -597,12 +596,11 @@ __host__ void scratch_cuda_integer_radix_comparison_check_kb(
}
template <typename Torus>
__host__ void
host_integer_radix_maxmin_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out,
Torus *lwe_array_left, Torus *lwe_array_right,
int_comparison_buffer<Torus> *mem_ptr, void **bsks,
Torus **ksks, uint32_t total_num_radix_blocks) {
__host__ void host_integer_radix_maxmin_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_left,
Torus const *lwe_array_right, int_comparison_buffer<Torus> *mem_ptr,
void *const *bsks, Torus *const *ksks, uint32_t total_num_radix_blocks) {
// Compute the sign
host_integer_radix_difference_check_kb<Torus>(
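
host_integer_radix_difference_check_kb packs pairs of blocks before comparing whenever `carry_modulus >= message_modulus`, halving the number of comparisons. The packing is the `m1 * shift + m2` form used by pack_bivariate_blocks elsewhere in this commit; on clear values it looks like the following (which neighbor plays the high part is an assumption here):

#include <cstdint>
#include <vector>

// Packs adjacent blocks pairwise: out = high * message_modulus + low.
// Only valid when the carry space can hold the shifted high part, i.e.
// carry_modulus >= message_modulus.
std::vector<uint64_t> pack_pairs(const std::vector<uint64_t> &blocks,
                                 uint64_t message_modulus) {
  std::vector<uint64_t> packed;
  for (size_t i = 0; i + 1 < blocks.size(); i += 2)
    packed.push_back(blocks[i + 1] * message_modulus + blocks[i]);
  return packed;
}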


@@ -1,11 +1,12 @@
#include "compression.cuh"
void scratch_cuda_integer_compress_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, uint32_t lwe_per_glwe, uint32_t storage_log_modulus,
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t compression_glwe_dimension,
uint32_t compression_polynomial_size, uint32_t lwe_dimension,
uint32_t ks_level, uint32_t ks_base_log, uint32_t num_radix_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
uint32_t lwe_per_glwe, uint32_t storage_log_modulus,
bool allocate_gpu_memory) {
int_radix_params compression_params(
@@ -21,12 +22,13 @@ void scratch_cuda_integer_compress_radix_ciphertext_64(
allocate_gpu_memory);
}
void scratch_cuda_integer_decompress_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t encryption_glwe_dimension, uint32_t encryption_polynomial_size,
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
uint32_t lwe_dimension, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, uint32_t storage_log_modulus, uint32_t body_count,
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t encryption_glwe_dimension,
uint32_t encryption_polynomial_size, uint32_t compression_glwe_dimension,
uint32_t compression_polynomial_size, uint32_t lwe_dimension,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t num_radix_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
uint32_t storage_log_modulus, uint32_t body_count,
bool allocate_gpu_memory) {
// Decompression doesn't keyswitch, so big and small dimensions are the same
@@ -47,32 +49,31 @@ void scratch_cuda_integer_decompress_radix_ciphertext_64(
allocate_gpu_memory);
}
void cuda_integer_compress_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *glwe_array_out, void *lwe_array_in, void **fp_ksk, uint32_t num_nths,
int8_t *mem_ptr) {
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *glwe_array_out, void const *lwe_array_in, void *const *fp_ksk,
uint32_t num_nths, int8_t *mem_ptr) {
host_integer_compress<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(glwe_array_out),
static_cast<uint64_t *>(lwe_array_in), (uint64_t **)(fp_ksk), num_nths,
(int_compression<uint64_t> *)mem_ptr);
static_cast<const uint64_t *>(lwe_array_in), (uint64_t *const *)(fp_ksk),
num_nths, (int_compression<uint64_t> *)mem_ptr);
}
void cuda_integer_decompress_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void *glwe_in, uint32_t *indexes_array,
uint32_t indexes_array_size, void **bsks, int8_t *mem_ptr) {
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void const *glwe_in, uint32_t const *indexes_array,
uint32_t indexes_array_size, void *const *bsks, int8_t *mem_ptr) {
host_integer_decompress<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array_out), static_cast<uint64_t *>(glwe_in),
indexes_array, indexes_array_size, bsks,
(int_decompression<uint64_t> *)mem_ptr);
static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(glwe_in), indexes_array, indexes_array_size,
bsks, (int_decompression<uint64_t> *)mem_ptr);
}
void cleanup_cuda_integer_compress_radix_ciphertext_64(void **streams,
uint32_t *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void) {
void cleanup_cuda_integer_compress_radix_ciphertext_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void) {
int_compression<uint64_t> *mem_ptr =
(int_compression<uint64_t> *)(*mem_ptr_void);
@@ -80,7 +81,7 @@ void cleanup_cuda_integer_compress_radix_ciphertext_64(void **streams,
}
void cleanup_cuda_integer_decompress_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void) {
int_decompression<uint64_t> *mem_ptr =
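
Compression stores each switched coefficient on `storage_log_modulus` bits instead of a full 64-bit word, and the `extract` kernel in the .cuh below undoes it. A CPU sketch of one plausible packing layout; the exact bit order used on device is an assumption, and `log_modulus` is assumed < 64:

#include <cstdint>
#include <vector>

// Packs values of `log_modulus` significant bits densely into 64-bit words.
std::vector<uint64_t> pack_bits(const std::vector<uint64_t> &vals,
                                uint32_t log_modulus) {
  const uint32_t nbits = 64; // matches nbits = sizeof(Torus) * 8 for u64
  std::vector<uint64_t> out((vals.size() * log_modulus + nbits - 1) / nbits);
  for (size_t i = 0; i < vals.size(); ++i) {
    uint64_t v = vals[i] & ((uint64_t(1) << log_modulus) - 1);
    size_t bit = i * log_modulus;
    out[bit / nbits] |= v << (bit % nbits);
    if (bit % nbits + log_modulus > nbits) // value straddles a word boundary
      out[bit / nbits + 1] |= v >> (nbits - bit % nbits);
  }
  return out;
}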


@@ -2,9 +2,10 @@
#define CUDA_INTEGER_COMPRESSION_CUH
#include "ciphertext.h"
#include "compression.h"
#include "crypto/keyswitch.cuh"
#include "device.h"
#include "integer/compression/compression.h"
#include "integer/compression/compression_utilities.h"
#include "integer/integer.cuh"
#include "linearalgebra/multiplication.cuh"
#include "polynomial/functions.cuh"
@@ -77,11 +78,12 @@ __host__ void host_pack(cudaStream_t stream, uint32_t gpu_index,
}
template <typename Torus>
__host__ void host_integer_compress(cudaStream_t *streams,
uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *glwe_array_out, Torus *lwe_array_in,
Torus **fp_ksk, uint32_t num_radix_blocks,
int_compression<Torus> *mem_ptr) {
__host__ void
host_integer_compress(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *glwe_array_out,
Torus const *lwe_array_in, Torus *const *fp_ksk,
uint32_t num_radix_blocks,
int_compression<Torus> *mem_ptr) {
auto compression_params = mem_ptr->compression_params;
auto input_lwe_dimension = compression_params.small_lwe_dimension;
@@ -138,9 +140,9 @@ __host__ void host_integer_compress(cudaStream_t *streams,
}
template <typename Torus>
__global__ void extract(Torus *glwe_array_out, Torus *array_in, uint32_t index,
uint32_t log_modulus, uint32_t input_len,
uint32_t initial_out_len) {
__global__ void extract(Torus *glwe_array_out, Torus const *array_in,
uint32_t index, uint32_t log_modulus,
uint32_t input_len, uint32_t initial_out_len) {
auto nbits = sizeof(Torus) * 8;
auto i = threadIdx.x + blockIdx.x * blockDim.x;
@@ -176,7 +178,7 @@ __global__ void extract(Torus *glwe_array_out, Torus *array_in, uint32_t index,
/// Extracts the glwe_index-nth GLWE ciphertext
template <typename Torus>
__host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,
Torus *glwe_array_out, Torus *array_in,
Torus *glwe_array_out, Torus const *array_in,
uint32_t glwe_index,
int_decompression<Torus> *mem_ptr) {
if (array_in == glwe_array_out)
@@ -219,15 +221,14 @@ __host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,
}
template <typename Torus>
__host__ void
host_integer_decompress(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, Torus *d_lwe_array_out,
Torus *d_packed_glwe_in, uint32_t *h_indexes_array,
uint32_t indexes_array_size, void **d_bsks,
int_decompression<Torus> *h_mem_ptr) {
__host__ void host_integer_decompress(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *d_lwe_array_out, Torus const *d_packed_glwe_in,
uint32_t const *h_indexes_array, uint32_t indexes_array_size,
void *const *d_bsks, int_decompression<Torus> *h_mem_ptr) {
auto d_indexes_array = h_mem_ptr->tmp_indexes_array;
cuda_memcpy_async_to_gpu(d_indexes_array, h_indexes_array,
cuda_memcpy_async_to_gpu(d_indexes_array, (void *)h_indexes_array,
indexes_array_size * sizeof(uint32_t), streams[0],
gpu_indexes[0]);
@@ -355,10 +356,11 @@ host_integer_decompress(cudaStream_t *streams, uint32_t *gpu_indexes,
template <typename Torus>
__host__ void scratch_cuda_compress_integer_radix_ciphertext(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
int_compression<Torus> **mem_ptr, uint32_t num_radix_blocks,
int_radix_params compression_params, uint32_t lwe_per_glwe,
uint32_t storage_log_modulus, bool allocate_gpu_memory) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_compression<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params compression_params,
uint32_t lwe_per_glwe, uint32_t storage_log_modulus,
bool allocate_gpu_memory) {
*mem_ptr = new int_compression<Torus>(
streams, gpu_indexes, gpu_count, compression_params, num_radix_blocks,
@@ -367,11 +369,11 @@ __host__ void scratch_cuda_compress_integer_radix_ciphertext(
template <typename Torus>
__host__ void scratch_cuda_integer_decompress_radix_ciphertext(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
int_decompression<Torus> **mem_ptr, uint32_t num_radix_blocks,
uint32_t body_count, int_radix_params encryption_params,
int_radix_params compression_params, uint32_t storage_log_modulus,
bool allocate_gpu_memory) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_decompression<Torus> **mem_ptr,
uint32_t num_radix_blocks, uint32_t body_count,
int_radix_params encryption_params, int_radix_params compression_params,
uint32_t storage_log_modulus, bool allocate_gpu_memory) {
*mem_ptr = new int_decompression<Torus>(
streams, gpu_indexes, gpu_count, encryption_params, compression_params,


@@ -1,8 +1,8 @@
#include "integer/div_rem.cuh"
void scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t glwe_dimension, uint32_t polynomial_size,
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
@@ -20,20 +20,23 @@ void scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
}
void cuda_integer_div_rem_radix_ciphertext_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *quotient,
void *remainder, void *numerator, void *divisor, int8_t *mem_ptr,
void **bsks, void **ksks, uint32_t num_blocks) {
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *quotient, void *remainder, void const *numerator, void const *divisor,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
uint32_t num_blocks) {
auto mem = (int_div_rem_memory<uint64_t> *)mem_ptr;
host_integer_div_rem_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(quotient), static_cast<uint64_t *>(remainder),
static_cast<uint64_t *>(numerator), static_cast<uint64_t *>(divisor),
bsks, (uint64_t **)(ksks), mem, num_blocks);
static_cast<const uint64_t *>(numerator),
static_cast<const uint64_t *>(divisor), bsks, (uint64_t **)(ksks), mem,
num_blocks);
}
void cleanup_cuda_integer_div_rem(void **streams, uint32_t *gpu_indexes,
void cleanup_cuda_integer_div_rem(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count, int8_t **mem_ptr_void) {
int_div_rem_memory<uint64_t> *mem_ptr =
(int_div_rem_memory<uint64_t> *)(*mem_ptr_void);


@@ -3,13 +3,13 @@
#include "crypto/keyswitch.cuh"
#include "device.h"
#include "integer.h"
#include "integer/comparison.cuh"
#include "integer/integer.cuh"
#include "integer/integer_utilities.h"
#include "integer/negation.cuh"
#include "integer/scalar_shifts.cuh"
#include "linear_algebra.h"
#include "programmable_bootstrap.h"
#include "pbs/programmable_bootstrap.h"
#include "utils/helper.cuh"
#include "utils/kernel_dimensions.cuh"
#include <fstream>
@@ -160,21 +160,23 @@ template <typename Torus> struct lwe_ciphertext_list {
template <typename Torus>
__host__ void scratch_cuda_integer_div_rem_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
int_div_rem_memory<Torus> **mem_ptr, uint32_t num_blocks,
int_radix_params params, bool allocate_gpu_memory) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_div_rem_memory<Torus> **mem_ptr,
uint32_t num_blocks, int_radix_params params, bool allocate_gpu_memory) {
*mem_ptr = new int_div_rem_memory<Torus>(
streams, gpu_indexes, gpu_count, params, num_blocks, allocate_gpu_memory);
}
template <typename Torus>
__host__ void
host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, Torus *quotient, Torus *remainder,
Torus *numerator, Torus *divisor, void **bsks,
uint64_t **ksks, int_div_rem_memory<uint64_t> *mem_ptr,
uint32_t num_blocks) {
__host__ void host_integer_div_rem_kb(cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *quotient,
Torus *remainder, Torus const *numerator,
Torus const *divisor, void *const *bsks,
uint64_t *const *ksks,
int_div_rem_memory<uint64_t> *mem_ptr,
uint32_t num_blocks) {
auto radix_params = mem_ptr->params;
@@ -222,8 +224,8 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
lwe_ciphertext_list<Torus> cleaned_merged_interesting_remainder(
mem_ptr->cleaned_merged_interesting_remainder, radix_params, num_blocks);
numerator_block_stack.clone_from(numerator, 0, num_blocks - 1, streams[0],
gpu_indexes[0]);
numerator_block_stack.clone_from((Torus *)numerator, 0, num_blocks - 1,
streams[0], gpu_indexes[0]);
remainder1.assign_zero(0, num_blocks - 1, streams[0], gpu_indexes[0]);
remainder2.assign_zero(0, num_blocks - 1, streams[0], gpu_indexes[0]);
@@ -245,9 +247,9 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
streams[0], gpu_indexes[0]);
interesting_remainder2.clone_from(remainder2, 0, last_non_trivial_block,
streams[0], gpu_indexes[0]);
interesting_divisor.clone_from(divisor, 0, last_non_trivial_block,
interesting_divisor.clone_from((Torus *)divisor, 0, last_non_trivial_block,
streams[0], gpu_indexes[0]);
divisor_ms_blocks.clone_from(divisor,
divisor_ms_blocks.clone_from((Torus *)divisor,
(msb_bit_set + 1) / num_bits_in_message,
num_blocks - 1, streams[0], gpu_indexes[0]);
@@ -256,65 +258,67 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
// msb_bit_set) the split versions share some bits they should not. So we do
// one PBS on the last block of the interesting_divisor, and first block of
// divisor_ms_blocks to trim out bits which should not be there
auto trim_last_interesting_divisor_bits =
[&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
if ((msb_bit_set + 1) % num_bits_in_message == 0) {
return;
}
// The last block of the interesting part of the remainder
// can contain bits which we should not account for
// we have to zero them out.
auto trim_last_interesting_divisor_bits = [&](cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count) {
if ((msb_bit_set + 1) % num_bits_in_message == 0) {
return;
}
// The last block of the interesting part of the remainder
// can contain bits which we should not account for
// we have to zero them out.
// Where the msb is set in the block
uint32_t pos_in_block = msb_bit_set % num_bits_in_message;
// Where the msb is set in the block
uint32_t pos_in_block = msb_bit_set % num_bits_in_message;
// e.g 2 bits in message:
// if pos_in_block is 0, then we want to keep only first bit (right
// shift
// mask by 1) if pos_in_block is 1, then we want to keep the two
// bits
// (right shift mask by 0)
uint32_t shift_amount = num_bits_in_message - (pos_in_block + 1);
// e.g 2 bits in message:
// if pos_in_block is 0, then we want to keep only first bit (right
// shift
// mask by 1) if pos_in_block is 1, then we want to keep the two
// bits
// (right shift mask by 0)
uint32_t shift_amount = num_bits_in_message - (pos_in_block + 1);
// Create mask of 1s on the message part, 0s in the carries
uint32_t full_message_mask = message_modulus - 1;
// Create mask of 1s on the message part, 0s in the carries
uint32_t full_message_mask = message_modulus - 1;
// Shift the mask so that we will only keep bits we should
uint32_t shifted_mask = full_message_mask >> shift_amount;
// Shift the mask so that we will only keep bits we should
uint32_t shifted_mask = full_message_mask >> shift_amount;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, interesting_divisor.last_block(),
interesting_divisor.last_block(), bsks, ksks, 1,
mem_ptr->masking_luts_1[shifted_mask]);
}; // trim_last_interesting_divisor_bits
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, interesting_divisor.last_block(),
interesting_divisor.last_block(), bsks, ksks, 1,
mem_ptr->masking_luts_1[shifted_mask]);
}; // trim_last_interesting_divisor_bits
auto trim_first_divisor_ms_bits =
[&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
if (divisor_ms_blocks.is_empty() ||
((msb_bit_set + 1) % num_bits_in_message) == 0) {
return;
}
// Where the msb is set in the block
uint32_t pos_in_block = msb_bit_set % num_bits_in_message;
auto trim_first_divisor_ms_bits = [&](cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count) {
if (divisor_ms_blocks.is_empty() ||
((msb_bit_set + 1) % num_bits_in_message) == 0) {
return;
}
// Where the msb is set in the block
uint32_t pos_in_block = msb_bit_set % num_bits_in_message;
// e.g 2 bits in message:
// if pos_in_block is 0, then we want to discard the first bit (left
// shift mask by 1) if pos_in_block is 1, then we want to discard the
// two bits (left shift mask by 2) let shift_amount =
// num_bits_in_message - pos_in_block
uint32_t shift_amount = pos_in_block + 1;
uint32_t full_message_mask = message_modulus - 1;
uint32_t shifted_mask = full_message_mask << shift_amount;
// e.g 2 bits in message:
// if pos_in_block is 0, then we want to discard the first bit (left
// shift mask by 1) if pos_in_block is 1, then we want to discard the
// two bits (left shift mask by 2) let shift_amount =
// num_bits_in_message - pos_in_block
uint32_t shift_amount = pos_in_block + 1;
uint32_t full_message_mask = message_modulus - 1;
uint32_t shifted_mask = full_message_mask << shift_amount;
// Keep the mask within the range of message bits, so that
// the estimated degree of the output is < msg_modulus
shifted_mask = shifted_mask & full_message_mask;
// Keep the mask within the range of message bits, so that
// the estimated degree of the output is < msg_modulus
shifted_mask = shifted_mask & full_message_mask;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, divisor_ms_blocks.first_block(),
divisor_ms_blocks.first_block(), bsks, ksks, 1,
mem_ptr->masking_luts_2[shifted_mask]);
}; // trim_first_divisor_ms_bits
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, divisor_ms_blocks.first_block(),
divisor_ms_blocks.first_block(), bsks, ksks, 1,
mem_ptr->masking_luts_2[shifted_mask]);
}; // trim_first_divisor_ms_bits
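
The two trimming lambdas above only compute small plaintext masks; a LUT application then does the actual masking. Worked out for 2-bit messages (message_modulus = 4), following the comments in the code:

#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t num_bits_in_message = 2;
  const uint32_t message_modulus = 4;
  for (uint32_t pos_in_block = 0; pos_in_block < num_bits_in_message;
       ++pos_in_block) {
    uint32_t full = message_modulus - 1; // 0b11, ones on the message part
    // keep-mask: keeps bits 0..pos_in_block (masking_luts_1)
    uint32_t keep = full >> (num_bits_in_message - (pos_in_block + 1));
    // discard-mask: drops bits 0..pos_in_block, clipped back into the
    // message range (masking_luts_2)
    uint32_t drop = (full << (pos_in_block + 1)) & full;
    printf("pos=%u keep=0x%x drop=0x%x\n", pos_in_block, keep, drop);
  }
  return 0;
}

// prints: pos=0 keep=0x1 drop=0x2
//         pos=1 keep=0x3 drop=0x0
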
// This does
// R := R << 1; R(0) := N(i)
@@ -325,48 +329,50 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
// However, to keep the remainder clean (noise wise), what we do is that we
// put the remainder block from which we need to extract the bit, as the LSB
// of the Remainder, so that left shifting will pull the bit we need.
auto left_shift_interesting_remainder1 =
[&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
numerator_block_1.clone_from(
numerator_block_stack, numerator_block_stack.len - 1,
numerator_block_stack.len - 1, streams[0], gpu_indexes[0]);
numerator_block_stack.pop();
interesting_remainder1.insert(0, numerator_block_1.first_block(),
streams[0], gpu_indexes[0]);
auto left_shift_interesting_remainder1 = [&](cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count) {
numerator_block_1.clone_from(
numerator_block_stack, numerator_block_stack.len - 1,
numerator_block_stack.len - 1, streams[0], gpu_indexes[0]);
numerator_block_stack.pop();
interesting_remainder1.insert(0, numerator_block_1.first_block(),
streams[0], gpu_indexes[0]);
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, interesting_remainder1.data, 1,
mem_ptr->shift_mem_1, bsks, ksks, interesting_remainder1.len);
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, interesting_remainder1.data, 1,
mem_ptr->shift_mem_1, bsks, ksks, interesting_remainder1.len);
tmp_radix.clone_from(interesting_remainder1, 0,
interesting_remainder1.len - 1, streams[0],
gpu_indexes[0]);
tmp_radix.clone_from(interesting_remainder1, 0,
interesting_remainder1.len - 1, streams[0],
gpu_indexes[0]);
host_radix_blocks_rotate_left<Torus>(
streams, gpu_indexes, gpu_count, interesting_remainder1.data,
tmp_radix.data, 1, interesting_remainder1.len, big_lwe_size);
host_radix_blocks_rotate_left<Torus>(
streams, gpu_indexes, gpu_count, interesting_remainder1.data,
tmp_radix.data, 1, interesting_remainder1.len, big_lwe_size);
numerator_block_1.clone_from(
interesting_remainder1, interesting_remainder1.len - 1,
interesting_remainder1.len - 1, streams[0], gpu_indexes[0]);
numerator_block_1.clone_from(
interesting_remainder1, interesting_remainder1.len - 1,
interesting_remainder1.len - 1, streams[0], gpu_indexes[0]);
interesting_remainder1.pop();
interesting_remainder1.pop();
if (pos_in_block != 0) {
// We have not yet extracted all the bits from this numerator
// so, we put it back on the front so that it gets taken next
// iteration
numerator_block_stack.push(numerator_block_1.first_block(),
streams[0], gpu_indexes[0]);
}
}; // left_shift_interesting_remainder1
if (pos_in_block != 0) {
// We have not yet extracted all the bits from this numerator
// so, we put it back on the front so that it gets taken next
// iteration
numerator_block_stack.push(numerator_block_1.first_block(), streams[0],
gpu_indexes[0]);
}
}; // left_shift_interesting_remainder1
auto left_shift_interesting_remainder2 =
[&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, interesting_remainder2.data, 1,
mem_ptr->shift_mem_2, bsks, ksks, interesting_remainder2.len);
}; // left_shift_interesting_remainder2
auto left_shift_interesting_remainder2 = [&](cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count) {
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, interesting_remainder2.data, 1,
mem_ptr->shift_mem_2, bsks, ksks, interesting_remainder2.len);
}; // left_shift_interesting_remainder2
for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
@@ -416,7 +422,8 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
// fills:
// `new_remainder` - radix ciphertext
// `subtraction_overflowed` - single ciphertext
auto do_overflowing_sub = [&](cudaStream_t *streams, uint32_t *gpu_indexes,
auto do_overflowing_sub = [&](cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count) {
host_integer_overflowing_sub_kb<Torus>(
streams, gpu_indexes, gpu_count, new_remainder.data,
@@ -427,8 +434,8 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
// fills:
// `at_least_one_upper_block_is_non_zero` - single ciphertext
auto check_divisor_upper_blocks = [&](cudaStream_t *streams,
uint32_t *gpu_indexes,
auto check_divisor_upper_blocks = [&](cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count) {
auto &trivial_blocks = divisor_ms_blocks;
if (trivial_blocks.is_empty()) {
@@ -459,7 +466,8 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
// fills:
// `cleaned_merged_interesting_remainder` - radix ciphertext
auto create_clean_version_of_merged_remainder =
[&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
[&](cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count,
cleaned_merged_interesting_remainder.data,
@@ -498,7 +506,8 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
streams[0], gpu_indexes[0]);
auto conditionally_zero_out_merged_interesting_remainder =
[&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
[&](cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count,
cleaned_merged_interesting_remainder.data,
@@ -510,7 +519,8 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
};
auto conditionally_zero_out_merged_new_remainder =
[&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
[&](cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, new_remainder.data,
new_remainder.data, overflow_sum_radix.data, bsks, ksks,
@@ -518,7 +528,8 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
mem_ptr->zero_out_if_overflow_happened[factor_lut_id], factor);
};
auto set_quotient_bit = [&](cudaStream_t *streams, uint32_t *gpu_indexes,
auto set_quotient_bit = [&](cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count) {
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, did_not_overflow.data,


@@ -1,10 +1,11 @@
#include "integer/integer.cuh"
#include <linear_algebra.h>
void cuda_full_propagation_64_inplace(void **streams, uint32_t *gpu_indexes,
void cuda_full_propagation_64_inplace(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count, void *input_blocks,
int8_t *mem_ptr, void **ksks, void **bsks,
uint32_t num_blocks) {
int8_t *mem_ptr, void *const *ksks,
void *const *bsks, uint32_t num_blocks) {
int_fullprop_buffer<uint64_t> *buffer =
(int_fullprop_buffer<uint64_t> *)mem_ptr;
@@ -16,11 +17,12 @@ void cuda_full_propagation_64_inplace(void **streams, uint32_t *gpu_indexes,
}
void scratch_cuda_full_propagation_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory) {
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
glwe_dimension * polynomial_size, lwe_dimension,
ks_level, ks_base_log, pbs_level, pbs_base_log,
@@ -31,7 +33,8 @@ void scratch_cuda_full_propagation_64(
(int_fullprop_buffer<uint64_t> **)mem_ptr, params, allocate_gpu_memory);
}
void cleanup_cuda_full_propagation(void **streams, uint32_t *gpu_indexes,
void cleanup_cuda_full_propagation(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count, int8_t **mem_ptr_void) {
int_fullprop_buffer<uint64_t> *mem_ptr =
@@ -41,8 +44,8 @@ void cleanup_cuda_full_propagation(void **streams, uint32_t *gpu_indexes,
}
void scratch_cuda_propagate_single_carry_kb_64_inplace(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t glwe_dimension, uint32_t polynomial_size,
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
@@ -60,9 +63,9 @@ void scratch_cuda_propagate_single_carry_kb_64_inplace(
}
void cuda_propagate_single_carry_kb_64_inplace(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
void *carry_out, int8_t *mem_ptr, void **bsks, void **ksks,
uint32_t num_blocks) {
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array, void *carry_out, int8_t *mem_ptr, void *const *bsks,
void *const *ksks, uint32_t num_blocks) {
host_propagate_single_carry<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array), static_cast<uint64_t *>(carry_out),
@@ -71,9 +74,9 @@ void cuda_propagate_single_carry_kb_64_inplace(
}
void cuda_propagate_single_carry_get_input_carries_kb_64_inplace(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
void *carry_out, void *input_carries, int8_t *mem_ptr, void **bsks,
void **ksks, uint32_t num_blocks) {
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array, void *carry_out, void *input_carries, int8_t *mem_ptr,
void *const *bsks, void *const *ksks, uint32_t num_blocks) {
host_propagate_single_carry<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array), static_cast<uint64_t *>(carry_out),
@@ -82,7 +85,8 @@ void cuda_propagate_single_carry_get_input_carries_kb_64_inplace(
num_blocks);
}
void cleanup_cuda_propagate_single_carry(void **streams, uint32_t *gpu_indexes,
void cleanup_cuda_propagate_single_carry(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void) {
int_sc_prop_memory<uint64_t> *mem_ptr =
@@ -91,12 +95,13 @@ void cleanup_cuda_propagate_single_carry(void **streams, uint32_t *gpu_indexes,
}
void scratch_cuda_apply_univariate_lut_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
void *input_lut, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, bool allocate_gpu_memory) {
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_radix_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
glwe_dimension * polynomial_size, lwe_dimension,
@@ -105,26 +110,28 @@ void scratch_cuda_apply_univariate_lut_kb_64(
scratch_cuda_apply_univariate_lut_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_radix_lut<uint64_t> **)mem_ptr, static_cast<uint64_t *>(input_lut),
num_radix_blocks, params, allocate_gpu_memory);
(int_radix_lut<uint64_t> **)mem_ptr,
static_cast<const uint64_t *>(input_lut), num_radix_blocks, params,
allocate_gpu_memory);
}
void cuda_apply_univariate_lut_kb_64(void **streams, uint32_t *gpu_indexes,
void cuda_apply_univariate_lut_kb_64(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count, void *output_radix_lwe,
void *input_radix_lwe, int8_t *mem_ptr,
void **ksks, void **bsks,
uint32_t num_blocks) {
void const *input_radix_lwe,
int8_t *mem_ptr, void *const *ksks,
void *const *bsks, uint32_t num_blocks) {
host_apply_univariate_lut_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(output_radix_lwe),
static_cast<uint64_t *>(input_radix_lwe),
static_cast<const uint64_t *>(input_radix_lwe),
(int_radix_lut<uint64_t> *)mem_ptr, (uint64_t **)(ksks), bsks,
num_blocks);
}
void cleanup_cuda_apply_univariate_lut_kb_64(void **streams,
uint32_t *gpu_indexes,
void cleanup_cuda_apply_univariate_lut_kb_64(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void) {
int_radix_lut<uint64_t> *mem_ptr = (int_radix_lut<uint64_t> *)(*mem_ptr_void);
@@ -132,25 +139,27 @@ void cleanup_cuda_apply_univariate_lut_kb_64(void **streams,
}
void cuda_apply_many_univariate_lut_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *output_radix_lwe, void *input_radix_lwe, int8_t *mem_ptr, void **ksks,
void **bsks, uint32_t num_blocks, uint32_t lut_count, uint32_t lut_stride) {
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *output_radix_lwe, void const *input_radix_lwe, int8_t *mem_ptr,
void *const *ksks, void *const *bsks, uint32_t num_blocks,
uint32_t lut_count, uint32_t lut_stride) {
host_apply_many_univariate_lut_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(output_radix_lwe),
static_cast<uint64_t *>(input_radix_lwe),
static_cast<const uint64_t *>(input_radix_lwe),
(int_radix_lut<uint64_t> *)mem_ptr, (uint64_t **)(ksks), bsks, num_blocks,
lut_count, lut_stride);
}
void scratch_cuda_apply_bivariate_lut_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
void *input_lut, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, bool allocate_gpu_memory) {
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, void *input_lut, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_radix_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
glwe_dimension * polynomial_size, lwe_dimension,
@@ -163,24 +172,23 @@ void scratch_cuda_apply_bivariate_lut_kb_64(
num_radix_blocks, params, allocate_gpu_memory);
}
void cuda_apply_bivariate_lut_kb_64(void **streams, uint32_t *gpu_indexes,
uint32_t gpu_count, void *output_radix_lwe,
void *input_radix_lwe_1,
void *input_radix_lwe_2, int8_t *mem_ptr,
void **ksks, void **bsks,
uint32_t num_blocks, uint32_t shift) {
void cuda_apply_bivariate_lut_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *output_radix_lwe, void const *input_radix_lwe_1,
void const *input_radix_lwe_2, int8_t *mem_ptr, void *const *ksks,
void *const *bsks, uint32_t num_blocks, uint32_t shift) {
host_apply_bivariate_lut_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(output_radix_lwe),
static_cast<uint64_t *>(input_radix_lwe_1),
static_cast<uint64_t *>(input_radix_lwe_2),
static_cast<const uint64_t *>(input_radix_lwe_1),
static_cast<const uint64_t *>(input_radix_lwe_2),
(int_radix_lut<uint64_t> *)mem_ptr, (uint64_t **)(ksks), bsks, num_blocks,
shift);
}
void cleanup_cuda_apply_bivariate_lut_kb_64(void **streams,
uint32_t *gpu_indexes,
void cleanup_cuda_apply_bivariate_lut_kb_64(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void) {
int_radix_lut<uint64_t> *mem_ptr = (int_radix_lut<uint64_t> *)(*mem_ptr_void);
@@ -188,12 +196,13 @@ void cleanup_cuda_apply_bivariate_lut_kb_64(void **streams,
}
void scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
void *input_lut, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, bool allocate_gpu_memory) {
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_radix_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
glwe_dimension * polynomial_size, lwe_dimension,
@@ -202,14 +211,15 @@ void scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(
scratch_cuda_apply_bivariate_lut_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_radix_lut<uint64_t> **)mem_ptr, static_cast<uint64_t *>(input_lut),
num_radix_blocks, params, allocate_gpu_memory);
(int_radix_lut<uint64_t> **)mem_ptr,
static_cast<const uint64_t *>(input_lut), num_radix_blocks, params,
allocate_gpu_memory);
}
void cuda_integer_compute_prefix_sum_hillis_steele_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *output_radix_lwe, void *generates_or_propagates, int8_t *mem_ptr,
void **ksks, void **bsks, uint32_t num_blocks, uint32_t shift) {
void *const *ksks, void *const *bsks, uint32_t num_blocks, uint32_t shift) {
int_radix_params params = ((int_radix_lut<uint64_t> *)mem_ptr)->params;
@@ -222,14 +232,14 @@ void cuda_integer_compute_prefix_sum_hillis_steele_64(
}
void cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void) {
int_radix_lut<uint64_t> *mem_ptr = (int_radix_lut<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
}
void cuda_integer_reverse_blocks_64_inplace(void **streams,
uint32_t *gpu_indexes,
void cuda_integer_reverse_blocks_64_inplace(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count, void *lwe_array,
uint32_t num_blocks,
uint32_t lwe_size) {
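
The prefix-sum entry point above is named for its scan strategy: Hillis-Steele computes an inclusive scan in ceil(log2 n) passes, combining element i with element i - stride at each pass. With plain addition standing in for the bivariate carry-propagation LUT, the pattern is:

#include <cstdint>
#include <vector>

std::vector<uint64_t> hillis_steele_scan(std::vector<uint64_t> a) {
  for (size_t stride = 1; stride < a.size(); stride *= 2) {
    std::vector<uint64_t> next = a; // double buffer, as a parallel pass needs
    for (size_t i = stride; i < a.size(); ++i)
      next[i] = a[i] + a[i - stride]; // LUT(a[i], a[i - stride]) in FHE
    a = std::move(next);
  }
  return a; // inclusive prefix sums
}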


@@ -4,12 +4,12 @@
#include "crypto/keyswitch.cuh"
#include "device.h"
#include "helper_multi_gpu.h"
#include "integer.h"
#include "integer/integer_utilities.h"
#include "integer/scalar_addition.cuh"
#include "linear_algebra.h"
#include "linearalgebra/addition.cuh"
#include "pbs/programmable_bootstrap.h"
#include "polynomial/functions.cuh"
#include "programmable_bootstrap.h"
#include "utils/helper.cuh"
#include "utils/helper_multi_gpu.cuh"
#include "utils/kernel_dimensions.cuh"
@@ -69,10 +69,10 @@ __global__ void radix_blocks_rotate_left(Torus *dst, Torus *src, uint32_t value,
// one block is responsible to process single lwe ciphertext
template <typename Torus>
__host__ void
host_radix_blocks_rotate_right(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, Torus *dst, Torus *src,
uint32_t value, uint32_t blocks_count,
uint32_t lwe_size) {
host_radix_blocks_rotate_right(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
Torus *dst, Torus *src, uint32_t value,
uint32_t blocks_count, uint32_t lwe_size) {
if (src == dst) {
PANIC("Cuda error (blocks_rotate_right): the source and destination "
"pointers should be different");
@@ -86,10 +86,10 @@ host_radix_blocks_rotate_right(cudaStream_t *streams, uint32_t *gpu_indexes,
// calculation is not inplace, so `dst` and `src` must not be the same
template <typename Torus>
__host__ void
host_radix_blocks_rotate_left(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, Torus *dst, Torus *src,
uint32_t value, uint32_t blocks_count,
uint32_t lwe_size) {
host_radix_blocks_rotate_left(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
Torus *dst, Torus *src, uint32_t value,
uint32_t blocks_count, uint32_t lwe_size) {
if (src == dst) {
PANIC("Cuda error (blocks_rotate_left): the source and destination "
"pointers should be different");
@@ -119,9 +119,9 @@ __global__ void radix_blocks_reverse_lwe_inplace(Torus *src,
template <typename Torus>
__host__ void
host_radix_blocks_reverse_inplace(cudaStream_t *streams, uint32_t *gpu_indexes,
Torus *src, uint32_t blocks_count,
uint32_t lwe_size) {
host_radix_blocks_reverse_inplace(cudaStream_t const *streams,
uint32_t const *gpu_indexes, Torus *src,
uint32_t blocks_count, uint32_t lwe_size) {
cudaSetDevice(gpu_indexes[0]);
int num_blocks = blocks_count / 2, num_threads = 1024;
radix_blocks_reverse_lwe_inplace<Torus>
@@ -131,10 +131,11 @@ host_radix_blocks_reverse_inplace(cudaStream_t *streams, uint32_t *gpu_indexes,
// polynomial_size threads
template <typename Torus>
__global__ void
device_pack_bivariate_blocks(Torus *lwe_array_out, Torus *lwe_indexes_out,
Torus *lwe_array_1, Torus *lwe_array_2,
Torus *lwe_indexes_in, uint32_t lwe_dimension,
uint32_t shift, uint32_t num_blocks) {
device_pack_bivariate_blocks(Torus *lwe_array_out, Torus const *lwe_indexes_out,
Torus const *lwe_array_1, Torus const *lwe_array_2,
Torus const *lwe_indexes_in,
uint32_t lwe_dimension, uint32_t shift,
uint32_t num_blocks) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < num_blocks * (lwe_dimension + 1)) {
@@ -151,13 +152,13 @@ device_pack_bivariate_blocks(Torus *lwe_array_out, Torus *lwe_indexes_out,
* becomes out = m1 * shift + m2
*/
template <typename Torus>
__host__ void pack_bivariate_blocks(cudaStream_t *streams,
uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out,
Torus *lwe_indexes_out, Torus *lwe_array_1,
Torus *lwe_array_2, Torus *lwe_indexes_in,
uint32_t lwe_dimension, uint32_t shift,
uint32_t num_radix_blocks) {
__host__ void
pack_bivariate_blocks(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out,
Torus const *lwe_indexes_out, Torus const *lwe_array_1,
Torus const *lwe_array_2, Torus const *lwe_indexes_in,
uint32_t lwe_dimension, uint32_t shift,
uint32_t num_radix_blocks) {
cudaSetDevice(gpu_indexes[0]);
// Left message is shifted
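The packing is purely linear, so its effect on cleartexts is easy to state: the left message is scaled by `shift` and the right one is added, collapsing the pair into a single value a bivariate LUT can decode. A cleartext sketch, assuming both messages are below `shift`:

#include <cstdint>

// Cleartext model of device_pack_bivariate_blocks: out = m1 * shift + m2.
// Requires m1, m2 < shift for the packing to be unambiguous.
uint64_t pack_bivariate(uint64_t m1, uint64_t m2, uint64_t shift) {
  return m1 * shift + m2;
}

// The LUT recovers the operands as m1 = packed / shift, m2 = packed % shift.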
@@ -173,9 +174,10 @@ __host__ void pack_bivariate_blocks(cudaStream_t *streams,
template <typename Torus>
__host__ void integer_radix_apply_univariate_lookup_table_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_in, void **bsks, Torus **ksks,
uint32_t num_radix_blocks, int_radix_lut<Torus> *lut) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_in,
void *const *bsks, Torus *const *ksks, uint32_t num_radix_blocks,
int_radix_lut<Torus> *lut) {
// apply_lookup_table
auto params = lut->params;
auto pbs_type = params.pbs_type;
@@ -202,10 +204,10 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
if (active_gpu_count == 1) {
execute_keyswitch_async<Torus>(streams, gpu_indexes, 1, lwe_after_ks_vec[0],
lwe_trivial_indexes_vec[0], lwe_array_in,
lut->lwe_indexes_in, ksks, big_lwe_dimension,
small_lwe_dimension, ks_base_log, ks_level,
num_radix_blocks);
lwe_trivial_indexes_vec[0],
(Torus *)lwe_array_in, lut->lwe_indexes_in,
ksks, big_lwe_dimension, small_lwe_dimension,
ks_base_log, ks_level, num_radix_blocks);
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
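Seen on cleartexts, the whole keyswitch + PBS sequence amounts to evaluating one function on every radix block; the encrypted pipeline additionally refreshes noise and moves between LWE dimensions. A minimal cleartext model:

#include <cstdint>
#include <functional>
#include <vector>

// What integer_radix_apply_univariate_lookup_table_kb computes, modulo
// encryption: every block goes through the same lookup table.
std::vector<uint64_t>
apply_univariate_lut(const std::vector<uint64_t> &blocks,
                     const std::function<uint64_t(uint64_t)> &lut_f) {
  std::vector<uint64_t> out(blocks.size());
  for (size_t i = 0; i < blocks.size(); ++i)
    out[i] = lut_f(blocks[i]);
  return out;
}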
@@ -259,10 +261,10 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
template <typename Torus>
__host__ void integer_radix_apply_many_univariate_lookup_table_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_in, void **bsks, Torus **ksks,
uint32_t num_radix_blocks, int_radix_lut<Torus> *lut, uint32_t lut_count,
uint32_t lut_stride) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_in,
void *const *bsks, Torus *const *ksks, uint32_t num_radix_blocks,
int_radix_lut<Torus> *lut, uint32_t lut_count, uint32_t lut_stride) {
// apply_lookup_table
auto params = lut->params;
auto pbs_type = params.pbs_type;
@@ -286,10 +288,10 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb(
auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
if (active_gpu_count == 1) {
execute_keyswitch_async<Torus>(streams, gpu_indexes, 1, lwe_after_ks_vec[0],
lwe_trivial_indexes_vec[0], lwe_array_in,
lut->lwe_indexes_in, ksks, big_lwe_dimension,
small_lwe_dimension, ks_base_log, ks_level,
num_radix_blocks);
lwe_trivial_indexes_vec[0],
(Torus *)lwe_array_in, lut->lwe_indexes_in,
ksks, big_lwe_dimension, small_lwe_dimension,
ks_base_log, ks_level, num_radix_blocks);
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
@@ -343,10 +345,10 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb(
template <typename Torus>
__host__ void integer_radix_apply_bivariate_lookup_table_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_1, Torus *lwe_array_2, void **bsks,
Torus **ksks, uint32_t num_radix_blocks, int_radix_lut<Torus> *lut,
uint32_t shift) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_1,
Torus const *lwe_array_2, void *const *bsks, Torus *const *ksks,
uint32_t num_radix_blocks, int_radix_lut<Torus> *lut, uint32_t shift) {
auto params = lut->params;
auto pbs_type = params.pbs_type;
@@ -612,9 +614,10 @@ void generate_device_accumulator(cudaStream_t stream, uint32_t gpu_index,
template <typename Torus>
void scratch_cuda_propagate_single_carry_kb_inplace(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
int_sc_prop_memory<Torus> **mem_ptr, uint32_t num_radix_blocks,
int_radix_params params, bool allocate_gpu_memory) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_sc_prop_memory<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params,
bool allocate_gpu_memory) {
*mem_ptr =
new int_sc_prop_memory<Torus>(streams, gpu_indexes, gpu_count, params,
@@ -623,10 +626,10 @@ void scratch_cuda_propagate_single_carry_kb_inplace(
template <typename Torus>
void host_compute_prefix_sum_hillis_steele(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *step_output, Torus *generates_or_propagates, int_radix_params params,
int_radix_lut<Torus> *luts, void **bsks, Torus **ksks,
uint32_t num_blocks) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *step_output, Torus *generates_or_propagates,
int_radix_params params, int_radix_lut<Torus> *luts, void *const *bsks,
Torus *const *ksks, uint32_t num_blocks) {
auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
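Hillis-Steele is the classic step-doubling inclusive scan: log2(n) rounds, each combining element i with element i - step. A CPU sketch with a generic associative operator (in the GPU version the operator is realized as a bivariate LUT over encrypted flags):

#include <cstddef>
#include <functional>
#include <utility>
#include <vector>

// Hillis-Steele inclusive scan: after the round with stride `step`,
// element i holds the combination of all inputs in (i - 2*step, i].
// O(n log n) work but only O(log n) dependent rounds, which is the
// property the GPU implementation exploits.
template <typename T>
void hillis_steele_scan(std::vector<T> &a,
                        const std::function<T(T, T)> &combine) {
  for (size_t step = 1; step < a.size(); step *= 2) {
    std::vector<T> next = a;
    for (size_t i = step; i < a.size(); ++i)
      next[i] = combine(a[i - step], a[i]);
    a = std::move(next);
  }
}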
@@ -659,11 +662,13 @@ void host_compute_prefix_sum_hillis_steele(
}
template <typename Torus>
void host_propagate_single_carry(cudaStream_t *streams, uint32_t *gpu_indexes,
void host_propagate_single_carry(cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array,
Torus *carry_out, Torus *input_carries,
int_sc_prop_memory<Torus> *mem, void **bsks,
Torus **ksks, uint32_t num_blocks) {
int_sc_prop_memory<Torus> *mem,
void *const *bsks, Torus *const *ksks,
uint32_t num_blocks) {
auto params = mem->params;
if (params.message_modulus == 2)
PANIC("Cuda error: single carry propagation is not supported for 1 bit "
@@ -700,7 +705,7 @@ void host_propagate_single_carry(cudaStream_t *streams, uint32_t *gpu_indexes,
gpu_indexes[0]);
if (input_carries != nullptr) {
cuda_memcpy_async_gpu_to_gpu(input_carries, step_output,
cuda_memcpy_async_gpu_to_gpu((void *)input_carries, step_output,
big_lwe_size_bytes * num_blocks, streams[0],
gpu_indexes[0]);
}
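The flags fed to that scan follow the usual carry-lookahead convention. A cleartext sketch of how a block's status is derived and combined; the actual flags here are produced by LUTs on encrypted data, and their exact encoding is internal to the library:

#include <cstdint>

enum CarryStatus { NONE, GENERATE, PROPAGATE };

// A block generates a carry if its sum overflows the message modulus,
// and propagates an incoming carry if it sits exactly at modulus - 1.
CarryStatus block_status(uint64_t block_sum, uint64_t message_modulus) {
  if (block_sum >= message_modulus) return GENERATE;
  if (block_sum == message_modulus - 1) return PROPAGATE;
  return NONE;
}

// Carry-lookahead combine: a PROPAGATE defers to the lower block's
// status. The operator is associative, so a prefix scan can evaluate it.
CarryStatus combine(CarryStatus lower, CarryStatus upper) {
  return upper == PROPAGATE ? lower : upper;
}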
@@ -716,10 +721,10 @@ void host_propagate_single_carry(cudaStream_t *streams, uint32_t *gpu_indexes,
template <typename Torus>
void host_generate_last_block_inner_propagation(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *last_block_inner_propagation, Torus *lhs, Torus *rhs,
int_last_block_inner_propagate_memory<Torus> *mem, void **bsks,
Torus **ksks) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *last_block_inner_propagation, Torus const *lhs,
Torus const *rhs, int_last_block_inner_propagate_memory<Torus> *mem,
void *const *bsks, Torus *const *ksks) {
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, last_block_inner_propagation, lhs, rhs,
@@ -728,11 +733,12 @@ void host_generate_last_block_inner_propagation(
}
template <typename Torus>
void host_propagate_single_sub_borrow(cudaStream_t *streams,
uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *overflowed, Torus *lwe_array,
void host_propagate_single_sub_borrow(cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *overflowed,
Torus *lwe_array,
int_overflowing_sub_memory<Torus> *mem,
void **bsks, Torus **ksks,
void *const *bsks, Torus *const *ksks,
uint32_t num_blocks) {
auto params = mem->params;
auto glwe_dimension = params.glwe_dimension;
@@ -784,10 +790,11 @@ void host_propagate_single_sub_borrow(cudaStream_t *streams,
* have size = 2 * (glwe_dimension * polynomial_size + 1) * sizeof(Torus)
*/
template <typename Torus>
void host_full_propagate_inplace(cudaStream_t *streams, uint32_t *gpu_indexes,
void host_full_propagate_inplace(cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *input_blocks,
int_fullprop_buffer<Torus> *mem_ptr,
Torus **ksks, void **bsks,
Torus *const *ksks, void *const *bsks,
uint32_t num_blocks) {
auto params = mem_ptr->lut->params;
@@ -821,14 +828,14 @@ void host_full_propagate_inplace(cudaStream_t *streams, uint32_t *gpu_indexes,
params.polynomial_size, params.pbs_base_log, params.pbs_level,
params.grouping_factor, 2, params.pbs_type, lut_count, lut_stride);
cuda_memcpy_async_gpu_to_gpu(cur_input_block, mem_ptr->tmp_big_lwe_vector,
big_lwe_size * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_memcpy_async_gpu_to_gpu(
(void *)cur_input_block, mem_ptr->tmp_big_lwe_vector,
big_lwe_size * sizeof(Torus), streams[0], gpu_indexes[0]);
if (i < num_blocks - 1) {
auto next_input_block = &input_blocks[(i + 1) * big_lwe_size];
host_addition<Torus>(streams[0], gpu_indexes[0], next_input_block,
next_input_block,
(Torus const *)next_input_block,
&mem_ptr->tmp_big_lwe_vector[big_lwe_size],
params.big_lwe_dimension, 1);
}
@@ -836,7 +843,8 @@ void host_full_propagate_inplace(cudaStream_t *streams, uint32_t *gpu_indexes,
}
template <typename Torus>
void scratch_cuda_full_propagation(cudaStream_t *streams, uint32_t *gpu_indexes,
void scratch_cuda_full_propagation(cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int_fullprop_buffer<Torus> **mem_ptr,
int_radix_params params,
@@ -849,14 +857,16 @@ void scratch_cuda_full_propagation(cudaStream_t *streams, uint32_t *gpu_indexes,
// (lwe_dimension+1) threads
// (num_radix_blocks / 2) thread blocks
template <typename Torus>
__global__ void device_pack_blocks(Torus *lwe_array_out, Torus *lwe_array_in,
__global__ void device_pack_blocks(Torus *lwe_array_out,
Torus const *lwe_array_in,
uint32_t lwe_dimension,
uint32_t num_radix_blocks, uint32_t factor) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < (lwe_dimension + 1)) {
for (int bid = 0; bid < (num_radix_blocks / 2); bid++) {
Torus *lsb_block = lwe_array_in + (2 * bid) * (lwe_dimension + 1);
Torus *lsb_block =
(Torus *)lwe_array_in + (2 * bid) * (lwe_dimension + 1);
Torus *msb_block = lsb_block + (lwe_dimension + 1);
Torus *packed_block = lwe_array_out + bid * (lwe_dimension + 1);
@@ -867,7 +877,7 @@ __global__ void device_pack_blocks(Torus *lwe_array_out, Torus *lwe_array_in,
if (num_radix_blocks % 2 == 1) {
// We couldn't pack the last block, so we just copy it
Torus *lsb_block =
lwe_array_in + (num_radix_blocks - 1) * (lwe_dimension + 1);
(Torus *)lwe_array_in + (num_radix_blocks - 1) * (lwe_dimension + 1);
Torus *last_block =
lwe_array_out + (num_radix_blocks / 2) * (lwe_dimension + 1);
@@ -885,7 +895,7 @@ __global__ void device_pack_blocks(Torus *lwe_array_out, Torus *lwe_array_in,
// Expects the carry buffer to be empty
template <typename Torus>
__host__ void pack_blocks(cudaStream_t stream, uint32_t gpu_index,
Torus *lwe_array_out, Torus *lwe_array_in,
Torus *lwe_array_out, Torus const *lwe_array_in,
uint32_t lwe_dimension, uint32_t num_radix_blocks,
uint32_t factor) {
if (num_radix_blocks == 0)
@@ -900,7 +910,7 @@ __host__ void pack_blocks(cudaStream_t stream, uint32_t gpu_index,
template <typename Torus>
__global__ void
device_create_trivial_radix(Torus *lwe_array, Torus *scalar_input,
device_create_trivial_radix(Torus *lwe_array, Torus const *scalar_input,
int32_t num_blocks, uint32_t lwe_dimension,
uint64_t delta) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
@@ -915,7 +925,7 @@ device_create_trivial_radix(Torus *lwe_array, Torus *scalar_input,
template <typename Torus>
__host__ void
create_trivial_radix(cudaStream_t stream, uint32_t gpu_index,
Torus *lwe_array_out, Torus *scalar_array,
Torus *lwe_array_out, Torus const *scalar_array,
uint32_t lwe_dimension, uint32_t num_radix_blocks,
uint32_t num_scalar_blocks, uint64_t message_modulus,
uint64_t carry_modulus) {
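A trivial LWE ciphertext carries no encryption at all: the mask is zero and the body holds the encoded plaintext. A sketch of what device_create_trivial_radix writes per block; delta is the usual plaintext scaling factor, whose exact derivation from message_modulus and carry_modulus follows the library's encoding:

#include <cstdint>
#include <vector>

// One trivial LWE block: lwe_dimension zero mask coefficients followed
// by a body of scalar * delta. Blocks past num_scalar_blocks stay zero.
template <typename Torus>
void trivial_lwe_block(std::vector<Torus> &lwe, uint32_t lwe_dimension,
                       Torus scalar, uint64_t delta) {
  lwe.assign(lwe_dimension + 1, 0);                        // zero mask and body
  lwe[lwe_dimension] = scalar * static_cast<Torus>(delta); // encoded body
}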
@@ -951,9 +961,10 @@ create_trivial_radix(cudaStream_t stream, uint32_t gpu_index,
* * (lwe_dimension+1) * sizeof(Torus) bytes
*/
template <typename Torus>
__host__ void extract_n_bits(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out,
Torus *lwe_array_in, void **bsks, Torus **ksks,
__host__ void extract_n_bits(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_in,
void *const *bsks, Torus *const *ksks,
uint32_t num_radix_blocks, uint32_t bits_per_block,
int_bit_extract_luts_buffer<Torus> *bit_extract) {
@@ -964,11 +975,11 @@ __host__ void extract_n_bits(cudaStream_t *streams, uint32_t *gpu_indexes,
template <typename Torus>
__host__ void
reduce_signs(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *signs_array_out, Torus *signs_array_in,
reduce_signs(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *signs_array_out, Torus *signs_array_in,
int_comparison_buffer<Torus> *mem_ptr,
std::function<Torus(Torus)> sign_handler_f, void **bsks,
Torus **ksks, uint32_t num_sign_blocks) {
std::function<Torus(Torus)> sign_handler_f, void *const *bsks,
Torus *const *ksks, uint32_t num_sign_blocks) {
auto diff_buffer = mem_ptr->diff_buffer;
@@ -1064,27 +1075,29 @@ reduce_signs(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
template <typename Torus>
void scratch_cuda_apply_univariate_lut_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
int_radix_lut<Torus> **mem_ptr, Torus *input_lut, uint32_t num_radix_blocks,
int_radix_params params, bool allocate_gpu_memory) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_radix_lut<Torus> **mem_ptr, Torus const *input_lut,
uint32_t num_radix_blocks, int_radix_params params,
bool allocate_gpu_memory) {
*mem_ptr = new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params,
1, num_radix_blocks, allocate_gpu_memory);
// It is safe to do this copy on GPU 0, because all LUTs always reside on GPU
// 0
cuda_memcpy_async_to_gpu((*mem_ptr)->get_lut(gpu_indexes[0], 0), input_lut,
(params.glwe_dimension + 1) *
params.polynomial_size * sizeof(Torus),
streams[0], gpu_indexes[0]);
cuda_memcpy_async_to_gpu(
(*mem_ptr)->get_lut(gpu_indexes[0], 0), (void *)input_lut,
(params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus),
streams[0], gpu_indexes[0]);
(*mem_ptr)->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
}
template <typename Torus>
void host_apply_univariate_lut_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
void host_apply_univariate_lut_kb(cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *radix_lwe_out,
Torus *radix_lwe_in,
int_radix_lut<Torus> *mem, Torus **ksks,
void **bsks, uint32_t num_blocks) {
Torus const *radix_lwe_in,
int_radix_lut<Torus> *mem, Torus *const *ksks,
void *const *bsks, uint32_t num_blocks) {
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, radix_lwe_out, radix_lwe_in, bsks, ksks,
@@ -1093,10 +1106,10 @@ void host_apply_univariate_lut_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
template <typename Torus>
void host_apply_many_univariate_lut_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *radix_lwe_out, Torus *radix_lwe_in, int_radix_lut<Torus> *mem,
Torus **ksks, void **bsks, uint32_t num_blocks, uint32_t lut_count,
uint32_t lut_stride) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *radix_lwe_out, Torus const *radix_lwe_in,
int_radix_lut<Torus> *mem, Torus *const *ksks, void *const *bsks,
uint32_t num_blocks, uint32_t lut_count, uint32_t lut_stride) {
integer_radix_apply_many_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, radix_lwe_out, radix_lwe_in, bsks, ksks,
@@ -1105,28 +1118,28 @@ void host_apply_many_univariate_lut_kb(
template <typename Torus>
void scratch_cuda_apply_bivariate_lut_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
int_radix_lut<Torus> **mem_ptr, Torus *input_lut, uint32_t num_radix_blocks,
int_radix_params params, bool allocate_gpu_memory) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_radix_lut<Torus> **mem_ptr, Torus const *input_lut,
uint32_t num_radix_blocks, int_radix_params params,
bool allocate_gpu_memory) {
*mem_ptr = new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params,
1, num_radix_blocks, allocate_gpu_memory);
// It is safe to do this copy on GPU 0, because all LUTs always reside on GPU
// 0
cuda_memcpy_async_to_gpu((*mem_ptr)->get_lut(gpu_indexes[0], 0), input_lut,
(params.glwe_dimension + 1) *
params.polynomial_size * sizeof(Torus),
streams[0], gpu_indexes[0]);
cuda_memcpy_async_to_gpu(
(*mem_ptr)->get_lut(gpu_indexes[0], 0), (void *)input_lut,
(params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus),
streams[0], gpu_indexes[0]);
(*mem_ptr)->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
}
template <typename Torus>
void host_apply_bivariate_lut_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, Torus *radix_lwe_out,
Torus *radix_lwe_in_1, Torus *radix_lwe_in_2,
int_radix_lut<Torus> *mem, Torus **ksks,
void **bsks, uint32_t num_blocks,
uint32_t shift) {
void host_apply_bivariate_lut_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *radix_lwe_out, Torus const *radix_lwe_in_1,
Torus const *radix_lwe_in_2, int_radix_lut<Torus> *mem, Torus *const *ksks,
void *const *bsks, uint32_t num_blocks, uint32_t shift) {
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, radix_lwe_out, radix_lwe_in_1,

View File

@@ -66,12 +66,12 @@ void generate_ids_update_degrees(int *terms_degree, size_t *h_lwe_idx_in,
* the integer radix multiplication in keyswitch->bootstrap order.
*/
void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t message_modulus, uint32_t carry_modulus, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t pbs_base_log,
uint32_t pbs_level, uint32_t ks_base_log, uint32_t ks_level,
uint32_t grouping_factor, uint32_t num_radix_blocks, PBS_TYPE pbs_type,
bool allocate_gpu_memory) {
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t message_modulus, uint32_t carry_modulus,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t pbs_base_log, uint32_t pbs_level, uint32_t ks_base_log,
uint32_t ks_level, uint32_t grouping_factor, uint32_t num_radix_blocks,
PBS_TYPE pbs_type, bool allocate_gpu_memory) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
polynomial_size * glwe_dimension, lwe_dimension,
@@ -87,7 +87,7 @@ void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
case 8192:
case 16384:
scratch_cuda_integer_mult_radix_ciphertext_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(cudaStream_t const *)(streams), gpu_indexes, gpu_count,
(int_mul_memory<uint64_t> **)mem_ptr, num_radix_blocks, params,
allocate_gpu_memory);
break;
@@ -125,67 +125,67 @@ void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
* - 'pbs_type' selects which PBS implementation should be used
*/
void cuda_integer_mult_radix_ciphertext_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *radix_lwe_out, void *radix_lwe_left, void *radix_lwe_right,
void **bsks, void **ksks, int8_t *mem_ptr, uint32_t polynomial_size,
uint32_t num_blocks) {
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *radix_lwe_out, void const *radix_lwe_left,
void const *radix_lwe_right, void *const *bsks, void *const *ksks,
int8_t *mem_ptr, uint32_t polynomial_size, uint32_t num_blocks) {
switch (polynomial_size) {
case 256:
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<256>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_left),
static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
static_cast<const uint64_t *>(radix_lwe_left),
static_cast<const uint64_t *>(radix_lwe_right), bsks,
(uint64_t **)(ksks), (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
break;
case 512:
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<512>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_left),
static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
static_cast<const uint64_t *>(radix_lwe_left),
static_cast<const uint64_t *>(radix_lwe_right), bsks,
(uint64_t **)(ksks), (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
break;
case 1024:
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<1024>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_left),
static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
static_cast<const uint64_t *>(radix_lwe_left),
static_cast<const uint64_t *>(radix_lwe_right), bsks,
(uint64_t **)(ksks), (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
break;
case 2048:
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<2048>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_left),
static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
static_cast<const uint64_t *>(radix_lwe_left),
static_cast<const uint64_t *>(radix_lwe_right), bsks,
(uint64_t **)(ksks), (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
break;
case 4096:
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<4096>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_left),
static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
static_cast<const uint64_t *>(radix_lwe_left),
static_cast<const uint64_t *>(radix_lwe_right), bsks,
(uint64_t **)(ksks), (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
break;
case 8192:
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<8192>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_left),
static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
static_cast<const uint64_t *>(radix_lwe_left),
static_cast<const uint64_t *>(radix_lwe_right), bsks,
(uint64_t **)(ksks), (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
break;
case 16384:
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<16384>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_left),
static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
static_cast<const uint64_t *>(radix_lwe_left),
static_cast<const uint64_t *>(radix_lwe_right), bsks,
(uint64_t **)(ksks), (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
break;
default:
PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
@@ -193,8 +193,9 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
}
}
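On cleartexts, radix multiplication is schoolbook multiplication over base-message_modulus digits: all shifted partial products are formed, column sums are accumulated, and carries are propagated, which mirrors the partial-sum and propagation steps of the encrypted flow. A minimal sketch, with digits little-endian and the result truncated to the input width as in the GPU version:

#include <cstdint>
#include <vector>

// Cleartext model of radix multiplication. Digits are assumed small
// (e.g. modulus 4), so the 64-bit column sums cannot overflow.
std::vector<uint64_t> radix_mul(const std::vector<uint64_t> &a,
                                const std::vector<uint64_t> &b,
                                uint64_t modulus) {
  size_t n = a.size();
  std::vector<uint64_t> acc(n, 0);
  for (size_t i = 0; i < n; ++i)
    for (size_t j = 0; i + j < n; ++j)
      acc[i + j] += a[i] * b[j]; // column sums of partial products
  uint64_t carry = 0;
  for (size_t i = 0; i < n; ++i) { // final carry-propagation pass
    acc[i] += carry;
    carry = acc[i] / modulus;
    acc[i] %= modulus;
  }
  return acc;
}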
void cleanup_cuda_integer_mult(void **streams, uint32_t *gpu_indexes,
uint32_t gpu_count, int8_t **mem_ptr_void) {
void cleanup_cuda_integer_mult(void *const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void) {
int_mul_memory<uint64_t> *mem_ptr =
(int_mul_memory<uint64_t> *)(*mem_ptr_void);
@@ -203,10 +204,10 @@ void cleanup_cuda_integer_mult(void **streams, uint32_t *gpu_indexes,
}
void scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t lwe_dimension,
uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
uint32_t pbs_base_log, uint32_t grouping_factor,
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory) {
@@ -222,9 +223,10 @@ void scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
}
void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *radix_lwe_out, void *radix_lwe_vec, uint32_t num_radix_in_vec,
int8_t *mem_ptr, void **bsks, void **ksks, uint32_t num_blocks_in_radix) {
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
uint32_t num_blocks_in_radix) {
auto mem = (int_sum_ciphertexts_vec_memory<uint64_t> *)mem_ptr;
@@ -298,7 +300,7 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
}
void cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void) {
int_sum_ciphertexts_vec_memory<uint64_t> *mem_ptr =
(int_sum_ciphertexts_vec_memory<uint64_t> *)(*mem_ptr_void);

View File

@@ -9,10 +9,10 @@
#include "crypto/keyswitch.cuh"
#include "device.h"
#include "helper_multi_gpu.h"
#include "integer.h"
#include "integer/integer.cuh"
#include "integer/integer_utilities.h"
#include "linear_algebra.h"
#include "programmable_bootstrap.h"
#include "pbs/programmable_bootstrap.h"
#include "utils/helper.cuh"
#include "utils/helper_multi_gpu.cuh"
#include "utils/kernel_dimensions.cuh"
@@ -43,8 +43,8 @@ __global__ void smart_copy(Torus *dst, Torus *src, int32_t *id_out,
template <typename Torus, class params>
__global__ void
all_shifted_lhs_rhs(Torus *radix_lwe_left, Torus *lsb_ciphertext,
Torus *msb_ciphertext, Torus *radix_lwe_right,
all_shifted_lhs_rhs(Torus const *radix_lwe_left, Torus *lsb_ciphertext,
Torus *msb_ciphertext, Torus const *radix_lwe_right,
Torus *lsb_rhs, Torus *msb_rhs, int num_blocks) {
size_t block_id = blockIdx.x;
@@ -170,8 +170,8 @@ __global__ void fill_radix_from_lsb_msb(Torus *result_blocks, Torus *lsb_blocks,
}
template <typename Torus>
__host__ void scratch_cuda_integer_partial_sum_ciphertexts_vec_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
int_sum_ciphertexts_vec_memory<Torus> **mem_ptr,
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_sum_ciphertexts_vec_memory<Torus> **mem_ptr,
uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec,
int_radix_params params, bool allocate_gpu_memory) {
@@ -182,9 +182,10 @@ __host__ void scratch_cuda_integer_partial_sum_ciphertexts_vec_kb(
template <typename Torus, class params>
__host__ void host_integer_partial_sum_ciphertexts_vec_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *radix_lwe_out, Torus *terms, int *terms_degree, void **bsks,
uint64_t **ksks, int_sum_ciphertexts_vec_memory<uint64_t> *mem_ptr,
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *radix_lwe_out, Torus *terms, int *terms_degree,
void *const *bsks, uint64_t *const *ksks,
int_sum_ciphertexts_vec_memory<uint64_t> *mem_ptr,
uint32_t num_blocks_in_radix, uint32_t num_radix_in_vec,
int_radix_lut<Torus> *reused_lut) {
@@ -450,9 +451,9 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
template <typename Torus, class params>
__host__ void host_integer_mult_radix_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
uint64_t *radix_lwe_out, uint64_t *radix_lwe_left,
uint64_t *radix_lwe_right, void **bsks, uint64_t **ksks,
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, uint64_t *radix_lwe_out, uint64_t const *radix_lwe_left,
uint64_t const *radix_lwe_right, void *const *bsks, uint64_t *const *ksks,
int_mul_memory<Torus> *mem_ptr, uint32_t num_blocks) {
auto glwe_dimension = mem_ptr->params.glwe_dimension;
@@ -569,9 +570,10 @@ __host__ void host_integer_mult_radix_kb(
template <typename Torus>
__host__ void scratch_cuda_integer_mult_radix_ciphertext_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
int_mul_memory<Torus> **mem_ptr, uint32_t num_radix_blocks,
int_radix_params params, bool allocate_gpu_memory) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_mul_memory<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params,
bool allocate_gpu_memory) {
*mem_ptr = new int_mul_memory<Torus>(streams, gpu_indexes, gpu_count, params,
num_radix_blocks, allocate_gpu_memory);
}

View File

@@ -1,21 +1,21 @@
#include "integer/negation.cuh"
void cuda_negate_integer_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void *lwe_array_in, uint32_t lwe_dimension,
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void const *lwe_array_in, uint32_t lwe_dimension,
uint32_t lwe_ciphertext_count, uint32_t message_modulus,
uint32_t carry_modulus) {
host_integer_radix_negation<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in), lwe_dimension,
static_cast<const uint64_t *>(lwe_array_in), lwe_dimension,
lwe_ciphertext_count, message_modulus, carry_modulus);
}
void scratch_cuda_integer_radix_overflowing_sub_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t glwe_dimension, uint32_t polynomial_size,
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
@@ -33,10 +33,10 @@ void scratch_cuda_integer_radix_overflowing_sub_kb_64(
}
void cuda_integer_radix_overflowing_sub_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *radix_lwe_out, void *radix_lwe_overflowed, void *radix_lwe_left,
void *radix_lwe_right, int8_t *mem_ptr, void **bsks, void **ksks,
uint32_t num_blocks) {
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *radix_lwe_out, void *radix_lwe_overflowed, void const *radix_lwe_left,
void const *radix_lwe_right, int8_t *mem_ptr, void *const *bsks,
void *const *ksks, uint32_t num_blocks) {
auto mem = (int_overflowing_sub_memory<uint64_t> *)mem_ptr;
@@ -44,13 +44,13 @@ void cuda_integer_radix_overflowing_sub_kb_64(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_overflowed),
static_cast<uint64_t *>(radix_lwe_left),
static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks), mem,
num_blocks);
static_cast<const uint64_t *>(radix_lwe_left),
static_cast<const uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
mem, num_blocks);
}
void cleanup_cuda_integer_radix_overflowing_sub(void **streams,
uint32_t *gpu_indexes,
void cleanup_cuda_integer_radix_overflowing_sub(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void) {
int_overflowing_sub_memory<uint64_t> *mem_ptr =

View File

@@ -8,10 +8,10 @@
#include "crypto/keyswitch.cuh"
#include "device.h"
#include "integer.h"
#include "integer/integer.cuh"
#include "integer/integer_utilities.h"
#include "linear_algebra.h"
#include "programmable_bootstrap.h"
#include "pbs/programmable_bootstrap.h"
#include "utils/helper.cuh"
#include "utils/kernel_dimensions.cuh"
#include <fstream>
@@ -23,9 +23,9 @@
template <typename Torus>
__global__ void
device_integer_radix_negation(Torus *output, Torus *input, int32_t num_blocks,
uint64_t lwe_dimension, uint64_t message_modulus,
uint64_t delta) {
device_integer_radix_negation(Torus *output, Torus const *input,
int32_t num_blocks, uint64_t lwe_dimension,
uint64_t message_modulus, uint64_t delta) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < lwe_dimension + 1) {
bool is_body = (tid == lwe_dimension);
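LWE negation itself is linear: every coefficient is negated, using wrapping unsigned arithmetic as negation on the torus. The radix version additionally adds a per-block plaintext correction so each block stays in a decryptable range; the sketch below shows only the linear part:

#include <vector>

// Coefficient-wise negation of one LWE ciphertext: Enc(m) -> Enc(-m).
template <typename Torus>
void lwe_negate(std::vector<Torus> &lwe) {
  for (auto &c : lwe)
    c = static_cast<Torus>(0) - c; // wrapping negation on the torus
}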
@@ -54,12 +54,11 @@ device_integer_radix_negation(Torus *output, Torus *input, int32_t num_blocks,
}
template <typename Torus>
__host__ void
host_integer_radix_negation(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, Torus *output, Torus *input,
uint32_t lwe_dimension,
uint32_t input_lwe_ciphertext_count,
uint64_t message_modulus, uint64_t carry_modulus) {
__host__ void host_integer_radix_negation(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *output, Torus const *input,
uint32_t lwe_dimension, uint32_t input_lwe_ciphertext_count,
uint64_t message_modulus, uint64_t carry_modulus) {
cudaSetDevice(gpu_indexes[0]);
// lwe_size includes the presence of the body
@@ -85,9 +84,9 @@ host_integer_radix_negation(cudaStream_t *streams, uint32_t *gpu_indexes,
template <typename Torus>
__host__ void scratch_cuda_integer_overflowing_sub_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
int_overflowing_sub_memory<Torus> **mem_ptr, uint32_t num_blocks,
int_radix_params params, bool allocate_gpu_memory) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_overflowing_sub_memory<Torus> **mem_ptr,
uint32_t num_blocks, int_radix_params params, bool allocate_gpu_memory) {
*mem_ptr = new int_overflowing_sub_memory<Torus>(
streams, gpu_indexes, gpu_count, params, num_blocks, allocate_gpu_memory);
@@ -95,9 +94,10 @@ __host__ void scratch_cuda_integer_overflowing_sub_kb(
template <typename Torus>
__host__ void host_integer_overflowing_sub_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *radix_lwe_out, Torus *radix_lwe_overflowed, Torus *radix_lwe_left,
Torus *radix_lwe_right, void **bsks, uint64_t **ksks,
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *radix_lwe_out, Torus *radix_lwe_overflowed,
Torus const *radix_lwe_left, Torus const *radix_lwe_right,
void *const *bsks, uint64_t *const *ksks,
int_overflowing_sub_memory<uint64_t> *mem_ptr, uint32_t num_blocks) {
auto radix_params = mem_ptr->params;

View File

@@ -1,12 +1,14 @@
#include "integer/scalar_addition.cuh"
void cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
void *scalar_input, uint32_t lwe_dimension, uint32_t lwe_ciphertext_count,
uint32_t message_modulus, uint32_t carry_modulus) {
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array, void const *scalar_input, uint32_t lwe_dimension,
uint32_t lwe_ciphertext_count, uint32_t message_modulus,
uint32_t carry_modulus) {
host_integer_radix_scalar_addition_inplace<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array), static_cast<uint64_t *>(scalar_input),
lwe_dimension, lwe_ciphertext_count, message_modulus, carry_modulus);
static_cast<uint64_t *>(lwe_array),
static_cast<const uint64_t *>(scalar_input), lwe_dimension,
lwe_ciphertext_count, message_modulus, carry_modulus);
}

View File

@@ -7,13 +7,13 @@
#endif
#include "device.h"
#include "integer.h"
#include "integer/integer_utilities.h"
#include "utils/kernel_dimensions.cuh"
#include <stdio.h>
template <typename Torus>
__global__ void device_integer_radix_scalar_addition_inplace(
Torus *lwe_array, Torus *scalar_input, int32_t num_blocks,
Torus *lwe_array, Torus const *scalar_input, int32_t num_blocks,
uint32_t lwe_dimension, uint64_t delta) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
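Scalar addition is noiseless: only the body coefficient of each block needs updating, with no keyswitch or PBS involved. A sketch of the per-block update the kernel performs, assuming the scalar has already been decomposed into one digit per block:

#include <cstdint>
#include <vector>

// In-place scalar addition on a radix ciphertext: for each block b, add
// the encoded scalar digit to the body coefficient at index
// lwe_dimension. lwe_array must hold at least scalar_digits.size() blocks.
template <typename Torus>
void scalar_add_inplace(std::vector<Torus> &lwe_array,
                        const std::vector<Torus> &scalar_digits,
                        uint32_t lwe_dimension, uint64_t delta) {
  const uint32_t lwe_size = lwe_dimension + 1;
  for (size_t b = 0; b < scalar_digits.size(); ++b)
    lwe_array[b * lwe_size + lwe_dimension] +=
        scalar_digits[b] * static_cast<Torus>(delta);
}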
@@ -25,10 +25,10 @@ __global__ void device_integer_radix_scalar_addition_inplace(
template <typename Torus>
__host__ void host_integer_radix_scalar_addition_inplace(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array, Torus *scalar_input, uint32_t lwe_dimension,
uint32_t input_lwe_ciphertext_count, uint32_t message_modulus,
uint32_t carry_modulus) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array, Torus const *scalar_input,
uint32_t lwe_dimension, uint32_t input_lwe_ciphertext_count,
uint32_t message_modulus, uint32_t carry_modulus) {
cudaSetDevice(gpu_indexes[0]);
// Create a 1-dimensional grid of threads
@@ -64,8 +64,8 @@ __global__ void device_integer_radix_add_scalar_one_inplace(
template <typename Torus>
__host__ void host_integer_radix_add_scalar_one_inplace(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array, uint32_t lwe_dimension,
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array, uint32_t lwe_dimension,
uint32_t input_lwe_ciphertext_count, uint32_t message_modulus,
uint32_t carry_modulus) {
cudaSetDevice(gpu_indexes[0]);
@@ -104,10 +104,10 @@ __global__ void device_integer_radix_scalar_subtraction_inplace(
template <typename Torus>
__host__ void host_integer_radix_scalar_subtraction_inplace(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array, Torus *scalar_input, uint32_t lwe_dimension,
uint32_t input_lwe_ciphertext_count, uint32_t message_modulus,
uint32_t carry_modulus) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array, Torus *scalar_input,
uint32_t lwe_dimension, uint32_t input_lwe_ciphertext_count,
uint32_t message_modulus, uint32_t carry_modulus) {
cudaSetDevice(gpu_indexes[0]);
// Create a 1-dimensional grid of threads

View File

@@ -1,16 +1,16 @@
#include "integer/scalar_bitops.cuh"
void cuda_scalar_bitop_integer_radix_ciphertext_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void *lwe_array_input, void *clear_blocks,
uint32_t num_clear_blocks, int8_t *mem_ptr, void **bsks, void **ksks,
uint32_t lwe_ciphertext_count, BITOP_TYPE op) {
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void const *lwe_array_input, void const *clear_blocks,
uint32_t num_clear_blocks, int8_t *mem_ptr, void *const *bsks,
void *const *ksks, uint32_t lwe_ciphertext_count, BITOP_TYPE op) {
host_integer_radix_scalar_bitop_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_input),
static_cast<uint64_t *>(clear_blocks), num_clear_blocks,
static_cast<const uint64_t *>(lwe_array_input),
static_cast<const uint64_t *>(clear_blocks), num_clear_blocks,
(int_bitop_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
lwe_ciphertext_count, op);
}

View File

@@ -6,10 +6,11 @@
template <typename Torus>
__host__ void host_integer_radix_scalar_bitop_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_input, Torus *clear_blocks,
uint32_t num_clear_blocks, int_bitop_buffer<Torus> *mem_ptr, void **bsks,
Torus **ksks, uint32_t num_radix_blocks, BITOP_TYPE op) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_input,
Torus const *clear_blocks, uint32_t num_clear_blocks,
int_bitop_buffer<Torus> *mem_ptr, void *const *bsks, Torus *const *ksks,
uint32_t num_radix_blocks, BITOP_TYPE op) {
auto lut = mem_ptr->lut;
auto params = lut->params;

View File

@@ -1,10 +1,10 @@
#include "integer/scalar_comparison.cuh"
void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void *lwe_array_in, void *scalar_blocks,
int8_t *mem_ptr, void **bsks, void **ksks, uint32_t lwe_ciphertext_count,
uint32_t num_scalar_blocks) {
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void const *lwe_array_in, void const *scalar_blocks,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
uint32_t lwe_ciphertext_count, uint32_t num_scalar_blocks) {
int_comparison_buffer<uint64_t> *buffer =
(int_comparison_buffer<uint64_t> *)mem_ptr;
@@ -14,8 +14,8 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
host_integer_radix_scalar_equality_check_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(scalar_blocks), buffer, bsks,
static_cast<const uint64_t *>(lwe_array_in),
static_cast<const uint64_t *>(scalar_blocks), buffer, bsks,
(uint64_t **)(ksks), lwe_ciphertext_count, num_scalar_blocks);
break;
case GT:
@@ -25,8 +25,8 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
host_integer_radix_scalar_difference_check_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(scalar_blocks), buffer,
static_cast<const uint64_t *>(lwe_array_in),
static_cast<const uint64_t *>(scalar_blocks), buffer,
buffer->diff_buffer->operator_f, bsks, (uint64_t **)(ksks),
lwe_ciphertext_count, num_scalar_blocks);
break;
@@ -35,8 +35,8 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
host_integer_radix_scalar_maxmin_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(scalar_blocks), buffer, bsks,
static_cast<const uint64_t *>(lwe_array_in),
static_cast<const uint64_t *>(scalar_blocks), buffer, bsks,
(uint64_t **)(ksks), lwe_ciphertext_count, num_scalar_blocks);
break;
default:

View File

@@ -5,10 +5,10 @@
template <typename Torus>
__host__ void scalar_compare_radix_blocks_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
uint32_t num_radix_blocks) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out, Torus *lwe_array_in,
Torus *scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
void *const *bsks, Torus *const *ksks, uint32_t num_radix_blocks) {
if (num_radix_blocks == 0)
return;
@@ -57,11 +57,12 @@ __host__ void scalar_compare_radix_blocks_kb(
template <typename Torus>
__host__ void integer_radix_unsigned_scalar_difference_check_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
int_comparison_buffer<Torus> *mem_ptr,
std::function<Torus(Torus)> sign_handler_f, void **bsks, Torus **ksks,
uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_in,
Torus const *scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
std::function<Torus(Torus)> sign_handler_f, void *const *bsks,
Torus *const *ksks, uint32_t total_num_radix_blocks,
uint32_t total_num_scalar_blocks) {
auto params = mem_ptr->params;
auto big_lwe_dimension = params.big_lwe_dimension;
@@ -243,11 +244,12 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
template <typename Torus>
__host__ void integer_radix_signed_scalar_difference_check_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
int_comparison_buffer<Torus> *mem_ptr,
std::function<Torus(Torus)> sign_handler_f, void **bsks, Torus **ksks,
uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_in,
Torus const *scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
std::function<Torus(Torus)> sign_handler_f, void *const *bsks,
Torus *const *ksks, uint32_t total_num_radix_blocks,
uint32_t total_num_scalar_blocks) {
auto params = mem_ptr->params;
auto big_lwe_dimension = params.big_lwe_dimension;
@@ -287,7 +289,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
host_compare_with_zero_equality<Torus>(
streams, gpu_indexes, gpu_count, are_all_msb_zeros, lwe_array_in,
mem_ptr, bsks, ksks, total_num_radix_blocks, mem_ptr->is_zero_lut);
Torus *sign_block =
Torus const *sign_block =
lwe_array_in + (total_num_radix_blocks - 1) * big_lwe_size;
auto sign_bit_pos = (int)std::log2(message_modulus) - 1;
@@ -426,7 +428,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
lut_f);
signed_msb_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
Torus *sign_block = msb + (num_msb_radix_blocks - 1) * big_lwe_size;
Torus const *sign_block = msb + (num_msb_radix_blocks - 1) * big_lwe_size;
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
msb_streams, gpu_indexes, gpu_count, lwe_array_msb_out, sign_block,
are_all_msb_zeros, bsks, ksks, 1, signed_msb_lut,
@@ -476,9 +478,10 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
scalar_compare_radix_blocks_kb<Torus>(lsb_streams, gpu_indexes, gpu_count,
lwe_array_ct_out, lhs, rhs, mem_ptr,
bsks, ksks, num_lsb_radix_blocks);
Torus *encrypted_sign_block =
Torus const *encrypted_sign_block =
lwe_array_in + (total_num_radix_blocks - 1) * big_lwe_size;
Torus *scalar_sign_block = scalar_blocks + (total_num_scalar_blocks - 1);
Torus const *scalar_sign_block =
scalar_blocks + (total_num_scalar_blocks - 1);
auto trivial_sign_block = mem_ptr->tmp_trivial_sign_block;
create_trivial_radix<Torus>(
@@ -505,10 +508,11 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
template <typename Torus>
__host__ void integer_radix_signed_scalar_maxmin_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out, Torus *lwe_array_in,
Torus *scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
void *const *bsks, Torus *const *ksks, uint32_t total_num_radix_blocks,
uint32_t total_num_scalar_blocks) {
auto params = mem_ptr->params;
// Calculates the difference sign between the ciphertext and the scalar
@@ -541,11 +545,12 @@ __host__ void integer_radix_signed_scalar_maxmin_kb(
template <typename Torus>
__host__ void host_integer_radix_scalar_difference_check_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
int_comparison_buffer<Torus> *mem_ptr,
std::function<Torus(Torus)> sign_handler_f, void **bsks, Torus **ksks,
uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_in,
Torus const *scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
std::function<Torus(Torus)> sign_handler_f, void *const *bsks,
Torus *const *ksks, uint32_t total_num_radix_blocks,
uint32_t total_num_scalar_blocks) {
if (mem_ptr->is_signed) {
// is signed and scalar is positive
@@ -563,10 +568,11 @@ __host__ void host_integer_radix_scalar_difference_check_kb(
template <typename Torus>
__host__ void host_integer_radix_signed_scalar_maxmin_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out, Torus *lwe_array_in,
Torus *scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
void *const *bsks, Torus *const *ksks, uint32_t total_num_radix_blocks,
uint32_t total_num_scalar_blocks) {
if (mem_ptr->is_signed) {
// is signed and scalar is positive
@@ -582,10 +588,11 @@ __host__ void host_integer_radix_signed_scalar_maxmin_kb(
template <typename Torus>
__host__ void host_integer_radix_scalar_maxmin_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_in,
Torus const *scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
void *const *bsks, Torus *const *ksks, uint32_t total_num_radix_blocks,
uint32_t total_num_scalar_blocks) {
auto params = mem_ptr->params;
@@ -619,10 +626,11 @@ __host__ void host_integer_radix_scalar_maxmin_kb(
template <typename Torus>
__host__ void host_integer_radix_scalar_equality_check_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
uint32_t num_radix_blocks, uint32_t num_scalar_blocks) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_in,
Torus const *scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
void *const *bsks, Torus *const *ksks, uint32_t num_radix_blocks,
uint32_t num_scalar_blocks) {
auto params = mem_ptr->params;
auto big_lwe_dimension = params.big_lwe_dimension;

View File

@@ -1,12 +1,12 @@
#include "integer/scalar_mul.cuh"
void scratch_cuda_integer_scalar_mul_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t lwe_dimension,
uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory) {
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, bool allocate_gpu_memory) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
glwe_dimension * polynomial_size, lwe_dimension,
@@ -20,9 +20,10 @@ void scratch_cuda_integer_scalar_mul_kb_64(
}
void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
uint64_t *decomposed_scalar, uint64_t *has_at_least_one_set, int8_t *mem,
void **bsks, void **ksks, uint32_t lwe_dimension, uint32_t polynomial_size,
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array, uint64_t const *decomposed_scalar,
uint64_t const *has_at_least_one_set, int8_t *mem, void *const *bsks,
void *const *ksks, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t message_modulus, uint32_t num_blocks, uint32_t num_scalars) {
switch (polynomial_size) {
@@ -86,8 +87,8 @@ void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
}
}
void cleanup_cuda_integer_radix_scalar_mul(void **streams,
uint32_t *gpu_indexes,
void cleanup_cuda_integer_radix_scalar_mul(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void) {

View File

@@ -7,7 +7,7 @@
#endif
#include "device.h"
#include "integer.h"
#include "integer/integer_utilities.h"
#include "multiplication.cuh"
#include "scalar_shifts.cuh"
#include "utils/kernel_dimensions.cuh"
@@ -29,9 +29,10 @@ __global__ void device_small_scalar_radix_multiplication(T *output_lwe_array,
template <typename T>
__host__ void scratch_cuda_integer_radix_scalar_mul_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
int_scalar_mul_buffer<T> **mem_ptr, uint32_t num_radix_blocks,
int_radix_params params, bool allocate_gpu_memory) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_scalar_mul_buffer<T> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params,
bool allocate_gpu_memory) {
*mem_ptr =
new int_scalar_mul_buffer<T>(streams, gpu_indexes, gpu_count, params,
@@ -40,11 +41,11 @@ __host__ void scratch_cuda_integer_radix_scalar_mul_kb(
template <typename T, class params>
__host__ void host_integer_scalar_mul_radix(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
T *lwe_array, T *decomposed_scalar, T *has_at_least_one_set,
int_scalar_mul_buffer<T> *mem, void **bsks, T **ksks,
uint32_t input_lwe_dimension, uint32_t message_modulus,
uint32_t num_radix_blocks, uint32_t num_scalars) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, T *lwe_array, T const *decomposed_scalar,
T const *has_at_least_one_set, int_scalar_mul_buffer<T> *mem,
void *const *bsks, T *const *ksks, uint32_t input_lwe_dimension,
uint32_t message_modulus, uint32_t num_radix_blocks, uint32_t num_scalars) {
if (num_radix_blocks == 0 || num_scalars == 0)
return;
@@ -121,8 +122,8 @@ __host__ void host_integer_scalar_mul_radix(
// Small scalar_mul is used in shift/rotate
template <typename T>
__host__ void host_integer_small_scalar_mul_radix(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
T *output_lwe_array, T *input_lwe_array, T scalar,
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, T *output_lwe_array, T *input_lwe_array, T scalar,
uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) {
cudaSetDevice(gpu_indexes[0]);
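Multiplying an LWE ciphertext by a small cleartext scalar is again linear: every coefficient, mask and body alike, is multiplied by the scalar, which also scales the noise by that factor — hence "small", and hence its restriction to shift/rotate. A sketch:

#include <cstddef>
#include <vector>

// Enc(m) -> Enc(scalar * m) by multiplying every coefficient. Noise grows
// by |scalar|, so this is only safe for small factors.
template <typename T>
void small_scalar_mul(std::vector<T> &out, const std::vector<T> &in,
                      T scalar) {
  out.resize(in.size());
  for (size_t i = 0; i < in.size(); ++i)
    out[i] = in[i] * scalar;
}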

View File

@@ -1,8 +1,8 @@
#include "scalar_rotate.cuh"
void scratch_cuda_integer_radix_scalar_rotate_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t glwe_dimension, uint32_t polynomial_size,
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
@@ -21,9 +21,9 @@ void scratch_cuda_integer_radix_scalar_rotate_kb_64(
}
void cuda_integer_radix_scalar_rotate_kb_64_inplace(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
uint32_t n, int8_t *mem_ptr, void **bsks, void **ksks,
uint32_t num_blocks) {
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array, uint32_t n, int8_t *mem_ptr, void *const *bsks,
void *const *ksks, uint32_t num_blocks) {
host_integer_radix_scalar_rotate_kb_inplace<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
@@ -32,8 +32,8 @@ void cuda_integer_radix_scalar_rotate_kb_64_inplace(
(uint64_t **)(ksks), num_blocks);
}
void cleanup_cuda_integer_radix_scalar_rotate(void **streams,
uint32_t *gpu_indexes,
void cleanup_cuda_integer_radix_scalar_rotate(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void) {

View File

@@ -4,7 +4,7 @@
#include "crypto/keyswitch.cuh"
#include "device.h"
#include "integer.cuh"
#include "integer.h"
#include "integer/integer_utilities.h"
#include "pbs/programmable_bootstrap_classic.cuh"
#include "pbs/programmable_bootstrap_multibit.cuh"
#include "types/complex/operations.cuh"
@@ -13,10 +13,10 @@
template <typename Torus>
__host__ void scratch_cuda_integer_radix_scalar_rotate_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
int_logical_scalar_shift_buffer<Torus> **mem_ptr, uint32_t num_radix_blocks,
int_radix_params params, SHIFT_OR_ROTATE_TYPE shift_type,
bool allocate_gpu_memory) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_logical_scalar_shift_buffer<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params,
SHIFT_OR_ROTATE_TYPE shift_type, bool allocate_gpu_memory) {
*mem_ptr = new int_logical_scalar_shift_buffer<Torus>(
streams, gpu_indexes, gpu_count, shift_type, params, num_radix_blocks,
@@ -25,9 +25,10 @@ __host__ void scratch_cuda_integer_radix_scalar_rotate_kb(
template <typename Torus>
__host__ void host_integer_radix_scalar_rotate_kb_inplace(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array, uint32_t n, int_logical_scalar_shift_buffer<Torus> *mem,
void **bsks, Torus **ksks, uint32_t num_blocks) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array, uint32_t n,
int_logical_scalar_shift_buffer<Torus> *mem, void *const *bsks,
Torus *const *ksks, uint32_t num_blocks) {
auto params = mem->params;
auto glwe_dimension = params.glwe_dimension;
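For the scalar rotate, the whole-block part of the operation is a plain rotation of the radix blocks: blocks wrap around instead of being dropped and padded as in the shifts. A cleartext sketch (illustrative only; the intra-block bit rotation handled through the PBS is omitted):

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

// Left-rotate a radix integer by n whole blocks; the block leaving the top
// re-enters at the bottom.
void rotate_blocks_left(std::vector<std::uint64_t> &blocks, std::size_t n) {
  if (blocks.empty())
    return;
  std::rotate(blocks.begin(), blocks.begin() + (n % blocks.size()),
              blocks.end());
}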

View File

@@ -1,8 +1,8 @@
#include "scalar_shifts.cuh"
void scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t glwe_dimension, uint32_t polynomial_size,
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
@@ -25,9 +25,9 @@ void scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
/// the application of a PBS onto the rotated blocks up to num_blocks -
/// rotations - 1. The remaining blocks are padded with zeros
void cuda_integer_radix_logical_scalar_shift_kb_64_inplace(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
uint32_t shift, int8_t *mem_ptr, void **bsks, void **ksks,
uint32_t num_blocks) {
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array, uint32_t shift, int8_t *mem_ptr, void *const *bsks,
void *const *ksks, uint32_t num_blocks) {
host_integer_radix_logical_scalar_shift_kb_inplace<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
@@ -37,8 +37,8 @@ void cuda_integer_radix_logical_scalar_shift_kb_64_inplace(
}
void scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t glwe_dimension, uint32_t polynomial_size,
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
@@ -64,9 +64,9 @@ void scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
/// block, which is copied onto all remaining blocks instead of padding with
/// zeros as would be done in the logical shift.
void cuda_integer_radix_arithmetic_scalar_shift_kb_64_inplace(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
uint32_t shift, int8_t *mem_ptr, void **bsks, void **ksks,
uint32_t num_blocks) {
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array, uint32_t shift, int8_t *mem_ptr, void *const *bsks,
void *const *ksks, uint32_t num_blocks) {
host_integer_radix_arithmetic_scalar_shift_kb_inplace<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
@@ -75,10 +75,9 @@ void cuda_integer_radix_arithmetic_scalar_shift_kb_64_inplace(
(uint64_t **)(ksks), num_blocks);
}
void cleanup_cuda_integer_radix_logical_scalar_shift(void **streams,
uint32_t *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void) {
void cleanup_cuda_integer_radix_logical_scalar_shift(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void) {
int_logical_scalar_shift_buffer<uint64_t> *mem_ptr =
(int_logical_scalar_shift_buffer<uint64_t> *)(*mem_ptr_void);
@@ -86,10 +85,9 @@ void cleanup_cuda_integer_radix_logical_scalar_shift(void **streams,
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
}
void cleanup_cuda_integer_radix_arithmetic_scalar_shift(void **streams,
uint32_t *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void) {
void cleanup_cuda_integer_radix_arithmetic_scalar_shift(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void) {
int_arithmetic_scalar_shift_buffer<uint64_t> *mem_ptr =
(int_arithmetic_scalar_shift_buffer<uint64_t> *)(*mem_ptr_void);
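The two comments in this file spell out the block-level contract: the logical shift pads vacated blocks with zeros, while the arithmetic shift pads with copies of the sign block. A cleartext sketch of that whole-block behavior (illustrative only; the intra-block bit shift performed through the PBS is omitted):

#include <cstddef>
#include <cstdint>
#include <vector>

// Right-shift a radix integer by 'rotations' whole blocks. Logical: vacated
// high blocks become zero. Arithmetic: they are filled with copies of the
// last (sign) block.
void shift_blocks_right(std::vector<std::uint64_t> &blocks,
                        std::size_t rotations, bool arithmetic) {
  std::size_t n = blocks.size();
  if (rotations > n)
    rotations = n;
  std::uint64_t pad = (arithmetic && n > 0) ? blocks[n - 1] : 0;
  for (std::size_t i = 0; i + rotations < n; ++i)
    blocks[i] = blocks[i + rotations];
  for (std::size_t i = n - rotations; i < n; ++i)
    blocks[i] = pad;
}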

View File

@@ -4,7 +4,7 @@
#include "crypto/keyswitch.cuh"
#include "device.h"
#include "integer.cuh"
#include "integer.h"
#include "integer/integer_utilities.h"
#include "pbs/programmable_bootstrap_classic.cuh"
#include "pbs/programmable_bootstrap_multibit.cuh"
#include "types/complex/operations.cuh"
@@ -13,10 +13,10 @@
template <typename Torus>
__host__ void scratch_cuda_integer_radix_logical_scalar_shift_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
int_logical_scalar_shift_buffer<Torus> **mem_ptr, uint32_t num_radix_blocks,
int_radix_params params, SHIFT_OR_ROTATE_TYPE shift_type,
bool allocate_gpu_memory) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_logical_scalar_shift_buffer<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params,
SHIFT_OR_ROTATE_TYPE shift_type, bool allocate_gpu_memory) {
*mem_ptr = new int_logical_scalar_shift_buffer<Torus>(
streams, gpu_indexes, gpu_count, shift_type, params, num_radix_blocks,
@@ -25,10 +25,10 @@ __host__ void scratch_cuda_integer_radix_logical_scalar_shift_kb(
template <typename Torus>
__host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array, uint32_t shift,
int_logical_scalar_shift_buffer<Torus> *mem, void **bsks, Torus **ksks,
uint32_t num_blocks) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array, uint32_t shift,
int_logical_scalar_shift_buffer<Torus> *mem, void *const *bsks,
Torus *const *ksks, uint32_t num_blocks) {
auto params = mem->params;
auto glwe_dimension = params.glwe_dimension;
@@ -116,8 +116,8 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
template <typename Torus>
__host__ void scratch_cuda_integer_radix_arithmetic_scalar_shift_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
int_arithmetic_scalar_shift_buffer<Torus> **mem_ptr,
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_arithmetic_scalar_shift_buffer<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params,
SHIFT_OR_ROTATE_TYPE shift_type, bool allocate_gpu_memory) {
@@ -128,10 +128,10 @@ __host__ void scratch_cuda_integer_radix_arithmetic_scalar_shift_kb(
template <typename Torus>
__host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array, uint32_t shift,
int_arithmetic_scalar_shift_buffer<Torus> *mem, void **bsks, Torus **ksks,
uint32_t num_blocks) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array, uint32_t shift,
int_arithmetic_scalar_shift_buffer<Torus> *mem, void *const *bsks,
Torus *const *ksks, uint32_t num_blocks) {
auto params = mem->params;
auto glwe_dimension = params.glwe_dimension;

View File

@@ -1,8 +1,8 @@
#include "shift_and_rotate.cuh"
void scratch_cuda_integer_radix_shift_and_rotate_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t glwe_dimension, uint32_t polynomial_size,
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
@@ -21,19 +21,20 @@ void scratch_cuda_integer_radix_shift_and_rotate_kb_64(
}
void cuda_integer_radix_shift_and_rotate_kb_64_inplace(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
void *lwe_shift, int8_t *mem_ptr, void **bsks, void **ksks,
uint32_t num_blocks) {
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array, void const *lwe_shift, int8_t *mem_ptr, void *const *bsks,
void *const *ksks, uint32_t num_blocks) {
host_integer_radix_shift_and_rotate_kb_inplace<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array), static_cast<uint64_t *>(lwe_shift),
static_cast<uint64_t *>(lwe_array),
static_cast<const uint64_t *>(lwe_shift),
(int_shift_and_rotate_buffer<uint64_t> *)mem_ptr, bsks,
(uint64_t **)(ksks), num_blocks);
}
void cleanup_cuda_integer_radix_shift_and_rotate(void **streams,
uint32_t *gpu_indexes,
void cleanup_cuda_integer_radix_shift_and_rotate(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void) {
int_shift_and_rotate_buffer<uint64_t> *mem_ptr =

View File

@@ -4,7 +4,7 @@
#include "crypto/keyswitch.cuh"
#include "device.h"
#include "integer.cuh"
#include "integer.h"
#include "integer/integer_utilities.h"
#include "pbs/programmable_bootstrap_classic.cuh"
#include "pbs/programmable_bootstrap_multibit.cuh"
#include "scalar_mul.cuh"
@@ -14,10 +14,10 @@
template <typename Torus>
__host__ void scratch_cuda_integer_radix_shift_and_rotate_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
int_shift_and_rotate_buffer<Torus> **mem_ptr, uint32_t num_radix_blocks,
int_radix_params params, SHIFT_OR_ROTATE_TYPE shift_type, bool is_signed,
bool allocate_gpu_memory) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_shift_and_rotate_buffer<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params,
SHIFT_OR_ROTATE_TYPE shift_type, bool is_signed, bool allocate_gpu_memory) {
*mem_ptr = new int_shift_and_rotate_buffer<Torus>(
streams, gpu_indexes, gpu_count, shift_type, is_signed, params,
num_radix_blocks, allocate_gpu_memory);
@@ -25,9 +25,10 @@ __host__ void scratch_cuda_integer_radix_shift_and_rotate_kb(
template <typename Torus>
__host__ void host_integer_radix_shift_and_rotate_kb_inplace(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array, Torus *lwe_shift, int_shift_and_rotate_buffer<Torus> *mem,
void **bsks, Torus **ksks, uint32_t num_radix_blocks) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array, Torus const *lwe_shift,
int_shift_and_rotate_buffer<Torus> *mem, void *const *bsks,
Torus *const *ksks, uint32_t num_radix_blocks) {
uint32_t bits_per_block = std::log2(mem->params.message_modulus);
uint32_t total_nb_bits = bits_per_block * num_radix_blocks;
if (total_nb_bits == 0)
@@ -60,8 +61,9 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
// Extracts bits and puts them in bit index 2 (=> bit number 3)
// so that it is already aligned to the correct position of the cmux input
// and we reduce noise growth
extract_n_bits<Torus>(streams, gpu_indexes, gpu_count, shift_bits, lwe_shift,
bsks, ksks, 1, max_num_bits_that_tell_shift,
extract_n_bits<Torus>(streams, gpu_indexes, gpu_count, shift_bits,
(Torus *)lwe_shift, bsks, ksks, 1,
max_num_bits_that_tell_shift,
mem->bit_extract_luts_with_offset_2);
// If signed, do an "arithmetic shift" by padding with the sign bit
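Here the shift amount itself is encrypted, so the bits extracted above drive a barrel shifter: bit k of the shift selects, through a cmux, between the running value and a copy shifted by 2^k. A cleartext sketch of that selection ladder, with the homomorphic cmux replaced by a plain 'if':

#include <cstddef>
#include <cstdint>
#include <vector>

// Barrel shifter driven by the bits of the shift amount (LSB first).
// Assumes the total shift stays below the word size.
std::uint64_t barrel_shift_left(std::uint64_t v,
                                const std::vector<int> &shift_bits) {
  for (std::size_t k = 0; k < shift_bits.size(); ++k)
    if (shift_bits[k])
      v <<= (std::uint64_t{1} << k);
  return v;
}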

View File

@@ -6,15 +6,15 @@
*/
void cuda_add_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
void *lwe_array_out,
void *lwe_array_in_1,
void *lwe_array_in_2,
void const *lwe_array_in_1,
void const *lwe_array_in_2,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
host_addition<uint32_t>(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_array_in_1),
static_cast<uint32_t *>(lwe_array_in_2),
static_cast<const uint32_t *>(lwe_array_in_1),
static_cast<const uint32_t *>(lwe_array_in_2),
input_lwe_dimension, input_lwe_ciphertext_count);
}
@@ -46,15 +46,15 @@ void cuda_add_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
*/
void cuda_add_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
void *lwe_array_out,
void *lwe_array_in_1,
void *lwe_array_in_2,
void const *lwe_array_in_1,
void const *lwe_array_in_2,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
host_addition<uint64_t>(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in_1),
static_cast<uint64_t *>(lwe_array_in_2),
static_cast<const uint64_t *>(lwe_array_in_1),
static_cast<const uint64_t *>(lwe_array_in_2),
input_lwe_dimension, input_lwe_ciphertext_count);
}
/*
@@ -62,15 +62,15 @@ void cuda_add_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
* plaintext vector. See the equivalent operation on u64 data for more details.
*/
void cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
void *plaintext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_array_in, void const *plaintext_array_in,
uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) {
host_addition_plaintext<uint32_t>(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_array_in),
static_cast<uint32_t *>(plaintext_array_in), input_lwe_dimension,
static_cast<const uint32_t *>(lwe_array_in),
static_cast<const uint32_t *>(plaintext_array_in), input_lwe_dimension,
input_lwe_ciphertext_count);
}
/*
@@ -102,14 +102,14 @@ void cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
* performs the operation on the GPU.
*/
void cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
void *plaintext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_array_in, void const *plaintext_array_in,
uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) {
host_addition_plaintext<uint64_t>(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(plaintext_array_in), input_lwe_dimension,
static_cast<const uint64_t *>(lwe_array_in),
static_cast<const uint64_t *>(plaintext_array_in), input_lwe_dimension,
input_lwe_ciphertext_count);
}

View File

@@ -13,9 +13,9 @@
#include <stdio.h>
template <typename T>
__global__ void plaintext_addition(T *output, T *lwe_input, T *plaintext_input,
uint32_t input_lwe_dimension,
uint32_t num_entries) {
__global__ void
plaintext_addition(T *output, T const *lwe_input, T const *plaintext_input,
uint32_t input_lwe_dimension, uint32_t num_entries) {
int tid = threadIdx.x;
int plaintext_index = blockIdx.x * blockDim.x + tid;
@@ -30,7 +30,7 @@ __global__ void plaintext_addition(T *output, T *lwe_input, T *plaintext_input,
template <typename T>
__host__ void
host_addition_plaintext(cudaStream_t stream, uint32_t gpu_index, T *output,
T *lwe_input, T *plaintext_input,
T const *lwe_input, T const *plaintext_input,
uint32_t lwe_dimension, uint32_t lwe_ciphertext_count) {
cudaSetDevice(gpu_index);
@@ -49,7 +49,7 @@ host_addition_plaintext(cudaStream_t stream, uint32_t gpu_index, T *output,
}
template <typename T>
__global__ void addition(T *output, T *input_1, T *input_2,
__global__ void addition(T *output, T const *input_1, T const *input_2,
uint32_t num_entries) {
int tid = threadIdx.x;
@@ -63,7 +63,7 @@ __global__ void addition(T *output, T *input_1, T *input_2,
// Coefficient-wise addition
template <typename T>
__host__ void host_addition(cudaStream_t stream, uint32_t gpu_index, T *output,
T *input_1, T *input_2,
T const *input_1, T const *input_2,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
@@ -83,7 +83,7 @@ __host__ void host_addition(cudaStream_t stream, uint32_t gpu_index, T *output,
}
template <typename T>
__global__ void subtraction(T *output, T *input_1, T *input_2,
__global__ void subtraction(T *output, T const *input_1, T const *input_2,
uint32_t num_entries) {
int tid = threadIdx.x;
@@ -97,7 +97,7 @@ __global__ void subtraction(T *output, T *input_1, T *input_2,
// Coefficient-wise subtraction
template <typename T>
__host__ void host_subtraction(cudaStream_t stream, uint32_t gpu_index,
T *output, T *input_1, T *input_2,
T *output, T const *input_1, T const *input_2,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
@@ -157,9 +157,11 @@ __host__ void host_subtraction_plaintext(cudaStream_t stream,
}
template <typename T>
__global__ void unchecked_sub_with_correcting_term(
T *output, T *input_1, T *input_2, uint32_t num_entries, uint32_t lwe_size,
uint32_t message_modulus, uint32_t carry_modulus, uint32_t degree) {
__global__ void
unchecked_sub_with_correcting_term(T *output, T const *input_1,
T const *input_2, uint32_t num_entries,
uint32_t lwe_size, uint32_t message_modulus,
uint32_t carry_modulus, uint32_t degree) {
uint32_t msg_mod = message_modulus;
  uint64_t z = max((uint64_t)ceil((double)degree / msg_mod), (uint64_t)1);
z *= msg_mod;
@@ -178,9 +180,10 @@ __global__ void unchecked_sub_with_correcting_term(
}
template <typename T>
__host__ void host_unchecked_sub_with_correcting_term(
cudaStream_t stream, uint32_t gpu_index, T *output, T *input_1, T *input_2,
uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count,
uint32_t message_modulus, uint32_t carry_modulus, uint32_t degree) {
cudaStream_t stream, uint32_t gpu_index, T *output, T const *input_1,
T const *input_2, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count, uint32_t message_modulus,
uint32_t carry_modulus, uint32_t degree) {
cudaSetDevice(gpu_index);
// lwe_size includes the presence of the body
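plaintext_addition touches exactly one coefficient per ciphertext: with the flattened LWE layout assumed here (lwe_dimension mask coefficients followed by the body), the plaintext lands on the body only. A CPU rendering of that indexing, assuming output already holds a copy of the input ciphertexts:

#include <cstddef>
#include <cstdint>

// Add plaintext[i] to the body coefficient of the i-th LWE ciphertext in a
// flattened array of (lwe_dimension + 1)-entry ciphertexts; arithmetic
// wraps modulo 2^64, as on the discretized torus.
void add_plaintext_to_bodies(std::uint64_t *output,
                             const std::uint64_t *plaintext,
                             std::uint32_t lwe_dimension,
                             std::uint32_t count) {
  for (std::uint32_t i = 0; i < count; ++i) {
    std::size_t body = std::size_t(i) * (lwe_dimension + 1) + lwe_dimension;
    output[body] += plaintext[i];
  }
}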

View File

@@ -5,15 +5,15 @@
* cleartext vector. See the equivalent operation on u64 data for more details.
*/
void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32(
void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
void *cleartext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_array_in, void const *cleartext_array_in,
uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) {
host_cleartext_vec_multiplication<uint32_t>(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_array_in),
static_cast<uint32_t *>(cleartext_array_in), input_lwe_dimension,
static_cast<const uint32_t *>(lwe_array_in),
static_cast<const uint32_t *>(cleartext_array_in), input_lwe_dimension,
input_lwe_ciphertext_count);
}
/*
@@ -45,14 +45,14 @@ void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32(
* function that performs the operation on the GPU.
*/
void cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
void *cleartext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_array_in, void const *cleartext_array_in,
uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) {
host_cleartext_vec_multiplication<uint64_t>(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(cleartext_array_in), input_lwe_dimension,
static_cast<const uint64_t *>(lwe_array_in),
static_cast<const uint64_t *>(cleartext_array_in), input_lwe_dimension,
input_lwe_ciphertext_count);
}

View File

@@ -14,8 +14,8 @@
#include <vector>
template <typename T>
__global__ void cleartext_vec_multiplication(T *output, T *lwe_input,
T *cleartext_input,
__global__ void cleartext_vec_multiplication(T *output, T const *lwe_input,
T const *cleartext_input,
uint32_t input_lwe_dimension,
uint32_t num_entries) {
@@ -29,11 +29,10 @@ __global__ void cleartext_vec_multiplication(T *output, T *lwe_input,
}
template <typename T>
__host__ void
host_cleartext_vec_multiplication(cudaStream_t stream, uint32_t gpu_index,
T *output, T *lwe_input, T *cleartext_input,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
__host__ void host_cleartext_vec_multiplication(
cudaStream_t stream, uint32_t gpu_index, T *output, T const *lwe_input,
T const *cleartext_input, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
cudaSetDevice(gpu_index);
// lwe_size includes the presence of the body
@@ -53,7 +52,7 @@ host_cleartext_vec_multiplication(cudaStream_t stream, uint32_t gpu_index,
template <typename T>
__global__ void
cleartext_multiplication(T *output, T *lwe_input, T cleartext_input,
cleartext_multiplication(T *output, T const *lwe_input, T cleartext_input,
uint32_t input_lwe_dimension, uint32_t num_entries) {
int tid = threadIdx.x;
@@ -67,7 +66,7 @@ cleartext_multiplication(T *output, T *lwe_input, T cleartext_input,
template <typename T>
__host__ void
host_cleartext_multiplication(cudaStream_t stream, uint32_t gpu_index,
T *output, T *lwe_input, T cleartext_input,
T *output, T const *lwe_input, T cleartext_input,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {

View File

@@ -6,13 +6,13 @@
*/
void cuda_negate_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
void *lwe_array_out,
void *lwe_array_in,
void const *lwe_array_in,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
host_negation<uint32_t>(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_array_in),
static_cast<const uint32_t *>(lwe_array_in),
input_lwe_dimension, input_lwe_ciphertext_count);
}
@@ -40,12 +40,12 @@ void cuda_negate_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
*/
void cuda_negate_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
void *lwe_array_out,
void *lwe_array_in,
void const *lwe_array_in,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
host_negation<uint64_t>(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in),
static_cast<const uint64_t *>(lwe_array_in),
input_lwe_dimension, input_lwe_ciphertext_count);
}

View File

@@ -11,7 +11,7 @@
#include "linear_algebra.h"
template <typename T>
__global__ void negation(T *output, T *input, uint32_t num_entries) {
__global__ void negation(T *output, T const *input, uint32_t num_entries) {
int tid = threadIdx.x;
int index = blockIdx.x * blockDim.x + tid;
@@ -23,7 +23,7 @@ __global__ void negation(T *output, T *input, uint32_t num_entries) {
template <typename T>
__host__ void host_negation(cudaStream_t stream, uint32_t gpu_index, T *output,
T *input, uint32_t input_lwe_dimension,
T const *input, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
cudaSetDevice(gpu_index);
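host_negation negates every coefficient; on unsigned 64-bit words, negation modulo 2^64 is plain two's-complement wrap-around:

#include <cstdint>

// Modular negation on the discretized torus: -x mod 2^64 equals the
// two's-complement negation of the unsigned word.
inline std::uint64_t torus_negate(std::uint64_t x) { return ~x + 1; }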

View File

@@ -1,29 +1,29 @@
#include "bootstrapping_key.cuh"
void cuda_convert_lwe_programmable_bootstrap_key_32(
void *stream, uint32_t gpu_index, void *dest, void *src,
void *stream, uint32_t gpu_index, void *dest, void const *src,
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size) {
uint32_t total_polynomials =
input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * level_count;
cuda_convert_lwe_programmable_bootstrap_key<uint32_t, int32_t>(
static_cast<cudaStream_t>(stream), gpu_index, (double2 *)dest,
(int32_t *)src, polynomial_size, total_polynomials);
(const int32_t *)src, polynomial_size, total_polynomials);
}
void cuda_convert_lwe_programmable_bootstrap_key_64(
void *stream, uint32_t gpu_index, void *dest, void *src,
void *stream, uint32_t gpu_index, void *dest, void const *src,
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size) {
uint32_t total_polynomials =
input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * level_count;
cuda_convert_lwe_programmable_bootstrap_key<uint64_t, int64_t>(
static_cast<cudaStream_t>(stream), gpu_index, (double2 *)dest,
(int64_t *)src, polynomial_size, total_polynomials);
(const int64_t *)src, polynomial_size, total_polynomials);
}
void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
void *stream, uint32_t gpu_index, void *dest, void *src,
void *stream, uint32_t gpu_index, void *dest, void const *src,
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size, uint32_t grouping_factor) {
uint32_t total_polynomials = input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) *
@@ -89,3 +89,175 @@ template __device__ const double2 *get_multi_bit_ith_lwe_gth_group_kth_block(
const double2 *ptr, int g, int i, int k, int level,
uint32_t grouping_factor, uint32_t polynomial_size, uint32_t glwe_dimension,
uint32_t level_count);
void cuda_fourier_polynomial_mul(void *stream_v, uint32_t gpu_index,
void const *_input1, void const *_input2,
void *_output, uint32_t polynomial_size,
uint32_t total_polynomials) {
auto stream = static_cast<cudaStream_t>(stream_v);
cudaSetDevice(gpu_index);
auto input1 = (double2 *)_input1;
auto input2 = (double2 *)_input2;
auto output = (double2 *)_output;
size_t shared_memory_size = sizeof(double2) * polynomial_size / 2;
int gridSize = total_polynomials;
int blockSize = polynomial_size / choose_opt_amortized(polynomial_size);
double2 *buffer;
switch (polynomial_size) {
case 256:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>,
FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>,
FULLSM>,
cudaFuncCachePreferShared));
batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
output, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, stream>>>(input1, input2, output, buffer);
}
break;
case 512:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>,
FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>,
FULLSM>,
cudaFuncCachePreferShared));
batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
output, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, stream>>>(input1, input2, output, buffer);
}
break;
case 1024:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>,
FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>,
FULLSM>,
cudaFuncCachePreferShared));
batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
output, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, stream>>>(input1, input2, output, buffer);
}
break;
case 2048:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>,
FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>,
FULLSM>,
cudaFuncCachePreferShared));
batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
output, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, stream>>>(input1, input2, output, buffer);
}
break;
case 4096:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>,
FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>,
FULLSM>,
cudaFuncCachePreferShared));
batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
output, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, stream>>>(input1, input2, output, buffer);
}
break;
case 8192:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>,
FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>,
FULLSM>,
cudaFuncCachePreferShared));
batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
output, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, stream>>>(input1, input2, output, buffer);
}
break;
case 16384:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>,
FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>,
FULLSM>,
cudaFuncCachePreferShared));
batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>,
FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
output, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, stream>>>(input1, input2, output, buffer);
}
break;
default:
break;
}
cuda_drop_async(buffer, stream, gpu_index);
}
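cuda_fourier_polynomial_mul dispatches batch_polynomial_mul per polynomial size, taking the shared-memory (FULLSM) path when the FFT scratch fits on-chip and the global-memory (NOSM) fallback otherwise. Once both operands are in the Fourier domain, the product itself reduces to a pointwise complex multiplication over the N/2 stored coefficients; a stripped-down CUDA sketch of just that step (the batching and shared-memory staging of batch_polynomial_mul are omitted):

#include <cuda_runtime.h>

// Pointwise complex product of two spectra of n_half double2 coefficients.
__global__ void pointwise_complex_mul(const double2 *a, const double2 *b,
                                      double2 *out, int n_half) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n_half)
    out[i] = make_double2(a[i].x * b[i].x - a[i].y * b[i].y,
                          a[i].x * b[i].y + a[i].y * b[i].x);
}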

View File

@@ -3,9 +3,9 @@
#include "device.h"
#include "fft/bnsmfft.cuh"
#include "pbs/programmable_bootstrap.h"
#include "pbs/programmable_bootstrap_multibit.h"
#include "polynomial/parameters.cuh"
#include "programmable_bootstrap.h"
#include "programmable_bootstrap_multibit.h"
#include <atomic>
#include <cstdint>
@@ -75,7 +75,7 @@ __device__ const T *get_multi_bit_ith_lwe_gth_group_kth_block(
template <typename T, typename ST>
void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
uint32_t gpu_index,
double2 *dest, ST *src,
double2 *dest, ST const *src,
uint32_t polynomial_size,
uint32_t total_polynomials) {
cudaSetDevice(gpu_index);
@@ -249,175 +249,4 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
cudaFreeHost(h_bsk);
}
void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
void *_input1, void *_input2, void *_output,
uint32_t polynomial_size,
uint32_t total_polynomials) {
cudaSetDevice(gpu_index);
auto input1 = (double2 *)_input1;
auto input2 = (double2 *)_input2;
auto output = (double2 *)_output;
size_t shared_memory_size = sizeof(double2) * polynomial_size / 2;
int gridSize = total_polynomials;
int blockSize = polynomial_size / choose_opt_amortized(polynomial_size);
double2 *buffer;
switch (polynomial_size) {
case 256:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>,
FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>,
FULLSM>,
cudaFuncCachePreferShared));
batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
output, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, stream>>>(input1, input2, output, buffer);
}
break;
case 512:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>,
FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>,
FULLSM>,
cudaFuncCachePreferShared));
batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
output, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, stream>>>(input1, input2, output, buffer);
}
break;
case 1024:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>,
FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>,
FULLSM>,
cudaFuncCachePreferShared));
batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
output, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, stream>>>(input1, input2, output, buffer);
}
break;
case 2048:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>,
FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>,
FULLSM>,
cudaFuncCachePreferShared));
batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
output, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, stream>>>(input1, input2, output, buffer);
}
break;
case 4096:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>,
FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>,
FULLSM>,
cudaFuncCachePreferShared));
batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
output, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, stream>>>(input1, input2, output, buffer);
}
break;
case 8192:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>,
FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>,
FULLSM>,
cudaFuncCachePreferShared));
batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
output, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, stream>>>(input1, input2, output, buffer);
}
break;
case 16384:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>,
FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>,
FULLSM>,
cudaFuncCachePreferShared));
batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>,
FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
output, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, stream>>>(input1, input2, output, buffer);
}
break;
default:
break;
}
cuda_drop_async(buffer, stream, gpu_index);
}
#endif // CNCRT_BSK_H

View File

@@ -1,12 +1,12 @@
#ifndef CUDA_PROGRAMMABLE_BOOTSTRAP_CUH
#define CUDA_PROGRAMMABLE_BOOTSTRAP_CUH
#include "bootstrapping_key.cuh"
#include "cooperative_groups.h"
#include "device.h"
#include "fft/bnsmfft.cuh"
#include "helper_multi_gpu.h"
#include "programmable_bootstrap.h"
#include "programmable_bootstrap_multibit.h"
#include "pbs/programmable_bootstrap_multibit.h"
using namespace cooperative_groups;
namespace cg = cooperative_groups;
@@ -117,18 +117,22 @@ mul_ggsw_glwe(Torus *accumulator, double2 *fft, double2 *join_buffer,
}
template <typename Torus>
void execute_pbs_async(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
const LweArrayVariant<Torus> &lwe_array_out,
const LweArrayVariant<Torus> &lwe_output_indexes,
std::vector<Torus *> lut_vec, std::vector<Torus *> lut_indexes_vec,
const LweArrayVariant<Torus> &lwe_array_in,
const LweArrayVariant<Torus> &lwe_input_indexes, void **bootstrapping_keys,
std::vector<int8_t *> pbs_buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, PBS_TYPE pbs_type, uint32_t lut_count,
uint32_t lut_stride) {
void execute_pbs_async(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count,
const LweArrayVariant<Torus> &lwe_array_out,
const LweArrayVariant<Torus> &lwe_output_indexes,
const std::vector<Torus *> lut_vec,
const std::vector<Torus *> lut_indexes_vec,
const LweArrayVariant<Torus> &lwe_array_in,
const LweArrayVariant<Torus> &lwe_input_indexes,
void *const *bootstrapping_keys,
std::vector<int8_t *> pbs_buffer,
uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, PBS_TYPE pbs_type,
uint32_t lut_count, uint32_t lut_stride) {
switch (sizeof(Torus)) {
case sizeof(uint32_t):
// 32 bits

View File

@@ -126,8 +126,9 @@ void scratch_cuda_programmable_bootstrap_amortized_64(
*/
void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
void const *lwe_output_indexes, void const *lut_vector,
void const *lut_vector_indexes, void const *lwe_array_in,
void const *lwe_input_indexes, void const *bootstrapping_key,
int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples) {
@@ -264,8 +265,9 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
*/
void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
void const *lwe_output_indexes, void const *lut_vector,
void const *lut_vector_indexes, void const *lwe_array_in,
void const *lwe_input_indexes, void const *bootstrapping_key,
int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples) {

View File

@@ -6,15 +6,16 @@
#include <cuda_runtime.h>
#endif
#include "bootstrapping_key.cuh"
#include "crypto/gadget.cuh"
#include "crypto/torus.cuh"
#include "device.h"
#include "fft/bnsmfft.cuh"
#include "fft/twiddles.cuh"
#include "pbs/programmable_bootstrap.h"
#include "polynomial/functions.cuh"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial_math.cuh"
#include "programmable_bootstrap.h"
#include "types/complex/operations.cuh"
template <typename Torus, class params, sharedMemDegree SMD>

View File

@@ -12,10 +12,11 @@
#include "device.h"
#include "fft/bnsmfft.cuh"
#include "fft/twiddles.cuh"
#include "pbs/pbs_utilities.h"
#include "pbs/programmable_bootstrap.h"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial_math.cuh"
#include "programmable_bootstrap.cuh"
#include "programmable_bootstrap.h"
#include "types/complex/operations.cuh"
using namespace cooperative_groups;
@@ -228,8 +229,9 @@ __host__ void scratch_programmable_bootstrap_cg(
template <typename Torus, class params>
__host__ void host_programmable_bootstrap_cg(
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,

View File

@@ -8,11 +8,12 @@
#include "device.h"
#include "fft/bnsmfft.cuh"
#include "fft/twiddles.cuh"
#include "pbs/pbs_multibit_utilities.h"
#include "pbs/programmable_bootstrap.h"
#include "polynomial/functions.cuh"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial_math.cuh"
#include "programmable_bootstrap.cuh"
#include "programmable_bootstrap.h"
#include "programmable_bootstrap_multibit.cuh"
#include "types/complex/operations.cuh"
#include <vector>
@@ -285,13 +286,14 @@ __host__ void scratch_cg_multi_bit_programmable_bootstrap(
template <typename Torus, class params>
__host__ void execute_cg_external_product_loop(
cudaStream_t stream, uint32_t gpu_index, Torus *lut_vector,
Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
Torus *lwe_array_out, Torus *lwe_output_indexes,
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
uint32_t lwe_offset, uint32_t lut_count, uint32_t lut_stride) {
cudaStream_t stream, uint32_t gpu_index, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lwe_input_indexes, Torus *lwe_array_out,
Torus const *lwe_output_indexes, pbs_buffer<Torus, MULTI_BIT> *buffer,
uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
uint32_t level_count, uint32_t lwe_offset, uint32_t lut_count,
uint32_t lut_stride) {
auto lwe_chunk_size = buffer->lwe_chunk_size;
uint64_t full_dm =
@@ -369,8 +371,9 @@ __host__ void execute_cg_external_product_loop(
template <typename Torus, class params>
__host__ void host_cg_multi_bit_programmable_bootstrap(
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, Torus *lwe_input_indexes, uint64_t *bootstrapping_key,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lwe_input_indexes, uint64_t const *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,

View File

@@ -118,8 +118,9 @@ void scratch_cuda_programmable_bootstrap_tbc(
template <typename Torus>
void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
@@ -374,8 +375,9 @@ void scratch_cuda_programmable_bootstrap_64(
template <typename Torus>
void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
@@ -448,8 +450,9 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
template <typename Torus>
void cuda_programmable_bootstrap_lwe_ciphertext_vector(
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
@@ -523,8 +526,9 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
*/
void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
void const *lwe_output_indexes, void const *lut_vector,
void const *lut_vector_indexes, void const *lwe_array_in,
void const *lwe_input_indexes, void const *bootstrapping_key,
int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t lut_count, uint32_t lut_stride) {
@@ -540,12 +544,12 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
#if CUDA_ARCH >= 900
cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint32_t>(
stream, gpu_index, static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_output_indexes),
static_cast<uint32_t *>(lut_vector),
static_cast<uint32_t *>(lut_vector_indexes),
static_cast<uint32_t *>(lwe_array_in),
static_cast<uint32_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
static_cast<const uint32_t *>(lwe_output_indexes),
static_cast<const uint32_t *>(lut_vector),
static_cast<const uint32_t *>(lut_vector_indexes),
static_cast<const uint32_t *>(lwe_array_in),
static_cast<const uint32_t *>(lwe_input_indexes),
static_cast<const double2 *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
lut_count, lut_stride);
break;
@@ -555,24 +559,24 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
case CG:
cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint32_t>(
stream, gpu_index, static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_output_indexes),
static_cast<uint32_t *>(lut_vector),
static_cast<uint32_t *>(lut_vector_indexes),
static_cast<uint32_t *>(lwe_array_in),
static_cast<uint32_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
static_cast<const uint32_t *>(lwe_output_indexes),
static_cast<const uint32_t *>(lut_vector),
static_cast<const uint32_t *>(lut_vector_indexes),
static_cast<const uint32_t *>(lwe_array_in),
static_cast<const uint32_t *>(lwe_input_indexes),
static_cast<const double2 *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
lut_count, lut_stride);
break;
case DEFAULT:
cuda_programmable_bootstrap_lwe_ciphertext_vector<uint32_t>(
stream, gpu_index, static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_output_indexes),
static_cast<uint32_t *>(lut_vector),
static_cast<uint32_t *>(lut_vector_indexes),
static_cast<uint32_t *>(lwe_array_in),
static_cast<uint32_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
static_cast<const uint32_t *>(lwe_output_indexes),
static_cast<const uint32_t *>(lut_vector),
static_cast<const uint32_t *>(lut_vector_indexes),
static_cast<const uint32_t *>(lwe_array_in),
static_cast<const uint32_t *>(lwe_input_indexes),
static_cast<const double2 *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
lut_count, lut_stride);
break;
@@ -644,8 +648,9 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
*/
void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
void const *lwe_output_indexes, void const *lut_vector,
void const *lut_vector_indexes, void const *lwe_array_in,
void const *lwe_input_indexes, void const *bootstrapping_key,
int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t lut_count, uint32_t lut_stride) {
@@ -660,12 +665,12 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
#if (CUDA_ARCH >= 900)
cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint64_t>(
stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
static_cast<const uint64_t *>(lwe_output_indexes),
static_cast<const uint64_t *>(lut_vector),
static_cast<const uint64_t *>(lut_vector_indexes),
static_cast<const uint64_t *>(lwe_array_in),
static_cast<const uint64_t *>(lwe_input_indexes),
static_cast<const double2 *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
lut_count, lut_stride);
break;
@@ -675,24 +680,24 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
case PBS_VARIANT::CG:
cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint64_t>(
stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
static_cast<const uint64_t *>(lwe_output_indexes),
static_cast<const uint64_t *>(lut_vector),
static_cast<const uint64_t *>(lut_vector_indexes),
static_cast<const uint64_t *>(lwe_array_in),
static_cast<const uint64_t *>(lwe_input_indexes),
static_cast<const double2 *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
lut_count, lut_stride);
break;
case PBS_VARIANT::DEFAULT:
cuda_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
static_cast<const uint64_t *>(lwe_output_indexes),
static_cast<const uint64_t *>(lut_vector),
static_cast<const uint64_t *>(lut_vector_indexes),
static_cast<const uint64_t *>(lwe_array_in),
static_cast<const uint64_t *>(lwe_input_indexes),
static_cast<const double2 *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
lut_count, lut_stride);
break;
@@ -717,9 +722,9 @@ template bool has_support_to_cuda_programmable_bootstrap_cg<uint64_t>(
template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint64_t>(
void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
uint64_t *lwe_output_indexes, uint64_t *lut_vector,
uint64_t *lut_vector_indexes, uint64_t *lwe_array_in,
uint64_t *lwe_input_indexes, double2 *bootstrapping_key,
uint64_t const *lwe_output_indexes, uint64_t const *lut_vector,
uint64_t const *lut_vector_indexes, uint64_t const *lwe_array_in,
uint64_t const *lwe_input_indexes, double2 const *bootstrapping_key,
pbs_buffer<uint64_t, CLASSICAL> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
@@ -727,9 +732,9 @@ template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint64_t>(
template void cuda_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
uint64_t *lwe_output_indexes, uint64_t *lut_vector,
uint64_t *lut_vector_indexes, uint64_t *lwe_array_in,
uint64_t *lwe_input_indexes, double2 *bootstrapping_key,
uint64_t const *lwe_output_indexes, uint64_t const *lut_vector,
uint64_t const *lut_vector_indexes, uint64_t const *lwe_array_in,
uint64_t const *lwe_input_indexes, double2 const *bootstrapping_key,
pbs_buffer<uint64_t, CLASSICAL> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
@@ -748,9 +753,9 @@ template void scratch_cuda_programmable_bootstrap<uint64_t>(
template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint32_t>(
void *stream, uint32_t gpu_index, uint32_t *lwe_array_out,
uint32_t *lwe_output_indexes, uint32_t *lut_vector,
uint32_t *lut_vector_indexes, uint32_t *lwe_array_in,
uint32_t *lwe_input_indexes, double2 *bootstrapping_key,
uint32_t const *lwe_output_indexes, uint32_t const *lut_vector,
uint32_t const *lut_vector_indexes, uint32_t const *lwe_array_in,
uint32_t const *lwe_input_indexes, double2 const *bootstrapping_key,
pbs_buffer<uint32_t, CLASSICAL> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
@@ -758,9 +763,9 @@ template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint32_t>(
template void cuda_programmable_bootstrap_lwe_ciphertext_vector<uint32_t>(
void *stream, uint32_t gpu_index, uint32_t *lwe_array_out,
uint32_t *lwe_output_indexes, uint32_t *lut_vector,
uint32_t *lut_vector_indexes, uint32_t *lwe_array_in,
uint32_t *lwe_input_indexes, double2 *bootstrapping_key,
uint32_t const *lwe_output_indexes, uint32_t const *lut_vector,
uint32_t const *lut_vector_indexes, uint32_t const *lwe_array_in,
uint32_t const *lwe_input_indexes, double2 const *bootstrapping_key,
pbs_buffer<uint32_t, CLASSICAL> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
@@ -787,18 +792,18 @@ template bool has_support_to_cuda_programmable_bootstrap_tbc<uint64_t>(
#if CUDA_ARCH >= 900
template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint32_t>(
void *stream, uint32_t gpu_index, uint32_t *lwe_array_out,
uint32_t *lwe_output_indexes, uint32_t *lut_vector,
uint32_t *lut_vector_indexes, uint32_t *lwe_array_in,
uint32_t *lwe_input_indexes, double2 *bootstrapping_key,
uint32_t const *lwe_output_indexes, uint32_t const *lut_vector,
uint32_t const *lut_vector_indexes, uint32_t const *lwe_array_in,
uint32_t const *lwe_input_indexes, double2 const *bootstrapping_key,
pbs_buffer<uint32_t, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
uint32_t lut_stride);
template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint64_t>(
void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
uint64_t *lwe_output_indexes, uint64_t *lut_vector,
uint64_t *lut_vector_indexes, uint64_t *lwe_array_in,
uint64_t *lwe_input_indexes, double2 *bootstrapping_key,
uint64_t const *lwe_output_indexes, uint64_t const *lut_vector,
uint64_t const *lut_vector_indexes, uint64_t const *lwe_array_in,
uint64_t const *lwe_input_indexes, double2 const *bootstrapping_key,
pbs_buffer<uint64_t, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
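
A note on the pattern in this file: the const qualifiers are added at the C ABI boundary because bindgen translates pointer constness literally — `void *` becomes `*mut c_void` and `void const *` becomes `*const c_void` — so const-correct entry points yield Rust bindings whose read-only inputs are `*const` pointers. Below is a hedged sketch of the declaration bindgen should emit for the 64-bit classical entry point above; the generated file itself is in one of the suppressed diffs, so exact type paths may differ. The multi-bit entry points further down follow the same convention.

use std::ffi::c_void;

extern "C" {
    pub fn cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
        stream: *mut c_void,
        gpu_index: u32,
        lwe_array_out: *mut c_void,
        // `void const *` parameters surface as `*const` raw pointers:
        lwe_output_indexes: *const c_void,
        lut_vector: *const c_void,
        lut_vector_indexes: *const c_void,
        lwe_array_in: *const c_void,
        lwe_input_indexes: *const c_void,
        bootstrapping_key: *const c_void,
        mem_ptr: *mut i8,
        lwe_dimension: u32,
        glwe_dimension: u32,
        polynomial_size: u32,
        base_log: u32,
        level_count: u32,
        num_samples: u32,
        lut_count: u32,
        lut_stride: u32,
    );
}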

View File

@@ -11,9 +11,10 @@
#include "device.h"
#include "fft/bnsmfft.cuh"
#include "fft/twiddles.cuh"
#include "pbs/pbs_utilities.h"
#include "pbs/programmable_bootstrap.h"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial_math.cuh"
#include "programmable_bootstrap.h"
#include "types/complex/operations.cuh"
template <typename Torus, class params, sharedMemDegree SMD>
@@ -363,16 +364,15 @@ __host__ void scratch_programmable_bootstrap(
}
template <typename Torus, class params>
__host__ void
execute_step_one(cudaStream_t stream, uint32_t gpu_index, Torus *lut_vector,
Torus *lut_vector_indexes, Torus *lwe_array_in,
Torus *lwe_input_indexes, double2 *bootstrapping_key,
Torus *global_accumulator, double2 *global_accumulator_fft,
uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, int8_t *d_mem,
int lwe_iteration, uint64_t partial_sm, uint64_t partial_dm,
uint64_t full_sm, uint64_t full_dm) {
__host__ void execute_step_one(
cudaStream_t stream, uint32_t gpu_index, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
Torus *global_accumulator, double2 *global_accumulator_fft,
uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, int8_t *d_mem, int lwe_iteration, uint64_t partial_sm,
uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm) {
int max_shared_memory = cuda_get_max_shared_memory(0);
cudaSetDevice(gpu_index);
@@ -407,13 +407,14 @@ execute_step_one(cudaStream_t stream, uint32_t gpu_index, Torus *lut_vector,
template <typename Torus, class params>
__host__ void execute_step_two(
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
double2 *bootstrapping_key, Torus *global_accumulator,
double2 *global_accumulator_fft, uint32_t input_lwe_ciphertext_count,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, int8_t *d_mem, int lwe_iteration,
uint64_t partial_sm, uint64_t partial_dm, uint64_t full_sm,
uint64_t full_dm, uint32_t lut_count, uint32_t lut_stride) {
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, double2 const *bootstrapping_key,
Torus *global_accumulator, double2 *global_accumulator_fft,
uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, int8_t *d_mem, int lwe_iteration, uint64_t partial_sm,
uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm, uint32_t lut_count,
uint32_t lut_stride) {
int max_shared_memory = cuda_get_max_shared_memory(0);
cudaSetDevice(gpu_index);
@@ -450,8 +451,9 @@ __host__ void execute_step_two(
template <typename Torus, class params>
__host__ void host_programmable_bootstrap(
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *pbs_buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,

View File

@@ -1,7 +1,7 @@
#include "../polynomial/parameters.cuh"
#include "pbs/programmable_bootstrap_multibit.h"
#include "programmable_bootstrap_cg_multibit.cuh"
#include "programmable_bootstrap_multibit.cuh"
#include "programmable_bootstrap_multibit.h"
#if (CUDA_ARCH >= 900)
#include "programmable_bootstrap_tbc_multibit.cuh"
@@ -61,8 +61,9 @@ bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit(
template <typename Torus>
void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
@@ -138,8 +139,9 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
template <typename Torus>
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
@@ -214,8 +216,9 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
void const *lwe_output_indexes, void const *lut_vector,
void const *lut_vector_indexes, void const *lwe_array_in,
void const *lwe_input_indexes, void const *bootstrapping_key,
int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
@@ -229,12 +232,12 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
#if CUDA_ARCH >= 900
cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), buffer, lwe_dimension,
static_cast<const uint64_t *>(lwe_output_indexes),
static_cast<const uint64_t *>(lut_vector),
static_cast<const uint64_t *>(lut_vector_indexes),
static_cast<const uint64_t *>(lwe_array_in),
static_cast<const uint64_t *>(lwe_input_indexes),
static_cast<const uint64_t *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, lut_count, lut_stride);
break;
@@ -244,24 +247,24 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
case PBS_VARIANT::CG:
cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), buffer, lwe_dimension,
static_cast<const uint64_t *>(lwe_output_indexes),
static_cast<const uint64_t *>(lut_vector),
static_cast<const uint64_t *>(lut_vector_indexes),
static_cast<const uint64_t *>(lwe_array_in),
static_cast<const uint64_t *>(lwe_input_indexes),
static_cast<const uint64_t *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, lut_count, lut_stride);
break;
case PBS_VARIANT::DEFAULT:
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), buffer, lwe_dimension,
static_cast<const uint64_t *>(lwe_output_indexes),
static_cast<const uint64_t *>(lut_vector),
static_cast<const uint64_t *>(lut_vector_indexes),
static_cast<const uint64_t *>(lwe_array_in),
static_cast<const uint64_t *>(lwe_input_indexes),
static_cast<const uint64_t *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, lut_count, lut_stride);
break;
@@ -493,9 +496,9 @@ template void scratch_cuda_multi_bit_programmable_bootstrap<uint64_t>(
template void
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
uint64_t *lwe_output_indexes, uint64_t *lut_vector,
uint64_t *lut_vector_indexes, uint64_t *lwe_array_in,
uint64_t *lwe_input_indexes, uint64_t *bootstrapping_key,
uint64_t const *lwe_output_indexes, uint64_t const *lut_vector,
uint64_t const *lut_vector_indexes, uint64_t const *lwe_array_in,
uint64_t const *lwe_input_indexes, uint64_t const *bootstrapping_key,
pbs_buffer<uint64_t, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
@@ -510,9 +513,9 @@ template void scratch_cuda_cg_multi_bit_programmable_bootstrap<uint64_t>(
template void
cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
uint64_t *lwe_output_indexes, uint64_t *lut_vector,
uint64_t *lut_vector_indexes, uint64_t *lwe_array_in,
uint64_t *lwe_input_indexes, uint64_t *bootstrapping_key,
uint64_t const *lwe_output_indexes, uint64_t const *lut_vector,
uint64_t const *lut_vector_indexes, uint64_t const *lwe_array_in,
uint64_t const *lwe_input_indexes, uint64_t const *bootstrapping_key,
pbs_buffer<uint64_t, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
@@ -582,8 +585,9 @@ void scratch_cuda_tbc_multi_bit_programmable_bootstrap(
template <typename Torus>
void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
@@ -678,9 +682,9 @@ template void scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t>(
template void
cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
uint64_t *lwe_output_indexes, uint64_t *lut_vector,
uint64_t *lut_vector_indexes, uint64_t *lwe_array_in,
uint64_t *lwe_input_indexes, uint64_t *bootstrapping_key,
uint64_t const *lwe_output_indexes, uint64_t const *lut_vector,
uint64_t const *lut_vector_indexes, uint64_t const *lwe_array_in,
uint64_t const *lwe_input_indexes, uint64_t const *bootstrapping_key,
pbs_buffer<uint64_t, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,

View File

@@ -8,12 +8,13 @@
#include "device.h"
#include "fft/bnsmfft.cuh"
#include "fft/twiddles.cuh"
#include "pbs/pbs_multibit_utilities.h"
#include "pbs/programmable_bootstrap.h"
#include "pbs/programmable_bootstrap_multibit.h"
#include "polynomial/functions.cuh"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial_math.cuh"
#include "programmable_bootstrap.h"
#include "programmable_bootstrap_cg_classic.cuh"
#include "programmable_bootstrap_multibit.h"
#include "types/complex/operations.cuh"
#include <vector>
@@ -489,8 +490,8 @@ __host__ void scratch_multi_bit_programmable_bootstrap(
template <typename Torus, class params>
__host__ void execute_compute_keybundle(
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_in,
Torus *lwe_input_indexes, Torus *bootstrapping_key,
cudaStream_t stream, uint32_t gpu_index, Torus const *lwe_array_in,
Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t grouping_factor, uint32_t level_count, uint32_t lwe_offset) {
@@ -537,12 +538,14 @@ __host__ void execute_compute_keybundle(
}
template <typename Torus, class params>
__host__ void execute_step_one(
cudaStream_t stream, uint32_t gpu_index, Torus *lut_vector,
Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t j, uint32_t lwe_offset) {
__host__ void
execute_step_one(cudaStream_t stream, uint32_t gpu_index,
Torus const *lut_vector, Torus const *lut_vector_indexes,
Torus const *lwe_array_in, Torus const *lwe_input_indexes,
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t j, uint32_t lwe_offset) {
uint64_t full_sm_accumulate_step_one =
get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one<Torus>(
@@ -593,7 +596,7 @@ __host__ void execute_step_one(
template <typename Torus, class params>
__host__ void execute_step_two(
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_output_indexes, pbs_buffer<Torus, MULTI_BIT> *buffer,
Torus const *lwe_output_indexes, pbs_buffer<Torus, MULTI_BIT> *buffer,
uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, int32_t grouping_factor, uint32_t level_count,
uint32_t j, uint32_t lwe_offset, uint32_t lut_count, uint32_t lut_stride) {
@@ -637,8 +640,9 @@ __host__ void execute_step_two(
template <typename Torus, class params>
__host__ void host_multi_bit_programmable_bootstrap(
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,

View File

@@ -12,10 +12,11 @@
#include "device.h"
#include "fft/bnsmfft.cuh"
#include "fft/twiddles.cuh"
#include "pbs/pbs_utilities.h"
#include "pbs/programmable_bootstrap.h"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial_math.cuh"
#include "programmable_bootstrap.cuh"
#include "programmable_bootstrap.h"
#include "types/complex/operations.cuh"
using namespace cooperative_groups;
@@ -253,8 +254,9 @@ __host__ void scratch_programmable_bootstrap_tbc(
template <typename Torus, class params>
__host__ void host_programmable_bootstrap_tbc(
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,

View File

@@ -8,12 +8,13 @@
#include "device.h"
#include "fft/bnsmfft.cuh"
#include "fft/twiddles.cuh"
#include "pbs/pbs_multibit_utilities.h"
#include "pbs/programmable_bootstrap.h"
#include "pbs/programmable_bootstrap_multibit.cuh"
#include "polynomial/functions.cuh"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial_math.cuh"
#include "programmable_bootstrap.cuh"
#include "programmable_bootstrap.h"
#include "programmable_bootstrap_multibit.cuh"
#include "types/complex/operations.cuh"
#include <vector>
@@ -290,13 +291,14 @@ __host__ void scratch_tbc_multi_bit_programmable_bootstrap(
template <typename Torus, class params>
__host__ void execute_tbc_external_product_loop(
cudaStream_t stream, uint32_t gpu_index, Torus *lut_vector,
Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
Torus *lwe_array_out, Torus *lwe_output_indexes,
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
uint32_t lwe_offset, uint32_t lut_count, uint32_t lut_stride) {
cudaStream_t stream, uint32_t gpu_index, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lwe_input_indexes, Torus *lwe_array_out,
Torus const *lwe_output_indexes, pbs_buffer<Torus, MULTI_BIT> *buffer,
uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
uint32_t level_count, uint32_t lwe_offset, uint32_t lut_count,
uint32_t lut_stride) {
auto lwe_chunk_size = buffer->lwe_chunk_size;
auto supports_dsm =
@@ -393,8 +395,9 @@ __host__ void execute_tbc_external_product_loop(
template <typename Torus, class params>
__host__ void host_tbc_multi_bit_programmable_bootstrap(
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, Torus *lwe_input_indexes, uint64_t *bootstrapping_key,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,

View File

@@ -188,7 +188,7 @@ __device__ void add_to_torus(double2 *m_values, Torus *result,
// Extracts the body of the nth-LWE in a GLWE.
template <typename Torus, class params>
__device__ void sample_extract_body(Torus *lwe_array_out, Torus *glwe,
__device__ void sample_extract_body(Torus *lwe_array_out, Torus const *glwe,
uint32_t glwe_dimension, uint32_t nth = 0) {
// Set first coefficient of the glwe as the body of the LWE sample
lwe_array_out[glwe_dimension * params::degree] =
@@ -197,7 +197,7 @@ __device__ void sample_extract_body(Torus *lwe_array_out, Torus *glwe,
// Extracts the mask from the nth-LWE in a GLWE.
template <typename Torus, class params>
__device__ void sample_extract_mask(Torus *lwe_array_out, Torus *glwe,
__device__ void sample_extract_mask(Torus *lwe_array_out, Torus const *glwe,
uint32_t glwe_dimension = 1,
uint32_t nth = 0) {
for (int z = 0; z < glwe_dimension; z++) {

View File

@@ -5,7 +5,8 @@
/// Initialize same-size arrays on all active GPUs
template <typename Torus>
void multi_gpu_alloc_array_async(cudaStream_t *streams, uint32_t *gpu_indexes,
void multi_gpu_alloc_array_async(cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count, std::vector<Torus *> &dest,
uint32_t elements_per_gpu) {
@@ -18,9 +19,10 @@ void multi_gpu_alloc_array_async(cudaStream_t *streams, uint32_t *gpu_indexes,
}
/// Copy an array residing on one GPU to all active GPUs
template <typename Torus>
void multi_gpu_copy_array_async(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, std::vector<Torus *> &dest,
Torus *src, uint32_t elements_per_gpu) {
void multi_gpu_copy_array_async(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
std::vector<Torus *> &dest, Torus const *src,
uint32_t elements_per_gpu) {
dest.resize(gpu_count);
for (uint i = 0; i < gpu_count; i++) {
cuda_memcpy_async_gpu_to_gpu(dest[i], src, elements_per_gpu * sizeof(Torus),
@@ -31,9 +33,10 @@ void multi_gpu_copy_array_async(cudaStream_t *streams, uint32_t *gpu_indexes,
/// Also initializes the related indexing, setting it to the trivial
/// index
template <typename Torus>
void multi_gpu_alloc_lwe_async(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, std::vector<Torus *> &dest,
uint32_t num_inputs, uint32_t lwe_size) {
void multi_gpu_alloc_lwe_async(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
std::vector<Torus *> &dest, uint32_t num_inputs,
uint32_t lwe_size) {
dest.resize(gpu_count);
for (uint i = 0; i < gpu_count; i++) {
auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count);
@@ -48,9 +51,10 @@ void multi_gpu_alloc_lwe_async(cudaStream_t *streams, uint32_t *gpu_indexes,
/// The input indexing logic is given by an index array.
/// The output indexing is always the trivial one
template <typename Torus>
void multi_gpu_scatter_lwe_async(cudaStream_t *streams, uint32_t *gpu_indexes,
void multi_gpu_scatter_lwe_async(cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count, std::vector<Torus *> &dest,
Torus *src, Torus *h_src_indexes,
Torus const *src, Torus const *h_src_indexes,
bool is_trivial_index, uint32_t num_inputs,
uint32_t lwe_size) {
@@ -88,9 +92,9 @@ void multi_gpu_scatter_lwe_async(cudaStream_t *streams, uint32_t *gpu_indexes,
/// dest_indexes
/// The input indexing should be the trivial one
template <typename Torus>
void multi_gpu_gather_lwe_async(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, Torus *dest,
const std::vector<Torus *> &src,
void multi_gpu_gather_lwe_async(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
Torus *dest, const std::vector<Torus *> &src,
Torus *h_dest_indexes, bool is_trivial_index,
uint32_t num_inputs, uint32_t lwe_size) {
@@ -123,7 +127,8 @@ void multi_gpu_gather_lwe_async(cudaStream_t *streams, uint32_t *gpu_indexes,
}
template <typename Torus>
void multi_gpu_release_async(cudaStream_t *streams, uint32_t *gpu_indexes,
void multi_gpu_release_async(cudaStream_t const *streams,
uint32_t const *gpu_indexes,
std::vector<Torus *> &vec) {
for (uint i = 0; i < vec.size(); i++)

View File

@@ -1,3 +1,4 @@
#include "pbs/pbs_utilities.h"
#include <benchmark/benchmark.h>
#include <cstdint>
#include <setup_and_teardown.h>

View File

@@ -1,9 +1,8 @@
#include "pbs/pbs_multibit_utilities.h"
#include "pbs/pbs_utilities.h"
#include <benchmark/benchmark.h>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <omp.h>
#include <setup_and_teardown.h>
typedef struct {
@@ -50,7 +49,6 @@ protected:
uint64_t *d_lut_pbs_indexes;
uint64_t *d_lwe_ct_in_array;
uint64_t *d_lwe_ct_out_array;
uint64_t *lwe_ct_out_array;
uint64_t *d_lwe_input_indexes;
uint64_t *d_lwe_output_indexes;
int8_t *buffer;
@@ -215,12 +213,15 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, CgMultiBit)
for (auto _ : st) {
// Execute PBS
cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
stream, gpu_index, d_lwe_ct_out_array, d_lwe_output_indexes,
d_lut_pbs_identity, d_lut_pbs_indexes, d_lwe_ct_in_array,
d_lwe_input_indexes, d_bsk, (pbs_buffer<uint64_t, MULTI_BIT> *)buffer,
lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
pbs_base_log, pbs_level, input_lwe_ciphertext_count, lut_count,
lut_stride);
stream, gpu_index, d_lwe_ct_out_array,
(const uint64_t *)d_lwe_output_indexes,
(const uint64_t *)d_lut_pbs_identity,
(const uint64_t *)d_lut_pbs_indexes,
(const uint64_t *)d_lwe_ct_in_array,
(const uint64_t *)d_lwe_input_indexes, (const uint64_t *)d_bsk,
(pbs_buffer<uint64_t, MULTI_BIT> *)buffer, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, pbs_base_log,
pbs_level, input_lwe_ciphertext_count, lut_count, lut_stride);
cuda_synchronize_stream(stream, gpu_index);
}

View File

@@ -1,10 +1,10 @@
#ifndef SETUP_AND_TEARDOWN_H
#define SETUP_AND_TEARDOWN_H
#include "pbs/programmable_bootstrap.h"
#include "pbs/programmable_bootstrap_multibit.h"
#include <device.h>
#include <keyswitch.h>
#include <programmable_bootstrap.h>
#include <programmable_bootstrap_multibit.h>
#include <utils.h>
void programmable_bootstrap_classical_setup(

View File

@@ -1,9 +1,10 @@
#include "pbs/pbs_utilities.h"
#include "pbs/programmable_bootstrap.h"
#include "utils.h"
#include "gtest/gtest.h"
#include <cstdint>
#include <device.h>
#include <functional>
#include <programmable_bootstrap.h>
#include <random>
#include <setup_and_teardown.h>
#include <stdio.h>

View File

@@ -1,6 +1,6 @@
#include <algorithm>
#include <programmable_bootstrap.h>
#include <programmable_bootstrap_multibit.h>
#include "pbs/programmable_bootstrap.h"
#include "pbs/programmable_bootstrap_multibit.h"
#include <cmath>
#include <cstdint>
#include <cstdlib>

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -0,0 +1,11 @@
#![allow(warnings)]
pub type c_void = std::ffi::c_void;
pub type c_uint = std::ffi::c_uint;
pub type c_uchar = std::ffi::c_uchar;
pub type c_ushort = std::ffi::c_ushort;
pub type c_ulong = std::ffi::c_ulong;
pub type c_schar = std::ffi::c_schar;
pub type c_int = std::ffi::c_int;
pub type c_short = std::ffi::c_short;
pub type c_long = std::ffi::c_long;
pub type c_char = std::ffi::c_char;
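
These aliases mirror the C scalar types from std::ffi under one crate-local module. A plausible reading — an assumption, since the bindgen invocation is not among the lines shown here — is that this module is the target of bindgen's ctypes_prefix option, so generated signatures reference crate::ffi::c_uint rather than ::std::os::raw::c_uint. A standalone sketch of why that substitution is a no-op at the type level, with the prefix module inlined so the snippet compiles on its own:

// The aliases are the std::ffi primitives themselves, so values move
// between the prefixed and unprefixed spellings without casts.
pub mod ffi {
    pub type c_uint = std::ffi::c_uint;
    pub type c_void = std::ffi::c_void;
}

fn main() {
    let n: ffi::c_uint = 7;
    let m: std::os::raw::c_uint = n; // same type (u32 on supported targets)
    let p: *mut ffi::c_void = std::ptr::null_mut();
    let q: *mut std::os::raw::c_void = p; // same type as well
    println!("{m} {q:?}");
}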

View File

@@ -1 +1,4 @@
#[allow(warnings)]
pub mod bindings;
pub mod cuda_bind;
pub mod ffi;
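
Read alongside the commit title, the three modules split the FFI surface. The per-module roles in the annotated sketch below are an interpretation, not something stated in this hunk:

#[allow(warnings)]
pub mod bindings; // auto-generated by bindgen from the wrapper header below
pub mod cuda_bind; // hand-written externs kept for the device.cu functions
pub mod ffi; // crate-local C scalar type aliases (previous file)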

View File

@@ -0,0 +1,7 @@
#include "cuda/include/ciphertext.h"
#include "cuda/include/integer/compression/compression.h"
#include "cuda/include/integer/integer.h"
#include "cuda/include/keyswitch.h"
#include "cuda/include/linear_algebra.h"
#include "cuda/include/pbs/programmable_bootstrap.h"
#include "cuda/include/pbs/programmable_bootstrap_multibit.h"

Some files were not shown because too many files have changed in this diff