mirror of https://github.com/zama-ai/tfhe-rs.git (synced 2026-01-09)
chore(gpu): automatically generate rust bindings for cuda functions, except device.cu
@@ -56,7 +56,7 @@ jobs:
       - name: Install dependencies
         run: |
           sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
           wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
           tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
           cd cmake-${{ env.CMAKE_VERSION }}
@@ -64,6 +64,7 @@ jobs:
           make -j"$(nproc)"
           sudo make install

       - name: Checkout tfhe-rs repo with tags
         uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
         with:
2 .github/workflows/benchmark_gpu_integer.yml vendored
@@ -59,7 +59,7 @@ jobs:
       - name: Install dependencies
         run: |
           sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
           wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
           tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
           cd cmake-${{ env.CMAKE_VERSION }}
@@ -63,7 +63,7 @@ jobs:
       - name: Install dependencies
         run: |
           sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
           wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
           tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
           cd cmake-${{ env.CMAKE_VERSION }}
@@ -63,7 +63,7 @@ jobs:
       - name: Install dependencies
         run: |
           sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
           wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
           tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
           cd cmake-${{ env.CMAKE_VERSION }}
@@ -72,7 +72,7 @@ jobs:
       - name: Install dependencies
         run: |
           sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
           wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
           tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
           cd cmake-${{ env.CMAKE_VERSION }}
@@ -73,7 +73,7 @@ jobs:
       - name: Install dependencies
         run: |
           sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
           wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
           tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
           cd cmake-${{ env.CMAKE_VERSION }}
@@ -63,7 +63,7 @@ jobs:
       - name: Install dependencies
         run: |
           sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
           wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
           tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
           cd cmake-${{ env.CMAKE_VERSION }}
2 .github/workflows/benchmark_gpu_l40.yml vendored
@@ -63,7 +63,7 @@ jobs:
       - name: Install dependencies
         run: |
           sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
           wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
           tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
           cd cmake-${{ env.CMAKE_VERSION }}
2 .github/workflows/gpu_fast_h100_tests.yml vendored
@@ -99,7 +99,7 @@ jobs:
       - name: Install dependencies
         run: |
           sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
           wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
           tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
           cd cmake-${{ env.CMAKE_VERSION }}
2 .github/workflows/gpu_fast_tests.yml vendored
@@ -97,7 +97,7 @@ jobs:
       - name: Install dependencies
         run: |
           sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
           wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
           tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
           cd cmake-${{ env.CMAKE_VERSION }}
2 .github/workflows/gpu_full_h100_tests.yml vendored
@@ -57,7 +57,7 @@ jobs:
       - name: Install dependencies
         run: |
           sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
           wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
           tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
           cd cmake-${{ env.CMAKE_VERSION }}
@@ -99,7 +99,7 @@ jobs:
       - name: Install dependencies
         run: |
           sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
           wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
           tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
           cd cmake-${{ env.CMAKE_VERSION }}
@@ -100,7 +100,7 @@ jobs:
       - name: Install dependencies
         run: |
           sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
           wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
           tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
           cd cmake-${{ env.CMAKE_VERSION }}
@@ -108,6 +108,7 @@ jobs:
           make -j"$(nproc)"
           sudo make install

       - name: Checkout tfhe-rs
         uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
@@ -107,7 +107,7 @@ jobs:
       - name: Install dependencies
         run: |
           sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
           wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
           tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
           cd cmake-${{ env.CMAKE_VERSION }}
@@ -115,6 +115,7 @@ jobs:
           make -j"$(nproc)"
           sudo make install

       - name: Checkout tfhe-rs
         uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
         with:
@@ -100,7 +100,7 @@ jobs:
       - name: Install dependencies
         run: |
           sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
           wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
           tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
           cd cmake-${{ env.CMAKE_VERSION }}
@@ -108,6 +108,7 @@ jobs:
           make -j"$(nproc)"
           sudo make install

       - name: Checkout tfhe-rs
         uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
@@ -107,7 +107,7 @@ jobs:
       - name: Install dependencies
         run: |
           sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
           wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
           tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
           cd cmake-${{ env.CMAKE_VERSION }}
@@ -115,6 +115,7 @@ jobs:
           make -j"$(nproc)"
           sudo make install

       - name: Checkout tfhe-rs
         uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
10 Makefile
@@ -418,6 +418,14 @@ clippy_cuda_backend: install_rs_check_toolchain
	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
		-p tfhe-cuda-backend -- --no-deps -D warnings

+.PHONY: check_rust_bindings_did_not_change # Check rust bindings are up to date for tfhe-cuda-backend
+check_rust_bindings_did_not_change:
+	cargo build -p tfhe-cuda-backend && \
+	git diff --quiet HEAD -- backends/tfhe-cuda-backend/src/bindings.rs || \
+	( echo "Generated bindings have changed! Please run 'git add backends/tfhe-cuda-backend/src/bindings.rs' \
+	and commit the changes." && exit 1 )
+
 .PHONY: tfhe_lints # Run custom tfhe-rs lints
 tfhe_lints: install_tfhe_lints
	cd tfhe && RUSTFLAGS="$(RUSTFLAGS)" cargo tfhe-lints \
@@ -1257,7 +1265,7 @@ pcc: no_tfhe_typo no_dbg_log check_fmt check_typos lint_doc check_md_docs_are_tested \
	clippy_all tfhe_lints check_compile_tests

 .PHONY: pcc_gpu # pcc stands for pre commit checks for GPU compilation
-pcc_gpu: clippy_gpu clippy_cuda_backend check_compile_tests_benches_gpu
+pcc_gpu: clippy_gpu clippy_cuda_backend check_compile_tests_benches_gpu check_rust_bindings_did_not_change

 .PHONY: fpcc # pcc stands for pre commit checks, the f stands for fast
 fpcc: no_tfhe_typo no_dbg_log check_fmt check_typos lint_doc check_md_docs_are_tested clippy_fast \
@@ -14,3 +14,4 @@ keywords = ["fully", "homomorphic", "encryption", "fhe", "cryptography"]
 [build-dependencies]
 cmake = { version = "0.1" }
 pkg-config = { version = "0.3" }
+bindgen = "0.70.1"
@@ -1,5 +1,6 @@
-use std::env;
 use std::path::PathBuf;
 use std::process::Command;
+use std::{env, fs};

 fn main() {
     if let Ok(val) = env::var("DOCS_RS") {
@@ -26,6 +27,7 @@ fn main() {
     println!("cargo::rerun-if-changed=cuda/tests_and_benchmarks");
     println!("cargo::rerun-if-changed=cuda/CMakeLists.txt");
+    println!("cargo::rerun-if-changed=src");

     if env::consts::OS == "linux" {
         let output = Command::new("./get_os_name.sh").output().unwrap();
         let distribution = String::from_utf8(output.stdout).unwrap();
@@ -35,6 +37,7 @@ fn main() {
                 Only Ubuntu is supported by tfhe-cuda-backend at this time. Build may fail\n"
             );
         }
+
         let dest = cmake::build("cuda");
         println!("cargo:rustc-link-search=native={}", dest.display());
         println!("cargo:rustc-link-lib=static=tfhe_cuda_backend");
@@ -51,6 +54,37 @@ fn main() {
         println!("cargo:rustc-link-lib=cudart");
         println!("cargo:rustc-link-search=native=/usr/lib/x86_64-linux-gnu/");
         println!("cargo:rustc-link-lib=stdc++");
+
+        let header_path = "wrapper.h";
+        println!("cargo:rerun-if-changed={}", header_path);
+
+        let out_path = PathBuf::from("src").join("bindings.rs");
+
+        // Check modification times
+        let header_modified = fs::metadata(header_path).unwrap().modified().unwrap();
+        let bindings_modified = if out_path.exists() {
+            fs::metadata(&out_path).unwrap().modified().unwrap()
+        } else {
+            std::time::SystemTime::UNIX_EPOCH // If bindings file doesn't exist, consider it older
+        };
+        // Regenerate bindings only if header has been modified
+        if header_modified > bindings_modified {
+            let bindings = bindgen::Builder::default()
+                .header(header_path)
+                .clang_arg("-x")
+                .clang_arg("c++")
+                .clang_arg("-std=c++17")
+                .clang_arg("-I/usr/include")
+                .clang_arg("-I/usr/local/include")
+                .ctypes_prefix("ffi")
+                .raw_line("use crate::ffi;")
+                .generate()
+                .expect("Unable to generate bindings");
+
+            bindings
+                .write_to_file(&out_path)
+                .expect("Couldn't write bindings!");
+        }
     } else {
         panic!(
             "Error: platform not supported, tfhe-cuda-backend not built (only Linux is supported)"
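Note that the generator writes to src/bindings.rs rather than Cargo's OUT_DIR, so the output is an ordinary tracked source file; that is what lets the new check_rust_bindings_did_not_change target diff it against HEAD. A minimal sketch of how a crate can consume such a committed file, assuming a lib.rs layout and an ffi alias module that are illustrative here rather than taken from tfhe-cuda-backend:

    // lib.rs (sketch). `ffi` supplies the C type aliases that the generated
    // code references through bindgen's `ctypes_prefix("ffi")` setting;
    // re-exporting std::os::raw is one simple way to provide them.
    pub mod ffi {
        pub use std::os::raw::{c_char, c_int, c_longlong, c_void};
    }

    // The committed, bindgen-generated file; the lint allowances are typical
    // for machine-generated FFI declarations.
    #[allow(non_upper_case_globals, non_camel_case_types, non_snake_case)]
    pub mod bindings;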
@@ -1,25 +1,24 @@
 #ifndef CUDA_CIPHERTEXT_H
 #define CUDA_CIPHERTEXT_H

 #include "device.h"
-#include <cstdint>
+#include "stdint.h"

 extern "C" {
 void cuda_convert_lwe_ciphertext_vector_to_gpu_64(void *stream,
                                                   uint32_t gpu_index,
-                                                  void *dest, void *src,
+                                                  void *dest, void const *src,
                                                   uint32_t number_of_cts,
                                                   uint32_t lwe_dimension);
 void cuda_convert_lwe_ciphertext_vector_to_cpu_64(void *stream,
                                                   uint32_t gpu_index,
-                                                  void *dest, void *src,
+                                                  void *dest, void const *src,
                                                   uint32_t number_of_cts,
                                                   uint32_t lwe_dimension);

 void cuda_glwe_sample_extract_64(void *stream, uint32_t gpu_index,
-                                 void *lwe_array_out, void *glwe_array_in,
-                                 uint32_t *nth_array, uint32_t num_nths,
+                                 void *lwe_array_out, void const *glwe_array_in,
+                                 uint32_t const *nth_array, uint32_t num_nths,
                                  uint32_t glwe_dimension,
                                  uint32_t polynomial_size);
-};
+}
 #endif
@@ -42,7 +42,7 @@ void cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index);
 void cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
                               cudaStream_t stream, uint32_t gpu_index);

-void cuda_memcpy_async_gpu_to_gpu(void *dest, void *src, uint64_t size,
+void cuda_memcpy_async_gpu_to_gpu(void *dest, void const *src, uint64_t size,
                                   cudaStream_t stream, uint32_t gpu_index);

 void cuda_memcpy_gpu_to_gpu(void *dest, void *src, uint64_t size,
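The const qualifiers added here are what bindgen keys on when choosing between *const and *mut raw pointers. A hypothetical, self-contained sketch of the resulting Rust declaration (the real one lives in the generated src/bindings.rs and uses the crate's ffi prefix; cudaStream_t is aliased locally only so the sketch compiles on its own):

    use std::os::raw::c_void;

    #[allow(non_camel_case_types)]
    pub type cudaStream_t = *mut c_void; // local stand-in for the CUDA type

    extern "C" {
        // `void *dest` surfaces as `*mut c_void`, while the newly
        // const-qualified `void const *src` surfaces as `*const c_void`,
        // so the read-only intent is now visible to Rust callers.
        pub fn cuda_memcpy_async_gpu_to_gpu(
            dest: *mut c_void,
            src: *const c_void,
            size: u64,
            stream: cudaStream_t,
            gpu_index: u32,
        );
    }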
@@ -0,0 +1,45 @@
+#ifndef CUDA_INTEGER_COMPRESSION_H
+#define CUDA_INTEGER_COMPRESSION_H
+
+#include "../../pbs/pbs_enums.h"
+
+extern "C" {
+void scratch_cuda_integer_compress_radix_ciphertext_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, uint32_t compression_glwe_dimension,
+    uint32_t compression_polynomial_size, uint32_t lwe_dimension,
+    uint32_t ks_level, uint32_t ks_base_log, uint32_t num_radix_blocks,
+    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    uint32_t lwe_per_glwe, uint32_t storage_log_modulus,
+    bool allocate_gpu_memory);
+
+void scratch_cuda_integer_decompress_radix_ciphertext_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, uint32_t encryption_glwe_dimension,
+    uint32_t encryption_polynomial_size, uint32_t compression_glwe_dimension,
+    uint32_t compression_polynomial_size, uint32_t lwe_dimension,
+    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t num_radix_blocks,
+    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    uint32_t storage_log_modulus, uint32_t body_count,
+    bool allocate_gpu_memory);
+
+void cuda_integer_compress_radix_ciphertext_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *glwe_array_out, void const *lwe_array_in, void *const *fp_ksk,
+    uint32_t num_nths, int8_t *mem_ptr);
+
+void cuda_integer_decompress_radix_ciphertext_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *lwe_array_out, void const *glwe_in, uint32_t const *indexes_array,
+    uint32_t indexes_array_size, void *const *bsks, int8_t *mem_ptr);
+
+void cleanup_cuda_integer_compress_radix_ciphertext_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr_void);
+
+void cleanup_cuda_integer_decompress_radix_ciphertext_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr_void);
+}
+
+#endif
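These entry points follow a scratch/operate/cleanup convention: a scratch_* call allocates opaque backend state behind an int8_t* (passed as int8_t ** so the callee can set it), the operation receives that pointer back, and a matching cleanup_* releases it. A runnable pure-Rust sketch of that out-parameter pattern, with a heap allocation standing in for GPU scratch memory:

    // Mirrors `int8_t **mem_ptr`: the callee fills in an opaque pointer.
    unsafe fn scratch_sketch(mem_ptr: *mut *mut i8, allocate: bool) {
        if allocate {
            // Stand-in for GPU-side scratch allocation.
            *mem_ptr = Box::into_raw(Box::new([0i8; 64])) as *mut i8;
        }
    }

    // Mirrors the matching `cleanup_*` call: frees and nulls the pointer.
    unsafe fn cleanup_sketch(mem_ptr: *mut *mut i8) {
        if !(*mem_ptr).is_null() {
            drop(Box::from_raw(*mem_ptr as *mut [i8; 64]));
            *mem_ptr = std::ptr::null_mut();
        }
    }

    fn main() {
        let mut state: *mut i8 = std::ptr::null_mut();
        unsafe {
            scratch_sketch(&mut state, true);
            // ... compression/decompression kernels would use `state` here ...
            cleanup_sketch(&mut state);
        }
        assert!(state.is_null());
    }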
@@ -1,46 +1,7 @@
-#ifndef CUDA_INTEGER_COMPRESSION_H
-#define CUDA_INTEGER_COMPRESSION_H
+#ifndef CUDA_INTEGER_COMPRESSION_UTILITIES_H
+#define CUDA_INTEGER_COMPRESSION_UTILITIES_H

-#include "integer.h"
-
-extern "C" {
-void scratch_cuda_integer_compress_radix_ciphertext_64(
-    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
-    uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
-    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, uint32_t lwe_per_glwe, uint32_t storage_log_modulus,
-    bool allocate_gpu_memory);
-
-void scratch_cuda_integer_decompress_radix_ciphertext_64(
-    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
-    uint32_t encryption_glwe_dimension, uint32_t encryption_polynomial_size,
-    uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
-    uint32_t lwe_dimension, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, uint32_t storage_log_modulus, uint32_t body_count,
-    bool allocate_gpu_memory);
-
-void cuda_integer_compress_radix_ciphertext_64(
-    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
-    void *glwe_array_out, void *lwe_array_in, void **fp_ksk, uint32_t num_nths,
-    int8_t *mem_ptr);
-
-void cuda_integer_decompress_radix_ciphertext_64(
-    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
-    void *lwe_array_out, void *glwe_in, uint32_t *indexes_array,
-    uint32_t indexes_array_size, void **bsks, int8_t *mem_ptr);
-
-void cleanup_cuda_integer_compress_radix_ciphertext_64(void **streams,
-                                                       uint32_t *gpu_indexes,
-                                                       uint32_t gpu_count,
-                                                       int8_t **mem_ptr_void);
-
-void cleanup_cuda_integer_decompress_radix_ciphertext_64(void **streams,
-                                                         uint32_t *gpu_indexes,
-                                                         uint32_t gpu_count,
-                                                         int8_t **mem_ptr_void);
-}
+#include "../integer_utilities.h"

 template <typename Torus> struct int_compression {
   int_radix_params compression_params;
@@ -54,7 +15,7 @@ template <typename Torus> struct int_compression {
   Torus *tmp_lwe;
   Torus *tmp_glwe_array_out;

-  int_compression(cudaStream_t *streams, uint32_t *gpu_indexes,
+  int_compression(cudaStream_t const *streams, uint32_t const *gpu_indexes,
                   uint32_t gpu_count, int_radix_params compression_params,
                   uint32_t num_radix_blocks, uint32_t lwe_per_glwe,
                   uint32_t storage_log_modulus, bool allocate_gpu_memory) {
@@ -81,7 +42,7 @@ template <typename Torus> struct int_compression {
                                        num_radix_blocks, true);
     }
   }
-  void release(cudaStream_t *streams, uint32_t *gpu_indexes,
+  void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
               uint32_t gpu_count) {
     cuda_drop_async(tmp_lwe, streams[0], gpu_indexes[0]);
     cuda_drop_async(tmp_glwe_array_out, streams[0], gpu_indexes[0]);
@@ -105,7 +66,7 @@ template <typename Torus> struct int_decompression {

   int_radix_lut<Torus> *carry_extract_lut;

-  int_decompression(cudaStream_t *streams, uint32_t *gpu_indexes,
+  int_decompression(cudaStream_t const *streams, uint32_t const *gpu_indexes,
                     uint32_t gpu_count, int_radix_params encryption_params,
                     int_radix_params compression_params,
                     uint32_t num_radix_blocks, uint32_t body_count,
@@ -150,7 +111,7 @@ template <typename Torus> struct int_decompression {
       carry_extract_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
     }
   }
-  void release(cudaStream_t *streams, uint32_t *gpu_indexes,
+  void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
                uint32_t gpu_count) {
     cuda_drop_async(tmp_extracted_glwe, streams[0], gpu_indexes[0]);
     cuda_drop_async(tmp_extracted_lwe, streams[0], gpu_indexes[0]);
421 backends/tfhe-cuda-backend/cuda/include/integer/integer.h Normal file
@@ -0,0 +1,421 @@
+#ifndef CUDA_INTEGER_H
+#define CUDA_INTEGER_H
+
+#include "../pbs/pbs_enums.h"
+#include <stdint.h>
+
+enum OUTPUT_CARRY { NONE = 0, GENERATED = 1, PROPAGATED = 2 };
+enum SHIFT_OR_ROTATE_TYPE {
+  LEFT_SHIFT = 0,
+  RIGHT_SHIFT = 1,
+  LEFT_ROTATE = 2,
+  RIGHT_ROTATE = 3
+};
+enum BITOP_TYPE {
+  BITAND = 0,
+  BITOR = 1,
+  BITXOR = 2,
+  SCALAR_BITAND = 3,
+  SCALAR_BITOR = 4,
+  SCALAR_BITXOR = 5,
+};
+
+enum COMPARISON_TYPE {
+  EQ = 0,
+  NE = 1,
+  GT = 2,
+  GE = 3,
+  LT = 4,
+  LE = 5,
+  MAX = 6,
+  MIN = 7,
+};
+
+enum CMP_ORDERING { IS_INFERIOR = 0, IS_EQUAL = 1, IS_SUPERIOR = 2 };
+
+enum SIGNED_OPERATION { ADDITION = 1, SUBTRACTION = -1 };
+
+extern "C" {
+void scratch_cuda_apply_univariate_lut_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
+    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    bool allocate_gpu_memory);
+
+void cuda_apply_univariate_lut_kb_64(void *const *streams,
+                                     uint32_t const *gpu_indexes,
+                                     uint32_t gpu_count, void *output_radix_lwe,
+                                     void const *input_radix_lwe,
+                                     int8_t *mem_ptr, void *const *ksks,
+                                     void *const *bsks, uint32_t num_blocks);
+
+void cleanup_cuda_apply_univariate_lut_kb_64(void *const *streams,
+                                             uint32_t const *gpu_indexes,
+                                             uint32_t gpu_count,
+                                             int8_t **mem_ptr_void);
+
+void scratch_cuda_apply_bivariate_lut_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
+    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    bool allocate_gpu_memory);
+
+void cuda_apply_bivariate_lut_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *output_radix_lwe, void const *input_radix_lwe_1,
+    void const *input_radix_lwe_2, int8_t *mem_ptr, void *const *ksks,
+    void *const *bsks, uint32_t num_blocks, uint32_t shift);
+
+void cleanup_cuda_apply_bivariate_lut_kb_64(void *const *streams,
+                                            uint32_t const *gpu_indexes,
+                                            uint32_t gpu_count,
+                                            int8_t **mem_ptr_void);
+
+void cuda_apply_many_univariate_lut_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *output_radix_lwe, void const *input_radix_lwe, int8_t *mem_ptr,
+    void *const *ksks, void *const *bsks, uint32_t num_blocks,
+    uint32_t num_luts, uint32_t lut_stride);
+
+void scratch_cuda_full_propagation_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t ks_level, uint32_t ks_base_log,
+    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
+    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    bool allocate_gpu_memory);
+
+void cuda_full_propagation_64_inplace(void *const *streams,
+                                      uint32_t const *gpu_indexes,
+                                      uint32_t gpu_count, void *input_blocks,
+                                      int8_t *mem_ptr, void *const *ksks,
+                                      void *const *bsks, uint32_t num_blocks);
+
+void cleanup_cuda_full_propagation(void *const *streams,
+                                   uint32_t const *gpu_indexes,
+                                   uint32_t gpu_count, int8_t **mem_ptr_void);
+
+void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, uint32_t message_modulus, uint32_t carry_modulus,
+    uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
+    uint32_t pbs_base_log, uint32_t pbs_level, uint32_t ks_base_log,
+    uint32_t ks_level, uint32_t grouping_factor, uint32_t num_blocks,
+    PBS_TYPE pbs_type, bool allocate_gpu_memory);
+
+void cuda_integer_mult_radix_ciphertext_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *radix_lwe_out, void const *radix_lwe_left,
+    void const *radix_lwe_right, void *const *bsks, void *const *ksks,
+    int8_t *mem_ptr, uint32_t polynomial_size, uint32_t num_blocks);
+
+void cleanup_cuda_integer_mult(void *const *streams,
+                               uint32_t const *gpu_indexes, uint32_t gpu_count,
+                               int8_t **mem_ptr_void);
+
+void cuda_negate_integer_radix_ciphertext_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *lwe_array_out, void const *lwe_array_in, uint32_t lwe_dimension,
+    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
+    uint32_t carry_modulus);
+
+void cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *lwe_array, void const *scalar_input, uint32_t lwe_dimension,
+    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
+    uint32_t carry_modulus);
+
+void scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
+    bool allocate_gpu_memory);
+
+void cuda_integer_radix_logical_scalar_shift_kb_64_inplace(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *lwe_array, uint32_t shift, int8_t *mem_ptr, void *const *bsks,
+    void *const *ksks, uint32_t num_blocks);
+
+void scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
+    bool allocate_gpu_memory);
+
+void cuda_integer_radix_arithmetic_scalar_shift_kb_64_inplace(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *lwe_array, uint32_t shift, int8_t *mem_ptr, void *const *bsks,
+    void *const *ksks, uint32_t num_blocks);
+
+void cleanup_cuda_integer_radix_logical_scalar_shift(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr_void);
+
+void cleanup_cuda_integer_radix_arithmetic_scalar_shift(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr_void);
+
+void scratch_cuda_integer_radix_shift_and_rotate_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
+    bool is_signed, bool allocate_gpu_memory);
+
+void cuda_integer_radix_shift_and_rotate_kb_64_inplace(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *lwe_array, void const *lwe_shift, int8_t *mem_ptr, void *const *bsks,
+    void *const *ksks, uint32_t num_blocks);
+
+void cleanup_cuda_integer_radix_shift_and_rotate(void *const *streams,
+                                                 uint32_t const *gpu_indexes,
+                                                 uint32_t gpu_count,
+                                                 int8_t **mem_ptr_void);
+
+void scratch_cuda_integer_radix_comparison_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t lwe_ciphertext_count,
+    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    COMPARISON_TYPE op_type, bool is_signed, bool allocate_gpu_memory);
+
+void cuda_comparison_integer_radix_ciphertext_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *lwe_array_out, void const *lwe_array_1, void const *lwe_array_2,
+    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
+    uint32_t lwe_ciphertext_count);
+
+void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *lwe_array_out, void const *lwe_array_in, void const *scalar_blocks,
+    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
+    uint32_t lwe_ciphertext_count, uint32_t num_scalar_blocks);
+
+void cleanup_cuda_integer_comparison(void *const *streams,
+                                     uint32_t const *gpu_indexes,
+                                     uint32_t gpu_count, int8_t **mem_ptr_void);
+
+void scratch_cuda_integer_radix_bitop_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t lwe_ciphertext_count,
+    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    BITOP_TYPE op_type, bool allocate_gpu_memory);
+
+void cuda_bitop_integer_radix_ciphertext_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *lwe_array_out, void const *lwe_array_1, void const *lwe_array_2,
+    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
+    uint32_t lwe_ciphertext_count);
+
+void cuda_scalar_bitop_integer_radix_ciphertext_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *lwe_array_out, void const *lwe_array_input, void const *clear_blocks,
+    uint32_t num_clear_blocks, int8_t *mem_ptr, void *const *bsks,
+    void *const *ksks, uint32_t lwe_ciphertext_count, BITOP_TYPE op);
+
+void cleanup_cuda_integer_bitop(void *const *streams,
+                                uint32_t const *gpu_indexes, uint32_t gpu_count,
+                                int8_t **mem_ptr_void);
+
+void scratch_cuda_integer_radix_cmux_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t lwe_ciphertext_count,
+    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    bool allocate_gpu_memory);
+
+void cuda_cmux_integer_radix_ciphertext_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *lwe_array_out, void const *lwe_condition, void const *lwe_array_true,
+    void const *lwe_array_false, int8_t *mem_ptr, void *const *bsks,
+    void *const *ksks, uint32_t lwe_ciphertext_count);
+
+void cleanup_cuda_integer_radix_cmux(void *const *streams,
+                                     uint32_t const *gpu_indexes,
+                                     uint32_t gpu_count, int8_t **mem_ptr_void);
+
+void scratch_cuda_integer_radix_scalar_rotate_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
+    bool allocate_gpu_memory);
+
+void cuda_integer_radix_scalar_rotate_kb_64_inplace(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *lwe_array, uint32_t n, int8_t *mem_ptr, void *const *bsks,
+    void *const *ksks, uint32_t num_blocks);
+
+void cleanup_cuda_integer_radix_scalar_rotate(void *const *streams,
+                                              uint32_t const *gpu_indexes,
+                                              uint32_t gpu_count,
+                                              int8_t **mem_ptr_void);
+
+void scratch_cuda_propagate_single_carry_kb_64_inplace(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory);
+
+void cuda_propagate_single_carry_kb_64_inplace(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *lwe_array, void *carry_out, int8_t *mem_ptr, void *const *bsks,
+    void *const *ksks, uint32_t num_blocks);
+
+void cuda_propagate_single_carry_get_input_carries_kb_64_inplace(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *lwe_array, void *carry_out, void *input_carries, int8_t *mem_ptr,
+    void *const *bsks, void *const *ksks, uint32_t num_blocks);
+
+void cleanup_cuda_propagate_single_carry(void *const *streams,
+                                         uint32_t const *gpu_indexes,
+                                         uint32_t gpu_count,
+                                         int8_t **mem_ptr_void);
+
+void scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
+    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
+    uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec,
+    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    bool allocate_gpu_memory);
+
+void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *radix_lwe_out, void *radix_lwe_vec, uint32_t num_radix_in_vec,
+    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
+    uint32_t num_blocks_in_radix);
+
+void cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr_void);
+
+void scratch_cuda_integer_radix_overflowing_sub_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory);
+
+void cuda_integer_radix_overflowing_sub_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *radix_lwe_out, void *radix_lwe_overflowed, void const *radix_lwe_left,
+    void const *radix_lwe_right, int8_t *mem_ptr, void *const *bsks,
+    void *const *ksks, uint32_t num_blocks_in_radix);
+
+void cleanup_cuda_integer_radix_overflowing_sub(void *const *streams,
+                                                uint32_t const *gpu_indexes,
+                                                uint32_t gpu_count,
+                                                int8_t **mem_ptr_void);
+
+void scratch_cuda_integer_scalar_mul_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
+    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
+    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
+    PBS_TYPE pbs_type, bool allocate_gpu_memory);
+
+void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *lwe_array, uint64_t const *decomposed_scalar,
+    uint64_t const *has_at_least_one_set, int8_t *mem_ptr, void *const *bsks,
+    void *const *ksks, uint32_t lwe_dimension, uint32_t polynomial_size,
+    uint32_t message_modulus, uint32_t num_blocks, uint32_t num_scalars);
+
+void cleanup_cuda_integer_radix_scalar_mul(void *const *streams,
+                                           uint32_t const *gpu_indexes,
+                                           uint32_t gpu_count,
+                                           int8_t **mem_ptr_void);
+
+void scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory);
+
+void cuda_integer_div_rem_radix_ciphertext_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *quotient, void *remainder, void const *numerator, void const *divisor,
+    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
+    uint32_t num_blocks_in_radix);
+
+void cleanup_cuda_integer_div_rem(void *const *streams,
+                                  uint32_t const *gpu_indexes,
+                                  uint32_t gpu_count, int8_t **mem_ptr_void);
+
+void scratch_cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t num_blocks, int8_t signed_operation,
+    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    bool allocate_gpu_memory);
+
+void cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *lhs, void const *rhs, void *overflowed, int8_t signed_operation,
+    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
+    uint32_t num_blocks_in_radix);
+
+void cleanup_signed_overflowing_add_or_sub(void *const *streams,
+                                           uint32_t const *gpu_indexes,
+                                           uint32_t gpu_count,
+                                           int8_t **mem_ptr_void);
+
+void scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t num_radix_blocks,
+    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    bool allocate_gpu_memory);
+
+void cuda_integer_compute_prefix_sum_hillis_steele_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *output_radix_lwe, void *generates_or_propagates, int8_t *mem_ptr,
+    void *const *ksks, void *const *bsks, uint32_t num_blocks, uint32_t shift);
+
+void cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr_void);
+
+void cuda_integer_reverse_blocks_64_inplace(void *const *streams,
+                                            uint32_t const *gpu_indexes,
+                                            uint32_t gpu_count, void *lwe_array,
+                                            uint32_t num_blocks,
+                                            uint32_t lwe_size);
+
+} // extern C
+#endif // CUDA_INTEGER_H
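Throughout this new header, the multi-GPU array parameters are pointer-to-const: void *const *streams and uint32_t const *gpu_indexes. Under bindgen's mapping those surface in Rust as *const *mut c_void and *const u32 respectively. A hypothetical extern-block sketch of one of the simpler declarations above (self-contained for illustration; the real declarations are generated into src/bindings.rs):

    use std::os::raw::c_void;

    extern "C" {
        // `void *const *streams`        -> `*const *mut c_void`
        // `uint32_t const *gpu_indexes` -> `*const u32`
        pub fn cuda_integer_reverse_blocks_64_inplace(
            streams: *const *mut c_void,
            gpu_indexes: *const u32,
            gpu_count: u32,
            lwe_array: *mut c_void,
            num_blocks: u32,
            lwe_size: u32,
        );
    }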
@@ -1,411 +1,13 @@
|
||||
#ifndef CUDA_INTEGER_H
|
||||
#define CUDA_INTEGER_H
|
||||
#ifndef CUDA_INTEGER_UTILITIES_H
|
||||
#define CUDA_INTEGER_UTILITIES_H
|
||||
|
||||
#include "integer.h"
|
||||
#include "keyswitch.h"
|
||||
#include "pbs/programmable_bootstrap.cuh"
|
||||
#include "programmable_bootstrap.h"
|
||||
#include "programmable_bootstrap_multibit.h"
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
#include <functional>
|
||||
|
||||
enum OUTPUT_CARRY { NONE = 0, GENERATED = 1, PROPAGATED = 2 };
|
||||
enum SHIFT_OR_ROTATE_TYPE {
|
||||
LEFT_SHIFT = 0,
|
||||
RIGHT_SHIFT = 1,
|
||||
LEFT_ROTATE = 2,
|
||||
RIGHT_ROTATE = 3
|
||||
};
|
||||
enum BITOP_TYPE {
|
||||
BITAND = 0,
|
||||
BITOR = 1,
|
||||
BITXOR = 2,
|
||||
SCALAR_BITAND = 3,
|
||||
SCALAR_BITOR = 4,
|
||||
SCALAR_BITXOR = 5,
|
||||
};
|
||||
|
||||
enum COMPARISON_TYPE {
|
||||
EQ = 0,
|
||||
NE = 1,
|
||||
GT = 2,
|
||||
GE = 3,
|
||||
LT = 4,
|
||||
LE = 5,
|
||||
MAX = 6,
|
||||
MIN = 7,
|
||||
};
|
||||
|
||||
enum CMP_ORDERING { IS_INFERIOR = 0, IS_EQUAL = 1, IS_SUPERIOR = 2 };
|
||||
|
||||
enum SIGNED_OPERATION { ADDITION = 1, SUBTRACTION = -1 };
|
||||
|
||||
extern "C" {
|
||||
void scratch_cuda_apply_univariate_lut_kb_64(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
|
||||
void *input_lut, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory);
|
||||
|
||||
void cuda_apply_univariate_lut_kb_64(void **streams, uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count, void *output_radix_lwe,
|
||||
void *input_radix_lwe, int8_t *mem_ptr,
|
||||
void **ksks, void **bsks,
|
||||
uint32_t num_blocks);
|
||||
|
||||
void cleanup_cuda_apply_univariate_lut_kb_64(void **streams,
|
||||
uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
void scratch_cuda_apply_bivariate_lut_kb_64(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
|
||||
void *input_lut, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory);
|
||||
|
||||
void cuda_apply_bivariate_lut_kb_64(void **streams, uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count, void *output_radix_lwe,
|
||||
void *input_radix_lwe_1,
|
||||
void *input_radix_lwe_2, int8_t *mem_ptr,
|
||||
void **ksks, void **bsks,
|
||||
uint32_t num_blocks, uint32_t shift);
|
||||
|
||||
void cleanup_cuda_apply_bivariate_lut_kb_64(void **streams,
|
||||
uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
void cuda_apply_many_univariate_lut_kb_64(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
void *output_radix_lwe, void *input_radix_lwe, int8_t *mem_ptr, void **ksks,
|
||||
void **bsks, uint32_t num_blocks, uint32_t num_luts, uint32_t lut_stride);
|
||||
|
||||
void scratch_cuda_full_propagation_64(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
|
||||
uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory);
|
||||
|
||||
void cuda_full_propagation_64_inplace(void **streams, uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count, void *input_blocks,
|
||||
int8_t *mem_ptr, void **ksks, void **bsks,
|
||||
uint32_t num_blocks);
|
||||
|
||||
void cleanup_cuda_full_propagation(void **streams, uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count, int8_t **mem_ptr_void);
|
||||
|
||||
void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, uint32_t glwe_dimension,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t pbs_base_log,
|
||||
uint32_t pbs_level, uint32_t ks_base_log, uint32_t ks_level,
|
||||
uint32_t grouping_factor, uint32_t num_blocks, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory);
|
||||
|
||||
void cuda_integer_mult_radix_ciphertext_kb_64(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
void *radix_lwe_out, void *radix_lwe_left, void *radix_lwe_right,
|
||||
void **bsks, void **ksks, int8_t *mem_ptr, uint32_t polynomial_size,
|
||||
uint32_t num_blocks);
|
||||
|
||||
void cleanup_cuda_integer_mult(void **streams, uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count, int8_t **mem_ptr_void);
|
||||
|
||||
void cuda_negate_integer_radix_ciphertext_64(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
void *lwe_array_out, void *lwe_array_in, uint32_t lwe_dimension,
|
||||
uint32_t lwe_ciphertext_count, uint32_t message_modulus,
|
||||
uint32_t carry_modulus);
|
||||
|
||||
void cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
|
||||
void *scalar_input, uint32_t lwe_dimension, uint32_t lwe_ciphertext_count,
|
||||
uint32_t message_modulus, uint32_t carry_modulus);
|
||||
|
||||
void scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
|
||||
bool allocate_gpu_memory);
|
||||
|
||||
void cuda_integer_radix_logical_scalar_shift_kb_64_inplace(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
|
||||
uint32_t shift, int8_t *mem_ptr, void **bsks, void **ksks,
|
||||
uint32_t num_blocks);
|
||||
|
||||
void scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
|
||||
bool allocate_gpu_memory);
|
||||
|
||||
void cuda_integer_radix_arithmetic_scalar_shift_kb_64_inplace(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
|
||||
uint32_t shift, int8_t *mem_ptr, void **bsks, void **ksks,
|
||||
uint32_t num_blocks);
|
||||
|
||||
void cleanup_cuda_integer_radix_logical_scalar_shift(void **streams,
|
||||
uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
void cleanup_cuda_integer_radix_arithmetic_scalar_shift(void **streams,
|
||||
uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
void scratch_cuda_integer_radix_shift_and_rotate_kb_64(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
|
||||
bool is_signed, bool allocate_gpu_memory);
|
||||
|
||||
void cuda_integer_radix_shift_and_rotate_kb_64_inplace(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
|
||||
void *lwe_shift, int8_t *mem_ptr, void **bsks, void **ksks,
|
||||
uint32_t num_blocks);
|
||||
|
||||
void cleanup_cuda_integer_radix_shift_and_rotate(void **streams,
|
||||
uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
void scratch_cuda_integer_radix_comparison_kb_64(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t grouping_factor, uint32_t lwe_ciphertext_count,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
COMPARISON_TYPE op_type, bool is_signed, bool allocate_gpu_memory);
|
||||
|
||||
void cuda_comparison_integer_radix_ciphertext_kb_64(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
void *lwe_array_out, void *lwe_array_1, void *lwe_array_2, int8_t *mem_ptr,
|
||||
void **bsks, void **ksks, uint32_t lwe_ciphertext_count);
|
||||
|
||||
void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
void *lwe_array_out, void *lwe_array_in, void *scalar_blocks,
|
||||
int8_t *mem_ptr, void **bsks, void **ksks, uint32_t lwe_ciphertext_count,
|
||||
uint32_t num_scalar_blocks);
|
||||
|
||||
void cleanup_cuda_integer_comparison(void **streams, uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count, int8_t **mem_ptr_void);

void scratch_cuda_integer_radix_bitop_kb_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
    uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t lwe_ciphertext_count,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
    BITOP_TYPE op_type, bool allocate_gpu_memory);

void cuda_bitop_integer_radix_ciphertext_kb_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    void *lwe_array_out, void *lwe_array_1, void *lwe_array_2, int8_t *mem_ptr,
    void **bsks, void **ksks, uint32_t lwe_ciphertext_count);

void cuda_scalar_bitop_integer_radix_ciphertext_kb_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    void *lwe_array_out, void *lwe_array_input, void *clear_blocks,
    uint32_t num_clear_blocks, int8_t *mem_ptr, void **bsks, void **ksks,
    uint32_t lwe_ciphertext_count, BITOP_TYPE op);

void cleanup_cuda_integer_bitop(void **streams, uint32_t *gpu_indexes,
                                uint32_t gpu_count, int8_t **mem_ptr_void);

void scratch_cuda_integer_radix_cmux_kb_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
    uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t lwe_ciphertext_count,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
    bool allocate_gpu_memory);

void cuda_cmux_integer_radix_ciphertext_kb_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    void *lwe_array_out, void *lwe_condition, void *lwe_array_true,
    void *lwe_array_false, int8_t *mem_ptr, void **bsks, void **ksks,
    uint32_t lwe_ciphertext_count);

void cleanup_cuda_integer_radix_cmux(void **streams, uint32_t *gpu_indexes,
                                     uint32_t gpu_count, int8_t **mem_ptr_void);

void scratch_cuda_integer_radix_scalar_rotate_kb_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
    uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
    uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
    bool allocate_gpu_memory);

void cuda_integer_radix_scalar_rotate_kb_64_inplace(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
    uint32_t n, int8_t *mem_ptr, void **bsks, void **ksks, uint32_t num_blocks);

void cleanup_cuda_integer_radix_scalar_rotate(void **streams,
                                              uint32_t *gpu_indexes,
                                              uint32_t gpu_count,
                                              int8_t **mem_ptr_void);

void scratch_cuda_propagate_single_carry_kb_64_inplace(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
    uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory);

void cuda_propagate_single_carry_kb_64_inplace(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
    void *carry_out, int8_t *mem_ptr, void **bsks, void **ksks,
    uint32_t num_blocks);

void cuda_propagate_single_carry_get_input_carries_kb_64_inplace(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
    void *carry_out, void *input_carries, int8_t *mem_ptr, void **bsks,
    void **ksks, uint32_t num_blocks);

void cleanup_cuda_propagate_single_carry(void **streams, uint32_t *gpu_indexes,
                                         uint32_t gpu_count,
                                         int8_t **mem_ptr_void);

void scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t lwe_dimension,
    uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
    uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
    bool allocate_gpu_memory);

void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    void *radix_lwe_out, void *radix_lwe_vec, uint32_t num_radix_in_vec,
    int8_t *mem_ptr, void **bsks, void **ksks, uint32_t num_blocks_in_radix);

void cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr_void);

void scratch_cuda_integer_radix_overflowing_sub_kb_64(
    void **stream, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
    uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory);

void cuda_integer_radix_overflowing_sub_kb_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    void *radix_lwe_out, void *radix_lwe_overflowed, void *radix_lwe_left,
    void *radix_lwe_right, int8_t *mem_ptr, void **bsks, void **ksks,
    uint32_t num_blocks_in_radix);

void cleanup_cuda_integer_radix_overflowing_sub(void **streams,
                                                uint32_t *gpu_indexes,
                                                uint32_t gpu_count,
                                                int8_t **mem_ptr_void);

void scratch_cuda_integer_scalar_mul_kb_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t lwe_dimension,
    uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
    uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_blocks,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
    bool allocate_gpu_memory);

void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
    uint64_t *decomposed_scalar, uint64_t *has_at_least_one_set,
    int8_t *mem_ptr, void **bsks, void **ksks, uint32_t lwe_dimension,
    uint32_t polynomial_size, uint32_t message_modulus, uint32_t num_blocks,
    uint32_t num_scalars);

void cleanup_cuda_integer_radix_scalar_mul(void **streams,
                                           uint32_t *gpu_indexes,
                                           uint32_t gpu_count,
                                           int8_t **mem_ptr_void);

void scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
    uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory);

void cuda_integer_div_rem_radix_ciphertext_kb_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *quotient,
    void *remainder, void *numerator, void *divisor, int8_t *mem_ptr,
    void **bsks, void **ksks, uint32_t num_blocks_in_radix);

void cleanup_cuda_integer_div_rem(void **streams, uint32_t *gpu_indexes,
                                  uint32_t gpu_count, int8_t **mem_ptr_void);

void scratch_cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
    uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t num_blocks, int8_t signed_operation,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
    bool allocate_gpu_memory);

void cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lhs,
    void *rhs, void *overflowed, int8_t signed_operation, int8_t *mem_ptr,
    void **bsks, void **ksks, uint32_t num_blocks_in_radix);

void cleanup_signed_overflowing_add_or_sub(void **streams,
                                           uint32_t *gpu_indexes,
                                           uint32_t gpu_count,
                                           int8_t **mem_ptr_void);

void scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
    void *input_lut, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
    PBS_TYPE pbs_type, bool allocate_gpu_memory);

void cuda_integer_compute_prefix_sum_hillis_steele_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    void *output_radix_lwe, void *generates_or_propagates, int8_t *mem_ptr,
    void **ksks, void **bsks, uint32_t num_blocks, uint32_t shift);

void cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr_void);

void cuda_integer_reverse_blocks_64_inplace(void **streams,
                                            uint32_t *gpu_indexes,
                                            uint32_t gpu_count, void *lwe_array,
                                            uint32_t num_blocks,
                                            uint32_t lwe_size);

} // extern C
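
Since this commit's point is to generate the Rust side of these entry points with bindgen instead of maintaining it by hand, the declarations above map mechanically onto Rust extern "C" items. A hand-written sketch of that mapping for one binary bitop entry point (illustrative only; the generated file may use different type aliases):

// Sketch of the binding bindgen derives from the C declaration above:
// `void **` becomes `*mut *mut c_void`, `uint32_t *` becomes `*mut u32`,
// and `int8_t *` becomes `*mut i8`.
use std::ffi::c_void;

extern "C" {
    pub fn cuda_bitop_integer_radix_ciphertext_kb_64(
        streams: *mut *mut c_void,
        gpu_indexes: *mut u32,
        gpu_count: u32,
        lwe_array_out: *mut c_void,
        lwe_array_1: *mut c_void,
        lwe_array_2: *mut c_void,
        mem_ptr: *mut i8,
        bsks: *mut *mut c_void,
        ksks: *mut *mut c_void,
        lwe_ciphertext_count: u32,
    );
}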
template <typename Torus>
__global__ void radix_blocks_rotate_right(Torus *dst, Torus *src,
                                          uint32_t value, uint32_t blocks_count,
@@ -532,7 +134,7 @@ template <typename Torus> struct int_radix_lut {
  std::vector<Torus *> lwe_after_pbs_vec;
  std::vector<Torus *> lwe_trivial_indexes_vec;

  int_radix_lut(cudaStream_t *streams, uint32_t *gpu_indexes,
  int_radix_lut(cudaStream_t const *streams, uint32_t const *gpu_indexes,
                uint32_t gpu_count, int_radix_params params, uint32_t num_luts,
                uint32_t num_radix_blocks, bool allocate_gpu_memory) {

@@ -638,7 +240,7 @@ template <typename Torus> struct int_radix_lut {
  }

  // constructor to reuse memory
  int_radix_lut(cudaStream_t *streams, uint32_t *gpu_indexes,
  int_radix_lut(cudaStream_t const *streams, uint32_t const *gpu_indexes,
                uint32_t gpu_count, int_radix_params params, uint32_t num_luts,
                uint32_t num_radix_blocks, int_radix_lut *base_lut_object) {

@@ -746,7 +348,7 @@ template <typename Torus> struct int_radix_lut {
  }

  // Broadcast luts from gpu src_gpu_idx to all active gpus
  void broadcast_lut(cudaStream_t *streams, uint32_t *gpu_indexes,
  void broadcast_lut(cudaStream_t const *streams, uint32_t const *gpu_indexes,
                     uint32_t src_gpu_idx) {
    Torus lut_size = (params.glwe_dimension + 1) * params.polynomial_size;

@@ -769,7 +371,7 @@ template <typename Torus> struct int_radix_lut {
    }
  }

  void release(cudaStream_t *streams, uint32_t *gpu_indexes,
  void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
               uint32_t gpu_count) {
    for (uint i = 0; i < active_gpu_count; i++) {
      cuda_drop_async(lut_vec[i], streams[i], gpu_indexes[i]);
@@ -824,10 +426,10 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
  int_radix_lut<Torus> *lut;

  // With offset
  int_bit_extract_luts_buffer(cudaStream_t *streams, uint32_t *gpu_indexes,
                              uint32_t gpu_count, int_radix_params params,
                              uint32_t bits_per_block, uint32_t final_offset,
                              uint32_t num_radix_blocks,
  int_bit_extract_luts_buffer(cudaStream_t const *streams,
                              uint32_t const *gpu_indexes, uint32_t gpu_count,
                              int_radix_params params, uint32_t bits_per_block,
                              uint32_t final_offset, uint32_t num_radix_blocks,
                              bool allocate_gpu_memory) {
    this->params = params;

@@ -898,16 +500,16 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
  }

  // Without offset
  int_bit_extract_luts_buffer(cudaStream_t *streams, uint32_t *gpu_indexes,
                              uint32_t gpu_count, int_radix_params params,
                              uint32_t bits_per_block,
  int_bit_extract_luts_buffer(cudaStream_t const *streams,
                              uint32_t const *gpu_indexes, uint32_t gpu_count,
                              int_radix_params params, uint32_t bits_per_block,
                              uint32_t num_radix_blocks,
                              bool allocate_gpu_memory)
      : int_bit_extract_luts_buffer(streams, gpu_indexes, gpu_count, params,
                                    bits_per_block, 0, num_radix_blocks,
                                    allocate_gpu_memory) {}

  void release(cudaStream_t *streams, uint32_t *gpu_indexes,
  void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
               uint32_t gpu_count) {
    lut->release(streams, gpu_indexes, gpu_count);
    delete (lut);
@@ -933,8 +535,8 @@ template <typename Torus> struct int_shift_and_rotate_buffer {

  Torus offset;

  int_shift_and_rotate_buffer(cudaStream_t *streams, uint32_t *gpu_indexes,
                              uint32_t gpu_count,
  int_shift_and_rotate_buffer(cudaStream_t const *streams,
                              uint32_t const *gpu_indexes, uint32_t gpu_count,
                              SHIFT_OR_ROTATE_TYPE shift_type, bool is_signed,
                              int_radix_params params,
                              uint32_t num_radix_blocks,
@@ -1056,7 +658,7 @@ template <typename Torus> struct int_shift_and_rotate_buffer {
    }
  }

  void release(cudaStream_t *streams, uint32_t *gpu_indexes,
  void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
               uint32_t gpu_count) {
    cuda_drop_async(tmp_bits, streams[0], gpu_indexes[0]);
    cuda_drop_async(tmp_shift_bits, streams[0], gpu_indexes[0]);
@@ -1085,7 +687,7 @@ template <typename Torus> struct int_fullprop_buffer {
  Torus *tmp_small_lwe_vector;
  Torus *tmp_big_lwe_vector;

  int_fullprop_buffer(cudaStream_t *streams, uint32_t *gpu_indexes,
  int_fullprop_buffer(cudaStream_t const *streams, uint32_t const *gpu_indexes,
                      uint32_t gpu_count, int_radix_params params,
                      bool allocate_gpu_memory) {
    this->params = params;
@@ -1142,7 +744,7 @@ template <typename Torus> struct int_fullprop_buffer {
    }
  }

  void release(cudaStream_t *streams, uint32_t *gpu_indexes,
  void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
               uint32_t gpu_count) {

    lut->release(streams, gpu_indexes, 1);
@@ -1165,7 +767,7 @@ template <typename Torus> struct int_sc_prop_memory {

  int_radix_params params;

  int_sc_prop_memory(cudaStream_t *streams, uint32_t *gpu_indexes,
  int_sc_prop_memory(cudaStream_t const *streams, uint32_t const *gpu_indexes,
                     uint32_t gpu_count, int_radix_params params,
                     uint32_t num_radix_blocks, bool allocate_gpu_memory) {
    this->params = params;
@@ -1258,7 +860,7 @@ template <typename Torus> struct int_sc_prop_memory {
    message_acc->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
  }

  void release(cudaStream_t *streams, uint32_t *gpu_indexes,
  void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
               uint32_t gpu_count) {
    cuda_drop_async(generates_or_propagates, streams[0], gpu_indexes[0]);
    cuda_drop_async(step_output, streams[0], gpu_indexes[0]);
@@ -1285,9 +887,9 @@ template <typename Torus> struct int_overflowing_sub_memory {

  int_radix_params params;

  int_overflowing_sub_memory(cudaStream_t *streams, uint32_t *gpu_indexes,
                             uint32_t gpu_count, int_radix_params params,
                             uint32_t num_radix_blocks,
  int_overflowing_sub_memory(cudaStream_t const *streams,
                             uint32_t const *gpu_indexes, uint32_t gpu_count,
                             int_radix_params params, uint32_t num_radix_blocks,
                             bool allocate_gpu_memory) {
    this->params = params;
    auto glwe_dimension = params.glwe_dimension;
@@ -1379,7 +981,7 @@ template <typename Torus> struct int_overflowing_sub_memory {
    message_acc->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
  }

  void release(cudaStream_t *streams, uint32_t *gpu_indexes,
  void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
               uint32_t gpu_count) {
    cuda_drop_async(generates_or_propagates, streams[0], gpu_indexes[0]);
    cuda_drop_async(step_output, streams[0], gpu_indexes[0]);
@@ -1407,7 +1009,8 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {

  bool mem_reuse = false;

  int_sum_ciphertexts_vec_memory(cudaStream_t *streams, uint32_t *gpu_indexes,
  int_sum_ciphertexts_vec_memory(cudaStream_t const *streams,
                                 uint32_t const *gpu_indexes,
                                 uint32_t gpu_count, int_radix_params params,
                                 uint32_t num_blocks_in_radix,
                                 uint32_t max_num_radix_in_vec,
@@ -1460,7 +1063,8 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
                    streams[0], gpu_indexes[0]);
  }

  int_sum_ciphertexts_vec_memory(cudaStream_t *streams, uint32_t *gpu_indexes,
  int_sum_ciphertexts_vec_memory(cudaStream_t const *streams,
                                 uint32_t const *gpu_indexes,
                                 uint32_t gpu_count, int_radix_params params,
                                 uint32_t num_blocks_in_radix,
                                 uint32_t max_num_radix_in_vec,
@@ -1496,7 +1100,7 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
                    streams[0], gpu_indexes[0]);
  }

  void release(cudaStream_t *streams, uint32_t *gpu_indexes,
  void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
               uint32_t gpu_count) {
    cuda_drop_async(d_smart_copy_in, streams[0], gpu_indexes[0]);
    cuda_drop_async(d_smart_copy_out, streams[0], gpu_indexes[0]);
@@ -1523,7 +1127,7 @@ template <typename Torus> struct int_mul_memory {

  int_radix_params params;

  int_mul_memory(cudaStream_t *streams, uint32_t *gpu_indexes,
  int_mul_memory(cudaStream_t const *streams, uint32_t const *gpu_indexes,
                 uint32_t gpu_count, int_radix_params params,
                 uint32_t num_radix_blocks, bool allocate_gpu_memory) {
    this->params = params;
@@ -1597,7 +1201,7 @@ template <typename Torus> struct int_mul_memory {
                   small_lwe_vector);
  }

  void release(cudaStream_t *streams, uint32_t *gpu_indexes,
  void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
               uint32_t gpu_count) {
    cuda_drop_async(vector_result_sb, streams[0], gpu_indexes[0]);
    cuda_drop_async(block_mul_res, streams[0], gpu_indexes[0]);
@@ -1621,7 +1225,8 @@ template <typename Torus> struct int_logical_scalar_shift_buffer {

  bool reuse_memory = false;

  int_logical_scalar_shift_buffer(cudaStream_t *streams, uint32_t *gpu_indexes,
  int_logical_scalar_shift_buffer(cudaStream_t const *streams,
                                  uint32_t const *gpu_indexes,
                                  uint32_t gpu_count,
                                  SHIFT_OR_ROTATE_TYPE shift_type,
                                  int_radix_params params,
@@ -1712,13 +1317,11 @@ template <typename Torus> struct int_logical_scalar_shift_buffer {
    }
  }

  int_logical_scalar_shift_buffer(cudaStream_t *streams, uint32_t *gpu_indexes,
                                  uint32_t gpu_count,
                                  SHIFT_OR_ROTATE_TYPE shift_type,
                                  int_radix_params params,
                                  uint32_t num_radix_blocks,
                                  bool allocate_gpu_memory,
                                  Torus *pre_allocated_buffer) {
  int_logical_scalar_shift_buffer(
      cudaStream_t const *streams, uint32_t const *gpu_indexes,
      uint32_t gpu_count, SHIFT_OR_ROTATE_TYPE shift_type,
      int_radix_params params, uint32_t num_radix_blocks,
      bool allocate_gpu_memory, Torus *pre_allocated_buffer) {
    this->shift_type = shift_type;
    this->params = params;
    tmp_rotated = pre_allocated_buffer;
@@ -1800,7 +1403,7 @@ template <typename Torus> struct int_logical_scalar_shift_buffer {
      }
    }
  }
  void release(cudaStream_t *streams, uint32_t *gpu_indexes,
  void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
               uint32_t gpu_count) {
    for (auto &buffer : lut_buffers_bivariate) {
      buffer->release(streams, gpu_indexes, gpu_count);
@@ -1826,8 +1429,9 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
  cudaStream_t *local_streams_2;
  uint32_t active_gpu_count;

  int_arithmetic_scalar_shift_buffer(cudaStream_t *streams,
                                     uint32_t *gpu_indexes, uint32_t gpu_count,
  int_arithmetic_scalar_shift_buffer(cudaStream_t const *streams,
                                     uint32_t const *gpu_indexes,
                                     uint32_t gpu_count,
                                     SHIFT_OR_ROTATE_TYPE shift_type,
                                     int_radix_params params,
                                     uint32_t num_radix_blocks,
@@ -1971,7 +1575,7 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
    }
  }

  void release(cudaStream_t *streams, uint32_t *gpu_indexes,
  void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
               uint32_t gpu_count) {
    for (uint j = 0; j < active_gpu_count; j++) {
      cuda_destroy_stream(local_streams_1[j], gpu_indexes[j]);
@@ -2004,9 +1608,10 @@ template <typename Torus> struct int_zero_out_if_buffer {
  cudaStream_t *false_streams;
  uint32_t active_gpu_count;

  int_zero_out_if_buffer(cudaStream_t *streams, uint32_t *gpu_indexes,
                         uint32_t gpu_count, int_radix_params params,
                         uint32_t num_radix_blocks, bool allocate_gpu_memory) {
  int_zero_out_if_buffer(cudaStream_t const *streams,
                         uint32_t const *gpu_indexes, uint32_t gpu_count,
                         int_radix_params params, uint32_t num_radix_blocks,
                         bool allocate_gpu_memory) {
    this->params = params;
    active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);

@@ -2025,7 +1630,7 @@ template <typename Torus> struct int_zero_out_if_buffer {
      }
    }
  }
  void release(cudaStream_t *streams, uint32_t *gpu_indexes,
  void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
               uint32_t gpu_count) {
    cuda_drop_async(tmp, streams[0], gpu_indexes[0]);
    for (uint j = 0; j < active_gpu_count; j++) {
@@ -2050,7 +1655,7 @@ template <typename Torus> struct int_cmux_buffer {

  int_radix_params params;

  int_cmux_buffer(cudaStream_t *streams, uint32_t *gpu_indexes,
  int_cmux_buffer(cudaStream_t const *streams, uint32_t const *gpu_indexes,
                  uint32_t gpu_count,
                  std::function<Torus(Torus)> predicate_lut_f,
                  int_radix_params params, uint32_t num_radix_blocks,
@@ -2121,7 +1726,7 @@ template <typename Torus> struct int_cmux_buffer {
    }
  }

  void release(cudaStream_t *streams, uint32_t *gpu_indexes,
  void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
               uint32_t gpu_count) {
    predicate_lut->release(streams, gpu_indexes, gpu_count);
    delete predicate_lut;
@@ -2152,9 +1757,9 @@ template <typename Torus> struct int_are_all_block_true_buffer {
  // value).
  std::unordered_map<int, int_radix_lut<Torus> *> is_equal_to_lut_map;

  int_are_all_block_true_buffer(cudaStream_t *streams, uint32_t *gpu_indexes,
                                uint32_t gpu_count, COMPARISON_TYPE op,
                                int_radix_params params,
  int_are_all_block_true_buffer(cudaStream_t const *streams,
                                uint32_t const *gpu_indexes, uint32_t gpu_count,
                                COMPARISON_TYPE op, int_radix_params params,
                                uint32_t num_radix_blocks,
                                bool allocate_gpu_memory) {
    this->params = params;
@@ -2174,7 +1779,7 @@ template <typename Torus> struct int_are_all_block_true_buffer {
    }
  }

  void release(cudaStream_t *streams, uint32_t *gpu_indexes,
  void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
               uint32_t gpu_count) {
    for (auto &lut : is_equal_to_lut_map) {
      lut.second->release(streams, gpu_indexes, gpu_count);
@@ -2197,9 +1802,10 @@ template <typename Torus> struct int_comparison_eq_buffer {

  int_are_all_block_true_buffer<Torus> *are_all_block_true_buffer;

  int_comparison_eq_buffer(cudaStream_t *streams, uint32_t *gpu_indexes,
                           uint32_t gpu_count, COMPARISON_TYPE op,
                           int_radix_params params, uint32_t num_radix_blocks,
  int_comparison_eq_buffer(cudaStream_t const *streams,
                           uint32_t const *gpu_indexes, uint32_t gpu_count,
                           COMPARISON_TYPE op, int_radix_params params,
                           uint32_t num_radix_blocks,
                           bool allocate_gpu_memory) {
    this->params = params;
    this->op = op;
@@ -2272,7 +1878,7 @@ template <typename Torus> struct int_comparison_eq_buffer {
    }
  }

  void release(cudaStream_t *streams, uint32_t *gpu_indexes,
  void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
               uint32_t gpu_count) {
    operator_lut->release(streams, gpu_indexes, gpu_count);
    delete operator_lut;
@@ -2298,7 +1904,8 @@ template <typename Torus> struct int_tree_sign_reduction_buffer {
  Torus *tmp_x;
  Torus *tmp_y;

  int_tree_sign_reduction_buffer(cudaStream_t *streams, uint32_t *gpu_indexes,
  int_tree_sign_reduction_buffer(cudaStream_t const *streams,
                                 uint32_t const *gpu_indexes,
                                 uint32_t gpu_count,
                                 std::function<Torus(Torus)> operator_f,
                                 int_radix_params params,
@@ -2340,7 +1947,7 @@ template <typename Torus> struct int_tree_sign_reduction_buffer {
    }
  }

  void release(cudaStream_t *streams, uint32_t *gpu_indexes,
  void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
               uint32_t gpu_count) {
    tree_inner_leaf_lut->release(streams, gpu_indexes, gpu_count);
    delete tree_inner_leaf_lut;
@@ -2369,9 +1976,10 @@ template <typename Torus> struct int_comparison_diff_buffer {
  Torus *tmp_signs_b;
  int_radix_lut<Torus> *reduce_signs_lut;

  int_comparison_diff_buffer(cudaStream_t *streams, uint32_t *gpu_indexes,
                             uint32_t gpu_count, COMPARISON_TYPE op,
                             int_radix_params params, uint32_t num_radix_blocks,
  int_comparison_diff_buffer(cudaStream_t const *streams,
                             uint32_t const *gpu_indexes, uint32_t gpu_count,
                             COMPARISON_TYPE op, int_radix_params params,
                             uint32_t num_radix_blocks,
                             bool allocate_gpu_memory) {
    this->params = params;
    this->op = op;
@@ -2415,7 +2023,7 @@ template <typename Torus> struct int_comparison_diff_buffer {
    }
  }

  void release(cudaStream_t *streams, uint32_t *gpu_indexes,
  void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
               uint32_t gpu_count) {
    tree_buffer->release(streams, gpu_indexes, gpu_count);
    delete tree_buffer;
@@ -2463,10 +2071,11 @@ template <typename Torus> struct int_comparison_buffer {
  cudaStream_t *msb_streams;
  uint32_t active_gpu_count;

  int_comparison_buffer(cudaStream_t *streams, uint32_t *gpu_indexes,
                        uint32_t gpu_count, COMPARISON_TYPE op,
                        int_radix_params params, uint32_t num_radix_blocks,
                        bool is_signed, bool allocate_gpu_memory) {
  int_comparison_buffer(cudaStream_t const *streams,
                        uint32_t const *gpu_indexes, uint32_t gpu_count,
                        COMPARISON_TYPE op, int_radix_params params,
                        uint32_t num_radix_blocks, bool is_signed,
                        bool allocate_gpu_memory) {
    this->params = params;
    this->op = op;
    this->is_signed = is_signed;
@@ -2610,7 +2219,7 @@ template <typename Torus> struct int_comparison_buffer {
    }
  }

  void release(cudaStream_t *streams, uint32_t *gpu_indexes,
  void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
               uint32_t gpu_count) {
    switch (op) {
    case COMPARISON_TYPE::MAX:
@@ -2701,8 +2310,9 @@ template <typename Torus> struct int_div_rem_memory {

  // allocate and initialize if needed, temporary arrays used to calculate
  // cuda integer div_rem operation
  void init_temporary_buffers(cudaStream_t *streams, uint32_t *gpu_indexes,
                              uint32_t gpu_count, uint32_t num_blocks) {
  void init_temporary_buffers(cudaStream_t const *streams,
                              uint32_t const *gpu_indexes, uint32_t gpu_count,
                              uint32_t num_blocks) {
    uint32_t big_lwe_size = params.big_lwe_dimension + 1;

    // non boolean temporary arrays, with `num_blocks` blocks
@@ -2749,8 +2359,9 @@ template <typename Torus> struct int_div_rem_memory {
  }

  // initialize lookup tables for div_rem operation
  void init_lookup_tables(cudaStream_t *streams, uint32_t *gpu_indexes,
                          uint32_t gpu_count, uint32_t num_blocks) {
  void init_lookup_tables(cudaStream_t const *streams,
                          uint32_t const *gpu_indexes, uint32_t gpu_count,
                          uint32_t num_blocks) {
    uint32_t num_bits_in_message = 31 - __builtin_clz(params.message_modulus);

    // create and generate masking_luts_1[] and masking_lut_2[]
@@ -2890,7 +2501,7 @@ template <typename Torus> struct int_div_rem_memory {
    }
  }

  int_div_rem_memory(cudaStream_t *streams, uint32_t *gpu_indexes,
  int_div_rem_memory(cudaStream_t const *streams, uint32_t const *gpu_indexes,
                     uint32_t gpu_count, int_radix_params params,
                     uint32_t num_blocks, bool allocate_gpu_memory) {
    active_gpu_count = get_active_gpu_count(2 * num_blocks, gpu_count);
@@ -2930,7 +2541,7 @@ template <typename Torus> struct int_div_rem_memory {
    }
  }

  void release(cudaStream_t *streams, uint32_t *gpu_indexes,
  void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
               uint32_t gpu_count) {
    uint32_t num_bits_in_message = 31 - __builtin_clz(params.message_modulus);

@@ -3033,9 +2644,9 @@ template <typename Torus> struct int_last_block_inner_propagate_memory {
  int_radix_params params;

  int_last_block_inner_propagate_memory(
      cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
      int_radix_params params, SIGNED_OPERATION op, uint32_t num_radix_blocks,
      bool allocate_gpu_memory) {
      cudaStream_t const *streams, uint32_t const *gpu_indexes,
      uint32_t gpu_count, int_radix_params params, SIGNED_OPERATION op,
      uint32_t num_radix_blocks, bool allocate_gpu_memory) {

    this->params = params;
    auto message_modulus = params.message_modulus;
@@ -3100,7 +2711,7 @@ template <typename Torus> struct int_last_block_inner_propagate_memory {
                                  gpu_indexes[0]);
  }

  void release(cudaStream_t *streams, uint32_t *gpu_indexes,
  void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
               uint32_t gpu_count) {
    last_block_inner_propagation_lut->release(streams, gpu_indexes, gpu_count);
    delete last_block_inner_propagation_lut;
@@ -3114,8 +2725,9 @@ template <typename Torus> struct int_resolve_signed_overflow_memory {

  Torus *x;

  int_resolve_signed_overflow_memory(cudaStream_t *streams,
                                     uint32_t *gpu_indexes, uint32_t gpu_count,
  int_resolve_signed_overflow_memory(cudaStream_t const *streams,
                                     uint32_t const *gpu_indexes,
                                     uint32_t gpu_count,
                                     int_radix_params params,
                                     bool allocate_gpu_memory) {

@@ -3160,7 +2772,7 @@ template <typename Torus> struct int_resolve_signed_overflow_memory {
    resolve_overflow_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
  }

  void release(cudaStream_t *streams, uint32_t *gpu_indexes,
  void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
               uint32_t gpu_count) {
    resolve_overflow_lut->release(streams, gpu_indexes, gpu_count);
    delete resolve_overflow_lut;
@@ -3190,7 +2802,8 @@ template <typename Torus> struct int_signed_overflowing_add_or_sub_memory {

  // allocate temporary arrays used to calculate
  // cuda integer signed overflowing add or sub
  void allocate_temporary_buffers(cudaStream_t *streams, uint32_t *gpu_indexes,
  void allocate_temporary_buffers(cudaStream_t const *streams,
                                  uint32_t const *gpu_indexes,
                                  uint32_t gpu_count, uint32_t num_blocks) {
    uint32_t big_lwe_size = params.big_lwe_dimension + 1;

@@ -3210,9 +2823,9 @@ template <typename Torus> struct int_signed_overflowing_add_or_sub_memory {

  // constructor without memory reuse
  int_signed_overflowing_add_or_sub_memory(
      cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
      int_radix_params params, uint32_t num_blocks, SIGNED_OPERATION op,
      bool allocate_gpu_memory) {
      cudaStream_t const *streams, uint32_t const *gpu_indexes,
      uint32_t gpu_count, int_radix_params params, uint32_t num_blocks,
      SIGNED_OPERATION op, bool allocate_gpu_memory) {
    this->params = params;
    active_gpu_count = get_active_gpu_count(num_blocks, gpu_count);

@@ -3241,7 +2854,7 @@ template <typename Torus> struct int_signed_overflowing_add_or_sub_memory {
        streams, gpu_indexes, gpu_count, params, allocate_gpu_memory);
  }

  void release(cudaStream_t *streams, uint32_t *gpu_indexes,
  void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
               uint32_t gpu_count) {
    // memory objects for other operations
    scp_mem->release(streams, gpu_indexes, gpu_count);
@@ -3273,7 +2886,7 @@ template <typename Torus> struct int_bitop_buffer {
  int_radix_params params;
  int_radix_lut<Torus> *lut;

  int_bitop_buffer(cudaStream_t *streams, uint32_t *gpu_indexes,
  int_bitop_buffer(cudaStream_t const *streams, uint32_t const *gpu_indexes,
                   uint32_t gpu_count, BITOP_TYPE op, int_radix_params params,
                   uint32_t num_radix_blocks, bool allocate_gpu_memory) {

@@ -3337,7 +2950,7 @@ template <typename Torus> struct int_bitop_buffer {
    }
  }

  void release(cudaStream_t *streams, uint32_t *gpu_indexes,
  void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
               uint32_t gpu_count) {
    lut->release(streams, gpu_indexes, gpu_count);
    delete lut;
@@ -3351,9 +2964,10 @@ template <typename Torus> struct int_scalar_mul_buffer {
  Torus *preshifted_buffer;
  Torus *all_shifted_buffer;

  int_scalar_mul_buffer(cudaStream_t *streams, uint32_t *gpu_indexes,
                        uint32_t gpu_count, int_radix_params params,
                        uint32_t num_radix_blocks, bool allocate_gpu_memory) {
  int_scalar_mul_buffer(cudaStream_t const *streams,
                        uint32_t const *gpu_indexes, uint32_t gpu_count,
                        int_radix_params params, uint32_t num_radix_blocks,
                        bool allocate_gpu_memory) {
    this->params = params;

    if (allocate_gpu_memory) {
@@ -3390,7 +3004,7 @@ template <typename Torus> struct int_scalar_mul_buffer {
    }
  }

  void release(cudaStream_t *streams, uint32_t *gpu_indexes,
  void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
               uint32_t gpu_count) {
    sum_ciphertexts_vec_mem->release(streams, gpu_indexes, gpu_count);
    delete sum_ciphertexts_vec_mem;
@@ -3398,4 +3012,4 @@ template <typename Torus> struct int_scalar_mul_buffer {
  }
};

#endif // CUDA_INTEGER_H
#endif // CUDA_INTEGER_UTILITIES_H
@@ -1,21 +1,23 @@
#ifndef CNCRT_KS_H_
#define CNCRT_KS_H_

#include <cstdint>
#include <stdint.h>

extern "C" {

void cuda_keyswitch_lwe_ciphertext_vector_32(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
    void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples);
    void const *lwe_output_indexes, void const *lwe_array_in,
    void const *lwe_input_indexes, void const *ksk, uint32_t lwe_dimension_in,
    uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count,
    uint32_t num_samples);

void cuda_keyswitch_lwe_ciphertext_vector_64(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
    void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples);
    void const *lwe_output_indexes, void const *lwe_array_in,
    void const *lwe_input_indexes, void const *ksk, uint32_t lwe_dimension_in,
    uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count,
    uint32_t num_samples);

void scratch_packing_keyswitch_lwe_list_to_glwe_64(
    void *stream, uint32_t gpu_index, int8_t **fp_ks_buffer,
@@ -23,10 +25,11 @@ void scratch_packing_keyswitch_lwe_list_to_glwe_64(
    bool allocate_gpu_memory);

void cuda_packing_keyswitch_lwe_list_to_glwe_64(
    void *stream, uint32_t gpu_index, void *glwe_array_out, void *lwe_array_in,
    void *fp_ksk_array, int8_t *fp_ks_buffer, uint32_t input_lwe_dimension,
    uint32_t output_glwe_dimension, uint32_t output_polynomial_size,
    uint32_t base_log, uint32_t level_count, uint32_t num_lwes);
    void *stream, uint32_t gpu_index, void *glwe_array_out,
    void const *lwe_array_in, void const *fp_ksk_array, int8_t *fp_ks_buffer,
    uint32_t input_lwe_dimension, uint32_t output_glwe_dimension,
    uint32_t output_polynomial_size, uint32_t base_log, uint32_t level_count,
    uint32_t num_lwes);

void cleanup_packing_keyswitch_lwe_list_to_glwe(void *stream,
                                                uint32_t gpu_index,
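
The substance of this hunk is const-correctness: every input-only pointer gains a const qualifier. On the Rust side, bindgen then emits *const c_void instead of *mut c_void for those parameters, roughly as in this hand-written sketch (names follow the header; the generated type aliases may differ):

use std::ffi::c_void;

extern "C" {
    // Only the output ciphertext array stays mutable; the index arrays,
    // input ciphertexts, and keyswitch key are all const after this change.
    pub fn cuda_keyswitch_lwe_ciphertext_vector_64(
        stream: *mut c_void,
        gpu_index: u32,
        lwe_array_out: *mut c_void,
        lwe_output_indexes: *const c_void,
        lwe_array_in: *const c_void,
        lwe_input_indexes: *const c_void,
        ksk: *const c_void,
        lwe_dimension_in: u32,
        lwe_dimension_out: u32,
        base_log: u32,
        level_count: u32,
        num_samples: u32,
    );
}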
@@ -1,50 +1,48 @@
#ifndef CUDA_LINALG_H_
#define CUDA_LINALG_H_

#include "programmable_bootstrap.h"
#include <cstdint>
#include <device.h>
#include <stdint.h>

extern "C" {

void cuda_negate_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
                                          void *lwe_array_out,
                                          void *lwe_array_in,
                                          void const *lwe_array_in,
                                          uint32_t input_lwe_dimension,
                                          uint32_t input_lwe_ciphertext_count);
void cuda_negate_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
                                          void *lwe_array_out,
                                          void *lwe_array_in,
                                          void const *lwe_array_in,
                                          uint32_t input_lwe_dimension,
                                          uint32_t input_lwe_ciphertext_count);
void cuda_add_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
                                       void *lwe_array_out,
                                       void *lwe_array_in_1,
                                       void *lwe_array_in_2,
                                       void const *lwe_array_in_1,
                                       void const *lwe_array_in_2,
                                       uint32_t input_lwe_dimension,
                                       uint32_t input_lwe_ciphertext_count);
void cuda_add_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
                                       void *lwe_array_out,
                                       void *lwe_array_in_1,
                                       void *lwe_array_in_2,
                                       void const *lwe_array_in_1,
                                       void const *lwe_array_in_2,
                                       uint32_t input_lwe_dimension,
                                       uint32_t input_lwe_ciphertext_count);
void cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
    void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
    void *plaintext_array_in, uint32_t input_lwe_dimension,
    uint32_t input_lwe_ciphertext_count);
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const *lwe_array_in, void const *plaintext_array_in,
    uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count);
void cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
    void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
    void *plaintext_array_in, uint32_t input_lwe_dimension,
    uint32_t input_lwe_ciphertext_count);
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const *lwe_array_in, void const *plaintext_array_in,
    uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count);
void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32(
    void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
    void *cleartext_array_in, uint32_t input_lwe_dimension,
    uint32_t input_lwe_ciphertext_count);
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const *lwe_array_in, void const *cleartext_array_in,
    uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count);
void cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
    void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
    void *cleartext_array_in, uint32_t input_lwe_dimension,
    uint32_t input_lwe_ciphertext_count);
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const *lwe_array_in, void const *cleartext_array_in,
    uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count);
}

#endif // CUDA_LINALG_H_
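
One caller-side payoff of the const qualifiers: a Rust wrapper can take read-only device pointers as *const and forward them without `as *mut _` casts. A hypothetical wrapper over the 64-bit addition entry point (the extern signature mirrors the header above; the wrapper name and shape are illustrative, and all pointers are assumed to be valid device allocations):

use std::ffi::c_void;

extern "C" {
    pub fn cuda_add_lwe_ciphertext_vector_64(
        stream: *mut c_void,
        gpu_index: u32,
        lwe_array_out: *mut c_void,
        lwe_array_in_1: *const c_void,
        lwe_array_in_2: *const c_void,
        input_lwe_dimension: u32,
        input_lwe_ciphertext_count: u32,
    );
}

/// Illustrative wrapper; d_out, d_lhs and d_rhs must each point to
/// (lwe_dimension + 1) * count u64 words of device memory.
pub unsafe fn add_lwe_vectors(
    stream: *mut c_void,
    gpu_index: u32,
    d_out: *mut u64,
    d_lhs: *const u64,
    d_rhs: *const u64,
    lwe_dimension: u32,
    count: u32,
) {
    unsafe {
        cuda_add_lwe_ciphertext_vector_64(
            stream,
            gpu_index,
            d_out as *mut c_void,
            d_lhs as *const c_void, // no mutability cast needed anymore
            d_rhs as *const c_void,
            lwe_dimension,
            count,
        );
    }
}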
7
backends/tfhe-cuda-backend/cuda/include/pbs/pbs_enums.h
Normal file
@@ -0,0 +1,7 @@
#ifndef CUDA_PBS_ENUMS_H
#define CUDA_PBS_ENUMS_H

enum PBS_TYPE { MULTI_BIT = 0, CLASSICAL = 1 };
enum PBS_VARIANT { DEFAULT = 0, CG = 1, TBC = 2 };

#endif // CUDA_PBS_ENUMS_H
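
Factoring the enums into their own header lets the bindings build include just this file. With default settings, bindgen turns plain C enums like these into an integer alias plus constants, approximately as below (a sketch; the crate may configure a different bindgen enum style):

// Approximation of bindgen's default "constified enum" output.
pub type PBS_TYPE = ::std::os::raw::c_uint;
pub const PBS_TYPE_MULTI_BIT: PBS_TYPE = 0;
pub const PBS_TYPE_CLASSICAL: PBS_TYPE = 1;

pub type PBS_VARIANT = ::std::os::raw::c_uint;
pub const PBS_VARIANT_DEFAULT: PBS_VARIANT = 0;
pub const PBS_VARIANT_CG: PBS_VARIANT = 1;
pub const PBS_VARIANT_TBC: PBS_VARIANT = 2;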
@@ -1,38 +1,7 @@
#ifndef CUDA_MULTI_BIT_H
#define CUDA_MULTI_BIT_H
#ifndef CUDA_MULTI_BIT_UTILITIES_H
#define CUDA_MULTI_BIT_UTILITIES_H

#include "programmable_bootstrap.h"
#include <cstdint>

extern "C" {

bool has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t num_samples);

void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
    void *stream, uint32_t gpu_index, void *dest, void *src,
    uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
    uint32_t polynomial_size, uint32_t grouping_factor);

void scratch_cuda_multi_bit_programmable_bootstrap_64(
    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);

void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
    void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
    int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
    uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
    uint32_t lut_stride);

void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
                                                   uint32_t gpu_index,
                                                   int8_t **pbs_buffer);
}
#include "pbs_utilities.h"

template <typename Torus>
bool supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
@@ -53,8 +22,9 @@ void scratch_cuda_tbc_multi_bit_programmable_bootstrap(
template <typename Torus>
void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
    void *stream, uint32_t gpu_index, Torus *lwe_array_out,
    Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
    Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
    Torus const *lwe_output_indexes, Torus const *lut_vector,
    Torus const *lut_vector_indexes, Torus const *lwe_array_in,
    Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
    pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
@@ -70,8 +40,9 @@ void scratch_cuda_cg_multi_bit_programmable_bootstrap(
template <typename Torus>
void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
    void *stream, uint32_t gpu_index, Torus *lwe_array_out,
    Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
    Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
    Torus const *lwe_output_indexes, Torus const *lut_vector,
    Torus const *lut_vector_indexes, Torus const *lwe_array_in,
    Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
    pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
@@ -86,8 +57,9 @@ void scratch_cuda_multi_bit_programmable_bootstrap(
template <typename Torus>
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
    void *stream, uint32_t gpu_index, Torus *lwe_array_out,
    Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
    Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
    Torus const *lwe_output_indexes, Torus const *lut_vector,
    Torus const *lut_vector_indexes, Torus const *lwe_array_in,
    Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
    pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
@@ -121,6 +93,10 @@ template <typename Torus>
uint64_t get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap(
    uint32_t polynomial_size);

template <typename Torus, class params>
uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
                            uint32_t polynomial_size);

template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
  int8_t *d_mem_keybundle = NULL;
  int8_t *d_mem_acc_step_one = NULL;
@@ -288,8 +264,4 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
  }
};

template <typename Torus, class params>
uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
                            uint32_t polynomial_size);

#endif // CUDA_MULTI_BIT_H
#endif // CUDA_MULTI_BIT_UTILITIES_H
@@ -1,87 +1,10 @@
#ifndef CUDA_BOOTSTRAP_H
#define CUDA_BOOTSTRAP_H
#ifndef CUDA_BOOTSTRAP_UTILITIES_H
#define CUDA_BOOTSTRAP_UTILITIES_H

#include "device.h"
#include <cstdint>

enum PBS_TYPE { MULTI_BIT = 0, CLASSICAL = 1 };
enum PBS_VARIANT { DEFAULT = 0, CG = 1, TBC = 2 };

extern "C" {
void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
                                 void *input1, void *input2, void *output,
                                 uint32_t polynomial_size,
                                 uint32_t total_polynomials);

void cuda_convert_lwe_programmable_bootstrap_key_32(
    void *stream, uint32_t gpu_index, void *dest, void *src,
    uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
    uint32_t polynomial_size);

void cuda_convert_lwe_programmable_bootstrap_key_64(
    void *stream, uint32_t gpu_index, void *dest, void *src,
    uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
    uint32_t polynomial_size);

void scratch_cuda_programmable_bootstrap_amortized_32(
    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
    uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);

void scratch_cuda_programmable_bootstrap_amortized_64(
    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
    uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);

void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
    void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
    int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
    uint32_t num_samples);

void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
    void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
    int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
    uint32_t num_samples);

void cleanup_cuda_programmable_bootstrap_amortized(void *stream,
                                                   uint32_t gpu_index,
                                                   int8_t **pbs_buffer);

void scratch_cuda_programmable_bootstrap_32(
    void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);

void scratch_cuda_programmable_bootstrap_64(
    void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);

void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
    void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
    int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
    uint32_t num_samples, uint32_t lut_count, uint32_t lut_stride);

void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
    void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
    int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
    uint32_t num_samples, uint32_t lut_count, uint32_t lut_stride);

void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index,
                                         int8_t **pbs_buffer);
}
#include "pbs_enums.h"
#include "vector_types.h"
#include <stdint.h>

template <typename Torus>
uint64_t get_buffer_size_full_sm_programmable_bootstrap_step_one(
@@ -327,8 +250,9 @@ bool has_support_to_cuda_programmable_bootstrap_cg(uint32_t glwe_dimension,
template <typename Torus>
void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
    void *stream, uint32_t gpu_index, Torus *lwe_array_out,
    Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
    Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
    Torus const *lwe_output_indexes, Torus const *lut_vector,
    Torus const *lut_vector_indexes, Torus const *lwe_array_in,
    Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
    pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
@@ -337,8 +261,9 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
template <typename Torus>
void cuda_programmable_bootstrap_lwe_ciphertext_vector(
    void *stream, uint32_t gpu_index, Torus *lwe_array_out,
    Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
    Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
    Torus const *lwe_output_indexes, Torus const *lut_vector,
    Torus const *lut_vector_indexes, Torus const *lwe_array_in,
    Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
    pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
@@ -348,8 +273,9 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
template <typename Torus>
void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
    void *stream, uint32_t gpu_index, Torus *lwe_array_out,
    Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
    Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
    Torus const *lwe_output_indexes, Torus const *lut_vector,
    Torus const *lut_vector_indexes, Torus const *lwe_array_in,
    Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
    pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
@@ -408,4 +334,4 @@ __device__ const T *get_multi_bit_ith_lwe_gth_group_kth_block(

#endif

#endif // CUDA_BOOTSTRAP_H
#endif // CUDA_BOOTSTRAP_UTILITIES_H
@@ -0,0 +1,86 @@
#ifndef CUDA_BOOTSTRAP_H
#define CUDA_BOOTSTRAP_H

#include "pbs_enums.h"
#include <stdint.h>

extern "C" {
void cuda_fourier_polynomial_mul(void *stream, uint32_t gpu_index,
                                 void const *input1, void const *input2,
                                 void *output, uint32_t polynomial_size,
                                 uint32_t total_polynomials);

void cuda_convert_lwe_programmable_bootstrap_key_32(
    void *stream, uint32_t gpu_index, void *dest, void const *src,
    uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
    uint32_t polynomial_size);

void cuda_convert_lwe_programmable_bootstrap_key_64(
    void *stream, uint32_t gpu_index, void *dest, void const *src,
    uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
    uint32_t polynomial_size);

void scratch_cuda_programmable_bootstrap_amortized_32(
    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
    uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);

void scratch_cuda_programmable_bootstrap_amortized_64(
    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
    uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);

void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const *lwe_output_indexes, void const *lut_vector,
    void const *lut_vector_indexes, void const *lwe_array_in,
    void const *lwe_input_indexes, void const *bootstrapping_key,
    int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
    uint32_t num_samples);

void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const *lwe_output_indexes, void const *lut_vector,
    void const *lut_vector_indexes, void const *lwe_array_in,
    void const *lwe_input_indexes, void const *bootstrapping_key,
    int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
    uint32_t num_samples);

void cleanup_cuda_programmable_bootstrap_amortized(void *stream,
                                                   uint32_t gpu_index,
                                                   int8_t **pbs_buffer);

void scratch_cuda_programmable_bootstrap_32(
    void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);

void scratch_cuda_programmable_bootstrap_64(
    void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);

void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const *lwe_output_indexes, void const *lut_vector,
    void const *lut_vector_indexes, void const *lwe_array_in,
    void const *lwe_input_indexes, void const *bootstrapping_key,
    int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
    uint32_t num_samples, uint32_t lut_count, uint32_t lut_stride);

void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const *lwe_output_indexes, void const *lut_vector,
    void const *lut_vector_indexes, void const *lwe_array_in,
    void const *lwe_input_indexes, void const *bootstrapping_key,
    int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
    uint32_t num_samples, uint32_t lut_count, uint32_t lut_stride);

void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index,
                                         int8_t **pbs_buffer);
}
#endif // CUDA_BOOTSTRAP_H
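
This extern "C" surface keeps the usual scratch/apply/cleanup triple per PBS flavor, and a Rust caller is expected to follow that lifecycle through the generated bindings. A hypothetical driver for the 64-bit classical PBS (device pointers and parameters are assumed to be prepared by the caller; the lut_count/lut_stride values are illustrative):

use std::ffi::c_void;
use std::ptr;

extern "C" {
    pub fn scratch_cuda_programmable_bootstrap_64(
        stream: *mut c_void, gpu_index: u32, buffer: *mut *mut i8,
        glwe_dimension: u32, polynomial_size: u32, level_count: u32,
        input_lwe_ciphertext_count: u32, allocate_gpu_memory: bool,
    );
    pub fn cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
        stream: *mut c_void, gpu_index: u32, lwe_array_out: *mut c_void,
        lwe_output_indexes: *const c_void, lut_vector: *const c_void,
        lut_vector_indexes: *const c_void, lwe_array_in: *const c_void,
        lwe_input_indexes: *const c_void, bootstrapping_key: *const c_void,
        buffer: *mut i8, lwe_dimension: u32, glwe_dimension: u32,
        polynomial_size: u32, base_log: u32, level_count: u32,
        num_samples: u32, lut_count: u32, lut_stride: u32,
    );
    pub fn cleanup_cuda_programmable_bootstrap(
        stream: *mut c_void, gpu_index: u32, pbs_buffer: *mut *mut i8,
    );
}

// Hypothetical driver: allocate scratch, run one batch, free scratch.
pub unsafe fn pbs_64_once(
    stream: *mut c_void, gpu_index: u32, d_out: *mut c_void,
    d_out_idx: *const c_void, d_lut: *const c_void, d_lut_idx: *const c_void,
    d_in: *const c_void, d_in_idx: *const c_void, d_bsk: *const c_void,
    lwe_dim: u32, glwe_dim: u32, poly_size: u32, base_log: u32,
    level_count: u32, num_samples: u32,
) {
    let mut scratch: *mut i8 = ptr::null_mut();
    unsafe {
        scratch_cuda_programmable_bootstrap_64(
            stream, gpu_index, &mut scratch, glwe_dim, poly_size,
            level_count, num_samples, true);
        cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
            stream, gpu_index, d_out, d_out_idx, d_lut, d_lut_idx, d_in,
            d_in_idx, d_bsk, scratch, lwe_dim, glwe_dim, poly_size,
            base_log, level_count, num_samples, 1, 0);
        cleanup_cuda_programmable_bootstrap(stream, gpu_index, &mut scratch);
    }
}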
@@ -0,0 +1,38 @@
|
||||
#ifndef CUDA_MULTI_BIT_H
|
||||
#define CUDA_MULTI_BIT_H
|
||||
|
||||
#include "pbs_enums.h"
|
||||
#include "stdint.h"
|
||||
|
||||
extern "C" {
|
||||
|
||||
bool has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t num_samples);
|
||||
|
||||
void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
|
||||
void *stream, uint32_t gpu_index, void *dest, void const *src,
|
||||
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
|
||||
uint32_t polynomial_size, uint32_t grouping_factor);
|
||||
|
||||
void scratch_cuda_multi_bit_programmable_bootstrap_64(
|
||||
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
|
||||
|
||||
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
void *stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
void const *lwe_output_indexes, void const *lut_vector,
|
||||
void const *lut_vector_indexes, void const *lwe_array_in,
|
||||
void const *lwe_input_indexes, void const *bootstrapping_key,
|
||||
int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
|
||||
uint32_t lut_stride);
|
||||
|
||||
void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
|
||||
uint32_t gpu_index,
|
||||
int8_t **pbs_buffer);
|
||||
}
|
||||
|
||||
#endif // CUDA_MULTI_BIT_H
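
The multi-bit flavour adds one step to the same lifecycle: the bootstrapping key must first be converted to the multi-bit device layout, with `grouping_factor` threaded through both the conversion and the bootstrap itself (and `has_support_to_cuda_programmable_bootstrap_cg_multi_bit` available as a capability probe). Again a hedged sketch; every pointer and parameter value is a placeholder:

// Hypothetical multi-bit flow mirroring the declarations above.
cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
    stream, /*gpu_index=*/0, d_bsk, h_bsk, lwe_dimension, glwe_dimension,
    level_count, polynomial_size, grouping_factor);
int8_t *pbs_buffer = nullptr;
scratch_cuda_multi_bit_programmable_bootstrap_64(
    stream, /*gpu_index=*/0, &pbs_buffer, glwe_dimension, polynomial_size,
    level_count, /*input_lwe_ciphertext_count=*/num_samples,
    /*allocate_gpu_memory=*/true);
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
    stream, /*gpu_index=*/0, d_lwe_out, d_output_indexes, d_lut,
    d_lut_indexes, d_lwe_in, d_input_indexes, d_bsk, pbs_buffer,
    lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
    base_log, level_count, num_samples, /*lut_count=*/1, /*lut_stride=*/0);
cleanup_cuda_multi_bit_programmable_bootstrap(stream, /*gpu_index=*/0,
                                              &pbs_buffer);
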
@@ -22,8 +22,8 @@ void cuda_convert_lwe_ciphertext_vector_to_cpu_64(void *stream,
}

void cuda_glwe_sample_extract_64(void *stream, uint32_t gpu_index,
                                 void *lwe_array_out, void *glwe_array_in,
                                 uint32_t *nth_array, uint32_t num_nths,
                                 void *lwe_array_out, void const *glwe_array_in,
                                 uint32_t const *nth_array, uint32_t num_nths,
                                 uint32_t glwe_dimension,
                                 uint32_t polynomial_size) {

@@ -31,43 +31,43 @@ void cuda_glwe_sample_extract_64(void *stream, uint32_t gpu_index,
  case 256:
    host_sample_extract<uint64_t, AmortizedDegree<256>>(
        static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
        (uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths,
        (uint64_t const *)glwe_array_in, (uint32_t const *)nth_array, num_nths,
        glwe_dimension);
    break;
  case 512:
    host_sample_extract<uint64_t, AmortizedDegree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
        (uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths,
        (uint64_t const *)glwe_array_in, (uint32_t const *)nth_array, num_nths,
        glwe_dimension);
    break;
  case 1024:
    host_sample_extract<uint64_t, AmortizedDegree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
        (uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths,
        (uint64_t const *)glwe_array_in, (uint32_t const *)nth_array, num_nths,
        glwe_dimension);
    break;
  case 2048:
    host_sample_extract<uint64_t, AmortizedDegree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
        (uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths,
        (uint64_t const *)glwe_array_in, (uint32_t const *)nth_array, num_nths,
        glwe_dimension);
    break;
  case 4096:
    host_sample_extract<uint64_t, AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
        (uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths,
        (uint64_t const *)glwe_array_in, (uint32_t const *)nth_array, num_nths,
        glwe_dimension);
    break;
  case 8192:
    host_sample_extract<uint64_t, AmortizedDegree<8192>>(
        static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
        (uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths,
        (uint64_t const *)glwe_array_in, (uint32_t const *)nth_array, num_nths,
        glwe_dimension);
    break;
  case 16384:
    host_sample_extract<uint64_t, AmortizedDegree<16384>>(
        static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
        (uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths,
        (uint64_t const *)glwe_array_in, (uint32_t const *)nth_array, num_nths,
        glwe_dimension);
    break;
  default:

@@ -27,8 +27,9 @@ void cuda_convert_lwe_ciphertext_vector_to_cpu(cudaStream_t stream,
}

template <typename Torus, class params>
__global__ void sample_extract(Torus *lwe_array_out, Torus *glwe_array_in,
                               uint32_t *nth_array, uint32_t glwe_dimension) {
__global__ void sample_extract(Torus *lwe_array_out, Torus const *glwe_array_in,
                               uint32_t const *nth_array,
                               uint32_t glwe_dimension) {

  const int input_id = blockIdx.x;

@@ -50,8 +51,9 @@ __global__ void sample_extract(Torus *lwe_array_out, Torus *glwe_array_in,

template <typename Torus, class params>
__host__ void host_sample_extract(cudaStream_t stream, uint32_t gpu_index,
                                  Torus *lwe_array_out, Torus *glwe_array_in,
                                  uint32_t *nth_array, uint32_t num_nths,
                                  Torus *lwe_array_out,
                                  Torus const *glwe_array_in,
                                  uint32_t const *nth_array, uint32_t num_nths,
                                  uint32_t glwe_dimension) {
  cudaSetDevice(gpu_index);


@@ -37,16 +37,18 @@ void cuda_keyswitch_lwe_ciphertext_vector_32(
 */
void cuda_keyswitch_lwe_ciphertext_vector_64(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
    void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
    void const *lwe_output_indexes, void const *lwe_array_in,
    void const *lwe_input_indexes, void const *ksk, uint32_t lwe_dimension_in,
    uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count,
    uint32_t num_samples) {
  host_keyswitch_lwe_ciphertext_vector<uint64_t>(
      static_cast<cudaStream_t>(stream), gpu_index,
      static_cast<uint64_t *>(lwe_array_out),
      static_cast<uint64_t *>(lwe_output_indexes),
      static_cast<uint64_t *>(lwe_array_in),
      static_cast<uint64_t *>(lwe_input_indexes), static_cast<uint64_t *>(ksk),
      lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples);
      static_cast<const uint64_t *>(lwe_output_indexes),
      static_cast<const uint64_t *>(lwe_array_in),
      static_cast<const uint64_t *>(lwe_input_indexes),
      static_cast<const uint64_t *>(ksk), lwe_dimension_in, lwe_dimension_out,
      base_log, level_count, num_samples);
}

void scratch_packing_keyswitch_lwe_list_to_glwe_64(
@@ -61,18 +63,19 @@ void scratch_packing_keyswitch_lwe_list_to_glwe_64(
 * ciphertexts.
 */
void cuda_packing_keyswitch_lwe_list_to_glwe_64(
    void *stream, uint32_t gpu_index, void *glwe_array_out, void *lwe_array_in,
    void *fp_ksk_array, int8_t *fp_ks_buffer, uint32_t input_lwe_dimension,
    uint32_t output_glwe_dimension, uint32_t output_polynomial_size,
    uint32_t base_log, uint32_t level_count, uint32_t num_lwes) {
    void *stream, uint32_t gpu_index, void *glwe_array_out,
    void const *lwe_array_in, void const *fp_ksk_array, int8_t *fp_ks_buffer,
    uint32_t input_lwe_dimension, uint32_t output_glwe_dimension,
    uint32_t output_polynomial_size, uint32_t base_log, uint32_t level_count,
    uint32_t num_lwes) {

  host_packing_keyswitch_lwe_list_to_glwe<uint64_t>(
      static_cast<cudaStream_t>(stream), gpu_index,
      static_cast<uint64_t *>(glwe_array_out),
      static_cast<uint64_t *>(lwe_array_in),
      static_cast<uint64_t *>(fp_ksk_array), fp_ks_buffer, input_lwe_dimension,
      output_glwe_dimension, output_polynomial_size, base_log, level_count,
      num_lwes);
      static_cast<const uint64_t *>(lwe_array_in),
      static_cast<const uint64_t *>(fp_ksk_array), fp_ks_buffer,
      input_lwe_dimension, output_glwe_dimension, output_polynomial_size,
      base_log, level_count, num_lwes);
}

void cleanup_packing_keyswitch_lwe_list_to_glwe(void *stream,

@@ -101,9 +101,10 @@ keyswitch(Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
template <typename Torus>
__host__ void host_keyswitch_lwe_ciphertext_vector(
    cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
    Torus *lwe_output_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
    Torus *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
    Torus const *lwe_output_indexes, Torus const *lwe_array_in,
    Torus const *lwe_input_indexes, Torus const *ksk, uint32_t lwe_dimension_in,
    uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count,
    uint32_t num_samples) {

  cudaSetDevice(gpu_index);

@@ -124,13 +125,13 @@ __host__ void host_keyswitch_lwe_ciphertext_vector(
}

template <typename Torus>
void execute_keyswitch_async(cudaStream_t *streams, uint32_t *gpu_indexes,
                             uint32_t gpu_count,
void execute_keyswitch_async(cudaStream_t const *streams,
                             uint32_t const *gpu_indexes, uint32_t gpu_count,
                             const LweArrayVariant<Torus> &lwe_array_out,
                             const LweArrayVariant<Torus> &lwe_output_indexes,
                             const LweArrayVariant<Torus> &lwe_array_in,
                             const LweArrayVariant<Torus> &lwe_input_indexes,
                             Torus **ksks, uint32_t lwe_dimension_in,
                             Torus *const *ksks, uint32_t lwe_dimension_in,
                             uint32_t lwe_dimension_out, uint32_t base_log,
                             uint32_t level_count, uint32_t num_samples) {
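
Since every hunk in this commit applies the same const-qualification pattern, it is worth spelling out how the new declarators read. The following cheat sheet is illustrative and not part of the patch; the variable names are placeholders:

// Reading the const-qualified declarators, innermost first:
void const *in;          // pointer to read-only bytes (same as const void *)
void *const *streams;    // read-only array of void* handles; the objects the
                         // handles point to stay mutable
cudaStream_t const *s;   // read-only array of stream handles
Torus const *ct;         // pointer to read-only Torus elements
Torus *const *ksks;      // read-only array of Torus* (the key pointers cannot
                         // be reseated; the pointed-to data type stays mutable)
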

@@ -176,9 +177,9 @@ __host__ void scratch_packing_keyswitch_lwe_list_to_glwe(
// different thread blocks at the x-axis to work on that input.
template <typename Torus>
__device__ void packing_keyswitch_lwe_ciphertext_into_glwe_ciphertext(
    Torus *glwe_out, Torus *lwe_in, Torus *fp_ksk, uint32_t lwe_dimension_in,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count) {
    Torus *glwe_out, Torus const *lwe_in, Torus const *fp_ksk,
    uint32_t lwe_dimension_in, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count) {

  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
  size_t glwe_size = (glwe_dimension + 1);
@@ -225,12 +226,11 @@ __device__ void packing_keyswitch_lwe_ciphertext_into_glwe_ciphertext(
// Assumes there are (glwe_dimension+1) * polynomial_size threads split through
// different thread blocks at the x-axis to work on that input.
template <typename Torus>
__global__ void
packing_keyswitch_lwe_list_to_glwe(Torus *glwe_array_out, Torus *lwe_array_in,
                                   Torus *fp_ksk, uint32_t lwe_dimension_in,
                                   uint32_t glwe_dimension,
                                   uint32_t polynomial_size, uint32_t base_log,
                                   uint32_t level_count, Torus *d_mem) {
__global__ void packing_keyswitch_lwe_list_to_glwe(
    Torus *glwe_array_out, Torus const *lwe_array_in, Torus const *fp_ksk,
    uint32_t lwe_dimension_in, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
    Torus *d_mem) {
  const int tid = threadIdx.x + blockIdx.x * blockDim.x;

  const int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size;
@@ -276,7 +276,7 @@ __global__ void accumulate_glwes(Torus *glwe_out, Torus *glwe_array_in,
template <typename Torus>
__host__ void host_packing_keyswitch_lwe_list_to_glwe(
    cudaStream_t stream, uint32_t gpu_index, Torus *glwe_out,
    Torus *lwe_array_in, Torus *fp_ksk_array, int8_t *fp_ks_buffer,
    Torus const *lwe_array_in, Torus const *fp_ksk_array, int8_t *fp_ks_buffer,
    uint32_t lwe_dimension_in, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
    uint32_t num_lwes) {

@@ -113,7 +113,7 @@ void cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
}

/// Copy memory within a GPU asynchronously
void cuda_memcpy_async_gpu_to_gpu(void *dest, void *src, uint64_t size,
void cuda_memcpy_async_gpu_to_gpu(void *dest, void const *src, uint64_t size,
                                  cudaStream_t stream, uint32_t gpu_index) {
  if (size == 0)
    return;
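
The `const` added to `src` here matches the CUDA runtime primitive underneath, whose source argument is already read-only. For reference (CUDA runtime API, not this patch):

// cudaError_t cudaMemcpyAsync(void *dst, const void *src, size_t count,
//                             cudaMemcpyKind kind, cudaStream_t stream);
// A const-qualified wrapper src can therefore be forwarded without a cast.
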

@@ -1,8 +1,8 @@
#include "integer/addition.cuh"

void scratch_cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
    uint32_t glwe_dimension, uint32_t polynomial_size,
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t num_blocks, int8_t signed_operation,
@@ -23,9 +23,10 @@ void scratch_cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64(
}

void cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lhs,
    void *rhs, void *overflowed, int8_t signed_operation, int8_t *mem_ptr,
    void **bsks, void **ksks, uint32_t num_blocks) {
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    void *lhs, void const *rhs, void *overflowed, int8_t signed_operation,
    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
    uint32_t num_blocks) {

  auto mem = (int_signed_overflowing_add_or_sub_memory<uint64_t> *)mem_ptr;
  SIGNED_OPERATION op = (signed_operation == 1) ? SIGNED_OPERATION::ADDITION
@@ -33,13 +34,13 @@ void cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64(

  host_integer_signed_overflowing_add_or_sub_kb<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      static_cast<uint64_t *>(lhs), static_cast<uint64_t *>(rhs),
      static_cast<uint64_t *>(overflowed), op, bsks, (uint64_t **)(ksks), mem,
      num_blocks);
      static_cast<uint64_t *>(lhs), static_cast<uint64_t const *>(rhs),
      static_cast<uint64_t *>(overflowed), op, bsks, (uint64_t *const *)(ksks),
      mem, num_blocks);
}

void cleanup_signed_overflowing_add_or_sub(void **streams,
                                           uint32_t *gpu_indexes,
void cleanup_signed_overflowing_add_or_sub(void *const *streams,
                                           uint32_t const *gpu_indexes,
                                           uint32_t gpu_count,
                                           int8_t **mem_ptr_void) {
  int_signed_overflowing_add_or_sub_memory<uint64_t> *mem_ptr =

@@ -3,13 +3,13 @@

#include "crypto/keyswitch.cuh"
#include "device.h"
#include "integer.h"
#include "integer/comparison.cuh"
#include "integer/integer.cuh"
#include "integer/integer_utilities.h"
#include "integer/negation.cuh"
#include "integer/scalar_shifts.cuh"
#include "linear_algebra.h"
#include "programmable_bootstrap.h"
#include "pbs/programmable_bootstrap.h"
#include "utils/helper.cuh"
#include "utils/kernel_dimensions.cuh"
#include <fstream>
@@ -20,10 +20,11 @@

template <typename Torus>
void host_resolve_signed_overflow(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    Torus *result, Torus *last_block_inner_propagation,
    Torus *last_block_input_carry, Torus *last_block_output_carry,
    int_resolve_signed_overflow_memory<Torus> *mem, void **bsks, Torus **ksks) {
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, Torus *result, Torus *last_block_inner_propagation,
    Torus const *last_block_input_carry, Torus *last_block_output_carry,
    int_resolve_signed_overflow_memory<Torus> *mem, void *const *bsks,
    Torus *const *ksks) {

  auto x = mem->x;

@@ -53,7 +54,8 @@ void host_resolve_signed_overflow(

template <typename Torus>
__host__ void scratch_cuda_integer_signed_overflowing_add_or_sub_kb(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count,
    int_signed_overflowing_add_or_sub_memory<Torus> **mem_ptr,
    uint32_t num_blocks, SIGNED_OPERATION op, int_radix_params params,
    bool allocate_gpu_memory) {
@@ -69,9 +71,9 @@ __host__ void scratch_cuda_integer_signed_overflowing_add_or_sub_kb(
 */
template <typename Torus>
__host__ void host_integer_signed_overflowing_add_or_sub_kb(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    Torus *lhs, Torus *rhs, Torus *overflowed, SIGNED_OPERATION op, void **bsks,
    uint64_t **ksks,
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, Torus *lhs, Torus const *rhs, Torus *overflowed,
    SIGNED_OPERATION op, void *const *bsks, uint64_t *const *ksks,
    int_signed_overflowing_add_or_sub_memory<uint64_t> *mem_ptr,
    uint32_t num_blocks) {


@@ -1,8 +1,8 @@
#include "integer/bitwise_ops.cuh"

void scratch_cuda_integer_radix_bitop_kb_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
    uint32_t glwe_dimension, uint32_t polynomial_size,
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t lwe_ciphertext_count,
@@ -21,21 +21,23 @@ void scratch_cuda_integer_radix_bitop_kb_64(
}

void cuda_bitop_integer_radix_ciphertext_kb_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    void *lwe_array_out, void *lwe_array_1, void *lwe_array_2, int8_t *mem_ptr,
    void **bsks, void **ksks, uint32_t lwe_ciphertext_count) {
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    void *lwe_array_out, void const *lwe_array_1, void const *lwe_array_2,
    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
    uint32_t lwe_ciphertext_count) {

  host_integer_radix_bitop_kb<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      static_cast<uint64_t *>(lwe_array_out),
      static_cast<uint64_t *>(lwe_array_1),
      static_cast<uint64_t *>(lwe_array_2),
      static_cast<const uint64_t *>(lwe_array_1),
      static_cast<const uint64_t *>(lwe_array_2),
      (int_bitop_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
      lwe_ciphertext_count);
}

void cleanup_cuda_integer_bitop(void **streams, uint32_t *gpu_indexes,
                                uint32_t gpu_count, int8_t **mem_ptr_void) {
void cleanup_cuda_integer_bitop(void *const *streams,
                                uint32_t const *gpu_indexes, uint32_t gpu_count,
                                int8_t **mem_ptr_void) {

  int_bitop_buffer<uint64_t> *mem_ptr =
      (int_bitop_buffer<uint64_t> *)(*mem_ptr_void);

@@ -4,7 +4,7 @@
#include "crypto/keyswitch.cuh"
#include "device.h"
#include "integer.cuh"
#include "integer.h"
#include "integer/integer_utilities.h"
#include "pbs/programmable_bootstrap_classic.cuh"
#include "pbs/programmable_bootstrap_multibit.cuh"
#include "polynomial/functions.cuh"
@@ -12,12 +12,11 @@
#include <omp.h>

template <typename Torus>
__host__ void
host_integer_radix_bitop_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
                            uint32_t gpu_count, Torus *lwe_array_out,
                            Torus *lwe_array_1, Torus *lwe_array_2,
                            int_bitop_buffer<Torus> *mem_ptr, void **bsks,
                            Torus **ksks, uint32_t num_radix_blocks) {
__host__ void host_integer_radix_bitop_kb(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_1,
    Torus const *lwe_array_2, int_bitop_buffer<Torus> *mem_ptr,
    void *const *bsks, Torus *const *ksks, uint32_t num_radix_blocks) {

  auto lut = mem_ptr->lut;

@@ -28,9 +27,10 @@ host_integer_radix_bitop_kb(cudaStream_t *streams, uint32_t *gpu_indexes,

template <typename Torus>
__host__ void scratch_cuda_integer_radix_bitop_kb(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    int_bitop_buffer<Torus> **mem_ptr, uint32_t num_radix_blocks,
    int_radix_params params, BITOP_TYPE op, bool allocate_gpu_memory) {
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, int_bitop_buffer<Torus> **mem_ptr,
    uint32_t num_radix_blocks, int_radix_params params, BITOP_TYPE op,
    bool allocate_gpu_memory) {

  *mem_ptr =
      new int_bitop_buffer<Torus>(streams, gpu_indexes, gpu_count, op, params,

@@ -1,8 +1,8 @@
#include "integer/cmux.cuh"

void scratch_cuda_integer_radix_cmux_kb_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
    uint32_t glwe_dimension, uint32_t polynomial_size,
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t lwe_ciphertext_count,
@@ -24,23 +24,24 @@ void scratch_cuda_integer_radix_cmux_kb_64(
}

void cuda_cmux_integer_radix_ciphertext_kb_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    void *lwe_array_out, void *lwe_condition, void *lwe_array_true,
    void *lwe_array_false, int8_t *mem_ptr, void **bsks, void **ksks,
    uint32_t lwe_ciphertext_count) {
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    void *lwe_array_out, void const *lwe_condition, void const *lwe_array_true,
    void const *lwe_array_false, int8_t *mem_ptr, void *const *bsks,
    void *const *ksks, uint32_t lwe_ciphertext_count) {

  host_integer_radix_cmux_kb<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      static_cast<uint64_t *>(lwe_array_out),
      static_cast<uint64_t *>(lwe_condition),
      static_cast<uint64_t *>(lwe_array_true),
      static_cast<uint64_t *>(lwe_array_false),
      static_cast<const uint64_t *>(lwe_condition),
      static_cast<const uint64_t *>(lwe_array_true),
      static_cast<const uint64_t *>(lwe_array_false),
      (int_cmux_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),

      lwe_ciphertext_count);
}

void cleanup_cuda_integer_radix_cmux(void **streams, uint32_t *gpu_indexes,
void cleanup_cuda_integer_radix_cmux(void *const *streams,
                                     uint32_t const *gpu_indexes,
                                     uint32_t gpu_count,
                                     int8_t **mem_ptr_void) {


@@ -4,12 +4,13 @@
#include "integer.cuh"

template <typename Torus>
__host__ void zero_out_if(cudaStream_t *streams, uint32_t *gpu_indexes,
                          uint32_t gpu_count, Torus *lwe_array_out,
                          Torus *lwe_array_input, Torus *lwe_condition,
__host__ void zero_out_if(cudaStream_t const *streams,
                          uint32_t const *gpu_indexes, uint32_t gpu_count,
                          Torus *lwe_array_out, Torus const *lwe_array_input,
                          Torus const *lwe_condition,
                          int_zero_out_if_buffer<Torus> *mem_ptr,
                          int_radix_lut<Torus> *predicate, void **bsks,
                          Torus **ksks, uint32_t num_radix_blocks) {
                          int_radix_lut<Torus> *predicate, void *const *bsks,
                          Torus *const *ksks, uint32_t num_radix_blocks) {
  cudaSetDevice(gpu_indexes[0]);
  auto params = mem_ptr->params;

@@ -42,10 +43,11 @@ __host__ void zero_out_if(cudaStream_t *streams, uint32_t *gpu_indexes,

template <typename Torus>
__host__ void host_integer_radix_cmux_kb(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    Torus *lwe_array_out, Torus *lwe_condition, Torus *lwe_array_true,
    Torus *lwe_array_false, int_cmux_buffer<Torus> *mem_ptr, void **bsks,
    Torus **ksks, uint32_t num_radix_blocks) {
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_condition,
    Torus const *lwe_array_true, Torus const *lwe_array_false,
    int_cmux_buffer<Torus> *mem_ptr, void *const *bsks, Torus *const *ksks,
    uint32_t num_radix_blocks) {

  auto params = mem_ptr->params;

@@ -89,8 +91,8 @@ __host__ void host_integer_radix_cmux_kb(

template <typename Torus>
__host__ void scratch_cuda_integer_radix_cmux_kb(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    int_cmux_buffer<Torus> **mem_ptr,
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, int_cmux_buffer<Torus> **mem_ptr,
    std::function<Torus(Torus)> predicate_lut_f, uint32_t num_radix_blocks,
    int_radix_params params, bool allocate_gpu_memory) {


@@ -1,8 +1,8 @@
#include "integer/comparison.cuh"

void scratch_cuda_integer_radix_comparison_kb_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
    uint32_t glwe_dimension, uint32_t polynomial_size,
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t num_radix_blocks,
@@ -37,9 +37,10 @@ void scratch_cuda_integer_radix_comparison_kb_64(
}

void cuda_comparison_integer_radix_ciphertext_kb_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    void *lwe_array_out, void *lwe_array_1, void *lwe_array_2, int8_t *mem_ptr,
    void **bsks, void **ksks, uint32_t num_radix_blocks) {
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    void *lwe_array_out, void const *lwe_array_1, void const *lwe_array_2,
    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
    uint32_t num_radix_blocks) {

  int_comparison_buffer<uint64_t> *buffer =
      (int_comparison_buffer<uint64_t> *)mem_ptr;
@@ -49,9 +50,9 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
    host_integer_radix_equality_check_kb<uint64_t>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(lwe_array_out),
        static_cast<uint64_t *>(lwe_array_1),
        static_cast<uint64_t *>(lwe_array_2), buffer, bsks, (uint64_t **)(ksks),
        num_radix_blocks);
        static_cast<const uint64_t *>(lwe_array_1),
        static_cast<const uint64_t *>(lwe_array_2), buffer, bsks,
        (uint64_t **)(ksks), num_radix_blocks);
    break;
  case GT:
  case GE:
@@ -60,8 +61,8 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
    host_integer_radix_difference_check_kb<uint64_t>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(lwe_array_out),
        static_cast<uint64_t *>(lwe_array_1),
        static_cast<uint64_t *>(lwe_array_2), buffer,
        static_cast<const uint64_t *>(lwe_array_1),
        static_cast<const uint64_t *>(lwe_array_2), buffer,
        buffer->diff_buffer->operator_f, bsks, (uint64_t **)(ksks),
        num_radix_blocks);
    break;
@@ -70,16 +71,17 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
    host_integer_radix_maxmin_kb<uint64_t>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(lwe_array_out),
        static_cast<uint64_t *>(lwe_array_1),
        static_cast<uint64_t *>(lwe_array_2), buffer, bsks, (uint64_t **)(ksks),
        num_radix_blocks);
        static_cast<const uint64_t *>(lwe_array_1),
        static_cast<const uint64_t *>(lwe_array_2), buffer, bsks,
        (uint64_t **)(ksks), num_radix_blocks);
    break;
  default:
    PANIC("Cuda error: integer operation not supported")
  }
}

void cleanup_cuda_integer_comparison(void **streams, uint32_t *gpu_indexes,
void cleanup_cuda_integer_comparison(void *const *streams,
                                     uint32_t const *gpu_indexes,
                                     uint32_t gpu_count,
                                     int8_t **mem_ptr_void) {


@@ -4,8 +4,8 @@
#include "crypto/keyswitch.cuh"
#include "device.h"
#include "integer.cuh"
#include "integer.h"
#include "integer/cmux.cuh"
#include "integer/integer_utilities.h"
#include "integer/negation.cuh"
#include "integer/scalar_addition.cuh"
#include "pbs/programmable_bootstrap_classic.cuh"
@@ -16,9 +16,9 @@
// lwe_dimension + 1 threads
// todo: This kernel MUST be refactored to a binary reduction
template <typename Torus>
__global__ void device_accumulate_all_blocks(Torus *output, Torus *input_block,
                                             uint32_t lwe_dimension,
                                             uint32_t num_blocks) {
__global__ void
device_accumulate_all_blocks(Torus *output, Torus const *input_block,
                             uint32_t lwe_dimension, uint32_t num_blocks) {
  int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < lwe_dimension + 1) {
    auto block = &input_block[idx];
@@ -34,7 +34,7 @@ __global__ void device_accumulate_all_blocks(Torus *output, Torus *input_block,

template <typename Torus>
__host__ void accumulate_all_blocks(cudaStream_t stream, uint32_t gpu_index,
                                    Torus *output, Torus *input,
                                    Torus *output, Torus const *input,
                                    uint32_t lwe_dimension,
                                    uint32_t num_radix_blocks) {

@@ -57,10 +57,10 @@ __host__ void accumulate_all_blocks(cudaStream_t stream, uint32_t gpu_index,
 */
template <typename Torus>
__host__ void are_all_comparisons_block_true(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    Torus *lwe_array_out, Torus *lwe_array_in,
    int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
    uint32_t num_radix_blocks) {
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, Torus *lwe_array_out, Torus *lwe_array_in,
    int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
    Torus *const *ksks, uint32_t num_radix_blocks) {

  auto params = mem_ptr->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
@@ -159,10 +159,10 @@ __host__ void are_all_comparisons_block_true(
 */
template <typename Torus>
__host__ void is_at_least_one_comparisons_block_true(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    Torus *lwe_array_out, Torus *lwe_array_in,
    int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
    uint32_t num_radix_blocks) {
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, Torus *lwe_array_out, Torus *lwe_array_in,
    int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
    Torus *const *ksks, uint32_t num_radix_blocks) {

  auto params = mem_ptr->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
@@ -239,10 +239,11 @@ __host__ void is_at_least_one_comparisons_block_true(
// are_all_comparisons_block_true
template <typename Torus>
__host__ void host_compare_with_zero_equality(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    Torus *lwe_array_out, Torus *lwe_array_in,
    int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
    int32_t num_radix_blocks, int_radix_lut<Torus> *zero_comparison) {
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_in,
    int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
    Torus *const *ksks, int32_t num_radix_blocks,
    int_radix_lut<Torus> *zero_comparison) {

  auto params = mem_ptr->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
@@ -301,10 +302,10 @@ __host__ void host_compare_with_zero_equality(

template <typename Torus>
__host__ void host_integer_radix_equality_check_kb(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    Torus *lwe_array_out, Torus *lwe_array_1, Torus *lwe_array_2,
    int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
    uint32_t num_radix_blocks) {
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_1,
    Torus const *lwe_array_2, int_comparison_buffer<Torus> *mem_ptr,
    void *const *bsks, Torus *const *ksks, uint32_t num_radix_blocks) {

  auto eq_buffer = mem_ptr->eq_buffer;

@@ -325,12 +326,11 @@ __host__ void host_integer_radix_equality_check_kb(
}

template <typename Torus>
__host__ void
compare_radix_blocks_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
                        uint32_t gpu_count, Torus *lwe_array_out,
                        Torus *lwe_array_left, Torus *lwe_array_right,
                        int_comparison_buffer<Torus> *mem_ptr, void **bsks,
                        Torus **ksks, uint32_t num_radix_blocks) {
__host__ void compare_radix_blocks_kb(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_left,
    Torus const *lwe_array_right, int_comparison_buffer<Torus> *mem_ptr,
    void *const *bsks, Torus *const *ksks, uint32_t num_radix_blocks) {

  auto params = mem_ptr->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
@@ -374,13 +374,12 @@ compare_radix_blocks_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
// (inferior, equal, superior) to one single shortint block containing the
// final sign
template <typename Torus>
__host__ void
tree_sign_reduction(cudaStream_t *streams, uint32_t *gpu_indexes,
                    uint32_t gpu_count, Torus *lwe_array_out,
                    Torus *lwe_block_comparisons,
                    int_tree_sign_reduction_buffer<Torus> *tree_buffer,
                    std::function<Torus(Torus)> sign_handler_f, void **bsks,
                    Torus **ksks, uint32_t num_radix_blocks) {
__host__ void tree_sign_reduction(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, Torus *lwe_array_out, Torus *lwe_block_comparisons,
    int_tree_sign_reduction_buffer<Torus> *tree_buffer,
    std::function<Torus(Torus)> sign_handler_f, void *const *bsks,
    Torus *const *ksks, uint32_t num_radix_blocks) {

  auto params = tree_buffer->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
@@ -462,11 +461,11 @@ tree_sign_reduction(cudaStream_t *streams, uint32_t *gpu_indexes,

template <typename Torus>
__host__ void host_integer_radix_difference_check_kb(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    Torus *lwe_array_out, Torus *lwe_array_left, Torus *lwe_array_right,
    int_comparison_buffer<Torus> *mem_ptr,
    std::function<Torus(Torus)> reduction_lut_f, void **bsks, Torus **ksks,
    uint32_t num_radix_blocks) {
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_left,
    Torus const *lwe_array_right, int_comparison_buffer<Torus> *mem_ptr,
    std::function<Torus(Torus)> reduction_lut_f, void *const *bsks,
    Torus *const *ksks, uint32_t num_radix_blocks) {

  auto diff_buffer = mem_ptr->diff_buffer;

@@ -477,8 +476,8 @@ __host__ void host_integer_radix_difference_check_kb(
  auto carry_modulus = params.carry_modulus;

  uint32_t packed_num_radix_blocks = num_radix_blocks;
  auto lhs = lwe_array_left;
  auto rhs = lwe_array_right;
  Torus *lhs = (Torus *)lwe_array_left;
  Torus *rhs = (Torus *)lwe_array_right;
  if (carry_modulus >= message_modulus) {
    // Packing is possible
    // Pack inputs
@@ -586,10 +585,10 @@ __host__ void host_integer_radix_difference_check_kb(

template <typename Torus>
__host__ void scratch_cuda_integer_radix_comparison_check_kb(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    int_comparison_buffer<Torus> **mem_ptr, uint32_t num_radix_blocks,
    int_radix_params params, COMPARISON_TYPE op, bool is_signed,
    bool allocate_gpu_memory) {
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, int_comparison_buffer<Torus> **mem_ptr,
    uint32_t num_radix_blocks, int_radix_params params, COMPARISON_TYPE op,
    bool is_signed, bool allocate_gpu_memory) {

  *mem_ptr = new int_comparison_buffer<Torus>(streams, gpu_indexes, gpu_count,
                                              op, params, num_radix_blocks,
@@ -597,12 +596,11 @@ __host__ void scratch_cuda_integer_radix_comparison_check_kb(
}

template <typename Torus>
__host__ void
host_integer_radix_maxmin_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
                             uint32_t gpu_count, Torus *lwe_array_out,
                             Torus *lwe_array_left, Torus *lwe_array_right,
                             int_comparison_buffer<Torus> *mem_ptr, void **bsks,
                             Torus **ksks, uint32_t total_num_radix_blocks) {
__host__ void host_integer_radix_maxmin_kb(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_left,
    Torus const *lwe_array_right, int_comparison_buffer<Torus> *mem_ptr,
    void *const *bsks, Torus *const *ksks, uint32_t total_num_radix_blocks) {

  // Compute the sign
  host_integer_radix_difference_check_kb<Torus>(

@@ -1,11 +1,12 @@
#include "compression.cuh"

void scratch_cuda_integer_compress_radix_ciphertext_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
    uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
    PBS_TYPE pbs_type, uint32_t lwe_per_glwe, uint32_t storage_log_modulus,
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, uint32_t compression_glwe_dimension,
    uint32_t compression_polynomial_size, uint32_t lwe_dimension,
    uint32_t ks_level, uint32_t ks_base_log, uint32_t num_radix_blocks,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
    uint32_t lwe_per_glwe, uint32_t storage_log_modulus,
    bool allocate_gpu_memory) {

  int_radix_params compression_params(
@@ -21,12 +22,13 @@ void scratch_cuda_integer_compress_radix_ciphertext_64(
      allocate_gpu_memory);
}
void scratch_cuda_integer_decompress_radix_ciphertext_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
    uint32_t encryption_glwe_dimension, uint32_t encryption_polynomial_size,
    uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
    uint32_t lwe_dimension, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
    PBS_TYPE pbs_type, uint32_t storage_log_modulus, uint32_t body_count,
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, uint32_t encryption_glwe_dimension,
    uint32_t encryption_polynomial_size, uint32_t compression_glwe_dimension,
    uint32_t compression_polynomial_size, uint32_t lwe_dimension,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t num_radix_blocks,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
    uint32_t storage_log_modulus, uint32_t body_count,
    bool allocate_gpu_memory) {

  // Decompression doesn't keyswitch, so big and small dimensions are the same
@@ -47,32 +49,31 @@ void scratch_cuda_integer_decompress_radix_ciphertext_64(
      allocate_gpu_memory);
}
void cuda_integer_compress_radix_ciphertext_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    void *glwe_array_out, void *lwe_array_in, void **fp_ksk, uint32_t num_nths,
    int8_t *mem_ptr) {
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    void *glwe_array_out, void const *lwe_array_in, void *const *fp_ksk,
    uint32_t num_nths, int8_t *mem_ptr) {

  host_integer_compress<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      static_cast<uint64_t *>(glwe_array_out),
      static_cast<uint64_t *>(lwe_array_in), (uint64_t **)(fp_ksk), num_nths,
      (int_compression<uint64_t> *)mem_ptr);
      static_cast<const uint64_t *>(lwe_array_in), (uint64_t *const *)(fp_ksk),
      num_nths, (int_compression<uint64_t> *)mem_ptr);
}
void cuda_integer_decompress_radix_ciphertext_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    void *lwe_array_out, void *glwe_in, uint32_t *indexes_array,
    uint32_t indexes_array_size, void **bsks, int8_t *mem_ptr) {
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    void *lwe_array_out, void const *glwe_in, uint32_t const *indexes_array,
    uint32_t indexes_array_size, void *const *bsks, int8_t *mem_ptr) {

  host_integer_decompress<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      static_cast<uint64_t *>(lwe_array_out), static_cast<uint64_t *>(glwe_in),
      indexes_array, indexes_array_size, bsks,
      (int_decompression<uint64_t> *)mem_ptr);
      static_cast<uint64_t *>(lwe_array_out),
      static_cast<const uint64_t *>(glwe_in), indexes_array, indexes_array_size,
      bsks, (int_decompression<uint64_t> *)mem_ptr);
}

void cleanup_cuda_integer_compress_radix_ciphertext_64(void **streams,
                                                       uint32_t *gpu_indexes,
                                                       uint32_t gpu_count,
                                                       int8_t **mem_ptr_void) {
void cleanup_cuda_integer_compress_radix_ciphertext_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr_void) {

  int_compression<uint64_t> *mem_ptr =
      (int_compression<uint64_t> *)(*mem_ptr_void);
@@ -80,7 +81,7 @@ void cleanup_cuda_integer_compress_radix_ciphertext_64(void **streams,
}

void cleanup_cuda_integer_decompress_radix_ciphertext_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr_void) {

  int_decompression<uint64_t> *mem_ptr =

@@ -2,9 +2,10 @@
#define CUDA_INTEGER_COMPRESSION_CUH

#include "ciphertext.h"
#include "compression.h"
#include "crypto/keyswitch.cuh"
#include "device.h"
#include "integer/compression/compression.h"
#include "integer/compression/compression_utilities.h"
#include "integer/integer.cuh"
#include "linearalgebra/multiplication.cuh"
#include "polynomial/functions.cuh"
@@ -77,11 +78,12 @@ __host__ void host_pack(cudaStream_t stream, uint32_t gpu_index,
}

template <typename Torus>
__host__ void host_integer_compress(cudaStream_t *streams,
                                    uint32_t *gpu_indexes, uint32_t gpu_count,
                                    Torus *glwe_array_out, Torus *lwe_array_in,
                                    Torus **fp_ksk, uint32_t num_radix_blocks,
                                    int_compression<Torus> *mem_ptr) {
__host__ void
host_integer_compress(cudaStream_t const *streams, uint32_t const *gpu_indexes,
                      uint32_t gpu_count, Torus *glwe_array_out,
                      Torus const *lwe_array_in, Torus *const *fp_ksk,
                      uint32_t num_radix_blocks,
                      int_compression<Torus> *mem_ptr) {

  auto compression_params = mem_ptr->compression_params;
  auto input_lwe_dimension = compression_params.small_lwe_dimension;
@@ -138,9 +140,9 @@ __host__ void host_integer_compress(cudaStream_t *streams,
}

template <typename Torus>
__global__ void extract(Torus *glwe_array_out, Torus *array_in, uint32_t index,
                        uint32_t log_modulus, uint32_t input_len,
                        uint32_t initial_out_len) {
__global__ void extract(Torus *glwe_array_out, Torus const *array_in,
                        uint32_t index, uint32_t log_modulus,
                        uint32_t input_len, uint32_t initial_out_len) {
  auto nbits = sizeof(Torus) * 8;

  auto i = threadIdx.x + blockIdx.x * blockDim.x;
@@ -176,7 +178,7 @@ __global__ void extract(Torus *glwe_array_out, Torus *array_in, uint32_t index,
/// Extracts the glwe_index-nth GLWE ciphertext
template <typename Torus>
__host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,
                           Torus *glwe_array_out, Torus *array_in,
                           Torus *glwe_array_out, Torus const *array_in,
                           uint32_t glwe_index,
                           int_decompression<Torus> *mem_ptr) {
  if (array_in == glwe_array_out)
@@ -219,15 +221,14 @@ __host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,
}

template <typename Torus>
__host__ void
host_integer_decompress(cudaStream_t *streams, uint32_t *gpu_indexes,
                        uint32_t gpu_count, Torus *d_lwe_array_out,
                        Torus *d_packed_glwe_in, uint32_t *h_indexes_array,
                        uint32_t indexes_array_size, void **d_bsks,
                        int_decompression<Torus> *h_mem_ptr) {
__host__ void host_integer_decompress(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, Torus *d_lwe_array_out, Torus const *d_packed_glwe_in,
    uint32_t const *h_indexes_array, uint32_t indexes_array_size,
    void *const *d_bsks, int_decompression<Torus> *h_mem_ptr) {

  auto d_indexes_array = h_mem_ptr->tmp_indexes_array;
  cuda_memcpy_async_to_gpu(d_indexes_array, h_indexes_array,
  cuda_memcpy_async_to_gpu(d_indexes_array, (void *)h_indexes_array,
                           indexes_array_size * sizeof(uint32_t), streams[0],
                           gpu_indexes[0]);
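  // Illustrative note: the (void *) cast above is only there because
  // cuda_memcpy_async_to_gpu still takes a non-const src (its signature is
  // visible, unchanged, in the memcpy hunk earlier); the host-side indexes
  // are read, never written.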

@@ -355,10 +356,11 @@ host_integer_decompress(cudaStream_t *streams, uint32_t *gpu_indexes,

template <typename Torus>
__host__ void scratch_cuda_compress_integer_radix_ciphertext(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    int_compression<Torus> **mem_ptr, uint32_t num_radix_blocks,
    int_radix_params compression_params, uint32_t lwe_per_glwe,
    uint32_t storage_log_modulus, bool allocate_gpu_memory) {
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, int_compression<Torus> **mem_ptr,
    uint32_t num_radix_blocks, int_radix_params compression_params,
    uint32_t lwe_per_glwe, uint32_t storage_log_modulus,
    bool allocate_gpu_memory) {

  *mem_ptr = new int_compression<Torus>(
      streams, gpu_indexes, gpu_count, compression_params, num_radix_blocks,
@@ -367,11 +369,11 @@ __host__ void scratch_cuda_compress_integer_radix_ciphertext(

template <typename Torus>
__host__ void scratch_cuda_integer_decompress_radix_ciphertext(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    int_decompression<Torus> **mem_ptr, uint32_t num_radix_blocks,
    uint32_t body_count, int_radix_params encryption_params,
    int_radix_params compression_params, uint32_t storage_log_modulus,
    bool allocate_gpu_memory) {
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, int_decompression<Torus> **mem_ptr,
    uint32_t num_radix_blocks, uint32_t body_count,
    int_radix_params encryption_params, int_radix_params compression_params,
    uint32_t storage_log_modulus, bool allocate_gpu_memory) {

  *mem_ptr = new int_decompression<Torus>(
      streams, gpu_indexes, gpu_count, encryption_params, compression_params,

@@ -1,8 +1,8 @@
#include "integer/div_rem.cuh"

void scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
    uint32_t glwe_dimension, uint32_t polynomial_size,
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
@@ -20,20 +20,23 @@ void scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
}

void cuda_integer_div_rem_radix_ciphertext_kb_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *quotient,
    void *remainder, void *numerator, void *divisor, int8_t *mem_ptr,
    void **bsks, void **ksks, uint32_t num_blocks) {
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    void *quotient, void *remainder, void const *numerator, void const *divisor,
    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
    uint32_t num_blocks) {

  auto mem = (int_div_rem_memory<uint64_t> *)mem_ptr;

  host_integer_div_rem_kb<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      static_cast<uint64_t *>(quotient), static_cast<uint64_t *>(remainder),
      static_cast<uint64_t *>(numerator), static_cast<uint64_t *>(divisor),
      bsks, (uint64_t **)(ksks), mem, num_blocks);
      static_cast<const uint64_t *>(numerator),
      static_cast<const uint64_t *>(divisor), bsks, (uint64_t **)(ksks), mem,
      num_blocks);
}

void cleanup_cuda_integer_div_rem(void **streams, uint32_t *gpu_indexes,
void cleanup_cuda_integer_div_rem(void *const *streams,
                                  uint32_t const *gpu_indexes,
                                  uint32_t gpu_count, int8_t **mem_ptr_void) {
  int_div_rem_memory<uint64_t> *mem_ptr =
      (int_div_rem_memory<uint64_t> *)(*mem_ptr_void);

@@ -3,13 +3,13 @@

#include "crypto/keyswitch.cuh"
#include "device.h"
#include "integer.h"
#include "integer/comparison.cuh"
#include "integer/integer.cuh"
#include "integer/integer_utilities.h"
#include "integer/negation.cuh"
#include "integer/scalar_shifts.cuh"
#include "linear_algebra.h"
#include "programmable_bootstrap.h"
#include "pbs/programmable_bootstrap.h"
#include "utils/helper.cuh"
#include "utils/kernel_dimensions.cuh"
#include <fstream>
@@ -160,21 +160,23 @@ template <typename Torus> struct lwe_ciphertext_list {

template <typename Torus>
__host__ void scratch_cuda_integer_div_rem_kb(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    int_div_rem_memory<Torus> **mem_ptr, uint32_t num_blocks,
    int_radix_params params, bool allocate_gpu_memory) {
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, int_div_rem_memory<Torus> **mem_ptr,
    uint32_t num_blocks, int_radix_params params, bool allocate_gpu_memory) {

  *mem_ptr = new int_div_rem_memory<Torus>(
      streams, gpu_indexes, gpu_count, params, num_blocks, allocate_gpu_memory);
}

template <typename Torus>
__host__ void
host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
                        uint32_t gpu_count, Torus *quotient, Torus *remainder,
                        Torus *numerator, Torus *divisor, void **bsks,
                        uint64_t **ksks, int_div_rem_memory<uint64_t> *mem_ptr,
                        uint32_t num_blocks) {
__host__ void host_integer_div_rem_kb(cudaStream_t const *streams,
                                      uint32_t const *gpu_indexes,
                                      uint32_t gpu_count, Torus *quotient,
                                      Torus *remainder, Torus const *numerator,
                                      Torus const *divisor, void *const *bsks,
                                      uint64_t *const *ksks,
                                      int_div_rem_memory<uint64_t> *mem_ptr,
                                      uint32_t num_blocks) {

  auto radix_params = mem_ptr->params;

@@ -222,8 +224,8 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
  lwe_ciphertext_list<Torus> cleaned_merged_interesting_remainder(
      mem_ptr->cleaned_merged_interesting_remainder, radix_params, num_blocks);

  numerator_block_stack.clone_from(numerator, 0, num_blocks - 1, streams[0],
                                   gpu_indexes[0]);
  numerator_block_stack.clone_from((Torus *)numerator, 0, num_blocks - 1,
                                   streams[0], gpu_indexes[0]);
  remainder1.assign_zero(0, num_blocks - 1, streams[0], gpu_indexes[0]);
  remainder2.assign_zero(0, num_blocks - 1, streams[0], gpu_indexes[0]);

@@ -245,9 +247,9 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
                                    streams[0], gpu_indexes[0]);
  interesting_remainder2.clone_from(remainder2, 0, last_non_trivial_block,
                                    streams[0], gpu_indexes[0]);
  interesting_divisor.clone_from(divisor, 0, last_non_trivial_block,
  interesting_divisor.clone_from((Torus *)divisor, 0, last_non_trivial_block,
                                 streams[0], gpu_indexes[0]);
  divisor_ms_blocks.clone_from(divisor,
  divisor_ms_blocks.clone_from((Torus *)divisor,
                               (msb_bit_set + 1) / num_bits_in_message,
                               num_blocks - 1, streams[0], gpu_indexes[0]);

@@ -256,65 +258,67 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
  // msb_bit_set) the split versions share some bits they should not. So we do
  // one PBS on the last block of the interesting_divisor, and first block of
  // divisor_ms_blocks to trim out bits which should not be there
  auto trim_last_interesting_divisor_bits =
      [&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
        if ((msb_bit_set + 1) % num_bits_in_message == 0) {
          return;
        }
        // The last block of the interesting part of the remainder
        // can contain bits which we should not account for
        // we have to zero them out.
  auto trim_last_interesting_divisor_bits = [&](cudaStream_t const *streams,
                                                uint32_t const *gpu_indexes,
                                                uint32_t gpu_count) {
    if ((msb_bit_set + 1) % num_bits_in_message == 0) {
      return;
    }
    // The last block of the interesting part of the remainder
    // can contain bits which we should not account for
    // we have to zero them out.

        // Where the msb is set in the block
        uint32_t pos_in_block = msb_bit_set % num_bits_in_message;
    // Where the msb is set in the block
    uint32_t pos_in_block = msb_bit_set % num_bits_in_message;

        // e.g 2 bits in message:
        // if pos_in_block is 0, then we want to keep only first bit (right
        // shift
        // mask by 1) if pos_in_block is 1, then we want to keep the two
        // bits
        // (right shift mask by 0)
        uint32_t shift_amount = num_bits_in_message - (pos_in_block + 1);
    // e.g 2 bits in message:
    // if pos_in_block is 0, then we want to keep only first bit (right
    // shift
    // mask by 1) if pos_in_block is 1, then we want to keep the two
    // bits
    // (right shift mask by 0)
    uint32_t shift_amount = num_bits_in_message - (pos_in_block + 1);

        // Create mask of 1s on the message part, 0s in the carries
        uint32_t full_message_mask = message_modulus - 1;
    // Create mask of 1s on the message part, 0s in the carries
    uint32_t full_message_mask = message_modulus - 1;

        // Shift the mask so that we will only keep bits we should
        uint32_t shifted_mask = full_message_mask >> shift_amount;
    // Shift the mask so that we will only keep bits we should
    uint32_t shifted_mask = full_message_mask >> shift_amount;
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, interesting_divisor.last_block(),
|
||||
interesting_divisor.last_block(), bsks, ksks, 1,
|
||||
mem_ptr->masking_luts_1[shifted_mask]);
|
||||
}; // trim_last_interesting_divisor_bits
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, interesting_divisor.last_block(),
|
||||
interesting_divisor.last_block(), bsks, ksks, 1,
|
||||
mem_ptr->masking_luts_1[shifted_mask]);
|
||||
}; // trim_last_interesting_divisor_bits
|
||||
|
||||
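The mask arithmetic fed into masking_luts_1 above is plain integer logic; a standalone sketch (the helper name is hypothetical) with the 2-bit example from the comments:

#include <assert.h>
#include <stdint.h>

// Keep only the divisor bits that belong to the last interesting block.
static uint32_t last_block_mask(uint32_t msb_bit_set,
                                uint32_t num_bits_in_message,
                                uint32_t message_modulus) {
  uint32_t pos_in_block = msb_bit_set % num_bits_in_message;
  uint32_t shift_amount = num_bits_in_message - (pos_in_block + 1);
  uint32_t full_message_mask = message_modulus - 1;
  return full_message_mask >> shift_amount;
}

int main(void) {
  // 2 bits in message (message_modulus = 4):
  assert(last_block_mask(0, 2, 4) == 0x1); // keep only the first bit
  assert(last_block_mask(1, 2, 4) == 0x3); // keep both bits
  return 0;
}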
auto trim_first_divisor_ms_bits =
[&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
if (divisor_ms_blocks.is_empty() ||
((msb_bit_set + 1) % num_bits_in_message) == 0) {
return;
}
// Where the msb is set in the block
uint32_t pos_in_block = msb_bit_set % num_bits_in_message;
auto trim_first_divisor_ms_bits = [&](cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count) {
if (divisor_ms_blocks.is_empty() ||
((msb_bit_set + 1) % num_bits_in_message) == 0) {
return;
}
// Where the msb is set in the block
uint32_t pos_in_block = msb_bit_set % num_bits_in_message;

// e.g 2 bits in message:
// if pos_in_block is 0, then we want to discard the first bit (left
// shift mask by 1) if pos_in_block is 1, then we want to discard the
// two bits (left shift mask by 2) let shift_amount =
// num_bits_in_message - pos_in_block
uint32_t shift_amount = pos_in_block + 1;
uint32_t full_message_mask = message_modulus - 1;
uint32_t shifted_mask = full_message_mask << shift_amount;
// e.g 2 bits in message:
// if pos_in_block is 0, then we want to discard the first bit (left
// shift mask by 1) if pos_in_block is 1, then we want to discard the
// two bits (left shift mask by 2) let shift_amount =
// num_bits_in_message - pos_in_block
uint32_t shift_amount = pos_in_block + 1;
uint32_t full_message_mask = message_modulus - 1;
uint32_t shifted_mask = full_message_mask << shift_amount;

// Keep the mask within the range of message bits, so that
// the estimated degree of the output is < msg_modulus
shifted_mask = shifted_mask & full_message_mask;
// Keep the mask within the range of message bits, so that
// the estimated degree of the output is < msg_modulus
shifted_mask = shifted_mask & full_message_mask;

integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, divisor_ms_blocks.first_block(),
divisor_ms_blocks.first_block(), bsks, ksks, 1,
mem_ptr->masking_luts_2[shifted_mask]);
}; // trim_first_divisor_ms_bits
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, divisor_ms_blocks.first_block(),
divisor_ms_blocks.first_block(), bsks, ksks, 1,
mem_ptr->masking_luts_2[shifted_mask]);
}; // trim_first_divisor_ms_bits

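The companion mask for masking_luts_2 discards the low bits instead of keeping them, then is clamped back into the message range so the tracked output degree stays below the message modulus; the same kind of standalone sketch:

#include <assert.h>
#include <stdint.h>

// Discard the divisor bits already covered by the interesting blocks.
static uint32_t first_ms_block_mask(uint32_t msb_bit_set,
                                    uint32_t num_bits_in_message,
                                    uint32_t message_modulus) {
  uint32_t pos_in_block = msb_bit_set % num_bits_in_message;
  uint32_t shift_amount = pos_in_block + 1;
  uint32_t full_message_mask = message_modulus - 1;
  uint32_t shifted_mask = full_message_mask << shift_amount;
  return shifted_mask & full_message_mask; // clamp to the message bits
}

int main(void) {
  assert(first_ms_block_mask(0, 2, 4) == 0x2); // drop the first bit
  assert(first_ms_block_mask(1, 2, 4) == 0x0); // drop both bits
  return 0;
}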
// This does
// R := R << 1; R(0) := N(i)
@@ -325,48 +329,50 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
// However, to keep the remainder clean (noise wise), what we do is that we
// put the remainder block from which we need to extract the bit, as the LSB
// of the Remainder, so that left shifting will pull the bit we need.
auto left_shift_interesting_remainder1 =
[&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
numerator_block_1.clone_from(
numerator_block_stack, numerator_block_stack.len - 1,
numerator_block_stack.len - 1, streams[0], gpu_indexes[0]);
numerator_block_stack.pop();
interesting_remainder1.insert(0, numerator_block_1.first_block(),
streams[0], gpu_indexes[0]);
auto left_shift_interesting_remainder1 = [&](cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count) {
numerator_block_1.clone_from(
numerator_block_stack, numerator_block_stack.len - 1,
numerator_block_stack.len - 1, streams[0], gpu_indexes[0]);
numerator_block_stack.pop();
interesting_remainder1.insert(0, numerator_block_1.first_block(),
streams[0], gpu_indexes[0]);

host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, interesting_remainder1.data, 1,
mem_ptr->shift_mem_1, bsks, ksks, interesting_remainder1.len);
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, interesting_remainder1.data, 1,
mem_ptr->shift_mem_1, bsks, ksks, interesting_remainder1.len);

tmp_radix.clone_from(interesting_remainder1, 0,
interesting_remainder1.len - 1, streams[0],
gpu_indexes[0]);
tmp_radix.clone_from(interesting_remainder1, 0,
interesting_remainder1.len - 1, streams[0],
gpu_indexes[0]);

host_radix_blocks_rotate_left<Torus>(
streams, gpu_indexes, gpu_count, interesting_remainder1.data,
tmp_radix.data, 1, interesting_remainder1.len, big_lwe_size);
host_radix_blocks_rotate_left<Torus>(
streams, gpu_indexes, gpu_count, interesting_remainder1.data,
tmp_radix.data, 1, interesting_remainder1.len, big_lwe_size);

numerator_block_1.clone_from(
interesting_remainder1, interesting_remainder1.len - 1,
interesting_remainder1.len - 1, streams[0], gpu_indexes[0]);
numerator_block_1.clone_from(
interesting_remainder1, interesting_remainder1.len - 1,
interesting_remainder1.len - 1, streams[0], gpu_indexes[0]);

interesting_remainder1.pop();
interesting_remainder1.pop();

if (pos_in_block != 0) {
// We have not yet extracted all the bits from this numerator
// so, we put it back on the front so that it gets taken next
// iteration
numerator_block_stack.push(numerator_block_1.first_block(),
streams[0], gpu_indexes[0]);
}
}; // left_shift_interesting_remainder1
if (pos_in_block != 0) {
// We have not yet extracted all the bits from this numerator
// so, we put it back on the front so that it gets taken next
// iteration
numerator_block_stack.push(numerator_block_1.first_block(), streams[0],
gpu_indexes[0]);
}
}; // left_shift_interesting_remainder1

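The lambda above realizes R := R << 1; R(0) := N(i) with block operations only: the next numerator block is inserted as a temporary LSB block, the whole radix is bit-shifted left once, the inserted block is rotated out again, and it is pushed back onto the stack if it still holds unextracted bits. A plaintext model of that choreography on base-2^b digits (a sketch of the block bookkeeping only, not of the encrypted shift):

#include <cstdint>
#include <vector>

static void shift_in_one_numerator_bit(std::vector<uint32_t> &remainder,
                                       std::vector<uint32_t> &numerator_stack,
                                       uint32_t bits_per_block,
                                       uint32_t pos_in_block) {
  const uint32_t block_mask = (1u << bits_per_block) - 1;

  // insert(0, ...): the numerator block becomes a temporary LSB block
  remainder.insert(remainder.begin(), numerator_stack.back());
  numerator_stack.pop_back();

  // logical_scalar_shift(..., 1): shift the whole radix left by one bit
  uint32_t carry = 0;
  for (auto &digit : remainder) {
    uint32_t shifted = (digit << 1) | carry;
    carry = shifted >> bits_per_block;
    digit = shifted & block_mask;
  }

  // rotate_left + pop: the temporary block leaves again, one bit lighter
  uint32_t spent_block = remainder.front();
  remainder.erase(remainder.begin());

  if (pos_in_block != 0) // bits remain, so it is taken again next iteration
    numerator_stack.push_back(spent_block);
}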
auto left_shift_interesting_remainder2 =
[&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, interesting_remainder2.data, 1,
mem_ptr->shift_mem_2, bsks, ksks, interesting_remainder2.len);
}; // left_shift_interesting_remainder2
auto left_shift_interesting_remainder2 = [&](cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count) {
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, interesting_remainder2.data, 1,
mem_ptr->shift_mem_2, bsks, ksks, interesting_remainder2.len);
}; // left_shift_interesting_remainder2

for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
@@ -416,7 +422,8 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
// fills:
// `new_remainder` - radix ciphertext
// `subtraction_overflowed` - single ciphertext
auto do_overflowing_sub = [&](cudaStream_t *streams, uint32_t *gpu_indexes,
auto do_overflowing_sub = [&](cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count) {
host_integer_overflowing_sub_kb<Torus>(
streams, gpu_indexes, gpu_count, new_remainder.data,
@@ -427,8 +434,8 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,

// fills:
// `at_least_one_upper_block_is_non_zero` - single ciphertext
auto check_divisor_upper_blocks = [&](cudaStream_t *streams,
uint32_t *gpu_indexes,
auto check_divisor_upper_blocks = [&](cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count) {
auto &trivial_blocks = divisor_ms_blocks;
if (trivial_blocks.is_empty()) {
@@ -459,7 +466,8 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
// fills:
// `cleaned_merged_interesting_remainder` - radix ciphertext
auto create_clean_version_of_merged_remainder =
[&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
[&](cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count,
cleaned_merged_interesting_remainder.data,
@@ -498,7 +506,8 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
streams[0], gpu_indexes[0]);

auto conditionally_zero_out_merged_interesting_remainder =
[&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
[&](cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count,
cleaned_merged_interesting_remainder.data,
@@ -510,7 +519,8 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
};

auto conditionally_zero_out_merged_new_remainder =
[&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
[&](cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, new_remainder.data,
new_remainder.data, overflow_sum_radix.data, bsks, ksks,
@@ -518,7 +528,8 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
mem_ptr->zero_out_if_overflow_happened[factor_lut_id], factor);
};

auto set_quotient_bit = [&](cudaStream_t *streams, uint32_t *gpu_indexes,
auto set_quotient_bit = [&](cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count) {
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, did_not_overflow.data,

@@ -1,10 +1,11 @@
#include "integer/integer.cuh"
#include <linear_algebra.h>

void cuda_full_propagation_64_inplace(void **streams, uint32_t *gpu_indexes,
void cuda_full_propagation_64_inplace(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count, void *input_blocks,
int8_t *mem_ptr, void **ksks, void **bsks,
uint32_t num_blocks) {
int8_t *mem_ptr, void *const *ksks,
void *const *bsks, uint32_t num_blocks) {

int_fullprop_buffer<uint64_t> *buffer =
(int_fullprop_buffer<uint64_t> *)mem_ptr;
@@ -16,11 +17,12 @@ void cuda_full_propagation_64_inplace(void **streams, uint32_t *gpu_indexes,
}

void scratch_cuda_full_propagation_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory) {
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
glwe_dimension * polynomial_size, lwe_dimension,
ks_level, ks_base_log, pbs_level, pbs_base_log,
@@ -31,7 +33,8 @@ void scratch_cuda_full_propagation_64(
(int_fullprop_buffer<uint64_t> **)mem_ptr, params, allocate_gpu_memory);
}

void cleanup_cuda_full_propagation(void **streams, uint32_t *gpu_indexes,
void cleanup_cuda_full_propagation(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count, int8_t **mem_ptr_void) {

int_fullprop_buffer<uint64_t> *mem_ptr =
@@ -41,8 +44,8 @@ void cleanup_cuda_full_propagation(void **streams, uint32_t *gpu_indexes,
}

void scratch_cuda_propagate_single_carry_kb_64_inplace(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t glwe_dimension, uint32_t polynomial_size,
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
@@ -60,9 +63,9 @@ void scratch_cuda_propagate_single_carry_kb_64_inplace(
}

void cuda_propagate_single_carry_kb_64_inplace(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
void *carry_out, int8_t *mem_ptr, void **bsks, void **ksks,
uint32_t num_blocks) {
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array, void *carry_out, int8_t *mem_ptr, void *const *bsks,
void *const *ksks, uint32_t num_blocks) {
host_propagate_single_carry<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array), static_cast<uint64_t *>(carry_out),
@@ -71,9 +74,9 @@ void cuda_propagate_single_carry_kb_64_inplace(
}

void cuda_propagate_single_carry_get_input_carries_kb_64_inplace(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
void *carry_out, void *input_carries, int8_t *mem_ptr, void **bsks,
void **ksks, uint32_t num_blocks) {
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array, void *carry_out, void *input_carries, int8_t *mem_ptr,
void *const *bsks, void *const *ksks, uint32_t num_blocks) {
host_propagate_single_carry<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array), static_cast<uint64_t *>(carry_out),
@@ -82,7 +85,8 @@ void cuda_propagate_single_carry_get_input_carries_kb_64_inplace(
num_blocks);
}

void cleanup_cuda_propagate_single_carry(void **streams, uint32_t *gpu_indexes,
void cleanup_cuda_propagate_single_carry(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void) {
int_sc_prop_memory<uint64_t> *mem_ptr =
@@ -91,12 +95,13 @@ void cleanup_cuda_propagate_single_carry(void **streams, uint32_t *gpu_indexes,
}

void scratch_cuda_apply_univariate_lut_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
void *input_lut, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, bool allocate_gpu_memory) {
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_radix_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory) {

int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
glwe_dimension * polynomial_size, lwe_dimension,
@@ -105,26 +110,28 @@ void scratch_cuda_apply_univariate_lut_kb_64(

scratch_cuda_apply_univariate_lut_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_radix_lut<uint64_t> **)mem_ptr, static_cast<uint64_t *>(input_lut),
num_radix_blocks, params, allocate_gpu_memory);
(int_radix_lut<uint64_t> **)mem_ptr,
static_cast<const uint64_t *>(input_lut), num_radix_blocks, params,
allocate_gpu_memory);
}

void cuda_apply_univariate_lut_kb_64(void **streams, uint32_t *gpu_indexes,
void cuda_apply_univariate_lut_kb_64(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count, void *output_radix_lwe,
void *input_radix_lwe, int8_t *mem_ptr,
void **ksks, void **bsks,
uint32_t num_blocks) {
void const *input_radix_lwe,
int8_t *mem_ptr, void *const *ksks,
void *const *bsks, uint32_t num_blocks) {

host_apply_univariate_lut_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(output_radix_lwe),
static_cast<uint64_t *>(input_radix_lwe),
static_cast<const uint64_t *>(input_radix_lwe),
(int_radix_lut<uint64_t> *)mem_ptr, (uint64_t **)(ksks), bsks,
num_blocks);
}

void cleanup_cuda_apply_univariate_lut_kb_64(void **streams,
uint32_t *gpu_indexes,
void cleanup_cuda_apply_univariate_lut_kb_64(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void) {
int_radix_lut<uint64_t> *mem_ptr = (int_radix_lut<uint64_t> *)(*mem_ptr_void);
@@ -132,25 +139,27 @@ void cleanup_cuda_apply_univariate_lut_kb_64(void **streams,
}

void cuda_apply_many_univariate_lut_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *output_radix_lwe, void *input_radix_lwe, int8_t *mem_ptr, void **ksks,
void **bsks, uint32_t num_blocks, uint32_t lut_count, uint32_t lut_stride) {
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *output_radix_lwe, void const *input_radix_lwe, int8_t *mem_ptr,
void *const *ksks, void *const *bsks, uint32_t num_blocks,
uint32_t lut_count, uint32_t lut_stride) {

host_apply_many_univariate_lut_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(output_radix_lwe),
static_cast<uint64_t *>(input_radix_lwe),
static_cast<const uint64_t *>(input_radix_lwe),
(int_radix_lut<uint64_t> *)mem_ptr, (uint64_t **)(ksks), bsks, num_blocks,
lut_count, lut_stride);
}

void scratch_cuda_apply_bivariate_lut_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
void *input_lut, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, bool allocate_gpu_memory) {
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, void *input_lut, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_radix_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory) {

int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
glwe_dimension * polynomial_size, lwe_dimension,
@@ -163,24 +172,23 @@ void scratch_cuda_apply_bivariate_lut_kb_64(
num_radix_blocks, params, allocate_gpu_memory);
}

void cuda_apply_bivariate_lut_kb_64(void **streams, uint32_t *gpu_indexes,
uint32_t gpu_count, void *output_radix_lwe,
void *input_radix_lwe_1,
void *input_radix_lwe_2, int8_t *mem_ptr,
void **ksks, void **bsks,
uint32_t num_blocks, uint32_t shift) {
void cuda_apply_bivariate_lut_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *output_radix_lwe, void const *input_radix_lwe_1,
void const *input_radix_lwe_2, int8_t *mem_ptr, void *const *ksks,
void *const *bsks, uint32_t num_blocks, uint32_t shift) {

host_apply_bivariate_lut_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(output_radix_lwe),
static_cast<uint64_t *>(input_radix_lwe_1),
static_cast<uint64_t *>(input_radix_lwe_2),
static_cast<const uint64_t *>(input_radix_lwe_1),
static_cast<const uint64_t *>(input_radix_lwe_2),
(int_radix_lut<uint64_t> *)mem_ptr, (uint64_t **)(ksks), bsks, num_blocks,
shift);
}

void cleanup_cuda_apply_bivariate_lut_kb_64(void **streams,
uint32_t *gpu_indexes,
void cleanup_cuda_apply_bivariate_lut_kb_64(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void) {
int_radix_lut<uint64_t> *mem_ptr = (int_radix_lut<uint64_t> *)(*mem_ptr_void);
@@ -188,12 +196,13 @@ void cleanup_cuda_apply_bivariate_lut_kb_64(void **streams,
}

void scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
void *input_lut, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, bool allocate_gpu_memory) {
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_radix_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory) {

int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
glwe_dimension * polynomial_size, lwe_dimension,
@@ -202,14 +211,15 @@ void scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(

scratch_cuda_apply_bivariate_lut_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_radix_lut<uint64_t> **)mem_ptr, static_cast<uint64_t *>(input_lut),
num_radix_blocks, params, allocate_gpu_memory);
(int_radix_lut<uint64_t> **)mem_ptr,
static_cast<const uint64_t *>(input_lut), num_radix_blocks, params,
allocate_gpu_memory);
}

void cuda_integer_compute_prefix_sum_hillis_steele_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *output_radix_lwe, void *generates_or_propagates, int8_t *mem_ptr,
void **ksks, void **bsks, uint32_t num_blocks, uint32_t shift) {
void *const *ksks, void *const *bsks, uint32_t num_blocks, uint32_t shift) {

int_radix_params params = ((int_radix_lut<uint64_t> *)mem_ptr)->params;

@@ -222,14 +232,14 @@ void cuda_integer_compute_prefix_sum_hillis_steele_64(
}

void cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void) {
int_radix_lut<uint64_t> *mem_ptr = (int_radix_lut<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
}

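The prefix-sum entry points above implement a Hillis-Steele inclusive scan over the radix blocks; in the encrypted version each combine step is the bivariate LUT configured by the scratch function. The plaintext shape of the scan, for reference (`combine` is a stand-in for whatever associative operator the LUT encodes):

#include <cstddef>
#include <functional>
#include <vector>

// Hillis-Steele inclusive scan: ceil(log2(n)) rounds, O(n log n) combines,
// O(log n) depth -- a good trade when each combine is an expensive PBS
// but the combines within one round can run in parallel.
template <typename T>
void hillis_steele_scan(std::vector<T> &blocks,
                        const std::function<T(T, T)> &combine) {
  std::vector<T> next(blocks.size());
  for (size_t stride = 1; stride < blocks.size(); stride *= 2) {
    for (size_t i = 0; i < blocks.size(); ++i)
      next[i] = (i >= stride) ? combine(blocks[i - stride], blocks[i])
                              : blocks[i];
    blocks.swap(next);
  }
}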
void cuda_integer_reverse_blocks_64_inplace(void **streams,
uint32_t *gpu_indexes,
void cuda_integer_reverse_blocks_64_inplace(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count, void *lwe_array,
uint32_t num_blocks,
uint32_t lwe_size) {

@@ -4,12 +4,12 @@
#include "crypto/keyswitch.cuh"
#include "device.h"
#include "helper_multi_gpu.h"
#include "integer.h"
#include "integer/integer_utilities.h"
#include "integer/scalar_addition.cuh"
#include "linear_algebra.h"
#include "linearalgebra/addition.cuh"
#include "pbs/programmable_bootstrap.h"
#include "polynomial/functions.cuh"
#include "programmable_bootstrap.h"
#include "utils/helper.cuh"
#include "utils/helper_multi_gpu.cuh"
#include "utils/kernel_dimensions.cuh"
@@ -69,10 +69,10 @@ __global__ void radix_blocks_rotate_left(Torus *dst, Torus *src, uint32_t value,
// one block is responsible to process single lwe ciphertext
template <typename Torus>
__host__ void
host_radix_blocks_rotate_right(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, Torus *dst, Torus *src,
uint32_t value, uint32_t blocks_count,
uint32_t lwe_size) {
host_radix_blocks_rotate_right(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
Torus *dst, Torus *src, uint32_t value,
uint32_t blocks_count, uint32_t lwe_size) {
if (src == dst) {
PANIC("Cuda error (blocks_rotate_right): the source and destination "
"pointers should be different");
@@ -86,10 +86,10 @@ host_radix_blocks_rotate_right(cudaStream_t *streams, uint32_t *gpu_indexes,
// calculation is not inplace, so `dst` and `src` must not be the same
template <typename Torus>
__host__ void
host_radix_blocks_rotate_left(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, Torus *dst, Torus *src,
uint32_t value, uint32_t blocks_count,
uint32_t lwe_size) {
host_radix_blocks_rotate_left(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
Torus *dst, Torus *src, uint32_t value,
uint32_t blocks_count, uint32_t lwe_size) {
if (src == dst) {
PANIC("Cuda error (blocks_rotate_left): the source and destination "
"pointers should be different");
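Both rotate helpers refuse aliased src/dst because each CUDA block copies one LWE ciphertext independently, so an in-place rotation would race. The index arithmetic in a host-side sketch, using one common rotate-left convention (dst[i] = src[(i + value) % n]); the kernels' exact convention is not visible in this hunk:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

// Out-of-place rotate left by `value` blocks; dst and src must not alias,
// mirroring the PANIC guard above.
void rotate_blocks_left(uint64_t *dst, const uint64_t *src, size_t value,
                        size_t blocks_count, size_t lwe_size) {
  for (size_t i = 0; i < blocks_count; ++i) {
    size_t src_block = (i + value) % blocks_count;
    memcpy(dst + i * lwe_size, src + src_block * lwe_size,
           lwe_size * sizeof(uint64_t));
  }
}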
@@ -119,9 +119,9 @@ __global__ void radix_blocks_reverse_lwe_inplace(Torus *src,

template <typename Torus>
__host__ void
host_radix_blocks_reverse_inplace(cudaStream_t *streams, uint32_t *gpu_indexes,
Torus *src, uint32_t blocks_count,
uint32_t lwe_size) {
host_radix_blocks_reverse_inplace(cudaStream_t const *streams,
uint32_t const *gpu_indexes, Torus *src,
uint32_t blocks_count, uint32_t lwe_size) {
cudaSetDevice(gpu_indexes[0]);
int num_blocks = blocks_count / 2, num_threads = 1024;
radix_blocks_reverse_lwe_inplace<Torus>
@@ -131,10 +131,11 @@ host_radix_blocks_reverse_inplace(cudaStream_t *streams, uint32_t *gpu_indexes,
// polynomial_size threads
template <typename Torus>
__global__ void
device_pack_bivariate_blocks(Torus *lwe_array_out, Torus *lwe_indexes_out,
Torus *lwe_array_1, Torus *lwe_array_2,
Torus *lwe_indexes_in, uint32_t lwe_dimension,
uint32_t shift, uint32_t num_blocks) {
device_pack_bivariate_blocks(Torus *lwe_array_out, Torus const *lwe_indexes_out,
Torus const *lwe_array_1, Torus const *lwe_array_2,
Torus const *lwe_indexes_in,
uint32_t lwe_dimension, uint32_t shift,
uint32_t num_blocks) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;

if (tid < num_blocks * (lwe_dimension + 1)) {
@@ -151,13 +152,13 @@ device_pack_bivariate_blocks(Torus *lwe_array_out, Torus *lwe_indexes_out,
* becomes out = m1 * shift + m2
*/
template <typename Torus>
__host__ void pack_bivariate_blocks(cudaStream_t *streams,
uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out,
Torus *lwe_indexes_out, Torus *lwe_array_1,
Torus *lwe_array_2, Torus *lwe_indexes_in,
uint32_t lwe_dimension, uint32_t shift,
uint32_t num_radix_blocks) {
__host__ void
pack_bivariate_blocks(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out,
Torus const *lwe_indexes_out, Torus const *lwe_array_1,
Torus const *lwe_array_2, Torus const *lwe_indexes_in,
uint32_t lwe_dimension, uint32_t shift,
uint32_t num_radix_blocks) {

cudaSetDevice(gpu_indexes[0]);
// Left message is shifted
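As the comment `out = m1 * shift + m2` says, packing turns a two-input function into a one-input LUT: the pair (m1, m2) is fused into a single index. In plain integers (`shift` is typically the message modulus, stated here as an assumption):

#include <assert.h>
#include <stdint.h>

static uint32_t pack_bivariate(uint32_t m1, uint32_t m2, uint32_t shift) {
  return m1 * shift + m2; // unique index as long as m2 < shift
}

int main(void) {
  // 2-bit messages, shift = 4: (m1, m2) = (3, 2) drives LUT entry 14.
  assert(pack_bivariate(3, 2, 4) == 14);
  return 0;
}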
@@ -173,9 +174,10 @@ __host__ void pack_bivariate_blocks(cudaStream_t *streams,

template <typename Torus>
__host__ void integer_radix_apply_univariate_lookup_table_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_in, void **bsks, Torus **ksks,
uint32_t num_radix_blocks, int_radix_lut<Torus> *lut) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_in,
void *const *bsks, Torus *const *ksks, uint32_t num_radix_blocks,
int_radix_lut<Torus> *lut) {
// apply_lookup_table
auto params = lut->params;
auto pbs_type = params.pbs_type;
@@ -202,10 +204,10 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
if (active_gpu_count == 1) {
execute_keyswitch_async<Torus>(streams, gpu_indexes, 1, lwe_after_ks_vec[0],
lwe_trivial_indexes_vec[0], lwe_array_in,
lut->lwe_indexes_in, ksks, big_lwe_dimension,
small_lwe_dimension, ks_base_log, ks_level,
num_radix_blocks);
lwe_trivial_indexes_vec[0],
(Torus *)lwe_array_in, lut->lwe_indexes_in,
ksks, big_lwe_dimension, small_lwe_dimension,
ks_base_log, ks_level, num_radix_blocks);

/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
@@ -259,10 +261,10 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(

template <typename Torus>
__host__ void integer_radix_apply_many_univariate_lookup_table_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_in, void **bsks, Torus **ksks,
uint32_t num_radix_blocks, int_radix_lut<Torus> *lut, uint32_t lut_count,
uint32_t lut_stride) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_in,
void *const *bsks, Torus *const *ksks, uint32_t num_radix_blocks,
int_radix_lut<Torus> *lut, uint32_t lut_count, uint32_t lut_stride) {
// apply_lookup_table
auto params = lut->params;
auto pbs_type = params.pbs_type;
@@ -286,10 +288,10 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb(
auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
if (active_gpu_count == 1) {
execute_keyswitch_async<Torus>(streams, gpu_indexes, 1, lwe_after_ks_vec[0],
lwe_trivial_indexes_vec[0], lwe_array_in,
lut->lwe_indexes_in, ksks, big_lwe_dimension,
small_lwe_dimension, ks_base_log, ks_level,
num_radix_blocks);
lwe_trivial_indexes_vec[0],
(Torus *)lwe_array_in, lut->lwe_indexes_in,
ksks, big_lwe_dimension, small_lwe_dimension,
ks_base_log, ks_level, num_radix_blocks);

/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
@@ -343,10 +345,10 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb(

template <typename Torus>
__host__ void integer_radix_apply_bivariate_lookup_table_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_1, Torus *lwe_array_2, void **bsks,
Torus **ksks, uint32_t num_radix_blocks, int_radix_lut<Torus> *lut,
uint32_t shift) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_1,
Torus const *lwe_array_2, void *const *bsks, Torus *const *ksks,
uint32_t num_radix_blocks, int_radix_lut<Torus> *lut, uint32_t shift) {

auto params = lut->params;
auto pbs_type = params.pbs_type;
@@ -612,9 +614,10 @@ void generate_device_accumulator(cudaStream_t stream, uint32_t gpu_index,

template <typename Torus>
void scratch_cuda_propagate_single_carry_kb_inplace(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
int_sc_prop_memory<Torus> **mem_ptr, uint32_t num_radix_blocks,
int_radix_params params, bool allocate_gpu_memory) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_sc_prop_memory<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params,
bool allocate_gpu_memory) {

*mem_ptr =
new int_sc_prop_memory<Torus>(streams, gpu_indexes, gpu_count, params,
@@ -623,10 +626,10 @@ void scratch_cuda_propagate_single_carry_kb_inplace(

template <typename Torus>
void host_compute_prefix_sum_hillis_steele(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *step_output, Torus *generates_or_propagates, int_radix_params params,
int_radix_lut<Torus> *luts, void **bsks, Torus **ksks,
uint32_t num_blocks) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *step_output, Torus *generates_or_propagates,
int_radix_params params, int_radix_lut<Torus> *luts, void *const *bsks,
Torus *const *ksks, uint32_t num_blocks) {

auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
@@ -659,11 +662,13 @@ void host_compute_prefix_sum_hillis_steele(
}

template <typename Torus>
void host_propagate_single_carry(cudaStream_t *streams, uint32_t *gpu_indexes,
void host_propagate_single_carry(cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array,
Torus *carry_out, Torus *input_carries,
int_sc_prop_memory<Torus> *mem, void **bsks,
Torus **ksks, uint32_t num_blocks) {
int_sc_prop_memory<Torus> *mem,
void *const *bsks, Torus *const *ksks,
uint32_t num_blocks) {
auto params = mem->params;
if (params.message_modulus == 2)
PANIC("Cuda error: single carry propagation is not supported for 1 bit "
@@ -700,7 +705,7 @@ void host_propagate_single_carry(cudaStream_t *streams, uint32_t *gpu_indexes,
gpu_indexes[0]);

if (input_carries != nullptr) {
cuda_memcpy_async_gpu_to_gpu(input_carries, step_output,
cuda_memcpy_async_gpu_to_gpu((void *)input_carries, step_output,
big_lwe_size_bytes * num_blocks, streams[0],
gpu_indexes[0]);
}
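Single-carry propagation pairs the Hillis-Steele scan above with per-block generates_or_propagates flags. A plaintext sketch of that flag logic, inferred from the naming here rather than stated in this hunk:

#include <stdint.h>

enum carry_state { NO_CARRY = 0, GENERATES = 1, PROPAGATES = 2 };

// Does this block create a carry on its own, or merely pass one along?
static enum carry_state block_state(uint32_t block_sum,
                                    uint32_t message_modulus) {
  if (block_sum >= message_modulus)
    return GENERATES; // overflows regardless of lower blocks
  if (block_sum == message_modulus - 1)
    return PROPAGATES; // overflows iff a carry arrives from below
  return NO_CARRY;
}

// Associative merge used by the scan: a propagating block defers to the
// state coming from the less significant side.
static enum carry_state merge(enum carry_state low, enum carry_state high) {
  return (high == PROPAGATES) ? low : high;
}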
@@ -716,10 +721,10 @@ void host_propagate_single_carry(cudaStream_t *streams, uint32_t *gpu_indexes,

template <typename Torus>
void host_generate_last_block_inner_propagation(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *last_block_inner_propagation, Torus *lhs, Torus *rhs,
int_last_block_inner_propagate_memory<Torus> *mem, void **bsks,
Torus **ksks) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *last_block_inner_propagation, Torus const *lhs,
Torus const *rhs, int_last_block_inner_propagate_memory<Torus> *mem,
void *const *bsks, Torus *const *ksks) {

integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, last_block_inner_propagation, lhs, rhs,
@@ -728,11 +733,12 @@ void host_generate_last_block_inner_propagation(
}

template <typename Torus>
void host_propagate_single_sub_borrow(cudaStream_t *streams,
uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *overflowed, Torus *lwe_array,
void host_propagate_single_sub_borrow(cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *overflowed,
Torus *lwe_array,
int_overflowing_sub_memory<Torus> *mem,
void **bsks, Torus **ksks,
void *const *bsks, Torus *const *ksks,
uint32_t num_blocks) {
auto params = mem->params;
auto glwe_dimension = params.glwe_dimension;
@@ -784,10 +790,11 @@ void host_propagate_single_sub_borrow(cudaStream_t *streams,
* have size = 2 * (glwe_dimension * polynomial_size + 1) * sizeof(Torus)
*/
template <typename Torus>
void host_full_propagate_inplace(cudaStream_t *streams, uint32_t *gpu_indexes,
void host_full_propagate_inplace(cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *input_blocks,
int_fullprop_buffer<Torus> *mem_ptr,
Torus **ksks, void **bsks,
Torus *const *ksks, void *const *bsks,
uint32_t num_blocks) {
auto params = mem_ptr->lut->params;

@@ -821,14 +828,14 @@ void host_full_propagate_inplace(cudaStream_t *streams, uint32_t *gpu_indexes,
params.polynomial_size, params.pbs_base_log, params.pbs_level,
params.grouping_factor, 2, params.pbs_type, lut_count, lut_stride);

cuda_memcpy_async_gpu_to_gpu(cur_input_block, mem_ptr->tmp_big_lwe_vector,
big_lwe_size * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_memcpy_async_gpu_to_gpu(
(void *)cur_input_block, mem_ptr->tmp_big_lwe_vector,
big_lwe_size * sizeof(Torus), streams[0], gpu_indexes[0]);

if (i < num_blocks - 1) {
auto next_input_block = &input_blocks[(i + 1) * big_lwe_size];
host_addition<Torus>(streams[0], gpu_indexes[0], next_input_block,
next_input_block,
(Torus const *)next_input_block,
&mem_ptr->tmp_big_lwe_vector[big_lwe_size],
params.big_lwe_dimension, 1);
}
@@ -836,7 +843,8 @@ void host_full_propagate_inplace(cudaStream_t *streams, uint32_t *gpu_indexes,
}

template <typename Torus>
void scratch_cuda_full_propagation(cudaStream_t *streams, uint32_t *gpu_indexes,
void scratch_cuda_full_propagation(cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int_fullprop_buffer<Torus> **mem_ptr,
int_radix_params params,
@@ -849,14 +857,16 @@ void scratch_cuda_full_propagation(cudaStream_t *streams, uint32_t *gpu_indexes,
// (lwe_dimension+1) threads
// (num_radix_blocks / 2) thread blocks
template <typename Torus>
__global__ void device_pack_blocks(Torus *lwe_array_out, Torus *lwe_array_in,
__global__ void device_pack_blocks(Torus *lwe_array_out,
Torus const *lwe_array_in,
uint32_t lwe_dimension,
uint32_t num_radix_blocks, uint32_t factor) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;

if (tid < (lwe_dimension + 1)) {
for (int bid = 0; bid < (num_radix_blocks / 2); bid++) {
Torus *lsb_block = lwe_array_in + (2 * bid) * (lwe_dimension + 1);
Torus *lsb_block =
(Torus *)lwe_array_in + (2 * bid) * (lwe_dimension + 1);
Torus *msb_block = lsb_block + (lwe_dimension + 1);

Torus *packed_block = lwe_array_out + bid * (lwe_dimension + 1);
@@ -867,7 +877,7 @@ __global__ void device_pack_blocks(Torus *lwe_array_out, Torus *lwe_array_in,
if (num_radix_blocks % 2 == 1) {
// We couldn't host_pack the last block, so we just copy it
Torus *lsb_block =
lwe_array_in + (num_radix_blocks - 1) * (lwe_dimension + 1);
(Torus *)lwe_array_in + (num_radix_blocks - 1) * (lwe_dimension + 1);
Torus *last_block =
lwe_array_out + (num_radix_blocks / 2) * (lwe_dimension + 1);

@@ -885,7 +895,7 @@ __global__ void device_pack_blocks(Torus *lwe_array_out, Torus *lwe_array_in,
// Expects the carry buffer to be empty
template <typename Torus>
__host__ void pack_blocks(cudaStream_t stream, uint32_t gpu_index,
Torus *lwe_array_out, Torus *lwe_array_in,
Torus *lwe_array_out, Torus const *lwe_array_in,
uint32_t lwe_dimension, uint32_t num_radix_blocks,
uint32_t factor) {
if (num_radix_blocks == 0)
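device_pack_blocks folds pairs of radix blocks into one ciphertext, which is why it expects the carry buffers to be empty: with clean carries, two message digits fit in one block's message-plus-carry space. The arithmetic, assuming the usual msb * factor + lsb layout with factor equal to the message modulus (an assumption, since the kernel takes factor as a parameter):

#include <assert.h>
#include <stdint.h>

static uint32_t pack_pair(uint32_t lsb, uint32_t msb, uint32_t factor) {
  return msb * factor + lsb;
}

int main(void) {
  // 2-bit messages: (lsb, msb) = (1, 3) packs to 3 * 4 + 1 = 13, which
  // still fits in the 4 bits of message + carry of a single block.
  assert(pack_pair(1, 3, 4) == 13);
  return 0;
}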
@@ -900,7 +910,7 @@ __host__ void pack_blocks(cudaStream_t stream, uint32_t gpu_index,

template <typename Torus>
__global__ void
device_create_trivial_radix(Torus *lwe_array, Torus *scalar_input,
device_create_trivial_radix(Torus *lwe_array, Torus const *scalar_input,
int32_t num_blocks, uint32_t lwe_dimension,
uint64_t delta) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
@@ -915,7 +925,7 @@ device_create_trivial_radix(Torus *lwe_array, Torus *scalar_input,
template <typename Torus>
__host__ void
create_trivial_radix(cudaStream_t stream, uint32_t gpu_index,
Torus *lwe_array_out, Torus *scalar_array,
Torus *lwe_array_out, Torus const *scalar_array,
uint32_t lwe_dimension, uint32_t num_radix_blocks,
uint32_t num_scalar_blocks, uint64_t message_modulus,
uint64_t carry_modulus) {
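A trivial radix ciphertext is noiseless: the mask is zero and the body carries the scalar scaled by delta into the top bits, as device_create_trivial_radix does per block. A host-side sketch; delta = 2^63 / (message_modulus * carry_modulus) matches the usual one-padding-bit TFHE-rs encoding but is an assumption here, since delta arrives precomputed:

#include <stdint.h>

// One trivial (noiseless) LWE block: zero mask, scaled scalar in the body.
static void trivial_lwe_block(uint64_t *lwe, uint32_t lwe_dimension,
                              uint64_t scalar, uint64_t message_modulus,
                              uint64_t carry_modulus) {
  uint64_t delta = (1ULL << 63) / (message_modulus * carry_modulus);
  for (uint32_t i = 0; i < lwe_dimension; ++i)
    lwe[i] = 0;                        // mask part
  lwe[lwe_dimension] = scalar * delta; // body part
}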
@@ -951,9 +961,10 @@ create_trivial_radix(cudaStream_t stream, uint32_t gpu_index,
* * (lwe_dimension+1) * sizeeof(Torus) bytes
*/
template <typename Torus>
__host__ void extract_n_bits(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out,
Torus *lwe_array_in, void **bsks, Torus **ksks,
__host__ void extract_n_bits(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_in,
void *const *bsks, Torus *const *ksks,
uint32_t num_radix_blocks, uint32_t bits_per_block,
int_bit_extract_luts_buffer<Torus> *bit_extract) {

@@ -964,11 +975,11 @@ __host__ void extract_n_bits(cudaStream_t *streams, uint32_t *gpu_indexes,

template <typename Torus>
__host__ void
reduce_signs(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *signs_array_out, Torus *signs_array_in,
reduce_signs(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *signs_array_out, Torus *signs_array_in,
int_comparison_buffer<Torus> *mem_ptr,
std::function<Torus(Torus)> sign_handler_f, void **bsks,
Torus **ksks, uint32_t num_sign_blocks) {
std::function<Torus(Torus)> sign_handler_f, void *const *bsks,
Torus *const *ksks, uint32_t num_sign_blocks) {

auto diff_buffer = mem_ptr->diff_buffer;

@@ -1064,27 +1075,29 @@ reduce_signs(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,

template <typename Torus>
void scratch_cuda_apply_univariate_lut_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
int_radix_lut<Torus> **mem_ptr, Torus *input_lut, uint32_t num_radix_blocks,
int_radix_params params, bool allocate_gpu_memory) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_radix_lut<Torus> **mem_ptr, Torus const *input_lut,
uint32_t num_radix_blocks, int_radix_params params,
bool allocate_gpu_memory) {

*mem_ptr = new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params,
1, num_radix_blocks, allocate_gpu_memory);
// It is safe to do this copy on GPU 0, because all LUTs always reside on GPU
// 0
cuda_memcpy_async_to_gpu((*mem_ptr)->get_lut(gpu_indexes[0], 0), input_lut,
(params.glwe_dimension + 1) *
params.polynomial_size * sizeof(Torus),
streams[0], gpu_indexes[0]);
cuda_memcpy_async_to_gpu(
(*mem_ptr)->get_lut(gpu_indexes[0], 0), (void *)input_lut,
(params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus),
streams[0], gpu_indexes[0]);
(*mem_ptr)->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
}

template <typename Torus>
void host_apply_univariate_lut_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
void host_apply_univariate_lut_kb(cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *radix_lwe_out,
Torus *radix_lwe_in,
int_radix_lut<Torus> *mem, Torus **ksks,
void **bsks, uint32_t num_blocks) {
Torus const *radix_lwe_in,
int_radix_lut<Torus> *mem, Torus *const *ksks,
void *const *bsks, uint32_t num_blocks) {

integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, radix_lwe_out, radix_lwe_in, bsks, ksks,
@@ -1093,10 +1106,10 @@ void host_apply_univariate_lut_kb(cudaStream_t *streams, uint32_t *gpu_indexes,

template <typename Torus>
void host_apply_many_univariate_lut_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *radix_lwe_out, Torus *radix_lwe_in, int_radix_lut<Torus> *mem,
Torus **ksks, void **bsks, uint32_t num_blocks, uint32_t lut_count,
uint32_t lut_stride) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *radix_lwe_out, Torus const *radix_lwe_in,
int_radix_lut<Torus> *mem, Torus *const *ksks, void *const *bsks,
uint32_t num_blocks, uint32_t lut_count, uint32_t lut_stride) {

integer_radix_apply_many_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, radix_lwe_out, radix_lwe_in, bsks, ksks,
@@ -1105,28 +1118,28 @@ void host_apply_many_univariate_lut_kb(

template <typename Torus>
void scratch_cuda_apply_bivariate_lut_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
int_radix_lut<Torus> **mem_ptr, Torus *input_lut, uint32_t num_radix_blocks,
int_radix_params params, bool allocate_gpu_memory) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_radix_lut<Torus> **mem_ptr, Torus const *input_lut,
uint32_t num_radix_blocks, int_radix_params params,
bool allocate_gpu_memory) {

*mem_ptr = new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params,
1, num_radix_blocks, allocate_gpu_memory);
// It is safe to do this copy on GPU 0, because all LUTs always reside on GPU
// 0
cuda_memcpy_async_to_gpu((*mem_ptr)->get_lut(gpu_indexes[0], 0), input_lut,
(params.glwe_dimension + 1) *
params.polynomial_size * sizeof(Torus),
streams[0], gpu_indexes[0]);
cuda_memcpy_async_to_gpu(
(*mem_ptr)->get_lut(gpu_indexes[0], 0), (void *)input_lut,
(params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus),
streams[0], gpu_indexes[0]);
(*mem_ptr)->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
}

template <typename Torus>
void host_apply_bivariate_lut_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, Torus *radix_lwe_out,
Torus *radix_lwe_in_1, Torus *radix_lwe_in_2,
int_radix_lut<Torus> *mem, Torus **ksks,
void **bsks, uint32_t num_blocks,
uint32_t shift) {
void host_apply_bivariate_lut_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *radix_lwe_out, Torus const *radix_lwe_in_1,
Torus const *radix_lwe_in_2, int_radix_lut<Torus> *mem, Torus *const *ksks,
void *const *bsks, uint32_t num_blocks, uint32_t shift) {

integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, radix_lwe_out, radix_lwe_in_1,

@@ -66,12 +66,12 @@ void generate_ids_update_degrees(int *terms_degree, size_t *h_lwe_idx_in,
* the integer radix multiplication in keyswitch->bootstrap order.
*/
void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t message_modulus, uint32_t carry_modulus, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t pbs_base_log,
uint32_t pbs_level, uint32_t ks_base_log, uint32_t ks_level,
uint32_t grouping_factor, uint32_t num_radix_blocks, PBS_TYPE pbs_type,
bool allocate_gpu_memory) {
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t message_modulus, uint32_t carry_modulus,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t pbs_base_log, uint32_t pbs_level, uint32_t ks_base_log,
uint32_t ks_level, uint32_t grouping_factor, uint32_t num_radix_blocks,
PBS_TYPE pbs_type, bool allocate_gpu_memory) {

int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
polynomial_size * glwe_dimension, lwe_dimension,
@@ -87,7 +87,7 @@ void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
case 8192:
case 16384:
scratch_cuda_integer_mult_radix_ciphertext_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(cudaStream_t const *)(streams), gpu_indexes, gpu_count,
(int_mul_memory<uint64_t> **)mem_ptr, num_radix_blocks, params,
allocate_gpu_memory);
break;
@@ -125,67 +125,67 @@ void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
* - 'pbs_type' selects which PBS implementation should be used
*/
void cuda_integer_mult_radix_ciphertext_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *radix_lwe_out, void *radix_lwe_left, void *radix_lwe_right,
void **bsks, void **ksks, int8_t *mem_ptr, uint32_t polynomial_size,
uint32_t num_blocks) {
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *radix_lwe_out, void const *radix_lwe_left,
void const *radix_lwe_right, void *const *bsks, void *const *ksks,
int8_t *mem_ptr, uint32_t polynomial_size, uint32_t num_blocks) {

switch (polynomial_size) {
case 256:
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<256>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_left),
static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
static_cast<const uint64_t *>(radix_lwe_left),
static_cast<const uint64_t *>(radix_lwe_right), bsks,
(uint64_t **)(ksks), (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
break;
case 512:
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<512>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_left),
static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
static_cast<const uint64_t *>(radix_lwe_left),
static_cast<const uint64_t *>(radix_lwe_right), bsks,
(uint64_t **)(ksks), (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
break;
case 1024:
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<1024>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_left),
static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
static_cast<const uint64_t *>(radix_lwe_left),
static_cast<const uint64_t *>(radix_lwe_right), bsks,
(uint64_t **)(ksks), (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
break;
case 2048:
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<2048>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_left),
static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
static_cast<const uint64_t *>(radix_lwe_left),
static_cast<const uint64_t *>(radix_lwe_right), bsks,
(uint64_t **)(ksks), (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
break;
case 4096:
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<4096>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_left),
static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
static_cast<const uint64_t *>(radix_lwe_left),
static_cast<const uint64_t *>(radix_lwe_right), bsks,
(uint64_t **)(ksks), (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
break;
case 8192:
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<8192>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_left),
static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
static_cast<const uint64_t *>(radix_lwe_left),
static_cast<const uint64_t *>(radix_lwe_right), bsks,
(uint64_t **)(ksks), (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
break;
case 16384:
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<16384>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_left),
static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
static_cast<const uint64_t *>(radix_lwe_left),
static_cast<const uint64_t *>(radix_lwe_right), bsks,
(uint64_t **)(ksks), (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
break;
default:
PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
@@ -193,8 +193,9 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
|
||||
}
|
||||
}
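
Each case above does the same thing: it turns the runtime polynomial_size into the compile-time template argument AmortizedDegree<N> that host_integer_mult_radix_kb requires, since CUDA kernels size shared memory and unroll loops from a constant degree. A minimal, self-contained sketch of that dispatch pattern (names hypothetical, not the repository's API):

#include <cstdint>
#include <stdexcept>

// Stand-in for AmortizedDegree<N>: the degree is a compile-time constant so
// kernels can size shared memory and unroll loops over coefficients.
template <int Degree> struct DegreeTag {
  static constexpr int value = Degree;
};

// One fully specialized instantiation per supported degree.
template <typename TagT> void run_mult_for_degree() {
  static_assert(TagT::value > 0, "degree must be positive");
  // A real implementation would launch kernels specialized on TagT::value.
}

// Runtime size selects a compile-time instantiation, like the switch above.
void dispatch_mult(uint32_t polynomial_size) {
  switch (polynomial_size) {
  case 2048:
    run_mult_for_degree<DegreeTag<2048>>();
    break;
  case 4096:
    run_mult_for_degree<DegreeTag<4096>>();
    break;
  default:
    throw std::invalid_argument("unsupported polynomial size");
  }
}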

void cleanup_cuda_integer_mult(void **streams, uint32_t *gpu_indexes,
uint32_t gpu_count, int8_t **mem_ptr_void) {
void cleanup_cuda_integer_mult(void *const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void) {

int_mul_memory<uint64_t> *mem_ptr =
(int_mul_memory<uint64_t> *)(*mem_ptr_void);
@@ -203,10 +204,10 @@ void cleanup_cuda_integer_mult(void **streams, uint32_t *gpu_indexes,
}
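
The signature migration running through this whole diff is void ** to void *const * and uint32_t * to uint32_t const *. The difference is which level of indirection becomes read-only: a void *const * callee may still use the streams, but can no longer reseat the pointers in the caller's array, which documents intent at the FFI boundary where the bindings are generated. A small standalone illustration of what each form permits:

#include <cstdint>

void reseats_pointers(void **streams) {
  streams[0] = nullptr; // legal: the elements of the array may be rewritten
}

void reads_pointers(void *const *streams) {
  void *first = streams[0]; // reading elements is still fine
  // streams[0] = nullptr;  // would not compile: elements are const
  (void)first;
}

int main() {
  int x = 0;
  void *buf[2] = {&x, &x};
  reseats_pointers(buf);
  reads_pointers(buf); // void ** converts implicitly to void *const *
  return 0;
}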

void scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t lwe_dimension,
uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
uint32_t pbs_base_log, uint32_t grouping_factor,
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory) {
@@ -222,9 +223,10 @@ void scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
}

void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *radix_lwe_out, void *radix_lwe_vec, uint32_t num_radix_in_vec,
int8_t *mem_ptr, void **bsks, void **ksks, uint32_t num_blocks_in_radix) {
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
uint32_t num_blocks_in_radix) {

auto mem = (int_sum_ciphertexts_vec_memory<uint64_t> *)mem_ptr;

@@ -298,7 +300,7 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
}

void cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void) {
int_sum_ciphertexts_vec_memory<uint64_t> *mem_ptr =
(int_sum_ciphertexts_vec_memory<uint64_t> *)(*mem_ptr_void);

@@ -9,10 +9,10 @@
#include "crypto/keyswitch.cuh"
#include "device.h"
#include "helper_multi_gpu.h"
#include "integer.h"
#include "integer/integer.cuh"
#include "integer/integer_utilities.h"
#include "linear_algebra.h"
#include "programmable_bootstrap.h"
#include "pbs/programmable_bootstrap.h"
#include "utils/helper.cuh"
#include "utils/helper_multi_gpu.cuh"
#include "utils/kernel_dimensions.cuh"
@@ -43,8 +43,8 @@ __global__ void smart_copy(Torus *dst, Torus *src, int32_t *id_out,

template <typename Torus, class params>
__global__ void
all_shifted_lhs_rhs(Torus *radix_lwe_left, Torus *lsb_ciphertext,
Torus *msb_ciphertext, Torus *radix_lwe_right,
all_shifted_lhs_rhs(Torus const *radix_lwe_left, Torus *lsb_ciphertext,
Torus *msb_ciphertext, Torus const *radix_lwe_right,
Torus *lsb_rhs, Torus *msb_rhs, int num_blocks) {

size_t block_id = blockIdx.x;
@@ -170,8 +170,8 @@ __global__ void fill_radix_from_lsb_msb(Torus *result_blocks, Torus *lsb_blocks,
}
template <typename Torus>
__host__ void scratch_cuda_integer_partial_sum_ciphertexts_vec_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
int_sum_ciphertexts_vec_memory<Torus> **mem_ptr,
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_sum_ciphertexts_vec_memory<Torus> **mem_ptr,
uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec,
int_radix_params params, bool allocate_gpu_memory) {

@@ -182,9 +182,10 @@ __host__ void scratch_cuda_integer_partial_sum_ciphertexts_vec_kb(

template <typename Torus, class params>
__host__ void host_integer_partial_sum_ciphertexts_vec_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *radix_lwe_out, Torus *terms, int *terms_degree, void **bsks,
uint64_t **ksks, int_sum_ciphertexts_vec_memory<uint64_t> *mem_ptr,
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *radix_lwe_out, Torus *terms, int *terms_degree,
void *const *bsks, uint64_t *const *ksks,
int_sum_ciphertexts_vec_memory<uint64_t> *mem_ptr,
uint32_t num_blocks_in_radix, uint32_t num_radix_in_vec,
int_radix_lut<Torus> *reused_lut) {

@@ -450,9 +451,9 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(

template <typename Torus, class params>
__host__ void host_integer_mult_radix_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
uint64_t *radix_lwe_out, uint64_t *radix_lwe_left,
uint64_t *radix_lwe_right, void **bsks, uint64_t **ksks,
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, uint64_t *radix_lwe_out, uint64_t const *radix_lwe_left,
uint64_t const *radix_lwe_right, void *const *bsks, uint64_t *const *ksks,
int_mul_memory<Torus> *mem_ptr, uint32_t num_blocks) {

auto glwe_dimension = mem_ptr->params.glwe_dimension;
@@ -569,9 +570,10 @@ __host__ void host_integer_mult_radix_kb(

template <typename Torus>
__host__ void scratch_cuda_integer_mult_radix_ciphertext_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
int_mul_memory<Torus> **mem_ptr, uint32_t num_radix_blocks,
int_radix_params params, bool allocate_gpu_memory) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_mul_memory<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params,
bool allocate_gpu_memory) {
*mem_ptr = new int_mul_memory<Torus>(streams, gpu_indexes, gpu_count, params,
num_radix_blocks, allocate_gpu_memory);
}
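
The scratch function above shows the allocation half of the pattern every operation in these files follows: scratch_* news up a typed buffer and hands it back through a type-erased int8_t ** handle, the compute entry point casts the handle back, and cleanup_* releases it. A CPU-side schematic of that lifecycle (types simplified, names hypothetical):

#include <cstdint>

struct op_memory {
  // Device buffers and LUTs live here in the real int_*_memory types.
};

void scratch_op(int8_t **mem_ptr) {
  *mem_ptr = reinterpret_cast<int8_t *>(new op_memory{});
}

void run_op(int8_t *mem_ptr) {
  auto *mem = reinterpret_cast<op_memory *>(mem_ptr);
  (void)mem; // the computation reuses the preallocated buffers
}

void cleanup_op(int8_t **mem_ptr) {
  delete reinterpret_cast<op_memory *>(*mem_ptr);
  *mem_ptr = nullptr;
}

int main() {
  int8_t *mem = nullptr;
  scratch_op(&mem);  // allocate once up front
  run_op(mem);       // call any number of times without reallocating
  cleanup_op(&mem);  // release explicitly
  return 0;
}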

@@ -1,21 +1,21 @@
#include "integer/negation.cuh"

void cuda_negate_integer_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void *lwe_array_in, uint32_t lwe_dimension,
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void const *lwe_array_in, uint32_t lwe_dimension,
uint32_t lwe_ciphertext_count, uint32_t message_modulus,
uint32_t carry_modulus) {

host_integer_radix_negation<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in), lwe_dimension,
static_cast<const uint64_t *>(lwe_array_in), lwe_dimension,
lwe_ciphertext_count, message_modulus, carry_modulus);
}
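
host_integer_radix_negation only ever reads the input ciphertext, hence the new const qualifiers. Conceptually, LWE negation is coefficient-wise modular negation of mask and body; the message_modulus and delta parameters of the device kernel further down suggest each block's body also receives a correcting multiple of the message modulus so that borrows between blocks stay representable. A clear-data sketch of just the coefficient-wise part (illustrative only, not the GPU kernel):

#include <cstdint>
#include <vector>

// Negate a batch of LWE ciphertexts coefficient-wise. Torus arithmetic is
// modulo 2^64, so unsigned wraparound (0 - x) is exactly negation.
std::vector<uint64_t> negate_lwe_blocks(const std::vector<uint64_t> &input,
                                        uint32_t lwe_size, // lwe_dimension + 1
                                        uint32_t num_blocks) {
  std::vector<uint64_t> output(input.size());
  for (uint32_t b = 0; b < num_blocks; ++b)
    for (uint32_t i = 0; i < lwe_size; ++i)
      output[b * lwe_size + i] = 0 - input[b * lwe_size + i];
  return output;
}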

void scratch_cuda_integer_radix_overflowing_sub_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t glwe_dimension, uint32_t polynomial_size,
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
@@ -33,10 +33,10 @@ void scratch_cuda_integer_radix_overflowing_sub_kb_64(
}

void cuda_integer_radix_overflowing_sub_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *radix_lwe_out, void *radix_lwe_overflowed, void *radix_lwe_left,
void *radix_lwe_right, int8_t *mem_ptr, void **bsks, void **ksks,
uint32_t num_blocks) {
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *radix_lwe_out, void *radix_lwe_overflowed, void const *radix_lwe_left,
void const *radix_lwe_right, int8_t *mem_ptr, void *const *bsks,
void *const *ksks, uint32_t num_blocks) {

auto mem = (int_overflowing_sub_memory<uint64_t> *)mem_ptr;

@@ -44,13 +44,13 @@ void cuda_integer_radix_overflowing_sub_kb_64(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_overflowed),
static_cast<uint64_t *>(radix_lwe_left),
static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks), mem,
num_blocks);
static_cast<const uint64_t *>(radix_lwe_left),
static_cast<const uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
mem, num_blocks);
}

void cleanup_cuda_integer_radix_overflowing_sub(void **streams,
uint32_t *gpu_indexes,
void cleanup_cuda_integer_radix_overflowing_sub(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void) {
int_overflowing_sub_memory<uint64_t> *mem_ptr =

@@ -8,10 +8,10 @@

#include "crypto/keyswitch.cuh"
#include "device.h"
#include "integer.h"
#include "integer/integer.cuh"
#include "integer/integer_utilities.h"
#include "linear_algebra.h"
#include "programmable_bootstrap.h"
#include "pbs/programmable_bootstrap.h"
#include "utils/helper.cuh"
#include "utils/kernel_dimensions.cuh"
#include <fstream>
@@ -23,9 +23,9 @@

template <typename Torus>
__global__ void
device_integer_radix_negation(Torus *output, Torus *input, int32_t num_blocks,
uint64_t lwe_dimension, uint64_t message_modulus,
uint64_t delta) {
device_integer_radix_negation(Torus *output, Torus const *input,
int32_t num_blocks, uint64_t lwe_dimension,
uint64_t message_modulus, uint64_t delta) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < lwe_dimension + 1) {
bool is_body = (tid == lwe_dimension);
@@ -54,12 +54,11 @@ device_integer_radix_negation(Torus *output, Torus *input, int32_t num_blocks,
}

template <typename Torus>
__host__ void
host_integer_radix_negation(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, Torus *output, Torus *input,
uint32_t lwe_dimension,
uint32_t input_lwe_ciphertext_count,
uint64_t message_modulus, uint64_t carry_modulus) {
__host__ void host_integer_radix_negation(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *output, Torus const *input,
uint32_t lwe_dimension, uint32_t input_lwe_ciphertext_count,
uint64_t message_modulus, uint64_t carry_modulus) {
cudaSetDevice(gpu_indexes[0]);

// lwe_size includes the presence of the body
@@ -85,9 +84,9 @@ host_integer_radix_negation(cudaStream_t *streams, uint32_t *gpu_indexes,

template <typename Torus>
__host__ void scratch_cuda_integer_overflowing_sub_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
int_overflowing_sub_memory<Torus> **mem_ptr, uint32_t num_blocks,
int_radix_params params, bool allocate_gpu_memory) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_overflowing_sub_memory<Torus> **mem_ptr,
uint32_t num_blocks, int_radix_params params, bool allocate_gpu_memory) {

*mem_ptr = new int_overflowing_sub_memory<Torus>(
streams, gpu_indexes, gpu_count, params, num_blocks, allocate_gpu_memory);
@@ -95,9 +94,10 @@ __host__ void scratch_cuda_integer_overflowing_sub_kb(

template <typename Torus>
__host__ void host_integer_overflowing_sub_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *radix_lwe_out, Torus *radix_lwe_overflowed, Torus *radix_lwe_left,
Torus *radix_lwe_right, void **bsks, uint64_t **ksks,
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *radix_lwe_out, Torus *radix_lwe_overflowed,
Torus const *radix_lwe_left, Torus const *radix_lwe_right,
void *const *bsks, uint64_t *const *ksks,
int_overflowing_sub_memory<uint64_t> *mem_ptr, uint32_t num_blocks) {

auto radix_params = mem_ptr->params;

@@ -1,12 +1,14 @@
#include "integer/scalar_addition.cuh"

void cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
void *scalar_input, uint32_t lwe_dimension, uint32_t lwe_ciphertext_count,
uint32_t message_modulus, uint32_t carry_modulus) {
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array, void const *scalar_input, uint32_t lwe_dimension,
uint32_t lwe_ciphertext_count, uint32_t message_modulus,
uint32_t carry_modulus) {

host_integer_radix_scalar_addition_inplace<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array), static_cast<uint64_t *>(scalar_input),
lwe_dimension, lwe_ciphertext_count, message_modulus, carry_modulus);
static_cast<uint64_t *>(lwe_array),
static_cast<const uint64_t *>(scalar_input), lwe_dimension,
lwe_ciphertext_count, message_modulus, carry_modulus);
}
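
Scalar addition is the cheapest radix operation: the clear scalar blocks only need to be added, scaled by delta, to the body coefficient of each ciphertext block, and the mask is untouched, which is exactly why scalar_input can become a const pointer. A plausible clear-side sketch of the per-block update (the actual kernel body is not shown in this diff):

#include <cstddef>
#include <cstdint>

// Add decomposed clear scalar blocks to ciphertext blocks in place. Only the
// body (the last coefficient of each LWE) changes; the mask does not depend
// on the plaintext and is left alone.
void scalar_add_inplace(uint64_t *lwe_array, const uint64_t *scalar_blocks,
                        uint32_t lwe_dimension, uint32_t num_blocks,
                        uint64_t delta) {
  const size_t lwe_size = static_cast<size_t>(lwe_dimension) + 1;
  for (uint32_t b = 0; b < num_blocks; ++b)
    lwe_array[b * lwe_size + lwe_dimension] += scalar_blocks[b] * delta;
}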

@@ -7,13 +7,13 @@
#endif

#include "device.h"
#include "integer.h"
#include "integer/integer_utilities.h"
#include "utils/kernel_dimensions.cuh"
#include <stdio.h>

template <typename Torus>
__global__ void device_integer_radix_scalar_addition_inplace(
Torus *lwe_array, Torus *scalar_input, int32_t num_blocks,
Torus *lwe_array, Torus const *scalar_input, int32_t num_blocks,
uint32_t lwe_dimension, uint64_t delta) {

int tid = blockIdx.x * blockDim.x + threadIdx.x;
@@ -25,10 +25,10 @@ __global__ void device_integer_radix_scalar_addition_inplace(

template <typename Torus>
__host__ void host_integer_radix_scalar_addition_inplace(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array, Torus *scalar_input, uint32_t lwe_dimension,
uint32_t input_lwe_ciphertext_count, uint32_t message_modulus,
uint32_t carry_modulus) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array, Torus const *scalar_input,
uint32_t lwe_dimension, uint32_t input_lwe_ciphertext_count,
uint32_t message_modulus, uint32_t carry_modulus) {
cudaSetDevice(gpu_indexes[0]);

// Create a 1-dimensional grid of threads
@@ -64,8 +64,8 @@ __global__ void device_integer_radix_add_scalar_one_inplace(

template <typename Torus>
__host__ void host_integer_radix_add_scalar_one_inplace(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array, uint32_t lwe_dimension,
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array, uint32_t lwe_dimension,
uint32_t input_lwe_ciphertext_count, uint32_t message_modulus,
uint32_t carry_modulus) {
cudaSetDevice(gpu_indexes[0]);
@@ -104,10 +104,10 @@ __global__ void device_integer_radix_scalar_subtraction_inplace(

template <typename Torus>
__host__ void host_integer_radix_scalar_subtraction_inplace(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array, Torus *scalar_input, uint32_t lwe_dimension,
uint32_t input_lwe_ciphertext_count, uint32_t message_modulus,
uint32_t carry_modulus) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array, Torus *scalar_input,
uint32_t lwe_dimension, uint32_t input_lwe_ciphertext_count,
uint32_t message_modulus, uint32_t carry_modulus) {
cudaSetDevice(gpu_indexes[0]);

// Create a 1-dimensional grid of threads

@@ -1,16 +1,16 @@
#include "integer/scalar_bitops.cuh"

void cuda_scalar_bitop_integer_radix_ciphertext_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void *lwe_array_input, void *clear_blocks,
uint32_t num_clear_blocks, int8_t *mem_ptr, void **bsks, void **ksks,
uint32_t lwe_ciphertext_count, BITOP_TYPE op) {
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void const *lwe_array_input, void const *clear_blocks,
uint32_t num_clear_blocks, int8_t *mem_ptr, void *const *bsks,
void *const *ksks, uint32_t lwe_ciphertext_count, BITOP_TYPE op) {

host_integer_radix_scalar_bitop_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_input),
static_cast<uint64_t *>(clear_blocks), num_clear_blocks,
static_cast<const uint64_t *>(lwe_array_input),
static_cast<const uint64_t *>(clear_blocks), num_clear_blocks,
(int_bitop_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
lwe_ciphertext_count, op);
}

@@ -6,10 +6,11 @@

template <typename Torus>
__host__ void host_integer_radix_scalar_bitop_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_input, Torus *clear_blocks,
uint32_t num_clear_blocks, int_bitop_buffer<Torus> *mem_ptr, void **bsks,
Torus **ksks, uint32_t num_radix_blocks, BITOP_TYPE op) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_input,
Torus const *clear_blocks, uint32_t num_clear_blocks,
int_bitop_buffer<Torus> *mem_ptr, void *const *bsks, Torus *const *ksks,
uint32_t num_radix_blocks, BITOP_TYPE op) {

auto lut = mem_ptr->lut;
auto params = lut->params;

@@ -1,10 +1,10 @@
#include "integer/scalar_comparison.cuh"

void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void *lwe_array_in, void *scalar_blocks,
int8_t *mem_ptr, void **bsks, void **ksks, uint32_t lwe_ciphertext_count,
uint32_t num_scalar_blocks) {
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void const *lwe_array_in, void const *scalar_blocks,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
uint32_t lwe_ciphertext_count, uint32_t num_scalar_blocks) {

int_comparison_buffer<uint64_t> *buffer =
(int_comparison_buffer<uint64_t> *)mem_ptr;
@@ -14,8 +14,8 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
host_integer_radix_scalar_equality_check_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(scalar_blocks), buffer, bsks,
static_cast<const uint64_t *>(lwe_array_in),
static_cast<const uint64_t *>(scalar_blocks), buffer, bsks,
(uint64_t **)(ksks), lwe_ciphertext_count, num_scalar_blocks);
break;
case GT:
@@ -25,8 +25,8 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
host_integer_radix_scalar_difference_check_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(scalar_blocks), buffer,
static_cast<const uint64_t *>(lwe_array_in),
static_cast<const uint64_t *>(scalar_blocks), buffer,
buffer->diff_buffer->operator_f, bsks, (uint64_t **)(ksks),
lwe_ciphertext_count, num_scalar_blocks);
break;
@@ -35,8 +35,8 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
host_integer_radix_scalar_maxmin_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(scalar_blocks), buffer, bsks,
static_cast<const uint64_t *>(lwe_array_in),
static_cast<const uint64_t *>(scalar_blocks), buffer, bsks,
(uint64_t **)(ksks), lwe_ciphertext_count, num_scalar_blocks);
break;
default:

@@ -5,10 +5,10 @@

template <typename Torus>
__host__ void scalar_compare_radix_blocks_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
uint32_t num_radix_blocks) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out, Torus *lwe_array_in,
Torus *scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
void *const *bsks, Torus *const *ksks, uint32_t num_radix_blocks) {

if (num_radix_blocks == 0)
return;
@@ -57,11 +57,12 @@ __host__ void scalar_compare_radix_blocks_kb(

template <typename Torus>
__host__ void integer_radix_unsigned_scalar_difference_check_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
int_comparison_buffer<Torus> *mem_ptr,
std::function<Torus(Torus)> sign_handler_f, void **bsks, Torus **ksks,
uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_in,
Torus const *scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
std::function<Torus(Torus)> sign_handler_f, void *const *bsks,
Torus *const *ksks, uint32_t total_num_radix_blocks,
uint32_t total_num_scalar_blocks) {

auto params = mem_ptr->params;
auto big_lwe_dimension = params.big_lwe_dimension;
@@ -243,11 +244,12 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(

template <typename Torus>
__host__ void integer_radix_signed_scalar_difference_check_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
int_comparison_buffer<Torus> *mem_ptr,
std::function<Torus(Torus)> sign_handler_f, void **bsks, Torus **ksks,
uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_in,
Torus const *scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
std::function<Torus(Torus)> sign_handler_f, void *const *bsks,
Torus *const *ksks, uint32_t total_num_radix_blocks,
uint32_t total_num_scalar_blocks) {

auto params = mem_ptr->params;
auto big_lwe_dimension = params.big_lwe_dimension;
@@ -287,7 +289,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
host_compare_with_zero_equality<Torus>(
streams, gpu_indexes, gpu_count, are_all_msb_zeros, lwe_array_in,
mem_ptr, bsks, ksks, total_num_radix_blocks, mem_ptr->is_zero_lut);
Torus *sign_block =
Torus const *sign_block =
lwe_array_in + (total_num_radix_blocks - 1) * big_lwe_size;

auto sign_bit_pos = (int)std::log2(message_modulus) - 1;
@@ -426,7 +428,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
lut_f);
signed_msb_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);

Torus *sign_block = msb + (num_msb_radix_blocks - 1) * big_lwe_size;
Torus const *sign_block = msb + (num_msb_radix_blocks - 1) * big_lwe_size;
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
msb_streams, gpu_indexes, gpu_count, lwe_array_msb_out, sign_block,
are_all_msb_zeros, bsks, ksks, 1, signed_msb_lut,
@@ -476,9 +478,10 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
scalar_compare_radix_blocks_kb<Torus>(lsb_streams, gpu_indexes, gpu_count,
lwe_array_ct_out, lhs, rhs, mem_ptr,
bsks, ksks, num_lsb_radix_blocks);
Torus *encrypted_sign_block =
Torus const *encrypted_sign_block =
lwe_array_in + (total_num_radix_blocks - 1) * big_lwe_size;
Torus *scalar_sign_block = scalar_blocks + (total_num_scalar_blocks - 1);
Torus const *scalar_sign_block =
scalar_blocks + (total_num_scalar_blocks - 1);

auto trivial_sign_block = mem_ptr->tmp_trivial_sign_block;
create_trivial_radix<Torus>(
@@ -505,10 +508,11 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(

template <typename Torus>
__host__ void integer_radix_signed_scalar_maxmin_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out, Torus *lwe_array_in,
Torus *scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
void *const *bsks, Torus *const *ksks, uint32_t total_num_radix_blocks,
uint32_t total_num_scalar_blocks) {

auto params = mem_ptr->params;
// Calculates the difference sign between the ciphertext and the scalar
@@ -541,11 +545,12 @@ __host__ void integer_radix_signed_scalar_maxmin_kb(

template <typename Torus>
__host__ void host_integer_radix_scalar_difference_check_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
int_comparison_buffer<Torus> *mem_ptr,
std::function<Torus(Torus)> sign_handler_f, void **bsks, Torus **ksks,
uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_in,
Torus const *scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
std::function<Torus(Torus)> sign_handler_f, void *const *bsks,
Torus *const *ksks, uint32_t total_num_radix_blocks,
uint32_t total_num_scalar_blocks) {

if (mem_ptr->is_signed) {
// is signed and scalar is positive
@@ -563,10 +568,11 @@ __host__ void host_integer_radix_scalar_difference_check_kb(

template <typename Torus>
__host__ void host_integer_radix_signed_scalar_maxmin_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out, Torus *lwe_array_in,
Torus *scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
void *const *bsks, Torus *const *ksks, uint32_t total_num_radix_blocks,
uint32_t total_num_scalar_blocks) {

if (mem_ptr->is_signed) {
// is signed and scalar is positive
@@ -582,10 +588,11 @@ __host__ void host_integer_radix_signed_scalar_maxmin_kb(

template <typename Torus>
__host__ void host_integer_radix_scalar_maxmin_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_in,
Torus const *scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
void *const *bsks, Torus *const *ksks, uint32_t total_num_radix_blocks,
uint32_t total_num_scalar_blocks) {

auto params = mem_ptr->params;

@@ -619,10 +626,11 @@ __host__ void host_integer_radix_scalar_maxmin_kb(

template <typename Torus>
__host__ void host_integer_radix_scalar_equality_check_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
uint32_t num_radix_blocks, uint32_t num_scalar_blocks) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_in,
Torus const *scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
void *const *bsks, Torus *const *ksks, uint32_t num_radix_blocks,
uint32_t num_scalar_blocks) {

auto params = mem_ptr->params;
auto big_lwe_dimension = params.big_lwe_dimension;

@@ -1,12 +1,12 @@
#include "integer/scalar_mul.cuh"

void scratch_cuda_integer_scalar_mul_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t lwe_dimension,
uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory) {
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, bool allocate_gpu_memory) {

int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
glwe_dimension * polynomial_size, lwe_dimension,
@@ -20,9 +20,10 @@ void scratch_cuda_integer_scalar_mul_kb_64(
}

void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
uint64_t *decomposed_scalar, uint64_t *has_at_least_one_set, int8_t *mem,
void **bsks, void **ksks, uint32_t lwe_dimension, uint32_t polynomial_size,
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array, uint64_t const *decomposed_scalar,
uint64_t const *has_at_least_one_set, int8_t *mem, void *const *bsks,
void *const *ksks, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t message_modulus, uint32_t num_blocks, uint32_t num_scalars) {

switch (polynomial_size) {
@@ -86,8 +87,8 @@ void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
}
}
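
decomposed_scalar and has_at_least_one_set are clear inputs prepared on the host, so they too become const pointers. The underlying idea is double-and-add: a scalar product reduces to shifting the ciphertext by each set bit position of the scalar and summing the shifted copies. A clear-data sketch of that decomposition (illustrative; the exact buffer layout used by int_scalar_mul_buffer is defined elsewhere in the repository):

#include <cstdint>
#include <vector>

// Double-and-add in the clear: multiplying by a scalar is the sum of the
// value shifted to each set bit position. Homomorphically, shifts are cheap,
// so only the set bits contribute terms to the (expensive) encrypted sum.
uint64_t mul_via_shifts(uint64_t value, uint64_t scalar,
                        std::vector<uint32_t> *set_bit_positions) {
  uint64_t acc = 0;
  for (uint32_t bit = 0; bit < 64; ++bit) {
    if ((scalar >> bit) & 1) {
      acc += value << bit; // wraps mod 2^64, matching Torus arithmetic
      if (set_bit_positions)
        set_bit_positions->push_back(bit);
    }
  }
  return acc;
}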

void cleanup_cuda_integer_radix_scalar_mul(void **streams,
uint32_t *gpu_indexes,
void cleanup_cuda_integer_radix_scalar_mul(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void) {


@@ -7,7 +7,7 @@
#endif

#include "device.h"
#include "integer.h"
#include "integer/integer_utilities.h"
#include "multiplication.cuh"
#include "scalar_shifts.cuh"
#include "utils/kernel_dimensions.cuh"
@@ -29,9 +29,10 @@ __global__ void device_small_scalar_radix_multiplication(T *output_lwe_array,

template <typename T>
__host__ void scratch_cuda_integer_radix_scalar_mul_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
int_scalar_mul_buffer<T> **mem_ptr, uint32_t num_radix_blocks,
int_radix_params params, bool allocate_gpu_memory) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_scalar_mul_buffer<T> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params,
bool allocate_gpu_memory) {

*mem_ptr =
new int_scalar_mul_buffer<T>(streams, gpu_indexes, gpu_count, params,
@@ -40,11 +41,11 @@ __host__ void scratch_cuda_integer_radix_scalar_mul_kb(

template <typename T, class params>
__host__ void host_integer_scalar_mul_radix(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
T *lwe_array, T *decomposed_scalar, T *has_at_least_one_set,
int_scalar_mul_buffer<T> *mem, void **bsks, T **ksks,
uint32_t input_lwe_dimension, uint32_t message_modulus,
uint32_t num_radix_blocks, uint32_t num_scalars) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, T *lwe_array, T const *decomposed_scalar,
T const *has_at_least_one_set, int_scalar_mul_buffer<T> *mem,
void *const *bsks, T *const *ksks, uint32_t input_lwe_dimension,
uint32_t message_modulus, uint32_t num_radix_blocks, uint32_t num_scalars) {

if (num_radix_blocks == 0 || num_scalars == 0)
return;
@@ -121,8 +122,8 @@ __host__ void host_integer_scalar_mul_radix(
// Small scalar_mul is used in shift/rotate
template <typename T>
__host__ void host_integer_small_scalar_mul_radix(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
T *output_lwe_array, T *input_lwe_array, T scalar,
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, T *output_lwe_array, T *input_lwe_array, T scalar,
uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) {

cudaSetDevice(gpu_indexes[0]);

@@ -1,8 +1,8 @@
#include "scalar_rotate.cuh"

void scratch_cuda_integer_radix_scalar_rotate_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t glwe_dimension, uint32_t polynomial_size,
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
@@ -21,9 +21,9 @@ void scratch_cuda_integer_radix_scalar_rotate_kb_64(
}

void cuda_integer_radix_scalar_rotate_kb_64_inplace(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
uint32_t n, int8_t *mem_ptr, void **bsks, void **ksks,
uint32_t num_blocks) {
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array, uint32_t n, int8_t *mem_ptr, void *const *bsks,
void *const *ksks, uint32_t num_blocks) {

host_integer_radix_scalar_rotate_kb_inplace<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
@@ -32,8 +32,8 @@ void cuda_integer_radix_scalar_rotate_kb_64_inplace(
(uint64_t **)(ksks), num_blocks);
}

void cleanup_cuda_integer_radix_scalar_rotate(void **streams,
uint32_t *gpu_indexes,
void cleanup_cuda_integer_radix_scalar_rotate(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void) {


@@ -4,7 +4,7 @@
#include "crypto/keyswitch.cuh"
#include "device.h"
#include "integer.cuh"
#include "integer.h"
#include "integer/integer_utilities.h"
#include "pbs/programmable_bootstrap_classic.cuh"
#include "pbs/programmable_bootstrap_multibit.cuh"
#include "types/complex/operations.cuh"
@@ -13,10 +13,10 @@

template <typename Torus>
__host__ void scratch_cuda_integer_radix_scalar_rotate_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
int_logical_scalar_shift_buffer<Torus> **mem_ptr, uint32_t num_radix_blocks,
int_radix_params params, SHIFT_OR_ROTATE_TYPE shift_type,
bool allocate_gpu_memory) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_logical_scalar_shift_buffer<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params,
SHIFT_OR_ROTATE_TYPE shift_type, bool allocate_gpu_memory) {

*mem_ptr = new int_logical_scalar_shift_buffer<Torus>(
streams, gpu_indexes, gpu_count, shift_type, params, num_radix_blocks,
@@ -25,9 +25,10 @@ __host__ void scratch_cuda_integer_radix_scalar_rotate_kb(

template <typename Torus>
__host__ void host_integer_radix_scalar_rotate_kb_inplace(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array, uint32_t n, int_logical_scalar_shift_buffer<Torus> *mem,
void **bsks, Torus **ksks, uint32_t num_blocks) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array, uint32_t n,
int_logical_scalar_shift_buffer<Torus> *mem, void *const *bsks,
Torus *const *ksks, uint32_t num_blocks) {

auto params = mem->params;
auto glwe_dimension = params.glwe_dimension;

@@ -1,8 +1,8 @@
#include "scalar_shifts.cuh"

void scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t glwe_dimension, uint32_t polynomial_size,
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
@@ -25,9 +25,9 @@ void scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
/// the application of a PBS onto the rotated blocks up to num_blocks -
/// rotations - 1. The remaining blocks are padded with zeros.
void cuda_integer_radix_logical_scalar_shift_kb_64_inplace(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
uint32_t shift, int8_t *mem_ptr, void **bsks, void **ksks,
uint32_t num_blocks) {
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array, uint32_t shift, int8_t *mem_ptr, void *const *bsks,
void *const *ksks, uint32_t num_blocks) {

host_integer_radix_logical_scalar_shift_kb_inplace<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
@@ -37,8 +37,8 @@ void cuda_integer_radix_logical_scalar_shift_kb_64_inplace(
}
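
As the /// comment above describes, a logical scalar shift splits into whole-block rotations plus one PBS pass for the intra-block remainder, with the vacated blocks padded with zeros. The block-level half is ordinary data movement and is easy to picture on clear block arrays (sketch, shown for a right shift; bits_per_block = log2(message_modulus)):

#include <cstdint>
#include <vector>

// Right-shift a radix integer by whole blocks: move every block down by
// `rotations` and leave the vacated most significant blocks at zero.
// The encrypted version performs the same moves on LWE ciphertext blocks.
std::vector<uint64_t> shift_blocks_right(const std::vector<uint64_t> &blocks,
                                         uint32_t rotations) {
  std::vector<uint64_t> out(blocks.size(), 0);
  for (size_t i = 0; i + rotations < blocks.size(); ++i)
    out[i] = blocks[i + rotations];
  return out;
}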

void scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t glwe_dimension, uint32_t polynomial_size,
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
@@ -64,9 +64,9 @@ void scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
/// block, which is copied onto all remaining blocks instead of padding with
/// zeros as would be done in the logical shift.
void cuda_integer_radix_arithmetic_scalar_shift_kb_64_inplace(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
uint32_t shift, int8_t *mem_ptr, void **bsks, void **ksks,
uint32_t num_blocks) {
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array, uint32_t shift, int8_t *mem_ptr, void *const *bsks,
void *const *ksks, uint32_t num_blocks) {

host_integer_radix_arithmetic_scalar_shift_kb_inplace<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
@@ -75,10 +75,9 @@ void cuda_integer_radix_arithmetic_scalar_shift_kb_64_inplace(
(uint64_t **)(ksks), num_blocks);
}

void cleanup_cuda_integer_radix_logical_scalar_shift(void **streams,
uint32_t *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void) {
void cleanup_cuda_integer_radix_logical_scalar_shift(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void) {

int_logical_scalar_shift_buffer<uint64_t> *mem_ptr =
(int_logical_scalar_shift_buffer<uint64_t> *)(*mem_ptr_void);
@@ -86,10 +85,9 @@ void cleanup_cuda_integer_radix_logical_scalar_shift(void **streams,
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
}

void cleanup_cuda_integer_radix_arithmetic_scalar_shift(void **streams,
uint32_t *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void) {
void cleanup_cuda_integer_radix_arithmetic_scalar_shift(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void) {

int_arithmetic_scalar_shift_buffer<uint64_t> *mem_ptr =
(int_arithmetic_scalar_shift_buffer<uint64_t> *)(*mem_ptr_void);

@@ -4,7 +4,7 @@
#include "crypto/keyswitch.cuh"
#include "device.h"
#include "integer.cuh"
#include "integer.h"
#include "integer/integer_utilities.h"
#include "pbs/programmable_bootstrap_classic.cuh"
#include "pbs/programmable_bootstrap_multibit.cuh"
#include "types/complex/operations.cuh"
@@ -13,10 +13,10 @@

template <typename Torus>
__host__ void scratch_cuda_integer_radix_logical_scalar_shift_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
int_logical_scalar_shift_buffer<Torus> **mem_ptr, uint32_t num_radix_blocks,
int_radix_params params, SHIFT_OR_ROTATE_TYPE shift_type,
bool allocate_gpu_memory) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_logical_scalar_shift_buffer<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params,
SHIFT_OR_ROTATE_TYPE shift_type, bool allocate_gpu_memory) {

*mem_ptr = new int_logical_scalar_shift_buffer<Torus>(
streams, gpu_indexes, gpu_count, shift_type, params, num_radix_blocks,
@@ -25,10 +25,10 @@ __host__ void scratch_cuda_integer_radix_logical_scalar_shift_kb(

template <typename Torus>
__host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array, uint32_t shift,
int_logical_scalar_shift_buffer<Torus> *mem, void **bsks, Torus **ksks,
uint32_t num_blocks) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array, uint32_t shift,
int_logical_scalar_shift_buffer<Torus> *mem, void *const *bsks,
Torus *const *ksks, uint32_t num_blocks) {

auto params = mem->params;
auto glwe_dimension = params.glwe_dimension;
@@ -116,8 +116,8 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(

template <typename Torus>
__host__ void scratch_cuda_integer_radix_arithmetic_scalar_shift_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
int_arithmetic_scalar_shift_buffer<Torus> **mem_ptr,
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_arithmetic_scalar_shift_buffer<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params,
SHIFT_OR_ROTATE_TYPE shift_type, bool allocate_gpu_memory) {

@@ -128,10 +128,10 @@ __host__ void scratch_cuda_integer_radix_arithmetic_scalar_shift_kb(

template <typename Torus>
__host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array, uint32_t shift,
int_arithmetic_scalar_shift_buffer<Torus> *mem, void **bsks, Torus **ksks,
uint32_t num_blocks) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array, uint32_t shift,
int_arithmetic_scalar_shift_buffer<Torus> *mem, void *const *bsks,
Torus *const *ksks, uint32_t num_blocks) {

auto params = mem->params;
auto glwe_dimension = params.glwe_dimension;

@@ -1,8 +1,8 @@
#include "shift_and_rotate.cuh"

void scratch_cuda_integer_radix_shift_and_rotate_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t glwe_dimension, uint32_t polynomial_size,
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
@@ -21,19 +21,20 @@ void scratch_cuda_integer_radix_shift_and_rotate_kb_64(
}

void cuda_integer_radix_shift_and_rotate_kb_64_inplace(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
void *lwe_shift, int8_t *mem_ptr, void **bsks, void **ksks,
uint32_t num_blocks) {
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array, void const *lwe_shift, int8_t *mem_ptr, void *const *bsks,
void *const *ksks, uint32_t num_blocks) {

host_integer_radix_shift_and_rotate_kb_inplace<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array), static_cast<uint64_t *>(lwe_shift),
static_cast<uint64_t *>(lwe_array),
static_cast<const uint64_t *>(lwe_shift),
(int_shift_and_rotate_buffer<uint64_t> *)mem_ptr, bsks,
(uint64_t **)(ksks), num_blocks);
}

void cleanup_cuda_integer_radix_shift_and_rotate(void **streams,
uint32_t *gpu_indexes,
void cleanup_cuda_integer_radix_shift_and_rotate(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void) {
int_shift_and_rotate_buffer<uint64_t> *mem_ptr =

@@ -4,7 +4,7 @@
#include "crypto/keyswitch.cuh"
#include "device.h"
#include "integer.cuh"
#include "integer.h"
#include "integer/integer_utilities.h"
#include "pbs/programmable_bootstrap_classic.cuh"
#include "pbs/programmable_bootstrap_multibit.cuh"
#include "scalar_mul.cuh"
@@ -14,10 +14,10 @@

template <typename Torus>
__host__ void scratch_cuda_integer_radix_shift_and_rotate_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
int_shift_and_rotate_buffer<Torus> **mem_ptr, uint32_t num_radix_blocks,
int_radix_params params, SHIFT_OR_ROTATE_TYPE shift_type, bool is_signed,
bool allocate_gpu_memory) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_shift_and_rotate_buffer<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params,
SHIFT_OR_ROTATE_TYPE shift_type, bool is_signed, bool allocate_gpu_memory) {
*mem_ptr = new int_shift_and_rotate_buffer<Torus>(
streams, gpu_indexes, gpu_count, shift_type, is_signed, params,
num_radix_blocks, allocate_gpu_memory);
@@ -25,9 +25,10 @@ __host__ void scratch_cuda_integer_radix_shift_and_rotate_kb(

template <typename Torus>
__host__ void host_integer_radix_shift_and_rotate_kb_inplace(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array, Torus *lwe_shift, int_shift_and_rotate_buffer<Torus> *mem,
void **bsks, Torus **ksks, uint32_t num_radix_blocks) {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array, Torus const *lwe_shift,
int_shift_and_rotate_buffer<Torus> *mem, void *const *bsks,
Torus *const *ksks, uint32_t num_radix_blocks) {
uint32_t bits_per_block = std::log2(mem->params.message_modulus);
uint32_t total_nb_bits = bits_per_block * num_radix_blocks;
if (total_nb_bits == 0)
@@ -60,8 +61,9 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
// Extracts bits and puts them at bit index 2 (=> bit number 3)
// so that it is already aligned to the correct position of the cmux input
// and we reduce noise growth
extract_n_bits<Torus>(streams, gpu_indexes, gpu_count, shift_bits, lwe_shift,
bsks, ksks, 1, max_num_bits_that_tell_shift,
extract_n_bits<Torus>(streams, gpu_indexes, gpu_count, shift_bits,
(Torus *)lwe_shift, bsks, ksks, 1,
max_num_bits_that_tell_shift,
mem->bit_extract_luts_with_offset_2);

// If signed, do an "arithmetic shift" by padding with the sign bit
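
The extracted shift_bits then drive a barrel shifter: stage k uses bit k of the encrypted shift amount in a cmux that selects between the current value and the value shifted by 2^k, so only about log2(total_nb_bits) encrypted selections are needed. The clear-data analogue of that ladder (sketch):

#include <cstdint>

// Stage k conditionally applies a shift by 2^k, selected by bit k of the
// shift amount. Encrypted, each ternary select is a cmux evaluated under FHE.
// num_stages must be at most 6 for 64-bit values (2^6 = 64).
uint64_t barrel_shift_right(uint64_t value, uint32_t shift_amount,
                            uint32_t num_stages) {
  for (uint32_t k = 0; k < num_stages; ++k) {
    uint64_t shifted = value >> (1u << k);
    bool select = (shift_amount >> k) & 1;
    value = select ? shifted : value;
  }
  return value;
}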

@@ -6,15 +6,15 @@
*/
void cuda_add_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
void *lwe_array_out,
void *lwe_array_in_1,
void *lwe_array_in_2,
void const *lwe_array_in_1,
void const *lwe_array_in_2,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {

host_addition<uint32_t>(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_array_in_1),
static_cast<uint32_t *>(lwe_array_in_2),
static_cast<const uint32_t *>(lwe_array_in_1),
static_cast<const uint32_t *>(lwe_array_in_2),
input_lwe_dimension, input_lwe_ciphertext_count);
}

@@ -46,15 +46,15 @@ void cuda_add_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
*/
void cuda_add_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
void *lwe_array_out,
void *lwe_array_in_1,
void *lwe_array_in_2,
void const *lwe_array_in_1,
void const *lwe_array_in_2,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {

host_addition<uint64_t>(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in_1),
static_cast<uint64_t *>(lwe_array_in_2),
static_cast<const uint64_t *>(lwe_array_in_1),
static_cast<const uint64_t *>(lwe_array_in_2),
input_lwe_dimension, input_lwe_ciphertext_count);
}
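
These wrappers need no bootstrapping or keyswitching keys because LWE addition is linear: adding two ciphertexts coefficient-wise yields an encryption of the sum of their plaintexts (with added noise), so the host functions just cast and forward to a kernel equivalent to this sketch:

#include <cstddef>
#include <cstdint>

// Coefficient-wise addition over the whole batch: each output coefficient is
// the wrapping sum of the matching input coefficients. num_entries is
// (lwe_dimension + 1) * lwe_ciphertext_count.
void add_lwe_batch(uint64_t *out, const uint64_t *in_1, const uint64_t *in_2,
                   size_t num_entries) {
  for (size_t i = 0; i < num_entries; ++i)
    out[i] = in_1[i] + in_2[i]; // unsigned wraparound = addition mod 2^64
}
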
/*
@@ -62,15 +62,15 @@ void cuda_add_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
* plaintext vector. See the equivalent operation on u64 data for more details.
*/
void cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
void *plaintext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_array_in, void const *plaintext_array_in,
uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) {

host_addition_plaintext<uint32_t>(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_array_in),
static_cast<uint32_t *>(plaintext_array_in), input_lwe_dimension,
static_cast<const uint32_t *>(lwe_array_in),
static_cast<const uint32_t *>(plaintext_array_in), input_lwe_dimension,
input_lwe_ciphertext_count);
}
/*
@@ -102,14 +102,14 @@ void cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
* performs the operation on the GPU.
*/
void cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
void *plaintext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_array_in, void const *plaintext_array_in,
uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) {

host_addition_plaintext<uint64_t>(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(plaintext_array_in), input_lwe_dimension,
static_cast<const uint64_t *>(lwe_array_in),
static_cast<const uint64_t *>(plaintext_array_in), input_lwe_dimension,
input_lwe_ciphertext_count);
}

@@ -13,9 +13,9 @@
#include <stdio.h>

template <typename T>
__global__ void plaintext_addition(T *output, T *lwe_input, T *plaintext_input,
uint32_t input_lwe_dimension,
uint32_t num_entries) {
__global__ void
plaintext_addition(T *output, T const *lwe_input, T const *plaintext_input,
uint32_t input_lwe_dimension, uint32_t num_entries) {

int tid = threadIdx.x;
int plaintext_index = blockIdx.x * blockDim.x + tid;
@@ -30,7 +30,7 @@ __global__ void plaintext_addition(T *output, T *lwe_input, T *plaintext_input,
template <typename T>
__host__ void
host_addition_plaintext(cudaStream_t stream, uint32_t gpu_index, T *output,
T *lwe_input, T *plaintext_input,
T const *lwe_input, T const *plaintext_input,
uint32_t lwe_dimension, uint32_t lwe_ciphertext_count) {

cudaSetDevice(gpu_index);
@@ -49,7 +49,7 @@ host_addition_plaintext(cudaStream_t stream, uint32_t gpu_index, T *output,
}

template <typename T>
__global__ void addition(T *output, T *input_1, T *input_2,
__global__ void addition(T *output, T const *input_1, T const *input_2,
uint32_t num_entries) {

int tid = threadIdx.x;
@@ -63,7 +63,7 @@ __global__ void addition(T *output, T *input_1, T *input_2,
// Coefficient-wise addition
template <typename T>
__host__ void host_addition(cudaStream_t stream, uint32_t gpu_index, T *output,
T *input_1, T *input_2,
T const *input_1, T const *input_2,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {

@@ -83,7 +83,7 @@ __host__ void host_addition(cudaStream_t stream, uint32_t gpu_index, T *output,
}

template <typename T>
__global__ void subtraction(T *output, T *input_1, T *input_2,
__global__ void subtraction(T *output, T const *input_1, T const *input_2,
uint32_t num_entries) {

int tid = threadIdx.x;
@@ -97,7 +97,7 @@ __global__ void subtraction(T *output, T *input_1, T *input_2,
// Coefficient-wise subtraction
template <typename T>
__host__ void host_subtraction(cudaStream_t stream, uint32_t gpu_index,
T *output, T *input_1, T *input_2,
T *output, T const *input_1, T const *input_2,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {

@@ -157,9 +157,11 @@ __host__ void host_subtraction_plaintext(cudaStream_t stream,
}

template <typename T>
__global__ void unchecked_sub_with_correcting_term(
T *output, T *input_1, T *input_2, uint32_t num_entries, uint32_t lwe_size,
uint32_t message_modulus, uint32_t carry_modulus, uint32_t degree) {
__global__ void
unchecked_sub_with_correcting_term(T *output, T const *input_1,
T const *input_2, uint32_t num_entries,
uint32_t lwe_size, uint32_t message_modulus,
uint32_t carry_modulus, uint32_t degree) {
uint32_t msg_mod = message_modulus;
uint64_t z = max((uint64_t)ceil(degree / msg_mod), (uint64_t)1);
z *= msg_mod;
|
||||
@@ -178,9 +180,10 @@ __global__ void unchecked_sub_with_correcting_term(
|
||||
}
|
||||
template <typename T>
|
||||
__host__ void host_unchecked_sub_with_correcting_term(
|
||||
cudaStream_t stream, uint32_t gpu_index, T *output, T *input_1, T *input_2,
|
||||
uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, uint32_t degree) {
|
||||
cudaStream_t stream, uint32_t gpu_index, T *output, T const *input_1,
|
||||
T const *input_2, uint32_t input_lwe_dimension,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, uint32_t degree) {
|
||||
|
||||
cudaSetDevice(gpu_index);
|
||||
// lwe_size includes the presence of the body
|
||||
|
||||
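One remark on the correcting term computed above: because degree and msg_mod are unsigned integers, degree / msg_mod truncates before ceil ever sees it, so the ceil call is a no-op. If the intent is the mathematical ceiling, that is, the smallest multiple of msg_mod that is at least degree (and at least msg_mod), integer-only ceiling division expresses it without going through floating point. A minimal sketch, not the library's code:

// ceil(degree / msg_mod) * msg_mod, clamped to at least msg_mod.
__host__ __device__ inline uint64_t correcting_term(uint32_t degree,
                                                    uint32_t msg_mod) {
  uint64_t q = ((uint64_t)degree + msg_mod - 1) / msg_mod;
  if (q == 0)
    q = 1;
  return q * msg_mod;
}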
@@ -5,15 +5,15 @@
 * cleartext vector. See the equivalent operation on u64 data for more details.
 */
void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32(
    void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
    void *cleartext_array_in, uint32_t input_lwe_dimension,
    uint32_t input_lwe_ciphertext_count) {
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const *lwe_array_in, void const *cleartext_array_in,
    uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) {

  host_cleartext_vec_multiplication<uint32_t>(
      static_cast<cudaStream_t>(stream), gpu_index,
      static_cast<uint32_t *>(lwe_array_out),
      static_cast<uint32_t *>(lwe_array_in),
      static_cast<uint32_t *>(cleartext_array_in), input_lwe_dimension,
      static_cast<const uint32_t *>(lwe_array_in),
      static_cast<const uint32_t *>(cleartext_array_in), input_lwe_dimension,
      input_lwe_ciphertext_count);
}
/*
@@ -45,14 +45,14 @@ void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32(
 * function that performs the operation on the GPU.
 */
void cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
    void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
    void *cleartext_array_in, uint32_t input_lwe_dimension,
    uint32_t input_lwe_ciphertext_count) {
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const *lwe_array_in, void const *cleartext_array_in,
    uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) {

  host_cleartext_vec_multiplication<uint64_t>(
      static_cast<cudaStream_t>(stream), gpu_index,
      static_cast<uint64_t *>(lwe_array_out),
      static_cast<uint64_t *>(lwe_array_in),
      static_cast<uint64_t *>(cleartext_array_in), input_lwe_dimension,
      static_cast<const uint64_t *>(lwe_array_in),
      static_cast<const uint64_t *>(cleartext_array_in), input_lwe_dimension,
      input_lwe_ciphertext_count);
}

@@ -14,8 +14,8 @@
#include <vector>

template <typename T>
__global__ void cleartext_vec_multiplication(T *output, T *lwe_input,
                                             T *cleartext_input,
__global__ void cleartext_vec_multiplication(T *output, T const *lwe_input,
                                             T const *cleartext_input,
                                             uint32_t input_lwe_dimension,
                                             uint32_t num_entries) {

@@ -29,11 +29,10 @@ __global__ void cleartext_vec_multiplication(T *output, T *lwe_input,
}

template <typename T>
__host__ void
host_cleartext_vec_multiplication(cudaStream_t stream, uint32_t gpu_index,
                                  T *output, T *lwe_input, T *cleartext_input,
                                  uint32_t input_lwe_dimension,
                                  uint32_t input_lwe_ciphertext_count) {
__host__ void host_cleartext_vec_multiplication(
    cudaStream_t stream, uint32_t gpu_index, T *output, T const *lwe_input,
    T const *cleartext_input, uint32_t input_lwe_dimension,
    uint32_t input_lwe_ciphertext_count) {

  cudaSetDevice(gpu_index);
  // lwe_size includes the presence of the body
@@ -53,7 +52,7 @@ host_cleartext_vec_multiplication(cudaStream_t stream, uint32_t gpu_index,

template <typename T>
__global__ void
cleartext_multiplication(T *output, T *lwe_input, T cleartext_input,
cleartext_multiplication(T *output, T const *lwe_input, T cleartext_input,
                         uint32_t input_lwe_dimension, uint32_t num_entries) {

  int tid = threadIdx.x;
@@ -67,7 +66,7 @@ cleartext_multiplication(T *output, T *lwe_input, T cleartext_input,
template <typename T>
__host__ void
host_cleartext_multiplication(cudaStream_t stream, uint32_t gpu_index,
                              T *output, T *lwe_input, T cleartext_input,
                              T *output, T const *lwe_input, T cleartext_input,
                              uint32_t input_lwe_dimension,
                              uint32_t input_lwe_ciphertext_count) {


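The multiplication kernel bodies are elided in these hunks. Conceptually, multiplying an LWE ciphertext by a cleartext scales every coefficient, mask and body alike, by that cleartext. A minimal sketch under that assumption (hypothetical name, one thread per coefficient, the owning ciphertext recovered from the input_lwe_dimension + 1 stride); this is not the library's exact kernel:

template <typename T>
__global__ void cleartext_vec_multiplication_sketch(
    T *output, T const *lwe_input, T const *cleartext_input,
    uint32_t input_lwe_dimension, uint32_t num_entries) {
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  if (index < num_entries) {
    // Which ciphertext this coefficient belongs to.
    int cleartext_index = index / (input_lwe_dimension + 1);
    output[index] = lwe_input[index] * cleartext_input[cleartext_index];
  }
}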
@@ -6,13 +6,13 @@
 */
void cuda_negate_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
                                          void *lwe_array_out,
                                          void *lwe_array_in,
                                          void const *lwe_array_in,
                                          uint32_t input_lwe_dimension,
                                          uint32_t input_lwe_ciphertext_count) {

  host_negation<uint32_t>(static_cast<cudaStream_t>(stream), gpu_index,
                          static_cast<uint32_t *>(lwe_array_out),
                          static_cast<uint32_t *>(lwe_array_in),
                          static_cast<const uint32_t *>(lwe_array_in),
                          input_lwe_dimension, input_lwe_ciphertext_count);
}

@@ -40,12 +40,12 @@ void cuda_negate_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
 */
void cuda_negate_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
                                          void *lwe_array_out,
                                          void *lwe_array_in,
                                          void const *lwe_array_in,
                                          uint32_t input_lwe_dimension,
                                          uint32_t input_lwe_ciphertext_count) {

  host_negation<uint64_t>(static_cast<cudaStream_t>(stream), gpu_index,
                          static_cast<uint64_t *>(lwe_array_out),
                          static_cast<uint64_t *>(lwe_array_in),
                          static_cast<const uint64_t *>(lwe_array_in),
                          input_lwe_dimension, input_lwe_ciphertext_count);
}

@@ -11,7 +11,7 @@
#include "linear_algebra.h"

template <typename T>
__global__ void negation(T *output, T *input, uint32_t num_entries) {
__global__ void negation(T *output, T const *input, uint32_t num_entries) {

  int tid = threadIdx.x;
  int index = blockIdx.x * blockDim.x + tid;
@@ -23,7 +23,7 @@ __global__ void negation(T *output, T *input, uint32_t num_entries) {

template <typename T>
__host__ void host_negation(cudaStream_t stream, uint32_t gpu_index, T *output,
                            T *input, uint32_t input_lwe_dimension,
                            T const *input, uint32_t input_lwe_dimension,
                            uint32_t input_lwe_ciphertext_count) {

  cudaSetDevice(gpu_index);

@@ -1,29 +1,29 @@
#include "bootstrapping_key.cuh"

void cuda_convert_lwe_programmable_bootstrap_key_32(
    void *stream, uint32_t gpu_index, void *dest, void *src,
    void *stream, uint32_t gpu_index, void *dest, void const *src,
    uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
    uint32_t polynomial_size) {
  uint32_t total_polynomials =
      input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * level_count;
  cuda_convert_lwe_programmable_bootstrap_key<uint32_t, int32_t>(
      static_cast<cudaStream_t>(stream), gpu_index, (double2 *)dest,
      (int32_t *)src, polynomial_size, total_polynomials);
      (const int32_t *)src, polynomial_size, total_polynomials);
}

void cuda_convert_lwe_programmable_bootstrap_key_64(
    void *stream, uint32_t gpu_index, void *dest, void *src,
    void *stream, uint32_t gpu_index, void *dest, void const *src,
    uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
    uint32_t polynomial_size) {
  uint32_t total_polynomials =
      input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * level_count;
  cuda_convert_lwe_programmable_bootstrap_key<uint64_t, int64_t>(
      static_cast<cudaStream_t>(stream), gpu_index, (double2 *)dest,
      (int64_t *)src, polynomial_size, total_polynomials);
      (const int64_t *)src, polynomial_size, total_polynomials);
}

void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
    void *stream, uint32_t gpu_index, void *dest, void *src,
    void *stream, uint32_t gpu_index, void *dest, void const *src,
    uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
    uint32_t polynomial_size, uint32_t grouping_factor) {
  uint32_t total_polynomials = input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) *
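These const qualifiers are what the binding generator keys on: bindgen derives Rust pointer mutability from the C prototype, turning void * into *mut c_void and void const * into *const c_void. A reconstructed prototype (the header itself is not shown in this diff) illustrates the effect:

// Assumed extern "C" declaration matching the definition above; with `src`
// const-qualified, the generated Rust binding takes `src: *const c_void`
// instead of `*mut c_void`, so read-only inputs need no mutable pointer.
extern "C" void cuda_convert_lwe_programmable_bootstrap_key_64(
    void *stream, uint32_t gpu_index, void *dest, void const *src,
    uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
    uint32_t polynomial_size);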
@@ -89,3 +89,175 @@ template __device__ const double2 *get_multi_bit_ith_lwe_gth_group_kth_block(
    const double2 *ptr, int g, int i, int k, int level,
    uint32_t grouping_factor, uint32_t polynomial_size, uint32_t glwe_dimension,
    uint32_t level_count);

void cuda_fourier_polynomial_mul(void *stream_v, uint32_t gpu_index,
                                 void const *_input1, void const *_input2,
                                 void *_output, uint32_t polynomial_size,
                                 uint32_t total_polynomials) {

  auto stream = static_cast<cudaStream_t>(stream_v);
  cudaSetDevice(gpu_index);
  auto input1 = (double2 *)_input1;
  auto input2 = (double2 *)_input2;
  auto output = (double2 *)_output;

  size_t shared_memory_size = sizeof(double2) * polynomial_size / 2;

  int gridSize = total_polynomials;
  int blockSize = polynomial_size / choose_opt_amortized(polynomial_size);

  double2 *buffer;
  switch (polynomial_size) {
  case 256:
    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
      check_cuda_error(cudaFuncSetAttribute(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>,
                               FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
      check_cuda_error(cudaFuncSetCacheConfig(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>,
                               FULLSM>,
          cudaFuncCachePreferShared));
      batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
                                                                output, buffer);
    } else {
      buffer = (double2 *)cuda_malloc_async(
          shared_memory_size * total_polynomials, stream, gpu_index);
      batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>, NOSM>
          <<<gridSize, blockSize, 0, stream>>>(input1, input2, output, buffer);
    }
    break;
  case 512:
    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
      check_cuda_error(cudaFuncSetAttribute(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>,
                               FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
      check_cuda_error(cudaFuncSetCacheConfig(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>,
                               FULLSM>,
          cudaFuncCachePreferShared));
      batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
                                                                output, buffer);
    } else {
      buffer = (double2 *)cuda_malloc_async(
          shared_memory_size * total_polynomials, stream, gpu_index);
      batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>, NOSM>
          <<<gridSize, blockSize, 0, stream>>>(input1, input2, output, buffer);
    }
    break;
  case 1024:
    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
      check_cuda_error(cudaFuncSetAttribute(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>,
                               FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
      check_cuda_error(cudaFuncSetCacheConfig(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>,
                               FULLSM>,
          cudaFuncCachePreferShared));
      batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
                                                                output, buffer);
    } else {
      buffer = (double2 *)cuda_malloc_async(
          shared_memory_size * total_polynomials, stream, gpu_index);
      batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, NOSM>
          <<<gridSize, blockSize, 0, stream>>>(input1, input2, output, buffer);
    }
    break;
  case 2048:
    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
      check_cuda_error(cudaFuncSetAttribute(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>,
                               FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
      check_cuda_error(cudaFuncSetCacheConfig(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>,
                               FULLSM>,
          cudaFuncCachePreferShared));
      batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
                                                                output, buffer);
    } else {
      buffer = (double2 *)cuda_malloc_async(
          shared_memory_size * total_polynomials, stream, gpu_index);
      batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, NOSM>
          <<<gridSize, blockSize, 0, stream>>>(input1, input2, output, buffer);
    }
    break;
  case 4096:
    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
      check_cuda_error(cudaFuncSetAttribute(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>,
                               FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
      check_cuda_error(cudaFuncSetCacheConfig(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>,
                               FULLSM>,
          cudaFuncCachePreferShared));
      batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
                                                                output, buffer);
    } else {
      buffer = (double2 *)cuda_malloc_async(
          shared_memory_size * total_polynomials, stream, gpu_index);
      batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, NOSM>
          <<<gridSize, blockSize, 0, stream>>>(input1, input2, output, buffer);
    }
    break;
  case 8192:
    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
      check_cuda_error(cudaFuncSetAttribute(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>,
                               FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
      check_cuda_error(cudaFuncSetCacheConfig(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>,
                               FULLSM>,
          cudaFuncCachePreferShared));
      batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
                                                                output, buffer);
    } else {
      buffer = (double2 *)cuda_malloc_async(
          shared_memory_size * total_polynomials, stream, gpu_index);
      batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, NOSM>
          <<<gridSize, blockSize, 0, stream>>>(input1, input2, output, buffer);
    }
    break;
  case 16384:
    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
      check_cuda_error(cudaFuncSetAttribute(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>,
                               FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
      check_cuda_error(cudaFuncSetCacheConfig(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>,
                               FULLSM>,
          cudaFuncCachePreferShared));
      batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>,
                           FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
                                                                output, buffer);
    } else {
      buffer = (double2 *)cuda_malloc_async(
          shared_memory_size * total_polynomials, stream, gpu_index);
      batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, NOSM>
          <<<gridSize, blockSize, 0, stream>>>(input1, input2, output, buffer);
    }
    break;
  default:
    break;
  }
  cuda_drop_async(buffer, stream, gpu_index);
}

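Every case of the switch above follows one shape: when a single polynomial's FFT scratch (sizeof(double2) * polynomial_size / 2 bytes) fits in dynamic shared memory, the FULLSM kernel is launched with that much shared memory and a zero-byte placeholder buffer; otherwise the NOSM kernel runs with a global-memory scratch buffer sized for all polynomials. A condensed sketch of the pattern with the size dispatch factored out (hypothetical helper, not library code):

template <class FFTParams>
void launch_batch_polynomial_mul_sketch(cudaStream_t stream, uint32_t gpu_index,
                                        double2 *input1, double2 *input2,
                                        double2 *output,
                                        size_t shared_memory_size, int gridSize,
                                        int blockSize,
                                        uint32_t total_polynomials) {
  double2 *buffer;
  if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
    // FULLSM: scratch lives in dynamic shared memory; buffer is unused.
    buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
    check_cuda_error(cudaFuncSetAttribute(
        batch_polynomial_mul<FFTParams, FULLSM>,
        cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
    batch_polynomial_mul<FFTParams, FULLSM>
        <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
                                                              output, buffer);
  } else {
    // NOSM: scratch lives in global memory, one slot per polynomial.
    buffer = (double2 *)cuda_malloc_async(
        shared_memory_size * total_polynomials, stream, gpu_index);
    batch_polynomial_mul<FFTParams, NOSM>
        <<<gridSize, blockSize, 0, stream>>>(input1, input2, output, buffer);
  }
  cuda_drop_async(buffer, stream, gpu_index);
}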
@@ -3,9 +3,9 @@

#include "device.h"
#include "fft/bnsmfft.cuh"
#include "pbs/programmable_bootstrap.h"
#include "pbs/programmable_bootstrap_multibit.h"
#include "polynomial/parameters.cuh"
#include "programmable_bootstrap.h"
#include "programmable_bootstrap_multibit.h"
#include <atomic>
#include <cstdint>

@@ -75,7 +75,7 @@ __device__ const T *get_multi_bit_ith_lwe_gth_group_kth_block(
template <typename T, typename ST>
void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
                                                 uint32_t gpu_index,
                                                 double2 *dest, ST *src,
                                                 double2 *dest, ST const *src,
                                                 uint32_t polynomial_size,
                                                 uint32_t total_polynomials) {
  cudaSetDevice(gpu_index);
@@ -249,175 +249,4 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
  cudaFreeHost(h_bsk);
}

void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
                                 void *_input1, void *_input2, void *_output,
                                 uint32_t polynomial_size,
                                 uint32_t total_polynomials) {

  cudaSetDevice(gpu_index);
  auto input1 = (double2 *)_input1;
  auto input2 = (double2 *)_input2;
  auto output = (double2 *)_output;

  size_t shared_memory_size = sizeof(double2) * polynomial_size / 2;

  int gridSize = total_polynomials;
  int blockSize = polynomial_size / choose_opt_amortized(polynomial_size);

  double2 *buffer;
  switch (polynomial_size) {
  case 256:
    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
      check_cuda_error(cudaFuncSetAttribute(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>,
                               FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
      check_cuda_error(cudaFuncSetCacheConfig(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>,
                               FULLSM>,
          cudaFuncCachePreferShared));
      batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
                                                                output, buffer);
    } else {
      buffer = (double2 *)cuda_malloc_async(
          shared_memory_size * total_polynomials, stream, gpu_index);
      batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>, NOSM>
          <<<gridSize, blockSize, 0, stream>>>(input1, input2, output, buffer);
    }
    break;
  case 512:
    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
      check_cuda_error(cudaFuncSetAttribute(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>,
                               FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
      check_cuda_error(cudaFuncSetCacheConfig(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>,
                               FULLSM>,
          cudaFuncCachePreferShared));
      batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
                                                                output, buffer);
    } else {
      buffer = (double2 *)cuda_malloc_async(
          shared_memory_size * total_polynomials, stream, gpu_index);
      batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>, NOSM>
          <<<gridSize, blockSize, 0, stream>>>(input1, input2, output, buffer);
    }
    break;
  case 1024:
    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
      check_cuda_error(cudaFuncSetAttribute(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>,
                               FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
      check_cuda_error(cudaFuncSetCacheConfig(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>,
                               FULLSM>,
          cudaFuncCachePreferShared));
      batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
                                                                output, buffer);
    } else {
      buffer = (double2 *)cuda_malloc_async(
          shared_memory_size * total_polynomials, stream, gpu_index);
      batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, NOSM>
          <<<gridSize, blockSize, 0, stream>>>(input1, input2, output, buffer);
    }
    break;
  case 2048:
    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
      check_cuda_error(cudaFuncSetAttribute(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>,
                               FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
      check_cuda_error(cudaFuncSetCacheConfig(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>,
                               FULLSM>,
          cudaFuncCachePreferShared));
      batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
                                                                output, buffer);
    } else {
      buffer = (double2 *)cuda_malloc_async(
          shared_memory_size * total_polynomials, stream, gpu_index);
      batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, NOSM>
          <<<gridSize, blockSize, 0, stream>>>(input1, input2, output, buffer);
    }
    break;
  case 4096:
    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
      check_cuda_error(cudaFuncSetAttribute(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>,
                               FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
      check_cuda_error(cudaFuncSetCacheConfig(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>,
                               FULLSM>,
          cudaFuncCachePreferShared));
      batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
                                                                output, buffer);
    } else {
      buffer = (double2 *)cuda_malloc_async(
          shared_memory_size * total_polynomials, stream, gpu_index);
      batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, NOSM>
          <<<gridSize, blockSize, 0, stream>>>(input1, input2, output, buffer);
    }
    break;
  case 8192:
    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
      check_cuda_error(cudaFuncSetAttribute(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>,
                               FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
      check_cuda_error(cudaFuncSetCacheConfig(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>,
                               FULLSM>,
          cudaFuncCachePreferShared));
      batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
                                                                output, buffer);
    } else {
      buffer = (double2 *)cuda_malloc_async(
          shared_memory_size * total_polynomials, stream, gpu_index);
      batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, NOSM>
          <<<gridSize, blockSize, 0, stream>>>(input1, input2, output, buffer);
    }
    break;
  case 16384:
    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
      check_cuda_error(cudaFuncSetAttribute(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>,
                               FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
      check_cuda_error(cudaFuncSetCacheConfig(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>,
                               FULLSM>,
          cudaFuncCachePreferShared));
      batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>,
                           FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
                                                                output, buffer);
    } else {
      buffer = (double2 *)cuda_malloc_async(
          shared_memory_size * total_polynomials, stream, gpu_index);
      batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, NOSM>
          <<<gridSize, blockSize, 0, stream>>>(input1, input2, output, buffer);
    }
    break;
  default:
    break;
  }
  cuda_drop_async(buffer, stream, gpu_index);
}

#endif // CNCRT_BSK_H

@@ -1,12 +1,12 @@
#ifndef CUDA_PROGRAMMABLE_BOOTSTRAP_CUH
#define CUDA_PROGRAMMABLE_BOOTSTRAP_CUH

#include "bootstrapping_key.cuh"
#include "cooperative_groups.h"
#include "device.h"
#include "fft/bnsmfft.cuh"
#include "helper_multi_gpu.h"
#include "programmable_bootstrap.h"
#include "programmable_bootstrap_multibit.h"
#include "pbs/programmable_bootstrap_multibit.h"

using namespace cooperative_groups;
namespace cg = cooperative_groups;
@@ -117,18 +117,22 @@ mul_ggsw_glwe(Torus *accumulator, double2 *fft, double2 *join_buffer,
}

template <typename Torus>
void execute_pbs_async(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    const LweArrayVariant<Torus> &lwe_array_out,
    const LweArrayVariant<Torus> &lwe_output_indexes,
    std::vector<Torus *> lut_vec, std::vector<Torus *> lut_indexes_vec,
    const LweArrayVariant<Torus> &lwe_array_in,
    const LweArrayVariant<Torus> &lwe_input_indexes, void **bootstrapping_keys,
    std::vector<int8_t *> pbs_buffer, uint32_t glwe_dimension,
    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, uint32_t grouping_factor,
    uint32_t input_lwe_ciphertext_count, PBS_TYPE pbs_type, uint32_t lut_count,
    uint32_t lut_stride) {
void execute_pbs_async(cudaStream_t const *streams, uint32_t const *gpu_indexes,
                       uint32_t gpu_count,
                       const LweArrayVariant<Torus> &lwe_array_out,
                       const LweArrayVariant<Torus> &lwe_output_indexes,
                       const std::vector<Torus *> lut_vec,
                       const std::vector<Torus *> lut_indexes_vec,
                       const LweArrayVariant<Torus> &lwe_array_in,
                       const LweArrayVariant<Torus> &lwe_input_indexes,
                       void *const *bootstrapping_keys,
                       std::vector<int8_t *> pbs_buffer,
                       uint32_t glwe_dimension, uint32_t lwe_dimension,
                       uint32_t polynomial_size, uint32_t base_log,
                       uint32_t level_count, uint32_t grouping_factor,
                       uint32_t input_lwe_ciphertext_count, PBS_TYPE pbs_type,
                       uint32_t lut_count, uint32_t lut_stride) {

  switch (sizeof(Torus)) {
  case sizeof(uint32_t):
    // 32 bits

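The new signature distinguishes two const placements worth keeping apart: cudaStream_t const * and uint32_t const * are pointers to read-only arrays, while void *const *bootstrapping_keys is a read-only array of still-mutable pointers. A small illustration with hypothetical names:

void const_placement_example(cudaStream_t const *streams, void *const *keys) {
  cudaStream_t s = streams[0]; // reading an element: fine
  // streams[0] = s;           // error: the pointed-to stream array is const
  // keys[0] = nullptr;        // error: the pointer array itself is const
  void *key_data = keys[0];    // the key data each entry points to stays mutable
  (void)s;
  (void)key_data;
}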
@@ -126,8 +126,9 @@ void scratch_cuda_programmable_bootstrap_amortized_64(
 */
void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
    void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
    void const *lwe_output_indexes, void const *lut_vector,
    void const *lut_vector_indexes, void const *lwe_array_in,
    void const *lwe_input_indexes, void const *bootstrapping_key,
    int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
    uint32_t num_samples) {
@@ -264,8 +265,9 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
 */
void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
    void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
    void const *lwe_output_indexes, void const *lut_vector,
    void const *lut_vector_indexes, void const *lwe_array_in,
    void const *lwe_input_indexes, void const *bootstrapping_key,
    int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
    uint32_t num_samples) {

@@ -6,15 +6,16 @@
#include <cuda_runtime.h>
#endif

#include "bootstrapping_key.cuh"
#include "crypto/gadget.cuh"
#include "crypto/torus.cuh"
#include "device.h"
#include "fft/bnsmfft.cuh"
#include "fft/twiddles.cuh"
#include "pbs/programmable_bootstrap.h"
#include "polynomial/functions.cuh"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial_math.cuh"
#include "programmable_bootstrap.h"
#include "types/complex/operations.cuh"

template <typename Torus, class params, sharedMemDegree SMD>

@@ -12,10 +12,11 @@
#include "device.h"
#include "fft/bnsmfft.cuh"
#include "fft/twiddles.cuh"
#include "pbs/pbs_utilities.h"
#include "pbs/programmable_bootstrap.h"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial_math.cuh"
#include "programmable_bootstrap.cuh"
#include "programmable_bootstrap.h"
#include "types/complex/operations.cuh"

using namespace cooperative_groups;
@@ -228,8 +229,9 @@ __host__ void scratch_programmable_bootstrap_cg(
template <typename Torus, class params>
__host__ void host_programmable_bootstrap_cg(
    cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
    Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
    Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
    Torus const *lwe_output_indexes, Torus const *lut_vector,
    Torus const *lut_vector_indexes, Torus const *lwe_array_in,
    Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
    pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t glwe_dimension,
    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, uint32_t input_lwe_ciphertext_count,

@@ -8,11 +8,12 @@
#include "device.h"
#include "fft/bnsmfft.cuh"
#include "fft/twiddles.cuh"
#include "pbs/pbs_multibit_utilities.h"
#include "pbs/programmable_bootstrap.h"
#include "polynomial/functions.cuh"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial_math.cuh"
#include "programmable_bootstrap.cuh"
#include "programmable_bootstrap.h"
#include "programmable_bootstrap_multibit.cuh"
#include "types/complex/operations.cuh"
#include <vector>
@@ -285,13 +286,14 @@ __host__ void scratch_cg_multi_bit_programmable_bootstrap(

template <typename Torus, class params>
__host__ void execute_cg_external_product_loop(
    cudaStream_t stream, uint32_t gpu_index, Torus *lut_vector,
    Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
    Torus *lwe_array_out, Torus *lwe_output_indexes,
    pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
    uint32_t lwe_offset, uint32_t lut_count, uint32_t lut_stride) {
    cudaStream_t stream, uint32_t gpu_index, Torus const *lut_vector,
    Torus const *lut_vector_indexes, Torus const *lwe_array_in,
    Torus const *lwe_input_indexes, Torus *lwe_array_out,
    Torus const *lwe_output_indexes, pbs_buffer<Torus, MULTI_BIT> *buffer,
    uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
    uint32_t level_count, uint32_t lwe_offset, uint32_t lut_count,
    uint32_t lut_stride) {

  auto lwe_chunk_size = buffer->lwe_chunk_size;
  uint64_t full_dm =
@@ -369,8 +371,9 @@ __host__ void execute_cg_external_product_loop(
template <typename Torus, class params>
__host__ void host_cg_multi_bit_programmable_bootstrap(
    cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
    Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
    Torus *lwe_array_in, Torus *lwe_input_indexes, uint64_t *bootstrapping_key,
    Torus const *lwe_output_indexes, Torus const *lut_vector,
    Torus const *lut_vector_indexes, Torus const *lwe_array_in,
    Torus const *lwe_input_indexes, uint64_t const *bootstrapping_key,
    pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples,

@@ -118,8 +118,9 @@ void scratch_cuda_programmable_bootstrap_tbc(
template <typename Torus>
void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
    void *stream, uint32_t gpu_index, Torus *lwe_array_out,
    Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
    Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
    Torus const *lwe_output_indexes, Torus const *lut_vector,
    Torus const *lut_vector_indexes, Torus const *lwe_array_in,
    Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
    pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
@@ -374,8 +375,9 @@ void scratch_cuda_programmable_bootstrap_64(
template <typename Torus>
void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
    void *stream, uint32_t gpu_index, Torus *lwe_array_out,
    Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
    Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
    Torus const *lwe_output_indexes, Torus const *lut_vector,
    Torus const *lut_vector_indexes, Torus const *lwe_array_in,
    Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
    pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
@@ -448,8 +450,9 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
template <typename Torus>
void cuda_programmable_bootstrap_lwe_ciphertext_vector(
    void *stream, uint32_t gpu_index, Torus *lwe_array_out,
    Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
    Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
    Torus const *lwe_output_indexes, Torus const *lut_vector,
    Torus const *lut_vector_indexes, Torus const *lwe_array_in,
    Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
    pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
@@ -523,8 +526,9 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
 */
void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
    void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
    void const *lwe_output_indexes, void const *lut_vector,
    void const *lut_vector_indexes, void const *lwe_array_in,
    void const *lwe_input_indexes, void const *bootstrapping_key,
    int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
    uint32_t num_samples, uint32_t lut_count, uint32_t lut_stride) {
@@ -540,12 +544,12 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
#if CUDA_ARCH >= 900
    cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint32_t>(
        stream, gpu_index, static_cast<uint32_t *>(lwe_array_out),
        static_cast<uint32_t *>(lwe_output_indexes),
        static_cast<uint32_t *>(lut_vector),
        static_cast<uint32_t *>(lut_vector_indexes),
        static_cast<uint32_t *>(lwe_array_in),
        static_cast<uint32_t *>(lwe_input_indexes),
        static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
        static_cast<const uint32_t *>(lwe_output_indexes),
        static_cast<const uint32_t *>(lut_vector),
        static_cast<const uint32_t *>(lut_vector_indexes),
        static_cast<const uint32_t *>(lwe_array_in),
        static_cast<const uint32_t *>(lwe_input_indexes),
        static_cast<const double2 *>(bootstrapping_key), buffer, lwe_dimension,
        glwe_dimension, polynomial_size, base_log, level_count, num_samples,
        lut_count, lut_stride);
    break;
@@ -555,24 +559,24 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
  case CG:
    cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint32_t>(
        stream, gpu_index, static_cast<uint32_t *>(lwe_array_out),
        static_cast<uint32_t *>(lwe_output_indexes),
        static_cast<uint32_t *>(lut_vector),
        static_cast<uint32_t *>(lut_vector_indexes),
        static_cast<uint32_t *>(lwe_array_in),
        static_cast<uint32_t *>(lwe_input_indexes),
        static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
        static_cast<const uint32_t *>(lwe_output_indexes),
        static_cast<const uint32_t *>(lut_vector),
        static_cast<const uint32_t *>(lut_vector_indexes),
        static_cast<const uint32_t *>(lwe_array_in),
        static_cast<const uint32_t *>(lwe_input_indexes),
        static_cast<const double2 *>(bootstrapping_key), buffer, lwe_dimension,
        glwe_dimension, polynomial_size, base_log, level_count, num_samples,
        lut_count, lut_stride);
    break;
  case DEFAULT:
    cuda_programmable_bootstrap_lwe_ciphertext_vector<uint32_t>(
        stream, gpu_index, static_cast<uint32_t *>(lwe_array_out),
        static_cast<uint32_t *>(lwe_output_indexes),
        static_cast<uint32_t *>(lut_vector),
        static_cast<uint32_t *>(lut_vector_indexes),
        static_cast<uint32_t *>(lwe_array_in),
        static_cast<uint32_t *>(lwe_input_indexes),
        static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
        static_cast<const uint32_t *>(lwe_output_indexes),
        static_cast<const uint32_t *>(lut_vector),
        static_cast<const uint32_t *>(lut_vector_indexes),
        static_cast<const uint32_t *>(lwe_array_in),
        static_cast<const uint32_t *>(lwe_input_indexes),
        static_cast<const double2 *>(bootstrapping_key), buffer, lwe_dimension,
        glwe_dimension, polynomial_size, base_log, level_count, num_samples,
        lut_count, lut_stride);
    break;
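The dispatch above prefers the most specialized implementation the build and hardware allow: TBC (presumably the thread-block-cluster path, given the CUDA_ARCH >= 900 guard), then CG (cooperative groups), then DEFAULT. A hedged sketch of that preference order (hypothetical helper; the real choice is made where the pbs_buffer is set up):

enum pbs_variant_sketch { TBC_SKETCH, CG_SKETCH, DEFAULT_SKETCH };

inline pbs_variant_sketch choose_pbs_variant_sketch(bool supports_tbc,
                                                    bool supports_cg) {
  if (supports_tbc) // e.g. sm_90+ builds behind CUDA_ARCH >= 900
    return TBC_SKETCH;
  if (supports_cg)  // cooperative-groups launch available
    return CG_SKETCH;
  return DEFAULT_SKETCH;
}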
@@ -644,8 +648,9 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
 */
void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
    void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
    void const *lwe_output_indexes, void const *lut_vector,
    void const *lut_vector_indexes, void const *lwe_array_in,
    void const *lwe_input_indexes, void const *bootstrapping_key,
    int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
    uint32_t num_samples, uint32_t lut_count, uint32_t lut_stride) {
@@ -660,12 +665,12 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
#if (CUDA_ARCH >= 900)
    cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint64_t>(
        stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
        static_cast<uint64_t *>(lwe_output_indexes),
        static_cast<uint64_t *>(lut_vector),
        static_cast<uint64_t *>(lut_vector_indexes),
        static_cast<uint64_t *>(lwe_array_in),
        static_cast<uint64_t *>(lwe_input_indexes),
        static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
        static_cast<const uint64_t *>(lwe_output_indexes),
        static_cast<const uint64_t *>(lut_vector),
        static_cast<const uint64_t *>(lut_vector_indexes),
        static_cast<const uint64_t *>(lwe_array_in),
        static_cast<const uint64_t *>(lwe_input_indexes),
        static_cast<const double2 *>(bootstrapping_key), buffer, lwe_dimension,
        glwe_dimension, polynomial_size, base_log, level_count, num_samples,
        lut_count, lut_stride);
    break;
@@ -675,24 +680,24 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
  case PBS_VARIANT::CG:
    cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint64_t>(
        stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
        static_cast<uint64_t *>(lwe_output_indexes),
        static_cast<uint64_t *>(lut_vector),
        static_cast<uint64_t *>(lut_vector_indexes),
        static_cast<uint64_t *>(lwe_array_in),
        static_cast<uint64_t *>(lwe_input_indexes),
        static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
        static_cast<const uint64_t *>(lwe_output_indexes),
        static_cast<const uint64_t *>(lut_vector),
        static_cast<const uint64_t *>(lut_vector_indexes),
        static_cast<const uint64_t *>(lwe_array_in),
        static_cast<const uint64_t *>(lwe_input_indexes),
        static_cast<const double2 *>(bootstrapping_key), buffer, lwe_dimension,
        glwe_dimension, polynomial_size, base_log, level_count, num_samples,
        lut_count, lut_stride);
    break;
  case PBS_VARIANT::DEFAULT:
    cuda_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
        stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
        static_cast<uint64_t *>(lwe_output_indexes),
        static_cast<uint64_t *>(lut_vector),
        static_cast<uint64_t *>(lut_vector_indexes),
        static_cast<uint64_t *>(lwe_array_in),
        static_cast<uint64_t *>(lwe_input_indexes),
        static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
        static_cast<const uint64_t *>(lwe_output_indexes),
        static_cast<const uint64_t *>(lut_vector),
        static_cast<const uint64_t *>(lut_vector_indexes),
        static_cast<const uint64_t *>(lwe_array_in),
        static_cast<const uint64_t *>(lwe_input_indexes),
        static_cast<const double2 *>(bootstrapping_key), buffer, lwe_dimension,
        glwe_dimension, polynomial_size, base_log, level_count, num_samples,
        lut_count, lut_stride);
    break;
@@ -717,9 +722,9 @@ template bool has_support_to_cuda_programmable_bootstrap_cg<uint64_t>(

template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint64_t>(
    void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
    uint64_t *lwe_output_indexes, uint64_t *lut_vector,
    uint64_t *lut_vector_indexes, uint64_t *lwe_array_in,
    uint64_t *lwe_input_indexes, double2 *bootstrapping_key,
    uint64_t const *lwe_output_indexes, uint64_t const *lut_vector,
    uint64_t const *lut_vector_indexes, uint64_t const *lwe_array_in,
    uint64_t const *lwe_input_indexes, double2 const *bootstrapping_key,
    pbs_buffer<uint64_t, CLASSICAL> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
@@ -727,9 +732,9 @@ template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint64_t>(

template void cuda_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
    void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
    uint64_t *lwe_output_indexes, uint64_t *lut_vector,
    uint64_t *lut_vector_indexes, uint64_t *lwe_array_in,
    uint64_t *lwe_input_indexes, double2 *bootstrapping_key,
    uint64_t const *lwe_output_indexes, uint64_t const *lut_vector,
    uint64_t const *lut_vector_indexes, uint64_t const *lwe_array_in,
    uint64_t const *lwe_input_indexes, double2 const *bootstrapping_key,
    pbs_buffer<uint64_t, CLASSICAL> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
@@ -748,9 +753,9 @@ template void scratch_cuda_programmable_bootstrap<uint64_t>(

template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint32_t>(
    void *stream, uint32_t gpu_index, uint32_t *lwe_array_out,
    uint32_t *lwe_output_indexes, uint32_t *lut_vector,
    uint32_t *lut_vector_indexes, uint32_t *lwe_array_in,
    uint32_t *lwe_input_indexes, double2 *bootstrapping_key,
    uint32_t const *lwe_output_indexes, uint32_t const *lut_vector,
    uint32_t const *lut_vector_indexes, uint32_t const *lwe_array_in,
    uint32_t const *lwe_input_indexes, double2 const *bootstrapping_key,
    pbs_buffer<uint32_t, CLASSICAL> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
@@ -758,9 +763,9 @@ template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint32_t>(

template void cuda_programmable_bootstrap_lwe_ciphertext_vector<uint32_t>(
    void *stream, uint32_t gpu_index, uint32_t *lwe_array_out,
    uint32_t *lwe_output_indexes, uint32_t *lut_vector,
    uint32_t *lut_vector_indexes, uint32_t *lwe_array_in,
    uint32_t *lwe_input_indexes, double2 *bootstrapping_key,
    uint32_t const *lwe_output_indexes, uint32_t const *lut_vector,
    uint32_t const *lut_vector_indexes, uint32_t const *lwe_array_in,
    uint32_t const *lwe_input_indexes, double2 const *bootstrapping_key,
    pbs_buffer<uint32_t, CLASSICAL> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
@@ -787,18 +792,18 @@ template bool has_support_to_cuda_programmable_bootstrap_tbc<uint64_t>(
#if CUDA_ARCH >= 900
template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint32_t>(
    void *stream, uint32_t gpu_index, uint32_t *lwe_array_out,
    uint32_t *lwe_output_indexes, uint32_t *lut_vector,
    uint32_t *lut_vector_indexes, uint32_t *lwe_array_in,
    uint32_t *lwe_input_indexes, double2 *bootstrapping_key,
    uint32_t const *lwe_output_indexes, uint32_t const *lut_vector,
    uint32_t const *lut_vector_indexes, uint32_t const *lwe_array_in,
    uint32_t const *lwe_input_indexes, double2 const *bootstrapping_key,
    pbs_buffer<uint32_t, CLASSICAL> *buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
    uint32_t lut_stride);
template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint64_t>(
    void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
    uint64_t *lwe_output_indexes, uint64_t *lut_vector,
    uint64_t *lut_vector_indexes, uint64_t *lwe_array_in,
    uint64_t *lwe_input_indexes, double2 *bootstrapping_key,
    uint64_t const *lwe_output_indexes, uint64_t const *lut_vector,
    uint64_t const *lut_vector_indexes, uint64_t const *lwe_array_in,
    uint64_t const *lwe_input_indexes, double2 const *bootstrapping_key,
    pbs_buffer<uint64_t, CLASSICAL> *buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, uint32_t num_samples, uint32_t lut_count,

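These template void ...<uint32_t>/<uint64_t> lines are explicit instantiation definitions: they force the compiler to emit code for the listed Torus types in this translation unit, so the non-template C entry points, and in turn the generated bindings, have concrete symbols to link against. A minimal self-contained illustration of the mechanism (hypothetical example, not library code):

#include <cstdint>

template <typename T> T twice(T x) { return x + x; }

// Emit twice<uint64_t> into this object file even though nothing here calls
// it; other translation units can then link against the symbol.
template uint64_t twice<uint64_t>(uint64_t);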
@@ -11,9 +11,10 @@
#include "device.h"
#include "fft/bnsmfft.cuh"
#include "fft/twiddles.cuh"
#include "pbs/pbs_utilities.h"
#include "pbs/programmable_bootstrap.h"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial_math.cuh"
#include "programmable_bootstrap.h"
#include "types/complex/operations.cuh"

template <typename Torus, class params, sharedMemDegree SMD>
@@ -363,16 +364,15 @@ __host__ void scratch_programmable_bootstrap(
}

template <typename Torus, class params>
__host__ void
execute_step_one(cudaStream_t stream, uint32_t gpu_index, Torus *lut_vector,
                 Torus *lut_vector_indexes, Torus *lwe_array_in,
                 Torus *lwe_input_indexes, double2 *bootstrapping_key,
                 Torus *global_accumulator, double2 *global_accumulator_fft,
                 uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
                 uint32_t glwe_dimension, uint32_t polynomial_size,
                 uint32_t base_log, uint32_t level_count, int8_t *d_mem,
                 int lwe_iteration, uint64_t partial_sm, uint64_t partial_dm,
                 uint64_t full_sm, uint64_t full_dm) {
__host__ void execute_step_one(
    cudaStream_t stream, uint32_t gpu_index, Torus const *lut_vector,
    Torus const *lut_vector_indexes, Torus const *lwe_array_in,
    Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
    Torus *global_accumulator, double2 *global_accumulator_fft,
    uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, int8_t *d_mem, int lwe_iteration, uint64_t partial_sm,
    uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm) {

  int max_shared_memory = cuda_get_max_shared_memory(0);
  cudaSetDevice(gpu_index);
@@ -407,13 +407,14 @@ execute_step_one(cudaStream_t stream, uint32_t gpu_index, Torus *lut_vector,
template <typename Torus, class params>
__host__ void execute_step_two(
    cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
    Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
    double2 *bootstrapping_key, Torus *global_accumulator,
    double2 *global_accumulator_fft, uint32_t input_lwe_ciphertext_count,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t base_log, uint32_t level_count, int8_t *d_mem, int lwe_iteration,
    uint64_t partial_sm, uint64_t partial_dm, uint64_t full_sm,
    uint64_t full_dm, uint32_t lut_count, uint32_t lut_stride) {
    Torus const *lwe_output_indexes, Torus const *lut_vector,
    Torus const *lut_vector_indexes, double2 const *bootstrapping_key,
    Torus *global_accumulator, double2 *global_accumulator_fft,
    uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, int8_t *d_mem, int lwe_iteration, uint64_t partial_sm,
    uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm, uint32_t lut_count,
    uint32_t lut_stride) {

  int max_shared_memory = cuda_get_max_shared_memory(0);
  cudaSetDevice(gpu_index);
@@ -450,8 +451,9 @@ __host__ void execute_step_two(
template <typename Torus, class params>
__host__ void host_programmable_bootstrap(
    cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
    Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
    Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
    Torus const *lwe_output_indexes, Torus const *lut_vector,
    Torus const *lut_vector_indexes, Torus const *lwe_array_in,
    Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
    pbs_buffer<Torus, CLASSICAL> *pbs_buffer, uint32_t glwe_dimension,
    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, uint32_t input_lwe_ciphertext_count,

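The partial_sm/full_sm and partial_dm/full_dm values threaded through execute_step_one and execute_step_two encode a three-tier memory strategy: run fully out of dynamic shared memory when full_sm fits, split between shared memory and the d_mem scratch area when only partial_sm fits, and fall back to device memory alone otherwise. A hedged sketch of the selection (hypothetical helper; the real code picks among kernel template variants instead of returning an enum):

enum class sm_tier_sketch { FULL, PARTIAL, NONE };

inline sm_tier_sketch pick_sm_tier_sketch(uint64_t full_sm,
                                          uint64_t partial_sm) {
  uint64_t max_shared_memory = cuda_get_max_shared_memory(0);
  if (max_shared_memory >= full_sm)
    return sm_tier_sketch::FULL;    // whole working set in shared memory
  if (max_shared_memory >= partial_sm)
    return sm_tier_sketch::PARTIAL; // shared memory plus d_mem scratch
  return sm_tier_sketch::NONE;      // everything in global memory (d_mem)
}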
@@ -1,7 +1,7 @@
#include "../polynomial/parameters.cuh"
#include "pbs/programmable_bootstrap_multibit.h"
#include "programmable_bootstrap_cg_multibit.cuh"
#include "programmable_bootstrap_multibit.cuh"
#include "programmable_bootstrap_multibit.h"

#if (CUDA_ARCH >= 900)
#include "programmable_bootstrap_tbc_multibit.cuh"
@@ -61,8 +61,9 @@ bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit(
template <typename Torus>
void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
@@ -138,8 +139,9 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
template <typename Torus>
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
@@ -214,8 +216,9 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(

void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
void const *lwe_output_indexes, void const *lut_vector,
void const *lut_vector_indexes, void const *lwe_array_in,
void const *lwe_input_indexes, void const *bootstrapping_key,
int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
@@ -229,12 +232,12 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
#if CUDA_ARCH >= 900
cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), buffer, lwe_dimension,
static_cast<const uint64_t *>(lwe_output_indexes),
static_cast<const uint64_t *>(lut_vector),
static_cast<const uint64_t *>(lut_vector_indexes),
static_cast<const uint64_t *>(lwe_array_in),
static_cast<const uint64_t *>(lwe_input_indexes),
static_cast<const uint64_t *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, lut_count, lut_stride);
break;
@@ -244,24 +247,24 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
case PBS_VARIANT::CG:
cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), buffer, lwe_dimension,
static_cast<const uint64_t *>(lwe_output_indexes),
static_cast<const uint64_t *>(lut_vector),
static_cast<const uint64_t *>(lut_vector_indexes),
static_cast<const uint64_t *>(lwe_array_in),
static_cast<const uint64_t *>(lwe_input_indexes),
static_cast<const uint64_t *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, lut_count, lut_stride);
break;
case PBS_VARIANT::DEFAULT:
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), buffer, lwe_dimension,
static_cast<const uint64_t *>(lwe_output_indexes),
static_cast<const uint64_t *>(lut_vector),
static_cast<const uint64_t *>(lut_vector_indexes),
static_cast<const uint64_t *>(lwe_array_in),
static_cast<const uint64_t *>(lwe_input_indexes),
static_cast<const uint64_t *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, lut_count, lut_stride);
break;
@@ -493,9 +496,9 @@ template void scratch_cuda_multi_bit_programmable_bootstrap<uint64_t>(
template void
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
uint64_t *lwe_output_indexes, uint64_t *lut_vector,
uint64_t *lut_vector_indexes, uint64_t *lwe_array_in,
uint64_t *lwe_input_indexes, uint64_t *bootstrapping_key,
uint64_t const *lwe_output_indexes, uint64_t const *lut_vector,
uint64_t const *lut_vector_indexes, uint64_t const *lwe_array_in,
uint64_t const *lwe_input_indexes, uint64_t const *bootstrapping_key,
pbs_buffer<uint64_t, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
@@ -510,9 +513,9 @@ template void scratch_cuda_cg_multi_bit_programmable_bootstrap<uint64_t>(
template void
cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
uint64_t *lwe_output_indexes, uint64_t *lut_vector,
uint64_t *lut_vector_indexes, uint64_t *lwe_array_in,
uint64_t *lwe_input_indexes, uint64_t *bootstrapping_key,
uint64_t const *lwe_output_indexes, uint64_t const *lut_vector,
uint64_t const *lut_vector_indexes, uint64_t const *lwe_array_in,
uint64_t const *lwe_input_indexes, uint64_t const *bootstrapping_key,
pbs_buffer<uint64_t, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
@@ -582,8 +585,9 @@ void scratch_cuda_tbc_multi_bit_programmable_bootstrap(
template <typename Torus>
void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
@@ -678,9 +682,9 @@ template void scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t>(
template void
cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
uint64_t *lwe_output_indexes, uint64_t *lut_vector,
uint64_t *lut_vector_indexes, uint64_t *lwe_array_in,
uint64_t *lwe_input_indexes, uint64_t *bootstrapping_key,
uint64_t const *lwe_output_indexes, uint64_t const *lut_vector,
uint64_t const *lut_vector_indexes, uint64_t const *lwe_array_in,
uint64_t const *lwe_input_indexes, uint64_t const *bootstrapping_key,
pbs_buffer<uint64_t, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
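Note how the `_64` entry point above now receives type-erased void const * arguments and casts them to const uint64_t * with static_cast before dispatching on PBS_VARIANT. For readers coming from the Rust side, the same "erased-to-typed" step looks roughly like the following self-contained sketch (names are illustrative, not crate APIs):

    use std::ffi::c_void;

    // Recover a typed read-only pointer from a type-erased FFI argument,
    // mirroring static_cast<const uint64_t *>(...) in the C++ above.
    fn as_u64_ptr(p: *const c_void) -> *const u64 {
        p.cast::<u64>()
    }

    fn main() {
        let x: u64 = 42;
        let erased: *const c_void = (&x as *const u64).cast();
        // Safe to dereference here because `erased` still points at `x`.
        assert_eq!(unsafe { *as_u64_ptr(erased) }, 42);
    }
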
@@ -8,12 +8,13 @@
#include "device.h"
#include "fft/bnsmfft.cuh"
#include "fft/twiddles.cuh"
#include "pbs/pbs_multibit_utilities.h"
#include "pbs/programmable_bootstrap.h"
#include "pbs/programmable_bootstrap_multibit.h"
#include "polynomial/functions.cuh"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial_math.cuh"
#include "programmable_bootstrap.h"
#include "programmable_bootstrap_cg_classic.cuh"
#include "programmable_bootstrap_multibit.h"
#include "types/complex/operations.cuh"
#include <vector>

@@ -489,8 +490,8 @@ __host__ void scratch_multi_bit_programmable_bootstrap(

template <typename Torus, class params>
__host__ void execute_compute_keybundle(
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_in,
Torus *lwe_input_indexes, Torus *bootstrapping_key,
cudaStream_t stream, uint32_t gpu_index, Torus const *lwe_array_in,
Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t grouping_factor, uint32_t level_count, uint32_t lwe_offset) {
@@ -537,12 +538,14 @@ __host__ void execute_compute_keybundle(
}

template <typename Torus, class params>
__host__ void execute_step_one(
cudaStream_t stream, uint32_t gpu_index, Torus *lut_vector,
Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t j, uint32_t lwe_offset) {
__host__ void
execute_step_one(cudaStream_t stream, uint32_t gpu_index,
Torus const *lut_vector, Torus const *lut_vector_indexes,
Torus const *lwe_array_in, Torus const *lwe_input_indexes,
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t j, uint32_t lwe_offset) {

uint64_t full_sm_accumulate_step_one =
get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one<Torus>(
@@ -593,7 +596,7 @@ __host__ void execute_step_one(
template <typename Torus, class params>
__host__ void execute_step_two(
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_output_indexes, pbs_buffer<Torus, MULTI_BIT> *buffer,
Torus const *lwe_output_indexes, pbs_buffer<Torus, MULTI_BIT> *buffer,
uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, int32_t grouping_factor, uint32_t level_count,
uint32_t j, uint32_t lwe_offset, uint32_t lut_count, uint32_t lut_stride) {
@@ -637,8 +640,9 @@ __host__ void execute_step_two(
template <typename Torus, class params>
__host__ void host_multi_bit_programmable_bootstrap(
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
@@ -12,10 +12,11 @@
#include "device.h"
#include "fft/bnsmfft.cuh"
#include "fft/twiddles.cuh"
#include "pbs/pbs_utilities.h"
#include "pbs/programmable_bootstrap.h"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial_math.cuh"
#include "programmable_bootstrap.cuh"
#include "programmable_bootstrap.h"
#include "types/complex/operations.cuh"

using namespace cooperative_groups;
@@ -253,8 +254,9 @@ __host__ void scratch_programmable_bootstrap_tbc(
template <typename Torus, class params>
__host__ void host_programmable_bootstrap_tbc(
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
@@ -8,12 +8,13 @@
#include "device.h"
#include "fft/bnsmfft.cuh"
#include "fft/twiddles.cuh"
#include "pbs/pbs_multibit_utilities.h"
#include "pbs/programmable_bootstrap.h"
#include "pbs/programmable_bootstrap_multibit.cuh"
#include "polynomial/functions.cuh"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial_math.cuh"
#include "programmable_bootstrap.cuh"
#include "programmable_bootstrap.h"
#include "programmable_bootstrap_multibit.cuh"
#include "types/complex/operations.cuh"
#include <vector>

@@ -290,13 +291,14 @@ __host__ void scratch_tbc_multi_bit_programmable_bootstrap(

template <typename Torus, class params>
__host__ void execute_tbc_external_product_loop(
cudaStream_t stream, uint32_t gpu_index, Torus *lut_vector,
Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
Torus *lwe_array_out, Torus *lwe_output_indexes,
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
uint32_t lwe_offset, uint32_t lut_count, uint32_t lut_stride) {
cudaStream_t stream, uint32_t gpu_index, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lwe_input_indexes, Torus *lwe_array_out,
Torus const *lwe_output_indexes, pbs_buffer<Torus, MULTI_BIT> *buffer,
uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
uint32_t level_count, uint32_t lwe_offset, uint32_t lut_count,
uint32_t lut_stride) {

auto lwe_chunk_size = buffer->lwe_chunk_size;
auto supports_dsm =
@@ -393,8 +395,9 @@ __host__ void execute_tbc_external_product_loop(
template <typename Torus, class params>
__host__ void host_tbc_multi_bit_programmable_bootstrap(
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, Torus *lwe_input_indexes, uint64_t *bootstrapping_key,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
@@ -188,7 +188,7 @@ __device__ void add_to_torus(double2 *m_values, Torus *result,

// Extracts the body of the nth-LWE in a GLWE.
template <typename Torus, class params>
__device__ void sample_extract_body(Torus *lwe_array_out, Torus *glwe,
__device__ void sample_extract_body(Torus *lwe_array_out, Torus const *glwe,
uint32_t glwe_dimension, uint32_t nth = 0) {
// Set first coefficient of the glwe as the body of the LWE sample
lwe_array_out[glwe_dimension * params::degree] =
@@ -197,7 +197,7 @@ __device__ void sample_extract_body(Torus *lwe_array_out, Torus *glwe,

// Extracts the mask from the nth-LWE in a GLWE.
template <typename Torus, class params>
__device__ void sample_extract_mask(Torus *lwe_array_out, Torus *glwe,
__device__ void sample_extract_mask(Torus *lwe_array_out, Torus const *glwe,
uint32_t glwe_dimension = 1,
uint32_t nth = 0) {
for (int z = 0; z < glwe_dimension; z++) {
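The two device helpers above implement GLWE sample extraction: the body of the nth LWE is the nth coefficient of the GLWE body polynomial, and the mask is a negacyclically rotated (sign-flipping) copy of each GLWE mask polynomial. As a reference point only, here is the textbook operation as a host-side Rust sketch with Torus = u64 and wrapping arithmetic; it is not a transcription of the CUDA kernels:

    // Textbook GLWE sample extraction of coefficient `nth` (sketch).
    // `glwe` holds k mask polynomials of degree n followed by the body.
    fn sample_extract(glwe: &[u64], k: usize, n: usize, nth: usize) -> Vec<u64> {
        assert_eq!(glwe.len(), (k + 1) * n);
        let mut lwe = vec![0u64; k * n + 1];
        for z in 0..k {
            let poly = &glwe[z * n..(z + 1) * n];
            for j in 0..n {
                // Negacyclic rotation: coefficients that wrap around
                // X^n + 1 pick up a sign flip (wrapping_neg on the torus).
                lwe[z * n + j] = if j <= nth {
                    poly[nth - j]
                } else {
                    poly[n + nth - j].wrapping_neg()
                };
            }
        }
        lwe[k * n] = glwe[k * n + nth]; // body coefficient
        lwe
    }
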
@@ -5,7 +5,8 @@

/// Initialize same-size arrays on all active gpus
template <typename Torus>
void multi_gpu_alloc_array_async(cudaStream_t *streams, uint32_t *gpu_indexes,
void multi_gpu_alloc_array_async(cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count, std::vector<Torus *> &dest,
uint32_t elements_per_gpu) {

@@ -18,9 +19,10 @@ void multi_gpu_alloc_array_async(cudaStream_t *streams, uint32_t *gpu_indexes,
}
/// Copy an array residing on one GPU to all active gpus
template <typename Torus>
void multi_gpu_copy_array_async(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, std::vector<Torus *> &dest,
Torus *src, uint32_t elements_per_gpu) {
void multi_gpu_copy_array_async(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
std::vector<Torus *> &dest, Torus const *src,
uint32_t elements_per_gpu) {
dest.resize(gpu_count);
for (uint i = 0; i < gpu_count; i++) {
cuda_memcpy_async_gpu_to_gpu(dest[i], src, elements_per_gpu * sizeof(Torus),
@@ -31,9 +33,10 @@ void multi_gpu_copy_array_async(cudaStream_t *streams, uint32_t *gpu_indexes,
/// Initializes also the related indexing and initializes it to the trivial
/// index
template <typename Torus>
void multi_gpu_alloc_lwe_async(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, std::vector<Torus *> &dest,
uint32_t num_inputs, uint32_t lwe_size) {
void multi_gpu_alloc_lwe_async(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
std::vector<Torus *> &dest, uint32_t num_inputs,
uint32_t lwe_size) {
dest.resize(gpu_count);
for (uint i = 0; i < gpu_count; i++) {
auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count);
@@ -48,9 +51,10 @@ void multi_gpu_alloc_lwe_async(cudaStream_t *streams, uint32_t *gpu_indexes,
/// The input indexing logic is given by an index array.
/// The output indexing is always the trivial one
template <typename Torus>
void multi_gpu_scatter_lwe_async(cudaStream_t *streams, uint32_t *gpu_indexes,
void multi_gpu_scatter_lwe_async(cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count, std::vector<Torus *> &dest,
Torus *src, Torus *h_src_indexes,
Torus const *src, Torus const *h_src_indexes,
bool is_trivial_index, uint32_t num_inputs,
uint32_t lwe_size) {

@@ -88,9 +92,9 @@ void multi_gpu_scatter_lwe_async(cudaStream_t *streams, uint32_t *gpu_indexes,
/// dest_indexes
/// The input indexing should be the trivial one
template <typename Torus>
void multi_gpu_gather_lwe_async(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, Torus *dest,
const std::vector<Torus *> &src,
void multi_gpu_gather_lwe_async(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
Torus *dest, const std::vector<Torus *> &src,
Torus *h_dest_indexes, bool is_trivial_index,
uint32_t num_inputs, uint32_t lwe_size) {

@@ -123,7 +127,8 @@ void multi_gpu_gather_lwe_async(cudaStream_t *streams, uint32_t *gpu_indexes,
}

template <typename Torus>
void multi_gpu_release_async(cudaStream_t *streams, uint32_t *gpu_indexes,
void multi_gpu_release_async(cudaStream_t const *streams,
uint32_t const *gpu_indexes,
std::vector<Torus *> &vec) {

for (uint i = 0; i < vec.size(); i++)
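On the Rust side, const-qualifying streams, gpu_indexes, and src means the generated signatures can accept pointers derived from shared slices, with no mutable borrow required. A small hypothetical sketch (ffi_call stands in for whichever generated binding consumes the index array):

    // `ffi_call` is a stand-in, not a real generated item.
    fn ffi_call(gpu_indexes: *const u32, gpu_count: u32) {
        let _ = (gpu_indexes, gpu_count); // would forward to extern "C"
    }

    fn main() {
        let gpu_indexes: Vec<u32> = vec![0, 1];
        // A shared slice coerces to *const u32; before this change the
        // *mut parameter would have demanded a mutable (exclusive) borrow.
        ffi_call(gpu_indexes.as_ptr(), gpu_indexes.len() as u32);
    }
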
@@ -1,3 +1,4 @@
#include "pbs/pbs_utilities.h"
#include <benchmark/benchmark.h>
#include <cstdint>
#include <setup_and_teardown.h>
@@ -1,9 +1,8 @@
#include "pbs/pbs_multibit_utilities.h"
#include "pbs/pbs_utilities.h"
#include <benchmark/benchmark.h>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <omp.h>
#include <setup_and_teardown.h>

typedef struct {
@@ -50,7 +49,6 @@ protected:
uint64_t *d_lut_pbs_indexes;
uint64_t *d_lwe_ct_in_array;
uint64_t *d_lwe_ct_out_array;
uint64_t *lwe_ct_out_array;
uint64_t *d_lwe_input_indexes;
uint64_t *d_lwe_output_indexes;
int8_t *buffer;
@@ -215,12 +213,15 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, CgMultiBit)
for (auto _ : st) {
// Execute PBS
cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
stream, gpu_index, d_lwe_ct_out_array, d_lwe_output_indexes,
d_lut_pbs_identity, d_lut_pbs_indexes, d_lwe_ct_in_array,
d_lwe_input_indexes, d_bsk, (pbs_buffer<uint64_t, MULTI_BIT> *)buffer,
lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
pbs_base_log, pbs_level, input_lwe_ciphertext_count, lut_count,
lut_stride);
stream, gpu_index, d_lwe_ct_out_array,
(const uint64_t *)d_lwe_output_indexes,
(const uint64_t *)d_lut_pbs_identity,
(const uint64_t *)d_lut_pbs_indexes,
(const uint64_t *)d_lwe_ct_in_array,
(const uint64_t *)d_lwe_input_indexes, (const uint64_t *)d_bsk,
(pbs_buffer<uint64_t, MULTI_BIT> *)buffer, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, pbs_base_log,
pbs_level, input_lwe_ciphertext_count, lut_count, lut_stride);
cuda_synchronize_stream(stream, gpu_index);
}
@@ -1,10 +1,10 @@
#ifndef SETUP_AND_TEARDOWN_H
#define SETUP_AND_TEARDOWN_H

#include "pbs/programmable_bootstrap.h"
#include "pbs/programmable_bootstrap_multibit.h"
#include <device.h>
#include <keyswitch.h>
#include <programmable_bootstrap.h>
#include <programmable_bootstrap_multibit.h>
#include <utils.h>

void programmable_bootstrap_classical_setup(
@@ -1,9 +1,10 @@
#include "pbs/pbs_utilities.h"
#include "pbs/programmable_bootstrap.h"
#include "utils.h"
#include "gtest/gtest.h"
#include <cstdint>
#include <device.h>
#include <functional>
#include <programmable_bootstrap.h>
#include <random>
#include <setup_and_teardown.h>
#include <stdio.h>
@@ -1,6 +1,6 @@
#include <algorithm>
#include <programmable_bootstrap.h>
#include <programmable_bootstrap_multibit.h>
#include "pbs/programmable_bootstrap.h"
#include "pbs/programmable_bootstrap_multibit.h"
#include <cmath>
#include <cstdint>
#include <cstdlib>
1684 backends/tfhe-cuda-backend/src/bindings.rs Normal file
File diff suppressed because it is too large
@@ -0,0 +1,11 @@
#![allow(warnings)]
pub type c_void = std::ffi::c_void;
pub type c_uint = std::ffi::c_uint;
pub type c_uchar = std::ffi::c_uchar;
pub type c_ushort = std::ffi::c_ushort;
pub type c_ulong = std::ffi::c_ulong;
pub type c_schar = std::ffi::c_schar;
pub type c_int = std::ffi::c_int;
pub type c_short = std::ffi::c_short;
pub type c_long = std::ffi::c_long;
pub type c_char = std::ffi::c_char;
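ffi.rs simply re-exports the std::ffi C scalar types under one module, so generated code can name them through a single stable path. The pattern in use, shown with the classic libc abs example (the extern declaration below is standard Rust FFI, not part of this crate):

    pub type c_int = std::ffi::c_int; // same shape as the aliases above

    extern "C" {
        fn abs(x: c_int) -> c_int; // resolved from the C runtime
    }

    fn main() {
        assert_eq!(unsafe { abs(-5) }, 5);
    }
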
@@ -1 +1,4 @@
#[allow(warnings)]
pub mod bindings;
pub mod cuda_bind;
pub mod ffi;
7 backends/tfhe-cuda-backend/wrapper.h Normal file
@@ -0,0 +1,7 @@
#include "cuda/include/ciphertext.h"
#include "cuda/include/integer/compression/compression.h"
#include "cuda/include/integer/integer.h"
#include "cuda/include/keyswitch.h"
#include "cuda/include/linear_algebra.h"
#include "cuda/include/pbs/programmable_bootstrap.h"
#include "cuda/include/pbs/programmable_bootstrap_multibit.h"
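wrapper.h collects the public CUDA headers into a single translation unit, which is the conventional input for a generator such as bindgen producing the committed src/bindings.rs. A hedged build.rs sketch of that flow; the bindgen options and output path here are assumptions for illustration, not the crate's actual build script:

    // build.rs sketch, assuming the `bindgen` crate is a build-dependency.
    fn main() {
        let bindings = bindgen::Builder::default()
            .header("wrapper.h")
            // Route C scalar types through the aliases in src/ffi.rs
            // (assumed; see the ffi module introduced above).
            .ctypes_prefix("crate::ffi")
            .generate()
            .expect("unable to generate CUDA bindings");
        bindings
            .write_to_file("src/bindings.rs")
            .expect("couldn't write src/bindings.rs");
    }
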
Some files were not shown because too many files have changed in this diff