Mirror of https://github.com/zama-ai/tfhe-rs.git

feat(gpu): Implement 128 bit classic pbs

Committed by: bbarbakadze
Parent: 8dadb626f2
Commit: 459969e9d2
@@ -9,20 +9,26 @@
|
||||
template <typename Torus>
|
||||
uint64_t get_buffer_size_full_sm_programmable_bootstrap_step_one(
|
||||
uint32_t polynomial_size) {
|
||||
return sizeof(Torus) * polynomial_size + // accumulator_rotated
|
||||
sizeof(double2) * polynomial_size / 2; // accumulator fft
|
||||
size_t double_count = (sizeof(Torus) == 16) ? 2 : 1;
|
||||
return sizeof(Torus) * polynomial_size + // accumulator_rotated
|
||||
sizeof(double) * 2 * double_count * polynomial_size /
|
||||
2; // accumulator fft
|
||||
}
|
||||
template <typename Torus>
|
||||
uint64_t get_buffer_size_full_sm_programmable_bootstrap_step_two(
|
||||
uint32_t polynomial_size) {
|
||||
return sizeof(Torus) * polynomial_size + // accumulator
|
||||
sizeof(double2) * polynomial_size / 2; // accumulator fft
|
||||
size_t double_count = (sizeof(Torus) == 16) ? 2 : 1;
|
||||
return sizeof(Torus) * polynomial_size + // accumulator
|
||||
sizeof(double) * 2 * double_count * polynomial_size /
|
||||
2; // accumulator fft
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
uint64_t
|
||||
get_buffer_size_partial_sm_programmable_bootstrap(uint32_t polynomial_size) {
|
||||
return sizeof(double2) * polynomial_size / 2; // accumulator fft
|
||||
size_t double_count = (sizeof(Torus) == 16) ? 2 : 1;
|
||||
return sizeof(double) * 2 * double_count * polynomial_size /
|
||||
2; // accumulator fft
|
||||
}
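These size helpers now double the FFT scratch when Torus is __uint128_t, since each complex coefficient is carried as four doubles (re_hi, re_lo, im_hi, im_lo) instead of a single double2. A minimal sketch of the resulting per-block numbers, assuming N = 2048 (the constants below are illustrative only, not taken from this diff):

// Illustrative only: step-one scratch per block for N = 2048.
constexpr uint32_t N = 2048;
constexpr uint64_t sm_u64 = sizeof(uint64_t) * N + sizeof(double) * 2 * 1 * N / 2;     // 32 KiB
constexpr uint64_t sm_u128 = sizeof(__uint128_t) * N + sizeof(double) * 2 * 2 * N / 2; // 64 KiB
static_assert(sm_u64 == 32 * 1024 && sm_u128 == 64 * 1024, "accumulator + fft scratch");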
|
||||
|
||||
template <typename Torus>
|
||||
@@ -215,6 +221,155 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
|
||||
}
|
||||
};
|
||||
|
||||
template <PBS_TYPE pbs_type> struct pbs_buffer_128;
|
||||
|
||||
template <> struct pbs_buffer_128<PBS_TYPE::CLASSICAL> {
|
||||
int8_t *d_mem;
|
||||
|
||||
__uint128_t *global_accumulator;
|
||||
double *global_join_buffer;
|
||||
|
||||
PBS_VARIANT pbs_variant;
|
||||
|
||||
pbs_buffer_128(cudaStream_t stream, uint32_t gpu_index,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
|
||||
PBS_VARIANT pbs_variant, bool allocate_gpu_memory) {
|
||||
cuda_set_device(gpu_index);
|
||||
this->pbs_variant = pbs_variant;
|
||||
|
||||
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
size_t global_join_buffer_size = (glwe_dimension + 1) * level_count *
|
||||
input_lwe_ciphertext_count *
|
||||
polynomial_size / 2 * sizeof(double) * 4;
|
||||
|
||||
if (allocate_gpu_memory) {
|
||||
switch (pbs_variant) {
|
||||
case PBS_VARIANT::DEFAULT: {
|
||||
uint64_t full_sm_step_one =
|
||||
get_buffer_size_full_sm_programmable_bootstrap_step_one<
|
||||
__uint128_t>(polynomial_size);
|
||||
uint64_t full_sm_step_two =
|
||||
get_buffer_size_full_sm_programmable_bootstrap_step_two<
|
||||
__uint128_t>(polynomial_size);
|
||||
uint64_t partial_sm =
|
||||
get_buffer_size_partial_sm_programmable_bootstrap<__uint128_t>(
|
||||
polynomial_size);
|
||||
|
||||
uint64_t partial_dm_step_one = full_sm_step_one - partial_sm;
|
||||
uint64_t partial_dm_step_two = full_sm_step_two - partial_sm;
|
||||
uint64_t full_dm = full_sm_step_one;
|
||||
|
||||
uint64_t device_mem = 0;
|
||||
if (max_shared_memory < partial_sm) {
|
||||
device_mem = full_dm * input_lwe_ciphertext_count * level_count *
|
||||
(glwe_dimension + 1);
|
||||
} else if (max_shared_memory < full_sm_step_two) {
|
||||
device_mem =
|
||||
(partial_dm_step_two + partial_dm_step_one * level_count) *
|
||||
input_lwe_ciphertext_count * (glwe_dimension + 1);
|
||||
} else if (max_shared_memory < full_sm_step_one) {
|
||||
device_mem = partial_dm_step_one * input_lwe_ciphertext_count *
|
||||
level_count * (glwe_dimension + 1);
|
||||
}
|
||||
// Otherwise, both kernels run all in shared memory
|
||||
d_mem = (int8_t *)cuda_malloc_async(device_mem, stream, gpu_index);
|
||||
|
||||
global_join_buffer = (double *)cuda_malloc_async(
|
||||
global_join_buffer_size, stream, gpu_index);
|
||||
|
||||
global_accumulator = (__uint128_t *)cuda_malloc_async(
|
||||
(glwe_dimension + 1) * input_lwe_ciphertext_count *
|
||||
polynomial_size * sizeof(__uint128_t),
|
||||
stream, gpu_index);
|
||||
} break;
|
||||
case PBS_VARIANT::CG: {
|
||||
uint64_t full_sm =
|
||||
get_buffer_size_full_sm_programmable_bootstrap_cg<__uint128_t>(
|
||||
polynomial_size);
|
||||
uint64_t partial_sm =
|
||||
get_buffer_size_partial_sm_programmable_bootstrap_cg<__uint128_t>(
|
||||
polynomial_size);
|
||||
|
||||
uint64_t partial_dm = full_sm - partial_sm;
|
||||
uint64_t full_dm = full_sm;
|
||||
uint64_t device_mem = 0;
|
||||
|
||||
if (max_shared_memory < partial_sm) {
|
||||
device_mem = full_dm * input_lwe_ciphertext_count * level_count *
|
||||
(glwe_dimension + 1);
|
||||
} else if (max_shared_memory < full_sm) {
|
||||
device_mem = partial_dm * input_lwe_ciphertext_count * level_count *
|
||||
(glwe_dimension + 1);
|
||||
}
|
||||
|
||||
// Otherwise, both kernels run all in shared memory
|
||||
d_mem = (int8_t *)cuda_malloc_async(device_mem, stream, gpu_index);
|
||||
|
||||
global_join_buffer = (double *)cuda_malloc_async(
|
||||
global_join_buffer_size, stream, gpu_index);
|
||||
} break;
|
||||
#if CUDA_ARCH >= 900
|
||||
case PBS_VARIANT::TBC: {
|
||||
|
||||
bool supports_dsm =
|
||||
supports_distributed_shared_memory_on_classic_programmable_bootstrap<
|
||||
__uint128_t>(polynomial_size, max_shared_memory);
|
||||
|
||||
uint64_t full_sm =
|
||||
get_buffer_size_full_sm_programmable_bootstrap_tbc<__uint128_t>(
|
||||
polynomial_size);
|
||||
uint64_t partial_sm =
|
||||
get_buffer_size_partial_sm_programmable_bootstrap_tbc<__uint128_t>(
|
||||
polynomial_size);
|
||||
uint64_t minimum_sm_tbc = 0;
|
||||
if (supports_dsm)
|
||||
minimum_sm_tbc =
|
||||
get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap<
|
||||
__uint128_t>(polynomial_size);
|
||||
|
||||
uint64_t partial_dm = full_sm - partial_sm;
|
||||
uint64_t full_dm = full_sm;
|
||||
uint64_t device_mem = 0;
|
||||
|
||||
// There is a minimum amount of memory we need to run the TBC PBS, which
|
||||
// is minimum_sm_tbc. We know that minimum_sm_tbc bytes are available
|
||||
// because otherwise the previous check would have redirected
|
||||
// computation to some other variant. If over that we don't have more
|
||||
// partial_sm bytes, TBC PBS will run on NOSM. If we have partial_sm but
|
||||
// not full_sm bytes, it will run on PARTIALSM. Otherwise, FULLSM.
|
||||
//
|
||||
// NOSM mode actually requires minimum_sm_tbc shared memory bytes.
|
||||
if (max_shared_memory < partial_sm + minimum_sm_tbc) {
|
||||
device_mem = full_dm * input_lwe_ciphertext_count * level_count *
|
||||
(glwe_dimension + 1);
|
||||
} else if (max_shared_memory < full_sm + minimum_sm_tbc) {
|
||||
device_mem = partial_dm * input_lwe_ciphertext_count * level_count *
|
||||
(glwe_dimension + 1);
|
||||
}
|
||||
|
||||
// Otherwise, both kernels run all in shared memory
|
||||
d_mem = (int8_t *)cuda_malloc_async(device_mem, stream, gpu_index);
|
||||
|
||||
global_join_buffer = (double *)cuda_malloc_async(
|
||||
global_join_buffer_size, stream, gpu_index);
|
||||
} break;
|
||||
#endif
|
||||
default:
|
||||
PANIC("Cuda error (PBS): unsupported implementation variant.")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void release(cudaStream_t stream, uint32_t gpu_index) {
|
||||
cuda_drop_async(d_mem, stream, gpu_index);
|
||||
cuda_drop_async(global_join_buffer, stream, gpu_index);
|
||||
|
||||
if (pbs_variant == DEFAULT)
|
||||
cuda_drop_async(global_accumulator, stream, gpu_index);
|
||||
}
|
||||
};
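For orientation, a hedged sizing example for the persistent buffers this struct allocates; the parameter values below (k = 1, l = 3, N = 2048, 4 input ciphertexts) are hypothetical, not defaults taken from this commit:

constexpr uint32_t glwe_dimension = 1, level_count = 3, N = 2048, num_ct = 4;
// global_join_buffer: one f128 Fourier polynomial (4 doubles per complex
// coefficient) per (glwe polynomial, level, ciphertext)
constexpr size_t join_buffer_bytes =
    (glwe_dimension + 1) * level_count * num_ct * (N / 2) * sizeof(double) * 4; // 768 KiB
// global_accumulator: one __uint128_t GLWE accumulator per ciphertext
constexpr size_t accumulator_bytes =
    (glwe_dimension + 1) * num_ct * N * sizeof(__uint128_t);                    // 256 KiB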
|
||||
|
||||
template <typename Torus>
|
||||
uint64_t get_buffer_size_programmable_bootstrap_cg(
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
|
||||
@@ -20,6 +20,11 @@ void cuda_convert_lwe_programmable_bootstrap_key_64(
|
||||
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
|
||||
uint32_t polynomial_size);
|
||||
|
||||
void cuda_convert_lwe_programmable_bootstrap_key_128(
|
||||
void *stream, uint32_t gpu_index, void *dest, void const *src,
|
||||
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
|
||||
uint32_t polynomial_size);
|
||||
|
||||
void scratch_cuda_programmable_bootstrap_amortized_32(
|
||||
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
@@ -62,6 +67,11 @@ void scratch_cuda_programmable_bootstrap_64(
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
|
||||
|
||||
void scratch_cuda_programmable_bootstrap_128(
|
||||
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
|
||||
|
||||
void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
|
||||
void *stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
void const *lwe_output_indexes, void const *lut_vector,
|
||||
@@ -80,7 +90,19 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
|
||||
uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride);
|
||||
|
||||
void cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
|
||||
void *stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
void const *lwe_output_indexes, void const *lut_vector,
|
||||
void const *lut_vector_indexes, void const *lwe_array_in,
|
||||
void const *lwe_input_indexes, void const *bootstrapping_key,
|
||||
int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
|
||||
uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride);
|
||||
|
||||
void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index,
|
||||
int8_t **pbs_buffer);
|
||||
|
||||
void cleanup_cuda_programmable_bootstrap_128(void *stream, uint32_t gpu_index,
|
||||
int8_t **pbs_buffer);
|
||||
}
|
||||
#endif // CUDA_BOOTSTRAP_H
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
|
||||
#include "crypto/torus.cuh"
|
||||
#include "device.h"
|
||||
#include "fft128/f128.cuh"
|
||||
#include <cstdint>
|
||||
|
||||
/**
|
||||
@@ -42,6 +43,13 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
__device__ void decompose_and_compress_next_128(double *result) {
|
||||
for (int j = 0; j < num_poly; j++) {
|
||||
auto result_slice = result + j * params::degree / 2 * 4;
|
||||
decompose_and_compress_next_polynomial_128(result_slice, j);
|
||||
}
|
||||
}
|
||||
|
||||
// Decomposes a single polynomial
|
||||
__device__ void decompose_and_compress_next_polynomial(double2 *result,
|
||||
int j) {
|
||||
@@ -75,10 +83,58 @@ public:
|
||||
synchronize_threads_in_block();
|
||||
}
|
||||
|
||||
// Decomposes a single polynomial
|
||||
__device__ void decompose_and_compress_next_polynomial_128(double *result,
|
||||
int j) {
|
||||
uint32_t tid = threadIdx.x;
|
||||
auto state_slice = &state[j * params::degree];
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
auto input1 = &state_slice[tid];
|
||||
auto input2 = &state_slice[tid + params::degree / 2];
|
||||
T res_re = *input1 & mask_mod_b;
|
||||
T res_im = *input2 & mask_mod_b;
|
||||
|
||||
*input1 >>= base_log; // Update state
|
||||
*input2 >>= base_log; // Update state
|
||||
|
||||
T carry_re = ((res_re - 1ll) | *input1) & res_re;
|
||||
T carry_im = ((res_im - 1ll) | *input2) & res_im;
|
||||
carry_re >>= (base_log - 1);
|
||||
carry_im >>= (base_log - 1);
|
||||
|
||||
*input1 += carry_re; // Update state
|
||||
*input2 += carry_im; // Update state
|
||||
|
||||
res_re -= carry_re << base_log;
|
||||
res_im -= carry_im << base_log;
|
||||
|
||||
auto out_re = u128_to_signed_to_f128(res_re);
|
||||
auto out_im = u128_to_signed_to_f128(res_im);
|
||||
|
||||
auto out_re_hi = result + 0 * params::degree / 2;
|
||||
auto out_re_lo = result + 1 * params::degree / 2;
|
||||
auto out_im_hi = result + 2 * params::degree / 2;
|
||||
auto out_im_lo = result + 3 * params::degree / 2;
|
||||
|
||||
out_re_hi[tid] = out_re.hi;
|
||||
out_re_lo[tid] = out_re.lo;
|
||||
out_im_hi[tid] = out_im.hi;
|
||||
out_im_lo[tid] = out_im.lo;
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
synchronize_threads_in_block();
|
||||
}
|
||||
|
||||
__device__ void decompose_and_compress_level(double2 *result, int level) {
|
||||
for (int i = 0; i < level_count - level; i++)
|
||||
decompose_and_compress_next(result);
|
||||
}
|
||||
|
||||
__device__ void decompose_and_compress_level_128(double *result, int level) {
|
||||
for (int i = 0; i < level_count - level; i++)
|
||||
decompose_and_compress_next_128(result);
|
||||
}
|
||||
};
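A minimal host-side sketch of the digit-extraction rule used by decompose_and_compress_next_polynomial_128 above, applied to a single 128-bit state value; the helper name is made up for illustration and is not part of this diff:

static inline __int128 next_signed_digit_128(__uint128_t &state, uint32_t base_log) {
  const __uint128_t mask_mod_b = ((__uint128_t)1 << base_log) - 1;
  __uint128_t res = state & mask_mod_b;
  state >>= base_log;
  // Same carry rule as above: re-center the digit from [0, B) into a signed
  // range around 0 and propagate the borrow into the remaining state.
  __uint128_t carry = ((res - 1) | state) & res;
  carry >>= (base_log - 1);
  state += carry;
  return (__int128)(res - (carry << base_log));
}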
|
||||
|
||||
template <typename Torus>
|
||||
|
||||
@@ -21,7 +21,7 @@ struct alignas(16) f128 {
|
||||
#else
|
||||
double s = a + b;
|
||||
return f128(s, b - (s - a));
|
||||
#endif;
|
||||
#endif
|
||||
}
|
||||
|
||||
// Two-sum
|
||||
@@ -270,7 +270,7 @@ __host__ __device__ inline double bits_to_double(uint64_t bits) {
|
||||
return d;
|
||||
}
|
||||
|
||||
__host__ __device__ double u128_to_f64(__uint128_t x) {
|
||||
__host__ __device__ inline double u128_to_f64(__uint128_t x) {
|
||||
const __uint128_t ONE = 1;
|
||||
const double A = ONE << 52;
|
||||
const double B = ONE << 104;
|
||||
@@ -322,7 +322,7 @@ __host__ __device__ double u128_to_f64(__uint128_t x) {
|
||||
}
|
||||
}
|
||||
|
||||
__host__ __device__ __uint128_t f64_to_u128(const double f) {
|
||||
__host__ __device__ inline __uint128_t f64_to_u128(const double f) {
|
||||
const __uint128_t ONE = 1;
|
||||
const uint64_t f_bits = double_to_bits(f);
|
||||
if (f_bits < 1023ull << 52) {
|
||||
@@ -338,7 +338,7 @@ __host__ __device__ __uint128_t f64_to_u128(const double f) {
|
||||
}
|
||||
}
|
||||
|
||||
__host__ __device__ __uint128_t f64_to_i128(const double f) {
|
||||
__host__ __device__ inline __uint128_t f64_to_i128(const double f) {
|
||||
// Get raw bits of the double
|
||||
const uint64_t f_bits = double_to_bits(f);
|
||||
|
||||
@@ -366,14 +366,14 @@ __host__ __device__ __uint128_t f64_to_i128(const double f) {
|
||||
return (f_bits >> 63) ? -result : result;
|
||||
}
|
||||
|
||||
__host__ __device__ double i128_to_f64(__int128_t const x) {
|
||||
__host__ __device__ inline double i128_to_f64(__int128_t const x) {
|
||||
uint64_t sign = static_cast<uint64_t>(x >> 64) & (1ULL << 63);
|
||||
__uint128_t abs =
|
||||
(x < 0) ? static_cast<__uint128_t>(-x) : static_cast<__uint128_t>(x);
|
||||
|
||||
return bits_to_double(double_to_bits(u128_to_f64(abs)) | sign);
|
||||
}
|
||||
__host__ __device__ f128 u128_to_signed_to_f128(__uint128_t x) {
|
||||
__host__ __device__ inline f128 u128_to_signed_to_f128(__uint128_t x) {
|
||||
const double first_approx = i128_to_f64(x);
|
||||
const uint64_t sign_bit = double_to_bits(first_approx) & (1ull << 63);
|
||||
const __uint128_t first_approx_roundtrip =
|
||||
@@ -387,7 +387,7 @@ __host__ __device__ f128 u128_to_signed_to_f128(__uint128_t x) {
|
||||
return f128(first_approx, correction);
|
||||
}
|
||||
|
||||
__host__ __device__ __uint128_t u128_from_torus_f128(const f128 &a) {
|
||||
__host__ __device__ inline __uint128_t u128_from_torus_f128(const f128 &a) {
|
||||
auto x = f128::sub_estimate(a, f128::f128_floor(a));
|
||||
const double normalization = 340282366920938500000000000000000000000.;
|
||||
#ifdef __CUDA_ARCH__
|
||||
|
||||
@@ -272,6 +272,9 @@ batch_convert_u128_to_f128_as_integer(double *out_re_hi, double *out_re_lo,
|
||||
}
|
||||
|
||||
// params is expected to be the full degree, not the half degree
// converts standard input into complex<128> represented by 4 doubles
// with the following pattern: [re_hi_0, re_hi_1, ... re_hi_n, re_lo_0, re_lo_1,
// ... re_lo_n, im_hi_0, im_hi_1, ..., im_hi_n, im_lo_0, im_lo_1, ..., im_lo_n]
|
||||
template <class params>
|
||||
__global__ void
|
||||
batch_convert_u128_to_f128_as_torus(double *out_re_hi, double *out_re_lo,
|
||||
@@ -287,6 +290,29 @@ batch_convert_u128_to_f128_as_torus(double *out_re_hi, double *out_re_lo,
|
||||
&in[blockIdx.x * params::degree + params::degree / 2]);
|
||||
}
|
||||
|
||||
// params is expected to be the full degree, not the half degree
// converts standard input into complex<128> represented by 4 doubles,
// laid out per polynomial with the following pattern: [re_hi_0, re_lo_0,
// im_hi_0, im_lo_0, re_hi_1, re_lo_1, im_hi_1, im_lo_1, ...,
// re_hi_n, re_lo_n, im_hi_n, im_lo_n], where each entry is a block of
// degree / 2 doubles
|
||||
template <class params>
|
||||
__global__ void
|
||||
batch_convert_u128_to_f128_strided_as_torus(double *d_out,
|
||||
const __uint128_t *d_in) {
|
||||
|
||||
constexpr size_t chunk_size = params::degree / 2 * 4;
|
||||
double *chunk = &d_out[blockIdx.x * chunk_size];
|
||||
double *out_re_hi = &chunk[0 * params::degree / 2];
|
||||
double *out_re_lo = &chunk[1 * params::degree / 2];
|
||||
double *out_im_hi = &chunk[2 * params::degree / 2];
|
||||
double *out_im_lo = &chunk[3 * params::degree / 2];
|
||||
|
||||
convert_u128_to_f128_as_torus<params>(
|
||||
out_re_hi, out_re_lo, out_im_hi, out_im_lo,
|
||||
&d_in[blockIdx.x * params::degree],
|
||||
&d_in[blockIdx.x * params::degree + params::degree / 2]);
|
||||
}
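The index arithmetic implied by the chunked layout above can be summarized with a small helper (illustrative only, not part of the diff): component c in {0: re_hi, 1: re_lo, 2: im_hi, 3: im_lo}, polynomial p, coefficient i in [0, degree / 2).

inline size_t strided_f128_index(size_t p, int c, size_t i, size_t half_degree) {
  return p * half_degree * 4 + c * half_degree + i;
}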
|
||||
|
||||
// params is expected to be the full degree, not the half degree
|
||||
template <class params>
|
||||
__global__ void batch_convert_f128_to_u128_as_torus(__uint128_t *out,
|
||||
@@ -309,7 +335,7 @@ __global__ void
|
||||
batch_NSMFFT_128(double *in_re_hi, double *in_re_lo, double *in_im_hi,
|
||||
double *in_im_lo, double *out_re_hi, double *out_re_lo,
|
||||
double *out_im_hi, double *out_im_lo, double *buffer) {
|
||||
extern __shared__ double sharedMemoryFFT[];
|
||||
extern __shared__ double sharedMemoryFFT128[];
|
||||
double *re_hi, *re_lo, *im_hi, *im_lo;
|
||||
|
||||
if (SMD == NOSM) {
|
||||
@@ -322,10 +348,10 @@ batch_NSMFFT_128(double *in_re_hi, double *in_re_lo, double *in_im_hi,
|
||||
im_lo =
|
||||
&buffer[blockIdx.x * params::degree / 2 * 4 + params::degree / 2 * 3];
|
||||
} else {
|
||||
re_hi = &sharedMemoryFFT[params::degree / 2 * 0];
|
||||
re_lo = &sharedMemoryFFT[params::degree / 2 * 1];
|
||||
im_hi = &sharedMemoryFFT[params::degree / 2 * 2];
|
||||
im_lo = &sharedMemoryFFT[params::degree / 2 * 3];
|
||||
re_hi = &sharedMemoryFFT128[params::degree / 2 * 0];
|
||||
re_lo = &sharedMemoryFFT128[params::degree / 2 * 1];
|
||||
im_hi = &sharedMemoryFFT128[params::degree / 2 * 2];
|
||||
im_lo = &sharedMemoryFFT128[params::degree / 2 * 3];
|
||||
}
|
||||
|
||||
Index tid = threadIdx.x;
|
||||
@@ -356,6 +382,70 @@ batch_NSMFFT_128(double *in_re_hi, double *in_re_lo, double *in_im_hi,
|
||||
}
|
||||
}
|
||||
|
||||
template <class params, sharedMemDegree SMD>
|
||||
__global__ void batch_NSMFFT_strided_128(double *d_in, double *d_out,
|
||||
double *buffer) {
|
||||
extern __shared__ double sharedMemoryFFT128[];
|
||||
double *re_hi, *re_lo, *im_hi, *im_lo;
|
||||
|
||||
if (SMD == NOSM) {
|
||||
re_hi =
|
||||
&buffer[blockIdx.x * params::degree / 2 * 4 + params::degree / 2 * 0];
|
||||
re_lo =
|
||||
&buffer[blockIdx.x * params::degree / 2 * 4 + params::degree / 2 * 1];
|
||||
im_hi =
|
||||
&buffer[blockIdx.x * params::degree / 2 * 4 + params::degree / 2 * 2];
|
||||
im_lo =
|
||||
&buffer[blockIdx.x * params::degree / 2 * 4 + params::degree / 2 * 3];
|
||||
} else {
|
||||
re_hi = &sharedMemoryFFT128[params::degree / 2 * 0];
|
||||
re_lo = &sharedMemoryFFT128[params::degree / 2 * 1];
|
||||
im_hi = &sharedMemoryFFT128[params::degree / 2 * 2];
|
||||
im_lo = &sharedMemoryFFT128[params::degree / 2 * 3];
|
||||
}
|
||||
|
||||
constexpr size_t chunk_size = params::degree / 2 * 4;
|
||||
double *chunk = &d_in[blockIdx.x * chunk_size];
|
||||
double *tmp_re_hi = &chunk[0 * params::degree / 2];
|
||||
double *tmp_re_lo = &chunk[1 * params::degree / 2];
|
||||
double *tmp_im_hi = &chunk[2 * params::degree / 2];
|
||||
double *tmp_im_lo = &chunk[3 * params::degree / 2];
|
||||
|
||||
Index tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (Index i = 0; i < params::opt / 2; ++i) {
|
||||
re_hi[tid] = tmp_re_hi[tid];
|
||||
re_lo[tid] = tmp_re_lo[tid];
|
||||
im_hi[tid] = tmp_im_hi[tid];
|
||||
im_lo[tid] = tmp_im_lo[tid];
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
if constexpr (params::fft_direction == 1) {
|
||||
negacyclic_backward_fft_f128<HalfDegree<params>>(re_hi, re_lo, im_hi,
|
||||
im_lo);
|
||||
} else {
|
||||
negacyclic_forward_fft_f128<HalfDegree<params>>(re_hi, re_lo, im_hi, im_lo);
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
chunk = &d_out[blockIdx.x * chunk_size];
|
||||
tmp_re_hi = &chunk[0 * params::degree / 2];
|
||||
tmp_re_lo = &chunk[1 * params::degree / 2];
|
||||
tmp_im_hi = &chunk[2 * params::degree / 2];
|
||||
tmp_im_lo = &chunk[3 * params::degree / 2];
|
||||
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (Index i = 0; i < params::opt / 2; ++i) {
|
||||
tmp_re_hi[tid] = re_hi[tid];
|
||||
tmp_re_lo[tid] = re_lo[tid];
|
||||
tmp_im_hi[tid] = im_hi[tid];
|
||||
tmp_im_lo[tid] = im_lo[tid];
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
}
|
||||
|
||||
template <class params>
|
||||
__host__ void host_fourier_transform_forward_as_integer_f128(
|
||||
cudaStream_t stream, uint32_t gpu_index, double *re0, double *re1,
|
||||
|
||||
@@ -22,6 +22,18 @@ void cuda_convert_lwe_programmable_bootstrap_key_64(
|
||||
(const int64_t *)src, polynomial_size, total_polynomials);
|
||||
}
|
||||
|
||||
void cuda_convert_lwe_programmable_bootstrap_key_128(
|
||||
void *stream, uint32_t gpu_index, void *dest, void const *src,
|
||||
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
|
||||
uint32_t polynomial_size) {
|
||||
|
||||
uint32_t total_polynomials =
|
||||
input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * level_count;
|
||||
cuda_convert_lwe_programmable_bootstrap_key_u128(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, (double *)dest,
|
||||
(const __uint128_t *)src, polynomial_size, total_polynomials);
|
||||
}
|
||||
|
||||
void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
|
||||
void *stream, uint32_t gpu_index, void *dest, void const *src,
|
||||
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
|
||||
|
||||
@@ -3,6 +3,8 @@
|
||||
|
||||
#include "device.h"
|
||||
#include "fft/bnsmfft.cuh"
|
||||
#include "fft128/fft128.cuh"
|
||||
|
||||
#include "pbs/programmable_bootstrap.h"
|
||||
#include "pbs/programmable_bootstrap_multibit.h"
|
||||
#include "polynomial/parameters.cuh"
|
||||
@@ -16,6 +18,13 @@ __device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
|
||||
level_count;
|
||||
}
|
||||
|
||||
__device__ inline int get_start_ith_ggsw_128(int i, uint32_t polynomial_size,
|
||||
int glwe_dimension,
|
||||
uint32_t level_count) {
|
||||
return i * polynomial_size / 2 * 4 * (glwe_dimension + 1) *
|
||||
(glwe_dimension + 1) * level_count;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////
|
||||
template <typename T>
|
||||
__device__ const T *get_ith_mask_kth_block(const T *ptr, int i, int k,
|
||||
@@ -39,6 +48,31 @@ __device__ T *get_ith_mask_kth_block(T *ptr, int i, int k, int level,
|
||||
(glwe_dimension + 1) * (glwe_dimension + 1) +
|
||||
k * polynomial_size / 2 * (glwe_dimension + 1)];
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ const T *
|
||||
get_ith_mask_kth_block_128(const T *ptr, int i, int k, int level,
|
||||
uint32_t polynomial_size, int glwe_dimension,
|
||||
uint32_t level_count) {
|
||||
return &ptr[get_start_ith_ggsw_128(i, polynomial_size, glwe_dimension,
|
||||
level_count) +
|
||||
(level_count - level - 1) * polynomial_size / 2 * 4 *
|
||||
(glwe_dimension + 1) * (glwe_dimension + 1) +
|
||||
k * polynomial_size / 2 * 4 * (glwe_dimension + 1)];
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ T *get_ith_mask_kth_block_128(T *ptr, int i, int k, int level,
|
||||
uint32_t polynomial_size,
|
||||
int glwe_dimension,
|
||||
uint32_t level_count) {
|
||||
return &ptr[get_start_ith_ggsw_128(i, polynomial_size, glwe_dimension,
|
||||
level_count) +
|
||||
(level_count - level - 1) * polynomial_size / 2 * 4 *
|
||||
(glwe_dimension + 1) * (glwe_dimension + 1) +
|
||||
k * polynomial_size / 2 * 4 * (glwe_dimension + 1)];
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ T *get_ith_body_kth_block(T *ptr, int i, int k, int level,
|
||||
uint32_t polynomial_size,
|
||||
@@ -251,4 +285,93 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
|
||||
cudaFreeHost(h_bsk);
|
||||
}
|
||||
|
||||
template <class params>
|
||||
void convert_u128_to_f128_and_forward_fft_128(cudaStream_t stream,
|
||||
uint32_t gpu_index, double *d_bsk,
|
||||
__uint128_t const *d_standard,
|
||||
uint32_t number_of_samples) {
|
||||
|
||||
cuda_set_device(gpu_index);
|
||||
size_t required_shared_memory_size = sizeof(double) * params::degree / 2 * 4;
|
||||
int grid_size = number_of_samples;
|
||||
int block_size = params::degree / params::opt;
|
||||
bool full_sm =
|
||||
(required_shared_memory_size <= cuda_get_max_shared_memory(gpu_index));
|
||||
size_t buffer_size =
|
||||
full_sm ? 0 : (size_t)number_of_samples * params::degree / 2 * 4;
|
||||
size_t shared_memory_size = full_sm ? required_shared_memory_size : 0;
|
||||
double *buffer = (double *)cuda_malloc_async(buffer_size, stream, gpu_index);
|
||||
|
||||
// configure shared memory for batch fft kernel
|
||||
if (full_sm) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
batch_NSMFFT_strided_128<FFTDegree<params, ForwardFFT>, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
|
||||
check_cuda_error(cudaFuncSetCacheConfig(
|
||||
batch_NSMFFT_strided_128<FFTDegree<params, ForwardFFT>, FULLSM>,
|
||||
cudaFuncCachePreferShared));
|
||||
}
|
||||
|
||||
// convert u128 into 4 x double
|
||||
batch_convert_u128_to_f128_strided_as_torus<params>
|
||||
<<<grid_size, block_size, 0, stream>>>(d_bsk, d_standard);
|
||||
|
||||
// call negacyclic 128 bit forward fft.
|
||||
if (full_sm) {
|
||||
batch_NSMFFT_strided_128<FFTDegree<params, ForwardFFT>, FULLSM>
|
||||
<<<grid_size, block_size, shared_memory_size, stream>>>(d_bsk, d_bsk,
|
||||
buffer);
|
||||
} else {
|
||||
batch_NSMFFT_strided_128<FFTDegree<params, ForwardFFT>, NOSM>
|
||||
<<<grid_size, block_size, shared_memory_size, stream>>>(d_bsk, d_bsk,
|
||||
buffer);
|
||||
}
|
||||
cuda_drop_async(buffer, stream, gpu_index);
|
||||
}
|
||||
|
||||
inline void cuda_convert_lwe_programmable_bootstrap_key_u128(
|
||||
cudaStream_t stream, uint32_t gpu_index, double *dest,
|
||||
__uint128_t const *src, uint32_t polynomial_size,
|
||||
uint32_t total_polynomials) {
|
||||
|
||||
// The buffer size is sizeof(double) times the number of polynomials times 4:
// each polynomial is represented by 4 double arrays of size
// polynomial_size / 2 in the complex domain to perform the FFT
|
||||
size_t buffer_size =
|
||||
total_polynomials * polynomial_size / 2 * sizeof(double) * 4;
|
||||
|
||||
__uint128_t *d_standard =
|
||||
(__uint128_t *)cuda_malloc_async(buffer_size, stream, gpu_index);
|
||||
|
||||
cuda_memcpy_async_to_gpu(d_standard, src, buffer_size, stream, gpu_index);
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
convert_u128_to_f128_and_forward_fft_128<AmortizedDegree<256>>(
|
||||
stream, gpu_index, dest, d_standard, total_polynomials);
|
||||
break;
|
||||
case 512:
|
||||
convert_u128_to_f128_and_forward_fft_128<AmortizedDegree<512>>(
|
||||
stream, gpu_index, dest, d_standard, total_polynomials);
|
||||
break;
|
||||
case 1024:
|
||||
convert_u128_to_f128_and_forward_fft_128<AmortizedDegree<1024>>(
|
||||
stream, gpu_index, dest, d_standard, total_polynomials);
|
||||
break;
|
||||
case 2048:
|
||||
convert_u128_to_f128_and_forward_fft_128<AmortizedDegree<2048>>(
|
||||
stream, gpu_index, dest, d_standard, total_polynomials);
|
||||
break;
|
||||
case 4096:
|
||||
convert_u128_to_f128_and_forward_fft_128<AmortizedDegree<4096>>(
|
||||
stream, gpu_index, dest, d_standard, total_polynomials);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (convert BSK): unsupported polynomial size. Supported "
|
||||
"N's are powers of two in the interval [256..4096].")
|
||||
}
|
||||
|
||||
cuda_drop_async(d_standard, stream, gpu_index);
|
||||
}
|
||||
|
||||
#endif // CNCRT_BSK_H
|
||||
|
||||
@@ -0,0 +1,205 @@
|
||||
#include "programmable_bootstrap_classic_128.cuh"
|
||||
|
||||
/*
|
||||
* This scratch function allocates the necessary amount of data on the GPU for
|
||||
* the PBS on 128 bits inputs, into `buffer`. It also configures SM options on
|
||||
* the GPU in case FULLSM or PARTIALSM mode is going to be used.
|
||||
*/
|
||||
void scratch_cuda_programmable_bootstrap_128(
|
||||
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
|
||||
|
||||
auto buffer = (pbs_buffer_128<CLASSICAL> **)pbs_buffer;
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
scratch_programmable_bootstrap_128<AmortizedDegree<256>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 512:
|
||||
scratch_programmable_bootstrap_128<AmortizedDegree<512>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 1024:
|
||||
scratch_programmable_bootstrap_128<AmortizedDegree<1024>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 2048:
|
||||
scratch_programmable_bootstrap_128<AmortizedDegree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 4096:
|
||||
scratch_programmable_bootstrap_128<AmortizedDegree<4096>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (classical PBS): unsupported polynomial size. "
|
||||
"Supported N's are powers of two"
|
||||
" in the interval [256..4096].")
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
void executor_cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
|
||||
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
|
||||
Torus const *lwe_output_indexes, Torus const *lut_vector,
|
||||
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
|
||||
Torus const *lwe_input_indexes, double const *bootstrapping_key,
|
||||
pbs_buffer_128<CLASSICAL> *buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
|
||||
uint32_t lut_stride) {
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
host_programmable_bootstrap_128<AmortizedDegree<256>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
num_many_lut, lut_stride);
|
||||
break;
|
||||
case 512:
|
||||
host_programmable_bootstrap_128<AmortizedDegree<512>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
num_many_lut, lut_stride);
|
||||
break;
|
||||
case 1024:
|
||||
host_programmable_bootstrap_128<AmortizedDegree<1024>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
num_many_lut, lut_stride);
|
||||
break;
|
||||
case 2048:
|
||||
host_programmable_bootstrap_128<AmortizedDegree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
num_many_lut, lut_stride);
|
||||
break;
|
||||
case 4096:
|
||||
host_programmable_bootstrap_128<AmortizedDegree<4096>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
num_many_lut, lut_stride);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (classical PBS): unsupported polynomial size. "
|
||||
"Supported N's are powers of two"
|
||||
" in the interval [256..4096].")
|
||||
}
|
||||
}
|
||||
|
||||
/* Perform bootstrapping on a batch of input u128 LWE ciphertexts.
|
||||
*
|
||||
* - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
|
||||
* launch
|
||||
* - `gpu_index` is the index of the GPU to be used in the kernel launch
|
||||
* - lwe_array_out: output batch of num_samples bootstrapped ciphertexts c =
|
||||
* (a0,..an-1,b) where n is the LWE dimension
|
||||
* - lut_vector: should hold as many luts of size polynomial_size
|
||||
* as there are input ciphertexts, but actually holds
|
||||
* num_luts vectors to reduce memory usage
|
||||
* - lut_vector_indexes: stores the index corresponding to
|
||||
* which lut to use for each sample in
|
||||
* lut_vector
|
||||
* - lwe_array_in: input batch of num_samples LWE ciphertexts, containing n
|
||||
* mask values + 1 body value
|
||||
* - bootstrapping_key: GGSW encryption of the LWE secret key sk1
|
||||
* under secret key sk2
|
||||
* bsk = Z + sk1 H
|
||||
* where H is the gadget matrix and Z is a matrix (k+1).l
|
||||
* containing GLWE encryptions of 0 under sk2.
|
||||
* bsk is thus a tensor of size (k+1)^2.l.N.n
|
||||
* where l is the number of decomposition levels and
|
||||
* k is the GLWE dimension, N is the polynomial size for
|
||||
* GLWE. The polynomial size for GLWE and the lut
|
||||
* are the same because they have to be in the same ring
|
||||
* to be multiplied.
|
||||
* - lwe_dimension: size of the Torus vector used to encrypt the input
|
||||
* LWE ciphertexts - referred to as n above (~ 600)
|
||||
* - glwe_dimension: size of the polynomial vector used to encrypt the LUT
|
||||
* GLWE ciphertexts - referred to as k above. Only the value 1 is supported for
|
||||
* this parameter.
|
||||
* - polynomial_size: size of the test polynomial (lut) and size of the
|
||||
* GLWE polynomial (~1024)
|
||||
* - base_log: log base used for the gadget matrix - B = 2^base_log (~8)
|
||||
* - level_count: number of decomposition levels in the gadget matrix (~4)
|
||||
* - num_samples: number of encrypted input messages
|
||||
*
|
||||
* This function calls a wrapper to a device kernel that performs the
|
||||
* bootstrapping:
|
||||
* - the kernel is templatized based on integer discretization and
|
||||
* polynomial degree
|
||||
* - num_samples * level_count * (glwe_dimension + 1) blocks of threads are
|
||||
* launched, where each thread is going to handle one or more polynomial
|
||||
* coefficients at each stage, for a given level of decomposition, either for
|
||||
* the LUT mask or its body:
|
||||
* - perform the blind rotation
|
||||
* - round the result
|
||||
* - get the decomposition for the current level
|
||||
* - switch to the FFT domain
|
||||
* - multiply with the bootstrapping key
|
||||
* - come back to the coefficients representation
|
||||
* - between each stage a synchronization of the threads is necessary (some
|
||||
* synchronizations happen at the block level, some happen between blocks, using
|
||||
* cooperative groups).
|
||||
* - in case the device has enough shared memory, temporary arrays used for
|
||||
* the different stages (accumulators) are stored into the shared memory
|
||||
* - the accumulators serve to combine the results for all decomposition
|
||||
* levels
|
||||
* - the constant memory (64K) is used for storing the roots of identity
|
||||
* values for the FFT
|
||||
*/
|
||||
void cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
|
||||
void *stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
void const *lwe_output_indexes, void const *lut_vector,
|
||||
void const *lut_vector_indexes, void const *lwe_array_in,
|
||||
void const *lwe_input_indexes, void const *bootstrapping_key,
|
||||
int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
|
||||
uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride) {
|
||||
if (base_log > 64)
|
||||
PANIC("Cuda error (classical PBS): base log should be <= 64")
|
||||
|
||||
pbs_buffer_128<CLASSICAL> *buffer = (pbs_buffer_128<CLASSICAL> *)mem_ptr;
|
||||
|
||||
executor_cuda_programmable_bootstrap_lwe_ciphertext_vector_128<__uint128_t>(
|
||||
stream, gpu_index, static_cast<__uint128_t *>(lwe_array_out),
|
||||
static_cast<const __uint128_t *>(lwe_output_indexes),
|
||||
static_cast<const __uint128_t *>(lut_vector),
|
||||
static_cast<const __uint128_t *>(lut_vector_indexes),
|
||||
static_cast<const __uint128_t *>(lwe_array_in),
|
||||
static_cast<const __uint128_t *>(lwe_input_indexes),
|
||||
static_cast<const double *>(bootstrapping_key), buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
num_many_lut, lut_stride);
|
||||
}
|
||||
|
||||
/*
|
||||
* This cleanup function frees the data on GPU for the PBS buffer for 128-bit
* inputs.
|
||||
*/
|
||||
void cleanup_cuda_programmable_bootstrap_128(void *stream, uint32_t gpu_index,
|
||||
int8_t **buffer) {
|
||||
auto x = (pbs_buffer_128<CLASSICAL> *)(*buffer);
|
||||
x->release(static_cast<cudaStream_t>(stream), gpu_index);
|
||||
}
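Putting the three entry points of this file together, a hedged end-to-end call sequence looks like the sketch below. All device pointers (d_lwe_out, d_out_indexes, d_lut, d_lut_indexes, d_lwe_in, d_in_indexes, d_fourier_bsk) and the scalar parameters are placeholders the caller must prepare; this is an illustration of the call order, not code from the commit:

int8_t *pbs_buffer = nullptr;
scratch_cuda_programmable_bootstrap_128(stream, gpu_index, &pbs_buffer,
                                        glwe_dimension, polynomial_size,
                                        level_count, num_samples,
                                        /*allocate_gpu_memory=*/true);
cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
    stream, gpu_index, d_lwe_out, d_out_indexes, d_lut, d_lut_indexes,
    d_lwe_in, d_in_indexes, d_fourier_bsk, pbs_buffer, lwe_dimension,
    glwe_dimension, polynomial_size, base_log, level_count, num_samples,
    /*num_many_lut=*/1, /*lut_stride=*/0);
cleanup_cuda_programmable_bootstrap_128(stream, gpu_index, &pbs_buffer);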
|
||||
@@ -0,0 +1,488 @@
|
||||
#ifndef CUDA_PBS_CUH_128
|
||||
#define CUDA_PBS_CUH_128
|
||||
|
||||
#ifdef __CDT_PARSER__
|
||||
#undef __CUDA_RUNTIME_H__
|
||||
#include <cuda_runtime.h>
|
||||
#endif
|
||||
|
||||
#include "crypto/gadget.cuh"
|
||||
#include "crypto/torus.cuh"
|
||||
#include "device.h"
|
||||
#include "fft128/fft128.cuh"
|
||||
#include "pbs/bootstrapping_key.cuh"
|
||||
#include "pbs/pbs_utilities.h"
|
||||
#include "pbs/programmable_bootstrap.h"
|
||||
#include "polynomial/parameters.cuh"
|
||||
#include "polynomial/polynomial_math.cuh"
|
||||
#include "types/complex/operations.cuh"
|
||||
|
||||
template <typename Torus, class params, sharedMemDegree SMD>
|
||||
__global__ void __launch_bounds__(params::degree / params::opt)
|
||||
device_programmable_bootstrap_step_one_128(
|
||||
const Torus *__restrict__ lut_vector,
|
||||
const Torus *__restrict__ lut_vector_indexes,
|
||||
const Torus *__restrict__ lwe_array_in,
|
||||
const Torus *__restrict__ lwe_input_indexes,
|
||||
const double *__restrict__ bootstrapping_key, Torus *global_accumulator,
|
||||
double *global_join_buffer, uint32_t lwe_iteration,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, int8_t *device_mem,
|
||||
uint64_t device_memory_size_per_block) {
|
||||
|
||||
// We use shared memory for the polynomials that are used often during the
|
||||
// bootstrap, since shared memory is kept in L1 cache and accessing it is
|
||||
// much faster than global memory
|
||||
extern __shared__ int8_t sharedmem[];
|
||||
int8_t *selected_memory;
|
||||
uint32_t glwe_dimension = gridDim.y - 1;
|
||||
|
||||
if constexpr (SMD == FULLSM) {
|
||||
selected_memory = sharedmem;
|
||||
} else {
|
||||
int block_index = blockIdx.z + blockIdx.y * gridDim.z +
|
||||
blockIdx.x * gridDim.z * gridDim.y;
|
||||
selected_memory = &device_mem[block_index * device_memory_size_per_block];
|
||||
}
|
||||
|
||||
Torus *accumulator = (Torus *)selected_memory;
|
||||
double *accumulator_fft =
|
||||
(double *)accumulator +
|
||||
(ptrdiff_t)(sizeof(Torus) * polynomial_size / sizeof(double));
|
||||
|
||||
if constexpr (SMD == PARTIALSM)
|
||||
accumulator_fft = (double *)sharedmem;
|
||||
|
||||
// The third dimension of the block is used to determine on which ciphertext
|
||||
// this block is operating, in the case of batch bootstraps
|
||||
const Torus *block_lwe_array_in =
|
||||
&lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
|
||||
|
||||
const Torus *block_lut_vector =
|
||||
&lut_vector[lut_vector_indexes[blockIdx.x] * params::degree *
|
||||
(glwe_dimension + 1)];
|
||||
|
||||
Torus *global_slice =
|
||||
global_accumulator +
|
||||
(blockIdx.y + blockIdx.x * (glwe_dimension + 1)) * params::degree;
|
||||
|
||||
double *global_fft_slice =
|
||||
global_join_buffer + (blockIdx.y + blockIdx.z * (glwe_dimension + 1) +
|
||||
blockIdx.x * level_count * (glwe_dimension + 1)) *
|
||||
(polynomial_size / 2) * 4;
|
||||
|
||||
if (lwe_iteration == 0) {
|
||||
// First iteration
|
||||
// Put "b" in [0, 2N[
|
||||
Torus b_hat = 0;
|
||||
modulus_switch(block_lwe_array_in[lwe_dimension], b_hat,
|
||||
params::log2_degree + 1);
|
||||
// The y-dimension is used to select the element of the GLWE this block will
|
||||
// compute
|
||||
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
|
||||
false);
|
||||
|
||||
// Persist
|
||||
int tid = threadIdx.x;
|
||||
for (int i = 0; i < params::opt; i++) {
|
||||
global_slice[tid] = accumulator[tid];
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
}
|
||||
|
||||
// Put "a" in [0, 2N[
|
||||
Torus a_hat = 0;
|
||||
modulus_switch(block_lwe_array_in[lwe_iteration], a_hat,
|
||||
params::log2_degree + 1); // 2 * params::log2_degree + 1);
|
||||
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// Perform ACC * (X^a_hat - 1)
|
||||
multiply_by_monomial_negacyclic_and_sub_polynomial<
|
||||
Torus, params::opt, params::degree / params::opt>(global_slice,
|
||||
accumulator, a_hat);
|
||||
|
||||
// Perform a rounding to increase the accuracy of the
|
||||
// bootstrapped ciphertext
|
||||
init_decomposer_state_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
accumulator, base_log, level_count);
|
||||
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// Decompose the accumulator. Each block gets one level of the
|
||||
// decomposition, for the mask and the body (so block 0 will have the
|
||||
// accumulator decomposed at level 0, 1 at 1, etc.)
|
||||
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator);
|
||||
gadget_acc.decompose_and_compress_level_128(accumulator_fft, blockIdx.z);
|
||||
|
||||
// We are using the same memory space for accumulator_fft and
|
||||
// accumulator_rotated, so we need to synchronize here to make sure they
|
||||
// don't modify the same memory space at the same time
|
||||
// Switch to the FFT space
|
||||
auto acc_fft_re_hi = accumulator_fft + 0 * params::degree / 2;
|
||||
auto acc_fft_re_lo = accumulator_fft + 1 * params::degree / 2;
|
||||
auto acc_fft_im_hi = accumulator_fft + 2 * params::degree / 2;
|
||||
auto acc_fft_im_lo = accumulator_fft + 3 * params::degree / 2;
|
||||
|
||||
auto global_fft_re_hi = global_fft_slice + 0 * params::degree / 2;
|
||||
auto global_fft_re_lo = global_fft_slice + 1 * params::degree / 2;
|
||||
auto global_fft_im_hi = global_fft_slice + 2 * params::degree / 2;
|
||||
auto global_fft_im_lo = global_fft_slice + 3 * params::degree / 2;
|
||||
|
||||
negacyclic_forward_fft_f128<HalfDegree<params>>(acc_fft_re_hi, acc_fft_re_lo,
|
||||
acc_fft_im_hi, acc_fft_im_lo);
|
||||
|
||||
int tid = threadIdx.x;
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
global_fft_re_hi[tid] = acc_fft_re_hi[tid];
|
||||
global_fft_re_lo[tid] = acc_fft_re_lo[tid];
|
||||
global_fft_im_hi[tid] = acc_fft_im_hi[tid];
|
||||
global_fft_im_lo[tid] = acc_fft_im_lo[tid];
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
}
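The modulus_switch calls above reduce 128-bit torus coefficients into [0, 2N) before the blind rotation. A hedged standalone illustration of that reduction (this helper is not the library's modulus_switch, just the usual shift-with-rounding it performs, assuming round-half-up):

__host__ __device__ inline uint32_t
modulus_switch_128_sketch(__uint128_t v, uint32_t log2_degree) {
  const uint32_t out_bits = log2_degree + 1;           // target modulus is 2N
  const uint32_t shift = 128 - out_bits;
  const __uint128_t rounding = (__uint128_t)1 << (shift - 1);
  return (uint32_t)((v + rounding) >> shift) & ((1u << out_bits) - 1);
}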
|
||||
|
||||
template <typename Torus, class params, sharedMemDegree SMD>
|
||||
__global__ void __launch_bounds__(params::degree / params::opt)
|
||||
device_programmable_bootstrap_step_two_128(
|
||||
Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
|
||||
const Torus *__restrict__ lut_vector,
|
||||
const Torus *__restrict__ lut_vector_indexes,
|
||||
const double *__restrict__ bootstrapping_key, Torus *global_accumulator,
|
||||
double *global_join_buffer, uint32_t lwe_iteration,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, int8_t *device_mem,
|
||||
uint64_t device_memory_size_per_block, uint32_t num_many_lut,
|
||||
uint32_t lut_stride) {
|
||||
|
||||
// We use shared memory for the polynomials that are used often during the
|
||||
// bootstrap, since shared memory is kept in L1 cache and accessing it is
|
||||
// much faster than global memory
|
||||
extern __shared__ int8_t sharedmem[];
|
||||
int8_t *selected_memory;
|
||||
uint32_t glwe_dimension = gridDim.y - 1;
|
||||
|
||||
if constexpr (SMD == FULLSM) {
|
||||
selected_memory = sharedmem;
|
||||
} else {
|
||||
int block_index = blockIdx.x + blockIdx.y * gridDim.x +
|
||||
blockIdx.z * gridDim.x * gridDim.y;
|
||||
selected_memory = &device_mem[block_index * device_memory_size_per_block];
|
||||
}
|
||||
|
||||
// We always compute the pointer with most restrictive alignment to avoid
|
||||
// alignment issues
|
||||
Torus *accumulator = (Torus *)selected_memory;
|
||||
double *accumulator_fft =
|
||||
(double *)accumulator +
|
||||
(ptrdiff_t)(sizeof(Torus) * params::degree / sizeof(double));
|
||||
|
||||
if constexpr (SMD == PARTIALSM)
|
||||
accumulator_fft = (double *)sharedmem;
|
||||
|
||||
for (int level = 0; level < level_count; level++) {
|
||||
double *global_fft_slice =
|
||||
global_join_buffer + (level + blockIdx.x * level_count) *
|
||||
(glwe_dimension + 1) * (params::degree / 2) *
|
||||
4;
|
||||
|
||||
for (int j = 0; j < (glwe_dimension + 1); j++) {
|
||||
double *fft = global_fft_slice + j * params::degree / 2 * 4;
|
||||
|
||||
// Get the bootstrapping key piece necessary for the multiplication
|
||||
// It is already in the Fourier domain
|
||||
auto bsk_slice = get_ith_mask_kth_block_128(
|
||||
bootstrapping_key, lwe_iteration, j, level, polynomial_size,
|
||||
glwe_dimension, level_count);
|
||||
auto bsk_poly = bsk_slice + blockIdx.y * params::degree / 2 * 4;
|
||||
|
||||
polynomial_product_accumulate_in_fourier_domain_128<params>(
|
||||
accumulator_fft, fft, bsk_poly, !level && !j);
|
||||
}
|
||||
}
|
||||
|
||||
Torus *global_slice =
|
||||
global_accumulator +
|
||||
(blockIdx.y + blockIdx.x * (glwe_dimension + 1)) * params::degree;
|
||||
|
||||
// Load the persisted accumulator
|
||||
int tid = threadIdx.x;
|
||||
for (int i = 0; i < params::opt; i++) {
|
||||
accumulator[tid] = global_slice[tid];
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
|
||||
// Perform the inverse FFT on the result of the GGSW x GLWE and add to the
|
||||
// accumulator
|
||||
auto acc_fft_re_hi = accumulator_fft + 0 * params::degree / 2;
|
||||
auto acc_fft_re_lo = accumulator_fft + 1 * params::degree / 2;
|
||||
auto acc_fft_im_hi = accumulator_fft + 2 * params::degree / 2;
|
||||
auto acc_fft_im_lo = accumulator_fft + 3 * params::degree / 2;
|
||||
|
||||
negacyclic_backward_fft_f128<HalfDegree<params>>(
|
||||
acc_fft_re_hi, acc_fft_re_lo, acc_fft_im_hi, acc_fft_im_lo);
|
||||
|
||||
add_to_torus_128<Torus, params>(acc_fft_re_hi, acc_fft_re_lo, acc_fft_im_hi,
|
||||
acc_fft_im_lo, accumulator);
|
||||
|
||||
if (lwe_iteration + 1 == lwe_dimension) {
|
||||
// Last iteration
|
||||
auto block_lwe_array_out =
|
||||
&lwe_array_out[lwe_output_indexes[blockIdx.x] *
|
||||
(glwe_dimension * polynomial_size + 1) +
|
||||
blockIdx.y * polynomial_size];
|
||||
|
||||
if (blockIdx.y < glwe_dimension) {
|
||||
// Perform a sample extract. At this point, all blocks have the result,
|
||||
// but we do the computation at block 0 to avoid waiting for extra blocks,
|
||||
// in case they're not synchronized
|
||||
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
|
||||
if (num_many_lut > 1) {
|
||||
for (int i = 1; i < num_many_lut; i++) {
|
||||
auto next_lwe_array_out =
|
||||
lwe_array_out +
|
||||
(i * gridDim.x * (glwe_dimension * polynomial_size + 1));
|
||||
auto next_block_lwe_array_out =
|
||||
&next_lwe_array_out[lwe_output_indexes[blockIdx.x] *
|
||||
(glwe_dimension * polynomial_size + 1) +
|
||||
blockIdx.y * polynomial_size];
|
||||
|
||||
sample_extract_mask<Torus, params>(next_block_lwe_array_out,
|
||||
accumulator, 1, i * lut_stride);
|
||||
}
|
||||
}
|
||||
} else if (blockIdx.y == glwe_dimension) {
|
||||
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
|
||||
if (num_many_lut > 1) {
|
||||
for (int i = 1; i < num_many_lut; i++) {
|
||||
|
||||
auto next_lwe_array_out =
|
||||
lwe_array_out +
|
||||
(i * gridDim.x * (glwe_dimension * polynomial_size + 1));
|
||||
auto next_block_lwe_array_out =
|
||||
&next_lwe_array_out[lwe_output_indexes[blockIdx.x] *
|
||||
(glwe_dimension * polynomial_size + 1) +
|
||||
blockIdx.y * polynomial_size];
|
||||
|
||||
sample_extract_body<Torus, params>(next_block_lwe_array_out,
|
||||
accumulator, 0, i * lut_stride);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Persist the updated accumulator
|
||||
tid = threadIdx.x;
|
||||
for (int i = 0; i < params::opt; i++) {
|
||||
global_slice[tid] = accumulator[tid];
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename params>
|
||||
__host__ void scratch_programmable_bootstrap_128(
|
||||
cudaStream_t stream, uint32_t gpu_index, pbs_buffer_128<CLASSICAL> **buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
|
||||
|
||||
cuda_set_device(gpu_index);
|
||||
uint64_t full_sm_step_one =
|
||||
get_buffer_size_full_sm_programmable_bootstrap_step_one<__uint128_t>(
|
||||
polynomial_size);
|
||||
uint64_t full_sm_step_two =
|
||||
get_buffer_size_full_sm_programmable_bootstrap_step_two<__uint128_t>(
|
||||
polynomial_size);
|
||||
uint64_t partial_sm =
|
||||
get_buffer_size_partial_sm_programmable_bootstrap<__uint128_t>(
|
||||
polynomial_size);
|
||||
|
||||
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
|
||||
// Configure step one
|
||||
if (max_shared_memory >= partial_sm && max_shared_memory < full_sm_step_one) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_programmable_bootstrap_step_one_128<__uint128_t, params,
|
||||
PARTIALSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_programmable_bootstrap_step_one_128<__uint128_t, params,
|
||||
PARTIALSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
} else if (max_shared_memory >= partial_sm) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_programmable_bootstrap_step_one_128<__uint128_t, params, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_step_one));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_programmable_bootstrap_step_one_128<__uint128_t, params, FULLSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
// Configure step two
|
||||
if (max_shared_memory >= partial_sm && max_shared_memory < full_sm_step_two) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_programmable_bootstrap_step_two_128<__uint128_t, params,
|
||||
PARTIALSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_programmable_bootstrap_step_two_128<__uint128_t, params,
|
||||
PARTIALSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
} else if (max_shared_memory >= partial_sm) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_programmable_bootstrap_step_two_128<__uint128_t, params, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_step_two));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_programmable_bootstrap_step_two_128<__uint128_t, params, FULLSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
*buffer = new pbs_buffer_128<CLASSICAL>(
|
||||
stream, gpu_index, glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, PBS_VARIANT::DEFAULT, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
template <class params>
|
||||
__host__ void execute_step_one_128(
|
||||
cudaStream_t stream, uint32_t gpu_index, __uint128_t const *lut_vector,
|
||||
__uint128_t const *lut_vector_indexes, __uint128_t const *lwe_array_in,
|
||||
__uint128_t const *lwe_input_indexes, double const *bootstrapping_key,
|
||||
__uint128_t *global_accumulator, double *global_join_buffer,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, int8_t *d_mem, int lwe_iteration, uint64_t partial_sm,
|
||||
uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm) {
|
||||
|
||||
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
cuda_set_device(gpu_index);
|
||||
int thds = polynomial_size / params::opt;
|
||||
dim3 grid(input_lwe_ciphertext_count, glwe_dimension + 1, level_count);
|
||||
|
||||
if (max_shared_memory < partial_sm) {
|
||||
device_programmable_bootstrap_step_one_128<__uint128_t, params, NOSM>
|
||||
<<<grid, thds, 0, stream>>>(
|
||||
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
|
||||
bootstrapping_key, global_accumulator, global_join_buffer,
|
||||
lwe_iteration, lwe_dimension, polynomial_size, base_log,
|
||||
level_count, d_mem, full_dm);
|
||||
} else if (max_shared_memory < full_sm) {
|
||||
device_programmable_bootstrap_step_one_128<__uint128_t, params, PARTIALSM>
|
||||
<<<grid, thds, partial_sm, stream>>>(
|
||||
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
|
||||
bootstrapping_key, global_accumulator, global_join_buffer,
|
||||
lwe_iteration, lwe_dimension, polynomial_size, base_log,
|
||||
level_count, d_mem, partial_dm);
|
||||
} else {
|
||||
device_programmable_bootstrap_step_one_128<__uint128_t, params, FULLSM>
|
||||
<<<grid, thds, full_sm, stream>>>(
|
||||
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
|
||||
bootstrapping_key, global_accumulator, global_join_buffer,
|
||||
lwe_iteration, lwe_dimension, polynomial_size, base_log,
|
||||
level_count, d_mem, 0);
|
||||
}
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
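For a sense of scale, the launch shape used above under assumed parameters (illustrative values only: 4 input ciphertexts, k = 1, l = 3, N = 2048, params::opt = 8):

dim3 grid(4, 2, 3);   // input_lwe_ciphertext_count x (glwe_dimension + 1) x level_count
int thds = 2048 / 8;  // params::degree / params::opt = 256 threads per block
// => 24 blocks per step-one launch; host_programmable_bootstrap_128 repeats
//    this once per LWE mask coefficient (lwe_dimension iterations).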
|
||||
|
||||
template <class params>
__host__ void execute_step_two_128(
    cudaStream_t stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
    __uint128_t const *lwe_output_indexes, __uint128_t const *lut_vector,
    __uint128_t const *lut_vector_indexes, double const *bootstrapping_key,
    __uint128_t *global_accumulator, double *global_join_buffer,
    uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, int8_t *d_mem, int lwe_iteration, uint64_t partial_sm,
    uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm,
    uint32_t num_many_lut, uint32_t lut_stride) {

  int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
  cuda_set_device(gpu_index);
  int thds = polynomial_size / params::opt;
  dim3 grid(input_lwe_ciphertext_count, glwe_dimension + 1);

  if (max_shared_memory < partial_sm) {
    device_programmable_bootstrap_step_two_128<__uint128_t, params, NOSM>
        <<<grid, thds, 0, stream>>>(
            lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
            bootstrapping_key, global_accumulator, global_join_buffer,
            lwe_iteration, lwe_dimension, polynomial_size, base_log,
            level_count, d_mem, full_dm, num_many_lut, lut_stride);
  } else if (max_shared_memory < full_sm) {
    device_programmable_bootstrap_step_two_128<__uint128_t, params, PARTIALSM>
        <<<grid, thds, partial_sm, stream>>>(
            lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
            bootstrapping_key, global_accumulator, global_join_buffer,
            lwe_iteration, lwe_dimension, polynomial_size, base_log,
            level_count, d_mem, partial_dm, num_many_lut, lut_stride);
  } else {
    device_programmable_bootstrap_step_two_128<__uint128_t, params, FULLSM>
        <<<grid, thds, full_sm, stream>>>(
            lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
            bootstrapping_key, global_accumulator, global_join_buffer,
            lwe_iteration, lwe_dimension, polynomial_size, base_log,
            level_count, d_mem, 0, num_many_lut, lut_stride);
  }
  check_cuda_error(cudaGetLastError());
}

/*
 * Host wrapper to the programmable bootstrap 128
 */
template <class params>
__host__ void host_programmable_bootstrap_128(
    cudaStream_t stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
    __uint128_t const *lwe_output_indexes, __uint128_t const *lut_vector,
    __uint128_t const *lut_vector_indexes, __uint128_t const *lwe_array_in,
    __uint128_t const *lwe_input_indexes, double const *bootstrapping_key,
    pbs_buffer_128<CLASSICAL> *pbs_buffer, uint32_t glwe_dimension,
    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, uint32_t input_lwe_ciphertext_count,
    uint32_t num_many_lut, uint32_t lut_stride) {
  cuda_set_device(gpu_index);

  // With SM each block corresponds to either the mask or body, no need to
  // duplicate data for each
  uint64_t full_sm_step_one =
      get_buffer_size_full_sm_programmable_bootstrap_step_one<__uint128_t>(
          polynomial_size);
  uint64_t full_sm_step_two =
      get_buffer_size_full_sm_programmable_bootstrap_step_two<__uint128_t>(
          polynomial_size);

  uint64_t partial_sm =
      get_buffer_size_partial_sm_programmable_bootstrap<__uint128_t>(
          polynomial_size);

  uint64_t partial_dm_step_one = full_sm_step_one - partial_sm;
  uint64_t partial_dm_step_two = full_sm_step_two - partial_sm;
  uint64_t full_dm_step_one = full_sm_step_one;
  uint64_t full_dm_step_two = full_sm_step_two;

  __uint128_t *global_accumulator = pbs_buffer->global_accumulator;
  double *global_join_buffer = pbs_buffer->global_join_buffer;
  int8_t *d_mem = pbs_buffer->d_mem;

  for (int i = 0; i < lwe_dimension; i++) {
    execute_step_one_128<params>(
        stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, global_accumulator,
        global_join_buffer, input_lwe_ciphertext_count, lwe_dimension,
        glwe_dimension, polynomial_size, base_log, level_count, d_mem, i,
        partial_sm, partial_dm_step_one, full_sm_step_one, full_dm_step_one);
    execute_step_two_128<params>(
        stream, gpu_index, lwe_array_out, lwe_output_indexes, lut_vector,
        lut_vector_indexes, bootstrapping_key, global_accumulator,
        global_join_buffer, input_lwe_ciphertext_count, lwe_dimension,
        glwe_dimension, polynomial_size, base_log, level_count, d_mem, i,
        partial_sm, partial_dm_step_two, full_sm_step_two, full_dm_step_two,
        num_many_lut, lut_stride);
  }
}

#endif // TFHE_RS_BACKENDS_TFHE_CUDA_BACKEND_CUDA_SRC_PBS_PROGRAMMABLE_BOOTSTRAP_CLASSIC_128_CUH_
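
The launch configuration in execute_step_one_128 / execute_step_two_128 above is picked from the device's dynamic shared-memory budget: the kernels are launched with 0, partial_sm or full_sm bytes of dynamic shared memory, and fall back to the d_mem scratch area when the budget is too small. A minimal host-side sketch of that selection, assuming the same partial_sm / full_sm byte counts; the enum and helper names are illustrative, not part of the backend:

enum class SmMode { NoSm, PartialSm, FullSm };

// Illustrative helper mirroring the branch order used by
// execute_step_one_128 / execute_step_two_128 above.
inline SmMode pick_sm_mode(uint64_t max_shared_memory, uint64_t partial_sm,
                           uint64_t full_sm) {
  if (max_shared_memory < partial_sm)
    return SmMode::NoSm; // launch with 0 bytes, work out of the d_mem scratch
  if (max_shared_memory < full_sm)
    return SmMode::PartialSm; // launch with partial_sm bytes of shared memory
  return SmMode::FullSm;      // launch with full_sm bytes of shared memory
}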
@@ -3,6 +3,7 @@

#include "crypto/torus.cuh"
#include "device.h"
#include "fft128/f128.cuh"
#include "parameters.cuh"

// Return A if C == 0 and B if C == 1
@@ -172,6 +173,35 @@ __device__ void add_to_torus(double2 *m_values, Torus *result,
  }
}

/**
 * In case of classical PBS, this method should accumulate the result.
 * In case of multi-bit PBS, it should overwrite.
 */
template <typename Torus, class params>
__device__ void add_to_torus_128(double *re_hi, double *re_lo, double *im_hi,
                                 double *im_lo, Torus *result,
                                 bool overwrite_result = false) {
  constexpr double normalization = 1. / (params::degree / 2);
  int tid = threadIdx.x;
#pragma unroll
  for (int i = 0; i < params::opt / 2; i++) {
    f128 f128_real(re_hi[tid] * normalization, re_lo[tid] * normalization);
    f128 f128_imag(im_hi[tid] * normalization, im_lo[tid] * normalization);

    Torus torus_real = u128_from_torus_f128(f128_real);
    Torus torus_imag = u128_from_torus_f128(f128_imag);

    if (overwrite_result) {
      result[tid] = torus_real;
      result[tid + params::degree / 2] = torus_imag;
    } else {
      result[tid] += torus_real;
      result[tid + params::degree / 2] += torus_imag;
    }
    tid = tid + params::degree / params::opt;
  }
}

// Extracts the body of the nth-LWE in a GLWE.
template <typename Torus, class params>
__device__ void sample_extract_body(Torus *lwe_array_out, Torus const *glwe,

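Each 128-bit coefficient comes back from the inverse FFT as two double limbs per real and imaginary part (a hi/lo, double-double style pair), which is what the f128 constructor above consumes after scaling both limbs by 1 / (degree / 2). A small sketch of that load, with a stand-in struct instead of the backend's f128 type (names here are illustrative):

// Illustrative stand-in for the backend's f128 double-double type: the value
// represented is approximately hi + lo, with |lo| much smaller than |hi|.
struct f128_sketch {
  double hi;
  double lo;
};

// Rebuilds the normalized real part of coefficient `tid` from the two limb
// arrays, as add_to_torus_128 does before mapping it back onto the torus.
inline f128_sketch load_real_limbs(const double *re_hi, const double *re_lo,
                                   int tid, double normalization) {
  return {re_hi[tid] * normalization, re_lo[tid] * normalization};
}
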
@@ -56,6 +56,64 @@ __device__ void polynomial_product_accumulate_in_fourier_domain(
  }
}

// Computes result += first * second
// If init_accumulator is set, assumes that result was not initialized and does
// that with the outcome of first * second
template <class params>
__device__ void polynomial_product_accumulate_in_fourier_domain_128(
    double *result, double *first, const double *second,
    bool init_accumulator = false) {
  int tid = threadIdx.x;
  if (init_accumulator) {
    for (int i = 0; i < params::opt / 2; i++) {
      f128 a_re(first[tid + 0 * params::degree / 2],
                first[tid + 1 * params::degree / 2]);
      f128 a_im(first[tid + 2 * params::degree / 2],
                first[tid + 3 * params::degree / 2]);
      f128 b_re(second[tid + 0 * params::degree / 2],
                second[tid + 1 * params::degree / 2]);
      f128 b_im(second[tid + 2 * params::degree / 2],
                second[tid + 3 * params::degree / 2]);
      f128 c_re, c_im;

      f128::cplx_f128_mul_assign(c_re, c_im, a_re, a_im, b_re, b_im);

      result[tid + 0 * params::degree / 2] = c_re.hi;
      result[tid + 1 * params::degree / 2] = c_re.lo;
      result[tid + 2 * params::degree / 2] = c_im.hi;
      result[tid + 3 * params::degree / 2] = c_im.lo;

      tid += params::degree / params::opt;
    }
  } else {
    for (int i = 0; i < params::opt / 2; i++) {
      f128 a_re(first[tid + 0 * params::degree / 2],
                first[tid + 1 * params::degree / 2]);
      f128 a_im(first[tid + 2 * params::degree / 2],
                first[tid + 3 * params::degree / 2]);
      f128 b_re(second[tid + 0 * params::degree / 2],
                second[tid + 1 * params::degree / 2]);
      f128 b_im(second[tid + 2 * params::degree / 2],
                second[tid + 3 * params::degree / 2]);
      f128 res_re(result[tid + 0 * params::degree / 2],
                  result[tid + 1 * params::degree / 2]);
      f128 res_im(result[tid + 2 * params::degree / 2],
                  result[tid + 3 * params::degree / 2]);
      f128 c_re, c_im;

      f128::cplx_f128_mul_assign(c_re, c_im, a_re, a_im, b_re, b_im);
      f128::cplx_f128_add_assign(res_re, res_im, res_re, res_im, c_re, c_im);

      result[tid + 0 * params::degree / 2] = res_re.hi;
      result[tid + 1 * params::degree / 2] = res_re.lo;
      result[tid + 2 * params::degree / 2] = res_im.hi;
      result[tid + 3 * params::degree / 2] = res_im.lo;

      tid += params::degree / params::opt;
    }
  }
}

// Computes result += x
// If init_accumulator is set, assumes that result was not initialized and does
// that with x

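The indexing pattern tid + k * params::degree / 2 above reflects how the 128-bit Fourier buffers are laid out: four contiguous planes of degree / 2 doubles holding re_hi, re_lo, im_hi and im_lo in that order. A small sketch of that addressing, with an illustrative helper name:

// Illustrative only: returns a pointer to plane k (0 = re_hi, 1 = re_lo,
// 2 = im_hi, 3 = im_lo) of a 128-bit Fourier buffer of the given degree.
inline double *fourier_plane_128(double *buffer, int k, int degree) {
  return buffer + k * (degree / 2);
}

// fourier_plane_128(buf, 0, N)[tid] and fourier_plane_128(buf, 1, N)[tid]
// together form the double-double real part of coefficient tid.
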
@@ -1416,6 +1416,18 @@ unsafe extern "C" {
        polynomial_size: u32,
    );
}
unsafe extern "C" {
    pub fn cuda_convert_lwe_programmable_bootstrap_key_128(
        stream: *mut ffi::c_void,
        gpu_index: u32,
        dest: *mut ffi::c_void,
        src: *const ffi::c_void,
        input_lwe_dim: u32,
        glwe_dim: u32,
        level_count: u32,
        polynomial_size: u32,
    );
}
unsafe extern "C" {
    pub fn scratch_cuda_programmable_bootstrap_amortized_32(
        stream: *mut ffi::c_void,
@@ -1509,6 +1521,18 @@ unsafe extern "C" {
        allocate_gpu_memory: bool,
    );
}
unsafe extern "C" {
    pub fn scratch_cuda_programmable_bootstrap_128(
        stream: *mut ffi::c_void,
        gpu_index: u32,
        buffer: *mut *mut i8,
        glwe_dimension: u32,
        polynomial_size: u32,
        level_count: u32,
        input_lwe_ciphertext_count: u32,
        allocate_gpu_memory: bool,
    );
}
unsafe extern "C" {
    pub fn cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
        stream: *mut ffi::c_void,
@@ -1553,6 +1577,28 @@ unsafe extern "C" {
        lut_stride: u32,
    );
}
unsafe extern "C" {
    pub fn cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
        stream: *mut ffi::c_void,
        gpu_index: u32,
        lwe_array_out: *mut ffi::c_void,
        lwe_output_indexes: *const ffi::c_void,
        lut_vector: *const ffi::c_void,
        lut_vector_indexes: *const ffi::c_void,
        lwe_array_in: *const ffi::c_void,
        lwe_input_indexes: *const ffi::c_void,
        bootstrapping_key: *const ffi::c_void,
        buffer: *mut i8,
        lwe_dimension: u32,
        glwe_dimension: u32,
        polynomial_size: u32,
        base_log: u32,
        level_count: u32,
        num_samples: u32,
        num_many_lut: u32,
        lut_stride: u32,
    );
}
unsafe extern "C" {
    pub fn cleanup_cuda_programmable_bootstrap(
        stream: *mut ffi::c_void,
@@ -1560,6 +1606,13 @@ unsafe extern "C" {
        pbs_buffer: *mut *mut i8,
    );
}
unsafe extern "C" {
    pub fn cleanup_cuda_programmable_bootstrap_128(
        stream: *mut ffi::c_void,
        gpu_index: u32,
        pbs_buffer: *mut *mut i8,
    );
}
unsafe extern "C" {
    pub fn has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
        glwe_dimension: u32,

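These entry points follow the same scratch / run / cleanup life cycle as their 64-bit counterparts (see the programmable_bootstrap_async change further down). A hedged C++ sketch of the call order against the C API; the stream handle and all dimension values are placeholders:

// Illustrative only: every argument below is a placeholder.
int8_t *pbs_buffer = nullptr;
scratch_cuda_programmable_bootstrap_128(stream, gpu_index, &pbs_buffer,
                                        glwe_dimension, polynomial_size,
                                        level_count, num_samples,
                                        /*allocate_gpu_memory=*/true);
cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
    stream, gpu_index, lwe_array_out, lwe_output_indexes, lut_vector,
    lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
    pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size, base_log,
    level_count, num_samples, num_many_lut, lut_stride);
cleanup_cuda_programmable_bootstrap_128(stream, gpu_index, &pbs_buffer);
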
@@ -348,6 +348,18 @@ pub const FFT128_U128_PARAMS: FftTestParams<u128> = FftTestParams {
    ciphertext_modulus: CiphertextModulus::<u128>::new_native(),
};

#[cfg(feature = "gpu")]
pub const FFT128_U128_GPU_PARAMS: FftTestParams<u128> = FftTestParams {
    lwe_dimension: LweDimension(879),
    glwe_dimension: GlweDimension(2),
    polynomial_size: PolynomialSize(2048),
    lwe_noise_distribution: DynamicDistribution::new_t_uniform(46),
    glwe_noise_distribution: DynamicDistribution::new_t_uniform(30),
    pbs_base_log: DecompositionBaseLog(32),
    pbs_level: DecompositionLevelCount(3),
    ciphertext_modulus: CiphertextModulus::new_native(),
};

pub const FFT_WOPBS_PARAMS: FftWopPbsTestParams<u64> = FftWopPbsTestParams {
    lwe_dimension: LweDimension(481),
    glwe_dimension: GlweDimension(1),

@@ -0,0 +1,180 @@
pub(crate) use crate::core_crypto::algorithms::test::gen_keys_or_get_from_cache_if_enabled;

use crate::core_crypto::algorithms::test::{
    FftBootstrapKeys, FftTestParams, TestResources, FFT128_U128_GPU_PARAMS,
};
use crate::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList;
use crate::core_crypto::gpu::lwe_bootstrap_key::CudaLweBootstrapKey;
use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
use crate::core_crypto::gpu::vec::{CudaVec, GpuIndex};
use crate::core_crypto::gpu::{cuda_programmable_bootstrap_lwe_ciphertext, CudaStreams};

use crate::core_crypto::keycache::KeyCacheAccess;
use crate::core_crypto::prelude::*;
use itertools::Itertools;
use serde::de::DeserializeOwned;
use serde::Serialize;

pub fn generate_keys<
    Scalar: UnsignedTorus + Sync + Send + CastFrom<usize> + CastInto<usize> + Serialize + DeserializeOwned,
>(
    params: FftTestParams<Scalar>,
    rsc: &mut TestResources,
) -> FftBootstrapKeys<Scalar> {
    // Generate an LweSecretKey with binary coefficients
    let small_lwe_sk =
        LweSecretKey::generate_new_binary(params.lwe_dimension, &mut rsc.secret_random_generator);

    // Generate a GlweSecretKey with binary coefficients
    let glwe_sk = GlweSecretKey::generate_new_binary(
        params.glwe_dimension,
        params.polynomial_size,
        &mut rsc.secret_random_generator,
    );

    // Create a copy of the GlweSecretKey re-interpreted as an LweSecretKey
    let big_lwe_sk = glwe_sk.clone().into_lwe_secret_key();

    let bsk = par_allocate_and_generate_new_lwe_bootstrap_key(
        &small_lwe_sk,
        &glwe_sk,
        params.pbs_base_log,
        params.pbs_level,
        params.glwe_noise_distribution,
        params.ciphertext_modulus,
        &mut rsc.encryption_random_generator,
    );

    FftBootstrapKeys {
        small_lwe_sk,
        big_lwe_sk,
        bsk,
    }
}

pub fn execute_bootstrap_u128<Scalar>(params: FftTestParams<Scalar>)
where
    Scalar: Numeric
        + UnsignedTorus
        + CastFrom<usize>
        + CastInto<usize>
        + Send
        + Sync
        + Serialize
        + DeserializeOwned,
    FftTestParams<Scalar>: KeyCacheAccess<Keys = FftBootstrapKeys<Scalar>>,
{
    let lwe_noise_distribution = params.lwe_noise_distribution;
    let glwe_dimension = params.glwe_dimension;
    let polynomial_size = params.polynomial_size;
    let ciphertext_modulus = params.ciphertext_modulus;

    let mut rsc = TestResources::new();

    let mut keys_gen = |params| generate_keys(params, &mut rsc);
    let keys = gen_keys_or_get_from_cache_if_enabled(params, &mut keys_gen);
    let (std_bootstrapping_key, small_lwe_sk, big_lwe_sk) =
        (keys.bsk, keys.small_lwe_sk, keys.big_lwe_sk);
    let output_lwe_dimension = big_lwe_sk.lwe_dimension();

    let gpu_index = 0;
    let stream = CudaStreams::new_single_gpu(GpuIndex::new(gpu_index));
    let d_bsk = CudaLweBootstrapKey::from_lwe_bootstrap_key(&std_bootstrapping_key, &stream);

    // Our 4 bits message space
    let message_modulus: Scalar = Scalar::ONE << 4;

    // Our input message
    let input_message: Scalar = 3usize.cast_into();
    let number_of_messages = 1;
    // Delta used to encode 4 bits of message + a bit of padding on Scalar
    let delta: Scalar = (Scalar::ONE << (Scalar::BITS - 1)) / message_modulus;

    // Apply our encoding
    let plaintext = Plaintext(input_message * delta);

    // Allocate a new LweCiphertext and encrypt our plaintext
    let lwe_ciphertext_in: LweCiphertextOwned<Scalar> = allocate_and_encrypt_new_lwe_ciphertext(
        &small_lwe_sk,
        plaintext,
        lwe_noise_distribution,
        ciphertext_modulus,
        &mut rsc.encryption_random_generator,
    );

    let f = |x: Scalar| x;
    let accumulator: GlweCiphertextOwned<Scalar> = generate_programmable_bootstrap_glwe_lut(
        polynomial_size,
        glwe_dimension.to_glwe_size(),
        message_modulus.cast_into(),
        ciphertext_modulus,
        delta,
        f,
    );

    let d_lwe_ciphertext_in =
        CudaLweCiphertextList::from_lwe_ciphertext(&lwe_ciphertext_in, &stream);

    let mut d_out_pbs_ct = CudaLweCiphertextList::new(
        output_lwe_dimension,
        LweCiphertextCount(1),
        ciphertext_modulus,
        &stream,
    );

    let d_accumulator = CudaGlweCiphertextList::from_glwe_ciphertext(&accumulator, &stream);

    let mut test_vector_indexes: Vec<Scalar> = vec![Scalar::ZERO; number_of_messages];
    for (i, ind) in test_vector_indexes.iter_mut().enumerate() {
        *ind = <usize as CastInto<Scalar>>::cast_into(i);
    }
    let mut d_test_vector_indexes =
        unsafe { CudaVec::<Scalar>::new_async(number_of_messages, &stream, 0) };
    unsafe { d_test_vector_indexes.copy_from_cpu_async(&test_vector_indexes, &stream, 0) };

    let num_blocks = d_lwe_ciphertext_in.0.lwe_ciphertext_count.0;
    let lwe_indexes_usize: Vec<usize> = (0..num_blocks).collect_vec();
    let lwe_indexes = lwe_indexes_usize
        .iter()
        .map(|&x| <usize as CastInto<Scalar>>::cast_into(x))
        .collect_vec();
    let mut d_output_indexes = unsafe { CudaVec::<Scalar>::new_async(num_blocks, &stream, 0) };
    let mut d_input_indexes = unsafe { CudaVec::<Scalar>::new_async(num_blocks, &stream, 0) };
    unsafe {
        d_input_indexes.copy_from_cpu_async(&lwe_indexes, &stream, 0);
        d_output_indexes.copy_from_cpu_async(&lwe_indexes, &stream, 0);
    }

    cuda_programmable_bootstrap_lwe_ciphertext(
        &d_lwe_ciphertext_in,
        &mut d_out_pbs_ct,
        &d_accumulator,
        &d_test_vector_indexes,
        &d_output_indexes,
        &d_input_indexes,
        LweCiphertextCount(num_blocks),
        &d_bsk,
        &stream,
    );

    let pbs_ct = d_out_pbs_ct.into_lwe_ciphertext(&stream);

    // Decrypt the PBS result
    let pbs_plaintext: Plaintext<Scalar> = decrypt_lwe_ciphertext(&big_lwe_sk, &pbs_ct);

    // Create a SignedDecomposer to perform the rounding of the decrypted plaintext
    // We pass a DecompositionBaseLog of 5 and a DecompositionLevelCount of 1 indicating we want
    // to round the 5 MSB, 1 bit of padding plus our 4 bits of message
    let signed_decomposer =
        SignedDecomposer::new(DecompositionBaseLog(5), DecompositionLevelCount(1));

    // Round and remove our encoding
    let pbs_result: Scalar = signed_decomposer.closest_representable(pbs_plaintext.0) / delta;

    assert_eq!(f(input_message), pbs_result);
}

#[test]
fn test_bootstrap_u128() {
    execute_bootstrap_u128::<u128>(FFT128_U128_GPU_PARAMS);
}
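
The test drives a 4-bit message through the identity LUT, so correctness reduces to the encoding round trip: with one bit of padding on a 128-bit torus, delta is 2^123, and dividing the rounded decryption by delta recovers the message. A small sketch of that arithmetic (values illustrative, plain C++ instead of the Rust test code):

// Illustrative only: 4-bit message space with 1 bit of padding on a 128-bit torus.
__uint128_t one = 1;
__uint128_t message_modulus = one << 4;             // 16
__uint128_t delta = (one << 127) / message_modulus; // 2^123
__uint128_t input_message = 3;
__uint128_t plaintext = input_message * delta;      // encoding
// After decryption and rounding to the nearest multiple of delta,
// dividing by delta recovers the 4-bit message.
__uint128_t recovered = plaintext / delta;          // == 3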
@@ -7,6 +7,7 @@ mod lwe_linear_algebra;
mod lwe_multi_bit_programmable_bootstrapping;
mod lwe_packing_keyswitch;
mod lwe_programmable_bootstrapping;
mod lwe_programmable_bootstrapping_128;
mod noise_distribution;

pub struct CudaPackingKeySwitchKeys<Scalar: UnsignedInteger> {

@@ -37,6 +37,11 @@ impl CudaLweBootstrapKey {
        let decomp_level_count = bsk.decomposition_level_count();
        let decomp_base_log = bsk.decomposition_base_log();
        let glwe_dimension = bsk.glwe_size().to_glwe_dimension();
        let double_count = if size_of::<InputBskCont::Element>() == 16 {
            2
        } else {
            1
        };

        // Allocate memory
        let mut d_vec = CudaVec::<f64>::new_multi_gpu(
@@ -45,10 +50,11 @@ impl CudaLweBootstrapKey {
                glwe_dimension.to_glwe_size(),
                polynomial_size,
                decomp_level_count,
            ),
            ) * double_count,
            streams,
        );
        // Copy to the GPU

        unsafe {
            convert_lwe_programmable_bootstrap_key_async(
                streams,
@@ -59,7 +65,7 @@ impl CudaLweBootstrapKey {
                decomp_level_count,
                polynomial_size,
            );
        }
        };
        streams.synchronize();
        Self {
            d_vec,

@@ -121,41 +121,84 @@ pub unsafe fn programmable_bootstrap_async<T: UnsignedInteger>(
    let num_many_lut = 1u32;
    let lut_stride = 0u32;
    let mut pbs_buffer: *mut i8 = std::ptr::null_mut();
    scratch_cuda_programmable_bootstrap_64(
        streams.ptr[0],
        streams.gpu_indexes[0].get(),
        std::ptr::addr_of_mut!(pbs_buffer),
        glwe_dimension.0 as u32,
        polynomial_size.0 as u32,
        level.0 as u32,
        num_samples,
        true,
    );
    cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
        streams.ptr[0],
        streams.gpu_indexes[0].get(),
        lwe_array_out.as_mut_c_ptr(0),
        lwe_out_indexes.as_c_ptr(0),
        test_vector.as_c_ptr(0),
        test_vector_indexes.as_c_ptr(0),
        lwe_array_in.as_c_ptr(0),
        lwe_in_indexes.as_c_ptr(0),
        bootstrapping_key.as_c_ptr(0),
        pbs_buffer,
        lwe_dimension.0 as u32,
        glwe_dimension.0 as u32,
        polynomial_size.0 as u32,
        base_log.0 as u32,
        level.0 as u32,
        num_samples,
        num_many_lut,
        lut_stride,
    );
    cleanup_cuda_programmable_bootstrap(
        streams.ptr[0],
        streams.gpu_indexes[0].get(),
        std::ptr::addr_of_mut!(pbs_buffer),
    );

    if size_of::<T>() == 16 {
        scratch_cuda_programmable_bootstrap_128(
            streams.ptr[0],
            streams.gpu_indexes[0].get(),
            std::ptr::addr_of_mut!(pbs_buffer),
            glwe_dimension.0 as u32,
            polynomial_size.0 as u32,
            level.0 as u32,
            num_samples,
            true,
        );

        cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
            streams.ptr[0],
            streams.gpu_indexes[0].get(),
            lwe_array_out.as_mut_c_ptr(0),
            lwe_out_indexes.as_c_ptr(0),
            test_vector.as_c_ptr(0),
            test_vector_indexes.as_c_ptr(0),
            lwe_array_in.as_c_ptr(0),
            lwe_in_indexes.as_c_ptr(0),
            bootstrapping_key.as_c_ptr(0),
            pbs_buffer,
            lwe_dimension.0 as u32,
            glwe_dimension.0 as u32,
            polynomial_size.0 as u32,
            base_log.0 as u32,
            level.0 as u32,
            num_samples,
            num_many_lut,
            lut_stride,
        );

        cleanup_cuda_programmable_bootstrap_128(
            streams.ptr[0],
            streams.gpu_indexes[0].get(),
            std::ptr::addr_of_mut!(pbs_buffer),
        );
    } else {
        scratch_cuda_programmable_bootstrap_64(
            streams.ptr[0],
            streams.gpu_indexes[0].get(),
            std::ptr::addr_of_mut!(pbs_buffer),
            glwe_dimension.0 as u32,
            polynomial_size.0 as u32,
            level.0 as u32,
            num_samples,
            true,
        );

        cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
            streams.ptr[0],
            streams.gpu_indexes[0].get(),
            lwe_array_out.as_mut_c_ptr(0),
            lwe_out_indexes.as_c_ptr(0),
            test_vector.as_c_ptr(0),
            test_vector_indexes.as_c_ptr(0),
            lwe_array_in.as_c_ptr(0),
            lwe_in_indexes.as_c_ptr(0),
            bootstrapping_key.as_c_ptr(0),
            pbs_buffer,
            lwe_dimension.0 as u32,
            glwe_dimension.0 as u32,
            polynomial_size.0 as u32,
            base_log.0 as u32,
            level.0 as u32,
            num_samples,
            num_many_lut,
            lut_stride,
        );

        cleanup_cuda_programmable_bootstrap(
            streams.ptr[0],
            streams.gpu_indexes[0].get(),
            std::ptr::addr_of_mut!(pbs_buffer),
        );
    }
}

/// Programmable multi-bit bootstrap on a vector of LWE ciphertexts

@@ -341,19 +384,30 @@ pub unsafe fn convert_lwe_programmable_bootstrap_key_async<T: UnsignedInteger>(
    l_gadget: DecompositionLevelCount,
    polynomial_size: PolynomialSize,
) {
    let size = std::mem::size_of_val(src);
    for (i, &stream_ptr) in streams.ptr.iter().enumerate() {
        assert_eq!(dest.len() * std::mem::size_of::<T>(), size);
        cuda_convert_lwe_programmable_bootstrap_key_64(
            stream_ptr,
            streams.gpu_indexes[i].get(),
            dest.as_mut_c_ptr(i as u32),
            src.as_ptr().cast(),
            input_lwe_dim.0 as u32,
            glwe_dim.0 as u32,
            l_gadget.0 as u32,
            polynomial_size.0 as u32,
        );
        if size_of::<T>() == 16 {
            cuda_convert_lwe_programmable_bootstrap_key_128(
                stream_ptr,
                streams.gpu_indexes[i].get(),
                dest.as_mut_c_ptr(i as u32),
                src.as_ptr().cast(),
                input_lwe_dim.0 as u32,
                glwe_dim.0 as u32,
                l_gadget.0 as u32,
                polynomial_size.0 as u32,
            );
        } else {
            cuda_convert_lwe_programmable_bootstrap_key_64(
                stream_ptr,
                streams.gpu_indexes[i].get(),
                dest.as_mut_c_ptr(i as u32),
                src.as_ptr().cast(),
                input_lwe_dim.0 as u32,
                glwe_dim.0 as u32,
                l_gadget.0 as u32,
                polynomial_size.0 as u32,
            );
        }
    }
}
