feat(gpu): Implement 128-bit classic PBS

Beka Barbakadze
2025-03-05 16:48:48 +04:00
committed by bbarbakadze
parent 8dadb626f2
commit 459969e9d2
17 changed files with 1611 additions and 66 deletions

View File

@@ -9,20 +9,26 @@
template <typename Torus>
uint64_t get_buffer_size_full_sm_programmable_bootstrap_step_one(
uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size + // accumulator_rotated
sizeof(double2) * polynomial_size / 2; // accumulator fft
size_t double_count = (sizeof(Torus) == 16) ? 2 : 1;
return sizeof(Torus) * polynomial_size + // accumulator_rotated
sizeof(double) * 2 * double_count * polynomial_size /
2; // accumulator fft
}
template <typename Torus>
uint64_t get_buffer_size_full_sm_programmable_bootstrap_step_two(
uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size + // accumulator
sizeof(double2) * polynomial_size / 2; // accumulator fft
size_t double_count = (sizeof(Torus) == 16) ? 2 : 1;
return sizeof(Torus) * polynomial_size + // accumulator
sizeof(double) * 2 * double_count * polynomial_size /
2; // accumulator fft
}
template <typename Torus>
uint64_t
get_buffer_size_partial_sm_programmable_bootstrap(uint32_t polynomial_size) {
return sizeof(double2) * polynomial_size / 2; // accumulator fft
size_t double_count = (sizeof(Torus) == 16) ? 2 : 1;
return sizeof(double) * 2 * double_count * polynomial_size /
2; // accumulator fft
}
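For a sense of scale, here is a sketch of what these formulas give for step one, assuming Torus = __uint128_t and polynomial_size = 2048 (illustration only, not part of the diff):
// Sketch only: step-one shared-memory requirement for N = 2048, Torus = __uint128_t.
constexpr uint32_t N = 2048;
constexpr uint64_t acc_rotated = sizeof(__uint128_t) * N;    // 32768 bytes
constexpr uint64_t acc_fft = sizeof(double) * 2 * 2 * N / 2; // 32768 bytes (4 doubles per complex coefficient)
constexpr uint64_t full_sm_step_one = acc_rotated + acc_fft; // 65536 bytes (64 KiB)
// This exceeds the 48 KiB default per-block shared-memory limit, which is why the
// scratch code below opts in to larger dynamic shared memory and falls back to
// PARTIALSM / NOSM when the device cannot provide it.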
template <typename Torus>
@@ -215,6 +221,155 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
}
};
template <PBS_TYPE pbs_type> struct pbs_buffer_128;
template <> struct pbs_buffer_128<PBS_TYPE::CLASSICAL> {
int8_t *d_mem;
__uint128_t *global_accumulator;
double *global_join_buffer;
PBS_VARIANT pbs_variant;
pbs_buffer_128(cudaStream_t stream, uint32_t gpu_index,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
PBS_VARIANT pbs_variant, bool allocate_gpu_memory) {
cuda_set_device(gpu_index);
this->pbs_variant = pbs_variant;
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
size_t global_join_buffer_size = (glwe_dimension + 1) * level_count *
input_lwe_ciphertext_count *
polynomial_size / 2 * sizeof(double) * 4;
if (allocate_gpu_memory) {
switch (pbs_variant) {
case PBS_VARIANT::DEFAULT: {
uint64_t full_sm_step_one =
get_buffer_size_full_sm_programmable_bootstrap_step_one<
__uint128_t>(polynomial_size);
uint64_t full_sm_step_two =
get_buffer_size_full_sm_programmable_bootstrap_step_two<
__uint128_t>(polynomial_size);
uint64_t partial_sm =
get_buffer_size_partial_sm_programmable_bootstrap<__uint128_t>(
polynomial_size);
uint64_t partial_dm_step_one = full_sm_step_one - partial_sm;
uint64_t partial_dm_step_two = full_sm_step_two - partial_sm;
uint64_t full_dm = full_sm_step_one;
uint64_t device_mem = 0;
if (max_shared_memory < partial_sm) {
device_mem = full_dm * input_lwe_ciphertext_count * level_count *
(glwe_dimension + 1);
} else if (max_shared_memory < full_sm_step_two) {
device_mem =
(partial_dm_step_two + partial_dm_step_one * level_count) *
input_lwe_ciphertext_count * (glwe_dimension + 1);
} else if (max_shared_memory < full_sm_step_one) {
device_mem = partial_dm_step_one * input_lwe_ciphertext_count *
level_count * (glwe_dimension + 1);
}
// Otherwise, both kernels run all in shared memory
d_mem = (int8_t *)cuda_malloc_async(device_mem, stream, gpu_index);
global_join_buffer = (double *)cuda_malloc_async(
global_join_buffer_size, stream, gpu_index);
global_accumulator = (__uint128_t *)cuda_malloc_async(
(glwe_dimension + 1) * input_lwe_ciphertext_count *
polynomial_size * sizeof(__uint128_t),
stream, gpu_index);
} break;
case PBS_VARIANT::CG: {
uint64_t full_sm =
get_buffer_size_full_sm_programmable_bootstrap_cg<__uint128_t>(
polynomial_size);
uint64_t partial_sm =
get_buffer_size_partial_sm_programmable_bootstrap_cg<__uint128_t>(
polynomial_size);
uint64_t partial_dm = full_sm - partial_sm;
uint64_t full_dm = full_sm;
uint64_t device_mem = 0;
if (max_shared_memory < partial_sm) {
device_mem = full_dm * input_lwe_ciphertext_count * level_count *
(glwe_dimension + 1);
} else if (max_shared_memory < full_sm) {
device_mem = partial_dm * input_lwe_ciphertext_count * level_count *
(glwe_dimension + 1);
}
// Otherwise, both kernels run all in shared memory
d_mem = (int8_t *)cuda_malloc_async(device_mem, stream, gpu_index);
global_join_buffer = (double *)cuda_malloc_async(
global_join_buffer_size, stream, gpu_index);
} break;
#if CUDA_ARCH >= 900
case PBS_VARIANT::TBC: {
bool supports_dsm =
supports_distributed_shared_memory_on_classic_programmable_bootstrap<
__uint128_t>(polynomial_size, max_shared_memory);
uint64_t full_sm =
get_buffer_size_full_sm_programmable_bootstrap_tbc<__uint128_t>(
polynomial_size);
uint64_t partial_sm =
get_buffer_size_partial_sm_programmable_bootstrap_tbc<__uint128_t>(
polynomial_size);
uint64_t minimum_sm_tbc = 0;
if (supports_dsm)
minimum_sm_tbc =
get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap<
__uint128_t>(polynomial_size);
uint64_t partial_dm = full_sm - partial_sm;
uint64_t full_dm = full_sm;
uint64_t device_mem = 0;
// There is a minimum amount of memory we need to run the TBC PBS, which
// is minimum_sm_tbc. We know that minimum_sm_tbc bytes are available
// because otherwise the previous check would have redirected
// computation to some other variant. If over that we don't have more
// partial_sm bytes, TBC PBS will run on NOSM. If we have partial_sm but
// not full_sm bytes, it will run on PARTIALSM. Otherwise, FULLSM.
//
// NOSM mode actually requires minimum_sm_tbc shared memory bytes.
if (max_shared_memory < partial_sm + minimum_sm_tbc) {
device_mem = full_dm * input_lwe_ciphertext_count * level_count *
(glwe_dimension + 1);
} else if (max_shared_memory < full_sm + minimum_sm_tbc) {
device_mem = partial_dm * input_lwe_ciphertext_count * level_count *
(glwe_dimension + 1);
}
// Otherwise, both kernels run all in shared memory
d_mem = (int8_t *)cuda_malloc_async(device_mem, stream, gpu_index);
global_join_buffer = (double *)cuda_malloc_async(
global_join_buffer_size, stream, gpu_index);
} break;
#endif
default:
PANIC("Cuda error (PBS): unsupported implementation variant.")
}
}
}
void release(cudaStream_t stream, uint32_t gpu_index) {
cuda_drop_async(d_mem, stream, gpu_index);
cuda_drop_async(global_join_buffer, stream, gpu_index);
if (pbs_variant == DEFAULT)
cuda_drop_async(global_accumulator, stream, gpu_index);
}
};
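A minimal lifetime sketch for this buffer (illustrative only; the stream, GPU index and num_samples are placeholders, with the GLWE parameters borrowed from the test parameters added later in this commit):
pbs_buffer_128<PBS_TYPE::CLASSICAL> *buf = new pbs_buffer_128<PBS_TYPE::CLASSICAL>(
    stream, gpu_index, /*glwe_dimension=*/2, /*polynomial_size=*/2048,
    /*level_count=*/3, /*input_lwe_ciphertext_count=*/num_samples,
    PBS_VARIANT::DEFAULT, /*allocate_gpu_memory=*/true);
// ... launch the 128-bit PBS kernels using buf->d_mem, buf->global_accumulator
// and buf->global_join_buffer ...
buf->release(stream, gpu_index); // frees d_mem, global_join_buffer and, for DEFAULT,
                                 // global_accumulator
delete buf;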
template <typename Torus>
uint64_t get_buffer_size_programmable_bootstrap_cg(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,

View File

@@ -20,6 +20,11 @@ void cuda_convert_lwe_programmable_bootstrap_key_64(
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size);
void cuda_convert_lwe_programmable_bootstrap_key_128(
void *stream, uint32_t gpu_index, void *dest, void const *src,
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size);
void scratch_cuda_programmable_bootstrap_amortized_32(
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size,
@@ -62,6 +67,11 @@ void scratch_cuda_programmable_bootstrap_64(
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
void scratch_cuda_programmable_bootstrap_128(
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_output_indexes, void const *lut_vector,
@@ -80,7 +90,19 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride);
void cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_output_indexes, void const *lut_vector,
void const *lut_vector_indexes, void const *lwe_array_in,
void const *lwe_input_indexes, void const *bootstrapping_key,
int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride);
void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index,
int8_t **pbs_buffer);
void cleanup_cuda_programmable_bootstrap_128(void *stream, uint32_t gpu_index,
int8_t **pbs_buffer);
}
#endif // CUDA_BOOTSTRAP_H

View File

@@ -3,6 +3,7 @@
#include "crypto/torus.cuh"
#include "device.h"
#include "fft128/f128.cuh"
#include <cstdint>
/**
@@ -42,6 +43,13 @@ public:
}
}
__device__ void decompose_and_compress_next_128(double *result) {
for (int j = 0; j < num_poly; j++) {
auto result_slice = result + j * params::degree / 2 * 4;
decompose_and_compress_next_polynomial_128(result_slice, j);
}
}
// Decomposes a single polynomial
__device__ void decompose_and_compress_next_polynomial(double2 *result,
int j) {
@@ -75,10 +83,58 @@ public:
synchronize_threads_in_block();
}
// Decomposes a single polynomial
__device__ void decompose_and_compress_next_polynomial_128(double *result,
int j) {
uint32_t tid = threadIdx.x;
auto state_slice = &state[j * params::degree];
for (int i = 0; i < params::opt / 2; i++) {
auto input1 = &state_slice[tid];
auto input2 = &state_slice[tid + params::degree / 2];
T res_re = *input1 & mask_mod_b;
T res_im = *input2 & mask_mod_b;
*input1 >>= base_log; // Update state
*input2 >>= base_log; // Update state
T carry_re = ((res_re - 1ll) | *input1) & res_re;
T carry_im = ((res_im - 1ll) | *input2) & res_im;
carry_re >>= (base_log - 1);
carry_im >>= (base_log - 1);
*input1 += carry_re; // Update state
*input2 += carry_im; // Update state
res_re -= carry_re << base_log;
res_im -= carry_im << base_log;
auto out_re = u128_to_signed_to_f128(res_re);
auto out_im = u128_to_signed_to_f128(res_im);
auto out_re_hi = result + 0 * params::degree / 2;
auto out_re_lo = result + 1 * params::degree / 2;
auto out_im_hi = result + 2 * params::degree / 2;
auto out_im_lo = result + 3 * params::degree / 2;
out_re_hi[tid] = out_re.hi;
out_re_lo[tid] = out_re.lo;
out_im_hi[tid] = out_im.hi;
out_im_lo[tid] = out_im.lo;
tid += params::degree / params::opt;
}
synchronize_threads_in_block();
}
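The carry logic above produces signed digits in [-B/2, B/2] rather than [0, B); a standalone scalar walk-through of one step (assuming base_log = 3, so B = 8 and mask_mod_b = 7):
#include <cassert>
#include <cstdint>
int main() {
  const uint32_t base_log = 3;
  uint64_t state = 0b101110, mask_mod_b = (1ull << base_log) - 1;
  uint64_t res = state & mask_mod_b;             // raw digit: 6
  state >>= base_log;                            // remaining state: 0b101
  uint64_t carry = ((res - 1ull) | state) & res; // bit (base_log - 1) set iff we round up
  carry >>= (base_log - 1);                      // carry = 1 since 6 >= B / 2
  state += carry;                                // propagate the rounding carry
  int64_t digit = (int64_t)res - (int64_t)(carry << base_log); // 6 - 8 = -2
  assert(digit == -2 && state == 0b110);
  return 0;
}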
__device__ void decompose_and_compress_level(double2 *result, int level) {
for (int i = 0; i < level_count - level; i++)
decompose_and_compress_next(result);
}
__device__ void decompose_and_compress_level_128(double *result, int level) {
for (int i = 0; i < level_count - level; i++)
decompose_and_compress_next_128(result);
}
};
template <typename Torus>

View File

@@ -21,7 +21,7 @@ struct alignas(16) f128 {
#else
double s = a + b;
return f128(s, b - (s - a));
#endif;
#endif
}
// Two-sum
@@ -270,7 +270,7 @@ __host__ __device__ inline double bits_to_double(uint64_t bits) {
return d;
}
__host__ __device__ double u128_to_f64(__uint128_t x) {
__host__ __device__ inline double u128_to_f64(__uint128_t x) {
const __uint128_t ONE = 1;
const double A = ONE << 52;
const double B = ONE << 104;
@@ -322,7 +322,7 @@ __host__ __device__ double u128_to_f64(__uint128_t x) {
}
}
__host__ __device__ __uint128_t f64_to_u128(const double f) {
__host__ __device__ inline __uint128_t f64_to_u128(const double f) {
const __uint128_t ONE = 1;
const uint64_t f_bits = double_to_bits(f);
if (f_bits < 1023ull << 52) {
@@ -338,7 +338,7 @@ __host__ __device__ __uint128_t f64_to_u128(const double f) {
}
}
__host__ __device__ __uint128_t f64_to_i128(const double f) {
__host__ __device__ inline __uint128_t f64_to_i128(const double f) {
// Get raw bits of the double
const uint64_t f_bits = double_to_bits(f);
@@ -366,14 +366,14 @@ __host__ __device__ __uint128_t f64_to_i128(const double f) {
return (f_bits >> 63) ? -result : result;
}
__host__ __device__ double i128_to_f64(__int128_t const x) {
__host__ __device__ inline double i128_to_f64(__int128_t const x) {
uint64_t sign = static_cast<uint64_t>(x >> 64) & (1ULL << 63);
__uint128_t abs =
(x < 0) ? static_cast<__uint128_t>(-x) : static_cast<__uint128_t>(x);
return bits_to_double(double_to_bits(u128_to_f64(abs)) | sign);
}
__host__ __device__ f128 u128_to_signed_to_f128(__uint128_t x) {
__host__ __device__ inline f128 u128_to_signed_to_f128(__uint128_t x) {
const double first_approx = i128_to_f64(x);
const uint64_t sign_bit = double_to_bits(first_approx) & (1ull << 63);
const __uint128_t first_approx_roundtrip =
@@ -387,7 +387,7 @@ __host__ __device__ f128 u128_to_signed_to_f128(__uint128_t x) {
return f128(first_approx, correction);
}
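The f128 returned here is a double-double: the value is the unevaluated sum hi + lo, so a 128-bit torus element keeps roughly 2 x 53 bits of precision through the FFT instead of the ~53 bits a single double would carry. A tiny host-side illustration using the names from this file (the input value is an arbitrary placeholder):
__uint128_t x = ((__uint128_t)0x0123456789abcdefULL << 64) | 0xfedcba9876543210ULL;
f128 split = u128_to_signed_to_f128(x); // split.hi holds the leading ~53 bits,
                                        // split.lo the next ~53 bits of correction
double rough = u128_to_f64(x);          // single-double approximation, for comparison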
__host__ __device__ __uint128_t u128_from_torus_f128(const f128 &a) {
__host__ __device__ inline __uint128_t u128_from_torus_f128(const f128 &a) {
auto x = f128::sub_estimate(a, f128::f128_floor(a));
const double normalization = 340282366920938500000000000000000000000.;
#ifdef __CUDA_ARCH__

View File

@@ -272,6 +272,9 @@ batch_convert_u128_to_f128_as_integer(double *out_re_hi, double *out_re_lo,
}
// params is expected to be full degree not half degree
// converts standard input into complex f128 values, each represented by 4 doubles,
// with the following pattern: [re_hi_0, re_hi_1, ..., re_hi_n, re_lo_0, re_lo_1,
// ..., re_lo_n, im_hi_0, im_hi_1, ..., im_hi_n, im_lo_0, im_lo_1, ..., im_lo_n]
template <class params>
__global__ void
batch_convert_u128_to_f128_as_torus(double *out_re_hi, double *out_re_lo,
@@ -287,6 +290,29 @@ batch_convert_u128_to_f128_as_torus(double *out_re_hi, double *out_re_lo,
&in[blockIdx.x * params::degree + params::degree / 2]);
}
// params is expected to be full degree not half degree
// converts standard input into complex f128 values, each represented by 4 doubles,
// stored in one contiguous chunk per polynomial, each chunk laid out as:
// [re_hi_0, ..., re_hi_n, re_lo_0, ..., re_lo_n, im_hi_0, ..., im_hi_n,
//  im_lo_0, ..., im_lo_n]
template <class params>
__global__ void
batch_convert_u128_to_f128_strided_as_torus(double *d_out,
const __uint128_t *d_in) {
constexpr size_t chunk_size = params::degree / 2 * 4;
double *chunk = &d_out[blockIdx.x * chunk_size];
double *out_re_hi = &chunk[0 * params::degree / 2];
double *out_re_lo = &chunk[1 * params::degree / 2];
double *out_im_hi = &chunk[2 * params::degree / 2];
double *out_im_lo = &chunk[3 * params::degree / 2];
convert_u128_to_f128_as_torus<params>(
out_re_hi, out_re_lo, out_im_hi, out_im_lo,
&d_in[blockIdx.x * params::degree],
&d_in[blockIdx.x * params::degree + params::degree / 2]);
}
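A launch sketch for this kernel, following the configuration used by the host code later in this commit (one block per polynomial, params::degree / params::opt threads per block; AmortizedDegree<2048> and the pointer names are example choices):
// Sketch only:
using P = AmortizedDegree<2048>;
batch_convert_u128_to_f128_strided_as_torus<P>
    <<<num_polynomials, P::degree / P::opt, 0, stream>>>(d_out, d_in);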
// params is expected to be full degree not half degree
template <class params>
__global__ void batch_convert_f128_to_u128_as_torus(__uint128_t *out,
@@ -309,7 +335,7 @@ __global__ void
batch_NSMFFT_128(double *in_re_hi, double *in_re_lo, double *in_im_hi,
double *in_im_lo, double *out_re_hi, double *out_re_lo,
double *out_im_hi, double *out_im_lo, double *buffer) {
extern __shared__ double sharedMemoryFFT[];
extern __shared__ double sharedMemoryFFT128[];
double *re_hi, *re_lo, *im_hi, *im_lo;
if (SMD == NOSM) {
@@ -322,10 +348,10 @@ batch_NSMFFT_128(double *in_re_hi, double *in_re_lo, double *in_im_hi,
im_lo =
&buffer[blockIdx.x * params::degree / 2 * 4 + params::degree / 2 * 3];
} else {
re_hi = &sharedMemoryFFT[params::degree / 2 * 0];
re_lo = &sharedMemoryFFT[params::degree / 2 * 1];
im_hi = &sharedMemoryFFT[params::degree / 2 * 2];
im_lo = &sharedMemoryFFT[params::degree / 2 * 3];
re_hi = &sharedMemoryFFT128[params::degree / 2 * 0];
re_lo = &sharedMemoryFFT128[params::degree / 2 * 1];
im_hi = &sharedMemoryFFT128[params::degree / 2 * 2];
im_lo = &sharedMemoryFFT128[params::degree / 2 * 3];
}
Index tid = threadIdx.x;
@@ -356,6 +382,70 @@ batch_NSMFFT_128(double *in_re_hi, double *in_re_lo, double *in_im_hi,
}
}
template <class params, sharedMemDegree SMD>
__global__ void batch_NSMFFT_strided_128(double *d_in, double *d_out,
double *buffer) {
extern __shared__ double sharedMemoryFFT128[];
double *re_hi, *re_lo, *im_hi, *im_lo;
if (SMD == NOSM) {
re_hi =
&buffer[blockIdx.x * params::degree / 2 * 4 + params::degree / 2 * 0];
re_lo =
&buffer[blockIdx.x * params::degree / 2 * 4 + params::degree / 2 * 1];
im_hi =
&buffer[blockIdx.x * params::degree / 2 * 4 + params::degree / 2 * 2];
im_lo =
&buffer[blockIdx.x * params::degree / 2 * 4 + params::degree / 2 * 3];
} else {
re_hi = &sharedMemoryFFT128[params::degree / 2 * 0];
re_lo = &sharedMemoryFFT128[params::degree / 2 * 1];
im_hi = &sharedMemoryFFT128[params::degree / 2 * 2];
im_lo = &sharedMemoryFFT128[params::degree / 2 * 3];
}
constexpr size_t chunk_size = params::degree / 2 * 4;
double *chunk = &d_in[blockIdx.x * chunk_size];
double *tmp_re_hi = &chunk[0 * params::degree / 2];
double *tmp_re_lo = &chunk[1 * params::degree / 2];
double *tmp_im_hi = &chunk[2 * params::degree / 2];
double *tmp_im_lo = &chunk[3 * params::degree / 2];
Index tid = threadIdx.x;
#pragma unroll
for (Index i = 0; i < params::opt / 2; ++i) {
re_hi[tid] = tmp_re_hi[tid];
re_lo[tid] = tmp_re_lo[tid];
im_hi[tid] = tmp_im_hi[tid];
im_lo[tid] = tmp_im_lo[tid];
tid += params::degree / params::opt;
}
__syncthreads();
if constexpr (params::fft_direction == 1) {
negacyclic_backward_fft_f128<HalfDegree<params>>(re_hi, re_lo, im_hi,
im_lo);
} else {
negacyclic_forward_fft_f128<HalfDegree<params>>(re_hi, re_lo, im_hi, im_lo);
}
__syncthreads();
chunk = &d_out[blockIdx.x * chunk_size];
tmp_re_hi = &chunk[0 * params::degree / 2];
tmp_re_lo = &chunk[1 * params::degree / 2];
tmp_im_hi = &chunk[2 * params::degree / 2];
tmp_im_lo = &chunk[3 * params::degree / 2];
tid = threadIdx.x;
#pragma unroll
for (Index i = 0; i < params::opt / 2; ++i) {
tmp_re_hi[tid] = re_hi[tid];
tmp_re_lo[tid] = re_lo[tid];
tmp_im_hi[tid] = im_hi[tid];
tmp_im_lo[tid] = im_lo[tid];
tid += params::degree / params::opt;
}
}
template <class params>
__host__ void host_fourier_transform_forward_as_integer_f128(
cudaStream_t stream, uint32_t gpu_index, double *re0, double *re1,

View File

@@ -22,6 +22,18 @@ void cuda_convert_lwe_programmable_bootstrap_key_64(
(const int64_t *)src, polynomial_size, total_polynomials);
}
void cuda_convert_lwe_programmable_bootstrap_key_128(
void *stream, uint32_t gpu_index, void *dest, void const *src,
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size) {
uint32_t total_polynomials =
input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * level_count;
cuda_convert_lwe_programmable_bootstrap_key_u128(
static_cast<cudaStream_t>(stream), gpu_index, (double *)dest,
(const __uint128_t *)src, polynomial_size, total_polynomials);
}
void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
void *stream, uint32_t gpu_index, void *dest, void const *src,
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,

View File

@@ -3,6 +3,8 @@
#include "device.h"
#include "fft/bnsmfft.cuh"
#include "fft128/fft128.cuh"
#include "pbs/programmable_bootstrap.h"
#include "pbs/programmable_bootstrap_multibit.h"
#include "polynomial/parameters.cuh"
@@ -16,6 +18,13 @@ __device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
level_count;
}
__device__ inline int get_start_ith_ggsw_128(int i, uint32_t polynomial_size,
int glwe_dimension,
uint32_t level_count) {
return i * polynomial_size / 2 * 4 * (glwe_dimension + 1) *
(glwe_dimension + 1) * level_count;
}
////////////////////////////////////////////////
template <typename T>
__device__ const T *get_ith_mask_kth_block(const T *ptr, int i, int k,
@@ -39,6 +48,31 @@ __device__ T *get_ith_mask_kth_block(T *ptr, int i, int k, int level,
(glwe_dimension + 1) * (glwe_dimension + 1) +
k * polynomial_size / 2 * (glwe_dimension + 1)];
}
template <typename T>
__device__ const T *
get_ith_mask_kth_block_128(const T *ptr, int i, int k, int level,
uint32_t polynomial_size, int glwe_dimension,
uint32_t level_count) {
return &ptr[get_start_ith_ggsw_128(i, polynomial_size, glwe_dimension,
level_count) +
(level_count - level - 1) * polynomial_size / 2 * 4 *
(glwe_dimension + 1) * (glwe_dimension + 1) +
k * polynomial_size / 2 * 4 * (glwe_dimension + 1)];
}
template <typename T>
__device__ T *get_ith_mask_kth_block_128(T *ptr, int i, int k, int level,
uint32_t polynomial_size,
int glwe_dimension,
uint32_t level_count) {
return &ptr[get_start_ith_ggsw_128(i, polynomial_size, glwe_dimension,
level_count) +
(level_count - level - 1) * polynomial_size / 2 * 4 *
(glwe_dimension + 1) * (glwe_dimension + 1) +
k * polynomial_size / 2 * 4 * (glwe_dimension + 1)];
}
template <typename T>
__device__ T *get_ith_body_kth_block(T *ptr, int i, int k, int level,
uint32_t polynomial_size,
@@ -251,4 +285,93 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
cudaFreeHost(h_bsk);
}
template <class params>
void convert_u128_to_f128_and_forward_fft_128(cudaStream_t stream,
uint32_t gpu_index, double *d_bsk,
__uint128_t const *d_standard,
uint32_t number_of_samples) {
cuda_set_device(gpu_index);
size_t required_shared_memory_size = sizeof(double) * params::degree / 2 * 4;
int grid_size = number_of_samples;
int block_size = params::degree / params::opt;
bool full_sm =
(required_shared_memory_size <= cuda_get_max_shared_memory(gpu_index));
size_t buffer_size =
full_sm ? 0 : (size_t)number_of_samples * params::degree / 2 * 4;
size_t shared_memory_size = full_sm ? required_shared_memory_size : 0;
double *buffer = (double *)cuda_malloc_async(buffer_size, stream, gpu_index);
// configure shared memory for batch fft kernel
if (full_sm) {
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT_strided_128<FFTDegree<params, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_NSMFFT_strided_128<FFTDegree<params, ForwardFFT>, FULLSM>,
cudaFuncCachePreferShared));
}
// convert u128 into 4 x double
batch_convert_u128_to_f128_strided_as_torus<params>
<<<grid_size, block_size, 0, stream>>>(d_bsk, d_standard);
// call negacyclic 128 bit forward fft.
if (full_sm) {
batch_NSMFFT_strided_128<FFTDegree<params, ForwardFFT>, FULLSM>
<<<grid_size, block_size, shared_memory_size, stream>>>(d_bsk, d_bsk,
buffer);
} else {
batch_NSMFFT_strided_128<FFTDegree<params, ForwardFFT>, NOSM>
<<<grid_size, block_size, shared_memory_size, stream>>>(d_bsk, d_bsk,
buffer);
}
cuda_drop_async(buffer, stream, gpu_index);
}
inline void cuda_convert_lwe_programmable_bootstrap_key_u128(
cudaStream_t stream, uint32_t gpu_index, double *dest,
__uint128_t const *src, uint32_t polynomial_size,
uint32_t total_polynomials) {
// Here the buffer size is sizeof(double) times the number of polynomials times
// 4 times polynomial_size / 2: each polynomial is represented in the complex
// domain by 4 double arrays of size polynomial_size / 2, on which the FFT runs
size_t buffer_size =
total_polynomials * polynomial_size / 2 * sizeof(double) * 4;
__uint128_t *d_standard =
(__uint128_t *)cuda_malloc_async(buffer_size, stream, gpu_index);
cuda_memcpy_async_to_gpu(d_standard, src, buffer_size, stream, gpu_index);
switch (polynomial_size) {
case 256:
convert_u128_to_f128_and_forward_fft_128<AmortizedDegree<256>>(
stream, gpu_index, dest, d_standard, total_polynomials);
break;
case 512:
convert_u128_to_f128_and_forward_fft_128<AmortizedDegree<512>>(
stream, gpu_index, dest, d_standard, total_polynomials);
break;
case 1024:
convert_u128_to_f128_and_forward_fft_128<AmortizedDegree<1024>>(
stream, gpu_index, dest, d_standard, total_polynomials);
break;
case 2048:
convert_u128_to_f128_and_forward_fft_128<AmortizedDegree<2048>>(
stream, gpu_index, dest, d_standard, total_polynomials);
break;
case 4096:
convert_u128_to_f128_and_forward_fft_128<AmortizedDegree<4096>>(
stream, gpu_index, dest, d_standard, total_polynomials);
break;
default:
PANIC("Cuda error (convert BSK): unsupported polynomial size. Supported "
"N's are powers of two in the interval [256..4096].")
}
cuda_drop_async(d_standard, stream, gpu_index);
}
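A worked size example for this staging path (assuming the FFT128_U128_GPU_PARAMS test parameters added later in this commit: n = 879, k = 2, l = 3, N = 2048):
// total_polynomials = 879 * (2 + 1) * (2 + 1) * 3        = 23733
// bytes per polynomial = 2048 / 2 * sizeof(double) * 4   = 32768
// buffer_size = 23733 * 32768 bytes                      ~ 742 MiB
// Note that 32768 also equals 2048 * sizeof(__uint128_t), so d_standard and the
// Fourier-domain destination occupy the same number of bytes per polynomial.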
#endif // CNCRT_BSK_H

View File

@@ -0,0 +1,205 @@
#include "programmable_bootstrap_classic_128.cuh"
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the PBS on 128-bit inputs, into `buffer`. It also configures SM options on
* the GPU in case FULLSM or PARTIALSM mode is going to be used.
*/
void scratch_cuda_programmable_bootstrap_128(
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
auto buffer = (pbs_buffer_128<CLASSICAL> **)pbs_buffer;
switch (polynomial_size) {
case 256:
scratch_programmable_bootstrap_128<AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
break;
case 512:
scratch_programmable_bootstrap_128<AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
break;
case 1024:
scratch_programmable_bootstrap_128<AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
break;
case 2048:
scratch_programmable_bootstrap_128<AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
break;
case 4096:
scratch_programmable_bootstrap_128<AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
break;
default:
PANIC("Cuda error (classical PBS): unsupported polynomial size. "
"Supported N's are powers of two"
" in the interval [256..4096].")
}
}
template <typename Torus>
void executor_cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lwe_input_indexes, double const *bootstrapping_key,
pbs_buffer_128<CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride) {
switch (polynomial_size) {
case 256:
host_programmable_bootstrap_128<AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_many_lut, lut_stride);
break;
case 512:
host_programmable_bootstrap_128<AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_many_lut, lut_stride);
break;
case 1024:
host_programmable_bootstrap_128<AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_many_lut, lut_stride);
break;
case 2048:
host_programmable_bootstrap_128<AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_many_lut, lut_stride);
break;
case 4096:
host_programmable_bootstrap_128<AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_many_lut, lut_stride);
break;
default:
PANIC("Cuda error (classical PBS): unsupported polynomial size. "
"Supported N's are powers of two"
" in the interval [256..4096].")
}
}
/* Perform bootstrapping on a batch of input u128 LWE ciphertexts.
*
* - `stream` is a void pointer to the Cuda stream to be used in the kernel
* launch
* - `gpu_index` is the index of the GPU to be used in the kernel launch
* - lwe_array_out: output batch of num_samples bootstrapped ciphertexts c =
* (a0,..an-1,b) where n is the LWE dimension
* - lut_vector: should hold as many luts of size polynomial_size
* as there are input ciphertexts, but actually holds
* num_luts vectors to reduce memory usage
* - lut_vector_indexes: stores the index corresponding to
* which lut to use for each sample in
* lut_vector
* - lwe_array_in: input batch of num_samples LWE ciphertexts, containing n
* mask values + 1 body value
* - bootstrapping_key: GGSW encryption of the LWE secret key sk1
* under secret key sk2
* bsk = Z + sk1 H
* where H is the gadget matrix and Z is a matrix (k+1).l
* containing GLWE encryptions of 0 under sk2.
* bsk is thus a tensor of size (k+1)^2.l.N.n
* where l is the number of decomposition levels and
* k is the GLWE dimension, N is the polynomial size for
* GLWE. The polynomial size for GLWE and the lut
* are the same because they have to be in the same ring
* to be multiplied.
* - lwe_dimension: size of the Torus vector used to encrypt the input
* LWE ciphertexts - referred to as n above (~ 600)
* - glwe_dimension: size of the polynomial vector used to encrypt the LUT
* GLWE ciphertexts - referred to as k above. Only the value 1 is supported for
* this parameter.
* - polynomial_size: size of the test polynomial (lut) and size of the
* GLWE polynomial (~1024)
* - base_log: log base used for the gadget matrix - B = 2^base_log (~8)
* - level_count: number of decomposition levels in the gadget matrix (~4)
* - num_samples: number of encrypted input messages
*
* This function calls a wrapper to a device kernel that performs the
* bootstrapping:
* - the kernel is templatized based on integer discretization and
* polynomial degree
* - num_samples * level_count * (glwe_dimension + 1) blocks of threads are
* launched, where each thread is going to handle one or more polynomial
* coefficients at each stage, for a given level of decomposition, either for
* the LUT mask or its body:
* - perform the blind rotation
* - round the result
* - get the decomposition for the current level
* - switch to the FFT domain
* - multiply with the bootstrapping key
* - come back to the coefficients representation
* - between each stage a synchronization of the threads is necessary (some
* synchronizations happen at the block level, some happen between blocks, using
* cooperative groups).
* - in case the device has enough shared memory, temporary arrays used for
* the different stages (accumulators) are stored into the shared memory
* - the accumulators serve to combine the results for all decomposition
* levels
* - the constant memory (64K) is used for storing the roots of identity
* values for the FFT
*/
void cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_output_indexes, void const *lut_vector,
void const *lut_vector_indexes, void const *lwe_array_in,
void const *lwe_input_indexes, void const *bootstrapping_key,
int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride) {
if (base_log > 64)
PANIC("Cuda error (classical PBS): base log should be <= 64")
pbs_buffer_128<CLASSICAL> *buffer = (pbs_buffer_128<CLASSICAL> *)mem_ptr;
executor_cuda_programmable_bootstrap_lwe_ciphertext_vector_128<__uint128_t>(
stream, gpu_index, static_cast<__uint128_t *>(lwe_array_out),
static_cast<const __uint128_t *>(lwe_output_indexes),
static_cast<const __uint128_t *>(lut_vector),
static_cast<const __uint128_t *>(lut_vector_indexes),
static_cast<const __uint128_t *>(lwe_array_in),
static_cast<const __uint128_t *>(lwe_input_indexes),
static_cast<const double *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_many_lut, lut_stride);
}
/*
* This cleanup function frees the data on GPU for the 128-bit PBS buffer.
*/
void cleanup_cuda_programmable_bootstrap_128(void *stream, uint32_t gpu_index,
int8_t **buffer) {
auto x = (pbs_buffer_128<CLASSICAL> *)(*buffer);
x->release(static_cast<cudaStream_t>(stream), gpu_index);
}
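Taken together, the new C entry points declared in programmable_bootstrap.h are expected to be called in this order from the host. A minimal sketch (device pointers, index arrays and the stream are assumed to be prepared by the caller; all names other than the four API functions are placeholders):
int8_t *pbs_mem = nullptr;
cuda_convert_lwe_programmable_bootstrap_key_128(
    stream, gpu_index, d_fourier_bsk, d_std_bsk, lwe_dimension, glwe_dimension,
    level_count, polynomial_size);
scratch_cuda_programmable_bootstrap_128(
    stream, gpu_index, &pbs_mem, glwe_dimension, polynomial_size, level_count,
    num_samples, /*allocate_gpu_memory=*/true);
cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
    stream, gpu_index, d_lwe_out, d_out_indexes, d_lut, d_lut_indexes,
    d_lwe_in, d_in_indexes, d_fourier_bsk, pbs_mem, lwe_dimension,
    glwe_dimension, polynomial_size, base_log, level_count, num_samples,
    /*num_many_lut=*/1, /*lut_stride=*/0);
cleanup_cuda_programmable_bootstrap_128(stream, gpu_index, &pbs_mem);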

View File

@@ -0,0 +1,488 @@
#ifndef CUDA_PBS_CUH_128
#define CUDA_PBS_CUH_128
#ifdef __CDT_PARSER__
#undef __CUDA_RUNTIME_H__
#include <cuda_runtime.h>
#endif
#include "crypto/gadget.cuh"
#include "crypto/torus.cuh"
#include "device.h"
#include "fft128/fft128.cuh"
#include "pbs/bootstrapping_key.cuh"
#include "pbs/pbs_utilities.h"
#include "pbs/programmable_bootstrap.h"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial_math.cuh"
#include "types/complex/operations.cuh"
template <typename Torus, class params, sharedMemDegree SMD>
__global__ void __launch_bounds__(params::degree / params::opt)
device_programmable_bootstrap_step_one_128(
const Torus *__restrict__ lut_vector,
const Torus *__restrict__ lut_vector_indexes,
const Torus *__restrict__ lwe_array_in,
const Torus *__restrict__ lwe_input_indexes,
const double *__restrict__ bootstrapping_key, Torus *global_accumulator,
double *global_join_buffer, uint32_t lwe_iteration,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, int8_t *device_mem,
uint64_t device_memory_size_per_block) {
// We use shared memory for the polynomials that are used often during the
// bootstrap, since shared memory is kept in L1 cache and accessing it is
// much faster than global memory
extern __shared__ int8_t sharedmem[];
int8_t *selected_memory;
uint32_t glwe_dimension = gridDim.y - 1;
if constexpr (SMD == FULLSM) {
selected_memory = sharedmem;
} else {
int block_index = blockIdx.z + blockIdx.y * gridDim.z +
blockIdx.x * gridDim.z * gridDim.y;
selected_memory = &device_mem[block_index * device_memory_size_per_block];
}
Torus *accumulator = (Torus *)selected_memory;
double *accumulator_fft =
(double *)accumulator +
(ptrdiff_t)(sizeof(Torus) * polynomial_size / sizeof(double));
if constexpr (SMD == PARTIALSM)
accumulator_fft = (double *)sharedmem;
// The third dimension of the block is used to determine on which ciphertext
// this block is operating, in the case of batch bootstraps
const Torus *block_lwe_array_in =
&lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
const Torus *block_lut_vector =
&lut_vector[lut_vector_indexes[blockIdx.x] * params::degree *
(glwe_dimension + 1)];
Torus *global_slice =
global_accumulator +
(blockIdx.y + blockIdx.x * (glwe_dimension + 1)) * params::degree;
double *global_fft_slice =
global_join_buffer + (blockIdx.y + blockIdx.z * (glwe_dimension + 1) +
blockIdx.x * level_count * (glwe_dimension + 1)) *
(polynomial_size / 2) * 4;
if (lwe_iteration == 0) {
// First iteration
// Put "b" in [0, 2N[
Torus b_hat = 0;
modulus_switch(block_lwe_array_in[lwe_dimension], b_hat,
params::log2_degree + 1);
// The y-dimension is used to select the element of the GLWE this block will
// compute
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
false);
// Persist
int tid = threadIdx.x;
for (int i = 0; i < params::opt; i++) {
global_slice[tid] = accumulator[tid];
tid += params::degree / params::opt;
}
}
// Put "a" in [0, 2N[
Torus a_hat = 0;
modulus_switch(block_lwe_array_in[lwe_iteration], a_hat,
params::log2_degree + 1); // 2 * params::log2_degree + 1);
synchronize_threads_in_block();
// Perform ACC * (X^{a_hat} - 1)
multiply_by_monomial_negacyclic_and_sub_polynomial<
Torus, params::opt, params::degree / params::opt>(global_slice,
accumulator, a_hat);
// Perform a rounding to increase the accuracy of the
// bootstrapped ciphertext
init_decomposer_state_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, base_log, level_count);
synchronize_threads_in_block();
// Decompose the accumulator. Each block gets one level of the
// decomposition, for the mask and the body (so block 0 will have the
// accumulator decomposed at level 0, 1 at 1, etc.)
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator);
gadget_acc.decompose_and_compress_level_128(accumulator_fft, blockIdx.z);
// We are using the same memory space for accumulator_fft and
// accumulator_rotated, so we need to synchronize here to make sure they
// don't modify the same memory space at the same time
// Switch to the FFT space
auto acc_fft_re_hi = accumulator_fft + 0 * params::degree / 2;
auto acc_fft_re_lo = accumulator_fft + 1 * params::degree / 2;
auto acc_fft_im_hi = accumulator_fft + 2 * params::degree / 2;
auto acc_fft_im_lo = accumulator_fft + 3 * params::degree / 2;
auto global_fft_re_hi = global_fft_slice + 0 * params::degree / 2;
auto global_fft_re_lo = global_fft_slice + 1 * params::degree / 2;
auto global_fft_im_hi = global_fft_slice + 2 * params::degree / 2;
auto global_fft_im_lo = global_fft_slice + 3 * params::degree / 2;
negacyclic_forward_fft_f128<HalfDegree<params>>(acc_fft_re_hi, acc_fft_re_lo,
acc_fft_im_hi, acc_fft_im_lo);
int tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
global_fft_re_hi[tid] = acc_fft_re_hi[tid];
global_fft_re_lo[tid] = acc_fft_re_lo[tid];
global_fft_im_hi[tid] = acc_fft_im_hi[tid];
global_fft_im_lo[tid] = acc_fft_im_lo[tid];
tid += params::degree / params::opt;
}
}
template <typename Torus, class params, sharedMemDegree SMD>
__global__ void __launch_bounds__(params::degree / params::opt)
device_programmable_bootstrap_step_two_128(
Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
const Torus *__restrict__ lut_vector,
const Torus *__restrict__ lut_vector_indexes,
const double *__restrict__ bootstrapping_key, Torus *global_accumulator,
double *global_join_buffer, uint32_t lwe_iteration,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, int8_t *device_mem,
uint64_t device_memory_size_per_block, uint32_t num_many_lut,
uint32_t lut_stride) {
// We use shared memory for the polynomials that are used often during the
// bootstrap, since shared memory is kept in L1 cache and accessing it is
// much faster than global memory
extern __shared__ int8_t sharedmem[];
int8_t *selected_memory;
uint32_t glwe_dimension = gridDim.y - 1;
if constexpr (SMD == FULLSM) {
selected_memory = sharedmem;
} else {
int block_index = blockIdx.x + blockIdx.y * gridDim.x +
blockIdx.z * gridDim.x * gridDim.y;
selected_memory = &device_mem[block_index * device_memory_size_per_block];
}
// We always compute the pointer with most restrictive alignment to avoid
// alignment issues
Torus *accumulator = (Torus *)selected_memory;
double *accumulator_fft =
(double *)accumulator +
(ptrdiff_t)(sizeof(Torus) * params::degree / sizeof(double));
if constexpr (SMD == PARTIALSM)
accumulator_fft = (double *)sharedmem;
for (int level = 0; level < level_count; level++) {
double *global_fft_slice =
global_join_buffer + (level + blockIdx.x * level_count) *
(glwe_dimension + 1) * (params::degree / 2) *
4;
for (int j = 0; j < (glwe_dimension + 1); j++) {
double *fft = global_fft_slice + j * params::degree / 2 * 4;
// Get the bootstrapping key piece necessary for the multiplication
// It is already in the Fourier domain
auto bsk_slice = get_ith_mask_kth_block_128(
bootstrapping_key, lwe_iteration, j, level, polynomial_size,
glwe_dimension, level_count);
auto bsk_poly = bsk_slice + blockIdx.y * params::degree / 2 * 4;
polynomial_product_accumulate_in_fourier_domain_128<params>(
accumulator_fft, fft, bsk_poly, !level && !j);
}
}
Torus *global_slice =
global_accumulator +
(blockIdx.y + blockIdx.x * (glwe_dimension + 1)) * params::degree;
// Load the persisted accumulator
int tid = threadIdx.x;
for (int i = 0; i < params::opt; i++) {
accumulator[tid] = global_slice[tid];
tid += params::degree / params::opt;
}
// Perform the inverse FFT on the result of the GGSW x GLWE and add to the
// accumulator
auto acc_fft_re_hi = accumulator_fft + 0 * params::degree / 2;
auto acc_fft_re_lo = accumulator_fft + 1 * params::degree / 2;
auto acc_fft_im_hi = accumulator_fft + 2 * params::degree / 2;
auto acc_fft_im_lo = accumulator_fft + 3 * params::degree / 2;
negacyclic_backward_fft_f128<HalfDegree<params>>(
acc_fft_re_hi, acc_fft_re_lo, acc_fft_im_hi, acc_fft_im_lo);
add_to_torus_128<Torus, params>(acc_fft_re_hi, acc_fft_re_lo, acc_fft_im_hi,
acc_fft_im_lo, accumulator);
if (lwe_iteration + 1 == lwe_dimension) {
// Last iteration
auto block_lwe_array_out =
&lwe_array_out[lwe_output_indexes[blockIdx.x] *
(glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
if (blockIdx.y < glwe_dimension) {
// Perform a sample extract. At this point, all blocks have the result,
// but we do the computation at block 0 to avoid waiting for extra blocks,
// in case they're not synchronized
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
if (num_many_lut > 1) {
for (int i = 1; i < num_many_lut; i++) {
auto next_lwe_array_out =
lwe_array_out +
(i * gridDim.x * (glwe_dimension * polynomial_size + 1));
auto next_block_lwe_array_out =
&next_lwe_array_out[lwe_output_indexes[blockIdx.x] *
(glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
sample_extract_mask<Torus, params>(next_block_lwe_array_out,
accumulator, 1, i * lut_stride);
}
}
} else if (blockIdx.y == glwe_dimension) {
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
if (num_many_lut > 1) {
for (int i = 1; i < num_many_lut; i++) {
auto next_lwe_array_out =
lwe_array_out +
(i * gridDim.x * (glwe_dimension * polynomial_size + 1));
auto next_block_lwe_array_out =
&next_lwe_array_out[lwe_output_indexes[blockIdx.x] *
(glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
sample_extract_body<Torus, params>(next_block_lwe_array_out,
accumulator, 0, i * lut_stride);
}
}
}
} else {
// Persist the updated accumulator
tid = threadIdx.x;
for (int i = 0; i < params::opt; i++) {
global_slice[tid] = accumulator[tid];
tid += params::degree / params::opt;
}
}
}
template <typename params>
__host__ void scratch_programmable_bootstrap_128(
cudaStream_t stream, uint32_t gpu_index, pbs_buffer_128<CLASSICAL> **buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
cuda_set_device(gpu_index);
uint64_t full_sm_step_one =
get_buffer_size_full_sm_programmable_bootstrap_step_one<__uint128_t>(
polynomial_size);
uint64_t full_sm_step_two =
get_buffer_size_full_sm_programmable_bootstrap_step_two<__uint128_t>(
polynomial_size);
uint64_t partial_sm =
get_buffer_size_partial_sm_programmable_bootstrap<__uint128_t>(
polynomial_size);
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
// Configure step one
if (max_shared_memory >= partial_sm && max_shared_memory < full_sm_step_one) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_step_one_128<__uint128_t, params,
PARTIALSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_step_one_128<__uint128_t, params,
PARTIALSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else if (max_shared_memory >= partial_sm) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_step_one_128<__uint128_t, params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_step_one));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_step_one_128<__uint128_t, params, FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
}
// Configure step two
if (max_shared_memory >= partial_sm && max_shared_memory < full_sm_step_two) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_step_two_128<__uint128_t, params,
PARTIALSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_step_two_128<__uint128_t, params,
PARTIALSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else if (max_shared_memory >= partial_sm) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_step_two_128<__uint128_t, params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_step_two));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_step_two_128<__uint128_t, params, FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
}
*buffer = new pbs_buffer_128<CLASSICAL>(
stream, gpu_index, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, PBS_VARIANT::DEFAULT, allocate_gpu_memory);
}
template <class params>
__host__ void execute_step_one_128(
cudaStream_t stream, uint32_t gpu_index, __uint128_t const *lut_vector,
__uint128_t const *lut_vector_indexes, __uint128_t const *lwe_array_in,
__uint128_t const *lwe_input_indexes, double const *bootstrapping_key,
__uint128_t *global_accumulator, double *global_join_buffer,
uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, int8_t *d_mem, int lwe_iteration, uint64_t partial_sm,
uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm) {
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
cuda_set_device(gpu_index);
int thds = polynomial_size / params::opt;
dim3 grid(input_lwe_ciphertext_count, glwe_dimension + 1, level_count);
if (max_shared_memory < partial_sm) {
device_programmable_bootstrap_step_one_128<__uint128_t, params, NOSM>
<<<grid, thds, 0, stream>>>(
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
bootstrapping_key, global_accumulator, global_join_buffer,
lwe_iteration, lwe_dimension, polynomial_size, base_log,
level_count, d_mem, full_dm);
} else if (max_shared_memory < full_sm) {
device_programmable_bootstrap_step_one_128<__uint128_t, params, PARTIALSM>
<<<grid, thds, partial_sm, stream>>>(
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
bootstrapping_key, global_accumulator, global_join_buffer,
lwe_iteration, lwe_dimension, polynomial_size, base_log,
level_count, d_mem, partial_dm);
} else {
device_programmable_bootstrap_step_one_128<__uint128_t, params, FULLSM>
<<<grid, thds, full_sm, stream>>>(
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
bootstrapping_key, global_accumulator, global_join_buffer,
lwe_iteration, lwe_dimension, polynomial_size, base_log,
level_count, d_mem, 0);
}
check_cuda_error(cudaGetLastError());
}
template <class params>
__host__ void execute_step_two_128(
cudaStream_t stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
__uint128_t const *lwe_output_indexes, __uint128_t const *lut_vector,
__uint128_t const *lut_vector_indexes, double const *bootstrapping_key,
__uint128_t *global_accumulator, double *global_join_buffer,
uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, int8_t *d_mem, int lwe_iteration, uint64_t partial_sm,
uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm,
uint32_t num_many_lut, uint32_t lut_stride) {
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
cuda_set_device(gpu_index);
int thds = polynomial_size / params::opt;
dim3 grid(input_lwe_ciphertext_count, glwe_dimension + 1);
if (max_shared_memory < partial_sm) {
device_programmable_bootstrap_step_two_128<__uint128_t, params, NOSM>
<<<grid, thds, 0, stream>>>(
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
bootstrapping_key, global_accumulator, global_join_buffer,
lwe_iteration, lwe_dimension, polynomial_size, base_log,
level_count, d_mem, full_dm, num_many_lut, lut_stride);
} else if (max_shared_memory < full_sm) {
device_programmable_bootstrap_step_two_128<__uint128_t, params, PARTIALSM>
<<<grid, thds, partial_sm, stream>>>(
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
bootstrapping_key, global_accumulator, global_join_buffer,
lwe_iteration, lwe_dimension, polynomial_size, base_log,
level_count, d_mem, partial_dm, num_many_lut, lut_stride);
} else {
device_programmable_bootstrap_step_two_128<__uint128_t, params, FULLSM>
<<<grid, thds, full_sm, stream>>>(
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
bootstrapping_key, global_accumulator, global_join_buffer,
lwe_iteration, lwe_dimension, polynomial_size, base_log,
level_count, d_mem, 0, num_many_lut, lut_stride);
}
check_cuda_error(cudaGetLastError());
}
/*
* Host wrapper to the 128-bit programmable bootstrap
*/
template <class params>
__host__ void host_programmable_bootstrap_128(
cudaStream_t stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
__uint128_t const *lwe_output_indexes, __uint128_t const *lut_vector,
__uint128_t const *lut_vector_indexes, __uint128_t const *lwe_array_in,
__uint128_t const *lwe_input_indexes, double const *bootstrapping_key,
pbs_buffer_128<CLASSICAL> *pbs_buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
uint32_t num_many_lut, uint32_t lut_stride) {
cuda_set_device(gpu_index);
// With SM each block corresponds to either the mask or body, no need to
// duplicate data for each
uint64_t full_sm_step_one =
get_buffer_size_full_sm_programmable_bootstrap_step_one<__uint128_t>(
polynomial_size);
uint64_t full_sm_step_two =
get_buffer_size_full_sm_programmable_bootstrap_step_two<__uint128_t>(
polynomial_size);
uint64_t partial_sm =
get_buffer_size_partial_sm_programmable_bootstrap<__uint128_t>(
polynomial_size);
uint64_t partial_dm_step_one = full_sm_step_one - partial_sm;
uint64_t partial_dm_step_two = full_sm_step_two - partial_sm;
uint64_t full_dm_step_one = full_sm_step_one;
uint64_t full_dm_step_two = full_sm_step_two;
__uint128_t *global_accumulator = pbs_buffer->global_accumulator;
double *global_join_buffer = pbs_buffer->global_join_buffer;
int8_t *d_mem = pbs_buffer->d_mem;
for (int i = 0; i < lwe_dimension; i++) {
execute_step_one_128<params>(
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, global_accumulator,
global_join_buffer, input_lwe_ciphertext_count, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, d_mem, i,
partial_sm, partial_dm_step_one, full_sm_step_one, full_dm_step_one);
execute_step_two_128<params>(
stream, gpu_index, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, bootstrapping_key, global_accumulator,
global_join_buffer, input_lwe_ciphertext_count, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, d_mem, i,
partial_sm, partial_dm_step_two, full_sm_step_two, full_dm_step_two,
num_many_lut, lut_stride);
}
}
#endif // CUDA_PBS_CUH_128

View File

@@ -3,6 +3,7 @@
#include "crypto/torus.cuh"
#include "device.h"
#include "fft128/f128.cuh"
#include "parameters.cuh"
// Return A if C == 0 and B if C == 1
@@ -172,6 +173,35 @@ __device__ void add_to_torus(double2 *m_values, Torus *result,
}
}
/**
* In case of classical PBS, this method should accumulate the result.
* In case of multi-bit PBS, it should overwrite.
*/
template <typename Torus, class params>
__device__ void add_to_torus_128(double *re_hi, double *re_lo, double *im_hi,
double *im_lo, Torus *result,
bool overwrite_result = false) {
constexpr double normalization = 1. / (params::degree / 2);
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
f128 f128_real(re_hi[tid] * normalization, re_lo[tid] * normalization);
f128 f128_imag(im_hi[tid] * normalization, im_lo[tid] * normalization);
Torus torus_real = u128_from_torus_f128(f128_real);
Torus torus_imag = u128_from_torus_f128(f128_imag);
if (overwrite_result) {
result[tid] = torus_real;
result[tid + params::degree / 2] = torus_imag;
} else {
result[tid] += torus_real;
result[tid + params::degree / 2] += torus_imag;
}
tid = tid + params::degree / params::opt;
}
}
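The 1. / (params::degree / 2) factor undoes the scaling of the size-N/2 negacyclic FFT; because it is a power of two, applying it to each limb of the double-double separately introduces no rounding error before the result is mapped back onto the 128-bit torus:
// Example: for params::degree = 2048, normalization = 1. / 1024 == 0x1p-10.
// Scaling re_hi/re_lo (and im_hi/im_lo) by a power of two is exact, and
// u128_from_torus_f128 then reduces the value modulo 1 and rounds it to a u128.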
// Extracts the body of the nth-LWE in a GLWE.
template <typename Torus, class params>
__device__ void sample_extract_body(Torus *lwe_array_out, Torus const *glwe,

View File

@@ -56,6 +56,64 @@ __device__ void polynomial_product_accumulate_in_fourier_domain(
}
}
// Computes result += first * second
// If init_accumulator is set, assumes that result was not initialized and does
// that with the outcome of first * second
template <class params>
__device__ void polynomial_product_accumulate_in_fourier_domain_128(
double *result, double *first, const double *second,
bool init_accumulator = false) {
int tid = threadIdx.x;
if (init_accumulator) {
for (int i = 0; i < params::opt / 2; i++) {
f128 a_re(first[tid + 0 * params::degree / 2],
first[tid + 1 * params::degree / 2]);
f128 a_im(first[tid + 2 * params::degree / 2],
first[tid + 3 * params::degree / 2]);
f128 b_re(second[tid + 0 * params::degree / 2],
second[tid + 1 * params::degree / 2]);
f128 b_im(second[tid + 2 * params::degree / 2],
second[tid + 3 * params::degree / 2]);
f128 c_re, c_im;
f128::cplx_f128_mul_assign(c_re, c_im, a_re, a_im, b_re, b_im);
result[tid + 0 * params::degree / 2] = c_re.hi;
result[tid + 1 * params::degree / 2] = c_re.lo;
result[tid + 2 * params::degree / 2] = c_im.hi;
result[tid + 3 * params::degree / 2] = c_im.lo;
tid += params::degree / params::opt;
}
} else {
for (int i = 0; i < params::opt / 2; i++) {
f128 a_re(first[tid + 0 * params::degree / 2],
first[tid + 1 * params::degree / 2]);
f128 a_im(first[tid + 2 * params::degree / 2],
first[tid + 3 * params::degree / 2]);
f128 b_re(second[tid + 0 * params::degree / 2],
second[tid + 1 * params::degree / 2]);
f128 b_im(second[tid + 2 * params::degree / 2],
second[tid + 3 * params::degree / 2]);
f128 res_re(result[tid + 0 * params::degree / 2],
result[tid + 1 * params::degree / 2]);
f128 res_im(result[tid + 2 * params::degree / 2],
result[tid + 3 * params::degree / 2]);
f128 c_re, c_im;
f128::cplx_f128_mul_assign(c_re, c_im, a_re, a_im, b_re, b_im);
f128::cplx_f128_add_assign(res_re, res_im, res_re, res_im, c_re, c_im);
result[tid + 0 * params::degree / 2] = res_re.hi;
result[tid + 1 * params::degree / 2] = res_re.lo;
result[tid + 2 * params::degree / 2] = res_im.hi;
result[tid + 3 * params::degree / 2] = res_im.lo;
tid += params::degree / params::opt;
}
}
}
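Each Fourier coefficient is a complex number whose real and imaginary parts are double-doubles; the helper above multiplies and accumulates them with the usual complex-product formula carried out in f128 arithmetic. A scalar sketch (coefficient values are arbitrary placeholders; the helpers are those used above):
f128 a_re(1.0, 0.0), a_im(0.5, 0.0);  // one accumulator coefficient
f128 b_re(2.0, 0.0), b_im(-1.0, 0.0); // one bootstrapping-key coefficient
f128 c_re, c_im;
// (a_re + i*a_im) * (b_re + i*b_im)
//   = (a_re*b_re - a_im*b_im) + i*(a_re*b_im + a_im*b_re)
f128::cplx_f128_mul_assign(c_re, c_im, a_re, a_im, b_re, b_im);
f128 res_re(0.0, 0.0), res_im(0.0, 0.0);
f128::cplx_f128_add_assign(res_re, res_im, res_re, res_im, c_re, c_im); // result += c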
// Computes result += x
// If init_accumulator is set, assumes that result was not initialized and
// overwrites it with x

View File

@@ -1416,6 +1416,18 @@ unsafe extern "C" {
polynomial_size: u32,
);
}
unsafe extern "C" {
pub fn cuda_convert_lwe_programmable_bootstrap_key_128(
stream: *mut ffi::c_void,
gpu_index: u32,
dest: *mut ffi::c_void,
src: *const ffi::c_void,
input_lwe_dim: u32,
glwe_dim: u32,
level_count: u32,
polynomial_size: u32,
);
}
unsafe extern "C" {
pub fn scratch_cuda_programmable_bootstrap_amortized_32(
stream: *mut ffi::c_void,
@@ -1509,6 +1521,18 @@ unsafe extern "C" {
allocate_gpu_memory: bool,
);
}
unsafe extern "C" {
pub fn scratch_cuda_programmable_bootstrap_128(
stream: *mut ffi::c_void,
gpu_index: u32,
buffer: *mut *mut i8,
glwe_dimension: u32,
polynomial_size: u32,
level_count: u32,
input_lwe_ciphertext_count: u32,
allocate_gpu_memory: bool,
);
}
unsafe extern "C" {
pub fn cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
stream: *mut ffi::c_void,
@@ -1553,6 +1577,28 @@ unsafe extern "C" {
lut_stride: u32,
);
}
unsafe extern "C" {
pub fn cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
stream: *mut ffi::c_void,
gpu_index: u32,
lwe_array_out: *mut ffi::c_void,
lwe_output_indexes: *const ffi::c_void,
lut_vector: *const ffi::c_void,
lut_vector_indexes: *const ffi::c_void,
lwe_array_in: *const ffi::c_void,
lwe_input_indexes: *const ffi::c_void,
bootstrapping_key: *const ffi::c_void,
buffer: *mut i8,
lwe_dimension: u32,
glwe_dimension: u32,
polynomial_size: u32,
base_log: u32,
level_count: u32,
num_samples: u32,
num_many_lut: u32,
lut_stride: u32,
);
}
unsafe extern "C" {
pub fn cleanup_cuda_programmable_bootstrap(
stream: *mut ffi::c_void,
@@ -1560,6 +1606,13 @@ unsafe extern "C" {
pbs_buffer: *mut *mut i8,
);
}
unsafe extern "C" {
pub fn cleanup_cuda_programmable_bootstrap_128(
stream: *mut ffi::c_void,
gpu_index: u32,
pbs_buffer: *mut *mut i8,
);
}
unsafe extern "C" {
pub fn has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
glwe_dimension: u32,

@@ -348,6 +348,18 @@ pub const FFT128_U128_PARAMS: FftTestParams<u128> = FftTestParams {
ciphertext_modulus: CiphertextModulus::<u128>::new_native(),
};
#[cfg(feature = "gpu")]
pub const FFT128_U128_GPU_PARAMS: FftTestParams<u128> = FftTestParams {
lwe_dimension: LweDimension(879),
glwe_dimension: GlweDimension(2),
polynomial_size: PolynomialSize(2048),
lwe_noise_distribution: DynamicDistribution::new_t_uniform(46),
glwe_noise_distribution: DynamicDistribution::new_t_uniform(30),
pbs_base_log: DecompositionBaseLog(32),
pbs_level: DecompositionLevelCount(3),
ciphertext_modulus: CiphertextModulus::new_native(),
};
pub const FFT_WOPBS_PARAMS: FftWopPbsTestParams<u64> = FftWopPbsTestParams {
lwe_dimension: LweDimension(481),
glwe_dimension: GlweDimension(1),

@@ -0,0 +1,180 @@
pub(crate) use crate::core_crypto::algorithms::test::gen_keys_or_get_from_cache_if_enabled;
use crate::core_crypto::algorithms::test::{
FftBootstrapKeys, FftTestParams, TestResources, FFT128_U128_GPU_PARAMS,
};
use crate::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList;
use crate::core_crypto::gpu::lwe_bootstrap_key::CudaLweBootstrapKey;
use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
use crate::core_crypto::gpu::vec::{CudaVec, GpuIndex};
use crate::core_crypto::gpu::{cuda_programmable_bootstrap_lwe_ciphertext, CudaStreams};
use crate::core_crypto::keycache::KeyCacheAccess;
use crate::core_crypto::prelude::*;
use itertools::Itertools;
use serde::de::DeserializeOwned;
use serde::Serialize;
pub fn generate_keys<
Scalar: UnsignedTorus + Sync + Send + CastFrom<usize> + CastInto<usize> + Serialize + DeserializeOwned,
>(
params: FftTestParams<Scalar>,
rsc: &mut TestResources,
) -> FftBootstrapKeys<Scalar> {
// Generate an LweSecretKey with binary coefficients
let small_lwe_sk =
LweSecretKey::generate_new_binary(params.lwe_dimension, &mut rsc.secret_random_generator);
// Generate a GlweSecretKey with binary coefficients
let glwe_sk = GlweSecretKey::generate_new_binary(
params.glwe_dimension,
params.polynomial_size,
&mut rsc.secret_random_generator,
);
// Create a copy of the GlweSecretKey re-interpreted as an LweSecretKey
let big_lwe_sk = glwe_sk.clone().into_lwe_secret_key();
let bsk = par_allocate_and_generate_new_lwe_bootstrap_key(
&small_lwe_sk,
&glwe_sk,
params.pbs_base_log,
params.pbs_level,
params.glwe_noise_distribution,
params.ciphertext_modulus,
&mut rsc.encryption_random_generator,
);
FftBootstrapKeys {
small_lwe_sk,
big_lwe_sk,
bsk,
}
}
pub fn execute_bootstrap_u128<Scalar>(params: FftTestParams<Scalar>)
where
Scalar: Numeric
+ UnsignedTorus
+ CastFrom<usize>
+ CastInto<usize>
+ Send
+ Sync
+ Serialize
+ DeserializeOwned,
FftTestParams<Scalar>: KeyCacheAccess<Keys = FftBootstrapKeys<Scalar>>,
{
let lwe_noise_distribution = params.lwe_noise_distribution;
let glwe_dimension = params.glwe_dimension;
let polynomial_size = params.polynomial_size;
let ciphertext_modulus = params.ciphertext_modulus;
let mut rsc = TestResources::new();
let mut keys_gen = |params| generate_keys(params, &mut rsc);
let keys = gen_keys_or_get_from_cache_if_enabled(params, &mut keys_gen);
let (std_bootstrapping_key, small_lwe_sk, big_lwe_sk) =
(keys.bsk, keys.small_lwe_sk, keys.big_lwe_sk);
let output_lwe_dimension = big_lwe_sk.lwe_dimension();
let gpu_index = 0;
let stream = CudaStreams::new_single_gpu(GpuIndex::new(gpu_index));
let d_bsk = CudaLweBootstrapKey::from_lwe_bootstrap_key(&std_bootstrapping_key, &stream);
// Our 4 bits message space
let message_modulus: Scalar = Scalar::ONE << 4;
// Our input message
let input_message: Scalar = 3usize.cast_into();
let number_of_messages = 1;
// Delta used to encode 4 bits of message + a bit of padding on Scalar
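// (e.g. for Scalar = u128: delta = 2^127 / 2^4 = 2^123, so the padding bit and
// the 4 message bits occupy the 5 most significant bits of the plaintext)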
let delta: Scalar = (Scalar::ONE << (Scalar::BITS - 1)) / message_modulus;
// Apply our encoding
let plaintext = Plaintext(input_message * delta);
// Allocate a new LweCiphertext and encrypt our plaintext
let lwe_ciphertext_in: LweCiphertextOwned<Scalar> = allocate_and_encrypt_new_lwe_ciphertext(
&small_lwe_sk,
plaintext,
lwe_noise_distribution,
ciphertext_modulus,
&mut rsc.encryption_random_generator,
);
let f = |x: Scalar| x;
let accumulator: GlweCiphertextOwned<Scalar> = generate_programmable_bootstrap_glwe_lut(
polynomial_size,
glwe_dimension.to_glwe_size(),
message_modulus.cast_into(),
ciphertext_modulus,
delta,
f,
);
let d_lwe_ciphertext_in =
CudaLweCiphertextList::from_lwe_ciphertext(&lwe_ciphertext_in, &stream);
let mut d_out_pbs_ct = CudaLweCiphertextList::new(
output_lwe_dimension,
LweCiphertextCount(1),
ciphertext_modulus,
&stream,
);
let d_accumulator = CudaGlweCiphertextList::from_glwe_ciphertext(&accumulator, &stream);
let mut test_vector_indexes: Vec<Scalar> = vec![Scalar::ZERO; number_of_messages];
for (i, ind) in test_vector_indexes.iter_mut().enumerate() {
*ind = <usize as CastInto<Scalar>>::cast_into(i);
}
let mut d_test_vector_indexes =
unsafe { CudaVec::<Scalar>::new_async(number_of_messages, &stream, 0) };
unsafe { d_test_vector_indexes.copy_from_cpu_async(&test_vector_indexes, &stream, 0) };
let num_blocks = d_lwe_ciphertext_in.0.lwe_ciphertext_count.0;
let lwe_indexes_usize: Vec<usize> = (0..num_blocks).collect_vec();
let lwe_indexes = lwe_indexes_usize
.iter()
.map(|&x| <usize as CastInto<Scalar>>::cast_into(x))
.collect_vec();
let mut d_output_indexes = unsafe { CudaVec::<Scalar>::new_async(num_blocks, &stream, 0) };
let mut d_input_indexes = unsafe { CudaVec::<Scalar>::new_async(num_blocks, &stream, 0) };
unsafe {
d_input_indexes.copy_from_cpu_async(&lwe_indexes, &stream, 0);
d_output_indexes.copy_from_cpu_async(&lwe_indexes, &stream, 0);
}
cuda_programmable_bootstrap_lwe_ciphertext(
&d_lwe_ciphertext_in,
&mut d_out_pbs_ct,
&d_accumulator,
&d_test_vector_indexes,
&d_output_indexes,
&d_input_indexes,
LweCiphertextCount(num_blocks),
&d_bsk,
&stream,
);
let pbs_ct = d_out_pbs_ct.into_lwe_ciphertext(&stream);
// Decrypt the PBS result
let pbs_plaintext: Plaintext<Scalar> = decrypt_lwe_ciphertext(&big_lwe_sk, &pbs_ct);
// Create a SignedDecomposer to perform the rounding of the decrypted plaintext
// We pass a DecompositionBaseLog of 5 and a DecompositionLevelCount of 1 indicating we want
// to round the 5 MSB, 1 bit of padding plus our 4 bits of message
let signed_decomposer =
SignedDecomposer::new(DecompositionBaseLog(5), DecompositionLevelCount(1));
// Round and remove our encoding
let pbs_result: Scalar = signed_decomposer.closest_representable(pbs_plaintext.0) / delta;
assert_eq!(f(input_message), pbs_result);
}
#[test]
fn test_bootstrap_u128() {
execute_bootstrap_u128::<u128>(FFT128_U128_GPU_PARAMS);
}

@@ -7,6 +7,7 @@ mod lwe_linear_algebra;
mod lwe_multi_bit_programmable_bootstrapping;
mod lwe_packing_keyswitch;
mod lwe_programmable_bootstrapping;
mod lwe_programmable_bootstrapping_128;
mod noise_distribution;
pub struct CudaPackingKeySwitchKeys<Scalar: UnsignedInteger> {

@@ -37,6 +37,11 @@ impl CudaLweBootstrapKey {
let decomp_level_count = bsk.decomposition_level_count();
let decomp_base_log = bsk.decomposition_base_log();
let glwe_dimension = bsk.glwe_size().to_glwe_dimension();
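// 128-bit keys are kept in the Fourier domain as double-double values (hi/lo
// f64 limbs for both the real and imaginary parts), so their f64 backing store
// is twice as large as for 64-bit keys.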
let double_count = if size_of::<InputBskCont::Element>() == 16 {
2
} else {
1
};
// Allocate memory
let mut d_vec = CudaVec::<f64>::new_multi_gpu(
@@ -45,10 +50,11 @@ impl CudaLweBootstrapKey {
glwe_dimension.to_glwe_size(),
polynomial_size,
decomp_level_count,
),
) * double_count,
streams,
);
// Copy to the GPU
unsafe {
convert_lwe_programmable_bootstrap_key_async(
streams,
@@ -59,7 +65,7 @@ impl CudaLweBootstrapKey {
decomp_level_count,
polynomial_size,
);
}
};
streams.synchronize();
Self {
d_vec,

@@ -121,41 +121,84 @@ pub unsafe fn programmable_bootstrap_async<T: UnsignedInteger>(
let num_many_lut = 1u32;
let lut_stride = 0u32;
let mut pbs_buffer: *mut i8 = std::ptr::null_mut();
scratch_cuda_programmable_bootstrap_64(
streams.ptr[0],
streams.gpu_indexes[0].get(),
std::ptr::addr_of_mut!(pbs_buffer),
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
level.0 as u32,
num_samples,
true,
);
cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
streams.ptr[0],
streams.gpu_indexes[0].get(),
lwe_array_out.as_mut_c_ptr(0),
lwe_out_indexes.as_c_ptr(0),
test_vector.as_c_ptr(0),
test_vector_indexes.as_c_ptr(0),
lwe_array_in.as_c_ptr(0),
lwe_in_indexes.as_c_ptr(0),
bootstrapping_key.as_c_ptr(0),
pbs_buffer,
lwe_dimension.0 as u32,
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
base_log.0 as u32,
level.0 as u32,
num_samples,
num_many_lut,
lut_stride,
);
cleanup_cuda_programmable_bootstrap(
streams.ptr[0],
streams.gpu_indexes[0].get(),
std::ptr::addr_of_mut!(pbs_buffer),
);
if size_of::<T>() == 16 {
scratch_cuda_programmable_bootstrap_128(
streams.ptr[0],
streams.gpu_indexes[0].get(),
std::ptr::addr_of_mut!(pbs_buffer),
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
level.0 as u32,
num_samples,
true,
);
cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
streams.ptr[0],
streams.gpu_indexes[0].get(),
lwe_array_out.as_mut_c_ptr(0),
lwe_out_indexes.as_c_ptr(0),
test_vector.as_c_ptr(0),
test_vector_indexes.as_c_ptr(0),
lwe_array_in.as_c_ptr(0),
lwe_in_indexes.as_c_ptr(0),
bootstrapping_key.as_c_ptr(0),
pbs_buffer,
lwe_dimension.0 as u32,
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
base_log.0 as u32,
level.0 as u32,
num_samples,
num_many_lut,
lut_stride,
);
cleanup_cuda_programmable_bootstrap_128(
streams.ptr[0],
streams.gpu_indexes[0].get(),
std::ptr::addr_of_mut!(pbs_buffer),
);
} else {
scratch_cuda_programmable_bootstrap_64(
streams.ptr[0],
streams.gpu_indexes[0].get(),
std::ptr::addr_of_mut!(pbs_buffer),
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
level.0 as u32,
num_samples,
true,
);
cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
streams.ptr[0],
streams.gpu_indexes[0].get(),
lwe_array_out.as_mut_c_ptr(0),
lwe_out_indexes.as_c_ptr(0),
test_vector.as_c_ptr(0),
test_vector_indexes.as_c_ptr(0),
lwe_array_in.as_c_ptr(0),
lwe_in_indexes.as_c_ptr(0),
bootstrapping_key.as_c_ptr(0),
pbs_buffer,
lwe_dimension.0 as u32,
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
base_log.0 as u32,
level.0 as u32,
num_samples,
num_many_lut,
lut_stride,
);
cleanup_cuda_programmable_bootstrap(
streams.ptr[0],
streams.gpu_indexes[0].get(),
std::ptr::addr_of_mut!(pbs_buffer),
);
}
}
/// Programmable multi-bit bootstrap on a vector of LWE ciphertexts
@@ -341,19 +384,30 @@ pub unsafe fn convert_lwe_programmable_bootstrap_key_async<T: UnsignedInteger>(
l_gadget: DecompositionLevelCount,
polynomial_size: PolynomialSize,
) {
let size = std::mem::size_of_val(src);
for (i, &stream_ptr) in streams.ptr.iter().enumerate() {
assert_eq!(dest.len() * std::mem::size_of::<T>(), size);
cuda_convert_lwe_programmable_bootstrap_key_64(
stream_ptr,
streams.gpu_indexes[i].get(),
dest.as_mut_c_ptr(i as u32),
src.as_ptr().cast(),
input_lwe_dim.0 as u32,
glwe_dim.0 as u32,
l_gadget.0 as u32,
polynomial_size.0 as u32,
);
if size_of::<T>() == 16 {
cuda_convert_lwe_programmable_bootstrap_key_128(
stream_ptr,
streams.gpu_indexes[i].get(),
dest.as_mut_c_ptr(i as u32),
src.as_ptr().cast(),
input_lwe_dim.0 as u32,
glwe_dim.0 as u32,
l_gadget.0 as u32,
polynomial_size.0 as u32,
);
} else {
cuda_convert_lwe_programmable_bootstrap_key_64(
stream_ptr,
streams.gpu_indexes[i].get(),
dest.as_mut_c_ptr(i as u32),
src.as_ptr().cast(),
input_lwe_dim.0 as u32,
glwe_dim.0 as u32,
l_gadget.0 as u32,
polynomial_size.0 as u32,
);
}
}
}