Compare commits

...

7 Commits

Author           SHA1        Message                                      Date
Beka Barbakadze  aa3f83d016  all tests are passing                        2025-03-10 18:49:37 +04:00
Beka Barbakadze  320310a6b7  fix some bugs                                2025-03-10 15:59:48 +04:00
Beka Barbakadze  f704a38814  fix fmt                                      2025-03-07 18:34:41 +04:00
Beka Barbakadze  1659c07c89  fix test                                     2025-03-07 18:31:07 +04:00
Beka Barbakadze  6cbe56283e  feat(gpu): add classic default 128 bit pbs   2025-03-07 17:37:13 +04:00
Beka Barbakadze  e90ec935a1  change parameters                            2025-03-05 17:37:51 +04:00
Beka Barbakadze  7f3ac17cee  feat(gpu): Implement 128 bit classic pbs     2025-03-05 16:48:48 +04:00
20 changed files with 1984 additions and 76 deletions

View File

@@ -9,20 +9,29 @@
template <typename Torus>
uint64_t get_buffer_size_full_sm_programmable_bootstrap_step_one(
uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size + // accumulator_rotated
sizeof(double2) * polynomial_size / 2; // accumulator fft
size_t scalar_size = sizeof(Torus);
size_t split_count = (scalar_size == 16) ? 2 : 1;
return scalar_size * polynomial_size + // accumulator_rotated
sizeof(double) * 2 * split_count * polynomial_size /
2; // accumulator fft
}
template <typename Torus>
uint64_t get_buffer_size_full_sm_programmable_bootstrap_step_two(
uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size + // accumulator
sizeof(double2) * polynomial_size / 2; // accumulator fft
size_t scalar_size = sizeof(Torus);
size_t split_count = (scalar_size == 16) ? 2 : 1;
return scalar_size * polynomial_size + // accumulator
sizeof(double) * 2 * split_count * polynomial_size /
2; // accumulator fft
}
template <typename Torus>
uint64_t
get_buffer_size_partial_sm_programmable_bootstrap(uint32_t polynomial_size) {
return sizeof(double2) * polynomial_size / 2; // accumulator fft
size_t scalar_size = sizeof(Torus);
size_t split_count = (scalar_size == 16) ? 2 : 1;
return sizeof(double) * 2 * split_count * polynomial_size /
2; // accumulator fft
}
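Both step buffers now scale the FFT scratch by split_count: for a 128-bit torus every complex coefficient is carried as four doubles (re_hi, re_lo, im_hi, im_lo), twice the footprint of the 64-bit double2 layout. A minimal host-side sketch (not part of the diff) working the step-one formula for an illustrative N = 2048:

// Sketch: step-one shared-memory requirement for __uint128_t, N = 2048.
#include <cstdint>
#include <cstdio>

int main() {
  const size_t scalar_size = sizeof(__uint128_t);          // 16 bytes
  const size_t split_count = (scalar_size == 16) ? 2 : 1;  // 2 for 128-bit
  const uint32_t polynomial_size = 2048;                   // illustrative
  const size_t accumulator_rotated = scalar_size * polynomial_size;      // 32768
  const size_t accumulator_fft =
      sizeof(double) * 2 * split_count * polynomial_size / 2;            // 32768
  std::printf("full SM, step one: %zu bytes\n",
              accumulator_rotated + accumulator_fft);                    // 65536
  return 0;
}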
template <typename Torus>
@@ -215,6 +224,158 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
}
};
template <PBS_TYPE pbs_type> struct pbs_buffer_128;
template <> struct pbs_buffer_128<PBS_TYPE::CLASSICAL> {
int8_t *d_mem;
__uint128_t *global_accumulator;
double *global_join_buffer;
PBS_VARIANT pbs_variant;
pbs_buffer_128(cudaStream_t stream, uint32_t gpu_index,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
PBS_VARIANT pbs_variant, bool allocate_gpu_memory) {
cuda_set_device(gpu_index);
this->pbs_variant = pbs_variant;
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
if (allocate_gpu_memory) {
switch (pbs_variant) {
case PBS_VARIANT::DEFAULT: {
uint64_t full_sm_step_one =
get_buffer_size_full_sm_programmable_bootstrap_step_one<
__uint128_t>(polynomial_size);
uint64_t full_sm_step_two =
get_buffer_size_full_sm_programmable_bootstrap_step_two<
__uint128_t>(polynomial_size);
uint64_t partial_sm =
get_buffer_size_partial_sm_programmable_bootstrap<__uint128_t>(
polynomial_size);
uint64_t partial_dm_step_one = full_sm_step_one - partial_sm;
uint64_t partial_dm_step_two = full_sm_step_two - partial_sm;
uint64_t full_dm = full_sm_step_one;
uint64_t device_mem = 0;
if (max_shared_memory < partial_sm) {
device_mem = full_dm * input_lwe_ciphertext_count * level_count *
(glwe_dimension + 1);
} else if (max_shared_memory < full_sm_step_two) {
device_mem =
(partial_dm_step_two + partial_dm_step_one * level_count) *
input_lwe_ciphertext_count * (glwe_dimension + 1);
} else if (max_shared_memory < full_sm_step_one) {
device_mem = partial_dm_step_one * input_lwe_ciphertext_count *
level_count * (glwe_dimension + 1);
}
// Otherwise, both kernels run all in shared memory
d_mem = (int8_t *)cuda_malloc_async(device_mem, stream, gpu_index);
global_join_buffer = (double *)cuda_malloc_async(
(glwe_dimension + 1) * level_count * input_lwe_ciphertext_count *
(polynomial_size / 2) * sizeof(double) * 4,
stream, gpu_index);
global_accumulator = (__uint128_t *)cuda_malloc_async(
(glwe_dimension + 1) * input_lwe_ciphertext_count *
polynomial_size * sizeof(__uint128_t),
stream, gpu_index);
} break;
case PBS_VARIANT::CG: {
uint64_t full_sm =
get_buffer_size_full_sm_programmable_bootstrap_cg<__uint128_t>(
polynomial_size);
uint64_t partial_sm =
get_buffer_size_partial_sm_programmable_bootstrap_cg<__uint128_t>(
polynomial_size);
uint64_t partial_dm = full_sm - partial_sm;
uint64_t full_dm = full_sm;
uint64_t device_mem = 0;
if (max_shared_memory < partial_sm) {
device_mem = full_dm * input_lwe_ciphertext_count * level_count *
(glwe_dimension + 1);
} else if (max_shared_memory < full_sm) {
device_mem = partial_dm * input_lwe_ciphertext_count * level_count *
(glwe_dimension + 1);
}
// Otherwise, both kernels run all in shared memory
d_mem = (int8_t *)cuda_malloc_async(device_mem, stream, gpu_index);
global_join_buffer = (double *)cuda_malloc_async(
(glwe_dimension + 1) * level_count * input_lwe_ciphertext_count *
polynomial_size / 2 * sizeof(double) * 4,
stream, gpu_index);
} break;
#if CUDA_ARCH >= 900
case PBS_VARIANT::TBC: {
bool supports_dsm =
supports_distributed_shared_memory_on_classic_programmable_bootstrap<
__uint128_t>(polynomial_size, max_shared_memory);
uint64_t full_sm =
get_buffer_size_full_sm_programmable_bootstrap_tbc<__uint128_t>(
polynomial_size);
uint64_t partial_sm =
get_buffer_size_partial_sm_programmable_bootstrap_tbc<__uint128_t>(
polynomial_size);
uint64_t minimum_sm_tbc = 0;
if (supports_dsm)
minimum_sm_tbc =
get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap<
__uint128_t>(polynomial_size);
uint64_t partial_dm = full_sm - partial_sm;
uint64_t full_dm = full_sm;
uint64_t device_mem = 0;
// There is a minimum amount of memory we need to run the TBC PBS, which
// is minimum_sm_tbc. We know that minimum_sm_tbc bytes are available
// because otherwise the previous check would have redirected
// computation to some other variant. If over that we don't have more
// partial_sm bytes, TBC PBS will run on NOSM. If we have partial_sm but
// not full_sm bytes, it will run on PARTIALSM. Otherwise, FULLSM.
//
// NOSM mode actually requires minimum_sm_tbc shared memory bytes.
if (max_shared_memory < partial_sm + minimum_sm_tbc) {
device_mem = full_dm * input_lwe_ciphertext_count * level_count *
(glwe_dimension + 1);
} else if (max_shared_memory < full_sm + minimum_sm_tbc) {
device_mem = partial_dm * input_lwe_ciphertext_count * level_count *
(glwe_dimension + 1);
}
// Otherwise, both kernels run all in shared memory
d_mem = (int8_t *)cuda_malloc_async(device_mem, stream, gpu_index);
global_join_buffer = (double *)cuda_malloc_async(
(glwe_dimension + 1) * level_count * input_lwe_ciphertext_count *
polynomial_size / 2 * sizeof(double) * 4,
stream, gpu_index);
} break;
#endif
default:
PANIC("Cuda error (PBS): unsupported implementation variant.")
}
}
}
void release(cudaStream_t stream, uint32_t gpu_index) {
cuda_drop_async(d_mem, stream, gpu_index);
cuda_drop_async(global_join_buffer, stream, gpu_index);
if (pbs_variant == DEFAULT)
cuda_drop_async(global_accumulator, stream, gpu_index);
}
};
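A minimal lifecycle sketch for this buffer (in practice it is created through scratch_cuda_programmable_bootstrap_128); the stream is assumed to exist and the numeric parameters are illustrative only:

// Sketch: direct construction and release of the 128-bit classical PBS buffer.
void pbs_buffer_128_lifecycle(cudaStream_t stream, uint32_t gpu_index) {
  auto *buf = new pbs_buffer_128<PBS_TYPE::CLASSICAL>(
      stream, gpu_index, /*glwe_dimension=*/1, /*polynomial_size=*/2048,
      /*level_count=*/3, /*input_lwe_ciphertext_count=*/1,
      PBS_VARIANT::DEFAULT, /*allocate_gpu_memory=*/true);
  // ... enqueue kernels that use buf->d_mem, buf->global_accumulator and
  // buf->global_join_buffer on the same stream ...
  buf->release(stream, gpu_index);
  delete buf;
}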
template <typename Torus>
uint64_t get_buffer_size_programmable_bootstrap_cg(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,

View File

@@ -20,6 +20,11 @@ void cuda_convert_lwe_programmable_bootstrap_key_64(
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size);
void cuda_convert_lwe_programmable_bootstrap_key_128(
void *stream, uint32_t gpu_index, void *dest, void const *src,
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size);
void scratch_cuda_programmable_bootstrap_amortized_32(
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size,
@@ -62,6 +67,11 @@ void scratch_cuda_programmable_bootstrap_64(
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
void scratch_cuda_programmable_bootstrap_128(
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_output_indexes, void const *lut_vector,
@@ -80,7 +90,19 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride);
void cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_output_indexes, void const *lut_vector,
void const *lut_vector_indexes, void const *lwe_array_in,
void const *lwe_input_indexes, void const *bootstrapping_key,
int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride);
void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index,
int8_t **pbs_buffer);
void cleanup_cuda_programmable_bootstrap_128(void *stream, uint32_t gpu_index,
int8_t **pbs_buffer);
}
#endif // CUDA_BOOTSTRAP_H
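A sketch (not part of the diff) of the expected call order for the new 128-bit entry points, assuming this header is included; the device pointers are prepared by the caller and the scalar arguments are placeholders:

// Sketch: end-to-end use of the 128-bit classical PBS C API.
void bootstrap_128_example(void *stream, uint32_t gpu_index,
                           void *d_fourier_bsk, void const *d_standard_bsk,
                           void *d_lwe_out, void const *d_out_indexes,
                           void const *d_lut, void const *d_lut_indexes,
                           void const *d_lwe_in, void const *d_in_indexes,
                           uint32_t lwe_dimension, uint32_t glwe_dimension,
                           uint32_t polynomial_size, uint32_t base_log,
                           uint32_t level_count, uint32_t num_samples) {
  int8_t *pbs_buffer = nullptr;
  // 1. Convert the standard-domain bootstrapping key to the Fourier domain.
  cuda_convert_lwe_programmable_bootstrap_key_128(
      stream, gpu_index, d_fourier_bsk, d_standard_bsk, lwe_dimension,
      glwe_dimension, level_count, polynomial_size);
  // 2. Allocate scratch space and configure shared memory for the kernels.
  scratch_cuda_programmable_bootstrap_128(
      stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
      level_count, num_samples, /*allocate_gpu_memory=*/true);
  // 3. Bootstrap the batch of 128-bit LWE ciphertexts.
  cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
      stream, gpu_index, d_lwe_out, d_out_indexes, d_lut, d_lut_indexes,
      d_lwe_in, d_in_indexes, d_fourier_bsk, pbs_buffer, lwe_dimension,
      glwe_dimension, polynomial_size, base_log, level_count, num_samples,
      /*num_many_lut=*/1, /*lut_stride=*/0);
  // 4. Release the scratch buffer.
  cleanup_cuda_programmable_bootstrap_128(stream, gpu_index, &pbs_buffer);
}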

View File

@@ -3,6 +3,7 @@
#include "crypto/torus.cuh"
#include "device.h"
#include "fft128/f128.cuh"
#include <cstdint>
/**
@@ -42,6 +43,13 @@ public:
}
}
__device__ void decompose_and_compress_next_128(double *result) {
for (int j = 0; j < num_poly; j++) {
auto result_slice = result + j * params::degree / 2 * 4;
decompose_and_compress_next_polynomial_128(result_slice, j);
}
}
// Decomposes a single polynomial
__device__ void decompose_and_compress_next_polynomial(double2 *result,
int j) {
@@ -75,10 +83,58 @@ public:
synchronize_threads_in_block();
}
// Decomposes a single polynomial
__device__ void decompose_and_compress_next_polynomial_128(double *result,
int j) {
uint32_t tid = threadIdx.x;
auto state_slice = &state[j * params::degree];
for (int i = 0; i < params::opt / 2; i++) {
auto input1 = &state_slice[tid];
auto input2 = &state_slice[tid + params::degree / 2];
T res_re = *input1 & mask_mod_b;
T res_im = *input2 & mask_mod_b;
*input1 >>= base_log; // Update state
*input2 >>= base_log; // Update state
T carry_re = ((res_re - 1ll) | *input1) & res_re;
T carry_im = ((res_im - 1ll) | *input2) & res_im;
carry_re >>= (base_log - 1);
carry_im >>= (base_log - 1);
*input1 += carry_re; // Update state
*input2 += carry_im; // Update state
res_re -= carry_re << base_log;
res_im -= carry_im << base_log;
auto out_re = u128_to_signed_to_f128(res_re);
auto out_im = u128_to_signed_to_f128(res_im);
auto out_re_hi = result + 0LL * params::degree / 2;
auto out_re_lo = result + 1LL * params::degree / 2;
auto out_im_hi = result + 2LL * params::degree / 2;
auto out_im_lo = result + 3LL * params::degree / 2;
out_re_hi[tid] = out_re.hi;
out_re_lo[tid] = out_re.lo;
out_im_hi[tid] = out_im.hi;
out_im_lo[tid] = out_im.lo;
tid += params::degree / params::opt;
}
synchronize_threads_in_block();
}
__device__ void decompose_and_compress_level(double2 *result, int level) {
for (int i = 0; i < level_count - level; i++)
decompose_and_compress_next(result);
}
__device__ void decompose_and_compress_level_128(double *result, int level) {
for (int i = 0; i < level_count - level; i++)
decompose_and_compress_next_128(result);
}
};
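The carry computation in decompose_and_compress_next_polynomial_128 is the usual signed (balanced) digit extraction: bit base_log-1 of ((res - 1) | state) & res is 1 when the raw digit exceeds B/2, and for res == B/2 it falls back to bit base_log-1 of the updated state, so the digit is recentred into [-B/2, B/2] and a +1 is carried into the state. A scalar sketch (not from the diff) of one extraction step on a single coefficient:

// Sketch: one level of signed decomposition on a single 128-bit coefficient.
#include <cstdint>

__uint128_t decompose_one_level_128(__uint128_t &state, uint32_t base_log) {
  const __uint128_t mask_mod_b = ((__uint128_t)1 << base_log) - 1;
  __uint128_t res = state & mask_mod_b; // raw digit in [0, B)
  state >>= base_log;                   // consume it from the state
  __uint128_t carry = ((res - 1) | state) & res; // recentre + tie-break test
  carry >>= (base_log - 1);
  state += carry;           // propagate +1 into the next digit
  res -= carry << base_log; // signed digit in [-B/2, B/2], stored as a Torus
  return res;
}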
template <typename Torus>

View File

@@ -21,7 +21,7 @@ struct alignas(16) f128 {
#else
double s = a + b;
return f128(s, b - (s - a));
#endif;
#endif
}
// Two-sum
@@ -270,7 +270,7 @@ __host__ __device__ inline double bits_to_double(uint64_t bits) {
return d;
}
__host__ __device__ double u128_to_f64(__uint128_t x) {
__host__ __device__ inline double u128_to_f64(__uint128_t x) {
const __uint128_t ONE = 1;
const double A = ONE << 52;
const double B = ONE << 104;
@@ -322,7 +322,7 @@ __host__ __device__ double u128_to_f64(__uint128_t x) {
}
}
__host__ __device__ __uint128_t f64_to_u128(const double f) {
__host__ __device__ inline __uint128_t f64_to_u128(const double f) {
const __uint128_t ONE = 1;
const uint64_t f_bits = double_to_bits(f);
if (f_bits < 1023ull << 52) {
@@ -338,7 +338,7 @@ __host__ __device__ __uint128_t f64_to_u128(const double f) {
}
}
__host__ __device__ __uint128_t f64_to_i128(const double f) {
__host__ __device__ inline __uint128_t f64_to_i128(const double f) {
// Get raw bits of the double
const uint64_t f_bits = double_to_bits(f);
@@ -366,14 +366,14 @@ __host__ __device__ __uint128_t f64_to_i128(const double f) {
return (f_bits >> 63) ? -result : result;
}
__host__ __device__ double i128_to_f64(__int128_t const x) {
__host__ __device__ inline double i128_to_f64(__int128_t const x) {
uint64_t sign = static_cast<uint64_t>(x >> 64) & (1ULL << 63);
__uint128_t abs =
(x < 0) ? static_cast<__uint128_t>(-x) : static_cast<__uint128_t>(x);
return bits_to_double(double_to_bits(u128_to_f64(abs)) | sign);
}
__host__ __device__ f128 u128_to_signed_to_f128(__uint128_t x) {
__host__ __device__ inline f128 u128_to_signed_to_f128(__uint128_t x) {
const double first_approx = i128_to_f64(x);
const uint64_t sign_bit = double_to_bits(first_approx) & (1ull << 63);
const __uint128_t first_approx_roundtrip =
@@ -387,7 +387,7 @@ __host__ __device__ f128 u128_to_signed_to_f128(__uint128_t x) {
return f128(first_approx, correction);
}
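These conversions let a 128-bit torus value travel through the FFT as a double-double (f128) pair: the hi limb is the nearest double and the lo limb captures the rounding error, giving roughly twice the precision of a single double. A small fragment (not from the diff), assuming the definitions in this header:

// Sketch: splitting a 128-bit value into an f128 (hi + lo doubles).
__uint128_t x = ((__uint128_t)0x0123456789abcdefULL << 64) | 0xfedcba9876543210ULL;
f128 split = u128_to_signed_to_f128(x);
// split.hi is i128_to_f64(x) (x read as a signed value); split.lo is the
// difference between x and the round-trip of split.hi, converted to double.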
__host__ __device__ __uint128_t u128_from_torus_f128(const f128 &a) {
__host__ __device__ inline __uint128_t u128_from_torus_f128(const f128 &a) {
auto x = f128::sub_estimate(a, f128::f128_floor(a));
const double normalization = 340282366920938500000000000000000000000.;
#ifdef __CUDA_ARCH__

View File

@@ -287,6 +287,25 @@ batch_convert_u128_to_f128_as_torus(double *out_re_hi, double *out_re_lo,
&in[blockIdx.x * params::degree + params::degree / 2]);
}
// params is expected to be full degree not half degree
template <class params>
__global__ void
batch_convert_u128_to_f128_strided_as_torus(double *d_out,
const __uint128_t *d_in) {
constexpr size_t chunk_size = params::degree / 2 * 4;
double *chunk = &d_out[blockIdx.x * chunk_size];
double *out_re_hi = &chunk[0ULL * params::degree / 2];
double *out_re_lo = &chunk[1ULL * params::degree / 2];
double *out_im_hi = &chunk[2ULL * params::degree / 2];
double *out_im_lo = &chunk[3ULL * params::degree / 2];
convert_u128_to_f128_as_torus<params>(
out_re_hi, out_re_lo, out_im_hi, out_im_lo,
&d_in[blockIdx.x * params::degree],
&d_in[blockIdx.x * params::degree + params::degree / 2]);
}
// params is expected to be full degree not half degree
template <class params>
__global__ void batch_convert_f128_to_u128_as_torus(__uint128_t *out,
@@ -309,7 +328,7 @@ __global__ void
batch_NSMFFT_128(double *in_re_hi, double *in_re_lo, double *in_im_hi,
double *in_im_lo, double *out_re_hi, double *out_re_lo,
double *out_im_hi, double *out_im_lo, double *buffer) {
extern __shared__ double sharedMemoryFFT[];
extern __shared__ double sharedMemoryFFT128[];
double *re_hi, *re_lo, *im_hi, *im_lo;
if (SMD == NOSM) {
@@ -322,10 +341,10 @@ batch_NSMFFT_128(double *in_re_hi, double *in_re_lo, double *in_im_hi,
im_lo =
&buffer[blockIdx.x * params::degree / 2 * 4 + params::degree / 2 * 3];
} else {
re_hi = &sharedMemoryFFT[params::degree / 2 * 0];
re_lo = &sharedMemoryFFT[params::degree / 2 * 1];
im_hi = &sharedMemoryFFT[params::degree / 2 * 2];
im_lo = &sharedMemoryFFT[params::degree / 2 * 3];
re_hi = &sharedMemoryFFT128[params::degree / 2 * 0];
re_lo = &sharedMemoryFFT128[params::degree / 2 * 1];
im_hi = &sharedMemoryFFT128[params::degree / 2 * 2];
im_lo = &sharedMemoryFFT128[params::degree / 2 * 3];
}
Index tid = threadIdx.x;
@@ -356,6 +375,70 @@ batch_NSMFFT_128(double *in_re_hi, double *in_re_lo, double *in_im_hi,
}
}
template <class params, sharedMemDegree SMD>
__global__ void batch_NSMFFT_strided_128(double *d_in, double *d_out,
double *buffer) {
extern __shared__ double sharedMemoryFFT128[];
double *re_hi, *re_lo, *im_hi, *im_lo;
if (SMD == NOSM) {
re_hi =
&buffer[blockIdx.x * params::degree / 2 * 4 + params::degree / 2 * 0];
re_lo =
&buffer[blockIdx.x * params::degree / 2 * 4 + params::degree / 2 * 1];
im_hi =
&buffer[blockIdx.x * params::degree / 2 * 4 + params::degree / 2 * 2];
im_lo =
&buffer[blockIdx.x * params::degree / 2 * 4 + params::degree / 2 * 3];
} else {
re_hi = &sharedMemoryFFT128[params::degree / 2 * 0];
re_lo = &sharedMemoryFFT128[params::degree / 2 * 1];
im_hi = &sharedMemoryFFT128[params::degree / 2 * 2];
im_lo = &sharedMemoryFFT128[params::degree / 2 * 3];
}
constexpr size_t chunk_size = params::degree / 2 * 4;
double *chunk = &d_in[blockIdx.x * chunk_size];
double *tmp_re_hi = &chunk[0ULL * params::degree / 2];
double *tmp_re_lo = &chunk[1ULL * params::degree / 2];
double *tmp_im_hi = &chunk[2ULL * params::degree / 2];
double *tmp_im_lo = &chunk[3ULL * params::degree / 2];
Index tid = threadIdx.x;
#pragma unroll
for (Index i = 0; i < params::opt / 2; ++i) {
re_hi[tid] = tmp_re_hi[tid];
re_lo[tid] = tmp_re_lo[tid];
im_hi[tid] = tmp_im_hi[tid];
im_lo[tid] = tmp_im_lo[tid];
tid += params::degree / params::opt;
}
__syncthreads();
if constexpr (params::fft_direction == 1) {
negacyclic_backward_fft_f128<HalfDegree<params>>(re_hi, re_lo, im_hi,
im_lo);
} else {
negacyclic_forward_fft_f128<HalfDegree<params>>(re_hi, re_lo, im_hi, im_lo);
}
__syncthreads();
chunk = &d_out[blockIdx.x * chunk_size];
tmp_re_hi = &chunk[0ULL * params::degree / 2];
tmp_re_lo = &chunk[1ULL * params::degree / 2];
tmp_im_hi = &chunk[2ULL * params::degree / 2];
tmp_im_lo = &chunk[3ULL * params::degree / 2];
tid = threadIdx.x;
#pragma unroll
for (Index i = 0; i < params::opt / 2; ++i) {
tmp_re_hi[tid] = re_hi[tid];
tmp_re_lo[tid] = re_lo[tid];
tmp_im_hi[tid] = im_hi[tid];
tmp_im_lo[tid] = im_lo[tid];
tid += params::degree / params::opt;
}
}
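Both strided kernels assume the same chunk layout: the polynomial handled by blockIdx.x owns a contiguous run of 2*N doubles split into four quarters of length N/2, in the order re_hi, re_lo, im_hi, im_lo. A small helper sketch (not part of the diff) computing those views:

// Sketch: per-polynomial view of the strided f128 layout used above.
#include <cstddef>

struct f128_poly_view {
  double *re_hi, *re_lo, *im_hi, *im_lo;
};

inline f128_poly_view strided_view(double *base, size_t block, size_t degree) {
  const size_t half = degree / 2;
  double *chunk = base + block * half * 4; // 2 * degree doubles per polynomial
  return {chunk, chunk + half, chunk + 2 * half, chunk + 3 * half};
}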
template <class params>
__host__ void host_fourier_transform_forward_as_integer_f128(
cudaStream_t stream, uint32_t gpu_index, double *re0, double *re1,

View File

@@ -22,6 +22,19 @@ void cuda_convert_lwe_programmable_bootstrap_key_64(
(const int64_t *)src, polynomial_size, total_polynomials);
}
void cuda_convert_lwe_programmable_bootstrap_key_128(
void *stream, uint32_t gpu_index, void *dest, void const *src,
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size) {
printf("bsk transform.cu\n");
uint32_t total_polynomials =
input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * level_count;
cuda_convert_lwe_programmable_bootstrap_key_u128(
static_cast<cudaStream_t>(stream), gpu_index, (double *)dest,
(const __uint128_t *)src, polynomial_size, total_polynomials);
}
void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
void *stream, uint32_t gpu_index, void *dest, void const *src,
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,

View File

@@ -3,6 +3,8 @@
#include "device.h"
#include "fft/bnsmfft.cuh"
#include "fft128/fft128.cuh"
#include "pbs/programmable_bootstrap.h"
#include "pbs/programmable_bootstrap_multibit.h"
#include "polynomial/parameters.cuh"
@@ -16,6 +18,13 @@ __device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
level_count;
}
__device__ inline int get_start_ith_ggsw_128(int i, uint32_t polynomial_size,
int glwe_dimension,
uint32_t level_count) {
return i * polynomial_size / 2 * 4 * (glwe_dimension + 1) *
(glwe_dimension + 1) * level_count;
}
////////////////////////////////////////////////
template <typename T>
__device__ const T *get_ith_mask_kth_block(const T *ptr, int i, int k,
@@ -39,6 +48,31 @@ __device__ T *get_ith_mask_kth_block(T *ptr, int i, int k, int level,
(glwe_dimension + 1) * (glwe_dimension + 1) +
k * polynomial_size / 2 * (glwe_dimension + 1)];
}
template <typename T>
__device__ const T *
get_ith_mask_kth_block_128(const T *ptr, int i, int k, int level,
uint32_t polynomial_size, int glwe_dimension,
uint32_t level_count) {
return &ptr[get_start_ith_ggsw_128(i, polynomial_size, glwe_dimension,
level_count) +
(level_count - level - 1) * polynomial_size / 2 * 4 *
(glwe_dimension + 1) * (glwe_dimension + 1) +
k * polynomial_size / 2 * 4 * (glwe_dimension + 1)];
}
template <typename T>
__device__ T *get_ith_mask_kth_block_128(T *ptr, int i, int k, int level,
uint32_t polynomial_size,
int glwe_dimension,
uint32_t level_count) {
return &ptr[get_start_ith_ggsw_128(i, polynomial_size, glwe_dimension,
level_count) +
(level_count - level - 1) * polynomial_size / 2 * 4 *
(glwe_dimension + 1) * (glwe_dimension + 1) +
k * polynomial_size / 2 * 4 * (glwe_dimension + 1)];
}
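In the 128-bit Fourier-domain key each polynomial occupies polynomial_size / 2 * 4 doubles, so one GGSW occupies that times (glwe_dimension + 1)^2 * level_count, which is exactly the stride get_start_ith_ggsw_128 returns. A worked sketch with illustrative parameters (not from the diff):

// Sketch: doubles occupied by one GGSW in the 128-bit Fourier-domain BSK.
constexpr uint64_t N = 2048, glwe_dimension = 1, level_count = 3; // illustrative
constexpr uint64_t doubles_per_poly = N / 2 * 4;                  // 4096
constexpr uint64_t doubles_per_ggsw =
    doubles_per_poly * (glwe_dimension + 1) * (glwe_dimension + 1) *
    level_count;                                                  // 49152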
template <typename T>
__device__ T *get_ith_body_kth_block(T *ptr, int i, int k, int level,
uint32_t polynomial_size,
@@ -250,5 +284,138 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
cuda_drop_async(buffer, stream, gpu_index);
cudaFreeHost(h_bsk);
}
template <int N> __global__ void dprint_array(double *a) {
if (threadIdx.x == 0 && blockIdx.x == 0) {
for (int i = 0; i < N; i++)
printf("%.30f, ", a[i]);
printf("\n");
}
}
template <class params>
void convert_and_transform_128(cudaStream_t stream, uint32_t gpu_index,
double *d_bsk, __uint128_t const *d_standard,
uint32_t number_of_samples) {
printf("bsk transform\n");
size_t required_shared_memory_size = sizeof(double) * params::degree / 2 * 4;
int grid_size = number_of_samples;
int block_size = params::degree / params::opt;
bool full_sm =
(required_shared_memory_size <= cuda_get_max_shared_memory(gpu_index));
size_t buffer_size =
full_sm ? 0 : (size_t)number_of_samples * params::degree / 2 * 4;
size_t shared_memory_size = full_sm ? required_shared_memory_size : 0;
double *buffer = (double *)cuda_malloc_async(buffer_size, stream, gpu_index);
// configure shared memory for batch fft kernel
if (full_sm) {
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT_strided_128<FFTDegree<params, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_NSMFFT_strided_128<FFTDegree<params, ForwardFFT>, FULLSM>,
cudaFuncCachePreferShared));
}
// convert u128 into 4 x double
batch_convert_u128_to_f128_strided_as_torus<params>
<<<grid_size, block_size, 0, stream>>>(d_bsk, d_standard);
// call negacyclic 128 bit forward fft.
if (full_sm) {
batch_NSMFFT_strided_128<FFTDegree<params, ForwardFFT>, FULLSM>
<<<grid_size, block_size, shared_memory_size, stream>>>(d_bsk, d_bsk,
buffer);
} else {
batch_NSMFFT_strided_128<FFTDegree<params, ForwardFFT>, NOSM>
<<<grid_size, block_size, shared_memory_size, stream>>>(d_bsk, d_bsk,
buffer);
}
cuda_drop_async(buffer, stream, gpu_index);
for (int i = 0; i < number_of_samples; i++) {
auto chunk = d_bsk + i * params::degree / 2 * 4;
auto re_hi = chunk;
auto re_lo = chunk + params::degree / 2;
auto im_hi = chunk + 2 * params::degree / 2;
auto im_lo = chunk + 3 * params::degree / 2;
cudaDeviceSynchronize();
printf("#re_hi ");
cudaDeviceSynchronize();
dprint_array<params::degree / 2><<<1, 1>>>(re_hi);
cudaDeviceSynchronize();
printf("#re_lo");
cudaDeviceSynchronize();
dprint_array<params::degree / 2><<<1, 1>>>(re_lo);
cudaDeviceSynchronize();
printf("#im_hi");
cudaDeviceSynchronize();
dprint_array<params::degree / 2><<<1, 1>>>(im_hi);
cudaDeviceSynchronize();
printf("#im_lo");
cudaDeviceSynchronize();
dprint_array<params::degree / 2><<<1, 1>>>(im_lo);
cudaDeviceSynchronize();
}
// cudaDeviceSynchronize();
// printf("#cuda\n");
// printf("#re_hi\n");
// dprint_array<params::degree / 2><<<1, 1>>>(d_bsk);
// cudaDeviceSynchronize();
// printf("#re_lo\n");
// dprint_array<params::degree / 2><<<1, 1>>>(&d_bsk[1ULL *
// params::degree/2]); cudaDeviceSynchronize(); printf("#im_hi\n");
// dprint_array<params::degree / 2><<<1, 1>>>(&d_bsk[2ULL *
// params::degree/2]); cudaDeviceSynchronize(); printf("#im_lo\n");
// dprint_array<params::degree / 2><<<1, 1>>>(&d_bsk[3ULL *
// params::degree/2]); cudaDeviceSynchronize();
}
inline void cuda_convert_lwe_programmable_bootstrap_key_u128(
cudaStream_t stream, uint32_t gpu_index, double *dest,
__uint128_t const *src, uint32_t polynomial_size,
uint32_t total_polynomials) {
cuda_set_device(gpu_index);
// The buffer size is sizeof(double) times the number of polynomials times 4:
// each polynomial is represented by 4 double arrays of size
// polynomial_size / 2 in the complex domain to perform the FFT
size_t buffer_size =
total_polynomials * polynomial_size / 2 * sizeof(double) * 4;
__uint128_t *d_standard =
(__uint128_t *)cuda_malloc_async(buffer_size, stream, gpu_index);
cuda_memcpy_async_to_gpu(d_standard, src, buffer_size, stream, gpu_index);
switch (polynomial_size) {
case 256:
convert_and_transform_128<AmortizedDegree<256>>(
stream, gpu_index, dest, d_standard, total_polynomials);
break;
case 512:
convert_and_transform_128<AmortizedDegree<512>>(
stream, gpu_index, dest, d_standard, total_polynomials);
break;
case 1024:
convert_and_transform_128<AmortizedDegree<1024>>(
stream, gpu_index, dest, d_standard, total_polynomials);
break;
case 2048:
convert_and_transform_128<AmortizedDegree<2048>>(
stream, gpu_index, dest, d_standard, total_polynomials);
break;
case 4096:
convert_and_transform_128<AmortizedDegree<4096>>(
stream, gpu_index, dest, d_standard, total_polynomials);
break;
default:
PANIC("Cuda error (convert BSK): unsupported polynomial size. Supported "
"N's are powers of two in the interval [256..4096].")
}
cuda_drop_async(d_standard, stream, gpu_index);
}
#endif // CNCRT_BSK_H
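One detail worth noting in cuda_convert_lwe_programmable_bootstrap_key_u128: the same buffer_size serves both as the byte count of the standard-domain copy and as the Fourier-domain footprint, because a __uint128_t coefficient and its four-double split take the same space per polynomial. A quick check, assuming the usual type sizes:

// Sketch: total_polynomials * N/2 * sizeof(double) * 4   (Fourier layout)
//      == total_polynomials * N   * sizeof(__uint128_t)  (standard layout)
static_assert(sizeof(double) * 4 / 2 == sizeof(__uint128_t),
              "four half-length double arrays match one u128 polynomial");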

View File

@@ -0,0 +1,207 @@
#include "programmable_bootstrap_classic_128.cuh"
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the PBS on 128-bit inputs, into `buffer`. It also configures SM options on
* the GPU in case FULLSM or PARTIALSM mode is going to be used.
*/
void scratch_cuda_programmable_bootstrap_128(
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
auto buffer = (pbs_buffer_128<CLASSICAL> **)pbs_buffer;
switch (polynomial_size) {
case 256:
scratch_programmable_bootstrap_128<AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
break;
case 512:
scratch_programmable_bootstrap_128<AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
break;
case 1024:
scratch_programmable_bootstrap_128<AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
break;
case 2048:
scratch_programmable_bootstrap_128<AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
break;
case 4096:
scratch_programmable_bootstrap_128<AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
break;
default:
PANIC("Cuda error (classical PBS): unsupported polynomial size. "
"Supported N's are powers of two"
" in the interval [256..4096].")
}
}
template <typename Torus>
void executor_cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lwe_input_indexes, double const *bootstrapping_key,
pbs_buffer_128<CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride) {
switch (polynomial_size) {
case 256:
host_programmable_bootstrap_128<AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_many_lut, lut_stride);
break;
case 512:
host_programmable_bootstrap_128<AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_many_lut, lut_stride);
break;
case 1024:
host_programmable_bootstrap_128<AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_many_lut, lut_stride);
break;
case 2048:
host_programmable_bootstrap_128<AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_many_lut, lut_stride);
break;
case 4096:
host_programmable_bootstrap_128<AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_many_lut, lut_stride);
break;
default:
PANIC("Cuda error (classical PBS): unsupported polynomial size. "
"Supported N's are powers of two"
" in the interval [256..4096].")
}
}
/* Perform bootstrapping on a batch of input u128 LWE ciphertexts.
*
* - `stream` is a void pointer to the Cuda stream to be used in the kernel
* launch
* - `gpu_index` is the index of the GPU to be used in the kernel launch
* - lwe_array_out: output batch of num_samples bootstrapped ciphertexts c =
* (a0,..an-1,b) where n is the LWE dimension
* - lut_vector: should hold as many luts of size polynomial_size
* as there are input ciphertexts, but actually holds
* num_luts vectors to reduce memory usage
* - lut_vector_indexes: stores the index corresponding to
* which lut to use for each sample in
* lut_vector
* - lwe_array_in: input batch of num_samples LWE ciphertexts, containing n
* mask values + 1 body value
* - bootstrapping_key: GGSW encryption of the LWE secret key sk1
* under secret key sk2
* bsk = Z + sk1 H
* where H is the gadget matrix and Z is a matrix (k+1).l
* containing GLWE encryptions of 0 under sk2.
* bsk is thus a tensor of size (k+1)^2.l.N.n
* where l is the number of decomposition levels and
* k is the GLWE dimension, N is the polynomial size for
* GLWE. The polynomial size for GLWE and the lut
* are the same because they have to be in the same ring
* to be multiplied.
* - lwe_dimension: size of the Torus vector used to encrypt the input
* LWE ciphertexts - referred to as n above (~ 600)
* - glwe_dimension: size of the polynomial vector used to encrypt the LUT
* GLWE ciphertexts - referred to as k above. Only the value 1 is supported for
* this parameter.
* - polynomial_size: size of the test polynomial (lut) and size of the
* GLWE polynomial (~1024)
* - base_log: log base used for the gadget matrix - B = 2^base_log (~8)
* - level_count: number of decomposition levels in the gadget matrix (~4)
* - num_samples: number of encrypted input messages
*
* This function calls a wrapper to a device kernel that performs the
* bootstrapping:
* - the kernel is templatized based on integer discretization and
* polynomial degree
* - num_samples * level_count * (glwe_dimension + 1) blocks of threads are
* launched, where each thread is going to handle one or more polynomial
* coefficients at each stage, for a given level of decomposition, either for
* the LUT mask or its body:
* - perform the blind rotation
* - round the result
* - get the decomposition for the current level
* - switch to the FFT domain
* - multiply with the bootstrapping key
* - come back to the coefficients representation
* - between each stage a synchronization of the threads is necessary (some
* synchronizations happen at the block level, some happen between blocks, using
* cooperative groups).
* - in case the device has enough shared memory, temporary arrays used for
* the different stages (accumulators) are stored into the shared memory
* - the accumulators serve to combine the results for all decomposition
* levels
* - the constant memory (64K) is used for storing the roots of identity
* values for the FFT
*/
void cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_output_indexes, void const *lut_vector,
void const *lut_vector_indexes, void const *lwe_array_in,
void const *lwe_input_indexes, void const *bootstrapping_key,
int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride) {
if (base_log > 64)
PANIC("Cuda error (classical PBS): base log should be <= 64")
if ((glwe_dimension + 1) * level_count > 8)
PANIC("Cuda error (multi-bit PBS): (k + 1)*l should be <= 8")
pbs_buffer_128<CLASSICAL> *buffer = (pbs_buffer_128<CLASSICAL> *)mem_ptr;
executor_cuda_programmable_bootstrap_lwe_ciphertext_vector_128<__uint128_t>(
stream, gpu_index, static_cast<__uint128_t *>(lwe_array_out),
static_cast<const __uint128_t *>(lwe_output_indexes),
static_cast<const __uint128_t *>(lut_vector),
static_cast<const __uint128_t *>(lut_vector_indexes),
static_cast<const __uint128_t *>(lwe_array_in),
static_cast<const __uint128_t *>(lwe_input_indexes),
static_cast<const double *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_many_lut, lut_stride);
}
/*
* This cleanup function frees the data on GPU for the PBS buffer for 128-bit
* inputs.
*/
void cleanup_cuda_programmable_bootstrap_128(void *stream, uint32_t gpu_index,
int8_t **buffer) {
auto x = (pbs_buffer_128<CLASSICAL> *)(*buffer);
x->release(static_cast<cudaStream_t>(stream), gpu_index);
}

View File

@@ -0,0 +1,688 @@
#ifndef CUDA_PBS_CUH_128
#define CUDA_PBS_CUH_128
#ifdef __CDT_PARSER__
#undef __CUDA_RUNTIME_H__
#include <cuda_runtime.h>
#endif
#include "crypto/gadget.cuh"
#include "crypto/torus.cuh"
#include "device.h"
#include "fft128/fft128.cuh"
#include "pbs/bootstrapping_key.cuh"
#include "pbs/pbs_utilities.h"
#include "pbs/programmable_bootstrap.h"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial_math.cuh"
#include "types/complex/operations.cuh"
__device__ void print_u128(__uint128_t x) {
int8_t digits[40];
int i = 0;
do {
digits[i] = x % 10;
x /= 10;
i++;
} while (x);
for (int j = i - 1; j >= 0; j--) {
printf("%d", digits[j]);
}
}
template <typename Torus, class params, sharedMemDegree SMD>
__global__ void __launch_bounds__(params::degree / params::opt)
device_programmable_bootstrap_step_one_128(
const Torus *__restrict__ lut_vector,
const Torus *__restrict__ lut_vector_indexes,
const Torus *__restrict__ lwe_array_in,
const Torus *__restrict__ lwe_input_indexes,
const double *__restrict__ bootstrapping_key, Torus *global_accumulator,
double *global_join_buffer, uint32_t lwe_iteration,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, int8_t *device_mem,
uint64_t device_memory_size_per_block) {
// We use shared memory for the polynomials that are used often during the
// bootstrap, since shared memory is kept in L1 cache and accessing it is
// much faster than global memory
extern __shared__ int8_t sharedmem[];
int8_t *selected_memory;
uint32_t glwe_dimension = gridDim.y - 1;
if constexpr (SMD == FULLSM) {
selected_memory = sharedmem;
} else {
int block_index = blockIdx.z + blockIdx.y * gridDim.z +
blockIdx.x * gridDim.z * gridDim.y;
selected_memory = &device_mem[block_index * device_memory_size_per_block];
}
Torus *accumulator = (Torus *)selected_memory;
double *accumulator_fft =
(double *)accumulator +
(ptrdiff_t)(sizeof(Torus) * polynomial_size / sizeof(double));
if constexpr (SMD == PARTIALSM)
accumulator_fft = (double *)sharedmem;
// The third dimension of the block is used to determine on which ciphertext
// this block is operating, in the case of batch bootstraps
const Torus *block_lwe_array_in =
&lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
const Torus *block_lut_vector =
&lut_vector[lut_vector_indexes[blockIdx.x] * params::degree *
(glwe_dimension + 1)];
Torus *global_slice =
global_accumulator +
(blockIdx.y + blockIdx.x * (glwe_dimension + 1)) * params::degree;
double *global_fft_slice =
global_join_buffer + (blockIdx.y + blockIdx.z * (glwe_dimension + 1) +
blockIdx.x * level_count * (glwe_dimension + 1)) *
(polynomial_size / 2) * 4;
if (lwe_iteration == 0) {
// First iteration
// Put "b" in [0, 2N[
Torus b_hat = 0;
modulus_switch(block_lwe_array_in[lwe_dimension], b_hat,
params::log2_degree + 1);
// The y-dimension is used to select the element of the GLWE this block will
// compute
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
false);
// Persist
int tid = threadIdx.x;
for (int i = 0; i < params::opt; i++) {
global_slice[tid] = accumulator[tid];
tid += params::degree / params::opt;
}
// debug
__syncthreads();
if (threadIdx.x == 0 && blockIdx.x == 0 && blockIdx.y == 1 &&
blockIdx.z == 0) {
printf("after div: ");
for (int j = 0; j < params::degree; j++) {
print_u128(accumulator[j]);
printf(", ");
}
printf("\n");
printf("b_hat: %d\n", b_hat);
}
__syncthreads();
}
// Put "a" in [0, 2N[
Torus a_hat = 0;
modulus_switch(block_lwe_array_in[lwe_iteration], a_hat,
params::log2_degree + 1); // 2 * params::log2_degree + 1);
synchronize_threads_in_block();
// Perform ACC * (X^a_hat - 1)
multiply_by_monomial_negacyclic_and_sub_polynomial<
Torus, params::opt, params::degree / params::opt>(global_slice,
accumulator, a_hat);
// debug
__syncthreads();
if (threadIdx.x == 0 && blockIdx.x == 0 && blockIdx.y == 1 &&
blockIdx.z == 0 && lwe_iteration == 0) {
printf("after mul: ");
for (int j = 0; j < params::degree; j++) {
print_u128(accumulator[j]);
printf(", ");
}
printf("\n");
printf("a_hat: %d\n", a_hat);
}
__syncthreads();
// Perform a rounding to increase the accuracy of the
// bootstrapped ciphertext
init_decomposer_state_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, base_log, level_count);
synchronize_threads_in_block();
// Decompose the accumulator. Each block gets one level of the
// decomposition, for the mask and the body (so block 0 will have the
// accumulator decomposed at level 0, 1 at 1, etc.)
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator);
gadget_acc.decompose_and_compress_level_128(accumulator_fft, blockIdx.z);
// We are using the same memory space for accumulator_fft and
// accumulator_rotated, so we need to synchronize here to make sure they
// don't modify the same memory space at the same time
// Switch to the FFT space
auto acc_fft_re_hi = accumulator_fft + 0LL * params::degree / 2;
auto acc_fft_re_lo = accumulator_fft + 1LL * params::degree / 2;
auto acc_fft_im_hi = accumulator_fft + 2LL * params::degree / 2;
auto acc_fft_im_lo = accumulator_fft + 3LL * params::degree / 2;
auto global_fft_re_hi = global_fft_slice + 0LL * params::degree / 2;
auto global_fft_re_lo = global_fft_slice + 1LL * params::degree / 2;
auto global_fft_im_hi = global_fft_slice + 2LL * params::degree / 2;
auto global_fft_im_lo = global_fft_slice + 3LL * params::degree / 2;
// debug
__syncthreads();
if (threadIdx.x == 0 && blockIdx.x == 0 && blockIdx.y == 1 &&
blockIdx.z == 0 && lwe_iteration == 0) {
printf("before_fft_re_hi: ");
for (int j = 0; j < params::degree / 2; j++) {
printf("%.5f, ", acc_fft_re_hi[j]);
}
printf("\n");
printf("before_fft_re_lo: ");
for (int j = 0; j < params::degree / 2; j++) {
printf("%.5f, ", acc_fft_re_lo[j]);
}
printf("\n");
printf("before_fft_im_hi: ");
for (int j = 0; j < params::degree / 2; j++) {
printf("%.5f, ", acc_fft_im_hi[j]);
}
printf("\n");
printf("before_fft_im_lo: ");
for (int j = 0; j < params::degree / 2; j++) {
printf("%.5f, ", acc_fft_im_lo[j]);
}
printf("\n");
}
__syncthreads();
negacyclic_forward_fft_f128<HalfDegree<params>>(acc_fft_re_hi, acc_fft_re_lo,
acc_fft_im_hi, acc_fft_im_lo);
// debug
__syncthreads();
if (threadIdx.x == 0 && blockIdx.x == 0 && blockIdx.y == 1 &&
blockIdx.z == 0 && lwe_iteration == 0) {
printf("after_fft_re_hi: ");
for (int j = 0; j < params::degree / 2; j++) {
printf("%.5f, ", acc_fft_re_hi[j]);
}
printf("\n");
printf("after_fft_re_lo: ");
for (int j = 0; j < params::degree / 2; j++) {
printf("%.5f, ", acc_fft_re_lo[j]);
}
printf("\n");
printf("after_fft_im_hi: ");
for (int j = 0; j < params::degree / 2; j++) {
printf("%.5f, ", acc_fft_im_hi[j]);
}
printf("\n");
printf("after_fft_im_lo: ");
for (int j = 0; j < params::degree / 2; j++) {
printf("%.5f, ", acc_fft_im_lo[j]);
}
printf("\n");
}
__syncthreads();
int tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
global_fft_re_hi[tid] = acc_fft_re_hi[tid];
global_fft_re_lo[tid] = acc_fft_re_lo[tid];
global_fft_im_hi[tid] = acc_fft_im_hi[tid];
global_fft_im_lo[tid] = acc_fft_im_lo[tid];
tid += params::degree / params::opt;
}
}
template <typename Torus, class params, sharedMemDegree SMD>
__global__ void __launch_bounds__(params::degree / params::opt)
device_programmable_bootstrap_step_two_128(
Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
const Torus *__restrict__ lut_vector,
const Torus *__restrict__ lut_vector_indexes,
const double *__restrict__ bootstrapping_key, Torus *global_accumulator,
double *global_join_buffer, uint32_t lwe_iteration,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, int8_t *device_mem,
uint64_t device_memory_size_per_block, uint32_t num_many_lut,
uint32_t lut_stride) {
// We use shared memory for the polynomials that are used often during the
// bootstrap, since shared memory is kept in L1 cache and accessing it is
// much faster than global memory
extern __shared__ int8_t sharedmem[];
int8_t *selected_memory;
uint32_t glwe_dimension = gridDim.y - 1;
if constexpr (SMD == FULLSM) {
selected_memory = sharedmem;
} else {
int block_index = blockIdx.x + blockIdx.y * gridDim.x +
blockIdx.z * gridDim.x * gridDim.y;
selected_memory = &device_mem[block_index * device_memory_size_per_block];
}
// We always compute the pointer with most restrictive alignment to avoid
// alignment issues
Torus *accumulator = (Torus *)selected_memory;
double *accumulator_fft =
(double *)accumulator +
(ptrdiff_t)(sizeof(Torus) * params::degree / sizeof(double));
if constexpr (SMD == PARTIALSM)
accumulator_fft = (double *)sharedmem;
for (int level = 0; level < level_count; level++) {
double *global_fft_slice =
global_join_buffer + (level + blockIdx.x * level_count) *
(glwe_dimension + 1) * (params::degree / 2) *
4;
for (int j = 0; j < (glwe_dimension + 1); j++) {
double *fft = global_fft_slice + j * params::degree / 2 * 4;
// Get the bootstrapping key piece necessary for the multiplication
// It is already in the Fourier domain
auto bsk_slice = get_ith_mask_kth_block_128(
bootstrapping_key, lwe_iteration, j, level, polynomial_size,
glwe_dimension, level_count);
auto bsk_poly = bsk_slice + blockIdx.y * params::degree / 2 * 4;
// debug
__syncthreads();
if (threadIdx.x == 0 && blockIdx.x == 0 && blockIdx.y == 1 &&
blockIdx.z == 0 && lwe_iteration == 0) {
printf("fft_re_hi: ");
for (int j = 0; j < params::degree / 2; j++) {
printf("%.5f, ", fft[j]);
}
printf("\n");
printf("fft_re_lo: ");
for (int j = 0; j < params::degree / 2; j++) {
printf("%.5f, ", fft[params::degree / 2 + j]);
}
printf("\n");
printf("fft_im_hi: ");
for (int j = 0; j < params::degree / 2; j++) {
printf("%.5f, ", fft[2 * params::degree / 2 + j]);
}
printf("\n");
printf("fft_im_lo: ");
for (int j = 0; j < params::degree / 2; j++) {
printf("%.5f, ", fft[3 * params::degree / 2 + j]);
}
printf("\n");
}
__syncthreads();
// debug
__syncthreads();
if (threadIdx.x == 0 && blockIdx.x == 0 && blockIdx.y == 1 &&
blockIdx.z == 0 && lwe_iteration == 0) {
printf("bsk_poly_re_hi: ");
for (int j = 0; j < params::degree / 2; j++) {
printf("%.5f, ", bsk_poly[j]);
}
printf("\n");
printf("bsk_poly_re_lo: ");
for (int j = 0; j < params::degree / 2; j++) {
printf("%.5f, ", bsk_poly[params::degree / 2 + j]);
}
printf("\n");
printf("bsk_poly_im_hi: ");
for (int j = 0; j < params::degree / 2; j++) {
printf("%.5f, ", bsk_poly[2 * params::degree / 2 + j]);
}
printf("\n");
printf("bsk_poly_im_lo: ");
for (int j = 0; j < params::degree / 2; j++) {
printf("%.5f, ", bsk_poly[3 * params::degree / 2 + j]);
}
printf("\n");
}
__syncthreads();
polynomial_product_accumulate_in_fourier_domain_128<params>(
accumulator_fft, fft, bsk_poly, !level && !j);
}
}
Torus *global_slice =
global_accumulator +
(blockIdx.y + blockIdx.x * (glwe_dimension + 1)) * params::degree;
// Load the persisted accumulator
int tid = threadIdx.x;
for (int i = 0; i < params::opt; i++) {
accumulator[tid] = global_slice[tid];
tid += params::degree / params::opt;
}
// Perform the inverse FFT on the result of the GGSW x GLWE and add to the
// accumulator
auto acc_fft_re_hi = accumulator_fft + 0LL * params::degree / 2;
auto acc_fft_re_lo = accumulator_fft + 1LL * params::degree / 2;
auto acc_fft_im_hi = accumulator_fft + 2LL * params::degree / 2;
auto acc_fft_im_lo = accumulator_fft + 3LL * params::degree / 2;
// debug
__syncthreads();
if (threadIdx.x == 0 && blockIdx.x == 0 && blockIdx.y == 1 &&
blockIdx.z == 0 && lwe_iteration == 0) {
printf("before_ifft_acc_fft_re_hi: ");
for (int j = 0; j < params::degree / 2; j++) {
printf("%.5f, ", acc_fft_re_hi[j]);
}
printf("\n");
printf("before_ifft_acc_fft_re_lo: ");
for (int j = 0; j < params::degree / 2; j++) {
printf("%.5f, ", acc_fft_re_lo[j]);
}
printf("\n");
printf("before_ifft_acc_fft_im_hi: ");
for (int j = 0; j < params::degree / 2; j++) {
printf("%.5f, ", acc_fft_im_hi[j]);
}
printf("\n");
printf("before_ifft_acc_fft_im_lo: ");
for (int j = 0; j < params::degree / 2; j++) {
printf("%.5f, ", acc_fft_im_lo[j]);
}
printf("\n");
}
__syncthreads();
negacyclic_backward_fft_f128<HalfDegree<params>>(
acc_fft_re_hi, acc_fft_re_lo, acc_fft_im_hi, acc_fft_im_lo);
add_to_torus_128<Torus, params>(acc_fft_re_hi, acc_fft_re_lo, acc_fft_im_hi,
acc_fft_im_lo, accumulator);
if (lwe_iteration + 1 == lwe_dimension) {
// Last iteration
auto block_lwe_array_out =
&lwe_array_out[lwe_output_indexes[blockIdx.x] *
(glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
if (blockIdx.y < glwe_dimension) {
// Perform a sample extract. At this point, all blocks have the result,
// but we do the computation at block 0 to avoid waiting for extra blocks,
// in case they're not synchronized
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
if (num_many_lut > 1) {
for (int i = 1; i < num_many_lut; i++) {
auto next_lwe_array_out =
lwe_array_out +
(i * gridDim.x * (glwe_dimension * polynomial_size + 1));
auto next_block_lwe_array_out =
&next_lwe_array_out[lwe_output_indexes[blockIdx.x] *
(glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
sample_extract_mask<Torus, params>(next_block_lwe_array_out,
accumulator, 1, i * lut_stride);
}
}
} else if (blockIdx.y == glwe_dimension) {
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
if (num_many_lut > 1) {
for (int i = 1; i < num_many_lut; i++) {
auto next_lwe_array_out =
lwe_array_out +
(i * gridDim.x * (glwe_dimension * polynomial_size + 1));
auto next_block_lwe_array_out =
&next_lwe_array_out[lwe_output_indexes[blockIdx.x] *
(glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
sample_extract_body<Torus, params>(next_block_lwe_array_out,
accumulator, 0, i * lut_stride);
}
}
}
} else {
// Persist the updated accumulator
tid = threadIdx.x;
for (int i = 0; i < params::opt; i++) {
global_slice[tid] = accumulator[tid];
tid += params::degree / params::opt;
}
}
}
template <typename params>
__host__ void scratch_programmable_bootstrap_128(
cudaStream_t stream, uint32_t gpu_index, pbs_buffer_128<CLASSICAL> **buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
uint64_t full_sm_step_one =
get_buffer_size_full_sm_programmable_bootstrap_step_one<__uint128_t>(
polynomial_size);
uint64_t full_sm_step_two =
get_buffer_size_full_sm_programmable_bootstrap_step_two<__uint128_t>(
polynomial_size);
uint64_t partial_sm =
get_buffer_size_partial_sm_programmable_bootstrap<__uint128_t>(
polynomial_size);
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
printf("full_sm_step_one: %llu\n", full_sm_step_one);
printf("full_sm_step_two: %llu\n", full_sm_step_two);
printf("partial_sm: %llu\n", partial_sm);
printf("max_shared_memory: %llu\n", max_shared_memory);
// Configure step one
if (max_shared_memory >= partial_sm && max_shared_memory < full_sm_step_one) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_step_one_128<__uint128_t, params,
PARTIALSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_step_one_128<__uint128_t, params,
PARTIALSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else if (max_shared_memory >= partial_sm) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_step_one_128<__uint128_t, params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_step_one));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_step_one_128<__uint128_t, params, FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
}
// Configure step two
if (max_shared_memory >= partial_sm && max_shared_memory < full_sm_step_two) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_step_two_128<__uint128_t, params,
PARTIALSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_step_two_128<__uint128_t, params,
PARTIALSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else if (max_shared_memory >= partial_sm) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_step_two_128<__uint128_t, params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_step_two));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_step_two_128<__uint128_t, params, FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
}
*buffer = new pbs_buffer_128<CLASSICAL>(
stream, gpu_index, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, PBS_VARIANT::DEFAULT, allocate_gpu_memory);
}
template <class params>
__host__ void execute_step_one_128(
cudaStream_t stream, uint32_t gpu_index, __uint128_t const *lut_vector,
__uint128_t const *lut_vector_indexes, __uint128_t const *lwe_array_in,
__uint128_t const *lwe_input_indexes, double const *bootstrapping_key,
__uint128_t *global_accumulator, double *global_join_buffer,
uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, int8_t *d_mem, int lwe_iteration, uint64_t partial_sm,
uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm) {
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
cuda_set_device(gpu_index);
int thds = polynomial_size / params::opt;
dim3 grid(input_lwe_ciphertext_count, glwe_dimension + 1, level_count);
if (max_shared_memory < partial_sm) {
printf("step one NOSM\n");
device_programmable_bootstrap_step_one_128<__uint128_t, params, NOSM>
<<<grid, thds, 0, stream>>>(
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
bootstrapping_key, global_accumulator, global_join_buffer,
lwe_iteration, lwe_dimension, polynomial_size, base_log,
level_count, d_mem, full_dm);
} else if (max_shared_memory < full_sm) {
printf("step one PARTIALSM\n");
device_programmable_bootstrap_step_one_128<__uint128_t, params, PARTIALSM>
<<<grid, thds, partial_sm, stream>>>(
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
bootstrapping_key, global_accumulator, global_join_buffer,
lwe_iteration, lwe_dimension, polynomial_size, base_log,
level_count, d_mem, partial_dm);
} else {
// printf("step one FULLSM\n");
device_programmable_bootstrap_step_one_128<__uint128_t, params, FULLSM>
<<<grid, thds, full_sm, stream>>>(
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
bootstrapping_key, global_accumulator, global_join_buffer,
lwe_iteration, lwe_dimension, polynomial_size, base_log,
level_count, d_mem, 0);
}
check_cuda_error(cudaGetLastError());
}
template <class params>
__host__ void execute_step_two_128(
cudaStream_t stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
__uint128_t const *lwe_output_indexes, __uint128_t const *lut_vector,
__uint128_t const *lut_vector_indexes, double const *bootstrapping_key,
__uint128_t *global_accumulator, double *global_join_buffer,
uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, int8_t *d_mem, int lwe_iteration, uint64_t partial_sm,
uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm,
uint32_t num_many_lut, uint32_t lut_stride) {
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
cuda_set_device(gpu_index);
int thds = polynomial_size / params::opt;
dim3 grid(input_lwe_ciphertext_count, glwe_dimension + 1);
if (max_shared_memory < partial_sm) {
printf("step two NOSM\n");
device_programmable_bootstrap_step_two_128<__uint128_t, params, NOSM>
<<<grid, thds, 0, stream>>>(
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
bootstrapping_key, global_accumulator, global_join_buffer,
lwe_iteration, lwe_dimension, polynomial_size, base_log,
level_count, d_mem, full_dm, num_many_lut, lut_stride);
} else if (max_shared_memory < full_sm) {
printf("step two PARTIALSM\n");
device_programmable_bootstrap_step_two_128<__uint128_t, params, PARTIALSM>
<<<grid, thds, partial_sm, stream>>>(
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
bootstrapping_key, global_accumulator, global_join_buffer,
lwe_iteration, lwe_dimension, polynomial_size, base_log,
level_count, d_mem, partial_dm, num_many_lut, lut_stride);
} else {
// printf("step two FULLSM\n");
device_programmable_bootstrap_step_two_128<__uint128_t, params, FULLSM>
<<<grid, thds, full_sm, stream>>>(
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
bootstrapping_key, global_accumulator, global_join_buffer,
lwe_iteration, lwe_dimension, polynomial_size, base_log,
level_count, d_mem, 0, num_many_lut, lut_stride);
}
check_cuda_error(cudaGetLastError());
}
/*
* Host wrapper to the programmable bootstrap 128
*/
template <class params>
__host__ void host_programmable_bootstrap_128(
cudaStream_t stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
__uint128_t const *lwe_output_indexes, __uint128_t const *lut_vector,
__uint128_t const *lut_vector_indexes, __uint128_t const *lwe_array_in,
__uint128_t const *lwe_input_indexes, double const *bootstrapping_key,
pbs_buffer_128<CLASSICAL> *pbs_buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
uint32_t num_many_lut, uint32_t lut_stride) {
cuda_set_device(gpu_index);
// With SM each block corresponds to either the mask or body, no need to
// duplicate data for each
uint64_t full_sm_step_one =
get_buffer_size_full_sm_programmable_bootstrap_step_one<__uint128_t>(
polynomial_size);
uint64_t full_sm_step_two =
get_buffer_size_full_sm_programmable_bootstrap_step_two<__uint128_t>(
polynomial_size);
uint64_t partial_sm =
get_buffer_size_partial_sm_programmable_bootstrap<__uint128_t>(
polynomial_size);
uint64_t partial_dm_step_one = full_sm_step_one - partial_sm;
uint64_t partial_dm_step_two = full_sm_step_two - partial_sm;
uint64_t full_dm_step_one = full_sm_step_one;
uint64_t full_dm_step_two = full_sm_step_two;
__uint128_t *global_accumulator = pbs_buffer->global_accumulator;
double *global_join_buffer = pbs_buffer->global_join_buffer;
int8_t *d_mem = pbs_buffer->d_mem;
for (int i = 0; i < lwe_dimension; i++) {
execute_step_one_128<params>(
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, global_accumulator,
global_join_buffer, input_lwe_ciphertext_count, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, d_mem, i,
partial_sm, partial_dm_step_one, full_sm_step_one, full_dm_step_one);
execute_step_two_128<params>(
stream, gpu_index, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, bootstrapping_key, global_accumulator,
global_join_buffer, input_lwe_ciphertext_count, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, d_mem, i,
partial_sm, partial_dm_step_two, full_sm_step_two, full_dm_step_two,
num_many_lut, lut_stride);
}
}
#endif // TFHE_RS_BACKENDS_TFHE_CUDA_BACKEND_CUDA_SRC_PBS_PROGRAMMABLE_BOOTSTRAP_CLASSIC_128_CUH_

View File

@@ -3,6 +3,7 @@
#include "crypto/torus.cuh"
#include "device.h"
#include "fft128/f128.cuh"
#include "parameters.cuh"
// Return A if C == 0 and B if C == 1
@@ -172,6 +173,35 @@ __device__ void add_to_torus(double2 *m_values, Torus *result,
}
}
/**
* In case of classical PBS, this method should accumulate the result.
* In case of multi-bit PBS, it should overwrite.
*/
template <typename Torus, class params>
__device__ void add_to_torus_128(double *re_hi, double *re_lo, double *im_hi,
double *im_lo, Torus *result,
bool overwrite_result = false) {
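// Each 128-bit torus coefficient is rebuilt from a double-double (hi/lo) pair
// for the real and the imaginary part produced by the 128-bit inverse FFT.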
constexpr double normalization = 1. / (params::degree / 2);
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
f128 f128_real(re_hi[tid], re_lo[tid]);
f128 f128_imag(im_hi[tid], im_lo[tid]);
Torus torus_real = u128_from_torus_f128(f128_real);
Torus torus_imag = u128_from_torus_f128(f128_imag);
if (overwrite_result) {
result[tid] = torus_real;
result[tid + params::degree / 2] = torus_imag;
} else {
result[tid] += torus_real;
result[tid + params::degree / 2] += torus_imag;
}
tid = tid + params::degree / params::opt;
}
}
// Extracts the body of the nth-LWE in a GLWE.
template <typename Torus, class params>
__device__ void sample_extract_body(Torus *lwe_array_out, Torus const *glwe,

View File

@@ -56,6 +56,64 @@ __device__ void polynomial_product_accumulate_in_fourier_domain(
}
}
// Computes result += first * second
// If init_accumulator is set, assumes that result was not initialized and does
// that with the outcome of first * second
template <class params>
__device__ void polynomial_product_accumulate_in_fourier_domain_128(
double *result, double *first, const double *second,
bool init_accumulator = false) {
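// The 128-bit Fourier coefficients are stored as four contiguous blocks of
// degree/2 doubles: re_hi, re_lo, im_hi, im_lo (double-double complex layout).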
int tid = threadIdx.x;
if (init_accumulator) {
for (int i = 0; i < params::opt / 2; i++) {
f128 a_re(first[tid + 0ULL * params::degree / 2],
first[tid + 1ULL * params::degree / 2]);
f128 a_im(first[tid + 2ULL * params::degree / 2],
first[tid + 3ULL * params::degree / 2]);
f128 b_re(second[tid + 0ULL * params::degree / 2],
second[tid + 1ULL * params::degree / 2]);
f128 b_im(second[tid + 2ULL * params::degree / 2],
second[tid + 3ULL * params::degree / 2]);
f128 c_re, c_im;
f128::cplx_f128_mul_assign(c_re, c_im, a_re, a_im, b_re, b_im);
result[tid + 0ULL * params::degree / 2] = c_re.hi;
result[tid + 1ULL * params::degree / 2] = c_re.lo;
result[tid + 2ULL * params::degree / 2] = c_im.hi;
result[tid + 3ULL * params::degree / 2] = c_im.lo;
tid += params::degree / params::opt;
}
} else {
for (int i = 0; i < params::opt / 2; i++) {
f128 a_re(first[tid + 0ULL * params::degree / 2],
first[tid + 1ULL * params::degree / 2]);
f128 a_im(first[tid + 2ULL * params::degree / 2],
first[tid + 3ULL * params::degree / 2]);
f128 b_re(second[tid + 0ULL * params::degree / 2],
second[tid + 1ULL * params::degree / 2]);
f128 b_im(second[tid + 2ULL * params::degree / 2],
second[tid + 3ULL * params::degree / 2]);
f128 res_re(result[tid + 0ULL * params::degree / 2],
result[tid + 1ULL * params::degree / 2]);
f128 res_im(result[tid + 2ULL * params::degree / 2],
result[tid + 3ULL * params::degree / 2]);
f128 c_re, c_im;
f128::cplx_f128_mul_assign(c_re, c_im, a_re, a_im, b_re, b_im);
f128::cplx_f128_add_assign(res_re, res_im, res_re, res_im, c_re, c_im);
result[tid + 0ULL * params::degree / 2] = res_re.hi;
result[tid + 1ULL * params::degree / 2] = res_re.lo;
result[tid + 2ULL * params::degree / 2] = res_im.hi;
result[tid + 3ULL * params::degree / 2] = res_im.lo;
tid += params::degree / params::opt;
}
}
}
// Computes result += x
// If init_accumulator is set, assumes that result was not initialized and
// initializes it with x instead

View File

@@ -1431,6 +1431,18 @@ unsafe extern "C" {
polynomial_size: u32,
);
}
unsafe extern "C" {
pub fn cuda_convert_lwe_programmable_bootstrap_key_128(
stream: *mut ffi::c_void,
gpu_index: u32,
dest: *mut ffi::c_void,
src: *const ffi::c_void,
input_lwe_dim: u32,
glwe_dim: u32,
level_count: u32,
polynomial_size: u32,
);
}
unsafe extern "C" {
pub fn scratch_cuda_programmable_bootstrap_amortized_32(
stream: *mut ffi::c_void,
@@ -1524,6 +1536,18 @@ unsafe extern "C" {
allocate_gpu_memory: bool,
);
}
unsafe extern "C" {
pub fn scratch_cuda_programmable_bootstrap_128(
stream: *mut ffi::c_void,
gpu_index: u32,
buffer: *mut *mut i8,
glwe_dimension: u32,
polynomial_size: u32,
level_count: u32,
input_lwe_ciphertext_count: u32,
allocate_gpu_memory: bool,
);
}
unsafe extern "C" {
pub fn cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
stream: *mut ffi::c_void,
@@ -1568,6 +1592,28 @@ unsafe extern "C" {
lut_stride: u32,
);
}
unsafe extern "C" {
pub fn cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
stream: *mut ffi::c_void,
gpu_index: u32,
lwe_array_out: *mut ffi::c_void,
lwe_output_indexes: *const ffi::c_void,
lut_vector: *const ffi::c_void,
lut_vector_indexes: *const ffi::c_void,
lwe_array_in: *const ffi::c_void,
lwe_input_indexes: *const ffi::c_void,
bootstrapping_key: *const ffi::c_void,
buffer: *mut i8,
lwe_dimension: u32,
glwe_dimension: u32,
polynomial_size: u32,
base_log: u32,
level_count: u32,
num_samples: u32,
num_many_lut: u32,
lut_stride: u32,
);
}
unsafe extern "C" {
pub fn cleanup_cuda_programmable_bootstrap(
stream: *mut ffi::c_void,
@@ -1575,6 +1621,13 @@ unsafe extern "C" {
pbs_buffer: *mut *mut i8,
);
}
unsafe extern "C" {
pub fn cleanup_cuda_programmable_bootstrap_128(
stream: *mut ffi::c_void,
gpu_index: u32,
pbs_buffer: *mut *mut i8,
);
}
unsafe extern "C" {
pub fn has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
glwe_dimension: u32,

View File

@@ -321,9 +321,9 @@ pub const FFT_U64_PARAMS: FftTestParams<u64> = FftTestParams {
};
pub const FFT_U128_PARAMS: FftTestParams<u128> = FftTestParams {
lwe_dimension: LweDimension(742),
lwe_dimension: LweDimension(5),
glwe_dimension: GlweDimension(1),
polynomial_size: PolynomialSize(2048),
polynomial_size: PolynomialSize(256),
lwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
0.00000000004998277131225527,
)),
@@ -348,6 +348,16 @@ pub const FFT128_U128_PARAMS: FftTestParams<u128> = FftTestParams {
ciphertext_modulus: CiphertextModulus::<u128>::new_native(),
};
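// Parameters used by the GPU 128-bit PBS test (lwe_programmable_bootstrapping_128).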
pub const FFT128_U128_GPU_PARAMS: FftTestParams<u128> = FftTestParams {
lwe_dimension: LweDimension(879),
glwe_dimension: GlweDimension(2),
polynomial_size: PolynomialSize(2048),
lwe_noise_distribution: DynamicDistribution::new_t_uniform(46),
glwe_noise_distribution: DynamicDistribution::new_t_uniform(30),
pbs_base_log: DecompositionBaseLog(32),
pbs_level: DecompositionLevelCount(3),
ciphertext_modulus: CiphertextModulus::new_native(),
};
pub const FFT_WOPBS_PARAMS: FftWopPbsTestParams<u64> = FftWopPbsTestParams {
lwe_dimension: LweDimension(481),
glwe_dimension: GlweDimension(1),

View File

@@ -256,6 +256,7 @@ where
ContLut: ContainerMut<Element = Scalar>,
ContLwe: Container<Element = Scalar>,
{
println!("rust#4");
fn implementation<Scalar: UnsignedTorus + CastInto<usize>>(
this: Fourier128LweBootstrapKey<&[f64]>,
mut lut: GlweCiphertext<&mut [Scalar]>,
@@ -270,7 +271,7 @@ where
let ciphertext_modulus = lut.ciphertext_modulus();
assert!(ciphertext_modulus.is_compatible_with_native_modulus());
let monomial_degree = pbs_modulus_switch(*lwe_body, lut_poly_size);
println!("monomial_degree: {:?}", monomial_degree);
lut.as_mut_polynomial_list()
.iter_mut()
.for_each(|mut poly| {
@@ -283,6 +284,7 @@ where
// We initialize the ct_0 used for the successive cmuxes
let mut ct0 = lut;
println!("ct0_after_div: {:?}", ct0.get_body());
for (lwe_mask_element, bootstrap_key_ggsw) in
izip!(lwe_mask.iter(), this.into_ggsw_iter())
{
@@ -304,11 +306,15 @@ where
MonomialDegree(pbs_modulus_switch(*lwe_mask_element, lut_poly_size)),
);
}
// println!("ct1_after_mul: {:?}", ct1.get_body());
// println!("MonomialDegree: {:?}", MonomialDegree(pbs_modulus_switch
// (*lwe_mask_element, lut_poly_size)));
// ct1 is re-created each loop it can be moved, ct0 is already a view, but
// as_mut_view is required to keep borrow rules consistent
cmux(&mut ct0, &mut ct1, &bootstrap_key_ggsw, fft, stack);
}
}
if !ciphertext_modulus.is_native_modulus() {
@@ -343,6 +349,8 @@ where
ContLweIn: Container<Element = Scalar>,
ContAcc: Container<Element = Scalar>,
{
println!("rust#2");
fn implementation<Scalar: UnsignedTorus + CastInto<usize>>(
this: Fourier128LweBootstrapKey<&[f64]>,
mut lwe_out: LweCiphertext<&mut [Scalar]>,
@@ -351,15 +359,17 @@ where
fft: Fft128View<'_>,
stack: &mut PodStack,
) {
// We type check dynamically with TypeId
#[allow(clippy::transmute_undefined_repr)]
if TypeId::of::<Scalar>() == TypeId::of::<u128>() {
let mut lwe_out: LweCiphertext<&mut [u128]> = unsafe { transmute(lwe_out) };
let lwe_in: LweCiphertext<&[u128]> = unsafe { transmute(lwe_in) };
let accumulator: GlweCiphertext<&[u128]> = unsafe { transmute(accumulator) };
println!("rust#3");
return this.bootstrap_u128(&mut lwe_out, &lwe_in, &accumulator, fft, stack);
}
let (local_accumulator_data, stack) =
stack.collect_aligned(CACHELINE_ALIGN, accumulator.as_ref().iter().copied());
@@ -446,6 +456,7 @@ where
ContLweIn: Container<Element = Scalar>,
ContAcc: Container<Element = Scalar>,
{
println!("rust#1");
self.bootstrap(lwe_out, lwe_in, accumulator, fft.as_view(), stack);
}

View File

@@ -328,6 +328,10 @@ where
fourier_im1,
coef_poly.as_ref(),
);
println!("re0: {:?}", fourier_re0);
println!("re1: {:?}", fourier_re1);
println!("im0: {:?}", fourier_im0);
println!("im1: {:?}", fourier_im1);
}
}
implementation(self.as_mut_view(), coef_ggsw.as_view(), fft);
@@ -461,6 +465,12 @@ pub fn add_external_product_assign<Scalar, ContOut, ContGgsw, ContGlwe>(
let (fourier_im0, stack) = stack.make_aligned_raw::<f64>(len, align);
let (fourier_im1, _) = stack.make_aligned_raw::<f64>(len, align);
// We perform the forward fft transform for the glwe polynomial
println!("before_fft_fourier_re0: {:?}", fourier_re0);
println!("before_fft_fourier_re1: {:?}", fourier_re1);
println!("before_fft_fourier_im0: {:?}", fourier_im0);
println!("before_fft_fourier_im1: {:?}", fourier_im1);
fft.forward_as_integer(
fourier_re0,
fourier_re1,
@@ -468,6 +478,12 @@ pub fn add_external_product_assign<Scalar, ContOut, ContGgsw, ContGlwe>(
fourier_im1,
glwe_poly.as_ref(),
);
println!("after_fft_fourier_re0: {:?}", fourier_re0);
println!("after_fft_fourier_re1: {:?}", fourier_re1);
println!("after_fft_fourier_im0: {:?}", fourier_im0);
println!("after_fft_fourier_im1: {:?}", fourier_im1);
// Now we loop through the polynomials of the output, and add the
// corresponding product of polynomials.
update_with_fmadd(
@@ -503,6 +519,10 @@ pub fn add_external_product_assign<Scalar, ContOut, ContGgsw, ContGlwe>(
output_fft_buffer_im0.into_chunks(fourier_poly_size),
output_fft_buffer_im1.into_chunks(fourier_poly_size),
) {
println!("before_ifft_fourier_re0: {:?}", fourier_re0);
println!("before_ifft_fourier_re1: {:?}", fourier_re1);
println!("before_ifft_fourier_im0: {:?}", fourier_im0);
println!("before_ifft_fourier_im1: {:?}", fourier_im1);
fft.add_backward_as_torus(
out.as_mut(),
fourier_re0,
@@ -678,6 +698,11 @@ pub fn update_with_fmadd(
ggsw_row.data_im0.into_chunks(fourier_poly_size),
ggsw_row.data_im1.into_chunks(fourier_poly_size),
) {
println!("ggsw_poly_re0: {:?}", ggsw_poly_re0);
println!("ggsw_poly_re1: {:?}", ggsw_poly_re1);
println!("ggsw_poly_im0: {:?}", ggsw_poly_im0);
println!("ggsw_poly_im1: {:?}", ggsw_poly_im1);
struct Impl<'a> {
output_fourier_re0: &'a mut [f64],
output_fourier_re1: &'a mut [f64],
@@ -784,6 +809,7 @@ pub fn cmux<Scalar, ContCt0, ContCt1, ContGgsw>(
for (c1, c0) in izip!(ct1.as_mut(), ct0.as_ref()) {
*c1 = c1.wrapping_sub(*c0);
}
println!("ct1_after_mul_sub: {:?}", ct1.get_body());
add_external_product_assign(&mut ct0, &ggsw, &ct1, fft, stack);
}

View File

@@ -174,6 +174,7 @@ where
fft: Fft128View<'_>,
stack: &mut PodStack,
) {
println!("rust#-1");
let align = CACHELINE_ALIGN;
let ciphertext_modulus = accumulator.ciphertext_modulus();

View File

@@ -0,0 +1,245 @@
pub(crate) use crate::core_crypto::algorithms::test::gen_keys_or_get_from_cache_if_enabled;
use crate::core_crypto::algorithms::test::{
FftBootstrapKeys, FftTestParams, TestResources, FFT_U128_PARAMS, FFT_U32_PARAMS, FFT128_U128_GPU_PARAMS,
};
use crate::core_crypto::fft_impl::common::FourierBootstrapKey;
use crate::core_crypto::fft_impl::fft128::crypto::bootstrap::Fourier128LweBootstrapKeyOwned;
use crate::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList;
use crate::core_crypto::gpu::lwe_bootstrap_key::CudaLweBootstrapKey;
use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
use crate::core_crypto::gpu::vec::{CudaVec, GpuIndex};
use crate::core_crypto::gpu::{cuda_programmable_bootstrap_lwe_ciphertext, CudaStreams};
use crate::core_crypto::keycache::KeyCacheAccess;
use crate::core_crypto::prelude::*;
use dyn_stack::{GlobalPodBuffer, PodStack};
use itertools::Itertools;
use serde::de::DeserializeOwned;
use serde::Serialize;
pub fn generate_keys<
Scalar: UnsignedTorus + Sync + Send + CastFrom<usize> + CastInto<usize> + Serialize + DeserializeOwned,
>(
params: FftTestParams<Scalar>,
rsc: &mut TestResources,
) -> FftBootstrapKeys<Scalar> {
// Generate an LweSecretKey with binary coefficients
let small_lwe_sk =
LweSecretKey::generate_new_binary(params.lwe_dimension, &mut rsc.secret_random_generator);
// Generate a GlweSecretKey with binary coefficients
let glwe_sk = GlweSecretKey::generate_new_binary(
params.glwe_dimension,
params.polynomial_size,
&mut rsc.secret_random_generator,
);
// Create a copy of the GlweSecretKey re-interpreted as an LweSecretKey
let big_lwe_sk = glwe_sk.clone().into_lwe_secret_key();
let bsk = par_allocate_and_generate_new_lwe_bootstrap_key(
&small_lwe_sk,
&glwe_sk,
params.pbs_base_log,
params.pbs_level,
params.glwe_noise_distribution,
params.ciphertext_modulus,
&mut rsc.encryption_random_generator,
);
FftBootstrapKeys {
small_lwe_sk,
big_lwe_sk,
bsk,
}
}
pub fn execute_bootstrap_u128<Scalar, K>(params: FftTestParams<Scalar>)
where
Scalar: Numeric
+ UnsignedTorus
+ CastFrom<usize>
+ CastInto<usize>
+ Send
+ Sync
+ Serialize
+ DeserializeOwned,
K: FourierBootstrapKey<Scalar>,
FftTestParams<Scalar>: KeyCacheAccess<Keys = FftBootstrapKeys<Scalar>>,
{
let lwe_noise_distribution = params.lwe_noise_distribution;
let glwe_dimension = params.glwe_dimension;
let polynomial_size = params.polynomial_size;
let ciphertext_modulus = params.ciphertext_modulus;
let mut rsc = TestResources::new();
let fft = K::new_fft(polynomial_size);
let mut keys_gen = |params| generate_keys(params, &mut rsc);
let keys = gen_keys_or_get_from_cache_if_enabled(params, &mut keys_gen);
let (std_bootstrapping_key, small_lwe_sk, big_lwe_sk) =
(keys.bsk, keys.small_lwe_sk, keys.big_lwe_sk);
let output_lwe_dimension = big_lwe_sk.lwe_dimension();
// Create the empty bootstrapping key in the Fourier domain
let mut fourier_bsk = K::new(
std_bootstrapping_key.input_lwe_dimension(),
std_bootstrapping_key.polynomial_size(),
std_bootstrapping_key.glwe_size(),
std_bootstrapping_key.decomposition_base_log(),
std_bootstrapping_key.decomposition_level_count(),
);
fourier_bsk.fill_with_forward_fourier(
&std_bootstrapping_key,
&fft,
PodStack::new(&mut GlobalPodBuffer::new(
K::fill_with_forward_fourier_scratch(&fft).unwrap(),
)),
);
let gpu_index = 0;
let stream = CudaStreams::new_single_gpu(GpuIndex::new(gpu_index));
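// Upload the standard bootstrapping key; from_lwe_bootstrap_key converts it to
// the GPU Fourier representation on the device.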
let d_bsk = CudaLweBootstrapKey::from_lwe_bootstrap_key(&std_bootstrapping_key, &stream);
// Our 4-bit message space
let message_modulus: Scalar = Scalar::ONE << 4;
// Our input message
let input_message: Scalar = 3usize.cast_into();
let number_of_messages = 1;
// Delta used to encode 4 bits of message + a bit of padding on Scalar
let delta: Scalar = (Scalar::ONE << (Scalar::BITS - 1)) / message_modulus;
// Apply our encoding
let plaintext = Plaintext(input_message * delta);
// Allocate a new LweCiphertext and encrypt our plaintext
let lwe_ciphertext_in: LweCiphertextOwned<Scalar> = allocate_and_encrypt_new_lwe_ciphertext(
&small_lwe_sk,
plaintext,
lwe_noise_distribution,
ciphertext_modulus,
&mut rsc.encryption_random_generator,
);
let f = |x: Scalar| x;
let accumulator: GlweCiphertextOwned<Scalar> = generate_programmable_bootstrap_glwe_lut(
polynomial_size,
glwe_dimension.to_glwe_size(),
message_modulus.cast_into(),
ciphertext_modulus,
delta,
f,
);
// Allocate the LweCiphertext to store the result of the PBS
let mut pbs_ct: LweCiphertext<Vec<Scalar>> = LweCiphertext::new(
Scalar::ZERO,
big_lwe_sk.lwe_dimension().to_lwe_size(),
ciphertext_modulus,
);
println!("Computing PBS...");
let d_lwe_ciphertext_in =
CudaLweCiphertextList::from_lwe_ciphertext(&lwe_ciphertext_in, &stream);
let mut d_out_pbs_ct = CudaLweCiphertextList::new(
output_lwe_dimension,
LweCiphertextCount(1),
ciphertext_modulus,
&stream,
);
let d_accumulator = CudaGlweCiphertextList::from_glwe_ciphertext(&accumulator, &stream);
let mut test_vector_indexes: Vec<Scalar> = vec![Scalar::ZERO; number_of_messages];
for (i, ind) in test_vector_indexes.iter_mut().enumerate() {
*ind = <usize as CastInto<Scalar>>::cast_into(i);
}
let mut d_test_vector_indexes =
unsafe { CudaVec::<Scalar>::new_async(number_of_messages, &stream, 0) };
unsafe { d_test_vector_indexes.copy_from_cpu_async(&test_vector_indexes, &stream, 0) };
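// Build identity input/output index mappings: ciphertext i is read from slot i
// and written back to slot i.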
let num_blocks = d_lwe_ciphertext_in.0.lwe_ciphertext_count.0;
let lwe_indexes_usize: Vec<usize> = (0..num_blocks).collect_vec();
let lwe_indexes = lwe_indexes_usize
.iter()
.map(|&x| <usize as CastInto<Scalar>>::cast_into(x))
.collect_vec();
let mut d_output_indexes = unsafe { CudaVec::<Scalar>::new_async(num_blocks, &stream, 0) };
let mut d_input_indexes = unsafe { CudaVec::<Scalar>::new_async(num_blocks, &stream, 0) };
unsafe {
d_input_indexes.copy_from_cpu_async(&lwe_indexes, &stream, 0);
d_output_indexes.copy_from_cpu_async(&lwe_indexes, &stream, 0);
}
cuda_programmable_bootstrap_lwe_ciphertext(
&d_lwe_ciphertext_in,
&mut d_out_pbs_ct,
&d_accumulator,
&d_test_vector_indexes,
&d_output_indexes,
&d_input_indexes,
LweCiphertextCount(num_blocks),
&d_bsk,
&stream,
);
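// Copy the GPU result back to the host for decryption.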
pbs_ct = d_out_pbs_ct.into_lwe_ciphertext(&stream);
// fourier_bsk.bootstrap(
// &mut pbs_ct,
// &lwe_ciphertext_in,
// &accumulator,
// &fft,
// PodStack::new(&mut GlobalPodBuffer::new(
// K::bootstrap_scratch(
// std_bootstrapping_key.glwe_size(),
// std_bootstrapping_key.polynomial_size(),
// &fft,
// )
// .unwrap(),
// )),
// );
// Decrypt the PBS result
let pbs_plaintext: Plaintext<Scalar> = decrypt_lwe_ciphertext(&big_lwe_sk, &pbs_ct);
// Create a SignedDecomposer to perform the rounding of the decrypted plaintext
// We pass a DecompositionBaseLog of 5 and a DecompositionLevelCount of 1 indicating we want
// to round the 5 MSB, 1 bit of padding plus our 4 bits of message
let signed_decomposer =
SignedDecomposer::new(DecompositionBaseLog(5), DecompositionLevelCount(1));
// Round and remove our encoding
let pbs_result: Scalar = signed_decomposer.closest_representable(pbs_plaintext.0) / delta;
println!("Checking result...");
assert_eq!(f(input_message), pbs_result);
}
#[test]
fn test_bootstrap_u128() {
execute_bootstrap_u128::<u128, Fourier128LweBootstrapKeyOwned>(FFT128_U128_GPU_PARAMS);
}

View File

@@ -7,6 +7,7 @@ mod lwe_linear_algebra;
mod lwe_multi_bit_programmable_bootstrapping;
mod lwe_packing_keyswitch;
mod lwe_programmable_bootstrapping;
mod lwe_programmable_bootstrapping_128;
mod noise_distribution;
pub struct CudaPackingKeySwitchKeys<Scalar: UnsignedInteger> {

View File

@@ -32,11 +32,22 @@ impl CudaLweBootstrapKey {
where
InputBskCont::Element: UnsignedInteger,
{
println!("#1");
let input_lwe_dimension = bsk.input_lwe_dimension();
let polynomial_size = bsk.polynomial_size();
let decomp_level_count = bsk.decomposition_level_count();
let decomp_base_log = bsk.decomposition_base_log();
let glwe_dimension = bsk.glwe_size().to_glwe_dimension();
let split_count = if size_of::<InputBskCont::Element>() == 16 {
2
} else {
1
};
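// A u128 key stores each complex Fourier coefficient as double-double (hi/lo)
// pairs, so it needs twice the f64 storage of a 64-bit key.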
println!("split_count: {:?}", split_count);
println!("input_lwe_dimension: {:?}", input_lwe_dimension);
println!("polynomial_size: {:?}", polynomial_size);
println!("decomp_level_count: {:?}", decomp_level_count);
println!("glwe_dimension: {:?}", glwe_dimension);
// Allocate memory
let mut d_vec = CudaVec::<f64>::new_multi_gpu(
@@ -45,10 +56,11 @@ impl CudaLweBootstrapKey {
glwe_dimension.to_glwe_size(),
polynomial_size,
decomp_level_count,
),
) * split_count,
streams,
);
// Copy to the GPU
unsafe {
convert_lwe_programmable_bootstrap_key_async(
streams,
@@ -59,7 +71,7 @@ impl CudaLweBootstrapKey {
decomp_level_count,
polynomial_size,
);
}
};
streams.synchronize();
Self {
d_vec,

View File

@@ -10,6 +10,7 @@ use crate::core_crypto::prelude::{
UnsignedInteger,
};
pub use algorithms::*;
use core::mem;
pub use entities::*;
use std::ffi::c_void;
use tfhe_cuda_backend::bindings::*;
@@ -121,41 +122,84 @@ pub unsafe fn programmable_bootstrap_async<T: UnsignedInteger>(
let num_many_lut = 1u32;
let lut_stride = 0u32;
let mut pbs_buffer: *mut i8 = std::ptr::null_mut();
scratch_cuda_programmable_bootstrap_64(
streams.ptr[0],
streams.gpu_indexes[0].get(),
std::ptr::addr_of_mut!(pbs_buffer),
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
level.0 as u32,
num_samples,
true,
);
cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
streams.ptr[0],
streams.gpu_indexes[0].get(),
lwe_array_out.as_mut_c_ptr(0),
lwe_out_indexes.as_c_ptr(0),
test_vector.as_c_ptr(0),
test_vector_indexes.as_c_ptr(0),
lwe_array_in.as_c_ptr(0),
lwe_in_indexes.as_c_ptr(0),
bootstrapping_key.as_c_ptr(0),
pbs_buffer,
lwe_dimension.0 as u32,
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
base_log.0 as u32,
level.0 as u32,
num_samples,
num_many_lut,
lut_stride,
);
cleanup_cuda_programmable_bootstrap(
streams.ptr[0],
streams.gpu_indexes[0].get(),
std::ptr::addr_of_mut!(pbs_buffer),
);
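// 128-bit torus elements (u128, 16 bytes) go through the dedicated 128-bit
// scratch/PBS/cleanup entry points; all other widths use the 64-bit path.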
if size_of::<T>() == 16 {
scratch_cuda_programmable_bootstrap_128(
streams.ptr[0],
streams.gpu_indexes[0].get(),
std::ptr::addr_of_mut!(pbs_buffer),
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
level.0 as u32,
num_samples,
true,
);
cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
streams.ptr[0],
streams.gpu_indexes[0].get(),
lwe_array_out.as_mut_c_ptr(0),
lwe_out_indexes.as_c_ptr(0),
test_vector.as_c_ptr(0),
test_vector_indexes.as_c_ptr(0),
lwe_array_in.as_c_ptr(0),
lwe_in_indexes.as_c_ptr(0),
bootstrapping_key.as_c_ptr(0),
pbs_buffer,
lwe_dimension.0 as u32,
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
base_log.0 as u32,
level.0 as u32,
num_samples,
num_many_lut,
lut_stride,
);
cleanup_cuda_programmable_bootstrap_128(
streams.ptr[0],
streams.gpu_indexes[0].get(),
std::ptr::addr_of_mut!(pbs_buffer),
);
} else {
scratch_cuda_programmable_bootstrap_64(
streams.ptr[0],
streams.gpu_indexes[0].get(),
std::ptr::addr_of_mut!(pbs_buffer),
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
level.0 as u32,
num_samples,
true,
);
cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
streams.ptr[0],
streams.gpu_indexes[0].get(),
lwe_array_out.as_mut_c_ptr(0),
lwe_out_indexes.as_c_ptr(0),
test_vector.as_c_ptr(0),
test_vector_indexes.as_c_ptr(0),
lwe_array_in.as_c_ptr(0),
lwe_in_indexes.as_c_ptr(0),
bootstrapping_key.as_c_ptr(0),
pbs_buffer,
lwe_dimension.0 as u32,
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
base_log.0 as u32,
level.0 as u32,
num_samples,
num_many_lut,
lut_stride,
);
cleanup_cuda_programmable_bootstrap(
streams.ptr[0],
streams.gpu_indexes[0].get(),
std::ptr::addr_of_mut!(pbs_buffer),
);
}
}
/// Programmable multi-bit bootstrap on a vector of LWE ciphertexts
@@ -342,18 +386,38 @@ pub unsafe fn convert_lwe_programmable_bootstrap_key_async<T: UnsignedInteger>(
polynomial_size: PolynomialSize,
) {
let size = std::mem::size_of_val(src);
println!("#1.5");
for (i, &stream_ptr) in streams.ptr.iter().enumerate() {
assert_eq!(dest.len() * std::mem::size_of::<T>(), size);
cuda_convert_lwe_programmable_bootstrap_key_64(
stream_ptr,
streams.gpu_indexes[i].get(),
dest.as_mut_c_ptr(i as u32),
src.as_ptr().cast(),
input_lwe_dim.0 as u32,
glwe_dim.0 as u32,
l_gadget.0 as u32,
polynomial_size.0 as u32,
);
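// Dispatch on the torus width: u128 keys use the dedicated 128-bit conversion
// entry point, everything else the 64-bit one.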
println!("#1.6");
println!("dest.len(): {:?}", dest.len());
println!("std::mem::size_of::<T>() {:?}", std::mem::size_of::<T>());
println!("size: {:?}", size);
if size_of::<T>() == 16 {
println!("#2");
cuda_convert_lwe_programmable_bootstrap_key_128(
stream_ptr,
streams.gpu_indexes[i].get(),
dest.as_mut_c_ptr(i as u32),
src.as_ptr().cast(),
input_lwe_dim.0 as u32,
glwe_dim.0 as u32,
l_gadget.0 as u32,
polynomial_size.0 as u32,
);
} else {
println!("#3");
cuda_convert_lwe_programmable_bootstrap_key_64(
stream_ptr,
streams.gpu_indexes[i].get(),
dest.as_mut_c_ptr(i as u32),
src.as_ptr().cast(),
input_lwe_dim.0 as u32,
glwe_dim.0 as u32,
l_gadget.0 as u32,
polynomial_size.0 as u32,
);
}
}
}