mirror of
https://github.com/zama-ai/concrete.git
synced 2026-02-08 19:44:57 -05:00
chore(cuda): Remove old comments from concrete-cuda.
This commit is contained in:
@@ -12,7 +12,6 @@
|
||||
#include "../include/helper_cuda.h"
|
||||
#include "bootstrap.h"
|
||||
#include "complex/operations.cuh"
|
||||
//#include "crypto/bootstrapping_key.cuh"
|
||||
#include "crypto/gadget.cuh"
|
||||
#include "crypto/torus.cuh"
|
||||
#include "fft/bnsmfft.cuh"
|
||||
@@ -83,7 +82,6 @@ __global__ void device_bootstrap_amortized(
|
||||
// polynomials take coefficients between -B/2 and B/2 they can be represented
|
||||
// with only 16 bits, assuming the base log does not exceed 2^16
|
||||
int16_t *accumulator_mask_decomposed = (int16_t *)selected_memory;
|
||||
// TODO (Agnes) why not the 16 bits representation here?
|
||||
int16_t *accumulator_body_decomposed =
|
||||
(int16_t *)accumulator_mask_decomposed + polynomial_size;
|
||||
Torus *accumulator_mask = (Torus *)accumulator_body_decomposed +
|
||||
@@ -103,28 +101,11 @@ __global__ void device_bootstrap_amortized(
|
||||
accumulator_fft =
|
||||
(double2 *)body_res_fft + (ptrdiff_t)(polynomial_size / 2);
|
||||
|
||||
/*
|
||||
int dif0 = ((char*)accumulator_body_decomposed - (char*)selected_memory);
|
||||
int dif1 = ((char*)accumulator_mask - (char*)accumulator_body_decomposed);
|
||||
int dif2 = ((char*)accumulator_body - (char*)accumulator_mask);
|
||||
int dif3 = ((char*)accumulator_mask_rotated - (char*)accumulator_body);
|
||||
int dif4 = ((char*)accumulator_body_rotated -
|
||||
(char*)accumulator_mask_rotated); int dif5 = ((char*)mask_res_fft -
|
||||
(char*)accumulator_body_rotated); int dif6 = ((char*)body_res_fft -
|
||||
(char*)mask_res_fft); int dif7 = (SMD != PARTIALSM)? (char*)accumulator_fft -
|
||||
(char*)body_res_fft:0; if (threadIdx.x == 0 && blockIdx.x == 0) {
|
||||
printf("device and shared mem: %d %d %d %d %d %d %d %d\n ",dif0, dif1, dif2,
|
||||
dif3, dif4, dif5, dif6, dif7);
|
||||
}
|
||||
*/
|
||||
|
||||
auto block_lwe_in = &lwe_in[blockIdx.x * (lwe_mask_size + 1)];
|
||||
Torus *block_lut_vector =
|
||||
&lut_vector[lut_vector_indexes[lwe_idx + blockIdx.x] * params::degree * 2];
|
||||
|
||||
// TODO (Agnes) try to store the gadget matrix in const memory to see if
|
||||
// register use decreases Since all const mem is used for twiddles currently,
|
||||
// it would mean moving some of them to global memory instead
|
||||
|
||||
GadgetMatrix<Torus, params> gadget(base_log, l_gadget);
|
||||
|
||||
// Put "b", the body, in [0, 2N[
|
||||
@@ -145,7 +126,6 @@ __global__ void device_bootstrap_amortized(
|
||||
// into l_gadget polynomials, and performing polynomial multiplication
|
||||
// via an FFT with the RGSW encrypted secret key
|
||||
for (int iteration = 0; iteration < lwe_mask_size; iteration++) {
|
||||
// TODO make sure that following sync is necessary
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// Put "a" in [0, 2N[ instead of Zq
|
||||
@@ -153,18 +133,6 @@ __global__ void device_bootstrap_amortized(
|
||||
block_lwe_in[iteration],
|
||||
2 * params::degree); // 2 * params::log2_degree + 1);
|
||||
|
||||
// TODO (Agnes) why is there this if condition?
|
||||
if (a_hat == 0) {
|
||||
// todo(Joao): **cannot use this optimization**
|
||||
// the reason is that one of the input ciphertexts (blockIdx.z)
|
||||
// might skip an iteration while others don't, which as a result
|
||||
// will make that block not call the grid.sync(), causing a deadlock;
|
||||
// maybe it's a workaround to add grid.sync() here, but not sure if
|
||||
// there are any edge cases?
|
||||
|
||||
// continue
|
||||
}
|
||||
|
||||
// Perform ACC * (X^ä - 1)
|
||||
multiply_by_monomial_negacyclic_and_sub_polynomial<
|
||||
Torus, params::opt, params::degree / params::opt>(
|
||||
@@ -200,7 +168,6 @@ __global__ void device_bootstrap_amortized(
|
||||
// Now that the rotation is done, decompose the resulting polynomial
|
||||
// coefficients so as to multiply each decomposed level with the
|
||||
// corresponding part of the bootstrapping key
|
||||
// TODO (Agnes) explain why we do that for the mask and body separately
|
||||
for (int decomp_level = 0; decomp_level < l_gadget; decomp_level++) {
|
||||
|
||||
gadget.decompose_one_level(accumulator_mask_decomposed,
|
||||
@@ -227,8 +194,6 @@ __global__ void device_bootstrap_amortized(
|
||||
|
||||
// Get the bootstrapping key piece necessary for the multiplication
|
||||
// It is already in the Fourier domain
|
||||
// TODO (Agnes) Explain why for the mask polynomial multiplication
|
||||
// we need the bsk_body_slice and vice versa
|
||||
auto bsk_mask_slice = PolynomialFourier<double2, params>(
|
||||
get_ith_mask_kth_block(
|
||||
bootstrapping_key, iteration, 0, decomp_level,
|
||||
@@ -241,7 +206,7 @@ __global__ void device_bootstrap_amortized(
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// Perform the coefficient-wise product with the two pieces of
|
||||
// bootstrapping key TODO (Agnes) why two pieces?
|
||||
// bootstrapping key
|
||||
polynomial_product_accumulate_in_fourier_domain(
|
||||
mask_res_fft, accumulator_fft, bsk_mask_slice);
|
||||
polynomial_product_accumulate_in_fourier_domain(
|
||||
@@ -333,7 +298,7 @@ __global__ void device_bootstrap_amortized(
|
||||
// The blind rotation for this block is over
|
||||
// Now we can perform the sample extraction: for the body it's just
|
||||
// the resulting constant coefficient of the accumulator
|
||||
// For the mask it's more complicated TODO (Agnes) explain why
|
||||
// For the mask it's more complicated
|
||||
sample_extract_mask<Torus, params>(block_lwe_out, accumulator_mask);
|
||||
sample_extract_body<Torus, params>(block_lwe_out, accumulator_body);
|
||||
}
|
||||
@@ -380,11 +345,6 @@ __host__ void host_bootstrap_amortized(
|
||||
// handles opt polynomial coefficients
|
||||
// (actually opt/2 coefficients since we compress the real polynomial into a
|
||||
// complex)
|
||||
// TODO (Agnes) Polynomial size / params::opt should be equal to 256 or 512
|
||||
// probably, maybe 1024 would be too big?
|
||||
// Or would it actually be good in our case to have the largest possible
|
||||
// number of threads per block since anyway few blocks will run
|
||||
// concurrently?
|
||||
dim3 grid(input_lwe_ciphertext_count, 1, 1);
|
||||
dim3 thds(polynomial_size / params::opt, 1, 1);
|
||||
|
||||
@@ -426,7 +386,6 @@ __host__ void host_bootstrap_amortized(
|
||||
device_bootstrap_amortized<Torus, params, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize,
|
||||
SM_FULL));
|
||||
// TODO (Agnes): is this necessary?
|
||||
checkCudaErrors(cudaFuncSetCacheConfig(
|
||||
device_bootstrap_amortized<Torus, params, FULLSM>,
|
||||
cudaFuncCachePreferShared));
|
||||
@@ -454,7 +413,6 @@ int cuda_get_pbs_per_gpu(int polynomial_size) {
|
||||
int num_threads = polynomial_size / params::opt;
|
||||
cudaGetDeviceCount(0);
|
||||
cudaDeviceProp device_properties;
|
||||
// FIXME: here we assume every device has same properties
|
||||
cudaGetDeviceProperties(&device_properties, 0);
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&blocks_per_sm, device_bootstrap_amortized<Torus, params>,
|
||||
|
||||
@@ -48,12 +48,11 @@
|
||||
* - switch to the FFT domain
|
||||
* - multiply with the bootstrapping key
|
||||
* - come back to the coefficients representation
|
||||
* - between each stage a synchronization of the threads is necessary TODO
|
||||
* (Agnes) check this
|
||||
* - between each stage a synchronization of the threads is necessary
|
||||
* - in case the device has enough shared memory, temporary arrays used for
|
||||
* the different stages (accumulators) are stored into the shared memory
|
||||
* - the accumulators serve to combine the results for all decomposition
|
||||
* levels TODO (Agnes) check this
|
||||
* levels
|
||||
* - the constant memory (64K) is used for storing the roots of identity
|
||||
* values for the FFT
|
||||
*/
|
||||
|
||||
@@ -23,8 +23,7 @@
|
||||
#include "utils/memory.cuh"
|
||||
#include "utils/timer.cuh"
|
||||
|
||||
// Cooperative groups are used in the low latency
|
||||
// version of the bootstrapping
|
||||
// Cooperative groups are used in the low latency PBS
|
||||
using namespace cooperative_groups;
|
||||
namespace cg = cooperative_groups;
|
||||
|
||||
@@ -58,11 +57,6 @@ mul_trgsw_trlwe(Torus *accumulator,
|
||||
// needed to perform the external product in this block (corresponding to
|
||||
// the same decomposition level)
|
||||
|
||||
// auto bsk_mask_slice = bootstrapping_key.get_ith_mask_kth_block(
|
||||
// gpu_num, iteration, blockIdx.y, blockIdx.x);
|
||||
// auto bsk_body_slice = bootstrapping_key.get_ith_body_kth_block(
|
||||
// gpu_num, iteration, blockIdx.y, blockIdx.x);
|
||||
|
||||
auto bsk_mask_slice = PolynomialFourier<double2, params>(
|
||||
get_ith_mask_kth_block(
|
||||
bootstrapping_key, iteration, blockIdx.y, blockIdx.x,
|
||||
@@ -195,7 +189,6 @@ __global__ void device_bootstrap_low_latency(
|
||||
// Since the space is L1 cache is small, we use the same memory location for
|
||||
// the rotated accumulator and the fft accumulator, since we know that the
|
||||
// rotated array is not in use anymore by the time we perform the fft
|
||||
|
||||
GadgetMatrix<Torus, params> gadget(base_log, l_gadget);
|
||||
|
||||
// Put "b" in [0, 2N[
|
||||
@@ -222,17 +215,6 @@ __global__ void device_bootstrap_low_latency(
|
||||
block_lwe_in[i],
|
||||
2 * params::degree); // 2 * params::log2_degree + 1);
|
||||
|
||||
if (a_hat == 0) {
|
||||
// todo(Joao): **cannot use this optimization**
|
||||
// the reason is that one of the input ciphertexts (blockIdx.z)
|
||||
// might skip an iteration while others don't, which as a result
|
||||
// will make that block not call the grid.sync(), causing a deadlock;
|
||||
// maybe it's a workaround to add grid.sync() here, but not sure if
|
||||
// there are any edge cases?
|
||||
|
||||
// continue
|
||||
}
|
||||
|
||||
// Perform ACC * (X^ä - 1)
|
||||
multiply_by_monomial_negacyclic_and_sub_polynomial<
|
||||
Torus, params::opt, params::degree / params::opt>(
|
||||
@@ -245,8 +227,6 @@ __global__ void device_bootstrap_low_latency(
|
||||
params::degree / params::opt>(
|
||||
accumulator_rotated, base_log, l_gadget);
|
||||
|
||||
|
||||
|
||||
// Decompose the accumulator. Each block gets one level of the
|
||||
// decomposition, for the mask and the body (so block 0 will have the
|
||||
// accumulator decomposed at level 0, 1 at 1, etc.)
|
||||
|
||||
@@ -337,7 +337,6 @@ void host_cmux_tree(
|
||||
device_batch_cmux<Torus, STorus, params, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize,
|
||||
memory_needed_per_block));
|
||||
// TODO (Agnes): is this necessary?
|
||||
checkCudaErrors(cudaFuncSetCacheConfig(
|
||||
device_batch_cmux<Torus, STorus, params, FULLSM>,
|
||||
cudaFuncCachePreferShared));
|
||||
|
||||
@@ -83,9 +83,7 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
|
||||
|
||||
int gridSize = total_polynomials;
|
||||
int blockSize = polynomial_size / choose_opt(polynomial_size);
|
||||
// todo(Joao): let's use cudaMallocHost here,
|
||||
// since it allocates page-staged memory which allows
|
||||
// faster data copy
|
||||
|
||||
double2 *h_bsk = (double2 *)malloc(buffer_size);
|
||||
double2 *d_bsk;
|
||||
cudaMalloc((void **)&d_bsk, buffer_size);
|
||||
@@ -110,7 +108,6 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
|
||||
|
||||
auto stream = static_cast<cudaStream_t *>(v_stream);
|
||||
switch (polynomial_size) {
|
||||
// FIXME (Agnes): check if polynomial sizes are ok
|
||||
case 512:
|
||||
batch_NSMFFT<FFTDegree<Degree<512>, ForwardFFT>>
|
||||
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest);
|
||||
|
||||
@@ -36,8 +36,6 @@ __device__ inline T round_to_closest_multiple(T x, uint32_t base_log,
|
||||
template <typename T>
|
||||
__device__ __forceinline__ T rescale_torus_element(T element,
|
||||
uint32_t log_shift) {
|
||||
// todo(Joao): not sure if this works
|
||||
// return element >> log_shift;
|
||||
return round((double)element / (double(std::numeric_limits<T>::max()) + 1.0) *
|
||||
(double)log_shift);
|
||||
}
|
||||
|
||||
@@ -80,7 +80,7 @@ template <class params> __device__ void NSMFFT_direct(double2 *A) {
|
||||
* Each thread is always in charge of "opt/2" pairs of coefficients,
|
||||
* which is why we always loop through N/2 by N/opt strides
|
||||
* The pragma unroll instruction tells the compiler to unroll the
|
||||
* full loop, which should increase performance TODO (Agnes) check this
|
||||
* full loop, which should increase performance
|
||||
*/
|
||||
bit_reverse_inplace<params>(A);
|
||||
__syncthreads();
|
||||
@@ -113,8 +113,6 @@ template <class params> __device__ void NSMFFT_direct(double2 *A) {
|
||||
// between groups of 4 coefficients
|
||||
// k=2, \zeta=exp(i pi/4) for even coefficients and
|
||||
// exp(3 i pi / 4) for odd coefficients
|
||||
// TODO (Agnes) how does this work on the gpu? aren't we doing
|
||||
// a lot more computations than we should?
|
||||
tid = threadIdx.x;
|
||||
// odd = 0 for even coefficients, 1 for odd coefficients
|
||||
int odd = tid & 1;
|
||||
@@ -371,7 +369,7 @@ template <class params> __device__ void NSMFFT_inverse(double2 *A) {
|
||||
* Each thread is always in charge of "opt/2" pairs of coefficients,
|
||||
* which is why we always loop through N/2 by N/opt strides
|
||||
* The pragma unroll instruction tells the compiler to unroll the
|
||||
* full loop, which should increase performance TODO (Agnes) check this
|
||||
* full loop, which should increase performance
|
||||
*/
|
||||
int tid;
|
||||
int i1, i2;
|
||||
@@ -589,8 +587,6 @@ template <class params> __device__ void NSMFFT_inverse(double2 *A) {
|
||||
// between groups of 4 coefficients
|
||||
// k=2, \zeta=exp(i pi/4) for even coefficients and
|
||||
// exp(3 i pi / 4) for odd coefficients
|
||||
// TODO (Agnes) how does this work on the gpu? aren't we doing
|
||||
// a lot more computations than we should?
|
||||
tid = threadIdx.x;
|
||||
// odd = 0 for even coefficients, 1 for odd coefficients
|
||||
int odd = tid & 1;
|
||||
@@ -602,7 +598,6 @@ template <class params> __device__ void NSMFFT_inverse(double2 *A) {
|
||||
i1 = (tid << 1) - odd;
|
||||
i2 = i1 + 2;
|
||||
|
||||
// TODO(Beka) optimize twiddle multiplication
|
||||
double2 w;
|
||||
if (odd) {
|
||||
w.x = -0.707106781186547461715008466854;
|
||||
@@ -629,7 +624,6 @@ template <class params> __device__ void NSMFFT_inverse(double2 *A) {
|
||||
// of coefficients, with a stride of 2
|
||||
i1 = tid << 1;
|
||||
i2 = i1 + 1;
|
||||
// TODO(Beka) optimize twiddle multiplication
|
||||
double2 w = {0, -1};
|
||||
u = A[i1], v = A[i2];
|
||||
A[i1] = (u + v) * 0.5;
|
||||
|
||||
@@ -2,14 +2,6 @@
|
||||
#ifndef GPU_BOOTSTRAP_TWIDDLES_CUH
|
||||
#define GPU_BOOTSTRAP_TWIDDLES_CUH
|
||||
|
||||
// TODO (Agnes) depending on the device architecture
|
||||
// can we make more of them __constant__?
|
||||
// Do we have to define them all regardless of the
|
||||
// polynomial degree and q values?
|
||||
|
||||
// TODO (Beka) make those two arrays with dynamic size
|
||||
// or find exact maximum for 8192 length poly it shuld
|
||||
// be less than 2048
|
||||
extern __constant__ short SW1[2048];
|
||||
extern __constant__ short SW2[2048];
|
||||
|
||||
|
||||
@@ -136,8 +136,6 @@ __host__ void cuda_keyswitch_lwe_ciphertext_vector(void *v_stream, Torus *lwe_ou
|
||||
lwe_upper = (int)ceil((double)lwe_dim / (double)ideal_threads);
|
||||
}
|
||||
|
||||
// int lwe_size_before =
|
||||
// (lwe_dimension_before + 1) * num_samples;
|
||||
int lwe_size_after =
|
||||
(lwe_dimension_after + 1) * num_samples;
|
||||
|
||||
|
||||
@@ -166,7 +166,6 @@ __device__ void add_to_torus(double2 *m_values, Torus *result) {
|
||||
Torus mx = (sizeof(Torus) == 4) ? UINT32_MAX : UINT64_MAX;
|
||||
int tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
// TODO (Beka) check if better memory access is possible
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
double v1 = m_values[tid].x;
|
||||
double v2 = m_values[tid].y;
|
||||
@@ -194,8 +193,6 @@ __device__ void add_to_torus(double2 *m_values, Torus *result) {
|
||||
template <typename Torus, class params>
|
||||
__device__ void sample_extract_body(Torus *lwe_out, Torus *accumulator) {
|
||||
// Set first coefficient of the accumulator as the body of the LWE sample
|
||||
// todo(Joao): not every thread needs to set it
|
||||
// if (threadIdx.x == 0)
|
||||
lwe_out[params::degree] = accumulator[0];
|
||||
}
|
||||
|
||||
|
||||
@@ -50,8 +50,7 @@ public:
|
||||
int chunk_size) {
|
||||
int pos = chunk_num * chunk_size;
|
||||
T *ptr = &m_data[pos];
|
||||
// todo(Joao): unsafe, user must pass chunk that has size multiple of
|
||||
// polynomial degree
|
||||
|
||||
return VectorPolynomial<T, params>(ptr, chunk_size / params::degree);
|
||||
}
|
||||
|
||||
@@ -88,8 +87,6 @@ public:
|
||||
synchronize_threads_in_block();
|
||||
}
|
||||
|
||||
// todo(Joao): we need to make these APIs more clear, as it's confusing what's
|
||||
// being copied where
|
||||
__device__ void copy_into_ith_polynomial(PolynomialFourier<T, params> &source,
|
||||
int i) {
|
||||
int tid = threadIdx.x;
|
||||
@@ -160,22 +157,6 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
__device__ void add_polynomial_inplace(PolynomialFourier<T, params> &source,
|
||||
int begin) {
|
||||
int tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
this->m_values[tid] += source.m_values[tid + begin];
|
||||
tid = tid + params::degree / params::opt;
|
||||
}
|
||||
if (threadIdx.x == 0) {
|
||||
this->m_values[params::degree / 2] += source.m_values[params::degree / 2 +
|
||||
begin];
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
__device__ void swap_quarters_inplace() {
|
||||
int tid = threadIdx.x;
|
||||
int s1 = params::quarter;
|
||||
@@ -202,20 +183,6 @@ begin];
|
||||
}
|
||||
}
|
||||
|
||||
__device__ void
|
||||
forward_negacyclic_fft_inplace(PolynomialFourier<double2, params> &X) {
|
||||
// TODO function should be removed
|
||||
}
|
||||
|
||||
__device__ void inverse_negacyclic_fft_inplace() {
|
||||
// TODO function should be removed
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__device__ void add_to_torus(Polynomial<Torus, params> &result) {
|
||||
// TODO function should be removed
|
||||
}
|
||||
|
||||
__device__ T &operator[](int i) { return m_values[i]; }
|
||||
};
|
||||
|
||||
@@ -474,18 +441,6 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
__device__ void add_polynomial_inplace(Polynomial<T, params> &source,
|
||||
int begin) {
|
||||
int tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt; i++) {
|
||||
this->coefficients[tid] += source.coefficients[tid + begin];
|
||||
tid = tid + params::degree / params::opt;
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
__device__ void sub_polynomial_inplace(Polynomial<T, params> &rhs) {
|
||||
int tid = threadIdx.x;
|
||||
const int grid_dim = blockDim.x;
|
||||
@@ -543,11 +498,6 @@ public:
|
||||
synchronize_threads_in_block();
|
||||
}
|
||||
|
||||
template <typename FT>
|
||||
__device__ void
|
||||
forward_negacyclic_fft_half(PolynomialFourier<FT, params> &result) {
|
||||
// TODO function should be removed
|
||||
}
|
||||
};
|
||||
template <typename T, class params> class Vector {
|
||||
public:
|
||||
@@ -624,7 +574,6 @@ public:
|
||||
|
||||
__device__ void set_last_element(T elem) { m_data[m_size - 1] = elem; }
|
||||
|
||||
// todo(Joao): let's do coalesced access here at some point
|
||||
__device__ void operator-=(const Vector<T, params> &rhs) {
|
||||
assert(m_size == rhs->m_size);
|
||||
int tid = threadIdx.x;
|
||||
|
||||
Reference in New Issue
Block a user