From bc90576454e855e93f8d5cd0a11ae899efdc3e8d Mon Sep 17 00:00:00 2001 From: Beka Barbakadze Date: Wed, 11 Jan 2023 16:14:33 +0400 Subject: [PATCH] docs(cuda): add Rust doc for all concrete-cuda entry points --- include/bootstrap.h | 1 + src/addition.cu | 63 ++++++++++++++++ src/bit_extraction.cu | 48 ++++++++++++ src/bit_extraction.cuh | 65 +++++++++++------ src/bootstrap_amortized.cu | 122 +++++++++++++++++-------------- src/bootstrap_low_latency.cu | 137 +++++++++++++++++++++-------------- src/circuit_bootstrap.cu | 40 ++++++++++ src/circuit_bootstrap.cuh | 37 ++++++++-- src/keyswitch.cu | 49 +++++++++---- src/multiplication.cu | 32 ++++++++ src/negation.cu | 27 +++++++ src/vertical_packing.cu | 54 ++++++++++++++ src/wop_bootstrap.cu | 61 ++++++++++++++++ 13 files changed, 585 insertions(+), 151 deletions(-) diff --git a/include/bootstrap.h b/include/bootstrap.h index da28cde2b..3ba371236 100644 --- a/include/bootstrap.h +++ b/include/bootstrap.h @@ -86,6 +86,7 @@ void cuda_extract_bits_64( uint32_t glwe_dimension, uint32_t base_log_bsk, uint32_t level_count_bsk, uint32_t base_log_ksk, uint32_t level_count_ksk, uint32_t number_of_samples, uint32_t max_shared_memory); + void cuda_circuit_bootstrap_32( void *v_stream, uint32_t gpu_index, void *ggsw_out, void *lwe_array_in, void *fourier_bsk, void *fp_ksk_array, void *lwe_array_in_shifted_buffer, diff --git a/src/addition.cu b/src/addition.cu index fcaa669f0..c998c5de2 100644 --- a/src/addition.cu +++ b/src/addition.cu @@ -1,5 +1,9 @@ #include "addition.cuh" +/* + * Perform the addition of two u32 input LWE ciphertext vectors. + * See the equivalent operation on u64 ciphertexts for more details. + */ void cuda_add_lwe_ciphertext_vector_32(void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in_1, @@ -12,6 +16,33 @@ void cuda_add_lwe_ciphertext_vector_32(void *v_stream, uint32_t gpu_index, static_cast(lwe_array_in_2), input_lwe_dimension, input_lwe_ciphertext_count); } + +/* + * Perform the addition of two u64 input LWE ciphertext vectors. + * - `v_stream` is a void pointer to the Cuda stream to be used in the kernel + * launch + * - `gpu_index` is the index of the GPU to be used in the kernel launch + * - `lwe_array_out` is an array of size + * `(input_lwe_dimension + 1) * input_lwe_ciphertext_count` that should have + * been allocated on the GPU before calling this function, and that will hold + * the result of the computation. + * - `lwe_array_in_1` is the first LWE ciphertext vector used as input, it + * should have been allocated and initialized before calling this function. It + * has the same size as the output array. + * - `lwe_array_in_2` is the second LWE ciphertext vector used as input, it + * should have been allocated and initialized before calling this function. It + * has the same size as the output array. + * - `input_lwe_dimension` is the number of mask elements in the two input and + * in the output ciphertext vectors + * - `input_lwe_ciphertext_count` is the number of ciphertexts contained in each + * input LWE ciphertext vector, as well as in the output. + * + * Each element (mask element or body) of the input LWE ciphertext vector 1 is + * added to the corresponding element in the input LWE ciphertext 2. The result + * is stored in the output LWE ciphertext vector. The two input LWE ciphertext + * vectors are left unchanged. This function is a wrapper to a device function + * that performs the operation on the GPU. 
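+ *
+ * A minimal usage sketch (illustrative only: the stream setup and the device
+ * buffer names d_out, d_in_1 and d_in_2 are assumptions, not part of this API;
+ * the buffers are expected to be allocated on the GPU and filled beforehand):
+ *
+ *   cudaStream_t stream;
+ *   cudaStreamCreate(&stream);
+ *   cuda_add_lwe_ciphertext_vector_64(&stream, 0, d_out, d_in_1, d_in_2,
+ *                                     input_lwe_dimension,
+ *                                     input_lwe_ciphertext_count);
+ *   cudaStreamSynchronize(stream);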
+ */ void cuda_add_lwe_ciphertext_vector_64(void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in_1, @@ -24,6 +55,10 @@ void cuda_add_lwe_ciphertext_vector_64(void *v_stream, uint32_t gpu_index, static_cast(lwe_array_in_2), input_lwe_dimension, input_lwe_ciphertext_count); } +/* + * Perform the addition of a u32 input LWE ciphertext vector with a u32 + * plaintext vector. See the equivalent operation on u64 data for more details. + */ void cuda_add_lwe_ciphertext_vector_plaintext_vector_32( void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in, void *plaintext_array_in, uint32_t input_lwe_dimension, @@ -35,6 +70,34 @@ void cuda_add_lwe_ciphertext_vector_plaintext_vector_32( static_cast(plaintext_array_in), input_lwe_dimension, input_lwe_ciphertext_count); } +/* + * Perform the addition of a u64 input LWE ciphertext vector with a u64 input + * plaintext vector. + * - `v_stream` is a void pointer to the Cuda stream to be used in the kernel + * launch + * - `gpu_index` is the index of the GPU to be used in the kernel launch + * - `lwe_array_out` is an array of size + * `(input_lwe_dimension + 1) * input_lwe_ciphertext_count` that should have + * been allocated on the GPU before calling this function, and that will hold + * the result of the computation. + * - `lwe_array_in` is the LWE ciphertext vector used as input, it should have + * been allocated and initialized before calling this function. It has the same + * size as the output array. + * - `plaintext_array_in` is the plaintext vector used as input, it should have + * been allocated and initialized before calling this function. It should be of + * size `input_lwe_ciphertext_count`. + * - `input_lwe_dimension` is the number of mask elements in the input and + * output LWE ciphertext vectors + * - `input_lwe_ciphertext_count` is the number of ciphertexts contained in the + * input LWE ciphertext vector, as well as in the output. It is also the number + * of plaintexts in the input plaintext vector. + * + * Each plaintext of the input plaintext vector is added to the body of the + * corresponding LWE ciphertext in the LWE ciphertext vector. The result of the + * operation is stored in the output LWE ciphertext vector. The two input + * vectors are unchanged. This function is a wrapper to a device function that + * performs the operation on the GPU. + */ void cuda_add_lwe_ciphertext_vector_plaintext_vector_64( void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in, void *plaintext_array_in, uint32_t input_lwe_dimension, diff --git a/src/bit_extraction.cu b/src/bit_extraction.cu index e02a254dc..5d9f29065 100644 --- a/src/bit_extraction.cu +++ b/src/bit_extraction.cu @@ -1,5 +1,8 @@ #include "bit_extraction.cuh" +/* Perform bit extract on a batch of 32 bit LWE ciphertexts. + * See the corresponding function on 64 bit LWE ciphertexts for more details. + */ void cuda_extract_bits_32( void *v_stream, uint32_t gpu_index, void *list_lwe_array_out, void *lwe_array_in, void *lwe_array_in_buffer, @@ -97,6 +100,51 @@ void cuda_extract_bits_32( } } +/* Perform bit extract on a batch of 64 bit lwe ciphertexts. + * - `v_stream` is a void pointer to the Cuda stream to be used in the kernel + * launch + * - `gpu_index` is the index of the GPU to be used in the kernel launch + * - 'number_of_bits' will be extracted from each ciphertext + * starting at the bit number 'delta_log' (0-indexed) included. + * Output bits are ordered from the MSB to LSB. 
Every extracted bit is
+ * represented as an LWE ciphertext, containing the encryption of the bit scaled
+ * by q/2.
+ * - 'list_lwe_array_out' output batch of LWE ciphertexts, one for each bit of
+ * every input ciphertext
+ * - 'lwe_array_in' batch of input LWE ciphertexts, with size:
+ * ('lwe_dimension_in' + 1) * number_of_samples * sizeof(u64)
+ * The following 5 parameters are used during the calculations; they are not
+ * actual inputs of the function, just memory allocated for the computation.
+ * This way, memory can be allocated once and reused across different calls to
+ * the extract bits function.
+ * - 'lwe_array_in_buffer' same size as 'lwe_array_in'
+ * - 'lwe_array_in_shifted_buffer' same size as 'lwe_array_in'
+ * - 'lwe_array_out_ks_buffer' with size:
+ * ('lwe_dimension_out' + 1) * number_of_samples * sizeof(u64)
+ * - 'lwe_array_out_pbs_buffer' same size as 'lwe_array_in'
+ * - 'lut_pbs' with size:
+ * (glwe_dimension + 1) * (lwe_dimension_in + 1) * sizeof(u64)
+ * The other inputs are:
+ * - 'lut_vector_indexes' stores the index corresponding to which test
+ * vector to use
+ * - 'ksk' keyswitch key
+ * - 'fourier_bsk' complex compressed bsk in the fourier domain
+ * - 'lwe_dimension_in' input LWE ciphertext dimension, supported input
+ * dimensions are: {512, 1024, 2048, 4096, 8192}
+ * - 'lwe_dimension_out' output LWE ciphertext dimension
+ * - 'glwe_dimension' GLWE dimension, only glwe_dimension = 1 is supported
+ * for now
+ * - 'base_log_bsk' base_log for bootstrapping
+ * - 'level_count_bsk' decomposition level count for bootstrapping
+ * - 'base_log_ksk' base_log for keyswitch
+ * - 'level_count_ksk' decomposition level for keyswitch
+ * - 'number_of_samples' number of input LWE ciphertexts
+ * - 'max_shared_memory' maximum amount of shared memory to be used inside
+ * device functions
+ *
+ * This function calls the corresponding template instantiation of the wrapper
+ * host function, which manages the calls to the device functions.
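+ *
+ * As an illustration (a sketch only; the variable names below are
+ * hypothetical), the buffer sizes listed above translate to the following
+ * allocations in bytes:
+ *
+ *   size_t in_bytes  = (lwe_dimension_in + 1) * number_of_samples
+ *                      * sizeof(uint64_t); // lwe_array_in and its buffers
+ *   size_t ks_bytes  = (lwe_dimension_out + 1) * number_of_samples
+ *                      * sizeof(uint64_t); // lwe_array_out_ks_buffer
+ *   size_t lut_bytes = (glwe_dimension + 1) * (lwe_dimension_in + 1)
+ *                      * sizeof(uint64_t); // lut_pbs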
+ */
void cuda_extract_bits_64(
    void *v_stream, uint32_t gpu_index, void *list_lwe_array_out,
    void *lwe_array_in, void *lwe_array_in_buffer,
diff --git a/src/bit_extraction.cuh b/src/bit_extraction.cuh
index 37fbfc3a1..f8d6539a0 100644
--- a/src/bit_extraction.cuh
+++ b/src/bit_extraction.cuh
@@ -11,10 +11,12 @@
 #include "polynomial/parameters.cuh"
 #include "utils/timer.cuh"
-// only works for big lwe for ks+bs case
-// state_lwe_buffer is copied from big lwe input
-// shifted_lwe_buffer is scalar multiplication of lwe input
-// blockIdx.x refers to input ciphertext id
+/*
+ * Copies a batch of input LWE ciphertexts into two different buffers:
+ * one buffer holds the input multiplied by a shift value,
+ * the other holds an unmodified copy.
+ * Works for ciphertexts with sizes supported by params::degree.
+ */
 template 
__global__ void copy_and_shift_lwe(Torus *dst_copy, Torus *dst_shift,
                                    Torus *src, Torus value) {
@@ -37,8 +39,10 @@ __global__ void copy_and_shift_lwe(Torus *dst_copy, Torus *dst_shift,
   }
 }
-// only works for small lwe in ks+bs case
-// function copies lwe when length is not a power of two
+/*
+ * Copies a batch of LWE ciphertexts when the size is not supported by
+ * params::degree.
+ */
 template 
__global__ void copy_small_lwe(Torus *dst, Torus *src, uint32_t small_lwe_size,
                               uint32_t number_of_bits, uint32_t lwe_id) {
@@ -62,24 +66,27 @@ __global__ void copy_small_lwe(Torus *dst, Torus *src, uint32_t small_lwe_size,
     cur_dst[tid] = cur_src[tid];
 }
-// only used in extract bits for one ciphertext
-// should be called with one block and one thread
-// NOTE: check if putting this functionality in copy_small_lwe or
-// fill_pbs_lut vector is faster
+/*
+ * Wrapping-adds value to the body of each ciphertext.
+ * Should be called with a block size of 1;
+ * blockIdx.x refers to the id of the ciphertext.
+ * NOTE: check if putting this functionality in copy_small_lwe or fill_pbs_lut
+ * is faster
+ */
 template 
__global__ void add_to_body(Torus *lwe, size_t lwe_dimension, Torus value) {
  lwe[blockIdx.x * (lwe_dimension + 1) + lwe_dimension] += value;
}
-// Add alpha where alpha = delta*2^{bit_idx-1} to end up with an encryption of 0
-// if the extracted bit was 0 and 1 in the other case
-//
-// Remove the extracted bit from the state LWE to get a 0 at the extracted bit
-// location.
-//
-// Shift on padding bit for next iteration, that's why
-// alpha= 1ll << (ciphertext_n_bits - delta_log - bit_idx - 2) is used
-// instead of alpha= 1ll << (ciphertext_n_bits - delta_log - bit_idx - 1)
+/*
+ * Add alpha, where alpha = delta*2^{bit_idx-1}, to end up with an encryption
+ * of 0 if the extracted bit was 0, and of 1 otherwise.
+ * Remove the extracted bit from the state LWE to get a 0 at the extracted bit
+ * location.
+ * Shift on padding bit for next iteration, that's why
+ * alpha= 1ll << (ciphertext_n_bits - delta_log - bit_idx - 2) is used
+ * instead of alpha= 1ll << (ciphertext_n_bits - delta_log - bit_idx - 1)
+ */
 template 
__global__ void add_sub_and_mul_lwe(Torus *shifted_lwe, Torus *state_lwe,
                                    Torus *pbs_lwe_array_out, Torus add_value,
@@ -104,9 +111,11 @@ __global__ void add_sub_and_mul_lwe(Torus *shifted_lwe, Torus *state_lwe,
   }
 }
-// Fill lut(only body) for the current bit (equivalent to trivial encryption as
-// mask is 0s)
-// The LUT is filled with value
+/*
+ * Fill the lut (only the body) for the current bit, equivalent to a trivial
+ * encryption as the mask is 0s.
+ * blockIdx.x refers to the id of the lut vector.
+ */
 template 
__global__ void fill_lut_body_for_current_bit(Torus *lut, Torus value) {
@@ -119,6 +128,11 @@ __global__ void fill_lut_body_for_current_bit(Torus *lut, Torus value) {
   }
 }
+/*
+ * Host function for cuda extract bits.
+ * It executes the device functions in a specific order and manages the
+ * parallelism.
+ */
 template 
__host__ void host_extract_bits(
    void *v_stream, uint32_t gpu_index, Torus *list_lwe_array_out,
@@ -137,6 +151,7 @@ __host__ void host_extract_bits(
   int blocks = 1;
   int threads = params::degree / params::opt;
+  // shift lwe on padding bit and copy in new buffer
   copy_and_shift_lwe<<>>(
       lwe_array_in_buffer, lwe_array_in_shifted_buffer, lwe_array_in,
       1ll << (ciphertext_n_bits - delta_log - 1));
@@ -157,11 +172,15 @@
     break;
   }
+  // Add q/4 to center the error while computing a negacyclic LUT
   add_to_body<<<1, 1, 0, *stream>>>(lwe_array_out_ks_buffer,
                                     lwe_dimension_out,
                                     1ll << (ciphertext_n_bits - 2));
   checkCudaErrors(cudaGetLastError());
+  // Fill lut for the current bit (equivalent to trivial encryption as mask is
+  // 0s). The LUT is filled with -alpha in each coefficient where alpha =
+  // delta*2^{bit_idx-1}
   fill_lut_body_for_current_bit
      <<>>(
          lut_pbs, 0ll - 1ll << (delta_log - 1 + bit_idx));
@@ -173,6 +192,8 @@
       lwe_dimension_out, lwe_dimension_in, base_log_bsk, level_count_bsk,
       number_of_samples, 1, max_shared_memory);
+  // Add alpha where alpha = delta*2^{bit_idx-1} to end up with an encryption
+  // of 0 if the extracted bit was 0, and of 1 otherwise
   add_sub_and_mul_lwe<<<1, threads, 0, *stream>>>(
       lwe_array_in_shifted_buffer, lwe_array_in_buffer,
       lwe_array_out_pbs_buffer, 1ll << (delta_log - 1 + bit_idx),
diff --git a/src/bootstrap_amortized.cu b/src/bootstrap_amortized.cu
index 11d06e985..612f5c33b 100644
--- a/src/bootstrap_amortized.cu
+++ b/src/bootstrap_amortized.cu
@@ -1,60 +1,7 @@
 #include "bootstrap_amortized.cuh"
-/* Perform bootstrapping on a batch of input LWE ciphertexts
- *
- * - lwe_array_out: output batch of num_samples bootstrapped ciphertexts c =
- * (a0,..an-1,b) where n is the LWE dimension
- * - lut_vector: should hold as many test vectors of size polynomial_size
- * as there are input ciphertexts, but actually holds
- * num_lut_vectors vectors to reduce memory usage
- * - lut_vector_indexes: stores the index corresponding to
- * which test vector to use for each sample in
- * lut_vector
- * - lwe_array_in: input batch of num_samples LWE ciphertexts, containing n
- * mask values + 1 body value
- * - bootstrapping_key: RGSW encryption of the LWE secret key sk1
- * under secret key sk2
- * bsk = Z + sk1 H
- * where H is the gadget matrix and Z is a matrix (k+1).l
- * containing GLWE encryptions of 0 under sk2.
- * bsk is thus a tensor of size (k+1)^2.l.N.n - * where l is the number of decomposition levels and - * k is the GLWE dimension, N is the polynomial size for - * GLWE. The polynomial size for GLWE and the test vector - * are the same because they have to be in the same ring - * to be multiplied. - * Note: it is necessary to generate (k+1).k.l.N.n - * uniformly random coefficients for the zero encryptions - * - input_lwe_dimension: size of the Torus vector used to encrypt the input - * LWE ciphertexts - referred to as n above (~ 600) - * - polynomial_size: size of the test polynomial (test vector) and size of the - * GLWE polynomial (~1024) - * - base_log: log base used for the gadget matrix - B = 2^base_log (~8) - * - level_count: number of decomposition levels in the gadget matrix (~4) - * - num_samples: number of encrypted input messages - * - num_lut_vectors: parameter to set the actual number of test vectors to be - * used - * - q: number of bytes in the integer representation (32 or 64) - * - * This function calls a wrapper to a device kernel that performs the - * bootstrapping: - * - the kernel is templatized based on integer discretization and - * polynomial degree - * - num_samples blocks of threads are launched, where each thread is going - * to handle one or more polynomial coefficients at each stage: - * - perform the blind rotation - * - round the result - * - decompose into level_count levels, then for each level: - * - switch to the FFT domain - * - multiply with the bootstrapping key - * - come back to the coefficients representation - * - between each stage a synchronization of the threads is necessary - * - in case the device has enough shared memory, temporary arrays used for - * the different stages (accumulators) are stored into the shared memory - * - the accumulators serve to combine the results for all decomposition - * levels - * - the constant memory (64K) is used for storing the roots of identity - * values for the FFT +/* Perform the programmable bootstrapping on a batch of input u32 LWE + * ciphertexts. See the corresponding operation on 64 bits for more details. */ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32( @@ -115,6 +62,71 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32( } } +/* Perform the programmable bootstrapping on a batch of input u64 LWE + * ciphertexts. This functions performs best for large numbers of inputs (> 10). + * - `v_stream` is a void pointer to the Cuda stream to be used in the kernel + * launch + * - `gpu_index` is the index of the GPU to be used in the kernel launch + * - lwe_array_out: output batch of num_samples bootstrapped ciphertexts c = + * (a0,..an-1,b) where n is the LWE dimension + * - lut_vector: should hold as many test vectors of size polynomial_size + * as there are input ciphertexts, but actually holds + * num_lut_vectors vectors to reduce memory usage + * - lut_vector_indexes: stores the index corresponding to + * which test vector of lut_vector to use for each LWE input in + * lwe_array_in + * - lwe_array_in: input batch of num_samples LWE ciphertexts, containing n + * mask values + 1 body value + * - bootstrapping_key: GGSW encryption of the LWE secret key sk1 + * under secret key sk2 + * bsk = Z + sk1 H + * where H is the gadget matrix and Z is a matrix (k+1).l + * containing GLWE encryptions of 0 under sk2. + * bsk is thus a tensor of size (k+1)^2.l.N.n + * where l is the number of decomposition levels and + * k is the GLWE dimension, N is the polynomial size for + * GLWE. 
The polynomial size for GLWE and the test vector + * are the same because they have to be in the same ring + * to be multiplied. + * - input_lwe_dimension: size of the Torus vector used to encrypt the input + * LWE ciphertexts - referred to as n above (~ 600) + * - polynomial_size: size of the test polynomial (test vector) and size of the + * GLWE polynomials (~1024) (where `size` refers to the polynomial degree + 1). + * - base_log: log of the base used for the gadget matrix - B = 2^base_log (~8) + * - level_count: number of decomposition levels in the gadget matrix (~4) + * - num_samples: number of encrypted input messages + * - num_lut_vectors: parameter to set the actual number of test vectors to be + * used + * - lwe_idx: the index of the LWE input to consider for the GPU of index + * gpu_index. In case of multi-GPU computing, it is assumed that only a part of + * the input LWE array is copied to each GPU, but the whole LUT array is copied + * (because the case when the number of LUTs is smaller than the number of input + * LWEs is not trivial to take into account in the data repartition on the + * GPUs). `lwe_idx` is used to determine which LUT to consider for a given LWE + * input in the LUT array `lut_vector`. + * - 'max_shared_memory' maximum amount of shared memory to be used inside + * device functions + * + * This function calls a wrapper to a device kernel that performs the + * bootstrapping: + * - the kernel is templatized based on integer discretization and + * polynomial degree + * - num_samples blocks of threads are launched, where each thread is going + * to handle one or more polynomial coefficients at each stage: + * - perform the blind rotation + * - round the result + * - decompose into level_count levels, then for each level: + * - switch to the FFT domain + * - multiply with the bootstrapping key + * - come back to the coefficients representation + * - between each stage a synchronization of the threads is necessary + * - in case the device has enough shared memory, temporary arrays used for + * the different stages (accumulators) are stored into the shared memory + * - the accumulators serve to combine the results for all decomposition + * levels + * - the constant memory (64K) is used for storing the roots of identity + * values for the FFT + */ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64( void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lut_vector, void *lut_vector_indexes, void *lwe_array_in, void *bootstrapping_key, diff --git a/src/bootstrap_low_latency.cu b/src/bootstrap_low_latency.cu index 92cd6943b..246963314 100644 --- a/src/bootstrap_low_latency.cu +++ b/src/bootstrap_low_latency.cu @@ -1,60 +1,11 @@ #include "bootstrap_low_latency.cuh" -/* Perform bootstrapping on a batch of input LWE ciphertexts - * - * - lwe_array_out: output batch of num_samples bootstrapped ciphertexts c = - * (a0,..an-1,b) where n is the LWE dimension - * - lut_vector: should hold as many test vectors of size polynomial_size - * as there are input ciphertexts, but actually holds - * num_lut_vectors vectors to reduce memory usage - * - lut_vector_indexes: stores the index corresponding to - * which test vector to use for each sample in - * lut_vector - * - lwe_array_in: input batch of num_samples LWE ciphertexts, containing n - * mask values + 1 body value - * - bootstrapping_key: RGSW encryption of the LWE secret key sk1 - * under secret key sk2 - * bsk = Z + sk1 H - * where H is the gadget matrix and Z is a matrix (k+1).l - * containing GLWE encryptions of 
0 under sk2. - * bsk is thus a tensor of size (k+1)^2.l.N.n - * where l is the number of decomposition levels and - * k is the GLWE dimension, N is the polynomial size for - * GLWE. The polynomial size for GLWE and the test vector - * are the same because they have to be in the same ring - * to be multiplied. - * Note: it is necessary to generate (k+1).k.l.N.n - * uniformly random coefficients for the zero encryptions - * - lwe_dimension: size of the Torus vector used to encrypt the input - * LWE ciphertexts - referred to as n above (~ 600) - * - polynomial_size: size of the test polynomial (test vector) and size of the - * GLWE polynomial (~1024) - * - base_log: log base used for the gadget matrix - B = 2^base_log (~8) - * - level_count: number of decomposition levels in the gadget matrix (~4) - * - num_samples: number of encrypted input messages - * - num_lut_vectors: parameter to set the actual number of test vectors to be - * used - * - q: number of bytes in the integer representation (32 or 64) - * - * This function calls a wrapper to a device kernel that performs the - * bootstrapping: - * - the kernel is templatized based on integer discretization and - * polynomial degree - * - num_samples blocks of threads are launched, where each thread is going - * to handle one or more polynomial coefficients at each stage: - * - perform the blind rotation - * - round the result - * - decompose into level_count levels, then for each level: - * - switch to the FFT domain - * - multiply with the bootstrapping key - * - come back to the coefficients representation - * - between each stage a synchronization of the threads is necessary - * - in case the device has enough shared memory, temporary arrays used for - * the different stages (accumulators) are stored into the shared memory - * - the accumulators serve to combine the results for all decomposition - * levels - * - the constant memory (64K) is used for storing the roots of identity - * values for the FFT +/* Perform bootstrapping on a batch of input u32 LWE ciphertexts. + * This function performs best for small numbers of inputs. Beyond a certain + * number of inputs (the exact number depends on the cryptographic parameters), + * the kernel cannot be launched and it is necessary to split the kernel call + * into several calls on smaller batches of inputs. For more details on this + * operation, head on to the equivalent u64 operation. */ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32( void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lut_vector, @@ -124,6 +75,82 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32( } } +/* Perform bootstrapping on a batch of input u64 LWE ciphertexts. + * This function performs best for small numbers of inputs. Beyond a certain + * number of inputs (the exact number depends on the cryptographic parameters), + * the kernel cannot be launched and it is necessary to split the kernel call + * into several calls on smaller batches of inputs. 
+ * + * - `v_stream` is a void pointer to the Cuda stream to be used in the kernel + * launch + * - `gpu_index` is the index of the GPU to be used in the kernel launch + * - lwe_array_out: output batch of num_samples bootstrapped ciphertexts c = + * (a0,..an-1,b) where n is the LWE dimension + * - lut_vector: should hold as many test vectors of size polynomial_size + * as there are input ciphertexts, but actually holds + * num_lut_vectors vectors to reduce memory usage + * - lut_vector_indexes: stores the index corresponding to + * which test vector to use for each sample in + * lut_vector + * - lwe_array_in: input batch of num_samples LWE ciphertexts, containing n + * mask values + 1 body value + * - bootstrapping_key: GGSW encryption of the LWE secret key sk1 + * under secret key sk2 + * bsk = Z + sk1 H + * where H is the gadget matrix and Z is a matrix (k+1).l + * containing GLWE encryptions of 0 under sk2. + * bsk is thus a tensor of size (k+1)^2.l.N.n + * where l is the number of decomposition levels and + * k is the GLWE dimension, N is the polynomial size for + * GLWE. The polynomial size for GLWE and the test vector + * are the same because they have to be in the same ring + * to be multiplied. + * - lwe_dimension: size of the Torus vector used to encrypt the input + * LWE ciphertexts - referred to as n above (~ 600) + * - glwe_dimension: size of the polynomial vector used to encrypt the LUT + * GLWE ciphertexts - referred to as k above. Only the value 1 is supported for + * this parameter. + * - polynomial_size: size of the test polynomial (test vector) and size of the + * GLWE polynomial (~1024) + * - base_log: log base used for the gadget matrix - B = 2^base_log (~8) + * - level_count: number of decomposition levels in the gadget matrix (~4) + * - num_samples: number of encrypted input messages + * - num_lut_vectors: parameter to set the actual number of test vectors to be + * used + * - lwe_idx: the index of the LWE input to consider for the GPU of index + * gpu_index. In case of multi-GPU computing, it is assumed that only a part of + * the input LWE array is copied to each GPU, but the whole LUT array is copied + * (because the case when the number of LUTs is smaller than the number of input + * LWEs is not trivial to take into account in the data repartition on the + * GPUs). `lwe_idx` is used to determine which LUT to consider for a given LWE + * input in the LUT array `lut_vector`. + * - 'max_shared_memory' maximum amount of shared memory to be used inside + * device functions + * + * This function calls a wrapper to a device kernel that performs the + * bootstrapping: + * - the kernel is templatized based on integer discretization and + * polynomial degree + * - num_samples * level_count * (glwe_dimension + 1) blocks of threads are + * launched, where each thread is going to handle one or more polynomial + * coefficients at each stage, for a given level of decomposition, either for + * the LUT mask or its body: + * - perform the blind rotation + * - round the result + * - get the decomposition for the current level + * - switch to the FFT domain + * - multiply with the bootstrapping key + * - come back to the coefficients representation + * - between each stage a synchronization of the threads is necessary (some + * synchronizations happen at the block level, some happen between blocks, using + * cooperative groups). 
+ * - in case the device has enough shared memory, temporary arrays used for + * the different stages (accumulators) are stored into the shared memory + * - the accumulators serve to combine the results for all decomposition + * levels + * - the constant memory (64K) is used for storing the roots of identity + * values for the FFT + */ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64( void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lut_vector, void *lut_vector_indexes, void *lwe_array_in, void *bootstrapping_key, diff --git a/src/circuit_bootstrap.cu b/src/circuit_bootstrap.cu index d43d531cc..df3622ec7 100644 --- a/src/circuit_bootstrap.cu +++ b/src/circuit_bootstrap.cu @@ -1,5 +1,9 @@ #include "circuit_bootstrap.cuh" +/* + * Perform circuit bootstrapping for the batch of 32 bit LWE ciphertexts. + * Head out to the equivalent operation on 64 bits for more details. + */ void cuda_circuit_bootstrap_32( void *v_stream, uint32_t gpu_index, void *ggsw_out, void *lwe_array_in, void *fourier_bsk, void *fp_ksk_array, void *lwe_array_in_shifted_buffer, @@ -88,6 +92,42 @@ void cuda_circuit_bootstrap_32( } } +/* + * Perform circuit bootstrapping on a batch of 64 bit input LWE ciphertexts. + * - `v_stream` is a void pointer to the Cuda stream to be used in the kernel + * launch + * - `gpu_index` is the index of the GPU to be used in the kernel launch + * - 'ggsw_out' output batch of ggsw with size: + * 'number_of_samples' * 'level_cbs' * ('glwe_dimension' + 1)^2 * + * polynomial_size * sizeof(u64) + * - 'lwe_array_in' input batch of lwe ciphertexts, with size: + * 'number_of_samples' * '(lwe_dimension' + 1) * sizeof(u64) + * - 'fourier_bsk' bootstrapping key in fourier domain with size: + * 'lwe_dimension' * 'level_bsk' * ('glwe_dimension' + 1)^2 * + * 'polynomial_size' / 2 * sizeof(double2) + * - 'fp_ksk_array' batch of fp-keyswitch keys with size: + * ('polynomial_size' + 1) * 'level_pksk' * ('glwe_dimension' + 1)^2 * + * 'polynomial_size' * sizeof(u64) + * The following 5 parameters are used during calculations, they are not actual + * inputs of the function, they are just allocated memory for calculation + * process, like this, memory can be allocated once and can be used as much + * as needed for different calls of circuit_bootstrap function + * - 'lwe_array_in_shifted_buffer' with size: + * 'number_of_samples' * 'level_cbs' * ('lwe_dimension' + 1) * sizeof(u64) + * - 'lut_vector' with size: + * 'level_cbs' * ('glwe_dimension' + 1) * 'polynomial_size' * sizeof(u64) + * - 'lut_vector_indexes' stores the index corresponding to which test + * vector to use + * - 'lwe_array_out_pbs_buffer' with size + * 'number_of_samples' * 'level_cbs' * ('polynomial_size' + 1) * sizeof(u64) + * - 'lwe_array_in_fp_ks_buffer' with size + * 'number_of_samples' * 'level_cbs' * ('glwe_dimension' + 1) * + * ('polynomial_size' + 1) * sizeof(u64) + * + * This function calls a wrapper to a device kernel that performs the + * circuit bootstrap. The kernel is templatized based on integer discretization + * and polynomial degree. 
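+ *
+ * As a sketch of how the output could be allocated before the call (the
+ * variable name d_ggsw_out is a hypothetical example, following the size
+ * given above for 'ggsw_out'):
+ *
+ *   uint64_t *d_ggsw_out;
+ *   size_t ggsw_bytes = (size_t)number_of_samples * level_cbs
+ *                       * (glwe_dimension + 1) * (glwe_dimension + 1)
+ *                       * polynomial_size * sizeof(uint64_t);
+ *   cudaMalloc((void **)&d_ggsw_out, ggsw_bytes);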
+ */
void cuda_circuit_bootstrap_64(
    void *v_stream, uint32_t gpu_index, void *ggsw_out, void *lwe_array_in,
    void *fourier_bsk, void *fp_ksk_array, void *lwe_array_in_shifted_buffer,
diff --git a/src/circuit_bootstrap.cuh b/src/circuit_bootstrap.cuh
index 71419b3cc..9b3aa36a6 100644
--- a/src/circuit_bootstrap.cuh
+++ b/src/circuit_bootstrap.cuh
@@ -10,9 +10,11 @@
 #include "polynomial/parameters.cuh"
 #include "utils/timer.cuh"
-// works for lwe with generic sizes
-// shifted_lwe_buffer is scalar multiplication of lwe input
-// blockIdx.x refers to input ciphertext id
+/*
+ * Scalar multiplication of a batch of lwe ciphertexts by value.
+ * Works for any size of lwe input.
+ * blockIdx.x refers to the input ciphertext id.
+ */
 template 
__global__ void shift_lwe_cbs(Torus *dst_shift, Torus *src, Torus value,
                              size_t lwe_size) {
@@ -35,9 +37,14 @@ __global__ void shift_lwe_cbs(Torus *dst_shift, Torus *src, Torus value,
   cur_dst[tid] = cur_src[tid] * value;
 }
-// Fill lut (equivalent to trivial encryption as mask is 0s)
-// The LUT is filled with -alpha in each coefficient where
-// alpha = 2^{log(q) - 1 - base_log * level}
+/*
+ * Fill the lut, equivalent to a trivial encryption as the mask is 0s.
+ * The LUT is filled with -alpha in each coefficient, where
+ * alpha = 2^{log(q) - 1 - base_log * level}.
+ * blockIdx.x refers to the lut id.
+ * value is not passed as a parameter but computed inside the function,
+ * because it depends on the lut id.
+ */
 template 
__global__ void fill_lut_body_for_cbs(Torus *lut, uint32_t ciphertext_n_bits,
                                      uint32_t base_log_cbs) {
@@ -55,6 +62,19 @@ __global__ void fill_lut_body_for_cbs(Torus *lut, uint32_t ciphertext_n_bits,
   }
 }
+/*
+ * Copy the pbs result (glwe_dimension + 1) times to be used as input of the
+ * fp-ks: each input ciphertext from lwe_src is copied (glwe_dimension + 1)
+ * times inside lwe_dst, and then value is added to the body.
+ * blockIdx.x refers to the destination lwe ciphertext id: 'dst_lwe_id'
+ * 'src_lwe_id' = 'dst_lwe_id' / (glwe_dimension + 1)
+ *
+ * example: glwe_dimension = 1
+ *   src_0     ...     src_n
+ *   /   \            /    \
+ *  /     \          /      \
+ * dst_0  dst_1   dst_2n  dst_2n+1
+ */
 template 
__global__ void copy_add_lwe_cbs(Torus *lwe_dst, Torus *lwe_src,
                                 uint32_t ciphertext_n_bits,
@@ -77,6 +97,11 @@ __global__ void copy_add_lwe_cbs(Torus *lwe_dst, Torus *lwe_src,
   }
 }
+/*
+ * Host function for cuda circuit bootstrap.
+ * It executes the device functions in a specific order and manages the
+ * parallelism.
+ */
 template 
__host__ void host_circuit_bootstrap(
    void *v_stream, uint32_t gpu_index, Torus *ggsw_out, Torus *lwe_array_in,
diff --git a/src/keyswitch.cu b/src/keyswitch.cu
index c9b9377ac..51bc397fb 100644
--- a/src/keyswitch.cu
+++ b/src/keyswitch.cu
@@ -4,15 +4,8 @@
 #include 
-/* Perform keyswitch on a batch of input LWE ciphertexts for 32 bits
- *
- * - lwe_array_out: output batch of num_samples keyswitched ciphertexts c =
- * (a0,..an-1,b) where n is the LWE dimension
- * - lwe_array_in: input batch of num_samples LWE ciphertexts, containing n
- * mask values + 1 body value
- *
- * This function calls a wrapper to a device kernel that performs the keyswitch
- * - num_samples blocks of threads are launched
+/* Perform keyswitch on a batch of 32-bit input LWE ciphertexts.
+ * See the equivalent operation on 64 bits for more details.
*/ void cuda_keyswitch_lwe_ciphertext_vector_32( void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in, @@ -24,12 +17,18 @@ void cuda_keyswitch_lwe_ciphertext_vector_32( lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples); } -/* Perform keyswitch on a batch of input LWE ciphertexts for 64 bits +/* Perform keyswitch on a batch of 64 bits input LWE ciphertexts. * + * - `v_stream` is a void pointer to the Cuda stream to be used in the kernel + * launch + * - `gpu_index` is the index of the GPU to be used in the kernel launch * - lwe_array_out: output batch of num_samples keyswitched ciphertexts c = - * (a0,..an-1,b) where n is the LWE dimension - * - lwe_array_in: input batch of num_samples LWE ciphertexts, containing n - * mask values + 1 body value + * (a0,..an-1,b) where n is the output LWE dimension (lwe_dimension_out) + * - lwe_array_in: input batch of num_samples LWE ciphertexts, containing + * lwe_dimension_in mask values + 1 body value + * - ksk: the keyswitch key to be used in the operation + * - base log: the log of the base used in the decomposition (should be the one + * used to create the ksk) * * This function calls a wrapper to a device kernel that performs the keyswitch * - num_samples blocks of threads are launched @@ -44,6 +43,9 @@ void cuda_keyswitch_lwe_ciphertext_vector_64( lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples); } +/* Perform functional packing keyswitch on a batch of 32 bits input LWE + * ciphertexts. See the equivalent function on 64 bit inputs for more details. + */ void cuda_fp_keyswitch_lwe_to_glwe_32(void *v_stream, void *glwe_array_out, void *lwe_array_in, void *fp_ksk_array, uint32_t input_lwe_dimension, @@ -60,6 +62,27 @@ void cuda_fp_keyswitch_lwe_to_glwe_32(void *v_stream, void *glwe_array_out, output_glwe_dimension, output_polynomial_size, base_log, level_count, number_of_input_lwe, number_of_keys); } + +/* Perform functional packing keyswitch on a batch of 64 bits input LWE + * ciphertexts. + * + * - `v_stream` is a void pointer to the Cuda stream to be used in the kernel + * launch + * - `glwe_array_out`: output batch of keyswitched ciphertexts + * - `lwe_array_in`: input batch of num_samples LWE ciphertexts, containing + * lwe_dimension_in mask values + 1 body value + * - `fp_ksk_array`: the functional packing keyswitch keys to be used in the + * operation + * - `base log`: the log of the base used in the decomposition (should be the + * one used to create the ksk) + * - `level_count`: the number of levels used in the decomposition (should be + * the one used to create the fp_ksks). + * - `number_of_input_lwe`: the number of inputs + * - `number_of_keys`: the number of fp_ksks + * + * This function calls a wrapper to a device kernel that performs the functional + * packing keyswitch. + */ void cuda_fp_keyswitch_lwe_to_glwe_64(void *v_stream, void *glwe_array_out, void *lwe_array_in, void *fp_ksk_array, uint32_t input_lwe_dimension, diff --git a/src/multiplication.cu b/src/multiplication.cu index 397c16255..8b1d0371b 100644 --- a/src/multiplication.cu +++ b/src/multiplication.cu @@ -1,5 +1,9 @@ #include "multiplication.cuh" +/* + * Perform the multiplication of a u32 input LWE ciphertext vector with a u32 + * cleartext vector. See the equivalent operation on u64 data for more details. 
+ */ void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32( void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in, void *cleartext_array_in, uint32_t input_lwe_dimension, @@ -11,6 +15,34 @@ void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32( static_cast(cleartext_array_in), input_lwe_dimension, input_lwe_ciphertext_count); } +/* + * Perform the multiplication of a u64 input LWE ciphertext vector with a u64 + * input cleartext vector. + * - `v_stream` is a void pointer to the Cuda stream to be used in the kernel + * launch + * - `gpu_index` is the index of the GPU to be used in the kernel launch + * - `lwe_array_out` is an array of size + * `(input_lwe_dimension + 1) * input_lwe_ciphertext_count` that should have + * been allocated on the GPU before calling this function, and that will hold + * the result of the computation. + * - `lwe_array_in` is the LWE ciphertext vector used as input, it should have + * been allocated and initialized before calling this function. It has the same + * size as the output array. + * - `cleartext_array_in` is the cleartext vector used as input, it should have + * been allocated and initialized before calling this function. It should be of + * size `input_lwe_ciphertext_count`. + * - `input_lwe_dimension` is the number of mask elements in the input and + * output LWE ciphertext vectors + * - `input_lwe_ciphertext_count` is the number of ciphertexts contained in the + * input LWE ciphertext vector, as well as in the output. It is also the number + * of cleartexts in the input cleartext vector. + * + * Each cleartext of the input cleartext vector is multiplied to the mask and + * body of the corresponding LWE ciphertext in the LWE ciphertext vector. The + * result of the operation is stored in the output LWE ciphertext vector. The + * two input vectors are unchanged. This function is a wrapper to a device + * function that performs the operation on the GPU. + */ void cuda_mult_lwe_ciphertext_vector_cleartext_vector_64( void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in, void *cleartext_array_in, uint32_t input_lwe_dimension, diff --git a/src/negation.cu b/src/negation.cu index bb5ebcf6f..1f0f10d4e 100644 --- a/src/negation.cu +++ b/src/negation.cu @@ -1,5 +1,9 @@ #include "negation.cuh" +/* + * Perform the negation of a u32 input LWE ciphertext vector. + * See the equivalent operation on u64 ciphertexts for more details. + */ void cuda_negate_lwe_ciphertext_vector_32(void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in, @@ -10,6 +14,29 @@ void cuda_negate_lwe_ciphertext_vector_32(void *v_stream, uint32_t gpu_index, static_cast(lwe_array_in), input_lwe_dimension, input_lwe_ciphertext_count); } + +/* + * Perform the negation of a u64 input LWE ciphertext vector. + * - `v_stream` is a void pointer to the Cuda stream to be used in the kernel + * launch + * - `gpu_index` is the index of the GPU to be used in the kernel launch + * - `lwe_array_out` is an array of size + * `(input_lwe_dimension + 1) * input_lwe_ciphertext_count` that should have + * been allocated on the GPU before calling this function, and that will hold + * the result of the computation. + * - `lwe_array_in` is the LWE ciphertext vector used as input, it should have + * been allocated and initialized before calling this function. It has the same + * size as the output array. 
+ * - `input_lwe_dimension` is the number of mask elements in the input and
+ * output ciphertext vectors
+ * - `input_lwe_ciphertext_count` is the number of ciphertexts contained in the
+ * input LWE ciphertext vector, as well as in the output.
+ *
+ * Each element (mask element or body) of the input LWE ciphertext vector is
+ * negated. The result is stored in the output LWE ciphertext vector. The input
+ * LWE ciphertext vector is left unchanged. This function is a wrapper to a
+ * device function that performs the operation on the GPU.
+ */
void cuda_negate_lwe_ciphertext_vector_64(void *v_stream, uint32_t gpu_index,
                                          void *lwe_array_out,
                                          void *lwe_array_in,
diff --git a/src/vertical_packing.cu b/src/vertical_packing.cu
index 1f5499698..9f148f9fd 100644
--- a/src/vertical_packing.cu
+++ b/src/vertical_packing.cu
@@ -1,5 +1,9 @@
 #include "vertical_packing.cuh"
+/*
+ * Perform the cmux tree operation on a batch of 32-bit input GGSW ciphertexts.
+ * See the equivalent function for 64-bit inputs for more details.
+ */
void cuda_cmux_tree_32(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
                       void *ggsw_in, void *lut_vector,
                       uint32_t glwe_dimension, uint32_t polynomial_size,
                       uint32_t base_log,
@@ -55,6 +59,33 @@ void cuda_cmux_tree_32(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
   }
 }
+/*
+ * Perform the Cmux tree operation on a batch of 64-bit input GGSW ciphertexts.
+ * - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
+ * launch
+ * - `gpu_index` is the index of the GPU to be used in the kernel launch
+ * - 'glwe_array_out' output batch of GLWE buffers for the Cmux tree; 'tau'
+ * GLWE ciphertexts will be output by the function
+ * - 'ggsw_in' batch of input GGSW ciphertexts, the function expects 'r' GGSW
+ * ciphertexts as input.
+ * - 'lut_vector' batch of test vectors (LUTs); there should be 2^r LUTs
+ * inside the 'lut_vector' parameter
+ * - 'glwe_dimension' GLWE dimension, supported values: {1}
+ * - 'polynomial_size' size of the test polynomial, supported values: {512,
+ * 1024, 2048, 4096, 8192}
+ * - 'base_log' base log parameter for the cmux block
+ * - 'level_count' decomposition level for the cmux block
+ * - 'r' number of input GGSW ciphertexts
+ * - 'tau' number of input LWE ciphertexts which were used to generate the GGSW
+ * ciphertexts stored in 'ggsw_in'; it is also the number of output GLWE
+ * ciphertexts
+ * - 'max_shared_memory' maximum shared memory amount to be used for the cmux
+ * kernel
+ *
+ * This function calls a wrapper to a device kernel that performs the
+ * Cmux tree. The kernel is templatized based on integer discretization and
+ * polynomial degree.
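+ *
+ * As an illustration of the expected array lengths in u64 elements (a sketch
+ * derived from the descriptions above, assuming each LUT is a polynomial of
+ * 'polynomial_size' coefficients and each output GLWE holds
+ * (glwe_dimension + 1) polynomials):
+ *
+ *   size_t lut_vector_len     = ((size_t)1 << r) * polynomial_size;
+ *   size_t glwe_array_out_len = (size_t)tau * (glwe_dimension + 1)
+ *                               * polynomial_size;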
+ */ void cuda_cmux_tree_64(void *v_stream, uint32_t gpu_index, void *glwe_array_out, void *ggsw_in, void *lut_vector, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, @@ -110,6 +141,29 @@ void cuda_cmux_tree_64(void *v_stream, uint32_t gpu_index, void *glwe_array_out, } } +/* + * Performs blind rotation on batch of 64-bit input ggsw ciphertexts + * - `v_stream` is a void pointer to the Cuda stream to be used in the kernel + * launch + * - `gpu_index` is the index of the GPU to be used in the kernel launch + * - 'lwe_out' batch of output lwe ciphertexts, there should be 'tau' + * ciphertexts inside 'lwe_out' + * - 'ggsw_in' batch of input ggsw ciphertexts, function expects 'mbr_size' + * ggsw ciphertexts inside 'ggsw_in' + * - 'lut_vector' list of test vectors, function expects 'tau' test vectors + * inside 'lut_vector' parameter + * - 'glwe_dimension' glwe dimension, supported values : {1} + * - 'polynomial_size' size of test polynomial supported sizes: {512, 1024, + * 2048, 4096, 8192} + * - 'base_log' base log parameter + * - 'l_gadget' decomposition level + * - 'max_shared_memory' maximum number of shared memory to be used in + * device functions(kernels) + * + * This function calls a wrapper to a device kernel that performs the + * blind rotation and sample extraction. The kernel is templatized based on + * integer discretization and polynomial degree. + */ void cuda_blind_rotate_and_sample_extraction_64( void *v_stream, uint32_t gpu_index, void *lwe_out, void *ggsw_in, void *lut_vector, uint32_t mbr_size, uint32_t tau, uint32_t glwe_dimension, diff --git a/src/wop_bootstrap.cu b/src/wop_bootstrap.cu index 57fcb15c7..c144a3654 100644 --- a/src/wop_bootstrap.cu +++ b/src/wop_bootstrap.cu @@ -1,5 +1,32 @@ #include "wop_bootstrap.cuh" +/* + * Entry point for cuda circuit bootstrap + vertical packing for batches of + * input 64 bit LWE ciphertexts. + * - `v_stream` is a void pointer to the Cuda stream to be used in the kernel + * launch + * - `gpu_index` is the index of the GPU to be used in the kernel launch + * - 'lwe_array_out' list of output lwe ciphertexts + * - 'lwe_array_in' list of input lwe_ciphertexts + * - 'fourier_bsk' bootstrapping key in fourier domain, expected half size + * compressed complex key. + * - 'cbs_fpksk' list of private functional packing keyswitch keys + * - 'lut_vector' list of test vectors + * - 'polynomial_size' size of the test polynomial, supported sizes: + * {512, 1024, 2048, 4096, 8192} + * - 'glwe_dimension' supported dimensions: {1} + * - 'lwe_dimension' dimension of input LWE ciphertexts + * - 'level_count_bsk' decomposition level for bootstrapping + * - 'base_log_bsk' base log parameter for bootstrapping + * - 'level_count_pksk' decomposition level for fp-keyswitch + * - 'base_log_pksk' base log parameter for fp-keyswitch + * - 'level_count_cbs' level of circuit bootstrap + * - 'base_log_cbs' base log parameter for circuit bootstrap + * - 'number_of_inputs' number of input LWE ciphertexts + * - 'max_shared_memory' maximum shared memory amount to be used in + * bootstrapping kernel + * + */ void cuda_circuit_bootstrap_vertical_packing_64( void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in, void *fourier_bsk, void *cbs_fpksk, void *lut_vector, @@ -77,6 +104,40 @@ void cuda_circuit_bootstrap_vertical_packing_64( } } +/* + * Entry point for entire without padding programmable bootstrap on 64 bit input + * LWE ciphertexts. 
+ * - `v_stream` is a void pointer to the Cuda stream to be used in the kernel + * launch + * - `gpu_index` is the index of the GPU to be used in the kernel launch + * - 'lwe_array_out' list of output lwe ciphertexts + * - 'lwe_array_in' list of input lwe_ciphertexts + * - 'lut_vector' list of test vectors + * - 'fourier_bsk' bootstrapping key in fourier domain, expected half size + * compressed complex key. + * - 'ksk' keyswitch key to use inside extract bits block + * - 'cbs_fpksk' list of fp-keyswitch keys + * - 'glwe_dimension' supported dimensions: {1} + * - 'lwe_dimension' dimension of input lwe ciphertexts + * - 'polynomial_size' size of the test polynomial, supported sizes: + * {512, 1024, 2048, 4096, 8192} + * - 'base_log_bsk' base log parameter for bootstrapping + * - 'level_count_bsk' decomposition level for bootstrapping + * - 'base_log_ksk' base log parameter for keyswitch + * - 'level_count_ksk' decomposition level for keyswitch + * - 'base_log_pksk' base log parameter for fp-keyswitch + * - 'level_count_pksk' decomposition level for fp-keyswitch + * - 'base_log_cbs' base log parameter for circuit bootstrap + * - 'level_count_cbs' level of circuit bootstrap + * - 'number_of_bits_of_message_including_padding' number of bits to extract + * from each input lwe ciphertext including padding bit + * - 'number_of_bits_to_extract' number of bits to extract + * from each input lwe ciphertext without padding bit + * - 'number_of_inputs' number of input lwe ciphertexts + * - 'max_shared_memory' maximum shared memory amount to be used in + * bootstrapping kernel + * + */ void cuda_wop_pbs_64(void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in, void *lut_vector, void *fourier_bsk, void *ksk, void *cbs_fpksk, uint32_t glwe_dimension,