refactor(cuda): introduce cmux tree scratch

Agnes Leroy authored 2023-02-15 09:22:48 +01:00
committed by Agnès Leroy
parent e6dfb588db
commit 870d896ad9
7 changed files with 303 additions and 106 deletions

View File

@@ -45,24 +45,6 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_test_vectors, uint32_t lwe_idx, uint32_t max_shared_memory);
void cuda_cmux_tree_32(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
void *ggsw_in, void *lut_vector, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t r, uint32_t tau,
uint32_t max_shared_memory);
void cuda_cmux_tree_64(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
void *ggsw_in, void *lut_vector, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t r, uint32_t tau,
uint32_t max_shared_memory);
void cuda_blind_rotate_and_sample_extraction_64(
void *v_stream, uint32_t gpu_index, void *lwe_out, void *ggsw_in,
void *lut_vector, uint32_t mbr_size, uint32_t tau, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t l_gadget,
uint32_t max_shared_memory);
void cuda_extract_bits_32(
void *v_stream, uint32_t gpu_index, void *list_lwe_array_out,
void *lwe_array_in, void *lwe_array_in_buffer,

View File

@@ -0,0 +1,48 @@
#ifndef VERTICAL_PACKING_H
#define VERTICAL_PACKING_H
#include <cstdint>
extern "C" {
void scratch_cuda_cmux_tree_32(void *v_stream, uint32_t gpu_index,
int8_t **cmux_tree_buffer,
uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t r, uint32_t tau,
uint32_t max_shared_memory,
bool allocate_gpu_memory);
void scratch_cuda_cmux_tree_64(void *v_stream, uint32_t gpu_index,
int8_t **cmux_tree_buffer,
uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t r, uint32_t tau,
uint32_t max_shared_memory,
bool allocate_gpu_memory);
void cuda_cmux_tree_32(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
void *ggsw_in, void *lut_vector,
int8_t *cmux_tree_buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t r, uint32_t tau,
uint32_t max_shared_memory);
void cuda_cmux_tree_64(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
void *ggsw_in, void *lut_vector,
int8_t *cmux_tree_buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t r, uint32_t tau,
uint32_t max_shared_memory);
void cleanup_cuda_cmux_tree(void *v_stream, uint32_t gpu_index,
int8_t **cmux_tree_buffer);
void cuda_blind_rotate_and_sample_extraction_64(
void *v_stream, uint32_t gpu_index, void *lwe_out, void *ggsw_in,
void *lut_vector, uint32_t mbr_size, uint32_t tau, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t l_gadget,
uint32_t max_shared_memory);
}
#endif // VERTICAL_PACKING_H
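For orientation, here is a minimal caller-side sketch of the scratch/compute/cleanup flow exposed by the declarations above. The parameter values, the device pointers `d_glwe_array_out`, `d_ggsw_in`, `d_lut_vector`, and the `cuda_create_stream`/`cuda_get_max_shared_memory` helpers are assumptions for illustration only, not part of this commit.

// Hypothetical usage of the new API (64-bit case); not part of the diff.
void *v_stream = cuda_create_stream(0);                      // assumed stream helper
uint32_t gpu_index = 0;
uint32_t glwe_dimension = 1, polynomial_size = 1024;
uint32_t base_log = 6, level_count = 3, r = 4, tau = 2;
uint32_t max_shared_memory = cuda_get_max_shared_memory(0);  // assumed helper
// Device pointers assumed to be allocated and filled by the caller beforehand.
void *d_glwe_array_out, *d_ggsw_in, *d_lut_vector;
int8_t *cmux_tree_buffer = nullptr;
// 1) Allocate the scratch buffer once; also sets SM attributes if FULLSM applies.
scratch_cuda_cmux_tree_64(v_stream, gpu_index, &cmux_tree_buffer, glwe_dimension,
                          polynomial_size, level_count, r, tau, max_shared_memory,
                          true);
// 2) Run the CMUX tree, reusing the same buffer across calls.
cuda_cmux_tree_64(v_stream, gpu_index, d_glwe_array_out, d_ggsw_in, d_lut_vector,
                  cmux_tree_buffer, glwe_dimension, polynomial_size, base_log,
                  level_count, r, tau, max_shared_memory);
// 3) Release the buffer when it is no longer needed.
cleanup_cuda_cmux_tree(v_stream, gpu_index, &cmux_tree_buffer);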

View File

@@ -2,7 +2,8 @@ set(SOURCES
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/boolean_gates.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/keyswitch.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/linear_algebra.h)
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/linear_algebra.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/vertical_packing.h)
file(GLOB SOURCES
"*.cu"
"*.h"

View File

@@ -50,7 +50,7 @@ __global__ void device_batch_fft_ggsw_vector(double2 *dest, T *src,
*/
template <typename T, typename ST, class params>
void batch_fft_ggsw_vector(cudaStream_t *stream, double2 *dest, T *src,
uint32_t r, uint32_t glwe_dim,
int8_t *d_mem, uint32_t r, uint32_t glwe_dim,
uint32_t polynomial_size, uint32_t level_count,
uint32_t gpu_index, uint32_t max_shared_memory) {
@@ -59,19 +59,15 @@ void batch_fft_ggsw_vector(cudaStream_t *stream, double2 *dest, T *src,
int gridSize = r * (glwe_dim + 1) * (glwe_dim + 1) * level_count;
int blockSize = polynomial_size / params::opt;
int8_t *d_mem;
if (max_shared_memory < shared_memory_size) {
d_mem = (int8_t *)cuda_malloc_async(shared_memory_size, stream, gpu_index);
device_batch_fft_ggsw_vector<T, ST, params, NOSM>
<<<gridSize, blockSize, 0, *stream>>>(dest, src, d_mem);
check_cuda_error(cudaGetLastError());
cuda_drop_async(d_mem, stream, gpu_index);
} else {
device_batch_fft_ggsw_vector<T, ST, params, FULLSM>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(dest, src,
d_mem);
check_cuda_error(cudaGetLastError());
}
check_cuda_error(cudaGetLastError());
}
#endif // CONCRETE_CORE_GGSW_CUH

View File

@@ -1,11 +1,101 @@
#include "vertical_packing.cuh"
#include "vertical_packing.h"
#include <cassert>
/*
* This scratch function allocates the data needed on the GPU for the CMUX tree
* on 32-bit inputs into `cmux_tree_buffer`. It also configures SM options on
* the GPU in case the FULLSM mode is used.
*/
void scratch_cuda_cmux_tree_32(void *v_stream, uint32_t gpu_index,
int8_t **cmux_tree_buffer,
uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t r, uint32_t tau,
uint32_t max_shared_memory,
bool allocate_gpu_memory) {
switch (polynomial_size) {
case 512:
scratch_cmux_tree<uint32_t, int32_t, Degree<512>>(
v_stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size,
level_count, r, tau, max_shared_memory, allocate_gpu_memory);
break;
case 1024:
scratch_cmux_tree<uint32_t, int32_t, Degree<1024>>(
v_stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size,
level_count, r, tau, max_shared_memory, allocate_gpu_memory);
break;
case 2048:
scratch_cmux_tree<uint32_t, int32_t, Degree<2048>>(
v_stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size,
level_count, r, tau, max_shared_memory, allocate_gpu_memory);
break;
case 4096:
scratch_cmux_tree<uint32_t, int32_t, Degree<4096>>(
v_stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size,
level_count, r, tau, max_shared_memory, allocate_gpu_memory);
break;
case 8192:
scratch_cmux_tree<uint32_t, int32_t, Degree<8192>>(
v_stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size,
level_count, r, tau, max_shared_memory, allocate_gpu_memory);
break;
default:
break;
}
}
/*
* This scratch function allocates the data needed on the GPU for the CMUX tree
* on 64-bit inputs into `cmux_tree_buffer`. It also configures SM options on
* the GPU in case the FULLSM mode is used.
*/
void scratch_cuda_cmux_tree_64(void *v_stream, uint32_t gpu_index,
int8_t **cmux_tree_buffer,
uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t r, uint32_t tau,
uint32_t max_shared_memory,
bool allocate_gpu_memory) {
switch (polynomial_size) {
case 512:
scratch_cmux_tree<uint64_t, int64_t, Degree<512>>(
v_stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size,
level_count, r, tau, max_shared_memory, allocate_gpu_memory);
break;
case 1024:
scratch_cmux_tree<uint64_t, int64_t, Degree<1024>>(
v_stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size,
level_count, r, tau, max_shared_memory, allocate_gpu_memory);
break;
case 2048:
scratch_cmux_tree<uint64_t, int64_t, Degree<2048>>(
v_stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size,
level_count, r, tau, max_shared_memory, allocate_gpu_memory);
break;
case 4096:
scratch_cmux_tree<uint64_t, int64_t, Degree<4096>>(
v_stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size,
level_count, r, tau, max_shared_memory, allocate_gpu_memory);
break;
case 8192:
scratch_cmux_tree<uint64_t, int64_t, Degree<8192>>(
v_stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size,
level_count, r, tau, max_shared_memory, allocate_gpu_memory);
break;
default:
break;
}
}
/*
* Performs the CMUX tree on a batch of 32-bit input GGSW ciphertexts.
* See the equivalent function for 64-bit inputs for more details.
*/
void cuda_cmux_tree_32(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
void *ggsw_in, void *lut_vector, uint32_t glwe_dimension,
void *ggsw_in, void *lut_vector,
int8_t *cmux_tree_buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t r, uint32_t tau,
uint32_t max_shared_memory) {
@@ -25,32 +115,32 @@ void cuda_cmux_tree_32(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
case 512:
host_cmux_tree<uint32_t, int32_t, Degree<512>>(
v_stream, gpu_index, (uint32_t *)glwe_array_out, (uint32_t *)ggsw_in,
(uint32_t *)lut_vector, glwe_dimension, polynomial_size, base_log,
level_count, r, tau, max_shared_memory);
(uint32_t *)lut_vector, cmux_tree_buffer, glwe_dimension,
polynomial_size, base_log, level_count, r, tau, max_shared_memory);
break;
case 1024:
host_cmux_tree<uint32_t, int32_t, Degree<1024>>(
v_stream, gpu_index, (uint32_t *)glwe_array_out, (uint32_t *)ggsw_in,
(uint32_t *)lut_vector, glwe_dimension, polynomial_size, base_log,
level_count, r, tau, max_shared_memory);
(uint32_t *)lut_vector, cmux_tree_buffer, glwe_dimension,
polynomial_size, base_log, level_count, r, tau, max_shared_memory);
break;
case 2048:
host_cmux_tree<uint32_t, int32_t, Degree<2048>>(
v_stream, gpu_index, (uint32_t *)glwe_array_out, (uint32_t *)ggsw_in,
(uint32_t *)lut_vector, glwe_dimension, polynomial_size, base_log,
level_count, r, tau, max_shared_memory);
(uint32_t *)lut_vector, cmux_tree_buffer, glwe_dimension,
polynomial_size, base_log, level_count, r, tau, max_shared_memory);
break;
case 4096:
host_cmux_tree<uint32_t, int32_t, Degree<4096>>(
v_stream, gpu_index, (uint32_t *)glwe_array_out, (uint32_t *)ggsw_in,
(uint32_t *)lut_vector, glwe_dimension, polynomial_size, base_log,
level_count, r, tau, max_shared_memory);
(uint32_t *)lut_vector, cmux_tree_buffer, glwe_dimension,
polynomial_size, base_log, level_count, r, tau, max_shared_memory);
break;
case 8192:
host_cmux_tree<uint32_t, int32_t, Degree<8192>>(
v_stream, gpu_index, (uint32_t *)glwe_array_out, (uint32_t *)ggsw_in,
(uint32_t *)lut_vector, glwe_dimension, polynomial_size, base_log,
level_count, r, tau, max_shared_memory);
(uint32_t *)lut_vector, cmux_tree_buffer, glwe_dimension,
polynomial_size, base_log, level_count, r, tau, max_shared_memory);
break;
default:
break;
@@ -85,7 +175,8 @@ void cuda_cmux_tree_32(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
* polynomial degree.
*/
void cuda_cmux_tree_64(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
void *ggsw_in, void *lut_vector, uint32_t glwe_dimension,
void *ggsw_in, void *lut_vector,
int8_t *cmux_tree_buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t r, uint32_t tau,
uint32_t max_shared_memory) {
@@ -105,38 +196,49 @@ void cuda_cmux_tree_64(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
case 512:
host_cmux_tree<uint64_t, int64_t, Degree<512>>(
v_stream, gpu_index, (uint64_t *)glwe_array_out, (uint64_t *)ggsw_in,
(uint64_t *)lut_vector, glwe_dimension, polynomial_size, base_log,
level_count, r, tau, max_shared_memory);
(uint64_t *)lut_vector, cmux_tree_buffer, glwe_dimension,
polynomial_size, base_log, level_count, r, tau, max_shared_memory);
break;
case 1024:
host_cmux_tree<uint64_t, int64_t, Degree<1024>>(
v_stream, gpu_index, (uint64_t *)glwe_array_out, (uint64_t *)ggsw_in,
(uint64_t *)lut_vector, glwe_dimension, polynomial_size, base_log,
level_count, r, tau, max_shared_memory);
(uint64_t *)lut_vector, cmux_tree_buffer, glwe_dimension,
polynomial_size, base_log, level_count, r, tau, max_shared_memory);
break;
case 2048:
host_cmux_tree<uint64_t, int64_t, Degree<2048>>(
v_stream, gpu_index, (uint64_t *)glwe_array_out, (uint64_t *)ggsw_in,
(uint64_t *)lut_vector, glwe_dimension, polynomial_size, base_log,
level_count, r, tau, max_shared_memory);
(uint64_t *)lut_vector, cmux_tree_buffer, glwe_dimension,
polynomial_size, base_log, level_count, r, tau, max_shared_memory);
break;
case 4096:
host_cmux_tree<uint64_t, int64_t, Degree<4096>>(
v_stream, gpu_index, (uint64_t *)glwe_array_out, (uint64_t *)ggsw_in,
(uint64_t *)lut_vector, glwe_dimension, polynomial_size, base_log,
level_count, r, tau, max_shared_memory);
(uint64_t *)lut_vector, cmux_tree_buffer, glwe_dimension,
polynomial_size, base_log, level_count, r, tau, max_shared_memory);
break;
case 8192:
host_cmux_tree<uint64_t, int64_t, Degree<8192>>(
v_stream, gpu_index, (uint64_t *)glwe_array_out, (uint64_t *)ggsw_in,
(uint64_t *)lut_vector, glwe_dimension, polynomial_size, base_log,
level_count, r, tau, max_shared_memory);
(uint64_t *)lut_vector, cmux_tree_buffer, glwe_dimension,
polynomial_size, base_log, level_count, r, tau, max_shared_memory);
break;
default:
break;
}
}
/*
* This cleanup function frees the GPU memory held in cmux_tree_buffer by the
* CMUX tree, for 32-bit or 64-bit inputs.
*/
void cleanup_cuda_cmux_tree(void *v_stream, uint32_t gpu_index,
int8_t **cmux_tree_buffer) {
auto stream = static_cast<cudaStream_t *>(v_stream);
// Free memory
cuda_drop_async(*cmux_tree_buffer, stream, gpu_index);
}
/*
* Performs blind rotation on a batch of 64-bit input GGSW ciphertexts
* - `v_stream` is a void pointer to the Cuda stream to be used in the kernel

View File

@@ -1,6 +1,7 @@
#ifndef VERTICAL_PACKING_H
#define VERTICAL_PACKING_H
#ifndef VERTICAL_PACKING_CUH
#define VERTICAL_PACKING_CUH
#include "../include/vertical_packing.h"
#include "bootstrap.h"
#include "complex/operations.cuh"
#include "crypto/gadget.cuh"
@@ -242,6 +243,71 @@ __global__ void device_batch_cmux(Torus *glwe_array_out, Torus *glwe_array_in,
selected_memory, output_idx, input_idx1, input_idx2, glwe_dim,
polynomial_size, base_log, level_count, ggsw_idx);
}
template <typename Torus>
__host__ __device__ int
get_memory_needed_per_block_cmux_tree(uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size + // glwe_sub_mask
sizeof(Torus) * polynomial_size + // glwe_sub_body
sizeof(double2) * polynomial_size / 2 + // mask_res_fft
sizeof(double2) * polynomial_size / 2 + // body_res_fft
sizeof(double2) * polynomial_size / 2; // glwe_fft
}
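// Sanity check (my own arithmetic, not part of the diff), assuming
// Torus = uint64_t and polynomial_size = 1024:
//   glwe_sub_mask : 8 * 1024      =  8192 B
//   glwe_sub_body : 8 * 1024      =  8192 B
//   mask_res_fft  : 16 * 1024 / 2 =  8192 B
//   body_res_fft  : 16 * 1024 / 2 =  8192 B
//   glwe_fft      : 16 * 1024 / 2 =  8192 B
//   total per block               = 40960 B of dynamic shared memory
//   (or of global memory per block in the NOSM path).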
template <typename Torus>
__host__ __device__ int
get_buffer_size_cmux_tree(uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t r, uint32_t tau,
uint32_t max_shared_memory) {
int memory_needed_per_block =
get_memory_needed_per_block_cmux_tree<Torus>(polynomial_size);
int num_lut = (1 << r);
int ggsw_size = polynomial_size * (glwe_dimension + 1) *
(glwe_dimension + 1) * level_count;
int glwe_size = (glwe_dimension + 1) * polynomial_size;
int device_mem = 0;
if (max_shared_memory < memory_needed_per_block) {
device_mem = memory_needed_per_block * (1 << (r - 1)) * tau;
}
if (max_shared_memory < polynomial_size * sizeof(double)) {
device_mem += polynomial_size * sizeof(double);
}
return r * ggsw_size * sizeof(double) +
num_lut * tau * glwe_size * sizeof(Torus) +
num_lut * tau * glwe_size * sizeof(Torus) + device_mem;
}
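// Worked example (assumed parameters, not part of the diff): Torus = uint64_t,
// glwe_dimension = 1, polynomial_size = 1024, level_count = 3, r = 4, tau = 2,
// with max_shared_memory large enough that device_mem stays 0:
//   ggsw_size   = 1024 * 2 * 2 * 3 = 12288, glwe_size = 2048, num_lut = 16
//   GGSW FFT    : r * ggsw_size * 8              = 4 * 12288 * 8    =  393216 B
//   ping buffer : num_lut * tau * glwe_size * 8  = 16 * 2 * 2048 * 8 = 524288 B
//   pong buffer : same as the ping buffer                            = 524288 B
//   total                                        = 1441792 B (about 1.4 MB)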
template <typename Torus, typename STorus, typename params>
__host__ void
scratch_cmux_tree(void *v_stream, uint32_t gpu_index, int8_t **cmux_tree_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t r, uint32_t tau,
uint32_t max_shared_memory, bool allocate_gpu_memory) {
cudaSetDevice(gpu_index);
auto stream = static_cast<cudaStream_t *>(v_stream);
int memory_needed_per_block =
get_memory_needed_per_block_cmux_tree<Torus>(polynomial_size);
if (max_shared_memory >= memory_needed_per_block) {
check_cuda_error(cudaFuncSetAttribute(
device_batch_cmux<Torus, STorus, params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, memory_needed_per_block));
check_cuda_error(
cudaFuncSetCacheConfig(device_batch_cmux<Torus, STorus, params, FULLSM>,
cudaFuncCachePreferShared));
}
if (allocate_gpu_memory) {
int buffer_size = get_buffer_size_cmux_tree<Torus>(
glwe_dimension, polynomial_size, level_count, r, tau,
max_shared_memory);
*cmux_tree_buffer =
(int8_t *)cuda_malloc_async(buffer_size, stream, gpu_index);
check_cuda_error(cudaGetLastError());
}
}
/*
* This kernel executes the CMUX tree used by the hybrid packing of the WoPBS.
*
@@ -259,12 +325,12 @@ __global__ void device_batch_cmux(Torus *glwe_array_out, Torus *glwe_array_in,
* - tau: The number of CMUX trees to execute
*/
template <typename Torus, typename STorus, class params>
__host__ void host_cmux_tree(void *v_stream, uint32_t gpu_index,
Torus *glwe_array_out, Torus *ggsw_in,
Torus *lut_vector, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t r, uint32_t tau,
uint32_t max_shared_memory) {
__host__ void
host_cmux_tree(void *v_stream, uint32_t gpu_index, Torus *glwe_array_out,
Torus *ggsw_in, Torus *lut_vector, int8_t *cmux_tree_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t r,
uint32_t tau, uint32_t max_shared_memory) {
cudaSetDevice(gpu_index);
auto stream = static_cast<cudaStream_t *>(v_stream);
@@ -277,11 +343,7 @@ __host__ void host_cmux_tree(void *v_stream, uint32_t gpu_index,
}
int memory_needed_per_block =
sizeof(Torus) * polynomial_size + // glwe_sub_mask
sizeof(Torus) * polynomial_size + // glwe_sub_body
sizeof(double2) * polynomial_size / 2 + // mask_res_fft
sizeof(double2) * polynomial_size / 2 + // body_res_fft
sizeof(double2) * polynomial_size / 2; // glwe_fft
get_memory_needed_per_block_cmux_tree<Torus>(polynomial_size);
dim3 thds(polynomial_size / params::opt, 1, 1);
@@ -289,51 +351,45 @@ __host__ void host_cmux_tree(void *v_stream, uint32_t gpu_index,
int ggsw_size = polynomial_size * (glwe_dimension + 1) *
(glwe_dimension + 1) * level_count;
double2 *d_ggsw_fft_in = (double2 *)cuda_malloc_async(
r * ggsw_size * sizeof(double), stream, gpu_index);
double2 *d_ggsw_fft_in = (double2 *)cmux_tree_buffer;
int8_t *d_mem_fft =
(int8_t *)d_ggsw_fft_in + (ptrdiff_t)(r * ggsw_size * sizeof(double));
batch_fft_ggsw_vector<Torus, STorus, params>(
stream, d_ggsw_fft_in, ggsw_in, r, glwe_dimension, polynomial_size,
level_count, gpu_index, max_shared_memory);
stream, d_ggsw_fft_in, ggsw_in, d_mem_fft, r, glwe_dimension,
polynomial_size, level_count, gpu_index, max_shared_memory);
//////////////////////
// Allocate global memory in case parameters are too large
int8_t *d_mem;
if (max_shared_memory < memory_needed_per_block) {
d_mem = (int8_t *)cuda_malloc_async(
memory_needed_per_block * (1 << (r - 1)) * tau, stream, gpu_index);
} else {
check_cuda_error(cudaFuncSetAttribute(
device_batch_cmux<Torus, STorus, params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, memory_needed_per_block));
check_cuda_error(
cudaFuncSetCacheConfig(device_batch_cmux<Torus, STorus, params, FULLSM>,
cudaFuncCachePreferShared));
int8_t *d_buffer1 = d_mem_fft;
if (max_shared_memory < polynomial_size * sizeof(double)) {
d_buffer1 = d_mem_fft + (ptrdiff_t)(polynomial_size * sizeof(double));
}
// Allocate buffers
int glwe_size = (glwe_dimension + 1) * polynomial_size;
Torus *d_buffer1 = (Torus *)cuda_malloc_async(
num_lut * tau * glwe_size * sizeof(Torus), stream, gpu_index);
Torus *d_buffer2 = (Torus *)cuda_malloc_async(
num_lut * tau * glwe_size * sizeof(Torus), stream, gpu_index);
int8_t *d_buffer2 =
d_buffer1 + (ptrdiff_t)(num_lut * tau * glwe_size * sizeof(Torus));
add_padding_to_lut_async<Torus, params>(d_buffer1, lut_vector, glwe_dimension,
num_lut * tau, stream);
add_padding_to_lut_async<Torus, params>(
(Torus *)d_buffer1, lut_vector, glwe_dimension, num_lut * tau, stream);
Torus *output;
// Run the cmux tree
for (int layer_idx = 0; layer_idx < r; layer_idx++) {
output = (layer_idx % 2 ? d_buffer1 : d_buffer2);
Torus *input = (layer_idx % 2 ? d_buffer2 : d_buffer1);
output = (layer_idx % 2 ? (Torus *)d_buffer1 : (Torus *)d_buffer2);
Torus *input = (layer_idx % 2 ? (Torus *)d_buffer2 : (Torus *)d_buffer1);
int num_cmuxes = (1 << (r - 1 - layer_idx));
dim3 grid(num_cmuxes, tau, 1);
int8_t *d_mem =
d_buffer2 + (ptrdiff_t)(num_lut * tau * glwe_size * sizeof(Torus));
// walks horizontally through the leaves
if (max_shared_memory < memory_needed_per_block)
if (max_shared_memory < memory_needed_per_block) {
device_batch_cmux<Torus, STorus, params, NOSM>
<<<grid, thds, 0, *stream>>>(output, input, d_ggsw_fft_in, d_mem,
memory_needed_per_block,
@@ -341,7 +397,7 @@ __host__ void host_cmux_tree(void *v_stream, uint32_t gpu_index,
polynomial_size, base_log, level_count,
layer_idx, // r
num_lut);
else
} else {
device_batch_cmux<Torus, STorus, params, FULLSM>
<<<grid, thds, memory_needed_per_block, *stream>>>(
output, input, d_ggsw_fft_in, d_mem, memory_needed_per_block,
@@ -349,20 +405,15 @@ __host__ void host_cmux_tree(void *v_stream, uint32_t gpu_index,
polynomial_size, base_log, level_count,
layer_idx, // r
num_lut);
}
check_cuda_error(cudaGetLastError());
}
for (int i = 0; i < tau; i++)
for (int i = 0; i < tau; i++) {
check_cuda_error(cudaMemcpyAsync(
glwe_array_out + i * glwe_size, output + i * num_lut * glwe_size,
glwe_size * sizeof(Torus), cudaMemcpyDeviceToDevice, *stream));
// Free memory
cuda_drop_async(d_ggsw_fft_in, stream, gpu_index);
cuda_drop_async(d_buffer1, stream, gpu_index);
cuda_drop_async(d_buffer2, stream, gpu_index);
if (max_shared_memory < memory_needed_per_block)
cuda_drop_async(d_mem, stream, gpu_index);
}
}
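// For reference, the slices that host_cmux_tree now carves out of
// cmux_tree_buffer (reconstructed from the pointer arithmetic above; sizes are
// cumulative offsets, not part of the diff):
//   d_ggsw_fft_in : r * ggsw_size * sizeof(double) bytes (FFT of the r GGSWs)
//   d_mem_fft     : polynomial_size * sizeof(double) bytes, only reserved when
//                   the GGSW FFT runs in the NOSM path
//   d_buffer1     : num_lut * tau * glwe_size * sizeof(Torus) bytes (ping)
//   d_buffer2     : num_lut * tau * glwe_size * sizeof(Torus) bytes (pong)
//   d_mem         : memory_needed_per_block * (1 << (r - 1)) * tau bytes, only
//                   used when the CMUX kernels run in the NOSM path
// This matches the sizing computed in get_buffer_size_cmux_tree.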
/*
@@ -495,10 +546,13 @@ __host__ void host_blind_rotate_and_sample_extraction(
double2 *d_ggsw_fft_in = (double2 *)cuda_malloc_async(
mbr_size * ggsw_size * sizeof(double), stream, gpu_index);
int8_t *d_mem_fft = (int8_t *)cuda_malloc_async(
polynomial_size * sizeof(double), stream, gpu_index);
batch_fft_ggsw_vector<Torus, STorus, params>(
stream, d_ggsw_fft_in, ggsw_in, mbr_size, glwe_dimension, polynomial_size,
level_count, gpu_index, max_shared_memory);
stream, d_ggsw_fft_in, ggsw_in, d_mem_fft, mbr_size, glwe_dimension,
polynomial_size, level_count, gpu_index, max_shared_memory);
check_cuda_error(cudaGetLastError());
cuda_drop_async(d_mem_fft, stream, gpu_index);
//
dim3 thds(polynomial_size / params::opt, 1, 1);
@@ -525,4 +579,4 @@ __host__ void host_blind_rotate_and_sample_extraction(
if (max_shared_memory < memory_needed_per_block)
cuda_drop_async(d_mem, stream, gpu_index);
}
#endif // VERTICAL_PACKING_H
#endif // VERTICAL_PACKING_CUH

View File

@@ -68,9 +68,13 @@ __host__ void scratch_circuit_bootstrap_vertical_packing(
// allocate and initialize device pointers for circuit bootstrap and vertical
// packing
if (allocate_gpu_memory) {
int buffer_size = get_buffer_size_cbs_vp<Torus>(
glwe_dimension, lwe_dimension, polynomial_size, level_count_cbs,
number_of_inputs, tau);
int buffer_size =
get_buffer_size_cbs_vp<Torus>(glwe_dimension, lwe_dimension,
polynomial_size, level_count_cbs,
number_of_inputs, tau) +
get_buffer_size_cmux_tree<Torus>(glwe_dimension, polynomial_size,
level_count_cbs, r, tau,
max_shared_memory);
*cbs_vp_buffer =
(int8_t *)cuda_malloc_async(buffer_size, stream, gpu_index);
}
@@ -88,6 +92,9 @@ __host__ void scratch_circuit_bootstrap_vertical_packing(
uint32_t bits = sizeof(Torus) * 8;
*cbs_delta_log = (bits - 1);
scratch_cmux_tree<Torus, STorus, params>(
v_stream, gpu_index, cbs_vp_buffer, glwe_dimension, polynomial_size,
level_count_cbs, r, tau, max_shared_memory, false);
}
/*
@@ -157,12 +164,15 @@ __host__ void host_circuit_bootstrap_vertical_packing(
// split the vec of GGSW in two, the msb GGSW is for the CMux tree and the
// lsb GGSW is for the last blind rotation.
uint32_t r = number_of_inputs - params::log2_degree;
int8_t *cmux_tree_buffer =
(int8_t *)glwe_array_out +
tau * (glwe_dimension + 1) * polynomial_size * sizeof(Torus);
// CMUX Tree
// r = tau * p - log2(N)
host_cmux_tree<Torus, STorus, params>(
v_stream, gpu_index, glwe_array_out, ggsw_out, lut_vector, glwe_dimension,
polynomial_size, base_log_cbs, level_count_cbs, r, tau,
max_shared_memory);
v_stream, gpu_index, glwe_array_out, ggsw_out, lut_vector,
cmux_tree_buffer, glwe_dimension, polynomial_size, base_log_cbs,
level_count_cbs, r, tau, max_shared_memory);
check_cuda_error(cudaGetLastError());
// Blind rotation + sample extraction
@@ -215,10 +225,14 @@ scratch_wop_pbs(void *v_stream, uint32_t gpu_index, int8_t **wop_pbs_buffer,
number_of_inputs * number_of_bits_to_extract;
uint32_t tau = number_of_inputs;
uint32_t r = cbs_vp_number_of_inputs - params::log2_degree;
int buffer_size = get_buffer_size_cbs_vp<Torus>(
glwe_dimension, lwe_dimension, polynomial_size,
level_count_cbs, cbs_vp_number_of_inputs, tau) +
wop_pbs_buffer_size;
int buffer_size =
get_buffer_size_cbs_vp<Torus>(glwe_dimension, lwe_dimension,
polynomial_size, level_count_cbs,
cbs_vp_number_of_inputs, tau) +
get_buffer_size_cmux_tree<Torus>(glwe_dimension, polynomial_size,
level_count_cbs, r, tau,
max_shared_memory) +
wop_pbs_buffer_size;
*wop_pbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream, gpu_index);