feat(cuda): add a cbs+vp entry point

- fix bug in CBS as well
- update cuda benchmarks
This commit is contained in:
Agnes Leroy
2022-12-07 10:08:52 +01:00
committed by Agnès Leroy
parent 2db1ef6a56
commit 4da789abda
3 changed files with 347 additions and 20 deletions

View File

@@ -105,6 +105,22 @@ void cuda_circuit_bootstrap_64(
uint32_t level_bsk, uint32_t base_log_bsk, uint32_t level_pksk,
uint32_t base_log_pksk, uint32_t level_cbs, uint32_t base_log_cbs,
uint32_t number_of_samples, uint32_t max_shared_memory);
// Circuit bootstrap followed by vertical packing, 32-bit ciphertexts.
// Converts `number_of_inputs` input LWEs into GGSWs and evaluates `lut_number`
// look-up tables, writing one output LWE per LUT into `lwe_array_out`.
// All array arguments are device pointers; work is enqueued on `v_stream`.
void cuda_circuit_bootstrap_vertical_packing_32(
    void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
    void *fourier_bsk, void *cbs_fpksk, void *lut_vector,
    uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t lwe_dimension,
    uint32_t level_count_bsk, uint32_t base_log_bsk, uint32_t level_count_pksk,
    uint32_t base_log_pksk, uint32_t level_count_cbs, uint32_t base_log_cbs,
    uint32_t number_of_inputs, uint32_t lut_number, uint32_t max_shared_memory);
// Same operation on 64-bit ciphertexts.
void cuda_circuit_bootstrap_vertical_packing_64(
    void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
    void *fourier_bsk, void *cbs_fpksk, void *lut_vector,
    uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t lwe_dimension,
    uint32_t level_count_bsk, uint32_t base_log_bsk, uint32_t level_count_pksk,
    uint32_t base_log_pksk, uint32_t level_count_cbs, uint32_t base_log_cbs,
    uint32_t number_of_inputs, uint32_t lut_number, uint32_t max_shared_memory);
}
#ifdef __CUDACC__

View File

@@ -520,3 +520,157 @@ void cuda_circuit_bootstrap_64(
break;
}
}
// C entry point for circuit bootstrap + vertical packing (CBS + VP),
// 32-bit Torus. Validates the supported parameter ranges, then dispatches to
// the templated host implementation according to the polynomial size.
// All array arguments are device pointers; work is enqueued on `v_stream`.
void cuda_circuit_bootstrap_vertical_packing_32(
    void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
    void *fourier_bsk, void *cbs_fpksk, void *lut_vector,
    uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t lwe_dimension,
    uint32_t level_count_bsk, uint32_t base_log_bsk, uint32_t level_count_pksk,
    uint32_t base_log_pksk, uint32_t level_count_cbs, uint32_t base_log_cbs,
    uint32_t number_of_inputs, uint32_t lut_number,
    uint32_t max_shared_memory) {
  // Only k = 1 (glwe_dimension == 1) is supported for now.
  assert(("Error (GPU circuit bootstrap): glwe_dimension should be equal to 1",
          glwe_dimension == 1));
  assert(("Error (GPU circuit bootstrap): polynomial_size should be one of "
          "512, 1024, 2048, 4096, 8192",
          polynomial_size == 512 || polynomial_size == 1024 ||
              polynomial_size == 2048 || polynomial_size == 4096 ||
              polynomial_size == 8192));
  // The number of inputs should be lower than the number of streaming
  // multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
  // to the occupancy of 50%). The only supported value for k is 1, so
  // k + 1 = 2 for now.
  int number_of_sm = 0;
  // Query the device the caller asked for (was hard-coded to device 0).
  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount,
                         gpu_index);
  // Fixed: the message previously said "GPU extract bits" (copy-paste from the
  // bit-extraction entry point); this check belongs to circuit bootstrap.
  assert(("Error (GPU circuit bootstrap): the number of input LWEs must be "
          "lower or equal to the "
          "number of streaming multiprocessors on the device divided by 8 * "
          "level_count_bsk",
          number_of_inputs <= number_of_sm / 4. / 2. / level_count_bsk));
  switch (polynomial_size) {
  case 512:
    host_circuit_bootstrap_vertical_packing<uint32_t, int32_t, Degree<512>>(
        v_stream, gpu_index, (uint32_t *)lwe_array_out,
        (uint32_t *)lwe_array_in, (uint32_t *)lut_vector,
        (double2 *)fourier_bsk, (uint32_t *)cbs_fpksk, glwe_dimension,
        lwe_dimension, polynomial_size, base_log_bsk, level_count_bsk,
        base_log_pksk, level_count_pksk, base_log_cbs, level_count_cbs,
        number_of_inputs, lut_number, max_shared_memory);
    break;
  case 1024:
    host_circuit_bootstrap_vertical_packing<uint32_t, int32_t, Degree<1024>>(
        v_stream, gpu_index, (uint32_t *)lwe_array_out,
        (uint32_t *)lwe_array_in, (uint32_t *)lut_vector,
        (double2 *)fourier_bsk, (uint32_t *)cbs_fpksk, glwe_dimension,
        lwe_dimension, polynomial_size, base_log_bsk, level_count_bsk,
        base_log_pksk, level_count_pksk, base_log_cbs, level_count_cbs,
        number_of_inputs, lut_number, max_shared_memory);
    break;
  case 2048:
    host_circuit_bootstrap_vertical_packing<uint32_t, int32_t, Degree<2048>>(
        v_stream, gpu_index, (uint32_t *)lwe_array_out,
        (uint32_t *)lwe_array_in, (uint32_t *)lut_vector,
        (double2 *)fourier_bsk, (uint32_t *)cbs_fpksk, glwe_dimension,
        lwe_dimension, polynomial_size, base_log_bsk, level_count_bsk,
        base_log_pksk, level_count_pksk, base_log_cbs, level_count_cbs,
        number_of_inputs, lut_number, max_shared_memory);
    break;
  case 4096:
    host_circuit_bootstrap_vertical_packing<uint32_t, int32_t, Degree<4096>>(
        v_stream, gpu_index, (uint32_t *)lwe_array_out,
        (uint32_t *)lwe_array_in, (uint32_t *)lut_vector,
        (double2 *)fourier_bsk, (uint32_t *)cbs_fpksk, glwe_dimension,
        lwe_dimension, polynomial_size, base_log_bsk, level_count_bsk,
        base_log_pksk, level_count_pksk, base_log_cbs, level_count_cbs,
        number_of_inputs, lut_number, max_shared_memory);
    break;
  case 8192:
    host_circuit_bootstrap_vertical_packing<uint32_t, int32_t, Degree<8192>>(
        v_stream, gpu_index, (uint32_t *)lwe_array_out,
        (uint32_t *)lwe_array_in, (uint32_t *)lut_vector,
        (double2 *)fourier_bsk, (uint32_t *)cbs_fpksk, glwe_dimension,
        lwe_dimension, polynomial_size, base_log_bsk, level_count_bsk,
        base_log_pksk, level_count_pksk, base_log_cbs, level_count_cbs,
        number_of_inputs, lut_number, max_shared_memory);
    break;
  default:
    // Unreachable in debug builds: the polynomial_size assert above rejects
    // every other value. In release builds (NDEBUG) this is a silent no-op.
    break;
  }
}
// C entry point for circuit bootstrap + vertical packing (CBS + VP),
// 64-bit Torus. Validates the supported parameter ranges, then dispatches to
// the templated host implementation according to the polynomial size.
// All array arguments are device pointers; work is enqueued on `v_stream`.
void cuda_circuit_bootstrap_vertical_packing_64(
    void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
    void *fourier_bsk, void *cbs_fpksk, void *lut_vector,
    uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t lwe_dimension,
    uint32_t level_count_bsk, uint32_t base_log_bsk, uint32_t level_count_pksk,
    uint32_t base_log_pksk, uint32_t level_count_cbs, uint32_t base_log_cbs,
    uint32_t number_of_inputs, uint32_t lut_number,
    uint32_t max_shared_memory) {
  // Only k = 1 (glwe_dimension == 1) is supported for now.
  assert(("Error (GPU circuit bootstrap): glwe_dimension should be equal to 1",
          glwe_dimension == 1));
  assert(("Error (GPU circuit bootstrap): polynomial_size should be one of "
          "512, 1024, 2048, 4096, 8192",
          polynomial_size == 512 || polynomial_size == 1024 ||
              polynomial_size == 2048 || polynomial_size == 4096 ||
              polynomial_size == 8192));
  // The number of inputs should be lower than the number of streaming
  // multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
  // to the occupancy of 50%). The only supported value for k is 1, so
  // k + 1 = 2 for now.
  int number_of_sm = 0;
  // Query the device the caller asked for (was hard-coded to device 0).
  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount,
                         gpu_index);
  // Fixed: the message previously said "GPU extract bits" (copy-paste from the
  // bit-extraction entry point); this check belongs to circuit bootstrap.
  assert(("Error (GPU circuit bootstrap): the number of input LWEs must be "
          "lower or equal to the "
          "number of streaming multiprocessors on the device divided by 8 * "
          "level_count_bsk",
          number_of_inputs <= number_of_sm / 4. / 2. / level_count_bsk));
  switch (polynomial_size) {
  case 512:
    host_circuit_bootstrap_vertical_packing<uint64_t, int64_t, Degree<512>>(
        v_stream, gpu_index, (uint64_t *)lwe_array_out,
        (uint64_t *)lwe_array_in, (uint64_t *)lut_vector,
        (double2 *)fourier_bsk, (uint64_t *)cbs_fpksk, glwe_dimension,
        lwe_dimension, polynomial_size, base_log_bsk, level_count_bsk,
        base_log_pksk, level_count_pksk, base_log_cbs, level_count_cbs,
        number_of_inputs, lut_number, max_shared_memory);
    break;
  case 1024:
    host_circuit_bootstrap_vertical_packing<uint64_t, int64_t, Degree<1024>>(
        v_stream, gpu_index, (uint64_t *)lwe_array_out,
        (uint64_t *)lwe_array_in, (uint64_t *)lut_vector,
        (double2 *)fourier_bsk, (uint64_t *)cbs_fpksk, glwe_dimension,
        lwe_dimension, polynomial_size, base_log_bsk, level_count_bsk,
        base_log_pksk, level_count_pksk, base_log_cbs, level_count_cbs,
        number_of_inputs, lut_number, max_shared_memory);
    break;
  case 2048:
    host_circuit_bootstrap_vertical_packing<uint64_t, int64_t, Degree<2048>>(
        v_stream, gpu_index, (uint64_t *)lwe_array_out,
        (uint64_t *)lwe_array_in, (uint64_t *)lut_vector,
        (double2 *)fourier_bsk, (uint64_t *)cbs_fpksk, glwe_dimension,
        lwe_dimension, polynomial_size, base_log_bsk, level_count_bsk,
        base_log_pksk, level_count_pksk, base_log_cbs, level_count_cbs,
        number_of_inputs, lut_number, max_shared_memory);
    break;
  case 4096:
    host_circuit_bootstrap_vertical_packing<uint64_t, int64_t, Degree<4096>>(
        v_stream, gpu_index, (uint64_t *)lwe_array_out,
        (uint64_t *)lwe_array_in, (uint64_t *)lut_vector,
        (double2 *)fourier_bsk, (uint64_t *)cbs_fpksk, glwe_dimension,
        lwe_dimension, polynomial_size, base_log_bsk, level_count_bsk,
        base_log_pksk, level_count_pksk, base_log_cbs, level_count_cbs,
        number_of_inputs, lut_number, max_shared_memory);
    break;
  case 8192:
    host_circuit_bootstrap_vertical_packing<uint64_t, int64_t, Degree<8192>>(
        v_stream, gpu_index, (uint64_t *)lwe_array_out,
        (uint64_t *)lwe_array_in, (uint64_t *)lut_vector,
        (double2 *)fourier_bsk, (uint64_t *)cbs_fpksk, glwe_dimension,
        lwe_dimension, polynomial_size, base_log_bsk, level_count_bsk,
        base_log_pksk, level_count_pksk, base_log_cbs, level_count_cbs,
        number_of_inputs, lut_number, max_shared_memory);
    break;
  default:
    // Unreachable in debug builds: the polynomial_size assert above rejects
    // every other value. In release builds (NDEBUG) this is a silent no-op.
    break;
  }
}

View File

@@ -19,6 +19,7 @@
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial.cuh"
#include "polynomial/polynomial_math.cuh"
#include "utils/kernel_dimensions.cuh"
#include "utils/memory.cuh"
#include "utils/timer.cuh"
@@ -187,6 +188,21 @@ cmux(Torus *glwe_array_out, Torus *glwe_array_in, double2 *ggsw_in,
add_to_torus<Torus, params>(body_res_fft, mb_body);
}
// Expands a packed array of `lut_number` LUT polynomials into trivial GLWE
// form: for each coefficient, the (glwe_dimension) mask polynomials of the
// destination GLWE slot are zeroed and the coefficient is copied into the
// body polynomial. One thread handles one input coefficient; extra threads
// simply exit.
template <typename Torus, class params>
__global__ void device_build_lut(Torus *lut_out, Torus *lut_in,
                                 uint32_t glwe_dimension, uint32_t lut_number) {
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  int total_coeffs = glwe_dimension * params::degree * lut_number;
  if (tid >= total_coeffs)
    return;
  // Which LUT this coefficient belongs to in the packed input layout.
  int lut_id = tid / (glwe_dimension * params::degree);
  // Base address of this coefficient's slot in the expanded output, which is
  // laid out as (glwe_dimension + 1) polynomials per LUT.
  Torus *dst = &lut_out[tid + lut_id * (glwe_dimension + 1) * params::degree];
  // Zero the mask part of the trivial GLWE.
  for (int k = 0; k < glwe_dimension; k++)
    dst[k * params::degree] = 0;
  // Write the LUT coefficient into the body part.
  dst[glwe_dimension * params::degree] = lut_in[tid];
}
/**
* Computes several CMUXes using an array of GLWE ciphertexts and a single GGSW
* ciphertext. The GLWE ciphertexts are picked two-by-two in sequence. Each
@@ -252,10 +268,18 @@ void host_cmux_tree(void *v_stream, uint32_t gpu_index, Torus *glwe_array_out,
uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t r,
uint32_t max_shared_memory) {
auto stream = static_cast<cudaStream_t *>(v_stream);
int num_lut = (1 << r);
int num_lut = (1 << r);
if (r == 0) {
// Just copy the LUT
checkCudaErrors(
cudaMemcpyAsync(glwe_array_out, lut_vector,
(glwe_dimension + 1) * polynomial_size * sizeof(Torus),
cudaMemcpyDeviceToDevice, *stream));
checkCudaErrors(cudaStreamSynchronize(*stream));
return;
}
cuda_initialize_twiddles(polynomial_size, 0);
int memory_needed_per_block =
@@ -315,7 +339,7 @@ void host_cmux_tree(void *v_stream, uint32_t gpu_index, Torus *glwe_array_out,
int num_cmuxes = (1 << (r - 1 - layer_idx));
dim3 grid(num_cmuxes, 1, 1);
// walks horizontally through the leafs
// walks horizontally through the leaves
if (max_shared_memory < memory_needed_per_block)
device_batch_cmux<Torus, STorus, params, NOSM>
<<<grid, thds, 0, *stream>>>(output, input, d_ggsw_fft_in, d_mem,
@@ -334,10 +358,9 @@ void host_cmux_tree(void *v_stream, uint32_t gpu_index, Torus *glwe_array_out,
);
}
checkCudaErrors(
cudaMemcpyAsync(glwe_array_out, output,
(glwe_dimension + 1) * polynomial_size * sizeof(Torus),
cudaMemcpyDeviceToDevice, *stream));
checkCudaErrors(cudaMemcpyAsync(glwe_array_out, output,
glwe_size * sizeof(Torus),
cudaMemcpyDeviceToDevice, *stream));
// We only need synchronization to assert that data is in glwe_array_out
// before returning. Memory release can be added to the stream and processed
@@ -466,29 +489,31 @@ __global__ void fill_lut_body_for_cbs(Torus *lut, uint32_t ciphertext_n_bits,
for (int i = 0; i < params::opt; i++) {
cur_mask[tid] = 0;
cur_poly[tid] =
0ll - 1ll << (ciphertext_n_bits - 1 - base_log_cbs * (blockIdx.x + 1));
0ll -
(1ll << (ciphertext_n_bits - 1 - base_log_cbs * (blockIdx.x + 1)));
tid += params::degree / params::opt;
}
}
template <typename Torus, class params>
__global__ void copy_add_lwe_cbs(Torus *lwe_dst, Torus *lwe_src, Torus value) {
__global__ void copy_add_lwe_cbs(Torus *lwe_dst, Torus *lwe_src,
uint32_t ciphertext_n_bits,
uint32_t base_log_cbs, uint32_t level_cbs) {
size_t tid = threadIdx.x;
size_t src_lwe_id = blockIdx.y;
size_t dst_lwe_id = blockIdx.x;
size_t src_lwe_id = dst_lwe_id / 2;
size_t cur_cbs_level = src_lwe_id % level_cbs + 1;
auto cur_src = &lwe_src[src_lwe_id * (params::degree + 1)];
auto cur_dst = &lwe_dst[dst_lwe_id * (params::degree + 1)];
#pragma unroll
for (int i = 0; i < params::opt; i++) {
cur_dst[tid] = cur_src[tid];
tid += params::degree / params::opt;
}
Torus val = 1ll << (ciphertext_n_bits - 1 - base_log_cbs * cur_cbs_level);
if (threadIdx.x == 0) {
cur_dst[params::degree] = cur_src[params::degree] + value;
cur_dst[params::degree] = cur_src[params::degree] + val;
}
}
@@ -683,7 +708,6 @@ void host_blind_rotate_and_sample_extraction(
assert(glwe_dimension ==
1); // For larger k we will need to adjust the mask size
auto stream = static_cast<cudaStream_t *>(v_stream);
int memory_needed_per_block =
@@ -794,14 +818,13 @@ __host__ void host_circuit_bootstrap(
lwe_dimension, polynomial_size, base_log_bsk, level_bsk, pbs_count,
level_cbs, 0, max_shared_memory);
dim3 copy_grid(pbs_count * (glwe_dimension + 1), pbs_count, 1);
dim3 copy_grid(pbs_count * (glwe_dimension + 1), 1, 1);
dim3 copy_block(params::degree / params::opt, 1, 1);
// Add q/4 to center the error while computing a negacyclic LUT
// copy pbs result (glwe_dimension + 1) times to be an input of fp-ks
copy_add_lwe_cbs<Torus, params><<<copy_grid, copy_block>>>(
lwe_array_in_fp_ks_buffer, lwe_array_out_pbs_buffer,
1ll << (ciphertext_n_bits - 1 - base_log_cbs * level_cbs));
lwe_array_in_fp_ks_buffer, lwe_array_out_pbs_buffer, ciphertext_n_bits,
base_log_cbs, level_cbs);
cuda_fp_keyswitch_lwe_to_glwe(
v_stream, ggsw_out, lwe_array_in_fp_ks_buffer, fp_ksk_array,
@@ -809,4 +832,138 @@ __host__ void host_circuit_bootstrap(
level_pksk, pbs_count * (glwe_dimension + 1), glwe_dimension + 1);
}
#endif // WO_PBS_H
// Circuit bootstrap + vertical packing host driver.
//
// number_of_inputs is the total number of LWE ciphertexts passed to CBS + VP,
// i.e. tau * p where tau is the number of LUTs (the original number of LWEs
// before bit extraction) and p is the number of extracted bits.
//
// Pipeline: (1) circuit-bootstrap every input LWE into a GGSW, (2) expand the
// packed LUTs into trivial GLWEs, (3) for each LUT run either a CMUX tree
// followed by blind rotation + sample extraction (when number_of_inputs >
// log2(N)) or a blind rotation + sample extraction alone. One output LWE is
// written per LUT into lwe_array_out.
//
// All Torus* arguments are device pointers; work is enqueued on *v_stream.
template <typename Torus, typename STorus, class params>
__host__ void host_circuit_bootstrap_vertical_packing(
    void *v_stream, uint32_t gpu_index, Torus *lwe_array_out,
    Torus *lwe_array_in, Torus *lut_vector, double2 *fourier_bsk,
    Torus *cbs_fpksk, uint32_t glwe_dimension, uint32_t lwe_dimension,
    uint32_t polynomial_size, uint32_t base_log_bsk, uint32_t level_count_bsk,
    uint32_t base_log_pksk, uint32_t level_count_pksk, uint32_t base_log_cbs,
    uint32_t level_count_cbs, uint32_t number_of_inputs, uint32_t lut_number,
    uint32_t max_shared_memory) {
  auto stream = static_cast<cudaStream_t *>(v_stream);
  // allocate and initialize device pointers for circuit bootstrap
  // output ggsw array for cbs
  int ggsw_size = level_count_cbs * (glwe_dimension + 1) *
                  (glwe_dimension + 1) * polynomial_size;
  Torus *ggsw_out = (Torus *)cuda_malloc_async(
      number_of_inputs * ggsw_size * sizeof(Torus), *stream, gpu_index);
  // input lwe array for fp-ks
  Torus *lwe_array_in_fp_ks_buffer = (Torus *)cuda_malloc_async(
      number_of_inputs * level_count_cbs * (glwe_dimension + 1) *
          (polynomial_size + 1) * sizeof(Torus),
      *stream, gpu_index);
  // buffer for pbs output
  Torus *lwe_array_out_pbs_buffer =
      (Torus *)cuda_malloc_async(number_of_inputs * level_count_cbs *
                                     (polynomial_size + 1) * sizeof(Torus),
                                 *stream, gpu_index);
  // vector for shifted lwe input
  Torus *lwe_array_in_shifted_buffer = (Torus *)cuda_malloc_async(
      number_of_inputs * level_count_cbs * (lwe_dimension + 1) * sizeof(Torus),
      *stream, gpu_index);
  // lut vector buffer for cbs
  Torus *lut_vector_cbs = (Torus *)cuda_malloc_async(
      level_count_cbs * (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
      *stream, gpu_index);
  // indexes of lut vectors for cbs: input i at decomposition level j uses
  // lut j (index % level_count_cbs)
  uint32_t *h_lut_vector_indexes =
      (uint32_t *)malloc(number_of_inputs * level_count_cbs * sizeof(uint32_t));
  for (uint index = 0; index < level_count_cbs * number_of_inputs; index++) {
    h_lut_vector_indexes[index] = index % level_count_cbs;
  }
  uint32_t *lut_vector_indexes = (uint32_t *)cuda_malloc_async(
      number_of_inputs * level_count_cbs * sizeof(uint32_t), *stream,
      gpu_index);
  cuda_memcpy_async_to_gpu(lut_vector_indexes, h_lut_vector_indexes,
                           number_of_inputs * level_count_cbs *
                               sizeof(uint32_t),
                           v_stream, gpu_index);
  checkCudaErrors(cudaGetLastError());
  uint32_t bits = sizeof(Torus) * 8;
  uint32_t delta_log = (bits - 1);
  host_circuit_bootstrap<Torus, params>(
      v_stream, gpu_index, ggsw_out, lwe_array_in, fourier_bsk, cbs_fpksk,
      lwe_array_in_shifted_buffer, lut_vector_cbs, lut_vector_indexes,
      lwe_array_out_pbs_buffer, lwe_array_in_fp_ks_buffer, delta_log,
      polynomial_size, glwe_dimension, lwe_dimension, level_count_bsk,
      base_log_bsk, level_count_pksk, base_log_pksk, level_count_cbs,
      base_log_cbs, number_of_inputs, max_shared_memory);
  checkCudaErrors(cudaGetLastError());
  // Free the circuit-bootstrap scratch buffers (stream-ordered, so this is
  // safe to enqueue right after the work that uses them)
  cuda_drop_async(lwe_array_in_fp_ks_buffer, *stream, gpu_index);
  cuda_drop_async(lwe_array_in_shifted_buffer, *stream, gpu_index);
  cuda_drop_async(lwe_array_out_pbs_buffer, *stream, gpu_index);
  cuda_drop_async(lut_vector_cbs, *stream, gpu_index);
  cuda_drop_async(lut_vector_indexes, *stream, gpu_index);
  free(h_lut_vector_indexes);
  // we need to expand the lut to fill the masks with zeros
  Torus *lut_vector_glwe = (Torus *)cuda_malloc_async(
      (glwe_dimension + 1) * lut_number * polynomial_size * sizeof(Torus),
      *stream, gpu_index);
  int num_blocks = 0, num_threads = 0;
  int num_entries = glwe_dimension * polynomial_size * lut_number;
  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
  device_build_lut<Torus, params><<<num_blocks, num_threads, 0, *stream>>>(
      lut_vector_glwe, lut_vector, glwe_dimension, lut_number);
  checkCudaErrors(cudaGetLastError());
  // number_of_inputs = tau * p is the total number of GGSWs
  if (number_of_inputs > params::log2_degree) {
    // split the vec of GGSW in two, the msb GGSW is for the CMux tree and the
    // lsb GGSW is for the last blind rotation.
    uint32_t r = number_of_inputs - params::log2_degree;
    Torus *br_ggsw = (Torus *)ggsw_out +
                     (ptrdiff_t)(r * level_count_cbs * (glwe_dimension + 1) *
                                 (glwe_dimension + 1) * polynomial_size);
    // CMUX-tree output scratch, allocated once and reused for every LUT
    // (previously cudaMalloc'd and freed inside the loop on each iteration)
    Torus *glwe_array_out = (Torus *)cuda_malloc_async(
        (glwe_dimension + 1) * polynomial_size * sizeof(Torus), *stream,
        gpu_index);
    checkCudaErrors(cudaGetLastError());
    for (uint i = 0; i < lut_number; i++) {
      Torus *lut_glwe = (Torus *)lut_vector_glwe +
                        (ptrdiff_t)(i * (glwe_dimension + 1) * polynomial_size);
      // CMUX Tree over the r most-significant GGSWs
      // r = tau * p - log2(N)
      host_cmux_tree<Torus, STorus, params>(
          v_stream, gpu_index, glwe_array_out, ggsw_out, lut_glwe,
          glwe_dimension, polynomial_size, base_log_cbs, level_count_cbs, r,
          max_shared_memory);
      checkCudaErrors(cudaGetLastError());
      // Blind rotation + sample extraction
      // mbr = tau * p - r = log2(N)
      Torus *lwe_out =
          (Torus *)lwe_array_out + (ptrdiff_t)(i * (lwe_dimension + 1));
      host_blind_rotate_and_sample_extraction<Torus, STorus, params>(
          v_stream, gpu_index, lwe_out, br_ggsw, glwe_array_out,
          number_of_inputs - r, 1, glwe_dimension, polynomial_size,
          base_log_cbs, level_count_cbs, max_shared_memory);
    }
    cuda_drop_async(glwe_array_out, *stream, gpu_index);
  } else {
    // Few enough GGSWs: blind rotation + sample extraction alone
    for (uint i = 0; i < lut_number; i++) {
      Torus *lut_glwe = (Torus *)lut_vector_glwe +
                        (ptrdiff_t)(i * (glwe_dimension + 1) * polynomial_size);
      Torus *lwe_out =
          (Torus *)lwe_array_out + (ptrdiff_t)(i * (lwe_dimension + 1));
      host_blind_rotate_and_sample_extraction<Torus, STorus, params>(
          v_stream, gpu_index, lwe_out, ggsw_out, lut_glwe, number_of_inputs, 1,
          glwe_dimension, polynomial_size, base_log_cbs, level_count_cbs,
          max_shared_memory);
    }
  }
  cuda_drop_async(ggsw_out, *stream, gpu_index);
  // Fixed: lut_vector_glwe was previously never freed (device memory leak)
  cuda_drop_async(lut_vector_glwe, *stream, gpu_index);
}
#endif // WOP_PBS_H