refactor(cuda): Implement support for N=256 in the cmux tree, bit extraction, and CBS.
Pedro Alves authored on 2023-02-22 11:13:38 -03:00; committed by Agnès Leroy
parent 75e9baae78
commit 184d453387
4 changed files with 187 additions and 37 deletions
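Each entry point touched by this commit gains a 'case 256:' branch that instantiates the templated implementation with Degree<256>, next to the existing 512-8192 cases, and the matching asserts and doc comments are extended to list 256. The sketch below is illustrative only, not code from this commit: 'run_for_degree', 'dispatch_on_polynomial_size' and the local 'Degree' stand-in are invented names, used just to show the dispatch pattern the hunks repeat.

#include <cassert>
#include <cstdint>

template <int N> struct Degree {};  // stand-in for the library's Degree tag

template <class params> void run_for_degree() { /* templated kernel launch */ }

inline void dispatch_on_polynomial_size(uint32_t polynomial_size) {
  switch (polynomial_size) {
  case 256:  run_for_degree<Degree<256>>();  break;  // newly supported size
  case 512:  run_for_degree<Degree<512>>();  break;
  case 1024: run_for_degree<Degree<1024>>(); break;
  case 2048: run_for_degree<Degree<2048>>(); break;
  case 4096: run_for_degree<Degree<4096>>(); break;
  case 8192: run_for_degree<Degree<8192>>(); break;
  default:   assert(false && "unsupported polynomial size");
  }
}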


@@ -12,6 +12,12 @@ void scratch_cuda_extract_bits_32(
bool allocate_gpu_memory) {
switch (polynomial_size) {
case 256:
scratch_extract_bits<uint32_t, int32_t, Degree<256>>(
v_stream, gpu_index, bit_extract_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_count, number_of_inputs, max_shared_memory,
allocate_gpu_memory);
break;
case 512:
scratch_extract_bits<uint32_t, int32_t, Degree<512>>(
v_stream, gpu_index, bit_extract_buffer, glwe_dimension, lwe_dimension,
@@ -59,6 +65,12 @@ void scratch_cuda_extract_bits_64(
bool allocate_gpu_memory) {
switch (polynomial_size) {
case 256:
scratch_extract_bits<uint64_t, int64_t, Degree<256>>(
v_stream, gpu_index, bit_extract_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_count, number_of_inputs, max_shared_memory,
allocate_gpu_memory);
break;
case 512:
scratch_extract_bits<uint64_t, int64_t, Degree<512>>(
v_stream, gpu_index, bit_extract_buffer, glwe_dimension, lwe_dimension,
@@ -110,10 +122,10 @@ void cuda_extract_bits_32(void *v_stream, uint32_t gpu_index,
assert(("Error (GPU extract bits): base log should be <= 32",
base_log_bsk <= 32));
assert(("Error (GPU extract bits): lwe_dimension_in should be one of "
"512, 1024, 2048, 4096, 8192",
lwe_dimension_in == 512 || lwe_dimension_in == 1024 ||
lwe_dimension_in == 2048 || lwe_dimension_in == 4096 ||
lwe_dimension_in == 8192));
"256, 512, 1024, 2048, 4096, 8192",
lwe_dimension_in == 256 || lwe_dimension_in == 512 ||
lwe_dimension_in == 1024 || lwe_dimension_in == 2048 ||
lwe_dimension_in == 4096 || lwe_dimension_in == 8192));
assert(("Error (GPU extract bits): lwe_dimension_in should be equal to "
"polynomial_size",
lwe_dimension_in == polynomial_size));
@@ -130,6 +142,15 @@ void cuda_extract_bits_32(void *v_stream, uint32_t gpu_index,
number_of_samples <= number_of_sm / 4. / 2. / level_count_bsk));
switch (lwe_dimension_in) {
case 256:
host_extract_bits<uint32_t, Degree<256>>(
v_stream, gpu_index, (uint32_t *)list_lwe_array_out,
(uint32_t *)lwe_array_in, bit_extract_buffer, (uint32_t *)ksk,
(double2 *)fourier_bsk, number_of_bits, delta_log, lwe_dimension_in,
lwe_dimension_out, glwe_dimension, polynomial_size, base_log_bsk,
level_count_bsk, base_log_ksk, level_count_ksk, number_of_samples,
max_shared_memory);
break;
case 512:
host_extract_bits<uint32_t, Degree<512>>(
v_stream, gpu_index, (uint32_t *)list_lwe_array_out,
@@ -210,7 +231,7 @@ void cuda_extract_bits_32(void *v_stream, uint32_t gpu_index,
* - 'ksk' keyswitch key
* - 'fourier_bsk' complex compressed bsk in fourier domain
* - 'lwe_dimension_in' input LWE ciphertext dimension, supported input
- * dimensions are: {512, 1024,2048, 4096, 8192}
+ * dimensions are: {256, 512, 1024,2048, 4096, 8192}
* - 'lwe_dimension_out' output LWE ciphertext dimension
* - 'glwe_dimension' GLWE dimension, only glwe_dimension = 1 is supported
* for now
@@ -238,10 +259,10 @@ void cuda_extract_bits_64(void *v_stream, uint32_t gpu_index,
assert(("Error (GPU extract bits): base log should be <= 64",
base_log_bsk <= 64));
assert(("Error (GPU extract bits): lwe_dimension_in should be one of "
"512, 1024, 2048, 4096, 8192",
lwe_dimension_in == 512 || lwe_dimension_in == 1024 ||
lwe_dimension_in == 2048 || lwe_dimension_in == 4096 ||
lwe_dimension_in == 8192));
"256, 512, 1024, 2048, 4096, 8192",
lwe_dimension_in == 256 || lwe_dimension_in == 512 ||
lwe_dimension_in == 1024 || lwe_dimension_in == 2048 ||
lwe_dimension_in == 4096 || lwe_dimension_in == 8192));
assert(("Error (GPU extract bits): lwe_dimension_in should be equal to "
"polynomial_size",
lwe_dimension_in == polynomial_size));
@@ -258,6 +279,15 @@ void cuda_extract_bits_64(void *v_stream, uint32_t gpu_index,
number_of_samples <= number_of_sm / 4. / 2. / level_count_bsk));
switch (lwe_dimension_in) {
case 256:
host_extract_bits<uint64_t, Degree<256>>(
v_stream, gpu_index, (uint64_t *)list_lwe_array_out,
(uint64_t *)lwe_array_in, bit_extract_buffer, (uint64_t *)ksk,
(double2 *)fourier_bsk, number_of_bits, delta_log, lwe_dimension_in,
lwe_dimension_out, glwe_dimension, polynomial_size, base_log_bsk,
level_count_bsk, base_log_ksk, level_count_ksk, number_of_samples,
max_shared_memory);
break;
case 512:
host_extract_bits<uint64_t, Degree<512>>(
v_stream, gpu_index, (uint64_t *)list_lwe_array_out,

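The bit-extraction entry points above now require lwe_dimension_in == polynomial_size with polynomial_size in {256, 512, 1024, 2048, 4096, 8192}, i.e. the powers of two from 2^8 to 2^13. The helper below only illustrates that set; the name 'is_supported_polynomial_size' is invented, and the library keeps the explicit asserts shown above.

#include <cstdint>

// Membership test for {256, 512, 1024, 2048, 4096, 8192}: powers of two
// between 2^8 and 2^13 (illustrative, not part of the library).
static inline bool is_supported_polynomial_size(uint32_t n) {
  return n >= 256 && n <= 8192 && (n & (n - 1)) == 0;
}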

@@ -13,6 +13,12 @@ void scratch_cuda_circuit_bootstrap_32(
uint32_t max_shared_memory, bool allocate_gpu_memory) {
switch (polynomial_size) {
case 256:
scratch_circuit_bootstrap<uint32_t, int32_t, Degree<256>>(
v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_count_cbs, number_of_inputs, max_shared_memory,
allocate_gpu_memory);
break;
case 512:
scratch_circuit_bootstrap<uint32_t, int32_t, Degree<512>>(
v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
@@ -60,6 +66,12 @@ void scratch_cuda_circuit_bootstrap_64(
uint32_t max_shared_memory, bool allocate_gpu_memory) {
switch (polynomial_size) {
case 256:
scratch_circuit_bootstrap<uint64_t, int64_t, Degree<256>>(
v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_count_cbs, number_of_inputs, max_shared_memory,
allocate_gpu_memory);
break;
case 512:
scratch_circuit_bootstrap<uint64_t, int64_t, Degree<512>>(
v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
@@ -108,10 +120,10 @@ void cuda_circuit_bootstrap_32(
uint32_t level_cbs, uint32_t base_log_cbs, uint32_t number_of_inputs,
uint32_t max_shared_memory) {
assert(("Error (GPU circuit bootstrap): polynomial_size should be one of "
"512, 1024, 2048, 4096, 8192",
polynomial_size == 512 || polynomial_size == 1024 ||
polynomial_size == 2048 || polynomial_size == 4096 ||
polynomial_size == 8192));
"256, 512, 1024, 2048, 4096, 8192",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
// The number of samples should be lower than the number of streaming
// multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
// to the occupancy of 50%). The only supported value for k is 1, so
@@ -124,6 +136,15 @@ void cuda_circuit_bootstrap_32(
"level_count_bsk",
number_of_inputs <= number_of_sm / 4. / 2. / level_bsk));
switch (polynomial_size) {
case 256:
host_circuit_bootstrap<uint32_t, Degree<256>>(
v_stream, gpu_index, (uint32_t *)ggsw_out, (uint32_t *)lwe_array_in,
(double2 *)fourier_bsk, (uint32_t *)fp_ksk_array,
(uint32_t *)lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk,
base_log_pksk, level_cbs, base_log_cbs, number_of_inputs,
max_shared_memory);
break;
case 512:
host_circuit_bootstrap<uint32_t, Degree<512>>(
v_stream, gpu_index, (uint32_t *)ggsw_out, (uint32_t *)lwe_array_in,
@@ -208,10 +229,10 @@ void cuda_circuit_bootstrap_64(
uint32_t level_cbs, uint32_t base_log_cbs, uint32_t number_of_inputs,
uint32_t max_shared_memory) {
assert(("Error (GPU circuit bootstrap): polynomial_size should be one of "
"512, 1024, 2048, 4096, 8192",
polynomial_size == 512 || polynomial_size == 1024 ||
polynomial_size == 2048 || polynomial_size == 4096 ||
polynomial_size == 8192));
"256, 512, 1024, 2048, 4096, 8192",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
// The number of samples should be lower than the number of streaming
// multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
// to the occupancy of 50%). The only supported value for k is 1, so
@@ -225,6 +246,15 @@ void cuda_circuit_bootstrap_64(
number_of_inputs <= number_of_sm / 4. / 2. / level_bsk));
// The number of samples should be lower than the number of streaming
switch (polynomial_size) {
case 256:
host_circuit_bootstrap<uint64_t, Degree<256>>(
v_stream, gpu_index, (uint64_t *)ggsw_out, (uint64_t *)lwe_array_in,
(double2 *)fourier_bsk, (uint64_t *)fp_ksk_array,
(uint64_t *)lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk,
base_log_pksk, level_cbs, base_log_cbs, number_of_inputs,
max_shared_memory);
break;
case 512:
host_circuit_bootstrap<uint64_t, Degree<512>>(
v_stream, gpu_index, (uint64_t *)ggsw_out, (uint64_t *)lwe_array_in,

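The circuit-bootstrap asserts above encode the batching limit described in the comments: with k = 1 (so k + 1 = 2) and a factor 4 tied to the 50% occupancy target, number_of_inputs must not exceed number_of_sm / 4 / 2 / level_bsk. A host-side sketch of that bound follows; 'max_batch_size' is an invented helper, not a library function.

#include <cstdint>
#include <cuda_runtime.h>

// Upper bound on the batch size implied by the asserts above, assuming
// k = 1 (hence the factor 2) and the occupancy-related factor 4.
static inline uint32_t max_batch_size(int gpu_index, uint32_t level_bsk) {
  int number_of_sm = 0;
  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount,
                         gpu_index);
  return (uint32_t)(number_of_sm / 4. / 2. / (double)level_bsk);
}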

@@ -16,6 +16,11 @@ void scratch_cuda_cmux_tree_32(void *v_stream, uint32_t gpu_index,
bool allocate_gpu_memory) {
switch (polynomial_size) {
case 256:
scratch_cmux_tree<uint32_t, int32_t, Degree<256>>(
v_stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size,
level_count, r, tau, max_shared_memory, allocate_gpu_memory);
break;
case 512:
scratch_cmux_tree<uint32_t, int32_t, Degree<512>>(
v_stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size,
@@ -59,6 +64,11 @@ void scratch_cuda_cmux_tree_64(void *v_stream, uint32_t gpu_index,
uint32_t max_shared_memory,
bool allocate_gpu_memory) {
switch (polynomial_size) {
case 256:
scratch_cmux_tree<uint64_t, int64_t, Degree<256>>(
v_stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size,
level_count, r, tau, max_shared_memory, allocate_gpu_memory);
break;
case 512:
scratch_cmux_tree<uint64_t, int64_t, Degree<512>>(
v_stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size,
@@ -101,17 +111,24 @@ void cuda_cmux_tree_32(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
uint32_t max_shared_memory) {
assert(("Error (GPU Cmux tree): base log should be <= 32", base_log <= 32));
assert(("Error (GPU Cmux tree): polynomial size should be one of 512, 1024, "
"2048, 4096, 8192",
polynomial_size == 512 || polynomial_size == 1024 ||
polynomial_size == 2048 || polynomial_size == 4096 ||
polynomial_size == 8192));
assert((
"Error (GPU Cmux tree): polynomial size should be one of 256, 512, 1024, "
"2048, 4096, 8192",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
// For larger k we will need to adjust the mask size
assert(("Error (GPU Cmux tree): r, the number of layers in the tree, should "
"be >= 1 ",
r >= 1));
switch (polynomial_size) {
case 256:
host_cmux_tree<uint32_t, int32_t, Degree<256>>(
v_stream, gpu_index, (uint32_t *)glwe_array_out, (uint32_t *)ggsw_in,
(uint32_t *)lut_vector, cmux_tree_buffer, glwe_dimension,
polynomial_size, base_log, level_count, r, tau, max_shared_memory);
break;
case 512:
host_cmux_tree<uint32_t, int32_t, Degree<512>>(
v_stream, gpu_index, (uint32_t *)glwe_array_out, (uint32_t *)ggsw_in,
@@ -182,17 +199,24 @@ void cuda_cmux_tree_64(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
uint32_t max_shared_memory) {
assert(("Error (GPU Cmux tree): base log should be <= 64", base_log <= 64));
assert(("Error (GPU Cmux tree): polynomial size should be one of 512, 1024, "
"2048, 4096, 8192",
polynomial_size == 512 || polynomial_size == 1024 ||
polynomial_size == 2048 || polynomial_size == 4096 ||
polynomial_size == 8192));
assert((
"Error (GPU Cmux tree): polynomial size should be one of 256, 512, 1024, "
"2048, 4096, 8192",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
// For larger k we will need to adjust the mask size
assert(("Error (GPU Cmux tree): r, the number of layers in the tree, should "
"be >= 1 ",
r >= 1));
switch (polynomial_size) {
case 256:
host_cmux_tree<uint64_t, int64_t, Degree<256>>(
v_stream, gpu_index, (uint64_t *)glwe_array_out, (uint64_t *)ggsw_in,
(uint64_t *)lut_vector, cmux_tree_buffer, glwe_dimension,
polynomial_size, base_log, level_count, r, tau, max_shared_memory);
break;
case 512:
host_cmux_tree<uint64_t, int64_t, Degree<512>>(
v_stream, gpu_index, (uint64_t *)glwe_array_out, (uint64_t *)ggsw_in,
@@ -251,6 +275,11 @@ void scratch_cuda_blind_rotation_sample_extraction_32(
bool allocate_gpu_memory) {
switch (polynomial_size) {
case 256:
scratch_blind_rotation_sample_extraction<uint32_t, int32_t, Degree<256>>(
v_stream, gpu_index, br_se_buffer, glwe_dimension, polynomial_size,
level_count, mbr_size, tau, max_shared_memory, allocate_gpu_memory);
break;
case 512:
scratch_blind_rotation_sample_extraction<uint32_t, int32_t, Degree<512>>(
v_stream, gpu_index, br_se_buffer, glwe_dimension, polynomial_size,
@@ -293,6 +322,11 @@ void scratch_cuda_blind_rotation_sample_extraction_64(
bool allocate_gpu_memory) {
switch (polynomial_size) {
case 256:
scratch_blind_rotation_sample_extraction<uint64_t, int64_t, Degree<256>>(
v_stream, gpu_index, br_se_buffer, glwe_dimension, polynomial_size,
level_count, mbr_size, tau, max_shared_memory, allocate_gpu_memory);
break;
case 512:
scratch_blind_rotation_sample_extraction<uint64_t, int64_t, Degree<512>>(
v_stream, gpu_index, br_se_buffer, glwe_dimension, polynomial_size,
@@ -353,6 +387,12 @@ void cuda_blind_rotate_and_sample_extraction_64(
uint32_t l_gadget, uint32_t max_shared_memory) {
switch (polynomial_size) {
case 256:
host_blind_rotate_and_sample_extraction<uint64_t, int64_t, Degree<256>>(
v_stream, gpu_index, (uint64_t *)lwe_out, (uint64_t *)ggsw_in,
(uint64_t *)lut_vector, br_se_buffer, mbr_size, tau, glwe_dimension,
polynomial_size, base_log, l_gadget, max_shared_memory);
break;
case 512:
host_blind_rotate_and_sample_extraction<uint64_t, int64_t, Degree<512>>(
v_stream, gpu_index, (uint64_t *)lwe_out, (uint64_t *)ggsw_in,

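In the cmux-tree entry points above, 'r' is the number of layers and must be at least 1. Assuming the usual vertical-packing layout, where r GGSW selectors pick one of 2^r LUT polynomials per output (an assumption about the layout, not something stated in this diff), the LUT footprint scales as sketched below; with N = 256 it is half of what N = 512 needs for the same r and tau.

#include <cstddef>
#include <cstdint>

// Back-of-the-envelope LUT sizing under the assumed vertical-packing layout:
// 2^r polynomials of 'polynomial_size' coefficients for each of 'tau' outputs.
static inline size_t lut_vector_coefficients(uint32_t r, uint32_t tau,
                                             uint32_t polynomial_size) {
  return (size_t{1} << r) * tau * polynomial_size;
}
// Example: r = 10, tau = 1, polynomial_size = 256 -> 262,144 coefficients,
// i.e. 2 MiB of 64-bit values versus 4 MiB with polynomial_size = 512.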

@@ -14,6 +14,12 @@ void scratch_cuda_circuit_bootstrap_vertical_packing_32(
bool allocate_gpu_memory) {
switch (polynomial_size) {
case 256:
scratch_circuit_bootstrap_vertical_packing<uint32_t, int32_t, Degree<256>>(
v_stream, gpu_index, cbs_vp_buffer, cbs_delta_log, glwe_dimension,
lwe_dimension, polynomial_size, level_count_cbs, number_of_inputs, tau,
max_shared_memory, allocate_gpu_memory);
break;
case 512:
scratch_circuit_bootstrap_vertical_packing<uint32_t, int32_t, Degree<512>>(
v_stream, gpu_index, cbs_vp_buffer, cbs_delta_log, glwe_dimension,
@@ -63,6 +69,12 @@ void scratch_cuda_circuit_bootstrap_vertical_packing_64(
bool allocate_gpu_memory) {
switch (polynomial_size) {
case 256:
scratch_circuit_bootstrap_vertical_packing<uint64_t, int64_t, Degree<256>>(
v_stream, gpu_index, cbs_vp_buffer, cbs_delta_log, glwe_dimension,
lwe_dimension, polynomial_size, level_count_cbs, number_of_inputs, tau,
max_shared_memory, allocate_gpu_memory);
break;
case 512:
scratch_circuit_bootstrap_vertical_packing<uint64_t, int64_t, Degree<512>>(
v_stream, gpu_index, cbs_vp_buffer, cbs_delta_log, glwe_dimension,
@@ -113,6 +125,14 @@ void scratch_cuda_wop_pbs_32(
uint32_t number_of_bits_to_extract, uint32_t number_of_inputs,
uint32_t max_shared_memory, bool allocate_gpu_memory) {
switch (polynomial_size) {
case 256:
scratch_wop_pbs<uint32_t, int32_t, Degree<256>>(
v_stream, gpu_index, wop_pbs_buffer, delta_log, cbs_delta_log,
glwe_dimension, lwe_dimension, polynomial_size, level_count_cbs,
level_count_bsk, number_of_bits_of_message_including_padding,
number_of_bits_to_extract, number_of_inputs, max_shared_memory,
allocate_gpu_memory);
break;
case 512:
scratch_wop_pbs<uint32_t, int32_t, Degree<512>>(
v_stream, gpu_index, wop_pbs_buffer, delta_log, cbs_delta_log,
@@ -173,6 +193,14 @@ void scratch_cuda_wop_pbs_64(
uint32_t number_of_bits_to_extract, uint32_t number_of_inputs,
uint32_t max_shared_memory, bool allocate_gpu_memory) {
switch (polynomial_size) {
case 256:
scratch_wop_pbs<uint64_t, int64_t, Degree<256>>(
v_stream, gpu_index, wop_pbs_buffer, delta_log, cbs_delta_log,
glwe_dimension, lwe_dimension, polynomial_size, level_count_cbs,
level_count_bsk, number_of_bits_of_message_including_padding,
number_of_bits_to_extract, number_of_inputs, max_shared_memory,
allocate_gpu_memory);
break;
case 512:
scratch_wop_pbs<uint64_t, int64_t, Degree<512>>(
v_stream, gpu_index, wop_pbs_buffer, delta_log, cbs_delta_log,
@@ -232,7 +260,7 @@ void scratch_cuda_wop_pbs_64(
* - 'lut_vector' list of test vectors
* - 'cbs_vp_buffer' a pre-allocated array to store intermediate results
* - 'polynomial_size' size of the test polynomial, supported sizes:
- * {512, 1024, 2048, 4096, 8192}
+ * {256, 512, 1024, 2048, 4096, 8192}
* - 'glwe_dimension' supported dimensions: {1}
* - 'lwe_dimension' dimension of input LWE ciphertexts
* - 'level_count_bsk' decomposition level for bootstrapping
@@ -255,10 +283,10 @@ void cuda_circuit_bootstrap_vertical_packing_64(
uint32_t base_log_cbs, uint32_t number_of_inputs, uint32_t lut_number,
uint32_t max_shared_memory) {
assert(("Error (GPU circuit bootstrap): polynomial_size should be one of "
"512, 1024, 2048, 4096, 8192",
polynomial_size == 512 || polynomial_size == 1024 ||
polynomial_size == 2048 || polynomial_size == 4096 ||
polynomial_size == 8192));
"256, 512, 1024, 2048, 4096, 8192",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
// The number of inputs should be lower than the number of streaming
// multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
// to the occupancy of 50%). The only supported value for k is 1, so
@@ -271,6 +299,16 @@ void cuda_circuit_bootstrap_vertical_packing_64(
"level_count_bsk",
number_of_inputs <= number_of_sm / 4. / 2. / level_count_bsk));
switch (polynomial_size) {
case 256:
host_circuit_bootstrap_vertical_packing<uint64_t, int64_t, Degree<256>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lwe_array_in, (uint64_t *)lut_vector,
(double2 *)fourier_bsk, (uint64_t *)cbs_fpksk, cbs_vp_buffer,
cbs_delta_log, glwe_dimension, lwe_dimension, polynomial_size,
base_log_bsk, level_count_bsk, base_log_pksk, level_count_pksk,
base_log_cbs, level_count_cbs, number_of_inputs, lut_number,
max_shared_memory);
break;
case 512:
host_circuit_bootstrap_vertical_packing<uint64_t, int64_t, Degree<512>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
@@ -343,7 +381,7 @@ void cuda_circuit_bootstrap_vertical_packing_64(
* - 'glwe_dimension' supported dimensions: {1}
* - 'lwe_dimension' dimension of input lwe ciphertexts
* - 'polynomial_size' size of the test polynomial, supported sizes:
- * {512, 1024, 2048, 4096, 8192}
+ * {256, 512, 1024, 2048, 4096, 8192}
* - 'base_log_bsk' base log parameter for bootstrapping
* - 'level_count_bsk' decomposition level for bootstrapping
* - 'base_log_ksk' base log parameter for keyswitch
@@ -374,10 +412,10 @@ void cuda_wop_pbs_64(void *v_stream, uint32_t gpu_index, void *lwe_array_out,
uint32_t number_of_bits_to_extract, uint32_t delta_log,
uint32_t number_of_inputs, uint32_t max_shared_memory) {
assert(("Error (GPU WOP PBS): polynomial_size should be one of "
"512, 1024, 2048, 4096, 8192",
polynomial_size == 512 || polynomial_size == 1024 ||
polynomial_size == 2048 || polynomial_size == 4096 ||
polynomial_size == 8192));
"256, 512, 1024, 2048, 4096, 8192",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
// The number of inputs should be lower than the number of streaming
// multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
// to the occupancy of 50%). The only supported value for k is 1, so
@@ -390,6 +428,18 @@ void cuda_wop_pbs_64(void *v_stream, uint32_t gpu_index, void *lwe_array_out,
"level_count_bsk",
number_of_inputs <= number_of_sm / 4. / 2. / level_count_bsk));
switch (polynomial_size) {
case 256:
host_wop_pbs<uint64_t, int64_t, Degree<256>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lwe_array_in, (uint64_t *)lut_vector,
(double2 *)fourier_bsk, (uint64_t *)ksk, (uint64_t *)cbs_fpksk,
wop_pbs_buffer, cbs_delta_log, glwe_dimension, lwe_dimension,
polynomial_size, base_log_bsk, level_count_bsk, base_log_ksk,
level_count_ksk, base_log_pksk, level_count_pksk, base_log_cbs,
level_count_cbs, number_of_bits_of_message_including_padding,
number_of_bits_to_extract, delta_log, number_of_inputs,
max_shared_memory);
break;
case 512:
host_wop_pbs<uint64_t, int64_t, Degree<512>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
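With the extended assert, the WoP-PBS path accepts N = 256 as well. As a reminder of the usual convention for 'delta_log' on a 64-bit torus (a convention, hedged here because this diff does not state it): the message, padding bit included, occupies the most significant bits, so delta_log = 64 - number_of_bits_of_message_including_padding.

#include <cstdint>

// Conventional MSB placement of the message in a 64-bit torus value
// (assumption, not asserted by this commit).
static inline uint32_t delta_log_for(
    uint32_t number_of_bits_of_message_including_padding) {
  return 64u - number_of_bits_of_message_including_padding;
}
// Example: 5 message bits + 1 padding bit -> delta_log = 58.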