Mirror of https://github.com/zama-ai/concrete.git (synced 2026-02-08 11:35:02 -05:00)
refactor(cuda): Implement support for N=256 in the cmux tree, bit extraction, and CBS.
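All of the changes below follow one pattern: each CUDA wrapper dispatches at runtime on polynomial_size to a template instantiated with the matching compile-time Degree<N>, and this commit adds the N = 256 branch to each switch. A minimal, self-contained sketch of that dispatch shape, using toy stand-ins rather than the concrete-cuda API:

#include <cstdint>
#include <cstdio>

// Toy stand-ins, NOT the concrete-cuda API: Degree<N> mimics the library's
// compile-time polynomial-size tag, and scratch_example mimics one of the
// scratch_* wrappers that the diff below extends.
template <int N> struct Degree {
  static constexpr int value = N;
};

template <typename Torus, typename STorus, typename params>
void scratch_example(uint32_t polynomial_size) {
  // The real wrappers allocate device buffers sized for params; here we only
  // show which instantiation the runtime value selects.
  std::printf("instantiated for N = %d (runtime polynomial_size = %u)\n",
              params::value, polynomial_size);
}

// The dispatch shape shared by every wrapper in this commit: a runtime switch
// on polynomial_size picks the matching compile-time Degree<N>. The N = 256
// branch is what the commit adds.
void dispatch_example(uint32_t polynomial_size) {
  switch (polynomial_size) {
  case 256:
    scratch_example<uint32_t, int32_t, Degree<256>>(polynomial_size);
    break;
  case 512:
    scratch_example<uint32_t, int32_t, Degree<512>>(polynomial_size);
    break;
  // 1024, 2048, 4096 and 8192 follow the same shape in the real wrappers.
  default:
    break;
  }
}

int main() {
  dispatch_example(256);
  return 0;
}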
@@ -12,6 +12,12 @@ void scratch_cuda_extract_bits_32(
    bool allocate_gpu_memory) {

  switch (polynomial_size) {
  case 256:
    scratch_extract_bits<uint32_t, int32_t, Degree<256>>(
        v_stream, gpu_index, bit_extract_buffer, glwe_dimension, lwe_dimension,
        polynomial_size, level_count, number_of_inputs, max_shared_memory,
        allocate_gpu_memory);
    break;
  case 512:
    scratch_extract_bits<uint32_t, int32_t, Degree<512>>(
        v_stream, gpu_index, bit_extract_buffer, glwe_dimension, lwe_dimension,
@@ -59,6 +65,12 @@ void scratch_cuda_extract_bits_64(
    bool allocate_gpu_memory) {

  switch (polynomial_size) {
  case 256:
    scratch_extract_bits<uint64_t, int64_t, Degree<256>>(
        v_stream, gpu_index, bit_extract_buffer, glwe_dimension, lwe_dimension,
        polynomial_size, level_count, number_of_inputs, max_shared_memory,
        allocate_gpu_memory);
    break;
  case 512:
    scratch_extract_bits<uint64_t, int64_t, Degree<512>>(
        v_stream, gpu_index, bit_extract_buffer, glwe_dimension, lwe_dimension,
@@ -110,10 +122,10 @@ void cuda_extract_bits_32(void *v_stream, uint32_t gpu_index,
  assert(("Error (GPU extract bits): base log should be <= 32",
          base_log_bsk <= 32));
  assert(("Error (GPU extract bits): lwe_dimension_in should be one of "
          "512, 1024, 2048, 4096, 8192",
          lwe_dimension_in == 512 || lwe_dimension_in == 1024 ||
          lwe_dimension_in == 2048 || lwe_dimension_in == 4096 ||
          lwe_dimension_in == 8192));
          "256, 512, 1024, 2048, 4096, 8192",
          lwe_dimension_in == 256 || lwe_dimension_in == 512 ||
          lwe_dimension_in == 1024 || lwe_dimension_in == 2048 ||
          lwe_dimension_in == 4096 || lwe_dimension_in == 8192));
  assert(("Error (GPU extract bits): lwe_dimension_in should be equal to "
          "polynomial_size",
          lwe_dimension_in == polynomial_size));
@@ -130,6 +142,15 @@ void cuda_extract_bits_32(void *v_stream, uint32_t gpu_index,
          number_of_samples <= number_of_sm / 4. / 2. / level_count_bsk));

  switch (lwe_dimension_in) {
  case 256:
    host_extract_bits<uint32_t, Degree<256>>(
        v_stream, gpu_index, (uint32_t *)list_lwe_array_out,
        (uint32_t *)lwe_array_in, bit_extract_buffer, (uint32_t *)ksk,
        (double2 *)fourier_bsk, number_of_bits, delta_log, lwe_dimension_in,
        lwe_dimension_out, glwe_dimension, polynomial_size, base_log_bsk,
        level_count_bsk, base_log_ksk, level_count_ksk, number_of_samples,
        max_shared_memory);
    break;
  case 512:
    host_extract_bits<uint32_t, Degree<512>>(
        v_stream, gpu_index, (uint32_t *)list_lwe_array_out,
@@ -210,7 +231,7 @@ void cuda_extract_bits_32(void *v_stream, uint32_t gpu_index,
 * - 'ksk' keyswitch key
 * - 'fourier_bsk' complex compressed bsk in fourier domain
 * - 'lwe_dimension_in' input LWE ciphertext dimension, supported input
 * dimensions are: {512, 1024,2048, 4096, 8192}
 * dimensions are: {256, 512, 1024,2048, 4096, 8192}
 * - 'lwe_dimension_out' output LWE ciphertext dimension
 * - 'glwe_dimension' GLWE dimension, only glwe_dimension = 1 is supported
 * for now
@@ -238,10 +259,10 @@ void cuda_extract_bits_64(void *v_stream, uint32_t gpu_index,
  assert(("Error (GPU extract bits): base log should be <= 64",
          base_log_bsk <= 64));
  assert(("Error (GPU extract bits): lwe_dimension_in should be one of "
          "512, 1024, 2048, 4096, 8192",
          lwe_dimension_in == 512 || lwe_dimension_in == 1024 ||
          lwe_dimension_in == 2048 || lwe_dimension_in == 4096 ||
          lwe_dimension_in == 8192));
          "256, 512, 1024, 2048, 4096, 8192",
          lwe_dimension_in == 256 || lwe_dimension_in == 512 ||
          lwe_dimension_in == 1024 || lwe_dimension_in == 2048 ||
          lwe_dimension_in == 4096 || lwe_dimension_in == 8192));
  assert(("Error (GPU extract bits): lwe_dimension_in should be equal to "
          "polynomial_size",
          lwe_dimension_in == polynomial_size));
@@ -258,6 +279,15 @@ void cuda_extract_bits_64(void *v_stream, uint32_t gpu_index,
          number_of_samples <= number_of_sm / 4. / 2. / level_count_bsk));

  switch (lwe_dimension_in) {
  case 256:
    host_extract_bits<uint64_t, Degree<256>>(
        v_stream, gpu_index, (uint64_t *)list_lwe_array_out,
        (uint64_t *)lwe_array_in, bit_extract_buffer, (uint64_t *)ksk,
        (double2 *)fourier_bsk, number_of_bits, delta_log, lwe_dimension_in,
        lwe_dimension_out, glwe_dimension, polynomial_size, base_log_bsk,
        level_count_bsk, base_log_ksk, level_count_ksk, number_of_samples,
        max_shared_memory);
    break;
  case 512:
    host_extract_bits<uint64_t, Degree<512>>(
        v_stream, gpu_index, (uint64_t *)list_lwe_array_out,
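A side note on the parameter checks above: they use the assert(("message", condition)) comma-operator idiom, in which the string literal is discarded at evaluation time but still appears in the stringified expression the runtime prints when the assertion fails. A standalone illustration (not library code), using the newly extended size list:

#include <cassert>
#include <cstdint>

int main() {
  uint32_t polynomial_size = 256;
  // Same idiom as the asserts in this diff: the leading string is discarded by
  // the comma operator, but it shows up in the message printed on failure.
  assert(("polynomial_size should be one of 256, 512, 1024, 2048, 4096, 8192",
          polynomial_size == 256 || polynomial_size == 512 ||
          polynomial_size == 1024 || polynomial_size == 2048 ||
          polynomial_size == 4096 || polynomial_size == 8192));
  return 0;
}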
@@ -13,6 +13,12 @@ void scratch_cuda_circuit_bootstrap_32(
    uint32_t max_shared_memory, bool allocate_gpu_memory) {

  switch (polynomial_size) {
  case 256:
    scratch_circuit_bootstrap<uint32_t, int32_t, Degree<256>>(
        v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
        polynomial_size, level_count_cbs, number_of_inputs, max_shared_memory,
        allocate_gpu_memory);
    break;
  case 512:
    scratch_circuit_bootstrap<uint32_t, int32_t, Degree<512>>(
        v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
@@ -60,6 +66,12 @@ void scratch_cuda_circuit_bootstrap_64(
    uint32_t max_shared_memory, bool allocate_gpu_memory) {

  switch (polynomial_size) {
  case 256:
    scratch_circuit_bootstrap<uint64_t, int64_t, Degree<256>>(
        v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
        polynomial_size, level_count_cbs, number_of_inputs, max_shared_memory,
        allocate_gpu_memory);
    break;
  case 512:
    scratch_circuit_bootstrap<uint64_t, int64_t, Degree<512>>(
        v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
@@ -108,10 +120,10 @@ void cuda_circuit_bootstrap_32(
    uint32_t level_cbs, uint32_t base_log_cbs, uint32_t number_of_inputs,
    uint32_t max_shared_memory) {
  assert(("Error (GPU circuit bootstrap): polynomial_size should be one of "
          "512, 1024, 2048, 4096, 8192",
          polynomial_size == 512 || polynomial_size == 1024 ||
          polynomial_size == 2048 || polynomial_size == 4096 ||
          polynomial_size == 8192));
          "256, 512, 1024, 2048, 4096, 8192",
          polynomial_size == 256 || polynomial_size == 512 ||
          polynomial_size == 1024 || polynomial_size == 2048 ||
          polynomial_size == 4096 || polynomial_size == 8192));
  // The number of samples should be lower than the number of streaming
  // multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
  // to the occupancy of 50%). The only supported value for k is 1, so
@@ -124,6 +136,15 @@ void cuda_circuit_bootstrap_32(
          "level_count_bsk",
          number_of_inputs <= number_of_sm / 4. / 2. / level_bsk));
  switch (polynomial_size) {
  case 256:
    host_circuit_bootstrap<uint32_t, Degree<256>>(
        v_stream, gpu_index, (uint32_t *)ggsw_out, (uint32_t *)lwe_array_in,
        (double2 *)fourier_bsk, (uint32_t *)fp_ksk_array,
        (uint32_t *)lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
        glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk,
        base_log_pksk, level_cbs, base_log_cbs, number_of_inputs,
        max_shared_memory);
    break;
  case 512:
    host_circuit_bootstrap<uint32_t, Degree<512>>(
        v_stream, gpu_index, (uint32_t *)ggsw_out, (uint32_t *)lwe_array_in,
@@ -208,10 +229,10 @@ void cuda_circuit_bootstrap_64(
    uint32_t level_cbs, uint32_t base_log_cbs, uint32_t number_of_inputs,
    uint32_t max_shared_memory) {
  assert(("Error (GPU circuit bootstrap): polynomial_size should be one of "
          "512, 1024, 2048, 4096, 8192",
          polynomial_size == 512 || polynomial_size == 1024 ||
          polynomial_size == 2048 || polynomial_size == 4096 ||
          polynomial_size == 8192));
          "256, 512, 1024, 2048, 4096, 8192",
          polynomial_size == 256 || polynomial_size == 512 ||
          polynomial_size == 1024 || polynomial_size == 2048 ||
          polynomial_size == 4096 || polynomial_size == 8192));
  // The number of samples should be lower than the number of streaming
  // multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
  // to the occupancy of 50%). The only supported value for k is 1, so
@@ -225,6 +246,15 @@ void cuda_circuit_bootstrap_64(
          number_of_inputs <= number_of_sm / 4. / 2. / level_bsk));
  // The number of samples should be lower than the number of streaming
  switch (polynomial_size) {
  case 256:
    host_circuit_bootstrap<uint64_t, Degree<256>>(
        v_stream, gpu_index, (uint64_t *)ggsw_out, (uint64_t *)lwe_array_in,
        (double2 *)fourier_bsk, (uint64_t *)fp_ksk_array,
        (uint64_t *)lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
        glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk,
        base_log_pksk, level_cbs, base_log_cbs, number_of_inputs,
        max_shared_memory);
    break;
  case 512:
    host_circuit_bootstrap<uint64_t, Degree<512>>(
        v_stream, gpu_index, (uint64_t *)ggsw_out, (uint64_t *)lwe_array_in,
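The batch-size asserts in these wrappers encode the occupancy bound described in the comments above: with k fixed to 1 and a target occupancy of 50%, the number of inputs must stay below number_of_sm / (4 * (k + 1) * l), which the code writes as number_of_sm / 4. / 2. / level_bsk. A small numeric sketch, assuming a hypothetical device with 108 SMs:

#include <cstdio>

int main() {
  // Hypothetical device with 108 SMs and a bootstrap key with 2 decomposition
  // levels. The wrappers assert:
  //   number_of_inputs <= number_of_sm / 4. / 2. / level_bsk
  // i.e. number_of_sm / (4 * (k + 1) * l) with k = 1, the factor 4 coming
  // from the 50% occupancy target mentioned in the comments of this diff.
  int number_of_sm = 108;
  int level_bsk = 2;
  double bound = number_of_sm / 4. / 2. / level_bsk;
  std::printf("bound = %.2f -> at most %d inputs per call\n", bound, (int)bound);
  return 0;
}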
@@ -16,6 +16,11 @@ void scratch_cuda_cmux_tree_32(void *v_stream, uint32_t gpu_index,
    bool allocate_gpu_memory) {

  switch (polynomial_size) {
  case 256:
    scratch_cmux_tree<uint32_t, int32_t, Degree<256>>(
        v_stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size,
        level_count, r, tau, max_shared_memory, allocate_gpu_memory);
    break;
  case 512:
    scratch_cmux_tree<uint32_t, int32_t, Degree<512>>(
        v_stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size,
@@ -59,6 +64,11 @@ void scratch_cuda_cmux_tree_64(void *v_stream, uint32_t gpu_index,
    uint32_t max_shared_memory,
    bool allocate_gpu_memory) {
  switch (polynomial_size) {
  case 256:
    scratch_cmux_tree<uint64_t, int64_t, Degree<256>>(
        v_stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size,
        level_count, r, tau, max_shared_memory, allocate_gpu_memory);
    break;
  case 512:
    scratch_cmux_tree<uint64_t, int64_t, Degree<512>>(
        v_stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size,
@@ -101,17 +111,24 @@ void cuda_cmux_tree_32(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
    uint32_t max_shared_memory) {

  assert(("Error (GPU Cmux tree): base log should be <= 32", base_log <= 32));
  assert(("Error (GPU Cmux tree): polynomial size should be one of 512, 1024, "
          "2048, 4096, 8192",
          polynomial_size == 512 || polynomial_size == 1024 ||
          polynomial_size == 2048 || polynomial_size == 4096 ||
          polynomial_size == 8192));
  assert((
      "Error (GPU Cmux tree): polynomial size should be one of 256, 512, 1024, "
      "2048, 4096, 8192",
      polynomial_size == 256 || polynomial_size == 512 ||
      polynomial_size == 1024 || polynomial_size == 2048 ||
      polynomial_size == 4096 || polynomial_size == 8192));
  // For larger k we will need to adjust the mask size
  assert(("Error (GPU Cmux tree): r, the number of layers in the tree, should "
          "be >= 1 ",
          r >= 1));

  switch (polynomial_size) {
  case 256:
    host_cmux_tree<uint32_t, int32_t, Degree<256>>(
        v_stream, gpu_index, (uint32_t *)glwe_array_out, (uint32_t *)ggsw_in,
        (uint32_t *)lut_vector, cmux_tree_buffer, glwe_dimension,
        polynomial_size, base_log, level_count, r, tau, max_shared_memory);
    break;
  case 512:
    host_cmux_tree<uint32_t, int32_t, Degree<512>>(
        v_stream, gpu_index, (uint32_t *)glwe_array_out, (uint32_t *)ggsw_in,
@@ -182,17 +199,24 @@ void cuda_cmux_tree_64(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
    uint32_t max_shared_memory) {

  assert(("Error (GPU Cmux tree): base log should be <= 64", base_log <= 64));
  assert(("Error (GPU Cmux tree): polynomial size should be one of 512, 1024, "
          "2048, 4096, 8192",
          polynomial_size == 512 || polynomial_size == 1024 ||
          polynomial_size == 2048 || polynomial_size == 4096 ||
          polynomial_size == 8192));
  assert((
      "Error (GPU Cmux tree): polynomial size should be one of 256, 512, 1024, "
      "2048, 4096, 8192",
      polynomial_size == 256 || polynomial_size == 512 ||
      polynomial_size == 1024 || polynomial_size == 2048 ||
      polynomial_size == 4096 || polynomial_size == 8192));
  // For larger k we will need to adjust the mask size
  assert(("Error (GPU Cmux tree): r, the number of layers in the tree, should "
          "be >= 1 ",
          r >= 1));

  switch (polynomial_size) {
  case 256:
    host_cmux_tree<uint64_t, int64_t, Degree<256>>(
        v_stream, gpu_index, (uint64_t *)glwe_array_out, (uint64_t *)ggsw_in,
        (uint64_t *)lut_vector, cmux_tree_buffer, glwe_dimension,
        polynomial_size, base_log, level_count, r, tau, max_shared_memory);
    break;
  case 512:
    host_cmux_tree<uint64_t, int64_t, Degree<512>>(
        v_stream, gpu_index, (uint64_t *)glwe_array_out, (uint64_t *)ggsw_in,
@@ -251,6 +275,11 @@ void scratch_cuda_blind_rotation_sample_extraction_32(
    bool allocate_gpu_memory) {

  switch (polynomial_size) {
  case 256:
    scratch_blind_rotation_sample_extraction<uint32_t, int32_t, Degree<256>>(
        v_stream, gpu_index, br_se_buffer, glwe_dimension, polynomial_size,
        level_count, mbr_size, tau, max_shared_memory, allocate_gpu_memory);
    break;
  case 512:
    scratch_blind_rotation_sample_extraction<uint32_t, int32_t, Degree<512>>(
        v_stream, gpu_index, br_se_buffer, glwe_dimension, polynomial_size,
@@ -293,6 +322,11 @@ void scratch_cuda_blind_rotation_sample_extraction_64(
    bool allocate_gpu_memory) {

  switch (polynomial_size) {
  case 256:
    scratch_blind_rotation_sample_extraction<uint64_t, int64_t, Degree<256>>(
        v_stream, gpu_index, br_se_buffer, glwe_dimension, polynomial_size,
        level_count, mbr_size, tau, max_shared_memory, allocate_gpu_memory);
    break;
  case 512:
    scratch_blind_rotation_sample_extraction<uint64_t, int64_t, Degree<512>>(
        v_stream, gpu_index, br_se_buffer, glwe_dimension, polynomial_size,
@@ -353,6 +387,12 @@ void cuda_blind_rotate_and_sample_extraction_64(
    uint32_t l_gadget, uint32_t max_shared_memory) {

  switch (polynomial_size) {
  case 256:
    host_blind_rotate_and_sample_extraction<uint64_t, int64_t, Degree<256>>(
        v_stream, gpu_index, (uint64_t *)lwe_out, (uint64_t *)ggsw_in,
        (uint64_t *)lut_vector, br_se_buffer, mbr_size, tau, glwe_dimension,
        polynomial_size, base_log, l_gadget, max_shared_memory);
    break;
  case 512:
    host_blind_rotate_and_sample_extraction<uint64_t, int64_t, Degree<512>>(
        v_stream, gpu_index, (uint64_t *)lwe_out, (uint64_t *)ggsw_in,
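As a plaintext analogy of what the cmux tree in this file computes (purely illustrative, with no encrypted operations): r selector bits choose one of 2^r lookup-table entries through r rounds of pairwise selection, which is why the wrappers insist on r >= 1. In the GPU code the selectors are GGSW ciphertexts and the entries are GLWE-encrypted LUT polynomials; the sketch below keeps everything in the clear:

#include <cstdint>
#include <cstdio>
#include <vector>

// Plaintext analogy of a cmux tree: each layer halves the list of candidate
// entries, consuming one selector bit per layer; r layers select among 2^r
// entries. Not library code.
static uint64_t cmux_tree_clear(std::vector<uint64_t> entries,
                                const std::vector<int> &selector_bits) {
  for (int b : selector_bits) {                        // one layer per bit
    std::vector<uint64_t> next;
    for (size_t i = 0; i + 1 < entries.size(); i += 2)
      next.push_back(b ? entries[i + 1] : entries[i]); // cmux(b, lo, hi)
    entries = next;
  }
  return entries.front();
}

int main() {
  std::vector<uint64_t> entries = {10, 11, 12, 13}; // 2^r entries, r = 2
  // Selector bits given least-significant first: {1, 0} selects index 1.
  std::printf("%llu\n", (unsigned long long)cmux_tree_clear(entries, {1, 0}));
  return 0;
}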
@@ -14,6 +14,12 @@ void scratch_cuda_circuit_bootstrap_vertical_packing_32(
    bool allocate_gpu_memory) {

  switch (polynomial_size) {
  case 256:
    scratch_circuit_bootstrap_vertical_packing<uint32_t, int32_t, Degree<256>>(
        v_stream, gpu_index, cbs_vp_buffer, cbs_delta_log, glwe_dimension,
        lwe_dimension, polynomial_size, level_count_cbs, number_of_inputs, tau,
        max_shared_memory, allocate_gpu_memory);
    break;
  case 512:
    scratch_circuit_bootstrap_vertical_packing<uint32_t, int32_t, Degree<512>>(
        v_stream, gpu_index, cbs_vp_buffer, cbs_delta_log, glwe_dimension,
@@ -63,6 +69,12 @@ void scratch_cuda_circuit_bootstrap_vertical_packing_64(
    bool allocate_gpu_memory) {

  switch (polynomial_size) {
  case 256:
    scratch_circuit_bootstrap_vertical_packing<uint64_t, int64_t, Degree<256>>(
        v_stream, gpu_index, cbs_vp_buffer, cbs_delta_log, glwe_dimension,
        lwe_dimension, polynomial_size, level_count_cbs, number_of_inputs, tau,
        max_shared_memory, allocate_gpu_memory);
    break;
  case 512:
    scratch_circuit_bootstrap_vertical_packing<uint64_t, int64_t, Degree<512>>(
        v_stream, gpu_index, cbs_vp_buffer, cbs_delta_log, glwe_dimension,
@@ -113,6 +125,14 @@ void scratch_cuda_wop_pbs_32(
    uint32_t number_of_bits_to_extract, uint32_t number_of_inputs,
    uint32_t max_shared_memory, bool allocate_gpu_memory) {
  switch (polynomial_size) {
  case 256:
    scratch_wop_pbs<uint32_t, int32_t, Degree<256>>(
        v_stream, gpu_index, wop_pbs_buffer, delta_log, cbs_delta_log,
        glwe_dimension, lwe_dimension, polynomial_size, level_count_cbs,
        level_count_bsk, number_of_bits_of_message_including_padding,
        number_of_bits_to_extract, number_of_inputs, max_shared_memory,
        allocate_gpu_memory);
    break;
  case 512:
    scratch_wop_pbs<uint32_t, int32_t, Degree<512>>(
        v_stream, gpu_index, wop_pbs_buffer, delta_log, cbs_delta_log,
@@ -173,6 +193,14 @@ void scratch_cuda_wop_pbs_64(
    uint32_t number_of_bits_to_extract, uint32_t number_of_inputs,
    uint32_t max_shared_memory, bool allocate_gpu_memory) {
  switch (polynomial_size) {
  case 256:
    scratch_wop_pbs<uint64_t, int64_t, Degree<256>>(
        v_stream, gpu_index, wop_pbs_buffer, delta_log, cbs_delta_log,
        glwe_dimension, lwe_dimension, polynomial_size, level_count_cbs,
        level_count_bsk, number_of_bits_of_message_including_padding,
        number_of_bits_to_extract, number_of_inputs, max_shared_memory,
        allocate_gpu_memory);
    break;
  case 512:
    scratch_wop_pbs<uint64_t, int64_t, Degree<512>>(
        v_stream, gpu_index, wop_pbs_buffer, delta_log, cbs_delta_log,
@@ -232,7 +260,7 @@ void scratch_cuda_wop_pbs_64(
 * - 'lut_vector' list of test vectors
 * - 'cbs_vp_buffer' a pre-allocated array to store intermediate results
 * - 'polynomial_size' size of the test polynomial, supported sizes:
 * {512, 1024, 2048, 4096, 8192}
 * {256, 512, 1024, 2048, 4096, 8192}
 * - 'glwe_dimension' supported dimensions: {1}
 * - 'lwe_dimension' dimension of input LWE ciphertexts
 * - 'level_count_bsk' decomposition level for bootstrapping
@@ -255,10 +283,10 @@ void cuda_circuit_bootstrap_vertical_packing_64(
    uint32_t base_log_cbs, uint32_t number_of_inputs, uint32_t lut_number,
    uint32_t max_shared_memory) {
  assert(("Error (GPU circuit bootstrap): polynomial_size should be one of "
          "512, 1024, 2048, 4096, 8192",
          polynomial_size == 512 || polynomial_size == 1024 ||
          polynomial_size == 2048 || polynomial_size == 4096 ||
          polynomial_size == 8192));
          "256, 512, 1024, 2048, 4096, 8192",
          polynomial_size == 256 || polynomial_size == 512 ||
          polynomial_size == 1024 || polynomial_size == 2048 ||
          polynomial_size == 4096 || polynomial_size == 8192));
  // The number of inputs should be lower than the number of streaming
  // multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
  // to the occupancy of 50%). The only supported value for k is 1, so
@@ -271,6 +299,16 @@ void cuda_circuit_bootstrap_vertical_packing_64(
          "level_count_bsk",
          number_of_inputs <= number_of_sm / 4. / 2. / level_count_bsk));
  switch (polynomial_size) {
  case 256:
    host_circuit_bootstrap_vertical_packing<uint64_t, int64_t, Degree<256>>(
        v_stream, gpu_index, (uint64_t *)lwe_array_out,
        (uint64_t *)lwe_array_in, (uint64_t *)lut_vector,
        (double2 *)fourier_bsk, (uint64_t *)cbs_fpksk, cbs_vp_buffer,
        cbs_delta_log, glwe_dimension, lwe_dimension, polynomial_size,
        base_log_bsk, level_count_bsk, base_log_pksk, level_count_pksk,
        base_log_cbs, level_count_cbs, number_of_inputs, lut_number,
        max_shared_memory);
    break;
  case 512:
    host_circuit_bootstrap_vertical_packing<uint64_t, int64_t, Degree<512>>(
        v_stream, gpu_index, (uint64_t *)lwe_array_out,
@@ -343,7 +381,7 @@ void cuda_circuit_bootstrap_vertical_packing_64(
 * - 'glwe_dimension' supported dimensions: {1}
 * - 'lwe_dimension' dimension of input lwe ciphertexts
 * - 'polynomial_size' size of the test polynomial, supported sizes:
 * {512, 1024, 2048, 4096, 8192}
 * {256, 512, 1024, 2048, 4096, 8192}
 * - 'base_log_bsk' base log parameter for bootstrapping
 * - 'level_count_bsk' decomposition level for bootstrapping
 * - 'base_log_ksk' base log parameter for keyswitch
@@ -374,10 +412,10 @@ void cuda_wop_pbs_64(void *v_stream, uint32_t gpu_index, void *lwe_array_out,
    uint32_t number_of_bits_to_extract, uint32_t delta_log,
    uint32_t number_of_inputs, uint32_t max_shared_memory) {
  assert(("Error (GPU WOP PBS): polynomial_size should be one of "
          "512, 1024, 2048, 4096, 8192",
          polynomial_size == 512 || polynomial_size == 1024 ||
          polynomial_size == 2048 || polynomial_size == 4096 ||
          polynomial_size == 8192));
          "256, 512, 1024, 2048, 4096, 8192",
          polynomial_size == 256 || polynomial_size == 512 ||
          polynomial_size == 1024 || polynomial_size == 2048 ||
          polynomial_size == 4096 || polynomial_size == 8192));
  // The number of inputs should be lower than the number of streaming
  // multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
  // to the occupancy of 50%). The only supported value for k is 1, so
@@ -390,6 +428,18 @@ void cuda_wop_pbs_64(void *v_stream, uint32_t gpu_index, void *lwe_array_out,
          "level_count_bsk",
          number_of_inputs <= number_of_sm / 4. / 2. / level_count_bsk));
  switch (polynomial_size) {
  case 256:
    host_wop_pbs<uint64_t, int64_t, Degree<256>>(
        v_stream, gpu_index, (uint64_t *)lwe_array_out,
        (uint64_t *)lwe_array_in, (uint64_t *)lut_vector,
        (double2 *)fourier_bsk, (uint64_t *)ksk, (uint64_t *)cbs_fpksk,
        wop_pbs_buffer, cbs_delta_log, glwe_dimension, lwe_dimension,
        polynomial_size, base_log_bsk, level_count_bsk, base_log_ksk,
        level_count_ksk, base_log_pksk, level_count_pksk, base_log_cbs,
        level_count_cbs, number_of_bits_of_message_including_padding,
        number_of_bits_to_extract, delta_log, number_of_inputs,
        max_shared_memory);
    break;
  case 512:
    host_wop_pbs<uint64_t, int64_t, Degree<512>>(
        v_stream, gpu_index, (uint64_t *)lwe_array_out,
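Since every wrapper touched by this commit now accepts N = 256, a caller-side guard that mirrors the updated asserts could look like the following. This is a hypothetical helper, not part of the concrete-cuda API:

#include <cstdint>

// Hypothetical caller-side check mirroring the size lists asserted throughout
// this diff; not part of the concrete-cuda API.
inline bool is_supported_polynomial_size(uint32_t n) {
  switch (n) {
  case 256: // newly supported by this commit
  case 512:
  case 1024:
  case 2048:
  case 4096:
  case 8192:
    return true;
  default:
    return false;
  }
}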