fix(concrete_cuda): make sure r > 0 in the wop PBS
The optimizer sometimes provides inconsistent parameter sets for the wop PBS, depending on how many inputs and bits to extract are needed for a given precision. This could result in r < 0, and until now the corresponding error in concrete-cuda was very hard to understand. This commit adds a check that catches this case. Also, when introducing the support for k > 1 we forgot to update the checks on the low-latency PBS; this commit fixes them as well.
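The core of the fix is the new guard in checks_wop_pbs (last diff below): the total number of extracted bits must cover at least log2(polynomial_size), which rules out the parameter sets that previously produced r < 0. A minimal standalone sketch of that constraint, assuming only what the diff shows (the helper name is made up for illustration):

#include <cassert>
#include <cmath>

// Illustrative helper, not upstream code: mirrors the assertion added to
// checks_wop_pbs. If the product of inputs and extracted bits were smaller
// than log2(N), the derived cmux-tree depth r would go negative.
void check_wop_pbs_bit_count(int polynomial_size, int number_of_inputs,
                             int number_of_bits_to_extract) {
  assert(("Error (GPU WOP PBS): the number of inputs x the number of "
          "extracted bits should be larger than log2 of the polynomial size",
          number_of_inputs * number_of_bits_to_extract >=
              std::log2(polynomial_size)));
}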
@@ -3,7 +3,7 @@
 /*
  * Runs standard checks to validate the inputs
  */
-void checks_fast_extract_bits(int nbits, int polynomial_size,
+void checks_fast_extract_bits(int glwe_dimension, int polynomial_size,
                               int level_count_bsk, int number_of_samples) {

   assert(("Error (GPU extract bits): polynomial_size should be one of "
@@ -13,26 +13,27 @@ void checks_fast_extract_bits(int nbits, int polynomial_size,
           polynomial_size == 4096 || polynomial_size == 8192));
   // The number of samples should be lower than four time the number of
   // streaming multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being
-  // related to the occupancy of 50%). The only supported value for k is 1, so
-  // k + 1 = 2 for now.
+  // related to the occupancy of 50%).
   int number_of_sm = 0;
   cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
   assert(("Error (GPU extract bits): the number of input LWEs must be lower or "
-          "equal to the "
-          "number of streaming multiprocessors on the device divided by 8 * "
+          "equal to the number of streaming multiprocessors on the device "
+          "divided by 4 * (k + 1) "
           "level_count_bsk",
-          number_of_samples <= number_of_sm / 4. / 2. / level_count_bsk));
+          number_of_samples <=
+              number_of_sm / 4. / (glwe_dimension + 1) / level_count_bsk));
 }

 /*
  * Runs standard checks to validate the inputs
  */
-void checks_extract_bits(int nbits, int polynomial_size, int base_log_bsk,
-                         int level_count_bsk, int number_of_samples) {
+void checks_extract_bits(int nbits, int glwe_dimension, int polynomial_size,
+                         int base_log_bsk, int level_count_bsk,
+                         int number_of_samples) {

   assert(("Error (GPU extract bits): base log should be <= nbits",
           base_log_bsk <= nbits));
-  checks_fast_extract_bits(nbits, polynomial_size, level_count_bsk,
+  checks_fast_extract_bits(glwe_dimension, polynomial_size, level_count_bsk,
                            number_of_samples);
 }

@@ -47,7 +48,8 @@ void scratch_cuda_extract_bits_32(
     uint32_t level_count, uint32_t number_of_inputs, uint32_t max_shared_memory,
     bool allocate_gpu_memory) {

-  checks_fast_extract_bits(32, polynomial_size, level_count, number_of_inputs);
+  checks_fast_extract_bits(glwe_dimension, polynomial_size, level_count,
+                           number_of_inputs);

   switch (polynomial_size) {
   case 256:
@@ -101,7 +103,8 @@ void scratch_cuda_extract_bits_64(
     uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
     uint32_t level_count, uint32_t number_of_inputs, uint32_t max_shared_memory,
     bool allocate_gpu_memory) {
-  checks_fast_extract_bits(64, polynomial_size, level_count, number_of_inputs);
+  checks_fast_extract_bits(glwe_dimension, polynomial_size, level_count,
+                           number_of_inputs);

   switch (polynomial_size) {
   case 256:
@@ -158,8 +161,8 @@ void cuda_extract_bits_32(void *v_stream, uint32_t gpu_index,
                          uint32_t level_count_bsk, uint32_t base_log_ksk,
                          uint32_t level_count_ksk, uint32_t number_of_samples,
                          uint32_t max_shared_memory) {
-  checks_extract_bits(32, polynomial_size, base_log_bsk, level_count_bsk,
-                      number_of_samples);
+  checks_extract_bits(32, glwe_dimension, polynomial_size, base_log_bsk,
+                      level_count_bsk, number_of_samples);

   switch (polynomial_size) {
   case 256:
@@ -276,8 +279,8 @@ void cuda_extract_bits_64(void *v_stream, uint32_t gpu_index,
                          uint32_t level_count_bsk, uint32_t base_log_ksk,
                          uint32_t level_count_ksk, uint32_t number_of_samples,
                          uint32_t max_shared_memory) {
-  checks_extract_bits(64, polynomial_size, base_log_bsk, level_count_bsk,
-                      number_of_samples);
+  checks_extract_bits(64, glwe_dimension, polynomial_size, base_log_bsk,
+                      level_count_bsk, number_of_samples);

   switch (polynomial_size) {
   case 256:

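To make the updated occupancy bound concrete, here is a small sketch (not upstream code, helper name made up) of the maximum number of samples checks_fast_extract_bits now accepts, with the previously hard-coded divisor 2 replaced by (glwe_dimension + 1):

#include <cuda_runtime.h>

// Illustrative only: the extract-bits check now requires
// number_of_samples <= number_of_sm / 4 / (glwe_dimension + 1) / level_count_bsk,
// where the factor 4 accounts for the 50% occupancy mentioned in the comments.
int max_extract_bits_samples(int glwe_dimension, int level_count_bsk) {
  int number_of_sm = 0;
  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
  return static_cast<int>(number_of_sm / 4. / (glwe_dimension + 1) /
                          level_count_bsk);
}

For example, on a hypothetical GPU with 108 streaming multiprocessors, k = 1 and level_count_bsk = 2 give 108 / 4 / 2 / 2 = 6.75, so at most 6 samples.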
@@ -3,7 +3,7 @@
 /*
  * Runs standard checks to validate the inputs
  */
-void checks_fast_bootstrap_amortized(int nbits, int polynomial_size) {
+void checks_fast_bootstrap_amortized(int polynomial_size) {
   assert(
       ("Error (GPU amortized PBS): polynomial size should be one of 256, 512, "
       "1024, 2048, 4096, 8192",
@@ -18,7 +18,7 @@ void checks_fast_bootstrap_amortized(int nbits, int polynomial_size) {
 void checks_bootstrap_amortized(int nbits, int base_log, int polynomial_size) {
   assert(("Error (GPU amortized PBS): base log should be <= nbits",
           base_log <= nbits));
-  checks_fast_bootstrap_amortized(nbits, polynomial_size);
+  checks_fast_bootstrap_amortized(polynomial_size);
 }

 /*
@@ -34,7 +34,7 @@ void scratch_cuda_bootstrap_amortized_32(void *v_stream, uint32_t gpu_index,
                                          uint32_t input_lwe_ciphertext_count,
                                          uint32_t max_shared_memory,
                                          bool allocate_gpu_memory) {
-  checks_fast_bootstrap_amortized(32, polynomial_size);
+  checks_fast_bootstrap_amortized(polynomial_size);

   switch (polynomial_size) {
   case 256:
@@ -85,7 +85,7 @@ void scratch_cuda_bootstrap_amortized_64(void *v_stream, uint32_t gpu_index,
                                          uint32_t input_lwe_ciphertext_count,
                                          uint32_t max_shared_memory,
                                          bool allocate_gpu_memory) {
-  checks_fast_bootstrap_amortized(64, polynomial_size);
+  checks_fast_bootstrap_amortized(polynomial_size);

   switch (polynomial_size) {
   case 256:

@@ -3,7 +3,7 @@
 /*
  * Runs standard checks to validate the inputs
  */
-void checks_fast_bootstrap_low_latency(int nbits, int level_count,
+void checks_fast_bootstrap_low_latency(int glwe_dimension, int level_count,
                                        int polynomial_size, int num_samples) {

   assert((
@@ -17,23 +17,25 @@ void checks_fast_bootstrap_low_latency(int nbits, int level_count,
   // value for k is 1, so k + 1 = 2 for now.
   int number_of_sm = 0;
   cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
-  assert(("Error (GPU low latency PBS): the number of input LWEs must be lower "
-          "or equal to the "
-          "number of streaming multiprocessors on the device divided by 8 * "
-          "level_count",
-          num_samples <= number_of_sm * 4. / 2. / level_count));
+  assert(
+      ("Error (GPU low latency PBS): the number of input LWEs must be lower "
+       "or equal to the number of streaming multiprocessors on the device "
+       "divided by 4 * "
+       "(k + 1) * level_count",
+       num_samples <= number_of_sm * 4. / (glwe_dimension + 1) / level_count));
 }

 /*
  * Runs standard checks to validate the inputs
  */
-void checks_bootstrap_low_latency(int nbits, int level_count, int base_log,
+void checks_bootstrap_low_latency(int nbits, int glwe_dimension,
+                                  int level_count, int base_log,
                                   int polynomial_size, int num_samples) {

   assert(("Error (GPU low latency PBS): base log should be <= nbits",
           base_log <= nbits));
-  checks_fast_bootstrap_low_latency(nbits, level_count, polynomial_size,
-                                    num_samples);
+  checks_fast_bootstrap_low_latency(glwe_dimension, level_count,
+                                    polynomial_size, num_samples);
 }

 /*
@@ -47,8 +49,8 @@ void scratch_cuda_bootstrap_low_latency_32(
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
     uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
     bool allocate_gpu_memory) {
-  checks_fast_bootstrap_low_latency(32, level_count, polynomial_size,
-                                    input_lwe_ciphertext_count);
+  checks_fast_bootstrap_low_latency(
+      glwe_dimension, level_count, polynomial_size, input_lwe_ciphertext_count);

   switch (polynomial_size) {
   case 256:
@@ -103,8 +105,8 @@ void scratch_cuda_bootstrap_low_latency_64(
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
     uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
     bool allocate_gpu_memory) {
-  checks_fast_bootstrap_low_latency(64, level_count, polynomial_size,
-                                    input_lwe_ciphertext_count);
+  checks_fast_bootstrap_low_latency(
+      glwe_dimension, level_count, polynomial_size, input_lwe_ciphertext_count);

   switch (polynomial_size) {
   case 256:
@@ -163,8 +165,8 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
     uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
     uint32_t max_shared_memory) {

-  checks_bootstrap_low_latency(32, level_count, base_log, polynomial_size,
-                               num_samples);
+  checks_bootstrap_low_latency(32, glwe_dimension, level_count, base_log,
+                               polynomial_size, num_samples);

   switch (polynomial_size) {
   case 256:
@@ -304,8 +306,8 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
     uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
     uint32_t max_shared_memory) {

-  checks_bootstrap_low_latency(64, level_count, base_log, polynomial_size,
-                               num_samples);
+  checks_bootstrap_low_latency(64, glwe_dimension, level_count, base_log,
+                               polynomial_size, num_samples);

   switch (polynomial_size) {
   case 256:

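The low-latency PBS bound differs from the extract-bits one in that it multiplies the SM count by 4 instead of dividing by it, as in the source above. A sketch of the bound as it now reads (helper name made up; beyond the 50%-occupancy comment in the source, any interpretation of the factor 4 is an assumption):

#include <cuda_runtime.h>

// Illustrative only: the low-latency PBS check now requires
// num_samples <= number_of_sm * 4 / (glwe_dimension + 1) / level_count.
int max_low_latency_pbs_samples(int glwe_dimension, int level_count) {
  int number_of_sm = 0;
  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
  return static_cast<int>(number_of_sm * 4. / (glwe_dimension + 1) /
                          level_count);
}

With 108 SMs (hypothetical), k = 1 and level_count = 2, that allows up to 108 * 4 / 2 / 2 = 108 input LWEs.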
@@ -4,7 +4,7 @@
 /*
  * Runs standard checks to validate the inputs
  */
-void checks_fast_circuit_bootstrap(int polynomial_size, int number_of_inputs) {
+void checks_fast_circuit_bootstrap(int polynomial_size) {

   assert(("Error (GPU circuit bootstrap): polynomial_size should be one of "
           "256, 512, 1024, 2048, 4096, 8192",
@@ -16,8 +16,8 @@ void checks_fast_circuit_bootstrap(int polynomial_size, int number_of_inputs) {
 /*
  * Runs standard checks to validate the inputs
  */
-void checks_circuit_bootstrap(int polynomial_size, int level_bsk,
-                              int number_of_inputs) {
+void checks_circuit_bootstrap(int glwe_dimension, int polynomial_size,
+                              int level_bsk, int number_of_inputs) {
   // The number of samples should be lower than the number of streaming
   // multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
   // to the occupancy of 50%). The only supported value for k is 1, so
@@ -26,11 +26,12 @@ void checks_circuit_bootstrap(int polynomial_size, int level_bsk,
   cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
   assert(("Error (GPU extract bits): the number of input LWEs must be lower or "
           "equal to the "
-          "number of streaming multiprocessors on the device divided by 8 * "
-          "level_count_bsk",
-          number_of_inputs <= number_of_sm / 4. / 2. / level_bsk));
+          "number of streaming multiprocessors on the device divided by 4 * "
+          "(k + 1) * level_count_bsk",
+          number_of_inputs <=
+              number_of_sm / 4. / (glwe_dimension + 1) / level_bsk));

-  checks_fast_circuit_bootstrap(polynomial_size, number_of_inputs);
+  checks_fast_circuit_bootstrap(polynomial_size);
 }

 /*
@@ -44,7 +45,7 @@ void scratch_cuda_circuit_bootstrap_32(
     uint32_t level_count_cbs, uint32_t number_of_inputs,
     uint32_t max_shared_memory, bool allocate_gpu_memory) {

-  checks_fast_circuit_bootstrap(polynomial_size, number_of_inputs);
+  checks_fast_circuit_bootstrap(polynomial_size);

   switch (polynomial_size) {
   case 256:
@@ -99,7 +100,7 @@ void scratch_cuda_circuit_bootstrap_64(
     uint32_t level_count_cbs, uint32_t number_of_inputs,
     uint32_t max_shared_memory, bool allocate_gpu_memory) {

-  checks_fast_circuit_bootstrap(polynomial_size, number_of_inputs);
+  checks_fast_circuit_bootstrap(polynomial_size);

   switch (polynomial_size) {
   case 256:
@@ -156,7 +157,8 @@ void cuda_circuit_bootstrap_32(
     uint32_t level_cbs, uint32_t base_log_cbs, uint32_t number_of_inputs,
     uint32_t max_shared_memory) {

-  checks_circuit_bootstrap(polynomial_size, level_bsk, number_of_inputs);
+  checks_circuit_bootstrap(glwe_dimension, polynomial_size, level_bsk,
+                           number_of_inputs);

   switch (polynomial_size) {
   case 256:
@@ -252,7 +254,8 @@ void cuda_circuit_bootstrap_64(
     uint32_t level_cbs, uint32_t base_log_cbs, uint32_t number_of_inputs,
     uint32_t max_shared_memory) {

-  checks_circuit_bootstrap(polynomial_size, level_bsk, number_of_inputs);
+  checks_circuit_bootstrap(glwe_dimension, polynomial_size, level_bsk,
+                           number_of_inputs);

   switch (polynomial_size) {
   case 256:

@@ -5,7 +5,7 @@
 /*
  * Runs standard checks to validate the inputs
  */
-void checks_fast_cmux_tree(int nbits, int polynomial_size, int r) {
+void checks_fast_cmux_tree(int polynomial_size, int r) {
   assert((
       "Error (GPU Cmux tree): polynomial size should be one of 256, 512, 1024, "
       "2048, 4096, 8192",
@@ -14,7 +14,7 @@ void checks_fast_cmux_tree(int nbits, int polynomial_size, int r) {
       polynomial_size == 4096 || polynomial_size == 8192));
   // For larger k we will need to adjust the mask size
   assert(("Error (GPU Cmux tree): r, the number of layers in the tree, should "
-          "be >= 1 ",
+          "be >= 1",
          r >= 1));
 }

@@ -25,7 +25,7 @@ void checks_cmux_tree(int nbits, int polynomial_size, int base_log, int r) {

   assert(("Error (GPU Cmux tree): base log should be <= nbits",
           base_log <= nbits));
-  checks_fast_cmux_tree(nbits, polynomial_size, r);
+  checks_fast_cmux_tree(polynomial_size, r);
 }

 /*
@@ -34,9 +34,7 @@ void checks_cmux_tree(int nbits, int polynomial_size, int base_log, int r) {
 void checks_blind_rotation_and_sample_extraction(int polynomial_size) {

   assert(("Error (GPU Blind rotation + sample extraction): polynomial size "
-          "should be one of 256, 512, "
-          "1024, "
-          "2048, 4096, 8192",
+          "should be one of 256, 512, 1024, 2048, 4096, 8192",
           polynomial_size == 256 || polynomial_size == 512 ||
           polynomial_size == 1024 || polynomial_size == 2048 ||
           polynomial_size == 4096 || polynomial_size == 8192));
@@ -54,7 +52,7 @@ void scratch_cuda_cmux_tree_32(void *v_stream, uint32_t gpu_index,
                               uint32_t r, uint32_t tau,
                               uint32_t max_shared_memory,
                               bool allocate_gpu_memory) {
-  checks_fast_cmux_tree(32, polynomial_size, r);
+  checks_fast_cmux_tree(polynomial_size, r);

   switch (polynomial_size) {
   case 256:
@@ -104,7 +102,7 @@ void scratch_cuda_cmux_tree_64(void *v_stream, uint32_t gpu_index,
                               uint32_t r, uint32_t tau,
                               uint32_t max_shared_memory,
                               bool allocate_gpu_memory) {
-  checks_fast_cmux_tree(64, polynomial_size, r);
+  checks_fast_cmux_tree(polynomial_size, r);

   switch (polynomial_size) {
   case 256:

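On the relation between the cmux-tree depth r checked above and the wop PBS parameters checked below: the diff does not show how r is derived, but in the vertical packing step a LUT over t extracted bits is stored as 2^r polynomials of size N with r = t - log2(N), which is why the new wop PBS assertion keeps t >= log2(N). A hedged sketch under that assumption (helper name made up):

#include <cmath>

// Assumed derivation (not shown in this diff): total extracted bits minus
// log2(polynomial_size) gives the cmux-tree depth r used by vertical packing.
int cmux_tree_depth(int polynomial_size, int number_of_inputs,
                    int number_of_bits_to_extract) {
  int total_bits = number_of_inputs * number_of_bits_to_extract;
  return total_bits - static_cast<int>(std::log2(polynomial_size));
}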
@@ -1,10 +1,12 @@
 #include "wop_bootstrap.cuh"
+#include <cmath>

 /*
  * Runs standard checks to validate the inputs
  */
-void checks_wop_pbs(int polynomial_size, int level_count_bsk,
-                    int number_of_inputs) {
+void checks_wop_pbs(int glwe_dimension, int polynomial_size,
+                    int level_count_bsk, int number_of_inputs,
+                    int number_of_bits_to_extract) {
   assert(("Error (GPU WOP PBS): polynomial_size should be one of "
           "256, 512, 1024, 2048, 4096, 8192",
           polynomial_size == 256 || polynomial_size == 512 ||
@@ -18,9 +20,16 @@ void checks_wop_pbs(int polynomial_size, int level_count_bsk,
   cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
   assert(("Error (GPU WOP PBS): the number of input LWEs must be lower or "
           "equal to the "
-          "number of streaming multiprocessors on the device divided by 8 * "
+          "number of streaming multiprocessors on the device divided by 4 * (k "
+          "+ 1) * "
           "level_count_bsk",
-          number_of_inputs <= number_of_sm / 4. / 2. / level_count_bsk));
+          number_of_inputs <=
+              number_of_sm / 4. / (glwe_dimension + 1) / level_count_bsk));
+  assert(
+      ("Error (GPU WOP PBS): the number of inputs x the number of extracted "
+       "bits should be "
+       "larger than log2 of the polynomial size",
+       number_of_inputs * number_of_bits_to_extract >= log2(polynomial_size)));
 }

 void checks_fast_circuit_bootstrap_vertical_packing(int polynomial_size) {
@@ -31,7 +40,8 @@ void checks_fast_circuit_bootstrap_vertical_packing(int polynomial_size) {
           polynomial_size == 4096 || polynomial_size == 8192));
 }

-void checks_circuit_bootstrap_vertical_packing(int polynomial_size,
+void checks_circuit_bootstrap_vertical_packing(int glwe_dimension,
+                                               int polynomial_size,
                                                int number_of_inputs,
                                                int level_count_bsk) {
   // The number of inputs should be lower than the number of streaming
@@ -42,9 +52,11 @@ void checks_circuit_bootstrap_vertical_packing(int polynomial_size,
   cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
   assert(("Error (GPU extract bits): the number of input LWEs must be lower or "
           "equal to the "
-          "number of streaming multiprocessors on the device divided by 8 * "
+          "number of streaming multiprocessors on the device divided by 4 * (k "
+          "+ 1) "
           "level_count_bsk",
-          number_of_inputs <= number_of_sm / 4. / 2. / level_count_bsk));
+          number_of_inputs <=
+              number_of_sm / 4. / (glwe_dimension + 1) / level_count_bsk));
   checks_fast_circuit_bootstrap_vertical_packing(polynomial_size);
 }

@@ -176,7 +188,8 @@ void scratch_cuda_wop_pbs_32(
     uint32_t number_of_bits_of_message_including_padding,
     uint32_t number_of_bits_to_extract, uint32_t number_of_inputs,
     uint32_t max_shared_memory, bool allocate_gpu_memory) {
-  checks_wop_pbs(polynomial_size, level_count_bsk, number_of_inputs);
+  checks_wop_pbs(glwe_dimension, polynomial_size, level_count_bsk,
+                 number_of_inputs, number_of_bits_to_extract);
   switch (polynomial_size) {
   case 256:
     scratch_wop_pbs<uint32_t, int32_t, Degree<256>>(
@@ -245,7 +258,8 @@ void scratch_cuda_wop_pbs_64(
     uint32_t number_of_bits_of_message_including_padding,
     uint32_t number_of_bits_to_extract, uint32_t number_of_inputs,
     uint32_t max_shared_memory, bool allocate_gpu_memory) {
-  checks_wop_pbs(polynomial_size, level_count_bsk, number_of_inputs);
+  checks_wop_pbs(glwe_dimension, polynomial_size, level_count_bsk,
+                 number_of_inputs, number_of_bits_to_extract);
   switch (polynomial_size) {
   case 256:
     scratch_wop_pbs<uint64_t, int64_t, Degree<256>>(
@@ -337,8 +351,8 @@ void cuda_circuit_bootstrap_vertical_packing_64(
     uint32_t base_log_cbs, uint32_t number_of_inputs, uint32_t lut_number,
     uint32_t max_shared_memory) {

-  checks_circuit_bootstrap_vertical_packing(polynomial_size, number_of_inputs,
-                                            level_count_bsk);
+  checks_circuit_bootstrap_vertical_packing(glwe_dimension, polynomial_size,
+                                            number_of_inputs, level_count_bsk);

   switch (polynomial_size) {
   case 256:
@@ -453,7 +467,8 @@ void cuda_wop_pbs_64(void *v_stream, uint32_t gpu_index, void *lwe_array_out,
     uint32_t number_of_bits_of_message_including_padding,
     uint32_t number_of_bits_to_extract, uint32_t delta_log,
     uint32_t number_of_inputs, uint32_t max_shared_memory) {
-  checks_wop_pbs(polynomial_size, level_count_bsk, number_of_inputs);
+  checks_wop_pbs(glwe_dimension, polynomial_size, level_count_bsk,
+                 number_of_inputs, number_of_bits_to_extract);
   switch (polynomial_size) {
   case 256:
     host_wop_pbs<uint64_t, int64_t, Degree<256>>(
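As a usage note, callers now pass the GLWE dimension and the number of bits to extract to checks_wop_pbs explicitly. A hedged call-site sketch (parameter values are made up for illustration and assume the declaration from the diff is in scope):

// Hypothetical parameter set, for illustration only.
void example_wop_pbs_checks() {
  int glwe_dimension = 1;             // k
  int polynomial_size = 1024;         // N
  int level_count_bsk = 2;
  int number_of_inputs = 4;
  int number_of_bits_to_extract = 5;  // 4 * 5 = 20 >= log2(1024) = 10

  checks_wop_pbs(glwe_dimension, polynomial_size, level_count_bsk,
                 number_of_inputs, number_of_bits_to_extract);
}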