fix(cuda): fix the assert on the number of inputs in the low lat pbs

This commit is contained in:
Agnes Leroy
2022-12-02 10:13:36 +01:00
committed by Agnès Leroy
parent 9bcf0f8a70
commit 0a0c45338c
2 changed files with 11 additions and 11 deletions

View File

@@ -72,7 +72,7 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
polynomial_size == 512 || polynomial_size == 1024 ||
polynomial_size == 2048 || polynomial_size == 4096 ||
polynomial_size == 8192));
// The number of samples should be lower than SM/(4 * (k + 1) * l) (the
// The number of samples should be lower than 4 * SM/((k + 1) * l) (the
// factor 4 being related to the occupancy of 50%). The only supported
// value for k is 1, so k + 1 = 2 for now.
int number_of_sm = 0;
@@ -81,7 +81,7 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
"or equal to the "
"number of streaming multiprocessors on the device divided by 8 * "
"level_count",
num_samples <= number_of_sm / 4. / 2. / level_count));
num_samples <= number_of_sm * 4. / 2. / level_count));
switch (polynomial_size) {
case 512:
@@ -140,7 +140,7 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
polynomial_size == 512 || polynomial_size == 1024 ||
polynomial_size == 2048 || polynomial_size == 4096 ||
polynomial_size == 8192));
// The number of samples should be lower than SM/(4 * (k + 1) * l) (the
// The number of samples should be lower than 4 * SM/((k + 1) * l) (the
// factor 4 being related to the occupancy of 50%). The only supported
// value for k is 1, so k + 1 = 2 for now.
int number_of_sm = 0;
@@ -149,7 +149,7 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
"or equal to the "
"number of streaming multiprocessors on the device divided by 8 * "
"level_count",
num_samples <= number_of_sm / 4. / 2. / level_count));
num_samples <= number_of_sm * 4. / 2. / level_count));
switch (polynomial_size) {
case 512:

View File

@@ -129,8 +129,8 @@ void cuda_extract_bits_32(
lwe_dimension_in == 512 || lwe_dimension_in == 1024 ||
lwe_dimension_in == 2048 || lwe_dimension_in == 4096 ||
lwe_dimension_in == 8192));
// The number of samples should be lower than the number of streaming
// multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
// The number of samples should be lower than 4 times the number of streaming
// multiprocessors divided by ((k + 1) * l) (the factor 4 being related
// to the occupancy of 50%). The only supported value for k is 1, so
// k + 1 = 2 for now.
int number_of_sm = 0;
@@ -139,7 +139,7 @@ void cuda_extract_bits_32(
"equal to the "
"number of streaming multiprocessors on the device divided by 8 * "
"level_count_bsk",
number_of_samples <= number_of_sm / 4. / 2. / level_count_bsk));
number_of_samples <= number_of_sm * 4. / 2. / level_count_bsk));
switch (lwe_dimension_in) {
case 512:
@@ -226,9 +226,9 @@ void cuda_extract_bits_64(
lwe_dimension_in == 512 || lwe_dimension_in == 1024 ||
lwe_dimension_in == 2048 || lwe_dimension_in == 4096 ||
lwe_dimension_in == 8192));
// The number of samples should be lower than the number of streaming
// multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
// to the occupancy of 50%). The only supported value for k is 1, so
// The number of samples should be lower than four times the number of
// streaming multiprocessors divided by ((k + 1) * l) (the factor 4 being
// related to the occupancy of 50%). The only supported value for k is 1, so
// k + 1 = 2 for now.
int number_of_sm = 0;
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
@@ -236,7 +236,7 @@ void cuda_extract_bits_64(
"equal to the "
"number of streaming multiprocessors on the device divided by 8 * "
"level_count_bsk",
number_of_samples <= number_of_sm / 4. / 2. / level_count_bsk));
number_of_samples <= number_of_sm * 4. / 2. / level_count_bsk));
switch (lwe_dimension_in) {
case 512: