chore(concrete_cuda): Add checks to validate inputs passed to the scratch functions

This commit is contained in:
Pedro Alves
2023-03-13 15:37:40 +01:00
committed by Agnès Leroy
parent e36e2bd340
commit 5cb4e5ff4e
6 changed files with 261 additions and 176 deletions

View File

@@ -1,5 +1,41 @@
#include "bit_extraction.cuh"
/*
 * Runs standard checks to validate the inputs
 *
 * Validates the parameters shared by the bit-extraction scratch and run entry
 * points: `polynomial_size` must be one of the supported power-of-two degrees,
 * and `number_of_samples` must not exceed the occupancy-derived launch limit
 * SM / (4 * (k + 1) * l) = SM / (8 * level_count_bsk) for k = 1.
 * `nbits` is unused here; it is kept so this signature mirrors
 * checks_extract_bits. Aborts via assert on failure (no-op under NDEBUG).
 * NOTE(review): queries device 0 unconditionally — confirm this matches the
 * gpu_index used by the callers.
 */
void checks_fast_extract_bits(int nbits, int polynomial_size,
                              int level_count_bsk, int number_of_samples) {
  // (message, condition) comma expression: assert tests only the condition;
  // the string literal documents the failure in the assert printout.
  assert(("Error (GPU extract bits): polynomial_size should be one of "
          "256, 512, 1024, 2048, 4096, 8192",
          polynomial_size == 256 || polynomial_size == 512 ||
              polynomial_size == 1024 || polynomial_size == 2048 ||
              polynomial_size == 4096 || polynomial_size == 8192));
  // The enforced bound is SM / (4 * (k + 1) * l) = SM / (8 * level_count_bsk)
  // (the factor 4 being related to the occupancy of 50%). The only supported
  // value for k is 1, so k + 1 = 2 for now.
  int number_of_sm = 0;
  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
  assert(("Error (GPU extract bits): the number of input LWEs must be lower or "
          "equal to the "
          "number of streaming multiprocessors on the device divided by 8 * "
          "level_count_bsk",
          number_of_samples <= number_of_sm / 4. / 2. / level_count_bsk));
}
/*
 * Runs standard checks to validate the inputs
 *
 * Full validation for the bit-extraction run entry points: the decomposition
 * base log must fit in the ciphertext width (`nbits`), then the remaining
 * checks are delegated to checks_fast_extract_bits.
 */
void checks_extract_bits(int nbits, int polynomial_size, int base_log_bsk,
                         int level_count_bsk, int number_of_samples) {
  // A base log wider than the torus representation is meaningless.
  assert(("Error (GPU extract bits): base log should be <= nbits",
          base_log_bsk <= nbits));
  // Base-log-independent checks (degree, occupancy bound).
  checks_fast_extract_bits(nbits, polynomial_size, level_count_bsk,
                           number_of_samples);
}
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the bit extraction on 32 bits inputs, into `cbs_buffer`. It also
@@ -11,6 +47,8 @@ void scratch_cuda_extract_bits_32(
uint32_t level_count, uint32_t number_of_inputs, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
checks_fast_extract_bits(32, polynomial_size, level_count, number_of_inputs);
switch (polynomial_size) {
case 256:
scratch_extract_bits<uint32_t, int32_t, Degree<256>>(
@@ -63,6 +101,7 @@ void scratch_cuda_extract_bits_64(
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t number_of_inputs, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
checks_fast_extract_bits(64, polynomial_size, level_count, number_of_inputs);
switch (polynomial_size) {
case 256:
@@ -119,24 +158,8 @@ void cuda_extract_bits_32(void *v_stream, uint32_t gpu_index,
uint32_t level_count_bsk, uint32_t base_log_ksk,
uint32_t level_count_ksk, uint32_t number_of_samples,
uint32_t max_shared_memory) {
assert(("Error (GPU extract bits): base log should be <= 32",
base_log_bsk <= 32));
assert(("Error (GPU extract bits): polynomial_size should be one of "
"256, 512, 1024, 2048, 4096, 8192",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
// The number of samples should be lower than four time the number of
// streaming multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being
// related to the occupancy of 50%). The only supported value for k is 1, so
// k + 1 = 2 for now.
int number_of_sm = 0;
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
assert(("Error (GPU extract bits): the number of input LWEs must be lower or "
"equal to the "
"number of streaming multiprocessors on the device divided by 8 * "
"level_count_bsk",
number_of_samples <= number_of_sm / 4. / 2. / level_count_bsk));
checks_extract_bits(32, polynomial_size, base_log_bsk, level_count_bsk,
number_of_samples);
switch (polynomial_size) {
case 256:
@@ -253,24 +276,8 @@ void cuda_extract_bits_64(void *v_stream, uint32_t gpu_index,
uint32_t level_count_bsk, uint32_t base_log_ksk,
uint32_t level_count_ksk, uint32_t number_of_samples,
uint32_t max_shared_memory) {
assert(("Error (GPU extract bits): base log should be <= 64",
base_log_bsk <= 64));
assert(("Error (GPU extract bits): polynomial_size should be one of "
"256, 512, 1024, 2048, 4096, 8192",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
// The number of samples should be lower than four time the number of
// streaming multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being
// related to the occupancy of 50%). The only supported value for k is 1, so
// k + 1 = 2 for now.
int number_of_sm = 0;
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
assert(("Error (GPU extract bits): the number of input LWEs must be lower or "
"equal to the "
"number of streaming multiprocessors on the device divided by 8 * "
"level_count_bsk",
number_of_samples <= number_of_sm / 4. / 2. / level_count_bsk));
checks_extract_bits(64, polynomial_size, base_log_bsk, level_count_bsk,
number_of_samples);
switch (polynomial_size) {
case 256:

View File

@@ -1,5 +1,26 @@
#include "bootstrap_amortized.cuh"
/*
 * Runs standard checks to validate the inputs
 *
 * Validates the parameters shared by the amortized PBS scratch and run entry
 * points. `nbits` is unused here; it keeps the signature symmetric with
 * checks_bootstrap_amortized. Aborts via assert on failure.
 */
void checks_fast_bootstrap_amortized(int nbits, int polynomial_size) {
  // Only these power-of-two degrees have template instantiations.
  const bool supported_degree =
      polynomial_size == 256 || polynomial_size == 512 ||
      polynomial_size == 1024 || polynomial_size == 2048 ||
      polynomial_size == 4096 || polynomial_size == 8192;
  assert(
      ("Error (GPU amortized PBS): polynomial size should be one of 256, 512, "
       "1024, 2048, 4096, 8192",
       supported_degree));
}
/*
 * Runs standard checks to validate the inputs
 *
 * Full validation for the amortized PBS run entry points: the decomposition
 * base log must fit in the ciphertext width (`nbits`), then delegates the
 * degree check to checks_fast_bootstrap_amortized.
 */
void checks_bootstrap_amortized(int nbits, int base_log, int polynomial_size) {
  // A base log wider than the torus representation is meaningless.
  assert(("Error (GPU amortized PBS): base log should be <= nbits",
          base_log <= nbits));
  checks_fast_bootstrap_amortized(nbits, polynomial_size);
}
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the amortized PBS on 32 bits inputs, into `pbs_buffer`. It also
@@ -13,6 +34,7 @@ void scratch_cuda_bootstrap_amortized_32(void *v_stream, uint32_t gpu_index,
uint32_t input_lwe_ciphertext_count,
uint32_t max_shared_memory,
bool allocate_gpu_memory) {
checks_fast_bootstrap_amortized(32, polynomial_size);
switch (polynomial_size) {
case 256:
@@ -63,6 +85,7 @@ void scratch_cuda_bootstrap_amortized_64(void *v_stream, uint32_t gpu_index,
uint32_t input_lwe_ciphertext_count,
uint32_t max_shared_memory,
bool allocate_gpu_memory) {
checks_fast_bootstrap_amortized(64, polynomial_size);
switch (polynomial_size) {
case 256:
@@ -111,14 +134,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
uint32_t max_shared_memory) {
assert(
("Error (GPU amortized PBS): base log should be <= 32", base_log <= 32));
assert(
("Error (GPU amortized PBS): polynomial size should be one of 256, 512, "
"1024, 2048, 4096, 8192",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
checks_bootstrap_amortized(32, base_log, polynomial_size);
switch (polynomial_size) {
case 256:
@@ -247,14 +263,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
uint32_t max_shared_memory) {
assert(
("Error (GPU amortized PBS): base log should be <= 64", base_log <= 64));
assert(
("Error (GPU amortized PBS): polynomial size should be one of 256, 512, "
"1024, 2048, 4096, 8192",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
checks_bootstrap_amortized(64, base_log, polynomial_size);
switch (polynomial_size) {
case 256:

View File

@@ -1,5 +1,41 @@
#include "bootstrap_low_latency.cuh"
/*
 * Runs standard checks to validate the inputs
 *
 * Validates the parameters shared by the low latency PBS scratch and run
 * entry points: the polynomial degree must be supported, and the number of
 * samples must fit the launch-capacity bound 4 * SM / (2 * level_count).
 * `nbits` is unused here; it keeps the signature symmetric with
 * checks_bootstrap_low_latency. Aborts via assert on failure.
 * NOTE(review): queries device 0 unconditionally — confirm this matches the
 * gpu_index used by the callers.
 */
void checks_fast_bootstrap_low_latency(int nbits, int level_count,
                                       int polynomial_size, int num_samples) {
  assert((
      "Error (GPU low latency PBS): polynomial size should be one of 256, 512, "
      "1024, 2048, 4096, 8192",
      polynomial_size == 256 || polynomial_size == 512 ||
          polynomial_size == 1024 || polynomial_size == 2048 ||
          polynomial_size == 4096 || polynomial_size == 8192));
  // The number of samples should be lower than 4 * SM/((k + 1) * l) (the
  // factor 4 being related to the occupancy of 50%). The only supported
  // value for k is 1, so k + 1 = 2 for now.
  int number_of_sm = 0;
  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
  // Fixed: the message previously claimed the bound was SM / (8 * level_count)
  // while the condition actually enforces 4 * SM / (2 * level_count).
  assert(("Error (GPU low latency PBS): the number of input LWEs must be lower "
          "or equal to "
          "4 * the number of streaming multiprocessors on the device divided "
          "by 2 * level_count",
          num_samples <= number_of_sm * 4. / 2. / level_count));
}
/*
 * Runs standard checks to validate the inputs
 *
 * Full validation for the low latency PBS run entry points: the decomposition
 * base log must fit in the ciphertext width (`nbits`), then delegates the
 * remaining checks to checks_fast_bootstrap_low_latency.
 */
void checks_bootstrap_low_latency(int nbits, int level_count, int base_log,
                                  int polynomial_size, int num_samples) {
  // A base log wider than the torus representation is meaningless.
  assert(("Error (GPU low latency PBS): base log should be <= nbits",
          base_log <= nbits));
  checks_fast_bootstrap_low_latency(nbits, level_count, polynomial_size,
                                    num_samples);
}
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the low latency PBS on 32 bits inputs, into `pbs_buffer`. It also
@@ -11,6 +47,8 @@ void scratch_cuda_bootstrap_low_latency_32(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
checks_fast_bootstrap_low_latency(32, level_count, polynomial_size,
input_lwe_ciphertext_count);
switch (polynomial_size) {
case 256:
@@ -65,6 +103,8 @@ void scratch_cuda_bootstrap_low_latency_64(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
checks_fast_bootstrap_low_latency(64, level_count, polynomial_size,
input_lwe_ciphertext_count);
switch (polynomial_size) {
case 256:
@@ -123,24 +163,8 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
uint32_t max_shared_memory) {
assert(("Error (GPU low latency PBS): base log should be <= 32",
base_log <= 32));
assert((
"Error (GPU low latency PBS): polynomial size should be one of 256, 512, "
"1024, 2048, 4096, 8192",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
// The number of samples should be lower than 4 * SM/((k + 1) * l) (the
// factor 4 being related to the occupancy of 50%). The only supported
// value for k is 1, so k + 1 = 2 for now.
int number_of_sm = 0;
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
assert(("Error (GPU low latency PBS): the number of input LWEs must be lower "
"or equal to the "
"number of streaming multiprocessors on the device divided by 8 * "
"level_count",
num_samples <= number_of_sm * 4. / 2. / level_count));
checks_bootstrap_low_latency(32, level_count, base_log, polynomial_size,
num_samples);
switch (polynomial_size) {
case 256:
@@ -280,24 +304,8 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
uint32_t max_shared_memory) {
assert(("Error (GPU low latency PBS): base log should be <= 64",
base_log <= 64));
assert((
"Error (GPU low latency PBS): polynomial size should be one of 256, 512, "
"1024, 2048, 4096, 8192",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
// The number of samples should be lower than 4 * SM/((k + 1) * l) (the
// factor 4 being related to the occupancy of 50%). The only supported
// value for k is 1, so k + 1 = 2 for now.
int number_of_sm = 0;
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
assert(("Error (GPU low latency PBS): the number of input LWEs must be lower "
"or equal to the "
"number of streaming multiprocessors on the device divided by 8 * "
"level_count",
num_samples <= number_of_sm * 4. / 2. / level_count));
checks_bootstrap_low_latency(64, level_count, base_log, polynomial_size,
num_samples);
switch (polynomial_size) {
case 256:

View File

@@ -1,6 +1,38 @@
#include "circuit_bootstrap.cuh"
#include "circuit_bootstrap.h"
/*
 * Runs standard checks to validate the inputs
 *
 * Validates the polynomial degree shared by the circuit bootstrap scratch and
 * run entry points. `number_of_inputs` is unused here; the input-count bound
 * is enforced by checks_circuit_bootstrap. Aborts via assert on failure.
 */
void checks_fast_circuit_bootstrap(int polynomial_size, int number_of_inputs) {
  // Only these power-of-two degrees have template instantiations.
  const bool supported_degree =
      polynomial_size == 256 || polynomial_size == 512 ||
      polynomial_size == 1024 || polynomial_size == 2048 ||
      polynomial_size == 4096 || polynomial_size == 8192;
  assert(("Error (GPU circuit bootstrap): polynomial_size should be one of "
          "256, 512, 1024, 2048, 4096, 8192",
          supported_degree));
}
/*
 * Runs standard checks to validate the inputs
 *
 * Full validation for the circuit bootstrap run entry points: the number of
 * inputs must fit the occupancy-derived launch bound
 * SM / (4 * (k + 1) * l) = SM / (8 * level_bsk) for k = 1, then the degree
 * check is delegated to checks_fast_circuit_bootstrap.
 * NOTE(review): queries device 0 unconditionally — confirm this matches the
 * gpu_index used by the callers.
 */
void checks_circuit_bootstrap(int polynomial_size, int level_bsk,
                              int number_of_inputs) {
  // The number of samples should be lower than the number of streaming
  // multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
  // to the occupancy of 50%). The only supported value for k is 1, so
  // k + 1 = 2 for now.
  int number_of_sm = 0;
  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
  // Fixed: the message was copy-pasted from the bit extraction checks — it
  // reported "GPU extract bits" and "level_count_bsk" for a circuit
  // bootstrap failure whose parameter is named level_bsk.
  assert(("Error (GPU circuit bootstrap): the number of input LWEs must be "
          "lower or equal to the "
          "number of streaming multiprocessors on the device divided by 8 * "
          "level_bsk",
          number_of_inputs <= number_of_sm / 4. / 2. / level_bsk));
  checks_fast_circuit_bootstrap(polynomial_size, number_of_inputs);
}
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the circuit bootstrap on 32 bits inputs, into `cbs_buffer`. It also
@@ -12,6 +44,8 @@ void scratch_cuda_circuit_bootstrap_32(
uint32_t level_count_cbs, uint32_t number_of_inputs,
uint32_t max_shared_memory, bool allocate_gpu_memory) {
checks_fast_circuit_bootstrap(polynomial_size, number_of_inputs);
switch (polynomial_size) {
case 256:
scratch_circuit_bootstrap<uint32_t, int32_t, Degree<256>>(
@@ -65,6 +99,8 @@ void scratch_cuda_circuit_bootstrap_64(
uint32_t level_count_cbs, uint32_t number_of_inputs,
uint32_t max_shared_memory, bool allocate_gpu_memory) {
checks_fast_circuit_bootstrap(polynomial_size, number_of_inputs);
switch (polynomial_size) {
case 256:
scratch_circuit_bootstrap<uint64_t, int64_t, Degree<256>>(
@@ -119,22 +155,9 @@ void cuda_circuit_bootstrap_32(
uint32_t base_log_bsk, uint32_t level_pksk, uint32_t base_log_pksk,
uint32_t level_cbs, uint32_t base_log_cbs, uint32_t number_of_inputs,
uint32_t max_shared_memory) {
assert(("Error (GPU circuit bootstrap): polynomial_size should be one of "
"256, 512, 1024, 2048, 4096, 8192",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
// The number of samples should be lower than the number of streaming
// multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
// to the occupancy of 50%). The only supported value for k is 1, so
// k + 1 = 2 for now.
int number_of_sm = 0;
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
assert(("Error (GPU extract bits): the number of input LWEs must be lower or "
"equal to the "
"number of streaming multiprocessors on the device divided by 8 * "
"level_count_bsk",
number_of_inputs <= number_of_sm / 4. / 2. / level_bsk));
checks_circuit_bootstrap(polynomial_size, level_bsk, number_of_inputs);
switch (polynomial_size) {
case 256:
host_circuit_bootstrap<uint32_t, Degree<256>>(
@@ -228,23 +251,9 @@ void cuda_circuit_bootstrap_64(
uint32_t base_log_bsk, uint32_t level_pksk, uint32_t base_log_pksk,
uint32_t level_cbs, uint32_t base_log_cbs, uint32_t number_of_inputs,
uint32_t max_shared_memory) {
assert(("Error (GPU circuit bootstrap): polynomial_size should be one of "
"256, 512, 1024, 2048, 4096, 8192",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
// The number of samples should be lower than the number of streaming
// multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
// to the occupancy of 50%). The only supported value for k is 1, so
// k + 1 = 2 for now.
int number_of_sm = 0;
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
assert(("Error (GPU extract bits): the number of input LWEs must be lower or "
"equal to the "
"number of streaming multiprocessors on the device divided by 8 * "
"level_count_bsk",
number_of_inputs <= number_of_sm / 4. / 2. / level_bsk));
// The number of samples should be lower than the number of streaming
checks_circuit_bootstrap(polynomial_size, level_bsk, number_of_inputs);
switch (polynomial_size) {
case 256:
host_circuit_bootstrap<uint64_t, Degree<256>>(

View File

@@ -2,6 +2,46 @@
#include "vertical_packing.h"
#include <cassert>
/*
 * Runs standard checks to validate the inputs
 *
 * Validates the parameters shared by the Cmux tree scratch and run entry
 * points: a supported polynomial degree and at least one tree layer.
 * `nbits` is unused here; it keeps the signature symmetric with
 * checks_cmux_tree. Aborts via assert on failure.
 */
void checks_fast_cmux_tree(int nbits, int polynomial_size, int r) {
  // Only these power-of-two degrees have template instantiations.
  const bool supported_degree =
      polynomial_size == 256 || polynomial_size == 512 ||
      polynomial_size == 1024 || polynomial_size == 2048 ||
      polynomial_size == 4096 || polynomial_size == 8192;
  assert((
      "Error (GPU Cmux tree): polynomial size should be one of 256, 512, 1024, "
      "2048, 4096, 8192",
      supported_degree));
  // For larger k we will need to adjust the mask size
  assert(("Error (GPU Cmux tree): r, the number of layers in the tree, should "
          "be >= 1 ",
          r >= 1));
}
/*
 * Runs standard checks to validate the inputs
 *
 * Full validation for the Cmux tree run entry points: the decomposition base
 * log must fit in the ciphertext width (`nbits`), then delegates the degree
 * and layer-count checks to checks_fast_cmux_tree.
 */
void checks_cmux_tree(int nbits, int polynomial_size, int base_log, int r) {
  // A base log wider than the torus representation is meaningless.
  assert(("Error (GPU Cmux tree): base log should be <= nbits",
          base_log <= nbits));
  checks_fast_cmux_tree(nbits, polynomial_size, r);
}
/*
 * Runs standard checks to validate the inputs
 *
 * Validates that the polynomial degree used by the blind rotation + sample
 * extraction entry points is one of the supported power-of-two sizes.
 * Aborts via assert on failure.
 */
void checks_blind_rotation_and_sample_extraction(int polynomial_size) {
  // Only these power-of-two degrees have template instantiations.
  const bool supported_degree =
      polynomial_size == 256 || polynomial_size == 512 ||
      polynomial_size == 1024 || polynomial_size == 2048 ||
      polynomial_size == 4096 || polynomial_size == 8192;
  assert(("Error (GPU Blind rotation + sample extraction): polynomial size "
          "should be one of 256, 512, "
          "1024, "
          "2048, 4096, 8192",
          supported_degree));
}
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the Cmux tree on 32 bits inputs, into `cmux_tree_buffer`. It also configures
@@ -14,6 +54,7 @@ void scratch_cuda_cmux_tree_32(void *v_stream, uint32_t gpu_index,
uint32_t r, uint32_t tau,
uint32_t max_shared_memory,
bool allocate_gpu_memory) {
checks_fast_cmux_tree(32, polynomial_size, r);
switch (polynomial_size) {
case 256:
@@ -63,6 +104,8 @@ void scratch_cuda_cmux_tree_64(void *v_stream, uint32_t gpu_index,
uint32_t r, uint32_t tau,
uint32_t max_shared_memory,
bool allocate_gpu_memory) {
checks_fast_cmux_tree(64, polynomial_size, r);
switch (polynomial_size) {
case 256:
scratch_cmux_tree<uint64_t, int64_t, Degree<256>>(
@@ -110,17 +153,7 @@ void cuda_cmux_tree_32(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
uint32_t level_count, uint32_t r, uint32_t tau,
uint32_t max_shared_memory) {
assert(("Error (GPU Cmux tree): base log should be <= 32", base_log <= 32));
assert((
"Error (GPU Cmux tree): polynomial size should be one of 256, 512, 1024, "
"2048, 4096, 8192",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
// For larger k we will need to adjust the mask size
assert(("Error (GPU Cmux tree): r, the number of layers in the tree, should "
"be >= 1 ",
r >= 1));
checks_cmux_tree(32, polynomial_size, base_log, r);
switch (polynomial_size) {
case 256:
@@ -197,18 +230,7 @@ void cuda_cmux_tree_64(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t r, uint32_t tau,
uint32_t max_shared_memory) {
assert(("Error (GPU Cmux tree): base log should be <= 64", base_log <= 64));
assert((
"Error (GPU Cmux tree): polynomial size should be one of 256, 512, 1024, "
"2048, 4096, 8192",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
// For larger k we will need to adjust the mask size
assert(("Error (GPU Cmux tree): r, the number of layers in the tree, should "
"be >= 1 ",
r >= 1));
checks_cmux_tree(64, polynomial_size, base_log, r);
switch (polynomial_size) {
case 256:
@@ -273,6 +295,7 @@ void scratch_cuda_blind_rotation_sample_extraction_32(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t mbr_size, uint32_t tau, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
checks_blind_rotation_and_sample_extraction(polynomial_size);
switch (polynomial_size) {
case 256:
@@ -320,6 +343,7 @@ void scratch_cuda_blind_rotation_sample_extraction_64(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t mbr_size, uint32_t tau, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
checks_blind_rotation_and_sample_extraction(polynomial_size);
switch (polynomial_size) {
case 256:
@@ -386,6 +410,7 @@ void cuda_blind_rotate_and_sample_extraction_64(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t l_gadget, uint32_t max_shared_memory) {
checks_blind_rotation_and_sample_extraction(polynomial_size);
switch (polynomial_size) {
case 256:
host_blind_rotate_and_sample_extraction<uint64_t, int64_t, Degree<256>>(

View File

@@ -1,5 +1,53 @@
#include "wop_bootstrap.cuh"
/*
 * Runs standard checks to validate the inputs
 *
 * Validates the WOP PBS parameters: a supported polynomial degree, and a
 * number of inputs fitting the occupancy-derived launch bound
 * SM / (4 * (k + 1) * l) = SM / (8 * level_count_bsk) for k = 1.
 * Aborts via assert on failure.
 * NOTE(review): queries device 0 unconditionally — confirm this matches the
 * gpu_index used by the callers.
 */
void checks_wop_pbs(int polynomial_size, int level_count_bsk,
                    int number_of_inputs) {
  // Only these power-of-two degrees have template instantiations.
  const bool supported_degree =
      polynomial_size == 256 || polynomial_size == 512 ||
      polynomial_size == 1024 || polynomial_size == 2048 ||
      polynomial_size == 4096 || polynomial_size == 8192;
  assert(("Error (GPU WOP PBS): polynomial_size should be one of "
          "256, 512, 1024, 2048, 4096, 8192",
          supported_degree));
  // The number of inputs should be lower than the number of streaming
  // multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
  // to the occupancy of 50%). The only supported value for k is 1, so
  // k + 1 = 2 for now.
  int sm_count = 0;
  cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, 0);
  assert(("Error (GPU WOP PBS): the number of input LWEs must be lower or "
          "equal to the "
          "number of streaming multiprocessors on the device divided by 8 * "
          "level_count_bsk",
          number_of_inputs <= sm_count / 4. / 2. / level_count_bsk));
}
/*
 * Runs standard checks to validate the inputs
 *
 * Validates the polynomial degree shared by the circuit bootstrap + vertical
 * packing scratch and run entry points. Aborts via assert on failure.
 */
void checks_fast_circuit_bootstrap_vertical_packing(int polynomial_size) {
  // Only these power-of-two degrees have template instantiations.
  const bool supported_degree =
      polynomial_size == 256 || polynomial_size == 512 ||
      polynomial_size == 1024 || polynomial_size == 2048 ||
      polynomial_size == 4096 || polynomial_size == 8192;
  assert(("Error (GPU circuit bootstrap): polynomial_size should be one of "
          "256, 512, 1024, 2048, 4096, 8192",
          supported_degree));
}
/*
 * Runs standard checks to validate the inputs
 *
 * Full validation for the circuit bootstrap + vertical packing run entry
 * points: the number of inputs must fit the occupancy-derived launch bound
 * SM / (4 * (k + 1) * l) = SM / (8 * level_count_bsk) for k = 1, then the
 * degree check is delegated to
 * checks_fast_circuit_bootstrap_vertical_packing.
 * NOTE(review): queries device 0 unconditionally — confirm this matches the
 * gpu_index used by the callers.
 */
void checks_circuit_bootstrap_vertical_packing(int polynomial_size,
                                               int number_of_inputs,
                                               int level_count_bsk) {
  // The number of inputs should be lower than the number of streaming
  // multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
  // to the occupancy of 50%). The only supported value for k is 1, so
  // k + 1 = 2 for now.
  int number_of_sm = 0;
  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
  // Fixed: the message was copy-pasted from the bit extraction checks and
  // reported "GPU extract bits" for a circuit bootstrap failure.
  assert(("Error (GPU circuit bootstrap): the number of input LWEs must be "
          "lower or equal to the "
          "number of streaming multiprocessors on the device divided by 8 * "
          "level_count_bsk",
          number_of_inputs <= number_of_sm / 4. / 2. / level_count_bsk));
  checks_fast_circuit_bootstrap_vertical_packing(polynomial_size);
}
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the circuit bootstrap and vertical packing on 32 bits inputs, into
@@ -13,6 +61,8 @@ void scratch_cuda_circuit_bootstrap_vertical_packing_32(
uint32_t number_of_inputs, uint32_t tau, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
checks_fast_circuit_bootstrap_vertical_packing(polynomial_size);
switch (polynomial_size) {
case 256:
scratch_circuit_bootstrap_vertical_packing<uint32_t, int32_t, Degree<256>>(
@@ -68,6 +118,8 @@ void scratch_cuda_circuit_bootstrap_vertical_packing_64(
uint32_t number_of_inputs, uint32_t tau, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
checks_fast_circuit_bootstrap_vertical_packing(polynomial_size);
switch (polynomial_size) {
case 256:
scratch_circuit_bootstrap_vertical_packing<uint64_t, int64_t, Degree<256>>(
@@ -124,6 +176,7 @@ void scratch_cuda_wop_pbs_32(
uint32_t number_of_bits_of_message_including_padding,
uint32_t number_of_bits_to_extract, uint32_t number_of_inputs,
uint32_t max_shared_memory, bool allocate_gpu_memory) {
checks_wop_pbs(polynomial_size, level_count_bsk, number_of_inputs);
switch (polynomial_size) {
case 256:
scratch_wop_pbs<uint32_t, int32_t, Degree<256>>(
@@ -192,6 +245,7 @@ void scratch_cuda_wop_pbs_64(
uint32_t number_of_bits_of_message_including_padding,
uint32_t number_of_bits_to_extract, uint32_t number_of_inputs,
uint32_t max_shared_memory, bool allocate_gpu_memory) {
checks_wop_pbs(polynomial_size, level_count_bsk, number_of_inputs);
switch (polynomial_size) {
case 256:
scratch_wop_pbs<uint64_t, int64_t, Degree<256>>(
@@ -282,22 +336,10 @@ void cuda_circuit_bootstrap_vertical_packing_64(
uint32_t level_count_pksk, uint32_t base_log_pksk, uint32_t level_count_cbs,
uint32_t base_log_cbs, uint32_t number_of_inputs, uint32_t lut_number,
uint32_t max_shared_memory) {
assert(("Error (GPU circuit bootstrap): polynomial_size should be one of "
"256, 512, 1024, 2048, 4096, 8192",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
// The number of inputs should be lower than the number of streaming
// multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
// to the occupancy of 50%). The only supported value for k is 1, so
// k + 1 = 2 for now.
int number_of_sm = 0;
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
assert(("Error (GPU extract bits): the number of input LWEs must be lower or "
"equal to the "
"number of streaming multiprocessors on the device divided by 8 * "
"level_count_bsk",
number_of_inputs <= number_of_sm / 4. / 2. / level_count_bsk));
checks_circuit_bootstrap_vertical_packing(polynomial_size, number_of_inputs,
level_count_bsk);
switch (polynomial_size) {
case 256:
host_circuit_bootstrap_vertical_packing<uint64_t, int64_t, Degree<256>>(
@@ -411,22 +453,7 @@ void cuda_wop_pbs_64(void *v_stream, uint32_t gpu_index, void *lwe_array_out,
uint32_t number_of_bits_of_message_including_padding,
uint32_t number_of_bits_to_extract, uint32_t delta_log,
uint32_t number_of_inputs, uint32_t max_shared_memory) {
assert(("Error (GPU WOP PBS): polynomial_size should be one of "
"256, 512, 1024, 2048, 4096, 8192",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
// The number of inputs should be lower than the number of streaming
// multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
// to the occupancy of 50%). The only supported value for k is 1, so
// k + 1 = 2 for now.
int number_of_sm = 0;
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
assert(("Error (GPU WOP PBS): the number of input LWEs must be lower or "
"equal to the "
"number of streaming multiprocessors on the device divided by 8 * "
"level_count_bsk",
number_of_inputs <= number_of_sm / 4. / 2. / level_count_bsk));
checks_wop_pbs(polynomial_size, level_count_bsk, number_of_inputs);
switch (polynomial_size) {
case 256:
host_wop_pbs<uint64_t, int64_t, Degree<256>>(