mirror of
https://github.com/zama-ai/concrete.git
synced 2026-02-08 11:35:02 -05:00
chore(concrete_cuda): Add checks to validate inputs passed to the scratch functions
This commit is contained in:
@@ -1,5 +1,41 @@
|
||||
#include "bit_extraction.cuh"
|
||||
|
||||
/*
 * Validates the bit extraction inputs that do not depend on the base log.
 *
 * - nbits: bit width of the inputs (32 or 64 at the call sites); not used by
 *   the checks below.
 * - polynomial_size: must be one of 256, 512, 1024, 2048, 4096, 8192.
 * - level_count_bsk: divides the upper bound on number_of_samples.
 * - number_of_samples: number of input LWEs, bounded by the number of
 *   streaming multiprocessors of device 0.
 *
 * All checks are `assert`s, so they are compiled out when NDEBUG is defined.
 */
void checks_fast_extract_bits(int nbits, int polynomial_size,
                              int level_count_bsk, int number_of_samples) {

  assert(("Error (GPU extract bits): polynomial_size should be one of "
          "256, 512, 1024, 2048, 4096, 8192",
          polynomial_size == 256 || polynomial_size == 512 ||
              polynomial_size == 1024 || polynomial_size == 2048 ||
              polynomial_size == 4096 || polynomial_size == 8192));
  // The number of samples should be lower than or equal to the number of
  // streaming multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being
  // related to the occupancy of 50%). The only supported value for k is 1, so
  // k + 1 = 2 for now.
  int number_of_sm = 0;
  // The query always targets device 0. Its status is checked so that a failed
  // query is not reported as "too many input LWEs" (number_of_sm would stay 0).
  cudaError_t sm_query_status =
      cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
  assert(("Error (GPU extract bits): could not query the number of streaming "
          "multiprocessors of the device",
          sm_query_status == cudaSuccess));
  (void)sm_query_status; // only read by the assert above
  assert(("Error (GPU extract bits): the number of input LWEs must be lower or "
          "equal to the "
          "number of streaming multiprocessors on the device divided by 8 * "
          "level_count_bsk",
          number_of_samples <= number_of_sm / 4. / 2. / level_count_bsk));
}
|
||||
|
||||
/*
 * Full input validation for the bit extraction: checks that the base log fits
 * in `nbits`, then delegates the remaining checks (polynomial size, number of
 * samples) to checks_fast_extract_bits.
 *
 * The checks are `assert`s, so they are compiled out when NDEBUG is defined.
 */
void checks_extract_bits(int nbits, int polynomial_size, int base_log_bsk,
                         int level_count_bsk, int number_of_samples) {

  const bool base_log_fits = base_log_bsk <= nbits;
  assert(("Error (GPU extract bits): base log should be <= nbits",
          base_log_fits));
  (void)base_log_fits; // only read by the assert above

  checks_fast_extract_bits(nbits, polynomial_size, level_count_bsk,
                           number_of_samples);
}
|
||||
|
||||
/*
|
||||
* This scratch function allocates the necessary amount of data on the GPU for
|
||||
* the bit extraction on 32 bits inputs, into `cbs_buffer`. It also
|
||||
@@ -11,6 +47,8 @@ void scratch_cuda_extract_bits_32(
|
||||
uint32_t level_count, uint32_t number_of_inputs, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory) {
|
||||
|
||||
checks_fast_extract_bits(32, polynomial_size, level_count, number_of_inputs);
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
scratch_extract_bits<uint32_t, int32_t, Degree<256>>(
|
||||
@@ -63,6 +101,7 @@ void scratch_cuda_extract_bits_64(
|
||||
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t number_of_inputs, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory) {
|
||||
checks_fast_extract_bits(64, polynomial_size, level_count, number_of_inputs);
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
@@ -119,24 +158,8 @@ void cuda_extract_bits_32(void *v_stream, uint32_t gpu_index,
|
||||
uint32_t level_count_bsk, uint32_t base_log_ksk,
|
||||
uint32_t level_count_ksk, uint32_t number_of_samples,
|
||||
uint32_t max_shared_memory) {
|
||||
assert(("Error (GPU extract bits): base log should be <= 32",
|
||||
base_log_bsk <= 32));
|
||||
assert(("Error (GPU extract bits): polynomial_size should be one of "
|
||||
"256, 512, 1024, 2048, 4096, 8192",
|
||||
polynomial_size == 256 || polynomial_size == 512 ||
|
||||
polynomial_size == 1024 || polynomial_size == 2048 ||
|
||||
polynomial_size == 4096 || polynomial_size == 8192));
|
||||
// The number of samples should be lower than four time the number of
|
||||
// streaming multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being
|
||||
// related to the occupancy of 50%). The only supported value for k is 1, so
|
||||
// k + 1 = 2 for now.
|
||||
int number_of_sm = 0;
|
||||
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
|
||||
assert(("Error (GPU extract bits): the number of input LWEs must be lower or "
|
||||
"equal to the "
|
||||
"number of streaming multiprocessors on the device divided by 8 * "
|
||||
"level_count_bsk",
|
||||
number_of_samples <= number_of_sm / 4. / 2. / level_count_bsk));
|
||||
checks_extract_bits(32, polynomial_size, base_log_bsk, level_count_bsk,
|
||||
number_of_samples);
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
@@ -253,24 +276,8 @@ void cuda_extract_bits_64(void *v_stream, uint32_t gpu_index,
|
||||
uint32_t level_count_bsk, uint32_t base_log_ksk,
|
||||
uint32_t level_count_ksk, uint32_t number_of_samples,
|
||||
uint32_t max_shared_memory) {
|
||||
assert(("Error (GPU extract bits): base log should be <= 64",
|
||||
base_log_bsk <= 64));
|
||||
assert(("Error (GPU extract bits): polynomial_size should be one of "
|
||||
"256, 512, 1024, 2048, 4096, 8192",
|
||||
polynomial_size == 256 || polynomial_size == 512 ||
|
||||
polynomial_size == 1024 || polynomial_size == 2048 ||
|
||||
polynomial_size == 4096 || polynomial_size == 8192));
|
||||
// The number of samples should be lower than four time the number of
|
||||
// streaming multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being
|
||||
// related to the occupancy of 50%). The only supported value for k is 1, so
|
||||
// k + 1 = 2 for now.
|
||||
int number_of_sm = 0;
|
||||
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
|
||||
assert(("Error (GPU extract bits): the number of input LWEs must be lower or "
|
||||
"equal to the "
|
||||
"number of streaming multiprocessors on the device divided by 8 * "
|
||||
"level_count_bsk",
|
||||
number_of_samples <= number_of_sm / 4. / 2. / level_count_bsk));
|
||||
checks_extract_bits(64, polynomial_size, base_log_bsk, level_count_bsk,
|
||||
number_of_samples);
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
|
||||
@@ -1,5 +1,26 @@
|
||||
#include "bootstrap_amortized.cuh"
|
||||
|
||||
/*
 * Validates the amortized PBS inputs that do not depend on the base log:
 * `polynomial_size` must be one of 256, 512, 1024, 2048, 4096, 8192.
 *
 * `nbits` (32 or 64 at the call sites) is accepted but not used here.
 * The check is an `assert`, so it is compiled out when NDEBUG is defined.
 */
void checks_fast_bootstrap_amortized(int nbits, int polynomial_size) {
  bool size_is_supported = false;
  switch (polynomial_size) {
  case 256:
  case 512:
  case 1024:
  case 2048:
  case 4096:
  case 8192:
    size_is_supported = true;
    break;
  default:
    break;
  }
  assert(
      ("Error (GPU amortized PBS): polynomial size should be one of 256, 512, "
       "1024, 2048, 4096, 8192",
       size_is_supported));
  (void)size_is_supported; // only read by the assert above
}
|
||||
|
||||
/*
 * Full input validation for the amortized PBS: checks that the base log fits
 * in `nbits`, then delegates the polynomial size check to
 * checks_fast_bootstrap_amortized.
 *
 * The checks are `assert`s, so they are compiled out when NDEBUG is defined.
 */
void checks_bootstrap_amortized(int nbits, int base_log, int polynomial_size) {
  const bool base_log_fits = base_log <= nbits;
  assert(("Error (GPU amortized PBS): base log should be <= nbits",
          base_log_fits));
  (void)base_log_fits; // only read by the assert above

  checks_fast_bootstrap_amortized(nbits, polynomial_size);
}
|
||||
|
||||
/*
|
||||
* This scratch function allocates the necessary amount of data on the GPU for
|
||||
* the amortized PBS on 32 bits inputs, into `pbs_buffer`. It also
|
||||
@@ -13,6 +34,7 @@ void scratch_cuda_bootstrap_amortized_32(void *v_stream, uint32_t gpu_index,
|
||||
uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory) {
|
||||
checks_fast_bootstrap_amortized(32, polynomial_size);
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
@@ -63,6 +85,7 @@ void scratch_cuda_bootstrap_amortized_64(void *v_stream, uint32_t gpu_index,
|
||||
uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory) {
|
||||
checks_fast_bootstrap_amortized(64, polynomial_size);
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
@@ -111,14 +134,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
|
||||
uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
|
||||
uint32_t max_shared_memory) {
|
||||
|
||||
assert(
|
||||
("Error (GPU amortized PBS): base log should be <= 32", base_log <= 32));
|
||||
assert(
|
||||
("Error (GPU amortized PBS): polynomial size should be one of 256, 512, "
|
||||
"1024, 2048, 4096, 8192",
|
||||
polynomial_size == 256 || polynomial_size == 512 ||
|
||||
polynomial_size == 1024 || polynomial_size == 2048 ||
|
||||
polynomial_size == 4096 || polynomial_size == 8192));
|
||||
checks_bootstrap_amortized(32, base_log, polynomial_size);
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
@@ -247,14 +263,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
|
||||
uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
|
||||
uint32_t max_shared_memory) {
|
||||
|
||||
assert(
|
||||
("Error (GPU amortized PBS): base log should be <= 64", base_log <= 64));
|
||||
assert(
|
||||
("Error (GPU amortized PBS): polynomial size should be one of 256, 512, "
|
||||
"1024, 2048, 4096, 8192",
|
||||
polynomial_size == 256 || polynomial_size == 512 ||
|
||||
polynomial_size == 1024 || polynomial_size == 2048 ||
|
||||
polynomial_size == 4096 || polynomial_size == 8192));
|
||||
checks_bootstrap_amortized(64, base_log, polynomial_size);
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
|
||||
@@ -1,5 +1,41 @@
|
||||
#include "bootstrap_low_latency.cuh"
|
||||
|
||||
/*
 * Validates the low latency PBS inputs that do not depend on the base log.
 *
 * - nbits: bit width of the inputs (32 or 64 at the call sites); not used by
 *   the checks below.
 * - level_count: divides the upper bound on num_samples.
 * - polynomial_size: must be one of 256, 512, 1024, 2048, 4096, 8192.
 * - num_samples: number of input LWEs, bounded by
 *   4 * (number of SMs of device 0) / (2 * level_count).
 *
 * All checks are `assert`s, so they are compiled out when NDEBUG is defined.
 */
void checks_fast_bootstrap_low_latency(int nbits, int level_count,
                                       int polynomial_size, int num_samples) {

  assert((
      "Error (GPU low latency PBS): polynomial size should be one of 256, 512, "
      "1024, 2048, 4096, 8192",
      polynomial_size == 256 || polynomial_size == 512 ||
          polynomial_size == 1024 || polynomial_size == 2048 ||
          polynomial_size == 4096 || polynomial_size == 8192));
  // The number of samples should be lower than or equal to
  // 4 * SM/((k + 1) * l) (the factor 4 being related to the occupancy of 50%).
  // The only supported value for k is 1, so k + 1 = 2 for now.
  int number_of_sm = 0;
  // The query always targets device 0. Its status is checked so that a failed
  // query is not reported as "too many input LWEs" (number_of_sm would stay 0).
  cudaError_t sm_query_status =
      cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
  assert(("Error (GPU low latency PBS): could not query the number of "
          "streaming multiprocessors of the device",
          sm_query_status == cudaSuccess));
  (void)sm_query_status; // only read by the assert above
  // The message states the bound that is actually enforced below; it used to
  // claim "divided by 8 * level_count", which did not match the condition.
  assert(("Error (GPU low latency PBS): the number of input LWEs must be lower "
          "or equal to 4 times the number of streaming multiprocessors on the "
          "device divided by 2 * level_count",
          num_samples <= number_of_sm * 4. / 2. / level_count));
}
|
||||
|
||||
/*
 * Full input validation for the low latency PBS: checks that the base log
 * fits in `nbits`, then delegates the remaining checks (polynomial size,
 * number of samples) to checks_fast_bootstrap_low_latency.
 *
 * The checks are `assert`s, so they are compiled out when NDEBUG is defined.
 */
void checks_bootstrap_low_latency(int nbits, int level_count, int base_log,
                                  int polynomial_size, int num_samples) {

  const bool base_log_fits = base_log <= nbits;
  assert(("Error (GPU low latency PBS): base log should be <= nbits",
          base_log_fits));
  (void)base_log_fits; // only read by the assert above

  checks_fast_bootstrap_low_latency(nbits, level_count, polynomial_size,
                                    num_samples);
}
|
||||
|
||||
/*
|
||||
* This scratch function allocates the necessary amount of data on the GPU for
|
||||
* the low latency PBS on 32 bits inputs, into `pbs_buffer`. It also
|
||||
@@ -11,6 +47,8 @@ void scratch_cuda_bootstrap_low_latency_32(
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory) {
|
||||
checks_fast_bootstrap_low_latency(32, level_count, polynomial_size,
|
||||
input_lwe_ciphertext_count);
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
@@ -65,6 +103,8 @@ void scratch_cuda_bootstrap_low_latency_64(
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory) {
|
||||
checks_fast_bootstrap_low_latency(64, level_count, polynomial_size,
|
||||
input_lwe_ciphertext_count);
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
@@ -123,24 +163,8 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
|
||||
uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
|
||||
uint32_t max_shared_memory) {
|
||||
|
||||
assert(("Error (GPU low latency PBS): base log should be <= 32",
|
||||
base_log <= 32));
|
||||
assert((
|
||||
"Error (GPU low latency PBS): polynomial size should be one of 256, 512, "
|
||||
"1024, 2048, 4096, 8192",
|
||||
polynomial_size == 256 || polynomial_size == 512 ||
|
||||
polynomial_size == 1024 || polynomial_size == 2048 ||
|
||||
polynomial_size == 4096 || polynomial_size == 8192));
|
||||
// The number of samples should be lower than 4 * SM/((k + 1) * l) (the
|
||||
// factor 4 being related to the occupancy of 50%). The only supported
|
||||
// value for k is 1, so k + 1 = 2 for now.
|
||||
int number_of_sm = 0;
|
||||
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
|
||||
assert(("Error (GPU low latency PBS): the number of input LWEs must be lower "
|
||||
"or equal to the "
|
||||
"number of streaming multiprocessors on the device divided by 8 * "
|
||||
"level_count",
|
||||
num_samples <= number_of_sm * 4. / 2. / level_count));
|
||||
checks_bootstrap_low_latency(32, level_count, base_log, polynomial_size,
|
||||
num_samples);
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
@@ -280,24 +304,8 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
|
||||
uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
|
||||
uint32_t max_shared_memory) {
|
||||
|
||||
assert(("Error (GPU low latency PBS): base log should be <= 64",
|
||||
base_log <= 64));
|
||||
assert((
|
||||
"Error (GPU low latency PBS): polynomial size should be one of 256, 512, "
|
||||
"1024, 2048, 4096, 8192",
|
||||
polynomial_size == 256 || polynomial_size == 512 ||
|
||||
polynomial_size == 1024 || polynomial_size == 2048 ||
|
||||
polynomial_size == 4096 || polynomial_size == 8192));
|
||||
// The number of samples should be lower than 4 * SM/((k + 1) * l) (the
|
||||
// factor 4 being related to the occupancy of 50%). The only supported
|
||||
// value for k is 1, so k + 1 = 2 for now.
|
||||
int number_of_sm = 0;
|
||||
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
|
||||
assert(("Error (GPU low latency PBS): the number of input LWEs must be lower "
|
||||
"or equal to the "
|
||||
"number of streaming multiprocessors on the device divided by 8 * "
|
||||
"level_count",
|
||||
num_samples <= number_of_sm * 4. / 2. / level_count));
|
||||
checks_bootstrap_low_latency(64, level_count, base_log, polynomial_size,
|
||||
num_samples);
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
|
||||
@@ -1,6 +1,38 @@
|
||||
#include "circuit_bootstrap.cuh"
|
||||
#include "circuit_bootstrap.h"
|
||||
|
||||
/*
 * Validates the circuit bootstrap inputs that do not require a device query:
 * `polynomial_size` must be one of 256, 512, 1024, 2048, 4096, 8192.
 *
 * `number_of_inputs` is accepted but not used here.
 * The check is an `assert`, so it is compiled out when NDEBUG is defined.
 */
void checks_fast_circuit_bootstrap(int polynomial_size, int number_of_inputs) {

  bool size_is_supported = false;
  switch (polynomial_size) {
  case 256:
  case 512:
  case 1024:
  case 2048:
  case 4096:
  case 8192:
    size_is_supported = true;
    break;
  default:
    break;
  }
  assert(("Error (GPU circuit bootstrap): polynomial_size should be one of "
          "256, 512, 1024, 2048, 4096, 8192",
          size_is_supported));
  (void)size_is_supported; // only read by the assert above
}
|
||||
|
||||
/*
 * Full input validation for the circuit bootstrap.
 *
 * - polynomial_size: checked by checks_fast_circuit_bootstrap.
 * - level_bsk: divides the upper bound on number_of_inputs.
 * - number_of_inputs: number of input LWEs, bounded by the number of
 *   streaming multiprocessors of device 0 divided by 8 * level_bsk.
 *
 * All checks are `assert`s, so they are compiled out when NDEBUG is defined.
 */
void checks_circuit_bootstrap(int polynomial_size, int level_bsk,
                              int number_of_inputs) {
  // The number of samples should be lower than or equal to the number of
  // streaming multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being
  // related to the occupancy of 50%). The only supported value for k is 1, so
  // k + 1 = 2 for now.
  int number_of_sm = 0;
  // The query always targets device 0. Its status is checked so that a failed
  // query is not reported as "too many input LWEs" (number_of_sm would stay 0).
  cudaError_t sm_query_status =
      cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
  assert(("Error (GPU circuit bootstrap): could not query the number of "
          "streaming multiprocessors of the device",
          sm_query_status == cudaSuccess));
  (void)sm_query_status; // only read by the assert above
  // The message names this operation and this function's parameter; it used
  // to carry the "GPU extract bits" prefix and refer to "level_count_bsk".
  assert(("Error (GPU circuit bootstrap): the number of input LWEs must be "
          "lower or equal to the "
          "number of streaming multiprocessors on the device divided by 8 * "
          "level_bsk",
          number_of_inputs <= number_of_sm / 4. / 2. / level_bsk));

  checks_fast_circuit_bootstrap(polynomial_size, number_of_inputs);
}
|
||||
|
||||
/*
|
||||
* This scratch function allocates the necessary amount of data on the GPU for
|
||||
* the circuit bootstrap on 32 bits inputs, into `cbs_buffer`. It also
|
||||
@@ -12,6 +44,8 @@ void scratch_cuda_circuit_bootstrap_32(
|
||||
uint32_t level_count_cbs, uint32_t number_of_inputs,
|
||||
uint32_t max_shared_memory, bool allocate_gpu_memory) {
|
||||
|
||||
checks_fast_circuit_bootstrap(polynomial_size, number_of_inputs);
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
scratch_circuit_bootstrap<uint32_t, int32_t, Degree<256>>(
|
||||
@@ -65,6 +99,8 @@ void scratch_cuda_circuit_bootstrap_64(
|
||||
uint32_t level_count_cbs, uint32_t number_of_inputs,
|
||||
uint32_t max_shared_memory, bool allocate_gpu_memory) {
|
||||
|
||||
checks_fast_circuit_bootstrap(polynomial_size, number_of_inputs);
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
scratch_circuit_bootstrap<uint64_t, int64_t, Degree<256>>(
|
||||
@@ -119,22 +155,9 @@ void cuda_circuit_bootstrap_32(
|
||||
uint32_t base_log_bsk, uint32_t level_pksk, uint32_t base_log_pksk,
|
||||
uint32_t level_cbs, uint32_t base_log_cbs, uint32_t number_of_inputs,
|
||||
uint32_t max_shared_memory) {
|
||||
assert(("Error (GPU circuit bootstrap): polynomial_size should be one of "
|
||||
"256, 512, 1024, 2048, 4096, 8192",
|
||||
polynomial_size == 256 || polynomial_size == 512 ||
|
||||
polynomial_size == 1024 || polynomial_size == 2048 ||
|
||||
polynomial_size == 4096 || polynomial_size == 8192));
|
||||
// The number of samples should be lower than the number of streaming
|
||||
// multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
|
||||
// to the occupancy of 50%). The only supported value for k is 1, so
|
||||
// k + 1 = 2 for now.
|
||||
int number_of_sm = 0;
|
||||
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
|
||||
assert(("Error (GPU extract bits): the number of input LWEs must be lower or "
|
||||
"equal to the "
|
||||
"number of streaming multiprocessors on the device divided by 8 * "
|
||||
"level_count_bsk",
|
||||
number_of_inputs <= number_of_sm / 4. / 2. / level_bsk));
|
||||
|
||||
checks_circuit_bootstrap(polynomial_size, level_bsk, number_of_inputs);
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
host_circuit_bootstrap<uint32_t, Degree<256>>(
|
||||
@@ -228,23 +251,9 @@ void cuda_circuit_bootstrap_64(
|
||||
uint32_t base_log_bsk, uint32_t level_pksk, uint32_t base_log_pksk,
|
||||
uint32_t level_cbs, uint32_t base_log_cbs, uint32_t number_of_inputs,
|
||||
uint32_t max_shared_memory) {
|
||||
assert(("Error (GPU circuit bootstrap): polynomial_size should be one of "
|
||||
"256, 512, 1024, 2048, 4096, 8192",
|
||||
polynomial_size == 256 || polynomial_size == 512 ||
|
||||
polynomial_size == 1024 || polynomial_size == 2048 ||
|
||||
polynomial_size == 4096 || polynomial_size == 8192));
|
||||
// The number of samples should be lower than the number of streaming
|
||||
// multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
|
||||
// to the occupancy of 50%). The only supported value for k is 1, so
|
||||
// k + 1 = 2 for now.
|
||||
int number_of_sm = 0;
|
||||
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
|
||||
assert(("Error (GPU extract bits): the number of input LWEs must be lower or "
|
||||
"equal to the "
|
||||
"number of streaming multiprocessors on the device divided by 8 * "
|
||||
"level_count_bsk",
|
||||
number_of_inputs <= number_of_sm / 4. / 2. / level_bsk));
|
||||
// The number of samples should be lower than the number of streaming
|
||||
|
||||
checks_circuit_bootstrap(polynomial_size, level_bsk, number_of_inputs);
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
host_circuit_bootstrap<uint64_t, Degree<256>>(
|
||||
|
||||
@@ -2,6 +2,46 @@
|
||||
#include "vertical_packing.h"
|
||||
#include <cassert>
|
||||
|
||||
/*
 * Validates the Cmux tree inputs that do not depend on the base log:
 * - `polynomial_size` must be one of 256, 512, 1024, 2048, 4096, 8192;
 * - `r`, the number of layers in the tree, must be >= 1.
 *
 * `nbits` (32 or 64 at the call sites) is accepted but not used here.
 * The checks are `assert`s, so they are compiled out when NDEBUG is defined.
 */
void checks_fast_cmux_tree(int nbits, int polynomial_size, int r) {
  bool size_is_supported = false;
  switch (polynomial_size) {
  case 256:
  case 512:
  case 1024:
  case 2048:
  case 4096:
  case 8192:
    size_is_supported = true;
    break;
  default:
    break;
  }
  assert((
      "Error (GPU Cmux tree): polynomial size should be one of 256, 512, 1024, "
      "2048, 4096, 8192",
      size_is_supported));
  (void)size_is_supported; // only read by the assert above

  // NOTE(review): the original comment here read "For larger k we will need
  // to adjust the mask size"; no k is visible in this function - confirm
  // whether it still applies.
  const bool has_at_least_one_layer = r >= 1;
  assert(("Error (GPU Cmux tree): r, the number of layers in the tree, should "
          "be >= 1 ",
          has_at_least_one_layer));
  (void)has_at_least_one_layer; // only read by the assert above
}
|
||||
|
||||
/*
 * Full input validation for the Cmux tree: checks that the base log fits in
 * `nbits`, then delegates the remaining checks (polynomial size, number of
 * layers `r`) to checks_fast_cmux_tree.
 *
 * The checks are `assert`s, so they are compiled out when NDEBUG is defined.
 */
void checks_cmux_tree(int nbits, int polynomial_size, int base_log, int r) {

  const bool base_log_fits = base_log <= nbits;
  assert(("Error (GPU Cmux tree): base log should be <= nbits",
          base_log_fits));
  (void)base_log_fits; // only read by the assert above

  checks_fast_cmux_tree(nbits, polynomial_size, r);
}
|
||||
|
||||
/*
 * Validates the inputs of the blind rotation + sample extraction:
 * `polynomial_size` must be one of 256, 512, 1024, 2048, 4096, 8192.
 *
 * The check is an `assert`, so it is compiled out when NDEBUG is defined.
 */
void checks_blind_rotation_and_sample_extraction(int polynomial_size) {

  bool size_is_supported = false;
  switch (polynomial_size) {
  case 256:
  case 512:
  case 1024:
  case 2048:
  case 4096:
  case 8192:
    size_is_supported = true;
    break;
  default:
    break;
  }
  assert(("Error (GPU Blind rotation + sample extraction): polynomial size "
          "should be one of 256, 512, "
          "1024, "
          "2048, 4096, 8192",
          size_is_supported));
  (void)size_is_supported; // only read by the assert above
}
|
||||
|
||||
/*
|
||||
* This scratch function allocates the necessary amount of data on the GPU for
|
||||
* the Cmux tree on 32 bits inputs, into `cmux_tree_buffer`. It also configures
|
||||
@@ -14,6 +54,7 @@ void scratch_cuda_cmux_tree_32(void *v_stream, uint32_t gpu_index,
|
||||
uint32_t r, uint32_t tau,
|
||||
uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory) {
|
||||
checks_fast_cmux_tree(32, polynomial_size, r);
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
@@ -63,6 +104,8 @@ void scratch_cuda_cmux_tree_64(void *v_stream, uint32_t gpu_index,
|
||||
uint32_t r, uint32_t tau,
|
||||
uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory) {
|
||||
checks_fast_cmux_tree(64, polynomial_size, r);
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
scratch_cmux_tree<uint64_t, int64_t, Degree<256>>(
|
||||
@@ -110,17 +153,7 @@ void cuda_cmux_tree_32(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
|
||||
uint32_t level_count, uint32_t r, uint32_t tau,
|
||||
uint32_t max_shared_memory) {
|
||||
|
||||
assert(("Error (GPU Cmux tree): base log should be <= 32", base_log <= 32));
|
||||
assert((
|
||||
"Error (GPU Cmux tree): polynomial size should be one of 256, 512, 1024, "
|
||||
"2048, 4096, 8192",
|
||||
polynomial_size == 256 || polynomial_size == 512 ||
|
||||
polynomial_size == 1024 || polynomial_size == 2048 ||
|
||||
polynomial_size == 4096 || polynomial_size == 8192));
|
||||
// For larger k we will need to adjust the mask size
|
||||
assert(("Error (GPU Cmux tree): r, the number of layers in the tree, should "
|
||||
"be >= 1 ",
|
||||
r >= 1));
|
||||
checks_cmux_tree(32, polynomial_size, base_log, r);
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
@@ -197,18 +230,7 @@ void cuda_cmux_tree_64(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
|
||||
uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t r, uint32_t tau,
|
||||
uint32_t max_shared_memory) {
|
||||
|
||||
assert(("Error (GPU Cmux tree): base log should be <= 64", base_log <= 64));
|
||||
assert((
|
||||
"Error (GPU Cmux tree): polynomial size should be one of 256, 512, 1024, "
|
||||
"2048, 4096, 8192",
|
||||
polynomial_size == 256 || polynomial_size == 512 ||
|
||||
polynomial_size == 1024 || polynomial_size == 2048 ||
|
||||
polynomial_size == 4096 || polynomial_size == 8192));
|
||||
// For larger k we will need to adjust the mask size
|
||||
assert(("Error (GPU Cmux tree): r, the number of layers in the tree, should "
|
||||
"be >= 1 ",
|
||||
r >= 1));
|
||||
checks_cmux_tree(64, polynomial_size, base_log, r);
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
@@ -273,6 +295,7 @@ void scratch_cuda_blind_rotation_sample_extraction_32(
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t mbr_size, uint32_t tau, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory) {
|
||||
checks_blind_rotation_and_sample_extraction(polynomial_size);
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
@@ -320,6 +343,7 @@ void scratch_cuda_blind_rotation_sample_extraction_64(
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t mbr_size, uint32_t tau, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory) {
|
||||
checks_blind_rotation_and_sample_extraction(polynomial_size);
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
@@ -386,6 +410,7 @@ void cuda_blind_rotate_and_sample_extraction_64(
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t l_gadget, uint32_t max_shared_memory) {
|
||||
|
||||
checks_blind_rotation_and_sample_extraction(polynomial_size);
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
host_blind_rotate_and_sample_extraction<uint64_t, int64_t, Degree<256>>(
|
||||
|
||||
@@ -1,5 +1,53 @@
|
||||
#include "wop_bootstrap.cuh"
|
||||
|
||||
/*
 * Validates the inputs of the WOP PBS.
 *
 * - polynomial_size: must be one of 256, 512, 1024, 2048, 4096, 8192.
 * - level_count_bsk: divides the upper bound on number_of_inputs.
 * - number_of_inputs: number of input LWEs, bounded by the number of
 *   streaming multiprocessors of device 0 divided by 8 * level_count_bsk.
 *
 * All checks are `assert`s, so they are compiled out when NDEBUG is defined.
 */
void checks_wop_pbs(int polynomial_size, int level_count_bsk,
                    int number_of_inputs) {
  assert(("Error (GPU WOP PBS): polynomial_size should be one of "
          "256, 512, 1024, 2048, 4096, 8192",
          polynomial_size == 256 || polynomial_size == 512 ||
              polynomial_size == 1024 || polynomial_size == 2048 ||
              polynomial_size == 4096 || polynomial_size == 8192));
  // The number of inputs should be lower than or equal to the number of
  // streaming multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being
  // related to the occupancy of 50%). The only supported value for k is 1, so
  // k + 1 = 2 for now.
  int number_of_sm = 0;
  // The query always targets device 0. Its status is checked so that a failed
  // query is not reported as "too many input LWEs" (number_of_sm would stay 0).
  cudaError_t sm_query_status =
      cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
  assert(("Error (GPU WOP PBS): could not query the number of streaming "
          "multiprocessors of the device",
          sm_query_status == cudaSuccess));
  (void)sm_query_status; // only read by the assert above
  assert(("Error (GPU WOP PBS): the number of input LWEs must be lower or "
          "equal to the "
          "number of streaming multiprocessors on the device divided by 8 * "
          "level_count_bsk",
          number_of_inputs <= number_of_sm / 4. / 2. / level_count_bsk));
}
|
||||
|
||||
/*
 * Validates the circuit bootstrap + vertical packing inputs that do not
 * require a device query: `polynomial_size` must be one of 256, 512, 1024,
 * 2048, 4096, 8192.
 *
 * The check is an `assert`, so it is compiled out when NDEBUG is defined.
 */
void checks_fast_circuit_bootstrap_vertical_packing(int polynomial_size) {
  bool size_is_supported = false;
  switch (polynomial_size) {
  case 256:
  case 512:
  case 1024:
  case 2048:
  case 4096:
  case 8192:
    size_is_supported = true;
    break;
  default:
    break;
  }
  assert(("Error (GPU circuit bootstrap): polynomial_size should be one of "
          "256, 512, 1024, 2048, 4096, 8192",
          size_is_supported));
  (void)size_is_supported; // only read by the assert above
}
|
||||
|
||||
/*
 * Full input validation for the circuit bootstrap + vertical packing.
 *
 * - polynomial_size: checked by
 *   checks_fast_circuit_bootstrap_vertical_packing.
 * - number_of_inputs: number of input LWEs, bounded by the number of
 *   streaming multiprocessors of device 0 divided by 8 * level_count_bsk.
 * - level_count_bsk: divides the upper bound on number_of_inputs.
 *
 * All checks are `assert`s, so they are compiled out when NDEBUG is defined.
 */
void checks_circuit_bootstrap_vertical_packing(int polynomial_size,
                                               int number_of_inputs,
                                               int level_count_bsk) {
  // The number of inputs should be lower than or equal to the number of
  // streaming multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being
  // related to the occupancy of 50%). The only supported value for k is 1, so
  // k + 1 = 2 for now.
  int number_of_sm = 0;
  // The query always targets device 0. Its status is checked so that a failed
  // query is not reported as "too many input LWEs" (number_of_sm would stay 0).
  cudaError_t sm_query_status =
      cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
  assert(("Error (GPU circuit bootstrap): could not query the number of "
          "streaming multiprocessors of the device",
          sm_query_status == cudaSuccess));
  (void)sm_query_status; // only read by the assert above
  // The prefix matches the one used by the fast check of this operation; it
  // used to read "GPU extract bits".
  assert(("Error (GPU circuit bootstrap): the number of input LWEs must be "
          "lower or equal to the "
          "number of streaming multiprocessors on the device divided by 8 * "
          "level_count_bsk",
          number_of_inputs <= number_of_sm / 4. / 2. / level_count_bsk));
  checks_fast_circuit_bootstrap_vertical_packing(polynomial_size);
}
|
||||
|
||||
/*
|
||||
* This scratch function allocates the necessary amount of data on the GPU for
|
||||
* the circuit bootstrap and vertical packing on 32 bits inputs, into
|
||||
@@ -13,6 +61,8 @@ void scratch_cuda_circuit_bootstrap_vertical_packing_32(
|
||||
uint32_t number_of_inputs, uint32_t tau, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory) {
|
||||
|
||||
checks_fast_circuit_bootstrap_vertical_packing(polynomial_size);
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
scratch_circuit_bootstrap_vertical_packing<uint32_t, int32_t, Degree<256>>(
|
||||
@@ -68,6 +118,8 @@ void scratch_cuda_circuit_bootstrap_vertical_packing_64(
|
||||
uint32_t number_of_inputs, uint32_t tau, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory) {
|
||||
|
||||
checks_fast_circuit_bootstrap_vertical_packing(polynomial_size);
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
scratch_circuit_bootstrap_vertical_packing<uint64_t, int64_t, Degree<256>>(
|
||||
@@ -124,6 +176,7 @@ void scratch_cuda_wop_pbs_32(
|
||||
uint32_t number_of_bits_of_message_including_padding,
|
||||
uint32_t number_of_bits_to_extract, uint32_t number_of_inputs,
|
||||
uint32_t max_shared_memory, bool allocate_gpu_memory) {
|
||||
checks_wop_pbs(polynomial_size, level_count_bsk, number_of_inputs);
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
scratch_wop_pbs<uint32_t, int32_t, Degree<256>>(
|
||||
@@ -192,6 +245,7 @@ void scratch_cuda_wop_pbs_64(
|
||||
uint32_t number_of_bits_of_message_including_padding,
|
||||
uint32_t number_of_bits_to_extract, uint32_t number_of_inputs,
|
||||
uint32_t max_shared_memory, bool allocate_gpu_memory) {
|
||||
checks_wop_pbs(polynomial_size, level_count_bsk, number_of_inputs);
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
scratch_wop_pbs<uint64_t, int64_t, Degree<256>>(
|
||||
@@ -282,22 +336,10 @@ void cuda_circuit_bootstrap_vertical_packing_64(
|
||||
uint32_t level_count_pksk, uint32_t base_log_pksk, uint32_t level_count_cbs,
|
||||
uint32_t base_log_cbs, uint32_t number_of_inputs, uint32_t lut_number,
|
||||
uint32_t max_shared_memory) {
|
||||
assert(("Error (GPU circuit bootstrap): polynomial_size should be one of "
|
||||
"256, 512, 1024, 2048, 4096, 8192",
|
||||
polynomial_size == 256 || polynomial_size == 512 ||
|
||||
polynomial_size == 1024 || polynomial_size == 2048 ||
|
||||
polynomial_size == 4096 || polynomial_size == 8192));
|
||||
// The number of inputs should be lower than the number of streaming
|
||||
// multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
|
||||
// to the occupancy of 50%). The only supported value for k is 1, so
|
||||
// k + 1 = 2 for now.
|
||||
int number_of_sm = 0;
|
||||
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
|
||||
assert(("Error (GPU extract bits): the number of input LWEs must be lower or "
|
||||
"equal to the "
|
||||
"number of streaming multiprocessors on the device divided by 8 * "
|
||||
"level_count_bsk",
|
||||
number_of_inputs <= number_of_sm / 4. / 2. / level_count_bsk));
|
||||
|
||||
checks_circuit_bootstrap_vertical_packing(polynomial_size, number_of_inputs,
|
||||
level_count_bsk);
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
host_circuit_bootstrap_vertical_packing<uint64_t, int64_t, Degree<256>>(
|
||||
@@ -411,22 +453,7 @@ void cuda_wop_pbs_64(void *v_stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
uint32_t number_of_bits_of_message_including_padding,
|
||||
uint32_t number_of_bits_to_extract, uint32_t delta_log,
|
||||
uint32_t number_of_inputs, uint32_t max_shared_memory) {
|
||||
assert(("Error (GPU WOP PBS): polynomial_size should be one of "
|
||||
"256, 512, 1024, 2048, 4096, 8192",
|
||||
polynomial_size == 256 || polynomial_size == 512 ||
|
||||
polynomial_size == 1024 || polynomial_size == 2048 ||
|
||||
polynomial_size == 4096 || polynomial_size == 8192));
|
||||
// The number of inputs should be lower than the number of streaming
|
||||
// multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
|
||||
// to the occupancy of 50%). The only supported value for k is 1, so
|
||||
// k + 1 = 2 for now.
|
||||
int number_of_sm = 0;
|
||||
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
|
||||
assert(("Error (GPU WOP PBS): the number of input LWEs must be lower or "
|
||||
"equal to the "
|
||||
"number of streaming multiprocessors on the device divided by 8 * "
|
||||
"level_count_bsk",
|
||||
number_of_inputs <= number_of_sm / 4. / 2. / level_count_bsk));
|
||||
checks_wop_pbs(polynomial_size, level_count_bsk, number_of_inputs);
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
host_wop_pbs<uint64_t, int64_t, Degree<256>>(
|
||||
|
||||
Reference in New Issue
Block a user