chore(cuda): add asserts on base log, poly size and num samples values

Agnes Leroy
2022-10-14 11:13:58 +02:00
committed by Agnès Leroy
parent 1a76cadaa8
commit 703c74401c
4 changed files with 84 additions and 3 deletions

View File

@@ -73,6 +73,11 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
uint32_t lwe_idx,
uint32_t max_shared_memory) {
assert(("Error (GPU amortized PBS): base log should be <= 16", base_log <= 16));
assert(("Error (GPU amortized PBS): polynomial size should be one of 512, 1024, 2048, 4096, 8192",
polynomial_size == 512 || polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
switch (polynomial_size) {
case 512:
host_bootstrap_amortized<uint32_t, Degree<512>>(
@@ -131,6 +136,11 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
uint32_t lwe_idx,
uint32_t max_shared_memory) {
assert(("Error (GPU amortized PBS): base log should be <= 16", base_log <= 16));
assert(("Error (GPU amortized PBS): polynomial size should be one of 512, 1024, 2048, 4096, 8192",
polynomial_size == 512 || polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
switch (polynomial_size) {
case 512:
host_bootstrap_amortized<uint64_t, Degree<512>>(
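
A note on the assert form used throughout this commit: assert(("message", condition)) wraps a comma expression in an extra pair of parentheses, so the string literal is evaluated and discarded and only the condition decides the assert, while the stringified expression (message included) appears in the diagnostic a failed assert prints. A minimal standalone sketch of the idiom, not code from this commit:

#include <cassert>
#include <cstdint>

void check_base_log(uint32_t base_log) {
  // The extra parentheses make the argument a single comma expression:
  // the string is the discarded left operand, the condition on the right
  // determines the result. On failure, assert prints the whole stringified
  // expression, message included.
  assert(("base log should be <= 16", base_log <= 16));
}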

View File

@@ -72,6 +72,18 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
uint32_t lwe_idx,
uint32_t max_shared_memory) {
assert(("Error (GPU low latency PBS): base log should be <= 16", base_log <= 16));
assert(("Error (GPU low latency PBS): polynomial size should be one of 512, 1024, 2048",
polynomial_size == 512 || polynomial_size == 1024 || polynomial_size == 2048));
// The number of samples must be lower than or equal to the number of
// streaming multiprocessors divided by (4 * (k + 1) * l), the factor 4
// accounting for the 50% occupancy. The only supported value for k is 1,
// so k + 1 = 2 for now.
int number_of_sm = 0;
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
assert(("Error (GPU low latency PBS): the number of input LWEs must be lower than or equal "
"to the number of streaming multiprocessors on the device divided by 8 * l_gadget",
num_samples <= number_of_sm / 4. / 2. / l_gadget));
switch (polynomial_size) {
case 512:
host_bootstrap_low_latency<uint32_t, Degree<512>>(
@@ -134,6 +146,18 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
uint32_t lwe_idx,
uint32_t max_shared_memory) {
assert(("Error (GPU low latency PBS): base log should be <= 16", base_log <= 16));
assert(("Error (GPU low latency PBS): polynomial size should be one of 512, 1024, 2048",
polynomial_size == 512 || polynomial_size == 1024 || polynomial_size == 2048));
// The number of samples must be lower than or equal to the number of
// streaming multiprocessors divided by (4 * (k + 1) * l), the factor 4
// accounting for the 50% occupancy. The only supported value for k is 1,
// so k + 1 = 2 for now.
int number_of_sm = 0;
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
assert(("Error (GPU low latency PBS): the number of input LWEs must be lower than or equal "
"to the number of streaming multiprocessors on the device divided by 8 * l_gadget",
num_samples <= number_of_sm / 4. / 2. / l_gadget));
switch (polynomial_size) {
case 512:
host_bootstrap_low_latency<uint64_t, Degree<512>>(
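
For reference, the bound these asserts check can be computed up front. The sketch below is illustrative, not part of the commit; the helper name is hypothetical, and it hard-codes k = 1 so the divisor 4 * (k + 1) * l_gadget reduces to 8 * l_gadget:

#include <cuda_runtime.h>

// Hypothetical helper: the largest num_samples the low latency PBS
// accepts on device 0, assuming k = 1 and the factor 4 for 50% occupancy.
int max_low_latency_samples(int l_gadget) {
  int number_of_sm = 0;
  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
  // num_samples must satisfy:
  //   num_samples <= number_of_sm / (4 * (k + 1) * l_gadget), with k = 1
  return number_of_sm / (4 * 2 * l_gadget);
}

On a device with 108 streaming multiprocessors and l_gadget = 1, for example, this caps a batch at 13 samples per call.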

View File

@@ -12,6 +12,15 @@ void cuda_cmux_tree_32(
uint32_t r,
uint32_t max_shared_memory) {
assert(("Error (GPU Cmux tree): base log should be <= 16", base_log <= 16));
assert(("Error (GPU Cmux tree): polynomial size should be one of 512, 1024, 2048, 4096, 8192",
polynomial_size == 512 || polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
// For larger k we will need to adjust the mask size
assert(("Error (GPU Cmux tree): glwe_dimension should be equal to 1", glwe_dimension == 1));
assert(("Error (GPU Cmux tree): r, the number of layers in the tree, should be >= 1 ",
r >= 1));
switch (polynomial_size) {
case 512:
host_cmux_tree<uint32_t, int32_t, Degree<512>>(
@@ -48,6 +57,8 @@ void cuda_cmux_tree_32(
glwe_dimension, polynomial_size, base_log, l_gadget, r,
max_shared_memory);
break;
default:
break;
}
}
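
The switch statements above (now with an explicit default: break;) all follow one pattern: a runtime polynomial_size is mapped to a compile-time Degree<N> template argument, and the preceding assert guarantees only sizes with a matching case reach the switch. A condensed sketch of the pattern, with hypothetical names standing in for the real host functions:

#include <cstdint>

template <int N> struct Degree { static constexpr int value = N; };

// Hypothetical stand-in for host_cmux_tree and friends; the real
// functions launch kernels specialized on params::value.
template <typename Torus, class params> void host_op() {}

void dispatch(uint32_t polynomial_size) {
  switch (polynomial_size) {
  case 512:
    host_op<uint64_t, Degree<512>>();
    break;
  case 1024:
    host_op<uint64_t, Degree<1024>>();
    break;
  default:
    break; // unsupported sizes: caught by the assert above in debug builds
  }
}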
@@ -63,6 +74,15 @@ void cuda_cmux_tree_64(
uint32_t r,
uint32_t max_shared_memory) {
assert(("Error (GPU Cmux tree): base log should be <= 16", base_log <= 16));
assert(("Error (GPU Cmux tree): polynomial size should be one of 512, 1024, 2048, 4096, 8192",
polynomial_size == 512 || polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
// For larger k we will need to adjust the mask size
assert(("Error (GPU Cmux tree): glwe_dimension should be equal to 1", glwe_dimension == 1));
assert(("Error (GPU Cmux tree): r, the number of layers in the tree, should be >= 1 ",
r >= 1));
switch (polynomial_size) {
case 512:
host_cmux_tree<uint64_t, int64_t, Degree<512>>(
@@ -99,6 +119,8 @@ void cuda_cmux_tree_64(
glwe_dimension, polynomial_size, base_log, l_gadget, r,
max_shared_memory);
break;
default:
break;
}
}
@@ -125,6 +147,20 @@ void cuda_extract_bits_32(
uint32_t l_gadget_ksk,
uint32_t number_of_samples)
{
assert(("Error (GPU extract bits): base log should be <= 16", base_log_bsk <= 16));
assert(("Error (GPU extract bits): lwe_dimension_before should be one of 512, 1024, 2048",
lwe_dimension_before == 512 || lwe_dimension_before == 1024 ||
lwe_dimension_before == 2048));
// The number of samples must be lower than or equal to the number of
// streaming multiprocessors divided by (4 * (k + 1) * l), the factor 4
// accounting for the 50% occupancy. The only supported value for k is 1,
// so k + 1 = 2 for now.
int number_of_sm = 0;
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
assert(("Error (GPU extract bits): the number of input LWEs must be lower than or equal "
"to the number of streaming multiprocessors on the device divided by 8 * l_gadget_bsk",
number_of_samples <= number_of_sm / 4. / 2. / l_gadget_bsk));
switch (lwe_dimension_before) {
case 512:
host_extract_bits<uint32_t, Degree<512>>(
@@ -186,6 +222,20 @@ void cuda_extract_bits_64(
uint32_t l_gadget_ksk,
uint32_t number_of_samples)
{
assert(("Error (GPU extract bits): base log should be <= 16", base_log_bsk <= 16));
assert(("Error (GPU extract bits): lwe_dimension_before should be one of 512, 1024, 2048",
lwe_dimension_before == 512 || lwe_dimension_before == 1024 ||
lwe_dimension_before == 2048));
// The number of samples must be lower than or equal to the number of
// streaming multiprocessors divided by (4 * (k + 1) * l), the factor 4
// accounting for the 50% occupancy. The only supported value for k is 1,
// so k + 1 = 2 for now.
int number_of_sm = 0;
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
assert(("Error (GPU extract bits): the number of input LWEs must be lower than or equal "
"to the number of streaming multiprocessors on the device divided by 8 * l_gadget_bsk",
number_of_samples <= number_of_sm / 4. / 2. / l_gadget_bsk));
switch (lwe_dimension_before) {
case 512:
host_extract_bits<uint64_t, Degree<512>>(

View File

@@ -298,9 +298,6 @@ void host_cmux_tree(
uint32_t r,
uint32_t max_shared_memory) {
assert(glwe_dimension == 1); // For larger k we will need to adjust the mask size
assert(r >= 1);
auto stream = static_cast<cudaStream_t *>(v_stream);
int num_lut = (1<<r);
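
The r >= 1 checks added at the API level match this sizing: a tree with r layers selects among num_lut = 2^r lookup tables, so r = 0 would leave nothing to select. A small illustration of the count, with a hypothetical function name:

#include <cassert>
#include <cstdint>

// Illustration of the tree sizing: r layers of cmux gates reduce
// 2^r candidate LUTs down to a single selected one.
void cmux_tree_shape(uint32_t r) {
  assert(r >= 1);
  uint32_t num_lut = 1u << r; // as in host_cmux_tree above
  for (uint32_t layer = 0; layer < r; ++layer)
    num_lut /= 2; // each layer halves the number of candidates
  assert(num_lut == 1);
}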