fix(backend-gpu): avoid warp branching in the PBS

This commit is contained in:
Agnes Leroy
2023-05-15 11:20:08 +02:00
committed by Agnès Leroy
parent 577e7847c6
commit b174f81687
5 changed files with 54 additions and 173 deletions

View File

@@ -26,6 +26,8 @@ jobs:
- uses: actions/checkout@v3
- name: Set up Clang
run: |
sudo sed -i 's/azure\.//' /etc/apt/sources.list
sudo apt-get update
sudo apt-get install clang-format-11
- name: Install cmake-format
run: |

View File

@@ -95,8 +95,8 @@ extern "C" void cuda_boolean_and_32(
int8_t *pbs_buffer = nullptr;
scratch_cuda_bootstrap_low_latency_32(
v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size, pbs_level_count,
input_lwe_ciphertext_count, max_shared_memory, true);
v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
pbs_level_count, input_lwe_ciphertext_count, max_shared_memory, true);
cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes,
lwe_buffer_2, bootstrapping_key, pbs_buffer, input_lwe_dimension,
@@ -203,8 +203,8 @@ extern "C" void cuda_boolean_nand_32(
int8_t *pbs_buffer = nullptr;
scratch_cuda_bootstrap_low_latency_32(
v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size, pbs_level_count,
input_lwe_ciphertext_count, max_shared_memory, true);
v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
pbs_level_count, input_lwe_ciphertext_count, max_shared_memory, true);
cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes,
lwe_buffer_3, bootstrapping_key, pbs_buffer, input_lwe_dimension,
@@ -311,8 +311,8 @@ extern "C" void cuda_boolean_nor_32(
int8_t *pbs_buffer = nullptr;
scratch_cuda_bootstrap_low_latency_32(
v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size, pbs_level_count,
input_lwe_ciphertext_count, max_shared_memory, true);
v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
pbs_level_count, input_lwe_ciphertext_count, max_shared_memory, true);
cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes,
lwe_buffer_3, bootstrapping_key, pbs_buffer, input_lwe_dimension,
@@ -411,8 +411,8 @@ extern "C" void cuda_boolean_or_32(
int8_t *pbs_buffer = nullptr;
scratch_cuda_bootstrap_low_latency_32(
v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size, pbs_level_count,
input_lwe_ciphertext_count, max_shared_memory, true);
v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
pbs_level_count, input_lwe_ciphertext_count, max_shared_memory, true);
cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes,
lwe_buffer_2, bootstrapping_key, pbs_buffer, input_lwe_dimension,
@@ -532,8 +532,8 @@ extern "C" void cuda_boolean_xor_32(
int8_t *pbs_buffer = nullptr;
scratch_cuda_bootstrap_low_latency_32(
v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size, pbs_level_count,
input_lwe_ciphertext_count, max_shared_memory, true);
v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
pbs_level_count, input_lwe_ciphertext_count, max_shared_memory, true);
cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes,
lwe_buffer_3, bootstrapping_key, pbs_buffer, input_lwe_dimension,
@@ -660,8 +660,8 @@ extern "C" void cuda_boolean_xnor_32(
int8_t *pbs_buffer = nullptr;
scratch_cuda_bootstrap_low_latency_32(
v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size, pbs_level_count,
input_lwe_ciphertext_count, max_shared_memory, true);
v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
pbs_level_count, input_lwe_ciphertext_count, max_shared_memory, true);
cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes,
lwe_buffer_4, bootstrapping_key, pbs_buffer, input_lwe_dimension,

View File

@@ -98,6 +98,7 @@ uint64_t get_buffer_size_bootstrap_low_latency_64(
input_lwe_ciphertext_count, max_shared_memory);
break;
default:
return 0;
break;
}
}

View File

@@ -3,6 +3,9 @@
#include "device.h"
#include "utils/timer.cuh"
// Return A if C == 0 and B if C == 1
#define SEL(A, B, C) ((-(C) & ((A) ^ (B))) ^ (A))
/*
* function compresses decomposed buffer into half size complex buffer for fft
*/
@@ -83,18 +86,22 @@ __device__ void divide_by_monomial_negacyclic_inplace(T *accumulator, T *input,
tid = threadIdx.x;
for (int i = 0; i < elems_per_thread; i++) {
if (j < degree) {
if (tid < degree - j) {
accumulator_slice[tid] = input_slice[tid + j];
} else {
accumulator_slice[tid] = -input_slice[tid - degree + j];
}
// if (tid < degree - j)
// accumulator_slice[tid] = input_slice[tid + j];
// else
// accumulator_slice[tid] = -input_slice[tid - degree + j];
int x = tid + j - SEL(degree, 0, tid < degree - j);
accumulator_slice[tid] =
SEL(-1, 1, tid < degree - j) * input_slice[x];
} else {
uint32_t jj = j - degree;
if (tid < degree - jj) {
accumulator_slice[tid] = -input_slice[tid + jj];
} else {
accumulator_slice[tid] = input_slice[tid - degree + jj];
}
int32_t jj = j - degree;
// if (tid < degree - jj)
// accumulator_slice[tid] = -input_slice[tid + jj];
// else
// accumulator_slice[tid] = input_slice[tid - degree + jj];
int x = tid + jj - SEL(degree, 0, tid < degree - jj);
accumulator_slice[tid] =
SEL(1, -1, tid < degree - jj) * input_slice[x];
}
tid += block_size;
}
@@ -120,19 +127,22 @@ __device__ void multiply_by_monomial_negacyclic_and_sub_polynomial(
int tid = threadIdx.x;
for (int i = 0; i < elems_per_thread; i++) {
if (j < degree) {
if (tid < j) {
result_acc_slice[tid] = -acc_slice[tid - j + degree] - acc_slice[tid];
} else {
result_acc_slice[tid] = acc_slice[tid - j] - acc_slice[tid];
}
// if (tid < j)
// result_acc_slice[tid] = -acc_slice[tid - j + degree]-acc_slice[tid];
// else
// result_acc_slice[tid] = acc_slice[tid - j] - acc_slice[tid];
int x = tid - j + SEL(0, degree, tid < j);
result_acc_slice[tid] =
SEL(1, -1, tid < j) * acc_slice[x] - acc_slice[tid];
} else {
uint32_t jj = j - degree;
if (tid < jj) {
result_acc_slice[tid] = acc_slice[tid - jj + degree] - acc_slice[tid];
} else {
result_acc_slice[tid] = -acc_slice[tid - jj] - acc_slice[tid];
}
int32_t jj = j - degree;
// if (tid < jj)
// result_acc_slice[tid] = acc_slice[tid - jj + degree]-acc_slice[tid];
// else
// result_acc_slice[tid] = -acc_slice[tid - jj] - acc_slice[tid];
int x = tid - jj + SEL(0, degree, tid < jj);
result_acc_slice[tid] =
SEL(-1, 1, tid < jj) * acc_slice[x] - acc_slice[tid];
}
tid += block_size;
}
@@ -249,18 +259,12 @@ __device__ void sample_extract_mask(Torus *lwe_array_out, Torus *accumulator,
tid = threadIdx.x;
result[params::opt];
for (int i = 0; i < params::opt; i++) {
if (1 < params::degree) {
if (tid < 1)
result[i] = -accumulator_slice[tid - 1 + params::degree];
else
result[i] = accumulator_slice[tid - 1];
} else {
uint32_t jj = 1 - (uint32_t)params::degree;
if (tid < jj)
result[i] = accumulator_slice[tid - jj + params::degree];
else
result[i] = -accumulator_slice[tid - jj];
}
// if (tid < 1)
// result[i] = -accumulator_slice[tid - 1 + params::degree];
// else
// result[i] = accumulator_slice[tid - 1];
int x = tid - 1 + SEL(0, params::degree, tid < 1);
result[i] = SEL(1, -1, tid < 1) * accumulator_slice[x];
tid += params::degree / params::opt;
}
synchronize_threads_in_block();

View File

@@ -71,132 +71,6 @@ public:
}
}
__device__ void multiply_by_monomial_negacyclic(Polynomial<T, params> &result,
uint32_t j) {
int tid = threadIdx.x;
for (int i = 0; i < params::opt; i++) {
if (j < params::degree) {
if (tid < j)
result.coefficients[tid] =
-this->coefficients[tid - j + params::degree];
else
result.coefficients[tid] = this->coefficients[tid - j];
} else {
uint32_t jj = j - params::degree;
if (tid < jj)
result.coefficients[tid] =
this->coefficients[tid - jj + params::degree];
else
result.coefficients[tid] = -this->coefficients[tid - jj];
}
tid += params::degree / params::opt;
}
}
__device__ void multiply_by_monomial_negacyclic_inplace(uint32_t j) {
int tid = threadIdx.x;
T result[params::opt];
for (int i = 0; i < params::opt; i++) {
if (j < params::degree) {
if (tid < j)
result[i] = -this->coefficients[tid - j + params::degree];
else
result[i] = this->coefficients[tid - j];
} else {
uint32_t jj = j - params::degree;
if (tid < jj)
result[i] = this->coefficients[tid - jj + params::degree];
else
result[i] = -this->coefficients[tid - jj];
}
tid += params::degree / params::opt;
}
synchronize_threads_in_block();
tid = threadIdx.x;
for (int i = 0; i < params::opt; i++) {
coefficients[tid] = result[i];
tid += params::degree / params::opt;
}
synchronize_threads_in_block();
}
__device__ void multiply_by_monomial_negacyclic_and_sub_polynomial(
Polynomial<T, params> &result, uint32_t j) {
int tid = threadIdx.x;
for (int i = 0; i < params::opt; i++) {
if (j < params::degree) {
if (tid < j)
result.coefficients[tid] =
-this->coefficients[tid - j + params::degree] -
this->coefficients[tid];
else
result.coefficients[tid] =
this->coefficients[tid - j] - this->coefficients[tid];
} else {
uint32_t jj = j - params::degree;
if (tid < jj)
result.coefficients[tid] =
this->coefficients[tid - jj + params::degree] -
this->coefficients[tid];
else
result.coefficients[tid] =
-this->coefficients[tid - jj] - this->coefficients[tid];
}
tid += params::degree / params::opt;
}
}
__device__ void divide_by_monomial_negacyclic(Polynomial<T, params> &result,
uint32_t j) {
int tid = threadIdx.x;
for (int i = 0; i < params::opt; i++) {
if (j < params::degree) {
if (tid < params::degree - j) {
result.coefficients[tid] = this->coefficients[tid + j];
} else {
result.coefficients[tid] =
-this->coefficients[tid - params::degree + j];
}
} else {
uint32_t jj = j - params::degree;
if (tid < params::degree - jj) {
result.coefficients[tid] = -this->coefficients[tid + jj];
} else {
result.coefficients[tid] =
this->coefficients[tid - params::degree + jj];
}
}
tid += params::degree / params::opt;
}
}
__device__ void divide_by_monomial_negacyclic_inplace(uint32_t j) {
int tid = threadIdx.x;
T result[params::opt];
for (int i = 0; i < params::opt; i++) {
if (j < params::degree) {
if (tid < params::degree - j) {
result[i] = this->coefficients[tid + j];
} else {
result[i] = -this->coefficients[tid - params::degree + j];
}
} else {
uint32_t jj = j - params::degree;
if (tid < params::degree - jj) {
result[i] = -this->coefficients[tid + jj];
} else {
result[i] = this->coefficients[tid - params::degree + jj];
}
}
tid += params::degree / params::opt;
}
tid = threadIdx.x;
for (int i = 0; i < params::opt; i++) {
coefficients[tid] = result[i];
tid = tid + params::degree / params::opt;
}
}
__device__ void round_to_closest_multiple_inplace(uint32_t base_log,
uint32_t level_count) {
int tid = threadIdx.x;