mirror of
https://github.com/zama-ai/concrete.git
synced 2026-04-17 03:00:54 -04:00
fix(backend-gpu): avoid warp branching in the PBS
This commit is contained in:
2
.github/workflows/concrete_cuda_test.yml
vendored
2
.github/workflows/concrete_cuda_test.yml
vendored
@@ -26,6 +26,8 @@ jobs:
|
||||
- uses: actions/checkout@v3
|
||||
- name: Set up Clang
|
||||
run: |
|
||||
sudo sed -i 's/azure\.//' /etc/apt/sources.list
|
||||
sudo apt-get update
|
||||
sudo apt-get install clang-format-11
|
||||
- name: Install cmake-format
|
||||
run: |
|
||||
|
||||
@@ -95,8 +95,8 @@ extern "C" void cuda_boolean_and_32(
|
||||
|
||||
int8_t *pbs_buffer = nullptr;
|
||||
scratch_cuda_bootstrap_low_latency_32(
|
||||
v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size, pbs_level_count,
|
||||
input_lwe_ciphertext_count, max_shared_memory, true);
|
||||
v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
|
||||
pbs_level_count, input_lwe_ciphertext_count, max_shared_memory, true);
|
||||
cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
|
||||
v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes,
|
||||
lwe_buffer_2, bootstrapping_key, pbs_buffer, input_lwe_dimension,
|
||||
@@ -203,8 +203,8 @@ extern "C" void cuda_boolean_nand_32(
|
||||
|
||||
int8_t *pbs_buffer = nullptr;
|
||||
scratch_cuda_bootstrap_low_latency_32(
|
||||
v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size, pbs_level_count,
|
||||
input_lwe_ciphertext_count, max_shared_memory, true);
|
||||
v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
|
||||
pbs_level_count, input_lwe_ciphertext_count, max_shared_memory, true);
|
||||
cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
|
||||
v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes,
|
||||
lwe_buffer_3, bootstrapping_key, pbs_buffer, input_lwe_dimension,
|
||||
@@ -311,8 +311,8 @@ extern "C" void cuda_boolean_nor_32(
|
||||
|
||||
int8_t *pbs_buffer = nullptr;
|
||||
scratch_cuda_bootstrap_low_latency_32(
|
||||
v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size, pbs_level_count,
|
||||
input_lwe_ciphertext_count, max_shared_memory, true);
|
||||
v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
|
||||
pbs_level_count, input_lwe_ciphertext_count, max_shared_memory, true);
|
||||
cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
|
||||
v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes,
|
||||
lwe_buffer_3, bootstrapping_key, pbs_buffer, input_lwe_dimension,
|
||||
@@ -411,8 +411,8 @@ extern "C" void cuda_boolean_or_32(
|
||||
|
||||
int8_t *pbs_buffer = nullptr;
|
||||
scratch_cuda_bootstrap_low_latency_32(
|
||||
v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size, pbs_level_count,
|
||||
input_lwe_ciphertext_count, max_shared_memory, true);
|
||||
v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
|
||||
pbs_level_count, input_lwe_ciphertext_count, max_shared_memory, true);
|
||||
cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
|
||||
v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes,
|
||||
lwe_buffer_2, bootstrapping_key, pbs_buffer, input_lwe_dimension,
|
||||
@@ -532,8 +532,8 @@ extern "C" void cuda_boolean_xor_32(
|
||||
|
||||
int8_t *pbs_buffer = nullptr;
|
||||
scratch_cuda_bootstrap_low_latency_32(
|
||||
v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size, pbs_level_count,
|
||||
input_lwe_ciphertext_count, max_shared_memory, true);
|
||||
v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
|
||||
pbs_level_count, input_lwe_ciphertext_count, max_shared_memory, true);
|
||||
cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
|
||||
v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes,
|
||||
lwe_buffer_3, bootstrapping_key, pbs_buffer, input_lwe_dimension,
|
||||
@@ -660,8 +660,8 @@ extern "C" void cuda_boolean_xnor_32(
|
||||
|
||||
int8_t *pbs_buffer = nullptr;
|
||||
scratch_cuda_bootstrap_low_latency_32(
|
||||
v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size, pbs_level_count,
|
||||
input_lwe_ciphertext_count, max_shared_memory, true);
|
||||
v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
|
||||
pbs_level_count, input_lwe_ciphertext_count, max_shared_memory, true);
|
||||
cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
|
||||
v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes,
|
||||
lwe_buffer_4, bootstrapping_key, pbs_buffer, input_lwe_dimension,
|
||||
|
||||
@@ -98,6 +98,7 @@ uint64_t get_buffer_size_bootstrap_low_latency_64(
|
||||
input_lwe_ciphertext_count, max_shared_memory);
|
||||
break;
|
||||
default:
|
||||
return 0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,6 +3,9 @@
|
||||
#include "device.h"
|
||||
#include "utils/timer.cuh"
|
||||
|
||||
// Return A if C == 0 and B if C == 1
|
||||
#define SEL(A, B, C) ((-(C) & ((A) ^ (B))) ^ (A))
|
||||
|
||||
/*
|
||||
* function compresses decomposed buffer into half size complex buffer for fft
|
||||
*/
|
||||
@@ -83,18 +86,22 @@ __device__ void divide_by_monomial_negacyclic_inplace(T *accumulator, T *input,
|
||||
tid = threadIdx.x;
|
||||
for (int i = 0; i < elems_per_thread; i++) {
|
||||
if (j < degree) {
|
||||
if (tid < degree - j) {
|
||||
accumulator_slice[tid] = input_slice[tid + j];
|
||||
} else {
|
||||
accumulator_slice[tid] = -input_slice[tid - degree + j];
|
||||
}
|
||||
// if (tid < degree - j)
|
||||
// accumulator_slice[tid] = input_slice[tid + j];
|
||||
// else
|
||||
// accumulator_slice[tid] = -input_slice[tid - degree + j];
|
||||
int x = tid + j - SEL(degree, 0, tid < degree - j);
|
||||
accumulator_slice[tid] =
|
||||
SEL(-1, 1, tid < degree - j) * input_slice[x];
|
||||
} else {
|
||||
uint32_t jj = j - degree;
|
||||
if (tid < degree - jj) {
|
||||
accumulator_slice[tid] = -input_slice[tid + jj];
|
||||
} else {
|
||||
accumulator_slice[tid] = input_slice[tid - degree + jj];
|
||||
}
|
||||
int32_t jj = j - degree;
|
||||
// if (tid < degree - jj)
|
||||
// accumulator_slice[tid] = -input_slice[tid + jj];
|
||||
// else
|
||||
// accumulator_slice[tid] = input_slice[tid - degree + jj];
|
||||
int x = tid + jj - SEL(degree, 0, tid < degree - jj);
|
||||
accumulator_slice[tid] =
|
||||
SEL(1, -1, tid < degree - jj) * input_slice[x];
|
||||
}
|
||||
tid += block_size;
|
||||
}
|
||||
@@ -120,19 +127,22 @@ __device__ void multiply_by_monomial_negacyclic_and_sub_polynomial(
|
||||
int tid = threadIdx.x;
|
||||
for (int i = 0; i < elems_per_thread; i++) {
|
||||
if (j < degree) {
|
||||
if (tid < j) {
|
||||
result_acc_slice[tid] = -acc_slice[tid - j + degree] - acc_slice[tid];
|
||||
} else {
|
||||
result_acc_slice[tid] = acc_slice[tid - j] - acc_slice[tid];
|
||||
}
|
||||
// if (tid < j)
|
||||
// result_acc_slice[tid] = -acc_slice[tid - j + degree]-acc_slice[tid];
|
||||
// else
|
||||
// result_acc_slice[tid] = acc_slice[tid - j] - acc_slice[tid];
|
||||
int x = tid - j + SEL(0, degree, tid < j);
|
||||
result_acc_slice[tid] =
|
||||
SEL(1, -1, tid < j) * acc_slice[x] - acc_slice[tid];
|
||||
} else {
|
||||
uint32_t jj = j - degree;
|
||||
if (tid < jj) {
|
||||
result_acc_slice[tid] = acc_slice[tid - jj + degree] - acc_slice[tid];
|
||||
|
||||
} else {
|
||||
result_acc_slice[tid] = -acc_slice[tid - jj] - acc_slice[tid];
|
||||
}
|
||||
int32_t jj = j - degree;
|
||||
// if (tid < jj)
|
||||
// result_acc_slice[tid] = acc_slice[tid - jj + degree]-acc_slice[tid];
|
||||
// else
|
||||
// result_acc_slice[tid] = -acc_slice[tid - jj] - acc_slice[tid];
|
||||
int x = tid - jj + SEL(0, degree, tid < jj);
|
||||
result_acc_slice[tid] =
|
||||
SEL(-1, 1, tid < jj) * acc_slice[x] - acc_slice[tid];
|
||||
}
|
||||
tid += block_size;
|
||||
}
|
||||
@@ -249,18 +259,12 @@ __device__ void sample_extract_mask(Torus *lwe_array_out, Torus *accumulator,
|
||||
tid = threadIdx.x;
|
||||
result[params::opt];
|
||||
for (int i = 0; i < params::opt; i++) {
|
||||
if (1 < params::degree) {
|
||||
if (tid < 1)
|
||||
result[i] = -accumulator_slice[tid - 1 + params::degree];
|
||||
else
|
||||
result[i] = accumulator_slice[tid - 1];
|
||||
} else {
|
||||
uint32_t jj = 1 - (uint32_t)params::degree;
|
||||
if (tid < jj)
|
||||
result[i] = accumulator_slice[tid - jj + params::degree];
|
||||
else
|
||||
result[i] = -accumulator_slice[tid - jj];
|
||||
}
|
||||
// if (tid < 1)
|
||||
// result[i] = -accumulator_slice[tid - 1 + params::degree];
|
||||
// else
|
||||
// result[i] = accumulator_slice[tid - 1];
|
||||
int x = tid - 1 + SEL(0, params::degree, tid < 1);
|
||||
result[i] = SEL(1, -1, tid < 1) * accumulator_slice[x];
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
synchronize_threads_in_block();
|
||||
|
||||
@@ -71,132 +71,6 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
__device__ void multiply_by_monomial_negacyclic(Polynomial<T, params> &result,
|
||||
uint32_t j) {
|
||||
int tid = threadIdx.x;
|
||||
for (int i = 0; i < params::opt; i++) {
|
||||
if (j < params::degree) {
|
||||
if (tid < j)
|
||||
result.coefficients[tid] =
|
||||
-this->coefficients[tid - j + params::degree];
|
||||
else
|
||||
result.coefficients[tid] = this->coefficients[tid - j];
|
||||
} else {
|
||||
uint32_t jj = j - params::degree;
|
||||
if (tid < jj)
|
||||
result.coefficients[tid] =
|
||||
this->coefficients[tid - jj + params::degree];
|
||||
else
|
||||
result.coefficients[tid] = -this->coefficients[tid - jj];
|
||||
}
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
}
|
||||
|
||||
__device__ void multiply_by_monomial_negacyclic_inplace(uint32_t j) {
|
||||
int tid = threadIdx.x;
|
||||
T result[params::opt];
|
||||
for (int i = 0; i < params::opt; i++) {
|
||||
if (j < params::degree) {
|
||||
if (tid < j)
|
||||
result[i] = -this->coefficients[tid - j + params::degree];
|
||||
else
|
||||
result[i] = this->coefficients[tid - j];
|
||||
} else {
|
||||
uint32_t jj = j - params::degree;
|
||||
if (tid < jj)
|
||||
result[i] = this->coefficients[tid - jj + params::degree];
|
||||
else
|
||||
result[i] = -this->coefficients[tid - jj];
|
||||
}
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
synchronize_threads_in_block();
|
||||
tid = threadIdx.x;
|
||||
for (int i = 0; i < params::opt; i++) {
|
||||
coefficients[tid] = result[i];
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
synchronize_threads_in_block();
|
||||
}
|
||||
|
||||
__device__ void multiply_by_monomial_negacyclic_and_sub_polynomial(
|
||||
Polynomial<T, params> &result, uint32_t j) {
|
||||
int tid = threadIdx.x;
|
||||
for (int i = 0; i < params::opt; i++) {
|
||||
if (j < params::degree) {
|
||||
if (tid < j)
|
||||
result.coefficients[tid] =
|
||||
-this->coefficients[tid - j + params::degree] -
|
||||
this->coefficients[tid];
|
||||
else
|
||||
result.coefficients[tid] =
|
||||
this->coefficients[tid - j] - this->coefficients[tid];
|
||||
} else {
|
||||
uint32_t jj = j - params::degree;
|
||||
if (tid < jj)
|
||||
result.coefficients[tid] =
|
||||
this->coefficients[tid - jj + params::degree] -
|
||||
this->coefficients[tid];
|
||||
else
|
||||
result.coefficients[tid] =
|
||||
-this->coefficients[tid - jj] - this->coefficients[tid];
|
||||
}
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
}
|
||||
|
||||
__device__ void divide_by_monomial_negacyclic(Polynomial<T, params> &result,
|
||||
uint32_t j) {
|
||||
int tid = threadIdx.x;
|
||||
for (int i = 0; i < params::opt; i++) {
|
||||
if (j < params::degree) {
|
||||
if (tid < params::degree - j) {
|
||||
result.coefficients[tid] = this->coefficients[tid + j];
|
||||
} else {
|
||||
result.coefficients[tid] =
|
||||
-this->coefficients[tid - params::degree + j];
|
||||
}
|
||||
} else {
|
||||
uint32_t jj = j - params::degree;
|
||||
if (tid < params::degree - jj) {
|
||||
result.coefficients[tid] = -this->coefficients[tid + jj];
|
||||
} else {
|
||||
result.coefficients[tid] =
|
||||
this->coefficients[tid - params::degree + jj];
|
||||
}
|
||||
}
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
}
|
||||
|
||||
__device__ void divide_by_monomial_negacyclic_inplace(uint32_t j) {
|
||||
int tid = threadIdx.x;
|
||||
T result[params::opt];
|
||||
for (int i = 0; i < params::opt; i++) {
|
||||
if (j < params::degree) {
|
||||
if (tid < params::degree - j) {
|
||||
result[i] = this->coefficients[tid + j];
|
||||
} else {
|
||||
result[i] = -this->coefficients[tid - params::degree + j];
|
||||
}
|
||||
} else {
|
||||
uint32_t jj = j - params::degree;
|
||||
if (tid < params::degree - jj) {
|
||||
result[i] = -this->coefficients[tid + jj];
|
||||
} else {
|
||||
result[i] = this->coefficients[tid - params::degree + jj];
|
||||
}
|
||||
}
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
tid = threadIdx.x;
|
||||
for (int i = 0; i < params::opt; i++) {
|
||||
coefficients[tid] = result[i];
|
||||
tid = tid + params::degree / params::opt;
|
||||
}
|
||||
}
|
||||
|
||||
__device__ void round_to_closest_multiple_inplace(uint32_t base_log,
|
||||
uint32_t level_count) {
|
||||
int tid = threadIdx.x;
|
||||
|
||||
Reference in New Issue
Block a user