mirror of
https://github.com/zama-ai/concrete.git
synced 2026-02-09 03:55:04 -05:00
refactor(cuda): Refactor the low latency PBS.
This commit is contained in:
@@ -15,8 +15,6 @@ void cuda_extract_bits_32(
|
||||
uint32_t max_shared_memory) {
|
||||
assert(("Error (GPU extract bits): base log should be <= 32",
|
||||
base_log_bsk <= 32));
|
||||
assert(("Error (GPU extract bits): glwe_dimension should be equal to 1",
|
||||
glwe_dimension == 1));
|
||||
assert(("Error (GPU extract bits): lwe_dimension_in should be one of "
|
||||
"512, 1024, 2048, 4096, 8192",
|
||||
lwe_dimension_in == 512 || lwe_dimension_in == 1024 ||
|
||||
@@ -44,8 +42,8 @@ void cuda_extract_bits_32(
|
||||
(uint32_t *)lwe_array_out_pbs_buffer, (uint32_t *)lut_pbs,
|
||||
(uint32_t *)lut_vector_indexes, (uint32_t *)ksk, (double2 *)fourier_bsk,
|
||||
number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
|
||||
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
|
||||
number_of_samples, max_shared_memory);
|
||||
glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk,
|
||||
level_count_ksk, number_of_samples, max_shared_memory);
|
||||
break;
|
||||
case 1024:
|
||||
host_extract_bits<uint32_t, Degree<1024>>(
|
||||
@@ -56,8 +54,8 @@ void cuda_extract_bits_32(
|
||||
(uint32_t *)lwe_array_out_pbs_buffer, (uint32_t *)lut_pbs,
|
||||
(uint32_t *)lut_vector_indexes, (uint32_t *)ksk, (double2 *)fourier_bsk,
|
||||
number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
|
||||
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
|
||||
number_of_samples, max_shared_memory);
|
||||
glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk,
|
||||
level_count_ksk, number_of_samples, max_shared_memory);
|
||||
break;
|
||||
case 2048:
|
||||
host_extract_bits<uint32_t, Degree<2048>>(
|
||||
@@ -68,8 +66,8 @@ void cuda_extract_bits_32(
|
||||
(uint32_t *)lwe_array_out_pbs_buffer, (uint32_t *)lut_pbs,
|
||||
(uint32_t *)lut_vector_indexes, (uint32_t *)ksk, (double2 *)fourier_bsk,
|
||||
number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
|
||||
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
|
||||
number_of_samples, max_shared_memory);
|
||||
glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk,
|
||||
level_count_ksk, number_of_samples, max_shared_memory);
|
||||
break;
|
||||
case 4096:
|
||||
host_extract_bits<uint32_t, Degree<4096>>(
|
||||
@@ -80,8 +78,8 @@ void cuda_extract_bits_32(
|
||||
(uint32_t *)lwe_array_out_pbs_buffer, (uint32_t *)lut_pbs,
|
||||
(uint32_t *)lut_vector_indexes, (uint32_t *)ksk, (double2 *)fourier_bsk,
|
||||
number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
|
||||
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
|
||||
number_of_samples, max_shared_memory);
|
||||
glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk,
|
||||
level_count_ksk, number_of_samples, max_shared_memory);
|
||||
break;
|
||||
case 8192:
|
||||
host_extract_bits<uint32_t, Degree<8192>>(
|
||||
@@ -92,8 +90,8 @@ void cuda_extract_bits_32(
|
||||
(uint32_t *)lwe_array_out_pbs_buffer, (uint32_t *)lut_pbs,
|
||||
(uint32_t *)lut_vector_indexes, (uint32_t *)ksk, (double2 *)fourier_bsk,
|
||||
number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
|
||||
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
|
||||
number_of_samples, max_shared_memory);
|
||||
glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk,
|
||||
level_count_ksk, number_of_samples, max_shared_memory);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
@@ -157,8 +155,6 @@ void cuda_extract_bits_64(
|
||||
uint32_t max_shared_memory) {
|
||||
assert(("Error (GPU extract bits): base log should be <= 64",
|
||||
base_log_bsk <= 64));
|
||||
assert(("Error (GPU extract bits): glwe_dimension should be equal to 1",
|
||||
glwe_dimension == 1));
|
||||
assert(("Error (GPU extract bits): lwe_dimension_in should be one of "
|
||||
"512, 1024, 2048, 4096, 8192",
|
||||
lwe_dimension_in == 512 || lwe_dimension_in == 1024 ||
|
||||
@@ -186,8 +182,8 @@ void cuda_extract_bits_64(
|
||||
(uint64_t *)lwe_array_out_pbs_buffer, (uint64_t *)lut_pbs,
|
||||
(uint32_t *)lut_vector_indexes, (uint64_t *)ksk, (double2 *)fourier_bsk,
|
||||
number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
|
||||
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
|
||||
number_of_samples, max_shared_memory);
|
||||
glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk,
|
||||
level_count_ksk, number_of_samples, max_shared_memory);
|
||||
break;
|
||||
case 1024:
|
||||
host_extract_bits<uint64_t, Degree<1024>>(
|
||||
@@ -198,8 +194,8 @@ void cuda_extract_bits_64(
|
||||
(uint64_t *)lwe_array_out_pbs_buffer, (uint64_t *)lut_pbs,
|
||||
(uint32_t *)lut_vector_indexes, (uint64_t *)ksk, (double2 *)fourier_bsk,
|
||||
number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
|
||||
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
|
||||
number_of_samples, max_shared_memory);
|
||||
glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk,
|
||||
level_count_ksk, number_of_samples, max_shared_memory);
|
||||
break;
|
||||
case 2048:
|
||||
host_extract_bits<uint64_t, Degree<2048>>(
|
||||
@@ -210,8 +206,8 @@ void cuda_extract_bits_64(
|
||||
(uint64_t *)lwe_array_out_pbs_buffer, (uint64_t *)lut_pbs,
|
||||
(uint32_t *)lut_vector_indexes, (uint64_t *)ksk, (double2 *)fourier_bsk,
|
||||
number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
|
||||
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
|
||||
number_of_samples, max_shared_memory);
|
||||
glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk,
|
||||
level_count_ksk, number_of_samples, max_shared_memory);
|
||||
break;
|
||||
case 4096:
|
||||
host_extract_bits<uint64_t, Degree<4096>>(
|
||||
@@ -222,8 +218,8 @@ void cuda_extract_bits_64(
|
||||
(uint64_t *)lwe_array_out_pbs_buffer, (uint64_t *)lut_pbs,
|
||||
(uint32_t *)lut_vector_indexes, (uint64_t *)ksk, (double2 *)fourier_bsk,
|
||||
number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
|
||||
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
|
||||
number_of_samples, max_shared_memory);
|
||||
glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk,
|
||||
level_count_ksk, number_of_samples, max_shared_memory);
|
||||
break;
|
||||
case 8192:
|
||||
host_extract_bits<uint64_t, Degree<8192>>(
|
||||
@@ -234,8 +230,8 @@ void cuda_extract_bits_64(
|
||||
(uint64_t *)lwe_array_out_pbs_buffer, (uint64_t *)lut_pbs,
|
||||
(uint32_t *)lut_vector_indexes, (uint64_t *)ksk, (double2 *)fourier_bsk,
|
||||
number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
|
||||
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
|
||||
number_of_samples, max_shared_memory);
|
||||
glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk,
|
||||
level_count_ksk, number_of_samples, max_shared_memory);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
|
||||
@@ -140,9 +140,9 @@ __host__ void host_extract_bits(
|
||||
Torus *lwe_array_out_pbs_buffer, Torus *lut_pbs,
|
||||
uint32_t *lut_vector_indexes, Torus *ksk, double2 *fourier_bsk,
|
||||
uint32_t number_of_bits, uint32_t delta_log, uint32_t lwe_dimension_in,
|
||||
uint32_t lwe_dimension_out, uint32_t base_log_bsk, uint32_t level_count_bsk,
|
||||
uint32_t base_log_ksk, uint32_t level_count_ksk, uint32_t number_of_samples,
|
||||
uint32_t max_shared_memory) {
|
||||
uint32_t lwe_dimension_out, uint32_t glwe_dimension, uint32_t base_log_bsk,
|
||||
uint32_t level_count_bsk, uint32_t base_log_ksk, uint32_t level_count_ksk,
|
||||
uint32_t number_of_samples, uint32_t max_shared_memory) {
|
||||
|
||||
cudaSetDevice(gpu_index);
|
||||
auto stream = static_cast<cudaStream_t *>(v_stream);
|
||||
@@ -189,8 +189,8 @@ __host__ void host_extract_bits(
|
||||
host_bootstrap_low_latency<Torus, params>(
|
||||
v_stream, gpu_index, lwe_array_out_pbs_buffer, lut_pbs,
|
||||
lut_vector_indexes, lwe_array_out_ks_buffer, fourier_bsk,
|
||||
lwe_dimension_out, lwe_dimension_in, base_log_bsk, level_count_bsk,
|
||||
number_of_samples, 1, max_shared_memory);
|
||||
glwe_dimension, lwe_dimension_out, lwe_dimension_in, base_log_bsk,
|
||||
level_count_bsk, number_of_samples, 1, max_shared_memory);
|
||||
|
||||
// Add alpha where alpha = delta*2^{bit_idx-1} to end up with an encryption
|
||||
// of 0 if the extracted bit was 0 and 1 in the other case
|
||||
|
||||
@@ -16,8 +16,6 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
|
||||
|
||||
assert(("Error (GPU low latency PBS): base log should be <= 32",
|
||||
base_log <= 32));
|
||||
assert(("Error (GPU low latency PBS): glwe_dimension should be equal to 1",
|
||||
glwe_dimension == 1));
|
||||
assert(("Error (GPU low latency PBS): polynomial size should be one of 512, "
|
||||
"1024, 2048, 4096, 8192",
|
||||
polynomial_size == 512 || polynomial_size == 1024 ||
|
||||
@@ -39,36 +37,41 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
|
||||
host_bootstrap_low_latency<uint32_t, Degree<512>>(
|
||||
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
|
||||
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
|
||||
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
|
||||
level_count, num_samples, num_lut_vectors, max_shared_memory);
|
||||
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
|
||||
max_shared_memory);
|
||||
break;
|
||||
case 1024:
|
||||
host_bootstrap_low_latency<uint32_t, Degree<1024>>(
|
||||
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
|
||||
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
|
||||
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
|
||||
level_count, num_samples, num_lut_vectors, max_shared_memory);
|
||||
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
|
||||
max_shared_memory);
|
||||
break;
|
||||
case 2048:
|
||||
host_bootstrap_low_latency<uint32_t, Degree<2048>>(
|
||||
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
|
||||
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
|
||||
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
|
||||
level_count, num_samples, num_lut_vectors, max_shared_memory);
|
||||
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
|
||||
max_shared_memory);
|
||||
break;
|
||||
case 4096:
|
||||
host_bootstrap_low_latency<uint32_t, Degree<4096>>(
|
||||
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
|
||||
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
|
||||
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
|
||||
level_count, num_samples, num_lut_vectors, max_shared_memory);
|
||||
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
|
||||
max_shared_memory);
|
||||
break;
|
||||
case 8192:
|
||||
host_bootstrap_low_latency<uint32_t, Degree<8192>>(
|
||||
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
|
||||
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
|
||||
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
|
||||
level_count, num_samples, num_lut_vectors, max_shared_memory);
|
||||
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
|
||||
max_shared_memory);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
@@ -160,8 +163,6 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
|
||||
|
||||
assert(("Error (GPU low latency PBS): base log should be <= 64",
|
||||
base_log <= 64));
|
||||
assert(("Error (GPU low latency PBS): glwe_dimension should be equal to 1",
|
||||
glwe_dimension == 1));
|
||||
assert(("Error (GPU low latency PBS): polynomial size should be one of 512, "
|
||||
"1024, 2048, 4096, 8192",
|
||||
polynomial_size == 512 || polynomial_size == 1024 ||
|
||||
@@ -183,36 +184,41 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
|
||||
host_bootstrap_low_latency<uint64_t, Degree<512>>(
|
||||
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
|
||||
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
|
||||
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
|
||||
level_count, num_samples, num_lut_vectors, max_shared_memory);
|
||||
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
|
||||
max_shared_memory);
|
||||
break;
|
||||
case 1024:
|
||||
host_bootstrap_low_latency<uint64_t, Degree<1024>>(
|
||||
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
|
||||
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
|
||||
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
|
||||
level_count, num_samples, num_lut_vectors, max_shared_memory);
|
||||
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
|
||||
max_shared_memory);
|
||||
break;
|
||||
case 2048:
|
||||
host_bootstrap_low_latency<uint64_t, Degree<2048>>(
|
||||
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
|
||||
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
|
||||
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
|
||||
level_count, num_samples, num_lut_vectors, max_shared_memory);
|
||||
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
|
||||
max_shared_memory);
|
||||
break;
|
||||
case 4096:
|
||||
host_bootstrap_low_latency<uint64_t, Degree<4096>>(
|
||||
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
|
||||
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
|
||||
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
|
||||
level_count, num_samples, num_lut_vectors, max_shared_memory);
|
||||
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
|
||||
max_shared_memory);
|
||||
break;
|
||||
case 8192:
|
||||
host_bootstrap_low_latency<uint64_t, Degree<8192>>(
|
||||
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
|
||||
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
|
||||
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
|
||||
level_count, num_samples, num_lut_vectors, max_shared_memory);
|
||||
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
|
||||
max_shared_memory);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
|
||||
@@ -25,11 +25,11 @@ using namespace cooperative_groups;
|
||||
namespace cg = cooperative_groups;
|
||||
|
||||
template <typename Torus, class params>
|
||||
__device__ void
|
||||
mul_ggsw_glwe(Torus *accumulator, double2 *fft, double2 *mask_join_buffer,
|
||||
double2 *body_join_buffer, double2 *bootstrapping_key,
|
||||
int polynomial_size, int level_count, int iteration,
|
||||
grid_group &grid) {
|
||||
__device__ void mul_ggsw_glwe(Torus *accumulator, double2 *fft,
|
||||
double2 *join_buffer, double2 *bootstrapping_key,
|
||||
int polynomial_size, uint32_t glwe_dimension,
|
||||
int level_count, int iteration,
|
||||
grid_group &grid) {
|
||||
|
||||
// Switch to the FFT space
|
||||
NSMFFT_direct<HalfDegree<params>>(fft);
|
||||
@@ -40,52 +40,50 @@ mul_ggsw_glwe(Torus *accumulator, double2 *fft, double2 *mask_join_buffer,
|
||||
// this function, so we end up getting the lines of the bootstrapping key
|
||||
// needed to perform the external product in this block (corresponding to
|
||||
// the same decomposition level)
|
||||
auto bsk_slice = get_ith_mask_kth_block(
|
||||
bootstrapping_key, iteration, blockIdx.y, blockIdx.x, polynomial_size,
|
||||
glwe_dimension, level_count);
|
||||
|
||||
auto bsk_mask_slice =
|
||||
get_ith_mask_kth_block(bootstrapping_key, iteration, blockIdx.y,
|
||||
blockIdx.x, polynomial_size, 1, level_count);
|
||||
auto bsk_body_slice =
|
||||
get_ith_body_kth_block(bootstrapping_key, iteration, blockIdx.y,
|
||||
blockIdx.x, polynomial_size, 1, level_count);
|
||||
// Selects all GLWEs in a particular decomposition level
|
||||
auto level_join_buffer =
|
||||
join_buffer + blockIdx.x * (glwe_dimension + 1) * params::degree / 2;
|
||||
|
||||
// Perform the matrix multiplication between the GGSW and the GLWE,
|
||||
// each block operating on a single level for mask and body
|
||||
|
||||
auto first_processed_bsk =
|
||||
(blockIdx.y == 0) ? bsk_mask_slice : bsk_body_slice;
|
||||
auto second_processed_bsk =
|
||||
(blockIdx.y == 0) ? bsk_body_slice : bsk_mask_slice;
|
||||
|
||||
auto first_processed_acc =
|
||||
(blockIdx.y == 0) ? &mask_join_buffer[params::degree / 2 * blockIdx.x]
|
||||
: &body_join_buffer[params::degree / 2 * blockIdx.x];
|
||||
auto second_processed_acc =
|
||||
(blockIdx.y == 0) ? &body_join_buffer[params::degree / 2 * blockIdx.x]
|
||||
: &mask_join_buffer[params::degree / 2 * blockIdx.x];
|
||||
// The first product is used to initialize level_join_buffer
|
||||
auto bsk_poly = bsk_slice + blockIdx.y * params::degree / 2;
|
||||
auto buffer_slice = level_join_buffer + blockIdx.y * params::degree / 2;
|
||||
|
||||
int tid = threadIdx.x;
|
||||
|
||||
// first product
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
first_processed_acc[tid] = fft[tid] * first_processed_bsk[tid];
|
||||
buffer_slice[tid] = fft[tid] * bsk_poly[tid];
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
|
||||
grid.sync();
|
||||
tid = threadIdx.x;
|
||||
// second product
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
second_processed_acc[tid] += fft[tid] * second_processed_bsk[tid];
|
||||
tid += params::degree / params::opt;
|
||||
|
||||
// Continues multiplying fft by every polynomial in that particular bsk level
|
||||
// Each y-block accumulates in a different polynomial at each iteration
|
||||
for (int j = 1; j < (glwe_dimension + 1); j++) {
|
||||
int idx = (j + blockIdx.y) % (glwe_dimension + 1);
|
||||
|
||||
auto bsk_poly = bsk_slice + idx * params::degree / 2;
|
||||
auto buffer_slice = level_join_buffer + idx * params::degree / 2;
|
||||
|
||||
int tid = threadIdx.x;
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
buffer_slice[tid] += fft[tid] * bsk_poly[tid];
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
grid.sync();
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------
|
||||
// All blocks are synchronized here; after this sync, level_join_buffer has
|
||||
// the values needed from every other block
|
||||
|
||||
// All blocks are synchronized here; after this sync, *_join_buffer has the
|
||||
// values needed from every other block
|
||||
grid.sync();
|
||||
|
||||
auto src_acc = (blockIdx.y == 0) ? mask_join_buffer : body_join_buffer;
|
||||
auto src_acc = join_buffer + blockIdx.y * params::degree / 2;
|
||||
|
||||
// copy first product into fft buffer
|
||||
tid = threadIdx.x;
|
||||
@@ -97,7 +95,7 @@ mul_ggsw_glwe(Torus *accumulator, double2 *fft, double2 *mask_join_buffer,
|
||||
|
||||
// accumulate rest of the products into fft buffer
|
||||
for (int l = 1; l < gridDim.x; l++) {
|
||||
auto cur_src_acc = &src_acc[l * params::degree / 2];
|
||||
auto cur_src_acc = &src_acc[l * (glwe_dimension + 1) * params::degree / 2];
|
||||
tid = threadIdx.x;
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
fft[tid] += cur_src_acc[tid];
|
||||
@@ -121,19 +119,22 @@ template <typename Torus, class params, sharedMemDegree SMD>
|
||||
/*
|
||||
* Kernel launched by the low latency version of the
|
||||
* bootstrapping, that uses cooperative groups
|
||||
* lwe_array_out vector of output lwe s, with length
|
||||
* (polynomial_size+1)*num_samples lut_vector - vector of look up tables with
|
||||
* length polynomial_size * num_samples lut_vector_indexes - mapping between
|
||||
* lwe_array_in and lut_vector lwe_array_in
|
||||
* - vector of lwe inputs with length (lwe_dimension + 1) * num_samples
|
||||
*
|
||||
* - lwe_array_out: vector of output lwe s, with length
|
||||
* (glwe_dimension * polynomial_size+1)*num_samples
|
||||
* - lut_vector: vector of look up tables with
|
||||
* length (glwe_dimension+1) * polynomial_size * num_samples
|
||||
* - lut_vector_indexes: mapping between lwe_array_in and lut_vector
|
||||
* lwe_array_in: vector of lwe inputs with length (lwe_dimension + 1) *
|
||||
* num_samples
|
||||
*
|
||||
* Each y-block computes one element of the lwe_array_out.
|
||||
*/
|
||||
__global__ void device_bootstrap_low_latency(
|
||||
Torus *lwe_array_out, Torus *lut_vector, Torus *lwe_array_in,
|
||||
double2 *bootstrapping_key, double2 *mask_join_buffer,
|
||||
double2 *body_join_buffer, uint32_t lwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count, char *device_mem,
|
||||
int device_memory_size_per_block) {
|
||||
double2 *bootstrapping_key, double2 *join_buffer, uint32_t lwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
|
||||
char *device_mem, int device_memory_size_per_block) {
|
||||
|
||||
grid_group grid = this_grid();
|
||||
|
||||
@@ -144,6 +145,7 @@ __global__ void device_bootstrap_low_latency(
|
||||
char *selected_memory;
|
||||
int block_index =
|
||||
blockIdx.x + blockIdx.y * gridDim.x + blockIdx.z * gridDim.x * gridDim.y;
|
||||
uint32_t glwe_dimension = gridDim.y - 1;
|
||||
|
||||
if constexpr (SMD == FULLSM)
|
||||
selected_memory = sharedmem;
|
||||
@@ -163,12 +165,12 @@ __global__ void device_bootstrap_low_latency(
|
||||
// this block is operating, in the case of batch bootstraps
|
||||
auto block_lwe_array_in = &lwe_array_in[blockIdx.z * (lwe_dimension + 1)];
|
||||
|
||||
auto block_lut_vector = &lut_vector[blockIdx.z * params::degree * 2];
|
||||
auto block_lut_vector =
|
||||
&lut_vector[blockIdx.z * params::degree * (glwe_dimension + 1)];
|
||||
|
||||
auto block_mask_join_buffer =
|
||||
&mask_join_buffer[blockIdx.z * level_count * params::degree / 2];
|
||||
auto block_body_join_buffer =
|
||||
&body_join_buffer[blockIdx.z * level_count * params::degree / 2];
|
||||
auto block_join_buffer =
|
||||
&join_buffer[blockIdx.z * level_count * (glwe_dimension + 1) *
|
||||
params::degree / 2];
|
||||
|
||||
// Since the space is L1 cache is small, we use the same memory location for
|
||||
// the rotated accumulator and the fft accumulator, since we know that the
|
||||
@@ -179,15 +181,10 @@ __global__ void device_bootstrap_low_latency(
|
||||
rescale_torus_element(block_lwe_array_in[lwe_dimension], b_hat,
|
||||
2 * params::degree);
|
||||
|
||||
if (blockIdx.y == 0) {
|
||||
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
accumulator, block_lut_vector, b_hat, false, 1);
|
||||
} else {
|
||||
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
accumulator, &block_lut_vector[params::degree], b_hat, false, 1);
|
||||
}
|
||||
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
|
||||
false);
|
||||
|
||||
for (int i = 0; i < lwe_dimension; i++) {
|
||||
synchronize_threads_in_block();
|
||||
@@ -200,13 +197,13 @@ __global__ void device_bootstrap_low_latency(
|
||||
// Perform ACC * (X^ä - 1)
|
||||
multiply_by_monomial_negacyclic_and_sub_polynomial<
|
||||
Torus, params::opt, params::degree / params::opt>(
|
||||
accumulator, accumulator_rotated, a_hat, 1);
|
||||
accumulator, accumulator_rotated, a_hat);
|
||||
|
||||
// Perform a rounding to increase the accuracy of the
|
||||
// bootstrapped ciphertext
|
||||
round_to_closest_multiple_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
accumulator_rotated, base_log, level_count, 1);
|
||||
accumulator_rotated, base_log, level_count);
|
||||
|
||||
synchronize_threads_in_block();
|
||||
|
||||
@@ -214,29 +211,33 @@ __global__ void device_bootstrap_low_latency(
|
||||
// decomposition, for the mask and the body (so block 0 will have the
|
||||
// accumulator decomposed at level 0, 1 at 1, etc.)
|
||||
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count,
|
||||
accumulator_rotated, 1);
|
||||
accumulator_rotated);
|
||||
gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
|
||||
|
||||
// We are using the same memory space for accumulator_fft and
|
||||
// accumulator_rotated, so we need to synchronize here to make sure they
|
||||
// don't modify the same memory space at the same time
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// Perform G^-1(ACC) * GGSW -> GLWE
|
||||
mul_ggsw_glwe<Torus, params>(accumulator, accumulator_fft,
|
||||
block_mask_join_buffer, block_body_join_buffer,
|
||||
bootstrapping_key, polynomial_size,
|
||||
level_count, i, grid);
|
||||
mul_ggsw_glwe<Torus, params>(
|
||||
accumulator, accumulator_fft, block_join_buffer, bootstrapping_key,
|
||||
polynomial_size, glwe_dimension, level_count, i, grid);
|
||||
|
||||
synchronize_threads_in_block();
|
||||
}
|
||||
|
||||
auto block_lwe_array_out = &lwe_array_out[blockIdx.z * (polynomial_size + 1)];
|
||||
auto block_lwe_array_out =
|
||||
&lwe_array_out[blockIdx.z * (glwe_dimension * polynomial_size + 1) +
|
||||
blockIdx.y * polynomial_size];
|
||||
|
||||
if (blockIdx.x == 0 && blockIdx.y == 0) {
|
||||
if (blockIdx.x == 0 && blockIdx.y < glwe_dimension) {
|
||||
// Perform a sample extract. At this point, all blocks have the result, but
|
||||
// we do the computation at block 0 to avoid waiting for extra blocks, in
|
||||
// case they're not synchronized
|
||||
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator, 1);
|
||||
} else if (blockIdx.x == 0 && blockIdx.y == 1) {
|
||||
block_lwe_array_out[params::degree] = accumulator[0];
|
||||
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
|
||||
} else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
|
||||
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -248,7 +249,7 @@ template <typename Torus, class params>
|
||||
__host__ void host_bootstrap_low_latency(
|
||||
void *v_stream, uint32_t gpu_index, Torus *lwe_array_out, Torus *lut_vector,
|
||||
uint32_t *lut_vector_indexes, Torus *lwe_array_in,
|
||||
double2 *bootstrapping_key, uint32_t lwe_dimension,
|
||||
double2 *bootstrapping_key, uint32_t glwe_dimension, uint32_t lwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t num_lut_vectors,
|
||||
uint32_t max_shared_memory) {
|
||||
@@ -258,10 +259,8 @@ __host__ void host_bootstrap_low_latency(
|
||||
|
||||
int buffer_size_per_gpu = level_count * input_lwe_ciphertext_count *
|
||||
polynomial_size / 2 * sizeof(double2);
|
||||
double2 *mask_buffer_fft =
|
||||
(double2 *)cuda_malloc_async(buffer_size_per_gpu, stream, gpu_index);
|
||||
double2 *body_buffer_fft =
|
||||
(double2 *)cuda_malloc_async(buffer_size_per_gpu, stream, gpu_index);
|
||||
double2 *buffer_fft = (double2 *)cuda_malloc_async(
|
||||
(glwe_dimension + 1) * buffer_size_per_gpu, stream, gpu_index);
|
||||
|
||||
// With SM each block corresponds to either the mask or body, no need to
|
||||
// duplicate data for each
|
||||
@@ -279,35 +278,34 @@ __host__ void host_bootstrap_low_latency(
|
||||
char *d_mem;
|
||||
|
||||
int thds = polynomial_size / params::opt;
|
||||
dim3 grid(level_count, 2, input_lwe_ciphertext_count);
|
||||
dim3 grid(level_count, glwe_dimension + 1, input_lwe_ciphertext_count);
|
||||
|
||||
void *kernel_args[12];
|
||||
void *kernel_args[11];
|
||||
kernel_args[0] = &lwe_array_out;
|
||||
kernel_args[1] = &lut_vector;
|
||||
kernel_args[2] = &lwe_array_in;
|
||||
kernel_args[3] = &bootstrapping_key;
|
||||
kernel_args[4] = &mask_buffer_fft;
|
||||
kernel_args[5] = &body_buffer_fft;
|
||||
kernel_args[6] = &lwe_dimension;
|
||||
kernel_args[7] = &polynomial_size;
|
||||
kernel_args[8] = &base_log;
|
||||
kernel_args[9] = &level_count;
|
||||
kernel_args[10] = &d_mem;
|
||||
kernel_args[4] = &buffer_fft;
|
||||
kernel_args[5] = &lwe_dimension;
|
||||
kernel_args[6] = &polynomial_size;
|
||||
kernel_args[7] = &base_log;
|
||||
kernel_args[8] = &level_count;
|
||||
kernel_args[9] = &d_mem;
|
||||
|
||||
if (max_shared_memory < SM_PART) {
|
||||
kernel_args[11] = &DM_FULL;
|
||||
kernel_args[10] = &DM_FULL;
|
||||
check_cuda_error(cudaGetLastError());
|
||||
d_mem = (char *)cuda_malloc_async(DM_FULL * input_lwe_ciphertext_count *
|
||||
level_count * 2,
|
||||
level_count * (glwe_dimension + 1),
|
||||
stream, gpu_index);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
check_cuda_error(cudaLaunchCooperativeKernel(
|
||||
(void *)device_bootstrap_low_latency<Torus, params, NOSM>, grid, thds,
|
||||
(void **)kernel_args, 0, *stream));
|
||||
} else if (max_shared_memory < SM_FULL) {
|
||||
kernel_args[11] = &DM_PART;
|
||||
kernel_args[10] = &DM_PART;
|
||||
d_mem = (char *)cuda_malloc_async(DM_PART * input_lwe_ciphertext_count *
|
||||
level_count * 2,
|
||||
level_count * (glwe_dimension + 1),
|
||||
stream, gpu_index);
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_bootstrap_low_latency<Torus, params, PARTIALSM>,
|
||||
@@ -322,7 +320,7 @@ __host__ void host_bootstrap_low_latency(
|
||||
|
||||
} else {
|
||||
int DM_NONE = 0;
|
||||
kernel_args[11] = &DM_NONE;
|
||||
kernel_args[10] = &DM_NONE;
|
||||
d_mem = (char *)cuda_malloc_async(0, stream, gpu_index);
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_bootstrap_low_latency<Torus, params, FULLSM>,
|
||||
@@ -337,8 +335,7 @@ __host__ void host_bootstrap_low_latency(
|
||||
check_cuda_error(cudaGetLastError());
|
||||
// Synchronize the streams before copying the result to lwe_array_out at the
|
||||
// right place
|
||||
cuda_drop_async(mask_buffer_fft, stream, gpu_index);
|
||||
cuda_drop_async(body_buffer_fft, stream, gpu_index);
|
||||
cuda_drop_async(buffer_fft, stream, gpu_index);
|
||||
cuda_drop_async(d_mem, stream, gpu_index);
|
||||
}
|
||||
|
||||
|
||||
@@ -13,8 +13,6 @@ void cuda_circuit_bootstrap_32(
|
||||
uint32_t level_bsk, uint32_t base_log_bsk, uint32_t level_pksk,
|
||||
uint32_t base_log_pksk, uint32_t level_cbs, uint32_t base_log_cbs,
|
||||
uint32_t number_of_samples, uint32_t max_shared_memory) {
|
||||
assert(("Error (GPU circuit bootstrap): glwe_dimension should be equal to 1",
|
||||
glwe_dimension == 1));
|
||||
assert(("Error (GPU circuit bootstrap): polynomial_size should be one of "
|
||||
"512, 1024, 2048, 4096, 8192",
|
||||
polynomial_size == 512 || polynomial_size == 1024 ||
|
||||
@@ -137,8 +135,6 @@ void cuda_circuit_bootstrap_64(
|
||||
uint32_t level_bsk, uint32_t base_log_bsk, uint32_t level_pksk,
|
||||
uint32_t base_log_pksk, uint32_t level_cbs, uint32_t base_log_cbs,
|
||||
uint32_t number_of_samples, uint32_t max_shared_memory) {
|
||||
assert(("Error (GPU circuit bootstrap): glwe_dimension should be equal to 1",
|
||||
glwe_dimension == 1));
|
||||
assert(("Error (GPU circuit bootstrap): polynomial_size should be one of "
|
||||
"512, 1024, 2048, 4096, 8192",
|
||||
polynomial_size == 512 || polynomial_size == 1024 ||
|
||||
|
||||
@@ -12,6 +12,7 @@
|
||||
* arbitrary amount of levels by using decompose_and_compress_level().
|
||||
*
|
||||
* This class always decomposes the entire set of num_poly polynomials.
|
||||
* By default, it works on a single polynomial.
|
||||
*/
|
||||
#pragma once
|
||||
template <typename T, class params> class GadgetMatrix {
|
||||
@@ -28,7 +29,7 @@ private:
|
||||
|
||||
public:
|
||||
__device__ GadgetMatrix(uint32_t base_log, uint32_t level_count, T *state,
|
||||
uint32_t num_poly)
|
||||
uint32_t num_poly = 1)
|
||||
: base_log(base_log), level_count(level_count), num_poly(num_poly),
|
||||
state(state) {
|
||||
|
||||
|
||||
@@ -61,11 +61,13 @@ __device__ void add_polynomial_inplace_low_lat(T *source, T *dst, int p_id) {
|
||||
* Performs acc = acc * (X^ä + 1) if zeroAcc = false
|
||||
* Performs acc = 0 if zeroAcc
|
||||
* takes single buffer and calculates inplace.
|
||||
*
|
||||
* By default, it works on a single polynomial.
|
||||
*/
|
||||
template <typename T, int elems_per_thread, int block_size>
|
||||
__device__ void divide_by_monomial_negacyclic_inplace(T *accumulator, T *input,
|
||||
uint32_t j, bool zeroAcc,
|
||||
uint32_t num_poly) {
|
||||
uint32_t num_poly = 1) {
|
||||
constexpr int degree = block_size * elems_per_thread;
|
||||
for (int z = 0; z < num_poly; z++) {
|
||||
T *accumulator_slice = (T *)accumulator + (ptrdiff_t)(z * degree);
|
||||
@@ -105,10 +107,12 @@ __device__ void divide_by_monomial_negacyclic_inplace(T *accumulator, T *input,
|
||||
*
|
||||
* Performs result_acc = acc * (X^ä - 1) - acc
|
||||
* takes single buffer as input and returns a single rotated buffer
|
||||
*
|
||||
* By default, it works on a single polynomial.
|
||||
*/
|
||||
template <typename T, int elems_per_thread, int block_size>
|
||||
__device__ void multiply_by_monomial_negacyclic_and_sub_polynomial(
|
||||
T *acc, T *result_acc, uint32_t j, uint32_t num_poly) {
|
||||
T *acc, T *result_acc, uint32_t j, uint32_t num_poly = 1) {
|
||||
constexpr int degree = block_size * elems_per_thread;
|
||||
for (int z = 0; z < num_poly; z++) {
|
||||
T *acc_slice = (T *)acc + (ptrdiff_t)(z * degree);
|
||||
@@ -138,11 +142,13 @@ __device__ void multiply_by_monomial_negacyclic_and_sub_polynomial(
|
||||
/*
|
||||
* Receives num_poly concatenated polynomials of type T. For each performs a
|
||||
* rounding to increase accuracy of the PBS. Calculates inplace.
|
||||
*
|
||||
* By default, it works on a single polynomial.
|
||||
*/
|
||||
template <typename T, int elems_per_thread, int block_size>
|
||||
__device__ void round_to_closest_multiple_inplace(T *rotated_acc, int base_log,
|
||||
int level_count,
|
||||
uint32_t num_poly) {
|
||||
uint32_t num_poly = 1) {
|
||||
constexpr int degree = block_size * elems_per_thread;
|
||||
for (int z = 0; z < num_poly; z++) {
|
||||
T *rotated_acc_slice = (T *)rotated_acc + (ptrdiff_t)(z * degree);
|
||||
@@ -192,20 +198,21 @@ __device__ void add_to_torus(double2 *m_values, Torus *result) {
|
||||
}
|
||||
}
|
||||
|
||||
// Extracts the body of a GLWE with dimension glwe_dimension
|
||||
// Extracts the body of a GLWE.
|
||||
// k is the offset to find the body element / polynomial in the lwe_array_out /
|
||||
// accumulator
|
||||
template <typename Torus, class params>
|
||||
__device__ void sample_extract_body(Torus *lwe_array_out, Torus *accumulator,
|
||||
uint32_t glwe_dimension) {
|
||||
uint32_t k) {
|
||||
// Set first coefficient of the accumulator as the body of the LWE sample
|
||||
lwe_array_out[glwe_dimension * params::degree] =
|
||||
accumulator[glwe_dimension * params::degree];
|
||||
lwe_array_out[k * params::degree] = accumulator[k * params::degree];
|
||||
}
|
||||
|
||||
// Extracts the mask of a GLWE with dimension glwe_dimension
|
||||
// Extracts the mask from num_poly polynomials individually
|
||||
template <typename Torus, class params>
|
||||
__device__ void sample_extract_mask(Torus *lwe_array_out, Torus *accumulator,
|
||||
uint32_t glwe_dimension) {
|
||||
for (int z = 0; z < glwe_dimension; z++) {
|
||||
uint32_t num_poly = 1) {
|
||||
for (int z = 0; z < num_poly; z++) {
|
||||
Torus *lwe_array_out_slice =
|
||||
(Torus *)lwe_array_out + (ptrdiff_t)(z * params::degree);
|
||||
Torus *accumulator_slice =
|
||||
|
||||
@@ -17,8 +17,6 @@ void cuda_cmux_tree_32(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
|
||||
polynomial_size == 2048 || polynomial_size == 4096 ||
|
||||
polynomial_size == 8192));
|
||||
// For larger k we will need to adjust the mask size
|
||||
assert(("Error (GPU Cmux tree): glwe_dimension should be equal to 1",
|
||||
glwe_dimension == 1));
|
||||
assert(("Error (GPU Cmux tree): r, the number of layers in the tree, should "
|
||||
"be >= 1 ",
|
||||
r >= 1));
|
||||
@@ -99,8 +97,6 @@ void cuda_cmux_tree_64(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
|
||||
polynomial_size == 2048 || polynomial_size == 4096 ||
|
||||
polynomial_size == 8192));
|
||||
// For larger k we will need to adjust the mask size
|
||||
assert(("Error (GPU Cmux tree): glwe_dimension should be equal to 1",
|
||||
glwe_dimension == 1));
|
||||
assert(("Error (GPU Cmux tree): r, the number of layers in the tree, should "
|
||||
"be >= 1 ",
|
||||
r >= 1));
|
||||
|
||||
@@ -35,8 +35,6 @@ void cuda_circuit_bootstrap_vertical_packing_64(
|
||||
uint32_t base_log_pksk, uint32_t level_count_cbs, uint32_t base_log_cbs,
|
||||
uint32_t number_of_inputs, uint32_t lut_number,
|
||||
uint32_t max_shared_memory) {
|
||||
assert(("Error (GPU circuit bootstrap): glwe_dimension should be equal to 1",
|
||||
glwe_dimension == 1));
|
||||
assert(("Error (GPU circuit bootstrap): polynomial_size should be one of "
|
||||
"512, 1024, 2048, 4096, 8192",
|
||||
polynomial_size == 512 || polynomial_size == 1024 ||
|
||||
@@ -149,8 +147,6 @@ void cuda_wop_pbs_64(void *v_stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
uint32_t number_of_bits_of_message_including_padding,
|
||||
uint32_t number_of_bits_to_extract,
|
||||
uint32_t number_of_inputs, uint32_t max_shared_memory) {
|
||||
assert(("Error (GPU WOP PBS): glwe_dimension should be equal to 1",
|
||||
glwe_dimension == 1));
|
||||
assert(("Error (GPU WOP PBS): polynomial_size should be one of "
|
||||
"512, 1024, 2048, 4096, 8192",
|
||||
polynomial_size == 512 || polynomial_size == 1024 ||
|
||||
|
||||
@@ -174,8 +174,8 @@ __host__ void host_wop_pbs(
|
||||
lwe_array_in_buffer, lwe_array_in_shifted_buffer, lwe_array_out_ks_buffer,
|
||||
lwe_array_out_pbs_buffer, lut_pbs, lut_vector_indexes, ksk, fourier_bsk,
|
||||
number_of_bits_to_extract, delta_log, polynomial_size, lwe_dimension,
|
||||
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
|
||||
number_of_inputs, max_shared_memory);
|
||||
glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk,
|
||||
level_count_ksk, number_of_inputs, max_shared_memory);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
cuda_drop_async(lut_pbs, stream, gpu_index);
|
||||
cuda_drop_async(lut_vector_indexes, stream, gpu_index);
|
||||
|
||||
Reference in New Issue
Block a user