mirror of
https://github.com/zama-ai/tfhe-rs.git
synced 2026-01-09 14:47:56 -05:00
chore(gpu): supress warnings in pcc_gpu
This commit is contained in:
@@ -76,7 +76,7 @@ void cuda_drop(void *ptr, uint32_t gpu_index);
|
||||
void cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index);
|
||||
}
|
||||
|
||||
int cuda_get_max_shared_memory(uint32_t gpu_index);
|
||||
uint32_t cuda_get_max_shared_memory(uint32_t gpu_index);
|
||||
|
||||
bool cuda_check_support_cooperative_groups();
|
||||
|
||||
|
||||
@@ -3769,7 +3769,6 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
|
||||
void init_temporary_buffers(cudaStream_t const *streams,
|
||||
uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
uint32_t num_blocks) {
|
||||
uint32_t big_lwe_size = params.big_lwe_dimension + 1;
|
||||
|
||||
// non boolean temporary arrays, with `num_blocks` blocks
|
||||
remainder1 = new CudaRadixCiphertextFFI;
|
||||
|
||||
@@ -5,12 +5,12 @@
|
||||
|
||||
template <typename Torus>
|
||||
bool supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
|
||||
uint32_t polynomial_size, int max_shared_memory);
|
||||
uint32_t polynomial_size, uint32_t max_shared_memory);
|
||||
|
||||
template <typename Torus>
|
||||
bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit(
|
||||
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, int max_shared_memory);
|
||||
uint32_t level_count, uint32_t max_shared_memory);
|
||||
|
||||
#if CUDA_ARCH >= 900
|
||||
template <typename Torus>
|
||||
|
||||
@@ -67,7 +67,7 @@ get_buffer_size_partial_sm_programmable_bootstrap_cg(uint32_t polynomial_size) {
|
||||
|
||||
template <typename Torus>
|
||||
bool supports_distributed_shared_memory_on_classic_programmable_bootstrap(
|
||||
uint32_t polynomial_size, int max_shared_memory);
|
||||
uint32_t polynomial_size, uint32_t max_shared_memory);
|
||||
|
||||
template <typename Torus, PBS_TYPE pbs_type> struct pbs_buffer;
|
||||
|
||||
@@ -400,7 +400,7 @@ bool has_support_to_cuda_programmable_bootstrap_cg(uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size,
|
||||
uint32_t level_count,
|
||||
uint32_t num_samples,
|
||||
int max_shared_memory);
|
||||
uint32_t max_shared_memory);
|
||||
|
||||
template <typename Torus>
|
||||
void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
|
||||
|
||||
@@ -8,7 +8,7 @@ extern "C" {
|
||||
|
||||
bool has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t num_samples, int max_shared_memory);
|
||||
uint32_t num_samples, uint32_t max_shared_memory);
|
||||
|
||||
void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
|
||||
void *stream, uint32_t gpu_index, void *dest, void const *src,
|
||||
|
||||
@@ -304,8 +304,8 @@ void cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index) {
|
||||
}
|
||||
|
||||
/// Get the maximum size for the shared memory
|
||||
int cuda_get_max_shared_memory(uint32_t gpu_index) {
|
||||
int max_shared_memory = 0;
|
||||
uint32_t cuda_get_max_shared_memory(uint32_t gpu_index) {
|
||||
auto max_shared_memory = 0;
|
||||
#if CUDA_ARCH == 900
|
||||
max_shared_memory = 226000;
|
||||
#elif CUDA_ARCH == 890
|
||||
@@ -321,5 +321,5 @@ int cuda_get_max_shared_memory(uint32_t gpu_index) {
|
||||
gpu_index);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
#endif
|
||||
return max_shared_memory;
|
||||
return (uint32_t)(max_shared_memory);
|
||||
}
|
||||
|
||||
@@ -505,7 +505,6 @@ __host__ void host_integer_div_rem_kb(
|
||||
auto num_blocks = quotient->num_radix_blocks;
|
||||
if (is_signed) {
|
||||
auto radix_params = int_mem_ptr->params;
|
||||
uint32_t big_lwe_size = radix_params.big_lwe_dimension + 1;
|
||||
|
||||
// temporary memory
|
||||
auto positive_numerator = int_mem_ptr->positive_numerator;
|
||||
|
||||
@@ -251,7 +251,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
|
||||
/// Here it is important to query the default max shared memory on device 0
|
||||
/// instead of cuda_get_max_shared_memory,
|
||||
/// to avoid bugs with tree_add_chunks trying to use too much shared memory
|
||||
int max_shared_memory = 0;
|
||||
auto max_shared_memory = 0;
|
||||
check_cuda_error(cudaDeviceGetAttribute(
|
||||
&max_shared_memory, cudaDevAttrMaxSharedMemoryPerBlock, 0));
|
||||
|
||||
|
||||
@@ -118,7 +118,7 @@ void cuda_fourier_polynomial_mul(void *stream_v, uint32_t gpu_index,
|
||||
int gridSize = total_polynomials;
|
||||
int blockSize = polynomial_size / choose_opt_amortized(polynomial_size);
|
||||
|
||||
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
|
||||
double2 *buffer;
|
||||
switch (polynomial_size) {
|
||||
|
||||
@@ -145,7 +145,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
|
||||
|
||||
cuda_memcpy_async_to_gpu(d_bsk, h_bsk, buffer_size, stream, gpu_index);
|
||||
|
||||
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
|
||||
double2 *buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
|
||||
switch (polynomial_size) {
|
||||
|
||||
@@ -233,7 +233,7 @@ uint64_t get_buffer_size_partial_sm_programmable_bootstrap_amortized(
|
||||
template <typename Torus>
|
||||
uint64_t get_buffer_size_programmable_bootstrap_amortized(
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t input_lwe_ciphertext_count, int max_shared_memory) {
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
|
||||
|
||||
uint64_t full_sm =
|
||||
get_buffer_size_full_sm_programmable_bootstrap_amortized<Torus>(
|
||||
@@ -264,7 +264,7 @@ __host__ void scratch_programmable_bootstrap_amortized(
|
||||
uint64_t partial_sm =
|
||||
get_buffer_size_partial_sm_programmable_bootstrap_amortized<Torus>(
|
||||
polynomial_size);
|
||||
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) {
|
||||
cudaFuncSetAttribute(
|
||||
device_programmable_bootstrap_amortized<Torus, params, PARTIALSM>,
|
||||
@@ -311,7 +311,7 @@ __host__ void host_programmable_bootstrap_amortized(
|
||||
|
||||
uint64_t DM_FULL = SM_FULL;
|
||||
|
||||
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
cuda_set_device(gpu_index);
|
||||
|
||||
// Create a 1-dimensional grid of threads
|
||||
|
||||
@@ -199,7 +199,7 @@ __host__ void scratch_programmable_bootstrap_cg(
|
||||
uint64_t partial_sm =
|
||||
get_buffer_size_partial_sm_programmable_bootstrap_cg<Torus>(
|
||||
polynomial_size);
|
||||
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_programmable_bootstrap_cg<Torus, params, PARTIALSM>,
|
||||
@@ -246,7 +246,7 @@ __host__ void host_programmable_bootstrap_cg(
|
||||
get_buffer_size_partial_sm_programmable_bootstrap_cg<Torus>(
|
||||
polynomial_size);
|
||||
|
||||
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
cuda_set_device(gpu_index);
|
||||
|
||||
uint64_t full_dm = full_sm;
|
||||
@@ -301,7 +301,7 @@ __host__ void host_programmable_bootstrap_cg(
|
||||
template <typename Torus, class params>
|
||||
__host__ bool verify_cuda_programmable_bootstrap_cg_grid_size(
|
||||
int glwe_dimension, int level_count, int num_samples,
|
||||
int max_shared_memory) {
|
||||
uint32_t max_shared_memory) {
|
||||
|
||||
// If Cooperative Groups is not supported, no need to check anything else
|
||||
if (!cuda_check_support_cooperative_groups())
|
||||
@@ -347,7 +347,7 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_grid_size(
|
||||
template <typename Torus>
|
||||
__host__ bool supports_cooperative_groups_on_programmable_bootstrap(
|
||||
int glwe_dimension, int polynomial_size, int level_count, int num_samples,
|
||||
int max_shared_memory) {
|
||||
uint32_t max_shared_memory) {
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
return verify_cuda_programmable_bootstrap_cg_grid_size<
|
||||
|
||||
@@ -227,7 +227,7 @@ __host__ void scratch_cg_multi_bit_programmable_bootstrap(
|
||||
get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap<Torus>(
|
||||
polynomial_size);
|
||||
|
||||
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
if (max_shared_memory < full_sm_keybundle) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
|
||||
@@ -312,7 +312,7 @@ __host__ void execute_cg_external_product_loop(
|
||||
uint64_t no_dm = 0;
|
||||
|
||||
auto lwe_chunk_size = buffer->lwe_chunk_size;
|
||||
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
|
||||
uint32_t keybundle_size_per_input =
|
||||
lwe_chunk_size * level_count * (glwe_dimension + 1) *
|
||||
@@ -409,7 +409,7 @@ __host__ void host_cg_multi_bit_programmable_bootstrap(
|
||||
template <typename Torus, class params>
|
||||
__host__ bool verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size(
|
||||
int glwe_dimension, int level_count, int num_samples,
|
||||
int max_shared_memory) {
|
||||
uint32_t max_shared_memory) {
|
||||
|
||||
// If Cooperative Groups is not supported, no need to check anything else
|
||||
if (!cuda_check_support_cooperative_groups())
|
||||
@@ -460,7 +460,7 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size(
|
||||
template <typename Torus>
|
||||
__host__ bool supports_cooperative_groups_on_multibit_programmable_bootstrap(
|
||||
int glwe_dimension, int polynomial_size, int level_count, int num_samples,
|
||||
int max_shared_memory) {
|
||||
uint32_t max_shared_memory) {
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size<
|
||||
|
||||
@@ -9,18 +9,16 @@ bool has_support_to_cuda_programmable_bootstrap_cg(uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size,
|
||||
uint32_t level_count,
|
||||
uint32_t num_samples,
|
||||
int max_shared_memory) {
|
||||
uint32_t max_shared_memory) {
|
||||
return supports_cooperative_groups_on_programmable_bootstrap<Torus>(
|
||||
glwe_dimension, polynomial_size, level_count, num_samples,
|
||||
max_shared_memory);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
bool has_support_to_cuda_programmable_bootstrap_tbc(uint32_t num_samples,
|
||||
uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size,
|
||||
uint32_t level_count,
|
||||
int max_shared_memory) {
|
||||
bool has_support_to_cuda_programmable_bootstrap_tbc(
|
||||
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t max_shared_memory) {
|
||||
#if CUDA_ARCH >= 900
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
@@ -324,7 +322,7 @@ void scratch_cuda_programmable_bootstrap_32(
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
|
||||
|
||||
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
#if (CUDA_ARCH >= 900)
|
||||
if (has_support_to_cuda_programmable_bootstrap_tbc<uint32_t>(
|
||||
input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
|
||||
@@ -359,7 +357,7 @@ void scratch_cuda_programmable_bootstrap_64(
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
|
||||
|
||||
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
#if (CUDA_ARCH >= 900)
|
||||
if (has_support_to_cuda_programmable_bootstrap_tbc<uint64_t>(
|
||||
input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
|
||||
@@ -732,7 +730,7 @@ void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index,
|
||||
|
||||
template bool has_support_to_cuda_programmable_bootstrap_cg<uint64_t>(
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t num_samples, int max_shared_memory);
|
||||
uint32_t num_samples, uint32_t max_shared_memory);
|
||||
|
||||
template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint64_t>(
|
||||
void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
|
||||
@@ -798,10 +796,10 @@ template void scratch_cuda_programmable_bootstrap<uint32_t>(
|
||||
|
||||
template bool has_support_to_cuda_programmable_bootstrap_tbc<uint32_t>(
|
||||
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, int max_shared_memory);
|
||||
uint32_t level_count, uint32_t max_shared_memory);
|
||||
template bool has_support_to_cuda_programmable_bootstrap_tbc<uint64_t>(
|
||||
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, int max_shared_memory);
|
||||
uint32_t level_count, uint32_t max_shared_memory);
|
||||
|
||||
#if CUDA_ARCH >= 900
|
||||
template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint32_t>(
|
||||
|
||||
@@ -262,7 +262,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
template <typename Torus>
|
||||
uint64_t get_buffer_size_programmable_bootstrap(
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, int max_shared_memory) {
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
|
||||
|
||||
uint64_t full_sm_step_one =
|
||||
get_buffer_size_full_sm_programmable_bootstrap_step_one<Torus>(
|
||||
@@ -316,7 +316,7 @@ __host__ void scratch_programmable_bootstrap(
|
||||
uint64_t partial_sm =
|
||||
get_buffer_size_partial_sm_programmable_bootstrap<Torus>(polynomial_size);
|
||||
|
||||
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
|
||||
// Configure step one
|
||||
if (max_shared_memory >= partial_sm && max_shared_memory < full_sm_step_one) {
|
||||
@@ -372,7 +372,7 @@ __host__ void execute_step_one(
|
||||
uint32_t level_count, int8_t *d_mem, int lwe_iteration, uint64_t partial_sm,
|
||||
uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm) {
|
||||
|
||||
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
cuda_set_device(gpu_index);
|
||||
int thds = polynomial_size / params::opt;
|
||||
dim3 grid(input_lwe_ciphertext_count, glwe_dimension + 1, level_count);
|
||||
@@ -414,7 +414,7 @@ __host__ void execute_step_two(
|
||||
uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm,
|
||||
uint32_t num_many_lut, uint32_t lut_stride) {
|
||||
|
||||
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
cuda_set_device(gpu_index);
|
||||
int thds = polynomial_size / params::opt;
|
||||
dim3 grid(input_lwe_ciphertext_count, glwe_dimension + 1);
|
||||
|
||||
@@ -299,7 +299,7 @@ __host__ void scratch_programmable_bootstrap_128(
|
||||
get_buffer_size_partial_sm_programmable_bootstrap<__uint128_t>(
|
||||
polynomial_size);
|
||||
|
||||
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
|
||||
// Configure step one
|
||||
if (max_shared_memory >= partial_sm && max_shared_memory < full_sm_step_one) {
|
||||
@@ -359,7 +359,7 @@ __host__ void execute_step_one_128(
|
||||
uint32_t level_count, int8_t *d_mem, int lwe_iteration, uint64_t partial_sm,
|
||||
uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm) {
|
||||
|
||||
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
cuda_set_device(gpu_index);
|
||||
int thds = polynomial_size / params::opt;
|
||||
dim3 grid(input_lwe_ciphertext_count, glwe_dimension + 1, level_count);
|
||||
@@ -401,7 +401,7 @@ __host__ void execute_step_two_128(
|
||||
uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm,
|
||||
uint32_t num_many_lut, uint32_t lut_stride) {
|
||||
|
||||
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
cuda_set_device(gpu_index);
|
||||
int thds = polynomial_size / params::opt;
|
||||
dim3 grid(input_lwe_ciphertext_count, glwe_dimension + 1);
|
||||
|
||||
@@ -9,7 +9,7 @@
|
||||
|
||||
bool has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t num_samples, int max_shared_memory) {
|
||||
uint32_t num_samples, uint32_t max_shared_memory) {
|
||||
return supports_cooperative_groups_on_multibit_programmable_bootstrap<
|
||||
uint64_t>(glwe_dimension, polynomial_size, level_count, num_samples,
|
||||
max_shared_memory);
|
||||
@@ -18,7 +18,7 @@ bool has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
|
||||
template <typename Torus>
|
||||
bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit(
|
||||
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, int max_shared_memory) {
|
||||
uint32_t level_count, uint32_t max_shared_memory) {
|
||||
#if CUDA_ARCH >= 900
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
@@ -449,7 +449,7 @@ uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
|
||||
polynomial_size);
|
||||
|
||||
int max_blocks_per_sm;
|
||||
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
cuda_set_device(gpu_index);
|
||||
if (max_shared_memory < full_sm_keybundle)
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
@@ -532,7 +532,7 @@ cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
|
||||
template bool
|
||||
has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t>(
|
||||
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, int max_shared_memory);
|
||||
uint32_t level_count, uint32_t max_shared_memory);
|
||||
|
||||
#if (CUDA_ARCH >= 900)
|
||||
template <typename Torus>
|
||||
|
||||
@@ -390,7 +390,7 @@ __host__ void scratch_multi_bit_programmable_bootstrap(
|
||||
|
||||
cuda_set_device(gpu_index);
|
||||
|
||||
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
uint64_t full_sm_keybundle =
|
||||
get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<Torus>(
|
||||
polynomial_size);
|
||||
@@ -509,7 +509,7 @@ __host__ void execute_compute_keybundle(
|
||||
uint64_t full_sm_keybundle =
|
||||
get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<Torus>(
|
||||
polynomial_size);
|
||||
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
|
||||
auto d_mem = buffer->d_mem_keybundle;
|
||||
auto keybundle_fft = buffer->keybundle_fft;
|
||||
@@ -553,7 +553,7 @@ execute_step_one(cudaStream_t stream, uint32_t gpu_index,
|
||||
uint64_t partial_sm_accumulate_step_one =
|
||||
get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one<
|
||||
Torus>(polynomial_size);
|
||||
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
|
||||
//
|
||||
auto d_mem = buffer->d_mem_acc_step_one;
|
||||
@@ -607,7 +607,7 @@ execute_step_two(cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
|
||||
uint64_t full_sm_accumulate_step_two =
|
||||
get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two<Torus>(
|
||||
polynomial_size);
|
||||
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
|
||||
auto d_mem = buffer->d_mem_acc_step_two;
|
||||
auto keybundle_fft = buffer->keybundle_fft;
|
||||
|
||||
@@ -203,7 +203,7 @@ __host__ void scratch_programmable_bootstrap_tbc(
|
||||
|
||||
cuda_set_device(gpu_index);
|
||||
|
||||
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
bool supports_dsm =
|
||||
supports_distributed_shared_memory_on_classic_programmable_bootstrap<
|
||||
Torus>(polynomial_size, max_shared_memory);
|
||||
@@ -266,7 +266,7 @@ __host__ void host_programmable_bootstrap_tbc(
|
||||
uint32_t num_many_lut, uint32_t lut_stride) {
|
||||
cuda_set_device(gpu_index);
|
||||
|
||||
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
auto supports_dsm =
|
||||
supports_distributed_shared_memory_on_classic_programmable_bootstrap<
|
||||
Torus>(polynomial_size, max_shared_memory);
|
||||
@@ -344,7 +344,7 @@ __host__ void host_programmable_bootstrap_tbc(
|
||||
template <typename Torus, class params>
|
||||
__host__ bool verify_cuda_programmable_bootstrap_tbc_grid_size(
|
||||
int glwe_dimension, int level_count, int num_samples,
|
||||
int max_shared_memory) {
|
||||
uint32_t max_shared_memory) {
|
||||
|
||||
// If Cooperative Groups is not supported, no need to check anything else
|
||||
if (!cuda_check_support_cooperative_groups())
|
||||
@@ -388,7 +388,7 @@ __host__ bool verify_cuda_programmable_bootstrap_tbc_grid_size(
|
||||
|
||||
template <typename Torus>
|
||||
bool supports_distributed_shared_memory_on_classic_programmable_bootstrap(
|
||||
uint32_t polynomial_size, int max_shared_memory) {
|
||||
uint32_t polynomial_size, uint32_t max_shared_memory) {
|
||||
uint64_t minimum_sm =
|
||||
get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap<Torus>(
|
||||
polynomial_size);
|
||||
@@ -405,7 +405,7 @@ bool supports_distributed_shared_memory_on_classic_programmable_bootstrap(
|
||||
template <typename Torus, class params>
|
||||
__host__ bool supports_thread_block_clusters_on_classic_programmable_bootstrap(
|
||||
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, int max_shared_memory) {
|
||||
uint32_t level_count, uint32_t max_shared_memory) {
|
||||
|
||||
if (!cuda_check_support_thread_block_clusters() || num_samples > 128)
|
||||
return false;
|
||||
|
||||
@@ -206,7 +206,7 @@ __host__ void scratch_tbc_multi_bit_programmable_bootstrap(
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
|
||||
cuda_set_device(gpu_index);
|
||||
|
||||
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
bool supports_dsm =
|
||||
supports_distributed_shared_memory_on_multibit_programmable_bootstrap<
|
||||
Torus>(polynomial_size, max_shared_memory);
|
||||
@@ -305,7 +305,7 @@ __host__ void execute_tbc_external_product_loop(
|
||||
|
||||
auto lwe_chunk_size = buffer->lwe_chunk_size;
|
||||
|
||||
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
auto supports_dsm =
|
||||
supports_distributed_shared_memory_on_multibit_programmable_bootstrap<
|
||||
Torus>(polynomial_size, max_shared_memory);
|
||||
@@ -426,7 +426,7 @@ __host__ void host_tbc_multi_bit_programmable_bootstrap(
|
||||
|
||||
template <typename Torus>
|
||||
bool supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
|
||||
uint32_t polynomial_size, int max_shared_memory) {
|
||||
uint32_t polynomial_size, uint32_t max_shared_memory) {
|
||||
uint64_t minimum_sm =
|
||||
get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap<Torus>(
|
||||
polynomial_size);
|
||||
@@ -443,7 +443,7 @@ bool supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
|
||||
template <typename Torus, class params>
|
||||
__host__ bool supports_thread_block_clusters_on_multibit_programmable_bootstrap(
|
||||
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, int max_shared_memory) {
|
||||
uint32_t level_count, uint32_t max_shared_memory) {
|
||||
|
||||
if (!cuda_check_support_thread_block_clusters())
|
||||
return false;
|
||||
@@ -518,5 +518,5 @@ __host__ bool supports_thread_block_clusters_on_multibit_programmable_bootstrap(
|
||||
|
||||
template bool
|
||||
supports_distributed_shared_memory_on_multibit_programmable_bootstrap<uint64_t>(
|
||||
uint32_t polynomial_size, int max_shared_memory);
|
||||
uint32_t polynomial_size, uint32_t max_shared_memory);
|
||||
#endif // FASTMULTIBIT_PBS_H
|
||||
|
||||
@@ -1639,7 +1639,7 @@ unsafe extern "C" {
|
||||
polynomial_size: u32,
|
||||
level_count: u32,
|
||||
num_samples: u32,
|
||||
max_shared_memory: ffi::c_int,
|
||||
max_shared_memory: u32,
|
||||
) -> bool;
|
||||
}
|
||||
unsafe extern "C" {
|
||||
|
||||
Reference in New Issue
Block a user