chore(gpu): suppress warnings in pcc_gpu

This commit is contained in:
Agnes Leroy
2025-03-19 17:44:11 +01:00
committed by Agnès Leroy
parent 4928b1354e
commit b1008824e2
21 changed files with 58 additions and 62 deletions

View File

@@ -76,7 +76,7 @@ void cuda_drop(void *ptr, uint32_t gpu_index);
void cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index);
}
int cuda_get_max_shared_memory(uint32_t gpu_index);
uint32_t cuda_get_max_shared_memory(uint32_t gpu_index);
bool cuda_check_support_cooperative_groups();

View File

@@ -3769,7 +3769,6 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
void init_temporary_buffers(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
uint32_t num_blocks) {
uint32_t big_lwe_size = params.big_lwe_dimension + 1;
// non boolean temporary arrays, with `num_blocks` blocks
remainder1 = new CudaRadixCiphertextFFI;

View File

@@ -5,12 +5,12 @@
template <typename Torus>
bool supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
uint32_t polynomial_size, int max_shared_memory);
uint32_t polynomial_size, uint32_t max_shared_memory);
template <typename Torus>
bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit(
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, int max_shared_memory);
uint32_t level_count, uint32_t max_shared_memory);
#if CUDA_ARCH >= 900
template <typename Torus>

View File

@@ -67,7 +67,7 @@ get_buffer_size_partial_sm_programmable_bootstrap_cg(uint32_t polynomial_size) {
template <typename Torus>
bool supports_distributed_shared_memory_on_classic_programmable_bootstrap(
uint32_t polynomial_size, int max_shared_memory);
uint32_t polynomial_size, uint32_t max_shared_memory);
template <typename Torus, PBS_TYPE pbs_type> struct pbs_buffer;
@@ -400,7 +400,7 @@ bool has_support_to_cuda_programmable_bootstrap_cg(uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t level_count,
uint32_t num_samples,
int max_shared_memory);
uint32_t max_shared_memory);
template <typename Torus>
void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(

View File

@@ -8,7 +8,7 @@ extern "C" {
bool has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t num_samples, int max_shared_memory);
uint32_t num_samples, uint32_t max_shared_memory);
void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
void *stream, uint32_t gpu_index, void *dest, void const *src,

View File

@@ -304,8 +304,8 @@ void cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index) {
}
/// Get the maximum size for the shared memory
int cuda_get_max_shared_memory(uint32_t gpu_index) {
int max_shared_memory = 0;
uint32_t cuda_get_max_shared_memory(uint32_t gpu_index) {
auto max_shared_memory = 0;
#if CUDA_ARCH == 900
max_shared_memory = 226000;
#elif CUDA_ARCH == 890
@@ -321,5 +321,5 @@ int cuda_get_max_shared_memory(uint32_t gpu_index) {
gpu_index);
check_cuda_error(cudaGetLastError());
#endif
return max_shared_memory;
return (uint32_t)(max_shared_memory);
}

View File

@@ -505,7 +505,6 @@ __host__ void host_integer_div_rem_kb(
auto num_blocks = quotient->num_radix_blocks;
if (is_signed) {
auto radix_params = int_mem_ptr->params;
uint32_t big_lwe_size = radix_params.big_lwe_dimension + 1;
// temporary memory
auto positive_numerator = int_mem_ptr->positive_numerator;

View File

@@ -251,7 +251,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
/// Here it is important to query the default max shared memory on device 0
/// instead of cuda_get_max_shared_memory,
/// to avoid bugs with tree_add_chunks trying to use too much shared memory
int max_shared_memory = 0;
auto max_shared_memory = 0;
check_cuda_error(cudaDeviceGetAttribute(
&max_shared_memory, cudaDevAttrMaxSharedMemoryPerBlock, 0));

View File

@@ -118,7 +118,7 @@ void cuda_fourier_polynomial_mul(void *stream_v, uint32_t gpu_index,
int gridSize = total_polynomials;
int blockSize = polynomial_size / choose_opt_amortized(polynomial_size);
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
double2 *buffer;
switch (polynomial_size) {

View File

@@ -145,7 +145,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
cuda_memcpy_async_to_gpu(d_bsk, h_bsk, buffer_size, stream, gpu_index);
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
double2 *buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
switch (polynomial_size) {

View File

@@ -233,7 +233,7 @@ uint64_t get_buffer_size_partial_sm_programmable_bootstrap_amortized(
template <typename Torus>
uint64_t get_buffer_size_programmable_bootstrap_amortized(
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t input_lwe_ciphertext_count, int max_shared_memory) {
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
uint64_t full_sm =
get_buffer_size_full_sm_programmable_bootstrap_amortized<Torus>(
@@ -264,7 +264,7 @@ __host__ void scratch_programmable_bootstrap_amortized(
uint64_t partial_sm =
get_buffer_size_partial_sm_programmable_bootstrap_amortized<Torus>(
polynomial_size);
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) {
cudaFuncSetAttribute(
device_programmable_bootstrap_amortized<Torus, params, PARTIALSM>,
@@ -311,7 +311,7 @@ __host__ void host_programmable_bootstrap_amortized(
uint64_t DM_FULL = SM_FULL;
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
cuda_set_device(gpu_index);
// Create a 1-dimensional grid of threads

View File

@@ -199,7 +199,7 @@ __host__ void scratch_programmable_bootstrap_cg(
uint64_t partial_sm =
get_buffer_size_partial_sm_programmable_bootstrap_cg<Torus>(
polynomial_size);
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_cg<Torus, params, PARTIALSM>,
@@ -246,7 +246,7 @@ __host__ void host_programmable_bootstrap_cg(
get_buffer_size_partial_sm_programmable_bootstrap_cg<Torus>(
polynomial_size);
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
cuda_set_device(gpu_index);
uint64_t full_dm = full_sm;
@@ -301,7 +301,7 @@ __host__ void host_programmable_bootstrap_cg(
template <typename Torus, class params>
__host__ bool verify_cuda_programmable_bootstrap_cg_grid_size(
int glwe_dimension, int level_count, int num_samples,
int max_shared_memory) {
uint32_t max_shared_memory) {
// If Cooperative Groups is not supported, no need to check anything else
if (!cuda_check_support_cooperative_groups())
@@ -347,7 +347,7 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_grid_size(
template <typename Torus>
__host__ bool supports_cooperative_groups_on_programmable_bootstrap(
int glwe_dimension, int polynomial_size, int level_count, int num_samples,
int max_shared_memory) {
uint32_t max_shared_memory) {
switch (polynomial_size) {
case 256:
return verify_cuda_programmable_bootstrap_cg_grid_size<

View File

@@ -227,7 +227,7 @@ __host__ void scratch_cg_multi_bit_programmable_bootstrap(
get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap<Torus>(
polynomial_size);
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
if (max_shared_memory < full_sm_keybundle) {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
@@ -312,7 +312,7 @@ __host__ void execute_cg_external_product_loop(
uint64_t no_dm = 0;
auto lwe_chunk_size = buffer->lwe_chunk_size;
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
uint32_t keybundle_size_per_input =
lwe_chunk_size * level_count * (glwe_dimension + 1) *
@@ -409,7 +409,7 @@ __host__ void host_cg_multi_bit_programmable_bootstrap(
template <typename Torus, class params>
__host__ bool verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size(
int glwe_dimension, int level_count, int num_samples,
int max_shared_memory) {
uint32_t max_shared_memory) {
// If Cooperative Groups is not supported, no need to check anything else
if (!cuda_check_support_cooperative_groups())
@@ -460,7 +460,7 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size(
template <typename Torus>
__host__ bool supports_cooperative_groups_on_multibit_programmable_bootstrap(
int glwe_dimension, int polynomial_size, int level_count, int num_samples,
int max_shared_memory) {
uint32_t max_shared_memory) {
switch (polynomial_size) {
case 256:
return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size<

View File

@@ -9,18 +9,16 @@ bool has_support_to_cuda_programmable_bootstrap_cg(uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t level_count,
uint32_t num_samples,
int max_shared_memory) {
uint32_t max_shared_memory) {
return supports_cooperative_groups_on_programmable_bootstrap<Torus>(
glwe_dimension, polynomial_size, level_count, num_samples,
max_shared_memory);
}
template <typename Torus>
bool has_support_to_cuda_programmable_bootstrap_tbc(uint32_t num_samples,
uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t level_count,
int max_shared_memory) {
bool has_support_to_cuda_programmable_bootstrap_tbc(
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t max_shared_memory) {
#if CUDA_ARCH >= 900
switch (polynomial_size) {
case 256:
@@ -324,7 +322,7 @@ void scratch_cuda_programmable_bootstrap_32(
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
#if (CUDA_ARCH >= 900)
if (has_support_to_cuda_programmable_bootstrap_tbc<uint32_t>(
input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
@@ -359,7 +357,7 @@ void scratch_cuda_programmable_bootstrap_64(
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
#if (CUDA_ARCH >= 900)
if (has_support_to_cuda_programmable_bootstrap_tbc<uint64_t>(
input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
@@ -732,7 +730,7 @@ void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index,
template bool has_support_to_cuda_programmable_bootstrap_cg<uint64_t>(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t num_samples, int max_shared_memory);
uint32_t num_samples, uint32_t max_shared_memory);
template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint64_t>(
void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
@@ -798,10 +796,10 @@ template void scratch_cuda_programmable_bootstrap<uint32_t>(
template bool has_support_to_cuda_programmable_bootstrap_tbc<uint32_t>(
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, int max_shared_memory);
uint32_t level_count, uint32_t max_shared_memory);
template bool has_support_to_cuda_programmable_bootstrap_tbc<uint64_t>(
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, int max_shared_memory);
uint32_t level_count, uint32_t max_shared_memory);
#if CUDA_ARCH >= 900
template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint32_t>(

View File

@@ -262,7 +262,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
template <typename Torus>
uint64_t get_buffer_size_programmable_bootstrap(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, int max_shared_memory) {
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
uint64_t full_sm_step_one =
get_buffer_size_full_sm_programmable_bootstrap_step_one<Torus>(
@@ -316,7 +316,7 @@ __host__ void scratch_programmable_bootstrap(
uint64_t partial_sm =
get_buffer_size_partial_sm_programmable_bootstrap<Torus>(polynomial_size);
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
// Configure step one
if (max_shared_memory >= partial_sm && max_shared_memory < full_sm_step_one) {
@@ -372,7 +372,7 @@ __host__ void execute_step_one(
uint32_t level_count, int8_t *d_mem, int lwe_iteration, uint64_t partial_sm,
uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm) {
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
cuda_set_device(gpu_index);
int thds = polynomial_size / params::opt;
dim3 grid(input_lwe_ciphertext_count, glwe_dimension + 1, level_count);
@@ -414,7 +414,7 @@ __host__ void execute_step_two(
uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm,
uint32_t num_many_lut, uint32_t lut_stride) {
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
cuda_set_device(gpu_index);
int thds = polynomial_size / params::opt;
dim3 grid(input_lwe_ciphertext_count, glwe_dimension + 1);

View File

@@ -299,7 +299,7 @@ __host__ void scratch_programmable_bootstrap_128(
get_buffer_size_partial_sm_programmable_bootstrap<__uint128_t>(
polynomial_size);
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
// Configure step one
if (max_shared_memory >= partial_sm && max_shared_memory < full_sm_step_one) {
@@ -359,7 +359,7 @@ __host__ void execute_step_one_128(
uint32_t level_count, int8_t *d_mem, int lwe_iteration, uint64_t partial_sm,
uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm) {
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
cuda_set_device(gpu_index);
int thds = polynomial_size / params::opt;
dim3 grid(input_lwe_ciphertext_count, glwe_dimension + 1, level_count);
@@ -401,7 +401,7 @@ __host__ void execute_step_two_128(
uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm,
uint32_t num_many_lut, uint32_t lut_stride) {
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
cuda_set_device(gpu_index);
int thds = polynomial_size / params::opt;
dim3 grid(input_lwe_ciphertext_count, glwe_dimension + 1);

View File

@@ -9,7 +9,7 @@
bool has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t num_samples, int max_shared_memory) {
uint32_t num_samples, uint32_t max_shared_memory) {
return supports_cooperative_groups_on_multibit_programmable_bootstrap<
uint64_t>(glwe_dimension, polynomial_size, level_count, num_samples,
max_shared_memory);
@@ -18,7 +18,7 @@ bool has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
template <typename Torus>
bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit(
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, int max_shared_memory) {
uint32_t level_count, uint32_t max_shared_memory) {
#if CUDA_ARCH >= 900
switch (polynomial_size) {
case 256:
@@ -449,7 +449,7 @@ uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
polynomial_size);
int max_blocks_per_sm;
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
cuda_set_device(gpu_index);
if (max_shared_memory < full_sm_keybundle)
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
@@ -532,7 +532,7 @@ cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
template bool
has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t>(
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, int max_shared_memory);
uint32_t level_count, uint32_t max_shared_memory);
#if (CUDA_ARCH >= 900)
template <typename Torus>

View File

@@ -390,7 +390,7 @@ __host__ void scratch_multi_bit_programmable_bootstrap(
cuda_set_device(gpu_index);
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
uint64_t full_sm_keybundle =
get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<Torus>(
polynomial_size);
@@ -509,7 +509,7 @@ __host__ void execute_compute_keybundle(
uint64_t full_sm_keybundle =
get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<Torus>(
polynomial_size);
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
auto d_mem = buffer->d_mem_keybundle;
auto keybundle_fft = buffer->keybundle_fft;
@@ -553,7 +553,7 @@ execute_step_one(cudaStream_t stream, uint32_t gpu_index,
uint64_t partial_sm_accumulate_step_one =
get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one<
Torus>(polynomial_size);
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
//
auto d_mem = buffer->d_mem_acc_step_one;
@@ -607,7 +607,7 @@ execute_step_two(cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
uint64_t full_sm_accumulate_step_two =
get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two<Torus>(
polynomial_size);
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
auto d_mem = buffer->d_mem_acc_step_two;
auto keybundle_fft = buffer->keybundle_fft;

View File

@@ -203,7 +203,7 @@ __host__ void scratch_programmable_bootstrap_tbc(
cuda_set_device(gpu_index);
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
bool supports_dsm =
supports_distributed_shared_memory_on_classic_programmable_bootstrap<
Torus>(polynomial_size, max_shared_memory);
@@ -266,7 +266,7 @@ __host__ void host_programmable_bootstrap_tbc(
uint32_t num_many_lut, uint32_t lut_stride) {
cuda_set_device(gpu_index);
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
auto supports_dsm =
supports_distributed_shared_memory_on_classic_programmable_bootstrap<
Torus>(polynomial_size, max_shared_memory);
@@ -344,7 +344,7 @@ __host__ void host_programmable_bootstrap_tbc(
template <typename Torus, class params>
__host__ bool verify_cuda_programmable_bootstrap_tbc_grid_size(
int glwe_dimension, int level_count, int num_samples,
int max_shared_memory) {
uint32_t max_shared_memory) {
// If Cooperative Groups is not supported, no need to check anything else
if (!cuda_check_support_cooperative_groups())
@@ -388,7 +388,7 @@ __host__ bool verify_cuda_programmable_bootstrap_tbc_grid_size(
template <typename Torus>
bool supports_distributed_shared_memory_on_classic_programmable_bootstrap(
uint32_t polynomial_size, int max_shared_memory) {
uint32_t polynomial_size, uint32_t max_shared_memory) {
uint64_t minimum_sm =
get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap<Torus>(
polynomial_size);
@@ -405,7 +405,7 @@ bool supports_distributed_shared_memory_on_classic_programmable_bootstrap(
template <typename Torus, class params>
__host__ bool supports_thread_block_clusters_on_classic_programmable_bootstrap(
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, int max_shared_memory) {
uint32_t level_count, uint32_t max_shared_memory) {
if (!cuda_check_support_thread_block_clusters() || num_samples > 128)
return false;

View File

@@ -206,7 +206,7 @@ __host__ void scratch_tbc_multi_bit_programmable_bootstrap(
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
cuda_set_device(gpu_index);
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
bool supports_dsm =
supports_distributed_shared_memory_on_multibit_programmable_bootstrap<
Torus>(polynomial_size, max_shared_memory);
@@ -305,7 +305,7 @@ __host__ void execute_tbc_external_product_loop(
auto lwe_chunk_size = buffer->lwe_chunk_size;
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
auto supports_dsm =
supports_distributed_shared_memory_on_multibit_programmable_bootstrap<
Torus>(polynomial_size, max_shared_memory);
@@ -426,7 +426,7 @@ __host__ void host_tbc_multi_bit_programmable_bootstrap(
template <typename Torus>
bool supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
uint32_t polynomial_size, int max_shared_memory) {
uint32_t polynomial_size, uint32_t max_shared_memory) {
uint64_t minimum_sm =
get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap<Torus>(
polynomial_size);
@@ -443,7 +443,7 @@ bool supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
template <typename Torus, class params>
__host__ bool supports_thread_block_clusters_on_multibit_programmable_bootstrap(
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, int max_shared_memory) {
uint32_t level_count, uint32_t max_shared_memory) {
if (!cuda_check_support_thread_block_clusters())
return false;
@@ -518,5 +518,5 @@ __host__ bool supports_thread_block_clusters_on_multibit_programmable_bootstrap(
template bool
supports_distributed_shared_memory_on_multibit_programmable_bootstrap<uint64_t>(
uint32_t polynomial_size, int max_shared_memory);
uint32_t polynomial_size, uint32_t max_shared_memory);
#endif // FASTMULTIBIT_PBS_H

View File

@@ -1639,7 +1639,7 @@ unsafe extern "C" {
polynomial_size: u32,
level_count: u32,
num_samples: u32,
max_shared_memory: ffi::c_int,
max_shared_memory: u32,
) -> bool;
}
unsafe extern "C" {