chore(gpu): supress warnings in pcc_gpu

2026-01-09 14:47:56 -05:00 · 2025-03-19 17:44:11 +01:00
parent 4928b1354e
commit b1008824e2
21 changed files with 58 additions and 62 deletions
--- a/backends/tfhe-cuda-backend/cuda/include/device.h
+++ b/backends/tfhe-cuda-backend/cuda/include/device.h
@@ -76,7 +76,7 @@ void cuda_drop(void *ptr, uint32_t gpu_index);
 void cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index);
 }

-int cuda_get_max_shared_memory(uint32_t gpu_index);
+uint32_t cuda_get_max_shared_memory(uint32_t gpu_index);

 bool cuda_check_support_cooperative_groups();

--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
@@ -3769,7 +3769,6 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
  void init_temporary_buffers(cudaStream_t const *streams,
                              uint32_t const *gpu_indexes, uint32_t gpu_count,
                              uint32_t num_blocks) {
-    uint32_t big_lwe_size = params.big_lwe_dimension + 1;

    // non boolean temporary arrays, with `num_blocks` blocks
    remainder1 = new CudaRadixCiphertextFFI;
--- a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h
@@ -5,12 +5,12 @@

 template <typename Torus>
 bool supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
-    uint32_t polynomial_size, int max_shared_memory);
+    uint32_t polynomial_size, uint32_t max_shared_memory);

 template <typename Torus>
 bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit(
    uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t level_count, int max_shared_memory);
+    uint32_t level_count, uint32_t max_shared_memory);

 #if CUDA_ARCH >= 900
 template <typename Torus>
--- a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h
@@ -67,7 +67,7 @@ get_buffer_size_partial_sm_programmable_bootstrap_cg(uint32_t polynomial_size) {

 template <typename Torus>
 bool supports_distributed_shared_memory_on_classic_programmable_bootstrap(
-    uint32_t polynomial_size, int max_shared_memory);
+    uint32_t polynomial_size, uint32_t max_shared_memory);

 template <typename Torus, PBS_TYPE pbs_type> struct pbs_buffer;

@@ -400,7 +400,7 @@ bool has_support_to_cuda_programmable_bootstrap_cg(uint32_t glwe_dimension,
                                                   uint32_t polynomial_size,
                                                   uint32_t level_count,
                                                   uint32_t num_samples,
-                                                   int max_shared_memory);
+                                                   uint32_t max_shared_memory);

 template <typename Torus>
 void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
--- a/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h
+++ b/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h
@@ -8,7 +8,7 @@ extern "C" {

 bool has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t num_samples, int max_shared_memory);
+    uint32_t num_samples, uint32_t max_shared_memory);

 void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
    void *stream, uint32_t gpu_index, void *dest, void const *src,
--- a/backends/tfhe-cuda-backend/cuda/src/device.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/device.cu
@@ -304,8 +304,8 @@ void cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index) {
 }

 /// Get the maximum size for the shared memory
-int cuda_get_max_shared_memory(uint32_t gpu_index) {
-  int max_shared_memory = 0;
+uint32_t cuda_get_max_shared_memory(uint32_t gpu_index) {
+  auto max_shared_memory = 0;
 #if CUDA_ARCH == 900
  max_shared_memory = 226000;
 #elif CUDA_ARCH == 890
@@ -321,5 +321,5 @@ int cuda_get_max_shared_memory(uint32_t gpu_index) {
                         gpu_index);
  check_cuda_error(cudaGetLastError());
 #endif
-  return max_shared_memory;
+  return (uint32_t)(max_shared_memory);
 }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh
@@ -505,7 +505,6 @@ __host__ void host_integer_div_rem_kb(
  auto num_blocks = quotient->num_radix_blocks;
  if (is_signed) {
    auto radix_params = int_mem_ptr->params;
-    uint32_t big_lwe_size = radix_params.big_lwe_dimension + 1;

    // temporary memory
    auto positive_numerator = int_mem_ptr->positive_numerator;
--- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
@@ -251,7 +251,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
  /// Here it is important to query the default max shared memory on device 0
  /// instead of cuda_get_max_shared_memory,
  /// to avoid bugs with tree_add_chunks trying to use too much shared memory
-  int max_shared_memory = 0;
+  auto max_shared_memory = 0;
  check_cuda_error(cudaDeviceGetAttribute(
      &max_shared_memory, cudaDevAttrMaxSharedMemoryPerBlock, 0));

--- a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cu
@@ -118,7 +118,7 @@ void cuda_fourier_polynomial_mul(void *stream_v, uint32_t gpu_index,
  int gridSize = total_polynomials;
  int blockSize = polynomial_size / choose_opt_amortized(polynomial_size);

-  int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
+  auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);

  double2 *buffer;
  switch (polynomial_size) {
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh
@@ -145,7 +145,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,

  cuda_memcpy_async_to_gpu(d_bsk, h_bsk, buffer_size, stream, gpu_index);

-  int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
+  auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);

  double2 *buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
  switch (polynomial_size) {
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh
@@ -233,7 +233,7 @@ uint64_t get_buffer_size_partial_sm_programmable_bootstrap_amortized(
 template <typename Torus>
 uint64_t get_buffer_size_programmable_bootstrap_amortized(
    uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t input_lwe_ciphertext_count, int max_shared_memory) {
+    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {

  uint64_t full_sm =
      get_buffer_size_full_sm_programmable_bootstrap_amortized<Torus>(
@@ -264,7 +264,7 @@ __host__ void scratch_programmable_bootstrap_amortized(
  uint64_t partial_sm =
      get_buffer_size_partial_sm_programmable_bootstrap_amortized<Torus>(
          polynomial_size);
-  int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
+  auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
  if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) {
    cudaFuncSetAttribute(
        device_programmable_bootstrap_amortized<Torus, params, PARTIALSM>,
@@ -311,7 +311,7 @@ __host__ void host_programmable_bootstrap_amortized(

  uint64_t DM_FULL = SM_FULL;

-  int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
+  auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
  cuda_set_device(gpu_index);

  // Create a 1-dimensional grid of threads
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh
@@ -199,7 +199,7 @@ __host__ void scratch_programmable_bootstrap_cg(
  uint64_t partial_sm =
      get_buffer_size_partial_sm_programmable_bootstrap_cg<Torus>(
          polynomial_size);
-  int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
+  auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
  if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) {
    check_cuda_error(cudaFuncSetAttribute(
        device_programmable_bootstrap_cg<Torus, params, PARTIALSM>,
@@ -246,7 +246,7 @@ __host__ void host_programmable_bootstrap_cg(
      get_buffer_size_partial_sm_programmable_bootstrap_cg<Torus>(
          polynomial_size);

-  int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
+  auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
  cuda_set_device(gpu_index);

  uint64_t full_dm = full_sm;
@@ -301,7 +301,7 @@ __host__ void host_programmable_bootstrap_cg(
 template <typename Torus, class params>
 __host__ bool verify_cuda_programmable_bootstrap_cg_grid_size(
    int glwe_dimension, int level_count, int num_samples,
-    int max_shared_memory) {
+    uint32_t max_shared_memory) {

  // If Cooperative Groups is not supported, no need to check anything else
  if (!cuda_check_support_cooperative_groups())
@@ -347,7 +347,7 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_grid_size(
 template <typename Torus>
 __host__ bool supports_cooperative_groups_on_programmable_bootstrap(
    int glwe_dimension, int polynomial_size, int level_count, int num_samples,
-    int max_shared_memory) {
+    uint32_t max_shared_memory) {
  switch (polynomial_size) {
  case 256:
    return verify_cuda_programmable_bootstrap_cg_grid_size<
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
@@ -227,7 +227,7 @@ __host__ void scratch_cg_multi_bit_programmable_bootstrap(
      get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap<Torus>(
          polynomial_size);

-  int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
+  auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
  if (max_shared_memory < full_sm_keybundle) {
    check_cuda_error(cudaFuncSetAttribute(
        device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
@@ -312,7 +312,7 @@ __host__ void execute_cg_external_product_loop(
  uint64_t no_dm = 0;

  auto lwe_chunk_size = buffer->lwe_chunk_size;
-  int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
+  auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);

  uint32_t keybundle_size_per_input =
      lwe_chunk_size * level_count * (glwe_dimension + 1) *
@@ -409,7 +409,7 @@ __host__ void host_cg_multi_bit_programmable_bootstrap(
 template <typename Torus, class params>
 __host__ bool verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size(
    int glwe_dimension, int level_count, int num_samples,
-    int max_shared_memory) {
+    uint32_t max_shared_memory) {

  // If Cooperative Groups is not supported, no need to check anything else
  if (!cuda_check_support_cooperative_groups())
@@ -460,7 +460,7 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size(
 template <typename Torus>
 __host__ bool supports_cooperative_groups_on_multibit_programmable_bootstrap(
    int glwe_dimension, int polynomial_size, int level_count, int num_samples,
-    int max_shared_memory) {
+    uint32_t max_shared_memory) {
  switch (polynomial_size) {
  case 256:
    return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size<
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu
@@ -9,18 +9,16 @@ bool has_support_to_cuda_programmable_bootstrap_cg(uint32_t glwe_dimension,
                                                   uint32_t polynomial_size,
                                                   uint32_t level_count,
                                                   uint32_t num_samples,
-                                                   int max_shared_memory) {
+                                                   uint32_t max_shared_memory) {
  return supports_cooperative_groups_on_programmable_bootstrap<Torus>(
      glwe_dimension, polynomial_size, level_count, num_samples,
      max_shared_memory);
 }

 template <typename Torus>
-bool has_support_to_cuda_programmable_bootstrap_tbc(uint32_t num_samples,
-                                                    uint32_t glwe_dimension,
-                                                    uint32_t polynomial_size,
-                                                    uint32_t level_count,
-                                                    int max_shared_memory) {
+bool has_support_to_cuda_programmable_bootstrap_tbc(
+    uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t level_count, uint32_t max_shared_memory) {
 #if CUDA_ARCH >= 900
  switch (polynomial_size) {
  case 256:
@@ -324,7 +322,7 @@ void scratch_cuda_programmable_bootstrap_32(
    uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {

-  int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
+  auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
 #if (CUDA_ARCH >= 900)
  if (has_support_to_cuda_programmable_bootstrap_tbc<uint32_t>(
          input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
@@ -359,7 +357,7 @@ void scratch_cuda_programmable_bootstrap_64(
    uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {

-  int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
+  auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
 #if (CUDA_ARCH >= 900)
  if (has_support_to_cuda_programmable_bootstrap_tbc<uint64_t>(
          input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
@@ -732,7 +730,7 @@ void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index,

 template bool has_support_to_cuda_programmable_bootstrap_cg<uint64_t>(
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t num_samples, int max_shared_memory);
+    uint32_t num_samples, uint32_t max_shared_memory);

 template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint64_t>(
    void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
@@ -798,10 +796,10 @@ template void scratch_cuda_programmable_bootstrap<uint32_t>(

 template bool has_support_to_cuda_programmable_bootstrap_tbc<uint32_t>(
    uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t level_count, int max_shared_memory);
+    uint32_t level_count, uint32_t max_shared_memory);
 template bool has_support_to_cuda_programmable_bootstrap_tbc<uint64_t>(
    uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t level_count, int max_shared_memory);
+    uint32_t level_count, uint32_t max_shared_memory);

 #if CUDA_ARCH >= 900
 template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint32_t>(
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh
@@ -262,7 +262,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
 template <typename Torus>
 uint64_t get_buffer_size_programmable_bootstrap(
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, int max_shared_memory) {
+    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {

  uint64_t full_sm_step_one =
      get_buffer_size_full_sm_programmable_bootstrap_step_one<Torus>(
@@ -316,7 +316,7 @@ __host__ void scratch_programmable_bootstrap(
  uint64_t partial_sm =
      get_buffer_size_partial_sm_programmable_bootstrap<Torus>(polynomial_size);

-  int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
+  auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);

  // Configure step one
  if (max_shared_memory >= partial_sm && max_shared_memory < full_sm_step_one) {
@@ -372,7 +372,7 @@ __host__ void execute_step_one(
    uint32_t level_count, int8_t *d_mem, int lwe_iteration, uint64_t partial_sm,
    uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm) {

-  int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
+  auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
  cuda_set_device(gpu_index);
  int thds = polynomial_size / params::opt;
  dim3 grid(input_lwe_ciphertext_count, glwe_dimension + 1, level_count);
@@ -414,7 +414,7 @@ __host__ void execute_step_two(
    uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm,
    uint32_t num_many_lut, uint32_t lut_stride) {

-  int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
+  auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
  cuda_set_device(gpu_index);
  int thds = polynomial_size / params::opt;
  dim3 grid(input_lwe_ciphertext_count, glwe_dimension + 1);
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic_128.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic_128.cuh
@@ -299,7 +299,7 @@ __host__ void scratch_programmable_bootstrap_128(
      get_buffer_size_partial_sm_programmable_bootstrap<__uint128_t>(
          polynomial_size);

-  int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
+  auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);

  // Configure step one
  if (max_shared_memory >= partial_sm && max_shared_memory < full_sm_step_one) {
@@ -359,7 +359,7 @@ __host__ void execute_step_one_128(
    uint32_t level_count, int8_t *d_mem, int lwe_iteration, uint64_t partial_sm,
    uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm) {

-  int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
+  auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
  cuda_set_device(gpu_index);
  int thds = polynomial_size / params::opt;
  dim3 grid(input_lwe_ciphertext_count, glwe_dimension + 1, level_count);
@@ -401,7 +401,7 @@ __host__ void execute_step_two_128(
    uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm,
    uint32_t num_many_lut, uint32_t lut_stride) {

-  int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
+  auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
  cuda_set_device(gpu_index);
  int thds = polynomial_size / params::opt;
  dim3 grid(input_lwe_ciphertext_count, glwe_dimension + 1);
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu
@@ -9,7 +9,7 @@

 bool has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t num_samples, int max_shared_memory) {
+    uint32_t num_samples, uint32_t max_shared_memory) {
  return supports_cooperative_groups_on_multibit_programmable_bootstrap<
      uint64_t>(glwe_dimension, polynomial_size, level_count, num_samples,
                max_shared_memory);
@@ -18,7 +18,7 @@ bool has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
 template <typename Torus>
 bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit(
    uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t level_count, int max_shared_memory) {
+    uint32_t level_count, uint32_t max_shared_memory) {
 #if CUDA_ARCH >= 900
  switch (polynomial_size) {
  case 256:
@@ -449,7 +449,7 @@ uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
          polynomial_size);

  int max_blocks_per_sm;
-  int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
+  auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
  cuda_set_device(gpu_index);
  if (max_shared_memory < full_sm_keybundle)
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
@@ -532,7 +532,7 @@ cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
 template bool
 has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t>(
    uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t level_count, int max_shared_memory);
+    uint32_t level_count, uint32_t max_shared_memory);

 #if (CUDA_ARCH >= 900)
 template <typename Torus>
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
@@ -390,7 +390,7 @@ __host__ void scratch_multi_bit_programmable_bootstrap(

  cuda_set_device(gpu_index);

-  int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
+  auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
  uint64_t full_sm_keybundle =
      get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<Torus>(
          polynomial_size);
@@ -509,7 +509,7 @@ __host__ void execute_compute_keybundle(
  uint64_t full_sm_keybundle =
      get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<Torus>(
          polynomial_size);
-  int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
+  auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);

  auto d_mem = buffer->d_mem_keybundle;
  auto keybundle_fft = buffer->keybundle_fft;
@@ -553,7 +553,7 @@ execute_step_one(cudaStream_t stream, uint32_t gpu_index,
  uint64_t partial_sm_accumulate_step_one =
      get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one<
          Torus>(polynomial_size);
-  int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
+  auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);

  //
  auto d_mem = buffer->d_mem_acc_step_one;
@@ -607,7 +607,7 @@ execute_step_two(cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
  uint64_t full_sm_accumulate_step_two =
      get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two<Torus>(
          polynomial_size);
-  int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
+  auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);

  auto d_mem = buffer->d_mem_acc_step_two;
  auto keybundle_fft = buffer->keybundle_fft;
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh
@@ -203,7 +203,7 @@ __host__ void scratch_programmable_bootstrap_tbc(

  cuda_set_device(gpu_index);

-  int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
+  auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
  bool supports_dsm =
      supports_distributed_shared_memory_on_classic_programmable_bootstrap<
          Torus>(polynomial_size, max_shared_memory);
@@ -266,7 +266,7 @@ __host__ void host_programmable_bootstrap_tbc(
    uint32_t num_many_lut, uint32_t lut_stride) {
  cuda_set_device(gpu_index);

-  int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
+  auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
  auto supports_dsm =
      supports_distributed_shared_memory_on_classic_programmable_bootstrap<
          Torus>(polynomial_size, max_shared_memory);
@@ -344,7 +344,7 @@ __host__ void host_programmable_bootstrap_tbc(
 template <typename Torus, class params>
 __host__ bool verify_cuda_programmable_bootstrap_tbc_grid_size(
    int glwe_dimension, int level_count, int num_samples,
-    int max_shared_memory) {
+    uint32_t max_shared_memory) {

  // If Cooperative Groups is not supported, no need to check anything else
  if (!cuda_check_support_cooperative_groups())
@@ -388,7 +388,7 @@ __host__ bool verify_cuda_programmable_bootstrap_tbc_grid_size(

 template <typename Torus>
 bool supports_distributed_shared_memory_on_classic_programmable_bootstrap(
-    uint32_t polynomial_size, int max_shared_memory) {
+    uint32_t polynomial_size, uint32_t max_shared_memory) {
  uint64_t minimum_sm =
      get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap<Torus>(
          polynomial_size);
@@ -405,7 +405,7 @@ bool supports_distributed_shared_memory_on_classic_programmable_bootstrap(
 template <typename Torus, class params>
 __host__ bool supports_thread_block_clusters_on_classic_programmable_bootstrap(
    uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t level_count, int max_shared_memory) {
+    uint32_t level_count, uint32_t max_shared_memory) {

  if (!cuda_check_support_thread_block_clusters() || num_samples > 128)
    return false;
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh
@@ -206,7 +206,7 @@ __host__ void scratch_tbc_multi_bit_programmable_bootstrap(
    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
  cuda_set_device(gpu_index);

-  int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
+  auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
  bool supports_dsm =
      supports_distributed_shared_memory_on_multibit_programmable_bootstrap<
          Torus>(polynomial_size, max_shared_memory);
@@ -305,7 +305,7 @@ __host__ void execute_tbc_external_product_loop(

  auto lwe_chunk_size = buffer->lwe_chunk_size;

-  int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
+  auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
  auto supports_dsm =
      supports_distributed_shared_memory_on_multibit_programmable_bootstrap<
          Torus>(polynomial_size, max_shared_memory);
@@ -426,7 +426,7 @@ __host__ void host_tbc_multi_bit_programmable_bootstrap(

 template <typename Torus>
 bool supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
-    uint32_t polynomial_size, int max_shared_memory) {
+    uint32_t polynomial_size, uint32_t max_shared_memory) {
  uint64_t minimum_sm =
      get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap<Torus>(
          polynomial_size);
@@ -443,7 +443,7 @@ bool supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
 template <typename Torus, class params>
 __host__ bool supports_thread_block_clusters_on_multibit_programmable_bootstrap(
    uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t level_count, int max_shared_memory) {
+    uint32_t level_count, uint32_t max_shared_memory) {

  if (!cuda_check_support_thread_block_clusters())
    return false;
@@ -518,5 +518,5 @@ __host__ bool supports_thread_block_clusters_on_multibit_programmable_bootstrap(

 template bool
 supports_distributed_shared_memory_on_multibit_programmable_bootstrap<uint64_t>(
-    uint32_t polynomial_size, int max_shared_memory);
+    uint32_t polynomial_size, uint32_t max_shared_memory);
 #endif // FASTMULTIBIT_PBS_H
--- a/backends/tfhe-cuda-backend/src/bindings.rs
+++ b/backends/tfhe-cuda-backend/src/bindings.rs
@@ -1639,7 +1639,7 @@ unsafe extern "C" {
        polynomial_size: u32,
        level_count: u32,
        num_samples: u32,
-        max_shared_memory: ffi::c_int,
+        max_shared_memory: u32,
    ) -> bool;
 }
 unsafe extern "C" {