Mirror of https://github.com/zama-ai/tfhe-rs.git, synced 2026-01-10 07:08:03 -05:00
chore(gpu): encapsulate cudaSetDevice
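The change is mechanical: every raw `cudaSetDevice(...)` call, whether bare or wrapped in `check_cuda_error(...)`, is replaced by the new `cuda_set_device` helper, declared in the backend header (first hunk below) and defined alongside the other `cuda_*` utilities, so device selection and its error handling live in a single place. For orientation, here is a minimal, self-contained sketch of that pattern; the `check_cuda_error` macro below is only a stand-in for the repository's own error-handling macro (its abort-on-failure behaviour is taken from the diff context), and `cuda_malloc_sketch` is a hypothetical caller, not a function from the codebase.

// Minimal sketch of the encapsulation this commit applies (not the exact
// tfhe-rs sources). check_cuda_error is an assumed stand-in for the repo's
// macro; cuda_malloc_sketch is a hypothetical caller used for illustration.
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

#define check_cuda_error(ans)                                                  \
  do {                                                                         \
    cudaError_t code = (ans);                                                  \
    if (code != cudaSuccess) {                                                 \
      std::fprintf(stderr, "Cuda error %d: %s (%s:%d)\n",                      \
                   static_cast<int>(code), cudaGetErrorString(code), __FILE__, \
                   __LINE__);                                                  \
      std::abort();                                                            \
    }                                                                          \
  } while (0)

// The encapsulated device switch: one definition, checked once.
void cuda_set_device(uint32_t gpu_index) {
  check_cuda_error(cudaSetDevice(gpu_index));
}

// Example call site after the refactor: callers select the device through the
// helper instead of touching cudaSetDevice directly.
void *cuda_malloc_sketch(uint64_t size, uint32_t gpu_index) {
  cuda_set_device(gpu_index);
  void *ptr = nullptr;
  check_cuda_error(cudaMalloc(&ptr, size));
  return ptr;
}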
@@ -27,6 +27,8 @@ inline void cuda_error(cudaError_t code, const char *file, int line) {
 std::abort(); \
 }

+void cuda_set_device(uint32_t gpu_index);
+
 cudaEvent_t cuda_create_event(uint32_t gpu_index);

 void cuda_event_record(cudaEvent_t event, cudaStream_t stream,

@@ -189,7 +189,7 @@ template <typename Torus> struct int_radix_lut {
 active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
 cuda_synchronize_stream(streams[0], gpu_indexes[0]);
 for (uint i = 0; i < active_gpu_count; i++) {
-cudaSetDevice(i);
+cuda_set_device(i);
 int8_t *gpu_pbs_buffer;
 auto num_blocks_on_gpu =
 get_num_inputs_on_gpu(num_radix_blocks, i, active_gpu_count);
@@ -384,7 +384,7 @@ template <typename Torus> struct int_radix_lut {
 active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
 cuda_synchronize_stream(streams[0], gpu_indexes[0]);
 for (uint i = 0; i < active_gpu_count; i++) {
-cudaSetDevice(i);
+cuda_set_device(i);
 int8_t *gpu_pbs_buffer;
 auto num_blocks_on_gpu =
 get_num_inputs_on_gpu(num_radix_blocks, i, active_gpu_count);

@@ -114,7 +114,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
 uint32_t polynomial_size, uint32_t level_count,
 uint32_t input_lwe_ciphertext_count, uint32_t lwe_chunk_size,
 PBS_VARIANT pbs_variant, bool allocate_gpu_memory) {
-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);

 this->pbs_variant = pbs_variant;
 this->lwe_chunk_size = lwe_chunk_size;

@@ -77,7 +77,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
 uint32_t polynomial_size, uint32_t level_count,
 uint32_t input_lwe_ciphertext_count, PBS_VARIANT pbs_variant,
 bool allocate_gpu_memory) {
-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);
 this->pbs_variant = pbs_variant;

 auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);

@@ -11,7 +11,7 @@ void cuda_convert_lwe_ciphertext_vector_to_gpu(cudaStream_t stream,
 uint32_t gpu_index, T *dest,
 T *src, uint32_t number_of_cts,
 uint32_t lwe_dimension) {
-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);
 uint64_t size = number_of_cts * (lwe_dimension + 1) * sizeof(T);
 cuda_memcpy_async_to_gpu(dest, src, size, stream, gpu_index);
 }
@@ -21,7 +21,7 @@ void cuda_convert_lwe_ciphertext_vector_to_cpu(cudaStream_t stream,
 uint32_t gpu_index, T *dest,
 T *src, uint32_t number_of_cts,
 uint32_t lwe_dimension) {
-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);
 uint64_t size = number_of_cts * (lwe_dimension + 1) * sizeof(T);
 cuda_memcpy_async_to_cpu(dest, src, size, stream, gpu_index);
 }
@@ -55,7 +55,7 @@ __host__ void host_sample_extract(cudaStream_t stream, uint32_t gpu_index,
 Torus const *glwe_array_in,
 uint32_t const *nth_array, uint32_t num_nths,
 uint32_t glwe_dimension) {
-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);

 dim3 grid(num_nths);
 dim3 thds(params::degree / params::opt);

@@ -261,7 +261,7 @@ __host__ void host_fast_packing_keyswitch_lwe_list_to_glwe(

 // Optimization of packing keyswitch when packing many LWEs

-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);
 check_cuda_error(cudaGetLastError());

 int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size;

@@ -57,7 +57,7 @@ void batch_fft_ggsw_vector(cudaStream_t *streams, uint32_t *gpu_indexes,
 if (gpu_count != 1)
 PANIC("GPU error (batch_fft_ggsw_vector): multi-GPU execution is not "
 "supported yet.")
-cudaSetDevice(gpu_indexes[0]);
+cuda_set_device(gpu_indexes[0]);

 int shared_memory_size = sizeof(double) * polynomial_size;


@@ -105,7 +105,7 @@ __host__ void host_keyswitch_lwe_ciphertext_vector(
 uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count,
 uint32_t num_samples) {

-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);

 constexpr int num_threads_y = 32;
 int num_blocks, num_threads_x;
@@ -160,7 +160,7 @@ __host__ void scratch_packing_keyswitch_lwe_list_to_glwe(
 cudaStream_t stream, uint32_t gpu_index, int8_t **fp_ks_buffer,
 uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
 uint32_t num_lwes, bool allocate_gpu_memory) {
-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);

 int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size;


@@ -110,7 +110,7 @@ template <typename Torus>
 __host__ void host_modulus_switch_inplace(cudaStream_t stream,
 uint32_t gpu_index, Torus *array,
 int size, uint32_t log_modulus) {
-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);

 int num_threads = 0, num_blocks = 0;
 getNumBlocksAndThreads(size, 1024, num_blocks, num_threads);

@@ -2,8 +2,12 @@
 #include <cstdint>
 #include <cuda_runtime.h>

-cudaEvent_t cuda_create_event(uint32_t gpu_index) {
+void cuda_set_device(uint32_t gpu_index) {
 check_cuda_error(cudaSetDevice(gpu_index));
+}
+
+cudaEvent_t cuda_create_event(uint32_t gpu_index) {
+cuda_set_device(gpu_index);
 cudaEvent_t event;
 check_cuda_error(cudaEventCreate(&event));
 return event;
@@ -11,24 +15,24 @@ cudaEvent_t cuda_create_event(uint32_t gpu_index) {

 void cuda_event_record(cudaEvent_t event, cudaStream_t stream,
 uint32_t gpu_index) {
-check_cuda_error(cudaSetDevice(gpu_index));
+cuda_set_device(gpu_index);
 check_cuda_error(cudaEventRecord(event, stream));
 }

 void cuda_stream_wait_event(cudaStream_t stream, cudaEvent_t event,
 uint32_t gpu_index) {
-check_cuda_error(cudaSetDevice(gpu_index));
+cuda_set_device(gpu_index);
 check_cuda_error(cudaStreamWaitEvent(stream, event, 0));
 }

 void cuda_event_destroy(cudaEvent_t event, uint32_t gpu_index) {
-check_cuda_error(cudaSetDevice(gpu_index));
+cuda_set_device(gpu_index);
 check_cuda_error(cudaEventDestroy(event));
 }

 /// Unsafe function to create a CUDA stream, must check first that GPU exists
 cudaStream_t cuda_create_stream(uint32_t gpu_index) {
-check_cuda_error(cudaSetDevice(gpu_index));
+cuda_set_device(gpu_index);
 cudaStream_t stream;
 check_cuda_error(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
 return stream;
@@ -36,12 +40,12 @@ cudaStream_t cuda_create_stream(uint32_t gpu_index) {

 /// Unsafe function to destroy CUDA stream, must check first the GPU exists
 void cuda_destroy_stream(cudaStream_t stream, uint32_t gpu_index) {
-check_cuda_error(cudaSetDevice(gpu_index));
+cuda_set_device(gpu_index);
 check_cuda_error(cudaStreamDestroy(stream));
 }

 void cuda_synchronize_stream(cudaStream_t stream, uint32_t gpu_index) {
-check_cuda_error(cudaSetDevice(gpu_index));
+cuda_set_device(gpu_index);
 check_cuda_error(cudaStreamSynchronize(stream));
 }

@@ -59,7 +63,7 @@ uint32_t cuda_is_available() { return cudaSetDevice(0) == cudaSuccess; }
 /// or if there's not enough memory. A safe wrapper around it must call
 /// cuda_check_valid_malloc() first
 void *cuda_malloc(uint64_t size, uint32_t gpu_index) {
-check_cuda_error(cudaSetDevice(gpu_index));
+cuda_set_device(gpu_index);
 void *ptr;
 check_cuda_error(cudaMalloc((void **)&ptr, size));

@@ -70,7 +74,7 @@ void *cuda_malloc(uint64_t size, uint32_t gpu_index) {
 /// asynchronously.
 void *cuda_malloc_async(uint64_t size, cudaStream_t stream,
 uint32_t gpu_index) {
-check_cuda_error(cudaSetDevice(gpu_index));
+cuda_set_device(gpu_index);
 void *ptr;

 #ifndef CUDART_VERSION
@@ -93,7 +97,7 @@ void *cuda_malloc_async(uint64_t size, cudaStream_t stream,

 /// Check that allocation is valid
 void cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index) {
-check_cuda_error(cudaSetDevice(gpu_index));
+cuda_set_device(gpu_index);
 size_t total_mem, free_mem;
 check_cuda_error(cudaMemGetInfo(&free_mem, &total_mem));
 if (size > free_mem) {
@@ -141,7 +145,7 @@ void cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
 PANIC("Cuda error: invalid device pointer in async copy to GPU.")
 }

-check_cuda_error(cudaSetDevice(gpu_index));
+cuda_set_device(gpu_index);
 check_cuda_error(
 cudaMemcpyAsync(dest, src, size, cudaMemcpyHostToDevice, stream));
 }
@@ -161,7 +165,7 @@ void cuda_memcpy_async_gpu_to_gpu(void *dest, void const *src, uint64_t size,
 if (attr_src.type != cudaMemoryTypeDevice) {
 PANIC("Cuda error: invalid src device pointer in copy from GPU to GPU.")
 }
-check_cuda_error(cudaSetDevice(gpu_index));
+cuda_set_device(gpu_index);
 if (attr_src.device == attr_dest.device) {
 check_cuda_error(
 cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToDevice, stream));
@@ -186,7 +190,7 @@ void cuda_memcpy_gpu_to_gpu(void *dest, void *src, uint64_t size,
 if (attr_src.type != cudaMemoryTypeDevice) {
 PANIC("Cuda error: invalid src device pointer in copy from GPU to GPU.")
 }
-check_cuda_error(cudaSetDevice(gpu_index));
+cuda_set_device(gpu_index);
 if (attr_src.device == attr_dest.device) {
 check_cuda_error(cudaMemcpy(dest, src, size, cudaMemcpyDeviceToDevice));
 } else {
@@ -197,7 +201,7 @@ void cuda_memcpy_gpu_to_gpu(void *dest, void *src, uint64_t size,

 /// Synchronizes device
 void cuda_synchronize_device(uint32_t gpu_index) {
-check_cuda_error(cudaSetDevice(gpu_index));
+cuda_set_device(gpu_index);
 check_cuda_error(cudaDeviceSynchronize());
 }

@@ -210,7 +214,7 @@ void cuda_memset_async(void *dest, uint64_t val, uint64_t size,
 if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
 PANIC("Cuda error: invalid dest device pointer in cuda memset.")
 }
-check_cuda_error(cudaSetDevice(gpu_index));
+cuda_set_device(gpu_index);
 check_cuda_error(cudaMemsetAsync(dest, val, size, stream));
 }

@@ -230,7 +234,7 @@ void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
 if (attr.type != cudaMemoryTypeDevice) {
 PANIC("Cuda error: invalid dest device pointer in cuda set value.")
 }
-check_cuda_error(cudaSetDevice(gpu_index));
+cuda_set_device(gpu_index);
 int block_size = 256;
 int num_blocks = (n + block_size - 1) / block_size;

@@ -260,7 +264,7 @@ void cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
 PANIC("Cuda error: invalid src device pointer in copy to CPU async.")
 }

-check_cuda_error(cudaSetDevice(gpu_index));
+cuda_set_device(gpu_index);
 check_cuda_error(
 cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToHost, stream));
 }
@@ -274,14 +278,14 @@ int cuda_get_number_of_gpus() {

 /// Drop a cuda array
 void cuda_drop(void *ptr, uint32_t gpu_index) {
-check_cuda_error(cudaSetDevice(gpu_index));
+cuda_set_device(gpu_index);
 check_cuda_error(cudaFree(ptr));
 }

 /// Drop a cuda array asynchronously, if supported on the device
 void cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index) {

-check_cuda_error(cudaSetDevice(gpu_index));
+cuda_set_device(gpu_index);
 #ifndef CUDART_VERSION
 #error CUDART_VERSION Undefined!
 #elif (CUDART_VERSION >= 11020)

@@ -12,7 +12,7 @@ __host__ void zero_out_if(cudaStream_t const *streams,
 int_zero_out_if_buffer<Torus> *mem_ptr,
 int_radix_lut<Torus> *predicate, void *const *bsks,
 Torus *const *ksks, uint32_t num_radix_blocks) {
-cudaSetDevice(gpu_indexes[0]);
+cuda_set_device(gpu_indexes[0]);
 auto params = mem_ptr->params;

 // We can't use integer_radix_apply_bivariate_lookup_table_kb since the

@@ -38,7 +38,7 @@ __host__ void accumulate_all_blocks(cudaStream_t stream, uint32_t gpu_index,
 uint32_t lwe_dimension,
 uint32_t num_radix_blocks) {

-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);
 int num_blocks = 0, num_threads = 0;
 int num_entries = (lwe_dimension + 1);
 getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);

@@ -50,7 +50,7 @@ __host__ void host_pack(cudaStream_t stream, uint32_t gpu_index,
 if (array_in == array_out)
 PANIC("Cuda error: Input and output must be different");

-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);
 auto compression_params = mem_ptr->compression_params;

 auto log_modulus = mem_ptr->storage_log_modulus;
@@ -185,7 +185,7 @@ __host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,
 if (array_in == glwe_array_out)
 PANIC("Cuda error: Input and output must be different");

-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);

 auto compression_params = mem_ptr->compression_params;


@@ -78,7 +78,7 @@ host_radix_blocks_rotate_right(cudaStream_t const *streams,
 PANIC("Cuda error (blocks_rotate_right): the source and destination "
 "pointers should be different");
 }
-cudaSetDevice(gpu_indexes[0]);
+cuda_set_device(gpu_indexes[0]);
 radix_blocks_rotate_right<Torus><<<blocks_count, 1024, 0, streams[0]>>>(
 dst, src, value, blocks_count, lwe_size);
 check_cuda_error(cudaGetLastError());
@@ -96,7 +96,7 @@ host_radix_blocks_rotate_left(cudaStream_t const *streams,
 PANIC("Cuda error (blocks_rotate_left): the source and destination "
 "pointers should be different");
 }
-cudaSetDevice(gpu_indexes[0]);
+cuda_set_device(gpu_indexes[0]);
 radix_blocks_rotate_left<Torus><<<blocks_count, 1024, 0, streams[0]>>>(
 dst, src, value, blocks_count, lwe_size);
 check_cuda_error(cudaGetLastError());
@@ -125,7 +125,7 @@ __host__ void
 host_radix_blocks_reverse_inplace(cudaStream_t const *streams,
 uint32_t const *gpu_indexes, Torus *src,
 uint32_t blocks_count, uint32_t lwe_size) {
-cudaSetDevice(gpu_indexes[0]);
+cuda_set_device(gpu_indexes[0]);
 int num_blocks = blocks_count / 2, num_threads = 1024;
 radix_blocks_reverse_lwe_inplace<Torus>
 <<<num_blocks, num_threads, 0, streams[0]>>>(src, blocks_count, lwe_size);
@@ -163,7 +163,7 @@ template <typename Torus>
 __host__ void host_radix_cumulative_sum_in_groups(
 cudaStream_t stream, uint32_t gpu_index, Torus *dest, Torus *src,
 uint32_t radix_blocks_count, uint32_t lwe_size, uint32_t group_size) {
-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);
 // Each CUDA block is responsible for a single group
 int num_blocks = (radix_blocks_count + group_size - 1) / group_size,
 num_threads = 512;
@@ -219,7 +219,7 @@ __host__ void host_radix_split_simulators_and_grouping_pgns(
 cudaStream_t stream, uint32_t gpu_index, Torus *simulators,
 Torus *grouping_pgns, Torus *src, uint32_t radix_blocks_count,
 uint32_t lwe_size, uint32_t group_size, Torus delta) {
-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);
 // Each CUDA block is responsible for a single group
 int num_blocks = radix_blocks_count, num_threads = 512;
 radix_split_simulators_and_grouping_pgns<Torus>
@@ -255,7 +255,7 @@ __host__ void host_radix_sum_in_groups(cudaStream_t stream, uint32_t gpu_index,
 Torus *dest, Torus *src1, Torus *src2,
 uint32_t radix_blocks_count,
 uint32_t lwe_size, uint32_t group_size) {
-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);

 int num_blocks = radix_blocks_count, num_threads = 512;
 radix_sum_in_groups<Torus><<<num_blocks, num_threads, 0, stream>>>(
@@ -297,7 +297,7 @@ pack_bivariate_blocks(cudaStream_t const *streams, uint32_t const *gpu_indexes,
 uint32_t lwe_dimension, uint32_t shift,
 uint32_t num_radix_blocks) {

-cudaSetDevice(gpu_indexes[0]);
+cuda_set_device(gpu_indexes[0]);
 // Left message is shifted
 int num_blocks = 0, num_threads = 0;
 int num_entries = num_radix_blocks * (lwe_dimension + 1);
@@ -341,7 +341,7 @@ __host__ void pack_bivariate_blocks_with_single_block(
 Torus const *lwe_array_1, Torus const *lwe_2, Torus const *lwe_indexes_in,
 uint32_t lwe_dimension, uint32_t shift, uint32_t num_radix_blocks) {

-cudaSetDevice(gpu_indexes[0]);
+cuda_set_device(gpu_indexes[0]);
 // Left message is shifted
 int num_blocks = 0, num_threads = 0;
 int num_entries = num_radix_blocks * (lwe_dimension + 1);
@@ -1361,7 +1361,7 @@ __host__ void pack_blocks(cudaStream_t stream, uint32_t gpu_index,
 uint32_t factor) {
 if (num_radix_blocks == 0)
 return;
-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);
 int num_blocks = 0, num_threads = 0;
 int num_entries = (lwe_dimension + 1);
 getNumBlocksAndThreads(num_entries, 1024, num_blocks, num_threads);
@@ -1392,7 +1392,7 @@ create_trivial_radix(cudaStream_t stream, uint32_t gpu_index,
 uint32_t num_scalar_blocks, Torus message_modulus,
 Torus carry_modulus) {

-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);
 size_t radix_size = (lwe_dimension + 1) * num_radix_blocks;
 cuda_memset_async(lwe_array_out, 0, radix_size * sizeof(Torus), stream,
 gpu_index);

@@ -297,7 +297,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
 ch_amount++;
 dim3 add_grid(ch_amount, num_blocks, 1);

-cudaSetDevice(gpu_indexes[0]);
+cuda_set_device(gpu_indexes[0]);
 tree_add_chunks<Torus><<<add_grid, 512, 0, streams[0]>>>(
 new_blocks, old_blocks, min(r, chunk_size), big_lwe_size, num_blocks);

@@ -541,7 +541,7 @@ __host__ void host_integer_mult_radix_kb(
 dim3 grid(lsb_vector_block_count, 1, 1);
 dim3 thds(params::degree / params::opt, 1, 1);

-cudaSetDevice(gpu_indexes[0]);
+cuda_set_device(gpu_indexes[0]);
 all_shifted_lhs_rhs<Torus, params><<<grid, thds, 0, streams[0]>>>(
 radix_lwe_left, vector_result_lsb, vector_result_msb, radix_lwe_right,
 vector_lsb_rhs, vector_msb_rhs, num_blocks);
@@ -556,7 +556,7 @@ __host__ void host_integer_mult_radix_kb(
 vector_result_msb = &block_mul_res[lsb_vector_block_count *
 (polynomial_size * glwe_dimension + 1)];

-cudaSetDevice(gpu_indexes[0]);
+cuda_set_device(gpu_indexes[0]);
 fill_radix_from_lsb_msb<Torus, params>
 <<<num_blocks * num_blocks, params::degree / params::opt, 0,
 streams[0]>>>(vector_result_sb, vector_result_lsb, vector_result_msb,

@@ -59,7 +59,7 @@ __host__ void host_integer_radix_negation(
 uint32_t gpu_count, Torus *output, Torus const *input,
 uint32_t lwe_dimension, uint32_t input_lwe_ciphertext_count,
 uint64_t message_modulus, uint64_t carry_modulus) {
-cudaSetDevice(gpu_indexes[0]);
+cuda_set_device(gpu_indexes[0]);

 // lwe_size includes the presence of the body
 // whereas lwe_dimension is the number of elements in the mask

@@ -29,7 +29,7 @@ __host__ void host_integer_radix_scalar_addition_inplace(
 uint32_t gpu_count, Torus *lwe_array, Torus const *scalar_input,
 uint32_t lwe_dimension, uint32_t input_lwe_ciphertext_count,
 uint32_t message_modulus, uint32_t carry_modulus) {
-cudaSetDevice(gpu_indexes[0]);
+cuda_set_device(gpu_indexes[0]);

 // Create a 1-dimensional grid of threads
 int num_blocks = 0, num_threads = 0;
@@ -68,7 +68,7 @@ __host__ void host_integer_radix_add_scalar_one_inplace(
 uint32_t gpu_count, Torus *lwe_array, uint32_t lwe_dimension,
 uint32_t input_lwe_ciphertext_count, uint32_t message_modulus,
 uint32_t carry_modulus) {
-cudaSetDevice(gpu_indexes[0]);
+cuda_set_device(gpu_indexes[0]);

 // Create a 1-dimensional grid of threads
 int num_blocks = 0, num_threads = 0;
@@ -108,7 +108,7 @@ __host__ void host_integer_radix_scalar_subtraction_inplace(
 uint32_t gpu_count, Torus *lwe_array, Torus *scalar_input,
 uint32_t lwe_dimension, uint32_t input_lwe_ciphertext_count,
 uint32_t message_modulus, uint32_t carry_modulus) {
-cudaSetDevice(gpu_indexes[0]);
+cuda_set_device(gpu_indexes[0]);

 // Create a 1-dimensional grid of threads
 int num_blocks = 0, num_threads = 0;

@@ -127,7 +127,7 @@ __host__ void host_integer_small_scalar_mul_radix(
 uint32_t gpu_count, T *output_lwe_array, T *input_lwe_array, T scalar,
 uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) {

-cudaSetDevice(gpu_indexes[0]);
+cuda_set_device(gpu_indexes[0]);
 // lwe_size includes the presence of the body
 // whereas lwe_dimension is the number of elements in the mask
 int lwe_size = input_lwe_dimension + 1;

@@ -51,7 +51,7 @@ __host__ void host_addition_plaintext(cudaStream_t stream, uint32_t gpu_index,
 const uint32_t lwe_dimension,
 const uint32_t lwe_ciphertext_count) {

-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);
 int num_blocks = 0, num_threads = 0;
 int num_entries = lwe_ciphertext_count;
 getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
@@ -72,7 +72,7 @@ __host__ void host_addition_plaintext_scalar(
 const T plaintext_input, const uint32_t lwe_dimension,
 const uint32_t lwe_ciphertext_count) {

-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);
 int num_blocks = 0, num_threads = 0;
 int num_entries = lwe_ciphertext_count;
 getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
@@ -106,7 +106,7 @@ __host__ void host_addition(cudaStream_t stream, uint32_t gpu_index,
 CudaRadixCiphertextFFI const *input_1,
 CudaRadixCiphertextFFI const *input_2) {

-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);
 // lwe_size includes the presence of the body
 // whereas lwe_dimension is the number of elements in the mask
 int lwe_size = output->lwe_dimension + 1;
@@ -136,7 +136,7 @@ __host__ void legacy_host_addition(cudaStream_t stream, uint32_t gpu_index,
 const uint32_t input_lwe_dimension,
 const uint32_t input_lwe_ciphertext_count) {

-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);
 // lwe_size includes the presence of the body
 // whereas lwe_dimension is the number of elements in the mask
 int lwe_size = input_lwe_dimension + 1;
@@ -172,7 +172,7 @@ __host__ void host_pack_for_overflowing_ops(cudaStream_t stream,
 uint32_t input_lwe_ciphertext_count,
 uint32_t message_modulus) {

-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);
 // lwe_size includes the presence of the body
 // whereas lwe_dimension is the number of elements in the mask
 int lwe_size = input_lwe_dimension + 1;
@@ -210,7 +210,7 @@ __host__ void host_subtraction(cudaStream_t stream, uint32_t gpu_index,
 uint32_t input_lwe_dimension,
 uint32_t input_lwe_ciphertext_count) {

-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);
 // lwe_size includes the presence of the body
 // whereas lwe_dimension is the number of elements in the mask
 int lwe_size = input_lwe_dimension + 1;
@@ -248,7 +248,7 @@ __host__ void host_subtraction_plaintext(cudaStream_t stream,
 uint32_t input_lwe_dimension,
 uint32_t input_lwe_ciphertext_count) {

-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);
 int num_blocks = 0, num_threads = 0;
 int num_entries = input_lwe_ciphertext_count;
 getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
@@ -294,7 +294,7 @@ __host__ void host_unchecked_sub_with_correcting_term(
 uint32_t input_lwe_ciphertext_count, uint32_t message_modulus,
 uint32_t carry_modulus, uint32_t degree) {

-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);
 // lwe_size includes the presence of the body
 // whereas lwe_dimension is the number of elements in the mask
 int lwe_size = input_lwe_dimension + 1;

@@ -34,7 +34,7 @@ __host__ void host_cleartext_vec_multiplication(
 T const *cleartext_input, const uint32_t input_lwe_dimension,
 const uint32_t input_lwe_ciphertext_count) {

-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);
 // lwe_size includes the presence of the body
 // whereas lwe_dimension is the number of elements in the mask
 int lwe_size = input_lwe_dimension + 1;
@@ -70,7 +70,7 @@ host_cleartext_multiplication(cudaStream_t stream, uint32_t gpu_index,
 uint32_t input_lwe_dimension,
 uint32_t input_lwe_ciphertext_count) {

-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);
 // lwe_size includes the presence of the body
 // whereas lwe_dimension is the number of elements in the mask
 int lwe_size = input_lwe_dimension + 1;

@@ -26,7 +26,7 @@ __host__ void host_negation(cudaStream_t stream, uint32_t gpu_index, T *output,
 T const *input, const uint32_t input_lwe_dimension,
 const uint32_t input_lwe_ciphertext_count) {

-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);
 // lwe_size includes the presence of the body
 // whereas lwe_dimension is the number of elements in the mask
 int lwe_size = input_lwe_dimension + 1;

@@ -96,7 +96,7 @@ void cuda_fourier_polynomial_mul(void *stream_v, uint32_t gpu_index,
 uint32_t total_polynomials) {

 auto stream = static_cast<cudaStream_t>(stream_v);
-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);
 auto input1 = (double2 *)_input1;
 auto input2 = (double2 *)_input2;
 auto output = (double2 *)_output;

@@ -78,7 +78,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
 double2 *dest, ST const *src,
 uint32_t polynomial_size,
 uint32_t total_polynomials) {
-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);
 int shared_memory_size = sizeof(double) * polynomial_size;

 // Here the buffer size is the size of double2 times the number of polynomials

@@ -312,7 +312,7 @@ __host__ void host_programmable_bootstrap_amortized(
 uint64_t DM_FULL = SM_FULL;

 int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);

 // Create a 1-dimensional grid of threads
 // where each block handles 1 sample and each thread

@@ -247,7 +247,7 @@ __host__ void host_programmable_bootstrap_cg(
 polynomial_size);

 int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);

 uint64_t full_dm = full_sm;


@@ -215,7 +215,7 @@ __host__ void scratch_cg_multi_bit_programmable_bootstrap(
 uint32_t polynomial_size, uint32_t level_count,
 uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {

-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);

 uint64_t full_sm_keybundle =
 get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<Torus>(
@@ -298,7 +298,7 @@ __host__ void execute_cg_external_product_loop(
 uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
 uint32_t level_count, uint32_t lwe_offset, uint32_t num_many_lut,
 uint32_t lut_stride) {
-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);

 uint64_t full_sm =
 get_buffer_size_full_sm_cg_multibit_programmable_bootstrap<Torus>(

@@ -373,7 +373,7 @@ __host__ void execute_step_one(
 uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm) {

 int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);
 int thds = polynomial_size / params::opt;
 dim3 grid(level_count, glwe_dimension + 1, input_lwe_ciphertext_count);

@@ -415,7 +415,7 @@ __host__ void execute_step_two(
 uint32_t num_many_lut, uint32_t lut_stride) {

 int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);
 int thds = polynomial_size / params::opt;
 dim3 grid(input_lwe_ciphertext_count, glwe_dimension + 1);

@@ -456,7 +456,7 @@ __host__ void host_programmable_bootstrap(
 uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
 uint32_t level_count, uint32_t input_lwe_ciphertext_count,
 uint32_t num_many_lut, uint32_t lut_stride) {
-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);

 // With SM each block corresponds to either the mask or body, no need to
 // duplicate data for each

@@ -450,7 +450,7 @@ uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,

 int max_blocks_per_sm;
 int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);
 if (max_shared_memory < full_sm_keybundle)
 cudaOccupancyMaxActiveBlocksPerMultiprocessor(
 &max_blocks_per_sm,

@@ -388,7 +388,7 @@ __host__ void scratch_multi_bit_programmable_bootstrap(
 uint32_t polynomial_size, uint32_t level_count,
 uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {

-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);

 int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
 uint64_t full_sm_keybundle =
@@ -496,7 +496,7 @@ __host__ void execute_compute_keybundle(
 pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
 uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
 uint32_t grouping_factor, uint32_t level_count, uint32_t lwe_offset) {
-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);

 auto lwe_chunk_size = buffer->lwe_chunk_size;
 uint32_t chunk_size =
@@ -545,7 +545,7 @@ execute_step_one(cudaStream_t stream, uint32_t gpu_index,
 uint32_t lwe_dimension, uint32_t glwe_dimension,
 uint32_t polynomial_size, uint32_t base_log,
 uint32_t level_count, uint32_t j, uint32_t lwe_offset) {
-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);

 uint64_t full_sm_accumulate_step_one =
 get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one<Torus>(
@@ -601,7 +601,7 @@ execute_step_two(cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
 uint32_t polynomial_size, int32_t grouping_factor,
 uint32_t level_count, uint32_t j, uint32_t lwe_offset,
 uint32_t num_many_lut, uint32_t lut_stride) {
-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);

 auto lwe_chunk_size = buffer->lwe_chunk_size;
 uint64_t full_sm_accumulate_step_two =

@@ -201,7 +201,7 @@ __host__ void scratch_programmable_bootstrap_tbc(
 uint32_t polynomial_size, uint32_t level_count,
 uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {

-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);

 int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
 bool supports_dsm =
@@ -264,7 +264,7 @@ __host__ void host_programmable_bootstrap_tbc(
 uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
 uint32_t level_count, uint32_t input_lwe_ciphertext_count,
 uint32_t num_many_lut, uint32_t lut_stride) {
-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);

 int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
 auto supports_dsm =

@@ -204,7 +204,7 @@ __host__ void scratch_tbc_multi_bit_programmable_bootstrap(
 pbs_buffer<uint64_t, MULTI_BIT> **buffer, uint32_t glwe_dimension,
 uint32_t polynomial_size, uint32_t level_count,
 uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);

 int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
 bool supports_dsm =
@@ -301,7 +301,7 @@ __host__ void execute_tbc_external_product_loop(
 uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
 uint32_t level_count, uint32_t lwe_offset, uint32_t num_many_lut,
 uint32_t lut_stride) {
-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);

 auto lwe_chunk_size = buffer->lwe_chunk_size;

@@ -402,7 +402,7 @@ __host__ void host_tbc_multi_bit_programmable_bootstrap(
 uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
 uint32_t base_log, uint32_t level_count, uint32_t num_samples,
 uint32_t num_many_lut, uint32_t lut_stride) {
-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);

 auto lwe_chunk_size = buffer->lwe_chunk_size;
 for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);

@@ -21,7 +21,7 @@ int32_t cuda_setup_multi_gpu() {
 check_cuda_error(
 cudaDeviceCanAccessPeer(&has_peer_access_to_device_0, i, 0));
 if (has_peer_access_to_device_0) {
-check_cuda_error(cudaSetDevice(i));
+cuda_set_device(i);
 check_cuda_error(cudaDeviceEnablePeerAccess(0, 0));
 }
 num_used_gpus += 1;

@@ -140,7 +140,7 @@ void programmable_bootstrap_multibit_setup(
 DynamicDistribution glwe_noise_distribution, int pbs_base_log,
 int pbs_level, int message_modulus, int carry_modulus, int *payload_modulus,
 uint64_t *delta, int number_of_inputs, int repetitions, int samples) {
-cudaSetDevice(gpu_index);
+cuda_set_device(gpu_index);

 *payload_modulus = message_modulus * carry_modulus;
 // Value of the shift we multiply our messages by
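The multi-GPU call sites above (for example the `int_radix_lut` constructor loop and `cuda_setup_multi_gpu`) now select the target device through the same helper inside their per-GPU loops. A hedged sketch of that calling pattern follows; `launch_work_for_gpu` and `run_on_all_gpus` are hypothetical names introduced for illustration, only `cuda_set_device` comes from the change above.

#include <cstdint>

// Declared in the backend header touched by this commit and defined in the
// device utilities (see the hunks above).
void cuda_set_device(uint32_t gpu_index);

// Hypothetical placeholder for the real per-GPU work (buffer allocation,
// kernel launches, ...) performed by the call sites in the diff.
static void launch_work_for_gpu(uint32_t /*gpu_index*/) {}

// Sketch of the per-GPU loop pattern: the device is switched once per
// iteration through the checked helper, and everything issued afterwards in
// that iteration targets the selected GPU.
void run_on_all_gpus(uint32_t active_gpu_count) {
  for (uint32_t i = 0; i < active_gpu_count; i++) {
    cuda_set_device(i);
    launch_work_for_gpu(i);
  }
}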