chore(gpu): add nvtx tool for profiling
Committed by: Agnès Leroy
Parent: ffdaf6ad13
Commit: 25d1a4e4dd
@@ -18,3 +18,4 @@ bindgen = "0.71"

[features]
experimental-multi-arch = []
profile = []

@@ -45,6 +45,13 @@ fn main() {
} else {
cmake_config.define("MULTI_ARCH", "OFF");
}
// Conditionally pass the "USE_NVTOOLS" variable to CMake if the feature is enabled
if cfg!(feature = "profile") {
cmake_config.define("USE_NVTOOLS", "ON");
println!("cargo:rustc-link-lib=nvToolsExt");
} else {
cmake_config.define("USE_NVTOOLS", "OFF");
}

// Build the CMake project
let dest = cmake_config.build();

@@ -88,7 +88,14 @@ else()
set(OPTIMIZATION_FLAGS "${OPTIMIZATION_FLAGS} -O3")
endif()

# in production, should use -arch=sm_70 --ptxas-options=-v to see register spills -lineinfo for better debugging
# Check if the USE_NVTOOLS environment variable is set
if(${USE_NVTOOLS})
message(STATUS "USE_NVTOOLS is enabled")
add_definitions(-DUSE_NVTOOLS)
endif()

# in production, should use -arch=sm_70 --ptxas-options=-v to see register spills -lineinfo for better debugging to use
# nvtx when profiling -lnvToolsExt
set(CMAKE_CUDA_FLAGS
"${CMAKE_CUDA_FLAGS} -ccbin ${CMAKE_CXX_COMPILER} ${OPTIMIZATION_FLAGS}\
-std=c++17 --no-exceptions --expt-relaxed-constexpr -rdc=true \

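With the wiring above, building the backend with the new "profile" cargo feature makes build.rs define USE_NVTOOLS for CMake and link nvToolsExt, and CMake in turn compiles the CUDA sources with -DUSE_NVTOOLS, which activates the NVTX markers used throughout the hunks below. As a minimal illustrative sketch (not part of this commit) of the guarded instrumentation pattern this enables, using only the standard NVTX C API:

    #ifdef USE_NVTOOLS
    #include <nvToolsExt.h>
    #endif

    // Hypothetical host-side helper: wrapped in an NVTX range only when
    // USE_NVTOOLS is defined, otherwise it compiles down to the bare body.
    void profiled_section() {
    #ifdef USE_NVTOOLS
      nvtxRangePushA("profiled section"); // shows up as a named span in Nsight Systems
    #endif
      // ... enqueue CUDA kernels or copies here ...
    #ifdef USE_NVTOOLS
      nvtxRangePop();
    #endif
    }
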
@@ -8,7 +8,7 @@ void scratch_cuda_integer_radix_cmux_kb_64(
uint32_t grouping_factor, uint32_t lwe_ciphertext_count,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory, bool allocate_ms_array) {

PUSH_RANGE("scratch cmux")
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
@@ -21,6 +21,7 @@ void scratch_cuda_integer_radix_cmux_kb_64(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_cmux_buffer<uint64_t> **)mem_ptr, predicate_lut_f,
lwe_ciphertext_count, params, allocate_gpu_memory);
POP_RANGE()
}

void cuda_cmux_integer_radix_ciphertext_kb_64(
@@ -31,20 +32,22 @@ void cuda_cmux_integer_radix_ciphertext_kb_64(
CudaRadixCiphertextFFI const *lwe_array_false, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {

PUSH_RANGE("cmux")
host_integer_radix_cmux_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out,
lwe_condition, lwe_array_true, lwe_array_false,
(int_cmux_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key);
POP_RANGE()
}

void cleanup_cuda_integer_radix_cmux(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void) {

PUSH_RANGE("cleanup cmux")
int_cmux_buffer<uint64_t> *mem_ptr =
(int_cmux_buffer<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
POP_RANGE()
}

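The instrumentation in this file follows one pattern: each C entry point pushes a named NVTX range on entry and pops it just before returning, so the scratch, compute, and cleanup phases of an operation ("scratch cmux", "cmux", "cleanup cmux" above) appear as separate named spans in the profiler timeline. The same pattern repeats in the hunks below.
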
@@ -9,7 +9,7 @@ void scratch_cuda_integer_radix_comparison_kb_64(
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
COMPARISON_TYPE op_type, bool is_signed, bool allocate_gpu_memory,
bool allocate_ms_array) {

PUSH_RANGE("scratch comparison")
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
@@ -35,6 +35,7 @@ void scratch_cuda_integer_radix_comparison_kb_64(
op_type, is_signed, allocate_gpu_memory);
break;
}
POP_RANGE()
}

void cuda_comparison_integer_radix_ciphertext_kb_64(
@@ -44,7 +45,7 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
CudaRadixCiphertextFFI const *lwe_array_2, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {

PUSH_RANGE("comparison")
if (lwe_array_1->num_radix_blocks != lwe_array_1->num_radix_blocks)
PANIC("Cuda error: input num radix blocks must be the same")
// The output ciphertext might be a boolean block or a radix ciphertext
@@ -85,16 +86,18 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
default:
PANIC("Cuda error: integer operation not supported")
}
POP_RANGE()
}

void cleanup_cuda_integer_comparison(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void) {

PUSH_RANGE("cleanup comparison")
int_comparison_buffer<uint64_t> *mem_ptr =
(int_comparison_buffer<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
POP_RANGE()
}

void scratch_cuda_integer_are_all_comparisons_block_true_kb_64(

@@ -8,7 +8,7 @@ void scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, bool allocate_gpu_memory, bool allocate_ms_array) {

PUSH_RANGE("scratch div")
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
@@ -18,6 +18,7 @@ void scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, is_signed,
(int_div_rem_memory<uint64_t> **)mem_ptr, num_blocks, params,
allocate_gpu_memory);
POP_RANGE()
}

void cuda_integer_div_rem_radix_ciphertext_kb_64(
@@ -27,20 +28,23 @@ void cuda_integer_div_rem_radix_ciphertext_kb_64(
CudaRadixCiphertextFFI const *divisor, bool is_signed, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {

PUSH_RANGE("div")
auto mem = (int_div_rem_memory<uint64_t> *)mem_ptr;

host_integer_div_rem_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, quotient, remainder,
numerator, divisor, is_signed, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key, mem);
POP_RANGE()
}

void cleanup_cuda_integer_div_rem(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count, int8_t **mem_ptr_void) {
PUSH_RANGE("cleanup div")
int_div_rem_memory<uint64_t> *mem_ptr =
(int_div_rem_memory<uint64_t> *)(*mem_ptr_void);

mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
POP_RANGE()
}

@@ -52,7 +52,7 @@ void scratch_cuda_propagate_single_carry_kb_64_inplace(
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t requested_flag,
uint32_t uses_carry, bool allocate_gpu_memory, bool allocate_ms_array) {

PUSH_RANGE("scratch propagate sc")
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
@@ -62,6 +62,7 @@ void scratch_cuda_propagate_single_carry_kb_64_inplace(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_sc_prop_memory<uint64_t> **)mem_ptr, num_blocks, params,
requested_flag, uses_carry, allocate_gpu_memory);
POP_RANGE()
}

void scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(
@@ -72,7 +73,7 @@ void scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t requested_flag,
uint32_t uses_carry, bool allocate_gpu_memory, bool allocate_ms_array) {

PUSH_RANGE("scratch add & propagate sc")
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
@@ -82,6 +83,7 @@ void scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_sc_prop_memory<uint64_t> **)mem_ptr, num_blocks, params,
requested_flag, uses_carry, allocate_gpu_memory);
POP_RANGE()
}

void scratch_cuda_integer_overflowing_sub_kb_64_inplace(
@@ -92,7 +94,7 @@ void scratch_cuda_integer_overflowing_sub_kb_64_inplace(
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t compute_overflow,
bool allocate_gpu_memory, bool allocate_ms_array) {

PUSH_RANGE("scratch overflow sub")
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
@@ -102,6 +104,7 @@ void scratch_cuda_integer_overflowing_sub_kb_64_inplace(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_borrow_prop_memory<uint64_t> **)mem_ptr, num_blocks, params,
compute_overflow, allocate_gpu_memory);
POP_RANGE()
}

void cuda_propagate_single_carry_kb_64_inplace(
@@ -140,38 +143,45 @@ void cuda_integer_overflowing_sub_kb_64_inplace(
void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t compute_overflow, uint32_t uses_input_borrow) {

PUSH_RANGE("overflow sub")
host_integer_overflowing_sub<uint64_t>(
(cudaStream_t const *)streams, gpu_indexes, gpu_count, lhs_array,
lhs_array, rhs_array, overflow_block, input_borrow,
(int_borrow_prop_memory<uint64_t> *)mem_ptr, bsks, (uint64_t **)ksks,
ms_noise_reduction_key, compute_overflow, uses_input_borrow);
POP_RANGE()
}

void cleanup_cuda_propagate_single_carry(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void) {
PUSH_RANGE("cleanup propagate sc")
int_sc_prop_memory<uint64_t> *mem_ptr =
(int_sc_prop_memory<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
POP_RANGE()
}

void cleanup_cuda_add_and_propagate_single_carry(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void) {
PUSH_RANGE("cleanup add & propagate sc")
int_sc_prop_memory<uint64_t> *mem_ptr =
(int_sc_prop_memory<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
POP_RANGE()
}
void cleanup_cuda_integer_overflowing_sub(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void) {
PUSH_RANGE("cleanup overflow sub")
int_borrow_prop_memory<uint64_t> *mem_ptr =
(int_borrow_prop_memory<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
POP_RANGE()
}

void scratch_cuda_apply_univariate_lut_kb_64(
@@ -182,7 +192,7 @@ void scratch_cuda_apply_univariate_lut_kb_64(
uint32_t grouping_factor, uint32_t num_radix_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
uint64_t lut_degree, bool allocate_gpu_memory, bool allocate_ms_array) {

PUSH_RANGE("scratch univar lut")
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
glwe_dimension * polynomial_size, lwe_dimension,
ks_level, ks_base_log, pbs_level, pbs_base_log,
@@ -194,6 +204,7 @@ void scratch_cuda_apply_univariate_lut_kb_64(
(int_radix_lut<uint64_t> **)mem_ptr,
static_cast<const uint64_t *>(input_lut), num_radix_blocks, params,
lut_degree, allocate_gpu_memory);
POP_RANGE()
}

void scratch_cuda_apply_many_univariate_lut_kb_64(
@@ -205,7 +216,7 @@ void scratch_cuda_apply_many_univariate_lut_kb_64(
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
uint32_t num_many_lut, uint64_t lut_degree, bool allocate_gpu_memory,
bool allocate_ms_array) {

PUSH_RANGE("scratch many lut")
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
glwe_dimension * polynomial_size, lwe_dimension,
ks_level, ks_base_log, pbs_level, pbs_base_log,
@@ -217,6 +228,7 @@ void scratch_cuda_apply_many_univariate_lut_kb_64(
(int_radix_lut<uint64_t> **)mem_ptr,
static_cast<const uint64_t *>(input_lut), num_radix_blocks, params,
num_many_lut, lut_degree, allocate_gpu_memory);
POP_RANGE()
}

void cuda_apply_univariate_lut_kb_64(
@@ -237,8 +249,10 @@ void cleanup_cuda_apply_univariate_lut_kb_64(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void) {
PUSH_RANGE("cleanup univar lut")
int_radix_lut<uint64_t> *mem_ptr = (int_radix_lut<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
POP_RANGE()
}

void cuda_apply_many_univariate_lut_kb_64(
@@ -263,7 +277,7 @@ void scratch_cuda_apply_bivariate_lut_kb_64(
uint32_t grouping_factor, uint32_t num_radix_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
uint64_t lut_degree, bool allocate_gpu_memory, bool allocate_ms_array) {

PUSH_RANGE("scratch bivar lut")
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
glwe_dimension * polynomial_size, lwe_dimension,
ks_level, ks_base_log, pbs_level, pbs_base_log,
@@ -275,6 +289,7 @@ void scratch_cuda_apply_bivariate_lut_kb_64(
(int_radix_lut<uint64_t> **)mem_ptr,
static_cast<const uint64_t *>(input_lut), num_radix_blocks, params,
lut_degree, allocate_gpu_memory);
POP_RANGE()
}

void cuda_apply_bivariate_lut_kb_64(
@@ -297,8 +312,10 @@ void cleanup_cuda_apply_bivariate_lut_kb_64(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void) {
PUSH_RANGE("cleanup bivar lut")
int_radix_lut<uint64_t> *mem_ptr = (int_radix_lut<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
POP_RANGE()
}

void scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(

@@ -13,6 +13,7 @@
#include "polynomial/functions.cuh"
#include "utils/helper.cuh"
#include "utils/helper_multi_gpu.cuh"
#include "utils/helper_profile.cuh"
#include "utils/kernel_dimensions.cuh"
#include <algorithm>
#include <functional>
@@ -499,6 +500,7 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
int_radix_lut<Torus> *lut, uint32_t num_radix_blocks) {
PUSH_RANGE("apply lut")
// apply_lookup_table
auto params = lut->params;
auto pbs_type = params.pbs_type;
@@ -596,6 +598,7 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
lwe_array_out->degrees[i] = lut->degrees[degrees_index];
lwe_array_out->noise_levels[i] = NoiseLevel::NOMINAL;
}
POP_RANGE()
}

template <typename Torus>
@@ -606,6 +609,7 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb(
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
int_radix_lut<Torus> *lut, uint32_t num_many_lut, uint32_t lut_stride) {
PUSH_RANGE("apply many lut")
// apply_lookup_table
auto params = lut->params;
auto pbs_type = params.pbs_type;
@@ -699,6 +703,7 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb(
lwe_array_out->degrees[i] = lut->degrees[degrees_index];
lwe_array_out->noise_levels[i] = NoiseLevel::NOMINAL;
}
POP_RANGE()
}

template <typename Torus>
@@ -710,7 +715,7 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
int_radix_lut<Torus> *lut, uint32_t num_radix_blocks, uint32_t shift) {

PUSH_RANGE("apply bivar lut")
if (lwe_array_out->lwe_dimension != lwe_array_1->lwe_dimension ||
lwe_array_out->lwe_dimension != lwe_array_2->lwe_dimension)
PANIC("Cuda error: input and output radix ciphertexts should have the same "
@@ -814,6 +819,7 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
lwe_array_out->degrees[i] = lut->degrees[degrees_index];
lwe_array_out->noise_levels[i] = NoiseLevel::NOMINAL;
}
POP_RANGE()
}

// Rotates the slice in-place such that the first mid elements of the slice move
@@ -995,7 +1001,7 @@ void generate_device_accumulator_bivariate(
uint64_t *degree, uint64_t *max_degree, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t message_modulus, uint32_t carry_modulus,
std::function<Torus(Torus, Torus)> f) {

PUSH_RANGE("gen bivar lut acc")
// host lut
Torus *h_lut =
(Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
@@ -1013,6 +1019,7 @@ void generate_device_accumulator_bivariate(

cuda_synchronize_stream(stream, gpu_index);
free(h_lut);
POP_RANGE()
}

/*
@@ -1092,11 +1099,12 @@ void generate_device_accumulator(cudaStream_t stream, uint32_t gpu_index,
uint32_t message_modulus,
uint32_t carry_modulus,
std::function<Torus(Torus)> f) {

PUSH_RANGE("gen lut acc")
generate_device_accumulator_with_encoding(
stream, gpu_index, acc, degree, max_degree, glwe_dimension,
polynomial_size, message_modulus, carry_modulus, message_modulus,
carry_modulus, f);
POP_RANGE()
}

/*
@@ -1112,7 +1120,7 @@ void generate_many_lut_device_accumulator(
uint64_t *max_degree, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t message_modulus, uint32_t carry_modulus,
std::vector<std::function<Torus(Torus)>> &functions) {

PUSH_RANGE("gen many lut acc")
// host lut
Torus *h_lut =
(Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
@@ -1129,6 +1137,7 @@ void generate_many_lut_device_accumulator(

cuda_synchronize_stream(stream, gpu_index);
free(h_lut);
POP_RANGE()
}

// This function is used to perform step 1 of Thomas' new carry propagation
@@ -1803,6 +1812,7 @@ void host_propagate_single_carry(
void *const *bsks, Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t requested_flag, uint32_t uses_carry) {
PUSH_RANGE("propagate sc")
auto num_radix_blocks = lwe_array->num_radix_blocks;
auto params = mem->params;
auto glwe_dimension = params.glwe_dimension;
@@ -1891,6 +1901,7 @@ void host_propagate_single_carry(
streams, gpu_indexes, gpu_count, lwe_array, prepared_blocks, bsks, ksks,
ms_noise_reduction_key, message_extract, num_radix_blocks);
}
POP_RANGE()
}

// This function perform the three steps of Thomas' new carry propagation
@@ -1904,6 +1915,7 @@ void host_add_and_propagate_single_carry(
void *const *bsks, Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t requested_flag, uint32_t uses_carry) {
PUSH_RANGE("add & propagate sc")
if (lhs_array->num_radix_blocks != rhs_array->num_radix_blocks)
PANIC("Cuda error: input and output num radix blocks must be the same")
if (lhs_array->lwe_dimension != rhs_array->lwe_dimension ||
@@ -2026,6 +2038,7 @@ void host_add_and_propagate_single_carry(
streams, gpu_indexes, gpu_count, lhs_array, prepared_blocks, bsks, ksks,
ms_noise_reduction_key, mem->lut_message_extract, num_radix_blocks);
}
POP_RANGE()
}

template <typename Torus>

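The template-level helpers instrumented here ("apply lut", "apply many lut", "apply bivar lut", "gen lut acc", "propagate sc", ...) run inside the entry points instrumented earlier, and NVTX ranges nest, so a capture shows, for example, a "cmux" span containing the "apply lut" spans it triggers.
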
@@ -73,7 +73,7 @@ void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
uint32_t pbs_level, uint32_t ks_base_log, uint32_t ks_level,
uint32_t grouping_factor, uint32_t num_radix_blocks, PBS_TYPE pbs_type,
bool allocate_gpu_memory, bool allocate_ms_array) {

PUSH_RANGE("scratch mul")
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
polynomial_size * glwe_dimension, lwe_dimension,
ks_level, ks_base_log, pbs_level, pbs_base_log,
@@ -97,6 +97,7 @@ void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
"Supported N's are powers of two in the interval [256..16384].")
}
POP_RANGE()
}

/*
@@ -134,7 +135,7 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
int8_t *mem_ptr, uint32_t polynomial_size, uint32_t num_blocks) {

PUSH_RANGE("mul")
switch (polynomial_size) {
case 256:
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<256>>(
@@ -189,16 +190,18 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
"Supported N's are powers of two in the interval [256..16384].")
}
POP_RANGE()
}

void cleanup_cuda_integer_mult(void *const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void) {

PUSH_RANGE("cleanup mul")
int_mul_memory<uint64_t> *mem_ptr =
(int_mul_memory<uint64_t> *)(*mem_ptr_void);

mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
POP_RANGE()
}

void scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(

@@ -116,10 +116,11 @@ __host__ void scratch_cuda_integer_overflowing_sub_kb(
uint32_t gpu_count, int_overflowing_sub_memory<Torus> **mem_ptr,
uint32_t num_blocks, int_radix_params params, bool allocate_gpu_memory,
bool allocate_ms_array) {

PUSH_RANGE("scratch overflowing sub")
*mem_ptr = new int_overflowing_sub_memory<Torus>(
streams, gpu_indexes, gpu_count, params, num_blocks, allocate_gpu_memory,
allocate_ms_array);
POP_RANGE()
}

template <typename Torus>
@@ -134,7 +135,7 @@ __host__ void host_integer_overflowing_sub(
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t compute_overflow, uint32_t uses_input_borrow) {

PUSH_RANGE("overflowing sub")
if (output->num_radix_blocks != input_left->num_radix_blocks ||
output->num_radix_blocks != input_right->num_radix_blocks)
PANIC("Cuda error: lwe_array_in and output num radix blocks must be "
@@ -165,6 +166,7 @@ __host__ void host_integer_overflowing_sub(
streams, gpu_indexes, gpu_count, output, overflow_block, input_borrow,
(int_borrow_prop_memory<Torus> *)mem_ptr, bsks, (Torus **)(ksks),
ms_noise_reduction_key, num_groups, compute_overflow, uses_input_borrow);
POP_RANGE()
}

#endif

@@ -4,6 +4,7 @@
#include "device.h"
#include "integer/integer.h"
#include "integer/radix_ciphertext.h"
#include "utils/helper_profile.cuh"
#include "utils/kernel_dimensions.cuh"

template <typename Torus>
@@ -12,6 +13,7 @@ void create_zero_radix_ciphertext_async(cudaStream_t const stream,
CudaRadixCiphertextFFI *radix,
const uint32_t num_radix_blocks,
const uint32_t lwe_dimension) {
PUSH_RANGE("create zero radix ct");
radix->lwe_dimension = lwe_dimension;
radix->num_radix_blocks = num_radix_blocks;
radix->max_num_radix_blocks = num_radix_blocks;
@@ -25,6 +27,7 @@ void create_zero_radix_ciphertext_async(cudaStream_t const stream,
if (radix->degrees == NULL || radix->noise_levels == NULL) {
PANIC("Cuda error: degrees / noise levels not allocated correctly")
}
POP_RANGE();
}

template <typename Torus>
@@ -73,6 +76,7 @@ void copy_radix_ciphertext_slice_async(
const uint32_t output_end_lwe_index,
const CudaRadixCiphertextFFI *input_radix,
const uint32_t input_start_lwe_index, const uint32_t input_end_lwe_index) {
PUSH_RANGE("copy radix slice");
if (output_radix->lwe_dimension != input_radix->lwe_dimension)
PANIC("Cuda error: input lwe dimension should be equal to output lwe "
"dimension")
@@ -116,6 +120,7 @@ void copy_radix_ciphertext_slice_async(
output_radix->noise_levels[i + output_start_lwe_index] =
input_radix->noise_levels[i + input_start_lwe_index];
}
POP_RANGE();
}

template <typename Torus>

backends/tfhe-cuda-backend/cuda/src/utils/helper_profile.cu (new file, 42 lines)
@@ -0,0 +1,42 @@
#include "helper_profile.cuh"

uint32_t adler32(const unsigned char *data) {
  const uint32_t MOD_ADLER = 65521;
  uint32_t a = 1, b = 0;
  size_t index;
  for (index = 0; data[index] != 0; ++index) {
    a = (a + data[index] * 2) % MOD_ADLER;
    b = (b + a) % MOD_ADLER;
  }
  return (b << 16) | a;
}

void cuda_nvtx_label_with_color(const char *name) {
#ifdef USE_NVTOOLS
  int color_id = adler32((const unsigned char *)name);
  int r, g, b;
  r = color_id & 0x000000ff;
  g = (color_id & 0x000ff000) >> 12;
  b = (color_id & 0x0ff00000) >> 20;
  if (r < 64 & g < 64 & b < 64) {
    r = r * 3;
    g = g * 3 + 64;
    b = b * 4;
  }

  color_id = 0xff000000 | (r << 16) | (g << 8) | (b);
  nvtxEventAttributes_t eventAttrib = {0};
  eventAttrib.version = NVTX_VERSION;
  eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
  eventAttrib.colorType = NVTX_COLOR_ARGB;
  eventAttrib.color = color_id;
  eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
  eventAttrib.message.ascii = name;
  nvtxRangePushEx(&eventAttrib);
#endif
}
void cuda_nvtx_pop() {
#ifdef USE_NVTOOLS
  nvtxRangePop();
#endif
}

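A note on the color helper above: the Adler-32-style checksum of the range name is used only to derive a stable per-label ARGB color (with dark results brightened by the r/g/b < 64 branch), so a given label keeps the same color across runs. The loop walks the name as a NUL-terminated string, and the "&" in the brightness test is a bitwise AND of the 0/1 comparison results; it behaves like "&&" here, though the logical operator would be the more conventional spelling.
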
backends/tfhe-cuda-backend/cuda/src/utils/helper_profile.cuh (new file, 13 lines)
@@ -0,0 +1,13 @@
#ifndef HELPER_PROFILE
#define HELPER_PROFILE
#include <nvToolsExt.h>

void cuda_nvtx_label_with_color(const char *name);
void cuda_nvtx_pop();

#define PUSH_RANGE(name) \
  { cuda_nvtx_label_with_color(name); }
#define POP_RANGE() \
  { cuda_nvtx_pop(); }

#endif

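Taken together, the header exposes two macros that any CUDA source in the backend can use. A minimal usage sketch (the function name is hypothetical, not part of this commit):

    #include "utils/helper_profile.cuh"

    // Wrap a host-side section in a named, colored NVTX range. Ranges nest and
    // each PUSH_RANGE must be matched by a POP_RANGE on the same thread.
    void profile_example_step() {
      PUSH_RANGE("example step")
      // ... launch kernels or enqueue copies here ...
      POP_RANGE()
    }

When the profile feature is disabled, USE_NVTOOLS is not defined and cuda_nvtx_label_with_color / cuda_nvtx_pop compile to empty functions, so the macros add essentially no overhead.
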
@@ -91,6 +91,7 @@ strings = ["integer"]
internal-keycache = ["dep:fs2"]
gpu = ["dep:tfhe-cuda-backend", "shortint"]
gpu-experimental-multi-arch = ["gpu", "tfhe-cuda-backend/experimental-multi-arch"]
gpu-profile = ["gpu", "tfhe-cuda-backend/profile"]
zk-pok = ["dep:tfhe-zk-pok"]

# Adds more FheUint/FheInt types to the HL

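On the Rust side, the new gpu-profile feature simply forwards to the backend's profile feature, so a profiling build can be produced with, for example, cargo build --features=gpu-profile (on top of whatever other features the target needs); the named ranges then appear when the resulting binary is captured with NVIDIA Nsight Systems (nsys profile).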