From 25d1a4e4dd329e5e92fe8c4b633095e6b28f08c5 Mon Sep 17 00:00:00 2001
From: Guillermo Oyarzun
Date: Wed, 19 Jun 2024 09:37:28 +0200
Subject: [PATCH] chore(gpu): add nvtx tool for profiling

---
 backends/tfhe-cuda-backend/Cargo.toml         |  1 +
 backends/tfhe-cuda-backend/build.rs           |  7 ++++
 .../tfhe-cuda-backend/cuda/CMakeLists.txt     |  9 +++-
 .../cuda/src/integer/cmux.cu                  |  9 ++--
 .../cuda/src/integer/comparison.cu            |  9 ++--
 .../cuda/src/integer/div_rem.cu               |  8 +++-
 .../cuda/src/integer/integer.cu               | 31 ++++++++++----
 .../cuda/src/integer/integer.cuh              | 21 ++++++++--
 .../cuda/src/integer/multiplication.cu        |  9 ++--
 .../cuda/src/integer/negation.cuh             |  6 ++-
 .../cuda/src/integer/radix_ciphertext.cuh     |  5 +++
 .../cuda/src/utils/helper_profile.cu          | 42 +++++++++++++++++++
 .../cuda/src/utils/helper_profile.cuh         | 13 ++++++
 tfhe/Cargo.toml                               |  1 +
 14 files changed, 146 insertions(+), 25 deletions(-)
 create mode 100644 backends/tfhe-cuda-backend/cuda/src/utils/helper_profile.cu
 create mode 100644 backends/tfhe-cuda-backend/cuda/src/utils/helper_profile.cuh

diff --git a/backends/tfhe-cuda-backend/Cargo.toml b/backends/tfhe-cuda-backend/Cargo.toml
index b70e19d64..a3cb0b7cd 100644
--- a/backends/tfhe-cuda-backend/Cargo.toml
+++ b/backends/tfhe-cuda-backend/Cargo.toml
@@ -18,3 +18,4 @@ bindgen = "0.71"
 
 [features]
 experimental-multi-arch = []
+profile = []
diff --git a/backends/tfhe-cuda-backend/build.rs b/backends/tfhe-cuda-backend/build.rs
index c1676d281..0334701d4 100644
--- a/backends/tfhe-cuda-backend/build.rs
+++ b/backends/tfhe-cuda-backend/build.rs
@@ -45,6 +45,13 @@ fn main() {
     } else {
         cmake_config.define("MULTI_ARCH", "OFF");
     }
+    // Conditionally pass the "USE_NVTOOLS" variable to CMake if the feature is enabled
+    if cfg!(feature = "profile") {
+        cmake_config.define("USE_NVTOOLS", "ON");
+        println!("cargo:rustc-link-lib=nvToolsExt");
+    } else {
+        cmake_config.define("USE_NVTOOLS", "OFF");
+    }
 
     // Build the CMake project
     let dest = cmake_config.build();
diff --git a/backends/tfhe-cuda-backend/cuda/CMakeLists.txt b/backends/tfhe-cuda-backend/cuda/CMakeLists.txt
index 3b977b35f..196fbb9c1 100644
--- a/backends/tfhe-cuda-backend/cuda/CMakeLists.txt
+++ b/backends/tfhe-cuda-backend/cuda/CMakeLists.txt
@@ -88,7 +88,14 @@ else()
   set(OPTIMIZATION_FLAGS "${OPTIMIZATION_FLAGS} -O3")
 endif()
 
-# in production, should use -arch=sm_70 --ptxas-options=-v to see register spills -lineinfo for better debugging
+# Check if the USE_NVTOOLS environment variable is set
+if(${USE_NVTOOLS})
+  message(STATUS "USE_NVTOOLS is enabled")
+  add_definitions(-DUSE_NVTOOLS)
+endif()
+
+# in production, should use -arch=sm_70 --ptxas-options=-v to see register spills -lineinfo for better debugging to use
+# nvtx when profiling -lnvToolsExt
 set(CMAKE_CUDA_FLAGS
     "${CMAKE_CUDA_FLAGS} -ccbin ${CMAKE_CXX_COMPILER} ${OPTIMIZATION_FLAGS}\
 -std=c++17 --no-exceptions --expt-relaxed-constexpr -rdc=true \
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu
index 908e73dd3..cd1c83fbb 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu
@@ -8,7 +8,7 @@ void scratch_cuda_integer_radix_cmux_kb_64(
     uint32_t grouping_factor, uint32_t lwe_ciphertext_count,
     uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
     bool allocate_gpu_memory, bool allocate_ms_array) {
-
+  PUSH_RANGE("scratch cmux")
   int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                           big_lwe_dimension, small_lwe_dimension, ks_level,
                           ks_base_log, pbs_level, pbs_base_log, grouping_factor,
@@ -21,6 +21,7 @@ void scratch_cuda_integer_radix_cmux_kb_64(
       (cudaStream_t *)(streams), gpu_indexes, gpu_count,
       (int_cmux_buffer **)mem_ptr, predicate_lut_f,
       lwe_ciphertext_count, params, allocate_gpu_memory);
+  POP_RANGE()
 }
 
 void cuda_cmux_integer_radix_ciphertext_kb_64(
@@ -31,20 +32,22 @@
     CudaRadixCiphertextFFI const *lwe_array_false, int8_t *mem_ptr,
     void *const *bsks, void *const *ksks,
     CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
-
+  PUSH_RANGE("cmux")
   host_integer_radix_cmux_kb(
       (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out,
      lwe_condition, lwe_array_true, lwe_array_false,
      (int_cmux_buffer *)mem_ptr, bsks, (uint64_t **)(ksks),
      ms_noise_reduction_key);
+  POP_RANGE()
 }
 
 void cleanup_cuda_integer_radix_cmux(void *const *streams,
                                      uint32_t const *gpu_indexes,
                                      uint32_t gpu_count,
                                      int8_t **mem_ptr_void) {
-
+  PUSH_RANGE("cleanup cmux")
   int_cmux_buffer *mem_ptr = (int_cmux_buffer *)(*mem_ptr_void);
 
   mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+  POP_RANGE()
 }
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cu b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cu
index be6ebb049..d34544050 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cu
@@ -9,7 +9,7 @@ void scratch_cuda_integer_radix_comparison_kb_64(
     uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
     COMPARISON_TYPE op_type, bool is_signed, bool allocate_gpu_memory,
     bool allocate_ms_array) {
-
+  PUSH_RANGE("scratch comparison")
   int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                           big_lwe_dimension, small_lwe_dimension, ks_level,
                           ks_base_log, pbs_level, pbs_base_log, grouping_factor,
@@ -35,6 +35,7 @@ void scratch_cuda_integer_radix_comparison_kb_64(
         op_type, is_signed, allocate_gpu_memory);
     break;
   }
+  POP_RANGE()
 }
 
 void cuda_comparison_integer_radix_ciphertext_kb_64(
@@ -44,7 +45,7 @@
     CudaRadixCiphertextFFI const *lwe_array_2, int8_t *mem_ptr,
     void *const *bsks, void *const *ksks,
     CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
-
+  PUSH_RANGE("comparison")
   if (lwe_array_1->num_radix_blocks != lwe_array_1->num_radix_blocks)
     PANIC("Cuda error: input num radix blocks must be the same")
   // The output ciphertext might be a boolean block or a radix ciphertext
@@ -85,16 +86,18 @@
   default:
     PANIC("Cuda error: integer operation not supported")
   }
+  POP_RANGE()
 }
 
 void cleanup_cuda_integer_comparison(void *const *streams,
                                      uint32_t const *gpu_indexes,
                                      uint32_t gpu_count,
                                      int8_t **mem_ptr_void) {
-
+  PUSH_RANGE("cleanup comparison")
   int_comparison_buffer *mem_ptr = (int_comparison_buffer *)(*mem_ptr_void);
 
   mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+  POP_RANGE()
 }
 
 void scratch_cuda_integer_are_all_comparisons_block_true_kb_64(
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cu b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cu
index 0c1656465..77f400354 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cu
@@ -8,7 +8,7 @@ void scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
     uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
    PBS_TYPE pbs_type, bool allocate_gpu_memory, bool allocate_ms_array) {
-
+  PUSH_RANGE("scratch div")
   int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                           big_lwe_dimension, small_lwe_dimension, ks_level,
                           ks_base_log, pbs_level, pbs_base_log, grouping_factor,
@@ -18,6 +18,7 @@ void scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
       (cudaStream_t *)(streams), gpu_indexes, gpu_count, is_signed,
       (int_div_rem_memory **)mem_ptr, num_blocks, params,
       allocate_gpu_memory);
+  POP_RANGE()
 }
 
 void cuda_integer_div_rem_radix_ciphertext_kb_64(
@@ -27,20 +28,23 @@
     CudaRadixCiphertextFFI const *divisor, bool is_signed, int8_t *mem_ptr,
     void *const *bsks, void *const *ksks,
     CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
-
+  PUSH_RANGE("div")
   auto mem = (int_div_rem_memory *)mem_ptr;
 
   host_integer_div_rem_kb(
       (cudaStream_t *)(streams), gpu_indexes, gpu_count, quotient, remainder,
      numerator, divisor, is_signed, bsks, (uint64_t **)(ksks),
      ms_noise_reduction_key, mem);
+  POP_RANGE()
 }
 
 void cleanup_cuda_integer_div_rem(void *const *streams,
                                   uint32_t const *gpu_indexes,
                                   uint32_t gpu_count, int8_t **mem_ptr_void) {
+  PUSH_RANGE("cleanup div")
   int_div_rem_memory *mem_ptr = (int_div_rem_memory *)(*mem_ptr_void);
 
   mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+  POP_RANGE()
 }
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
index bddf077a6..3a2fb89c2 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
@@ -52,7 +52,7 @@ void scratch_cuda_propagate_single_carry_kb_64_inplace(
     uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
     uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t requested_flag,
     uint32_t uses_carry, bool allocate_gpu_memory, bool allocate_ms_array) {
-
+  PUSH_RANGE("scratch propagate sc")
   int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                           big_lwe_dimension, small_lwe_dimension, ks_level,
                           ks_base_log, pbs_level, pbs_base_log, grouping_factor,
@@ -62,6 +62,7 @@ void scratch_cuda_propagate_single_carry_kb_64_inplace(
       (cudaStream_t *)(streams), gpu_indexes, gpu_count,
       (int_sc_prop_memory **)mem_ptr, num_blocks, params, requested_flag,
       uses_carry, allocate_gpu_memory);
+  POP_RANGE()
 }
 
 void scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(
@@ -72,7 +73,7 @@ void scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(
     uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
     uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t requested_flag,
     uint32_t uses_carry, bool allocate_gpu_memory, bool allocate_ms_array) {
-
+  PUSH_RANGE("scratch add & propagate sc")
   int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                           big_lwe_dimension, small_lwe_dimension, ks_level,
                           ks_base_log, pbs_level, pbs_base_log, grouping_factor,
@@ -82,6 +83,7 @@ void scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(
       (cudaStream_t *)(streams), gpu_indexes, gpu_count,
       (int_sc_prop_memory **)mem_ptr, num_blocks, params, requested_flag,
       uses_carry, allocate_gpu_memory);
+  POP_RANGE()
 }
 
 void scratch_cuda_integer_overflowing_sub_kb_64_inplace(
@@ -92,7 +94,7 @@ void scratch_cuda_integer_overflowing_sub_kb_64_inplace(
     uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
     uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t compute_overflow,
     bool allocate_gpu_memory, bool allocate_ms_array) {
-
+  PUSH_RANGE("scratch overflow sub")
   int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                           big_lwe_dimension, small_lwe_dimension, ks_level,
                           ks_base_log, pbs_level, pbs_base_log, grouping_factor,
@@ -102,6 +104,7 @@ void scratch_cuda_integer_overflowing_sub_kb_64_inplace(
       (cudaStream_t *)(streams), gpu_indexes, gpu_count,
       (int_borrow_prop_memory **)mem_ptr, num_blocks, params,
       compute_overflow, allocate_gpu_memory);
+  POP_RANGE()
 }
 
 void cuda_propagate_single_carry_kb_64_inplace(
@@ -140,38 +143,45 @@ void cuda_integer_overflowing_sub_kb_64_inplace(
     void *const *bsks, void *const *ksks,
     CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
     uint32_t compute_overflow, uint32_t uses_input_borrow) {
-
+  PUSH_RANGE("overflow sub")
   host_integer_overflowing_sub(
       (cudaStream_t const *)streams, gpu_indexes, gpu_count, lhs_array,
       lhs_array, rhs_array, overflow_block, input_borrow,
       (int_borrow_prop_memory *)mem_ptr, bsks, (uint64_t **)ksks,
       ms_noise_reduction_key, compute_overflow, uses_input_borrow);
+  POP_RANGE()
 }
 
 void cleanup_cuda_propagate_single_carry(void *const *streams,
                                          uint32_t const *gpu_indexes,
                                          uint32_t gpu_count,
                                          int8_t **mem_ptr_void) {
+  PUSH_RANGE("cleanup propagate sc")
   int_sc_prop_memory *mem_ptr = (int_sc_prop_memory *)(*mem_ptr_void);
 
   mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+  POP_RANGE()
 }
 
 void cleanup_cuda_add_and_propagate_single_carry(void *const *streams,
                                                  uint32_t const *gpu_indexes,
                                                  uint32_t gpu_count,
                                                  int8_t **mem_ptr_void) {
+  PUSH_RANGE("cleanup add & propagate sc")
   int_sc_prop_memory *mem_ptr = (int_sc_prop_memory *)(*mem_ptr_void);
 
   mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+  POP_RANGE()
 }
 
 void cleanup_cuda_integer_overflowing_sub(void *const *streams,
                                           uint32_t const *gpu_indexes,
                                           uint32_t gpu_count,
                                           int8_t **mem_ptr_void) {
+  PUSH_RANGE("cleanup overflow sub")
   int_borrow_prop_memory *mem_ptr = (int_borrow_prop_memory *)(*mem_ptr_void);
 
   mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+  POP_RANGE()
 }
 
 void scratch_cuda_apply_univariate_lut_kb_64(
@@ -182,7 +192,7 @@ void scratch_cuda_apply_univariate_lut_kb_64(
     uint32_t grouping_factor, uint32_t num_radix_blocks,
     uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
     uint64_t lut_degree, bool allocate_gpu_memory, bool allocate_ms_array) {
-
+  PUSH_RANGE("scratch univar lut")
   int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                           glwe_dimension * polynomial_size, lwe_dimension,
                           ks_level, ks_base_log, pbs_level, pbs_base_log,
@@ -194,6 +204,7 @@ void scratch_cuda_apply_univariate_lut_kb_64(
       (int_radix_lut **)mem_ptr,
       static_cast(input_lut), num_radix_blocks, params,
       lut_degree, allocate_gpu_memory);
+  POP_RANGE()
 }
 
 void scratch_cuda_apply_many_univariate_lut_kb_64(
@@ -205,7 +216,7 @@ void scratch_cuda_apply_many_univariate_lut_kb_64(
     uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
     uint32_t num_many_lut, uint64_t lut_degree, bool allocate_gpu_memory,
     bool allocate_ms_array) {
-
+  PUSH_RANGE("scratch many lut")
   int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                           glwe_dimension * polynomial_size, lwe_dimension,
                           ks_level, ks_base_log, pbs_level, pbs_base_log,
@@ -217,6 +228,7 @@ void scratch_cuda_apply_many_univariate_lut_kb_64(
       (int_radix_lut **)mem_ptr,
       static_cast(input_lut), num_radix_blocks, params,
       num_many_lut, lut_degree, allocate_gpu_memory);
+  POP_RANGE()
 }
 
 void cuda_apply_univariate_lut_kb_64(
@@ -237,8 +249,10 @@ void
 cleanup_cuda_apply_univariate_lut_kb_64(void *const *streams,
                                         uint32_t const *gpu_indexes,
                                         uint32_t gpu_count,
                                         int8_t **mem_ptr_void) {
+  PUSH_RANGE("cleanup univar lut")
   int_radix_lut *mem_ptr = (int_radix_lut *)(*mem_ptr_void);
 
   mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+  POP_RANGE()
 }
 
 void cuda_apply_many_univariate_lut_kb_64(
@@ -263,7 +277,7 @@ void scratch_cuda_apply_bivariate_lut_kb_64(
     uint32_t grouping_factor, uint32_t num_radix_blocks,
     uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
     uint64_t lut_degree, bool allocate_gpu_memory, bool allocate_ms_array) {
-
+  PUSH_RANGE("scratch bivar lut")
   int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                           glwe_dimension * polynomial_size, lwe_dimension,
                           ks_level, ks_base_log, pbs_level, pbs_base_log,
@@ -275,6 +289,7 @@ void scratch_cuda_apply_bivariate_lut_kb_64(
       (int_radix_lut **)mem_ptr,
       static_cast(input_lut), num_radix_blocks, params,
       lut_degree, allocate_gpu_memory);
+  POP_RANGE()
 }
 
 void cuda_apply_bivariate_lut_kb_64(
@@ -297,8 +312,10 @@ void cleanup_cuda_apply_bivariate_lut_kb_64(void *const *streams,
                                             uint32_t const *gpu_indexes,
                                             uint32_t gpu_count,
                                             int8_t **mem_ptr_void) {
+  PUSH_RANGE("cleanup bivar lut")
   int_radix_lut *mem_ptr = (int_radix_lut *)(*mem_ptr_void);
 
   mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+  POP_RANGE()
 }
 
 void scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
index 5b065032e..f0539bb01 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -13,6 +13,7 @@
 #include "polynomial/functions.cuh"
 #include "utils/helper.cuh"
 #include "utils/helper_multi_gpu.cuh"
+#include "utils/helper_profile.cuh"
 #include "utils/kernel_dimensions.cuh"
 #include
 #include
@@ -499,6 +500,7 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
     Torus *const *ksks,
     CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
     int_radix_lut *lut, uint32_t num_radix_blocks) {
+  PUSH_RANGE("apply lut")
   // apply_lookup_table
   auto params = lut->params;
   auto pbs_type = params.pbs_type;
@@ -596,6 +598,7 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
     lwe_array_out->degrees[i] = lut->degrees[degrees_index];
     lwe_array_out->noise_levels[i] = NoiseLevel::NOMINAL;
   }
+  POP_RANGE()
 }
 
 template
@@ -606,6 +609,7 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb(
     Torus *const *ksks,
     CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
     int_radix_lut *lut, uint32_t num_many_lut, uint32_t lut_stride) {
+  PUSH_RANGE("apply many lut")
   // apply_lookup_table
   auto params = lut->params;
   auto pbs_type = params.pbs_type;
@@ -699,6 +703,7 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb(
     lwe_array_out->degrees[i] = lut->degrees[degrees_index];
     lwe_array_out->noise_levels[i] = NoiseLevel::NOMINAL;
   }
+  POP_RANGE()
 }
 
 template
@@ -710,7 +715,7 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
     Torus *const *ksks,
     CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
     int_radix_lut *lut, uint32_t num_radix_blocks, uint32_t shift) {
-
+  PUSH_RANGE("apply bivar lut")
   if (lwe_array_out->lwe_dimension != lwe_array_1->lwe_dimension ||
       lwe_array_out->lwe_dimension != lwe_array_2->lwe_dimension)
     PANIC("Cuda error: input and output radix ciphertexts should have the same "
@@ -814,6 +819,7 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
     lwe_array_out->degrees[i] = lut->degrees[degrees_index];
     lwe_array_out->noise_levels[i] = NoiseLevel::NOMINAL;
   }
+  POP_RANGE()
 }
 
 // Rotates the slice in-place such that the first mid elements of the slice move
@@ -995,7 +1001,7 @@ void generate_device_accumulator_bivariate(
     uint64_t *degree, uint64_t *max_degree, uint32_t glwe_dimension,
     uint32_t polynomial_size, uint32_t message_modulus, uint32_t carry_modulus,
     std::function f) {
-
+  PUSH_RANGE("gen bivar lut acc")
   // host lut
   Torus *h_lut =
       (Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
@@ -1013,6 +1019,7 @@ void generate_device_accumulator_bivariate(
   cuda_synchronize_stream(stream, gpu_index);
 
   free(h_lut);
+  POP_RANGE()
 }
 
 /*
@@ -1092,11 +1099,12 @@ void generate_device_accumulator(cudaStream_t stream, uint32_t gpu_index,
                                  uint32_t message_modulus,
                                  uint32_t carry_modulus,
                                  std::function f) {
-
+  PUSH_RANGE("gen lut acc")
   generate_device_accumulator_with_encoding(
       stream, gpu_index, acc, degree, max_degree, glwe_dimension,
       polynomial_size, message_modulus, carry_modulus, message_modulus,
       carry_modulus, f);
+  POP_RANGE()
 }
 
 /*
@@ -1112,7 +1120,7 @@ void generate_many_lut_device_accumulator(
     uint64_t *max_degree, uint32_t glwe_dimension, uint32_t polynomial_size,
     uint32_t message_modulus, uint32_t carry_modulus,
     std::vector> &functions) {
-
+  PUSH_RANGE("gen many lut acc")
   // host lut
   Torus *h_lut =
       (Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
@@ -1129,6 +1137,7 @@ void generate_many_lut_device_accumulator(
   cuda_synchronize_stream(stream, gpu_index);
 
   free(h_lut);
+  POP_RANGE()
 }
 
 // This function is used to perform step 1 of Thomas' new carry propagation
@@ -1803,6 +1812,7 @@ void host_propagate_single_carry(
     void *const *bsks, Torus *const *ksks,
     CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
     uint32_t requested_flag, uint32_t uses_carry) {
+  PUSH_RANGE("propagate sc")
   auto num_radix_blocks = lwe_array->num_radix_blocks;
   auto params = mem->params;
   auto glwe_dimension = params.glwe_dimension;
@@ -1891,6 +1901,7 @@ void host_propagate_single_carry(
         streams, gpu_indexes, gpu_count, lwe_array, prepared_blocks, bsks, ksks,
         ms_noise_reduction_key, message_extract, num_radix_blocks);
   }
+  POP_RANGE()
 }
 
 // This function perform the three steps of Thomas' new carry propagation
@@ -1904,6 +1915,7 @@ void host_add_and_propagate_single_carry(
     void *const *bsks, Torus *const *ksks,
     CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
     uint32_t requested_flag, uint32_t uses_carry) {
+  PUSH_RANGE("add & propagate sc")
   if (lhs_array->num_radix_blocks != rhs_array->num_radix_blocks)
     PANIC("Cuda error: input and output num radix blocks must be the same")
   if (lhs_array->lwe_dimension != rhs_array->lwe_dimension ||
@@ -2026,6 +2038,7 @@ void host_add_and_propagate_single_carry(
         streams, gpu_indexes, gpu_count, lhs_array, prepared_blocks, bsks, ksks,
         ms_noise_reduction_key, mem->lut_message_extract, num_radix_blocks);
   }
+  POP_RANGE()
 }
 
 template
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu
index 5470eabf5..a4e4a1180 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu
@@ -73,7 +73,7 @@ void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
     uint32_t pbs_level, uint32_t ks_base_log, uint32_t ks_level,
     uint32_t grouping_factor, uint32_t num_radix_blocks, PBS_TYPE pbs_type,
     bool allocate_gpu_memory, bool allocate_ms_array) {
-
+  PUSH_RANGE("scratch mul")
   int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                           polynomial_size * glwe_dimension, lwe_dimension,
                           ks_level, ks_base_log, pbs_level, pbs_base_log,
@@ -97,6 +97,7 @@ void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
     PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
           "Supported N's are powers of two in the interval [256..16384].")
   }
+  POP_RANGE()
 }
 
 /*
@@ -134,7 +135,7 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
     void *const *bsks, void *const *ksks,
     CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
     int8_t *mem_ptr, uint32_t polynomial_size, uint32_t num_blocks) {
-
+  PUSH_RANGE("mul")
   switch (polynomial_size) {
   case 256:
     host_integer_mult_radix_kb>(
@@ -189,16 +190,18 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
     PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
           "Supported N's are powers of two in the interval [256..16384].")
   }
+  POP_RANGE()
 }
 
 void cleanup_cuda_integer_mult(void *const *streams,
                                uint32_t const *gpu_indexes, uint32_t gpu_count,
                                int8_t **mem_ptr_void) {
-
+  PUSH_RANGE("cleanup mul")
   int_mul_memory *mem_ptr = (int_mul_memory *)(*mem_ptr_void);
 
   mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+  POP_RANGE()
 }
 
 void scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh
index 0333a2efe..5013481ba 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh
@@ -116,10 +116,11 @@ __host__ void scratch_cuda_integer_overflowing_sub_kb(
     uint32_t gpu_count, int_overflowing_sub_memory **mem_ptr,
     uint32_t num_blocks, int_radix_params params, bool allocate_gpu_memory,
     bool allocate_ms_array) {
-
+  PUSH_RANGE("scratch overflowing sub")
   *mem_ptr = new int_overflowing_sub_memory(
       streams, gpu_indexes, gpu_count, params, num_blocks, allocate_gpu_memory,
      allocate_ms_array);
+  POP_RANGE()
 }
 
 template
@@ -134,7 +135,7 @@ __host__ void host_integer_overflowing_sub(
     Torus *const *ksks,
     CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
     uint32_t compute_overflow, uint32_t uses_input_borrow) {
-
+  PUSH_RANGE("overflowing sub")
   if (output->num_radix_blocks != input_left->num_radix_blocks ||
       output->num_radix_blocks != input_right->num_radix_blocks)
     PANIC("Cuda error: lwe_array_in and output num radix blocks must be "
@@ -165,6 +166,7 @@ __host__ void host_integer_overflowing_sub(
       streams, gpu_indexes, gpu_count, output, overflow_block, input_borrow,
       (int_borrow_prop_memory *)mem_ptr, bsks, (Torus **)(ksks),
       ms_noise_reduction_key, num_groups, compute_overflow, uses_input_borrow);
+  POP_RANGE()
 }
 
 #endif
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/radix_ciphertext.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/radix_ciphertext.cuh
index ea8272475..1acd3d959 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/radix_ciphertext.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/radix_ciphertext.cuh
@@ -4,6 +4,7 @@
 #include "device.h"
 #include "integer/integer.h"
 #include "integer/radix_ciphertext.h"
+#include "utils/helper_profile.cuh"
 #include "utils/kernel_dimensions.cuh"
 
 template
@@ -12,6 +13,7 @@ void create_zero_radix_ciphertext_async(cudaStream_t const stream,
                                         CudaRadixCiphertextFFI *radix,
                                         const uint32_t num_radix_blocks,
                                         const uint32_t lwe_dimension) {
+  PUSH_RANGE("create zero radix ct");
   radix->lwe_dimension = lwe_dimension;
   radix->num_radix_blocks = num_radix_blocks;
   radix->max_num_radix_blocks = num_radix_blocks;
@@ -25,6 +27,7 @@ void create_zero_radix_ciphertext_async(cudaStream_t const stream,
   if (radix->degrees == NULL || radix->noise_levels == NULL) {
     PANIC("Cuda error: degrees / noise levels not allocated correctly")
   }
+  POP_RANGE();
 }
 
 template
@@ -73,6 +76,7 @@ void copy_radix_ciphertext_slice_async(
     const uint32_t output_end_lwe_index,
     const CudaRadixCiphertextFFI *input_radix,
     const uint32_t input_start_lwe_index, const uint32_t input_end_lwe_index) {
+  PUSH_RANGE("copy radix slice");
   if (output_radix->lwe_dimension != input_radix->lwe_dimension)
     PANIC("Cuda error: input lwe dimension should be equal to output lwe "
           "dimension")
@@ -116,6 +120,7 @@ void copy_radix_ciphertext_slice_async(
     output_radix->noise_levels[i + output_start_lwe_index] =
         input_radix->noise_levels[i + input_start_lwe_index];
   }
+  POP_RANGE();
 }
 
 template
diff --git a/backends/tfhe-cuda-backend/cuda/src/utils/helper_profile.cu b/backends/tfhe-cuda-backend/cuda/src/utils/helper_profile.cu
new file mode 100644
index 000000000..1e229ee9e
--- /dev/null
+++ b/backends/tfhe-cuda-backend/cuda/src/utils/helper_profile.cu
@@ -0,0 +1,42 @@
+#include "helper_profile.cuh"
+
+uint32_t adler32(const unsigned char *data) {
+  const uint32_t MOD_ADLER = 65521;
+  uint32_t a = 1, b = 0;
+  size_t index;
+  for (index = 0; data[index] != 0; ++index) {
+    a = (a + data[index] * 2) % MOD_ADLER;
+    b = (b + a) % MOD_ADLER;
+  }
+  return (b << 16) | a;
+}
+
+void cuda_nvtx_label_with_color(const char *name) {
+#ifdef USE_NVTOOLS
+  int color_id = adler32((const unsigned char *)name);
+  int r, g, b;
+  r = color_id & 0x000000ff;
+  g = (color_id & 0x000ff000) >> 12;
+  b = (color_id & 0x0ff00000) >> 20;
+  if (r < 64 & g < 64 & b < 64) {
+    r = r * 3;
+    g = g * 3 + 64;
+    b = b * 4;
+  }
+
+  color_id = 0xff000000 | (r << 16) | (g << 8) | (b);
+  nvtxEventAttributes_t eventAttrib = {0};
+  eventAttrib.version = NVTX_VERSION;
+  eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
+  eventAttrib.colorType = NVTX_COLOR_ARGB;
+  eventAttrib.color = color_id;
+  eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
+  eventAttrib.message.ascii = name;
+  nvtxRangePushEx(&eventAttrib);
+#endif
+}
+void cuda_nvtx_pop() {
+#ifdef USE_NVTOOLS
+  nvtxRangePop();
+#endif
+}
diff --git a/backends/tfhe-cuda-backend/cuda/src/utils/helper_profile.cuh b/backends/tfhe-cuda-backend/cuda/src/utils/helper_profile.cuh
new file mode 100644
index 000000000..5e2dbf6ea
--- /dev/null
+++ b/backends/tfhe-cuda-backend/cuda/src/utils/helper_profile.cuh
@@ -0,0 +1,13 @@
+#ifndef HELPER_PROFILE
+#define HELPER_PROFILE
+#include
+
+void cuda_nvtx_label_with_color(const char *name);
+void cuda_nvtx_pop();
+
+#define PUSH_RANGE(name) \
+  { cuda_nvtx_label_with_color(name); }
+#define POP_RANGE() \
+  { cuda_nvtx_pop(); }
+
+#endif
diff --git a/tfhe/Cargo.toml b/tfhe/Cargo.toml
index 9643db4b9..992dee09f 100644
--- a/tfhe/Cargo.toml
+++ b/tfhe/Cargo.toml
@@ -91,6 +91,7 @@ strings = ["integer"]
 internal-keycache = ["dep:fs2"]
 gpu = ["dep:tfhe-cuda-backend", "shortint"]
 gpu-experimental-multi-arch = ["gpu", "tfhe-cuda-backend/experimental-multi-arch"]
+gpu-profile = ["gpu", "tfhe-cuda-backend/profile"]
 zk-pok = ["dep:tfhe-zk-pok"]
 
 # Adds more FheUint/FheInt types to the HL
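
A minimal sketch of the instrumentation pattern this patch applies, not part of the diff above: every backend entry point is bracketed by the PUSH_RANGE/POP_RANGE macros from helper_profile.cuh, so that a build with USE_NVTOOLS defined shows each call as a named, colored NVTX range. The function name cuda_example_operation_64 and its body are hypothetical; only the two macros and the header path come from this patch. When USE_NVTOOLS is not defined, the helpers' bodies are compiled out, so the calls are effectively no-ops.

#include <cstdint>
#include "utils/helper_profile.cuh"

// Hypothetical entry point, shown only to illustrate the pattern used
// throughout this patch: open a named NVTX range on entry and close it
// before returning so that ranges stay balanced and nest correctly.
void cuda_example_operation_64(void *const *streams,
                               uint32_t const *gpu_indexes, uint32_t gpu_count,
                               int8_t *mem_ptr) {
  PUSH_RANGE("example operation")
  // ... launch kernels or call host_* helpers here ...
  POP_RANGE()
}

With the crates built using the new features (profile on tfhe-cuda-backend, or gpu-profile on tfhe), a timeline capture taken with a command along the lines of nsys profile --trace=cuda,nvtx <application> should show these ranges on the NVTX row, alongside the CUDA kernels they enclose.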