From 25d1a4e4dd329e5e92fe8c4b633095e6b28f08c5 Mon Sep 17 00:00:00 2001
From: Guillermo Oyarzun
Date: Wed, 19 Jun 2024 09:37:28 +0200
Subject: [PATCH] chore(gpu): add nvtx tool for profiling

---
 backends/tfhe-cuda-backend/Cargo.toml         |  1 +
 backends/tfhe-cuda-backend/build.rs           |  7 ++++
 .../tfhe-cuda-backend/cuda/CMakeLists.txt     |  9 +++-
 .../cuda/src/integer/cmux.cu                  |  9 ++--
 .../cuda/src/integer/comparison.cu            |  9 ++--
 .../cuda/src/integer/div_rem.cu               |  8 +++-
 .../cuda/src/integer/integer.cu               | 31 ++++++++++----
 .../cuda/src/integer/integer.cuh              | 21 ++++++++--
 .../cuda/src/integer/multiplication.cu        |  9 ++--
 .../cuda/src/integer/negation.cuh             |  6 ++-
 .../cuda/src/integer/radix_ciphertext.cuh     |  5 +++
 .../cuda/src/utils/helper_profile.cu          | 42 +++++++++++++++++++
 .../cuda/src/utils/helper_profile.cuh         | 13 ++++++
 tfhe/Cargo.toml                               |  1 +
 14 files changed, 146 insertions(+), 25 deletions(-)
 create mode 100644 backends/tfhe-cuda-backend/cuda/src/utils/helper_profile.cu
 create mode 100644 backends/tfhe-cuda-backend/cuda/src/utils/helper_profile.cuh

diff --git a/backends/tfhe-cuda-backend/Cargo.toml b/backends/tfhe-cuda-backend/Cargo.toml
index b70e19d64..a3cb0b7cd 100644
--- a/backends/tfhe-cuda-backend/Cargo.toml
+++ b/backends/tfhe-cuda-backend/Cargo.toml
@@ -18,3 +18,4 @@ bindgen = "0.71"
 
 [features]
 experimental-multi-arch = []
+profile = []
diff --git a/backends/tfhe-cuda-backend/build.rs b/backends/tfhe-cuda-backend/build.rs
index c1676d281..0334701d4 100644
--- a/backends/tfhe-cuda-backend/build.rs
+++ b/backends/tfhe-cuda-backend/build.rs
@@ -45,6 +45,13 @@ fn main() {
     } else {
         cmake_config.define("MULTI_ARCH", "OFF");
     }
+    // Conditionally pass the "USE_NVTOOLS" variable to CMake if the feature is enabled
+    if cfg!(feature = "profile") {
+        cmake_config.define("USE_NVTOOLS", "ON");
+        println!("cargo:rustc-link-lib=nvToolsExt");
+    } else {
+        cmake_config.define("USE_NVTOOLS", "OFF");
+    }
 
     // Build the CMake project
     let dest = cmake_config.build();
diff --git a/backends/tfhe-cuda-backend/cuda/CMakeLists.txt b/backends/tfhe-cuda-backend/cuda/CMakeLists.txt
index 3b977b35f..196fbb9c1 100644
--- a/backends/tfhe-cuda-backend/cuda/CMakeLists.txt
+++ b/backends/tfhe-cuda-backend/cuda/CMakeLists.txt
@@ -88,7 +88,14 @@ else()
   set(OPTIMIZATION_FLAGS "${OPTIMIZATION_FLAGS} -O3")
 endif()
 
-# in production, should use -arch=sm_70 --ptxas-options=-v to see register spills -lineinfo for better debugging
+# Check if the USE_NVTOOLS environment variable is set
+if(${USE_NVTOOLS})
+  message(STATUS "USE_NVTOOLS is enabled")
+  add_definitions(-DUSE_NVTOOLS)
+endif()
+
+# in production, should use -arch=sm_70 --ptxas-options=-v to see register spills -lineinfo for better debugging to use
+# nvtx when profiling -lnvToolsExt
 set(CMAKE_CUDA_FLAGS
     "${CMAKE_CUDA_FLAGS} -ccbin ${CMAKE_CXX_COMPILER} ${OPTIMIZATION_FLAGS}\
 -std=c++17 --no-exceptions --expt-relaxed-constexpr -rdc=true \
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu
index 908e73dd3..cd1c83fbb 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu
@@ -8,7 +8,7 @@ void scratch_cuda_integer_radix_cmux_kb_64(
     uint32_t grouping_factor, uint32_t lwe_ciphertext_count,
     uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
     bool allocate_gpu_memory, bool allocate_ms_array) {
-
+  PUSH_RANGE("scratch cmux")
   int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                           big_lwe_dimension, small_lwe_dimension, ks_level,
                           ks_base_log, pbs_level, pbs_base_log, grouping_factor,
@@ -21,6 +21,7 @@ void scratch_cuda_integer_radix_cmux_kb_64(
       (cudaStream_t *)(streams), gpu_indexes, gpu_count,
       (int_cmux_buffer **)mem_ptr, predicate_lut_f,
       lwe_ciphertext_count, params, allocate_gpu_memory);
+  POP_RANGE()
 }
 
 void cuda_cmux_integer_radix_ciphertext_kb_64(
@@ -31,20 +32,22 @@
     CudaRadixCiphertextFFI const *lwe_array_false, int8_t *mem_ptr,
     void *const *bsks, void *const *ksks,
     CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
-
+  PUSH_RANGE("cmux")
   host_integer_radix_cmux_kb(
       (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out,
      lwe_condition, lwe_array_true, lwe_array_false,
      (int_cmux_buffer *)mem_ptr, bsks, (uint64_t **)(ksks),
      ms_noise_reduction_key);
+  POP_RANGE()
 }
 
 void cleanup_cuda_integer_radix_cmux(void *const *streams,
                                      uint32_t const *gpu_indexes,
                                      uint32_t gpu_count,
                                      int8_t **mem_ptr_void) {
-
+  PUSH_RANGE("cleanup cmux")
   int_cmux_buffer *mem_ptr = (int_cmux_buffer *)(*mem_ptr_void);
 
   mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+  POP_RANGE()
 }
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cu b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cu
index be6ebb049..d34544050 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cu
@@ -9,7 +9,7 @@ void scratch_cuda_integer_radix_comparison_kb_64(
     uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
     COMPARISON_TYPE op_type, bool is_signed, bool allocate_gpu_memory,
     bool allocate_ms_array) {
-
+  PUSH_RANGE("scratch comparison")
   int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                           big_lwe_dimension, small_lwe_dimension, ks_level,
                           ks_base_log, pbs_level, pbs_base_log, grouping_factor,
@@ -35,6 +35,7 @@ void scratch_cuda_integer_radix_comparison_kb_64(
         op_type, is_signed, allocate_gpu_memory);
     break;
   }
+  POP_RANGE()
 }
 
 void cuda_comparison_integer_radix_ciphertext_kb_64(
@@ -44,7 +45,7 @@
     CudaRadixCiphertextFFI const *lwe_array_2, int8_t *mem_ptr,
     void *const *bsks, void *const *ksks,
     CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
-
+  PUSH_RANGE("comparison")
   if (lwe_array_1->num_radix_blocks != lwe_array_1->num_radix_blocks)
     PANIC("Cuda error: input num radix blocks must be the same")
   // The output ciphertext might be a boolean block or a radix ciphertext
@@ -85,16 +86,18 @@
   default:
     PANIC("Cuda error: integer operation not supported")
   }
+  POP_RANGE()
 }
 
 void cleanup_cuda_integer_comparison(void *const *streams,
                                      uint32_t const *gpu_indexes,
                                      uint32_t gpu_count,
                                      int8_t **mem_ptr_void) {
-
+  PUSH_RANGE("cleanup comparison")
   int_comparison_buffer *mem_ptr = (int_comparison_buffer *)(*mem_ptr_void);
 
   mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+  POP_RANGE()
 }
 
 void scratch_cuda_integer_are_all_comparisons_block_true_kb_64(
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cu b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cu
index 0c1656465..77f400354 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cu
@@ -8,7 +8,7 @@ void scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
     uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
    PBS_TYPE pbs_type, bool allocate_gpu_memory, bool allocate_ms_array) {
-
+  PUSH_RANGE("scratch div")
   int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                           big_lwe_dimension, small_lwe_dimension, ks_level,
                           ks_base_log, pbs_level, pbs_base_log, grouping_factor,
@@ -18,6 +18,7 @@ void scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
       (cudaStream_t *)(streams), gpu_indexes, gpu_count, is_signed,
       (int_div_rem_memory **)mem_ptr, num_blocks, params,
       allocate_gpu_memory);
+  POP_RANGE()
 }
 
 void cuda_integer_div_rem_radix_ciphertext_kb_64(
@@ -27,20 +28,23 @@
     CudaRadixCiphertextFFI const *divisor, bool is_signed, int8_t *mem_ptr,
     void *const *bsks, void *const *ksks,
     CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
-
+  PUSH_RANGE("div")
   auto mem = (int_div_rem_memory *)mem_ptr;
 
   host_integer_div_rem_kb(
       (cudaStream_t *)(streams), gpu_indexes, gpu_count, quotient, remainder,
      numerator, divisor, is_signed, bsks, (uint64_t **)(ksks),
      ms_noise_reduction_key, mem);
+  POP_RANGE()
 }
 
 void cleanup_cuda_integer_div_rem(void *const *streams,
                                   uint32_t const *gpu_indexes,
                                   uint32_t gpu_count, int8_t **mem_ptr_void) {
+  PUSH_RANGE("cleanup div")
   int_div_rem_memory *mem_ptr = (int_div_rem_memory *)(*mem_ptr_void);
 
   mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+  POP_RANGE()
 }
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
index bddf077a6..3a2fb89c2 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
@@ -52,7 +52,7 @@ void scratch_cuda_propagate_single_carry_kb_64_inplace(
     uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
     uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t requested_flag,
     uint32_t uses_carry, bool allocate_gpu_memory, bool allocate_ms_array) {
-
+  PUSH_RANGE("scratch propagate sc")
   int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                           big_lwe_dimension, small_lwe_dimension, ks_level,
                           ks_base_log, pbs_level, pbs_base_log, grouping_factor,
@@ -62,6 +62,7 @@ void scratch_cuda_propagate_single_carry_kb_64_inplace(
       (cudaStream_t *)(streams), gpu_indexes, gpu_count,
       (int_sc_prop_memory **)mem_ptr, num_blocks, params, requested_flag,
       uses_carry, allocate_gpu_memory);
+  POP_RANGE()
 }
 
 void scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(
@@ -72,7 +73,7 @@ void scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(
     uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
     uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t requested_flag,
     uint32_t uses_carry, bool allocate_gpu_memory, bool allocate_ms_array) {
-
+  PUSH_RANGE("scratch add & propagate sc")
   int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                           big_lwe_dimension, small_lwe_dimension, ks_level,
                           ks_base_log, pbs_level, pbs_base_log, grouping_factor,
@@ -82,6 +83,7 @@ void scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(
       (cudaStream_t *)(streams), gpu_indexes, gpu_count,
       (int_sc_prop_memory **)mem_ptr, num_blocks, params, requested_flag,
       uses_carry, allocate_gpu_memory);
+  POP_RANGE()
 }
 
 void scratch_cuda_integer_overflowing_sub_kb_64_inplace(
@@ -92,7 +94,7 @@ void scratch_cuda_integer_overflowing_sub_kb_64_inplace(
     uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
     uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t compute_overflow,
     bool allocate_gpu_memory, bool allocate_ms_array) {
-
+  PUSH_RANGE("scratch overflow sub")
   int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                           big_lwe_dimension, small_lwe_dimension, ks_level,
                           ks_base_log, pbs_level, pbs_base_log, grouping_factor,
@@ -102,6 +104,7 @@ void scratch_cuda_integer_overflowing_sub_kb_64_inplace(
       (cudaStream_t *)(streams), gpu_indexes, gpu_count,
       (int_borrow_prop_memory **)mem_ptr, num_blocks, params,
       compute_overflow, allocate_gpu_memory);
+  POP_RANGE()
 }
 
 void cuda_propagate_single_carry_kb_64_inplace(
@@ -140,38 +143,45 @@ void cuda_integer_overflowing_sub_kb_64_inplace(
     void *const *bsks, void *const *ksks,
     CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
     uint32_t compute_overflow, uint32_t uses_input_borrow) {
-
+  PUSH_RANGE("overflow sub")
   host_integer_overflowing_sub(
       (cudaStream_t const *)streams, gpu_indexes, gpu_count, lhs_array,
       lhs_array, rhs_array, overflow_block, input_borrow,
       (int_borrow_prop_memory *)mem_ptr, bsks, (uint64_t **)ksks,
       ms_noise_reduction_key, compute_overflow, uses_input_borrow);
+  POP_RANGE()
 }
 
 void cleanup_cuda_propagate_single_carry(void *const *streams,
                                          uint32_t const *gpu_indexes,
                                          uint32_t gpu_count,
                                          int8_t **mem_ptr_void) {
+  PUSH_RANGE("cleanup propagate sc")
   int_sc_prop_memory *mem_ptr = (int_sc_prop_memory *)(*mem_ptr_void);
 
   mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+  POP_RANGE()
 }
 
 void cleanup_cuda_add_and_propagate_single_carry(void *const *streams,
                                                  uint32_t const *gpu_indexes,
                                                  uint32_t gpu_count,
                                                  int8_t **mem_ptr_void) {
+  PUSH_RANGE("cleanup add & propagate sc")
   int_sc_prop_memory *mem_ptr = (int_sc_prop_memory *)(*mem_ptr_void);
 
   mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+  POP_RANGE()
 }
 
 void cleanup_cuda_integer_overflowing_sub(void *const *streams,
                                           uint32_t const *gpu_indexes,
                                           uint32_t gpu_count,
                                           int8_t **mem_ptr_void) {
+  PUSH_RANGE("cleanup overflow sub")
   int_borrow_prop_memory *mem_ptr = (int_borrow_prop_memory *)(*mem_ptr_void);
 
   mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+  POP_RANGE()
 }
 
 void scratch_cuda_apply_univariate_lut_kb_64(
@@ -182,7 +192,7 @@ void scratch_cuda_apply_univariate_lut_kb_64(
     uint32_t grouping_factor, uint32_t num_radix_blocks,
     uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
     uint64_t lut_degree, bool allocate_gpu_memory, bool allocate_ms_array) {
-
+  PUSH_RANGE("scratch univar lut")
   int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                           glwe_dimension * polynomial_size, lwe_dimension,
                           ks_level, ks_base_log, pbs_level, pbs_base_log,
@@ -194,6 +204,7 @@ void scratch_cuda_apply_univariate_lut_kb_64(
       (int_radix_lut **)mem_ptr,
       static_cast(input_lut), num_radix_blocks, params,
       lut_degree, allocate_gpu_memory);
+  POP_RANGE()
 }
 
 void scratch_cuda_apply_many_univariate_lut_kb_64(
@@ -205,7 +216,7 @@ void scratch_cuda_apply_many_univariate_lut_kb_64(
     uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
     uint32_t num_many_lut, uint64_t lut_degree, bool allocate_gpu_memory,
     bool allocate_ms_array) {
-
+  PUSH_RANGE("scratch many lut")
   int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                           glwe_dimension * polynomial_size, lwe_dimension,
                           ks_level, ks_base_log, pbs_level, pbs_base_log,
@@ -217,6 +228,7 @@ void scratch_cuda_apply_many_univariate_lut_kb_64(
       (int_radix_lut **)mem_ptr,
       static_cast(input_lut), num_radix_blocks, params,
       num_many_lut, lut_degree, allocate_gpu_memory);
+  POP_RANGE()
 }
 
 void cuda_apply_univariate_lut_kb_64(
@@ -237,8 +249,10 @@ void
 cleanup_cuda_apply_univariate_lut_kb_64(void *const *streams,
                                         uint32_t const *gpu_indexes,
                                         uint32_t gpu_count,
                                         int8_t **mem_ptr_void) {
+  PUSH_RANGE("cleanup univar lut")
   int_radix_lut *mem_ptr = (int_radix_lut *)(*mem_ptr_void);
 
   mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+  POP_RANGE()
 }
 
 void cuda_apply_many_univariate_lut_kb_64(
@@ -263,7 +277,7 @@ void scratch_cuda_apply_bivariate_lut_kb_64(
     uint32_t grouping_factor, uint32_t num_radix_blocks,
     uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
     uint64_t lut_degree, bool allocate_gpu_memory, bool allocate_ms_array) {
-
+  PUSH_RANGE("scratch bivar lut")
   int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                           glwe_dimension * polynomial_size, lwe_dimension,
                           ks_level, ks_base_log, pbs_level, pbs_base_log,
@@ -275,6 +289,7 @@ void scratch_cuda_apply_bivariate_lut_kb_64(
       (int_radix_lut **)mem_ptr,
       static_cast(input_lut), num_radix_blocks, params,
       lut_degree, allocate_gpu_memory);
+  POP_RANGE()
 }
 
 void cuda_apply_bivariate_lut_kb_64(
@@ -297,8 +312,10 @@ void cleanup_cuda_apply_bivariate_lut_kb_64(void *const *streams,
                                             uint32_t const *gpu_indexes,
                                             uint32_t gpu_count,
                                             int8_t **mem_ptr_void) {
+  PUSH_RANGE("cleanup bivar lut")
   int_radix_lut *mem_ptr = (int_radix_lut *)(*mem_ptr_void);
 
   mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+  POP_RANGE()
 }
 
 void scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
index 5b065032e..f0539bb01 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -13,6 +13,7 @@
 #include "polynomial/functions.cuh"
 #include "utils/helper.cuh"
 #include "utils/helper_multi_gpu.cuh"
+#include "utils/helper_profile.cuh"
 #include "utils/kernel_dimensions.cuh"
 #include
 #include
@@ -499,6 +500,7 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
     Torus *const *ksks,
     CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
     int_radix_lut *lut, uint32_t num_radix_blocks) {
+  PUSH_RANGE("apply lut")
   // apply_lookup_table
   auto params = lut->params;
   auto pbs_type = params.pbs_type;
@@ -596,6 +598,7 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
     lwe_array_out->degrees[i] = lut->degrees[degrees_index];
     lwe_array_out->noise_levels[i] = NoiseLevel::NOMINAL;
   }
+  POP_RANGE()
 }
 
 template
@@ -606,6 +609,7 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb(
     Torus *const *ksks,
     CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
     int_radix_lut *lut, uint32_t num_many_lut, uint32_t lut_stride) {
+  PUSH_RANGE("apply many lut")
   // apply_lookup_table
   auto params = lut->params;
   auto pbs_type = params.pbs_type;
@@ -699,6 +703,7 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb(
     lwe_array_out->degrees[i] = lut->degrees[degrees_index];
     lwe_array_out->noise_levels[i] = NoiseLevel::NOMINAL;
   }
+  POP_RANGE()
 }
 
 template
@@ -710,7 +715,7 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
     Torus *const *ksks,
     CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
     int_radix_lut *lut, uint32_t num_radix_blocks, uint32_t shift) {
-
+  PUSH_RANGE("apply bivar lut")
   if (lwe_array_out->lwe_dimension != lwe_array_1->lwe_dimension ||
       lwe_array_out->lwe_dimension != lwe_array_2->lwe_dimension)
     PANIC("Cuda error: input and output radix ciphertexts should have the same "
@@ -814,6 +819,7 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
     lwe_array_out->degrees[i] = lut->degrees[degrees_index];
     lwe_array_out->noise_levels[i] = NoiseLevel::NOMINAL;
   }
+  POP_RANGE()
 }
 
 // Rotates the slice in-place such that the first mid elements of the slice move
@@ -995,7 +1001,7 @@ void generate_device_accumulator_bivariate(
     uint64_t *degree, uint64_t *max_degree, uint32_t glwe_dimension,
     uint32_t polynomial_size, uint32_t message_modulus, uint32_t carry_modulus,
     std::function f) {
-
+  PUSH_RANGE("gen bivar lut acc")
   // host lut
   Torus *h_lut =
       (Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
@@ -1013,6 +1019,7 @@ void generate_device_accumulator_bivariate(
   cuda_synchronize_stream(stream, gpu_index);
 
   free(h_lut);
+  POP_RANGE()
 }
 
 /*
@@ -1092,11 +1099,12 @@ void generate_device_accumulator(cudaStream_t stream, uint32_t gpu_index,
                                  uint32_t message_modulus,
                                  uint32_t carry_modulus,
                                  std::function f) {
-
+  PUSH_RANGE("gen lut acc")
   generate_device_accumulator_with_encoding(
       stream, gpu_index, acc, degree, max_degree, glwe_dimension,
       polynomial_size, message_modulus, carry_modulus, message_modulus,
       carry_modulus, f);
+  POP_RANGE()
 }
 
 /*
@@ -1112,7 +1120,7 @@ void generate_many_lut_device_accumulator(
     uint64_t *max_degree, uint32_t glwe_dimension, uint32_t polynomial_size,
     uint32_t message_modulus, uint32_t carry_modulus,
     std::vector> &functions) {
-
+  PUSH_RANGE("gen many lut acc")
   // host lut
   Torus *h_lut =
       (Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
@@ -1129,6 +1137,7 @@ void generate_many_lut_device_accumulator(
   cuda_synchronize_stream(stream, gpu_index);
 
   free(h_lut);
+  POP_RANGE()
 }
 
 // This function is used to perform step 1 of Thomas' new carry propagation
@@ -1803,6 +1812,7 @@ void host_propagate_single_carry(
     void *const *bsks, Torus *const *ksks,
     CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
     uint32_t requested_flag, uint32_t uses_carry) {
+  PUSH_RANGE("propagate sc")
   auto num_radix_blocks = lwe_array->num_radix_blocks;
   auto params = mem->params;
   auto glwe_dimension = params.glwe_dimension;
@@ -1891,6 +1901,7 @@ void host_propagate_single_carry(
         streams, gpu_indexes, gpu_count, lwe_array, prepared_blocks, bsks, ksks,
         ms_noise_reduction_key, message_extract, num_radix_blocks);
   }
+  POP_RANGE()
 }
 
 // This function perform the three steps of Thomas' new carry propagation
@@ -1904,6 +1915,7 @@ void host_add_and_propagate_single_carry(
     void *const *bsks, Torus *const *ksks,
     CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
     uint32_t requested_flag, uint32_t uses_carry) {
+  PUSH_RANGE("add & propagate sc")
   if (lhs_array->num_radix_blocks != rhs_array->num_radix_blocks)
     PANIC("Cuda error: input and output num radix blocks must be the same")
   if (lhs_array->lwe_dimension != rhs_array->lwe_dimension ||
@@ -2026,6 +2038,7 @@ void host_add_and_propagate_single_carry(
         streams, gpu_indexes, gpu_count, lhs_array, prepared_blocks, bsks, ksks,
         ms_noise_reduction_key, mem->lut_message_extract, num_radix_blocks);
   }
+  POP_RANGE()
 }
 
 template
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu
index 5470eabf5..a4e4a1180 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu
@@ -73,7 +73,7 @@ void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
     uint32_t pbs_level, uint32_t ks_base_log, uint32_t ks_level,
     uint32_t grouping_factor, uint32_t num_radix_blocks, PBS_TYPE pbs_type,
     bool allocate_gpu_memory, bool allocate_ms_array) {
-
+  PUSH_RANGE("scratch mul")
   int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                           polynomial_size * glwe_dimension, lwe_dimension,
                           ks_level, ks_base_log, pbs_level, pbs_base_log,
@@ -97,6 +97,7 @@ void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
     PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
           "Supported N's are powers of two in the interval [256..16384].")
   }
+  POP_RANGE()
 }
 
 /*
@@ -134,7 +135,7 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
     void *const *bsks, void *const *ksks,
     CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
     int8_t *mem_ptr, uint32_t polynomial_size, uint32_t num_blocks) {
-
+  PUSH_RANGE("mul")
   switch (polynomial_size) {
   case 256:
     host_integer_mult_radix_kb>(
@@ -189,16 +190,18 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
     PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
           "Supported N's are powers of two in the interval [256..16384].")
   }
+  POP_RANGE()
 }
 
 void cleanup_cuda_integer_mult(void *const *streams,
                                uint32_t const *gpu_indexes, uint32_t gpu_count,
                                int8_t **mem_ptr_void) {
-
+  PUSH_RANGE("cleanup mul")
   int_mul_memory *mem_ptr = (int_mul_memory *)(*mem_ptr_void);
 
   mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+  POP_RANGE()
 }
 
 void scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh
index 0333a2efe..5013481ba 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh
@@ -116,10 +116,11 @@ __host__ void scratch_cuda_integer_overflowing_sub_kb(
     uint32_t gpu_count, int_overflowing_sub_memory **mem_ptr,
     uint32_t num_blocks, int_radix_params params, bool allocate_gpu_memory,
     bool allocate_ms_array) {
-
+  PUSH_RANGE("scratch overflowing sub")
   *mem_ptr = new int_overflowing_sub_memory(
       streams, gpu_indexes, gpu_count, params, num_blocks, allocate_gpu_memory,
      allocate_ms_array);
+  POP_RANGE()
 }
 
 template
@@ -134,7 +135,7 @@ __host__ void host_integer_overflowing_sub(
     Torus *const *ksks,
     CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
     uint32_t compute_overflow, uint32_t uses_input_borrow) {
-
+  PUSH_RANGE("overflowing sub")
   if (output->num_radix_blocks != input_left->num_radix_blocks ||
       output->num_radix_blocks != input_right->num_radix_blocks)
     PANIC("Cuda error: lwe_array_in and output num radix blocks must be "
@@ -165,6 +166,7 @@ __host__ void host_integer_overflowing_sub(
       streams, gpu_indexes, gpu_count, output, overflow_block, input_borrow,
       (int_borrow_prop_memory *)mem_ptr, bsks, (Torus **)(ksks),
       ms_noise_reduction_key, num_groups, compute_overflow, uses_input_borrow);
+  POP_RANGE()
 }
 
 #endif
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/radix_ciphertext.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/radix_ciphertext.cuh
index ea8272475..1acd3d959 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/radix_ciphertext.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/radix_ciphertext.cuh
@@ -4,6 +4,7 @@
 #include "device.h"
 #include "integer/integer.h"
 #include "integer/radix_ciphertext.h"
+#include "utils/helper_profile.cuh"
 #include "utils/kernel_dimensions.cuh"
 
 template
@@ -12,6 +13,7 @@ void create_zero_radix_ciphertext_async(cudaStream_t const stream,
                                         CudaRadixCiphertextFFI *radix,
                                         const uint32_t num_radix_blocks,
                                         const uint32_t lwe_dimension) {
+  PUSH_RANGE("create zero radix ct");
   radix->lwe_dimension = lwe_dimension;
   radix->num_radix_blocks = num_radix_blocks;
   radix->max_num_radix_blocks = num_radix_blocks;
@@ -25,6 +27,7 @@ void create_zero_radix_ciphertext_async(cudaStream_t const stream,
   if (radix->degrees == NULL || radix->noise_levels == NULL) {
     PANIC("Cuda error: degrees / noise levels not allocated correctly")
   }
+  POP_RANGE();
 }
 
 template
@@ -73,6 +76,7 @@ void copy_radix_ciphertext_slice_async(
     const uint32_t output_end_lwe_index,
     const CudaRadixCiphertextFFI *input_radix,
     const uint32_t input_start_lwe_index, const uint32_t input_end_lwe_index) {
+  PUSH_RANGE("copy radix slice");
   if (output_radix->lwe_dimension != input_radix->lwe_dimension)
     PANIC("Cuda error: input lwe dimension should be equal to output lwe "
           "dimension")
@@ -116,6 +120,7 @@ void copy_radix_ciphertext_slice_async(
     output_radix->noise_levels[i + output_start_lwe_index] =
         input_radix->noise_levels[i + input_start_lwe_index];
   }
+  POP_RANGE();
 }
 
 template
diff --git a/backends/tfhe-cuda-backend/cuda/src/utils/helper_profile.cu b/backends/tfhe-cuda-backend/cuda/src/utils/helper_profile.cu
new file mode 100644
index 000000000..1e229ee9e
--- /dev/null
+++ b/backends/tfhe-cuda-backend/cuda/src/utils/helper_profile.cu
@@ -0,0 +1,42 @@
+#include "helper_profile.cuh"
+
+uint32_t adler32(const unsigned char *data) {
+  const uint32_t MOD_ADLER = 65521;
+  uint32_t a = 1, b = 0;
+  size_t index;
+  for (index = 0; data[index] != 0; ++index) {
+    a = (a + data[index] * 2) % MOD_ADLER;
+    b = (b + a) % MOD_ADLER;
+  }
+  return (b << 16) | a;
+}
+
+void cuda_nvtx_label_with_color(const char *name) {
+#ifdef USE_NVTOOLS
+  int color_id = adler32((const unsigned char *)name);
+  int r, g, b;
+  r = color_id & 0x000000ff;
+  g = (color_id & 0x000ff000) >> 12;
+  b = (color_id & 0x0ff00000) >> 20;
+  if (r < 64 & g < 64 & b < 64) {
+    r = r * 3;
+    g = g * 3 + 64;
+    b = b * 4;
+  }
+
+  color_id = 0xff000000 | (r << 16) | (g << 8) | (b);
+  nvtxEventAttributes_t eventAttrib = {0};
+  eventAttrib.version = NVTX_VERSION;
+  eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
+  eventAttrib.colorType = NVTX_COLOR_ARGB;
+  eventAttrib.color = color_id;
+  eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
+  eventAttrib.message.ascii = name;
+  nvtxRangePushEx(&eventAttrib);
+#endif
+}
+void cuda_nvtx_pop() {
+#ifdef USE_NVTOOLS
+  nvtxRangePop();
+#endif
+}
diff --git a/backends/tfhe-cuda-backend/cuda/src/utils/helper_profile.cuh b/backends/tfhe-cuda-backend/cuda/src/utils/helper_profile.cuh
new file mode 100644
index 000000000..5e2dbf6ea
--- /dev/null
+++ b/backends/tfhe-cuda-backend/cuda/src/utils/helper_profile.cuh
@@ -0,0 +1,13 @@
+#ifndef HELPER_PROFILE
+#define HELPER_PROFILE
+#include
+
+void cuda_nvtx_label_with_color(const char *name);
+void cuda_nvtx_pop();
+
+#define PUSH_RANGE(name) \
+  { cuda_nvtx_label_with_color(name); }
+#define POP_RANGE() \
+  { cuda_nvtx_pop(); }
+
+#endif
diff --git a/tfhe/Cargo.toml b/tfhe/Cargo.toml
index 9643db4b9..992dee09f 100644
--- a/tfhe/Cargo.toml
+++ b/tfhe/Cargo.toml
@@ -91,6 +91,7 @@ strings = ["integer"]
 internal-keycache = ["dep:fs2"]
 gpu = ["dep:tfhe-cuda-backend", "shortint"]
 gpu-experimental-multi-arch = ["gpu", "tfhe-cuda-backend/experimental-multi-arch"]
+gpu-profile = ["gpu", "tfhe-cuda-backend/profile"]
 zk-pok = ["dep:tfhe-zk-pok"]
 
 # Adds more FheUint/FheInt types to the HL
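
A minimal sketch of the instrumentation pattern this patch applies, not part of the diff above: every backend entry point is bracketed by the PUSH_RANGE/POP_RANGE macros from helper_profile.cuh, so that a build with USE_NVTOOLS defined shows each call as a named, colored NVTX range. The function name cuda_example_operation_64 and its body are hypothetical; only the two macros and the header path come from this patch. When USE_NVTOOLS is not defined, the helpers' bodies are compiled out, so the calls are effectively no-ops.

#include <cstdint>
#include "utils/helper_profile.cuh"

// Hypothetical entry point, shown only to illustrate the pattern used
// throughout this patch: open a named NVTX range on entry and close it
// before returning so that ranges stay balanced and nest correctly.
void cuda_example_operation_64(void *const *streams,
                               uint32_t const *gpu_indexes, uint32_t gpu_count,
                               int8_t *mem_ptr) {
  PUSH_RANGE("example operation")
  // ... launch kernels or call host_* helpers here ...
  POP_RANGE()
}

With the crates built using the new features (profile on tfhe-cuda-backend, or gpu-profile on tfhe), a timeline capture taken with a command along the lines of nsys profile --trace=cuda,nvtx <application> should show these ranges on the NVTX row, alongside the CUDA kernels they enclose.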