chore(gpu): add nvtx tool for profiling
Committed by: Agnès Leroy
Parent: ffdaf6ad13
Commit: 25d1a4e4dd
@@ -18,3 +18,4 @@ bindgen = "0.71"

[features]
experimental-multi-arch = []
profile = []

@@ -45,6 +45,13 @@ fn main() {
} else {
cmake_config.define("MULTI_ARCH", "OFF");
}
// Conditionally pass the "USE_NVTOOLS" variable to CMake if the feature is enabled
if cfg!(feature = "profile") {
cmake_config.define("USE_NVTOOLS", "ON");
println!("cargo:rustc-link-lib=nvToolsExt");
} else {
cmake_config.define("USE_NVTOOLS", "OFF");
}

// Build the CMake project
let dest = cmake_config.build();

@@ -88,7 +88,14 @@ else()
set(OPTIMIZATION_FLAGS "${OPTIMIZATION_FLAGS} -O3")
endif()

# in production, should use -arch=sm_70 --ptxas-options=-v to see register spills -lineinfo for better debugging
# Check if the USE_NVTOOLS environment variable is set
if(${USE_NVTOOLS})
message(STATUS "USE_NVTOOLS is enabled")
add_definitions(-DUSE_NVTOOLS)
endif()

# in production, should use -arch=sm_70 --ptxas-options=-v to see register spills -lineinfo for better debugging to use
# nvtx when profiling -lnvToolsExt
set(CMAKE_CUDA_FLAGS
"${CMAKE_CUDA_FLAGS} -ccbin ${CMAKE_CXX_COMPILER} ${OPTIMIZATION_FLAGS}\
-std=c++17 --no-exceptions --expt-relaxed-constexpr -rdc=true \

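With the wiring above, building the backend with the new "profile" cargo feature makes build.rs define USE_NVTOOLS for CMake and link nvToolsExt, and CMake in turn compiles the CUDA sources with -DUSE_NVTOOLS, which activates the NVTX markers used throughout the hunks below. As a minimal illustrative sketch (not part of this commit) of the guarded instrumentation pattern this enables, using only the standard NVTX C API:

    #ifdef USE_NVTOOLS
    #include <nvToolsExt.h>
    #endif

    // Hypothetical host-side helper: wrapped in an NVTX range only when
    // USE_NVTOOLS is defined, otherwise it compiles down to the bare body.
    void profiled_section() {
    #ifdef USE_NVTOOLS
      nvtxRangePushA("profiled section"); // shows up as a named span in Nsight Systems
    #endif
      // ... enqueue CUDA kernels or copies here ...
    #ifdef USE_NVTOOLS
      nvtxRangePop();
    #endif
    }
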
@@ -8,7 +8,7 @@ void scratch_cuda_integer_radix_cmux_kb_64(
uint32_t grouping_factor, uint32_t lwe_ciphertext_count,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory, bool allocate_ms_array) {

PUSH_RANGE("scratch cmux")
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
@@ -21,6 +21,7 @@ void scratch_cuda_integer_radix_cmux_kb_64(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_cmux_buffer<uint64_t> **)mem_ptr, predicate_lut_f,
lwe_ciphertext_count, params, allocate_gpu_memory);
POP_RANGE()
}

void cuda_cmux_integer_radix_ciphertext_kb_64(
@@ -31,20 +32,22 @@ void cuda_cmux_integer_radix_ciphertext_kb_64(
CudaRadixCiphertextFFI const *lwe_array_false, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {

PUSH_RANGE("cmux")
host_integer_radix_cmux_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out,
lwe_condition, lwe_array_true, lwe_array_false,
(int_cmux_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key);
POP_RANGE()
}

void cleanup_cuda_integer_radix_cmux(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void) {

PUSH_RANGE("cleanup cmux")
int_cmux_buffer<uint64_t> *mem_ptr =
(int_cmux_buffer<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
POP_RANGE()
}

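The instrumentation in this file follows one pattern: each C entry point pushes a named NVTX range on entry and pops it just before returning, so the scratch, compute, and cleanup phases of an operation ("scratch cmux", "cmux", "cleanup cmux" above) appear as separate named spans in the profiler timeline. The same pattern repeats in the hunks below.
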
@@ -9,7 +9,7 @@ void scratch_cuda_integer_radix_comparison_kb_64(
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
COMPARISON_TYPE op_type, bool is_signed, bool allocate_gpu_memory,
bool allocate_ms_array) {

PUSH_RANGE("scratch comparison")
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
@@ -35,6 +35,7 @@ void scratch_cuda_integer_radix_comparison_kb_64(
op_type, is_signed, allocate_gpu_memory);
break;
}
POP_RANGE()
}

void cuda_comparison_integer_radix_ciphertext_kb_64(
@@ -44,7 +45,7 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
CudaRadixCiphertextFFI const *lwe_array_2, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {

PUSH_RANGE("comparison")
if (lwe_array_1->num_radix_blocks != lwe_array_1->num_radix_blocks)
PANIC("Cuda error: input num radix blocks must be the same")
// The output ciphertext might be a boolean block or a radix ciphertext
@@ -85,16 +86,18 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
default:
PANIC("Cuda error: integer operation not supported")
}
POP_RANGE()
}

void cleanup_cuda_integer_comparison(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void) {

PUSH_RANGE("cleanup comparison")
int_comparison_buffer<uint64_t> *mem_ptr =
(int_comparison_buffer<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
POP_RANGE()
}

void scratch_cuda_integer_are_all_comparisons_block_true_kb_64(

@@ -8,7 +8,7 @@ void scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, bool allocate_gpu_memory, bool allocate_ms_array) {

PUSH_RANGE("scratch div")
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
@@ -18,6 +18,7 @@ void scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, is_signed,
(int_div_rem_memory<uint64_t> **)mem_ptr, num_blocks, params,
allocate_gpu_memory);
POP_RANGE()
}

void cuda_integer_div_rem_radix_ciphertext_kb_64(
@@ -27,20 +28,23 @@ void cuda_integer_div_rem_radix_ciphertext_kb_64(
CudaRadixCiphertextFFI const *divisor, bool is_signed, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {

PUSH_RANGE("div")
auto mem = (int_div_rem_memory<uint64_t> *)mem_ptr;

host_integer_div_rem_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, quotient, remainder,
numerator, divisor, is_signed, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key, mem);
POP_RANGE()
}

void cleanup_cuda_integer_div_rem(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count, int8_t **mem_ptr_void) {
PUSH_RANGE("cleanup div")
int_div_rem_memory<uint64_t> *mem_ptr =
(int_div_rem_memory<uint64_t> *)(*mem_ptr_void);

mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
POP_RANGE()
}

@@ -52,7 +52,7 @@ void scratch_cuda_propagate_single_carry_kb_64_inplace(
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t requested_flag,
uint32_t uses_carry, bool allocate_gpu_memory, bool allocate_ms_array) {

PUSH_RANGE("scratch propagate sc")
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
@@ -62,6 +62,7 @@ void scratch_cuda_propagate_single_carry_kb_64_inplace(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_sc_prop_memory<uint64_t> **)mem_ptr, num_blocks, params,
requested_flag, uses_carry, allocate_gpu_memory);
POP_RANGE()
}

void scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(
@@ -72,7 +73,7 @@ void scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t requested_flag,
uint32_t uses_carry, bool allocate_gpu_memory, bool allocate_ms_array) {

PUSH_RANGE("scratch add & propagate sc")
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
@@ -82,6 +83,7 @@ void scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_sc_prop_memory<uint64_t> **)mem_ptr, num_blocks, params,
requested_flag, uses_carry, allocate_gpu_memory);
POP_RANGE()
}

void scratch_cuda_integer_overflowing_sub_kb_64_inplace(
@@ -92,7 +94,7 @@ void scratch_cuda_integer_overflowing_sub_kb_64_inplace(
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t compute_overflow,
bool allocate_gpu_memory, bool allocate_ms_array) {

PUSH_RANGE("scratch overflow sub")
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
@@ -102,6 +104,7 @@ void scratch_cuda_integer_overflowing_sub_kb_64_inplace(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_borrow_prop_memory<uint64_t> **)mem_ptr, num_blocks, params,
compute_overflow, allocate_gpu_memory);
POP_RANGE()
}

void cuda_propagate_single_carry_kb_64_inplace(
@@ -140,38 +143,45 @@ void cuda_integer_overflowing_sub_kb_64_inplace(
void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t compute_overflow, uint32_t uses_input_borrow) {

PUSH_RANGE("overflow sub")
host_integer_overflowing_sub<uint64_t>(
(cudaStream_t const *)streams, gpu_indexes, gpu_count, lhs_array,
lhs_array, rhs_array, overflow_block, input_borrow,
(int_borrow_prop_memory<uint64_t> *)mem_ptr, bsks, (uint64_t **)ksks,
ms_noise_reduction_key, compute_overflow, uses_input_borrow);
POP_RANGE()
}

void cleanup_cuda_propagate_single_carry(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void) {
PUSH_RANGE("cleanup propagate sc")
int_sc_prop_memory<uint64_t> *mem_ptr =
(int_sc_prop_memory<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
POP_RANGE()
}

void cleanup_cuda_add_and_propagate_single_carry(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void) {
PUSH_RANGE("cleanup add & propagate sc")
int_sc_prop_memory<uint64_t> *mem_ptr =
(int_sc_prop_memory<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
POP_RANGE()
}
void cleanup_cuda_integer_overflowing_sub(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void) {
PUSH_RANGE("cleanup overflow sub")
int_borrow_prop_memory<uint64_t> *mem_ptr =
(int_borrow_prop_memory<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
POP_RANGE()
}

void scratch_cuda_apply_univariate_lut_kb_64(
@@ -182,7 +192,7 @@ void scratch_cuda_apply_univariate_lut_kb_64(
uint32_t grouping_factor, uint32_t num_radix_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
uint64_t lut_degree, bool allocate_gpu_memory, bool allocate_ms_array) {

PUSH_RANGE("scratch univar lut")
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
glwe_dimension * polynomial_size, lwe_dimension,
ks_level, ks_base_log, pbs_level, pbs_base_log,
@@ -194,6 +204,7 @@ void scratch_cuda_apply_univariate_lut_kb_64(
(int_radix_lut<uint64_t> **)mem_ptr,
static_cast<const uint64_t *>(input_lut), num_radix_blocks, params,
lut_degree, allocate_gpu_memory);
POP_RANGE()
}

void scratch_cuda_apply_many_univariate_lut_kb_64(
@@ -205,7 +216,7 @@ void scratch_cuda_apply_many_univariate_lut_kb_64(
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
uint32_t num_many_lut, uint64_t lut_degree, bool allocate_gpu_memory,
bool allocate_ms_array) {

PUSH_RANGE("scratch many lut")
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
glwe_dimension * polynomial_size, lwe_dimension,
ks_level, ks_base_log, pbs_level, pbs_base_log,
@@ -217,6 +228,7 @@ void scratch_cuda_apply_many_univariate_lut_kb_64(
(int_radix_lut<uint64_t> **)mem_ptr,
static_cast<const uint64_t *>(input_lut), num_radix_blocks, params,
num_many_lut, lut_degree, allocate_gpu_memory);
POP_RANGE()
}

void cuda_apply_univariate_lut_kb_64(
@@ -237,8 +249,10 @@ void cleanup_cuda_apply_univariate_lut_kb_64(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void) {
PUSH_RANGE("cleanup univar lut")
int_radix_lut<uint64_t> *mem_ptr = (int_radix_lut<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
POP_RANGE()
}

void cuda_apply_many_univariate_lut_kb_64(
@@ -263,7 +277,7 @@ void scratch_cuda_apply_bivariate_lut_kb_64(
uint32_t grouping_factor, uint32_t num_radix_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
uint64_t lut_degree, bool allocate_gpu_memory, bool allocate_ms_array) {

PUSH_RANGE("scratch bivar lut")
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
glwe_dimension * polynomial_size, lwe_dimension,
ks_level, ks_base_log, pbs_level, pbs_base_log,
@@ -275,6 +289,7 @@ void scratch_cuda_apply_bivariate_lut_kb_64(
(int_radix_lut<uint64_t> **)mem_ptr,
static_cast<const uint64_t *>(input_lut), num_radix_blocks, params,
lut_degree, allocate_gpu_memory);
POP_RANGE()
}

void cuda_apply_bivariate_lut_kb_64(
@@ -297,8 +312,10 @@ void cleanup_cuda_apply_bivariate_lut_kb_64(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void) {
PUSH_RANGE("cleanup bivar lut")
int_radix_lut<uint64_t> *mem_ptr = (int_radix_lut<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
POP_RANGE()
}

void scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(

@@ -13,6 +13,7 @@
#include "polynomial/functions.cuh"
#include "utils/helper.cuh"
#include "utils/helper_multi_gpu.cuh"
#include "utils/helper_profile.cuh"
#include "utils/kernel_dimensions.cuh"
#include <algorithm>
#include <functional>
@@ -499,6 +500,7 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
int_radix_lut<Torus> *lut, uint32_t num_radix_blocks) {
PUSH_RANGE("apply lut")
// apply_lookup_table
auto params = lut->params;
auto pbs_type = params.pbs_type;
@@ -596,6 +598,7 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
lwe_array_out->degrees[i] = lut->degrees[degrees_index];
lwe_array_out->noise_levels[i] = NoiseLevel::NOMINAL;
}
POP_RANGE()
}

template <typename Torus>
@@ -606,6 +609,7 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb(
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
int_radix_lut<Torus> *lut, uint32_t num_many_lut, uint32_t lut_stride) {
PUSH_RANGE("apply many lut")
// apply_lookup_table
auto params = lut->params;
auto pbs_type = params.pbs_type;
@@ -699,6 +703,7 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb(
lwe_array_out->degrees[i] = lut->degrees[degrees_index];
lwe_array_out->noise_levels[i] = NoiseLevel::NOMINAL;
}
POP_RANGE()
}

template <typename Torus>
@@ -710,7 +715,7 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
int_radix_lut<Torus> *lut, uint32_t num_radix_blocks, uint32_t shift) {

PUSH_RANGE("apply bivar lut")
if (lwe_array_out->lwe_dimension != lwe_array_1->lwe_dimension ||
lwe_array_out->lwe_dimension != lwe_array_2->lwe_dimension)
PANIC("Cuda error: input and output radix ciphertexts should have the same "
@@ -814,6 +819,7 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
lwe_array_out->degrees[i] = lut->degrees[degrees_index];
lwe_array_out->noise_levels[i] = NoiseLevel::NOMINAL;
}
POP_RANGE()
}

// Rotates the slice in-place such that the first mid elements of the slice move
@@ -995,7 +1001,7 @@ void generate_device_accumulator_bivariate(
uint64_t *degree, uint64_t *max_degree, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t message_modulus, uint32_t carry_modulus,
std::function<Torus(Torus, Torus)> f) {

PUSH_RANGE("gen bivar lut acc")
// host lut
Torus *h_lut =
(Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
@@ -1013,6 +1019,7 @@ void generate_device_accumulator_bivariate(

cuda_synchronize_stream(stream, gpu_index);
free(h_lut);
POP_RANGE()
}

/*
@@ -1092,11 +1099,12 @@ void generate_device_accumulator(cudaStream_t stream, uint32_t gpu_index,
uint32_t message_modulus,
uint32_t carry_modulus,
std::function<Torus(Torus)> f) {

PUSH_RANGE("gen lut acc")
generate_device_accumulator_with_encoding(
stream, gpu_index, acc, degree, max_degree, glwe_dimension,
polynomial_size, message_modulus, carry_modulus, message_modulus,
carry_modulus, f);
POP_RANGE()
}

/*
@@ -1112,7 +1120,7 @@ void generate_many_lut_device_accumulator(
uint64_t *max_degree, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t message_modulus, uint32_t carry_modulus,
std::vector<std::function<Torus(Torus)>> &functions) {

PUSH_RANGE("gen many lut acc")
// host lut
Torus *h_lut =
(Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
@@ -1129,6 +1137,7 @@ void generate_many_lut_device_accumulator(

cuda_synchronize_stream(stream, gpu_index);
free(h_lut);
POP_RANGE()
}

// This function is used to perform step 1 of Thomas' new carry propagation
@@ -1803,6 +1812,7 @@ void host_propagate_single_carry(
void *const *bsks, Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t requested_flag, uint32_t uses_carry) {
PUSH_RANGE("propagate sc")
auto num_radix_blocks = lwe_array->num_radix_blocks;
auto params = mem->params;
auto glwe_dimension = params.glwe_dimension;
@@ -1891,6 +1901,7 @@ void host_propagate_single_carry(
streams, gpu_indexes, gpu_count, lwe_array, prepared_blocks, bsks, ksks,
ms_noise_reduction_key, message_extract, num_radix_blocks);
}
POP_RANGE()
}

// This function perform the three steps of Thomas' new carry propagation
@@ -1904,6 +1915,7 @@ void host_add_and_propagate_single_carry(
void *const *bsks, Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t requested_flag, uint32_t uses_carry) {
PUSH_RANGE("add & propagate sc")
if (lhs_array->num_radix_blocks != rhs_array->num_radix_blocks)
PANIC("Cuda error: input and output num radix blocks must be the same")
if (lhs_array->lwe_dimension != rhs_array->lwe_dimension ||
@@ -2026,6 +2038,7 @@ void host_add_and_propagate_single_carry(
streams, gpu_indexes, gpu_count, lhs_array, prepared_blocks, bsks, ksks,
ms_noise_reduction_key, mem->lut_message_extract, num_radix_blocks);
}
POP_RANGE()
}

template <typename Torus>

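The template-level helpers instrumented here ("apply lut", "apply many lut", "apply bivar lut", "gen lut acc", "propagate sc", ...) run inside the entry points instrumented earlier, and NVTX ranges nest, so a capture shows, for example, a "cmux" span containing the "apply lut" spans it triggers.
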
@@ -73,7 +73,7 @@ void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
uint32_t pbs_level, uint32_t ks_base_log, uint32_t ks_level,
uint32_t grouping_factor, uint32_t num_radix_blocks, PBS_TYPE pbs_type,
bool allocate_gpu_memory, bool allocate_ms_array) {

PUSH_RANGE("scratch mul")
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
polynomial_size * glwe_dimension, lwe_dimension,
ks_level, ks_base_log, pbs_level, pbs_base_log,
@@ -97,6 +97,7 @@ void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
"Supported N's are powers of two in the interval [256..16384].")
}
POP_RANGE()
}

/*
@@ -134,7 +135,7 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
int8_t *mem_ptr, uint32_t polynomial_size, uint32_t num_blocks) {

PUSH_RANGE("mul")
switch (polynomial_size) {
case 256:
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<256>>(
@@ -189,16 +190,18 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
"Supported N's are powers of two in the interval [256..16384].")
}
POP_RANGE()
}

void cleanup_cuda_integer_mult(void *const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void) {

PUSH_RANGE("cleanup mul")
int_mul_memory<uint64_t> *mem_ptr =
(int_mul_memory<uint64_t> *)(*mem_ptr_void);

mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
POP_RANGE()
}

void scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(

@@ -116,10 +116,11 @@ __host__ void scratch_cuda_integer_overflowing_sub_kb(
uint32_t gpu_count, int_overflowing_sub_memory<Torus> **mem_ptr,
uint32_t num_blocks, int_radix_params params, bool allocate_gpu_memory,
bool allocate_ms_array) {

PUSH_RANGE("scratch overflowing sub")
*mem_ptr = new int_overflowing_sub_memory<Torus>(
streams, gpu_indexes, gpu_count, params, num_blocks, allocate_gpu_memory,
allocate_ms_array);
POP_RANGE()
}

template <typename Torus>
@@ -134,7 +135,7 @@ __host__ void host_integer_overflowing_sub(
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t compute_overflow, uint32_t uses_input_borrow) {

PUSH_RANGE("overflowing sub")
if (output->num_radix_blocks != input_left->num_radix_blocks ||
output->num_radix_blocks != input_right->num_radix_blocks)
PANIC("Cuda error: lwe_array_in and output num radix blocks must be "
@@ -165,6 +166,7 @@ __host__ void host_integer_overflowing_sub(
streams, gpu_indexes, gpu_count, output, overflow_block, input_borrow,
(int_borrow_prop_memory<Torus> *)mem_ptr, bsks, (Torus **)(ksks),
ms_noise_reduction_key, num_groups, compute_overflow, uses_input_borrow);
POP_RANGE()
}

#endif

@@ -4,6 +4,7 @@
#include "device.h"
#include "integer/integer.h"
#include "integer/radix_ciphertext.h"
#include "utils/helper_profile.cuh"
#include "utils/kernel_dimensions.cuh"

template <typename Torus>
@@ -12,6 +13,7 @@ void create_zero_radix_ciphertext_async(cudaStream_t const stream,
CudaRadixCiphertextFFI *radix,
const uint32_t num_radix_blocks,
const uint32_t lwe_dimension) {
PUSH_RANGE("create zero radix ct");
radix->lwe_dimension = lwe_dimension;
radix->num_radix_blocks = num_radix_blocks;
radix->max_num_radix_blocks = num_radix_blocks;
@@ -25,6 +27,7 @@ void create_zero_radix_ciphertext_async(cudaStream_t const stream,
if (radix->degrees == NULL || radix->noise_levels == NULL) {
PANIC("Cuda error: degrees / noise levels not allocated correctly")
}
POP_RANGE();
}

template <typename Torus>
@@ -73,6 +76,7 @@ void copy_radix_ciphertext_slice_async(
const uint32_t output_end_lwe_index,
const CudaRadixCiphertextFFI *input_radix,
const uint32_t input_start_lwe_index, const uint32_t input_end_lwe_index) {
PUSH_RANGE("copy radix slice");
if (output_radix->lwe_dimension != input_radix->lwe_dimension)
PANIC("Cuda error: input lwe dimension should be equal to output lwe "
"dimension")
@@ -116,6 +120,7 @@ void copy_radix_ciphertext_slice_async(
output_radix->noise_levels[i + output_start_lwe_index] =
input_radix->noise_levels[i + input_start_lwe_index];
}
POP_RANGE();
}

template <typename Torus>

backends/tfhe-cuda-backend/cuda/src/utils/helper_profile.cu (new file, 42 lines)
@@ -0,0 +1,42 @@
#include "helper_profile.cuh"

uint32_t adler32(const unsigned char *data) {
  const uint32_t MOD_ADLER = 65521;
  uint32_t a = 1, b = 0;
  size_t index;
  for (index = 0; data[index] != 0; ++index) {
    a = (a + data[index] * 2) % MOD_ADLER;
    b = (b + a) % MOD_ADLER;
  }
  return (b << 16) | a;
}

void cuda_nvtx_label_with_color(const char *name) {
#ifdef USE_NVTOOLS
  int color_id = adler32((const unsigned char *)name);
  int r, g, b;
  r = color_id & 0x000000ff;
  g = (color_id & 0x000ff000) >> 12;
  b = (color_id & 0x0ff00000) >> 20;
  if (r < 64 & g < 64 & b < 64) {
    r = r * 3;
    g = g * 3 + 64;
    b = b * 4;
  }

  color_id = 0xff000000 | (r << 16) | (g << 8) | (b);
  nvtxEventAttributes_t eventAttrib = {0};
  eventAttrib.version = NVTX_VERSION;
  eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
  eventAttrib.colorType = NVTX_COLOR_ARGB;
  eventAttrib.color = color_id;
  eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
  eventAttrib.message.ascii = name;
  nvtxRangePushEx(&eventAttrib);
#endif
}
void cuda_nvtx_pop() {
#ifdef USE_NVTOOLS
  nvtxRangePop();
#endif
}

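A note on the color helper above: the Adler-32-style checksum of the range name is used only to derive a stable per-label ARGB color (with dark results brightened by the r/g/b < 64 branch), so a given label keeps the same color across runs. The loop walks the name as a NUL-terminated string, and the "&" in the brightness test is a bitwise AND of the 0/1 comparison results; it behaves like "&&" here, though the logical operator would be the more conventional spelling.
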
backends/tfhe-cuda-backend/cuda/src/utils/helper_profile.cuh (new file, 13 lines)
@@ -0,0 +1,13 @@
#ifndef HELPER_PROFILE
#define HELPER_PROFILE
#include <nvToolsExt.h>

void cuda_nvtx_label_with_color(const char *name);
void cuda_nvtx_pop();

#define PUSH_RANGE(name) \
  { cuda_nvtx_label_with_color(name); }
#define POP_RANGE() \
  { cuda_nvtx_pop(); }

#endif

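Taken together, the header exposes two macros that any CUDA source in the backend can use. A minimal usage sketch (the function name is hypothetical, not part of this commit):

    #include "utils/helper_profile.cuh"

    // Wrap a host-side section in a named, colored NVTX range. Ranges nest and
    // each PUSH_RANGE must be matched by a POP_RANGE on the same thread.
    void profile_example_step() {
      PUSH_RANGE("example step")
      // ... launch kernels or enqueue copies here ...
      POP_RANGE()
    }

When the profile feature is disabled, USE_NVTOOLS is not defined and cuda_nvtx_label_with_color / cuda_nvtx_pop compile to empty functions, so the macros add essentially no overhead.
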
@@ -91,6 +91,7 @@ strings = ["integer"]
internal-keycache = ["dep:fs2"]
gpu = ["dep:tfhe-cuda-backend", "shortint"]
gpu-experimental-multi-arch = ["gpu", "tfhe-cuda-backend/experimental-multi-arch"]
gpu-profile = ["gpu", "tfhe-cuda-backend/profile"]
zk-pok = ["dep:tfhe-zk-pok"]

# Adds more FheUint/FheInt types to the HL

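On the Rust side, the new gpu-profile feature simply forwards to the backend's profile feature, so a profiling build can be produced with, for example, cargo build --features=gpu-profile (on top of whatever other features the target needs); the named ranges then appear when the resulting binary is captured with NVIDIA Nsight Systems (nsys profile).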