mirror of
https://github.com/zama-ai/tfhe-rs.git
synced 2026-01-09 14:47:56 -05:00
chore(gpu): multi-gpu debug target
This commit is contained in:
committed by
Andrei Stoian
parent
e523fd2cb6
commit
0604d237eb
@@ -20,3 +20,4 @@ bindgen = "0.71"
|
||||
experimental-multi-arch = []
|
||||
profile = []
|
||||
debug = []
|
||||
debug-fake-multi-gpu = []
|
||||
|
||||
@@ -54,6 +54,10 @@ fn main() {
|
||||
|
||||
if cfg!(feature = "debug") {
|
||||
cmake_config.define("CMAKE_BUILD_TYPE", "Debug");
|
||||
} else if cfg!(feature = "debug-fake-multi-gpu") {
|
||||
cmake_config.define("CMAKE_BUILD_TYPE", "DebugOnlyCpu");
|
||||
cmake_config.define("CMAKE_VERBOSE_MAKEFILE", "ON");
|
||||
cmake_config.define("FAKE_MULTI_GPU", "ON");
|
||||
}
|
||||
|
||||
// Build the CMake project
|
||||
|
||||
@@ -87,6 +87,9 @@ if(CMAKE_BUILD_TYPE_LOWERCASE STREQUAL "debug")
|
||||
add_definitions(-DDEBUG)
|
||||
set(OPTIMIZATION_FLAGS "${OPTIMIZATION_FLAGS} -O0 -G -g")
|
||||
set(USE_NVTOOLS 1)
|
||||
elseif(CMAKE_BUILD_TYPE_LOWERCASE STREQUAL "debugonlycpu")
|
||||
message("Compiling GPU kernels in Release and CPU code in Debug")
|
||||
set(OPTIMIZATION_FLAGS "${OPTIMIZATION_FLAGS} -O0 -g")
|
||||
else()
|
||||
# Release mode
|
||||
message("Compiling in Release mode")
|
||||
@@ -99,6 +102,11 @@ if(${USE_NVTOOLS})
|
||||
add_definitions(-DUSE_NVTOOLS)
|
||||
endif()
|
||||
|
||||
# When FAKE_MULTI_GPU is set, print a status message and pass
# -DDEBUG_FAKE_MULTI_GPU to the compiler so sources can test that macro.
if(${FAKE_MULTI_GPU})
|
||||
message(STATUS "Fake multi-gpu debugging is enabled")
|
||||
add_definitions(-DDEBUG_FAKE_MULTI_GPU)
|
||||
endif()
|
||||
|
||||
# In production, use -arch=sm_70. Pass --ptxas-options=-v to see register spills and -lineinfo for better debugging.
|
||||
# To use nvtx when profiling, link with -lnvToolsExt.
|
||||
set(CMAKE_CUDA_FLAGS
|
||||
|
||||
@@ -728,23 +728,25 @@ template <typename Torus> struct int_radix_lut {
|
||||
"Broadcasting LUTs can only be done to the LUT streams or to new "
|
||||
"streams that reside on the same GPUs as the source LUTs");
|
||||
|
||||
if (new_active_streams.gpu_index(i) !=
|
||||
new_active_streams.gpu_index(0)) {
|
||||
cuda_stream_wait_event(new_active_streams.stream(i), event_broadcast,
|
||||
new_active_streams.gpu_index(i));
|
||||
if (broadcast_lut_values) {
|
||||
auto dst_lut = lut_vec[i];
|
||||
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
|
||||
dst_lut, src_lut, num_luts * lut_size * sizeof(Torus),
|
||||
new_active_streams.stream(i), new_active_streams.gpu_index(i),
|
||||
gpu_memory_allocated);
|
||||
}
|
||||
auto dst_lut_indexes = lut_indexes_vec[i];
|
||||
#ifndef DEBUG_FAKE_MULTI_GPU
|
||||
if (new_active_streams.gpu_index(i) == new_active_streams.gpu_index(0))
|
||||
continue;
|
||||
#endif
|
||||
|
||||
cuda_stream_wait_event(new_active_streams.stream(i), event_broadcast,
|
||||
new_active_streams.gpu_index(i));
|
||||
if (broadcast_lut_values) {
|
||||
auto dst_lut = lut_vec[i];
|
||||
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
|
||||
dst_lut_indexes, src_lut_indexes, num_blocks * sizeof(Torus),
|
||||
dst_lut, src_lut, num_luts * lut_size * sizeof(Torus),
|
||||
new_active_streams.stream(i), new_active_streams.gpu_index(i),
|
||||
gpu_memory_allocated);
|
||||
}
|
||||
auto dst_lut_indexes = lut_indexes_vec[i];
|
||||
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
|
||||
dst_lut_indexes, src_lut_indexes, num_blocks * sizeof(Torus),
|
||||
new_active_streams.stream(i), new_active_streams.gpu_index(i),
|
||||
gpu_memory_allocated);
|
||||
}
|
||||
// Ensure the device set at the end of this method is the same as it was
|
||||
// set at the beginning
|
||||
|
||||
Reference in New Issue
Block a user