chore(gpu): multi-gpu debug target

Author: Andrei Stoian
Date: 2025-09-11 15:40:36 +02:00
Committed by: Andrei Stoian
Parent: e523fd2cb6
Commit: 0604d237eb
11 changed files with 399 additions and 102 deletions


@@ -20,3 +20,4 @@ bindgen = "0.71"
experimental-multi-arch = []
profile = []
debug = []
debug-fake-multi-gpu = []


@@ -54,6 +54,10 @@ fn main() {
if cfg!(feature = "debug") {
cmake_config.define("CMAKE_BUILD_TYPE", "Debug");
} else if cfg!(feature = "debug-fake-multi-gpu") {
cmake_config.define("CMAKE_BUILD_TYPE", "DebugOnlyCpu");
cmake_config.define("CMAKE_VERBOSE_MAKEFILE", "ON");
cmake_config.define("FAKE_MULTI_GPU", "ON");
}
// Build the CMake project
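Note that the debug feature takes precedence over debug-fake-multi-gpu because of the if / else if chain, and that the new branch also enables verbose makefiles. The mapping to preprocessor symbols is set up in CMakeLists.txt below: only the Debug build type defines DEBUG, while FAKE_MULTI_GPU=ON defines DEBUG_FAKE_MULTI_GPU. As a purely illustrative sketch (the helper below is hypothetical and not part of this commit), a backend translation unit could report which of the three modes it was compiled in like this:

#include <cstdio>

// Hypothetical helper, not in this commit: it only relies on the DEBUG and
// DEBUG_FAKE_MULTI_GPU definitions added by the CMake configuration below.
inline void report_backend_build_mode() {
#if defined(DEBUG_FAKE_MULTI_GPU)
  // cargo feature debug-fake-multi-gpu: CMAKE_BUILD_TYPE=DebugOnlyCpu, FAKE_MULTI_GPU=ON
  std::printf("cuda backend: fake multi-GPU debug build\n");
#elif defined(DEBUG)
  // cargo feature debug: CMAKE_BUILD_TYPE=Debug
  std::printf("cuda backend: full debug build\n");
#else
  std::printf("cuda backend: release build\n");
#endif
}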


@@ -87,6 +87,9 @@ if(CMAKE_BUILD_TYPE_LOWERCASE STREQUAL "debug")
add_definitions(-DDEBUG)
set(OPTIMIZATION_FLAGS "${OPTIMIZATION_FLAGS} -O0 -G -g")
set(USE_NVTOOLS 1)
elseif(CMAKE_BUILD_TYPE_LOWERCASE STREQUAL "debugonlycpu")
message("Compiling GPU kernels in Release and CPU code in Debug")
set(OPTIMIZATION_FLAGS "${OPTIMIZATION_FLAGS} -O0 -g")
else()
# Release mode
message("Compiling in Release mode")
@@ -99,6 +102,11 @@ if(${USE_NVTOOLS})
add_definitions(-DUSE_NVTOOLS)
endif()
if(${FAKE_MULTI_GPU})
message(STATUS "Fake multi-gpu debugging is enabled")
add_definitions(-DDEBUG_FAKE_MULTI_GPU)
endif()
# in production, should use -arch=sm_70 --ptxas-options=-v to see register spills -lineinfo for better debugging to use
# nvtx when profiling -lnvToolsExt
set(CMAKE_CUDA_FLAGS

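The DEBUG_FAKE_MULTI_GPU definition is what lets a single-GPU machine exercise the multi-GPU code paths. A minimal sketch of the idea, with hypothetical helper names (the backend's actual device-mapping code is not part of this diff): several virtual GPUs are exposed, all backed by physical device 0, so streams created for different virtual GPUs end up sharing one gpu_index.

#include <cuda_runtime.h>

// Illustrative sketch only; both helpers are hypothetical. Under
// DEBUG_FAKE_MULTI_GPU several "virtual" GPUs are reported, all mapping to
// physical device 0, so multi-GPU scheduling and broadcast logic can be
// debugged on a single-GPU machine.
constexpr int FAKE_GPU_COUNT = 4; // arbitrary value for the sketch

inline int visible_gpu_count() {
#ifdef DEBUG_FAKE_MULTI_GPU
  return FAKE_GPU_COUNT; // pretend several GPUs are present
#else
  int count = 0;
  cudaGetDeviceCount(&count);
  return count;
#endif
}

inline int physical_gpu_index(int virtual_index) {
#ifdef DEBUG_FAKE_MULTI_GPU
  (void)virtual_index;
  return 0; // every virtual GPU is backed by the same physical device
#else
  return virtual_index;
#endif
}

This is also why the change below drops the assumption that distinct stream indices imply distinct GPUs.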

@@ -728,23 +728,25 @@ template <typename Torus> struct int_radix_lut {
"Broadcasting LUTs can only be done to the LUT streams or to new "
"streams that reside on the same GPUs as the source LUTs");
if (new_active_streams.gpu_index(i) !=
new_active_streams.gpu_index(0)) {
cuda_stream_wait_event(new_active_streams.stream(i), event_broadcast,
new_active_streams.gpu_index(i));
if (broadcast_lut_values) {
auto dst_lut = lut_vec[i];
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
dst_lut, src_lut, num_luts * lut_size * sizeof(Torus),
new_active_streams.stream(i), new_active_streams.gpu_index(i),
gpu_memory_allocated);
}
auto dst_lut_indexes = lut_indexes_vec[i];
#ifndef DEBUG_FAKE_MULTI_GPU
if (new_active_streams.gpu_index(i) == new_active_streams.gpu_index(0))
continue;
#endif
cuda_stream_wait_event(new_active_streams.stream(i), event_broadcast,
new_active_streams.gpu_index(i));
if (broadcast_lut_values) {
auto dst_lut = lut_vec[i];
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
dst_lut_indexes, src_lut_indexes, num_blocks * sizeof(Torus),
dst_lut, src_lut, num_luts * lut_size * sizeof(Torus),
new_active_streams.stream(i), new_active_streams.gpu_index(i),
gpu_memory_allocated);
}
auto dst_lut_indexes = lut_indexes_vec[i];
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
dst_lut_indexes, src_lut_indexes, num_blocks * sizeof(Torus),
new_active_streams.stream(i), new_active_streams.gpu_index(i),
gpu_memory_allocated);
}
// Ensure the device set at the end of this method is the same as it was
// set at the beginning
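For readability, the loop body after this change is reproduced below as a sketch (reconstructed from the hunk above; the enclosing loop over the destination streams and the surrounding error handling are not shown in the diff). Previously the event wait and the two device-to-device copies ran only when the destination stream lived on a different GPU than the source; with DEBUG_FAKE_MULTI_GPU the same-GPU early exit is compiled out, so the copies are issued for every destination stream even when all streams share one physical device, which exercises the multi-GPU broadcast path on a single GPU.

      // Sketch of the new loop body; i indexes the destination stream in the
      // enclosing loop (loop header not shown in the hunk).
#ifndef DEBUG_FAKE_MULTI_GPU
      // Real multi-GPU build: nothing to copy to a stream that already lives
      // on the source GPU.
      if (new_active_streams.gpu_index(i) == new_active_streams.gpu_index(0))
        continue;
#endif
      // Make the destination stream wait until the source LUTs are ready.
      cuda_stream_wait_event(new_active_streams.stream(i), event_broadcast,
                             new_active_streams.gpu_index(i));
      if (broadcast_lut_values) {
        auto dst_lut = lut_vec[i];
        cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
            dst_lut, src_lut, num_luts * lut_size * sizeof(Torus),
            new_active_streams.stream(i), new_active_streams.gpu_index(i),
            gpu_memory_allocated);
      }
      // The LUT indexes are broadcast unconditionally, whether or not the
      // LUT values themselves are.
      auto dst_lut_indexes = lut_indexes_vec[i];
      cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
          dst_lut_indexes, src_lut_indexes, num_blocks * sizeof(Torus),
          new_active_streams.stream(i), new_active_streams.gpu_index(i),
          gpu_memory_allocated);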