chore(gpu): multi-gpu debug target

Author: Andrei Stoian
Date: 2025-09-11 15:40:36 +02:00
Committed by: Andrei Stoian
Parent: e523fd2cb6
Commit: 0604d237eb
11 changed files with 399 additions and 102 deletions


@@ -20,3 +20,4 @@ bindgen = "0.71"
experimental-multi-arch = []
profile = []
debug = []
debug-fake-multi-gpu = []


@@ -54,6 +54,10 @@ fn main() {
if cfg!(feature = "debug") {
cmake_config.define("CMAKE_BUILD_TYPE", "Debug");
} else if cfg!(feature = "debug-fake-multi-gpu") {
cmake_config.define("CMAKE_BUILD_TYPE", "DebugOnlyCpu");
cmake_config.define("CMAKE_VERBOSE_MAKEFILE", "ON");
cmake_config.define("FAKE_MULTI_GPU", "ON");
}
// Build the CMake project
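Note that the debug feature takes precedence over debug-fake-multi-gpu because of the if / else if chain, and that the new branch also enables verbose makefiles. The mapping to preprocessor symbols is set up in CMakeLists.txt below: only the Debug build type defines DEBUG, while FAKE_MULTI_GPU=ON defines DEBUG_FAKE_MULTI_GPU. As a purely illustrative sketch (the helper below is hypothetical and not part of this commit), a backend translation unit could report which of the three modes it was compiled in like this:

#include <cstdio>

// Hypothetical helper, not in this commit: it only relies on the DEBUG and
// DEBUG_FAKE_MULTI_GPU definitions added by the CMake configuration below.
inline void report_backend_build_mode() {
#if defined(DEBUG_FAKE_MULTI_GPU)
  // cargo feature debug-fake-multi-gpu: CMAKE_BUILD_TYPE=DebugOnlyCpu, FAKE_MULTI_GPU=ON
  std::printf("cuda backend: fake multi-GPU debug build\n");
#elif defined(DEBUG)
  // cargo feature debug: CMAKE_BUILD_TYPE=Debug
  std::printf("cuda backend: full debug build\n");
#else
  std::printf("cuda backend: release build\n");
#endif
}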


@@ -87,6 +87,9 @@ if(CMAKE_BUILD_TYPE_LOWERCASE STREQUAL "debug")
add_definitions(-DDEBUG)
set(OPTIMIZATION_FLAGS "${OPTIMIZATION_FLAGS} -O0 -G -g")
set(USE_NVTOOLS 1)
elseif(CMAKE_BUILD_TYPE_LOWERCASE STREQUAL "debugonlycpu")
message("Compiling GPU kernels in Release and CPU code in Debug")
set(OPTIMIZATION_FLAGS "${OPTIMIZATION_FLAGS} -O0 -g")
else()
# Release mode
message("Compiling in Release mode")
@@ -99,6 +102,11 @@ if(${USE_NVTOOLS})
add_definitions(-DUSE_NVTOOLS)
endif()
if(${FAKE_MULTI_GPU})
message(STATUS "Fake multi-gpu debugging is enabled")
add_definitions(-DDEBUG_FAKE_MULTI_GPU)
endif()
# in production, should use -arch=sm_70 --ptxas-options=-v to see register spills -lineinfo for better debugging to use
# nvtx when profiling -lnvToolsExt
set(CMAKE_CUDA_FLAGS

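The DEBUG_FAKE_MULTI_GPU definition is what lets a single-GPU machine exercise the multi-GPU code paths. A minimal sketch of the idea, with hypothetical helper names (the backend's actual device-mapping code is not part of this diff): several virtual GPUs are exposed, all backed by physical device 0, so streams created for different virtual GPUs end up sharing one gpu_index.

#include <cuda_runtime.h>

// Illustrative sketch only; both helpers are hypothetical. Under
// DEBUG_FAKE_MULTI_GPU several "virtual" GPUs are reported, all mapping to
// physical device 0, so multi-GPU scheduling and broadcast logic can be
// debugged on a single-GPU machine.
constexpr int FAKE_GPU_COUNT = 4; // arbitrary value for the sketch

inline int visible_gpu_count() {
#ifdef DEBUG_FAKE_MULTI_GPU
  return FAKE_GPU_COUNT; // pretend several GPUs are present
#else
  int count = 0;
  cudaGetDeviceCount(&count);
  return count;
#endif
}

inline int physical_gpu_index(int virtual_index) {
#ifdef DEBUG_FAKE_MULTI_GPU
  (void)virtual_index;
  return 0; // every virtual GPU is backed by the same physical device
#else
  return virtual_index;
#endif
}

This is also why the change below drops the assumption that distinct stream indices imply distinct GPUs.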

@@ -728,23 +728,25 @@ template <typename Torus> struct int_radix_lut {
"Broadcasting LUTs can only be done to the LUT streams or to new "
"streams that reside on the same GPUs as the source LUTs");
if (new_active_streams.gpu_index(i) !=
new_active_streams.gpu_index(0)) {
cuda_stream_wait_event(new_active_streams.stream(i), event_broadcast,
new_active_streams.gpu_index(i));
if (broadcast_lut_values) {
auto dst_lut = lut_vec[i];
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
dst_lut, src_lut, num_luts * lut_size * sizeof(Torus),
new_active_streams.stream(i), new_active_streams.gpu_index(i),
gpu_memory_allocated);
}
auto dst_lut_indexes = lut_indexes_vec[i];
#ifndef DEBUG_FAKE_MULTI_GPU
if (new_active_streams.gpu_index(i) == new_active_streams.gpu_index(0))
continue;
#endif
cuda_stream_wait_event(new_active_streams.stream(i), event_broadcast,
new_active_streams.gpu_index(i));
if (broadcast_lut_values) {
auto dst_lut = lut_vec[i];
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
dst_lut_indexes, src_lut_indexes, num_blocks * sizeof(Torus),
dst_lut, src_lut, num_luts * lut_size * sizeof(Torus),
new_active_streams.stream(i), new_active_streams.gpu_index(i),
gpu_memory_allocated);
}
auto dst_lut_indexes = lut_indexes_vec[i];
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
dst_lut_indexes, src_lut_indexes, num_blocks * sizeof(Torus),
new_active_streams.stream(i), new_active_streams.gpu_index(i),
gpu_memory_allocated);
}
// Ensure the device set at the end of this method is the same as it was
// set at the beginning
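For readability, the loop body after this change is reproduced below as a sketch (reconstructed from the hunk above; the enclosing loop over the destination streams and the surrounding error handling are not shown in the diff). Previously the event wait and the two device-to-device copies ran only when the destination stream lived on a different GPU than the source; with DEBUG_FAKE_MULTI_GPU the same-GPU early exit is compiled out, so the copies are issued for every destination stream even when all streams share one physical device, which exercises the multi-GPU broadcast path on a single GPU.

      // Sketch of the new loop body; i indexes the destination stream in the
      // enclosing loop (loop header not shown in the hunk).
#ifndef DEBUG_FAKE_MULTI_GPU
      // Real multi-GPU build: nothing to copy to a stream that already lives
      // on the source GPU.
      if (new_active_streams.gpu_index(i) == new_active_streams.gpu_index(0))
        continue;
#endif
      // Make the destination stream wait until the source LUTs are ready.
      cuda_stream_wait_event(new_active_streams.stream(i), event_broadcast,
                             new_active_streams.gpu_index(i));
      if (broadcast_lut_values) {
        auto dst_lut = lut_vec[i];
        cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
            dst_lut, src_lut, num_luts * lut_size * sizeof(Torus),
            new_active_streams.stream(i), new_active_streams.gpu_index(i),
            gpu_memory_allocated);
      }
      // The LUT indexes are broadcast unconditionally, whether or not the
      // LUT values themselves are.
      auto dst_lut_indexes = lut_indexes_vec[i];
      cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
          dst_lut_indexes, src_lut_indexes, num_blocks * sizeof(Torus),
          new_active_streams.stream(i), new_active_streams.gpu_index(i),
          gpu_memory_allocated);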