Compare commits

...

5 Commits

Author          SHA1         Message                       Date
Andrei Stoian   70fa68bf52   fix: max streams 8            2025-09-18 22:09:30 +02:00
Andrei Stoian   7faafd6602   fix(gpu): deadlock            2025-09-18 19:17:25 +02:00
Andrei Stoian   8c55f6b8d7   fix: rebase                   2025-09-18 14:47:20 +02:00
Andrei Stoian   f6b1929a8d   feat(gpu): stream pools      2025-09-18 13:59:00 +02:00
Andrei Stoian   87c0d646a4   fix(gpu): coprocessor bench   2025-09-18 13:56:55 +02:00
8 changed files with 176 additions and 42 deletions

View File

@@ -3,6 +3,22 @@ name: coprocessor-benchmark-gpu
on:
workflow_dispatch:
inputs:
profile:
description: "Instance type"
required: true
type: choice
options:
- "l40 (n3-L40x1)"
- "4-l40 (n3-L40x4)"
- "single-h100 (n3-H100x1)"
- "2-h100 (n3-H100x2)"
- "4-h100 (n3-H100x4)"
- "multi-h100 (n3-H100x8)"
- "multi-h100-nvlink (n3-H100x8-NVLink)"
- "multi-h100-sxm5 (n3-H100x8-SXM5)"
- "multi-h100-sxm5_fallback (n3-H100x8-SXM5)"
schedule:
# Weekly tests @ 1AM
- cron: "0 1 * * 6"
@@ -17,7 +33,9 @@ env:
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
PROFILE: "multi-h100-sxm5 (n3-H100x8-SXM5)"
PROFILE_SCHEDULED_RUN: "multi-h100-sxm5 (n3-H100x8-SXM5)"
PROFILE_MANUAL_RUN: ${{ inputs.profile }}
IS_MANUAL_RUN: ${{ github.event_name == 'workflow_dispatch' }}
BENCHMARK_TYPE: "ALL"
OPTIMIZATION_TARGET: "throughput"
BATCH_SIZE: "5000"
@@ -40,15 +58,25 @@ jobs:
- name: Parse profile
id: parse_profile
run: |
if [[ ${IS_MANUAL_RUN} == true ]]; then
PROFILE_RAW="${PROFILE_MANUAL_RUN}"
else
PROFILE_RAW="${PROFILE_SCHEDULED_RUN}"
fi
# shellcheck disable=SC2001
PROFILE_VAL=$(echo "${PROFILE}" | sed 's|\(.*\)[[:space:]](.*)|\1|')
PROFILE_VAL=$(echo "${PROFILE_RAW}" | sed 's|\(.*\)[[:space:]](.*)|\1|')
echo "profile=$PROFILE_VAL" >> "${GITHUB_OUTPUT}"
- name: Parse hardware name
id: parse_hardware_name
run: |
if [[ ${IS_MANUAL_RUN} == true ]]; then
PROFILE_RAW="${PROFILE_MANUAL_RUN}"
else
PROFILE_RAW="${PROFILE}"
fi
# shellcheck disable=SC2001
PROFILE_VAL=$(echo "${PROFILE}" | sed 's|.*[[:space:]](\(.*\))|\1|')
PROFILE_VAL=$(echo "${PROFILE_RAW}" | sed 's|.*[[:space:]](\(.*\))|\1|')
echo "name=$PROFILE_VAL" >> "${GITHUB_OUTPUT}"
setup-instance:
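
The two parse steps above split a profile string of the form `name (flavor)` into its halves with sed capture groups. Purely as an illustration (not part of the PR), the same split in C++ with std::regex:

```cpp
// Illustration only: mirrors the two sed expressions
//   sed 's|\(.*\)[[:space:]](.*)|\1|'   -> text before " (...)"
//   sed 's|.*[[:space:]](\(.*\))|\1|'   -> text inside "(...)"
#include <iostream>
#include <regex>
#include <string>

int main() {
  const std::string raw = "multi-h100-sxm5 (n3-H100x8-SXM5)";
  const std::regex pattern(R"((.*)\s\((.*)\))");
  std::smatch m;
  if (std::regex_match(raw, m, pattern)) {
    std::cout << "profile=" << m[1] << "\n"; // multi-h100-sxm5
    std::cout << "name=" << m[2] << "\n";    // n3-H100x8-SXM5
  }
}
```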
@@ -130,6 +158,13 @@ jobs:
} >> "${GITHUB_ENV}"
working-directory: tfhe-rs/
- name: Setup Hyperstack dependencies
uses: ./tfhe-rs/.github/actions/gpu_setup
with:
cuda-version: ${{ matrix.cuda }}
gcc-version: ${{ matrix.gcc }}
github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
- name: Check fhEVM and TFHE-rs repos
run: |
pwd
@@ -140,13 +175,6 @@ jobs:
run: git lfs checkout
working-directory: fhevm/
- name: Setup Hyperstack dependencies
uses: ./fhevm/.github/actions/gpu_setup
with:
cuda-version: ${{ matrix.cuda }}
gcc-version: ${{ matrix.gcc }}
github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
- name: Install rust
uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
@@ -154,7 +182,7 @@ jobs:
- name: Install cargo dependencies
run: |
sudo apt-get install -y protobuf-compiler cmake pkg-config libssl-dev \
sudo apt-get install -y protobuf-compiler pkg-config libssl-dev \
libclang-dev docker-compose-v2 docker.io acl
sudo usermod -aG docker "$USER"
newgrp docker
@@ -181,9 +209,16 @@ jobs:
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Login to Chainguard Registry
uses: docker/login-action@9780b0c442fbb1117ed29e0efdff1e18412f7567 # v3.3.0
with:
registry: cgr.dev
username: ${{ secrets.CGR_USERNAME }}
password: ${{ secrets.CGR_PASSWORD }}
- name: Init database
run: make init_db
working-directory: fhevm/coprocessor/fhevm-engine/coprocessor
working-directory: fhevm/coprocessor/fhevm-engine/tfhe-worker
- name: Use Node.js
uses: actions/setup-node@a0853c24544627f65ddf259abe73b1d18a591444 # v5.0.0
@@ -203,8 +238,12 @@ jobs:
- name: Profile erc20 no-cmux benchmark on GPU
run: |
BENCHMARK_BATCH_SIZE="${BATCH_SIZE}" FHEVM_DF_SCHEDULE="${SCHEDULING_POLICY}" BENCHMARK_TYPE="LATENCY" OPTIMIZATION_TARGET="${OPTIMIZATION_TARGET}" make -e "profile_erc20_gpu"
working-directory: fhevm/coprocessor/fhevm-engine/coprocessor
BENCHMARK_BATCH_SIZE="${BATCH_SIZE}" \
FHEVM_DF_SCHEDULE="${SCHEDULING_POLICY}" \
BENCHMARK_TYPE="THROUGHPUT_200" \
OPTIMIZATION_TARGET="${OPTIMIZATION_TARGET}" \
make -e "profile_erc20_gpu"
working-directory: fhevm/coprocessor/fhevm-engine/tfhe-worker
- name: Get nsys profile name
id: nsys_profile_name
@@ -215,7 +254,7 @@ jobs:
REPORT_NAME: ${{ steps.nsys_profile_name.outputs.profile }}
run: |
mv report1.nsys-rep ${{ env.REPORT_NAME }}
working-directory: fhevm/coprocessor/fhevm-engine/coprocessor
working-directory: fhevm/coprocessor/fhevm-engine/tfhe-worker
- name: Upload profile artifact
env:
@@ -223,17 +262,17 @@ jobs:
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02
with:
name: ${{ env.REPORT_NAME }}
path: fhevm/coprocessor/fhevm-engine/coprocessor/${{ env.REPORT_NAME }}
path: fhevm/coprocessor/fhevm-engine/tfhe-worker/${{ env.REPORT_NAME }}
- name: Run latency benchmark on GPU
run: |
BENCHMARK_BATCH_SIZE="${BATCH_SIZE}" FHEVM_DF_SCHEDULE="${SCHEDULING_POLICY}" BENCHMARK_TYPE="LATENCY" OPTIMIZATION_TARGET="${OPTIMIZATION_TARGET}" make -e "benchmark_${BENCHMARKS}_gpu"
working-directory: fhevm/coprocessor/fhevm-engine/coprocessor
working-directory: fhevm/coprocessor/fhevm-engine/tfhe-worker
- name: Run throughput benchmarks on GPU
run: |
BENCHMARK_BATCH_SIZE="${BATCH_SIZE}" FHEVM_DF_SCHEDULE="${SCHEDULING_POLICY}" BENCHMARK_TYPE="THROUGHPUT_200" OPTIMIZATION_TARGET="${OPTIMIZATION_TARGET}" make -e "benchmark_${BENCHMARKS}_gpu"
working-directory: fhevm/coprocessor/fhevm-engine/coprocessor
working-directory: fhevm/coprocessor/fhevm-engine/tfhe-worker
- name: Parse results
run: |
@@ -246,7 +285,7 @@ jobs:
--commit-date "${COMMIT_DATE}" \
--bench-date "${BENCH_DATE}" \
--walk-subdirs \
--crate "coprocessor/fhevm-engine/coprocessor" \
--crate "coprocessor/fhevm-engine/tfhe-worker" \
--name-suffix "operation_batch_size_${BATCH_SIZE}-schedule_${SCHEDULING_POLICY}-optimization_target_${OPTIMIZATION_TARGET}"
working-directory: fhevm/

View File

@@ -86,7 +86,7 @@ jobs:
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: hyperstack
profile: multi-gpu-test
profile: 4-l40
# This instance will be spawned especially for pull-request from forked repository
- name: Start GitHub instance

View File

@@ -43,7 +43,7 @@ jobs:
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: hyperstack
profile: multi-gpu-test
profile: 4-l40
cuda-tests:
name: gpu_integer_long_run_tests/cuda-tests

View File

@@ -1004,6 +1004,11 @@ test_list_gpu: install_rs_build_toolchain install_cargo_nextest
--features=integer,internal-keycache,gpu,zk-pok -p tfhe \
-E "test(/.*gpu.*/)"
.PHONY: build_one_hl_api_test_gpu
build_one_hl_api_test_gpu: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --no-run \
--features=integer,gpu-debug -vv -p tfhe -- "$${TEST}" --test-threads=1 --nocapture
test_high_level_api_hpu: install_rs_build_toolchain install_cargo_nextest
ifeq ($(HPU_CONFIG), v80)
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) nextest run --cargo-profile $(CARGO_PROFILE) \

View File

@@ -86,6 +86,7 @@ if(CMAKE_BUILD_TYPE_LOWERCASE STREQUAL "debug")
message("Compiling in Debug mode")
add_definitions(-DDEBUG)
set(OPTIMIZATION_FLAGS "${OPTIMIZATION_FLAGS} -O0 -G -g")
set(USE_NVTOOLS 1)
else()
# Release mode
message("Compiling in Release mode")

View File

@@ -6,6 +6,15 @@
#include <cstdlib>
#include <cuda_runtime.h>
#define CUDA_STREAM_POOL
enum CudaStreamType
{
KEY = 0,
ALLOC = 1,
TEMP_HELPER = 2,
};
extern "C" {
#define check_cuda_error(ans) \

View File

@@ -1,15 +1,27 @@
#include "device.h"
#include <atomic>
#include <cstdint>
#include <cuda_runtime.h>
#include <mutex>
#ifdef USE_NVTOOLS
#include <cuda_profiler_api.h>
#endif
#ifdef CUDA_STREAM_POOL
#include <deque>
#include <mutex>
#include <vector>
#include <unordered_map>
#endif
uint32_t cuda_get_device() {
int device;
check_cuda_error(cudaGetDevice(&device));
return static_cast<uint32_t>(device);
}
std::mutex pool_mutex;
bool mem_pools_enabled = false;
std::atomic<bool> mem_pools_enabled = false;
// We use memory pools to reduce some overhead of memory allocations due
// to our scratch/release pattern. This function is the simplest way of using
@@ -26,13 +38,13 @@ bool mem_pools_enabled = false;
// We tested more complex configurations of mempools, but they did not yield
// better results.
void cuda_setup_mempool(uint32_t caller_gpu_index) {
if (!mem_pools_enabled) {
pool_mutex.lock();
if (mem_pools_enabled)
return; // If mem pools are already enabled, we don't need to do anything
// We do it only once for all GPUs
mem_pools_enabled = true;
bool pools_not_initialized = false;
bool pools_initialized = true;
// compare_exchange_strong: if mem_pools_enabled still holds false
// (pools_not_initialized), atomically set it to true (pools_initialized)
// and run the if body; otherwise another thread already did the setup.
if (mem_pools_enabled.compare_exchange_strong(pools_not_initialized, pools_initialized)) {
uint32_t num_gpus = cuda_get_number_of_gpus();
for (uint32_t gpu_index = 0; gpu_index < num_gpus; gpu_index++) {
cuda_set_device(gpu_index);
@@ -75,7 +87,6 @@ void cuda_setup_mempool(uint32_t caller_gpu_index) {
}
// We return to the original gpu_index
cuda_set_device(caller_gpu_index);
pool_mutex.unlock();
}
}
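
Two notes on this hunk. First, the removed lock-based version could `return` while still holding `pool_mutex` (the early return after `pool_mutex.lock()` never unlocks), which is presumably the deadlock the `fix(gpu): deadlock` commit addresses. Second, the replacement is the standard atomic once-init pattern; a minimal standalone sketch (not repo code):

```cpp
// Minimal sketch of once-only init via compare_exchange_strong:
// exactly one caller flips the flag from false to true and runs
// the setup; every other caller skips it.
#include <atomic>
#include <cstdio>

std::atomic<bool> initialized{false};

void setup_once() {
  bool expected = false;
  if (initialized.compare_exchange_strong(expected, true)) {
    std::puts("running one-time setup");
  }
  // Note: unlike a mutex, losers of the race return immediately;
  // they do not wait for the winner to finish the setup.
}
```

The per-GPU mempool setup body itself is elided from this hunk; for context, a hedged sketch of typical CUDA mempool configuration (assumed, not taken from the PR) matching the stated goal of avoiding repeated alloc/free overhead:

```cpp
// Hypothetical example: raise the default pool's release threshold so
// cudaMallocAsync/cudaFreeAsync reuse freed blocks instead of handing
// them back to the driver at each synchronization point.
#include <cstdint>
#include <cuda_runtime.h>

void raise_release_threshold(int device) {
  cudaMemPool_t pool;
  cudaDeviceGetDefaultMemPool(&pool, device); // error checks omitted
  uint64_t threshold = UINT64_MAX;
  cudaMemPoolSetAttribute(pool, cudaMemPoolAttrReleaseThreshold, &threshold);
}
```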
@@ -83,6 +94,9 @@ void cuda_set_device(uint32_t gpu_index) {
check_cuda_error(cudaSetDevice(gpu_index));
// Mempools are initialized only once in all the GPUS available
cuda_setup_mempool(gpu_index);
#ifdef USE_NVTOOLS
check_cuda_error(cudaProfilerStart());
#endif
}
cudaEvent_t cuda_create_event(uint32_t gpu_index) {
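
With `USE_NVTOOLS` set in Debug builds (see the CMake change above), `cuda_set_device` now also calls `cudaProfilerStart()`. For reference, the usual pattern pairs it with `cudaProfilerStop()` to delimit a capture window when profiling with a capture range, e.g. `nsys profile --capture-range=cudaProfilerApi`; the diff only issues the start call. A sketch of the paired usage (assumed, not repo code):

```cpp
#include <cuda_profiler_api.h>
#include <cuda_runtime.h>

void profiled_region() {
  cudaProfilerStart(); // profiler begins collecting here
  // ... launch the kernels of interest ...
  cudaDeviceSynchronize();
  cudaProfilerStop();  // collection ends here
}
```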
@@ -109,18 +123,90 @@ void cuda_event_destroy(cudaEvent_t event, uint32_t gpu_index) {
check_cuda_error(cudaEventDestroy(event));
}
#ifdef CUDA_STREAM_POOL
struct CudaBoundStream
{
cudaStream_t stream;
uint32_t gpu_index;
};
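// Fixed-size pool of non-blocking streams, created lazily on first use
// and handed out round-robin; streams are never destroyed.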
class CudaStreamPool
{
std::vector<CudaBoundStream> poolCompute;
std::vector<CudaBoundStream> poolTransfer;
std::mutex mutex_pools;
size_t nextStream = 0;
const size_t MAX_STREAMS = 8;
public:
cudaStream_t create_stream(uint32_t gpu_index)
{
std::lock_guard<std::mutex> lock(mutex_pools);
if (poolCompute.empty())
{
poolCompute.reserve(MAX_STREAMS);
cuda_set_device(gpu_index);
for (size_t i = 0; i < MAX_STREAMS; i++)
{
cudaStream_t stream;
check_cuda_error(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
poolCompute.push_back(CudaBoundStream{stream, gpu_index});
}
}
PANIC_IF_FALSE(gpu_index == poolCompute[nextStream].gpu_index, "Bad gpu in stream pool");
cudaStream_t res = poolCompute[nextStream].stream;
nextStream = (nextStream + 1) % poolCompute.size();
return res;
}
void destroy_stream(cudaStream_t stream, uint32_t gpu_index)
{
// do nothing: pooled streams are recycled, never destroyed
}
};
class CudaMultiStreamPool {
std::unordered_map<uint32_t, CudaStreamPool> per_gpu_pools;
std::mutex pools_mutex; // guards creation of the per-GPU pools
public:
CudaStreamPool &get(uint32_t gpu_index) {
std::lock_guard<std::mutex> guard(pools_mutex);
return per_gpu_pools[gpu_index]; // creates it if it does not exist
}
};
CudaMultiStreamPool gCudaStreamPool;
#endif
/// Unsafe function to create a CUDA stream, must check first that the GPU exists
cudaStream_t cuda_create_stream(uint32_t gpu_index) {
#ifdef CUDA_STREAM_POOL
cuda_set_device(gpu_index); // this will initialize the mempool
return gCudaStreamPool.get(gpu_index).create_stream(gpu_index);
#else
cuda_set_device(gpu_index);
cudaStream_t stream;
check_cuda_error(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
return stream;
#endif
}
/// Unsafe function to destroy a CUDA stream, must check first that the GPU exists
void cuda_destroy_stream(cudaStream_t stream, uint32_t gpu_index) {
#ifdef CUDA_STREAM_POOL
gCudaStreamPool.get(gpu_index).destroy_stream(stream, gpu_index);
#else
cuda_set_device(gpu_index);
check_cuda_error(cudaStreamDestroy(stream));
#endif
}
void cuda_synchronize_stream(cudaStream_t stream, uint32_t gpu_index) {

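Net effect of the pool: with `CUDA_STREAM_POOL` defined, `cuda_create_stream` hands out the same `MAX_STREAMS` (8) streams per GPU in round-robin order, and `cuda_destroy_stream` becomes a no-op. A small standalone harness (assumed, not repo code) that would demonstrate the recycling:

```cpp
#include <cstdint>
#include <cstdio>
#include <cuda_runtime.h>

// Declarations matching the pooled functions in the diff.
extern "C" {
cudaStream_t cuda_create_stream(uint32_t gpu_index);
void cuda_destroy_stream(cudaStream_t stream, uint32_t gpu_index);
}

int main() {
  cudaStream_t first = cuda_create_stream(0);
  for (int i = 1; i < 8; i++) // cycle once through the 8-slot pool
    (void)cuda_create_stream(0);
  cudaStream_t ninth = cuda_create_stream(0); // wraps back to slot 0
  std::printf("recycled: %s\n", first == ninth ? "yes" : "no");
  cuda_destroy_stream(first, 0); // no-op under the pool
}
```

One consequence callers must tolerate: a "new" stream may already carry earlier work, so operations queued on it can serialize behind work submitted through a previous create call that returned the same pooled stream.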
View File

@@ -83,18 +83,6 @@ image_name = "Ubuntu Server 22.04 LTS R570 CUDA 12.8"
flavor_name = "n3-A100x8-NVLink"
user = "ubuntu"
[backend.hyperstack.multi-gpu-test]
environment_name = "canada"
image_name = "Ubuntu Server 22.04 LTS R570 CUDA 12.8"
flavor_name = "n3-L40x4"
user = "ubuntu"
[backend.hyperstack.multi-gpu-test_fallback]
environment_name = "canada"
image_name = "Ubuntu Server 22.04 LTS R570 CUDA 12.8"
flavor_name = "n3-RTX-A6000x2"
user = "ubuntu"
[backend.hyperstack.l40]
environment_name = "canada"
image_name = "Ubuntu Server 22.04 LTS R570 CUDA 12.8"
@@ -106,3 +94,9 @@ environment_name = "canada"
image_name = "Ubuntu Server 22.04 LTS R570 CUDA 12.8"
flavor_name = "n3-RTX-A6000x1"
user = "ubuntu"
[backend.hyperstack.4-l40]
environment_name = "canada"
image_name = "Ubuntu Server 22.04 LTS R570 CUDA 12.8"
flavor_name = "n3-L40x4"
user = "ubuntu"