From 36eceaf05ec53028b62316873870c3d5fe8c24f1 Mon Sep 17 00:00:00 2001
From: Andrei Stoian
Date: Tue, 15 Jul 2025 11:25:29 +0200
Subject: [PATCH] feat(gpu): utility debug workflows in ci

---
Review notes (ignored by `git am`, not part of the commit message):
- Line breaks and indentation were rebuilt from a whitespace-collapsed copy of
  this patch. Whitespace of context lines and the position of blank context
  lines are best-effort; check them against the target tree before applying.
- Text that looked like it lost angle-bracket content (author e-mail, C++
  template arguments in cmux.cu) is reproduced as found, not restored.
- scripts/check_memory_errors.sh: the sanitizer loop used
  `compute-sanitizer ... && if [[ $? != "0" ]]`, so a failing run skipped the
  check and the script always exited 0. The exit status is now tested directly.
- Makefile: corrected/added the help comments of the two new targets and
  dropped a redundant `cat`.
- The `index` hashes and hunk headers are unchanged and predate these edits.

 .github/actions/gpu_setup/action.yml          |   8 +
 .../workflows/gpu_code_validation_tests.yml   | 148 ++++++++++++++++++
 Makefile                                      |  18 ++-
 .../tfhe-cuda-backend/cuda/CMakeLists.txt     |   4 +-
 .../cuda/src/integer/cmux.cu                  |   3 +-
 scripts/check_memory_errors.sh                |  27 ++++
 6 files changed, 205 insertions(+), 3 deletions(-)
 create mode 100644 .github/workflows/gpu_code_validation_tests.yml
 create mode 100755 scripts/check_memory_errors.sh

diff --git a/.github/actions/gpu_setup/action.yml b/.github/actions/gpu_setup/action.yml
index 3c50b7272..6e788a447 100644
--- a/.github/actions/gpu_setup/action.yml
+++ b/.github/actions/gpu_setup/action.yml
@@ -23,6 +23,7 @@ runs:
         echo "${CMAKE_SCRIPT_SHA} cmake-${CMAKE_VERSION}-linux-x86_64.sh" > checksum
         sha256sum -c checksum
         sudo bash cmake-"${CMAKE_VERSION}"-linux-x86_64.sh --skip-license --prefix=/usr/ --exclude-subdir
+        sudo apt remove -y unattended-upgrades
         sudo apt update
         sudo apt install -y cmake-format libclang-dev
       env:
@@ -50,11 +51,13 @@ runs:
     - name: Export CUDA variables
       shell: bash
       run: |
+        find /usr/local -executable -name "nvcc"
         CUDA_PATH=/usr/local/cuda-"${CUDA_VERSION}"
         {
           echo "CUDA_PATH=$CUDA_PATH";
           echo "LD_LIBRARY_PATH=$CUDA_PATH/lib64:$LD_LIBRARY_PATH";
           echo "CUDA_MODULE_LOADER=EAGER";
+          echo "PATH=$PATH:$CUDA_PATH/bin";
         } >> "${GITHUB_ENV}"
         {
           echo "PATH=$PATH:$CUDA_PATH/bin";
@@ -74,6 +77,11 @@ runs:
       env:
         GCC_VERSION: ${{ inputs.gcc-version }}
 
+    - name: Check setup
+      shell: bash
+      run: |
+        which nvcc
+
     - name: Check device is detected
       shell: bash
       run: nvidia-smi
diff --git a/.github/workflows/gpu_code_validation_tests.yml b/.github/workflows/gpu_code_validation_tests.yml
new file mode 100644
index 000000000..a70d145e4
--- /dev/null
+++ b/.github/workflows/gpu_code_validation_tests.yml
@@ -0,0 +1,148 @@
+# Compile tfhe-cuda-backend and run compute-sanitizer checks on a Hyperstack instance
+name: Cuda - Code Validation
+
+env:
+  CARGO_TERM_COLOR: always
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  RUSTFLAGS: "-C target-cpu=native"
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+  SLACKIFY_MARKDOWN: true
+  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
+  PULL_REQUEST_MD_LINK: ""
+  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
+  # Secrets will be available only to zama-ai organization members
+  SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }}
+  EXTERNAL_CONTRIBUTION_RUNNER: "gpu_ubuntu-22.04"
+
+on:
+  # Allows you to run this workflow manually from the Actions tab as an alternative.
+  workflow_dispatch:
+  pull_request:
+    types: [ labeled ]
+
+permissions:
+  contents: read
+
+jobs:
+  setup-instance:
+    name: Setup instance (cuda-tests)
+    runs-on: ubuntu-latest
+    if: github.event_name != 'pull_request' ||
+      (github.event.action == 'labeled' && github.event.label.name == 'approved')
+    outputs:
+      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
+    steps:
+      - name: Start remote instance
+        id: start-remote-instance
+        if: env.SECRETS_AVAILABLE == 'true'
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        with:
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: hyperstack
+          profile: gpu-test
+
+      # This instance will be spawned especially for pull-request from forked repository
+      - name: Start GitHub instance
+        id: start-github-instance
+        if: env.SECRETS_AVAILABLE == 'false'
+        run: |
+          echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"
+
+  cuda-tests-linux:
+    name: CUDA Code Validation tests
+    needs: [ setup-instance ]
+    if: github.event_name != 'pull_request' ||
+      (github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
+    concurrency:
+      group: ${{ github.workflow_ref }}
+      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    strategy:
+      fail-fast: false
+      # explicit include-based build matrix, of known valid options
+      matrix:
+        include:
+          - os: ubuntu-22.04
+            cuda: "12.8"
+            gcc: 11
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+        with:
+          persist-credentials: 'false'
+          token: ${{ env.CHECKOUT_TOKEN }}
+
+      - name: Setup Hyperstack dependencies
+        uses: ./.github/actions/gpu_setup
+        with:
+          cuda-version: ${{ matrix.cuda }}
+          gcc-version: ${{ matrix.gcc }}
+          github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
+
+      - name: Find tools
+        run: |
+          find /usr -executable -name "compute-sanitizer"
+
+      - name: Install latest stable
+        uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
+        with:
+          toolchain: stable
+
+      - name: Run memory sanitizer
+        run: |
+          make test_high_level_api_gpu_debug
+
+  slack-notify:
+    name: Slack Notification
+    needs: [ setup-instance, cuda-tests-linux ]
+    runs-on: ubuntu-latest
+    if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' && failure() }}
+    continue-on-error: true
+    steps:
+      - name: Set pull-request URL
+        if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
+        run: |
+          echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
+        env:
+          PR_BASE_URL: ${{ vars.PR_BASE_URL }}
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+
+      - name: Send message
+        if: env.SECRETS_AVAILABLE == 'true'
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
+        env:
+          SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
+          SLACK_MESSAGE: "GPU code validation tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
+
+  teardown-instance:
+    name: Teardown instance (cuda-tests)
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
+    needs: [ setup-instance, cuda-tests-linux ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop remote instance
+        id: stop-instance
+        if: env.SECRETS_AVAILABLE == 'true'
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}
+
+      - name: Slack Notification
+        if: ${{ failure() }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (cuda-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
diff --git a/Makefile b/Makefile
index be7152ce2..3e0a50368 100644
--- a/Makefile
+++ b/Makefile
@@ -686,6 +686,22 @@ test_integer_gpu_debug: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile release_lto_off \
 		--features=integer,gpu-debug -p $(TFHE_SPEC) -- integer::gpu::server_key::
 
+.PHONY: test_high_level_api_gpu_debug # Run the high-level API GPU tests under compute-sanitizer
+test_high_level_api_gpu_debug: install_rs_build_toolchain install_cargo_nextest
+	export RUSTFLAGS="$(RUSTFLAGS)" && \
+	export CARGO_RS_BUILD_TOOLCHAIN="$(CARGO_RS_BUILD_TOOLCHAIN)" && \
+	export TFHE_SPEC="$(TFHE_SPEC)" && \
+	export CARGO_PROFILE="$(CARGO_PROFILE)" && scripts/check_memory_errors.sh
+
+.PHONY: test_integer_hl_test_gpu_check_warnings # Fail if the gpu-debug build emits tfhe-cuda-backend compiler warnings
+test_integer_hl_test_gpu_check_warnings: install_rs_build_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build \
+		--features=integer,internal-keycache,gpu-debug,zk-pok -vv -p $(TFHE_SPEC) &> /tmp/gpu_compile_output
+	WARNINGS=$$(grep ": warning:" /tmp/gpu_compile_output | grep "\[tfhe-cuda-backend" | grep -v "inline function" || true) && \
+	if [[ "$${WARNINGS}" != "" ]]; then \
+		echo "$${WARNINGS}" && exit 1; \
+	fi
+
 .PHONY: test_integer_long_run_gpu # Run the long run integer tests on the gpu backend
 test_integer_long_run_gpu: install_rs_check_toolchain install_cargo_nextest
@@ -1591,7 +1607,7 @@ tfhe_lints
 
 .PHONY: pcc_gpu # pcc stands for pre commit checks for GPU compilation
 pcc_gpu: check_rust_bindings_did_not_change clippy_rustdoc_gpu \
-clippy_gpu clippy_cuda_backend clippy_bench_gpu check_compile_tests_benches_gpu
+clippy_gpu clippy_cuda_backend clippy_bench_gpu check_compile_tests_benches_gpu test_integer_hl_test_gpu_check_warnings
 
 .PHONY: pcc_hpu # pcc stands for pre commit checks for HPU compilation
 pcc_hpu: clippy_hpu clippy_hpu_backend clippy_hpu_mockup test_integer_hpu_mockup_ci_fast
diff --git a/backends/tfhe-cuda-backend/cuda/CMakeLists.txt b/backends/tfhe-cuda-backend/cuda/CMakeLists.txt
index 0c52f99eb..c027f909f 100644
--- a/backends/tfhe-cuda-backend/cuda/CMakeLists.txt
+++ b/backends/tfhe-cuda-backend/cuda/CMakeLists.txt
@@ -78,8 +78,10 @@ endif()
 
 add_compile_definitions(CUDA_ARCH=${CUDA_ARCH})
 
+string(TOLOWER "${CMAKE_BUILD_TYPE}" CMAKE_BUILD_TYPE_LOWERCASE)
+
 # Check if the DEBUG flag is defined
-if(CMAKE_BUILD_TYPE STREQUAL "Debug")
+if(CMAKE_BUILD_TYPE_LOWERCASE STREQUAL "debug")
   # Debug mode
   message("Compiling in Debug mode")
   add_definitions(-DDEBUG)
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu
index 733befdd0..6fb141200 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu
@@ -17,11 +17,12 @@ uint64_t scratch_cuda_integer_radix_cmux_kb_64(
   std::function predicate_lut_f =
       [](uint64_t x) -> uint64_t { return x == 1; };
 
-  return scratch_cuda_integer_radix_cmux_kb(
+  uint64_t ret = scratch_cuda_integer_radix_cmux_kb(
       (cudaStream_t *)(streams), gpu_indexes, gpu_count,
       (int_cmux_buffer **)mem_ptr, predicate_lut_f,
       lwe_ciphertext_count, params, allocate_gpu_memory);
   POP_RANGE()
+  return ret;
 }
 
 void cuda_cmux_integer_radix_ciphertext_kb_64(
diff --git a/scripts/check_memory_errors.sh b/scripts/check_memory_errors.sh
new file mode 100755
index 000000000..3ce45379b
--- /dev/null
+++ b/scripts/check_memory_errors.sh
@@ -0,0 +1,27 @@
+#!/usr/bin/env bash
+# Build the tests but don't run them
+RUSTFLAGS="$RUSTFLAGS" cargo "${CARGO_RS_BUILD_TOOLCHAIN}" test --no-run --profile "${CARGO_PROFILE}" \
+  --features=integer,internal-keycache,gpu,zk-pok -p "${TFHE_SPEC}"
+
+# Find the test executable
+EXECUTABLE=$(find target/release/deps/ -type f -executable -name "tfhe-*")
+
+# List the tests into a temporary file
+RUSTFLAGS="$RUSTFLAGS" cargo "${CARGO_RS_BUILD_TOOLCHAIN}" nextest list --cargo-profile "${CARGO_PROFILE}" \
+  --features=integer,internal-keycache,gpu,zk-pok -p "${TFHE_SPEC}" &> /tmp/test_list.txt
+
+# Filter the tests to get only the HL ones
+TESTS_HL=$(sed -e $'s/\x1b\[[0-9;]*m//g' < /tmp/test_list.txt | grep 'high_level_api::.*gpu.*')
+
+# Run compute sanitizer on each test individually. Test the sanitizer's exit
+# status directly: chaining it with `&&` before inspecting `$?` skips the check
+# whenever the sanitizer fails, so failures would never be recorded.
+RESULT=0
+while read -r t; do
+  echo compute-sanitizer --target-processes=all "$(pwd)"/"${EXECUTABLE}" -- "${t}"
+  if ! compute-sanitizer --leak-check=full --error-exitcode=1 --target-processes=all "$(pwd)"/"${EXECUTABLE}" -- "${t}"; then
+    RESULT=1
+  fi
+done <<< "${TESTS_HL}"
+
+exit "${RESULT}"