mirror of
https://github.com/zama-ai/tfhe-rs.git
synced 2026-01-08 22:28:01 -05:00
feat(gpu): utility debug workflows in ci
This commit is contained in:
committed by
Agnès Leroy
parent
e8986cbd7c
commit
36eceaf05e
8
.github/actions/gpu_setup/action.yml
vendored
8
.github/actions/gpu_setup/action.yml
vendored
@@ -23,6 +23,7 @@ runs:
|
|||||||
echo "${CMAKE_SCRIPT_SHA} cmake-${CMAKE_VERSION}-linux-x86_64.sh" > checksum
|
echo "${CMAKE_SCRIPT_SHA} cmake-${CMAKE_VERSION}-linux-x86_64.sh" > checksum
|
||||||
sha256sum -c checksum
|
sha256sum -c checksum
|
||||||
sudo bash cmake-"${CMAKE_VERSION}"-linux-x86_64.sh --skip-license --prefix=/usr/ --exclude-subdir
|
sudo bash cmake-"${CMAKE_VERSION}"-linux-x86_64.sh --skip-license --prefix=/usr/ --exclude-subdir
|
||||||
|
sudo apt remove -y unattended-upgrades
|
||||||
sudo apt update
|
sudo apt update
|
||||||
sudo apt install -y cmake-format libclang-dev
|
sudo apt install -y cmake-format libclang-dev
|
||||||
env:
|
env:
|
||||||
@@ -50,11 +51,13 @@ runs:
|
|||||||
- name: Export CUDA variables
|
- name: Export CUDA variables
|
||||||
shell: bash
|
shell: bash
|
||||||
run: |
|
run: |
|
||||||
|
find /usr/local -executable -name "nvcc"
|
||||||
CUDA_PATH=/usr/local/cuda-"${CUDA_VERSION}"
|
CUDA_PATH=/usr/local/cuda-"${CUDA_VERSION}"
|
||||||
{
|
{
|
||||||
echo "CUDA_PATH=$CUDA_PATH";
|
echo "CUDA_PATH=$CUDA_PATH";
|
||||||
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib64:$LD_LIBRARY_PATH";
|
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib64:$LD_LIBRARY_PATH";
|
||||||
echo "CUDA_MODULE_LOADER=EAGER";
|
echo "CUDA_MODULE_LOADER=EAGER";
|
||||||
|
echo "PATH=$PATH:$CUDA_PATH/bin";
|
||||||
} >> "${GITHUB_ENV}"
|
} >> "${GITHUB_ENV}"
|
||||||
{
|
{
|
||||||
echo "PATH=$PATH:$CUDA_PATH/bin";
|
echo "PATH=$PATH:$CUDA_PATH/bin";
|
||||||
@@ -74,6 +77,11 @@ runs:
|
|||||||
env:
|
env:
|
||||||
GCC_VERSION: ${{ inputs.gcc-version }}
|
GCC_VERSION: ${{ inputs.gcc-version }}
|
||||||
|
|
||||||
|
- name: Check setup
|
||||||
|
shell: bash
|
||||||
|
run: |
|
||||||
|
which nvcc
|
||||||
|
|
||||||
- name: Check device is detected
|
- name: Check device is detected
|
||||||
shell: bash
|
shell: bash
|
||||||
run: nvidia-smi
|
run: nvidia-smi
|
||||||
|
|||||||
148
.github/workflows/gpu_code_validation_tests.yml
vendored
Normal file
148
.github/workflows/gpu_code_validation_tests.yml
vendored
Normal file
@@ -0,0 +1,148 @@
|
|||||||
|
# Run the GPU high-level API tests under compute-sanitizer
# (make test_high_level_api_gpu_debug) on a remote GPU instance started through
# slab (Hyperstack backend), or on a GitHub GPU runner when secrets are not
# available.
name: Cuda - Code Validation

env:
  CARGO_TERM_COLOR: always
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
  RUSTFLAGS: "-C target-cpu=native"
  RUST_BACKTRACE: "full"
  RUST_MIN_STACK: "8388608"
  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
  SLACKIFY_MARKDOWN: true
  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
  # Empty by default; the slack-notify job overwrites it for pull requests.
  PULL_REQUEST_MD_LINK: ""
  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
  # Secrets will be available only to zama-ai organization members
  SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }}
  # Runner used instead of a slab instance when SECRETS_AVAILABLE is 'false'.
  EXTERNAL_CONTRIBUTION_RUNNER: "gpu_ubuntu-22.04"

on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  # Only label events trigger the workflow on pull requests; the setup-instance
  # job additionally requires the label to be 'approved'.
  pull_request:
    types: [ labeled ]

permissions:
  contents: read

jobs:
  # Picks the runner for the test job: a slab-managed remote instance when
  # secrets are available, the external-contribution GitHub runner otherwise.
  setup-instance:
    name: Setup instance (cuda-tests)
    runs-on: ubuntu-latest
    # Manual runs always proceed; pull-request runs need the 'approved' label.
    if: github.event_name != 'pull_request' ||
      (github.event.action == 'labeled' && github.event.label.name == 'approved')
    outputs:
      # Whichever of the two start steps ran provides the runner name.
      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
    steps:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
          slab-url: ${{ secrets.SLAB_BASE_URL }}
          job-secret: ${{ secrets.JOB_SECRET }}
          backend: hyperstack
          profile: gpu-test

      # This instance will be spawned especially for pull-request from forked repository
      - name: Start GitHub instance
        id: start-github-instance
        if: env.SECRETS_AVAILABLE == 'false'
        run: |
          echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"

  # Builds the GPU tests and runs the high-level API ones under compute-sanitizer.
  cuda-tests-linux:
    name: CUDA Code Validation tests
    needs: [ setup-instance ]
    # On pull requests, do not run when setup-instance was skipped (label gate).
    if: github.event_name != 'pull_request' ||
      (github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
    concurrency:
      group: ${{ github.workflow_ref }}
      # In-progress runs are cancelled on every ref except main.
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    strategy:
      fail-fast: false
      # explicit include-based build matrix, of known valid options
      matrix:
        include:
          - os: ubuntu-22.04
            cuda: "12.8"
            gcc: 11
    steps:
      - name: Checkout tfhe-rs
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}

      # Local composite action; receives the CUDA/GCC versions from the matrix
      # and whether the job runs on the GitHub runner (no secrets available).
      - name: Setup Hyperstack dependencies
        uses: ./.github/actions/gpu_setup
        with:
          cuda-version: ${{ matrix.cuda }}
          gcc-version: ${{ matrix.gcc }}
          github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}

      # Debug aid: prints where compute-sanitizer is installed under /usr.
      - name: Find tools
        run: |
          find /usr -executable -name "compute-sanitizer"

      - name: Install latest stable
        uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
        with:
          toolchain: stable

      - name: Run memory sanitizer
        run: |
          make test_high_level_api_gpu_debug

  # Posts the test result to Slack; only runs when a needed job failed and the
  # test job was not skipped.
  slack-notify:
    name: Slack Notification
    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
    if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' && failure() }}
    continue-on-error: true
    steps:
      # Exports PULL_REQUEST_MD_LINK through GITHUB_ENV so the next step can
      # embed a pull-request link in the Slack message.
      - name: Set pull-request URL
        if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
        run: |
          echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
        env:
          PR_BASE_URL: ${{ vars.PR_BASE_URL }}
          PR_NUMBER: ${{ github.event.pull_request.number }}

      - name: Send message
        if: env.SECRETS_AVAILABLE == 'true'
        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
          SLACK_MESSAGE: "GPU code validation tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"

  # Stops the remote instance. always() keeps this job running when the test
  # job failed, provided setup-instance itself succeeded.
  teardown-instance:
    name: Teardown instance (cuda-tests)
    if: ${{ always() && needs.setup-instance.result == 'success' }}
    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
    steps:
      # Skipped when the tests ran on the GitHub runner (no remote instance to stop).
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
          slab-url: ${{ secrets.SLAB_BASE_URL }}
          job-secret: ${{ secrets.JOB_SECRET }}
          label: ${{ needs.setup-instance.outputs.runner-name }}

      # Reports a failed teardown, i.e. an instance that may still be running.
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (cuda-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||||
18
Makefile
18
Makefile
@@ -686,6 +686,22 @@ test_integer_gpu_debug: install_rs_build_toolchain
|
|||||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile release_lto_off \
|
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile release_lto_off \
|
||||||
--features=integer,gpu-debug -p $(TFHE_SPEC) -- integer::gpu::server_key::
|
--features=integer,gpu-debug -p $(TFHE_SPEC) -- integer::gpu::server_key::
|
||||||
|
|
||||||
|
.PHONY: test_high_level_api_gpu_debug # Run the high-level API GPU tests under compute-sanitizer
# Exports the Makefile variables that scripts/check_memory_errors.sh reads from
# its environment (RUSTFLAGS, CARGO_RS_BUILD_TOOLCHAIN, TFHE_SPEC, CARGO_PROFILE),
# then runs the script in the same shell.
test_high_level_api_gpu_debug: install_rs_build_toolchain install_cargo_nextest
	export RUSTFLAGS="$(RUSTFLAGS)" && \
		export CARGO_RS_BUILD_TOOLCHAIN="$(CARGO_RS_BUILD_TOOLCHAIN)" && \
		export TFHE_SPEC="$(TFHE_SPEC)" && \
		export CARGO_PROFILE="$(CARGO_PROFILE)" && scripts/check_memory_errors.sh
|
||||||
|
|
||||||
|
.PHONY: test_integer_hl_test_gpu_check_warnings # Fail if building with gpu-debug emits tfhe-cuda-backend compiler warnings
# Builds with verbose output captured in /tmp/gpu_compile_output, then fails if
# that log contains ": warning:" lines tagged "[tfhe-cuda-backend", ignoring the
# ones mentioning "inline function".
test_integer_hl_test_gpu_check_warnings: install_rs_build_toolchain
	# The build output is redirected to a file, so print it when the build
	# itself fails; otherwise the target would abort without any diagnostic.
	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build \
		--features=integer,internal-keycache,gpu-debug,zk-pok -vv -p $(TFHE_SPEC) &> /tmp/gpu_compile_output || \
		{ cat /tmp/gpu_compile_output; exit 1; }
	# `|| true` keeps the pipeline successful when grep finds no warning.
	WARNINGS=$$(grep ": warning:" /tmp/gpu_compile_output | grep "\[tfhe-cuda-backend" | grep -v "inline function" || true) && \
	if [[ -n "$${WARNINGS}" ]]; then \
		echo "$${WARNINGS}" && exit 1; \
	fi
|
||||||
|
|
||||||
|
|
||||||
.PHONY: test_integer_long_run_gpu # Run the long run integer tests on the gpu backend
|
.PHONY: test_integer_long_run_gpu # Run the long run integer tests on the gpu backend
|
||||||
test_integer_long_run_gpu: install_rs_check_toolchain install_cargo_nextest
|
test_integer_long_run_gpu: install_rs_check_toolchain install_cargo_nextest
|
||||||
@@ -1591,7 +1607,7 @@ tfhe_lints
|
|||||||
|
|
||||||
.PHONY: pcc_gpu # pcc stands for pre commit checks for GPU compilation
|
.PHONY: pcc_gpu # pcc stands for pre commit checks for GPU compilation
|
||||||
pcc_gpu: check_rust_bindings_did_not_change clippy_rustdoc_gpu \
|
pcc_gpu: check_rust_bindings_did_not_change clippy_rustdoc_gpu \
|
||||||
clippy_gpu clippy_cuda_backend clippy_bench_gpu check_compile_tests_benches_gpu
|
clippy_gpu clippy_cuda_backend clippy_bench_gpu check_compile_tests_benches_gpu test_integer_hl_test_gpu_check_warnings
|
||||||
|
|
||||||
.PHONY: pcc_hpu # pcc stands for pre commit checks for HPU compilation
|
.PHONY: pcc_hpu # pcc stands for pre commit checks for HPU compilation
|
||||||
pcc_hpu: clippy_hpu clippy_hpu_backend clippy_hpu_mockup test_integer_hpu_mockup_ci_fast
|
pcc_hpu: clippy_hpu clippy_hpu_backend clippy_hpu_mockup test_integer_hpu_mockup_ci_fast
|
||||||
|
|||||||
@@ -78,8 +78,10 @@ endif()
|
|||||||
|
|
||||||
add_compile_definitions(CUDA_ARCH=${CUDA_ARCH})
|
add_compile_definitions(CUDA_ARCH=${CUDA_ARCH})
|
||||||
|
|
||||||
|
string(TOLOWER "${CMAKE_BUILD_TYPE}" CMAKE_BUILD_TYPE_LOWERCASE)
|
||||||
|
|
||||||
# Check if the DEBUG flag is defined
|
# Check if the DEBUG flag is defined
|
||||||
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
|
if(CMAKE_BUILD_TYPE_LOWERCASE STREQUAL "debug")
|
||||||
# Debug mode
|
# Debug mode
|
||||||
message("Compiling in Debug mode")
|
message("Compiling in Debug mode")
|
||||||
add_definitions(-DDEBUG)
|
add_definitions(-DDEBUG)
|
||||||
|
|||||||
@@ -17,11 +17,12 @@ uint64_t scratch_cuda_integer_radix_cmux_kb_64(
|
|||||||
std::function<uint64_t(uint64_t)> predicate_lut_f =
|
std::function<uint64_t(uint64_t)> predicate_lut_f =
|
||||||
[](uint64_t x) -> uint64_t { return x == 1; };
|
[](uint64_t x) -> uint64_t { return x == 1; };
|
||||||
|
|
||||||
return scratch_cuda_integer_radix_cmux_kb<uint64_t>(
|
uint64_t ret = scratch_cuda_integer_radix_cmux_kb<uint64_t>(
|
||||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||||
(int_cmux_buffer<uint64_t> **)mem_ptr, predicate_lut_f,
|
(int_cmux_buffer<uint64_t> **)mem_ptr, predicate_lut_f,
|
||||||
lwe_ciphertext_count, params, allocate_gpu_memory);
|
lwe_ciphertext_count, params, allocate_gpu_memory);
|
||||||
POP_RANGE()
|
POP_RANGE()
|
||||||
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
void cuda_cmux_integer_radix_ciphertext_kb_64(
|
void cuda_cmux_integer_radix_ciphertext_kb_64(
|
||||||
|
|||||||
27
scripts/check_memory_errors.sh
Executable file
27
scripts/check_memory_errors.sh
Executable file
@@ -0,0 +1,27 @@
|
|||||||
|
#!/usr/bin/env bash
# Run each GPU high-level API test of the tfhe test binary under
# compute-sanitizer, one test per sanitizer invocation. Exits non-zero when the
# build, the test listing, or any sanitized test fails.
#
# Environment read by this script (exported by `make test_high_level_api_gpu_debug`):
#   RUSTFLAGS                 forwarded to cargo
#   CARGO_RS_BUILD_TOOLCHAIN  passed as the first argument to cargo
#   CARGO_PROFILE             cargo profile used to build and list the tests
#   TFHE_SPEC                 package spec given to `-p`

FEATURES="integer,internal-keycache,gpu,zk-pok"

# Cargo writes `dev`/`test` artifacts to target/debug and `bench` artifacts to
# target/release; every other profile uses a directory named after the profile.
case "${CARGO_PROFILE}" in
    dev | test) PROFILE_DIR="debug" ;;
    bench) PROFILE_DIR="release" ;;
    *) PROFILE_DIR="${CARGO_PROFILE}" ;;
esac

# Build the tests but don't run them
RUSTFLAGS="$RUSTFLAGS" cargo "${CARGO_RS_BUILD_TOOLCHAIN}" test --no-run --profile "${CARGO_PROFILE}" \
    --features="${FEATURES}" -p "${TFHE_SPEC}" || exit 1

# Find the test executable. Several tfhe-* binaries can coexist after builds
# with different settings, so keep only the most recently modified one.
EXECUTABLE=$(find "target/${PROFILE_DIR}/deps/" -type f -executable -name "tfhe-*" -printf '%T@ %p\n' \
    | sort -rn | head -n 1 | cut -d ' ' -f 2-)
if [[ -z "${EXECUTABLE}" ]]; then
    echo "No tfhe test executable found in target/${PROFILE_DIR}/deps/" >&2
    exit 1
fi

# List the tests into a temporary file; show the captured output on failure
# since it is otherwise hidden by the redirection.
if ! RUSTFLAGS="$RUSTFLAGS" cargo "${CARGO_RS_BUILD_TOOLCHAIN}" nextest list --cargo-profile "${CARGO_PROFILE}" \
    --features="${FEATURES}" -p "${TFHE_SPEC}" &> /tmp/test_list.txt; then
    cat /tmp/test_list.txt >&2
    exit 1
fi

# Filter the tests to get only the HL ones (ANSI color codes are stripped first)
TESTS_HL=$(sed -e $'s/\x1b\[[0-9;]*m//g' < /tmp/test_list.txt | grep 'high_level_api::.*gpu.*')
if [[ -z "${TESTS_HL}" ]]; then
    # Without this guard the loop below would run once with an empty filter,
    # i.e. the whole test binary under the sanitizer.
    echo "No GPU high-level API test found in the nextest listing" >&2
    exit 1
fi

# Run compute sanitizer on each test individually. A failing test is recorded
# in RESULT but does not stop the loop, so every test gets reported.
RESULT=0
while read -r t; do
    SANITIZER_CMD=(compute-sanitizer --leak-check=full --error-exitcode=1 --target-processes=all
        "$(pwd)/${EXECUTABLE}" -- "${t}")
    echo "${SANITIZER_CMD[*]}"
    if ! "${SANITIZER_CMD[@]}"; then
        RESULT=1
    fi
done <<< "${TESTS_HL}"

exit "${RESULT}"
|
||||||
Reference in New Issue
Block a user