Files
tfhe-rs/.github/workflows/gpu_full_h100_tests.yml
David Testé 5321f759d7 chore(ci): remove dependencies install on gpu h100 tests
This is redundant with the use of gpu_setup.yml action.
2025-05-06 14:06:17 +02:00

138 lines
5.0 KiB
YAML

# Compile and test tfhe-cuda-backend on an H100 VM on hyperstack
name: Cuda - Full tests on H100
env:
CARGO_TERM_COLOR: always
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUSTFLAGS: "-C target-cpu=native"
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
on:
workflow_dispatch:
permissions: {}
jobs:
setup-instance:
name: Setup instance (cuda-h100-tests)
runs-on: ubuntu-latest
outputs:
# Use permanent remote instance label first as on-demand remote instance label output is set before the end of start-remote-instance step.
# If the latter fails due to a failed GitHub action runner set up, we have to fallback on the permanent instance.
# Since the on-demand remote label is set before failure, we have to do the logical OR in this order,
# otherwise we'll try to run the next job on a non-existing on-demand instance.
runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label }}
remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
steps:
- name: Start remote instance
id: start-remote-instance
continue-on-error: true
uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: hyperstack
profile: single-h100
# This will allow to fallback on permanent instances running on Hyperstack.
- name: Use permanent remote instance
id: use-permanent-instance
if: env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure'
run: |
echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
cuda-tests-linux:
name: CUDA H100 tests
needs: [ setup-instance ]
concurrency:
group: ${{ github.workflow_ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
strategy:
fail-fast: false
# explicit include-based build matrix, of known valid options
matrix:
include:
- os: ubuntu-22.04
cuda: "12.2"
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
with:
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
- name: Setup Hyperstack dependencies
if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
uses: ./.github/actions/gpu_setup
with:
cuda-version: ${{ matrix.cuda }}
gcc-version: ${{ matrix.gcc }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
with:
toolchain: stable
- name: Run core crypto, integer and internal CUDA backend tests
run: |
make test_gpu
- name: Run user docs tests
run: |
make test_user_doc_gpu
- name: Test C API
run: |
make test_c_api_gpu
- name: Run High Level API Tests
run: |
make test_high_level_api_gpu
slack-notify:
name: Slack Notification
needs: [ setup-instance, cuda-tests-linux ]
runs-on: ubuntu-latest
if: ${{ failure() }}
continue-on-error: true
steps:
- name: Send message
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
env:
SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
SLACK_MESSAGE: "Full H100 tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (cuda-h100-tests)
if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
needs: [ setup-instance, cuda-tests-linux ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-instance.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (cuda-h100-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"