mirror of
https://github.com/zama-ai/tfhe-rs.git
synced 2026-01-07 22:04:10 -05:00
chore(gpu): add compute-sanitizer run on H100
This commit is contained in:
2
.github/workflows/gpu_memory_sanitizer.yml
vendored
2
.github/workflows/gpu_memory_sanitizer.yml
vendored
@@ -148,4 +148,4 @@ jobs:
|
||||
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "Instance teardown (cuda-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_MESSAGE: "Instance teardown (gpu-memory-sanitizer) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
151
.github/workflows/gpu_memory_sanitizer_h100.yml
vendored
Normal file
151
.github/workflows/gpu_memory_sanitizer_h100.yml
vendored
Normal file
@@ -0,0 +1,151 @@
|
||||
# Compile and test tfhe-cuda-backend on an AWS instance
|
||||
name: gpu_memory_sanitizer_h100
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUSTFLAGS: "-C target-cpu=native"
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
SLACKIFY_MARKDOWN: true
|
||||
IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
|
||||
PULL_REQUEST_MD_LINK: ""
|
||||
CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
|
||||
# Secrets will be available only to zama-ai organization members
|
||||
SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }}
|
||||
EXTERNAL_CONTRIBUTION_RUNNER: "gpu_ubuntu-22.04"
|
||||
|
||||
on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
pull_request:
|
||||
types: [ labeled ]
|
||||
workflow_dispatch:
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
# zizmor: ignore[concurrency-limits] concurrency is managed after instance setup to ensure safe provisioning
|
||||
|
||||
jobs:
|
||||
setup-instance:
|
||||
name: gpu_memory_sanitizer/setup-instance
|
||||
runs-on: ubuntu-latest
|
||||
if: github.event_name != 'pull_request' ||
|
||||
(github.event.action == 'labeled' && github.event.label.name == 'approved')
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
|
||||
steps:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
backend: hyperstack
|
||||
profile: single-h100
|
||||
|
||||
# This instance will be spawned especially for pull-request from forked repository
|
||||
- name: Start GitHub instance
|
||||
id: start-github-instance
|
||||
if: env.SECRETS_AVAILABLE == 'false'
|
||||
run: |
|
||||
echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"
|
||||
|
||||
cuda-tests-linux:
|
||||
name: gpu_memory_sanitizer/cuda-tests-linux
|
||||
needs: [ setup-instance ]
|
||||
if: github.event_name != 'pull_request' ||
|
||||
(github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
|
||||
concurrency:
|
||||
group: ${{ github.workflow_ref }}
|
||||
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
|
||||
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
timeout-minutes: 240
|
||||
strategy:
|
||||
fail-fast: false
|
||||
# explicit include-based build matrix, of known valid options
|
||||
matrix:
|
||||
include:
|
||||
- os: ubuntu-22.04
|
||||
cuda: "12.8"
|
||||
gcc: 11
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Setup Hyperstack dependencies
|
||||
uses: ./.github/actions/gpu_setup
|
||||
with:
|
||||
cuda-version: ${{ matrix.cuda }}
|
||||
gcc-version: ${{ matrix.gcc }}
|
||||
github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
|
||||
|
||||
- name: Find tools
|
||||
run: |
|
||||
find /usr -executable -name "compute-sanitizer"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
- name: Run memory sanitizer
|
||||
run: |
|
||||
make test_high_level_api_gpu_sanitizer
|
||||
|
||||
slack-notify:
|
||||
name: gpu_memory_sanitizer/slack-notify
|
||||
needs: [ setup-instance, cuda-tests-linux ]
|
||||
runs-on: ubuntu-latest
|
||||
if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' && failure() }}
|
||||
continue-on-error: true
|
||||
steps:
|
||||
- name: Set pull-request URL
|
||||
if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
|
||||
run: |
|
||||
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
|
||||
env:
|
||||
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
|
||||
PR_NUMBER: ${{ github.event.pull_request.number }}
|
||||
|
||||
- name: Send message
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
|
||||
env:
|
||||
SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
|
||||
SLACK_MESSAGE: "GPU Memory Checks tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
|
||||
|
||||
teardown-instance:
|
||||
name: gpu_memory_sanitizer/teardown-instance
|
||||
if: ${{ always() && needs.setup-instance.result == 'success' }}
|
||||
needs: [ setup-instance, cuda-tests-linux ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Stop remote instance
|
||||
id: stop-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
label: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "Instance teardown (gpu-memory-sanitizer-h100) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
Reference in New Issue
Block a user