chore(ci): transfer all GPU CI to hyperstack

This commit is contained in:
Agnes Leroy
2024-08-06 14:05:41 +02:00
committed by Agnès Leroy
parent a26e68c3bc
commit 5340859003
11 changed files with 154 additions and 58 deletions

View File

@@ -1,5 +1,5 @@
# Compile and test tfhe-cuda-backend on an H100 VM on hyperstack # Compile and test tfhe-cuda-backend on an H100 VM on hyperstack
name: TFHE Cuda Backend - Base tests on H100 name: TFHE Cuda Backend - Fast tests on H100
env: env:
CARGO_TERM_COLOR: always CARGO_TERM_COLOR: always
@@ -49,7 +49,7 @@ jobs:
- tfhe/src/c_api/** - tfhe/src/c_api/**
- 'tfhe/docs/**.md' - 'tfhe/docs/**.md'
- Makefile - Makefile
- '.github/workflows/hyperstack**' - '.github/workflows/gpu_fast_h100_tests.yml'
- scripts/** - scripts/**
- ci/** - ci/**
@@ -109,6 +109,8 @@ jobs:
- name: Checkout tfhe-rs - name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
persist-credentials: 'false'
- name: Set up home - name: Set up home
run: | run: |
@@ -170,7 +172,7 @@ jobs:
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907 uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env: env:
SLACK_COLOR: ${{ needs.cuda-tests-linux.result }} SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
SLACK_MESSAGE: "Base H100 tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.ACTION_RUN_URL }})" SLACK_MESSAGE: "Fast H100 tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance: teardown-instance:
name: Teardown instance (cuda-h100-tests) name: Teardown instance (cuda-h100-tests)

View File

@@ -47,6 +47,10 @@ jobs:
- tfhe/src/high_level_api/** - tfhe/src/high_level_api/**
- tfhe/src/c_api/** - tfhe/src/c_api/**
- 'tfhe/docs/**.md' - 'tfhe/docs/**.md'
- '.github/workflows/gpu_fast_tests.yml'
- Makefile
- scripts/**
- ci/**
setup-instance: setup-instance:
name: Setup instance (cuda-tests) name: Setup instance (cuda-tests)
@@ -65,7 +69,7 @@ jobs:
github-token: ${{ secrets.SLAB_ACTION_TOKEN }} github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }} slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }} job-secret: ${{ secrets.JOB_SECRET }}
backend: aws backend: hyperstack
profile: gpu-test profile: gpu-test
cuda-tests-linux: cuda-tests-linux:
@@ -84,11 +88,23 @@ jobs:
include: include:
- os: ubuntu-22.04 - os: ubuntu-22.04
cuda: "12.2" cuda: "12.2"
gcc: 9 gcc: 11
env: env:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }} CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
CMAKE_VERSION: 3.29.6
steps: steps:
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
- name: Install dependencies
run: |
sudo apt update
sudo apt install -y checkinstall zlib1g-dev libssl-dev
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
cd cmake-${{ env.CMAKE_VERSION }}
./bootstrap
make -j"$(nproc)"
sudo make install
- name: Checkout tfhe-rs - name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with: with:
@@ -122,6 +138,10 @@ jobs:
echo "HOME=/home/ubuntu"; echo "HOME=/home/ubuntu";
} >> "${GITHUB_ENV}" } >> "${GITHUB_ENV}"
- name: Check device is detected
if: ${{ !cancelled() }}
run: nvidia-smi
- name: Run core crypto and internal CUDA backend tests - name: Run core crypto and internal CUDA backend tests
run: | run: |
make test_core_crypto_gpu make test_core_crypto_gpu
@@ -139,13 +159,18 @@ jobs:
run: | run: |
make test_high_level_api_gpu make test_high_level_api_gpu
- name: Slack Notification slack-notify:
if: ${{ always() }} name: Slack Notification
continue-on-error: true needs: [ setup-instance, cuda-tests-linux ]
runs-on: ubuntu-latest
if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' }}
continue-on-error: true
steps:
- name: Send message
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907 uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env: env:
SLACK_COLOR: ${{ job.status }} SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
SLACK_MESSAGE: "CUDA AWS tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})" SLACK_MESSAGE: "Base GPU tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance: teardown-instance:
name: Teardown instance (cuda-tests) name: Teardown instance (cuda-tests)

View File

@@ -49,7 +49,7 @@ jobs:
- tfhe/src/c_api/** - tfhe/src/c_api/**
- 'tfhe/docs/**.md' - 'tfhe/docs/**.md'
- Makefile - Makefile
- '.github/workflows/aws_tfhe_multi_gpu**' - '.github/workflows/**_multi_gpu_tests.yml'
- scripts/** - scripts/**
- ci/** - ci/**
@@ -71,7 +71,7 @@ jobs:
github-token: ${{ secrets.SLAB_ACTION_TOKEN }} github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }} slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }} job-secret: ${{ secrets.JOB_SECRET }}
backend: aws backend: hyperstack
profile: multi-gpu-test profile: multi-gpu-test
cuda-tests-linux: cuda-tests-linux:
@@ -90,13 +90,27 @@ jobs:
include: include:
- os: ubuntu-22.04 - os: ubuntu-22.04
cuda: "12.2" cuda: "12.2"
gcc: 9 gcc: 11
env: env:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }} CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
CMAKE_VERSION: 3.29.6
steps: steps:
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
- name: Install dependencies
run: |
sudo apt update
sudo apt install -y checkinstall zlib1g-dev libssl-dev
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
cd cmake-${{ env.CMAKE_VERSION }}
./bootstrap
make -j"$(nproc)"
sudo make install
- name: Checkout tfhe-rs - name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
persist-credentials: 'false'
- name: Set up home - name: Set up home
run: | run: |
@@ -126,30 +140,39 @@ jobs:
echo "HOME=/home/ubuntu"; echo "HOME=/home/ubuntu";
} >> "${GITHUB_ENV}" } >> "${GITHUB_ENV}"
- name: Check device is detected
if: ${{ !cancelled() }}
run: nvidia-smi
# No need to test core_crypto and classic PBS in integer since it's already tested on single GPU. # No need to test core_crypto and classic PBS in integer since it's already tested on single GPU.
- name: Run multi-bit CUDA integer tests - name: Run multi-bit CUDA integer tests
run: | run: |
make test_integer_multi_bit_gpu_ci BIG_TESTS_INSTANCE=TRUE make test_integer_multi_bit_gpu_ci
- name: Run user docs tests - name: Run user docs tests
run: | run: |
make test_user_doc_gpu BIG_TESTS_INSTANCE=TRUE make test_user_doc_gpu
- name: Test C API - name: Test C API
run: | run: |
make test_c_api_gpu BIG_TESTS_INSTANCE=TRUE make test_c_api_gpu
- name: Run High Level API Tests - name: Run High Level API Tests
run: | run: |
make test_high_level_api_gpu BIG_TESTS_INSTANCE=TRUE make test_high_level_api_gpu
- name: Slack Notification slack-notify:
if: ${{ always() }} name: Slack Notification
continue-on-error: true needs: [ setup-instance, cuda-tests-linux ]
runs-on: ubuntu-latest
if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' }}
continue-on-error: true
steps:
- name: Send message
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907 uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env: env:
SLACK_COLOR: ${{ job.status }} SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
SLACK_MESSAGE: "CUDA AWS multi-GPU tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})" SLACK_MESSAGE: "Multi-GPU tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance: teardown-instance:
name: Teardown instance (cuda-tests-multi-gpu) name: Teardown instance (cuda-tests-multi-gpu)

View File

@@ -49,7 +49,7 @@ jobs:
- tfhe/src/c_api/** - tfhe/src/c_api/**
- 'tfhe/docs/**.md' - 'tfhe/docs/**.md'
- Makefile - Makefile
- '.github/workflows/hyperstack**' - '.github/workflows/gpu_signed_integer_h100_tests.yml'
- scripts/** - scripts/**
- ci/** - ci/**

View File

@@ -56,6 +56,10 @@ jobs:
- tfhe/src/high_level_api/** - tfhe/src/high_level_api/**
- tfhe/src/c_api/** - tfhe/src/c_api/**
- 'tfhe/docs/**.md' - 'tfhe/docs/**.md'
- '.github/workflows/gpu_signed_integer_tests.yml'
- Makefile
- scripts/**
- ci/**
setup-instance: setup-instance:
name: Setup instance (cuda-signed-integer-tests) name: Setup instance (cuda-signed-integer-tests)
@@ -75,7 +79,7 @@ jobs:
github-token: ${{ secrets.SLAB_ACTION_TOKEN }} github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }} slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }} job-secret: ${{ secrets.JOB_SECRET }}
backend: aws backend: hyperstack
profile: gpu-test profile: gpu-test
cuda-signed-integer-tests: cuda-signed-integer-tests:
@@ -94,13 +98,27 @@ jobs:
include: include:
- os: ubuntu-22.04 - os: ubuntu-22.04
cuda: "12.2" cuda: "12.2"
gcc: 9 gcc: 11
env: env:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }} CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
CMAKE_VERSION: 3.29.6
steps: steps:
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
- name: Install dependencies
run: |
sudo apt update
sudo apt install -y checkinstall zlib1g-dev libssl-dev
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
cd cmake-${{ env.CMAKE_VERSION }}
./bootstrap
make -j"$(nproc)"
sudo make install
- name: Checkout tfhe-rs - name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
persist-credentials: 'false'
- name: Set up home - name: Set up home
run: | run: |
@@ -138,17 +156,26 @@ jobs:
echo "NIGHTLY_TESTS=TRUE"; echo "NIGHTLY_TESTS=TRUE";
} >> "${GITHUB_ENV}" } >> "${GITHUB_ENV}"
- name: Check device is detected
if: ${{ !cancelled() }}
run: nvidia-smi
- name: Run signed integer multi-bit tests - name: Run signed integer multi-bit tests
run: | run: |
make test_signed_integer_multi_bit_gpu_ci make test_signed_integer_multi_bit_gpu_ci
- name: Slack Notification slack-notify:
if: ${{ always() }} name: Slack Notification
continue-on-error: true needs: [ setup-instance, cuda-signed-integer-tests ]
runs-on: ubuntu-latest
if: ${{ always() && needs.cuda-signed-integer-tests.result != 'skipped' }}
continue-on-error: true
steps:
- name: Send message
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907 uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env: env:
SLACK_COLOR: ${{ job.status }} SLACK_COLOR: ${{ needs.cuda-signed-integer-tests.result }}
SLACK_MESSAGE: "CUDA AWS signed integer tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})" SLACK_MESSAGE: "Signed integer GPU tests finished with status: ${{ needs.cuda-signed-integer-tests.result }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance: teardown-instance:
name: Teardown instance (cuda-tests) name: Teardown instance (cuda-tests)

View File

@@ -49,7 +49,7 @@ jobs:
- tfhe/src/c_api/** - tfhe/src/c_api/**
- 'tfhe/docs/**.md' - 'tfhe/docs/**.md'
- Makefile - Makefile
- '.github/workflows/hyperstack**' - '.github/workflows/gpu_unsigned_integer_h100_tests.yml'
- scripts/** - scripts/**
- ci/** - ci/**

View File

@@ -55,6 +55,10 @@ jobs:
- tfhe/src/high_level_api/** - tfhe/src/high_level_api/**
- tfhe/src/c_api/** - tfhe/src/c_api/**
- 'tfhe/docs/**.md' - 'tfhe/docs/**.md'
- '.github/workflows/gpu_unsigned_integer_tests.yml'
- Makefile
- scripts/**
- ci/**
setup-instance: setup-instance:
name: Setup instance (cuda-unsigned-integer-tests) name: Setup instance (cuda-unsigned-integer-tests)
@@ -74,7 +78,7 @@ jobs:
github-token: ${{ secrets.SLAB_ACTION_TOKEN }} github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }} slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }} job-secret: ${{ secrets.JOB_SECRET }}
backend: aws backend: hyperstack
profile: gpu-test profile: gpu-test
cuda-unsigned-integer-tests: cuda-unsigned-integer-tests:
@@ -93,11 +97,23 @@ jobs:
include: include:
- os: ubuntu-22.04 - os: ubuntu-22.04
cuda: "12.2" cuda: "12.2"
gcc: 9 gcc: 11
env: env:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }} CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
CMAKE_VERSION: 3.29.6
steps: steps:
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
- name: Install dependencies
run: |
sudo apt update
sudo apt install -y checkinstall zlib1g-dev libssl-dev
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
cd cmake-${{ env.CMAKE_VERSION }}
./bootstrap
make -j"$(nproc)"
sudo make install
- name: Checkout tfhe-rs - name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
@@ -137,17 +153,26 @@ jobs:
echo "NIGHTLY_TESTS=TRUE"; echo "NIGHTLY_TESTS=TRUE";
} >> "${GITHUB_ENV}" } >> "${GITHUB_ENV}"
- name: Check device is detected
if: ${{ !cancelled() }}
run: nvidia-smi
- name: Run unsigned integer multi-bit tests - name: Run unsigned integer multi-bit tests
run: | run: |
make test_unsigned_integer_multi_bit_gpu_ci make test_unsigned_integer_multi_bit_gpu_ci
- name: Slack Notification slack-notify:
if: ${{ always() }} name: Slack Notification
continue-on-error: true needs: [ setup-instance, cuda-unsigned-integer-tests ]
runs-on: ubuntu-latest
if: ${{ always() && needs.cuda-unsigned-integer-tests.result != 'skipped' }}
continue-on-error: true
steps:
- name: Send message
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907 uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env: env:
SLACK_COLOR: ${{ job.status }} SLACK_COLOR: ${{ needs.cuda-unsigned-integer-tests.result }}
SLACK_MESSAGE: "CUDA AWS unsigned integer tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})" SLACK_MESSAGE: "Unsigned integer GPU tests finished with status: ${{ needs.cuda-unsigned-integer-tests.result }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance: teardown-instance:
name: Teardown instance (cuda-tests) name: Teardown instance (cuda-tests)

View File

@@ -30,13 +30,10 @@ region = "us-east-1"
image_id = "ami-06b3d61f41bf8350a" image_id = "ami-06b3d61f41bf8350a"
instance_type = "m6i.4xlarge" instance_type = "m6i.4xlarge"
[backend.aws.gpu-test] [backend.hyperstack.gpu-test]
region = "us-east-1" environment_name = "canada"
image_id = "ami-06b3d61f41bf8350a" image_name = "Ubuntu Server 22.04 LTS R535 CUDA 12.2"
instance_type = "p3.2xlarge" flavor_name = "n3-RTX-A6000x1"
# One spawn attempt every 30 seconds for 1 hour
spawn_retry_attempts = 120
spawn_retry_duration = 60
[backend.hyperstack.single-h100] [backend.hyperstack.single-h100]
environment_name = "canada" environment_name = "canada"
@@ -58,13 +55,10 @@ environment_name = "canada"
image_name = "Ubuntu Server 22.04 LTS R535 CUDA 12.2" image_name = "Ubuntu Server 22.04 LTS R535 CUDA 12.2"
flavor_name = "n3-A100x8-NVLink" flavor_name = "n3-A100x8-NVLink"
[backend.aws.multi-gpu-test] [backend.hyperstack.multi-gpu-test]
region = "us-east-1" environment_name = "canada"
image_id = "ami-06b3d61f41bf8350a" image_name = "Ubuntu Server 22.04 LTS R535 CUDA 12.2"
instance_type = "p3.8xlarge" flavor_name = "n3-A100x4"
# One spawn attempt every 30 seconds for 1 hour
spawn_retry_attempts = 120
spawn_retry_duration = 60
[command.signed_integer_full_bench] [command.signed_integer_full_bench]
workflow = "signed_integer_full_benchmark.yml" workflow = "signed_integer_full_benchmark.yml"

View File

@@ -130,8 +130,8 @@ fi
# Override test-threads number to avoid Out-of-memory issues on GPU instances # Override test-threads number to avoid Out-of-memory issues on GPU instances
if [[ "${backend}" == "gpu" ]]; then if [[ "${backend}" == "gpu" ]]; then
if [[ "${BIG_TESTS_INSTANCE}" == TRUE ]]; then if [[ "${BIG_TESTS_INSTANCE}" == TRUE ]]; then
test_threads=5 test_threads=8
doctest_threads=5 doctest_threads=8
else else
test_threads=3 test_threads=3
doctest_threads=3 doctest_threads=3