diff --git a/.github/workflows/aws_tfhe_gpu_4090_tests.yml b/.github/workflows/gpu_4090_tests.yml similarity index 100% rename from .github/workflows/aws_tfhe_gpu_4090_tests.yml rename to .github/workflows/gpu_4090_tests.yml diff --git a/.github/workflows/hyperstack_tfhe_gpu_tests.yml b/.github/workflows/gpu_fast_h100_tests.yml similarity index 96% rename from .github/workflows/hyperstack_tfhe_gpu_tests.yml rename to .github/workflows/gpu_fast_h100_tests.yml index 33195ca04..0c3411233 100644 --- a/.github/workflows/hyperstack_tfhe_gpu_tests.yml +++ b/.github/workflows/gpu_fast_h100_tests.yml @@ -1,5 +1,5 @@ # Compile and test tfhe-cuda-backend on an H100 VM on hyperstack -name: TFHE Cuda Backend - Base tests on H100 +name: TFHE Cuda Backend - Fast tests on H100 env: CARGO_TERM_COLOR: always @@ -49,7 +49,7 @@ jobs: - tfhe/src/c_api/** - 'tfhe/docs/**.md' - Makefile - - '.github/workflows/hyperstack**' + - '.github/workflows/gpu_fast_h100_tests.yml' - scripts/** - ci/** @@ -109,6 +109,8 @@ jobs: - name: Checkout tfhe-rs uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 + with: + persist-credentials: 'false' - name: Set up home run: | @@ -170,7 +172,7 @@ jobs: uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907 env: SLACK_COLOR: ${{ needs.cuda-tests-linux.result }} - SLACK_MESSAGE: "Base H100 tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.ACTION_RUN_URL }})" + SLACK_MESSAGE: "Fast H100 tests finished with status: ${{ needs.cuda-tests-linux.result }}. 
(${{ env.ACTION_RUN_URL }})" teardown-instance: name: Teardown instance (cuda-h100-tests) diff --git a/.github/workflows/aws_tfhe_gpu_tests.yml b/.github/workflows/gpu_fast_tests.yml similarity index 81% rename from .github/workflows/aws_tfhe_gpu_tests.yml rename to .github/workflows/gpu_fast_tests.yml index 7f6908e3b..60b982950 100644 --- a/.github/workflows/aws_tfhe_gpu_tests.yml +++ b/.github/workflows/gpu_fast_tests.yml @@ -47,6 +47,10 @@ jobs: - tfhe/src/high_level_api/** - tfhe/src/c_api/** - 'tfhe/docs/**.md' + - '.github/workflows/gpu_fast_tests.yml' + - Makefile + - scripts/** + - ci/** setup-instance: name: Setup instance (cuda-tests) @@ -65,7 +69,7 @@ jobs: github-token: ${{ secrets.SLAB_ACTION_TOKEN }} slab-url: ${{ secrets.SLAB_BASE_URL }} job-secret: ${{ secrets.JOB_SECRET }} - backend: aws + backend: hyperstack profile: gpu-test cuda-tests-linux: @@ -84,11 +88,23 @@ jobs: include: - os: ubuntu-22.04 cuda: "12.2" - gcc: 9 + gcc: 11 env: CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }} - + CMAKE_VERSION: 3.29.6 steps: + # Mandatory on hyperstack since a bootable volume is not re-usable yet. 
+ - name: Install dependencies + run: | + sudo apt update + sudo apt install -y checkinstall zlib1g-dev libssl-dev + wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz + tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz + cd cmake-${{ env.CMAKE_VERSION }} + ./bootstrap + make -j"$(nproc)" + sudo make install + - name: Checkout tfhe-rs uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 with: @@ -122,6 +138,10 @@ jobs: echo "HOME=/home/ubuntu"; } >> "${GITHUB_ENV}" + - name: Check device is detected + if: ${{ !cancelled() }} + run: nvidia-smi + - name: Run core crypto and internal CUDA backend tests run: | make test_core_crypto_gpu @@ -139,13 +159,18 @@ jobs: run: | make test_high_level_api_gpu - - name: Slack Notification - if: ${{ always() }} - continue-on-error: true + slack-notify: + name: Slack Notification + needs: [ setup-instance, cuda-tests-linux ] + runs-on: ubuntu-latest + if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' }} + continue-on-error: true + steps: + - name: Send message uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907 env: - SLACK_COLOR: ${{ job.status }} - SLACK_MESSAGE: "CUDA AWS tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})" + SLACK_COLOR: ${{ needs.cuda-tests-linux.result }} + SLACK_MESSAGE: "Base GPU tests finished with status: ${{ needs.cuda-tests-linux.result }}. 
(${{ env.ACTION_RUN_URL }})" teardown-instance: name: Teardown instance (cuda-tests) diff --git a/.github/workflows/aws_tfhe_multi_gpu_tests.yml b/.github/workflows/gpu_full_multi_gpu_tests.yml similarity index 79% rename from .github/workflows/aws_tfhe_multi_gpu_tests.yml rename to .github/workflows/gpu_full_multi_gpu_tests.yml index 5edeb280d..2c5bc5690 100644 --- a/.github/workflows/aws_tfhe_multi_gpu_tests.yml +++ b/.github/workflows/gpu_full_multi_gpu_tests.yml @@ -49,7 +49,7 @@ jobs: - tfhe/src/c_api/** - 'tfhe/docs/**.md' - Makefile - - '.github/workflows/aws_tfhe_multi_gpu**' + - '.github/workflows/**_multi_gpu_tests.yml' - scripts/** - ci/** @@ -71,7 +71,7 @@ jobs: github-token: ${{ secrets.SLAB_ACTION_TOKEN }} slab-url: ${{ secrets.SLAB_BASE_URL }} job-secret: ${{ secrets.JOB_SECRET }} - backend: aws + backend: hyperstack profile: multi-gpu-test cuda-tests-linux: @@ -90,13 +90,27 @@ jobs: include: - os: ubuntu-22.04 cuda: "12.2" - gcc: 9 + gcc: 11 env: CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }} - + CMAKE_VERSION: 3.29.6 steps: + # Mandatory on hyperstack since a bootable volume is not re-usable yet. + - name: Install dependencies + run: | + sudo apt update + sudo apt install -y checkinstall zlib1g-dev libssl-dev + wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz + tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz + cd cmake-${{ env.CMAKE_VERSION }} + ./bootstrap + make -j"$(nproc)" + sudo make install + - name: Checkout tfhe-rs uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 + with: + persist-credentials: 'false' - name: Set up home run: | @@ -126,30 +140,39 @@ jobs: echo "HOME=/home/ubuntu"; } >> "${GITHUB_ENV}" + - name: Check device is detected + if: ${{ !cancelled() }} + run: nvidia-smi + # No need to test core_crypto and classic PBS in integer since it's already tested on single GPU. 
- name: Run multi-bit CUDA integer tests run: | - make test_integer_multi_bit_gpu_ci + BIG_TESTS_INSTANCE=TRUE make test_integer_multi_bit_gpu_ci - name: Run user docs tests run: | - make test_user_doc_gpu + BIG_TESTS_INSTANCE=TRUE make test_user_doc_gpu - name: Test C API run: | - make test_c_api_gpu + BIG_TESTS_INSTANCE=TRUE make test_c_api_gpu - name: Run High Level API Tests run: | - make test_high_level_api_gpu + BIG_TESTS_INSTANCE=TRUE make test_high_level_api_gpu - - name: Slack Notification - if: ${{ always() }} - continue-on-error: true + slack-notify: + name: Slack Notification + needs: [ setup-instance, cuda-tests-linux ] + runs-on: ubuntu-latest + if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' }} + continue-on-error: true + steps: + - name: Send message uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907 env: - SLACK_COLOR: ${{ job.status }} - SLACK_MESSAGE: "CUDA AWS multi-GPU tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})" + SLACK_COLOR: ${{ needs.cuda-tests-linux.result }} + SLACK_MESSAGE: "Multi-GPU tests finished with status: ${{ needs.cuda-tests-linux.result }}. 
(${{ env.ACTION_RUN_URL }})" teardown-instance: name: Teardown instance (cuda-tests-multi-gpu) diff --git a/.github/workflows/aws_tfhe_gpu_pcc.yml b/.github/workflows/gpu_pcc.yml similarity index 100% rename from .github/workflows/aws_tfhe_gpu_pcc.yml rename to .github/workflows/gpu_pcc.yml diff --git a/.github/workflows/hyperstack_tfhe_signed_integer_gpu_tests.yml b/.github/workflows/gpu_signed_integer_h100_tests.yml similarity index 98% rename from .github/workflows/hyperstack_tfhe_signed_integer_gpu_tests.yml rename to .github/workflows/gpu_signed_integer_h100_tests.yml index 2b3fb27e2..4e19a19b8 100644 --- a/.github/workflows/hyperstack_tfhe_signed_integer_gpu_tests.yml +++ b/.github/workflows/gpu_signed_integer_h100_tests.yml @@ -49,7 +49,7 @@ jobs: - tfhe/src/c_api/** - 'tfhe/docs/**.md' - Makefile - - '.github/workflows/hyperstack**' + - '.github/workflows/gpu_signed_integer_h100_tests.yml' - scripts/** - ci/** diff --git a/.github/workflows/aws_tfhe_signed_integer_gpu_tests.yml b/.github/workflows/gpu_signed_integer_tests.yml similarity index 81% rename from .github/workflows/aws_tfhe_signed_integer_gpu_tests.yml rename to .github/workflows/gpu_signed_integer_tests.yml index b4dc8dc92..e9e73b95e 100644 --- a/.github/workflows/aws_tfhe_signed_integer_gpu_tests.yml +++ b/.github/workflows/gpu_signed_integer_tests.yml @@ -56,6 +56,10 @@ jobs: - tfhe/src/high_level_api/** - tfhe/src/c_api/** - 'tfhe/docs/**.md' + - '.github/workflows/gpu_signed_integer_tests.yml' + - Makefile + - scripts/** + - ci/** setup-instance: name: Setup instance (cuda-signed-integer-tests) @@ -75,7 +79,7 @@ jobs: github-token: ${{ secrets.SLAB_ACTION_TOKEN }} slab-url: ${{ secrets.SLAB_BASE_URL }} job-secret: ${{ secrets.JOB_SECRET }} - backend: aws + backend: hyperstack profile: gpu-test cuda-signed-integer-tests: @@ -94,13 +98,27 @@ jobs: include: - os: ubuntu-22.04 cuda: "12.2" - gcc: 9 + gcc: 11 env: CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }} - + CMAKE_VERSION: 3.29.6 steps: + # 
Mandatory on hyperstack since a bootable volume is not re-usable yet. + - name: Install dependencies + run: | + sudo apt update + sudo apt install -y checkinstall zlib1g-dev libssl-dev + wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz + tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz + cd cmake-${{ env.CMAKE_VERSION }} + ./bootstrap + make -j"$(nproc)" + sudo make install + - name: Checkout tfhe-rs uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 + with: + persist-credentials: 'false' - name: Set up home run: | @@ -138,17 +156,26 @@ jobs: echo "NIGHTLY_TESTS=TRUE"; } >> "${GITHUB_ENV}" + - name: Check device is detected + if: ${{ !cancelled() }} + run: nvidia-smi + - name: Run signed integer multi-bit tests run: | make test_signed_integer_multi_bit_gpu_ci - - name: Slack Notification - if: ${{ always() }} - continue-on-error: true + slack-notify: + name: Slack Notification + needs: [ setup-instance, cuda-signed-integer-tests ] + runs-on: ubuntu-latest + if: ${{ always() && needs.cuda-signed-integer-tests.result != 'skipped' }} + continue-on-error: true + steps: + - name: Send message uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907 env: - SLACK_COLOR: ${{ job.status }} - SLACK_MESSAGE: "CUDA AWS signed integer tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})" + SLACK_COLOR: ${{ needs.cuda-signed-integer-tests.result }} + SLACK_MESSAGE: "Signed integer GPU tests finished with status: ${{ needs.cuda-signed-integer-tests.result }}. 
(${{ env.ACTION_RUN_URL }})" teardown-instance: name: Teardown instance (cuda-tests) diff --git a/.github/workflows/hyperstack_tfhe_unsigned_integer_gpu_tests.yml b/.github/workflows/gpu_unsigned_integer_h100_tests.yml similarity index 99% rename from .github/workflows/hyperstack_tfhe_unsigned_integer_gpu_tests.yml rename to .github/workflows/gpu_unsigned_integer_h100_tests.yml index e82fffdd6..25796846a 100644 --- a/.github/workflows/hyperstack_tfhe_unsigned_integer_gpu_tests.yml +++ b/.github/workflows/gpu_unsigned_integer_h100_tests.yml @@ -49,7 +49,7 @@ jobs: - tfhe/src/c_api/** - 'tfhe/docs/**.md' - Makefile - - '.github/workflows/hyperstack**' + - '.github/workflows/gpu_unsigned_integer_h100_tests.yml' - scripts/** - ci/** diff --git a/.github/workflows/aws_tfhe_integer_gpu_tests.yml b/.github/workflows/gpu_unsigned_integer_tests.yml similarity index 81% rename from .github/workflows/aws_tfhe_integer_gpu_tests.yml rename to .github/workflows/gpu_unsigned_integer_tests.yml index 2cee339d8..1b0286db4 100644 --- a/.github/workflows/aws_tfhe_integer_gpu_tests.yml +++ b/.github/workflows/gpu_unsigned_integer_tests.yml @@ -55,6 +55,10 @@ jobs: - tfhe/src/high_level_api/** - tfhe/src/c_api/** - 'tfhe/docs/**.md' + - '.github/workflows/gpu_unsigned_integer_tests.yml' + - Makefile + - scripts/** + - ci/** setup-instance: name: Setup instance (cuda-unsigned-integer-tests) @@ -74,7 +78,7 @@ jobs: github-token: ${{ secrets.SLAB_ACTION_TOKEN }} slab-url: ${{ secrets.SLAB_BASE_URL }} job-secret: ${{ secrets.JOB_SECRET }} - backend: aws + backend: hyperstack profile: gpu-test cuda-unsigned-integer-tests: @@ -93,11 +97,23 @@ jobs: include: - os: ubuntu-22.04 cuda: "12.2" - gcc: 9 + gcc: 11 env: CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }} - + CMAKE_VERSION: 3.29.6 steps: + # Mandatory on hyperstack since a bootable volume is not re-usable yet. 
+ - name: Install dependencies + run: | + sudo apt update + sudo apt install -y checkinstall zlib1g-dev libssl-dev + wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz + tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz + cd cmake-${{ env.CMAKE_VERSION }} + ./bootstrap + make -j"$(nproc)" + sudo make install + - name: Checkout tfhe-rs uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 @@ -137,17 +153,26 @@ jobs: echo "NIGHTLY_TESTS=TRUE"; } >> "${GITHUB_ENV}" + - name: Check device is detected + if: ${{ !cancelled() }} + run: nvidia-smi + - name: Run unsigned integer multi-bit tests run: | make test_unsigned_integer_multi_bit_gpu_ci - - name: Slack Notification - if: ${{ always() }} - continue-on-error: true + slack-notify: + name: Slack Notification + needs: [ setup-instance, cuda-unsigned-integer-tests ] + runs-on: ubuntu-latest + if: ${{ always() && needs.cuda-unsigned-integer-tests.result != 'skipped' }} + continue-on-error: true + steps: + - name: Send message uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907 env: - SLACK_COLOR: ${{ job.status }} - SLACK_MESSAGE: "CUDA AWS unsigned integer tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})" + SLACK_COLOR: ${{ needs.cuda-unsigned-integer-tests.result }} + SLACK_MESSAGE: "Unsigned integer GPU tests finished with status: ${{ needs.cuda-unsigned-integer-tests.result }}. 
(${{ env.ACTION_RUN_URL }})" teardown-instance: name: Teardown instance (cuda-tests) diff --git a/ci/slab.toml b/ci/slab.toml index 8c6179be5..2974ff94c 100644 --- a/ci/slab.toml +++ b/ci/slab.toml @@ -30,13 +30,10 @@ region = "us-east-1" image_id = "ami-06b3d61f41bf8350a" instance_type = "m6i.4xlarge" -[backend.aws.gpu-test] -region = "us-east-1" -image_id = "ami-06b3d61f41bf8350a" -instance_type = "p3.2xlarge" -# One spawn attempt every 30 seconds for 1 hour -spawn_retry_attempts = 120 -spawn_retry_duration = 60 +[backend.hyperstack.gpu-test] +environment_name = "canada" +image_name = "Ubuntu Server 22.04 LTS R535 CUDA 12.2" +flavor_name = "n3-RTX-A6000x1" [backend.hyperstack.single-h100] environment_name = "canada" @@ -58,13 +55,10 @@ environment_name = "canada" image_name = "Ubuntu Server 22.04 LTS R535 CUDA 12.2" flavor_name = "n3-A100x8-NVLink" -[backend.aws.multi-gpu-test] -region = "us-east-1" -image_id = "ami-06b3d61f41bf8350a" -instance_type = "p3.8xlarge" -# One spawn attempt every 30 seconds for 1 hour -spawn_retry_attempts = 120 -spawn_retry_duration = 60 +[backend.hyperstack.multi-gpu-test] +environment_name = "canada" +image_name = "Ubuntu Server 22.04 LTS R535 CUDA 12.2" +flavor_name = "n3-A100x4" [command.signed_integer_full_bench] workflow = "signed_integer_full_benchmark.yml" diff --git a/scripts/integer-tests.sh b/scripts/integer-tests.sh index e6a5e6d98..25b288607 100755 --- a/scripts/integer-tests.sh +++ b/scripts/integer-tests.sh @@ -130,8 +130,8 @@ fi # Override test-threads number to avoid Out-of-memory issues on GPU instances if [[ "${backend}" == "gpu" ]]; then if [[ "${BIG_TESTS_INSTANCE}" == TRUE ]]; then - test_threads=5 - doctest_threads=5 + test_threads=8 + doctest_threads=8 else test_threads=3 doctest_threads=3