chore(ci): transfer all GPU CI to hyperstack

2026-01-09 22:57:59 -05:00 · 2024-08-06 14:05:41 +02:00
parent a26e68c3bc
commit 5340859003
11 changed files with 154 additions and 58 deletions
--- a/.github/workflows/aws_tfhe_gpu_4090_tests.yml
+++ b/.github/workflows/aws_tfhe_gpu_4090_tests.yml
--- a/.github/workflows/hyperstack_tfhe_gpu_tests.yml
+++ b/.github/workflows/hyperstack_tfhe_gpu_tests.yml
@@ -1,5 +1,5 @@
 # Compile and test tfhe-cuda-backend on an H100 VM on hyperstack
-name: TFHE Cuda Backend - Base tests on H100
+name: TFHE Cuda Backend - Fast tests on H100
 env:
  CARGO_TERM_COLOR: always
@@ -49,7 +49,7 @@ jobs:
              - tfhe/src/c_api/**
              - 'tfhe/docs/**.md'
              - Makefile
-              - '.github/workflows/hyperstack**'
+              - '.github/workflows/gpu_fast_h100_tests.yml'
              - scripts/**
              - ci/**
@@ -109,6 +109,8 @@ jobs:
      - name: Checkout tfhe-rs
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          persist-credentials: 'false'
      - name: Set up home
        run: |
@@ -170,7 +172,7 @@ jobs:
        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
-          SLACK_MESSAGE: "Base H100 tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Fast H100 tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.ACTION_RUN_URL }})"
  teardown-instance:
    name: Teardown instance (cuda-h100-tests)
--- a/.github/workflows/aws_tfhe_gpu_tests.yml
+++ b/.github/workflows/aws_tfhe_gpu_tests.yml
@@ -47,6 +47,10 @@ jobs:
              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
              - 'tfhe/docs/**.md'
              - '.github/workflows/gpu_fast_tests.yml'
              - Makefile
              - scripts/**
              - ci/**
  setup-instance:
    name: Setup instance (cuda-tests)
@@ -65,7 +69,7 @@ jobs:
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
          slab-url: ${{ secrets.SLAB_BASE_URL }}
          job-secret: ${{ secrets.JOB_SECRET }}
-          backend: aws
+          backend: hyperstack
          profile: gpu-test
  cuda-tests-linux:
@@ -84,11 +88,23 @@ jobs:
        include:
          - os: ubuntu-22.04
            cuda: "12.2"
-            gcc: 9
+            gcc: 11
    env:
      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
-
+      CMAKE_VERSION: 3.29.6
    steps:
      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
      - name: Install dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y checkinstall zlib1g-dev libssl-dev
          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
          cd cmake-${{ env.CMAKE_VERSION }}
          ./bootstrap
          make -j"$(nproc)"
          sudo make install
      - name: Checkout tfhe-rs
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
@@ -122,6 +138,10 @@ jobs:
            echo "HOME=/home/ubuntu";
          } >> "${GITHUB_ENV}"
      - name: Check device is detected
        if: ${{ !cancelled() }}
        run: nvidia-smi
      - name: Run core crypto and internal CUDA backend tests
        run: |
          make test_core_crypto_gpu
@@ -139,13 +159,18 @@ jobs:
        run: |
          make test_high_level_api_gpu
-      - name: Slack Notification
+  slack-notify:
-        if: ${{ always() }}
+    name: Slack Notification
-        continue-on-error: true
+    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
    if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' }}
    continue-on-error: true
    steps:
      - name: Send message
        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
-          SLACK_COLOR: ${{ job.status }}
+          SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
-          SLACK_MESSAGE: "CUDA AWS tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Base GPU tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.ACTION_RUN_URL }})"
  teardown-instance:
    name: Teardown instance (cuda-tests)
--- a/.github/workflows/gpu_full_multi_gpu_tests.yml
+++ b/.github/workflows/gpu_full_multi_gpu_tests.yml
@@ -49,7 +49,7 @@ jobs:
              - tfhe/src/c_api/**
              - 'tfhe/docs/**.md'
              - Makefile
-              - '.github/workflows/aws_tfhe_multi_gpu**'
+              - '.github/workflows/**_multi_gpu_tests.yml'
              - scripts/**
              - ci/**
@@ -71,7 +71,7 @@ jobs:
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
          slab-url: ${{ secrets.SLAB_BASE_URL }}
          job-secret: ${{ secrets.JOB_SECRET }}
-          backend: aws
+          backend: hyperstack
          profile: multi-gpu-test
  cuda-tests-linux:
@@ -90,13 +90,27 @@ jobs:
        include:
          - os: ubuntu-22.04
            cuda: "12.2"
-            gcc: 9
+            gcc: 11
    env:
      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
-
+      CMAKE_VERSION: 3.29.6
    steps:
      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
      - name: Install dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y checkinstall zlib1g-dev libssl-dev
          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
          cd cmake-${{ env.CMAKE_VERSION }}
          ./bootstrap
          make -j"$(nproc)"
          sudo make install
      - name: Checkout tfhe-rs
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          persist-credentials: 'false'
      - name: Set up home
        run: |
@@ -126,30 +140,39 @@ jobs:
            echo "HOME=/home/ubuntu";
          } >> "${GITHUB_ENV}"
      - name: Check device is detected
        if: ${{ !cancelled() }}
        run: nvidia-smi
      # No need to test core_crypto and classic PBS in integer since it's already tested on single GPU.
      - name: Run multi-bit CUDA integer tests
        run: |
-          make test_integer_multi_bit_gpu_ci
+          BIG_TESTS_INSTANCE=TRUE make test_integer_multi_bit_gpu_ci
      - name: Run user docs tests
        run: |
-          make test_user_doc_gpu
+          BIG_TESTS_INSTANCE=TRUE make test_user_doc_gpu
      - name: Test C API
        run: |
-          make test_c_api_gpu
+          BIG_TESTS_INSTANCE=TRUE make test_c_api_gpu
      - name: Run High Level API Tests
        run: |
-          make test_high_level_api_gpu
+          BIG_TESTS_INSTANCE=TRUE make test_high_level_api_gpu
-      - name: Slack Notification
+  slack-notify:
-        if: ${{ always() }}
+    name: Slack Notification
-        continue-on-error: true
+    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
    if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' }}
    continue-on-error: true
    steps:
      - name: Send message
        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
-          SLACK_COLOR: ${{ job.status }}
+          SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
-          SLACK_MESSAGE: "CUDA AWS multi-GPU tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Multi-GPU tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.ACTION_RUN_URL }})"
  teardown-instance:
    name: Teardown instance (cuda-tests-multi-gpu)
--- a/.github/workflows/aws_tfhe_gpu_pcc.yml
+++ b/.github/workflows/aws_tfhe_gpu_pcc.yml
--- a/.github/workflows/hyperstack_tfhe_signed_integer_gpu_tests.yml
+++ b/.github/workflows/hyperstack_tfhe_signed_integer_gpu_tests.yml
@@ -49,7 +49,7 @@ jobs:
              - tfhe/src/c_api/**
              - 'tfhe/docs/**.md'
              - Makefile
-              - '.github/workflows/hyperstack**'
+              - '.github/workflows/gpu_signed_integer_h100_tests.yml'
              - scripts/**
              - ci/**
--- a/.github/workflows/aws_tfhe_signed_integer_gpu_tests.yml
+++ b/.github/workflows/aws_tfhe_signed_integer_gpu_tests.yml
@@ -56,6 +56,10 @@ jobs:
              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
              - 'tfhe/docs/**.md'
              - '.github/workflows/gpu_signed_integer_tests.yml'
              - Makefile
              - scripts/**
              - ci/**
  setup-instance:
    name: Setup instance (cuda-signed-integer-tests)
@@ -75,7 +79,7 @@ jobs:
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
          slab-url: ${{ secrets.SLAB_BASE_URL }}
          job-secret: ${{ secrets.JOB_SECRET }}
-          backend: aws
+          backend: hyperstack
          profile: gpu-test
  cuda-signed-integer-tests:
@@ -94,13 +98,27 @@ jobs:
        include:
          - os: ubuntu-22.04
            cuda: "12.2"
-            gcc: 9
+            gcc: 11
    env:
      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
-
+      CMAKE_VERSION: 3.29.6
    steps:
      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
      - name: Install dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y checkinstall zlib1g-dev libssl-dev
          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
          cd cmake-${{ env.CMAKE_VERSION }}
          ./bootstrap
          make -j"$(nproc)"
          sudo make install
      - name: Checkout tfhe-rs
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          persist-credentials: 'false'
      - name: Set up home
        run: |
@@ -138,17 +156,26 @@ jobs:
            echo "NIGHTLY_TESTS=TRUE";
          } >> "${GITHUB_ENV}"
      - name: Check device is detected
        if: ${{ !cancelled() }}
        run: nvidia-smi
      - name: Run signed integer multi-bit tests
        run: |
          make test_signed_integer_multi_bit_gpu_ci
-      - name: Slack Notification
+  slack-notify:
-        if: ${{ always() }}
+    name: Slack Notification
-        continue-on-error: true
+    needs: [ setup-instance, cuda-signed-integer-tests ]
    runs-on: ubuntu-latest
    if: ${{ always() && needs.cuda-signed-integer-tests.result != 'skipped' }}
    continue-on-error: true
    steps:
      - name: Send message
        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
-          SLACK_COLOR: ${{ job.status }}
+          SLACK_COLOR: ${{ needs.cuda-signed-integer-tests.result }}
-          SLACK_MESSAGE: "CUDA AWS signed integer tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Signed integer GPU tests finished with status: ${{ needs.cuda-signed-integer-tests.result }}. (${{ env.ACTION_RUN_URL }})"
  teardown-instance:
    name: Teardown instance (cuda-tests)
--- a/.github/workflows/hyperstack_tfhe_unsigned_integer_gpu_tests.yml
+++ b/.github/workflows/hyperstack_tfhe_unsigned_integer_gpu_tests.yml
@@ -49,7 +49,7 @@ jobs:
              - tfhe/src/c_api/**
              - 'tfhe/docs/**.md'
              - Makefile
-              - '.github/workflows/hyperstack**'
+              - '.github/workflows/gpu_unsigned_integer_h100_tests.yml'
              - scripts/**
              - ci/**
--- a/.github/workflows/gpu_unsigned_integer_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_tests.yml
@@ -55,6 +55,10 @@ jobs:
              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
              - 'tfhe/docs/**.md'
              - '.github/workflows/gpu_unsigned_integer_tests.yml'
              - Makefile
              - scripts/**
              - ci/**
  setup-instance:
    name: Setup instance (cuda-unsigned-integer-tests)
@@ -74,7 +78,7 @@ jobs:
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
          slab-url: ${{ secrets.SLAB_BASE_URL }}
          job-secret: ${{ secrets.JOB_SECRET }}
-          backend: aws
+          backend: hyperstack
          profile: gpu-test
  cuda-unsigned-integer-tests:
@@ -93,11 +97,23 @@ jobs:
        include:
          - os: ubuntu-22.04
            cuda: "12.2"
-            gcc: 9
+            gcc: 11
    env:
      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
-
+      CMAKE_VERSION: 3.29.6
    steps:
      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
      - name: Install dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y checkinstall zlib1g-dev libssl-dev
          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
          cd cmake-${{ env.CMAKE_VERSION }}
          ./bootstrap
          make -j"$(nproc)"
          sudo make install
      - name: Checkout tfhe-rs
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
@@ -137,17 +153,26 @@ jobs:
            echo "NIGHTLY_TESTS=TRUE";
          } >> "${GITHUB_ENV}"
      - name: Check device is detected
        if: ${{ !cancelled() }}
        run: nvidia-smi
      - name: Run unsigned integer multi-bit tests
        run: |
          make test_unsigned_integer_multi_bit_gpu_ci
-      - name: Slack Notification
+  slack-notify:
-        if: ${{ always() }}
+    name: Slack Notification
-        continue-on-error: true
+    needs: [ setup-instance, cuda-unsigned-integer-tests ]
    runs-on: ubuntu-latest
    if: ${{ always() && needs.cuda-unsigned-integer-tests.result != 'skipped' }}
    continue-on-error: true
    steps:
      - name: Send message
        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
-          SLACK_COLOR: ${{ job.status }}
+          SLACK_COLOR: ${{ needs.cuda-unsigned-integer-tests.result }}
-          SLACK_MESSAGE: "CUDA AWS unsigned integer tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Unsigned integer GPU tests finished with status: ${{ needs.cuda-unsigned-integer-tests.result }}. (${{ env.ACTION_RUN_URL }})"
  teardown-instance:
    name: Teardown instance (cuda-tests)
--- a/ci/slab.toml
+++ b/ci/slab.toml
@@ -30,13 +30,10 @@ region = "us-east-1"
 image_id = "ami-06b3d61f41bf8350a"
 instance_type = "m6i.4xlarge"
-[backend.aws.gpu-test]
+[backend.hyperstack.gpu-test]
-region = "us-east-1"
+environment_name = "canada"
-image_id = "ami-06b3d61f41bf8350a"
+image_name = "Ubuntu Server 22.04 LTS R535 CUDA 12.2"
-instance_type = "p3.2xlarge"
+flavor_name = "n3-RTX-A6000x1"
 # One spawn attempt every 30 seconds for 1 hour
 spawn_retry_attempts = 120
 spawn_retry_duration = 60
 [backend.hyperstack.single-h100]
 environment_name = "canada"
@@ -58,13 +55,10 @@ environment_name = "canada"
 image_name = "Ubuntu Server 22.04 LTS R535 CUDA 12.2"
 flavor_name = "n3-A100x8-NVLink"
-[backend.aws.multi-gpu-test]
+[backend.hyperstack.multi-gpu-test]
-region = "us-east-1"
+environment_name = "canada"
-image_id = "ami-06b3d61f41bf8350a"
+image_name = "Ubuntu Server 22.04 LTS R535 CUDA 12.2"
-instance_type = "p3.8xlarge"
+flavor_name = "n3-A100x4"
 # One spawn attempt every 30 seconds for 1 hour
 spawn_retry_attempts = 120
 spawn_retry_duration = 60
 [command.signed_integer_full_bench]
 workflow = "signed_integer_full_benchmark.yml"
--- a/scripts/integer-tests.sh
+++ b/scripts/integer-tests.sh
@@ -130,8 +130,8 @@ fi
 # Override test-threads number to avoid Out-of-memory issues on GPU instances
 if [[ "${backend}" == "gpu" ]]; then
    if [[ "${BIG_TESTS_INSTANCE}" == TRUE ]]; then
-        test_threads=5
+        test_threads=8
-        doctest_threads=5
+        doctest_threads=8
    else
        test_threads=3
        doctest_threads=3