diff --git a/.github/workflows/coprocessor-benchmark-gpu.yml b/.github/workflows/coprocessor-benchmark-gpu.yml index 6f98ae5dd..b34e6d0ea 100644 --- a/.github/workflows/coprocessor-benchmark-gpu.yml +++ b/.github/workflows/coprocessor-benchmark-gpu.yml @@ -3,6 +3,22 @@ name: coprocessor-benchmark-gpu on: workflow_dispatch: + inputs: + profile: + description: "Instance type" + required: true + type: choice + options: + - "l40 (n3-L40x1)" + - "4-l40 (n3-L40x4)" + - "single-h100 (n3-H100x1)" + - "2-h100 (n3-H100x2)" + - "4-h100 (n3-H100x4)" + - "multi-h100 (n3-H100x8)" + - "multi-h100-nvlink (n3-H100x8-NVLink)" + - "multi-h100-sxm5 (n3-H100x8-SXM5)" + - "multi-h100-sxm5_fallback (n3-H100x8-SXM5)" + schedule: # Weekly tests @ 1AM - cron: "0 1 * * 6" @@ -17,7 +33,9 @@ env: RUST_BACKTRACE: "full" RUST_MIN_STACK: "8388608" CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }} - PROFILE: "multi-h100-sxm5 (n3-H100x8-SXM5)" + PROFILE_SCHEDULED_RUN: "multi-h100-sxm5 (n3-H100x8-SXM5)" + PROFILE_MANUAL_RUN: ${{ inputs.profile }} + IS_MANUAL_RUN: ${{ github.event_name == 'workflow_dispatch' }} BENCHMARK_TYPE: "ALL" OPTIMIZATION_TARGET: "throughput" BATCH_SIZE: "5000" @@ -40,15 +58,25 @@ jobs: - name: Parse profile id: parse_profile run: | + if [[ ${IS_MANUAL_RUN} == true ]]; then + PROFILE_RAW="${PROFILE_MANUAL_RUN}" + else + PROFILE_RAW="${PROFILE_SCHEDULED_RUN}" + fi # shellcheck disable=SC2001 - PROFILE_VAL=$(echo "${PROFILE}" | sed 's|\(.*\)[[:space:]](.*)|\1|') + PROFILE_VAL=$(echo "${PROFILE_RAW}" | sed 's|\(.*\)[[:space:]](.*)|\1|') echo "profile=$PROFILE_VAL" >> "${GITHUB_OUTPUT}" - name: Parse hardware name id: parse_hardware_name run: | + if [[ ${IS_MANUAL_RUN} == true ]]; then + PROFILE_RAW="${PROFILE_MANUAL_RUN}" + else + PROFILE_RAW="${PROFILE_SCHEDULED_RUN}" + fi # shellcheck disable=SC2001 - PROFILE_VAL=$(echo "${PROFILE}" | sed 's|.*[[:space:]](\(.*\))|\1|') + PROFILE_VAL=$(echo "${PROFILE_RAW}" | sed 's|.*[[:space:]](\(.*\))|\1|') echo "name=$PROFILE_VAL" 
>> "${GITHUB_OUTPUT}" setup-instance: @@ -130,6 +158,13 @@ jobs: } >> "${GITHUB_ENV}" working-directory: tfhe-rs/ + - name: Setup Hyperstack dependencies + uses: ./tfhe-rs/.github/actions/gpu_setup + with: + cuda-version: ${{ matrix.cuda }} + gcc-version: ${{ matrix.gcc }} + github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }} + - name: Check fhEVM and TFHE-rs repos run: | pwd @@ -140,13 +175,6 @@ jobs: run: git lfs checkout working-directory: fhevm/ - - name: Setup Hyperstack dependencies - uses: ./fhevm/.github/actions/gpu_setup - with: - cuda-version: ${{ matrix.cuda }} - gcc-version: ${{ matrix.gcc }} - github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }} - - name: Install rust uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases with: @@ -154,7 +182,7 @@ jobs: - name: Install cargo dependencies run: | - sudo apt-get install -y protobuf-compiler cmake pkg-config libssl-dev \ + sudo apt-get install -y protobuf-compiler pkg-config libssl-dev \ libclang-dev docker-compose-v2 docker.io acl sudo usermod -aG docker "$USER" newgrp docker @@ -181,9 +209,16 @@ jobs: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} + - name: Login to Chainguard Registry + uses: docker/login-action@9780b0c442fbb1117ed29e0efdff1e18412f7567 # v3.3.0 + with: + registry: cgr.dev + username: ${{ secrets.CGR_USERNAME }} + password: ${{ secrets.CGR_PASSWORD }} + - name: Init database run: make init_db - working-directory: fhevm/coprocessor/fhevm-engine/coprocessor + working-directory: fhevm/coprocessor/fhevm-engine/tfhe-worker - name: Use Node.js uses: actions/setup-node@a0853c24544627f65ddf259abe73b1d18a591444 # v5.0.0 @@ -203,8 +238,12 @@ jobs: - name: Profile erc20 no-cmux benchmark on GPU run: | - BENCHMARK_BATCH_SIZE="${BATCH_SIZE}" FHEVM_DF_SCHEDULE="${SCHEDULING_POLICY}" BENCHMARK_TYPE="LATENCY" OPTIMIZATION_TARGET="${OPTIMIZATION_TARGET}" make -e "profile_erc20_gpu" - 
working-directory: fhevm/coprocessor/fhevm-engine/coprocessor + BENCHMARK_BATCH_SIZE="${BATCH_SIZE}" \ + FHEVM_DF_SCHEDULE="${SCHEDULING_POLICY}" \ + BENCHMARK_TYPE="THROUGHPUT_200" \ + OPTIMIZATION_TARGET="${OPTIMIZATION_TARGET}" \ + make -e "profile_erc20_gpu" + working-directory: fhevm/coprocessor/fhevm-engine/tfhe-worker - name: Get nsys profile name id: nsys_profile_name @@ -215,7 +254,7 @@ jobs: REPORT_NAME: ${{ steps.nsys_profile_name.outputs.profile }} run: | mv report1.nsys-rep ${{ env.REPORT_NAME }} - working-directory: fhevm/coprocessor/fhevm-engine/coprocessor + working-directory: fhevm/coprocessor/fhevm-engine/tfhe-worker - name: Upload profile artifact env: @@ -223,17 +262,17 @@ jobs: uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 with: name: ${{ env.REPORT_NAME }} - path: fhevm/coprocessor/fhevm-engine/coprocessor/${{ env.REPORT_NAME }} + path: fhevm/coprocessor/fhevm-engine/tfhe-worker/${{ env.REPORT_NAME }} - name: Run latency benchmark on GPU run: | BENCHMARK_BATCH_SIZE="${BATCH_SIZE}" FHEVM_DF_SCHEDULE="${SCHEDULING_POLICY}" BENCHMARK_TYPE="LATENCY" OPTIMIZATION_TARGET="${OPTIMIZATION_TARGET}" make -e "benchmark_${BENCHMARKS}_gpu" - working-directory: fhevm/coprocessor/fhevm-engine/coprocessor + working-directory: fhevm/coprocessor/fhevm-engine/tfhe-worker - name: Run throughput benchmarks on GPU run: | BENCHMARK_BATCH_SIZE="${BATCH_SIZE}" FHEVM_DF_SCHEDULE="${SCHEDULING_POLICY}" BENCHMARK_TYPE="THROUGHPUT_200" OPTIMIZATION_TARGET="${OPTIMIZATION_TARGET}" make -e "benchmark_${BENCHMARKS}_gpu" - working-directory: fhevm/coprocessor/fhevm-engine/coprocessor + working-directory: fhevm/coprocessor/fhevm-engine/tfhe-worker - name: Parse results run: | @@ -246,7 +285,7 @@ jobs: --commit-date "${COMMIT_DATE}" \ --bench-date "${BENCH_DATE}" \ --walk-subdirs \ - --crate "coprocessor/fhevm-engine/coprocessor" \ + --crate "coprocessor/fhevm-engine/tfhe-worker" \ --name-suffix 
"operation_batch_size_${BATCH_SIZE}-schedule_${SCHEDULING_POLICY}-optimization_target_${OPTIMIZATION_TARGET}" working-directory: fhevm/ diff --git a/.github/workflows/gpu_full_multi_gpu_tests.yml b/.github/workflows/gpu_full_multi_gpu_tests.yml index 632fff387..c218d7766 100644 --- a/.github/workflows/gpu_full_multi_gpu_tests.yml +++ b/.github/workflows/gpu_full_multi_gpu_tests.yml @@ -86,7 +86,7 @@ jobs: slab-url: ${{ secrets.SLAB_BASE_URL }} job-secret: ${{ secrets.JOB_SECRET }} backend: hyperstack - profile: multi-gpu-test + profile: 4-l40 # This instance will be spawned especially for pull-request from forked repository - name: Start GitHub instance diff --git a/.github/workflows/gpu_integer_long_run_tests.yml b/.github/workflows/gpu_integer_long_run_tests.yml index bfa29d07b..82da2e8b0 100644 --- a/.github/workflows/gpu_integer_long_run_tests.yml +++ b/.github/workflows/gpu_integer_long_run_tests.yml @@ -43,7 +43,7 @@ jobs: slab-url: ${{ secrets.SLAB_BASE_URL }} job-secret: ${{ secrets.JOB_SECRET }} backend: hyperstack - profile: multi-gpu-test + profile: 4-l40 cuda-tests: name: gpu_integer_long_run_tests/cuda-tests diff --git a/Makefile b/Makefile index 75a7adf50..132f58b1f 100644 --- a/Makefile +++ b/Makefile @@ -1004,6 +1004,11 @@ test_list_gpu: install_rs_build_toolchain install_cargo_nextest --features=integer,internal-keycache,gpu,zk-pok -p tfhe \ -E "test(/.*gpu.*/)" +.PHONY: build_one_hl_api_test_gpu +build_one_hl_api_test_gpu: install_rs_build_toolchain + RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --no-run \ + --features=integer,gpu-debug -vv -p tfhe -- "$${TEST}" --test-threads=1 --nocapture + test_high_level_api_hpu: install_rs_build_toolchain install_cargo_nextest ifeq ($(HPU_CONFIG), v80) RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) nextest run --cargo-profile $(CARGO_PROFILE) \ diff --git a/backends/tfhe-cuda-backend/cuda/CMakeLists.txt b/backends/tfhe-cuda-backend/cuda/CMakeLists.txt index c027f909f..ec630429a 
100644 --- a/backends/tfhe-cuda-backend/cuda/CMakeLists.txt +++ b/backends/tfhe-cuda-backend/cuda/CMakeLists.txt @@ -86,6 +86,7 @@ if(CMAKE_BUILD_TYPE_LOWERCASE STREQUAL "debug") message("Compiling in Debug mode") add_definitions(-DDEBUG) set(OPTIMIZATION_FLAGS "${OPTIMIZATION_FLAGS} -O0 -G -g") + set(USE_NVTOOLS 1) else() # Release mode message("Compiling in Release mode") diff --git a/backends/tfhe-cuda-backend/cuda/src/device.cu b/backends/tfhe-cuda-backend/cuda/src/device.cu index 8e433c30f..7c9191d26 100644 --- a/backends/tfhe-cuda-backend/cuda/src/device.cu +++ b/backends/tfhe-cuda-backend/cuda/src/device.cu @@ -2,6 +2,9 @@ #include #include #include +#ifdef USE_NVTOOLS +#include +#endif uint32_t cuda_get_device() { int device; @@ -83,6 +86,9 @@ void cuda_set_device(uint32_t gpu_index) { check_cuda_error(cudaSetDevice(gpu_index)); // Mempools are initialized only once in all the GPUS available cuda_setup_mempool(gpu_index); +#ifdef USE_NVTOOLS + check_cuda_error(cudaProfilerStart()); +#endif } cudaEvent_t cuda_create_event(uint32_t gpu_index) { diff --git a/ci/slab.toml b/ci/slab.toml index 371ee6af0..caa9f9211 100644 --- a/ci/slab.toml +++ b/ci/slab.toml @@ -83,18 +83,6 @@ image_name = "Ubuntu Server 22.04 LTS R570 CUDA 12.8" flavor_name = "n3-A100x8-NVLink" user = "ubuntu" -[backend.hyperstack.multi-gpu-test] -environment_name = "canada" -image_name = "Ubuntu Server 22.04 LTS R570 CUDA 12.8" -flavor_name = "n3-L40x4" -user = "ubuntu" - -[backend.hyperstack.multi-gpu-test_fallback] -environment_name = "canada" -image_name = "Ubuntu Server 22.04 LTS R570 CUDA 12.8" -flavor_name = "n3-RTX-A6000x2" -user = "ubuntu" - [backend.hyperstack.l40] environment_name = "canada" image_name = "Ubuntu Server 22.04 LTS R570 CUDA 12.8" @@ -106,3 +94,9 @@ environment_name = "canada" image_name = "Ubuntu Server 22.04 LTS R570 CUDA 12.8" flavor_name = "n3-RTX-A6000x1" user = "ubuntu" + +[backend.hyperstack.4-l40] +environment_name = "canada" +image_name = "Ubuntu Server 22.04 
LTS R570 CUDA 12.8" +flavor_name = "n3-L40x4" +user = "ubuntu"