chore(ci): run ml benchmarks in a matrix with slab

This CI "feature" is meant to circumvent the 6 hours hard-limit
for a job in GitHub Action.
The benchmark is done using a matrix which is handled by Slab.
Here's the workflow:

  1. ML benchmarks are started in a fire-and-forget fashion via
     start_ml_benchmarks.yml.
  2. Slab reads ci/slab.toml to get the AWS EC2 configuration and
     the matrix parameters.
  3. Slab launches at most max_parallel_jobs EC2 instances in
     parallel.
  4. Each job triggers ml_benchmark_subset.yml, which runs only one
     of the YAML files generated by make generate-mlbench, based on
     the matrix item it was given.
  5. As soon as a job completes, the next one in the matrix starts
     promptly.

This continues until all the matrix items are exhausted; a rough
sketch of this scheduling is given below.
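
To make the scheduling concrete, here is a minimal bash sketch of the
behaviour described above. trigger_subset_workflow is a hypothetical
stand-in for Slab dispatching ml_benchmark_subset.yml for one matrix
item on a fresh EC2 instance; it is not a real script in this repo.

#!/bin/bash
# Sketch only: mirrors [command.ml-bench] from ci/slab.toml.
MAX_PARALLEL_JOBS=2
MATRIX=(0 1 2 3 4 5 6 7 8 9 10)

for item in "${MATRIX[@]}"; do
    # Keep at most MAX_PARALLEL_JOBS subsets in flight at any time.
    while (( $(jobs -rp | wc -l) >= MAX_PARALLEL_JOBS )); do
        wait -n  # returns as soon as any one running job completes
    done
    trigger_subset_workflow "$item" &  # hypothetical dispatch helper
done
wait  # let the last jobs drain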
commit 3c2a75186f (parent 8e4be10eb9)
Author: David Testé
Date:   2022-11-21 10:13:30 +01:00

6 changed files with 191 additions and 8 deletions


@@ -15,13 +15,6 @@ on:
       options:
         - m6i.metal
         - c6a.metal
-  # Have a weekly benchmark run on main branch to be available on Monday morning (Paris time)
-  # TODO: uncomment this section once MLBenchmarks are implemented
-  # schedule:
-  #   # * is a special character in YAML so you have to quote this string
-  #   # At 1:00 every Thursday
-  #   # Timezone is UTC, so Paris time is +2 during the summer and +1 during winter
-  #   - cron: '0 1 * * THU'
 env:
   CARGO_TERM_COLOR: always


ml_benchmark_subset.yml (new file)
@@ -0,0 +1,135 @@
# Run one of the ML benchmarks on an AWS instance and return parsed results to Slab CI bot.
name: Application benchmarks
on:
  workflow_dispatch:
    inputs:
      instance_id:
        description: 'Instance ID'
        type: string
      instance_image_id:
        description: 'Instance AMI ID'
        type: string
      instance_type:
        description: 'Instance product type'
        type: string
      runner_name:
        description: 'Action runner name'
        type: string
      request_id:
        description: 'Slab request ID'
        type: string
      matrix_item:
        description: 'Build matrix item'
        type: string

env:
  CARGO_TERM_COLOR: always
  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json

jobs:
  run-ml-benchmarks:
    name: Execute ML benchmarks subset in EC2
    runs-on: ${{ github.event.inputs.runner_name }}
    if: ${{ !cancelled() }}
    steps:
      - name: Instance configuration used
        run: |
          echo "IDs: ${{ inputs.instance_id }}"
          echo "AMI: ${{ inputs.instance_image_id }}"
          echo "Type: ${{ inputs.instance_type }}"
          echo "Request ID: ${{ inputs.request_id }}"
          echo "Matrix item: ${{ inputs.matrix_item }}"
      - name: Get benchmark date
        run: |
          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
      # An SSH private key is required as some dependencies come from private repos.
      - uses: webfactory/ssh-agent@v0.5.2
        with:
          ssh-private-key: ${{ secrets.CONCRETE_COMPILER_CI_SSH_PRIVATE }}
      - name: Fetch submodules
        uses: actions/checkout@v3
        with:
          fetch-depth: 0
          submodules: recursive
          token: ${{ secrets.GH_TOKEN }}
      - name: Set up home
        # The "Install rust" step requires the root user to have a HOME directory, which is not set.
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
      - name: Install rust
        uses: actions-rs/toolchain@v1
        with:
          toolchain: stable
          override: true
      - name: Build compiler and ML benchmarks
        run: |
          set -e
          cd compiler
          make BINDINGS_PYTHON_ENABLED=OFF build-mlbench
      - name: Download KeySetCache
        if: ${{ !contains(github.head_ref, 'newkeysetcache') }}
        continue-on-error: true
        run: |
          cd compiler
          GITHUB_TOKEN=${{ secrets.GITHUB_TOKEN }} make keysetcache_ci_populated
      - name: Run ML benchmarks
        run: |
          set -e
          cd compiler
          make BINDINGS_PYTHON_ENABLED=OFF ML_BENCH_SUBSET_ID=${{ inputs.matrix_item }} run-mlbench-subset
      - name: Upload raw results artifact
        uses: actions/upload-artifact@v3
        with:
          name: ${{ github.sha }}_raw
          path: compiler/benchmarks_results.json
      - name: Parse results
        shell: bash
        run: |
          COMMIT_DATE="$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})"
          COMMIT_HASH="$(git describe --tags --dirty)"
          python3 ./ci/benchmark_parser.py compiler/benchmarks_results.json ${{ env.RESULTS_FILENAME }} \
            --schema compiler_benchmarks \
            --hardware ${{ inputs.instance_type }} \
            --project-version "${COMMIT_HASH}" \
            --branch ${{ github.ref_name }} \
            --commit-date "${COMMIT_DATE}" \
            --bench-date "${{ env.BENCH_DATE }}"
      - name: Upload parsed results artifact
        uses: actions/upload-artifact@v3
        with:
          name: ${{ github.sha }}
          path: ${{ env.RESULTS_FILENAME }}
      - name: Checkout Slab repo
        uses: actions/checkout@v3
        with:
          repository: zama-ai/slab
          path: slab
          token: ${{ secrets.GH_TOKEN }}
      - name: Send data to Slab
        shell: bash
        run: |
          echo "Computing HMAC of the results file"
          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
          echo "Sending results to Slab..."
          curl -v -k \
            -H "Content-Type: application/json" \
            -H "X-Slab-Repository: ${{ github.repository }}" \
            -H "X-Slab-Command: store_data" \
            -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
            -d @${{ env.RESULTS_FILENAME }} \
            ${{ secrets.SLAB_URL }}
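
This workflow is normally dispatched by Slab, but it can also be
exercised by hand. A sketch using the GitHub CLI; every input value
below is a placeholder, not a real resource:

# Manual dispatch sketch; in CI, Slab fills these inputs itself.
gh workflow run ml_benchmark_subset.yml \
    --ref main \
    -f instance_id=i-0123456789abcdef0 \
    -f instance_image_id=ami-0a24aaee029d1295c \
    -f instance_type=m6i.metal \
    -f runner_name=some-ec2-runner \
    -f request_id=manual-test \
    -f matrix_item=0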


start_ml_benchmarks.yml (new file)
@@ -0,0 +1,30 @@
# Start the ML benchmarks job on the Slab CI bot.
name: Start ML benchmarks
on:
  workflow_dispatch:
  # Have a weekly benchmark run on main branch to be available on Monday morning (Paris time)
  # TODO: uncomment this section once MLBenchmarks are implemented
  # schedule:
  #   # * is a special character in YAML so you have to quote this string
  #   # At 1:00 every Thursday
  #   # Timezone is UTC, so Paris time is +2 during the summer and +1 during winter
  #   - cron: '0 1 * * THU'

jobs:
  start-ml-benchmarks:
    # A runner must be specified for the job to be valid; ubuntu-latest is assumed here.
    runs-on: ubuntu-latest
    steps:
      # hmac_calculator.sh lives in the Slab repo, so it must be checked out first.
      - name: Checkout Slab repo
        uses: actions/checkout@v3
        with:
          repository: zama-ai/slab
          path: slab
          token: ${{ secrets.GH_TOKEN }}
      - name: Start AWS job in Slab
        shell: bash
        # TODO: step result must be correlated to HTTP return code.
        run: |
          PAYLOAD='{"command": "ml-bench", "git_ref": "${{ github.ref }}", "sha": "${{ github.sha }}"}'
          echo -n "${PAYLOAD}" > payload.json
          SIGNATURE="$(slab/scripts/hmac_calculator.sh payload.json '${{ secrets.JOB_SECRET }}')"
          curl -v -k \
            -H "Content-Type: application/json" \
            -H "X-Slab-Repository: ${{ github.repository }}" \
            -H "X-Slab-Command: start_data" \
            -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
            -d @payload.json \
            ${{ secrets.SLAB_URL }}
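
Both workflows sign their payloads with slab/scripts/hmac_calculator.sh,
which is not part of this diff. A minimal sketch of what it is assumed
to compute, given the GitHub-webhook-style X-Hub-Signature-256 header:
an HMAC-SHA256 hex digest of the file contents, keyed with the job secret.

#!/bin/bash
# Assumed equivalent of: slab/scripts/hmac_calculator.sh <file> <secret>
# Prints the hex digest that goes after "sha256=" in X-Hub-Signature-256.
file="$1"
secret="$2"
openssl dgst -sha256 -hmac "${secret}" "${file}" | awk '{print $NF}'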

ci/slab.toml (new file, 13 lines)

@@ -0,0 +1,13 @@
[profile.m6i-bench]
region = "eu-west-3"
image_id = "ami-0a24aaee029d1295c"
instance_type = "m6i.metal"
subnet_id = "subnet-a886b4c1"
security_group = ["sg-0bf1c1d79c97bc88f"]

# Trigger ML benchmarks by running each use-case subset in parallel.
[command.ml-bench]
workflow = "ml_benchmark_subset.yml"
profile = "m6i-bench"
matrix = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
max_parallel_jobs = 2
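
The matrix values must match the subset IDs of the files emitted by
make generate-mlbench (named end_to_end_mlbench_<ID>.yaml, as used by
the Makefile below). A hypothetical one-liner to list the generated
IDs for comparison against the matrix above:

# List generated subset IDs; compare against `matrix` above.
ls tests/end_to_end_benchmarks/mlbench/end_to_end_mlbench_*.yaml \
    | sed -E 's/.*end_to_end_mlbench_([0-9]+)\.yaml$/\1/' \
    | sort -n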


compiler/Makefile
@@ -23,6 +23,8 @@ HPX_INSTALL_DIR?=$(HPX_LOCAL_DIR)/build
 CONCRETE_CORE_FFI_VERSION?=0.2.0
+ML_BENCH_SUBSET_ID=
+
 OS=undefined
 ifeq ($(shell uname), Linux)
 OS=linux
@@ -290,6 +292,10 @@ generate-mlbench:
 run-mlbench: build-mlbench generate-mlbench
 	tests/end_to_end_benchmarks/end_to_end_mlbench.sh tests/end_to_end_benchmarks/mlbench/ $(BUILD_DIR)/bin/end_to_end_mlbench
 
+run-mlbench-subset: build-mlbench generate-mlbench
+	@[ "${ML_BENCH_SUBSET_ID}" ] || ( echo "ML_BENCH_SUBSET_ID is not set"; exit 1 )
+	tests/end_to_end_benchmarks/end_to_end_mlbench.sh tests/end_to_end_benchmarks/mlbench/end_to_end_mlbench_$(ML_BENCH_SUBSET_ID).yaml $(BUILD_DIR)/bin/end_to_end_mlbench
+
 show-stress-tests-summary:
 	@echo '------ Stress tests summary ------'
 	@echo

tests/end_to_end_benchmarks/end_to_end_mlbench.sh
@@ -1,3 +1,9 @@
 #!/bin/bash
-find $1 -name "*mlbench_*.yaml" -exec bash -c "BENCHMARK_FILE={} BENCHMARK_STACK=1000000000 BENCHMARK_NAME=MLBench $2" \;
+if [[ -d $1 ]]; then
+    # Execute all the generated YAML files sequentially.
+    find "$1" -name "*mlbench_*.yaml" -exec bash -c "BENCHMARK_FILE={} BENCHMARK_STACK=1000000000 BENCHMARK_NAME=MLBench $2" \;
+else
+    # Execute only one of the YAML files.
+    bash -c "BENCHMARK_FILE=$1 BENCHMARK_STACK=1000000000 BENCHMARK_NAME=MLBench $2"
+fi
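
The two call shapes, as wired up by the Makefile targets above
(build/bin below stands in for the actual $(BUILD_DIR)/bin value):

# Directory argument: run every generated subset (run-mlbench).
tests/end_to_end_benchmarks/end_to_end_mlbench.sh \
    tests/end_to_end_benchmarks/mlbench/ build/bin/end_to_end_mlbench

# File argument: run a single subset (run-mlbench-subset).
tests/end_to_end_benchmarks/end_to_end_mlbench.sh \
    tests/end_to_end_benchmarks/mlbench/end_to_end_mlbench_0.yaml build/bin/end_to_end_mlbench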