# AutoGPT/.github/workflows/benchmark-ci.yml

name: AGBenchmark CI

# Triggers: pushes and pull requests that touch the benchmark sources or this
# workflow file. Changes under benchmark/reports/ are excluded by the negated
# path patterns.
on:
  push:
    branches:
      - master
      - development
      - 'ci-test*'
    paths:
      - 'benchmark/**'
      - '.github/workflows/benchmark-ci.yml'
      - '!benchmark/reports/**'
  pull_request:
    branches:
      - master
      - development
      - 'release-*'
    paths:
      - 'benchmark/**'
      - '!benchmark/reports/**'
      - '.github/workflows/benchmark-ci.yml'

# Group key: "benchmark-ci-<event name>-<PR number>" when github.head_ref is
# set, otherwise "benchmark-ci-<commit SHA>".
concurrency:
  group: benchmark-ci-${{ github.head_ref && format('{0}-{1}', github.event_name, github.event.pull_request.number) || github.sha }}
  # Only runs triggered by pull_request* events cancel an in-progress run.
  cancel-in-progress: ${{ startsWith(github.event_name, 'pull_request') }}

# Workflow-wide default: `run` steps execute in bash unless a job or step
# specifies another shell.
defaults:
  run:
    shell: bash

# Referenced as env.min-python-version by the Python setup step of the
# self-test-with-agent job. Quoted so the value stays the string "3.10".
env:
  min-python-version: "3.10"
jobs:
  # Runs the benchmark's pytest suite with coverage for every combination in
  # the matrix below and uploads the coverage report to Codecov.
  test:
    permissions:
      contents: read
    timeout-minutes: 30
    strategy:
      fail-fast: false
      matrix:
        python-version: ["3.10"]
        platform-os: [ubuntu, macos, macos-arm64, windows]
    # 'macos-arm64' is mapped to the 'macos-14' runner; every other entry
    # becomes '<platform-os>-latest'.
    runs-on: ${{ matrix.platform-os != 'macos-arm64' && format('{0}-latest', matrix.platform-os) || 'macos-14' }}
    defaults:
      run:
        shell: bash
        # All `run` steps of this job execute inside the benchmark/ directory
        working-directory: benchmark
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          submodules: true

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      - name: Set up Python dependency cache
        # On Windows, unpacking cached dependencies takes longer than just installing them
        if: runner.os != 'Windows'
        uses: actions/cache@v4
        with:
          path: ${{ runner.os == 'macOS' && '~/Library/Caches/pypoetry' || '~/.cache/pypoetry' }}
          # runner.arch is part of the key so that the 'macos' and 'macos-arm64'
          # matrix entries (both report runner.os == 'macOS') do not share one
          # cache entry.
          key: poetry-${{ runner.os }}-${{ runner.arch }}-${{ hashFiles('benchmark/poetry.lock') }}

      - name: Install Poetry (Unix)
        if: runner.os != 'Windows'
        run: |
          curl -sSL https://install.python-poetry.org | python3 -

          # On macOS, expose the installer's bin directory to this step and,
          # through GITHUB_PATH, to all following steps
          if [ "${{ runner.os }}" = "macOS" ]; then
            PATH="$HOME/.local/bin:$PATH"
            echo "$HOME/.local/bin" >> "$GITHUB_PATH"
          fi

      - name: Install Poetry (Windows)
        if: runner.os == 'Windows'
        shell: pwsh
        run: |
          (Invoke-WebRequest -Uri https://install.python-poetry.org -UseBasicParsing).Content | python -

          # Expose the installer's Scripts directory to this step and,
          # through GITHUB_PATH, to all following steps
          $env:PATH += ";$env:APPDATA\Python\Scripts"
          echo "$env:APPDATA\Python\Scripts" >> $env:GITHUB_PATH

      - name: Install Python dependencies
        run: poetry install

      - name: Run pytest with coverage
        run: |
          poetry run pytest -vv \
            --cov=agbenchmark --cov-branch --cov-report term-missing --cov-report xml \
            --durations=10 \
            tests
        env:
          CI: true
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}

      - name: Upload coverage reports to Codecov
        uses: codecov/codecov-action@v4
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          flags: agbenchmark,${{ runner.os }}

  # Starts each agent in the matrix, runs agbenchmark against it (mostly in
  # mock mode), then re-runs the benchmark with BUILD_SKILL_TREE enabled and
  # fails if that run modified tracked challenge or frontend asset files.
  self-test-with-agent:
    # Same read-only token scope as the `test` job
    permissions:
      contents: read
    runs-on: ubuntu-latest
    strategy:
      matrix:
        agent-name: [ forge ]
      fail-fast: false
    timeout-minutes: 20
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          submodules: true

      - name: Set up Python ${{ env.min-python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.min-python-version }}

      - name: Install Poetry
        run: |
          curl -sSL https://install.python-poetry.org | python -

      - name: Run regression tests
        working-directory: .
        run: |
          ./run agent start ${{ matrix.agent-name }}
          cd ${{ matrix.agent-name }}

          set +e # Do not abort on a non-zero exit code; it is inspected below
          echo "Running the following command: poetry run agbenchmark --maintain --mock"
          poetry run agbenchmark --maintain --mock
          EXIT_CODE=$?
          set -e  # Stop ignoring non-zero exit codes
          # Exit code 5 is tolerated (regression_tests.json is empty);
          # any other non-zero exit code fails the step instead of being swallowed
          if [ "$EXIT_CODE" -eq 5 ]; then
            echo "regression_tests.json is empty."
          elif [ "$EXIT_CODE" -ne 0 ]; then
            echo "agbenchmark --maintain --mock failed with exit code $EXIT_CODE"
            exit "$EXIT_CODE"
          fi

          echo "Running the following command: poetry run agbenchmark --mock"
          poetry run agbenchmark --mock

          echo "Running the following command: poetry run agbenchmark --mock --category=data"
          poetry run agbenchmark --mock --category=data

          echo "Running the following command: poetry run agbenchmark --mock --category=coding"
          poetry run agbenchmark --mock --category=coding

          echo "Running the following command: poetry run agbenchmark --test=WriteFile"
          poetry run agbenchmark --test=WriteFile
          cd ../benchmark
          poetry install
          echo "Adding the BUILD_SKILL_TREE environment variable. This will attempt to add new elements in the skill tree. If new elements are added, the CI fails because they should have been pushed"
          export BUILD_SKILL_TREE=true

          poetry run agbenchmark --mock

          # `git diff --name-only` prints paths relative to the repository root
          # (not to the current directory), so the pattern must not start with `../`.
          # grep exits non-zero when nothing matches; the `|| echo` keeps the step alive.
          CHANGED=$(git diff --name-only | grep -E '(agbenchmark/challenges)|(frontend/assets)') || echo "No diffs"
          if [ -n "$CHANGED" ]; then
            echo "There are unstaged changes please run agbenchmark and commit those changes since they are needed."
            echo "$CHANGED"
            exit 1
          else
            echo "No unstaged changes."
          fi
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          TELEMETRY_ENVIRONMENT: autogpt-benchmark-ci
          # Evaluates to true only when the workflow runs for the `master` ref
          TELEMETRY_OPT_IN: ${{ github.ref_name == 'master' }}