Fix blacksmith migration by removing eval-runner.yml

2026-04-29 03:00:45 -04:00 · 2025-03-19 14:33:58 +00:00
13 changed files with 128 additions and 63 deletions
@@ -46,7 +46,7 @@ on:

 jobs:
  del_runs:
-    runs-on: ubuntu-latest
+    runs-on: blacksmith-4vcpu-ubuntu-2204
    permissions:
      actions: write
      contents: read
@@ -24,20 +24,22 @@ jobs:
  build:
    if: github.repository == 'All-Hands-AI/OpenHands'
    name: Build Docusaurus
-    runs-on: ubuntu-latest
+    runs-on: blacksmith-4vcpu-ubuntu-2204
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0
-      - uses: actions/setup-node@v4
+      - uses: useblacksmith/setup-node@v5
        with:
          node-version: 18
          cache: npm
          cache-dependency-path: docs/package-lock.json
      - name: Set up Python
-        uses: actions/setup-python@v5
+        uses: useblacksmith/setup-python@v6
        with:
          python-version: '3.12'
+      - name: Generate Python Docs
+        run: rm -rf docs/modules/python && pip install pydoc-markdown && pydoc-markdown
      - name: Install dependencies
        run: cd docs && npm ci
      - name: Build website
@@ -52,7 +54,7 @@ jobs:
  deploy:
    if: github.ref == 'refs/heads/main' && github.repository == 'All-Hands-AI/OpenHands'
    name: Deploy to GitHub Pages
-    runs-on: ubuntu-latest
+    runs-on: blacksmith-4vcpu-ubuntu-2204
    # This job only runs on "main" so only run one of these jobs at a time
    # otherwise it will fail if one is already running
    concurrency:
@@ -16,7 +16,7 @@ concurrency:

 jobs:
  test:
-    runs-on: ubuntu-latest
+    runs-on: blacksmith-4vcpu-ubuntu-2204
    steps:
      - uses: actions/checkout@v4
      - name: Set up Docker Buildx
@@ -25,18 +25,18 @@ jobs:
      - name: Install tmux
        run: sudo apt-get update && sudo apt-get install -y tmux
      - name: Setup Node.js
-        uses: actions/setup-node@v4
+        uses: useblacksmith/setup-node@v5
        with:
          node-version: '22.x'
      - name: Install poetry via pipx
        run: pipx install poetry
      - name: Set up Python
-        uses: actions/setup-python@v5
+        uses: useblacksmith/setup-python@v6
        with:
          python-version: '3.12'
          cache: 'poetry'
      - name: Install Python dependencies using Poetry
-        run: poetry install --without evaluation
+        run: poetry install --without evaluation,llama-index
      - name: Build Environment
        run: make build
      - name: Run tests
@@ -21,7 +21,7 @@ jobs:
  # Run frontend unit tests
  fe-test:
    name: FE Unit Tests
-    runs-on: ubuntu-latest
+    runs-on: blacksmith-4vcpu-ubuntu-2204
    strategy:
      matrix:
        node-version: [20, 22]
@@ -30,7 +30,7 @@ jobs:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Set up Node.js
-        uses: actions/setup-node@v4
+        uses: useblacksmith/setup-node@v5
        with:
          node-version: ${{ matrix.node-version }}
      - name: Install dependencies
@@ -32,7 +32,7 @@ jobs:
  # Builds the OpenHands Docker images
  ghcr_build_app:
    name: Build App Image
-    runs-on: ubuntu-latest
+    runs-on: blacksmith-4vcpu-ubuntu-2204
    permissions:
      contents: read
      packages: write
@@ -80,7 +80,7 @@ jobs:
  # Builds the runtime Docker images
  ghcr_build_runtime:
    name: Build Image
-    runs-on: ubuntu-latest
+    runs-on: blacksmith-4vcpu-ubuntu-2204
    permissions:
      contents: read
      packages: write
@@ -108,11 +108,11 @@ jobs:
        id: buildx
        uses: docker/setup-buildx-action@v3
      - name: Set up Python
-        uses: actions/setup-python@v5
+        uses: useblacksmith/setup-python@v6
        with:
          python-version: '3.12'
      - name: Cache Poetry dependencies
-        uses: actions/cache@v4
+        uses: useblacksmith/cache@v5
        with:
          path: |
            ~/.cache/pypoetry
@@ -150,7 +150,7 @@ jobs:

  verify_hash_equivalence_in_runtime_and_app:
    name: Verify Hash Equivalence in Runtime and Docker images
-    runs-on: ubuntu-latest
+    runs-on: blacksmith-4vcpu-ubuntu-2204
    needs: [ghcr_build_runtime, ghcr_build_app]
    strategy:
      fail-fast: false
@@ -161,7 +161,7 @@ jobs:
        with:
          ref: ${{ github.event.pull_request.head.sha }}
      - name: Cache Poetry dependencies
-        uses: actions/cache@v4
+        uses: useblacksmith/cache@v5
        with:
          path: |
            ~/.cache/pypoetry
@@ -170,7 +170,7 @@ jobs:
          restore-keys: |
            ${{ runner.os }}-poetry-
      - name: Set up Python
-        uses: actions/setup-python@v5
+        uses: useblacksmith/setup-python@v6
        with:
          python-version: '3.12'
      - name: Install poetry via pipx
@@ -204,7 +204,7 @@ jobs:
  test_runtime_root:
    name: RT Unit Tests (Root)
    needs: [ghcr_build_runtime]
-    runs-on: ubuntu-latest
+    runs-on: blacksmith-4vcpu-ubuntu-2204
    strategy:
      fail-fast: false
      matrix:
@@ -226,7 +226,7 @@ jobs:
        run: |
          docker load --input /tmp/runtime-${{ matrix.base_image }}.tar
      - name: Cache Poetry dependencies
-        uses: actions/cache@v4
+        uses: useblacksmith/cache@v5
        with:
          path: |
            ~/.cache/pypoetry
@@ -235,7 +235,7 @@ jobs:
          restore-keys: |
            ${{ runner.os }}-poetry-
      - name: Set up Python
-        uses: actions/setup-python@v5
+        uses: useblacksmith/setup-python@v6
        with:
          python-version: '3.12'
      - name: Install poetry via pipx
@@ -269,7 +269,7 @@ jobs:
  # Run unit tests with the Docker runtime Docker images as openhands user
  test_runtime_oh:
    name: RT Unit Tests (openhands)
-    runs-on: ubuntu-latest
+    runs-on: blacksmith-4vcpu-ubuntu-2204
    needs: [ghcr_build_runtime]
    strategy:
      matrix:
@@ -291,7 +291,7 @@ jobs:
        run: |
          docker load --input /tmp/runtime-${{ matrix.base_image }}.tar
      - name: Cache Poetry dependencies
-        uses: actions/cache@v4
+        uses: useblacksmith/cache@v5
        with:
          path: |
            ~/.cache/pypoetry
@@ -300,7 +300,7 @@ jobs:
          restore-keys: |
            ${{ runner.os }}-poetry-
      - name: Set up Python
-        uses: actions/setup-python@v5
+        uses: useblacksmith/setup-python@v6
        with:
          python-version: '3.12'
      - name: Install poetry via pipx
@@ -338,7 +338,7 @@ jobs:
  runtime_tests_check_success:
    name: All Runtime Tests Passed
    if: ${{ !cancelled() && !contains(needs.*.result, 'failure') && !contains(needs.*.result, 'cancelled') }}
-    runs-on: ubuntu-latest
+    runs-on: blacksmith-4vcpu-ubuntu-2204
    needs: [test_runtime_root, test_runtime_oh, verify_hash_equivalence_in_runtime_and_app]
    steps:
      - name: All tests passed
@@ -347,7 +347,7 @@ jobs:
  runtime_tests_check_fail:
    name: All Runtime Tests Passed
    if: ${{ cancelled() || contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled') }}
-    runs-on: ubuntu-latest
+    runs-on: blacksmith-4vcpu-ubuntu-2204
    needs: [test_runtime_root, test_runtime_oh, verify_hash_equivalence_in_runtime_and_app]
    steps:
      - name: Some tests failed
@@ -358,7 +358,7 @@ jobs:
    name: Update PR Description
    if: github.event_name == 'pull_request' && !github.event.pull_request.head.repo.fork && github.actor != 'dependabot[bot]'
    needs: [ghcr_build_runtime]
-    runs-on: ubuntu-latest
+    runs-on: blacksmith-4vcpu-ubuntu-2204
    steps:
      - name: Checkout
        uses: actions/checkout@v4
@@ -18,7 +18,7 @@ env:
 jobs:
  run-integration-tests:
    if: github.event.label.name == 'integration-test' || github.event_name == 'workflow_dispatch' || github.event_name == 'schedule'
-    runs-on: ubuntu-latest
+    runs-on: blacksmith-4vcpu-ubuntu-2204
    permissions:
      contents: "read"
      id-token: "write"
@@ -35,13 +35,13 @@ jobs:
        run: pipx install poetry

      - name: Set up Python
-        uses: actions/setup-python@v5
+        uses: useblacksmith/setup-python@v6
        with:
          python-version: ${{ matrix.python-version }}
          cache: "poetry"

      - name: Setup Node.js
-        uses: actions/setup-node@v4
+        uses: useblacksmith/setup-node@v5
        with:
          node-version: '22.x'

@@ -54,7 +54,7 @@ jobs:
            Hi! I started running the integration tests on your PR. You will receive a comment with the results shortly.

      - name: Install Python dependencies using Poetry
-        run: poetry install --without evaluation
+        run: poetry install --without evaluation,llama-index

      - name: Configure config.toml for testing with Haiku
        env:
@@ -117,6 +117,68 @@ jobs:
          echo "EOF" >> $GITHUB_ENV

      # -------------------------------------------------------------
+      # Run DelegatorAgent tests for Haiku, limited to t01 and t02; the report is exported as INTEGRATION_TEST_REPORT_DELEGATOR_HAIKU
+      - name: Wait a little bit (again)
+        run: sleep 5
+
+      - name: Configure config.toml for testing DelegatorAgent (Haiku)
+        env:
+          LLM_MODEL: "litellm_proxy/claude-3-5-haiku-20241022"
+          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+          LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
+          MAX_ITERATIONS: 30
+        run: |
+          echo "[llm.eval]" > config.toml
+          echo "model = \"$LLM_MODEL\"" >> config.toml
+          echo "api_key = \"$LLM_API_KEY\"" >> config.toml
+          echo "base_url = \"$LLM_BASE_URL\"" >> config.toml
+          echo "temperature = 0.0" >> config.toml
+
+      - name: Run integration test evaluation for DelegatorAgent (Haiku)
+        env:
+          SANDBOX_FORCE_REBUILD_RUNTIME: True
+        run: |
+          poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD DelegatorAgent '' 30 $N_PROCESSES "t01_fix_simple_typo,t02_add_bash_hello" 'delegator_haiku_run'
+
+          # Export the first report.md found as a multiline env var; if none is found, /dev/null is read so the value is empty rather than cat reading stdin
+          REPORT_FILE_DELEGATOR_HAIKU=$(find evaluation/evaluation_outputs/outputs/integration_tests/DelegatorAgent/*haiku*_maxiter_30_N* -name "report.md" -type f | head -n 1)
+          echo "REPORT_FILE_DELEGATOR_HAIKU: $REPORT_FILE_DELEGATOR_HAIKU"
+          echo "INTEGRATION_TEST_REPORT_DELEGATOR_HAIKU<<EOF" >> "$GITHUB_ENV"
+          cat "${REPORT_FILE_DELEGATOR_HAIKU:-/dev/null}" >> "$GITHUB_ENV"
+          echo >> "$GITHUB_ENV"
+          echo "EOF" >> "$GITHUB_ENV"
+
+      # -------------------------------------------------------------
+      # Run DelegatorAgent tests for DeepSeek, limited to t01 and t02; the report is exported as INTEGRATION_TEST_REPORT_DELEGATOR_DEEPSEEK
+      - name: Wait a little bit (again)
+        run: sleep 5
+
+      - name: Configure config.toml for testing DelegatorAgent (DeepSeek)
+        env:
+          LLM_MODEL: "litellm_proxy/deepseek-chat"
+          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+          LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
+          MAX_ITERATIONS: 30
+        run: |
+          echo "[llm.eval]" > config.toml
+          echo "model = \"$LLM_MODEL\"" >> config.toml
+          echo "api_key = \"$LLM_API_KEY\"" >> config.toml
+          echo "base_url = \"$LLM_BASE_URL\"" >> config.toml
+          echo "temperature = 0.0" >> config.toml
+      - name: Run integration test evaluation for DelegatorAgent (DeepSeek)
+        env:
+          SANDBOX_FORCE_REBUILD_RUNTIME: True
+        run: |
+          poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD DelegatorAgent '' 30 $N_PROCESSES "t01_fix_simple_typo,t02_add_bash_hello" 'delegator_deepseek_run'
+
+          # Export the first report.md found as a multiline env var; if none is found, /dev/null is read so the value is empty rather than cat reading stdin
+          REPORT_FILE_DELEGATOR_DEEPSEEK=$(find evaluation/evaluation_outputs/outputs/integration_tests/DelegatorAgent/deepseek*_maxiter_30_N* -name "report.md" -type f | head -n 1)
+          echo "REPORT_FILE_DELEGATOR_DEEPSEEK: $REPORT_FILE_DELEGATOR_DEEPSEEK"
+          echo "INTEGRATION_TEST_REPORT_DELEGATOR_DEEPSEEK<<EOF" >> "$GITHUB_ENV"
+          cat "${REPORT_FILE_DELEGATOR_DEEPSEEK:-/dev/null}" >> "$GITHUB_ENV"
+          echo >> "$GITHUB_ENV"
+          echo "EOF" >> "$GITHUB_ENV"
+      # -------------------------------------------------------------
      # Run VisualBrowsingAgent tests for DeepSeek, limited to t05 and t06
      - name: Wait a little bit (again)
        run: sleep 5
@@ -151,7 +213,7 @@ jobs:
        run: |
          TIMESTAMP=$(date +'%y-%m-%d-%H-%M')
          cd evaluation/evaluation_outputs/outputs  # Change to the outputs directory
-          tar -czvf ../../../integration_tests_${TIMESTAMP}.tar.gz integration_tests/CodeActAgent/* integration_tests/VisualBrowsingAgent/* # Only include the actual result directories
+          tar -czvf ../../../integration_tests_${TIMESTAMP}.tar.gz integration_tests/CodeActAgent/* integration_tests/DelegatorAgent/* integration_tests/VisualBrowsingAgent/* # Only include the actual result directories

      - name: Upload evaluation results as artifact
        uses: actions/upload-artifact@v4
@@ -192,6 +254,12 @@ jobs:
              **Integration Tests Report (DeepSeek)**
              DeepSeek LLM Test Results:
              ${{ env.INTEGRATION_TEST_REPORT_DEEPSEEK }}
+              ---
+              **Integration Tests Report Delegator (Haiku)**
+              ${{ env.INTEGRATION_TEST_REPORT_DELEGATOR_HAIKU }}
+              ---
+              **Integration Tests Report Delegator (DeepSeek)**
+              ${{ env.INTEGRATION_TEST_REPORT_DELEGATOR_DEEPSEEK }}
              ---
              **Integration Tests Report VisualBrowsing (DeepSeek)**
              ${{ env.INTEGRATION_TEST_REPORT_VISUALBROWSING_DEEPSEEK }}
@@ -9,7 +9,7 @@ jobs:
  lint-fix-frontend:
    if: github.event.label.name == 'lint-fix'
    name: Fix frontend linting issues
-    runs-on: ubuntu-latest
+    runs-on: blacksmith-4vcpu-ubuntu-2204
    permissions:
      contents: write
      pull-requests: write
@@ -22,7 +22,7 @@ jobs:
          token: ${{ secrets.GITHUB_TOKEN }}

      - name: Install Node.js 20
-        uses: actions/setup-node@v4
+        uses: useblacksmith/setup-node@v5
        with:
          node-version: 20
      - name: Install frontend dependencies
@@ -52,7 +52,7 @@ jobs:
  lint-fix-python:
    if: github.event.label.name == 'lint-fix'
    name: Fix Python linting issues
-    runs-on: ubuntu-latest
+    runs-on: blacksmith-4vcpu-ubuntu-2204
    permissions:
      contents: write
      pull-requests: write
@@ -65,7 +65,7 @@ jobs:
          token: ${{ secrets.GITHUB_TOKEN }}

      - name: Set up python
-        uses: actions/setup-python@v5
+        uses: useblacksmith/setup-python@v6
        with:
          python-version: 3.12
          cache: 'pip'
@@ -19,11 +19,11 @@ jobs:
  # Run lint on the frontend code
  lint-frontend:
    name: Lint frontend
-    runs-on: ubuntu-latest
+    runs-on: blacksmith-4vcpu-ubuntu-2204
    steps:
      - uses: actions/checkout@v4
      - name: Install Node.js 20
-        uses: actions/setup-node@v4
+        uses: useblacksmith/setup-node@v5
        with:
          node-version: 20
      - name: Install dependencies
@@ -39,13 +39,13 @@ jobs:
  # Run lint on the python code
  lint-python:
    name: Lint python
-    runs-on: ubuntu-latest
+    runs-on: blacksmith-4vcpu-ubuntu-2204
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0
      - name: Set up python
-        uses: actions/setup-python@v5
+        uses: useblacksmith/setup-python@v6
        with:
          python-version: 3.12
          cache: 'pip'
@@ -57,11 +57,11 @@ jobs:
  # Check version consistency across documentation
  check-version-consistency:
    name: Check version consistency
-    runs-on: ubuntu-latest
+    runs-on: blacksmith-4vcpu-ubuntu-2204
    steps:
      - uses: actions/checkout@v4
      - name: Set up python
-        uses: actions/setup-python@v5
+        uses: useblacksmith/setup-python@v6
        with:
          python-version: 3.12
      - name: Run version consistency check
@@ -74,13 +74,13 @@ jobs:
        (github.event.review.author_association == 'OWNER' || github.event.review.author_association == 'COLLABORATOR' || github.event.review.author_association == 'MEMBER')
        )
      )
-    runs-on: ubuntu-latest
+    runs-on: blacksmith-4vcpu-ubuntu-2204
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python
-        uses: actions/setup-python@v5
+        uses: useblacksmith/setup-python@v6
        with:
          python-version: "3.12"

@@ -106,7 +106,7 @@ jobs:
              contains(github.event.review.body, '@openhands-agent-exp')
            )
          )
-        uses: actions/cache@v4
+        uses: useblacksmith/cache@v5
        with:
          path: ${{ env.pythonLocation }}/lib/python3.12/site-packages/*
          key: ${{ runner.os }}-pip-openhands-resolver-${{ hashFiles('/tmp/requirements.txt') }}
@@ -295,12 +295,11 @@ jobs:
        if: always()
        env:
          AGENT_RESPONDED: ${{ env.AGENT_RESPONDED || 'false' }}
-          ISSUE_NUMBER: ${{ env.ISSUE_NUMBER }}
        with:
          github-token: ${{ secrets.PAT_TOKEN || github.token }}
          script: |
            const fs = require('fs');
-            const issueNumber = process.env.ISSUE_NUMBER;
+            const issueNumber = ${{ env.ISSUE_NUMBER }};
            let logContent = '';

            try {
@@ -331,15 +330,13 @@ jobs:
        if: always() # Comment on issue even if the previous steps fail
        env:
          AGENT_RESPONDED: ${{ env.AGENT_RESPONDED || 'false' }}
-          ISSUE_NUMBER: ${{ env.ISSUE_NUMBER }}
-          RESOLUTION_SUCCESS: ${{ steps.check_result.outputs.RESOLUTION_SUCCESS }}
        with:
          github-token: ${{ secrets.PAT_TOKEN || github.token }}
          script: |
            const fs = require('fs');
            const path = require('path');
-            const issueNumber = process.env.ISSUE_NUMBER;
-            const success = process.env.RESOLUTION_SUCCESS === 'true';
+            const issueNumber = ${{ env.ISSUE_NUMBER }};
+            const success = ${{ steps.check_result.outputs.RESOLUTION_SUCCESS }};

            let prNumber = '';
            let branchName = '';
@@ -404,12 +401,10 @@ jobs:
      - name: Fallback Error Comment
        uses: actions/github-script@v7
        if: ${{ env.AGENT_RESPONDED == 'false' }} # Only run if no conditions were met in previous steps
-        env:
-          ISSUE_NUMBER: ${{ env.ISSUE_NUMBER }}
        with:
          github-token: ${{ secrets.PAT_TOKEN || github.token }}
          script: |
-            const issueNumber = process.env.ISSUE_NUMBER;
+            const issueNumber = ${{ env.ISSUE_NUMBER }};

            github.rest.issues.createComment({
              issue_number: issueNumber,
@@ -19,7 +19,7 @@ jobs:
  # Run python unit tests on Linux
  test-on-linux:
    name: Python Unit Tests on Linux
-    runs-on: ubuntu-latest
+    runs-on: blacksmith-4vcpu-ubuntu-2204
    env:
      INSTALL_DOCKER: '0' # Set to '0' to skip Docker installation
    strategy:
@@ -33,22 +33,22 @@ jobs:
      - name: Install tmux
        run: sudo apt-get update && sudo apt-get install -y tmux
      - name: Setup Node.js
-        uses: actions/setup-node@v4
+        uses: useblacksmith/setup-node@v5
        with:
          node-version: '22.x'
      - name: Install poetry via pipx
        run: pipx install poetry
      - name: Set up Python
-        uses: actions/setup-python@v5
+        uses: useblacksmith/setup-python@v6
        with:
          python-version: ${{ matrix.python-version }}
          cache: 'poetry'
      - name: Install Python dependencies using Poetry
-        run: poetry install --without evaluation
+        run: poetry install --without evaluation,llama-index
      - name: Build Environment
        run: make build
      - name: Run Tests
-        run: poetry run pytest --forked -n auto --cov=openhands --cov-report=xml -svv ./tests/unit
+        run: poetry run pytest --forked -n auto --cov=openhands --cov-report=xml -svv ./tests/unit --ignore=tests/unit/test_long_term_memory.py
      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v5
        env:
@@ -12,10 +12,10 @@ on:

 jobs:
  release:
-    runs-on: ubuntu-latest
+    runs-on: blacksmith-4vcpu-ubuntu-2204
    steps:
      - uses: actions/checkout@v4
-      - uses: actions/setup-python@v5
+      - uses: useblacksmith/setup-python@v6
        with:
          python-version: 3.12
      - name: Install Poetry
@@ -10,7 +10,7 @@ jobs:
  trigger-job:
    name: Trigger remote eval job
    if: ${{ github.event.label.name == 'run-eval-xs' || github.event.label.name == 'run-eval-s' || github.event.label.name == 'run-eval-m' }}
-    runs-on: ubuntu-latest
+    runs-on: blacksmith-4vcpu-ubuntu-2204

    steps:
      - name: Checkout PR branch
@@ -8,7 +8,7 @@ on:

 jobs:
  stale:
-    runs-on: ubuntu-latest
+    runs-on: blacksmith-4vcpu-ubuntu-2204
    steps:
      - uses: actions/stale@v9
        with: