name: Run Integration Tests

on:
  pull_request:
    types: [labeled]
  workflow_dispatch:
    inputs:
      reason:
        description: 'Reason for manual trigger'
        required: true
        default: ''
  schedule:
    - cron: '30 22 * * *' # Runs at 10:30pm UTC every day

env:
  N_PROCESSES: 10 # Global configuration for number of parallel processes for evaluation

jobs:
  run-integration-tests:
    if: github.event.label.name == 'integration-test' || github.event_name == 'workflow_dispatch' || github.event_name == 'schedule'
    runs-on: blacksmith-4vcpu-ubuntu-2204
    permissions:
      contents: "read"
      id-token: "write"
      pull-requests: "write"
      issues: "write"
    strategy:
      matrix:
        python-version: ["3.12"]
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Install poetry via pipx
        run: pipx install poetry

      - name: Set up Python
        uses: useblacksmith/setup-python@v6
        with:
          python-version: ${{ matrix.python-version }}
          cache: "poetry"

      - name: Setup Node.js
        uses: useblacksmith/setup-node@v5
        with:
          node-version: '22.x'

      - name: Comment on PR if 'integration-test' label is present
        if: github.event_name == 'pull_request' && github.event.label.name == 'integration-test'
        uses: KeisukeYamashita/create-comment@v1
        with:
          unique: false
          comment: |
            Hi! I started running the integration tests on your PR. You will receive a comment with the results shortly.

      - name: Install Python dependencies using Poetry
        run: poetry install --with dev,test,runtime,evaluation

      - name: Configure config.toml for testing with Haiku
        env:
          LLM_MODEL: "litellm_proxy/claude-3-5-haiku-20241022"
          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
          LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
          MAX_ITERATIONS: 10
        run: |
          echo "[llm.eval]" > config.toml
          echo "model = \"$LLM_MODEL\"" >> config.toml
          echo "api_key = \"$LLM_API_KEY\"" >> config.toml
          echo "base_url = \"$LLM_BASE_URL\"" >> config.toml
          echo "temperature = 0.0" >> config.toml

      - name: Build environment
        run: make build

      - name: Run integration test evaluation for Haiku
        env:
          SANDBOX_FORCE_REBUILD_RUNTIME: True
        run: |
          poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' 10 $N_PROCESSES '' 'haiku_run'

          # Get the integration tests report
          REPORT_FILE_HAIKU=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/*haiku*_maxiter_10_N* -name "report.md" -type f | head -n 1)
          echo "REPORT_FILE: $REPORT_FILE_HAIKU"
          # Export the report as a multiline env var via the GITHUB_ENV heredoc syntax
          echo "INTEGRATION_TEST_REPORT_HAIKU<<EOF" >> $GITHUB_ENV
          cat $REPORT_FILE_HAIKU >> $GITHUB_ENV
          echo >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

      - name: Wait a little bit
        run: sleep 10

      - name: Configure config.toml for testing with DeepSeek
        env:
          LLM_MODEL: "litellm_proxy/deepseek-chat"
          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
          LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
          MAX_ITERATIONS: 10
        run: |
          echo "[llm.eval]" > config.toml
          echo "model = \"$LLM_MODEL\"" >> config.toml
          echo "api_key = \"$LLM_API_KEY\"" >> config.toml
          echo "base_url = \"$LLM_BASE_URL\"" >> config.toml
          echo "temperature = 0.0" >> config.toml

      - name: Run integration test evaluation for DeepSeek
        env:
          SANDBOX_FORCE_REBUILD_RUNTIME: True
        run: |
          poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' 10 $N_PROCESSES '' 'deepseek_run'

          # Get the integration tests report
          REPORT_FILE_DEEPSEEK=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/deepseek*_maxiter_10_N* -name "report.md" -type f | head -n 1)
          echo "REPORT_FILE: $REPORT_FILE_DEEPSEEK"
          echo "INTEGRATION_TEST_REPORT_DEEPSEEK<<EOF" >> $GITHUB_ENV
          cat $REPORT_FILE_DEEPSEEK >> $GITHUB_ENV
          echo >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV
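      # For reference: the run_infer.sh calls in this workflow pass positional
      # arguments. Judging purely from the invocations here (an inference, not
      # taken from the script itself), the order appears to be:
      #   run_infer.sh <llm-config> <commit> <agent> <eval-limit> <max-iterations> <num-workers> <eval-ids> <run-suffix>
      # Verify against evaluation/integration_tests/scripts/run_infer.sh before relying on it.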
      # -------------------------------------------------------------
      # Run VisualBrowsingAgent tests for DeepSeek, limited to t05 and t06
      - name: Wait a little bit (again)
        run: sleep 5

      - name: Configure config.toml for testing VisualBrowsingAgent (DeepSeek)
        env:
          LLM_MODEL: "litellm_proxy/deepseek-chat"
          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
          LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
          MAX_ITERATIONS: 15
        run: |
          echo "[llm.eval]" > config.toml
          echo "model = \"$LLM_MODEL\"" >> config.toml
          echo "api_key = \"$LLM_API_KEY\"" >> config.toml
          echo "base_url = \"$LLM_BASE_URL\"" >> config.toml
          echo "temperature = 0.0" >> config.toml

      - name: Run integration test evaluation for VisualBrowsingAgent (DeepSeek)
        env:
          SANDBOX_FORCE_REBUILD_RUNTIME: True
        run: |
          poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD VisualBrowsingAgent '' 15 $N_PROCESSES "t05_simple_browsing,t06_github_pr_browsing.py" 'visualbrowsing_deepseek_run'

          # Find and export the VisualBrowsingAgent test results
          REPORT_FILE_VISUALBROWSING_DEEPSEEK=$(find evaluation/evaluation_outputs/outputs/integration_tests/VisualBrowsingAgent/deepseek*_maxiter_15_N* -name "report.md" -type f | head -n 1)
          echo "REPORT_FILE_VISUALBROWSING_DEEPSEEK: $REPORT_FILE_VISUALBROWSING_DEEPSEEK"
          echo "INTEGRATION_TEST_REPORT_VISUALBROWSING_DEEPSEEK<<EOF" >> $GITHUB_ENV
          cat $REPORT_FILE_VISUALBROWSING_DEEPSEEK >> $GITHUB_ENV
          echo >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

      - name: Create archive of evaluation outputs
        run: |
          TIMESTAMP=$(date +'%y-%m-%d-%H-%M')
          cd evaluation/evaluation_outputs/outputs # Change to the outputs directory
          tar -czvf ../../../integration_tests_${TIMESTAMP}.tar.gz integration_tests/CodeActAgent/* integration_tests/VisualBrowsingAgent/* # Only include the actual result directories

      - name: Upload evaluation results as artifact
        uses: actions/upload-artifact@v4
        id: upload_results_artifact
        with:
          name: integration-test-outputs-${{ github.run_id }}-${{ github.run_attempt }}
          path: integration_tests_*.tar.gz

      - name: Get artifact URLs
        run: |
          echo "ARTIFACT_URL=${{ steps.upload_results_artifact.outputs.artifact-url }}" >> $GITHUB_ENV

      - name: Set timestamp and trigger reason
        run: |
          echo "TIMESTAMP=$(date +'%Y-%m-%d-%H-%M')" >> $GITHUB_ENV
          if [[ "${{ github.event_name }}" == "pull_request" ]]; then
            echo "TRIGGER_REASON=pr-${{ github.event.pull_request.number }}" >> $GITHUB_ENV
          elif [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then
            echo "TRIGGER_REASON=manual-${{ github.event.inputs.reason }}" >> $GITHUB_ENV
          else
            echo "TRIGGER_REASON=nightly-scheduled" >> $GITHUB_ENV
          fi

      - name: Comment with results and artifact link
        id: create_comment
        uses: KeisukeYamashita/create-comment@v1
        with:
          # If triggered by a PR, use the PR number; otherwise fall back to issue 9745 for manual triggers
          number: ${{ github.event_name == 'pull_request' && github.event.pull_request.number || 9745 }}
          unique: false
          comment: |
            Triggered by: ${{ github.event_name == 'pull_request' && format('Pull Request (integration-test label on PR #{0})', github.event.pull_request.number) || (github.event_name == 'workflow_dispatch' && format('Manual Trigger: {0}', github.event.inputs.reason)) || 'Nightly Scheduled Run' }}
            Commit: ${{ github.sha }}
            **Integration Tests Report (Haiku)**
            Haiku LLM Test Results:
            ${{ env.INTEGRATION_TEST_REPORT_HAIKU }}
            ---
            **Integration Tests Report (DeepSeek)**
            DeepSeek LLM Test Results:
            ${{ env.INTEGRATION_TEST_REPORT_DEEPSEEK }}
            ---
            **Integration Tests Report VisualBrowsing (DeepSeek)**
            ${{ env.INTEGRATION_TEST_REPORT_VISUALBROWSING_DEEPSEEK }}
            ---
            Download testing outputs (includes Haiku, DeepSeek, and VisualBrowsingAgent results): [Download](${{ steps.upload_results_artifact.outputs.artifact-url }})
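
# Note on the report-export steps above: they use GitHub Actions' documented
# multiline-value syntax for $GITHUB_ENV. A minimal sketch of the pattern, with
# VAR_NAME and report.md as placeholder names:
#
#   echo "VAR_NAME<<EOF" >> $GITHUB_ENV
#   cat report.md >> $GITHUB_ENV
#   echo "EOF" >> $GITHUB_ENV
#
# Everything between the "VAR_NAME<<EOF" line and the closing "EOF" line becomes
# the value of env.VAR_NAME in later steps, e.g. ${{ env.VAR_NAME }}.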