mirror of
https://github.com/All-Hands-AI/OpenHands.git
synced 2026-04-29 03:00:45 -04:00
Compare commits
31 Commits
prd/org-co
...
fix-webare
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
11c191338e | ||
|
|
fad1c5375a | ||
|
|
c66178f85d | ||
|
|
39e25a86ff | ||
|
|
9d671f2e22 | ||
|
|
45a2b8c131 | ||
|
|
5c627c4ef4 | ||
|
|
dee201e3c1 | ||
|
|
872c54aa22 | ||
|
|
a375307e12 | ||
|
|
4fb6b8e683 | ||
|
|
09531c8cbe | ||
|
|
16a85bd507 | ||
|
|
f37f9a7ae6 | ||
|
|
e2e730f89f | ||
|
|
f86df8a717 | ||
|
|
3e4e75ab6a | ||
|
|
cddb4e0afd | ||
|
|
674f9bb191 | ||
|
|
7e51bbab0c | ||
|
|
1f974773e5 | ||
|
|
4a91aad9d6 | ||
|
|
65451dc929 | ||
|
|
ecdd457aa5 | ||
|
|
35d5723059 | ||
|
|
79f8bb0710 | ||
|
|
300fbf96a6 | ||
|
|
873bc537ca | ||
|
|
e5b0b90bef | ||
|
|
bd07bf6fa9 | ||
|
|
ecc119e6d6 |
212
evaluation/benchmarks/webarena/IMPLEMENTATION_PLAN.md
Normal file
212
evaluation/benchmarks/webarena/IMPLEMENTATION_PLAN.md
Normal file
@@ -0,0 +1,212 @@
|
||||
# WebArena CDP Integration Implementation Plan
|
||||
|
||||
## Overview
|
||||
|
||||
This document outlines the proper solution for integrating OpenHands with the official WebArena evaluation harness using Chrome DevTools Protocol (CDP) session logging.
|
||||
|
||||
## The Problem
|
||||
|
||||
WebArena evaluators require:
|
||||
1. Live browser state (DOM, cookies, localStorage, etc.)
|
||||
2. CDPSession object for making CDP calls
|
||||
3. Page object for accessing current URL, title, content
|
||||
|
||||
OpenHands only provides:
|
||||
1. Action/observation pairs in text format
|
||||
2. No live browser state
|
||||
3. No CDP access during evaluation
|
||||
|
||||
## The Solution: CDP Session Logging
|
||||
|
||||
### Phase 1: Capture Browser State During Inference
|
||||
|
||||
**Modify `openhands/runtime/browser/browser_env.py`:**
|
||||
|
||||
```python
|
||||
class BrowserEnv:
|
||||
def __init__(self, ...):
|
||||
# ... existing code ...
|
||||
self.cdp_logger = CDPSessionLogger() if should_log_cdp() else None
|
||||
|
||||
def step(self, action):
|
||||
# ... existing action execution ...
|
||||
|
||||
# Log CDP state after each action
|
||||
if self.cdp_logger:
|
||||
self.cdp_logger.capture_state_snapshot(f"after_action_{action.action}")
|
||||
|
||||
# ... return observation ...
|
||||
|
||||
def close(self):
|
||||
# Save final CDP session
|
||||
if self.cdp_logger:
|
||||
instance_id = get_current_instance_id() # from evaluation context
|
||||
self.cdp_logger.save_session(instance_id)
|
||||
```
|
||||
|
||||
**Add CDP Logger Integration:**
|
||||
|
||||
```python
|
||||
class CDPSessionLogger:
|
||||
def attach_to_browsergym_env(self, env):
|
||||
"""Attach to BrowserGym environment's Playwright page."""
|
||||
# Access the underlying Playwright page from BrowserGym
|
||||
playwright_page = env.page # or however BrowserGym exposes it
|
||||
self.attach_to_page(playwright_page)
|
||||
|
||||
def capture_state_snapshot(self, trigger: str):
|
||||
"""Capture complete browser state using CDP."""
|
||||
# DOM snapshot (key for WebArena evaluators)
|
||||
dom_snapshot = self.cdp_session.send("DOMSnapshot.captureSnapshot", {
|
||||
"computedStyles": [],
|
||||
"includeDOMRects": True,
|
||||
"includePaintOrder": True,
|
||||
})
|
||||
|
||||
# All other state (cookies, localStorage, etc.)
|
||||
# ... as shown in POC ...
|
||||
```
|
||||
|
||||
### Phase 2: Mock Objects for Evaluation
|
||||
|
||||
**Create Mock Page/CDPSession:**
|
||||
|
||||
```python
|
||||
class MockCDPSession:
|
||||
def __init__(self, saved_state):
|
||||
self.saved_state = saved_state
|
||||
|
||||
def send(self, method: str, params=None):
|
||||
"""Return saved state instead of making live CDP calls."""
|
||||
if method == "DOMSnapshot.captureSnapshot":
|
||||
return self.saved_state["dom_snapshot"]
|
||||
elif method == "Network.getAllCookies":
|
||||
return self.saved_state["cookies"]
|
||||
# ... handle all CDP methods WebArena uses ...
|
||||
|
||||
class MockPage:
|
||||
def __init__(self, saved_state):
|
||||
self.saved_state = saved_state
|
||||
|
||||
def url(self): return self.saved_state["final_url"]
|
||||
def title(self): return self.saved_state["final_title"]
|
||||
def context(self): return MockBrowserContext(self.saved_state)
|
||||
# ... implement all Page methods WebArena uses ...
|
||||
```
|
||||
|
||||
### Phase 3: Updated Evaluation Script
|
||||
|
||||
**Modify `eval_infer.py`:**
|
||||
|
||||
```python
|
||||
def evaluate_with_official_webarena_harness(instance_data, config_file):
|
||||
"""Use official WebArena evaluators with saved CDP state."""
|
||||
|
||||
# Load saved CDP session
|
||||
cdp_integration = WebArenaCDPIntegration()
|
||||
mock_page, mock_client = cdp_integration.create_mock_page_and_client(
|
||||
instance_data["instance_id"]
|
||||
)
|
||||
|
||||
# Convert OpenHands trajectory to WebArena format
|
||||
trajectory = convert_openhands_trajectory_to_webarena_format(instance_data)
|
||||
|
||||
# Use official WebArena evaluator with mock objects
|
||||
evaluator = evaluator_router(config_file)
|
||||
score = evaluator(
|
||||
trajectory=trajectory,
|
||||
config_file=config_file,
|
||||
page=mock_page, # Mock page with saved state
|
||||
client=mock_client, # Mock CDP session with saved state
|
||||
)
|
||||
|
||||
return score
|
||||
```
|
||||
|
||||
## Implementation Steps
|
||||
|
||||
### Step 1: Integrate CDP Logger into BrowserEnv
|
||||
|
||||
1. **Add CDP logging to `browser_env.py`:**
|
||||
- Detect when running WebArena evaluation
|
||||
- Attach CDP logger to BrowserGym's Playwright page
|
||||
- Capture state snapshots after each action
|
||||
- Save final session with instance ID
|
||||
|
||||
2. **Environment variable setup:**
|
||||
```bash
|
||||
export WEBARENA_CDP_LOGGING=true
|
||||
export WEBARENA_CDP_SESSION_DIR=/tmp/cdp_sessions
|
||||
```
|
||||
|
||||
### Step 2: Create Mock Objects
|
||||
|
||||
1. **Implement `MockCDPSession`:**
|
||||
- Handle all CDP methods WebArena evaluators use
|
||||
- Return saved state instead of making live calls
|
||||
- Support `DOMSnapshot.captureSnapshot`, `Network.getAllCookies`, etc.
|
||||
|
||||
2. **Implement `MockPage`:**
|
||||
- Provide saved URL, title, content
|
||||
- Mock JavaScript evaluation with saved state
|
||||
- Support element queries using DOM snapshot
|
||||
|
||||
### Step 3: Update Evaluation Pipeline
|
||||
|
||||
1. **Modify `run_infer.py`:**
|
||||
- Enable CDP logging for WebArena tasks
|
||||
- Ensure instance IDs are properly set
|
||||
- Save CDP sessions to accessible location
|
||||
|
||||
2. **Update `eval_infer.py`:**
|
||||
- Load saved CDP sessions
|
||||
- Create mock objects
|
||||
- Use official WebArena evaluators
|
||||
- Remove all heuristic evaluation logic
|
||||
|
||||
### Step 4: Testing and Validation
|
||||
|
||||
1. **Test with known tasks:**
|
||||
- Run inference with CDP logging
|
||||
- Verify CDP sessions are saved correctly
|
||||
- Test evaluation with mock objects
|
||||
- Compare results with expected outcomes
|
||||
|
||||
2. **Validate DOM snapshot format:**
|
||||
- Ensure saved DOM snapshots match WebArena expectations
|
||||
- Test all CDP methods used by evaluators
|
||||
- Verify JavaScript evaluation works correctly
|
||||
|
||||
## Benefits of This Approach
|
||||
|
||||
1. **✅ Uses Official WebArena Evaluation:** No heuristics or approximations
|
||||
2. **✅ Preserves Exact Browser State:** DOM, cookies, localStorage, etc.
|
||||
3. **✅ No Live Browser Needed:** Evaluation works offline with saved state
|
||||
4. **✅ Scalable:** Can evaluate many instances without browser overhead
|
||||
5. **✅ Accurate:** Evaluators get exactly the state they expect
|
||||
|
||||
## File Structure
|
||||
|
||||
```
|
||||
/tmp/cdp_sessions/
|
||||
├── webarena.1.json # CDP session for task 1
|
||||
├── webarena.2.json # CDP session for task 2
|
||||
├── webarena.3.json # CDP session for task 3
|
||||
└── webarena.4.json # CDP session for task 4
|
||||
|
||||
evaluation/benchmarks/webarena/
|
||||
├── run_infer.py # Modified to enable CDP logging
|
||||
├── eval_infer.py # Uses mock objects with saved state
|
||||
├── cdp_integration.py # Mock Page/CDPSession implementation
|
||||
└── IMPLEMENTATION_PLAN.md # This document
|
||||
```
|
||||
|
||||
## Next Steps
|
||||
|
||||
1. **Implement CDP logger integration in `browser_env.py`**
|
||||
2. **Create comprehensive mock objects**
|
||||
3. **Update evaluation scripts**
|
||||
4. **Test with actual WebArena tasks**
|
||||
5. **Validate results against expected outcomes**
|
||||
|
||||
This approach solves the fundamental problem: WebArena evaluators need live browser state, but OpenHands only provides action/observation pairs. By capturing and replaying the exact browser state, we can use the official WebArena evaluation harness without any compromises.
|
||||
@@ -6,11 +6,21 @@ This folder contains evaluation for [WebArena](https://github.com/web-arena-x/we
|
||||
|
||||
Please follow the instructions [here](../../README.md#setup) to set up your local development environment and LLM.
|
||||
|
||||
Make sure to install the evaluation dependencies:
|
||||
|
||||
```bash
|
||||
poetry install --with evaluation
|
||||
```
|
||||
|
||||
## Setup WebArena Environment
|
||||
|
||||
WebArena requires you to set up websites containing pre-populated content that is accessible via URL to the machine running the OpenHands agents.
|
||||
Follow [this document](https://github.com/web-arena-x/webarena/blob/main/environment_docker/README.md) to set up your own WebArena environment through local servers or AWS EC2 instances.
|
||||
Take note of the base URL (`$WEBARENA_BASE_URL`) of the machine where the environment is installed.
|
||||
WebArena requires access to websites containing pre-populated content. You can either:
|
||||
|
||||
1. **Use an existing WebArena environment** (recommended for evaluation): Set the `WEBARENA_BASE_URL` environment variable to point to an existing WebArena server.
|
||||
|
||||
2. **Set up your own environment**: Follow [this document](https://github.com/web-arena-x/webarena/blob/main/environment_docker/README.md) to set up your own WebArena environment through local servers or AWS EC2 instances.
|
||||
|
||||
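The WebArena evaluation package is already installed with the evaluation dependencies, so you don't need to clone the WebArena repository separately to run inference. (Evaluating the results with `eval_infer.py`, described in Step 2 below, does require a local clone of the official WebArena repository.)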
|
||||
|
||||
## Test if your environment works
|
||||
|
||||
@@ -21,20 +31,51 @@ Follow the WebArena environment setup guide carefully, and make sure the URL fie
|
||||
|
||||
## Run Evaluation
|
||||
|
||||
### Step 1: Run Inference
|
||||
Before running, you must provide an LLM config in a local config.toml and pass its name to run_infer.sh:
|
||||
|
||||
1) Create config.toml in the repo root (this file is gitignored):
|
||||
|
||||
```toml
|
||||
[llm.eval_openai]
|
||||
model = "gpt-4o"
|
||||
api_key = "sk-..." # Your OpenAI API key
|
||||
```
|
||||
|
||||
2) Ensure Docker is installed and running (the first run will build a browser-enabled runtime image).
|
||||
|
||||
|
||||
```bash
|
||||
export WEBARENA_BASE_URL=<YOUR_SERVER_URL_HERE>
|
||||
export OPENAI_API_KEY="yourkey" # this key is required for some WebArena validators that utilize LLMs
|
||||
bash evaluation/benchmarks/webarena/scripts/run_infer.sh
|
||||
# args: MODEL_CONFIG COMMIT_HASH AGENT EVAL_LIMIT NUM_WORKERS
|
||||
bash evaluation/benchmarks/webarena/scripts/run_infer.sh llm.eval_openai HEAD BrowsingAgent 3 1
|
||||
```
|
||||
|
||||
Results will be in `evaluation/evaluation_outputs/outputs/webarena/`
|
||||
|
||||
To calculate the success rate, run:
|
||||
### Step 2: Evaluate Results
|
||||
|
||||
```sh
|
||||
poetry run python evaluation/benchmarks/webarena/get_success_rate.py evaluation/evaluation_outputs/outputs/webarena/SOME_AGENT/EXP_NAME/output.jsonl
|
||||
To evaluate the results and calculate success rate using the official WebArena harness, you must have the official WebArena repo and its Python dependencies available locally:
|
||||
|
||||
1) Clone the official repo and install deps (one-time):
|
||||
|
||||
```bash
|
||||
cd /workspace/project
|
||||
git clone https://github.com/web-arena-x/webarena
|
||||
cd webarena && pip install -e .
|
||||
```
|
||||
|
||||
2) Then run the evaluator:
|
||||
|
||||
```bash
|
||||
poetry run python evaluation/benchmarks/webarena/eval_infer.py evaluation/evaluation_outputs/outputs/webarena/SOME_AGENT/EXP_NAME/output.jsonl
|
||||
```
|
||||
|
||||
Notes:
|
||||
- The evaluator expects WEBARENA_BASE_URL to be set and the WebArena services to be reachable.
|
||||
- If you skip installing the official harness, you can still inspect output.jsonl manually or write your own scorer, but the script above will fail without the harness.
|
||||
|
||||
## Submit your evaluation results
|
||||
|
||||
You can start your own fork of [our huggingface evaluation outputs](https://huggingface.co/spaces/OpenHands/evaluation) and submit a PR of your evaluation results following the guide [here](https://huggingface.co/docs/hub/en/repositories-pull-requests-discussions#pull-requests-and-discussions).
|
||||
|
||||
283
evaluation/benchmarks/webarena/browsergym_state_capture.py
Normal file
283
evaluation/benchmarks/webarena/browsergym_state_capture.py
Normal file
@@ -0,0 +1,283 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
BrowserGym State Capture for WebArena Evaluation
|
||||
|
||||
This module leverages BrowserGym's existing state capture capabilities to save
|
||||
browser state for proper WebArena evaluation. BrowserGym already provides:
|
||||
- extract_dom_snapshot() - exactly what WebArena evaluators need
|
||||
- Direct Playwright page access via env.page
|
||||
- CDP session access via page.context.new_cdp_session()
|
||||
|
||||
This is much simpler than our original CDP logging approach because BrowserGym
|
||||
already has all the infrastructure we need.
|
||||
"""
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Any, Optional
|
||||
|
||||
import browsergym.core.observation as obs
|
||||
|
||||
|
||||
class BrowserGymStateCapture:
    """
    Captures browser state using BrowserGym's existing observation functions.

    The captured state is a plain dict. ``save_state`` writes it to
    ``<output_dir>/<instance_id>.json`` and ``load_state`` reads it back.
    """

    def __init__(self, output_dir: str = '/tmp/webarena_states'):
        """Remember *output_dir* and create it (with parents) if missing."""
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.current_instance_id: str | None = None

    def set_instance_id(self, instance_id: str) -> None:
        """Set the current WebArena instance ID for state saving."""
        self.current_instance_id = instance_id

    def capture_final_state(self, browsergym_env) -> dict[str, Any]:
        """
        Capture the final browser state of *browsergym_env*.

        The result combines the page URL and title, the outputs of
        BrowserGym's observation helpers (DOM snapshot, screenshot, merged
        accessibility tree, focused element) and the cookies / localStorage /
        sessionStorage fetched over CDP.

        Raises:
            RuntimeError: if *browsergym_env* has no ``page`` attribute.
        """
        if not hasattr(browsergym_env, 'page'):
            raise RuntimeError('BrowserGym environment does not have page attribute')

        page = browsergym_env.page

        # Use BrowserGym's existing observation extraction functions
        state = {
            'instance_id': self.current_instance_id,
            'final_url': page.url,
            'final_title': page.title(),
            # NOTE(review): the evaluators are expected to consume this
            # snapshot as a DOMSnapshot.captureSnapshot result - confirm the
            # format matches what they read.
            'dom_snapshot': obs.extract_dom_snapshot(page),
            # Additional state that might be useful
            'screenshot': obs.extract_screenshot(page),
            'axtree': obs.extract_merged_axtree(page),
            'focused_element': obs.extract_focused_element_bid(page),
        }

        # Get additional browser state via CDP (best effort)
        state.update(self._capture_cdp_state(page))
        return state

    def _capture_cdp_state(self, page) -> dict[str, Any]:
        """Fetch cookies, localStorage and sessionStorage through a CDP session.

        This is best effort: any failure is reported with a warning and the
        affected entries keep their empty defaults. Values captured before the
        failure are preserved, and the session is always detached.
        """
        extra: dict[str, Any] = {
            'cookies': {'cookies': []},
            'local_storage': '{}',
            'session_storage': '{}',
        }
        cdp_session = None
        try:
            cdp_session = page.context.new_cdp_session(page)

            extra['cookies'] = cdp_session.send('Network.getAllCookies')

            local_storage = cdp_session.send(
                'Runtime.evaluate',
                {'expression': 'JSON.stringify(localStorage)', 'returnByValue': True},
            )
            extra['local_storage'] = local_storage.get('result', {}).get('value', '{}')

            session_storage = cdp_session.send(
                'Runtime.evaluate',
                {'expression': 'JSON.stringify(sessionStorage)', 'returnByValue': True},
            )
            extra['session_storage'] = session_storage.get('result', {}).get(
                'value', '{}'
            )
        except Exception as e:
            print(f'Warning: Could not capture additional state via CDP: {e}')
        finally:
            # Detach even when a CDP call failed so the session is not leaked.
            if cdp_session is not None:
                try:
                    cdp_session.detach()
                except Exception as e:
                    print(f'Warning: Could not detach CDP session: {e}')

        return extra

    def save_state(self, browsergym_env) -> str:
        """Save the current browser state to disk and return the file path.

        Raises:
            RuntimeError: if no instance ID has been set.
        """
        if self.current_instance_id is None:
            raise RuntimeError('Instance ID not set. Call set_instance_id() first.')

        state = self.capture_final_state(browsergym_env)

        state_file = self.output_dir / f'{self.current_instance_id}.json'
        with open(state_file, 'w', encoding='utf-8') as f:
            # NOTE(review): default=str stringifies every value json cannot
            # encode (presumably the screenshot) - confirm the stored form is
            # actually usable by consumers.
            json.dump(state, f, indent=2, default=str)

        print(f'✅ Saved browser state to: {state_file}')
        return str(state_file)

    def load_state(self, instance_id: str) -> dict[str, Any]:
        """Load saved browser state from disk.

        Raises:
            FileNotFoundError: if no state file exists for *instance_id*.
        """
        state_file = self.output_dir / f'{instance_id}.json'

        if not state_file.exists():
            raise FileNotFoundError(f'State file not found: {state_file}')

        with open(state_file, 'r', encoding='utf-8') as f:
            return json.load(f)
|
||||
|
||||
|
||||
class _CallableStr(str):
    """A ``str`` that returns itself when called.

    Lets ``page.url`` behave as an attribute while keeping legacy
    ``page.url()`` calls working.
    """

    def __call__(self) -> str:
        return str(self)


class MockPageForWebArena:
    """
    Mock Page object that provides saved browser state for WebArena evaluation.

    It serves the URL, title, browser context and a small set of JavaScript
    expressions from the ``saved_state`` dict instead of a live browser.
    """

    def __init__(self, saved_state: dict[str, Any]):
        """Store *saved_state* and derive URL, title and mock context from it."""
        self.saved_state = saved_state
        self._url = saved_state.get('final_url', '')
        self._title = saved_state.get('final_title', '')
        self._context = MockBrowserContextForWebArena(saved_state)

    @property
    def url(self) -> str:
        """Saved final URL.

        Exposed as an attribute because the capture code in this file reads a
        live page's URL as ``page.url``. The returned string is also callable,
        so existing ``page.url()`` callers keep working.
        """
        value = self._url
        # Only wrap real strings; a non-string saved value is passed through.
        return _CallableStr(value) if isinstance(value, str) else value

    def title(self) -> str:
        """Return the saved page title."""
        return self._title

    @property
    def context(self):
        """Mock browser context built from the same saved state."""
        return self._context

    def evaluate(self, expression: str) -> Any:
        """Mock JavaScript evaluation using saved state.

        Matching is by substring, checked in this order: URL, title,
        localStorage, sessionStorage. Any other expression yields ``None``.
        """
        if 'window.location.href' in expression:
            return self._url
        elif 'document.title' in expression:
            return self._title
        elif 'localStorage' in expression:
            return self.saved_state.get('local_storage', '{}')
        elif 'sessionStorage' in expression:
            return self.saved_state.get('session_storage', '{}')
        return None
|
||||
|
||||
|
||||
class MockCDPSessionForWebArena:
    """
    Stand-in for a CDP session that answers from a saved-state dict.

    No browser is contacted; every ``send`` call is resolved from the
    ``saved_state`` mapping handed to the constructor.
    """

    def __init__(self, saved_state: dict[str, Any]):
        self.saved_state = saved_state

    def send(self, method: str, params: Optional[dict] = None) -> dict[str, Any]:
        """
        Answer a CDP call from the saved state.

        Supported calls are ``DOMSnapshot.captureSnapshot``,
        ``Network.getAllCookies`` and ``Runtime.evaluate``. For the latter the
        expression is matched by substring against localStorage,
        sessionStorage, ``window.location.href`` and ``document.title``, in
        that order. Anything else returns an empty dict.
        """
        stored = self.saved_state

        if method == 'DOMSnapshot.captureSnapshot':
            return stored.get('dom_snapshot', {})

        if method == 'Network.getAllCookies':
            return stored.get('cookies', {'cookies': []})

        if method != 'Runtime.evaluate' or not params or 'expression' not in params:
            return {}

        script = params['expression']
        # (substring to look for, saved-state key, value when the key is absent)
        lookups = (
            ('localStorage', 'local_storage', '{}'),
            ('sessionStorage', 'session_storage', '{}'),
            ('window.location.href', 'final_url', ''),
            ('document.title', 'final_title', ''),
        )
        for marker, state_key, fallback in lookups:
            if marker in script:
                return {'result': {'value': stored.get(state_key, fallback)}}

        return {}

    def detach(self):
        """No-op counterpart of a real session's detach."""
        return None
|
||||
|
||||
|
||||
class MockBrowserContextForWebArena:
    """Browser-context stand-in whose only job is to hand out mock CDP sessions."""

    def __init__(self, saved_state: dict[str, Any]):
        self.saved_state = saved_state

    def new_cdp_session(self, page) -> MockCDPSessionForWebArena:
        """Build a fresh mock CDP session over the stored state.

        *page* is accepted for signature compatibility and is not used.
        """
        session = MockCDPSessionForWebArena(self.saved_state)
        return session
|
||||
|
||||
|
||||
def integrate_with_openhands_browser_env() -> str:
    """
    Integration point for OpenHands browser_env.py.
    This shows how to add state capture to the existing BrowserGym usage.

    Nothing is executed here: the function only returns an example snippet,
    as a string, illustrating where BrowserGymStateCapture would be hooked in.
    """

    # This would be added to browser_env.py in the browser_process method
    example_integration = """
def browser_process(self) -> None:
    env = gym.make('browsergym/openended', ...)
    obs, info = env.reset()

    # Add state capture for WebArena evaluation
    state_capture = None
    if os.getenv('WEBARENA_EVALUATION'):
        state_capture = BrowserGymStateCapture()

    while should_continue():
        if self.browser_side.poll(timeout=0.01):
            unique_request_id, action_data = self.browser_side.recv()

            # Handle WebArena instance ID setting
            if unique_request_id == 'SET_WEBARENA_INSTANCE':
                if state_capture:
                    state_capture.set_instance_id(action_data['instance_id'])
                continue

            action = action_data['action']
            obs, reward, terminated, truncated, info = env.step(action)

            # Capture final state when task completes
            if terminated and state_capture:
                state_capture.save_state(env)

            # ... rest of existing code ...
"""

    return example_integration
|
||||
|
||||
|
||||
def demonstrate_integration():
    """Print a short overview of the state-capture approach and its integration steps."""
    banner = (
        '🚀 BrowserGym State Capture for WebArena',
        '=' * 50,
    )
    advantages = (
        '✅ Key advantages of this approach:',
        " 1. Uses BrowserGym's existing observation functions",
        ' 2. extract_dom_snapshot() already returns WebArena-compatible format',
        ' 3. No custom CDP logging needed',
        ' 4. Minimal changes to OpenHands browser_env.py',
        ' 5. Leverages existing, tested BrowserGym infrastructure',
    )
    steps = (
        '\n📋 Integration steps:',
        ' 1. Add BrowserGymStateCapture to browser_env.py',
        ' 2. Capture state when WebArena tasks complete',
        ' 3. Use MockPageForWebArena and MockCDPSessionForWebArena in eval_infer.py',
        ' 4. Official WebArena evaluators work with mock objects',
    )
    closing = (
        '\n🎯 This is much simpler than custom CDP logging because',
        ' BrowserGym already provides everything we need!',
    )

    # One print per message, in the original order.
    for section in (banner, advantages, steps, closing):
        for message in section:
            print(message)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
demonstrate_integration()
|
||||
359
evaluation/benchmarks/webarena/eval_infer.py
Normal file
359
evaluation/benchmarks/webarena/eval_infer.py
Normal file
@@ -0,0 +1,359 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
WebArena evaluation script for OpenHands outputs using official WebArena evaluation harness.
|
||||
This script evaluates the results from run_infer.py using the official WebArena evaluation code.
|
||||
|
||||
This script requires:
|
||||
1. Official WebArena repository cloned to /workspace/project/webarena
|
||||
2. WebArena environment variables properly configured
|
||||
3. Authentication files set up for WebArena sites
|
||||
4. Docker containers running for WebArena sites
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from typing import Any
|
||||
|
||||
# Set up environment variables for WebArena.
# Each site variable is derived from WEBARENA_BASE_URL plus a fixed port and
# is exported before the harness modules are imported below.
WEBARENA_BASE_URL = os.environ.get('WEBARENA_BASE_URL', '')
if WEBARENA_BASE_URL:
    # Drop a trailing slash so the result is "http://host:9999" rather than
    # "http://host/:9999".
    _site_root = WEBARENA_BASE_URL.rstrip('/')
    for _site_name, _site_port in (
        ('REDDIT', 9999),
        ('SHOPPING', 7770),
        ('SHOPPING_ADMIN', 7780),
        ('GITLAB', 8023),
        ('WIKIPEDIA', 8888),
        ('MAP', 3000),
        ('HOMEPAGE', 4399),
    ):
        os.environ[_site_name] = f'{_site_root}:{_site_port}'

# Add the webarena path to sys.path to import its modules.
# The location can be overridden with the WEBARENA_PATH environment variable;
# the default is the path used in the README.
WEBARENA_PATH = os.environ.get('WEBARENA_PATH', '/workspace/project/webarena')
sys.path.insert(0, WEBARENA_PATH)

try:
    from browser_env import ScriptBrowserEnv, create_stop_action
    from browser_env.actions import Action
    from browser_env.utils import StateInfo
    from evaluation_harness import evaluator_router

    print('✅ WebArena evaluation harness imported successfully')
except ImportError as e:
    # The script cannot do anything useful without the official harness.
    print(f'❌ Failed to import WebArena evaluation harness: {e}')
    print(f'Make sure the WebArena repository is cloned to {WEBARENA_PATH}')
    print('and all dependencies are installed.')
    sys.exit(1)
|
||||
|
||||
|
||||
def load_config_file(config_path: str) -> dict[str, Any]:
    """Load a WebArena task config file and return its parsed JSON content.

    The file is read as UTF-8 rather than the platform default encoding, so
    non-ASCII task text decodes the same way on every system.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    with open(config_path, 'r', encoding='utf-8') as f:
        return json.load(f)
|
||||
|
||||
|
||||
def convert_openhands_action_to_webarena(action_data: dict[str, Any]) -> Action:
    """Translate one OpenHands action record into a WebArena ``Action``.

    Reads ``action_data['action']`` for the kind and ``action_data['args']``
    for its parameters. Recognised kinds are ``browse`` (needs a non-empty
    ``url``), ``click``, ``type``, ``key``, ``scroll`` and ``finish``. Anything
    else, including ``browse`` without a URL, becomes a ``none`` action.

    NOTE(review): ``finish`` is mapped to ``create_stop_action('')``, i.e. with
    an empty answer. Presumably the agent's final answer should be forwarded
    here - confirm against what the evaluators read.
    """
    kind = action_data.get('action', '')
    params = action_data.get('args', {})

    if kind == 'browse':
        target = params.get('url', '')
        if target:
            return Action(action_type='goto', coordinate=[0, 0], text=target)
        # An empty URL falls through to the default action below.

    elif kind == 'click':
        return Action(action_type='click', coordinate=params.get('coordinate', [0, 0]))

    elif kind == 'type':
        return Action(action_type='type', text=params.get('text', ''), coordinate=[0, 0])

    elif kind == 'key':
        return Action(action_type='key', text=params.get('key', ''), coordinate=[0, 0])

    elif kind == 'scroll':
        position = params.get('coordinate', [0, 0])
        heading = params.get('direction', 'down')
        return Action(action_type='scroll', coordinate=position, text=heading)

    elif kind == 'finish':
        return create_stop_action('')

    # Default fallback for unknown actions
    return Action(action_type='none', coordinate=[0, 0])
|
||||
|
||||
|
||||
def convert_openhands_trajectory_to_webarena_format(
    openhands_output: dict[str, Any],
) -> list[Any]:
    """
    Build a WebArena-style trajectory from an OpenHands output record.

    The result starts with a placeholder ``StateInfo`` and then, for every
    entry of ``openhands_output['history']`` that has at least two items,
    appends the converted action followed by a ``StateInfo`` built from the
    observation's ``content`` and ``extras``. Shorter entries are skipped.

    NOTE(review): assumes each history entry is an ``[action, observation]``
    pair of dicts - verify against the actual output.jsonl schema.
    """
    steps = [
        StateInfo(
            observation={'text': 'Initial state'}, info={'observation_metadata': {}}
        )
    ]

    for entry in openhands_output.get('history', []):
        if len(entry) < 2:
            continue

        raw_action, raw_observation = entry[0], entry[1]

        steps.append(convert_openhands_action_to_webarena(raw_action))
        steps.append(
            StateInfo(
                observation={'text': raw_observation.get('content', '')},
                info={'observation_metadata': raw_observation.get('extras', {})},
            )
        )

    return steps
|
||||
|
||||
|
||||
def _is_replayable_action(step: Any) -> bool:
    """Return True when *step* is an action entry of the trajectory."""
    try:
        return isinstance(step, Action)
    except TypeError:
        # NOTE(review): upstream WebArena appears to declare Action as a
        # TypedDict, which rejects isinstance() - confirm. In that case fall
        # back to a structural check on the action_type key.
        return isinstance(step, dict) and 'action_type' in step


def evaluate_with_official_webarena_harness(
    instance_data: dict[str, Any], config_file_path: str
) -> dict[str, Any]:
    """
    Evaluate a single WebArena instance using the official evaluation harness.

    This function:
    1. Converts OpenHands trajectory to WebArena format
    2. Sets up a browser environment
    3. Replays the trajectory to reach the final state
    4. Runs the official WebArena evaluator

    Args:
        instance_data: one record of the OpenHands output file.
        config_file_path: path to the WebArena task config for this instance.

    Returns:
        A result dict with ``instance_id``, ``score`` and ``success`` plus
        metadata. Any exception is caught and reported as a score of 0.0 with
        an ``error`` entry, so this function does not raise.
    """

    instance_id = instance_data.get('instance_id', 'unknown')
    print(f'\n🔍 Evaluating instance: {instance_id}')

    try:
        # Load config to understand the task
        config_data = load_config_file(config_file_path)
        intent = config_data.get('intent', '')
        start_url = config_data.get('start_url', '')

        print(f' Task: {intent}')
        print(f' Start URL: {start_url}')

        # Convert OpenHands trajectory to WebArena format
        trajectory = convert_openhands_trajectory_to_webarena_format(instance_data)
        print(f' Converted trajectory with {len(trajectory)} steps')

        # Get the evaluator for this config
        evaluator = evaluator_router(config_file_path)
        print(f' Using evaluator: {type(evaluator).__name__}')

        # Create browser environment for evaluation
        env = ScriptBrowserEnv(
            headless=True,
            slow_mo=0,
            observation_type='accessibility_tree',
            current_viewport_only=True,
            viewport_size={'width': 1280, 'height': 720},
        )

        try:
            # Initialize the environment with the task
            env.reset(options={'config_file': config_file_path})

            # Replay the trajectory to reach the final state.
            # This is necessary because the evaluator needs the actual browser state.
            for i, step in enumerate(trajectory):
                if not _is_replayable_action(step):
                    continue
                try:
                    outcome = env.step(step)
                    # Accept both (obs, reward, done, info) and the
                    # gymnasium-style (obs, reward, terminated, truncated, info).
                    # Unpacking a fixed arity would raise after the step has
                    # already run.
                    finished = bool(outcome[2]) or (
                        len(outcome) > 4 and bool(outcome[3])
                    )
                except Exception as e:
                    # Best effort: a failing step is reported and skipped.
                    print(f' Warning: Error replaying step {i}: {e}')
                    continue
                if finished:
                    break

            # Run the official evaluation
            score = evaluator(
                trajectory=trajectory,
                config_file=config_file_path,
                page=env.page,
                client=env.page.context.new_cdp_session(env.page),
            )

            result = {
                'instance_id': instance_id,
                'score': score,
                'success': score == 1.0,
                'trajectory_length': len(trajectory),
                'evaluator': type(evaluator).__name__,
                'evaluation_type': 'official_webarena_harness',
                'intent': intent,
            }

            print(
                f' Result: {"✅ PASS" if score == 1.0 else "❌ FAIL"} (score: {score})'
            )
            return result

        finally:
            # Always release the browser, even if replay or scoring failed.
            env.close()

    except Exception as e:
        print(f' ❌ Error evaluating {instance_id}: {e}')
        return {
            'instance_id': instance_id,
            'score': 0.0,
            'success': False,
            'error': str(e),
            'evaluator': 'error',
            'evaluation_type': 'error',
        }
|
||||
|
||||
|
||||
def main():
    """Score an OpenHands WebArena run with the official WebArena harness.

    Parses the command line, reads one JSON record per non-blank line of the
    OpenHands ``output.jsonl`` file, evaluates every record whose WebArena
    config file exists via ``evaluate_with_official_webarena_harness``, writes
    an aggregate JSON report to ``--results_file`` and prints a summary.

    Records whose config file is missing are reported with a score of 0.0 and
    still count towards the totals. The process exits with status 1 when
    ``WEBARENA_BASE_URL`` is not set.
    """
    parser = argparse.ArgumentParser(
        description='Evaluate WebArena results using ONLY the official WebArena evaluation harness'
    )
    parser.add_argument(
        'output_file', type=str, help='Path to OpenHands output.jsonl file'
    )
    parser.add_argument(
        '--results_file',
        type=str,
        default='webarena_official_eval_results.json',
        help='Path to save evaluation results',
    )
    parser.add_argument(
        '--config_dir',
        type=str,
        default='/workspace/project/webarena/config_files/examples',
        help='Directory containing WebArena config files',
    )

    args = parser.parse_args()

    print('🚀 Starting WebArena Evaluation with Official WebArena Harness ONLY')
    print(f'📁 Output file: {args.output_file}')
    print(f'📁 Config directory: {args.config_dir}')

    # Verify WebArena environment is properly set up
    if not WEBARENA_BASE_URL:
        print('❌ WEBARENA_BASE_URL environment variable not set')
        print('Please set WEBARENA_BASE_URL to your WebArena server URL')
        sys.exit(1)

    print(f'🌐 WebArena base URL: {WEBARENA_BASE_URL}')

    # Load OpenHands results. The encoding is explicit so that non-ASCII
    # content does not depend on the locale's default encoding.
    with open(args.output_file, 'r', encoding='utf-8') as f:
        results = [json.loads(line) for line in f if line.strip()]

    print(f'📊 Found {len(results)} instances to evaluate')

    # Evaluate each instance using ONLY official WebArena evaluation harness
    evaluation_results = []
    total_score = 0.0

    for result in results:
        instance_id = result.get('instance_id', 'unknown')

        # Accept either plain numeric id ("8") or legacy prefixed id
        # ("webarena.8"). str() guards against ids stored as JSON numbers,
        # which have no .split().
        task_num = str(instance_id).split('.')[-1]
        config_file = f'{args.config_dir}/{task_num}.json'

        if os.path.exists(config_file):
            eval_result = evaluate_with_official_webarena_harness(result, config_file)
            evaluation_results.append(eval_result)
            total_score += eval_result.get('score', 0.0)
        else:
            print(f'\n🔍 Evaluating instance: {instance_id}')
            print(f' ⚠️ Config file not found: {config_file}')
            evaluation_results.append(
                {
                    'instance_id': instance_id,
                    'score': 0.0,
                    'success': False,
                    'error': f'Config file not found: {config_file}',
                    'evaluation_type': 'config_error',
                }
            )

    # Calculate final metrics
    total_instances = len(evaluation_results)
    success_count = sum(1 for r in evaluation_results if r.get('success', False))
    success_rate = success_count / total_instances if total_instances > 0 else 0.0
    average_score = total_score / total_instances if total_instances > 0 else 0.0

    # Save results
    final_results = {
        'evaluation_method': 'official_webarena_harness_only',
        'webarena_base_url': WEBARENA_BASE_URL,
        'total_instances': total_instances,
        'success_count': success_count,
        'success_rate': success_rate,
        'average_score': average_score,
        'individual_results': evaluation_results,
    }

    with open(args.results_file, 'w', encoding='utf-8') as f:
        json.dump(final_results, f, indent=2)

    # Print summary
    print('\n' + '=' * 70)
    print('🎯 WEBARENA EVALUATION RESULTS (Official Harness ONLY)')
    print('=' * 70)
    print(f'📊 Total instances: {total_instances}')
    print(f'✅ Successful: {success_count}')
    print(f'❌ Failed: {total_instances - success_count}')
    print(f'📈 Success rate: {success_rate:.2%}')
    print(f'📊 Average score: {average_score:.4f}')
    print(f'💾 Results saved to: {args.results_file}')
    print('=' * 70)

    # Print individual results
    print('\n📋 Individual Results:')
    for result in evaluation_results:
        status = '✅ PASS' if result.get('success', False) else '❌ FAIL'
        score = result.get('score', 0.0)
        instance_id = result.get('instance_id', 'unknown')
        evaluator = result.get('evaluator', 'unknown')
        error = result.get('error', '')
        if error:
            print(f' {instance_id}: {status} (score: {score:.2f}) - Error: {error}')
        else:
            print(
                f' {instance_id}: {status} (score: {score:.2f}) - Evaluator: {evaluator}'
            )

    # Print requirements if there were errors
    error_count = sum(1 for r in evaluation_results if r.get('error'))
    if error_count > 0:
        print('\n' + '⚠️' * 20)
        print('EVALUATION ERRORS DETECTED')
        print('⚠️' * 20)
        print('This evaluation requires:')
        print('1. WebArena Docker containers running and accessible')
        print('2. Authentication files (.auth/) properly set up')
        print('3. All WebArena dependencies installed')
        print('4. Proper network access to WebArena sites')
        print('\nPlease resolve these issues for accurate evaluation.')
        print('⚠️' * 20)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
211
evaluation/benchmarks/webarena/eval_infer_new.py
Normal file
211
evaluation/benchmarks/webarena/eval_infer_new.py
Normal file
@@ -0,0 +1,211 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
WebArena Evaluation Script
|
||||
|
||||
This script evaluates WebArena task results using the official WebArena evaluation harness
|
||||
with BrowserGym state capture. It loads saved browser state and creates mock objects
|
||||
that provide the exact state WebArena evaluators need.
|
||||
|
||||
This approach leverages BrowserGym's existing observation functions (extract_dom_snapshot, etc.)
|
||||
which already provide WebArena-compatible state capture.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
# Add WebArena to path
|
||||
sys.path.insert(0, '/workspace/project/webarena')
|
||||
|
||||
|
||||
def _message_text(message: Any) -> Any:
    """Return the textual content of a history entry's ``message`` field.

    A mapping yields its ``'content'`` value (``''`` when absent), a plain
    string is returned unchanged, and anything else (including ``None``)
    yields ``''``.
    """
    if isinstance(message, dict):
        return message.get('content', '')
    if isinstance(message, str):
        return message
    return ''


def convert_openhands_trajectory_to_webarena_format(
    instance_data: dict[str, Any],
) -> list[Any]:
    """Convert an OpenHands history into a list of action/state dicts.

    Entries of ``instance_data['history']`` whose ``source`` is ``'agent'``
    become ``{'action_type': 'browser_action', 'content', 'timestamp'}`` dicts;
    entries whose ``source`` is ``'user'`` become
    ``{'observation', 'timestamp'}`` dicts. Entries with any other source are
    ignored. When the resulting list does not end with an action, a final
    ``'stop'`` action is appended.

    Args:
        instance_data: One record of the OpenHands output; only its
            ``'history'`` key is read. A missing or ``None`` history yields an
            empty trajectory.

    Returns:
        The converted trajectory, in history order.
    """
    trajectory: list[dict[str, Any]] = []

    # ``or []`` also covers an explicit ``"history": null`` in the record.
    for entry in instance_data.get('history') or []:
        source = entry.get('source')
        if source not in ('agent', 'user'):
            continue

        content = _message_text(entry.get('message'))
        timestamp = entry.get('timestamp', 0)

        if source == 'agent':
            # This is an agent action
            trajectory.append(
                {
                    'action_type': 'browser_action',
                    'content': content,
                    'timestamp': timestamp,
                }
            )
        else:
            # This might be an observation or state info
            trajectory.append({'observation': content, 'timestamp': timestamp})

    # Add a final stop action if the trajectory ends on a state entry
    if trajectory and not trajectory[-1].get('action_type'):
        last_timestamp = trajectory[-1].get('timestamp', 0)
        # Only numeric timestamps can be advanced; a non-numeric one (e.g. an
        # ISO-8601 string) is reused as-is instead of raising TypeError.
        if isinstance(last_timestamp, (int, float)):
            stop_timestamp = last_timestamp + 1
        else:
            stop_timestamp = last_timestamp
        trajectory.append(
            {
                'action_type': 'stop',
                'content': 'Task completed',
                'timestamp': stop_timestamp,
            }
        )

    return trajectory
|
||||
|
||||
|
||||
def evaluate_with_browsergym_state_capture(
    instance_data: dict[str, Any], config_file: str
) -> float:
    """Score one instance with the WebArena evaluator using saved browser state.

    The saved state for the instance is loaded through
    ``BrowserGymStateCapture`` and wrapped in ``MockPageForWebArena`` and
    ``MockCDPSessionForWebArena`` objects, which are handed to the evaluator
    returned by ``evaluator_router(config_file)`` together with the converted
    trajectory.

    Args:
        instance_data: One record of the OpenHands output; its
            ``'instance_id'`` selects the saved state to load.
        config_file: Path of the WebArena task config file.

    Returns:
        The evaluator's score, or ``0.0`` when the saved state is missing, a
        required module cannot be imported, or any other exception is raised.
    """
    try:
        # Imported lazily so that a missing module is reported as a failed
        # evaluation (handled below) rather than an import-time crash.
        from browsergym_state_capture import (
            BrowserGymStateCapture,
            MockCDPSessionForWebArena,
            MockPageForWebArena,
        )
        from evaluation_harness import evaluator_router

        task_id = instance_data.get('instance_id', 'unknown')
        capture = BrowserGymStateCapture()

        try:
            browser_state = capture.load_state(task_id)
            print(f' ✅ Loaded browser state for {task_id}')
        except FileNotFoundError:
            print(f' ❌ No saved browser state found for {task_id}')
            print(' Make sure inference was run with browser_logging_dir enabled')
            return 0.0

        # Stand-ins for the live Page / CDP session, backed by the saved state
        page_stub = MockPageForWebArena(browser_state)
        cdp_stub = MockCDPSessionForWebArena(browser_state)

        webarena_trajectory = convert_openhands_trajectory_to_webarena_format(
            instance_data
        )
        run_evaluator = evaluator_router(config_file)

        return run_evaluator(
            trajectory=webarena_trajectory,
            config_file=config_file,
            page=page_stub,
            client=cdp_stub,
        )

    except ImportError as exc:
        print(f' ❌ Could not import BrowserGym state capture: {exc}')
        print(' Make sure browsergym_state_capture.py is available')
        return 0.0
    except Exception as exc:
        print(f' ❌ Evaluation failed: {exc}')
        import traceback

        traceback.print_exc()
        return 0.0
|
||||
|
||||
|
||||
def main():
    """Evaluate every record of an OpenHands output file and print a summary.

    Expects exactly one command-line argument, the path of the ``output.jsonl``
    file. Each non-blank line is parsed as one JSON record; records whose
    config file ``<config_dir>/<instance_id>.json`` is missing are skipped and
    do not count towards the averages. The process exits with status 1 on a
    wrong argument count or a missing output file.
    """
    if len(sys.argv) != 2:
        print('Usage: python eval_infer.py <output_file>')
        sys.exit(1)

    output_file = sys.argv[1]

    if not os.path.exists(output_file):
        print(f'❌ Output file not found: {output_file}')
        sys.exit(1)

    print('🔍 WebArena Evaluation (BrowserGym State Capture)')
    print('=' * 60)

    # Load results. Blank lines (e.g. trailing newlines or manual edits) are
    # skipped instead of crashing json.loads, and the encoding is explicit so
    # the read does not depend on the locale.
    with open(output_file, 'r', encoding='utf-8') as f:
        results = [json.loads(line) for line in f if line.strip()]

    print(f'📊 Evaluating {len(results)} WebArena tasks...')

    # WebArena config files
    config_dir = Path('/workspace/project/webarena/config_files/examples')

    total_score = 0
    evaluated_count = 0

    for result in results:
        instance_id = result.get('instance_id', 'unknown')

        # Find corresponding config file
        config_file = config_dir / f'{instance_id}.json'

        if not config_file.exists():
            print(f'⚠️ Config file not found for {instance_id}')
            continue

        print(f'\n🧪 Evaluating {instance_id}...')

        try:
            # Use official WebArena evaluation with BrowserGym state capture
            score = evaluate_with_browsergym_state_capture(result, str(config_file))

            print(f' Score: {score}')
            total_score += score
            evaluated_count += 1

        except Exception as e:
            print(f' ❌ Evaluation failed: {e}')

    if evaluated_count > 0:
        average_score = total_score / evaluated_count
        print('\n📈 Results Summary:')
        print(f' Tasks evaluated: {evaluated_count}')
        print(f' Total score: {total_score}')
        print(f' Average score: {average_score:.3f}')
        print(
            f' Pass rate: {total_score}/{evaluated_count} ({100 * total_score / evaluated_count:.1f}%)'
        )
    else:
        print('\n❌ No tasks could be evaluated')

    print('\n🎯 Evaluation Method:')
    print(' - Uses official WebArena evaluation harness')
    print(' - Loads browser state captured by BrowserGym during inference')
    print(' - Creates mock Page/CDPSession objects with exact browser state')
    print(' - WebArena evaluators get the exact state they need')

    print('\n💡 To enable browser state capture during inference:')
    print(' export WEBARENA_BROWSER_LOGGING_DIR=/tmp/webarena_states')
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -1,33 +0,0 @@
|
||||
import argparse
|
||||
import json
|
||||
|
||||
import browsergym.webarena # noqa F401 register webarena tasks as gym environments
|
||||
import gymnasium as gym
|
||||
|
||||
parser = argparse.ArgumentParser(description='Calculate average reward.')
parser.add_argument('output_path', type=str, help='path to output.jsonl')

args = parser.parse_args()

if __name__ == '__main__':
    # Every gym environment registered under the browsergym/webarena prefix
    # counts as one task; the success rate is computed against this total,
    # not against the number of finished tasks.
    env_ids = [
        env_id
        for env_id in gym.envs.registry.keys()
        if env_id.startswith('browsergym/webarena')
    ]
    total_num = len(env_ids)
    print('Total number of tasks: ', total_num)
    total_reward = 0
    total_cost = 0
    actual_num = 0
    with open(args.output_path, 'r') as f:
        for line in f:
            # Skip blank lines instead of letting json.loads raise on them.
            if not line.strip():
                continue
            data = json.loads(line)
            actual_num += 1
            total_cost += data['metrics']['accumulated_cost']
            total_reward += data['test_result']

    # Guard both averages: no registered tasks or an empty output file would
    # otherwise raise ZeroDivisionError.
    avg_reward = total_reward / total_num if total_num else 0.0
    print('Success Rate: ', avg_reward)

    avg_cost = total_cost / actual_num if actual_num else 0.0
    print('Avg Cost: ', avg_cost)
    print('Actual number of tasks finished: ', actual_num)
|
||||
@@ -1,15 +1,13 @@
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
from typing import Any
|
||||
|
||||
import browsergym.webarena # noqa F401 register webarena tasks as gym environments
|
||||
import gymnasium as gym
|
||||
import pandas as pd
|
||||
|
||||
from evaluation.utils.shared import (
|
||||
EvalMetadata,
|
||||
EvalOutput,
|
||||
codeact_user_response,
|
||||
compatibility_for_eval_history_pairs,
|
||||
get_default_sandbox_config_for_eval,
|
||||
get_metrics,
|
||||
@@ -23,29 +21,32 @@ from openhands.controller.state.state import State
|
||||
from openhands.core.config import (
|
||||
OpenHandsConfig,
|
||||
get_llm_config_arg,
|
||||
parse_arguments,
|
||||
)
|
||||
from openhands.core.config.arg_utils import get_evaluation_parser
|
||||
from openhands.core.logger import openhands_logger as logger
|
||||
from openhands.core.main import create_runtime, run_controller
|
||||
from openhands.events.action import (
|
||||
BrowseInteractiveAction,
|
||||
CmdRunAction,
|
||||
MessageAction,
|
||||
)
|
||||
from openhands.events.observation import CmdOutputObservation
|
||||
from openhands.runtime.base import Runtime
|
||||
from openhands.runtime.browser.browser_env import (
|
||||
BROWSER_EVAL_GET_GOAL_ACTION,
|
||||
BROWSER_EVAL_GET_REWARDS_ACTION,
|
||||
)
|
||||
from openhands.utils.async_utils import call_async_from_sync
|
||||
|
||||
SUPPORTED_AGENT_CLS = {'BrowsingAgent'}
|
||||
SUPPORTED_AGENT_CLS = {'BrowsingAgent', 'CodeActAgent'}
|
||||
|
||||
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
|
||||
'CodeActAgent': codeact_user_response,
|
||||
'BrowsingAgent': codeact_user_response,
|
||||
}
|
||||
|
||||
# Global variable to store task configs
|
||||
TASK_CONFIGS = {}
|
||||
|
||||
|
||||
def get_config(
|
||||
metadata: EvalMetadata,
|
||||
env_id: str,
|
||||
task_config: dict,
|
||||
) -> OpenHandsConfig:
|
||||
base_url = os.environ.get('WEBARENA_BASE_URL', None)
|
||||
openai_api_key = os.environ.get('OPENAI_API_KEY', None)
|
||||
@@ -54,7 +55,7 @@ def get_config(
|
||||
|
||||
sandbox_config = get_default_sandbox_config_for_eval()
|
||||
sandbox_config.base_container_image = 'python:3.12-bookworm'
|
||||
sandbox_config.browsergym_eval_env = env_id
|
||||
# Remove browsergym_eval_env dependency - we'll use regular browser environment
|
||||
sandbox_config.runtime_startup_env_vars = {
|
||||
'BASE_URL': base_url,
|
||||
'OPENAI_API_KEY': openai_api_key,
|
||||
@@ -70,6 +71,7 @@ def get_config(
|
||||
metadata=metadata,
|
||||
runtime='docker',
|
||||
sandbox_config=sandbox_config,
|
||||
enable_browser=True,
|
||||
)
|
||||
config.set_llm_config(metadata.llm_config)
|
||||
agent_config = config.get_agent_config(metadata.agent_class)
|
||||
@@ -77,30 +79,59 @@ def get_config(
|
||||
return config
|
||||
|
||||
|
||||
def get_instruction(task_config: dict) -> MessageAction:
    """Build the initial user message describing the task to the agent.

    The message states the task ``intent`` (default ``'Complete the task'``),
    asks the agent to navigate to ``start_url`` (default ``'about:blank'``)
    and closes with general browsing guidance.
    """
    goal = task_config.get('intent', 'Complete the task')
    entry_url = task_config.get('start_url', 'about:blank')

    # Three paragraphs, separated by blank lines in the final prompt.
    paragraphs = (
        f'You are a web browsing agent. Your task is: {goal}',
        f'Please start by navigating to: {entry_url}',
        'Complete the task by interacting with the webpage as needed. Use the browser tool to navigate, click, fill forms, and perform other web interactions to accomplish the goal.',
    )

    return MessageAction(content='\n\n'.join(paragraphs))
|
||||
|
||||
|
||||
def initialize_runtime(
|
||||
runtime: Runtime,
|
||||
) -> dict:
|
||||
task_config: dict,
|
||||
) -> None:
|
||||
"""Initialize the runtime for the agent.
|
||||
|
||||
This function is called before the runtime is used to run the agent.
|
||||
Also performs initial navigation to the task's start_url because USE_NAV is disabled during evaluation.
|
||||
"""
|
||||
logger.info(f'{"-" * 50} BEGIN Runtime Initialization Fn {"-" * 50}')
|
||||
obs: CmdOutputObservation
|
||||
|
||||
# Set instance id
|
||||
# Ensure workspace exists
|
||||
action = CmdRunAction(command='mkdir -p /workspace')
|
||||
logger.info(action, extra={'msg_type': 'ACTION'})
|
||||
obs = runtime.run_action(action)
|
||||
assert obs.exit_code == 0
|
||||
|
||||
action = BrowseInteractiveAction(browser_actions=BROWSER_EVAL_GET_GOAL_ACTION)
|
||||
logger.info(action, extra={'msg_type': 'ACTION'})
|
||||
obs = runtime.run_action(action)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
goal = obs.content
|
||||
# Navigate to the configured start_url so the page is ready for the agent
|
||||
try:
|
||||
from openhands.events.action import BrowseInteractiveAction
|
||||
|
||||
start_url = task_config.get('start_url')
|
||||
if start_url:
|
||||
browse_action = BrowseInteractiveAction(
|
||||
browser_actions=f'goto("{start_url}")',
|
||||
return_axtree=True,
|
||||
)
|
||||
runtime.browse_interactive(browse_action)
|
||||
else:
|
||||
logger.warning(
|
||||
'No start_url found in task_config; skipping initial navigation'
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f'Failed to perform initial navigation: {e}')
|
||||
|
||||
logger.info(f'{"-" * 50} END Runtime Initialization Fn {"-" * 50}')
|
||||
return goal
|
||||
|
||||
|
||||
def complete_runtime(
|
||||
@@ -108,22 +139,40 @@ def complete_runtime(
|
||||
) -> dict[str, Any]:
|
||||
"""Complete the runtime for the agent.
|
||||
|
||||
This function is called before the runtime is used to run the agent.
|
||||
If you need to do something in the sandbox to get the correctness metric after
|
||||
the agent has run, modify this function.
|
||||
This function is called after the agent has run.
|
||||
Since we're using the official webarena evaluation, we don't need to get rewards here.
|
||||
"""
|
||||
logger.info(f'{"-" * 50} BEGIN Runtime Completion Fn {"-" * 50}')
|
||||
obs: CmdOutputObservation
|
||||
|
||||
action = BrowseInteractiveAction(browser_actions=BROWSER_EVAL_GET_REWARDS_ACTION)
|
||||
logger.info(action, extra={'msg_type': 'ACTION'})
|
||||
obs = runtime.run_action(action)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
# Capture the final accessibility tree for WebArena evaluation
|
||||
try:
|
||||
# Create a browser action to get the current page state with accessibility tree
|
||||
from openhands.events.action import BrowseInteractiveAction
|
||||
|
||||
logger.info(f'{"-" * 50} END Runtime Completion Fn {"-" * 50}')
|
||||
return {
|
||||
'rewards': json.loads(obs.content),
|
||||
}
|
||||
# Use a no-op action that returns the accessibility tree
|
||||
final_browse_action = BrowseInteractiveAction(
|
||||
browser_actions='noop()', # No-op action to just get current state
|
||||
return_axtree=True, # Ensure we get the accessibility tree
|
||||
)
|
||||
|
||||
# Execute the action to get the final observation with accessibility tree
|
||||
final_obs = runtime.browse_interactive(final_browse_action)
|
||||
|
||||
# Extract the accessibility tree from the observation
|
||||
final_axtree = None
|
||||
if hasattr(final_obs, 'axtree_object') and final_obs.axtree_object:
|
||||
final_axtree = final_obs.axtree_object
|
||||
logger.info('Successfully captured final accessibility tree')
|
||||
else:
|
||||
logger.warning('No accessibility tree found in final observation')
|
||||
|
||||
logger.info(f'{"-" * 50} END Runtime Completion Fn {"-" * 50}')
|
||||
return {'final_accessibility_tree': final_axtree}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f'Error capturing final accessibility tree: {e}')
|
||||
logger.info(f'{"-" * 50} END Runtime Completion Fn {"-" * 50}')
|
||||
return {'final_accessibility_tree': None}
|
||||
|
||||
|
||||
def process_instance(
|
||||
@@ -131,31 +180,34 @@ def process_instance(
|
||||
metadata: EvalMetadata,
|
||||
reset_logger: bool = True,
|
||||
):
|
||||
env_id = instance.instance_id
|
||||
config = get_config(metadata, env_id)
|
||||
task_id = instance.instance_id
|
||||
task_config = TASK_CONFIGS.get(task_id, {})
|
||||
config = get_config(metadata, task_config)
|
||||
|
||||
# Setup the logger properly, so you can run multi-processing to parallelize the evaluation
|
||||
if reset_logger:
|
||||
log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
|
||||
reset_logger_for_multiprocessing(logger, env_id, log_dir)
|
||||
reset_logger_for_multiprocessing(logger, str(task_id), log_dir)
|
||||
else:
|
||||
logger.info(f'Starting evaluation for instance {env_id}.')
|
||||
logger.info(f'Starting evaluation for task {task_id}.')
|
||||
|
||||
runtime = create_runtime(config)
|
||||
call_async_from_sync(runtime.connect)
|
||||
task_str = initialize_runtime(runtime)
|
||||
initialize_runtime(runtime, task_config)
|
||||
|
||||
# Get the proper instruction message
|
||||
message_action = get_instruction(task_config)
|
||||
|
||||
state: State | None = asyncio.run(
|
||||
run_controller(
|
||||
config=config,
|
||||
initial_user_action=MessageAction(content=task_str),
|
||||
initial_user_action=message_action,
|
||||
runtime=runtime,
|
||||
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
|
||||
metadata.agent_class
|
||||
],
|
||||
)
|
||||
)
|
||||
# ======= Attempt to evaluate the agent's environment impact =======
|
||||
|
||||
# If you are working on some simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
|
||||
# You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
|
||||
|
||||
if state is None:
|
||||
raise ValueError('State should not be None.')
|
||||
@@ -171,7 +223,6 @@ def process_instance(
|
||||
|
||||
return_val = complete_runtime(runtime)
|
||||
logger.info(f'Return value from complete_runtime: {return_val}')
|
||||
reward = max(return_val['rewards'])
|
||||
|
||||
# history is now available as a stream of events, rather than list of pairs of (Action, Observation)
|
||||
# for compatibility with the existing output format, we can remake the pairs here
|
||||
@@ -180,43 +231,90 @@ def process_instance(
|
||||
|
||||
# Save the output
|
||||
output = EvalOutput(
|
||||
instance_id=env_id,
|
||||
instance_id=str(task_id),
|
||||
instruction=instruction,
|
||||
metadata=metadata,
|
||||
history=histories,
|
||||
metrics=metrics,
|
||||
error=state.last_error if state and state.last_error else None,
|
||||
test_result={
|
||||
'reward': reward,
|
||||
'task_config': task_config, # Store task config for later evaluation
|
||||
'final_accessibility_tree': return_val.get('final_accessibility_tree')
|
||||
if return_val
|
||||
else None,
|
||||
},
|
||||
)
|
||||
return output
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
args = parse_arguments()
|
||||
parser = get_evaluation_parser()
|
||||
args = parser.parse_args()
|
||||
|
||||
# Set up WebArena environment variables for BrowserGym
|
||||
base_url = os.environ.get('WEBARENA_BASE_URL', None)
|
||||
if not base_url:
|
||||
raise ValueError('WEBARENA_BASE_URL must be set')
|
||||
|
||||
# Set up the WA_ prefixed environment variables that BrowserGym expects
|
||||
os.environ['WA_SHOPPING'] = f'{base_url}:7770/'
|
||||
os.environ['WA_SHOPPING_ADMIN'] = f'{base_url}:7780/admin'
|
||||
os.environ['WA_REDDIT'] = f'{base_url}:9999'
|
||||
os.environ['WA_GITLAB'] = f'{base_url}:8023'
|
||||
os.environ['WA_WIKIPEDIA'] = (
|
||||
f'{base_url}:8888/wikipedia_en_all_maxi_2022-05/A/User:The_other_Kiwix_guy/Landing'
|
||||
)
|
||||
os.environ['WA_MAP'] = f'{base_url}:3000'
|
||||
os.environ['WA_HOMEPAGE'] = f'{base_url}:4399'
|
||||
|
||||
# Load webarena task configs from BrowserGym
|
||||
from browsergym.webarena.config import TASK_IDS
|
||||
from browsergym.webarena.task import GenericWebArenaTask
|
||||
|
||||
task_configs = []
|
||||
|
||||
# Load a subset of tasks for testing (first 10 tasks)
|
||||
test_task_ids = list(TASK_IDS)[:10] # Use first 10 tasks for testing
|
||||
|
||||
for task_id in test_task_ids:
|
||||
try:
|
||||
# Create a temporary task to get the config
|
||||
temp_task = GenericWebArenaTask(seed=42, task_id=task_id)
|
||||
|
||||
# Get the first (and likely only) task config for this task_id
|
||||
if temp_task.task_configs:
|
||||
task_config = temp_task.task_configs[0]
|
||||
task_configs.append({'task_id': task_id, 'task_config': task_config})
|
||||
except Exception as e:
|
||||
print(f'Warning: Could not load task {task_id}: {e}')
|
||||
continue
|
||||
|
||||
if not task_configs:
|
||||
raise ValueError('No task configs could be loaded from BrowserGym WebArena')
|
||||
|
||||
print(f'Found {len(task_configs)} task configs from BrowserGym WebArena')
|
||||
|
||||
# Store task configs globally for process_instance to access
|
||||
for task in task_configs:
|
||||
TASK_CONFIGS[str(task['task_id'])] = task['task_config']
|
||||
|
||||
# Create dataset from task configs
|
||||
dataset = pd.DataFrame(
|
||||
{
|
||||
'instance_id': [
|
||||
id
|
||||
for id in gym.envs.registry.keys()
|
||||
if id.startswith('browsergym/webarena')
|
||||
]
|
||||
}
|
||||
[{'instance_id': str(task['task_id'])} for task in task_configs]
|
||||
)
|
||||
|
||||
llm_config = None
|
||||
if args.llm_config:
|
||||
llm_config = get_llm_config_arg(args.llm_config)
|
||||
llm_config = get_llm_config_arg(args.llm_config, args.config_file)
|
||||
# modify_params must be False for evaluation purpose, for reproducibility and accuracy of results
|
||||
llm_config.modify_params = False
|
||||
if llm_config:
|
||||
llm_config.modify_params = False
|
||||
if llm_config is None:
|
||||
raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
|
||||
|
||||
metadata = make_metadata(
|
||||
llm_config,
|
||||
args.dataset_name,
|
||||
'webarena',
|
||||
args.agent_cls,
|
||||
args.max_iterations,
|
||||
args.eval_note,
|
||||
|
||||
@@ -38,7 +38,7 @@ EVAL_NOTE="$OPENHANDS_VERSION"
|
||||
COMMAND="poetry run python evaluation/benchmarks/webarena/run_infer.py \
|
||||
--agent-cls $AGENT \
|
||||
--llm-config $MODEL_CONFIG \
|
||||
--max-iterations 15 \
|
||||
--max-iterations 30 \
|
||||
--eval-num-workers $NUM_WORKERS \
|
||||
--eval-note $EVAL_NOTE"
|
||||
|
||||
|
||||
19
evaluation/benchmarks/webarena/scripts/webarena_env.sh
Executable file
19
evaluation/benchmarks/webarena/scripts/webarena_env.sh
Executable file
@@ -0,0 +1,19 @@
|
||||
#!/usr/bin/env bash

# WebArena environment check
# This script verifies that the environment variables needed for WebArena
# evaluation are set and prints a summary. It does not set any variables.

# Check if WEBARENA_BASE_URL is set
if [ -z "$WEBARENA_BASE_URL" ]; then
  echo "Warning: WEBARENA_BASE_URL is not set. Please set it to the base URL where webarena services are hosted."
  echo "Example: export WEBARENA_BASE_URL=http://your-webarena-host"
fi

# Check if OPENAI_API_KEY is set
if [ -z "$OPENAI_API_KEY" ]; then
  echo "Warning: OPENAI_API_KEY is not set. Please set it to your OpenAI API key."
fi

echo "WebArena environment configured:"
echo " WEBARENA_BASE_URL: $WEBARENA_BASE_URL"
# Report only whether the key is present. Combining "${VAR:+[SET]}" with
# "${VAR:-[NOT SET]}" would print the key itself when it is set, because
# ":-" expands to the variable's value whenever it is non-empty.
if [ -n "$OPENAI_API_KEY" ]; then
  echo " OPENAI_API_KEY: [SET]"
else
  echo " OPENAI_API_KEY: [NOT SET]"
fi
|
||||
@@ -188,6 +188,14 @@ def make_metadata(
|
||||
pathlib.Path(os.path.join(eval_output_path, 'logs')).mkdir(
|
||||
parents=True, exist_ok=True
|
||||
)
|
||||
# Allow overriding the evaluation output directory via env for smoke runs
|
||||
override_output_dir = os.environ.get('EVAL_OUTPUT_DIR')
|
||||
if override_output_dir:
|
||||
eval_output_path = override_output_dir
|
||||
pathlib.Path(eval_output_path).mkdir(parents=True, exist_ok=True)
|
||||
pathlib.Path(os.path.join(eval_output_path, 'logs')).mkdir(
|
||||
parents=True, exist_ok=True
|
||||
)
|
||||
logger.info(f'Using evaluation output directory: {eval_output_path}')
|
||||
|
||||
metadata = EvalMetadata(
|
||||
|
||||
@@ -154,15 +154,32 @@ class BrowsingAgent(Agent):
|
||||
# for webarena and miniwob++ eval, we need to retrieve the initial observation already in browser env
|
||||
# initialize and retrieve the first observation by issuing an noop OP
|
||||
# For non-benchmark browsing, the browser env starts with a blank page, and the agent is expected to first navigate to desired websites
|
||||
return BrowseInteractiveAction(browser_actions='noop()')
|
||||
return BrowseInteractiveAction(browser_actions='noop()', return_axtree=True)
|
||||
|
||||
for event in state.view:
|
||||
if isinstance(event, BrowseInteractiveAction):
|
||||
prev_actions.append(event.browser_actions)
|
||||
last_action = event
|
||||
elif isinstance(event, MessageAction) and event.source == EventSource.AGENT:
|
||||
# agent has responded, task finished.
|
||||
return AgentFinishAction(outputs={'content': event.content})
|
||||
# agent has responded with a message. Avoid finishing on generic browsing error string.
|
||||
# Check for various forms of the generic browsing error message
|
||||
generic_error_patterns = [
|
||||
'error encountered when browsing',
|
||||
'error encountered while browsing',
|
||||
'error encountered during browsing',
|
||||
'an error encountered when browsing',
|
||||
'an error encountered while browsing',
|
||||
'an error encountered during browsing'
|
||||
]
|
||||
if (
|
||||
event.content
|
||||
and any(pattern in event.content.strip().lower() for pattern in generic_error_patterns)
|
||||
):
|
||||
logger.warning(
|
||||
'Ignoring generic error message from agent; continuing.'
|
||||
)
|
||||
else:
|
||||
return AgentFinishAction(outputs={'content': event.content})
|
||||
elif isinstance(event, Observation):
|
||||
last_obs = event
|
||||
|
||||
@@ -176,7 +193,21 @@ class BrowsingAgent(Agent):
|
||||
isinstance(last_action, BrowseInteractiveAction)
|
||||
and last_action.browsergym_send_msg_to_user
|
||||
):
|
||||
return MessageAction(last_action.browsergym_send_msg_to_user)
|
||||
# Avoid prematurely finishing on generic error messages
|
||||
msg_content = last_action.browsergym_send_msg_to_user.strip()
|
||||
generic_error_patterns = [
|
||||
'error encountered when browsing',
|
||||
'error encountered while browsing',
|
||||
'error encountered during browsing',
|
||||
'an error encountered when browsing',
|
||||
'an error encountered while browsing',
|
||||
'an error encountered during browsing'
|
||||
]
|
||||
if any(pattern in msg_content.lower() for pattern in generic_error_patterns):
|
||||
logger.warning('Ignoring generic error message from model; continuing.')
|
||||
# Do not finish; proceed to compute next action
|
||||
else:
|
||||
return MessageAction(last_action.browsergym_send_msg_to_user)
|
||||
|
||||
if isinstance(last_obs, BrowserOutputObservation):
|
||||
if last_obs.error:
|
||||
@@ -189,17 +220,59 @@ class BrowsingAgent(Agent):
|
||||
cur_url = last_obs.url
|
||||
|
||||
try:
|
||||
cur_axtree_txt = flatten_axtree_to_str(
|
||||
last_obs.axtree_object,
|
||||
extra_properties=last_obs.extra_element_properties,
|
||||
with_clickable=True,
|
||||
filter_visible_only=True,
|
||||
# Debug logging to understand the structure
|
||||
logger.info(
|
||||
f'DEBUG: axtree_object type: {type(last_obs.axtree_object)}'
|
||||
)
|
||||
logger.info(
|
||||
f'DEBUG: axtree_object is None: {last_obs.axtree_object is None}'
|
||||
)
|
||||
if isinstance(last_obs.axtree_object, dict):
|
||||
logger.info(
|
||||
f'DEBUG: axtree_object keys: {list(last_obs.axtree_object.keys())}'
|
||||
)
|
||||
if 'nodes' in last_obs.axtree_object:
|
||||
logger.info(
|
||||
f'DEBUG: nodes type: {type(last_obs.axtree_object["nodes"])}'
|
||||
)
|
||||
logger.info(
|
||||
f'DEBUG: nodes length: {len(last_obs.axtree_object["nodes"]) if last_obs.axtree_object["nodes"] else 0}'
|
||||
)
|
||||
|
||||
# Check if axtree_object exists and has the expected structure
|
||||
if not last_obs.axtree_object or not isinstance(
|
||||
last_obs.axtree_object, dict
|
||||
):
|
||||
logger.info('DEBUG: Using fallback - no axtree_object or not dict')
|
||||
cur_axtree_txt = '[No accessibility tree available]'
|
||||
elif (
|
||||
'nodes' not in last_obs.axtree_object
|
||||
or not last_obs.axtree_object['nodes']
|
||||
):
|
||||
# axtree_object exists but is empty or missing nodes - this is the common case
|
||||
logger.info('DEBUG: Using fallback - missing nodes or empty nodes')
|
||||
cur_axtree_txt = '[Accessibility tree not yet loaded]'
|
||||
else:
|
||||
# axtree_object has the expected structure with nodes
|
||||
logger.info('DEBUG: Calling flatten_axtree_to_str')
|
||||
cur_axtree_txt = flatten_axtree_to_str(
|
||||
last_obs.axtree_object,
|
||||
extra_properties=last_obs.extra_element_properties,
|
||||
with_clickable=True,
|
||||
filter_visible_only=True,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
'Error when trying to process the accessibility tree: %s', e
|
||||
'BROWSING AGENT ERROR when trying to process the accessibility tree: %s',
|
||||
e,
|
||||
)
|
||||
logger.error(
|
||||
f'DEBUG: Exception occurred with axtree_object: {last_obs.axtree_object}'
|
||||
)
|
||||
# Fall back gracefully without aborting the task
|
||||
cur_axtree_txt = (
|
||||
'[Accessibility tree unavailable due to processing error]'
|
||||
)
|
||||
return MessageAction('Error encountered when browsing.')
|
||||
|
||||
goal, _ = state.get_current_user_intent()
|
||||
|
||||
|
||||
@@ -61,11 +61,32 @@ class BrowsingActionParserMessage(ActionParser):
|
||||
return '```' not in action_str
|
||||
|
||||
def parse(self, action_str: str) -> Action:
|
||||
# If the model emitted a plain message (no code fence). If it is an
|
||||
# error-like message, recover by requesting another observation instead
|
||||
# of finishing immediately.
|
||||
lowered = action_str.strip().lower()
|
||||
# Check for various forms of the generic browsing error message
|
||||
generic_error_patterns = [
|
||||
'error encountered when browsing',
|
||||
'error encountered while browsing',
|
||||
'error encountered during browsing',
|
||||
'an error encountered when browsing',
|
||||
'an error encountered while browsing',
|
||||
'an error encountered during browsing'
|
||||
]
|
||||
if any(pattern in lowered for pattern in generic_error_patterns):
|
||||
return BrowseInteractiveAction(
|
||||
browser_actions='noop()',
|
||||
thought='Recovered from generic browsing error message',
|
||||
browsergym_send_msg_to_user='',
|
||||
return_axtree=True,
|
||||
)
|
||||
msg = f'send_msg_to_user("""{action_str}""")'
|
||||
return BrowseInteractiveAction(
|
||||
browser_actions=msg,
|
||||
thought=action_str,
|
||||
browsergym_send_msg_to_user=action_str,
|
||||
return_axtree=True,
|
||||
)
|
||||
|
||||
|
||||
@@ -101,6 +122,24 @@ class BrowsingActionParserBrowseInteractive(ActionParser):
|
||||
)
|
||||
thought = parts[0].strip() if parts[1].strip() != '' else ''
|
||||
|
||||
# Guard against generic error message leading to premature finish
|
||||
lowered = browser_actions.strip().lower()
|
||||
generic_error_patterns = [
|
||||
'error encountered when browsing',
|
||||
'error encountered while browsing',
|
||||
'error encountered during browsing',
|
||||
'an error encountered when browsing',
|
||||
'an error encountered while browsing',
|
||||
'an error encountered during browsing'
|
||||
]
|
||||
if any(pattern in lowered for pattern in generic_error_patterns):
|
||||
return BrowseInteractiveAction(
|
||||
browser_actions='noop()',
|
||||
thought=thought,
|
||||
browsergym_send_msg_to_user='',
|
||||
return_axtree=True,
|
||||
)
|
||||
|
||||
# if the LLM wants to talk to the user, we extract the message
|
||||
msg_content = ''
|
||||
for sub_action in browser_actions.split('\n'):
|
||||
@@ -113,14 +152,33 @@ class BrowsingActionParserBrowseInteractive(ActionParser):
|
||||
logger.error(f'Error parsing action: {sub_action}')
|
||||
# the syntax was not correct, but we can still try to get the message
|
||||
# e.g. send_msg_to_user("Hello, world!") or send_msg_to_user('Hello, world!')
|
||||
match = re.search(r'send_msg_to_user\((["\'])(.*?)\1\)', sub_action)
|
||||
match = re.search(r'send_msg_to_user\((["])(.*?)\1\)', sub_action)
|
||||
if match:
|
||||
msg_content = match.group(2)
|
||||
else:
|
||||
msg_content = ''
|
||||
|
||||
# Also guard if the extracted message content is the generic error
|
||||
lowered_msg = msg_content.strip().lower()
|
||||
generic_error_patterns = [
|
||||
'error encountered when browsing',
|
||||
'error encountered while browsing',
|
||||
'error encountered during browsing',
|
||||
'an error encountered when browsing',
|
||||
'an error encountered while browsing',
|
||||
'an error encountered during browsing'
|
||||
]
|
||||
if any(pattern in lowered_msg for pattern in generic_error_patterns):
|
||||
return BrowseInteractiveAction(
|
||||
browser_actions='noop()',
|
||||
thought=thought,
|
||||
browsergym_send_msg_to_user='',
|
||||
return_axtree=True,
|
||||
)
|
||||
|
||||
return BrowseInteractiveAction(
|
||||
browser_actions=browser_actions,
|
||||
thought=thought,
|
||||
browsergym_send_msg_to_user=msg_content,
|
||||
return_axtree=True,
|
||||
)
|
||||
|
||||
@@ -247,7 +247,11 @@ def response_to_actions(
|
||||
raise FunctionCallValidationError(
|
||||
f'Missing required argument "code" in tool call {tool_call.function.name}'
|
||||
)
|
||||
action = BrowseInteractiveAction(browser_actions=arguments['code'])
|
||||
# Allow user to specify whether they need accessibility tree
|
||||
return_axtree = arguments.get('return_axtree', False)
|
||||
action = BrowseInteractiveAction(
|
||||
browser_actions=arguments['code'], return_axtree=return_axtree
|
||||
)
|
||||
set_security_risk(action, arguments)
|
||||
|
||||
# ================================================
|
||||
|
||||
@@ -64,7 +64,7 @@ scroll(delta_x: float, delta_y: float)
|
||||
|
||||
scroll(-50.2, -100.5)
|
||||
|
||||
fill(bid: str, value: str)
|
||||
fill(bid: str, value: str, enable_autocomplete_menu: bool = False)
|
||||
Description: Fill out a form field. It focuses the element and triggers an input event with the entered text. It works for <input>, <textarea> and [contenteditable] elements.
|
||||
Examples:
|
||||
fill('237', 'example value')
|
||||
@@ -159,6 +159,11 @@ BrowserTool = ChatCompletionToolParam(
|
||||
+ _BROWSER_TOOL_DESCRIPTION
|
||||
),
|
||||
},
|
||||
'return_axtree': {
|
||||
'type': 'boolean',
|
||||
'description': 'Whether to return the accessibility tree in the observation. Set to true if you need to analyze page structure or find elements by text content. Default is false for performance.',
|
||||
'default': False,
|
||||
},
|
||||
'security_risk': {
|
||||
'type': 'string',
|
||||
'description': SECURITY_RISK_DESC,
|
||||
|
||||
@@ -250,24 +250,69 @@ Note:
|
||||
)
|
||||
tabs = get_tabs(last_obs)
|
||||
try:
|
||||
# IMPORTANT: keep AX Tree of full webpage, add visible and clickable tags
|
||||
cur_axtree_txt = flatten_axtree_to_str(
|
||||
last_obs.axtree_object,
|
||||
extra_properties=last_obs.extra_element_properties,
|
||||
with_visible=True,
|
||||
with_clickable=True,
|
||||
with_center_coords=False,
|
||||
with_bounding_box_coords=False,
|
||||
filter_visible_only=False,
|
||||
filter_with_bid_only=False,
|
||||
filter_som_only=False,
|
||||
# Debug logging to understand the structure
|
||||
logger.info(
|
||||
f'VISUAL DEBUG: axtree_object type: {type(last_obs.axtree_object)}'
|
||||
)
|
||||
cur_axtree_txt = get_axtree(axtree_txt=cur_axtree_txt)
|
||||
logger.info(
|
||||
f'VISUAL DEBUG: axtree_object is None: {last_obs.axtree_object is None}'
|
||||
)
|
||||
if isinstance(last_obs.axtree_object, dict):
|
||||
logger.info(
|
||||
f'VISUAL DEBUG: axtree_object keys: {list(last_obs.axtree_object.keys())}'
|
||||
)
|
||||
if 'nodes' in last_obs.axtree_object:
|
||||
logger.info(
|
||||
f'VISUAL DEBUG: nodes type: {type(last_obs.axtree_object["nodes"])}'
|
||||
)
|
||||
logger.info(
|
||||
f'VISUAL DEBUG: nodes length: {len(last_obs.axtree_object["nodes"]) if last_obs.axtree_object["nodes"] else 0}'
|
||||
)
|
||||
|
||||
# Check if axtree_object exists and has the expected structure
|
||||
if not last_obs.axtree_object or not isinstance(
|
||||
last_obs.axtree_object, dict
|
||||
):
|
||||
logger.info(
|
||||
'VISUAL DEBUG: Using fallback - no axtree_object or not dict'
|
||||
)
|
||||
cur_axtree_txt = '[No accessibility tree available]'
|
||||
elif (
|
||||
'nodes' not in last_obs.axtree_object
|
||||
or not last_obs.axtree_object['nodes']
|
||||
):
|
||||
# axtree_object exists but is empty or missing nodes - this is the common case
|
||||
logger.info(
|
||||
'VISUAL DEBUG: Using fallback - missing nodes or empty nodes'
|
||||
)
|
||||
cur_axtree_txt = '[Accessibility tree not yet loaded]'
|
||||
else:
|
||||
# IMPORTANT: keep AX Tree of full webpage, add visible and clickable tags
|
||||
logger.info('VISUAL DEBUG: Calling flatten_axtree_to_str')
|
||||
cur_axtree_txt = flatten_axtree_to_str(
|
||||
last_obs.axtree_object,
|
||||
extra_properties=last_obs.extra_element_properties,
|
||||
with_visible=True,
|
||||
with_clickable=True,
|
||||
with_center_coords=False,
|
||||
with_bounding_box_coords=False,
|
||||
filter_visible_only=False,
|
||||
filter_with_bid_only=False,
|
||||
filter_som_only=False,
|
||||
)
|
||||
cur_axtree_txt = get_axtree(axtree_txt=cur_axtree_txt)
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
'Error when trying to process the accessibility tree: %s', e
|
||||
'VISUAL BROWSING AGENT ERROR when trying to process the accessibility tree: %s',
|
||||
e,
|
||||
)
|
||||
logger.error(
|
||||
f'VISUAL DEBUG: Exception occurred with axtree_object: {last_obs.axtree_object}'
|
||||
)
|
||||
# Fall back gracefully without aborting the task
|
||||
cur_axtree_txt = (
|
||||
'[Accessibility tree unavailable due to processing error]'
|
||||
)
|
||||
return MessageAction('Error encountered when browsing.')
|
||||
set_of_marks = last_obs.set_of_marks
|
||||
goal, image_urls = state.get_current_user_intent()
|
||||
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
import atexit
|
||||
import json
|
||||
import multiprocessing
|
||||
import time
|
||||
import uuid
|
||||
@@ -21,14 +20,18 @@ BROWSER_EVAL_GET_REWARDS_ACTION = 'GET_EVAL_REWARDS'
|
||||
|
||||
|
||||
class BrowserEnv:
|
||||
def __init__(self, browsergym_eval_env: str | None = None):
|
||||
def __init__(
|
||||
self,
|
||||
browsergym_eval_env: str | None = None,
|
||||
browser_logging_dir: str | None = None,
|
||||
):
|
||||
self.html_text_converter = self.get_html_text_converter()
|
||||
self.eval_mode = False
|
||||
self.eval_dir = ''
|
||||
|
||||
# EVAL only: browsergym_eval_env must be provided for evaluation
|
||||
self.browsergym_eval_env = browsergym_eval_env
|
||||
self.eval_mode = bool(browsergym_eval_env)
|
||||
# Browser state logging configuration (for WebArena evaluation)
|
||||
self.browser_logging_dir = browser_logging_dir
|
||||
self.enable_state_logging = browser_logging_dir is not None
|
||||
|
||||
# Initialize browser environment process
|
||||
multiprocessing.set_start_method('spawn', force=True)
|
||||
@@ -67,59 +70,43 @@ class BrowserEnv:
|
||||
raise BrowserInitException('Failed to start browser environment.')
|
||||
|
||||
def browser_process(self) -> None:
|
||||
if self.eval_mode:
|
||||
assert self.browsergym_eval_env is not None
|
||||
logger.info('Initializing browser env for web browsing evaluation.')
|
||||
if not self.browsergym_eval_env.startswith('browsergym/'):
|
||||
self.browsergym_eval_env = 'browsergym/' + self.browsergym_eval_env
|
||||
if 'visualwebarena' in self.browsergym_eval_env:
|
||||
import browsergym.visualwebarena # noqa F401 register visualwebarena tasks as gym environments
|
||||
import nltk
|
||||
|
||||
nltk.download('punkt_tab')
|
||||
elif 'webarena' in self.browsergym_eval_env:
|
||||
import browsergym.webarena # noqa F401 register webarena tasks as gym environments
|
||||
elif 'miniwob' in self.browsergym_eval_env:
|
||||
import browsergym.miniwob # noqa F401 register miniwob tasks as gym environments
|
||||
else:
|
||||
raise ValueError(
|
||||
f'Unsupported browsergym eval env: {self.browsergym_eval_env}'
|
||||
)
|
||||
env = gym.make(self.browsergym_eval_env, tags_to_mark='all', timeout=100000)
|
||||
else:
|
||||
env = gym.make(
|
||||
'browsergym/openended',
|
||||
task_kwargs={'start_url': 'about:blank', 'goal': 'PLACEHOLDER_GOAL'},
|
||||
wait_for_user_message=False,
|
||||
headless=True,
|
||||
disable_env_checker=True,
|
||||
tags_to_mark='all',
|
||||
timeout=100000,
|
||||
pw_context_kwargs={'accept_downloads': True},
|
||||
pw_chromium_kwargs={'downloads_path': '/workspace/.downloads/'},
|
||||
)
|
||||
env = gym.make(
|
||||
'browsergym/openended',
|
||||
task_kwargs={'start_url': 'about:blank', 'goal': 'PLACEHOLDER_GOAL'},
|
||||
wait_for_user_message=False,
|
||||
headless=True,
|
||||
disable_env_checker=True,
|
||||
tags_to_mark='all',
|
||||
timeout=100000,
|
||||
pw_context_kwargs={'accept_downloads': True},
|
||||
pw_chromium_kwargs={'downloads_path': '/workspace/.downloads/'},
|
||||
pre_observation_delay=2.0, # Increase delay to allow accessibility trees to load
|
||||
)
|
||||
obs, info = env.reset()
|
||||
|
||||
logger.info('Successfully called env.reset')
|
||||
# EVAL ONLY: save the goal into file for evaluation
|
||||
self.eval_goal = None
|
||||
self.goal_image_urls = []
|
||||
self.eval_rewards: list[float] = []
|
||||
if self.eval_mode:
|
||||
self.eval_goal = obs['goal']
|
||||
if 'goal_object' in obs:
|
||||
obs['goal_object'] = list(obs['goal_object'])
|
||||
if len(obs['goal_object']) > 0:
|
||||
self.eval_goal = obs['goal_object'][0]['text']
|
||||
for message in obs['goal_object']:
|
||||
if message['type'] == 'image_url':
|
||||
image_src = message['image_url']
|
||||
if isinstance(image_src, dict):
|
||||
image_src = image_src['url']
|
||||
self.goal_image_urls.append(image_src)
|
||||
logger.debug(f'Browsing goal: {self.eval_goal}')
|
||||
logger.info('Browser env started.')
|
||||
|
||||
# Initialize browser state capture for WebArena evaluation
|
||||
state_capture = None
|
||||
if self.enable_state_logging:
|
||||
try:
|
||||
from evaluation.benchmarks.webarena.browsergym_state_capture import (
|
||||
BrowserGymStateCapture,
|
||||
)
|
||||
|
||||
state_capture = BrowserGymStateCapture(
|
||||
output_dir=self.browser_logging_dir or '/tmp/webarena_states'
|
||||
)
|
||||
logger.info(
|
||||
f'Browser state logging enabled: {self.browser_logging_dir}'
|
||||
)
|
||||
except ImportError:
|
||||
logger.warning(
|
||||
'Could not import BrowserGymStateCapture, state logging disabled'
|
||||
)
|
||||
state_capture = None
|
||||
|
||||
while should_continue():
|
||||
try:
|
||||
if self.browser_side.poll(timeout=0.01):
|
||||
@@ -133,34 +120,60 @@ class BrowserEnv:
|
||||
elif unique_request_id == 'IS_ALIVE':
|
||||
self.browser_side.send(('ALIVE', None))
|
||||
continue
|
||||
|
||||
# EVAL ONLY: Get evaluation info
|
||||
if action_data['action'] == BROWSER_EVAL_GET_GOAL_ACTION:
|
||||
self.browser_side.send(
|
||||
(
|
||||
unique_request_id,
|
||||
{
|
||||
'text_content': self.eval_goal,
|
||||
'image_content': self.goal_image_urls,
|
||||
},
|
||||
elif unique_request_id == 'SET_WEBARENA_INSTANCE':
|
||||
# Set WebArena instance ID for state capture
|
||||
if state_capture and 'instance_id' in action_data:
|
||||
state_capture.set_instance_id(action_data['instance_id'])
|
||||
logger.info(
|
||||
f'Set WebArena instance ID: {action_data["instance_id"]}'
|
||||
)
|
||||
)
|
||||
self.browser_side.send((unique_request_id, {'status': 'ok'}))
|
||||
continue
|
||||
elif action_data['action'] == BROWSER_EVAL_GET_REWARDS_ACTION:
|
||||
self.browser_side.send(
|
||||
(
|
||||
unique_request_id,
|
||||
{'text_content': json.dumps(self.eval_rewards)},
|
||||
elif unique_request_id == 'CAPTURE_WEBARENA_STATE':
|
||||
# Capture final browser state for WebArena evaluation
|
||||
if state_capture:
|
||||
try:
|
||||
state_file = state_capture.save_state(env)
|
||||
self.browser_side.send(
|
||||
(
|
||||
unique_request_id,
|
||||
{'status': 'ok', 'state_file': state_file},
|
||||
)
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f'Failed to capture WebArena state: {e}')
|
||||
self.browser_side.send(
|
||||
(
|
||||
unique_request_id,
|
||||
{'status': 'error', 'error': str(e)},
|
||||
)
|
||||
)
|
||||
else:
|
||||
self.browser_side.send(
|
||||
(unique_request_id, {'status': 'disabled'})
|
||||
)
|
||||
)
|
||||
continue
|
||||
|
||||
action = action_data['action']
|
||||
obs, reward, terminated, truncated, info = env.step(action)
|
||||
|
||||
# EVAL ONLY: Save the rewards into file for evaluation
|
||||
if self.eval_mode:
|
||||
self.eval_rewards.append(reward)
|
||||
# DEBUG: Log what's in the BrowserGym observation
|
||||
logger.info(f'DEBUG: BrowserGym obs keys: {list(obs.keys())}')
|
||||
if 'axtree_object' in obs:
|
||||
axtree_obj = obs['axtree_object']
|
||||
logger.info(f'DEBUG: axtree_object type: {type(axtree_obj)}')
|
||||
if isinstance(axtree_obj, dict):
|
||||
logger.info(
|
||||
f'DEBUG: axtree_object keys: {list(axtree_obj.keys())}'
|
||||
)
|
||||
if 'nodes' in axtree_obj:
|
||||
logger.info(
|
||||
f'DEBUG: axtree_object nodes length: {len(axtree_obj["nodes"]) if axtree_obj["nodes"] else 0}'
|
||||
)
|
||||
else:
|
||||
logger.info(f'DEBUG: axtree_object value: {axtree_obj}')
|
||||
else:
|
||||
logger.info('DEBUG: No axtree_object in BrowserGym observation')
|
||||
|
||||
# add text content of the page
|
||||
html_str = flatten_dom_to_str(obs['dom_object'])
|
||||
@@ -208,6 +221,48 @@ class BrowserEnv:
|
||||
logger.debug(f'Browser env is not alive. Response ID: {response_id}')
|
||||
return False
|
||||
|
||||
def set_webarena_instance_id(self, instance_id: str, timeout: float = 10) -> bool:
    """Forward a WebArena instance ID to the browser process.

    Sends a ``('SET_WEBARENA_INSTANCE', {'instance_id': ...})`` request over
    ``self.agent_side`` and polls that connection until a reply carrying the
    same request tag arrives.

    Args:
        instance_id: Identifier placed in the request payload.
        timeout: Maximum number of seconds to wait for the matching reply.

    Returns:
        True only when the matching reply reports ``status == 'ok'``.
        False when state logging is disabled, when ``should_exit()`` becomes
        true, or when the timeout elapses first.
    """
    if not self.enable_state_logging:
        logger.warning('Browser state logging is not enabled')
        return False

    request_tag = 'SET_WEBARENA_INSTANCE'
    self.agent_side.send((request_tag, {'instance_id': instance_id}))

    began = time.time()
    # The exit/timeout condition is re-checked before every poll.
    while not (should_exit() or time.time() - began > timeout):
        if not self.agent_side.poll(timeout=0.01):
            continue
        reply_tag, reply = self.agent_side.recv()
        # Replies carrying a different tag are read and ignored.
        if reply_tag == request_tag:
            return reply.get('status') == 'ok'

    logger.error('Timeout setting WebArena instance ID')
    return False
|
||||
|
||||
def capture_webarena_state(self, timeout: float = 30) -> str | None:
    """Request a browser state capture and return the reported state file.

    Sends a ``('CAPTURE_WEBARENA_STATE', {})`` request over
    ``self.agent_side`` and polls that connection until a reply carrying the
    same request tag arrives.

    Args:
        timeout: Maximum number of seconds to wait for the matching reply.

    Returns:
        The reply's ``state_file`` value when its status is ``'ok'``.
        None when state logging is disabled, when the reply has any other
        status, when ``should_exit()`` becomes true, or when the timeout
        elapses first.
    """
    if not self.enable_state_logging:
        logger.warning('Browser state logging is not enabled')
        return None

    request_tag = 'CAPTURE_WEBARENA_STATE'
    self.agent_side.send((request_tag, {}))

    began = time.time()
    # The exit/timeout condition is re-checked before every poll.
    while not (should_exit() or time.time() - began > timeout):
        if not self.agent_side.poll(timeout=0.01):
            continue
        reply_tag, reply = self.agent_side.recv()
        # Replies carrying a different tag are read and ignored.
        if reply_tag != request_tag:
            continue
        if reply.get('status') == 'ok':
            return reply.get('state_file')
        # Any non-'ok' status is logged as a failure, using the reply's
        # 'error' field when it is present.
        logger.error(
            f'Failed to capture state: {reply.get("error", "unknown error")}'
        )
        return None

    logger.error('Timeout capturing WebArena state')
    return None
|
||||
|
||||
def close(self) -> None:
|
||||
if not self.process.is_alive():
|
||||
return
|
||||
|
||||
@@ -21,14 +21,22 @@ def get_axtree_str(
|
||||
extra_element_properties: dict[str, Any],
|
||||
filter_visible_only: bool = False,
|
||||
) -> str:
|
||||
cur_axtree_txt = flatten_axtree_to_str(
|
||||
axtree_object,
|
||||
extra_properties=extra_element_properties,
|
||||
with_clickable=True,
|
||||
skip_generic=False,
|
||||
filter_visible_only=filter_visible_only,
|
||||
)
|
||||
return str(cur_axtree_txt)
|
||||
# Check if axtree_object exists and has the expected structure
|
||||
if not axtree_object or not isinstance(axtree_object, dict):
|
||||
return '[No accessibility tree available]'
|
||||
elif 'nodes' not in axtree_object or not axtree_object['nodes']:
|
||||
# axtree_object exists but is empty or missing nodes - this is the common case
|
||||
return '[Accessibility tree not yet loaded]'
|
||||
else:
|
||||
# axtree_object has the expected structure with nodes
|
||||
cur_axtree_txt = flatten_axtree_to_str(
|
||||
axtree_object,
|
||||
extra_properties=extra_element_properties,
|
||||
with_clickable=True,
|
||||
skip_generic=False,
|
||||
filter_visible_only=filter_visible_only,
|
||||
)
|
||||
return str(cur_axtree_txt)
|
||||
|
||||
|
||||
def get_agent_obs_text(obs: BrowserOutputObservation) -> str:
|
||||
|
||||
333
poetry.lock
generated
333
poetry.lock
generated
@@ -1,4 +1,4 @@
|
||||
# This file is automatically @generated by Poetry 2.1.3 and should not be changed by hand.
|
||||
# This file is automatically @generated by Poetry 2.1.4 and should not be changed by hand.
|
||||
|
||||
[[package]]
|
||||
name = "aiofiles"
|
||||
@@ -1078,79 +1078,80 @@ botocore = ["botocore"]
|
||||
|
||||
[[package]]
|
||||
name = "browsergym"
|
||||
version = "0.13.3"
|
||||
version = "0.14.2"
|
||||
description = "BrowserGym: a gym environment for web task automation in the Chromium browser"
|
||||
optional = false
|
||||
python-versions = ">3.7"
|
||||
python-versions = ">3.10"
|
||||
groups = ["evaluation"]
|
||||
files = [
|
||||
{file = "browsergym-0.13.3-py3-none-any.whl", hash = "sha256:4f1f8284ca3eb82e5bafb8fa24557ccdd98aaee55971cfa136ad7857011abb20"},
|
||||
{file = "browsergym-0.13.3.tar.gz", hash = "sha256:c3ee2ac41cf7a13abe71e0f9c63c28b37fee348dcc64fa1a6d2b5e513f9929e0"},
|
||||
{file = "browsergym-0.14.2-py3-none-any.whl", hash = "sha256:03e8aada75deb3dd3b68673a68b05f0522a83e4de5a63da5aeb2222daffe6df4"},
|
||||
{file = "browsergym-0.14.2.tar.gz", hash = "sha256:f45419ac0a2a050ca728ad2085b59a37ebf7df7d32d8f280b7db7b9bd6564be0"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
browsergym-assistantbench = "0.13.3"
|
||||
browsergym-core = "0.13.3"
|
||||
browsergym-experiments = "0.13.3"
|
||||
browsergym-miniwob = "0.13.3"
|
||||
browsergym-visualwebarena = "0.13.3"
|
||||
browsergym-webarena = "0.13.3"
|
||||
browsergym-assistantbench = "0.14.2"
|
||||
browsergym-core = "0.14.2"
|
||||
browsergym-experiments = "0.14.2"
|
||||
browsergym-miniwob = "0.14.2"
|
||||
browsergym-visualwebarena = "0.14.2"
|
||||
browsergym-webarena = "0.14.2"
|
||||
browsergym-workarena = ">=0.4.1"
|
||||
weblinx-browsergym = ">=0.0.1dev14"
|
||||
weblinx-browsergym = ">=0.0.2"
|
||||
|
||||
[[package]]
|
||||
name = "browsergym-assistantbench"
|
||||
version = "0.13.3"
|
||||
version = "0.14.2"
|
||||
description = "AssistantBench benchmark for BrowserGym"
|
||||
optional = false
|
||||
python-versions = ">3.7"
|
||||
groups = ["evaluation"]
|
||||
files = [
|
||||
{file = "browsergym_assistantbench-0.13.3-py3-none-any.whl", hash = "sha256:33f40b590f2baa521e05c1b32b063d867e9cd901c40dda5cb30cb203035236b7"},
|
||||
{file = "browsergym_assistantbench-0.13.3.tar.gz", hash = "sha256:46d784c7dcfc7b07836e4378d20275998b185b6c2ca6d0973500ab0333fde981"},
|
||||
{file = "browsergym_assistantbench-0.14.2-py3-none-any.whl", hash = "sha256:f137abe167f2d6287d7eb125a68eee0f3d63da365b34a70798993638de41139e"},
|
||||
{file = "browsergym_assistantbench-0.14.2.tar.gz", hash = "sha256:0c76833a1ca0713b2da0b33d62b621677a1b6b8e58733255d052a40f24dbf0ab"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
browsergym-core = "0.13.3"
|
||||
browsergym-core = "0.14.2"
|
||||
datasets = "*"
|
||||
numpy = "*"
|
||||
scipy = "*"
|
||||
|
||||
[[package]]
|
||||
name = "browsergym-core"
|
||||
version = "0.13.3"
|
||||
version = "0.14.2"
|
||||
description = "BrowserGym: a gym environment for web task automation in the Chromium browser"
|
||||
optional = false
|
||||
python-versions = ">3.9"
|
||||
groups = ["main", "evaluation"]
|
||||
files = [
|
||||
{file = "browsergym_core-0.13.3-py3-none-any.whl", hash = "sha256:db806c64deb819a51501f0466ecb51533fbc7b6edb5f7dbdcb865e7564a86719"},
|
||||
{file = "browsergym_core-0.13.3.tar.gz", hash = "sha256:ac5036b574c8c14ac4a0c09da578a0a00b584d6f5b5ed9bf7a247e24f4d9d2f8"},
|
||||
{file = "browsergym_core-0.14.2-py3-none-any.whl", hash = "sha256:217dfae3d8f6a92e4502b4dfd97dc5ec955a91e5f6b45944f857c182a57168d0"},
|
||||
{file = "browsergym_core-0.14.2.tar.gz", hash = "sha256:aa99a56aa6aae74bb3e1c139ae2fe7d53f0a5bed8707e0ee7520daed531f1f52"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
beautifulsoup4 = ">=4.12"
|
||||
gymnasium = ">=0.27"
|
||||
lxml = ">=4.9"
|
||||
lxml = ">=4.9,<6.0.0"
|
||||
mcp = {version = ">=1.6.0", extras = ["cli"]}
|
||||
numpy = ">=1.14"
|
||||
pillow = ">=10.1"
|
||||
playwright = ">=1.39,<2.0"
|
||||
playwright = "1.44"
|
||||
pyparsing = ">=3"
|
||||
|
||||
[[package]]
|
||||
name = "browsergym-experiments"
|
||||
version = "0.13.3"
|
||||
version = "0.14.2"
|
||||
description = "Experimentation tools for BrowserGym"
|
||||
optional = false
|
||||
python-versions = ">3.7"
|
||||
groups = ["evaluation"]
|
||||
files = [
|
||||
{file = "browsergym_experiments-0.13.3-py3-none-any.whl", hash = "sha256:61963e747eb2c3d04f4f0b5bb5a2f61208025fe2f94faf23f1b86b98dfce3218"},
|
||||
{file = "browsergym_experiments-0.13.3.tar.gz", hash = "sha256:96842e7700e27380746ac57ffc647a1dd56d449f925441ed9bc87675cddfff08"},
|
||||
{file = "browsergym_experiments-0.14.2-py3-none-any.whl", hash = "sha256:acb5eee773b7fbba6f3f60e03fa6b7fa66d277181e9bae36bdaf5ddec6d338d5"},
|
||||
{file = "browsergym_experiments-0.14.2.tar.gz", hash = "sha256:d71cee90706026c585ca95165f2bb1363b3607432c0720afcfd3b1d51aa9a637"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
browsergym-core = "0.13.3"
|
||||
browsergym-core = "0.14.2"
|
||||
dataclasses-json = "*"
|
||||
tiktoken = ">=0.4"
|
||||
|
||||
@@ -1165,33 +1166,33 @@ workarena = ["browsergym-workarena"]
|
||||
|
||||
[[package]]
|
||||
name = "browsergym-miniwob"
|
||||
version = "0.13.3"
|
||||
version = "0.14.2"
|
||||
description = "MiniWoB++ benchmark for BrowserGym"
|
||||
optional = false
|
||||
python-versions = ">3.7"
|
||||
groups = ["evaluation"]
|
||||
files = [
|
||||
{file = "browsergym_miniwob-0.13.3-py3-none-any.whl", hash = "sha256:353b9f8849b7f637e17a928021a93ce962ca9b828434cfe68cebdbe2f11f4a2f"},
|
||||
{file = "browsergym_miniwob-0.13.3.tar.gz", hash = "sha256:0e22797a83d4664636364b2400c5ea0eca16ddd3f50d3003891b0892da1ff40e"},
|
||||
{file = "browsergym_miniwob-0.14.2-py3-none-any.whl", hash = "sha256:bc99712c11e39d46c11c5431d57a121854f141291ab16d62e329a1dca0cea974"},
|
||||
{file = "browsergym_miniwob-0.14.2.tar.gz", hash = "sha256:00ea1f820124689f086830323ea610fec5207e7f1718c86d1fc69e0eb385d939"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
browsergym-core = "0.13.3"
|
||||
browsergym-core = "0.14.2"
|
||||
|
||||
[[package]]
|
||||
name = "browsergym-visualwebarena"
|
||||
version = "0.13.3"
|
||||
version = "0.14.2"
|
||||
description = "VisualWebArena benchmark for BrowserGym"
|
||||
optional = false
|
||||
python-versions = ">3.7"
|
||||
groups = ["evaluation"]
|
||||
files = [
|
||||
{file = "browsergym_visualwebarena-0.13.3-py3-none-any.whl", hash = "sha256:a42c200023497a4970290fce39b419a93aadfc9e92c02ae602704d2957e5e531"},
|
||||
{file = "browsergym_visualwebarena-0.13.3.tar.gz", hash = "sha256:635b4a71c8ff6bff3e84c0fecc7a10b9e932fe2929d4bf8e2e9a5bf2e29438e4"},
|
||||
{file = "browsergym_visualwebarena-0.14.2-py3-none-any.whl", hash = "sha256:c86efeb64e97d2b2305af36e460b5e638f328955bf9c5e5c31a0fa5cffaee922"},
|
||||
{file = "browsergym_visualwebarena-0.14.2.tar.gz", hash = "sha256:a926c13b3f244cdb6266106f2b88904af090f3bc16f17524e6b714ac25727f73"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
browsergym-core = "0.13.3"
|
||||
browsergym-core = "0.14.2"
|
||||
browsergym-webarena = "*"
|
||||
libvisualwebarena = "0.0.15"
|
||||
requests = "*"
|
||||
@@ -1199,18 +1200,18 @@ torch = "*"
|
||||
|
||||
[[package]]
|
||||
name = "browsergym-webarena"
|
||||
version = "0.13.3"
|
||||
version = "0.14.2"
|
||||
description = "WebArena benchmark for BrowserGym"
|
||||
optional = false
|
||||
python-versions = ">3.7"
|
||||
groups = ["evaluation"]
|
||||
files = [
|
||||
{file = "browsergym_webarena-0.13.3-py3-none-any.whl", hash = "sha256:28098690f7c4a513c06e9da0d95f13e5c7bc70ec4bcfcfb7f83311b4081af0c9"},
|
||||
{file = "browsergym_webarena-0.13.3.tar.gz", hash = "sha256:60347edfd8d16e9b6b34a03b3ccb0e058ff11b83f3308ac5ead60321a9cc6462"},
|
||||
{file = "browsergym_webarena-0.14.2-py3-none-any.whl", hash = "sha256:d9bd8fb4e64627a57134fe205497aa36c5e39ffcafd255b8511ba31983478cff"},
|
||||
{file = "browsergym_webarena-0.14.2.tar.gz", hash = "sha256:ccc741ea6a6d4e0d4022fc3c0e7c50d2ee7edc2076a3c50b277005eb572f4c65"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
browsergym-core = "0.13.3"
|
||||
browsergym-core = "0.14.2"
|
||||
libwebarena = "0.0.4"
|
||||
|
||||
[[package]]
|
||||
@@ -2868,56 +2869,58 @@ test = ["build", "mypy", "pytest", "pytest-xdist", "ruff", "twine", "types-reque
|
||||
|
||||
[[package]]
|
||||
name = "gevent"
|
||||
version = "25.5.1"
|
||||
version = "24.2.1"
|
||||
description = "Coroutine-based network library"
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
python-versions = ">=3.8"
|
||||
groups = ["test"]
|
||||
files = [
|
||||
{file = "gevent-25.5.1-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:8e5a0fab5e245b15ec1005b3666b0a2e867c26f411c8fe66ae1afe07174a30e9"},
|
||||
{file = "gevent-25.5.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c7b80a37f2fb45ee4a8f7e64b77dd8a842d364384046e394227b974a4e9c9a52"},
|
||||
{file = "gevent-25.5.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:29ab729d50ae85077a68e0385f129f5b01052d01a0ae6d7fdc1824f5337905e4"},
|
||||
{file = "gevent-25.5.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:80d20592aeabcc4e294fd441fd43d45cb537437fd642c374ea9d964622fad229"},
|
||||
{file = "gevent-25.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a8ba0257542ccbb72a8229dc34d00844ccdfba110417e4b7b34599548d0e20e9"},
|
||||
{file = "gevent-25.5.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:cad0821dff998c7c60dd238f92cd61380342c47fb9e92e1a8705d9b5ac7c16e8"},
|
||||
{file = "gevent-25.5.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:017a7384c0cd1a5907751c991535a0699596e89725468a7fc39228312e10efa1"},
|
||||
{file = "gevent-25.5.1-cp310-cp310-win_amd64.whl", hash = "sha256:469c86d02fccad7e2a3d82fe22237e47ecb376fbf4710bc18747b49c50716817"},
|
||||
{file = "gevent-25.5.1-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:12380aba5c316e9ff53cc21d8ab80f4a91c0df3ada58f65d4f5eb2cf693db00e"},
|
||||
{file = "gevent-25.5.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7f0694daab1a041b69a53f53c2141c12994892b2503870515cabe6a5dbd2a928"},
|
||||
{file = "gevent-25.5.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2797885e9aeffdc98e1846723e5aa212e7ce53007dbef40d6fd2add264235c41"},
|
||||
{file = "gevent-25.5.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cde6aaac36b54332e10ea2a5bc0de6a8aba6c205c92603fe4396e3777c88e05d"},
|
||||
{file = "gevent-25.5.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:24484f80f14befb8822bf29554cfb3a26a26cb69cd1e5a8be9e23b4bd7a96e25"},
|
||||
{file = "gevent-25.5.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:8fdc7446895fa184890d8ca5ea61e502691114f9db55c9b76adc33f3086c4368"},
|
||||
{file = "gevent-25.5.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:5b6106e2414b1797133786258fa1962a5e836480e4d5e861577f9fc63b673a5a"},
|
||||
{file = "gevent-25.5.1-cp311-cp311-win_amd64.whl", hash = "sha256:bc899212d90f311784c58938a9c09c59802fb6dc287a35fabdc36d180f57f575"},
|
||||
{file = "gevent-25.5.1-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:d87c0a1bd809d8f70f96b9b229779ec6647339830b8888a192beed33ac8d129f"},
|
||||
{file = "gevent-25.5.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b87a4b66edb3808d4d07bbdb0deed5a710cf3d3c531e082759afd283758bb649"},
|
||||
{file = "gevent-25.5.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f076779050029a82feb0cb1462021d3404d22f80fa76a181b1a7889cd4d6b519"},
|
||||
{file = "gevent-25.5.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:bb673eb291c19370f69295f7a881a536451408481e2e3deec3f41dedb7c281ec"},
|
||||
{file = "gevent-25.5.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c1325ed44225c8309c0dd188bdbbbee79e1df8c11ceccac226b861c7d52e4837"},
|
||||
{file = "gevent-25.5.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:fcd5bcad3102bde686d0adcc341fade6245186050ce14386d547ccab4bd54310"},
|
||||
{file = "gevent-25.5.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:1a93062609e8fa67ec97cd5fb9206886774b2a09b24887f40148c9c37e6fb71c"},
|
||||
{file = "gevent-25.5.1-cp312-cp312-win_amd64.whl", hash = "sha256:2534c23dc32bed62b659ed4fd9e198906179e68b26c9276a897e04163bdde806"},
|
||||
{file = "gevent-25.5.1-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:a022a9de9275ce0b390b7315595454258c525dc8287a03f1a6cacc5878ab7cbc"},
|
||||
{file = "gevent-25.5.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3fae8533f9d0ef3348a1f503edcfb531ef7a0236b57da1e24339aceb0ce52922"},
|
||||
{file = "gevent-25.5.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c7b32d9c3b5294b39ea9060e20c582e49e1ec81edbfeae6cf05f8ad0829cb13d"},
|
||||
{file = "gevent-25.5.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7b95815fe44f318ebbfd733b6428b4cb18cc5e68f1c40e8501dd69cc1f42a83d"},
|
||||
{file = "gevent-25.5.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2d316529b70d325b183b2f3f5cde958911ff7be12eb2b532b5c301f915dbbf1e"},
|
||||
{file = "gevent-25.5.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:f6ba33c13db91ffdbb489a4f3d177a261ea1843923e1d68a5636c53fe98fa5ce"},
|
||||
{file = "gevent-25.5.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:37ee34b77c7553777c0b8379915f75934c3f9c8cd32f7cd098ea43c9323c2276"},
|
||||
{file = "gevent-25.5.1-cp313-cp313-win_amd64.whl", hash = "sha256:9fa6aa0da224ed807d3b76cdb4ee8b54d4d4d5e018aed2478098e685baae7896"},
|
||||
{file = "gevent-25.5.1-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:0bacf89a65489d26c7087669af89938d5bfd9f7afb12a07b57855b9fad6ccbd0"},
|
||||
{file = "gevent-25.5.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e30169ef9cc0a57930bfd8fe14d86bc9d39fb96d278e3891e85cbe7b46058a97"},
|
||||
{file = "gevent-25.5.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:e72ad5f8d9c92df017fb91a1f6a438cfb63b0eff4b40904ff81b40cb8150078c"},
|
||||
{file = "gevent-25.5.1-cp39-cp39-win32.whl", hash = "sha256:e5f358e81e27b1a7f2fb2f5219794e13ab5f59ce05571aa3877cfac63adb97db"},
|
||||
{file = "gevent-25.5.1-cp39-cp39-win_amd64.whl", hash = "sha256:b83aff2441c7d4ee93e519989713b7c2607d4510abe990cd1d04f641bc6c03af"},
|
||||
{file = "gevent-25.5.1-pp310-pypy310_pp73-macosx_11_0_universal2.whl", hash = "sha256:60ad4ca9ca2c4cc8201b607c229cd17af749831e371d006d8a91303bb5568eb1"},
|
||||
{file = "gevent-25.5.1.tar.gz", hash = "sha256:582c948fa9a23188b890d0bc130734a506d039a2e5ad87dae276a456cc683e61"},
|
||||
{file = "gevent-24.2.1-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:6f947a9abc1a129858391b3d9334c45041c08a0f23d14333d5b844b6e5c17a07"},
|
||||
{file = "gevent-24.2.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bde283313daf0b34a8d1bab30325f5cb0f4e11b5869dbe5bc61f8fe09a8f66f3"},
|
||||
{file = "gevent-24.2.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5a1df555431f5cd5cc189a6ee3544d24f8c52f2529134685f1e878c4972ab026"},
|
||||
{file = "gevent-24.2.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:14532a67f7cb29fb055a0e9b39f16b88ed22c66b96641df8c04bdc38c26b9ea5"},
|
||||
{file = "gevent-24.2.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd23df885318391856415e20acfd51a985cba6919f0be78ed89f5db9ff3a31cb"},
|
||||
{file = "gevent-24.2.1-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:ca80b121bbec76d7794fcb45e65a7eca660a76cc1a104ed439cdbd7df5f0b060"},
|
||||
{file = "gevent-24.2.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:b9913c45d1be52d7a5db0c63977eebb51f68a2d5e6fd922d1d9b5e5fd758cc98"},
|
||||
{file = "gevent-24.2.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:918cdf8751b24986f915d743225ad6b702f83e1106e08a63b736e3a4c6ead789"},
|
||||
{file = "gevent-24.2.1-cp310-cp310-win_amd64.whl", hash = "sha256:3d5325ccfadfd3dcf72ff88a92fb8fc0b56cacc7225f0f4b6dcf186c1a6eeabc"},
|
||||
{file = "gevent-24.2.1-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:03aa5879acd6b7076f6a2a307410fb1e0d288b84b03cdfd8c74db8b4bc882fc5"},
|
||||
{file = "gevent-24.2.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f8bb35ce57a63c9a6896c71a285818a3922d8ca05d150fd1fe49a7f57287b836"},
|
||||
{file = "gevent-24.2.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d7f87c2c02e03d99b95cfa6f7a776409083a9e4d468912e18c7680437b29222c"},
|
||||
{file = "gevent-24.2.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:968581d1717bbcf170758580f5f97a2925854943c45a19be4d47299507db2eb7"},
|
||||
{file = "gevent-24.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7899a38d0ae7e817e99adb217f586d0a4620e315e4de577444ebeeed2c5729be"},
|
||||
{file = "gevent-24.2.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:f5e8e8d60e18d5f7fd49983f0c4696deeddaf6e608fbab33397671e2fcc6cc91"},
|
||||
{file = "gevent-24.2.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:fbfdce91239fe306772faab57597186710d5699213f4df099d1612da7320d682"},
|
||||
{file = "gevent-24.2.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:cdf66977a976d6a3cfb006afdf825d1482f84f7b81179db33941f2fc9673bb1d"},
|
||||
{file = "gevent-24.2.1-cp311-cp311-win_amd64.whl", hash = "sha256:1dffb395e500613e0452b9503153f8f7ba587c67dd4a85fc7cd7aa7430cb02cc"},
|
||||
{file = "gevent-24.2.1-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:6c47ae7d1174617b3509f5d884935e788f325eb8f1a7efc95d295c68d83cce40"},
|
||||
{file = "gevent-24.2.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f7cac622e11b4253ac4536a654fe221249065d9a69feb6cdcd4d9af3503602e0"},
|
||||
{file = "gevent-24.2.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bf5b9c72b884c6f0c4ed26ef204ee1f768b9437330422492c319470954bc4cc7"},
|
||||
{file = "gevent-24.2.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f5de3c676e57177b38857f6e3cdfbe8f38d1cd754b63200c0615eaa31f514b4f"},
|
||||
{file = "gevent-24.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d4faf846ed132fd7ebfbbf4fde588a62d21faa0faa06e6f468b7faa6f436b661"},
|
||||
{file = "gevent-24.2.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:368a277bd9278ddb0fde308e6a43f544222d76ed0c4166e0d9f6b036586819d9"},
|
||||
{file = "gevent-24.2.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:f8a04cf0c5b7139bc6368b461257d4a757ea2fe89b3773e494d235b7dd51119f"},
|
||||
{file = "gevent-24.2.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:9d8d0642c63d453179058abc4143e30718b19a85cbf58c2744c9a63f06a1d388"},
|
||||
{file = "gevent-24.2.1-cp312-cp312-win_amd64.whl", hash = "sha256:94138682e68ec197db42ad7442d3cf9b328069c3ad8e4e5022e6b5cd3e7ffae5"},
|
||||
{file = "gevent-24.2.1-cp38-cp38-macosx_11_0_universal2.whl", hash = "sha256:8f4b8e777d39013595a7740b4463e61b1cfe5f462f1b609b28fbc1e4c4ff01e5"},
|
||||
{file = "gevent-24.2.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:141a2b24ad14f7b9576965c0c84927fc85f824a9bb19f6ec1e61e845d87c9cd8"},
|
||||
{file = "gevent-24.2.1-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:9202f22ef811053077d01f43cc02b4aaf4472792f9fd0f5081b0b05c926cca19"},
|
||||
{file = "gevent-24.2.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:2955eea9c44c842c626feebf4459c42ce168685aa99594e049d03bedf53c2800"},
|
||||
{file = "gevent-24.2.1-cp38-cp38-win32.whl", hash = "sha256:44098038d5e2749b0784aabb27f1fcbb3f43edebedf64d0af0d26955611be8d6"},
|
||||
{file = "gevent-24.2.1-cp38-cp38-win_amd64.whl", hash = "sha256:117e5837bc74a1673605fb53f8bfe22feb6e5afa411f524c835b2ddf768db0de"},
|
||||
{file = "gevent-24.2.1-cp39-cp39-macosx_11_0_universal2.whl", hash = "sha256:2ae3a25ecce0a5b0cd0808ab716bfca180230112bb4bc89b46ae0061d62d4afe"},
|
||||
{file = "gevent-24.2.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a7ceb59986456ce851160867ce4929edaffbd2f069ae25717150199f8e1548b8"},
|
||||
{file = "gevent-24.2.1-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:2e9ac06f225b696cdedbb22f9e805e2dd87bf82e8fa5e17756f94e88a9d37cf7"},
|
||||
{file = "gevent-24.2.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:90cbac1ec05b305a1b90ede61ef73126afdeb5a804ae04480d6da12c56378df1"},
|
||||
{file = "gevent-24.2.1-cp39-cp39-win32.whl", hash = "sha256:782a771424fe74bc7e75c228a1da671578c2ba4ddb2ca09b8f959abdf787331e"},
|
||||
{file = "gevent-24.2.1-cp39-cp39-win_amd64.whl", hash = "sha256:3adfb96637f44010be8abd1b5e73b5070f851b817a0b182e601202f20fa06533"},
|
||||
{file = "gevent-24.2.1-pp310-pypy310_pp73-macosx_11_0_universal2.whl", hash = "sha256:7b00f8c9065de3ad226f7979154a7b27f3b9151c8055c162332369262fc025d8"},
|
||||
{file = "gevent-24.2.1.tar.gz", hash = "sha256:432fc76f680acf7cf188c2ee0f5d3ab73b63c1f03114c7cd8a34cebbe5aa2056"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
cffi = {version = ">=1.17.1", markers = "platform_python_implementation == \"CPython\" and sys_platform == \"win32\""}
|
||||
greenlet = {version = ">=3.2.2", markers = "platform_python_implementation == \"CPython\""}
|
||||
cffi = {version = ">=1.12.2", markers = "platform_python_implementation == \"CPython\" and sys_platform == \"win32\""}
|
||||
greenlet = {version = ">=3.0rc3", markers = "platform_python_implementation == \"CPython\" and python_version >= \"3.11\""}
|
||||
"zope.event" = "*"
|
||||
"zope.interface" = "*"
|
||||
|
||||
@@ -2925,8 +2928,8 @@ greenlet = {version = ">=3.2.2", markers = "platform_python_implementation == \"
|
||||
dnspython = ["dnspython (>=1.16.0,<2.0) ; python_version < \"3.10\"", "idna ; python_version < \"3.10\""]
|
||||
docs = ["furo", "repoze.sphinx.autointerface", "sphinx", "sphinxcontrib-programoutput", "zope.schema"]
|
||||
monitor = ["psutil (>=5.7.0) ; sys_platform != \"win32\" or platform_python_implementation == \"CPython\""]
|
||||
recommended = ["cffi (>=1.17.1) ; platform_python_implementation == \"CPython\"", "dnspython (>=1.16.0,<2.0) ; python_version < \"3.10\"", "idna ; python_version < \"3.10\"", "psutil (>=5.7.0) ; sys_platform != \"win32\" or platform_python_implementation == \"CPython\""]
|
||||
test = ["cffi (>=1.17.1) ; platform_python_implementation == \"CPython\"", "coverage (>=5.0) ; sys_platform != \"win32\"", "dnspython (>=1.16.0,<2.0) ; python_version < \"3.10\"", "idna ; python_version < \"3.10\"", "objgraph", "psutil (>=5.7.0) ; sys_platform != \"win32\" or platform_python_implementation == \"CPython\"", "requests"]
|
||||
recommended = ["cffi (>=1.12.2) ; platform_python_implementation == \"CPython\"", "dnspython (>=1.16.0,<2.0) ; python_version < \"3.10\"", "idna ; python_version < \"3.10\"", "psutil (>=5.7.0) ; sys_platform != \"win32\" or platform_python_implementation == \"CPython\""]
|
||||
test = ["cffi (>=1.12.2) ; platform_python_implementation == \"CPython\"", "coverage (>=5.0) ; sys_platform != \"win32\"", "dnspython (>=1.16.0,<2.0) ; python_version < \"3.10\"", "idna ; python_version < \"3.10\"", "objgraph", "psutil (>=5.7.0) ; sys_platform != \"win32\" or platform_python_implementation == \"CPython\"", "requests"]
|
||||
|
||||
[[package]]
|
||||
name = "ghapi"
|
||||
@@ -3400,67 +3403,70 @@ grpc = ["grpcio (>=1.44.0,<2.0.0)"]
|
||||
|
||||
[[package]]
|
||||
name = "greenlet"
|
||||
version = "3.2.2"
|
||||
version = "3.0.3"
|
||||
description = "Lightweight in-process concurrent programming"
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
python-versions = ">=3.7"
|
||||
groups = ["main", "evaluation", "test"]
|
||||
files = [
|
||||
{file = "greenlet-3.2.2-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:c49e9f7c6f625507ed83a7485366b46cbe325717c60837f7244fc99ba16ba9d6"},
|
||||
{file = "greenlet-3.2.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c3cc1a3ed00ecfea8932477f729a9f616ad7347a5e55d50929efa50a86cb7be7"},
|
||||
{file = "greenlet-3.2.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7c9896249fbef2c615853b890ee854f22c671560226c9221cfd27c995db97e5c"},
|
||||
{file = "greenlet-3.2.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7409796591d879425997a518138889d8d17e63ada7c99edc0d7a1c22007d4907"},
|
||||
{file = "greenlet-3.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7791dcb496ec53d60c7f1c78eaa156c21f402dda38542a00afc3e20cae0f480f"},
|
||||
{file = "greenlet-3.2.2-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d8009ae46259e31bc73dc183e402f548e980c96f33a6ef58cc2e7865db012e13"},
|
||||
{file = "greenlet-3.2.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:fd9fb7c941280e2c837b603850efc93c999ae58aae2b40765ed682a6907ebbc5"},
|
||||
{file = "greenlet-3.2.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:00cd814b8959b95a546e47e8d589610534cfb71f19802ea8a2ad99d95d702057"},
|
||||
{file = "greenlet-3.2.2-cp310-cp310-win_amd64.whl", hash = "sha256:d0cb7d47199001de7658c213419358aa8937df767936506db0db7ce1a71f4a2f"},
|
||||
{file = "greenlet-3.2.2-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:dcb9cebbf3f62cb1e5afacae90761ccce0effb3adaa32339a0670fe7805d8068"},
|
||||
{file = "greenlet-3.2.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bf3fc9145141250907730886b031681dfcc0de1c158f3cc51c092223c0f381ce"},
|
||||
{file = "greenlet-3.2.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:efcdfb9df109e8a3b475c016f60438fcd4be68cd13a365d42b35914cdab4bb2b"},
|
||||
{file = "greenlet-3.2.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4bd139e4943547ce3a56ef4b8b1b9479f9e40bb47e72cc906f0f66b9d0d5cab3"},
|
||||
{file = "greenlet-3.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:71566302219b17ca354eb274dfd29b8da3c268e41b646f330e324e3967546a74"},
|
||||
{file = "greenlet-3.2.2-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3091bc45e6b0c73f225374fefa1536cd91b1e987377b12ef5b19129b07d93ebe"},
|
||||
{file = "greenlet-3.2.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:44671c29da26539a5f142257eaba5110f71887c24d40df3ac87f1117df589e0e"},
|
||||
{file = "greenlet-3.2.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:c23ea227847c9dbe0b3910f5c0dd95658b607137614eb821e6cbaecd60d81cc6"},
|
||||
{file = "greenlet-3.2.2-cp311-cp311-win_amd64.whl", hash = "sha256:0a16fb934fcabfdfacf21d79e6fed81809d8cd97bc1be9d9c89f0e4567143d7b"},
|
||||
{file = "greenlet-3.2.2-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:df4d1509efd4977e6a844ac96d8be0b9e5aa5d5c77aa27ca9f4d3f92d3fcf330"},
|
||||
{file = "greenlet-3.2.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:da956d534a6d1b9841f95ad0f18ace637668f680b1339ca4dcfb2c1837880a0b"},
|
||||
{file = "greenlet-3.2.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9c7b15fb9b88d9ee07e076f5a683027bc3befd5bb5d25954bb633c385d8b737e"},
|
||||
{file = "greenlet-3.2.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:752f0e79785e11180ebd2e726c8a88109ded3e2301d40abced2543aa5d164275"},
|
||||
{file = "greenlet-3.2.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9ae572c996ae4b5e122331e12bbb971ea49c08cc7c232d1bd43150800a2d6c65"},
|
||||
{file = "greenlet-3.2.2-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:02f5972ff02c9cf615357c17ab713737cccfd0eaf69b951084a9fd43f39833d3"},
|
||||
{file = "greenlet-3.2.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:4fefc7aa68b34b9224490dfda2e70ccf2131368493add64b4ef2d372955c207e"},
|
||||
{file = "greenlet-3.2.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:a31ead8411a027c2c4759113cf2bd473690517494f3d6e4bf67064589afcd3c5"},
|
||||
{file = "greenlet-3.2.2-cp312-cp312-win_amd64.whl", hash = "sha256:b24c7844c0a0afc3ccbeb0b807adeefb7eff2b5599229ecedddcfeb0ef333bec"},
|
||||
{file = "greenlet-3.2.2-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:3ab7194ee290302ca15449f601036007873028712e92ca15fc76597a0aeb4c59"},
|
||||
{file = "greenlet-3.2.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2dc5c43bb65ec3669452af0ab10729e8fdc17f87a1f2ad7ec65d4aaaefabf6bf"},
|
||||
{file = "greenlet-3.2.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:decb0658ec19e5c1f519faa9a160c0fc85a41a7e6654b3ce1b44b939f8bf1325"},
|
||||
{file = "greenlet-3.2.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6fadd183186db360b61cb34e81117a096bff91c072929cd1b529eb20dd46e6c5"},
|
||||
{file = "greenlet-3.2.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1919cbdc1c53ef739c94cf2985056bcc0838c1f217b57647cbf4578576c63825"},
|
||||
{file = "greenlet-3.2.2-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3885f85b61798f4192d544aac7b25a04ece5fe2704670b4ab73c2d2c14ab740d"},
|
||||
{file = "greenlet-3.2.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:85f3e248507125bf4af607a26fd6cb8578776197bd4b66e35229cdf5acf1dfbf"},
|
||||
{file = "greenlet-3.2.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:1e76106b6fc55fa3d6fe1c527f95ee65e324a13b62e243f77b48317346559708"},
|
||||
{file = "greenlet-3.2.2-cp313-cp313-win_amd64.whl", hash = "sha256:fe46d4f8e94e637634d54477b0cfabcf93c53f29eedcbdeecaf2af32029b4421"},
|
||||
{file = "greenlet-3.2.2-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ba30e88607fb6990544d84caf3c706c4b48f629e18853fc6a646f82db9629418"},
|
||||
{file = "greenlet-3.2.2-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:055916fafad3e3388d27dd68517478933a97edc2fc54ae79d3bec827de2c64c4"},
|
||||
{file = "greenlet-3.2.2-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2593283bf81ca37d27d110956b79e8723f9aa50c4bcdc29d3c0543d4743d2763"},
|
||||
{file = "greenlet-3.2.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:89c69e9a10670eb7a66b8cef6354c24671ba241f46152dd3eed447f79c29fb5b"},
|
||||
{file = "greenlet-3.2.2-cp313-cp313t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:02a98600899ca1ca5d3a2590974c9e3ec259503b2d6ba6527605fcd74e08e207"},
|
||||
{file = "greenlet-3.2.2-cp313-cp313t-musllinux_1_1_aarch64.whl", hash = "sha256:b50a8c5c162469c3209e5ec92ee4f95c8231b11db6a04db09bbe338176723bb8"},
|
||||
{file = "greenlet-3.2.2-cp313-cp313t-musllinux_1_1_x86_64.whl", hash = "sha256:45f9f4853fb4cc46783085261c9ec4706628f3b57de3e68bae03e8f8b3c0de51"},
|
||||
{file = "greenlet-3.2.2-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:9ea5231428af34226c05f927e16fc7f6fa5e39e3ad3cd24ffa48ba53a47f4240"},
|
||||
{file = "greenlet-3.2.2-cp39-cp39-macosx_11_0_universal2.whl", hash = "sha256:1e4747712c4365ef6765708f948acc9c10350719ca0545e362c24ab973017370"},
|
||||
{file = "greenlet-3.2.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:782743700ab75716650b5238a4759f840bb2dcf7bff56917e9ffdf9f1f23ec59"},
|
||||
{file = "greenlet-3.2.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:354f67445f5bed6604e493a06a9a49ad65675d3d03477d38a4db4a427e9aad0e"},
|
||||
{file = "greenlet-3.2.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3aeca9848d08ce5eb653cf16e15bb25beeab36e53eb71cc32569f5f3afb2a3aa"},
|
||||
{file = "greenlet-3.2.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8cb8553ee954536500d88a1a2f58fcb867e45125e600e80f586ade399b3f8819"},
|
||||
{file = "greenlet-3.2.2-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1592a615b598643dbfd566bac8467f06c8c8ab6e56f069e573832ed1d5d528cc"},
|
||||
{file = "greenlet-3.2.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:1f72667cc341c95184f1c68f957cb2d4fc31eef81646e8e59358a10ce6689457"},
|
||||
{file = "greenlet-3.2.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:a8fa80665b1a29faf76800173ff5325095f3e66a78e62999929809907aca5659"},
|
||||
{file = "greenlet-3.2.2-cp39-cp39-win32.whl", hash = "sha256:6629311595e3fe7304039c67f00d145cd1d38cf723bb5b99cc987b23c1433d61"},
|
||||
{file = "greenlet-3.2.2-cp39-cp39-win_amd64.whl", hash = "sha256:eeb27bece45c0c2a5842ac4c5a1b5c2ceaefe5711078eed4e8043159fa05c834"},
|
||||
{file = "greenlet-3.2.2.tar.gz", hash = "sha256:ad053d34421a2debba45aa3cc39acf454acbcd025b3fc1a9f8a0dee237abd485"},
|
||||
{file = "greenlet-3.0.3-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:9da2bd29ed9e4f15955dd1595ad7bc9320308a3b766ef7f837e23ad4b4aac31a"},
|
||||
{file = "greenlet-3.0.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d353cadd6083fdb056bb46ed07e4340b0869c305c8ca54ef9da3421acbdf6881"},
|
||||
{file = "greenlet-3.0.3-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:dca1e2f3ca00b84a396bc1bce13dd21f680f035314d2379c4160c98153b2059b"},
|
||||
{file = "greenlet-3.0.3-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3ed7fb269f15dc662787f4119ec300ad0702fa1b19d2135a37c2c4de6fadfd4a"},
|
||||
{file = "greenlet-3.0.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd4f49ae60e10adbc94b45c0b5e6a179acc1736cf7a90160b404076ee283cf83"},
|
||||
{file = "greenlet-3.0.3-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:73a411ef564e0e097dbe7e866bb2dda0f027e072b04da387282b02c308807405"},
|
||||
{file = "greenlet-3.0.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:7f362975f2d179f9e26928c5b517524e89dd48530a0202570d55ad6ca5d8a56f"},
|
||||
{file = "greenlet-3.0.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:649dde7de1a5eceb258f9cb00bdf50e978c9db1b996964cd80703614c86495eb"},
|
||||
{file = "greenlet-3.0.3-cp310-cp310-win_amd64.whl", hash = "sha256:68834da854554926fbedd38c76e60c4a2e3198c6fbed520b106a8986445caaf9"},
|
||||
{file = "greenlet-3.0.3-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:b1b5667cced97081bf57b8fa1d6bfca67814b0afd38208d52538316e9422fc61"},
|
||||
{file = "greenlet-3.0.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:52f59dd9c96ad2fc0d5724107444f76eb20aaccb675bf825df6435acb7703559"},
|
||||
{file = "greenlet-3.0.3-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:afaff6cf5200befd5cec055b07d1c0a5a06c040fe5ad148abcd11ba6ab9b114e"},
|
||||
{file = "greenlet-3.0.3-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fe754d231288e1e64323cfad462fcee8f0288654c10bdf4f603a39ed923bef33"},
|
||||
{file = "greenlet-3.0.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2797aa5aedac23af156bbb5a6aa2cd3427ada2972c828244eb7d1b9255846379"},
|
||||
{file = "greenlet-3.0.3-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b7f009caad047246ed379e1c4dbcb8b020f0a390667ea74d2387be2998f58a22"},
|
||||
{file = "greenlet-3.0.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:c5e1536de2aad7bf62e27baf79225d0d64360d4168cf2e6becb91baf1ed074f3"},
|
||||
{file = "greenlet-3.0.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:894393ce10ceac937e56ec00bb71c4c2f8209ad516e96033e4b3b1de270e200d"},
|
||||
{file = "greenlet-3.0.3-cp311-cp311-win_amd64.whl", hash = "sha256:1ea188d4f49089fc6fb283845ab18a2518d279c7cd9da1065d7a84e991748728"},
|
||||
{file = "greenlet-3.0.3-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:70fb482fdf2c707765ab5f0b6655e9cfcf3780d8d87355a063547b41177599be"},
|
||||
{file = "greenlet-3.0.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d4d1ac74f5c0c0524e4a24335350edad7e5f03b9532da7ea4d3c54d527784f2e"},
|
||||
{file = "greenlet-3.0.3-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:149e94a2dd82d19838fe4b2259f1b6b9957d5ba1b25640d2380bea9c5df37676"},
|
||||
{file = "greenlet-3.0.3-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:15d79dd26056573940fcb8c7413d84118086f2ec1a8acdfa854631084393efcc"},
|
||||
{file = "greenlet-3.0.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:881b7db1ebff4ba09aaaeae6aa491daeb226c8150fc20e836ad00041bcb11230"},
|
||||
{file = "greenlet-3.0.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fcd2469d6a2cf298f198f0487e0a5b1a47a42ca0fa4dfd1b6862c999f018ebbf"},
|
||||
{file = "greenlet-3.0.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:1f672519db1796ca0d8753f9e78ec02355e862d0998193038c7073045899f305"},
|
||||
{file = "greenlet-3.0.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:2516a9957eed41dd8f1ec0c604f1cdc86758b587d964668b5b196a9db5bfcde6"},
|
||||
{file = "greenlet-3.0.3-cp312-cp312-win_amd64.whl", hash = "sha256:bba5387a6975598857d86de9eac14210a49d554a77eb8261cc68b7d082f78ce2"},
|
||||
{file = "greenlet-3.0.3-cp37-cp37m-macosx_11_0_universal2.whl", hash = "sha256:5b51e85cb5ceda94e79d019ed36b35386e8c37d22f07d6a751cb659b180d5274"},
|
||||
{file = "greenlet-3.0.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:daf3cb43b7cf2ba96d614252ce1684c1bccee6b2183a01328c98d36fcd7d5cb0"},
|
||||
{file = "greenlet-3.0.3-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:99bf650dc5d69546e076f413a87481ee1d2d09aaaaaca058c9251b6d8c14783f"},
|
||||
{file = "greenlet-3.0.3-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2dd6e660effd852586b6a8478a1d244b8dc90ab5b1321751d2ea15deb49ed414"},
|
||||
{file = "greenlet-3.0.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e3391d1e16e2a5a1507d83e4a8b100f4ee626e8eca43cf2cadb543de69827c4c"},
|
||||
{file = "greenlet-3.0.3-cp37-cp37m-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e1f145462f1fa6e4a4ae3c0f782e580ce44d57c8f2c7aae1b6fa88c0b2efdb41"},
|
||||
{file = "greenlet-3.0.3-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:1a7191e42732df52cb5f39d3527217e7ab73cae2cb3694d241e18f53d84ea9a7"},
|
||||
{file = "greenlet-3.0.3-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:0448abc479fab28b00cb472d278828b3ccca164531daab4e970a0458786055d6"},
|
||||
{file = "greenlet-3.0.3-cp37-cp37m-win32.whl", hash = "sha256:b542be2440edc2d48547b5923c408cbe0fc94afb9f18741faa6ae970dbcb9b6d"},
|
||||
{file = "greenlet-3.0.3-cp37-cp37m-win_amd64.whl", hash = "sha256:01bc7ea167cf943b4c802068e178bbf70ae2e8c080467070d01bfa02f337ee67"},
|
||||
{file = "greenlet-3.0.3-cp38-cp38-macosx_11_0_universal2.whl", hash = "sha256:1996cb9306c8595335bb157d133daf5cf9f693ef413e7673cb07e3e5871379ca"},
|
||||
{file = "greenlet-3.0.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3ddc0f794e6ad661e321caa8d2f0a55ce01213c74722587256fb6566049a8b04"},
|
||||
{file = "greenlet-3.0.3-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c9db1c18f0eaad2f804728c67d6c610778456e3e1cc4ab4bbd5eeb8e6053c6fc"},
|
||||
{file = "greenlet-3.0.3-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7170375bcc99f1a2fbd9c306f5be8764eaf3ac6b5cb968862cad4c7057756506"},
|
||||
{file = "greenlet-3.0.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b66c9c1e7ccabad3a7d037b2bcb740122a7b17a53734b7d72a344ce39882a1b"},
|
||||
{file = "greenlet-3.0.3-cp38-cp38-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:098d86f528c855ead3479afe84b49242e174ed262456c342d70fc7f972bc13c4"},
|
||||
{file = "greenlet-3.0.3-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:81bb9c6d52e8321f09c3d165b2a78c680506d9af285bfccbad9fb7ad5a5da3e5"},
|
||||
{file = "greenlet-3.0.3-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:fd096eb7ffef17c456cfa587523c5f92321ae02427ff955bebe9e3c63bc9f0da"},
|
||||
{file = "greenlet-3.0.3-cp38-cp38-win32.whl", hash = "sha256:d46677c85c5ba00a9cb6f7a00b2bfa6f812192d2c9f7d9c4f6a55b60216712f3"},
|
||||
{file = "greenlet-3.0.3-cp38-cp38-win_amd64.whl", hash = "sha256:419b386f84949bf0e7c73e6032e3457b82a787c1ab4a0e43732898a761cc9dbf"},
|
||||
{file = "greenlet-3.0.3-cp39-cp39-macosx_11_0_universal2.whl", hash = "sha256:da70d4d51c8b306bb7a031d5cff6cc25ad253affe89b70352af5f1cb68e74b53"},
|
||||
{file = "greenlet-3.0.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:086152f8fbc5955df88382e8a75984e2bb1c892ad2e3c80a2508954e52295257"},
|
||||
{file = "greenlet-3.0.3-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d73a9fe764d77f87f8ec26a0c85144d6a951a6c438dfe50487df5595c6373eac"},
|
||||
{file = "greenlet-3.0.3-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b7dcbe92cc99f08c8dd11f930de4d99ef756c3591a5377d1d9cd7dd5e896da71"},
|
||||
{file = "greenlet-3.0.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1551a8195c0d4a68fac7a4325efac0d541b48def35feb49d803674ac32582f61"},
|
||||
{file = "greenlet-3.0.3-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:64d7675ad83578e3fc149b617a444fab8efdafc9385471f868eb5ff83e446b8b"},
|
||||
{file = "greenlet-3.0.3-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:b37eef18ea55f2ffd8f00ff8fe7c8d3818abd3e25fb73fae2ca3b672e333a7a6"},
|
||||
{file = "greenlet-3.0.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:77457465d89b8263bca14759d7c1684df840b6811b2499838cc5b040a8b5b113"},
|
||||
{file = "greenlet-3.0.3-cp39-cp39-win32.whl", hash = "sha256:57e8974f23e47dac22b83436bdcf23080ade568ce77df33159e019d161ce1d1e"},
|
||||
{file = "greenlet-3.0.3-cp39-cp39-win_amd64.whl", hash = "sha256:c5ee858cfe08f34712f548c3c363e807e7186f03ad7a5039ebadb29e8c6be067"},
|
||||
{file = "greenlet-3.0.3.tar.gz", hash = "sha256:43374442353259554ce33599da8b692d5aa96f8976d567d4badf263371fbe491"},
|
||||
]
|
||||
|
||||
[package.extras]
|
||||
@@ -3791,7 +3797,7 @@ version = "0.4.0"
|
||||
description = "Consume Server-Sent Event (SSE) messages with HTTPX."
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
groups = ["main"]
|
||||
groups = ["main", "evaluation"]
|
||||
files = [
|
||||
{file = "httpx-sse-0.4.0.tar.gz", hash = "sha256:1e81a3a3070ce322add1d3529ed42eb5f70817f45ed6ec915ab753f961139721"},
|
||||
{file = "httpx_sse-0.4.0-py3-none-any.whl", hash = "sha256:f329af6eae57eaa2bdfd962b42524764af68075ea87370a2de920af5341e318f"},
|
||||
@@ -5453,7 +5459,7 @@ version = "1.9.2"
|
||||
description = "Model Context Protocol SDK"
|
||||
optional = false
|
||||
python-versions = ">=3.10"
|
||||
groups = ["main"]
|
||||
groups = ["main", "evaluation"]
|
||||
files = [
|
||||
{file = "mcp-1.9.2-py3-none-any.whl", hash = "sha256:bc29f7fd67d157fef378f89a4210384f5fecf1168d0feb12d22929818723f978"},
|
||||
{file = "mcp-1.9.2.tar.gz", hash = "sha256:3c7651c053d635fd235990a12e84509fe32780cd359a5bbef352e20d4d963c05"},
|
||||
@@ -5465,9 +5471,11 @@ httpx = ">=0.27"
|
||||
httpx-sse = ">=0.4"
|
||||
pydantic = ">=2.7.2,<3.0.0"
|
||||
pydantic-settings = ">=2.5.2"
|
||||
python-dotenv = {version = ">=1.0.0", optional = true, markers = "extra == \"cli\""}
|
||||
python-multipart = ">=0.0.9"
|
||||
sse-starlette = ">=1.6.1"
|
||||
starlette = ">=0.27"
|
||||
typer = {version = ">=0.12.4", optional = true, markers = "extra == \"cli\""}
|
||||
uvicorn = {version = ">=0.23.1", markers = "sys_platform != \"emscripten\""}
|
||||
|
||||
[package.extras]
|
||||
@@ -7042,25 +7050,24 @@ type = ["mypy (>=1.14.1)"]
|
||||
|
||||
[[package]]
|
||||
name = "playwright"
|
||||
version = "1.52.0"
|
||||
version = "1.44.0"
|
||||
description = "A high-level API to automate web browsers"
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
python-versions = ">=3.8"
|
||||
groups = ["main", "evaluation", "test"]
|
||||
files = [
|
||||
{file = "playwright-1.52.0-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:19b2cb9d4794062008a635a99bd135b03ebb782d460f96534a91cb583f549512"},
|
||||
{file = "playwright-1.52.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:0797c0479cbdc99607412a3c486a3a2ec9ddc77ac461259fd2878c975bcbb94a"},
|
||||
{file = "playwright-1.52.0-py3-none-macosx_11_0_universal2.whl", hash = "sha256:7223960b7dd7ddeec1ba378c302d1d09733b8dac438f492e9854c85d3ca7144f"},
|
||||
{file = "playwright-1.52.0-py3-none-manylinux1_x86_64.whl", hash = "sha256:d010124d24a321e0489a8c0d38a3971a7ca7656becea7656c9376bfea7f916d4"},
|
||||
{file = "playwright-1.52.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4173e453c43180acc60fd77ffe1ebee8d0efbfd9986c03267007b9c3845415af"},
|
||||
{file = "playwright-1.52.0-py3-none-win32.whl", hash = "sha256:cd0bdf92df99db6237a99f828e80a6a50db6180ef8d5352fc9495df2c92f9971"},
|
||||
{file = "playwright-1.52.0-py3-none-win_amd64.whl", hash = "sha256:dcbf75101eba3066b7521c6519de58721ea44379eb17a0dafa94f9f1b17f59e4"},
|
||||
{file = "playwright-1.52.0-py3-none-win_arm64.whl", hash = "sha256:9d0085b8de513de5fb50669f8e6677f0252ef95a9a1d2d23ccee9638e71e65cb"},
|
||||
{file = "playwright-1.44.0-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:c2317a80896796fdeb03d60f06cc229e775ff2e19b80c64b1bb9b29c8a59d992"},
|
||||
{file = "playwright-1.44.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:54d44fb634d870839301c2326e1e12a178a1be0de76d0caaec230ab075c2e077"},
|
||||
{file = "playwright-1.44.0-py3-none-macosx_11_0_universal2.whl", hash = "sha256:64b67194e73b47ae72acf25f1a9cfacfef38ca2b52e4bb8b0abd385c5deeaadf"},
|
||||
{file = "playwright-1.44.0-py3-none-manylinux1_x86_64.whl", hash = "sha256:29161b1fae71f7c402df5b15f0bd3deaeecd8b3d1ecd9ff01271700c66210e7b"},
|
||||
{file = "playwright-1.44.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f8c8a3bfea17576d3f94a2363eee195cbda8dbba86975588c7eaac7792b25eee"},
|
||||
{file = "playwright-1.44.0-py3-none-win32.whl", hash = "sha256:235e37832deaa9af8a629d09955396259ab757533cc1922f9b0308b4ee0d9cdf"},
|
||||
{file = "playwright-1.44.0-py3-none-win_amd64.whl", hash = "sha256:5b8a4a1d4d50f4ff99b47965576322a8c4e34631854b862a25c1feb824be22a8"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
greenlet = ">=3.1.1,<4.0.0"
|
||||
pyee = ">=13,<14"
|
||||
greenlet = "3.0.3"
|
||||
pyee = "11.1.0"
|
||||
|
||||
[[package]]
|
||||
name = "pluggy"
|
||||
@@ -7664,7 +7671,7 @@ version = "2.9.1"
|
||||
description = "Settings management using Pydantic"
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
groups = ["main"]
|
||||
groups = ["main", "evaluation"]
|
||||
files = [
|
||||
{file = "pydantic_settings-2.9.1-py3-none-any.whl", hash = "sha256:59b4f431b1defb26fe620c71a7d3968a710d719f5f4cdbbdb7926edeb770f6ef"},
|
||||
{file = "pydantic_settings-2.9.1.tar.gz", hash = "sha256:c509bf79d27563add44e8446233359004ed85066cd096d8b510f715e6ef5d268"},
|
||||
@@ -7716,21 +7723,21 @@ files = [
|
||||
|
||||
[[package]]
|
||||
name = "pyee"
|
||||
version = "13.0.0"
|
||||
version = "11.1.0"
|
||||
description = "A rough port of Node.js's EventEmitter to Python with a few tricks of its own"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
groups = ["main", "evaluation", "test"]
|
||||
files = [
|
||||
{file = "pyee-13.0.0-py3-none-any.whl", hash = "sha256:48195a3cddb3b1515ce0695ed76036b5ccc2ef3a9f963ff9f77aec0139845498"},
|
||||
{file = "pyee-13.0.0.tar.gz", hash = "sha256:b391e3c5a434d1f5118a25615001dbc8f669cf410ab67d04c4d4e07c55481c37"},
|
||||
{file = "pyee-11.1.0-py3-none-any.whl", hash = "sha256:5d346a7d0f861a4b2e6c47960295bd895f816725b27d656181947346be98d7c1"},
|
||||
{file = "pyee-11.1.0.tar.gz", hash = "sha256:b53af98f6990c810edd9b56b87791021a8f54fd13db4edd1142438d44ba2263f"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
typing-extensions = "*"
|
||||
|
||||
[package.extras]
|
||||
dev = ["black", "build", "flake8", "flake8-black", "isort", "jupyter-console", "mkdocs", "mkdocs-include-markdown-plugin", "mkdocstrings[python]", "mypy", "pytest", "pytest-asyncio ; python_version >= \"3.4\"", "pytest-trio ; python_version >= \"3.7\"", "sphinx", "toml", "tox", "trio", "trio ; python_version > \"3.6\"", "trio-typing ; python_version > \"3.6\"", "twine", "twisted", "validate-pyproject[all]"]
|
||||
dev = ["black", "build", "flake8", "flake8-black", "isort", "jupyter-console", "mkdocs", "mkdocs-include-markdown-plugin", "mkdocstrings[python]", "pytest", "pytest-asyncio ; python_version >= \"3.4\"", "pytest-trio ; python_version >= \"3.7\"", "sphinx", "toml", "tox", "trio", "trio ; python_version > \"3.6\"", "trio-typing ; python_version > \"3.6\"", "twine", "twisted", "validate-pyproject[all]"]
|
||||
|
||||
[[package]]
|
||||
name = "pyflakes"
|
||||
@@ -8188,7 +8195,7 @@ version = "0.0.20"
|
||||
description = "A streaming multipart parser for Python"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
groups = ["main"]
|
||||
groups = ["main", "evaluation"]
|
||||
files = [
|
||||
{file = "python_multipart-0.0.20-py3-none-any.whl", hash = "sha256:8a62d3a8335e06589fe01f2a3e178cdcc632f3fbe0d492ad9ee0ec35aab1f104"},
|
||||
{file = "python_multipart-0.0.20.tar.gz", hash = "sha256:8dd0cab45b8e23064ae09147625994d090fa46f5b0d1e13af944c331a7fa9d13"},
|
||||
@@ -9623,7 +9630,7 @@ version = "2.4.1"
|
||||
description = "SSE plugin for Starlette"
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
groups = ["main"]
|
||||
groups = ["main", "evaluation"]
|
||||
files = [
|
||||
{file = "sse_starlette-2.4.1-py3-none-any.whl", hash = "sha256:08b77ea898ab1a13a428b2b6f73cfe6d0e607a7b4e15b9bb23e4a37b087fd39a"},
|
||||
{file = "sse_starlette-2.4.1.tar.gz", hash = "sha256:7c8a800a1ca343e9165fc06bbda45c78e4c6166320707ae30b416c42da070926"},
|
||||
@@ -9694,7 +9701,7 @@ version = "0.46.2"
|
||||
description = "The little ASGI library that shines."
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
groups = ["main"]
|
||||
groups = ["main", "evaluation"]
|
||||
files = [
|
||||
{file = "starlette-0.46.2-py3-none-any.whl", hash = "sha256:595633ce89f8ffa71a015caed34a5b2dc1c0cdb3f0f1fbd1e69339cf2abeec35"},
|
||||
{file = "starlette-0.46.2.tar.gz", hash = "sha256:7f7361f34eed179294600af672f565727419830b54b7b084efe44bb82d2fccd5"},
|
||||
@@ -10726,7 +10733,7 @@ version = "0.35.0"
|
||||
description = "The lightning-fast ASGI server."
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
groups = ["main"]
|
||||
groups = ["main", "evaluation"]
|
||||
files = [
|
||||
{file = "uvicorn-0.35.0-py3-none-any.whl", hash = "sha256:197535216b25ff9b785e29a0b79199f55222193d47f820816e7da751e9bc8d4a"},
|
||||
{file = "uvicorn-0.35.0.tar.gz", hash = "sha256:bc662f087f7cf2ce11a1d7fd70b90c9f98ef2e2831556dd078d131b96cc94a01"},
|
||||
@@ -11850,4 +11857,4 @@ third-party-runtimes = ["daytona", "e2b", "modal", "runloop-api-client"]
|
||||
[metadata]
|
||||
lock-version = "2.1"
|
||||
python-versions = "^3.12,<3.14"
|
||||
content-hash = "a0ae2cee596dde71f89c06e9669efda58ee8f8f019fad3dbe9df068005c32904"
|
||||
content-hash = "4aabe341a78e439a0cc9dead9f03f49c75bbe7f8b1287269e62961d88af04468"
|
||||
|
||||
@@ -41,7 +41,7 @@ types-toml = "*"
|
||||
uvicorn = "*"
|
||||
numpy = "*"
|
||||
json-repair = "*"
|
||||
browsergym-core = "0.13.3" # integrate browsergym-core as the browsing interface
|
||||
browsergym-core = "0.14.2" # integrate browsergym-core as the browsing interface
|
||||
html2text = "*"
|
||||
deprecated = "*"
|
||||
pexpect = "*"
|
||||
@@ -156,10 +156,10 @@ gdown = "*"
|
||||
matplotlib = "*"
|
||||
seaborn = "*"
|
||||
tabulate = "*"
|
||||
browsergym = "0.13.3"
|
||||
browsergym-webarena = "0.13.3"
|
||||
browsergym-miniwob = "0.13.3"
|
||||
browsergym-visualwebarena = "0.13.3"
|
||||
browsergym = "0.14.2"
|
||||
browsergym-webarena = "0.14.2"
|
||||
browsergym-miniwob = "0.14.2"
|
||||
browsergym-visualwebarena = "0.14.2"
|
||||
boto3-stubs = { extras = [ "s3" ], version = "^1.37.19" }
|
||||
# transitive dependency, pinned here to avoid conflicts
|
||||
pyarrow = "21.0.0"
|
||||
|
||||
Reference in New Issue
Block a user