mirror of
https://github.com/Significant-Gravitas/AutoGPT.git
synced 2026-04-08 03:00:28 -04:00
## Summary Adds three new Copilot tools for multi-step browser automation using the [agent-browser](https://github.com/vercel-labs/agent-browser) CLI (Playwright-based local daemon): - **`browser_navigate`** — navigate to a URL and get an accessibility-tree snapshot with `@ref` IDs - **`browser_act`** — interact with page elements (click, fill, scroll, check, press, select, `dblclick`, `type`, `wait`, back, forward, reload); returns updated snapshot - **`browser_screenshot`** — capture annotated screenshot (with `@ref` overlays) and save to user workspace Also adds **`browse_web`** (Stagehand + Browserbase) for one-shot JS-rendered page extraction. ### Why two browser tools? | Tool | When to use | |------|-------------| | `browse_web` | Single-shot extraction — cloud Browserbase session, no local daemon needed | | `browser_navigate` / `browser_act` | Multi-step flows (login → navigate → scrape), persistent session within a Copilot session | ### Design decisions - **SSRF protection**: Uses the same `validate_url()` from `backend.util.request` as HTTP blocks — async DNS, all IPs checked, full RFC 1918 + link-local + IPv6 coverage - **Session isolation**: `_run()` passes both `--session <id>` (isolated Chromium context per Copilot session) **and** `--session-name <id>` (persist cookies within a session), preventing cross-session state leakage while supporting login flows - **Snapshot truncation**: Interactive-only accessibility tree (`snapshot -i`) capped at 20 000 chars with a continuation hint - **Screenshot storage**: PNG bytes uploaded to user workspace via `WriteWorkspaceFileTool`; returns `file_id` for retrieval ### Bugs fixed in this PR - Session isolation bug: `--session-name` alone shared browser history across different Copilot sessions; added `--session` to isolate contexts - Missing actions: added `dblclick`, `type` (append without clearing), `wait` (CSS selector or ms delay) ## Test plan - [x] 53 unit tests covering all three tools, all actions, SSRF 
integration, auth check, session isolation, snapshot truncation, timeout, missing binary - [x] Integration test: real `agent-browser` CLI + Anthropic API tool-calling loop (3/3 scenarios passed) - [x] Linting (Ruff, isort, Black, Pyright) all passing ``` backend/copilot/tools/agent_browser_test.py 53 passed in 17.79s ```
877 lines
32 KiB
Python
877 lines
32 KiB
Python
"""Agent-browser tools — multi-step browser automation for the Copilot.
|
|
|
|
Uses the agent-browser CLI (https://github.com/vercel-labs/agent-browser)
|
|
which runs a local Chromium instance managed by a persistent daemon.
|
|
|
|
- Runs locally — no cloud account required
|
|
- Full interaction support: click, fill, scroll, login flows, multi-step
|
|
- Session persistence via --session-name: cookies/auth carry across tool calls
|
|
within the same Copilot session, enabling login → navigate → extract workflows
|
|
- Screenshot with --annotate overlays @ref labels, saved to workspace for user
|
|
- The Claude Agent SDK's multi-turn loop handles orchestration — each tool call
|
|
is one browser action; the LLM chains them naturally
|
|
|
|
SSRF protection:
|
|
Uses the shared validate_url() from backend.util.request, which is the same
|
|
guard used by HTTP blocks and web_fetch. It resolves ALL DNS answers (not just
|
|
the first), blocks RFC 1918, loopback, link-local, 0.0.0.0/8, multicast,
|
|
and all relevant IPv6 ranges, and applies IDNA encoding to prevent Unicode
|
|
domain attacks.
|
|
|
|
Requires:
|
|
npm install -g agent-browser
|
|
agent-browser install (downloads Chromium, one-time per machine)
|
|
"""
|
|
|
|
import asyncio
|
|
import base64
|
|
import json
|
|
import logging
|
|
import os
|
|
import shutil
|
|
import tempfile
|
|
from typing import Any
|
|
|
|
from backend.copilot.model import ChatSession
|
|
from backend.util.request import validate_url
|
|
|
|
from .base import BaseTool
|
|
from .models import (
|
|
BrowserActResponse,
|
|
BrowserNavigateResponse,
|
|
BrowserScreenshotResponse,
|
|
ErrorResponse,
|
|
ToolResponseBase,
|
|
)
|
|
from .workspace_files import get_manager
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Per-command timeout (seconds). Navigation + networkidle wait can be slow.
# This is the default ``timeout`` of ``_run``; callers pass a smaller value
# for cheap probes (e.g. ``get url``).
_CMD_TIMEOUT = 45
# Accessibility tree can be very large; cap it to keep LLM context manageable.
# ``_snapshot`` cuts longer output and appends a truncation notice so that the
# final text never exceeds this many characters.
_MAX_SNAPSHOT_CHARS = 20_000
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Subprocess helper
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
async def _run(
    session_name: str,
    *args: str,
    timeout: int = _CMD_TIMEOUT,
) -> tuple[int, str, str]:
    """Run agent-browser for the given session and return (rc, stdout, stderr).

    The command line always carries both session flags:
        --session <name>       isolated Chromium context (no shared
                               history/cookies with other Copilot sessions,
                               which prevents cross-session state leakage)
        --session-name <name>  persist cookies/localStorage across tool calls
                               within the same session (login -> navigate)

    Args:
        session_name: Value passed to both ``--session`` and ``--session-name``.
        *args: agent-browser sub-command and its arguments.
        timeout: Seconds to wait for the process before killing it.

    Returns:
        ``(returncode, stdout, stderr)``. A timeout or a missing binary is
        reported as return code 1 with an explanatory stderr message rather
        than as an exception. Output is decoded as UTF-8; undecodable bytes
        are replaced instead of raising ``UnicodeDecodeError``.

    Raises:
        asyncio.CancelledError: re-raised after the child process is killed,
            so a cancelled tool call does not leave an orphaned process.
    """
    cmd = [
        "agent-browser",
        "--session",
        session_name,
        "--session-name",
        session_name,
        *args,
    ]
    proc = None
    try:
        proc = await asyncio.create_subprocess_exec(
            *cmd,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=timeout)
    except asyncio.TimeoutError:
        # Kill the orphaned subprocess so it does not linger in the process table.
        if proc is not None and proc.returncode is None:
            try:
                proc.kill()
                await proc.communicate()
            except Exception:
                # Best-effort reap; the timeout result below is what matters.
                logger.debug(
                    "[browser] cleanup after timeout failed", exc_info=True
                )
        return 1, "", f"Command timed out after {timeout}s."
    except asyncio.CancelledError:
        # The awaiting task was cancelled from outside: stop the child too,
        # then let the cancellation propagate unchanged.
        if proc is not None and proc.returncode is None:
            try:
                proc.kill()
            except ProcessLookupError:
                pass  # Process exited between the check and the kill.
        raise
    except FileNotFoundError:
        return (
            1,
            "",
            "agent-browser is not installed (run: npm install -g agent-browser && agent-browser install).",
        )

    # Page-derived output may contain arbitrary bytes; never fail on decoding.
    return (
        proc.returncode or 0,
        stdout.decode("utf-8", errors="replace"),
        stderr.decode("utf-8", errors="replace"),
    )
|
|
|
|
|
|
async def _snapshot(session_name: str) -> str:
    """Fetch the interactive accessibility tree of the current page.

    Runs ``snapshot -i -c`` for *session_name*. On a non-zero exit code a
    bracketed failure marker containing the first 300 characters of stderr is
    returned instead of the tree. Output longer than ``_MAX_SNAPSHOT_CHARS``
    is cut so that text plus truncation notice fits within that limit.
    """
    exit_code, out, err = await _run(session_name, "snapshot", "-i", "-c")
    if exit_code != 0:
        return f"[snapshot failed: {err[:300]}]"

    tree = out.strip()
    if len(tree) <= _MAX_SNAPSHOT_CHARS:
        return tree

    # Reserve room for the notice so the total stays within the cap.
    notice = "\n\n[Snapshot truncated — use browser_act to navigate further]"
    budget = max(0, _MAX_SNAPSHOT_CHARS - len(notice))
    return tree[:budget] + notice
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Stateless session helpers — persist / restore browser state across pods
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Module-level cache of sessions known to be alive on this pod.
# Avoids the subprocess probe on every tool call within the same pod.
# Populated by _ensure_session and cleared per session by close_browser_session.
_alive_sessions: set[str] = set()

# Per-session locks to prevent concurrent _ensure_session calls from
# triggering duplicate _restore_browser_state for the same session.
# Protected by _session_locks_mutex to ensure setdefault/pop are not
# interleaved across await boundaries.
_session_locks: dict[str, asyncio.Lock] = {}
_session_locks_mutex = asyncio.Lock()

# Workspace filename for persisted browser state (auto-scoped to session).
# Dot-prefixed so it is hidden from user workspace listings.
# The file holds a JSON object with "url", "cookies" and "local_storage" keys.
_STATE_FILENAME = "._browser_state.json"

# Maximum concurrent subprocesses during cookie/storage restore.
_RESTORE_CONCURRENCY = 10

# Maximum cookies to restore per session. Pathological sites can accumulate
# thousands of cookies; restoring them all would be slow and is rarely useful.
# Only the first _MAX_RESTORE_COOKIES entries of the saved list are restored.
_MAX_RESTORE_COOKIES = 100

# Background tasks for fire-and-forget state persistence.
# Prevents GC from collecting tasks before they complete.
# Each task removes itself from this set via a done-callback.
_background_tasks: set[asyncio.Task] = set()
|
|
|
|
|
|
def _fire_and_forget_save(
    session_name: str, user_id: str, session: ChatSession
) -> None:
    """Kick off ``_save_browser_state`` in the background and return at once.

    Saving is best-effort (its errors are swallowed), so the tool response
    does not wait for it. The task is held in ``_background_tasks`` until it
    finishes so that it is not garbage-collected mid-flight.
    """
    save_coro = _save_browser_state(session_name, user_id, session)
    pending = asyncio.create_task(save_coro)
    _background_tasks.add(pending)
    # Drop the strong reference once the task is done.
    pending.add_done_callback(_background_tasks.discard)
|
|
|
|
|
|
async def _has_local_session(session_name: str) -> bool:
    """Report whether ``get url`` succeeds for this session within 5 seconds.

    A zero exit code is taken to mean the local agent-browser daemon for the
    session is running.
    """
    probe = await _run(session_name, "get", "url", timeout=5)
    exit_code = probe[0]
    return exit_code == 0
|
|
|
|
|
|
async def _save_browser_state(
    session_name: str, user_id: str, session: ChatSession
) -> None:
    """Write the session's URL, cookies and localStorage to the workspace.

    The three values are queried concurrently and stored as one JSON document
    under ``_STATE_FILENAME``. A query that fails or prints nothing contributes
    an empty default. Best-effort: any exception is logged as a warning and
    never reaches the caller.
    """

    def _parsed_or(result: tuple[int, str, str], fallback: Any) -> Any:
        # Decode JSON output only when the command succeeded and printed something.
        code, out, _ = result
        if code == 0 and out.strip():
            return json.loads(out)
        return fallback

    try:
        # Query all three pieces of state in parallel.
        url_result, cookie_result, storage_result = await asyncio.gather(
            _run(session_name, "get", "url", timeout=10),
            _run(session_name, "cookies", "get", "--json", timeout=10),
            _run(session_name, "storage", "local", "--json", timeout=10),
        )

        url_code, url_text, _ = url_result
        snapshot = {
            "url": url_text.strip() if url_code == 0 else "",
            "cookies": _parsed_or(cookie_result, []),
            "local_storage": _parsed_or(storage_result, {}),
        }

        workspace = await get_manager(user_id, session.session_id)
        payload = json.dumps(snapshot).encode("utf-8")
        await workspace.write_file(
            content=payload,
            filename=_STATE_FILENAME,
            mime_type="application/json",
            overwrite=True,
        )
    except Exception:
        logger.warning(
            "[browser] Failed to save browser state for session %s",
            session_name,
            exc_info=True,
        )
|
|
|
|
|
|
async def _restore_browser_state(
    session_name: str, user_id: str, session: ChatSession
) -> bool:
    """Replay saved browser state from the workspace into this session.

    Steps: load ``_STATE_FILENAME``, re-open the saved URL (after SSRF
    validation), then set the saved cookies and localStorage entries with at
    most ``_RESTORE_CONCURRENCY`` subprocesses running at a time.

    Best-effort: exceptions are logged and never propagate.

    Returns:
        True when the restore completed or there was nothing to restore;
        False when the saved URL was blocked, could not be opened, or an
        exception occurred. Individual cookie/storage failures are only
        logged and do not change the result.
    """
    try:
        workspace = await get_manager(user_id, session.session_id)

        if await workspace.get_file_info_by_path(_STATE_FILENAME) is None:
            # Nothing has been persisted yet (first call, or never saved).
            return True

        raw = await workspace.read_file(_STATE_FILENAME)
        saved = json.loads(raw.decode("utf-8"))

        saved_url = saved.get("url", "")
        saved_cookies = saved.get("cookies", [])
        saved_storage = saved.get("local_storage", {})

        # Open the page first: this starts the daemon and establishes the
        # origin that the cookies/localStorage belong to.
        if saved_url:
            # The URL comes from storage, so re-validate it against SSRF.
            try:
                await validate_url(saved_url, trusted_origins=[])
            except ValueError:
                logger.warning(
                    "[browser] State restore: blocked SSRF URL %s", saved_url[:200]
                )
                return False

            open_rc, _, open_err = await _run(session_name, "open", saved_url)
            if open_rc != 0:
                logger.warning(
                    "[browser] State restore: failed to open %s: %s",
                    saved_url,
                    open_err[:200],
                )
                return False
            await _run(session_name, "wait", "--load", "load", timeout=15)

        # Pathological sites can carry thousands of cookies; keep only the
        # first _MAX_RESTORE_COOKIES.
        cookie_total = len(saved_cookies)
        if cookie_total > _MAX_RESTORE_COOKIES:
            logger.debug(
                "[browser] State restore: capping cookies from %d to %d",
                cookie_total,
                _MAX_RESTORE_COOKIES,
            )
            saved_cookies = saved_cookies[:_MAX_RESTORE_COOKIES]

        # Bounds the number of subprocesses spawned at once during the replay.
        limiter = asyncio.Semaphore(_RESTORE_CONCURRENCY)

        async def _apply_cookie(cookie: dict[str, Any]) -> None:
            cookie_name = cookie.get("name", "")
            cookie_value = cookie.get("value", "")
            cookie_domain = cookie.get("domain", "")
            cookie_path = cookie.get("path", "/")
            if not cookie_name or not cookie_domain:
                return  # Entries without a name or domain are skipped.
            async with limiter:
                set_rc, _, set_err = await _run(
                    session_name,
                    "cookies",
                    "set",
                    cookie_name,
                    cookie_value,
                    "--domain",
                    cookie_domain,
                    "--path",
                    cookie_path,
                    timeout=5,
                )
                if set_rc != 0:
                    logger.debug(
                        "[browser] State restore: cookie set failed for %s: %s",
                        cookie_name,
                        set_err[:100],
                    )

        async def _apply_storage_item(item_key: str, item_val: object) -> None:
            async with limiter:
                set_rc, _, set_err = await _run(
                    session_name,
                    "storage",
                    "local",
                    "set",
                    item_key,
                    str(item_val),
                    timeout=5,
                )
                if set_rc != 0:
                    logger.debug(
                        "[browser] State restore: localStorage set failed for %s: %s",
                        item_key,
                        set_err[:100],
                    )

        jobs = [_apply_cookie(cookie) for cookie in saved_cookies]
        jobs.extend(
            _apply_storage_item(key, val) for key, val in saved_storage.items()
        )
        await asyncio.gather(*jobs)

        return True
    except Exception:
        logger.warning(
            "[browser] Failed to restore browser state for session %s",
            session_name,
            exc_info=True,
        )
        return False
|
|
|
|
|
|
async def _ensure_session(
    session_name: str, user_id: str, session: ChatSession
) -> None:
    """Make sure this pod has browser state for the session, restoring if needed.

    Sessions already recorded in ``_alive_sessions`` return immediately.
    Otherwise, under a per-session lock, the local daemon is probed and, if it
    is not running, state is restored from the workspace. The session is
    recorded as alive only when the probe or the restore succeeds.
    """
    if session_name in _alive_sessions:
        return

    # Obtain (or create) this session's lock while holding the registry mutex.
    async with _session_locks_mutex:
        session_lock = _session_locks.setdefault(session_name, asyncio.Lock())

    async with session_lock:
        # Re-check: another coroutine may have finished while we waited.
        if session_name in _alive_sessions:
            return

        daemon_up = await _has_local_session(session_name)
        # The restore runs only when no local daemon answered the probe.
        if daemon_up or await _restore_browser_state(session_name, user_id, session):
            _alive_sessions.add(session_name)
|
|
|
|
|
|
async def close_browser_session(session_name: str, user_id: str | None = None) -> None:
    """Tear down a browser session: forget it, delete saved state, close daemon.

    The session is removed from the in-memory caches, then (when *user_id* is
    given) ``._browser_state.json`` is deleted from workspace storage so
    cookies and other credentials do not outlive the session, and finally the
    agent-browser ``close`` command is issued.

    Best-effort: failures are logged at debug level and never raised.
    """
    # Forget the session locally so later calls re-probe or restore.
    _alive_sessions.discard(session_name)
    async with _session_locks_mutex:
        _session_locks.pop(session_name, None)

    # Remove persisted cookies/localStorage from the workspace.
    if user_id:
        try:
            workspace = await get_manager(user_id, session_name)
            stale_state = await workspace.get_file_info_by_path(_STATE_FILENAME)
            if stale_state is not None:
                await workspace.delete_file(stale_state.id)
        except Exception:
            logger.debug(
                "[browser] Failed to delete state file for session %s",
                session_name,
                exc_info=True,
            )

    # Ask the daemon to shut down.
    try:
        exit_code, _, err = await _run(session_name, "close", timeout=10)
        if exit_code != 0:
            logger.debug(
                "[browser] close failed for session %s: %s",
                session_name,
                err[:200],
            )
    except Exception:
        logger.debug(
            "[browser] Exception closing browser session %s",
            session_name,
            exc_info=True,
        )
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Tool: browser_navigate
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class BrowserNavigateTool(BaseTool):
    """Navigate to a URL and return the page's interactive elements.

    The browser session persists across tool calls within this Copilot session
    (keyed to session_id), so cookies and auth state carry over. This enables
    full login flows: navigate to login page → browser_act to fill credentials
    → browser_act to submit → browser_navigate to the target page.
    """

    @property
    def name(self) -> str:
        """Tool name exposed to the LLM."""
        return "browser_navigate"

    @property
    def description(self) -> str:
        """Human/LLM-readable description of when and how to use the tool."""
        return (
            "Navigate to a URL using a real browser. Returns an accessibility "
            "tree snapshot listing the page's interactive elements with @ref IDs "
            "(e.g. @e3) that can be used with browser_act. "
            "Session persists — cookies and login state carry over between calls. "
            "Use this (with browser_act) for multi-step interaction: login flows, "
            "form filling, button clicks, or anything requiring page interaction. "
            "For plain static pages, prefer web_fetch — no browser overhead. "
            "For authenticated pages: navigate to the login page first, use browser_act "
            "to fill credentials and submit, then navigate to the target page. "
            "Note: for slow SPAs, the returned snapshot may reflect a partially-loaded "
            "state. If elements seem missing, use browser_act with action='wait' and a "
            "CSS selector or millisecond delay, then take a browser_screenshot to verify."
        )

    @property
    def parameters(self) -> dict[str, Any]:
        """JSON schema of the tool arguments (``url`` required, ``wait_for`` optional)."""
        return {
            "type": "object",
            "properties": {
                "url": {
                    "type": "string",
                    "description": "The HTTP/HTTPS URL to navigate to.",
                },
                "wait_for": {
                    "type": "string",
                    "enum": ["networkidle", "load", "domcontentloaded"],
                    "default": "networkidle",
                    "description": "When to consider navigation complete. Use 'networkidle' for SPAs (default).",
                },
            },
            "required": ["url"],
        }

    @property
    def requires_auth(self) -> bool:
        """This tool is only offered to authenticated users."""
        return True

    @property
    def is_available(self) -> bool:
        """True when the ``agent-browser`` executable is found on PATH."""
        return shutil.which("agent-browser") is not None

    async def _execute(
        self,
        user_id: str | None,
        session: ChatSession,
        **kwargs: Any,
    ) -> ToolResponseBase:
        """Navigate to *url*, wait for the page to settle, and return a snapshot.

        The snapshot is an accessibility-tree listing of interactive elements.
        Note: for slow SPAs that never fully idle, the snapshot may reflect a
        partially-loaded state (the wait is best-effort).

        Keyword Args:
            url: Target URL; must pass ``validate_url`` (SSRF guard).
            wait_for: One of the load states declared in ``parameters``. Any
                other value falls back to ``"networkidle"`` so that the wait
                step is not wasted on a state the CLI would reject.

        Returns:
            ``BrowserNavigateResponse`` on success, or ``ErrorResponse`` with
            error ``missing_url``, ``blocked_url`` or ``navigation_failed``.
        """
        url: str = (kwargs.get("url") or "").strip()
        wait_for: str = str(kwargs.get("wait_for") or "networkidle").strip()
        session_name = session.session_id

        # Enforce the enum the schema advertises; the value is passed straight
        # to the CLI, so never forward anything outside the known load states.
        allowed_waits = self.parameters["properties"]["wait_for"]["enum"]
        if wait_for not in allowed_waits:
            logger.warning(
                "[browser_navigate] unsupported wait_for %r; using 'networkidle'",
                wait_for[:50],
            )
            wait_for = "networkidle"

        if not url:
            return ErrorResponse(
                message="Please provide a URL to navigate to.",
                error="missing_url",
                session_id=session_name,
            )

        try:
            await validate_url(url, trusted_origins=[])
        except ValueError as e:
            return ErrorResponse(
                message=str(e),
                error="blocked_url",
                session_id=session_name,
            )

        # Restore browser state from cloud if this is a different pod
        if user_id:
            await _ensure_session(session_name, user_id, session)

        # Navigate
        rc, _, stderr = await _run(session_name, "open", url)
        if rc != 0:
            logger.warning(
                "[browser_navigate] open failed for %s: %s", url, stderr[:300]
            )
            return ErrorResponse(
                message="Failed to navigate to URL.",
                error="navigation_failed",
                session_id=session_name,
            )

        # Wait for page to settle (best-effort: some SPAs never reach networkidle)
        wait_rc, _, wait_err = await _run(session_name, "wait", "--load", wait_for)
        if wait_rc != 0:
            logger.warning(
                "[browser_navigate] wait(%s) failed: %s", wait_for, wait_err[:300]
            )

        # Get current title and URL in parallel
        (_, title_out, _), (_, url_out, _) = await asyncio.gather(
            _run(session_name, "get", "title"),
            _run(session_name, "get", "url"),
        )

        snapshot = await _snapshot(session_name)

        result = BrowserNavigateResponse(
            message=f"Navigated to {url}",
            # Prefer the URL the browser reports (after redirects); fall back
            # to the requested one when the query printed nothing.
            url=url_out.strip() or url,
            title=title_out.strip(),
            snapshot=snapshot,
            session_id=session_name,
        )

        # Persist browser state to cloud for cross-pod continuity
        if user_id:
            _fire_and_forget_save(session_name, user_id, session)

        return result
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Tool: browser_act
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Action categories used by BrowserActTool to decide which arguments an
# action needs and how the agent-browser command line is assembled.
# ("press" is handled separately: it takes only a key name in ``value``.)

# Actions passed to the CLI with no further arguments.
_NO_TARGET_ACTIONS = frozenset({"back", "forward", "reload"})
# Actions that take a scroll direction instead of a target.
_SCROLL_ACTIONS = frozenset({"scroll"})
# Actions that require a target element only.
_TARGET_ONLY_ACTIONS = frozenset({"click", "dblclick", "hover", "check", "uncheck"})
# Actions that require both a target element and a value.
_TARGET_VALUE_ACTIONS = frozenset({"fill", "type", "select"})
# wait <selector|ms>: waits for a DOM element or a fixed delay (e.g. "1000" for 1 s)
_WAIT_ACTIONS = frozenset({"wait"})
|
|
|
|
|
|
class BrowserActTool(BaseTool):
    """Run one interaction on the current page and hand back a fresh snapshot.

    Elements are addressed with the @ref IDs found in the snapshot returned by
    browser_navigate (e.g. '@e3'). Multi-step flows are driven by the LLM,
    which chains browser_navigate and browser_act calls across turns of the
    Claude Agent SDK conversation.
    """

    @property
    def name(self) -> str:
        """Tool name exposed to the LLM."""
        return "browser_act"

    @property
    def description(self) -> str:
        """Usage description listing the supported actions."""
        return (
            "Interact with the current browser page. Use @ref IDs from the "
            "snapshot (e.g. '@e3') to target elements. Returns an updated snapshot. "
            "Supported actions: click, dblclick, fill, type, scroll, hover, press, "
            "check, uncheck, select, wait, back, forward, reload. "
            "fill clears the field before typing; type appends without clearing. "
            "wait accepts a CSS selector (waits for element) or milliseconds string (e.g. '1000'). "
            "Example login flow: fill @e1 with email → fill @e2 with password → "
            "click @e3 (submit) → browser_navigate to the target page."
        )

    @property
    def parameters(self) -> dict[str, Any]:
        """JSON schema of the tool arguments; only ``action`` is required."""
        action_schema = {
            "type": "string",
            "enum": [
                "click",
                "dblclick",
                "fill",
                "type",
                "scroll",
                "hover",
                "press",
                "check",
                "uncheck",
                "select",
                "wait",
                "back",
                "forward",
                "reload",
            ],
            "description": "The action to perform.",
        }
        target_schema = {
            "type": "string",
            "description": (
                "Element to target. Use @ref from snapshot (e.g. '@e3'), "
                "a CSS selector, or a text description. "
                "Required for: click, dblclick, fill, type, hover, check, uncheck, select. "
                "For wait: a CSS selector to wait for, or milliseconds as a string (e.g. '1000')."
            ),
        }
        value_schema = {
            "type": "string",
            "description": (
                "For fill/type: the text to enter. "
                "For press: key name (e.g. 'Enter', 'Tab', 'Control+a'). "
                "For select: the option value to select."
            ),
        }
        direction_schema = {
            "type": "string",
            "enum": ["up", "down", "left", "right"],
            "default": "down",
            "description": "For scroll: direction to scroll.",
        }
        return {
            "type": "object",
            "properties": {
                "action": action_schema,
                "target": target_schema,
                "value": value_schema,
                "direction": direction_schema,
            },
            "required": ["action"],
        }

    @property
    def requires_auth(self) -> bool:
        """This tool is only offered to authenticated users."""
        return True

    @property
    def is_available(self) -> bool:
        """True when the ``agent-browser`` executable is found on PATH."""
        return shutil.which("agent-browser") is not None

    def _build_cmd_args(
        self,
        action: str,
        target: str,
        value: str,
        direction: str,
        session_name: str,
    ) -> list[str] | ErrorResponse:
        """Translate a validated action into agent-browser CLI arguments.

        Returns the argument list, or an ``ErrorResponse`` when a required
        argument is missing or the action is not supported.
        """

        def _reject(message: str, code: str) -> ErrorResponse:
            return ErrorResponse(message=message, error=code, session_id=session_name)

        if action in _NO_TARGET_ACTIONS:
            return [action]

        if action in _SCROLL_ACTIONS:
            return ["scroll", direction]

        if action == "press":
            if value:
                return ["press", value]
            return _reject(
                "'press' requires a 'value' (key name, e.g. 'Enter').",
                "missing_value",
            )

        if action in _TARGET_ONLY_ACTIONS:
            if target:
                return [action, target]
            return _reject(f"'{action}' requires a 'target' element.", "missing_target")

        if action in _TARGET_VALUE_ACTIONS:
            if target and value:
                return [action, target, value]
            return _reject(
                f"'{action}' requires both 'target' and 'value'.", "missing_params"
            )

        if action in _WAIT_ACTIONS:
            if target:
                return ["wait", target]
            return _reject(
                "'wait' requires a 'target': a CSS selector to wait for, "
                "or milliseconds as a string (e.g. '1000').",
                "missing_target",
            )

        return _reject(f"Unsupported action: {action}", "invalid_action")

    async def _execute(
        self,
        user_id: str | None,
        session: ChatSession,
        **kwargs: Any,
    ) -> ToolResponseBase:
        """Execute one browser action and return the resulting page snapshot.

        The *action*/*target*/*value* combination is checked first; invalid
        combinations yield an ``ErrorResponse`` without touching the browser.
        Otherwise the command is delegated to ``agent-browser``, the page is
        given a best-effort chance to settle, and the accessibility-tree
        snapshot plus current URL are returned so the LLM can plan its next
        step.
        """

        def _text_arg(key: str, fallback: str = "") -> str:
            # Missing/None/empty arguments collapse to the fallback, then trim.
            return (kwargs.get(key) or fallback).strip()

        action = _text_arg("action")
        target = _text_arg("target")
        value = _text_arg("value")
        direction = _text_arg("direction", "down")
        session_name = session.session_id

        if not action:
            return ErrorResponse(
                message="Please specify an action.",
                error="missing_action",
                session_id=session_name,
            )

        built = self._build_cmd_args(action, target, value, direction, session_name)
        if isinstance(built, ErrorResponse):
            return built
        cmd_args = built

        # A different pod may be serving this call: restore state if needed.
        if user_id:
            await _ensure_session(session_name, user_id, session)

        act_rc, _, act_err = await _run(session_name, *cmd_args)
        if act_rc != 0:
            logger.warning("[browser_act] %s failed: %s", action, act_err[:300])
            return ErrorResponse(
                message=f"Action '{action}' failed.",
                error="action_failed",
                session_id=session_name,
            )

        # Give the page a chance to settle; SPAs may never go idle, so a
        # failure here is logged and otherwise ignored.
        idle_rc, _, idle_err = await _run(
            session_name, "wait", "--load", "networkidle"
        )
        if idle_rc != 0:
            logger.warning("[browser_act] post-action wait failed: %s", idle_err[:300])

        page_snapshot = await _snapshot(session_name)
        _, current_url_out, _ = await _run(session_name, "get", "url")

        summary = f"Performed '{action}'"
        if target:
            summary += f" on '{target}'"

        response = BrowserActResponse(
            message=summary,
            action=action,
            current_url=current_url_out.strip(),
            snapshot=page_snapshot,
            session_id=session_name,
        )

        # Save state in the background for cross-pod continuity.
        if user_id:
            _fire_and_forget_save(session_name, user_id, session)

        return response
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Tool: browser_screenshot
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class BrowserScreenshotTool(BaseTool):
    """Take a screenshot of the current browser page and store it in the workspace."""

    @property
    def name(self) -> str:
        """Tool name exposed to the LLM."""
        return "browser_screenshot"

    @property
    def description(self) -> str:
        """Usage description, including the follow-up read_workspace_file step."""
        return (
            "Take a screenshot of the current browser page and save it to the workspace. "
            "IMPORTANT: After calling this tool, immediately call read_workspace_file "
            "with the returned file_id to display the image inline to the user — "
            "the screenshot is not visible until you do this. "
            "With annotate=true (default), @ref labels are overlaid on interactive "
            "elements, making it easy to see which @ref ID maps to which element on screen."
        )

    @property
    def parameters(self) -> dict[str, Any]:
        """JSON schema of the tool arguments; both are optional."""
        annotate_schema = {
            "type": "boolean",
            "default": True,
            "description": "Overlay @ref labels on interactive elements (default: true).",
        }
        filename_schema = {
            "type": "string",
            "default": "screenshot.png",
            "description": "Filename to save in the workspace.",
        }
        return {
            "type": "object",
            "properties": {
                "annotate": annotate_schema,
                "filename": filename_schema,
            },
        }

    @property
    def requires_auth(self) -> bool:
        """This tool is only offered to authenticated users."""
        return True

    @property
    def is_available(self) -> bool:
        """True when the ``agent-browser`` executable is found on PATH."""
        return shutil.which("agent-browser") is not None

    async def _execute(
        self,
        user_id: str | None,
        session: ChatSession,
        **kwargs: Any,
    ) -> ToolResponseBase:
        """Capture a PNG of the current page and upload it to the workspace.

        *annotate* may arrive as a string (OpenAI function-call payloads
        sometimes deliver ``"true"``/``"false"``), so strings are coerced: only
        ``1``/``true``/``yes``/``on`` (case-insensitive) count as true.

        Returns a :class:`BrowserScreenshotResponse` whose ``file_id`` the LLM
        should pass to ``read_workspace_file``, or an ``ErrorResponse`` with
        error ``screenshot_failed`` or ``workspace_write_failed``.
        """
        annotate_arg = kwargs.get("annotate", True)
        annotate = (
            annotate_arg.strip().lower() in {"1", "true", "yes", "on"}
            if isinstance(annotate_arg, str)
            else bool(annotate_arg)
        )
        filename: str = (kwargs.get("filename") or "screenshot.png").strip()
        session_name = session.session_id

        # A different pod may be serving this call: restore state if needed.
        if user_id:
            await _ensure_session(session_name, user_id, session)

        # The CLI writes to a path, so hand it a private temp file and read
        # the bytes back; the file is always removed afterwards.
        handle, shot_path = tempfile.mkstemp(suffix=".png")
        os.close(handle)
        try:
            annotate_flag = ["--annotate"] if annotate else []
            shot_rc, _, shot_err = await _run(
                session_name, "screenshot", *annotate_flag, shot_path
            )
            if shot_rc != 0:
                logger.warning("[browser_screenshot] failed: %s", shot_err[:300])
                return ErrorResponse(
                    message="Failed to take screenshot.",
                    error="screenshot_failed",
                    session_id=session_name,
                )

            with open(shot_path, "rb") as image_file:
                png_bytes = image_file.read()
        finally:
            try:
                os.unlink(shot_path)
            except OSError:
                pass  # Temp-file cleanup is best-effort.

        # Imported lazily to avoid circular deps — workspace_files imports
        # from .models.
        from .workspace_files import WorkspaceWriteResponse, WriteWorkspaceFileTool

        # Upload to the workspace so the user can view the image.
        encoded_png = base64.b64encode(png_bytes).decode()
        saved = await WriteWorkspaceFileTool()._execute(
            user_id=user_id,
            session=session,
            filename=filename,
            content_base64=encoded_png,
        )

        if not isinstance(saved, WorkspaceWriteResponse):
            return ErrorResponse(
                message="Screenshot taken but failed to save to workspace.",
                error="workspace_write_failed",
                session_id=session_name,
            )

        response = BrowserScreenshotResponse(
            message=f"Screenshot saved to workspace as '{filename}'. Use read_workspace_file with file_id='{saved.file_id}' to retrieve it.",
            file_id=saved.file_id,
            filename=filename,
            session_id=session_name,
        )

        # Save state in the background for cross-pod continuity.
        if user_id:
            _fire_and_forget_save(session_name, user_id, session)

        return response
|