mirror of
https://github.com/Significant-Gravitas/AutoGPT.git
synced 2026-04-08 03:00:28 -04:00
## Summary

- Replaces the arch-conditional chromium install (ARM64 vs AMD64) with a single approach: always use the distro-packaged `chromium` and set `AGENT_BROWSER_EXECUTABLE_PATH=/usr/bin/chromium`
- Removes `agent-browser install` entirely (it downloads Chrome for Testing, which has no ARM64 binary)
- Removes the `entrypoint.sh` wrapper script that was setting the env var at runtime
- Updates `autogpt_platform/db/docker/docker-compose.yml`: removes `external: true` from the network declarations so the Supabase stack can be brought up standalone (needed for the Docker integration tests in the test plan below — without this, `docker compose up` fails unless the platform stack is already running); also sets `GOTRUE_MAILER_AUTOCONFIRM: true` for local dev convenience (no SMTP setup required on first run — this compose file is not used in production)
- Updates `autogpt_platform/docker-compose.platform.yml`: mounts the `workspace` volume so agent-browser results (screenshots, snapshots) are accessible from other services; without this the copilot workspace write fails in Docker

## Verification

Tested via Docker build on arm64 (Apple Silicon):

```
=== Testing agent-browser with system chromium ===
✓ Example Domain
  https://example.com/
=== SUCCESS: agent-browser launched with system chromium ===
```

agent-browser navigated to example.com in ~1.5s using system chromium (v146 from Debian trixie).

## Test plan

- [x] Docker build test on arm64: `agent-browser open https://example.com` succeeds with system chromium
- [x] Verify amd64 Docker build still works (CI)
859 lines
30 KiB
Python
859 lines
30 KiB
Python
"""Agent-browser tools — multi-step browser automation for the Copilot.
|
|
|
|
Uses the agent-browser CLI (https://github.com/vercel-labs/agent-browser)
|
|
which runs a local Chromium instance managed by a persistent daemon.
|
|
|
|
- Runs locally — no cloud account required
|
|
- Full interaction support: click, fill, scroll, login flows, multi-step
|
|
- Session persistence via --session-name: cookies/auth carry across tool calls
|
|
within the same Copilot session, enabling login → navigate → extract workflows
|
|
- Screenshot with --annotate overlays @ref labels, saved to workspace for user
|
|
- The Claude Agent SDK's multi-turn loop handles orchestration — each tool call
|
|
is one browser action; the LLM chains them naturally
|
|
|
|
SSRF protection:
|
|
Uses the shared validate_url_host() from backend.util.request, which is the same
|
|
guard used by HTTP blocks and web_fetch. It resolves ALL DNS answers (not just
|
|
the first), blocks RFC 1918, loopback, link-local, 0.0.0.0/8, multicast,
|
|
and all relevant IPv6 ranges, and applies IDNA encoding to prevent Unicode
|
|
domain attacks.
|
|
|
|
Requires:
|
|
npm install -g agent-browser
|
|
In Docker: system chromium package with AGENT_BROWSER_EXECUTABLE_PATH=/usr/bin/chromium
|
|
(set automatically — no `agent-browser install` needed).
|
|
Locally: run `agent-browser install` to download Chromium.
|
|
"""
|
|
|
|
import asyncio
|
|
import base64
|
|
import json
|
|
import logging
|
|
import os
|
|
import shutil
|
|
import tempfile
|
|
from typing import Any
|
|
|
|
from backend.copilot.context import get_workspace_manager
|
|
from backend.copilot.model import ChatSession
|
|
from backend.util.request import validate_url_host
|
|
|
|
from .base import BaseTool
|
|
from .models import (
|
|
BrowserActResponse,
|
|
BrowserNavigateResponse,
|
|
BrowserScreenshotResponse,
|
|
ErrorResponse,
|
|
ToolResponseBase,
|
|
)
|
|
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Per-command timeout (seconds). Navigation + networkidle wait can be slow.
# Used as the default ``timeout`` of ``_run``.
_CMD_TIMEOUT = 45
# Accessibility tree can be very large; cap it to keep LLM context manageable.
# ``_snapshot`` truncates its output to at most this many characters.
_MAX_SNAPSHOT_CHARS = 20_000
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Subprocess helper
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
async def _run(
    session_name: str,
    *args: str,
    timeout: int = _CMD_TIMEOUT,
) -> tuple[int, str, str]:
    """Run agent-browser for the given session and return (rc, stdout, stderr).

    Uses both:
      --session <name>       → isolated Chromium context (no shared history/cookies
                                with other Copilot sessions — prevents cross-session
                                browser state leakage)
      --session-name <name>  → persist cookies/localStorage across tool calls within
                                the same session (enables login → navigate flows)

    Args:
        session_name: Value passed to both ``--session`` and ``--session-name``.
        *args: The agent-browser sub-command and its arguments.
        timeout: Seconds to wait for the command before killing it.

    Returns:
        ``(returncode, stdout, stderr)``. A timeout or a missing
        ``agent-browser`` binary is reported as return code 1 with an
        explanatory stderr message instead of being raised.
    """
    cmd = [
        "agent-browser",
        "--session",
        session_name,
        "--session-name",
        session_name,
        *args,
    ]
    proc = None
    try:
        proc = await asyncio.create_subprocess_exec(
            *cmd,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=timeout)
    except asyncio.TimeoutError:
        # Kill the orphaned subprocess so it does not linger in the process table.
        if proc is not None and proc.returncode is None:
            proc.kill()
            try:
                await proc.communicate()
            except Exception:
                pass  # Best-effort reap; ignore errors during cleanup.
        return 1, "", f"Command timed out after {timeout}s."
    except FileNotFoundError:
        return (
            1,
            "",
            "agent-browser is not installed (run: npm install -g agent-browser && agent-browser install).",
        )

    # Page content is arbitrary, so the CLI may emit bytes that are not valid
    # UTF-8. Replace them instead of raising UnicodeDecodeError into the tool.
    return (
        proc.returncode or 0,
        stdout.decode("utf-8", errors="replace"),
        stderr.decode("utf-8", errors="replace"),
    )
|
|
|
|
|
|
async def _snapshot(session_name: str) -> str:
    """Fetch the page's interactive accessibility tree, capped in length.

    Runs ``agent-browser snapshot -i -c`` for the session. On a non-zero exit
    code a bracketed failure marker containing the first 300 characters of
    stderr is returned instead of raising. Output longer than
    ``_MAX_SNAPSHOT_CHARS`` is cut so that, together with the truncation
    notice, it fits the cap.
    """
    exit_code, out, err = await _run(session_name, "snapshot", "-i", "-c")
    if exit_code != 0:
        return f"[snapshot failed: {err[:300]}]"

    tree = out.strip()
    if len(tree) <= _MAX_SNAPSHOT_CHARS:
        return tree

    notice = "\n\n[Snapshot truncated — use browser_act to navigate further]"
    # Reserve room for the notice so the total stays within the cap.
    budget = max(0, _MAX_SNAPSHOT_CHARS - len(notice))
    return tree[:budget] + notice
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Stateless session helpers — persist / restore browser state across pods
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Module-level cache of sessions known to be alive on this pod.
# Avoids the subprocess probe on every tool call within the same pod.
# Entries are added by ``_ensure_session`` and removed by
# ``close_browser_session``.
_alive_sessions: set[str] = set()

# Per-session locks to prevent concurrent _ensure_session calls from
# triggering duplicate _restore_browser_state for the same session.
# Protected by _session_locks_mutex to ensure setdefault/pop are not
# interleaved across await boundaries.
_session_locks: dict[str, asyncio.Lock] = {}
_session_locks_mutex = asyncio.Lock()

# Workspace filename for persisted browser state (auto-scoped to session).
# Dot-prefixed so it is hidden from user workspace listings.
_STATE_FILENAME = "._browser_state.json"

# Maximum concurrent subprocesses during cookie/storage restore.
# Used as the semaphore size in ``_restore_browser_state``.
_RESTORE_CONCURRENCY = 10

# Maximum cookies to restore per session.  Pathological sites can accumulate
# thousands of cookies; restoring them all would be slow and is rarely useful.
# Only the first ``_MAX_RESTORE_COOKIES`` saved cookies are re-applied.
_MAX_RESTORE_COOKIES = 100

# Background tasks for fire-and-forget state persistence.
# Prevents GC from collecting tasks before they complete.
# Tasks remove themselves through a done-callback (see ``_fire_and_forget_save``).
_background_tasks: set[asyncio.Task] = set()
|
|
|
|
|
|
def _fire_and_forget_save(
    session_name: str, user_id: str, session: ChatSession
) -> None:
    """Kick off ``_save_browser_state`` in the background without awaiting it.

    Saving is best-effort (``_save_browser_state`` swallows its own errors),
    so the caller can return its tool response without waiting for the save.
    Must be called while an event loop is running, since it creates a task.
    """
    save_task = asyncio.create_task(
        _save_browser_state(session_name, user_id, session)
    )
    # Hold a strong reference until the task finishes, then drop it.
    _background_tasks.add(save_task)
    save_task.add_done_callback(_background_tasks.discard)
|
|
|
|
|
|
async def _has_local_session(session_name: str) -> bool:
    """Report whether ``agent-browser get url`` succeeds for this session.

    A zero exit code within the 5-second timeout is taken to mean the local
    daemon for the session is up.
    """
    exit_code, _out, _err = await _run(session_name, "get", "url", timeout=5)
    return exit_code == 0
|
|
|
|
|
|
async def _save_browser_state(
    session_name: str, user_id: str, session: ChatSession
) -> None:
    """Persist browser state (cookies, localStorage, URL) to workspace.

    Queries the current URL, cookies and localStorage in parallel and writes
    them as JSON to ``_STATE_FILENAME`` in the session's workspace, overwriting
    any previous snapshot.

    If all three queries fail, nothing is written. Writing then would replace
    the last good snapshot with an empty one and lose the cookies needed to
    restore the session later.

    Best-effort: errors are logged but never propagate to the tool response.
    """
    try:
        # Gather state in parallel
        (rc_url, url_out, _), (rc_ck, ck_out, _), (rc_ls, ls_out, _) = (
            await asyncio.gather(
                _run(session_name, "get", "url", timeout=10),
                _run(session_name, "cookies", "get", "--json", timeout=10),
                _run(session_name, "storage", "local", "--json", timeout=10),
            )
        )

        if rc_url != 0 and rc_ck != 0 and rc_ls != 0:
            # Nothing could be read, so keep the previously saved state.
            logger.debug(
                "[browser] State save skipped for session %s: "
                "browser state unavailable",
                session_name,
            )
            return

        state = {
            "url": url_out.strip() if rc_url == 0 else "",
            "cookies": (json.loads(ck_out) if rc_ck == 0 and ck_out.strip() else []),
            "local_storage": (
                json.loads(ls_out) if rc_ls == 0 and ls_out.strip() else {}
            ),
        }

        manager = await get_workspace_manager(user_id, session.session_id)
        await manager.write_file(
            content=json.dumps(state).encode("utf-8"),
            filename=_STATE_FILENAME,
            mime_type="application/json",
            overwrite=True,
        )
    except Exception:
        logger.warning(
            "[browser] Failed to save browser state for session %s",
            session_name,
            exc_info=True,
        )
|
|
|
|
|
|
async def _restore_browser_state(
    session_name: str, user_id: str, session: ChatSession
) -> bool:
    """Re-apply the saved browser state from the workspace to this session.

    Reads ``_STATE_FILENAME`` from the session's workspace, re-opens the saved
    URL (after validating its host), then re-applies cookies and localStorage
    entries through individual ``agent-browser`` commands.

    Best-effort: any exception is logged and turned into a ``False`` return.

    Returns:
        ``True`` when the state was applied or there was nothing saved;
        ``False`` when the saved URL was rejected, could not be opened, or an
        unexpected error occurred.
    """
    try:
        workspace = await get_workspace_manager(user_id, session.session_id)

        if await workspace.get_file_info_by_path(_STATE_FILENAME) is None:
            # No saved state — first call or never saved
            return True

        raw_state = await workspace.read_file(_STATE_FILENAME)
        saved = json.loads(raw_state.decode("utf-8"))

        saved_url = saved.get("url", "")
        saved_cookies = saved.get("cookies", [])
        saved_storage = saved.get("local_storage", {})

        # Open the page first: this starts the daemon and sets the origin that
        # the cookies / localStorage entries belong to.
        if saved_url:
            # The URL comes from storage, so re-validate it against SSRF.
            try:
                await validate_url_host(saved_url)
            except ValueError:
                logger.warning(
                    "[browser] State restore: blocked SSRF URL %s", saved_url[:200]
                )
                return False

            open_rc, _, open_err = await _run(session_name, "open", saved_url)
            if open_rc != 0:
                logger.warning(
                    "[browser] State restore: failed to open %s: %s",
                    saved_url,
                    open_err[:200],
                )
                return False
            await _run(session_name, "wait", "--load", "load", timeout=15)

        # Pathological sites can store thousands of cookies; only replay a
        # bounded prefix.
        if len(saved_cookies) > _MAX_RESTORE_COOKIES:
            logger.debug(
                "[browser] State restore: capping cookies from %d to %d",
                len(saved_cookies),
                _MAX_RESTORE_COOKIES,
            )
            saved_cookies = saved_cookies[:_MAX_RESTORE_COOKIES]

        # Every cookie / storage entry is one subprocess; the semaphore bounds
        # how many run at the same time.
        limiter = asyncio.Semaphore(_RESTORE_CONCURRENCY)

        async def _apply_cookie(cookie: dict[str, Any]) -> None:
            cookie_name = cookie.get("name", "")
            cookie_value = cookie.get("value", "")
            cookie_domain = cookie.get("domain", "")
            cookie_path = cookie.get("path", "/")
            if not cookie_name or not cookie_domain:
                return
            async with limiter:
                set_rc, _, set_err = await _run(
                    session_name,
                    "cookies",
                    "set",
                    cookie_name,
                    cookie_value,
                    "--domain",
                    cookie_domain,
                    "--path",
                    cookie_path,
                    timeout=5,
                )
                if set_rc != 0:
                    logger.debug(
                        "[browser] State restore: cookie set failed for %s: %s",
                        cookie_name,
                        set_err[:100],
                    )

        async def _apply_storage_item(item_key: str, item_val: object) -> None:
            async with limiter:
                set_rc, _, set_err = await _run(
                    session_name,
                    "storage",
                    "local",
                    "set",
                    item_key,
                    str(item_val),
                    timeout=5,
                )
                if set_rc != 0:
                    logger.debug(
                        "[browser] State restore: localStorage set failed for %s: %s",
                        item_key,
                        set_err[:100],
                    )

        jobs = [_apply_cookie(entry) for entry in saved_cookies]
        jobs.extend(
            _apply_storage_item(key, val) for key, val in saved_storage.items()
        )
        await asyncio.gather(*jobs)

        return True
    except Exception:
        logger.warning(
            "[browser] Failed to restore browser state for session %s",
            session_name,
            exc_info=True,
        )
        return False
|
|
|
|
|
|
async def _ensure_session(
    session_name: str, user_id: str, session: ChatSession
) -> None:
    """Make sure this process has a usable browser session for *session_name*.

    Sessions already recorded in ``_alive_sessions`` return immediately.
    Otherwise, under a per-session lock, the local daemon is probed; if it
    does not respond, the saved state is restored from the workspace. The
    session is recorded as alive when either step succeeds. A failed restore
    leaves it unrecorded, so the next call tries again.
    """
    if session_name in _alive_sessions:
        return

    # Look up or create the per-session lock under the registry mutex.
    async with _session_locks_mutex:
        session_lock = _session_locks.setdefault(session_name, asyncio.Lock())

    async with session_lock:
        # Re-check: another coroutine may have finished while we waited.
        if session_name in _alive_sessions:
            return
        daemon_running = await _has_local_session(session_name)
        if daemon_running or await _restore_browser_state(
            session_name, user_id, session
        ):
            _alive_sessions.add(session_name)
|
|
|
|
|
|
async def close_browser_session(session_name: str, user_id: str | None = None) -> None:
    """Close the session's agent-browser daemon and remove its saved state.

    Removes the session from the in-process caches. When *user_id* is given,
    it deletes ``._browser_state.json`` from workspace storage so cookies and
    other credentials do not outlive the session. Finally it issues
    ``agent-browser close``.

    Best-effort: every failure is logged at debug level and never raised.
    """
    _alive_sessions.discard(session_name)
    async with _session_locks_mutex:
        _session_locks.pop(session_name, None)

    # Remove the persisted cookies / localStorage snapshot, if any.
    if user_id:
        try:
            workspace = await get_workspace_manager(user_id, session_name)
            state_file = await workspace.get_file_info_by_path(_STATE_FILENAME)
            if state_file is not None:
                await workspace.delete_file(state_file.id)
        except Exception:
            logger.debug(
                "[browser] Failed to delete state file for session %s",
                session_name,
                exc_info=True,
            )

    try:
        exit_code, _, err = await _run(session_name, "close", timeout=10)
    except Exception:
        logger.debug(
            "[browser] Exception closing browser session %s",
            session_name,
            exc_info=True,
        )
        return

    if exit_code != 0:
        logger.debug(
            "[browser] close failed for session %s: %s",
            session_name,
            err[:200],
        )
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Tool: browser_navigate
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class BrowserNavigateTool(BaseTool):
    """Navigate to a URL and return the page's interactive elements.

    The browser session persists across tool calls within this Copilot session
    (keyed to session_id), so cookies and auth state carry over.  This enables
    full login flows: navigate to login page → browser_act to fill credentials
    → browser_act to submit → browser_navigate to the target page.
    """

    @property
    def name(self) -> str:
        """Tool identifier exposed to the LLM."""
        return "browser_navigate"

    @property
    def description(self) -> str:
        """Short usage guidance shown to the LLM."""
        return (
            "Navigate to a URL in a real browser. Returns accessibility tree with @ref IDs "
            "for browser_act. Session persists (cookies/auth carry over). "
            "For static pages, prefer web_fetch. "
            "For SPAs, elements may load late — use browser_act with wait + browser_screenshot to verify. "
            "For auth: navigate to login, fill creds and submit with browser_act, then navigate to target."
        )

    @property
    def parameters(self) -> dict[str, Any]:
        """JSON schema of the accepted arguments (``url`` is required)."""
        return {
            "type": "object",
            "properties": {
                "url": {
                    "type": "string",
                    "description": "HTTP/HTTPS URL to navigate to.",
                },
                "wait_for": {
                    "type": "string",
                    "enum": ["networkidle", "load", "domcontentloaded"],
                    "default": "networkidle",
                    "description": "Navigation completion strategy (default: networkidle).",
                },
            },
            "required": ["url"],
        }

    @property
    def requires_auth(self) -> bool:
        """This tool is only offered to authenticated users."""
        return True

    @property
    def is_available(self) -> bool:
        """True when the ``agent-browser`` executable is found on PATH."""
        return shutil.which("agent-browser") is not None

    async def _execute(
        self,
        user_id: str | None,
        session: ChatSession,
        **kwargs: Any,
    ) -> ToolResponseBase:
        """Navigate to *url*, wait for the page to settle, and return a snapshot.

        The snapshot is an accessibility-tree listing of interactive elements.
        Note: for slow SPAs that never fully idle, the snapshot may reflect a
        partially-loaded state (the wait is best-effort).

        Recognised kwargs:
            url: Target URL; required and validated with ``validate_url_host``.
            wait_for: One of ``networkidle``, ``load``, ``domcontentloaded``.
                Any other value falls back to ``networkidle``.

        Returns:
            ``BrowserNavigateResponse`` on success. ``ErrorResponse`` with
            ``missing_url``, ``blocked_url`` or ``navigation_failed`` otherwise.
        """
        url: str = (kwargs.get("url") or "").strip()
        session_name = session.session_id

        # The schema declares an enum, but LLM-provided arguments are not
        # guaranteed to honour it. Never forward an arbitrary value to the CLI.
        allowed_waits = ("networkidle", "load", "domcontentloaded")
        raw_wait = kwargs.get("wait_for") or "networkidle"
        wait_for: str = raw_wait.strip() if isinstance(raw_wait, str) else ""
        if wait_for not in allowed_waits:
            logger.warning(
                "[browser_navigate] unsupported wait_for %r; using networkidle",
                raw_wait,
            )
            wait_for = "networkidle"

        if not url:
            return ErrorResponse(
                message="Please provide a URL to navigate to.",
                error="missing_url",
                session_id=session_name,
            )

        try:
            await validate_url_host(url)
        except ValueError as e:
            return ErrorResponse(
                message=str(e),
                error="blocked_url",
                session_id=session_name,
            )

        # Restore browser state from cloud if this is a different pod
        if user_id:
            await _ensure_session(session_name, user_id, session)

        # Navigate
        rc, _, stderr = await _run(session_name, "open", url)
        if rc != 0:
            logger.warning(
                "[browser_navigate] open failed for %s: %s", url, stderr[:300]
            )
            return ErrorResponse(
                message="Failed to navigate to URL.",
                error="navigation_failed",
                session_id=session_name,
            )

        # Wait for page to settle (best-effort: some SPAs never reach networkidle)
        wait_rc, _, wait_err = await _run(session_name, "wait", "--load", wait_for)
        if wait_rc != 0:
            logger.warning(
                "[browser_navigate] wait(%s) failed: %s", wait_for, wait_err[:300]
            )

        # Get current title and URL in parallel
        (_, title_out, _), (_, url_out, _) = await asyncio.gather(
            _run(session_name, "get", "title"),
            _run(session_name, "get", "url"),
        )

        snapshot = await _snapshot(session_name)

        result = BrowserNavigateResponse(
            message=f"Navigated to {url}",
            # Prefer the URL the browser reports; fall back to the requested one.
            url=url_out.strip() or url,
            title=title_out.strip(),
            snapshot=snapshot,
            session_id=session_name,
        )

        # Persist browser state to cloud for cross-pod continuity
        if user_id:
            _fire_and_forget_save(session_name, user_id, session)

        return result
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Tool: browser_act
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Action groups used by BrowserActTool._execute to decide which arguments an
# action needs and how the agent-browser command line is built.
# Actions that take no arguments at all.
_NO_TARGET_ACTIONS = frozenset({"back", "forward", "reload"})
# Scrolling takes a direction instead of a target.
_SCROLL_ACTIONS = frozenset({"scroll"})
# Actions that need a target element only.
_TARGET_ONLY_ACTIONS = frozenset({"click", "dblclick", "hover", "check", "uncheck"})
# Actions that need both a target element and a value.
_TARGET_VALUE_ACTIONS = frozenset({"fill", "type", "select"})
# wait <selector|ms>: waits for a DOM element or a fixed delay (e.g. "1000" for 1 s)
_WAIT_ACTIONS = frozenset({"wait"})
|
|
|
|
|
|
class BrowserActTool(BaseTool):
    """Perform an action on the current browser page and return the updated snapshot.

    Use @ref IDs from the snapshot returned by browser_navigate (e.g. '@e3').
    The LLM orchestrates multi-step flows by chaining browser_navigate and
    browser_act calls across turns of the Claude Agent SDK conversation.
    """

    @property
    def name(self) -> str:
        """Tool identifier exposed to the LLM."""
        return "browser_act"

    @property
    def description(self) -> str:
        """Short usage guidance shown to the LLM."""
        return (
            "Interact with the current browser page using @ref IDs from the snapshot. "
            "Actions: click, dblclick, fill, type, scroll, hover, press, "
            "check, uncheck, select, wait, back, forward, reload. "
            "fill clears field first; type appends. "
            "wait accepts CSS selector or milliseconds (e.g. '1000'). "
            "Returns updated snapshot."
        )

    @property
    def parameters(self) -> dict[str, Any]:
        """JSON schema of the accepted arguments (``action`` is required)."""
        return {
            "type": "object",
            "properties": {
                "action": {
                    "type": "string",
                    "enum": [
                        "click",
                        "dblclick",
                        "fill",
                        "type",
                        "scroll",
                        "hover",
                        "press",
                        "check",
                        "uncheck",
                        "select",
                        "wait",
                        "back",
                        "forward",
                        "reload",
                    ],
                    "description": "Action to perform.",
                },
                "target": {
                    "type": "string",
                    "description": "@ref ID (e.g. '@e3'), CSS selector, or text. Required for: click, dblclick, fill, type, hover, check, uncheck, select. For wait: CSS selector or milliseconds string (e.g. '1000').",
                },
                "value": {
                    "type": "string",
                    "description": "Text for fill/type, key for press (e.g. 'Enter'), option for select.",
                },
                "direction": {
                    "type": "string",
                    "enum": ["up", "down", "left", "right"],
                    "default": "down",
                    "description": "Scroll direction (default: down).",
                },
            },
            "required": ["action"],
        }

    @property
    def requires_auth(self) -> bool:
        """This tool is only offered to authenticated users."""
        return True

    @property
    def is_available(self) -> bool:
        """True when the ``agent-browser`` executable is found on PATH."""
        return shutil.which("agent-browser") is not None

    async def _execute(
        self,
        user_id: str | None,
        session: ChatSession,
        **kwargs: Any,
    ) -> ToolResponseBase:
        """Perform a browser action and return an updated page snapshot.

        Validates the *action*/*target*/*value* combination, delegates to
        ``agent-browser``, waits for the page to settle, and returns the
        accessibility-tree snapshot so the LLM can plan the next step.

        Recognised kwargs:
            action: One of the actions listed in ``parameters``.
            target: Element reference / selector, or the wait argument.
            value: Text for ``fill``/``type`` (sent verbatim, whitespace
                preserved), key for ``press``, option for ``select``.
            direction: Scroll direction; must be up, down, left or right.

        Returns:
            ``BrowserActResponse`` on success. ``ErrorResponse`` when
            arguments are missing or invalid, or when the action fails.
        """
        action: str = (kwargs.get("action") or "").strip()
        target: str = (kwargs.get("target") or "").strip()
        # Keep the untrimmed text: for fill/type, whitespace is part of the input.
        raw_value: str = kwargs.get("value") or ""
        value: str = raw_value.strip()
        direction: str = (kwargs.get("direction") or "down").strip().lower()
        session_name = session.session_id

        if not action:
            return ErrorResponse(
                message="Please specify an action.",
                error="missing_action",
                session_id=session_name,
            )

        # Build the agent-browser command args
        if action in _NO_TARGET_ACTIONS:
            cmd_args = [action]

        elif action in _SCROLL_ACTIONS:
            # Reject anything outside the schema's enum with a specific error
            # instead of letting the CLI fail generically.
            if direction not in ("up", "down", "left", "right"):
                return ErrorResponse(
                    message=(
                        "'scroll' direction must be one of: up, down, left, right."
                    ),
                    error="invalid_direction",
                    session_id=session_name,
                )
            cmd_args = ["scroll", direction]

        elif action == "press":
            if not value:
                return ErrorResponse(
                    message="'press' requires a 'value' (key name, e.g. 'Enter').",
                    error="missing_value",
                    session_id=session_name,
                )
            cmd_args = ["press", value]

        elif action in _TARGET_ONLY_ACTIONS:
            if not target:
                return ErrorResponse(
                    message=f"'{action}' requires a 'target' element.",
                    error="missing_target",
                    session_id=session_name,
                )
            cmd_args = [action, target]

        elif action in _TARGET_VALUE_ACTIONS:
            # fill/type enter free text: trimming would make it impossible to
            # type a single space or keep leading/trailing whitespace.
            # select names an option, so the trimmed form is used there.
            payload = value if action == "select" else raw_value
            if not target or not payload:
                return ErrorResponse(
                    message=f"'{action}' requires both 'target' and 'value'.",
                    error="missing_params",
                    session_id=session_name,
                )
            cmd_args = [action, target, payload]

        elif action in _WAIT_ACTIONS:
            if not target:
                return ErrorResponse(
                    message=(
                        "'wait' requires a 'target': a CSS selector to wait for, "
                        "or milliseconds as a string (e.g. '1000')."
                    ),
                    error="missing_target",
                    session_id=session_name,
                )
            cmd_args = ["wait", target]

        else:
            return ErrorResponse(
                message=f"Unsupported action: {action}",
                error="invalid_action",
                session_id=session_name,
            )

        # Restore browser state from cloud if this is a different pod
        if user_id:
            await _ensure_session(session_name, user_id, session)

        rc, _, stderr = await _run(session_name, *cmd_args)
        if rc != 0:
            logger.warning("[browser_act] %s failed: %s", action, stderr[:300])
            return ErrorResponse(
                message=f"Action '{action}' failed.",
                error="action_failed",
                session_id=session_name,
            )

        # Allow the page to settle after interaction (best-effort: SPAs may not idle)
        settle_rc, _, settle_err = await _run(
            session_name, "wait", "--load", "networkidle"
        )
        if settle_rc != 0:
            logger.warning(
                "[browser_act] post-action wait failed: %s", settle_err[:300]
            )

        snapshot = await _snapshot(session_name)
        _, url_out, _ = await _run(session_name, "get", "url")

        result = BrowserActResponse(
            message=f"Performed '{action}'" + (f" on '{target}'" if target else ""),
            action=action,
            current_url=url_out.strip(),
            snapshot=snapshot,
            session_id=session_name,
        )

        # Persist browser state to cloud for cross-pod continuity
        if user_id:
            _fire_and_forget_save(session_name, user_id, session)

        return result
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Tool: browser_screenshot
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class BrowserScreenshotTool(BaseTool):
    """Capture a screenshot of the current browser page and save it to the workspace."""

    @property
    def name(self) -> str:
        """Tool identifier exposed to the LLM."""
        return "browser_screenshot"

    @property
    def description(self) -> str:
        """Short usage guidance shown to the LLM."""
        return (
            "Screenshot the current browser page and save to workspace. "
            "annotate=true overlays @ref labels on elements. "
            "IMPORTANT: After calling, you MUST immediately call read_workspace_file with the "
            "returned file_id to display the image inline."
        )

    @property
    def parameters(self) -> dict[str, Any]:
        """JSON schema of the accepted arguments (all optional)."""
        return {
            "type": "object",
            "properties": {
                "annotate": {
                    "type": "boolean",
                    "default": True,
                    "description": "Overlay @ref labels (default: true).",
                },
                "filename": {
                    "type": "string",
                    "default": "screenshot.png",
                    "description": "Workspace filename (default: screenshot.png).",
                },
            },
        }

    @property
    def requires_auth(self) -> bool:
        """This tool is only offered to authenticated users."""
        return True

    @property
    def is_available(self) -> bool:
        """True when the ``agent-browser`` executable is found on PATH."""
        return shutil.which("agent-browser") is not None

    async def _execute(
        self,
        user_id: str | None,
        session: ChatSession,
        **kwargs: Any,
    ) -> ToolResponseBase:
        """Capture a PNG screenshot and upload it to the workspace.

        Handles string-to-bool coercion for *annotate* (OpenAI function-call
        payloads sometimes deliver ``"true"``/``"false"`` as strings).
        Returns a :class:`BrowserScreenshotResponse` with the workspace
        ``file_id`` the LLM should pass to ``read_workspace_file``.

        Recognised kwargs:
            annotate: Whether to pass ``--annotate`` (default ``True``).
            filename: Workspace filename; blank values fall back to
                ``screenshot.png``.

        Returns:
            ``BrowserScreenshotResponse`` on success. ``ErrorResponse`` with
            ``screenshot_failed`` when the CLI fails or produces no image
            data, or ``workspace_write_failed`` when the upload fails.
        """
        raw_annotate = kwargs.get("annotate", True)
        if isinstance(raw_annotate, str):
            annotate = raw_annotate.strip().lower() in {"1", "true", "yes", "on"}
        else:
            annotate = bool(raw_annotate)
        # A whitespace-only filename would strip to "", so re-apply the default.
        filename: str = (
            kwargs.get("filename") or "screenshot.png"
        ).strip() or "screenshot.png"
        session_name = session.session_id

        # Restore browser state from cloud if this is a different pod
        if user_id:
            await _ensure_session(session_name, user_id, session)

        # mkstemp creates an empty file; agent-browser is asked to write to its path.
        tmp_fd, tmp_path = tempfile.mkstemp(suffix=".png")
        os.close(tmp_fd)
        try:
            cmd_args = ["screenshot"]
            if annotate:
                cmd_args.append("--annotate")
            cmd_args.append(tmp_path)

            rc, _, stderr = await _run(session_name, *cmd_args)
            if rc != 0:
                logger.warning("[browser_screenshot] failed: %s", stderr[:300])
                return ErrorResponse(
                    message="Failed to take screenshot.",
                    error="screenshot_failed",
                    session_id=session_name,
                )

            try:
                with open(tmp_path, "rb") as f:
                    png_bytes = f.read()
            except OSError:
                logger.warning(
                    "[browser_screenshot] could not read screenshot file",
                    exc_info=True,
                )
                png_bytes = b""

        finally:
            try:
                os.unlink(tmp_path)
            except OSError:
                pass  # Best-effort temp file cleanup; not critical if it fails.

        # The CLI exited 0 but left the placeholder file empty or unreadable.
        # Report that rather than upload a zero-byte "screenshot".
        if not png_bytes:
            logger.warning("[browser_screenshot] no image data was produced")
            return ErrorResponse(
                message="Failed to take screenshot.",
                error="screenshot_failed",
                session_id=session_name,
            )

        # Upload to workspace so the user can view it
        png_b64 = base64.b64encode(png_bytes).decode()

        # Import here to avoid circular deps — workspace_files imports from .models
        from .workspace_files import WorkspaceWriteResponse, WriteWorkspaceFileTool

        write_resp = await WriteWorkspaceFileTool()._execute(
            user_id=user_id,
            session=session,
            filename=filename,
            content_base64=png_b64,
        )

        if not isinstance(write_resp, WorkspaceWriteResponse):
            return ErrorResponse(
                message="Screenshot taken but failed to save to workspace.",
                error="workspace_write_failed",
                session_id=session_name,
            )

        result = BrowserScreenshotResponse(
            message=f"Screenshot saved to workspace as '{filename}'. Use read_workspace_file with file_id='{write_resp.file_id}' to retrieve it.",
            file_id=write_resp.file_id,
            filename=filename,
            session_id=session_name,
        )

        # Persist browser state to cloud for cross-pod continuity
        if user_id:
            _fire_and_forget_save(session_name, user_id, session)

        return result
|