feat(copilot): Add agent-browser multi-step browser automation tools (#12230)

## Summary

Adds three new Copilot tools for multi-step browser automation using the
[agent-browser](https://github.com/vercel-labs/agent-browser) CLI
(Playwright-based local daemon):

- **`browser_navigate`** — navigate to a URL and get an
accessibility-tree snapshot with `@ref` IDs
- **`browser_act`** — interact with page elements (click, fill, scroll,
check, press, select, `dblclick`, `type`, `wait`, back, forward,
reload); returns updated snapshot
- **`browser_screenshot`** — capture annotated screenshot (with `@ref`
overlays) and save to user workspace

Also adds **`browse_web`** (Stagehand + Browserbase) for one-shot
JS-rendered page extraction.

### Why two browser tools?

| Tool | When to use |
|------|-------------|
| `browse_web` | Single-shot extraction — cloud Browserbase session, no
local daemon needed |
| `browser_navigate` / `browser_act` | Multi-step flows (login →
navigate → scrape), persistent session within a Copilot session |

### Design decisions

- **SSRF protection**: Uses the same `validate_url()` from
`backend.util.request` as HTTP blocks — async DNS, all IPs checked, full
RFC 1918 + link-local + IPv6 coverage
- **Session isolation**: `_run()` passes both `--session <id>` (isolated
Chromium context per Copilot session) **and** `--session-name <id>`
(persist cookies within a session), preventing cross-session state
leakage while supporting login flows
- **Snapshot truncation**: Interactive-only accessibility tree
(`snapshot -i`) capped at 20 000 chars with a continuation hint
- **Screenshot storage**: PNG bytes uploaded to user workspace via
`WriteWorkspaceFileTool`; returns `file_id` for retrieval

### Bugs fixed in this PR

- Session isolation bug: `--session-name` alone shared browser history
across different Copilot sessions; added `--session` to isolate contexts
- Missing actions: added `dblclick`, `type` (append without clearing),
`wait` (CSS selector or ms delay)

## Test plan

- [x] 53 unit tests covering all three tools, all actions, SSRF
integration, auth check, session isolation, snapshot truncation,
timeout, missing binary
- [x] Integration test: real `agent-browser` CLI + Anthropic API
tool-calling loop (3/3 scenarios passed)
- [x] Linting (Ruff, isort, Black, Pyright) all passing

```
backend/copilot/tools/agent_browser_test.py  53 passed in 17.79s
```
This commit is contained in:
Zamil Majdy
2026-03-04 04:55:28 +07:00
committed by GitHub
parent 29da8db48e
commit b504cf9854
10 changed files with 2695 additions and 13 deletions

View File

@@ -672,6 +672,16 @@ async def delete_chat_session(session_id: str, user_id: str | None = None) -> bo
async with _session_locks_mutex:
_session_locks.pop(session_id, None)
# Shut down any local browser daemon for this session (best-effort).
# Inline import required: all tool modules import ChatSession from this
# module, so any top-level import from tools.* would create a cycle.
try:
from .tools.agent_browser import close_browser_session
await close_browser_session(session_id, user_id=user_id)
except Exception as e:
logger.debug(f"Browser cleanup for session {session_id}: {e}")
return True

View File

@@ -59,7 +59,7 @@ from .response_model import (
StreamToolOutputAvailable,
StreamUsage,
)
from .tools import execute_tool, tools
from .tools import execute_tool, get_available_tools
from .tools.models import ErrorResponse
from .tracking import track_user_message
@@ -514,7 +514,7 @@ async def stream_chat_completion(
)
async for chunk in _stream_chat_chunks(
session=session,
tools=tools,
tools=get_available_tools(),
system_prompt=system_prompt,
text_block_id=text_block_id,
):

View File

@@ -1,12 +1,14 @@
from __future__ import annotations
import logging
from typing import TYPE_CHECKING, Any
from openai.types.chat import ChatCompletionToolParam
from backend.copilot.model import ChatSession
from backend.copilot.tracking import track_tool_called
from .add_understanding import AddUnderstandingTool
from .agent_browser import BrowserActTool, BrowserNavigateTool, BrowserScreenshotTool
from .agent_output import AgentOutputTool
from .base import BaseTool
from .bash_exec import BashExecTool
@@ -30,6 +32,7 @@ from .workspace_files import (
)
if TYPE_CHECKING:
from backend.copilot.model import ChatSession
from backend.copilot.response_model import StreamToolOutputAvailable
logger = logging.getLogger(__name__)
@@ -50,6 +53,10 @@ TOOL_REGISTRY: dict[str, BaseTool] = {
"get_doc_page": GetDocPageTool(),
# Web fetch for safe URL retrieval
"web_fetch": WebFetchTool(),
# Agent-browser multi-step automation (navigate, act, screenshot)
"browser_navigate": BrowserNavigateTool(),
"browser_act": BrowserActTool(),
"browser_screenshot": BrowserScreenshotTool(),
# Sandboxed code execution (bubblewrap)
"bash_exec": BashExecTool(),
# Persistent workspace tools (cloud storage, survives across sessions)
@@ -67,10 +74,17 @@ TOOL_REGISTRY: dict[str, BaseTool] = {
find_agent_tool = TOOL_REGISTRY["find_agent"]
run_agent_tool = TOOL_REGISTRY["run_agent"]
# Generated from registry for OpenAI API
tools: list[ChatCompletionToolParam] = [
tool.as_openai_tool() for tool in TOOL_REGISTRY.values()
]
def get_available_tools() -> list[ChatCompletionToolParam]:
"""Return OpenAI tool schemas for tools available in the current environment.
Called per-request so that env-var or binary availability is evaluated
fresh each time (e.g. browser_* tools are excluded when agent-browser
CLI is not installed).
"""
return [
tool.as_openai_tool() for tool in TOOL_REGISTRY.values() if tool.is_available
]
def get_tool(tool_name: str) -> BaseTool | None:

View File

@@ -0,0 +1,876 @@
"""Agent-browser tools — multi-step browser automation for the Copilot.
Uses the agent-browser CLI (https://github.com/vercel-labs/agent-browser)
which runs a local Chromium instance managed by a persistent daemon.
- Runs locally — no cloud account required
- Full interaction support: click, fill, scroll, login flows, multi-step
- Session persistence via --session-name: cookies/auth carry across tool calls
within the same Copilot session, enabling login → navigate → extract workflows
- Screenshot with --annotate overlays @ref labels, saved to workspace for user
- The Claude Agent SDK's multi-turn loop handles orchestration — each tool call
is one browser action; the LLM chains them naturally
SSRF protection:
Uses the shared validate_url() from backend.util.request, which is the same
guard used by HTTP blocks and web_fetch. It resolves ALL DNS answers (not just
the first), blocks RFC 1918, loopback, link-local, 0.0.0.0/8, multicast,
and all relevant IPv6 ranges, and applies IDNA encoding to prevent Unicode
domain attacks.
Requires:
npm install -g agent-browser
agent-browser install (downloads Chromium, one-time per machine)
"""
import asyncio
import base64
import json
import logging
import os
import shutil
import tempfile
from typing import Any
from backend.copilot.model import ChatSession
from backend.util.request import validate_url
from .base import BaseTool
from .models import (
BrowserActResponse,
BrowserNavigateResponse,
BrowserScreenshotResponse,
ErrorResponse,
ToolResponseBase,
)
from .workspace_files import get_manager
logger = logging.getLogger(__name__)
# Per-command timeout (seconds). Navigation + networkidle wait can be slow.
_CMD_TIMEOUT = 45
# Accessibility tree can be very large; cap it to keep LLM context manageable.
_MAX_SNAPSHOT_CHARS = 20_000
# ---------------------------------------------------------------------------
# Subprocess helper
# ---------------------------------------------------------------------------
async def _run(
session_name: str,
*args: str,
timeout: int = _CMD_TIMEOUT,
) -> tuple[int, str, str]:
"""Run agent-browser for the given session and return (rc, stdout, stderr).
Uses both:
--session <name> → isolated Chromium context (no shared history/cookies
with other Copilot sessions — prevents cross-session
browser state leakage)
--session-name <name> → persist cookies/localStorage across tool calls within
the same session (enables login → navigate flows)
"""
cmd = [
"agent-browser",
"--session",
session_name,
"--session-name",
session_name,
*args,
]
proc = None
try:
proc = await asyncio.create_subprocess_exec(
*cmd,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
)
stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=timeout)
return proc.returncode or 0, stdout.decode(), stderr.decode()
except asyncio.TimeoutError:
# Kill the orphaned subprocess so it does not linger in the process table.
if proc is not None and proc.returncode is None:
proc.kill()
try:
await proc.communicate()
except Exception:
pass # Best-effort reap; ignore errors during cleanup.
return 1, "", f"Command timed out after {timeout}s."
except FileNotFoundError:
return (
1,
"",
"agent-browser is not installed (run: npm install -g agent-browser && agent-browser install).",
)
async def _snapshot(session_name: str) -> str:
"""Return the current page's interactive accessibility tree, truncated."""
rc, stdout, stderr = await _run(session_name, "snapshot", "-i", "-c")
if rc != 0:
return f"[snapshot failed: {stderr[:300]}]"
text = stdout.strip()
if len(text) > _MAX_SNAPSHOT_CHARS:
suffix = "\n\n[Snapshot truncated — use browser_act to navigate further]"
keep = max(0, _MAX_SNAPSHOT_CHARS - len(suffix))
text = text[:keep] + suffix
return text
# ---------------------------------------------------------------------------
# Stateless session helpers — persist / restore browser state across pods
# ---------------------------------------------------------------------------
# Module-level cache of sessions known to be alive on this pod.
# Avoids the subprocess probe on every tool call within the same pod.
_alive_sessions: set[str] = set()
# Per-session locks to prevent concurrent _ensure_session calls from
# triggering duplicate _restore_browser_state for the same session.
# Protected by _session_locks_mutex to ensure setdefault/pop are not
# interleaved across await boundaries.
_session_locks: dict[str, asyncio.Lock] = {}
_session_locks_mutex = asyncio.Lock()
# Workspace filename for persisted browser state (auto-scoped to session).
# Dot-prefixed so it is hidden from user workspace listings.
_STATE_FILENAME = "._browser_state.json"
# Maximum concurrent subprocesses during cookie/storage restore.
_RESTORE_CONCURRENCY = 10
# Maximum cookies to restore per session. Pathological sites can accumulate
# thousands of cookies; restoring them all would be slow and is rarely useful.
_MAX_RESTORE_COOKIES = 100
# Background tasks for fire-and-forget state persistence.
# Prevents GC from collecting tasks before they complete.
_background_tasks: set[asyncio.Task] = set()
def _fire_and_forget_save(
session_name: str, user_id: str, session: ChatSession
) -> None:
"""Schedule state persistence as a background task (non-blocking).
State save is already best-effort (errors are swallowed), so running it
in the background avoids adding latency to tool responses.
"""
task = asyncio.create_task(_save_browser_state(session_name, user_id, session))
_background_tasks.add(task)
task.add_done_callback(_background_tasks.discard)
async def _has_local_session(session_name: str) -> bool:
"""Check if the local agent-browser daemon for this session is running."""
rc, _, _ = await _run(session_name, "get", "url", timeout=5)
return rc == 0
async def _save_browser_state(
session_name: str, user_id: str, session: ChatSession
) -> None:
"""Persist browser state (cookies, localStorage, URL) to workspace.
Best-effort: errors are logged but never propagate to the tool response.
"""
try:
# Gather state in parallel
(rc_url, url_out, _), (rc_ck, ck_out, _), (rc_ls, ls_out, _) = (
await asyncio.gather(
_run(session_name, "get", "url", timeout=10),
_run(session_name, "cookies", "get", "--json", timeout=10),
_run(session_name, "storage", "local", "--json", timeout=10),
)
)
state = {
"url": url_out.strip() if rc_url == 0 else "",
"cookies": (json.loads(ck_out) if rc_ck == 0 and ck_out.strip() else []),
"local_storage": (
json.loads(ls_out) if rc_ls == 0 and ls_out.strip() else {}
),
}
manager = await get_manager(user_id, session.session_id)
await manager.write_file(
content=json.dumps(state).encode("utf-8"),
filename=_STATE_FILENAME,
mime_type="application/json",
overwrite=True,
)
except Exception:
logger.warning(
"[browser] Failed to save browser state for session %s",
session_name,
exc_info=True,
)
async def _restore_browser_state(
session_name: str, user_id: str, session: ChatSession
) -> bool:
"""Restore browser state from workspace storage into a fresh daemon.
Best-effort: errors are logged but never propagate to the tool response.
Returns True on success (or no state to restore), False on failure.
"""
try:
manager = await get_manager(user_id, session.session_id)
file_info = await manager.get_file_info_by_path(_STATE_FILENAME)
if file_info is None:
return True # No saved state — first call or never saved
state_bytes = await manager.read_file(_STATE_FILENAME)
state = json.loads(state_bytes.decode("utf-8"))
url = state.get("url", "")
cookies = state.get("cookies", [])
local_storage = state.get("local_storage", {})
# Navigate first — starts daemon + sets the correct origin for cookies
if url:
# Validate the saved URL to prevent SSRF via stored redirect targets.
try:
await validate_url(url, trusted_origins=[])
except ValueError:
logger.warning(
"[browser] State restore: blocked SSRF URL %s", url[:200]
)
return False
rc, _, stderr = await _run(session_name, "open", url)
if rc != 0:
logger.warning(
"[browser] State restore: failed to open %s: %s",
url,
stderr[:200],
)
return False
await _run(session_name, "wait", "--load", "load", timeout=15)
# Restore cookies and localStorage in parallel via asyncio.gather.
# Semaphore caps concurrent subprocess spawns so we don't overwhelm the
# system when a session has hundreds of cookies.
sem = asyncio.Semaphore(_RESTORE_CONCURRENCY)
# Guard against pathological sites with thousands of cookies.
if len(cookies) > _MAX_RESTORE_COOKIES:
logger.debug(
"[browser] State restore: capping cookies from %d to %d",
len(cookies),
_MAX_RESTORE_COOKIES,
)
cookies = cookies[:_MAX_RESTORE_COOKIES]
async def _set_cookie(c: dict[str, Any]) -> None:
name = c.get("name", "")
value = c.get("value", "")
domain = c.get("domain", "")
path = c.get("path", "/")
if not (name and domain):
return
async with sem:
rc, _, stderr = await _run(
session_name,
"cookies",
"set",
name,
value,
"--domain",
domain,
"--path",
path,
timeout=5,
)
if rc != 0:
logger.debug(
"[browser] State restore: cookie set failed for %s: %s",
name,
stderr[:100],
)
async def _set_storage(key: str, val: object) -> None:
async with sem:
rc, _, stderr = await _run(
session_name,
"storage",
"local",
"set",
key,
str(val),
timeout=5,
)
if rc != 0:
logger.debug(
"[browser] State restore: localStorage set failed for %s: %s",
key,
stderr[:100],
)
await asyncio.gather(
*[_set_cookie(c) for c in cookies],
*[_set_storage(k, v) for k, v in local_storage.items()],
)
return True
except Exception:
logger.warning(
"[browser] Failed to restore browser state for session %s",
session_name,
exc_info=True,
)
return False
async def _ensure_session(
session_name: str, user_id: str, session: ChatSession
) -> None:
"""Ensure the local browser daemon has state. Restore from cloud if needed."""
if session_name in _alive_sessions:
return
async with _session_locks_mutex:
lock = _session_locks.setdefault(session_name, asyncio.Lock())
async with lock:
# Double-check after acquiring lock — another coroutine may have restored.
if session_name in _alive_sessions:
return
if await _has_local_session(session_name):
_alive_sessions.add(session_name)
return
if await _restore_browser_state(session_name, user_id, session):
_alive_sessions.add(session_name)
async def close_browser_session(session_name: str, user_id: str | None = None) -> None:
"""Shut down the local agent-browser daemon and clean up stored state.
Deletes ``._browser_state.json`` from workspace storage so cookies and
other credentials do not linger after the session is deleted.
Best-effort: errors are logged but never raised.
"""
_alive_sessions.discard(session_name)
async with _session_locks_mutex:
_session_locks.pop(session_name, None)
# Delete persisted browser state (cookies, localStorage) from workspace.
if user_id:
try:
manager = await get_manager(user_id, session_name)
file_info = await manager.get_file_info_by_path(_STATE_FILENAME)
if file_info is not None:
await manager.delete_file(file_info.id)
except Exception:
logger.debug(
"[browser] Failed to delete state file for session %s",
session_name,
exc_info=True,
)
try:
rc, _, stderr = await _run(session_name, "close", timeout=10)
if rc != 0:
logger.debug(
"[browser] close failed for session %s: %s",
session_name,
stderr[:200],
)
except Exception:
logger.debug(
"[browser] Exception closing browser session %s",
session_name,
exc_info=True,
)
# ---------------------------------------------------------------------------
# Tool: browser_navigate
# ---------------------------------------------------------------------------
class BrowserNavigateTool(BaseTool):
"""Navigate to a URL and return the page's interactive elements.
The browser session persists across tool calls within this Copilot session
(keyed to session_id), so cookies and auth state carry over. This enables
full login flows: navigate to login page → browser_act to fill credentials
→ browser_act to submit → browser_navigate to the target page.
"""
@property
def name(self) -> str:
return "browser_navigate"
@property
def description(self) -> str:
return (
"Navigate to a URL using a real browser. Returns an accessibility "
"tree snapshot listing the page's interactive elements with @ref IDs "
"(e.g. @e3) that can be used with browser_act. "
"Session persists — cookies and login state carry over between calls. "
"Use this (with browser_act) for multi-step interaction: login flows, "
"form filling, button clicks, or anything requiring page interaction. "
"For plain static pages, prefer web_fetch — no browser overhead. "
"For authenticated pages: navigate to the login page first, use browser_act "
"to fill credentials and submit, then navigate to the target page. "
"Note: for slow SPAs, the returned snapshot may reflect a partially-loaded "
"state. If elements seem missing, use browser_act with action='wait' and a "
"CSS selector or millisecond delay, then take a browser_screenshot to verify."
)
@property
def parameters(self) -> dict[str, Any]:
return {
"type": "object",
"properties": {
"url": {
"type": "string",
"description": "The HTTP/HTTPS URL to navigate to.",
},
"wait_for": {
"type": "string",
"enum": ["networkidle", "load", "domcontentloaded"],
"default": "networkidle",
"description": "When to consider navigation complete. Use 'networkidle' for SPAs (default).",
},
},
"required": ["url"],
}
@property
def requires_auth(self) -> bool:
return True
@property
def is_available(self) -> bool:
return shutil.which("agent-browser") is not None
async def _execute(
self,
user_id: str | None,
session: ChatSession,
**kwargs: Any,
) -> ToolResponseBase:
"""Navigate to *url*, wait for the page to settle, and return a snapshot.
The snapshot is an accessibility-tree listing of interactive elements.
Note: for slow SPAs that never fully idle, the snapshot may reflect a
partially-loaded state (the wait is best-effort).
"""
url: str = (kwargs.get("url") or "").strip()
wait_for: str = kwargs.get("wait_for") or "networkidle"
session_name = session.session_id
if not url:
return ErrorResponse(
message="Please provide a URL to navigate to.",
error="missing_url",
session_id=session_name,
)
try:
await validate_url(url, trusted_origins=[])
except ValueError as e:
return ErrorResponse(
message=str(e),
error="blocked_url",
session_id=session_name,
)
# Restore browser state from cloud if this is a different pod
if user_id:
await _ensure_session(session_name, user_id, session)
# Navigate
rc, _, stderr = await _run(session_name, "open", url)
if rc != 0:
logger.warning(
"[browser_navigate] open failed for %s: %s", url, stderr[:300]
)
return ErrorResponse(
message="Failed to navigate to URL.",
error="navigation_failed",
session_id=session_name,
)
# Wait for page to settle (best-effort: some SPAs never reach networkidle)
wait_rc, _, wait_err = await _run(session_name, "wait", "--load", wait_for)
if wait_rc != 0:
logger.warning(
"[browser_navigate] wait(%s) failed: %s", wait_for, wait_err[:300]
)
# Get current title and URL in parallel
(_, title_out, _), (_, url_out, _) = await asyncio.gather(
_run(session_name, "get", "title"),
_run(session_name, "get", "url"),
)
snapshot = await _snapshot(session_name)
result = BrowserNavigateResponse(
message=f"Navigated to {url}",
url=url_out.strip() or url,
title=title_out.strip(),
snapshot=snapshot,
session_id=session_name,
)
# Persist browser state to cloud for cross-pod continuity
if user_id:
_fire_and_forget_save(session_name, user_id, session)
return result
# ---------------------------------------------------------------------------
# Tool: browser_act
# ---------------------------------------------------------------------------
_NO_TARGET_ACTIONS = frozenset({"back", "forward", "reload"})
_SCROLL_ACTIONS = frozenset({"scroll"})
_TARGET_ONLY_ACTIONS = frozenset({"click", "dblclick", "hover", "check", "uncheck"})
_TARGET_VALUE_ACTIONS = frozenset({"fill", "type", "select"})
# wait <selector|ms>: waits for a DOM element or a fixed delay (e.g. "1000" for 1 s)
_WAIT_ACTIONS = frozenset({"wait"})
class BrowserActTool(BaseTool):
"""Perform an action on the current browser page and return the updated snapshot.
Use @ref IDs from the snapshot returned by browser_navigate (e.g. '@e3').
The LLM orchestrates multi-step flows by chaining browser_navigate and
browser_act calls across turns of the Claude Agent SDK conversation.
"""
@property
def name(self) -> str:
return "browser_act"
@property
def description(self) -> str:
return (
"Interact with the current browser page. Use @ref IDs from the "
"snapshot (e.g. '@e3') to target elements. Returns an updated snapshot. "
"Supported actions: click, dblclick, fill, type, scroll, hover, press, "
"check, uncheck, select, wait, back, forward, reload. "
"fill clears the field before typing; type appends without clearing. "
"wait accepts a CSS selector (waits for element) or milliseconds string (e.g. '1000'). "
"Example login flow: fill @e1 with email → fill @e2 with password → "
"click @e3 (submit) → browser_navigate to the target page."
)
@property
def parameters(self) -> dict[str, Any]:
return {
"type": "object",
"properties": {
"action": {
"type": "string",
"enum": [
"click",
"dblclick",
"fill",
"type",
"scroll",
"hover",
"press",
"check",
"uncheck",
"select",
"wait",
"back",
"forward",
"reload",
],
"description": "The action to perform.",
},
"target": {
"type": "string",
"description": (
"Element to target. Use @ref from snapshot (e.g. '@e3'), "
"a CSS selector, or a text description. "
"Required for: click, dblclick, fill, type, hover, check, uncheck, select. "
"For wait: a CSS selector to wait for, or milliseconds as a string (e.g. '1000')."
),
},
"value": {
"type": "string",
"description": (
"For fill/type: the text to enter. "
"For press: key name (e.g. 'Enter', 'Tab', 'Control+a'). "
"For select: the option value to select."
),
},
"direction": {
"type": "string",
"enum": ["up", "down", "left", "right"],
"default": "down",
"description": "For scroll: direction to scroll.",
},
},
"required": ["action"],
}
@property
def requires_auth(self) -> bool:
return True
@property
def is_available(self) -> bool:
return shutil.which("agent-browser") is not None
async def _execute(
self,
user_id: str | None,
session: ChatSession,
**kwargs: Any,
) -> ToolResponseBase:
"""Perform a browser action and return an updated page snapshot.
Validates the *action*/*target*/*value* combination, delegates to
``agent-browser``, waits for the page to settle, and returns the
accessibility-tree snapshot so the LLM can plan the next step.
"""
action: str = (kwargs.get("action") or "").strip()
target: str = (kwargs.get("target") or "").strip()
value: str = (kwargs.get("value") or "").strip()
direction: str = (kwargs.get("direction") or "down").strip()
session_name = session.session_id
if not action:
return ErrorResponse(
message="Please specify an action.",
error="missing_action",
session_id=session_name,
)
# Build the agent-browser command args
if action in _NO_TARGET_ACTIONS:
cmd_args = [action]
elif action in _SCROLL_ACTIONS:
cmd_args = ["scroll", direction]
elif action == "press":
if not value:
return ErrorResponse(
message="'press' requires a 'value' (key name, e.g. 'Enter').",
error="missing_value",
session_id=session_name,
)
cmd_args = ["press", value]
elif action in _TARGET_ONLY_ACTIONS:
if not target:
return ErrorResponse(
message=f"'{action}' requires a 'target' element.",
error="missing_target",
session_id=session_name,
)
cmd_args = [action, target]
elif action in _TARGET_VALUE_ACTIONS:
if not target or not value:
return ErrorResponse(
message=f"'{action}' requires both 'target' and 'value'.",
error="missing_params",
session_id=session_name,
)
cmd_args = [action, target, value]
elif action in _WAIT_ACTIONS:
if not target:
return ErrorResponse(
message=(
"'wait' requires a 'target': a CSS selector to wait for, "
"or milliseconds as a string (e.g. '1000')."
),
error="missing_target",
session_id=session_name,
)
cmd_args = ["wait", target]
else:
return ErrorResponse(
message=f"Unsupported action: {action}",
error="invalid_action",
session_id=session_name,
)
# Restore browser state from cloud if this is a different pod
if user_id:
await _ensure_session(session_name, user_id, session)
rc, _, stderr = await _run(session_name, *cmd_args)
if rc != 0:
logger.warning("[browser_act] %s failed: %s", action, stderr[:300])
return ErrorResponse(
message=f"Action '{action}' failed.",
error="action_failed",
session_id=session_name,
)
# Allow the page to settle after interaction (best-effort: SPAs may not idle)
settle_rc, _, settle_err = await _run(
session_name, "wait", "--load", "networkidle"
)
if settle_rc != 0:
logger.warning(
"[browser_act] post-action wait failed: %s", settle_err[:300]
)
snapshot = await _snapshot(session_name)
_, url_out, _ = await _run(session_name, "get", "url")
result = BrowserActResponse(
message=f"Performed '{action}'" + (f" on '{target}'" if target else ""),
action=action,
current_url=url_out.strip(),
snapshot=snapshot,
session_id=session_name,
)
# Persist browser state to cloud for cross-pod continuity
if user_id:
_fire_and_forget_save(session_name, user_id, session)
return result
# ---------------------------------------------------------------------------
# Tool: browser_screenshot
# ---------------------------------------------------------------------------
class BrowserScreenshotTool(BaseTool):
"""Capture a screenshot of the current browser page and save it to the workspace."""
@property
def name(self) -> str:
return "browser_screenshot"
@property
def description(self) -> str:
return (
"Take a screenshot of the current browser page and save it to the workspace. "
"IMPORTANT: After calling this tool, immediately call read_workspace_file "
"with the returned file_id to display the image inline to the user — "
"the screenshot is not visible until you do this. "
"With annotate=true (default), @ref labels are overlaid on interactive "
"elements, making it easy to see which @ref ID maps to which element on screen."
)
@property
def parameters(self) -> dict[str, Any]:
return {
"type": "object",
"properties": {
"annotate": {
"type": "boolean",
"default": True,
"description": "Overlay @ref labels on interactive elements (default: true).",
},
"filename": {
"type": "string",
"default": "screenshot.png",
"description": "Filename to save in the workspace.",
},
},
}
@property
def requires_auth(self) -> bool:
return True
@property
def is_available(self) -> bool:
return shutil.which("agent-browser") is not None
async def _execute(
self,
user_id: str | None,
session: ChatSession,
**kwargs: Any,
) -> ToolResponseBase:
"""Capture a PNG screenshot and upload it to the workspace.
Handles string-to-bool coercion for *annotate* (OpenAI function-call
payloads sometimes deliver ``"true"``/``"false"`` as strings).
Returns a :class:`BrowserScreenshotResponse` with the workspace
``file_id`` the LLM should pass to ``read_workspace_file``.
"""
raw_annotate = kwargs.get("annotate", True)
if isinstance(raw_annotate, str):
annotate = raw_annotate.strip().lower() in {"1", "true", "yes", "on"}
else:
annotate = bool(raw_annotate)
filename: str = (kwargs.get("filename") or "screenshot.png").strip()
session_name = session.session_id
# Restore browser state from cloud if this is a different pod
if user_id:
await _ensure_session(session_name, user_id, session)
tmp_fd, tmp_path = tempfile.mkstemp(suffix=".png")
os.close(tmp_fd)
try:
cmd_args = ["screenshot"]
if annotate:
cmd_args.append("--annotate")
cmd_args.append(tmp_path)
rc, _, stderr = await _run(session_name, *cmd_args)
if rc != 0:
logger.warning("[browser_screenshot] failed: %s", stderr[:300])
return ErrorResponse(
message="Failed to take screenshot.",
error="screenshot_failed",
session_id=session_name,
)
with open(tmp_path, "rb") as f:
png_bytes = f.read()
finally:
try:
os.unlink(tmp_path)
except OSError:
pass # Best-effort temp file cleanup; not critical if it fails.
# Upload to workspace so the user can view it
png_b64 = base64.b64encode(png_bytes).decode()
# Import here to avoid circular deps — workspace_files imports from .models
from .workspace_files import WorkspaceWriteResponse, WriteWorkspaceFileTool
write_resp = await WriteWorkspaceFileTool()._execute(
user_id=user_id,
session=session,
filename=filename,
content_base64=png_b64,
)
if not isinstance(write_resp, WorkspaceWriteResponse):
return ErrorResponse(
message="Screenshot taken but failed to save to workspace.",
error="workspace_write_failed",
session_id=session_name,
)
result = BrowserScreenshotResponse(
message=f"Screenshot saved to workspace as '{filename}'. Use read_workspace_file with file_id='{write_resp.file_id}' to retrieve it.",
file_id=write_resp.file_id,
filename=filename,
session_id=session_name,
)
# Persist browser state to cloud for cross-pod continuity
if user_id:
_fire_and_forget_save(session_name, user_id, session)
return result

File diff suppressed because it is too large Load Diff

View File

@@ -36,6 +36,16 @@ class BaseTool:
"""Whether this tool requires authentication."""
return False
@property
def is_available(self) -> bool:
"""Whether this tool is available in the current environment.
Override to check required env vars, binaries, or other dependencies.
Unavailable tools are excluded from the LLM tool list so the model is
never offered an option that will immediately fail.
"""
return True
def as_openai_tool(self) -> ChatCompletionToolParam:
"""Convert to OpenAI tool format."""
return ChatCompletionToolParam(

View File

@@ -41,6 +41,10 @@ class ResponseType(str, Enum):
INPUT_VALIDATION_ERROR = "input_validation_error"
# Web fetch
WEB_FETCH = "web_fetch"
# Agent-browser multi-step automation (navigate, act, screenshot)
BROWSER_NAVIGATE = "browser_navigate"
BROWSER_ACT = "browser_act"
BROWSER_SCREENSHOT = "browser_screenshot"
# Code execution
BASH_EXEC = "bash_exec"
# Feature request types
@@ -476,3 +480,32 @@ class FeatureRequestCreatedResponse(ToolResponseBase):
issue_url: str
is_new_issue: bool # False if added to existing
customer_name: str
# Agent-browser multi-step automation models
class BrowserNavigateResponse(ToolResponseBase):
"""Response for browser_navigate tool."""
type: ResponseType = ResponseType.BROWSER_NAVIGATE
url: str
title: str
snapshot: str # Interactive accessibility tree with @ref IDs
class BrowserActResponse(ToolResponseBase):
"""Response for browser_act tool."""
type: ResponseType = ResponseType.BROWSER_ACT
action: str
current_url: str = ""
snapshot: str # Updated accessibility tree after the action
class BrowserScreenshotResponse(ToolResponseBase):
"""Response for browser_screenshot tool."""
type: ResponseType = ResponseType.BROWSER_SCREENSHOT
file_id: str # Workspace file ID — use read_workspace_file to retrieve
filename: str

View File

@@ -218,7 +218,7 @@ def _is_text_mime(mime_type: str) -> bool:
return any(mime_type.startswith(t) for t in _TEXT_MIME_PREFIXES)
async def _get_manager(user_id: str, session_id: str) -> WorkspaceManager:
async def get_manager(user_id: str, session_id: str) -> WorkspaceManager:
"""Create a session-scoped WorkspaceManager."""
workspace = await workspace_db().get_or_create_workspace(user_id)
return WorkspaceManager(user_id, workspace.id, session_id)
@@ -386,7 +386,7 @@ class ListWorkspaceFilesTool(BaseTool):
include_all_sessions: bool = kwargs.get("include_all_sessions", False)
try:
manager = await _get_manager(user_id, session_id)
manager = await get_manager(user_id, session_id)
files = await manager.list_files(
path=path_prefix, limit=limit, include_all_sessions=include_all_sessions
)
@@ -517,7 +517,7 @@ class ReadWorkspaceFileTool(BaseTool):
)
try:
manager = await _get_manager(user_id, session_id)
manager = await get_manager(user_id, session_id)
resolved = await _resolve_file(manager, file_id, path, session_id)
if isinstance(resolved, ErrorResponse):
return resolved
@@ -725,7 +725,7 @@ class WriteWorkspaceFileTool(BaseTool):
try:
await scan_content_safe(content, filename=filename)
manager = await _get_manager(user_id, session_id)
manager = await get_manager(user_id, session_id)
rec = await manager.write_file(
content=content,
filename=filename,
@@ -852,7 +852,7 @@ class DeleteWorkspaceFileTool(BaseTool):
)
try:
manager = await _get_manager(user_id, session_id)
manager = await get_manager(user_id, session_id)
resolved = await _resolve_file(manager, file_id, path, session_id)
if isinstance(resolved, ErrorResponse):
return resolved

View File

@@ -12,6 +12,7 @@ import {
GlobeIcon,
ListChecksIcon,
MagnifyingGlassIcon,
MonitorIcon,
PencilSimpleIcon,
TerminalIcon,
TrashIcon,
@@ -48,6 +49,7 @@ function formatToolName(name: string): string {
type ToolCategory =
| "bash"
| "web"
| "browser"
| "file-read"
| "file-write"
| "file-delete"
@@ -65,6 +67,10 @@ function getToolCategory(toolName: string): ToolCategory {
case "WebSearch":
case "WebFetch":
return "web";
case "browser_navigate":
case "browser_act":
case "browser_screenshot":
return "browser";
case "read_workspace_file":
case "read_file":
case "Read":
@@ -120,6 +126,8 @@ function ToolIcon({
return <TerminalIcon size={14} weight="regular" className={iconClass} />;
case "web":
return <GlobeIcon size={14} weight="regular" className={iconClass} />;
case "browser":
return <MonitorIcon size={14} weight="regular" className={iconClass} />;
case "file-read":
return <FileIcon size={14} weight="regular" className={iconClass} />;
case "file-write":
@@ -155,6 +163,8 @@ function AccordionIcon({ category }: { category: ToolCategory }) {
return <TerminalIcon size={32} weight="light" />;
case "web":
return <GlobeIcon size={32} weight="light" />;
case "browser":
return <MonitorIcon size={32} weight="light" />;
case "file-read":
case "file-write":
return <FileIcon size={32} weight="light" />;
@@ -189,6 +199,16 @@ function getInputSummary(toolName: string, input: unknown): string | null {
return typeof inp.url === "string" ? inp.url : null;
case "WebSearch":
return typeof inp.query === "string" ? inp.query : null;
case "browser_navigate":
return typeof inp.url === "string" ? inp.url : null;
case "browser_act":
return typeof inp.action === "string"
? inp.target
? `${inp.action} ${inp.target}`
: (inp.action as string)
: null;
case "browser_screenshot":
return null;
case "read_workspace_file":
case "read_file":
case "Read":
@@ -259,6 +279,11 @@ function getAnimationText(part: ToolUIPart, category: ToolCategory): string {
return shortSummary
? `Fetching ${shortSummary}`
: "Fetching web content…";
case "browser":
if (toolName === "browser_screenshot") return "Taking screenshot…";
return shortSummary
? `Browsing ${shortSummary}`
: "Interacting with browser…";
case "file-read":
return shortSummary ? `Reading ${shortSummary}` : "Reading file…";
case "file-write":
@@ -297,6 +322,11 @@ function getAnimationText(part: ToolUIPart, category: ToolCategory): string {
return shortSummary
? `Fetched ${shortSummary}`
: "Fetched web content";
case "browser":
if (toolName === "browser_screenshot") return "Screenshot captured";
return shortSummary
? `Browsed ${shortSummary}`
: "Browser action completed";
case "file-read":
return shortSummary ? `Read ${shortSummary}` : "File read completed";
case "file-write":
@@ -323,6 +353,8 @@ function getAnimationText(part: ToolUIPart, category: ToolCategory): string {
return "Command failed";
case "web":
return toolName === "WebSearch" ? "Search failed" : "Fetch failed";
case "browser":
return "Browser action failed";
default:
return `${formatToolName(toolName)} failed`;
}
@@ -500,6 +532,44 @@ function getWebAccordionData(
};
}
function getBrowserAccordionData(
toolName: string,
input: unknown,
output: Record<string, unknown>,
): AccordionData {
const message = getStringField(output, "message");
const snapshot = getStringField(output, "snapshot");
// Screenshot tool: show the file_id so the user knows it was saved
if (toolName === "browser_screenshot") {
const fileId = getStringField(output, "file_id");
const filename = getStringField(output, "filename");
return {
title: filename ? `Screenshot: ${filename}` : "Screenshot captured",
description: fileId ? `file_id: ${fileId}` : undefined,
content: message ? <ContentMessage>{message}</ContentMessage> : null,
};
}
// Navigate / act tools: show snapshot if available
const title =
toolName === "browser_navigate"
? (getStringField(output, "title") ?? "Page loaded")
: (message ?? "Action completed");
const url = getStringField(output, "url", "current_url");
return {
title,
description: url ? truncate(url, 80) : undefined,
content: snapshot ? (
<ContentCodeBlock>{truncate(snapshot, 3000)}</ContentCodeBlock>
) : message ? (
<ContentMessage>{message}</ContentMessage>
) : null,
};
}
function getFileAccordionData(
category: ToolCategory,
input: unknown,
@@ -725,6 +795,7 @@ function getDefaultAccordionData(
function getAccordionData(
category: ToolCategory,
toolName: string,
input: unknown,
output: Record<string, unknown>,
): AccordionData {
@@ -733,6 +804,8 @@ function getAccordionData(
return getBashAccordionData(input, output);
case "web":
return getWebAccordionData(input, output);
case "browser":
return getBrowserAccordionData(toolName, input, output);
case "file-read":
case "file-write":
case "file-delete":
@@ -775,7 +848,7 @@ export function GenericTool({ part }: Props) {
const showAccordion = hasOutput || hasError || hasTodoInput;
const accordionData = showAccordion
? getAccordionData(category, part.input, output ?? {})
? getAccordionData(category, toolName, part.input, output ?? {})
: null;
return (

View File

@@ -11254,6 +11254,9 @@
"operation_in_progress",
"input_validation_error",
"web_fetch",
"browser_navigate",
"browser_act",
"browser_screenshot",
"bash_exec",
"feature_request_search",
"feature_request_created",