mirror of
https://github.com/Significant-Gravitas/AutoGPT.git
synced 2026-04-08 03:00:28 -04:00
feat(copilot): Add agent-browser multi-step browser automation tools (#12230)
## Summary Adds three new Copilot tools for multi-step browser automation using the [agent-browser](https://github.com/vercel-labs/agent-browser) CLI (Playwright-based local daemon): - **`browser_navigate`** — navigate to a URL and get an accessibility-tree snapshot with `@ref` IDs - **`browser_act`** — interact with page elements (click, fill, scroll, check, press, select, `dblclick`, `type`, `wait`, back, forward, reload); returns updated snapshot - **`browser_screenshot`** — capture annotated screenshot (with `@ref` overlays) and save to user workspace Also adds **`browse_web`** (Stagehand + Browserbase) for one-shot JS-rendered page extraction. ### Why two browser tools? | Tool | When to use | |------|-------------| | `browse_web` | Single-shot extraction — cloud Browserbase session, no local daemon needed | | `browser_navigate` / `browser_act` | Multi-step flows (login → navigate → scrape), persistent session within a Copilot session | ### Design decisions - **SSRF protection**: Uses the same `validate_url()` from `backend.util.request` as HTTP blocks — async DNS, all IPs checked, full RFC 1918 + link-local + IPv6 coverage - **Session isolation**: `_run()` passes both `--session <id>` (isolated Chromium context per Copilot session) **and** `--session-name <id>` (persist cookies within a session), preventing cross-session state leakage while supporting login flows - **Snapshot truncation**: Interactive-only accessibility tree (`snapshot -i`) capped at 20 000 chars with a continuation hint - **Screenshot storage**: PNG bytes uploaded to user workspace via `WriteWorkspaceFileTool`; returns `file_id` for retrieval ### Bugs fixed in this PR - Session isolation bug: `--session-name` alone shared browser history across different Copilot sessions; added `--session` to isolate contexts - Missing actions: added `dblclick`, `type` (append without clearing), `wait` (CSS selector or ms delay) ## Test plan - [x] 53 unit tests covering all three tools, all actions, SSRF integration, auth check, session isolation, snapshot truncation, timeout, missing binary - [x] Integration test: real `agent-browser` CLI + Anthropic API tool-calling loop (3/3 scenarios passed) - [x] Linting (Ruff, isort, Black, Pyright) all passing ``` backend/copilot/tools/agent_browser_test.py 53 passed in 17.79s ```
This commit is contained in:
@@ -672,6 +672,16 @@ async def delete_chat_session(session_id: str, user_id: str | None = None) -> bo
|
||||
async with _session_locks_mutex:
|
||||
_session_locks.pop(session_id, None)
|
||||
|
||||
# Shut down any local browser daemon for this session (best-effort).
|
||||
# Inline import required: all tool modules import ChatSession from this
|
||||
# module, so any top-level import from tools.* would create a cycle.
|
||||
try:
|
||||
from .tools.agent_browser import close_browser_session
|
||||
|
||||
await close_browser_session(session_id, user_id=user_id)
|
||||
except Exception as e:
|
||||
logger.debug(f"Browser cleanup for session {session_id}: {e}")
|
||||
|
||||
return True
|
||||
|
||||
|
||||
|
||||
@@ -59,7 +59,7 @@ from .response_model import (
|
||||
StreamToolOutputAvailable,
|
||||
StreamUsage,
|
||||
)
|
||||
from .tools import execute_tool, tools
|
||||
from .tools import execute_tool, get_available_tools
|
||||
from .tools.models import ErrorResponse
|
||||
from .tracking import track_user_message
|
||||
|
||||
@@ -514,7 +514,7 @@ async def stream_chat_completion(
|
||||
)
|
||||
async for chunk in _stream_chat_chunks(
|
||||
session=session,
|
||||
tools=tools,
|
||||
tools=get_available_tools(),
|
||||
system_prompt=system_prompt,
|
||||
text_block_id=text_block_id,
|
||||
):
|
||||
|
||||
@@ -1,12 +1,14 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
from openai.types.chat import ChatCompletionToolParam
|
||||
|
||||
from backend.copilot.model import ChatSession
|
||||
from backend.copilot.tracking import track_tool_called
|
||||
|
||||
from .add_understanding import AddUnderstandingTool
|
||||
from .agent_browser import BrowserActTool, BrowserNavigateTool, BrowserScreenshotTool
|
||||
from .agent_output import AgentOutputTool
|
||||
from .base import BaseTool
|
||||
from .bash_exec import BashExecTool
|
||||
@@ -30,6 +32,7 @@ from .workspace_files import (
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from backend.copilot.model import ChatSession
|
||||
from backend.copilot.response_model import StreamToolOutputAvailable
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -50,6 +53,10 @@ TOOL_REGISTRY: dict[str, BaseTool] = {
|
||||
"get_doc_page": GetDocPageTool(),
|
||||
# Web fetch for safe URL retrieval
|
||||
"web_fetch": WebFetchTool(),
|
||||
# Agent-browser multi-step automation (navigate, act, screenshot)
|
||||
"browser_navigate": BrowserNavigateTool(),
|
||||
"browser_act": BrowserActTool(),
|
||||
"browser_screenshot": BrowserScreenshotTool(),
|
||||
# Sandboxed code execution (bubblewrap)
|
||||
"bash_exec": BashExecTool(),
|
||||
# Persistent workspace tools (cloud storage, survives across sessions)
|
||||
@@ -67,10 +74,17 @@ TOOL_REGISTRY: dict[str, BaseTool] = {
|
||||
find_agent_tool = TOOL_REGISTRY["find_agent"]
|
||||
run_agent_tool = TOOL_REGISTRY["run_agent"]
|
||||
|
||||
# Generated from registry for OpenAI API
|
||||
tools: list[ChatCompletionToolParam] = [
|
||||
tool.as_openai_tool() for tool in TOOL_REGISTRY.values()
|
||||
]
|
||||
|
||||
def get_available_tools() -> list[ChatCompletionToolParam]:
|
||||
"""Return OpenAI tool schemas for tools available in the current environment.
|
||||
|
||||
Called per-request so that env-var or binary availability is evaluated
|
||||
fresh each time (e.g. browser_* tools are excluded when agent-browser
|
||||
CLI is not installed).
|
||||
"""
|
||||
return [
|
||||
tool.as_openai_tool() for tool in TOOL_REGISTRY.values() if tool.is_available
|
||||
]
|
||||
|
||||
|
||||
def get_tool(tool_name: str) -> BaseTool | None:
|
||||
|
||||
876
autogpt_platform/backend/backend/copilot/tools/agent_browser.py
Normal file
876
autogpt_platform/backend/backend/copilot/tools/agent_browser.py
Normal file
@@ -0,0 +1,876 @@
|
||||
"""Agent-browser tools — multi-step browser automation for the Copilot.
|
||||
|
||||
Uses the agent-browser CLI (https://github.com/vercel-labs/agent-browser)
|
||||
which runs a local Chromium instance managed by a persistent daemon.
|
||||
|
||||
- Runs locally — no cloud account required
|
||||
- Full interaction support: click, fill, scroll, login flows, multi-step
|
||||
- Session persistence via --session-name: cookies/auth carry across tool calls
|
||||
within the same Copilot session, enabling login → navigate → extract workflows
|
||||
- Screenshot with --annotate overlays @ref labels, saved to workspace for user
|
||||
- The Claude Agent SDK's multi-turn loop handles orchestration — each tool call
|
||||
is one browser action; the LLM chains them naturally
|
||||
|
||||
SSRF protection:
|
||||
Uses the shared validate_url() from backend.util.request, which is the same
|
||||
guard used by HTTP blocks and web_fetch. It resolves ALL DNS answers (not just
|
||||
the first), blocks RFC 1918, loopback, link-local, 0.0.0.0/8, multicast,
|
||||
and all relevant IPv6 ranges, and applies IDNA encoding to prevent Unicode
|
||||
domain attacks.
|
||||
|
||||
Requires:
|
||||
npm install -g agent-browser
|
||||
agent-browser install (downloads Chromium, one-time per machine)
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import base64
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import shutil
|
||||
import tempfile
|
||||
from typing import Any
|
||||
|
||||
from backend.copilot.model import ChatSession
|
||||
from backend.util.request import validate_url
|
||||
|
||||
from .base import BaseTool
|
||||
from .models import (
|
||||
BrowserActResponse,
|
||||
BrowserNavigateResponse,
|
||||
BrowserScreenshotResponse,
|
||||
ErrorResponse,
|
||||
ToolResponseBase,
|
||||
)
|
||||
from .workspace_files import get_manager
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Per-command timeout (seconds). Navigation + networkidle wait can be slow.
|
||||
_CMD_TIMEOUT = 45
|
||||
# Accessibility tree can be very large; cap it to keep LLM context manageable.
|
||||
_MAX_SNAPSHOT_CHARS = 20_000
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Subprocess helper
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
async def _run(
|
||||
session_name: str,
|
||||
*args: str,
|
||||
timeout: int = _CMD_TIMEOUT,
|
||||
) -> tuple[int, str, str]:
|
||||
"""Run agent-browser for the given session and return (rc, stdout, stderr).
|
||||
|
||||
Uses both:
|
||||
--session <name> → isolated Chromium context (no shared history/cookies
|
||||
with other Copilot sessions — prevents cross-session
|
||||
browser state leakage)
|
||||
--session-name <name> → persist cookies/localStorage across tool calls within
|
||||
the same session (enables login → navigate flows)
|
||||
"""
|
||||
cmd = [
|
||||
"agent-browser",
|
||||
"--session",
|
||||
session_name,
|
||||
"--session-name",
|
||||
session_name,
|
||||
*args,
|
||||
]
|
||||
proc = None
|
||||
try:
|
||||
proc = await asyncio.create_subprocess_exec(
|
||||
*cmd,
|
||||
stdout=asyncio.subprocess.PIPE,
|
||||
stderr=asyncio.subprocess.PIPE,
|
||||
)
|
||||
stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=timeout)
|
||||
return proc.returncode or 0, stdout.decode(), stderr.decode()
|
||||
except asyncio.TimeoutError:
|
||||
# Kill the orphaned subprocess so it does not linger in the process table.
|
||||
if proc is not None and proc.returncode is None:
|
||||
proc.kill()
|
||||
try:
|
||||
await proc.communicate()
|
||||
except Exception:
|
||||
pass # Best-effort reap; ignore errors during cleanup.
|
||||
return 1, "", f"Command timed out after {timeout}s."
|
||||
except FileNotFoundError:
|
||||
return (
|
||||
1,
|
||||
"",
|
||||
"agent-browser is not installed (run: npm install -g agent-browser && agent-browser install).",
|
||||
)
|
||||
|
||||
|
||||
async def _snapshot(session_name: str) -> str:
|
||||
"""Return the current page's interactive accessibility tree, truncated."""
|
||||
rc, stdout, stderr = await _run(session_name, "snapshot", "-i", "-c")
|
||||
if rc != 0:
|
||||
return f"[snapshot failed: {stderr[:300]}]"
|
||||
text = stdout.strip()
|
||||
if len(text) > _MAX_SNAPSHOT_CHARS:
|
||||
suffix = "\n\n[Snapshot truncated — use browser_act to navigate further]"
|
||||
keep = max(0, _MAX_SNAPSHOT_CHARS - len(suffix))
|
||||
text = text[:keep] + suffix
|
||||
return text
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Stateless session helpers — persist / restore browser state across pods
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Module-level cache of sessions known to be alive on this pod.
|
||||
# Avoids the subprocess probe on every tool call within the same pod.
|
||||
_alive_sessions: set[str] = set()
|
||||
|
||||
# Per-session locks to prevent concurrent _ensure_session calls from
|
||||
# triggering duplicate _restore_browser_state for the same session.
|
||||
# Protected by _session_locks_mutex to ensure setdefault/pop are not
|
||||
# interleaved across await boundaries.
|
||||
_session_locks: dict[str, asyncio.Lock] = {}
|
||||
_session_locks_mutex = asyncio.Lock()
|
||||
|
||||
# Workspace filename for persisted browser state (auto-scoped to session).
|
||||
# Dot-prefixed so it is hidden from user workspace listings.
|
||||
_STATE_FILENAME = "._browser_state.json"
|
||||
|
||||
# Maximum concurrent subprocesses during cookie/storage restore.
|
||||
_RESTORE_CONCURRENCY = 10
|
||||
|
||||
# Maximum cookies to restore per session. Pathological sites can accumulate
|
||||
# thousands of cookies; restoring them all would be slow and is rarely useful.
|
||||
_MAX_RESTORE_COOKIES = 100
|
||||
|
||||
# Background tasks for fire-and-forget state persistence.
|
||||
# Prevents GC from collecting tasks before they complete.
|
||||
_background_tasks: set[asyncio.Task] = set()
|
||||
|
||||
|
||||
def _fire_and_forget_save(
|
||||
session_name: str, user_id: str, session: ChatSession
|
||||
) -> None:
|
||||
"""Schedule state persistence as a background task (non-blocking).
|
||||
|
||||
State save is already best-effort (errors are swallowed), so running it
|
||||
in the background avoids adding latency to tool responses.
|
||||
"""
|
||||
task = asyncio.create_task(_save_browser_state(session_name, user_id, session))
|
||||
_background_tasks.add(task)
|
||||
task.add_done_callback(_background_tasks.discard)
|
||||
|
||||
|
||||
async def _has_local_session(session_name: str) -> bool:
|
||||
"""Check if the local agent-browser daemon for this session is running."""
|
||||
rc, _, _ = await _run(session_name, "get", "url", timeout=5)
|
||||
return rc == 0
|
||||
|
||||
|
||||
async def _save_browser_state(
|
||||
session_name: str, user_id: str, session: ChatSession
|
||||
) -> None:
|
||||
"""Persist browser state (cookies, localStorage, URL) to workspace.
|
||||
|
||||
Best-effort: errors are logged but never propagate to the tool response.
|
||||
"""
|
||||
try:
|
||||
# Gather state in parallel
|
||||
(rc_url, url_out, _), (rc_ck, ck_out, _), (rc_ls, ls_out, _) = (
|
||||
await asyncio.gather(
|
||||
_run(session_name, "get", "url", timeout=10),
|
||||
_run(session_name, "cookies", "get", "--json", timeout=10),
|
||||
_run(session_name, "storage", "local", "--json", timeout=10),
|
||||
)
|
||||
)
|
||||
|
||||
state = {
|
||||
"url": url_out.strip() if rc_url == 0 else "",
|
||||
"cookies": (json.loads(ck_out) if rc_ck == 0 and ck_out.strip() else []),
|
||||
"local_storage": (
|
||||
json.loads(ls_out) if rc_ls == 0 and ls_out.strip() else {}
|
||||
),
|
||||
}
|
||||
|
||||
manager = await get_manager(user_id, session.session_id)
|
||||
await manager.write_file(
|
||||
content=json.dumps(state).encode("utf-8"),
|
||||
filename=_STATE_FILENAME,
|
||||
mime_type="application/json",
|
||||
overwrite=True,
|
||||
)
|
||||
except Exception:
|
||||
logger.warning(
|
||||
"[browser] Failed to save browser state for session %s",
|
||||
session_name,
|
||||
exc_info=True,
|
||||
)
|
||||
|
||||
|
||||
async def _restore_browser_state(
|
||||
session_name: str, user_id: str, session: ChatSession
|
||||
) -> bool:
|
||||
"""Restore browser state from workspace storage into a fresh daemon.
|
||||
|
||||
Best-effort: errors are logged but never propagate to the tool response.
|
||||
Returns True on success (or no state to restore), False on failure.
|
||||
"""
|
||||
try:
|
||||
manager = await get_manager(user_id, session.session_id)
|
||||
|
||||
file_info = await manager.get_file_info_by_path(_STATE_FILENAME)
|
||||
if file_info is None:
|
||||
return True # No saved state — first call or never saved
|
||||
|
||||
state_bytes = await manager.read_file(_STATE_FILENAME)
|
||||
state = json.loads(state_bytes.decode("utf-8"))
|
||||
|
||||
url = state.get("url", "")
|
||||
cookies = state.get("cookies", [])
|
||||
local_storage = state.get("local_storage", {})
|
||||
|
||||
# Navigate first — starts daemon + sets the correct origin for cookies
|
||||
if url:
|
||||
# Validate the saved URL to prevent SSRF via stored redirect targets.
|
||||
try:
|
||||
await validate_url(url, trusted_origins=[])
|
||||
except ValueError:
|
||||
logger.warning(
|
||||
"[browser] State restore: blocked SSRF URL %s", url[:200]
|
||||
)
|
||||
return False
|
||||
|
||||
rc, _, stderr = await _run(session_name, "open", url)
|
||||
if rc != 0:
|
||||
logger.warning(
|
||||
"[browser] State restore: failed to open %s: %s",
|
||||
url,
|
||||
stderr[:200],
|
||||
)
|
||||
return False
|
||||
await _run(session_name, "wait", "--load", "load", timeout=15)
|
||||
|
||||
# Restore cookies and localStorage in parallel via asyncio.gather.
|
||||
# Semaphore caps concurrent subprocess spawns so we don't overwhelm the
|
||||
# system when a session has hundreds of cookies.
|
||||
sem = asyncio.Semaphore(_RESTORE_CONCURRENCY)
|
||||
|
||||
# Guard against pathological sites with thousands of cookies.
|
||||
if len(cookies) > _MAX_RESTORE_COOKIES:
|
||||
logger.debug(
|
||||
"[browser] State restore: capping cookies from %d to %d",
|
||||
len(cookies),
|
||||
_MAX_RESTORE_COOKIES,
|
||||
)
|
||||
cookies = cookies[:_MAX_RESTORE_COOKIES]
|
||||
|
||||
async def _set_cookie(c: dict[str, Any]) -> None:
|
||||
name = c.get("name", "")
|
||||
value = c.get("value", "")
|
||||
domain = c.get("domain", "")
|
||||
path = c.get("path", "/")
|
||||
if not (name and domain):
|
||||
return
|
||||
async with sem:
|
||||
rc, _, stderr = await _run(
|
||||
session_name,
|
||||
"cookies",
|
||||
"set",
|
||||
name,
|
||||
value,
|
||||
"--domain",
|
||||
domain,
|
||||
"--path",
|
||||
path,
|
||||
timeout=5,
|
||||
)
|
||||
if rc != 0:
|
||||
logger.debug(
|
||||
"[browser] State restore: cookie set failed for %s: %s",
|
||||
name,
|
||||
stderr[:100],
|
||||
)
|
||||
|
||||
async def _set_storage(key: str, val: object) -> None:
|
||||
async with sem:
|
||||
rc, _, stderr = await _run(
|
||||
session_name,
|
||||
"storage",
|
||||
"local",
|
||||
"set",
|
||||
key,
|
||||
str(val),
|
||||
timeout=5,
|
||||
)
|
||||
if rc != 0:
|
||||
logger.debug(
|
||||
"[browser] State restore: localStorage set failed for %s: %s",
|
||||
key,
|
||||
stderr[:100],
|
||||
)
|
||||
|
||||
await asyncio.gather(
|
||||
*[_set_cookie(c) for c in cookies],
|
||||
*[_set_storage(k, v) for k, v in local_storage.items()],
|
||||
)
|
||||
|
||||
return True
|
||||
except Exception:
|
||||
logger.warning(
|
||||
"[browser] Failed to restore browser state for session %s",
|
||||
session_name,
|
||||
exc_info=True,
|
||||
)
|
||||
return False
|
||||
|
||||
|
||||
async def _ensure_session(
|
||||
session_name: str, user_id: str, session: ChatSession
|
||||
) -> None:
|
||||
"""Ensure the local browser daemon has state. Restore from cloud if needed."""
|
||||
if session_name in _alive_sessions:
|
||||
return
|
||||
async with _session_locks_mutex:
|
||||
lock = _session_locks.setdefault(session_name, asyncio.Lock())
|
||||
async with lock:
|
||||
# Double-check after acquiring lock — another coroutine may have restored.
|
||||
if session_name in _alive_sessions:
|
||||
return
|
||||
if await _has_local_session(session_name):
|
||||
_alive_sessions.add(session_name)
|
||||
return
|
||||
if await _restore_browser_state(session_name, user_id, session):
|
||||
_alive_sessions.add(session_name)
|
||||
|
||||
|
||||
async def close_browser_session(session_name: str, user_id: str | None = None) -> None:
|
||||
"""Shut down the local agent-browser daemon and clean up stored state.
|
||||
|
||||
Deletes ``._browser_state.json`` from workspace storage so cookies and
|
||||
other credentials do not linger after the session is deleted.
|
||||
|
||||
Best-effort: errors are logged but never raised.
|
||||
"""
|
||||
_alive_sessions.discard(session_name)
|
||||
async with _session_locks_mutex:
|
||||
_session_locks.pop(session_name, None)
|
||||
|
||||
# Delete persisted browser state (cookies, localStorage) from workspace.
|
||||
if user_id:
|
||||
try:
|
||||
manager = await get_manager(user_id, session_name)
|
||||
file_info = await manager.get_file_info_by_path(_STATE_FILENAME)
|
||||
if file_info is not None:
|
||||
await manager.delete_file(file_info.id)
|
||||
except Exception:
|
||||
logger.debug(
|
||||
"[browser] Failed to delete state file for session %s",
|
||||
session_name,
|
||||
exc_info=True,
|
||||
)
|
||||
|
||||
try:
|
||||
rc, _, stderr = await _run(session_name, "close", timeout=10)
|
||||
if rc != 0:
|
||||
logger.debug(
|
||||
"[browser] close failed for session %s: %s",
|
||||
session_name,
|
||||
stderr[:200],
|
||||
)
|
||||
except Exception:
|
||||
logger.debug(
|
||||
"[browser] Exception closing browser session %s",
|
||||
session_name,
|
||||
exc_info=True,
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Tool: browser_navigate
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class BrowserNavigateTool(BaseTool):
|
||||
"""Navigate to a URL and return the page's interactive elements.
|
||||
|
||||
The browser session persists across tool calls within this Copilot session
|
||||
(keyed to session_id), so cookies and auth state carry over. This enables
|
||||
full login flows: navigate to login page → browser_act to fill credentials
|
||||
→ browser_act to submit → browser_navigate to the target page.
|
||||
"""
|
||||
|
||||
@property
|
||||
def name(self) -> str:
|
||||
return "browser_navigate"
|
||||
|
||||
@property
|
||||
def description(self) -> str:
|
||||
return (
|
||||
"Navigate to a URL using a real browser. Returns an accessibility "
|
||||
"tree snapshot listing the page's interactive elements with @ref IDs "
|
||||
"(e.g. @e3) that can be used with browser_act. "
|
||||
"Session persists — cookies and login state carry over between calls. "
|
||||
"Use this (with browser_act) for multi-step interaction: login flows, "
|
||||
"form filling, button clicks, or anything requiring page interaction. "
|
||||
"For plain static pages, prefer web_fetch — no browser overhead. "
|
||||
"For authenticated pages: navigate to the login page first, use browser_act "
|
||||
"to fill credentials and submit, then navigate to the target page. "
|
||||
"Note: for slow SPAs, the returned snapshot may reflect a partially-loaded "
|
||||
"state. If elements seem missing, use browser_act with action='wait' and a "
|
||||
"CSS selector or millisecond delay, then take a browser_screenshot to verify."
|
||||
)
|
||||
|
||||
@property
|
||||
def parameters(self) -> dict[str, Any]:
|
||||
return {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"url": {
|
||||
"type": "string",
|
||||
"description": "The HTTP/HTTPS URL to navigate to.",
|
||||
},
|
||||
"wait_for": {
|
||||
"type": "string",
|
||||
"enum": ["networkidle", "load", "domcontentloaded"],
|
||||
"default": "networkidle",
|
||||
"description": "When to consider navigation complete. Use 'networkidle' for SPAs (default).",
|
||||
},
|
||||
},
|
||||
"required": ["url"],
|
||||
}
|
||||
|
||||
@property
|
||||
def requires_auth(self) -> bool:
|
||||
return True
|
||||
|
||||
@property
|
||||
def is_available(self) -> bool:
|
||||
return shutil.which("agent-browser") is not None
|
||||
|
||||
async def _execute(
|
||||
self,
|
||||
user_id: str | None,
|
||||
session: ChatSession,
|
||||
**kwargs: Any,
|
||||
) -> ToolResponseBase:
|
||||
"""Navigate to *url*, wait for the page to settle, and return a snapshot.
|
||||
|
||||
The snapshot is an accessibility-tree listing of interactive elements.
|
||||
Note: for slow SPAs that never fully idle, the snapshot may reflect a
|
||||
partially-loaded state (the wait is best-effort).
|
||||
"""
|
||||
url: str = (kwargs.get("url") or "").strip()
|
||||
wait_for: str = kwargs.get("wait_for") or "networkidle"
|
||||
session_name = session.session_id
|
||||
|
||||
if not url:
|
||||
return ErrorResponse(
|
||||
message="Please provide a URL to navigate to.",
|
||||
error="missing_url",
|
||||
session_id=session_name,
|
||||
)
|
||||
|
||||
try:
|
||||
await validate_url(url, trusted_origins=[])
|
||||
except ValueError as e:
|
||||
return ErrorResponse(
|
||||
message=str(e),
|
||||
error="blocked_url",
|
||||
session_id=session_name,
|
||||
)
|
||||
|
||||
# Restore browser state from cloud if this is a different pod
|
||||
if user_id:
|
||||
await _ensure_session(session_name, user_id, session)
|
||||
|
||||
# Navigate
|
||||
rc, _, stderr = await _run(session_name, "open", url)
|
||||
if rc != 0:
|
||||
logger.warning(
|
||||
"[browser_navigate] open failed for %s: %s", url, stderr[:300]
|
||||
)
|
||||
return ErrorResponse(
|
||||
message="Failed to navigate to URL.",
|
||||
error="navigation_failed",
|
||||
session_id=session_name,
|
||||
)
|
||||
|
||||
# Wait for page to settle (best-effort: some SPAs never reach networkidle)
|
||||
wait_rc, _, wait_err = await _run(session_name, "wait", "--load", wait_for)
|
||||
if wait_rc != 0:
|
||||
logger.warning(
|
||||
"[browser_navigate] wait(%s) failed: %s", wait_for, wait_err[:300]
|
||||
)
|
||||
|
||||
# Get current title and URL in parallel
|
||||
(_, title_out, _), (_, url_out, _) = await asyncio.gather(
|
||||
_run(session_name, "get", "title"),
|
||||
_run(session_name, "get", "url"),
|
||||
)
|
||||
|
||||
snapshot = await _snapshot(session_name)
|
||||
|
||||
result = BrowserNavigateResponse(
|
||||
message=f"Navigated to {url}",
|
||||
url=url_out.strip() or url,
|
||||
title=title_out.strip(),
|
||||
snapshot=snapshot,
|
||||
session_id=session_name,
|
||||
)
|
||||
|
||||
# Persist browser state to cloud for cross-pod continuity
|
||||
if user_id:
|
||||
_fire_and_forget_save(session_name, user_id, session)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Tool: browser_act
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_NO_TARGET_ACTIONS = frozenset({"back", "forward", "reload"})
|
||||
_SCROLL_ACTIONS = frozenset({"scroll"})
|
||||
_TARGET_ONLY_ACTIONS = frozenset({"click", "dblclick", "hover", "check", "uncheck"})
|
||||
_TARGET_VALUE_ACTIONS = frozenset({"fill", "type", "select"})
|
||||
# wait <selector|ms>: waits for a DOM element or a fixed delay (e.g. "1000" for 1 s)
|
||||
_WAIT_ACTIONS = frozenset({"wait"})
|
||||
|
||||
|
||||
class BrowserActTool(BaseTool):
|
||||
"""Perform an action on the current browser page and return the updated snapshot.
|
||||
|
||||
Use @ref IDs from the snapshot returned by browser_navigate (e.g. '@e3').
|
||||
The LLM orchestrates multi-step flows by chaining browser_navigate and
|
||||
browser_act calls across turns of the Claude Agent SDK conversation.
|
||||
"""
|
||||
|
||||
@property
|
||||
def name(self) -> str:
|
||||
return "browser_act"
|
||||
|
||||
@property
|
||||
def description(self) -> str:
|
||||
return (
|
||||
"Interact with the current browser page. Use @ref IDs from the "
|
||||
"snapshot (e.g. '@e3') to target elements. Returns an updated snapshot. "
|
||||
"Supported actions: click, dblclick, fill, type, scroll, hover, press, "
|
||||
"check, uncheck, select, wait, back, forward, reload. "
|
||||
"fill clears the field before typing; type appends without clearing. "
|
||||
"wait accepts a CSS selector (waits for element) or milliseconds string (e.g. '1000'). "
|
||||
"Example login flow: fill @e1 with email → fill @e2 with password → "
|
||||
"click @e3 (submit) → browser_navigate to the target page."
|
||||
)
|
||||
|
||||
@property
|
||||
def parameters(self) -> dict[str, Any]:
|
||||
return {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"action": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"click",
|
||||
"dblclick",
|
||||
"fill",
|
||||
"type",
|
||||
"scroll",
|
||||
"hover",
|
||||
"press",
|
||||
"check",
|
||||
"uncheck",
|
||||
"select",
|
||||
"wait",
|
||||
"back",
|
||||
"forward",
|
||||
"reload",
|
||||
],
|
||||
"description": "The action to perform.",
|
||||
},
|
||||
"target": {
|
||||
"type": "string",
|
||||
"description": (
|
||||
"Element to target. Use @ref from snapshot (e.g. '@e3'), "
|
||||
"a CSS selector, or a text description. "
|
||||
"Required for: click, dblclick, fill, type, hover, check, uncheck, select. "
|
||||
"For wait: a CSS selector to wait for, or milliseconds as a string (e.g. '1000')."
|
||||
),
|
||||
},
|
||||
"value": {
|
||||
"type": "string",
|
||||
"description": (
|
||||
"For fill/type: the text to enter. "
|
||||
"For press: key name (e.g. 'Enter', 'Tab', 'Control+a'). "
|
||||
"For select: the option value to select."
|
||||
),
|
||||
},
|
||||
"direction": {
|
||||
"type": "string",
|
||||
"enum": ["up", "down", "left", "right"],
|
||||
"default": "down",
|
||||
"description": "For scroll: direction to scroll.",
|
||||
},
|
||||
},
|
||||
"required": ["action"],
|
||||
}
|
||||
|
||||
@property
|
||||
def requires_auth(self) -> bool:
|
||||
return True
|
||||
|
||||
@property
|
||||
def is_available(self) -> bool:
|
||||
return shutil.which("agent-browser") is not None
|
||||
|
||||
async def _execute(
|
||||
self,
|
||||
user_id: str | None,
|
||||
session: ChatSession,
|
||||
**kwargs: Any,
|
||||
) -> ToolResponseBase:
|
||||
"""Perform a browser action and return an updated page snapshot.
|
||||
|
||||
Validates the *action*/*target*/*value* combination, delegates to
|
||||
``agent-browser``, waits for the page to settle, and returns the
|
||||
accessibility-tree snapshot so the LLM can plan the next step.
|
||||
"""
|
||||
action: str = (kwargs.get("action") or "").strip()
|
||||
target: str = (kwargs.get("target") or "").strip()
|
||||
value: str = (kwargs.get("value") or "").strip()
|
||||
direction: str = (kwargs.get("direction") or "down").strip()
|
||||
session_name = session.session_id
|
||||
|
||||
if not action:
|
||||
return ErrorResponse(
|
||||
message="Please specify an action.",
|
||||
error="missing_action",
|
||||
session_id=session_name,
|
||||
)
|
||||
|
||||
# Build the agent-browser command args
|
||||
if action in _NO_TARGET_ACTIONS:
|
||||
cmd_args = [action]
|
||||
|
||||
elif action in _SCROLL_ACTIONS:
|
||||
cmd_args = ["scroll", direction]
|
||||
|
||||
elif action == "press":
|
||||
if not value:
|
||||
return ErrorResponse(
|
||||
message="'press' requires a 'value' (key name, e.g. 'Enter').",
|
||||
error="missing_value",
|
||||
session_id=session_name,
|
||||
)
|
||||
cmd_args = ["press", value]
|
||||
|
||||
elif action in _TARGET_ONLY_ACTIONS:
|
||||
if not target:
|
||||
return ErrorResponse(
|
||||
message=f"'{action}' requires a 'target' element.",
|
||||
error="missing_target",
|
||||
session_id=session_name,
|
||||
)
|
||||
cmd_args = [action, target]
|
||||
|
||||
elif action in _TARGET_VALUE_ACTIONS:
|
||||
if not target or not value:
|
||||
return ErrorResponse(
|
||||
message=f"'{action}' requires both 'target' and 'value'.",
|
||||
error="missing_params",
|
||||
session_id=session_name,
|
||||
)
|
||||
cmd_args = [action, target, value]
|
||||
|
||||
elif action in _WAIT_ACTIONS:
|
||||
if not target:
|
||||
return ErrorResponse(
|
||||
message=(
|
||||
"'wait' requires a 'target': a CSS selector to wait for, "
|
||||
"or milliseconds as a string (e.g. '1000')."
|
||||
),
|
||||
error="missing_target",
|
||||
session_id=session_name,
|
||||
)
|
||||
cmd_args = ["wait", target]
|
||||
|
||||
else:
|
||||
return ErrorResponse(
|
||||
message=f"Unsupported action: {action}",
|
||||
error="invalid_action",
|
||||
session_id=session_name,
|
||||
)
|
||||
|
||||
# Restore browser state from cloud if this is a different pod
|
||||
if user_id:
|
||||
await _ensure_session(session_name, user_id, session)
|
||||
|
||||
rc, _, stderr = await _run(session_name, *cmd_args)
|
||||
if rc != 0:
|
||||
logger.warning("[browser_act] %s failed: %s", action, stderr[:300])
|
||||
return ErrorResponse(
|
||||
message=f"Action '{action}' failed.",
|
||||
error="action_failed",
|
||||
session_id=session_name,
|
||||
)
|
||||
|
||||
# Allow the page to settle after interaction (best-effort: SPAs may not idle)
|
||||
settle_rc, _, settle_err = await _run(
|
||||
session_name, "wait", "--load", "networkidle"
|
||||
)
|
||||
if settle_rc != 0:
|
||||
logger.warning(
|
||||
"[browser_act] post-action wait failed: %s", settle_err[:300]
|
||||
)
|
||||
|
||||
snapshot = await _snapshot(session_name)
|
||||
_, url_out, _ = await _run(session_name, "get", "url")
|
||||
|
||||
result = BrowserActResponse(
|
||||
message=f"Performed '{action}'" + (f" on '{target}'" if target else ""),
|
||||
action=action,
|
||||
current_url=url_out.strip(),
|
||||
snapshot=snapshot,
|
||||
session_id=session_name,
|
||||
)
|
||||
|
||||
# Persist browser state to cloud for cross-pod continuity
|
||||
if user_id:
|
||||
_fire_and_forget_save(session_name, user_id, session)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Tool: browser_screenshot
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class BrowserScreenshotTool(BaseTool):
|
||||
"""Capture a screenshot of the current browser page and save it to the workspace."""
|
||||
|
||||
@property
|
||||
def name(self) -> str:
|
||||
return "browser_screenshot"
|
||||
|
||||
@property
|
||||
def description(self) -> str:
|
||||
return (
|
||||
"Take a screenshot of the current browser page and save it to the workspace. "
|
||||
"IMPORTANT: After calling this tool, immediately call read_workspace_file "
|
||||
"with the returned file_id to display the image inline to the user — "
|
||||
"the screenshot is not visible until you do this. "
|
||||
"With annotate=true (default), @ref labels are overlaid on interactive "
|
||||
"elements, making it easy to see which @ref ID maps to which element on screen."
|
||||
)
|
||||
|
||||
@property
|
||||
def parameters(self) -> dict[str, Any]:
|
||||
return {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"annotate": {
|
||||
"type": "boolean",
|
||||
"default": True,
|
||||
"description": "Overlay @ref labels on interactive elements (default: true).",
|
||||
},
|
||||
"filename": {
|
||||
"type": "string",
|
||||
"default": "screenshot.png",
|
||||
"description": "Filename to save in the workspace.",
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
@property
|
||||
def requires_auth(self) -> bool:
|
||||
return True
|
||||
|
||||
@property
|
||||
def is_available(self) -> bool:
|
||||
return shutil.which("agent-browser") is not None
|
||||
|
||||
async def _execute(
|
||||
self,
|
||||
user_id: str | None,
|
||||
session: ChatSession,
|
||||
**kwargs: Any,
|
||||
) -> ToolResponseBase:
|
||||
"""Capture a PNG screenshot and upload it to the workspace.
|
||||
|
||||
Handles string-to-bool coercion for *annotate* (OpenAI function-call
|
||||
payloads sometimes deliver ``"true"``/``"false"`` as strings).
|
||||
Returns a :class:`BrowserScreenshotResponse` with the workspace
|
||||
``file_id`` the LLM should pass to ``read_workspace_file``.
|
||||
"""
|
||||
raw_annotate = kwargs.get("annotate", True)
|
||||
if isinstance(raw_annotate, str):
|
||||
annotate = raw_annotate.strip().lower() in {"1", "true", "yes", "on"}
|
||||
else:
|
||||
annotate = bool(raw_annotate)
|
||||
filename: str = (kwargs.get("filename") or "screenshot.png").strip()
|
||||
session_name = session.session_id
|
||||
|
||||
# Restore browser state from cloud if this is a different pod
|
||||
if user_id:
|
||||
await _ensure_session(session_name, user_id, session)
|
||||
|
||||
tmp_fd, tmp_path = tempfile.mkstemp(suffix=".png")
|
||||
os.close(tmp_fd)
|
||||
try:
|
||||
cmd_args = ["screenshot"]
|
||||
if annotate:
|
||||
cmd_args.append("--annotate")
|
||||
cmd_args.append(tmp_path)
|
||||
|
||||
rc, _, stderr = await _run(session_name, *cmd_args)
|
||||
if rc != 0:
|
||||
logger.warning("[browser_screenshot] failed: %s", stderr[:300])
|
||||
return ErrorResponse(
|
||||
message="Failed to take screenshot.",
|
||||
error="screenshot_failed",
|
||||
session_id=session_name,
|
||||
)
|
||||
|
||||
with open(tmp_path, "rb") as f:
|
||||
png_bytes = f.read()
|
||||
|
||||
finally:
|
||||
try:
|
||||
os.unlink(tmp_path)
|
||||
except OSError:
|
||||
pass # Best-effort temp file cleanup; not critical if it fails.
|
||||
|
||||
# Upload to workspace so the user can view it
|
||||
png_b64 = base64.b64encode(png_bytes).decode()
|
||||
|
||||
# Import here to avoid circular deps — workspace_files imports from .models
|
||||
from .workspace_files import WorkspaceWriteResponse, WriteWorkspaceFileTool
|
||||
|
||||
write_resp = await WriteWorkspaceFileTool()._execute(
|
||||
user_id=user_id,
|
||||
session=session,
|
||||
filename=filename,
|
||||
content_base64=png_b64,
|
||||
)
|
||||
|
||||
if not isinstance(write_resp, WorkspaceWriteResponse):
|
||||
return ErrorResponse(
|
||||
message="Screenshot taken but failed to save to workspace.",
|
||||
error="workspace_write_failed",
|
||||
session_id=session_name,
|
||||
)
|
||||
|
||||
result = BrowserScreenshotResponse(
|
||||
message=f"Screenshot saved to workspace as '{filename}'. Use read_workspace_file with file_id='{write_resp.file_id}' to retrieve it.",
|
||||
file_id=write_resp.file_id,
|
||||
filename=filename,
|
||||
session_id=session_name,
|
||||
)
|
||||
|
||||
# Persist browser state to cloud for cross-pod continuity
|
||||
if user_id:
|
||||
_fire_and_forget_save(session_name, user_id, session)
|
||||
|
||||
return result
|
||||
1663
autogpt_platform/backend/backend/copilot/tools/agent_browser_test.py
Normal file
1663
autogpt_platform/backend/backend/copilot/tools/agent_browser_test.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -36,6 +36,16 @@ class BaseTool:
|
||||
"""Whether this tool requires authentication."""
|
||||
return False
|
||||
|
||||
@property
|
||||
def is_available(self) -> bool:
|
||||
"""Whether this tool is available in the current environment.
|
||||
|
||||
Override to check required env vars, binaries, or other dependencies.
|
||||
Unavailable tools are excluded from the LLM tool list so the model is
|
||||
never offered an option that will immediately fail.
|
||||
"""
|
||||
return True
|
||||
|
||||
def as_openai_tool(self) -> ChatCompletionToolParam:
|
||||
"""Convert to OpenAI tool format."""
|
||||
return ChatCompletionToolParam(
|
||||
|
||||
@@ -41,6 +41,10 @@ class ResponseType(str, Enum):
|
||||
INPUT_VALIDATION_ERROR = "input_validation_error"
|
||||
# Web fetch
|
||||
WEB_FETCH = "web_fetch"
|
||||
# Agent-browser multi-step automation (navigate, act, screenshot)
|
||||
BROWSER_NAVIGATE = "browser_navigate"
|
||||
BROWSER_ACT = "browser_act"
|
||||
BROWSER_SCREENSHOT = "browser_screenshot"
|
||||
# Code execution
|
||||
BASH_EXEC = "bash_exec"
|
||||
# Feature request types
|
||||
@@ -476,3 +480,32 @@ class FeatureRequestCreatedResponse(ToolResponseBase):
|
||||
issue_url: str
|
||||
is_new_issue: bool # False if added to existing
|
||||
customer_name: str
|
||||
|
||||
|
||||
# Agent-browser multi-step automation models
|
||||
|
||||
|
||||
class BrowserNavigateResponse(ToolResponseBase):
|
||||
"""Response for browser_navigate tool."""
|
||||
|
||||
type: ResponseType = ResponseType.BROWSER_NAVIGATE
|
||||
url: str
|
||||
title: str
|
||||
snapshot: str # Interactive accessibility tree with @ref IDs
|
||||
|
||||
|
||||
class BrowserActResponse(ToolResponseBase):
|
||||
"""Response for browser_act tool."""
|
||||
|
||||
type: ResponseType = ResponseType.BROWSER_ACT
|
||||
action: str
|
||||
current_url: str = ""
|
||||
snapshot: str # Updated accessibility tree after the action
|
||||
|
||||
|
||||
class BrowserScreenshotResponse(ToolResponseBase):
|
||||
"""Response for browser_screenshot tool."""
|
||||
|
||||
type: ResponseType = ResponseType.BROWSER_SCREENSHOT
|
||||
file_id: str # Workspace file ID — use read_workspace_file to retrieve
|
||||
filename: str
|
||||
|
||||
@@ -218,7 +218,7 @@ def _is_text_mime(mime_type: str) -> bool:
|
||||
return any(mime_type.startswith(t) for t in _TEXT_MIME_PREFIXES)
|
||||
|
||||
|
||||
async def _get_manager(user_id: str, session_id: str) -> WorkspaceManager:
|
||||
async def get_manager(user_id: str, session_id: str) -> WorkspaceManager:
|
||||
"""Create a session-scoped WorkspaceManager."""
|
||||
workspace = await workspace_db().get_or_create_workspace(user_id)
|
||||
return WorkspaceManager(user_id, workspace.id, session_id)
|
||||
@@ -386,7 +386,7 @@ class ListWorkspaceFilesTool(BaseTool):
|
||||
include_all_sessions: bool = kwargs.get("include_all_sessions", False)
|
||||
|
||||
try:
|
||||
manager = await _get_manager(user_id, session_id)
|
||||
manager = await get_manager(user_id, session_id)
|
||||
files = await manager.list_files(
|
||||
path=path_prefix, limit=limit, include_all_sessions=include_all_sessions
|
||||
)
|
||||
@@ -517,7 +517,7 @@ class ReadWorkspaceFileTool(BaseTool):
|
||||
)
|
||||
|
||||
try:
|
||||
manager = await _get_manager(user_id, session_id)
|
||||
manager = await get_manager(user_id, session_id)
|
||||
resolved = await _resolve_file(manager, file_id, path, session_id)
|
||||
if isinstance(resolved, ErrorResponse):
|
||||
return resolved
|
||||
@@ -725,7 +725,7 @@ class WriteWorkspaceFileTool(BaseTool):
|
||||
|
||||
try:
|
||||
await scan_content_safe(content, filename=filename)
|
||||
manager = await _get_manager(user_id, session_id)
|
||||
manager = await get_manager(user_id, session_id)
|
||||
rec = await manager.write_file(
|
||||
content=content,
|
||||
filename=filename,
|
||||
@@ -852,7 +852,7 @@ class DeleteWorkspaceFileTool(BaseTool):
|
||||
)
|
||||
|
||||
try:
|
||||
manager = await _get_manager(user_id, session_id)
|
||||
manager = await get_manager(user_id, session_id)
|
||||
resolved = await _resolve_file(manager, file_id, path, session_id)
|
||||
if isinstance(resolved, ErrorResponse):
|
||||
return resolved
|
||||
|
||||
@@ -12,6 +12,7 @@ import {
|
||||
GlobeIcon,
|
||||
ListChecksIcon,
|
||||
MagnifyingGlassIcon,
|
||||
MonitorIcon,
|
||||
PencilSimpleIcon,
|
||||
TerminalIcon,
|
||||
TrashIcon,
|
||||
@@ -48,6 +49,7 @@ function formatToolName(name: string): string {
|
||||
type ToolCategory =
|
||||
| "bash"
|
||||
| "web"
|
||||
| "browser"
|
||||
| "file-read"
|
||||
| "file-write"
|
||||
| "file-delete"
|
||||
@@ -65,6 +67,10 @@ function getToolCategory(toolName: string): ToolCategory {
|
||||
case "WebSearch":
|
||||
case "WebFetch":
|
||||
return "web";
|
||||
case "browser_navigate":
|
||||
case "browser_act":
|
||||
case "browser_screenshot":
|
||||
return "browser";
|
||||
case "read_workspace_file":
|
||||
case "read_file":
|
||||
case "Read":
|
||||
@@ -120,6 +126,8 @@ function ToolIcon({
|
||||
return <TerminalIcon size={14} weight="regular" className={iconClass} />;
|
||||
case "web":
|
||||
return <GlobeIcon size={14} weight="regular" className={iconClass} />;
|
||||
case "browser":
|
||||
return <MonitorIcon size={14} weight="regular" className={iconClass} />;
|
||||
case "file-read":
|
||||
return <FileIcon size={14} weight="regular" className={iconClass} />;
|
||||
case "file-write":
|
||||
@@ -155,6 +163,8 @@ function AccordionIcon({ category }: { category: ToolCategory }) {
|
||||
return <TerminalIcon size={32} weight="light" />;
|
||||
case "web":
|
||||
return <GlobeIcon size={32} weight="light" />;
|
||||
case "browser":
|
||||
return <MonitorIcon size={32} weight="light" />;
|
||||
case "file-read":
|
||||
case "file-write":
|
||||
return <FileIcon size={32} weight="light" />;
|
||||
@@ -189,6 +199,16 @@ function getInputSummary(toolName: string, input: unknown): string | null {
|
||||
return typeof inp.url === "string" ? inp.url : null;
|
||||
case "WebSearch":
|
||||
return typeof inp.query === "string" ? inp.query : null;
|
||||
case "browser_navigate":
|
||||
return typeof inp.url === "string" ? inp.url : null;
|
||||
case "browser_act":
|
||||
return typeof inp.action === "string"
|
||||
? inp.target
|
||||
? `${inp.action} ${inp.target}`
|
||||
: (inp.action as string)
|
||||
: null;
|
||||
case "browser_screenshot":
|
||||
return null;
|
||||
case "read_workspace_file":
|
||||
case "read_file":
|
||||
case "Read":
|
||||
@@ -259,6 +279,11 @@ function getAnimationText(part: ToolUIPart, category: ToolCategory): string {
|
||||
return shortSummary
|
||||
? `Fetching ${shortSummary}`
|
||||
: "Fetching web content…";
|
||||
case "browser":
|
||||
if (toolName === "browser_screenshot") return "Taking screenshot…";
|
||||
return shortSummary
|
||||
? `Browsing ${shortSummary}`
|
||||
: "Interacting with browser…";
|
||||
case "file-read":
|
||||
return shortSummary ? `Reading ${shortSummary}` : "Reading file…";
|
||||
case "file-write":
|
||||
@@ -297,6 +322,11 @@ function getAnimationText(part: ToolUIPart, category: ToolCategory): string {
|
||||
return shortSummary
|
||||
? `Fetched ${shortSummary}`
|
||||
: "Fetched web content";
|
||||
case "browser":
|
||||
if (toolName === "browser_screenshot") return "Screenshot captured";
|
||||
return shortSummary
|
||||
? `Browsed ${shortSummary}`
|
||||
: "Browser action completed";
|
||||
case "file-read":
|
||||
return shortSummary ? `Read ${shortSummary}` : "File read completed";
|
||||
case "file-write":
|
||||
@@ -323,6 +353,8 @@ function getAnimationText(part: ToolUIPart, category: ToolCategory): string {
|
||||
return "Command failed";
|
||||
case "web":
|
||||
return toolName === "WebSearch" ? "Search failed" : "Fetch failed";
|
||||
case "browser":
|
||||
return "Browser action failed";
|
||||
default:
|
||||
return `${formatToolName(toolName)} failed`;
|
||||
}
|
||||
@@ -500,6 +532,44 @@ function getWebAccordionData(
|
||||
};
|
||||
}
|
||||
|
||||
function getBrowserAccordionData(
|
||||
toolName: string,
|
||||
input: unknown,
|
||||
output: Record<string, unknown>,
|
||||
): AccordionData {
|
||||
const message = getStringField(output, "message");
|
||||
const snapshot = getStringField(output, "snapshot");
|
||||
|
||||
// Screenshot tool: show the file_id so the user knows it was saved
|
||||
if (toolName === "browser_screenshot") {
|
||||
const fileId = getStringField(output, "file_id");
|
||||
const filename = getStringField(output, "filename");
|
||||
return {
|
||||
title: filename ? `Screenshot: ${filename}` : "Screenshot captured",
|
||||
description: fileId ? `file_id: ${fileId}` : undefined,
|
||||
content: message ? <ContentMessage>{message}</ContentMessage> : null,
|
||||
};
|
||||
}
|
||||
|
||||
// Navigate / act tools: show snapshot if available
|
||||
const title =
|
||||
toolName === "browser_navigate"
|
||||
? (getStringField(output, "title") ?? "Page loaded")
|
||||
: (message ?? "Action completed");
|
||||
|
||||
const url = getStringField(output, "url", "current_url");
|
||||
|
||||
return {
|
||||
title,
|
||||
description: url ? truncate(url, 80) : undefined,
|
||||
content: snapshot ? (
|
||||
<ContentCodeBlock>{truncate(snapshot, 3000)}</ContentCodeBlock>
|
||||
) : message ? (
|
||||
<ContentMessage>{message}</ContentMessage>
|
||||
) : null,
|
||||
};
|
||||
}
|
||||
|
||||
function getFileAccordionData(
|
||||
category: ToolCategory,
|
||||
input: unknown,
|
||||
@@ -725,6 +795,7 @@ function getDefaultAccordionData(
|
||||
|
||||
function getAccordionData(
|
||||
category: ToolCategory,
|
||||
toolName: string,
|
||||
input: unknown,
|
||||
output: Record<string, unknown>,
|
||||
): AccordionData {
|
||||
@@ -733,6 +804,8 @@ function getAccordionData(
|
||||
return getBashAccordionData(input, output);
|
||||
case "web":
|
||||
return getWebAccordionData(input, output);
|
||||
case "browser":
|
||||
return getBrowserAccordionData(toolName, input, output);
|
||||
case "file-read":
|
||||
case "file-write":
|
||||
case "file-delete":
|
||||
@@ -775,7 +848,7 @@ export function GenericTool({ part }: Props) {
|
||||
|
||||
const showAccordion = hasOutput || hasError || hasTodoInput;
|
||||
const accordionData = showAccordion
|
||||
? getAccordionData(category, part.input, output ?? {})
|
||||
? getAccordionData(category, toolName, part.input, output ?? {})
|
||||
: null;
|
||||
|
||||
return (
|
||||
|
||||
@@ -11254,6 +11254,9 @@
|
||||
"operation_in_progress",
|
||||
"input_validation_error",
|
||||
"web_fetch",
|
||||
"browser_navigate",
|
||||
"browser_act",
|
||||
"browser_screenshot",
|
||||
"bash_exec",
|
||||
"feature_request_search",
|
||||
"feature_request_created",
|
||||
|
||||
Reference in New Issue
Block a user