feat(copilot): Add browse_web tool for JS-rendered page browsing

Adds a new `browse_web` copilot tool backed by Stagehand + Browserbase
that handles JavaScript-rendered pages, SPAs, and dynamic content that
the existing `web_fetch` tool cannot reach.

- BrowseWebTool: navigates URLs with a real browser, extracts content
  via natural language instruction using Stagehand's extract() API
- Ephemeral sessions per call — no external storage needed
- Thread-safe signal handling for the CoPilot executor thread pool
- 50K char content limit to protect LLM context window
- Graceful degradation when Stagehand env vars are not configured
- Registered in TOOL_REGISTRY alongside web_fetch

Requires: STAGEHAND_API_KEY, STAGEHAND_PROJECT_ID, ANTHROPIC_API_KEY
This commit is contained in:
Zamil Majdy
2026-02-26 23:07:19 +07:00
parent 29ca034e40
commit a131737f57
3 changed files with 221 additions and 0 deletions

View File

@@ -10,6 +10,7 @@ from .add_understanding import AddUnderstandingTool
from .agent_output import AgentOutputTool
from .base import BaseTool
from .bash_exec import BashExecTool
from .browse_web import BrowseWebTool
from .create_agent import CreateAgentTool
from .customize_agent import CustomizeAgentTool
from .edit_agent import EditAgentTool
@@ -50,6 +51,8 @@ TOOL_REGISTRY: dict[str, BaseTool] = {
"get_doc_page": GetDocPageTool(),
# Web fetch for safe URL retrieval
"web_fetch": WebFetchTool(),
# Browser-based browsing for JS-rendered pages (Stagehand + Browserbase)
"browse_web": BrowseWebTool(),
# Sandboxed code execution (bubblewrap)
"bash_exec": BashExecTool(),
# Persistent workspace tools (cloud storage, survives across sessions)

View File

@@ -0,0 +1,207 @@
"""Web browsing tool — navigate real browser sessions to extract page content.
Uses Stagehand + Browserbase for cloud-based browser execution. Handles
JS-rendered pages, SPAs, and dynamic content that web_fetch cannot reach.
Requires environment variables:
STAGEHAND_API_KEY — Browserbase API key
STAGEHAND_PROJECT_ID — Browserbase project ID
ANTHROPIC_API_KEY — LLM key used by Stagehand for extraction
"""
import logging
import os
import signal
import threading
from contextlib import contextmanager
from typing import Any, Generator
import stagehand.main
from stagehand import Stagehand
from backend.copilot.model import ChatSession
from .base import BaseTool
from .models import BrowseWebResponse, ErrorResponse, ToolResponseBase
logger = logging.getLogger(__name__)
# Stagehand uses the LLM internally for natural-language extraction/actions.
_STAGEHAND_MODEL = "anthropic/claude-sonnet-4-5-20250929"
# Hard cap on extracted content returned to the LLM context.
_MAX_CONTENT_CHARS = 50_000
# ---------------------------------------------------------------------------
# Thread-safety patch for Stagehand signal handlers
# Stagehand tries to register signal handlers on init. In worker threads
# (e.g. the CoPilot executor thread pool) this raises a ValueError because
# signal.signal() is only allowed in the main thread.
# ---------------------------------------------------------------------------
_original_register_signal_handlers = stagehand.main.Stagehand._register_signal_handlers


def _safe_register_signal_handlers(self: Any) -> None:
    """Run Stagehand's own signal-handler registration only on the main thread.

    Installed below as a replacement for
    ``Stagehand._register_signal_handlers``. Off the main thread the call is
    skipped entirely, because ``signal.signal()`` raises ``ValueError`` there.
    """
    on_main_thread = threading.current_thread() is threading.main_thread()
    if not on_main_thread:
        # Worker thread: registering handlers is not permitted, so do nothing.
        return
    _original_register_signal_handlers(self)


# Module-level side effect: every Stagehand instance created after this import
# uses the guarded registration above.
stagehand.main.Stagehand._register_signal_handlers = _safe_register_signal_handlers
# Shared state for _thread_safe_signal(). The patch is process-global, so
# overlapping users are reference-counted under a lock: only the first entrant
# installs the replacement and only the last one to leave restores the real
# function.
_signal_patch_lock = threading.Lock()
_signal_patch_depth = 0
_real_signal_signal = signal.signal


def _signal_ignored_off_main_thread(*args: Any) -> Any:
    """Stand-in for ``signal.signal`` while the patch is active.

    Calls made from the main thread are forwarded to the real function so that
    legitimate registrations are not lost; calls from any other thread are
    silently ignored and return ``None``.
    """
    if threading.current_thread() is threading.main_thread():
        return _real_signal_signal(*args)
    return None


@contextmanager
def _thread_safe_signal() -> Generator[None, None, None]:
    """Suppress signal.signal() calls when not in the main thread.

    On the main thread this is a no-op context. On any other thread,
    ``signal.signal`` is replaced for the duration of the ``with`` body by a
    function that ignores calls coming from non-main threads.

    The replacement is global to the process, so concurrent or interleaved
    uses are reference-counted: a naive save/restore would let a later
    entrant capture the replacement as the "original" and leave
    ``signal.signal`` permanently disabled once it exits.
    """
    global _signal_patch_depth, _real_signal_signal

    if threading.current_thread() is threading.main_thread():
        yield
        return

    with _signal_patch_lock:
        if _signal_patch_depth == 0:
            # First active user: remember whatever is installed right now.
            _real_signal_signal = signal.signal
            signal.signal = _signal_ignored_off_main_thread  # type: ignore[assignment]
        _signal_patch_depth += 1
    try:
        yield
    finally:
        with _signal_patch_lock:
            _signal_patch_depth -= 1
            if _signal_patch_depth == 0:
                # Last active user: put the real function back.
                signal.signal = _real_signal_signal
# Instruction used when the caller does not supply one; shared by the JSON
# schema default and the runtime fallback so the two cannot drift apart.
_DEFAULT_INSTRUCTION = "Extract the main content of this page."


class BrowseWebTool(BaseTool):
    """Navigate a URL with a real browser and extract its content.

    Use this instead of ``web_fetch`` when the page requires JavaScript
    to render (SPAs, dashboards, paywalled content with JS checks, etc.).

    Each call creates its own Stagehand client and closes it before
    returning, whether the call succeeded or failed.
    """

    @property
    def name(self) -> str:
        """Tool name, matching its key in the tool registry."""
        return "browse_web"

    @property
    def description(self) -> str:
        """Human/LLM-readable summary of what the tool does."""
        return (
            "Navigate to a URL using a real browser and extract content. "
            "Handles JavaScript-rendered pages and dynamic content that "
            "web_fetch cannot reach. "
            "Specify exactly what to extract via the `instruction` parameter."
        )

    @property
    def parameters(self) -> dict[str, Any]:
        """JSON-schema description of the accepted arguments."""
        return {
            "type": "object",
            "properties": {
                "url": {
                    "type": "string",
                    "description": "The HTTP/HTTPS URL to navigate to.",
                },
                "instruction": {
                    "type": "string",
                    "description": (
                        "What to extract from the page. Be specific — e.g. "
                        "'Extract all pricing plans with features and prices', "
                        "'Get the main article text and author', "
                        "'List all navigation links'. "
                        "Defaults to extracting the main page content."
                    ),
                    "default": _DEFAULT_INSTRUCTION,
                },
            },
            "required": ["url"],
        }

    @property
    def requires_auth(self) -> bool:
        """This tool is only available to authenticated users."""
        return True

    async def _execute(
        self,
        user_id: str | None,  # noqa: ARG002
        session: ChatSession,
        **kwargs: Any,
    ) -> ToolResponseBase:
        """Open ``url`` in a Stagehand browser session and extract content.

        Args:
            user_id: Unused by this tool.
            session: Chat session; only its ``session_id`` is read.
            **kwargs: ``url`` (required) and ``instruction`` (optional).

        Returns:
            ``BrowseWebResponse`` with the extracted text (capped at
            ``_MAX_CONTENT_CHARS`` characters) on success, otherwise an
            ``ErrorResponse`` whose ``error`` is one of ``missing_url``,
            ``invalid_url``, ``not_configured`` or ``browse_failed``.
        """
        url: str = (kwargs.get("url") or "").strip()
        instruction: str = kwargs.get("instruction") or _DEFAULT_INSTRUCTION
        session_id = session.session_id if session else None

        if not url:
            return ErrorResponse(
                message="Please provide a URL to browse.",
                error="missing_url",
                session_id=session_id,
            )
        if not url.startswith(("http://", "https://")):
            return ErrorResponse(
                message="Only HTTP/HTTPS URLs are supported.",
                error="invalid_url",
                session_id=session_id,
            )

        api_key = os.environ.get("STAGEHAND_API_KEY")
        project_id = os.environ.get("STAGEHAND_PROJECT_ID")
        model_api_key = os.environ.get("ANTHROPIC_API_KEY")

        if not api_key or not project_id:
            return ErrorResponse(
                message=(
                    "Web browsing is not configured on this platform. "
                    "STAGEHAND_API_KEY and STAGEHAND_PROJECT_ID are required."
                ),
                error="not_configured",
                session_id=session_id,
            )
        if not model_api_key:
            return ErrorResponse(
                message=(
                    "Web browsing is not configured: ANTHROPIC_API_KEY is required "
                    "for Stagehand's extraction model."
                ),
                error="not_configured",
                session_id=session_id,
            )

        client: Stagehand | None = None
        try:
            # Construction and init are both kept under the signal guard.
            # NOTE(review): the original nesting of ``init()`` is ambiguous in
            # this view; keeping it guarded is the conservative choice.
            with _thread_safe_signal():
                client = Stagehand(
                    api_key=api_key,
                    project_id=project_id,
                    model_name=_STAGEHAND_MODEL,
                    model_api_key=model_api_key,
                )
                await client.init()

            page = client.page
            if page is None:
                # Explicit raise instead of ``assert``: asserts are stripped
                # under ``python -O``. Reported as ``browse_failed`` below.
                raise RuntimeError("Stagehand page is not initialized")

            await page.goto(url)
            result = await page.extract(instruction)

            # Extract the text content from the Pydantic result model.
            raw = result.model_dump().get("extraction", "")
            content = str(raw) if raw else ""

            truncated = len(content) > _MAX_CONTENT_CHARS
            if truncated:
                content = content[:_MAX_CONTENT_CHARS] + "\n\n[Content truncated]"

            return BrowseWebResponse(
                message=f"Browsed {url}",
                url=url,
                content=content,
                truncated=truncated,
                session_id=session_id,
            )
        except Exception as e:
            logger.warning("[browse_web] Failed for %s: %s", url, e)
            return ErrorResponse(
                message=f"Failed to browse URL: {e}",
                error="browse_failed",
                session_id=session_id,
            )
        finally:
            if client is not None:
                try:
                    await client.close()
                except Exception:
                    # Best-effort cleanup: never mask the real result, but
                    # leave a trace so leaked sessions can be diagnosed.
                    logger.debug(
                        "[browse_web] Failed to close Stagehand client for %s",
                        url,
                        exc_info=True,
                    )

View File

@@ -41,6 +41,8 @@ class ResponseType(str, Enum):
INPUT_VALIDATION_ERROR = "input_validation_error"
# Web fetch
WEB_FETCH = "web_fetch"
# Browser-based web browsing (JS-rendered pages)
BROWSE_WEB = "browse_web"
# Code execution
BASH_EXEC = "bash_exec"
# Feature request types
@@ -438,6 +440,15 @@ class WebFetchResponse(ToolResponseBase):
truncated: bool = False
class BrowseWebResponse(ToolResponseBase):
    """Payload returned by the ``browse_web`` tool after a page was browsed."""

    # Response discriminator; fixed to the browse_web response type.
    type: ResponseType = ResponseType.BROWSE_WEB
    # URL the browser navigated to.
    url: str
    # Text extracted from the page.
    content: str
    # Whether ``content`` was cut short; defaults to not truncated.
    truncated: bool = False
class BashExecResponse(ToolResponseBase):
"""Response for bash_exec tool."""