feat(copilot): Add browse_web tool for JS-rendered page browsing

Adds a new `browse_web` copilot tool backed by Stagehand + Browserbase that handles JavaScript-rendered pages, SPAs, and dynamic content that the existing `web_fetch` tool cannot reach.

- BrowseWebTool: navigates URLs with a real browser and extracts content via a natural-language instruction using Stagehand's extract() API
- Ephemeral sessions per call — no external storage needed
- Thread-safe signal handling for the CoPilot executor thread pool
- 50K char content limit to protect the LLM context window
- Graceful degradation when Stagehand env vars are not configured
- Registered in TOOL_REGISTRY alongside web_fetch

Requires: STAGEHAND_API_KEY, STAGEHAND_PROJECT_ID, ANTHROPIC_API_KEY
2026-04-08 03:00:28 -04:00 · 2026-02-26 23:07:19 +07:00
parent 29ca034e40
commit a131737f57
3 changed files with 221 additions and 0 deletions
--- a/autogpt_platform/backend/backend/copilot/tools/init.py
+++ b/autogpt_platform/backend/backend/copilot/tools/init.py
@@ -10,6 +10,7 @@ from .add_understanding import AddUnderstandingTool
 from .agent_output import AgentOutputTool
 from .base import BaseTool
 from .bash_exec import BashExecTool
+from .browse_web import BrowseWebTool
 from .create_agent import CreateAgentTool
 from .customize_agent import CustomizeAgentTool
 from .edit_agent import EditAgentTool
@@ -50,6 +51,8 @@ TOOL_REGISTRY: dict[str, BaseTool] = {
    "get_doc_page": GetDocPageTool(),
    # Web fetch for safe URL retrieval
    "web_fetch": WebFetchTool(),
+    # Browser-based browsing for JS-rendered pages (Stagehand + Browserbase)
+    "browse_web": BrowseWebTool(),
    # Sandboxed code execution (bubblewrap)
    "bash_exec": BashExecTool(),
    # Persistent workspace tools (cloud storage, survives across sessions)
--- a/autogpt_platform/backend/backend/copilot/tools/browse_web.py
+++ b/autogpt_platform/backend/backend/copilot/tools/browse_web.py
@@ -0,0 +1,207 @@
+"""Web browsing tool — navigate real browser sessions to extract page content.
+
+Uses Stagehand + Browserbase for cloud-based browser execution. Handles
+JS-rendered pages, SPAs, and dynamic content that web_fetch cannot reach.
+
+Requires environment variables:
+    STAGEHAND_API_KEY     — Browserbase API key
+    STAGEHAND_PROJECT_ID  — Browserbase project ID
+    ANTHROPIC_API_KEY     — LLM key used by Stagehand for extraction
+"""
+
+import logging
+import os
+import signal
+import threading
+from contextlib import contextmanager
+from typing import Any, Generator
+
+import stagehand.main
+from stagehand import Stagehand
+
+from backend.copilot.model import ChatSession
+
+from .base import BaseTool
+from .models import BrowseWebResponse, ErrorResponse, ToolResponseBase
+
+logger = logging.getLogger(__name__)
+
+# Stagehand uses the LLM internally for natural-language extraction/actions.
+# This identifier is passed to the Stagehand client as ``model_name``.
+_STAGEHAND_MODEL = "anthropic/claude-sonnet-4-5-20250929"
+# Hard cap on extracted content returned to the LLM context. Longer content is
+# cut to this many characters and a short truncation marker is appended after
+# the cut.
+_MAX_CONTENT_CHARS = 50_000
+
+# ---------------------------------------------------------------------------
+# Thread-safety patch for Stagehand signal handlers
+# Stagehand tries to register signal handlers on init. In worker threads
+# (e.g. the CoPilot executor thread pool) this raises a ValueError because
+# signal.signal() is only allowed in the main thread.
+# ---------------------------------------------------------------------------
+# Keep a reference to Stagehand's own implementation so the wrapper below can
+# still delegate to it when running on the main thread.
+_original_register_signal_handlers = stagehand.main.Stagehand._register_signal_handlers
+
+
+def _safe_register_signal_handlers(self: Any) -> None:
+    """Register Stagehand's signal handlers only when on the main thread.
+
+    Off the main thread this does nothing, because ``signal.signal()`` may
+    only be called from the main thread and would otherwise raise.
+    """
+    on_main_thread = threading.current_thread() is threading.main_thread()
+    if not on_main_thread:
+        return
+    _original_register_signal_handlers(self)
+
+
+# Install the wrapper at import time so every Stagehand instance uses it.
+stagehand.main.Stagehand._register_signal_handlers = _safe_register_signal_handlers
+
+
+# ``signal.signal`` is process-global, so overlapping uses of the context
+# manager below (several worker threads, or coroutines interleaving across an
+# ``await``) must share one patch. The lock guards the depth counter and the
+# saved original function.
+_signal_patch_lock = threading.Lock()
+_signal_patch_state: dict[str, Any] = {"depth": 0, "original": None}
+
+
+@contextmanager
+def _thread_safe_signal() -> Generator[None, None, None]:
+    """Suppress signal.signal() calls when not in the main thread.
+
+    When entered on the main thread this is a no-op. When entered on any
+    other thread, ``signal.signal`` is replaced by a shim that returns
+    ``None`` for calls made off the main thread and forwards calls made on
+    the main thread to the real function.
+
+    The patch is reference-counted: it is installed by the first active
+    user and removed by the last one, in whatever order they exit. A plain
+    save/patch/restore would let an overlapping user save the shim as its
+    "original" and leave ``signal.signal`` disabled for the rest of the
+    process.
+    """
+    if threading.current_thread() is threading.main_thread():
+        yield
+        return
+
+    with _signal_patch_lock:
+        if _signal_patch_state["depth"] == 0:
+            real_signal = signal.signal
+
+            def _main_thread_only(*args: Any, **kwargs: Any) -> Any:
+                # Unrelated main-thread code must keep working while a
+                # worker thread is inside the context.
+                if threading.current_thread() is threading.main_thread():
+                    return real_signal(*args, **kwargs)
+                return None
+
+            _signal_patch_state["original"] = real_signal
+            signal.signal = _main_thread_only  # type: ignore[assignment]
+        _signal_patch_state["depth"] += 1
+
+    try:
+        yield
+    finally:
+        with _signal_patch_lock:
+            _signal_patch_state["depth"] -= 1
+            # Only the last active user restores the real function.
+            if _signal_patch_state["depth"] == 0:
+                signal.signal = _signal_patch_state["original"]
+                _signal_patch_state["original"] = None
+
+
+class BrowseWebTool(BaseTool):
+    """Navigate a URL with a real browser and extract its content.
+
+    Use this instead of ``web_fetch`` when the page requires JavaScript
+    to render (SPAs, dashboards, paywalled content with JS checks, etc.).
+
+    Every call builds its own Stagehand client and closes it before
+    returning, so nothing is kept between calls.
+    """
+
+    @property
+    def name(self) -> str:
+        """Return the tool name, ``browse_web``."""
+        return "browse_web"
+
+    @property
+    def description(self) -> str:
+        """Return the human-readable description of what the tool does."""
+        return (
+            "Navigate to a URL using a real browser and extract content. "
+            "Handles JavaScript-rendered pages and dynamic content that "
+            "web_fetch cannot reach. "
+            "Specify exactly what to extract via the `instruction` parameter."
+        )
+
+    @property
+    def parameters(self) -> dict[str, Any]:
+        """Return the JSON-schema-style description of the tool arguments.
+
+        ``url`` is required; ``instruction`` is optional and falls back to
+        extracting the main page content.
+        """
+        return {
+            "type": "object",
+            "properties": {
+                "url": {
+                    "type": "string",
+                    "description": "The HTTP/HTTPS URL to navigate to.",
+                },
+                "instruction": {
+                    "type": "string",
+                    "description": (
+                        "What to extract from the page. Be specific — e.g. "
+                        "'Extract all pricing plans with features and prices', "
+                        "'Get the main article text and author', "
+                        "'List all navigation links'. "
+                        "Defaults to extracting the main page content."
+                    ),
+                    "default": "Extract the main content of this page.",
+                },
+            },
+            "required": ["url"],
+        }
+
+    @property
+    def requires_auth(self) -> bool:
+        """Report that this tool requires authentication."""
+        return True
+
+    async def _execute(
+        self,
+        user_id: str | None,  # noqa: ARG002
+        session: ChatSession,
+        **kwargs: Any,
+    ) -> ToolResponseBase:
+        """Browse ``url`` and return the content extracted per ``instruction``.
+
+        Args:
+            user_id: Not used by this tool.
+            session: Chat session; only its ``session_id`` is read, and it is
+                echoed back in the response.
+            **kwargs: Tool arguments. ``url`` is the HTTP/HTTPS page to open.
+                ``instruction`` optionally says what to extract and defaults
+                to the main page content.
+
+        Returns:
+            A ``BrowseWebResponse`` on success. Otherwise an ``ErrorResponse``
+            whose ``error`` is ``missing_url``, ``invalid_url``,
+            ``not_configured`` or ``browse_failed``. Failures while browsing
+            are reported in the response instead of being raised.
+        """
+        url: str = (kwargs.get("url") or "").strip()
+        instruction: str = (
+            kwargs.get("instruction") or "Extract the main content of this page."
+        )
+        session_id = session.session_id if session else None
+
+        if not url:
+            return ErrorResponse(
+                message="Please provide a URL to browse.",
+                error="missing_url",
+                session_id=session_id,
+            )
+
+        # URL schemes are case-insensitive, so "HTTPS://..." must be accepted.
+        if not url.lower().startswith(("http://", "https://")):
+            return ErrorResponse(
+                message="Only HTTP/HTTPS URLs are supported.",
+                error="invalid_url",
+                session_id=session_id,
+            )
+
+        api_key = os.environ.get("STAGEHAND_API_KEY")
+        project_id = os.environ.get("STAGEHAND_PROJECT_ID")
+        model_api_key = os.environ.get("ANTHROPIC_API_KEY")
+
+        if not api_key or not project_id:
+            return ErrorResponse(
+                message=(
+                    "Web browsing is not configured on this platform. "
+                    "STAGEHAND_API_KEY and STAGEHAND_PROJECT_ID are required."
+                ),
+                error="not_configured",
+                session_id=session_id,
+            )
+
+        if not model_api_key:
+            return ErrorResponse(
+                message=(
+                    "Web browsing is not configured: ANTHROPIC_API_KEY is required "
+                    "for Stagehand's extraction model."
+                ),
+                error="not_configured",
+                session_id=session_id,
+            )
+
+        client: Stagehand | None = None
+        try:
+            # Stagehand may try to install signal handlers while starting up;
+            # that is only permitted on the main thread.
+            with _thread_safe_signal():
+                client = Stagehand(
+                    api_key=api_key,
+                    project_id=project_id,
+                    model_name=_STAGEHAND_MODEL,
+                    model_api_key=model_api_key,
+                )
+                await client.init()
+
+            page = client.page
+            # Explicit check rather than ``assert``: asserts are removed under
+            # ``python -O``, which would surface later as an opaque
+            # AttributeError on ``None``.
+            if page is None:
+                raise RuntimeError("Stagehand page is not initialized")
+            await page.goto(url)
+            result = await page.extract(instruction)
+
+            # Extract the text content from the Pydantic result model.
+            raw = result.model_dump().get("extraction", "")
+            content = str(raw) if raw else ""
+
+            truncated = len(content) > _MAX_CONTENT_CHARS
+            if truncated:
+                content = content[:_MAX_CONTENT_CHARS] + "\n\n[Content truncated]"
+
+            return BrowseWebResponse(
+                message=f"Browsed {url}",
+                url=url,
+                content=content,
+                truncated=truncated,
+                session_id=session_id,
+            )
+
+        except Exception as e:
+            logger.warning("[browse_web] Failed for %s: %s", url, e)
+            return ErrorResponse(
+                message=f"Failed to browse URL: {e}",
+                error="browse_failed",
+                session_id=session_id,
+            )
+        finally:
+            if client is not None:
+                try:
+                    await client.close()
+                except Exception as close_err:
+                    # Cleanup stays best-effort so it never masks the tool
+                    # result, but a failed close is logged so it can be
+                    # diagnosed.
+                    logger.debug(
+                        "[browse_web] Failed to close Stagehand client: %s",
+                        close_err,
+                    )
--- a/autogpt_platform/backend/backend/copilot/tools/models.py
+++ b/autogpt_platform/backend/backend/copilot/tools/models.py
@@ -41,6 +41,8 @@ class ResponseType(str, Enum):
    INPUT_VALIDATION_ERROR = "input_validation_error"
    # Web fetch
    WEB_FETCH = "web_fetch"
+    # Browser-based web browsing (JS-rendered pages)
+    BROWSE_WEB = "browse_web"
    # Code execution
    BASH_EXEC = "bash_exec"
    # Feature request types
@@ -438,6 +440,15 @@ class WebFetchResponse(ToolResponseBase):
    truncated: bool = False


+class BrowseWebResponse(ToolResponseBase):
+    """Response for browse_web tool.
+
+    Carries the content extracted from a page opened by ``browse_web``.
+    """
+
+    # Response discriminator; fixed to the browse_web response type.
+    type: ResponseType = ResponseType.BROWSE_WEB
+    # URL that was browsed.
+    url: str
+    # Content extracted from the page (shortened when ``truncated`` is True).
+    content: str
+    # True when ``content`` was cut because it exceeded the tool's size cap.
+    truncated: bool = False
+
+
 class BashExecResponse(ToolResponseBase):
    """Response for bash_exec tool."""