feat(backend/copilot): add web_search tool via Anthropic web_search beta

New `web_search` copilot tool wraps Anthropic's server-side `web_search_20250305` so both SDK and baseline paths have a single unified search interface.  Previously baseline (Kimi on OpenRouter) had no native search and had to go through the Perplexity block via `run_block`; SDK (Sonnet) used Claude Code's native WebSearch.

* `copilot/tools/web_search.py` — `WebSearchTool` dispatches through `AsyncAnthropic.messages.create` with a cheap Haiku model + `web_search_20250305` tool, parses `web_search_tool_result` blocks into {title, url, snippet, page_age}.  `is_available` hides the tool when no Anthropic API key is configured.

* `sdk/tool_adapter.py` — moved `WebSearch` from SDK built-in-always list to `SDK_DISALLOWED_TOOLS` so SDK routes through `mcp__copilot__web_search` too.  Single code path for cost tracking.

* `persist_and_record_usage(provider="anthropic")` — billing lands in the same turn-accounting bucket as LLM cost, so rate limits and credit charges stay coherent.  Cost = per-search fee ($10/1K) + Haiku dispatch tokens.

* `copilot/tools/models.py` — new `WebSearchResponse` / `WebSearchResult` models matching the native WebSearch shape.

12 new tests: result extractor (title/url/snippet/page_age, limit cap, non-search blocks ignored), cost estimator (per-search fee linear in count), integration (cost tracker called with provider='anthropic'), no-API-key short-circuit, registry sanity.
This commit is contained in:
majdyz
2026-04-21 23:39:27 +07:00
parent 0591804272
commit 1316e16f04
5 changed files with 556 additions and 1 deletions

View File

@@ -779,7 +779,9 @@ def create_copilot_mcp_server(*, use_e2b: bool = False):
# In E2B mode, all five are disabled — MCP equivalents provide direct sandbox
# access. read_file also handles local tool-results and ephemeral reads.
_SDK_BUILTIN_FILE_TOOLS = ["Read", "Write", "Edit", "Glob", "Grep"]
_SDK_BUILTIN_ALWAYS = ["Task", "Agent", "WebSearch", "TodoWrite"]
# WebSearch moved to ``SDK_DISALLOWED_TOOLS`` — routed through
# ``mcp__copilot__web_search`` so cost tracking is unified across paths.
_SDK_BUILTIN_ALWAYS = ["Task", "Agent", "TodoWrite"]
_SDK_BUILTIN_TOOLS = [*_SDK_BUILTIN_FILE_TOOLS, *_SDK_BUILTIN_ALWAYS]
# SDK built-in tools that must be explicitly blocked.
@@ -805,6 +807,7 @@ _SDK_BUILTIN_TOOLS = [*_SDK_BUILTIN_FILE_TOOLS, *_SDK_BUILTIN_ALWAYS]
SDK_DISALLOWED_TOOLS = [
"Bash",
"WebFetch",
"WebSearch",
"AskUserQuestion",
"Write",
"Edit",

View File

@@ -45,6 +45,7 @@ from .run_sub_session import RunSubSessionTool
from .search_docs import SearchDocsTool
from .validate_agent import ValidateAgentGraphTool
from .web_fetch import WebFetchTool
from .web_search import WebSearchTool
from .workspace_files import (
DeleteWorkspaceFileTool,
ListWorkspaceFilesTool,
@@ -93,6 +94,7 @@ TOOL_REGISTRY: dict[str, BaseTool] = {
"get_agent_building_guide": GetAgentBuildingGuideTool(),
# Web fetch for safe URL retrieval
"web_fetch": WebFetchTool(),
"web_search": WebSearchTool(),
# Agent-browser multi-step automation (navigate, act, screenshot)
"browser_navigate": BrowserNavigateTool(),
"browser_act": BrowserActTool(),

View File

@@ -76,6 +76,7 @@ class ResponseType(str, Enum):
# Web
WEB_FETCH = "web_fetch"
WEB_SEARCH = "web_search"
# Feature requests
FEATURE_REQUEST_SEARCH = "feature_request_search"
@@ -585,6 +586,30 @@ class WebFetchResponse(ToolResponseBase):
truncated: bool = False
class WebSearchResult(BaseModel):
    """One entry in a web_search tool response.

    Mirrors a single ``web_search_result`` content item from Anthropic's
    server-side web search tool, as parsed by ``_extract_results``.
    """

    # Page title reported by the search backend.
    title: str
    # Canonical URL of the result (feed to ``web_fetch`` for a deep dive).
    url: str
    # Short excerpt; empty string when the backend returned no usable text.
    snippet: str = ""
    # Human-readable age of the page (e.g. "1 day"); None when unknown.
    page_age: str | None = None
class WebSearchResponse(ToolResponseBase):
    """Response for web_search tool — mirrors the shape of the SDK's
    native ``WebSearch`` tool so the LLM sees a consistent interface
    regardless of which path dispatched the call."""

    # Discriminator so the frontend can route this payload.
    type: ResponseType = ResponseType.WEB_SEARCH
    # The (stripped) query string that was actually searched.
    query: str
    # Parsed results, already capped to the caller's max_results.
    results: list[WebSearchResult] = Field(default_factory=list)
    # Backend-reported usage for this call (copied from Anthropic's
    # ``usage.server_tool_use``).  Surfaces as metadata for frontend
    # debug panels but is also what drives rate-limit / cost tracking
    # via ``persist_and_record_usage(provider="anthropic")``.
    search_requests: int = 0
class BashExecResponse(ToolResponseBase):
"""Response for bash_exec tool."""

View File

@@ -0,0 +1,221 @@
"""Web search tool — wraps Anthropic's server-side ``web_search`` beta.
Single entry point for web search on both SDK and baseline paths. The
``web_search_20250305`` tool is server-side on Anthropic, so we call
the Messages API directly regardless of which LLM invoked the copilot
tool — OpenRouter can't proxy server-side tool execution.
"""
import logging
from typing import Any
from anthropic import AsyncAnthropic
from backend.copilot.model import ChatSession
from backend.copilot.token_tracking import persist_and_record_usage
from backend.util.settings import Settings
from .base import BaseTool
from .models import ErrorResponse, ToolResponseBase, WebSearchResponse, WebSearchResult
logger = logging.getLogger(__name__)
# Cheap model used purely to dispatch the server-side web_search tool call;
# the model's own prose output is discarded (only the tool_result is parsed).
_WEB_SEARCH_DISPATCH_MODEL = "claude-haiku-4-5"
# Cap on dispatch-model output tokens — keeps the Haiku portion of the cost tiny.
_MAX_DISPATCH_TOKENS = 512
# Default number of results returned when the caller doesn't specify one.
_DEFAULT_MAX_RESULTS = 5
# Absolute ceiling on results regardless of what the caller requests.
_HARD_MAX_RESULTS = 20
class WebSearchTool(BaseTool):
    """Search the public web and return cited results.

    Wraps Anthropic's server-side ``web_search_20250305`` tool: a cheap
    Haiku dispatch call triggers exactly one search, the raw tool_result
    is parsed into :class:`WebSearchResult` entries, and the combined
    cost (per-search fee + dispatch tokens) is recorded against the
    user's turn accounting with ``provider="anthropic"``.
    """

    @property
    def name(self) -> str:
        return "web_search"

    @property
    def description(self) -> str:
        return (
            "Search the web and return cited results. Use this for live "
            "information — news, current events, up-to-date docs, recent "
            "releases — when the model's training data would be stale. "
            "Returns a list of {title, url, snippet} plus the URLs so "
            "``web_fetch`` can deep-dive any result. Costs a few cents "
            "per search; prefer one well-targeted query over many "
            "reformulations."
        )

    @property
    def parameters(self) -> dict[str, Any]:
        return {
            "type": "object",
            "properties": {
                "query": {
                    "type": "string",
                    "description": "The search query — a question or topic.",
                },
                "max_results": {
                    "type": "integer",
                    "description": (
                        f"Maximum results to return (default "
                        f"{_DEFAULT_MAX_RESULTS}, hard cap {_HARD_MAX_RESULTS})."
                    ),
                    "default": _DEFAULT_MAX_RESULTS,
                },
            },
            "required": ["query"],
        }

    @property
    def requires_auth(self) -> bool:
        return False

    @property
    def is_available(self) -> bool:
        # Hide the tool entirely when the deployment has no Anthropic key.
        return bool(Settings().secrets.anthropic_api_key)

    async def _execute(
        self,
        user_id: str | None,
        session: ChatSession,
        query: str = "",
        max_results: int = _DEFAULT_MAX_RESULTS,
        **kwargs: Any,
    ) -> ToolResponseBase:
        """Run one web search and return parsed, cost-tracked results.

        Validation order matters: blank queries are rejected before the
        API-key check, which is checked before any network client is
        constructed.
        """
        sid = session.session_id if session else None

        query = (query or "").strip()
        if not query:
            return ErrorResponse(
                message="Please provide a non-empty search query.",
                error="missing_query",
                session_id=sid,
            )

        # Clamp the requested count into [1, _HARD_MAX_RESULTS]; fall
        # back to the default on non-numeric input.
        try:
            requested = int(max_results)
        except (TypeError, ValueError):
            requested = _DEFAULT_MAX_RESULTS
        limit = min(max(requested, 1), _HARD_MAX_RESULTS)

        api_key = Settings().secrets.anthropic_api_key
        if not api_key:
            return ErrorResponse(
                message=(
                    "Web search is unavailable — the deployment has no "
                    "Anthropic API key configured."
                ),
                error="web_search_not_configured",
                session_id=sid,
            )

        client = AsyncAnthropic(api_key=api_key)
        try:
            # ``max_uses: 1`` pins the server-side tool to a single
            # search; the prompt tells the model not to summarise since
            # only the raw tool_result block is consumed.
            response = await client.messages.create(
                model=_WEB_SEARCH_DISPATCH_MODEL,
                max_tokens=_MAX_DISPATCH_TOKENS,
                tools=[
                    {
                        "type": "web_search_20250305",
                        "name": "web_search",
                        "max_uses": 1,
                    }
                ],
                messages=[
                    {
                        "role": "user",
                        "content": (
                            f"Use the web_search tool exactly once with the "
                            f"query {query!r} and then stop. Do not "
                            f"summarise — the caller parses the raw "
                            f"tool_result."
                        ),
                    }
                ],
            )
        except Exception as err:
            logger.warning(
                "[web_search] Anthropic call failed for query=%r: %s", query, err
            )
            return ErrorResponse(
                message=f"Web search failed: {err}",
                error="web_search_failed",
                session_id=sid,
            )

        results, search_requests = _extract_results(response, limit=limit)
        cost_usd = _estimate_cost_usd(response, search_requests=search_requests)

        # Best-effort accounting — a tracking failure must not discard
        # search results the user already paid for.
        try:
            usage = getattr(response, "usage", None)
            await persist_and_record_usage(
                session=session,
                user_id=user_id,
                prompt_tokens=getattr(usage, "input_tokens", 0) or 0,
                completion_tokens=getattr(usage, "output_tokens", 0) or 0,
                log_prefix="[web_search]",
                cost_usd=cost_usd,
                model=_WEB_SEARCH_DISPATCH_MODEL,
                provider="anthropic",
            )
        except Exception as err:
            logger.warning("[web_search] usage tracking failed: %s", err)

        return WebSearchResponse(
            message=f"Found {len(results)} result(s) for {query!r}.",
            query=query,
            results=results,
            search_requests=search_requests,
            session_id=sid,
        )
def _extract_results(resp: Any, *, limit: int) -> tuple[list[WebSearchResult], int]:
    """Pull results + server-side request count from an Anthropic response.

    Only ``web_search_tool_result`` content blocks are inspected, and only
    their ``web_search_result`` items; everything else (text blocks, other
    tool results) is skipped.  At most ``limit`` results are collected.
    """
    collected: list[WebSearchResult] = []
    for content_block in getattr(resp, "content", []) or []:
        if getattr(content_block, "type", None) != "web_search_tool_result":
            continue
        for item in getattr(content_block, "content", []) or []:
            if getattr(item, "type", None) != "web_search_result":
                continue
            if len(collected) >= limit:
                break
            # Anthropic exposes the excerpt as ``encrypted_content``;
            # fall back to ``page_content`` / empty string when absent.
            excerpt = (
                getattr(item, "encrypted_content", None)
                or getattr(item, "page_content", "")
                or ""
            )
            collected.append(
                WebSearchResult(
                    title=getattr(item, "title", "") or "",
                    url=getattr(item, "url", "") or "",
                    snippet=excerpt,
                    page_age=getattr(item, "page_age", None),
                )
            )

    # Server-side request count lives on the usage meter, not in content.
    usage = getattr(resp, "usage", None)
    server_tool_use = getattr(usage, "server_tool_use", None) if usage else None
    request_count = 0
    if server_tool_use is not None:
        request_count = getattr(server_tool_use, "web_search_requests", 0) or 0
    return collected, request_count
# Update when Anthropic revises pricing.
_COST_PER_SEARCH_USD = 0.010 # $10 per 1,000 web_search requests
_HAIKU_INPUT_USD_PER_MTOK = 1.0
_HAIKU_OUTPUT_USD_PER_MTOK = 5.0
def _estimate_cost_usd(resp: Any, *, search_requests: int) -> float:
"""Per-search fee × count + Haiku dispatch tokens."""
usage = getattr(resp, "usage", None)
input_tokens = getattr(usage, "input_tokens", 0) if usage else 0
output_tokens = getattr(usage, "output_tokens", 0) if usage else 0
search_cost = search_requests * _COST_PER_SEARCH_USD
inference_cost = (input_tokens / 1_000_000) * _HAIKU_INPUT_USD_PER_MTOK + (
output_tokens / 1_000_000
) * _HAIKU_OUTPUT_USD_PER_MTOK
return round(search_cost + inference_cost, 6)

View File

@@ -0,0 +1,304 @@
"""Tests for the ``web_search`` copilot tool.
Covers the result extractor + cost estimator as pure units (fed with
synthetic Anthropic response objects), plus light integration tests that
mock ``AsyncAnthropic.messages.create`` and confirm the handler plumbs
through to ``persist_and_record_usage`` with the right provider tag.
"""
from types import SimpleNamespace
from unittest.mock import AsyncMock, patch
import pytest
from backend.copilot.model import ChatSession
from .models import ErrorResponse, WebSearchResponse, WebSearchResult
from .web_search import (
_COST_PER_SEARCH_USD,
WebSearchTool,
_estimate_cost_usd,
_extract_results,
)
def _fake_anthropic_response(
*,
results: list[dict] | None = None,
search_requests: int = 1,
input_tokens: int = 120,
output_tokens: int = 40,
) -> SimpleNamespace:
"""Build a synthetic Anthropic Messages response.
Matches the shape produced by ``client.messages.create`` when the
response includes a ``web_search_tool_result`` content block and
``usage.server_tool_use.web_search_requests`` on the turn meter.
"""
content = []
if results is not None:
content.append(
SimpleNamespace(
type="web_search_tool_result",
content=[
SimpleNamespace(
type="web_search_result",
title=r.get("title", "untitled"),
url=r.get("url", ""),
encrypted_content=r.get("snippet", ""),
page_age=r.get("page_age"),
)
for r in results
],
)
)
usage = SimpleNamespace(
input_tokens=input_tokens,
output_tokens=output_tokens,
server_tool_use=SimpleNamespace(web_search_requests=search_requests),
)
return SimpleNamespace(content=content, usage=usage)
class TestExtractResults:
    """The extractor is the only Anthropic-response-shape contact point;
    pin its behaviour so an API shape change surfaces here first."""

    def test_extracts_title_url_snippet_and_page_age(self):
        fake = _fake_anthropic_response(
            results=[
                {
                    "title": "Kimi K2.6 launch",
                    "url": "https://example.com/kimi",
                    "snippet": "Moonshot released K2.6 on 2026-04-20.",
                    "page_age": "1 day",
                },
                {
                    "title": "OpenRouter pricing",
                    "url": "https://openrouter.ai/moonshotai/kimi-k2.6",
                    "snippet": "",
                },
            ]
        )
        parsed, request_count = _extract_results(fake, limit=10)
        assert request_count == 1
        assert len(parsed) == 2
        first, second = parsed
        assert first.title == "Kimi K2.6 launch"
        assert first.url == "https://example.com/kimi"
        assert first.snippet.startswith("Moonshot released")
        assert first.page_age == "1 day"
        assert second.snippet == ""

    def test_limit_caps_returned_results(self):
        fake = _fake_anthropic_response(
            results=[{"title": f"r{i}", "url": f"https://e/{i}"} for i in range(10)]
        )
        parsed, _ = _extract_results(fake, limit=3)
        assert len(parsed) == 3
        assert [entry.title for entry in parsed] == ["r0", "r1", "r2"]

    def test_missing_content_returns_empty(self):
        bare = SimpleNamespace(content=[], usage=None)
        parsed, request_count = _extract_results(bare, limit=10)
        assert parsed == []
        assert request_count == 0

    def test_non_search_blocks_are_ignored(self):
        # A text block precedes the real tool_result; only the latter
        # should contribute results.
        mixed = SimpleNamespace(
            content=[
                SimpleNamespace(type="text", text="Here's what I found..."),
                SimpleNamespace(
                    type="web_search_tool_result",
                    content=[
                        SimpleNamespace(
                            type="web_search_result",
                            title="real",
                            url="https://real.example",
                            encrypted_content="body",
                            page_age=None,
                        )
                    ],
                ),
            ],
            usage=None,
        )
        parsed, _ = _extract_results(mixed, limit=10)
        assert len(parsed) == 1
        assert parsed[0].title == "real"
class TestEstimateCostUsd:
    """Pin the per-search fee + Haiku inference math — the pricing
    constants in ``web_search.py`` are hard-coded (no live lookup) so a
    drift between Anthropic's schedule and our constants must surface
    in this test for the next reader to notice."""

    def test_zero_searches_still_charges_inference(self):
        fake = _fake_anthropic_response(results=[], search_requests=0)
        # Fixture defaults are 120 input / 40 output Haiku tokens —
        # a tiny but strictly positive inference cost, zero search fee.
        cost = _estimate_cost_usd(fake, search_requests=0)
        assert 0 < cost < 0.001

    def test_single_search_fee_dominates(self):
        fake = _fake_anthropic_response(
            results=[{"title": "x", "url": "https://e"}],
            search_requests=1,
            input_tokens=100,
            output_tokens=20,
        )
        total = _estimate_cost_usd(fake, search_requests=1)
        # ~$0.010 search + trivial inference — total still ~1 cent.
        assert _COST_PER_SEARCH_USD <= total < _COST_PER_SEARCH_USD + 0.001

    def test_three_searches_linear_in_count(self):
        fake = _fake_anthropic_response(
            results=[], search_requests=3, input_tokens=0, output_tokens=0
        )
        total = _estimate_cost_usd(fake, search_requests=3)
        assert total == pytest.approx(3 * _COST_PER_SEARCH_USD)
class TestWebSearchToolDispatch:
    """Lightweight integration test: mock the Anthropic client, confirm
    the handler returns a ``WebSearchResponse`` and the usage tracker is
    called with ``provider='anthropic'`` (not 'open_router', even on the
    baseline path — server-side web_search bills Anthropic regardless of
    the calling LLM's route)."""

    def _session(self) -> ChatSession:
        # Real ChatSession with a fixed id so session plumbing can be
        # checked without touching persistence.
        s = ChatSession.new("test-user", dry_run=False)
        s.session_id = "sess-1"
        return s

    @pytest.mark.asyncio
    async def test_returns_response_with_results_and_tracks_cost(self, monkeypatch):
        fake_resp = _fake_anthropic_response(
            results=[
                {
                    "title": "hello",
                    "url": "https://example.com",
                    "snippet": "greeting",
                }
            ],
            search_requests=1,
        )
        # Anonymous-class stub standing in for AsyncAnthropic; only
        # ``.messages.create`` is exercised by the tool under test.
        mock_client = type(
            "MC",
            (),
            {
                "messages": type(
                    "M", (), {"create": AsyncMock(return_value=fake_resp)}
                )()
            },
        )()
        # Stub the Anthropic API key so ``is_available`` is True.
        monkeypatch.setattr(
            "backend.copilot.tools.web_search.Settings",
            lambda: SimpleNamespace(
                secrets=SimpleNamespace(anthropic_api_key="sk-test")
            ),
        )
        with (
            patch(
                "backend.copilot.tools.web_search.AsyncAnthropic",
                return_value=mock_client,
            ),
            patch(
                "backend.copilot.tools.web_search.persist_and_record_usage",
                new=AsyncMock(return_value=160),
            ) as mock_track,
        ):
            tool = WebSearchTool()
            result = await tool._execute(
                user_id="u1",
                session=self._session(),
                query="kimi k2.6 launch",
                max_results=5,
            )
        assert isinstance(result, WebSearchResponse)
        assert result.query == "kimi k2.6 launch"
        assert len(result.results) == 1
        assert isinstance(result.results[0], WebSearchResult)
        assert result.search_requests == 1
        # Cost tracker must have been called with provider="anthropic".
        assert mock_track.await_count == 1
        kwargs = mock_track.await_args.kwargs
        assert kwargs["provider"] == "anthropic"
        assert kwargs["model"] == "claude-haiku-4-5"
        assert kwargs["user_id"] == "u1"
        assert kwargs["cost_usd"] >= _COST_PER_SEARCH_USD

    @pytest.mark.asyncio
    async def test_missing_api_key_returns_error_without_calling_anthropic(
        self, monkeypatch
    ):
        # Empty key: the execute path must short-circuit before any
        # network client or usage tracking is touched.
        monkeypatch.setattr(
            "backend.copilot.tools.web_search.Settings",
            lambda: SimpleNamespace(secrets=SimpleNamespace(anthropic_api_key="")),
        )
        anthropic_stub = AsyncMock()
        with (
            patch(
                "backend.copilot.tools.web_search.AsyncAnthropic",
                return_value=anthropic_stub,
            ),
            patch(
                "backend.copilot.tools.web_search.persist_and_record_usage",
                new=AsyncMock(),
            ) as mock_track,
        ):
            tool = WebSearchTool()
            assert tool.is_available is False
            result = await tool._execute(
                user_id="u1",
                session=self._session(),
                query="anything",
            )
        assert isinstance(result, ErrorResponse)
        assert result.error == "web_search_not_configured"
        anthropic_stub.messages.create.assert_not_called()
        mock_track.assert_not_called()

    @pytest.mark.asyncio
    async def test_empty_query_rejected_without_api_call(self, monkeypatch):
        # Whitespace-only query is rejected even though a key is present.
        monkeypatch.setattr(
            "backend.copilot.tools.web_search.Settings",
            lambda: SimpleNamespace(
                secrets=SimpleNamespace(anthropic_api_key="sk-test")
            ),
        )
        anthropic_stub = AsyncMock()
        with patch(
            "backend.copilot.tools.web_search.AsyncAnthropic",
            return_value=anthropic_stub,
        ):
            tool = WebSearchTool()
            result = await tool._execute(
                user_id="u1", session=self._session(), query=" "
            )
        assert isinstance(result, ErrorResponse)
        assert result.error == "missing_query"
        anthropic_stub.messages.create.assert_not_called()
class TestToolRegistryIntegration:
    """The tool must be registered under the ``web_search`` name so the
    MCP layer exposes it as ``mcp__copilot__web_search`` — which is
    what the SDK path now dispatches to (see
    ``sdk/tool_adapter.py::SDK_DISALLOWED_TOOLS`` which blocks the CLI's
    native ``WebSearch`` in favour of the MCP route)."""

    def test_web_search_is_in_tool_registry(self):
        from backend.copilot.tools import TOOL_REGISTRY

        entry = TOOL_REGISTRY.get("web_search")
        assert entry is not None
        assert isinstance(entry, WebSearchTool)

    def test_sdk_native_websearch_is_disallowed(self):
        from backend.copilot.sdk.tool_adapter import SDK_DISALLOWED_TOOLS

        assert "WebSearch" in SDK_DISALLOWED_TOOLS