From 1316e16f04697a3b4ee4c995e661f735bbd23743 Mon Sep 17 00:00:00 2001 From: majdyz Date: Tue, 21 Apr 2026 23:39:27 +0700 Subject: [PATCH] feat(backend/copilot): add web_search tool via Anthropic web_search beta MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New `web_search` copilot tool wraps Anthropic's server-side `web_search_20250305` so both SDK and baseline paths have a single unified search interface. Previously baseline (Kimi on OpenRouter) had no native search and had to go through the Perplexity block via `run_block`; SDK (Sonnet) used Claude Code's native WebSearch. * `copilot/tools/web_search.py` — `WebSearchTool` dispatches through `AsyncAnthropic.messages.create` with a cheap Haiku model + `web_search_20250305` tool, parses `web_search_tool_result` blocks into {title, url, snippet, page_age}. `is_available` hides the tool when no Anthropic API key is configured. * `sdk/tool_adapter.py` — moved `WebSearch` from SDK built-in-always list to `SDK_DISALLOWED_TOOLS` so SDK routes through `mcp__copilot__web_search` too. Single code path for cost tracking. * `persist_and_record_usage(provider="anthropic")` — billing lands in the same turn-accounting bucket as LLM cost, so rate limits and credit charges stay coherent. Cost = per-search fee ($10/1K) + Haiku dispatch tokens. * `copilot/tools/models.py` — new `WebSearchResponse` / `WebSearchResult` models matching the native WebSearch shape. 12 new tests: result extractor (title/url/snippet/page_age, limit cap, non-search blocks ignored), cost estimator (per-search fee linear in count), integration (cost tracker called with provider='anthropic'), no-API-key short-circuit, registry sanity. 
--- .../backend/copilot/sdk/tool_adapter.py | 5 +- .../backend/backend/copilot/tools/__init__.py | 2 + .../backend/backend/copilot/tools/models.py | 25 ++ .../backend/copilot/tools/web_search.py | 221 +++++++++++++ .../backend/copilot/tools/web_search_test.py | 304 ++++++++++++++++++ 5 files changed, 556 insertions(+), 1 deletion(-) create mode 100644 autogpt_platform/backend/backend/copilot/tools/web_search.py create mode 100644 autogpt_platform/backend/backend/copilot/tools/web_search_test.py diff --git a/autogpt_platform/backend/backend/copilot/sdk/tool_adapter.py b/autogpt_platform/backend/backend/copilot/sdk/tool_adapter.py index d97937da23..7e1fa0396d 100644 --- a/autogpt_platform/backend/backend/copilot/sdk/tool_adapter.py +++ b/autogpt_platform/backend/backend/copilot/sdk/tool_adapter.py @@ -779,7 +779,9 @@ def create_copilot_mcp_server(*, use_e2b: bool = False): # In E2B mode, all five are disabled — MCP equivalents provide direct sandbox # access. read_file also handles local tool-results and ephemeral reads. _SDK_BUILTIN_FILE_TOOLS = ["Read", "Write", "Edit", "Glob", "Grep"] -_SDK_BUILTIN_ALWAYS = ["Task", "Agent", "WebSearch", "TodoWrite"] +# WebSearch moved to ``SDK_DISALLOWED_TOOLS`` — routed through +# ``mcp__copilot__web_search`` so cost tracking is unified across paths. +_SDK_BUILTIN_ALWAYS = ["Task", "Agent", "TodoWrite"] _SDK_BUILTIN_TOOLS = [*_SDK_BUILTIN_FILE_TOOLS, *_SDK_BUILTIN_ALWAYS] # SDK built-in tools that must be explicitly blocked. 
@@ -805,6 +807,7 @@ _SDK_BUILTIN_TOOLS = [*_SDK_BUILTIN_FILE_TOOLS, *_SDK_BUILTIN_ALWAYS] SDK_DISALLOWED_TOOLS = [ "Bash", "WebFetch", + "WebSearch", "AskUserQuestion", "Write", "Edit", diff --git a/autogpt_platform/backend/backend/copilot/tools/__init__.py b/autogpt_platform/backend/backend/copilot/tools/__init__.py index 9ba050b79a..7aace646a6 100644 --- a/autogpt_platform/backend/backend/copilot/tools/__init__.py +++ b/autogpt_platform/backend/backend/copilot/tools/__init__.py @@ -45,6 +45,7 @@ from .run_sub_session import RunSubSessionTool from .search_docs import SearchDocsTool from .validate_agent import ValidateAgentGraphTool from .web_fetch import WebFetchTool +from .web_search import WebSearchTool from .workspace_files import ( DeleteWorkspaceFileTool, ListWorkspaceFilesTool, @@ -93,6 +94,7 @@ TOOL_REGISTRY: dict[str, BaseTool] = { "get_agent_building_guide": GetAgentBuildingGuideTool(), # Web fetch for safe URL retrieval "web_fetch": WebFetchTool(), + "web_search": WebSearchTool(), # Agent-browser multi-step automation (navigate, act, screenshot) "browser_navigate": BrowserNavigateTool(), "browser_act": BrowserActTool(), diff --git a/autogpt_platform/backend/backend/copilot/tools/models.py b/autogpt_platform/backend/backend/copilot/tools/models.py index 8fa7e6cbb4..08b62056a4 100644 --- a/autogpt_platform/backend/backend/copilot/tools/models.py +++ b/autogpt_platform/backend/backend/copilot/tools/models.py @@ -76,6 +76,7 @@ class ResponseType(str, Enum): # Web WEB_FETCH = "web_fetch" + WEB_SEARCH = "web_search" # Feature requests FEATURE_REQUEST_SEARCH = "feature_request_search" @@ -585,6 +586,30 @@ class WebFetchResponse(ToolResponseBase): truncated: bool = False +class WebSearchResult(BaseModel): + """One entry in a web_search tool response.""" + + title: str + url: str + snippet: str = "" + page_age: str | None = None + + +class WebSearchResponse(ToolResponseBase): + """Response for web_search tool — mirrors the shape of the SDK's + native 
``WebSearch`` tool so the LLM sees a consistent interface + regardless of which path dispatched the call.""" + + type: ResponseType = ResponseType.WEB_SEARCH + query: str + results: list[WebSearchResult] = Field(default_factory=list) + # Backend-reported usage for this call (copied from Anthropic's + # ``usage.server_tool_use``). Surfaces as metadata for frontend + # debug panels but is also what drives rate-limit / cost tracking + # via ``persist_and_record_usage(provider="anthropic")``. + search_requests: int = 0 + + class BashExecResponse(ToolResponseBase): """Response for bash_exec tool.""" diff --git a/autogpt_platform/backend/backend/copilot/tools/web_search.py b/autogpt_platform/backend/backend/copilot/tools/web_search.py new file mode 100644 index 0000000000..4b7ac3a53f --- /dev/null +++ b/autogpt_platform/backend/backend/copilot/tools/web_search.py @@ -0,0 +1,221 @@ +"""Web search tool — wraps Anthropic's server-side ``web_search`` beta. + +Single entry point for web search on both SDK and baseline paths. The +``web_search_20250305`` tool is server-side on Anthropic, so we call +the Messages API directly regardless of which LLM invoked the copilot +tool — OpenRouter can't proxy server-side tool execution. 
+""" + +import logging +from typing import Any + +from anthropic import AsyncAnthropic + +from backend.copilot.model import ChatSession +from backend.copilot.token_tracking import persist_and_record_usage +from backend.util.settings import Settings + +from .base import BaseTool +from .models import ErrorResponse, ToolResponseBase, WebSearchResponse, WebSearchResult + +logger = logging.getLogger(__name__) + +_WEB_SEARCH_DISPATCH_MODEL = "claude-haiku-4-5" +_MAX_DISPATCH_TOKENS = 512 +_DEFAULT_MAX_RESULTS = 5 +_HARD_MAX_RESULTS = 20 + + +class WebSearchTool(BaseTool): + """Search the public web and return cited results.""" + + @property + def name(self) -> str: + return "web_search" + + @property + def description(self) -> str: + return ( + "Search the web and return cited results. Use this for live " + "information — news, current events, up-to-date docs, recent " + "releases — when the model's training data would be stale. " + "Returns a list of {title, url, snippet} plus the URLs so " + "``web_fetch`` can deep-dive any result. Costs a few cents " + "per search; prefer one well-targeted query over many " + "reformulations." + ) + + @property + def parameters(self) -> dict[str, Any]: + return { + "type": "object", + "properties": { + "query": { + "type": "string", + "description": "The search query — a question or topic.", + }, + "max_results": { + "type": "integer", + "description": ( + f"Maximum results to return (default " + f"{_DEFAULT_MAX_RESULTS}, hard cap {_HARD_MAX_RESULTS})." 
+ ), + "default": _DEFAULT_MAX_RESULTS, + }, + }, + "required": ["query"], + } + + @property + def requires_auth(self) -> bool: + return False + + @property + def is_available(self) -> bool: + return bool(Settings().secrets.anthropic_api_key) + + async def _execute( + self, + user_id: str | None, + session: ChatSession, + query: str = "", + max_results: int = _DEFAULT_MAX_RESULTS, + **kwargs: Any, + ) -> ToolResponseBase: + query = (query or "").strip() + session_id = session.session_id if session else None + if not query: + return ErrorResponse( + message="Please provide a non-empty search query.", + error="missing_query", + session_id=session_id, + ) + + try: + max_results = int(max_results) + except (TypeError, ValueError): + max_results = _DEFAULT_MAX_RESULTS + max_results = max(1, min(max_results, _HARD_MAX_RESULTS)) + + api_key = Settings().secrets.anthropic_api_key + if not api_key: + return ErrorResponse( + message=( + "Web search is unavailable — the deployment has no " + "Anthropic API key configured." + ), + error="web_search_not_configured", + session_id=session_id, + ) + + client = AsyncAnthropic(api_key=api_key) + try: + resp = await client.messages.create( + model=_WEB_SEARCH_DISPATCH_MODEL, + max_tokens=_MAX_DISPATCH_TOKENS, + tools=[ + { + "type": "web_search_20250305", + "name": "web_search", + "max_uses": 1, + } + ], + messages=[ + { + "role": "user", + "content": ( + f"Use the web_search tool exactly once with the " + f"query {query!r} and then stop. Do not " + f"summarise — the caller parses the raw " + f"tool_result." 
+ ), + } + ], + ) + except Exception as exc: + logger.warning( + "[web_search] Anthropic call failed for query=%r: %s", query, exc + ) + return ErrorResponse( + message=f"Web search failed: {exc}", + error="web_search_failed", + session_id=session_id, + ) + + results, search_requests = _extract_results(resp, limit=max_results) + + cost_usd = _estimate_cost_usd(resp, search_requests=search_requests) + try: + usage = getattr(resp, "usage", None) + await persist_and_record_usage( + session=session, + user_id=user_id, + prompt_tokens=getattr(usage, "input_tokens", 0) or 0, + completion_tokens=getattr(usage, "output_tokens", 0) or 0, + log_prefix="[web_search]", + cost_usd=cost_usd, + model=_WEB_SEARCH_DISPATCH_MODEL, + provider="anthropic", + ) + except Exception as exc: + logger.warning("[web_search] usage tracking failed: %s", exc) + + return WebSearchResponse( + message=f"Found {len(results)} result(s) for {query!r}.", + query=query, + results=results, + search_requests=search_requests, + session_id=session_id, + ) + + +def _extract_results(resp: Any, *, limit: int) -> tuple[list[WebSearchResult], int]: + """Pull results + server-side request count from an Anthropic response.""" + results: list[WebSearchResult] = [] + search_requests = 0 + + for block in getattr(resp, "content", []) or []: + btype = getattr(block, "type", None) + if btype == "web_search_tool_result": + content = getattr(block, "content", []) or [] + for item in content: + if getattr(item, "type", None) != "web_search_result": + continue + if len(results) >= limit: + break + results.append( + WebSearchResult( + title=getattr(item, "title", "") or "", + url=getattr(item, "url", "") or "", + snippet=getattr(item, "encrypted_content", None) + or getattr(item, "page_content", "") + or "", + page_age=getattr(item, "page_age", None), + ) + ) + + usage = getattr(resp, "usage", None) + server_tool_use = getattr(usage, "server_tool_use", None) if usage else None + if server_tool_use is not None: + 
search_requests = getattr(server_tool_use, "web_search_requests", 0) or 0 + + return results, search_requests + + +# Update when Anthropic revises pricing. +_COST_PER_SEARCH_USD = 0.010 # $10 per 1,000 web_search requests +_HAIKU_INPUT_USD_PER_MTOK = 1.0 +_HAIKU_OUTPUT_USD_PER_MTOK = 5.0 + + +def _estimate_cost_usd(resp: Any, *, search_requests: int) -> float: + """Per-search fee × count + Haiku dispatch tokens.""" + usage = getattr(resp, "usage", None) + input_tokens = getattr(usage, "input_tokens", 0) if usage else 0 + output_tokens = getattr(usage, "output_tokens", 0) if usage else 0 + + search_cost = search_requests * _COST_PER_SEARCH_USD + inference_cost = (input_tokens / 1_000_000) * _HAIKU_INPUT_USD_PER_MTOK + ( + output_tokens / 1_000_000 + ) * _HAIKU_OUTPUT_USD_PER_MTOK + return round(search_cost + inference_cost, 6) diff --git a/autogpt_platform/backend/backend/copilot/tools/web_search_test.py b/autogpt_platform/backend/backend/copilot/tools/web_search_test.py new file mode 100644 index 0000000000..fe7885e171 --- /dev/null +++ b/autogpt_platform/backend/backend/copilot/tools/web_search_test.py @@ -0,0 +1,304 @@ +"""Tests for the ``web_search`` copilot tool. + +Covers the result extractor + cost estimator as pure units (fed with +synthetic Anthropic response objects), plus light integration tests that +mock ``AsyncAnthropic.messages.create`` and confirm the handler plumbs +through to ``persist_and_record_usage`` with the right provider tag. 
+""" + +from types import SimpleNamespace +from unittest.mock import AsyncMock, patch + +import pytest + +from backend.copilot.model import ChatSession + +from .models import ErrorResponse, WebSearchResponse, WebSearchResult +from .web_search import ( + _COST_PER_SEARCH_USD, + WebSearchTool, + _estimate_cost_usd, + _extract_results, +) + + +def _fake_anthropic_response( + *, + results: list[dict] | None = None, + search_requests: int = 1, + input_tokens: int = 120, + output_tokens: int = 40, +) -> SimpleNamespace: + """Build a synthetic Anthropic Messages response. + + Matches the shape produced by ``client.messages.create`` when the + response includes a ``web_search_tool_result`` content block and + ``usage.server_tool_use.web_search_requests`` on the turn meter. + """ + content = [] + if results is not None: + content.append( + SimpleNamespace( + type="web_search_tool_result", + content=[ + SimpleNamespace( + type="web_search_result", + title=r.get("title", "untitled"), + url=r.get("url", ""), + encrypted_content=r.get("snippet", ""), + page_age=r.get("page_age"), + ) + for r in results + ], + ) + ) + usage = SimpleNamespace( + input_tokens=input_tokens, + output_tokens=output_tokens, + server_tool_use=SimpleNamespace(web_search_requests=search_requests), + ) + return SimpleNamespace(content=content, usage=usage) + + +class TestExtractResults: + """The extractor is the only Anthropic-response-shape contact point; + pin its behaviour so an API shape change surfaces here first.""" + + def test_extracts_title_url_snippet_and_page_age(self): + resp = _fake_anthropic_response( + results=[ + { + "title": "Kimi K2.6 launch", + "url": "https://example.com/kimi", + "snippet": "Moonshot released K2.6 on 2026-04-20.", + "page_age": "1 day", + }, + { + "title": "OpenRouter pricing", + "url": "https://openrouter.ai/moonshotai/kimi-k2.6", + "snippet": "", + }, + ] + ) + out, requests = _extract_results(resp, limit=10) + assert requests == 1 + assert len(out) == 2 + assert 
out[0].title == "Kimi K2.6 launch" + assert out[0].url == "https://example.com/kimi" + assert out[0].snippet.startswith("Moonshot released") + assert out[0].page_age == "1 day" + assert out[1].snippet == "" + + def test_limit_caps_returned_results(self): + resp = _fake_anthropic_response( + results=[{"title": f"r{i}", "url": f"https://e/{i}"} for i in range(10)] + ) + out, _ = _extract_results(resp, limit=3) + assert len(out) == 3 + assert [r.title for r in out] == ["r0", "r1", "r2"] + + def test_missing_content_returns_empty(self): + resp = SimpleNamespace(content=[], usage=None) + out, requests = _extract_results(resp, limit=10) + assert out == [] + assert requests == 0 + + def test_non_search_blocks_are_ignored(self): + resp = SimpleNamespace( + content=[ + SimpleNamespace(type="text", text="Here's what I found..."), + SimpleNamespace( + type="web_search_tool_result", + content=[ + SimpleNamespace( + type="web_search_result", + title="real", + url="https://real.example", + encrypted_content="body", + page_age=None, + ) + ], + ), + ], + usage=None, + ) + out, _ = _extract_results(resp, limit=10) + assert len(out) == 1 and out[0].title == "real" + + +class TestEstimateCostUsd: + """Pin the per-search fee + Haiku inference math — the pricing + constants in ``web_search.py`` are hard-coded (no live lookup) so a + drift between Anthropic's schedule and our constants must surface + in this test for the next reader to notice.""" + + def test_zero_searches_still_charges_inference(self): + resp = _fake_anthropic_response(results=[], search_requests=0) + cost = _estimate_cost_usd(resp, search_requests=0) + # Haiku at 120 input / 40 output tokens (the fixture defaults) = tiny but non-zero. 
+ assert 0 < cost < 0.001 + + def test_single_search_fee_dominates(self): + resp = _fake_anthropic_response( + results=[{"title": "x", "url": "https://e"}], + search_requests=1, + input_tokens=100, + output_tokens=20, + ) + cost = _estimate_cost_usd(resp, search_requests=1) + # ~$0.010 search + trivial inference — total still ~1 cent. + assert cost >= _COST_PER_SEARCH_USD + assert cost < _COST_PER_SEARCH_USD + 0.001 + + def test_three_searches_linear_in_count(self): + resp = _fake_anthropic_response( + results=[], search_requests=3, input_tokens=0, output_tokens=0 + ) + cost = _estimate_cost_usd(resp, search_requests=3) + assert cost == pytest.approx(3 * _COST_PER_SEARCH_USD) + + +class TestWebSearchToolDispatch: + """Lightweight integration test: mock the Anthropic client, confirm + the handler returns a ``WebSearchResponse`` and the usage tracker is + called with ``provider='anthropic'`` (not 'open_router', even on the + baseline path — server-side web_search bills Anthropic regardless of + the calling LLM's route).""" + + def _session(self) -> ChatSession: + s = ChatSession.new("test-user", dry_run=False) + s.session_id = "sess-1" + return s + + @pytest.mark.asyncio + async def test_returns_response_with_results_and_tracks_cost(self, monkeypatch): + fake_resp = _fake_anthropic_response( + results=[ + { + "title": "hello", + "url": "https://example.com", + "snippet": "greeting", + } + ], + search_requests=1, + ) + mock_client = type( + "MC", + (), + { + "messages": type( + "M", (), {"create": AsyncMock(return_value=fake_resp)} + )() + }, + )() + + # Stub the Anthropic API key so ``is_available`` is True. 
+ monkeypatch.setattr( + "backend.copilot.tools.web_search.Settings", + lambda: SimpleNamespace( + secrets=SimpleNamespace(anthropic_api_key="sk-test") + ), + ) + + with ( + patch( + "backend.copilot.tools.web_search.AsyncAnthropic", + return_value=mock_client, + ), + patch( + "backend.copilot.tools.web_search.persist_and_record_usage", + new=AsyncMock(return_value=160), + ) as mock_track, + ): + tool = WebSearchTool() + result = await tool._execute( + user_id="u1", + session=self._session(), + query="kimi k2.6 launch", + max_results=5, + ) + + assert isinstance(result, WebSearchResponse) + assert result.query == "kimi k2.6 launch" + assert len(result.results) == 1 + assert isinstance(result.results[0], WebSearchResult) + assert result.search_requests == 1 + + # Cost tracker must have been called with provider="anthropic". + assert mock_track.await_count == 1 + kwargs = mock_track.await_args.kwargs + assert kwargs["provider"] == "anthropic" + assert kwargs["model"] == "claude-haiku-4-5" + assert kwargs["user_id"] == "u1" + assert kwargs["cost_usd"] >= _COST_PER_SEARCH_USD + + @pytest.mark.asyncio + async def test_missing_api_key_returns_error_without_calling_anthropic( + self, monkeypatch + ): + monkeypatch.setattr( + "backend.copilot.tools.web_search.Settings", + lambda: SimpleNamespace(secrets=SimpleNamespace(anthropic_api_key="")), + ) + anthropic_stub = AsyncMock() + with ( + patch( + "backend.copilot.tools.web_search.AsyncAnthropic", + return_value=anthropic_stub, + ), + patch( + "backend.copilot.tools.web_search.persist_and_record_usage", + new=AsyncMock(), + ) as mock_track, + ): + tool = WebSearchTool() + assert tool.is_available is False + result = await tool._execute( + user_id="u1", + session=self._session(), + query="anything", + ) + assert isinstance(result, ErrorResponse) + assert result.error == "web_search_not_configured" + anthropic_stub.messages.create.assert_not_called() + mock_track.assert_not_called() + + @pytest.mark.asyncio + async def 
test_empty_query_rejected_without_api_call(self, monkeypatch): + monkeypatch.setattr( + "backend.copilot.tools.web_search.Settings", + lambda: SimpleNamespace( + secrets=SimpleNamespace(anthropic_api_key="sk-test") + ), + ) + anthropic_stub = AsyncMock() + with patch( + "backend.copilot.tools.web_search.AsyncAnthropic", + return_value=anthropic_stub, + ): + tool = WebSearchTool() + result = await tool._execute( + user_id="u1", session=self._session(), query=" " + ) + assert isinstance(result, ErrorResponse) + assert result.error == "missing_query" + anthropic_stub.messages.create.assert_not_called() + + +class TestToolRegistryIntegration: + """The tool must be registered under the ``web_search`` name so the + MCP layer exposes it as ``mcp__copilot__web_search`` — which is + what the SDK path now dispatches to (see + ``sdk/tool_adapter.py::SDK_DISALLOWED_TOOLS`` which blocks the CLI's + native ``WebSearch`` in favour of the MCP route).""" + + def test_web_search_is_in_tool_registry(self): + from backend.copilot.tools import TOOL_REGISTRY + + assert "web_search" in TOOL_REGISTRY + assert isinstance(TOOL_REGISTRY["web_search"], WebSearchTool) + + def test_sdk_native_websearch_is_disallowed(self): + from backend.copilot.sdk.tool_adapter import SDK_DISALLOWED_TOOLS + + assert "WebSearch" in SDK_DISALLOWED_TOOLS