feat(backend/copilot): add web_search tool via Anthropic web_search beta

New `web_search` copilot tool wraps Anthropic's server-side `web_search_20250305` so both SDK and baseline paths have a single unified search interface.  Previously baseline (Kimi on OpenRouter) had no native search and had to go through the Perplexity block via `run_block`; SDK (Sonnet) used Claude Code's native WebSearch.

* `copilot/tools/web_search.py` — `WebSearchTool` dispatches through `AsyncAnthropic.messages.create` with a cheap Haiku model + `web_search_20250305` tool, parses `web_search_tool_result` blocks into {title, url, snippet, page_age}.  `is_available` hides the tool when no Anthropic API key is configured.

* `sdk/tool_adapter.py` — moved `WebSearch` from SDK built-in-always list to `SDK_DISALLOWED_TOOLS` so SDK routes through `mcp__copilot__web_search` too.  Single code path for cost tracking.

* `persist_and_record_usage(provider="anthropic")` — billing lands in the same turn-accounting bucket as LLM cost, so rate limits and credit charges stay coherent.  Cost = per-search fee ($10/1K) + Haiku dispatch tokens.

* `copilot/tools/models.py` — new `WebSearchResponse` / `WebSearchResult` models matching the native WebSearch shape.

12 new tests: result extractor (title/url/snippet/page_age, limit cap, non-search blocks ignored), cost estimator (per-search fee linear in count), integration (cost tracker called with provider='anthropic'), no-API-key short-circuit, registry sanity.
This commit is contained in:
majdyz
2026-04-21 23:39:27 +07:00
parent 0591804272
commit 1316e16f04
5 changed files with 556 additions and 1 deletions

View File

@@ -779,7 +779,9 @@ def create_copilot_mcp_server(*, use_e2b: bool = False):
# In E2B mode, all five are disabled — MCP equivalents provide direct sandbox
# access. read_file also handles local tool-results and ephemeral reads.
_SDK_BUILTIN_FILE_TOOLS = ["Read", "Write", "Edit", "Glob", "Grep"]
_SDK_BUILTIN_ALWAYS = ["Task", "Agent", "WebSearch", "TodoWrite"]
# WebSearch moved to ``SDK_DISALLOWED_TOOLS`` — routed through
# ``mcp__copilot__web_search`` so cost tracking is unified across paths.
_SDK_BUILTIN_ALWAYS = ["Task", "Agent", "TodoWrite"]
_SDK_BUILTIN_TOOLS = [*_SDK_BUILTIN_FILE_TOOLS, *_SDK_BUILTIN_ALWAYS]
# SDK built-in tools that must be explicitly blocked.
@@ -805,6 +807,7 @@ _SDK_BUILTIN_TOOLS = [*_SDK_BUILTIN_FILE_TOOLS, *_SDK_BUILTIN_ALWAYS]
SDK_DISALLOWED_TOOLS = [
"Bash",
"WebFetch",
"WebSearch",
"AskUserQuestion",
"Write",
"Edit",

View File

@@ -45,6 +45,7 @@ from .run_sub_session import RunSubSessionTool
from .search_docs import SearchDocsTool
from .validate_agent import ValidateAgentGraphTool
from .web_fetch import WebFetchTool
from .web_search import WebSearchTool
from .workspace_files import (
DeleteWorkspaceFileTool,
ListWorkspaceFilesTool,
@@ -93,6 +94,7 @@ TOOL_REGISTRY: dict[str, BaseTool] = {
"get_agent_building_guide": GetAgentBuildingGuideTool(),
# Web fetch for safe URL retrieval
"web_fetch": WebFetchTool(),
"web_search": WebSearchTool(),
# Agent-browser multi-step automation (navigate, act, screenshot)
"browser_navigate": BrowserNavigateTool(),
"browser_act": BrowserActTool(),

View File

@@ -76,6 +76,7 @@ class ResponseType(str, Enum):
# Web
WEB_FETCH = "web_fetch"
WEB_SEARCH = "web_search"
# Feature requests
FEATURE_REQUEST_SEARCH = "feature_request_search"
@@ -585,6 +586,30 @@ class WebFetchResponse(ToolResponseBase):
truncated: bool = False
class WebSearchResult(BaseModel):
    """One entry in a web_search tool response.

    Mirrors a single ``web_search_result`` content item from Anthropic's
    server-side web search tool, as parsed by ``_extract_results``.
    """

    # Page title reported by the search backend.
    title: str
    # Canonical URL of the result (feed to ``web_fetch`` for a deep dive).
    url: str
    # Short excerpt; empty string when the backend returned no usable text.
    snippet: str = ""
    # Human-readable age of the page (e.g. "1 day"); None when unknown.
    page_age: str | None = None
class WebSearchResponse(ToolResponseBase):
    """Response for web_search tool — mirrors the shape of the SDK's
    native ``WebSearch`` tool so the LLM sees a consistent interface
    regardless of which path dispatched the call."""

    # Discriminator so the frontend can route this payload.
    type: ResponseType = ResponseType.WEB_SEARCH
    # The (stripped) query string that was actually searched.
    query: str
    # Parsed results, already capped to the caller's max_results.
    results: list[WebSearchResult] = Field(default_factory=list)
    # Backend-reported usage for this call (copied from Anthropic's
    # ``usage.server_tool_use``).  Surfaces as metadata for frontend
    # debug panels but is also what drives rate-limit / cost tracking
    # via ``persist_and_record_usage(provider="anthropic")``.
    search_requests: int = 0
class BashExecResponse(ToolResponseBase):
"""Response for bash_exec tool."""

View File

@@ -0,0 +1,221 @@
"""Web search tool — wraps Anthropic's server-side ``web_search`` beta.
Single entry point for web search on both SDK and baseline paths. The
``web_search_20250305`` tool is server-side on Anthropic, so we call
the Messages API directly regardless of which LLM invoked the copilot
tool — OpenRouter can't proxy server-side tool execution.
"""
import logging
from typing import Any
from anthropic import AsyncAnthropic
from backend.copilot.model import ChatSession
from backend.copilot.token_tracking import persist_and_record_usage
from backend.util.settings import Settings
from .base import BaseTool
from .models import ErrorResponse, ToolResponseBase, WebSearchResponse, WebSearchResult
logger = logging.getLogger(__name__)
# Cheap model used purely to dispatch the server-side web_search tool call;
# the model's own prose output is discarded (only the tool_result is parsed).
_WEB_SEARCH_DISPATCH_MODEL = "claude-haiku-4-5"
# Cap on dispatch-model output tokens — keeps the Haiku portion of the cost tiny.
_MAX_DISPATCH_TOKENS = 512
# Default number of results returned when the caller doesn't specify one.
_DEFAULT_MAX_RESULTS = 5
# Absolute ceiling on results regardless of what the caller requests.
_HARD_MAX_RESULTS = 20
class WebSearchTool(BaseTool):
    """Search the public web and return cited results.

    Wraps Anthropic's server-side ``web_search_20250305`` tool: a cheap
    Haiku dispatch call triggers exactly one search, the raw tool_result
    is parsed into :class:`WebSearchResult` entries, and the combined
    cost (per-search fee + dispatch tokens) is recorded against the
    user's turn accounting with ``provider="anthropic"``.
    """

    @property
    def name(self) -> str:
        return "web_search"

    @property
    def description(self) -> str:
        return (
            "Search the web and return cited results. Use this for live "
            "information — news, current events, up-to-date docs, recent "
            "releases — when the model's training data would be stale. "
            "Returns a list of {title, url, snippet} plus the URLs so "
            "``web_fetch`` can deep-dive any result. Costs a few cents "
            "per search; prefer one well-targeted query over many "
            "reformulations."
        )

    @property
    def parameters(self) -> dict[str, Any]:
        return {
            "type": "object",
            "properties": {
                "query": {
                    "type": "string",
                    "description": "The search query — a question or topic.",
                },
                "max_results": {
                    "type": "integer",
                    "description": (
                        f"Maximum results to return (default "
                        f"{_DEFAULT_MAX_RESULTS}, hard cap {_HARD_MAX_RESULTS})."
                    ),
                    "default": _DEFAULT_MAX_RESULTS,
                },
            },
            "required": ["query"],
        }

    @property
    def requires_auth(self) -> bool:
        return False

    @property
    def is_available(self) -> bool:
        # Hide the tool entirely when the deployment has no Anthropic key.
        return bool(Settings().secrets.anthropic_api_key)

    async def _execute(
        self,
        user_id: str | None,
        session: ChatSession,
        query: str = "",
        max_results: int = _DEFAULT_MAX_RESULTS,
        **kwargs: Any,
    ) -> ToolResponseBase:
        """Run one web search and return parsed, cost-tracked results.

        Validation order matters: blank queries are rejected before the
        API-key check, which is checked before any network client is
        constructed.
        """
        sid = session.session_id if session else None

        query = (query or "").strip()
        if not query:
            return ErrorResponse(
                message="Please provide a non-empty search query.",
                error="missing_query",
                session_id=sid,
            )

        # Clamp the requested count into [1, _HARD_MAX_RESULTS]; fall
        # back to the default on non-numeric input.
        try:
            requested = int(max_results)
        except (TypeError, ValueError):
            requested = _DEFAULT_MAX_RESULTS
        limit = min(max(requested, 1), _HARD_MAX_RESULTS)

        api_key = Settings().secrets.anthropic_api_key
        if not api_key:
            return ErrorResponse(
                message=(
                    "Web search is unavailable — the deployment has no "
                    "Anthropic API key configured."
                ),
                error="web_search_not_configured",
                session_id=sid,
            )

        client = AsyncAnthropic(api_key=api_key)
        try:
            # ``max_uses: 1`` pins the server-side tool to a single
            # search; the prompt tells the model not to summarise since
            # only the raw tool_result block is consumed.
            response = await client.messages.create(
                model=_WEB_SEARCH_DISPATCH_MODEL,
                max_tokens=_MAX_DISPATCH_TOKENS,
                tools=[
                    {
                        "type": "web_search_20250305",
                        "name": "web_search",
                        "max_uses": 1,
                    }
                ],
                messages=[
                    {
                        "role": "user",
                        "content": (
                            f"Use the web_search tool exactly once with the "
                            f"query {query!r} and then stop. Do not "
                            f"summarise — the caller parses the raw "
                            f"tool_result."
                        ),
                    }
                ],
            )
        except Exception as err:
            logger.warning(
                "[web_search] Anthropic call failed for query=%r: %s", query, err
            )
            return ErrorResponse(
                message=f"Web search failed: {err}",
                error="web_search_failed",
                session_id=sid,
            )

        results, search_requests = _extract_results(response, limit=limit)
        cost_usd = _estimate_cost_usd(response, search_requests=search_requests)

        # Best-effort accounting — a tracking failure must not discard
        # search results the user already paid for.
        try:
            usage = getattr(response, "usage", None)
            await persist_and_record_usage(
                session=session,
                user_id=user_id,
                prompt_tokens=getattr(usage, "input_tokens", 0) or 0,
                completion_tokens=getattr(usage, "output_tokens", 0) or 0,
                log_prefix="[web_search]",
                cost_usd=cost_usd,
                model=_WEB_SEARCH_DISPATCH_MODEL,
                provider="anthropic",
            )
        except Exception as err:
            logger.warning("[web_search] usage tracking failed: %s", err)

        return WebSearchResponse(
            message=f"Found {len(results)} result(s) for {query!r}.",
            query=query,
            results=results,
            search_requests=search_requests,
            session_id=sid,
        )
def _extract_results(resp: Any, *, limit: int) -> tuple[list[WebSearchResult], int]:
    """Pull results + server-side request count from an Anthropic response.

    Only ``web_search_tool_result`` content blocks are inspected, and only
    their ``web_search_result`` items; everything else (text blocks, other
    tool results) is skipped.  At most ``limit`` results are collected.
    """
    collected: list[WebSearchResult] = []
    for content_block in getattr(resp, "content", []) or []:
        if getattr(content_block, "type", None) != "web_search_tool_result":
            continue
        for item in getattr(content_block, "content", []) or []:
            if getattr(item, "type", None) != "web_search_result":
                continue
            if len(collected) >= limit:
                break
            # Anthropic exposes the excerpt as ``encrypted_content``;
            # fall back to ``page_content`` / empty string when absent.
            excerpt = (
                getattr(item, "encrypted_content", None)
                or getattr(item, "page_content", "")
                or ""
            )
            collected.append(
                WebSearchResult(
                    title=getattr(item, "title", "") or "",
                    url=getattr(item, "url", "") or "",
                    snippet=excerpt,
                    page_age=getattr(item, "page_age", None),
                )
            )

    # Server-side request count lives on the usage meter, not in content.
    usage = getattr(resp, "usage", None)
    server_tool_use = getattr(usage, "server_tool_use", None) if usage else None
    request_count = 0
    if server_tool_use is not None:
        request_count = getattr(server_tool_use, "web_search_requests", 0) or 0
    return collected, request_count
# Update when Anthropic revises pricing.
_COST_PER_SEARCH_USD = 0.010 # $10 per 1,000 web_search requests
_HAIKU_INPUT_USD_PER_MTOK = 1.0
_HAIKU_OUTPUT_USD_PER_MTOK = 5.0
def _estimate_cost_usd(resp: Any, *, search_requests: int) -> float:
"""Per-search fee × count + Haiku dispatch tokens."""
usage = getattr(resp, "usage", None)
input_tokens = getattr(usage, "input_tokens", 0) if usage else 0
output_tokens = getattr(usage, "output_tokens", 0) if usage else 0
search_cost = search_requests * _COST_PER_SEARCH_USD
inference_cost = (input_tokens / 1_000_000) * _HAIKU_INPUT_USD_PER_MTOK + (
output_tokens / 1_000_000
) * _HAIKU_OUTPUT_USD_PER_MTOK
return round(search_cost + inference_cost, 6)

View File

@@ -0,0 +1,304 @@
"""Tests for the ``web_search`` copilot tool.
Covers the result extractor + cost estimator as pure units (fed with
synthetic Anthropic response objects), plus light integration tests that
mock ``AsyncAnthropic.messages.create`` and confirm the handler plumbs
through to ``persist_and_record_usage`` with the right provider tag.
"""
from types import SimpleNamespace
from unittest.mock import AsyncMock, patch
import pytest
from backend.copilot.model import ChatSession
from .models import ErrorResponse, WebSearchResponse, WebSearchResult
from .web_search import (
_COST_PER_SEARCH_USD,
WebSearchTool,
_estimate_cost_usd,
_extract_results,
)
def _fake_anthropic_response(
*,
results: list[dict] | None = None,
search_requests: int = 1,
input_tokens: int = 120,
output_tokens: int = 40,
) -> SimpleNamespace:
"""Build a synthetic Anthropic Messages response.
Matches the shape produced by ``client.messages.create`` when the
response includes a ``web_search_tool_result`` content block and
``usage.server_tool_use.web_search_requests`` on the turn meter.
"""
content = []
if results is not None:
content.append(
SimpleNamespace(
type="web_search_tool_result",
content=[
SimpleNamespace(
type="web_search_result",
title=r.get("title", "untitled"),
url=r.get("url", ""),
encrypted_content=r.get("snippet", ""),
page_age=r.get("page_age"),
)
for r in results
],
)
)
usage = SimpleNamespace(
input_tokens=input_tokens,
output_tokens=output_tokens,
server_tool_use=SimpleNamespace(web_search_requests=search_requests),
)
return SimpleNamespace(content=content, usage=usage)
class TestExtractResults:
    """The extractor is the only Anthropic-response-shape contact point;
    pin its behaviour so an API shape change surfaces here first."""

    def test_extracts_title_url_snippet_and_page_age(self):
        fake = _fake_anthropic_response(
            results=[
                {
                    "title": "Kimi K2.6 launch",
                    "url": "https://example.com/kimi",
                    "snippet": "Moonshot released K2.6 on 2026-04-20.",
                    "page_age": "1 day",
                },
                {
                    "title": "OpenRouter pricing",
                    "url": "https://openrouter.ai/moonshotai/kimi-k2.6",
                    "snippet": "",
                },
            ]
        )
        parsed, request_count = _extract_results(fake, limit=10)
        assert request_count == 1
        assert len(parsed) == 2
        first, second = parsed
        assert first.title == "Kimi K2.6 launch"
        assert first.url == "https://example.com/kimi"
        assert first.snippet.startswith("Moonshot released")
        assert first.page_age == "1 day"
        assert second.snippet == ""

    def test_limit_caps_returned_results(self):
        fake = _fake_anthropic_response(
            results=[{"title": f"r{i}", "url": f"https://e/{i}"} for i in range(10)]
        )
        parsed, _ = _extract_results(fake, limit=3)
        assert len(parsed) == 3
        assert [entry.title for entry in parsed] == ["r0", "r1", "r2"]

    def test_missing_content_returns_empty(self):
        bare = SimpleNamespace(content=[], usage=None)
        parsed, request_count = _extract_results(bare, limit=10)
        assert parsed == []
        assert request_count == 0

    def test_non_search_blocks_are_ignored(self):
        # A text block precedes the real tool_result; only the latter
        # should contribute results.
        mixed = SimpleNamespace(
            content=[
                SimpleNamespace(type="text", text="Here's what I found..."),
                SimpleNamespace(
                    type="web_search_tool_result",
                    content=[
                        SimpleNamespace(
                            type="web_search_result",
                            title="real",
                            url="https://real.example",
                            encrypted_content="body",
                            page_age=None,
                        )
                    ],
                ),
            ],
            usage=None,
        )
        parsed, _ = _extract_results(mixed, limit=10)
        assert len(parsed) == 1
        assert parsed[0].title == "real"
class TestEstimateCostUsd:
    """Pin the per-search fee + Haiku inference math — the pricing
    constants in ``web_search.py`` are hard-coded (no live lookup) so a
    drift between Anthropic's schedule and our constants must surface
    in this test for the next reader to notice."""

    def test_zero_searches_still_charges_inference(self):
        fake = _fake_anthropic_response(results=[], search_requests=0)
        # Fixture defaults are 120 input / 40 output Haiku tokens —
        # a tiny but strictly positive inference cost, zero search fee.
        cost = _estimate_cost_usd(fake, search_requests=0)
        assert 0 < cost < 0.001

    def test_single_search_fee_dominates(self):
        fake = _fake_anthropic_response(
            results=[{"title": "x", "url": "https://e"}],
            search_requests=1,
            input_tokens=100,
            output_tokens=20,
        )
        total = _estimate_cost_usd(fake, search_requests=1)
        # ~$0.010 search + trivial inference — total still ~1 cent.
        assert _COST_PER_SEARCH_USD <= total < _COST_PER_SEARCH_USD + 0.001

    def test_three_searches_linear_in_count(self):
        fake = _fake_anthropic_response(
            results=[], search_requests=3, input_tokens=0, output_tokens=0
        )
        total = _estimate_cost_usd(fake, search_requests=3)
        assert total == pytest.approx(3 * _COST_PER_SEARCH_USD)
class TestWebSearchToolDispatch:
    """Lightweight integration test: mock the Anthropic client, confirm
    the handler returns a ``WebSearchResponse`` and the usage tracker is
    called with ``provider='anthropic'`` (not 'open_router', even on the
    baseline path — server-side web_search bills Anthropic regardless of
    the calling LLM's route)."""

    def _session(self) -> ChatSession:
        # Real ChatSession with a fixed id so session plumbing can be
        # checked without touching persistence.
        s = ChatSession.new("test-user", dry_run=False)
        s.session_id = "sess-1"
        return s

    @pytest.mark.asyncio
    async def test_returns_response_with_results_and_tracks_cost(self, monkeypatch):
        fake_resp = _fake_anthropic_response(
            results=[
                {
                    "title": "hello",
                    "url": "https://example.com",
                    "snippet": "greeting",
                }
            ],
            search_requests=1,
        )
        # Anonymous-class stub standing in for AsyncAnthropic; only
        # ``.messages.create`` is exercised by the tool under test.
        mock_client = type(
            "MC",
            (),
            {
                "messages": type(
                    "M", (), {"create": AsyncMock(return_value=fake_resp)}
                )()
            },
        )()
        # Stub the Anthropic API key so ``is_available`` is True.
        monkeypatch.setattr(
            "backend.copilot.tools.web_search.Settings",
            lambda: SimpleNamespace(
                secrets=SimpleNamespace(anthropic_api_key="sk-test")
            ),
        )
        with (
            patch(
                "backend.copilot.tools.web_search.AsyncAnthropic",
                return_value=mock_client,
            ),
            patch(
                "backend.copilot.tools.web_search.persist_and_record_usage",
                new=AsyncMock(return_value=160),
            ) as mock_track,
        ):
            tool = WebSearchTool()
            result = await tool._execute(
                user_id="u1",
                session=self._session(),
                query="kimi k2.6 launch",
                max_results=5,
            )
        assert isinstance(result, WebSearchResponse)
        assert result.query == "kimi k2.6 launch"
        assert len(result.results) == 1
        assert isinstance(result.results[0], WebSearchResult)
        assert result.search_requests == 1
        # Cost tracker must have been called with provider="anthropic".
        assert mock_track.await_count == 1
        kwargs = mock_track.await_args.kwargs
        assert kwargs["provider"] == "anthropic"
        assert kwargs["model"] == "claude-haiku-4-5"
        assert kwargs["user_id"] == "u1"
        assert kwargs["cost_usd"] >= _COST_PER_SEARCH_USD

    @pytest.mark.asyncio
    async def test_missing_api_key_returns_error_without_calling_anthropic(
        self, monkeypatch
    ):
        # Empty key: the execute path must short-circuit before any
        # network client or usage tracking is touched.
        monkeypatch.setattr(
            "backend.copilot.tools.web_search.Settings",
            lambda: SimpleNamespace(secrets=SimpleNamespace(anthropic_api_key="")),
        )
        anthropic_stub = AsyncMock()
        with (
            patch(
                "backend.copilot.tools.web_search.AsyncAnthropic",
                return_value=anthropic_stub,
            ),
            patch(
                "backend.copilot.tools.web_search.persist_and_record_usage",
                new=AsyncMock(),
            ) as mock_track,
        ):
            tool = WebSearchTool()
            assert tool.is_available is False
            result = await tool._execute(
                user_id="u1",
                session=self._session(),
                query="anything",
            )
        assert isinstance(result, ErrorResponse)
        assert result.error == "web_search_not_configured"
        anthropic_stub.messages.create.assert_not_called()
        mock_track.assert_not_called()

    @pytest.mark.asyncio
    async def test_empty_query_rejected_without_api_call(self, monkeypatch):
        # Whitespace-only query is rejected even though a key is present.
        monkeypatch.setattr(
            "backend.copilot.tools.web_search.Settings",
            lambda: SimpleNamespace(
                secrets=SimpleNamespace(anthropic_api_key="sk-test")
            ),
        )
        anthropic_stub = AsyncMock()
        with patch(
            "backend.copilot.tools.web_search.AsyncAnthropic",
            return_value=anthropic_stub,
        ):
            tool = WebSearchTool()
            result = await tool._execute(
                user_id="u1", session=self._session(), query=" "
            )
        assert isinstance(result, ErrorResponse)
        assert result.error == "missing_query"
        anthropic_stub.messages.create.assert_not_called()
class TestToolRegistryIntegration:
    """The tool must be registered under the ``web_search`` name so the
    MCP layer exposes it as ``mcp__copilot__web_search`` — which is
    what the SDK path now dispatches to (see
    ``sdk/tool_adapter.py::SDK_DISALLOWED_TOOLS`` which blocks the CLI's
    native ``WebSearch`` in favour of the MCP route)."""

    def test_web_search_is_in_tool_registry(self):
        from backend.copilot.tools import TOOL_REGISTRY

        entry = TOOL_REGISTRY.get("web_search")
        assert entry is not None
        assert isinstance(entry, WebSearchTool)

    def test_sdk_native_websearch_is_disallowed(self):
        from backend.copilot.sdk.tool_adapter import SDK_DISALLOWED_TOOLS

        assert "WebSearch" in SDK_DISALLOWED_TOOLS