diff --git a/autogpt_platform/backend/backend/copilot/baseline/reasoning.py b/autogpt_platform/backend/backend/copilot/baseline/reasoning.py index 15a77dde8a..0c689ed4a7 100644 --- a/autogpt_platform/backend/backend/copilot/baseline/reasoning.py +++ b/autogpt_platform/backend/backend/copilot/baseline/reasoning.py @@ -1,7 +1,8 @@ """Extended-thinking wire support for the baseline (OpenRouter) path. -Anthropic routes on OpenRouter expose extended thinking through -non-OpenAI extension fields that the OpenAI Python SDK doesn't model: +OpenRouter routes that support extended thinking (Anthropic Claude and +Moonshot Kimi today) expose reasoning through non-OpenAI extension fields +that the OpenAI Python SDK doesn't model: * ``reasoning`` (legacy string) — enabled by ``include_reasoning: true``. * ``reasoning_content`` — DeepSeek / some OpenRouter routes. @@ -17,12 +18,14 @@ This module keeps the wire-level concerns in one place: one streaming round and emits ``StreamReasoning*`` events so the caller only has to plumb the events into its pending queue. * :func:`reasoning_extra_body` builds the ``extra_body`` fragment for the - OpenAI client call. Returns ``None`` on non-Anthropic routes. + OpenAI client call. Returns ``None`` for routes without reasoning + support (see :func:`_is_reasoning_route`). """ from __future__ import annotations import logging +import time import uuid from typing import Any @@ -42,6 +45,19 @@ logger = logging.getLogger(__name__) _VISIBLE_REASONING_TYPES = frozenset({"reasoning.text", "reasoning.summary"}) +# Coalescing thresholds for ``StreamReasoningDelta`` emission. OpenRouter's +# Kimi K2.6 endpoint tokenises reasoning at a much finer grain than Anthropic +# (~4,700 deltas per turn in one observed session, vs ~28 for Sonnet); without +# coalescing, every chunk is one Redis ``xadd`` + one SSE frame + one React +# re-render of the non-virtualised chat list, which paint-storms the browser +# main thread and freezes the UI. Batching into ~32-char / ~40 ms windows +# cuts the event rate ~100x while staying snappy enough that the Reasoning +# collapse still feels live (well under the ~100 ms perceptual threshold). +# Per-delta persistence to ``session.messages`` stays granular — we only +# coalesce the *wire* emission. +_COALESCE_MIN_CHARS = 32 +_COALESCE_MAX_INTERVAL_MS = 40.0 + class ReasoningDetail(BaseModel): """One entry in OpenRouter's ``reasoning_details`` list. @@ -132,18 +148,72 @@ class OpenRouterDeltaExtension(BaseModel): return "".join(d.visible_text for d in self.reasoning_details) +def _is_reasoning_route(model: str) -> bool: + """Return True when the route supports OpenRouter's ``reasoning`` extension. + + OpenRouter exposes reasoning tokens via a unified ``reasoning`` request + param that works on any provider that supports extended thinking — + currently Anthropic (Claude Opus / Sonnet) and Moonshot (Kimi K2.6 + + kimi-k2-thinking) advertise it in their ``supported_parameters``. + Other providers silently drop the field, but we skip it anyway to keep + the payload tight and avoid confusing cache diagnostics. + + Kept separate from :func:`backend.copilot.baseline.service._is_anthropic_model` + because ``cache_control`` is strictly Anthropic-specific (Moonshot does + its own auto-caching), so the two gates must not conflate. 
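+
+    Illustrative behaviour (pinned by the route tests added in this
+    PR)::
+
+        _is_reasoning_route("anthropic/claude-sonnet-4-6")  # True
+        _is_reasoning_route("moonshotai/kimi-k2.6")         # True
+        _is_reasoning_route("someprovider/claude-mock-v1")  # False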
+
+    Both the Claude and Kimi matches are anchored to the provider
+    prefix (or to a bare model id with no prefix at all) to avoid
+    substring false positives — a custom ``some-other-provider/claude-mock``
+    or ``provider/hakimi-large`` configured via
+    ``CHAT_FAST_STANDARD_MODEL`` must NOT inherit the reasoning
+    extra_body (and then take a 400 from its upstream). Recognised shapes:
+
+    * Claude — ``anthropic/`` or ``anthropic.`` provider prefix, or a
+      bare ``claude-`` model id with no provider prefix
+      (``claude-opus-4.7``, ``anthropic/claude-sonnet-4-6``,
+      ``anthropic.claude-3-5-sonnet``). A non-Anthropic prefix like
+      ``someprovider/claude-mock`` is rejected on purpose.
+    * Kimi — ``moonshotai/`` provider prefix, or a ``kimi-`` model id
+      with no provider prefix (``kimi-k2.6``,
+      ``moonshotai/kimi-k2-thinking``). Like Claude, a non-Moonshot
+      prefix is rejected — with one exception: ``openrouter/kimi-k2.6``
+      stays recognised because ``openrouter/`` is how we route to
+      Moonshot today, and changing that would be a behaviour regression
+      for existing deployments.
+    """
+    lowered = model.lower()
+    if lowered.startswith(("anthropic/", "anthropic.")):
+        return True
+    if lowered.startswith("moonshotai/"):
+        return True
+    # ``openrouter/`` historically routes to whatever the default
+    # upstream for the model is — for kimi that's Moonshot, so accept
+    # ``openrouter/kimi-...`` here. Every other ``openrouter/`` model
+    # (e.g. ``openrouter/auto``) carries a provider prefix and is
+    # therefore rejected by the ``"/" in lowered`` check below; the
+    # bare ``claude-`` / ``kimi-`` fallback only applies to ids with
+    # no prefix at all.
+    if lowered.startswith("openrouter/kimi-"):
+        return True
+    if "/" in lowered:
+        # Any other provider prefix is a custom / non-Anthropic /
+        # non-Moonshot route and must not opt into reasoning. This
+        # blocks substring false positives like
+        # ``some-provider/claude-mock-v1`` or ``other/kimi-pro``.
+        return False
+    # No provider prefix — accept bare ``claude-*`` and ``kimi-*`` ids
+    # so direct CLI configs (``claude-3-5-sonnet-20241022``,
+    # ``kimi-k2-instruct``) keep working.
+    return lowered.startswith("claude-") or lowered.startswith("kimi-")
+
+
 def reasoning_extra_body(model: str, max_thinking_tokens: int) -> dict[str, Any] | None:
     """Build the ``extra_body["reasoning"]`` fragment for the OpenAI client.
 
-    Returns ``None`` for non-Anthropic routes (other OpenRouter providers
-    ignore the field but we skip it anyway to keep the payload minimal)
-    and for ``max_thinking_tokens <= 0`` (operator kill switch).
+    Returns ``None`` for non-reasoning routes and for
+    ``max_thinking_tokens <= 0`` (operator kill switch).
     """
-    # Imported lazily to avoid pulling service.py at module load — service.py
-    # imports this module, and the lazy import keeps the dependency one-way.
-    from backend.copilot.baseline.service import _is_anthropic_model
-
-    if not _is_anthropic_model(model) or max_thinking_tokens <= 0:
+    if not _is_reasoning_route(model) or max_thinking_tokens <= 0:
         return None
     return {"reasoning": {"max_tokens": max_thinking_tokens}}
@@ -177,11 +247,24 @@ class BaselineReasoningEmitter:
     def __init__(
         self,
         session_messages: list[ChatMessage] | None = None,
+        *,
+        coalesce_min_chars: int = _COALESCE_MIN_CHARS,
+        coalesce_max_interval_ms: float = _COALESCE_MAX_INTERVAL_MS,
     ) -> None:
         self._block_id: str = str(uuid.uuid4())
         self._open: bool = False
         self._session_messages = session_messages
         self._current_row: ChatMessage | None = None
+        # Coalescing state — ``_pending_delta`` accumulates reasoning text
+        # between wire flushes. Providers like Kimi K2.6 emit very fine-
+        # grained chunks; batching them reduces Redis ``xadd`` + SSE + React
+        # re-render load by ~100x for equivalent text output. Tuning knobs
+        # are kwargs so tests can disable coalescing (``=0``) for
+        # deterministic event assertions.
+        self._coalesce_min_chars = coalesce_min_chars
+        self._coalesce_max_interval_ms = coalesce_max_interval_ms
+        self._pending_delta: str = ""
+        self._last_flush_monotonic: float = 0.0
 
     @property
     def is_open(self) -> bool:
@@ -192,39 +275,86 @@ class BaselineReasoningEmitter:
 
         Empty list when the chunk carries no reasoning payload, so this
         is safe to call on every chunk without guarding at the call site.
-        Persistence (when a session message list is attached) happens in
-        lockstep with emission so the row's content stays equal to the
-        concatenated deltas at every delta boundary.
+
+        Persistence (when a session message list is attached) stays
+        per-delta, so at every chunk boundary the DB row's content
+        equals the full reasoning text received so far, independent of
+        the coalescing window (the wire may lag by whatever still sits
+        in the pending buffer). Only the wire emission is batched.
         """
         ext = OpenRouterDeltaExtension.from_delta(delta)
         text = ext.visible_text()
         if not text:
             return []
 
         events: list[StreamBaseResponse] = []
+        # First reasoning text in this block — emit Start + the first Delta
+        # atomically so the frontend Reasoning collapse renders immediately
+        # rather than waiting for the coalesce window to elapse. Subsequent
+        # chunks buffer into ``_pending_delta`` and only flush when the
+        # char/time thresholds trip.
+        # Sample the monotonic clock exactly once per chunk — at ~4,700
+        # chunks per turn, folding the two calls into one cuts ~4,700
+        # clock reads off the hot path without changing semantics.
+        now = time.monotonic()
         if not self._open:
             events.append(StreamReasoningStart(id=self._block_id))
+            events.append(StreamReasoningDelta(id=self._block_id, delta=text))
             self._open = True
+            self._last_flush_monotonic = now
             if self._session_messages is not None:
-                self._current_row = ChatMessage(role="reasoning", content="")
+                self._current_row = ChatMessage(role="reasoning", content=text)
                 self._session_messages.append(self._current_row)
-        events.append(StreamReasoningDelta(id=self._block_id, delta=text))
+            return events
+
+        # Persist per-delta (no coalescing here — the session snapshot stays
+        # consistent at every chunk boundary, independent of the wire
+        # coalesce window).
if self._current_row is not None: self._current_row.content = (self._current_row.content or "") + text + + self._pending_delta += text + if self._should_flush_pending(now): + events.append( + StreamReasoningDelta(id=self._block_id, delta=self._pending_delta) + ) + self._pending_delta = "" + self._last_flush_monotonic = now return events + def _should_flush_pending(self, now: float) -> bool: + """Return True when the accumulated delta should be emitted now. + + *now* is the monotonic timestamp sampled by the caller so the + clock is read at most once per chunk (the flush-timestamp update + reuses the same value). + """ + if not self._pending_delta: + return False + if len(self._pending_delta) >= self._coalesce_min_chars: + return True + elapsed_ms = (now - self._last_flush_monotonic) * 1000.0 + return elapsed_ms >= self._coalesce_max_interval_ms + def close(self) -> list[StreamBaseResponse]: """Emit ``StreamReasoningEnd`` for the open block (if any) and rotate. - Idempotent — returns ``[]`` when no block is open. The id rotation - guarantees the next reasoning block starts with a fresh id rather - than reusing one already closed on the wire. The persisted row is - not removed — it stays in ``session_messages`` as the durable - record of what was reasoned. + Idempotent — returns ``[]`` when no block is open. Drains any + still-buffered delta first so the frontend never loses tail text + from the coalesce window. The id rotation guarantees the next + reasoning block starts with a fresh id rather than reusing one + already closed on the wire. The persisted row is not removed — + it stays in ``session_messages`` as the durable record of what + was reasoned. """ if not self._open: return [] - event = StreamReasoningEnd(id=self._block_id) + events: list[StreamBaseResponse] = [] + if self._pending_delta: + events.append( + StreamReasoningDelta(id=self._block_id, delta=self._pending_delta) + ) + self._pending_delta = "" + events.append(StreamReasoningEnd(id=self._block_id)) self._open = False self._block_id = str(uuid.uuid4()) self._current_row = None - return [event] + return events diff --git a/autogpt_platform/backend/backend/copilot/baseline/reasoning_test.py b/autogpt_platform/backend/backend/copilot/baseline/reasoning_test.py index df64086d5f..e18c8066e4 100644 --- a/autogpt_platform/backend/backend/copilot/baseline/reasoning_test.py +++ b/autogpt_platform/backend/backend/copilot/baseline/reasoning_test.py @@ -12,6 +12,7 @@ from backend.copilot.baseline.reasoning import ( BaselineReasoningEmitter, OpenRouterDeltaExtension, ReasoningDetail, + _is_reasoning_route, reasoning_extra_body, ) from backend.copilot.model import ChatMessage @@ -135,6 +136,59 @@ class TestOpenRouterDeltaExtension: assert ext.visible_text() == "real" +class TestIsReasoningRoute: + def test_anthropic_routes(self): + assert _is_reasoning_route("anthropic/claude-sonnet-4-6") + assert _is_reasoning_route("claude-3-5-sonnet-20241022") + assert _is_reasoning_route("anthropic.claude-3-5-sonnet") + assert _is_reasoning_route("ANTHROPIC/Claude-Opus") # case-insensitive + + def test_moonshot_kimi_routes(self): + # OpenRouter advertises the ``reasoning`` extension on Moonshot + # endpoints — both K2.6 (the new baseline default) and the + # reasoning-native kimi-k2-thinking variant. 
+        assert _is_reasoning_route("moonshotai/kimi-k2.6")
+        assert _is_reasoning_route("moonshotai/kimi-k2-thinking")
+        assert _is_reasoning_route("moonshotai/kimi-k2.5")
+        # Direct (non-OpenRouter) model ids also resolve via the ``kimi-``
+        # prefix so a future bare ``kimi-k3`` id would still match.
+        assert _is_reasoning_route("kimi-k2-instruct")
+        # ``openrouter/kimi-*`` is a deliberate carve-out, not generic
+        # final-segment matching: ``openrouter/`` is how we route to
+        # Moonshot today, so rejecting it would regress existing deploys.
+        assert _is_reasoning_route("openrouter/kimi-k2.6")
+
+    def test_other_providers_rejected(self):
+        assert not _is_reasoning_route("openai/gpt-4o")
+        assert not _is_reasoning_route("google/gemini-2.5-pro")
+        assert not _is_reasoning_route("xai/grok-4")
+        assert not _is_reasoning_route("meta-llama/llama-3.3-70b-instruct")
+        assert not _is_reasoning_route("deepseek/deepseek-r1")
+
+    def test_kimi_substring_false_positives_rejected(self):
+        # Regression: the previous implementation matched any model whose
+        # name contained the substring ``kimi`` — including unrelated model
+        # ids like ``hakimi``. The anchored match below rejects them.
+        assert not _is_reasoning_route("some-provider/hakimi-large")
+        assert not _is_reasoning_route("hakimi")
+        assert not _is_reasoning_route("akimi-7b")
+
+    def test_claude_substring_false_positives_rejected(self):
+        # Regression (Sentry review on #12871): ``'claude' in lowered``
+        # matched any substring — a custom
+        # ``someprovider/claude-mock-v1`` set via
+        # ``CHAT_FAST_STANDARD_MODEL`` would inherit the reasoning
+        # extra_body and take a 400 from its upstream. The anchored
+        # match requires an ``anthropic/`` or ``anthropic.`` prefix,
+        # or a bare ``claude-`` id with no provider prefix.
+        assert not _is_reasoning_route("someprovider/claude-mock-v1")
+        assert not _is_reasoning_route("custom/claude-like-model")
+        # Same principle for Kimi — a non-Moonshot provider prefix is
+        # rejected even when the model id starts with ``kimi-``.
+        assert not _is_reasoning_route("other/kimi-pro")
+
+
 class TestReasoningExtraBody:
     def test_anthropic_route_returns_fragment(self):
         assert reasoning_extra_body("anthropic/claude-sonnet-4-6", 4096) == {
@@ -146,16 +200,30 @@ class TestReasoningExtraBody:
             "reasoning": {"max_tokens": 2048}
         }
 
-    def test_non_anthropic_route_returns_none(self):
+    def test_kimi_routes_return_fragment(self):
+        # Kimi K2.6 ships the same OpenRouter ``reasoning`` extension as
+        # Anthropic, so the gate widened with this PR and the fragment
+        # must now materialise on Moonshot routes too.
+        assert reasoning_extra_body("moonshotai/kimi-k2.6", 8192) == {
+            "reasoning": {"max_tokens": 8192}
+        }
+        assert reasoning_extra_body("moonshotai/kimi-k2-thinking", 4096) == {
+            "reasoning": {"max_tokens": 4096}
+        }
+
+    def test_non_reasoning_route_returns_none(self):
        assert reasoning_extra_body("openai/gpt-4o", 4096) is None
        assert reasoning_extra_body("google/gemini-2.5-pro", 4096) is None
+        assert reasoning_extra_body("xai/grok-4", 4096) is None
 
     def test_zero_max_tokens_kill_switch(self):
         # Operator kill switch: ``max_thinking_tokens <= 0`` disables the
-        # ``reasoning`` extra_body fragment even on an Anthropic route.
-        # Lets us silence reasoning without dropping the SDK path's budget.
+        # ``reasoning`` extra_body fragment on ANY reasoning route (Anthropic
+        # or Kimi). Lets us silence reasoning without dropping the SDK
+        # path's budget.
assert reasoning_extra_body("anthropic/claude-sonnet-4-6", 0) is None assert reasoning_extra_body("anthropic/claude-sonnet-4-6", -1) is None + assert reasoning_extra_body("moonshotai/kimi-k2.6", 0) is None class TestBaselineReasoningEmitter: @@ -171,7 +239,12 @@ class TestBaselineReasoningEmitter: assert emitter.is_open is True def test_subsequent_deltas_reuse_block_id_without_new_start(self): - emitter = BaselineReasoningEmitter() + # Disable coalescing so each chunk flushes immediately — this test + # is about the Start/Delta/block-id state machine, not the coalesce + # window. Coalescing behaviour is covered below. + emitter = BaselineReasoningEmitter( + coalesce_min_chars=0, coalesce_max_interval_ms=0 + ) first = emitter.on_delta(_delta(reasoning="a")) second = emitter.on_delta(_delta(reasoning="b")) @@ -226,6 +299,106 @@ class TestBaselineReasoningEmitter: assert deltas[0].delta == "plan: do the thing" +class TestReasoningDeltaCoalescing: + """Coalescing batches fine-grained provider chunks into bigger wire + frames. OpenRouter's Kimi K2.6 emits ~4,700 reasoning-delta chunks + per turn vs ~28 for Sonnet; without batching, every chunk becomes one + Redis ``xadd`` + one SSE event + one React re-render of the + non-virtualised chat list, which paint-storms the browser. These + tests pin the batching contract: small chunks buffer until the + char-size or time threshold trips, large chunks still flush + immediately, and ``close()`` never drops tail text.""" + + def test_small_chunks_after_first_buffer_until_threshold(self): + # Generous time threshold so size alone controls flush timing. + emitter = BaselineReasoningEmitter( + coalesce_min_chars=32, coalesce_max_interval_ms=60_000 + ) + # First chunk always flushes immediately (so UI renders without + # waiting). + first = emitter.on_delta(_delta(reasoning="hi ")) + assert any(isinstance(e, StreamReasoningStart) for e in first) + assert sum(isinstance(e, StreamReasoningDelta) for e in first) == 1 + + # Subsequent small chunks buffer silently — 5 × 4 chars = 20 chars, + # still under the 32-char threshold. + for _ in range(5): + assert emitter.on_delta(_delta(reasoning="abcd")) == [] + + # Once the threshold is crossed, the accumulated buffer flushes + # as a single StreamReasoningDelta carrying every buffered chunk. + flush = emitter.on_delta(_delta(reasoning="efghijklmnop")) + assert len(flush) == 1 + assert isinstance(flush[0], StreamReasoningDelta) + assert flush[0].delta == "abcd" * 5 + "efghijklmnop" + + def test_time_based_flush_when_chars_stay_below_threshold(self, monkeypatch): + # Fake ``time.monotonic`` so we can drive the time-based branch + # deterministically without real sleeps. + from backend.copilot.baseline import reasoning as rmod + + fake_now = [0.0] + monkeypatch.setattr(rmod.time, "monotonic", lambda: fake_now[0]) + + emitter = BaselineReasoningEmitter( + coalesce_min_chars=1000, coalesce_max_interval_ms=40 + ) + # t=0: first chunk flushes immediately. + first = emitter.on_delta(_delta(reasoning="a")) + assert sum(isinstance(e, StreamReasoningDelta) for e in first) == 1 + + # t=10 ms: still under 40 ms → buffer. + fake_now[0] = 0.010 + assert emitter.on_delta(_delta(reasoning="b")) == [] + + # t=50 ms since last flush → time threshold trips, flush fires. 
+ fake_now[0] = 0.060 + flushed = emitter.on_delta(_delta(reasoning="c")) + assert len(flushed) == 1 + assert isinstance(flushed[0], StreamReasoningDelta) + assert flushed[0].delta == "bc" + + def test_close_flushes_tail_buffer_before_end(self): + emitter = BaselineReasoningEmitter( + coalesce_min_chars=1000, coalesce_max_interval_ms=60_000 + ) + emitter.on_delta(_delta(reasoning="first")) # flushes (first chunk) + emitter.on_delta(_delta(reasoning=" middle ")) # buffered + emitter.on_delta(_delta(reasoning="tail")) # buffered + + events = emitter.close() + assert len(events) == 2 + assert isinstance(events[0], StreamReasoningDelta) + assert events[0].delta == " middle tail" + assert isinstance(events[1], StreamReasoningEnd) + + def test_coalesce_disabled_flushes_every_chunk(self): + emitter = BaselineReasoningEmitter( + coalesce_min_chars=0, coalesce_max_interval_ms=0 + ) + first = emitter.on_delta(_delta(reasoning="a")) + second = emitter.on_delta(_delta(reasoning="b")) + assert sum(isinstance(e, StreamReasoningDelta) for e in first) == 1 + assert sum(isinstance(e, StreamReasoningDelta) for e in second) == 1 + + def test_persistence_stays_per_delta_even_when_wire_coalesces(self): + """DB row content must track every chunk so a crash mid-turn + persists the full reasoning-so-far, even if the coalesce window + never flushed those chunks to the wire.""" + session: list[ChatMessage] = [] + emitter = BaselineReasoningEmitter( + session, + coalesce_min_chars=1000, + coalesce_max_interval_ms=60_000, + ) + emitter.on_delta(_delta(reasoning="first ")) + emitter.on_delta(_delta(reasoning="chunk ")) + emitter.on_delta(_delta(reasoning="three")) + # No close; verify the persisted row already has everything. + assert len(session) == 1 + assert session[0].content == "first chunk three" + + class TestReasoningPersistence: """The persistence contract: without ``role="reasoning"`` rows in session.messages, useHydrateOnStreamEnd overwrites the live-streamed diff --git a/autogpt_platform/backend/backend/copilot/baseline/service.py b/autogpt_platform/backend/backend/copilot/baseline/service.py index 474a6834b1..6aa88e9d41 100644 --- a/autogpt_platform/backend/backend/copilot/baseline/service.py +++ b/autogpt_platform/backend/backend/copilot/baseline/service.py @@ -321,14 +321,17 @@ def _filter_tools_by_permissions( def _resolve_baseline_model(tier: CopilotLlmModel | None) -> str: """Pick the model for the baseline path based on the per-request tier. - The baseline (fast) and SDK (extended thinking) paths now share the - same tier-based model resolution — only the *path* differs between - "fast" and "extended_thinking". ``'advanced'`` → Opus; - ``'standard'`` / ``None`` → the config default (Sonnet). + Baseline resolves independently of SDK via the ``fast_*_model`` cells + of the (path, tier) matrix. ``'standard'`` / ``None`` picks Kimi + K2.6 by default (cheap + OpenRouter ``reasoning`` support); + ``'advanced'`` picks Opus by default so the advanced tier is a clean + A/B against the SDK advanced tier — same model, different path — + isolating reasoning-wire + cache differences from model capability. + Both defaults are overridable per ``CHAT_FAST_*_MODEL`` env vars. 
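+
+    Illustrative resolution against the shipped defaults (both cells
+    stay env-overridable, so these are defaults, not invariants)::
+
+        _resolve_baseline_model(None)        # -> config.fast_standard_model
+        _resolve_baseline_model("advanced")  # -> config.fast_advanced_model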
""" - from backend.copilot.service import resolve_chat_model - - return resolve_chat_model(tier) + if tier == "advanced": + return config.fast_advanced_model + return config.fast_standard_model @dataclass @@ -761,6 +764,19 @@ async def _baseline_tool_executor( ) ) + # Announce the tool call to the session so in-turn guards like + # ``require_guide_read`` can see it *right now*, before the tool + # actually runs. Without this, the tool_call row lives only in + # ``state.session_messages`` until the ``finally`` block flushes it + # into ``session.messages`` at turn end — so a second tool in the + # same turn (e.g. ``create_agent`` after ``get_agent_building_guide``) + # scans a stale ``session.messages`` and the guard re-fires despite + # the guide having been called. The announce-set is cleared at turn + # end; we deliberately don't touch ``session.messages`` here to avoid + # duplicating the assistant row that ``_baseline_conversation_updater`` + # will append at round end. + session.announce_inflight_tool_call(tool_name) + try: result: StreamToolOutputAvailable = await execute_tool( tool_name=tool_name, @@ -1806,6 +1822,16 @@ async def stream_chat_completion_baseline( yield StreamError(errorText=error_msg, code="baseline_error") # Still persist whatever we got finally: + # In-flight tool-call announcements are only meaningful for the + # current turn; clear at the top of the outer finally so the next + # turn starts with a clean scratch buffer even if one of the + # awaited cleanup steps below (usage persistence, session upsert, + # transcript upload) raises. The buffer is a process-local scratch + # set — if we leak it into the next turn the guide-read guard would + # observe a phantom in-flight call and skip its gate, so this must + # run unconditionally. + session.clear_inflight_tool_calls() + # Pending messages are drained atomically at turn start and # between tool rounds, so there's nothing to clear in finally. # Any message pushed after the final drain window stays in the diff --git a/autogpt_platform/backend/backend/copilot/baseline/service_unit_test.py b/autogpt_platform/backend/backend/copilot/baseline/service_unit_test.py index 03a9ef99c9..5a95c5c901 100644 --- a/autogpt_platform/backend/backend/copilot/baseline/service_unit_test.py +++ b/autogpt_platform/backend/backend/copilot/baseline/service_unit_test.py @@ -1404,6 +1404,16 @@ class TestApplyPromptCacheMarkers: assert not _is_anthropic_model("xai/grok-4") assert not _is_anthropic_model("meta-llama/llama-3.3-70b-instruct") + def test_is_anthropic_model_rejects_kimi_routes(self): + """Regression guard: Kimi K2.6 is a reasoning route (reasoning + extra_body is sent) but NOT an Anthropic route — Moonshot does + its own auto prompt caching, so ``cache_control`` markers must + NOT be applied. OpenRouter silently drops them today, but if + they ever start failing fast we'd want the gate tight.""" + assert not _is_anthropic_model("moonshotai/kimi-k2.6") + assert not _is_anthropic_model("moonshotai/kimi-k2-thinking") + assert not _is_anthropic_model("kimi-k2-instruct") + def test_cache_control_uses_configured_ttl(self, monkeypatch): """TTL comes from ChatConfig.baseline_prompt_cache_ttl — defaults to 1h so the static prefix (system + tools) stays warm across @@ -1829,7 +1839,7 @@ class TestBaselineReasoningStreaming: @pytest.mark.asyncio async def test_reasoning_param_absent_on_non_anthropic_routes(self): - """Non-Anthropic routes (e.g. OpenAI) must not receive ``reasoning``.""" + """Non-reasoning routes (e.g. 
OpenAI) must not receive ``reasoning``.""" state = _BaselineStreamState(model="openai/gpt-4o") mock_client = MagicMock() @@ -1850,6 +1860,54 @@ class TestBaselineReasoningStreaming: extra_body = mock_client.chat.completions.create.call_args[1]["extra_body"] assert "reasoning" not in extra_body + @pytest.mark.asyncio + async def test_kimi_route_sends_reasoning_but_no_cache_control(self): + """Kimi K2.6 is the default fast_model and sends ``reasoning`` via + OpenRouter's unified extension. It must NOT receive ``cache_control`` + markers or the ``anthropic-beta`` header — Moonshot uses its own + auto-caching and those Anthropic-only fields would either get + silently dropped or (worst case) 400 on a future provider change.""" + state = _BaselineStreamState(model="moonshotai/kimi-k2.6") + + mock_client = MagicMock() + mock_client.chat.completions.create = AsyncMock( + return_value=_make_stream_mock() + ) + + with patch( + "backend.copilot.baseline.service._get_openai_client", + return_value=mock_client, + ): + await _baseline_llm_caller( + messages=[ + {"role": "system", "content": "you are a helpful assistant"}, + {"role": "user", "content": "hi"}, + ], + tools=[ + { + "type": "function", + "function": {"name": "echo", "parameters": {}}, + } + ], + state=state, + ) + + call_kwargs = mock_client.chat.completions.create.call_args[1] + extra_body = call_kwargs["extra_body"] + # Reasoning param on — the whole point of picking Kimi is the + # cheap-but-still-reasoning-capable path. + assert "reasoning" in extra_body + assert extra_body["reasoning"]["max_tokens"] > 0 + # Anthropic-only fields stay off. + assert "extra_headers" not in call_kwargs + sys_msg = call_kwargs["messages"][0] + sys_content = sys_msg.get("content") + if isinstance(sys_content, list): + assert all("cache_control" not in block for block in sys_content) + tools = call_kwargs.get("tools", []) + for t in tools: + assert "cache_control" not in t + @pytest.mark.asyncio async def test_reasoning_only_stream_still_closes_block(self): """Regression: a stream with only reasoning (no text, no tool_call) diff --git a/autogpt_platform/backend/backend/copilot/baseline/transcript_integration_test.py b/autogpt_platform/backend/backend/copilot/baseline/transcript_integration_test.py index 8d6fb50a53..808b06eb32 100644 --- a/autogpt_platform/backend/backend/copilot/baseline/transcript_integration_test.py +++ b/autogpt_platform/backend/backend/copilot/baseline/transcript_integration_test.py @@ -63,21 +63,123 @@ def _make_session_messages(*roles: str) -> list[ChatMessage]: class TestResolveBaselineModel: - """Baseline model resolution honours the per-request tier toggle.""" + """Baseline model resolution honours the per-request tier toggle. - def test_advanced_tier_selects_advanced_model(self): - assert _resolve_baseline_model("advanced") == config.advanced_model + Baseline reads the ``fast_*_model`` cells of the (path, tier) matrix + and never falls through to the SDK-side ``thinking_*_model`` cells. 
+ Default routing: + - ``standard`` / ``None`` → ``config.fast_standard_model`` (Kimi K2.6) + - ``advanced`` → ``config.fast_advanced_model`` (Opus — same as SDK's + advanced tier, so the advanced A/B isolates path differences) + """ - def test_standard_tier_selects_default_model(self): - assert _resolve_baseline_model("standard") == config.model + def test_advanced_tier_selects_fast_advanced_model(self): + assert _resolve_baseline_model("advanced") == config.fast_advanced_model - def test_none_tier_selects_default_model(self): - """Baseline users without a tier MUST keep the default (standard).""" - assert _resolve_baseline_model(None) == config.model + def test_standard_tier_selects_fast_standard_model(self): + assert _resolve_baseline_model("standard") == config.fast_standard_model - def test_standard_and_advanced_models_differ(self): - """Advanced tier defaults to a different (Opus) model than standard.""" - assert config.model != config.advanced_model + def test_none_tier_selects_fast_standard_model(self): + """Baseline users without a tier get the cheap fast-standard default.""" + assert _resolve_baseline_model(None) == config.fast_standard_model + + def test_fast_standard_default_is_kimi(self): + """Shipped default: Kimi K2.6 on the baseline standard cell. + + Asserts the declared ``Field`` default — env-independent — so a + deploy-time ``CHAT_FAST_STANDARD_MODEL`` rollback override + doesn't fail CI while still pinning the shipped default. + """ + from backend.copilot.config import ChatConfig + + assert ( + ChatConfig.model_fields["fast_standard_model"].default + == "moonshotai/kimi-k2.6" + ) + + def test_fast_advanced_default_is_opus(self): + """Shipped default: Opus on the baseline advanced cell — mirrors + the SDK advanced cell so the advanced-tier A/B stays clean + (same model, different path).""" + from backend.copilot.config import ChatConfig + + assert ( + ChatConfig.model_fields["fast_advanced_model"].default + == "anthropic/claude-opus-4.7" + ) + + def test_standard_cells_diverge_across_paths(self): + """The whole point of the split: baseline cheap (Kimi) vs SDK + Anthropic-only (Sonnet). If the shipped standard defaults ever + collapse to the same value someone lost the cost savings. + Checked against ``Field`` defaults, not the env-backed singleton.""" + from backend.copilot.config import ChatConfig + + assert ( + ChatConfig.model_fields["thinking_standard_model"].default + != ChatConfig.model_fields["fast_standard_model"].default + ) + + def test_standard_and_advanced_cells_differ_on_fast(self): + """Advanced tier defaults to a different model than standard on + the baseline path. Checked against declared ``Field`` defaults + so operator env overrides don't flake the test.""" + from backend.copilot.config import ChatConfig + + assert ( + ChatConfig.model_fields["fast_standard_model"].default + != ChatConfig.model_fields["fast_advanced_model"].default + ) + + def test_legacy_env_aliases_route_to_new_fields(self, monkeypatch): + """Backward compat: the pre-split env var names must still bind. + + The four-field matrix was introduced with ``validation_alias`` + entries so that existing deployments setting ``CHAT_MODEL`` / + ``CHAT_ADVANCED_MODEL`` / ``CHAT_FAST_MODEL`` continue to override + the same effective cell without a rename. Construct a fresh + ``ChatConfig`` with each legacy name set and confirm it lands on + the new field. 
+ """ + from backend.copilot.config import ChatConfig + + monkeypatch.setenv("CHAT_MODEL", "legacy/sonnet-via-chat-model") + monkeypatch.setenv("CHAT_ADVANCED_MODEL", "legacy/opus-via-advanced") + monkeypatch.setenv("CHAT_FAST_MODEL", "legacy/fast-via-fast-model") + + cfg = ChatConfig() + + assert cfg.thinking_standard_model == "legacy/sonnet-via-chat-model" + assert cfg.thinking_advanced_model == "legacy/opus-via-advanced" + assert cfg.fast_standard_model == "legacy/fast-via-fast-model" + + def test_all_four_new_env_vars_bind_to_their_cells(self, monkeypatch): + """Each of the four (path, tier) cells must be overridable via + its documented ``CHAT_*_*_MODEL`` env var — including + ``CHAT_FAST_ADVANCED_MODEL`` which was missing a + ``validation_alias`` in the original split and only bound + implicitly through ``env_prefix``. Pinning all four here so + that whenever someone touches the config shape, an accidental + unbinding fails CI instead of silently ignoring operator + overrides. + """ + from backend.copilot.config import ChatConfig + + monkeypatch.setenv("CHAT_FAST_STANDARD_MODEL", "explicit/fast-std") + monkeypatch.setenv("CHAT_FAST_ADVANCED_MODEL", "explicit/fast-adv") + monkeypatch.setenv("CHAT_THINKING_STANDARD_MODEL", "explicit/think-std") + monkeypatch.setenv("CHAT_THINKING_ADVANCED_MODEL", "explicit/think-adv") + # Clear the legacy aliases so they don't win priority in + # ``AliasChoices`` (first match wins). + for legacy in ("CHAT_MODEL", "CHAT_ADVANCED_MODEL", "CHAT_FAST_MODEL"): + monkeypatch.delenv(legacy, raising=False) + + cfg = ChatConfig() + + assert cfg.fast_standard_model == "explicit/fast-std" + assert cfg.fast_advanced_model == "explicit/fast-adv" + assert cfg.thinking_standard_model == "explicit/think-std" + assert cfg.thinking_advanced_model == "explicit/think-adv" class TestLoadPriorTranscript: diff --git a/autogpt_platform/backend/backend/copilot/config.py b/autogpt_platform/backend/backend/copilot/config.py index 1bb63fe1da..1183217f37 100644 --- a/autogpt_platform/backend/backend/copilot/config.py +++ b/autogpt_platform/backend/backend/copilot/config.py @@ -3,7 +3,7 @@ import os from typing import Literal -from pydantic import Field, field_validator +from pydantic import AliasChoices, Field, field_validator from pydantic_settings import BaseSettings from backend.util.clients import OPENROUTER_BASE_URL @@ -17,8 +17,12 @@ from backend.util.clients import OPENROUTER_BASE_URL CopilotMode = Literal["fast", "extended_thinking"] # Per-request model tier set by the frontend model toggle. -# 'standard' uses ``ChatConfig.model`` (Sonnet by default). -# 'advanced' uses ``ChatConfig.advanced_model`` (Opus by default). +# 'standard' picks the cheaper everyday model for the active path — +# ``fast_standard_model`` on the baseline path, ``thinking_standard_model`` +# on the SDK path. +# 'advanced' picks the premium model for the active path — ``fast_advanced_model`` +# on the baseline path, ``thinking_advanced_model`` on the SDK path (both +# default to Opus today). # None means no preference — falls through to LD per-user targeting, then config. # Using tier names instead of model names keeps the contract model-agnostic. CopilotLlmModel = Literal["standard", "advanced"] @@ -27,21 +31,61 @@ CopilotLlmModel = Literal["standard", "advanced"] class ChatConfig(BaseSettings): """Configuration for the chat system.""" - # Chat model tiers — applied orthogonally to the path (fast=baseline vs - # extended_thinking=SDK). 
The "fast" vs "extended_thinking" toggle picks - # which code path runs (no reasoning / heavy SDK); "standard" vs - # "advanced" picks the model inside that path. - model: str = Field( - default="anthropic/claude-sonnet-4-6", - description="Model used for the 'standard' tier (Sonnet by default). " - "Applies to both baseline (fast) and SDK (extended thinking) paths. " - "Override via CHAT_MODEL env var.", + # Chat model tiers — a 2×2 of (path, tier). ``path`` = ``CopilotMode`` + # (``"fast"`` → baseline OpenAI-compat / any OpenRouter model; + # ``"extended_thinking"`` → Claude Agent SDK, Anthropic-only CLI). + # ``tier`` = ``CopilotLlmModel`` (``"standard"`` / ``"advanced"``). + # Each cell has its own config so the two paths can evolve + # independently (cheap provider on baseline, Anthropic on SDK) at each + # tier without conflating one path's needs with the other's constraint. + # + # Historical env var names (``CHAT_MODEL`` / ``CHAT_ADVANCED_MODEL`` / + # ``CHAT_FAST_MODEL``) are preserved via ``validation_alias`` so + # existing deployments continue to override the same effective cell. + fast_standard_model: str = Field( + default="moonshotai/kimi-k2.6", + validation_alias=AliasChoices( + "CHAT_FAST_STANDARD_MODEL", + "CHAT_FAST_MODEL", + ), + description="Baseline path, 'standard' / ``None`` tier. Kimi K2.6 " + "by default: ~5x cheaper input and ~5.4x cheaper output than Sonnet, " + "SWE-Bench Verified parity with Opus, and OpenRouter advertises the " + "``reasoning`` + ``include_reasoning`` extension params on the " + "Moonshot endpoints — so the baseline reasoning plumbing lights up " + "without provider-specific code. Roll back to the Anthropic route " + "via ``CHAT_FAST_STANDARD_MODEL=anthropic/claude-sonnet-4-6`` (then " + "``cache_control`` breakpoints reactivate via " + "``_is_anthropic_model``).", ) - advanced_model: str = Field( - default="anthropic/claude-opus-4-7", - description="Model used for the 'advanced' tier (Opus by default). " - "Applies to both baseline (fast) and SDK (extended thinking) paths. " - "Override via CHAT_ADVANCED_MODEL env var.", + fast_advanced_model: str = Field( + default="anthropic/claude-opus-4.7", + validation_alias=AliasChoices("CHAT_FAST_ADVANCED_MODEL"), + description="Baseline path, 'advanced' tier. Opus by default. " + "Override via ``CHAT_FAST_ADVANCED_MODEL``.", + ) + thinking_standard_model: str = Field( + default="anthropic/claude-sonnet-4-6", + validation_alias=AliasChoices( + "CHAT_THINKING_STANDARD_MODEL", + "CHAT_MODEL", + ), + description="SDK (extended-thinking) path, 'standard' / ``None`` " + "tier. Sonnet by default: the Claude Agent SDK CLI only speaks to " + "Anthropic endpoints, so the standard SDK tier has to stay on an " + "Anthropic model regardless of what the baseline path runs. " + "Override via ``CHAT_THINKING_STANDARD_MODEL`` (legacy " + "``CHAT_MODEL`` still honored).", + ) + thinking_advanced_model: str = Field( + default="anthropic/claude-opus-4.7", + validation_alias=AliasChoices( + "CHAT_THINKING_ADVANCED_MODEL", + "CHAT_ADVANCED_MODEL", + ), + description="SDK (extended-thinking) path, 'advanced' tier. Opus " + "by default. 
Override via ``CHAT_THINKING_ADVANCED_MODEL`` " + "(legacy ``CHAT_ADVANCED_MODEL`` still honored).", ) title_model: str = Field( default="openai/gpt-4o-mini", @@ -426,3 +470,10 @@ class ChatConfig(BaseSettings): env_file = ".env" env_file_encoding = "utf-8" extra = "ignore" # Ignore extra environment variables + # Accept both the Python attribute name and the validation_alias when + # constructing a ``ChatConfig`` directly (e.g. in tests passing + # ``thinking_standard_model=...``). Without this, pydantic only + # accepts the alias names (``CHAT_THINKING_STANDARD_MODEL`` env) and + # rejects field-name kwargs — breaking ``ChatConfig(field=...)`` in + # every test that constructs a config. + populate_by_name = True diff --git a/autogpt_platform/backend/backend/copilot/model.py b/autogpt_platform/backend/backend/copilot/model.py index 3e4e8923ab..1adef8e7c8 100644 --- a/autogpt_platform/backend/backend/copilot/model.py +++ b/autogpt_platform/backend/backend/copilot/model.py @@ -20,7 +20,7 @@ from openai.types.chat.chat_completion_message_tool_call_param import ( ) from prisma.models import ChatMessage as PrismaChatMessage from prisma.models import ChatSession as PrismaChatSession -from pydantic import BaseModel +from pydantic import BaseModel, PrivateAttr from backend.data.db_accessors import chat_db, library_db from backend.data.graph import GraphSettings @@ -205,6 +205,15 @@ class ChatSessionInfo(BaseModel): class ChatSession(ChatSessionInfo): messages: list[ChatMessage] + # In-flight tool-call names for the CURRENT turn. Not persisted to + # DB and not serialised on the wire — ``PrivateAttr`` keeps this a + # process-local scratch buffer that's invisible to ``model_dump`` / + # ``model_dump_json`` / the redis cache path. Populated by the + # baseline tool executor the moment a tool is dispatched so in-turn + # guards (e.g. ``require_guide_read``) can see the call before it + # lands in ``messages`` at turn-end. Cleared when the turn + # completes. + _inflight_tool_calls: set[str] = PrivateAttr(default_factory=set) @classmethod def new( @@ -242,6 +251,56 @@ class ChatSession(ChatSessionInfo): messages=[ChatMessage.from_db(m) for m in prisma_session.Messages], ) + def announce_inflight_tool_call(self, tool_name: str) -> None: + """Record that *tool_name* is being dispatched in the current turn. + + Called by the baseline tool executor **before** the tool actually + runs (the announcement is about dispatch, not success). If the + tool raises, the name stays in the buffer for the rest of the + turn — that matches the guide-read gate's contract ("was the tool + called?") but means any future gate wanting *successful* + dispatches would need its own tracking. + + Lets in-turn guards (see + ``copilot/tools/helpers.py::require_guide_read``) see a tool + call the moment it's issued, instead of waiting for the + ``session.messages`` flush at turn end — fixing a loop where a + second tool in the same turn re-fires a guard despite the + guarding tool having already been called (seen on Kimi K2.6 in + particular because its aggressive tool-call chaining exercises + this path much more than Sonnet does). The buffer is cleared by + :meth:`clear_inflight_tool_calls` at turn end. + """ + self._inflight_tool_calls.add(tool_name) + + def clear_inflight_tool_calls(self) -> None: + """Reset the in-flight tool-call announcement buffer.""" + self._inflight_tool_calls.clear() + + def has_tool_been_called(self, tool_name: str) -> bool: + """True when *tool_name* has been called in this session. 
+ + Checks the in-flight announcement buffer (for calls dispatched + in the *current* turn but not yet flushed into ``messages``) and + the durable ``messages`` history (for past turns + prior rounds + within this turn whose writes already landed). The durable + scan is session-wide, not turn-scoped: a matching tool call + anywhere in ``messages`` counts. This matches the guide-read + contract — once the guide has been read in the session, the + agent doesn't need to re-read it for later create/edit/fix + tools. + """ + if tool_name in self._inflight_tool_calls: + return True + for msg in reversed(self.messages): + if msg.role != "assistant" or not msg.tool_calls: + continue + for tc in msg.tool_calls: + name = tc.get("function", {}).get("name") or tc.get("name") + if name == tool_name: + return True + return False + def add_tool_call_to_current_turn(self, tool_call: dict) -> None: """Attach a tool_call to the current turn's assistant message. diff --git a/autogpt_platform/backend/backend/copilot/permissions.py b/autogpt_platform/backend/backend/copilot/permissions.py index a87cad1e9b..58cce98fbf 100644 --- a/autogpt_platform/backend/backend/copilot/permissions.py +++ b/autogpt_platform/backend/backend/copilot/permissions.py @@ -107,6 +107,7 @@ ToolName = Literal[ "validate_agent_graph", "view_agent_output", "web_fetch", + "web_search", "write_workspace_file", # SDK built-ins "Agent", diff --git a/autogpt_platform/backend/backend/copilot/sdk/service.py b/autogpt_platform/backend/backend/copilot/sdk/service.py index 6c7493c045..69dda8f227 100644 --- a/autogpt_platform/backend/backend/copilot/sdk/service.py +++ b/autogpt_platform/backend/backend/copilot/sdk/service.py @@ -450,7 +450,9 @@ async def _reduce_context( # useful for the eventual upload_transcript call that seeds future turns. if transcript_content and not tried_compaction: compacted = await compact_transcript( - transcript_content, model=config.model, log_prefix=log_prefix + transcript_content, + model=config.thinking_standard_model, + log_prefix=log_prefix, ) if ( compacted @@ -700,7 +702,7 @@ def _resolve_sdk_model() -> str | None: """Resolve the model name for the Claude Agent SDK CLI. Uses `config.claude_agent_model` if set, otherwise derives from - `config.model` via :func:`_normalize_model_name`. + `config.thinking_standard_model` via :func:`_normalize_model_name`. When `use_claude_code_subscription` is enabled and no explicit `claude_agent_model` is set, returns `None` so the CLI uses the @@ -710,7 +712,7 @@ def _resolve_sdk_model() -> str | None: return config.claude_agent_model if config.use_claude_code_subscription: return None - return _normalize_model_name(config.model) + return _normalize_model_name(config.thinking_standard_model) def _resolve_fallback_model() -> str | None: @@ -739,7 +741,7 @@ async def _resolve_sdk_model_for_request( cost (reported by the SDK) already reflects model-pricing differences. 
""" if model == "advanced": - sdk_model = _normalize_model_name(config.advanced_model) + sdk_model = _normalize_model_name(config.thinking_advanced_model) logger.info( "[SDK] [%s] Per-request model override: advanced (%s)", session_id[:12] if session_id else "?", @@ -1191,7 +1193,10 @@ async def _compress_messages( try: result = await _run_compression( - messages_dict, config.model, "[SDK]", target_tokens=target_tokens + messages_dict, + config.thinking_standard_model, + "[SDK]", + target_tokens=target_tokens, ) except Exception as exc: # Guard against timeouts or unexpected errors in compression — @@ -3856,7 +3861,7 @@ async def stream_chat_completion_sdk( cache_creation_tokens=turn_cache_creation_tokens, log_prefix=log_prefix, cost_usd=turn_cost_usd, - model=sdk_model or config.model, + model=sdk_model or config.thinking_standard_model, provider="anthropic", ) diff --git a/autogpt_platform/backend/backend/copilot/sdk/service_helpers_test.py b/autogpt_platform/backend/backend/copilot/sdk/service_helpers_test.py index 3b919c6036..4eb5bc4ac2 100644 --- a/autogpt_platform/backend/backend/copilot/sdk/service_helpers_test.py +++ b/autogpt_platform/backend/backend/copilot/sdk/service_helpers_test.py @@ -364,9 +364,10 @@ class TestNormalizeModelName: """Unit tests for the model-name normalisation helper. The per-request model toggle calls _normalize_model_name with either - ``"anthropic/claude-opus-4-6"`` (for 'advanced') or ``config.model`` (for - 'standard'). These tests verify the OpenRouter/provider-prefix stripping - that keeps the value compatible with the Claude CLI. + ``config.thinking_advanced_model`` (for 'advanced') or + ``config.thinking_standard_model`` (for 'standard'). These tests verify + the OpenRouter/provider-prefix stripping that keeps the value compatible + with the Claude CLI. 
""" def test_strips_anthropic_prefix(self): diff --git a/autogpt_platform/backend/backend/copilot/sdk/service_test.py b/autogpt_platform/backend/backend/copilot/sdk/service_test.py index d47f67252a..619fce3017 100644 --- a/autogpt_platform/backend/backend/copilot/sdk/service_test.py +++ b/autogpt_platform/backend/backend/copilot/sdk/service_test.py @@ -395,7 +395,7 @@ class TestResolveSdkModel: from backend.copilot import config as cfg_mod cfg = cfg_mod.ChatConfig( - model="anthropic/claude-opus-4.6", + thinking_standard_model="anthropic/claude-opus-4.6", claude_agent_model=None, use_openrouter=True, api_key="or-key", @@ -412,7 +412,7 @@ class TestResolveSdkModel: from backend.copilot import config as cfg_mod cfg = cfg_mod.ChatConfig( - model="anthropic/claude-opus-4.6", + thinking_standard_model="anthropic/claude-opus-4.6", claude_agent_model=None, use_openrouter=False, api_key=None, @@ -430,7 +430,7 @@ class TestResolveSdkModel: from backend.copilot import config as cfg_mod cfg = cfg_mod.ChatConfig( - model="anthropic/claude-opus-4.6", + thinking_standard_model="anthropic/claude-opus-4.6", claude_agent_model=None, use_openrouter=True, api_key=None, @@ -447,7 +447,7 @@ class TestResolveSdkModel: from backend.copilot import config as cfg_mod cfg = cfg_mod.ChatConfig( - model="anthropic/claude-opus-4.6", + thinking_standard_model="anthropic/claude-opus-4.6", claude_agent_model="claude-sonnet-4-5-20250514", use_openrouter=True, api_key="or-key", @@ -462,7 +462,7 @@ class TestResolveSdkModel: from backend.copilot import config as cfg_mod cfg = cfg_mod.ChatConfig( - model="anthropic/claude-opus-4.6", + thinking_standard_model="anthropic/claude-opus-4.6", claude_agent_model=None, use_openrouter=False, api_key=None, @@ -477,7 +477,7 @@ class TestResolveSdkModel: from backend.copilot import config as cfg_mod cfg = cfg_mod.ChatConfig( - model="claude-opus-4.6", + thinking_standard_model="claude-opus-4.6", claude_agent_model=None, use_openrouter=False, api_key=None, diff --git a/autogpt_platform/backend/backend/copilot/sdk/tool_adapter.py b/autogpt_platform/backend/backend/copilot/sdk/tool_adapter.py index d97937da23..7e1fa0396d 100644 --- a/autogpt_platform/backend/backend/copilot/sdk/tool_adapter.py +++ b/autogpt_platform/backend/backend/copilot/sdk/tool_adapter.py @@ -779,7 +779,9 @@ def create_copilot_mcp_server(*, use_e2b: bool = False): # In E2B mode, all five are disabled — MCP equivalents provide direct sandbox # access. read_file also handles local tool-results and ephemeral reads. _SDK_BUILTIN_FILE_TOOLS = ["Read", "Write", "Edit", "Glob", "Grep"] -_SDK_BUILTIN_ALWAYS = ["Task", "Agent", "WebSearch", "TodoWrite"] +# WebSearch moved to ``SDK_DISALLOWED_TOOLS`` — routed through +# ``mcp__copilot__web_search`` so cost tracking is unified across paths. +_SDK_BUILTIN_ALWAYS = ["Task", "Agent", "TodoWrite"] _SDK_BUILTIN_TOOLS = [*_SDK_BUILTIN_FILE_TOOLS, *_SDK_BUILTIN_ALWAYS] # SDK built-in tools that must be explicitly blocked. 
@@ -805,6 +807,7 @@ _SDK_BUILTIN_TOOLS = [*_SDK_BUILTIN_FILE_TOOLS, *_SDK_BUILTIN_ALWAYS] SDK_DISALLOWED_TOOLS = [ "Bash", "WebFetch", + "WebSearch", "AskUserQuestion", "Write", "Edit", diff --git a/autogpt_platform/backend/backend/copilot/service.py b/autogpt_platform/backend/backend/copilot/service.py index e068a201d3..b0399f87e3 100644 --- a/autogpt_platform/backend/backend/copilot/service.py +++ b/autogpt_platform/backend/backend/copilot/service.py @@ -42,17 +42,18 @@ settings = Settings() def resolve_chat_model(tier: CopilotLlmModel | None) -> str: - """Return the configured OpenRouter model string for the given tier. + """Return the configured SDK model for the given tier. - Shared by the baseline (fast) and SDK (extended thinking) paths so - both honor the same standard/advanced env-var configuration. ``None`` - and ``'standard'`` fall through to ``config.model``; ``'advanced'`` - uses ``config.advanced_model``. Keep this flat — if a third tier - shows up later, extend here and both paths pick it up for free. + The SDK (extended-thinking) path is Anthropic-only — the Claude Agent + SDK CLI refuses non-Anthropic endpoints — so both SDK tiers resolve + to the ``thinking_*_model`` cells. Baseline has its own resolver + (``_resolve_baseline_model``) that reads the ``fast_*_model`` cells; + the two paths diverge deliberately at the config layer so a cheaper + baseline provider can't break SDK, or vice versa. """ if tier == "advanced": - return config.advanced_model - return config.model + return config.thinking_advanced_model + return config.thinking_standard_model _client: LangfuseAsyncOpenAI | None = None diff --git a/autogpt_platform/backend/backend/copilot/tools/__init__.py b/autogpt_platform/backend/backend/copilot/tools/__init__.py index 9ba050b79a..7aace646a6 100644 --- a/autogpt_platform/backend/backend/copilot/tools/__init__.py +++ b/autogpt_platform/backend/backend/copilot/tools/__init__.py @@ -45,6 +45,7 @@ from .run_sub_session import RunSubSessionTool from .search_docs import SearchDocsTool from .validate_agent import ValidateAgentGraphTool from .web_fetch import WebFetchTool +from .web_search import WebSearchTool from .workspace_files import ( DeleteWorkspaceFileTool, ListWorkspaceFilesTool, @@ -93,6 +94,7 @@ TOOL_REGISTRY: dict[str, BaseTool] = { "get_agent_building_guide": GetAgentBuildingGuideTool(), # Web fetch for safe URL retrieval "web_fetch": WebFetchTool(), + "web_search": WebSearchTool(), # Agent-browser multi-step automation (navigate, act, screenshot) "browser_navigate": BrowserNavigateTool(), "browser_act": BrowserActTool(), diff --git a/autogpt_platform/backend/backend/copilot/tools/agent_guide_gate_test.py b/autogpt_platform/backend/backend/copilot/tools/agent_guide_gate_test.py index 850592ec55..a5c03162e3 100644 --- a/autogpt_platform/backend/backend/copilot/tools/agent_guide_gate_test.py +++ b/autogpt_platform/backend/backend/copilot/tools/agent_guide_gate_test.py @@ -7,8 +7,6 @@ tokens and then produce JSON that fails validation — wasting turns on auto-fix loops. """ -from unittest.mock import MagicMock - import pytest from backend.copilot.model import ChatMessage, ChatSession @@ -21,12 +19,21 @@ def _session_with_messages( messages: list[ChatMessage], builder_graph_id: str | None = None, ) -> ChatSession: - """Build a minimal ChatSession whose ``messages`` matches *messages*.""" - session = MagicMock(spec=ChatSession) + """Build a real ChatSession with the given messages. 
+ + Uses ``ChatSession.new`` + attribute reassignment rather than + ``MagicMock(spec=...)`` because the gate now calls + ``session.has_tool_been_called(...)`` and a ``spec`` mock + returns a truthy ``MagicMock`` from that call, hiding real gate + behaviour. A live ``ChatSession`` also correctly initialises the + ``_inflight_tool_calls`` PrivateAttr scratch buffer used by the + in-turn announcement path. + """ + session = ChatSession.new( + "test-user", dry_run=False, builder_graph_id=builder_graph_id + ) session.session_id = "test-session" session.messages = messages - session.metadata = MagicMock() - session.metadata.builder_graph_id = builder_graph_id return session @@ -124,6 +131,47 @@ def test_tool_name_surfaced_in_error(tool_name: str): assert tool_name in result.message +def test_inflight_announcement_lets_gate_pass_within_same_turn(): + """Regression for the Kimi baseline loop: the guide call is + dispatched earlier in the SAME turn and buffered by the + ``_baseline_tool_executor`` into the in-flight announcement set, + but hasn't been flushed into ``session.messages`` yet. The gate + must see it anyway — otherwise a follow-up ``create_agent`` in the + same turn re-fires the guard despite the guide call and the model + loops retrying the guide.""" + session = _session_with_messages( + [ChatMessage(role="user", content="build something")] + ) + # Simulate _baseline_tool_executor's announce. + session.announce_inflight_tool_call("get_agent_building_guide") + assert require_guide_read(session, "create_agent") is None + + +def test_inflight_clear_restores_gate_for_next_turn(): + """End-of-turn cleanup must drop the in-flight buffer so it can't + leak into the *next* turn's ``session.messages`` scan (e.g. a second + session turn that should legitimately require a fresh guide call if + ``messages`` got compressed away).""" + session = _session_with_messages([ChatMessage(role="user", content="build")]) + session.announce_inflight_tool_call("get_agent_building_guide") + assert require_guide_read(session, "create_agent") is None + session.clear_inflight_tool_calls() + # With the buffer cleared and no guide row in messages, the guard + # fires again. 
+ assert isinstance(require_guide_read(session, "create_agent"), ErrorResponse) + + +def test_inflight_announcement_does_not_serialise_into_model_dump(): + """PrivateAttr invariant: the scratch buffer must never leak into + ``model_dump()`` / the Redis cache payload / the DB — it's + process-local turn state, not durable session state.""" + session = _session_with_messages([]) + session.announce_inflight_tool_call("get_agent_building_guide") + dumped = session.model_dump() + assert "_inflight_tool_calls" not in dumped + assert "inflight_tool_calls" not in dumped + + def test_builder_bound_session_bypasses_gate(): """Builder-bound sessions receive the guide via on every turn, so the tool-call gate is unnecessary and only wastes a diff --git a/autogpt_platform/backend/backend/copilot/tools/helpers.py b/autogpt_platform/backend/backend/copilot/tools/helpers.py index 3bd134e4c7..6c25e79188 100644 --- a/autogpt_platform/backend/backend/copilot/tools/helpers.py +++ b/autogpt_platform/backend/backend/copilot/tools/helpers.py @@ -787,22 +787,18 @@ def _resolve_discriminated_credentials( _AGENT_GUIDE_TOOL_NAME = "get_agent_building_guide" -def _guide_read_in_session(session: ChatSession) -> bool: - """True if this session's assistant messages include a guide tool call.""" - for msg in reversed(session.messages): - if msg.role != "assistant" or not msg.tool_calls: - continue - for tc in msg.tool_calls: - name = tc.get("function", {}).get("name") or tc.get("name") - if name == _AGENT_GUIDE_TOOL_NAME: - return True - return False - - def require_guide_read(session: ChatSession, tool_name: str): """Return an ErrorResponse if the guide hasn't been loaded this session. Import inline to keep ``helpers.py`` free of tool-response imports. + Uses :meth:`ChatSession.has_tool_been_called` which checks both the + persisted ``messages`` list (session-wide) and the in-flight + announcement buffer — so a guide call dispatched earlier in the + *current* turn (before ``session.messages`` flushes at turn end) is + recognised too. Otherwise a second tool in the same turn would + re-fire this guard despite the guide having been called — seen on + Kimi K2.6 in particular because its aggressive tool-call chaining + exercises this path far more than Sonnet does. """ from .models import ErrorResponse # noqa: PLC0415 — avoid circular import @@ -812,7 +808,7 @@ def require_guide_read(session: ChatSession, tool_name: str): # requiring one would waste a round-trip every turn. 
if session.metadata.builder_graph_id: return None - if _guide_read_in_session(session): + if session.has_tool_been_called(_AGENT_GUIDE_TOOL_NAME): return None return ErrorResponse( message=( diff --git a/autogpt_platform/backend/backend/copilot/tools/models.py b/autogpt_platform/backend/backend/copilot/tools/models.py index 8fa7e6cbb4..08b62056a4 100644 --- a/autogpt_platform/backend/backend/copilot/tools/models.py +++ b/autogpt_platform/backend/backend/copilot/tools/models.py @@ -76,6 +76,7 @@ class ResponseType(str, Enum): # Web WEB_FETCH = "web_fetch" + WEB_SEARCH = "web_search" # Feature requests FEATURE_REQUEST_SEARCH = "feature_request_search" @@ -585,6 +586,30 @@ class WebFetchResponse(ToolResponseBase): truncated: bool = False +class WebSearchResult(BaseModel): + """One entry in a web_search tool response.""" + + title: str + url: str + snippet: str = "" + page_age: str | None = None + + +class WebSearchResponse(ToolResponseBase): + """Response for web_search tool — mirrors the shape of the SDK's + native ``WebSearch`` tool so the LLM sees a consistent interface + regardless of which path dispatched the call.""" + + type: ResponseType = ResponseType.WEB_SEARCH + query: str + results: list[WebSearchResult] = Field(default_factory=list) + # Backend-reported usage for this call (copied from Anthropic's + # ``usage.server_tool_use``). Surfaces as metadata for frontend + # debug panels but is also what drives rate-limit / cost tracking + # via ``persist_and_record_usage(provider="anthropic")``. + search_requests: int = 0 + + class BashExecResponse(ToolResponseBase): """Response for bash_exec tool.""" diff --git a/autogpt_platform/backend/backend/copilot/tools/tool_schema_test.py b/autogpt_platform/backend/backend/copilot/tools/tool_schema_test.py index e0403cdc79..7b370f810c 100644 --- a/autogpt_platform/backend/backend/copilot/tools/tool_schema_test.py +++ b/autogpt_platform/backend/backend/copilot/tools/tool_schema_test.py @@ -21,7 +21,11 @@ from backend.copilot.tools import TOOL_REGISTRY # response shape carries) and the dry_run description. Keeps the # regression gate effective while accepting a deliberate ~120-token # spend on LLM-decision-critical copy. -_CHAR_BUDGET = 32_500 +# Bumped 32500 -> 32800 on PR #12871 for the new web_search tool +# (server-side Anthropic beta). Description already trimmed to the +# minimum viable copy; the bump absorbs the schema skeleton cost +# (~300 chars / ~75 tokens) for a new LLM-facing primitive. +_CHAR_BUDGET = 32_800 @pytest.fixture(scope="module") diff --git a/autogpt_platform/backend/backend/copilot/tools/web_search.py b/autogpt_platform/backend/backend/copilot/tools/web_search.py new file mode 100644 index 0000000000..feb999d4d6 --- /dev/null +++ b/autogpt_platform/backend/backend/copilot/tools/web_search.py @@ -0,0 +1,224 @@ +"""Web search tool — wraps Anthropic's server-side ``web_search`` beta. + +Single entry point for web search on both SDK and baseline paths. The +``web_search_20250305`` tool is server-side on Anthropic, so we call +the Messages API directly regardless of which LLM invoked the copilot +tool — OpenRouter can't proxy server-side tool execution. 
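+
+Dispatch is a single one-shot Messages call: a small model is told to
+invoke the server-side tool exactly once, and we parse the
+``web_search_tool_result`` block out of the response ourselves. A rough
+sketch of the round-trip this module performs (names as defined below):
+
+    client = AsyncAnthropic(api_key=api_key)
+    resp = await client.messages.create(
+        model=_WEB_SEARCH_DISPATCH_MODEL,
+        max_tokens=_MAX_DISPATCH_TOKENS,
+        tools=[{"type": "web_search_20250305", "name": "web_search",
+                "max_uses": 1}],
+        messages=[{"role": "user", "content": "<one-shot instruction>"}],
+    )
+    results, search_requests = _extract_results(resp, limit=max_results)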
+""" + +import logging +from typing import Any + +from anthropic import AsyncAnthropic + +from backend.copilot.model import ChatSession +from backend.copilot.token_tracking import persist_and_record_usage +from backend.util.settings import Settings + +from .base import BaseTool +from .models import ErrorResponse, ToolResponseBase, WebSearchResponse, WebSearchResult + +logger = logging.getLogger(__name__) + +_WEB_SEARCH_DISPATCH_MODEL = "claude-haiku-4-5" +_MAX_DISPATCH_TOKENS = 512 +_DEFAULT_MAX_RESULTS = 5 +_HARD_MAX_RESULTS = 20 + + +class WebSearchTool(BaseTool): + """Search the public web and return cited results.""" + + @property + def name(self) -> str: + return "web_search" + + @property + def description(self) -> str: + return ( + "Search the web for live info (news, recent docs). Returns " + "{title, url, snippet}; use web_fetch to deep-dive a URL. " + "Prefer one targeted query over many reformulations." + ) + + @property + def parameters(self) -> dict[str, Any]: + return { + "type": "object", + "properties": { + "query": { + "type": "string", + "description": "Search query.", + }, + "max_results": { + "type": "integer", + "description": ( + f"Max results (default {_DEFAULT_MAX_RESULTS}, " + f"cap {_HARD_MAX_RESULTS})." + ), + "default": _DEFAULT_MAX_RESULTS, + }, + }, + "required": ["query"], + } + + @property + def requires_auth(self) -> bool: + return False + + @property + def is_available(self) -> bool: + return bool(Settings().secrets.anthropic_api_key) + + async def _execute( + self, + user_id: str | None, + session: ChatSession, + query: str = "", + max_results: int = _DEFAULT_MAX_RESULTS, + **kwargs: Any, + ) -> ToolResponseBase: + query = (query or "").strip() + session_id = session.session_id if session else None + if not query: + return ErrorResponse( + message="Please provide a non-empty search query.", + error="missing_query", + session_id=session_id, + ) + + try: + max_results = int(max_results) + except (TypeError, ValueError): + max_results = _DEFAULT_MAX_RESULTS + max_results = max(1, min(max_results, _HARD_MAX_RESULTS)) + + api_key = Settings().secrets.anthropic_api_key + if not api_key: + return ErrorResponse( + message=( + "Web search is unavailable — the deployment has no " + "Anthropic API key configured." + ), + error="web_search_not_configured", + session_id=session_id, + ) + + client = AsyncAnthropic(api_key=api_key) + try: + resp = await client.messages.create( + model=_WEB_SEARCH_DISPATCH_MODEL, + max_tokens=_MAX_DISPATCH_TOKENS, + tools=[ + { + "type": "web_search_20250305", + "name": "web_search", + "max_uses": 1, + } + ], + messages=[ + { + "role": "user", + "content": ( + f"Use the web_search tool exactly once with the " + f"query {query!r} and then stop. Do not " + f"summarise — the caller parses the raw " + f"tool_result." 
+ ), + } + ], + ) + except Exception as exc: + logger.warning( + "[web_search] Anthropic call failed for query=%r: %s", query, exc + ) + return ErrorResponse( + message=f"Web search failed: {exc}", + error="web_search_failed", + session_id=session_id, + ) + + results, search_requests = _extract_results(resp, limit=max_results) + + cost_usd = _estimate_cost_usd(resp, search_requests=search_requests) + try: + usage = getattr(resp, "usage", None) + await persist_and_record_usage( + session=session, + user_id=user_id, + prompt_tokens=getattr(usage, "input_tokens", 0) or 0, + completion_tokens=getattr(usage, "output_tokens", 0) or 0, + log_prefix="[web_search]", + cost_usd=cost_usd, + model=_WEB_SEARCH_DISPATCH_MODEL, + provider="anthropic", + ) + except Exception as exc: + logger.warning("[web_search] usage tracking failed: %s", exc) + + return WebSearchResponse( + message=f"Found {len(results)} result(s) for {query!r}.", + query=query, + results=results, + search_requests=search_requests, + session_id=session_id, + ) + + +def _extract_results(resp: Any, *, limit: int) -> tuple[list[WebSearchResult], int]: + """Pull results + server-side request count from an Anthropic response.""" + results: list[WebSearchResult] = [] + search_requests = 0 + + for block in getattr(resp, "content", []) or []: + btype = getattr(block, "type", None) + if btype == "web_search_tool_result": + content = getattr(block, "content", []) or [] + for item in content: + if getattr(item, "type", None) != "web_search_result": + continue + if len(results) >= limit: + break + # Anthropic's ``web_search_result`` exposes only + # ``title``/``url``/``page_age`` plus an opaque + # ``encrypted_content`` blob that is meant for citation + # round-tripping, not for display — it is base64-ish + # binary and would show as gibberish if surfaced to the + # model or the frontend. There is no plain-text snippet + # field in the current beta; callers get the readable + # text via the model's ``text`` blocks with citations, + # not via this list. Leave ``snippet`` empty. + results.append( + WebSearchResult( + title=getattr(item, "title", "") or "", + url=getattr(item, "url", "") or "", + snippet="", + page_age=getattr(item, "page_age", None), + ) + ) + + usage = getattr(resp, "usage", None) + server_tool_use = getattr(usage, "server_tool_use", None) if usage else None + if server_tool_use is not None: + search_requests = getattr(server_tool_use, "web_search_requests", 0) or 0 + + return results, search_requests + + +# Update when Anthropic revises pricing. 
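+# Worked example: one search dispatched with 120 input / 40 output Haiku
+# tokens costs
+#     1 * 0.010 + (120 / 1e6) * 1.0 + (40 / 1e6) * 5.0 ≈ 0.01032 USD
+# — the flat per-search fee dominates and the dispatch inference is noise,
+# which is exactly what the cost tests in web_search_test.py pin.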
+_COST_PER_SEARCH_USD = 0.010 # $10 per 1,000 web_search requests +_HAIKU_INPUT_USD_PER_MTOK = 1.0 +_HAIKU_OUTPUT_USD_PER_MTOK = 5.0 + + +def _estimate_cost_usd(resp: Any, *, search_requests: int) -> float: + """Per-search fee × count + Haiku dispatch tokens.""" + usage = getattr(resp, "usage", None) + input_tokens = getattr(usage, "input_tokens", 0) if usage else 0 + output_tokens = getattr(usage, "output_tokens", 0) if usage else 0 + + search_cost = search_requests * _COST_PER_SEARCH_USD + inference_cost = (input_tokens / 1_000_000) * _HAIKU_INPUT_USD_PER_MTOK + ( + output_tokens / 1_000_000 + ) * _HAIKU_OUTPUT_USD_PER_MTOK + return round(search_cost + inference_cost, 6) diff --git a/autogpt_platform/backend/backend/copilot/tools/web_search_test.py b/autogpt_platform/backend/backend/copilot/tools/web_search_test.py new file mode 100644 index 0000000000..3d516f295a --- /dev/null +++ b/autogpt_platform/backend/backend/copilot/tools/web_search_test.py @@ -0,0 +1,308 @@ +"""Tests for the ``web_search`` copilot tool. + +Covers the result extractor + cost estimator as pure units (fed with +synthetic Anthropic response objects), plus light integration tests that +mock ``AsyncAnthropic.messages.create`` and confirm the handler plumbs +through to ``persist_and_record_usage`` with the right provider tag. +""" + +from types import SimpleNamespace +from unittest.mock import AsyncMock, patch + +import pytest + +from backend.copilot.model import ChatSession + +from .models import ErrorResponse, WebSearchResponse, WebSearchResult +from .web_search import ( + _COST_PER_SEARCH_USD, + WebSearchTool, + _estimate_cost_usd, + _extract_results, +) + + +def _fake_anthropic_response( + *, + results: list[dict] | None = None, + search_requests: int = 1, + input_tokens: int = 120, + output_tokens: int = 40, +) -> SimpleNamespace: + """Build a synthetic Anthropic Messages response. + + Matches the shape produced by ``client.messages.create`` when the + response includes a ``web_search_tool_result`` content block and + ``usage.server_tool_use.web_search_requests`` on the turn meter. + """ + content = [] + if results is not None: + content.append( + SimpleNamespace( + type="web_search_tool_result", + content=[ + SimpleNamespace( + type="web_search_result", + title=r.get("title", "untitled"), + url=r.get("url", ""), + encrypted_content=r.get("snippet", ""), + page_age=r.get("page_age"), + ) + for r in results + ], + ) + ) + usage = SimpleNamespace( + input_tokens=input_tokens, + output_tokens=output_tokens, + server_tool_use=SimpleNamespace(web_search_requests=search_requests), + ) + return SimpleNamespace(content=content, usage=usage) + + +class TestExtractResults: + """The extractor is the only Anthropic-response-shape contact point; + pin its behaviour so an API shape change surfaces here first.""" + + def test_extracts_title_url_page_age_and_drops_encrypted_snippet(self): + # Anthropic's ``web_search_result`` ships an opaque + # ``encrypted_content`` blob that is not safe to surface — + # the extractor must drop it (snippet=="") regardless of + # whether the blob is non-empty. 
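+        # Shape being faked (as observed from the beta responses — an
+        # assumption, not a published schema): ``resp.content`` holds
+        # ``web_search_tool_result`` blocks whose ``.content`` items are
+        # ``web_search_result`` objects carrying title / url /
+        # encrypted_content / page_age.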
+        resp = _fake_anthropic_response(
+            results=[
+                {
+                    "title": "Kimi K2.6 launch",
+                    "url": "https://example.com/kimi",
+                    "snippet": "EiJjbGF1ZGUtZW5jcnlwdGVkLWJsb2I=",
+                    "page_age": "1 day",
+                },
+                {
+                    "title": "OpenRouter pricing",
+                    "url": "https://openrouter.ai/moonshotai/kimi-k2.6",
+                    "snippet": "",
+                },
+            ]
+        )
+        out, requests = _extract_results(resp, limit=10)
+        assert requests == 1
+        assert len(out) == 2
+        assert out[0].title == "Kimi K2.6 launch"
+        assert out[0].url == "https://example.com/kimi"
+        assert out[0].snippet == ""
+        assert out[0].page_age == "1 day"
+        assert out[1].snippet == ""
+
+    def test_limit_caps_returned_results(self):
+        resp = _fake_anthropic_response(
+            results=[{"title": f"r{i}", "url": f"https://e/{i}"} for i in range(10)]
+        )
+        out, _ = _extract_results(resp, limit=3)
+        assert len(out) == 3
+        assert [r.title for r in out] == ["r0", "r1", "r2"]
+
+    def test_missing_content_returns_empty(self):
+        resp = SimpleNamespace(content=[], usage=None)
+        out, requests = _extract_results(resp, limit=10)
+        assert out == []
+        assert requests == 0
+
+    def test_non_search_blocks_are_ignored(self):
+        resp = SimpleNamespace(
+            content=[
+                SimpleNamespace(type="text", text="Here's what I found..."),
+                SimpleNamespace(
+                    type="web_search_tool_result",
+                    content=[
+                        SimpleNamespace(
+                            type="web_search_result",
+                            title="real",
+                            url="https://real.example",
+                            encrypted_content="body",
+                            page_age=None,
+                        )
+                    ],
+                ),
+            ],
+            usage=None,
+        )
+        out, _ = _extract_results(resp, limit=10)
+        assert len(out) == 1 and out[0].title == "real"
+
+
+class TestEstimateCostUsd:
+    """Pin the per-search fee + Haiku inference math. The pricing
+    constants in ``web_search.py`` are hard-coded (no live lookup), so
+    any revision to them — or any drift from Anthropic's published
+    schedule — has to be reconciled here, where the next reader will
+    see it."""
+
+    def test_zero_searches_still_charges_inference(self):
+        resp = _fake_anthropic_response(results=[], search_requests=0)
+        cost = _estimate_cost_usd(resp, search_requests=0)
+        # Haiku at $1/MTok input and $5/MTok output on the fake's
+        # 120 input / 40 output tokens = tiny but non-zero.
+        assert 0 < cost < 0.001
+
+    def test_single_search_fee_dominates(self):
+        resp = _fake_anthropic_response(
+            results=[{"title": "x", "url": "https://e"}],
+            search_requests=1,
+            input_tokens=100,
+            output_tokens=20,
+        )
+        cost = _estimate_cost_usd(resp, search_requests=1)
+        # ~$0.010 search + trivial inference — total still ~1 cent.
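+        # Exactly: 0.010 + (100 / 1e6) * 1.0 + (20 / 1e6) * 5.0 = 0.0102,
+        # comfortably inside the [fee, fee + 0.001) window asserted below.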
+ assert cost >= _COST_PER_SEARCH_USD + assert cost < _COST_PER_SEARCH_USD + 0.001 + + def test_three_searches_linear_in_count(self): + resp = _fake_anthropic_response( + results=[], search_requests=3, input_tokens=0, output_tokens=0 + ) + cost = _estimate_cost_usd(resp, search_requests=3) + assert cost == pytest.approx(3 * _COST_PER_SEARCH_USD) + + +class TestWebSearchToolDispatch: + """Lightweight integration test: mock the Anthropic client, confirm + the handler returns a ``WebSearchResponse`` and the usage tracker is + called with ``provider='anthropic'`` (not 'open_router', even on the + baseline path — server-side web_search bills Anthropic regardless of + the calling LLM's route).""" + + def _session(self) -> ChatSession: + s = ChatSession.new("test-user", dry_run=False) + s.session_id = "sess-1" + return s + + @pytest.mark.asyncio + async def test_returns_response_with_results_and_tracks_cost(self, monkeypatch): + fake_resp = _fake_anthropic_response( + results=[ + { + "title": "hello", + "url": "https://example.com", + "snippet": "greeting", + } + ], + search_requests=1, + ) + mock_client = type( + "MC", + (), + { + "messages": type( + "M", (), {"create": AsyncMock(return_value=fake_resp)} + )() + }, + )() + + # Stub the Anthropic API key so ``is_available`` is True. + monkeypatch.setattr( + "backend.copilot.tools.web_search.Settings", + lambda: SimpleNamespace( + secrets=SimpleNamespace(anthropic_api_key="sk-test") + ), + ) + + with ( + patch( + "backend.copilot.tools.web_search.AsyncAnthropic", + return_value=mock_client, + ), + patch( + "backend.copilot.tools.web_search.persist_and_record_usage", + new=AsyncMock(return_value=160), + ) as mock_track, + ): + tool = WebSearchTool() + result = await tool._execute( + user_id="u1", + session=self._session(), + query="kimi k2.6 launch", + max_results=5, + ) + + assert isinstance(result, WebSearchResponse) + assert result.query == "kimi k2.6 launch" + assert len(result.results) == 1 + assert isinstance(result.results[0], WebSearchResult) + assert result.search_requests == 1 + + # Cost tracker must have been called with provider="anthropic". 
+        assert mock_track.await_count == 1
+        kwargs = mock_track.await_args.kwargs
+        assert kwargs["provider"] == "anthropic"
+        assert kwargs["model"] == "claude-haiku-4-5"
+        assert kwargs["user_id"] == "u1"
+        assert kwargs["cost_usd"] >= _COST_PER_SEARCH_USD
+
+    @pytest.mark.asyncio
+    async def test_missing_api_key_returns_error_without_calling_anthropic(
+        self, monkeypatch
+    ):
+        monkeypatch.setattr(
+            "backend.copilot.tools.web_search.Settings",
+            lambda: SimpleNamespace(secrets=SimpleNamespace(anthropic_api_key="")),
+        )
+        anthropic_stub = AsyncMock()
+        with (
+            patch(
+                "backend.copilot.tools.web_search.AsyncAnthropic",
+                return_value=anthropic_stub,
+            ),
+            patch(
+                "backend.copilot.tools.web_search.persist_and_record_usage",
+                new=AsyncMock(),
+            ) as mock_track,
+        ):
+            tool = WebSearchTool()
+            assert tool.is_available is False
+            result = await tool._execute(
+                user_id="u1",
+                session=self._session(),
+                query="anything",
+            )
+        assert isinstance(result, ErrorResponse)
+        assert result.error == "web_search_not_configured"
+        anthropic_stub.messages.create.assert_not_called()
+        mock_track.assert_not_called()
+
+    @pytest.mark.asyncio
+    async def test_empty_query_rejected_without_api_call(self, monkeypatch):
+        monkeypatch.setattr(
+            "backend.copilot.tools.web_search.Settings",
+            lambda: SimpleNamespace(
+                secrets=SimpleNamespace(anthropic_api_key="sk-test")
+            ),
+        )
+        anthropic_stub = AsyncMock()
+        with patch(
+            "backend.copilot.tools.web_search.AsyncAnthropic",
+            return_value=anthropic_stub,
+        ):
+            tool = WebSearchTool()
+            result = await tool._execute(
+                user_id="u1", session=self._session(), query=" "
+            )
+        assert isinstance(result, ErrorResponse)
+        assert result.error == "missing_query"
+        anthropic_stub.messages.create.assert_not_called()
+
+
+class TestToolRegistryIntegration:
+    """The tool must be registered under the ``web_search`` name so the
+    MCP layer exposes it as ``mcp__copilot__web_search`` — which is
+    what the SDK path now dispatches to (see
+    ``sdk/tool_adapter.py::SDK_DISALLOWED_TOOLS`` which blocks the CLI's
+    native ``WebSearch`` in favour of the MCP route)."""
+
+    def test_web_search_is_in_tool_registry(self):
+        from backend.copilot.tools import TOOL_REGISTRY
+
+        assert "web_search" in TOOL_REGISTRY
+        assert isinstance(TOOL_REGISTRY["web_search"], WebSearchTool)
+
+    def test_sdk_native_websearch_is_disallowed(self):
+        from backend.copilot.sdk.tool_adapter import SDK_DISALLOWED_TOOLS
+
+        assert "WebSearch" in SDK_DISALLOWED_TOOLS
diff --git a/autogpt_platform/frontend/src/app/(platform)/copilot/tools/GenericTool/GenericTool.tsx b/autogpt_platform/frontend/src/app/(platform)/copilot/tools/GenericTool/GenericTool.tsx
index 995c18df05..74aa3153d5 100644
--- a/autogpt_platform/frontend/src/app/(platform)/copilot/tools/GenericTool/GenericTool.tsx
+++ b/autogpt_platform/frontend/src/app/(platform)/copilot/tools/GenericTool/GenericTool.tsx
@@ -305,15 +305,58 @@ function getWebAccordionData(
     string,
     unknown
   >;
-  const url =
-    getStringField(inp as Record<string, unknown>, "url", "query") ??
-    "Web content";
+  const query = getStringField(inp, "query");
+  const url = getStringField(inp, "url") ?? query ?? "Web content";
+
+  const results = Array.isArray(output.results)
+    ? (output.results as Array<Record<string, unknown>>)
+    : null;
+
+  if (results) {
+    return {
+      title: `${results.length} search result${results.length === 1 ? "" : "s"}`,
+      description: query ? truncate(query, 80) : undefined,
+      content: (
+        <div>
+          {results.map((r, i) => {
+            const title = getStringField(r, "title") ?? "(untitled)";
+            const href = getStringField(r, "url") ?? "";
+            const snippet = getStringField(r, "snippet");
+            const pageAge = getStringField(r, "page_age");
+            return (
+              <div key={i}>
+                {href ? (
+                  <a href={href} target="_blank" rel="noopener noreferrer">
+                    {title}
+                  </a>
+                ) : (
+                  <span>{title}</span>
+                )}
+                {href && (
+                  <div>
+                    {truncate(href, 100)}
+                  </div>
+                )}
+                {snippet && (
+                  <p>
+                    {snippet}
+                  </p>
+                )}
+                {pageAge && (
+                  <span>{pageAge}</span>
+                )}
+              </div>
+            );
+          })}
+        </div>
+      ),
+    };
+  }
 
-  // Try direct string fields first, then MCP content blocks, then raw JSON
   let content = getStringField(output, "content", "text", "_raw");
   if (!content) content = extractMcpText(output);
   if (!content) {
-    // Fallback: render the raw JSON so the accordion isn't empty
     try {
       const raw = JSON.stringify(output, null, 2);
       if (raw !== "{}") content = raw;
@@ -327,11 +370,7 @@ function getWebAccordionData(
   const message = getStringField(output, "message");
 
   return {
-    title: statusCode
-      ? `Response (${statusCode})`
-      : url
-        ? "Web fetch"
-        : "Search results",
+    title: statusCode ? `Response (${statusCode})` : "Web fetch",
     description: truncate(url, 80),
     content: content ? (
       {content}
diff --git a/autogpt_platform/frontend/src/app/(platform)/copilot/tools/GenericTool/__tests__/GenericTool.test.tsx b/autogpt_platform/frontend/src/app/(platform)/copilot/tools/GenericTool/__tests__/GenericTool.test.tsx
index 4308eb49bf..48e0409393 100644
--- a/autogpt_platform/frontend/src/app/(platform)/copilot/tools/GenericTool/__tests__/GenericTool.test.tsx
+++ b/autogpt_platform/frontend/src/app/(platform)/copilot/tools/GenericTool/__tests__/GenericTool.test.tsx
@@ -1,6 +1,6 @@
 import { describe, expect, it } from "vitest";
 import type { ToolUIPart } from "ai";
-import { render, screen } from "@/tests/integrations/test-utils";
+import { fireEvent, render, screen } from "@/tests/integrations/test-utils";
 import { GenericTool } from "../GenericTool";
 
 function makePart(overrides: Record<string, unknown> = {}): ToolUIPart {
@@ -136,4 +136,181 @@ describe("GenericTool", () => {
     const trigger2 = screen.getByRole("button", { expanded: false });
     expect(trigger2.textContent).toContain("completed");
   });
+
+  describe("web_search results rendering", () => {
+    function makeWebSearchPart(
+      results: Array<Record<string, unknown>>,
+      query = "kimi k2.6",
+    ): ToolUIPart {
+      return {
+        type: "tool-web_search",
+        toolCallId: "call-web-1",
+        state: "output-available",
+        input: { query },
+        output: {
+          type: "web_search_response",
+          results,
+          query,
+          search_requests: 1,
+        },
+      } as unknown as ToolUIPart;
+    }
+
+    it("renders an 'N search results' title and shows the query in the description", () => {
+      render(
+        <GenericTool
+          part={makeWebSearchPart([
+            {
+              title: "Kimi K2.6 release notes",
+              url: "https://example.com/kimi",
+              snippet: "A fast model",
+              page_age: "2 days ago",
+            },
+            { title: "Second result", url: "https://example.com/two" },
+          ])}
+        />,
+      );
+      const trigger = screen.getByRole("button", { expanded: false });
+      expect(trigger.textContent).toContain("2 search results");
+      expect(trigger.textContent).toContain("kimi k2.6");
+
+      fireEvent.click(trigger);
+
+      const firstLink = screen.getByRole("link", {
+        name: "Kimi K2.6 release notes",
+      }) as HTMLAnchorElement;
+      expect(firstLink.getAttribute("href")).toBe("https://example.com/kimi");
+      expect(firstLink.getAttribute("target")).toBe("_blank");
+      expect(firstLink.getAttribute("rel")).toBe("noopener noreferrer");
+      expect(screen.queryByText("A fast model")).not.toBeNull();
+      expect(screen.queryByText("2 days ago")).not.toBeNull();
+
+      const secondLink = screen.getByRole("link", {
+        name: "Second result",
+      }) as HTMLAnchorElement;
+      expect(secondLink.getAttribute("href")).toBe("https://example.com/two");
+    });
+
+    it("uses singular 'search result' when there is exactly one result", () => {
+      render(
+        <GenericTool
+          part={makeWebSearchPart([
+            { title: "Only hit", url: "https://example.com/one" },
+          ])}
+        />,
+      );
+      const trigger = screen.getByRole("button", { expanded: false });
+      expect(trigger.textContent).toContain("1 search result");
+      expect(trigger.textContent).not.toContain("1 search results");
+    });
+
+    it("handles an empty results array (0 search results)", () => {
+      render(<GenericTool part={makeWebSearchPart([])} />);
+      const trigger = screen.getByRole("button", { expanded: false });
+      expect(trigger.textContent).toContain("0 search results");
+    });
+
+    it("renders an untitled non-link when a result has no url", () => {
+      render(
+        <GenericTool
+          part={makeWebSearchPart([
+            { title: "No URL entry", snippet: "Just text" },
+          ])}
+        />,
+      );
+      fireEvent.click(screen.getByRole("button", { expanded: false }));
+      expect(screen.queryByRole("link")).toBeNull();
+      expect(screen.queryByText("No URL entry")).not.toBeNull();
+      expect(screen.queryByText("Just text")).not.toBeNull();
+    });
+
+    it("shows subtitle 'Searched \"…\"' once web_search output is available", () => {
+      const { container } = render(
+        <GenericTool part={makeWebSearchPart([])} />,
+      );
+      // MorphingTextAnimation splits each character into its own span and
+      // substitutes spaces with &nbsp;, so assert on a normalized textContent
+      // rather than the raw substring.
+      const normalized = (container.textContent ?? "").replace(/\u00a0/g, " ");
+      expect(normalized).toContain('Searched "kimi k2.6"');
+    });
+
+    it("uses '(untitled)' when a search result has no title", () => {
+      render(
+        <GenericTool
+          part={makeWebSearchPart([{ url: "https://example.com/x" }])}
+        />,
+      );
+      fireEvent.click(screen.getByRole("button", { expanded: false }));
+      const link = screen.getByRole("link", {
+        name: "(untitled)",
+      }) as HTMLAnchorElement;
+      expect(link.getAttribute("href")).toBe("https://example.com/x");
+    });
+  });
+
+  describe("getWebAccordionData non-results fallback", () => {
+    function makeWebFetchPart(output: Record<string, unknown>): ToolUIPart {
+      return {
+        type: "tool-web_fetch",
+        toolCallId: "call-fetch-1",
+        state: "output-available",
+        input: { url: "https://example.com/page" },
+        output,
+      } as unknown as ToolUIPart;
+    }
+
+    it("renders 'Web fetch' title when output has content instead of results", () => {
+      render(
+        <GenericTool part={makeWebFetchPart({ content: "fetched body" })} />,
+      );
+      const trigger = screen.getByRole("button", { expanded: false });
+      expect(trigger.textContent).toContain("Web fetch");
+      fireEvent.click(trigger);
+      expect(screen.queryByText("fetched body")).not.toBeNull();
+    });
+
+    it("renders 'Response (N)' title when output has a status_code", () => {
+      render(
+        <GenericTool
+          part={makeWebFetchPart({ status_code: 404, content: "Not Found" })}
+        />,
+      );
+      const trigger = screen.getByRole("button", { expanded: false });
+      expect(trigger.textContent).toContain("Response (404)");
+    });
+
+    it("falls back to MCP text blocks when direct content is absent", () => {
+      render(
+        <GenericTool
+          part={makeWebFetchPart({
+            content: [{ type: "text", text: "mcp body" }],
+          })}
+        />,
+      );
+      fireEvent.click(screen.getByRole("button", { expanded: false }));
+      expect(screen.queryByText("mcp body")).not.toBeNull();
+    });
+  });
 });
diff --git a/autogpt_platform/frontend/src/app/(platform)/copilot/tools/GenericTool/__tests__/helpers.test.ts b/autogpt_platform/frontend/src/app/(platform)/copilot/tools/GenericTool/__tests__/helpers.test.ts
index de0b9155b6..ca8cc6ba0b 100644
--- a/autogpt_platform/frontend/src/app/(platform)/copilot/tools/GenericTool/__tests__/helpers.test.ts
+++ b/autogpt_platform/frontend/src/app/(platform)/copilot/tools/GenericTool/__tests__/helpers.test.ts
@@ -22,6 +22,11 @@ describe("extractToolName", () => {
     const part = { type: "Read" } as unknown as ToolUIPart;
     expect(extractToolName(part)).toBe("Read");
   });
+
+  it("strips the tool- prefix for web_search", () => {
+    const part = { type: "tool-web_search" } as unknown as ToolUIPart;
+    expect(extractToolName(part)).toBe("web_search");
+  });
 });
 
 describe("formatToolName", () => {
@@ -60,8 +65,9 @@ describe("getToolCategory", () => {
     expect(getToolCategory("bash_exec")).toBe("bash");
   });
 
-  it("returns 'web' for web_fetch, WebSearch, WebFetch", () => {
+  it("returns 'web' for web_fetch, web_search, WebSearch, WebFetch", () => {
     expect(getToolCategory("web_fetch")).toBe("web");
+    expect(getToolCategory("web_search")).toBe("web");
     expect(getToolCategory("WebSearch")).toBe("web");
     expect(getToolCategory("WebFetch")).toBe("web");
   });
@@ -229,6 +235,50 @@ describe("getAnimationText", () => {
     expect(getAnimationText(part,
"web")).toBe('Searching "test query"'); }); + it("shows searching text for web_search with a query summary", () => { + const part = makePart({ + type: "tool-web_search", + state: "input-streaming", + input: { query: "kimi k2.6" }, + }); + expect(getAnimationText(part, "web")).toBe('Searching "kimi k2.6"'); + }); + + it("falls back to generic searching text for web_search with no query", () => { + const part = makePart({ + type: "tool-web_search", + state: "input-streaming", + }); + expect(getAnimationText(part, "web")).toBe("Searching the web…"); + }); + + it("shows completed text for web_search with a query summary", () => { + const part = makePart({ + type: "tool-web_search", + state: "output-available", + input: { query: "kimi k2.6" }, + output: { results: [] }, + }); + expect(getAnimationText(part, "web")).toBe('Searched "kimi k2.6"'); + }); + + it("falls back to generic completed text for web_search with no query", () => { + const part = makePart({ + type: "tool-web_search", + state: "output-available", + output: { results: [] }, + }); + expect(getAnimationText(part, "web")).toBe("Web search completed"); + }); + + it("shows error text for web_search failure", () => { + const part = makePart({ + type: "tool-web_search", + state: "output-error", + }); + expect(getAnimationText(part, "web")).toBe("Search failed"); + }); + it("shows fetching text for web_fetch", () => { const part = makePart({ type: "tool-web_fetch", diff --git a/autogpt_platform/frontend/src/app/(platform)/copilot/tools/GenericTool/helpers.ts b/autogpt_platform/frontend/src/app/(platform)/copilot/tools/GenericTool/helpers.ts index f8da6fbc2f..e1103e1435 100644 --- a/autogpt_platform/frontend/src/app/(platform)/copilot/tools/GenericTool/helpers.ts +++ b/autogpt_platform/frontend/src/app/(platform)/copilot/tools/GenericTool/helpers.ts @@ -60,6 +60,7 @@ export function getToolCategory(toolName: string): ToolCategory { case "bash_exec": return "bash"; case "web_fetch": + case "web_search": case "WebSearch": case "WebFetch": return "web"; @@ -114,6 +115,7 @@ function getInputSummary(toolName: string, input: unknown): string | null { case "web_fetch": case "WebFetch": return typeof inp.url === "string" ? inp.url : null; + case "web_search": case "WebSearch": return typeof inp.query === "string" ? inp.query : null; case "browser_navigate": @@ -220,7 +222,7 @@ export function getAnimationText( ? `Running: ${shortSummary}` : "Running command\u2026"; case "web": - if (toolName === "WebSearch") { + if (toolName === "WebSearch" || toolName === "web_search") { return shortSummary ? `Searching "${shortSummary}"` : "Searching the web\u2026"; @@ -282,7 +284,7 @@ export function getAnimationText( // exit status here would just double up. return shortSummary ? `Ran: ${shortSummary}` : "Command completed"; case "web": - if (toolName === "WebSearch") { + if (toolName === "WebSearch" || toolName === "web_search") { return shortSummary ? `Searched "${shortSummary}"` : "Web search completed"; @@ -352,7 +354,9 @@ export function getAnimationText( case "bash": return "Command failed"; case "web": - return toolName === "WebSearch" ? "Search failed" : "Fetch failed"; + return toolName === "WebSearch" || toolName === "web_search" + ? 
"Search failed" + : "Fetch failed"; case "browser": return "Browser action failed"; default: diff --git a/autogpt_platform/frontend/src/app/api/openapi.json b/autogpt_platform/frontend/src/app/api/openapi.json index 5997921c26..83fa19af10 100644 --- a/autogpt_platform/frontend/src/app/api/openapi.json +++ b/autogpt_platform/frontend/src/app/api/openapi.json @@ -14564,6 +14564,7 @@ "browser_screenshot", "bash_exec", "web_fetch", + "web_search", "feature_request_search", "feature_request_created", "memory_store",