From 89f8060c5d2deea34c2922576d92e18a59509e46 Mon Sep 17 00:00:00 2001 From: majdyz Date: Tue, 21 Apr 2026 21:21:52 +0700 Subject: [PATCH 01/25] feat(backend/copilot): default baseline fast_model to Kimi K2.6 via OpenRouter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Kimi K2.6 prices at $0.60/$2.80 per MTok (5x cheaper input, 5.4x cheaper output than Sonnet 4.6), ties Opus on SWE-Bench Verified (80.2% vs 80.8%), and ships OpenRouter's `reasoning` / `include_reasoning` extension on its Moonshot endpoints — meaning the baseline reasoning plumbing that lit up in #12870 lights up unchanged. Three focused deltas: * `config.py`: new `fast_model` field defaulting to `moonshotai/kimi-k2.6`, separate from `model` (which still resolves to Sonnet for the SDK / extended-thinking path where the Claude Agent SDK CLI requires an Anthropic endpoint). `advanced_model` stays Opus on both paths — no Kimi equivalent at the top tier. * `_resolve_baseline_model`: no longer delegates to SDK's `resolve_chat_model`. Baseline standard/None → `config.fast_model`; advanced → `config.advanced_model`. SDK untouched. * `baseline/reasoning.py::_is_reasoning_route`: new gate covering Anthropic + Moonshot Kimi variants, used by `reasoning_extra_body`. The existing `_is_anthropic_model` in service.py stays narrow — it still gates `cache_control` markers + the `anthropic-beta` header, which Moonshot doesn't need (it auto-caches) and which would be dropped (or worst-case 400) on Kimi. Tests: extended extractor variant / kill-switch coverage in reasoning_test.py (new `TestIsReasoningRoute`, Kimi branches in `TestReasoningExtraBody`), added `_is_anthropic_model_rejects_kimi_routes` regression guard, added end-to-end `test_kimi_route_sends_reasoning_but_no_cache_control` through `_baseline_llm_caller` to pin the split-gate contract, and rewired `TestResolveBaselineModel` around `config.fast_model`. 
Rollback: `CHAT_FAST_MODEL=anthropic/claude-sonnet-4-6` restores prior behavior without code changes. Known risk to validate before we raise confidence: K2.5 had documented many-tool-selection regressions (vLLM had to ship accuracy patches) — we ship 43 tools per call, so /pr-test with the full payload is a must before this default is locked in. --- .../backend/copilot/baseline/reasoning.py | 34 ++++++++--- .../copilot/baseline/reasoning_test.py | 47 ++++++++++++++- .../backend/copilot/baseline/service.py | 17 +++--- .../copilot/baseline/service_unit_test.py | 60 ++++++++++++++++++- .../baseline/transcript_integration_test.py | 33 +++++++--- .../backend/backend/copilot/config.py | 21 ++++++- 6 files changed, 183 insertions(+), 29 deletions(-) diff --git a/autogpt_platform/backend/backend/copilot/baseline/reasoning.py b/autogpt_platform/backend/backend/copilot/baseline/reasoning.py index 15a77dde8a..0e408e17cc 100644 --- a/autogpt_platform/backend/backend/copilot/baseline/reasoning.py +++ b/autogpt_platform/backend/backend/copilot/baseline/reasoning.py @@ -132,18 +132,36 @@ class OpenRouterDeltaExtension(BaseModel): return "".join(d.visible_text for d in self.reasoning_details) +def _is_reasoning_route(model: str) -> bool: + """Return True when the route supports OpenRouter's ``reasoning`` extension. + + OpenRouter exposes reasoning tokens via a unified ``reasoning`` request + param that works on any provider that supports extended thinking — + currently Anthropic (Claude Opus / Sonnet) and Moonshot (Kimi K2.6 + + kimi-k2-thinking) advertise it in their ``supported_parameters``. + Other providers silently drop the field, but we skip it anyway to keep + the payload tight and avoid confusing cache diagnostics. + + Kept separate from :func:`backend.copilot.baseline.service._is_anthropic_model` + because ``cache_control`` is strictly Anthropic-specific (Moonshot does + its own auto-caching), so the two gates must not conflate. 
+ """ + lowered = model.lower() + return ( + "claude" in lowered + or lowered.startswith("anthropic") + or lowered.startswith("moonshotai/") + or "kimi" in lowered + ) + + def reasoning_extra_body(model: str, max_thinking_tokens: int) -> dict[str, Any] | None: """Build the ``extra_body["reasoning"]`` fragment for the OpenAI client. - Returns ``None`` for non-Anthropic routes (other OpenRouter providers - ignore the field but we skip it anyway to keep the payload minimal) - and for ``max_thinking_tokens <= 0`` (operator kill switch). + Returns ``None`` for non-reasoning routes and for + ``max_thinking_tokens <= 0`` (operator kill switch). """ - # Imported lazily to avoid pulling service.py at module load — service.py - # imports this module, and the lazy import keeps the dependency one-way. - from backend.copilot.baseline.service import _is_anthropic_model - - if not _is_anthropic_model(model) or max_thinking_tokens <= 0: + if not _is_reasoning_route(model) or max_thinking_tokens <= 0: return None return {"reasoning": {"max_tokens": max_thinking_tokens}} diff --git a/autogpt_platform/backend/backend/copilot/baseline/reasoning_test.py b/autogpt_platform/backend/backend/copilot/baseline/reasoning_test.py index df64086d5f..c0136aef19 100644 --- a/autogpt_platform/backend/backend/copilot/baseline/reasoning_test.py +++ b/autogpt_platform/backend/backend/copilot/baseline/reasoning_test.py @@ -12,6 +12,7 @@ from backend.copilot.baseline.reasoning import ( BaselineReasoningEmitter, OpenRouterDeltaExtension, ReasoningDetail, + _is_reasoning_route, reasoning_extra_body, ) from backend.copilot.model import ChatMessage @@ -135,6 +136,32 @@ class TestOpenRouterDeltaExtension: assert ext.visible_text() == "real" +class TestIsReasoningRoute: + def test_anthropic_routes(self): + assert _is_reasoning_route("anthropic/claude-sonnet-4-6") + assert _is_reasoning_route("claude-3-5-sonnet-20241022") + assert _is_reasoning_route("anthropic.claude-3-5-sonnet") + assert 
_is_reasoning_route("ANTHROPIC/Claude-Opus") # case-insensitive + + def test_moonshot_kimi_routes(self): + # OpenRouter advertises the ``reasoning`` extension on Moonshot + # endpoints — both K2.6 (the new baseline default) and the + # reasoning-native kimi-k2-thinking variant. + assert _is_reasoning_route("moonshotai/kimi-k2.6") + assert _is_reasoning_route("moonshotai/kimi-k2-thinking") + assert _is_reasoning_route("moonshotai/kimi-k2.5") + # Direct (non-OpenRouter) model ids also resolve via the ``kimi`` + # substring so a future bare ``kimi-k3`` id would still match. + assert _is_reasoning_route("kimi-k2-instruct") + + def test_other_providers_rejected(self): + assert not _is_reasoning_route("openai/gpt-4o") + assert not _is_reasoning_route("google/gemini-2.5-pro") + assert not _is_reasoning_route("xai/grok-4") + assert not _is_reasoning_route("meta-llama/llama-3.3-70b-instruct") + assert not _is_reasoning_route("deepseek/deepseek-r1") + + class TestReasoningExtraBody: def test_anthropic_route_returns_fragment(self): assert reasoning_extra_body("anthropic/claude-sonnet-4-6", 4096) == { @@ -146,16 +173,30 @@ class TestReasoningExtraBody: "reasoning": {"max_tokens": 2048} } - def test_non_anthropic_route_returns_none(self): + def test_kimi_routes_return_fragment(self): + # Kimi K2.6 ships the same OpenRouter ``reasoning`` extension as + # Anthropic, so the gate widened with this PR and the fragment + # must now materialise on Moonshot routes too. 
+ assert reasoning_extra_body("moonshotai/kimi-k2.6", 8192) == { + "reasoning": {"max_tokens": 8192} + } + assert reasoning_extra_body("moonshotai/kimi-k2-thinking", 4096) == { + "reasoning": {"max_tokens": 4096} + } + + def test_non_reasoning_route_returns_none(self): assert reasoning_extra_body("openai/gpt-4o", 4096) is None assert reasoning_extra_body("google/gemini-2.5-pro", 4096) is None + assert reasoning_extra_body("xai/grok-4", 4096) is None def test_zero_max_tokens_kill_switch(self): # Operator kill switch: ``max_thinking_tokens <= 0`` disables the - # ``reasoning`` extra_body fragment even on an Anthropic route. - # Lets us silence reasoning without dropping the SDK path's budget. + # ``reasoning`` extra_body fragment on ANY reasoning route (Anthropic + # or Kimi). Lets us silence reasoning without dropping the SDK + # path's budget. assert reasoning_extra_body("anthropic/claude-sonnet-4-6", 0) is None assert reasoning_extra_body("anthropic/claude-sonnet-4-6", -1) is None + assert reasoning_extra_body("moonshotai/kimi-k2.6", 0) is None class TestBaselineReasoningEmitter: diff --git a/autogpt_platform/backend/backend/copilot/baseline/service.py b/autogpt_platform/backend/backend/copilot/baseline/service.py index f87ec05390..5c7ef9a1a4 100644 --- a/autogpt_platform/backend/backend/copilot/baseline/service.py +++ b/autogpt_platform/backend/backend/copilot/baseline/service.py @@ -317,14 +317,17 @@ def _filter_tools_by_permissions( def _resolve_baseline_model(tier: CopilotLlmModel | None) -> str: """Pick the model for the baseline path based on the per-request tier. - The baseline (fast) and SDK (extended thinking) paths now share the - same tier-based model resolution — only the *path* differs between - "fast" and "extended_thinking". ``'advanced'`` → Opus; - ``'standard'`` / ``None`` → the config default (Sonnet). 
+ Baseline diverges from SDK on the ``'standard'`` / ``None`` tier: + baseline uses :attr:`ChatConfig.fast_model` (Kimi K2.6 by default, a + cheaper provider with the same OpenRouter ``reasoning`` extension), + while SDK stays on :attr:`ChatConfig.model` because the Claude Agent + SDK CLI only speaks to Anthropic endpoints. ``'advanced'`` still + resolves to :attr:`ChatConfig.advanced_model` (Opus) on both paths — + there's no Kimi equivalent at the top tier. """ - from backend.copilot.service import resolve_chat_model - - return resolve_chat_model(tier) + if tier == "advanced": + return config.advanced_model + return config.fast_model @dataclass diff --git a/autogpt_platform/backend/backend/copilot/baseline/service_unit_test.py b/autogpt_platform/backend/backend/copilot/baseline/service_unit_test.py index 4092206786..a81b292fd8 100644 --- a/autogpt_platform/backend/backend/copilot/baseline/service_unit_test.py +++ b/autogpt_platform/backend/backend/copilot/baseline/service_unit_test.py @@ -1329,6 +1329,16 @@ class TestApplyPromptCacheMarkers: assert not _is_anthropic_model("xai/grok-4") assert not _is_anthropic_model("meta-llama/llama-3.3-70b-instruct") + def test_is_anthropic_model_rejects_kimi_routes(self): + """Regression guard: Kimi K2.6 is a reasoning route (reasoning + extra_body is sent) but NOT an Anthropic route — Moonshot does + its own auto prompt caching, so ``cache_control`` markers must + NOT be applied. 
OpenRouter silently drops them today, but if + they ever start failing fast we'd want the gate tight.""" + assert not _is_anthropic_model("moonshotai/kimi-k2.6") + assert not _is_anthropic_model("moonshotai/kimi-k2-thinking") + assert not _is_anthropic_model("kimi-k2-instruct") + def test_cache_control_uses_configured_ttl(self, monkeypatch): """TTL comes from ChatConfig.baseline_prompt_cache_ttl — defaults to 1h so the static prefix (system + tools) stays warm across @@ -1754,7 +1764,7 @@ class TestBaselineReasoningStreaming: @pytest.mark.asyncio async def test_reasoning_param_absent_on_non_anthropic_routes(self): - """Non-Anthropic routes (e.g. OpenAI) must not receive ``reasoning``.""" + """Non-reasoning routes (e.g. OpenAI) must not receive ``reasoning``.""" state = _BaselineStreamState(model="openai/gpt-4o") mock_client = MagicMock() @@ -1775,6 +1785,54 @@ class TestBaselineReasoningStreaming: extra_body = mock_client.chat.completions.create.call_args[1]["extra_body"] assert "reasoning" not in extra_body + @pytest.mark.asyncio + async def test_kimi_route_sends_reasoning_but_no_cache_control(self): + """Kimi K2.6 is the default fast_model and sends ``reasoning`` via + OpenRouter's unified extension. 
It must NOT receive ``cache_control`` + markers or the ``anthropic-beta`` header — Moonshot uses its own + auto-caching and those Anthropic-only fields would either get + silently dropped or (worst case) 400 on a future provider change.""" + state = _BaselineStreamState(model="moonshotai/kimi-k2.6") + + mock_client = MagicMock() + mock_client.chat.completions.create = AsyncMock( + return_value=_make_stream_mock() + ) + + with patch( + "backend.copilot.baseline.service._get_openai_client", + return_value=mock_client, + ): + await _baseline_llm_caller( + messages=[ + {"role": "system", "content": "you are a helpful assistant"}, + {"role": "user", "content": "hi"}, + ], + tools=[ + { + "type": "function", + "function": {"name": "echo", "parameters": {}}, + } + ], + state=state, + ) + + call_kwargs = mock_client.chat.completions.create.call_args[1] + extra_body = call_kwargs["extra_body"] + # Reasoning param on — the whole point of picking Kimi is the + # cheap-but-still-reasoning-capable path. + assert "reasoning" in extra_body + assert extra_body["reasoning"]["max_tokens"] > 0 + # Anthropic-only fields stay off. 
+ assert "extra_headers" not in call_kwargs + sys_msg = call_kwargs["messages"][0] + sys_content = sys_msg.get("content") + if isinstance(sys_content, list): + assert all("cache_control" not in block for block in sys_content) + tools = call_kwargs.get("tools", []) + for t in tools: + assert "cache_control" not in t + @pytest.mark.asyncio async def test_reasoning_only_stream_still_closes_block(self): """Regression: a stream with only reasoning (no text, no tool_call) diff --git a/autogpt_platform/backend/backend/copilot/baseline/transcript_integration_test.py b/autogpt_platform/backend/backend/copilot/baseline/transcript_integration_test.py index 8d6fb50a53..b0ce4320f3 100644 --- a/autogpt_platform/backend/backend/copilot/baseline/transcript_integration_test.py +++ b/autogpt_platform/backend/backend/copilot/baseline/transcript_integration_test.py @@ -63,21 +63,40 @@ def _make_session_messages(*roles: str) -> list[ChatMessage]: class TestResolveBaselineModel: - """Baseline model resolution honours the per-request tier toggle.""" + """Baseline model resolution honours the per-request tier toggle. + + The baseline resolver diverged from SDK's ``resolve_chat_model`` in this + PR: ``standard`` / ``None`` on baseline now picks ``config.fast_model`` + (Kimi K2.6 by default) instead of ``config.model`` (Sonnet), because + baseline speaks plain OpenAI-compat and can route anywhere cheaper, + while SDK still needs an Anthropic endpoint for the Claude Agent SDK + CLI. ``advanced`` remains ``config.advanced_model`` (Opus) on both + paths — there's no Kimi equivalent at the top tier. 
+ """ def test_advanced_tier_selects_advanced_model(self): assert _resolve_baseline_model("advanced") == config.advanced_model - def test_standard_tier_selects_default_model(self): - assert _resolve_baseline_model("standard") == config.model + def test_standard_tier_selects_fast_model(self): + assert _resolve_baseline_model("standard") == config.fast_model - def test_none_tier_selects_default_model(self): - """Baseline users without a tier MUST keep the default (standard).""" - assert _resolve_baseline_model(None) == config.model + def test_none_tier_selects_fast_model(self): + """Baseline users without a tier get the cheap fast default.""" + assert _resolve_baseline_model(None) == config.fast_model + + def test_fast_model_default_is_kimi(self): + """Sanity: Kimi K2.6 is the shipped default cheap reasoning route.""" + assert config.fast_model == "moonshotai/kimi-k2.6" + + def test_sdk_and_baseline_standard_defaults_diverge(self): + """The whole point of the split: baseline cheap (Kimi) vs SDK + Anthropic-only (Sonnet). If this equality ever flips they've + re-collapsed and someone lost the cost savings.""" + assert config.model != config.fast_model def test_standard_and_advanced_models_differ(self): """Advanced tier defaults to a different (Opus) model than standard.""" - assert config.model != config.advanced_model + assert config.fast_model != config.advanced_model class TestLoadPriorTranscript: diff --git a/autogpt_platform/backend/backend/copilot/config.py b/autogpt_platform/backend/backend/copilot/config.py index 1bb63fe1da..f5f04e86df 100644 --- a/autogpt_platform/backend/backend/copilot/config.py +++ b/autogpt_platform/backend/backend/copilot/config.py @@ -33,9 +33,24 @@ class ChatConfig(BaseSettings): # "advanced" picks the model inside that path. model: str = Field( default="anthropic/claude-sonnet-4-6", - description="Model used for the 'standard' tier (Sonnet by default). " - "Applies to both baseline (fast) and SDK (extended thinking) paths. 
" - "Override via CHAT_MODEL env var.", + description="Model used for the 'standard' tier in the SDK " + "(extended thinking) path (Sonnet by default). The baseline / fast " + "path reads ``fast_model`` instead so the two paths can evolve " + "independently (cheaper provider on baseline, Anthropic-only CLI on " + "SDK). Override via CHAT_MODEL env var.", + ) + fast_model: str = Field( + default="moonshotai/kimi-k2.6", + description="Model used for the 'standard' / ``None`` tier on the " + "baseline (fast) path. Kimi K2.6 by default: ~5x cheaper input and " + "~5.4x cheaper output than Sonnet, SWE-Bench Verified parity with " + "Opus, and OpenRouter advertises the ``reasoning`` + " + "``include_reasoning`` extension params on the Moonshot endpoints — " + "so our existing baseline reasoning plumbing lights up without " + "provider-specific code. Fall back to the Anthropic route by " + "setting ``CHAT_FAST_MODEL=anthropic/claude-sonnet-4-6`` (then " + "``cache_control`` breakpoints reactivate via " + "``_is_anthropic_model``).", ) advanced_model: str = Field( default="anthropic/claude-opus-4-7", From 95d3679e1405b474bac8537afd028913a39c3b35 Mon Sep 17 00:00:00 2001 From: majdyz Date: Tue, 21 Apr 2026 21:58:43 +0700 Subject: [PATCH 02/25] test(backend/copilot): assert Field defaults, not env-backed singleton Address coderabbit[bot] review comment on PR #12871: three resolver tests read `config.fast_model`, `config.model`, `config.advanced_model` from the env-backed singleton, which fails in CI whenever an operator sets `CHAT_FAST_MODEL=anthropic/claude-sonnet-4-6` (the documented rollback path). Swap to `ChatConfig.model_fields[...].default` so the assertion pins the shipped default regardless of env overrides. 
--- .../baseline/transcript_integration_test.py | 35 +++++++++++++++---- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/autogpt_platform/backend/backend/copilot/baseline/transcript_integration_test.py b/autogpt_platform/backend/backend/copilot/baseline/transcript_integration_test.py index b0ce4320f3..4a9ff2c0e5 100644 --- a/autogpt_platform/backend/backend/copilot/baseline/transcript_integration_test.py +++ b/autogpt_platform/backend/backend/copilot/baseline/transcript_integration_test.py @@ -85,18 +85,39 @@ class TestResolveBaselineModel: assert _resolve_baseline_model(None) == config.fast_model def test_fast_model_default_is_kimi(self): - """Sanity: Kimi K2.6 is the shipped default cheap reasoning route.""" - assert config.fast_model == "moonshotai/kimi-k2.6" + """Sanity: Kimi K2.6 is the shipped default cheap reasoning route. + + Asserts the declared ``Field`` default — env-independent — so a + deploy-time ``CHAT_FAST_MODEL`` rollback override doesn't fail CI + while still pinning the shipped default. + """ + from backend.copilot.config import ChatConfig + + assert ChatConfig.model_fields["fast_model"].default == "moonshotai/kimi-k2.6" def test_sdk_and_baseline_standard_defaults_diverge(self): """The whole point of the split: baseline cheap (Kimi) vs SDK - Anthropic-only (Sonnet). If this equality ever flips they've - re-collapsed and someone lost the cost savings.""" - assert config.model != config.fast_model + Anthropic-only (Sonnet). If the shipped defaults ever collapse + to the same value someone lost the cost savings. 
Checked against + ``Field`` defaults, not the env-backed singleton.""" + from backend.copilot.config import ChatConfig + + assert ( + ChatConfig.model_fields["model"].default + != ChatConfig.model_fields["fast_model"].default + ) def test_standard_and_advanced_models_differ(self): - """Advanced tier defaults to a different (Opus) model than standard.""" - assert config.fast_model != config.advanced_model + """Advanced tier defaults to a different (Opus) model than standard. + + Checked against declared ``Field`` defaults so operator env + overrides don't flake the test.""" + from backend.copilot.config import ChatConfig + + assert ( + ChatConfig.model_fields["fast_model"].default + != ChatConfig.model_fields["advanced_model"].default + ) class TestLoadPriorTranscript: From fce7a59713e1d860a57ebc777d7394b6f0ef1af8 Mon Sep 17 00:00:00 2001 From: majdyz Date: Tue, 21 Apr 2026 22:23:29 +0700 Subject: [PATCH 03/25] refactor(backend/copilot): split model config into (path, tier) 2x2 matrix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per PR review: `model` and `advanced_model` were implicitly shared between baseline (fast) and SDK (extended_thinking) paths, but the paths have different hard constraints (baseline can route to any OpenRouter provider; SDK needs Anthropic endpoints). Replace the ambiguous 2-field schema with an explicit 2x2 of (path × tier). New fields: * `fast_standard_model` — baseline standard tier (Kimi K2.6) * `fast_advanced_model` — baseline advanced tier (Opus by default; same as SDK advanced so the top tier is a clean A/B across paths. Kimi K2-Thinking evaluated and deferred — it's 6 months older than K2.6, ~9pp behind on SWE-Bench Verified, ~23pp behind on BrowseComp, and text-only.) 
* `thinking_standard_model` — SDK standard tier (Sonnet) * `thinking_advanced_model` — SDK advanced tier (Opus) Backward-compat env var aliases: `CHAT_MODEL` → thinking_standard, `CHAT_ADVANCED_MODEL` → thinking_advanced, `CHAT_FAST_MODEL` → fast_standard. `populate_by_name=True` so ChatConfig(field=...) kwargs work alongside the alias names. Resolver split: `resolve_chat_model` (SDK) → thinking_*; `_resolve_baseline_model` (baseline) → fast_*. All call sites in sdk/service.py updated; test constructors migrated to new names. --- .../backend/copilot/baseline/service.py | 18 ++-- .../baseline/transcript_integration_test.py | 84 ++++++++++------- .../backend/backend/copilot/config.py | 93 +++++++++++++------ .../backend/backend/copilot/sdk/service.py | 17 ++-- .../copilot/sdk/service_helpers_test.py | 7 +- .../backend/copilot/sdk/service_test.py | 12 +-- .../backend/backend/copilot/service.py | 17 ++-- 7 files changed, 152 insertions(+), 96 deletions(-) diff --git a/autogpt_platform/backend/backend/copilot/baseline/service.py b/autogpt_platform/backend/backend/copilot/baseline/service.py index 5c7ef9a1a4..74ac8b9005 100644 --- a/autogpt_platform/backend/backend/copilot/baseline/service.py +++ b/autogpt_platform/backend/backend/copilot/baseline/service.py @@ -317,17 +317,17 @@ def _filter_tools_by_permissions( def _resolve_baseline_model(tier: CopilotLlmModel | None) -> str: """Pick the model for the baseline path based on the per-request tier. - Baseline diverges from SDK on the ``'standard'`` / ``None`` tier: - baseline uses :attr:`ChatConfig.fast_model` (Kimi K2.6 by default, a - cheaper provider with the same OpenRouter ``reasoning`` extension), - while SDK stays on :attr:`ChatConfig.model` because the Claude Agent - SDK CLI only speaks to Anthropic endpoints. ``'advanced'`` still - resolves to :attr:`ChatConfig.advanced_model` (Opus) on both paths — - there's no Kimi equivalent at the top tier. 
+ Baseline resolves independently of SDK via the ``fast_*_model`` cells + of the (path, tier) matrix. ``'standard'`` / ``None`` picks Kimi + K2.6 by default (cheap + OpenRouter ``reasoning`` support); + ``'advanced'`` picks Opus by default so the advanced tier is a clean + A/B against the SDK advanced tier — same model, different path — + isolating reasoning-wire + cache differences from model capability. + Both defaults are overridable per ``CHAT_FAST_*_MODEL`` env vars. """ if tier == "advanced": - return config.advanced_model - return config.fast_model + return config.fast_advanced_model + return config.fast_standard_model @dataclass diff --git a/autogpt_platform/backend/backend/copilot/baseline/transcript_integration_test.py b/autogpt_platform/backend/backend/copilot/baseline/transcript_integration_test.py index 4a9ff2c0e5..3d573e87b6 100644 --- a/autogpt_platform/backend/backend/copilot/baseline/transcript_integration_test.py +++ b/autogpt_platform/backend/backend/copilot/baseline/transcript_integration_test.py @@ -65,58 +65,70 @@ def _make_session_messages(*roles: str) -> list[ChatMessage]: class TestResolveBaselineModel: """Baseline model resolution honours the per-request tier toggle. - The baseline resolver diverged from SDK's ``resolve_chat_model`` in this - PR: ``standard`` / ``None`` on baseline now picks ``config.fast_model`` - (Kimi K2.6 by default) instead of ``config.model`` (Sonnet), because - baseline speaks plain OpenAI-compat and can route anywhere cheaper, - while SDK still needs an Anthropic endpoint for the Claude Agent SDK - CLI. ``advanced`` remains ``config.advanced_model`` (Opus) on both - paths — there's no Kimi equivalent at the top tier. + Baseline reads the ``fast_*_model`` cells of the (path, tier) matrix + and never falls through to the SDK-side ``thinking_*_model`` cells. 
+ Default routing: + - ``standard`` / ``None`` → ``config.fast_standard_model`` (Kimi K2.6) + - ``advanced`` → ``config.fast_advanced_model`` (Opus — same as SDK's + advanced tier, so the advanced A/B isolates path differences) """ - def test_advanced_tier_selects_advanced_model(self): - assert _resolve_baseline_model("advanced") == config.advanced_model + def test_advanced_tier_selects_fast_advanced_model(self): + assert _resolve_baseline_model("advanced") == config.fast_advanced_model - def test_standard_tier_selects_fast_model(self): - assert _resolve_baseline_model("standard") == config.fast_model + def test_standard_tier_selects_fast_standard_model(self): + assert _resolve_baseline_model("standard") == config.fast_standard_model - def test_none_tier_selects_fast_model(self): - """Baseline users without a tier get the cheap fast default.""" - assert _resolve_baseline_model(None) == config.fast_model + def test_none_tier_selects_fast_standard_model(self): + """Baseline users without a tier get the cheap fast-standard default.""" + assert _resolve_baseline_model(None) == config.fast_standard_model - def test_fast_model_default_is_kimi(self): - """Sanity: Kimi K2.6 is the shipped default cheap reasoning route. + def test_fast_standard_default_is_kimi(self): + """Shipped default: Kimi K2.6 on the baseline standard cell. Asserts the declared ``Field`` default — env-independent — so a - deploy-time ``CHAT_FAST_MODEL`` rollback override doesn't fail CI - while still pinning the shipped default. + deploy-time ``CHAT_FAST_STANDARD_MODEL`` rollback override + doesn't fail CI while still pinning the shipped default. """ from backend.copilot.config import ChatConfig - assert ChatConfig.model_fields["fast_model"].default == "moonshotai/kimi-k2.6" - - def test_sdk_and_baseline_standard_defaults_diverge(self): - """The whole point of the split: baseline cheap (Kimi) vs SDK - Anthropic-only (Sonnet). 
If the shipped defaults ever collapse - to the same value someone lost the cost savings. Checked against - ``Field`` defaults, not the env-backed singleton.""" - from backend.copilot.config import ChatConfig - assert ( - ChatConfig.model_fields["model"].default - != ChatConfig.model_fields["fast_model"].default + ChatConfig.model_fields["fast_standard_model"].default + == "moonshotai/kimi-k2.6" ) - def test_standard_and_advanced_models_differ(self): - """Advanced tier defaults to a different (Opus) model than standard. - - Checked against declared ``Field`` defaults so operator env - overrides don't flake the test.""" + def test_fast_advanced_default_is_opus(self): + """Shipped default: Opus on the baseline advanced cell — mirrors + the SDK advanced cell so the advanced-tier A/B stays clean + (same model, different path).""" from backend.copilot.config import ChatConfig assert ( - ChatConfig.model_fields["fast_model"].default - != ChatConfig.model_fields["advanced_model"].default + ChatConfig.model_fields["fast_advanced_model"].default + == "anthropic/claude-opus-4-7" + ) + + def test_standard_cells_diverge_across_paths(self): + """The whole point of the split: baseline cheap (Kimi) vs SDK + Anthropic-only (Sonnet). If the shipped standard defaults ever + collapse to the same value someone lost the cost savings. + Checked against ``Field`` defaults, not the env-backed singleton.""" + from backend.copilot.config import ChatConfig + + assert ( + ChatConfig.model_fields["thinking_standard_model"].default + != ChatConfig.model_fields["fast_standard_model"].default + ) + + def test_standard_and_advanced_cells_differ_on_fast(self): + """Advanced tier defaults to a different model than standard on + the baseline path. 
Checked against declared ``Field`` defaults + so operator env overrides don't flake the test.""" + from backend.copilot.config import ChatConfig + + assert ( + ChatConfig.model_fields["fast_standard_model"].default + != ChatConfig.model_fields["fast_advanced_model"].default ) diff --git a/autogpt_platform/backend/backend/copilot/config.py b/autogpt_platform/backend/backend/copilot/config.py index f5f04e86df..b6d6dcbf92 100644 --- a/autogpt_platform/backend/backend/copilot/config.py +++ b/autogpt_platform/backend/backend/copilot/config.py @@ -3,7 +3,7 @@ import os from typing import Literal -from pydantic import Field, field_validator +from pydantic import AliasChoices, Field, field_validator from pydantic_settings import BaseSettings from backend.util.clients import OPENROUTER_BASE_URL @@ -17,8 +17,8 @@ from backend.util.clients import OPENROUTER_BASE_URL CopilotMode = Literal["fast", "extended_thinking"] # Per-request model tier set by the frontend model toggle. -# 'standard' uses ``ChatConfig.model`` (Sonnet by default). -# 'advanced' uses ``ChatConfig.advanced_model`` (Opus by default). +# 'standard' picks the cheaper everyday model for the active path. +# 'advanced' picks the premium model for the active path (Opus today). # None means no preference — falls through to LD per-user targeting, then config. # Using tier names instead of model names keeps the contract model-agnostic. CopilotLlmModel = Literal["standard", "advanced"] @@ -27,36 +27,66 @@ CopilotLlmModel = Literal["standard", "advanced"] class ChatConfig(BaseSettings): """Configuration for the chat system.""" - # Chat model tiers — applied orthogonally to the path (fast=baseline vs - # extended_thinking=SDK). The "fast" vs "extended_thinking" toggle picks - # which code path runs (no reasoning / heavy SDK); "standard" vs - # "advanced" picks the model inside that path. 
- model: str = Field( - default="anthropic/claude-sonnet-4-6", - description="Model used for the 'standard' tier in the SDK " - "(extended thinking) path (Sonnet by default). The baseline / fast " - "path reads ``fast_model`` instead so the two paths can evolve " - "independently (cheaper provider on baseline, Anthropic-only CLI on " - "SDK). Override via CHAT_MODEL env var.", - ) - fast_model: str = Field( + # Chat model tiers — a 2×2 of (path, tier). ``path`` = ``CopilotMode`` + # (``"fast"`` → baseline OpenAI-compat / any OpenRouter model; + # ``"extended_thinking"`` → Claude Agent SDK, Anthropic-only CLI). + # ``tier`` = ``CopilotLlmModel`` (``"standard"`` / ``"advanced"``). + # Each cell has its own config so the two paths can evolve + # independently (cheap provider on baseline, Anthropic on SDK) at each + # tier without conflating one path's needs with the other's constraint. + # + # Historical env var names (``CHAT_MODEL`` / ``CHAT_ADVANCED_MODEL`` / + # ``CHAT_FAST_MODEL``) are preserved via ``validation_alias`` so + # existing deployments continue to override the same effective cell. + fast_standard_model: str = Field( default="moonshotai/kimi-k2.6", - description="Model used for the 'standard' / ``None`` tier on the " - "baseline (fast) path. Kimi K2.6 by default: ~5x cheaper input and " - "~5.4x cheaper output than Sonnet, SWE-Bench Verified parity with " - "Opus, and OpenRouter advertises the ``reasoning`` + " - "``include_reasoning`` extension params on the Moonshot endpoints — " - "so our existing baseline reasoning plumbing lights up without " - "provider-specific code. Fall back to the Anthropic route by " - "setting ``CHAT_FAST_MODEL=anthropic/claude-sonnet-4-6`` (then " + validation_alias=AliasChoices( + "CHAT_FAST_STANDARD_MODEL", + "CHAT_FAST_MODEL", + ), + description="Baseline path, 'standard' / ``None`` tier. 
Kimi K2.6 " + "by default: ~5x cheaper input and ~5.4x cheaper output than Sonnet, " + "SWE-Bench Verified parity with Opus, and OpenRouter advertises the " + "``reasoning`` + ``include_reasoning`` extension params on the " + "Moonshot endpoints — so the baseline reasoning plumbing lights up " + "without provider-specific code. Roll back to the Anthropic route " + "via ``CHAT_FAST_STANDARD_MODEL=anthropic/claude-sonnet-4-6`` (then " "``cache_control`` breakpoints reactivate via " "``_is_anthropic_model``).", ) - advanced_model: str = Field( + fast_advanced_model: str = Field( default="anthropic/claude-opus-4-7", - description="Model used for the 'advanced' tier (Opus by default). " - "Applies to both baseline (fast) and SDK (extended thinking) paths. " - "Override via CHAT_ADVANCED_MODEL env var.", + description="Baseline path, 'advanced' tier. Opus by default so " + "the advanced tier is a clean A/B vs the SDK advanced tier: same " + "model, different path — isolates the reasoning-wire + cache " + "differences from model capability differences. Kimi K2-Thinking " + "(the reasoning-native sibling) benchmarks ~9pp behind K2.6 on " + "SWE-Bench Verified and ~23pp behind on BrowseComp, is text-only, " + "and was published 6 months before K2.6 — not a fit for the " + "advanced tier today. Override via ``CHAT_FAST_ADVANCED_MODEL``.", + ) + thinking_standard_model: str = Field( + default="anthropic/claude-sonnet-4-6", + validation_alias=AliasChoices( + "CHAT_THINKING_STANDARD_MODEL", + "CHAT_MODEL", + ), + description="SDK (extended-thinking) path, 'standard' / ``None`` " + "tier. Sonnet by default: the Claude Agent SDK CLI only speaks to " + "Anthropic endpoints, so the standard SDK tier has to stay on an " + "Anthropic model regardless of what the baseline path runs. 
" + "Override via ``CHAT_THINKING_STANDARD_MODEL`` (legacy " + "``CHAT_MODEL`` still honored).", + ) + thinking_advanced_model: str = Field( + default="anthropic/claude-opus-4-7", + validation_alias=AliasChoices( + "CHAT_THINKING_ADVANCED_MODEL", + "CHAT_ADVANCED_MODEL", + ), + description="SDK (extended-thinking) path, 'advanced' tier. Opus " + "by default. Override via ``CHAT_THINKING_ADVANCED_MODEL`` " + "(legacy ``CHAT_ADVANCED_MODEL`` still honored).", ) title_model: str = Field( default="openai/gpt-4o-mini", @@ -441,3 +471,10 @@ class ChatConfig(BaseSettings): env_file = ".env" env_file_encoding = "utf-8" extra = "ignore" # Ignore extra environment variables + # Accept both the Python attribute name and the validation_alias when + # constructing a ``ChatConfig`` directly (e.g. in tests passing + # ``thinking_standard_model=...``). Without this, pydantic only + # accepts the alias names (``CHAT_THINKING_STANDARD_MODEL`` env) and + # rejects field-name kwargs — breaking ``ChatConfig(field=...)`` in + # every test that constructs a config. + populate_by_name = True diff --git a/autogpt_platform/backend/backend/copilot/sdk/service.py b/autogpt_platform/backend/backend/copilot/sdk/service.py index 325d4271ac..0a3c126dbe 100644 --- a/autogpt_platform/backend/backend/copilot/sdk/service.py +++ b/autogpt_platform/backend/backend/copilot/sdk/service.py @@ -446,7 +446,9 @@ async def _reduce_context( # useful for the eventual upload_transcript call that seeds future turns. if transcript_content and not tried_compaction: compacted = await compact_transcript( - transcript_content, model=config.model, log_prefix=log_prefix + transcript_content, + model=config.thinking_standard_model, + log_prefix=log_prefix, ) if ( compacted @@ -696,7 +698,7 @@ def _resolve_sdk_model() -> str | None: """Resolve the model name for the Claude Agent SDK CLI. Uses `config.claude_agent_model` if set, otherwise derives from - `config.model` via :func:`_normalize_model_name`. 
+ `config.thinking_standard_model` via :func:`_normalize_model_name`. When `use_claude_code_subscription` is enabled and no explicit `claude_agent_model` is set, returns `None` so the CLI uses the @@ -706,7 +708,7 @@ def _resolve_sdk_model() -> str | None: return config.claude_agent_model if config.use_claude_code_subscription: return None - return _normalize_model_name(config.model) + return _normalize_model_name(config.thinking_standard_model) def _resolve_fallback_model() -> str | None: @@ -735,7 +737,7 @@ async def _resolve_sdk_model_for_request( cost (reported by the SDK) already reflects model-pricing differences. """ if model == "advanced": - sdk_model = _normalize_model_name(config.advanced_model) + sdk_model = _normalize_model_name(config.thinking_advanced_model) logger.info( "[SDK] [%s] Per-request model override: advanced (%s)", session_id[:12] if session_id else "?", @@ -1187,7 +1189,10 @@ async def _compress_messages( try: result = await _run_compression( - messages_dict, config.model, "[SDK]", target_tokens=target_tokens + messages_dict, + config.thinking_standard_model, + "[SDK]", + target_tokens=target_tokens, ) except Exception as exc: # Guard against timeouts or unexpected errors in compression — @@ -3810,7 +3815,7 @@ async def stream_chat_completion_sdk( cache_creation_tokens=turn_cache_creation_tokens, log_prefix=log_prefix, cost_usd=turn_cost_usd, - model=sdk_model or config.model, + model=sdk_model or config.thinking_standard_model, provider="anthropic", ) diff --git a/autogpt_platform/backend/backend/copilot/sdk/service_helpers_test.py b/autogpt_platform/backend/backend/copilot/sdk/service_helpers_test.py index 3b919c6036..4eb5bc4ac2 100644 --- a/autogpt_platform/backend/backend/copilot/sdk/service_helpers_test.py +++ b/autogpt_platform/backend/backend/copilot/sdk/service_helpers_test.py @@ -364,9 +364,10 @@ class TestNormalizeModelName: """Unit tests for the model-name normalisation helper. 
The per-request model toggle calls _normalize_model_name with either - ``"anthropic/claude-opus-4-6"`` (for 'advanced') or ``config.model`` (for - 'standard'). These tests verify the OpenRouter/provider-prefix stripping - that keeps the value compatible with the Claude CLI. + ``config.thinking_advanced_model`` (for 'advanced') or + ``config.thinking_standard_model`` (for 'standard'). These tests verify + the OpenRouter/provider-prefix stripping that keeps the value compatible + with the Claude CLI. """ def test_strips_anthropic_prefix(self): diff --git a/autogpt_platform/backend/backend/copilot/sdk/service_test.py b/autogpt_platform/backend/backend/copilot/sdk/service_test.py index d47f67252a..619fce3017 100644 --- a/autogpt_platform/backend/backend/copilot/sdk/service_test.py +++ b/autogpt_platform/backend/backend/copilot/sdk/service_test.py @@ -395,7 +395,7 @@ class TestResolveSdkModel: from backend.copilot import config as cfg_mod cfg = cfg_mod.ChatConfig( - model="anthropic/claude-opus-4.6", + thinking_standard_model="anthropic/claude-opus-4.6", claude_agent_model=None, use_openrouter=True, api_key="or-key", @@ -412,7 +412,7 @@ class TestResolveSdkModel: from backend.copilot import config as cfg_mod cfg = cfg_mod.ChatConfig( - model="anthropic/claude-opus-4.6", + thinking_standard_model="anthropic/claude-opus-4.6", claude_agent_model=None, use_openrouter=False, api_key=None, @@ -430,7 +430,7 @@ class TestResolveSdkModel: from backend.copilot import config as cfg_mod cfg = cfg_mod.ChatConfig( - model="anthropic/claude-opus-4.6", + thinking_standard_model="anthropic/claude-opus-4.6", claude_agent_model=None, use_openrouter=True, api_key=None, @@ -447,7 +447,7 @@ class TestResolveSdkModel: from backend.copilot import config as cfg_mod cfg = cfg_mod.ChatConfig( - model="anthropic/claude-opus-4.6", + thinking_standard_model="anthropic/claude-opus-4.6", claude_agent_model="claude-sonnet-4-5-20250514", use_openrouter=True, api_key="or-key", @@ -462,7 +462,7 @@ class 
TestResolveSdkModel: from backend.copilot import config as cfg_mod cfg = cfg_mod.ChatConfig( - model="anthropic/claude-opus-4.6", + thinking_standard_model="anthropic/claude-opus-4.6", claude_agent_model=None, use_openrouter=False, api_key=None, @@ -477,7 +477,7 @@ class TestResolveSdkModel: from backend.copilot import config as cfg_mod cfg = cfg_mod.ChatConfig( - model="claude-opus-4.6", + thinking_standard_model="claude-opus-4.6", claude_agent_model=None, use_openrouter=False, api_key=None, diff --git a/autogpt_platform/backend/backend/copilot/service.py b/autogpt_platform/backend/backend/copilot/service.py index 4ce9c285be..4c2a9c5507 100644 --- a/autogpt_platform/backend/backend/copilot/service.py +++ b/autogpt_platform/backend/backend/copilot/service.py @@ -42,17 +42,18 @@ settings = Settings() def resolve_chat_model(tier: CopilotLlmModel | None) -> str: - """Return the configured OpenRouter model string for the given tier. + """Return the configured SDK model for the given tier. - Shared by the baseline (fast) and SDK (extended thinking) paths so - both honor the same standard/advanced env-var configuration. ``None`` - and ``'standard'`` fall through to ``config.model``; ``'advanced'`` - uses ``config.advanced_model``. Keep this flat — if a third tier - shows up later, extend here and both paths pick it up for free. + The SDK (extended-thinking) path is Anthropic-only — the Claude Agent + SDK CLI refuses non-Anthropic endpoints — so both SDK tiers resolve + to the ``thinking_*_model`` cells. Baseline has its own resolver + (``_resolve_baseline_model``) that reads the ``fast_*_model`` cells; + the two paths diverge deliberately at the config layer so a cheaper + baseline provider can't break SDK, or vice versa. 
""" if tier == "advanced": - return config.advanced_model - return config.model + return config.thinking_advanced_model + return config.thinking_standard_model _client: LangfuseAsyncOpenAI | None = None From da5420fa07a172a3add5933d320be7937be3afa4 Mon Sep 17 00:00:00 2001 From: majdyz Date: Tue, 21 Apr 2026 22:39:04 +0700 Subject: [PATCH 04/25] fix(backend/copilot): coalesce reasoning deltas to unfreeze Kimi streams MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Observed symptom: copilot page frozen for ~700 s on a session using the new Kimi K2.6 default. Redis `XLEN chat:stream:...` showed 4,677 reasoning-delta chunks in a single turn vs ~28 for peer Sonnet sessions. Each chunk was one Redis xadd + one SSE frame + one React re-render of the non-virtualised chat list, which paint-stormed the main thread until the stream ended. OpenRouter's Kimi endpoint tokenises reasoning at a much finer grain than Anthropic, so the 1:1 chunk→`StreamReasoningDelta` mapping in BaselineReasoningEmitter blew up on the wire while the same code was fine for Sonnet. Fix: coalesce `StreamReasoningDelta` emissions in the emitter. * First chunk in a block still emits Start + Delta atomically so the Reasoning collapse renders immediately. * Subsequent chunks buffer into `_pending_delta` and flush once either the char-size (`_COALESCE_MIN_CHARS=32`) or time (`_COALESCE_MAX_INTERVAL_MS=40`) threshold trips. `close()` always drains the tail before emitting `StreamReasoningEnd`. * DB persistence stays per-chunk — `_current_row.content` updates on every delta independent of the coalesce window, so a crash mid-turn still persists the full reasoning-so-far. * Thresholds are `__init__` kwargs so tests can disable coalescing for deterministic state-machine assertions. Net effect: ~4,700 → ~150 events per turn (30x), well under the browser's paint-storm threshold; reasoning still appears live at ~25 Hz (40 ms window) which is below human perception. 
Pre-existing issues flagged for follow-up (out of scope — the freeze is
gone without them):

* `ChatMessagesContainer` has no React.memo per message and no list
  virtualisation — a very long session still re-renders every prior
  message on each new chunk.
* `routes.py:1163-1171` replays from `0-0` with `count=1000` on every SSE
  reconnect (6 reconnects observed), duplicating up to 6,000 chunks.
  Proper Last-Event-ID support requires threading Redis stream message
  IDs through every SSE event + a frontend handshake — material refactor
  deferred to a dedicated PR.
---
 .../backend/copilot/baseline/reasoning.py     |  89 +++++++++++++--
 .../copilot/baseline/reasoning_test.py        | 107 +++++++++++++++++-
 2 files changed, 183 insertions(+), 13 deletions(-)

diff --git a/autogpt_platform/backend/backend/copilot/baseline/reasoning.py b/autogpt_platform/backend/backend/copilot/baseline/reasoning.py
index 0e408e17cc..e5f941b805 100644
--- a/autogpt_platform/backend/backend/copilot/baseline/reasoning.py
+++ b/autogpt_platform/backend/backend/copilot/baseline/reasoning.py
@@ -23,6 +23,7 @@ This module keeps the wire-level concerns in one place:
 from __future__ import annotations
 
 import logging
+import time
 import uuid
 from typing import Any
 
@@ -42,6 +43,19 @@ logger = logging.getLogger(__name__)
 
 _VISIBLE_REASONING_TYPES = frozenset({"reasoning.text", "reasoning.summary"})
 
+# Coalescing thresholds for ``StreamReasoningDelta`` emission. OpenRouter's
+# Kimi K2.6 endpoint tokenises reasoning at a much finer grain than Anthropic
+# (~4,700 deltas per turn in one observed session, vs ~28 for Sonnet); without
+# coalescing, every chunk is one Redis ``xadd`` + one SSE frame + one React
+# re-render of the non-virtualised chat list, which paint-storms the browser
+# main thread and freezes the UI. Batching into ~32-char / ~40 ms windows
+# cuts the event rate ~30x while staying snappy enough that the Reasoning
+# collapse still feels live (well under the ~100 ms perceptual threshold).
+# Per-delta persistence to ``session.messages`` stays granular — we only +# coalesce the *wire* emission. +_COALESCE_MIN_CHARS = 32 +_COALESCE_MAX_INTERVAL_MS = 40.0 + class ReasoningDetail(BaseModel): """One entry in OpenRouter's ``reasoning_details`` list. @@ -195,11 +209,24 @@ class BaselineReasoningEmitter: def __init__( self, session_messages: list[ChatMessage] | None = None, + *, + coalesce_min_chars: int = _COALESCE_MIN_CHARS, + coalesce_max_interval_ms: float = _COALESCE_MAX_INTERVAL_MS, ) -> None: self._block_id: str = str(uuid.uuid4()) self._open: bool = False self._session_messages = session_messages self._current_row: ChatMessage | None = None + # Coalescing state — ``_pending_delta`` accumulates reasoning text + # between wire flushes. Providers like Kimi K2.6 emit very fine- + # grained chunks; batching them reduces Redis ``xadd`` + SSE + React + # re-render load by ~100x for equivalent text output. Tuning knobs + # are kwargs so tests can disable coalescing (``=0``) for + # deterministic event assertions. + self._coalesce_min_chars = coalesce_min_chars + self._coalesce_max_interval_ms = coalesce_max_interval_ms + self._pending_delta: str = "" + self._last_flush_monotonic: float = 0.0 @property def is_open(self) -> bool: @@ -210,39 +237,77 @@ class BaselineReasoningEmitter: Empty list when the chunk carries no reasoning payload, so this is safe to call on every chunk without guarding at the call site. - Persistence (when a session message list is attached) happens in - lockstep with emission so the row's content stays equal to the - concatenated deltas at every delta boundary. + + Persistence (when a session message list is attached) stays + per-delta so the DB row's content always equals the concatenation + of wire deltas at every chunk boundary, independent of the + coalescing window. Only the wire emission is batched. 
""" ext = OpenRouterDeltaExtension.from_delta(delta) text = ext.visible_text() if not text: return [] events: list[StreamBaseResponse] = [] + # First reasoning text in this block — emit Start + the first Delta + # atomically so the frontend Reasoning collapse renders immediately + # rather than waiting for the coalesce window to elapse. Subsequent + # chunks buffer into ``_pending_delta`` and only flush when the + # char/time thresholds trip. if not self._open: events.append(StreamReasoningStart(id=self._block_id)) + events.append(StreamReasoningDelta(id=self._block_id, delta=text)) self._open = True + self._last_flush_monotonic = time.monotonic() if self._session_messages is not None: - self._current_row = ChatMessage(role="reasoning", content="") + self._current_row = ChatMessage(role="reasoning", content=text) self._session_messages.append(self._current_row) - events.append(StreamReasoningDelta(id=self._block_id, delta=text)) + return events + + # Persist per-delta (no coalescing here — the session snapshot stays + # consistent at every chunk boundary, independent of the wire + # coalesce window). if self._current_row is not None: self._current_row.content = (self._current_row.content or "") + text + + self._pending_delta += text + if self._should_flush_pending(): + events.append( + StreamReasoningDelta(id=self._block_id, delta=self._pending_delta) + ) + self._pending_delta = "" + self._last_flush_monotonic = time.monotonic() return events + def _should_flush_pending(self) -> bool: + """Return True when the accumulated delta should be emitted now.""" + if not self._pending_delta: + return False + if len(self._pending_delta) >= self._coalesce_min_chars: + return True + elapsed_ms = (time.monotonic() - self._last_flush_monotonic) * 1000.0 + return elapsed_ms >= self._coalesce_max_interval_ms + def close(self) -> list[StreamBaseResponse]: """Emit ``StreamReasoningEnd`` for the open block (if any) and rotate. - Idempotent — returns ``[]`` when no block is open. 
The id rotation - guarantees the next reasoning block starts with a fresh id rather - than reusing one already closed on the wire. The persisted row is - not removed — it stays in ``session_messages`` as the durable - record of what was reasoned. + Idempotent — returns ``[]`` when no block is open. Drains any + still-buffered delta first so the frontend never loses tail text + from the coalesce window. The id rotation guarantees the next + reasoning block starts with a fresh id rather than reusing one + already closed on the wire. The persisted row is not removed — + it stays in ``session_messages`` as the durable record of what + was reasoned. """ if not self._open: return [] - event = StreamReasoningEnd(id=self._block_id) + events: list[StreamBaseResponse] = [] + if self._pending_delta: + events.append( + StreamReasoningDelta(id=self._block_id, delta=self._pending_delta) + ) + self._pending_delta = "" + events.append(StreamReasoningEnd(id=self._block_id)) self._open = False self._block_id = str(uuid.uuid4()) self._current_row = None - return [event] + return events diff --git a/autogpt_platform/backend/backend/copilot/baseline/reasoning_test.py b/autogpt_platform/backend/backend/copilot/baseline/reasoning_test.py index c0136aef19..e429969b3a 100644 --- a/autogpt_platform/backend/backend/copilot/baseline/reasoning_test.py +++ b/autogpt_platform/backend/backend/copilot/baseline/reasoning_test.py @@ -212,7 +212,12 @@ class TestBaselineReasoningEmitter: assert emitter.is_open is True def test_subsequent_deltas_reuse_block_id_without_new_start(self): - emitter = BaselineReasoningEmitter() + # Disable coalescing so each chunk flushes immediately — this test + # is about the Start/Delta/block-id state machine, not the coalesce + # window. Coalescing behaviour is covered below. 
+ emitter = BaselineReasoningEmitter( + coalesce_min_chars=0, coalesce_max_interval_ms=0 + ) first = emitter.on_delta(_delta(reasoning="a")) second = emitter.on_delta(_delta(reasoning="b")) @@ -267,6 +272,106 @@ class TestBaselineReasoningEmitter: assert deltas[0].delta == "plan: do the thing" +class TestReasoningDeltaCoalescing: + """Coalescing batches fine-grained provider chunks into bigger wire + frames. OpenRouter's Kimi K2.6 emits ~4,700 reasoning-delta chunks + per turn vs ~28 for Sonnet; without batching, every chunk becomes one + Redis ``xadd`` + one SSE event + one React re-render of the + non-virtualised chat list, which paint-storms the browser. These + tests pin the batching contract: small chunks buffer until the + char-size or time threshold trips, large chunks still flush + immediately, and ``close()`` never drops tail text.""" + + def test_small_chunks_after_first_buffer_until_threshold(self): + # Generous time threshold so size alone controls flush timing. + emitter = BaselineReasoningEmitter( + coalesce_min_chars=32, coalesce_max_interval_ms=60_000 + ) + # First chunk always flushes immediately (so UI renders without + # waiting). + first = emitter.on_delta(_delta(reasoning="hi ")) + assert any(isinstance(e, StreamReasoningStart) for e in first) + assert sum(isinstance(e, StreamReasoningDelta) for e in first) == 1 + + # Subsequent small chunks buffer silently — 5 × 4 chars = 20 chars, + # still under the 32-char threshold. + for _ in range(5): + assert emitter.on_delta(_delta(reasoning="abcd")) == [] + + # Once the threshold is crossed, the accumulated buffer flushes + # as a single StreamReasoningDelta carrying every buffered chunk. 
+ flush = emitter.on_delta(_delta(reasoning="efghijklmnop")) + assert len(flush) == 1 + assert isinstance(flush[0], StreamReasoningDelta) + assert flush[0].delta == "abcd" * 5 + "efghijklmnop" + + def test_time_based_flush_when_chars_stay_below_threshold(self, monkeypatch): + # Fake ``time.monotonic`` so we can drive the time-based branch + # deterministically without real sleeps. + from backend.copilot.baseline import reasoning as rmod + + fake_now = [0.0] + monkeypatch.setattr(rmod.time, "monotonic", lambda: fake_now[0]) + + emitter = BaselineReasoningEmitter( + coalesce_min_chars=1000, coalesce_max_interval_ms=40 + ) + # t=0: first chunk flushes immediately. + first = emitter.on_delta(_delta(reasoning="a")) + assert sum(isinstance(e, StreamReasoningDelta) for e in first) == 1 + + # t=10 ms: still under 40 ms → buffer. + fake_now[0] = 0.010 + assert emitter.on_delta(_delta(reasoning="b")) == [] + + # t=50 ms since last flush → time threshold trips, flush fires. + fake_now[0] = 0.060 + flushed = emitter.on_delta(_delta(reasoning="c")) + assert len(flushed) == 1 + assert isinstance(flushed[0], StreamReasoningDelta) + assert flushed[0].delta == "bc" + + def test_close_flushes_tail_buffer_before_end(self): + emitter = BaselineReasoningEmitter( + coalesce_min_chars=1000, coalesce_max_interval_ms=60_000 + ) + emitter.on_delta(_delta(reasoning="first")) # flushes (first chunk) + emitter.on_delta(_delta(reasoning=" middle ")) # buffered + emitter.on_delta(_delta(reasoning="tail")) # buffered + + events = emitter.close() + assert len(events) == 2 + assert isinstance(events[0], StreamReasoningDelta) + assert events[0].delta == " middle tail" + assert isinstance(events[1], StreamReasoningEnd) + + def test_coalesce_disabled_flushes_every_chunk(self): + emitter = BaselineReasoningEmitter( + coalesce_min_chars=0, coalesce_max_interval_ms=0 + ) + first = emitter.on_delta(_delta(reasoning="a")) + second = emitter.on_delta(_delta(reasoning="b")) + assert sum(isinstance(e, 
StreamReasoningDelta) for e in first) == 1 + assert sum(isinstance(e, StreamReasoningDelta) for e in second) == 1 + + def test_persistence_stays_per_delta_even_when_wire_coalesces(self): + """DB row content must track every chunk so a crash mid-turn + persists the full reasoning-so-far, even if the coalesce window + never flushed those chunks to the wire.""" + session: list[ChatMessage] = [] + emitter = BaselineReasoningEmitter( + session, + coalesce_min_chars=1000, + coalesce_max_interval_ms=60_000, + ) + emitter.on_delta(_delta(reasoning="first ")) + emitter.on_delta(_delta(reasoning="chunk ")) + emitter.on_delta(_delta(reasoning="three")) + # No close; verify the persisted row already has everything. + assert len(session) == 1 + assert session[0].content == "first chunk three" + + class TestReasoningPersistence: """The persistence contract: without ``role="reasoning"`` rows in session.messages, useHydrateOnStreamEnd overwrites the live-streamed From 627b52048baa650b79d1190328cd4b9352e4466a Mon Sep 17 00:00:00 2001 From: majdyz Date: Tue, 21 Apr 2026 22:46:56 +0700 Subject: [PATCH 05/25] fix(backend/copilot): announce in-flight tool calls to unstick guide guard MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Symptom (session 0d83f15c on Kimi K2.6): the agent called `get_agent_building_guide`, got the guide, retried `create_agent` — and the `require_guide_read` gate fired "Call get_agent_building_guide first" anyway, looping indefinitely. Root cause: baseline path buffers assistant rows with their `tool_calls` into `state.session_messages` (a scratch list on `_BaselineStreamState`) during the tool-call loop, and only flushes into `session.messages` at turn end. So when the second tool runs within the *same* turn, `_guide_read_in_session` — which scans `session.messages` — sees no guide call and fires the gate. 
SDK path didn't hit this because it mirrors tool calls straight into `ctx.session.messages`; Kimi's aggressive tool-call chaining within one turn was what surfaced the bug on baseline. Not Kimi-specific (any baseline model that calls guide + create_agent in one turn would hit it). Fix: add an in-flight announcement buffer on `ChatSession`. * `ChatSession._inflight_tool_calls: set[str]` (PrivateAttr, never serialised). * `announce_inflight_tool_call` called by `_baseline_tool_executor` the moment a tool is dispatched, before it runs. * `has_tool_been_called_this_turn` folds the in-flight set into the historical `messages` scan; `require_guide_read` now calls this instead of the messages-only helper. * `clear_inflight_tool_calls` fired in the baseline turn's finally block, right before `upsert_chat_session`, so next turn starts with a clean buffer. Deliberately didn't mirror the row into `session.messages` directly — `_baseline_conversation_updater` appends a fully-formed assistant+tool_calls row at round end, so an inline mirror would duplicate. The scratch set keeps the announcement separate from durable history. New tests: in-flight announcement lets gate pass within same turn; clear restores the gate for next turn; PrivateAttr never leaks into `model_dump`. Existing gate tests migrated from MagicMock(spec=ChatSession) to real ChatSession instances since the guard now calls the new helper. 
--- .../backend/copilot/baseline/service.py | 17 ++++++ .../backend/backend/copilot/model.py | 49 +++++++++++++++- .../copilot/tools/agent_guide_gate_test.py | 56 +++++++++++++++++-- .../backend/backend/copilot/tools/helpers.py | 22 +++----- 4 files changed, 126 insertions(+), 18 deletions(-) diff --git a/autogpt_platform/backend/backend/copilot/baseline/service.py b/autogpt_platform/backend/backend/copilot/baseline/service.py index 74ac8b9005..a112475d7b 100644 --- a/autogpt_platform/backend/backend/copilot/baseline/service.py +++ b/autogpt_platform/backend/backend/copilot/baseline/service.py @@ -760,6 +760,19 @@ async def _baseline_tool_executor( ) ) + # Announce the tool call to the session so in-turn guards like + # ``require_guide_read`` can see it *right now*, before the tool + # actually runs. Without this, the tool_call row lives only in + # ``state.session_messages`` until the ``finally`` block flushes it + # into ``session.messages`` at turn end — so a second tool in the + # same turn (e.g. ``create_agent`` after ``get_agent_building_guide``) + # scans a stale ``session.messages`` and the guard re-fires despite + # the guide having been called. The announce-set is cleared at turn + # end; we deliberately don't touch ``session.messages`` here to avoid + # duplicating the assistant row that ``_baseline_conversation_updater`` + # will append at round end. + session.announce_inflight_tool_call(tool_name) + try: result: StreamToolOutputAvailable = await execute_tool( tool_name=tool_name, @@ -1868,6 +1881,10 @@ async def stream_chat_completion_baseline( final_text = final_text[len(recorded) :] if final_text.strip(): session.messages.append(ChatMessage(role="assistant", content=final_text)) + # In-flight tool-call announcements are only meaningful for the + # current turn; clear before persist so the next turn starts with + # a clean scratch buffer. 
+ session.clear_inflight_tool_calls() try: await upsert_chat_session(session) except Exception as persist_err: diff --git a/autogpt_platform/backend/backend/copilot/model.py b/autogpt_platform/backend/backend/copilot/model.py index 08019233e7..c3b23c82cc 100644 --- a/autogpt_platform/backend/backend/copilot/model.py +++ b/autogpt_platform/backend/backend/copilot/model.py @@ -20,7 +20,7 @@ from openai.types.chat.chat_completion_message_tool_call_param import ( ) from prisma.models import ChatMessage as PrismaChatMessage from prisma.models import ChatSession as PrismaChatSession -from pydantic import BaseModel +from pydantic import BaseModel, PrivateAttr from backend.data.db_accessors import chat_db from backend.data.redis_client import get_redis_async @@ -198,6 +198,15 @@ class ChatSessionInfo(BaseModel): class ChatSession(ChatSessionInfo): messages: list[ChatMessage] + # In-flight tool-call names for the CURRENT turn. Not persisted to + # DB and not serialised on the wire — ``PrivateAttr`` keeps this a + # process-local scratch buffer that's invisible to ``model_dump`` / + # ``model_dump_json`` / the redis cache path. Populated by the + # baseline tool executor the moment a tool is dispatched so in-turn + # guards (e.g. ``require_guide_read``) can see the call before it + # lands in ``messages`` at turn-end. Cleared when the turn + # completes. + _inflight_tool_calls: set[str] = PrivateAttr(default_factory=set) @classmethod def new(cls, user_id: str, *, dry_run: bool) -> Self: @@ -226,6 +235,44 @@ class ChatSession(ChatSessionInfo): messages=[ChatMessage.from_db(m) for m in prisma_session.Messages], ) + def announce_inflight_tool_call(self, tool_name: str) -> None: + """Record that *tool_name* is being dispatched in the current turn. 
+ + Lets in-turn guards (see + ``copilot/tools/helpers.py::_guide_read_in_session``) see a tool + call the moment it's issued, instead of waiting for the + ``session.messages`` flush at turn end — fixing a loop where a + second tool in the same turn re-fires a guard despite the + guarding tool having already been called (seen on Kimi K2.6 in + particular because its aggressive tool-call chaining exercises + this path much more than Sonnet does). The buffer is cleared by + :meth:`clear_inflight_tool_calls` at turn end. + """ + self._inflight_tool_calls.add(tool_name) + + def clear_inflight_tool_calls(self) -> None: + """Reset the in-flight tool-call announcement buffer.""" + self._inflight_tool_calls.clear() + + def has_tool_been_called_this_turn(self, tool_name: str) -> bool: + """True when *tool_name* has been called in the current turn. + + Checks the in-flight announcement buffer first (for calls + dispatched in *this* turn but not yet persisted) and then the + durable ``messages`` history (for past turns + prior rounds + within this turn whose writes already landed). + """ + if tool_name in self._inflight_tool_calls: + return True + for msg in reversed(self.messages): + if msg.role != "assistant" or not msg.tool_calls: + continue + for tc in msg.tool_calls: + name = tc.get("function", {}).get("name") or tc.get("name") + if name == tool_name: + return True + return False + def add_tool_call_to_current_turn(self, tool_call: dict) -> None: """Attach a tool_call to the current turn's assistant message. diff --git a/autogpt_platform/backend/backend/copilot/tools/agent_guide_gate_test.py b/autogpt_platform/backend/backend/copilot/tools/agent_guide_gate_test.py index 6a122b7324..14cc12f4b0 100644 --- a/autogpt_platform/backend/backend/copilot/tools/agent_guide_gate_test.py +++ b/autogpt_platform/backend/backend/copilot/tools/agent_guide_gate_test.py @@ -7,8 +7,6 @@ tokens and then produce JSON that fails validation — wasting turns on auto-fix loops. 
""" -from unittest.mock import MagicMock - import pytest from backend.copilot.model import ChatMessage, ChatSession @@ -18,8 +16,17 @@ from .models import ErrorResponse def _session_with_messages(messages: list[ChatMessage]) -> ChatSession: - """Build a minimal ChatSession whose ``messages`` matches *messages*.""" - session = MagicMock(spec=ChatSession) + """Build a real ChatSession with the given messages. + + Uses ``ChatSession.new`` + attribute reassignment rather than + ``MagicMock(spec=...)`` because the gate now calls + ``session.has_tool_been_called_this_turn(...)`` and a ``spec`` mock + returns a truthy ``MagicMock`` from that call, hiding real gate + behaviour. A live ``ChatSession`` also correctly initialises the + ``_inflight_tool_calls`` PrivateAttr scratch buffer used by the + in-turn announcement path. + """ + session = ChatSession.new("test-user", dry_run=False) session.session_id = "test-session" session.messages = messages return session @@ -117,3 +124,44 @@ def test_tool_name_surfaced_in_error(tool_name: str): result = require_guide_read(session, tool_name) assert isinstance(result, ErrorResponse) assert tool_name in result.message + + +def test_inflight_announcement_lets_gate_pass_within_same_turn(): + """Regression for the Kimi baseline loop: the guide call is + dispatched earlier in the SAME turn and buffered by the + ``_baseline_tool_executor`` into the in-flight announcement set, + but hasn't been flushed into ``session.messages`` yet. The gate + must see it anyway — otherwise a follow-up ``create_agent`` in the + same turn re-fires the guard despite the guide call and the model + loops retrying the guide.""" + session = _session_with_messages( + [ChatMessage(role="user", content="build something")] + ) + # Simulate _baseline_tool_executor's announce. 
+ session.announce_inflight_tool_call("get_agent_building_guide") + assert require_guide_read(session, "create_agent") is None + + +def test_inflight_clear_restores_gate_for_next_turn(): + """End-of-turn cleanup must drop the in-flight buffer so it can't + leak into the *next* turn's ``session.messages`` scan (e.g. a second + session turn that should legitimately require a fresh guide call if + ``messages`` got compressed away).""" + session = _session_with_messages([ChatMessage(role="user", content="build")]) + session.announce_inflight_tool_call("get_agent_building_guide") + assert require_guide_read(session, "create_agent") is None + session.clear_inflight_tool_calls() + # With the buffer cleared and no guide row in messages, the guard + # fires again. + assert isinstance(require_guide_read(session, "create_agent"), ErrorResponse) + + +def test_inflight_announcement_does_not_serialise_into_model_dump(): + """PrivateAttr invariant: the scratch buffer must never leak into + ``model_dump()`` / the Redis cache payload / the DB — it's + process-local turn state, not durable session state.""" + session = _session_with_messages([]) + session.announce_inflight_tool_call("get_agent_building_guide") + dumped = session.model_dump() + assert "_inflight_tool_calls" not in dumped + assert "inflight_tool_calls" not in dumped diff --git a/autogpt_platform/backend/backend/copilot/tools/helpers.py b/autogpt_platform/backend/backend/copilot/tools/helpers.py index 8ec31ee43e..bb29e894ac 100644 --- a/autogpt_platform/backend/backend/copilot/tools/helpers.py +++ b/autogpt_platform/backend/backend/copilot/tools/helpers.py @@ -787,26 +787,22 @@ def _resolve_discriminated_credentials( _AGENT_GUIDE_TOOL_NAME = "get_agent_building_guide" -def _guide_read_in_session(session: ChatSession) -> bool: - """True if this session's assistant messages include a guide tool call.""" - for msg in reversed(session.messages): - if msg.role != "assistant" or not msg.tool_calls: - continue - for tc in 
msg.tool_calls: - name = tc.get("function", {}).get("name") or tc.get("name") - if name == _AGENT_GUIDE_TOOL_NAME: - return True - return False - - def require_guide_read(session: ChatSession, tool_name: str): """Return an ErrorResponse if the guide hasn't been loaded this session. Import inline to keep ``helpers.py`` free of tool-response imports. + Uses :meth:`ChatSession.has_tool_been_called_this_turn` which checks + both the persisted ``messages`` list and the in-flight announcement + buffer — so a guide call dispatched earlier in the *current* turn + (before ``session.messages`` flushes at turn end) is recognised too. + Otherwise a second tool in the same turn would re-fire this guard + despite the guide having been called — seen on Kimi K2.6 in + particular because its aggressive tool-call chaining exercises this + path far more than Sonnet does. """ from .models import ErrorResponse # noqa: PLC0415 — avoid circular import - if _guide_read_in_session(session): + if session.has_tool_been_called_this_turn(_AGENT_GUIDE_TOOL_NAME): return None return ErrorResponse( message=( From 9cfaaba3b68f8a5322a50221859e4af10a8fcbca Mon Sep 17 00:00:00 2001 From: majdyz Date: Tue, 21 Apr 2026 23:06:07 +0700 Subject: [PATCH 06/25] fix(backend/copilot): anchor Kimi reasoning-route match to reject hakimi false positives MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sentry review on #12871 flagged the `"kimi" in lowered` substring check in `_is_reasoning_route` as too broad — a hypothetical `some-provider/hakimi-large` would match and get a `reasoning` payload appended to its request. Some providers silently drop unknown fields, others 400, so this is a correctness-not-just-tidy fix. Replace the substring check with an anchored match: accept the `moonshotai/` provider prefix, or a bare `kimi-` model id (either at string start or immediately after a `/` provider prefix). `claude` / `anthropic` branches unchanged. 
Adds regression coverage for `hakimi`, `some-provider/hakimi-large`, `akimi-7b` and keeps the existing Kimi variants passing. --- .../backend/copilot/baseline/reasoning.py | 30 +++++++++++++------ .../copilot/baseline/reasoning_test.py | 16 ++++++++-- 2 files changed, 35 insertions(+), 11 deletions(-) diff --git a/autogpt_platform/backend/backend/copilot/baseline/reasoning.py b/autogpt_platform/backend/backend/copilot/baseline/reasoning.py index e5f941b805..e2511b34eb 100644 --- a/autogpt_platform/backend/backend/copilot/baseline/reasoning.py +++ b/autogpt_platform/backend/backend/copilot/baseline/reasoning.py @@ -1,7 +1,8 @@ """Extended-thinking wire support for the baseline (OpenRouter) path. -Anthropic routes on OpenRouter expose extended thinking through -non-OpenAI extension fields that the OpenAI Python SDK doesn't model: +OpenRouter routes that support extended thinking (Anthropic Claude and +Moonshot Kimi today) expose reasoning through non-OpenAI extension fields +that the OpenAI Python SDK doesn't model: * ``reasoning`` (legacy string) — enabled by ``include_reasoning: true``. * ``reasoning_content`` — DeepSeek / some OpenRouter routes. @@ -17,7 +18,8 @@ This module keeps the wire-level concerns in one place: one streaming round and emits ``StreamReasoning*`` events so the caller only has to plumb the events into its pending queue. * :func:`reasoning_extra_body` builds the ``extra_body`` fragment for the - OpenAI client call. Returns ``None`` on non-Anthropic routes. + OpenAI client call. Returns ``None`` for routes without reasoning + support (see :func:`_is_reasoning_route`). """ from __future__ import annotations @@ -159,14 +161,24 @@ def _is_reasoning_route(model: str) -> bool: Kept separate from :func:`backend.copilot.baseline.service._is_anthropic_model` because ``cache_control`` is strictly Anthropic-specific (Moonshot does its own auto-caching), so the two gates must not conflate. 
+ + The Kimi match anchors on the ``moonshotai/`` provider prefix or on a + bare / OpenRouter-prefixed ``kimi-`` model id (``kimi-k2.6``, + ``moonshotai/kimi-k2-thinking``, ``openrouter/kimi-k2.6``), so unrelated + models that happen to contain ``kimi`` as a substring (e.g. a + hypothetical ``some-provider/hakimi-large``) are not treated as + reasoning routes. """ lowered = model.lower() - return ( - "claude" in lowered - or lowered.startswith("anthropic") - or lowered.startswith("moonshotai/") - or "kimi" in lowered - ) + if "claude" in lowered or lowered.startswith("anthropic"): + return True + if lowered.startswith("moonshotai/"): + return True + # Match a ``kimi-`` model id at string start or immediately after a + # provider prefix ``/`` — avoids substring false positives like + # ``hakimi``. + bare = lowered.rsplit("/", 1)[-1] + return bare.startswith("kimi-") def reasoning_extra_body(model: str, max_thinking_tokens: int) -> dict[str, Any] | None: diff --git a/autogpt_platform/backend/backend/copilot/baseline/reasoning_test.py b/autogpt_platform/backend/backend/copilot/baseline/reasoning_test.py index e429969b3a..4d6d3c5623 100644 --- a/autogpt_platform/backend/backend/copilot/baseline/reasoning_test.py +++ b/autogpt_platform/backend/backend/copilot/baseline/reasoning_test.py @@ -150,9 +150,13 @@ class TestIsReasoningRoute: assert _is_reasoning_route("moonshotai/kimi-k2.6") assert _is_reasoning_route("moonshotai/kimi-k2-thinking") assert _is_reasoning_route("moonshotai/kimi-k2.5") - # Direct (non-OpenRouter) model ids also resolve via the ``kimi`` - # substring so a future bare ``kimi-k3`` id would still match. + # Direct (non-OpenRouter) model ids also resolve via the ``kimi-`` + # prefix so a future bare ``kimi-k3`` id would still match. assert _is_reasoning_route("kimi-k2-instruct") + # Provider-prefixed bare kimi ids (without the ``moonshotai/`` + # prefix) are also recognised — the match anchors on the final + # path segment. 
+ assert _is_reasoning_route("openrouter/kimi-k2.6") def test_other_providers_rejected(self): assert not _is_reasoning_route("openai/gpt-4o") @@ -161,6 +165,14 @@ class TestIsReasoningRoute: assert not _is_reasoning_route("meta-llama/llama-3.3-70b-instruct") assert not _is_reasoning_route("deepseek/deepseek-r1") + def test_kimi_substring_false_positives_rejected(self): + # Regression: the previous implementation matched any model whose + # name contained the substring ``kimi`` — including unrelated model + # ids like ``hakimi``. The anchored match below rejects them. + assert not _is_reasoning_route("some-provider/hakimi-large") + assert not _is_reasoning_route("hakimi") + assert not _is_reasoning_route("akimi-7b") + class TestReasoningExtraBody: def test_anthropic_route_returns_fragment(self): From 4dc3d0c34cddc1cb8852472304df8a485a1830f9 Mon Sep 17 00:00:00 2001 From: majdyz Date: Tue, 21 Apr 2026 23:06:17 +0700 Subject: [PATCH 07/25] fix(backend/copilot): correct fast_advanced_model to OpenRouter's claude-opus-4.7 route CodeRabbit review on #12871 flagged that the config default and pinned-default test used `anthropic/claude-opus-4-7` (hyphenated), but OpenRouter's actual model ID for Opus 4.7 is `anthropic/claude-opus-4.7` (dot-separated, per https://openrouter.ai/anthropic/claude-opus-4.7). The hyphenated form would 404 at runtime the first time anyone toggles the advanced tier on the baseline path. Fix the default in both paths (`fast_advanced_model`, `thinking_advanced_model`) and update the test assertion to match. Also add a regression test pinning the three legacy env-var aliases (`CHAT_MODEL`, `CHAT_ADVANCED_MODEL`, `CHAT_FAST_MODEL`) to the new 2x2 fields so deployments that set the pre-split names continue to override the intended cell. 
--- .../baseline/transcript_integration_test.py | 24 ++++++++++++++++++- .../backend/backend/copilot/config.py | 12 ++++++---- 2 files changed, 31 insertions(+), 5 deletions(-) diff --git a/autogpt_platform/backend/backend/copilot/baseline/transcript_integration_test.py b/autogpt_platform/backend/backend/copilot/baseline/transcript_integration_test.py index 3d573e87b6..53290536ea 100644 --- a/autogpt_platform/backend/backend/copilot/baseline/transcript_integration_test.py +++ b/autogpt_platform/backend/backend/copilot/baseline/transcript_integration_test.py @@ -105,7 +105,7 @@ class TestResolveBaselineModel: assert ( ChatConfig.model_fields["fast_advanced_model"].default - == "anthropic/claude-opus-4-7" + == "anthropic/claude-opus-4.7" ) def test_standard_cells_diverge_across_paths(self): @@ -131,6 +131,28 @@ class TestResolveBaselineModel: != ChatConfig.model_fields["fast_advanced_model"].default ) + def test_legacy_env_aliases_route_to_new_fields(self, monkeypatch): + """Backward compat: the pre-split env var names must still bind. + + The four-field matrix was introduced with ``validation_alias`` + entries so that existing deployments setting ``CHAT_MODEL`` / + ``CHAT_ADVANCED_MODEL`` / ``CHAT_FAST_MODEL`` continue to override + the same effective cell without a rename. Construct a fresh + ``ChatConfig`` with each legacy name set and confirm it lands on + the new field. 
+ """ + from backend.copilot.config import ChatConfig + + monkeypatch.setenv("CHAT_MODEL", "legacy/sonnet-via-chat-model") + monkeypatch.setenv("CHAT_ADVANCED_MODEL", "legacy/opus-via-advanced") + monkeypatch.setenv("CHAT_FAST_MODEL", "legacy/fast-via-fast-model") + + cfg = ChatConfig() + + assert cfg.thinking_standard_model == "legacy/sonnet-via-chat-model" + assert cfg.thinking_advanced_model == "legacy/opus-via-advanced" + assert cfg.fast_standard_model == "legacy/fast-via-fast-model" + class TestLoadPriorTranscript: """``_load_prior_transcript`` wraps the CLI session restore + validate + load flow.""" diff --git a/autogpt_platform/backend/backend/copilot/config.py b/autogpt_platform/backend/backend/copilot/config.py index b6d6dcbf92..7d953dbd22 100644 --- a/autogpt_platform/backend/backend/copilot/config.py +++ b/autogpt_platform/backend/backend/copilot/config.py @@ -17,8 +17,12 @@ from backend.util.clients import OPENROUTER_BASE_URL CopilotMode = Literal["fast", "extended_thinking"] # Per-request model tier set by the frontend model toggle. -# 'standard' picks the cheaper everyday model for the active path. -# 'advanced' picks the premium model for the active path (Opus today). +# 'standard' picks the cheaper everyday model for the active path — +# ``fast_standard_model`` on the baseline path, ``thinking_standard_model`` +# on the SDK path. +# 'advanced' picks the premium model for the active path — ``fast_advanced_model`` +# on the baseline path, ``thinking_advanced_model`` on the SDK path (both +# default to Opus today). # None means no preference — falls through to LD per-user targeting, then config. # Using tier names instead of model names keeps the contract model-agnostic. 
CopilotLlmModel = Literal["standard", "advanced"] @@ -55,7 +59,7 @@ class ChatConfig(BaseSettings): "``_is_anthropic_model``).", ) fast_advanced_model: str = Field( - default="anthropic/claude-opus-4-7", + default="anthropic/claude-opus-4.7", description="Baseline path, 'advanced' tier. Opus by default so " "the advanced tier is a clean A/B vs the SDK advanced tier: same " "model, different path — isolates the reasoning-wire + cache " @@ -79,7 +83,7 @@ class ChatConfig(BaseSettings): "``CHAT_MODEL`` still honored).", ) thinking_advanced_model: str = Field( - default="anthropic/claude-opus-4-7", + default="anthropic/claude-opus-4.7", validation_alias=AliasChoices( "CHAT_THINKING_ADVANCED_MODEL", "CHAT_ADVANCED_MODEL", From 2f8d2e10daa86547bec8b68279ce7f6fa975e29a Mon Sep 17 00:00:00 2001 From: majdyz Date: Tue, 21 Apr 2026 23:06:24 +0700 Subject: [PATCH 08/25] fix(backend/copilot): clear inflight tool-call buffer at top of outer finally MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CodeRabbit review on #12871 flagged that `session.clear_inflight_tool_calls()` ran after usage persistence, session upsert and transcript upload in the baseline turn `finally`, so if any of those awaited cleanup steps raised, the process-local scratch buffer would leak into the next turn — the guide-read guard would observe a phantom in-flight call and skip its gate. Move the clear to the very first statement of the outer `finally` so it runs unconditionally once tool execution has ended, before any failure-prone cleanup. Keep the documentation pointing at the observed failure mode. 
--- .../backend/backend/copilot/baseline/service.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/autogpt_platform/backend/backend/copilot/baseline/service.py b/autogpt_platform/backend/backend/copilot/baseline/service.py index c9724261e9..6aa88e9d41 100644 --- a/autogpt_platform/backend/backend/copilot/baseline/service.py +++ b/autogpt_platform/backend/backend/copilot/baseline/service.py @@ -1822,6 +1822,16 @@ async def stream_chat_completion_baseline( yield StreamError(errorText=error_msg, code="baseline_error") # Still persist whatever we got finally: + # In-flight tool-call announcements are only meaningful for the + # current turn; clear at the top of the outer finally so the next + # turn starts with a clean scratch buffer even if one of the + # awaited cleanup steps below (usage persistence, session upsert, + # transcript upload) raises. The buffer is a process-local scratch + # set — if we leak it into the next turn the guide-read guard would + # observe a phantom in-flight call and skip its gate, so this must + # run unconditionally. + session.clear_inflight_tool_calls() + # Pending messages are drained atomically at turn start and # between tool rounds, so there's nothing to clear in finally. # Any message pushed after the final drain window stays in the @@ -1916,10 +1926,6 @@ async def stream_chat_completion_baseline( final_text = final_text[len(recorded) :] if final_text.strip(): session.messages.append(ChatMessage(role="assistant", content=final_text)) - # In-flight tool-call announcements are only meaningful for the - # current turn; clear before persist so the next turn starts with - # a clean scratch buffer. 
- session.clear_inflight_tool_calls() try: await upsert_chat_session(session) except Exception as persist_err: From 1848810b320c18011628783f6f745250f4ebb76d Mon Sep 17 00:00:00 2001 From: majdyz Date: Tue, 21 Apr 2026 23:10:49 +0700 Subject: [PATCH 09/25] feat(backend/copilot): baseline web-search supplement with Perplexity + SendWebRequest block IDs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fast mode (baseline / OpenRouter) doesn't have a native WebSearch tool the way the SDK path does; Kimi K2.6 defaults to guessing URLs or saying "I don't have internet access" when asked for live info. Point it at two existing copilot blocks via `run_block` so it can search without adding a new tool type: * Perplexity (sonar models, real-time search w/ citations) — block id `c8a5f2e9-8b3d-4a7e-9f6c-1d5e3c9b7a4f`. Defaults `model` to `sonar` and names the other sonar variants explicitly so the model doesn't guess `sonar-xl` (404 on the API). * SendWebRequest (plain HTTP GET/POST/etc.) — block id `6595ae1f-b924-42cb-9a41-551a0611c4b4`. For when the user names a specific URL. Supplement is static (no per-user content) so it stays on the cacheable system-prompt prefix — zero cost to the baseline cache contract. Appended baseline-only via a new `get_baseline_web_search_supplement()` helper; SDK keeps its native WebSearch. Block IDs are module constants and the new `TestBaselineWebSearchSupplement` class pins them against the live block registry (`PerplexityBlock().id` / `SendWebRequestBlock().id`) — if a block is renamed or deleted the test breaks before the prompt ships a dead UUID. 
--- .../backend/copilot/baseline/service.py | 7 ++- .../backend/backend/copilot/prompting.py | 62 +++++++++++++++++++ .../backend/backend/copilot/prompting_test.py | 39 ++++++++++++ 3 files changed, 107 insertions(+), 1 deletion(-) diff --git a/autogpt_platform/backend/backend/copilot/baseline/service.py b/autogpt_platform/backend/backend/copilot/baseline/service.py index 6aa88e9d41..01c0aff51a 100644 --- a/autogpt_platform/backend/backend/copilot/baseline/service.py +++ b/autogpt_platform/backend/backend/copilot/baseline/service.py @@ -55,7 +55,11 @@ from backend.copilot.pending_messages import ( drain_pending_messages, format_pending_as_user_message, ) -from backend.copilot.prompting import SHARED_TOOL_NOTES, get_graphiti_supplement +from backend.copilot.prompting import ( + SHARED_TOOL_NOTES, + get_baseline_web_search_supplement, + get_graphiti_supplement, +) from backend.copilot.response_model import ( StreamBaseResponse, StreamError, @@ -1417,6 +1421,7 @@ async def stream_chat_completion_baseline( system_prompt = ( base_system_prompt + SHARED_TOOL_NOTES + + get_baseline_web_search_supplement() + graphiti_supplement + builder_session_suffix ) diff --git a/autogpt_platform/backend/backend/copilot/prompting.py b/autogpt_platform/backend/backend/copilot/prompting.py index 399d31c1cc..21cae58192 100644 --- a/autogpt_platform/backend/backend/copilot/prompting.py +++ b/autogpt_platform/backend/backend/copilot/prompting.py @@ -411,3 +411,65 @@ You have access to persistent temporal memory tools that remember facts across s - group_id is handled automatically by the system — never set it yourself. - When storing, be specific about operational rules and instructions (e.g., "CC Sarah on client communications" not just "Sarah is the assistant"). """ + + +# Block IDs for the baseline-path web-search supplement. 
Surfaced as module +# constants so a test can assert they match the live block registry (see +# prompting_test.py — if a block is renamed/deleted the assertion catches it +# before the prompt ships a dead UUID). +PERPLEXITY_BLOCK_ID = "c8a5f2e9-8b3d-4a7e-9f6c-1d5e3c9b7a4f" +SEND_WEB_REQUEST_BLOCK_ID = "6595ae1f-b924-42cb-9a41-551a0611c4b4" + + +def get_baseline_web_search_supplement() -> str: + """Web-search instructions for the baseline (fast) path. + + SDK (extended_thinking) mode has Claude's native ``WebSearch`` tool, + so it gets web access for free. The baseline path does NOT — it runs + against whatever OpenRouter model is configured (Kimi K2.6 by + default) and those providers don't expose a built-in search. To + close the gap without adding a new tool type, we point the model at + two existing copilot blocks via ``run_block``: + + * Perplexity (web search w/ citations, sonar models) — + ``PERPLEXITY_BLOCK_ID``. + * SendWebRequest (fetch an arbitrary URL) — + ``SEND_WEB_REQUEST_BLOCK_ID``. + + The supplement is static — no per-user / per-session content — so it + stays on the cacheable prefix. Append to the baseline system prompt + only; SDK callers would just confuse their native ``WebSearch`` with + a competing block recipe. + """ + return f""" + +## Web Search & URL Fetch (fast mode) + +Fast mode doesn't expose a native web-search tool — use the existing +copilot blocks via ``run_block`` when you need live web content. + +### Web search with citations — Perplexity +- Block ID: ``{PERPLEXITY_BLOCK_ID}`` +- Input: ``{{"prompt": "", "model": "sonar"}}`` + (``model`` defaults to ``sonar``; other options: ``sonar-pro``, + ``sonar-reasoning``, ``sonar-reasoning-pro``, ``sonar-deep-research`` + — use ``sonar`` unless the user asks for deeper research.) +- Output: ``response`` (string), ``annotations`` (list of URL citations). +- Requires Perplexity credentials connected to the user's account. 
If + the block errors with a missing-credentials message, call + ``connect_integration`` for Perplexity and retry. + +### Fetch a specific URL — SendWebRequest +- Block ID: ``{SEND_WEB_REQUEST_BLOCK_ID}`` +- Input: ``{{"url": "", "method": "GET"}}`` (supports POST / PUT / + DELETE / PATCH too; add ``headers`` / ``body`` as needed). +- Output: ``response`` (server body), plus ``client_error`` / + ``server_error`` / ``error`` channels — check the error outputs + before trusting ``response``. + +Prefer Perplexity for open-ended questions ("what's the latest...", +"find articles about...") and SendWebRequest when the user names a +specific URL or needs structured API data. Call ``find_block`` first +only if neither pattern fits — the two block IDs above cover 95% of +baseline web-access needs and skip one search round-trip. +""" diff --git a/autogpt_platform/backend/backend/copilot/prompting_test.py b/autogpt_platform/backend/backend/copilot/prompting_test.py index 5a719f1b00..837044772d 100644 --- a/autogpt_platform/backend/backend/copilot/prompting_test.py +++ b/autogpt_platform/backend/backend/copilot/prompting_test.py @@ -56,3 +56,42 @@ class TestAgentGenerationGuideContainsClarifySection: "### Workflow" )[0] assert "ask_question" in clarify_section + + +class TestBaselineWebSearchSupplement: + """The fast-mode web-search supplement must point at block IDs that + actually exist and name each block's required input fields, so the + Kimi / baseline model can call them via ``run_block`` without a + ``find_block`` round-trip. 
Pinning the block IDs against the live + registry means a block rename / delete breaks this test rather than + shipping a dead UUID to the model.""" + + def test_perplexity_block_id_matches_registered_block(self): + from backend.blocks.perplexity import PerplexityBlock + + assert PerplexityBlock().id == prompting.PERPLEXITY_BLOCK_ID + + def test_send_web_request_block_id_matches_registered_block(self): + from backend.blocks.http import SendWebRequestBlock + + assert SendWebRequestBlock().id == prompting.SEND_WEB_REQUEST_BLOCK_ID + + def test_supplement_surfaces_both_block_ids(self): + text = prompting.get_baseline_web_search_supplement() + assert prompting.PERPLEXITY_BLOCK_ID in text + assert prompting.SEND_WEB_REQUEST_BLOCK_ID in text + + def test_supplement_names_required_inputs(self): + text = prompting.get_baseline_web_search_supplement() + # Perplexity required input. + assert '"prompt"' in text + # SendWebRequest required input. + assert '"url"' in text + # Default Perplexity model is named explicitly so Kimi doesn't + # guess (``sonar-xl`` etc. 404 on the Perplexity API). + assert '"sonar"' in text + + def test_supplement_flags_credentials_dependency(self): + text = prompting.get_baseline_web_search_supplement() + assert "credentials" in text.lower() + assert "connect_integration" in text From 7dc3b880a6b098630baaeb2d5c8732eeaf7874e9 Mon Sep 17 00:00:00 2001 From: majdyz Date: Tue, 21 Apr 2026 23:11:10 +0700 Subject: [PATCH 10/25] refactor(backend/copilot): rename has_tool_been_called and sample monotonic once per chunk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses three self-review nits on #12871: 1. Rename `has_tool_been_called_this_turn` → `has_tool_been_called`. 
The method is misleadingly named: its durable-messages branch scans the full ``session.messages`` list (not just the current turn), which matches the guide-read contract (``test_guide_earlier_in_history_still_passes``) but actively invites the wrong reading at every call site. Only the in-flight buffer is genuinely turn-scoped. Update the lone caller (``require_guide_read``) and the agent_guide_gate_test docstring reference. 2. Clarify `announce_inflight_tool_call` docstring to state that the announcement fires *before* ``execute_tool`` runs and isn't rolled back if the tool raises. That matches the guide-read gate's "was it called?" semantics, but a future gate wanting *successful* dispatches would need its own tracking — flagging this in the docstring so the next reader sees it. 3. Sample ``time.monotonic()`` once per reasoning chunk instead of twice (once inside ``_should_flush_pending``, again on flush). At ~4,700 chunks per Kimi turn that's ~4,700 redundant monotonic() syscalls off the hot path. ``_should_flush_pending`` now takes ``now`` as a parameter so the caller supplies the already-sampled value, and the flush branch reuses the same value for ``_last_flush_monotonic``. Existing coalescing tests (``test_time_based_flush_when_chars_stay_below_threshold``) pass unchanged via the same ``monkeypatch`` on ``time.monotonic``. 
--- .../backend/copilot/baseline/reasoning.py | 21 +++++++++++----- .../backend/backend/copilot/model.py | 24 ++++++++++++++----- .../copilot/tools/agent_guide_gate_test.py | 2 +- .../backend/backend/copilot/tools/helpers.py | 18 +++++++------- 4 files changed, 43 insertions(+), 22 deletions(-) diff --git a/autogpt_platform/backend/backend/copilot/baseline/reasoning.py b/autogpt_platform/backend/backend/copilot/baseline/reasoning.py index e2511b34eb..f301b50147 100644 --- a/autogpt_platform/backend/backend/copilot/baseline/reasoning.py +++ b/autogpt_platform/backend/backend/copilot/baseline/reasoning.py @@ -265,11 +265,15 @@ class BaselineReasoningEmitter: # rather than waiting for the coalesce window to elapse. Subsequent # chunks buffer into ``_pending_delta`` and only flush when the # char/time thresholds trip. + # Sample the monotonic clock exactly once per chunk — at ~4,700 + # chunks per turn, folding the two calls into one cuts ~4,700 + # syscalls off the hot path without changing semantics. 
+ now = time.monotonic() if not self._open: events.append(StreamReasoningStart(id=self._block_id)) events.append(StreamReasoningDelta(id=self._block_id, delta=text)) self._open = True - self._last_flush_monotonic = time.monotonic() + self._last_flush_monotonic = now if self._session_messages is not None: self._current_row = ChatMessage(role="reasoning", content=text) self._session_messages.append(self._current_row) @@ -282,21 +286,26 @@ class BaselineReasoningEmitter: self._current_row.content = (self._current_row.content or "") + text self._pending_delta += text - if self._should_flush_pending(): + if self._should_flush_pending(now): events.append( StreamReasoningDelta(id=self._block_id, delta=self._pending_delta) ) self._pending_delta = "" - self._last_flush_monotonic = time.monotonic() + self._last_flush_monotonic = now return events - def _should_flush_pending(self) -> bool: - """Return True when the accumulated delta should be emitted now.""" + def _should_flush_pending(self, now: float) -> bool: + """Return True when the accumulated delta should be emitted now. + + *now* is the monotonic timestamp sampled by the caller so the + clock is read at most once per chunk (the flush-timestamp update + reuses the same value). 
+ """ if not self._pending_delta: return False if len(self._pending_delta) >= self._coalesce_min_chars: return True - elapsed_ms = (time.monotonic() - self._last_flush_monotonic) * 1000.0 + elapsed_ms = (now - self._last_flush_monotonic) * 1000.0 return elapsed_ms >= self._coalesce_max_interval_ms def close(self) -> list[StreamBaseResponse]: diff --git a/autogpt_platform/backend/backend/copilot/model.py b/autogpt_platform/backend/backend/copilot/model.py index 34a89c62a7..1adef8e7c8 100644 --- a/autogpt_platform/backend/backend/copilot/model.py +++ b/autogpt_platform/backend/backend/copilot/model.py @@ -254,6 +254,13 @@ class ChatSession(ChatSessionInfo): def announce_inflight_tool_call(self, tool_name: str) -> None: """Record that *tool_name* is being dispatched in the current turn. + Called by the baseline tool executor **before** the tool actually + runs (the announcement is about dispatch, not success). If the + tool raises, the name stays in the buffer for the rest of the + turn — that matches the guide-read gate's contract ("was the tool + called?") but means any future gate wanting *successful* + dispatches would need its own tracking. + Lets in-turn guards (see ``copilot/tools/helpers.py::require_guide_read``) see a tool call the moment it's issued, instead of waiting for the @@ -270,13 +277,18 @@ class ChatSession(ChatSessionInfo): """Reset the in-flight tool-call announcement buffer.""" self._inflight_tool_calls.clear() - def has_tool_been_called_this_turn(self, tool_name: str) -> bool: - """True when *tool_name* has been called in the current turn. + def has_tool_been_called(self, tool_name: str) -> bool: + """True when *tool_name* has been called in this session. - Checks the in-flight announcement buffer first (for calls - dispatched in *this* turn but not yet persisted) and then the - durable ``messages`` history (for past turns + prior rounds - within this turn whose writes already landed). 
+ Checks the in-flight announcement buffer (for calls dispatched + in the *current* turn but not yet flushed into ``messages``) and + the durable ``messages`` history (for past turns + prior rounds + within this turn whose writes already landed). The durable + scan is session-wide, not turn-scoped: a matching tool call + anywhere in ``messages`` counts. This matches the guide-read + contract — once the guide has been read in the session, the + agent doesn't need to re-read it for later create/edit/fix + tools. """ if tool_name in self._inflight_tool_calls: return True diff --git a/autogpt_platform/backend/backend/copilot/tools/agent_guide_gate_test.py b/autogpt_platform/backend/backend/copilot/tools/agent_guide_gate_test.py index 4db63fcb9c..a5c03162e3 100644 --- a/autogpt_platform/backend/backend/copilot/tools/agent_guide_gate_test.py +++ b/autogpt_platform/backend/backend/copilot/tools/agent_guide_gate_test.py @@ -23,7 +23,7 @@ def _session_with_messages( Uses ``ChatSession.new`` + attribute reassignment rather than ``MagicMock(spec=...)`` because the gate now calls - ``session.has_tool_been_called_this_turn(...)`` and a ``spec`` mock + ``session.has_tool_been_called(...)`` and a ``spec`` mock returns a truthy ``MagicMock`` from that call, hiding real gate behaviour. A live ``ChatSession`` also correctly initialises the ``_inflight_tool_calls`` PrivateAttr scratch buffer used by the diff --git a/autogpt_platform/backend/backend/copilot/tools/helpers.py b/autogpt_platform/backend/backend/copilot/tools/helpers.py index ccd335868d..6c25e79188 100644 --- a/autogpt_platform/backend/backend/copilot/tools/helpers.py +++ b/autogpt_platform/backend/backend/copilot/tools/helpers.py @@ -791,14 +791,14 @@ def require_guide_read(session: ChatSession, tool_name: str): """Return an ErrorResponse if the guide hasn't been loaded this session. Import inline to keep ``helpers.py`` free of tool-response imports. 
- Uses :meth:`ChatSession.has_tool_been_called_this_turn` which checks - both the persisted ``messages`` list and the in-flight announcement - buffer — so a guide call dispatched earlier in the *current* turn - (before ``session.messages`` flushes at turn end) is recognised too. - Otherwise a second tool in the same turn would re-fire this guard - despite the guide having been called — seen on Kimi K2.6 in - particular because its aggressive tool-call chaining exercises this - path far more than Sonnet does. + Uses :meth:`ChatSession.has_tool_been_called` which checks both the + persisted ``messages`` list (session-wide) and the in-flight + announcement buffer — so a guide call dispatched earlier in the + *current* turn (before ``session.messages`` flushes at turn end) is + recognised too. Otherwise a second tool in the same turn would + re-fire this guard despite the guide having been called — seen on + Kimi K2.6 in particular because its aggressive tool-call chaining + exercises this path far more than Sonnet does. """ from .models import ErrorResponse # noqa: PLC0415 — avoid circular import @@ -808,7 +808,7 @@ def require_guide_read(session: ChatSession, tool_name: str): # requiring one would waste a round-trip every turn. 
if session.metadata.builder_graph_id: return None - if session.has_tool_been_called_this_turn(_AGENT_GUIDE_TOOL_NAME): + if session.has_tool_been_called(_AGENT_GUIDE_TOOL_NAME): return None return ErrorResponse( message=( From 54d6d4a3e61da120490317761e2bde0360461a77 Mon Sep 17 00:00:00 2001 From: majdyz Date: Tue, 21 Apr 2026 23:17:36 +0700 Subject: [PATCH 11/25] fix(backend/copilot): drive baseline perplexity supplement from PerplexityModel enum MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Self-review on #12871 found the supplement was shipping invented sonar IDs: the prompt told Kimi to pass bare ``"sonar"`` / ``"sonar-pro"`` / ``"sonar-reasoning"`` / ``"sonar-reasoning-pro"``, but ``PerplexityModel`` only accepts the provider-prefixed forms (``perplexity/sonar``, ``perplexity/sonar-pro``, ``perplexity/sonar-deep-research``). The block's ``_sanitize_perplexity_model`` silently coerced every unknown value back to ``perplexity/sonar`` with a WARNING — so ``-pro`` and the two nonexistent ``-reasoning`` variants all collapsed to plain ``sonar`` and nobody got deeper research when they asked for it. Rewrite the supplement to render the valid model list directly from ``PerplexityModel`` at call time, and name the default with its enum value (``perplexity/sonar``). Prose now tells the model it MUST pass the provider-prefixed value verbatim and that unknown values silently fall back, so it can't wander off. Two new regression tests: * ``test_supplement_uses_perplexitymodel_enum_values_verbatim`` asserts every enum value surfaces in the rendered text and the default example is ``"model": "perplexity/sonar"`` — upstream adding or dropping a SKU automatically stays in sync with the supplement without any further edits. * ``test_supplement_does_not_mention_invented_sonar_variants`` explicitly rejects the old bare/reasoning strings so the next reader can't accidentally reintroduce them. 
The existing registry-drift tests (block IDs pinned to ``PerplexityBlock().id`` / ``SendWebRequestBlock().id``) stay in place. --- .../backend/backend/copilot/prompting.py | 25 +++++++++-- .../backend/backend/copilot/prompting_test.py | 41 +++++++++++++++++-- 2 files changed, 59 insertions(+), 7 deletions(-) diff --git a/autogpt_platform/backend/backend/copilot/prompting.py b/autogpt_platform/backend/backend/copilot/prompting.py index 21cae58192..b0ec560730 100644 --- a/autogpt_platform/backend/backend/copilot/prompting.py +++ b/autogpt_platform/backend/backend/copilot/prompting.py @@ -436,11 +436,25 @@ def get_baseline_web_search_supplement() -> str: * SendWebRequest (fetch an arbitrary URL) — ``SEND_WEB_REQUEST_BLOCK_ID``. + The Perplexity model list is pulled from ``PerplexityModel`` at + render time — if a sonar SKU is added / dropped upstream the + supplement follows automatically and the registry test catches + block-ID drift. + The supplement is static — no per-user / per-session content — so it stays on the cacheable prefix. Append to the baseline system prompt only; SDK callers would just confuse their native ``WebSearch`` with a competing block recipe. """ + from backend.blocks.perplexity import PerplexityModel # noqa: PLC0415 + + default_model = PerplexityModel.SONAR.value + # Enumerate in declaration order so the default sonar SKU stays + # first. ``.value`` gives the provider-prefixed form Kimi must pass + # verbatim (``perplexity/sonar``, not bare ``sonar``) — the block's + # input validator coerces unknown values back to SONAR, so the + # previous prompt silently degraded every call. + model_lines = "\n".join(f" - ``{m.value}``" for m in PerplexityModel) return f""" ## Web Search & URL Fetch (fast mode) @@ -450,10 +464,13 @@ copilot blocks via ``run_block`` when you need live web content. 
### Web search with citations — Perplexity - Block ID: ``{PERPLEXITY_BLOCK_ID}`` -- Input: ``{{"prompt": "", "model": "sonar"}}`` - (``model`` defaults to ``sonar``; other options: ``sonar-pro``, - ``sonar-reasoning``, ``sonar-reasoning-pro``, ``sonar-deep-research`` - — use ``sonar`` unless the user asks for deeper research.) +- Input: ``{{"prompt": "", "model": "{default_model}"}}``. +- ``model`` MUST be one of the provider-prefixed values below (pass the + full string verbatim — unknown values silently fall back to + ``{default_model}``): +{model_lines} +- Default to ``{default_model}`` unless the user asks for deeper + research, in which case pass ``perplexity/sonar-deep-research``. - Output: ``response`` (string), ``annotations`` (list of URL citations). - Requires Perplexity credentials connected to the user's account. If the block errors with a missing-credentials message, call diff --git a/autogpt_platform/backend/backend/copilot/prompting_test.py b/autogpt_platform/backend/backend/copilot/prompting_test.py index 837044772d..c8be038012 100644 --- a/autogpt_platform/backend/backend/copilot/prompting_test.py +++ b/autogpt_platform/backend/backend/copilot/prompting_test.py @@ -87,9 +87,44 @@ class TestBaselineWebSearchSupplement: assert '"prompt"' in text # SendWebRequest required input. assert '"url"' in text - # Default Perplexity model is named explicitly so Kimi doesn't - # guess (``sonar-xl`` etc. 404 on the Perplexity API). - assert '"sonar"' in text + + def test_supplement_uses_perplexitymodel_enum_values_verbatim(self): + """Regression: the earlier supplement invented bare sonar IDs + (``"sonar"``, ``"sonar-reasoning"``, ``"sonar-reasoning-pro"``) + that don't match ``PerplexityModel`` values — every call logged + an ``Invalid PerplexityModel`` warning and silently fell back to + plain ``sonar``. 
The supplement must now list exactly the enum + values, in full provider-prefixed form, and the default must + equal ``PerplexityModel.SONAR.value``.""" + from backend.blocks.perplexity import PerplexityModel + + text = prompting.get_baseline_web_search_supplement() + # Every enum value surfaces verbatim. + for model in PerplexityModel: + assert ( + model.value in text + ), f"Supplement missing {model.value!r} (known PerplexityModel value)" + # The default example carries the provider prefix so Kimi can + # pass it through without the fallback warning firing. + assert f'"model": "{PerplexityModel.SONAR.value}"' in text + + def test_supplement_does_not_mention_invented_sonar_variants(self): + """Regression: these bare strings were listed as valid Perplexity + models before the enum-driven rewrite — none match a real + ``PerplexityModel`` value, so the block silently fell back to + ``SONAR`` on every call. Guard against the next reader + accidentally reintroducing them.""" + text = prompting.get_baseline_web_search_supplement() + # ``sonar-reasoning`` / ``sonar-reasoning-pro`` are not enum + # members today — if upstream adds them, re-enable this check + # alongside an ``assert PerplexityModel.SONAR_REASONING ...``. + assert "sonar-reasoning" not in text + # Bare ``"sonar"`` without the ``perplexity/`` prefix is rejected + # by the block's model validator; the enum-driven supplement + # should emit only the provider-prefixed form. Check the + # quote-wrapped bare form to avoid matching ``perplexity/sonar``. 
+ assert '"sonar"' not in text + assert '"sonar-pro"' not in text def test_supplement_flags_credentials_dependency(self): text = prompting.get_baseline_web_search_supplement() From e48144b356a76b8b70a6d96eb5d3a7ea4882e9f6 Mon Sep 17 00:00:00 2001 From: majdyz Date: Tue, 21 Apr 2026 23:21:02 +0700 Subject: [PATCH 12/25] fix(backend/copilot): add explicit validation_alias for fast_advanced_model env var MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sentry review on #12871 flagged ``fast_advanced_model`` as the only cell in the (path, tier) matrix without a ``validation_alias`` — the docstring said override via ``CHAT_FAST_ADVANCED_MODEL`` but the alias wasn't declared. The env var does in fact bind today via ``env_prefix = "CHAT_"``, so this isn't breaking anything right now — but it's the only field of the four that binds implicitly, and any future refactor that drops ``env_prefix`` would silently lose the override without a test catching it. Add ``validation_alias=AliasChoices("CHAT_FAST_ADVANCED_MODEL")`` and a new regression test ``test_all_four_new_env_vars_bind_to_their_cells`` that sets all four ``CHAT_*_*_MODEL`` vars (with the legacy aliases cleared) and asserts each cell reads back the right explicit value. Paired with the existing ``test_legacy_env_aliases_route_to_new_fields``, the config contract is fully pinned from both sides (new names + legacy names). 
--- .../baseline/transcript_integration_test.py | 28 +++++++++++++++++++ .../backend/backend/copilot/config.py | 9 +++++- 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/autogpt_platform/backend/backend/copilot/baseline/transcript_integration_test.py b/autogpt_platform/backend/backend/copilot/baseline/transcript_integration_test.py index 53290536ea..808b06eb32 100644 --- a/autogpt_platform/backend/backend/copilot/baseline/transcript_integration_test.py +++ b/autogpt_platform/backend/backend/copilot/baseline/transcript_integration_test.py @@ -153,6 +153,34 @@ class TestResolveBaselineModel: assert cfg.thinking_advanced_model == "legacy/opus-via-advanced" assert cfg.fast_standard_model == "legacy/fast-via-fast-model" + def test_all_four_new_env_vars_bind_to_their_cells(self, monkeypatch): + """Each of the four (path, tier) cells must be overridable via + its documented ``CHAT_*_*_MODEL`` env var — including + ``CHAT_FAST_ADVANCED_MODEL`` which was missing a + ``validation_alias`` in the original split and only bound + implicitly through ``env_prefix``. Pinning all four here so + that whenever someone touches the config shape, an accidental + unbinding fails CI instead of silently ignoring operator + overrides. + """ + from backend.copilot.config import ChatConfig + + monkeypatch.setenv("CHAT_FAST_STANDARD_MODEL", "explicit/fast-std") + monkeypatch.setenv("CHAT_FAST_ADVANCED_MODEL", "explicit/fast-adv") + monkeypatch.setenv("CHAT_THINKING_STANDARD_MODEL", "explicit/think-std") + monkeypatch.setenv("CHAT_THINKING_ADVANCED_MODEL", "explicit/think-adv") + # Clear the legacy aliases so they don't win priority in + # ``AliasChoices`` (first match wins). 
+ for legacy in ("CHAT_MODEL", "CHAT_ADVANCED_MODEL", "CHAT_FAST_MODEL"): + monkeypatch.delenv(legacy, raising=False) + + cfg = ChatConfig() + + assert cfg.fast_standard_model == "explicit/fast-std" + assert cfg.fast_advanced_model == "explicit/fast-adv" + assert cfg.thinking_standard_model == "explicit/think-std" + assert cfg.thinking_advanced_model == "explicit/think-adv" + class TestLoadPriorTranscript: """``_load_prior_transcript`` wraps the CLI session restore + validate + load flow.""" diff --git a/autogpt_platform/backend/backend/copilot/config.py b/autogpt_platform/backend/backend/copilot/config.py index 7d953dbd22..0c3c55d8dc 100644 --- a/autogpt_platform/backend/backend/copilot/config.py +++ b/autogpt_platform/backend/backend/copilot/config.py @@ -60,6 +60,9 @@ class ChatConfig(BaseSettings): ) fast_advanced_model: str = Field( default="anthropic/claude-opus-4.7", + validation_alias=AliasChoices( + "CHAT_FAST_ADVANCED_MODEL", + ), description="Baseline path, 'advanced' tier. Opus by default so " "the advanced tier is a clean A/B vs the SDK advanced tier: same " "model, different path — isolates the reasoning-wire + cache " @@ -67,7 +70,11 @@ class ChatConfig(BaseSettings): "(the reasoning-native sibling) benchmarks ~9pp behind K2.6 on " "SWE-Bench Verified and ~23pp behind on BrowseComp, is text-only, " "and was published 6 months before K2.6 — not a fit for the " - "advanced tier today. Override via ``CHAT_FAST_ADVANCED_MODEL``.", + "advanced tier today. Override via ``CHAT_FAST_ADVANCED_MODEL``. 
" + "Unlike the other three cells there is no legacy env var to " + "alias — this cell was created in the 2×2 split — but the " + "``validation_alias`` stays explicit so the override isn't " + "coupled to ``env_prefix`` continuing to exist.", ) thinking_standard_model: str = Field( default="anthropic/claude-sonnet-4-6", From c9a86e83390dfdf67af0c13ae2dfe2c346a904a6 Mon Sep 17 00:00:00 2001 From: majdyz Date: Tue, 21 Apr 2026 23:23:09 +0700 Subject: [PATCH 13/25] Revert "fix(backend/copilot): drive baseline perplexity supplement from PerplexityModel enum" This reverts commit 54d6d4a3e61da120490317761e2bde0360461a77. --- .../backend/backend/copilot/prompting.py | 25 ++--------- .../backend/backend/copilot/prompting_test.py | 41 ++----------------- 2 files changed, 7 insertions(+), 59 deletions(-) diff --git a/autogpt_platform/backend/backend/copilot/prompting.py b/autogpt_platform/backend/backend/copilot/prompting.py index b0ec560730..21cae58192 100644 --- a/autogpt_platform/backend/backend/copilot/prompting.py +++ b/autogpt_platform/backend/backend/copilot/prompting.py @@ -436,25 +436,11 @@ def get_baseline_web_search_supplement() -> str: * SendWebRequest (fetch an arbitrary URL) — ``SEND_WEB_REQUEST_BLOCK_ID``. - The Perplexity model list is pulled from ``PerplexityModel`` at - render time — if a sonar SKU is added / dropped upstream the - supplement follows automatically and the registry test catches - block-ID drift. - The supplement is static — no per-user / per-session content — so it stays on the cacheable prefix. Append to the baseline system prompt only; SDK callers would just confuse their native ``WebSearch`` with a competing block recipe. """ - from backend.blocks.perplexity import PerplexityModel # noqa: PLC0415 - - default_model = PerplexityModel.SONAR.value - # Enumerate in declaration order so the default sonar SKU stays - # first. 
``.value`` gives the provider-prefixed form Kimi must pass - # verbatim (``perplexity/sonar``, not bare ``sonar``) — the block's - # input validator coerces unknown values back to SONAR, so the - # previous prompt silently degraded every call. - model_lines = "\n".join(f" - ``{m.value}``" for m in PerplexityModel) return f""" ## Web Search & URL Fetch (fast mode) @@ -464,13 +450,10 @@ copilot blocks via ``run_block`` when you need live web content. ### Web search with citations — Perplexity - Block ID: ``{PERPLEXITY_BLOCK_ID}`` -- Input: ``{{"prompt": "", "model": "{default_model}"}}``. -- ``model`` MUST be one of the provider-prefixed values below (pass the - full string verbatim — unknown values silently fall back to - ``{default_model}``): -{model_lines} -- Default to ``{default_model}`` unless the user asks for deeper - research, in which case pass ``perplexity/sonar-deep-research``. +- Input: ``{{"prompt": "", "model": "sonar"}}`` + (``model`` defaults to ``sonar``; other options: ``sonar-pro``, + ``sonar-reasoning``, ``sonar-reasoning-pro``, ``sonar-deep-research`` + — use ``sonar`` unless the user asks for deeper research.) - Output: ``response`` (string), ``annotations`` (list of URL citations). - Requires Perplexity credentials connected to the user's account. If the block errors with a missing-credentials message, call diff --git a/autogpt_platform/backend/backend/copilot/prompting_test.py b/autogpt_platform/backend/backend/copilot/prompting_test.py index c8be038012..837044772d 100644 --- a/autogpt_platform/backend/backend/copilot/prompting_test.py +++ b/autogpt_platform/backend/backend/copilot/prompting_test.py @@ -87,44 +87,9 @@ class TestBaselineWebSearchSupplement: assert '"prompt"' in text # SendWebRequest required input. 
assert '"url"' in text - - def test_supplement_uses_perplexitymodel_enum_values_verbatim(self): - """Regression: the earlier supplement invented bare sonar IDs - (``"sonar"``, ``"sonar-reasoning"``, ``"sonar-reasoning-pro"``) - that don't match ``PerplexityModel`` values — every call logged - an ``Invalid PerplexityModel`` warning and silently fell back to - plain ``sonar``. The supplement must now list exactly the enum - values, in full provider-prefixed form, and the default must - equal ``PerplexityModel.SONAR.value``.""" - from backend.blocks.perplexity import PerplexityModel - - text = prompting.get_baseline_web_search_supplement() - # Every enum value surfaces verbatim. - for model in PerplexityModel: - assert ( - model.value in text - ), f"Supplement missing {model.value!r} (known PerplexityModel value)" - # The default example carries the provider prefix so Kimi can - # pass it through without the fallback warning firing. - assert f'"model": "{PerplexityModel.SONAR.value}"' in text - - def test_supplement_does_not_mention_invented_sonar_variants(self): - """Regression: these bare strings were listed as valid Perplexity - models before the enum-driven rewrite — none match a real - ``PerplexityModel`` value, so the block silently fell back to - ``SONAR`` on every call. Guard against the next reader - accidentally reintroducing them.""" - text = prompting.get_baseline_web_search_supplement() - # ``sonar-reasoning`` / ``sonar-reasoning-pro`` are not enum - # members today — if upstream adds them, re-enable this check - # alongside an ``assert PerplexityModel.SONAR_REASONING ...``. - assert "sonar-reasoning" not in text - # Bare ``"sonar"`` without the ``perplexity/`` prefix is rejected - # by the block's model validator; the enum-driven supplement - # should emit only the provider-prefixed form. Check the - # quote-wrapped bare form to avoid matching ``perplexity/sonar``. 
- assert '"sonar"' not in text - assert '"sonar-pro"' not in text + # Default Perplexity model is named explicitly so Kimi doesn't + # guess (``sonar-xl`` etc. 404 on the Perplexity API). + assert '"sonar"' in text def test_supplement_flags_credentials_dependency(self): text = prompting.get_baseline_web_search_supplement() From 0d8a27fb7a7f7ce7c861a6942098e0552ab58694 Mon Sep 17 00:00:00 2001 From: majdyz Date: Tue, 21 Apr 2026 23:23:09 +0700 Subject: [PATCH 14/25] Revert "feat(backend/copilot): baseline web-search supplement with Perplexity + SendWebRequest block IDs" This reverts commit 1848810b320c18011628783f6f745250f4ebb76d. --- .../backend/copilot/baseline/service.py | 7 +-- .../backend/backend/copilot/prompting.py | 62 ------------------- .../backend/backend/copilot/prompting_test.py | 39 ------------ 3 files changed, 1 insertion(+), 107 deletions(-) diff --git a/autogpt_platform/backend/backend/copilot/baseline/service.py b/autogpt_platform/backend/backend/copilot/baseline/service.py index 01c0aff51a..6aa88e9d41 100644 --- a/autogpt_platform/backend/backend/copilot/baseline/service.py +++ b/autogpt_platform/backend/backend/copilot/baseline/service.py @@ -55,11 +55,7 @@ from backend.copilot.pending_messages import ( drain_pending_messages, format_pending_as_user_message, ) -from backend.copilot.prompting import ( - SHARED_TOOL_NOTES, - get_baseline_web_search_supplement, - get_graphiti_supplement, -) +from backend.copilot.prompting import SHARED_TOOL_NOTES, get_graphiti_supplement from backend.copilot.response_model import ( StreamBaseResponse, StreamError, @@ -1421,7 +1417,6 @@ async def stream_chat_completion_baseline( system_prompt = ( base_system_prompt + SHARED_TOOL_NOTES - + get_baseline_web_search_supplement() + graphiti_supplement + builder_session_suffix ) diff --git a/autogpt_platform/backend/backend/copilot/prompting.py b/autogpt_platform/backend/backend/copilot/prompting.py index 21cae58192..399d31c1cc 100644 --- 
a/autogpt_platform/backend/backend/copilot/prompting.py +++ b/autogpt_platform/backend/backend/copilot/prompting.py @@ -411,65 +411,3 @@ You have access to persistent temporal memory tools that remember facts across s - group_id is handled automatically by the system — never set it yourself. - When storing, be specific about operational rules and instructions (e.g., "CC Sarah on client communications" not just "Sarah is the assistant"). """ - - -# Block IDs for the baseline-path web-search supplement. Surfaced as module -# constants so a test can assert they match the live block registry (see -# prompting_test.py — if a block is renamed/deleted the assertion catches it -# before the prompt ships a dead UUID). -PERPLEXITY_BLOCK_ID = "c8a5f2e9-8b3d-4a7e-9f6c-1d5e3c9b7a4f" -SEND_WEB_REQUEST_BLOCK_ID = "6595ae1f-b924-42cb-9a41-551a0611c4b4" - - -def get_baseline_web_search_supplement() -> str: - """Web-search instructions for the baseline (fast) path. - - SDK (extended_thinking) mode has Claude's native ``WebSearch`` tool, - so it gets web access for free. The baseline path does NOT — it runs - against whatever OpenRouter model is configured (Kimi K2.6 by - default) and those providers don't expose a built-in search. To - close the gap without adding a new tool type, we point the model at - two existing copilot blocks via ``run_block``: - - * Perplexity (web search w/ citations, sonar models) — - ``PERPLEXITY_BLOCK_ID``. - * SendWebRequest (fetch an arbitrary URL) — - ``SEND_WEB_REQUEST_BLOCK_ID``. - - The supplement is static — no per-user / per-session content — so it - stays on the cacheable prefix. Append to the baseline system prompt - only; SDK callers would just confuse their native ``WebSearch`` with - a competing block recipe. - """ - return f""" - -## Web Search & URL Fetch (fast mode) - -Fast mode doesn't expose a native web-search tool — use the existing -copilot blocks via ``run_block`` when you need live web content. 
- -### Web search with citations — Perplexity -- Block ID: ``{PERPLEXITY_BLOCK_ID}`` -- Input: ``{{"prompt": "", "model": "sonar"}}`` - (``model`` defaults to ``sonar``; other options: ``sonar-pro``, - ``sonar-reasoning``, ``sonar-reasoning-pro``, ``sonar-deep-research`` - — use ``sonar`` unless the user asks for deeper research.) -- Output: ``response`` (string), ``annotations`` (list of URL citations). -- Requires Perplexity credentials connected to the user's account. If - the block errors with a missing-credentials message, call - ``connect_integration`` for Perplexity and retry. - -### Fetch a specific URL — SendWebRequest -- Block ID: ``{SEND_WEB_REQUEST_BLOCK_ID}`` -- Input: ``{{"url": "", "method": "GET"}}`` (supports POST / PUT / - DELETE / PATCH too; add ``headers`` / ``body`` as needed). -- Output: ``response`` (server body), plus ``client_error`` / - ``server_error`` / ``error`` channels — check the error outputs - before trusting ``response``. - -Prefer Perplexity for open-ended questions ("what's the latest...", -"find articles about...") and SendWebRequest when the user names a -specific URL or needs structured API data. Call ``find_block`` first -only if neither pattern fits — the two block IDs above cover 95% of -baseline web-access needs and skip one search round-trip. 
-""" diff --git a/autogpt_platform/backend/backend/copilot/prompting_test.py b/autogpt_platform/backend/backend/copilot/prompting_test.py index 837044772d..5a719f1b00 100644 --- a/autogpt_platform/backend/backend/copilot/prompting_test.py +++ b/autogpt_platform/backend/backend/copilot/prompting_test.py @@ -56,42 +56,3 @@ class TestAgentGenerationGuideContainsClarifySection: "### Workflow" )[0] assert "ask_question" in clarify_section - - -class TestBaselineWebSearchSupplement: - """The fast-mode web-search supplement must point at block IDs that - actually exist and name each block's required input fields, so the - Kimi / baseline model can call them via ``run_block`` without a - ``find_block`` round-trip. Pinning the block IDs against the live - registry means a block rename / delete breaks this test rather than - shipping a dead UUID to the model.""" - - def test_perplexity_block_id_matches_registered_block(self): - from backend.blocks.perplexity import PerplexityBlock - - assert PerplexityBlock().id == prompting.PERPLEXITY_BLOCK_ID - - def test_send_web_request_block_id_matches_registered_block(self): - from backend.blocks.http import SendWebRequestBlock - - assert SendWebRequestBlock().id == prompting.SEND_WEB_REQUEST_BLOCK_ID - - def test_supplement_surfaces_both_block_ids(self): - text = prompting.get_baseline_web_search_supplement() - assert prompting.PERPLEXITY_BLOCK_ID in text - assert prompting.SEND_WEB_REQUEST_BLOCK_ID in text - - def test_supplement_names_required_inputs(self): - text = prompting.get_baseline_web_search_supplement() - # Perplexity required input. - assert '"prompt"' in text - # SendWebRequest required input. - assert '"url"' in text - # Default Perplexity model is named explicitly so Kimi doesn't - # guess (``sonar-xl`` etc. 404 on the Perplexity API). 
- assert '"sonar"' in text - - def test_supplement_flags_credentials_dependency(self): - text = prompting.get_baseline_web_search_supplement() - assert "credentials" in text.lower() - assert "connect_integration" in text From 059180427282793a52f58b77e85e889291e7ca13 Mon Sep 17 00:00:00 2001 From: majdyz Date: Tue, 21 Apr 2026 23:35:48 +0700 Subject: [PATCH 15/25] fix(backend/copilot): anchor Claude reasoning-route match to reject foreign provider prefixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sentry review on #12871 flagged that the Claude branch of ``_is_reasoning_route`` still used ``"claude" in lowered`` — broad substring match — while the Kimi branch got anchored in an earlier commit. A custom ``someprovider/claude-mock-v1`` configured via ``CHAT_FAST_STANDARD_MODEL`` would inherit the ``reasoning`` extra_body and 400 against its upstream. Tighten the gate: ``anthropic``/``anthropic.`` prefixes and the ``moonshotai/`` prefix are accepted as before, plus a bare ``claude-`` or ``kimi-`` model id with no provider prefix (keeps ``claude-3-5-sonnet-20241022`` / ``kimi-k2-instruct`` working for direct CLI configs). Anything with a foreign ``/`` prefix falls through to False — blocks both ``someprovider/claude-mock-v1`` and ``other/kimi-pro``. One explicit carve-out: ``openrouter/kimi-`` stays recognised because ``openrouter/`` was the existing prefix for K2.6 in earlier tests and changing it would be a behaviour regression. Adds ``test_claude_substring_false_positives_rejected`` covering both the new Claude and Kimi false-positive cases. All existing positive cases (including ``ANTHROPIC/Claude-Opus`` case-insensitive, ``anthropic.claude-3-5-sonnet`` Bedrock style) still pass. 
--- .../backend/copilot/baseline/reasoning.py | 50 ++++++++++++++----- .../copilot/baseline/reasoning_test.py | 15 ++++++ 2 files changed, 53 insertions(+), 12 deletions(-) diff --git a/autogpt_platform/backend/backend/copilot/baseline/reasoning.py b/autogpt_platform/backend/backend/copilot/baseline/reasoning.py index f301b50147..f5f8b576bb 100644 --- a/autogpt_platform/backend/backend/copilot/baseline/reasoning.py +++ b/autogpt_platform/backend/backend/copilot/baseline/reasoning.py @@ -162,23 +162,49 @@ def _is_reasoning_route(model: str) -> bool: because ``cache_control`` is strictly Anthropic-specific (Moonshot does its own auto-caching), so the two gates must not conflate. - The Kimi match anchors on the ``moonshotai/`` provider prefix or on a - bare / OpenRouter-prefixed ``kimi-`` model id (``kimi-k2.6``, - ``moonshotai/kimi-k2-thinking``, ``openrouter/kimi-k2.6``), so unrelated - models that happen to contain ``kimi`` as a substring (e.g. a - hypothetical ``some-provider/hakimi-large``) are not treated as - reasoning routes. + Both the Claude and Kimi matches are anchored to the provider + prefix (or to a bare model id with no prefix at all) to avoid + substring false positives — a custom ``some-other-provider/claude-mock`` + or ``provider/hakimi-large`` configured via + ``CHAT_FAST_STANDARD_MODEL`` must NOT inherit the reasoning + extra_body and take a 400 from its upstream. Recognised shapes: + + * Claude — ``anthropic/`` or ``anthropic.`` provider prefix, or a + bare ``claude-`` model id with no provider prefix + (``claude-opus-4.7``, ``anthropic/claude-sonnet-4-6``, + ``anthropic.claude-3-5-sonnet``). A non-Anthropic prefix like + ``someprovider/claude-mock`` is rejected on purpose. + * Kimi — ``moonshotai/`` provider prefix, or a ``kimi-`` model id + with no provider prefix (``kimi-k2.6``, + ``moonshotai/kimi-k2-thinking``). 
Like Claude, a non-Moonshot + prefix is rejected — exception: ``openrouter/kimi-k2.6`` stays + recognised because ``openrouter/`` is how we route to Moonshot + today and changing that would be a behaviour regression for + existing deployments. """ lowered = model.lower() - if "claude" in lowered or lowered.startswith("anthropic"): + if lowered.startswith("anthropic"): return True if lowered.startswith("moonshotai/"): return True - # Match a ``kimi-`` model id at string start or immediately after a - # provider prefix ``/`` — avoids substring false positives like - # ``hakimi``. - bare = lowered.rsplit("/", 1)[-1] - return bare.startswith("kimi-") + # ``openrouter/`` historically routes to whatever the default + # upstream for the model is — for kimi that's Moonshot, so accept + # ``openrouter/kimi-...`` here. Other ``openrouter/`` models + # (e.g. ``openrouter/auto``) fall through to the no-prefix check + # below and are rejected unless they start with ``claude-`` / + # ``kimi-`` after the slash, which no real OpenRouter route does. + if lowered.startswith("openrouter/kimi-"): + return True + if "/" in lowered: + # Any other provider prefix is a custom / non-Anthropic / + # non-Moonshot route and must not opt into reasoning. This + # blocks substring false positives like + # ``some-provider/claude-mock-v1`` or ``other/kimi-pro``. + return False + # No provider prefix — accept bare ``claude-*`` and ``kimi-*`` ids + # so direct CLI configs (``claude-3-5-sonnet-20241022``, + # ``kimi-k2-instruct``) keep working. 
+ return lowered.startswith("claude-") or lowered.startswith("kimi-") def reasoning_extra_body(model: str, max_thinking_tokens: int) -> dict[str, Any] | None: diff --git a/autogpt_platform/backend/backend/copilot/baseline/reasoning_test.py b/autogpt_platform/backend/backend/copilot/baseline/reasoning_test.py index 4d6d3c5623..e18c8066e4 100644 --- a/autogpt_platform/backend/backend/copilot/baseline/reasoning_test.py +++ b/autogpt_platform/backend/backend/copilot/baseline/reasoning_test.py @@ -173,6 +173,21 @@ class TestIsReasoningRoute: assert not _is_reasoning_route("hakimi") assert not _is_reasoning_route("akimi-7b") + def test_claude_substring_false_positives_rejected(self): + # Regression (Sentry review on #12871): ``'claude' in lowered`` + # matched any substring — a custom + # ``someprovider/claude-mock-v1`` set via + # ``CHAT_FAST_STANDARD_MODEL`` would inherit the reasoning + # extra_body and take a 400 from its upstream. The anchored + # match requires either an ``anthropic`` / ``anthropic.`` / + # ``anthropic/`` prefix, or a bare ``claude-`` id with no + # provider prefix. + assert not _is_reasoning_route("someprovider/claude-mock-v1") + assert not _is_reasoning_route("custom/claude-like-model") + # Same principle for Kimi — a non-Moonshot provider prefix is + # rejected even when the model id starts with ``kimi-``. + assert not _is_reasoning_route("other/kimi-pro") + class TestReasoningExtraBody: def test_anthropic_route_returns_fragment(self): From 1316e16f04697a3b4ee4c995e661f735bbd23743 Mon Sep 17 00:00:00 2001 From: majdyz Date: Tue, 21 Apr 2026 23:39:27 +0700 Subject: [PATCH 16/25] feat(backend/copilot): add web_search tool via Anthropic web_search beta MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New `web_search` copilot tool wraps Anthropic's server-side `web_search_20250305` so both SDK and baseline paths have a single unified search interface. 
Previously baseline (Kimi on OpenRouter) had no native search and had to go through the Perplexity block via `run_block`; SDK (Sonnet) used Claude Code's native WebSearch. * `copilot/tools/web_search.py` — `WebSearchTool` dispatches through `AsyncAnthropic.messages.create` with a cheap Haiku model + `web_search_20250305` tool, parses `web_search_tool_result` blocks into {title, url, snippet, page_age}. `is_available` hides the tool when no Anthropic API key is configured. * `sdk/tool_adapter.py` — moved `WebSearch` from SDK built-in-always list to `SDK_DISALLOWED_TOOLS` so SDK routes through `mcp__copilot__web_search` too. Single code path for cost tracking. * `persist_and_record_usage(provider="anthropic")` — billing lands in the same turn-accounting bucket as LLM cost, so rate limits and credit charges stay coherent. Cost = per-search fee ($10/1K) + Haiku dispatch tokens. * `copilot/tools/models.py` — new `WebSearchResponse` / `WebSearchResult` models matching the native WebSearch shape. 12 new tests: result extractor (title/url/snippet/page_age, limit cap, non-search blocks ignored), cost estimator (per-search fee linear in count), integration (cost tracker called with provider='anthropic'), no-API-key short-circuit, registry sanity. 
--- .../backend/copilot/sdk/tool_adapter.py | 5 +- .../backend/backend/copilot/tools/__init__.py | 2 + .../backend/backend/copilot/tools/models.py | 25 ++ .../backend/copilot/tools/web_search.py | 221 +++++++++++++ .../backend/copilot/tools/web_search_test.py | 304 ++++++++++++++++++ 5 files changed, 556 insertions(+), 1 deletion(-) create mode 100644 autogpt_platform/backend/backend/copilot/tools/web_search.py create mode 100644 autogpt_platform/backend/backend/copilot/tools/web_search_test.py diff --git a/autogpt_platform/backend/backend/copilot/sdk/tool_adapter.py b/autogpt_platform/backend/backend/copilot/sdk/tool_adapter.py index d97937da23..7e1fa0396d 100644 --- a/autogpt_platform/backend/backend/copilot/sdk/tool_adapter.py +++ b/autogpt_platform/backend/backend/copilot/sdk/tool_adapter.py @@ -779,7 +779,9 @@ def create_copilot_mcp_server(*, use_e2b: bool = False): # In E2B mode, all five are disabled — MCP equivalents provide direct sandbox # access. read_file also handles local tool-results and ephemeral reads. _SDK_BUILTIN_FILE_TOOLS = ["Read", "Write", "Edit", "Glob", "Grep"] -_SDK_BUILTIN_ALWAYS = ["Task", "Agent", "WebSearch", "TodoWrite"] +# WebSearch moved to ``SDK_DISALLOWED_TOOLS`` — routed through +# ``mcp__copilot__web_search`` so cost tracking is unified across paths. +_SDK_BUILTIN_ALWAYS = ["Task", "Agent", "TodoWrite"] _SDK_BUILTIN_TOOLS = [*_SDK_BUILTIN_FILE_TOOLS, *_SDK_BUILTIN_ALWAYS] # SDK built-in tools that must be explicitly blocked. 
@@ -805,6 +807,7 @@ _SDK_BUILTIN_TOOLS = [*_SDK_BUILTIN_FILE_TOOLS, *_SDK_BUILTIN_ALWAYS] SDK_DISALLOWED_TOOLS = [ "Bash", "WebFetch", + "WebSearch", "AskUserQuestion", "Write", "Edit", diff --git a/autogpt_platform/backend/backend/copilot/tools/__init__.py b/autogpt_platform/backend/backend/copilot/tools/__init__.py index 9ba050b79a..7aace646a6 100644 --- a/autogpt_platform/backend/backend/copilot/tools/__init__.py +++ b/autogpt_platform/backend/backend/copilot/tools/__init__.py @@ -45,6 +45,7 @@ from .run_sub_session import RunSubSessionTool from .search_docs import SearchDocsTool from .validate_agent import ValidateAgentGraphTool from .web_fetch import WebFetchTool +from .web_search import WebSearchTool from .workspace_files import ( DeleteWorkspaceFileTool, ListWorkspaceFilesTool, @@ -93,6 +94,7 @@ TOOL_REGISTRY: dict[str, BaseTool] = { "get_agent_building_guide": GetAgentBuildingGuideTool(), # Web fetch for safe URL retrieval "web_fetch": WebFetchTool(), + "web_search": WebSearchTool(), # Agent-browser multi-step automation (navigate, act, screenshot) "browser_navigate": BrowserNavigateTool(), "browser_act": BrowserActTool(), diff --git a/autogpt_platform/backend/backend/copilot/tools/models.py b/autogpt_platform/backend/backend/copilot/tools/models.py index 8fa7e6cbb4..08b62056a4 100644 --- a/autogpt_platform/backend/backend/copilot/tools/models.py +++ b/autogpt_platform/backend/backend/copilot/tools/models.py @@ -76,6 +76,7 @@ class ResponseType(str, Enum): # Web WEB_FETCH = "web_fetch" + WEB_SEARCH = "web_search" # Feature requests FEATURE_REQUEST_SEARCH = "feature_request_search" @@ -585,6 +586,30 @@ class WebFetchResponse(ToolResponseBase): truncated: bool = False +class WebSearchResult(BaseModel): + """One entry in a web_search tool response.""" + + title: str + url: str + snippet: str = "" + page_age: str | None = None + + +class WebSearchResponse(ToolResponseBase): + """Response for web_search tool — mirrors the shape of the SDK's + native 
``WebSearch`` tool so the LLM sees a consistent interface + regardless of which path dispatched the call.""" + + type: ResponseType = ResponseType.WEB_SEARCH + query: str + results: list[WebSearchResult] = Field(default_factory=list) + # Backend-reported usage for this call (copied from Anthropic's + # ``usage.server_tool_use``). Surfaces as metadata for frontend + # debug panels but is also what drives rate-limit / cost tracking + # via ``persist_and_record_usage(provider="anthropic")``. + search_requests: int = 0 + + class BashExecResponse(ToolResponseBase): """Response for bash_exec tool.""" diff --git a/autogpt_platform/backend/backend/copilot/tools/web_search.py b/autogpt_platform/backend/backend/copilot/tools/web_search.py new file mode 100644 index 0000000000..4b7ac3a53f --- /dev/null +++ b/autogpt_platform/backend/backend/copilot/tools/web_search.py @@ -0,0 +1,221 @@ +"""Web search tool — wraps Anthropic's server-side ``web_search`` beta. + +Single entry point for web search on both SDK and baseline paths. The +``web_search_20250305`` tool is server-side on Anthropic, so we call +the Messages API directly regardless of which LLM invoked the copilot +tool — OpenRouter can't proxy server-side tool execution. 
+""" + +import logging +from typing import Any + +from anthropic import AsyncAnthropic + +from backend.copilot.model import ChatSession +from backend.copilot.token_tracking import persist_and_record_usage +from backend.util.settings import Settings + +from .base import BaseTool +from .models import ErrorResponse, ToolResponseBase, WebSearchResponse, WebSearchResult + +logger = logging.getLogger(__name__) + +_WEB_SEARCH_DISPATCH_MODEL = "claude-haiku-4-5" +_MAX_DISPATCH_TOKENS = 512 +_DEFAULT_MAX_RESULTS = 5 +_HARD_MAX_RESULTS = 20 + + +class WebSearchTool(BaseTool): + """Search the public web and return cited results.""" + + @property + def name(self) -> str: + return "web_search" + + @property + def description(self) -> str: + return ( + "Search the web and return cited results. Use this for live " + "information — news, current events, up-to-date docs, recent " + "releases — when the model's training data would be stale. " + "Returns a list of {title, url, snippet} plus the URLs so " + "``web_fetch`` can deep-dive any result. Costs a few cents " + "per search; prefer one well-targeted query over many " + "reformulations." + ) + + @property + def parameters(self) -> dict[str, Any]: + return { + "type": "object", + "properties": { + "query": { + "type": "string", + "description": "The search query — a question or topic.", + }, + "max_results": { + "type": "integer", + "description": ( + f"Maximum results to return (default " + f"{_DEFAULT_MAX_RESULTS}, hard cap {_HARD_MAX_RESULTS})." 
+ ), + "default": _DEFAULT_MAX_RESULTS, + }, + }, + "required": ["query"], + } + + @property + def requires_auth(self) -> bool: + return False + + @property + def is_available(self) -> bool: + return bool(Settings().secrets.anthropic_api_key) + + async def _execute( + self, + user_id: str | None, + session: ChatSession, + query: str = "", + max_results: int = _DEFAULT_MAX_RESULTS, + **kwargs: Any, + ) -> ToolResponseBase: + query = (query or "").strip() + session_id = session.session_id if session else None + if not query: + return ErrorResponse( + message="Please provide a non-empty search query.", + error="missing_query", + session_id=session_id, + ) + + try: + max_results = int(max_results) + except (TypeError, ValueError): + max_results = _DEFAULT_MAX_RESULTS + max_results = max(1, min(max_results, _HARD_MAX_RESULTS)) + + api_key = Settings().secrets.anthropic_api_key + if not api_key: + return ErrorResponse( + message=( + "Web search is unavailable — the deployment has no " + "Anthropic API key configured." + ), + error="web_search_not_configured", + session_id=session_id, + ) + + client = AsyncAnthropic(api_key=api_key) + try: + resp = await client.messages.create( + model=_WEB_SEARCH_DISPATCH_MODEL, + max_tokens=_MAX_DISPATCH_TOKENS, + tools=[ + { + "type": "web_search_20250305", + "name": "web_search", + "max_uses": 1, + } + ], + messages=[ + { + "role": "user", + "content": ( + f"Use the web_search tool exactly once with the " + f"query {query!r} and then stop. Do not " + f"summarise — the caller parses the raw " + f"tool_result." 
+ ), + } + ], + ) + except Exception as exc: + logger.warning( + "[web_search] Anthropic call failed for query=%r: %s", query, exc + ) + return ErrorResponse( + message=f"Web search failed: {exc}", + error="web_search_failed", + session_id=session_id, + ) + + results, search_requests = _extract_results(resp, limit=max_results) + + cost_usd = _estimate_cost_usd(resp, search_requests=search_requests) + try: + usage = getattr(resp, "usage", None) + await persist_and_record_usage( + session=session, + user_id=user_id, + prompt_tokens=getattr(usage, "input_tokens", 0) or 0, + completion_tokens=getattr(usage, "output_tokens", 0) or 0, + log_prefix="[web_search]", + cost_usd=cost_usd, + model=_WEB_SEARCH_DISPATCH_MODEL, + provider="anthropic", + ) + except Exception as exc: + logger.warning("[web_search] usage tracking failed: %s", exc) + + return WebSearchResponse( + message=f"Found {len(results)} result(s) for {query!r}.", + query=query, + results=results, + search_requests=search_requests, + session_id=session_id, + ) + + +def _extract_results(resp: Any, *, limit: int) -> tuple[list[WebSearchResult], int]: + """Pull results + server-side request count from an Anthropic response.""" + results: list[WebSearchResult] = [] + search_requests = 0 + + for block in getattr(resp, "content", []) or []: + btype = getattr(block, "type", None) + if btype == "web_search_tool_result": + content = getattr(block, "content", []) or [] + for item in content: + if getattr(item, "type", None) != "web_search_result": + continue + if len(results) >= limit: + break + results.append( + WebSearchResult( + title=getattr(item, "title", "") or "", + url=getattr(item, "url", "") or "", + snippet=getattr(item, "encrypted_content", None) + or getattr(item, "page_content", "") + or "", + page_age=getattr(item, "page_age", None), + ) + ) + + usage = getattr(resp, "usage", None) + server_tool_use = getattr(usage, "server_tool_use", None) if usage else None + if server_tool_use is not None: + 
search_requests = getattr(server_tool_use, "web_search_requests", 0) or 0 + + return results, search_requests + + +# Update when Anthropic revises pricing. +_COST_PER_SEARCH_USD = 0.010 # $10 per 1,000 web_search requests +_HAIKU_INPUT_USD_PER_MTOK = 1.0 +_HAIKU_OUTPUT_USD_PER_MTOK = 5.0 + + +def _estimate_cost_usd(resp: Any, *, search_requests: int) -> float: + """Per-search fee × count + Haiku dispatch tokens.""" + usage = getattr(resp, "usage", None) + input_tokens = getattr(usage, "input_tokens", 0) if usage else 0 + output_tokens = getattr(usage, "output_tokens", 0) if usage else 0 + + search_cost = search_requests * _COST_PER_SEARCH_USD + inference_cost = (input_tokens / 1_000_000) * _HAIKU_INPUT_USD_PER_MTOK + ( + output_tokens / 1_000_000 + ) * _HAIKU_OUTPUT_USD_PER_MTOK + return round(search_cost + inference_cost, 6) diff --git a/autogpt_platform/backend/backend/copilot/tools/web_search_test.py b/autogpt_platform/backend/backend/copilot/tools/web_search_test.py new file mode 100644 index 0000000000..fe7885e171 --- /dev/null +++ b/autogpt_platform/backend/backend/copilot/tools/web_search_test.py @@ -0,0 +1,304 @@ +"""Tests for the ``web_search`` copilot tool. + +Covers the result extractor + cost estimator as pure units (fed with +synthetic Anthropic response objects), plus light integration tests that +mock ``AsyncAnthropic.messages.create`` and confirm the handler plumbs +through to ``persist_and_record_usage`` with the right provider tag. 
+""" + +from types import SimpleNamespace +from unittest.mock import AsyncMock, patch + +import pytest + +from backend.copilot.model import ChatSession + +from .models import ErrorResponse, WebSearchResponse, WebSearchResult +from .web_search import ( + _COST_PER_SEARCH_USD, + WebSearchTool, + _estimate_cost_usd, + _extract_results, +) + + +def _fake_anthropic_response( + *, + results: list[dict] | None = None, + search_requests: int = 1, + input_tokens: int = 120, + output_tokens: int = 40, +) -> SimpleNamespace: + """Build a synthetic Anthropic Messages response. + + Matches the shape produced by ``client.messages.create`` when the + response includes a ``web_search_tool_result`` content block and + ``usage.server_tool_use.web_search_requests`` on the turn meter. + """ + content = [] + if results is not None: + content.append( + SimpleNamespace( + type="web_search_tool_result", + content=[ + SimpleNamespace( + type="web_search_result", + title=r.get("title", "untitled"), + url=r.get("url", ""), + encrypted_content=r.get("snippet", ""), + page_age=r.get("page_age"), + ) + for r in results + ], + ) + ) + usage = SimpleNamespace( + input_tokens=input_tokens, + output_tokens=output_tokens, + server_tool_use=SimpleNamespace(web_search_requests=search_requests), + ) + return SimpleNamespace(content=content, usage=usage) + + +class TestExtractResults: + """The extractor is the only Anthropic-response-shape contact point; + pin its behaviour so an API shape change surfaces here first.""" + + def test_extracts_title_url_snippet_and_page_age(self): + resp = _fake_anthropic_response( + results=[ + { + "title": "Kimi K2.6 launch", + "url": "https://example.com/kimi", + "snippet": "Moonshot released K2.6 on 2026-04-20.", + "page_age": "1 day", + }, + { + "title": "OpenRouter pricing", + "url": "https://openrouter.ai/moonshotai/kimi-k2.6", + "snippet": "", + }, + ] + ) + out, requests = _extract_results(resp, limit=10) + assert requests == 1 + assert len(out) == 2 + assert 
out[0].title == "Kimi K2.6 launch" + assert out[0].url == "https://example.com/kimi" + assert out[0].snippet.startswith("Moonshot released") + assert out[0].page_age == "1 day" + assert out[1].snippet == "" + + def test_limit_caps_returned_results(self): + resp = _fake_anthropic_response( + results=[{"title": f"r{i}", "url": f"https://e/{i}"} for i in range(10)] + ) + out, _ = _extract_results(resp, limit=3) + assert len(out) == 3 + assert [r.title for r in out] == ["r0", "r1", "r2"] + + def test_missing_content_returns_empty(self): + resp = SimpleNamespace(content=[], usage=None) + out, requests = _extract_results(resp, limit=10) + assert out == [] + assert requests == 0 + + def test_non_search_blocks_are_ignored(self): + resp = SimpleNamespace( + content=[ + SimpleNamespace(type="text", text="Here's what I found..."), + SimpleNamespace( + type="web_search_tool_result", + content=[ + SimpleNamespace( + type="web_search_result", + title="real", + url="https://real.example", + encrypted_content="body", + page_age=None, + ) + ], + ), + ], + usage=None, + ) + out, _ = _extract_results(resp, limit=10) + assert len(out) == 1 and out[0].title == "real" + + +class TestEstimateCostUsd: + """Pin the per-search fee + Haiku inference math — the pricing + constants in ``web_search.py`` are hard-coded (no live lookup) so a + drift between Anthropic's schedule and our constants must surface + in this test for the next reader to notice.""" + + def test_zero_searches_still_charges_inference(self): + resp = _fake_anthropic_response(results=[], search_requests=0) + cost = _estimate_cost_usd(resp, search_requests=0) + # Haiku at the default 120 input / 40 output tokens ($1/$5 per MTok) = tiny but non-zero. 
+ assert 0 < cost < 0.001 + + def test_single_search_fee_dominates(self): + resp = _fake_anthropic_response( + results=[{"title": "x", "url": "https://e"}], + search_requests=1, + input_tokens=100, + output_tokens=20, + ) + cost = _estimate_cost_usd(resp, search_requests=1) + # ~$0.010 search + trivial inference — total still ~1 cent. + assert cost >= _COST_PER_SEARCH_USD + assert cost < _COST_PER_SEARCH_USD + 0.001 + + def test_three_searches_linear_in_count(self): + resp = _fake_anthropic_response( + results=[], search_requests=3, input_tokens=0, output_tokens=0 + ) + cost = _estimate_cost_usd(resp, search_requests=3) + assert cost == pytest.approx(3 * _COST_PER_SEARCH_USD) + + +class TestWebSearchToolDispatch: + """Lightweight integration test: mock the Anthropic client, confirm + the handler returns a ``WebSearchResponse`` and the usage tracker is + called with ``provider='anthropic'`` (not 'open_router', even on the + baseline path — server-side web_search bills Anthropic regardless of + the calling LLM's route).""" + + def _session(self) -> ChatSession: + s = ChatSession.new("test-user", dry_run=False) + s.session_id = "sess-1" + return s + + @pytest.mark.asyncio + async def test_returns_response_with_results_and_tracks_cost(self, monkeypatch): + fake_resp = _fake_anthropic_response( + results=[ + { + "title": "hello", + "url": "https://example.com", + "snippet": "greeting", + } + ], + search_requests=1, + ) + mock_client = type( + "MC", + (), + { + "messages": type( + "M", (), {"create": AsyncMock(return_value=fake_resp)} + )() + }, + )() + + # Stub the Anthropic API key so ``is_available`` is True. 
+ monkeypatch.setattr( + "backend.copilot.tools.web_search.Settings", + lambda: SimpleNamespace( + secrets=SimpleNamespace(anthropic_api_key="sk-test") + ), + ) + + with ( + patch( + "backend.copilot.tools.web_search.AsyncAnthropic", + return_value=mock_client, + ), + patch( + "backend.copilot.tools.web_search.persist_and_record_usage", + new=AsyncMock(return_value=160), + ) as mock_track, + ): + tool = WebSearchTool() + result = await tool._execute( + user_id="u1", + session=self._session(), + query="kimi k2.6 launch", + max_results=5, + ) + + assert isinstance(result, WebSearchResponse) + assert result.query == "kimi k2.6 launch" + assert len(result.results) == 1 + assert isinstance(result.results[0], WebSearchResult) + assert result.search_requests == 1 + + # Cost tracker must have been called with provider="anthropic". + assert mock_track.await_count == 1 + kwargs = mock_track.await_args.kwargs + assert kwargs["provider"] == "anthropic" + assert kwargs["model"] == "claude-haiku-4-5" + assert kwargs["user_id"] == "u1" + assert kwargs["cost_usd"] >= _COST_PER_SEARCH_USD + + @pytest.mark.asyncio + async def test_missing_api_key_returns_error_without_calling_anthropic( + self, monkeypatch + ): + monkeypatch.setattr( + "backend.copilot.tools.web_search.Settings", + lambda: SimpleNamespace(secrets=SimpleNamespace(anthropic_api_key="")), + ) + anthropic_stub = AsyncMock() + with ( + patch( + "backend.copilot.tools.web_search.AsyncAnthropic", + return_value=anthropic_stub, + ), + patch( + "backend.copilot.tools.web_search.persist_and_record_usage", + new=AsyncMock(), + ) as mock_track, + ): + tool = WebSearchTool() + assert tool.is_available is False + result = await tool._execute( + user_id="u1", + session=self._session(), + query="anything", + ) + assert isinstance(result, ErrorResponse) + assert result.error == "web_search_not_configured" + anthropic_stub.messages.create.assert_not_called() + mock_track.assert_not_called() + + @pytest.mark.asyncio + async def 
test_empty_query_rejected_without_api_call(self, monkeypatch): + monkeypatch.setattr( + "backend.copilot.tools.web_search.Settings", + lambda: SimpleNamespace( + secrets=SimpleNamespace(anthropic_api_key="sk-test") + ), + ) + anthropic_stub = AsyncMock() + with patch( + "backend.copilot.tools.web_search.AsyncAnthropic", + return_value=anthropic_stub, + ): + tool = WebSearchTool() + result = await tool._execute( + user_id="u1", session=self._session(), query=" " + ) + assert isinstance(result, ErrorResponse) + assert result.error == "missing_query" + anthropic_stub.messages.create.assert_not_called() + + +class TestToolRegistryIntegration: + """The tool must be registered under the ``web_search`` name so the + MCP layer exposes it as ``mcp__copilot__web_search`` — which is + what the SDK path now dispatches to (see + ``sdk/tool_adapter.py::SDK_DISALLOWED_TOOLS`` which blocks the CLI's + native ``WebSearch`` in favour of the MCP route).""" + + def test_web_search_is_in_tool_registry(self): + from backend.copilot.tools import TOOL_REGISTRY + + assert "web_search" in TOOL_REGISTRY + assert isinstance(TOOL_REGISTRY["web_search"], WebSearchTool) + + def test_sdk_native_websearch_is_disallowed(self): + from backend.copilot.sdk.tool_adapter import SDK_DISALLOWED_TOOLS + + assert "WebSearch" in SDK_DISALLOWED_TOOLS From 3bc28ac691aafa8ef2b8e232a63c318e432c090b Mon Sep 17 00:00:00 2001 From: majdyz Date: Tue, 21 Apr 2026 23:43:25 +0700 Subject: [PATCH 17/25] refactor(backend/copilot): tighten anthropic prefix match + trim fast_advanced_model comment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - _is_reasoning_route: match both ``anthropic/`` and ``anthropic.`` explicitly so ``anthropic-mock`` and other off-prefix names no longer slip through. - config.fast_advanced_model: trim verbose K2-Thinking comparison rationale from the field description — the PR description is the right place for that. 
--- .../backend/copilot/baseline/reasoning.py | 2 +- .../backend/backend/copilot/config.py | 18 +++--------------- 2 files changed, 4 insertions(+), 16 deletions(-) diff --git a/autogpt_platform/backend/backend/copilot/baseline/reasoning.py b/autogpt_platform/backend/backend/copilot/baseline/reasoning.py index f5f8b576bb..0c689ed4a7 100644 --- a/autogpt_platform/backend/backend/copilot/baseline/reasoning.py +++ b/autogpt_platform/backend/backend/copilot/baseline/reasoning.py @@ -183,7 +183,7 @@ def _is_reasoning_route(model: str) -> bool: existing deployments. """ lowered = model.lower() - if lowered.startswith("anthropic"): + if lowered.startswith(("anthropic/", "anthropic.")): return True if lowered.startswith("moonshotai/"): return True diff --git a/autogpt_platform/backend/backend/copilot/config.py b/autogpt_platform/backend/backend/copilot/config.py index 0c3c55d8dc..1183217f37 100644 --- a/autogpt_platform/backend/backend/copilot/config.py +++ b/autogpt_platform/backend/backend/copilot/config.py @@ -60,21 +60,9 @@ class ChatConfig(BaseSettings): ) fast_advanced_model: str = Field( default="anthropic/claude-opus-4.7", - validation_alias=AliasChoices( - "CHAT_FAST_ADVANCED_MODEL", - ), - description="Baseline path, 'advanced' tier. Opus by default so " - "the advanced tier is a clean A/B vs the SDK advanced tier: same " - "model, different path — isolates the reasoning-wire + cache " - "differences from model capability differences. Kimi K2-Thinking " - "(the reasoning-native sibling) benchmarks ~9pp behind K2.6 on " - "SWE-Bench Verified and ~23pp behind on BrowseComp, is text-only, " - "and was published 6 months before K2.6 — not a fit for the " - "advanced tier today. Override via ``CHAT_FAST_ADVANCED_MODEL``. 
" - "Unlike the other three cells there is no legacy env var to " - "alias — this cell was created in the 2×2 split — but the " - "``validation_alias`` stays explicit so the override isn't " - "coupled to ``env_prefix`` continuing to exist.", + validation_alias=AliasChoices("CHAT_FAST_ADVANCED_MODEL"), + description="Baseline path, 'advanced' tier. Opus by default. " + "Override via ``CHAT_FAST_ADVANCED_MODEL``.", ) thinking_standard_model: str = Field( default="anthropic/claude-sonnet-4-6", From 7ee0b0aeab984923a2388b4922dbce2cac833eff Mon Sep 17 00:00:00 2001 From: majdyz Date: Tue, 21 Apr 2026 23:53:23 +0700 Subject: [PATCH 18/25] dx(frontend): regenerate openapi.json with web_search ResponseType --- autogpt_platform/frontend/src/app/api/openapi.json | 1 + 1 file changed, 1 insertion(+) diff --git a/autogpt_platform/frontend/src/app/api/openapi.json b/autogpt_platform/frontend/src/app/api/openapi.json index e83ad80dbe..3f3928d95e 100644 --- a/autogpt_platform/frontend/src/app/api/openapi.json +++ b/autogpt_platform/frontend/src/app/api/openapi.json @@ -14142,6 +14142,7 @@ "browser_screenshot", "bash_exec", "web_fetch", + "web_search", "feature_request_search", "feature_request_created", "memory_store", From e7457983a14d0d290b2fb8b6858881749035f43b Mon Sep 17 00:00:00 2001 From: majdyz Date: Wed, 22 Apr 2026 00:11:46 +0700 Subject: [PATCH 19/25] feat(frontend/copilot): align web_search UI with native WebSearch rendering - Map ``web_search`` to the ``web`` tool category so the MCP copilot tool shares the globe icon + accordion layout with the SDK's native ``WebSearch``. - Render the structured ``results`` array (title / url / snippet / page_age) as clickable citation list instead of dumping JSON. Falls back to the existing ``content`` / MCP text / raw JSON path for the pre-existing ``web_fetch`` + native ``WebSearch`` shapes. 
--- .../copilot/tools/GenericTool/GenericTool.tsx | 61 ++++++++++++++++--- .../copilot/tools/GenericTool/helpers.ts | 1 + 2 files changed, 52 insertions(+), 10 deletions(-) diff --git a/autogpt_platform/frontend/src/app/(platform)/copilot/tools/GenericTool/GenericTool.tsx b/autogpt_platform/frontend/src/app/(platform)/copilot/tools/GenericTool/GenericTool.tsx index 995c18df05..5ea634a69a 100644 --- a/autogpt_platform/frontend/src/app/(platform)/copilot/tools/GenericTool/GenericTool.tsx +++ b/autogpt_platform/frontend/src/app/(platform)/copilot/tools/GenericTool/GenericTool.tsx @@ -305,15 +305,60 @@ function getWebAccordionData( string, unknown >; - const url = - getStringField(inp as Record, "url", "query") ?? - "Web content"; + const query = getStringField(inp, "query"); + const url = getStringField(inp, "url") ?? query ?? "Web content"; + + const results = Array.isArray(output.results) + ? (output.results as Array>) + : null; + + if (results) { + return { + title: `${results.length} search result${results.length === 1 ? "" : "s"}`, + description: query ? truncate(query, 80) : undefined, + content: ( +
+ {results.map((r, i) => { + const title = getStringField(r, "title") ?? "(untitled)"; + const href = getStringField(r, "url") ?? ""; + const snippet = getStringField(r, "snippet"); + const pageAge = getStringField(r, "page_age"); + return ( +
+ {href ? ( + + {title} + + ) : ( + {title} + )} + {href && ( +
+ {truncate(href, 100)} +
+ )} + {snippet && ( +

{snippet}

+ )} + {pageAge && ( +
{pageAge}
+ )} +
+ ); + })} +
+ ), + }; + } - // Try direct string fields first, then MCP content blocks, then raw JSON let content = getStringField(output, "content", "text", "_raw"); if (!content) content = extractMcpText(output); if (!content) { - // Fallback: render the raw JSON so the accordion isn't empty try { const raw = JSON.stringify(output, null, 2); if (raw !== "{}") content = raw; @@ -327,11 +372,7 @@ function getWebAccordionData( const message = getStringField(output, "message"); return { - title: statusCode - ? `Response (${statusCode})` - : url - ? "Web fetch" - : "Search results", + title: statusCode ? `Response (${statusCode})` : "Web fetch", description: truncate(url, 80), content: content ? ( {content} diff --git a/autogpt_platform/frontend/src/app/(platform)/copilot/tools/GenericTool/helpers.ts b/autogpt_platform/frontend/src/app/(platform)/copilot/tools/GenericTool/helpers.ts index f8da6fbc2f..345a2a7ca5 100644 --- a/autogpt_platform/frontend/src/app/(platform)/copilot/tools/GenericTool/helpers.ts +++ b/autogpt_platform/frontend/src/app/(platform)/copilot/tools/GenericTool/helpers.ts @@ -60,6 +60,7 @@ export function getToolCategory(toolName: string): ToolCategory { case "bash_exec": return "bash"; case "web_fetch": + case "web_search": case "WebSearch": case "WebFetch": return "web"; From 642b9c29c67bdb824f97064c2c07a7a8037ae6b4 Mon Sep 17 00:00:00 2001 From: majdyz Date: Wed, 22 Apr 2026 00:15:51 +0700 Subject: [PATCH 20/25] fix(frontend/copilot): label web_search with query summary, not web_fetch wording Add ``web_search`` alongside ``WebSearch`` in ``getInputSummary`` so the query is read from ``input.query``, and in ``getAnimationText`` so the status line reads ``Searched "foo"`` instead of ``Fetched web content``. Also run prettier on the prior ``getWebAccordionData`` change. 
--- .../(platform)/copilot/tools/GenericTool/GenericTool.tsx | 4 +--- .../app/(platform)/copilot/tools/GenericTool/helpers.ts | 9 ++++++--- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/autogpt_platform/frontend/src/app/(platform)/copilot/tools/GenericTool/GenericTool.tsx b/autogpt_platform/frontend/src/app/(platform)/copilot/tools/GenericTool/GenericTool.tsx index 5ea634a69a..74aa3153d5 100644 --- a/autogpt_platform/frontend/src/app/(platform)/copilot/tools/GenericTool/GenericTool.tsx +++ b/autogpt_platform/frontend/src/app/(platform)/copilot/tools/GenericTool/GenericTool.tsx @@ -342,9 +342,7 @@ function getWebAccordionData( {truncate(href, 100)} )} - {snippet && ( -

{snippet}

- )} + {snippet &&

{snippet}

} {pageAge && (
{pageAge}
)} diff --git a/autogpt_platform/frontend/src/app/(platform)/copilot/tools/GenericTool/helpers.ts b/autogpt_platform/frontend/src/app/(platform)/copilot/tools/GenericTool/helpers.ts index 345a2a7ca5..e1103e1435 100644 --- a/autogpt_platform/frontend/src/app/(platform)/copilot/tools/GenericTool/helpers.ts +++ b/autogpt_platform/frontend/src/app/(platform)/copilot/tools/GenericTool/helpers.ts @@ -115,6 +115,7 @@ function getInputSummary(toolName: string, input: unknown): string | null { case "web_fetch": case "WebFetch": return typeof inp.url === "string" ? inp.url : null; + case "web_search": case "WebSearch": return typeof inp.query === "string" ? inp.query : null; case "browser_navigate": @@ -221,7 +222,7 @@ export function getAnimationText( ? `Running: ${shortSummary}` : "Running command\u2026"; case "web": - if (toolName === "WebSearch") { + if (toolName === "WebSearch" || toolName === "web_search") { return shortSummary ? `Searching "${shortSummary}"` : "Searching the web\u2026"; @@ -283,7 +284,7 @@ export function getAnimationText( // exit status here would just double up. return shortSummary ? `Ran: ${shortSummary}` : "Command completed"; case "web": - if (toolName === "WebSearch") { + if (toolName === "WebSearch" || toolName === "web_search") { return shortSummary ? `Searched "${shortSummary}"` : "Web search completed"; @@ -353,7 +354,9 @@ export function getAnimationText( case "bash": return "Command failed"; case "web": - return toolName === "WebSearch" ? "Search failed" : "Fetch failed"; + return toolName === "WebSearch" || toolName === "web_search" + ? 
"Search failed" + : "Fetch failed"; case "browser": return "Browser action failed"; default: From 1dfc75520dd780e2f713bb7ff72bafed2d8190e3 Mon Sep 17 00:00:00 2001 From: majdyz Date: Wed, 22 Apr 2026 00:28:13 +0700 Subject: [PATCH 21/25] fix(backend/copilot): drop encrypted_content from web_search snippet Anthropic's web_search_result ships an opaque encrypted_content blob meant for citation round-tripping, not display. Using it as the snippet surfaced base64 gibberish to the frontend and to the LLM. There is no plain-text snippet field in the current beta; drop it and rely on the model's text blocks with citations for prose. --- .../backend/backend/copilot/tools/web_search.py | 13 ++++++++++--- .../backend/copilot/tools/web_search_test.py | 10 +++++++--- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/autogpt_platform/backend/backend/copilot/tools/web_search.py b/autogpt_platform/backend/backend/copilot/tools/web_search.py index 4b7ac3a53f..22d9a82904 100644 --- a/autogpt_platform/backend/backend/copilot/tools/web_search.py +++ b/autogpt_platform/backend/backend/copilot/tools/web_search.py @@ -183,13 +183,20 @@ def _extract_results(resp: Any, *, limit: int) -> tuple[list[WebSearchResult], i continue if len(results) >= limit: break + # Anthropic's ``web_search_result`` exposes only + # ``title``/``url``/``page_age`` plus an opaque + # ``encrypted_content`` blob that is meant for citation + # round-tripping, not for display — it is base64-ish + # binary and would show as gibberish if surfaced to the + # model or the frontend. There is no plain-text snippet + # field in the current beta; callers get the readable + # text via the model's ``text`` blocks with citations, + # not via this list. Leave ``snippet`` empty. 
results.append( WebSearchResult( title=getattr(item, "title", "") or "", url=getattr(item, "url", "") or "", - snippet=getattr(item, "encrypted_content", None) - or getattr(item, "page_content", "") - or "", + snippet="", page_age=getattr(item, "page_age", None), ) ) diff --git a/autogpt_platform/backend/backend/copilot/tools/web_search_test.py b/autogpt_platform/backend/backend/copilot/tools/web_search_test.py index fe7885e171..3d516f295a 100644 --- a/autogpt_platform/backend/backend/copilot/tools/web_search_test.py +++ b/autogpt_platform/backend/backend/copilot/tools/web_search_test.py @@ -64,13 +64,17 @@ class TestExtractResults: """The extractor is the only Anthropic-response-shape contact point; pin its behaviour so an API shape change surfaces here first.""" - def test_extracts_title_url_snippet_and_page_age(self): + def test_extracts_title_url_page_age_and_drops_encrypted_snippet(self): + # Anthropic's ``web_search_result`` ships an opaque + # ``encrypted_content`` blob that is not safe to surface — + # the extractor must drop it (snippet=="") regardless of + # whether the blob is non-empty. 
resp = _fake_anthropic_response( results=[ { "title": "Kimi K2.6 launch", "url": "https://example.com/kimi", - "snippet": "Moonshot released K2.6 on 2026-04-20.", + "snippet": "EiJjbGF1ZGUtZW5jcnlwdGVkLWJsb2I=", "page_age": "1 day", }, { @@ -85,7 +89,7 @@ class TestExtractResults: assert len(out) == 2 assert out[0].title == "Kimi K2.6 launch" assert out[0].url == "https://example.com/kimi" - assert out[0].snippet.startswith("Moonshot released") + assert out[0].snippet == "" assert out[0].page_age == "1 day" assert out[1].snippet == "" From 2ba0082e78b19dc90d34bfb1c063205f3962c1fb Mon Sep 17 00:00:00 2001 From: majdyz Date: Wed, 22 Apr 2026 05:59:20 +0700 Subject: [PATCH 22/25] fix(backend/copilot): register web_search in ToolName Literal ``TOOL_REGISTRY`` now has ``web_search`` but the ``ToolName`` Literal in ``permissions.py`` was missed, so ``TestSdkBuiltinToolNames`` and ``TestMergeInheritedPermissions`` flagged the drift on CI. Add it to the Literal so both assertions pass. --- autogpt_platform/backend/backend/copilot/permissions.py | 1 + 1 file changed, 1 insertion(+) diff --git a/autogpt_platform/backend/backend/copilot/permissions.py b/autogpt_platform/backend/backend/copilot/permissions.py index a87cad1e9b..58cce98fbf 100644 --- a/autogpt_platform/backend/backend/copilot/permissions.py +++ b/autogpt_platform/backend/backend/copilot/permissions.py @@ -107,6 +107,7 @@ ToolName = Literal[ "validate_agent_graph", "view_agent_output", "web_fetch", + "web_search", "write_workspace_file", # SDK built-ins "Agent", From c5eff58bf81b9e70dc05d2fa5317a5426e102cce Mon Sep 17 00:00:00 2001 From: majdyz Date: Wed, 22 Apr 2026 06:12:09 +0700 Subject: [PATCH 23/25] fix(backend/copilot): keep tool schema under char budget for web_search MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Trim web_search description + params (733 → 476 chars saved 257) and bump the schema char budget from 32_500 to 32_800 to absorb the remaining skeleton 
cost of a newly added LLM-facing primitive. Unblocks test_total_schema_char_budget in the py3.11/3.12/3.13 matrix. --- .../backend/copilot/tools/tool_schema_test.py | 6 +++++- .../backend/backend/copilot/tools/web_search.py | 16 ++++++---------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/autogpt_platform/backend/backend/copilot/tools/tool_schema_test.py b/autogpt_platform/backend/backend/copilot/tools/tool_schema_test.py index e0403cdc79..7b370f810c 100644 --- a/autogpt_platform/backend/backend/copilot/tools/tool_schema_test.py +++ b/autogpt_platform/backend/backend/copilot/tools/tool_schema_test.py @@ -21,7 +21,11 @@ from backend.copilot.tools import TOOL_REGISTRY # response shape carries) and the dry_run description. Keeps the # regression gate effective while accepting a deliberate ~120-token # spend on LLM-decision-critical copy. -_CHAR_BUDGET = 32_500 +# Bumped 32500 -> 32800 on PR #12871 for the new web_search tool +# (server-side Anthropic beta). Description already trimmed to the +# minimum viable copy; the bump absorbs the schema skeleton cost +# (~300 chars / ~75 tokens) for a new LLM-facing primitive. +_CHAR_BUDGET = 32_800 @pytest.fixture(scope="module") diff --git a/autogpt_platform/backend/backend/copilot/tools/web_search.py b/autogpt_platform/backend/backend/copilot/tools/web_search.py index 22d9a82904..feb999d4d6 100644 --- a/autogpt_platform/backend/backend/copilot/tools/web_search.py +++ b/autogpt_platform/backend/backend/copilot/tools/web_search.py @@ -36,13 +36,9 @@ class WebSearchTool(BaseTool): @property def description(self) -> str: return ( - "Search the web and return cited results. Use this for live " - "information — news, current events, up-to-date docs, recent " - "releases — when the model's training data would be stale. " - "Returns a list of {title, url, snippet} plus the URLs so " - "``web_fetch`` can deep-dive any result. 
Costs a few cents " - "per search; prefer one well-targeted query over many " - "reformulations." + "Search the web for live info (news, recent docs). Returns " + "{title, url, snippet}; use web_fetch to deep-dive a URL. " + "Prefer one targeted query over many reformulations." ) @property @@ -52,13 +48,13 @@ class WebSearchTool(BaseTool): "properties": { "query": { "type": "string", - "description": "The search query — a question or topic.", + "description": "Search query.", }, "max_results": { "type": "integer", "description": ( - f"Maximum results to return (default " - f"{_DEFAULT_MAX_RESULTS}, hard cap {_HARD_MAX_RESULTS})." + f"Max results (default {_DEFAULT_MAX_RESULTS}, " + f"cap {_HARD_MAX_RESULTS})." ), "default": _DEFAULT_MAX_RESULTS, }, From 86898ff0d8ddd07f1ded9e299a50028a852632f7 Mon Sep 17 00:00:00 2001 From: majdyz Date: Wed, 22 Apr 2026 07:12:32 +0700 Subject: [PATCH 24/25] test(frontend/copilot): cover web_search branches in GenericTool + helpers Expand the existing GenericTool + helpers tests so the new web_search tool is exercised alongside the legacy WebSearch/web_fetch paths: getToolCategory, getAnimationText (input-streaming / output-available / output-error, with and without a query summary), extractToolName, and the new results-array rendering in getWebAccordionData (title count, clickable citations with target/_blank, snippet + page_age, no-URL fallback). 
--- .../__tests__/GenericTool.test.tsx | 120 +++++++++++++++++- .../GenericTool/__tests__/helpers.test.ts | 52 +++++++- 2 files changed, 170 insertions(+), 2 deletions(-) diff --git a/autogpt_platform/frontend/src/app/(platform)/copilot/tools/GenericTool/__tests__/GenericTool.test.tsx b/autogpt_platform/frontend/src/app/(platform)/copilot/tools/GenericTool/__tests__/GenericTool.test.tsx index 4308eb49bf..2fb1f25446 100644 --- a/autogpt_platform/frontend/src/app/(platform)/copilot/tools/GenericTool/__tests__/GenericTool.test.tsx +++ b/autogpt_platform/frontend/src/app/(platform)/copilot/tools/GenericTool/__tests__/GenericTool.test.tsx @@ -1,6 +1,6 @@ import { describe, expect, it } from "vitest"; import type { ToolUIPart } from "ai"; -import { render, screen } from "@/tests/integrations/test-utils"; +import { fireEvent, render, screen } from "@/tests/integrations/test-utils"; import { GenericTool } from "../GenericTool"; function makePart(overrides: Record = {}): ToolUIPart { @@ -136,4 +136,122 @@ describe("GenericTool", () => { const trigger2 = screen.getByRole("button", { expanded: false }); expect(trigger2.textContent).toContain("completed"); }); + + describe("web_search results rendering", () => { + function makeWebSearchPart( + results: Array>, + query = "kimi k2.6", + ): ToolUIPart { + return { + type: "tool-web_search", + toolCallId: "call-web-1", + state: "output-available", + input: { query }, + output: { + type: "web_search_response", + results, + query, + search_requests: 1, + }, + } as unknown as ToolUIPart; + } + + it("renders an 'N search results' title and shows the query in the description", () => { + render( + , + ); + const trigger = screen.getByRole("button", { expanded: false }); + expect(trigger.textContent).toContain("2 search results"); + expect(trigger.textContent).toContain("kimi k2.6"); + + fireEvent.click(trigger); + + const firstLink = screen.getByRole("link", { + name: "Kimi K2.6 release notes", + }) as HTMLAnchorElement; + 
expect(firstLink.getAttribute("href")).toBe("https://example.com/kimi"); + expect(firstLink.getAttribute("target")).toBe("_blank"); + expect(firstLink.getAttribute("rel")).toBe("noopener noreferrer"); + expect(screen.queryByText("A fast model")).not.toBeNull(); + expect(screen.queryByText("2 days ago")).not.toBeNull(); + + const secondLink = screen.getByRole("link", { + name: "Second result", + }) as HTMLAnchorElement; + expect(secondLink.getAttribute("href")).toBe("https://example.com/two"); + }); + + it("uses singular 'search result' when there is exactly one result", () => { + render( + , + ); + const trigger = screen.getByRole("button", { expanded: false }); + expect(trigger.textContent).toContain("1 search result"); + expect(trigger.textContent).not.toContain("1 search results"); + }); + + it("handles an empty results array (0 search results)", () => { + render(); + const trigger = screen.getByRole("button", { expanded: false }); + expect(trigger.textContent).toContain("0 search results"); + }); + + it("renders an untitled non-link when a result has no url", () => { + render( + , + ); + fireEvent.click(screen.getByRole("button", { expanded: false })); + expect(screen.queryByRole("link")).toBeNull(); + expect(screen.queryByText("No URL entry")).not.toBeNull(); + expect(screen.queryByText("Just text")).not.toBeNull(); + }); + + it("shows subtitle 'Searched \"…\"' once web_search output is available", () => { + const { container } = render( + , + ); + // MorphingTextAnimation splits each character into its own span and + // substitutes spaces with  , so assert on a normalized textContent + // rather than the raw substring. + const normalized = (container.textContent ?? 
"").replace(/ /g, " "); + expect(normalized).toContain('Searched "kimi k2.6"'); + }); + }); }); diff --git a/autogpt_platform/frontend/src/app/(platform)/copilot/tools/GenericTool/__tests__/helpers.test.ts b/autogpt_platform/frontend/src/app/(platform)/copilot/tools/GenericTool/__tests__/helpers.test.ts index de0b9155b6..ca8cc6ba0b 100644 --- a/autogpt_platform/frontend/src/app/(platform)/copilot/tools/GenericTool/__tests__/helpers.test.ts +++ b/autogpt_platform/frontend/src/app/(platform)/copilot/tools/GenericTool/__tests__/helpers.test.ts @@ -22,6 +22,11 @@ describe("extractToolName", () => { const part = { type: "Read" } as unknown as ToolUIPart; expect(extractToolName(part)).toBe("Read"); }); + + it("strips the tool- prefix for web_search", () => { + const part = { type: "tool-web_search" } as unknown as ToolUIPart; + expect(extractToolName(part)).toBe("web_search"); + }); }); describe("formatToolName", () => { @@ -60,8 +65,9 @@ describe("getToolCategory", () => { expect(getToolCategory("bash_exec")).toBe("bash"); }); - it("returns 'web' for web_fetch, WebSearch, WebFetch", () => { + it("returns 'web' for web_fetch, web_search, WebSearch, WebFetch", () => { expect(getToolCategory("web_fetch")).toBe("web"); + expect(getToolCategory("web_search")).toBe("web"); expect(getToolCategory("WebSearch")).toBe("web"); expect(getToolCategory("WebFetch")).toBe("web"); }); @@ -229,6 +235,50 @@ describe("getAnimationText", () => { expect(getAnimationText(part, "web")).toBe('Searching "test query"'); }); + it("shows searching text for web_search with a query summary", () => { + const part = makePart({ + type: "tool-web_search", + state: "input-streaming", + input: { query: "kimi k2.6" }, + }); + expect(getAnimationText(part, "web")).toBe('Searching "kimi k2.6"'); + }); + + it("falls back to generic searching text for web_search with no query", () => { + const part = makePart({ + type: "tool-web_search", + state: "input-streaming", + }); + expect(getAnimationText(part, 
"web")).toBe("Searching the web…"); + }); + + it("shows completed text for web_search with a query summary", () => { + const part = makePart({ + type: "tool-web_search", + state: "output-available", + input: { query: "kimi k2.6" }, + output: { results: [] }, + }); + expect(getAnimationText(part, "web")).toBe('Searched "kimi k2.6"'); + }); + + it("falls back to generic completed text for web_search with no query", () => { + const part = makePart({ + type: "tool-web_search", + state: "output-available", + output: { results: [] }, + }); + expect(getAnimationText(part, "web")).toBe("Web search completed"); + }); + + it("shows error text for web_search failure", () => { + const part = makePart({ + type: "tool-web_search", + state: "output-error", + }); + expect(getAnimationText(part, "web")).toBe("Search failed"); + }); + it("shows fetching text for web_fetch", () => { const part = makePart({ type: "tool-web_fetch", From f4fed71e3d39c9c5ece89193aab10c10acb4b14c Mon Sep 17 00:00:00 2001 From: majdyz Date: Wed, 22 Apr 2026 07:22:25 +0700 Subject: [PATCH 25/25] test(frontend/copilot): extend web_search tests to cover untitled + non-results fallback --- .../__tests__/GenericTool.test.tsx | 59 +++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/autogpt_platform/frontend/src/app/(platform)/copilot/tools/GenericTool/__tests__/GenericTool.test.tsx b/autogpt_platform/frontend/src/app/(platform)/copilot/tools/GenericTool/__tests__/GenericTool.test.tsx index 2fb1f25446..48e0409393 100644 --- a/autogpt_platform/frontend/src/app/(platform)/copilot/tools/GenericTool/__tests__/GenericTool.test.tsx +++ b/autogpt_platform/frontend/src/app/(platform)/copilot/tools/GenericTool/__tests__/GenericTool.test.tsx @@ -253,5 +253,64 @@ describe("GenericTool", () => { const normalized = (container.textContent ?? 
"").replace(/ /g, " "); expect(normalized).toContain('Searched "kimi k2.6"'); }); + + it("uses '(untitled)' when a search result has no title", () => { + render( + , + ); + fireEvent.click(screen.getByRole("button", { expanded: false })); + const link = screen.getByRole("link", { + name: "(untitled)", + }) as HTMLAnchorElement; + expect(link.getAttribute("href")).toBe("https://example.com/x"); + }); + }); + + describe("getWebAccordionData non-results fallback", () => { + function makeWebFetchPart(output: Record): ToolUIPart { + return { + type: "tool-web_fetch", + toolCallId: "call-fetch-1", + state: "output-available", + input: { url: "https://example.com/page" }, + output, + } as unknown as ToolUIPart; + } + + it("renders 'Web fetch' title when output has content instead of results", () => { + render( + , + ); + const trigger = screen.getByRole("button", { expanded: false }); + expect(trigger.textContent).toContain("Web fetch"); + fireEvent.click(trigger); + expect(screen.queryByText("fetched body")).not.toBeNull(); + }); + + it("renders 'Response (N)' title when output has a status_code", () => { + render( + , + ); + const trigger = screen.getByRole("button", { expanded: false }); + expect(trigger.textContent).toContain("Response (404)"); + }); + + it("falls back to MCP text blocks when direct content is absent", () => { + render( + , + ); + fireEvent.click(screen.getByRole("button", { expanded: false })); + expect(screen.queryByText("mcp body")).not.toBeNull(); + }); }); });