fix(backend/copilot): announce in-flight tool calls to unstick guide guard

Symptom (session 0d83f15c on Kimi K2.6): the agent called `get_agent_building_guide`, got the guide, retried `create_agent` — and the `require_guide_read` gate fired "Call get_agent_building_guide first" anyway, looping indefinitely.

Root cause: baseline path buffers assistant rows with their `tool_calls` into `state.session_messages` (a scratch list on `_BaselineStreamState`) during the tool-call loop, and only flushes into `session.messages` at turn end.  So when the second tool runs within the *same* turn, `_guide_read_in_session` — which scans `session.messages` — sees no guide call and fires the gate.  SDK path didn't hit this because it mirrors tool calls straight into `ctx.session.messages`; Kimi's aggressive tool-call chaining within one turn was what surfaced the bug on baseline.  Not Kimi-specific (any baseline model that calls guide + create_agent in one turn would hit it).

Fix: add an in-flight announcement buffer on `ChatSession`.

* `ChatSession._inflight_tool_calls: set[str]` (PrivateAttr, never serialised).

* `announce_inflight_tool_call` called by `_baseline_tool_executor` the moment a tool is dispatched, before it runs.

* `has_tool_been_called_this_turn` folds the in-flight set into the historical `messages` scan; `require_guide_read` now calls this instead of the messages-only helper.

* `clear_inflight_tool_calls` fired in the baseline turn's finally block, right before `upsert_chat_session`, so next turn starts with a clean buffer.

Deliberately didn't mirror the row into `session.messages` directly — `_baseline_conversation_updater` appends a fully-formed assistant+tool_calls row at round end, so an inline mirror would duplicate.  The scratch set keeps the announcement separate from durable history.

New tests: in-flight announcement lets gate pass within same turn; clear restores the gate for next turn; PrivateAttr never leaks into `model_dump`.  Existing gate tests migrated from MagicMock(spec=ChatSession) to real ChatSession instances since the guard now calls the new helper.
This commit is contained in:
majdyz
2026-04-21 22:46:56 +07:00
parent da5420fa07
commit 627b52048b
4 changed files with 126 additions and 18 deletions

View File

@@ -760,6 +760,19 @@ async def _baseline_tool_executor(
)
)
# Announce the tool call to the session so in-turn guards like
# ``require_guide_read`` can see it *right now*, before the tool
# actually runs. Without this, the tool_call row lives only in
# ``state.session_messages`` until the ``finally`` block flushes it
# into ``session.messages`` at turn end — so a second tool in the
# same turn (e.g. ``create_agent`` after ``get_agent_building_guide``)
# scans a stale ``session.messages`` and the guard re-fires despite
# the guide having been called. The announce-set is cleared at turn
# end; we deliberately don't touch ``session.messages`` here to avoid
# duplicating the assistant row that ``_baseline_conversation_updater``
# will append at round end.
session.announce_inflight_tool_call(tool_name)
try:
result: StreamToolOutputAvailable = await execute_tool(
tool_name=tool_name,
@@ -1868,6 +1881,10 @@ async def stream_chat_completion_baseline(
final_text = final_text[len(recorded) :]
if final_text.strip():
session.messages.append(ChatMessage(role="assistant", content=final_text))
# In-flight tool-call announcements are only meaningful for the
# current turn; clear before persist so the next turn starts with
# a clean scratch buffer.
session.clear_inflight_tool_calls()
try:
await upsert_chat_session(session)
except Exception as persist_err:

View File

@@ -20,7 +20,7 @@ from openai.types.chat.chat_completion_message_tool_call_param import (
)
from prisma.models import ChatMessage as PrismaChatMessage
from prisma.models import ChatSession as PrismaChatSession
from pydantic import BaseModel
from pydantic import BaseModel, PrivateAttr
from backend.data.db_accessors import chat_db
from backend.data.redis_client import get_redis_async
@@ -198,6 +198,15 @@ class ChatSessionInfo(BaseModel):
class ChatSession(ChatSessionInfo):
messages: list[ChatMessage]
# In-flight tool-call names for the CURRENT turn. Not persisted to
# DB and not serialised on the wire — ``PrivateAttr`` keeps this a
# process-local scratch buffer that's invisible to ``model_dump`` /
# ``model_dump_json`` / the redis cache path. Populated by the
# baseline tool executor the moment a tool is dispatched so in-turn
# guards (e.g. ``require_guide_read``) can see the call before it
# lands in ``messages`` at turn-end. Cleared when the turn
# completes.
_inflight_tool_calls: set[str] = PrivateAttr(default_factory=set)
@classmethod
def new(cls, user_id: str, *, dry_run: bool) -> Self:
@@ -226,6 +235,44 @@ class ChatSession(ChatSessionInfo):
messages=[ChatMessage.from_db(m) for m in prisma_session.Messages],
)
def announce_inflight_tool_call(self, tool_name: str) -> None:
"""Record that *tool_name* is being dispatched in the current turn.
Lets in-turn guards (see
``copilot/tools/helpers.py::_guide_read_in_session``) see a tool
call the moment it's issued, instead of waiting for the
``session.messages`` flush at turn end — fixing a loop where a
second tool in the same turn re-fires a guard despite the
guarding tool having already been called (seen on Kimi K2.6 in
particular because its aggressive tool-call chaining exercises
this path much more than Sonnet does). The buffer is cleared by
:meth:`clear_inflight_tool_calls` at turn end.
"""
self._inflight_tool_calls.add(tool_name)
def clear_inflight_tool_calls(self) -> None:
"""Reset the in-flight tool-call announcement buffer."""
self._inflight_tool_calls.clear()
def has_tool_been_called_this_turn(self, tool_name: str) -> bool:
"""True when *tool_name* has been called in the current turn.
Checks the in-flight announcement buffer first (for calls
dispatched in *this* turn but not yet persisted) and then the
durable ``messages`` history (for past turns + prior rounds
within this turn whose writes already landed).
"""
if tool_name in self._inflight_tool_calls:
return True
for msg in reversed(self.messages):
if msg.role != "assistant" or not msg.tool_calls:
continue
for tc in msg.tool_calls:
name = tc.get("function", {}).get("name") or tc.get("name")
if name == tool_name:
return True
return False
def add_tool_call_to_current_turn(self, tool_call: dict) -> None:
"""Attach a tool_call to the current turn's assistant message.

View File

@@ -7,8 +7,6 @@ tokens and then produce JSON that fails validation — wasting turns on
auto-fix loops.
"""
from unittest.mock import MagicMock
import pytest
from backend.copilot.model import ChatMessage, ChatSession
@@ -18,8 +16,17 @@ from .models import ErrorResponse
def _session_with_messages(messages: list[ChatMessage]) -> ChatSession:
"""Build a minimal ChatSession whose ``messages`` matches *messages*."""
session = MagicMock(spec=ChatSession)
"""Build a real ChatSession with the given messages.
Uses ``ChatSession.new`` + attribute reassignment rather than
``MagicMock(spec=...)`` because the gate now calls
``session.has_tool_been_called_this_turn(...)`` and a ``spec`` mock
returns a truthy ``MagicMock`` from that call, hiding real gate
behaviour. A live ``ChatSession`` also correctly initialises the
``_inflight_tool_calls`` PrivateAttr scratch buffer used by the
in-turn announcement path.
"""
session = ChatSession.new("test-user", dry_run=False)
session.session_id = "test-session"
session.messages = messages
return session
@@ -117,3 +124,44 @@ def test_tool_name_surfaced_in_error(tool_name: str):
result = require_guide_read(session, tool_name)
assert isinstance(result, ErrorResponse)
assert tool_name in result.message
def test_inflight_announcement_lets_gate_pass_within_same_turn():
"""Regression for the Kimi baseline loop: the guide call is
dispatched earlier in the SAME turn and buffered by the
``_baseline_tool_executor`` into the in-flight announcement set,
but hasn't been flushed into ``session.messages`` yet. The gate
must see it anyway — otherwise a follow-up ``create_agent`` in the
same turn re-fires the guard despite the guide call and the model
loops retrying the guide."""
session = _session_with_messages(
[ChatMessage(role="user", content="build something")]
)
# Simulate _baseline_tool_executor's announce.
session.announce_inflight_tool_call("get_agent_building_guide")
assert require_guide_read(session, "create_agent") is None
def test_inflight_clear_restores_gate_for_next_turn():
"""End-of-turn cleanup must drop the in-flight buffer so it can't
leak into the *next* turn's ``session.messages`` scan (e.g. a second
session turn that should legitimately require a fresh guide call if
``messages`` got compressed away)."""
session = _session_with_messages([ChatMessage(role="user", content="build")])
session.announce_inflight_tool_call("get_agent_building_guide")
assert require_guide_read(session, "create_agent") is None
session.clear_inflight_tool_calls()
# With the buffer cleared and no guide row in messages, the guard
# fires again.
assert isinstance(require_guide_read(session, "create_agent"), ErrorResponse)
def test_inflight_announcement_does_not_serialise_into_model_dump():
"""PrivateAttr invariant: the scratch buffer must never leak into
``model_dump()`` / the Redis cache payload / the DB — it's
process-local turn state, not durable session state."""
session = _session_with_messages([])
session.announce_inflight_tool_call("get_agent_building_guide")
dumped = session.model_dump()
assert "_inflight_tool_calls" not in dumped
assert "inflight_tool_calls" not in dumped

View File

@@ -787,26 +787,22 @@ def _resolve_discriminated_credentials(
_AGENT_GUIDE_TOOL_NAME = "get_agent_building_guide"
def _guide_read_in_session(session: ChatSession) -> bool:
"""True if this session's assistant messages include a guide tool call."""
for msg in reversed(session.messages):
if msg.role != "assistant" or not msg.tool_calls:
continue
for tc in msg.tool_calls:
name = tc.get("function", {}).get("name") or tc.get("name")
if name == _AGENT_GUIDE_TOOL_NAME:
return True
return False
def require_guide_read(session: ChatSession, tool_name: str):
"""Return an ErrorResponse if the guide hasn't been loaded this session.
Import inline to keep ``helpers.py`` free of tool-response imports.
Uses :meth:`ChatSession.has_tool_been_called_this_turn` which checks
both the persisted ``messages`` list and the in-flight announcement
buffer — so a guide call dispatched earlier in the *current* turn
(before ``session.messages`` flushes at turn end) is recognised too.
Otherwise a second tool in the same turn would re-fire this guard
despite the guide having been called — seen on Kimi K2.6 in
particular because its aggressive tool-call chaining exercises this
path far more than Sonnet does.
"""
from .models import ErrorResponse # noqa: PLC0415 — avoid circular import
if _guide_read_in_session(session):
if session.has_tool_been_called_this_turn(_AGENT_GUIDE_TOOL_NAME):
return None
return ErrorResponse(
message=(