fix(backend/copilot): announce in-flight tool calls to unstick guide guard

Symptom (session 0d83f15c on Kimi K2.6): the agent called `get_agent_building_guide`, got the guide, retried `create_agent` — and the `require_guide_read` gate fired "Call get_agent_building_guide first" anyway, looping indefinitely. Root cause: baseline path buffers assistant rows with their `tool_calls` into `state.session_messages` (a scratch list on `_BaselineStreamState`) during the tool-call loop, and only flushes into `session.messages` at turn end. So when the second tool runs within the *same* turn, `_guide_read_in_session` — which scans `session.messages` — sees no guide call and fires the gate. SDK path didn't hit this because it mirrors tool calls straight into `ctx.session.messages`; Kimi's aggressive tool-call chaining within one turn was what surfaced the bug on baseline. Not Kimi-specific (any baseline model that calls guide + create_agent in one turn would hit it). Fix: add an in-flight announcement buffer on `ChatSession`. * `ChatSession._inflight_tool_calls: set[str]` (PrivateAttr, never serialised). * `announce_inflight_tool_call` called by `_baseline_tool_executor` the moment a tool is dispatched, before it runs. * `has_tool_been_called_this_turn` folds the in-flight set into the historical `messages` scan; `require_guide_read` now calls this instead of the messages-only helper. * `clear_inflight_tool_calls` fired in the baseline turn's finally block, right before `upsert_chat_session`, so next turn starts with a clean buffer. Deliberately didn't mirror the row into `session.messages` directly — `_baseline_conversation_updater` appends a fully-formed assistant+tool_calls row at round end, so an inline mirror would duplicate. The scratch set keeps the announcement separate from durable history. New tests: in-flight announcement lets gate pass within same turn; clear restores the gate for next turn; PrivateAttr never leaks into `model_dump`. Existing gate tests migrated from MagicMock(spec=ChatSession) to real ChatSession instances since the guard now calls the new helper.
2026-04-30 03:00:41 -04:00 · 2026-04-21 22:46:56 +07:00
parent da5420fa07
commit 627b52048b
4 changed files with 126 additions and 18 deletions
--- a/autogpt_platform/backend/backend/copilot/baseline/service.py
+++ b/autogpt_platform/backend/backend/copilot/baseline/service.py
@@ -760,6 +760,19 @@ async def _baseline_tool_executor(
        )
    )

+    # Announce the tool call to the session so in-turn guards like
+    # ``require_guide_read`` can see it *right now*, before the tool
+    # actually runs.  Without this, the tool_call row lives only in
+    # ``state.session_messages`` until the ``finally`` block flushes it
+    # into ``session.messages`` at turn end — so a second tool in the
+    # same turn (e.g. ``create_agent`` after ``get_agent_building_guide``)
+    # scans a stale ``session.messages`` and the guard re-fires despite
+    # the guide having been called.  The announce-set is cleared at turn
+    # end; we deliberately don't touch ``session.messages`` here to avoid
+    # duplicating the assistant row that ``_baseline_conversation_updater``
+    # will append at round end.
+    session.announce_inflight_tool_call(tool_name)
+
    try:
        result: StreamToolOutputAvailable = await execute_tool(
            tool_name=tool_name,
@@ -1868,6 +1881,10 @@ async def stream_chat_completion_baseline(
                final_text = final_text[len(recorded) :]
        if final_text.strip():
            session.messages.append(ChatMessage(role="assistant", content=final_text))
+        # In-flight tool-call announcements are only meaningful for the
+        # current turn; clear before persist so the next turn starts with
+        # a clean scratch buffer.
+        session.clear_inflight_tool_calls()
        try:
            await upsert_chat_session(session)
        except Exception as persist_err:
--- a/autogpt_platform/backend/backend/copilot/model.py
+++ b/autogpt_platform/backend/backend/copilot/model.py
@@ -20,7 +20,7 @@ from openai.types.chat.chat_completion_message_tool_call_param import (
 )
 from prisma.models import ChatMessage as PrismaChatMessage
 from prisma.models import ChatSession as PrismaChatSession
-from pydantic import BaseModel
+from pydantic import BaseModel, PrivateAttr

 from backend.data.db_accessors import chat_db
 from backend.data.redis_client import get_redis_async
@@ -198,6 +198,15 @@ class ChatSessionInfo(BaseModel):

 class ChatSession(ChatSessionInfo):
    messages: list[ChatMessage]
+    # In-flight tool-call names for the CURRENT turn.  Not persisted to
+    # DB and not serialised on the wire — ``PrivateAttr`` keeps this a
+    # process-local scratch buffer that's invisible to ``model_dump`` /
+    # ``model_dump_json`` / the redis cache path.  Populated by the
+    # baseline tool executor the moment a tool is dispatched so in-turn
+    # guards (e.g. ``require_guide_read``) can see the call before it
+    # lands in ``messages`` at turn-end.  Cleared when the turn
+    # completes.
+    _inflight_tool_calls: set[str] = PrivateAttr(default_factory=set)

    @classmethod
    def new(cls, user_id: str, *, dry_run: bool) -> Self:
@@ -226,6 +235,44 @@ class ChatSession(ChatSessionInfo):
            messages=[ChatMessage.from_db(m) for m in prisma_session.Messages],
        )

+    def announce_inflight_tool_call(self, tool_name: str) -> None:
+        """Record that *tool_name* is being dispatched in the current turn.
+
+        Lets in-turn guards (see
+        ``copilot/tools/helpers.py::_guide_read_in_session``) see a tool
+        call the moment it's issued, instead of waiting for the
+        ``session.messages`` flush at turn end — fixing a loop where a
+        second tool in the same turn re-fires a guard despite the
+        guarding tool having already been called (seen on Kimi K2.6 in
+        particular because its aggressive tool-call chaining exercises
+        this path much more than Sonnet does).  The buffer is cleared by
+        :meth:`clear_inflight_tool_calls` at turn end.
+        """
+        self._inflight_tool_calls.add(tool_name)
+
+    def clear_inflight_tool_calls(self) -> None:
+        """Reset the in-flight tool-call announcement buffer."""
+        self._inflight_tool_calls.clear()
+
+    def has_tool_been_called_this_turn(self, tool_name: str) -> bool:
+        """True when *tool_name* has been called in the current turn.
+
+        Checks the in-flight announcement buffer first (for calls
+        dispatched in *this* turn but not yet persisted) and then the
+        durable ``messages`` history (for past turns + prior rounds
+        within this turn whose writes already landed).
+        """
+        if tool_name in self._inflight_tool_calls:
+            return True
+        for msg in reversed(self.messages):
+            if msg.role != "assistant" or not msg.tool_calls:
+                continue
+            for tc in msg.tool_calls:
+                name = tc.get("function", {}).get("name") or tc.get("name")
+                if name == tool_name:
+                    return True
+        return False
+
    def add_tool_call_to_current_turn(self, tool_call: dict) -> None:
        """Attach a tool_call to the current turn's assistant message.

--- a/autogpt_platform/backend/backend/copilot/tools/agent_guide_gate_test.py
+++ b/autogpt_platform/backend/backend/copilot/tools/agent_guide_gate_test.py
@@ -7,8 +7,6 @@ tokens and then produce JSON that fails validation — wasting turns on
 auto-fix loops.
 """

-from unittest.mock import MagicMock
-
 import pytest

 from backend.copilot.model import ChatMessage, ChatSession
@@ -18,8 +16,17 @@ from .models import ErrorResponse


 def _session_with_messages(messages: list[ChatMessage]) -> ChatSession:
-    """Build a minimal ChatSession whose ``messages`` matches *messages*."""
-    session = MagicMock(spec=ChatSession)
+    """Build a real ChatSession with the given messages.
+
+    Uses ``ChatSession.new`` + attribute reassignment rather than
+    ``MagicMock(spec=...)`` because the gate now calls
+    ``session.has_tool_been_called_this_turn(...)`` and a ``spec`` mock
+    returns a truthy ``MagicMock`` from that call, hiding real gate
+    behaviour.  A live ``ChatSession`` also correctly initialises the
+    ``_inflight_tool_calls`` PrivateAttr scratch buffer used by the
+    in-turn announcement path.
+    """
+    session = ChatSession.new("test-user", dry_run=False)
    session.session_id = "test-session"
    session.messages = messages
    return session
@@ -117,3 +124,44 @@ def test_tool_name_surfaced_in_error(tool_name: str):
    result = require_guide_read(session, tool_name)
    assert isinstance(result, ErrorResponse)
    assert tool_name in result.message
+
+
+def test_inflight_announcement_lets_gate_pass_within_same_turn():
+    """Regression for the Kimi baseline loop: the guide call is
+    dispatched earlier in the SAME turn and buffered by the
+    ``_baseline_tool_executor`` into the in-flight announcement set,
+    but hasn't been flushed into ``session.messages`` yet.  The gate
+    must see it anyway — otherwise a follow-up ``create_agent`` in the
+    same turn re-fires the guard despite the guide call and the model
+    loops retrying the guide."""
+    session = _session_with_messages(
+        [ChatMessage(role="user", content="build something")]
+    )
+    # Simulate _baseline_tool_executor's announce.
+    session.announce_inflight_tool_call("get_agent_building_guide")
+    assert require_guide_read(session, "create_agent") is None
+
+
+def test_inflight_clear_restores_gate_for_next_turn():
+    """End-of-turn cleanup must drop the in-flight buffer so it can't
+    leak into the *next* turn's ``session.messages`` scan (e.g. a second
+    session turn that should legitimately require a fresh guide call if
+    ``messages`` got compressed away)."""
+    session = _session_with_messages([ChatMessage(role="user", content="build")])
+    session.announce_inflight_tool_call("get_agent_building_guide")
+    assert require_guide_read(session, "create_agent") is None
+    session.clear_inflight_tool_calls()
+    # With the buffer cleared and no guide row in messages, the guard
+    # fires again.
+    assert isinstance(require_guide_read(session, "create_agent"), ErrorResponse)
+
+
+def test_inflight_announcement_does_not_serialise_into_model_dump():
+    """PrivateAttr invariant: the scratch buffer must never leak into
+    ``model_dump()`` / the Redis cache payload / the DB — it's
+    process-local turn state, not durable session state."""
+    session = _session_with_messages([])
+    session.announce_inflight_tool_call("get_agent_building_guide")
+    dumped = session.model_dump()
+    assert "_inflight_tool_calls" not in dumped
+    assert "inflight_tool_calls" not in dumped
--- a/autogpt_platform/backend/backend/copilot/tools/helpers.py
+++ b/autogpt_platform/backend/backend/copilot/tools/helpers.py
@@ -787,26 +787,22 @@ def _resolve_discriminated_credentials(
 _AGENT_GUIDE_TOOL_NAME = "get_agent_building_guide"


-def _guide_read_in_session(session: ChatSession) -> bool:
-    """True if this session's assistant messages include a guide tool call."""
-    for msg in reversed(session.messages):
-        if msg.role != "assistant" or not msg.tool_calls:
-            continue
-        for tc in msg.tool_calls:
-            name = tc.get("function", {}).get("name") or tc.get("name")
-            if name == _AGENT_GUIDE_TOOL_NAME:
-                return True
-    return False
-
-
 def require_guide_read(session: ChatSession, tool_name: str):
    """Return an ErrorResponse if the guide hasn't been loaded this session.

    Import inline to keep ``helpers.py`` free of tool-response imports.
+    Uses :meth:`ChatSession.has_tool_been_called_this_turn` which checks
+    both the persisted ``messages`` list and the in-flight announcement
+    buffer — so a guide call dispatched earlier in the *current* turn
+    (before ``session.messages`` flushes at turn end) is recognised too.
+    Otherwise a second tool in the same turn would re-fire this guard
+    despite the guide having been called — seen on Kimi K2.6 in
+    particular because its aggressive tool-call chaining exercises this
+    path far more than Sonnet does.
    """
    from .models import ErrorResponse  # noqa: PLC0415 — avoid circular import

-    if _guide_read_in_session(session):
+    if session.has_tool_been_called_this_turn(_AGENT_GUIDE_TOOL_NAME):
        return None
    return ErrorResponse(
        message=(