diff --git a/autogpt_platform/backend/backend/copilot/baseline/service.py b/autogpt_platform/backend/backend/copilot/baseline/service.py index 74ac8b9005..a112475d7b 100644 --- a/autogpt_platform/backend/backend/copilot/baseline/service.py +++ b/autogpt_platform/backend/backend/copilot/baseline/service.py @@ -760,6 +760,19 @@ async def _baseline_tool_executor( ) ) + # Announce the tool call to the session so in-turn guards like + # ``require_guide_read`` can see it *right now*, before the tool + # actually runs. Without this, the tool_call row lives only in + # ``state.session_messages`` until the ``finally`` block flushes it + # into ``session.messages`` at turn end — so a second tool in the + # same turn (e.g. ``create_agent`` after ``get_agent_building_guide``) + # scans a stale ``session.messages`` and the guard re-fires despite + # the guide having been called. The announce-set is cleared at turn + # end; we deliberately don't touch ``session.messages`` here to avoid + # duplicating the assistant row that ``_baseline_conversation_updater`` + # will append at round end. + session.announce_inflight_tool_call(tool_name) + try: result: StreamToolOutputAvailable = await execute_tool( tool_name=tool_name, @@ -1868,6 +1881,10 @@ async def stream_chat_completion_baseline( final_text = final_text[len(recorded) :] if final_text.strip(): session.messages.append(ChatMessage(role="assistant", content=final_text)) + # In-flight tool-call announcements are only meaningful for the + # current turn; clear before persist so the next turn starts with + # a clean scratch buffer. + session.clear_inflight_tool_calls() try: await upsert_chat_session(session) except Exception as persist_err: diff --git a/autogpt_platform/backend/backend/copilot/model.py b/autogpt_platform/backend/backend/copilot/model.py index 08019233e7..c3b23c82cc 100644 --- a/autogpt_platform/backend/backend/copilot/model.py +++ b/autogpt_platform/backend/backend/copilot/model.py @@ -20,7 +20,7 @@ from openai.types.chat.chat_completion_message_tool_call_param import ( ) from prisma.models import ChatMessage as PrismaChatMessage from prisma.models import ChatSession as PrismaChatSession -from pydantic import BaseModel +from pydantic import BaseModel, PrivateAttr from backend.data.db_accessors import chat_db from backend.data.redis_client import get_redis_async @@ -198,6 +198,15 @@ class ChatSessionInfo(BaseModel): class ChatSession(ChatSessionInfo): messages: list[ChatMessage] + # In-flight tool-call names for the CURRENT turn. Not persisted to + # DB and not serialised on the wire — ``PrivateAttr`` keeps this a + # process-local scratch buffer that's invisible to ``model_dump`` / + # ``model_dump_json`` / the redis cache path. Populated by the + # baseline tool executor the moment a tool is dispatched so in-turn + # guards (e.g. ``require_guide_read``) can see the call before it + # lands in ``messages`` at turn-end. Cleared when the turn + # completes. + _inflight_tool_calls: set[str] = PrivateAttr(default_factory=set) @classmethod def new(cls, user_id: str, *, dry_run: bool) -> Self: @@ -226,6 +235,44 @@ class ChatSession(ChatSessionInfo): messages=[ChatMessage.from_db(m) for m in prisma_session.Messages], ) + def announce_inflight_tool_call(self, tool_name: str) -> None: + """Record that *tool_name* is being dispatched in the current turn. + + Lets in-turn guards (see + ``copilot/tools/helpers.py::_guide_read_in_session``) see a tool + call the moment it's issued, instead of waiting for the + ``session.messages`` flush at turn end — fixing a loop where a + second tool in the same turn re-fires a guard despite the + guarding tool having already been called (seen on Kimi K2.6 in + particular because its aggressive tool-call chaining exercises + this path much more than Sonnet does). The buffer is cleared by + :meth:`clear_inflight_tool_calls` at turn end. + """ + self._inflight_tool_calls.add(tool_name) + + def clear_inflight_tool_calls(self) -> None: + """Reset the in-flight tool-call announcement buffer.""" + self._inflight_tool_calls.clear() + + def has_tool_been_called_this_turn(self, tool_name: str) -> bool: + """True when *tool_name* has been called in the current turn. + + Checks the in-flight announcement buffer first (for calls + dispatched in *this* turn but not yet persisted) and then the + durable ``messages`` history (for past turns + prior rounds + within this turn whose writes already landed). + """ + if tool_name in self._inflight_tool_calls: + return True + for msg in reversed(self.messages): + if msg.role != "assistant" or not msg.tool_calls: + continue + for tc in msg.tool_calls: + name = tc.get("function", {}).get("name") or tc.get("name") + if name == tool_name: + return True + return False + def add_tool_call_to_current_turn(self, tool_call: dict) -> None: """Attach a tool_call to the current turn's assistant message. diff --git a/autogpt_platform/backend/backend/copilot/tools/agent_guide_gate_test.py b/autogpt_platform/backend/backend/copilot/tools/agent_guide_gate_test.py index 6a122b7324..14cc12f4b0 100644 --- a/autogpt_platform/backend/backend/copilot/tools/agent_guide_gate_test.py +++ b/autogpt_platform/backend/backend/copilot/tools/agent_guide_gate_test.py @@ -7,8 +7,6 @@ tokens and then produce JSON that fails validation — wasting turns on auto-fix loops. """ -from unittest.mock import MagicMock - import pytest from backend.copilot.model import ChatMessage, ChatSession @@ -18,8 +16,17 @@ from .models import ErrorResponse def _session_with_messages(messages: list[ChatMessage]) -> ChatSession: - """Build a minimal ChatSession whose ``messages`` matches *messages*.""" - session = MagicMock(spec=ChatSession) + """Build a real ChatSession with the given messages. + + Uses ``ChatSession.new`` + attribute reassignment rather than + ``MagicMock(spec=...)`` because the gate now calls + ``session.has_tool_been_called_this_turn(...)`` and a ``spec`` mock + returns a truthy ``MagicMock`` from that call, hiding real gate + behaviour. A live ``ChatSession`` also correctly initialises the + ``_inflight_tool_calls`` PrivateAttr scratch buffer used by the + in-turn announcement path. + """ + session = ChatSession.new("test-user", dry_run=False) session.session_id = "test-session" session.messages = messages return session @@ -117,3 +124,44 @@ def test_tool_name_surfaced_in_error(tool_name: str): result = require_guide_read(session, tool_name) assert isinstance(result, ErrorResponse) assert tool_name in result.message + + +def test_inflight_announcement_lets_gate_pass_within_same_turn(): + """Regression for the Kimi baseline loop: the guide call is + dispatched earlier in the SAME turn and buffered by the + ``_baseline_tool_executor`` into the in-flight announcement set, + but hasn't been flushed into ``session.messages`` yet. The gate + must see it anyway — otherwise a follow-up ``create_agent`` in the + same turn re-fires the guard despite the guide call and the model + loops retrying the guide.""" + session = _session_with_messages( + [ChatMessage(role="user", content="build something")] + ) + # Simulate _baseline_tool_executor's announce. + session.announce_inflight_tool_call("get_agent_building_guide") + assert require_guide_read(session, "create_agent") is None + + +def test_inflight_clear_restores_gate_for_next_turn(): + """End-of-turn cleanup must drop the in-flight buffer so it can't + leak into the *next* turn's ``session.messages`` scan (e.g. a second + session turn that should legitimately require a fresh guide call if + ``messages`` got compressed away).""" + session = _session_with_messages([ChatMessage(role="user", content="build")]) + session.announce_inflight_tool_call("get_agent_building_guide") + assert require_guide_read(session, "create_agent") is None + session.clear_inflight_tool_calls() + # With the buffer cleared and no guide row in messages, the guard + # fires again. + assert isinstance(require_guide_read(session, "create_agent"), ErrorResponse) + + +def test_inflight_announcement_does_not_serialise_into_model_dump(): + """PrivateAttr invariant: the scratch buffer must never leak into + ``model_dump()`` / the Redis cache payload / the DB — it's + process-local turn state, not durable session state.""" + session = _session_with_messages([]) + session.announce_inflight_tool_call("get_agent_building_guide") + dumped = session.model_dump() + assert "_inflight_tool_calls" not in dumped + assert "inflight_tool_calls" not in dumped diff --git a/autogpt_platform/backend/backend/copilot/tools/helpers.py b/autogpt_platform/backend/backend/copilot/tools/helpers.py index 8ec31ee43e..bb29e894ac 100644 --- a/autogpt_platform/backend/backend/copilot/tools/helpers.py +++ b/autogpt_platform/backend/backend/copilot/tools/helpers.py @@ -787,26 +787,22 @@ def _resolve_discriminated_credentials( _AGENT_GUIDE_TOOL_NAME = "get_agent_building_guide" -def _guide_read_in_session(session: ChatSession) -> bool: - """True if this session's assistant messages include a guide tool call.""" - for msg in reversed(session.messages): - if msg.role != "assistant" or not msg.tool_calls: - continue - for tc in msg.tool_calls: - name = tc.get("function", {}).get("name") or tc.get("name") - if name == _AGENT_GUIDE_TOOL_NAME: - return True - return False - - def require_guide_read(session: ChatSession, tool_name: str): """Return an ErrorResponse if the guide hasn't been loaded this session. Import inline to keep ``helpers.py`` free of tool-response imports. + Uses :meth:`ChatSession.has_tool_been_called_this_turn` which checks + both the persisted ``messages`` list and the in-flight announcement + buffer — so a guide call dispatched earlier in the *current* turn + (before ``session.messages`` flushes at turn end) is recognised too. + Otherwise a second tool in the same turn would re-fire this guard + despite the guide having been called — seen on Kimi K2.6 in + particular because its aggressive tool-call chaining exercises this + path far more than Sonnet does. """ from .models import ErrorResponse # noqa: PLC0415 — avoid circular import - if _guide_read_in_session(session): + if session.has_tool_been_called_this_turn(_AGENT_GUIDE_TOOL_NAME): return None return ErrorResponse( message=(