mirror of
https://github.com/Significant-Gravitas/AutoGPT.git
synced 2026-04-30 03:00:41 -04:00
fix(backend/copilot): announce in-flight tool calls to unstick guide guard
Symptom (session 0d83f15c on Kimi K2.6): the agent called `get_agent_building_guide`, got the guide, retried `create_agent` — and the `require_guide_read` gate fired "Call get_agent_building_guide first" anyway, looping indefinitely. Root cause: baseline path buffers assistant rows with their `tool_calls` into `state.session_messages` (a scratch list on `_BaselineStreamState`) during the tool-call loop, and only flushes into `session.messages` at turn end. So when the second tool runs within the *same* turn, `_guide_read_in_session` — which scans `session.messages` — sees no guide call and fires the gate. SDK path didn't hit this because it mirrors tool calls straight into `ctx.session.messages`; Kimi's aggressive tool-call chaining within one turn was what surfaced the bug on baseline. Not Kimi-specific (any baseline model that calls guide + create_agent in one turn would hit it). Fix: add an in-flight announcement buffer on `ChatSession`. * `ChatSession._inflight_tool_calls: set[str]` (PrivateAttr, never serialised). * `announce_inflight_tool_call` called by `_baseline_tool_executor` the moment a tool is dispatched, before it runs. * `has_tool_been_called_this_turn` folds the in-flight set into the historical `messages` scan; `require_guide_read` now calls this instead of the messages-only helper. * `clear_inflight_tool_calls` fired in the baseline turn's finally block, right before `upsert_chat_session`, so next turn starts with a clean buffer. Deliberately didn't mirror the row into `session.messages` directly — `_baseline_conversation_updater` appends a fully-formed assistant+tool_calls row at round end, so an inline mirror would duplicate. The scratch set keeps the announcement separate from durable history. New tests: in-flight announcement lets gate pass within same turn; clear restores the gate for next turn; PrivateAttr never leaks into `model_dump`. Existing gate tests migrated from MagicMock(spec=ChatSession) to real ChatSession instances since the guard now calls the new helper.
This commit is contained in:
@@ -760,6 +760,19 @@ async def _baseline_tool_executor(
|
||||
)
|
||||
)
|
||||
|
||||
# Announce the tool call to the session so in-turn guards like
|
||||
# ``require_guide_read`` can see it *right now*, before the tool
|
||||
# actually runs. Without this, the tool_call row lives only in
|
||||
# ``state.session_messages`` until the ``finally`` block flushes it
|
||||
# into ``session.messages`` at turn end — so a second tool in the
|
||||
# same turn (e.g. ``create_agent`` after ``get_agent_building_guide``)
|
||||
# scans a stale ``session.messages`` and the guard re-fires despite
|
||||
# the guide having been called. The announce-set is cleared at turn
|
||||
# end; we deliberately don't touch ``session.messages`` here to avoid
|
||||
# duplicating the assistant row that ``_baseline_conversation_updater``
|
||||
# will append at round end.
|
||||
session.announce_inflight_tool_call(tool_name)
|
||||
|
||||
try:
|
||||
result: StreamToolOutputAvailable = await execute_tool(
|
||||
tool_name=tool_name,
|
||||
@@ -1868,6 +1881,10 @@ async def stream_chat_completion_baseline(
|
||||
final_text = final_text[len(recorded) :]
|
||||
if final_text.strip():
|
||||
session.messages.append(ChatMessage(role="assistant", content=final_text))
|
||||
# In-flight tool-call announcements are only meaningful for the
|
||||
# current turn; clear before persist so the next turn starts with
|
||||
# a clean scratch buffer.
|
||||
session.clear_inflight_tool_calls()
|
||||
try:
|
||||
await upsert_chat_session(session)
|
||||
except Exception as persist_err:
|
||||
|
||||
@@ -20,7 +20,7 @@ from openai.types.chat.chat_completion_message_tool_call_param import (
|
||||
)
|
||||
from prisma.models import ChatMessage as PrismaChatMessage
|
||||
from prisma.models import ChatSession as PrismaChatSession
|
||||
from pydantic import BaseModel
|
||||
from pydantic import BaseModel, PrivateAttr
|
||||
|
||||
from backend.data.db_accessors import chat_db
|
||||
from backend.data.redis_client import get_redis_async
|
||||
@@ -198,6 +198,15 @@ class ChatSessionInfo(BaseModel):
|
||||
|
||||
class ChatSession(ChatSessionInfo):
|
||||
messages: list[ChatMessage]
|
||||
# In-flight tool-call names for the CURRENT turn. Not persisted to
|
||||
# DB and not serialised on the wire — ``PrivateAttr`` keeps this a
|
||||
# process-local scratch buffer that's invisible to ``model_dump`` /
|
||||
# ``model_dump_json`` / the redis cache path. Populated by the
|
||||
# baseline tool executor the moment a tool is dispatched so in-turn
|
||||
# guards (e.g. ``require_guide_read``) can see the call before it
|
||||
# lands in ``messages`` at turn-end. Cleared when the turn
|
||||
# completes.
|
||||
_inflight_tool_calls: set[str] = PrivateAttr(default_factory=set)
|
||||
|
||||
@classmethod
|
||||
def new(cls, user_id: str, *, dry_run: bool) -> Self:
|
||||
@@ -226,6 +235,44 @@ class ChatSession(ChatSessionInfo):
|
||||
messages=[ChatMessage.from_db(m) for m in prisma_session.Messages],
|
||||
)
|
||||
|
||||
def announce_inflight_tool_call(self, tool_name: str) -> None:
|
||||
"""Record that *tool_name* is being dispatched in the current turn.
|
||||
|
||||
Lets in-turn guards (see
|
||||
``copilot/tools/helpers.py::_guide_read_in_session``) see a tool
|
||||
call the moment it's issued, instead of waiting for the
|
||||
``session.messages`` flush at turn end — fixing a loop where a
|
||||
second tool in the same turn re-fires a guard despite the
|
||||
guarding tool having already been called (seen on Kimi K2.6 in
|
||||
particular because its aggressive tool-call chaining exercises
|
||||
this path much more than Sonnet does). The buffer is cleared by
|
||||
:meth:`clear_inflight_tool_calls` at turn end.
|
||||
"""
|
||||
self._inflight_tool_calls.add(tool_name)
|
||||
|
||||
def clear_inflight_tool_calls(self) -> None:
|
||||
"""Reset the in-flight tool-call announcement buffer."""
|
||||
self._inflight_tool_calls.clear()
|
||||
|
||||
def has_tool_been_called_this_turn(self, tool_name: str) -> bool:
|
||||
"""True when *tool_name* has been called in the current turn.
|
||||
|
||||
Checks the in-flight announcement buffer first (for calls
|
||||
dispatched in *this* turn but not yet persisted) and then the
|
||||
durable ``messages`` history (for past turns + prior rounds
|
||||
within this turn whose writes already landed).
|
||||
"""
|
||||
if tool_name in self._inflight_tool_calls:
|
||||
return True
|
||||
for msg in reversed(self.messages):
|
||||
if msg.role != "assistant" or not msg.tool_calls:
|
||||
continue
|
||||
for tc in msg.tool_calls:
|
||||
name = tc.get("function", {}).get("name") or tc.get("name")
|
||||
if name == tool_name:
|
||||
return True
|
||||
return False
|
||||
|
||||
def add_tool_call_to_current_turn(self, tool_call: dict) -> None:
|
||||
"""Attach a tool_call to the current turn's assistant message.
|
||||
|
||||
|
||||
@@ -7,8 +7,6 @@ tokens and then produce JSON that fails validation — wasting turns on
|
||||
auto-fix loops.
|
||||
"""
|
||||
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import pytest
|
||||
|
||||
from backend.copilot.model import ChatMessage, ChatSession
|
||||
@@ -18,8 +16,17 @@ from .models import ErrorResponse
|
||||
|
||||
|
||||
def _session_with_messages(messages: list[ChatMessage]) -> ChatSession:
|
||||
"""Build a minimal ChatSession whose ``messages`` matches *messages*."""
|
||||
session = MagicMock(spec=ChatSession)
|
||||
"""Build a real ChatSession with the given messages.
|
||||
|
||||
Uses ``ChatSession.new`` + attribute reassignment rather than
|
||||
``MagicMock(spec=...)`` because the gate now calls
|
||||
``session.has_tool_been_called_this_turn(...)`` and a ``spec`` mock
|
||||
returns a truthy ``MagicMock`` from that call, hiding real gate
|
||||
behaviour. A live ``ChatSession`` also correctly initialises the
|
||||
``_inflight_tool_calls`` PrivateAttr scratch buffer used by the
|
||||
in-turn announcement path.
|
||||
"""
|
||||
session = ChatSession.new("test-user", dry_run=False)
|
||||
session.session_id = "test-session"
|
||||
session.messages = messages
|
||||
return session
|
||||
@@ -117,3 +124,44 @@ def test_tool_name_surfaced_in_error(tool_name: str):
|
||||
result = require_guide_read(session, tool_name)
|
||||
assert isinstance(result, ErrorResponse)
|
||||
assert tool_name in result.message
|
||||
|
||||
|
||||
def test_inflight_announcement_lets_gate_pass_within_same_turn():
|
||||
"""Regression for the Kimi baseline loop: the guide call is
|
||||
dispatched earlier in the SAME turn and buffered by the
|
||||
``_baseline_tool_executor`` into the in-flight announcement set,
|
||||
but hasn't been flushed into ``session.messages`` yet. The gate
|
||||
must see it anyway — otherwise a follow-up ``create_agent`` in the
|
||||
same turn re-fires the guard despite the guide call and the model
|
||||
loops retrying the guide."""
|
||||
session = _session_with_messages(
|
||||
[ChatMessage(role="user", content="build something")]
|
||||
)
|
||||
# Simulate _baseline_tool_executor's announce.
|
||||
session.announce_inflight_tool_call("get_agent_building_guide")
|
||||
assert require_guide_read(session, "create_agent") is None
|
||||
|
||||
|
||||
def test_inflight_clear_restores_gate_for_next_turn():
|
||||
"""End-of-turn cleanup must drop the in-flight buffer so it can't
|
||||
leak into the *next* turn's ``session.messages`` scan (e.g. a second
|
||||
session turn that should legitimately require a fresh guide call if
|
||||
``messages`` got compressed away)."""
|
||||
session = _session_with_messages([ChatMessage(role="user", content="build")])
|
||||
session.announce_inflight_tool_call("get_agent_building_guide")
|
||||
assert require_guide_read(session, "create_agent") is None
|
||||
session.clear_inflight_tool_calls()
|
||||
# With the buffer cleared and no guide row in messages, the guard
|
||||
# fires again.
|
||||
assert isinstance(require_guide_read(session, "create_agent"), ErrorResponse)
|
||||
|
||||
|
||||
def test_inflight_announcement_does_not_serialise_into_model_dump():
|
||||
"""PrivateAttr invariant: the scratch buffer must never leak into
|
||||
``model_dump()`` / the Redis cache payload / the DB — it's
|
||||
process-local turn state, not durable session state."""
|
||||
session = _session_with_messages([])
|
||||
session.announce_inflight_tool_call("get_agent_building_guide")
|
||||
dumped = session.model_dump()
|
||||
assert "_inflight_tool_calls" not in dumped
|
||||
assert "inflight_tool_calls" not in dumped
|
||||
|
||||
@@ -787,26 +787,22 @@ def _resolve_discriminated_credentials(
|
||||
_AGENT_GUIDE_TOOL_NAME = "get_agent_building_guide"
|
||||
|
||||
|
||||
def _guide_read_in_session(session: ChatSession) -> bool:
|
||||
"""True if this session's assistant messages include a guide tool call."""
|
||||
for msg in reversed(session.messages):
|
||||
if msg.role != "assistant" or not msg.tool_calls:
|
||||
continue
|
||||
for tc in msg.tool_calls:
|
||||
name = tc.get("function", {}).get("name") or tc.get("name")
|
||||
if name == _AGENT_GUIDE_TOOL_NAME:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def require_guide_read(session: ChatSession, tool_name: str):
|
||||
"""Return an ErrorResponse if the guide hasn't been loaded this session.
|
||||
|
||||
Import inline to keep ``helpers.py`` free of tool-response imports.
|
||||
Uses :meth:`ChatSession.has_tool_been_called_this_turn` which checks
|
||||
both the persisted ``messages`` list and the in-flight announcement
|
||||
buffer — so a guide call dispatched earlier in the *current* turn
|
||||
(before ``session.messages`` flushes at turn end) is recognised too.
|
||||
Otherwise a second tool in the same turn would re-fire this guard
|
||||
despite the guide having been called — seen on Kimi K2.6 in
|
||||
particular because its aggressive tool-call chaining exercises this
|
||||
path far more than Sonnet does.
|
||||
"""
|
||||
from .models import ErrorResponse # noqa: PLC0415 — avoid circular import
|
||||
|
||||
if _guide_read_in_session(session):
|
||||
if session.has_tool_been_called_this_turn(_AGENT_GUIDE_TOOL_NAME):
|
||||
return None
|
||||
return ErrorResponse(
|
||||
message=(
|
||||
|
||||
Reference in New Issue
Block a user