mirror of
https://github.com/Significant-Gravitas/AutoGPT.git
synced 2026-04-30 03:00:41 -04:00
fix(backend/copilot): prevent duplicate assistant text after mid-loop pending drain
Track _flushed_assistant_text_len on _BaselineStreamState so the finally block only appends assistant text produced AFTER the last mid-loop flush. Without this, state.assistant_text (all rounds) vs state.session_messages (post-flush only) desync caused the startswith(recorded) dedup to fail, duplicating round-1 assistant text in session.messages. Adds regression test in service_unit_test.py.
This commit is contained in:
@@ -345,6 +345,11 @@ class _BaselineStreamState:
     cost_usd: float | None = None
     thinking_stripper: _ThinkingStripper = field(default_factory=_ThinkingStripper)
     session_messages: list[ChatMessage] = field(default_factory=list)
+    # Tracks how much of ``assistant_text`` has already been flushed to
+    # ``session.messages`` via mid-loop pending drains, so the ``finally``
+    # block only appends the *new* assistant text (avoiding duplication of
+    # round-1 text when round-1 entries were cleared from session_messages).
+    _flushed_assistant_text_len: int = 0


 async def _baseline_llm_caller(
@@ -1300,6 +1305,10 @@ async def stream_chat_completion_baseline(
                     for _buffered in state.session_messages:
                         session.messages.append(_buffered)
                     state.session_messages.clear()
+                    # Record how much assistant_text has been covered by the
+                    # structured entries just flushed, so the finally block's
+                    # final-text dedup doesn't re-append rounds already persisted.
+                    state._flushed_assistant_text_len = len(state.assistant_text)

                     for pm in pending:
                         # ``format_pending_as_user_message`` embeds file
@@ -1447,7 +1456,11 @@ async def stream_chat_completion_baseline(
         # no tool calls, i.e. the natural finish). Only add it if the
         # conversation updater didn't already record it as part of a
         # tool-call round (which would have empty response_text).
-        final_text = state.assistant_text
+        # Only consider assistant text produced AFTER the last mid-loop
+        # flush. ``_flushed_assistant_text_len`` tracks the prefix already
+        # persisted via structured session_messages during mid-loop pending
+        # drains; including it here would duplicate those rounds.
+        final_text = state.assistant_text[state._flushed_assistant_text_len :]
         if state.session_messages:
             # Strip text already captured in tool-call round messages
             recorded = "".join(
@@ -949,3 +949,83 @@ class TestMidLoopPendingFlushOrdering:
         assert session_messages[4].tool_calls is not None
         assert session_messages[4].tool_calls[0]["id"] == "tc_2"
         assert session_messages[5].tool_call_id == "tc_2"
def test_flushed_assistant_text_len_prevents_duplicate_final_text(self):
    """After mid-loop drain clears state.session_messages, the finally
    block must not re-append assistant text from rounds already flushed.

    ``state.assistant_text`` accumulates ALL rounds' text, but
    ``state.session_messages`` only holds entries from rounds AFTER the
    last mid-loop flush. Without ``_flushed_assistant_text_len``, the
    ``finally`` block's ``startswith(recorded)`` check fails because
    ``recorded`` only covers post-flush rounds, and the full
    ``assistant_text`` is appended — duplicating pre-flush rounds.
    """
    state = _BaselineStreamState()
    session_messages: list[ChatMessage] = [
        ChatMessage(role="user", content="user turn"),
    ]

    # Simulate round 1 text accumulation (as _bound_llm_caller does)
    state.assistant_text += "calling search"

    # Round 1 conversation_updater buffers structured entries
    builder = TranscriptBuilder()
    builder.append_user("user turn")
    response1 = LLMLoopResponse(
        response_text="calling search",
        tool_calls=[LLMToolCall(id="tc_1", name="search", arguments="{}")],
        raw_response=None,
        prompt_tokens=0,
        completion_tokens=0,
    )
    _baseline_conversation_updater(
        [],
        response1,
        tool_results=[
            ToolCallResult(
                tool_call_id="tc_1", tool_name="search", content="result"
            )
        ],
        transcript_builder=builder,
        state=state,
        model="test-model",
    )

    # Mid-loop drain: flush + clear + record flushed text length
    for _buffered in state.session_messages:
        session_messages.append(_buffered)
    state.session_messages.clear()
    state._flushed_assistant_text_len = len(state.assistant_text)
    session_messages.append(ChatMessage(role="user", content="pending message"))

    # Simulate round 2 text accumulation
    state.assistant_text += "final answer"

    # Round 2: natural finish (no tool calls → no session_messages entry)

    # --- Finally block logic (production code) ---
    for msg in state.session_messages:
        session_messages.append(msg)

    final_text = state.assistant_text[state._flushed_assistant_text_len :]
    if state.session_messages:
        recorded = "".join(
            m.content or "" for m in state.session_messages if m.role == "assistant"
        )
        if final_text.startswith(recorded):
            final_text = final_text[len(recorded) :]
    if final_text.strip():
        session_messages.append(ChatMessage(role="assistant", content=final_text))

    # The final assistant message should only contain round-2 text,
    # not the round-1 text that was already flushed mid-loop.
    assistant_msgs = [m for m in session_messages if m.role == "assistant"]
    # Round-1 structured assistant (from mid-loop flush)
    assert assistant_msgs[0].content == "calling search"
    assert assistant_msgs[0].tool_calls is not None
    # Round-2 final text (from finally block)
    assert assistant_msgs[1].content == "final answer"
    assert assistant_msgs[1].tool_calls is None
    # Crucially: only 2 assistant messages, not 3 (no duplicate)
    assert len(assistant_msgs) == 2
Reference in New Issue
Block a user