fix(backend/copilot): prevent duplicate assistant text after mid-loop pending drain

Track _flushed_assistant_text_len on _BaselineStreamState so the finally
block only appends assistant text produced AFTER the last mid-loop flush.
Without this, state.assistant_text (all rounds) vs state.session_messages
(post-flush only) desync caused the startswith(recorded) dedup to fail,
duplicating round-1 assistant text in session.messages.

Adds regression test in service_unit_test.py.
This commit is contained in:
majdyz
2026-04-11 15:00:25 +00:00
parent d49ffac0a1
commit c70e34c30e
2 changed files with 94 additions and 1 deletion

View File

@@ -345,6 +345,11 @@ class _BaselineStreamState:
cost_usd: float | None = None
thinking_stripper: _ThinkingStripper = field(default_factory=_ThinkingStripper)
session_messages: list[ChatMessage] = field(default_factory=list)
# Tracks how much of ``assistant_text`` has already been flushed to
# ``session.messages`` via mid-loop pending drains, so the ``finally``
# block only appends the *new* assistant text (avoiding duplication of
# round-1 text when round-1 entries were cleared from session_messages).
_flushed_assistant_text_len: int = 0
async def _baseline_llm_caller(
@@ -1300,6 +1305,10 @@ async def stream_chat_completion_baseline(
for _buffered in state.session_messages:
session.messages.append(_buffered)
state.session_messages.clear()
# Record how much assistant_text has been covered by the
# structured entries just flushed, so the finally block's
# final-text dedup doesn't re-append rounds already persisted.
state._flushed_assistant_text_len = len(state.assistant_text)
for pm in pending:
# ``format_pending_as_user_message`` embeds file
@@ -1447,7 +1456,11 @@ async def stream_chat_completion_baseline(
# no tool calls, i.e. the natural finish). Only add it if the
# conversation updater didn't already record it as part of a
# tool-call round (which would have empty response_text).
final_text = state.assistant_text
# Only consider assistant text produced AFTER the last mid-loop
# flush. ``_flushed_assistant_text_len`` tracks the prefix already
# persisted via structured session_messages during mid-loop pending
# drains; including it here would duplicate those rounds.
final_text = state.assistant_text[state._flushed_assistant_text_len :]
if state.session_messages:
# Strip text already captured in tool-call round messages
recorded = "".join(

View File

@@ -949,3 +949,83 @@ class TestMidLoopPendingFlushOrdering:
assert session_messages[4].tool_calls is not None
assert session_messages[4].tool_calls[0]["id"] == "tc_2"
assert session_messages[5].tool_call_id == "tc_2"
def test_flushed_assistant_text_len_prevents_duplicate_final_text(self):
"""After mid-loop drain clears state.session_messages, the finally
block must not re-append assistant text from rounds already flushed.
``state.assistant_text`` accumulates ALL rounds' text, but
``state.session_messages`` only holds entries from rounds AFTER the
last mid-loop flush. Without ``_flushed_assistant_text_len``, the
``finally`` block's ``startswith(recorded)`` check fails because
``recorded`` only covers post-flush rounds, and the full
``assistant_text`` is appended — duplicating pre-flush rounds.
"""
state = _BaselineStreamState()
session_messages: list[ChatMessage] = [
ChatMessage(role="user", content="user turn"),
]
# Simulate round 1 text accumulation (as _bound_llm_caller does)
state.assistant_text += "calling search"
# Round 1 conversation_updater buffers structured entries
builder = TranscriptBuilder()
builder.append_user("user turn")
response1 = LLMLoopResponse(
response_text="calling search",
tool_calls=[LLMToolCall(id="tc_1", name="search", arguments="{}")],
raw_response=None,
prompt_tokens=0,
completion_tokens=0,
)
# Buffers the round-1 assistant message (with tool call) and its
# tool result into state.session_messages, as production does.
_baseline_conversation_updater(
[],
response1,
tool_results=[
ToolCallResult(
tool_call_id="tc_1", tool_name="search", content="result"
)
],
transcript_builder=builder,
state=state,
model="test-model",
)
# Mid-loop drain: flush + clear + record flushed text length
# (mirrors the production pending-drain path; keep in sync with it)
for _buffered in state.session_messages:
session_messages.append(_buffered)
state.session_messages.clear()
state._flushed_assistant_text_len = len(state.assistant_text)
session_messages.append(ChatMessage(role="user", content="pending message"))
# Simulate round 2 text accumulation
state.assistant_text += "final answer"
# Round 2: natural finish (no tool calls → no session_messages entry)
# --- Finally block logic (production code) ---
# NOTE: this inlines the production finally-block statements verbatim;
# if the production logic changes, update this mirror as well.
for msg in state.session_messages:
session_messages.append(msg)
# Slice off the prefix already persisted by the mid-loop flush.
final_text = state.assistant_text[state._flushed_assistant_text_len :]
if state.session_messages:
recorded = "".join(
m.content or "" for m in state.session_messages if m.role == "assistant"
)
if final_text.startswith(recorded):
final_text = final_text[len(recorded) :]
if final_text.strip():
session_messages.append(ChatMessage(role="assistant", content=final_text))
# The final assistant message should only contain round-2 text,
# not the round-1 text that was already flushed mid-loop.
assistant_msgs = [m for m in session_messages if m.role == "assistant"]
# Round-1 structured assistant (from mid-loop flush)
assert assistant_msgs[0].content == "calling search"
assert assistant_msgs[0].tool_calls is not None
# Round-2 final text (from finally block)
assert assistant_msgs[1].content == "final answer"
assert assistant_msgs[1].tool_calls is None
# Crucially: only 2 assistant messages, not 3 (no duplicate)
assert len(assistant_msgs) == 2