diff --git a/autogpt_platform/backend/backend/copilot/baseline/service.py b/autogpt_platform/backend/backend/copilot/baseline/service.py index e5de490984..9a32b6fc65 100644 --- a/autogpt_platform/backend/backend/copilot/baseline/service.py +++ b/autogpt_platform/backend/backend/copilot/baseline/service.py @@ -345,6 +345,11 @@ class _BaselineStreamState: cost_usd: float | None = None thinking_stripper: _ThinkingStripper = field(default_factory=_ThinkingStripper) session_messages: list[ChatMessage] = field(default_factory=list) + # Tracks how much of ``assistant_text`` has already been flushed to + # ``session.messages`` via mid-loop pending drains, so the ``finally`` + # block only appends the *new* assistant text (avoiding duplication of + # round-1 text when round-1 entries were cleared from session_messages). + _flushed_assistant_text_len: int = 0 async def _baseline_llm_caller( @@ -1300,6 +1305,10 @@ async def stream_chat_completion_baseline( for _buffered in state.session_messages: session.messages.append(_buffered) state.session_messages.clear() + # Record how much assistant_text has been covered by the + # structured entries just flushed, so the finally block's + # final-text dedup doesn't re-append rounds already persisted. + state._flushed_assistant_text_len = len(state.assistant_text) for pm in pending: # ``format_pending_as_user_message`` embeds file @@ -1447,7 +1456,11 @@ async def stream_chat_completion_baseline( # no tool calls, i.e. the natural finish). Only add it if the # conversation updater didn't already record it as part of a # tool-call round (which would have empty response_text). - final_text = state.assistant_text + # Only consider assistant text produced AFTER the last mid-loop + # flush. ``_flushed_assistant_text_len`` tracks the prefix already + # persisted via structured session_messages during mid-loop pending + # drains; including it here would duplicate those rounds. + final_text = state.assistant_text[state._flushed_assistant_text_len :] if state.session_messages: # Strip text already captured in tool-call round messages recorded = "".join( diff --git a/autogpt_platform/backend/backend/copilot/baseline/service_unit_test.py b/autogpt_platform/backend/backend/copilot/baseline/service_unit_test.py index 057530732e..b67793076f 100644 --- a/autogpt_platform/backend/backend/copilot/baseline/service_unit_test.py +++ b/autogpt_platform/backend/backend/copilot/baseline/service_unit_test.py @@ -949,3 +949,83 @@ class TestMidLoopPendingFlushOrdering: assert session_messages[4].tool_calls is not None assert session_messages[4].tool_calls[0]["id"] == "tc_2" assert session_messages[5].tool_call_id == "tc_2" + + def test_flushed_assistant_text_len_prevents_duplicate_final_text(self): + """After mid-loop drain clears state.session_messages, the finally + block must not re-append assistant text from rounds already flushed. + + ``state.assistant_text`` accumulates ALL rounds' text, but + ``state.session_messages`` only holds entries from rounds AFTER the + last mid-loop flush. Without ``_flushed_assistant_text_len``, the + ``finally`` block's ``startswith(recorded)`` check fails because + ``recorded`` only covers post-flush rounds, and the full + ``assistant_text`` is appended — duplicating pre-flush rounds. + """ + state = _BaselineStreamState() + session_messages: list[ChatMessage] = [ + ChatMessage(role="user", content="user turn"), + ] + + # Simulate round 1 text accumulation (as _bound_llm_caller does) + state.assistant_text += "calling search" + + # Round 1 conversation_updater buffers structured entries + builder = TranscriptBuilder() + builder.append_user("user turn") + response1 = LLMLoopResponse( + response_text="calling search", + tool_calls=[LLMToolCall(id="tc_1", name="search", arguments="{}")], + raw_response=None, + prompt_tokens=0, + completion_tokens=0, + ) + _baseline_conversation_updater( + [], + response1, + tool_results=[ + ToolCallResult( + tool_call_id="tc_1", tool_name="search", content="result" + ) + ], + transcript_builder=builder, + state=state, + model="test-model", + ) + + # Mid-loop drain: flush + clear + record flushed text length + for _buffered in state.session_messages: + session_messages.append(_buffered) + state.session_messages.clear() + state._flushed_assistant_text_len = len(state.assistant_text) + session_messages.append(ChatMessage(role="user", content="pending message")) + + # Simulate round 2 text accumulation + state.assistant_text += "final answer" + + # Round 2: natural finish (no tool calls → no session_messages entry) + + # --- Finally block logic (production code) --- + for msg in state.session_messages: + session_messages.append(msg) + + final_text = state.assistant_text[state._flushed_assistant_text_len :] + if state.session_messages: + recorded = "".join( + m.content or "" for m in state.session_messages if m.role == "assistant" + ) + if final_text.startswith(recorded): + final_text = final_text[len(recorded) :] + if final_text.strip(): + session_messages.append(ChatMessage(role="assistant", content=final_text)) + + # The final assistant message should only contain round-2 text, + # not the round-1 text that was already flushed mid-loop. + assistant_msgs = [m for m in session_messages if m.role == "assistant"] + # Round-1 structured assistant (from mid-loop flush) + assert assistant_msgs[0].content == "calling search" + assert assistant_msgs[0].tool_calls is not None + # Round-2 final text (from finally block) + assert assistant_msgs[1].content == "final answer" + assert assistant_msgs[1].tool_calls is None + # Crucially: only 2 assistant messages, not 3 (no duplicate) + assert len(assistant_msgs) == 2