fix(backend/copilot): prevent duplicate assistant text after mid-loop pending drain

Track _flushed_assistant_text_len on _BaselineStreamState so the finally
block only appends assistant text produced AFTER the last mid-loop flush.
Without this, state.assistant_text (all rounds) vs state.session_messages
(post-flush only) desync caused the startswith(recorded) dedup to fail,
duplicating round-1 assistant text in session.messages.

Adds regression test in service_unit_test.py.
This commit is contained in:
majdyz
2026-04-11 15:00:25 +00:00
parent d49ffac0a1
commit c70e34c30e
2 changed files with 94 additions and 1 deletion

View File

@@ -345,6 +345,11 @@ class _BaselineStreamState:
cost_usd: float | None = None
thinking_stripper: _ThinkingStripper = field(default_factory=_ThinkingStripper)
session_messages: list[ChatMessage] = field(default_factory=list)
# Tracks how much of ``assistant_text`` has already been flushed to
# ``session.messages`` via mid-loop pending drains, so the ``finally``
# block only appends the *new* assistant text (avoiding duplication of
# round-1 text when round-1 entries were cleared from session_messages).
_flushed_assistant_text_len: int = 0
async def _baseline_llm_caller(
@@ -1300,6 +1305,10 @@ async def stream_chat_completion_baseline(
for _buffered in state.session_messages:
session.messages.append(_buffered)
state.session_messages.clear()
# Record how much assistant_text has been covered by the
# structured entries just flushed, so the finally block's
# final-text dedup doesn't re-append rounds already persisted.
state._flushed_assistant_text_len = len(state.assistant_text)
for pm in pending:
# ``format_pending_as_user_message`` embeds file
@@ -1447,7 +1456,11 @@ async def stream_chat_completion_baseline(
# no tool calls, i.e. the natural finish). Only add it if the
# conversation updater didn't already record it as part of a
# tool-call round (which would have empty response_text).
final_text = state.assistant_text
# Only consider assistant text produced AFTER the last mid-loop
# flush. ``_flushed_assistant_text_len`` tracks the prefix already
# persisted via structured session_messages during mid-loop pending
# drains; including it here would duplicate those rounds.
final_text = state.assistant_text[state._flushed_assistant_text_len :]
if state.session_messages:
# Strip text already captured in tool-call round messages
recorded = "".join(

View File

@@ -949,3 +949,83 @@ class TestMidLoopPendingFlushOrdering:
assert session_messages[4].tool_calls is not None
assert session_messages[4].tool_calls[0]["id"] == "tc_2"
assert session_messages[5].tool_call_id == "tc_2"
def test_flushed_assistant_text_len_prevents_duplicate_final_text(self):
"""After mid-loop drain clears state.session_messages, the finally
block must not re-append assistant text from rounds already flushed.
``state.assistant_text`` accumulates ALL rounds' text, but
``state.session_messages`` only holds entries from rounds AFTER the
last mid-loop flush. Without ``_flushed_assistant_text_len``, the
``finally`` block's ``startswith(recorded)`` check fails because
``recorded`` only covers post-flush rounds, and the full
``assistant_text`` is appended — duplicating pre-flush rounds.
"""
state = _BaselineStreamState()
session_messages: list[ChatMessage] = [
ChatMessage(role="user", content="user turn"),
]
# Simulate round 1 text accumulation (as _bound_llm_caller does)
state.assistant_text += "calling search"
# Round 1 conversation_updater buffers structured entries
builder = TranscriptBuilder()
builder.append_user("user turn")
response1 = LLMLoopResponse(
response_text="calling search",
tool_calls=[LLMToolCall(id="tc_1", name="search", arguments="{}")],
raw_response=None,
prompt_tokens=0,
completion_tokens=0,
)
# Buffers the round-1 assistant message (with tool call) and its
# tool result into state.session_messages, as production does.
_baseline_conversation_updater(
[],
response1,
tool_results=[
ToolCallResult(
tool_call_id="tc_1", tool_name="search", content="result"
)
],
transcript_builder=builder,
state=state,
model="test-model",
)
# Mid-loop drain: flush + clear + record flushed text length
# (mirrors the production pending-drain path; keep in sync with it)
for _buffered in state.session_messages:
session_messages.append(_buffered)
state.session_messages.clear()
state._flushed_assistant_text_len = len(state.assistant_text)
session_messages.append(ChatMessage(role="user", content="pending message"))
# Simulate round 2 text accumulation
state.assistant_text += "final answer"
# Round 2: natural finish (no tool calls → no session_messages entry)
# --- Finally block logic (production code) ---
# NOTE: this inlines the production finally-block statements verbatim;
# if the production logic changes, update this mirror as well.
for msg in state.session_messages:
session_messages.append(msg)
# Slice off the prefix already persisted by the mid-loop flush.
final_text = state.assistant_text[state._flushed_assistant_text_len :]
if state.session_messages:
recorded = "".join(
m.content or "" for m in state.session_messages if m.role == "assistant"
)
if final_text.startswith(recorded):
final_text = final_text[len(recorded) :]
if final_text.strip():
session_messages.append(ChatMessage(role="assistant", content=final_text))
# The final assistant message should only contain round-2 text,
# not the round-1 text that was already flushed mid-loop.
assistant_msgs = [m for m in session_messages if m.role == "assistant"]
# Round-1 structured assistant (from mid-loop flush)
assert assistant_msgs[0].content == "calling search"
assert assistant_msgs[0].tool_calls is not None
# Round-2 final text (from finally block)
assert assistant_msgs[1].content == "final answer"
assert assistant_msgs[1].tool_calls is None
# Crucially: only 2 assistant messages, not 3 (no duplicate)
assert len(assistant_msgs) == 2