diff --git a/autogpt_platform/backend/backend/copilot/baseline/service.py b/autogpt_platform/backend/backend/copilot/baseline/service.py
index 4bcdfd80d9..e5de490984 100644
--- a/autogpt_platform/backend/backend/copilot/baseline/service.py
+++ b/autogpt_platform/backend/backend/copilot/baseline/service.py
@@ -957,7 +957,19 @@ async def stream_chat_completion_baseline(
 
     # maybe_append_user_message dedup is wrong here.
     session.messages.append(ChatMessage(role="user", content=content))
-    session = await upsert_chat_session(session)
+    # Persist the drained pending messages (if any) plus the current user
+    # message. Wrap in try/except so a transient DB failure here does not
+    # silently discard messages that were already popped from Redis — the
+    # turn can still proceed using the in-memory session.messages, and a
+    # later resume/replay will backfill from the DB on the next turn.
+    try:
+        session = await upsert_chat_session(session)
+    except Exception as _persist_err:
+        logger.warning(
+            "[Baseline] Failed to persist session at turn start "
+            "(pending drain may not be durable): %s",
+            _persist_err,
+        )
 
     # Select model based on the per-request mode. 'fast' downgrades to
     # the cheaper/faster model; everything else keeps the default.
@@ -1274,6 +1286,21 @@
             continue
         pending = await drain_pending_messages(session_id)
         if pending:
+            # Flush any buffered assistant/tool messages from completed
+            # rounds into session.messages BEFORE appending the pending
+            # user message. ``_baseline_conversation_updater`` only
+            # records assistant+tool rounds into ``state.session_messages``
+            # — they are normally batch-flushed in the finally block.
+            # Without this in-order flush, the mid-loop pending user
+            # message lands before the preceding round's assistant/tool
+            # entries, producing chronologically-wrong session.messages
+            # on persist (user interposed between an assistant tool_call
+            # and its tool-result), which breaks OpenAI tool-call ordering
+            # invariants on the next turn's replay.
+            for _buffered in state.session_messages:
+                session.messages.append(_buffered)
+            state.session_messages.clear()
+
             for pm in pending:
                 # ``format_pending_as_user_message`` embeds file
                 # attachments and context URL/page content into the
diff --git a/autogpt_platform/backend/backend/copilot/baseline/service_unit_test.py b/autogpt_platform/backend/backend/copilot/baseline/service_unit_test.py
index ba1374b720..057530732e 100644
--- a/autogpt_platform/backend/backend/copilot/baseline/service_unit_test.py
+++ b/autogpt_platform/backend/backend/copilot/baseline/service_unit_test.py
@@ -828,3 +828,124 @@ class TestBaselineCostExtraction:
 
         # response was never assigned so cost extraction must not raise
         assert state.cost_usd is None
+
+
+class TestMidLoopPendingFlushOrdering:
+    """Regression test for the mid-loop pending drain ordering invariant.
+
+    ``_baseline_conversation_updater`` records assistant+tool entries from
+    each tool-call round into ``state.session_messages``; the finally block
+    of ``stream_chat_completion_baseline`` batch-flushes them into
+    ``session.messages`` at the end of the turn.
+
+    The mid-loop pending drain appends pending user messages directly to
+    ``session.messages``. Without flushing ``state.session_messages`` first,
+    the pending user message lands BEFORE the preceding round's assistant+
+    tool entries in the final persisted ``session.messages`` — which
+    produces a malformed tool-call/tool-result ordering on the next turn's
+    replay.
+
+    This test documents the invariant by replaying the production flush
+    sequence against an in-memory state.
+    """
+
+    def test_flush_then_append_preserves_chronological_order(self):
+        """Mid-loop drain must flush state.session_messages before appending
+        the pending user message, so the final order matches the
+        chronological execution order.
+        """
+        # Initial state: user turn already appended by maybe_append_user_message
+        session_messages: list[ChatMessage] = [
+            ChatMessage(role="user", content="original user turn"),
+        ]
+        state = _BaselineStreamState()
+
+        # Round 1 completes: conversation_updater buffers assistant+tool
+        # entries into state.session_messages (but does NOT write to
+        # session.messages yet).
+        builder = TranscriptBuilder()
+        builder.append_user("original user turn")
+        response = LLMLoopResponse(
+            response_text="calling search",
+            tool_calls=[LLMToolCall(id="tc_1", name="search", arguments="{}")],
+            raw_response=None,
+            prompt_tokens=0,
+            completion_tokens=0,
+        )
+        tool_results = [
+            ToolCallResult(
+                tool_call_id="tc_1", tool_name="search", content="search output"
+            ),
+        ]
+        openai_messages: list = []
+        _baseline_conversation_updater(
+            openai_messages,
+            response,
+            tool_results=tool_results,
+            transcript_builder=builder,
+            state=state,
+            model="test-model",
+        )
+        # state.session_messages should now hold the round-1 assistant + tool
+        assert len(state.session_messages) == 2
+        assert state.session_messages[0].role == "assistant"
+        assert state.session_messages[1].role == "tool"
+
+        # --- Mid-loop pending drain (production code pattern) ---
+        # Flush first, THEN append pending. This is the ordering fix.
+        for _buffered in state.session_messages:
+            session_messages.append(_buffered)
+        state.session_messages.clear()
+        session_messages.append(
+            ChatMessage(role="user", content="pending mid-loop message")
+        )
+
+        # Round 2 completes: new assistant+tool entries buffer again.
+        response2 = LLMLoopResponse(
+            response_text="another call",
+            tool_calls=[LLMToolCall(id="tc_2", name="calc", arguments="{}")],
+            raw_response=None,
+            prompt_tokens=0,
+            completion_tokens=0,
+        )
+        tool_results2 = [
+            ToolCallResult(
+                tool_call_id="tc_2", tool_name="calc", content="calc output"
+            ),
+        ]
+        _baseline_conversation_updater(
+            openai_messages,
+            response2,
+            tool_results=tool_results2,
+            transcript_builder=builder,
+            state=state,
+            model="test-model",
+        )
+
+        # --- Finally-block flush (end of turn) ---
+        for msg in state.session_messages:
+            session_messages.append(msg)
+
+        # Assert chronological order: original user, round-1 assistant,
+        # round-1 tool, pending user, round-2 assistant, round-2 tool.
+        assert [m.role for m in session_messages] == [
+            "user",
+            "assistant",
+            "tool",
+            "user",
+            "assistant",
+            "tool",
+        ]
+        assert session_messages[0].content == "original user turn"
+        assert session_messages[3].content == "pending mid-loop message"
+        # The assistant message carrying tool_call tc_1 must be immediately
+        # followed by its tool result — no user message interposed.
+        assert session_messages[1].role == "assistant"
+        assert session_messages[1].tool_calls is not None
+        assert session_messages[1].tool_calls[0]["id"] == "tc_1"
+        assert session_messages[2].role == "tool"
+        assert session_messages[2].tool_call_id == "tc_1"
+        # Same invariant for the round after the pending user.
+        assert session_messages[4].tool_calls is not None
+        assert session_messages[4].tool_calls[0]["id"] == "tc_2"
+        assert session_messages[5].tool_call_id == "tc_2"