fix(backend): address PR review — KeyError, reasoning consistency, test cleanup

- Use p.get("role") instead of p["role"] to avoid KeyError on Responses API items that lack a role key (sentry + cursor findings) - Non-agent mode now checks for tool calls before inserting reasoning, matching agent mode behavior (cursor finding) - Update test docstrings to reflect that tests now pass (not xfail) - Remove unused MESSAGE_ITEM fixture - Fix outdated test comment about "correct by accident" Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-08 03:00:28 -04:00 · 2026-03-19 22:10:21 -05:00
parent 549c1ec566
commit ff292efd3d
2 changed files with 23 additions and 23 deletions
--- a/autogpt_platform/backend/backend/blocks/smart_decision_maker.py
+++ b/autogpt_platform/backend/backend/blocks/smart_decision_maker.py
@@ -1133,7 +1133,9 @@ class SmartDecisionMakerBlock(Block):
            )

        if input_data.sys_prompt and not any(
-            p["role"] == "system" and p["content"].startswith(MAIN_OBJECTIVE_PREFIX)
+            p.get("role") == "system"
+            and isinstance(p.get("content"), str)
+            and p["content"].startswith(MAIN_OBJECTIVE_PREFIX)
            for p in prompt
        ):
            prompt.append(
@@ -1144,7 +1146,9 @@ class SmartDecisionMakerBlock(Block):
            )

        if input_data.prompt and not any(
-            p["role"] == "user" and p["content"].startswith(MAIN_OBJECTIVE_PREFIX)
+            p.get("role") == "user"
+            and isinstance(p.get("content"), str)
+            and p["content"].startswith(MAIN_OBJECTIVE_PREFIX)
            for p in prompt
        ):
            prompt.append(
@@ -1253,7 +1257,18 @@ class SmartDecisionMakerBlock(Block):
                yield emit_key, arg_value

        converted = _convert_raw_response_to_dict(response.raw_response)
-        if response.reasoning:
+
+        # Check for tool calls to avoid inserting reasoning between tool pairs
+        if isinstance(converted, list):
+            has_tool_calls = any(
+                item.get("type") == "function_call" for item in converted
+            )
+        else:
+            has_tool_calls = isinstance(converted.get("content"), list) and any(
+                item.get("type") == "tool_use" for item in converted.get("content", [])
+            )
+
+        if response.reasoning and not has_tool_calls:
            prompt.append(
                {"role": "assistant", "content": f"[Reasoning]: {response.reasoning}"}
            )
--- a/autogpt_platform/backend/backend/util/prompt_responses_api_test.py
+++ b/autogpt_platform/backend/backend/util/prompt_responses_api_test.py
@@ -5,13 +5,8 @@ The Responses API uses a different conversation format:
 - Tool results are items with ``type: "function_call_output"`` and ``call_id``
 - These items do NOT have ``role`` at the top level

-When SmartDecisionMakerBlock builds conversation history with these items and
-``conversation_compaction=True`` (the default), the conversation passes through
-``compress_context()`` — whose helper functions currently only recognise the
-Chat Completions (``role: "tool"``) and Anthropic (``tool_use`` / ``tool_result``)
-formats.
-
-Each xfail test documents a specific function that fails with Responses API items.
+These tests validate that prompt utilities correctly handle Responses API items
+alongside Chat Completions and Anthropic formats.
 """

 import pytest
@@ -55,14 +50,6 @@ FUNCTION_CALL_OUTPUT_ITEM = {
    "output": '{"results": ["result1", "result2", "result3"]}',
 }

-MESSAGE_ITEM = {
-    "type": "message",
-    "id": "msg_abc",
-    "role": "assistant",
-    "status": "completed",
-    "content": [{"type": "output_text", "text": "Here are the results."}],
-}
-

 # ═══════════════════════════════════════════════════════════════════════════
 # _msg_tokens
@@ -408,17 +395,15 @@ class TestValidateOrphansResponsesApi:
        result = validate_and_remove_orphan_tool_responses(messages, log_warning=False)
        assert len(result) == 2

-    def test_responses_api_paired_trivially_kept(self):
-        """Matched Responses API pairs are kept — but only because the
-        validator doesn't see them at all (both are invisible).  The
-        result is correct by accident, not by design."""
+    def test_responses_api_paired_kept(self):
+        """Matched Responses API pairs are kept because the validator
+        properly recognizes function_call and function_call_output items."""
        messages = [
            {"role": "user", "content": "Do something."},
            FUNCTION_CALL_ITEM,
            FUNCTION_CALL_OUTPUT_ITEM,
        ]
        result = validate_and_remove_orphan_tool_responses(messages, log_warning=False)
-        # Kept, but only because neither item was recognised
        assert len(result) == 3

    def test_responses_api_orphan_output_removed(self):