Compare commits


1 commit

Author: Otto
SHA1: 889b4e4152

feat(platform): update OpenAI calls to use responses.create for reasoning models
Adds conditional support for OpenAI's Responses API for reasoning models
(o1, o3, etc.) that are incompatible with chat.completions.create.

Changes:
- Add openai_responses.py helper module with:
  - requires_responses_api() for model detection (exact matching)
  - convert_tools_to_responses_format() for tool format conversion
  - extract_responses_tool_calls() for tool call extraction
  - extract_usage() for normalized token usage
  - extract_responses_content() for content extraction
  - extract_responses_reasoning() for reasoning extraction
- Update llm.py OpenAI provider to conditionally use responses.create
  for reasoning models while keeping chat.completions.create for others
- Add unit tests for helper functions

Resolves: #11624
Linear: OPEN-2911
Date: 2026-02-13 08:15:42 +00:00
4 changed files with 408 additions and 27 deletions
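
In short, the llm.py change routes reasoning models through responses.create, leaves every other model on chat.completions.create, and uses the new helpers to normalize both response shapes back into a single result. A condensed sketch of that dispatch follows; the call_openai wrapper name is hypothetical and the sketch drops the response_format, parallel-tool-call, and reasoning handling that the verbatim diff below carries.

# Hypothetical, simplified sketch of the dispatch added in llm.py (not the verbatim diff).
import openai

from backend.util.openai_responses import (
    convert_tools_to_responses_format,
    extract_responses_content,
    extract_responses_tool_calls,
    extract_usage,
    requires_responses_api,
)


async def call_openai(client: openai.AsyncOpenAI, model: str, prompt, tools, max_tokens: int):
    if requires_responses_api(model):
        # Reasoning models (o1, o3, ...) go through the Responses API.
        response = await client.responses.create(
            model=model,
            input=prompt,
            tools=convert_tools_to_responses_format(tools) if tools else None,
            max_output_tokens=max_tokens,
            store=False,
        )
        content = extract_responses_content(response)
        tool_calls = extract_responses_tool_calls(response)
        usage = extract_usage(response, True)
    else:
        # All other models stay on Chat Completions.
        response = await client.chat.completions.create(
            model=model,
            messages=prompt,
            tools=tools or openai.NOT_GIVEN,
            max_completion_tokens=max_tokens,
        )
        content = response.choices[0].message.content or ""
        tool_calls = response.choices[0].message.tool_calls
        usage = extract_usage(response, False)
    return content, tool_calls, usage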

View File

@@ -126,7 +126,6 @@ class PrintToConsoleBlock(Block):
            output_schema=PrintToConsoleBlock.Output,
            test_input={"text": "Hello, World!"},
            is_sensitive_action=True,
            disabled=True,  # Disabled per Nick Tindle's request (OPEN-3000)
            test_output=[
                ("output", "Hello, World!"),
                ("status", "printed"),

View File

@@ -32,6 +32,14 @@ from backend.data.model import (
 from backend.integrations.providers import ProviderName
 from backend.util import json
 from backend.util.logging import TruncatedLogger
+from backend.util.openai_responses import (
+    convert_tools_to_responses_format,
+    extract_responses_content,
+    extract_responses_reasoning,
+    extract_responses_tool_calls,
+    extract_usage,
+    requires_responses_api,
+)
 from backend.util.prompt import compress_context, estimate_token_count
 from backend.util.text import TextFormatter
@@ -659,38 +667,72 @@ async def llm_call(
     max_tokens = max(min(available_tokens, model_max_output, user_max), 1)
     if provider == "openai":
-        tools_param = tools if tools else openai.NOT_GIVEN
         oai_client = openai.AsyncOpenAI(api_key=credentials.api_key.get_secret_value())
-        response_format = None
-        parallel_tool_calls = get_parallel_tool_calls_param(
-            llm_model, parallel_tool_calls
-        )
+        # Check if this model requires the Responses API (reasoning models: o1, o3, etc.)
+        if requires_responses_api(llm_model.value):
+            # Use responses.create for reasoning models
+            tools_converted = (
+                convert_tools_to_responses_format(tools) if tools else None
+            )
-        if force_json_output:
-            response_format = {"type": "json_object"}
+            response = await oai_client.responses.create(
+                model=llm_model.value,
+                input=prompt,  # type: ignore
+                tools=tools_converted,  # type: ignore
+                max_output_tokens=max_tokens,
+                store=False,  # Don't persist conversations
+            )
-        response = await oai_client.chat.completions.create(
-            model=llm_model.value,
-            messages=prompt,  # type: ignore
-            response_format=response_format,  # type: ignore
-            max_completion_tokens=max_tokens,
-            tools=tools_param,  # type: ignore
-            parallel_tool_calls=parallel_tool_calls,
-        )
+            tool_calls = extract_responses_tool_calls(response)
+            reasoning = extract_responses_reasoning(response)
+            content = extract_responses_content(response)
+            prompt_tokens, completion_tokens = extract_usage(response, True)
-        tool_calls = extract_openai_tool_calls(response)
-        reasoning = extract_openai_reasoning(response)
+            return LLMResponse(
+                raw_response=response,
+                prompt=prompt,
+                response=content,
+                tool_calls=tool_calls,
+                prompt_tokens=prompt_tokens,
+                completion_tokens=completion_tokens,
+                reasoning=reasoning,
+            )
+        else:
+            # Use chat.completions.create for standard models
+            tools_param = tools if tools else openai.NOT_GIVEN
+            response_format = None
-        return LLMResponse(
-            raw_response=response.choices[0].message,
-            prompt=prompt,
-            response=response.choices[0].message.content or "",
-            tool_calls=tool_calls,
-            prompt_tokens=response.usage.prompt_tokens if response.usage else 0,
-            completion_tokens=response.usage.completion_tokens if response.usage else 0,
-            reasoning=reasoning,
-        )
+            parallel_tool_calls = get_parallel_tool_calls_param(
+                llm_model, parallel_tool_calls
+            )
+            if force_json_output:
+                response_format = {"type": "json_object"}
+            response = await oai_client.chat.completions.create(
+                model=llm_model.value,
+                messages=prompt,  # type: ignore
+                response_format=response_format,  # type: ignore
+                max_completion_tokens=max_tokens,
+                tools=tools_param,  # type: ignore
+                parallel_tool_calls=parallel_tool_calls,
+            )
+            tool_calls = extract_openai_tool_calls(response)
+            reasoning = extract_openai_reasoning(response)
+            return LLMResponse(
+                raw_response=response.choices[0].message,
+                prompt=prompt,
+                response=response.choices[0].message.content or "",
+                tool_calls=tool_calls,
+                prompt_tokens=response.usage.prompt_tokens if response.usage else 0,
+                completion_tokens=(
+                    response.usage.completion_tokens if response.usage else 0
+                ),
+                reasoning=reasoning,
+            )
     elif provider == "anthropic":
         an_tools = convert_openai_tool_fmt_to_anthropic(tools)

View File

@@ -0,0 +1,185 @@
"""Helpers for OpenAI Responses API migration.
This module provides utilities for conditionally using OpenAI's Responses API
instead of Chat Completions for reasoning models (o1, o3, etc.) that require it.
"""
from typing import Any
# Exact model identifiers that require the Responses API.
# Use exact matching to avoid false positives on future models.
# NOTE: Update this set when OpenAI releases new reasoning models.
REASONING_MODELS = frozenset(
{
# O1 family
"o1",
"o1-mini",
"o1-preview",
"o1-2024-12-17",
# O3 family
"o3",
"o3-mini",
"o3-2025-04-16",
"o3-mini-2025-01-31",
}
)


def requires_responses_api(model: str) -> bool:
    """Check if model requires the Responses API (exact match).

    Args:
        model: The model identifier string (e.g., "o3-mini", "gpt-4o")

    Returns:
        True if the model requires responses.create, False otherwise
    """
    return model in REASONING_MODELS


def convert_tools_to_responses_format(tools: list[dict] | None) -> list[dict]:
    """Convert Chat Completions tool format to Responses API format.

    The Responses API uses internally-tagged polymorphism (flatter structure)
    and functions are strict by default.

    Chat Completions format:
        {"type": "function", "function": {"name": "...", "parameters": {...}}}
    Responses API format:
        {"type": "function", "name": "...", "parameters": {...}}

    Args:
        tools: List of tools in Chat Completions format

    Returns:
        List of tools in Responses API format
    """
    if not tools:
        return []
    converted = []
    for tool in tools:
        if tool.get("type") == "function":
            func = tool.get("function", {})
            converted.append(
                {
                    "type": "function",
                    "name": func.get("name"),
                    "description": func.get("description"),
                    "parameters": func.get("parameters"),
                    # Note: strict=True is default in Responses API
                }
            )
        else:
            # Pass through non-function tools as-is
            converted.append(tool)
    return converted


def extract_responses_tool_calls(response: Any) -> list[dict] | None:
    """Extract tool calls from Responses API response.

    The Responses API returns tool calls as separate items in the output array
    with type="function_call".

    Args:
        response: The Responses API response object

    Returns:
        List of tool calls in a normalized format, or None if no tool calls
    """
    tool_calls = []
    for item in response.output:
        if getattr(item, "type", None) == "function_call":
            tool_calls.append(
                {
                    "id": item.call_id,
                    "type": "function",
                    "function": {
                        "name": item.name,
                        "arguments": item.arguments,
                    },
                }
            )
    return tool_calls if tool_calls else None


def extract_usage(response: Any, is_responses_api: bool) -> tuple[int, int]:
    """Extract token usage from either API response.

    The Responses API uses different field names for token counts:
    - Chat Completions: prompt_tokens, completion_tokens
    - Responses API: input_tokens, output_tokens

    Args:
        response: The API response object
        is_responses_api: True if response is from Responses API

    Returns:
        Tuple of (prompt_tokens, completion_tokens)
    """
    if not response.usage:
        return 0, 0
    if is_responses_api:
        # Responses API uses different field names
        return (
            getattr(response.usage, "input_tokens", 0),
            getattr(response.usage, "output_tokens", 0),
        )
    else:
        # Chat Completions API
        return (
            getattr(response.usage, "prompt_tokens", 0),
            getattr(response.usage, "completion_tokens", 0),
        )


def extract_responses_content(response: Any) -> str:
    """Extract text content from Responses API response.

    Args:
        response: The Responses API response object

    Returns:
        The text content from the response, or empty string if none
    """
    # The SDK provides a helper property
    if hasattr(response, "output_text"):
        return response.output_text or ""
    # Fallback: manually extract from output items
    for item in response.output:
        if getattr(item, "type", None) == "message":
            for content in getattr(item, "content", []):
                if getattr(content, "type", None) == "output_text":
                    return getattr(content, "text", "")
    return ""


def extract_responses_reasoning(response: Any) -> str | None:
    """Extract reasoning content from Responses API response.

    Reasoning models return their reasoning process in the response,
    which can be useful for debugging or display.

    Args:
        response: The Responses API response object

    Returns:
        The reasoning text, or None if not present
    """
    for item in response.output:
        if getattr(item, "type", None) == "reasoning":
            # Reasoning items may have summary or content
            summary = getattr(item, "summary", [])
            if summary:
                # Join summary items if present
                texts = []
                for s in summary:
                    if hasattr(s, "text"):
                        texts.append(s.text)
                if texts:
                    return "\n".join(texts)
    return None
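
As a quick illustration of the shapes these helpers normalize, here is a self-contained sketch that feeds them a stubbed Responses object; SimpleNamespace stands in for the SDK's response types, and the attribute names simply mirror the fields the helpers read above.

from types import SimpleNamespace

from backend.util.openai_responses import (
    extract_responses_content,
    extract_responses_tool_calls,
    extract_usage,
)

# Stubbed Responses API result: one function_call output item plus token usage.
stub = SimpleNamespace(
    output=[
        SimpleNamespace(
            type="function_call",
            call_id="call_1",
            name="get_weather",
            arguments='{"location": "Berlin"}',
        ),
    ],
    output_text="",  # no assistant text when the model only calls a tool
    usage=SimpleNamespace(input_tokens=42, output_tokens=7),
)

# Tool calls come back in the nested shape Chat Completions callers already expect.
assert extract_responses_tool_calls(stub) == [
    {
        "id": "call_1",
        "type": "function",
        "function": {"name": "get_weather", "arguments": '{"location": "Berlin"}'},
    }
]
assert extract_responses_content(stub) == ""
assert extract_usage(stub, True) == (42, 7)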

View File

@@ -0,0 +1,155 @@
"""Tests for OpenAI Responses API helpers."""
import pytest
from backend.util.openai_responses import (
REASONING_MODELS,
convert_tools_to_responses_format,
requires_responses_api,
)


class TestRequiresResponsesApi:
    """Tests for the requires_responses_api function."""

    def test_o1_models_require_responses_api(self):
        """O1 family models should require the Responses API."""
        assert requires_responses_api("o1") is True
        assert requires_responses_api("o1-mini") is True
        assert requires_responses_api("o1-preview") is True
        assert requires_responses_api("o1-2024-12-17") is True

    def test_o3_models_require_responses_api(self):
        """O3 family models should require the Responses API."""
        assert requires_responses_api("o3") is True
        assert requires_responses_api("o3-mini") is True
        assert requires_responses_api("o3-2025-04-16") is True
        assert requires_responses_api("o3-mini-2025-01-31") is True

    def test_gpt_models_do_not_require_responses_api(self):
        """GPT models should NOT require the Responses API."""
        assert requires_responses_api("gpt-4o") is False
        assert requires_responses_api("gpt-4o-mini") is False
        assert requires_responses_api("gpt-4-turbo") is False
        assert requires_responses_api("gpt-3.5-turbo") is False
        assert requires_responses_api("gpt-5") is False
        assert requires_responses_api("gpt-5-mini") is False

    def test_other_models_do_not_require_responses_api(self):
        """Other provider models should NOT require the Responses API."""
        assert requires_responses_api("claude-3-opus") is False
        assert requires_responses_api("llama-3.3-70b") is False
        assert requires_responses_api("gemini-pro") is False

    def test_empty_string_does_not_require_responses_api(self):
        """Empty string should not require the Responses API."""
        assert requires_responses_api("") is False

    def test_exact_matching_no_false_positives(self):
        """Should not match models that just start with 'o1' or 'o3'."""
        # These are hypothetical models that start with o1/o3 but aren't
        # actually reasoning models
        assert requires_responses_api("o1-turbo-hypothetical") is False
        assert requires_responses_api("o3-fast-hypothetical") is False
        assert requires_responses_api("o100") is False


class TestConvertToolsToResponsesFormat:
    """Tests for the convert_tools_to_responses_format function."""

    def test_empty_tools_returns_empty_list(self):
        """Empty or None tools should return empty list."""
        assert convert_tools_to_responses_format(None) == []
        assert convert_tools_to_responses_format([]) == []

    def test_converts_function_tool_format(self):
        """Should convert Chat Completions function format to Responses format."""
        chat_completions_tools = [
            {
                "type": "function",
                "function": {
                    "name": "get_weather",
                    "description": "Get the weather in a location",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "location": {"type": "string"},
                        },
                        "required": ["location"],
                    },
                },
            }
        ]
        result = convert_tools_to_responses_format(chat_completions_tools)
        assert len(result) == 1
        assert result[0]["type"] == "function"
        assert result[0]["name"] == "get_weather"
        assert result[0]["description"] == "Get the weather in a location"
        assert result[0]["parameters"] == {
            "type": "object",
            "properties": {
                "location": {"type": "string"},
            },
            "required": ["location"],
        }
        # Should not have nested "function" key
        assert "function" not in result[0]

    def test_handles_multiple_tools(self):
        """Should handle multiple tools."""
        chat_completions_tools = [
            {
                "type": "function",
                "function": {
                    "name": "tool_1",
                    "description": "First tool",
                    "parameters": {"type": "object", "properties": {}},
                },
            },
            {
                "type": "function",
                "function": {
                    "name": "tool_2",
                    "description": "Second tool",
                    "parameters": {"type": "object", "properties": {}},
                },
            },
        ]
        result = convert_tools_to_responses_format(chat_completions_tools)
        assert len(result) == 2
        assert result[0]["name"] == "tool_1"
        assert result[1]["name"] == "tool_2"

    def test_passes_through_non_function_tools(self):
        """Non-function tools should be passed through as-is."""
        tools = [{"type": "web_search", "config": {"enabled": True}}]
        result = convert_tools_to_responses_format(tools)
        assert result == tools


class TestReasoningModelsSet:
    """Tests for the REASONING_MODELS constant."""

    def test_reasoning_models_is_frozenset(self):
        """REASONING_MODELS should be a frozenset (immutable)."""
        assert isinstance(REASONING_MODELS, frozenset)

    def test_contains_expected_models(self):
        """Should contain all expected reasoning models."""
        expected = {
            "o1",
            "o1-mini",
            "o1-preview",
            "o1-2024-12-17",
            "o3",
            "o3-mini",
            "o3-2025-04-16",
            "o3-mini-2025-01-31",
        }
        assert expected.issubset(REASONING_MODELS)