cleanup: remove stray debug write in CodeAct function_calling; restore arg parser tests from main\n\nCo-authored-by: openhands <openhands@all-hands.dev>

merge: resolve conflicts with main
- Keep centralized model_features.py patterns and may_return_empty_reasoning() - Import may_return_empty_reasoning in CodeAct function_calling and remove debug traces - Fix argparse tests by aligning expected help output and removing conflict markers Co-authored-by: openhands <openhands@all-hands.dev>
2026-04-29 03:00:45 -04:00 · 2025-08-25 10:06:05 +00:00 · 2025-08-25 10:02:50 +00:00 · 2025-08-25 09:56:44 +00:00 · 2025-08-25 09:03:09 +00:00 · 2025-08-04 20:57:00 +02:00
4 changed files with 204 additions and 3 deletions
--- a/.openhands/TASKS.md
+++ b/.openhands/TASKS.md
@@ -0,0 +1,14 @@
+# Task List
+
+1. ✅ Fetch latest main and merge into working branch
+Merged origin/main with conflicts resolved
+2. ✅ Remove temporary debug prints and finalize logic
+Cleaned up debug prints in model_features and function_calling; centralized logic retained
+3. 🔄 Run focused unit tests for Grok empty reasoning behavior
+Run pytest ensuring repo-local package used
+4. ⏳ Run pre-commit hooks for backend
+Install hooks and run python pre-commit config
+5. ⏳ Create new branch for PR-to-PR and commit changes
+Create a new branch off oh-9809-merge-main to open PR-to-PR
+6. ⏳ Push branch and open PR-to-PR against PR #9809 head
+Target repo: claysauruswrecks/OpenHands, base branch: main; source: our branch in All-Hands-AI/OpenHands
--- a/openhands/agenthub/codeact_agent/function_calling.py
+++ b/openhands/agenthub/codeact_agent/function_calling.py
@@ -43,6 +43,7 @@ from openhands.events.action.agent import CondensationRequestAction
 from openhands.events.action.mcp import MCPAction
 from openhands.events.event import FileEditSource, FileReadSource
 from openhands.events.tool import ToolCallMetadata
+from openhands.llm.model_features import may_return_empty_reasoning
 from openhands.llm.tool_names import TASK_TRACKER_TOOL_NAME


@@ -293,10 +294,23 @@ def response_to_actions(
            )
            actions.append(action)
    else:
+        content = str(assistant_msg.content) if assistant_msg.content else ''
+
+        # Check if this is a model that may return empty responses while reasoning
+        model_name = str(getattr(response, 'model', ''))
+        is_reasoning_model = may_return_empty_reasoning(model_name)
+
+        # Ref: https://github.com/All-Hands-AI/OpenHands/pull/9809
+        # Don't wait for response if content is empty and it's Grok
+        # This prevents getting stuck when Grok models return empty content while thinking
+        # For certain reasoning models (e.g., Grok-4), the API may emit empty content
+        # while thinking. In that case, don't wait for more assistant content.
+        wait_for_response = not (is_reasoning_model and not content)
+
        actions.append(
            MessageAction(
-                content=str(assistant_msg.content) if assistant_msg.content else '',
-                wait_for_response=True,
+                content=content,
+                wait_for_response=wait_for_response,
            )
        )

--- a/openhands/llm/model_features.py
+++ b/openhands/llm/model_features.py
@@ -126,6 +126,18 @@ SUPPORTS_STOP_WORDS_FALSE_PATTERNS: list[str] = [
    'deepseek-r1-0528*',
 ]

+# Models that may return empty responses while performing internal reasoning
+# Use provider-qualified patterns to avoid false positives
+EMPTY_REASONING_RESPONSE_PATTERNS: list[str] = [
+    'xai/grok-4*',
+    '*/xai/grok-4*',
+]
+
+
+def may_return_empty_reasoning(model: str) -> bool:
+    matched = model_matches(model, EMPTY_REASONING_RESPONSE_PATTERNS)
+    return matched
+

 def get_features(model: str) -> ModelFeatures:
    return ModelFeatures(
--- a/tests/unit/agenthub/test_function_calling.py
+++ b/tests/unit/agenthub/test_function_calling.py
@@ -1,6 +1,7 @@
 """Test function calling module."""

 import json
+from typing import Any
 from unittest.mock import patch

 import pytest
@@ -14,11 +15,14 @@ from openhands.events.action import (
    FileEditAction,
    FileReadAction,
    IPythonRunCellAction,
+    MessageAction,
 )
 from openhands.events.event import FileEditSource, FileReadSource


-def create_mock_response(function_name: str, arguments: dict) -> ModelResponse:
+def create_mock_response(
+    function_name: str, arguments: dict[str, Any]
+) -> ModelResponse:
    """Helper function to create a mock response with a tool call."""
    return ModelResponse(
        id='mock-id',
@@ -272,3 +276,160 @@ def test_unexpected_argument_handling():
    # Verify the error message mentions the unexpected argument
    assert 'old_str_prefix' in str(exc_info.value)
    assert 'Unexpected argument' in str(exc_info.value)
+
+
+def test_message_action_empty_response_non_grok():
+    """Test that non-Grok models preserve original behavior for empty responses."""
+    # Create a response with no tool calls and empty content
+    response = ModelResponse(
+        id='mock-id',
+        model='gpt-4o',  # Non-Grok model
+        choices=[
+            {
+                'message': {
+                    'content': '',  # Empty content
+                    'role': 'assistant',
+                },
+                'index': 0,
+                'finish_reason': 'stop',
+            }
+        ],
+    )
+
+    actions = response_to_actions(response)
+    assert len(actions) == 1
+    assert isinstance(actions[0], MessageAction)
+    assert actions[0].content == ''
+    # Non-Grok models should still wait for response even with empty content (original behavior)
+    assert actions[0].wait_for_response is True
+
+
+def test_message_action_empty_response_grok():
+    """Test that Grok models don't wait for response when content is empty."""
+    # Create a response with no tool calls and empty content
+    response = ModelResponse(
+        id='mock-id',
+        model='xai/grok-4-0709',  # Grok model
+        choices=[
+            {
+                'message': {
+                    'content': '',  # Empty content
+                    'role': 'assistant',
+                },
+                'index': 0,
+                'finish_reason': 'stop',
+            }
+        ],
+    )
+
+    actions = response_to_actions(response)
+    assert len(actions) == 1
+    assert isinstance(actions[0], MessageAction)
+    assert actions[0].content == ''
+    # Grok models should NOT wait for response when content is empty (new behavior)
+    assert actions[0].wait_for_response is False
+
+
+def test_message_action_non_empty_response_grok():
+    """Test that Grok models still wait for response when content is non-empty."""
+    # Create a response with no tool calls but with content
+    response = ModelResponse(
+        id='mock-id',
+        model='xai/grok-4-0709',  # Grok model
+        choices=[
+            {
+                'message': {
+                    'content': 'Here is my response.',  # Non-empty content
+                    'role': 'assistant',
+                },
+                'index': 0,
+                'finish_reason': 'stop',
+            }
+        ],
+    )
+
+    actions = response_to_actions(response)
+    assert len(actions) == 1
+    assert isinstance(actions[0], MessageAction)
+    assert actions[0].content == 'Here is my response.'
+    # Grok models should wait for response when there's actual content
+    assert actions[0].wait_for_response is True
+
+
+@pytest.mark.parametrize(
+    'model_name',
+    [
+        'xai/grok-4-0709',
+        'litellm_proxy/xai/grok-4-0709',
+        'openrouter/xai/grok-4-0709',
+    ],
+)
+def test_message_action_grok_model_variants(model_name):
+    """Test that different Grok model name formats are handled correctly."""
+    response = ModelResponse(
+        id='mock-id',
+        model=model_name,
+        choices=[
+            {
+                'message': {
+                    'content': '',  # Empty content
+                    'role': 'assistant',
+                },
+                'index': 0,
+                'finish_reason': 'stop',
+            }
+        ],
+    )
+
+    actions = response_to_actions(response)
+    assert len(actions) == 1
+    assert isinstance(actions[0], MessageAction)
+    assert actions[0].content == ''
+    # All Grok model variants should NOT wait for response when content is empty
+    assert actions[0].wait_for_response is False
+
+
+@pytest.mark.parametrize(
+    'model_name',
+    [
+        'grok-lite',  # Similar name but not the specific Grok-4 model
+        'xai/other-model',  # Same provider but different model
+        'gpt-4-grok-mode',  # Contains 'grok' but not the actual model
+    ],
+)
+def test_message_action_similar_model_names(model_name):
+    """Test that models with similar names to Grok are not affected."""
+    response = ModelResponse(
+        id='mock-id',
+        model=model_name,
+        choices=[
+            {
+                'message': {
+                    'content': '',  # Empty content
+                    'role': 'assistant',
+                },
+                'index': 0,
+                'finish_reason': 'stop',
+            }
+        ],
+    )
+
+    actions = response_to_actions(response)
+    assert len(actions) == 1
+    assert isinstance(actions[0], MessageAction)
+    assert actions[0].content == ''
+    # Non-Grok models should maintain original behavior
+    assert actions[0].wait_for_response is True
+
+
+def test_message_action_with_tool_calls_not_affected():
+    """Test that the empty response handling doesn't affect tool calls."""
+    # This should not be affected by the empty response logic since it has tool calls
+    response = create_mock_response('execute_bash', {'command': 'ls -la'})
+    response.model = 'xai/grok-4-0709'  # Add model attribute
+
+    actions = response_to_actions(response)
+    assert len(actions) == 1
+    assert isinstance(actions[0], CmdRunAction)
+    # Tool calls don't have wait_for_response attribute
+    assert not hasattr(actions[0], 'wait_for_response')
Author	SHA1	Message	Date
enyst	8fbdff7fa2	cleanup: remove stray debug write in CodeAct function_calling; restore arg parser tests from main\n\nCo-authored-by: openhands <openhands@all-hands.dev>	2025-08-25 10:06:05 +00:00
enyst	ea86d96d76	merge: resolve conflicts with main - Keep centralized model_features.py patterns and may_return_empty_reasoning() - Import may_return_empty_reasoning in CodeAct function_calling and remove debug traces - Fix argparse tests by aligning expected help output and removing conflict markers Co-authored-by: openhands <openhands@all-hands.dev>	2025-08-25 10:02:50 +00:00
enyst	738a56f0b7	codeact: handle Grok-4 empty reasoning via centralized feature check; clean up debug traces - Use openhands.llm.model_features.may_return_empty_reasoning(model) - Ensure MessageAction(wait_for_response=False) when Grok-4 returns empty content - Remove temporary debug prints Co-authored-by: openhands <openhands@all-hands.dev>	2025-08-25 09:56:44 +00:00
enyst	f50f8a1076	Merge main into PR branch and refactor: move Grok empty reasoning model check into model_features.py and resolve conflicts in function_calling and arg parser tests\n\n- Introduce may_return_empty_reasoning() in openhands.llm.model_features\n- Use it in function_calling.py instead of importing constants from llm.py\n- Resolve merge conflicts by adopting centralized model feature checks\n- Align test_arg_parser expected help output with current argparse formatting\n\nCo-authored-by: openhands <openhands@all-hands.dev>	2025-08-25 09:03:09 +00:00
Engel Nyst	e6cb9a6433	Merge branch 'main' into main	2025-08-04 20:57:00 +02:00
mamoodi	ce8ec961cd	Merge branch 'main' into main	2025-08-04 14:35:43 -04:00
Engel Nyst	f1b4f58ba5	Merge branch 'main' into main	2025-07-31 17:23:34 +02:00
mamoodi	e5922889f5	Merge branch 'main' into main	2025-07-31 11:00:08 -04:00
mamoodi	fd0204ebb5	Merge branch 'main' into main	2025-07-31 10:07:17 -04:00
Engel Nyst	772115ce50	Update openhands/agenthub/codeact_agent/function_calling.py	2025-07-20 22:32:38 +02:00
Engel Nyst	353235aa68	Update openhands/agenthub/codeact_agent/function_calling.py	2025-07-20 22:32:09 +02:00
Engel Nyst	9ebe551619	Update openhands/agenthub/codeact_agent/function_calling.py	2025-07-20 22:31:41 +02:00
Engel Nyst	c219c80701	Merge branch 'main' into main	2025-07-20 22:22:00 +02:00
claysauruswrecks	b4fc3c4e45	add all grok4 names	2025-07-19 15:08:08 -07:00
claysauruswrecks	5b43bb350c	lint fix	2025-07-19 15:03:35 -07:00
claysauruswrecks	6daa2ef0c9	remove html decoding	2025-07-19 14:53:53 -07:00
claysauruswrecks	57c7795b93	add html decode test	2025-07-19 14:45:42 -07:00
claysauruswrecks	d6a2ae0e7f	lint fix	2025-07-19 14:31:50 -07:00
claysauruswrecks	6e317f9972	Fix test_help_message to match current argparse output format Updated expected help message format in test_arg_parser.py to match the more compact argparse output that uses '-d, --directory DIRECTORY' instead of the older '-d DIRECTORY, --directory DIRECTORY' format.	2025-07-19 14:04:45 -07:00
claysauruswrecks	1fc01b5f29	Fix TypeError in function_calling.py when model name is not a string When using Mock objects in tests, the model attribute might not be a string, causing a TypeError when checking if it contains any reasoning models. This fix ensures model_name is always converted to a string before the check.	2025-07-19 14:00:40 -07:00
claysauruswrecks	0107ebc95f	fix grok 4 empty tokens after reasoning	2025-07-19 13:19:13 -07:00