mirror of
https://github.com/All-Hands-AI/OpenHands.git
synced 2026-04-29 03:00:45 -04:00
Compare commits
21 Commits
openhands/
...
oh-9809-pr
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
8fbdff7fa2 | ||
|
|
ea86d96d76 | ||
|
|
738a56f0b7 | ||
|
|
f50f8a1076 | ||
|
|
e6cb9a6433 | ||
|
|
ce8ec961cd | ||
|
|
f1b4f58ba5 | ||
|
|
e5922889f5 | ||
|
|
fd0204ebb5 | ||
|
|
772115ce50 | ||
|
|
353235aa68 | ||
|
|
9ebe551619 | ||
|
|
c219c80701 | ||
|
|
b4fc3c4e45 | ||
|
|
5b43bb350c | ||
|
|
6daa2ef0c9 | ||
|
|
57c7795b93 | ||
|
|
d6a2ae0e7f | ||
|
|
6e317f9972 | ||
|
|
1fc01b5f29 | ||
|
|
0107ebc95f |
14
.openhands/TASKS.md
Normal file
14
.openhands/TASKS.md
Normal file
@@ -0,0 +1,14 @@
|
||||
# Task List
|
||||
|
||||
1. ✅ Fetch latest main and merge into working branch
|
||||
Merged origin/main with conflicts resolved
|
||||
2. ✅ Remove temporary debug prints and finalize logic
|
||||
Cleaned up debug prints in model_features and function_calling; centralized logic retained
|
||||
3. 🔄 Run focused unit tests for Grok empty reasoning behavior
|
||||
Run pytest ensuring repo-local package used
|
||||
4. ⏳ Run pre-commit hooks for backend
|
||||
Install hooks and run python pre-commit config
|
||||
5. ⏳ Create new branch for PR-to-PR and commit changes
|
||||
Create a new branch off oh-9809-merge-main to open PR-to-PR
|
||||
6. ⏳ Push branch and open PR-to-PR against PR #9809 head
|
||||
Target repo: claysauruswrecks/OpenHands, base branch: main; source: our branch in All-Hands-AI/OpenHands
|
||||
@@ -43,6 +43,7 @@ from openhands.events.action.agent import CondensationRequestAction
|
||||
from openhands.events.action.mcp import MCPAction
|
||||
from openhands.events.event import FileEditSource, FileReadSource
|
||||
from openhands.events.tool import ToolCallMetadata
|
||||
from openhands.llm.model_features import may_return_empty_reasoning
|
||||
from openhands.llm.tool_names import TASK_TRACKER_TOOL_NAME
|
||||
|
||||
|
||||
@@ -293,10 +294,23 @@ def response_to_actions(
|
||||
)
|
||||
actions.append(action)
|
||||
else:
|
||||
content = str(assistant_msg.content) if assistant_msg.content else ''
|
||||
|
||||
# Check if this is a model that may return empty responses while reasoning
|
||||
model_name = str(getattr(response, 'model', ''))
|
||||
is_reasoning_model = may_return_empty_reasoning(model_name)
|
||||
|
||||
# Ref: https://github.com/All-Hands-AI/OpenHands/pull/9809
|
||||
# Don't wait for response if content is empty and it's Grok
|
||||
# This prevents getting stuck when Grok models return empty content while thinking
|
||||
# For certain reasoning models (e.g., Grok-4), the API may emit empty content
|
||||
# while thinking. In that case, don't wait for more assistant content.
|
||||
wait_for_response = not (is_reasoning_model and not content)
|
||||
|
||||
actions.append(
|
||||
MessageAction(
|
||||
content=str(assistant_msg.content) if assistant_msg.content else '',
|
||||
wait_for_response=True,
|
||||
content=content,
|
||||
wait_for_response=wait_for_response,
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
@@ -126,6 +126,18 @@ SUPPORTS_STOP_WORDS_FALSE_PATTERNS: list[str] = [
|
||||
'deepseek-r1-0528*',
|
||||
]
|
||||
|
||||
# Models that may return empty responses while performing internal reasoning
|
||||
# Use provider-qualified patterns to avoid false positives
|
||||
EMPTY_REASONING_RESPONSE_PATTERNS: list[str] = [
|
||||
'xai/grok-4*',
|
||||
'*/xai/grok-4*',
|
||||
]
|
||||
|
||||
|
||||
def may_return_empty_reasoning(model: str) -> bool:
|
||||
matched = model_matches(model, EMPTY_REASONING_RESPONSE_PATTERNS)
|
||||
return matched
|
||||
|
||||
|
||||
def get_features(model: str) -> ModelFeatures:
|
||||
return ModelFeatures(
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
"""Test function calling module."""
|
||||
|
||||
import json
|
||||
from typing import Any
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
@@ -14,11 +15,14 @@ from openhands.events.action import (
|
||||
FileEditAction,
|
||||
FileReadAction,
|
||||
IPythonRunCellAction,
|
||||
MessageAction,
|
||||
)
|
||||
from openhands.events.event import FileEditSource, FileReadSource
|
||||
|
||||
|
||||
def create_mock_response(function_name: str, arguments: dict) -> ModelResponse:
|
||||
def create_mock_response(
|
||||
function_name: str, arguments: dict[str, Any]
|
||||
) -> ModelResponse:
|
||||
"""Helper function to create a mock response with a tool call."""
|
||||
return ModelResponse(
|
||||
id='mock-id',
|
||||
@@ -272,3 +276,160 @@ def test_unexpected_argument_handling():
|
||||
# Verify the error message mentions the unexpected argument
|
||||
assert 'old_str_prefix' in str(exc_info.value)
|
||||
assert 'Unexpected argument' in str(exc_info.value)
|
||||
|
||||
|
||||
def test_message_action_empty_response_non_grok():
|
||||
"""Test that non-Grok models preserve original behavior for empty responses."""
|
||||
# Create a response with no tool calls and empty content
|
||||
response = ModelResponse(
|
||||
id='mock-id',
|
||||
model='gpt-4o', # Non-Grok model
|
||||
choices=[
|
||||
{
|
||||
'message': {
|
||||
'content': '', # Empty content
|
||||
'role': 'assistant',
|
||||
},
|
||||
'index': 0,
|
||||
'finish_reason': 'stop',
|
||||
}
|
||||
],
|
||||
)
|
||||
|
||||
actions = response_to_actions(response)
|
||||
assert len(actions) == 1
|
||||
assert isinstance(actions[0], MessageAction)
|
||||
assert actions[0].content == ''
|
||||
# Non-Grok models should still wait for response even with empty content (original behavior)
|
||||
assert actions[0].wait_for_response is True
|
||||
|
||||
|
||||
def test_message_action_empty_response_grok():
|
||||
"""Test that Grok models don't wait for response when content is empty."""
|
||||
# Create a response with no tool calls and empty content
|
||||
response = ModelResponse(
|
||||
id='mock-id',
|
||||
model='xai/grok-4-0709', # Grok model
|
||||
choices=[
|
||||
{
|
||||
'message': {
|
||||
'content': '', # Empty content
|
||||
'role': 'assistant',
|
||||
},
|
||||
'index': 0,
|
||||
'finish_reason': 'stop',
|
||||
}
|
||||
],
|
||||
)
|
||||
|
||||
actions = response_to_actions(response)
|
||||
assert len(actions) == 1
|
||||
assert isinstance(actions[0], MessageAction)
|
||||
assert actions[0].content == ''
|
||||
# Grok models should NOT wait for response when content is empty (new behavior)
|
||||
assert actions[0].wait_for_response is False
|
||||
|
||||
|
||||
def test_message_action_non_empty_response_grok():
|
||||
"""Test that Grok models still wait for response when content is non-empty."""
|
||||
# Create a response with no tool calls but with content
|
||||
response = ModelResponse(
|
||||
id='mock-id',
|
||||
model='xai/grok-4-0709', # Grok model
|
||||
choices=[
|
||||
{
|
||||
'message': {
|
||||
'content': 'Here is my response.', # Non-empty content
|
||||
'role': 'assistant',
|
||||
},
|
||||
'index': 0,
|
||||
'finish_reason': 'stop',
|
||||
}
|
||||
],
|
||||
)
|
||||
|
||||
actions = response_to_actions(response)
|
||||
assert len(actions) == 1
|
||||
assert isinstance(actions[0], MessageAction)
|
||||
assert actions[0].content == 'Here is my response.'
|
||||
# Grok models should wait for response when there's actual content
|
||||
assert actions[0].wait_for_response is True
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
'model_name',
|
||||
[
|
||||
'xai/grok-4-0709',
|
||||
'litellm_proxy/xai/grok-4-0709',
|
||||
'openrouter/xai/grok-4-0709',
|
||||
],
|
||||
)
|
||||
def test_message_action_grok_model_variants(model_name):
|
||||
"""Test that different Grok model name formats are handled correctly."""
|
||||
response = ModelResponse(
|
||||
id='mock-id',
|
||||
model=model_name,
|
||||
choices=[
|
||||
{
|
||||
'message': {
|
||||
'content': '', # Empty content
|
||||
'role': 'assistant',
|
||||
},
|
||||
'index': 0,
|
||||
'finish_reason': 'stop',
|
||||
}
|
||||
],
|
||||
)
|
||||
|
||||
actions = response_to_actions(response)
|
||||
assert len(actions) == 1
|
||||
assert isinstance(actions[0], MessageAction)
|
||||
assert actions[0].content == ''
|
||||
# All Grok model variants should NOT wait for response when content is empty
|
||||
assert actions[0].wait_for_response is False
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
'model_name',
|
||||
[
|
||||
'grok-lite', # Similar name but not the specific Grok-4 model
|
||||
'xai/other-model', # Same provider but different model
|
||||
'gpt-4-grok-mode', # Contains 'grok' but not the actual model
|
||||
],
|
||||
)
|
||||
def test_message_action_similar_model_names(model_name):
|
||||
"""Test that models with similar names to Grok are not affected."""
|
||||
response = ModelResponse(
|
||||
id='mock-id',
|
||||
model=model_name,
|
||||
choices=[
|
||||
{
|
||||
'message': {
|
||||
'content': '', # Empty content
|
||||
'role': 'assistant',
|
||||
},
|
||||
'index': 0,
|
||||
'finish_reason': 'stop',
|
||||
}
|
||||
],
|
||||
)
|
||||
|
||||
actions = response_to_actions(response)
|
||||
assert len(actions) == 1
|
||||
assert isinstance(actions[0], MessageAction)
|
||||
assert actions[0].content == ''
|
||||
# Non-Grok models should maintain original behavior
|
||||
assert actions[0].wait_for_response is True
|
||||
|
||||
|
||||
def test_message_action_with_tool_calls_not_affected():
|
||||
"""Test that the empty response handling doesn't affect tool calls."""
|
||||
# This should not be affected by the empty response logic since it has tool calls
|
||||
response = create_mock_response('execute_bash', {'command': 'ls -la'})
|
||||
response.model = 'xai/grok-4-0709' # Add model attribute
|
||||
|
||||
actions = response_to_actions(response)
|
||||
assert len(actions) == 1
|
||||
assert isinstance(actions[0], CmdRunAction)
|
||||
# Tool calls don't have wait_for_response attribute
|
||||
assert not hasattr(actions[0], 'wait_for_response')
|
||||
|
||||
Reference in New Issue
Block a user