Compare commits

...

21 Commits

Author SHA1 Message Date
enyst
8fbdff7fa2 cleanup: remove stray debug write in CodeAct function_calling; restore arg parser tests from main\n\nCo-authored-by: openhands <openhands@all-hands.dev> 2025-08-25 10:06:05 +00:00
enyst
ea86d96d76 merge: resolve conflicts with main
- Keep centralized model_features.py patterns and may_return_empty_reasoning()
- Import may_return_empty_reasoning in CodeAct function_calling and remove debug traces
- Fix argparse tests by aligning expected help output and removing conflict markers

Co-authored-by: openhands <openhands@all-hands.dev>
2025-08-25 10:02:50 +00:00
enyst
738a56f0b7 codeact: handle Grok-4 empty reasoning via centralized feature check; clean up debug traces
- Use openhands.llm.model_features.may_return_empty_reasoning(model)
- Ensure MessageAction(wait_for_response=False) when Grok-4 returns empty content
- Remove temporary debug prints

Co-authored-by: openhands <openhands@all-hands.dev>
2025-08-25 09:56:44 +00:00
enyst
f50f8a1076 Merge main into PR branch and refactor: move Grok empty reasoning model check into model_features.py and resolve conflicts in function_calling and arg parser tests\n\n- Introduce may_return_empty_reasoning() in openhands.llm.model_features\n- Use it in function_calling.py instead of importing constants from llm.py\n- Resolve merge conflicts by adopting centralized model feature checks\n- Align test_arg_parser expected help output with current argparse formatting\n\nCo-authored-by: openhands <openhands@all-hands.dev> 2025-08-25 09:03:09 +00:00
Engel Nyst
e6cb9a6433 Merge branch 'main' into main 2025-08-04 20:57:00 +02:00
mamoodi
ce8ec961cd Merge branch 'main' into main 2025-08-04 14:35:43 -04:00
Engel Nyst
f1b4f58ba5 Merge branch 'main' into main 2025-07-31 17:23:34 +02:00
mamoodi
e5922889f5 Merge branch 'main' into main 2025-07-31 11:00:08 -04:00
mamoodi
fd0204ebb5 Merge branch 'main' into main 2025-07-31 10:07:17 -04:00
Engel Nyst
772115ce50 Update openhands/agenthub/codeact_agent/function_calling.py 2025-07-20 22:32:38 +02:00
Engel Nyst
353235aa68 Update openhands/agenthub/codeact_agent/function_calling.py 2025-07-20 22:32:09 +02:00
Engel Nyst
9ebe551619 Update openhands/agenthub/codeact_agent/function_calling.py 2025-07-20 22:31:41 +02:00
Engel Nyst
c219c80701 Merge branch 'main' into main 2025-07-20 22:22:00 +02:00
claysauruswrecks
b4fc3c4e45 add all grok4 names 2025-07-19 15:08:08 -07:00
claysauruswrecks
5b43bb350c lint fix 2025-07-19 15:03:35 -07:00
claysauruswrecks
6daa2ef0c9 remove html decoding 2025-07-19 14:53:53 -07:00
claysauruswrecks
57c7795b93 add html decode test 2025-07-19 14:45:42 -07:00
claysauruswrecks
d6a2ae0e7f lint fix 2025-07-19 14:31:50 -07:00
claysauruswrecks
6e317f9972 Fix test_help_message to match current argparse output format
Updated expected help message format in test_arg_parser.py to match
the more compact argparse output that uses '-d, --directory DIRECTORY'
instead of the older '-d DIRECTORY, --directory DIRECTORY' format.
2025-07-19 14:04:45 -07:00
claysauruswrecks
1fc01b5f29 Fix TypeError in function_calling.py when model name is not a string
When using Mock objects in tests, the model attribute might not be a string,
causing a TypeError when checking if it contains any reasoning models.
This fix ensures model_name is always converted to a string before the check.
2025-07-19 14:00:40 -07:00
claysauruswrecks
0107ebc95f fix grok 4 empty tokens after reasoning 2025-07-19 13:19:13 -07:00
4 changed files with 204 additions and 3 deletions

14
.openhands/TASKS.md Normal file
View File

@@ -0,0 +1,14 @@
# Task List
1. ✅ Fetch latest main and merge into working branch
Merged origin/main with conflicts resolved
2. ✅ Remove temporary debug prints and finalize logic
Cleaned up debug prints in model_features and function_calling; centralized logic retained
3. 🔄 Run focused unit tests for Grok empty reasoning behavior
Run pytest ensuring repo-local package used
4. ⏳ Run pre-commit hooks for backend
Install hooks and run python pre-commit config
5. ⏳ Create new branch for PR-to-PR and commit changes
Create a new branch off oh-9809-merge-main to open PR-to-PR
6. ⏳ Push branch and open PR-to-PR against PR #9809 head
Target repo: claysauruswrecks/OpenHands, base branch: main; source: our branch in All-Hands-AI/OpenHands

View File

@@ -43,6 +43,7 @@ from openhands.events.action.agent import CondensationRequestAction
from openhands.events.action.mcp import MCPAction
from openhands.events.event import FileEditSource, FileReadSource
from openhands.events.tool import ToolCallMetadata
from openhands.llm.model_features import may_return_empty_reasoning
from openhands.llm.tool_names import TASK_TRACKER_TOOL_NAME
@@ -293,10 +294,23 @@ def response_to_actions(
)
actions.append(action)
else:
content = str(assistant_msg.content) if assistant_msg.content else ''
# Check if this is a model that may return empty responses while reasoning
model_name = str(getattr(response, 'model', ''))
is_reasoning_model = may_return_empty_reasoning(model_name)
# Ref: https://github.com/All-Hands-AI/OpenHands/pull/9809
# Don't wait for response if content is empty and it's Grok
# This prevents getting stuck when Grok models return empty content while thinking
# For certain reasoning models (e.g., Grok-4), the API may emit empty content
# while thinking. In that case, don't wait for more assistant content.
wait_for_response = not (is_reasoning_model and not content)
actions.append(
MessageAction(
content=str(assistant_msg.content) if assistant_msg.content else '',
wait_for_response=True,
content=content,
wait_for_response=wait_for_response,
)
)

View File

@@ -126,6 +126,18 @@ SUPPORTS_STOP_WORDS_FALSE_PATTERNS: list[str] = [
'deepseek-r1-0528*',
]
# Models that may return empty responses while performing internal reasoning
# Use provider-qualified patterns to avoid false positives
EMPTY_REASONING_RESPONSE_PATTERNS: list[str] = [
'xai/grok-4*',
'*/xai/grok-4*',
]
def may_return_empty_reasoning(model: str) -> bool:
matched = model_matches(model, EMPTY_REASONING_RESPONSE_PATTERNS)
return matched
def get_features(model: str) -> ModelFeatures:
return ModelFeatures(

View File

@@ -1,6 +1,7 @@
"""Test function calling module."""
import json
from typing import Any
from unittest.mock import patch
import pytest
@@ -14,11 +15,14 @@ from openhands.events.action import (
FileEditAction,
FileReadAction,
IPythonRunCellAction,
MessageAction,
)
from openhands.events.event import FileEditSource, FileReadSource
def create_mock_response(function_name: str, arguments: dict) -> ModelResponse:
def create_mock_response(
function_name: str, arguments: dict[str, Any]
) -> ModelResponse:
"""Helper function to create a mock response with a tool call."""
return ModelResponse(
id='mock-id',
@@ -272,3 +276,160 @@ def test_unexpected_argument_handling():
# Verify the error message mentions the unexpected argument
assert 'old_str_prefix' in str(exc_info.value)
assert 'Unexpected argument' in str(exc_info.value)
def test_message_action_empty_response_non_grok():
"""Test that non-Grok models preserve original behavior for empty responses."""
# Create a response with no tool calls and empty content
response = ModelResponse(
id='mock-id',
model='gpt-4o', # Non-Grok model
choices=[
{
'message': {
'content': '', # Empty content
'role': 'assistant',
},
'index': 0,
'finish_reason': 'stop',
}
],
)
actions = response_to_actions(response)
assert len(actions) == 1
assert isinstance(actions[0], MessageAction)
assert actions[0].content == ''
# Non-Grok models should still wait for response even with empty content (original behavior)
assert actions[0].wait_for_response is True
def test_message_action_empty_response_grok():
"""Test that Grok models don't wait for response when content is empty."""
# Create a response with no tool calls and empty content
response = ModelResponse(
id='mock-id',
model='xai/grok-4-0709', # Grok model
choices=[
{
'message': {
'content': '', # Empty content
'role': 'assistant',
},
'index': 0,
'finish_reason': 'stop',
}
],
)
actions = response_to_actions(response)
assert len(actions) == 1
assert isinstance(actions[0], MessageAction)
assert actions[0].content == ''
# Grok models should NOT wait for response when content is empty (new behavior)
assert actions[0].wait_for_response is False
def test_message_action_non_empty_response_grok():
"""Test that Grok models still wait for response when content is non-empty."""
# Create a response with no tool calls but with content
response = ModelResponse(
id='mock-id',
model='xai/grok-4-0709', # Grok model
choices=[
{
'message': {
'content': 'Here is my response.', # Non-empty content
'role': 'assistant',
},
'index': 0,
'finish_reason': 'stop',
}
],
)
actions = response_to_actions(response)
assert len(actions) == 1
assert isinstance(actions[0], MessageAction)
assert actions[0].content == 'Here is my response.'
# Grok models should wait for response when there's actual content
assert actions[0].wait_for_response is True
@pytest.mark.parametrize(
'model_name',
[
'xai/grok-4-0709',
'litellm_proxy/xai/grok-4-0709',
'openrouter/xai/grok-4-0709',
],
)
def test_message_action_grok_model_variants(model_name):
"""Test that different Grok model name formats are handled correctly."""
response = ModelResponse(
id='mock-id',
model=model_name,
choices=[
{
'message': {
'content': '', # Empty content
'role': 'assistant',
},
'index': 0,
'finish_reason': 'stop',
}
],
)
actions = response_to_actions(response)
assert len(actions) == 1
assert isinstance(actions[0], MessageAction)
assert actions[0].content == ''
# All Grok model variants should NOT wait for response when content is empty
assert actions[0].wait_for_response is False
@pytest.mark.parametrize(
'model_name',
[
'grok-lite', # Similar name but not the specific Grok-4 model
'xai/other-model', # Same provider but different model
'gpt-4-grok-mode', # Contains 'grok' but not the actual model
],
)
def test_message_action_similar_model_names(model_name):
"""Test that models with similar names to Grok are not affected."""
response = ModelResponse(
id='mock-id',
model=model_name,
choices=[
{
'message': {
'content': '', # Empty content
'role': 'assistant',
},
'index': 0,
'finish_reason': 'stop',
}
],
)
actions = response_to_actions(response)
assert len(actions) == 1
assert isinstance(actions[0], MessageAction)
assert actions[0].content == ''
# Non-Grok models should maintain original behavior
assert actions[0].wait_for_response is True
def test_message_action_with_tool_calls_not_affected():
"""Test that the empty response handling doesn't affect tool calls."""
# This should not be affected by the empty response logic since it has tool calls
response = create_mock_response('execute_bash', {'command': 'ls -la'})
response.model = 'xai/grok-4-0709' # Add model attribute
actions = response_to_actions(response)
assert len(actions) == 1
assert isinstance(actions[0], CmdRunAction)
# Tool calls don't have wait_for_response attribute
assert not hasattr(actions[0], 'wait_for_response')