fix(anthropic): disable extended thinking for Opus 4.1 (#10532)

Co-authored-by: openhands <openhands@all-hands.dev>
This commit is contained in:
Engel Nyst
2025-08-21 00:13:15 +02:00
committed by GitHub
parent 5ce5469bfa
commit 7861c1ddf7
4 changed files with 33 additions and 6 deletions

View File

@@ -1204,7 +1204,7 @@ def test_gemini_medium_reasoning_effort_passes_through(mock_completion):
@patch('openhands.llm.llm.litellm_completion')
def test_opus_41_reasoning_pops_temperature_top_p(mock_completion):
def test_opus_41_keeps_temperature_top_p(mock_completion):
mock_completion.return_value = {
'choices': [{'message': {'content': 'ok'}}],
}
@@ -1217,7 +1217,8 @@ def test_opus_41_reasoning_pops_temperature_top_p(mock_completion):
llm = LLM(config, service_id='svc')
llm.completion(messages=[{'role': 'user', 'content': 'hi'}])
call_kwargs = mock_completion.call_args[1]
assert 'temperature' not in call_kwargs
assert call_kwargs.get('temperature') == 0.7
# Anthropic rejects both temperature and top_p together on Opus; we keep temperature and drop top_p
assert 'top_p' not in call_kwargs
@@ -1239,6 +1240,21 @@ def test_opus_4_keeps_temperature_top_p(mock_completion):
assert call_kwargs.get('top_p') == 0.9
@patch('openhands.llm.llm.litellm_completion')
def test_opus_41_disables_thinking(mock_completion):
    """Requests for Opus 4.1 must carry an explicit thinking-disabled flag.

    Patches the litellm completion call and checks that the LLM wrapper
    forwards ``thinking={'type': 'disabled'}`` for the Opus 4.1 model.
    """
    mock_completion.return_value = {'choices': [{'message': {'content': 'ok'}}]}

    cfg = LLMConfig(
        model='anthropic/claude-opus-4-1-20250805',
        api_key='k',
    )
    LLM(cfg, service_id='svc').completion(
        messages=[{'role': 'user', 'content': 'hi'}]
    )

    forwarded = mock_completion.call_args[1]
    assert forwarded.get('thinking') == {'type': 'disabled'}
@patch('openhands.llm.llm.litellm.get_model_info')
def test_is_caching_prompt_active_anthropic_prefixed(mock_get_model_info):
# Avoid external calls, but behavior shouldn't depend on model info

View File

@@ -199,7 +199,6 @@ def test_function_calling_models(model):
'gemini-2.5-flash',
'gemini-2.5-pro',
'gpt-5',
'claude-opus-4-1-20250805',
],
)
def test_reasoning_effort_models(model):
@@ -230,7 +229,6 @@ def test_deepseek_reasoning_effort_models(model):
'claude-3-haiku-20240307',
'claude-3-opus-20240229',
'claude-sonnet-4-latest',
'claude-opus-4-1-20250805',
],
)
def test_prompt_cache_models(model):
@@ -254,7 +252,7 @@ def test_prompt_cache_models(model):
('gemini-2.5-pro', True),
('gpt-5', True),
('gpt-5-2025-08-07', True),
('claude-opus-4-1-20250805', True),
('claude-opus-4-1-20250805', False),
# DeepSeek
('deepseek/DeepSeek-R1-0528:671b-Q4_K_XL', True),
('DeepSeek-R1-0528', True),