tests(LLM): add concurrency and provider-mapping reinit tests; fix style

Co-authored-by: openhands <openhands@all-hands.dev>
style(tests): apply pre-commit formatting fixes
2026-04-29 03:00:45 -04:00 · 2025-08-14 16:26:02 +00:00 · 2025-08-14 05:27:36 +00:00 · 2025-08-14 05:24:38 +00:00 · 2025-08-14 05:18:38 +00:00 · 2025-08-14 03:31:27 +00:00
2 changed files with 458 additions and 6 deletions
@@ -3,6 +3,7 @@ import os
 import time
 import warnings
 from functools import partial
 from threading import RLock
 from typing import Any, Callable
 import httpx
@@ -142,6 +143,7 @@ class LLM(RetryMixin, DebugMixin):
            metrics: The metrics to use.
        """
        self._tried_model_info = False
        self._lock = RLock()
        self.metrics: Metrics = (
            metrics if metrics is not None else Metrics(model_name=config.model)
        )
@@ -157,11 +159,22 @@ class LLM(RetryMixin, DebugMixin):
                )
            os.makedirs(self.config.log_completions_folder, exist_ok=True)
-        # call init_model_info to initialize config.max_output_tokens
+        # Initialize core internals in a single pass
-        # which is used in partial function
+        self._initialize_core()
    def _initialize_core(self) -> None:
        """Initialize or re-initialize all components derived from config.
        This centralizes initialization to avoid duplication between __init__ and reinit().
        """
        # call init_model_info to initialize config.max_output_tokens used in partial
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            self.init_model_info()
        # Recompute function-calling capability regardless of model_info cache
        self._compute_function_calling_active()
        if self.vision_is_active():
            logger.debug('LLM: model has vision enabled')
        if self.is_caching_prompt_active():
@@ -175,6 +188,17 @@ class LLM(RetryMixin, DebugMixin):
        else:
            self.tokenizer = None
        # Initialize the completion function
        self._build_completion_function()
        # Build the completion wrapper with retry logic
        self._rebuild_completion_wrapper()
    def _build_completion_function(self) -> None:
        """Build the completion function based on current configuration.
        This method creates the partial function that will be used for LLM completions.
        It can be called multiple times to rebuild the function when configuration changes.
        """
        # set up the completion function
        kwargs: dict[str, Any] = {
            'temperature': self.config.temperature,
@@ -251,6 +275,78 @@ class LLM(RetryMixin, DebugMixin):
        self._completion_unwrapped = self._completion
    def reinit(self, new_config: LLMConfig) -> None:
        """Reinitialize the LLM with a new configuration.
        This is a threadsafe operation that updates configuration and rebuilds
        the completion pipeline (completion function + retry wrapper). It also
        refreshes model info and tokenizer as needed.
        """
        with self._lock:
            # Reset capability/cost flags so the new config gets a clean slate
            self.cost_metric_supported = True
            old_model = self.config.model
            old_tokenizer = self.config.custom_tokenizer
            # Update the configuration (deep copy to avoid external mutation)
            self.config = copy.deepcopy(new_config)
            # Update metrics model name if model changed and refresh model info
            if old_model != new_config.model:
                self.metrics.model_name = new_config.model
                logger.debug(
                    f'LLM model changed from {old_model} to {new_config.model}'
                )
                # Reset model info to force re-initialization
                self._tried_model_info = False
                self.model_info = None
                with warnings.catch_warnings():
                    warnings.simplefilter('ignore')
                    self.init_model_info()
                # Log new capabilities
                if self.vision_is_active():
                    logger.debug('LLM: model has vision enabled')
                if self.is_caching_prompt_active():
                    logger.debug('LLM: caching prompt enabled')
                if self.is_function_calling_active():
                    logger.debug('LLM: model supports function calling')
            # Update tokenizer if custom_tokenizer changed
            if old_tokenizer != new_config.custom_tokenizer:
                if new_config.custom_tokenizer is not None:
                    self.tokenizer = create_pretrained_tokenizer(
                        new_config.custom_tokenizer
                    )
                    logger.debug(
                        f'LLM tokenizer updated to {new_config.custom_tokenizer}'
                    )
                else:
                    self.tokenizer = None
                    logger.debug('LLM tokenizer reset to default')
            # Handle log completions folder creation if needed
            if new_config.log_completions:
                if new_config.log_completions_folder is None:
                    raise RuntimeError(
                        'log_completions_folder is required when log_completions is enabled'
                    )
                os.makedirs(new_config.log_completions_folder, exist_ok=True)
            # Re-initialize core internals (model info, tokenizer, completion & wrapper)
            self._initialize_core()
            logger.debug('LLM reinitialized successfully')
    # Backward-compat: keep update_config as an alias
    def update_config(self, new_config: LLMConfig) -> None:
        self.reinit(new_config)
    def _rebuild_completion_wrapper(self) -> None:
        """Rebuild the completion wrapper with retry decorator.
        This method recreates the wrapper function that includes retry logic
        and other processing around the base completion function.
        """
        @self.retry_decorator(
            num_retries=self.config.num_retries,
            retry_exceptions=LLM_RETRY_EXCEPTIONS,
@@ -553,14 +649,19 @@ class LLM(RetryMixin, DebugMixin):
                    self.config.max_output_tokens = self.model_info['max_tokens']
        # Initialize function calling capability
-        # Check if model name is in our supported list
+        self._compute_function_calling_active()
    def _compute_function_calling_active(self) -> None:
        """Compute and cache whether function calling is active for current config.
        Respects user override via native_tool_calling. Otherwise bases decision on
        supported model names.
        """
        model_name_supported = (
            self.config.model in FUNCTION_CALLING_SUPPORTED_MODELS
            or self.config.model.split('/')[-1] in FUNCTION_CALLING_SUPPORTED_MODELS
            or any(m in self.config.model for m in FUNCTION_CALLING_SUPPORTED_MODELS)
        )
        # Handle native_tool_calling user-defined configuration
        if self.config.native_tool_calling is None:
            self._function_calling_active = model_name_supported
        else:
@@ -1104,7 +1104,219 @@ def test_azure_model_default_max_tokens():
    assert llm.config.max_output_tokens is None  # Default value
-# Gemini Performance Optimization Tests
+def test_llm_update_config_basic(default_config):
    """Test basic LLM configuration update functionality."""
    llm = LLM(default_config)
    # Verify initial state
    assert llm.config.model == 'gpt-4o'
    assert llm.config.temperature == 0.0
    assert llm.metrics.model_name == 'gpt-4o'
    # Create new config with different settings
    new_config = LLMConfig(
        model='claude-3-5-sonnet-20241022',
        api_key='new_test_key',
        temperature=0.7,
        max_output_tokens=2000,
        top_p=0.9,
    )
    # Update the configuration
    llm.update_config(new_config)
    # Verify the configuration was updated
    assert llm.config.model == 'claude-3-5-sonnet-20241022'
    assert llm.config.api_key.get_secret_value() == 'new_test_key'
    assert llm.config.temperature == 0.7
    assert llm.config.max_output_tokens == 2000
    assert llm.config.top_p == 0.9
    assert llm.metrics.model_name == 'claude-3-5-sonnet-20241022'
 def test_llm_update_config_model_change_resets_model_info(default_config):
    """Test that changing model resets model info for re-initialization."""
    llm = LLM(default_config)
    # Set some model info to verify it gets reset
    llm.model_info = {'test': 'info'}
    llm._tried_model_info = True
    # Create new config with different model
    new_config = copy.deepcopy(default_config)
    new_config.model = 'claude-3-5-sonnet-20241022'
    # Update the configuration
    llm.update_config(new_config)
    # Verify model info was reset and metrics updated
    assert llm.metrics.model_name == 'claude-3-5-sonnet-20241022'
    # _tried_model_info should be reset to False to force re-initialization
    # (it will be set back to True after init_model_info() is called)
 def test_llm_update_config_same_model_preserves_model_info(default_config):
    """Test that keeping the same model preserves model info."""
    llm = LLM(default_config)
    # Create new config with same model but different other settings
    new_config = copy.deepcopy(default_config)
    new_config.temperature = 0.5
    new_config.max_output_tokens = 1500
    # Update the configuration
    llm.update_config(new_config)
    # Verify model info was preserved but other settings changed
    assert llm.config.temperature == 0.5
    assert llm.config.max_output_tokens == 1500
    assert llm.metrics.model_name == 'gpt-4o'  # Same model
 def test_llm_update_config_custom_tokenizer_update(default_config):
    """Test updating custom tokenizer configuration."""
    llm = LLM(default_config)
    # Initially no custom tokenizer
    assert llm.config.custom_tokenizer is None
    assert llm.tokenizer is None
    # Update config with custom tokenizer
    new_config = copy.deepcopy(default_config)
    new_config.custom_tokenizer = 'gpt2'
    llm.update_config(new_config)
    # Verify tokenizer was updated
    assert llm.config.custom_tokenizer == 'gpt2'
    assert llm.tokenizer is not None
    # Update back to no custom tokenizer
    new_config.custom_tokenizer = None
    llm.update_config(new_config)
    # Verify tokenizer was reset
    assert llm.config.custom_tokenizer is None
    assert llm.tokenizer is None
 def test_llm_update_config_log_completions_folder_creation(default_config):
    """Test that log completions folder is created when needed."""
    with tempfile.TemporaryDirectory() as temp_dir:
        log_folder = Path(temp_dir) / 'test_completions'
        llm = LLM(default_config)
        # Update config to enable log completions
        new_config = copy.deepcopy(default_config)
        new_config.log_completions = True
        new_config.log_completions_folder = str(log_folder)
        # Folder shouldn't exist yet
        assert not log_folder.exists()
        # Update configuration
        llm.update_config(new_config)
        # Verify folder was created
        assert log_folder.exists()
        assert log_folder.is_dir()
 def test_llm_update_config_log_completions_folder_required():
    """Test that log_completions_folder is required when log_completions is True."""
    config = LLMConfig(model='gpt-4o', api_key='test_key')
    llm = LLM(config)
    # Create config with log_completions=True but no folder
    new_config = copy.deepcopy(config)
    new_config.log_completions = True
    new_config.log_completions_folder = None
    # Should raise RuntimeError
    with pytest.raises(RuntimeError, match='log_completions_folder is required'):
        llm.update_config(new_config)
 def test_llm_update_config_completion_function_rebuilt(default_config):
    """Test that completion function is rebuilt after config update."""
    llm = LLM(default_config)
    # Store reference to original completion function
    # Update configuration
    new_config = copy.deepcopy(default_config)
    new_config.temperature = 0.8
    new_config.max_output_tokens = 1500
    llm.update_config(new_config)
    # Verify completion functions exist (they should be rebuilt)
    assert llm._completion is not None
    assert llm._completion_unwrapped is not None
    # The functions should be different objects since they were rebuilt
    # (though this is implementation detail, the important thing is they work)
    assert callable(llm._completion)
    assert callable(llm._completion_unwrapped)
 def test_llm_update_config_preserves_metrics_and_retry_listener(default_config):
    """Test that metrics and retry listener are preserved during config update."""
    # Create custom metrics and retry listener
    custom_metrics = Metrics(model_name='initial_model')
    retry_listener = MagicMock()
    llm = LLM(default_config, metrics=custom_metrics, retry_listener=retry_listener)
    # Verify initial state
    assert llm.metrics is custom_metrics
    assert llm.retry_listener is retry_listener
    # Update configuration
    new_config = copy.deepcopy(default_config)
    new_config.model = 'claude-3-5-sonnet-20241022'
    llm.update_config(new_config)
    # Verify metrics and retry listener are preserved
    assert llm.metrics is custom_metrics
    assert llm.retry_listener is retry_listener
    # But metrics model name should be updated
    assert llm.metrics.model_name == 'claude-3-5-sonnet-20241022'
 def test_llm_update_config_deep_copy_independence():
    """Test that config update uses deep copy and doesn't affect original config."""
    original_config = LLMConfig(
        model='gpt-4o',
        api_key='test_key',
        temperature=0.0,
    )
    llm = LLM(original_config)
    # Create new config
    new_config = LLMConfig(
        model='claude-3-5-sonnet-20241022',
        api_key='new_key',
        temperature=0.7,
    )
    # Update LLM config
    llm.update_config(new_config)
    # Modify the new_config after update
    new_config.temperature = 0.9
    new_config.model = 'different-model'
    # LLM config should not be affected by external changes
    assert llm.config.temperature == 0.7
    assert llm.config.model == 'claude-3-5-sonnet-20241022'
    # Original config should also be unchanged
    assert original_config.temperature == 0.0
    assert original_config.model == 'gpt-4o'
 def test_gemini_model_keeps_none_reasoning_effort():
@@ -1302,3 +1514,142 @@ def test_gemini_performance_optimization_end_to_end(mock_completion):
    # Verify temperature and top_p were removed for reasoning models
    assert 'temperature' not in call_kwargs
    assert 'top_p' not in call_kwargs
 def test_llm_reinit_basic(default_config):
    """Reinit should update config and metrics like update_config."""
    llm = LLM(default_config)
    assert llm.metrics.model_name == 'gpt-4o'
    new_config = LLMConfig(
        model='claude-3-5-sonnet-20241022',
        api_key='new_test_key',
        temperature=0.7,
        max_output_tokens=1234,
    )
    llm.reinit(new_config)
    assert llm.config.model == 'claude-3-5-sonnet-20241022'
    assert llm.config.api_key.get_secret_value() == 'new_test_key'
    assert llm.config.temperature == 0.7
    assert llm.config.max_output_tokens == 1234
    assert llm.metrics.model_name == 'claude-3-5-sonnet-20241022'
    assert callable(llm._completion)
    assert callable(llm._completion_unwrapped)
 def test_llm_reinit_recomputes_function_calling_capability():
    """Reinit should recompute function-calling capability even if model doesn't change."""
    base = LLMConfig(model='gpt-3.5-turbo', api_key='key')
    llm = LLM(base)
    # gpt-3.5-turbo is not in FUNCTION_CALLING_SUPPORTED_MODELS
    assert llm.is_function_calling_active() is False
    # Turn on native tool calling; same model
    cfg_on = copy.deepcopy(base)
    cfg_on.native_tool_calling = True
    llm.reinit(cfg_on)
    assert llm.is_function_calling_active() is True
    # Turn off explicitly
    cfg_off = copy.deepcopy(base)
    cfg_off.native_tool_calling = False
    llm.reinit(cfg_off)
    assert llm.is_function_calling_active() is False
 def test_llm_reinit_resets_cost_flag(default_config):
    """Reinit should reset cost_metric_supported so a new model can report cost."""
    llm = LLM(default_config)
    llm.cost_metric_supported = False
    # Same model, but reinit should reset the flag to True
    llm.reinit(copy.deepcopy(default_config))
    assert llm.cost_metric_supported is True
 def test_llm_reinit_thread_safety_with_inflight_completion(default_config):
    """Concurrent reinit during an in-flight completion should not raise,
    and subsequent completions should use the new config.
    """
    import threading
    import time
    from unittest.mock import patch
    llm = LLM(default_config)
    calls = []
    def fake_completion(*args, **kwargs):
        # Simulate provider latency and record the model used
        calls.append(kwargs.get('model'))
        time.sleep(0.2)
        return {
            'id': 'test-1',
            'choices': [{'message': {'content': 'ok'}}],
            'usage': {'prompt_tokens': 1, 'completion_tokens': 1},
        }
    with patch('openhands.llm.llm.litellm_completion') as mock_completion:
        mock_completion.side_effect = fake_completion
        # Start an in-flight completion with the initial model
        t = threading.Thread(
            target=llm.completion, args=([{'role': 'user', 'content': 'hi'}],)
        )
        t.start()
        # Reinit while the completion is in-flight
        time.sleep(0.05)
        new_cfg = copy.deepcopy(default_config)
        new_cfg.model = 'claude-3-5-sonnet-20241022'
        llm.reinit(new_cfg)
        t.join()
        # Subsequent completion should use the new config
        llm.completion(messages=[{'role': 'user', 'content': 'again'}])
        # Ensure the latest call used the new model and metrics reflect it
        assert len(calls) >= 1
        assert calls[-1] == 'claude-3-5-sonnet-20241022'
        assert llm.metrics.model_name == 'claude-3-5-sonnet-20241022'
@patch('openhands.llm.llm.litellm_completion')
 def test_llm_reinit_provider_mappings(mock_completion, default_config):
    """Reinit should apply provider-specific mappings (openhands proxy, azure max_tokens)."""
    # Return a minimal, valid response
    mock_completion.return_value = {
        'id': 'call',
        'choices': [{'message': {'content': 'ok'}}],
        'usage': {'prompt_tokens': 1, 'completion_tokens': 1},
    }
    llm = LLM(default_config)
    # 1) OpenHands provider rewrite to litellm_proxy
    cfg_proxy = copy.deepcopy(default_config)
    cfg_proxy.model = 'openhands/qwen3-coder'
    llm.reinit(cfg_proxy)
    llm.completion(messages=[{'role': 'user', 'content': 'x'}])
    model_arg = mock_completion.call_args.kwargs.get('model')
    base_url_arg = mock_completion.call_args.kwargs.get('base_url')
    assert model_arg == 'litellm_proxy/qwen3-coder'
    assert base_url_arg == 'https://llm-proxy.app.all-hands.dev/'
    # 2) Azure mapping: max_completion_tokens -> max_tokens
    mock_completion.reset_mock()
    cfg_azure = copy.deepcopy(default_config)
    cfg_azure.model = 'azure/gpt-4o'
    cfg_azure.max_output_tokens = 777
    cfg_azure.api_version = '2024-06-01'
    llm.reinit(cfg_azure)
    llm.completion(messages=[{'role': 'user', 'content': 'y'}])
    kwargs = mock_completion.call_args.kwargs
    assert kwargs.get('model') == 'azure/gpt-4o'
    assert 'max_tokens' in kwargs and kwargs['max_tokens'] == 777
    assert 'max_completion_tokens' not in kwargs
Author	SHA1	Message	Date
Engel Nyst	c8d856cfa5	tests(LLM): add concurrency and provider-mapping reinit tests; fix style Co-authored-by: openhands <openhands@all-hands.dev>	2025-08-14 16:26:02 +00:00
Engel Nyst	db5c0c687c	style(tests): apply pre-commit formatting fixes Co-authored-by: openhands <openhands@all-hands.dev>	2025-08-14 05:27:36 +00:00
Engel Nyst	82d5b0388b	tests(LLM): add reinit tests (basic, capability recompute, cost flag reset) Co-authored-by: openhands <openhands@all-hands.dev>	2025-08-14 05:24:38 +00:00
Engel Nyst	bf192688a5	LLM: add lock + reinit helper; recompute capabilities in centralized initializer; keep update_config alias Co-authored-by: openhands <openhands@all-hands.dev>	2025-08-14 05:18:38 +00:00
Engel Nyst	477d9b17c0	Merge branch 'main' into llm-init to resolve conflicts Co-authored-by: OpenHands-GPT-4.1 <openhands@all-hands.dev>	2025-08-14 03:31:27 +00:00
OpenHands Bot	6756427217	🤖 Auto-fix Python linting issues	2025-07-30 23:33:14 +00:00
Engel Nyst	1b74920509	Merge branch 'main' into llm-init	2025-07-31 01:31:31 +02:00
Engel Nyst	c933b88f36	Merge branch 'main' into llm-init	2025-07-30 02:01:52 +02:00
Engel Nyst	e0e9d3d07c	Merge branch 'main' into llm-init	2025-07-26 22:50:17 +02:00
Engel Nyst	00f9ff08c7	Merge branch 'main' into llm-init	2025-07-22 00:53:46 +02:00
Engel Nyst	f37f8fb723	Merge branch 'main' into llm-init	2025-07-21 01:57:07 +02:00
Engel Nyst	8628da0037	Refactor LLM class to support runtime configuration updates - Decouple partial function creation from LLM initialization - Extract _build_completion_function() method for reusable completion setup - Add update_config() method for hot-swapping LLM configuration at runtime - Add _rebuild_completion_wrapper() method to recreate retry decorator wrapper - Preserve metrics and retry listener instances during config updates - Handle model changes by resetting model info for re-initialization - Support custom tokenizer updates and log completion folder creation - Add comprehensive unit tests covering all update scenarios This prepares the LLM class for the upcoming unified configuration system that will enable runtime config reloading without restart. Co-authored-by: OpenHands <openhands@all-hands.dev>	2025-07-19 15:07:52 +02:00