Chore: clean up LLM (prompt caching, supports fn calling), leftover renames (#6095)

Engel Nyst
2025-02-01 18:14:08 +01:00
committed by GitHub
parent 3b0bbce54a
commit eb8d1600c3
21 changed files with 119 additions and 187 deletions

View File

@@ -219,7 +219,7 @@ jobs:
             exit 1
           fi
 
-  # Run unit tests with the EventStream runtime Docker images as root
+  # Run unit tests with the Docker runtime Docker images as root
   test_runtime_root:
     name: RT Unit Tests (Root)
     needs: [ghcr_build_runtime]
@@ -286,7 +286,7 @@ jobs:
           image_name=ghcr.io/${{ github.repository_owner }}/runtime:${{ env.RELEVANT_SHA }}-${{ matrix.base_image }}
           image_name=$(echo $image_name | tr '[:upper:]' '[:lower:]')
 
-          TEST_RUNTIME=eventstream \
+          TEST_RUNTIME=docker \
          SANDBOX_USER_ID=$(id -u) \
          SANDBOX_RUNTIME_CONTAINER_IMAGE=$image_name \
          TEST_IN_CI=true \
@@ -297,7 +297,7 @@ jobs:
         env:
           CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
 
-  # Run unit tests with the EventStream runtime Docker images as openhands user
+  # Run unit tests with the Docker runtime Docker images as openhands user
   test_runtime_oh:
     name: RT Unit Tests (openhands)
     runs-on: ubuntu-latest
@@ -363,7 +363,7 @@ jobs:
           image_name=ghcr.io/${{ github.repository_owner }}/runtime:${{ env.RELEVANT_SHA }}-${{ matrix.base_image }}
          image_name=$(echo $image_name | tr '[:upper:]' '[:lower:]')
 
-          TEST_RUNTIME=eventstream \
+          TEST_RUNTIME=docker \
          SANDBOX_USER_ID=$(id -u) \
          SANDBOX_RUNTIME_CONTAINER_IMAGE=$image_name \
          TEST_IN_CI=true \

View File

@@ -1,8 +1,8 @@
-# 📦 Runtime EventStream
+# 📦 Runtime Docker
 
-Le Runtime EventStream d'OpenHands est le composant principal qui permet l'exécution sécurisée et flexible des actions des agents d'IA.
+Le Runtime Docker d'OpenHands est le composant principal qui permet l'exécution sécurisée et flexible des actions des agents d'IA.
 Il crée un environnement en bac à sable (sandbox) en utilisant Docker, où du code arbitraire peut être exécuté en toute sécurité sans risquer le système hôte.
 
 ## Pourquoi avons-nous besoin d'un runtime en bac à sable ?

View File

@@ -163,7 +163,7 @@ Les options de configuration de base sont définies dans la section `[core]` du
 - `runtime`
   - Type : `str`
-  - Valeur par défaut : `"eventstream"`
+  - Valeur par défaut : `"docker"`
   - Description : Environnement d'exécution
 
 - `default_agent`

View File

@@ -114,7 +114,7 @@ Pour créer un workflow d'évaluation pour votre benchmark, suivez ces étapes :
 def get_config(instance: pd.Series, metadata: EvalMetadata) -> AppConfig:
     config = AppConfig(
         default_agent=metadata.agent_class,
-        runtime='eventstream',
+        runtime='docker',
         max_iterations=metadata.max_iterations,
         sandbox=SandboxConfig(
             base_container_image='your_container_image',

View File

@@ -1,8 +1,8 @@
 以下是翻译后的内容:
 
-# 📦 EventStream 运行时
+# 📦 Docker 运行时
 
-OpenHands EventStream 运行时是实现 AI 代理操作安全灵活执行的核心组件。
+OpenHands Docker 运行时是实现 AI 代理操作安全灵活执行的核心组件。
 它使用 Docker 创建一个沙盒环境,可以安全地运行任意代码而不会危及主机系统。
 
 ## 为什么我们需要沙盒运行时?

View File

@@ -162,7 +162,7 @@
 - `runtime`
   - 类型: `str`
-  - 默认值: `"eventstream"`
+  - 默认值: `"docker"`
   - 描述: 运行时环境
 
 - `default_agent`

View File

@@ -112,7 +112,7 @@ OpenHands 的主要入口点在 `openhands/core/main.py` 中。以下是它的
 def get_config(instance: pd.Series, metadata: EvalMetadata) -> AppConfig:
     config = AppConfig(
         default_agent=metadata.agent_class,
-        runtime='eventstream',
+        runtime='docker',
         max_iterations=metadata.max_iterations,
         sandbox=SandboxConfig(
             base_container_image='your_container_image',

View File

@@ -1,6 +1,6 @@
-# 📦 EventStream Runtime
+# 📦 Docker Runtime
 
-The OpenHands EventStream Runtime is the core component that enables secure and flexible execution of AI agent's action.
+The OpenHands Docker Runtime is the core component that enables secure and flexible execution of AI agent's action.
 It creates a sandboxed environment using Docker, where arbitrary code can be run safely without risking the host system.
 
 ## Why do we need a sandboxed runtime?

View File

@@ -126,7 +126,7 @@ The core configuration options are defined in the `[core]` section of the `confi
 - `runtime`
   - Type: `str`
-  - Default: `"eventstream"`
+  - Default: `"docker"`
   - Description: Runtime environment
 
 - `default_agent`

View File

@@ -112,7 +112,7 @@ To create an evaluation workflow for your benchmark, follow these steps:
 def get_config(instance: pd.Series, metadata: EvalMetadata) -> AppConfig:
     config = AppConfig(
         default_agent=metadata.agent_class,
-        runtime='eventstream',
+        runtime='docker',
         max_iterations=metadata.max_iterations,
         sandbox=SandboxConfig(
             base_container_image='your_container_image',

View File

@@ -98,12 +98,6 @@ class OperationCancelled(Exception):
         super().__init__(message)
 
 
-class CloudFlareBlockageError(Exception):
-    """Exception raised when a request is blocked by CloudFlare."""
-
-    pass
-
-
 # ============================================
 # LLM function calling Exceptions
 # ============================================
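With CloudFlareBlockageError removed, a Cloudflare-blocked request is no longer wrapped in a custom exception: it now surfaces as litellm's APIError straight from the completion call (see the llm.py hunk below). A minimal sketch of how a caller could still special-case it, modeled on the removed test further down; this is hypothetical caller code, not part of this commit:

from litellm.exceptions import APIError

from openhands.core.config import LLMConfig
from openhands.llm.llm import LLM

llm = LLM(LLMConfig(model='gpt-4o'))  # hypothetical config
try:
    resp = llm.completion(messages=[{'role': 'user', 'content': 'Hello'}])
except APIError as e:
    if 'Attention Required! | Cloudflare' in str(e):
        # handle the blockage directly; the dedicated exception type no longer exists
        ...
    raise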

View File

@@ -27,7 +27,6 @@ from litellm.exceptions import (
 from litellm.types.utils import CostPerToken, ModelResponse, Usage
 from litellm.utils import create_pretrained_tokenizer
 
-from openhands.core.exceptions import CloudFlareBlockageError
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.message import Message
 from openhands.llm.debug_mixin import DebugMixin
@@ -218,99 +217,86 @@ class LLM(RetryMixin, DebugMixin):
             # log the entire LLM prompt
             self.log_prompt(messages)
 
-            if self.is_caching_prompt_active():
-                # Anthropic-specific prompt caching
-                if 'claude-3' in self.config.model:
-                    kwargs['extra_headers'] = {
-                        'anthropic-beta': 'prompt-caching-2024-07-31',
-                    }
-
             # set litellm modify_params to the configured value
             # True by default to allow litellm to do transformations like adding a default message, when a message is empty
             # NOTE: this setting is global; unlike drop_params, it cannot be overridden in the litellm completion partial
             litellm.modify_params = self.config.modify_params
 
-            try:
-                # Record start time for latency measurement
-                start_time = time.time()
-
-                # we don't support streaming here, thus we get a ModelResponse
-                resp: ModelResponse = self._completion_unwrapped(*args, **kwargs)
-
-                # Calculate and record latency
-                latency = time.time() - start_time
-                response_id = resp.get('id', 'unknown')
-                self.metrics.add_response_latency(latency, response_id)
-
-                non_fncall_response = copy.deepcopy(resp)
-                if mock_function_calling:
-                    assert len(resp.choices) == 1
-                    assert mock_fncall_tools is not None
-                    non_fncall_response_message = resp.choices[0].message
-                    fn_call_messages_with_response = (
-                        convert_non_fncall_messages_to_fncall_messages(
-                            messages + [non_fncall_response_message], mock_fncall_tools
-                        )
-                    )
-                    fn_call_response_message = fn_call_messages_with_response[-1]
-                    if not isinstance(fn_call_response_message, LiteLLMMessage):
-                        fn_call_response_message = LiteLLMMessage(
-                            **fn_call_response_message
-                        )
-                    resp.choices[0].message = fn_call_response_message
-
-                message_back: str = resp['choices'][0]['message']['content'] or ''
-                tool_calls: list[ChatCompletionMessageToolCall] = resp['choices'][0][
-                    'message'
-                ].get('tool_calls', [])
-                if tool_calls:
-                    for tool_call in tool_calls:
-                        fn_name = tool_call.function.name
-                        fn_args = tool_call.function.arguments
-                        message_back += f'\nFunction call: {fn_name}({fn_args})'
-
-                # log the LLM response
-                self.log_response(message_back)
-
-                # post-process the response first to calculate cost
-                cost = self._post_completion(resp)
-
-                # log for evals or other scripts that need the raw completion
-                if self.config.log_completions:
-                    assert self.config.log_completions_folder is not None
-                    log_file = os.path.join(
-                        self.config.log_completions_folder,
-                        # use the metric model name (for draft editor)
-                        f'{self.metrics.model_name.replace("/", "__")}-{time.time()}.json',
-                    )
-
-                    # set up the dict to be logged
-                    _d = {
-                        'messages': messages,
-                        'response': resp,
-                        'args': args,
-                        'kwargs': {k: v for k, v in kwargs.items() if k != 'messages'},
-                        'timestamp': time.time(),
-                        'cost': cost,
-                    }
-
-                    # if non-native function calling, save messages/response separately
-                    if mock_function_calling:
-                        # Overwrite response as non-fncall to be consistent with messages
-                        _d['response'] = non_fncall_response
-
-                        # Save fncall_messages/response separately
-                        _d['fncall_messages'] = original_fncall_messages
-                        _d['fncall_response'] = resp
-                    with open(log_file, 'w') as f:
-                        f.write(json.dumps(_d))
-
-                return resp
-            except APIError as e:
-                if 'Attention Required! | Cloudflare' in str(e):
-                    raise CloudFlareBlockageError(
-                        'Request blocked by CloudFlare'
-                    ) from e
-                raise
+            # Record start time for latency measurement
+            start_time = time.time()
+
+            # we don't support streaming here, thus we get a ModelResponse
+            resp: ModelResponse = self._completion_unwrapped(*args, **kwargs)
+
+            # Calculate and record latency
+            latency = time.time() - start_time
+            response_id = resp.get('id', 'unknown')
+            self.metrics.add_response_latency(latency, response_id)
+
+            non_fncall_response = copy.deepcopy(resp)
+            if mock_function_calling:
+                assert len(resp.choices) == 1
+                assert mock_fncall_tools is not None
+                non_fncall_response_message = resp.choices[0].message
+                fn_call_messages_with_response = (
+                    convert_non_fncall_messages_to_fncall_messages(
+                        messages + [non_fncall_response_message], mock_fncall_tools
+                    )
+                )
+                fn_call_response_message = fn_call_messages_with_response[-1]
+                if not isinstance(fn_call_response_message, LiteLLMMessage):
+                    fn_call_response_message = LiteLLMMessage(
+                        **fn_call_response_message
+                    )
+                resp.choices[0].message = fn_call_response_message
+
+            message_back: str = resp['choices'][0]['message']['content'] or ''
+            tool_calls: list[ChatCompletionMessageToolCall] = resp['choices'][0][
+                'message'
+            ].get('tool_calls', [])
+            if tool_calls:
+                for tool_call in tool_calls:
+                    fn_name = tool_call.function.name
+                    fn_args = tool_call.function.arguments
+                    message_back += f'\nFunction call: {fn_name}({fn_args})'
+
+            # log the LLM response
+            self.log_response(message_back)
+
+            # post-process the response first to calculate cost
+            cost = self._post_completion(resp)
+
+            # log for evals or other scripts that need the raw completion
+            if self.config.log_completions:
+                assert self.config.log_completions_folder is not None
+                log_file = os.path.join(
+                    self.config.log_completions_folder,
+                    # use the metric model name (for draft editor)
+                    f'{self.metrics.model_name.replace("/", "__")}-{time.time()}.json',
+                )
+
+                # set up the dict to be logged
+                _d = {
+                    'messages': messages,
+                    'response': resp,
+                    'args': args,
+                    'kwargs': {k: v for k, v in kwargs.items() if k != 'messages'},
+                    'timestamp': time.time(),
+                    'cost': cost,
+                }
+
+                # if non-native function calling, save messages/response separately
+                if mock_function_calling:
+                    # Overwrite response as non-fncall to be consistent with messages
+                    _d['response'] = non_fncall_response
+
+                    # Save fncall_messages/response separately
+                    _d['fncall_messages'] = original_fncall_messages
+                    _d['fncall_response'] = resp
+                with open(log_file, 'w') as f:
+                    f.write(json.dumps(_d))
+
+            return resp
 
         self._completion = wrapper
@@ -414,6 +400,25 @@ class LLM(RetryMixin, DebugMixin):
             ):
                 self.config.max_output_tokens = self.model_info['max_tokens']
 
+        # Initialize function calling capability
+        # Check if model name is in our supported list
+        model_name_supported = (
+            self.config.model in FUNCTION_CALLING_SUPPORTED_MODELS
+            or self.config.model.split('/')[-1] in FUNCTION_CALLING_SUPPORTED_MODELS
+            or any(m in self.config.model for m in FUNCTION_CALLING_SUPPORTED_MODELS)
+        )
+
+        # Handle native_tool_calling user-defined configuration
+        if self.config.native_tool_calling is None:
+            self._function_calling_active = model_name_supported
+        elif self.config.native_tool_calling is False:
+            self._function_calling_active = False
+        else:
+            # try to enable native tool calling if supported by the model
+            self._function_calling_active = litellm.supports_function_calling(
+                model=self.config.model
+            )
+
     def vision_is_active(self) -> bool:
         with warnings.catch_warnings():
             warnings.simplefilter('ignore')
@@ -455,24 +460,11 @@
         )
 
     def is_function_calling_active(self) -> bool:
-        # Check if model name is in our supported list
-        model_name_supported = (
-            self.config.model in FUNCTION_CALLING_SUPPORTED_MODELS
-            or self.config.model.split('/')[-1] in FUNCTION_CALLING_SUPPORTED_MODELS
-            or any(m in self.config.model for m in FUNCTION_CALLING_SUPPORTED_MODELS)
-        )
-
-        # Handle native_tool_calling user-defined configuration
-        if self.config.native_tool_calling is None:
-            return model_name_supported
-        elif self.config.native_tool_calling is False:
-            return False
-        else:
-            # try to enable native tool calling if supported by the model
-            supports_fn_call = litellm.supports_function_calling(
-                model=self.config.model
-            )
-            return supports_fn_call
+        """Returns whether function calling is supported and enabled for this LLM instance.
+
+        The result is cached during initialization for performance.
+        """
+        return self._function_calling_active
 
     def _post_completion(self, response: ModelResponse) -> float:
         """Post-process the completion response.

View File

@@ -24,7 +24,7 @@ class RetryMixin:
            A retry decorator with the parameters customizable in configuration.
        """
        num_retries = kwargs.get('num_retries')
-        retry_exceptions = kwargs.get('retry_exceptions')
+        retry_exceptions: tuple = kwargs.get('retry_exceptions', ())
        retry_min_wait = kwargs.get('retry_min_wait')
        retry_max_wait = kwargs.get('retry_max_wait')
        retry_multiplier = kwargs.get('retry_multiplier')
@@ -39,7 +39,9 @@ class RetryMixin:
             before_sleep=before_sleep,
             stop=stop_after_attempt(num_retries) | stop_if_should_exit(),
             reraise=True,
-            retry=(retry_if_exception_type(retry_exceptions)),
+            retry=(
+                retry_if_exception_type(retry_exceptions)
+            ),  # retry only for these types
             wait=wait_exponential(
                 multiplier=retry_multiplier,
                 min=retry_min_wait,
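Defaulting retry_exceptions to an empty tuple keeps tenacity well-behaved when no exception types are configured: retry_if_exception_type(()) matches nothing, so a failure propagates on the first attempt instead of tripping over a None argument. A small standalone sketch of that behavior, with illustrative names that are not from the repository:

from tenacity import retry, retry_if_exception_type, stop_after_attempt

attempts = 0

@retry(
    retry=retry_if_exception_type(()),  # empty tuple: no exception type triggers a retry
    stop=stop_after_attempt(3),
    reraise=True,
)
def flaky() -> None:
    global attempts
    attempts += 1
    raise ValueError('boom')

try:
    flaky()
except ValueError:
    pass

assert attempts == 1  # the error was raised immediately, with no retries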

View File

@@ -1,4 +1,4 @@
"""Bash-related tests for the EventStreamRuntime, which connects to the ActionExecutor running in the sandbox.""" """Bash-related tests for the DockerRuntime, which connects to the ActionExecutor running in the sandbox."""
import os import os
import time import time

View File

@@ -1,4 +1,4 @@
-"""Browsing-related tests for the EventStreamRuntime, which connects to the ActionExecutor running in the sandbox."""
+"""Browsing-related tests for the DockerRuntime, which connects to the ActionExecutor running in the sandbox."""
 
 from conftest import _close_test_runtime, _load_runtime

View File

@@ -1,4 +1,4 @@
-"""Edit-related tests for the EventStreamRuntime."""
+"""Edit-related tests for the DockerRuntime."""
 
 import os

View File

@@ -1,4 +1,4 @@
-"""Env vars related tests for the EventStreamRuntime, which connects to the ActionExecutor running in the sandbox."""
+"""Env vars related tests for the DockerRuntime, which connects to the ActionExecutor running in the sandbox."""
 
 import os
 from unittest.mock import patch

View File

@@ -1,4 +1,4 @@
-"""Image-related tests for the EventStreamRuntime, which connects to the ActionExecutor running in the sandbox."""
+"""Image-related tests for the DockerRuntime, which connects to the ActionExecutor running in the sandbox."""
 
 import pytest
 from conftest import _close_test_runtime, _load_runtime

View File

@@ -1,4 +1,4 @@
-"""Bash-related tests for the EventStreamRuntime, which connects to the ActionExecutor running in the sandbox."""
+"""Bash-related tests for the DockerRuntime, which connects to the ActionExecutor running in the sandbox."""
 
 import asyncio
 import os

View File

@@ -389,27 +389,6 @@ def test_completion_with_two_positional_args(mock_litellm_completion, default_co
     )  # No positional args should be passed to litellm_completion here
 
 
-@patch('openhands.llm.llm.litellm_completion')
-def test_llm_cloudflare_blockage(mock_litellm_completion, default_config):
-    from litellm.exceptions import APIError
-
-    from openhands.core.exceptions import CloudFlareBlockageError
-
-    llm = LLM(default_config)
-    mock_litellm_completion.side_effect = APIError(
-        message='Attention Required! | Cloudflare',
-        llm_provider='test_provider',
-        model='test_model',
-        status_code=403,
-    )
-
-    with pytest.raises(CloudFlareBlockageError, match='Request blocked by CloudFlare'):
-        llm.completion(messages=[{'role': 'user', 'content': 'Hello'}])
-
-    # Ensure the completion was called
-    mock_litellm_completion.assert_called_once()
-
-
 @patch('openhands.llm.llm.litellm.token_counter')
 def test_get_token_count_with_dict_messages(mock_token_counter, default_config):
     mock_token_counter.return_value = 42

View File

@@ -128,38 +128,3 @@ def test_get_messages_prompt_caching(codeact_agent: CodeActAgent):
     assert cached_user_messages[0].content[0].text.startswith('You are OpenHands agent')
     assert cached_user_messages[2].content[0].text.startswith('User message 1')
     assert cached_user_messages[3].content[0].text.startswith('User message 1')
-
-
-def test_prompt_caching_headers(codeact_agent: CodeActAgent):
-    history = list()
-    # Setup
-    msg1 = MessageAction('Hello, agent!')
-    msg1._source = 'user'
-    history.append(msg1)
-    msg2 = MessageAction('Hello, user!')
-    msg2._source = 'agent'
-    history.append(msg2)
-
-    mock_state = Mock()
-    mock_state.history = history
-    mock_state.max_iterations = 5
-    mock_state.iteration = 0
-    mock_state.extra_data = {}
-
-    codeact_agent.reset()
-
-    # Create a mock for litellm_completion
-    def check_headers(**kwargs):
-        assert 'extra_headers' in kwargs
-        assert 'anthropic-beta' in kwargs['extra_headers']
-        assert kwargs['extra_headers']['anthropic-beta'] == 'prompt-caching-2024-07-31'
-        return ModelResponse(
-            choices=[{'message': {'content': 'Hello! How can I assist you today?'}}]
-        )
-
-    codeact_agent.llm._completion_unwrapped = check_headers
-    result = codeact_agent.step(mock_state)
-
-    # Assert
-    assert isinstance(result, MessageAction)
-    assert result.content == 'Hello! How can I assist you today?'
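The deleted test asserted that the completion wrapper injected the anthropic-beta: prompt-caching-2024-07-31 header. That injection is removed from llm.py in this commit; prompt caching is instead driven by cache_control breakpoints on the messages themselves, which test_get_messages_prompt_caching above still covers, with litellm expected to forward them to the provider. A hedged sketch of what such a message looks like, with the shape assumed from litellm's prompt-caching support rather than taken from this diff:

# A system message carrying an Anthropic prompt-caching breakpoint; litellm passes
# the cache_control block through to the provider, so no extra_headers are needed.
messages = [
    {
        'role': 'system',
        'content': [
            {
                'type': 'text',
                'text': 'You are OpenHands agent...',
                'cache_control': {'type': 'ephemeral'},
            }
        ],
    },
    {'role': 'user', 'content': 'Hello, agent!'},
]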