mirror of
https://github.com/All-Hands-AI/OpenHands.git
synced 2026-04-29 03:00:45 -04:00
Compare commits
1 Commits
downgrade-
...
openhands-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
1f182bd4c8 |
@@ -0,0 +1,5 @@
|
||||
expect(extractModelAndProvider("claude-3-5-sonnet-20241022")).toEqual({
|
||||
provider: "anthropic",
|
||||
model: "claude-3-5-sonnet-20241022",
|
||||
separator: "/",
|
||||
});
|
||||
@@ -0,0 +1,65 @@
|
||||
import { expect, test } from "vitest";
|
||||
import { organizeModelsAndProviders } from "../../src/utils/organizeModelsAndProviders";
|
||||
|
||||
// Verifies that a flat list of model identifiers is grouped by provider, with
// each group recording the separator that was used and the provider prefix
// stripped from the model names.
test("organizeModelsAndProviders", () => {
  // Expected grouping. Bare `gpt-*` and `claude-*` ids land under `openai` and
  // `anthropic` with a "/" separator, an id with no recognised provider lands
  // under `other`, and "anthropic.unsafe-claude-2.1" (present in the input
  // below) appears in none of the groups.
  const expected = {
    azure: {
      separator: "/",
      models: [
        "ada",
        "gpt-35-turbo",
        "gpt-3-turbo",
        "standard/1024-x-1024/dall-e-2",
      ],
    },
    vertex_ai_beta: {
      separator: "/",
      models: ["chat-bison", "chat-bison-32k"],
    },
    sagemaker: { separator: "/", models: ["meta-textgeneration-llama-2-13b"] },
    cohere: { separator: ".", models: ["command-r-v1:0"] },
    cloudflare: {
      separator: "/",
      models: ["@cf/mistral/mistral-7b-instruct-v0.1"],
    },
    openai: {
      separator: "/",
      models: ["gpt-4o", "gpt-4o-mini"],
    },
    anthropic: {
      separator: "/",
      models: [
        "claude-3-5-sonnet-20241022",
        "claude-3-haiku-20240307",
        "claude-2",
        "claude-2.1",
      ],
    },
    other: {
      separator: "",
      models: ["together-ai-21.1b-41b"],
    },
  };

  const organized = organizeModelsAndProviders([
    "azure/ada",
    "azure/gpt-35-turbo",
    "azure/gpt-3-turbo",
    "azure/standard/1024-x-1024/dall-e-2",
    "vertex_ai_beta/chat-bison",
    "vertex_ai_beta/chat-bison-32k",
    "sagemaker/meta-textgeneration-llama-2-13b",
    "cohere.command-r-v1:0",
    "cloudflare/@cf/mistral/mistral-7b-instruct-v0.1",
    "gpt-4o",
    "together-ai-21.1b-41b",
    "gpt-4o-mini",
    "claude-3-5-sonnet-20241022",
    "claude-3-haiku-20240307",
    "claude-2",
    "claude-2.1",
    "anthropic.unsafe-claude-2.1",
  ]);

  expect(organized).toEqual(expected);
});
|
||||
29
"b/frontend/src/utils/verified-models.ts\""
Normal file
29
"b/frontend/src/utils/verified-models.ts\""
Normal file
@@ -0,0 +1,29 @@
|
||||
// Here is the list of verified models and providers that we know work well with OpenHands.
|
||||
/** Provider identifiers on the verified list. */
export const VERIFIED_PROVIDERS = [
  "openai",
  "azure",
  "anthropic",
];
|
||||
/** Model identifiers on the verified list. */
export const VERIFIED_MODELS = [
  "gpt-4o",
  "claude-3-5-sonnet-20241022",
];
|
||||
|
||||
/**
 * OpenAI model names that LiteLLM returns without a provider prefix
 * (e.g. `gpt-4o` rather than `openai/gpt-4o`). They are listed here so the
 * provider can be set on our side for consistency.
 */
export const VERIFIED_OPENAI_MODELS = [
  "gpt-4o",
  "gpt-4o-mini",
  "gpt-4-turbo",
  "gpt-4",
  "gpt-4-32k",
  "o1-mini",
  "o1-preview",
];
|
||||
|
||||
/**
 * Anthropic model names that LiteLLM returns without a provider prefix
 * (e.g. `claude-3-5-sonnet-20241022` rather than
 * `anthropic/claude-3-5-sonnet-20241022`). They are listed here so the
 * provider can be set on our side.
 */
export const VERIFIED_ANTHROPIC_MODELS = [
  "claude-2",
  "claude-2.1",
  "claude-3-5-sonnet-20241022",
  "claude-3-5-sonnet-20240620",
  "claude-3-haiku-20240307",
  "claude-3-opus-20240229",
  "claude-3-sonnet-20240229",
  "claude-instant-1",
  "claude-instant-1.2",
];
|
||||
@@ -14,97 +14,4 @@ Pour démarrer une session OpenHands interactive via la ligne de commande, suive
|
||||
|
||||
2. Exécutez la commande suivante :
|
||||
|
||||
```bash
|
||||
poetry run python -m openhands.core.cli
|
||||
```
|
||||
|
||||
Cette commande démarrera une session interactive où vous pourrez saisir des tâches et recevoir des réponses d'OpenHands.
|
||||
|
||||
Vous devrez vous assurer de définir votre modèle, votre clé API et d'autres paramètres via des variables d'environnement
|
||||
[ou le fichier `config.toml`](https://github.com/All-Hands-AI/OpenHands/blob/main/config.template.toml).
|
||||
|
||||
|
||||
## Avec Docker
|
||||
|
||||
Pour exécuter OpenHands en mode CLI avec Docker, suivez ces étapes :
|
||||
|
||||
1. Définissez `WORKSPACE_BASE` sur le répertoire que vous souhaitez qu'OpenHands modifie :
|
||||
|
||||
```bash
|
||||
WORKSPACE_BASE=$(pwd)/workspace
|
||||
```
|
||||
|
||||
2. Définissez `LLM_MODEL` sur le modèle que vous souhaitez utiliser :
|
||||
|
||||
```bash
|
||||
LLM_MODEL="anthropic/claude-3-5-sonnet-20240620"
|
||||
```
|
||||
|
||||
3. Définissez `LLM_API_KEY` sur votre clé API :
|
||||
|
||||
```bash
|
||||
LLM_API_KEY="sk_test_12345"
|
||||
```
|
||||
|
||||
4. Exécutez la commande Docker suivante :
|
||||
|
||||
```bash
|
||||
docker run -it \
|
||||
--pull=always \
|
||||
-e SANDBOX_USER_ID=$(id -u) \
|
||||
-e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
|
||||
-e LLM_API_KEY=$LLM_API_KEY \
|
||||
-e LLM_MODEL=$LLM_MODEL \
|
||||
-v $WORKSPACE_BASE:/opt/workspace_base \
|
||||
-v /var/run/docker.sock:/var/run/docker.sock \
|
||||
--add-host host.docker.internal:host-gateway \
|
||||
--name openhands-app-$(date +%Y%m%d%H%M%S) \
|
||||
ghcr.io/all-hands-ai/openhands:0.11 \
|
||||
python -m openhands.core.cli
|
||||
```
|
||||
|
||||
Cette commande démarrera une session interactive dans Docker où vous pourrez saisir des tâches et recevoir des réponses d'OpenHands.
|
||||
|
||||
## Exemples de commandes CLI et de sorties attendues
|
||||
|
||||
Voici quelques exemples de commandes CLI et leurs sorties attendues :
|
||||
|
||||
### Exemple 1 : Tâche simple
|
||||
|
||||
```bash
|
||||
Comment puis-je vous aider ? >> Écrivez un script Python qui affiche "Hello, World!"
|
||||
```
|
||||
|
||||
Sortie attendue :
|
||||
|
||||
```bash
|
||||
🤖 Bien sûr ! Voici un script Python qui affiche "Hello, World!" :
|
||||
|
||||
❯ print("Hello, World!")
|
||||
```
|
||||
|
||||
### Exemple 2 : Commande Bash
|
||||
|
||||
```bash
|
||||
Comment puis-je vous aider ? >> Créez un répertoire nommé "test_dir"
|
||||
```
|
||||
|
||||
Sortie attendue :
|
||||
|
||||
```bash
|
||||
🤖 Création d'un répertoire nommé "test_dir" :
|
||||
|
||||
❯ mkdir test_dir
|
||||
```
|
||||
|
||||
### Exemple 3 : Gestion des erreurs
|
||||
|
||||
```bash
|
||||
Comment puis-je vous aider ? >> Supprimez un fichier inexistant
|
||||
```
|
||||
|
||||
Sortie attendue :
|
||||
|
||||
```bash
|
||||
🤖 Une erreur s'est produite. Veuillez réessayer.
|
||||
```
|
||||
|
||||
@@ -1,5 +1,3 @@
|
||||
|
||||
|
||||
# Évaluation
|
||||
|
||||
Ce guide fournit un aperçu de la façon d'intégrer votre propre benchmark d'évaluation dans le framework OpenHands.
|
||||
@@ -11,270 +9,4 @@ OpenHands en mode développement utilise `config.toml` pour garder une trace de
|
||||
|
||||
Voici un exemple de fichier de configuration que vous pouvez utiliser pour définir et utiliser plusieurs LLMs :
|
||||
|
||||
```toml
|
||||
[llm]
|
||||
# IMPORTANT : ajoutez votre clé API ici et définissez le modèle que vous souhaitez évaluer
|
||||
model = "claude-3-5-sonnet-20240620"
|
||||
api_key = "sk-XXX"
|
||||
|
||||
[llm.eval_gpt4_1106_preview_llm]
|
||||
model = "gpt-4-1106-preview"
|
||||
api_key = "XXX"
|
||||
temperature = 0.0
|
||||
|
||||
[llm.eval_some_openai_compatible_model_llm]
|
||||
model = "openai/MODEL_NAME"
|
||||
base_url = "https://OPENAI_COMPATIBLE_URL/v1"
|
||||
api_key = "XXX"
|
||||
temperature = 0.0
|
||||
```
|
||||
|
||||
|
||||
## Comment utiliser OpenHands en ligne de commande
|
||||
|
||||
OpenHands peut être exécuté depuis la ligne de commande en utilisant le format suivant :
|
||||
|
||||
```bash
|
||||
poetry run python ./openhands/core/main.py \
|
||||
-i <max_iterations> \
|
||||
-t "<task_description>" \
|
||||
-c <agent_class> \
|
||||
-l <llm_config>
|
||||
```
|
||||
|
||||
Par exemple :
|
||||
|
||||
```bash
|
||||
poetry run python ./openhands/core/main.py \
|
||||
-i 10 \
|
||||
-t "Écrivez-moi un script bash qui affiche hello world." \
|
||||
-c CodeActAgent \
|
||||
-l llm
|
||||
```
|
||||
|
||||
Cette commande exécute OpenHands avec :
|
||||
- Un maximum de 10 itérations
|
||||
- La description de tâche spécifiée
|
||||
- En utilisant CodeActAgent
|
||||
- Avec la configuration LLM définie dans la section `llm` de votre fichier `config.toml`
|
||||
|
||||
## Comment fonctionne OpenHands
|
||||
|
||||
Le point d'entrée principal d'OpenHands se trouve dans `openhands/core/main.py`. Voici un flux simplifié de son fonctionnement :
|
||||
|
||||
1. Analyse des arguments de ligne de commande et chargement de la configuration
|
||||
2. Création d'un environnement d'exécution à l'aide de `create_runtime()`
|
||||
3. Initialisation de l'agent spécifié
|
||||
4. Exécution du contrôleur à l'aide de `run_controller()`, qui :
|
||||
- Attache l'environnement d'exécution à l'agent
|
||||
- Exécute la tâche de l'agent
|
||||
- Renvoie un état final une fois terminé
|
||||
|
||||
La fonction `run_controller()` est le cœur de l'exécution d'OpenHands. Elle gère l'interaction entre l'agent, l'environnement d'exécution et la tâche, en gérant des choses comme la simulation d'entrée utilisateur et le traitement des événements.
|
||||
|
||||
|
||||
## Le moyen le plus simple de commencer : Explorer les benchmarks existants
|
||||
|
||||
Nous vous encourageons à examiner les différents benchmarks d'évaluation disponibles dans le [répertoire `evaluation/`](https://github.com/All-Hands-AI/OpenHands/blob/main/evaluation) de notre dépôt.
|
||||
|
||||
Pour intégrer votre propre benchmark, nous vous suggérons de commencer par celui qui ressemble le plus à vos besoins. Cette approche peut considérablement rationaliser votre processus d'intégration, vous permettant de vous appuyer sur les structures existantes et de les adapter à vos exigences spécifiques.
|
||||
|
||||
## Comment créer un workflow d'évaluation
|
||||
|
||||
|
||||
Pour créer un workflow d'évaluation pour votre benchmark, suivez ces étapes :
|
||||
|
||||
1. Importez les utilitaires OpenHands pertinents :
|
||||
```python
|
||||
import openhands.agenthub
|
||||
from evaluation.utils.shared import (
|
||||
EvalMetadata,
|
||||
EvalOutput,
|
||||
make_metadata,
|
||||
prepare_dataset,
|
||||
reset_logger_for_multiprocessing,
|
||||
run_evaluation,
|
||||
)
|
||||
from openhands.controller.state.state import State
|
||||
from openhands.core.config import (
|
||||
AppConfig,
|
||||
SandboxConfig,
|
||||
get_llm_config_arg,
|
||||
parse_arguments,
|
||||
)
|
||||
from openhands.core.logger import openhands_logger as logger
|
||||
from openhands.core.main import create_runtime, run_controller
|
||||
from openhands.events.action import CmdRunAction
|
||||
from openhands.events.observation import CmdOutputObservation, ErrorObservation
|
||||
from openhands.runtime.runtime import Runtime
|
||||
```
|
||||
|
||||
2. Créez une configuration :
|
||||
```python
|
||||
def get_config(instance: pd.Series, metadata: EvalMetadata) -> AppConfig:
|
||||
config = AppConfig(
|
||||
default_agent=metadata.agent_class,
|
||||
runtime='eventstream',
|
||||
max_iterations=metadata.max_iterations,
|
||||
sandbox=SandboxConfig(
|
||||
base_container_image='your_container_image',
|
||||
enable_auto_lint=True,
|
||||
timeout=300,
|
||||
),
|
||||
)
|
||||
config.set_llm_config(metadata.llm_config)
|
||||
return config
|
||||
```
|
||||
|
||||
3. Initialisez l'environnement d'exécution et configurez l'environnement d'évaluation :
|
||||
```python
|
||||
def initialize_runtime(runtime: Runtime, instance: pd.Series):
|
||||
# Configurez votre environnement d'évaluation ici
|
||||
# Par exemple, définir des variables d'environnement, préparer des fichiers, etc.
|
||||
pass
|
||||
```
|
||||
|
||||
4. Créez une fonction pour traiter chaque instance :
|
||||
```python
|
||||
from openhands.utils.async_utils import call_async_from_sync
|
||||
def process_instance(instance: pd.Series, metadata: EvalMetadata) -> EvalOutput:
|
||||
config = get_config(instance, metadata)
|
||||
runtime = create_runtime(config)
|
||||
call_async_from_sync(runtime.connect)
|
||||
initialize_runtime(runtime, instance)
|
||||
|
||||
instruction = get_instruction(instance, metadata)
|
||||
|
||||
state = run_controller(
|
||||
config=config,
|
||||
task_str=instruction,
|
||||
runtime=runtime,
|
||||
fake_user_response_fn=your_user_response_function,
|
||||
)
|
||||
|
||||
# Évaluez les actions de l'agent
|
||||
evaluation_result = await evaluate_agent_actions(runtime, instance)
|
||||
|
||||
return EvalOutput(
|
||||
instance_id=instance.instance_id,
|
||||
instruction=instruction,
|
||||
test_result=evaluation_result,
|
||||
metadata=metadata,
|
||||
history=state.history.compatibility_for_eval_history_pairs(),
|
||||
metrics=state.metrics.get() if state.metrics else None,
|
||||
error=state.last_error if state and state.last_error else None,
|
||||
)
|
||||
```
|
||||
|
||||
5. Exécutez l'évaluation :
|
||||
```python
|
||||
metadata = make_metadata(llm_config, dataset_name, agent_class, max_iterations, eval_note, eval_output_dir)
|
||||
output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
|
||||
instances = prepare_dataset(your_dataset, output_file, eval_n_limit)
|
||||
|
||||
await run_evaluation(
|
||||
instances,
|
||||
metadata,
|
||||
output_file,
|
||||
num_workers,
|
||||
process_instance
|
||||
)
|
||||
```
|
||||
|
||||
Ce workflow met en place la configuration, initialise l'environnement d'exécution, traite chaque instance en exécutant l'agent et en évaluant ses actions, puis collecte les résultats dans un objet `EvalOutput`. La fonction `run_evaluation` gère la parallélisation et le suivi de la progression.
|
||||
|
||||
N'oubliez pas de personnaliser les fonctions `get_instruction`, `your_user_response_function` et `evaluate_agent_actions` en fonction des exigences spécifiques de votre benchmark.
|
||||
|
||||
En suivant cette structure, vous pouvez créer un workflow d'évaluation robuste pour votre benchmark dans le framework OpenHands.
|
||||
|
||||
|
||||
## Comprendre la `user_response_fn`
|
||||
|
||||
La `user_response_fn` est un composant crucial dans le workflow d'évaluation d'OpenHands. Elle simule l'interaction de l'utilisateur avec l'agent, permettant des réponses automatisées pendant le processus d'évaluation. Cette fonction est particulièrement utile lorsque vous souhaitez fournir des réponses cohérentes et prédéfinies aux requêtes ou actions de l'agent.
|
||||
|
||||
|
||||
### Workflow et interaction
|
||||
|
||||
Le workflow correct pour gérer les actions et la `user_response_fn` est le suivant :
|
||||
|
||||
1. L'agent reçoit une tâche et commence à la traiter
|
||||
2. L'agent émet une Action
|
||||
3. Si l'Action est exécutable (par exemple, CmdRunAction, IPythonRunCellAction) :
|
||||
- Le Runtime traite l'Action
|
||||
- Le Runtime renvoie une Observation
|
||||
4. Si l'Action n'est pas exécutable (généralement une MessageAction) :
|
||||
- La `user_response_fn` est appelée
|
||||
- Elle renvoie une réponse utilisateur simulée
|
||||
5. L'agent reçoit soit l'Observation, soit la réponse simulée
|
||||
6. Les étapes 2 à 5 se répètent jusqu'à ce que la tâche soit terminée ou que le nombre maximum d'itérations soit atteint
|
||||
|
||||
Voici une représentation visuelle plus précise :
|
||||
|
||||
```
|
||||
[Agent]
|
||||
|
|
||||
v
|
||||
[Émettre une Action]
|
||||
|
|
||||
v
|
||||
[L'Action est-elle exécutable ?]
|
||||
/ \
|
||||
Oui Non
|
||||
| |
|
||||
v v
|
||||
[Runtime] [user_response_fn]
|
||||
| |
|
||||
v v
|
||||
[Renvoyer une Observation] [Réponse simulée]
|
||||
\ /
|
||||
\ /
|
||||
v v
|
||||
[L'agent reçoit le feedback]
|
||||
|
|
||||
v
|
||||
[Continuer ou terminer la tâche]
|
||||
```
|
||||
|
||||
Dans ce workflow :
|
||||
|
||||
- Les actions exécutables (comme l'exécution de commandes ou de code) sont gérées directement par le Runtime
|
||||
- Les actions non exécutables (généralement lorsque l'agent veut communiquer ou demander des clarifications) sont gérées par la `user_response_fn`
|
||||
- L'agent traite ensuite le feedback, qu'il s'agisse d'une Observation du Runtime ou d'une réponse simulée de la `user_response_fn`
|
||||
|
||||
Cette approche permet une gestion automatisée des actions concrètes et des interactions utilisateur simulées, ce qui la rend adaptée aux scénarios d'évaluation où vous souhaitez tester la capacité de l'agent à effectuer des tâches avec une intervention humaine minimale.
|
||||
|
||||
### Exemple d'implémentation
|
||||
|
||||
Voici un exemple de `user_response_fn` utilisée dans l'évaluation SWE-Bench :
|
||||
|
||||
```python
|
||||
def codeact_user_response(state: State | None) -> str:
|
||||
msg = (
|
||||
'Veuillez continuer à travailler sur la tâche avec l\'approche que vous jugez appropriée.\n'
|
||||
'Si vous pensez avoir résolu la tâche, veuillez d\'abord envoyer votre réponse à l\'utilisateur via un message, puis <execute_bash> exit </execute_bash>.\n'
|
||||
'IMPORTANT : VOUS NE DEVEZ JAMAIS DEMANDER DE L\'AIDE HUMAINE.\n'
|
||||
)
|
||||
|
||||
if state and state.history:
|
||||
# vérifier si l'agent a essayé de parler à l'utilisateur 3 fois, si oui, faire savoir à l'agent qu'il peut abandonner
|
||||
user_msgs = [
|
||||
event
|
||||
for event in state.history.get_events()
|
||||
if isinstance(event, MessageAction) and event.source == 'user'
|
||||
]
|
||||
if len(user_msgs) >= 2:
|
||||
# faire savoir à l'agent qu'il peut abandonner lorsqu'il a essayé 3 fois
|
||||
return (
|
||||
msg
|
||||
+ 'Si vous voulez abandonner, exécutez : <execute_bash> exit </execute_bash>.\n'
|
||||
)
|
||||
return msg
|
||||
```
|
||||
|
||||
Cette fonction fait ce qui suit :
|
||||
|
||||
1. Fournit un message standard encourageant l'agent à continuer à travailler
|
||||
2. Vérifie combien de fois l'agent a tenté de communiquer avec l'utilisateur
|
||||
3. Si l'agent a fait plusieurs tentatives, il lui donne la possibilité d'abandonner
|
||||
|
||||
En utilisant cette fonction, vous pouvez garantir un comportement cohérent sur plusieurs exécutions d'évaluation et empêcher l'agent de rester bloqué en attendant une entrée humaine.
|
||||
|
||||
@@ -1,5 +1,3 @@
|
||||
|
||||
|
||||
# Mode sans interface
|
||||
|
||||
Vous pouvez exécuter OpenHands avec une seule commande, sans démarrer l'application web.
|
||||
@@ -13,46 +11,4 @@ Pour exécuter OpenHands en mode sans interface avec Python,
|
||||
[suivez les instructions de configuration de développement](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md),
|
||||
puis exécutez :
|
||||
|
||||
```bash
|
||||
poetry run python -m openhands.core.main -t "write a bash script that prints hi"
|
||||
```
|
||||
|
||||
Vous devrez vous assurer de définir votre modèle, votre clé API et d'autres paramètres via des variables d'environnement
|
||||
[ou le fichier `config.toml`](https://github.com/All-Hands-AI/OpenHands/blob/main/config.template.toml).
|
||||
|
||||
## Avec Docker
|
||||
|
||||
1. Définissez `WORKSPACE_BASE` sur le répertoire que vous voulez qu'OpenHands modifie :
|
||||
|
||||
```bash
|
||||
WORKSPACE_BASE=$(pwd)/workspace
|
||||
```
|
||||
|
||||
2. Définissez `LLM_MODEL` sur le modèle que vous voulez utiliser :
|
||||
|
||||
```bash
|
||||
LLM_MODEL="anthropic/claude-3-5-sonnet-20240620"
|
||||
```
|
||||
|
||||
3. Définissez `LLM_API_KEY` sur votre clé API :
|
||||
|
||||
```bash
|
||||
LLM_API_KEY="sk_test_12345"
|
||||
```
|
||||
|
||||
4. Exécutez la commande Docker suivante :
|
||||
|
||||
```bash
|
||||
docker run -it \
|
||||
--pull=always \
|
||||
-e SANDBOX_USER_ID=$(id -u) \
|
||||
-e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
|
||||
-e LLM_API_KEY=$LLM_API_KEY \
|
||||
-e LLM_MODEL=$LLM_MODEL \
|
||||
-v $WORKSPACE_BASE:/opt/workspace_base \
|
||||
-v /var/run/docker.sock:/var/run/docker.sock \
|
||||
--add-host host.docker.internal:host-gateway \
|
||||
--name openhands-app-$(date +%Y%m%d%H%M%S) \
|
||||
ghcr.io/all-hands-ai/openhands:0.11 \
|
||||
python -m openhands.core.main -t "write a bash script that prints hi"
|
||||
```
|
||||
|
||||
@@ -14,96 +14,4 @@ OpenHands 可以在交互式命令行模式下运行,允许用户通过命令行
|
||||
|
||||
2. 运行以下命令:
|
||||
|
||||
```bash
|
||||
poetry run python -m openhands.core.cli
|
||||
```
|
||||
|
||||
该命令将启动一个交互式会话,你可以在其中输入任务并接收来自 OpenHands 的响应。
|
||||
|
||||
你需要确保通过环境变量[或 `config.toml` 文件](https://github.com/All-Hands-AI/OpenHands/blob/main/config.template.toml)设置你的模型、API 密钥和其他设置。
|
||||
|
||||
|
||||
## 使用 Docker
|
||||
|
||||
要在 Docker 中以命令行模式运行 OpenHands,请按照以下步骤操作:
|
||||
|
||||
1. 将 `WORKSPACE_BASE` 设置为你希望 OpenHands 编辑的目录:
|
||||
|
||||
```bash
|
||||
WORKSPACE_BASE=$(pwd)/workspace
|
||||
```
|
||||
|
||||
2. 将 `LLM_MODEL` 设置为你要使用的模型:
|
||||
|
||||
```bash
|
||||
LLM_MODEL="anthropic/claude-3-5-sonnet-20240620"
|
||||
```
|
||||
|
||||
3. 将 `LLM_API_KEY` 设置为你的 API 密钥:
|
||||
|
||||
```bash
|
||||
LLM_API_KEY="sk_test_12345"
|
||||
```
|
||||
|
||||
4. 运行以下 Docker 命令:
|
||||
|
||||
```bash
|
||||
docker run -it \
|
||||
--pull=always \
|
||||
-e SANDBOX_USER_ID=$(id -u) \
|
||||
-e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
|
||||
-e LLM_API_KEY=$LLM_API_KEY \
|
||||
-e LLM_MODEL=$LLM_MODEL \
|
||||
-v $WORKSPACE_BASE:/opt/workspace_base \
|
||||
-v /var/run/docker.sock:/var/run/docker.sock \
|
||||
--add-host host.docker.internal:host-gateway \
|
||||
--name openhands-app-$(date +%Y%m%d%H%M%S) \
|
||||
ghcr.io/all-hands-ai/openhands:0.11 \
|
||||
python -m openhands.core.cli
|
||||
```
|
||||
|
||||
该命令将在 Docker 中启动一个交互式会话,你可以在其中输入任务并接收来自 OpenHands 的响应。
|
||||
|
||||
## 命令行命令和预期输出示例
|
||||
|
||||
以下是一些命令行命令及其预期输出的示例:
|
||||
|
||||
### 示例 1: 简单任务
|
||||
|
||||
```bash
|
||||
How can I help? >> Write a Python script that prints "Hello, World!"
|
||||
```
|
||||
|
||||
预期输出:
|
||||
|
||||
```bash
|
||||
🤖 Sure! Here is a Python script that prints "Hello, World!":
|
||||
|
||||
❯ print("Hello, World!")
|
||||
```
|
||||
|
||||
### 示例 2: Bash 命令
|
||||
|
||||
```bash
|
||||
How can I help? >> Create a directory named "test_dir"
|
||||
```
|
||||
|
||||
预期输出:
|
||||
|
||||
```bash
|
||||
🤖 Creating a directory named "test_dir":
|
||||
|
||||
❯ mkdir test_dir
|
||||
```
|
||||
|
||||
### 示例 3: 错误处理
|
||||
|
||||
```bash
|
||||
How can I help? >> Delete a non-existent file
|
||||
```
|
||||
|
||||
预期输出:
|
||||
|
||||
```bash
|
||||
🤖 An error occurred. Please try again.
|
||||
```
|
||||
|
||||
@@ -9,270 +9,4 @@
|
||||
|
||||
以下是一个示例配置文件,您可以使用它来定义和使用多个 LLM:
|
||||
|
||||
```toml
|
||||
[llm]
|
||||
# 重要:在此处添加您的 API 密钥,并将模型设置为您要评估的模型
|
||||
model = "claude-3-5-sonnet-20240620"
|
||||
api_key = "sk-XXX"
|
||||
|
||||
[llm.eval_gpt4_1106_preview_llm]
|
||||
model = "gpt-4-1106-preview"
|
||||
api_key = "XXX"
|
||||
temperature = 0.0
|
||||
|
||||
[llm.eval_some_openai_compatible_model_llm]
|
||||
model = "openai/MODEL_NAME"
|
||||
base_url = "https://OPENAI_COMPATIBLE_URL/v1"
|
||||
api_key = "XXX"
|
||||
temperature = 0.0
|
||||
```
|
||||
|
||||
|
||||
## 如何在命令行中使用 OpenHands
|
||||
|
||||
可以使用以下格式从命令行运行 OpenHands:
|
||||
|
||||
```bash
|
||||
poetry run python ./openhands/core/main.py \
|
||||
-i <max_iterations> \
|
||||
-t "<task_description>" \
|
||||
-c <agent_class> \
|
||||
-l <llm_config>
|
||||
```
|
||||
|
||||
例如:
|
||||
|
||||
```bash
|
||||
poetry run python ./openhands/core/main.py \
|
||||
-i 10 \
|
||||
-t "Write me a bash script that prints hello world." \
|
||||
-c CodeActAgent \
|
||||
-l llm
|
||||
```
|
||||
|
||||
此命令使用以下参数运行 OpenHands:
|
||||
- 最大迭代次数为 10
|
||||
- 指定的任务描述
|
||||
- 使用 CodeActAgent
|
||||
- 使用 `config.toml` 文件的 `llm` 部分中定义的 LLM 配置
|
||||
|
||||
## OpenHands 如何工作
|
||||
|
||||
OpenHands 的主要入口点在 `openhands/core/main.py` 中。以下是它工作原理的简化流程:
|
||||
|
||||
1. 解析命令行参数并加载配置
|
||||
2. 使用 `create_runtime()` 创建运行时环境
|
||||
3. 初始化指定的代理
|
||||
4. 使用 `run_controller()` 运行控制器,它:
|
||||
- 将运行时附加到代理
|
||||
- 执行代理的任务
|
||||
- 完成后返回最终状态
|
||||
|
||||
`run_controller()` 函数是 OpenHands 执行的核心。它管理代理、运行时和任务之间的交互,处理用户输入模拟和事件处理等事项。
|
||||
|
||||
|
||||
## 入门最简单的方法:探索现有基准
|
||||
|
||||
我们鼓励您查看我们仓库的 [`evaluation/` 目录](https://github.com/All-Hands-AI/OpenHands/blob/main/evaluation)中提供的各种评估基准。
|
||||
|
||||
要集成您自己的基准,我们建议从最接近您需求的基准开始。这种方法可以显著简化您的集成过程,允许您在现有结构的基础上进行构建并使其适应您的特定要求。
|
||||
|
||||
## 如何创建评估工作流
|
||||
|
||||
|
||||
要为您的基准创建评估工作流,请按照以下步骤操作:
|
||||
|
||||
1. 导入相关的 OpenHands 实用程序:
|
||||
```python
|
||||
import openhands.agenthub
|
||||
from evaluation.utils.shared import (
|
||||
EvalMetadata,
|
||||
EvalOutput,
|
||||
make_metadata,
|
||||
prepare_dataset,
|
||||
reset_logger_for_multiprocessing,
|
||||
run_evaluation,
|
||||
)
|
||||
from openhands.controller.state.state import State
|
||||
from openhands.core.config import (
|
||||
AppConfig,
|
||||
SandboxConfig,
|
||||
get_llm_config_arg,
|
||||
parse_arguments,
|
||||
)
|
||||
from openhands.core.logger import openhands_logger as logger
|
||||
from openhands.core.main import create_runtime, run_controller
|
||||
from openhands.events.action import CmdRunAction
|
||||
from openhands.events.observation import CmdOutputObservation, ErrorObservation
|
||||
from openhands.runtime.runtime import Runtime
|
||||
```
|
||||
|
||||
2. 创建配置:
|
||||
```python
|
||||
def get_config(instance: pd.Series, metadata: EvalMetadata) -> AppConfig:
|
||||
config = AppConfig(
|
||||
default_agent=metadata.agent_class,
|
||||
runtime='eventstream',
|
||||
max_iterations=metadata.max_iterations,
|
||||
sandbox=SandboxConfig(
|
||||
base_container_image='your_container_image',
|
||||
enable_auto_lint=True,
|
||||
timeout=300,
|
||||
),
|
||||
)
|
||||
config.set_llm_config(metadata.llm_config)
|
||||
return config
|
||||
```
|
||||
|
||||
3. 初始化运行时并设置评估环境:
|
||||
```python
|
||||
def initialize_runtime(runtime: Runtime, instance: pd.Series):
|
||||
# 在此处设置您的评估环境
|
||||
# 例如,设置环境变量、准备文件等
|
||||
pass
|
||||
```
|
||||
|
||||
4. 创建一个函数来处理每个实例:
|
||||
```python
|
||||
from openhands.utils.async_utils import call_async_from_sync
|
||||
def process_instance(instance: pd.Series, metadata: EvalMetadata) -> EvalOutput:
|
||||
config = get_config(instance, metadata)
|
||||
runtime = create_runtime(config)
|
||||
call_async_from_sync(runtime.connect)
|
||||
initialize_runtime(runtime, instance)
|
||||
|
||||
instruction = get_instruction(instance, metadata)
|
||||
|
||||
state = run_controller(
|
||||
config=config,
|
||||
task_str=instruction,
|
||||
runtime=runtime,
|
||||
fake_user_response_fn=your_user_response_function,
|
||||
)
|
||||
|
||||
# 评估代理的操作
|
||||
evaluation_result = await evaluate_agent_actions(runtime, instance)
|
||||
|
||||
return EvalOutput(
|
||||
instance_id=instance.instance_id,
|
||||
instruction=instruction,
|
||||
test_result=evaluation_result,
|
||||
metadata=metadata,
|
||||
history=state.history.compatibility_for_eval_history_pairs(),
|
||||
metrics=state.metrics.get() if state.metrics else None,
|
||||
error=state.last_error if state and state.last_error else None,
|
||||
)
|
||||
```
|
||||
|
||||
5. 运行评估:
|
||||
```python
|
||||
metadata = make_metadata(llm_config, dataset_name, agent_class, max_iterations, eval_note, eval_output_dir)
|
||||
output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
|
||||
instances = prepare_dataset(your_dataset, output_file, eval_n_limit)
|
||||
|
||||
await run_evaluation(
|
||||
instances,
|
||||
metadata,
|
||||
output_file,
|
||||
num_workers,
|
||||
process_instance
|
||||
)
|
||||
```
|
||||
|
||||
此工作流设置配置,初始化运行时环境,通过运行代理并评估其操作来处理每个实例,然后将结果收集到 `EvalOutput` 对象中。`run_evaluation` 函数处理并行化和进度跟踪。
|
||||
|
||||
请记住根据您特定的基准要求自定义 `get_instruction`、`your_user_response_function` 和 `evaluate_agent_actions` 函数。
|
||||
|
||||
通过遵循此结构,您可以在 OpenHands 框架内为您的基准创建强大的评估工作流。
|
||||
|
||||
|
||||
## 理解 `user_response_fn`
|
||||
|
||||
`user_response_fn` 是 OpenHands 评估工作流中的关键组件。它模拟用户与代理的交互,允许在评估过程中自动响应。当您想要为代理的查询或操作提供一致的、预定义的响应时,此函数特别有用。
|
||||
|
||||
|
||||
### 工作流和交互
|
||||
|
||||
处理操作和 `user_response_fn` 的正确工作流如下:
|
||||
|
||||
1. 代理接收任务并开始处理
|
||||
2. 代理发出操作
|
||||
3. 如果操作可执行(例如 CmdRunAction、IPythonRunCellAction):
|
||||
- 运行时处理操作
|
||||
- 运行时返回观察结果
|
||||
4. 如果操作不可执行(通常是 MessageAction):
|
||||
- 调用 `user_response_fn`
|
||||
- 它返回模拟的用户响应
|
||||
5. 代理接收观察结果或模拟响应
|
||||
6. 重复步骤 2-5,直到任务完成或达到最大迭代次数
|
||||
|
||||
以下是更准确的可视化表示:
|
||||
|
||||
```
|
||||
[代理]
|
||||
|
|
||||
v
|
||||
[发出操作]
|
||||
|
|
||||
v
|
||||
[操作是否可执行?]
|
||||
/ \
|
||||
是 否
|
||||
| |
|
||||
v v
|
||||
[运行时] [user_response_fn]
|
||||
| |
|
||||
v v
|
||||
[返回观察结果] [模拟响应]
|
||||
\ /
|
||||
\ /
|
||||
v v
|
||||
[代理接收反馈]
|
||||
|
|
||||
v
|
||||
[继续或完成任务]
|
||||
```
|
||||
|
||||
在此工作流中:
|
||||
|
||||
- 可执行的操作(如运行命令或执行代码)由运行时直接处理
|
||||
- 不可执行的操作(通常是当代理想要通信或寻求澄清时)由 `user_response_fn` 处理
|
||||
- 然后,代理处理反馈,无论是来自运行时的观察结果还是来自 `user_response_fn` 的模拟响应
|
||||
|
||||
这种方法允许自动处理具体操作和模拟用户交互,使其适用于您想要测试代理在最少人工干预的情况下完成任务的能力的评估场景。
|
||||
|
||||
### 示例实现
|
||||
|
||||
以下是 SWE-Bench 评估中使用的 `user_response_fn` 示例:
|
||||
|
||||
```python
|
||||
def codeact_user_response(state: State | None) -> str:
|
||||
msg = (
|
||||
'Please continue working on the task on whatever approach you think is suitable.\n'
|
||||
'If you think you have solved the task, please first send your answer to user through message and then <execute_bash> exit </execute_bash>.\n'
|
||||
'IMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP.\n'
|
||||
)
|
||||
|
||||
if state and state.history:
|
||||
# 检查代理是否已尝试与用户对话 3 次,如果是,让代理知道它可以放弃
|
||||
user_msgs = [
|
||||
event
|
||||
for event in state.history.get_events()
|
||||
if isinstance(event, MessageAction) and event.source == 'user'
|
||||
]
|
||||
if len(user_msgs) >= 2:
|
||||
# 让代理知道它在尝试 3 次后可以放弃
|
||||
return (
|
||||
msg
|
||||
+ 'If you want to give up, run: <execute_bash> exit </execute_bash>.\n'
|
||||
)
|
||||
return msg
|
||||
```
|
||||
|
||||
此函数执行以下操作:
|
||||
|
||||
1. 提供一条标准消息,鼓励代理继续工作
|
||||
2. 检查代理尝试与用户通信的次数
|
||||
3. 如果代理已多次尝试,它会提供放弃的选项
|
||||
|
||||
通过使用此函数,您可以确保在多次评估运行中保持一致的行为,并防止代理在等待人工输入时陷入困境。
|
||||
|
||||
@@ -13,47 +13,4 @@
|
||||
[请按照开发设置说明](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md),
|
||||
然后运行:
|
||||
|
||||
```bash
|
||||
poetry run python -m openhands.core.main -t "write a bash script that prints hi"
|
||||
```
|
||||
|
||||
你需要确保通过环境变量
|
||||
[或 `config.toml` 文件](https://github.com/All-Hands-AI/OpenHands/blob/main/config.template.toml)
|
||||
设置你的模型、API 密钥和其他设置。
|
||||
|
||||
## 使用 Docker
|
||||
|
||||
1. 将 `WORKSPACE_BASE` 设置为你希望 OpenHands 编辑的目录:
|
||||
|
||||
```bash
|
||||
WORKSPACE_BASE=$(pwd)/workspace
|
||||
```
|
||||
|
||||
2. 将 `LLM_MODEL` 设置为你要使用的模型:
|
||||
|
||||
```bash
|
||||
LLM_MODEL="anthropic/claude-3-5-sonnet-20240620"
|
||||
```
|
||||
|
||||
3. 将 `LLM_API_KEY` 设置为你的 API 密钥:
|
||||
|
||||
```bash
|
||||
LLM_API_KEY="sk_test_12345"
|
||||
```
|
||||
|
||||
4. 运行以下 Docker 命令:
|
||||
|
||||
```bash
|
||||
docker run -it \
|
||||
--pull=always \
|
||||
-e SANDBOX_USER_ID=$(id -u) \
|
||||
-e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
|
||||
-e LLM_API_KEY=$LLM_API_KEY \
|
||||
-e LLM_MODEL=$LLM_MODEL \
|
||||
-v $WORKSPACE_BASE:/opt/workspace_base \
|
||||
-v /var/run/docker.sock:/var/run/docker.sock \
|
||||
--add-host host.docker.internal:host-gateway \
|
||||
--name openhands-app-$(date +%Y%m%d%H%M%S) \
|
||||
ghcr.io/all-hands-ai/openhands:0.11 \
|
||||
python -m openhands.core.main -t "write a bash script that prints hi"
|
||||
```
|
||||
|
||||
@@ -35,7 +35,8 @@ WORKSPACE_BASE=$(pwd)/workspace
|
||||
2. Set `LLM_MODEL` to the model you want to use:
|
||||
|
||||
```bash
|
||||
LLM_MODEL="anthropic/claude-3-5-sonnet-20240620"
|
||||
LLM_MODEL="anthropic/claude-3-5-sonnet-20241022"
|
||||
|
||||
```
|
||||
|
||||
3. Set `LLM_API_KEY` to your API key:
|
||||
@@ -106,3 +107,4 @@ Expected Output:
|
||||
```bash
|
||||
🤖 An error occurred. Please try again.
|
||||
```
|
||||
|
||||
|
||||
@@ -9,270 +9,4 @@ OpenHands in development mode uses `config.toml` to keep track of most configura
|
||||
|
||||
Here's an example configuration file you can use to define and use multiple LLMs:
|
||||
|
||||
```toml
|
||||
[llm]
|
||||
# IMPORTANT: add your API key here, and set the model to the one you want to evaluate
|
||||
model = "claude-3-5-sonnet-20240620"
|
||||
api_key = "sk-XXX"
|
||||
|
||||
[llm.eval_gpt4_1106_preview_llm]
|
||||
model = "gpt-4-1106-preview"
|
||||
api_key = "XXX"
|
||||
temperature = 0.0
|
||||
|
||||
[llm.eval_some_openai_compatible_model_llm]
|
||||
model = "openai/MODEL_NAME"
|
||||
base_url = "https://OPENAI_COMPATIBLE_URL/v1"
|
||||
api_key = "XXX"
|
||||
temperature = 0.0
|
||||
```
|
||||
|
||||
|
||||
## How to use OpenHands in the command line
|
||||
|
||||
OpenHands can be run from the command line using the following format:
|
||||
|
||||
```bash
|
||||
poetry run python ./openhands/core/main.py \
|
||||
-i <max_iterations> \
|
||||
-t "<task_description>" \
|
||||
-c <agent_class> \
|
||||
-l <llm_config>
|
||||
```
|
||||
|
||||
For example:
|
||||
|
||||
```bash
|
||||
poetry run python ./openhands/core/main.py \
|
||||
-i 10 \
|
||||
-t "Write me a bash script that prints hello world." \
|
||||
-c CodeActAgent \
|
||||
-l llm
|
||||
```
|
||||
|
||||
This command runs OpenHands with:
|
||||
- A maximum of 10 iterations
|
||||
- The specified task description
|
||||
- Using the CodeActAgent
|
||||
- With the LLM configuration defined in the `llm` section of your `config.toml` file
|
||||
|
||||
## How does OpenHands work
|
||||
|
||||
The main entry point for OpenHands is in `openhands/core/main.py`. Here's a simplified flow of how it works:
|
||||
|
||||
1. Parse command-line arguments and load the configuration
|
||||
2. Create a runtime environment using `create_runtime()`
|
||||
3. Initialize the specified agent
|
||||
4. Run the controller using `run_controller()`, which:
|
||||
- Attaches the runtime to the agent
|
||||
- Executes the agent's task
|
||||
- Returns a final state when complete
|
||||
|
||||
The `run_controller()` function is the core of OpenHands's execution. It manages the interaction between the agent, the runtime, and the task, handling things like user input simulation and event processing.
|
||||
|
||||
|
||||
## Easiest way to get started: Exploring Existing Benchmarks
|
||||
|
||||
We encourage you to review the various evaluation benchmarks available in the [`evaluation/` directory](https://github.com/All-Hands-AI/OpenHands/blob/main/evaluation) of our repository.
|
||||
|
||||
To integrate your own benchmark, we suggest starting with the one that most closely resembles your needs. This approach can significantly streamline your integration process, allowing you to build upon existing structures and adapt them to your specific requirements.
|
||||
|
||||
## How to create an evaluation workflow
|
||||
|
||||
|
||||
To create an evaluation workflow for your benchmark, follow these steps:
|
||||
|
||||
1. Import relevant OpenHands utilities:
|
||||
```python
|
||||
import openhands.agenthub
|
||||
from evaluation.utils.shared import (
|
||||
EvalMetadata,
|
||||
EvalOutput,
|
||||
make_metadata,
|
||||
prepare_dataset,
|
||||
reset_logger_for_multiprocessing,
|
||||
run_evaluation,
|
||||
)
|
||||
from openhands.controller.state.state import State
|
||||
from openhands.core.config import (
|
||||
AppConfig,
|
||||
SandboxConfig,
|
||||
get_llm_config_arg,
|
||||
parse_arguments,
|
||||
)
|
||||
from openhands.core.logger import openhands_logger as logger
|
||||
from openhands.core.main import create_runtime, run_controller
|
||||
from openhands.events.action import CmdRunAction
|
||||
from openhands.events.observation import CmdOutputObservation, ErrorObservation
|
||||
from openhands.runtime.runtime import Runtime
|
||||
```
|
||||
|
||||
2. Create a configuration:
|
||||
```python
|
||||
def get_config(instance: pd.Series, metadata: EvalMetadata) -> AppConfig:
|
||||
config = AppConfig(
|
||||
default_agent=metadata.agent_class,
|
||||
runtime='eventstream',
|
||||
max_iterations=metadata.max_iterations,
|
||||
sandbox=SandboxConfig(
|
||||
base_container_image='your_container_image',
|
||||
enable_auto_lint=True,
|
||||
timeout=300,
|
||||
),
|
||||
)
|
||||
config.set_llm_config(metadata.llm_config)
|
||||
return config
|
||||
```
|
||||
|
||||
3. Initialize the runtime and set up the evaluation environment:
|
||||
```python
|
||||
def initialize_runtime(runtime: Runtime, instance: pd.Series):
|
||||
# Set up your evaluation environment here
|
||||
# For example, setting environment variables, preparing files, etc.
|
||||
pass
|
||||
```
|
||||
|
||||
4. Create a function to process each instance:
|
||||
```python
|
||||
from openhands.utils.async_utils import call_async_from_sync
|
||||
def process_instance(instance: pd.Series, metadata: EvalMetadata) -> EvalOutput:
|
||||
config = get_config(instance, metadata)
|
||||
runtime = create_runtime(config)
|
||||
call_async_from_sync(runtime.connect)
|
||||
initialize_runtime(runtime, instance)
|
||||
|
||||
instruction = get_instruction(instance, metadata)
|
||||
|
||||
state = run_controller(
|
||||
config=config,
|
||||
task_str=instruction,
|
||||
runtime=runtime,
|
||||
fake_user_response_fn=your_user_response_function,
|
||||
)
|
||||
|
||||
# Evaluate the agent's actions
|
||||
evaluation_result = await evaluate_agent_actions(runtime, instance)
|
||||
|
||||
return EvalOutput(
|
||||
instance_id=instance.instance_id,
|
||||
instruction=instruction,
|
||||
test_result=evaluation_result,
|
||||
metadata=metadata,
|
||||
history=state.history.compatibility_for_eval_history_pairs(),
|
||||
metrics=state.metrics.get() if state.metrics else None,
|
||||
error=state.last_error if state and state.last_error else None,
|
||||
)
|
||||
```
|
||||
|
||||
5. Run the evaluation:
|
||||
```python
|
||||
metadata = make_metadata(llm_config, dataset_name, agent_class, max_iterations, eval_note, eval_output_dir)
|
||||
output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
|
||||
instances = prepare_dataset(your_dataset, output_file, eval_n_limit)
|
||||
|
||||
await run_evaluation(
|
||||
instances,
|
||||
metadata,
|
||||
output_file,
|
||||
num_workers,
|
||||
process_instance
|
||||
)
|
||||
```
|
||||
|
||||
This workflow sets up the configuration, initializes the runtime environment, processes each instance by running the agent and evaluating its actions, and then collects the results into an `EvalOutput` object. The `run_evaluation` function handles parallelization and progress tracking.
|
||||
|
||||
Remember to customize the `get_instruction`, `your_user_response_function`, and `evaluate_agent_actions` functions according to your specific benchmark requirements.
|
||||
|
||||
By following this structure, you can create a robust evaluation workflow for your benchmark within the OpenHands framework.
|
||||
|
||||
|
||||
## Understanding the `user_response_fn`
|
||||
|
||||
The `user_response_fn` is a crucial component in OpenHands's evaluation workflow. It simulates user interaction with the agent, allowing for automated responses during the evaluation process. This function is particularly useful when you want to provide consistent, predefined responses to the agent's queries or actions.
|
||||
|
||||
|
||||
### Workflow and Interaction
|
||||
|
||||
The correct workflow for handling actions and the `user_response_fn` is as follows:
|
||||
|
||||
1. Agent receives a task and starts processing
|
||||
2. Agent emits an Action
|
||||
3. If the Action is executable (e.g., CmdRunAction, IPythonRunCellAction):
|
||||
- The Runtime processes the Action
|
||||
- Runtime returns an Observation
|
||||
4. If the Action is not executable (typically a MessageAction):
|
||||
- The `user_response_fn` is called
|
||||
- It returns a simulated user response
|
||||
5. The agent receives either the Observation or the simulated response
|
||||
6. Steps 2-5 repeat until the task is completed or max iterations are reached
|
||||
|
||||
Here's a visual representation of this workflow:
|
||||
|
||||
```
|
||||
[Agent]
|
||||
|
|
||||
v
|
||||
[Emit Action]
|
||||
|
|
||||
v
|
||||
[Is Action Executable?]
|
||||
/ \
|
||||
Yes No
|
||||
| |
|
||||
v v
|
||||
[Runtime] [user_response_fn]
|
||||
| |
|
||||
v v
|
||||
[Return Observation] [Simulated Response]
|
||||
\ /
|
||||
\ /
|
||||
v v
|
||||
[Agent receives feedback]
|
||||
|
|
||||
v
|
||||
[Continue or Complete Task]
|
||||
```
|
||||
|
||||
In this workflow:
|
||||
|
||||
- Executable actions (like running commands or executing code) are handled directly by the Runtime
|
||||
- Non-executable actions (typically when the agent wants to communicate or ask for clarification) are handled by the `user_response_fn`
|
||||
- The agent then processes the feedback, whether it's an Observation from the Runtime or a simulated response from the `user_response_fn`
|
||||
|
||||
This approach allows for automated handling of both concrete actions and simulated user interactions, making it suitable for evaluation scenarios where you want to test the agent's ability to complete tasks with minimal human intervention.
|
||||
|
||||
### Example Implementation
|
||||
|
||||
Here's an example of a `user_response_fn` used in the SWE-Bench evaluation:
|
||||
|
||||
```python
|
||||
def codeact_user_response(state: State | None) -> str:
|
||||
msg = (
|
||||
'Please continue working on the task on whatever approach you think is suitable.\n'
|
||||
'If you think you have solved the task, please first send your answer to user through message and then <execute_bash> exit </execute_bash>.\n'
|
||||
'IMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP.\n'
|
||||
)
|
||||
|
||||
if state and state.history:
|
||||
# check if the agent has tried to talk to the user 3 times, if so, let the agent know it can give up
|
||||
user_msgs = [
|
||||
event
|
||||
for event in state.history.get_events()
|
||||
if isinstance(event, MessageAction) and event.source == 'user'
|
||||
]
|
||||
if len(user_msgs) >= 2:
|
||||
# let the agent know that it can give up when it has tried 3 times
|
||||
return (
|
||||
msg
|
||||
+ 'If you want to give up, run: <execute_bash> exit </execute_bash>.\n'
|
||||
)
|
||||
return msg
|
||||
```
|
||||
|
||||
This function does the following:
|
||||
|
||||
1. Provides a standard message encouraging the agent to continue working
|
||||
2. Checks how many times the agent has attempted to communicate with the user
|
||||
3. Offers the agent the option to give up once it has made multiple attempts
|
||||
|
||||
By using this function, you can ensure consistent behavior across multiple evaluation runs and prevent the agent from getting stuck waiting for human input.
|
||||
|
||||
@@ -11,46 +11,4 @@ To run OpenHands in headless mode with Python,
|
||||
[follow the Development setup instructions](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md),
|
||||
and then run:
|
||||
|
||||
```bash
|
||||
poetry run python -m openhands.core.main -t "write a bash script that prints hi"
|
||||
```
|
||||
|
||||
Be sure to set your model, API key, and other settings via environment variables
|
||||
[or the `config.toml` file](https://github.com/All-Hands-AI/OpenHands/blob/main/config.template.toml).
|
||||
|
||||
## With Docker
|
||||
|
||||
1. Set `WORKSPACE_BASE` to the directory you want OpenHands to edit:
|
||||
|
||||
```bash
|
||||
WORKSPACE_BASE=$(pwd)/workspace
|
||||
```
|
||||
|
||||
2. Set `LLM_MODEL` to the model you want to use:
|
||||
|
||||
```bash
|
||||
LLM_MODEL="anthropic/claude-3-5-sonnet-20240620"
|
||||
```
|
||||
|
||||
3. Set `LLM_API_KEY` to your API key:
|
||||
|
||||
```bash
|
||||
LLM_API_KEY="sk_test_12345"
|
||||
```
|
||||
|
||||
4. Run the following Docker command:
|
||||
|
||||
```bash
|
||||
docker run -it \
|
||||
--pull=always \
|
||||
-e SANDBOX_USER_ID=$(id -u) \
|
||||
-e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
|
||||
-e LLM_API_KEY=$LLM_API_KEY \
|
||||
-e LLM_MODEL=$LLM_MODEL \
|
||||
-v $WORKSPACE_BASE:/opt/workspace_base \
|
||||
-v /var/run/docker.sock:/var/run/docker.sock \
|
||||
--add-host host.docker.internal:host-gateway \
|
||||
--name openhands-app-$(date +%Y%m%d%H%M%S) \
|
||||
ghcr.io/all-hands-ai/openhands:0.11 \
|
||||
python -m openhands.core.main -t "write a bash script that prints hi"
|
||||
```
|
||||
|
||||
@@ -14,7 +14,8 @@ from openhands.storage import get_file_store
|
||||
@pytest.fixture
|
||||
def mock_llm():
|
||||
llm = Mock(spec=LLM)
|
||||
llm.config = LLMConfig(model='claude-3-5-sonnet-20240620', caching_prompt=True)
|
||||
llm.config = LLMConfig(model='claude-3-5-sonnet-20241022', caching_prompt=True)
|
||||
|
||||
llm.is_caching_prompt_active.return_value = True
|
||||
return llm
|
||||
|
||||
@@ -259,3 +260,4 @@ def test_prompt_caching_headers(codeact_agent, mock_event_stream):
|
||||
# Assert
|
||||
assert isinstance(result, MessageAction)
|
||||
assert result.content == 'Hello! How can I assist you today?'
|
||||
|
||||
|
||||
Reference in New Issue
Block a user