Fix: expose aggregated LLM metrics in State for evaluation scripts (#10537)

Co-authored-by: openhands <openhands@all-hands.dev>
Engel Nyst
2025-08-21 17:43:09 +02:00
committed by GitHub
parent e9e2c98946
commit 91d3d1d20a
25 changed files with 268 additions and 25 deletions
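
The diff below swaps direct reads of `state.metrics` in the evaluation scripts for a shared `get_metrics` helper imported from `evaluation.utils.shared`. The helper's body is not part of this excerpt; the following is a minimal sketch of what it could look like, assuming it simply serializes the aggregated metrics now carried on `State` and preserves the old `None` fallback. Apart from `get_metrics`, `state.metrics`, and its `.get()` method (all visible in the hunks below), the details are assumptions.

    # Minimal sketch only: the real helper lives in evaluation/utils/shared.py
    # and may aggregate metrics from additional LLMs; that detail is not
    # visible in this diff.
    from typing import Any, Optional


    def get_metrics(state: Any) -> Optional[dict]:
        """Return the aggregated LLM metrics stored on the State as a plain dict.

        Falls back to None when the State carries no metrics, matching the
        expression this helper replaces:
        `state.metrics.get() if state.metrics else None`.
        """
        if not state.metrics:
            return None
        return state.metrics.get()
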


@@ -15,6 +15,7 @@ from evaluation.utils.shared import (
     EvalOutput,
     compatibility_for_eval_history_pairs,
     get_default_sandbox_config_for_eval,
+    get_metrics,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -205,7 +206,7 @@ def process_instance(
     task_state = state.extra_data['task_state']
     logger.info('Task state: ' + str(task_state.to_dict()))
-    metrics = state.metrics.get() if state.metrics else None
+    metrics = get_metrics(state)
     # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
     # for compatibility with the existing output format, we can remake the pairs here
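
For completeness, a rough sketch of how the aggregated metrics typically end up in the per-instance result. `EvalOutput` is imported in the first hunk, but the field names and surrounding variables here are illustrative assumptions, not taken from this diff.

    # Illustrative only: names other than EvalOutput, get_metrics, and state
    # are assumptions about the surrounding evaluation script.
    metrics = get_metrics(state)
    output = EvalOutput(
        instance_id=str(instance.instance_id),
        metadata=metadata,
        history=histories,
        metrics=metrics,  # aggregated LLM metrics dict, or None
        test_result=test_result,
    )
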