Fix: expose aggregated LLM metrics in State for evaluation scripts (#10537)

Co-authored-by: openhands <openhands@all-hands.dev>
Engel Nyst
2025-08-21 17:43:09 +02:00
committed by GitHub
parent e9e2c98946
commit 91d3d1d20a
25 changed files with 268 additions and 25 deletions
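
The diff below swaps direct reads of `state.metrics` in the evaluation scripts for a shared `get_metrics` helper imported from `evaluation.utils.shared`. The helper's body is not part of this excerpt; the following is a minimal sketch of what it could look like, assuming it simply serializes the aggregated metrics now carried on `State` and preserves the old `None` fallback. Apart from `get_metrics`, `state.metrics`, and its `.get()` method (all visible in the hunks below), the details are assumptions.

    # Minimal sketch only: the real helper lives in evaluation/utils/shared.py
    # and may aggregate metrics from additional LLMs; that detail is not
    # visible in this diff.
    from typing import Any, Optional


    def get_metrics(state: Any) -> Optional[dict]:
        """Return the aggregated LLM metrics stored on the State as a plain dict.

        Falls back to None when the State carries no metrics, matching the
        expression this helper replaces:
        `state.metrics.get() if state.metrics else None`.
        """
        if not state.metrics:
            return None
        return state.metrics.get()
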


@@ -15,6 +15,7 @@ from evaluation.utils.shared import (
     EvalOutput,
     compatibility_for_eval_history_pairs,
     get_default_sandbox_config_for_eval,
+    get_metrics,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -205,7 +206,7 @@ def process_instance(
     task_state = state.extra_data['task_state']
     logger.info('Task state: ' + str(task_state.to_dict()))
-    metrics = state.metrics.get() if state.metrics else None
+    metrics = get_metrics(state)
     # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
     # for compatibility with the existing output format, we can remake the pairs here
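
For completeness, a rough sketch of how the aggregated metrics typically end up in the per-instance result. `EvalOutput` is imported in the first hunk, but the field names and surrounding variables here are illustrative assumptions, not taken from this diff.

    # Illustrative only: names other than EvalOutput, get_metrics, and state
    # are assumptions about the surrounding evaluation script.
    metrics = get_metrics(state)
    output = EvalOutput(
        instance_id=str(instance.instance_id),
        metadata=metadata,
        history=histories,
        metrics=metrics,  # aggregated LLM metrics dict, or None
        test_result=test_result,
    )
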