Fix: expose aggregated LLM metrics in State for evaluation scripts (#10537)

Co-authored-by: openhands <openhands@all-hands.dev>
2026-01-08 22:38:05 -05:00 · 2025-08-21 17:43:09 +02:00
parent e9e2c98946
commit 91d3d1d20a
25 changed files with 268 additions and 25 deletions
--- a/evaluation/benchmarks/biocoder/run_infer.py
+++ b/evaluation/benchmarks/biocoder/run_infer.py
@@ -15,6 +15,7 @@ from evaluation.utils.shared import (
    codeact_user_response,
    compatibility_for_eval_history_pairs,
    get_default_sandbox_config_for_eval,
+    get_metrics,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
@@ -294,7 +295,7 @@ def process_instance(
        raise ValueError('State should not be None.')

    test_result = complete_runtime(runtime, instance)
-    metrics = state.metrics.get() if state.metrics else None
+    metrics = get_metrics(state)
    # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
    # for compatibility with the existing output format, we can remake the pairs here
    # remove when it becomes unnecessary