feat(direct_benchmark): show "would have passed" for timed-out challenges

When a challenge times out but the agent's solution would have passed
evaluation, this is now clearly indicated:

- Completion blocks show "TIMEOUT (would have passed)" in yellow
- Recent completions panel shows hourglass icon + "would pass" suffix
- Summary table has new "Would Pass" column
- Final summary shows "+N would pass" count
- Success rate counts "would pass" challenges as passes (the "Passed" count itself still excludes them)

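The evaluator still runs on timed-out challenges to calculate the score,
but success remains False. A timed-out challenge is reported as "would
have passed" when its score meets the same 0.9 threshold used for success.
This gives visibility into near-misses that just needed more time.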

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Nicholas Tindle
2026-01-20 00:30:00 -06:00
parent 0e65785228
commit 89003a585d
2 changed files with 81 additions and 29 deletions

View File

@@ -16,14 +16,11 @@ class Evaluator:
def evaluate(
self, result: ChallengeResult, challenge: Challenge
) -> ChallengeResult:
"""Evaluate a challenge result and update success/score."""
# If the challenge timed out or had an error, don't override with evaluation
# A timed-out challenge cannot be considered a pass
if result.timed_out:
result.success = False
result.score = 0.0
return result
"""Evaluate a challenge result and update success/score.
For timed-out challenges, we still run evaluation to populate the score
(so we can show "would have passed"), but success remains False.
"""
ground = challenge.ground_truth
if not ground:
@@ -63,7 +60,12 @@ class Evaluator:
# Update result
result.score = score
result.success = score >= 0.9 # 90% threshold for success
# Timed-out challenges cannot pass, even if evaluation would succeed
# (The score is still set so UI can show "would have passed")
if result.timed_out:
result.success = False
else:
result.success = score >= 0.9 # 90% threshold for success
return result

View File

@@ -181,7 +181,14 @@ class BenchmarkUI:
def _print_challenge_result(self, result: ChallengeResult) -> None:
"""Print detailed result for a single challenge."""
status_icon = "[green]PASS[/green]" if result.success else "[red]FAIL[/red]"
if result.success:
status_icon = "[green]PASS[/green]"
elif result.timed_out and result.score >= 0.9:
status_icon = "[yellow]TIMEOUT (would pass)[/yellow]"
elif result.timed_out:
status_icon = "[yellow]TIMEOUT[/yellow]"
else:
status_icon = "[red]FAIL[/red]"
console.print(
f"[dim][{result.config_name}][/dim] {result.challenge_name}: {status_icon} "
f"({result.n_steps} steps, ${result.cost:.4f})"
@@ -198,8 +205,21 @@ class BenchmarkUI:
from datetime import datetime
color = self.get_config_color(config_name)
status = "PASS" if result.success else "FAIL"
status_style = "green" if result.success else "red"
# Determine status display
if result.success:
status = "PASS"
status_style = "green"
elif result.timed_out and result.score >= 0.9:
# Timed out but would have passed - show this clearly
status = "TIMEOUT (would have passed)"
status_style = "yellow"
elif result.timed_out:
status = "TIMEOUT"
status_style = "yellow"
else:
status = "FAIL"
status_style = "red"
# Build challenge display with attempt if > 1
challenge_display = challenge_name
@@ -236,13 +256,10 @@ class BenchmarkUI:
f" [dim]Steps: {result.n_steps} | Time: {result.run_time_seconds:.1f}s | Cost: ${result.cost:.4f}[/dim]"
)
# Print error if any
if result.error_message:
# Print error if any (skip generic timeout message since status shows it)
if result.error_message and result.error_message != "Challenge timed out":
console.print(f" [red]Error: {result.error_message[:200]}[/red]")
if result.timed_out:
console.print(" [yellow]⚠ Timed out[/yellow]")
console.print(f"[{status_style}]{'' * 70}[/{status_style}]")
console.print()
@@ -315,6 +332,7 @@ class BenchmarkUI:
table = Table(title="Results by Configuration", show_header=True)
table.add_column("Configuration", style="cyan")
table.add_column("Passed", justify="right", style="green")
table.add_column("Would Pass", justify="right", style="yellow")
table.add_column("Failed", justify="right", style="red")
table.add_column("Rate", justify="right")
table.add_column("Cost", justify="right", style="yellow")
@@ -323,14 +341,18 @@ class BenchmarkUI:
if not results:
continue
passed = sum(1 for r in results if r.success)
failed = len(results) - passed
rate = (passed / len(results) * 100) if results else 0
would_pass = sum(1 for r in results if r.timed_out and r.score >= 0.9)
failed = len(results) - passed - would_pass
# Rate includes "would pass" since those are correct solutions
effective_passed = passed + would_pass
rate = (effective_passed / len(results) * 100) if results else 0
cost = sum(r.cost for r in results)
rate_style = "green" if rate >= 75 else "yellow" if rate >= 50 else "red"
table.add_row(
config_name,
str(passed),
str(would_pass) if would_pass > 0 else "-",
str(failed),
f"[{rate_style}]{rate:.1f}%[/{rate_style}]",
f"${cost:.4f}",
@@ -347,19 +369,34 @@ class BenchmarkUI:
else:
lines = []
for result in reversed(recent):
status = (
Text("\u2713", style="green")
if result.success
else Text("\u2717", style="red")
)
# Determine status icon and style
if result.success:
status = Text("\u2713", style="green")
elif result.timed_out and result.score >= 0.9:
status = Text("\u29D6", style="yellow") # Hourglass - would pass
elif result.timed_out:
status = Text("\u29D6", style="yellow") # Hourglass
else:
status = Text("\u2717", style="red")
# Build suffix for special cases
suffix = f" ({result.n_steps} steps)"
if result.timed_out and result.score >= 0.9:
suffix = f" ({result.n_steps} steps) [yellow]would pass[/yellow]"
line = Text.assemble(
(" ", ""),
status,
(" ", ""),
(f"[{result.config_name}] ", "dim"),
(result.challenge_name, "white"),
(f" ({result.n_steps} steps)", "dim"),
)
# Handle markup in suffix separately
if result.timed_out and result.score >= 0.9:
line.append(f" ({result.n_steps} steps) ", style="dim")
line.append("would pass", style="yellow")
else:
line.append(suffix, style="dim")
lines.append(line)
content = Group(*lines)
@@ -400,14 +437,27 @@ class BenchmarkUI:
# Overall stats
total_passed = sum(1 for r in self.completed if r.success)
total_failed = len(self.completed) - total_passed
total_would_pass = sum(
1 for r in self.completed if r.timed_out and r.score >= 0.9
)
total_failed = len(self.completed) - total_passed - total_would_pass
total_cost = sum(r.cost for r in self.completed)
total_rate = (total_passed / len(self.completed) * 100) if self.completed else 0
# Include "would pass" in the effective rate
effective_passed = total_passed + total_would_pass
total_rate = (
(effective_passed / len(self.completed) * 100) if self.completed else 0
)
console.print()
console.print(
f"[bold]Total:[/bold] {total_passed}/{len(self.completed)} passed"
)
if total_would_pass > 0:
console.print(
f"[bold]Total:[/bold] {total_passed}/{len(self.completed)} passed "
f"[yellow](+{total_would_pass} would pass)[/yellow]"
)
else:
console.print(
f"[bold]Total:[/bold] {total_passed}/{len(self.completed)} passed"
)
console.print(f"[bold]Success Rate:[/bold] {total_rate:.1f}%")
console.print(f"[bold]Total Cost:[/bold] ${total_cost:.4f}")
console.print(f"[bold]Elapsed Time:[/bold] {elapsed:.1f}s")