Compare commits

...

6 Commits

Author SHA1 Message Date
Otto
5e883459e0 Fix activity status wording for mid-execution credit failures
Change 'couldn't start' to 'couldn't complete' since we now also match
mid-execution credit exhaustion where some steps may have already run.
2026-03-31 14:26:08 +00:00
Otto
c9cb953274 Address review: None score, mid-execution credits, generic log, Exception test
- Return correctness_score=None instead of 0.0 so frontend skips the
  misleading red failure bar (it checks typeof === 'number')
- Replace dead 'insufficientbalanceerror' pattern with 'insufficient
  balance of' to catch mid-execution credit failures from credit.py
- Make log message generic (no hardcoded 'credit exhaustion')
- Handle None score in manager.py log formatting
- Add test for Exception object flowing through str() conversion
- Update ActivityStatusResponse type to allow None score
2026-03-31 14:00:48 +00:00
Otto
9ad68b4e1e Assert LLM block is not called in short-circuit path
Add mock for AIStructuredResponseGeneratorBlock in the integration test
to explicitly verify the LLM is never invoked when credit exhaustion
triggers the early exit.
2026-03-31 08:29:34 +00:00
Otto
b672e20b0d Fix formatting: apply black style to test file
2026-03-31 08:04:46 +00:00
Otto
a21748b1a8 Address CodeRabbit review: tighten matcher and sanitize logs
- Tighten _is_credit_exhaustion() to only match the exact internal
  preflight message and InsufficientBalanceError class name, avoiding
  false positives from broad 'insufficient balance' matching
- Remove raw error from INFO log to avoid leaking user IDs/balance data
- Update tests to reflect tighter matching
2026-03-31 07:59:43 +00:00
Otto
63f6b5122a Skip LLM execution analysis for credit exhaustion failures
Add early-exit in generate_activity_status_for_execution() that detects
credit/balance exhaustion errors and returns a static ActivityStatusResponse
without making an LLM call.

48% of all execution failures (1,472/3,048 in the last 24h) are credit
exhaustion, each triggering a paid LLM call to conclude 'no credits.'
This skips the LLM entirely for these known failures, saving ~$1-2/day.

Co-authored-by: Bently <github@bentlybro.com>
2026-03-31 07:04:34 +00:00
3 changed files with 244 additions and 58 deletions

View File

@@ -168,7 +168,7 @@ class ActivityStatusResponse(TypedDict):
"""Type definition for structured activity status response."""
activity_status: str
correctness_score: float
correctness_score: float | None
def _truncate_uuid(uuid_str: str) -> str:
@@ -178,6 +178,45 @@ def _truncate_uuid(uuid_str: str) -> str:
return uuid_str.split("-")[0] if "-" in uuid_str else uuid_str[:8]
_CREDIT_EXHAUSTION_MESSAGES = (
"you have no credits left to run an agent.",
"insufficient balance of",
)
def _is_credit_exhaustion(error_str: str) -> bool:
"""Check if the error indicates credit/balance exhaustion."""
error_lower = error_str.lower()
return any(message in error_lower for message in _CREDIT_EXHAUSTION_MESSAGES)
def _check_obvious_failure(
execution_stats: GraphExecutionStats,
execution_status: ExecutionStatus | None,
) -> ActivityStatusResponse | None:
"""
Check if the execution failed for an obvious, deterministic reason
that doesn't require LLM analysis.
Returns a static ActivityStatusResponse if matched, None otherwise.
"""
if execution_status != ExecutionStatus.FAILED:
return None
error_str = str(execution_stats.error) if execution_stats.error else ""
if _is_credit_exhaustion(error_str):
return {
"activity_status": (
"This run couldn't complete because your account has run out of credits. "
"Please top up your credits to continue using this agent."
),
"correctness_score": None,
}
return None
async def generate_activity_status_for_execution(
graph_exec_id: str,
graph_id: str,
@@ -237,6 +276,14 @@ async def generate_activity_status_for_execution(
"correctness_score": execution_stats.correctness_score,
}
# Check for obvious failures that don't need LLM analysis
obvious_result = _check_obvious_failure(execution_stats, execution_status)
if obvious_result is not None:
logger.info(
f"Skipping LLM analysis for {graph_exec_id}: " "obvious failure detected"
)
return obvious_result
# Check if we have OpenAI API key
try:
settings = Settings()

View File

@@ -12,6 +12,8 @@ from backend.data.execution import ExecutionStatus, NodeExecutionResult
from backend.data.model import GraphExecutionStats
from backend.executor.activity_status_generator import (
_build_execution_summary,
_check_obvious_failure,
_is_credit_exhaustion,
generate_activity_status_for_execution,
)
@@ -379,8 +381,9 @@ class TestLLMCall:
from backend.blocks.llm import AIStructuredResponseGeneratorBlock
from backend.data.model import APIKeyCredentials
with patch("backend.blocks.llm.llm_call") as mock_llm_call, patch(
"backend.blocks.llm.secrets.token_hex", return_value="test123"
with (
patch("backend.blocks.llm.llm_call") as mock_llm_call,
patch("backend.blocks.llm.secrets.token_hex", return_value="test123"),
):
mock_llm_call.return_value = LLMResponse(
raw_response={},
@@ -442,8 +445,9 @@ class TestLLMCall:
from backend.blocks.llm import AIStructuredResponseGeneratorBlock
from backend.data.model import APIKeyCredentials
with patch("backend.blocks.llm.llm_call") as mock_llm_call, patch(
"backend.blocks.llm.secrets.token_hex", return_value="test123"
with (
patch("backend.blocks.llm.llm_call") as mock_llm_call,
patch("backend.blocks.llm.secrets.token_hex", return_value="test123"),
):
# Return invalid JSON that will fail validation (missing required field)
mock_llm_call.return_value = LLMResponse(
@@ -515,17 +519,21 @@ class TestGenerateActivityStatusForExecution:
mock_graph.links = []
mock_db_client.get_graph.return_value = mock_graph
with patch(
"backend.executor.activity_status_generator.get_block"
) as mock_get_block, patch(
"backend.executor.activity_status_generator.Settings"
) as mock_settings, patch(
"backend.executor.activity_status_generator.AIStructuredResponseGeneratorBlock"
) as mock_structured_block, patch(
"backend.executor.activity_status_generator.is_feature_enabled",
return_value=True,
with (
patch(
"backend.executor.activity_status_generator.get_block"
) as mock_get_block,
patch(
"backend.executor.activity_status_generator.Settings"
) as mock_settings,
patch(
"backend.executor.activity_status_generator.AIStructuredResponseGeneratorBlock"
) as mock_structured_block,
patch(
"backend.executor.activity_status_generator.is_feature_enabled",
return_value=True,
),
):
mock_get_block.side_effect = lambda block_id: mock_blocks.get(block_id)
mock_settings.return_value.secrets.openai_internal_api_key = "test_key"
@@ -533,10 +541,13 @@ class TestGenerateActivityStatusForExecution:
mock_instance = mock_structured_block.return_value
async def mock_run(*args, **kwargs):
yield "response", {
"activity_status": "I analyzed your data and provided the requested insights.",
"correctness_score": 0.85,
}
yield (
"response",
{
"activity_status": "I analyzed your data and provided the requested insights.",
"correctness_score": 0.85,
},
)
mock_instance.run = mock_run
@@ -586,11 +597,14 @@ class TestGenerateActivityStatusForExecution:
"""Test activity status generation with no API key."""
mock_db_client = AsyncMock()
with patch(
"backend.executor.activity_status_generator.Settings"
) as mock_settings, patch(
"backend.executor.activity_status_generator.is_feature_enabled",
return_value=True,
with (
patch(
"backend.executor.activity_status_generator.Settings"
) as mock_settings,
patch(
"backend.executor.activity_status_generator.is_feature_enabled",
return_value=True,
),
):
mock_settings.return_value.secrets.openai_internal_api_key = ""
@@ -612,11 +626,14 @@ class TestGenerateActivityStatusForExecution:
mock_db_client = AsyncMock()
mock_db_client.get_node_executions.side_effect = Exception("Database error")
with patch(
"backend.executor.activity_status_generator.Settings"
) as mock_settings, patch(
"backend.executor.activity_status_generator.is_feature_enabled",
return_value=True,
with (
patch(
"backend.executor.activity_status_generator.Settings"
) as mock_settings,
patch(
"backend.executor.activity_status_generator.is_feature_enabled",
return_value=True,
),
):
mock_settings.return_value.secrets.openai_internal_api_key = "test_key"
@@ -641,17 +658,21 @@ class TestGenerateActivityStatusForExecution:
mock_db_client.get_graph_metadata.return_value = None # No metadata
mock_db_client.get_graph.return_value = None # No graph
with patch(
"backend.executor.activity_status_generator.get_block"
) as mock_get_block, patch(
"backend.executor.activity_status_generator.Settings"
) as mock_settings, patch(
"backend.executor.activity_status_generator.AIStructuredResponseGeneratorBlock"
) as mock_structured_block, patch(
"backend.executor.activity_status_generator.is_feature_enabled",
return_value=True,
with (
patch(
"backend.executor.activity_status_generator.get_block"
) as mock_get_block,
patch(
"backend.executor.activity_status_generator.Settings"
) as mock_settings,
patch(
"backend.executor.activity_status_generator.AIStructuredResponseGeneratorBlock"
) as mock_structured_block,
patch(
"backend.executor.activity_status_generator.is_feature_enabled",
return_value=True,
),
):
mock_get_block.side_effect = lambda block_id: mock_blocks.get(block_id)
mock_settings.return_value.secrets.openai_internal_api_key = "test_key"
@@ -659,10 +680,13 @@ class TestGenerateActivityStatusForExecution:
mock_instance = mock_structured_block.return_value
async def mock_run(*args, **kwargs):
yield "response", {
"activity_status": "Agent completed execution.",
"correctness_score": 0.8,
}
yield (
"response",
{
"activity_status": "Agent completed execution.",
"correctness_score": 0.8,
},
)
mock_instance.run = mock_run
@@ -704,17 +728,21 @@ class TestIntegration:
expected_activity = "I processed user input but failed during final output generation due to system error."
with patch(
"backend.executor.activity_status_generator.get_block"
) as mock_get_block, patch(
"backend.executor.activity_status_generator.Settings"
) as mock_settings, patch(
"backend.executor.activity_status_generator.AIStructuredResponseGeneratorBlock"
) as mock_structured_block, patch(
"backend.executor.activity_status_generator.is_feature_enabled",
return_value=True,
with (
patch(
"backend.executor.activity_status_generator.get_block"
) as mock_get_block,
patch(
"backend.executor.activity_status_generator.Settings"
) as mock_settings,
patch(
"backend.executor.activity_status_generator.AIStructuredResponseGeneratorBlock"
) as mock_structured_block,
patch(
"backend.executor.activity_status_generator.is_feature_enabled",
return_value=True,
),
):
mock_get_block.side_effect = lambda block_id: mock_blocks.get(block_id)
mock_settings.return_value.secrets.openai_internal_api_key = "test_key"
@@ -722,10 +750,13 @@ class TestIntegration:
mock_instance = mock_structured_block.return_value
async def mock_run(*args, **kwargs):
yield "response", {
"activity_status": expected_activity,
"correctness_score": 0.3, # Low score since there was a failure
}
yield (
"response",
{
"activity_status": expected_activity,
"correctness_score": 0.3, # Low score since there was a failure
},
)
mock_instance.run = mock_run
@@ -774,3 +805,109 @@ class TestIntegration:
mock_db_client.get_node_executions.assert_not_called()
mock_db_client.get_graph_metadata.assert_not_called()
mock_db_client.get_graph.assert_not_called()
class TestObviousFailureDetection:
"""Tests for obvious failure detection that skips LLM analysis."""
def test_credit_exhaustion_detected(self):
"""Credit exhaustion errors should be detected."""
assert _is_credit_exhaustion("You have no credits left to run an agent.")
assert _is_credit_exhaustion(
"Insufficient balance of $0, where this will cost $1"
)
def test_credit_exhaustion_case_insensitive(self):
"""Detection should be case-insensitive."""
assert _is_credit_exhaustion("YOU HAVE NO CREDITS LEFT TO RUN AN AGENT.")
assert _is_credit_exhaustion("INSUFFICIENT BALANCE OF $0")
def test_non_credit_errors_not_matched(self):
"""Non-credit errors should not match."""
assert not _is_credit_exhaustion("Connection timeout")
assert not _is_credit_exhaustion("API rate limit exceeded")
assert not _is_credit_exhaustion("Invalid credentials")
assert not _is_credit_exhaustion("")
assert not _is_credit_exhaustion("Insufficient balance") # No trailing "of"
def test_partial_word_no_false_positive(self):
"""Similar words like 'credential' should not match 'credit'."""
assert not _is_credit_exhaustion("Invalid credential provided")
assert not _is_credit_exhaustion("Credential expired")
def test_check_obvious_failure_credit_exhaustion(self):
"""Credit exhaustion should return static response."""
stats = GraphExecutionStats(error="You have no credits left to run an agent.")
result = _check_obvious_failure(stats, ExecutionStatus.FAILED)
assert result is not None
assert result["correctness_score"] is None
assert "credits" in result["activity_status"].lower()
def test_check_obvious_failure_non_failed_status(self):
"""Non-FAILED status should always return None."""
stats = GraphExecutionStats(error="Some error")
assert _check_obvious_failure(stats, ExecutionStatus.COMPLETED) is None
assert _check_obvious_failure(stats, ExecutionStatus.TERMINATED) is None
assert _check_obvious_failure(stats, None) is None
def test_check_obvious_failure_unknown_error(self):
"""Unknown errors should return None (fall through to LLM)."""
stats = GraphExecutionStats(error="Some unexpected error occurred")
result = _check_obvious_failure(stats, ExecutionStatus.FAILED)
assert result is None
def test_check_obvious_failure_no_error(self):
"""FAILED status with no error string should return None."""
stats = GraphExecutionStats(error=None)
result = _check_obvious_failure(stats, ExecutionStatus.FAILED)
assert result is None
def test_check_obvious_failure_with_exception_object(self):
"""Exception objects (not just strings) should be handled via str()."""
error = Exception("You have no credits left to run an agent.")
stats = GraphExecutionStats(error=error)
result = _check_obvious_failure(stats, ExecutionStatus.FAILED)
assert result is not None
assert result["correctness_score"] is None
assert "credits" in result["activity_status"].lower()
@pytest.mark.asyncio
async def test_generate_skips_llm_for_credit_exhaustion(self):
"""Full integration: credit exhaustion should skip LLM and DB calls."""
stats = GraphExecutionStats(
error="You have no credits left to run an agent.",
node_count=0,
node_error_count=0,
)
mock_db_client = AsyncMock()
with (
patch(
"backend.executor.activity_status_generator.is_feature_enabled",
return_value=True,
),
patch(
"backend.executor.activity_status_generator.AIStructuredResponseGeneratorBlock"
) as mock_structured_block,
):
result = await generate_activity_status_for_execution(
graph_exec_id="test_exec",
graph_id="test_graph",
graph_version=1,
execution_stats=stats,
db_client=mock_db_client,
user_id="test_user",
execution_status=ExecutionStatus.FAILED,
skip_feature_flag=False,
)
assert result is not None
assert result["correctness_score"] is None
assert "credits" in result["activity_status"].lower()
# Verify NO database or LLM calls were made
mock_db_client.get_node_executions.assert_not_called()
mock_db_client.get_graph_metadata.assert_not_called()
mock_db_client.get_graph.assert_not_called()
mock_structured_block.assert_not_called()

View File

@@ -880,9 +880,11 @@ class ExecutionProcessor:
if activity_response is not None:
exec_stats.activity_status = activity_response["activity_status"]
exec_stats.correctness_score = activity_response["correctness_score"]
score = activity_response["correctness_score"]
score_str = f"{score:.2f}" if score is not None else "N/A"
log_metadata.info(
f"Generated activity status: {activity_response['activity_status']} "
f"(correctness: {activity_response['correctness_score']:.2f})"
f"(correctness: {score_str})"
)
else:
log_metadata.debug(