fix(copilot): user-friendly exhaustion error, reduce E2B timeout, add events_yielded guard test

- Replace raw SDK error text with actionable message when all retry
  attempts are exhausted ("Your conversation is too long...")
- Reduce e2b_sandbox_timeout from 600s (10 min) to 420s (7 min)
- Add TestEventsYieldedGuard tests verifying the retry loop breaks
  immediately when events have already been sent to the frontend
This commit is contained in:
Zamil Majdy
2026-03-15 13:50:30 +07:00
parent f1151c5cc1
commit 29efcfb280
3 changed files with 94 additions and 2 deletions

View File

@@ -115,7 +115,7 @@ class ChatConfig(BaseSettings):
description="E2B sandbox template to use for copilot sessions.",
)
e2b_sandbox_timeout: int = Field(
default=600, # 10 min safety net — allows headroom for compaction retries
default=420, # 7 min safety net — allows headroom for compaction retries
description="E2B sandbox running-time timeout (seconds). "
"E2B timeout is wall-clock (not idle). Explicit per-turn pause is the primary "
"mechanism; this is the safety net.",

View File

@@ -569,6 +569,91 @@ class TestRetryStateMachine:
assert state["stream_completed"] is False
# ---------------------------------------------------------------------------
# Scenario 9: events_yielded > 0 prevents retry
# ---------------------------------------------------------------------------
class TestEventsYieldedGuard:
    """Guard against retrying once output has reached the frontend.

    A retry after events were already yielded would produce duplicated or
    inconsistent output, so the retry loop is expected to stop with an error
    straight away rather than move on to another attempt.  The tests below
    exercise a small stand-alone model of that loop.
    """

    def _simulate_retry_with_events_yielded(
        self,
        events_yielded_per_attempt: list[int],
        transcript_content: str = "some_content",
    ) -> dict:
        """Replay a model of the retry loop with fixed per-attempt event counts.

        Every simulated attempt fails with a stream error that is treated as
        a context error.  The loop stops at the first attempt whose event
        count is greater than zero; otherwise it runs until either
        ``_MAX_STREAM_ATTEMPTS`` or the supplied list is exhausted.

        Args:
            events_yielded_per_attempt: Number of non-heartbeat events yielded
                before the error on each attempt, in attempt order.
            transcript_content: Initial transcript content.  Accepted for
                signature compatibility; this simulation does not read it.

        Returns:
            A dict with the keys ``attempts_made`` (int), ``stream_err``
            (the last simulated exception, or ``None`` when no attempt ran)
            and ``ended_with_stream_error`` (bool).
        """
        attempt_budget = min(_MAX_STREAM_ATTEMPTS, len(events_yielded_per_attempt))
        outcome: dict = {
            "attempts_made": 0,
            "stream_err": None,
            "ended_with_stream_error": False,
        }

        for attempt_index in range(attempt_budget):
            outcome["attempts_made"] = attempt_index + 1
            already_streamed = events_yielded_per_attempt[attempt_index] > 0

            # Every simulated attempt fails, and the failure is always
            # classified as a context error.
            outcome["stream_err"] = Exception("simulated stream error")
            context_error = True

            # The guard under test: once events have been yielded there must
            # be no further attempt.  A non-context error stops the loop the
            # same way.
            if already_streamed or not context_error:
                break
            # Otherwise fall through to the next attempt.

        # Leaving the loop -- by a break above or by using up every
        # attempt -- always counts as ending with a stream error.
        outcome["ended_with_stream_error"] = True
        return outcome

    def test_events_yielded_prevents_retry(self):
        """Events yielded on the first attempt must rule out any retry."""
        outcome = self._simulate_retry_with_events_yielded([5])

        assert outcome["attempts_made"] == 1
        assert outcome["ended_with_stream_error"] is True
        assert outcome["stream_err"] is not None

    def test_zero_events_allows_retry(self):
        """With nothing yielded, every attempt in the list is consumed."""
        outcome = self._simulate_retry_with_events_yielded([0, 0, 0])

        assert outcome["attempts_made"] == 3
        assert outcome["ended_with_stream_error"] is True  # all exhausted

    def test_events_on_second_attempt_stops_retry(self):
        """A clean first attempt is retried; events on the second stop it."""
        outcome = self._simulate_retry_with_events_yielded([0, 3])

        assert outcome["attempts_made"] == 2
        assert outcome["ended_with_stream_error"] is True

    def test_single_event_is_enough_to_prevent_retry(self):
        """One non-heartbeat event is already sufficient to block a retry."""
        outcome = self._simulate_retry_with_events_yielded([1])

        assert outcome["attempts_made"] == 1
        assert outcome["ended_with_stream_error"] is True
# ---------------------------------------------------------------------------
# Edge cases
# ---------------------------------------------------------------------------

View File

@@ -1617,8 +1617,15 @@ async def stream_chat_completion_sdk(
# retries were consumed vs "sdk_stream_error" for non-context
# errors that broke the loop immediately (network, auth, etc.).
safe_err = str(stream_err).replace("\n", " ").replace("\r", "")[:500]
if attempts_exhausted:
error_text = (
"Your conversation is too long. "
"Please start a new chat or clear some history."
)
else:
error_text = f"SDK stream error: {safe_err}"
yield StreamError(
errorText=f"SDK stream error: {safe_err}",
errorText=error_text,
code=(
"all_attempts_exhausted"
if attempts_exhausted