fix(copilot): user-friendly exhaustion error, reduce E2B timeout, add events_yielded guard test

- Replace raw SDK error text with an actionable message when all retry attempts are exhausted ("Your conversation is too long...")
- Reduce e2b_sandbox_timeout from 600s (10 min) to 420s (7 min)
- Add TestEventsYieldedGuard tests verifying the retry loop breaks immediately when events have already been sent to the frontend
2026-04-08 03:00:28 -04:00 · 2026-03-15 13:50:30 +07:00
parent f1151c5cc1
commit 29efcfb280
3 changed files with 94 additions and 2 deletions
--- a/autogpt_platform/backend/backend/copilot/config.py
+++ b/autogpt_platform/backend/backend/copilot/config.py
@@ -115,7 +115,7 @@ class ChatConfig(BaseSettings):
        description="E2B sandbox template to use for copilot sessions.",
    )
    e2b_sandbox_timeout: int = Field(
-        default=600,  # 10 min safety net — allows headroom for compaction retries
+        default=420,  # 7 min safety net — allows headroom for compaction retries
        description="E2B sandbox running-time timeout (seconds). "
        "E2B timeout is wall-clock (not idle). Explicit per-turn pause is the primary "
        "mechanism; this is the safety net.",
--- a/autogpt_platform/backend/backend/copilot/sdk/retry_scenarios_test.py
+++ b/autogpt_platform/backend/backend/copilot/sdk/retry_scenarios_test.py
@@ -569,6 +569,91 @@ class TestRetryStateMachine:
        assert state["stream_completed"] is False


+# ---------------------------------------------------------------------------
+# Scenario 9: events_yielded > 0 prevents retry
+# ---------------------------------------------------------------------------
+
+
+class TestEventsYieldedGuard:
+    """When events have already been yielded to the frontend, retrying would
+    produce duplicate/inconsistent output.  The retry loop must break
+    immediately with an error instead of continuing to the next attempt."""
+
+    def _simulate_retry_with_events_yielded(
+        self,
+        events_yielded_per_attempt: list[int],
+        transcript_content: str = "some_content",
+    ) -> dict:
+        """Simulate the retry loop with explicit events_yielded counts.
+
+        Args:
+            events_yielded_per_attempt: Number of non-heartbeat events yielded
+                before the error on each attempt.  Only the first attempt that
+                errors with events_yielded > 0 matters — the loop should break.
+            transcript_content: Initial transcript content (currently unused).
+        """
+        stream_err: Exception | None = None
+        ended_with_stream_error = False
+        attempts_made = 0
+
+        for attempt in range(
+            min(_MAX_STREAM_ATTEMPTS, len(events_yielded_per_attempt))
+        ):
+            attempts_made += 1
+            events_yielded = events_yielded_per_attempt[attempt]
+
+            # Simulate stream error
+            stream_err = Exception("simulated stream error")
+            is_context_error = True
+
+            if events_yielded > 0:
+                # This is the guard under test: when events have been
+                # yielded, the loop breaks immediately — no retry.
+                ended_with_stream_error = True
+                break
+
+            if not is_context_error:
+                ended_with_stream_error = True
+                break
+
+            # Would continue to next attempt
+            continue
+        else:
+            ended_with_stream_error = True
+
+        return {
+            "attempts_made": attempts_made,
+            "stream_err": stream_err,
+            "ended_with_stream_error": ended_with_stream_error,
+        }
+
+    def test_events_yielded_prevents_retry(self):
+        """When events were yielded on attempt 1, no retry should happen."""
+        state = self._simulate_retry_with_events_yielded([5])
+        assert state["attempts_made"] == 1
+        assert state["ended_with_stream_error"] is True
+        assert state["stream_err"] is not None
+
+    def test_zero_events_allows_retry(self):
+        """When no events were yielded on attempt 1, retry should proceed."""
+        state = self._simulate_retry_with_events_yielded([0, 0, 0])
+        assert state["attempts_made"] == 3
+        assert state["ended_with_stream_error"] is True  # all exhausted
+
+    def test_events_on_second_attempt_stops_retry(self):
+        """Attempt 1: 0 events (retry allowed).
+        Attempt 2: events yielded (no further retry)."""
+        state = self._simulate_retry_with_events_yielded([0, 3])
+        assert state["attempts_made"] == 2
+        assert state["ended_with_stream_error"] is True
+
+    def test_single_event_is_enough_to_prevent_retry(self):
+        """Even a single non-heartbeat event should prevent retry."""
+        state = self._simulate_retry_with_events_yielded([1])
+        assert state["attempts_made"] == 1
+        assert state["ended_with_stream_error"] is True
+
+
 # ---------------------------------------------------------------------------
 # Edge cases
 # ---------------------------------------------------------------------------
--- a/autogpt_platform/backend/backend/copilot/sdk/service.py
+++ b/autogpt_platform/backend/backend/copilot/sdk/service.py
@@ -1617,8 +1617,15 @@ async def stream_chat_completion_sdk(
            # retries were consumed vs "sdk_stream_error" for non-context
            # errors that broke the loop immediately (network, auth, etc.).
            safe_err = str(stream_err).replace("\n", " ").replace("\r", "")[:500]
+            if attempts_exhausted:
+                error_text = (
+                    "Your conversation is too long. "
+                    "Please start a new chat or clear some history."
+                )
+            else:
+                error_text = f"SDK stream error: {safe_err}"
            yield StreamError(
-                errorText=f"SDK stream error: {safe_err}",
+                errorText=error_text,
                code=(
                    "all_attempts_exhausted"
                    if attempts_exhausted