From c6a31cb501ca12886fa13d66c926cc43d001b38b Mon Sep 17 00:00:00 2001 From: majdyz Date: Fri, 10 Apr 2026 15:15:52 +0000 Subject: [PATCH 01/30] feat(copilot): inject user messages mid-turn via pending buffer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a user sends a follow-up message while a copilot turn is still streaming, we now queue it into a per-session Redis buffer and let the executor currently processing the turn drain it between tool-call rounds — the model sees the new message before its next LLM call. Previously such messages were blocked at the RabbitMQ/cluster-lock layer and only processed after the current turn completed. ### New module `backend/copilot/pending_messages.py` - Redis list buffer keyed by ``copilot:pending:{session_id}`` - Pub/sub notify channel as a wake-up hint for future blocking-wait use - Cap of ``MAX_PENDING_MESSAGES=10`` — trims oldest on overflow - 1h TTL matches ``stream_ttl`` default - Helpers: ``push_pending_message``, ``drain_pending_messages``, ``peek_pending_count``, ``clear_pending_messages``, ``format_pending_as_user_message`` ### New endpoint `POST /sessions/{session_id}/messages/pending` - Returns 202 + current buffer length - Persists the message to the DB so it's in the transcript immediately - Sanitises file IDs against the caller's workspace - Does NOT start a new turn (unlike ``stream``) ### Baseline path (simple — in-process injection) `backend/copilot/baseline/service.py` - Between iterations of ``tool_call_loop``, drain pending and append to the shared ``openai_messages`` list so the loop picks them up on the next LLM call - Persist session via ``upsert_chat_session`` after injection - Finally-block safety net clears the buffer on early exit ### SDK path (in-process injection via live client.query) `backend/copilot/sdk/service.py` - When the SDK loop detects ``acc.stream_completed``, before breaking, drain pending and send them via the existing open ``client.query()`` as a new user message; reset ``stream_completed`` to ``False`` and ``continue`` the async-for loop so we keep consuming CLI messages - Combines multiple drained messages into a single ``query()`` call via ``_combine_pending_messages`` to preserve ordering - Finally-block safety net clears the buffer on early exit - This works because the Claude Agent SDK's ``ClaudeSDKClient`` is a long-lived connection: ``query()`` writes a new user message to the CLI's stdin and the same ``receive_response()`` stream picks up the next turn's events, so we keep session continuity without releasing the cluster lock or restarting the subprocess ### Tests `backend/copilot/pending_messages_test.py` - FakeRedis + FakePipeline so tests don't need a live Redis - Covers push/drain, ordering, buffer cap (MAX_PENDING_MESSAGES), clear, publish hook, malformed-payload handling, and the format helper (plain / with context / with file_ids) Co-Authored-By: Claude Opus 4.6 (1M context) --- .../backend/api/features/chat/routes.py | 99 ++++++++ .../backend/copilot/baseline/service.py | 48 ++++ .../backend/copilot/pending_messages.py | 196 +++++++++++++++ .../backend/copilot/pending_messages_test.py | 233 ++++++++++++++++++ .../backend/backend/copilot/sdk/service.py | 69 ++++++ 5 files changed, 645 insertions(+) create mode 100644 autogpt_platform/backend/backend/copilot/pending_messages.py create mode 100644 autogpt_platform/backend/backend/copilot/pending_messages_test.py diff --git a/autogpt_platform/backend/backend/api/features/chat/routes.py 
b/autogpt_platform/backend/backend/api/features/chat/routes.py index 57a7b9a204..a1eebdd6e3 100644 --- a/autogpt_platform/backend/backend/api/features/chat/routes.py +++ b/autogpt_platform/backend/backend/api/features/chat/routes.py @@ -29,6 +29,11 @@ from backend.copilot.model import ( get_user_sessions, update_session_title, ) +from backend.copilot.pending_messages import ( + MAX_PENDING_MESSAGES, + PendingMessage, + push_pending_message, +) from backend.copilot.rate_limit import ( CoPilotUsageStatus, RateLimitExceeded, @@ -119,6 +124,26 @@ class StreamChatRequest(BaseModel): ) +class QueuePendingMessageRequest(BaseModel): + """Request model for queueing a message into an in-flight turn. + + Unlike ``StreamChatRequest`` this endpoint does **not** start a new + turn — the message is appended to a per-session pending buffer that + the executor currently processing the turn will drain between tool + rounds. + """ + + message: str = Field(min_length=1) + context: dict[str, str] | None = None + file_ids: list[str] | None = Field(default=None, max_length=20) + + +class QueuePendingMessageResponse(BaseModel): + queued: bool + buffer_length: int + message: str + + class CreateSessionRequest(BaseModel): """Request model for creating a new chat session. @@ -1012,6 +1037,80 @@ async def stream_chat_post( ) +@router.post( + "/sessions/{session_id}/messages/pending", + response_model=QueuePendingMessageResponse, + status_code=202, +) +async def queue_pending_message( + session_id: str, + request: QueuePendingMessageRequest, + user_id: str = Security(auth.get_user_id), +): + """Queue a new user message into an in-flight copilot turn. + + When a user sends a follow-up message while a turn is still + streaming, we don't want to block them or start a separate turn — + this endpoint appends the message to a per-session pending buffer + that the executor currently processing the turn will drain between + tool-call rounds, injecting it into the conversation before the + model's next LLM call. + + Returns 202 with the new buffer length on success. If the buffer + is full (``MAX_PENDING_MESSAGES``), the oldest pending message is + evicted to make room for the new one — the newest message always + wins. + + Intended for the frontend "send while streaming" flow. If no turn + is currently in flight the message is still queued — the next turn + the user starts will pick it up before its first LLM call. + """ + await _validate_and_get_session(session_id, user_id) + + # Persist the message to the session immediately so it shows up in + # the transcript even before the executor drains the buffer. + chat_msg = ChatMessage(role="user", content=request.message) + if user_id: + track_user_message( + user_id=user_id, + session_id=session_id, + message_length=len(request.message), + ) + await append_and_save_message(session_id, chat_msg) + + # Sanitise file IDs to the user's own workspace (same logic as + # stream_chat_post) so injection doesn't surface other users' files. 
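+ # Concretely: non-UUID strings are discarded first, then the DB
+ # query keeps only IDs that resolve to a non-deleted file in the
+ # caller's own workspace; everything else is silently dropped.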
+ sanitized_file_ids: list[str] = [] + if request.file_ids and user_id: + valid_ids = [fid for fid in request.file_ids if _UUID_RE.match(fid)] + if valid_ids: + workspace = await get_or_create_workspace(user_id) + files = await UserWorkspaceFile.prisma().find_many( + where={ + "id": {"in": valid_ids}, + "workspaceId": workspace.id, + "isDeleted": False, + } + ) + sanitized_file_ids = [wf.id for wf in files] + + pending = PendingMessage( + content=request.message, + file_ids=sanitized_file_ids, + context=request.context, + ) + buffer_length = await push_pending_message(session_id, pending) + + return QueuePendingMessageResponse( + queued=True, + buffer_length=buffer_length, + message=( + f"Queued — will be injected into the current turn " + f"(buffer: {buffer_length}/{MAX_PENDING_MESSAGES})" + ), + ) + + @router.get( "/sessions/{session_id}/stream", ) diff --git a/autogpt_platform/backend/backend/copilot/baseline/service.py b/autogpt_platform/backend/backend/copilot/baseline/service.py index a8044d80b7..1658d93eb1 100644 --- a/autogpt_platform/backend/backend/copilot/baseline/service.py +++ b/autogpt_platform/backend/backend/copilot/baseline/service.py @@ -35,6 +35,11 @@ from backend.copilot.model import ( maybe_append_user_message, upsert_chat_session, ) +from backend.copilot.pending_messages import ( + clear_pending_messages, + drain_pending_messages, + format_pending_as_user_message, +) from backend.copilot.prompting import get_baseline_supplement, get_graphiti_supplement from backend.copilot.response_model import ( StreamBaseResponse, @@ -1160,6 +1165,35 @@ async def stream_chat_completion_baseline( yield evt state.pending_events.clear() + # Inject any messages the user queued while the turn was + # running. ``tool_call_loop`` mutates ``openai_messages`` + # in-place, so appending here means the model sees the new + # messages before its next LLM call. Also persist them to + # the ChatSession so they're part of the durable transcript. + pending = await drain_pending_messages(session_id) + if pending: + for pm in pending: + maybe_append_user_message( + session, pm.content, is_user_message=True + ) + openai_messages.append(format_pending_as_user_message(pm)) + transcript_builder.append_user(content=pm.content) + try: + await upsert_chat_session(session) + except Exception as persist_err: + logger.warning( + "[Baseline] Failed to persist pending messages for " + "session %s: %s", + session_id, + persist_err, + ) + logger.info( + "[Baseline] Injected %d pending message(s) into " + "session %s mid-turn", + len(pending), + session_id, + ) + if loop_result and not loop_result.finished_naturally: limit_msg = ( f"Exceeded {_MAX_TOOL_ROUNDS} tool-call rounds " @@ -1200,6 +1234,20 @@ async def stream_chat_completion_baseline( yield StreamError(errorText=error_msg, code="baseline_error") # Still persist whatever we got finally: + # Safety net — if the stream exited early (error, timeout, etc.) + # we may still have queued pending messages in the buffer. Drop + # them so they don't leak into the next turn. During normal + # completion the tool-call loop drain will already have cleared + # the buffer, so this is a no-op in the happy path. 
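+ # Best-effort cleanup: a failed clear is only logged, since the
+ # worst case is a stale message leaking into the next turn, which
+ # is preferable to masking the error that ended this stream.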
+ try: + await clear_pending_messages(session_id) + except Exception as clear_err: + logger.warning( + "[Baseline] Failed to clear pending messages for %s: %s", + session_id, + clear_err, + ) + # Set cost attributes on OTEL span before closing if _trace_ctx is not None: try: diff --git a/autogpt_platform/backend/backend/copilot/pending_messages.py b/autogpt_platform/backend/backend/copilot/pending_messages.py new file mode 100644 index 0000000000..0930a87e2d --- /dev/null +++ b/autogpt_platform/backend/backend/copilot/pending_messages.py @@ -0,0 +1,196 @@ +"""Pending-message buffer for in-flight copilot turns. + +When a user sends a new message while a copilot turn is already executing, +instead of blocking the frontend (or queueing a brand-new turn after the +current one finishes), we want the new message to be *injected into the +running turn* — appended between tool-call rounds so the model sees it +before its next LLM call. + +This module provides the cross-process buffer that makes that possible: + +- **Producer** (chat API route): pushes a pending message to Redis and + publishes a notification on a pub/sub channel. +- **Consumer** (executor running the turn): on each tool-call round, + drains the buffer and appends the pending messages to the conversation. + +The Redis list is the durable store; the pub/sub channel is a fast +wake-up hint for long-idle consumers (not used by default, but available +for future blocking-wait semantics). + +A hard cap of ``MAX_PENDING_MESSAGES`` per session prevents abuse. The +buffer is trimmed to the latest ``MAX_PENDING_MESSAGES`` on every push. +""" + +import json +import logging +import time +from typing import Any, cast + +from pydantic import BaseModel, Field + +from backend.data.redis_client import get_redis_async + +logger = logging.getLogger(__name__) + +# Per-session cap. Higher values risk a runaway consumer; lower values +# risk dropping user input under heavy typing. 10 was chosen as a +# reasonable ceiling — a user typing faster than the copilot can drain +# between tool rounds is already an unusual usage pattern. +MAX_PENDING_MESSAGES = 10 + +# Redis key + TTL. The buffer is ephemeral: if a turn completes or the +# executor dies, the pending messages should either have been drained +# already or are safe to drop (the user can resend). +_PENDING_KEY_PREFIX = "copilot:pending:" +_PENDING_CHANNEL_PREFIX = "copilot:pending:notify:" +_PENDING_TTL_SECONDS = 3600 # 1 hour — matches stream_ttl default + + +class PendingMessage(BaseModel): + """A user message queued for injection into an in-flight turn.""" + + content: str = Field(min_length=1) + file_ids: list[str] = Field(default_factory=list) + context: dict[str, str] | None = None + # Unix epoch seconds at enqueue time, for ordering and debugging. + enqueued_at: float = Field(default_factory=time.time) + + +def _buffer_key(session_id: str) -> str: + return f"{_PENDING_KEY_PREFIX}{session_id}" + + +def _notify_channel(session_id: str) -> str: + return f"{_PENDING_CHANNEL_PREFIX}{session_id}" + + +async def push_pending_message( + session_id: str, + message: PendingMessage, +) -> int: + """Append a pending message to the session's buffer. + + Returns the new buffer length. Enforces ``MAX_PENDING_MESSAGES`` by + trimming from the left (oldest) — the newest message always wins if + the user has been typing faster than the copilot can drain. 
+ """ + redis = await get_redis_async() + key = _buffer_key(session_id) + payload = message.model_dump_json() + + # Push + trim + expire in a pipeline so the three writes land atomically + # enough for this use case (pipelining doesn't guarantee atomicity + # across commands but ordering is preserved). + async with redis.pipeline(transaction=False) as pipe: + pipe.rpush(key, payload) + pipe.ltrim(key, -MAX_PENDING_MESSAGES, -1) + pipe.expire(key, _PENDING_TTL_SECONDS) + pipe.llen(key) + results = await pipe.execute() + + new_length = int(results[-1]) + + # Fire-and-forget notify. Subscribers use this as a wake-up hint; + # the buffer itself is authoritative so a lost notify is harmless. + try: + await redis.publish(_notify_channel(session_id), "1") + except Exception as e: # pragma: no cover + logger.warning("pending_messages: publish failed for %s: %s", session_id, e) + + logger.info( + "pending_messages: pushed message to session=%s (buffer_len=%d)", + session_id, + new_length, + ) + return new_length + + +async def drain_pending_messages(session_id: str) -> list[PendingMessage]: + """Atomically pop all pending messages for *session_id*. + + Returns them in enqueue order (oldest first). Uses ``LPOP`` with a + count so the read+delete is a single Redis round trip. If the list + is empty or missing, returns ``[]``. + """ + redis = await get_redis_async() + key = _buffer_key(session_id) + + # Redis LPOP with count (Redis 6.2+) returns None for missing key, + # empty list if we somehow race an empty key, or the popped items. + # redis-py's async lpop overload with a count collapses the return + # type in pyright; cast the awaitable so strict type-check stays + # clean without changing runtime behaviour. + lpop_result = await cast( + "Any", + redis.lpop(key, MAX_PENDING_MESSAGES), + ) + if not lpop_result: + return [] + raw_popped: list[Any] = list(lpop_result) + + # redis-py may return bytes or str depending on decode_responses. + decoded: list[str] = [ + item.decode("utf-8") if isinstance(item, bytes) else str(item) + for item in raw_popped + ] + + messages: list[PendingMessage] = [] + for payload in decoded: + try: + messages.append(PendingMessage(**json.loads(payload))) + except Exception as e: + logger.warning( + "pending_messages: dropping malformed entry for %s: %s", + session_id, + e, + ) + + if messages: + logger.info( + "pending_messages: drained %d messages for session=%s", + len(messages), + session_id, + ) + return messages + + +async def peek_pending_count(session_id: str) -> int: + """Return the current buffer length without consuming it.""" + redis = await get_redis_async() + length = await cast("Any", redis.llen(_buffer_key(session_id))) + return int(length) + + +async def clear_pending_messages(session_id: str) -> None: + """Drop the session's pending buffer. + + Called at the end of a turn (success or failure) so messages from a + previous turn don't leak into the next one. The buffer may already + have been drained inside the turn — this is a safety net. + """ + redis = await get_redis_async() + await redis.delete(_buffer_key(session_id)) + + +def format_pending_as_user_message(message: PendingMessage) -> dict[str, Any]: + """Shape a ``PendingMessage`` into the OpenAI-format user message dict. + + Used by the baseline tool-call loop when injecting the buffered + message into the conversation. Context/file metadata (if any) is + embedded into the content so the model sees everything in one block. 
+ """ + parts: list[str] = [message.content] + if message.context: + url = message.context.get("url") + if url: + parts.append(f"\n\n[Page URL: {url}]") + page_content = message.context.get("content") + if page_content: + parts.append(f"\n\n[Page content]\n{page_content}") + if message.file_ids: + parts.append( + "\n\n[Attached files]\n" + + "\n".join(f"- file_id={fid}" for fid in message.file_ids) + + "\nUse read_workspace_file with the file_id to access file contents." + ) + return {"role": "user", "content": "".join(parts)} diff --git a/autogpt_platform/backend/backend/copilot/pending_messages_test.py b/autogpt_platform/backend/backend/copilot/pending_messages_test.py new file mode 100644 index 0000000000..b03906f52a --- /dev/null +++ b/autogpt_platform/backend/backend/copilot/pending_messages_test.py @@ -0,0 +1,233 @@ +"""Tests for the copilot pending-messages buffer. + +Uses a fake async Redis client so the tests don't require a real Redis +instance (the backend test suite's DB/Redis fixtures are heavyweight +and pull in the full app startup). +""" + +import json +from typing import Any + +import pytest + +from backend.copilot import pending_messages as pm_module +from backend.copilot.pending_messages import ( + MAX_PENDING_MESSAGES, + PendingMessage, + clear_pending_messages, + drain_pending_messages, + format_pending_as_user_message, + peek_pending_count, + push_pending_message, +) + +# ── Fake Redis ────────────────────────────────────────────────────── + + +class _FakePipeline: + def __init__(self, parent: "_FakeRedis") -> None: + self._parent = parent + self._ops: list[tuple[str, tuple[Any, ...]]] = [] + + async def __aenter__(self) -> "_FakePipeline": + return self + + async def __aexit__(self, *args: object) -> None: + return None + + def rpush(self, key: str, value: Any) -> None: + self._ops.append(("rpush", (key, value))) + + def ltrim(self, key: str, start: int, stop: int) -> None: + self._ops.append(("ltrim", (key, start, stop))) + + def expire(self, key: str, ttl: int) -> None: + self._ops.append(("expire", (key, ttl))) + + def llen(self, key: str) -> None: + self._ops.append(("llen", (key,))) + + async def execute(self) -> list[Any]: + results: list[Any] = [] + for op, args in self._ops: + if op == "rpush": + key, value = args + self._parent.lists.setdefault(key, []).append(value) + results.append(len(self._parent.lists[key])) + elif op == "ltrim": + key, start, stop = args + lst = self._parent.lists.get(key, []) + # Emulate Redis LTRIM (-N, -1) = last N + if start < 0 and stop == -1: + self._parent.lists[key] = lst[start:] + else: + self._parent.lists[key] = lst[start : stop + 1] + results.append(True) + elif op == "expire": + results.append(True) + elif op == "llen": + key = args[0] + results.append(len(self._parent.lists.get(key, []))) + return results + + +class _FakeRedis: + def __init__(self) -> None: + self.lists: dict[str, list[str]] = {} + self.published: list[tuple[str, str]] = [] + + def pipeline(self, transaction: bool = False) -> _FakePipeline: + return _FakePipeline(self) + + async def publish(self, channel: str, payload: str) -> int: + self.published.append((channel, payload)) + return 1 + + async def lpop(self, key: str, count: int) -> list[str] | None: + lst = self.lists.get(key) + if not lst: + return None + popped = lst[:count] + self.lists[key] = lst[count:] + return popped + + async def llen(self, key: str) -> int: + return len(self.lists.get(key, [])) + + async def delete(self, key: str) -> int: + if key in self.lists: + del self.lists[key] + return 
1 + return 0 + + +@pytest.fixture() +def fake_redis(monkeypatch: pytest.MonkeyPatch) -> _FakeRedis: + redis = _FakeRedis() + + async def _get_redis_async() -> _FakeRedis: + return redis + + monkeypatch.setattr(pm_module, "get_redis_async", _get_redis_async) + return redis + + +# ── Basic push / drain ────────────────────────────────────────────── + + +@pytest.mark.asyncio +async def test_push_and_drain_single_message(fake_redis: _FakeRedis) -> None: + length = await push_pending_message("sess1", PendingMessage(content="hello")) + assert length == 1 + assert await peek_pending_count("sess1") == 1 + + drained = await drain_pending_messages("sess1") + assert len(drained) == 1 + assert drained[0].content == "hello" + assert await peek_pending_count("sess1") == 0 + + +@pytest.mark.asyncio +async def test_push_and_drain_preserves_order(fake_redis: _FakeRedis) -> None: + for i in range(3): + await push_pending_message("sess2", PendingMessage(content=f"msg {i}")) + + drained = await drain_pending_messages("sess2") + assert [m.content for m in drained] == ["msg 0", "msg 1", "msg 2"] + + +@pytest.mark.asyncio +async def test_drain_empty_returns_empty_list(fake_redis: _FakeRedis) -> None: + assert await drain_pending_messages("nope") == [] + + +# ── Buffer cap ────────────────────────────────────────────────────── + + +@pytest.mark.asyncio +async def test_cap_drops_oldest_when_exceeded(fake_redis: _FakeRedis) -> None: + # Push MAX_PENDING_MESSAGES + 3 messages + for i in range(MAX_PENDING_MESSAGES + 3): + await push_pending_message("sess3", PendingMessage(content=f"m{i}")) + + # Buffer should be clamped to MAX + assert await peek_pending_count("sess3") == MAX_PENDING_MESSAGES + + drained = await drain_pending_messages("sess3") + assert len(drained) == MAX_PENDING_MESSAGES + # Oldest 3 dropped — we should only see m3..m(MAX+2) + assert drained[0].content == "m3" + assert drained[-1].content == f"m{MAX_PENDING_MESSAGES + 2}" + + +# ── Clear ─────────────────────────────────────────────────────────── + + +@pytest.mark.asyncio +async def test_clear_removes_buffer(fake_redis: _FakeRedis) -> None: + await push_pending_message("sess4", PendingMessage(content="x")) + await push_pending_message("sess4", PendingMessage(content="y")) + await clear_pending_messages("sess4") + assert await peek_pending_count("sess4") == 0 + + +@pytest.mark.asyncio +async def test_clear_is_idempotent(fake_redis: _FakeRedis) -> None: + # Clearing an already-empty buffer should not raise + await clear_pending_messages("sess_empty") + await clear_pending_messages("sess_empty") + + +# ── Publish hook ──────────────────────────────────────────────────── + + +@pytest.mark.asyncio +async def test_push_publishes_notification(fake_redis: _FakeRedis) -> None: + await push_pending_message("sess5", PendingMessage(content="hi")) + assert ("copilot:pending:notify:sess5", "1") in fake_redis.published + + +# ── Format helper ─────────────────────────────────────────────────── + + +def test_format_pending_plain_text() -> None: + msg = PendingMessage(content="just text") + out = format_pending_as_user_message(msg) + assert out == {"role": "user", "content": "just text"} + + +def test_format_pending_with_context_url() -> None: + msg = PendingMessage( + content="see this page", + context={"url": "https://example.com"}, + ) + out = format_pending_as_user_message(msg) + assert out["role"] == "user" + assert "see this page" in out["content"] + assert "https://example.com" in out["content"] + + +def test_format_pending_with_file_ids() -> None: + msg = 
PendingMessage(content="look here", file_ids=["a", "b"]) + out = format_pending_as_user_message(msg) + assert "file_id=a" in out["content"] + assert "file_id=b" in out["content"] + + +# ── Malformed payload handling ────────────────────────────────────── + + +@pytest.mark.asyncio +async def test_drain_skips_malformed_entries( + fake_redis: _FakeRedis, +) -> None: + # Seed the fake with a mix of valid and malformed payloads + fake_redis.lists["copilot:pending:bad"] = [ + json.dumps({"content": "valid"}), + "{not valid json", + json.dumps({"content": "also valid", "file_ids": ["a"]}), + ] + drained = await drain_pending_messages("bad") + assert len(drained) == 2 + assert drained[0].content == "valid" + assert drained[1].content == "also valid" diff --git a/autogpt_platform/backend/backend/copilot/sdk/service.py b/autogpt_platform/backend/backend/copilot/sdk/service.py index c2a60a8ba0..feaaabe0ce 100644 --- a/autogpt_platform/backend/backend/copilot/sdk/service.py +++ b/autogpt_platform/backend/backend/copilot/sdk/service.py @@ -34,6 +34,11 @@ from opentelemetry import trace as otel_trace from pydantic import BaseModel from backend.copilot.context import get_workspace_manager +from backend.copilot.pending_messages import ( + PendingMessage, + clear_pending_messages, + drain_pending_messages, +) from backend.copilot.permissions import apply_tool_permissions from backend.copilot.rate_limit import get_user_tier from backend.copilot.transcript import ( @@ -213,6 +218,25 @@ def _is_prompt_too_long(err: BaseException) -> bool: return False +def _combine_pending_messages(pending: list[PendingMessage]) -> str: + """Merge drained pending messages into a single user-message body. + + The Claude Agent SDK's ``client.query()`` takes a plain string (or + an async iterable); the simplest way to preserve ordering across + multiple drained messages is to concatenate them with a separator + and send a single ``query()`` call. If there's only one message, + its ``content`` is returned verbatim so the transcript stays clean. + """ + if len(pending) == 1: + return pending[0].content + parts: list[str] = [] + for idx, msg in enumerate(pending, start=1): + header = f"[Additional message {idx}]" if idx > 1 else "" + body = msg.content + parts.append(f"{header}\n{body}".lstrip("\n") if header else body) + return "\n\n".join(parts) + + def _is_sdk_disconnect_error(exc: BaseException) -> bool: """Return True if *exc* is an expected SDK cleanup error from client disconnect. @@ -1784,6 +1808,39 @@ async def _run_stream_attempt( _msgs_since_flush = 0 if acc.stream_completed: + # Before exiting the loop, check if the user queued any + # follow-up messages while this turn was running. If so, + # send them to the same live SDK client as a new query + # and reset the stream completion state so we keep + # consuming CLI messages. This avoids releasing the + # cluster lock and requeueing — the pending messages + # flow directly into the existing conversation. 
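+ # The drain is a single atomic LPOP, so anything pushed after
+ # this point simply stays buffered for the next drain window.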
+ pending = await drain_pending_messages(ctx.session_id) + if pending: + logger.info( + "%s Injecting %d pending message(s) mid-turn", + ctx.log_prefix, + len(pending), + ) + injected_text = _combine_pending_messages(pending) + injected_chat_msg = ChatMessage(role="user", content=injected_text) + ctx.session.messages.append(injected_chat_msg) + state.transcript_builder.append_user(content=injected_text) + try: + await asyncio.shield(upsert_chat_session(ctx.session)) + except Exception as persist_err: + logger.warning( + "%s Failed to persist injected pending message: %s", + ctx.log_prefix, + persist_err, + ) + await client.query(injected_text, session_id=ctx.session_id) + # Reset turn-level state so the next ResultMessage + # ends the injected turn cleanly instead of + # re-completing the previous one. + acc.stream_completed = False + _last_real_msg_time = time.monotonic() + continue break finally: await _safe_close_sdk_client(sdk_client, ctx.log_prefix) @@ -2726,6 +2783,18 @@ async def stream_chat_completion_sdk( raise finally: + # Safety net — drop any pending messages still in the buffer. + # During normal completion the mid-turn drain already cleared + # them; this handles early exits (errors, cancellation, retry). + try: + await clear_pending_messages(session_id) + except Exception as _clear_err: + logger.warning( + "Failed to clear pending messages for %s: %s", + session_id, + _clear_err, + ) + # --- Close OTEL context (with cost attributes) --- if _otel_ctx is not None: try: From cafe49f29580dec1b1ae368beed6f81a4bcada80 Mon Sep 17 00:00:00 2001 From: majdyz Date: Fri, 10 Apr 2026 15:37:40 +0000 Subject: [PATCH 02/30] fix(copilot): address round 1 review on pending-messages feature MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Critical fix — the SDK mid-stream injection was structurally broken. ``ClaudeSDKClient.receive_response()`` explicitly returns after the first ``ResultMessage``, so re-issuing ``client.query()`` and setting ``acc.stream_completed = False`` could never restart the iteration — the next ``__anext__`` raised ``StopAsyncIteration`` and the injected turn's response was never consumed. Replaced the broken mid-stream path with a turn-start drain that works for both baseline and SDK. ### Changes **Atomic push via Lua EVAL** (``pending_messages.py``) - Replace the ``RPUSH`` + ``LTRIM`` + ``EXPIRE`` + ``LLEN`` pipeline (which was ``transaction=False`` and racy against concurrent ``LPOP``) with a single Lua script so the push is atomic. - Drop the unused ``enqueued_at`` field. - Add 16k ``max_length`` cap on ``PendingMessage.content``. **Baseline path** (``baseline/service.py``) - Drain at turn start (atomic ``LPOP``): any message queued while the session was idle or between turns is picked up before the first LLM call. - Mid-loop drain now skips the final ``tool_call_loop`` yield (``finished_naturally=True``) — draining there would append a user message the loop is about to exit past, silently losing it. - Inject via ``format_pending_as_user_message`` so file IDs + context are preserved in both ``openai_messages`` and the persisted session transcript (previously the DB copy lost file/context metadata). - Remove the ``finally`` ``clear_pending_messages`` — atomic drain at turn start means any late push belongs to the next turn; clearing here would racily clobber it. **SDK path** (``sdk/service.py``) - Remove the broken mid-stream injection block entirely. 
- Drain at turn start (same atomic ``LPOP``) and merge the drained messages into ``current_message`` before ``_build_query_message``, so the SDK CLI sees them as part of the initial user message. - Remove the ``finally`` ``clear_pending_messages``. - Delete the unused ``_combine_pending_messages`` helper. **Endpoint** (``api/features/chat/routes.py``) - Enforce ``check_rate_limit`` / ``get_global_rate_limits`` — was bypassing per-user daily/weekly token limits that ``/stream`` enforces. - ``QueuePendingMessageRequest`` gets ``extra="forbid"`` and ``message: max_length=16_000``. - Push-first, persist-second: if the Redis push fails we raise 5xx; previously the session DB got an orphan user message with no corresponding queued entry and a retry would duplicate it. - Log a warning when sanitised file IDs drop unknown entries. - Persisted message content now uses ``format_pending_as_user_message`` so the session copy matches what the model actually sees on drain. - Response returns ``buffer_length``, ``max_buffer_length``, and ``turn_in_flight`` so the frontend can show accurate feedback about whether the message will hit the current turn or the next one. **Tests** (``pending_messages_test.py``) - ``_FakeRedis.eval`` emulates the Lua push script so the existing push/drain/cap tests keep working under the new atomic path. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../backend/api/features/chat/routes.py | 101 ++++++++++++++---- .../backend/copilot/baseline/service.py | 61 +++++++---- .../backend/copilot/pending_messages.py | 48 ++++++--- .../backend/copilot/pending_messages_test.py | 19 ++++ .../backend/backend/copilot/sdk/service.py | 92 +++++----------- 5 files changed, 199 insertions(+), 122 deletions(-) diff --git a/autogpt_platform/backend/backend/api/features/chat/routes.py b/autogpt_platform/backend/backend/api/features/chat/routes.py index a1eebdd6e3..b2269b0964 100644 --- a/autogpt_platform/backend/backend/api/features/chat/routes.py +++ b/autogpt_platform/backend/backend/api/features/chat/routes.py @@ -32,6 +32,7 @@ from backend.copilot.model import ( from backend.copilot.pending_messages import ( MAX_PENDING_MESSAGES, PendingMessage, + format_pending_as_user_message, push_pending_message, ) from backend.copilot.rate_limit import ( @@ -133,15 +134,28 @@ class QueuePendingMessageRequest(BaseModel): rounds. """ - message: str = Field(min_length=1) - context: dict[str, str] | None = None + model_config = ConfigDict(extra="forbid") + + message: str = Field(min_length=1, max_length=16_000) + context: dict[str, str] | None = Field( + default=None, + description="Optional page context: expected keys are 'url' and 'content'.", + ) file_ids: list[str] | None = Field(default=None, max_length=20) class QueuePendingMessageResponse(BaseModel): + """Response for the pending-message endpoint. + + Clients should rely on ``queued`` / ``buffer_length`` / ``turn_in_flight`` + — the ``detail`` field is human-readable and may change without notice. 
+ """ + queued: bool buffer_length: int - message: str + max_buffer_length: int + turn_in_flight: bool + detail: str class CreateSessionRequest(BaseModel): @@ -1051,32 +1065,44 @@ async def queue_pending_message( When a user sends a follow-up message while a turn is still streaming, we don't want to block them or start a separate turn — - this endpoint appends the message to a per-session pending buffer - that the executor currently processing the turn will drain between - tool-call rounds, injecting it into the conversation before the - model's next LLM call. + this endpoint appends the message to a per-session pending buffer. + The executor currently running the turn (baseline path) drains the + buffer between tool-call rounds and appends the message to the + conversation before the next LLM call. On the SDK path the buffer + is drained at the *start* of the next turn (the long-lived + ``ClaudeSDKClient.receive_response`` iterator returns after a + ``ResultMessage`` so there is no safe point to inject mid-stream + into an existing connection). - Returns 202 with the new buffer length on success. If the buffer - is full (``MAX_PENDING_MESSAGES``), the oldest pending message is - evicted to make room for the new one — the newest message always - wins. - - Intended for the frontend "send while streaming" flow. If no turn - is currently in flight the message is still queued — the next turn - the user starts will pick it up before its first LLM call. + Returns 202. Enforces the same per-user daily/weekly token rate + limit as the regular ``/stream`` endpoint so a client can't bypass + it by batching messages through here. """ await _validate_and_get_session(session_id, user_id) - # Persist the message to the session immediately so it shows up in - # the transcript even before the executor drains the buffer. - chat_msg = ChatMessage(role="user", content=request.message) + # Pre-turn rate-limit check — mirrors stream_chat_post. Without + # this, a client could bypass per-turn token limits by batching + # their extra context through this endpoint while a cheap stream + # is in flight. + if user_id: + try: + daily_limit, weekly_limit, _tier = await get_global_rate_limits( + user_id, config.daily_token_limit, config.weekly_token_limit + ) + await check_rate_limit( + user_id=user_id, + daily_token_limit=daily_limit, + weekly_token_limit=weekly_limit, + ) + except RateLimitExceeded as e: + raise HTTPException(status_code=429, detail=str(e)) from e + if user_id: track_user_message( user_id=user_id, session_id=session_id, message_length=len(request.message), ) - await append_and_save_message(session_id, chat_msg) # Sanitise file IDs to the user's own workspace (same logic as # stream_chat_post) so injection doesn't surface other users' files. @@ -1093,7 +1119,18 @@ async def queue_pending_message( } ) sanitized_file_ids = [wf.id for wf in files] + if len(sanitized_file_ids) != len(valid_ids): + logger.warning( + "queue_pending_message: dropped %d file id(s) not in " + "caller's workspace (session=%s)", + len(valid_ids) - len(sanitized_file_ids), + session_id, + ) + # Push to Redis BEFORE writing to the session DB. If the push + # fails we raise 5xx and the client retries; ``append_and_save_message`` + # would otherwise leave an orphan user message persisted with no + # corresponding queued pending entry, and a retry would duplicate it. 
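+ # The reverse failure (push succeeds, DB write below fails) is the
+ # safer one: the model still sees the queued text on drain and only
+ # the transcript copy is missing, so push-first is the right order.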
pending = PendingMessage( content=request.message, file_ids=sanitized_file_ids, @@ -1101,12 +1138,32 @@ async def queue_pending_message( ) buffer_length = await push_pending_message(session_id, pending) + # Persist the message into the session transcript only after the + # push succeeds. The message content embeds file/context metadata + # via format_pending_as_user_message so the DB copy matches what + # the model will actually see on drain. + chat_msg = ChatMessage( + role="user", + content=format_pending_as_user_message(pending)["content"], + ) + await append_and_save_message(session_id, chat_msg) + + # Check whether a turn is currently running for UX feedback. + active_session = await stream_registry.get_session(session_id) + turn_in_flight = bool(active_session and active_session.status == "running") + return QueuePendingMessageResponse( queued=True, buffer_length=buffer_length, - message=( - f"Queued — will be injected into the current turn " - f"(buffer: {buffer_length}/{MAX_PENDING_MESSAGES})" + max_buffer_length=MAX_PENDING_MESSAGES, + turn_in_flight=turn_in_flight, + detail=( + ( + "Queued — will be injected into the current turn." + if turn_in_flight + else "Queued — will be injected at the start of the next turn." + ) + + f" buffer={buffer_length}/{MAX_PENDING_MESSAGES}" ), ) diff --git a/autogpt_platform/backend/backend/copilot/baseline/service.py b/autogpt_platform/backend/backend/copilot/baseline/service.py index 1658d93eb1..bb800c10c7 100644 --- a/autogpt_platform/backend/backend/copilot/baseline/service.py +++ b/autogpt_platform/backend/backend/copilot/baseline/service.py @@ -36,7 +36,6 @@ from backend.copilot.model import ( upsert_chat_session, ) from backend.copilot.pending_messages import ( - clear_pending_messages, drain_pending_messages, format_pending_as_user_message, ) @@ -933,6 +932,23 @@ async def stream_chat_completion_baseline( message_length=len(message or ""), ) + # Drain any messages the user queued via POST /messages/pending + # while this session was idle (or during a previous turn whose + # mid-loop drains missed them). Atomic LPOP guarantees that a + # concurrent push lands *after* the drain and stays queued for the + # next turn instead of being lost. Prepended to the session so + # the initial LLM call sees them. + drained_at_start = await drain_pending_messages(session_id) + if drained_at_start: + logger.info( + "[Baseline] Draining %d pending message(s) at turn start " "for session %s", + len(drained_at_start), + session_id, + ) + for _pm in drained_at_start: + _content = format_pending_as_user_message(_pm)["content"] + maybe_append_user_message(session, _content, is_user_message=True) + session = await upsert_chat_session(session) # Select model based on the per-request mode. 'fast' downgrades to @@ -1168,16 +1184,32 @@ async def stream_chat_completion_baseline( # Inject any messages the user queued while the turn was # running. ``tool_call_loop`` mutates ``openai_messages`` # in-place, so appending here means the model sees the new - # messages before its next LLM call. Also persist them to - # the ChatSession so they're part of the durable transcript. + # messages on its next LLM call. + # + # IMPORTANT: skip when the loop has already finished (no + # more LLM calls are coming). Draining here would silently + # lose the message because ``tool_call_loop`` is about to + # return on the next ``async for`` step — the user would + # see a 202 from the pending endpoint but the model would + # never actually read the text. 
Those messages stay in + # the buffer and will be picked up at the start of the + # next turn. + if loop_result is None or loop_result.finished_naturally: + continue pending = await drain_pending_messages(session_id) if pending: for pm in pending: + # ``format_pending_as_user_message`` embeds file + # attachments and context URL/page content into the + # content string so the in-session transcript is + # a faithful copy of what the model actually saw. + formatted = format_pending_as_user_message(pm) + content_for_db = formatted["content"] maybe_append_user_message( - session, pm.content, is_user_message=True + session, content_for_db, is_user_message=True ) - openai_messages.append(format_pending_as_user_message(pm)) - transcript_builder.append_user(content=pm.content) + openai_messages.append(formatted) + transcript_builder.append_user(content=content_for_db) try: await upsert_chat_session(session) except Exception as persist_err: @@ -1234,19 +1266,10 @@ async def stream_chat_completion_baseline( yield StreamError(errorText=error_msg, code="baseline_error") # Still persist whatever we got finally: - # Safety net — if the stream exited early (error, timeout, etc.) - # we may still have queued pending messages in the buffer. Drop - # them so they don't leak into the next turn. During normal - # completion the tool-call loop drain will already have cleared - # the buffer, so this is a no-op in the happy path. - try: - await clear_pending_messages(session_id) - except Exception as clear_err: - logger.warning( - "[Baseline] Failed to clear pending messages for %s: %s", - session_id, - clear_err, - ) + # Pending messages are drained atomically at turn start and + # between tool rounds, so there's nothing to clear in finally. + # Any message pushed after the final drain window stays in the + # buffer and gets picked up at the start of the next turn. # Set cost attributes on OTEL span before closing if _trace_ctx is not None: diff --git a/autogpt_platform/backend/backend/copilot/pending_messages.py b/autogpt_platform/backend/backend/copilot/pending_messages.py index 0930a87e2d..ea0ae6bc4c 100644 --- a/autogpt_platform/backend/backend/copilot/pending_messages.py +++ b/autogpt_platform/backend/backend/copilot/pending_messages.py @@ -23,7 +23,6 @@ buffer is trimmed to the latest ``MAX_PENDING_MESSAGES`` on every push. import json import logging -import time from typing import Any, cast from pydantic import BaseModel, Field @@ -49,11 +48,9 @@ _PENDING_TTL_SECONDS = 3600 # 1 hour — matches stream_ttl default class PendingMessage(BaseModel): """A user message queued for injection into an in-flight turn.""" - content: str = Field(min_length=1) + content: str = Field(min_length=1, max_length=16_000) file_ids: list[str] = Field(default_factory=list) context: dict[str, str] | None = None - # Unix epoch seconds at enqueue time, for ordering and debugging. - enqueued_at: float = Field(default_factory=time.time) def _buffer_key(session_id: str) -> str: @@ -64,31 +61,50 @@ def _notify_channel(session_id: str) -> str: return f"{_PENDING_CHANNEL_PREFIX}{session_id}" +# Lua script: push-then-trim-then-expire-then-length, atomically. +# Running these four commands via a single EVAL guarantees a concurrent +# LPOP drain lands either entirely before the push (returns 0 from +# our earlier LLEN) or entirely after it (sees the new message) — +# never in the middle of a partial state. 
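+# KEYS[1] = buffer key. ARGV[1] = JSON payload, ARGV[2] = max buffer
+# length, ARGV[3] = TTL in seconds. Returns the post-push list length.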
+_PUSH_LUA = """ +redis.call('RPUSH', KEYS[1], ARGV[1]) +redis.call('LTRIM', KEYS[1], -tonumber(ARGV[2]), -1) +redis.call('EXPIRE', KEYS[1], tonumber(ARGV[3])) +return redis.call('LLEN', KEYS[1]) +""" + + async def push_pending_message( session_id: str, message: PendingMessage, ) -> int: - """Append a pending message to the session's buffer. + """Append a pending message to the session's buffer atomically. Returns the new buffer length. Enforces ``MAX_PENDING_MESSAGES`` by trimming from the left (oldest) — the newest message always wins if the user has been typing faster than the copilot can drain. + + The push + trim + expire + llen are wrapped in a single Lua EVAL so + concurrent LPOP drains from the executor never observe a partial + state. """ redis = await get_redis_async() key = _buffer_key(session_id) payload = message.model_dump_json() - # Push + trim + expire in a pipeline so the three writes land atomically - # enough for this use case (pipelining doesn't guarantee atomicity - # across commands but ordering is preserved). - async with redis.pipeline(transaction=False) as pipe: - pipe.rpush(key, payload) - pipe.ltrim(key, -MAX_PENDING_MESSAGES, -1) - pipe.expire(key, _PENDING_TTL_SECONDS) - pipe.llen(key) - results = await pipe.execute() - - new_length = int(results[-1]) + new_length = int( + await cast( + "Any", + redis.eval( + _PUSH_LUA, + 1, + key, + payload, + str(MAX_PENDING_MESSAGES), + str(_PENDING_TTL_SECONDS), + ), + ) + ) # Fire-and-forget notify. Subscribers use this as a wake-up hint; # the buffer itself is authoritative so a lost notify is harmless. diff --git a/autogpt_platform/backend/backend/copilot/pending_messages_test.py b/autogpt_platform/backend/backend/copilot/pending_messages_test.py index b03906f52a..7fec16c708 100644 --- a/autogpt_platform/backend/backend/copilot/pending_messages_test.py +++ b/autogpt_platform/backend/backend/copilot/pending_messages_test.py @@ -79,6 +79,25 @@ class _FakeRedis: def pipeline(self, transaction: bool = False) -> _FakePipeline: return _FakePipeline(self) + async def eval(self, script: str, num_keys: int, *args: Any) -> Any: + """Emulate the push Lua script. + + The real Lua script runs atomically in Redis; the fake + implementation just runs the equivalent list operations in + order and returns the final LLEN. That's enough to exercise + the cap + ordering invariants the tests care about. 
+ """ + key = args[0] + payload = args[1] + max_len = int(args[2]) + # ARGV[3] is TTL — fake doesn't enforce expiry + lst = self.lists.setdefault(key, []) + lst.append(payload) + if len(lst) > max_len: + # RPUSH + LTRIM(-N, -1) = keep only last N + self.lists[key] = lst[-max_len:] + return len(self.lists[key]) + async def publish(self, channel: str, payload: str) -> int: self.published.append((channel, payload)) return 1 diff --git a/autogpt_platform/backend/backend/copilot/sdk/service.py b/autogpt_platform/backend/backend/copilot/sdk/service.py index feaaabe0ce..7d13b24925 100644 --- a/autogpt_platform/backend/backend/copilot/sdk/service.py +++ b/autogpt_platform/backend/backend/copilot/sdk/service.py @@ -35,9 +35,8 @@ from pydantic import BaseModel from backend.copilot.context import get_workspace_manager from backend.copilot.pending_messages import ( - PendingMessage, - clear_pending_messages, drain_pending_messages, + format_pending_as_user_message, ) from backend.copilot.permissions import apply_tool_permissions from backend.copilot.rate_limit import get_user_tier @@ -218,25 +217,6 @@ def _is_prompt_too_long(err: BaseException) -> bool: return False -def _combine_pending_messages(pending: list[PendingMessage]) -> str: - """Merge drained pending messages into a single user-message body. - - The Claude Agent SDK's ``client.query()`` takes a plain string (or - an async iterable); the simplest way to preserve ordering across - multiple drained messages is to concatenate them with a separator - and send a single ``query()`` call. If there's only one message, - its ``content`` is returned verbatim so the transcript stays clean. - """ - if len(pending) == 1: - return pending[0].content - parts: list[str] = [] - for idx, msg in enumerate(pending, start=1): - header = f"[Additional message {idx}]" if idx > 1 else "" - body = msg.content - parts.append(f"{header}\n{body}".lstrip("\n") if header else body) - return "\n\n".join(parts) - - def _is_sdk_disconnect_error(exc: BaseException) -> bool: """Return True if *exc* is an expected SDK cleanup error from client disconnect. @@ -1808,39 +1788,6 @@ async def _run_stream_attempt( _msgs_since_flush = 0 if acc.stream_completed: - # Before exiting the loop, check if the user queued any - # follow-up messages while this turn was running. If so, - # send them to the same live SDK client as a new query - # and reset the stream completion state so we keep - # consuming CLI messages. This avoids releasing the - # cluster lock and requeueing — the pending messages - # flow directly into the existing conversation. - pending = await drain_pending_messages(ctx.session_id) - if pending: - logger.info( - "%s Injecting %d pending message(s) mid-turn", - ctx.log_prefix, - len(pending), - ) - injected_text = _combine_pending_messages(pending) - injected_chat_msg = ChatMessage(role="user", content=injected_text) - ctx.session.messages.append(injected_chat_msg) - state.transcript_builder.append_user(content=injected_text) - try: - await asyncio.shield(upsert_chat_session(ctx.session)) - except Exception as persist_err: - logger.warning( - "%s Failed to persist injected pending message: %s", - ctx.log_prefix, - persist_err, - ) - await client.query(injected_text, session_id=ctx.session_id) - # Reset turn-level state so the next ResultMessage - # ends the injected turn cleanly instead of - # re-completing the previous one. 
- acc.stream_completed = False - _last_real_msg_time = time.monotonic() - continue break finally: await _safe_close_sdk_client(sdk_client, ctx.log_prefix) @@ -2328,6 +2275,28 @@ async def stream_chat_completion_sdk( if last_user: current_message = last_user[-1].content or "" + # Drain any messages the user queued via POST /messages/pending + # while the previous turn was running (or since the session was + # idle). Messages are drained ATOMICALLY — one LPOP with count + # removes them all at once, so a concurrent push lands *after* + # the drain and stays queued for the next turn instead of being + # lost between LPOP and clear. File IDs and context are + # preserved via format_pending_as_user_message. + pending_at_start = await drain_pending_messages(session_id) + if pending_at_start: + logger.info( + "%s Draining %d pending message(s) at turn start", + log_prefix, + len(pending_at_start), + ) + pending_texts: list[str] = [ + format_pending_as_user_message(pm)["content"] for pm in pending_at_start + ] + if current_message.strip(): + current_message = current_message + "\n\n" + "\n\n".join(pending_texts) + else: + current_message = "\n\n".join(pending_texts) + if not current_message.strip(): yield StreamError( errorText="Message cannot be empty.", @@ -2783,17 +2752,10 @@ async def stream_chat_completion_sdk( raise finally: - # Safety net — drop any pending messages still in the buffer. - # During normal completion the mid-turn drain already cleared - # them; this handles early exits (errors, cancellation, retry). - try: - await clear_pending_messages(session_id) - except Exception as _clear_err: - logger.warning( - "Failed to clear pending messages for %s: %s", - session_id, - _clear_err, - ) + # Pending messages are drained atomically at the start of each + # turn (see drain_pending_messages call above), so there's + # nothing to clean up here — any message pushed after that + # point belongs to the next turn. # --- Close OTEL context (with cost attributes) --- if _otel_ctx is not None: From f140e731501ef19837c31254d671ce3ffb2195ff Mon Sep 17 00:00:00 2001 From: majdyz Date: Fri, 10 Apr 2026 15:57:57 +0000 Subject: [PATCH 03/30] fix(copilot): address round 2 review on pending-messages feature MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Critical: SDK path was double-injecting. The endpoint persisted the message to ``session.messages`` AND the executor drained it from Redis and concatenated into ``current_message`` — the LLM saw each queued message twice (once via the compacted history / gap context that ``_build_query_message`` pulls from ``session.messages``, once via the new query). Baseline avoided this via ``maybe_append_user_message`` dedup but SDK had no equivalent guard. ### Fix: Redis is the single source of truth - Endpoint no longer persists to ``session.messages``. It only pushes to Redis and returns. - Baseline drain-at-start calls ``maybe_append_user_message`` (dedup is a safety net, not the primary guard). - SDK drain-at-start calls ``maybe_append_user_message`` too, so the durable transcript records the queued messages. The concatenation into ``current_message`` stays so the SDK CLI sees the content in the first user message of the new turn. ### Baseline max-iterations silent-loss — Fixed ``tool_call_loop`` yields ``finished_naturally=False`` when ``iteration == max_iterations`` then returns. 
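The guard added in the baseline diff below treats both cases as final:

    is_final_yield = (
        loop_result.finished_naturally
        or loop_result.iterations >= _MAX_TOOL_ROUNDS
    )
    if is_final_yield:
        continue  # leave queued messages for the next turn
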
Previously the drain only skipped ``finished_naturally=True``, so messages drained on the max-iterations final yield were appended to ``openai_messages`` and silently lost (the loop was already exiting). Now the drain also skips when ``loop_result.iterations >= _MAX_TOOL_ROUNDS``. ### API response cleanup - ``QueuePendingMessageResponse``: dropped ``queued`` (always True) and ``detail`` (human-readable, clients shouldn't parse). Kept ``buffer_length``, ``max_buffer_length``, and ``turn_in_flight``. ### Tests - Removed dead ``_FakePipeline`` class (the code switched to Lua EVAL in round 1 so the pipeline fake was unused). - Added ``test_drain_decodes_bytes_payloads`` so the ``bytes → str`` decode branch in ``drain_pending_messages`` is actually exercised (real redis-py returns bytes when ``decode_responses=False``). - Updated ``_FakeRedis.lists`` type hint to ``list[str | bytes]``. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../backend/api/features/chat/routes.py | 43 ++++------- .../backend/copilot/baseline/service.py | 25 +++++-- .../backend/copilot/pending_messages_test.py | 75 ++++++------------- .../backend/backend/copilot/sdk/service.py | 10 +++ 4 files changed, 65 insertions(+), 88 deletions(-) diff --git a/autogpt_platform/backend/backend/api/features/chat/routes.py b/autogpt_platform/backend/backend/api/features/chat/routes.py index b2269b0964..2e19ea8ca3 100644 --- a/autogpt_platform/backend/backend/api/features/chat/routes.py +++ b/autogpt_platform/backend/backend/api/features/chat/routes.py @@ -32,7 +32,6 @@ from backend.copilot.model import ( from backend.copilot.pending_messages import ( MAX_PENDING_MESSAGES, PendingMessage, - format_pending_as_user_message, push_pending_message, ) from backend.copilot.rate_limit import ( @@ -147,15 +146,17 @@ class QueuePendingMessageRequest(BaseModel): class QueuePendingMessageResponse(BaseModel): """Response for the pending-message endpoint. - Clients should rely on ``queued`` / ``buffer_length`` / ``turn_in_flight`` - — the ``detail`` field is human-readable and may change without notice. + - ``buffer_length``: how many messages are now in the session's + pending buffer (after this push) + - ``max_buffer_length``: the per-session cap (server-side constant) + - ``turn_in_flight``: ``True`` if a copilot turn was running when + we checked — purely informational for UX feedback. Even when + ``False`` the message is still queued: the next turn drains it. """ - queued: bool buffer_length: int max_buffer_length: int turn_in_flight: bool - detail: str class CreateSessionRequest(BaseModel): @@ -1127,10 +1128,15 @@ async def queue_pending_message( session_id, ) - # Push to Redis BEFORE writing to the session DB. If the push - # fails we raise 5xx and the client retries; ``append_and_save_message`` - # would otherwise leave an orphan user message persisted with no - # corresponding queued pending entry, and a retry would duplicate it. + # Redis is the single source of truth for pending messages. We do + # NOT persist to ``session.messages`` here — the drain-at-start + # path in the baseline/SDK executor is the sole writer for pending + # content. Persisting both here AND in the drain would cause + # double injection (executor sees the message in ``session.messages`` + # *and* drains it from Redis) unless we also dedupe. The dedup in + # ``maybe_append_user_message`` only checks trailing same-role + # repeats, so relying on it is fragile. Keeping the endpoint + # Redis-only avoids the whole consistency-bug class. 
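+ # Consequence: the queued text won't appear in the session
+ # transcript until a turn drains it, so a client that wants an
+ # optimistic echo must render the message locally from the 202.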
pending = PendingMessage( content=request.message, file_ids=sanitized_file_ids, @@ -1138,33 +1144,14 @@ async def queue_pending_message( ) buffer_length = await push_pending_message(session_id, pending) - # Persist the message into the session transcript only after the - # push succeeds. The message content embeds file/context metadata - # via format_pending_as_user_message so the DB copy matches what - # the model will actually see on drain. - chat_msg = ChatMessage( - role="user", - content=format_pending_as_user_message(pending)["content"], - ) - await append_and_save_message(session_id, chat_msg) - # Check whether a turn is currently running for UX feedback. active_session = await stream_registry.get_session(session_id) turn_in_flight = bool(active_session and active_session.status == "running") return QueuePendingMessageResponse( - queued=True, buffer_length=buffer_length, max_buffer_length=MAX_PENDING_MESSAGES, turn_in_flight=turn_in_flight, - detail=( - ( - "Queued — will be injected into the current turn." - if turn_in_flight - else "Queued — will be injected at the start of the next turn." - ) - + f" buffer={buffer_length}/{MAX_PENDING_MESSAGES}" - ), ) diff --git a/autogpt_platform/backend/backend/copilot/baseline/service.py b/autogpt_platform/backend/backend/copilot/baseline/service.py index bb800c10c7..51454918a9 100644 --- a/autogpt_platform/backend/backend/copilot/baseline/service.py +++ b/autogpt_platform/backend/backend/copilot/baseline/service.py @@ -1187,14 +1187,23 @@ async def stream_chat_completion_baseline( # messages on its next LLM call. # # IMPORTANT: skip when the loop has already finished (no - # more LLM calls are coming). Draining here would silently - # lose the message because ``tool_call_loop`` is about to - # return on the next ``async for`` step — the user would - # see a 202 from the pending endpoint but the model would - # never actually read the text. Those messages stay in - # the buffer and will be picked up at the start of the - # next turn. - if loop_result is None or loop_result.finished_naturally: + # more LLM calls are coming). ``tool_call_loop`` yields + # a final ``ToolCallLoopResult`` on both paths: + # - natural finish: ``finished_naturally=True`` + # - hit max_iterations: ``finished_naturally=False`` + # and ``iterations >= max_iterations`` + # In either case the loop is about to return on the next + # ``async for`` step, so draining here would silently + # lose the message (the user sees 202 but the model never + # reads the text). Those messages stay in the buffer and + # get picked up at the start of the next turn. 
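+ # Three skip cases below: no round-boundary result yet
+ # (loop_result is None), natural finish, and the max-iterations
+ # final yield. Only a true mid-loop round boundary drains.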
+ if loop_result is None: + continue + is_final_yield = ( + loop_result.finished_naturally + or loop_result.iterations >= _MAX_TOOL_ROUNDS + ) + if is_final_yield: continue pending = await drain_pending_messages(session_id) if pending: diff --git a/autogpt_platform/backend/backend/copilot/pending_messages_test.py b/autogpt_platform/backend/backend/copilot/pending_messages_test.py index 7fec16c708..7be4b7e53c 100644 --- a/autogpt_platform/backend/backend/copilot/pending_messages_test.py +++ b/autogpt_platform/backend/backend/copilot/pending_messages_test.py @@ -24,61 +24,14 @@ from backend.copilot.pending_messages import ( # ── Fake Redis ────────────────────────────────────────────────────── -class _FakePipeline: - def __init__(self, parent: "_FakeRedis") -> None: - self._parent = parent - self._ops: list[tuple[str, tuple[Any, ...]]] = [] - - async def __aenter__(self) -> "_FakePipeline": - return self - - async def __aexit__(self, *args: object) -> None: - return None - - def rpush(self, key: str, value: Any) -> None: - self._ops.append(("rpush", (key, value))) - - def ltrim(self, key: str, start: int, stop: int) -> None: - self._ops.append(("ltrim", (key, start, stop))) - - def expire(self, key: str, ttl: int) -> None: - self._ops.append(("expire", (key, ttl))) - - def llen(self, key: str) -> None: - self._ops.append(("llen", (key,))) - - async def execute(self) -> list[Any]: - results: list[Any] = [] - for op, args in self._ops: - if op == "rpush": - key, value = args - self._parent.lists.setdefault(key, []).append(value) - results.append(len(self._parent.lists[key])) - elif op == "ltrim": - key, start, stop = args - lst = self._parent.lists.get(key, []) - # Emulate Redis LTRIM (-N, -1) = last N - if start < 0 and stop == -1: - self._parent.lists[key] = lst[start:] - else: - self._parent.lists[key] = lst[start : stop + 1] - results.append(True) - elif op == "expire": - results.append(True) - elif op == "llen": - key = args[0] - results.append(len(self._parent.lists.get(key, []))) - return results - - class _FakeRedis: def __init__(self) -> None: - self.lists: dict[str, list[str]] = {} + # Values are ``str | bytes`` because real redis-py returns + # bytes when ``decode_responses=False``; the drain path must + # handle both and our tests exercise both. + self.lists: dict[str, list[str | bytes]] = {} self.published: list[tuple[str, str]] = [] - def pipeline(self, transaction: bool = False) -> _FakePipeline: - return _FakePipeline(self) - async def eval(self, script: str, num_keys: int, *args: Any) -> Any: """Emulate the push Lua script. @@ -102,7 +55,7 @@ class _FakeRedis: self.published.append((channel, payload)) return 1 - async def lpop(self, key: str, count: int) -> list[str] | None: + async def lpop(self, key: str, count: int) -> list[str | bytes] | None: lst = self.lists.get(key) if not lst: return None @@ -250,3 +203,21 @@ async def test_drain_skips_malformed_entries( assert len(drained) == 2 assert drained[0].content == "valid" assert drained[1].content == "also valid" + + +@pytest.mark.asyncio +async def test_drain_decodes_bytes_payloads( + fake_redis: _FakeRedis, +) -> None: + """Real redis-py returns ``bytes`` when ``decode_responses=False``. + + Seed the fake with bytes values to exercise the ``decode("utf-8")`` + branch in ``drain_pending_messages`` so a regression there doesn't + slip past CI. 
+ """ + fake_redis.lists["copilot:pending:bytes_sess"] = [ + json.dumps({"content": "from bytes"}).encode("utf-8"), + ] + drained = await drain_pending_messages("bytes_sess") + assert len(drained) == 1 + assert drained[0].content == "from bytes" diff --git a/autogpt_platform/backend/backend/copilot/sdk/service.py b/autogpt_platform/backend/backend/copilot/sdk/service.py index 7d13b24925..a43beca39b 100644 --- a/autogpt_platform/backend/backend/copilot/sdk/service.py +++ b/autogpt_platform/backend/backend/copilot/sdk/service.py @@ -2282,6 +2282,14 @@ async def stream_chat_completion_sdk( # the drain and stays queued for the next turn instead of being # lost between LPOP and clear. File IDs and context are # preserved via format_pending_as_user_message. + # + # The drained content is concatenated into ``current_message`` + # so the SDK CLI sees it in the new user message, AND appended + # to ``session.messages`` (via ``maybe_append_user_message``, + # which dedupes trailing same-role repeats) so the durable + # transcript records it too. The endpoint deliberately does + # NOT persist to session.messages — Redis is the single source + # of truth until this drain runs. pending_at_start = await drain_pending_messages(session_id) if pending_at_start: logger.info( @@ -2292,6 +2300,8 @@ async def stream_chat_completion_sdk( pending_texts: list[str] = [ format_pending_as_user_message(pm)["content"] for pm in pending_at_start ] + for _pt in pending_texts: + maybe_append_user_message(session, _pt, is_user_message=True) if current_message.strip(): current_message = current_message + "\n\n" + "\n\n".join(pending_texts) else: From 80e580f387d3d78acc47c9ac76b05b53116ed712 Mon Sep 17 00:00:00 2001 From: majdyz Date: Fri, 10 Apr 2026 16:10:34 +0000 Subject: [PATCH 04/30] fix(baseline): mirror drained pending messages into transcript_builder MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round 3 follow-up: the drain-at-start in ``stream_chat_completion_baseline`` persisted pending messages to ``session.messages`` but never called ``transcript_builder.append_user`` for them. A mid-turn transcript upload would be missing the drained text, which could produce a malformed assistant-after-assistant structure on the next turn. The drain block runs BEFORE ``transcript_builder`` is instantiated (which happens after prompt/transcript async setup), so we can't call append_user in the drain block itself. Instead, we remember the drained list and mirror it into the transcript right after the single-message ``transcript_builder.append_user(content=message)`` call near the prompt-build site. Also cleaned up the stray adjacent-string concatenation in the log line (``"...turn start " "for session %s"`` → single string). Co-Authored-By: Claude Opus 4.6 (1M context) --- .../backend/backend/copilot/baseline/service.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/autogpt_platform/backend/backend/copilot/baseline/service.py b/autogpt_platform/backend/backend/copilot/baseline/service.py index 51454918a9..3858c9694b 100644 --- a/autogpt_platform/backend/backend/copilot/baseline/service.py +++ b/autogpt_platform/backend/backend/copilot/baseline/service.py @@ -936,12 +936,11 @@ async def stream_chat_completion_baseline( # while this session was idle (or during a previous turn whose # mid-loop drains missed them). Atomic LPOP guarantees that a # concurrent push lands *after* the drain and stays queued for the - # next turn instead of being lost. 
Prepended to the session so - # the initial LLM call sees them. + # next turn instead of being lost. drained_at_start = await drain_pending_messages(session_id) if drained_at_start: logger.info( - "[Baseline] Draining %d pending message(s) at turn start " "for session %s", + "[Baseline] Draining %d pending message(s) at turn start for session %s", len(drained_at_start), session_id, ) @@ -1009,6 +1008,16 @@ async def stream_chat_completion_baseline( if message and is_user_message: transcript_builder.append_user(content=message) + # Mirror any messages drained at turn start (see above) into the + # transcript — otherwise the loaded prior transcript would be + # missing them and a mid-turn upload could leave a malformed + # assistant-after-assistant structure on the next turn. + if drained_at_start: + for _pm in drained_at_start: + transcript_builder.append_user( + content=format_pending_as_user_message(_pm)["content"] + ) + # Generate title for new sessions if is_user_message and not session.title: user_messages = [m for m in session.messages if m.role == "user"] From 51465fbb0229ed9ac257490caf3d49f60eacd3d8 Mon Sep 17 00:00:00 2001 From: majdyz Date: Fri, 10 Apr 2026 16:15:02 +0000 Subject: [PATCH 05/30] docs(pending_messages): fix two stale comments in pending_messages.py Round 4 review nits: - ``_PUSH_LUA`` block comment mentioned "returns 0 from our earlier LLEN" which was a leftover from an earlier design that had a separate LLEN check. The atomicity guarantee doesn't depend on it. Reworded to describe Redis EVAL serialisation instead. - ``clear_pending_messages`` docstring said "called at the end of a turn" but the finally-block call sites were removed in round 2 when the atomic drain-at-start became the primary consumer. The function is now only an operator/debug escape hatch. Docstring updated to match. No behavioural change. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../backend/backend/copilot/pending_messages.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/autogpt_platform/backend/backend/copilot/pending_messages.py b/autogpt_platform/backend/backend/copilot/pending_messages.py index ea0ae6bc4c..4c62cecf25 100644 --- a/autogpt_platform/backend/backend/copilot/pending_messages.py +++ b/autogpt_platform/backend/backend/copilot/pending_messages.py @@ -62,10 +62,9 @@ def _notify_channel(session_id: str) -> str: # Lua script: push-then-trim-then-expire-then-length, atomically. -# Running these four commands via a single EVAL guarantees a concurrent -# LPOP drain lands either entirely before the push (returns 0 from -# our earlier LLEN) or entirely after it (sees the new message) — -# never in the middle of a partial state. +# Redis serializes EVAL commands, so a concurrent ``LPOP`` drain +# observes either the pre-push or post-push state of the list — never +# a partial state where the RPUSH has landed but LTRIM hasn't run. _PUSH_LUA = """ redis.call('RPUSH', KEYS[1], ARGV[1]) redis.call('LTRIM', KEYS[1], -tonumber(ARGV[2]), -1) @@ -180,9 +179,11 @@ async def peek_pending_count(session_id: str) -> int: async def clear_pending_messages(session_id: str) -> None: """Drop the session's pending buffer. - Called at the end of a turn (success or failure) so messages from a - previous turn don't leak into the next one. The buffer may already - have been drained inside the turn — this is a safety net. 
+ Not called by the normal turn flow — the atomic ``LPOP`` drain at + turn start is the primary consumer, and any push that arrives + after the drain window belongs to the next turn by definition. + Retained as an operator/debug escape hatch for manually clearing a + stuck session and as a fixture in the unit tests. """ redis = await get_redis_async() await redis.delete(_buffer_key(session_id)) From a4dbcf424782c8afb90c22b8ca04624b91b8359f Mon Sep 17 00:00:00 2001 From: majdyz Date: Fri, 10 Apr 2026 23:29:44 +0700 Subject: [PATCH 06/30] =?UTF-8?q?fix(backend/copilot):=20address=20round-3?= =?UTF-8?q?=20review=20=E2=80=94=20dedup,=20persist,=20guards?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Replace maybe_append_user_message with direct session.messages.append for pending drain in both baseline mid-loop and SDK drain-at-start: pending messages are atomically popped from Redis and are never stale-cache duplicates, so the dedup is wrong and causes openai_messages/transcript to diverge from the DB record - Add immediate upsert_chat_session after SDK drain-at-start so a crash between drain and finally doesn't lose messages already removed from Redis - Capture _pre_drain_msg_count before the baseline drain-at-start: use it for is_first_turn (prevents pending messages from flipping the flag to False on an actual first turn) and for _load_prior_transcript (prevents the stale-transcript check from firing on every turn that drains pending messages, which would block transcript upload forever) - Remove redundant if user_id: guards in queue_pending_message — user_id is guaranteed non-empty by Security(auth.get_user_id); the guards made the rate-limit check silently optional --- .../backend/api/features/chat/routes.py | 37 +++++++++---------- .../backend/copilot/baseline/service.py | 25 ++++++++++--- .../backend/backend/copilot/sdk/service.py | 26 ++++++++++--- 3 files changed, 58 insertions(+), 30 deletions(-) diff --git a/autogpt_platform/backend/backend/api/features/chat/routes.py b/autogpt_platform/backend/backend/api/features/chat/routes.py index 2e19ea8ca3..6d057b0270 100644 --- a/autogpt_platform/backend/backend/api/features/chat/routes.py +++ b/autogpt_platform/backend/backend/api/features/chat/routes.py @@ -1085,30 +1085,29 @@ async def queue_pending_message( # this, a client could bypass per-turn token limits by batching # their extra context through this endpoint while a cheap stream # is in flight. - if user_id: - try: - daily_limit, weekly_limit, _tier = await get_global_rate_limits( - user_id, config.daily_token_limit, config.weekly_token_limit - ) - await check_rate_limit( - user_id=user_id, - daily_token_limit=daily_limit, - weekly_token_limit=weekly_limit, - ) - except RateLimitExceeded as e: - raise HTTPException(status_code=429, detail=str(e)) from e - - if user_id: - track_user_message( - user_id=user_id, - session_id=session_id, - message_length=len(request.message), + # user_id is guaranteed non-empty by Security(auth.get_user_id) — no guard needed. 
+ try: + daily_limit, weekly_limit, _tier = await get_global_rate_limits( + user_id, config.daily_token_limit, config.weekly_token_limit ) + await check_rate_limit( + user_id=user_id, + daily_token_limit=daily_limit, + weekly_token_limit=weekly_limit, + ) + except RateLimitExceeded as e: + raise HTTPException(status_code=429, detail=str(e)) from e + + track_user_message( + user_id=user_id, + session_id=session_id, + message_length=len(request.message), + ) # Sanitise file IDs to the user's own workspace (same logic as # stream_chat_post) so injection doesn't surface other users' files. sanitized_file_ids: list[str] = [] - if request.file_ids and user_id: + if request.file_ids: valid_ids = [fid for fid in request.file_ids if _UUID_RE.match(fid)] if valid_ids: workspace = await get_or_create_workspace(user_id) diff --git a/autogpt_platform/backend/backend/copilot/baseline/service.py b/autogpt_platform/backend/backend/copilot/baseline/service.py index f9b5a7d9ea..f46c31ff21 100644 --- a/autogpt_platform/backend/backend/copilot/baseline/service.py +++ b/autogpt_platform/backend/backend/copilot/baseline/service.py @@ -934,6 +934,10 @@ async def stream_chat_completion_baseline( message_length=len(message or ""), ) + # Capture count *before* the pending drain so is_first_turn and the + # transcript staleness check are not skewed by queued messages. + _pre_drain_msg_count = len(session.messages) + # Drain any messages the user queued via POST /messages/pending # while this session was idle (or during a previous turn whose # mid-loop drains missed them). Atomic LPOP guarantees that a @@ -948,7 +952,10 @@ async def stream_chat_completion_baseline( ) for _pm in drained_at_start: _content = format_pending_as_user_message(_pm)["content"] - maybe_append_user_message(session, _content, is_user_message=True) + # Append directly — pending messages are atomically-popped from + # Redis and are never stale-cache duplicates, so the + # maybe_append_user_message dedup is wrong here. + session.messages.append(ChatMessage(role="user", content=_content)) session = await upsert_chat_session(session) @@ -979,7 +986,9 @@ async def stream_chat_completion_baseline( # Build system prompt only on the first turn to avoid mid-conversation # changes from concurrent chats updating business understanding. - is_first_turn = len(session.messages) <= 1 + # Use the pre-drain count so queued pending messages don't incorrectly + # flip is_first_turn to False on an actual first turn. + is_first_turn = _pre_drain_msg_count <= 1 # Gate context fetch on both first turn AND user message so that assistant- # role calls (e.g. tool-result submissions) on the first turn don't trigger # a needless DB lookup for user understanding. @@ -997,7 +1006,9 @@ async def stream_chat_completion_baseline( _load_prior_transcript( user_id=user_id, session_id=session_id, - session_msg_count=len(session.messages), + # Use pre-drain count so pending messages don't falsely + # mark the stored transcript as stale and prevent upload. + session_msg_count=_pre_drain_msg_count, transcript_builder=transcript_builder, ), prompt_task, @@ -1266,8 +1277,12 @@ async def stream_chat_completion_baseline( # a faithful copy of what the model actually saw. 
formatted = format_pending_as_user_message(pm) content_for_db = formatted["content"] - maybe_append_user_message( - session, content_for_db, is_user_message=True + # Append directly — pending messages are atomically-popped + # from Redis and are never stale-cache duplicates, so the + # maybe_append_user_message dedup is wrong here and would + # cause openai_messages/transcript to diverge from session. + session.messages.append( + ChatMessage(role="user", content=content_for_db) ) openai_messages.append(formatted) transcript_builder.append_user(content=content_for_db) diff --git a/autogpt_platform/backend/backend/copilot/sdk/service.py b/autogpt_platform/backend/backend/copilot/sdk/service.py index 0b3eba5ee0..a818b66d08 100644 --- a/autogpt_platform/backend/backend/copilot/sdk/service.py +++ b/autogpt_platform/backend/backend/copilot/sdk/service.py @@ -2291,11 +2291,12 @@ async def stream_chat_completion_sdk( # # The drained content is concatenated into ``current_message`` # so the SDK CLI sees it in the new user message, AND appended - # to ``session.messages`` (via ``maybe_append_user_message``, - # which dedupes trailing same-role repeats) so the durable - # transcript records it too. The endpoint deliberately does - # NOT persist to session.messages — Redis is the single source - # of truth until this drain runs. + # directly to ``session.messages`` (no dedup — pending messages are + # atomically-popped from Redis and are never stale-cache duplicates) + # so the durable transcript records it too. Session is persisted + # immediately after the drain so a crash doesn't lose the messages. + # The endpoint deliberately does NOT persist to session.messages — + # Redis is the single source of truth until this drain runs. pending_at_start = await drain_pending_messages(session_id) if pending_at_start: logger.info( @@ -2307,11 +2308,24 @@ async def stream_chat_completion_sdk( format_pending_as_user_message(pm)["content"] for pm in pending_at_start ] for _pt in pending_texts: - maybe_append_user_message(session, _pt, is_user_message=True) + # Append directly — pending messages are atomically-popped from + # Redis and are never stale-cache duplicates, so the + # maybe_append_user_message dedup is wrong here. + session.messages.append(ChatMessage(role="user", content=_pt)) if current_message.strip(): current_message = current_message + "\n\n" + "\n\n".join(pending_texts) else: current_message = "\n\n".join(pending_texts) + # Persist immediately so a crash between here and the finally block + # doesn't lose messages that were already drained from Redis. + try: + session = await upsert_chat_session(session) + except Exception as _persist_err: + logger.warning( + "%s Failed to persist drained pending messages: %s", + log_prefix, + _persist_err, + ) if not current_message.strip(): yield StreamError( From f8f7df7b0a146ec0f3fd54afbdcc68b77ec61ea4 Mon Sep 17 00:00:00 2001 From: majdyz Date: Fri, 10 Apr 2026 16:34:08 +0000 Subject: [PATCH 07/30] fix(copilot): address CI failures on pending-messages PR MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. SDK retry tests failing with "Event loop is closed" — the drain-at-start call in stream_chat_completion_sdk was reaching the real ``drain_pending_messages`` (which hits Redis) instead of being mocked. Added a ``drain_pending_messages`` stub returning ``[]`` to the shared ``_make_sdk_patches`` helper so all retry-integration tests skip the drain path. 2. 
API types check failing — the new ``POST /sessions/{id}/messages/pending`` endpoint wasn't reflected in the frontend's ``openapi.json``. Regenerated via ``poetry run export-api-schema --output ../frontend/src/app/api/openapi.json`` and ``pnpm prettier --write``. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../copilot/sdk/retry_scenarios_test.py | 6 ++ .../frontend/src/app/api/openapi.json | 101 ++++++++++++++++++ 2 files changed, 107 insertions(+) diff --git a/autogpt_platform/backend/backend/copilot/sdk/retry_scenarios_test.py b/autogpt_platform/backend/backend/copilot/sdk/retry_scenarios_test.py index fd831214a6..710daf626a 100644 --- a/autogpt_platform/backend/backend/copilot/sdk/retry_scenarios_test.py +++ b/autogpt_platform/backend/backend/copilot/sdk/retry_scenarios_test.py @@ -1031,6 +1031,12 @@ def _make_sdk_patches( ), (f"{_SVC}.upload_transcript", dict(new_callable=AsyncMock)), (f"{_SVC}.get_user_tier", dict(new_callable=AsyncMock, return_value=None)), + # Stub pending-message drain so retry tests don't hit Redis. + # Returns an empty list → no mid-turn injection happens. + ( + f"{_SVC}.drain_pending_messages", + dict(new_callable=AsyncMock, return_value=[]), + ), ] diff --git a/autogpt_platform/frontend/src/app/api/openapi.json b/autogpt_platform/frontend/src/app/api/openapi.json index 446b2eb079..2546df9357 100644 --- a/autogpt_platform/frontend/src/app/api/openapi.json +++ b/autogpt_platform/frontend/src/app/api/openapi.json @@ -1605,6 +1605,56 @@ } } }, + "/api/chat/sessions/{session_id}/messages/pending": { + "post": { + "tags": ["v2", "chat", "chat"], + "summary": "Queue Pending Message", + "description": "Queue a new user message into an in-flight copilot turn.\n\nWhen a user sends a follow-up message while a turn is still\nstreaming, we don't want to block them or start a separate turn —\nthis endpoint appends the message to a per-session pending buffer.\nThe executor currently running the turn (baseline path) drains the\nbuffer between tool-call rounds and appends the message to the\nconversation before the next LLM call. On the SDK path the buffer\nis drained at the *start* of the next turn (the long-lived\n``ClaudeSDKClient.receive_response`` iterator returns after a\n``ResultMessage`` so there is no safe point to inject mid-stream\ninto an existing connection).\n\nReturns 202. 
Enforces the same per-user daily/weekly token rate\nlimit as the regular ``/stream`` endpoint so a client can't bypass\nit by batching messages through here.", + "operationId": "postV2QueuePendingMessage", + "security": [{ "HTTPBearerJWT": [] }], + "parameters": [ + { + "name": "session_id", + "in": "path", + "required": true, + "schema": { "type": "string", "title": "Session Id" } + } + ], + "requestBody": { + "required": true, + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/QueuePendingMessageRequest" + } + } + } + }, + "responses": { + "202": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/QueuePendingMessageResponse" + } + } + } + }, + "401": { + "$ref": "#/components/responses/HTTP401NotAuthenticatedError" + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { "$ref": "#/components/schemas/HTTPValidationError" } + } + } + } + } + } + }, "/api/chat/sessions/{session_id}/stream": { "get": { "tags": ["v2", "chat", "chat"], @@ -12668,6 +12718,57 @@ "required": ["providers", "pagination"], "title": "ProviderResponse" }, + "QueuePendingMessageRequest": { + "properties": { + "message": { + "type": "string", + "maxLength": 16000, + "minLength": 1, + "title": "Message" + }, + "context": { + "anyOf": [ + { + "additionalProperties": { "type": "string" }, + "type": "object" + }, + { "type": "null" } + ], + "title": "Context", + "description": "Optional page context: expected keys are 'url' and 'content'." + }, + "file_ids": { + "anyOf": [ + { + "items": { "type": "string" }, + "type": "array", + "maxItems": 20 + }, + { "type": "null" } + ], + "title": "File Ids" + } + }, + "additionalProperties": false, + "type": "object", + "required": ["message"], + "title": "QueuePendingMessageRequest", + "description": "Request model for queueing a message into an in-flight turn.\n\nUnlike ``StreamChatRequest`` this endpoint does **not** start a new\nturn — the message is appended to a per-session pending buffer that\nthe executor currently processing the turn will drain between tool\nrounds." + }, + "QueuePendingMessageResponse": { + "properties": { + "buffer_length": { "type": "integer", "title": "Buffer Length" }, + "max_buffer_length": { + "type": "integer", + "title": "Max Buffer Length" + }, + "turn_in_flight": { "type": "boolean", "title": "Turn In Flight" } + }, + "type": "object", + "required": ["buffer_length", "max_buffer_length", "turn_in_flight"], + "title": "QueuePendingMessageResponse", + "description": "Response for the pending-message endpoint.\n\n- ``buffer_length``: how many messages are now in the session's\n pending buffer (after this push)\n- ``max_buffer_length``: the per-session cap (server-side constant)\n- ``turn_in_flight``: ``True`` if a copilot turn was running when\n we checked — purely informational for UX feedback. Even when\n ``False`` the message is still queued: the next turn drains it." + }, "RateLimitResetResponse": { "properties": { "success": { "type": "boolean", "title": "Success" }, From 39e89b50a7972ddbae66c9041d3c6e0f118840ff Mon Sep 17 00:00:00 2001 From: majdyz Date: Fri, 10 Apr 2026 16:41:55 +0000 Subject: [PATCH 08/30] fix(copilot): address remaining CI failures on pending-messages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. 
SDK pyright: the inner ``_fetch_transcript`` closure captured ``session`` which pyright couldn't narrow to non-None (the outer scope casts it, but the narrowing doesn't propagate into the nested async function). Added an explicit ``assert session is not None`` at the top of the closure. 2. Lint: re-formatted ``platform_cost_test.py`` — some pre-existing whitespace drift from an upstream merge was tripping Black on CI. Co-Authored-By: Claude Opus 4.6 (1M context) --- autogpt_platform/backend/backend/copilot/sdk/service.py | 1 + autogpt_platform/backend/backend/data/platform_cost_test.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/autogpt_platform/backend/backend/copilot/sdk/service.py b/autogpt_platform/backend/backend/copilot/sdk/service.py index a818b66d08..3384fc82f6 100644 --- a/autogpt_platform/backend/backend/copilot/sdk/service.py +++ b/autogpt_platform/backend/backend/copilot/sdk/service.py @@ -2046,6 +2046,7 @@ async def stream_chat_completion_sdk( async def _fetch_transcript(): """Download transcript for --resume if applicable.""" + assert session is not None # narrowed at line 1898 if not ( config.claude_agent_use_resume and user_id and len(session.messages) > 1 ): diff --git a/autogpt_platform/backend/backend/data/platform_cost_test.py b/autogpt_platform/backend/backend/data/platform_cost_test.py index 758e97d37b..65b1a20099 100644 --- a/autogpt_platform/backend/backend/data/platform_cost_test.py +++ b/autogpt_platform/backend/backend/data/platform_cost_test.py @@ -35,7 +35,6 @@ class TestUsdToMicrodollars: assert usd_to_microdollars(1.0) == 1_000_000 - class TestMaskEmail: def test_typical_email(self): assert _mask_email("user@example.com") == "us***@example.com" From a7d97dacf33ca8b0ec4bf8e6b584ceebcd887d9f Mon Sep 17 00:00:00 2001 From: majdyz Date: Sat, 11 Apr 2026 00:00:07 +0700 Subject: [PATCH 09/30] fix(copilot): address review comments on pending-messages PR - Use _pre_drain_msg_count for transcript load gate (len > 1 check) to avoid spurious transcript load on first turn with pending messages - Use _pre_drain_msg_count for Graphiti warm context gate to prevent warm context skip when pending messages are drained at first turn - Add context.url/content length validators to QueuePendingMessageRequest to prevent LLM context-window stuffing (2K url, 32K content caps) - Rename underscore-prefixed active variables (_pm, _content, _pt) to conventional names (pm, content, pt) per Python convention --- .../backend/api/features/chat/routes.py | 23 +++++++++++++++++++ .../backend/copilot/baseline/service.py | 22 ++++++++++-------- .../backend/backend/copilot/sdk/service.py | 4 ++-- 3 files changed, 38 insertions(+), 11 deletions(-) diff --git a/autogpt_platform/backend/backend/api/features/chat/routes.py b/autogpt_platform/backend/backend/api/features/chat/routes.py index 6d057b0270..854ca116fc 100644 --- a/autogpt_platform/backend/backend/api/features/chat/routes.py +++ b/autogpt_platform/backend/backend/api/features/chat/routes.py @@ -142,6 +142,29 @@ class QueuePendingMessageRequest(BaseModel): ) file_ids: list[str] | None = Field(default=None, max_length=20) + @field_validator("context") + @classmethod + def _validate_context_length( + cls, v: dict[str, str] | None + ) -> dict[str, str] | None: + if v is None: + return v + # Cap context values to prevent LLM context-window stuffing via + # large page payloads (url: 2 KB, content: 32 KB). 
+ _URL_LIMIT = 2_000 + _CONTENT_LIMIT = 32_000 + url = v.get("url", "") + if len(url) > _URL_LIMIT: + raise ValueError( + f"context.url exceeds maximum length of {_URL_LIMIT} characters" + ) + content = v.get("content", "") + if len(content) > _CONTENT_LIMIT: + raise ValueError( + f"context.content exceeds maximum length of {_CONTENT_LIMIT} characters" + ) + return v + class QueuePendingMessageResponse(BaseModel): """Response for the pending-message endpoint. diff --git a/autogpt_platform/backend/backend/copilot/baseline/service.py b/autogpt_platform/backend/backend/copilot/baseline/service.py index f46c31ff21..4bcdfd80d9 100644 --- a/autogpt_platform/backend/backend/copilot/baseline/service.py +++ b/autogpt_platform/backend/backend/copilot/baseline/service.py @@ -950,12 +950,12 @@ async def stream_chat_completion_baseline( len(drained_at_start), session_id, ) - for _pm in drained_at_start: - _content = format_pending_as_user_message(_pm)["content"] + for pm in drained_at_start: + content = format_pending_as_user_message(pm)["content"] # Append directly — pending messages are atomically-popped from # Redis and are never stale-cache duplicates, so the # maybe_append_user_message dedup is wrong here. - session.messages.append(ChatMessage(role="user", content=_content)) + session.messages.append(ChatMessage(role="user", content=content)) session = await upsert_chat_session(session) @@ -999,8 +999,10 @@ async def stream_chat_completion_baseline( prompt_task = _build_cacheable_system_prompt(None) # Run download + prompt build concurrently — both are independent I/O - # on the request critical path. - if user_id and len(session.messages) > 1: + # on the request critical path. Use the pre-drain count so pending + # messages drained at turn start don't spuriously trigger a transcript + # load on an actual first turn. + if user_id and _pre_drain_msg_count > 1: transcript_covers_prefix, (base_system_prompt, understanding) = ( await asyncio.gather( _load_prior_transcript( @@ -1025,9 +1027,9 @@ async def stream_chat_completion_baseline( # missing them and a mid-turn upload could leave a malformed # assistant-after-assistant structure on the next turn. if drained_at_start: - for _pm in drained_at_start: + for pm in drained_at_start: transcript_builder.append_user( - content=format_pending_as_user_message(_pm)["content"] + content=format_pending_as_user_message(pm)["content"] ) # Generate title for new sessions @@ -1050,8 +1052,10 @@ async def stream_chat_completion_baseline( graphiti_supplement = get_graphiti_supplement() if graphiti_enabled else "" system_prompt = base_system_prompt + get_baseline_supplement() + graphiti_supplement - # Warm context: pre-load relevant facts from Graphiti on first turn - if graphiti_enabled and user_id and len(session.messages) <= 1: + # Warm context: pre-load relevant facts from Graphiti on first turn. + # Use the pre-drain count so pending messages drained at turn start + # don't prevent warm context injection on an actual first turn. 
+ if graphiti_enabled and user_id and _pre_drain_msg_count <= 1: from backend.copilot.graphiti.context import fetch_warm_context warm_ctx = await fetch_warm_context(user_id, message or "") diff --git a/autogpt_platform/backend/backend/copilot/sdk/service.py b/autogpt_platform/backend/backend/copilot/sdk/service.py index 3384fc82f6..39299ba14b 100644 --- a/autogpt_platform/backend/backend/copilot/sdk/service.py +++ b/autogpt_platform/backend/backend/copilot/sdk/service.py @@ -2308,11 +2308,11 @@ async def stream_chat_completion_sdk( pending_texts: list[str] = [ format_pending_as_user_message(pm)["content"] for pm in pending_at_start ] - for _pt in pending_texts: + for pt in pending_texts: # Append directly — pending messages are atomically-popped from # Redis and are never stale-cache duplicates, so the # maybe_append_user_message dedup is wrong here. - session.messages.append(ChatMessage(role="user", content=_pt)) + session.messages.append(ChatMessage(role="user", content=pt)) if current_message.strip(): current_message = current_message + "\n\n" + "\n\n".join(pending_texts) else: From 5e8345e5eec37fda46e981724c65d562fd22b131 Mon Sep 17 00:00:00 2001 From: majdyz Date: Sat, 11 Apr 2026 00:06:24 +0700 Subject: [PATCH 10/30] fix(copilot): fix CodeQL false-positive in pending_messages_test Replace broad `url in content` assertion with exact `[Page URL: url]` substring check so CodeQL does not flag it as Incomplete URL Substring Sanitization. --- .../backend/backend/copilot/pending_messages_test.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/autogpt_platform/backend/backend/copilot/pending_messages_test.py b/autogpt_platform/backend/backend/copilot/pending_messages_test.py index 7be4b7e53c..7eef559725 100644 --- a/autogpt_platform/backend/backend/copilot/pending_messages_test.py +++ b/autogpt_platform/backend/backend/copilot/pending_messages_test.py @@ -174,9 +174,11 @@ def test_format_pending_with_context_url() -> None: context={"url": "https://example.com"}, ) out = format_pending_as_user_message(msg) + content = out["content"] assert out["role"] == "user" - assert "see this page" in out["content"] - assert "https://example.com" in out["content"] + assert "see this page" in content + # The URL should appear verbatim in the [Page URL: ...] block. 
+ assert "[Page URL: https://example.com]" in content def test_format_pending_with_file_ids() -> None: From d10d14ae745f2b6bf3b5891536dafe4fbb67dfa2 Mon Sep 17 00:00:00 2001 From: majdyz Date: Sat, 11 Apr 2026 00:10:20 +0700 Subject: [PATCH 11/30] test(copilot): add coverage for pending-message endpoint and URL test - Add 11 tests for QueuePendingMessageRequest validation and the POST /sessions/{id}/messages/pending endpoint covering: - 202 happy path - 422 on empty/oversized message, context.url > 2KB, context.content > 32KB, >20 file_ids - 404 on unknown session - 429 on rate limit exceeded - file_ids scoped to caller's workspace - Fix CodeQL false-positive: replace broad url-in-content assertion with exact [Page URL: url] substring check in pending_messages_test --- .../backend/api/features/chat/routes_test.py | 270 ++++++++++++++++++ 1 file changed, 270 insertions(+) diff --git a/autogpt_platform/backend/backend/api/features/chat/routes_test.py b/autogpt_platform/backend/backend/api/features/chat/routes_test.py index cd87fe611f..18d499047a 100644 --- a/autogpt_platform/backend/backend/api/features/chat/routes_test.py +++ b/autogpt_platform/backend/backend/api/features/chat/routes_test.py @@ -579,3 +579,273 @@ class TestStreamChatRequestModeValidation: req = StreamChatRequest(message="hi") assert req.mode is None + + +# ─── QueuePendingMessageRequest validation ──────────────────────────── + + +class TestQueuePendingMessageRequest: + """Unit tests for QueuePendingMessageRequest field validation.""" + + def test_accepts_valid_message(self) -> None: + from backend.api.features.chat.routes import QueuePendingMessageRequest + + req = QueuePendingMessageRequest(message="hello") + assert req.message == "hello" + + def test_rejects_empty_message(self) -> None: + import pydantic + + from backend.api.features.chat.routes import QueuePendingMessageRequest + + with pytest.raises(pydantic.ValidationError): + QueuePendingMessageRequest(message="") + + def test_rejects_message_over_limit(self) -> None: + import pydantic + + from backend.api.features.chat.routes import QueuePendingMessageRequest + + with pytest.raises(pydantic.ValidationError): + QueuePendingMessageRequest(message="x" * 16_001) + + def test_accepts_valid_context(self) -> None: + from backend.api.features.chat.routes import QueuePendingMessageRequest + + req = QueuePendingMessageRequest( + message="hi", + context={"url": "https://example.com", "content": "page text"}, + ) + assert req.context is not None + assert req.context["url"] == "https://example.com" + + def test_rejects_context_url_over_limit(self) -> None: + import pydantic + + from backend.api.features.chat.routes import QueuePendingMessageRequest + + with pytest.raises(pydantic.ValidationError, match="url"): + QueuePendingMessageRequest( + message="hi", + context={"url": "https://example.com/" + "x" * 2_000}, + ) + + def test_rejects_context_content_over_limit(self) -> None: + import pydantic + + from backend.api.features.chat.routes import QueuePendingMessageRequest + + with pytest.raises(pydantic.ValidationError, match="content"): + QueuePendingMessageRequest( + message="hi", + context={"content": "x" * 32_001}, + ) + + def test_rejects_extra_fields(self) -> None: + """extra='forbid' should reject unknown fields.""" + import pydantic + + from backend.api.features.chat.routes import QueuePendingMessageRequest + + with pytest.raises(pydantic.ValidationError): + QueuePendingMessageRequest(message="hi", unknown_field="bad") # type: ignore[call-arg] + + def 
test_accepts_up_to_20_file_ids(self) -> None: + from backend.api.features.chat.routes import QueuePendingMessageRequest + + req = QueuePendingMessageRequest( + message="hi", + file_ids=[f"00000000-0000-0000-0000-{i:012d}" for i in range(20)], + ) + assert req.file_ids is not None + assert len(req.file_ids) == 20 + + def test_rejects_more_than_20_file_ids(self) -> None: + import pydantic + + from backend.api.features.chat.routes import QueuePendingMessageRequest + + with pytest.raises(pydantic.ValidationError): + QueuePendingMessageRequest( + message="hi", + file_ids=[f"00000000-0000-0000-0000-{i:012d}" for i in range(21)], + ) + + +# ─── queue_pending_message endpoint ────────────────────────────────── + + +def _mock_pending_internals( + mocker: pytest_mock.MockerFixture, *, session_exists: bool = True +): + """Mock all async dependencies for the pending-message endpoint.""" + if session_exists: + mock_session = mocker.MagicMock() + mock_session.id = "sess-1" + mocker.patch( + "backend.api.features.chat.routes._validate_and_get_session", + new_callable=AsyncMock, + return_value=mock_session, + ) + else: + mocker.patch( + "backend.api.features.chat.routes._validate_and_get_session", + side_effect=fastapi.HTTPException( + status_code=404, detail="Session not found." + ), + ) + mocker.patch( + "backend.api.features.chat.routes.get_global_rate_limits", + new_callable=AsyncMock, + return_value=(0, 0, None), + ) + mocker.patch( + "backend.api.features.chat.routes.check_rate_limit", + new_callable=AsyncMock, + return_value=None, + ) + mocker.patch( + "backend.api.features.chat.routes.track_user_message", + return_value=None, + ) + mocker.patch( + "backend.api.features.chat.routes.push_pending_message", + new_callable=AsyncMock, + return_value=1, + ) + mock_registry = mocker.MagicMock() + mock_registry.get_session = mocker.AsyncMock(return_value=None) + mocker.patch( + "backend.api.features.chat.routes.stream_registry", + mock_registry, + ) + + +def test_queue_pending_message_returns_202(mocker: pytest_mock.MockerFixture) -> None: + """Happy path: valid message returns 202 with buffer_length.""" + _mock_pending_internals(mocker) + + response = client.post( + "/sessions/sess-1/messages/pending", + json={"message": "follow-up"}, + ) + + assert response.status_code == 202 + data = response.json() + assert data["buffer_length"] == 1 + assert data["turn_in_flight"] is False + + +def test_queue_pending_message_empty_body_returns_422() -> None: + """Empty message must be rejected by Pydantic before hitting any route logic.""" + response = client.post( + "/sessions/sess-1/messages/pending", + json={"message": ""}, + ) + assert response.status_code == 422 + + +def test_queue_pending_message_missing_message_returns_422() -> None: + """Missing 'message' field returns 422.""" + response = client.post( + "/sessions/sess-1/messages/pending", + json={}, + ) + assert response.status_code == 422 + + +def test_queue_pending_message_session_not_found_returns_404( + mocker: pytest_mock.MockerFixture, +) -> None: + """If the session doesn't exist or belong to the user, returns 404.""" + _mock_pending_internals(mocker, session_exists=False) + + response = client.post( + "/sessions/bad-sess/messages/pending", + json={"message": "hi"}, + ) + assert response.status_code == 404 + + +def test_queue_pending_message_rate_limited_returns_429( + mocker: pytest_mock.MockerFixture, +) -> None: + """When rate limit is exceeded, endpoint returns 429.""" + from backend.copilot.rate_limit import RateLimitExceeded + + 
_mock_pending_internals(mocker) + mocker.patch( + "backend.api.features.chat.routes.check_rate_limit", + side_effect=RateLimitExceeded("daily", datetime.now(UTC) + timedelta(hours=1)), + ) + + response = client.post( + "/sessions/sess-1/messages/pending", + json={"message": "hi"}, + ) + assert response.status_code == 429 + + +def test_queue_pending_message_context_url_too_long_returns_422() -> None: + """context.url over 2 KB is rejected.""" + response = client.post( + "/sessions/sess-1/messages/pending", + json={ + "message": "hi", + "context": {"url": "https://example.com/" + "x" * 2_000}, + }, + ) + assert response.status_code == 422 + + +def test_queue_pending_message_context_content_too_long_returns_422() -> None: + """context.content over 32 KB is rejected.""" + response = client.post( + "/sessions/sess-1/messages/pending", + json={ + "message": "hi", + "context": {"content": "x" * 32_001}, + }, + ) + assert response.status_code == 422 + + +def test_queue_pending_message_too_many_file_ids_returns_422() -> None: + """More than 20 file_ids should be rejected.""" + response = client.post( + "/sessions/sess-1/messages/pending", + json={ + "message": "hi", + "file_ids": [f"00000000-0000-0000-0000-{i:012d}" for i in range(21)], + }, + ) + assert response.status_code == 422 + + +def test_queue_pending_message_file_ids_scoped_to_workspace( + mocker: pytest_mock.MockerFixture, +) -> None: + """File IDs must be sanitized to the user's workspace before push.""" + _mock_pending_internals(mocker) + mocker.patch( + "backend.api.features.chat.routes.get_or_create_workspace", + new_callable=AsyncMock, + return_value=type("W", (), {"id": "ws-1"})(), + ) + mock_prisma = mocker.MagicMock() + mock_prisma.find_many = mocker.AsyncMock(return_value=[]) + mocker.patch( + "prisma.models.UserWorkspaceFile.prisma", + return_value=mock_prisma, + ) + fid = "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee" + + client.post( + "/sessions/sess-1/messages/pending", + json={"message": "hi", "file_ids": [fid, "not-a-uuid"]}, + ) + + call_kwargs = mock_prisma.find_many.call_args[1] + assert call_kwargs["where"]["id"]["in"] == [fid] + assert call_kwargs["where"]["workspaceId"] == "ws-1" + assert call_kwargs["where"]["isDeleted"] is False From 3ef24b32345e579b3db2c0cb145b8d30b477347a Mon Sep 17 00:00:00 2001 From: majdyz Date: Sat, 11 Apr 2026 00:27:15 +0700 Subject: [PATCH 12/30] refactor(copilot): narrow exception handling and type context field - Replace broad `except Exception` with `except (json.JSONDecodeError, ValidationError, TypeError, ValueError)` in drain_pending_messages so unexpected non-data errors propagate instead of being silently swallowed - Introduce `PendingMessageContext` Pydantic model to replace the raw `dict[str, str]` for the context field, making the url/content contract explicit and enabling typed attribute access instead of .get() calls - Update routes.py to construct PendingMessageContext from the validated request dict before passing to PendingMessage - Update tests to use PendingMessageContext directly Addresses coderabbitai review comments. 
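For reviewers, a minimal sketch of the typed contract after this change — the names are those introduced in the diff below, and the values are illustrative only:

    ctx = PendingMessageContext(url="https://example.com", content="page text")
    pm = PendingMessage(content="summarise this", context=ctx)

    # Drain-side formatting now reads typed attributes instead of .get():
    body = format_pending_as_user_message(pm)["content"]
    assert "[Page URL: https://example.com]" in body
    assert "[Page content]\npage text" in body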
--- .../backend/api/features/chat/routes.py | 3 ++- .../backend/copilot/pending_messages.py | 23 +++++++++++-------- .../backend/copilot/pending_messages_test.py | 3 ++- 3 files changed, 18 insertions(+), 11 deletions(-) diff --git a/autogpt_platform/backend/backend/api/features/chat/routes.py b/autogpt_platform/backend/backend/api/features/chat/routes.py index 854ca116fc..d7ebe04507 100644 --- a/autogpt_platform/backend/backend/api/features/chat/routes.py +++ b/autogpt_platform/backend/backend/api/features/chat/routes.py @@ -32,6 +32,7 @@ from backend.copilot.model import ( from backend.copilot.pending_messages import ( MAX_PENDING_MESSAGES, PendingMessage, + PendingMessageContext, push_pending_message, ) from backend.copilot.rate_limit import ( @@ -1162,7 +1163,7 @@ async def queue_pending_message( pending = PendingMessage( content=request.message, file_ids=sanitized_file_ids, - context=request.context, + context=PendingMessageContext(**request.context) if request.context else None, ) buffer_length = await push_pending_message(session_id, pending) diff --git a/autogpt_platform/backend/backend/copilot/pending_messages.py b/autogpt_platform/backend/backend/copilot/pending_messages.py index 4c62cecf25..4c749fe9d8 100644 --- a/autogpt_platform/backend/backend/copilot/pending_messages.py +++ b/autogpt_platform/backend/backend/copilot/pending_messages.py @@ -25,7 +25,7 @@ import json import logging from typing import Any, cast -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, ValidationError from backend.data.redis_client import get_redis_async @@ -45,12 +45,19 @@ _PENDING_CHANNEL_PREFIX = "copilot:pending:notify:" _PENDING_TTL_SECONDS = 3600 # 1 hour — matches stream_ttl default +class PendingMessageContext(BaseModel): + """Structured page context attached to a pending message.""" + + url: str | None = None + content: str | None = None + + class PendingMessage(BaseModel): """A user message queued for injection into an in-flight turn.""" content: str = Field(min_length=1, max_length=16_000) file_ids: list[str] = Field(default_factory=list) - context: dict[str, str] | None = None + context: PendingMessageContext | None = None def _buffer_key(session_id: str) -> str: @@ -153,7 +160,7 @@ async def drain_pending_messages(session_id: str) -> list[PendingMessage]: for payload in decoded: try: messages.append(PendingMessage(**json.loads(payload))) - except Exception as e: + except (json.JSONDecodeError, ValidationError, TypeError, ValueError) as e: logger.warning( "pending_messages: dropping malformed entry for %s: %s", session_id, @@ -198,12 +205,10 @@ def format_pending_as_user_message(message: PendingMessage) -> dict[str, Any]: """ parts: list[str] = [message.content] if message.context: - url = message.context.get("url") - if url: - parts.append(f"\n\n[Page URL: {url}]") - page_content = message.context.get("content") - if page_content: - parts.append(f"\n\n[Page content]\n{page_content}") + if message.context.url: + parts.append(f"\n\n[Page URL: {message.context.url}]") + if message.context.content: + parts.append(f"\n\n[Page content]\n{message.context.content}") if message.file_ids: parts.append( "\n\n[Attached files]\n" diff --git a/autogpt_platform/backend/backend/copilot/pending_messages_test.py b/autogpt_platform/backend/backend/copilot/pending_messages_test.py index 7eef559725..6d0d445feb 100644 --- a/autogpt_platform/backend/backend/copilot/pending_messages_test.py +++ b/autogpt_platform/backend/backend/copilot/pending_messages_test.py @@ -14,6 +14,7 @@ from 
backend.copilot import pending_messages as pm_module from backend.copilot.pending_messages import ( MAX_PENDING_MESSAGES, PendingMessage, + PendingMessageContext, clear_pending_messages, drain_pending_messages, format_pending_as_user_message, @@ -171,7 +172,7 @@ def test_format_pending_plain_text() -> None: def test_format_pending_with_context_url() -> None: msg = PendingMessage( content="see this page", - context={"url": "https://example.com"}, + context=PendingMessageContext(url="https://example.com"), ) out = format_pending_as_user_message(msg) content = out["content"] From 9da0dd111f3540d62a9da3c60bf0518969da2cac Mon Sep 17 00:00:00 2001 From: majdyz Date: Sat, 11 Apr 2026 00:31:03 +0700 Subject: [PATCH 13/30] refactor(copilot): extract shared file-ID sanitization helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extract `_resolve_workspace_files(user_id, file_ids)` helper from the duplicated UUID-filter + workspace-DB-lookup logic in both `stream_chat_post` and `queue_pending_message`. Both endpoints now call the single helper; callers map the returned `list[UserWorkspaceFile]` to IDs or file-description strings as before. Also removes the redundant `if user_id:` guard from `stream_chat_post`'s file-ID block — `Security(auth.get_user_id)` guarantees a non-empty string. Addresses autogpt-pr-reviewer "Should Fix: Duplicated file-ID sanitization" and coderabbitai nit on the if user_id guard. --- .../backend/api/features/chat/routes.py | 94 ++++++++++--------- 1 file changed, 49 insertions(+), 45 deletions(-) diff --git a/autogpt_platform/backend/backend/api/features/chat/routes.py b/autogpt_platform/backend/backend/api/features/chat/routes.py index d7ebe04507..11d9ebf90f 100644 --- a/autogpt_platform/backend/backend/api/features/chat/routes.py +++ b/autogpt_platform/backend/backend/api/features/chat/routes.py @@ -102,6 +102,29 @@ async def _validate_and_get_session( return session +async def _resolve_workspace_files( + user_id: str, + file_ids: list[str], +) -> list[UserWorkspaceFile]: + """Filter *file_ids* to UUID-valid entries that exist in the caller's workspace. + + Returns the matching ``UserWorkspaceFile`` records (empty list if none pass). + Used by both the stream and pending-message endpoints to prevent callers from + referencing other users' files. + """ + valid_ids = [fid for fid in file_ids if _UUID_RE.match(fid)] + if not valid_ids: + return [] + workspace = await get_or_create_workspace(user_id) + return await UserWorkspaceFile.prisma().find_many( + where={ + "id": {"in": valid_ids}, + "workspaceId": workspace.id, + "isDeleted": False, + } + ) + + router = APIRouter( tags=["chat"], ) @@ -850,33 +873,21 @@ async def stream_chat_post( # Also sanitise file_ids so only validated, workspace-scoped IDs are # forwarded downstream (e.g. to the executor via enqueue_copilot_turn). 
sanitized_file_ids: list[str] | None = None - if request.file_ids and user_id: - # Filter to valid UUIDs only to prevent DB abuse - valid_ids = [fid for fid in request.file_ids if _UUID_RE.match(fid)] - - if valid_ids: - workspace = await get_or_create_workspace(user_id) - # Batch query instead of N+1 - files = await UserWorkspaceFile.prisma().find_many( - where={ - "id": {"in": valid_ids}, - "workspaceId": workspace.id, - "isDeleted": False, - } + if request.file_ids: + files = await _resolve_workspace_files(user_id, request.file_ids) + # Only keep IDs that actually exist in the user's workspace + sanitized_file_ids = [wf.id for wf in files] or None + file_lines: list[str] = [ + f"- {wf.name} ({wf.mimeType}, {round(wf.sizeBytes / 1024, 1)} KB), file_id={wf.id}" + for wf in files + ] + if file_lines: + files_block = ( + "\n\n[Attached files]\n" + + "\n".join(file_lines) + + "\nUse read_workspace_file with the file_id to access file contents." ) - # Only keep IDs that actually exist in the user's workspace - sanitized_file_ids = [wf.id for wf in files] or None - file_lines: list[str] = [ - f"- {wf.name} ({wf.mimeType}, {round(wf.sizeBytes / 1024, 1)} KB), file_id={wf.id}" - for wf in files - ] - if file_lines: - files_block = ( - "\n\n[Attached files]\n" - + "\n".join(file_lines) - + "\nUse read_workspace_file with the file_id to access file contents." - ) - request.message += files_block + request.message += files_block # Atomically append user message to session BEFORE creating task to avoid # race condition where GET_SESSION sees task as "running" but message isn't @@ -1128,28 +1139,21 @@ async def queue_pending_message( message_length=len(request.message), ) - # Sanitise file IDs to the user's own workspace (same logic as - # stream_chat_post) so injection doesn't surface other users' files. + # Sanitise file IDs to the user's own workspace so injection doesn't + # surface other users' files. _resolve_workspace_files handles UUID + # filtering and the workspace-scoped DB lookup. sanitized_file_ids: list[str] = [] if request.file_ids: - valid_ids = [fid for fid in request.file_ids if _UUID_RE.match(fid)] - if valid_ids: - workspace = await get_or_create_workspace(user_id) - files = await UserWorkspaceFile.prisma().find_many( - where={ - "id": {"in": valid_ids}, - "workspaceId": workspace.id, - "isDeleted": False, - } + valid_id_count = sum(1 for fid in request.file_ids if _UUID_RE.match(fid)) + files = await _resolve_workspace_files(user_id, request.file_ids) + sanitized_file_ids = [wf.id for wf in files] + if len(sanitized_file_ids) != valid_id_count: + logger.warning( + "queue_pending_message: dropped %d file id(s) not in " + "caller's workspace (session=%s)", + valid_id_count - len(sanitized_file_ids), + session_id, ) - sanitized_file_ids = [wf.id for wf in files] - if len(sanitized_file_ids) != len(valid_ids): - logger.warning( - "queue_pending_message: dropped %d file id(s) not in " - "caller's workspace (session=%s)", - len(valid_ids) - len(sanitized_file_ids), - session_id, - ) # Redis is the single source of truth for pending messages. We do # NOT persist to ``session.messages`` here — the drain-at-start From 18c75beb7a6b9d35c9dbc1c5ff44e0727c8b5c32 Mon Sep 17 00:00:00 2001 From: majdyz Date: Sat, 11 Apr 2026 00:33:49 +0700 Subject: [PATCH 14/30] nit(copilot): name pub/sub notify payload constant Replace magic string "1" in redis.publish() with named constant _NOTIFY_PAYLOAD for self-documentation. Addresses autogpt-pr-reviewer nit. 
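For context, a hedged sketch of the kind of subscriber the constant anticipates — no subscriber ships in this patch series, so the shape below is an assumption built from standard redis-py asyncio pub/sub plus the existing ``get_redis_async`` / ``drain_pending_messages`` helpers:

    redis = await get_redis_async()
    pubsub = redis.pubsub()
    await pubsub.subscribe(_notify_channel(session_id))
    async for msg in pubsub.listen():
        if msg["type"] != "message":
            continue  # skip subscribe confirmations
        # The payload (_NOTIFY_PAYLOAD) is deliberately meaningless — treat
        # any message as a wake-up hint and re-read the authoritative buffer.
        if pending := await drain_pending_messages(session_id):
            ...  # inject into the running turn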
--- .../backend/backend/copilot/pending_messages.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/autogpt_platform/backend/backend/copilot/pending_messages.py b/autogpt_platform/backend/backend/copilot/pending_messages.py index 4c749fe9d8..0875a44046 100644 --- a/autogpt_platform/backend/backend/copilot/pending_messages.py +++ b/autogpt_platform/backend/backend/copilot/pending_messages.py @@ -44,6 +44,10 @@ _PENDING_KEY_PREFIX = "copilot:pending:" _PENDING_CHANNEL_PREFIX = "copilot:pending:notify:" _PENDING_TTL_SECONDS = 3600 # 1 hour — matches stream_ttl default +# Payload sent on the pub/sub notify channel. Subscribers treat any +# message as a wake-up hint; the value itself is not meaningful. +_NOTIFY_PAYLOAD = "1" + class PendingMessageContext(BaseModel): """Structured page context attached to a pending message.""" @@ -115,7 +119,7 @@ async def push_pending_message( # Fire-and-forget notify. Subscribers use this as a wake-up hint; # the buffer itself is authoritative so a lost notify is harmless. try: - await redis.publish(_notify_channel(session_id), "1") + await redis.publish(_notify_channel(session_id), _NOTIFY_PAYLOAD) except Exception as e: # pragma: no cover logger.warning("pending_messages: publish failed for %s: %s", session_id, e) From 9bfcdf3f1112d4e5cb0847434c8f9c10b637c973 Mon Sep 17 00:00:00 2001 From: majdyz Date: Sat, 11 Apr 2026 00:35:27 +0700 Subject: [PATCH 15/30] test(copilot): add combined-fields test for format_pending_as_user_message Verify that content + context (url + content) + file_ids all appear in the formatted output when all fields are present simultaneously. Addresses autogpt-pr-reviewer 'format_pending_as_user_message never tested with all fields simultaneously'. --- .../backend/copilot/pending_messages_test.py | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/autogpt_platform/backend/backend/copilot/pending_messages_test.py b/autogpt_platform/backend/backend/copilot/pending_messages_test.py index 6d0d445feb..cd3f6b7c43 100644 --- a/autogpt_platform/backend/backend/copilot/pending_messages_test.py +++ b/autogpt_platform/backend/backend/copilot/pending_messages_test.py @@ -189,6 +189,26 @@ def test_format_pending_with_file_ids() -> None: assert "file_id=b" in out["content"] +def test_format_pending_with_all_fields() -> None: + """All fields (content + context url/content + file_ids) should all appear.""" + msg = PendingMessage( + content="summarise this", + context=PendingMessageContext( + url="https://example.com/page", + content="headline text", + ), + file_ids=["f1", "f2"], + ) + out = format_pending_as_user_message(msg) + body = out["content"] + assert out["role"] == "user" + assert "summarise this" in body + assert "[Page URL: https://example.com/page]" in body + assert "[Page content]\nheadline text" in body + assert "file_id=f1" in body + assert "file_id=f2" in body + + # ── Malformed payload handling ────────────────────────────────────── From a7d06854e3f58942f522aaab09328872086e2a0c Mon Sep 17 00:00:00 2001 From: majdyz Date: Sat, 11 Apr 2026 00:42:25 +0700 Subject: [PATCH 16/30] feat(copilot): add per-user call-frequency rate limit to pending endpoint MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The token-budget check guards against over-spending but does not prevent rapid-fire pushes from a client with a large budget. 
Add a Redis INCR + EXPIRE fixed-window counter (30 calls per 60-second window per user) to cap call frequency independently of token consumption. Returns HTTP 429 with "Too many pending messages" when exceeded. Fails open (Redis unavailable → allows request). Adds test for the new 429 path. Addresses autogpt-pr-reviewer "Should Fix: per-request rate limit". --- .../backend/api/features/chat/routes.py | 28 +++++++++++++++++ .../backend/api/features/chat/routes_test.py | 30 ++++++++++++++++++- 2 files changed, 57 insertions(+), 1 deletion(-) diff --git a/autogpt_platform/backend/backend/api/features/chat/routes.py b/autogpt_platform/backend/backend/api/features/chat/routes.py index 11d9ebf90f..3ba03150e0 100644 --- a/autogpt_platform/backend/backend/api/features/chat/routes.py +++ b/autogpt_platform/backend/backend/api/features/chat/routes.py @@ -90,6 +90,15 @@ _UUID_RE = re.compile( r"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$", re.I ) +# Call-frequency cap for the pending-message endpoint. The token-budget +# check in queue_pending_message guards against overspend, but does not +# prevent rapid-fire pushes from a client with a large budget. This cap +# (per user, per 60-second window) limits the rate a caller can hammer the +# endpoint independently of token consumption. +_PENDING_CALL_LIMIT = 30 # pushes per minute per user +_PENDING_CALL_WINDOW_SECONDS = 60 +_PENDING_CALL_KEY_PREFIX = "copilot:pending:calls:" + async def _validate_and_get_session( session_id: str, @@ -1133,6 +1142,25 @@ async def queue_pending_message( except RateLimitExceeded as e: raise HTTPException(status_code=429, detail=str(e)) from e + # Call-frequency cap: prevent rapid-fire pushes that would bypass the + # token-budget check (which only fires per-turn, not per-push). + # Uses a Redis INCR + EXPIRE sliding counter; fails open if Redis is down.
+ try: + _redis = await get_redis_async() + _call_key = f"{_PENDING_CALL_KEY_PREFIX}{user_id}" + _call_count = await _redis.incr(_call_key) + if _call_count == 1: + await _redis.expire(_call_key, _PENDING_CALL_WINDOW_SECONDS) + if _call_count > _PENDING_CALL_LIMIT: + raise HTTPException( + status_code=429, + detail=f"Too many pending messages: limit is {_PENDING_CALL_LIMIT} per {_PENDING_CALL_WINDOW_SECONDS}s", + ) + except HTTPException: + raise + except Exception: + pass # Redis failure is non-fatal; fail open + track_user_message( user_id=user_id, session_id=session_id, diff --git a/autogpt_platform/backend/backend/api/features/chat/routes_test.py b/autogpt_platform/backend/backend/api/features/chat/routes_test.py index 18d499047a..1254f13302 100644 --- a/autogpt_platform/backend/backend/api/features/chat/routes_test.py +++ b/autogpt_platform/backend/backend/api/features/chat/routes_test.py @@ -676,7 +676,10 @@ class TestQueuePendingMessageRequest: def _mock_pending_internals( - mocker: pytest_mock.MockerFixture, *, session_exists: bool = True + mocker: pytest_mock.MockerFixture, + *, + session_exists: bool = True, + call_count: int = 1, ): """Mock all async dependencies for the pending-message endpoint.""" if session_exists: @@ -704,6 +707,15 @@ def _mock_pending_internals( new_callable=AsyncMock, return_value=None, ) + # Mock Redis for per-user call-frequency rate limit + mock_redis = mocker.MagicMock() + mock_redis.incr = mocker.AsyncMock(return_value=call_count) + mock_redis.expire = mocker.AsyncMock(return_value=True) + mocker.patch( + "backend.api.features.chat.routes.get_redis_async", + new_callable=AsyncMock, + return_value=mock_redis, + ) mocker.patch( "backend.api.features.chat.routes.track_user_message", return_value=None, @@ -786,6 +798,22 @@ def test_queue_pending_message_rate_limited_returns_429( assert response.status_code == 429 +def test_queue_pending_message_call_frequency_limit_returns_429( + mocker: pytest_mock.MockerFixture, +) -> None: + """When per-user call frequency limit is exceeded, endpoint returns 429.""" + from backend.api.features.chat.routes import _PENDING_CALL_LIMIT + + _mock_pending_internals(mocker, call_count=_PENDING_CALL_LIMIT + 1) + + response = client.post( + "/sessions/sess-1/messages/pending", + json={"message": "hi"}, + ) + assert response.status_code == 429 + assert "Too many pending messages" in response.json()["detail"] + + def test_queue_pending_message_context_url_too_long_returns_422() -> None: """context.url over 2 KB is rejected.""" response = client.post( From c58176365ffdd3a1f93148d5ce25a61af0e515e5 Mon Sep 17 00:00:00 2001 From: majdyz Date: Sat, 11 Apr 2026 08:01:15 +0700 Subject: [PATCH 17/30] fix(backend/copilot): use atomic Lua EVAL for pending call-frequency counter Replace separate INCR + EXPIRE with a single Lua EVAL so the rate-limit key can never be orphaned without a TTL. If the process died between the two commands the key would persist indefinitely, permanently locking out the user after hitting the 30-push limit. Fixes sentry bug report on routes.py:1153. 
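For the record, the race in miniature; a sketch against any asyncio Redis client (``r``, ``key``, and ``window_seconds`` are illustrative names):

    # Before: two round-trips. A crash between them leaves the key with
    # no TTL, so the window never resets and the user stays locked out.
    count = await r.incr(key)
    if count == 1:  # process dies here -> orphaned key
        await r.expire(key, window_seconds)

    # After: one atomic EVAL, so the key can only ever exist with a TTL.
    count = int(await r.eval(_CALL_INCR_LUA, 1, key, str(window_seconds)))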
--- .../backend/api/features/chat/routes.py | 31 ++++++++++++++++--- .../backend/api/features/chat/routes_test.py | 5 ++- 2 files changed, 28 insertions(+), 8 deletions(-) diff --git a/autogpt_platform/backend/backend/api/features/chat/routes.py b/autogpt_platform/backend/backend/api/features/chat/routes.py index 3ba03150e0..7d36fa2485 100644 --- a/autogpt_platform/backend/backend/api/features/chat/routes.py +++ b/autogpt_platform/backend/backend/api/features/chat/routes.py @@ -4,7 +4,7 @@ import asyncio import logging import re from collections.abc import AsyncGenerator -from typing import Annotated +from typing import Annotated, Any, cast from uuid import uuid4 from autogpt_libs import auth @@ -99,6 +99,18 @@ _PENDING_CALL_LIMIT = 30 # pushes per minute per user _PENDING_CALL_WINDOW_SECONDS = 60 _PENDING_CALL_KEY_PREFIX = "copilot:pending:calls:" +# Lua script for atomic INCR + conditional EXPIRE. +# Using a single EVAL ensures the counter never persists without a TTL — +# a bare INCR followed by a separate EXPIRE can leave the key without +# an expiry if the process crashes between the two commands. +_CALL_INCR_LUA = """ +local count = redis.call('INCR', KEYS[1]) +if count == 1 then + redis.call('EXPIRE', KEYS[1], tonumber(ARGV[1])) +end +return count +""" + async def _validate_and_get_session( session_id: str, @@ -1144,13 +1156,22 @@ async def queue_pending_message( # Call-frequency cap: prevent rapid-fire pushes that would bypass the # token-budget check (which only fires per-turn, not per-push). - # Uses a Redis INCR + EXPIRE sliding counter; fails open if Redis is down. + # Uses an atomic Lua EVAL (INCR + EXPIRE) so the key can never be + # orphaned without a TTL; fails open if Redis is down. try: _redis = await get_redis_async() _call_key = f"{_PENDING_CALL_KEY_PREFIX}{user_id}" - _call_count = await _redis.incr(_call_key) - if _call_count == 1: - await _redis.expire(_call_key, _PENDING_CALL_WINDOW_SECONDS) + _call_count = int( + await cast( + "Any", + _redis.eval( + _CALL_INCR_LUA, + 1, + _call_key, + str(_PENDING_CALL_WINDOW_SECONDS), + ), + ) + ) if _call_count > _PENDING_CALL_LIMIT: raise HTTPException( status_code=429, diff --git a/autogpt_platform/backend/backend/api/features/chat/routes_test.py b/autogpt_platform/backend/backend/api/features/chat/routes_test.py index 1254f13302..1c2af0c5e2 100644 --- a/autogpt_platform/backend/backend/api/features/chat/routes_test.py +++ b/autogpt_platform/backend/backend/api/features/chat/routes_test.py @@ -707,10 +707,9 @@ def _mock_pending_internals( new_callable=AsyncMock, return_value=None, ) - # Mock Redis for per-user call-frequency rate limit + # Mock Redis for per-user call-frequency rate limit (atomic Lua EVAL) mock_redis = mocker.MagicMock() - mock_redis.incr = mocker.AsyncMock(return_value=call_count) - mock_redis.expire = mocker.AsyncMock(return_value=True) + mock_redis.eval = mocker.AsyncMock(return_value=call_count) mocker.patch( "backend.api.features.chat.routes.get_redis_async", new_callable=AsyncMock, From 1d05b06e43d497d2b206498c67bc2d9b163ceb94 Mon Sep 17 00:00:00 2001 From: majdyz Date: Sat, 11 Apr 2026 08:25:14 +0700 Subject: [PATCH 18/30] fix(backend/copilot): prevent pending message duplication in stale-transcript gap When use_resume=True and the transcript is stale, _build_query_message computes a gap slice from session.messages[transcript_msg_count:-1]. 
Pending messages drained at turn start are appended to session.messages AND concatenated into current_message, so without the ceiling they appear in both gap_context and current_message. Capture _pre_drain_msg_count before drain_pending_messages() and pass it as session_msg_ceiling to _build_query_message. The gap slice is now bounded at the pre-drain count, preventing pending messages from leaking into the gap. Adds two regression tests in query_builder_test.py. --- .../backend/copilot/sdk/query_builder_test.py | 72 +++++++++++++++++++ .../backend/backend/copilot/sdk/service.py | 31 +++++++- 2 files changed, 101 insertions(+), 2 deletions(-) diff --git a/autogpt_platform/backend/backend/copilot/sdk/query_builder_test.py b/autogpt_platform/backend/backend/copilot/sdk/query_builder_test.py index 57f037baba..4042dae590 100644 --- a/autogpt_platform/backend/backend/copilot/sdk/query_builder_test.py +++ b/autogpt_platform/backend/backend/copilot/sdk/query_builder_test.py @@ -226,6 +226,78 @@ async def test_build_query_no_resume_multi_message(monkeypatch): assert was_compacted is False # mock returns False +@pytest.mark.asyncio +async def test_build_query_session_msg_ceiling_prevents_pending_duplication(): + """session_msg_ceiling stops pending messages from leaking into the gap. + + Scenario: transcript covers 2 messages, session has 2 historical + 1 current + + 2 pending drained at turn start. Without the ceiling the gap would include + the pending messages AND current_message already has them → duplication. + With session_msg_ceiling=3 (pre-drain count) the gap slice is empty and + only current_message carries the pending content. + """ + # session.messages after drain: [hist1, hist2, current_msg, pending1, pending2] + session = _make_session( + [ + ChatMessage(role="user", content="hist1"), + ChatMessage(role="assistant", content="hist2"), + ChatMessage(role="user", content="current msg with pending1 pending2"), + ChatMessage(role="user", content="pending1"), + ChatMessage(role="user", content="pending2"), + ] + ) + # transcript covers hist1+hist2 (2 messages); pre-drain count was 3 (includes current_msg) + result, was_compacted = await _build_query_message( + "current msg with pending1 pending2", + session, + use_resume=True, + transcript_msg_count=2, + session_id="test-session", + session_msg_ceiling=3, # len(session.messages) before drain + ) + # Gap should be empty (transcript_msg_count == ceiling - 1), so no history prepended + assert result == "current msg with pending1 pending2" + assert was_compacted is False + # Pending messages must NOT appear in gap context + assert "pending1" not in result.split("current msg")[0] + + +@pytest.mark.asyncio +async def test_build_query_session_msg_ceiling_preserves_real_gap(): + """session_msg_ceiling still surfaces a genuine stale-transcript gap. + + Scenario: transcript covers 2 messages, session has 4 historical + 1 current + + 2 pending. Ceiling = 5 (pre-drain). Real gap = messages 2-3 (hist3, hist4). 
+ """ + session = _make_session( + [ + ChatMessage(role="user", content="hist1"), + ChatMessage(role="assistant", content="hist2"), + ChatMessage(role="user", content="hist3"), + ChatMessage(role="assistant", content="hist4"), + ChatMessage(role="user", content="current"), + ChatMessage(role="user", content="pending1"), + ChatMessage(role="user", content="pending2"), + ] + ) + result, was_compacted = await _build_query_message( + "current", + session, + use_resume=True, + transcript_msg_count=2, + session_id="test-session", + session_msg_ceiling=5, # pre-drain: [hist1..hist4, current] + ) + # Gap = session.messages[2:4] = [hist3, hist4] + assert "" in result + assert "hist3" in result + assert "hist4" in result + assert "Now, the user says:\ncurrent" in result + # Pending messages must NOT appear in gap + assert "pending1" not in result + assert "pending2" not in result + + @pytest.mark.asyncio async def test_build_query_no_resume_multi_message_compacted(monkeypatch): """When compression actually compacts, was_compacted should be True.""" diff --git a/autogpt_platform/backend/backend/copilot/sdk/service.py b/autogpt_platform/backend/backend/copilot/sdk/service.py index 39299ba14b..4d53611021 100644 --- a/autogpt_platform/backend/backend/copilot/sdk/service.py +++ b/autogpt_platform/backend/backend/copilot/sdk/service.py @@ -959,17 +959,33 @@ async def _build_query_message( use_resume: bool, transcript_msg_count: int, session_id: str, + *, + session_msg_ceiling: int | None = None, ) -> tuple[str, bool]: """Build the query message with appropriate context. + Args: + session_msg_ceiling: If provided, treat ``session.messages`` as if it + only has this many entries when computing the gap slice. Pass + ``len(session.messages)`` captured *before* appending any pending + messages so that mid-turn drains do not skew the gap calculation + and cause pending messages to be duplicated in both the gap context + and ``current_message``. + Returns: Tuple of (query_message, was_compacted). """ msg_count = len(session.messages) + # Use the ceiling if supplied (prevents pending-message duplication when + # messages were appended to session.messages after the drain but before + # this function is called). + effective_count = ( + session_msg_ceiling if session_msg_ceiling is not None else msg_count + ) if use_resume and transcript_msg_count > 0: - if transcript_msg_count < msg_count - 1: - gap = session.messages[transcript_msg_count:-1] + if transcript_msg_count < effective_count - 1: + gap = session.messages[transcript_msg_count : effective_count - 1] compressed, was_compressed = await _compress_messages(gap) gap_context = _format_conversation_context(compressed) if gap_context: @@ -2282,6 +2298,15 @@ async def stream_chat_completion_sdk( if last_user: current_message = last_user[-1].content or "" + # Capture the message count *before* draining so _build_query_message + # can compute the gap slice without including the newly-drained pending + # messages. Pending messages are both appended to session.messages AND + # concatenated into current_message; without the ceiling the gap slice + # would extend into the pending messages and duplicate them in the + # model's input context (gap_context + current_message both containing + # them). + _pre_drain_msg_count = len(session.messages) + # Drain any messages the user queued via POST /messages/pending # while the previous turn was running (or since the session was # idle). 
Messages are drained ATOMICALLY — one LPOP with count @@ -2341,6 +2366,7 @@ async def stream_chat_completion_sdk( use_resume, transcript_msg_count, session_id, + session_msg_ceiling=_pre_drain_msg_count, ) # On the first turn inject user context into the message instead of the # system prompt — the system prompt is now static (same for all users) @@ -2478,6 +2504,7 @@ async def stream_chat_completion_sdk( state.use_resume, state.transcript_msg_count, session_id, + session_msg_ceiling=_pre_drain_msg_count, ) if attachments.hint: state.query_message = f"{state.query_message}\n\n{attachments.hint}" From 6b390d667726d206138a125cd07bd2fd5a0d33f2 Mon Sep 17 00:00:00 2001 From: majdyz Date: Sat, 11 Apr 2026 08:45:54 +0700 Subject: [PATCH 19/30] fix(backend/copilot): apply session_msg_ceiling to no-resume compression fallback The no-resume fallback in _build_query_message used raw msg_count (> 1) to detect multi-message history and session.messages[:-1] for the compression slice. After a turn-start drain appends pending messages, msg_count is inflated and the fallback fires on what should be a fresh first turn, placing the current user message into the history context and delivering a confusing split prompt to the model. Apply session_msg_ceiling to both branches: - elif condition: effective_count > 1 instead of msg_count > 1 - compression slice: session.messages[:effective_count - 1] instead of [:-1] With _pre_drain_msg_count=1 on a first turn with drained pending messages, effective_count=1 so the fallback is correctly skipped and current_message (which already contains both the original and pending text) is returned as-is. Adds regression test covering the spurious-fallback scenario. --- .../backend/copilot/sdk/query_builder_test.py | 33 +++++++++++++++++++ .../backend/backend/copilot/sdk/service.py | 8 +++-- 2 files changed, 38 insertions(+), 3 deletions(-) diff --git a/autogpt_platform/backend/backend/copilot/sdk/query_builder_test.py b/autogpt_platform/backend/backend/copilot/sdk/query_builder_test.py index 4042dae590..4a7bf01823 100644 --- a/autogpt_platform/backend/backend/copilot/sdk/query_builder_test.py +++ b/autogpt_platform/backend/backend/copilot/sdk/query_builder_test.py @@ -298,6 +298,39 @@ async def test_build_query_session_msg_ceiling_preserves_real_gap(): assert "pending2" not in result +@pytest.mark.asyncio +async def test_build_query_session_msg_ceiling_suppresses_spurious_no_resume_fallback(): + """session_msg_ceiling prevents the no-resume compression fallback from + firing on the first turn of a session when pending messages inflate msg_count. + + Scenario: fresh session (1 message) + 1 pending message drained at turn start. + Without the ceiling: msg_count=2 > 1 → fallback triggers → pending message + leaked into history → wrong context sent to model. + With session_msg_ceiling=1 (pre-drain count): effective_count=1, 1 > 1 is False + → fallback does not trigger → current_message returned as-is. 
+ """ + # session.messages after drain: [current_msg, pending_msg] + session = _make_session( + [ + ChatMessage(role="user", content="What is 2 plus 2?"), + ChatMessage(role="user", content="What is 7 plus 7?"), # pending + ] + ) + result, was_compacted = await _build_query_message( + "What is 2 plus 2?\n\nWhat is 7 plus 7?", + session, + use_resume=False, + transcript_msg_count=0, + session_id="test-session", + session_msg_ceiling=1, # pre-drain: only 1 message existed + ) + # Should return current_message directly without wrapping in history context + assert result == "What is 2 plus 2?\n\nWhat is 7 plus 7?" + assert was_compacted is False + # Pending question must NOT appear in a spurious history section + assert "" not in result + + @pytest.mark.asyncio async def test_build_query_no_resume_multi_message_compacted(monkeypatch): """When compression actually compacts, was_compacted should be True.""" diff --git a/autogpt_platform/backend/backend/copilot/sdk/service.py b/autogpt_platform/backend/backend/copilot/sdk/service.py index 4d53611021..88c41f4c51 100644 --- a/autogpt_platform/backend/backend/copilot/sdk/service.py +++ b/autogpt_platform/backend/backend/copilot/sdk/service.py @@ -1001,12 +1001,14 @@ async def _build_query_message( f"{gap_context}\n\nNow, the user says:\n{current_message}", was_compressed, ) - elif not use_resume and msg_count > 1: + elif not use_resume and effective_count > 1: logger.warning( f"[SDK] Using compression fallback for session " - f"{session_id} ({msg_count} messages) — no transcript for --resume" + f"{session_id} ({effective_count} messages) — no transcript for --resume" + ) + compressed, was_compressed = await _compress_messages( + session.messages[: effective_count - 1] ) - compressed, was_compressed = await _compress_messages(session.messages[:-1]) history_context = _format_conversation_context(compressed) if history_context: return ( From d49ffac0a1e0920e911bd392bb1cc73a5ea0f6c6 Mon Sep 17 00:00:00 2001 From: majdyz Date: Sat, 11 Apr 2026 14:55:46 +0000 Subject: [PATCH 20/30] fix(backend/copilot): flush buffered rounds before mid-loop pending drain and wrap turn-start persist Address three review comments on the pending-message PR: 1. (Blocker) Mid-loop pending drain now flushes state.session_messages into session.messages before appending the pending user message, so assistant+tool entries from completed rounds land in chronological order. Without this, the next turn's replay could hit OpenAI tool-call ordering errors (user message interposed between assistant tool_call and its tool result). 2. (Should-Fix) Turn-start upsert_chat_session wrapped in try/except so a transient DB failure doesn't silently lose messages already popped from Redis. Matches the pattern used in mid-loop and SDK drain paths. 3. (Nice-to-Have) Added TestMidLoopPendingFlushOrdering regression test in service_unit_test.py that replays the production flush sequence and asserts chronological ordering of assistant/tool/pending entries. 
--- .../backend/copilot/baseline/service.py | 29 ++++- .../copilot/baseline/service_unit_test.py | 121 ++++++++++++++++++ 2 files changed, 149 insertions(+), 1 deletion(-) diff --git a/autogpt_platform/backend/backend/copilot/baseline/service.py b/autogpt_platform/backend/backend/copilot/baseline/service.py index 4bcdfd80d9..e5de490984 100644 --- a/autogpt_platform/backend/backend/copilot/baseline/service.py +++ b/autogpt_platform/backend/backend/copilot/baseline/service.py @@ -957,7 +957,19 @@ async def stream_chat_completion_baseline( # maybe_append_user_message dedup is wrong here. session.messages.append(ChatMessage(role="user", content=content)) - session = await upsert_chat_session(session) + # Persist the drained pending messages (if any) plus the current user + # message. Wrap in try/except so a transient DB failure here does not + # silently discard messages that were already popped from Redis — the + # turn can still proceed using the in-memory session.messages, and a + # later resume/replay will backfill from the DB on the next turn. + try: + session = await upsert_chat_session(session) + except Exception as _persist_err: + logger.warning( + "[Baseline] Failed to persist session at turn start " + "(pending drain may not be durable): %s", + _persist_err, + ) # Select model based on the per-request mode. 'fast' downgrades to # the cheaper/faster model; everything else keeps the default. @@ -1274,6 +1286,21 @@ async def stream_chat_completion_baseline( continue pending = await drain_pending_messages(session_id) if pending: + # Flush any buffered assistant/tool messages from completed + # rounds into session.messages BEFORE appending the pending + # user message. ``_baseline_conversation_updater`` only + # records assistant+tool rounds into ``state.session_messages`` + # — they are normally batch-flushed in the finally block. + # Without this in-order flush, the mid-loop pending user + # message lands before the preceding round's assistant/tool + # entries, producing chronologically-wrong session.messages + # on persist (user interposed between an assistant tool_call + # and its tool-result), which breaks OpenAI tool-call ordering + # invariants on the next turn's replay. + for _buffered in state.session_messages: + session.messages.append(_buffered) + state.session_messages.clear() + for pm in pending: # ``format_pending_as_user_message`` embeds file # attachments and context URL/page content into the diff --git a/autogpt_platform/backend/backend/copilot/baseline/service_unit_test.py b/autogpt_platform/backend/backend/copilot/baseline/service_unit_test.py index ba1374b720..057530732e 100644 --- a/autogpt_platform/backend/backend/copilot/baseline/service_unit_test.py +++ b/autogpt_platform/backend/backend/copilot/baseline/service_unit_test.py @@ -828,3 +828,124 @@ class TestBaselineCostExtraction: # response was never assigned so cost extraction must not raise assert state.cost_usd is None + + +class TestMidLoopPendingFlushOrdering: + """Regression test for the mid-loop pending drain ordering invariant. + + ``_baseline_conversation_updater`` records assistant+tool entries from + each tool-call round into ``state.session_messages``; the finally block + of ``stream_chat_completion_baseline`` batch-flushes them into + ``session.messages`` at the end of the turn. + + The mid-loop pending drain appends pending user messages directly to + ``session.messages``. 
Without flushing ``state.session_messages`` first, + the pending user message lands BEFORE the preceding round's assistant+ + tool entries in the final persisted ``session.messages`` — which + produces a malformed tool-call/tool-result ordering on the next turn's + replay. + + This test documents the invariant by replaying the production flush + sequence against an in-memory state. + """ + + def test_flush_then_append_preserves_chronological_order(self): + """Mid-loop drain must flush state.session_messages before appending + the pending user message, so the final order matches the + chronological execution order. + """ + # Initial state: user turn already appended by maybe_append_user_message + session_messages: list[ChatMessage] = [ + ChatMessage(role="user", content="original user turn"), + ] + state = _BaselineStreamState() + + # Round 1 completes: conversation_updater buffers assistant+tool + # entries into state.session_messages (but does NOT write to + # session.messages yet). + builder = TranscriptBuilder() + builder.append_user("original user turn") + response = LLMLoopResponse( + response_text="calling search", + tool_calls=[LLMToolCall(id="tc_1", name="search", arguments="{}")], + raw_response=None, + prompt_tokens=0, + completion_tokens=0, + ) + tool_results = [ + ToolCallResult( + tool_call_id="tc_1", tool_name="search", content="search output" + ), + ] + openai_messages: list = [] + _baseline_conversation_updater( + openai_messages, + response, + tool_results=tool_results, + transcript_builder=builder, + state=state, + model="test-model", + ) + # state.session_messages should now hold the round-1 assistant + tool + assert len(state.session_messages) == 2 + assert state.session_messages[0].role == "assistant" + assert state.session_messages[1].role == "tool" + + # --- Mid-loop pending drain (production code pattern) --- + # Flush first, THEN append pending. This is the ordering fix. + for _buffered in state.session_messages: + session_messages.append(_buffered) + state.session_messages.clear() + session_messages.append( + ChatMessage(role="user", content="pending mid-loop message") + ) + + # Round 2 completes: new assistant+tool entries buffer again. + response2 = LLMLoopResponse( + response_text="another call", + tool_calls=[LLMToolCall(id="tc_2", name="calc", arguments="{}")], + raw_response=None, + prompt_tokens=0, + completion_tokens=0, + ) + tool_results2 = [ + ToolCallResult( + tool_call_id="tc_2", tool_name="calc", content="calc output" + ), + ] + _baseline_conversation_updater( + openai_messages, + response2, + tool_results=tool_results2, + transcript_builder=builder, + state=state, + model="test-model", + ) + + # --- Finally-block flush (end of turn) --- + for msg in state.session_messages: + session_messages.append(msg) + + # Assert chronological order: original user, round-1 assistant, + # round-1 tool, pending user, round-2 assistant, round-2 tool. + assert [m.role for m in session_messages] == [ + "user", + "assistant", + "tool", + "user", + "assistant", + "tool", + ] + assert session_messages[0].content == "original user turn" + assert session_messages[3].content == "pending mid-loop message" + # The assistant message carrying tool_call tc_1 must be immediately + # followed by its tool result — no user message interposed. 
+ assert session_messages[1].role == "assistant" + assert session_messages[1].tool_calls is not None + assert session_messages[1].tool_calls[0]["id"] == "tc_1" + assert session_messages[2].role == "tool" + assert session_messages[2].tool_call_id == "tc_1" + # Same invariant for the round after the pending user. + assert session_messages[4].tool_calls is not None + assert session_messages[4].tool_calls[0]["id"] == "tc_2" + assert session_messages[5].tool_call_id == "tc_2" From c70e34c30eaf7512c26d35d1dc009bc564d1fc3f Mon Sep 17 00:00:00 2001 From: majdyz Date: Sat, 11 Apr 2026 15:00:25 +0000 Subject: [PATCH 21/30] fix(backend/copilot): prevent duplicate assistant text after mid-loop pending drain Track _flushed_assistant_text_len on _BaselineStreamState so the finally block only appends assistant text produced AFTER the last mid-loop flush. Without this, state.assistant_text (all rounds) vs state.session_messages (post-flush only) desync caused the startswith(recorded) dedup to fail, duplicating round-1 assistant text in session.messages. Adds regression test in service_unit_test.py. --- .../backend/copilot/baseline/service.py | 15 +++- .../copilot/baseline/service_unit_test.py | 80 +++++++++++++++++++ 2 files changed, 94 insertions(+), 1 deletion(-) diff --git a/autogpt_platform/backend/backend/copilot/baseline/service.py b/autogpt_platform/backend/backend/copilot/baseline/service.py index e5de490984..9a32b6fc65 100644 --- a/autogpt_platform/backend/backend/copilot/baseline/service.py +++ b/autogpt_platform/backend/backend/copilot/baseline/service.py @@ -345,6 +345,11 @@ class _BaselineStreamState: cost_usd: float | None = None thinking_stripper: _ThinkingStripper = field(default_factory=_ThinkingStripper) session_messages: list[ChatMessage] = field(default_factory=list) + # Tracks how much of ``assistant_text`` has already been flushed to + # ``session.messages`` via mid-loop pending drains, so the ``finally`` + # block only appends the *new* assistant text (avoiding duplication of + # round-1 text when round-1 entries were cleared from session_messages). + _flushed_assistant_text_len: int = 0 async def _baseline_llm_caller( @@ -1300,6 +1305,10 @@ async def stream_chat_completion_baseline( for _buffered in state.session_messages: session.messages.append(_buffered) state.session_messages.clear() + # Record how much assistant_text has been covered by the + # structured entries just flushed, so the finally block's + # final-text dedup doesn't re-append rounds already persisted. + state._flushed_assistant_text_len = len(state.assistant_text) for pm in pending: # ``format_pending_as_user_message`` embeds file @@ -1447,7 +1456,11 @@ async def stream_chat_completion_baseline( # no tool calls, i.e. the natural finish). Only add it if the # conversation updater didn't already record it as part of a # tool-call round (which would have empty response_text). - final_text = state.assistant_text + # Only consider assistant text produced AFTER the last mid-loop + # flush. ``_flushed_assistant_text_len`` tracks the prefix already + # persisted via structured session_messages during mid-loop pending + # drains; including it here would duplicate those rounds. 
+ final_text = state.assistant_text[state._flushed_assistant_text_len :] if state.session_messages: # Strip text already captured in tool-call round messages recorded = "".join( diff --git a/autogpt_platform/backend/backend/copilot/baseline/service_unit_test.py b/autogpt_platform/backend/backend/copilot/baseline/service_unit_test.py index 057530732e..b67793076f 100644 --- a/autogpt_platform/backend/backend/copilot/baseline/service_unit_test.py +++ b/autogpt_platform/backend/backend/copilot/baseline/service_unit_test.py @@ -949,3 +949,83 @@ class TestMidLoopPendingFlushOrdering: assert session_messages[4].tool_calls is not None assert session_messages[4].tool_calls[0]["id"] == "tc_2" assert session_messages[5].tool_call_id == "tc_2" + + def test_flushed_assistant_text_len_prevents_duplicate_final_text(self): + """After mid-loop drain clears state.session_messages, the finally + block must not re-append assistant text from rounds already flushed. + + ``state.assistant_text`` accumulates ALL rounds' text, but + ``state.session_messages`` only holds entries from rounds AFTER the + last mid-loop flush. Without ``_flushed_assistant_text_len``, the + ``finally`` block's ``startswith(recorded)`` check fails because + ``recorded`` only covers post-flush rounds, and the full + ``assistant_text`` is appended — duplicating pre-flush rounds. + """ + state = _BaselineStreamState() + session_messages: list[ChatMessage] = [ + ChatMessage(role="user", content="user turn"), + ] + + # Simulate round 1 text accumulation (as _bound_llm_caller does) + state.assistant_text += "calling search" + + # Round 1 conversation_updater buffers structured entries + builder = TranscriptBuilder() + builder.append_user("user turn") + response1 = LLMLoopResponse( + response_text="calling search", + tool_calls=[LLMToolCall(id="tc_1", name="search", arguments="{}")], + raw_response=None, + prompt_tokens=0, + completion_tokens=0, + ) + _baseline_conversation_updater( + [], + response1, + tool_results=[ + ToolCallResult( + tool_call_id="tc_1", tool_name="search", content="result" + ) + ], + transcript_builder=builder, + state=state, + model="test-model", + ) + + # Mid-loop drain: flush + clear + record flushed text length + for _buffered in state.session_messages: + session_messages.append(_buffered) + state.session_messages.clear() + state._flushed_assistant_text_len = len(state.assistant_text) + session_messages.append(ChatMessage(role="user", content="pending message")) + + # Simulate round 2 text accumulation + state.assistant_text += "final answer" + + # Round 2: natural finish (no tool calls → no session_messages entry) + + # --- Finally block logic (production code) --- + for msg in state.session_messages: + session_messages.append(msg) + + final_text = state.assistant_text[state._flushed_assistant_text_len :] + if state.session_messages: + recorded = "".join( + m.content or "" for m in state.session_messages if m.role == "assistant" + ) + if final_text.startswith(recorded): + final_text = final_text[len(recorded) :] + if final_text.strip(): + session_messages.append(ChatMessage(role="assistant", content=final_text)) + + # The final assistant message should only contain round-2 text, + # not the round-1 text that was already flushed mid-loop. 
+ assistant_msgs = [m for m in session_messages if m.role == "assistant"] + # Round-1 structured assistant (from mid-loop flush) + assert assistant_msgs[0].content == "calling search" + assert assistant_msgs[0].tool_calls is not None + # Round-2 final text (from finally block) + assert assistant_msgs[1].content == "final answer" + assert assistant_msgs[1].tool_calls is None + # Crucially: only 2 assistant messages, not 3 (no duplicate) + assert len(assistant_msgs) == 2 From db9eb2913801dfee18c36b987cbd1878d3550431 Mon Sep 17 00:00:00 2001 From: majdyz Date: Sun, 12 Apr 2026 10:13:45 +0000 Subject: [PATCH 22/30] fix(backend): address review findings for pending-message endpoint - Fix off-by-one in rate limit: use >= instead of > for call count check - Move track_user_message() after push_pending_message() so analytics only fires on successful push - Add logger.warning in rate-limiter except-Exception catch instead of silent pass - Use fullmatch instead of match for UUID regex validation - Add extra="forbid" to PendingMessageContext to reject unexpected fields --- .../backend/api/features/chat/routes.py | 20 +++++++++---------- .../backend/copilot/pending_messages.py | 2 +- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/autogpt_platform/backend/backend/api/features/chat/routes.py b/autogpt_platform/backend/backend/api/features/chat/routes.py index 7d36fa2485..7b46690bb3 100644 --- a/autogpt_platform/backend/backend/api/features/chat/routes.py +++ b/autogpt_platform/backend/backend/api/features/chat/routes.py @@ -133,7 +133,7 @@ async def _resolve_workspace_files( Used by both the stream and pending-message endpoints to prevent callers from referencing other users' files. """ - valid_ids = [fid for fid in file_ids if _UUID_RE.match(fid)] + valid_ids = [fid for fid in file_ids if _UUID_RE.fullmatch(fid)] if not valid_ids: return [] workspace = await get_or_create_workspace(user_id) @@ -1172,7 +1172,7 @@ async def queue_pending_message( ), ) ) - if _call_count > _PENDING_CALL_LIMIT: + if _call_count >= _PENDING_CALL_LIMIT: raise HTTPException( status_code=429, detail=f"Too many pending messages: limit is {_PENDING_CALL_LIMIT} per {_PENDING_CALL_WINDOW_SECONDS}s", @@ -1180,20 +1180,14 @@ async def queue_pending_message( except HTTPException: raise except Exception: - pass # Redis failure is non-fatal; fail open - - track_user_message( - user_id=user_id, - session_id=session_id, - message_length=len(request.message), - ) + logger.warning("queue_pending_message: rate-limit check failed, failing open") # non-fatal # Sanitise file IDs to the user's own workspace so injection doesn't # surface other users' files. _resolve_workspace_files handles UUID # filtering and the workspace-scoped DB lookup. sanitized_file_ids: list[str] = [] if request.file_ids: - valid_id_count = sum(1 for fid in request.file_ids if _UUID_RE.match(fid)) + valid_id_count = sum(1 for fid in request.file_ids if _UUID_RE.fullmatch(fid)) files = await _resolve_workspace_files(user_id, request.file_ids) sanitized_file_ids = [wf.id for wf in files] if len(sanitized_file_ids) != valid_id_count: @@ -1220,6 +1214,12 @@ async def queue_pending_message( ) buffer_length = await push_pending_message(session_id, pending) + track_user_message( + user_id=user_id, + session_id=session_id, + message_length=len(request.message), + ) + # Check whether a turn is currently running for UX feedback. 
active_session = await stream_registry.get_session(session_id) turn_in_flight = bool(active_session and active_session.status == "running") diff --git a/autogpt_platform/backend/backend/copilot/pending_messages.py b/autogpt_platform/backend/backend/copilot/pending_messages.py index 0875a44046..20f673215d 100644 --- a/autogpt_platform/backend/backend/copilot/pending_messages.py +++ b/autogpt_platform/backend/backend/copilot/pending_messages.py @@ -49,7 +49,7 @@ _PENDING_TTL_SECONDS = 3600 # 1 hour — matches stream_ttl default _NOTIFY_PAYLOAD = "1" -class PendingMessageContext(BaseModel): +class PendingMessageContext(BaseModel, extra="forbid"): """Structured page context attached to a pending message.""" url: str | None = None From 7b783aa03b5b28049d70300adac3ca8816b00f17 Mon Sep 17 00:00:00 2001 From: majdyz Date: Sun, 12 Apr 2026 11:21:23 +0000 Subject: [PATCH 23/30] fix(backend): use PendingMessageContext type in QueuePendingMessageRequest to prevent 500 Change context field from dict[str,str] to PendingMessageContext so Pydantic validates (including extra="forbid") at request parse time, returning a proper 422 instead of an unhandled ValidationError / 500 when the caller sends unexpected keys. --- .../backend/api/features/chat/routes.py | 16 ++++++------ .../frontend/src/app/api/openapi.json | 25 +++++++++++++++---- 2 files changed, 27 insertions(+), 14 deletions(-) diff --git a/autogpt_platform/backend/backend/api/features/chat/routes.py b/autogpt_platform/backend/backend/api/features/chat/routes.py index 7b46690bb3..638ea64272 100644 --- a/autogpt_platform/backend/backend/api/features/chat/routes.py +++ b/autogpt_platform/backend/backend/api/features/chat/routes.py @@ -181,30 +181,28 @@ class QueuePendingMessageRequest(BaseModel): model_config = ConfigDict(extra="forbid") message: str = Field(min_length=1, max_length=16_000) - context: dict[str, str] | None = Field( + context: PendingMessageContext | None = Field( default=None, - description="Optional page context: expected keys are 'url' and 'content'.", + description="Optional page context with 'url' and 'content' fields.", ) file_ids: list[str] | None = Field(default=None, max_length=20) @field_validator("context") @classmethod def _validate_context_length( - cls, v: dict[str, str] | None - ) -> dict[str, str] | None: + cls, v: PendingMessageContext | None + ) -> PendingMessageContext | None: if v is None: return v # Cap context values to prevent LLM context-window stuffing via # large page payloads (url: 2 KB, content: 32 KB). 
_URL_LIMIT = 2_000 _CONTENT_LIMIT = 32_000 - url = v.get("url", "") - if len(url) > _URL_LIMIT: + if v.url and len(v.url) > _URL_LIMIT: raise ValueError( f"context.url exceeds maximum length of {_URL_LIMIT} characters" ) - content = v.get("content", "") - if len(content) > _CONTENT_LIMIT: + if v.content and len(v.content) > _CONTENT_LIMIT: raise ValueError( f"context.content exceeds maximum length of {_CONTENT_LIMIT} characters" ) @@ -1210,7 +1208,7 @@ async def queue_pending_message( pending = PendingMessage( content=request.message, file_ids=sanitized_file_ids, - context=PendingMessageContext(**request.context) if request.context else None, + context=request.context, ) buffer_length = await push_pending_message(session_id, pending) diff --git a/autogpt_platform/frontend/src/app/api/openapi.json b/autogpt_platform/frontend/src/app/api/openapi.json index 2546df9357..2001b53f87 100644 --- a/autogpt_platform/frontend/src/app/api/openapi.json +++ b/autogpt_platform/frontend/src/app/api/openapi.json @@ -12718,6 +12718,24 @@ "required": ["providers", "pagination"], "title": "ProviderResponse" }, + "PendingMessageContext": { + "properties": { + "url": { + "anyOf": [{ "type": "string" }, { "type": "null" }], + "title": "Url", + "default": null + }, + "content": { + "anyOf": [{ "type": "string" }, { "type": "null" }], + "title": "Content", + "default": null + } + }, + "additionalProperties": false, + "type": "object", + "title": "PendingMessageContext", + "description": "Structured page context attached to a pending message." + }, "QueuePendingMessageRequest": { "properties": { "message": { @@ -12728,14 +12746,11 @@ }, "context": { "anyOf": [ - { - "additionalProperties": { "type": "string" }, - "type": "object" - }, + { "$ref": "#/components/schemas/PendingMessageContext" }, { "type": "null" } ], "title": "Context", - "description": "Optional page context: expected keys are 'url' and 'content'." + "description": "Optional page context with 'url' and 'content' fields." }, "file_ids": { "anyOf": [ From 5d7fa7c216462eba268709dc0f3ea7bc38457e85 Mon Sep 17 00:00:00 2001 From: majdyz Date: Sun, 12 Apr 2026 11:42:06 +0000 Subject: [PATCH 24/30] fix(backend): update test to use PendingMessageContext attribute access MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit context is now a PendingMessageContext object, not a dict — use .url attribute instead of ["url"] subscript. --- .../backend/backend/api/features/chat/routes_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autogpt_platform/backend/backend/api/features/chat/routes_test.py b/autogpt_platform/backend/backend/api/features/chat/routes_test.py index 1c2af0c5e2..401d73bea3 100644 --- a/autogpt_platform/backend/backend/api/features/chat/routes_test.py +++ b/autogpt_platform/backend/backend/api/features/chat/routes_test.py @@ -617,7 +617,7 @@ class TestQueuePendingMessageRequest: context={"url": "https://example.com", "content": "page text"}, ) assert req.context is not None - assert req.context["url"] == "https://example.com" + assert req.context.url == "https://example.com" def test_rejects_context_url_over_limit(self) -> None: import pydantic From f3f598daa3fe824979a24aa069d8f0519491fd95 Mon Sep 17 00:00:00 2001 From: majdyz Date: Sun, 12 Apr 2026 12:10:05 +0000 Subject: [PATCH 25/30] Wrap mid-loop drain_pending_messages in try/except If the Redis drain fails mid-tool-loop, log a warning and treat it as no pending messages rather than crashing the entire copilot turn. 
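Worth noting why "treat as no pending" is safe here: the drain is a single atomic LPOP, so a failed round-trip almost always means nothing was popped, and the messages simply stay in the Redis list under their 1h TTL until the next mid-loop round or turn start drains them. Later patches in this series apply the same guard to the SDK and baseline turn-start drains, so a shared fail-soft wrapper could consolidate the pattern. A sketch (the wrapper name is made up):

    async def drain_or_empty(session_id: str) -> list[PendingMessage]:
        try:
            return await drain_pending_messages(session_id)
        except Exception:
            # The buffer is still intact and will be drained on the next
            # attempt, so degrading to "no pending" loses no messages.
            logger.warning(
                "drain_pending_messages failed for session %s",
                session_id,
                exc_info=True,
            )
            return []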
--- .../backend/backend/copilot/baseline/service.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/autogpt_platform/backend/backend/copilot/baseline/service.py b/autogpt_platform/backend/backend/copilot/baseline/service.py index 9a32b6fc65..05ece25fe0 100644 --- a/autogpt_platform/backend/backend/copilot/baseline/service.py +++ b/autogpt_platform/backend/backend/copilot/baseline/service.py @@ -1289,7 +1289,15 @@ async def stream_chat_completion_baseline( ) if is_final_yield: continue - pending = await drain_pending_messages(session_id) + try: + pending = await drain_pending_messages(session_id) + except Exception: + logger.warning( + "Mid-loop drain_pending_messages failed for session %s", + session_id, + exc_info=True, + ) + pending = [] if pending: # Flush any buffered assistant/tool messages from completed # rounds into session.messages BEFORE appending the pending From 057412ebee31b61b192bd627e18e116c7e040f4b Mon Sep 17 00:00:00 2001 From: majdyz Date: Sun, 12 Apr 2026 23:14:54 +0000 Subject: [PATCH 26/30] fix(copilot): allow exactly 30 pending calls per window Change >= to > so the 30th call (INCR returns 30) is accepted and only the 31st triggers the 429. --- autogpt_platform/backend/backend/api/features/chat/routes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autogpt_platform/backend/backend/api/features/chat/routes.py b/autogpt_platform/backend/backend/api/features/chat/routes.py index 638ea64272..af3753865c 100644 --- a/autogpt_platform/backend/backend/api/features/chat/routes.py +++ b/autogpt_platform/backend/backend/api/features/chat/routes.py @@ -1170,7 +1170,7 @@ async def queue_pending_message( ), ) ) - if _call_count >= _PENDING_CALL_LIMIT: + if _call_count > _PENDING_CALL_LIMIT: raise HTTPException( status_code=429, detail=f"Too many pending messages: limit is {_PENDING_CALL_LIMIT} per {_PENDING_CALL_WINDOW_SECONDS}s", From 10980f3799ccec90fa5f964640a368b48aa4624b Mon Sep 17 00:00:00 2001 From: majdyz Date: Mon, 13 Apr 2026 03:57:54 +0000 Subject: [PATCH 27/30] fix(copilot): wrap SDK turn-start drain in try/except, deduplicate format calls, elevate context length constants MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - sdk/service.py: wrap drain_pending_messages at turn start in try/except; a transient Redis error no longer kills the entire turn (baseline mid-loop drain was already protected, SDK was missed in round 5) - baseline/service.py: pre-compute format_pending_as_user_message content once per drained message and reuse it for both session.messages and transcript_builder — eliminates the redundant second call per message - routes.py: move _URL_LIMIT/_CONTENT_LIMIT out of the validator body into module-level _CONTEXT_URL_MAX_LENGTH/_CONTEXT_CONTENT_MAX_LENGTH so the contract limits are visible to tooling without reading the implementation --- .../backend/api/features/chat/routes.py | 18 +++++++++++------- .../backend/copilot/baseline/service.py | 14 +++++++++----- .../backend/backend/copilot/sdk/service.py | 10 +++++++++- 3 files changed, 29 insertions(+), 13 deletions(-) diff --git a/autogpt_platform/backend/backend/api/features/chat/routes.py b/autogpt_platform/backend/backend/api/features/chat/routes.py index af3753865c..92bbf85652 100644 --- a/autogpt_platform/backend/backend/api/features/chat/routes.py +++ b/autogpt_platform/backend/backend/api/features/chat/routes.py @@ -99,6 +99,11 @@ _PENDING_CALL_LIMIT = 30 # pushes per minute per user _PENDING_CALL_WINDOW_SECONDS 
= 60 _PENDING_CALL_KEY_PREFIX = "copilot:pending:calls:" +# Maximum lengths for pending-message context fields (url: 2 KB, content: 32 KB). +# Enforced by QueuePendingMessageRequest._validate_context_length. +_CONTEXT_URL_MAX_LENGTH = 2_000 +_CONTEXT_CONTENT_MAX_LENGTH = 32_000 + # Lua script for atomic INCR + conditional EXPIRE. # Using a single EVAL ensures the counter never persists without a TTL — # a bare INCR followed by a separate EXPIRE can leave the key without @@ -195,16 +200,15 @@ class QueuePendingMessageRequest(BaseModel): if v is None: return v # Cap context values to prevent LLM context-window stuffing via - # large page payloads (url: 2 KB, content: 32 KB). - _URL_LIMIT = 2_000 - _CONTENT_LIMIT = 32_000 - if v.url and len(v.url) > _URL_LIMIT: + # large page payloads. Limits are module-level constants so + # they are visible to callers and documentation. + if v.url and len(v.url) > _CONTEXT_URL_MAX_LENGTH: raise ValueError( - f"context.url exceeds maximum length of {_URL_LIMIT} characters" + f"context.url exceeds maximum length of {_CONTEXT_URL_MAX_LENGTH} characters" ) - if v.content and len(v.content) > _CONTENT_LIMIT: + if v.content and len(v.content) > _CONTEXT_CONTENT_MAX_LENGTH: raise ValueError( - f"context.content exceeds maximum length of {_CONTENT_LIMIT} characters" + f"context.content exceeds maximum length of {_CONTEXT_CONTENT_MAX_LENGTH} characters" ) return v diff --git a/autogpt_platform/backend/backend/copilot/baseline/service.py b/autogpt_platform/backend/backend/copilot/baseline/service.py index 05ece25fe0..224757556d 100644 --- a/autogpt_platform/backend/backend/copilot/baseline/service.py +++ b/autogpt_platform/backend/backend/copilot/baseline/service.py @@ -949,6 +949,10 @@ async def stream_chat_completion_baseline( # concurrent push lands *after* the drain and stays queued for the # next turn instead of being lost. drained_at_start = await drain_pending_messages(session_id) + # Pre-compute formatted content once per message so we don't call + # format_pending_as_user_message twice (once for session.messages and + # once for transcript_builder below). + drained_at_start_content: list[str] = [] if drained_at_start: logger.info( "[Baseline] Draining %d pending message(s) at turn start for session %s", @@ -957,6 +961,7 @@ async def stream_chat_completion_baseline( ) for pm in drained_at_start: content = format_pending_as_user_message(pm)["content"] + drained_at_start_content.append(content) # Append directly — pending messages are atomically-popped from # Redis and are never stale-cache duplicates, so the # maybe_append_user_message dedup is wrong here. @@ -1043,11 +1048,10 @@ async def stream_chat_completion_baseline( # transcript — otherwise the loaded prior transcript would be # missing them and a mid-turn upload could leave a malformed # assistant-after-assistant structure on the next turn. - if drained_at_start: - for pm in drained_at_start: - transcript_builder.append_user( - content=format_pending_as_user_message(pm)["content"] - ) + # Reuse the pre-computed content strings to avoid calling + # format_pending_as_user_message a second time. 
+ for _drained_content in drained_at_start_content: + transcript_builder.append_user(content=_drained_content) # Generate title for new sessions if is_user_message and not session.title: diff --git a/autogpt_platform/backend/backend/copilot/sdk/service.py b/autogpt_platform/backend/backend/copilot/sdk/service.py index 88c41f4c51..23dcee83e5 100644 --- a/autogpt_platform/backend/backend/copilot/sdk/service.py +++ b/autogpt_platform/backend/backend/copilot/sdk/service.py @@ -2325,7 +2325,15 @@ async def stream_chat_completion_sdk( # immediately after the drain so a crash doesn't lose the messages. # The endpoint deliberately does NOT persist to session.messages — # Redis is the single source of truth until this drain runs. - pending_at_start = await drain_pending_messages(session_id) + try: + pending_at_start = await drain_pending_messages(session_id) + except Exception: + logger.warning( + "%s drain_pending_messages failed at turn start, skipping", + log_prefix, + exc_info=True, + ) + pending_at_start = [] if pending_at_start: logger.info( "%s Draining %d pending message(s) at turn start", From 45f96d5769a75ea3390d416f4baf0ced21a906d5 Mon Sep 17 00:00:00 2001 From: majdyz Date: Mon, 13 Apr 2026 04:24:29 +0000 Subject: [PATCH 28/30] fix(copilot): wrap baseline turn-start drain in try/except; add 404/429 to OpenAPI spec MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Baseline turn-start drain_pending_messages was unprotected — a transient Redis error would propagate up and kill the entire turn stream, unlike the already-protected mid-loop and SDK paths. Wrap with try/except + fallback to [] so a Redis hiccup degrades gracefully. Also adds 404 (session not found) and 429 (rate-limit exceeded) response codes to the pending endpoint's OpenAPI spec so TypeScript clients can handle these error paths correctly. --- .../backend/backend/copilot/baseline/service.py | 9 ++++++++- autogpt_platform/frontend/src/app/api/openapi.json | 4 +++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/autogpt_platform/backend/backend/copilot/baseline/service.py b/autogpt_platform/backend/backend/copilot/baseline/service.py index 224757556d..ad54b20f97 100644 --- a/autogpt_platform/backend/backend/copilot/baseline/service.py +++ b/autogpt_platform/backend/backend/copilot/baseline/service.py @@ -948,7 +948,14 @@ async def stream_chat_completion_baseline( # mid-loop drains missed them). Atomic LPOP guarantees that a # concurrent push lands *after* the drain and stays queued for the # next turn instead of being lost. - drained_at_start = await drain_pending_messages(session_id) + try: + drained_at_start = await drain_pending_messages(session_id) + except Exception: + logger.warning( + "[Baseline] drain_pending_messages failed at turn start, skipping", + exc_info=True, + ) + drained_at_start = [] # Pre-compute formatted content once per message so we don't call # format_pending_as_user_message twice (once for session.messages and # once for transcript_builder below). 
diff --git a/autogpt_platform/frontend/src/app/api/openapi.json b/autogpt_platform/frontend/src/app/api/openapi.json index 2001b53f87..49d8ab64a0 100644 --- a/autogpt_platform/frontend/src/app/api/openapi.json +++ b/autogpt_platform/frontend/src/app/api/openapi.json @@ -1644,6 +1644,7 @@ "401": { "$ref": "#/components/responses/HTTP401NotAuthenticatedError" }, + "404": { "description": "Session not found or access denied" }, "422": { "description": "Validation Error", "content": { @@ -1651,7 +1652,8 @@ "schema": { "$ref": "#/components/schemas/HTTPValidationError" } } } - } + }, + "429": { "description": "Token rate-limit or call-frequency cap exceeded" } } } }, From 6ccb44e0d55b406e6fa70e71317c646778af7aa2 Mon Sep 17 00:00:00 2001 From: majdyz Date: Mon, 13 Apr 2026 07:04:07 +0000 Subject: [PATCH 29/30] fix(copilot): add 404/429 to route decorator, reformat routes.py, regenerate openapi.json Add responses={404, 429} to the pending endpoint's @router.post decorator so FastAPI auto-generates them in the OpenAPI spec. Previously these were only manually added to openapi.json and the CI schema-check (export + diff) stripped them. Also apply black formatting to the long warning line that was failing the backend lint check. --- .../backend/api/features/chat/routes.py | 8 +++- .../frontend/src/app/api/openapi.json | 48 ++++++++----------- 2 files changed, 27 insertions(+), 29 deletions(-) diff --git a/autogpt_platform/backend/backend/api/features/chat/routes.py b/autogpt_platform/backend/backend/api/features/chat/routes.py index 92bbf85652..023e14f3dc 100644 --- a/autogpt_platform/backend/backend/api/features/chat/routes.py +++ b/autogpt_platform/backend/backend/api/features/chat/routes.py @@ -1114,6 +1114,10 @@ async def stream_chat_post( "/sessions/{session_id}/messages/pending", response_model=QueuePendingMessageResponse, status_code=202, + responses={ + 404: {"description": "Session not found or access denied"}, + 429: {"description": "Token rate-limit or call-frequency cap exceeded"}, + }, ) async def queue_pending_message( session_id: str, @@ -1182,7 +1186,9 @@ async def queue_pending_message( except HTTPException: raise except Exception: - logger.warning("queue_pending_message: rate-limit check failed, failing open") # non-fatal + logger.warning( + "queue_pending_message: rate-limit check failed, failing open" + ) # non-fatal # Sanitise file IDs to the user's own workspace so injection doesn't # surface other users' files. 
_resolve_workspace_files handles UUID diff --git a/autogpt_platform/frontend/src/app/api/openapi.json b/autogpt_platform/frontend/src/app/api/openapi.json index 49d8ab64a0..9d0d9a6e8c 100644 --- a/autogpt_platform/frontend/src/app/api/openapi.json +++ b/autogpt_platform/frontend/src/app/api/openapi.json @@ -1653,7 +1653,9 @@ } } }, - "429": { "description": "Token rate-limit or call-frequency cap exceeded" } + "429": { + "description": "Token rate-limit or call-frequency cap exceeded" + } } } }, @@ -9487,14 +9489,7 @@ }, "CreditTransactionType": { "type": "string", - "enum": [ - "TOP_UP", - "USAGE", - "GRANT", - "REFUND", - "CARD_CHECK", - "SUBSCRIPTION" - ], + "enum": ["TOP_UP", "USAGE", "GRANT", "REFUND", "CARD_CHECK"], "title": "CreditTransactionType" }, "DeleteFileResponse": { @@ -12176,6 +12171,22 @@ "title": "PendingHumanReviewModel", "description": "Response model for pending human review data.\n\nRepresents a human review request that is awaiting user action.\nContains all necessary information for a user to review and approve\nor reject data from a Human-in-the-Loop block execution.\n\nAttributes:\n id: Unique identifier for the review record\n user_id: ID of the user who must perform the review\n node_exec_id: ID of the node execution that created this review\n node_id: ID of the node definition (for grouping reviews from same node)\n graph_exec_id: ID of the graph execution containing the node\n graph_id: ID of the graph template being executed\n graph_version: Version number of the graph template\n payload: The actual data payload awaiting review\n instructions: Instructions or message for the reviewer\n editable: Whether the reviewer can edit the data\n status: Current review status (WAITING, APPROVED, or REJECTED)\n review_message: Optional message from the reviewer\n created_at: Timestamp when review was created\n updated_at: Timestamp when review was last modified\n reviewed_at: Timestamp when review was completed (if applicable)" }, + "PendingMessageContext": { + "properties": { + "url": { + "anyOf": [{ "type": "string" }, { "type": "null" }], + "title": "Url" + }, + "content": { + "anyOf": [{ "type": "string" }, { "type": "null" }], + "title": "Content" + } + }, + "additionalProperties": false, + "type": "object", + "title": "PendingMessageContext", + "description": "Structured page context attached to a pending message." + }, "PlatformCostDashboard": { "properties": { "by_provider": { @@ -12720,24 +12731,6 @@ "required": ["providers", "pagination"], "title": "ProviderResponse" }, - "PendingMessageContext": { - "properties": { - "url": { - "anyOf": [{ "type": "string" }, { "type": "null" }], - "title": "Url", - "default": null - }, - "content": { - "anyOf": [{ "type": "string" }, { "type": "null" }], - "title": "Content", - "default": null - } - }, - "additionalProperties": false, - "type": "object", - "title": "PendingMessageContext", - "description": "Structured page context attached to a pending message." - }, "QueuePendingMessageRequest": { "properties": { "message": { @@ -12751,7 +12744,6 @@ { "$ref": "#/components/schemas/PendingMessageContext" }, { "type": "null" } ], - "title": "Context", "description": "Optional page context with 'url' and 'content' fields." 
}, "file_ids": { From ca0c95b5936e9d33e5d5f39b576bd3fa791f8732 Mon Sep 17 00:00:00 2001 From: majdyz Date: Mon, 13 Apr 2026 07:13:21 +0000 Subject: [PATCH 30/30] fix(frontend): add SUBSCRIPTION to CreditTransactionType enum in openapi.json Syncs the OpenAPI spec with the Prisma schema which already includes the SUBSCRIPTION enum value in CreditTransactionType. --- autogpt_platform/frontend/src/app/api/openapi.json | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/autogpt_platform/frontend/src/app/api/openapi.json b/autogpt_platform/frontend/src/app/api/openapi.json index 9d0d9a6e8c..1b3b1b75f2 100644 --- a/autogpt_platform/frontend/src/app/api/openapi.json +++ b/autogpt_platform/frontend/src/app/api/openapi.json @@ -9489,7 +9489,14 @@ }, "CreditTransactionType": { "type": "string", - "enum": ["TOP_UP", "USAGE", "GRANT", "REFUND", "CARD_CHECK"], + "enum": [ + "TOP_UP", + "USAGE", + "GRANT", + "REFUND", + "CARD_CHECK", + "SUBSCRIPTION" + ], "title": "CreditTransactionType" }, "DeleteFileResponse": {