poetry lock

fix(copilot): non-cancelling heartbeat, incremental saves, frontend reconnection
- Replace asyncio.timeout() with asyncio.wait() for SDK message iteration to avoid corrupting the internal anyio stream on timeout (root cause of tool outputs getting stuck) - Add CancelledError handling + pending task cleanup in finally block - Fix _end_text_if_open([]) discarding StreamTextEnd events (Sentry bug) - Save session to DB after each tool input/output for cross-device recovery - Optimize incremental saves by passing existing_message_count to skip redundant DB count queries - Frontend: invalidate session cache + reset resume ref on stream end so SSE reconnection works after drops
2026-02-24 03:00:28 -05:00 · 2026-02-20 15:11:46 +07:00 · 2026-02-20 15:07:18 +07:00 · 2026-02-20 15:07:18 +07:00 · 2026-02-20 11:51:49 +05:30 · 2026-02-20 13:12:05 +07:00
12 changed files with 1045 additions and 153 deletions
--- a/autogpt_platform/backend/backend/copilot/model.py
+++ b/autogpt_platform/backend/backend/copilot/model.py
@@ -432,13 +432,23 @@ async def _get_session_from_db(session_id: str) -> ChatSession | None:
    return session


-async def upsert_chat_session(session: ChatSession) -> ChatSession:
+async def upsert_chat_session(
+    session: ChatSession,
+    *,
+    existing_message_count: int | None = None,
+) -> ChatSession:
    """Update a chat session in both cache and database.

    Uses session-level locking to prevent race conditions when concurrent
    operations (e.g., background title update and main stream handler)
    attempt to upsert the same session simultaneously.

+    Args:
+        existing_message_count: If provided, skip the DB query to count
+            existing messages. The caller is responsible for tracking this
+            accurately. Useful for incremental saves in a streaming loop
+            where the caller already knows how many messages are persisted.
+
    Raises:
        DatabaseError: If the database write fails. The cache is still updated
            as a best-effort optimization, but the error is propagated to ensure
@@ -450,15 +460,20 @@ async def upsert_chat_session(session: ChatSession) -> ChatSession:

    async with lock:
        # Get existing message count from DB for incremental saves
-        existing_message_count = await chat_db().get_chat_session_message_count(
-            session.session_id
-        )
+        if existing_message_count is None:
+            existing_message_count = await chat_db().get_chat_session_message_count(
+                session.session_id
+            )

        db_error: Exception | None = None

        # Save to database (primary storage)
        try:
-            await _save_session_to_db(session, existing_message_count)
+            await _save_session_to_db(
+                session,
+                existing_message_count,
+                skip_existence_check=existing_message_count > 0,
+            )
        except Exception as e:
            logger.error(
                f"Failed to save session {session.session_id} to database: {e}"
@@ -489,21 +504,31 @@ async def upsert_chat_session(session: ChatSession) -> ChatSession:


 async def _save_session_to_db(
-    session: ChatSession, existing_message_count: int
+    session: ChatSession,
+    existing_message_count: int,
+    *,
+    skip_existence_check: bool = False,
 ) -> None:
-    """Save or update a chat session in the database."""
+    """Save or update a chat session in the database.
+
+    Args:
+        skip_existence_check: When True, skip the ``get_chat_session`` query
+            and assume the session row already exists.  Saves one DB round trip
+            for incremental saves during streaming.
+    """
    db = chat_db()

-    # Check if session exists in DB
-    existing = await db.get_chat_session(session.session_id)
+    if not skip_existence_check:
+        # Check if session exists in DB
+        existing = await db.get_chat_session(session.session_id)

-    if not existing:
-        # Create new session
-        await db.create_chat_session(
-            session_id=session.session_id,
-            user_id=session.user_id,
-        )
-        existing_message_count = 0
+        if not existing:
+            # Create new session
+            await db.create_chat_session(
+                session_id=session.session_id,
+                user_id=session.user_id,
+            )
+            existing_message_count = 0

    # Calculate total tokens from usage
    total_prompt = sum(u.prompt_tokens for u in session.usage)
--- a/autogpt_platform/backend/backend/copilot/sdk/response_adapter.py
+++ b/autogpt_platform/backend/backend/copilot/sdk/response_adapter.py
@@ -47,8 +47,9 @@ class SDKResponseAdapter:
    text blocks, tool calls, and message lifecycle.
    """

-    def __init__(self, message_id: str | None = None):
+    def __init__(self, message_id: str | None = None, session_id: str | None = None):
        self.message_id = message_id or str(uuid.uuid4())
+        self.session_id = session_id
        self.text_block_id = str(uuid.uuid4())
        self.has_started_text = False
        self.has_ended_text = False
@@ -61,6 +62,11 @@ class SDKResponseAdapter:
        """Set the task ID for reconnection support."""
        self.task_id = task_id

+    @property
+    def has_unresolved_tool_calls(self) -> bool:
+        """True when there are tool calls that haven't received output yet."""
+        return bool(self.current_tool_calls.keys() - self.resolved_tool_calls)
+
    def convert_message(self, sdk_message: Message) -> list[StreamBaseResponse]:
        """Convert a single SDK message to Vercel AI SDK format."""
        responses: list[StreamBaseResponse] = []
@@ -77,7 +83,12 @@ class SDKResponseAdapter:
        elif isinstance(sdk_message, AssistantMessage):
            # Flush any SDK built-in tool calls that didn't get a UserMessage
            # result (e.g. WebSearch, Read handled internally by the CLI).
-            self._flush_unresolved_tool_calls(responses)
+            # BUT skip flush when this AssistantMessage is a parallel tool
+            # continuation (contains only ToolUseBlocks) — the prior tools
+            # are still executing concurrently and haven't finished yet.
+            is_tool_only = all(isinstance(b, ToolUseBlock) for b in sdk_message.content)
+            if not is_tool_only:
+                self._flush_unresolved_tool_calls(responses)

            # After tool results, the SDK sends a new AssistantMessage for the
            # next LLM turn. Open a new step if the previous one was closed.
@@ -118,8 +129,24 @@ class SDKResponseAdapter:
            blocks = content if isinstance(content, list) else []
            resolved_in_blocks: set[str] = set()

+            sid = (self.session_id or "?")[:12]
+            parent_id_preview = getattr(sdk_message, "parent_tool_use_id", None)
+            logger.info(
+                "[SDK] [%s] UserMessage: %d blocks, content_type=%s, "
+                "parent_tool_use_id=%s",
+                sid,
+                len(blocks),
+                type(content).__name__,
+                parent_id_preview[:12] if parent_id_preview else "None",
+            )
+
            for block in blocks:
                if isinstance(block, ToolResultBlock) and block.tool_use_id:
+                    # Skip if already resolved (e.g. by flush) — the real
+                    # result supersedes the empty flush, but re-emitting
+                    # would confuse the frontend's state machine.
+                    if block.tool_use_id in self.resolved_tool_calls:
+                        continue
                    tool_info = self.current_tool_calls.get(block.tool_use_id, {})
                    tool_name = tool_info.get("name", "unknown")

@@ -144,7 +171,11 @@ class SDKResponseAdapter:
            # Handle SDK built-in tool results carried via parent_tool_use_id
            # instead of (or in addition to) ToolResultBlock content.
            parent_id = sdk_message.parent_tool_use_id
-            if parent_id and parent_id not in resolved_in_blocks:
+            if (
+                parent_id
+                and parent_id not in resolved_in_blocks
+                and parent_id not in self.resolved_tool_calls
+            ):
                tool_info = self.current_tool_calls.get(parent_id, {})
                tool_name = tool_info.get("name", "unknown")

@@ -228,11 +259,28 @@ class SDKResponseAdapter:
        output, which we pop and emit here before the next ``AssistantMessage``
        starts.
        """
+        unresolved = [
+            (tid, info.get("name", "unknown"))
+            for tid, info in self.current_tool_calls.items()
+            if tid not in self.resolved_tool_calls
+        ]
+        sid = (self.session_id or "?")[:12]
+        if not unresolved:
+            logger.info(
+                "[SDK] [%s] Flush called but all %d tool(s) already resolved",
+                sid,
+                len(self.current_tool_calls),
+            )
+            return
+        logger.info(
+            "[SDK] [%s] Flushing %d unresolved tool call(s): %s",
+            sid,
+            len(unresolved),
+            ", ".join(f"{name}({tid[:12]})" for tid, name in unresolved),
+        )
+
        flushed = False
-        for tool_id, tool_info in self.current_tool_calls.items():
-            if tool_id in self.resolved_tool_calls:
-                continue
-            tool_name = tool_info.get("name", "unknown")
+        for tool_id, tool_name in unresolved:
            output = pop_pending_tool_output(tool_name)
            if output is not None:
                responses.append(
@@ -245,9 +293,12 @@ class SDKResponseAdapter:
                )
                self.resolved_tool_calls.add(tool_id)
                flushed = True
-                logger.debug(
-                    f"Flushed pending output for built-in tool {tool_name} "
-                    f"(call {tool_id})"
+                logger.info(
+                    "[SDK] [%s] Flushed stashed output for %s " "(call %s, %d chars)",
+                    sid,
+                    tool_name,
+                    tool_id[:12],
+                    len(output),
                )
            else:
                # No output available — emit an empty output so the frontend
@@ -263,9 +314,14 @@ class SDKResponseAdapter:
                )
                self.resolved_tool_calls.add(tool_id)
                flushed = True
-                logger.debug(
-                    f"Flushed empty output for unresolved tool {tool_name} "
-                    f"(call {tool_id})"
+                logger.warning(
+                    "[SDK] [%s] Flushed EMPTY output for unresolved tool %s "
+                    "(call %s) — stash was empty (likely SDK hook race "
+                    "condition: PostToolUse hook hadn't completed before "
+                    "flush was triggered)",
+                    sid,
+                    tool_name,
+                    tool_id[:12],
                )

        if flushed and self.step_open:
--- a/autogpt_platform/backend/backend/copilot/sdk/response_adapter_test.py
+++ b/autogpt_platform/backend/backend/copilot/sdk/response_adapter_test.py
@@ -1,5 +1,8 @@
 """Unit tests for the SDK response adapter."""

+import asyncio
+
+import pytest
 from claude_agent_sdk import (
    AssistantMessage,
    ResultMessage,
@@ -27,6 +30,10 @@ from backend.copilot.response_model import (

 from .response_adapter import SDKResponseAdapter
 from .tool_adapter import MCP_TOOL_PREFIX
+from .tool_adapter import _pending_tool_outputs as _pto
+from .tool_adapter import _stash_event
+from .tool_adapter import stash_pending_tool_output as _stash
+from .tool_adapter import wait_for_stash


 def _adapter() -> SDKResponseAdapter:
@@ -364,3 +371,310 @@ def test_full_conversation_flow():
        "StreamFinishStep",  # step 2 closed
        "StreamFinish",
    ]
+
+
+# -- Flush unresolved tool calls --------------------------------------------
+
+
+def test_flush_unresolved_at_result_message():
+    """Built-in tools (WebSearch) without UserMessage results get flushed at ResultMessage."""
+    adapter = _adapter()
+    all_responses: list[StreamBaseResponse] = []
+
+    # 1. Init
+    all_responses.extend(
+        adapter.convert_message(SystemMessage(subtype="init", data={}))
+    )
+    # 2. Tool use (built-in tool — no MCP prefix)
+    all_responses.extend(
+        adapter.convert_message(
+            AssistantMessage(
+                content=[
+                    ToolUseBlock(id="ws-1", name="WebSearch", input={"query": "test"})
+                ],
+                model="test",
+            )
+        )
+    )
+    # 3. No UserMessage for this tool — go straight to ResultMessage
+    all_responses.extend(
+        adapter.convert_message(
+            ResultMessage(
+                subtype="success",
+                duration_ms=100,
+                duration_api_ms=50,
+                is_error=False,
+                num_turns=1,
+                session_id="s1",
+            )
+        )
+    )
+
+    types = [type(r).__name__ for r in all_responses]
+    assert types == [
+        "StreamStart",
+        "StreamStartStep",
+        "StreamToolInputStart",
+        "StreamToolInputAvailable",
+        "StreamToolOutputAvailable",  # flushed with empty output
+        "StreamFinishStep",  # step closed by flush
+        "StreamFinish",
+    ]
+    # The flushed output should be empty (no stash available)
+    output_event = [
+        r for r in all_responses if isinstance(r, StreamToolOutputAvailable)
+    ][0]
+    assert output_event.toolCallId == "ws-1"
+    assert output_event.toolName == "WebSearch"
+    assert output_event.output == ""
+
+
+def test_flush_unresolved_at_next_assistant_message():
+    """Built-in tools get flushed when the next AssistantMessage arrives."""
+    adapter = _adapter()
+    all_responses: list[StreamBaseResponse] = []
+
+    # 1. Init
+    all_responses.extend(
+        adapter.convert_message(SystemMessage(subtype="init", data={}))
+    )
+    # 2. Tool use (built-in — no UserMessage will come)
+    all_responses.extend(
+        adapter.convert_message(
+            AssistantMessage(
+                content=[
+                    ToolUseBlock(id="ws-1", name="WebSearch", input={"query": "test"})
+                ],
+                model="test",
+            )
+        )
+    )
+    # 3. Next AssistantMessage triggers flush before processing its blocks
+    all_responses.extend(
+        adapter.convert_message(
+            AssistantMessage(
+                content=[TextBlock(text="Here are the results")], model="test"
+            )
+        )
+    )
+
+    types = [type(r).__name__ for r in all_responses]
+    assert types == [
+        "StreamStart",
+        "StreamStartStep",
+        "StreamToolInputStart",
+        "StreamToolInputAvailable",
+        # Flush at next AssistantMessage:
+        "StreamToolOutputAvailable",
+        "StreamFinishStep",  # step closed by flush
+        # New step for continuation text:
+        "StreamStartStep",
+        "StreamTextStart",
+        "StreamTextDelta",
+    ]
+
+
+def test_flush_with_stashed_output():
+    """Stashed output from PostToolUse hook is used when flushing."""
+    adapter = _adapter()
+
+    # Simulate PostToolUse hook stashing output
+    _pto.set({})
+    _stash("WebSearch", "Search result: 5 items found")
+
+    all_responses: list[StreamBaseResponse] = []
+
+    # Tool use
+    all_responses.extend(
+        adapter.convert_message(
+            AssistantMessage(
+                content=[
+                    ToolUseBlock(id="ws-1", name="WebSearch", input={"query": "test"})
+                ],
+                model="test",
+            )
+        )
+    )
+    # ResultMessage triggers flush
+    all_responses.extend(
+        adapter.convert_message(
+            ResultMessage(
+                subtype="success",
+                duration_ms=100,
+                duration_api_ms=50,
+                is_error=False,
+                num_turns=1,
+                session_id="s1",
+            )
+        )
+    )
+
+    output_events = [
+        r for r in all_responses if isinstance(r, StreamToolOutputAvailable)
+    ]
+    assert len(output_events) == 1
+    assert output_events[0].output == "Search result: 5 items found"
+
+    # Cleanup
+    _pto.set({})  # type: ignore[arg-type]
+
+
+# -- wait_for_stash synchronisation tests --
+
+
+@pytest.mark.asyncio
+async def test_wait_for_stash_signaled():
+    """wait_for_stash returns True when stash_pending_tool_output signals."""
+    _pto.set({})
+    event = asyncio.Event()
+    _stash_event.set(event)
+
+    # Simulate a PostToolUse hook that stashes output after a short delay
+    async def delayed_stash():
+        await asyncio.sleep(0.01)
+        _stash("WebSearch", "result data")
+
+    asyncio.create_task(delayed_stash())
+    result = await wait_for_stash(timeout=1.0)
+
+    assert result is True
+    assert _pto.get({}).get("WebSearch") == ["result data"]
+
+    # Cleanup
+    _pto.set({})  # type: ignore[arg-type]
+    _stash_event.set(None)
+
+
+@pytest.mark.asyncio
+async def test_wait_for_stash_timeout():
+    """wait_for_stash returns False on timeout when no stash occurs."""
+    _pto.set({})
+    event = asyncio.Event()
+    _stash_event.set(event)
+
+    result = await wait_for_stash(timeout=0.05)
+    assert result is False
+
+    # Cleanup
+    _pto.set({})  # type: ignore[arg-type]
+    _stash_event.set(None)
+
+
+@pytest.mark.asyncio
+async def test_wait_for_stash_already_stashed():
+    """wait_for_stash picks up a stash that happened just before the wait."""
+    _pto.set({})
+    event = asyncio.Event()
+    _stash_event.set(event)
+
+    # Stash before waiting — simulates hook completing before message arrives
+    _stash("Read", "file contents")
+    # Event is now set; wait_for_stash detects the fast path and returns
+    # immediately without timing out.
+    result = await wait_for_stash(timeout=0.05)
+    assert result is True
+
+    # But the stash itself is populated
+    assert _pto.get({}).get("Read") == ["file contents"]
+
+    # Cleanup
+    _pto.set({})  # type: ignore[arg-type]
+    _stash_event.set(None)
+
+
+# -- Parallel tool call tests --
+
+
+def test_parallel_tool_calls_not_flushed_prematurely():
+    """Parallel tool calls should NOT be flushed when the next AssistantMessage
+    only contains ToolUseBlocks (parallel continuation)."""
+    adapter = SDKResponseAdapter()
+
+    # Init
+    adapter.convert_message(SystemMessage(subtype="init", data={}))
+
+    # First AssistantMessage: tool call #1
+    msg1 = AssistantMessage(
+        content=[ToolUseBlock(id="t1", name="WebSearch", input={"q": "foo"})],
+        model="test",
+    )
+    r1 = adapter.convert_message(msg1)
+    assert any(isinstance(r, StreamToolInputAvailable) for r in r1)
+    assert adapter.has_unresolved_tool_calls
+
+    # Second AssistantMessage: tool call #2 (parallel continuation)
+    msg2 = AssistantMessage(
+        content=[ToolUseBlock(id="t2", name="WebSearch", input={"q": "bar"})],
+        model="test",
+    )
+    r2 = adapter.convert_message(msg2)
+
+    # No flush should have happened — t1 should NOT have StreamToolOutputAvailable
+    output_events = [r for r in r2 if isinstance(r, StreamToolOutputAvailable)]
+    assert len(output_events) == 0, (
+        f"Tool-only AssistantMessage should not flush prior tools, "
+        f"but got {len(output_events)} output events"
+    )
+
+    # Both t1 and t2 should still be unresolved
+    assert "t1" not in adapter.resolved_tool_calls
+    assert "t2" not in adapter.resolved_tool_calls
+
+
+def test_text_assistant_message_flushes_prior_tools():
+    """An AssistantMessage with text (new turn) should flush unresolved tools."""
+    adapter = SDKResponseAdapter()
+
+    # Init
+    adapter.convert_message(SystemMessage(subtype="init", data={}))
+
+    # Tool call
+    msg1 = AssistantMessage(
+        content=[ToolUseBlock(id="t1", name="WebSearch", input={"q": "foo"})],
+        model="test",
+    )
+    adapter.convert_message(msg1)
+    assert adapter.has_unresolved_tool_calls
+
+    # Text AssistantMessage (new turn after tools completed)
+    msg2 = AssistantMessage(
+        content=[TextBlock(text="Here are the results")],
+        model="test",
+    )
+    r2 = adapter.convert_message(msg2)
+
+    # Flush SHOULD have happened — t1 gets empty output
+    output_events = [r for r in r2 if isinstance(r, StreamToolOutputAvailable)]
+    assert len(output_events) == 1
+    assert output_events[0].toolCallId == "t1"
+    assert "t1" in adapter.resolved_tool_calls
+
+
+def test_already_resolved_tool_skipped_in_user_message():
+    """A tool result in UserMessage should be skipped if already resolved by flush."""
+    adapter = SDKResponseAdapter()
+
+    adapter.convert_message(SystemMessage(subtype="init", data={}))
+
+    # Tool call + flush via text message
+    adapter.convert_message(
+        AssistantMessage(
+            content=[ToolUseBlock(id="t1", name="WebSearch", input={})],
+            model="test",
+        )
+    )
+    adapter.convert_message(
+        AssistantMessage(
+            content=[TextBlock(text="Done")],
+            model="test",
+        )
+    )
+    assert "t1" in adapter.resolved_tool_calls
+
+    # Now UserMessage arrives with the real result — should be skipped
+    user_msg = UserMessage(content=[ToolResultBlock(tool_use_id="t1", content="real")])
+    r = adapter.convert_message(user_msg)
+    output_events = [r_ for r_ in r if isinstance(r_, StreamToolOutputAvailable)]
+    assert (
+        len(output_events) == 0
+    ), "Already-resolved tool should not emit duplicate output"
--- a/autogpt_platform/backend/backend/copilot/sdk/sdk_compat_test.py
+++ b/autogpt_platform/backend/backend/copilot/sdk/sdk_compat_test.py
@@ -0,0 +1,194 @@
+"""SDK compatibility tests — verify the claude-agent-sdk public API surface we depend on.
+
+Instead of pinning to a narrow version range, these tests verify that the
+installed SDK exposes every class, function, attribute, and method the copilot
+integration relies on.  If an SDK upgrade removes or renames something these
+tests will catch it immediately.
+"""
+
+import inspect
+
+import pytest
+
+# ---------------------------------------------------------------------------
+# Public types & factories
+# ---------------------------------------------------------------------------
+
+
+def test_sdk_exports_client_and_options():
+    from claude_agent_sdk import ClaudeAgentOptions, ClaudeSDKClient
+
+    assert inspect.isclass(ClaudeSDKClient)
+    assert inspect.isclass(ClaudeAgentOptions)
+
+
+def test_sdk_exports_message_types():
+    from claude_agent_sdk import (
+        AssistantMessage,
+        Message,
+        ResultMessage,
+        SystemMessage,
+        UserMessage,
+    )
+
+    for cls in (AssistantMessage, ResultMessage, SystemMessage, UserMessage):
+        assert inspect.isclass(cls), f"{cls.__name__} is not a class"
+    # Message is a Union type alias, just verify it's importable
+    assert Message is not None
+
+
+def test_sdk_exports_content_block_types():
+    from claude_agent_sdk import TextBlock, ToolResultBlock, ToolUseBlock
+
+    for cls in (TextBlock, ToolResultBlock, ToolUseBlock):
+        assert inspect.isclass(cls), f"{cls.__name__} is not a class"
+
+
+def test_sdk_exports_mcp_helpers():
+    from claude_agent_sdk import create_sdk_mcp_server, tool
+
+    assert callable(create_sdk_mcp_server)
+    assert callable(tool)
+
+
+# ---------------------------------------------------------------------------
+# ClaudeSDKClient interface
+# ---------------------------------------------------------------------------
+
+
+def test_client_has_required_methods():
+    from claude_agent_sdk import ClaudeSDKClient
+
+    required = ["connect", "disconnect", "query", "receive_messages"]
+    for name in required:
+        attr = getattr(ClaudeSDKClient, name, None)
+        assert attr is not None, f"ClaudeSDKClient.{name} missing"
+        assert callable(attr), f"ClaudeSDKClient.{name} is not callable"
+
+
+def test_client_supports_async_context_manager():
+    from claude_agent_sdk import ClaudeSDKClient
+
+    assert hasattr(ClaudeSDKClient, "__aenter__")
+    assert hasattr(ClaudeSDKClient, "__aexit__")
+
+
+# ---------------------------------------------------------------------------
+# ClaudeAgentOptions fields
+# ---------------------------------------------------------------------------
+
+
+def test_agent_options_accepts_required_fields():
+    """Verify ClaudeAgentOptions accepts all kwargs our code passes."""
+    from claude_agent_sdk import ClaudeAgentOptions
+
+    opts = ClaudeAgentOptions(
+        system_prompt="test",
+        cwd="/tmp",
+    )
+    assert opts.system_prompt == "test"
+    assert opts.cwd == "/tmp"
+
+
+def test_agent_options_accepts_all_our_fields():
+    """Comprehensive check of every field we use in service.py."""
+    from claude_agent_sdk import ClaudeAgentOptions
+
+    fields_we_use = [
+        "system_prompt",
+        "mcp_servers",
+        "allowed_tools",
+        "disallowed_tools",
+        "hooks",
+        "cwd",
+        "model",
+        "env",
+        "resume",
+        "max_buffer_size",
+    ]
+    sig = inspect.signature(ClaudeAgentOptions)
+    for field in fields_we_use:
+        assert field in sig.parameters, (
+            f"ClaudeAgentOptions no longer accepts '{field}' — "
+            f"available params: {list(sig.parameters.keys())}"
+        )
+
+
+# ---------------------------------------------------------------------------
+# Message attributes
+# ---------------------------------------------------------------------------
+
+
+def test_assistant_message_has_content_and_model():
+    from claude_agent_sdk import AssistantMessage, TextBlock
+
+    msg = AssistantMessage(content=[TextBlock(text="hi")], model="test")
+    assert hasattr(msg, "content")
+    assert hasattr(msg, "model")
+
+
+def test_result_message_has_required_attrs():
+    from claude_agent_sdk import ResultMessage
+
+    msg = ResultMessage(
+        subtype="success",
+        duration_ms=100,
+        duration_api_ms=50,
+        is_error=False,
+        num_turns=1,
+        session_id="s1",
+    )
+    assert msg.subtype == "success"
+    assert hasattr(msg, "result")
+
+
+def test_system_message_has_subtype_and_data():
+    from claude_agent_sdk import SystemMessage
+
+    msg = SystemMessage(subtype="init", data={})
+    assert msg.subtype == "init"
+    assert msg.data == {}
+
+
+def test_user_message_has_parent_tool_use_id():
+    from claude_agent_sdk import UserMessage
+
+    msg = UserMessage(content="test")
+    assert hasattr(msg, "parent_tool_use_id")
+    assert hasattr(msg, "tool_use_result")
+
+
+def test_tool_use_block_has_id_name_input():
+    from claude_agent_sdk import ToolUseBlock
+
+    block = ToolUseBlock(id="t1", name="test", input={"key": "val"})
+    assert block.id == "t1"
+    assert block.name == "test"
+    assert block.input == {"key": "val"}
+
+
+def test_tool_result_block_has_required_attrs():
+    from claude_agent_sdk import ToolResultBlock
+
+    block = ToolResultBlock(tool_use_id="t1", content="result")
+    assert block.tool_use_id == "t1"
+    assert block.content == "result"
+    assert hasattr(block, "is_error")
+
+
+# ---------------------------------------------------------------------------
+# Hook types
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.parametrize(
+    "hook_event",
+    ["PreToolUse", "PostToolUse", "Stop"],
+)
+def test_sdk_exports_hook_event_type(hook_event: str):
+    """Verify HookEvent literal includes the events our security_hooks use."""
+    from claude_agent_sdk.types import HookEvent
+
+    # HookEvent is a Literal type — check that our events are valid values.
+    # We can't easily inspect Literal at runtime, so just verify the type exists.
+    assert HookEvent is not None
--- a/autogpt_platform/backend/backend/copilot/sdk/security_hooks.py
+++ b/autogpt_platform/backend/backend/copilot/sdk/security_hooks.py
@@ -246,15 +246,33 @@ def create_security_hooks(
            """
            _ = context
            tool_name = cast(str, input_data.get("tool_name", ""))
-            logger.debug(f"[SDK] Tool success: {tool_name}, tool_use_id={tool_use_id}")
+            is_builtin = not tool_name.startswith(MCP_TOOL_PREFIX)
+            logger.info(
+                "[SDK] PostToolUse: %s (builtin=%s, tool_use_id=%s)",
+                tool_name,
+                is_builtin,
+                (tool_use_id or "")[:12],
+            )

            # Stash output for SDK built-in tools so the response adapter can
            # emit StreamToolOutputAvailable even when the CLI doesn't surface
            # a separate UserMessage with ToolResultBlock content.
-            if not tool_name.startswith(MCP_TOOL_PREFIX):
+            if is_builtin:
                tool_response = input_data.get("tool_response")
                if tool_response is not None:
+                    resp_preview = str(tool_response)[:100]
+                    logger.info(
+                        "[SDK] Stashing builtin output for %s (%d chars): %s...",
+                        tool_name,
+                        len(str(tool_response)),
+                        resp_preview,
+                    )
                    stash_pending_tool_output(tool_name, tool_response)
+                else:
+                    logger.warning(
+                        "[SDK] PostToolUse for builtin %s but tool_response is None",
+                        tool_name,
+                    )

            return cast(SyncHookJSONOutput, {})

--- a/autogpt_platform/backend/backend/copilot/sdk/service.py
+++ b/autogpt_platform/backend/backend/copilot/sdk/service.py
@@ -24,6 +24,7 @@ from ..response_model import (
    StreamBaseResponse,
    StreamError,
    StreamFinish,
+    StreamFinishStep,
    StreamHeartbeat,
    StreamStart,
    StreamTextDelta,
@@ -46,6 +47,7 @@ from .tool_adapter import (
    LongRunningCallback,
    create_copilot_mcp_server,
    set_execution_context,
+    wait_for_stash,
 )
 from .transcript import (
    cleanup_cli_project_dir,
@@ -344,15 +346,15 @@ async def _compress_conversation_history(

    Returns the compressed prior messages (everything except the current message).
    """
-    prior = session.messages[:-1]
-    if len(prior) < 2:
-        return prior
+    messages = session.messages[:-1]
+    if len(messages) < 2:
+        return messages

    from backend.util.prompt import compress_context

    # Convert ChatMessages to dicts for compress_context
    messages_dict = []
-    for msg in prior:
+    for msg in messages:
        msg_dict: dict[str, Any] = {"role": msg.role}
        if msg.content:
            msg_dict["content"] = msg.content
@@ -400,7 +402,7 @@ async def _compress_conversation_history(
            for m in result.messages
        ]

-    return prior
+    return messages


 def _format_conversation_context(messages: list[ChatMessage]) -> str | None:
@@ -442,8 +444,8 @@ def _format_conversation_context(messages: list[ChatMessage]) -> str | None:
 def _is_tool_error_or_denial(content: str | None) -> bool:
    """Check if a tool message content indicates an error or denial.

-    We include these in conversation context so the agent doesn't
-    hallucinate success for operations that actually failed.
+    Currently unused — ``_format_conversation_context`` includes all tool
+    results.  Kept as a utility for future selective filtering.
    """
    if not content:
        return False
@@ -458,7 +460,7 @@ def _is_tool_error_or_denial(content: str | None) -> bool:
            "maximum",  # subtask-limit denial
            "denied",
            "blocked",
-            "failed",  # internal tool execution failures
+            "failed to",  # internal tool execution failures
            '"iserror": true',  # MCP protocol error flag
        )
    )
@@ -674,7 +676,7 @@ async def stream_chat_completion_sdk(

            options = ClaudeAgentOptions(**sdk_options_kwargs)  # type: ignore[arg-type]

-            adapter = SDKResponseAdapter(message_id=message_id)
+            adapter = SDKResponseAdapter(message_id=message_id, session_id=session_id)
            adapter.set_task_id(task_id)

            async with ClaudeSDKClient(options=options) as client:
@@ -699,10 +701,13 @@ async def stream_chat_completion_sdk(
                    transcript_msg_count,
                    session_id,
                )
-
                logger.info(
-                    f"[SDK] Sending query ({len(session.messages)} msgs, "
-                    f"resume={use_resume})"
+                    "[SDK] [%s] Sending query — resume=%s, "
+                    "total_msgs=%d, query_len=%d",
+                    session_id[:12],
+                    use_resume,
+                    len(session.messages),
+                    len(query_message),
                )
                await client.query(query_message, session_id=session_id)

@@ -710,98 +715,296 @@ async def stream_chat_completion_sdk(
                accumulated_tool_calls: list[dict[str, Any]] = []
                has_appended_assistant = False
                has_tool_results = False
+                # Track persisted message count to skip DB count queries
+                # on incremental saves.  Initial save happened at line 545.
+                saved_msg_count = len(session.messages)

-                # Use an explicit async iterator with timeout to send
-                # heartbeats when the CLI is idle (e.g. executing tools).
-                # This prevents proxies/LBs from closing the SSE connection.
-                # asyncio.timeout() is preferred over asyncio.wait_for()
-                # because wait_for wraps in a separate Task whose cancellation
-                # can leave the async generator in a broken state.
+                # Use an explicit async iterator with non-cancelling heartbeats.
+                # CRITICAL: we must NOT cancel __anext__() mid-flight — doing so
+                # (via asyncio.timeout or wait_for) corrupts the SDK's internal
+                # anyio memory stream, causing StopAsyncIteration on the next
+                # call and silently dropping all in-flight tool results.
+                # Instead, wrap __anext__() in a Task and use asyncio.wait()
+                # with a timeout.  On timeout we emit a heartbeat but keep the
+                # Task alive so it can deliver the next message.
                msg_iter = client.receive_messages().__aiter__()
-                while not stream_completed:
-                    try:
-                        async with asyncio.timeout(_HEARTBEAT_INTERVAL):
-                            sdk_msg = await msg_iter.__anext__()
-                    except TimeoutError:
-                        yield StreamHeartbeat()
-                        continue
-                    except StopAsyncIteration:
-                        break
+                pending_task: asyncio.Task[Any] | None = None
+                try:
+                    while not stream_completed:
+                        if pending_task is None:

-                    logger.debug(
-                        f"[SDK] Received: {type(sdk_msg).__name__} "
-                        f"{getattr(sdk_msg, 'subtype', '')}"
-                    )
-                    for response in adapter.convert_message(sdk_msg):
-                        if isinstance(response, StreamStart):
+                            async def _next_msg() -> Any:
+                                return await msg_iter.__anext__()
+
+                            pending_task = asyncio.create_task(_next_msg())
+
+                        done, _ = await asyncio.wait(
+                            {pending_task}, timeout=_HEARTBEAT_INTERVAL
+                        )
+
+                        if not done:
+                            # Timeout — emit heartbeat but keep the task alive
+                            yield StreamHeartbeat()
                            continue

-                        # Log tool events for debugging visibility issues
+                        # Task completed — get result
+                        pending_task = None
+                        try:
+                            sdk_msg = done.pop().result()
+                        except StopAsyncIteration:
+                            logger.info(
+                                "[SDK] [%s] Stream ended normally "
+                                "(StopAsyncIteration)",
+                                session_id[:12],
+                            )
+                            break
+                        except Exception as stream_err:
+                            # SDK sends {"type": "error"} which raises
+                            # Exception in receive_messages() — capture it
+                            # so the session can still be saved and the
+                            # frontend gets a clean finish.
+                            logger.error(
+                                "[SDK] [%s] Stream error from SDK: %s",
+                                session_id[:12],
+                                stream_err,
+                                exc_info=True,
+                            )
+                            yield StreamError(
+                                errorText=f"SDK stream error: {stream_err}",
+                                code="sdk_stream_error",
+                            )
+                            break
+
+                        logger.info(
+                            "[SDK] [%s] Received: %s %s "
+                            "(unresolved=%d, current=%d, resolved=%d)",
+                            session_id[:12],
+                            type(sdk_msg).__name__,
+                            getattr(sdk_msg, "subtype", ""),
+                            len(adapter.current_tool_calls)
+                            - len(adapter.resolved_tool_calls),
+                            len(adapter.current_tool_calls),
+                            len(adapter.resolved_tool_calls),
+                        )
+
+                        # Race-condition fix: SDK hooks (PostToolUse) are
+                        # executed asynchronously via start_soon() — the next
+                        # message can arrive before the hook stashes output.
+                        # wait_for_stash() awaits an asyncio.Event signaled by
+                        # stash_pending_tool_output(), completing as soon as
+                        # the hook finishes (typically <1ms).  The sleep(0)
+                        # after lets any remaining concurrent hooks complete.
+                        #
+                        # Skip for parallel tool continuations: when the SDK
+                        # sends parallel tool calls as separate
+                        # AssistantMessages (each containing only
+                        # ToolUseBlocks), we must NOT wait/flush — the prior
+                        # tools are still executing concurrently.
+                        from claude_agent_sdk import (
+                            AssistantMessage,
+                            ResultMessage,
+                            ToolUseBlock,
+                        )
+
+                        is_parallel_continuation = isinstance(
+                            sdk_msg, AssistantMessage
+                        ) and all(isinstance(b, ToolUseBlock) for b in sdk_msg.content)
+
+                        if (
+                            adapter.has_unresolved_tool_calls
+                            and isinstance(sdk_msg, (AssistantMessage, ResultMessage))
+                            and not is_parallel_continuation
+                        ):
+                            if await wait_for_stash(timeout=0.5):
+                                await asyncio.sleep(0)
+                            else:
+                                logger.warning(
+                                    "[SDK] [%s] Timed out waiting for "
+                                    "PostToolUse hook stash "
+                                    "(%d unresolved tool calls)",
+                                    session_id[:12],
+                                    len(adapter.current_tool_calls)
+                                    - len(adapter.resolved_tool_calls),
+                                )
+
+                        for response in adapter.convert_message(sdk_msg):
+                            if isinstance(response, StreamStart):
+                                continue
+
+                            # Log tool events for debugging
+                            if isinstance(
+                                response,
+                                (
+                                    StreamToolInputAvailable,
+                                    StreamToolOutputAvailable,
+                                ),
+                            ):
+                                extra = ""
+                                if isinstance(response, StreamToolOutputAvailable):
+                                    out_len = len(str(response.output))
+                                    extra = f", output_len={out_len}"
+                                logger.info(
+                                    "[SDK] [%s] Tool event: %s, tool=%s%s",
+                                    session_id[:12],
+                                    type(response).__name__,
+                                    getattr(response, "toolName", "N/A"),
+                                    extra,
+                                )
+
+                            yield response
+
+                            if isinstance(response, StreamTextDelta):
+                                delta = response.delta or ""
+                                # After tool results, start a new assistant
+                                # message for the post-tool text.
+                                if has_tool_results and has_appended_assistant:
+                                    assistant_response = ChatMessage(
+                                        role="assistant", content=delta
+                                    )
+                                    accumulated_tool_calls = []
+                                    has_appended_assistant = False
+                                    has_tool_results = False
+                                    session.messages.append(assistant_response)
+                                    has_appended_assistant = True
+                                else:
+                                    assistant_response.content = (
+                                        assistant_response.content or ""
+                                    ) + delta
+                                    if not has_appended_assistant:
+                                        session.messages.append(assistant_response)
+                                        has_appended_assistant = True
+
+                            elif isinstance(response, StreamToolInputAvailable):
+                                accumulated_tool_calls.append(
+                                    {
+                                        "id": response.toolCallId,
+                                        "type": "function",
+                                        "function": {
+                                            "name": response.toolName,
+                                            "arguments": json.dumps(
+                                                response.input or {}
+                                            ),
+                                        },
+                                    }
+                                )
+                                assistant_response.tool_calls = accumulated_tool_calls
+                                if not has_appended_assistant:
+                                    session.messages.append(assistant_response)
+                                    has_appended_assistant = True
+                                # Save before tool execution starts so the
+                                # pending tool call is visible on refresh /
+                                # other devices.
+                                try:
+                                    await upsert_chat_session(
+                                        session,
+                                        existing_message_count=saved_msg_count,
+                                    )
+                                    saved_msg_count = len(session.messages)
+                                except Exception as save_err:
+                                    logger.warning(
+                                        "[SDK] [%s] Incremental save " "failed: %s",
+                                        session_id[:12],
+                                        save_err,
+                                    )
+
+                            elif isinstance(response, StreamToolOutputAvailable):
+                                session.messages.append(
+                                    ChatMessage(
+                                        role="tool",
+                                        content=(
+                                            response.output
+                                            if isinstance(response.output, str)
+                                            else str(response.output)
+                                        ),
+                                        tool_call_id=response.toolCallId,
+                                    )
+                                )
+                                has_tool_results = True
+                                # Save after tool completes so the result is
+                                # visible on refresh / other devices.
+                                try:
+                                    await upsert_chat_session(
+                                        session,
+                                        existing_message_count=saved_msg_count,
+                                    )
+                                    saved_msg_count = len(session.messages)
+                                except Exception as save_err:
+                                    logger.warning(
+                                        "[SDK] [%s] Incremental save " "failed: %s",
+                                        session_id[:12],
+                                        save_err,
+                                    )
+
+                            elif isinstance(response, StreamFinish):
+                                stream_completed = True
+
+                except asyncio.CancelledError:
+                    # Task/generator was cancelled (e.g. client disconnect,
+                    # server shutdown).  Log and let the safety-net / finally
+                    # blocks handle cleanup.
+                    logger.warning(
+                        "[SDK] [%s] Streaming loop cancelled "
+                        "(asyncio.CancelledError)",
+                        session_id[:12],
+                    )
+                    raise
+                finally:
+                    # Cancel the pending __anext__ task to avoid a leaked
+                    # coroutine.  This is safe even if the task already
+                    # completed.
+                    if pending_task is not None and not pending_task.done():
+                        pending_task.cancel()
+                        try:
+                            await pending_task
+                        except (asyncio.CancelledError, StopAsyncIteration):
+                            pass
+
+                # Safety net: if tools are still unresolved after the
+                # streaming loop (e.g. StopAsyncIteration before ResultMessage,
+                # or SDK not sending UserMessages for built-in tools), flush
+                # them now so the frontend stops showing spinners.
+                if adapter.has_unresolved_tool_calls:
+                    logger.warning(
+                        "[SDK] [%s] %d unresolved tool(s) after stream loop — "
+                        "flushing as safety net",
+                        session_id[:12],
+                        len(adapter.current_tool_calls)
+                        - len(adapter.resolved_tool_calls),
+                    )
+                    safety_responses: list[StreamBaseResponse] = []
+                    adapter._flush_unresolved_tool_calls(safety_responses)
+                    for response in safety_responses:
                        if isinstance(
                            response,
                            (StreamToolInputAvailable, StreamToolOutputAvailable),
                        ):
                            logger.info(
-                                "[SDK] Tool event: %s, tool=%s",
+                                "[SDK] [%s] Safety flush: %s, tool=%s",
+                                session_id[:12],
                                type(response).__name__,
                                getattr(response, "toolName", "N/A"),
                            )
-
                        yield response

-                        if isinstance(response, StreamTextDelta):
-                            delta = response.delta or ""
-                            # After tool results, start a new assistant
-                            # message for the post-tool text.
-                            if has_tool_results and has_appended_assistant:
-                                assistant_response = ChatMessage(
-                                    role="assistant", content=delta
-                                )
-                                accumulated_tool_calls = []
-                                has_appended_assistant = False
-                                has_tool_results = False
-                                session.messages.append(assistant_response)
-                                has_appended_assistant = True
-                            else:
-                                assistant_response.content = (
-                                    assistant_response.content or ""
-                                ) + delta
-                                if not has_appended_assistant:
-                                    session.messages.append(assistant_response)
-                                    has_appended_assistant = True
-
-                        elif isinstance(response, StreamToolInputAvailable):
-                            accumulated_tool_calls.append(
-                                {
-                                    "id": response.toolCallId,
-                                    "type": "function",
-                                    "function": {
-                                        "name": response.toolName,
-                                        "arguments": json.dumps(response.input or {}),
-                                    },
-                                }
-                            )
-                            assistant_response.tool_calls = accumulated_tool_calls
-                            if not has_appended_assistant:
-                                session.messages.append(assistant_response)
-                                has_appended_assistant = True
-
-                        elif isinstance(response, StreamToolOutputAvailable):
-                            session.messages.append(
-                                ChatMessage(
-                                    role="tool",
-                                    content=(
-                                        response.output
-                                        if isinstance(response.output, str)
-                                        else str(response.output)
-                                    ),
-                                    tool_call_id=response.toolCallId,
-                                )
-                            )
-                            has_tool_results = True
-
-                        elif isinstance(response, StreamFinish):
-                            stream_completed = True
+                # If the stream ended without a ResultMessage (no
+                # StreamFinish), the SDK CLI exited unexpectedly.  Close
+                # the open step and emit StreamFinish so the frontend
+                # transitions to the "ready" state.
+                if not stream_completed:
+                    logger.warning(
+                        "[SDK] [%s] Stream ended without ResultMessage "
+                        "(StopAsyncIteration) — emitting StreamFinish",
+                        session_id[:12],
+                    )
+                    if adapter.step_open:
+                        yield StreamFinishStep()
+                        adapter.step_open = False
+                    closing_responses: list[StreamBaseResponse] = []
+                    adapter._end_text_if_open(closing_responses)
+                    for r in closing_responses:
+                        yield r
+                    yield StreamFinish()
+                    stream_completed = True

                if (
                    assistant_response.content or assistant_response.tool_calls
@@ -857,12 +1060,20 @@ async def stream_chat_completion_sdk(
            )

        await asyncio.shield(upsert_chat_session(session))
-        logger.debug(
-            f"[SDK] Session {session_id} saved with {len(session.messages)} messages"
+        logger.info(
+            "[SDK] [%s] Session saved with %d messages",
+            session_id[:12],
+            len(session.messages),
        )
        if not stream_completed:
            yield StreamFinish()

+    except asyncio.CancelledError:
+        # Client disconnect / server shutdown — log but re-raise so
+        # the framework can clean up.  The finally block still runs
+        # for transcript upload.
+        logger.warning("[SDK] [%s] Session cancelled (CancelledError)", session_id[:12])
+        raise
    except Exception as e:
        logger.error(f"[SDK] Error: {e}", exc_info=True)
        try:
--- a/autogpt_platform/backend/backend/copilot/sdk/tool_adapter.py
+++ b/autogpt_platform/backend/backend/copilot/sdk/tool_adapter.py
@@ -9,6 +9,7 @@ via a callback provided by the service layer.  This avoids wasteful SDK polling
 and makes results survive page refreshes.
 """

+import asyncio
 import itertools
 import json
 import logging
@@ -44,6 +45,14 @@ _current_session: ContextVar[ChatSession | None] = ContextVar(
 _pending_tool_outputs: ContextVar[dict[str, list[str]]] = ContextVar(
    "pending_tool_outputs", default=None  # type: ignore[arg-type]
 )
+# Event signaled whenever stash_pending_tool_output() adds a new entry.
+# Used by the streaming loop to wait for PostToolUse hooks to complete
+# instead of sleeping an arbitrary duration.  The SDK fires hooks via
+# start_soon (fire-and-forget) so the next message can arrive before
+# the hook stashes its output — this event bridges that gap.
+_stash_event: ContextVar[asyncio.Event | None] = ContextVar(
+    "_stash_event", default=None
+)

 # Callback type for delegating long-running tools to the non-SDK infrastructure.
 # Args: (tool_name, arguments, session) → MCP-formatted response dict.
@@ -76,6 +85,7 @@ def set_execution_context(
    _current_user_id.set(user_id)
    _current_session.set(session)
    _pending_tool_outputs.set({})
+    _stash_event.set(asyncio.Event())
    _long_running_callback.set(long_running_callback)


@@ -134,6 +144,43 @@ def stash_pending_tool_output(tool_name: str, output: Any) -> None:
        except (TypeError, ValueError):
            text = str(output)
    pending.setdefault(tool_name, []).append(text)
+    # Signal any waiters that new output is available.
+    event = _stash_event.get(None)
+    if event is not None:
+        event.set()
+
+
+async def wait_for_stash(timeout: float = 0.5) -> bool:
+    """Wait for a PostToolUse hook to stash tool output.
+
+    The SDK fires PostToolUse hooks asynchronously via ``start_soon()`` —
+    the next message (AssistantMessage/ResultMessage) can arrive before the
+    hook completes and stashes its output.  This function bridges that gap
+    by waiting on the ``_stash_event``, which is signaled by
+    :func:`stash_pending_tool_output`.
+
+    After the event fires, callers should ``await asyncio.sleep(0)`` to
+    give any remaining concurrent hooks a chance to complete.
+
+    Returns ``True`` if a stash signal was received, ``False`` on timeout.
+    The timeout is a safety net — normally the stash happens within
+    microseconds of yielding to the event loop.
+    """
+    event = _stash_event.get(None)
+    if event is None:
+        return False
+    # Fast path: hook already completed before we got here.
+    if event.is_set():
+        event.clear()
+        return True
+    # Slow path: wait for the hook to signal.
+    try:
+        async with asyncio.timeout(timeout):
+            await event.wait()
+        event.clear()
+        return True
+    except TimeoutError:
+        return False


 async def _execute_tool_sync(
--- a/autogpt_platform/backend/poetry.lock
+++ b/autogpt_platform/backend/poetry.lock
@@ -899,17 +899,17 @@ files = [

 [[package]]
 name = "claude-agent-sdk"
-version = "0.1.35"
+version = "0.1.39"
 description = "Python SDK for Claude Code"
 optional = false
 python-versions = ">=3.10"
 groups = ["main"]
 files = [
-    {file = "claude_agent_sdk-0.1.35-py3-none-macosx_11_0_arm64.whl", hash = "sha256:df67f4deade77b16a9678b3a626c176498e40417f33b04beda9628287f375591"},
-    {file = "claude_agent_sdk-0.1.35-py3-none-manylinux_2_17_aarch64.whl", hash = "sha256:14963944f55ded7c8ed518feebfa5b4284aa6dd8d81aeff2e5b21a962ce65097"},
-    {file = "claude_agent_sdk-0.1.35-py3-none-manylinux_2_17_x86_64.whl", hash = "sha256:84344dcc535d179c1fc8a11c6f34c37c3b583447bdf09d869effb26514fd7a65"},
-    {file = "claude_agent_sdk-0.1.35-py3-none-win_amd64.whl", hash = "sha256:1b3d54b47448c93f6f372acd4d1757f047c3c1e8ef5804be7a1e3e53e2c79a5f"},
-    {file = "claude_agent_sdk-0.1.35.tar.gz", hash = "sha256:0f98e2b3c71ca85abfc042e7a35c648df88e87fda41c52e6779ef7b038dcbb52"},
+    {file = "claude_agent_sdk-0.1.39-py3-none-macosx_11_0_arm64.whl", hash = "sha256:6ed6a79781f545b761b9fe467bc5ae213a103c9d3f0fe7a9dad3c01790ed58fa"},
+    {file = "claude_agent_sdk-0.1.39-py3-none-manylinux_2_17_aarch64.whl", hash = "sha256:0c03b5a3772eaec42e29ea39240c7d24b760358082f2e36336db9e71dde3dda4"},
+    {file = "claude_agent_sdk-0.1.39-py3-none-manylinux_2_17_x86_64.whl", hash = "sha256:d2665c9e87b6ffece590bcdd6eb9def47cde4809b0d2f66e0a61a719189be7c9"},
+    {file = "claude_agent_sdk-0.1.39-py3-none-win_amd64.whl", hash = "sha256:d03324daf7076be79d2dd05944559aabf4cc11c98d3a574b992a442a7c7a26d6"},
+    {file = "claude_agent_sdk-0.1.39.tar.gz", hash = "sha256:dcf0ebd5a638c9a7d9f3af7640932a9212b2705b7056e4f08bd3968a865b4268"},
 ]

 [package.dependencies]
@@ -8530,4 +8530,4 @@ cffi = ["cffi (>=1.17,<2.0) ; platform_python_implementation != \"PyPy\" and pyt
 [metadata]
 lock-version = "2.1"
 python-versions = ">=3.10,<3.14"
-content-hash = "55e095de555482f0fe47de7695f390fe93e7bcf739b31c391b2e5e3c3d938ae3"
+content-hash = "3ef62836d8321b9a3b8e897dade8dc6ca9022fd9468c53f384b0871b521ab343"
--- a/autogpt_platform/backend/pyproject.toml
+++ b/autogpt_platform/backend/pyproject.toml
@@ -16,7 +16,7 @@ anthropic = "^0.79.0"
 apscheduler = "^3.11.1"
 autogpt-libs = { path = "../autogpt_libs", develop = true }
 bleach = { extras = ["css"], version = "^6.2.0" }
-claude-agent-sdk = "^0.1.0"
+claude-agent-sdk = "^0.1.39"  # see copilot/sdk/sdk_compat_test.py for capability checks
 click = "^8.2.0"
 cryptography = "^46.0"
 discord-py = "^2.5.2"
--- a/autogpt_platform/frontend/src/app/(platform)/copilot/helpers/convertChatSessionToUiMessages.ts
+++ b/autogpt_platform/frontend/src/app/(platform)/copilot/helpers/convertChatSessionToUiMessages.ts
@@ -58,6 +58,7 @@ function toToolInput(rawArguments: unknown): unknown {
 export function convertChatSessionMessagesToUiMessages(
  sessionId: string,
  rawMessages: unknown[],
+  options?: { isComplete?: boolean },
 ): UIMessage<unknown, UIDataTypes, UITools>[] {
  const messages = coerceSessionChatMessages(rawMessages);
  const toolOutputsByCallId = new Map<string, unknown>();
@@ -104,6 +105,16 @@ export function convertChatSessionMessagesToUiMessages(
            input,
            output: typeof output === "string" ? safeJsonParse(output) : output,
          });
+        } else if (options?.isComplete) {
+          // Session is complete (no active stream) but this tool call has
+          // no output in the DB — mark as completed to stop stale spinners.
+          parts.push({
+            type: `tool-${toolName}`,
+            toolCallId,
+            state: "output-available",
+            input,
+            output: "",
+          });
        } else {
          parts.push({
            type: `tool-${toolName}`,
--- a/autogpt_platform/frontend/src/app/(platform)/copilot/useChatSession.ts
+++ b/autogpt_platform/frontend/src/app/(platform)/copilot/useChatSession.ts
@@ -40,16 +40,6 @@ export function useChatSession() {
    }
  }, [sessionId, queryClient]);

-  // Memoize so the effect in useCopilotPage doesn't infinite-loop on a new
-  // array reference every render. Re-derives only when query data changes.
-  const hydratedMessages = useMemo(() => {
-    if (sessionQuery.data?.status !== 200 || !sessionId) return undefined;
-    return convertChatSessionMessagesToUiMessages(
-      sessionId,
-      sessionQuery.data.data.messages ?? [],
-    );
-  }, [sessionQuery.data, sessionId]);
-
  // Expose active_stream info so the caller can trigger manual resume
  // after hydration completes (rather than relying on AI SDK's built-in
  // resume which fires before hydration).
@@ -58,6 +48,19 @@ export function useChatSession() {
    return !!sessionQuery.data.data.active_stream;
  }, [sessionQuery.data]);

+  // Memoize so the effect in useCopilotPage doesn't infinite-loop on a new
+  // array reference every render. Re-derives only when query data changes.
+  // When the session is complete (no active stream), mark dangling tool
+  // calls as completed so stale spinners don't persist after refresh.
+  const hydratedMessages = useMemo(() => {
+    if (sessionQuery.data?.status !== 200 || !sessionId) return undefined;
+    return convertChatSessionMessagesToUiMessages(
+      sessionId,
+      sessionQuery.data.data.messages ?? [],
+      { isComplete: !hasActiveStream },
+    );
+  }, [sessionQuery.data, sessionId, hasActiveStream]);
+
  const { mutateAsync: createSessionMutation, isPending: isCreatingSession } =
    usePostV2CreateSession({
      mutation: {
--- a/autogpt_platform/frontend/src/app/(platform)/copilot/useCopilotPage.ts
+++ b/autogpt_platform/frontend/src/app/(platform)/copilot/useCopilotPage.ts
@@ -1,4 +1,5 @@
 import {
+  getGetV2GetSessionQueryKey,
  getGetV2ListSessionsQueryKey,
  postV2CancelSessionTask,
  useDeleteV2DeleteSession,
@@ -187,11 +188,35 @@ export function useCopilotPage() {
    });
  }, [hydratedMessages, setMessages, status]);

+  // Ref: tracks whether we've already resumed for a given session.
+  // Reset when the stream ends so re-resume is possible if the backend
+  // task is still running (SSE dropped but executor didn't finish).
+  const hasResumedRef = useRef<string | null>(null);
+
+  // When the stream ends (or drops), invalidate the session cache so the
+  // next hydration fetches fresh messages from the backend.  Without this,
+  // staleTime: Infinity means the cache keeps the pre-stream data forever,
+  // and any messages added during streaming are lost on remount/navigation.
+  const prevStatusRef = useRef(status);
+  useEffect(() => {
+    const prev = prevStatusRef.current;
+    prevStatusRef.current = status;
+
+    const wasActive = prev === "streaming" || prev === "submitted";
+    const isIdle = status === "ready" || status === "error";
+    if (wasActive && isIdle && sessionId) {
+      queryClient.invalidateQueries({
+        queryKey: getGetV2GetSessionQueryKey(sessionId),
+      });
+      // Allow re-resume if the backend task is still running.
+      hasResumedRef.current = null;
+    }
+  }, [status, sessionId, queryClient]);
+
  // Resume an active stream AFTER hydration completes.
  // The backend returns active_stream info when a task is still running.
  // We wait for hydration so the AI SDK has the conversation history
  // before the resumed stream appends the in-progress assistant message.
-  const hasResumedRef = useRef<string | null>(null);
  useEffect(() => {
    if (!hasActiveStream || !sessionId) return;
    if (!hydratedMessages || hydratedMessages.length === 0) return;
@@ -202,18 +227,6 @@ export function useCopilotPage() {
    resumeStream();
  }, [hasActiveStream, sessionId, hydratedMessages, status, resumeStream]);

-  // When the stream finishes, resolve any tool parts still showing spinners.
-  // This can happen if the backend didn't emit StreamToolOutputAvailable for
-  // a tool call before sending StreamFinish (e.g. SDK built-in tools).
-  const prevStatusRef = useRef(status);
-  useEffect(() => {
-    const prev = prevStatusRef.current;
-    prevStatusRef.current = status;
-    if (prev === "streaming" && status === "ready") {
-      setMessages((msgs) => resolveInProgressTools(msgs, "completed"));
-    }
-  }, [status, setMessages]);
-
  // Poll session endpoint when a long-running tool (create_agent, edit_agent)
  // is in progress. When the backend completes, the session data will contain
  // the final tool output — this hook detects the change and updates messages.
Author	SHA1	Message	Date
Zamil Majdy	9106f0b5ce	poetry lock	2026-02-20 15:11:46 +07:00
Zamil Majdy	37355f7581	fix(copilot): non-cancelling heartbeat, incremental saves, frontend reconnection - Replace asyncio.timeout() with asyncio.wait() for SDK message iteration to avoid corrupting the internal anyio stream on timeout (root cause of tool outputs getting stuck) - Add CancelledError handling + pending task cleanup in finally block - Fix _end_text_if_open([]) discarding StreamTextEnd events (Sentry bug) - Save session to DB after each tool input/output for cross-device recovery - Optimize incremental saves by passing existing_message_count to skip redundant DB count queries - Frontend: invalidate session cache + reset resume ref on stream end so SSE reconnection works after drops	2026-02-20 15:07:18 +07:00
Zamil Majdy	e1e3b6094e	poetry lock	2026-02-20 15:07:18 +07:00
Zamil Majdy	e18b3c561f	Merge branch 'dev' into fix/messed-up-copilot	2026-02-20 11:51:49 +05:30
Zamil Majdy	d937c6839a	fix(copilot): handle stream ending without text + PostToolUse logging When the SDK CLI exits without sending a ResultMessage (parallel tool execution), the frontend never gets StreamFinish and tools appear stuck. Now detect StopAsyncIteration and emit StreamFinish as a fallback. Also add INFO-level PostToolUse hook logging to trace whether the SDK fires hooks and stashes output for built-in tools like WebSearch.	2026-02-20 13:12:05 +07:00
Zamil Majdy	8c2363ea88	fix(copilot): add safety-net flush and diagnostic logging for parallel tools WebSearch/web_fetch parallel tool calls end with spinners resolving but no output shown, then the session ends with no text response at all. Add: - Safety-net flush after streaming loop for any unresolved tools - INFO-level logging for every SDK message (type, unresolved count) - UserMessage block detail logging to trace tool result delivery - Flush-called-but-empty logging to detect already-resolved-elsewhere	2026-02-20 13:07:42 +07:00
Zamil Majdy	a408b45542	fix(copilot): don't flush parallel tool calls prematurely The SDK sends parallel tool calls as separate AssistantMessages each containing only ToolUseBlocks. The flush logic treated each new AssistantMessage as a new turn and prematurely emitted empty output for prior tools, causing spinners to disappear and the stream to appear stuck. Skip flush and wait_for_stash when the incoming AssistantMessage is a parallel continuation (contains only ToolUseBlocks). Also prevent duplicate StreamToolOutputAvailable for already-resolved tool calls.	2026-02-20 11:43:44 +07:00
Zamil Majdy	3a38b5e9bd	fix(copilot): address review comments — wait_for_stash fast path, error marker, compat test - Add fast path in wait_for_stash: check event.is_set() before clearing to avoid unnecessary 0.5s timeout when PostToolUse hook completes before the streaming loop calls wait_for_stash - Tighten "failed" error marker to "failed to" in _is_tool_error_or_denial to avoid matching benign outputs like "3 tests failed" - Add max_buffer_size to SDK compat test fields_we_use	2026-02-20 11:18:49 +07:00
Zamil Majdy	3491365b45	poetry lock	2026-02-20 11:15:03 +07:00
Zamil Majdy	d3299cfd7f	fix(copilot): remove redundant resolveInProgressTools streaming→ready effect The backend wait_for_stash() + _flush_unresolved_tool_calls() already ensures all tool calls are resolved before StreamFinish. The useEffect that called resolveInProgressTools on streaming→ready was a frontend safety net for the same issue — no longer needed. Keep the function itself for stop() (user cancellation).	2026-02-20 10:52:32 +07:00
Zamil Majdy	9161090944	Merge branch 'dev' of github.com:Significant-Gravitas/AutoGPT into fix/messed-up-copilot Resolve conflict in service.py: - Take dev's _build_query_message() refactor - Restore _compress_conversation_history (dev's signature) - Keep _is_tool_error_or_denial (tested in dev, harmless) - Drop redundant inline query-building and approach logging	2026-02-20 10:48:20 +07:00
Zamil Majdy	372f9bff32	fix(copilot): address review comments — SDK compat test, output_len, error marker - Add sdk_compat_test.py (17 tests) verifying the claude-agent-sdk public API surface we depend on, replacing the need for a tight version pin. - Fix output_len logging: use len(str(...)) so dict outputs report serialized size, not key count. - Tighten "failed" error marker to "failed to" to avoid false positives on benign tool output like "3 tests failed out of 10".	2026-02-20 10:30:03 +07:00
Zamil Majdy	17995596db	poetry lock	2026-02-20 10:29:33 +07:00
Zamil Majdy	7acbbd0f05	poetry lock	2026-02-20 10:21:29 +07:00
Zamil Majdy	4be03fcc08	fix(copilot): remove redundant resolveInProgressTools frontend safety net The backend already resolves all tool calls via wait_for_stash + _flush_unresolved_tool_calls before StreamFinish, making the streaming→ready transition cleanup unnecessary. The isComplete hydration fix (for page refresh/crash recovery) is kept since it covers a genuinely different failure mode.	2026-02-20 10:12:33 +07:00
Zamil Majdy	eb7bd6bdae	fix(copilot): unify context-building logic for resume and non-resume paths Consolidates the two separate context-injection paths (gap detection for --resume, full compression for non-resume) into a single flow: determine messages → compress → format → prepend. Renames _compress_conversation_history to _compress_messages accepting a list directly.	2026-02-20 10:05:04 +07:00
Zamil Majdy	d81e7dd6c9	Merge branch 'dev' into fix/messed-up-copilot Resolve service.py conflicts: take dev's file-based transcript approach (CapturedTranscript.path + read_transcript_file) and public client API, layer our fixes on top (wait_for_stash race-condition fix, session_id logging, approach logging).	2026-02-20 09:54:07 +07:00
Zamil Majdy	78b52b956d	fix(copilot): address PR review comments — runtime check, SDK version pin, event-based stash - Replace bare `assert client._query` with proper RuntimeError check - Add TECH DEBT comments on private SDK internal usage - Pin claude-agent-sdk to ~0.1.35 (tighter constraint for private API access) - Replace sleep(0.1) with event-based wait_for_stash() for race-condition fix - Add wait_for_stash synchronisation tests	2026-02-20 09:46:19 +07:00
Zamil Majdy	e476185c3a	fix(copilot): mitigate SDK hook race condition and improve diagnostic logging - Add 100ms yield before flush when unresolved tool calls exist, giving PostToolUse hooks time to complete before the stash is checked. This mitigates the race condition in claude_agent_sdk where hooks are fire-and-forget (start_soon) while messages arrive immediately. - Add has_unresolved_tool_calls property to SDKResponseAdapter - Differentiate empty flush warnings to flag likely race conditions - Add session_id to all SDK log messages ([SDK] [<session>] ...) - Log session approach (resume/compression/single-turn) with context sizes - Elevate session save log from debug to info	2026-02-20 09:32:30 +07:00
Zamil Majdy	b1c5000937	fix(copilot): improve tool flush logging, transcript capture, and stale spinner safety nets - Elevate flush logging from debug to info/warning with structured messages showing tool names and IDs for production diagnostics - Capture raw SDK output for transcript instead of relying on Stop hook file path (CLI doesn't write JSONL in SDK mode) - Add _build_transcript() to reconstruct JSONL from captured entries - Add isComplete option to hydration conversion — marks dangling tool calls as completed when session has no active stream (fixes stale spinners on page refresh) - Add resolveInProgressTools safety net on streaming→ready transition (catches tool parts the backend didn't emit output for) - Add 3 new tests for flush mechanism (ResultMessage, AssistantMessage, stashed output)	2026-02-20 08:56:56 +07:00
Zamil Majdy	7ee870ed70	fix(copilot): catch OSError in sandbox killpg to prevent zombie processes Catch OSError broadly (not just ProcessLookupError) when calling os.killpg so that EPERM or other errors don't skip the subsequent await proc.communicate(), which would leave the subprocess un-reaped.	2026-02-20 02:49:51 +07:00
Zamil Majdy	240e403592	fix(copilot): fix transcript validation and resume test resilience - Replace brittle line-count check (< 3) in read_transcript_file with proper validate_transcript() which checks for actual user/assistant entries — avoids rejecting valid short transcripts while still filtering metadata-only files - Add debug logging for transcript source selection and fallback path to aid diagnosing resume issues in Docker - Make test_sdk_resume_multi_turn skip gracefully when the CLI doesn't produce usable transcripts (environment-dependent: CLI version, platform) instead of hard-failing	2026-02-20 02:47:23 +07:00
Zamil Majdy	c3e94f7d9c	fix(copilot): address review comments — counter order, error markers, tests - Move task_spawn_count increment after limit check so denied spawns don't consume a slot (greptile feedback) - Add "failed" marker to _is_tool_error_or_denial to catch internal tool execution failures from _mcp_error (coderabbit feedback) - Add 17 unit tests for _is_tool_error_or_denial covering all markers, denial messages, and false-positive scenarios	2026-02-20 02:29:51 +07:00
Zamil Majdy	a0a040f102	fix(copilot): sandbox kill, tool event logging, and background task UX - Fix sandbox process kill: use start_new_session + os.killpg to kill the entire bwrap process group on timeout (proc.kill alone only kills the parent, leaving children running until natural completion) - Add StreamToolInputAvailable/StreamToolOutputAvailable to publish_chunk logging filter so tool events are visible in Docker logs - Add system prompt instruction telling Claude not to use run_in_background on Task tool (gets denied by security hooks) - Add tool event debug logging in SDK streaming loop for tracing tool execution visibility issues	2026-02-20 02:24:31 +07:00
Zamil Majdy	23225aa323	fix(copilot): address review comments — slot counting, heartbeat, error detection - Move run_in_background check before task_spawn_count increment so denied background Tasks don't consume a subtask slot - Replace asyncio.wait_for() with asyncio.timeout() for heartbeat loop to avoid leaving the async generator in a broken state - Tighten _is_tool_error_or_denial: remove overly broad markers ("error", "failed", "not found") that cause false positives; add markers for actual denial messages ("not supported", "maximum")	2026-02-20 01:19:33 +07:00
Zamil Majdy	fed645cb79	Merge branch 'dev' of github.com:Significant-Gravitas/AutoGPT into fix/messed-up-copilot	2026-02-20 01:14:46 +07:00
Zamil Majdy	009753f2b3	fix(copilot): prevent background agent stalls and context hallucination - Block Task tool's run_in_background param in security hooks — background agents stall the SSE stream and get killed when the main agent exits - Add heartbeats (15s interval) to SDK streaming loop so proxies/LBs don't close idle SSE connections during long tool execution - Fix summarization prompt that forced LLM to fabricate content for all 9 mandatory sections; now sections are optional and hallucination is explicitly prohibited - Include tool error/denial outcomes in conversation context formatting — previously all tool messages were dropped, so the agent couldn't see that security denials blocked its file writes and hallucinated success	2026-02-19 23:28:22 +07:00