fix(copilot): resolve dangling tool spinners when stream finishes

When the backend doesn't emit StreamToolOutputAvailable for all tool calls before StreamFinish (e.g. SDK built-in tools like WebSearch), the frontend spinners would spin forever. Add a useEffect that watches for the streaming→ready transition and marks any remaining input-available/input-streaming tool parts as output-available. Extract shared resolveInProgressTools helper used by both the stop handler (cancelled) and stream-end (completed).
style: fix Black formatting on cancel endpoint
2026-02-24 03:00:28 -05:00 · 2026-02-20 03:48:20 +07:00 · 2026-02-20 02:46:03 +07:00 · 2026-02-20 02:31:18 +07:00 · 2026-02-20 02:25:43 +07:00 · 2026-02-20 02:21:28 +07:00
47 changed files with 2490 additions and 1059 deletions
--- a/autogpt_platform/backend/backend/api/features/chat/routes.py
+++ b/autogpt_platform/backend/backend/api/features/chat/routes.py
@@ -18,7 +18,7 @@ from backend.copilot.completion_handler import (
    process_operation_success,
 )
 from backend.copilot.config import ChatConfig
-from backend.copilot.executor.utils import enqueue_copilot_task
+from backend.copilot.executor.utils import enqueue_cancel_task, enqueue_copilot_task
 from backend.copilot.model import (
    ChatMessage,
    ChatSession,
@@ -50,6 +50,7 @@ from backend.copilot.tools.models import (
    OperationPendingResponse,
    OperationStartedResponse,
    SetupRequirementsResponse,
+    SuggestedGoalResponse,
    UnderstandingUpdatedResponse,
 )
 from backend.copilot.tracking import track_user_message
@@ -131,6 +132,14 @@ class ListSessionsResponse(BaseModel):
    total: int


+class CancelTaskResponse(BaseModel):
+    """Response model for the cancel task endpoint."""
+
+    cancelled: bool
+    task_id: str | None = None
+    reason: str | None = None
+
+
 class OperationCompleteRequest(BaseModel):
    """Request model for external completion webhook."""

@@ -313,6 +322,57 @@ async def get_session(
    )


+@router.post(
+    "/sessions/{session_id}/cancel",
+    status_code=200,
+)
+async def cancel_session_task(
+    session_id: str,
+    user_id: Annotated[str | None, Depends(auth.get_user_id)],
+) -> CancelTaskResponse:
+    """Cancel the active streaming task for a session.
+
+    Publishes a cancel event to the executor via RabbitMQ FANOUT, then
+    polls Redis until the task status flips from ``running`` or a timeout
+    (5 s) is reached.  Returns only after the cancellation is confirmed.
+    """
+    await _validate_and_get_session(session_id, user_id)
+
+    active_task, _ = await stream_registry.get_active_task_for_session(
+        session_id, user_id
+    )
+    if not active_task:
+        return CancelTaskResponse(cancelled=False, reason="no_active_task")
+
+    task_id = active_task.task_id
+    await enqueue_cancel_task(task_id)
+    logger.info(
+        f"[CANCEL] Published cancel for task ...{task_id[-8:]} "
+        f"session ...{session_id[-8:]}"
+    )
+
+    # Poll until the executor confirms the task is no longer running.
+    # Keep max_wait below typical reverse-proxy read timeouts.
+    poll_interval = 0.5
+    max_wait = 5.0
+    waited = 0.0
+    while waited < max_wait:
+        await asyncio.sleep(poll_interval)
+        waited += poll_interval
+        task = await stream_registry.get_task(task_id)
+        if task is None or task.status != "running":
+            logger.info(
+                f"[CANCEL] Task ...{task_id[-8:]} confirmed stopped "
+                f"(status={task.status if task else 'gone'}) after {waited:.1f}s"
+            )
+            return CancelTaskResponse(cancelled=True, task_id=task_id)
+
+    logger.warning(f"[CANCEL] Task ...{task_id[-8:]} not confirmed after {max_wait}s")
+    return CancelTaskResponse(
+        cancelled=True, task_id=task_id, reason="cancel_published_not_confirmed"
+    )
+
+
@router.post(
    "/sessions/{session_id}/stream",
 )
@@ -984,6 +1044,7 @@ ToolResponseUnion = (
    | AgentPreviewResponse
    | AgentSavedResponse
    | ClarificationNeededResponse
+    | SuggestedGoalResponse
    | BlockListResponse
    | BlockDetailsResponse
    | BlockOutputResponse
--- a/autogpt_platform/backend/backend/copilot/executor/manager.py
+++ b/autogpt_platform/backend/backend/copilot/executor/manager.py
@@ -4,7 +4,6 @@ This module contains the CoPilotExecutor class that consumes chat tasks from
 RabbitMQ and processes them using a thread pool, following the graph executor pattern.
 """

-import asyncio
 import logging
 import os
 import threading
--- a/autogpt_platform/backend/backend/copilot/executor/utils.py
+++ b/autogpt_platform/backend/backend/copilot/executor/utils.py
@@ -205,3 +205,20 @@ async def enqueue_copilot_task(
        message=entry.model_dump_json(),
        exchange=COPILOT_EXECUTION_EXCHANGE,
    )
+
+
+async def enqueue_cancel_task(task_id: str) -> None:
+    """Publish a cancel request for a running CoPilot task.
+
+    Sends a ``CancelCoPilotEvent`` to the FANOUT exchange so all executor
+    pods receive the cancellation signal.
+    """
+    from backend.util.clients import get_async_copilot_queue
+
+    event = CancelCoPilotEvent(task_id=task_id)
+    queue_client = await get_async_copilot_queue()
+    await queue_client.publish_message(
+        routing_key="",  # FANOUT ignores routing key
+        message=event.model_dump_json(),
+        exchange=COPILOT_CANCEL_EXCHANGE,
+    )
--- a/autogpt_platform/backend/backend/copilot/parallel_tool_calls_test.py
+++ b/autogpt_platform/backend/backend/copilot/parallel_tool_calls_test.py
@@ -0,0 +1,272 @@
+"""Tests for parallel tool call execution in CoPilot.
+
+These tests mock _yield_tool_call to avoid importing the full copilot stack
+which requires Prisma, DB connections, etc.
+"""
+
+import asyncio
+import time
+from typing import Any, cast
+
+import pytest
+
+
+@pytest.mark.asyncio
+async def test_parallel_tool_calls_run_concurrently():
+    """Multiple tool calls should complete in ~max(delays), not sum(delays)."""
+    # Import here to allow module-level mocking if needed
+    from backend.copilot.response_model import (
+        StreamToolInputAvailable,
+        StreamToolOutputAvailable,
+    )
+    from backend.copilot.service import _execute_tool_calls_parallel
+
+    n_tools = 3
+    delay_per_tool = 0.2
+    tool_calls = [
+        {
+            "id": f"call_{i}",
+            "type": "function",
+            "function": {"name": f"tool_{i}", "arguments": "{}"},
+        }
+        for i in range(n_tools)
+    ]
+
+    # Minimal session mock
+    class FakeSession:
+        session_id = "test"
+        user_id = "test"
+
+        def __init__(self):
+            self.messages = []
+
+    original_yield = None
+
+    async def fake_yield(tc_list, idx, sess, lock=None):
+        yield StreamToolInputAvailable(
+            toolCallId=tc_list[idx]["id"],
+            toolName=tc_list[idx]["function"]["name"],
+            input={},
+        )
+        await asyncio.sleep(delay_per_tool)
+        yield StreamToolOutputAvailable(
+            toolCallId=tc_list[idx]["id"],
+            toolName=tc_list[idx]["function"]["name"],
+            output="{}",
+        )
+
+    import backend.copilot.service as svc
+
+    original_yield = svc._yield_tool_call
+    svc._yield_tool_call = fake_yield
+    try:
+        start = time.monotonic()
+        events = []
+        async for event in _execute_tool_calls_parallel(
+            tool_calls, cast(Any, FakeSession())
+        ):
+            events.append(event)
+        elapsed = time.monotonic() - start
+    finally:
+        svc._yield_tool_call = original_yield
+
+    assert len(events) == n_tools * 2
+    # Parallel: should take ~delay, not ~n*delay
+    assert elapsed < delay_per_tool * (
+        n_tools - 0.5
+    ), f"Took {elapsed:.2f}s, expected parallel (~{delay_per_tool}s)"
+
+
+@pytest.mark.asyncio
+async def test_single_tool_call_works():
+    """Single tool call should work identically."""
+    from backend.copilot.response_model import (
+        StreamToolInputAvailable,
+        StreamToolOutputAvailable,
+    )
+    from backend.copilot.service import _execute_tool_calls_parallel
+
+    tool_calls = [
+        {
+            "id": "call_0",
+            "type": "function",
+            "function": {"name": "t", "arguments": "{}"},
+        }
+    ]
+
+    class FakeSession:
+        session_id = "test"
+        user_id = "test"
+
+        def __init__(self):
+            self.messages = []
+
+    async def fake_yield(tc_list, idx, sess, lock=None):
+        yield StreamToolInputAvailable(toolCallId="call_0", toolName="t", input={})
+        yield StreamToolOutputAvailable(toolCallId="call_0", toolName="t", output="{}")
+
+    import backend.copilot.service as svc
+
+    orig = svc._yield_tool_call
+    svc._yield_tool_call = fake_yield
+    try:
+        events = [
+            e
+            async for e in _execute_tool_calls_parallel(
+                tool_calls, cast(Any, FakeSession())
+            )
+        ]
+    finally:
+        svc._yield_tool_call = orig
+
+    assert len(events) == 2
+
+
+@pytest.mark.asyncio
+async def test_retryable_error_propagates():
+    """Retryable errors should be raised after all tools finish."""
+    from backend.copilot.response_model import StreamToolOutputAvailable
+    from backend.copilot.service import _execute_tool_calls_parallel
+
+    tool_calls = [
+        {
+            "id": f"call_{i}",
+            "type": "function",
+            "function": {"name": f"t_{i}", "arguments": "{}"},
+        }
+        for i in range(2)
+    ]
+
+    class FakeSession:
+        session_id = "test"
+        user_id = "test"
+
+        def __init__(self):
+            self.messages = []
+
+    async def fake_yield(tc_list, idx, sess, lock=None):
+        if idx == 1:
+            raise KeyError("bad")
+        from backend.copilot.response_model import StreamToolInputAvailable
+
+        yield StreamToolInputAvailable(
+            toolCallId=tc_list[idx]["id"], toolName="t_0", input={}
+        )
+        await asyncio.sleep(0.05)
+        yield StreamToolOutputAvailable(
+            toolCallId=tc_list[idx]["id"], toolName="t_0", output="{}"
+        )
+
+    import backend.copilot.service as svc
+
+    orig = svc._yield_tool_call
+    svc._yield_tool_call = fake_yield
+    try:
+        events = []
+        with pytest.raises(KeyError):
+            async for event in _execute_tool_calls_parallel(
+                tool_calls, cast(Any, FakeSession())
+            ):
+                events.append(event)
+        # First tool's events should still be yielded
+        assert any(isinstance(e, StreamToolOutputAvailable) for e in events)
+    finally:
+        svc._yield_tool_call = orig
+
+
+@pytest.mark.asyncio
+async def test_session_lock_shared():
+    """All parallel tools should receive the same lock instance."""
+    from backend.copilot.response_model import (
+        StreamToolInputAvailable,
+        StreamToolOutputAvailable,
+    )
+    from backend.copilot.service import _execute_tool_calls_parallel
+
+    tool_calls = [
+        {
+            "id": f"call_{i}",
+            "type": "function",
+            "function": {"name": f"t_{i}", "arguments": "{}"},
+        }
+        for i in range(3)
+    ]
+
+    class FakeSession:
+        session_id = "test"
+        user_id = "test"
+
+        def __init__(self):
+            self.messages = []
+
+    observed_locks = []
+
+    async def fake_yield(tc_list, idx, sess, lock=None):
+        observed_locks.append(lock)
+        yield StreamToolInputAvailable(
+            toolCallId=tc_list[idx]["id"], toolName=f"t_{idx}", input={}
+        )
+        yield StreamToolOutputAvailable(
+            toolCallId=tc_list[idx]["id"], toolName=f"t_{idx}", output="{}"
+        )
+
+    import backend.copilot.service as svc
+
+    orig = svc._yield_tool_call
+    svc._yield_tool_call = fake_yield
+    try:
+        async for _ in _execute_tool_calls_parallel(
+            tool_calls, cast(Any, FakeSession())
+        ):
+            pass
+    finally:
+        svc._yield_tool_call = orig
+
+    assert len(observed_locks) == 3
+    assert observed_locks[0] is observed_locks[1] is observed_locks[2]
+    assert isinstance(observed_locks[0], asyncio.Lock)
+
+
+@pytest.mark.asyncio
+async def test_cancellation_cleans_up():
+    """Generator close should cancel in-flight tasks."""
+    from backend.copilot.response_model import StreamToolInputAvailable
+    from backend.copilot.service import _execute_tool_calls_parallel
+
+    tool_calls = [
+        {
+            "id": f"call_{i}",
+            "type": "function",
+            "function": {"name": f"t_{i}", "arguments": "{}"},
+        }
+        for i in range(2)
+    ]
+
+    class FakeSession:
+        session_id = "test"
+        user_id = "test"
+
+        def __init__(self):
+            self.messages = []
+
+    started = asyncio.Event()
+
+    async def fake_yield(tc_list, idx, sess, lock=None):
+        yield StreamToolInputAvailable(
+            toolCallId=tc_list[idx]["id"], toolName=f"t_{idx}", input={}
+        )
+        started.set()
+        await asyncio.sleep(10)  # simulate long-running
+
+    import backend.copilot.service as svc
+
+    orig = svc._yield_tool_call
+    svc._yield_tool_call = fake_yield
+    try:
+        gen = _execute_tool_calls_parallel(tool_calls, cast(Any, FakeSession()))
+        await gen.__anext__()  # get first event
+        await started.wait()
+        await gen.aclose()  # close generator
+    finally:
+        svc._yield_tool_call = orig
+    # If we get here without hanging, cleanup worked
--- a/autogpt_platform/backend/backend/copilot/sdk/response_adapter.py
+++ b/autogpt_platform/backend/backend/copilot/sdk/response_adapter.py
@@ -53,6 +53,7 @@ class SDKResponseAdapter:
        self.has_started_text = False
        self.has_ended_text = False
        self.current_tool_calls: dict[str, dict[str, str]] = {}
+        self.resolved_tool_calls: set[str] = set()
        self.task_id: str | None = None
        self.step_open = False

@@ -74,6 +75,10 @@ class SDKResponseAdapter:
                self.step_open = True

        elif isinstance(sdk_message, AssistantMessage):
+            # Flush any SDK built-in tool calls that didn't get a UserMessage
+            # result (e.g. WebSearch, Read handled internally by the CLI).
+            self._flush_unresolved_tool_calls(responses)
+
            # After tool results, the SDK sends a new AssistantMessage for the
            # next LLM turn. Open a new step if the previous one was closed.
            if not self.step_open:
@@ -111,6 +116,8 @@ class SDKResponseAdapter:
            # UserMessage carries tool results back from tool execution.
            content = sdk_message.content
            blocks = content if isinstance(content, list) else []
+            resolved_in_blocks: set[str] = set()
+
            for block in blocks:
                if isinstance(block, ToolResultBlock) and block.tool_use_id:
                    tool_info = self.current_tool_calls.get(block.tool_use_id, {})
@@ -132,6 +139,37 @@ class SDKResponseAdapter:
                            success=not (block.is_error or False),
                        )
                    )
+                    resolved_in_blocks.add(block.tool_use_id)
+
+            # Handle SDK built-in tool results carried via parent_tool_use_id
+            # instead of (or in addition to) ToolResultBlock content.
+            parent_id = sdk_message.parent_tool_use_id
+            if parent_id and parent_id not in resolved_in_blocks:
+                tool_info = self.current_tool_calls.get(parent_id, {})
+                tool_name = tool_info.get("name", "unknown")
+
+                # Try stashed output first (from PostToolUse hook),
+                # then tool_use_result dict, then string content.
+                output = pop_pending_tool_output(tool_name)
+                if not output:
+                    tur = sdk_message.tool_use_result
+                    if tur is not None:
+                        output = _extract_tool_use_result(tur)
+                if not output and isinstance(content, str) and content.strip():
+                    output = content.strip()
+
+                if output:
+                    responses.append(
+                        StreamToolOutputAvailable(
+                            toolCallId=parent_id,
+                            toolName=tool_name,
+                            output=output,
+                            success=True,
+                        )
+                    )
+                    resolved_in_blocks.add(parent_id)
+
+            self.resolved_tool_calls.update(resolved_in_blocks)

            # Close the current step after tool results — the next
            # AssistantMessage will open a new step for the continuation.
@@ -140,6 +178,7 @@ class SDKResponseAdapter:
                self.step_open = False

        elif isinstance(sdk_message, ResultMessage):
+            self._flush_unresolved_tool_calls(responses)
            self._end_text_if_open(responses)
            # Close the step before finishing.
            if self.step_open:
@@ -149,7 +188,7 @@ class SDKResponseAdapter:
            if sdk_message.subtype == "success":
                responses.append(StreamFinish())
            elif sdk_message.subtype in ("error", "error_during_execution"):
-                error_msg = getattr(sdk_message, "result", None) or "Unknown error"
+                error_msg = sdk_message.result or "Unknown error"
                responses.append(
                    StreamError(errorText=str(error_msg), code="sdk_error")
                )
@@ -180,6 +219,59 @@ class SDKResponseAdapter:
            responses.append(StreamTextEnd(id=self.text_block_id))
            self.has_ended_text = True

+    def _flush_unresolved_tool_calls(self, responses: list[StreamBaseResponse]) -> None:
+        """Emit outputs for tool calls that didn't receive a UserMessage result.
+
+        SDK built-in tools (WebSearch, Read, etc.) may be executed by the CLI
+        internally without surfacing a separate ``UserMessage`` with
+        ``ToolResultBlock`` content.  The ``PostToolUse`` hook stashes their
+        output, which we pop and emit here before the next ``AssistantMessage``
+        starts.
+        """
+        flushed = False
+        for tool_id, tool_info in self.current_tool_calls.items():
+            if tool_id in self.resolved_tool_calls:
+                continue
+            tool_name = tool_info.get("name", "unknown")
+            output = pop_pending_tool_output(tool_name)
+            if output is not None:
+                responses.append(
+                    StreamToolOutputAvailable(
+                        toolCallId=tool_id,
+                        toolName=tool_name,
+                        output=output,
+                        success=True,
+                    )
+                )
+                self.resolved_tool_calls.add(tool_id)
+                flushed = True
+                logger.debug(
+                    f"Flushed pending output for built-in tool {tool_name} "
+                    f"(call {tool_id})"
+                )
+            else:
+                # No output available — emit an empty output so the frontend
+                # transitions the tool from input-available to output-available
+                # (stops the spinner).
+                responses.append(
+                    StreamToolOutputAvailable(
+                        toolCallId=tool_id,
+                        toolName=tool_name,
+                        output="",
+                        success=True,
+                    )
+                )
+                self.resolved_tool_calls.add(tool_id)
+                flushed = True
+                logger.debug(
+                    f"Flushed empty output for unresolved tool {tool_name} "
+                    f"(call {tool_id})"
+                )
+
+        if flushed and self.step_open:
+            responses.append(StreamFinishStep())
+            self.step_open = False
+

 def _extract_tool_output(content: str | list[dict[str, str]] | None) -> str:
    """Extract a string output from a ToolResultBlock's content field."""
@@ -199,3 +291,30 @@ def _extract_tool_output(content: str | list[dict[str, str]] | None) -> str:
        return json.dumps(content)
    except (TypeError, ValueError):
        return str(content)
+
+
+def _extract_tool_use_result(result: object) -> str:
+    """Extract a string from a UserMessage's ``tool_use_result`` dict.
+
+    SDK built-in tools may store their result in ``tool_use_result``
+    instead of (or in addition to) ``ToolResultBlock`` content blocks.
+    """
+    if isinstance(result, str):
+        return result
+    if isinstance(result, dict):
+        # Try common result keys
+        for key in ("content", "text", "output", "stdout", "result"):
+            val = result.get(key)
+            if isinstance(val, str) and val:
+                return val
+        # Fall back to JSON serialization of the whole dict
+        try:
+            return json.dumps(result)
+        except (TypeError, ValueError):
+            return str(result)
+    if result is None:
+        return ""
+    try:
+        return json.dumps(result)
+    except (TypeError, ValueError):
+        return str(result)
--- a/autogpt_platform/backend/backend/copilot/sdk/security_hooks.py
+++ b/autogpt_platform/backend/backend/copilot/sdk/security_hooks.py
@@ -16,6 +16,7 @@ from .tool_adapter import (
    DANGEROUS_PATTERNS,
    MCP_TOOL_PREFIX,
    WORKSPACE_SCOPED_TOOLS,
+    stash_pending_tool_output,
 )

 logger = logging.getLogger(__name__)
@@ -224,10 +225,25 @@ def create_security_hooks(
            tool_use_id: str | None,
            context: HookContext,
        ) -> SyncHookJSONOutput:
-            """Log successful tool executions for observability."""
+            """Log successful tool executions and stash SDK built-in tool outputs.
+
+            MCP tools stash their output in ``_execute_tool_sync`` before the
+            SDK can truncate it.  SDK built-in tools (WebSearch, Read, etc.)
+            are executed by the CLI internally — this hook captures their
+            output so the response adapter can forward it to the frontend.
+            """
            _ = context
            tool_name = cast(str, input_data.get("tool_name", ""))
            logger.debug(f"[SDK] Tool success: {tool_name}, tool_use_id={tool_use_id}")
+
+            # Stash output for SDK built-in tools so the response adapter can
+            # emit StreamToolOutputAvailable even when the CLI doesn't surface
+            # a separate UserMessage with ToolResultBlock content.
+            if not tool_name.startswith(MCP_TOOL_PREFIX):
+                tool_response = input_data.get("tool_response")
+                if tool_response is not None:
+                    stash_pending_tool_output(tool_name, tool_response)
+
            return cast(SyncHookJSONOutput, {})

        async def post_tool_failure_hook(
--- a/autogpt_platform/backend/backend/copilot/sdk/service.py
+++ b/autogpt_platform/backend/backend/copilot/sdk/service.py
@@ -47,6 +47,7 @@ from .tool_adapter import (
    set_execution_context,
 )
 from .transcript import (
+    cleanup_cli_project_dir,
    download_transcript,
    read_transcript_file,
    upload_transcript,
@@ -86,9 +87,12 @@ _SDK_TOOL_SUPPLEMENT = """
  for shell commands — it runs in a network-isolated sandbox.
 - **Shared workspace**: The SDK Read/Write tools and `bash_exec` share the
  same working directory. Files created by one are readable by the other.
-  These files are **ephemeral** — they exist only for the current session.
- **Persistent storage**: Use `write_workspace_file` / `read_workspace_file`
-  for files that should persist across sessions (stored in cloud storage).
+- **IMPORTANT — File persistence**: Your working directory is **ephemeral** —
+  files are lost between turns. When you create or modify important files
+  (code, configs, outputs), you MUST save them using `write_workspace_file`
+  so they persist. Use `read_workspace_file` and `list_workspace_files` to
+  access files saved in previous turns. If a "Files from previous turns"
+  section is present above, those files are available via `read_workspace_file`.
 - Long-running tools (create_agent, edit_agent, etc.) are handled
  asynchronously.  You will receive an immediate response; the actual result
  is delivered to the user via a background stream.
@@ -268,48 +272,28 @@ def _make_sdk_cwd(session_id: str) -> str:


 def _cleanup_sdk_tool_results(cwd: str) -> None:
-    """Remove SDK tool-result files for a specific session working directory.
+    """Remove SDK session artifacts for a specific working directory.

-    The SDK creates tool-result files under ~/.claude/projects/<encoded-cwd>/tool-results/.
-    We clean only the specific cwd's results to avoid race conditions between
-    concurrent sessions.
+    Cleans up:
+    - ``~/.claude/projects/<encoded-cwd>/`` — CLI session transcripts and
+      tool-result files.  Each SDK turn uses a unique cwd, so this directory
+      is safe to remove entirely.
+    - ``/tmp/copilot-<session>/`` — the ephemeral working directory.

-    Security: cwd MUST be created by _make_sdk_cwd() which sanitizes session_id.
+    Security: *cwd* MUST be created by ``_make_sdk_cwd()`` which sanitizes
+    the session_id.
    """
    import shutil

-    # Validate cwd is under the expected prefix
    normalized = os.path.normpath(cwd)
    if not normalized.startswith(_SDK_CWD_PREFIX):
        logger.warning(f"[SDK] Rejecting cleanup for path outside workspace: {cwd}")
        return

-    # SDK encodes the cwd path by replacing '/' with '-'
-    encoded_cwd = normalized.replace("/", "-")
+    # Clean the CLI's project directory (transcripts + tool-results).
+    cleanup_cli_project_dir(cwd)

-    # Construct the project directory path (known-safe home expansion)
-    claude_projects = os.path.expanduser("~/.claude/projects")
-    project_dir = os.path.join(claude_projects, encoded_cwd)
-
-    # Security check 3: Validate project_dir is under ~/.claude/projects
-    project_dir = os.path.normpath(project_dir)
-    if not project_dir.startswith(claude_projects):
-        logger.warning(
-            f"[SDK] Rejecting cleanup for escaped project path: {project_dir}"
-        )
-        return
-
-    results_dir = os.path.join(project_dir, "tool-results")
-    if os.path.isdir(results_dir):
-        for filename in os.listdir(results_dir):
-            file_path = os.path.join(results_dir, filename)
-            try:
-                if os.path.isfile(file_path):
-                    os.remove(file_path)
-            except OSError:
-                pass
-
-    # Also clean up the temp cwd directory itself
+    # Clean up the temp cwd directory itself.
    try:
        shutil.rmtree(normalized, ignore_errors=True)
    except OSError:
@@ -519,6 +503,7 @@ async def stream_chat_completion_sdk(
            def _on_stop(transcript_path: str, sdk_session_id: str) -> None:
                captured_transcript.path = transcript_path
                captured_transcript.sdk_session_id = sdk_session_id
+                logger.debug(f"[SDK] Stop hook: path={transcript_path!r}")

            security_hooks = create_security_hooks(
                user_id,
@@ -530,18 +515,20 @@ async def stream_chat_completion_sdk(
            # --- Resume strategy: download transcript from bucket ---
            resume_file: str | None = None
            use_resume = False
+            transcript_msg_count = 0  # watermark: session.messages length at upload

            if config.claude_agent_use_resume and user_id and len(session.messages) > 1:
-                transcript_content = await download_transcript(user_id, session_id)
-                if transcript_content and validate_transcript(transcript_content):
+                dl = await download_transcript(user_id, session_id)
+                if dl and validate_transcript(dl.content):
                    resume_file = write_transcript_to_tempfile(
-                        transcript_content, session_id, sdk_cwd
+                        dl.content, session_id, sdk_cwd
                    )
                    if resume_file:
                        use_resume = True
-                        logger.info(
-                            f"[SDK] Using --resume with transcript "
-                            f"({len(transcript_content)} bytes)"
+                        transcript_msg_count = dl.message_count
+                        logger.debug(
+                            f"[SDK] Using --resume ({len(dl.content)}B, "
+                            f"msg_count={transcript_msg_count})"
                        )

            sdk_options_kwargs: dict[str, Any] = {
@@ -582,11 +569,38 @@ async def stream_chat_completion_sdk(
                # Build query: with --resume the CLI already has full
                # context, so we only send the new message.  Without
                # resume, compress history into a context prefix.
+                #
+                # Hybrid mode: if the transcript is stale (upload missed
+                # some turns), compress only the gap and prepend it so
+                # the agent has transcript context + missed turns.
                query_message = current_message
-                if not use_resume and len(session.messages) > 1:
+                current_msg_count = len(session.messages)
+
+                if use_resume and transcript_msg_count > 0:
+                    # Transcript covers messages[0..M-1].  Current session
+                    # has N messages (last one is the new user msg).
+                    # Gap = messages[M .. N-2] (everything between upload
+                    # and the current turn).
+                    # When transcript_msg_count == 0 (no metadata), we trust
+                    # the transcript is up-to-date and skip gap detection to
+                    # avoid duplicating the full history.
+                    if transcript_msg_count < current_msg_count - 1:
+                        gap = session.messages[transcript_msg_count:-1]
+                        gap_context = _format_conversation_context(gap)
+                        if gap_context:
+                            logger.info(
+                                f"[SDK] Transcript stale: covers {transcript_msg_count} "
+                                f"of {current_msg_count} messages, compressing "
+                                f"{len(gap)} missed messages"
+                            )
+                            query_message = (
+                                f"{gap_context}\n\n"
+                                f"Now, the user says:\n{current_message}"
+                            )
+                elif not use_resume and current_msg_count > 1:
                    logger.warning(
                        f"[SDK] Using compression fallback for session "
-                        f"{session_id} ({len(session.messages)} messages) — "
+                        f"{session_id} ({current_msg_count} messages) — "
                        f"no transcript available for --resume"
                    )
                    compressed = await _compress_conversation_history(session)
@@ -598,9 +612,9 @@ async def stream_chat_completion_sdk(
                        )

                logger.info(
-                    f"[SDK] Sending query ({len(session.messages)} msgs in session)"
+                    f"[SDK] Sending query ({len(session.messages)} msgs, "
+                    f"resume={use_resume})"
                )
-                logger.debug(f"[SDK] Query preview: {current_message[:80]!r}")
                await client.query(query_message, session_id=session_id)

                assistant_response = ChatMessage(role="assistant", content="")
@@ -681,29 +695,33 @@ async def stream_chat_completion_sdk(
                ) and not has_appended_assistant:
                    session.messages.append(assistant_response)

-                # --- Capture transcript while CLI is still alive ---
-                # Must happen INSIDE async with: close() sends SIGTERM
-                # which kills the CLI before it can flush the JSONL.
-                if (
-                    config.claude_agent_use_resume
-                    and user_id
-                    and captured_transcript.available
-                ):
-                    # Give CLI time to flush JSONL writes before we read
-                    await asyncio.sleep(0.5)
+            # --- Upload transcript for next-turn --resume ---
+            # After async with the SDK task group has exited, so the Stop
+            # hook has already fired and the CLI has been SIGTERMed.  The
+            # CLI uses appendFileSync, so all writes are safely on disk.
+            if config.claude_agent_use_resume and user_id:
+                # With --resume the CLI appends to the resume file (most
+                # complete).  Otherwise use the Stop hook path.
+                if use_resume and resume_file:
+                    raw_transcript = read_transcript_file(resume_file)
+                elif captured_transcript.path:
                    raw_transcript = read_transcript_file(captured_transcript.path)
-                    if raw_transcript:
-                        try:
-                            async with asyncio.timeout(30):
-                                await _upload_transcript_bg(
-                                    user_id, session_id, raw_transcript
-                                )
-                        except asyncio.TimeoutError:
-                            logger.warning(
-                                f"[SDK] Transcript upload timed out for {session_id}"
-                            )
-                    else:
-                        logger.debug("[SDK] Stop hook fired but transcript not usable")
+                else:
+                    raw_transcript = None
+
+                if raw_transcript:
+                    # Shield the upload from generator cancellation so a
+                    # client disconnect / page refresh doesn't lose the
+                    # transcript.  The upload must finish even if the SSE
+                    # connection is torn down.
+                    await asyncio.shield(
+                        _try_upload_transcript(
+                            user_id,
+                            session_id,
+                            raw_transcript,
+                            message_count=len(session.messages),
+                        )
+                    )

        except ImportError:
            raise RuntimeError(
@@ -712,7 +730,7 @@ async def stream_chat_completion_sdk(
                "to use the OpenAI-compatible fallback."
            )

-        await upsert_chat_session(session)
+        await asyncio.shield(upsert_chat_session(session))
        logger.debug(
            f"[SDK] Session {session_id} saved with {len(session.messages)} messages"
        )
@@ -722,7 +740,7 @@ async def stream_chat_completion_sdk(
    except Exception as e:
        logger.error(f"[SDK] Error: {e}", exc_info=True)
        try:
-            await upsert_chat_session(session)
+            await asyncio.shield(upsert_chat_session(session))
        except Exception as save_err:
            logger.error(f"[SDK] Failed to save session on error: {save_err}")
        yield StreamError(
@@ -735,14 +753,31 @@ async def stream_chat_completion_sdk(
            _cleanup_sdk_tool_results(sdk_cwd)


-async def _upload_transcript_bg(
-    user_id: str, session_id: str, raw_content: str
-) -> None:
-    """Background task to strip progress entries and upload transcript."""
+async def _try_upload_transcript(
+    user_id: str,
+    session_id: str,
+    raw_content: str,
+    message_count: int = 0,
+) -> bool:
+    """Strip progress entries and upload transcript (with timeout).
+
+    Returns True if the upload completed without error.
+    """
    try:
-        await upload_transcript(user_id, session_id, raw_content)
+        async with asyncio.timeout(30):
+            await upload_transcript(
+                user_id, session_id, raw_content, message_count=message_count
+            )
+        return True
+    except asyncio.TimeoutError:
+        logger.warning(f"[SDK] Transcript upload timed out for {session_id}")
+        return False
    except Exception as e:
-        logger.error(f"[SDK] Failed to upload transcript for {session_id}: {e}")
+        logger.error(
+            f"[SDK] Failed to upload transcript for {session_id}: {e}",
+            exc_info=True,
+        )
+        return False


 async def _update_title_async(
--- a/autogpt_platform/backend/backend/copilot/sdk/tool_adapter.py
+++ b/autogpt_platform/backend/backend/copilot/sdk/tool_adapter.py
@@ -41,7 +41,7 @@ _current_session: ContextVar[ChatSession | None] = ContextVar(
 # Stash for MCP tool outputs before the SDK potentially truncates them.
 # Keyed by tool_name → full output string. Consumed (popped) by the
 # response adapter when it builds StreamToolOutputAvailable.
-_pending_tool_outputs: ContextVar[dict[str, str]] = ContextVar(
+_pending_tool_outputs: ContextVar[dict[str, list[str]]] = ContextVar(
    "pending_tool_outputs", default=None  # type: ignore[arg-type]
 )

@@ -88,19 +88,52 @@ def get_execution_context() -> tuple[str | None, ChatSession | None]:


 def pop_pending_tool_output(tool_name: str) -> str | None:
-    """Pop and return the stashed full output for *tool_name*.
+    """Pop and return the oldest stashed output for *tool_name*.

    The SDK CLI may truncate large tool results (writing them to disk and
    replacing the content with a file reference). This stash keeps the
    original MCP output so the response adapter can forward it to the
    frontend for proper widget rendering.

+    Uses a FIFO queue per tool name so duplicate calls to the same tool
+    in one turn each get their own output.
+
    Returns ``None`` if nothing was stashed for *tool_name*.
    """
    pending = _pending_tool_outputs.get(None)
    if pending is None:
        return None
-    return pending.pop(tool_name, None)
+    queue = pending.get(tool_name)
+    if not queue:
+        pending.pop(tool_name, None)
+        return None
+    value = queue.pop(0)
+    if not queue:
+        del pending[tool_name]
+    return value
+
+
+def stash_pending_tool_output(tool_name: str, output: Any) -> None:
+    """Stash tool output for later retrieval by the response adapter.
+
+    Used by the PostToolUse hook to capture SDK built-in tool outputs
+    (WebSearch, Read, etc.) that aren't available through the MCP stash
+    mechanism in ``_execute_tool_sync``.
+
+    Appends to a FIFO queue per tool name so multiple calls to the same
+    tool in one turn are all preserved.
+    """
+    pending = _pending_tool_outputs.get(None)
+    if pending is None:
+        return
+    if isinstance(output, str):
+        text = output
+    else:
+        try:
+            text = json.dumps(output)
+        except (TypeError, ValueError):
+            text = str(output)
+    pending.setdefault(tool_name, []).append(text)


 async def _execute_tool_sync(
@@ -125,14 +158,63 @@ async def _execute_tool_sync(
    # Stash the full output before the SDK potentially truncates it.
    pending = _pending_tool_outputs.get(None)
    if pending is not None:
-        pending[base_tool.name] = text
+        pending.setdefault(base_tool.name, []).append(text)
+
+    content_blocks: list[dict[str, str]] = [{"type": "text", "text": text}]
+
+    # If the tool result contains inline image data, add an MCP image block
+    # so Claude can "see" the image (e.g. read_workspace_file on a small PNG).
+    image_block = _extract_image_block(text)
+    if image_block:
+        content_blocks.append(image_block)

    return {
-        "content": [{"type": "text", "text": text}],
+        "content": content_blocks,
        "isError": not result.success,
    }


+# MIME types that Claude can process as image content blocks.
+_SUPPORTED_IMAGE_TYPES = frozenset(
+    {"image/png", "image/jpeg", "image/gif", "image/webp"}
+)
+
+
+def _extract_image_block(text: str) -> dict[str, str] | None:
+    """Extract an MCP image content block from a tool result JSON string.
+
+    Detects workspace file responses with ``content_base64`` and an image
+    MIME type, returning an MCP-format image block that allows Claude to
+    "see" the image.  Returns ``None`` if the result is not an inline image.
+    """
+    try:
+        data = json.loads(text)
+    except (json.JSONDecodeError, TypeError):
+        return None
+
+    if not isinstance(data, dict):
+        return None
+
+    mime_type = data.get("mime_type", "")
+    base64_content = data.get("content_base64", "")
+
+    # Only inline small images — large ones would exceed Claude's limits.
+    # 32 KB raw ≈ ~43 KB base64.
+    _MAX_IMAGE_BASE64_BYTES = 43_000
+    if (
+        mime_type in _SUPPORTED_IMAGE_TYPES
+        and base64_content
+        and len(base64_content) <= _MAX_IMAGE_BASE64_BYTES
+    ):
+        return {
+            "type": "image",
+            "data": base64_content,
+            "mimeType": mime_type,
+        }
+
+    return None
+
+
 def _mcp_error(message: str) -> dict[str, Any]:
    return {
        "content": [
@@ -311,14 +393,29 @@ def create_copilot_mcp_server():
 # which provides kernel-level network isolation via unshare --net.
 # Task allows spawning sub-agents (rate-limited by security hooks).
 # WebSearch uses Brave Search via Anthropic's API — safe, no SSRF risk.
-_SDK_BUILTIN_TOOLS = ["Read", "Write", "Edit", "Glob", "Grep", "Task", "WebSearch"]
+# TodoWrite manages the task checklist shown in the UI — no security concern.
+_SDK_BUILTIN_TOOLS = [
+    "Read",
+    "Write",
+    "Edit",
+    "Glob",
+    "Grep",
+    "Task",
+    "WebSearch",
+    "TodoWrite",
+]

 # SDK built-in tools that must be explicitly blocked.
 # Bash: dangerous — agent uses mcp__copilot__bash_exec with kernel-level
 #   network isolation (unshare --net) instead.
 # WebFetch: SSRF risk — can reach internal network (localhost, 10.x, etc.).
 #   Agent uses the SSRF-protected mcp__copilot__web_fetch tool instead.
-SDK_DISALLOWED_TOOLS = ["Bash", "WebFetch"]
+# AskUserQuestion: interactive CLI tool — no terminal in copilot context.
+SDK_DISALLOWED_TOOLS = [
+    "Bash",
+    "WebFetch",
+    "AskUserQuestion",
+]

 # Tools that are blocked entirely in security hooks (defence-in-depth).
 # Includes SDK_DISALLOWED_TOOLS plus common aliases/synonyms.
--- a/autogpt_platform/backend/backend/copilot/sdk/transcript.py
+++ b/autogpt_platform/backend/backend/copilot/sdk/transcript.py
@@ -14,6 +14,8 @@ import json
 import logging
 import os
 import re
+import time
+from dataclasses import dataclass

 logger = logging.getLogger(__name__)

@@ -31,6 +33,16 @@ STRIPPABLE_TYPES = frozenset(
    {"progress", "file-history-snapshot", "queue-operation", "summary", "pr-link"}
 )

+
+@dataclass
+class TranscriptDownload:
+    """Result of downloading a transcript with its metadata."""
+
+    content: str
+    message_count: int = 0  # session.messages length when uploaded
+    uploaded_at: float = 0.0  # epoch timestamp of upload
+
+
 # Workspace storage constants — deterministic path from session_id.
 TRANSCRIPT_STORAGE_PREFIX = "chat-transcripts"

@@ -119,16 +131,12 @@ def read_transcript_file(transcript_path: str) -> str | None:
            content = f.read()

        if not content.strip():
-            logger.debug(f"[Transcript] Empty file: {transcript_path}")
            return None

        lines = content.strip().split("\n")
        if len(lines) < 3:
            # Raw files with ≤2 lines are metadata-only
            # (queue-operation + file-history-snapshot, no conversation).
-            logger.debug(
-                f"[Transcript] Too few lines ({len(lines)}): {transcript_path}"
-            )
            return None

        # Quick structural validation — parse first and last lines.
@@ -160,6 +168,41 @@ def _sanitize_id(raw_id: str, max_len: int = 36) -> str:
 _SAFE_CWD_PREFIX = os.path.realpath("/tmp/copilot-")


+def _encode_cwd_for_cli(cwd: str) -> str:
+    """Encode a working directory path the same way the Claude CLI does.
+
+    The CLI replaces all non-alphanumeric characters with ``-``.
+    """
+    return re.sub(r"[^a-zA-Z0-9]", "-", os.path.realpath(cwd))
+
+
+def cleanup_cli_project_dir(sdk_cwd: str) -> None:
+    """Remove the CLI's project directory for a specific working directory.
+
+    The CLI stores session data under ``~/.claude/projects/<encoded_cwd>/``.
+    Each SDK turn uses a unique ``sdk_cwd``, so the project directory is
+    safe to remove entirely after the transcript has been uploaded.
+    """
+    import shutil
+
+    cwd_encoded = _encode_cwd_for_cli(sdk_cwd)
+    config_dir = os.environ.get("CLAUDE_CONFIG_DIR") or os.path.expanduser("~/.claude")
+    projects_base = os.path.realpath(os.path.join(config_dir, "projects"))
+    project_dir = os.path.realpath(os.path.join(projects_base, cwd_encoded))
+
+    if not project_dir.startswith(projects_base + os.sep):
+        logger.warning(
+            f"[Transcript] Cleanup path escaped projects base: {project_dir}"
+        )
+        return
+
+    if os.path.isdir(project_dir):
+        shutil.rmtree(project_dir, ignore_errors=True)
+        logger.debug(f"[Transcript] Cleaned up CLI project dir: {project_dir}")
+    else:
+        logger.debug(f"[Transcript] Project dir not found: {project_dir}")
+
+
 def write_transcript_to_tempfile(
    transcript_content: str,
    session_id: str,
@@ -248,6 +291,15 @@ def _storage_path_parts(user_id: str, session_id: str) -> tuple[str, str, str]:
    )


+def _meta_storage_path_parts(user_id: str, session_id: str) -> tuple[str, str, str]:
+    """Return (workspace_id, file_id, filename) for a session's transcript metadata."""
+    return (
+        TRANSCRIPT_STORAGE_PREFIX,
+        _sanitize_id(user_id),
+        f"{_sanitize_id(session_id)}.meta.json",
+    )
+
+
 def _build_storage_path(user_id: str, session_id: str, backend: object) -> str:
    """Build the full storage path string that ``retrieve()`` expects.

@@ -268,21 +320,30 @@ def _build_storage_path(user_id: str, session_id: str, backend: object) -> str:
        return f"local://{wid}/{fid}/{fname}"


-async def upload_transcript(user_id: str, session_id: str, content: str) -> None:
+async def upload_transcript(
+    user_id: str,
+    session_id: str,
+    content: str,
+    message_count: int = 0,
+) -> None:
    """Strip progress entries and upload transcript to bucket storage.

    Safety: only overwrites when the new (stripped) transcript is larger than
    what is already stored.  Since JSONL is append-only, the latest transcript
    is always the longest.  This prevents a slow/stale background task from
    clobbering a newer upload from a concurrent turn.
+
+    Args:
+        message_count: ``len(session.messages)`` at upload time — used by
+            the next turn to detect staleness and compress only the gap.
    """
    from backend.util.workspace_storage import get_workspace_storage

    stripped = strip_progress_entries(content)
    if not validate_transcript(stripped):
        logger.warning(
-            f"[Transcript] Skipping upload — stripped content is not a valid "
-            f"transcript for session {session_id}"
+            f"[Transcript] Skipping upload — stripped content not valid "
+            f"for session {session_id}"
        )
        return

@@ -297,9 +358,8 @@ async def upload_transcript(user_id: str, session_id: str, content: str) -> None
        existing = await storage.retrieve(path)
        if len(existing) >= new_size:
            logger.info(
-                f"[Transcript] Skipping upload — existing transcript "
-                f"({len(existing)}B) >= new ({new_size}B) for session "
-                f"{session_id}"
+                f"[Transcript] Skipping upload — existing ({len(existing)}B) "
+                f">= new ({new_size}B) for session {session_id}"
            )
            return
    except (FileNotFoundError, Exception):
@@ -311,16 +371,38 @@ async def upload_transcript(user_id: str, session_id: str, content: str) -> None
        filename=fname,
        content=encoded,
    )
+
+    # Store metadata alongside the transcript so the next turn can detect
+    # staleness and only compress the gap instead of the full history.
+    # Wrapped in try/except so a metadata write failure doesn't orphan
+    # the already-uploaded transcript — the next turn will just fall back
+    # to full gap fill (msg_count=0).
+    try:
+        meta = {"message_count": message_count, "uploaded_at": time.time()}
+        mwid, mfid, mfname = _meta_storage_path_parts(user_id, session_id)
+        await storage.store(
+            workspace_id=mwid,
+            file_id=mfid,
+            filename=mfname,
+            content=json.dumps(meta).encode("utf-8"),
+        )
+    except Exception as e:
+        logger.warning(f"[Transcript] Failed to write metadata for {session_id}: {e}")
+
    logger.info(
-        f"[Transcript] Uploaded {new_size} bytes "
-        f"(stripped from {len(content)}) for session {session_id}"
+        f"[Transcript] Uploaded {new_size}B "
+        f"(stripped from {len(content)}B, msg_count={message_count}) "
+        f"for session {session_id}"
    )


-async def download_transcript(user_id: str, session_id: str) -> str | None:
-    """Download transcript from bucket storage.
+async def download_transcript(
+    user_id: str, session_id: str
+) -> TranscriptDownload | None:
+    """Download transcript and metadata from bucket storage.

-    Returns the JSONL content string, or ``None`` if not found.
+    Returns a ``TranscriptDownload`` with the JSONL content and the
+    ``message_count`` watermark from the upload, or ``None`` if not found.
    """
    from backend.util.workspace_storage import get_workspace_storage

@@ -330,10 +412,6 @@ async def download_transcript(user_id: str, session_id: str) -> str | None:
    try:
        data = await storage.retrieve(path)
        content = data.decode("utf-8")
-        logger.info(
-            f"[Transcript] Downloaded {len(content)} bytes for session {session_id}"
-        )
-        return content
    except FileNotFoundError:
        logger.debug(f"[Transcript] No transcript in storage for {session_id}")
        return None
@@ -341,6 +419,36 @@ async def download_transcript(user_id: str, session_id: str) -> str | None:
        logger.warning(f"[Transcript] Failed to download transcript: {e}")
        return None

+    # Try to load metadata (best-effort — old transcripts won't have it)
+    message_count = 0
+    uploaded_at = 0.0
+    try:
+        from backend.util.workspace_storage import GCSWorkspaceStorage
+
+        mwid, mfid, mfname = _meta_storage_path_parts(user_id, session_id)
+        if isinstance(storage, GCSWorkspaceStorage):
+            blob = f"workspaces/{mwid}/{mfid}/{mfname}"
+            meta_path = f"gcs://{storage.bucket_name}/{blob}"
+        else:
+            meta_path = f"local://{mwid}/{mfid}/{mfname}"
+
+        meta_data = await storage.retrieve(meta_path)
+        meta = json.loads(meta_data.decode("utf-8"))
+        message_count = meta.get("message_count", 0)
+        uploaded_at = meta.get("uploaded_at", 0.0)
+    except (FileNotFoundError, json.JSONDecodeError, Exception):
+        pass  # No metadata — treat as unknown (msg_count=0 → always fill gap)
+
+    logger.info(
+        f"[Transcript] Downloaded {len(content)}B "
+        f"(msg_count={message_count}) for session {session_id}"
+    )
+    return TranscriptDownload(
+        content=content,
+        message_count=message_count,
+        uploaded_at=uploaded_at,
+    )
+

 async def delete_transcript(user_id: str, session_id: str) -> None:
    """Delete transcript from bucket storage (e.g. after resume failure)."""
--- a/autogpt_platform/backend/backend/copilot/service.py
+++ b/autogpt_platform/backend/backend/copilot/service.py
@@ -118,6 +118,8 @@ Adapt flexibly to the conversation context. Not every interaction requires all s
   - Find reusable components with `find_block`
   - Create custom solutions with `create_agent` if nothing suitable exists
   - Modify existing library agents with `edit_agent`
+   - **When `create_agent` returns `suggested_goal`**: Present the suggestion to the user and ask "Would you like me to proceed with this refined goal?" If they accept, call `create_agent` again with the suggested goal.
+   - **When `create_agent` returns `clarifying_questions`**: After the user answers, call `create_agent` again with the original description AND the answers in the `context` parameter.

 5. **Execute**: Run automations immediately, schedule them, or set up webhooks using `run_agent`. Test specific components with `run_block`.

@@ -164,6 +166,11 @@ Adapt flexibly to the conversation context. Not every interaction requires all s
 - Use `add_understanding` to capture valuable business context
 - When tool calls fail, try alternative approaches

+**Handle Feedback Loops:**
+- When a tool returns a suggested alternative (like a refined goal), present it clearly and ask the user for confirmation before proceeding
+- When clarifying questions are answered, immediately re-call the tool with the accumulated context
+- Don't ask redundant questions if the user has already provided context in the conversation
+
 ## CRITICAL REMINDER

 You are NOT a chatbot. You are NOT documentation. You are a partner who helps busy business owners get value quickly by showing proof through working automations. Bias toward action over explanation."""
@@ -1225,23 +1232,10 @@ async def _stream_chat_chunks(
                    },
                )

-                # Yield all accumulated tool calls after the stream is complete
-                # This ensures all tool call arguments have been fully received
-                for idx, tool_call in enumerate(tool_calls):
-                    try:
-                        async for tc in _yield_tool_call(tool_calls, idx, session):
-                            yield tc
-                    except (orjson.JSONDecodeError, KeyError, TypeError) as e:
-                        logger.error(
-                            f"Failed to parse tool call {idx}: {e}",
-                            exc_info=True,
-                            extra={"tool_call": tool_call},
-                        )
-                        yield StreamError(
-                            errorText=f"Invalid tool call arguments for tool {tool_call.get('function', {}).get('name', 'unknown')}: {e}",
-                        )
-                        # Re-raise to trigger retry logic in the parent function
-                        raise
+                # Execute all accumulated tool calls in parallel
+                # Events are yielded as they arrive from each concurrent tool
+                async for event in _execute_tool_calls_parallel(tool_calls, session):
+                    yield event

                total_time = (time_module.perf_counter() - stream_chunks_start) * 1000
                logger.info(
@@ -1319,10 +1313,91 @@ async def _stream_chat_chunks(
            return


+async def _with_optional_lock(
+    lock: asyncio.Lock | None,
+    coro_fn: Any,
+) -> Any:
+    """Run *coro_fn()* under *lock* when provided, otherwise run directly."""
+    if lock:
+        async with lock:
+            return await coro_fn()
+    return await coro_fn()
+
+
+async def _execute_tool_calls_parallel(
+    tool_calls: list[dict[str, Any]],
+    session: ChatSession,
+) -> AsyncGenerator[StreamBaseResponse, None]:
+    """Execute all tool calls concurrently, yielding stream events as they arrive.
+
+    Each tool runs as an ``asyncio.Task``, pushing events into a shared queue.
+    A ``session_lock`` serialises session-state mutations (long-running tool
+    bookkeeping, ``run_agent`` counters).
+    """
+    queue: asyncio.Queue[StreamBaseResponse | None] = asyncio.Queue()
+    session_lock = asyncio.Lock()
+    n_tools = len(tool_calls)
+    retryable_errors: list[Exception] = []
+
+    async def _run_tool(idx: int) -> None:
+        tool_name = tool_calls[idx].get("function", {}).get("name", "unknown")
+        tool_call_id = tool_calls[idx].get("id", f"unknown_{idx}")
+        try:
+            async for event in _yield_tool_call(tool_calls, idx, session, session_lock):
+                await queue.put(event)
+        except (orjson.JSONDecodeError, KeyError, TypeError) as e:
+            logger.error(
+                f"Failed to parse tool call {idx} ({tool_name}): {e}",
+                exc_info=True,
+            )
+            retryable_errors.append(e)
+        except Exception as e:
+            # Infrastructure / setup errors — emit an error output so the
+            # client always sees a terminal event and doesn't hang.
+            logger.error(f"Tool call {idx} ({tool_name}) failed: {e}", exc_info=True)
+            await queue.put(
+                StreamToolOutputAvailable(
+                    toolCallId=tool_call_id,
+                    toolName=tool_name,
+                    output=ErrorResponse(
+                        message=f"Tool execution failed: {e!s}",
+                        error=type(e).__name__,
+                        session_id=session.session_id,
+                    ).model_dump_json(),
+                    success=False,
+                )
+            )
+        finally:
+            await queue.put(None)  # sentinel
+
+    tasks = [asyncio.create_task(_run_tool(idx)) for idx in range(n_tools)]
+    try:
+        finished = 0
+        while finished < n_tools:
+            event = await queue.get()
+            if event is None:
+                finished += 1
+            else:
+                yield event
+        if retryable_errors:
+            if len(retryable_errors) > 1:
+                logger.warning(
+                    f"{len(retryable_errors)} tool calls had retryable errors; "
+                    f"re-raising first to trigger retry"
+                )
+            raise retryable_errors[0]
+    finally:
+        for t in tasks:
+            if not t.done():
+                t.cancel()
+        await asyncio.gather(*tasks, return_exceptions=True)
+
+
 async def _yield_tool_call(
    tool_calls: list[dict[str, Any]],
    yield_idx: int,
    session: ChatSession,
+    session_lock: asyncio.Lock | None = None,
 ) -> AsyncGenerator[StreamBaseResponse, None]:
    """
    Yield a tool call and its execution result.
@@ -1420,8 +1495,7 @@ async def _yield_tool_call(
                "check back in a few minutes."
            )

-        # Track appended messages for rollback on failure
-        assistant_message: ChatMessage | None = None
+        # Track appended message for rollback on failure
        pending_message: ChatMessage | None = None

        # Wrap session save and task creation in try-except to release lock on failure
@@ -1436,22 +1510,24 @@ async def _yield_tool_call(
                operation_id=operation_id,
            )

-            # Attach the tool_call to the current turn's assistant message
-            # (or create one if this is a tool-only response with no text).
-            session.add_tool_call_to_current_turn(tool_calls[yield_idx])
+            # Attach tool_call and save pending result — lock serialises
+            # concurrent session mutations during parallel execution.
+            async def _save_pending() -> None:
+                nonlocal pending_message
+                session.add_tool_call_to_current_turn(tool_calls[yield_idx])
+                pending_message = ChatMessage(
+                    role="tool",
+                    content=OperationPendingResponse(
+                        message=pending_msg,
+                        operation_id=operation_id,
+                        tool_name=tool_name,
+                    ).model_dump_json(),
+                    tool_call_id=tool_call_id,
+                )
+                session.messages.append(pending_message)
+                await upsert_chat_session(session)

-            # Then save pending tool result
-            pending_message = ChatMessage(
-                role="tool",
-                content=OperationPendingResponse(
-                    message=pending_msg,
-                    operation_id=operation_id,
-                    tool_name=tool_name,
-                ).model_dump_json(),
-                tool_call_id=tool_call_id,
-            )
-            session.messages.append(pending_message)
-            await upsert_chat_session(session)
+            await _with_optional_lock(session_lock, _save_pending)
            logger.info(
                f"Saved pending operation {operation_id} (task_id={task_id}) "
                f"for tool {tool_name} in session {session.session_id}"
@@ -1475,27 +1551,21 @@ async def _yield_tool_call(
            # Associate the asyncio task with the stream registry task
            await stream_registry.set_task_asyncio_task(task_id, bg_task)
        except Exception as e:
-            # Roll back appended messages to prevent data corruption on subsequent saves
-            if (
-                pending_message
-                and session.messages
-                and session.messages[-1] == pending_message
-            ):
-                session.messages.pop()
-            if (
-                assistant_message
-                and session.messages
-                and session.messages[-1] == assistant_message
-            ):
-                session.messages.pop()
+            # Roll back appended messages — use identity-based removal so
+            # it works even when other parallel tools have appended after us.
+            async def _rollback() -> None:
+                if pending_message and pending_message in session.messages:
+                    session.messages.remove(pending_message)
+
+            await _with_optional_lock(session_lock, _rollback)

            # Release the Redis lock since the background task won't be spawned
            await _mark_operation_completed(tool_call_id)
            # Mark stream registry task as failed if it was created
            try:
                await stream_registry.mark_task_completed(task_id, status="failed")
-            except Exception:
-                pass
+            except Exception as mark_err:
+                logger.warning(f"Failed to mark task {task_id} as failed: {mark_err}")
            logger.error(
                f"Failed to setup long-running tool {tool_name}: {e}", exc_info=True
            )
--- a/autogpt_platform/backend/backend/copilot/service_test.py
+++ b/autogpt_platform/backend/backend/copilot/service_test.py
@@ -143,7 +143,7 @@ async def test_sdk_resume_multi_turn(setup_test_user, test_user_id):
        "Transcript was not uploaded to bucket after turn 1 — "
        "Stop hook may not have fired or transcript was too small"
    )
-    logger.info(f"Turn 1 transcript uploaded: {len(transcript)} bytes")
+    logger.info(f"Turn 1 transcript uploaded: {len(transcript.content)} bytes")

    # Reload session for turn 2
    session = await get_chat_session(session.session_id, test_user_id)
--- a/autogpt_platform/backend/backend/copilot/stream_registry.py
+++ b/autogpt_platform/backend/backend/copilot/stream_registry.py
@@ -829,8 +829,11 @@ async def get_active_task_for_session(
                            )
                            await mark_task_completed(task_id, "failed")
                            continue
-                    except (ValueError, TypeError):
-                        pass
+                    except (ValueError, TypeError) as exc:
+                        logger.warning(
+                            f"[TASK_LOOKUP] Failed to parse created_at "
+                            f"for task {task_id[:8]}...: {exc}"
+                        )

                logger.info(
                    f"[TASK_LOOKUP] Found running task {task_id[:8]}... for session {session_id[:8]}..."
--- a/autogpt_platform/backend/backend/copilot/tools/create_agent.py
+++ b/autogpt_platform/backend/backend/copilot/tools/create_agent.py
@@ -22,6 +22,7 @@ from .models import (
    ClarificationNeededResponse,
    ClarifyingQuestion,
    ErrorResponse,
+    SuggestedGoalResponse,
    ToolResponseBase,
 )

@@ -186,26 +187,28 @@ class CreateAgentTool(BaseTool):
        if decomposition_result.get("type") == "unachievable_goal":
            suggested = decomposition_result.get("suggested_goal", "")
            reason = decomposition_result.get("reason", "")
-            return ErrorResponse(
+            return SuggestedGoalResponse(
                message=(
-                    f"This goal cannot be accomplished with the available blocks. "
-                    f"{reason} "
-                    f"Suggestion: {suggested}"
+                    f"This goal cannot be accomplished with the available blocks. {reason}"
                ),
-                error="unachievable_goal",
-                details={"suggested_goal": suggested, "reason": reason},
+                suggested_goal=suggested,
+                reason=reason,
+                original_goal=description,
+                goal_type="unachievable",
                session_id=session_id,
            )

        if decomposition_result.get("type") == "vague_goal":
            suggested = decomposition_result.get("suggested_goal", "")
-            return ErrorResponse(
-                message=(
-                    f"The goal is too vague to create a specific workflow. "
-                    f"Suggestion: {suggested}"
-                ),
-                error="vague_goal",
-                details={"suggested_goal": suggested},
+            reason = decomposition_result.get(
+                "reason", "The goal needs more specific details"
+            )
+            return SuggestedGoalResponse(
+                message="The goal is too vague to create a specific workflow.",
+                suggested_goal=suggested,
+                reason=reason,
+                original_goal=description,
+                goal_type="vague",
                session_id=session_id,
            )

--- a/autogpt_platform/backend/backend/copilot/tools/create_agent_test.py
+++ b/autogpt_platform/backend/backend/copilot/tools/create_agent_test.py
@@ -0,0 +1,142 @@
+"""Tests for CreateAgentTool response types."""
+
+from unittest.mock import AsyncMock, patch
+
+import pytest
+
+from backend.copilot.tools.create_agent import CreateAgentTool
+from backend.copilot.tools.models import (
+    ClarificationNeededResponse,
+    ErrorResponse,
+    SuggestedGoalResponse,
+)
+
+from ._test_data import make_session
+
+_TEST_USER_ID = "test-user-create-agent"
+
+
+@pytest.fixture
+def tool():
+    return CreateAgentTool()
+
+
+@pytest.fixture
+def session():
+    return make_session(_TEST_USER_ID)
+
+
+@pytest.mark.asyncio
+async def test_missing_description_returns_error(tool, session):
+    """Missing description returns ErrorResponse."""
+    result = await tool._execute(user_id=_TEST_USER_ID, session=session, description="")
+    assert isinstance(result, ErrorResponse)
+    assert result.error == "Missing description parameter"
+
+
+@pytest.mark.asyncio
+async def test_vague_goal_returns_suggested_goal_response(tool, session):
+    """vague_goal decomposition result returns SuggestedGoalResponse, not ErrorResponse."""
+    vague_result = {
+        "type": "vague_goal",
+        "suggested_goal": "Monitor Twitter mentions for a specific keyword and send a daily digest email",
+    }
+
+    with (
+        patch(
+            "backend.copilot.tools.create_agent.get_all_relevant_agents_for_generation",
+            new_callable=AsyncMock,
+            return_value=[],
+        ),
+        patch(
+            "backend.copilot.tools.create_agent.decompose_goal",
+            new_callable=AsyncMock,
+            return_value=vague_result,
+        ),
+    ):
+        result = await tool._execute(
+            user_id=_TEST_USER_ID,
+            session=session,
+            description="monitor social media",
+        )
+
+    assert isinstance(result, SuggestedGoalResponse)
+    assert result.goal_type == "vague"
+    assert result.suggested_goal == vague_result["suggested_goal"]
+    assert result.original_goal == "monitor social media"
+    assert result.reason == "The goal needs more specific details"
+    assert not isinstance(result, ErrorResponse)
+
+
+@pytest.mark.asyncio
+async def test_unachievable_goal_returns_suggested_goal_response(tool, session):
+    """unachievable_goal decomposition result returns SuggestedGoalResponse, not ErrorResponse."""
+    unachievable_result = {
+        "type": "unachievable_goal",
+        "suggested_goal": "Summarize the latest news articles on a topic and send them by email",
+        "reason": "There are no blocks for mind-reading.",
+    }
+
+    with (
+        patch(
+            "backend.copilot.tools.create_agent.get_all_relevant_agents_for_generation",
+            new_callable=AsyncMock,
+            return_value=[],
+        ),
+        patch(
+            "backend.copilot.tools.create_agent.decompose_goal",
+            new_callable=AsyncMock,
+            return_value=unachievable_result,
+        ),
+    ):
+        result = await tool._execute(
+            user_id=_TEST_USER_ID,
+            session=session,
+            description="read my mind",
+        )
+
+    assert isinstance(result, SuggestedGoalResponse)
+    assert result.goal_type == "unachievable"
+    assert result.suggested_goal == unachievable_result["suggested_goal"]
+    assert result.original_goal == "read my mind"
+    assert result.reason == unachievable_result["reason"]
+    assert not isinstance(result, ErrorResponse)
+
+
+@pytest.mark.asyncio
+async def test_clarifying_questions_returns_clarification_needed_response(
+    tool, session
+):
+    """clarifying_questions decomposition result returns ClarificationNeededResponse."""
+    clarifying_result = {
+        "type": "clarifying_questions",
+        "questions": [
+            {
+                "question": "What platform should be monitored?",
+                "keyword": "platform",
+                "example": "Twitter, Reddit",
+            }
+        ],
+    }
+
+    with (
+        patch(
+            "backend.copilot.tools.create_agent.get_all_relevant_agents_for_generation",
+            new_callable=AsyncMock,
+            return_value=[],
+        ),
+        patch(
+            "backend.copilot.tools.create_agent.decompose_goal",
+            new_callable=AsyncMock,
+            return_value=clarifying_result,
+        ),
+    ):
+        result = await tool._execute(
+            user_id=_TEST_USER_ID,
+            session=session,
+            description="monitor social media and alert me",
+        )
+
+    assert isinstance(result, ClarificationNeededResponse)
+    assert len(result.questions) == 1
+    assert result.questions[0].keyword == "platform"
--- a/autogpt_platform/backend/backend/copilot/tools/models.py
+++ b/autogpt_platform/backend/backend/copilot/tools/models.py
@@ -2,7 +2,7 @@

 from datetime import datetime
 from enum import Enum
-from typing import Any
+from typing import Any, Literal

 from pydantic import BaseModel, Field

@@ -50,6 +50,8 @@ class ResponseType(str, Enum):
    # Feature request types
    FEATURE_REQUEST_SEARCH = "feature_request_search"
    FEATURE_REQUEST_CREATED = "feature_request_created"
+    # Goal refinement
+    SUGGESTED_GOAL = "suggested_goal"


 # Base response model
@@ -296,6 +298,22 @@ class ClarificationNeededResponse(ToolResponseBase):
    questions: list[ClarifyingQuestion] = Field(default_factory=list)


+class SuggestedGoalResponse(ToolResponseBase):
+    """Response when the goal needs refinement with a suggested alternative."""
+
+    type: ResponseType = ResponseType.SUGGESTED_GOAL
+    suggested_goal: str = Field(description="The suggested alternative goal")
+    reason: str = Field(
+        default="", description="Why the original goal needs refinement"
+    )
+    original_goal: str = Field(
+        default="", description="The user's original goal for context"
+    )
+    goal_type: Literal["vague", "unachievable"] = Field(
+        default="vague", description="Type: 'vague' or 'unachievable'"
+    )
+
+
 # Documentation search models
 class DocSearchResult(BaseModel):
    """A single documentation search result."""
--- a/autogpt_platform/backend/backend/copilot/tools/workspace_files.py
+++ b/autogpt_platform/backend/backend/copilot/tools/workspace_files.py
@@ -312,8 +312,18 @@ class ReadWorkspaceFileTool(BaseTool):
            is_small_file = file_info.size_bytes <= self.MAX_INLINE_SIZE_BYTES
            is_text_file = self._is_text_mime_type(file_info.mime_type)

-            # Return inline content for small text files (unless force_download_url)
-            if is_small_file and is_text_file and not force_download_url:
+            # Return inline content for small text/image files (unless force_download_url)
+            is_image_file = file_info.mime_type in {
+                "image/png",
+                "image/jpeg",
+                "image/gif",
+                "image/webp",
+            }
+            if (
+                is_small_file
+                and (is_text_file or is_image_file)
+                and not force_download_url
+            ):
                content = await manager.read_file_by_id(target_file_id)
                content_b64 = base64.b64encode(content).decode("utf-8")

--- a/autogpt_platform/backend/backend/util/service.py
+++ b/autogpt_platform/backend/backend/util/service.py
@@ -599,6 +599,15 @@ def get_service_client(
                if error_response and error_response.type in EXCEPTION_MAPPING:
                    exception_class = EXCEPTION_MAPPING[error_response.type]
                    args = error_response.args or [str(e)]
+
+                    # Prisma DataError subclasses expect a dict `data` arg,
+                    # but RPC serialization only preserves the string message
+                    # from exc.args.  Wrap it in the expected structure so
+                    # the constructor doesn't crash on `.get()`.
+                    if issubclass(exception_class, DataError):
+                        msg = str(args[0]) if args else str(e)
+                        raise exception_class({"user_facing_error": {"message": msg}})
+
                    raise exception_class(*args)

                # Otherwise categorize by HTTP status code
--- a/autogpt_platform/backend/backend/util/service_test.py
+++ b/autogpt_platform/backend/backend/util/service_test.py
@@ -6,6 +6,7 @@ from unittest.mock import Mock

 import httpx
 import pytest
+from prisma.errors import DataError, UniqueViolationError

 from backend.util.service import (
    AppService,
@@ -447,6 +448,39 @@ class TestHTTPErrorRetryBehavior:

        assert "Invalid parameter value" in str(exc_info.value)

+    def test_prisma_data_error_reconstructed_correctly(self):
+        """Test that DataError subclasses (e.g. UniqueViolationError) are
+        reconstructed without crashing.
+
+        Prisma's DataError.__init__ expects a dict `data` arg with
+        a 'user_facing_error' key.  RPC serialization only preserves the
+        string message via exc.args, so the client must wrap it in the
+        expected dict structure.
+        """
+        for exc_type in [DataError, UniqueViolationError]:
+            mock_response = Mock()
+            mock_response.status_code = 400
+            mock_response.json.return_value = {
+                "type": exc_type.__name__,
+                "args": ["Unique constraint failed on the fields: (`path`)"],
+            }
+            mock_response.raise_for_status.side_effect = httpx.HTTPStatusError(
+                "400 Bad Request", request=Mock(), response=mock_response
+            )
+
+            client = get_service_client(ServiceTestClient)
+
+            with pytest.raises(exc_type) as exc_info:
+                client._handle_call_method_response(  # type: ignore[attr-defined]
+                    response=mock_response, method_name="test_method"
+                )
+
+            # The exception should have the message preserved
+            assert "Unique constraint" in str(exc_info.value)
+            # And should have the expected data structure (not crash)
+            assert hasattr(exc_info.value, "data")
+            assert isinstance(exc_info.value.data, dict)
+
    def test_client_error_status_codes_coverage(self):
        """Test that various 4xx status codes are all wrapped as HTTPClientError."""
        client_error_codes = [400, 401, 403, 404, 405, 409, 422, 429]
--- a/autogpt_platform/frontend/CLAUDE.md
+++ b/autogpt_platform/frontend/CLAUDE.md
@@ -30,6 +30,16 @@ pnpm format
 pnpm types
 ```

+### Pre-completion Checks (MANDATORY)
+
+After making **any** code changes in the frontend, you MUST run the following commands **in order** before reporting work as done, creating commits, or opening PRs:
+
+1. `pnpm format` — auto-fix formatting issues
+2. `pnpm lint` — check for lint errors; fix any that appear
+3. `pnpm types` — check for type errors; fix any that appear
+
+Do NOT skip these steps. If any command reports errors, fix them and re-run until clean. Only then may you consider the task complete. If typing keeps failing, stop and ask the user.
+
 ### Code Style

 - Fully capitalize acronyms in symbols, e.g. `graphID`, `useBackendAPI`
@@ -74,3 +84,4 @@ See @CONTRIBUTING.md for complete patterns. Quick reference:
   - Do not use `useCallback` or `useMemo` unless asked to optimise a given function
   - Do not type hook returns, let Typescript infer as much as possible
   - Never type with `any` unless a variable/attribute can ACTUALLY be of any type
+   - avoid index and barrel files
--- a/autogpt_platform/frontend/src/app/(platform)/copilot/CopilotPage.tsx
+++ b/autogpt_platform/frontend/src/app/(platform)/copilot/CopilotPage.tsx
@@ -23,6 +23,7 @@ export function CopilotPage() {
    status,
    error,
    stop,
+    isReconnecting,
    createSession,
    onSend,
    isLoadingSession,
@@ -71,6 +72,7 @@ export function CopilotPage() {
            sessionId={sessionId}
            isLoadingSession={isLoadingSession}
            isCreatingSession={isCreatingSession}
+            isReconnecting={isReconnecting}
            onCreateSession={createSession}
            onSend={onSend}
            onStop={stop}
--- a/autogpt_platform/frontend/src/app/(platform)/copilot/components/ChatContainer/ChatContainer.tsx
+++ b/autogpt_platform/frontend/src/app/(platform)/copilot/components/ChatContainer/ChatContainer.tsx
@@ -14,6 +14,8 @@ export interface ChatContainerProps {
  sessionId: string | null;
  isLoadingSession: boolean;
  isCreatingSession: boolean;
+  /** True when backend has an active stream but we haven't reconnected yet. */
+  isReconnecting?: boolean;
  onCreateSession: () => void | Promise<string>;
  onSend: (message: string) => void | Promise<void>;
  onStop: () => void;
@@ -26,11 +28,13 @@ export const ChatContainer = ({
  sessionId,
  isLoadingSession,
  isCreatingSession,
+  isReconnecting,
  onCreateSession,
  onSend,
  onStop,
  headerSlot,
 }: ChatContainerProps) => {
+  const isBusy = status === "streaming" || !!isReconnecting;
  const inputLayoutId = "copilot-2-chat-input";

  return (
@@ -56,8 +60,8 @@ export const ChatContainer = ({
                <ChatInput
                  inputId="chat-input-session"
                  onSend={onSend}
-                  disabled={status === "streaming"}
-                  isStreaming={status === "streaming"}
+                  disabled={isBusy}
+                  isStreaming={isBusy}
                  onStop={onStop}
                  placeholder="What else can I help with?"
                />
--- a/autogpt_platform/frontend/src/app/(platform)/copilot/components/ChatMessagesContainer/ChatMessagesContainer.tsx
+++ b/autogpt_platform/frontend/src/app/(platform)/copilot/components/ChatMessagesContainer/ChatMessagesContainer.tsx
@@ -169,7 +169,10 @@ export const ChatMessagesContainer = ({
      <ConversationContent className="flex flex-1 flex-col gap-6 px-3 py-6">
        {headerSlot}
        {isLoading && messages.length === 0 && (
-          <div className="flex min-h-full flex-1 items-center justify-center">
+          <div
+            className="flex flex-1 items-center justify-center"
+            style={{ minHeight: "calc(100vh - 12rem)" }}
+          >
            <LoadingSpinner className="text-neutral-600" />
          </div>
        )}
--- a/autogpt_platform/frontend/src/app/(platform)/copilot/tools/CreateAgent/CreateAgent.tsx
+++ b/autogpt_platform/frontend/src/app/(platform)/copilot/tools/CreateAgent/CreateAgent.tsx
@@ -26,6 +26,7 @@ import {
 } from "./components/ClarificationQuestionsCard";
 import sparklesImg from "./components/MiniGame/assets/sparkles.png";
 import { MiniGame } from "./components/MiniGame/MiniGame";
+import { SuggestedGoalCard } from "./components/SuggestedGoalCard";
 import {
  AccordionIcon,
  formatMaybeJson,
@@ -38,6 +39,7 @@ import {
  isOperationInProgressOutput,
  isOperationPendingOutput,
  isOperationStartedOutput,
+  isSuggestedGoalOutput,
  ToolIcon,
  truncateText,
  type CreateAgentToolOutput,
@@ -77,6 +79,13 @@ function getAccordionMeta(output: CreateAgentToolOutput) {
      expanded: true,
    };
  }
+  if (isSuggestedGoalOutput(output)) {
+    return {
+      icon,
+      title: "Goal needs refinement",
+      expanded: true,
+    };
+  }
  if (
    isOperationStartedOutput(output) ||
    isOperationPendingOutput(output) ||
@@ -125,8 +134,13 @@ export function CreateAgentTool({ part }: Props) {
      isAgentPreviewOutput(output) ||
      isAgentSavedOutput(output) ||
      isClarificationNeededOutput(output) ||
+      isSuggestedGoalOutput(output) ||
      isErrorOutput(output));

+  function handleUseSuggestedGoal(goal: string) {
+    onSend(`Please create an agent with this goal: ${goal}`);
+  }
+
  function handleClarificationAnswers(answers: Record<string, string>) {
    const questions =
      output && isClarificationNeededOutput(output)
@@ -245,6 +259,16 @@ export function CreateAgentTool({ part }: Props) {
            />
          )}

+          {isSuggestedGoalOutput(output) && (
+            <SuggestedGoalCard
+              message={output.message}
+              suggestedGoal={output.suggested_goal}
+              reason={output.reason}
+              goalType={output.goal_type ?? "vague"}
+              onUseSuggestedGoal={handleUseSuggestedGoal}
+            />
+          )}
+
          {isErrorOutput(output) && (
            <ContentGrid>
              <ContentMessage>{output.message}</ContentMessage>
@@ -258,6 +282,22 @@ export function CreateAgentTool({ part }: Props) {
                  {formatMaybeJson(output.details)}
                </ContentCodeBlock>
              )}
+              <div className="flex gap-2">
+                <Button
+                  variant="outline"
+                  size="small"
+                  onClick={() => onSend("Please try creating the agent again.")}
+                >
+                  Try again
+                </Button>
+                <Button
+                  variant="outline"
+                  size="small"
+                  onClick={() => onSend("Can you help me simplify this goal?")}
+                >
+                  Simplify goal
+                </Button>
+              </div>
            </ContentGrid>
          )}
        </ToolAccordion>
--- a/autogpt_platform/frontend/src/app/(platform)/copilot/tools/CreateAgent/components/MiniGame/MiniGame.tsx
+++ b/autogpt_platform/frontend/src/app/(platform)/copilot/tools/CreateAgent/components/MiniGame/MiniGame.tsx
@@ -10,17 +10,10 @@ export function MiniGame() {
  const { canvasRef, activeMode, showOverlay, score, highScore, onContinue } =
    useMiniGame();

-  const isRunActive =
-    activeMode === "run" || activeMode === "idle" || activeMode === "over";
-
  let overlayText: string | undefined;
  let buttonLabel = "Continue";
  if (activeMode === "idle") {
    buttonLabel = "Start";
-  } else if (activeMode === "boss-intro") {
-    overlayText = "Face the bandit!";
-  } else if (activeMode === "boss-defeated") {
-    overlayText = "Great job, keep on going";
  } else if (activeMode === "over") {
    overlayText = `Score: ${score} / Record: ${highScore}`;
    buttonLabel = "Retry";
@@ -29,16 +22,7 @@ export function MiniGame() {
  return (
    <div className="flex flex-col gap-2">
      <p className="text-sm font-medium text-purple-500">
-        {isRunActive ? (
-          <>
-            Run mode: <Key>Space</Key> to jump
-          </>
-        ) : (
-          <>
-            Duel mode: <Key>←→</Key> to move · <Key>Z</Key> to attack ·{" "}
-            <Key>X</Key> to block · <Key>Space</Key> to jump
-          </>
-        )}
+        <Key>WASD</Key> to move
      </p>
      <div className="relative w-full overflow-hidden rounded-md border border-accent bg-background text-foreground">
        <canvas
--- a/autogpt_platform/frontend/src/app/(platform)/copilot/tools/CreateAgent/components/MiniGame/assets/archer-attack.png
+++ b/autogpt_platform/frontend/src/app/(platform)/copilot/tools/CreateAgent/components/MiniGame/assets/archer-attack.png
--- a/autogpt_platform/frontend/src/app/(platform)/copilot/tools/CreateAgent/components/MiniGame/assets/archer-idle.png
+++ b/autogpt_platform/frontend/src/app/(platform)/copilot/tools/CreateAgent/components/MiniGame/assets/archer-idle.png
--- a/autogpt_platform/frontend/src/app/(platform)/copilot/tools/CreateAgent/components/MiniGame/assets/archer-shoot.png
+++ b/autogpt_platform/frontend/src/app/(platform)/copilot/tools/CreateAgent/components/MiniGame/assets/archer-shoot.png
--- a/autogpt_platform/frontend/src/app/(platform)/copilot/tools/CreateAgent/components/MiniGame/assets/attack.png
+++ b/autogpt_platform/frontend/src/app/(platform)/copilot/tools/CreateAgent/components/MiniGame/assets/attack.png
--- a/autogpt_platform/frontend/src/app/(platform)/copilot/tools/CreateAgent/components/MiniGame/assets/guard.png
+++ b/autogpt_platform/frontend/src/app/(platform)/copilot/tools/CreateAgent/components/MiniGame/assets/guard.png
--- a/autogpt_platform/frontend/src/app/(platform)/copilot/tools/CreateAgent/components/MiniGame/assets/idle.png
+++ b/autogpt_platform/frontend/src/app/(platform)/copilot/tools/CreateAgent/components/MiniGame/assets/idle.png
--- a/autogpt_platform/frontend/src/app/(platform)/copilot/tools/CreateAgent/components/MiniGame/assets/run.png
+++ b/autogpt_platform/frontend/src/app/(platform)/copilot/tools/CreateAgent/components/MiniGame/assets/run.png
--- a/autogpt_platform/frontend/src/app/(platform)/copilot/tools/CreateAgent/components/MiniGame/assets/tree-1.png
+++ b/autogpt_platform/frontend/src/app/(platform)/copilot/tools/CreateAgent/components/MiniGame/assets/tree-1.png
--- a/autogpt_platform/frontend/src/app/(platform)/copilot/tools/CreateAgent/components/MiniGame/assets/tree-2.png
+++ b/autogpt_platform/frontend/src/app/(platform)/copilot/tools/CreateAgent/components/MiniGame/assets/tree-2.png
--- a/autogpt_platform/frontend/src/app/(platform)/copilot/tools/CreateAgent/components/MiniGame/assets/tree-3.png
+++ b/autogpt_platform/frontend/src/app/(platform)/copilot/tools/CreateAgent/components/MiniGame/assets/tree-3.png
--- a/autogpt_platform/frontend/src/app/(platform)/copilot/tools/CreateAgent/components/MiniGame/useMiniGame.ts
+++ b/autogpt_platform/frontend/src/app/(platform)/copilot/tools/CreateAgent/components/MiniGame/useMiniGame.ts
--- a/autogpt_platform/frontend/src/app/(platform)/copilot/tools/CreateAgent/components/SuggestedGoalCard.tsx
+++ b/autogpt_platform/frontend/src/app/(platform)/copilot/tools/CreateAgent/components/SuggestedGoalCard.tsx
@@ -0,0 +1,63 @@
+"use client";
+
+import { Button } from "@/components/atoms/Button/Button";
+import { Text } from "@/components/atoms/Text/Text";
+import { ArrowRightIcon, LightbulbIcon } from "@phosphor-icons/react";
+
+interface Props {
+  message: string;
+  suggestedGoal: string;
+  reason?: string;
+  goalType: string;
+  onUseSuggestedGoal: (goal: string) => void;
+}
+
+export function SuggestedGoalCard({
+  message,
+  suggestedGoal,
+  reason,
+  goalType,
+  onUseSuggestedGoal,
+}: Props) {
+  return (
+    <div className="rounded-xl border border-amber-200 bg-amber-50/50 p-4">
+      <div className="flex items-start gap-3">
+        <LightbulbIcon
+          size={20}
+          weight="fill"
+          className="mt-0.5 text-amber-600"
+        />
+        <div className="flex-1 space-y-3">
+          <div>
+            <Text variant="body-medium" className="font-medium text-slate-900">
+              {goalType === "unachievable"
+                ? "Goal cannot be accomplished"
+                : "Goal needs more detail"}
+            </Text>
+            <Text variant="small" className="text-slate-600">
+              {reason || message}
+            </Text>
+          </div>
+
+          <div className="rounded-lg border border-amber-300 bg-white p-3">
+            <Text variant="small" className="mb-1 font-semibold text-amber-800">
+              Suggested alternative:
+            </Text>
+            <Text variant="body-medium" className="text-slate-900">
+              {suggestedGoal}
+            </Text>
+          </div>
+
+          <Button
+            onClick={() => onUseSuggestedGoal(suggestedGoal)}
+            variant="primary"
+          >
+            <span className="inline-flex items-center gap-1.5">
+              Use this goal <ArrowRightIcon size={14} weight="bold" />
+            </span>
+          </Button>
+        </div>
+      </div>
+    </div>
+  );
+}
--- a/autogpt_platform/frontend/src/app/(platform)/copilot/tools/CreateAgent/helpers.tsx
+++ b/autogpt_platform/frontend/src/app/(platform)/copilot/tools/CreateAgent/helpers.tsx
@@ -6,6 +6,7 @@ import type { OperationInProgressResponse } from "@/app/api/__generated__/models
 import type { OperationPendingResponse } from "@/app/api/__generated__/models/operationPendingResponse";
 import type { OperationStartedResponse } from "@/app/api/__generated__/models/operationStartedResponse";
 import { ResponseType } from "@/app/api/__generated__/models/responseType";
+import type { SuggestedGoalResponse } from "@/app/api/__generated__/models/suggestedGoalResponse";
 import {
  PlusCircleIcon,
  PlusIcon,
@@ -21,6 +22,7 @@ export type CreateAgentToolOutput =
  | AgentPreviewResponse
  | AgentSavedResponse
  | ClarificationNeededResponse
+  | SuggestedGoalResponse
  | ErrorResponse;

 function parseOutput(output: unknown): CreateAgentToolOutput | null {
@@ -43,6 +45,7 @@ function parseOutput(output: unknown): CreateAgentToolOutput | null {
      type === ResponseType.agent_preview ||
      type === ResponseType.agent_saved ||
      type === ResponseType.clarification_needed ||
+      type === ResponseType.suggested_goal ||
      type === ResponseType.error
    ) {
      return output as CreateAgentToolOutput;
@@ -55,6 +58,7 @@ function parseOutput(output: unknown): CreateAgentToolOutput | null {
    if ("agent_id" in output && "library_agent_id" in output)
      return output as AgentSavedResponse;
    if ("questions" in output) return output as ClarificationNeededResponse;
+    if ("suggested_goal" in output) return output as SuggestedGoalResponse;
    if ("error" in output || "details" in output)
      return output as ErrorResponse;
  }
@@ -114,6 +118,14 @@ export function isClarificationNeededOutput(
  );
 }

+export function isSuggestedGoalOutput(
+  output: CreateAgentToolOutput,
+): output is SuggestedGoalResponse {
+  return (
+    output.type === ResponseType.suggested_goal || "suggested_goal" in output
+  );
+}
+
 export function isErrorOutput(
  output: CreateAgentToolOutput,
 ): output is ErrorResponse {
@@ -139,6 +151,7 @@ export function getAnimationText(part: {
      if (isAgentSavedOutput(output)) return `Saved ${output.agent_name}`;
      if (isAgentPreviewOutput(output)) return `Preview "${output.agent_name}"`;
      if (isClarificationNeededOutput(output)) return "Needs clarification";
+      if (isSuggestedGoalOutput(output)) return "Goal needs refinement";
      return "Error creating agent";
    }
    case "output-error":
--- a/autogpt_platform/frontend/src/app/(platform)/copilot/tools/GenericTool/GenericTool.tsx
+++ b/autogpt_platform/frontend/src/app/(platform)/copilot/tools/GenericTool/GenericTool.tsx
@@ -1,63 +1,713 @@
 "use client";

+import React from "react";
 import { ToolUIPart } from "ai";
-import { GearIcon } from "@phosphor-icons/react";
+import {
+  CheckCircleIcon,
+  CircleDashedIcon,
+  CircleIcon,
+  FileIcon,
+  FilesIcon,
+  GearIcon,
+  GlobeIcon,
+  ListChecksIcon,
+  MagnifyingGlassIcon,
+  PencilSimpleIcon,
+  TerminalIcon,
+  TrashIcon,
+  WarningDiamondIcon,
+} from "@phosphor-icons/react";
 import { MorphingTextAnimation } from "../../components/MorphingTextAnimation/MorphingTextAnimation";
+import { ToolAccordion } from "../../components/ToolAccordion/ToolAccordion";
+import {
+  ContentCodeBlock,
+  ContentMessage,
+} from "../../components/ToolAccordion/AccordionContent";
+import { OrbitLoader } from "../../components/OrbitLoader/OrbitLoader";

 interface Props {
  part: ToolUIPart;
 }

+/* ------------------------------------------------------------------ */
+/*  Tool name helpers                                                  */
+/* ------------------------------------------------------------------ */
+
 function extractToolName(part: ToolUIPart): string {
-  // ToolUIPart.type is "tool-{name}", extract the name portion.
  return part.type.replace(/^tool-/, "");
 }

 function formatToolName(name: string): string {
-  // "search_docs" → "Search docs", "Read" → "Read"
  return name.replace(/_/g, " ").replace(/^\w/, (c) => c.toUpperCase());
 }

-function getAnimationText(part: ToolUIPart): string {
-  const label = formatToolName(extractToolName(part));
+/* ------------------------------------------------------------------ */
+/*  Tool categorization                                                */
+/* ------------------------------------------------------------------ */

-  switch (part.state) {
-    case "input-streaming":
-    case "input-available":
-      return `Running ${label}…`;
-    case "output-available":
-      return `${label} completed`;
-    case "output-error":
-      return `${label} failed`;
+type ToolCategory =
+  | "bash"
+  | "web"
+  | "file-read"
+  | "file-write"
+  | "file-delete"
+  | "file-list"
+  | "search"
+  | "edit"
+  | "todo"
+  | "other";
+
+function getToolCategory(toolName: string): ToolCategory {
+  switch (toolName) {
+    case "bash_exec":
+      return "bash";
+    case "web_fetch":
+    case "WebSearch":
+    case "WebFetch":
+      return "web";
+    case "read_workspace_file":
+    case "Read":
+      return "file-read";
+    case "write_workspace_file":
+    case "Write":
+      return "file-write";
+    case "delete_workspace_file":
+      return "file-delete";
+    case "list_workspace_files":
+    case "Glob":
+      return "file-list";
+    case "Grep":
+      return "search";
+    case "Edit":
+      return "edit";
+    case "TodoWrite":
+      return "todo";
    default:
-      return `Running ${label}…`;
+      return "other";
  }
 }

+/* ------------------------------------------------------------------ */
+/*  Tool icon                                                          */
+/* ------------------------------------------------------------------ */
+
+function ToolIcon({
+  category,
+  isStreaming,
+  isError,
+}: {
+  category: ToolCategory;
+  isStreaming: boolean;
+  isError: boolean;
+}) {
+  if (isError) {
+    return (
+      <WarningDiamondIcon size={14} weight="regular" className="text-red-500" />
+    );
+  }
+  if (isStreaming) {
+    return <OrbitLoader size={14} />;
+  }
+
+  const iconClass = "text-neutral-400";
+  switch (category) {
+    case "bash":
+      return <TerminalIcon size={14} weight="regular" className={iconClass} />;
+    case "web":
+      return <GlobeIcon size={14} weight="regular" className={iconClass} />;
+    case "file-read":
+      return <FileIcon size={14} weight="regular" className={iconClass} />;
+    case "file-write":
+      return <FileIcon size={14} weight="regular" className={iconClass} />;
+    case "file-delete":
+      return <TrashIcon size={14} weight="regular" className={iconClass} />;
+    case "file-list":
+      return <FilesIcon size={14} weight="regular" className={iconClass} />;
+    case "search":
+      return (
+        <MagnifyingGlassIcon size={14} weight="regular" className={iconClass} />
+      );
+    case "edit":
+      return (
+        <PencilSimpleIcon size={14} weight="regular" className={iconClass} />
+      );
+    case "todo":
+      return (
+        <ListChecksIcon size={14} weight="regular" className={iconClass} />
+      );
+    default:
+      return <GearIcon size={14} weight="regular" className={iconClass} />;
+  }
+}
+
+/* ------------------------------------------------------------------ */
+/*  Accordion icon (larger, for the accordion header)                  */
+/* ------------------------------------------------------------------ */
+
+function AccordionIcon({ category }: { category: ToolCategory }) {
+  switch (category) {
+    case "bash":
+      return <TerminalIcon size={32} weight="light" />;
+    case "web":
+      return <GlobeIcon size={32} weight="light" />;
+    case "file-read":
+    case "file-write":
+      return <FileIcon size={32} weight="light" />;
+    case "file-delete":
+      return <TrashIcon size={32} weight="light" />;
+    case "file-list":
+      return <FilesIcon size={32} weight="light" />;
+    case "search":
+      return <MagnifyingGlassIcon size={32} weight="light" />;
+    case "edit":
+      return <PencilSimpleIcon size={32} weight="light" />;
+    case "todo":
+      return <ListChecksIcon size={32} weight="light" />;
+    default:
+      return <GearIcon size={32} weight="light" />;
+  }
+}
+
+/* ------------------------------------------------------------------ */
+/*  Input extraction                                                   */
+/* ------------------------------------------------------------------ */
+
+function getInputSummary(toolName: string, input: unknown): string | null {
+  if (!input || typeof input !== "object") return null;
+  const inp = input as Record<string, unknown>;
+
+  switch (toolName) {
+    case "bash_exec":
+      return typeof inp.command === "string" ? inp.command : null;
+    case "web_fetch":
+    case "WebFetch":
+      return typeof inp.url === "string" ? inp.url : null;
+    case "WebSearch":
+      return typeof inp.query === "string" ? inp.query : null;
+    case "read_workspace_file":
+    case "Read":
+      return (
+        (typeof inp.file_path === "string" ? inp.file_path : null) ??
+        (typeof inp.path === "string" ? inp.path : null)
+      );
+    case "write_workspace_file":
+    case "Write":
+      return (
+        (typeof inp.file_path === "string" ? inp.file_path : null) ??
+        (typeof inp.path === "string" ? inp.path : null)
+      );
+    case "delete_workspace_file":
+      return typeof inp.file_path === "string" ? inp.file_path : null;
+    case "Glob":
+      return typeof inp.pattern === "string" ? inp.pattern : null;
+    case "Grep":
+      return typeof inp.pattern === "string" ? inp.pattern : null;
+    case "Edit":
+      return typeof inp.file_path === "string" ? inp.file_path : null;
+    case "TodoWrite": {
+      // Extract the in-progress task name for the status line
+      const todos = Array.isArray(inp.todos) ? inp.todos : [];
+      const active = todos.find(
+        (t: Record<string, unknown>) => t.status === "in_progress",
+      );
+      if (active && typeof active.activeForm === "string")
+        return active.activeForm;
+      if (active && typeof active.content === "string") return active.content;
+      return null;
+    }
+    default:
+      return null;
+  }
+}
+
+function truncate(text: string, maxLen: number): string {
+  if (text.length <= maxLen) return text;
+  return text.slice(0, maxLen).trimEnd() + "…";
+}
+
+/* ------------------------------------------------------------------ */
+/*  Animation text                                                     */
+/* ------------------------------------------------------------------ */
+
+function getAnimationText(part: ToolUIPart, category: ToolCategory): string {
+  const toolName = extractToolName(part);
+  const summary = getInputSummary(toolName, part.input);
+  const shortSummary = summary ? truncate(summary, 60) : null;
+
+  switch (part.state) {
+    case "input-streaming":
+    case "input-available": {
+      switch (category) {
+        case "bash":
+          return shortSummary ? `Running: ${shortSummary}` : "Running command…";
+        case "web":
+          if (toolName === "WebSearch") {
+            return shortSummary
+              ? `Searching "${shortSummary}"`
+              : "Searching the web…";
+          }
+          return shortSummary
+            ? `Fetching ${shortSummary}`
+            : "Fetching web content…";
+        case "file-read":
+          return shortSummary ? `Reading ${shortSummary}` : "Reading file…";
+        case "file-write":
+          return shortSummary ? `Writing ${shortSummary}` : "Writing file…";
+        case "file-delete":
+          return shortSummary ? `Deleting ${shortSummary}` : "Deleting file…";
+        case "file-list":
+          return shortSummary ? `Listing ${shortSummary}` : "Listing files…";
+        case "search":
+          return shortSummary
+            ? `Searching for "${shortSummary}"`
+            : "Searching…";
+        case "edit":
+          return shortSummary ? `Editing ${shortSummary}` : "Editing file…";
+        case "todo":
+          return shortSummary ? `${shortSummary}` : "Updating task list…";
+        default:
+          return `Running ${formatToolName(toolName)}…`;
+      }
+    }
+    case "output-available": {
+      switch (category) {
+        case "bash": {
+          const exitCode = getExitCode(part.output);
+          if (exitCode !== null && exitCode !== 0) {
+            return `Command exited with code ${exitCode}`;
+          }
+          return shortSummary ? `Ran: ${shortSummary}` : "Command completed";
+        }
+        case "web":
+          if (toolName === "WebSearch") {
+            return shortSummary
+              ? `Searched "${shortSummary}"`
+              : "Web search completed";
+          }
+          return shortSummary
+            ? `Fetched ${shortSummary}`
+            : "Fetched web content";
+        case "file-read":
+          return shortSummary ? `Read ${shortSummary}` : "File read completed";
+        case "file-write":
+          return shortSummary ? `Wrote ${shortSummary}` : "File written";
+        case "file-delete":
+          return shortSummary ? `Deleted ${shortSummary}` : "File deleted";
+        case "file-list":
+          return "Listed files";
+        case "search":
+          return shortSummary
+            ? `Searched for "${shortSummary}"`
+            : "Search completed";
+        case "edit":
+          return shortSummary ? `Edited ${shortSummary}` : "Edit completed";
+        case "todo":
+          return "Updated task list";
+        default:
+          return `${formatToolName(toolName)} completed`;
+      }
+    }
+    case "output-error": {
+      switch (category) {
+        case "bash":
+          return "Command failed";
+        case "web":
+          return toolName === "WebSearch" ? "Search failed" : "Fetch failed";
+        default:
+          return `${formatToolName(toolName)} failed`;
+      }
+    }
+    default:
+      return `Running ${formatToolName(toolName)}…`;
+  }
+}
+
+/* ------------------------------------------------------------------ */
+/*  Output parsing helpers                                             */
+/* ------------------------------------------------------------------ */
+
+function parseOutput(output: unknown): Record<string, unknown> | null {
+  if (!output) return null;
+  if (typeof output === "object") return output as Record<string, unknown>;
+  if (typeof output === "string") {
+    const trimmed = output.trim();
+    if (!trimmed) return null;
+    try {
+      const parsed = JSON.parse(trimmed);
+      if (
+        typeof parsed === "object" &&
+        parsed !== null &&
+        !Array.isArray(parsed)
+      )
+        return parsed;
+    } catch {
+      // Return as a message wrapper for plain text output
+      return { _raw: trimmed };
+    }
+  }
+  return null;
+}
+
+/**
+ * Extract text from MCP-style content blocks.
+ * SDK built-in tools (WebSearch, etc.) may return `{content: [{type:"text", text:"..."}]}`.
+ */
+function extractMcpText(output: Record<string, unknown>): string | null {
+  if (Array.isArray(output.content)) {
+    const texts = (output.content as Array<Record<string, unknown>>)
+      .filter((b) => b.type === "text" && typeof b.text === "string")
+      .map((b) => b.text as string);
+    if (texts.length > 0) return texts.join("\n");
+  }
+  return null;
+}
+
+function getExitCode(output: unknown): number | null {
+  const parsed = parseOutput(output);
+  if (!parsed) return null;
+  if (typeof parsed.exit_code === "number") return parsed.exit_code;
+  return null;
+}
+
+function getStringField(
+  obj: Record<string, unknown>,
+  ...keys: string[]
+): string | null {
+  for (const key of keys) {
+    if (typeof obj[key] === "string" && obj[key].length > 0)
+      return obj[key] as string;
+  }
+  return null;
+}
+
+/* ------------------------------------------------------------------ */
+/*  Accordion content per tool category                                */
+/* ------------------------------------------------------------------ */
+
+interface AccordionData {
+  title: string;
+  description?: string;
+  content: React.ReactNode;
+}
+
+function getBashAccordionData(
+  input: unknown,
+  output: Record<string, unknown>,
+): AccordionData {
+  const inp = (input && typeof input === "object" ? input : {}) as Record<
+    string,
+    unknown
+  >;
+  const command = typeof inp.command === "string" ? inp.command : "Command";
+
+  const stdout = getStringField(output, "stdout");
+  const stderr = getStringField(output, "stderr");
+  const exitCode =
+    typeof output.exit_code === "number" ? output.exit_code : null;
+  const timedOut = output.timed_out === true;
+  const message = getStringField(output, "message");
+
+  const title = timedOut
+    ? "Command timed out"
+    : exitCode !== null && exitCode !== 0
+      ? `Command failed (exit ${exitCode})`
+      : "Command output";
+
+  return {
+    title,
+    description: truncate(command, 80),
+    content: (
+      <div className="space-y-2">
+        {stdout && (
+          <div>
+            <p className="mb-1 text-xs font-medium text-slate-500">stdout</p>
+            <ContentCodeBlock>{truncate(stdout, 2000)}</ContentCodeBlock>
+          </div>
+        )}
+        {stderr && (
+          <div>
+            <p className="mb-1 text-xs font-medium text-slate-500">stderr</p>
+            <ContentCodeBlock>{truncate(stderr, 1000)}</ContentCodeBlock>
+          </div>
+        )}
+        {!stdout && !stderr && message && (
+          <ContentMessage>{message}</ContentMessage>
+        )}
+      </div>
+    ),
+  };
+}
+
+function getWebAccordionData(
+  input: unknown,
+  output: Record<string, unknown>,
+): AccordionData {
+  const inp = (input && typeof input === "object" ? input : {}) as Record<
+    string,
+    unknown
+  >;
+  const url =
+    getStringField(inp as Record<string, unknown>, "url", "query") ??
+    "Web content";
+
+  // Try direct string fields first, then MCP content blocks, then raw JSON
+  let content = getStringField(output, "content", "text", "_raw");
+  if (!content) content = extractMcpText(output);
+  if (!content) {
+    // Fallback: render the raw JSON so the accordion isn't empty
+    try {
+      const raw = JSON.stringify(output, null, 2);
+      if (raw !== "{}") content = raw;
+    } catch {
+      /* empty */
+    }
+  }
+
+  const statusCode =
+    typeof output.status_code === "number" ? output.status_code : null;
+  const message = getStringField(output, "message");
+
+  return {
+    title: statusCode
+      ? `Response (${statusCode})`
+      : url
+        ? "Web fetch"
+        : "Search results",
+    description: truncate(url, 80),
+    content: content ? (
+      <ContentCodeBlock>{truncate(content, 2000)}</ContentCodeBlock>
+    ) : message ? (
+      <ContentMessage>{message}</ContentMessage>
+    ) : Object.keys(output).length > 0 ? (
+      <ContentCodeBlock>
+        {truncate(JSON.stringify(output, null, 2), 2000)}
+      </ContentCodeBlock>
+    ) : null,
+  };
+}
+
+function getFileAccordionData(
+  input: unknown,
+  output: Record<string, unknown>,
+): AccordionData {
+  const inp = (input && typeof input === "object" ? input : {}) as Record<
+    string,
+    unknown
+  >;
+  const filePath =
+    getStringField(
+      inp as Record<string, unknown>,
+      "file_path",
+      "path",
+      "pattern",
+    ) ?? "File";
+  const content = getStringField(output, "content", "text", "_raw");
+  const message = getStringField(output, "message");
+  // For Glob/list results, try to show file list
+  const files = Array.isArray(output.files)
+    ? output.files.filter((f: unknown): f is string => typeof f === "string")
+    : null;
+
+  return {
+    title: message ?? "File output",
+    description: truncate(filePath, 80),
+    content: (
+      <div className="space-y-2">
+        {content && (
+          <ContentCodeBlock>{truncate(content, 2000)}</ContentCodeBlock>
+        )}
+        {files && files.length > 0 && (
+          <ContentCodeBlock>
+            {truncate(files.join("\n"), 2000)}
+          </ContentCodeBlock>
+        )}
+        {!content && !files && message && (
+          <ContentMessage>{message}</ContentMessage>
+        )}
+      </div>
+    ),
+  };
+}
+
+interface TodoItem {
+  content: string;
+  status: "pending" | "in_progress" | "completed";
+  activeForm?: string;
+}
+
+function getTodoAccordionData(input: unknown): AccordionData {
+  const inp = (input && typeof input === "object" ? input : {}) as Record<
+    string,
+    unknown
+  >;
+  const todos: TodoItem[] = Array.isArray(inp.todos)
+    ? inp.todos.filter(
+        (t: unknown): t is TodoItem =>
+          typeof t === "object" &&
+          t !== null &&
+          typeof (t as TodoItem).content === "string",
+      )
+    : [];
+
+  const completed = todos.filter((t) => t.status === "completed").length;
+  const total = todos.length;
+
+  return {
+    title: "Task list",
+    description: `${completed}/${total} completed`,
+    content: (
+      <div className="space-y-1 py-1">
+        {todos.map((todo, i) => (
+          <div key={i} className="flex items-start gap-2 text-xs">
+            <span className="mt-0.5 flex-shrink-0">
+              {todo.status === "completed" ? (
+                <CheckCircleIcon
+                  size={14}
+                  weight="fill"
+                  className="text-green-500"
+                />
+              ) : todo.status === "in_progress" ? (
+                <CircleDashedIcon
+                  size={14}
+                  weight="bold"
+                  className="text-blue-500"
+                />
+              ) : (
+                <CircleIcon
+                  size={14}
+                  weight="regular"
+                  className="text-neutral-400"
+                />
+              )}
+            </span>
+            <span
+              className={
+                todo.status === "completed"
+                  ? "text-muted-foreground line-through"
+                  : todo.status === "in_progress"
+                    ? "font-medium text-foreground"
+                    : "text-muted-foreground"
+              }
+            >
+              {todo.content}
+            </span>
+          </div>
+        ))}
+      </div>
+    ),
+  };
+}
+
+function getDefaultAccordionData(
+  output: Record<string, unknown>,
+): AccordionData {
+  const message = getStringField(output, "message");
+  const raw = output._raw;
+  const mcpText = extractMcpText(output);
+
+  let displayContent: string;
+  if (typeof raw === "string") {
+    displayContent = raw;
+  } else if (mcpText) {
+    displayContent = mcpText;
+  } else if (message) {
+    displayContent = message;
+  } else {
+    try {
+      displayContent = JSON.stringify(output, null, 2);
+    } catch {
+      displayContent = String(output);
+    }
+  }
+
+  return {
+    title: "Output",
+    description: message ?? undefined,
+    content: (
+      <ContentCodeBlock>{truncate(displayContent, 2000)}</ContentCodeBlock>
+    ),
+  };
+}
+
+function getAccordionData(
+  category: ToolCategory,
+  input: unknown,
+  output: Record<string, unknown>,
+): AccordionData {
+  switch (category) {
+    case "bash":
+      return getBashAccordionData(input, output);
+    case "web":
+      return getWebAccordionData(input, output);
+    case "file-read":
+    case "file-write":
+    case "file-delete":
+    case "file-list":
+    case "search":
+    case "edit":
+      return getFileAccordionData(input, output);
+    case "todo":
+      return getTodoAccordionData(input);
+    default:
+      return getDefaultAccordionData(output);
+  }
+}
+
+/* ------------------------------------------------------------------ */
+/*  Component                                                          */
+/* ------------------------------------------------------------------ */
+
 export function GenericTool({ part }: Props) {
+  const toolName = extractToolName(part);
+  const category = getToolCategory(toolName);
  const isStreaming =
    part.state === "input-streaming" || part.state === "input-available";
  const isError = part.state === "output-error";
+  const text = getAnimationText(part, category);
+
+  const output = parseOutput(part.output);
+  const hasOutput =
+    part.state === "output-available" &&
+    !!output &&
+    Object.keys(output).length > 0;
+  const hasError = isError && !!output;
+
+  // TodoWrite: always show accordion from input (the todo list lives in input)
+  const hasTodoInput =
+    category === "todo" &&
+    part.input &&
+    typeof part.input === "object" &&
+    Array.isArray((part.input as Record<string, unknown>).todos);
+
+  const showAccordion = hasOutput || hasError || hasTodoInput;
+  const accordionData = showAccordion
+    ? getAccordionData(category, part.input, output ?? {})
+    : null;

  return (
    <div className="py-2">
      <div className="flex items-center gap-2 text-sm text-muted-foreground">
-        <GearIcon
-          size={14}
-          weight="regular"
-          className={
-            isError
-              ? "text-red-500"
-              : isStreaming
-                ? "animate-spin text-neutral-500"
-                : "text-neutral-400"
-          }
+        <ToolIcon
+          category={category}
+          isStreaming={isStreaming}
+          isError={isError}
        />
        <MorphingTextAnimation
-          text={getAnimationText(part)}
+          text={text}
          className={isError ? "text-red-500" : undefined}
        />
      </div>
+
+      {showAccordion && accordionData ? (
+        <ToolAccordion
+          icon={<AccordionIcon category={category} />}
+          title={accordionData.title}
+          description={accordionData.description}
+          titleClassName={isError ? "text-red-500" : undefined}
+        >
+          {accordionData.content}
+        </ToolAccordion>
+      ) : null}
    </div>
  );
 }
--- a/autogpt_platform/frontend/src/app/(platform)/copilot/useChatSession.ts
+++ b/autogpt_platform/frontend/src/app/(platform)/copilot/useChatSession.ts
@@ -50,6 +50,14 @@ export function useChatSession() {
    );
  }, [sessionQuery.data, sessionId]);

+  // Expose active_stream info so the caller can trigger manual resume
+  // after hydration completes (rather than relying on AI SDK's built-in
+  // resume which fires before hydration).
+  const hasActiveStream = useMemo(() => {
+    if (sessionQuery.data?.status !== 200) return false;
+    return !!sessionQuery.data.data.active_stream;
+  }, [sessionQuery.data]);
+
  const { mutateAsync: createSessionMutation, isPending: isCreatingSession } =
    usePostV2CreateSession({
      mutation: {
@@ -102,6 +110,7 @@ export function useChatSession() {
    sessionId,
    setSessionId,
    hydratedMessages,
+    hasActiveStream,
    isLoadingSession: sessionQuery.isLoading,
    createSession,
    isCreatingSession,
--- a/autogpt_platform/frontend/src/app/(platform)/copilot/useCopilotPage.ts
+++ b/autogpt_platform/frontend/src/app/(platform)/copilot/useCopilotPage.ts
@@ -1,5 +1,6 @@
 import {
  getGetV2ListSessionsQueryKey,
+  postV2CancelSessionTask,
  useDeleteV2DeleteSession,
  useGetV2ListSessions,
 } from "@/app/api/__generated__/endpoints/chat/chat";
@@ -8,6 +9,7 @@ import { useBreakpoint } from "@/lib/hooks/useBreakpoint";
 import { useSupabase } from "@/lib/supabase/hooks/useSupabase";
 import { useChat } from "@ai-sdk/react";
 import { useQueryClient } from "@tanstack/react-query";
+import type { UIMessage } from "ai";
 import { DefaultChatTransport } from "ai";
 import { useCallback, useEffect, useMemo, useRef, useState } from "react";
 import { useChatSession } from "./useChatSession";
@@ -15,6 +17,24 @@ import { useLongRunningToolPolling } from "./hooks/useLongRunningToolPolling";

 const STREAM_START_TIMEOUT_MS = 12_000;

+/** Mark any in-progress tool parts as completed/errored so spinners stop. */
+function resolveInProgressTools(
+  messages: UIMessage[],
+  outcome: "completed" | "cancelled",
+): UIMessage[] {
+  return messages.map((msg) => ({
+    ...msg,
+    parts: msg.parts.map((part) =>
+      "state" in part &&
+      (part.state === "input-streaming" || part.state === "input-available")
+        ? outcome === "cancelled"
+          ? { ...part, state: "output-error" as const, errorText: "Cancelled" }
+          : { ...part, state: "output-available" as const, output: "" }
+        : part,
+    ),
+  }));
+}
+
 export function useCopilotPage() {
  const { isUserLoading, isLoggedIn } = useSupabase();
  const [isDrawerOpen, setIsDrawerOpen] = useState(false);
@@ -29,6 +49,7 @@ export function useCopilotPage() {
    sessionId,
    setSessionId,
    hydratedMessages,
+    hasActiveStream,
    isLoadingSession,
    createSession,
    isCreatingSession,
@@ -80,16 +101,63 @@ export function useCopilotPage() {
                },
              };
            },
+            // Resume: GET goes to the same URL as POST (backend uses
+            // method to distinguish).  Override the default formula which
+            // would append /{chatId}/stream to the existing path.
+            prepareReconnectToStreamRequest: () => ({
+              api: `/api/chat/sessions/${sessionId}/stream`,
+            }),
          })
        : null,
    [sessionId],
  );

-  const { messages, sendMessage, stop, status, error, setMessages } = useChat({
+  const {
+    messages,
+    sendMessage,
+    stop: sdkStop,
+    status,
+    error,
+    setMessages,
+    resumeStream,
+  } = useChat({
    id: sessionId ?? undefined,
    transport: transport ?? undefined,
+    // Don't use resume: true — it fires before hydration completes, causing
+    // the hydrated messages to overwrite the resumed stream.  Instead we
+    // call resumeStream() manually after hydration + active_stream detection.
  });

+  // Wrap AI SDK's stop() to also cancel the backend executor task.
+  // sdkStop() aborts the SSE fetch instantly (UI feedback), then we fire
+  // the cancel API to actually stop the executor and wait for confirmation.
+  async function stop() {
+    sdkStop();
+    setMessages((prev) => resolveInProgressTools(prev, "cancelled"));
+
+    if (!sessionId) return;
+    try {
+      const res = await postV2CancelSessionTask(sessionId);
+      if (
+        res.status === 200 &&
+        "reason" in res.data &&
+        res.data.reason === "cancel_published_not_confirmed"
+      ) {
+        toast({
+          title: "Stop may take a moment",
+          description:
+            "The cancel was sent but not yet confirmed. The task should stop shortly.",
+        });
+      }
+    } catch {
+      toast({
+        title: "Could not stop the task",
+        description: "The task may still be running in the background.",
+        variant: "destructive",
+      });
+    }
+  }
+
  // Abort the stream if the backend doesn't start sending data within 12s.
  const stopRef = useRef(stop);
  stopRef.current = stop;
@@ -108,13 +176,43 @@ export function useCopilotPage() {
    return () => clearTimeout(timer);
  }, [status]);

+  // Hydrate messages from the REST session endpoint.
+  // Skip hydration while streaming to avoid overwriting the live stream.
  useEffect(() => {
    if (!hydratedMessages || hydratedMessages.length === 0) return;
+    if (status === "streaming" || status === "submitted") return;
    setMessages((prev) => {
      if (prev.length >= hydratedMessages.length) return prev;
      return hydratedMessages;
    });
-  }, [hydratedMessages, setMessages]);
+  }, [hydratedMessages, setMessages, status]);
+
+  // Resume an active stream AFTER hydration completes.
+  // The backend returns active_stream info when a task is still running.
+  // We wait for hydration so the AI SDK has the conversation history
+  // before the resumed stream appends the in-progress assistant message.
+  const hasResumedRef = useRef<string | null>(null);
+  useEffect(() => {
+    if (!hasActiveStream || !sessionId) return;
+    if (!hydratedMessages || hydratedMessages.length === 0) return;
+    if (status === "streaming" || status === "submitted") return;
+    // Only resume once per session to avoid re-triggering after stream ends
+    if (hasResumedRef.current === sessionId) return;
+    hasResumedRef.current = sessionId;
+    resumeStream();
+  }, [hasActiveStream, sessionId, hydratedMessages, status, resumeStream]);
+
+  // When the stream finishes, resolve any tool parts still showing spinners.
+  // This can happen if the backend didn't emit StreamToolOutputAvailable for
+  // a tool call before sending StreamFinish (e.g. SDK built-in tools).
+  const prevStatusRef = useRef(status);
+  useEffect(() => {
+    const prev = prevStatusRef.current;
+    prevStatusRef.current = status;
+    if (prev === "streaming" && status === "ready") {
+      setMessages((msgs) => resolveInProgressTools(msgs, "completed"));
+    }
+  }, [status, setMessages]);

  // Poll session endpoint when a long-running tool (create_agent, edit_agent)
  // is in progress. When the backend completes, the session data will contain
@@ -197,12 +295,18 @@ export function useCopilotPage() {
    }
  }, [isDeleting]);

+  // True while we know the backend has an active stream but haven't
+  // reconnected yet.  Used to disable the send button and show stop UI.
+  const isReconnecting =
+    hasActiveStream && status !== "streaming" && status !== "submitted";
+
  return {
    sessionId,
    messages,
    status,
    error,
    stop,
+    isReconnecting,
    isLoadingSession,
    isCreatingSession,
    isUserLoading,
--- a/autogpt_platform/frontend/src/app/api/openapi.json
+++ b/autogpt_platform/frontend/src/app/api/openapi.json
@@ -1052,6 +1052,7 @@
                    {
                      "$ref": "#/components/schemas/ClarificationNeededResponse"
                    },
+                    { "$ref": "#/components/schemas/SuggestedGoalResponse" },
                    { "$ref": "#/components/schemas/BlockListResponse" },
                    { "$ref": "#/components/schemas/BlockDetailsResponse" },
                    { "$ref": "#/components/schemas/BlockOutputResponse" },
@@ -1262,6 +1263,44 @@
        }
      }
    },
+    "/api/chat/sessions/{session_id}/cancel": {
+      "post": {
+        "tags": ["v2", "chat", "chat"],
+        "summary": "Cancel Session Task",
+        "description": "Cancel the active streaming task for a session.\n\nPublishes a cancel event to the executor via RabbitMQ FANOUT, then\npolls Redis until the task status flips from ``running`` or a timeout\n(5 s) is reached.  Returns only after the cancellation is confirmed.",
+        "operationId": "postV2CancelSessionTask",
+        "security": [{ "HTTPBearerJWT": [] }],
+        "parameters": [
+          {
+            "name": "session_id",
+            "in": "path",
+            "required": true,
+            "schema": { "type": "string", "title": "Session Id" }
+          }
+        ],
+        "responses": {
+          "200": {
+            "description": "Successful Response",
+            "content": {
+              "application/json": {
+                "schema": { "$ref": "#/components/schemas/CancelTaskResponse" }
+              }
+            }
+          },
+          "401": {
+            "$ref": "#/components/responses/HTTP401NotAuthenticatedError"
+          },
+          "422": {
+            "description": "Validation Error",
+            "content": {
+              "application/json": {
+                "schema": { "$ref": "#/components/schemas/HTTPValidationError" }
+              }
+            }
+          }
+        }
+      }
+    },
    "/api/chat/sessions/{session_id}/stream": {
      "get": {
        "tags": ["v2", "chat", "chat"],
@@ -7536,6 +7575,23 @@
        "required": ["file"],
        "title": "Body_postV2Upload submission media"
      },
+      "CancelTaskResponse": {
+        "properties": {
+          "cancelled": { "type": "boolean", "title": "Cancelled" },
+          "task_id": {
+            "anyOf": [{ "type": "string" }, { "type": "null" }],
+            "title": "Task Id"
+          },
+          "reason": {
+            "anyOf": [{ "type": "string" }, { "type": "null" }],
+            "title": "Reason"
+          }
+        },
+        "type": "object",
+        "required": ["cancelled"],
+        "title": "CancelTaskResponse",
+        "description": "Response model for the cancel task endpoint."
+      },
      "ChangelogEntry": {
        "properties": {
          "version": { "type": "string", "title": "Version" },
@@ -10796,7 +10852,8 @@
          "bash_exec",
          "operation_status",
          "feature_request_search",
-          "feature_request_created"
+          "feature_request_created",
+          "suggested_goal"
        ],
        "title": "ResponseType",
        "description": "Types of tool responses."
@@ -11677,6 +11734,47 @@
        "enum": ["DRAFT", "PENDING", "APPROVED", "REJECTED"],
        "title": "SubmissionStatus"
      },
+      "SuggestedGoalResponse": {
+        "properties": {
+          "type": {
+            "$ref": "#/components/schemas/ResponseType",
+            "default": "suggested_goal"
+          },
+          "message": { "type": "string", "title": "Message" },
+          "session_id": {
+            "anyOf": [{ "type": "string" }, { "type": "null" }],
+            "title": "Session Id"
+          },
+          "suggested_goal": {
+            "type": "string",
+            "title": "Suggested Goal",
+            "description": "The suggested alternative goal"
+          },
+          "reason": {
+            "type": "string",
+            "title": "Reason",
+            "description": "Why the original goal needs refinement",
+            "default": ""
+          },
+          "original_goal": {
+            "type": "string",
+            "title": "Original Goal",
+            "description": "The user's original goal for context",
+            "default": ""
+          },
+          "goal_type": {
+            "type": "string",
+            "enum": ["vague", "unachievable"],
+            "title": "Goal Type",
+            "description": "Type: 'vague' or 'unachievable'",
+            "default": "vague"
+          }
+        },
+        "type": "object",
+        "required": ["message", "suggested_goal"],
+        "title": "SuggestedGoalResponse",
+        "description": "Response when the goal needs refinement with a suggested alternative."
+      },
      "SuggestionsResponse": {
        "properties": {
          "otto_suggestions": {
--- a/autogpt_platform/frontend/src/tests/marketplace-creator.spec.ts
+++ b/autogpt_platform/frontend/src/tests/marketplace-creator.spec.ts
@@ -69,12 +69,11 @@ test.describe("Marketplace Creator Page – Basic Functionality", () => {
      await marketplacePage.getFirstCreatorProfile(page);
    await firstCreatorProfile.click();
    await page.waitForURL("**/marketplace/creator/**");
-    await page.waitForLoadState("networkidle").catch(() => {});

    const firstAgent = page
      .locator('[data-testid="store-card"]:visible')
      .first();
-    await firstAgent.waitFor({ state: "visible", timeout: 30000 });
+    await firstAgent.waitFor({ state: "visible", timeout: 15000 });

    await firstAgent.click();
    await page.waitForURL("**/marketplace/agent/**");
--- a/autogpt_platform/frontend/src/tests/marketplace.spec.ts
+++ b/autogpt_platform/frontend/src/tests/marketplace.spec.ts
@@ -115,18 +115,11 @@ test.describe("Marketplace – Basic Functionality", () => {
    const searchTerm = page.getByText("DummyInput").first();
    await isVisible(searchTerm);

-    await page.waitForLoadState("networkidle").catch(() => {});
-
-    await page
-      .waitForFunction(
-        () =>
-          document.querySelectorAll('[data-testid="store-card"]').length > 0,
-        { timeout: 15000 },
-      )
-      .catch(() => console.log("No search results appeared within timeout"));
-
-    const results = await marketplacePage.getSearchResultsCount(page);
-    expect(results).toBeGreaterThan(0);
+    await expect
+      .poll(() => marketplacePage.getSearchResultsCount(page), {
+        timeout: 15000,
+      })
+      .toBeGreaterThan(0);

    console.log("Complete search flow works correctly test passed ✅");
  });
@@ -135,7 +128,9 @@ test.describe("Marketplace – Basic Functionality", () => {
 });

 test.describe("Marketplace – Edge Cases", () => {
-  test("Search for non-existent item shows no results", async ({ page }) => {
+  test("Search for non-existent item renders search page correctly", async ({
+    page,
+  }) => {
    const marketplacePage = new MarketplacePage(page);
    await marketplacePage.goto(page);

@@ -151,9 +146,23 @@ test.describe("Marketplace – Edge Cases", () => {
    const searchTerm = page.getByText("xyznonexistentitemxyz123");
    await isVisible(searchTerm);

-    const results = await marketplacePage.getSearchResultsCount(page);
-    expect(results).toBe(0);
+    // The search page should render either results or a "No results found" message
+    await expect
+      .poll(
+        async () => {
+          const hasResults =
+            (await page.locator('[data-testid="store-card"]').count()) > 0;
+          const hasNoResultsMsg = await page
+            .getByText("No results found")
+            .isVisible();
+          return hasResults || hasNoResultsMsg;
+        },
+        { timeout: 15000 },
+      )
+      .toBe(true);

-    console.log("Search for non-existent item shows no results test passed ✅");
+    console.log(
+      "Search for non-existent item renders search page correctly test passed ✅",
+    );
  });
 });
--- a/autogpt_platform/frontend/src/tests/pages/build.page.ts
+++ b/autogpt_platform/frontend/src/tests/pages/build.page.ts
@@ -125,16 +125,8 @@ export class BuildPage extends BasePage {
      `[data-id="block-card-${blockCardId}"]`,
    );

-    try {
-      // Wait for the block card to be visible with a reasonable timeout
-      await blockCard.waitFor({ state: "visible", timeout: 10000 });
-      await blockCard.click();
-    } catch (error) {
-      console.log(
-        `Block ${block.name} (display: ${displayName}) returned from the API but not found in block list`,
-      );
-      console.log(`Error: ${error}`);
-    }
+    await blockCard.waitFor({ state: "visible", timeout: 10000 });
+    await blockCard.click();
  }

  async hasBlock(_block: Block) {
--- a/autogpt_platform/frontend/src/tests/pages/login.page.ts
+++ b/autogpt_platform/frontend/src/tests/pages/login.page.ts
@@ -65,7 +65,7 @@ export class LoginPage {
    await this.page.waitForLoadState("load", { timeout: 10_000 });

    console.log("➡️ Navigating to /marketplace ...");
-    await this.page.goto("/marketplace", { timeout: 10_000 });
+    await this.page.goto("/marketplace", { timeout: 20_000 });
    console.log("✅ Login process complete");

    // If Wallet popover auto-opens, close it to avoid blocking account menu interactions
--- a/autogpt_platform/frontend/src/tests/pages/marketplace.page.ts
+++ b/autogpt_platform/frontend/src/tests/pages/marketplace.page.ts
@@ -9,7 +9,12 @@ export class MarketplacePage extends BasePage {

  async goto(page: Page) {
    await page.goto("/marketplace");
-    await page.waitForLoadState("networkidle").catch(() => {});
+    await page
+      .locator(
+        '[data-testid="store-card"], [data-testid="featured-store-card"]',
+      )
+      .first()
+      .waitFor({ state: "visible", timeout: 20000 });
  }

  async getMarketplaceTitle(page: Page) {
@@ -111,7 +116,7 @@ export class MarketplacePage extends BasePage {
  async getFirstFeaturedAgent(page: Page) {
    const { getId } = getSelectors(page);
    const card = getId("featured-store-card").first();
-    await card.waitFor({ state: "visible", timeout: 30000 });
+    await card.waitFor({ state: "visible", timeout: 15000 });
    return card;
  }

@@ -119,14 +124,14 @@ export class MarketplacePage extends BasePage {
    const card = this.page
      .locator('[data-testid="store-card"]:visible')
      .first();
-    await card.waitFor({ state: "visible", timeout: 30000 });
+    await card.waitFor({ state: "visible", timeout: 15000 });
    return card;
  }

  async getFirstCreatorProfile(page: Page) {
    const { getId } = getSelectors(page);
    const card = getId("creator-card").first();
-    await card.waitFor({ state: "visible", timeout: 30000 });
+    await card.waitFor({ state: "visible", timeout: 15000 });
    return card;
  }

--- a/autogpt_platform/frontend/src/tests/utils/assertion.ts
+++ b/autogpt_platform/frontend/src/tests/utils/assertion.ts
@@ -45,8 +45,9 @@ export async function isEnabled(el: Locator) {
 }

 export async function hasMinCount(el: Locator, minCount: number) {
-  const count = await el.count();
-  expect(count).toBeGreaterThanOrEqual(minCount);
+  await expect
+    .poll(async () => await el.count(), { timeout: 10000 })
+    .toBeGreaterThanOrEqual(minCount);
 }

 export async function matchesUrl(page: Page, pattern: RegExp) {
Author	SHA1	Message	Date
Zamil Majdy	11e6fca8c3	fix(copilot): resolve dangling tool spinners when stream finishes When the backend doesn't emit StreamToolOutputAvailable for all tool calls before StreamFinish (e.g. SDK built-in tools like WebSearch), the frontend spinners would spin forever. Add a useEffect that watches for the streaming→ready transition and marks any remaining input-available/input-streaming tool parts as output-available. Extract shared resolveInProgressTools helper used by both the stop handler (cancelled) and stream-end (completed).	2026-02-20 03:48:20 +07:00
Zamil Majdy	6e737e0b74	style: fix Black formatting on cancel endpoint	2026-02-20 02:46:03 +07:00
Zamil Majdy	5ce002803d	fix(copilot): toast when cancel confirmation times out Check the reason field in the cancel response — if "cancel_published_not_confirmed", show a non-destructive toast so the user knows the stop was sent but not yet confirmed by the executor.	2026-02-20 02:31:18 +07:00
Zamil Majdy	f8ad8484ee	refactor(copilot): convert stop to plain function declaration Remove useCallback wrapper per project guidelines — stopRef.current captures the latest closure on every render regardless.	2026-02-20 02:25:43 +07:00
Zamil Majdy	b6064d0155	fix(copilot): address round-2 PR review and fix tool loading on stop Backend: - Add _validate_and_get_session() call to cancel endpoint (404 for invalid sessions, consistent with other endpoints) - Reduce polling max_wait from 10s to 5s (stay below reverse-proxy read timeouts) - Return cancelled=True with reason="cancel_published_not_confirmed" on timeout (cancel event IS published, just not yet confirmed) Frontend: - Mark in-progress tool parts as output-error on stop so spinners clear immediately instead of spinning forever - Toast on cancel API failure (network error / 5xx)	2026-02-20 02:21:28 +07:00
Zamil Majdy	76e0c96aa9	feat: fix openapi.json	2026-02-20 02:14:07 +07:00
Zamil Majdy	3364a8e415	refactor(copilot): use generated client for cancel API call Replace raw fetch() with generated postV2CancelSessionTask() and remove the now-unnecessary dedicated cancel proxy route — the general /api/proxy handles auth and forwarding. Toast on cancel failure so the user knows the backend may still be running.	2026-02-20 02:10:10 +07:00
Zamil Majdy	9f4f2749a4	fix(copilot): address PR review comments for cancel endpoint - Add CancelTaskResponse Pydantic model with typed return annotation - Handle non-JSON backend responses in cancel proxy route - Check for "no-token-found" token before forwarding auth header - Truncate IDs in log messages for consistency - Add cancel endpoint to openapi.json for frontend codegen	2026-02-20 02:02:14 +07:00
Zamil Majdy	2b0f457985	feat(copilot): wire up stop button to cancel executor tasks The stop button was completely disconnected — clicking it only aborted the client-side SSE fetch while the executor kept running indefinitely. - Add `enqueue_cancel_task()` to publish `CancelCoPilotEvent` to the existing RabbitMQ FANOUT exchange that the executor already consumes - Add `POST /sessions/{session_id}/cancel` endpoint that finds the active task, publishes the cancel event, and polls Redis until the task status confirms stopped (up to 10s) - Add Next.js API proxy route for the cancel endpoint - Wrap the AI SDK's `stop()` to also call the cancel API so the executor actually terminates	2026-02-20 01:20:19 +07:00
Otto	0b151f64e8	feat(copilot): Execute parallel tool calls concurrently (#12165 ) When the LLM returns multiple tool calls in a single response (e.g. multiple web fetches for a research task), they now execute concurrently instead of sequentially. This can dramatically reduce latency for multi-tool turns. Before: Tool calls execute one after another — 7 web fetches × 2s each = 14s total After: All tool calls fire concurrently — 7 web fetches = ~2s total ### Changes - `service.py`: New `_execute_tool_calls_parallel()` function that spawns tool calls as concurrent `asyncio` tasks, collecting stream events via `asyncio.Queue` - `service.py`: `_yield_tool_call()` now accepts an optional `session_lock` parameter for concurrent-safe session mutations - `base.py`: Session lock exposed via `contextvars` so tools that need it can access it without interface changes - `run_agent.py`: Rate-limit counters (`successful_agent_runs`, `successful_agent_schedules`) protected with the session lock to prevent race conditions ### Concurrency Safety \| Shared State \| Risk \| Mitigation \| \|---\|---\|---\| \| `session.messages` (long-running tools only) \| Race on append + upsert \| `session_lock` wraps mutations \| \| `session.successful_agent_runs` counter \| Bypass max-runs check \| `session_lock` wraps read-check-increment \| \| Tool-internal state (DB queries, API calls) \| None — stateless \| No mitigation needed \| ### Testing - Added `parallel_tool_calls_test.py` with tests for: - Parallel timing verification (sum vs max of delays) - Single tool call regression - Retryable error propagation - Shared session lock verification - Cancellation cleanup Closes SECRT-2016 --------- Co-authored-by: Zamil Majdy <zamil.majdy@agpt.co>	2026-02-19 17:53:36 +00:00
Zamil Majdy	be2a48aedb	feat(platform/copilot): add SuggestedGoalResponse for vague/unachievable goals (#12139 ) ## Summary - Add `SUGGESTED_GOAL` response type and `SuggestedGoalResponse` model to backend; vague/unachievable goals now return a structured suggestion instead of a generic error - Add `SuggestedGoalCard` frontend component (amber styling, "Use this goal" button) that lets users accept and re-submit a refined goal in one click - Add error recovery buttons ("Try again", "Simplify goal") to the error output block - Update copilot system prompt with explicit guidance for handling `suggested_goal` and `clarifying_questions` feedback loops - Add `create_agent_test.py` covering all four decomposition result types ## Test plan - [ ] Trigger vague goal (e.g. "monitor social media") → `SuggestedGoalCard` renders with amber styling - [ ] Trigger unachievable goal (e.g. "read my mind") → card shows goal type "Goal cannot be accomplished" with reason - [ ] Click "Use this goal" → sends message and triggers new `create_agent` call with the suggested goal - [ ] Trigger an error → "Try again" and "Simplify goal" buttons appear below the error - [ ] Clarifying questions answered → LLM re-calls `create_agent` with context (system prompt guidance) - [ ] Backend tests pass: `poetry run pytest backend/api/features/chat/tools/create_agent_test.py -xvs` (requires Docker services) <!-- greptile_comment --> <details><summary><h3>Greptile Summary</h3></summary> Replaced generic `ErrorResponse` with structured `SuggestedGoalResponse` for vague/unachievable goals in the copilot agent creation flow. Added frontend `SuggestedGoalCard` component with amber styling and "Use this goal" button for one-click goal refinement. Enhanced system prompt with explicit feedback loop handling for `suggested_goal` and `clarifying_questions`. Added comprehensive test coverage for all four decomposition result types. Key improvements: - Better UX: Users can now accept refined goals with one click instead of manually retyping - Clearer error recovery: Added "Try again" and "Simplify goal" buttons to error blocks - Structured data: Backend now returns `suggested_goal`, `reason`, `original_goal`, and `goal_type` fields instead of embedding everything in error messages Issue found: - The `reason` field from the backend is not being passed to or displayed by the `SuggestedGoalCard` component, so users won't see the explanation for why their goal was rejected (especially important for unachievable goals where it explains what blocks are missing) </details> <details><summary><h3>Confidence Score: 4/5</h3></summary> - Safe to merge after fixing the missing `reason` field in the frontend component - Implementation is well-structured with good test coverage and follows established patterns. The issue with the missing `reason` field is straightforward to fix but important for UX - users won't understand why their goal was rejected without it. All other changes are solid: backend properly returns structured data, tests cover all cases, and the component integration follows the project's conventions. - autogpt_platform/frontend/src/app/(platform)/copilot/tools/CreateAgent/CreateAgent.tsx and SuggestedGoalCard.tsx need the `reason` prop added </details> <details><summary><h3>Flowchart</h3></summary> ```mermaid flowchart TD Start[User submits goal to create_agent] --> Decompose[decompose_goal analyzes request] Decompose --> CheckType{Decomposition result type?} CheckType -->\|clarifying_questions\| Questions[Return ClarificationNeededResponse] Questions --> UserAnswers[User answers questions] UserAnswers --> Retry[Retry with context] Retry --> Decompose CheckType -->\|vague_goal\| VagueResponse[Return SuggestedGoalResponse<br/>goal_type: vague] VagueResponse --> ShowSuggestion[Frontend: SuggestedGoalCard<br/>amber styling] ShowSuggestion --> UserAccepts{User clicks<br/>Use this goal?} UserAccepts -->\|Yes\| NewGoal[Send suggested goal] NewGoal --> Decompose UserAccepts -->\|No\| End1[User refines manually] CheckType -->\|unachievable_goal\| UnachievableResponse[Return SuggestedGoalResponse<br/>goal_type: unachievable<br/>reason: missing blocks] UnachievableResponse --> ShowSuggestion CheckType -->\|success\| Generate[generate_agent creates workflow] Generate --> SaveOrPreview{save parameter?} SaveOrPreview -->\|true\| Save[Save to library<br/>AgentSavedResponse] SaveOrPreview -->\|false\| Preview[AgentPreviewResponse] CheckType -->\|error\| ErrorFlow[Return ErrorResponse] ErrorFlow --> ShowError[Frontend: Show error with<br/>Try again & Simplify goal buttons] ShowError --> UserRetry{User action?} UserRetry -->\|Try again\| Decompose UserRetry -->\|Simplify goal\| GetHelp[Ask LLM to simplify] GetHelp --> Decompose Save --> End2[Done] Preview --> End2 End1 --> End2 ``` </details> <sub>Last reviewed commit: 2f37aee</sub> <!-- greptile_other_comments_section --> <!-- /greptile_comment -->	2026-02-19 16:11:41 +00:00
Ubbe	aeca4dbb79	docs(frontend): add mandatory pre-completion checks to CLAUDE.md (#12161 ) ### Changes 🏗️ Adds a Pre-completion Checks (MANDATORY) section to `frontend/CLAUDE.md` that instructs Claude Code agents to always run the following commands in order before reporting frontend work as done: 1. `pnpm format` — auto-fix formatting issues 2. `pnpm lint` — check for lint errors and fix them 3. `pnpm types` — check for type errors and fix them This ensures code quality gates are enforced consistently by AI agents working on the frontend. ### Checklist 📋 #### For code changes: - [x] I have clearly listed my changes in the PR description - [x] I have made a test plan - [x] I have tested my changes according to the test plan: - [x] Verified `pnpm format` passes cleanly - [x] Verified `pnpm lint` passes cleanly - [x] Verified `pnpm types` passes cleanly 🤖 Generated with [Claude Code](https://claude.com/claude-code) --------- Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>	2026-02-19 23:07:55 +08:00
Ubbe	7b85eeaae2	refactor(frontend): fix flaky e2e tests (#12156 ) ### Changes 🏗️ Some fixes to make running e2e more predictable... ### Checklist 📋 #### For code changes: - [x] I have clearly listed my changes in the PR description - [x] I have made a test plan - [x] I have tested my changes according to the test plan: - [x] e2e are imdempotent --------- Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>	2026-02-19 21:38:50 +07:00
Ubbe	4db3be2d61	fix(frontend): switch minigame to snake (#12160 ) ## Changes 🏗️ <img width="600" height="416" alt="Screenshot 2026-02-19 at 18 05 39" src="https://github.com/user-attachments/assets/930116ad-b611-4398-bee7-4e33ca4dc688" /> Make the mini game a snake 🐍 game, so we don't use assets (_possible license issues_ ), and it's simpler... ## Checklist 📋 ### For code changes: - [x] I have clearly listed my changes in the PR description - [x] I have made a test plan - [x] I have tested my changes according to the test plan: - [x] Run the app and test	2026-02-19 19:28:59 +07:00
Ubbe	f57a1995d0	fix(frontend): make chat spinner centred when loading (#12154 ) ## Changes 🏗️ <img width="800" height="969" alt="Screenshot 2026-02-18 at 20 30 36" src="https://github.com/user-attachments/assets/30d7d211-98c1-4159-94e1-86e81e29ad43" /> - Make the spinner centred when the chat is loading ## Checklist 📋 #### For code changes: - [x] I have clearly listed my changes in the PR description - [x] I have made a test plan - [x] I have tested my changes according to the test plan: - [x] Run the app and test locally	2026-02-19 17:31:07 +07:00
Zamil Majdy	3928c35928	feat(copilot): SDK tool output, transcript resume, stream reconnection, GenericTool UI (#12159 ) ## Summary ### SDK built-in tool output forwarding - WebSearch, Read, TodoWrite outputs now render in the frontend — PostToolUse hook stashes outputs before SDK truncation, response adapter flushes unresolved tool calls via `_flush_unresolved_tool_calls` + `parent_tool_use_id` handling - Multi-call stash upgraded to `dict[str, list[str]]` to support multiple calls to the same built-in tool in one turn ### Transcript-based `--resume` with staleness detection - Simplified to single upload block after `async with` (Stop hook + `appendFileSync` guarantees), extracted `_try_upload_transcript` helper - NEW: `message_count` watermark + timestamp metadata stored alongside transcript — on the next turn, detects staleness and compresses only the missed messages instead of the full history (hybrid: transcript via `--resume` + compressed gap) - Removed redundant dual-strategy code and dead `find_cli_transcript`/`read_fallback_transcript` functions ### Frontend stream reconnection - NEW: Enabled `resume: true` on `useChat` with `prepareReconnectToStreamRequest` — page refresh reconnects to active backend streams via Redis replay (backend `resume_session_stream` endpoint was already wired up) ### GenericTool.tsx UI overhaul - Tool-specific icons (terminal, globe, file, search, edit, checklist) with category-based display - TodoWrite checklist rendering with status indicators (completed/in-progress/pending) - WebSearch/MCP content display via `extractMcpText` for MCP-style content blocks + raw JSON fallback - Defensive TodoItem filter per coderabbit review - Proper accordion content per tool category (bash, web, file, search, edit, todo) ### Image support - MCP tool results now include `{"type": "image"}` content blocks when workspace file responses contain `content_base64` with image MIME types ### Security & cleanup - `AskUserQuestion` added to `SDK_DISALLOWED_TOOLS` (interactive CLI tool, no terminal in copilot) - 36 per-operation `[TIMING]`/`[TASK_LOOKUP]` diagnostic logs downgraded info→debug - Silent exception fixes: warning logs for swallowed errors in stream_registry + service ## Test plan - [ ] Verify copilot multi-turn conversations use `--resume` (check logs for `Using --resume`) - [ ] Verify stale transcript detection fills gap (check logs for `Transcript stale: covers N of M messages`) - [ ] Verify page refresh reconnects to active stream (check network tab for GET to `/stream` returning SSE) - [ ] Verify WebSearch, Read, TodoWrite tool outputs render in frontend accordion - [ ] Verify GenericTool icons and accordion content display correctly for each tool type - [ ] Verify production log volume is reduced (no more `[TIMING]` at info level) --------- Co-authored-by: Otto (AGPT) <otto@agpt.co>	2026-02-19 08:48:12 +00:00