From 187f0a529dcb8eb9883d570d94922c28f84d90dc Mon Sep 17 00:00:00 2001
From: majdyz <zamil.majdy@agpt.co>
Date: Wed, 22 Apr 2026 12:56:47 +0700
Subject: [PATCH] fix(backend/copilot): address iteration-2 review on #12879
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add ``TodoWriteResponse`` and ``TaskResponse`` to ``ToolResponseUnion``
  in ``chat/routes.py`` so the ``/api/chat/schema/tool-responses``
  codegen endpoint surfaces both schemas to the frontend.  Regenerated
  openapi.json + orval client accordingly.
- Restructure the Task sub-agent error handling as nested
  ``try/except Exception`` inside a ``try/finally``.  ``CancelledError``
  / ``KeyboardInterrupt`` / ``SystemExit`` now propagate naturally
  through the outer finally — which both resets the depth ContextVar
  and rolls up inner usage — so the ``except BaseException`` +
  ``# noqa: BLE001`` suppressor pair is no longer needed.  The
  existing regression tests (cancellation + exception) still cover it.
- Prompt guidance (``SHARED_TOOL_NOTES``) now explicitly states that
  sub-agents inherit the parent tool set "except ``Task`` itself"; the
  "plan at most one level deep" bit saves the parent model from
  attempting nested delegation the baseline will refuse.
- Replace the stale ``(budget of 32000 chars ≈ 8000 tokens)`` note in
  ``test_total_schema_char_budget`` with a reference to ``_CHAR_BUDGET``
  above so future budget bumps don't desync the docstring again.
---
 .../backend/api/features/chat/routes.py       |  4 +
 .../backend/copilot/baseline/service.py       | 73 +++++++-------
 .../backend/backend/copilot/prompting.py      | 14 +--
 .../backend/copilot/tools/tool_schema_test.py |  7 +-
 .../frontend/src/app/api/openapi.json         | 99 ++++++++++++++++++-
 5 files changed, 149 insertions(+), 48 deletions(-)

diff --git a/autogpt_platform/backend/backend/api/features/chat/routes.py b/autogpt_platform/backend/backend/api/features/chat/routes.py
index ca7e4355f6..39cd111b31 100644
--- a/autogpt_platform/backend/backend/api/features/chat/routes.py
+++ b/autogpt_platform/backend/backend/api/features/chat/routes.py
@@ -75,6 +75,8 @@ from backend.copilot.tools.models import (
     NoResultsResponse,
     SetupRequirementsResponse,
     SuggestedGoalResponse,
+    TaskResponse,
+    TodoWriteResponse,
     UnderstandingUpdatedResponse,
 )
 from backend.copilot.tracking import track_user_message
@@ -1419,6 +1421,8 @@ ToolResponseUnion = (
     | MemorySearchResponse
     | MemoryForgetCandidatesResponse
     | MemoryForgetConfirmResponse
+    | TodoWriteResponse
+    | TaskResponse
 )
 
 
diff --git a/autogpt_platform/backend/backend/copilot/baseline/service.py b/autogpt_platform/backend/backend/copilot/baseline/service.py
index 28e78c1c5f..6f82af4dfe 100644
--- a/autogpt_platform/backend/backend/copilot/baseline/service.py
+++ b/autogpt_platform/backend/backend/copilot/baseline/service.py
@@ -987,50 +987,47 @@ async def _run_task_subagent(
     )
     inner_llm = partial(_baseline_llm_caller, state=inner_state)
 
-    task_exc: BaseException | None = None
+    task_exc: Exception | None = None
     token = _TASK_DEPTH_VAR.set(depth + 1)
     try:
-        async for loop_result in tool_call_loop(
-            messages=inner_messages,
-            tools=inner_tools,
-            llm_call=inner_llm,
-            execute_tool=inner_executor,
-            update_conversation=_inner_task_conversation_updater,
-            max_iterations=_MAX_TASK_ITERATIONS,
-            last_iteration_message=(
-                "This is the final iteration — produce your summary now."
-            ),
-        ):
-            # Discard inner streaming events so only the Task envelope and
-            # its final summary reach the parent client. Token accounting
-            # still happens via ``inner_state`` and rolls up after the
-            # loop exits.
-            inner_state.pending_events.clear()
-            for tc in loop_result.last_tool_calls:
-                tool_names_seen.append(tc.name)
-            iterations = loop_result.iterations
-            finished_naturally = loop_result.finished_naturally
-            if loop_result.finished_naturally:
-                final_response_text = loop_result.response_text or ""
-    except BaseException as exc:  # noqa: BLE001
-        # BaseException (not Exception) so CancelledError / KeyboardInterrupt
-        # / SystemExit still land in the ``finally`` and reset the
-        # ContextVar — otherwise a cancelled Task would leak an elevated
-        # depth into whatever task reuses this asyncio context next.
-        # Swallow only ``Exception`` below; re-raise ``BaseException`` after
-        # the reset.
-        task_exc = exc
+        try:
+            async for loop_result in tool_call_loop(
+                messages=inner_messages,
+                tools=inner_tools,
+                llm_call=inner_llm,
+                execute_tool=inner_executor,
+                update_conversation=_inner_task_conversation_updater,
+                max_iterations=_MAX_TASK_ITERATIONS,
+                last_iteration_message=(
+                    "This is the final iteration — produce your summary now."
+                ),
+            ):
+                # Discard inner streaming events so only the Task envelope
+                # and its final summary reach the parent client. Token
+                # accounting still happens via ``inner_state`` and rolls up
+                # after the loop exits.
+                inner_state.pending_events.clear()
+                for tc in loop_result.last_tool_calls:
+                    tool_names_seen.append(tc.name)
+                iterations = loop_result.iterations
+                finished_naturally = loop_result.finished_naturally
+                if loop_result.finished_naturally:
+                    final_response_text = loop_result.response_text or ""
+        except Exception as exc:
+            task_exc = exc
+        # ``CancelledError`` / ``KeyboardInterrupt`` / ``SystemExit``
+        # derive from ``BaseException`` and are intentionally NOT caught
+        # here — they propagate through the outer ``finally`` below, which
+        # still resets the depth counter and rolls up usage before the
+        # exception reaches the caller.  Letting them bubble naturally
+        # avoids the ``except BaseException`` suppressor pattern.
     finally:
         _TASK_DEPTH_VAR.reset(token)
-
-    # Usage rolls up regardless of outcome so partial work is still billed.
-    _absorb_inner_usage(parent_state, inner_state)
+        # Usage rolls up on every path (success, caught Exception, or
+        # propagating BaseException) so partial work is still billed.
+        _absorb_inner_usage(parent_state, inner_state)
 
     if task_exc is not None:
-        if not isinstance(task_exc, Exception):
-            # KeyboardInterrupt / SystemExit / CancelledError propagate
-            # after the depth counter was safely restored above.
-            raise task_exc
         logger.error(
             "[Baseline] Task sub-agent failed: %s", task_exc, exc_info=task_exc
         )
diff --git a/autogpt_platform/backend/backend/copilot/prompting.py b/autogpt_platform/backend/backend/copilot/prompting.py
index 1006746bd2..b9b9d84607 100644
--- a/autogpt_platform/backend/backend/copilot/prompting.py
+++ b/autogpt_platform/backend/backend/copilot/prompting.py
@@ -161,12 +161,14 @@ progress. Guidelines:
 
 ### Sub-agents — `Task`
 The `Task` tool runs an **in-process sub-agent** inside the current
-conversation. The sub-agent has the same tool set but its own message
-history, so its intermediate tool calls stay out of the parent context —
-you only see the sub-agent's final summary as the tool result. Use it
-for self-contained work that would otherwise generate a lot of
-intermediate chatter (focused research, bounded refactors, multi-step
-exploration where only the conclusion matters).
+conversation. The sub-agent inherits the parent's tool set **except
+`Task` itself** (nested delegation is refused — plan at most one level
+deep), and it gets its own message history so its intermediate tool
+calls stay out of the parent context — you only see the sub-agent's
+final summary as the tool result. Use it for self-contained work that
+would otherwise generate a lot of intermediate chatter (focused
+research, bounded refactors, multi-step exploration where only the
+conclusion matters).
 
 - Provide a short `description` (3-5 words, shown in the accordion) and a
   complete `prompt` — the sub-agent does NOT inherit the parent
diff --git a/autogpt_platform/backend/backend/copilot/tools/tool_schema_test.py b/autogpt_platform/backend/backend/copilot/tools/tool_schema_test.py
index cedcd81447..bc83b68402 100644
--- a/autogpt_platform/backend/backend/copilot/tools/tool_schema_test.py
+++ b/autogpt_platform/backend/backend/copilot/tools/tool_schema_test.py
@@ -120,9 +120,10 @@ def test_total_schema_char_budget() -> None:
 
     This locks in the 34% token reduction from #12398 and prevents future
     description bloat from eroding the gains. Uses character count with a
-    ~4 chars/token heuristic (budget of 32000 chars ≈ 8000 tokens).
-    Character count is tokenizer-agnostic — no dependency on GPT or Claude
-    tokenizers — while still providing a stable regression gate.
+    ~4 chars/token heuristic; see ``_CHAR_BUDGET`` above for the current
+    value and its change history.  Character count is tokenizer-agnostic
+    — no dependency on GPT or Claude tokenizers — while still providing a
+    stable regression gate.
     """
     schemas = [tool.as_openai_tool() for tool in TOOL_REGISTRY.values()]
     serialized = json.dumps(schemas)
diff --git a/autogpt_platform/frontend/src/app/api/openapi.json b/autogpt_platform/frontend/src/app/api/openapi.json
index c06e937b71..dfb681fada 100644
--- a/autogpt_platform/frontend/src/app/api/openapi.json
+++ b/autogpt_platform/frontend/src/app/api/openapi.json
@@ -2116,7 +2116,9 @@
                     },
                     {
                       "$ref": "#/components/schemas/MemoryForgetConfirmResponse"
-                    }
+                    },
+                    { "$ref": "#/components/schemas/TodoWriteResponse" },
+                    { "$ref": "#/components/schemas/TaskResponse" }
                   ],
                   "title": "Response Getv2[Dummy] Tool Response Type Export For Codegen"
                 }
@@ -16068,6 +16070,55 @@
         "required": ["recent_searches", "providers", "top_blocks"],
         "title": "SuggestionsResponse"
       },
+      "TaskResponse": {
+        "properties": {
+          "type": {
+            "$ref": "#/components/schemas/ResponseType",
+            "default": "task"
+          },
+          "message": { "type": "string", "title": "Message" },
+          "session_id": {
+            "anyOf": [{ "type": "string" }, { "type": "null" }],
+            "title": "Session Id"
+          },
+          "description": {
+            "type": "string",
+            "title": "Description",
+            "default": ""
+          },
+          "response": {
+            "type": "string",
+            "title": "Response",
+            "description": "Final assistant text the sub-agent produced.",
+            "default": ""
+          },
+          "iterations": {
+            "type": "integer",
+            "title": "Iterations",
+            "default": 0
+          },
+          "tool_calls": {
+            "items": { "type": "string" },
+            "type": "array",
+            "title": "Tool Calls",
+            "description": "Names of tools the sub-agent invoked (for observability)."
+          },
+          "status": {
+            "type": "string",
+            "enum": ["completed", "max_iterations", "error"],
+            "title": "Status",
+            "default": "completed"
+          },
+          "error": {
+            "anyOf": [{ "type": "string" }, { "type": "null" }],
+            "title": "Error"
+          }
+        },
+        "type": "object",
+        "required": ["message"],
+        "title": "TaskResponse",
+        "description": "Result of a delegated ``Task`` in-process sub-agent run.\n\nThe sub-agent runs a fresh tool-call loop with an isolated message\nhistory, then returns only its final assistant text. Intermediate tool\ncalls and thinking stay inside the sub-agent's loop so the parent\ncontext is not polluted."
+      },
       "TimezoneResponse": {
         "properties": {
           "timezone": {
@@ -16684,6 +16735,52 @@
         "required": ["timezone"],
         "title": "TimezoneResponse"
       },
+      "TodoItem": {
+        "properties": {
+          "content": {
+            "type": "string",
+            "title": "Content",
+            "description": "Imperative description of the task."
+          },
+          "activeForm": {
+            "type": "string",
+            "title": "Activeform",
+            "description": "Present-continuous form shown while the task is running."
+          },
+          "status": {
+            "type": "string",
+            "enum": ["pending", "in_progress", "completed"],
+            "title": "Status",
+            "default": "pending"
+          }
+        },
+        "type": "object",
+        "required": ["content", "activeForm"],
+        "title": "TodoItem",
+        "description": "One entry in a ``TodoWrite`` checklist.\n\nMirrors the schema used by Claude Code's built-in ``TodoWrite`` tool so\nthe frontend's ``GenericTool`` accordion renders baseline-emitted todos\nidentically to SDK-emitted ones."
+      },
+      "TodoWriteResponse": {
+        "properties": {
+          "type": {
+            "$ref": "#/components/schemas/ResponseType",
+            "default": "todo_write"
+          },
+          "message": { "type": "string", "title": "Message" },
+          "session_id": {
+            "anyOf": [{ "type": "string" }, { "type": "null" }],
+            "title": "Session Id"
+          },
+          "todos": {
+            "items": { "$ref": "#/components/schemas/TodoItem" },
+            "type": "array",
+            "title": "Todos"
+          }
+        },
+        "type": "object",
+        "required": ["message"],
+        "title": "TodoWriteResponse",
+        "description": "Ack returned by ``TodoWrite``.\n\nThe tool is effectively stateless — the authoritative task list lives in\nthe assistant's latest tool-call arguments, which are replayed from the\ntranscript on each turn. The tool output only needs to confirm that the\nupdate was accepted so the model can proceed."
+      },
       "TokenIntrospectionResult": {
         "properties": {
           "active": { "type": "boolean", "title": "Active" },