fix(backend/copilot): address iteration-2 review on #12879

- Add ``TodoWriteResponse`` and ``TaskResponse`` to ``ToolResponseUnion`` in ``chat/routes.py`` so the ``/api/chat/schema/tool-responses`` codegen endpoint surfaces both schemas to the frontend. Regenerated openapi.json + orval client accordingly. - Restructure the Task sub-agent error handling as nested ``try/except Exception`` inside a ``try/finally``. ``CancelledError`` / ``KeyboardInterrupt`` / ``SystemExit`` now propagate naturally through the outer finally — which both resets the depth ContextVar and rolls up inner usage — so the ``except BaseException`` + ``# noqa: BLE001`` suppressor pair is no longer needed. The existing regression tests (cancellation + exception) still cover it. - Prompt guidance (``SHARED_TOOL_NOTES``) now explicitly states that sub-agents inherit the parent tool set "except ``Task`` itself"; the "plan at most one level deep" bit saves the parent model from attempting nested delegation the baseline will refuse. - Replace the stale ``(budget of 32000 chars ≈ 8000 tokens)`` note in ``test_total_schema_char_budget`` with a reference to ``_CHAR_BUDGET`` above so future budget bumps don't desync the docstring again.
2026-04-30 03:00:41 -04:00 · 2026-04-22 12:56:47 +07:00
parent d06258ba44
commit 187f0a529d
5 changed files with 149 additions and 48 deletions
--- a/autogpt_platform/backend/backend/api/features/chat/routes.py
+++ b/autogpt_platform/backend/backend/api/features/chat/routes.py
@@ -75,6 +75,8 @@ from backend.copilot.tools.models import (
    NoResultsResponse,
    SetupRequirementsResponse,
    SuggestedGoalResponse,
+    TaskResponse,
+    TodoWriteResponse,
    UnderstandingUpdatedResponse,
 )
 from backend.copilot.tracking import track_user_message
@@ -1419,6 +1421,8 @@ ToolResponseUnion = (
    | MemorySearchResponse
    | MemoryForgetCandidatesResponse
    | MemoryForgetConfirmResponse
+    | TodoWriteResponse
+    | TaskResponse
 )


--- a/autogpt_platform/backend/backend/copilot/baseline/service.py
+++ b/autogpt_platform/backend/backend/copilot/baseline/service.py
@@ -987,50 +987,47 @@ async def _run_task_subagent(
    )
    inner_llm = partial(_baseline_llm_caller, state=inner_state)

-    task_exc: BaseException | None = None
+    task_exc: Exception | None = None
    token = _TASK_DEPTH_VAR.set(depth + 1)
    try:
-        async for loop_result in tool_call_loop(
-            messages=inner_messages,
-            tools=inner_tools,
-            llm_call=inner_llm,
-            execute_tool=inner_executor,
-            update_conversation=_inner_task_conversation_updater,
-            max_iterations=_MAX_TASK_ITERATIONS,
-            last_iteration_message=(
-                "This is the final iteration — produce your summary now."
-            ),
-        ):
-            # Discard inner streaming events so only the Task envelope and
-            # its final summary reach the parent client. Token accounting
-            # still happens via ``inner_state`` and rolls up after the
-            # loop exits.
-            inner_state.pending_events.clear()
-            for tc in loop_result.last_tool_calls:
-                tool_names_seen.append(tc.name)
-            iterations = loop_result.iterations
-            finished_naturally = loop_result.finished_naturally
-            if loop_result.finished_naturally:
-                final_response_text = loop_result.response_text or ""
-    except BaseException as exc:  # noqa: BLE001
-        # BaseException (not Exception) so CancelledError / KeyboardInterrupt
-        # / SystemExit still land in the ``finally`` and reset the
-        # ContextVar — otherwise a cancelled Task would leak an elevated
-        # depth into whatever task reuses this asyncio context next.
-        # Swallow only ``Exception`` below; re-raise ``BaseException`` after
-        # the reset.
-        task_exc = exc
+        try:
+            async for loop_result in tool_call_loop(
+                messages=inner_messages,
+                tools=inner_tools,
+                llm_call=inner_llm,
+                execute_tool=inner_executor,
+                update_conversation=_inner_task_conversation_updater,
+                max_iterations=_MAX_TASK_ITERATIONS,
+                last_iteration_message=(
+                    "This is the final iteration — produce your summary now."
+                ),
+            ):
+                # Discard inner streaming events so only the Task envelope
+                # and its final summary reach the parent client. Token
+                # accounting still happens via ``inner_state`` and rolls up
+                # after the loop exits.
+                inner_state.pending_events.clear()
+                for tc in loop_result.last_tool_calls:
+                    tool_names_seen.append(tc.name)
+                iterations = loop_result.iterations
+                finished_naturally = loop_result.finished_naturally
+                if loop_result.finished_naturally:
+                    final_response_text = loop_result.response_text or ""
+        except Exception as exc:
+            task_exc = exc
+        # ``CancelledError`` / ``KeyboardInterrupt`` / ``SystemExit``
+        # derive from ``BaseException`` and are intentionally NOT caught
+        # here — they propagate through the outer ``finally`` below, which
+        # still resets the depth counter and rolls up usage before the
+        # exception reaches the caller.  Letting them bubble naturally
+        # avoids the ``except BaseException`` suppressor pattern.
    finally:
        _TASK_DEPTH_VAR.reset(token)
-
-    # Usage rolls up regardless of outcome so partial work is still billed.
-    _absorb_inner_usage(parent_state, inner_state)
+        # Usage rolls up on every path (success, caught Exception, or
+        # propagating BaseException) so partial work is still billed.
+        _absorb_inner_usage(parent_state, inner_state)

    if task_exc is not None:
-        if not isinstance(task_exc, Exception):
-            # KeyboardInterrupt / SystemExit / CancelledError propagate
-            # after the depth counter was safely restored above.
-            raise task_exc
        logger.error(
            "[Baseline] Task sub-agent failed: %s", task_exc, exc_info=task_exc
        )
--- a/autogpt_platform/backend/backend/copilot/prompting.py
+++ b/autogpt_platform/backend/backend/copilot/prompting.py
@@ -161,12 +161,14 @@ progress. Guidelines:

 ### Sub-agents — `Task`
 The `Task` tool runs an **in-process sub-agent** inside the current
-conversation. The sub-agent has the same tool set but its own message
-history, so its intermediate tool calls stay out of the parent context —
-you only see the sub-agent's final summary as the tool result. Use it
-for self-contained work that would otherwise generate a lot of
-intermediate chatter (focused research, bounded refactors, multi-step
-exploration where only the conclusion matters).
+conversation. The sub-agent inherits the parent's tool set **except
+`Task` itself** (nested delegation is refused — plan at most one level
+deep), and it gets its own message history so its intermediate tool
+calls stay out of the parent context — you only see the sub-agent's
+final summary as the tool result. Use it for self-contained work that
+would otherwise generate a lot of intermediate chatter (focused
+research, bounded refactors, multi-step exploration where only the
+conclusion matters).

 - Provide a short `description` (3-5 words, shown in the accordion) and a
  complete `prompt` — the sub-agent does NOT inherit the parent
--- a/autogpt_platform/backend/backend/copilot/tools/tool_schema_test.py
+++ b/autogpt_platform/backend/backend/copilot/tools/tool_schema_test.py
@@ -120,9 +120,10 @@ def test_total_schema_char_budget() -> None:

    This locks in the 34% token reduction from #12398 and prevents future
    description bloat from eroding the gains. Uses character count with a
-    ~4 chars/token heuristic (budget of 32000 chars ≈ 8000 tokens).
-    Character count is tokenizer-agnostic — no dependency on GPT or Claude
-    tokenizers — while still providing a stable regression gate.
+    ~4 chars/token heuristic; see ``_CHAR_BUDGET`` above for the current
+    value and its change history.  Character count is tokenizer-agnostic
+    — no dependency on GPT or Claude tokenizers — while still providing a
+    stable regression gate.
    """
    schemas = [tool.as_openai_tool() for tool in TOOL_REGISTRY.values()]
    serialized = json.dumps(schemas)
--- a/autogpt_platform/frontend/src/app/api/openapi.json
+++ b/autogpt_platform/frontend/src/app/api/openapi.json
@@ -2116,7 +2116,9 @@
                    },
                    {
                      "$ref": "#/components/schemas/MemoryForgetConfirmResponse"
-                    }
+                    },
+                    { "$ref": "#/components/schemas/TodoWriteResponse" },
+                    { "$ref": "#/components/schemas/TaskResponse" }
                  ],
                  "title": "Response Getv2[Dummy] Tool Response Type Export For Codegen"
                }
@@ -16068,6 +16070,55 @@
        "required": ["recent_searches", "providers", "top_blocks"],
        "title": "SuggestionsResponse"
      },
+      "TaskResponse": {
+        "properties": {
+          "type": {
+            "$ref": "#/components/schemas/ResponseType",
+            "default": "task"
+          },
+          "message": { "type": "string", "title": "Message" },
+          "session_id": {
+            "anyOf": [{ "type": "string" }, { "type": "null" }],
+            "title": "Session Id"
+          },
+          "description": {
+            "type": "string",
+            "title": "Description",
+            "default": ""
+          },
+          "response": {
+            "type": "string",
+            "title": "Response",
+            "description": "Final assistant text the sub-agent produced.",
+            "default": ""
+          },
+          "iterations": {
+            "type": "integer",
+            "title": "Iterations",
+            "default": 0
+          },
+          "tool_calls": {
+            "items": { "type": "string" },
+            "type": "array",
+            "title": "Tool Calls",
+            "description": "Names of tools the sub-agent invoked (for observability)."
+          },
+          "status": {
+            "type": "string",
+            "enum": ["completed", "max_iterations", "error"],
+            "title": "Status",
+            "default": "completed"
+          },
+          "error": {
+            "anyOf": [{ "type": "string" }, { "type": "null" }],
+            "title": "Error"
+          }
+        },
+        "type": "object",
+        "required": ["message"],
+        "title": "TaskResponse",
+        "description": "Result of a delegated ``Task`` in-process sub-agent run.\n\nThe sub-agent runs a fresh tool-call loop with an isolated message\nhistory, then returns only its final assistant text. Intermediate tool\ncalls and thinking stay inside the sub-agent's loop so the parent\ncontext is not polluted."
+      },
      "TimezoneResponse": {
        "properties": {
          "timezone": {
@@ -16684,6 +16735,52 @@
        "required": ["timezone"],
        "title": "TimezoneResponse"
      },
+      "TodoItem": {
+        "properties": {
+          "content": {
+            "type": "string",
+            "title": "Content",
+            "description": "Imperative description of the task."
+          },
+          "activeForm": {
+            "type": "string",
+            "title": "Activeform",
+            "description": "Present-continuous form shown while the task is running."
+          },
+          "status": {
+            "type": "string",
+            "enum": ["pending", "in_progress", "completed"],
+            "title": "Status",
+            "default": "pending"
+          }
+        },
+        "type": "object",
+        "required": ["content", "activeForm"],
+        "title": "TodoItem",
+        "description": "One entry in a ``TodoWrite`` checklist.\n\nMirrors the schema used by Claude Code's built-in ``TodoWrite`` tool so\nthe frontend's ``GenericTool`` accordion renders baseline-emitted todos\nidentically to SDK-emitted ones."
+      },
+      "TodoWriteResponse": {
+        "properties": {
+          "type": {
+            "$ref": "#/components/schemas/ResponseType",
+            "default": "todo_write"
+          },
+          "message": { "type": "string", "title": "Message" },
+          "session_id": {
+            "anyOf": [{ "type": "string" }, { "type": "null" }],
+            "title": "Session Id"
+          },
+          "todos": {
+            "items": { "$ref": "#/components/schemas/TodoItem" },
+            "type": "array",
+            "title": "Todos"
+          }
+        },
+        "type": "object",
+        "required": ["message"],
+        "title": "TodoWriteResponse",
+        "description": "Ack returned by ``TodoWrite``.\n\nThe tool is effectively stateless — the authoritative task list lives in\nthe assistant's latest tool-call arguments, which are replayed from the\ntranscript on each turn. The tool output only needs to confirm that the\nupdate was accepted so the model can proceed."
+      },
      "TokenIntrospectionResult": {
        "properties": {
          "active": { "type": "boolean", "title": "Active" },