From 187f0a529dcb8eb9883d570d94922c28f84d90dc Mon Sep 17 00:00:00 2001 From: majdyz Date: Wed, 22 Apr 2026 12:56:47 +0700 Subject: [PATCH] fix(backend/copilot): address iteration-2 review on #12879 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add ``TodoWriteResponse`` and ``TaskResponse`` to ``ToolResponseUnion`` in ``chat/routes.py`` so the ``/api/chat/schema/tool-responses`` codegen endpoint surfaces both schemas to the frontend. Regenerated openapi.json + orval client accordingly. - Restructure the Task sub-agent error handling as nested ``try/except Exception`` inside a ``try/finally``. ``CancelledError`` / ``KeyboardInterrupt`` / ``SystemExit`` now propagate naturally through the outer finally — which both resets the depth ContextVar and rolls up inner usage — so the ``except BaseException`` + ``# noqa: BLE001`` suppressor pair is no longer needed. The existing regression tests (cancellation + exception) still cover it. - Prompt guidance (``SHARED_TOOL_NOTES``) now explicitly states that sub-agents inherit the parent tool set "except ``Task`` itself"; the "plan at most one level deep" bit saves the parent model from attempting nested delegation the baseline will refuse. - Replace the stale ``(budget of 32000 chars ≈ 8000 tokens)`` note in ``test_total_schema_char_budget`` with a reference to ``_CHAR_BUDGET`` above so future budget bumps don't desync the docstring again. --- .../backend/api/features/chat/routes.py | 4 + .../backend/copilot/baseline/service.py | 73 +++++++------- .../backend/backend/copilot/prompting.py | 14 +-- .../backend/copilot/tools/tool_schema_test.py | 7 +- .../frontend/src/app/api/openapi.json | 99 ++++++++++++++++++- 5 files changed, 149 insertions(+), 48 deletions(-) diff --git a/autogpt_platform/backend/backend/api/features/chat/routes.py b/autogpt_platform/backend/backend/api/features/chat/routes.py index ca7e4355f6..39cd111b31 100644 --- a/autogpt_platform/backend/backend/api/features/chat/routes.py +++ b/autogpt_platform/backend/backend/api/features/chat/routes.py @@ -75,6 +75,8 @@ from backend.copilot.tools.models import ( NoResultsResponse, SetupRequirementsResponse, SuggestedGoalResponse, + TaskResponse, + TodoWriteResponse, UnderstandingUpdatedResponse, ) from backend.copilot.tracking import track_user_message @@ -1419,6 +1421,8 @@ ToolResponseUnion = ( | MemorySearchResponse | MemoryForgetCandidatesResponse | MemoryForgetConfirmResponse + | TodoWriteResponse + | TaskResponse ) diff --git a/autogpt_platform/backend/backend/copilot/baseline/service.py b/autogpt_platform/backend/backend/copilot/baseline/service.py index 28e78c1c5f..6f82af4dfe 100644 --- a/autogpt_platform/backend/backend/copilot/baseline/service.py +++ b/autogpt_platform/backend/backend/copilot/baseline/service.py @@ -987,50 +987,47 @@ async def _run_task_subagent( ) inner_llm = partial(_baseline_llm_caller, state=inner_state) - task_exc: BaseException | None = None + task_exc: Exception | None = None token = _TASK_DEPTH_VAR.set(depth + 1) try: - async for loop_result in tool_call_loop( - messages=inner_messages, - tools=inner_tools, - llm_call=inner_llm, - execute_tool=inner_executor, - update_conversation=_inner_task_conversation_updater, - max_iterations=_MAX_TASK_ITERATIONS, - last_iteration_message=( - "This is the final iteration — produce your summary now." - ), - ): - # Discard inner streaming events so only the Task envelope and - # its final summary reach the parent client. Token accounting - # still happens via ``inner_state`` and rolls up after the - # loop exits. - inner_state.pending_events.clear() - for tc in loop_result.last_tool_calls: - tool_names_seen.append(tc.name) - iterations = loop_result.iterations - finished_naturally = loop_result.finished_naturally - if loop_result.finished_naturally: - final_response_text = loop_result.response_text or "" - except BaseException as exc: # noqa: BLE001 - # BaseException (not Exception) so CancelledError / KeyboardInterrupt - # / SystemExit still land in the ``finally`` and reset the - # ContextVar — otherwise a cancelled Task would leak an elevated - # depth into whatever task reuses this asyncio context next. - # Swallow only ``Exception`` below; re-raise ``BaseException`` after - # the reset. - task_exc = exc + try: + async for loop_result in tool_call_loop( + messages=inner_messages, + tools=inner_tools, + llm_call=inner_llm, + execute_tool=inner_executor, + update_conversation=_inner_task_conversation_updater, + max_iterations=_MAX_TASK_ITERATIONS, + last_iteration_message=( + "This is the final iteration — produce your summary now." + ), + ): + # Discard inner streaming events so only the Task envelope + # and its final summary reach the parent client. Token + # accounting still happens via ``inner_state`` and rolls up + # after the loop exits. + inner_state.pending_events.clear() + for tc in loop_result.last_tool_calls: + tool_names_seen.append(tc.name) + iterations = loop_result.iterations + finished_naturally = loop_result.finished_naturally + if loop_result.finished_naturally: + final_response_text = loop_result.response_text or "" + except Exception as exc: + task_exc = exc + # ``CancelledError`` / ``KeyboardInterrupt`` / ``SystemExit`` + # derive from ``BaseException`` and are intentionally NOT caught + # here — they propagate through the outer ``finally`` below, which + # still resets the depth counter and rolls up usage before the + # exception reaches the caller. Letting them bubble naturally + # avoids the ``except BaseException`` suppressor pattern. finally: _TASK_DEPTH_VAR.reset(token) - - # Usage rolls up regardless of outcome so partial work is still billed. - _absorb_inner_usage(parent_state, inner_state) + # Usage rolls up on every path (success, caught Exception, or + # propagating BaseException) so partial work is still billed. + _absorb_inner_usage(parent_state, inner_state) if task_exc is not None: - if not isinstance(task_exc, Exception): - # KeyboardInterrupt / SystemExit / CancelledError propagate - # after the depth counter was safely restored above. - raise task_exc logger.error( "[Baseline] Task sub-agent failed: %s", task_exc, exc_info=task_exc ) diff --git a/autogpt_platform/backend/backend/copilot/prompting.py b/autogpt_platform/backend/backend/copilot/prompting.py index 1006746bd2..b9b9d84607 100644 --- a/autogpt_platform/backend/backend/copilot/prompting.py +++ b/autogpt_platform/backend/backend/copilot/prompting.py @@ -161,12 +161,14 @@ progress. Guidelines: ### Sub-agents — `Task` The `Task` tool runs an **in-process sub-agent** inside the current -conversation. The sub-agent has the same tool set but its own message -history, so its intermediate tool calls stay out of the parent context — -you only see the sub-agent's final summary as the tool result. Use it -for self-contained work that would otherwise generate a lot of -intermediate chatter (focused research, bounded refactors, multi-step -exploration where only the conclusion matters). +conversation. The sub-agent inherits the parent's tool set **except +`Task` itself** (nested delegation is refused — plan at most one level +deep), and it gets its own message history so its intermediate tool +calls stay out of the parent context — you only see the sub-agent's +final summary as the tool result. Use it for self-contained work that +would otherwise generate a lot of intermediate chatter (focused +research, bounded refactors, multi-step exploration where only the +conclusion matters). - Provide a short `description` (3-5 words, shown in the accordion) and a complete `prompt` — the sub-agent does NOT inherit the parent diff --git a/autogpt_platform/backend/backend/copilot/tools/tool_schema_test.py b/autogpt_platform/backend/backend/copilot/tools/tool_schema_test.py index cedcd81447..bc83b68402 100644 --- a/autogpt_platform/backend/backend/copilot/tools/tool_schema_test.py +++ b/autogpt_platform/backend/backend/copilot/tools/tool_schema_test.py @@ -120,9 +120,10 @@ def test_total_schema_char_budget() -> None: This locks in the 34% token reduction from #12398 and prevents future description bloat from eroding the gains. Uses character count with a - ~4 chars/token heuristic (budget of 32000 chars ≈ 8000 tokens). - Character count is tokenizer-agnostic — no dependency on GPT or Claude - tokenizers — while still providing a stable regression gate. + ~4 chars/token heuristic; see ``_CHAR_BUDGET`` above for the current + value and its change history. Character count is tokenizer-agnostic + — no dependency on GPT or Claude tokenizers — while still providing a + stable regression gate. """ schemas = [tool.as_openai_tool() for tool in TOOL_REGISTRY.values()] serialized = json.dumps(schemas) diff --git a/autogpt_platform/frontend/src/app/api/openapi.json b/autogpt_platform/frontend/src/app/api/openapi.json index c06e937b71..dfb681fada 100644 --- a/autogpt_platform/frontend/src/app/api/openapi.json +++ b/autogpt_platform/frontend/src/app/api/openapi.json @@ -2116,7 +2116,9 @@ }, { "$ref": "#/components/schemas/MemoryForgetConfirmResponse" - } + }, + { "$ref": "#/components/schemas/TodoWriteResponse" }, + { "$ref": "#/components/schemas/TaskResponse" } ], "title": "Response Getv2[Dummy] Tool Response Type Export For Codegen" } @@ -16068,6 +16070,55 @@ "required": ["recent_searches", "providers", "top_blocks"], "title": "SuggestionsResponse" }, + "TaskResponse": { + "properties": { + "type": { + "$ref": "#/components/schemas/ResponseType", + "default": "task" + }, + "message": { "type": "string", "title": "Message" }, + "session_id": { + "anyOf": [{ "type": "string" }, { "type": "null" }], + "title": "Session Id" + }, + "description": { + "type": "string", + "title": "Description", + "default": "" + }, + "response": { + "type": "string", + "title": "Response", + "description": "Final assistant text the sub-agent produced.", + "default": "" + }, + "iterations": { + "type": "integer", + "title": "Iterations", + "default": 0 + }, + "tool_calls": { + "items": { "type": "string" }, + "type": "array", + "title": "Tool Calls", + "description": "Names of tools the sub-agent invoked (for observability)." + }, + "status": { + "type": "string", + "enum": ["completed", "max_iterations", "error"], + "title": "Status", + "default": "completed" + }, + "error": { + "anyOf": [{ "type": "string" }, { "type": "null" }], + "title": "Error" + } + }, + "type": "object", + "required": ["message"], + "title": "TaskResponse", + "description": "Result of a delegated ``Task`` in-process sub-agent run.\n\nThe sub-agent runs a fresh tool-call loop with an isolated message\nhistory, then returns only its final assistant text. Intermediate tool\ncalls and thinking stay inside the sub-agent's loop so the parent\ncontext is not polluted." + }, "TimezoneResponse": { "properties": { "timezone": { @@ -16684,6 +16735,52 @@ "required": ["timezone"], "title": "TimezoneResponse" }, + "TodoItem": { + "properties": { + "content": { + "type": "string", + "title": "Content", + "description": "Imperative description of the task." + }, + "activeForm": { + "type": "string", + "title": "Activeform", + "description": "Present-continuous form shown while the task is running." + }, + "status": { + "type": "string", + "enum": ["pending", "in_progress", "completed"], + "title": "Status", + "default": "pending" + } + }, + "type": "object", + "required": ["content", "activeForm"], + "title": "TodoItem", + "description": "One entry in a ``TodoWrite`` checklist.\n\nMirrors the schema used by Claude Code's built-in ``TodoWrite`` tool so\nthe frontend's ``GenericTool`` accordion renders baseline-emitted todos\nidentically to SDK-emitted ones." + }, + "TodoWriteResponse": { + "properties": { + "type": { + "$ref": "#/components/schemas/ResponseType", + "default": "todo_write" + }, + "message": { "type": "string", "title": "Message" }, + "session_id": { + "anyOf": [{ "type": "string" }, { "type": "null" }], + "title": "Session Id" + }, + "todos": { + "items": { "$ref": "#/components/schemas/TodoItem" }, + "type": "array", + "title": "Todos" + } + }, + "type": "object", + "required": ["message"], + "title": "TodoWriteResponse", + "description": "Ack returned by ``TodoWrite``.\n\nThe tool is effectively stateless — the authoritative task list lives in\nthe assistant's latest tool-call arguments, which are replayed from the\ntranscript on each turn. The tool output only needs to confirm that the\nupdate was accepted so the model can proceed." + }, "TokenIntrospectionResult": { "properties": { "active": { "type": "boolean", "title": "Active" },