fix(backend/copilot): address iteration-2 review on #12879

- Add ``TodoWriteResponse`` and ``TaskResponse`` to ``ToolResponseUnion``
  in ``chat/routes.py`` so the ``/api/chat/schema/tool-responses``
  codegen endpoint surfaces both schemas to the frontend.  Regenerated
  openapi.json + orval client accordingly.
- Restructure the Task sub-agent error handling as nested
  ``try/except Exception`` inside a ``try/finally``.  ``CancelledError``
  / ``KeyboardInterrupt`` / ``SystemExit`` now propagate naturally
  through the outer finally — which both resets the depth ContextVar
  and rolls up inner usage — so the ``except BaseException`` +
  ``# noqa: BLE001`` suppressor pair is no longer needed.  The
  existing regression tests (cancellation + exception) still cover it.
- Prompt guidance (``SHARED_TOOL_NOTES``) now explicitly states that
  sub-agents inherit the parent tool set "except ``Task`` itself"; the
  "plan at most one level deep" bit saves the parent model from
  attempting nested delegation the baseline will refuse.
- Replace the stale ``(budget of 32000 chars ≈ 8000 tokens)`` note in
  ``test_total_schema_char_budget`` with a reference to ``_CHAR_BUDGET``
  above so future budget bumps don't desync the docstring again.
This commit is contained in:
majdyz
2026-04-22 12:56:47 +07:00
parent d06258ba44
commit 187f0a529d
5 changed files with 149 additions and 48 deletions

View File

@@ -75,6 +75,8 @@ from backend.copilot.tools.models import (
NoResultsResponse,
SetupRequirementsResponse,
SuggestedGoalResponse,
TaskResponse,
TodoWriteResponse,
UnderstandingUpdatedResponse,
)
from backend.copilot.tracking import track_user_message
@@ -1419,6 +1421,8 @@ ToolResponseUnion = (
| MemorySearchResponse
| MemoryForgetCandidatesResponse
| MemoryForgetConfirmResponse
| TodoWriteResponse
| TaskResponse
)

View File

@@ -987,50 +987,47 @@ async def _run_task_subagent(
)
inner_llm = partial(_baseline_llm_caller, state=inner_state)
task_exc: BaseException | None = None
task_exc: Exception | None = None
token = _TASK_DEPTH_VAR.set(depth + 1)
try:
async for loop_result in tool_call_loop(
messages=inner_messages,
tools=inner_tools,
llm_call=inner_llm,
execute_tool=inner_executor,
update_conversation=_inner_task_conversation_updater,
max_iterations=_MAX_TASK_ITERATIONS,
last_iteration_message=(
"This is the final iteration — produce your summary now."
),
):
# Discard inner streaming events so only the Task envelope and
# its final summary reach the parent client. Token accounting
# still happens via ``inner_state`` and rolls up after the
# loop exits.
inner_state.pending_events.clear()
for tc in loop_result.last_tool_calls:
tool_names_seen.append(tc.name)
iterations = loop_result.iterations
finished_naturally = loop_result.finished_naturally
if loop_result.finished_naturally:
final_response_text = loop_result.response_text or ""
except BaseException as exc: # noqa: BLE001
# BaseException (not Exception) so CancelledError / KeyboardInterrupt
# / SystemExit still land in the ``finally`` and reset the
# ContextVar — otherwise a cancelled Task would leak an elevated
# depth into whatever task reuses this asyncio context next.
# Swallow only ``Exception`` below; re-raise ``BaseException`` after
# the reset.
task_exc = exc
try:
async for loop_result in tool_call_loop(
messages=inner_messages,
tools=inner_tools,
llm_call=inner_llm,
execute_tool=inner_executor,
update_conversation=_inner_task_conversation_updater,
max_iterations=_MAX_TASK_ITERATIONS,
last_iteration_message=(
"This is the final iteration — produce your summary now."
),
):
# Discard inner streaming events so only the Task envelope
# and its final summary reach the parent client. Token
# accounting still happens via ``inner_state`` and rolls up
# after the loop exits.
inner_state.pending_events.clear()
for tc in loop_result.last_tool_calls:
tool_names_seen.append(tc.name)
iterations = loop_result.iterations
finished_naturally = loop_result.finished_naturally
if loop_result.finished_naturally:
final_response_text = loop_result.response_text or ""
except Exception as exc:
task_exc = exc
# ``CancelledError`` / ``KeyboardInterrupt`` / ``SystemExit``
# derive from ``BaseException`` and are intentionally NOT caught
# here — they propagate through the outer ``finally`` below, which
# still resets the depth counter and rolls up usage before the
# exception reaches the caller. Letting them bubble naturally
# avoids the ``except BaseException`` suppressor pattern.
finally:
_TASK_DEPTH_VAR.reset(token)
# Usage rolls up regardless of outcome so partial work is still billed.
_absorb_inner_usage(parent_state, inner_state)
# Usage rolls up on every path (success, caught Exception, or
# propagating BaseException) so partial work is still billed.
_absorb_inner_usage(parent_state, inner_state)
if task_exc is not None:
if not isinstance(task_exc, Exception):
# KeyboardInterrupt / SystemExit / CancelledError propagate
# after the depth counter was safely restored above.
raise task_exc
logger.error(
"[Baseline] Task sub-agent failed: %s", task_exc, exc_info=task_exc
)

View File

@@ -161,12 +161,14 @@ progress. Guidelines:
### Sub-agents — `Task`
The `Task` tool runs an **in-process sub-agent** inside the current
conversation. The sub-agent has the same tool set but its own message
history, so its intermediate tool calls stay out of the parent context —
you only see the sub-agent's final summary as the tool result. Use it
for self-contained work that would otherwise generate a lot of
intermediate chatter (focused research, bounded refactors, multi-step
exploration where only the conclusion matters).
conversation. The sub-agent inherits the parent's tool set **except
`Task` itself** (nested delegation is refused — plan at most one level
deep), and it gets its own message history so its intermediate tool
calls stay out of the parent context — you only see the sub-agent's
final summary as the tool result. Use it for self-contained work that
would otherwise generate a lot of intermediate chatter (focused
research, bounded refactors, multi-step exploration where only the
conclusion matters).
- Provide a short `description` (3-5 words, shown in the accordion) and a
complete `prompt` — the sub-agent does NOT inherit the parent

View File

@@ -120,9 +120,10 @@ def test_total_schema_char_budget() -> None:
This locks in the 34% token reduction from #12398 and prevents future
description bloat from eroding the gains. Uses character count with a
~4 chars/token heuristic (budget of 32000 chars ≈ 8000 tokens).
Character count is tokenizer-agnostic — no dependency on GPT or Claude
tokenizers — while still providing a stable regression gate.
~4 chars/token heuristic; see ``_CHAR_BUDGET`` above for the current
value and its change history. Character count is tokenizer-agnostic
— no dependency on GPT or Claude tokenizers — while still providing a
stable regression gate.
"""
schemas = [tool.as_openai_tool() for tool in TOOL_REGISTRY.values()]
serialized = json.dumps(schemas)

View File

@@ -2116,7 +2116,9 @@
},
{
"$ref": "#/components/schemas/MemoryForgetConfirmResponse"
}
},
{ "$ref": "#/components/schemas/TodoWriteResponse" },
{ "$ref": "#/components/schemas/TaskResponse" }
],
"title": "Response Getv2[Dummy] Tool Response Type Export For Codegen"
}
@@ -16068,6 +16070,55 @@
"required": ["recent_searches", "providers", "top_blocks"],
"title": "SuggestionsResponse"
},
"TaskResponse": {
"properties": {
"type": {
"$ref": "#/components/schemas/ResponseType",
"default": "task"
},
"message": { "type": "string", "title": "Message" },
"session_id": {
"anyOf": [{ "type": "string" }, { "type": "null" }],
"title": "Session Id"
},
"description": {
"type": "string",
"title": "Description",
"default": ""
},
"response": {
"type": "string",
"title": "Response",
"description": "Final assistant text the sub-agent produced.",
"default": ""
},
"iterations": {
"type": "integer",
"title": "Iterations",
"default": 0
},
"tool_calls": {
"items": { "type": "string" },
"type": "array",
"title": "Tool Calls",
"description": "Names of tools the sub-agent invoked (for observability)."
},
"status": {
"type": "string",
"enum": ["completed", "max_iterations", "error"],
"title": "Status",
"default": "completed"
},
"error": {
"anyOf": [{ "type": "string" }, { "type": "null" }],
"title": "Error"
}
},
"type": "object",
"required": ["message"],
"title": "TaskResponse",
"description": "Result of a delegated ``Task`` in-process sub-agent run.\n\nThe sub-agent runs a fresh tool-call loop with an isolated message\nhistory, then returns only its final assistant text. Intermediate tool\ncalls and thinking stay inside the sub-agent's loop so the parent\ncontext is not polluted."
},
"TimezoneResponse": {
"properties": {
"timezone": {
@@ -16684,6 +16735,52 @@
"required": ["timezone"],
"title": "TimezoneResponse"
},
"TodoItem": {
"properties": {
"content": {
"type": "string",
"title": "Content",
"description": "Imperative description of the task."
},
"activeForm": {
"type": "string",
"title": "Activeform",
"description": "Present-continuous form shown while the task is running."
},
"status": {
"type": "string",
"enum": ["pending", "in_progress", "completed"],
"title": "Status",
"default": "pending"
}
},
"type": "object",
"required": ["content", "activeForm"],
"title": "TodoItem",
"description": "One entry in a ``TodoWrite`` checklist.\n\nMirrors the schema used by Claude Code's built-in ``TodoWrite`` tool so\nthe frontend's ``GenericTool`` accordion renders baseline-emitted todos\nidentically to SDK-emitted ones."
},
"TodoWriteResponse": {
"properties": {
"type": {
"$ref": "#/components/schemas/ResponseType",
"default": "todo_write"
},
"message": { "type": "string", "title": "Message" },
"session_id": {
"anyOf": [{ "type": "string" }, { "type": "null" }],
"title": "Session Id"
},
"todos": {
"items": { "$ref": "#/components/schemas/TodoItem" },
"type": "array",
"title": "Todos"
}
},
"type": "object",
"required": ["message"],
"title": "TodoWriteResponse",
"description": "Ack returned by ``TodoWrite``.\n\nThe tool is effectively stateless — the authoritative task list lives in\nthe assistant's latest tool-call arguments, which are replayed from the\ntranscript on each turn. The tool output only needs to confirm that the\nupdate was accepted so the model can proceed."
},
"TokenIntrospectionResult": {
"properties": {
"active": { "type": "boolean", "title": "Active" },