mirror of
https://github.com/Significant-Gravitas/AutoGPT.git
synced 2026-04-08 03:00:28 -04:00
fix(copilot): recognize Agent tool name and route CLI state into workspace (#12635)
### Why / What / How
**Why:** The Claude Agent SDK CLI renamed the sub-agent tool from
`"Task"` to `"Agent"` in v2.x. Our security hooks only checked for
`"Task"`, so all sub-agent security controls were silently bypassed in
production: concurrency limiting didn't apply, and slot tracking was
broken. This was discovered via Langfuse trace analysis of session
`62b1b2b9`, where background sub-agents ran unchecked.
Additionally, the CLI writes sub-agent output to `/tmp/claude-<uid>/`
and project state to `$HOME/.claude/` — both outside the per-session
workspace (`/tmp/copilot-<session>/`). This caused `PermissionError` in
E2B sandboxes and silently lost sub-agent results.
The frontend also had no rendering for the `Agent` / `TaskOutput` SDK
built-in tools — they fell through to the generic "other" category with
no context-aware display.
**What:**
1. Fix the sub-agent tool name recognition (`"Task"` → `{"Task",
"Agent"}`)
2. Allow `run_in_background` — the SDK handles async lifecycle cleanly
(returns `isAsync:true`, model polls via `TaskOutput`)
3. Route CLI state into the workspace via `CLAUDE_CODE_TMPDIR` and
`HOME` env vars
4. Add lifecycle hooks (`SubagentStart`/`SubagentStop`) for
observability
5. Add frontend `"agent"` tool category with proper UI rendering
**How:**
- Security hooks check `tool_name in _SUBAGENT_TOOLS` (frozenset of
`"Task"` and `"Agent"`)
- Background agents are allowed but still count against the
`max_subtasks` concurrency limit
- Frontend detects `isAsync: true` output → shows "Agent started
(background)" instead of "Agent completed"
- `TaskOutput` tool shows retrieval status and collected results
- Robot icon and agent-specific accordion rendering for both foreground
and background agents
### Changes 🏗️
**Backend:**
- **`security_hooks.py`**: Replace `tool_name == "Task"` with `tool_name
in _SUBAGENT_TOOLS`. Remove `run_in_background` deny block (SDK handles
async lifecycle). Add `SubagentStart`/`SubagentStop` hooks.
- **`tool_adapter.py`**: Add `"Agent"` to `_SDK_BUILTIN_ALWAYS` list
alongside `"Task"`.
- **`service.py`**: Set `CLAUDE_CODE_TMPDIR=sdk_cwd` and `HOME=sdk_cwd`
in SDK subprocess env.
- **`security_hooks_test.py`**: Update background tests (allowed, not
blocked). Add test for background agents counting against concurrency
limit.
**Frontend:**
- **`GenericTool/helpers.ts`**: Add `"agent"` tool category for `Agent`,
`Task`, `TaskOutput`. Agent-specific animation text detecting `isAsync`
output. Input summaries from description/prompt fields.
- **`GenericTool/GenericTool.tsx`**: Add `RobotIcon` for agent category.
Add `getAgentAccordionData()` with async-aware title/content.
`TaskOutput` shows retrieval status.
- **`useChatSession.ts`**: Fix pre-existing TS error (void mutation
body).
### Checklist 📋
#### For code changes:
- [x] I have clearly listed my changes in the PR description
- [x] I have made a test plan
- [x] I have tested my changes according to the test plan:
- [x] All security hooks tests pass (background allowed + limit
enforced)
- [x] Pre-commit hooks (ruff, black, isort, pyright, tsc) all pass
- [x] E2E test: copilot agent create+run scenario PASS
- [ ] Deploy to dev and test copilot sub-agent spawning with background
mode
#### For configuration changes:
- [x] `.env.default` is updated or already compatible
- [x] `docker-compose.yml` is updated or already compatible
This commit is contained in:
@@ -103,6 +103,7 @@ ToolName = Literal[
|
||||
"web_fetch",
|
||||
"write_workspace_file",
|
||||
# SDK built-ins
|
||||
"Agent",
|
||||
"Edit",
|
||||
"Glob",
|
||||
"Grep",
|
||||
|
||||
@@ -544,6 +544,7 @@ class TestApplyToolPermissions:
|
||||
class TestSdkBuiltinToolNames:
|
||||
def test_expected_builtins_present(self):
|
||||
expected = {
|
||||
"Agent",
|
||||
"Read",
|
||||
"Write",
|
||||
"Edit",
|
||||
|
||||
@@ -20,6 +20,7 @@ config = ChatConfig()
|
||||
def build_sdk_env(
|
||||
session_id: str | None = None,
|
||||
user_id: str | None = None,
|
||||
sdk_cwd: str | None = None,
|
||||
) -> dict[str, str]:
|
||||
"""Build env vars for the SDK CLI subprocess.
|
||||
|
||||
@@ -29,25 +30,35 @@ def build_sdk_env(
|
||||
``ANTHROPIC_API_KEY`` from the parent environment.
|
||||
3. **OpenRouter** (default) — overrides base URL and auth token to
|
||||
route through the proxy, with Langfuse trace headers.
|
||||
|
||||
When *sdk_cwd* is provided, ``CLAUDE_CODE_TMPDIR`` is set so that
|
||||
the CLI writes temp/sub-agent output inside the per-session workspace
|
||||
directory rather than an inaccessible system temp path.
|
||||
"""
|
||||
# --- Mode 1: Claude Code subscription auth ---
|
||||
if config.use_claude_code_subscription:
|
||||
validate_subscription()
|
||||
return {
|
||||
env: dict[str, str] = {
|
||||
"ANTHROPIC_API_KEY": "",
|
||||
"ANTHROPIC_AUTH_TOKEN": "",
|
||||
"ANTHROPIC_BASE_URL": "",
|
||||
}
|
||||
if sdk_cwd:
|
||||
env["CLAUDE_CODE_TMPDIR"] = sdk_cwd
|
||||
return env
|
||||
|
||||
# --- Mode 2: Direct Anthropic (no proxy hop) ---
|
||||
if not config.openrouter_active:
|
||||
return {}
|
||||
env = {}
|
||||
if sdk_cwd:
|
||||
env["CLAUDE_CODE_TMPDIR"] = sdk_cwd
|
||||
return env
|
||||
|
||||
# --- Mode 3: OpenRouter proxy ---
|
||||
base = (config.base_url or "").rstrip("/")
|
||||
if base.endswith("/v1"):
|
||||
base = base[:-3]
|
||||
env: dict[str, str] = {
|
||||
env = {
|
||||
"ANTHROPIC_BASE_URL": base,
|
||||
"ANTHROPIC_AUTH_TOKEN": config.api_key or "",
|
||||
"ANTHROPIC_API_KEY": "", # force CLI to use AUTH_TOKEN
|
||||
@@ -65,4 +76,7 @@ def build_sdk_env(
|
||||
if parts:
|
||||
env["ANTHROPIC_CUSTOM_HEADERS"] = "\n".join(parts)
|
||||
|
||||
if sdk_cwd:
|
||||
env["CLAUDE_CODE_TMPDIR"] = sdk_cwd
|
||||
|
||||
return env
|
||||
|
||||
@@ -240,3 +240,54 @@ class TestBuildSdkEnvModePriority:
|
||||
"ANTHROPIC_AUTH_TOKEN": "",
|
||||
"ANTHROPIC_BASE_URL": "",
|
||||
}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# CLAUDE_CODE_TMPDIR integration
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestClaudeCodeTmpdir:
|
||||
"""Verify build_sdk_env() sets CLAUDE_CODE_TMPDIR from *sdk_cwd*."""
|
||||
|
||||
def test_tmpdir_set_when_sdk_cwd_is_truthy(self):
|
||||
"""CLAUDE_CODE_TMPDIR is set to sdk_cwd when sdk_cwd is truthy."""
|
||||
cfg = _make_config(use_openrouter=False)
|
||||
with patch("backend.copilot.sdk.env.config", cfg):
|
||||
from backend.copilot.sdk.env import build_sdk_env
|
||||
|
||||
result = build_sdk_env(sdk_cwd="/tmp/copilot-workspace")
|
||||
|
||||
assert result["CLAUDE_CODE_TMPDIR"] == "/tmp/copilot-workspace"
|
||||
|
||||
def test_tmpdir_not_set_when_sdk_cwd_is_none(self):
|
||||
"""CLAUDE_CODE_TMPDIR is NOT in the env when sdk_cwd is None."""
|
||||
cfg = _make_config(use_openrouter=False)
|
||||
with patch("backend.copilot.sdk.env.config", cfg):
|
||||
from backend.copilot.sdk.env import build_sdk_env
|
||||
|
||||
result = build_sdk_env(sdk_cwd=None)
|
||||
|
||||
assert "CLAUDE_CODE_TMPDIR" not in result
|
||||
|
||||
def test_tmpdir_not_set_when_sdk_cwd_is_empty_string(self):
|
||||
"""CLAUDE_CODE_TMPDIR is NOT in the env when sdk_cwd is empty string."""
|
||||
cfg = _make_config(use_openrouter=False)
|
||||
with patch("backend.copilot.sdk.env.config", cfg):
|
||||
from backend.copilot.sdk.env import build_sdk_env
|
||||
|
||||
result = build_sdk_env(sdk_cwd="")
|
||||
|
||||
assert "CLAUDE_CODE_TMPDIR" not in result
|
||||
|
||||
@patch("backend.copilot.sdk.env.validate_subscription")
|
||||
def test_tmpdir_set_in_subscription_mode(self, mock_validate):
|
||||
"""CLAUDE_CODE_TMPDIR is set even in subscription mode."""
|
||||
cfg = _make_config(use_claude_code_subscription=True)
|
||||
with patch("backend.copilot.sdk.env.config", cfg):
|
||||
from backend.copilot.sdk.env import build_sdk_env
|
||||
|
||||
result = build_sdk_env(sdk_cwd="/tmp/sub-workspace")
|
||||
|
||||
assert result["CLAUDE_CODE_TMPDIR"] == "/tmp/sub-workspace"
|
||||
assert result["ANTHROPIC_API_KEY"] == ""
|
||||
|
||||
@@ -1010,7 +1010,7 @@ def _make_sdk_patches(
|
||||
(f"{_SVC}.create_security_hooks", dict(return_value=MagicMock())),
|
||||
(f"{_SVC}.get_copilot_tool_names", dict(return_value=[])),
|
||||
(f"{_SVC}.get_sdk_disallowed_tools", dict(return_value=[])),
|
||||
(f"{_SVC}.build_sdk_env", dict(return_value=None)),
|
||||
(f"{_SVC}.build_sdk_env", dict(return_value={})),
|
||||
(f"{_SVC}._resolve_sdk_model", dict(return_value=None)),
|
||||
(f"{_SVC}.set_execution_context", {}),
|
||||
(
|
||||
|
||||
@@ -22,6 +22,38 @@ from .tool_adapter import (
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# The SDK CLI uses "Task" in older versions and "Agent" in v2.x+.
|
||||
# Shared across all sessions — used by security hooks for sub-agent detection.
|
||||
_SUBAGENT_TOOLS: frozenset[str] = frozenset({"Task", "Agent"})
|
||||
|
||||
# Unicode ranges stripped by _sanitize():
|
||||
# - BiDi overrides (U+202A-U+202E, U+2066-U+2069) can trick reviewers
|
||||
# into misreading code/logs.
|
||||
# - Zero-width characters (U+200B-U+200F, U+FEFF) can hide content.
|
||||
_BIDI_AND_ZW_CHARS = set(
|
||||
chr(c)
|
||||
for r in (range(0x202A, 0x202F), range(0x2066, 0x206A), range(0x200B, 0x2010))
|
||||
for c in r
|
||||
) | {"\ufeff"}
|
||||
|
||||
|
||||
def _sanitize(value: str, max_len: int = 200) -> str:
|
||||
"""Strip control characters and truncate for safe logging.
|
||||
|
||||
Removes C0 (U+0000-U+001F), DEL (U+007F), C1 (U+0080-U+009F),
|
||||
Unicode BiDi overrides, and zero-width characters to prevent
|
||||
log injection and visual spoofing.
|
||||
"""
|
||||
cleaned = "".join(
|
||||
c
|
||||
for c in value
|
||||
if c >= " "
|
||||
and c != "\x7f"
|
||||
and not ("\x80" <= c <= "\x9f")
|
||||
and c not in _BIDI_AND_ZW_CHARS
|
||||
)
|
||||
return cleaned[:max_len]
|
||||
|
||||
|
||||
def _deny(reason: str) -> dict[str, Any]:
|
||||
"""Return a hook denial response."""
|
||||
@@ -136,11 +168,13 @@ def create_security_hooks(
|
||||
- PostToolUse: Log successful tool executions
|
||||
- PostToolUseFailure: Log and handle failed tool executions
|
||||
- PreCompact: Log context compaction events (SDK handles compaction automatically)
|
||||
- SubagentStart: Log sub-agent lifecycle start
|
||||
- SubagentStop: Log sub-agent lifecycle end
|
||||
|
||||
Args:
|
||||
user_id: Current user ID for isolation validation
|
||||
sdk_cwd: SDK working directory for workspace-scoped tool validation
|
||||
max_subtasks: Maximum concurrent Task (sub-agent) spawns allowed per session
|
||||
max_subtasks: Maximum concurrent sub-agent spawns allowed per session
|
||||
on_compact: Callback invoked when SDK starts compacting context.
|
||||
Receives the transcript_path from the hook input.
|
||||
|
||||
@@ -151,9 +185,19 @@ def create_security_hooks(
|
||||
from claude_agent_sdk import HookMatcher
|
||||
from claude_agent_sdk.types import HookContext, HookInput, SyncHookJSONOutput
|
||||
|
||||
# Per-session tracking for Task sub-agent concurrency.
|
||||
# Per-session tracking for sub-agent concurrency.
|
||||
# Set of tool_use_ids that consumed a slot — len() is the active count.
|
||||
task_tool_use_ids: set[str] = set()
|
||||
#
|
||||
# LIMITATION: For background (async) agents the SDK returns the
|
||||
# Agent/Task tool immediately with {isAsync: true}, which triggers
|
||||
# PostToolUse and releases the slot while the agent is still running.
|
||||
# SubagentStop fires later when the background process finishes but
|
||||
# does not currently hold a slot. This means the concurrency limit
|
||||
# only gates *launches*, not true concurrent execution. To fix this
|
||||
# we would need to track background agent_ids separately and release
|
||||
# in SubagentStop, but the SDK does not guarantee SubagentStop fires
|
||||
# for every background agent (e.g. on session abort).
|
||||
subagent_tool_use_ids: set[str] = set()
|
||||
|
||||
async def pre_tool_use_hook(
|
||||
input_data: HookInput,
|
||||
@@ -165,29 +209,22 @@ def create_security_hooks(
|
||||
tool_name = cast(str, input_data.get("tool_name", ""))
|
||||
tool_input = cast(dict[str, Any], input_data.get("tool_input", {}))
|
||||
|
||||
# Rate-limit Task (sub-agent) spawns per session
|
||||
if tool_name == "Task":
|
||||
# Block background task execution first — denied calls
|
||||
# should not consume a subtask slot.
|
||||
if tool_input.get("run_in_background"):
|
||||
logger.info(f"[SDK] Blocked background Task, user={user_id}")
|
||||
return cast(
|
||||
SyncHookJSONOutput,
|
||||
_deny(
|
||||
"Background task execution is not supported. "
|
||||
"Run tasks in the foreground instead "
|
||||
"(remove the run_in_background parameter)."
|
||||
),
|
||||
)
|
||||
if len(task_tool_use_ids) >= max_subtasks:
|
||||
# Rate-limit sub-agent spawns per session.
|
||||
# The SDK CLI renamed "Task" → "Agent" in v2.x; handle both.
|
||||
if tool_name in _SUBAGENT_TOOLS:
|
||||
# Background agents are allowed — the SDK returns immediately
|
||||
# with {isAsync: true} and the model polls via TaskOutput.
|
||||
# Still count them against the concurrency limit.
|
||||
if len(subagent_tool_use_ids) >= max_subtasks:
|
||||
logger.warning(
|
||||
f"[SDK] Task limit reached ({max_subtasks}), user={user_id}"
|
||||
f"[SDK] Sub-agent limit reached ({max_subtasks}), "
|
||||
f"user={user_id}"
|
||||
)
|
||||
return cast(
|
||||
SyncHookJSONOutput,
|
||||
_deny(
|
||||
f"Maximum {max_subtasks} concurrent sub-tasks. "
|
||||
"Wait for running sub-tasks to finish, "
|
||||
f"Maximum {max_subtasks} concurrent sub-agents. "
|
||||
"Wait for running sub-agents to finish, "
|
||||
"or continue in the main conversation."
|
||||
),
|
||||
)
|
||||
@@ -208,20 +245,20 @@ def create_security_hooks(
|
||||
if result:
|
||||
return cast(SyncHookJSONOutput, result)
|
||||
|
||||
# Reserve the Task slot only after all validations pass
|
||||
if tool_name == "Task" and tool_use_id is not None:
|
||||
task_tool_use_ids.add(tool_use_id)
|
||||
# Reserve the sub-agent slot only after all validations pass
|
||||
if tool_name in _SUBAGENT_TOOLS and tool_use_id is not None:
|
||||
subagent_tool_use_ids.add(tool_use_id)
|
||||
|
||||
logger.debug(f"[SDK] Tool start: {tool_name}, user={user_id}")
|
||||
return cast(SyncHookJSONOutput, {})
|
||||
|
||||
def _release_task_slot(tool_name: str, tool_use_id: str | None) -> None:
|
||||
"""Release a Task concurrency slot if one was reserved."""
|
||||
if tool_name == "Task" and tool_use_id in task_tool_use_ids:
|
||||
task_tool_use_ids.discard(tool_use_id)
|
||||
def _release_subagent_slot(tool_name: str, tool_use_id: str | None) -> None:
|
||||
"""Release a sub-agent concurrency slot if one was reserved."""
|
||||
if tool_name in _SUBAGENT_TOOLS and tool_use_id in subagent_tool_use_ids:
|
||||
subagent_tool_use_ids.discard(tool_use_id)
|
||||
logger.info(
|
||||
"[SDK] Task slot released, active=%d/%d, user=%s",
|
||||
len(task_tool_use_ids),
|
||||
"[SDK] Sub-agent slot released, active=%d/%d, user=%s",
|
||||
len(subagent_tool_use_ids),
|
||||
max_subtasks,
|
||||
user_id,
|
||||
)
|
||||
@@ -241,13 +278,14 @@ def create_security_hooks(
|
||||
_ = context
|
||||
tool_name = cast(str, input_data.get("tool_name", ""))
|
||||
|
||||
_release_task_slot(tool_name, tool_use_id)
|
||||
_release_subagent_slot(tool_name, tool_use_id)
|
||||
is_builtin = not tool_name.startswith(MCP_TOOL_PREFIX)
|
||||
safe_tool_use_id = _sanitize(str(tool_use_id or ""), max_len=12)
|
||||
logger.info(
|
||||
"[SDK] PostToolUse: %s (builtin=%s, tool_use_id=%s)",
|
||||
tool_name,
|
||||
is_builtin,
|
||||
(tool_use_id or "")[:12],
|
||||
safe_tool_use_id,
|
||||
)
|
||||
|
||||
# Stash output for SDK built-in tools so the response adapter can
|
||||
@@ -256,7 +294,7 @@ def create_security_hooks(
|
||||
if is_builtin:
|
||||
tool_response = input_data.get("tool_response")
|
||||
if tool_response is not None:
|
||||
resp_preview = str(tool_response)[:100]
|
||||
resp_preview = _sanitize(str(tool_response), max_len=100)
|
||||
logger.info(
|
||||
"[SDK] Stashing builtin output for %s (%d chars): %s...",
|
||||
tool_name,
|
||||
@@ -280,13 +318,17 @@ def create_security_hooks(
|
||||
"""Log failed tool executions for debugging."""
|
||||
_ = context
|
||||
tool_name = cast(str, input_data.get("tool_name", ""))
|
||||
error = input_data.get("error", "Unknown error")
|
||||
error = _sanitize(str(input_data.get("error", "Unknown error")))
|
||||
safe_tool_use_id = _sanitize(str(tool_use_id or ""))
|
||||
logger.warning(
|
||||
f"[SDK] Tool failed: {tool_name}, error={error}, "
|
||||
f"user={user_id}, tool_use_id={tool_use_id}"
|
||||
"[SDK] Tool failed: %s, error=%s, user=%s, tool_use_id=%s",
|
||||
tool_name,
|
||||
error,
|
||||
user_id,
|
||||
safe_tool_use_id,
|
||||
)
|
||||
|
||||
_release_task_slot(tool_name, tool_use_id)
|
||||
_release_subagent_slot(tool_name, tool_use_id)
|
||||
|
||||
return cast(SyncHookJSONOutput, {})
|
||||
|
||||
@@ -301,16 +343,14 @@ def create_security_hooks(
|
||||
This hook provides visibility into when compaction happens.
|
||||
"""
|
||||
_ = context, tool_use_id
|
||||
trigger = input_data.get("trigger", "auto")
|
||||
trigger = _sanitize(str(input_data.get("trigger", "auto")), max_len=50)
|
||||
# Sanitize untrusted input: strip control chars for logging AND
|
||||
# for the value passed downstream. read_compacted_entries()
|
||||
# validates against _projects_base() as defence-in-depth, but
|
||||
# sanitizing here prevents log injection and rejects obviously
|
||||
# malformed paths early.
|
||||
transcript_path = (
|
||||
str(input_data.get("transcript_path", ""))
|
||||
.replace("\n", "")
|
||||
.replace("\r", "")
|
||||
transcript_path = _sanitize(
|
||||
str(input_data.get("transcript_path", "")), max_len=500
|
||||
)
|
||||
logger.info(
|
||||
"[SDK] Context compaction triggered: %s, user=%s, transcript_path=%s",
|
||||
@@ -322,6 +362,44 @@ def create_security_hooks(
|
||||
on_compact(transcript_path)
|
||||
return cast(SyncHookJSONOutput, {})
|
||||
|
||||
async def subagent_start_hook(
    input_data: HookInput,
    tool_use_id: str | None,
    context: HookContext,
) -> SyncHookJSONOutput:
    """Emit an info log line when the SDK reports a sub-agent start.

    Reads ``agent_id`` and ``agent_type`` from *input_data* (falling back
    to ``"?"`` when absent), sanitizes both before logging, and returns an
    empty hook response. ``tool_use_id`` and ``context`` are unused.
    """
    _ = context, tool_use_id
    # Sanitize in the same order the fields appear in the log line.
    safe_id, safe_type = (
        _sanitize(str(input_data.get(field, "?")))
        for field in ("agent_id", "agent_type")
    )
    logger.info(
        "[SDK] SubagentStart: agent_id=%s, type=%s, user=%s",
        safe_id,
        safe_type,
        user_id,
    )
    return cast(SyncHookJSONOutput, {})
|
||||
|
||||
async def subagent_stop_hook(
    input_data: HookInput,
    tool_use_id: str | None,
    context: HookContext,
) -> SyncHookJSONOutput:
    """Emit an info log line when the SDK reports a sub-agent stop.

    Reads ``agent_id`` and ``agent_type`` (default ``"?"``) and
    ``agent_transcript_path`` (default ``""``) from *input_data*,
    sanitizes each before logging, and returns an empty hook response.
    ``tool_use_id`` and ``context`` are unused.
    """
    _ = context, tool_use_id

    def _field(key: str, default: str, limit: int = 200) -> str:
        # Stringify first so non-str payload values cannot break sanitizing.
        return _sanitize(str(input_data.get(key, default)), max_len=limit)

    safe_id = _field("agent_id", "?")
    safe_type = _field("agent_type", "?")
    # Paths get a larger budget than identifiers before truncation.
    safe_transcript = _field("agent_transcript_path", "", limit=500)
    logger.info(
        "[SDK] SubagentStop: agent_id=%s, type=%s, user=%s, transcript=%s",
        safe_id,
        safe_type,
        user_id,
        safe_transcript,
    )
    return cast(SyncHookJSONOutput, {})
|
||||
|
||||
hooks: dict[str, Any] = {
|
||||
"PreToolUse": [HookMatcher(matcher="*", hooks=[pre_tool_use_hook])],
|
||||
"PostToolUse": [HookMatcher(matcher="*", hooks=[post_tool_use_hook])],
|
||||
@@ -329,6 +407,8 @@ def create_security_hooks(
|
||||
HookMatcher(matcher="*", hooks=[post_tool_failure_hook])
|
||||
],
|
||||
"PreCompact": [HookMatcher(matcher="*", hooks=[pre_compact_hook])],
|
||||
"SubagentStart": [HookMatcher(matcher="*", hooks=[subagent_start_hook])],
|
||||
"SubagentStop": [HookMatcher(matcher="*", hooks=[subagent_stop_hook])],
|
||||
}
|
||||
|
||||
return hooks
|
||||
|
||||
@@ -5,6 +5,7 @@ They validate that the security hooks correctly block unauthorized paths,
|
||||
tool access, and dangerous input patterns.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
|
||||
import pytest
|
||||
@@ -245,16 +246,15 @@ def _hooks():
|
||||
|
||||
@pytest.mark.skipif(not _sdk_available(), reason="claude_agent_sdk not installed")
|
||||
@pytest.mark.asyncio
|
||||
async def test_task_background_blocked(_hooks):
|
||||
"""Task with run_in_background=true must be denied."""
|
||||
async def test_task_background_allowed(_hooks):
|
||||
"""Task with run_in_background=true is allowed (SDK handles async lifecycle)."""
|
||||
pre, _, _ = _hooks
|
||||
result = await pre(
|
||||
{"tool_name": "Task", "tool_input": {"run_in_background": True, "prompt": "x"}},
|
||||
tool_use_id=None,
|
||||
tool_use_id="tu-bg-1",
|
||||
context={},
|
||||
)
|
||||
assert _is_denied(result)
|
||||
assert "foreground" in _reason(result).lower()
|
||||
assert not _is_denied(result)
|
||||
|
||||
|
||||
@pytest.mark.skipif(not _sdk_available(), reason="claude_agent_sdk not installed")
|
||||
@@ -368,3 +368,303 @@ async def test_task_slot_released_on_failure(_hooks):
|
||||
context={},
|
||||
)
|
||||
assert not _is_denied(result)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# "Agent" tool name (SDK v2.x+ renamed "Task" → "Agent")
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.skipif(not _sdk_available(), reason="claude_agent_sdk not installed")
|
||||
@pytest.mark.asyncio
|
||||
async def test_agent_background_allowed(_hooks):
|
||||
"""Agent with run_in_background=true is allowed (SDK handles async lifecycle)."""
|
||||
pre, _, _ = _hooks
|
||||
result = await pre(
|
||||
{
|
||||
"tool_name": "Agent",
|
||||
"tool_input": {"run_in_background": True, "prompt": "x"},
|
||||
},
|
||||
tool_use_id="tu-agent-bg-1",
|
||||
context={},
|
||||
)
|
||||
assert not _is_denied(result)
|
||||
|
||||
|
||||
@pytest.mark.skipif(not _sdk_available(), reason="claude_agent_sdk not installed")
|
||||
@pytest.mark.asyncio
|
||||
async def test_agent_foreground_allowed(_hooks):
|
||||
"""Agent without run_in_background should be allowed."""
|
||||
pre, _, _ = _hooks
|
||||
result = await pre(
|
||||
{"tool_name": "Agent", "tool_input": {"prompt": "do stuff"}},
|
||||
tool_use_id="tu-agent-1",
|
||||
context={},
|
||||
)
|
||||
assert not _is_denied(result)
|
||||
|
||||
|
||||
@pytest.mark.skipif(not _sdk_available(), reason="claude_agent_sdk not installed")
|
||||
@pytest.mark.asyncio
|
||||
async def test_background_agent_counts_against_limit(_hooks):
|
||||
"""Background agents still consume concurrency slots."""
|
||||
pre, _, _ = _hooks
|
||||
# Two background agents fill the limit
|
||||
for i in range(2):
|
||||
result = await pre(
|
||||
{
|
||||
"tool_name": "Agent",
|
||||
"tool_input": {"run_in_background": True, "prompt": "bg"},
|
||||
},
|
||||
tool_use_id=f"tu-bglimit-{i}",
|
||||
context={},
|
||||
)
|
||||
assert not _is_denied(result)
|
||||
# Third (background or foreground) should be denied
|
||||
result = await pre(
|
||||
{
|
||||
"tool_name": "Agent",
|
||||
"tool_input": {"run_in_background": True, "prompt": "over"},
|
||||
},
|
||||
tool_use_id="tu-bglimit-2",
|
||||
context={},
|
||||
)
|
||||
assert _is_denied(result)
|
||||
assert "Maximum" in _reason(result)
|
||||
|
||||
|
||||
@pytest.mark.skipif(not _sdk_available(), reason="claude_agent_sdk not installed")
|
||||
@pytest.mark.asyncio
|
||||
async def test_agent_limit_enforced(_hooks):
|
||||
"""Agent spawns beyond max_subtasks should be denied."""
|
||||
pre, _, _ = _hooks
|
||||
# First two should pass
|
||||
for i in range(2):
|
||||
result = await pre(
|
||||
{"tool_name": "Agent", "tool_input": {"prompt": "ok"}},
|
||||
tool_use_id=f"tu-agent-limit-{i}",
|
||||
context={},
|
||||
)
|
||||
assert not _is_denied(result)
|
||||
|
||||
# Third should be denied (limit=2)
|
||||
result = await pre(
|
||||
{"tool_name": "Agent", "tool_input": {"prompt": "over limit"}},
|
||||
tool_use_id="tu-agent-limit-2",
|
||||
context={},
|
||||
)
|
||||
assert _is_denied(result)
|
||||
assert "Maximum" in _reason(result)
|
||||
|
||||
|
||||
@pytest.mark.skipif(not _sdk_available(), reason="claude_agent_sdk not installed")
|
||||
@pytest.mark.asyncio
|
||||
async def test_agent_slot_released_on_completion(_hooks):
|
||||
"""Completing an Agent should free a slot so new Agents can be spawned."""
|
||||
pre, post, _ = _hooks
|
||||
# Fill both slots
|
||||
for i in range(2):
|
||||
result = await pre(
|
||||
{"tool_name": "Agent", "tool_input": {"prompt": "ok"}},
|
||||
tool_use_id=f"tu-agent-comp-{i}",
|
||||
context={},
|
||||
)
|
||||
assert not _is_denied(result)
|
||||
|
||||
# Third should be denied — at capacity
|
||||
result = await pre(
|
||||
{"tool_name": "Agent", "tool_input": {"prompt": "over"}},
|
||||
tool_use_id="tu-agent-comp-2",
|
||||
context={},
|
||||
)
|
||||
assert _is_denied(result)
|
||||
|
||||
# Complete first agent — frees a slot
|
||||
await post(
|
||||
{"tool_name": "Agent", "tool_input": {}},
|
||||
tool_use_id="tu-agent-comp-0",
|
||||
context={},
|
||||
)
|
||||
|
||||
# Now a new Agent should be allowed
|
||||
result = await pre(
|
||||
{"tool_name": "Agent", "tool_input": {"prompt": "after release"}},
|
||||
tool_use_id="tu-agent-comp-3",
|
||||
context={},
|
||||
)
|
||||
assert not _is_denied(result)
|
||||
|
||||
|
||||
@pytest.mark.skipif(not _sdk_available(), reason="claude_agent_sdk not installed")
|
||||
@pytest.mark.asyncio
|
||||
async def test_agent_slot_released_on_failure(_hooks):
|
||||
"""A failed Agent should also free its concurrency slot."""
|
||||
pre, _, post_failure = _hooks
|
||||
# Fill both slots
|
||||
for i in range(2):
|
||||
result = await pre(
|
||||
{"tool_name": "Agent", "tool_input": {"prompt": "ok"}},
|
||||
tool_use_id=f"tu-agent-fail-{i}",
|
||||
context={},
|
||||
)
|
||||
assert not _is_denied(result)
|
||||
|
||||
# At capacity
|
||||
result = await pre(
|
||||
{"tool_name": "Agent", "tool_input": {"prompt": "over"}},
|
||||
tool_use_id="tu-agent-fail-2",
|
||||
context={},
|
||||
)
|
||||
assert _is_denied(result)
|
||||
|
||||
# Fail first agent — should free a slot
|
||||
await post_failure(
|
||||
{"tool_name": "Agent", "tool_input": {}, "error": "something broke"},
|
||||
tool_use_id="tu-agent-fail-0",
|
||||
context={},
|
||||
)
|
||||
|
||||
# New Agent should be allowed
|
||||
result = await pre(
|
||||
{"tool_name": "Agent", "tool_input": {"prompt": "after failure"}},
|
||||
tool_use_id="tu-agent-fail-3",
|
||||
context={},
|
||||
)
|
||||
assert not _is_denied(result)
|
||||
|
||||
|
||||
@pytest.mark.skipif(not _sdk_available(), reason="claude_agent_sdk not installed")
|
||||
@pytest.mark.asyncio
|
||||
async def test_mixed_task_agent_share_slots(_hooks):
|
||||
"""Task and Agent share the same concurrency pool."""
|
||||
pre, post, _ = _hooks
|
||||
# Fill one slot with Task, one with Agent
|
||||
result = await pre(
|
||||
{"tool_name": "Task", "tool_input": {"prompt": "ok"}},
|
||||
tool_use_id="tu-mix-task",
|
||||
context={},
|
||||
)
|
||||
assert not _is_denied(result)
|
||||
|
||||
result = await pre(
|
||||
{"tool_name": "Agent", "tool_input": {"prompt": "ok"}},
|
||||
tool_use_id="tu-mix-agent",
|
||||
context={},
|
||||
)
|
||||
assert not _is_denied(result)
|
||||
|
||||
# Third (either name) should be denied
|
||||
result = await pre(
|
||||
{"tool_name": "Agent", "tool_input": {"prompt": "over"}},
|
||||
tool_use_id="tu-mix-over",
|
||||
context={},
|
||||
)
|
||||
assert _is_denied(result)
|
||||
|
||||
# Release the Task slot
|
||||
await post(
|
||||
{"tool_name": "Task", "tool_input": {}},
|
||||
tool_use_id="tu-mix-task",
|
||||
context={},
|
||||
)
|
||||
|
||||
# Now an Agent should be allowed
|
||||
result = await pre(
|
||||
{"tool_name": "Agent", "tool_input": {"prompt": "after task release"}},
|
||||
tool_use_id="tu-mix-new",
|
||||
context={},
|
||||
)
|
||||
assert not _is_denied(result)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# SubagentStart / SubagentStop hooks
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def _subagent_hooks():
|
||||
"""Create hooks and return (subagent_start, subagent_stop) handlers."""
|
||||
hooks = create_security_hooks(user_id="u1", sdk_cwd=SDK_CWD, max_subtasks=2)
|
||||
start = hooks["SubagentStart"][0].hooks[0]
|
||||
stop = hooks["SubagentStop"][0].hooks[0]
|
||||
return start, stop
|
||||
|
||||
|
||||
@pytest.mark.skipif(not _sdk_available(), reason="claude_agent_sdk not installed")
|
||||
@pytest.mark.asyncio
|
||||
async def test_subagent_start_hook_returns_empty(_subagent_hooks):
|
||||
"""SubagentStart hook should return an empty dict (logging only)."""
|
||||
start, _ = _subagent_hooks
|
||||
result = await start(
|
||||
{"agent_id": "sa-123", "agent_type": "research"},
|
||||
tool_use_id=None,
|
||||
context={},
|
||||
)
|
||||
assert result == {}
|
||||
|
||||
|
||||
@pytest.mark.skipif(not _sdk_available(), reason="claude_agent_sdk not installed")
|
||||
@pytest.mark.asyncio
|
||||
async def test_subagent_stop_hook_returns_empty(_subagent_hooks):
|
||||
"""SubagentStop hook should return an empty dict (logging only)."""
|
||||
_, stop = _subagent_hooks
|
||||
result = await stop(
|
||||
{
|
||||
"agent_id": "sa-123",
|
||||
"agent_type": "research",
|
||||
"agent_transcript_path": "/tmp/transcript.txt",
|
||||
},
|
||||
tool_use_id=None,
|
||||
context={},
|
||||
)
|
||||
assert result == {}
|
||||
|
||||
|
||||
@pytest.mark.skipif(not _sdk_available(), reason="claude_agent_sdk not installed")
|
||||
@pytest.mark.asyncio
|
||||
async def test_subagent_hooks_sanitize_inputs(_subagent_hooks, caplog):
|
||||
"""SubagentStart/Stop should sanitize control chars from inputs."""
|
||||
start, stop = _subagent_hooks
|
||||
# Inject control characters (C0, DEL, C1, BiDi overrides, zero-width)
|
||||
# — hook should not raise AND logs must be clean
|
||||
with caplog.at_level(logging.DEBUG, logger="backend.copilot.sdk.security_hooks"):
|
||||
result = await start(
|
||||
{
|
||||
"agent_id": "sa\n-injected\r\x00\x7f",
|
||||
"agent_type": "safe\x80_type\x9f\ttab",
|
||||
},
|
||||
tool_use_id=None,
|
||||
context={},
|
||||
)
|
||||
assert result == {}
|
||||
# Control chars must be stripped from the logged values
|
||||
for record in caplog.records:
|
||||
assert "\x00" not in record.message
|
||||
assert "\r" not in record.message
|
||||
assert "\n" not in record.message
|
||||
assert "\x7f" not in record.message
|
||||
assert "\x80" not in record.message
|
||||
assert "\x9f" not in record.message
|
||||
assert "safe_type" in caplog.text
|
||||
|
||||
caplog.clear()
|
||||
with caplog.at_level(logging.DEBUG, logger="backend.copilot.sdk.security_hooks"):
|
||||
result = await stop(
|
||||
{
|
||||
"agent_id": "sa\n-injected\x7f",
|
||||
"agent_type": "type\r\x80\x9f",
|
||||
"agent_transcript_path": "/tmp/\x00malicious\npath\u202a\u200b",
|
||||
},
|
||||
tool_use_id=None,
|
||||
context={},
|
||||
)
|
||||
assert result == {}
|
||||
for record in caplog.records:
|
||||
assert "\x00" not in record.message
|
||||
assert "\r" not in record.message
|
||||
assert "\n" not in record.message
|
||||
assert "\x7f" not in record.message
|
||||
assert "\u202a" not in record.message
|
||||
assert "\u200b" not in record.message
|
||||
assert "/tmp/maliciouspath" in caplog.text
|
||||
|
||||
@@ -1884,7 +1884,10 @@ async def stream_chat_completion_sdk(
|
||||
)
|
||||
|
||||
# Fail fast when no API credentials are available at all.
|
||||
sdk_env = build_sdk_env(session_id=session_id, user_id=user_id)
|
||||
# sdk_cwd routes the CLI's temp dir into the per-session workspace
|
||||
# so sub-agent output files land inside sdk_cwd (see build_sdk_env).
|
||||
sdk_env = build_sdk_env(session_id=session_id, user_id=user_id, sdk_cwd=sdk_cwd)
|
||||
|
||||
if not config.api_key and not config.use_claude_code_subscription:
|
||||
raise RuntimeError(
|
||||
"No API key configured. Set OPEN_ROUTER_API_KEY, "
|
||||
|
||||
@@ -590,13 +590,14 @@ def create_copilot_mcp_server(*, use_e2b: bool = False):
|
||||
# Security hooks validate that file paths stay within sdk_cwd.
|
||||
# Bash is NOT included — use the sandboxed MCP bash_exec tool instead,
|
||||
# which provides kernel-level network isolation via unshare --net.
|
||||
# Task allows spawning sub-agents (rate-limited by security hooks).
|
||||
# Task/Agent allows spawning sub-agents (rate-limited by security hooks).
|
||||
# The CLI renamed "Task" → "Agent" in v2.x; both are listed for compat.
|
||||
# WebSearch uses Brave Search via Anthropic's API — safe, no SSRF risk.
|
||||
# TodoWrite manages the task checklist shown in the UI — no security concern.
|
||||
# In E2B mode, all five are disabled — MCP equivalents provide direct sandbox
|
||||
# access. read_file also handles local tool-results and ephemeral reads.
|
||||
_SDK_BUILTIN_FILE_TOOLS = ["Read", "Write", "Edit", "Glob", "Grep"]
|
||||
_SDK_BUILTIN_ALWAYS = ["Task", "WebSearch", "TodoWrite"]
|
||||
_SDK_BUILTIN_ALWAYS = ["Task", "Agent", "WebSearch", "TodoWrite"]
|
||||
_SDK_BUILTIN_TOOLS = [*_SDK_BUILTIN_FILE_TOOLS, *_SDK_BUILTIN_ALWAYS]
|
||||
|
||||
# SDK built-in tools that must be explicitly blocked.
|
||||
|
||||
@@ -15,6 +15,7 @@ import {
|
||||
MagnifyingGlassIcon,
|
||||
MonitorIcon,
|
||||
PencilSimpleIcon,
|
||||
RobotIcon,
|
||||
TerminalIcon,
|
||||
TrashIcon,
|
||||
WarningDiamondIcon,
|
||||
@@ -32,6 +33,7 @@ import {
|
||||
} from "@/components/contextual/OutputRenderers";
|
||||
import type { OutputMetadata } from "@/components/contextual/OutputRenderers";
|
||||
import {
|
||||
TOOL_TASK_OUTPUT,
|
||||
type ToolCategory,
|
||||
extractToolName,
|
||||
getAnimationText,
|
||||
@@ -109,6 +111,8 @@ function ToolIcon({
|
||||
return (
|
||||
<ArrowsClockwiseIcon size={14} weight="regular" className={iconClass} />
|
||||
);
|
||||
case "agent":
|
||||
return <RobotIcon size={14} weight="regular" className={iconClass} />;
|
||||
default:
|
||||
return <GearIcon size={14} weight="regular" className={iconClass} />;
|
||||
}
|
||||
@@ -141,6 +145,8 @@ function AccordionIcon({ category }: { category: ToolCategory }) {
|
||||
return <ListChecksIcon size={32} weight="light" />;
|
||||
case "compaction":
|
||||
return <ArrowsClockwiseIcon size={32} weight="light" />;
|
||||
case "agent":
|
||||
return <RobotIcon size={32} weight="light" />;
|
||||
default:
|
||||
return <GearIcon size={32} weight="light" />;
|
||||
}
|
||||
@@ -557,6 +563,53 @@ function getTodoAccordionData(input: unknown): AccordionData {
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Builds the accordion title, description and content for sub-agent tools.
 *
 * When `toolName` is `TOOL_TASK_OUTPUT` the accordion shows the retrieved
 * `task` payload, or a placeholder message when there is none (with a
 * dedicated message for a `"timeout"` retrieval status). For any other tool
 * name it shows either a "started in background" notice, when the output is
 * flagged as async, or the raw output as formatted JSON.
 *
 * @param toolName - Name of the tool that produced the output.
 * @param input - Raw tool input; treated as an empty object when it is not an object.
 * @param output - Tool output record.
 * @returns Accordion data for rendering.
 */
function getAgentAccordionData(
  toolName: string,
  input: unknown,
  output: Record<string, unknown>,
): AccordionData {
  const inp: Record<string, unknown> =
    input && typeof input === "object"
      ? (input as Record<string, unknown>)
      : {};

  if (toolName === TOOL_TASK_OUTPUT) {
    const timedOut = getStringField(output, "retrieval_status") === "timeout";
    const retrievedTask = output.task;
    const agentLabel =
      typeof inp.agentId === "string" ? `Agent: ${inp.agentId}` : undefined;
    const placeholder = timedOut
      ? "The agent hasn't finished yet. Results will appear automatically when it's done."
      : "No result available.";

    return {
      title: timedOut ? "Agent still running" : "Agent result",
      description: agentLabel,
      content: retrievedTask ? (
        <ContentCodeBlock>
          {JSON.stringify(retrievedTask, null, 2)}
        </ContentCodeBlock>
      ) : (
        <ContentMessage>{placeholder}</ContentMessage>
      ),
    };
  }

  // Only the non-TaskOutput path needs to know whether the agent was
  // launched in the background.
  const launchedInBackground =
    output.isAsync === true || output.status === "async_launched";
  const summary =
    getStringField(inp, "description") ?? getStringField(output, "description");
  const agentId = getStringField(output, "agentId");
  const subtitle = summary ?? agentId ?? undefined;

  if (launchedInBackground) {
    return {
      title: "Agent started (background)",
      description: subtitle,
      content: (
        <ContentMessage>
          Running in the background. Results will appear here when ready.
        </ContentMessage>
      ),
    };
  }

  return {
    title: "Agent completed",
    description: subtitle,
    content: (
      <ContentCodeBlock>{JSON.stringify(output, null, 2)}</ContentCodeBlock>
    ),
  };
}
|
||||
|
||||
function getDefaultAccordionData(
|
||||
output: Record<string, unknown>,
|
||||
): AccordionData {
|
||||
@@ -608,6 +661,8 @@ function getAccordionData(
|
||||
return getFileAccordionData(category, input, output);
|
||||
case "todo":
|
||||
return getTodoAccordionData(input);
|
||||
case "agent":
|
||||
return getAgentAccordionData(toolName, input, output);
|
||||
default:
|
||||
return getDefaultAccordionData(output);
|
||||
}
|
||||
|
||||
@@ -1,5 +1,13 @@
|
||||
import type { ToolUIPart } from "ai";
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* Sub-agent tool name constants */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
export const TOOL_AGENT = "Agent";
|
||||
export const TOOL_TASK = "Task";
|
||||
export const TOOL_TASK_OUTPUT = "TaskOutput";
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* Tool name helpers */
|
||||
/* ------------------------------------------------------------------ */
|
||||
@@ -28,6 +36,7 @@ export type ToolCategory =
|
||||
| "edit"
|
||||
| "todo"
|
||||
| "compaction"
|
||||
| "agent"
|
||||
| "other";
|
||||
|
||||
export function getToolCategory(toolName: string): ToolCategory {
|
||||
@@ -66,6 +75,10 @@ export function getToolCategory(toolName: string): ToolCategory {
|
||||
return "todo";
|
||||
case "context_compaction":
|
||||
return "compaction";
|
||||
case TOOL_AGENT:
|
||||
case TOOL_TASK:
|
||||
case TOOL_TASK_OUTPUT:
|
||||
return "agent";
|
||||
default:
|
||||
return "other";
|
||||
}
|
||||
@@ -134,6 +147,15 @@ function getInputSummary(toolName: string, input: unknown): string | null {
|
||||
if (active && typeof active.content === "string") return active.content;
|
||||
return null;
|
||||
}
|
||||
case TOOL_AGENT:
|
||||
case TOOL_TASK:
|
||||
return typeof inp.description === "string"
|
||||
? inp.description
|
||||
: typeof inp.prompt === "string"
|
||||
? truncate(inp.prompt, 60)
|
||||
: null;
|
||||
case TOOL_TASK_OUTPUT:
|
||||
return typeof inp.agentId === "string" ? inp.agentId : null;
|
||||
default:
|
||||
return null;
|
||||
}
|
||||
@@ -235,6 +257,14 @@ export function getAnimationText(
|
||||
return shortSummary ? `${shortSummary}` : "Updating task list\u2026";
|
||||
case "compaction":
|
||||
return "Summarizing earlier messages\u2026";
|
||||
case "agent":
|
||||
if (toolName === TOOL_TASK_OUTPUT)
|
||||
return shortSummary
|
||||
? `Checking agent ${shortSummary}\u2026`
|
||||
: "Checking agent result\u2026";
|
||||
return shortSummary
|
||||
? `Running agent: ${shortSummary}`
|
||||
: "Starting agent\u2026";
|
||||
default:
|
||||
return `Running ${formatToolName(toolName)}\u2026`;
|
||||
}
|
||||
@@ -288,6 +318,28 @@ export function getAnimationText(
|
||||
return "Updated task list";
|
||||
case "compaction":
|
||||
return "Earlier messages were summarized";
|
||||
case "agent": {
|
||||
if (toolName === TOOL_TASK_OUTPUT) {
|
||||
const taskOut =
|
||||
part.output && typeof part.output === "object"
|
||||
? (part.output as Record<string, unknown>)
|
||||
: null;
|
||||
if (taskOut?.retrieval_status === "timeout")
|
||||
return "Agent still running\u2026";
|
||||
return "Agent result received";
|
||||
}
|
||||
const agentOut =
|
||||
part.output && typeof part.output === "object"
|
||||
? (part.output as Record<string, unknown>)
|
||||
: null;
|
||||
if (agentOut?.isAsync || agentOut?.status === "async_launched")
|
||||
return shortSummary
|
||||
? `Agent started (background): ${shortSummary}`
|
||||
: "Agent started in background";
|
||||
return shortSummary
|
||||
? `Agent completed: ${shortSummary}`
|
||||
: "Agent completed";
|
||||
}
|
||||
default:
|
||||
return `${formatToolName(toolName)} completed`;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user