From 52b3aebf7187386514f4b4de3eb52e06bd8dfd76 Mon Sep 17 00:00:00 2001 From: Zamil Majdy Date: Fri, 13 Feb 2026 19:49:03 +0400 Subject: [PATCH] feat(backend/sdk): Claude Agent SDK integration for CoPilot (#12103) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary Full integration of the **Claude Agent SDK** to replace the existing one-turn OpenAI-compatible CoPilot implementation with a multi-turn, tool-using AI agent. ### What changed **Core SDK Integration** (`chat/sdk/` — new module) - **`service.py`**: Main orchestrator — spawns Claude Code CLI as a subprocess per user message, streams responses back via SSE. Handles conversation history compression, session lifecycle, and error recovery. - **`response_adapter.py`**: Translates Claude Agent SDK events (text deltas, tool use, errors, result messages) into the existing CoPilot `StreamEvent` protocol so the frontend works unchanged. - **`tool_adapter.py`**: Bridges CoPilot's MCP tools (find_block, run_block, create_agent, etc.) into the SDK's tool format. Handles schema conversion and result serialization. - **`security_hooks.py`**: Pre/Post tool-use hooks that enforce a strict allowlist of tools, block path traversal, sandbox file operations to per-session workspace directories, cap sub-agent spawning, and prevent the model from accessing unauthorized system resources. - **`transcript.py`**: JSONL transcript I/O utilities for the stateless `--resume` feature (see below). **Stateless Multi-Turn Resume** (new) - Instead of compressing conversation history via LLM on every turn (lossy and expensive), we capture Claude Code's native JSONL session transcript via a **Stop hook** callback, persist it in the DB (`ChatSession.sdkTranscript`), and restore it on the next turn via `--resume `. - This preserves full tool call/result context across turns with zero token overhead for history. - Feature-flagged via `CLAUDE_AGENT_USE_RESUME` (default: off). 
- DB migration: `ALTER TABLE "ChatSession" ADD COLUMN "sdkTranscript" TEXT`. **Sandboxed Tool Execution** (`chat/tools/`) - **`bash_exec.py`**: Sandboxed bash execution using bubblewrap (`bwrap`) with read-only root filesystem, per-session writable workspace, resource limits (CPU, memory, file size), and network isolation. - **`sandbox.py`**: Shared bubblewrap sandbox infrastructure — generates `bwrap` command lines with configurable mounts, environment, and resource constraints. - **`web_fetch.py`**: URL fetching tool with domain allowlist, size limits, and content-type filtering. - **`check_operation_status.py`**: Polling tool for long-running operations (agent creation, block execution) so the SDK doesn't block waiting. - **`find_block.py`** / **`run_block.py`**: Enhanced with category filtering, optimized response size (removed raw JSON schemas), and better error handling. **Security** - Path traversal prevention: session IDs sanitized, all file ops confined to workspace dirs, symlink resolution. - Tool allowlist enforcement via SDK hooks — model cannot call arbitrary tools. - Built-in `Bash` tool blocked via `disallowed_tools` to prevent bypassing sandboxed `bash_exec`. - Sub-agent (`Task`) spawning capped at configurable limit (default: 10). - CodeQL-clean path sanitization patterns. **Streaming & Reconnection** - SSE stream registry backed by Redis Streams for crash-resilient reconnection. - Long-running operation tracking with TTL-based cleanup. - Atomic message append to prevent race conditions on concurrent writes. 
**Configuration** (`config.py`) - `use_claude_agent_sdk` — master toggle (default: on) - `claude_agent_model` — model override for SDK path - `claude_agent_max_buffer_size` — JSON parsing buffer (10MB) - `claude_agent_max_subtasks` — sub-agent cap (10) - `claude_agent_use_resume` — transcript-based resume (default: off) - `thinking_enabled` — extended thinking for Claude models **Tests** - `sdk/response_adapter_test.py` — 366 lines covering all event translation paths - `sdk/security_hooks_test.py` — 165 lines covering tool blocking, path traversal, subtask limits - `chat/model_test.py` — 214 lines covering session model serialization - `chat/service_test.py` — Integration tests including multi-turn resume keyword recall - `tools/find_block_test.py` / `run_block_test.py` — Extended with new tool behavior tests ## Test plan - [x] Unit tests pass (`sdk/response_adapter_test.py`, `security_hooks_test.py`, `model_test.py`) - [x] Integration test: multi-turn keyword recall via `--resume` (`service_test.py::test_sdk_resume_multi_turn`) - [x] Manual E2E: CoPilot chat sessions with tool calls, bash execution, and multi-turn context - [x] Pre-commit hooks pass (ruff, isort, black, pyright, flake8) - [ ] Staging deployment with `claude_agent_use_resume=false` initially - [ ] Enable resume in staging, verify transcript capture and recall

Greptile Overview

Greptile Summary

This PR replaces the existing OpenAI-compatible CoPilot with a full Claude Agent SDK integration, introducing multi-turn conversations, stateless resume via JSONL transcripts, and sandboxed tool execution. **Key changes:** - **SDK integration** (`chat/sdk/`): spawns Claude Code CLI subprocess per message, translates events to frontend protocol, bridges MCP tools - **Stateless resume**: captures JSONL transcripts via Stop hook, persists in `ChatSession.sdkTranscript`, restores with `--resume` (feature-flagged, default off) - **Sandboxed execution**: bubblewrap sandbox for bash commands with filesystem whitelist, network isolation, resource limits - **Security hooks**: tool allowlist enforcement, path traversal prevention, workspace-scoped file operations, sub-agent spawn limits - **Long-running operations**: delegates `create_agent`/`edit_agent` to existing stream_registry infrastructure for SSE reconnection - **Feature flag**: `CHAT_USE_CLAUDE_AGENT_SDK` with LaunchDarkly support, defaults to enabled **Security issues found:** - Path traversal validation has logic errors in `security_hooks.py:82` (tilde expansion order) and `service.py:266` (redundant `..` check) - Config validator always prefers env var over explicit `False` value (`config.py:162`) - Race condition in `routes.py:323` — message persisted before task registration, could duplicate on retry - Resource limits in sandbox may fail silently (`sandbox.py:109`) **Test coverage is strong** with 366 lines for response adapter, 165 for security hooks, and integration tests for multi-turn resume.

Confidence Score: 3/5

- This PR is generally safe but has critical security issues in path validation that must be fixed before merge - Score reflects strong architecture and test coverage offset by real security vulnerabilities: the tilde expansion bug in `security_hooks.py` could allow sandbox escape, the race condition could cause message duplication, and the silent ulimit failures could bypass resource limits. The bubblewrap sandbox and allowlist enforcement are well-designed, but the path validation bugs need fixing. The transcript resume feature is properly feature-flagged. Overall the implementation is solid but the security issues prevent a higher score. - Pay close attention to `backend/api/features/chat/sdk/security_hooks.py` (path traversal vulnerability), `backend/api/features/chat/routes.py` (race condition), `backend/api/features/chat/tools/sandbox.py` (silent resource limit failures), and `backend/api/features/chat/sdk/service.py` (redundant security check)

Sequence Diagram

```mermaid sequenceDiagram participant Frontend participant Routes as routes.py participant SDKService as sdk/service.py participant ClaudeSDK as Claude Agent SDK CLI participant SecurityHooks as security_hooks.py participant ToolAdapter as tool_adapter.py participant CoPilotTools as tools/* participant Sandbox as sandbox.py (bwrap) participant DB as Database participant Redis as stream_registry Frontend->>Routes: POST /chat (user message) Routes->>SDKService: stream_chat_completion_sdk() SDKService->>DB: get_chat_session() DB-->>SDKService: session + messages alt Resume enabled AND transcript exists SDKService->>SDKService: validate_transcript() SDKService->>SDKService: write_transcript_to_tempfile() Note over SDKService: Pass --resume to SDK else No resume SDKService->>SDKService: _compress_conversation_history() Note over SDKService: Inject history into user message end SDKService->>SecurityHooks: create_security_hooks() SDKService->>ToolAdapter: create_copilot_mcp_server() SDKService->>ClaudeSDK: spawn subprocess with MCP server loop Streaming Conversation ClaudeSDK->>SDKService: AssistantMessage (text/tool_use) SDKService->>Frontend: StreamTextDelta / StreamToolInputAvailable alt Tool Call ClaudeSDK->>SecurityHooks: PreToolUse hook SecurityHooks->>SecurityHooks: validate path, check allowlist alt Tool blocked SecurityHooks-->>ClaudeSDK: deny else Tool allowed SecurityHooks-->>ClaudeSDK: allow ClaudeSDK->>ToolAdapter: call MCP tool alt Long-running tool (create_agent, edit_agent) ToolAdapter->>Redis: register task ToolAdapter->>DB: save OperationPendingResponse ToolAdapter->>ToolAdapter: spawn background task ToolAdapter-->>ClaudeSDK: OperationStartedResponse else Regular tool (find_block, bash_exec) ToolAdapter->>CoPilotTools: execute() alt bash_exec CoPilotTools->>Sandbox: run_sandboxed() Sandbox->>Sandbox: build bwrap command Note over Sandbox: Network isolation,
filesystem whitelist,
resource limits Sandbox-->>CoPilotTools: stdout, stderr, exit_code end CoPilotTools-->>ToolAdapter: result ToolAdapter->>ToolAdapter: stash full output ToolAdapter-->>ClaudeSDK: MCP response end SecurityHooks->>SecurityHooks: PostToolUse hook (log) end end ClaudeSDK->>SDKService: UserMessage (ToolResultBlock) SDKService->>ToolAdapter: pop_pending_tool_output() SDKService->>Frontend: StreamToolOutputAvailable end ClaudeSDK->>SecurityHooks: Stop hook SecurityHooks->>SDKService: transcript_path callback SDKService->>SDKService: read_transcript_file() SDKService->>DB: save transcript to session.sdkTranscript ClaudeSDK->>SDKService: ResultMessage (success) SDKService->>Frontend: StreamFinish SDKService->>DB: upsert_chat_session() ```
Last reviewed commit: 28c1121 --------- Co-authored-by: Swifty --- autogpt_platform/backend/Dockerfile | 8 +- .../backend/api/features/chat/config.py | 45 +- .../backend/api/features/chat/model.py | 73 +- .../backend/api/features/chat/routes.py | 106 ++- .../backend/api/features/chat/sdk/__init__.py | 14 + .../api/features/chat/sdk/response_adapter.py | 203 +++++ .../chat/sdk/response_adapter_test.py | 366 +++++++++ .../api/features/chat/sdk/security_hooks.py | 335 ++++++++ .../features/chat/sdk/security_hooks_test.py | 165 ++++ .../backend/api/features/chat/sdk/service.py | 751 ++++++++++++++++++ .../api/features/chat/sdk/tool_adapter.py | 322 ++++++++ .../api/features/chat/sdk/transcript.py | 356 +++++++++ .../backend/api/features/chat/service.py | 22 +- .../backend/api/features/chat/service_test.py | 96 +++ .../api/features/chat/stream_registry.py | 22 + .../api/features/chat/tools/__init__.py | 9 + .../api/features/chat/tools/bash_exec.py | 131 +++ .../chat/tools/check_operation_status.py | 127 +++ .../api/features/chat/tools/find_block.py | 1 + .../backend/api/features/chat/tools/models.py | 44 + .../api/features/chat/tools/sandbox.py | 265 ++++++ .../api/features/chat/tools/web_fetch.py | 151 ++++ .../features/chat/tools/workspace_files.py | 14 +- .../backend/backend/util/feature_flag.py | 1 + autogpt_platform/backend/poetry.lock | 94 ++- autogpt_platform/backend/pyproject.toml | 1 + .../backend/test/chat/__init__.py | 0 .../backend/test/chat/test_security_hooks.py | 133 ++++ .../backend/test/chat/test_transcript.py | 255 ++++++ .../ChatMessagesContainer.tsx | 11 + .../copilot/tools/GenericTool/GenericTool.tsx | 63 ++ .../frontend/src/app/api/openapi.json | 58 +- 32 files changed, 4187 insertions(+), 55 deletions(-) create mode 100644 autogpt_platform/backend/backend/api/features/chat/sdk/__init__.py create mode 100644 autogpt_platform/backend/backend/api/features/chat/sdk/response_adapter.py create mode 100644 
autogpt_platform/backend/backend/api/features/chat/sdk/response_adapter_test.py create mode 100644 autogpt_platform/backend/backend/api/features/chat/sdk/security_hooks.py create mode 100644 autogpt_platform/backend/backend/api/features/chat/sdk/security_hooks_test.py create mode 100644 autogpt_platform/backend/backend/api/features/chat/sdk/service.py create mode 100644 autogpt_platform/backend/backend/api/features/chat/sdk/tool_adapter.py create mode 100644 autogpt_platform/backend/backend/api/features/chat/sdk/transcript.py create mode 100644 autogpt_platform/backend/backend/api/features/chat/tools/bash_exec.py create mode 100644 autogpt_platform/backend/backend/api/features/chat/tools/check_operation_status.py create mode 100644 autogpt_platform/backend/backend/api/features/chat/tools/sandbox.py create mode 100644 autogpt_platform/backend/backend/api/features/chat/tools/web_fetch.py create mode 100644 autogpt_platform/backend/test/chat/__init__.py create mode 100644 autogpt_platform/backend/test/chat/test_security_hooks.py create mode 100644 autogpt_platform/backend/test/chat/test_transcript.py create mode 100644 autogpt_platform/frontend/src/app/(platform)/copilot/tools/GenericTool/GenericTool.tsx diff --git a/autogpt_platform/backend/Dockerfile b/autogpt_platform/backend/Dockerfile index ace534b730..05a8d4858b 100644 --- a/autogpt_platform/backend/Dockerfile +++ b/autogpt_platform/backend/Dockerfile @@ -66,13 +66,19 @@ ENV POETRY_HOME=/opt/poetry \ DEBIAN_FRONTEND=noninteractive ENV PATH=/opt/poetry/bin:$PATH -# Install Python, FFmpeg, and ImageMagick (required for video processing blocks) +# Install Python, FFmpeg, ImageMagick, and CLI tools for agent use. +# bubblewrap provides OS-level sandbox (whitelist-only FS + no network) +# for the bash_exec MCP tool. # Using --no-install-recommends saves ~650MB by skipping unnecessary deps like llvm, mesa, etc. 
RUN apt-get update && apt-get install -y --no-install-recommends \ python3.13 \ python3-pip \ ffmpeg \ imagemagick \ + jq \ + ripgrep \ + tree \ + bubblewrap \ && rm -rf /var/lib/apt/lists/* COPY --from=builder /usr/local/lib/python3* /usr/local/lib/python3* diff --git a/autogpt_platform/backend/backend/api/features/chat/config.py b/autogpt_platform/backend/backend/api/features/chat/config.py index 808692f97f..04bbe8e60d 100644 --- a/autogpt_platform/backend/backend/api/features/chat/config.py +++ b/autogpt_platform/backend/backend/api/features/chat/config.py @@ -27,12 +27,11 @@ class ChatConfig(BaseSettings): session_ttl: int = Field(default=43200, description="Session TTL in seconds") # Streaming Configuration - max_context_messages: int = Field( - default=50, ge=1, le=200, description="Maximum context messages" - ) - stream_timeout: int = Field(default=300, description="Stream timeout in seconds") - max_retries: int = Field(default=3, description="Maximum number of retries") + max_retries: int = Field( + default=3, + description="Max retries for fallback path (SDK handles retries internally)", + ) max_agent_runs: int = Field(default=30, description="Maximum number of agent runs") max_agent_schedules: int = Field( default=30, description="Maximum number of agent schedules" @@ -93,6 +92,31 @@ class ChatConfig(BaseSettings): description="Name of the prompt in Langfuse to fetch", ) + # Claude Agent SDK Configuration + use_claude_agent_sdk: bool = Field( + default=True, + description="Use Claude Agent SDK for chat completions", + ) + claude_agent_model: str | None = Field( + default=None, + description="Model for the Claude Agent SDK path. If None, derives from " + "the `model` field by stripping the OpenRouter provider prefix.", + ) + claude_agent_max_buffer_size: int = Field( + default=10 * 1024 * 1024, # 10MB (default SDK is 1MB) + description="Max buffer size in bytes for Claude Agent SDK JSON message parsing. 
" + "Increase if tool outputs exceed the limit.", + ) + claude_agent_max_subtasks: int = Field( + default=10, + description="Max number of sub-agent Tasks the SDK can spawn per session.", + ) + claude_agent_use_resume: bool = Field( + default=True, + description="Use --resume for multi-turn conversations instead of " + "history compression. Falls back to compression when unavailable.", + ) + # Extended thinking configuration for Claude models thinking_enabled: bool = Field( default=True, @@ -138,6 +162,17 @@ class ChatConfig(BaseSettings): v = os.getenv("CHAT_INTERNAL_API_KEY") return v + @field_validator("use_claude_agent_sdk", mode="before") + @classmethod + def get_use_claude_agent_sdk(cls, v): + """Get use_claude_agent_sdk from environment if not provided.""" + # Check environment variable - default to True if not set + env_val = os.getenv("CHAT_USE_CLAUDE_AGENT_SDK", "").lower() + if env_val: + return env_val in ("true", "1", "yes", "on") + # Default to True (SDK enabled by default) + return True if v is None else v + # Prompt paths for different contexts PROMPT_PATHS: dict[str, str] = { "default": "prompts/chat_system.md", diff --git a/autogpt_platform/backend/backend/api/features/chat/model.py b/autogpt_platform/backend/backend/api/features/chat/model.py index 35418f174f..30ac27aece 100644 --- a/autogpt_platform/backend/backend/api/features/chat/model.py +++ b/autogpt_platform/backend/backend/api/features/chat/model.py @@ -334,9 +334,8 @@ async def _get_session_from_cache(session_id: str) -> ChatSession | None: try: session = ChatSession.model_validate_json(raw_session) logger.info( - f"Loading session {session_id} from cache: " - f"message_count={len(session.messages)}, " - f"roles={[m.role for m in session.messages]}" + f"[CACHE] Loaded session {session_id}: {len(session.messages)} messages, " + f"last_roles={[m.role for m in session.messages[-3:]]}" # Last 3 roles ) return session except Exception as e: @@ -378,11 +377,9 @@ async def 
_get_session_from_db(session_id: str) -> ChatSession | None: return None messages = prisma_session.Messages - logger.info( - f"Loading session {session_id} from DB: " - f"has_messages={messages is not None}, " - f"message_count={len(messages) if messages else 0}, " - f"roles={[m.role for m in messages] if messages else []}" + logger.debug( + f"[DB] Loaded session {session_id}: {len(messages) if messages else 0} messages, " + f"roles={[m.role for m in messages[-3:]] if messages else []}" # Last 3 roles ) return ChatSession.from_db(prisma_session, messages) @@ -433,10 +430,9 @@ async def _save_session_to_db( "function_call": msg.function_call, } ) - logger.info( - f"Saving {len(new_messages)} new messages to DB for session {session.session_id}: " - f"roles={[m['role'] for m in messages_data]}, " - f"start_sequence={existing_message_count}" + logger.debug( + f"[DB] Saving {len(new_messages)} messages to session {session.session_id}, " + f"roles={[m['role'] for m in messages_data]}" ) await chat_db.add_chat_messages_batch( session_id=session.session_id, @@ -476,7 +472,7 @@ async def get_chat_session( logger.warning(f"Unexpected cache error for session {session_id}: {e}") # Fall back to database - logger.info(f"Session {session_id} not in cache, checking database") + logger.debug(f"Session {session_id} not in cache, checking database") session = await _get_session_from_db(session_id) if session is None: @@ -493,7 +489,6 @@ async def get_chat_session( # Cache the session from DB try: await _cache_session(session) - logger.info(f"Cached session {session_id} from database") except Exception as e: logger.warning(f"Failed to cache session {session_id}: {e}") @@ -558,6 +553,40 @@ async def upsert_chat_session( return session +async def append_and_save_message(session_id: str, message: ChatMessage) -> ChatSession: + """Atomically append a message to a session and persist it. 
+ + Acquires the session lock, re-fetches the latest session state, + appends the message, and saves — preventing message loss when + concurrent requests modify the same session. + """ + lock = await _get_session_lock(session_id) + + async with lock: + session = await get_chat_session(session_id) + if session is None: + raise ValueError(f"Session {session_id} not found") + + session.messages.append(message) + existing_message_count = await chat_db.get_chat_session_message_count( + session_id + ) + + try: + await _save_session_to_db(session, existing_message_count) + except Exception as e: + raise DatabaseError( + f"Failed to persist message to session {session_id}" + ) from e + + try: + await _cache_session(session) + except Exception as e: + logger.warning(f"Cache write failed for session {session_id}: {e}") + + return session + + async def create_chat_session(user_id: str) -> ChatSession: """Create a new chat session and persist it. @@ -664,13 +693,19 @@ async def update_session_title(session_id: str, title: str) -> bool: logger.warning(f"Session {session_id} not found for title update") return False - # Invalidate cache so next fetch gets updated title + # Update title in cache if it exists (instead of invalidating). + # This prevents race conditions where cache invalidation causes + # the frontend to see stale DB data while streaming is still in progress. 
try: - redis_key = _get_session_cache_key(session_id) - async_redis = await get_redis_async() - await async_redis.delete(redis_key) + cached = await _get_session_from_cache(session_id) + if cached: + cached.title = title + await _cache_session(cached) except Exception as e: - logger.warning(f"Failed to invalidate cache for session {session_id}: {e}") + # Not critical - title will be correct on next full cache refresh + logger.warning( + f"Failed to update title in cache for session {session_id}: {e}" + ) return True except Exception as e: diff --git a/autogpt_platform/backend/backend/api/features/chat/routes.py b/autogpt_platform/backend/backend/api/features/chat/routes.py index 0d8b12b0b7..aa565ca891 100644 --- a/autogpt_platform/backend/backend/api/features/chat/routes.py +++ b/autogpt_platform/backend/backend/api/features/chat/routes.py @@ -1,5 +1,6 @@ """Chat API routes for chat session management and streaming via SSE.""" +import asyncio import logging import uuid as uuid_module from collections.abc import AsyncGenerator @@ -11,13 +12,22 @@ from fastapi.responses import StreamingResponse from pydantic import BaseModel from backend.util.exceptions import NotFoundError +from backend.util.feature_flag import Flag, is_feature_enabled from . import service as chat_service from . 
import stream_registry from .completion_handler import process_operation_failure, process_operation_success from .config import ChatConfig -from .model import ChatSession, create_chat_session, get_chat_session, get_user_sessions -from .response_model import StreamFinish, StreamHeartbeat +from .model import ( + ChatMessage, + ChatSession, + append_and_save_message, + create_chat_session, + get_chat_session, + get_user_sessions, +) +from .response_model import StreamError, StreamFinish, StreamHeartbeat, StreamStart +from .sdk import service as sdk_service from .tools.models import ( AgentDetailsResponse, AgentOutputResponse, @@ -41,6 +51,7 @@ from .tools.models import ( SetupRequirementsResponse, UnderstandingUpdatedResponse, ) +from .tracking import track_user_message config = ChatConfig() @@ -232,6 +243,10 @@ async def get_session( active_task, last_message_id = await stream_registry.get_active_task_for_session( session_id, user_id ) + logger.info( + f"[GET_SESSION] session={session_id}, active_task={active_task is not None}, " + f"msg_count={len(messages)}, last_role={messages[-1].get('role') if messages else 'none'}" + ) if active_task: # Filter out the in-progress assistant message from the session response. # The client will receive the complete assistant response through the SSE @@ -301,10 +316,9 @@ async def stream_chat_post( f"user={user_id}, message_len={len(request.message)}", extra={"json_fields": log_meta}, ) - session = await _validate_and_get_session(session_id, user_id) logger.info( - f"[TIMING] session validated in {(time.perf_counter() - stream_start_time)*1000:.1f}ms", + f"[TIMING] session validated in {(time.perf_counter() - stream_start_time) * 1000:.1f}ms", extra={ "json_fields": { **log_meta, @@ -313,6 +327,25 @@ async def stream_chat_post( }, ) + # Atomically append user message to session BEFORE creating task to avoid + # race condition where GET_SESSION sees task as "running" but message isn't + # saved yet. 
append_and_save_message re-fetches inside a lock to prevent + # message loss from concurrent requests. + if request.message: + message = ChatMessage( + role="user" if request.is_user_message else "assistant", + content=request.message, + ) + if request.is_user_message: + track_user_message( + user_id=user_id, + session_id=session_id, + message_length=len(request.message), + ) + logger.info(f"[STREAM] Saving user message to session {session_id}") + session = await append_and_save_message(session_id, message) + logger.info(f"[STREAM] User message saved for session {session_id}") + # Create a task in the stream registry for reconnection support task_id = str(uuid_module.uuid4()) operation_id = str(uuid_module.uuid4()) @@ -328,7 +361,7 @@ async def stream_chat_post( operation_id=operation_id, ) logger.info( - f"[TIMING] create_task completed in {(time.perf_counter() - task_create_start)*1000:.1f}ms", + f"[TIMING] create_task completed in {(time.perf_counter() - task_create_start) * 1000:.1f}ms", extra={ "json_fields": { **log_meta, @@ -349,15 +382,47 @@ async def stream_chat_post( first_chunk_time, ttfc = None, None chunk_count = 0 try: - async for chunk in chat_service.stream_chat_completion( + # Emit a start event with task_id for reconnection + start_chunk = StreamStart(messageId=task_id, taskId=task_id) + await stream_registry.publish_chunk(task_id, start_chunk) + logger.info( + f"[TIMING] StreamStart published at {(time_module.perf_counter() - gen_start_time) * 1000:.1f}ms", + extra={ + "json_fields": { + **log_meta, + "elapsed_ms": (time_module.perf_counter() - gen_start_time) + * 1000, + } + }, + ) + + # Choose service based on LaunchDarkly flag (falls back to config default) + use_sdk = await is_feature_enabled( + Flag.COPILOT_SDK, + user_id or "anonymous", + default=config.use_claude_agent_sdk, + ) + stream_fn = ( + sdk_service.stream_chat_completion_sdk + if use_sdk + else chat_service.stream_chat_completion + ) + logger.info( + f"[TIMING] Calling {'sdk' if 
use_sdk else 'standard'} stream_chat_completion", + extra={"json_fields": log_meta}, + ) + # Pass message=None since we already added it to the session above + async for chunk in stream_fn( session_id, - request.message, + None, # Message already in session is_user_message=request.is_user_message, user_id=user_id, - session=session, # Pass pre-fetched session to avoid double-fetch + session=session, # Pass session with message already added context=request.context, - _task_id=task_id, # Pass task_id so service emits start with taskId for reconnection ): + # Skip duplicate StreamStart — we already published one above + if isinstance(chunk, StreamStart): + continue chunk_count += 1 if first_chunk_time is None: first_chunk_time = time_module.perf_counter() @@ -378,7 +443,7 @@ async def stream_chat_post( gen_end_time = time_module.perf_counter() total_time = (gen_end_time - gen_start_time) * 1000 logger.info( - f"[TIMING] run_ai_generation FINISHED in {total_time/1000:.1f}s; " + f"[TIMING] run_ai_generation FINISHED in {total_time / 1000:.1f}s; " f"task={task_id}, session={session_id}, " f"ttfc={ttfc or -1:.2f}s, n_chunks={chunk_count}", extra={ @@ -405,6 +470,17 @@ async def stream_chat_post( } }, ) + # Publish a StreamError so the frontend can display an error message + try: + await stream_registry.publish_chunk( + task_id, + StreamError( + errorText="An error occurred. Please try again.", + code="stream_error", + ), + ) + except Exception: + pass # Best-effort; mark_task_completed will publish StreamFinish await stream_registry.mark_task_completed(task_id, "failed") # Start the AI generation in a background task @@ -507,8 +583,14 @@ async def stream_chat_post( "json_fields": {**log_meta, "elapsed_ms": elapsed, "error": str(e)} }, ) + # Surface error to frontend so it doesn't appear stuck + yield StreamError( + errorText="An error occurred. 
Please try again.", + code="stream_error", + ).to_sse() + yield StreamFinish().to_sse() finally: - # Unsubscribe when client disconnects or stream ends to prevent resource leak + # Unsubscribe when client disconnects or stream ends if subscriber_queue is not None: try: await stream_registry.unsubscribe_from_task( @@ -752,8 +834,6 @@ async def stream_task( ) async def event_generator() -> AsyncGenerator[str, None]: - import asyncio - heartbeat_interval = 15.0 # Send heartbeat every 15 seconds try: while True: diff --git a/autogpt_platform/backend/backend/api/features/chat/sdk/__init__.py b/autogpt_platform/backend/backend/api/features/chat/sdk/__init__.py new file mode 100644 index 0000000000..7d9d6371e9 --- /dev/null +++ b/autogpt_platform/backend/backend/api/features/chat/sdk/__init__.py @@ -0,0 +1,14 @@ +"""Claude Agent SDK integration for CoPilot. + +This module provides the integration layer between the Claude Agent SDK +and the existing CoPilot tool system, enabling drop-in replacement of +the current LLM orchestration with the battle-tested Claude Agent SDK. +""" + +from .service import stream_chat_completion_sdk +from .tool_adapter import create_copilot_mcp_server + +__all__ = [ + "stream_chat_completion_sdk", + "create_copilot_mcp_server", +] diff --git a/autogpt_platform/backend/backend/api/features/chat/sdk/response_adapter.py b/autogpt_platform/backend/backend/api/features/chat/sdk/response_adapter.py new file mode 100644 index 0000000000..f7151f8319 --- /dev/null +++ b/autogpt_platform/backend/backend/api/features/chat/sdk/response_adapter.py @@ -0,0 +1,203 @@ +"""Response adapter for converting Claude Agent SDK messages to Vercel AI SDK format. + +This module provides the adapter layer that converts streaming messages from +the Claude Agent SDK into the Vercel AI SDK UI Stream Protocol format that +the frontend expects. 
+""" + +import json +import logging +import uuid + +from claude_agent_sdk import ( + AssistantMessage, + Message, + ResultMessage, + SystemMessage, + TextBlock, + ToolResultBlock, + ToolUseBlock, + UserMessage, +) + +from backend.api.features.chat.response_model import ( + StreamBaseResponse, + StreamError, + StreamFinish, + StreamFinishStep, + StreamStart, + StreamStartStep, + StreamTextDelta, + StreamTextEnd, + StreamTextStart, + StreamToolInputAvailable, + StreamToolInputStart, + StreamToolOutputAvailable, +) +from backend.api.features.chat.sdk.tool_adapter import ( + MCP_TOOL_PREFIX, + pop_pending_tool_output, +) + +logger = logging.getLogger(__name__) + + +class SDKResponseAdapter: + """Adapter for converting Claude Agent SDK messages to Vercel AI SDK format. + + This class maintains state during a streaming session to properly track + text blocks, tool calls, and message lifecycle. + """ + + def __init__(self, message_id: str | None = None): + self.message_id = message_id or str(uuid.uuid4()) + self.text_block_id = str(uuid.uuid4()) + self.has_started_text = False + self.has_ended_text = False + self.current_tool_calls: dict[str, dict[str, str]] = {} + self.task_id: str | None = None + self.step_open = False + + def set_task_id(self, task_id: str) -> None: + """Set the task ID for reconnection support.""" + self.task_id = task_id + + def convert_message(self, sdk_message: Message) -> list[StreamBaseResponse]: + """Convert a single SDK message to Vercel AI SDK format.""" + responses: list[StreamBaseResponse] = [] + + if isinstance(sdk_message, SystemMessage): + if sdk_message.subtype == "init": + responses.append( + StreamStart(messageId=self.message_id, taskId=self.task_id) + ) + # Open the first step (matches non-SDK: StreamStart then StreamStartStep) + responses.append(StreamStartStep()) + self.step_open = True + + elif isinstance(sdk_message, AssistantMessage): + # After tool results, the SDK sends a new AssistantMessage for the + # next LLM turn. 
Open a new step if the previous one was closed. + if not self.step_open: + responses.append(StreamStartStep()) + self.step_open = True + + for block in sdk_message.content: + if isinstance(block, TextBlock): + if block.text: + self._ensure_text_started(responses) + responses.append( + StreamTextDelta(id=self.text_block_id, delta=block.text) + ) + + elif isinstance(block, ToolUseBlock): + self._end_text_if_open(responses) + + # Strip MCP prefix so frontend sees "find_block" + # instead of "mcp__copilot__find_block". + tool_name = block.name.removeprefix(MCP_TOOL_PREFIX) + + responses.append( + StreamToolInputStart(toolCallId=block.id, toolName=tool_name) + ) + responses.append( + StreamToolInputAvailable( + toolCallId=block.id, + toolName=tool_name, + input=block.input, + ) + ) + self.current_tool_calls[block.id] = {"name": tool_name} + + elif isinstance(sdk_message, UserMessage): + # UserMessage carries tool results back from tool execution. + content = sdk_message.content + blocks = content if isinstance(content, list) else [] + for block in blocks: + if isinstance(block, ToolResultBlock) and block.tool_use_id: + tool_info = self.current_tool_calls.get(block.tool_use_id, {}) + tool_name = tool_info.get("name", "unknown") + + # Prefer the stashed full output over the SDK's + # (potentially truncated) ToolResultBlock content. + # The SDK truncates large results, writing them to disk, + # which breaks frontend widget parsing. + output = pop_pending_tool_output(tool_name) or ( + _extract_tool_output(block.content) + ) + + responses.append( + StreamToolOutputAvailable( + toolCallId=block.tool_use_id, + toolName=tool_name, + output=output, + success=not (block.is_error or False), + ) + ) + + # Close the current step after tool results — the next + # AssistantMessage will open a new step for the continuation. 
+ if self.step_open: + responses.append(StreamFinishStep()) + self.step_open = False + + elif isinstance(sdk_message, ResultMessage): + self._end_text_if_open(responses) + # Close the step before finishing. + if self.step_open: + responses.append(StreamFinishStep()) + self.step_open = False + + if sdk_message.subtype == "success": + responses.append(StreamFinish()) + elif sdk_message.subtype in ("error", "error_during_execution"): + error_msg = getattr(sdk_message, "result", None) or "Unknown error" + responses.append( + StreamError(errorText=str(error_msg), code="sdk_error") + ) + responses.append(StreamFinish()) + else: + logger.warning( + f"Unexpected ResultMessage subtype: {sdk_message.subtype}" + ) + responses.append(StreamFinish()) + + else: + logger.debug(f"Unhandled SDK message type: {type(sdk_message).__name__}") + + return responses + + def _ensure_text_started(self, responses: list[StreamBaseResponse]) -> None: + """Start (or restart) a text block if needed.""" + if not self.has_started_text or self.has_ended_text: + if self.has_ended_text: + self.text_block_id = str(uuid.uuid4()) + self.has_ended_text = False + responses.append(StreamTextStart(id=self.text_block_id)) + self.has_started_text = True + + def _end_text_if_open(self, responses: list[StreamBaseResponse]) -> None: + """End the current text block if one is open.""" + if self.has_started_text and not self.has_ended_text: + responses.append(StreamTextEnd(id=self.text_block_id)) + self.has_ended_text = True + + +def _extract_tool_output(content: str | list[dict[str, str]] | None) -> str: + """Extract a string output from a ToolResultBlock's content field.""" + if isinstance(content, str): + return content + if isinstance(content, list): + parts = [item.get("text", "") for item in content if item.get("type") == "text"] + if parts: + return "".join(parts) + try: + return json.dumps(content) + except (TypeError, ValueError): + return str(content) + if content is None: + return "" + try: + return 
json.dumps(content) + except (TypeError, ValueError): + return str(content) diff --git a/autogpt_platform/backend/backend/api/features/chat/sdk/response_adapter_test.py b/autogpt_platform/backend/backend/api/features/chat/sdk/response_adapter_test.py new file mode 100644 index 0000000000..a4f2502642 --- /dev/null +++ b/autogpt_platform/backend/backend/api/features/chat/sdk/response_adapter_test.py @@ -0,0 +1,366 @@ +"""Unit tests for the SDK response adapter.""" + +from claude_agent_sdk import ( + AssistantMessage, + ResultMessage, + SystemMessage, + TextBlock, + ToolResultBlock, + ToolUseBlock, + UserMessage, +) + +from backend.api.features.chat.response_model import ( + StreamBaseResponse, + StreamError, + StreamFinish, + StreamFinishStep, + StreamStart, + StreamStartStep, + StreamTextDelta, + StreamTextEnd, + StreamTextStart, + StreamToolInputAvailable, + StreamToolInputStart, + StreamToolOutputAvailable, +) + +from .response_adapter import SDKResponseAdapter +from .tool_adapter import MCP_TOOL_PREFIX + + +def _adapter() -> SDKResponseAdapter: + a = SDKResponseAdapter(message_id="msg-1") + a.set_task_id("task-1") + return a + + +# -- SystemMessage ----------------------------------------------------------- + + +def test_system_init_emits_start_and_step(): + adapter = _adapter() + results = adapter.convert_message(SystemMessage(subtype="init", data={})) + assert len(results) == 2 + assert isinstance(results[0], StreamStart) + assert results[0].messageId == "msg-1" + assert results[0].taskId == "task-1" + assert isinstance(results[1], StreamStartStep) + + +def test_system_non_init_emits_nothing(): + adapter = _adapter() + results = adapter.convert_message(SystemMessage(subtype="other", data={})) + assert results == [] + + +# -- AssistantMessage with TextBlock ----------------------------------------- + + +def test_text_block_emits_step_start_and_delta(): + adapter = _adapter() + msg = AssistantMessage(content=[TextBlock(text="hello")], model="test") + results = 
adapter.convert_message(msg) + assert len(results) == 3 + assert isinstance(results[0], StreamStartStep) + assert isinstance(results[1], StreamTextStart) + assert isinstance(results[2], StreamTextDelta) + assert results[2].delta == "hello" + + +def test_empty_text_block_emits_only_step(): + adapter = _adapter() + msg = AssistantMessage(content=[TextBlock(text="")], model="test") + results = adapter.convert_message(msg) + # Empty text skipped, but step still opens + assert len(results) == 1 + assert isinstance(results[0], StreamStartStep) + + +def test_multiple_text_deltas_reuse_block_id(): + adapter = _adapter() + msg1 = AssistantMessage(content=[TextBlock(text="a")], model="test") + msg2 = AssistantMessage(content=[TextBlock(text="b")], model="test") + r1 = adapter.convert_message(msg1) + r2 = adapter.convert_message(msg2) + # First gets step+start+delta, second only delta (block & step already started) + assert len(r1) == 3 + assert isinstance(r1[0], StreamStartStep) + assert isinstance(r1[1], StreamTextStart) + assert len(r2) == 1 + assert isinstance(r2[0], StreamTextDelta) + assert r1[1].id == r2[0].id # same block ID + + +# -- AssistantMessage with ToolUseBlock -------------------------------------- + + +def test_tool_use_emits_input_start_and_available(): + """Tool names arrive with MCP prefix and should be stripped for the frontend.""" + adapter = _adapter() + msg = AssistantMessage( + content=[ + ToolUseBlock( + id="tool-1", + name=f"{MCP_TOOL_PREFIX}find_agent", + input={"q": "x"}, + ) + ], + model="test", + ) + results = adapter.convert_message(msg) + assert len(results) == 3 + assert isinstance(results[0], StreamStartStep) + assert isinstance(results[1], StreamToolInputStart) + assert results[1].toolCallId == "tool-1" + assert results[1].toolName == "find_agent" # prefix stripped + assert isinstance(results[2], StreamToolInputAvailable) + assert results[2].toolName == "find_agent" # prefix stripped + assert results[2].input == {"q": "x"} + + +def 
test_text_then_tool_ends_text_block(): + adapter = _adapter() + text_msg = AssistantMessage(content=[TextBlock(text="thinking...")], model="test") + tool_msg = AssistantMessage( + content=[ToolUseBlock(id="t1", name=f"{MCP_TOOL_PREFIX}tool", input={})], + model="test", + ) + adapter.convert_message(text_msg) # opens step + text + results = adapter.convert_message(tool_msg) + # Step already open, so: TextEnd, ToolInputStart, ToolInputAvailable + assert len(results) == 3 + assert isinstance(results[0], StreamTextEnd) + assert isinstance(results[1], StreamToolInputStart) + + +# -- UserMessage with ToolResultBlock ---------------------------------------- + + +def test_tool_result_emits_output_and_finish_step(): + adapter = _adapter() + # First register the tool call (opens step) — SDK sends prefixed name + tool_msg = AssistantMessage( + content=[ToolUseBlock(id="t1", name=f"{MCP_TOOL_PREFIX}find_agent", input={})], + model="test", + ) + adapter.convert_message(tool_msg) + + # Now send tool result + result_msg = UserMessage( + content=[ToolResultBlock(tool_use_id="t1", content="found 3 agents")] + ) + results = adapter.convert_message(result_msg) + assert len(results) == 2 + assert isinstance(results[0], StreamToolOutputAvailable) + assert results[0].toolCallId == "t1" + assert results[0].toolName == "find_agent" # prefix stripped + assert results[0].output == "found 3 agents" + assert results[0].success is True + assert isinstance(results[1], StreamFinishStep) + + +def test_tool_result_error(): + adapter = _adapter() + adapter.convert_message( + AssistantMessage( + content=[ + ToolUseBlock(id="t1", name=f"{MCP_TOOL_PREFIX}run_agent", input={}) + ], + model="test", + ) + ) + result_msg = UserMessage( + content=[ToolResultBlock(tool_use_id="t1", content="timeout", is_error=True)] + ) + results = adapter.convert_message(result_msg) + assert isinstance(results[0], StreamToolOutputAvailable) + assert results[0].success is False + assert isinstance(results[1], 
StreamFinishStep) + + +def test_tool_result_list_content(): + adapter = _adapter() + adapter.convert_message( + AssistantMessage( + content=[ToolUseBlock(id="t1", name=f"{MCP_TOOL_PREFIX}tool", input={})], + model="test", + ) + ) + result_msg = UserMessage( + content=[ + ToolResultBlock( + tool_use_id="t1", + content=[ + {"type": "text", "text": "line1"}, + {"type": "text", "text": "line2"}, + ], + ) + ] + ) + results = adapter.convert_message(result_msg) + assert isinstance(results[0], StreamToolOutputAvailable) + assert results[0].output == "line1line2" + assert isinstance(results[1], StreamFinishStep) + + +def test_string_user_message_ignored(): + """A plain string UserMessage (not tool results) produces no output.""" + adapter = _adapter() + results = adapter.convert_message(UserMessage(content="hello")) + assert results == [] + + +# -- ResultMessage ----------------------------------------------------------- + + +def test_result_success_emits_finish_step_and_finish(): + adapter = _adapter() + # Start some text first (opens step) + adapter.convert_message( + AssistantMessage(content=[TextBlock(text="done")], model="test") + ) + msg = ResultMessage( + subtype="success", + duration_ms=100, + duration_api_ms=50, + is_error=False, + num_turns=1, + session_id="s1", + ) + results = adapter.convert_message(msg) + # TextEnd + FinishStep + StreamFinish + assert len(results) == 3 + assert isinstance(results[0], StreamTextEnd) + assert isinstance(results[1], StreamFinishStep) + assert isinstance(results[2], StreamFinish) + + +def test_result_error_emits_error_and_finish(): + adapter = _adapter() + msg = ResultMessage( + subtype="error", + duration_ms=100, + duration_api_ms=50, + is_error=True, + num_turns=0, + session_id="s1", + result="API rate limited", + ) + results = adapter.convert_message(msg) + # No step was open, so no FinishStep — just Error + Finish + assert len(results) == 2 + assert isinstance(results[0], StreamError) + assert "API rate limited" in 
results[0].errorText + assert isinstance(results[1], StreamFinish) + + +# -- Text after tools (new block ID) ---------------------------------------- + + +def test_text_after_tool_gets_new_block_id(): + adapter = _adapter() + # Text -> Tool -> ToolResult -> Text should get a new text block ID and step + adapter.convert_message( + AssistantMessage(content=[TextBlock(text="before")], model="test") + ) + adapter.convert_message( + AssistantMessage( + content=[ToolUseBlock(id="t1", name=f"{MCP_TOOL_PREFIX}tool", input={})], + model="test", + ) + ) + # Send tool result (closes step) + adapter.convert_message( + UserMessage(content=[ToolResultBlock(tool_use_id="t1", content="ok")]) + ) + results = adapter.convert_message( + AssistantMessage(content=[TextBlock(text="after")], model="test") + ) + # Should get StreamStartStep (new step) + StreamTextStart (new block) + StreamTextDelta + assert len(results) == 3 + assert isinstance(results[0], StreamStartStep) + assert isinstance(results[1], StreamTextStart) + assert isinstance(results[2], StreamTextDelta) + assert results[2].delta == "after" + + +# -- Full conversation flow -------------------------------------------------- + + +def test_full_conversation_flow(): + """Simulate a complete conversation: init -> text -> tool -> result -> text -> finish.""" + adapter = _adapter() + all_responses: list[StreamBaseResponse] = [] + + # 1. Init + all_responses.extend( + adapter.convert_message(SystemMessage(subtype="init", data={})) + ) + # 2. Assistant text + all_responses.extend( + adapter.convert_message( + AssistantMessage(content=[TextBlock(text="Let me search")], model="test") + ) + ) + # 3. Tool use + all_responses.extend( + adapter.convert_message( + AssistantMessage( + content=[ + ToolUseBlock( + id="t1", + name=f"{MCP_TOOL_PREFIX}find_agent", + input={"query": "email"}, + ) + ], + model="test", + ) + ) + ) + # 4. 
Tool result + all_responses.extend( + adapter.convert_message( + UserMessage( + content=[ToolResultBlock(tool_use_id="t1", content="Found 2 agents")] + ) + ) + ) + # 5. More text + all_responses.extend( + adapter.convert_message( + AssistantMessage(content=[TextBlock(text="I found 2")], model="test") + ) + ) + # 6. Result + all_responses.extend( + adapter.convert_message( + ResultMessage( + subtype="success", + duration_ms=500, + duration_api_ms=400, + is_error=False, + num_turns=2, + session_id="s1", + ) + ) + ) + + types = [type(r).__name__ for r in all_responses] + assert types == [ + "StreamStart", + "StreamStartStep", # step 1: text + tool call + "StreamTextStart", + "StreamTextDelta", # "Let me search" + "StreamTextEnd", # closed before tool + "StreamToolInputStart", + "StreamToolInputAvailable", + "StreamToolOutputAvailable", # tool result + "StreamFinishStep", # step 1 closed after tool result + "StreamStartStep", # step 2: continuation text + "StreamTextStart", # new block after tool + "StreamTextDelta", # "I found 2" + "StreamTextEnd", # closed by result + "StreamFinishStep", # step 2 closed + "StreamFinish", + ] diff --git a/autogpt_platform/backend/backend/api/features/chat/sdk/security_hooks.py b/autogpt_platform/backend/backend/api/features/chat/sdk/security_hooks.py new file mode 100644 index 0000000000..14efc6d459 --- /dev/null +++ b/autogpt_platform/backend/backend/api/features/chat/sdk/security_hooks.py @@ -0,0 +1,335 @@ +"""Security hooks for Claude Agent SDK integration. + +This module provides security hooks that validate tool calls before execution, +ensuring multi-user isolation and preventing unauthorized operations. +""" + +import json +import logging +import os +import re +from collections.abc import Callable +from typing import Any, cast + +from backend.api.features.chat.sdk.tool_adapter import MCP_TOOL_PREFIX + +logger = logging.getLogger(__name__) + +# Tools that are blocked entirely (CLI/system access). 
+# "Bash" (capital) is the SDK built-in — it's NOT in allowed_tools but blocked +# here as defence-in-depth. The agent uses mcp__copilot__bash_exec instead, +# which has kernel-level network isolation (unshare --net). +BLOCKED_TOOLS = { + "Bash", + "bash", + "shell", + "exec", + "terminal", + "command", +} + +# Tools allowed only when their path argument stays within the SDK workspace. +# The SDK uses these to handle oversized tool results (writes to tool-results/ +# files, then reads them back) and for workspace file operations. +WORKSPACE_SCOPED_TOOLS = {"Read", "Write", "Edit", "Glob", "Grep"} + +# Dangerous patterns in tool inputs +DANGEROUS_PATTERNS = [ + r"sudo", + r"rm\s+-rf", + r"dd\s+if=", + r"/etc/passwd", + r"/etc/shadow", + r"chmod\s+777", + r"curl\s+.*\|.*sh", + r"wget\s+.*\|.*sh", + r"eval\s*\(", + r"exec\s*\(", + r"__import__", + r"os\.system", + r"subprocess", +] + + +def _deny(reason: str) -> dict[str, Any]: + """Return a hook denial response.""" + return { + "hookSpecificOutput": { + "hookEventName": "PreToolUse", + "permissionDecision": "deny", + "permissionDecisionReason": reason, + } + } + + +def _validate_workspace_path( + tool_name: str, tool_input: dict[str, Any], sdk_cwd: str | None +) -> dict[str, Any]: + """Validate that a workspace-scoped tool only accesses allowed paths. + + Allowed directories: + - The SDK working directory (``/tmp/copilot-/``) + - The SDK tool-results directory (``~/.claude/projects/…/tool-results/``) + """ + path = tool_input.get("file_path") or tool_input.get("path") or "" + if not path: + # Glob/Grep without a path default to cwd which is already sandboxed + return {} + + # Resolve relative paths against sdk_cwd (the SDK sets cwd so the LLM + # naturally uses relative paths like "test.txt" instead of absolute ones). + # Tilde paths (~/) are home-dir references, not relative — expand first. 
+ if path.startswith("~"): + resolved = os.path.realpath(os.path.expanduser(path)) + elif not os.path.isabs(path) and sdk_cwd: + resolved = os.path.realpath(os.path.join(sdk_cwd, path)) + else: + resolved = os.path.realpath(path) + + # Allow access within the SDK working directory + if sdk_cwd: + norm_cwd = os.path.realpath(sdk_cwd) + if resolved.startswith(norm_cwd + os.sep) or resolved == norm_cwd: + return {} + + # Allow access to ~/.claude/projects/*/tool-results/ (big tool results) + claude_dir = os.path.realpath(os.path.expanduser("~/.claude/projects")) + tool_results_seg = os.sep + "tool-results" + os.sep + if resolved.startswith(claude_dir + os.sep) and tool_results_seg in resolved: + return {} + + logger.warning( + f"Blocked {tool_name} outside workspace: {path} (resolved={resolved})" + ) + workspace_hint = f" Allowed workspace: {sdk_cwd}" if sdk_cwd else "" + return _deny( + f"[SECURITY] Tool '{tool_name}' can only access files within the workspace " + f"directory.{workspace_hint} " + "This is enforced by the platform and cannot be bypassed." + ) + + +def _validate_tool_access( + tool_name: str, tool_input: dict[str, Any], sdk_cwd: str | None = None +) -> dict[str, Any]: + """Validate that a tool call is allowed. + + Returns: + Empty dict to allow, or dict with hookSpecificOutput to deny + """ + # Block forbidden tools + if tool_name in BLOCKED_TOOLS: + logger.warning(f"Blocked tool access attempt: {tool_name}") + return _deny( + f"[SECURITY] Tool '{tool_name}' is blocked for security. " + "This is enforced by the platform and cannot be bypassed. " + "Use the CoPilot-specific MCP tools instead." 
+ ) + + # Workspace-scoped tools: allowed only within the SDK workspace directory + if tool_name in WORKSPACE_SCOPED_TOOLS: + return _validate_workspace_path(tool_name, tool_input, sdk_cwd) + + # Check for dangerous patterns in tool input + # Use json.dumps for predictable format (str() produces Python repr) + input_str = json.dumps(tool_input) if tool_input else "" + + for pattern in DANGEROUS_PATTERNS: + if re.search(pattern, input_str, re.IGNORECASE): + logger.warning( + f"Blocked dangerous pattern in tool input: {pattern} in {tool_name}" + ) + return _deny( + "[SECURITY] Input contains a blocked pattern. " + "This is enforced by the platform and cannot be bypassed." + ) + + return {} + + +def _validate_user_isolation( + tool_name: str, tool_input: dict[str, Any], user_id: str | None +) -> dict[str, Any]: + """Validate that tool calls respect user isolation.""" + # For workspace file tools, ensure path doesn't escape + if "workspace" in tool_name.lower(): + path = tool_input.get("path", "") or tool_input.get("file_path", "") + if path: + # Check for path traversal + if ".." in path or path.startswith("/"): + logger.warning( + f"Blocked path traversal attempt: {path} by user {user_id}" + ) + return { + "hookSpecificOutput": { + "hookEventName": "PreToolUse", + "permissionDecision": "deny", + "permissionDecisionReason": "Path traversal not allowed", + } + } + + return {} + + +def create_security_hooks( + user_id: str | None, + sdk_cwd: str | None = None, + max_subtasks: int = 3, + on_stop: Callable[[str, str], None] | None = None, +) -> dict[str, Any]: + """Create the security hooks configuration for Claude Agent SDK. 
+ + Includes security validation and observability hooks: + - PreToolUse: Security validation before tool execution + - PostToolUse: Log successful tool executions + - PostToolUseFailure: Log and handle failed tool executions + - PreCompact: Log context compaction events (SDK handles compaction automatically) + - Stop: Capture transcript path for stateless resume (when *on_stop* is provided) + + Args: + user_id: Current user ID for isolation validation + sdk_cwd: SDK working directory for workspace-scoped tool validation + max_subtasks: Maximum Task (sub-agent) spawns allowed per session + on_stop: Callback ``(transcript_path, sdk_session_id)`` invoked when + the SDK finishes processing — used to read the JSONL transcript + before the CLI process exits. + + Returns: + Hooks configuration dict for ClaudeAgentOptions + """ + try: + from claude_agent_sdk import HookMatcher + from claude_agent_sdk.types import HookContext, HookInput, SyncHookJSONOutput + + # Per-session counter for Task sub-agent spawns + task_spawn_count = 0 + + async def pre_tool_use_hook( + input_data: HookInput, + tool_use_id: str | None, + context: HookContext, + ) -> SyncHookJSONOutput: + """Combined pre-tool-use validation hook.""" + nonlocal task_spawn_count + _ = context # unused but required by signature + tool_name = cast(str, input_data.get("tool_name", "")) + tool_input = cast(dict[str, Any], input_data.get("tool_input", {})) + + # Rate-limit Task (sub-agent) spawns per session + if tool_name == "Task": + task_spawn_count += 1 + if task_spawn_count > max_subtasks: + logger.warning( + f"[SDK] Task limit reached ({max_subtasks}), user={user_id}" + ) + return cast( + SyncHookJSONOutput, + _deny( + f"Maximum {max_subtasks} sub-tasks per session. " + "Please continue in the main conversation." 
+ ), + ) + + # Strip MCP prefix for consistent validation + is_copilot_tool = tool_name.startswith(MCP_TOOL_PREFIX) + clean_name = tool_name.removeprefix(MCP_TOOL_PREFIX) + + # Only block non-CoPilot tools; our MCP-registered tools + # (including Read for oversized results) are already sandboxed. + if not is_copilot_tool: + result = _validate_tool_access(clean_name, tool_input, sdk_cwd) + if result: + return cast(SyncHookJSONOutput, result) + + # Validate user isolation + result = _validate_user_isolation(clean_name, tool_input, user_id) + if result: + return cast(SyncHookJSONOutput, result) + + logger.debug(f"[SDK] Tool start: {tool_name}, user={user_id}") + return cast(SyncHookJSONOutput, {}) + + async def post_tool_use_hook( + input_data: HookInput, + tool_use_id: str | None, + context: HookContext, + ) -> SyncHookJSONOutput: + """Log successful tool executions for observability.""" + _ = context + tool_name = cast(str, input_data.get("tool_name", "")) + logger.debug(f"[SDK] Tool success: {tool_name}, tool_use_id={tool_use_id}") + return cast(SyncHookJSONOutput, {}) + + async def post_tool_failure_hook( + input_data: HookInput, + tool_use_id: str | None, + context: HookContext, + ) -> SyncHookJSONOutput: + """Log failed tool executions for debugging.""" + _ = context + tool_name = cast(str, input_data.get("tool_name", "")) + error = input_data.get("error", "Unknown error") + logger.warning( + f"[SDK] Tool failed: {tool_name}, error={error}, " + f"user={user_id}, tool_use_id={tool_use_id}" + ) + return cast(SyncHookJSONOutput, {}) + + async def pre_compact_hook( + input_data: HookInput, + tool_use_id: str | None, + context: HookContext, + ) -> SyncHookJSONOutput: + """Log when SDK triggers context compaction. + + The SDK automatically compacts conversation history when it grows too large. + This hook provides visibility into when compaction happens. 
+ """ + _ = context, tool_use_id + trigger = input_data.get("trigger", "auto") + logger.info( + f"[SDK] Context compaction triggered: {trigger}, user={user_id}" + ) + return cast(SyncHookJSONOutput, {}) + + # --- Stop hook: capture transcript path for stateless resume --- + async def stop_hook( + input_data: HookInput, + tool_use_id: str | None, + context: HookContext, + ) -> SyncHookJSONOutput: + """Capture transcript path when SDK finishes processing. + + The Stop hook fires while the CLI process is still alive, giving us + a reliable window to read the JSONL transcript before SIGTERM. + """ + _ = context, tool_use_id + transcript_path = cast(str, input_data.get("transcript_path", "")) + sdk_session_id = cast(str, input_data.get("session_id", "")) + + if transcript_path and on_stop: + logger.info( + f"[SDK] Stop hook: transcript_path={transcript_path}, " + f"sdk_session_id={sdk_session_id[:12]}..." + ) + on_stop(transcript_path, sdk_session_id) + + return cast(SyncHookJSONOutput, {}) + + hooks: dict[str, Any] = { + "PreToolUse": [HookMatcher(matcher="*", hooks=[pre_tool_use_hook])], + "PostToolUse": [HookMatcher(matcher="*", hooks=[post_tool_use_hook])], + "PostToolUseFailure": [ + HookMatcher(matcher="*", hooks=[post_tool_failure_hook]) + ], + "PreCompact": [HookMatcher(matcher="*", hooks=[pre_compact_hook])], + } + + if on_stop is not None: + hooks["Stop"] = [HookMatcher(matcher=None, hooks=[stop_hook])] + + return hooks + except ImportError: + # Fallback for when SDK isn't available - return empty hooks + logger.warning("claude-agent-sdk not available, security hooks disabled") + return {} diff --git a/autogpt_platform/backend/backend/api/features/chat/sdk/security_hooks_test.py b/autogpt_platform/backend/backend/api/features/chat/sdk/security_hooks_test.py new file mode 100644 index 0000000000..2d09afdab7 --- /dev/null +++ b/autogpt_platform/backend/backend/api/features/chat/sdk/security_hooks_test.py @@ -0,0 +1,165 @@ +"""Unit tests for SDK security 
hooks.""" + +import os + +from .security_hooks import _validate_tool_access, _validate_user_isolation + +SDK_CWD = "/tmp/copilot-abc123" + + +def _is_denied(result: dict) -> bool: + hook = result.get("hookSpecificOutput", {}) + return hook.get("permissionDecision") == "deny" + + +# -- Blocked tools ----------------------------------------------------------- + + +def test_blocked_tools_denied(): + for tool in ("bash", "shell", "exec", "terminal", "command"): + result = _validate_tool_access(tool, {}) + assert _is_denied(result), f"{tool} should be blocked" + + +def test_unknown_tool_allowed(): + result = _validate_tool_access("SomeCustomTool", {}) + assert result == {} + + +# -- Workspace-scoped tools -------------------------------------------------- + + +def test_read_within_workspace_allowed(): + result = _validate_tool_access( + "Read", {"file_path": f"{SDK_CWD}/file.txt"}, sdk_cwd=SDK_CWD + ) + assert result == {} + + +def test_write_within_workspace_allowed(): + result = _validate_tool_access( + "Write", {"file_path": f"{SDK_CWD}/output.json"}, sdk_cwd=SDK_CWD + ) + assert result == {} + + +def test_edit_within_workspace_allowed(): + result = _validate_tool_access( + "Edit", {"file_path": f"{SDK_CWD}/src/main.py"}, sdk_cwd=SDK_CWD + ) + assert result == {} + + +def test_glob_within_workspace_allowed(): + result = _validate_tool_access("Glob", {"path": f"{SDK_CWD}/src"}, sdk_cwd=SDK_CWD) + assert result == {} + + +def test_grep_within_workspace_allowed(): + result = _validate_tool_access("Grep", {"path": f"{SDK_CWD}/src"}, sdk_cwd=SDK_CWD) + assert result == {} + + +def test_read_outside_workspace_denied(): + result = _validate_tool_access( + "Read", {"file_path": "/etc/passwd"}, sdk_cwd=SDK_CWD + ) + assert _is_denied(result) + + +def test_write_outside_workspace_denied(): + result = _validate_tool_access( + "Write", {"file_path": "/home/user/secrets.txt"}, sdk_cwd=SDK_CWD + ) + assert _is_denied(result) + + +def test_traversal_attack_denied(): + result = 
_validate_tool_access( + "Read", + {"file_path": f"{SDK_CWD}/../../etc/passwd"}, + sdk_cwd=SDK_CWD, + ) + assert _is_denied(result) + + +def test_no_path_allowed(): + """Glob/Grep without a path argument defaults to cwd — should pass.""" + result = _validate_tool_access("Glob", {}, sdk_cwd=SDK_CWD) + assert result == {} + + +def test_read_no_cwd_denies_absolute(): + """If no sdk_cwd is set, absolute paths are denied.""" + result = _validate_tool_access("Read", {"file_path": "/tmp/anything"}) + assert _is_denied(result) + + +# -- Tool-results directory -------------------------------------------------- + + +def test_read_tool_results_allowed(): + home = os.path.expanduser("~") + path = f"{home}/.claude/projects/-tmp-copilot-abc123/tool-results/12345.txt" + result = _validate_tool_access("Read", {"file_path": path}, sdk_cwd=SDK_CWD) + assert result == {} + + +def test_read_claude_projects_without_tool_results_denied(): + home = os.path.expanduser("~") + path = f"{home}/.claude/projects/-tmp-copilot-abc123/settings.json" + result = _validate_tool_access("Read", {"file_path": path}, sdk_cwd=SDK_CWD) + assert _is_denied(result) + + +# -- Built-in Bash is blocked (use bash_exec MCP tool instead) --------------- + + +def test_bash_builtin_always_blocked(): + """SDK built-in Bash is blocked — bash_exec MCP tool with bubblewrap is used instead.""" + result = _validate_tool_access("Bash", {"command": "echo hello"}, sdk_cwd=SDK_CWD) + assert _is_denied(result) + + +# -- Dangerous patterns ------------------------------------------------------ + + +def test_dangerous_pattern_blocked(): + result = _validate_tool_access("SomeTool", {"cmd": "sudo rm -rf /"}) + assert _is_denied(result) + + +def test_subprocess_pattern_blocked(): + result = _validate_tool_access("SomeTool", {"code": "subprocess.run(...)"}) + assert _is_denied(result) + + +# -- User isolation ---------------------------------------------------------- + + +def test_workspace_path_traversal_blocked(): + result = 
_validate_user_isolation( + "workspace_read", {"path": "../../../etc/shadow"}, user_id="user-1" + ) + assert _is_denied(result) + + +def test_workspace_absolute_path_blocked(): + result = _validate_user_isolation( + "workspace_read", {"path": "/etc/passwd"}, user_id="user-1" + ) + assert _is_denied(result) + + +def test_workspace_normal_path_allowed(): + result = _validate_user_isolation( + "workspace_read", {"path": "src/main.py"}, user_id="user-1" + ) + assert result == {} + + +def test_non_workspace_tool_passes_isolation(): + result = _validate_user_isolation( + "find_agent", {"query": "email"}, user_id="user-1" + ) + assert result == {} diff --git a/autogpt_platform/backend/backend/api/features/chat/sdk/service.py b/autogpt_platform/backend/backend/api/features/chat/sdk/service.py new file mode 100644 index 0000000000..65195b442c --- /dev/null +++ b/autogpt_platform/backend/backend/api/features/chat/sdk/service.py @@ -0,0 +1,751 @@ +"""Claude Agent SDK service layer for CoPilot chat completions.""" + +import asyncio +import json +import logging +import os +import uuid +from collections.abc import AsyncGenerator +from dataclasses import dataclass +from typing import Any + +from backend.util.exceptions import NotFoundError + +from .. 
import stream_registry +from ..config import ChatConfig +from ..model import ( + ChatMessage, + ChatSession, + get_chat_session, + update_session_title, + upsert_chat_session, +) +from ..response_model import ( + StreamBaseResponse, + StreamError, + StreamFinish, + StreamStart, + StreamTextDelta, + StreamToolInputAvailable, + StreamToolOutputAvailable, +) +from ..service import ( + _build_system_prompt, + _execute_long_running_tool_with_streaming, + _generate_session_title, +) +from ..tools.models import OperationPendingResponse, OperationStartedResponse +from ..tools.sandbox import WORKSPACE_PREFIX, make_session_path +from ..tracking import track_user_message +from .response_adapter import SDKResponseAdapter +from .security_hooks import create_security_hooks +from .tool_adapter import ( + COPILOT_TOOL_NAMES, + LongRunningCallback, + create_copilot_mcp_server, + set_execution_context, +) +from .transcript import ( + download_transcript, + read_transcript_file, + upload_transcript, + validate_transcript, + write_transcript_to_tempfile, +) + +logger = logging.getLogger(__name__) +config = ChatConfig() + +# Set to hold background tasks to prevent garbage collection +_background_tasks: set[asyncio.Task[Any]] = set() + + +@dataclass +class CapturedTranscript: + """Info captured by the SDK Stop hook for stateless --resume.""" + + path: str = "" + sdk_session_id: str = "" + + @property + def available(self) -> bool: + return bool(self.path) + + +_SDK_CWD_PREFIX = WORKSPACE_PREFIX + +# Appended to the system prompt to inform the agent about available tools. +# The SDK built-in Bash is NOT available — use mcp__copilot__bash_exec instead, +# which has kernel-level network isolation (unshare --net). +_SDK_TOOL_SUPPLEMENT = """ + +## Tool notes + +- The SDK built-in Bash tool is NOT available. Use the `bash_exec` MCP tool + for shell commands — it runs in a network-isolated sandbox. 
+- **Shared workspace**: The SDK Read/Write tools and `bash_exec` share the + same working directory. Files created by one are readable by the other. + These files are **ephemeral** — they exist only for the current session. +- **Persistent storage**: Use `write_workspace_file` / `read_workspace_file` + for files that should persist across sessions (stored in cloud storage). +- Long-running tools (create_agent, edit_agent, etc.) are handled + asynchronously. You will receive an immediate response; the actual result + is delivered to the user via a background stream. +""" + + +def _build_long_running_callback(user_id: str | None) -> LongRunningCallback: + """Build a callback that delegates long-running tools to the non-SDK infrastructure. + + Long-running tools (create_agent, edit_agent, etc.) are delegated to the + existing background infrastructure: stream_registry (Redis Streams), + database persistence, and SSE reconnection. This means results survive + page refreshes / pod restarts, and the frontend shows the proper loading + widget with progress updates. + + The returned callback matches the ``LongRunningCallback`` signature: + ``(tool_name, args, session) -> MCP response dict``. + """ + + async def _callback( + tool_name: str, args: dict[str, Any], session: ChatSession + ) -> dict[str, Any]: + operation_id = str(uuid.uuid4()) + task_id = str(uuid.uuid4()) + tool_call_id = f"sdk-{uuid.uuid4().hex[:12]}" + session_id = session.session_id + + # --- Build user-friendly messages (matches non-SDK service) --- + if tool_name == "create_agent": + desc = args.get("description", "") + desc_preview = (desc[:100] + "...") if len(desc) > 100 else desc + pending_msg = ( + f"Creating your agent: {desc_preview}" + if desc_preview + else "Creating agent... This may take a few minutes." + ) + started_msg = ( + "Agent creation started. You can close this tab - " + "check your library in a few minutes." 
+ ) + elif tool_name == "edit_agent": + changes = args.get("changes", "") + changes_preview = (changes[:100] + "...") if len(changes) > 100 else changes + pending_msg = ( + f"Editing agent: {changes_preview}" + if changes_preview + else "Editing agent... This may take a few minutes." + ) + started_msg = ( + "Agent edit started. You can close this tab - " + "check your library in a few minutes." + ) + else: + pending_msg = f"Running {tool_name}... This may take a few minutes." + started_msg = ( + f"{tool_name} started. You can close this tab - " + "check back in a few minutes." + ) + + # --- Register task in Redis for SSE reconnection --- + await stream_registry.create_task( + task_id=task_id, + session_id=session_id, + user_id=user_id, + tool_call_id=tool_call_id, + tool_name=tool_name, + operation_id=operation_id, + ) + + # --- Save OperationPendingResponse to chat history --- + pending_message = ChatMessage( + role="tool", + content=OperationPendingResponse( + message=pending_msg, + operation_id=operation_id, + tool_name=tool_name, + ).model_dump_json(), + tool_call_id=tool_call_id, + ) + session.messages.append(pending_message) + await upsert_chat_session(session) + + # --- Spawn background task (reuses non-SDK infrastructure) --- + bg_task = asyncio.create_task( + _execute_long_running_tool_with_streaming( + tool_name=tool_name, + parameters=args, + tool_call_id=tool_call_id, + operation_id=operation_id, + task_id=task_id, + session_id=session_id, + user_id=user_id, + ) + ) + _background_tasks.add(bg_task) + bg_task.add_done_callback(_background_tasks.discard) + await stream_registry.set_task_asyncio_task(task_id, bg_task) + + logger.info( + f"[SDK] Long-running tool {tool_name} delegated to background " + f"(operation_id={operation_id}, task_id={task_id})" + ) + + # --- Return OperationStartedResponse as MCP tool result --- + # This flows through SDK → response adapter → frontend, triggering + # the loading widget with SSE reconnection support. 
+ started_json = OperationStartedResponse( + message=started_msg, + operation_id=operation_id, + tool_name=tool_name, + task_id=task_id, + ).model_dump_json() + + return { + "content": [{"type": "text", "text": started_json}], + "isError": False, + } + + return _callback + + +def _resolve_sdk_model() -> str | None: + """Resolve the model name for the Claude Agent SDK CLI. + + Uses ``config.claude_agent_model`` if set, otherwise derives from + ``config.model`` by stripping the OpenRouter provider prefix (e.g., + ``"anthropic/claude-opus-4.6"`` → ``"claude-opus-4.6"``). + """ + if config.claude_agent_model: + return config.claude_agent_model + model = config.model + if "/" in model: + return model.split("/", 1)[1] + return model + + +def _build_sdk_env() -> dict[str, str]: + """Build env vars for the SDK CLI process. + + Routes API calls through OpenRouter (or a custom base_url) using + the same ``config.api_key`` / ``config.base_url`` as the non-SDK path. + This gives per-call token and cost tracking on the OpenRouter dashboard. + + Only overrides ``ANTHROPIC_API_KEY`` when a valid proxy URL and auth + token are both present — otherwise returns an empty dict so the SDK + falls back to its default credentials. + """ + env: dict[str, str] = {} + if config.api_key and config.base_url: + # Strip /v1 suffix — SDK expects the base URL without a version path + base = config.base_url.rstrip("/") + if base.endswith("/v1"): + base = base[:-3] + if not base or not base.startswith("http"): + # Invalid base_url — don't override SDK defaults + return env + env["ANTHROPIC_BASE_URL"] = base + env["ANTHROPIC_AUTH_TOKEN"] = config.api_key + # Must be explicitly empty so the CLI uses AUTH_TOKEN instead + env["ANTHROPIC_API_KEY"] = "" + return env + + +def _make_sdk_cwd(session_id: str) -> str: + """Create a safe, session-specific working directory path. 
+ + Delegates to :func:`~backend.api.features.chat.tools.sandbox.make_session_path` + (single source of truth for path sanitization) and adds a defence-in-depth + assertion. + """ + cwd = make_session_path(session_id) + # Defence-in-depth: normpath + startswith is a CodeQL-recognised sanitizer + cwd = os.path.normpath(cwd) + if not cwd.startswith(_SDK_CWD_PREFIX): + raise ValueError(f"SDK cwd escaped prefix: {cwd}") + return cwd + + +def _cleanup_sdk_tool_results(cwd: str) -> None: + """Remove SDK tool-result files for a specific session working directory. + + The SDK creates tool-result files under ~/.claude/projects//tool-results/. + We clean only the specific cwd's results to avoid race conditions between + concurrent sessions. + + Security: cwd MUST be created by _make_sdk_cwd() which sanitizes session_id. + """ + import shutil + + # Validate cwd is under the expected prefix + normalized = os.path.normpath(cwd) + if not normalized.startswith(_SDK_CWD_PREFIX): + logger.warning(f"[SDK] Rejecting cleanup for path outside workspace: {cwd}") + return + + # SDK encodes the cwd path by replacing '/' with '-' + encoded_cwd = normalized.replace("/", "-") + + # Construct the project directory path (known-safe home expansion) + claude_projects = os.path.expanduser("~/.claude/projects") + project_dir = os.path.join(claude_projects, encoded_cwd) + + # Security check 3: Validate project_dir is under ~/.claude/projects + project_dir = os.path.normpath(project_dir) + if not project_dir.startswith(claude_projects): + logger.warning( + f"[SDK] Rejecting cleanup for escaped project path: {project_dir}" + ) + return + + results_dir = os.path.join(project_dir, "tool-results") + if os.path.isdir(results_dir): + for filename in os.listdir(results_dir): + file_path = os.path.join(results_dir, filename) + try: + if os.path.isfile(file_path): + os.remove(file_path) + except OSError: + pass + + # Also clean up the temp cwd directory itself + try: + shutil.rmtree(normalized, 
ignore_errors=True) + except OSError: + pass + + +async def _compress_conversation_history( + session: ChatSession, +) -> list[ChatMessage]: + """Compress prior conversation messages if they exceed the token threshold. + + Uses the shared compress_context() from prompt.py which supports: + - LLM summarization of old messages (keeps recent ones intact) + - Progressive content truncation as fallback + - Middle-out deletion as last resort + + Returns the compressed prior messages (everything except the current message). + """ + prior = session.messages[:-1] + if len(prior) < 2: + return prior + + from backend.util.prompt import compress_context + + # Convert ChatMessages to dicts for compress_context + messages_dict = [] + for msg in prior: + msg_dict: dict[str, Any] = {"role": msg.role} + if msg.content: + msg_dict["content"] = msg.content + if msg.tool_calls: + msg_dict["tool_calls"] = msg.tool_calls + if msg.tool_call_id: + msg_dict["tool_call_id"] = msg.tool_call_id + messages_dict.append(msg_dict) + + try: + import openai + + async with openai.AsyncOpenAI( + api_key=config.api_key, base_url=config.base_url, timeout=30.0 + ) as client: + result = await compress_context( + messages=messages_dict, + model=config.model, + client=client, + ) + except Exception as e: + logger.warning(f"[SDK] Context compression with LLM failed: {e}") + # Fall back to truncation-only (no LLM summarization) + result = await compress_context( + messages=messages_dict, + model=config.model, + client=None, + ) + + if result.was_compacted: + logger.info( + f"[SDK] Context compacted: {result.original_token_count} -> " + f"{result.token_count} tokens " + f"({result.messages_summarized} summarized, " + f"{result.messages_dropped} dropped)" + ) + # Convert compressed dicts back to ChatMessages + return [ + ChatMessage( + role=m["role"], + content=m.get("content"), + tool_calls=m.get("tool_calls"), + tool_call_id=m.get("tool_call_id"), + ) + for m in result.messages + ] + + return prior + + +def 
_format_conversation_context(messages: list[ChatMessage]) -> str | None: + """Format conversation messages into a context prefix for the user message. + + Returns a string like: + + User: hello + You responded: Hi! How can I help? + + + Returns None if there are no messages to format. + """ + if not messages: + return None + + lines: list[str] = [] + for msg in messages: + if not msg.content: + continue + if msg.role == "user": + lines.append(f"User: {msg.content}") + elif msg.role == "assistant": + lines.append(f"You responded: {msg.content}") + # Skip tool messages — they're internal details + + if not lines: + return None + + return "\n" + "\n".join(lines) + "\n" + + +async def stream_chat_completion_sdk( + session_id: str, + message: str | None = None, + tool_call_response: str | None = None, # noqa: ARG001 + is_user_message: bool = True, + user_id: str | None = None, + retry_count: int = 0, # noqa: ARG001 + session: ChatSession | None = None, + context: dict[str, str] | None = None, # noqa: ARG001 +) -> AsyncGenerator[StreamBaseResponse, None]: + """Stream chat completion using Claude Agent SDK. + + Drop-in replacement for stream_chat_completion with improved reliability. + """ + + if session is None: + session = await get_chat_session(session_id, user_id) + + if not session: + raise NotFoundError( + f"Session {session_id} not found. Please create a new session first." 
+ ) + + if message: + session.messages.append( + ChatMessage( + role="user" if is_user_message else "assistant", content=message + ) + ) + if is_user_message: + track_user_message( + user_id=user_id, session_id=session_id, message_length=len(message) + ) + + session = await upsert_chat_session(session) + + # Generate title for new sessions (first user message) + if is_user_message and not session.title: + user_messages = [m for m in session.messages if m.role == "user"] + if len(user_messages) == 1: + first_message = user_messages[0].content or message or "" + if first_message: + task = asyncio.create_task( + _update_title_async(session_id, first_message, user_id) + ) + _background_tasks.add(task) + task.add_done_callback(_background_tasks.discard) + + # Build system prompt (reuses non-SDK path with Langfuse support) + has_history = len(session.messages) > 1 + system_prompt, _ = await _build_system_prompt( + user_id, has_conversation_history=has_history + ) + system_prompt += _SDK_TOOL_SUPPLEMENT + message_id = str(uuid.uuid4()) + task_id = str(uuid.uuid4()) + + yield StreamStart(messageId=message_id, taskId=task_id) + + stream_completed = False + # Initialise sdk_cwd before the try so the finally can reference it + # even if _make_sdk_cwd raises (in that case it stays as ""). + sdk_cwd = "" + use_resume = False + + try: + # Use a session-specific temp dir to avoid cleanup race conditions + # between concurrent sessions. + sdk_cwd = _make_sdk_cwd(session_id) + os.makedirs(sdk_cwd, exist_ok=True) + + set_execution_context( + user_id, + session, + long_running_callback=_build_long_running_callback(user_id), + ) + try: + from claude_agent_sdk import ClaudeAgentOptions, ClaudeSDKClient + + # Fail fast when no API credentials are available at all + sdk_env = _build_sdk_env() + if not sdk_env and not os.environ.get("ANTHROPIC_API_KEY"): + raise RuntimeError( + "No API key configured. 
Set OPEN_ROUTER_API_KEY " + "(or CHAT_API_KEY) for OpenRouter routing, " + "or ANTHROPIC_API_KEY for direct Anthropic access." + ) + + mcp_server = create_copilot_mcp_server() + + sdk_model = _resolve_sdk_model() + + # --- Transcript capture via Stop hook --- + captured_transcript = CapturedTranscript() + + def _on_stop(transcript_path: str, sdk_session_id: str) -> None: + captured_transcript.path = transcript_path + captured_transcript.sdk_session_id = sdk_session_id + + security_hooks = create_security_hooks( + user_id, + sdk_cwd=sdk_cwd, + max_subtasks=config.claude_agent_max_subtasks, + on_stop=_on_stop if config.claude_agent_use_resume else None, + ) + + # --- Resume strategy: download transcript from bucket --- + resume_file: str | None = None + use_resume = False + + if config.claude_agent_use_resume and user_id and len(session.messages) > 1: + transcript_content = await download_transcript(user_id, session_id) + if transcript_content and validate_transcript(transcript_content): + resume_file = write_transcript_to_tempfile( + transcript_content, session_id, sdk_cwd + ) + if resume_file: + use_resume = True + logger.info( + f"[SDK] Using --resume with transcript " + f"({len(transcript_content)} bytes)" + ) + + sdk_options_kwargs: dict[str, Any] = { + "system_prompt": system_prompt, + "mcp_servers": {"copilot": mcp_server}, + "allowed_tools": COPILOT_TOOL_NAMES, + "disallowed_tools": ["Bash"], + "hooks": security_hooks, + "cwd": sdk_cwd, + "max_buffer_size": config.claude_agent_max_buffer_size, + } + if sdk_env: + sdk_options_kwargs["model"] = sdk_model + sdk_options_kwargs["env"] = sdk_env + if use_resume and resume_file: + sdk_options_kwargs["resume"] = resume_file + + options = ClaudeAgentOptions(**sdk_options_kwargs) # type: ignore[arg-type] + + adapter = SDKResponseAdapter(message_id=message_id) + adapter.set_task_id(task_id) + + async with ClaudeSDKClient(options=options) as client: + current_message = message or "" + if not current_message and 
session.messages: + last_user = [m for m in session.messages if m.role == "user"] + if last_user: + current_message = last_user[-1].content or "" + + if not current_message.strip(): + yield StreamError( + errorText="Message cannot be empty.", + code="empty_prompt", + ) + yield StreamFinish() + return + + # Build query: with --resume the CLI already has full + # context, so we only send the new message. Without + # resume, compress history into a context prefix. + query_message = current_message + if not use_resume and len(session.messages) > 1: + logger.warning( + f"[SDK] Using compression fallback for session " + f"{session_id} ({len(session.messages)} messages) — " + f"no transcript available for --resume" + ) + compressed = await _compress_conversation_history(session) + history_context = _format_conversation_context(compressed) + if history_context: + query_message = ( + f"{history_context}\n\n" + f"Now, the user says:\n{current_message}" + ) + + logger.info( + f"[SDK] Sending query ({len(session.messages)} msgs in session)" + ) + logger.debug(f"[SDK] Query preview: {current_message[:80]!r}") + await client.query(query_message, session_id=session_id) + + assistant_response = ChatMessage(role="assistant", content="") + accumulated_tool_calls: list[dict[str, Any]] = [] + has_appended_assistant = False + has_tool_results = False + + async for sdk_msg in client.receive_messages(): + logger.debug( + f"[SDK] Received: {type(sdk_msg).__name__} " + f"{getattr(sdk_msg, 'subtype', '')}" + ) + for response in adapter.convert_message(sdk_msg): + if isinstance(response, StreamStart): + continue + + yield response + + if isinstance(response, StreamTextDelta): + delta = response.delta or "" + # After tool results, start a new assistant + # message for the post-tool text. 
+ if has_tool_results and has_appended_assistant: + assistant_response = ChatMessage( + role="assistant", content=delta + ) + accumulated_tool_calls = [] + has_appended_assistant = False + has_tool_results = False + session.messages.append(assistant_response) + has_appended_assistant = True + else: + assistant_response.content = ( + assistant_response.content or "" + ) + delta + if not has_appended_assistant: + session.messages.append(assistant_response) + has_appended_assistant = True + + elif isinstance(response, StreamToolInputAvailable): + accumulated_tool_calls.append( + { + "id": response.toolCallId, + "type": "function", + "function": { + "name": response.toolName, + "arguments": json.dumps(response.input or {}), + }, + } + ) + assistant_response.tool_calls = accumulated_tool_calls + if not has_appended_assistant: + session.messages.append(assistant_response) + has_appended_assistant = True + + elif isinstance(response, StreamToolOutputAvailable): + session.messages.append( + ChatMessage( + role="tool", + content=( + response.output + if isinstance(response.output, str) + else str(response.output) + ), + tool_call_id=response.toolCallId, + ) + ) + has_tool_results = True + + elif isinstance(response, StreamFinish): + stream_completed = True + + if stream_completed: + break + + if ( + assistant_response.content or assistant_response.tool_calls + ) and not has_appended_assistant: + session.messages.append(assistant_response) + + # --- Capture transcript while CLI is still alive --- + # Must happen INSIDE async with: close() sends SIGTERM + # which kills the CLI before it can flush the JSONL. 
+ if ( + config.claude_agent_use_resume + and user_id + and captured_transcript.available + ): + # Give CLI time to flush JSONL writes before we read + await asyncio.sleep(0.5) + raw_transcript = read_transcript_file(captured_transcript.path) + if raw_transcript: + task = asyncio.create_task( + _upload_transcript_bg(user_id, session_id, raw_transcript) + ) + _background_tasks.add(task) + task.add_done_callback(_background_tasks.discard) + else: + logger.debug("[SDK] Stop hook fired but transcript not usable") + + except ImportError: + raise RuntimeError( + "claude-agent-sdk is not installed. " + "Disable SDK mode (CHAT_USE_CLAUDE_AGENT_SDK=false) " + "to use the OpenAI-compatible fallback." + ) + + await upsert_chat_session(session) + logger.debug( + f"[SDK] Session {session_id} saved with {len(session.messages)} messages" + ) + if not stream_completed: + yield StreamFinish() + + except Exception as e: + logger.error(f"[SDK] Error: {e}", exc_info=True) + try: + await upsert_chat_session(session) + except Exception as save_err: + logger.error(f"[SDK] Failed to save session on error: {save_err}") + yield StreamError( + errorText="An error occurred. 
Please try again.", + code="sdk_error", + ) + yield StreamFinish() + finally: + if sdk_cwd: + _cleanup_sdk_tool_results(sdk_cwd) + + +async def _upload_transcript_bg( + user_id: str, session_id: str, raw_content: str +) -> None: + """Background task to strip progress entries and upload transcript.""" + try: + await upload_transcript(user_id, session_id, raw_content) + except Exception as e: + logger.error(f"[SDK] Failed to upload transcript for {session_id}: {e}") + + +async def _update_title_async( + session_id: str, message: str, user_id: str | None = None +) -> None: + """Background task to update session title.""" + try: + title = await _generate_session_title( + message, user_id=user_id, session_id=session_id + ) + if title: + await update_session_title(session_id, title) + logger.debug(f"[SDK] Generated title for {session_id}: {title}") + except Exception as e: + logger.warning(f"[SDK] Failed to update session title: {e}") diff --git a/autogpt_platform/backend/backend/api/features/chat/sdk/tool_adapter.py b/autogpt_platform/backend/backend/api/features/chat/sdk/tool_adapter.py new file mode 100644 index 0000000000..d983d5e785 --- /dev/null +++ b/autogpt_platform/backend/backend/api/features/chat/sdk/tool_adapter.py @@ -0,0 +1,322 @@ +"""Tool adapter for wrapping existing CoPilot tools as Claude Agent SDK MCP tools. + +This module provides the adapter layer that converts existing BaseTool implementations +into in-process MCP tools that can be used with the Claude Agent SDK. + +Long-running tools (``is_long_running=True``) are delegated to the non-SDK +background infrastructure (stream_registry, Redis persistence, SSE reconnection) +via a callback provided by the service layer. This avoids wasteful SDK polling +and makes results survive page refreshes. 
+""" + +import itertools +import json +import logging +import os +import uuid +from collections.abc import Awaitable, Callable +from contextvars import ContextVar +from typing import Any + +from backend.api.features.chat.model import ChatSession +from backend.api.features.chat.tools import TOOL_REGISTRY +from backend.api.features.chat.tools.base import BaseTool + +logger = logging.getLogger(__name__) + +# Allowed base directory for the Read tool (SDK saves oversized tool results here). +# Restricted to ~/.claude/projects/ and further validated to require "tool-results" +# in the path — prevents reading settings, credentials, or other sensitive files. +_SDK_PROJECTS_DIR = os.path.expanduser("~/.claude/projects/") + +# MCP server naming - the SDK prefixes tool names as "mcp__{server_name}__{tool}" +MCP_SERVER_NAME = "copilot" +MCP_TOOL_PREFIX = f"mcp__{MCP_SERVER_NAME}__" + +# Context variables to pass user/session info to tool execution +_current_user_id: ContextVar[str | None] = ContextVar("current_user_id", default=None) +_current_session: ContextVar[ChatSession | None] = ContextVar( + "current_session", default=None +) +# Stash for MCP tool outputs before the SDK potentially truncates them. +# Keyed by tool_name → full output string. Consumed (popped) by the +# response adapter when it builds StreamToolOutputAvailable. +_pending_tool_outputs: ContextVar[dict[str, str]] = ContextVar( + "pending_tool_outputs", default=None # type: ignore[arg-type] +) + +# Callback type for delegating long-running tools to the non-SDK infrastructure. +# Args: (tool_name, arguments, session) → MCP-formatted response dict. +LongRunningCallback = Callable[ + [str, dict[str, Any], ChatSession], Awaitable[dict[str, Any]] +] + +# ContextVar so the service layer can inject the callback per-request. 
+_long_running_callback: ContextVar[LongRunningCallback | None] = ContextVar( + "long_running_callback", default=None +) + + +def set_execution_context( + user_id: str | None, + session: ChatSession, + long_running_callback: LongRunningCallback | None = None, +) -> None: + """Set the execution context for tool calls. + + This must be called before streaming begins to ensure tools have access + to user_id and session information. + + Args: + user_id: Current user's ID. + session: Current chat session. + long_running_callback: Optional callback to delegate long-running tools + to the non-SDK background infrastructure (stream_registry + Redis). + """ + _current_user_id.set(user_id) + _current_session.set(session) + _pending_tool_outputs.set({}) + _long_running_callback.set(long_running_callback) + + +def get_execution_context() -> tuple[str | None, ChatSession | None]: + """Get the current execution context.""" + return ( + _current_user_id.get(), + _current_session.get(), + ) + + +def pop_pending_tool_output(tool_name: str) -> str | None: + """Pop and return the stashed full output for *tool_name*. + + The SDK CLI may truncate large tool results (writing them to disk and + replacing the content with a file reference). This stash keeps the + original MCP output so the response adapter can forward it to the + frontend for proper widget rendering. + + Returns ``None`` if nothing was stashed for *tool_name*. 
+ """ + pending = _pending_tool_outputs.get(None) + if pending is None: + return None + return pending.pop(tool_name, None) + + +async def _execute_tool_sync( + base_tool: BaseTool, + user_id: str | None, + session: ChatSession, + args: dict[str, Any], +) -> dict[str, Any]: + """Execute a tool synchronously and return MCP-formatted response.""" + effective_id = f"sdk-{uuid.uuid4().hex[:12]}" + result = await base_tool.execute( + user_id=user_id, + session=session, + tool_call_id=effective_id, + **args, + ) + + text = ( + result.output if isinstance(result.output, str) else json.dumps(result.output) + ) + + # Stash the full output before the SDK potentially truncates it. + pending = _pending_tool_outputs.get(None) + if pending is not None: + pending[base_tool.name] = text + + return { + "content": [{"type": "text", "text": text}], + "isError": not result.success, + } + + +def _mcp_error(message: str) -> dict[str, Any]: + return { + "content": [ + {"type": "text", "text": json.dumps({"error": message, "type": "error"})} + ], + "isError": True, + } + + +def create_tool_handler(base_tool: BaseTool): + """Create an async handler function for a BaseTool. + + This wraps the existing BaseTool._execute method to be compatible + with the Claude Agent SDK MCP tool format. + + Long-running tools (``is_long_running=True``) are delegated to the + non-SDK background infrastructure via a callback set in the execution + context. The callback persists the operation in Redis (stream_registry) + so results survive page refreshes and pod restarts. 
+ """ + + async def tool_handler(args: dict[str, Any]) -> dict[str, Any]: + """Execute the wrapped tool and return MCP-formatted response.""" + user_id, session = get_execution_context() + + if session is None: + return _mcp_error("No session context available") + + # --- Long-running: delegate to non-SDK background infrastructure --- + if base_tool.is_long_running: + callback = _long_running_callback.get(None) + if callback: + try: + return await callback(base_tool.name, args, session) + except Exception as e: + logger.error( + f"Long-running callback failed for {base_tool.name}: {e}", + exc_info=True, + ) + return _mcp_error(f"Failed to start {base_tool.name}: {e}") + # No callback — fall through to synchronous execution + logger.warning( + f"[SDK] No long-running callback for {base_tool.name}, " + f"executing synchronously (may block)" + ) + + # --- Normal (fast) tool: execute synchronously --- + try: + return await _execute_tool_sync(base_tool, user_id, session, args) + except Exception as e: + logger.error(f"Error executing tool {base_tool.name}: {e}", exc_info=True) + return _mcp_error(f"Failed to execute {base_tool.name}: {e}") + + return tool_handler + + +def _build_input_schema(base_tool: BaseTool) -> dict[str, Any]: + """Build a JSON Schema input schema for a tool.""" + return { + "type": "object", + "properties": base_tool.parameters.get("properties", {}), + "required": base_tool.parameters.get("required", []), + } + + +async def _read_file_handler(args: dict[str, Any]) -> dict[str, Any]: + """Read a file with optional offset/limit. Restricted to SDK working directory. + + After reading, the file is deleted to prevent accumulation in long-running pods. 
+ """ + file_path = args.get("file_path", "") + offset = args.get("offset", 0) + limit = args.get("limit", 2000) + + # Security: only allow reads under ~/.claude/projects/**/tool-results/ + real_path = os.path.realpath(file_path) + if not real_path.startswith(_SDK_PROJECTS_DIR) or "tool-results" not in real_path: + return { + "content": [{"type": "text", "text": f"Access denied: {file_path}"}], + "isError": True, + } + + try: + with open(real_path) as f: + selected = list(itertools.islice(f, offset, offset + limit)) + content = "".join(selected) + # Cleanup happens in _cleanup_sdk_tool_results after session ends; + # don't delete here — the SDK may read in multiple chunks. + return {"content": [{"type": "text", "text": content}], "isError": False} + except FileNotFoundError: + return { + "content": [{"type": "text", "text": f"File not found: {file_path}"}], + "isError": True, + } + except Exception as e: + return { + "content": [{"type": "text", "text": f"Error reading file: {e}"}], + "isError": True, + } + + +_READ_TOOL_NAME = "Read" +_READ_TOOL_DESCRIPTION = ( + "Read a file from the local filesystem. " + "Use offset and limit to read specific line ranges for large files." +) +_READ_TOOL_SCHEMA = { + "type": "object", + "properties": { + "file_path": { + "type": "string", + "description": "The absolute path to the file to read", + }, + "offset": { + "type": "integer", + "description": "Line number to start reading from (0-indexed). Default: 0", + }, + "limit": { + "type": "integer", + "description": "Number of lines to read. Default: 2000", + }, + }, + "required": ["file_path"], +} + + +# Create the MCP server configuration +def create_copilot_mcp_server(): + """Create an in-process MCP server configuration for CoPilot tools. + + This can be passed to ClaudeAgentOptions.mcp_servers. + + Note: The actual SDK MCP server creation depends on the claude-agent-sdk + package being available. This function returns the configuration that + can be used with the SDK. 
+ """ + try: + from claude_agent_sdk import create_sdk_mcp_server, tool + + # Create decorated tool functions + sdk_tools = [] + + for tool_name, base_tool in TOOL_REGISTRY.items(): + handler = create_tool_handler(base_tool) + decorated = tool( + tool_name, + base_tool.description, + _build_input_schema(base_tool), + )(handler) + sdk_tools.append(decorated) + + # Add the Read tool so the SDK can read back oversized tool results + read_tool = tool( + _READ_TOOL_NAME, + _READ_TOOL_DESCRIPTION, + _READ_TOOL_SCHEMA, + )(_read_file_handler) + sdk_tools.append(read_tool) + + server = create_sdk_mcp_server( + name=MCP_SERVER_NAME, + version="1.0.0", + tools=sdk_tools, + ) + + return server + + except ImportError: + # Let ImportError propagate so service.py handles the fallback + raise + + +# SDK built-in tools allowed within the workspace directory. +# Security hooks validate that file paths stay within sdk_cwd. +# Bash is NOT included — use the sandboxed MCP bash_exec tool instead, +# which provides kernel-level network isolation via unshare --net. +# Task allows spawning sub-agents (rate-limited by security hooks). +_SDK_BUILTIN_TOOLS = ["Read", "Write", "Edit", "Glob", "Grep", "Task"] + +# List of tool names for allowed_tools configuration +# Include MCP tools, the MCP Read tool for oversized results, +# and SDK built-in file tools for workspace operations. +COPILOT_TOOL_NAMES = [ + *[f"{MCP_TOOL_PREFIX}{name}" for name in TOOL_REGISTRY.keys()], + f"{MCP_TOOL_PREFIX}{_READ_TOOL_NAME}", + *_SDK_BUILTIN_TOOLS, +] diff --git a/autogpt_platform/backend/backend/api/features/chat/sdk/transcript.py b/autogpt_platform/backend/backend/api/features/chat/sdk/transcript.py new file mode 100644 index 0000000000..aaa5609227 --- /dev/null +++ b/autogpt_platform/backend/backend/api/features/chat/sdk/transcript.py @@ -0,0 +1,356 @@ +"""JSONL transcript management for stateless multi-turn resume. + +The Claude Code CLI persists conversations as JSONL files (one JSON object per +line). 
When the SDK's ``Stop`` hook fires we read this file, strip bloat +(progress entries, metadata), and upload the result to bucket storage. On the +next turn we download the transcript, write it to a temp file, and pass +``--resume`` so the CLI can reconstruct the full conversation. + +Storage is handled via ``WorkspaceStorageBackend`` (GCS in prod, local +filesystem for self-hosted) — no DB column needed. +""" + +import json +import logging +import os +import re + +logger = logging.getLogger(__name__) + +# UUIDs are hex + hyphens; strip everything else to prevent path injection. +_SAFE_ID_RE = re.compile(r"[^0-9a-fA-F-]") + +# Entry types that can be safely removed from the transcript without breaking +# the parentUuid conversation tree that ``--resume`` relies on. +# - progress: UI progress ticks, no message content (avg 97KB for agent_progress) +# - file-history-snapshot: undo tracking metadata +# - queue-operation: internal queue bookkeeping +# - summary: session summaries +# - pr-link: PR link metadata +STRIPPABLE_TYPES = frozenset( + {"progress", "file-history-snapshot", "queue-operation", "summary", "pr-link"} +) + +# Workspace storage constants — deterministic path from session_id. +TRANSCRIPT_STORAGE_PREFIX = "chat-transcripts" + + +# --------------------------------------------------------------------------- +# Progress stripping +# --------------------------------------------------------------------------- + + +def strip_progress_entries(content: str) -> str: + """Remove progress/metadata entries from a JSONL transcript. + + Removes entries whose ``type`` is in ``STRIPPABLE_TYPES`` and reparents + any remaining child entries so the ``parentUuid`` chain stays intact. + Typically reduces transcript size by ~30%. 
+ """ + lines = content.strip().split("\n") + + entries: list[dict] = [] + for line in lines: + try: + entries.append(json.loads(line)) + except json.JSONDecodeError: + # Keep unparseable lines as-is (safety) + entries.append({"_raw": line}) + + stripped_uuids: set[str] = set() + uuid_to_parent: dict[str, str] = {} + kept: list[dict] = [] + + for entry in entries: + if "_raw" in entry: + kept.append(entry) + continue + uid = entry.get("uuid", "") + parent = entry.get("parentUuid", "") + entry_type = entry.get("type", "") + + if uid: + uuid_to_parent[uid] = parent + + if entry_type in STRIPPABLE_TYPES: + if uid: + stripped_uuids.add(uid) + else: + kept.append(entry) + + # Reparent: walk up chain through stripped entries to find surviving ancestor + for entry in kept: + if "_raw" in entry: + continue + parent = entry.get("parentUuid", "") + original_parent = parent + while parent in stripped_uuids: + parent = uuid_to_parent.get(parent, "") + if parent != original_parent: + entry["parentUuid"] = parent + + result_lines: list[str] = [] + for entry in kept: + if "_raw" in entry: + result_lines.append(entry["_raw"]) + else: + result_lines.append(json.dumps(entry, separators=(",", ":"))) + + return "\n".join(result_lines) + "\n" + + +# --------------------------------------------------------------------------- +# Local file I/O (read from CLI's JSONL, write temp file for --resume) +# --------------------------------------------------------------------------- + + +def read_transcript_file(transcript_path: str) -> str | None: + """Read a JSONL transcript file from disk. + + Returns the raw JSONL content, or ``None`` if the file is missing, empty, + or only contains metadata (≤2 lines with no conversation messages). 
+ """ + if not transcript_path or not os.path.isfile(transcript_path): + logger.debug(f"[Transcript] File not found: {transcript_path}") + return None + + try: + with open(transcript_path) as f: + content = f.read() + + if not content.strip(): + logger.debug(f"[Transcript] Empty file: {transcript_path}") + return None + + lines = content.strip().split("\n") + if len(lines) < 3: + # Raw files with ≤2 lines are metadata-only + # (queue-operation + file-history-snapshot, no conversation). + logger.debug( + f"[Transcript] Too few lines ({len(lines)}): {transcript_path}" + ) + return None + + # Quick structural validation — parse first and last lines. + json.loads(lines[0]) + json.loads(lines[-1]) + + logger.info( + f"[Transcript] Read {len(lines)} lines, " + f"{len(content)} bytes from {transcript_path}" + ) + return content + + except (json.JSONDecodeError, OSError) as e: + logger.warning(f"[Transcript] Failed to read {transcript_path}: {e}") + return None + + +def _sanitize_id(raw_id: str, max_len: int = 36) -> str: + """Sanitize an ID for safe use in file paths. + + Session/user IDs are expected to be UUIDs (hex + hyphens). Strip + everything else and truncate to *max_len* so the result cannot introduce + path separators or other special characters. + """ + cleaned = _SAFE_ID_RE.sub("", raw_id or "")[:max_len] + return cleaned or "unknown" + + +_SAFE_CWD_PREFIX = os.path.realpath("/tmp/copilot-") + + +def write_transcript_to_tempfile( + transcript_content: str, + session_id: str, + cwd: str, +) -> str | None: + """Write JSONL transcript to a temp file inside *cwd* for ``--resume``. + + The file lives in the session working directory so it is cleaned up + automatically when the session ends. + + Returns the absolute path to the file, or ``None`` on failure. + """ + # Validate cwd is under the expected sandbox prefix (CodeQL sanitizer). 
+ real_cwd = os.path.realpath(cwd) + if not real_cwd.startswith(_SAFE_CWD_PREFIX): + logger.warning(f"[Transcript] cwd outside sandbox: {cwd}") + return None + + try: + os.makedirs(real_cwd, exist_ok=True) + safe_id = _sanitize_id(session_id, max_len=8) + jsonl_path = os.path.realpath( + os.path.join(real_cwd, f"transcript-{safe_id}.jsonl") + ) + if not jsonl_path.startswith(real_cwd): + logger.warning(f"[Transcript] Path escaped cwd: {jsonl_path}") + return None + + with open(jsonl_path, "w") as f: + f.write(transcript_content) + + logger.info(f"[Transcript] Wrote resume file: {jsonl_path}") + return jsonl_path + + except OSError as e: + logger.warning(f"[Transcript] Failed to write resume file: {e}") + return None + + +def validate_transcript(content: str | None) -> bool: + """Check that a transcript has actual conversation messages. + + A valid transcript for resume needs at least one user message and one + assistant message (not just queue-operation / file-history-snapshot + metadata). + """ + if not content or not content.strip(): + return False + + lines = content.strip().split("\n") + if len(lines) < 2: + return False + + has_user = False + has_assistant = False + + for line in lines: + try: + entry = json.loads(line) + msg_type = entry.get("type") + if msg_type == "user": + has_user = True + elif msg_type == "assistant": + has_assistant = True + except json.JSONDecodeError: + return False + + return has_user and has_assistant + + +# --------------------------------------------------------------------------- +# Bucket storage (GCS / local via WorkspaceStorageBackend) +# --------------------------------------------------------------------------- + + +def _storage_path_parts(user_id: str, session_id: str) -> tuple[str, str, str]: + """Return (workspace_id, file_id, filename) for a session's transcript. + + Path structure: ``chat-transcripts/{user_id}/{session_id}.jsonl`` + IDs are sanitized to hex+hyphen to prevent path traversal. 
+ """ + return ( + TRANSCRIPT_STORAGE_PREFIX, + _sanitize_id(user_id), + f"{_sanitize_id(session_id)}.jsonl", + ) + + +def _build_storage_path(user_id: str, session_id: str, backend: object) -> str: + """Build the full storage path string that ``retrieve()`` expects. + + ``store()`` returns a path like ``gcs://bucket/workspaces/...`` or + ``local://workspace_id/file_id/filename``. Since we use deterministic + arguments we can reconstruct the same path for download/delete without + having stored the return value. + """ + from backend.util.workspace_storage import GCSWorkspaceStorage + + wid, fid, fname = _storage_path_parts(user_id, session_id) + + if isinstance(backend, GCSWorkspaceStorage): + blob = f"workspaces/{wid}/{fid}/{fname}" + return f"gcs://{backend.bucket_name}/{blob}" + else: + # LocalWorkspaceStorage returns local://{relative_path} + return f"local://{wid}/{fid}/{fname}" + + +async def upload_transcript(user_id: str, session_id: str, content: str) -> None: + """Strip progress entries and upload transcript to bucket storage. + + Safety: only overwrites when the new (stripped) transcript is larger than + what is already stored. Since JSONL is append-only, the latest transcript + is always the longest. This prevents a slow/stale background task from + clobbering a newer upload from a concurrent turn. 
+ """ + from backend.util.workspace_storage import get_workspace_storage + + stripped = strip_progress_entries(content) + if not validate_transcript(stripped): + logger.warning( + f"[Transcript] Skipping upload — stripped content is not a valid " + f"transcript for session {session_id}" + ) + return + + storage = await get_workspace_storage() + wid, fid, fname = _storage_path_parts(user_id, session_id) + encoded = stripped.encode("utf-8") + new_size = len(encoded) + + # Check existing transcript size to avoid overwriting newer with older + path = _build_storage_path(user_id, session_id, storage) + try: + existing = await storage.retrieve(path) + if len(existing) >= new_size: + logger.info( + f"[Transcript] Skipping upload — existing transcript " + f"({len(existing)}B) >= new ({new_size}B) for session " + f"{session_id}" + ) + return + except (FileNotFoundError, Exception): + pass # No existing transcript or retrieval error — proceed with upload + + await storage.store( + workspace_id=wid, + file_id=fid, + filename=fname, + content=encoded, + ) + logger.info( + f"[Transcript] Uploaded {new_size} bytes " + f"(stripped from {len(content)}) for session {session_id}" + ) + + +async def download_transcript(user_id: str, session_id: str) -> str | None: + """Download transcript from bucket storage. + + Returns the JSONL content string, or ``None`` if not found. 
+ """ + from backend.util.workspace_storage import get_workspace_storage + + storage = await get_workspace_storage() + path = _build_storage_path(user_id, session_id, storage) + + try: + data = await storage.retrieve(path) + content = data.decode("utf-8") + logger.info( + f"[Transcript] Downloaded {len(content)} bytes for session {session_id}" + ) + return content + except FileNotFoundError: + logger.debug(f"[Transcript] No transcript in storage for {session_id}") + return None + except Exception as e: + logger.warning(f"[Transcript] Failed to download transcript: {e}") + return None + + +async def delete_transcript(user_id: str, session_id: str) -> None: + """Delete transcript from bucket storage (e.g. after resume failure).""" + from backend.util.workspace_storage import get_workspace_storage + + storage = await get_workspace_storage() + path = _build_storage_path(user_id, session_id, storage) + + try: + await storage.delete(path) + logger.info(f"[Transcript] Deleted transcript for session {session_id}") + except Exception as e: + logger.warning(f"[Transcript] Failed to delete transcript: {e}") diff --git a/autogpt_platform/backend/backend/api/features/chat/service.py b/autogpt_platform/backend/backend/api/features/chat/service.py index b8ddc35960..cb5591e6d0 100644 --- a/autogpt_platform/backend/backend/api/features/chat/service.py +++ b/autogpt_platform/backend/backend/api/features/chat/service.py @@ -245,12 +245,16 @@ async def _get_system_prompt_template(context: str) -> str: return DEFAULT_SYSTEM_PROMPT.format(users_information=context) -async def _build_system_prompt(user_id: str | None) -> tuple[str, Any]: +async def _build_system_prompt( + user_id: str | None, has_conversation_history: bool = False +) -> tuple[str, Any]: """Build the full system prompt including business understanding if available. Args: - user_id: The user ID for fetching business understanding - If "default" and this is the user's first session, will use "onboarding" instead. 
+ user_id: The user ID for fetching business understanding. + has_conversation_history: Whether there's existing conversation history. + If True, we don't tell the model to greet/introduce (since they're + already in a conversation). Returns: Tuple of (compiled prompt string, business understanding object) @@ -266,6 +270,8 @@ async def _build_system_prompt(user_id: str | None) -> tuple[str, Any]: if understanding: context = format_understanding_for_prompt(understanding) + elif has_conversation_history: + context = "No prior understanding saved yet. Continue the existing conversation naturally." else: context = "This is the first time you are meeting the user. Greet them and introduce them to the platform" @@ -374,7 +380,6 @@ async def stream_chat_completion( Raises: NotFoundError: If session_id is invalid - ValueError: If max_context_messages is exceeded """ completion_start = time.monotonic() @@ -459,8 +464,9 @@ async def stream_chat_completion( # Generate title for new sessions on first user message (non-blocking) # Check: is_user_message, no title yet, and this is the first user message - if is_user_message and message and not session.title: - user_messages = [m for m in session.messages if m.role == "user"] + user_messages = [m for m in session.messages if m.role == "user"] + first_user_msg = message or (user_messages[0].content if user_messages else None) + if is_user_message and first_user_msg and not session.title: if len(user_messages) == 1: # First user message - generate title in background import asyncio @@ -468,7 +474,7 @@ async def stream_chat_completion( # Capture only the values we need (not the session object) to avoid # stale data issues when the main flow modifies the session captured_session_id = session_id - captured_message = message + captured_message = first_user_msg captured_user_id = user_id async def _update_title(): @@ -1237,7 +1243,7 @@ async def _stream_chat_chunks( total_time = (time_module.perf_counter() - stream_chunks_start) * 1000 
logger.info( - f"[TIMING] _stream_chat_chunks COMPLETED in {total_time/1000:.1f}s; " + f"[TIMING] _stream_chat_chunks COMPLETED in {total_time / 1000:.1f}s; " f"session={session.session_id}, user={session.user_id}", extra={"json_fields": {**log_meta, "total_time_ms": total_time}}, ) diff --git a/autogpt_platform/backend/backend/api/features/chat/service_test.py b/autogpt_platform/backend/backend/api/features/chat/service_test.py index 70f27af14f..b2fc82b790 100644 --- a/autogpt_platform/backend/backend/api/features/chat/service_test.py +++ b/autogpt_platform/backend/backend/api/features/chat/service_test.py @@ -1,3 +1,4 @@ +import asyncio import logging from os import getenv @@ -11,6 +12,8 @@ from .response_model import ( StreamTextDelta, StreamToolOutputAvailable, ) +from .sdk import service as sdk_service +from .sdk.transcript import download_transcript logger = logging.getLogger(__name__) @@ -80,3 +83,96 @@ async def test_stream_chat_completion_with_tool_calls(setup_test_user, test_user session = await get_chat_session(session.session_id) assert session, "Session not found" assert session.usage, "Usage is empty" + + +@pytest.mark.asyncio(loop_scope="session") +async def test_sdk_resume_multi_turn(setup_test_user, test_user_id): + """Test that the SDK --resume path captures and uses transcripts across turns. + + Turn 1: Send a message containing a unique keyword. + Turn 2: Ask the model to recall that keyword — proving the transcript was + persisted and restored via --resume. 
+ """ + api_key: str | None = getenv("OPEN_ROUTER_API_KEY") + if not api_key: + return pytest.skip("OPEN_ROUTER_API_KEY is not set, skipping test") + + from .config import ChatConfig + + cfg = ChatConfig() + if not cfg.claude_agent_use_resume: + return pytest.skip("CLAUDE_AGENT_USE_RESUME is not enabled, skipping test") + + session = await create_chat_session(test_user_id) + session = await upsert_chat_session(session) + + # --- Turn 1: send a message with a unique keyword --- + keyword = "ZEPHYR42" + turn1_msg = ( + f"Please remember this special keyword: {keyword}. " + "Just confirm you've noted it, keep your response brief." + ) + turn1_text = "" + turn1_errors: list[str] = [] + turn1_ended = False + + async for chunk in sdk_service.stream_chat_completion_sdk( + session.session_id, + turn1_msg, + user_id=test_user_id, + ): + if isinstance(chunk, StreamTextDelta): + turn1_text += chunk.delta + elif isinstance(chunk, StreamError): + turn1_errors.append(chunk.errorText) + elif isinstance(chunk, StreamFinish): + turn1_ended = True + + assert turn1_ended, "Turn 1 did not finish" + assert not turn1_errors, f"Turn 1 errors: {turn1_errors}" + assert turn1_text, "Turn 1 produced no text" + + # Wait for background upload task to complete (retry up to 5s) + transcript = None + for _ in range(10): + await asyncio.sleep(0.5) + transcript = await download_transcript(test_user_id, session.session_id) + if transcript: + break + assert transcript, ( + "Transcript was not uploaded to bucket after turn 1 — " + "Stop hook may not have fired or transcript was too small" + ) + logger.info(f"Turn 1 transcript uploaded: {len(transcript)} bytes") + + # Reload session for turn 2 + session = await get_chat_session(session.session_id, test_user_id) + assert session, "Session not found after turn 1" + + # --- Turn 2: ask model to recall the keyword --- + turn2_msg = "What was the special keyword I asked you to remember?" 
+ turn2_text = "" + turn2_errors: list[str] = [] + turn2_ended = False + + async for chunk in sdk_service.stream_chat_completion_sdk( + session.session_id, + turn2_msg, + user_id=test_user_id, + session=session, + ): + if isinstance(chunk, StreamTextDelta): + turn2_text += chunk.delta + elif isinstance(chunk, StreamError): + turn2_errors.append(chunk.errorText) + elif isinstance(chunk, StreamFinish): + turn2_ended = True + + assert turn2_ended, "Turn 2 did not finish" + assert not turn2_errors, f"Turn 2 errors: {turn2_errors}" + assert turn2_text, "Turn 2 produced no text" + assert keyword in turn2_text, ( + f"Model did not recall keyword '{keyword}' in turn 2. " + f"Response: {turn2_text[:200]}" + ) + logger.info(f"Turn 2 recalled keyword successfully: {turn2_text[:100]}") diff --git a/autogpt_platform/backend/backend/api/features/chat/stream_registry.py b/autogpt_platform/backend/backend/api/features/chat/stream_registry.py index abc34b1fc9..671aefc7ba 100644 --- a/autogpt_platform/backend/backend/api/features/chat/stream_registry.py +++ b/autogpt_platform/backend/backend/api/features/chat/stream_registry.py @@ -814,6 +814,28 @@ async def get_active_task_for_session( if task_user_id and user_id != task_user_id: continue + # Auto-expire stale tasks that exceeded stream_timeout + created_at_str = meta.get("created_at", "") + if created_at_str: + try: + created_at = datetime.fromisoformat(created_at_str) + age_seconds = ( + datetime.now(timezone.utc) - created_at + ).total_seconds() + if age_seconds > config.stream_timeout: + logger.warning( + f"[TASK_LOOKUP] Auto-expiring stale task {task_id[:8]}... " + f"(age={age_seconds:.0f}s > timeout={config.stream_timeout}s)" + ) + await mark_task_completed(task_id, "failed") + continue + except (ValueError, TypeError): + pass + + logger.info( + f"[TASK_LOOKUP] Found running task {task_id[:8]}... for session {session_id[:8]}..." 
+ ) + # Get the last message ID from Redis Stream stream_key = _get_task_stream_key(task_id) last_id = "0-0" diff --git a/autogpt_platform/backend/backend/api/features/chat/tools/__init__.py b/autogpt_platform/backend/backend/api/features/chat/tools/__init__.py index 350776081a..1ab4f720bb 100644 --- a/autogpt_platform/backend/backend/api/features/chat/tools/__init__.py +++ b/autogpt_platform/backend/backend/api/features/chat/tools/__init__.py @@ -9,6 +9,8 @@ from backend.api.features.chat.tracking import track_tool_called from .add_understanding import AddUnderstandingTool from .agent_output import AgentOutputTool from .base import BaseTool +from .bash_exec import BashExecTool +from .check_operation_status import CheckOperationStatusTool from .create_agent import CreateAgentTool from .customize_agent import CustomizeAgentTool from .edit_agent import EditAgentTool @@ -20,6 +22,7 @@ from .get_doc_page import GetDocPageTool from .run_agent import RunAgentTool from .run_block import RunBlockTool from .search_docs import SearchDocsTool +from .web_fetch import WebFetchTool from .workspace_files import ( DeleteWorkspaceFileTool, ListWorkspaceFilesTool, @@ -44,8 +47,14 @@ TOOL_REGISTRY: dict[str, BaseTool] = { "run_agent": RunAgentTool(), "run_block": RunBlockTool(), "view_agent_output": AgentOutputTool(), + "check_operation_status": CheckOperationStatusTool(), "search_docs": SearchDocsTool(), "get_doc_page": GetDocPageTool(), + # Web fetch for safe URL retrieval + "web_fetch": WebFetchTool(), + # Sandboxed code execution (bubblewrap) + "bash_exec": BashExecTool(), + # Persistent workspace tools (cloud storage, survives across sessions) # Feature request tools "search_feature_requests": SearchFeatureRequestsTool(), "create_feature_request": CreateFeatureRequestTool(), diff --git a/autogpt_platform/backend/backend/api/features/chat/tools/bash_exec.py b/autogpt_platform/backend/backend/api/features/chat/tools/bash_exec.py new file mode 100644 index 0000000000..da9d8bf3fa 
--- /dev/null
+++ b/autogpt_platform/backend/backend/api/features/chat/tools/bash_exec.py
@@ -0,0 +1,131 @@
"""Bash execution tool — run shell commands in a bubblewrap sandbox.

Full Bash scripting is allowed (loops, conditionals, pipes, functions, etc.).
Safety comes from OS-level isolation (bubblewrap): only system dirs visible
read-only, writable workspace only, clean env, no network.

Requires bubblewrap (``bwrap``) — the tool is disabled when bwrap is not
available (e.g. macOS development).
"""

import logging
from typing import Any

from backend.api.features.chat.model import ChatSession
from backend.api.features.chat.tools.base import BaseTool
from backend.api.features.chat.tools.models import (
    BashExecResponse,
    ErrorResponse,
    ToolResponseBase,
)
from backend.api.features.chat.tools.sandbox import (
    get_workspace_dir,
    has_full_sandbox,
    run_sandboxed,
)

logger = logging.getLogger(__name__)


class BashExecTool(BaseTool):
    """Execute Bash commands in a bubblewrap sandbox."""

    @property
    def name(self) -> str:
        return "bash_exec"

    @property
    def description(self) -> str:
        # The description doubles as the model-facing tool prompt: when the
        # sandbox is unavailable we advertise the tool as disabled so the
        # model is steered away from calling it (and _execute refuses anyway).
        if not has_full_sandbox():
            return (
                "Bash execution is DISABLED — bubblewrap sandbox is not "
                "available on this platform. Do not call this tool."
            )
        return (
            "Execute a Bash command or script in a bubblewrap sandbox. "
            "Full Bash scripting is supported (loops, conditionals, pipes, "
            "functions, etc.). "
            "The sandbox shares the same working directory as the SDK Read/Write "
            "tools — files created by either are accessible to both. "
            "SECURITY: Only system directories (/usr, /bin, /lib, /etc) are "
            "visible read-only, the per-session workspace is the only writable "
            "path, environment variables are wiped (no secrets), all network "
            "access is blocked at the kernel level, and resource limits are "
            "enforced (max 64 processes, 512MB memory, 50MB file size). "
            "Application code, configs, and other directories are NOT accessible. "
            "To fetch web content, use the web_fetch tool instead. "
            "Execution is killed after the timeout (default 30s, max 120s). "
            "Returns stdout and stderr. "
            "Useful for file manipulation, data processing with Unix tools "
            "(grep, awk, sed, jq, etc.), and running shell scripts."
        )

    @property
    def parameters(self) -> dict[str, Any]:
        # JSON schema for the tool's arguments (command is the only required one).
        return {
            "type": "object",
            "properties": {
                "command": {
                    "type": "string",
                    "description": "Bash command or script to execute.",
                },
                "timeout": {
                    "type": "integer",
                    "description": (
                        "Max execution time in seconds (default 30, max 120)."
                    ),
                    "default": 30,
                },
            },
            "required": ["command"],
        }

    @property
    def requires_auth(self) -> bool:
        # No user credentials are involved; isolation comes from the sandbox.
        return False

    async def _execute(
        self,
        user_id: str | None,
        session: ChatSession,
        **kwargs: Any,
    ) -> ToolResponseBase:
        """Run the given command inside the per-session bubblewrap sandbox.

        Returns a BashExecResponse carrying stdout/stderr/exit_code (and a
        timed_out flag), or an ErrorResponse when the sandbox is unavailable
        or no command was provided.
        """
        session_id = session.session_id if session else None

        # Hard gate: never execute without kernel-level isolation, even if
        # the model calls the tool despite the DISABLED description.
        if not has_full_sandbox():
            return ErrorResponse(
                message="bash_exec requires bubblewrap sandbox (Linux only).",
                error="sandbox_unavailable",
                session_id=session_id,
            )

        command: str = (kwargs.get("command") or "").strip()
        timeout: int = kwargs.get("timeout", 30)
        # NOTE(review): timeout is not clamped to the advertised 120s max
        # here — presumably run_sandboxed enforces its own cap; confirm.

        if not command:
            return ErrorResponse(
                message="No command provided.",
                error="empty_command",
                session_id=session_id,
            )

        # Same path the SDK file tools use, so artifacts are shared.
        workspace = get_workspace_dir(session_id or "default")

        stdout, stderr, exit_code, timed_out = await run_sandboxed(
            command=["bash", "-c", command],
            cwd=workspace,
            timeout=timeout,
        )

        return BashExecResponse(
            message=(
                "Execution timed out"
                if timed_out
                else f"Command executed (exit {exit_code})"
            ),
            stdout=stdout,
            stderr=stderr,
            exit_code=exit_code,
            timed_out=timed_out,
            session_id=session_id,
        )
diff --git a/autogpt_platform/backend/backend/api/features/chat/tools/check_operation_status.py
b/autogpt_platform/backend/backend/api/features/chat/tools/check_operation_status.py
new file mode 100644
index 0000000000..b8ec770fd0
--- /dev/null
+++ b/autogpt_platform/backend/backend/api/features/chat/tools/check_operation_status.py
@@ -0,0 +1,127 @@
"""CheckOperationStatusTool — query the status of a long-running operation."""

import logging
from typing import Any

from backend.api.features.chat.model import ChatSession
from backend.api.features.chat.tools.base import BaseTool
from backend.api.features.chat.tools.models import (
    ErrorResponse,
    ResponseType,
    ToolResponseBase,
)

logger = logging.getLogger(__name__)


class OperationStatusResponse(ToolResponseBase):
    """Response for check_operation_status tool."""

    type: ResponseType = ResponseType.OPERATION_STATUS
    # Identifiers echoing the tracked task in the stream registry.
    task_id: str
    operation_id: str
    status: str  # "running", "completed", "failed"
    tool_name: str | None = None
    message: str = ""


class CheckOperationStatusTool(BaseTool):
    """Check the status of a long-running operation (create_agent, edit_agent, etc.).

    The CoPilot uses this tool to report back to the user whether an
    operation that was started earlier has completed, failed, or is still
    running.
    """

    @property
    def name(self) -> str:
        return "check_operation_status"

    @property
    def description(self) -> str:
        return (
            "Check the current status of a long-running operation such as "
            "create_agent or edit_agent. Accepts either an operation_id or "
            "task_id from a previous operation_started response. "
            "Returns the current status: running, completed, or failed."
        )

    @property
    def parameters(self) -> dict[str, Any]:
        # Neither field is individually required — _execute validates that
        # at least one of the two is supplied.
        return {
            "type": "object",
            "properties": {
                "operation_id": {
                    "type": "string",
                    "description": (
                        "The operation_id from an operation_started response."
                    ),
                },
                "task_id": {
                    "type": "string",
                    "description": (
                        "The task_id from an operation_started response. "
                        "Used as fallback if operation_id is not provided."
                    ),
                },
            },
            "required": [],
        }

    @property
    def requires_auth(self) -> bool:
        # Read-only status lookup; no user credentials needed.
        return False

    async def _execute(
        self,
        user_id: str | None,
        session: ChatSession,
        **kwargs,
    ) -> ToolResponseBase:
        """Look up a tracked task by operation_id (preferred) or task_id.

        Returns an OperationStatusResponse with the task's current status,
        or an ErrorResponse when no identifier was given or the task has
        expired from the registry.
        """
        # Imported lazily to avoid a circular import with the chat package.
        from backend.api.features.chat import stream_registry

        operation_id = (kwargs.get("operation_id") or "").strip()
        task_id = (kwargs.get("task_id") or "").strip()

        if not operation_id and not task_id:
            return ErrorResponse(
                message="Please provide an operation_id or task_id.",
                error="missing_parameter",
            )

        # Prefer operation_id lookup; fall back to task_id if that misses.
        task = None
        if operation_id:
            task = await stream_registry.find_task_by_operation_id(operation_id)
        if task is None and task_id:
            task = await stream_registry.get_task(task_id)

        if task is None:
            # Task not in Redis — it may have already expired (TTL).
            # Check conversation history for the result instead.
            return ErrorResponse(
                message=(
                    "Operation not found — it may have already completed and "
                    "expired from the status tracker. Check the conversation "
                    "history for the result."
                ),
                error="not_found",
            )

        # Human-readable summaries keyed by the registry's status values.
        status_messages = {
            "running": (
                f"The {task.tool_name or 'operation'} is still running. "
                "Please wait for it to complete."
            ),
            "completed": (
                f"The {task.tool_name or 'operation'} has completed successfully."
            ),
            "failed": f"The {task.tool_name or 'operation'} has failed.",
        }

        return OperationStatusResponse(
            task_id=task.task_id,
            operation_id=task.operation_id,
            status=task.status,
            tool_name=task.tool_name,
            message=status_messages.get(task.status, f"Status: {task.status}"),
        )
diff --git a/autogpt_platform/backend/backend/api/features/chat/tools/find_block.py b/autogpt_platform/backend/backend/api/features/chat/tools/find_block.py
index 55b1c0d510..c51317cb62 100644
--- a/autogpt_platform/backend/backend/api/features/chat/tools/find_block.py
+++ b/autogpt_platform/backend/backend/api/features/chat/tools/find_block.py
@@ -146,6 +146,7 @@ class FindBlockTool(BaseTool):
                     id=block_id,
                     name=block.name,
                     description=block.description or "",
+                    categories=[c.value for c in block.categories],
                 )
             )

diff --git a/autogpt_platform/backend/backend/api/features/chat/tools/models.py b/autogpt_platform/backend/backend/api/features/chat/tools/models.py
index f2d8f364e4..b32f6ca2ce 100644
--- a/autogpt_platform/backend/backend/api/features/chat/tools/models.py
+++ b/autogpt_platform/backend/backend/api/features/chat/tools/models.py
@@ -41,6 +41,12 @@ class ResponseType(str, Enum):
     OPERATION_IN_PROGRESS = "operation_in_progress"
     # Input validation
     INPUT_VALIDATION_ERROR = "input_validation_error"
+    # Web fetch
+    WEB_FETCH = "web_fetch"
+    # Code execution
+    BASH_EXEC = "bash_exec"
+    # Operation status check
+    OPERATION_STATUS = "operation_status"
     # Feature request types
     FEATURE_REQUEST_SEARCH = "feature_request_search"
     FEATURE_REQUEST_CREATED = "feature_request_created"
@@ -338,6 +344,19 @@ class BlockInfoSummary(BaseModel):
     id: str
     name: str
     description: str
+    categories: list[str]
+    input_schema: dict[str, Any] = Field(
+        default_factory=dict,
+        description="Full JSON schema for block inputs",
+    )
+    output_schema: dict[str, Any] = Field(
+        default_factory=dict,
+        description="Full JSON schema for block outputs",
+    )
+    required_inputs: list[BlockInputFieldInfo] = Field(
default_factory=list, + description="List of input fields for this block", + ) class BlockListResponse(ToolResponseBase): @@ -347,6 +366,10 @@ class BlockListResponse(ToolResponseBase): blocks: list[BlockInfoSummary] count: int query: str + usage_hint: str = Field( + default="To execute a block, call run_block with block_id set to the block's " + "'id' field and input_data containing the fields listed in required_inputs." + ) class BlockDetails(BaseModel): @@ -435,6 +458,27 @@ class AsyncProcessingResponse(ToolResponseBase): task_id: str | None = None +class WebFetchResponse(ToolResponseBase): + """Response for web_fetch tool.""" + + type: ResponseType = ResponseType.WEB_FETCH + url: str + status_code: int + content_type: str + content: str + truncated: bool = False + + +class BashExecResponse(ToolResponseBase): + """Response for bash_exec tool.""" + + type: ResponseType = ResponseType.BASH_EXEC + stdout: str + stderr: str + exit_code: int + timed_out: bool = False + + # Feature request models class FeatureRequestInfo(BaseModel): """Information about a feature request issue.""" diff --git a/autogpt_platform/backend/backend/api/features/chat/tools/sandbox.py b/autogpt_platform/backend/backend/api/features/chat/tools/sandbox.py new file mode 100644 index 0000000000..beb326f909 --- /dev/null +++ b/autogpt_platform/backend/backend/api/features/chat/tools/sandbox.py @@ -0,0 +1,265 @@ +"""Sandbox execution utilities for code execution tools. + +Provides filesystem + network isolated command execution using **bubblewrap** +(``bwrap``): whitelist-only filesystem (only system dirs visible read-only), +writable workspace only, clean environment, network blocked. + +Tools that call :func:`run_sandboxed` must first check :func:`has_full_sandbox` +and refuse to run if bubblewrap is not available. 
+""" + +import asyncio +import logging +import os +import platform +import shutil + +logger = logging.getLogger(__name__) + +_DEFAULT_TIMEOUT = 30 +_MAX_TIMEOUT = 120 + + +# --------------------------------------------------------------------------- +# Sandbox capability detection (cached at first call) +# --------------------------------------------------------------------------- + +_BWRAP_AVAILABLE: bool | None = None + + +def has_full_sandbox() -> bool: + """Return True if bubblewrap is available (filesystem + network isolation). + + On non-Linux platforms (macOS), always returns False. + """ + global _BWRAP_AVAILABLE + if _BWRAP_AVAILABLE is None: + _BWRAP_AVAILABLE = ( + platform.system() == "Linux" and shutil.which("bwrap") is not None + ) + return _BWRAP_AVAILABLE + + +WORKSPACE_PREFIX = "/tmp/copilot-" + + +def make_session_path(session_id: str) -> str: + """Build a sanitized, session-specific path under :data:`WORKSPACE_PREFIX`. + + Shared by both the SDK working-directory setup and the sandbox tools so + they always resolve to the same directory for a given session. + + Steps: + 1. Strip all characters except ``[A-Za-z0-9-]``. + 2. Construct ``/tmp/copilot-``. + 3. Validate via ``os.path.normpath`` + ``startswith`` (CodeQL-recognised + sanitizer) to prevent path traversal. + + Raises: + ValueError: If the resulting path escapes the prefix. + """ + import re + + safe_id = re.sub(r"[^A-Za-z0-9-]", "", session_id) + if not safe_id: + safe_id = "default" + path = os.path.normpath(f"{WORKSPACE_PREFIX}{safe_id}") + if not path.startswith(WORKSPACE_PREFIX): + raise ValueError(f"Session path escaped prefix: {path}") + return path + + +def get_workspace_dir(session_id: str) -> str: + """Get or create the workspace directory for a session. + + Uses :func:`make_session_path` — the same path the SDK uses — so that + bash_exec shares the workspace with the SDK file tools. 
+ """ + workspace = make_session_path(session_id) + os.makedirs(workspace, exist_ok=True) + return workspace + + +# --------------------------------------------------------------------------- +# Bubblewrap command builder +# --------------------------------------------------------------------------- + +# System directories mounted read-only inside the sandbox. +# ONLY these are visible — /app, /root, /home, /opt, /var etc. are NOT accessible. +_SYSTEM_RO_BINDS = [ + "/usr", # binaries, libraries, Python interpreter + "/etc", # system config: ld.so, locale, passwd, alternatives +] + +# Compat paths: symlinks to /usr/* on modern Debian, real dirs on older systems. +# On Debian 13 these are symlinks (e.g. /bin -> usr/bin). bwrap --ro-bind +# can't create a symlink target, so we detect and use --symlink instead. +# /lib64 is critical: the ELF dynamic linker lives at /lib64/ld-linux-x86-64.so.2. +_COMPAT_PATHS = [ + ("/bin", "usr/bin"), # -> /usr/bin on Debian 13 + ("/sbin", "usr/sbin"), # -> /usr/sbin on Debian 13 + ("/lib", "usr/lib"), # -> /usr/lib on Debian 13 + ("/lib64", "usr/lib64"), # 64-bit libraries / ELF interpreter +] + +# Resource limits to prevent fork bombs, memory exhaustion, and disk abuse. +# Applied via ulimit inside the sandbox before exec'ing the user command. +_RESOURCE_LIMITS = ( + "ulimit -u 64" # max 64 processes (prevents fork bombs) + " -v 524288" # 512 MB virtual memory + " -f 51200" # 50 MB max file size (1024-byte blocks) + " -n 256" # 256 open file descriptors + " 2>/dev/null" +) + + +def _build_bwrap_command( + command: list[str], cwd: str, env: dict[str, str] +) -> list[str]: + """Build a bubblewrap command with strict filesystem + network isolation. + + Security model: + - **Whitelist-only filesystem**: only system directories (``/usr``, ``/etc``, + ``/bin``, ``/lib``) are mounted read-only. Application code (``/app``), + home directories, ``/var``, ``/opt``, etc. are NOT accessible at all. 
+ - **Writable workspace only**: the per-session workspace is the sole + writable path. + - **Clean environment**: ``--clearenv`` wipes all inherited env vars. + Only the explicitly-passed safe env vars are set inside the sandbox. + - **Network isolation**: ``--unshare-net`` blocks all network access. + - **Resource limits**: ulimit caps on processes (64), memory (512MB), + file size (50MB), and open FDs (256) to prevent fork bombs and abuse. + - **New session**: prevents terminal control escape. + - **Die with parent**: prevents orphaned sandbox processes. + """ + cmd = [ + "bwrap", + # Create a new user namespace so bwrap can set up sandboxing + # inside unprivileged Docker containers (no CAP_SYS_ADMIN needed). + "--unshare-user", + # Wipe all inherited environment variables (API keys, secrets, etc.) + "--clearenv", + ] + + # Set only the safe env vars inside the sandbox + for key, value in env.items(): + cmd.extend(["--setenv", key, value]) + + # System directories: read-only + for path in _SYSTEM_RO_BINDS: + cmd.extend(["--ro-bind", path, path]) + + # Compat paths: use --symlink when host path is a symlink (Debian 13), + # --ro-bind when it's a real directory (older distros). + for path, symlink_target in _COMPAT_PATHS: + if os.path.islink(path): + cmd.extend(["--symlink", symlink_target, path]) + elif os.path.exists(path): + cmd.extend(["--ro-bind", path, path]) + + # Wrap the user command with resource limits: + # sh -c 'ulimit ...; exec "$@"' -- + # `exec "$@"` replaces the shell so there's no extra process overhead, + # and properly handles arguments with spaces. + limited_command = [ + "sh", + "-c", + f'{_RESOURCE_LIMITS}; exec "$@"', + "--", + *command, + ] + + cmd.extend( + [ + # Fresh virtual filesystems + "--dev", + "/dev", + "--proc", + "/proc", + "--tmpfs", + "/tmp", + # Workspace bind AFTER --tmpfs /tmp so it's visible through the tmpfs. 
+ # (workspace lives under /tmp/copilot-) + "--bind", + cwd, + cwd, + # Isolation + "--unshare-net", + "--die-with-parent", + "--new-session", + "--chdir", + cwd, + "--", + *limited_command, + ] + ) + + return cmd + + +# --------------------------------------------------------------------------- +# Public API +# --------------------------------------------------------------------------- + + +async def run_sandboxed( + command: list[str], + cwd: str, + timeout: int = _DEFAULT_TIMEOUT, + env: dict[str, str] | None = None, +) -> tuple[str, str, int, bool]: + """Run a command inside a bubblewrap sandbox. + + Callers **must** check :func:`has_full_sandbox` before calling this + function. If bubblewrap is not available, this function raises + :class:`RuntimeError` rather than running unsandboxed. + + Returns: + (stdout, stderr, exit_code, timed_out) + """ + if not has_full_sandbox(): + raise RuntimeError( + "run_sandboxed() requires bubblewrap but bwrap is not available. " + "Callers must check has_full_sandbox() before calling this function." 
+ ) + + timeout = min(max(timeout, 1), _MAX_TIMEOUT) + + safe_env = { + "PATH": "/usr/local/bin:/usr/bin:/bin", + "HOME": cwd, + "TMPDIR": cwd, + "LANG": "en_US.UTF-8", + "PYTHONDONTWRITEBYTECODE": "1", + "PYTHONIOENCODING": "utf-8", + } + if env: + safe_env.update(env) + + full_command = _build_bwrap_command(command, cwd, safe_env) + + try: + proc = await asyncio.create_subprocess_exec( + *full_command, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + cwd=cwd, + env=safe_env, + ) + + try: + stdout_bytes, stderr_bytes = await asyncio.wait_for( + proc.communicate(), timeout=timeout + ) + stdout = stdout_bytes.decode("utf-8", errors="replace") + stderr = stderr_bytes.decode("utf-8", errors="replace") + return stdout, stderr, proc.returncode or 0, False + except asyncio.TimeoutError: + proc.kill() + await proc.communicate() + return "", f"Execution timed out after {timeout}s", -1, True + + except RuntimeError: + raise + except Exception as e: + return "", f"Sandbox error: {e}", -1, False diff --git a/autogpt_platform/backend/backend/api/features/chat/tools/web_fetch.py b/autogpt_platform/backend/backend/api/features/chat/tools/web_fetch.py new file mode 100644 index 0000000000..fed7cc11fa --- /dev/null +++ b/autogpt_platform/backend/backend/api/features/chat/tools/web_fetch.py @@ -0,0 +1,151 @@ +"""Web fetch tool — safely retrieve public web page content.""" + +import logging +from typing import Any + +import aiohttp +import html2text + +from backend.api.features.chat.model import ChatSession +from backend.api.features.chat.tools.base import BaseTool +from backend.api.features.chat.tools.models import ( + ErrorResponse, + ToolResponseBase, + WebFetchResponse, +) +from backend.util.request import Requests + +logger = logging.getLogger(__name__) + +# Limits +_MAX_CONTENT_BYTES = 102_400 # 100 KB download cap +_REQUEST_TIMEOUT = aiohttp.ClientTimeout(total=15) + +# Content types we'll read as text +_TEXT_CONTENT_TYPES = { + "text/html", + 
"text/plain", + "text/xml", + "text/csv", + "text/markdown", + "application/json", + "application/xml", + "application/xhtml+xml", + "application/rss+xml", + "application/atom+xml", +} + + +def _is_text_content(content_type: str) -> bool: + base = content_type.split(";")[0].strip().lower() + return base in _TEXT_CONTENT_TYPES or base.startswith("text/") + + +def _html_to_text(html: str) -> str: + h = html2text.HTML2Text() + h.ignore_links = False + h.ignore_images = True + h.body_width = 0 + return h.handle(html) + + +class WebFetchTool(BaseTool): + """Safely fetch content from a public URL using SSRF-protected HTTP.""" + + @property + def name(self) -> str: + return "web_fetch" + + @property + def description(self) -> str: + return ( + "Fetch the content of a public web page by URL. " + "Returns readable text extracted from HTML by default. " + "Useful for reading documentation, articles, and API responses. " + "Only supports HTTP/HTTPS GET requests to public URLs " + "(private/internal network addresses are blocked)." + ) + + @property + def parameters(self) -> dict[str, Any]: + return { + "type": "object", + "properties": { + "url": { + "type": "string", + "description": "The public HTTP/HTTPS URL to fetch.", + }, + "extract_text": { + "type": "boolean", + "description": ( + "If true (default), extract readable text from HTML. " + "If false, return raw content." 
+ ), + "default": True, + }, + }, + "required": ["url"], + } + + @property + def requires_auth(self) -> bool: + return False + + async def _execute( + self, + user_id: str | None, + session: ChatSession, + **kwargs: Any, + ) -> ToolResponseBase: + url: str = (kwargs.get("url") or "").strip() + extract_text: bool = kwargs.get("extract_text", True) + session_id = session.session_id if session else None + + if not url: + return ErrorResponse( + message="Please provide a URL to fetch.", + error="missing_url", + session_id=session_id, + ) + + try: + client = Requests(raise_for_status=False, retry_max_attempts=1) + response = await client.get(url, timeout=_REQUEST_TIMEOUT) + except ValueError as e: + # validate_url raises ValueError for SSRF / blocked IPs + return ErrorResponse( + message=f"URL blocked: {e}", + error="url_blocked", + session_id=session_id, + ) + except Exception as e: + logger.warning(f"[web_fetch] Request failed for {url}: {e}") + return ErrorResponse( + message=f"Failed to fetch URL: {e}", + error="fetch_failed", + session_id=session_id, + ) + + content_type = response.headers.get("content-type", "") + if not _is_text_content(content_type): + return ErrorResponse( + message=f"Non-text content type: {content_type.split(';')[0]}", + error="unsupported_content_type", + session_id=session_id, + ) + + raw = response.content[:_MAX_CONTENT_BYTES] + text = raw.decode("utf-8", errors="replace") + + if extract_text and "html" in content_type.lower(): + text = _html_to_text(text) + + return WebFetchResponse( + message=f"Fetched {url}", + url=response.url, + status_code=response.status, + content_type=content_type.split(";")[0].strip(), + content=text, + truncated=False, + session_id=session_id, + ) diff --git a/autogpt_platform/backend/backend/api/features/chat/tools/workspace_files.py b/autogpt_platform/backend/backend/api/features/chat/tools/workspace_files.py index 03532c8fee..f37d2c80e0 100644 --- 
a/autogpt_platform/backend/backend/api/features/chat/tools/workspace_files.py +++ b/autogpt_platform/backend/backend/api/features/chat/tools/workspace_files.py @@ -88,7 +88,9 @@ class ListWorkspaceFilesTool(BaseTool): @property def description(self) -> str: return ( - "List files in the user's workspace. " + "List files in the user's persistent workspace (cloud storage). " + "These files survive across sessions. " + "For ephemeral session files, use the SDK Read/Glob tools instead. " "Returns file names, paths, sizes, and metadata. " "Optionally filter by path prefix." ) @@ -204,7 +206,9 @@ class ReadWorkspaceFileTool(BaseTool): @property def description(self) -> str: return ( - "Read a file from the user's workspace. " + "Read a file from the user's persistent workspace (cloud storage). " + "These files survive across sessions. " + "For ephemeral session files, use the SDK Read tool instead. " "Specify either file_id or path to identify the file. " "For small text files, returns content directly. " "For large or binary files, returns metadata and a download URL. " @@ -378,7 +382,9 @@ class WriteWorkspaceFileTool(BaseTool): @property def description(self) -> str: return ( - "Write or create a file in the user's workspace. " + "Write or create a file in the user's persistent workspace (cloud storage). " + "These files survive across sessions. " + "For ephemeral session files, use the SDK Write tool instead. " "Provide the content as a base64-encoded string. " f"Maximum file size is {Config().max_file_size_mb}MB. " "Files are saved to the current session's folder by default. " @@ -523,7 +529,7 @@ class DeleteWorkspaceFileTool(BaseTool): @property def description(self) -> str: return ( - "Delete a file from the user's workspace. " + "Delete a file from the user's persistent workspace (cloud storage). " "Specify either file_id or path to identify the file. " "Paths are scoped to the current session by default. " "Use /sessions//... for cross-session access." 
diff --git a/autogpt_platform/backend/backend/util/feature_flag.py b/autogpt_platform/backend/backend/util/feature_flag.py index fbd3573112..4eadc41333 100644 --- a/autogpt_platform/backend/backend/util/feature_flag.py +++ b/autogpt_platform/backend/backend/util/feature_flag.py @@ -38,6 +38,7 @@ class Flag(str, Enum): AGENT_ACTIVITY = "agent-activity" ENABLE_PLATFORM_PAYMENT = "enable-platform-payment" CHAT = "chat" + COPILOT_SDK = "copilot-sdk" def is_configured() -> bool: diff --git a/autogpt_platform/backend/poetry.lock b/autogpt_platform/backend/poetry.lock index d71cca7865..8062457a70 100644 --- a/autogpt_platform/backend/poetry.lock +++ b/autogpt_platform/backend/poetry.lock @@ -897,6 +897,29 @@ files = [ {file = "charset_normalizer-3.4.4.tar.gz", hash = "sha256:94537985111c35f28720e43603b8e7b43a6ecfb2ce1d3058bbe955b73404e21a"}, ] +[[package]] +name = "claude-agent-sdk" +version = "0.1.35" +description = "Python SDK for Claude Code" +optional = false +python-versions = ">=3.10" +groups = ["main"] +files = [ + {file = "claude_agent_sdk-0.1.35-py3-none-macosx_11_0_arm64.whl", hash = "sha256:df67f4deade77b16a9678b3a626c176498e40417f33b04beda9628287f375591"}, + {file = "claude_agent_sdk-0.1.35-py3-none-manylinux_2_17_aarch64.whl", hash = "sha256:14963944f55ded7c8ed518feebfa5b4284aa6dd8d81aeff2e5b21a962ce65097"}, + {file = "claude_agent_sdk-0.1.35-py3-none-manylinux_2_17_x86_64.whl", hash = "sha256:84344dcc535d179c1fc8a11c6f34c37c3b583447bdf09d869effb26514fd7a65"}, + {file = "claude_agent_sdk-0.1.35-py3-none-win_amd64.whl", hash = "sha256:1b3d54b47448c93f6f372acd4d1757f047c3c1e8ef5804be7a1e3e53e2c79a5f"}, + {file = "claude_agent_sdk-0.1.35.tar.gz", hash = "sha256:0f98e2b3c71ca85abfc042e7a35c648df88e87fda41c52e6779ef7b038dcbb52"}, +] + +[package.dependencies] +anyio = ">=4.0.0" +mcp = ">=0.1.0" +typing-extensions = {version = ">=4.0.0", markers = "python_version < \"3.11\""} + +[package.extras] +dev = ["anyio[trio] (>=4.0.0)", "mypy (>=1.0.0)", "pytest (>=7.0.0)", 
"pytest-asyncio (>=0.20.0)", "pytest-cov (>=4.0.0)", "ruff (>=0.1.0)"] + [[package]] name = "cleo" version = "2.1.0" @@ -2593,6 +2616,18 @@ http2 = ["h2 (>=3,<5)"] socks = ["socksio (==1.*)"] zstd = ["zstandard (>=0.18.0)"] +[[package]] +name = "httpx-sse" +version = "0.4.3" +description = "Consume Server-Sent Event (SSE) messages with HTTPX." +optional = false +python-versions = ">=3.9" +groups = ["main"] +files = [ + {file = "httpx_sse-0.4.3-py3-none-any.whl", hash = "sha256:0ac1c9fe3c0afad2e0ebb25a934a59f4c7823b60792691f779fad2c5568830fc"}, + {file = "httpx_sse-0.4.3.tar.gz", hash = "sha256:9b1ed0127459a66014aec3c56bebd93da3c1bc8bb6618c8082039a44889a755d"}, +] + [[package]] name = "huggingface-hub" version = "1.4.1" @@ -3310,6 +3345,39 @@ files = [ {file = "mccabe-0.7.0.tar.gz", hash = "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325"}, ] +[[package]] +name = "mcp" +version = "1.26.0" +description = "Model Context Protocol SDK" +optional = false +python-versions = ">=3.10" +groups = ["main"] +files = [ + {file = "mcp-1.26.0-py3-none-any.whl", hash = "sha256:904a21c33c25aa98ddbeb47273033c435e595bbacfdb177f4bd87f6dceebe1ca"}, + {file = "mcp-1.26.0.tar.gz", hash = "sha256:db6e2ef491eecc1a0d93711a76f28dec2e05999f93afd48795da1c1137142c66"}, +] + +[package.dependencies] +anyio = ">=4.5" +httpx = ">=0.27.1" +httpx-sse = ">=0.4" +jsonschema = ">=4.20.0" +pydantic = ">=2.11.0,<3.0.0" +pydantic-settings = ">=2.5.2" +pyjwt = {version = ">=2.10.1", extras = ["crypto"]} +python-multipart = ">=0.0.9" +pywin32 = {version = ">=310", markers = "sys_platform == \"win32\""} +sse-starlette = ">=1.6.1" +starlette = ">=0.27" +typing-extensions = ">=4.9.0" +typing-inspection = ">=0.4.1" +uvicorn = {version = ">=0.31.1", markers = "sys_platform != \"emscripten\""} + +[package.extras] +cli = ["python-dotenv (>=1.0.0)", "typer (>=0.16.0)"] +rich = ["rich (>=13.9.4)"] +ws = ["websockets (>=15.0.1)"] + [[package]] name = "mdurl" version = "0.1.2" @@ -5994,7 +6062,7 
@@ description = "Python for Window Extensions" optional = false python-versions = "*" groups = ["main"] -markers = "platform_system == \"Windows\"" +markers = "sys_platform == \"win32\" or platform_system == \"Windows\"" files = [ {file = "pywin32-311-cp310-cp310-win32.whl", hash = "sha256:d03ff496d2a0cd4a5893504789d4a15399133fe82517455e78bad62efbb7f0a3"}, {file = "pywin32-311-cp310-cp310-win_amd64.whl", hash = "sha256:797c2772017851984b97180b0bebe4b620bb86328e8a884bb626156295a63b3b"}, @@ -6974,6 +7042,28 @@ postgresql-psycopgbinary = ["psycopg[binary] (>=3.0.7)"] pymysql = ["pymysql"] sqlcipher = ["sqlcipher3_binary"] +[[package]] +name = "sse-starlette" +version = "3.2.0" +description = "SSE plugin for Starlette" +optional = false +python-versions = ">=3.9" +groups = ["main"] +files = [ + {file = "sse_starlette-3.2.0-py3-none-any.whl", hash = "sha256:5876954bd51920fc2cd51baee47a080eb88a37b5b784e615abb0b283f801cdbf"}, + {file = "sse_starlette-3.2.0.tar.gz", hash = "sha256:8127594edfb51abe44eac9c49e59b0b01f1039d0c7461c6fd91d4e03b70da422"}, +] + +[package.dependencies] +anyio = ">=4.7.0" +starlette = ">=0.49.1" + +[package.extras] +daphne = ["daphne (>=4.2.0)"] +examples = ["aiosqlite (>=0.21.0)", "fastapi (>=0.115.12)", "sqlalchemy[asyncio] (>=2.0.41)", "uvicorn (>=0.34.0)"] +granian = ["granian (>=2.3.1)"] +uvicorn = ["uvicorn (>=0.34.0)"] + [[package]] name = "stagehand" version = "0.5.9" @@ -8440,4 +8530,4 @@ cffi = ["cffi (>=1.17,<2.0) ; platform_python_implementation != \"PyPy\" and pyt [metadata] lock-version = "2.1" python-versions = ">=3.10,<3.14" -content-hash = "fa9c5deadf593e815dd2190f58e22152373900603f5f244b9616cd721de84d2f" +content-hash = "55e095de555482f0fe47de7695f390fe93e7bcf739b31c391b2e5e3c3d938ae3" diff --git a/autogpt_platform/backend/pyproject.toml b/autogpt_platform/backend/pyproject.toml index 32dfc547bc..7a112e75ca 100644 --- a/autogpt_platform/backend/pyproject.toml +++ b/autogpt_platform/backend/pyproject.toml @@ -16,6 +16,7 @@ anthropic 
= "^0.79.0" apscheduler = "^3.11.1" autogpt-libs = { path = "../autogpt_libs", develop = true } bleach = { extras = ["css"], version = "^6.2.0" } +claude-agent-sdk = "^0.1.0" click = "^8.2.0" cryptography = "^46.0" discord-py = "^2.5.2" diff --git a/autogpt_platform/backend/test/chat/__init__.py b/autogpt_platform/backend/test/chat/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/autogpt_platform/backend/test/chat/test_security_hooks.py b/autogpt_platform/backend/test/chat/test_security_hooks.py new file mode 100644 index 0000000000..f10a90871b --- /dev/null +++ b/autogpt_platform/backend/test/chat/test_security_hooks.py @@ -0,0 +1,133 @@ +"""Tests for SDK security hooks — workspace paths, tool access, and deny messages. + +These are pure unit tests with no external dependencies (no SDK, no DB, no server). +They validate that the security hooks correctly block unauthorized paths, +tool access, and dangerous input patterns. + +Note: Bash command validation was removed — the SDK built-in Bash tool is not in +allowed_tools, and the bash_exec MCP tool has kernel-level network isolation +(unshare --net) making command-level parsing unnecessary. +""" + +from backend.api.features.chat.sdk.security_hooks import ( + _validate_tool_access, + _validate_workspace_path, +) + +SDK_CWD = "/tmp/copilot-test-session" + + +def _is_denied(result: dict) -> bool: + hook = result.get("hookSpecificOutput", {}) + return hook.get("permissionDecision") == "deny" + + +def _reason(result: dict) -> str: + return result.get("hookSpecificOutput", {}).get("permissionDecisionReason", "") + + +# ============================================================ +# Workspace path validation (Read, Write, Edit, etc.) 
+# ============================================================ + + +class TestWorkspacePathValidation: + def test_path_in_workspace(self): + result = _validate_workspace_path( + "Read", {"file_path": f"{SDK_CWD}/file.txt"}, SDK_CWD + ) + assert not _is_denied(result) + + def test_path_outside_workspace(self): + result = _validate_workspace_path("Read", {"file_path": "/etc/passwd"}, SDK_CWD) + assert _is_denied(result) + + def test_tool_results_allowed(self): + result = _validate_workspace_path( + "Read", + {"file_path": "~/.claude/projects/abc/tool-results/out.txt"}, + SDK_CWD, + ) + assert not _is_denied(result) + + def test_claude_settings_blocked(self): + result = _validate_workspace_path( + "Read", {"file_path": "~/.claude/settings.json"}, SDK_CWD + ) + assert _is_denied(result) + + def test_claude_projects_without_tool_results(self): + result = _validate_workspace_path( + "Read", {"file_path": "~/.claude/projects/abc/credentials.json"}, SDK_CWD + ) + assert _is_denied(result) + + def test_no_path_allowed(self): + """Glob/Grep without path defaults to cwd — should be allowed.""" + result = _validate_workspace_path("Grep", {"pattern": "foo"}, SDK_CWD) + assert not _is_denied(result) + + def test_path_traversal_with_dotdot(self): + result = _validate_workspace_path( + "Read", {"file_path": f"{SDK_CWD}/../../../etc/passwd"}, SDK_CWD + ) + assert _is_denied(result) + + +# ============================================================ +# Tool access validation +# ============================================================ + + +class TestToolAccessValidation: + def test_blocked_tools(self): + for tool in ("bash", "shell", "exec", "terminal", "command"): + result = _validate_tool_access(tool, {}) + assert _is_denied(result), f"Tool '{tool}' should be blocked" + + def test_bash_builtin_blocked(self): + """SDK built-in Bash (capital) is blocked as defence-in-depth.""" + result = _validate_tool_access("Bash", {"command": "echo hello"}, SDK_CWD) + assert 
_is_denied(result) + assert "Bash" in _reason(result) + + def test_workspace_tools_delegate(self): + result = _validate_tool_access( + "Read", {"file_path": f"{SDK_CWD}/file.txt"}, SDK_CWD + ) + assert not _is_denied(result) + + def test_dangerous_pattern_blocked(self): + result = _validate_tool_access("SomeUnknownTool", {"data": "sudo rm -rf /"}) + assert _is_denied(result) + + def test_safe_unknown_tool_allowed(self): + result = _validate_tool_access("SomeSafeTool", {"data": "hello world"}) + assert not _is_denied(result) + + +# ============================================================ +# Deny message quality (ntindle feedback) +# ============================================================ + + +class TestDenyMessageClarity: + """Deny messages must include [SECURITY] and 'cannot be bypassed' + so the model knows the restriction is enforced, not a suggestion.""" + + def test_blocked_tool_message(self): + reason = _reason(_validate_tool_access("bash", {})) + assert "[SECURITY]" in reason + assert "cannot be bypassed" in reason + + def test_bash_builtin_blocked_message(self): + reason = _reason(_validate_tool_access("Bash", {"command": "echo hello"})) + assert "[SECURITY]" in reason + assert "cannot be bypassed" in reason + + def test_workspace_path_message(self): + reason = _reason( + _validate_workspace_path("Read", {"file_path": "/etc/passwd"}, SDK_CWD) + ) + assert "[SECURITY]" in reason + assert "cannot be bypassed" in reason diff --git a/autogpt_platform/backend/test/chat/test_transcript.py b/autogpt_platform/backend/test/chat/test_transcript.py new file mode 100644 index 0000000000..71b1fad81f --- /dev/null +++ b/autogpt_platform/backend/test/chat/test_transcript.py @@ -0,0 +1,255 @@ +"""Unit tests for JSONL transcript management utilities.""" + +import json +import os + +from backend.api.features.chat.sdk.transcript import ( + STRIPPABLE_TYPES, + read_transcript_file, + strip_progress_entries, + validate_transcript, + write_transcript_to_tempfile, +) + + 
+def _make_jsonl(*entries: dict) -> str: + return "\n".join(json.dumps(e) for e in entries) + "\n" + + +# --- Fixtures --- + + +METADATA_LINE = {"type": "queue-operation", "subtype": "create"} +FILE_HISTORY = {"type": "file-history-snapshot", "files": []} +USER_MSG = {"type": "user", "uuid": "u1", "message": {"role": "user", "content": "hi"}} +ASST_MSG = { + "type": "assistant", + "uuid": "a1", + "parentUuid": "u1", + "message": {"role": "assistant", "content": "hello"}, +} +PROGRESS_ENTRY = { + "type": "progress", + "uuid": "p1", + "parentUuid": "u1", + "data": {"type": "bash_progress", "stdout": "running..."}, +} + +VALID_TRANSCRIPT = _make_jsonl(METADATA_LINE, FILE_HISTORY, USER_MSG, ASST_MSG) + + +# --- read_transcript_file --- + + +class TestReadTranscriptFile: + def test_returns_content_for_valid_file(self, tmp_path): + path = tmp_path / "session.jsonl" + path.write_text(VALID_TRANSCRIPT) + result = read_transcript_file(str(path)) + assert result is not None + assert "user" in result + + def test_returns_none_for_missing_file(self): + assert read_transcript_file("/nonexistent/path.jsonl") is None + + def test_returns_none_for_empty_path(self): + assert read_transcript_file("") is None + + def test_returns_none_for_empty_file(self, tmp_path): + path = tmp_path / "empty.jsonl" + path.write_text("") + assert read_transcript_file(str(path)) is None + + def test_returns_none_for_metadata_only(self, tmp_path): + content = _make_jsonl(METADATA_LINE, FILE_HISTORY) + path = tmp_path / "meta.jsonl" + path.write_text(content) + assert read_transcript_file(str(path)) is None + + def test_returns_none_for_invalid_json(self, tmp_path): + path = tmp_path / "bad.jsonl" + path.write_text("not json\n{}\n{}\n") + assert read_transcript_file(str(path)) is None + + def test_no_size_limit(self, tmp_path): + """Large files are accepted — bucket storage has no size limit.""" + big_content = {"type": "user", "uuid": "u9", "data": "x" * 1_000_000} + content = 
_make_jsonl(METADATA_LINE, FILE_HISTORY, big_content, ASST_MSG) + path = tmp_path / "big.jsonl" + path.write_text(content) + result = read_transcript_file(str(path)) + assert result is not None + + +# --- write_transcript_to_tempfile --- + + +class TestWriteTranscriptToTempfile: + """Tests use /tmp/copilot-* paths to satisfy the sandbox prefix check.""" + + def test_writes_file_and_returns_path(self): + cwd = "/tmp/copilot-test-write" + try: + result = write_transcript_to_tempfile( + VALID_TRANSCRIPT, "sess-1234-abcd", cwd + ) + assert result is not None + assert os.path.isfile(result) + assert result.endswith(".jsonl") + with open(result) as f: + assert f.read() == VALID_TRANSCRIPT + finally: + import shutil + + shutil.rmtree(cwd, ignore_errors=True) + + def test_creates_parent_directory(self): + cwd = "/tmp/copilot-test-mkdir" + try: + result = write_transcript_to_tempfile(VALID_TRANSCRIPT, "sess-1234", cwd) + assert result is not None + assert os.path.isdir(cwd) + finally: + import shutil + + shutil.rmtree(cwd, ignore_errors=True) + + def test_uses_session_id_prefix(self): + cwd = "/tmp/copilot-test-prefix" + try: + result = write_transcript_to_tempfile( + VALID_TRANSCRIPT, "abcdef12-rest", cwd + ) + assert result is not None + assert "abcdef12" in os.path.basename(result) + finally: + import shutil + + shutil.rmtree(cwd, ignore_errors=True) + + def test_rejects_cwd_outside_sandbox(self, tmp_path): + cwd = str(tmp_path / "not-copilot") + result = write_transcript_to_tempfile(VALID_TRANSCRIPT, "sess-1234", cwd) + assert result is None + + +# --- validate_transcript --- + + +class TestValidateTranscript: + def test_valid_transcript(self): + assert validate_transcript(VALID_TRANSCRIPT) is True + + def test_none_content(self): + assert validate_transcript(None) is False + + def test_empty_content(self): + assert validate_transcript("") is False + + def test_metadata_only(self): + content = _make_jsonl(METADATA_LINE, FILE_HISTORY) + assert 
validate_transcript(content) is False + + def test_user_only_no_assistant(self): + content = _make_jsonl(METADATA_LINE, FILE_HISTORY, USER_MSG) + assert validate_transcript(content) is False + + def test_assistant_only_no_user(self): + content = _make_jsonl(METADATA_LINE, FILE_HISTORY, ASST_MSG) + assert validate_transcript(content) is False + + def test_invalid_json_returns_false(self): + assert validate_transcript("not json\n{}\n{}\n") is False + + +# --- strip_progress_entries --- + + +class TestStripProgressEntries: + def test_strips_all_strippable_types(self): + """All STRIPPABLE_TYPES are removed from the output.""" + entries = [ + USER_MSG, + {"type": "progress", "uuid": "p1", "parentUuid": "u1"}, + {"type": "file-history-snapshot", "files": []}, + {"type": "queue-operation", "subtype": "create"}, + {"type": "summary", "text": "..."}, + {"type": "pr-link", "url": "..."}, + ASST_MSG, + ] + result = strip_progress_entries(_make_jsonl(*entries)) + result_types = {json.loads(line)["type"] for line in result.strip().split("\n")} + assert result_types == {"user", "assistant"} + for stype in STRIPPABLE_TYPES: + assert stype not in result_types + + def test_reparents_children_of_stripped_entries(self): + """An assistant message whose parent is a progress entry gets reparented.""" + progress = { + "type": "progress", + "uuid": "p1", + "parentUuid": "u1", + "data": {"type": "bash_progress"}, + } + asst = { + "type": "assistant", + "uuid": "a1", + "parentUuid": "p1", # Points to progress + "message": {"role": "assistant", "content": "done"}, + } + content = _make_jsonl(USER_MSG, progress, asst) + result = strip_progress_entries(content) + lines = [json.loads(line) for line in result.strip().split("\n")] + + asst_entry = next(e for e in lines if e["type"] == "assistant") + # Should be reparented to u1 (the user message) + assert asst_entry["parentUuid"] == "u1" + + def test_reparents_through_chain(self): + """Reparenting walks through multiple stripped entries.""" + p1 
= {"type": "progress", "uuid": "p1", "parentUuid": "u1"} + p2 = {"type": "progress", "uuid": "p2", "parentUuid": "p1"} + p3 = {"type": "progress", "uuid": "p3", "parentUuid": "p2"} + asst = { + "type": "assistant", + "uuid": "a1", + "parentUuid": "p3", # 3 levels deep + "message": {"role": "assistant", "content": "done"}, + } + content = _make_jsonl(USER_MSG, p1, p2, p3, asst) + result = strip_progress_entries(content) + lines = [json.loads(line) for line in result.strip().split("\n")] + + asst_entry = next(e for e in lines if e["type"] == "assistant") + assert asst_entry["parentUuid"] == "u1" + + def test_preserves_non_strippable_entries(self): + """User, assistant, and system entries are preserved.""" + system = {"type": "system", "uuid": "s1", "message": "prompt"} + content = _make_jsonl(system, USER_MSG, ASST_MSG) + result = strip_progress_entries(content) + result_types = [json.loads(line)["type"] for line in result.strip().split("\n")] + assert result_types == ["system", "user", "assistant"] + + def test_empty_input(self): + result = strip_progress_entries("") + # Should return just a newline (empty content stripped) + assert result.strip() == "" + + def test_no_strippable_entries(self): + """When there's nothing to strip, output matches input structure.""" + content = _make_jsonl(USER_MSG, ASST_MSG) + result = strip_progress_entries(content) + result_lines = result.strip().split("\n") + assert len(result_lines) == 2 + + def test_handles_entries_without_uuid(self): + """Entries without uuid field are handled gracefully.""" + no_uuid = {"type": "queue-operation", "subtype": "create"} + content = _make_jsonl(no_uuid, USER_MSG, ASST_MSG) + result = strip_progress_entries(content) + result_types = [json.loads(line)["type"] for line in result.strip().split("\n")] + # queue-operation is strippable + assert "queue-operation" not in result_types + assert "user" in result_types + assert "assistant" in result_types diff --git 
a/autogpt_platform/frontend/src/app/(platform)/copilot/components/ChatMessagesContainer/ChatMessagesContainer.tsx b/autogpt_platform/frontend/src/app/(platform)/copilot/components/ChatMessagesContainer/ChatMessagesContainer.tsx index b62e96f58a..c118057963 100644 --- a/autogpt_platform/frontend/src/app/(platform)/copilot/components/ChatMessagesContainer/ChatMessagesContainer.tsx +++ b/autogpt_platform/frontend/src/app/(platform)/copilot/components/ChatMessagesContainer/ChatMessagesContainer.tsx @@ -24,6 +24,7 @@ import { FindBlocksTool } from "../../tools/FindBlocks/FindBlocks"; import { RunAgentTool } from "../../tools/RunAgent/RunAgent"; import { RunBlockTool } from "../../tools/RunBlock/RunBlock"; import { SearchDocsTool } from "../../tools/SearchDocs/SearchDocs"; +import { GenericTool } from "../../tools/GenericTool/GenericTool"; import { ViewAgentOutputTool } from "../../tools/ViewAgentOutput/ViewAgentOutput"; // --------------------------------------------------------------------------- @@ -273,6 +274,16 @@ export const ChatMessagesContainer = ({ /> ); default: + // Render a generic tool indicator for SDK built-in + // tools (Read, Glob, Grep, etc.) 
or any unrecognized tool + if (part.type.startsWith("tool-")) { + return ( + + ); + } return null; } })} diff --git a/autogpt_platform/frontend/src/app/(platform)/copilot/tools/GenericTool/GenericTool.tsx b/autogpt_platform/frontend/src/app/(platform)/copilot/tools/GenericTool/GenericTool.tsx new file mode 100644 index 0000000000..677f1d01d1 --- /dev/null +++ b/autogpt_platform/frontend/src/app/(platform)/copilot/tools/GenericTool/GenericTool.tsx @@ -0,0 +1,63 @@ +"use client"; + +import { ToolUIPart } from "ai"; +import { GearIcon } from "@phosphor-icons/react"; +import { MorphingTextAnimation } from "../../components/MorphingTextAnimation/MorphingTextAnimation"; + +interface Props { + part: ToolUIPart; +} + +function extractToolName(part: ToolUIPart): string { + // ToolUIPart.type is "tool-{name}", extract the name portion. + return part.type.replace(/^tool-/, ""); +} + +function formatToolName(name: string): string { + // "search_docs" → "Search docs", "Read" → "Read" + return name.replace(/_/g, " ").replace(/^\w/, (c) => c.toUpperCase()); +} + +function getAnimationText(part: ToolUIPart): string { + const label = formatToolName(extractToolName(part)); + + switch (part.state) { + case "input-streaming": + case "input-available": + return `Running ${label}…`; + case "output-available": + return `${label} completed`; + case "output-error": + return `${label} failed`; + default: + return `Running ${label}…`; + } +} + +export function GenericTool({ part }: Props) { + const isStreaming = + part.state === "input-streaming" || part.state === "input-available"; + const isError = part.state === "output-error"; + + return ( +
+
+ + +
+
+ ); +} diff --git a/autogpt_platform/frontend/src/app/api/openapi.json b/autogpt_platform/frontend/src/app/api/openapi.json index 1e8dca865c..8e48931540 100644 --- a/autogpt_platform/frontend/src/app/api/openapi.json +++ b/autogpt_platform/frontend/src/app/api/openapi.json @@ -7066,13 +7066,57 @@ "properties": { "id": { "type": "string", "title": "Id" }, "name": { "type": "string", "title": "Name" }, - "description": { "type": "string", "title": "Description" } + "description": { "type": "string", "title": "Description" }, + "categories": { + "items": { "type": "string" }, + "type": "array", + "title": "Categories" + }, + "input_schema": { + "additionalProperties": true, + "type": "object", + "title": "Input Schema", + "description": "Full JSON schema for block inputs" + }, + "output_schema": { + "additionalProperties": true, + "type": "object", + "title": "Output Schema", + "description": "Full JSON schema for block outputs" + }, + "required_inputs": { + "items": { "$ref": "#/components/schemas/BlockInputFieldInfo" }, + "type": "array", + "title": "Required Inputs", + "description": "List of input fields for this block" + } }, "type": "object", - "required": ["id", "name", "description"], + "required": ["id", "name", "description", "categories"], "title": "BlockInfoSummary", "description": "Summary of a block for search results." }, + "BlockInputFieldInfo": { + "properties": { + "name": { "type": "string", "title": "Name" }, + "type": { "type": "string", "title": "Type" }, + "description": { + "type": "string", + "title": "Description", + "default": "" + }, + "required": { + "type": "boolean", + "title": "Required", + "default": false + }, + "default": { "anyOf": [{}, { "type": "null" }], "title": "Default" } + }, + "type": "object", + "required": ["name", "type"], + "title": "BlockInputFieldInfo", + "description": "Information about a block input field." 
+ }, "BlockListResponse": { "properties": { "type": { @@ -7090,7 +7134,12 @@ "title": "Blocks" }, "count": { "type": "integer", "title": "Count" }, - "query": { "type": "string", "title": "Query" } + "query": { "type": "string", "title": "Query" }, + "usage_hint": { + "type": "string", + "title": "Usage Hint", + "default": "To execute a block, call run_block with block_id set to the block's 'id' field and input_data containing the fields listed in required_inputs." + } }, "type": "object", "required": ["message", "blocks", "count", "query"], @@ -10496,6 +10545,9 @@ "operation_pending", "operation_in_progress", "input_validation_error", + "web_fetch", + "bash_exec", + "operation_status", "feature_request_search", "feature_request_created" ],