From b044862dbabb245559661117bc63ba3dd88d8bd8 Mon Sep 17 00:00:00 2001 From: majdyz Date: Mon, 13 Apr 2026 00:12:16 +0000 Subject: [PATCH] perf(copilot): add thinking token cap and lower default budget/turns MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cost investigation showed 54% of spend is thinking tokens (~57K/turn avg at $75/M for Opus). Add max_thinking_tokens config (default 8192) to cap extended thinking output per LLM call. Also lower defaults: - max_budget_usd: $100 → $5 per turn - max_turns: 1000 → 50 tool-use loops These are configurable via env vars (CHAT_CLAUDE_AGENT_MAX_THINKING_TOKENS, CHAT_CLAUDE_AGENT_MAX_BUDGET_USD, CHAT_CLAUDE_AGENT_MAX_TURNS). --- autogpt_platform/backend/backend/copilot/config.py | 13 +++++++++++-- .../backend/backend/copilot/sdk/service.py | 4 ++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/autogpt_platform/backend/backend/copilot/config.py b/autogpt_platform/backend/backend/copilot/config.py index bd1acbce83..b580bda08b 100644 --- a/autogpt_platform/backend/backend/copilot/config.py +++ b/autogpt_platform/backend/backend/copilot/config.py @@ -152,19 +152,28 @@ class ChatConfig(BaseSettings): "overloaded). The SDK automatically retries with this cheaper model.", ) claude_agent_max_turns: int = Field( - default=1000, + default=50, ge=1, le=10000, description="Maximum number of agentic turns (tool-use loops) per query. " "Prevents runaway tool loops from burning budget.", ) claude_agent_max_budget_usd: float = Field( - default=100.0, + default=5.0, ge=0.01, le=1000.0, description="Maximum spend in USD per SDK query. The CLI aborts the " "request if this budget is exceeded.", ) + claude_agent_max_thinking_tokens: int = Field( + default=8192, + ge=1024, + le=128000, + description="Maximum thinking/reasoning tokens per LLM call. " + "Extended thinking on Opus can generate 50k+ tokens at $75/M — " + "capping this is the single biggest cost lever. " +
"8192 is sufficient for most tasks; increase for complex reasoning.", + ) claude_agent_max_transient_retries: int = Field( default=3, ge=0, diff --git a/autogpt_platform/backend/backend/copilot/sdk/service.py b/autogpt_platform/backend/backend/copilot/sdk/service.py index 35b87cd40c..5ee6bba8ca 100644 --- a/autogpt_platform/backend/backend/copilot/sdk/service.py +++ b/autogpt_platform/backend/backend/copilot/sdk/service.py @@ -2238,6 +2238,10 @@ async def stream_chat_completion_sdk( "max_turns": config.claude_agent_max_turns, # max_budget_usd: per-query spend ceiling enforced by the CLI. "max_budget_usd": config.claude_agent_max_budget_usd, + # max_thinking_tokens: cap extended thinking output per LLM call. + # Thinking tokens are billed at output rate ($75/M for Opus) and + # account for ~54% of total cost. 8192 is the default. + "max_thinking_tokens": config.claude_agent_max_thinking_tokens, } if sdk_model: sdk_options_kwargs["model"] = sdk_model