perf(copilot): add thinking token cap and lower default budget/turns

Cost investigation showed that 54% of spend is thinking tokens (~57K per turn on average, billed at $75/M for Opus). Add a max_thinking_tokens config (default 8192) to cap extended-thinking output per LLM call.

Also lower the defaults:
- max_budget_usd: $100 → $5 per query
- max_turns: 1000 → 50 tool-use loops per query

All three are configurable via env vars (CHAT_CLAUDE_AGENT_MAX_THINKING_TOKENS, CHAT_CLAUDE_AGENT_MAX_BUDGET_USD, CHAT_CLAUDE_AGENT_MAX_TURNS).
2026-04-30 03:00:41 -04:00 · 2026-04-13 00:12:16 +00:00
parent 099d5cf1b2
commit b044862dba
2 changed files with 15 additions and 2 deletions
--- a/autogpt_platform/backend/backend/copilot/config.py
+++ b/autogpt_platform/backend/backend/copilot/config.py
@@ -152,19 +152,28 @@ class ChatConfig(BaseSettings):
        "overloaded). The SDK automatically retries with this cheaper model.",
    )
    claude_agent_max_turns: int = Field(
-        default=1000,
+        default=50,
        ge=1,
        le=10000,
        description="Maximum number of agentic turns (tool-use loops) per query. "
        "Prevents runaway tool loops from burning budget.",
    )
    claude_agent_max_budget_usd: float = Field(
-        default=100.0,
+        default=5.0,
        ge=0.01,
        le=1000.0,
        description="Maximum spend in USD per SDK query. The CLI aborts the "
        "request if this budget is exceeded.",
    )
+    claude_agent_max_thinking_tokens: int = Field(
+        default=8192,
+        ge=1024,
+        le=128000,
+        description="Maximum thinking/reasoning tokens per LLM call. "
+        "Extended thinking on Opus can generate 50k+ tokens at $75/M — "
+        "capping this is the single biggest cost lever. "
+        "8192 is sufficient for most tasks; increase for complex reasoning.",
+    )
    claude_agent_max_transient_retries: int = Field(
        default=3,
        ge=0,
--- a/autogpt_platform/backend/backend/copilot/sdk/service.py
+++ b/autogpt_platform/backend/backend/copilot/sdk/service.py
@@ -2238,6 +2238,10 @@ async def stream_chat_completion_sdk(
            "max_turns": config.claude_agent_max_turns,
            # max_budget_usd: per-query spend ceiling enforced by the CLI.
            "max_budget_usd": config.claude_agent_max_budget_usd,
+            # max_thinking_tokens: cap extended thinking output per LLM call.
+            # Thinking tokens are billed at output rate ($75/M for Opus) and
+            # account for ~54% of total cost.  8192 is the default.
+            "max_thinking_tokens": config.claude_agent_max_thinking_tokens,
        }
        if sdk_model:
            sdk_options_kwargs["model"] = sdk_model