mirror of
https://github.com/Significant-Gravitas/AutoGPT.git
synced 2026-04-30 03:00:41 -04:00
perf(copilot): add thinking token cap and lower default budget/turns
A cost investigation showed that 54% of spend goes to thinking tokens (~57K tokens per turn on average, billed at $75/M for Opus). Add a `claude_agent_max_thinking_tokens` config field (default 8192) to cap extended-thinking output per LLM call.

Also lower two defaults:
- `claude_agent_max_budget_usd`: $100 → $5 per SDK query
- `claude_agent_max_turns`: 1000 → 50 tool-use loops per query

All three settings are configurable via environment variables (CHAT_CLAUDE_AGENT_MAX_THINKING_TOKENS, CHAT_CLAUDE_AGENT_MAX_BUDGET_USD, CHAT_CLAUDE_AGENT_MAX_TURNS).
This commit is contained in:
@@ -152,19 +152,28 @@ class ChatConfig(BaseSettings):
|
||||
"overloaded). The SDK automatically retries with this cheaper model.",
|
||||
)
|
||||
claude_agent_max_turns: int = Field(
|
||||
default=1000,
|
||||
default=50,
|
||||
ge=1,
|
||||
le=10000,
|
||||
description="Maximum number of agentic turns (tool-use loops) per query. "
|
||||
"Prevents runaway tool loops from burning budget.",
|
||||
)
|
||||
claude_agent_max_budget_usd: float = Field(
|
||||
default=100.0,
|
||||
default=5.0,
|
||||
ge=0.01,
|
||||
le=1000.0,
|
||||
description="Maximum spend in USD per SDK query. The CLI aborts the "
|
||||
"request if this budget is exceeded.",
|
||||
)
|
||||
claude_agent_max_thinking_tokens: int = Field(
|
||||
default=8192,
|
||||
ge=1024,
|
||||
le=128000,
|
||||
description="Maximum thinking/reasoning tokens per LLM call. "
|
||||
"Extended thinking on Opus can generate 50k+ tokens at $75/M — "
|
||||
"capping this is the single biggest cost lever. "
|
||||
"8192 is sufficient for most tasks; increase for complex reasoning.",
|
||||
)
|
||||
claude_agent_max_transient_retries: int = Field(
|
||||
default=3,
|
||||
ge=0,
|
||||
|
||||
@@ -2238,6 +2238,10 @@ async def stream_chat_completion_sdk(
|
||||
"max_turns": config.claude_agent_max_turns,
|
||||
# max_budget_usd: per-query spend ceiling enforced by the CLI.
|
||||
"max_budget_usd": config.claude_agent_max_budget_usd,
|
||||
# max_thinking_tokens: cap extended thinking output per LLM call.
|
||||
# Thinking tokens are billed at output rate ($75/M for Opus) and
|
||||
# account for ~54% of total cost. 8192 is the default.
|
||||
"max_thinking_tokens": config.claude_agent_max_thinking_tokens,
|
||||
}
|
||||
if sdk_model:
|
||||
sdk_options_kwargs["model"] = sdk_model
|
||||
|
||||
Reference in New Issue
Block a user