perf(copilot): add thinking token cap and lower default budget/turns

Cost investigation showed that 54% of spend is thinking tokens (averaging
~57K tokens per turn, billed at the Opus output rate of $75 per million
tokens). Add a max_thinking_tokens config (default 8192) to cap extended
thinking output per LLM call.

Also lower defaults:
- max_budget_usd: $100 → $5 per SDK query
- max_turns: 1000 → 50 agentic turns (tool-use loops) per query

These are configurable via env vars (CHAT_CLAUDE_AGENT_MAX_THINKING_TOKENS,
CHAT_CLAUDE_AGENT_MAX_BUDGET_USD, CHAT_CLAUDE_AGENT_MAX_TURNS).
This commit is contained in:
majdyz
2026-04-13 00:12:16 +00:00
parent 099d5cf1b2
commit b044862dba
2 changed files with 15 additions and 2 deletions

View File

@@ -152,19 +152,28 @@ class ChatConfig(BaseSettings):
"overloaded). The SDK automatically retries with this cheaper model.",
)
claude_agent_max_turns: int = Field(
default=1000,
default=50,
ge=1,
le=10000,
description="Maximum number of agentic turns (tool-use loops) per query. "
"Prevents runaway tool loops from burning budget.",
)
claude_agent_max_budget_usd: float = Field(
default=100.0,
default=5.0,
ge=0.01,
le=1000.0,
description="Maximum spend in USD per SDK query. The CLI aborts the "
"request if this budget is exceeded.",
)
claude_agent_max_thinking_tokens: int = Field(
default=8192,
ge=1024,
le=128000,
description="Maximum thinking/reasoning tokens per LLM call. "
"Extended thinking on Opus can generate 50k+ tokens at $75/M — "
"capping this is the single biggest cost lever. "
"8192 is sufficient for most tasks; increase for complex reasoning.",
)
claude_agent_max_transient_retries: int = Field(
default=3,
ge=0,

View File

@@ -2238,6 +2238,10 @@ async def stream_chat_completion_sdk(
"max_turns": config.claude_agent_max_turns,
# max_budget_usd: per-query spend ceiling enforced by the CLI.
"max_budget_usd": config.claude_agent_max_budget_usd,
# max_thinking_tokens: cap extended thinking output per LLM call.
# Thinking tokens are billed at output rate ($75/M for Opus) and
# account for ~54% of total cost. 8192 is the default.
"max_thinking_tokens": config.claude_agent_max_thinking_tokens,
}
if sdk_model:
sdk_options_kwargs["model"] = sdk_model