From b044862dbabb245559661117bc63ba3dd88d8bd8 Mon Sep 17 00:00:00 2001 From: majdyz Date: Mon, 13 Apr 2026 00:12:16 +0000 Subject: [PATCH] perf(copilot): add thinking token cap and lower default budget/turns MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cost investigation showed 54% of spend is thinking tokens (~57K/turn avg at $75/M for Opus). Add max_thinking_tokens config (default 8192) to cap extended thinking output per LLM call. Also lower defaults: - max_budget_usd: $100 → $5 per turn - max_turns: 1000 → 50 tool-use loops These are configurable via env vars (CHAT_CLAUDE_AGENT_MAX_THINKING_TOKENS, CHAT_CLAUDE_AGENT_MAX_BUDGET_USD, CHAT_CLAUDE_AGENT_MAX_TURNS). --- autogpt_platform/backend/backend/copilot/config.py | 13 +++++++++++-- .../backend/backend/copilot/sdk/service.py | 4 ++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/autogpt_platform/backend/backend/copilot/config.py b/autogpt_platform/backend/backend/copilot/config.py index bd1acbce83..b580bda08b 100644 --- a/autogpt_platform/backend/backend/copilot/config.py +++ b/autogpt_platform/backend/backend/copilot/config.py @@ -152,19 +152,28 @@ class ChatConfig(BaseSettings): "overloaded). The SDK automatically retries with this cheaper model.", ) claude_agent_max_turns: int = Field( - default=1000, + default=50, ge=1, le=10000, description="Maximum number of agentic turns (tool-use loops) per query. " "Prevents runaway tool loops from burning budget.", ) claude_agent_max_budget_usd: float = Field( - default=100.0, + default=5.0, ge=0.01, le=1000.0, description="Maximum spend in USD per SDK query. The CLI aborts the " "request if this budget is exceeded.", ) + claude_agent_max_thinking_tokens: int = Field( + default=8192, + ge=1024, + le=128000, + description="Maximum thinking/reasoning tokens per LLM call. " + "Extended thinking on Opus can generate 50k+ tokens at $75/M — " + "capping this is the single biggest cost lever. " +
"8192 is sufficient for most tasks; increase for complex reasoning.", + ) claude_agent_max_transient_retries: int = Field( default=3, ge=0, diff --git a/autogpt_platform/backend/backend/copilot/sdk/service.py b/autogpt_platform/backend/backend/copilot/sdk/service.py index 35b87cd40c..5ee6bba8ca 100644 --- a/autogpt_platform/backend/backend/copilot/sdk/service.py +++ b/autogpt_platform/backend/backend/copilot/sdk/service.py @@ -2238,6 +2238,10 @@ async def stream_chat_completion_sdk( "max_turns": config.claude_agent_max_turns, # max_budget_usd: per-query spend ceiling enforced by the CLI. "max_budget_usd": config.claude_agent_max_budget_usd, + # max_thinking_tokens: cap extended thinking output per LLM call. + # Thinking tokens are billed at output rate ($75/M for Opus) and + # account for ~54% of total cost. 8192 is the default. + "max_thinking_tokens": config.claude_agent_max_thinking_tokens, } if sdk_model: sdk_options_kwargs["model"] = sdk_model