fix(anthropic): token limits for streaming with tool calls (#3084)

* remove the max-token default for Bedrock since it handles this automatically

* fix

* fix inference config reference

* add to docs

* cap intermediate calls at the min of the user's max tokens and the non-streaming limit (see the sketch below)
Vikhyath Mondreti
2026-01-30 12:08:16 -08:00
committed by GitHub
parent f7c3de0591
commit 656beb8383
5 changed files with 41 additions and 21 deletions
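
Condensed, the capping described in the commit message works like the sketch below. The non-streaming limit would come from getMaxOutputTokensForModel(model, false) in the provider code further down; the standalone helper and the example values here are illustrative only, not the exact provider code.

// Illustrative TypeScript sketch: intermediate (tool-calling) requests are
// capped at the model's non-streaming limit, while the final streaming
// response keeps the user's configured max tokens.
function capIntermediateMaxTokens(
  requestedMaxTokens: number | undefined,
  nonStreamingLimit: number // e.g. 8,192 per the docs change below
): number {
  // A lower user-configured value is respected; anything higher is capped
  // to avoid the Anthropic SDK's timeout error on long non-streaming calls.
  return requestedMaxTokens !== undefined
    ? Math.min(requestedMaxTokens, nonStreamingLimit)
    : nonStreamingLimit
}

// Example: a configured 100,000 max tokens is capped to 8,192 for tool-call rounds.
const intermediateMaxTokens = capIntermediateMaxTokens(100_000, 8192) // => 8192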

View File

@@ -0,0 +1,7 @@
+Based on the given area of interest, please:
+1. Explore the codebase around that area of interest and gather general information such as keywords and an architecture overview.
+2. Spawn n=10 (unless specified otherwise) task agents to dig deeper into the codebase around that area of interest; some of them should take out-of-the-box approaches for variance.
+3. Once the task agents are done, use the information to do what the user wants.
+If the user is in plan mode, use the information to create the plan.

View File

@@ -58,7 +58,7 @@ Controls response randomness and creativity:
### Max Output Tokens
-Controls the maximum length of the model's response. For Anthropic models, Sim uses reliable defaults: streaming executions use the model's full capacity (e.g. 64,000 tokens for Claude 4.5), while non-streaming executions default to 8,192 to avoid timeout issues. For long-form content generation via API, explicitly set a higher value.
+Controls the maximum length of the model's response. For Anthropic models, Sim uses reliable defaults: streaming executions use the model's full capacity (e.g. 64,000 tokens for Claude 4.5), while non-streaming executions default to 8,192 to avoid timeout issues. When using tools with Anthropic models, intermediate tool-calling requests use a capped limit of 8,192 tokens to avoid SDK timeout errors, regardless of your configured max tokens—the final streaming response uses your full configured limit. This only affects Anthropic's direct API; AWS Bedrock handles this automatically. For long-form content generation via API, explicitly set a higher value.
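
To make the documented behavior concrete, here is a short illustrative example; the 8,192 figure comes from the paragraph above, while the configured value and variable names are hypothetical.

// Hypothetical user-configured max output tokens.
const configuredMaxTokens = 16000

// Intermediate tool-calling requests are capped at the non-streaming default.
const intermediateLimit = Math.min(configuredMaxTokens, 8192) // => 8192

// The final streaming response keeps the full configured limit.
const finalStreamingLimit = configuredMaxTokens // => 16000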
### API Key

View File

@@ -302,13 +302,21 @@ export const anthropicProvider: ProviderConfig = {
const providerStartTime = Date.now()
const providerStartTimeISO = new Date(providerStartTime).toISOString()
+// Cap intermediate calls at non-streaming limit to avoid SDK timeout errors,
+// but allow users to set lower values if desired
+const nonStreamingLimit = getMaxOutputTokensForModel(request.model, false)
+const nonStreamingMaxTokens = request.maxTokens
+  ? Math.min(Number.parseInt(String(request.maxTokens)), nonStreamingLimit)
+  : nonStreamingLimit
+const intermediatePayload = { ...payload, max_tokens: nonStreamingMaxTokens }
try {
const initialCallTime = Date.now()
-const originalToolChoice = payload.tool_choice
+const originalToolChoice = intermediatePayload.tool_choice
const forcedTools = preparedTools?.forcedTools || []
let usedForcedTools: string[] = []
-let currentResponse = await anthropic.messages.create(payload)
+let currentResponse = await anthropic.messages.create(intermediatePayload)
const firstResponseTime = Date.now() - initialCallTime
let content = ''
@@ -491,7 +499,7 @@ export const anthropicProvider: ProviderConfig = {
toolsTime += thisToolsTime
const nextPayload = {
-...payload,
+...intermediatePayload,
messages: currentMessages,
}
@@ -674,13 +682,21 @@ export const anthropicProvider: ProviderConfig = {
const providerStartTime = Date.now()
const providerStartTimeISO = new Date(providerStartTime).toISOString()
+// Cap intermediate calls at non-streaming limit to avoid SDK timeout errors,
+// but allow users to set lower values if desired
+const nonStreamingLimit = getMaxOutputTokensForModel(request.model, false)
+const toolLoopMaxTokens = request.maxTokens
+  ? Math.min(Number.parseInt(String(request.maxTokens)), nonStreamingLimit)
+  : nonStreamingLimit
+const toolLoopPayload = { ...payload, max_tokens: toolLoopMaxTokens }
try {
const initialCallTime = Date.now()
-const originalToolChoice = payload.tool_choice
+const originalToolChoice = toolLoopPayload.tool_choice
const forcedTools = preparedTools?.forcedTools || []
let usedForcedTools: string[] = []
-let currentResponse = await anthropic.messages.create(payload)
+let currentResponse = await anthropic.messages.create(toolLoopPayload)
const firstResponseTime = Date.now() - initialCallTime
let content = ''
@@ -867,7 +883,7 @@ export const anthropicProvider: ProviderConfig = {
toolsTime += thisToolsTime
const nextPayload = {
-...payload,
+...toolLoopPayload,
messages: currentMessages,
}

View File

@@ -20,11 +20,7 @@ import {
generateToolUseId,
getBedrockInferenceProfileId,
} from '@/providers/bedrock/utils'
-import {
-getMaxOutputTokensForModel,
-getProviderDefaultModel,
-getProviderModels,
-} from '@/providers/models'
+import { getProviderDefaultModel, getProviderModels } from '@/providers/models'
import type {
ProviderConfig,
ProviderRequest,
@@ -261,11 +257,11 @@ export const bedrockProvider: ProviderConfig = {
const systemPromptWithSchema = systemContent
-const inferenceConfig = {
+const inferenceConfig: { temperature: number; maxTokens?: number } = {
temperature: Number.parseFloat(String(request.temperature ?? 0.7)),
-maxTokens:
-Number.parseInt(String(request.maxTokens)) ||
-getMaxOutputTokensForModel(request.model, request.stream ?? false),
}
+if (request.maxTokens != null) {
+inferenceConfig.maxTokens = Number.parseInt(String(request.maxTokens))
+}
const shouldStreamToolCalls = request.streamToolCalls ?? false

View File

@@ -34,10 +34,15 @@ export interface ModelCapabilities {
toolUsageControl?: boolean
computerUse?: boolean
nativeStructuredOutputs?: boolean
+/**
+* Max output tokens configuration for Anthropic SDK's streaming timeout workaround.
+* The Anthropic SDK throws an error for non-streaming requests that may take >10 minutes.
+* This only applies to direct Anthropic API calls, not Bedrock (which uses AWS SDK).
+*/
maxOutputTokens?: {
/** Maximum tokens for streaming requests */
max: number
-/** Safe default for non-streaming requests (to avoid timeout issues) */
+/** Safe default for non-streaming requests (to avoid Anthropic SDK timeout errors) */
default: number
}
reasoningEffort?: {
@@ -1709,7 +1714,6 @@ export const PROVIDER_DEFINITIONS: Record<string, ProviderDefinition> = {
capabilities: {
temperature: { min: 0, max: 1 },
nativeStructuredOutputs: true,
-maxOutputTokens: { max: 64000, default: 8192 },
},
contextWindow: 200000,
},
@@ -1723,7 +1727,6 @@ export const PROVIDER_DEFINITIONS: Record<string, ProviderDefinition> = {
capabilities: {
temperature: { min: 0, max: 1 },
nativeStructuredOutputs: true,
-maxOutputTokens: { max: 64000, default: 8192 },
},
contextWindow: 200000,
},
@@ -1737,7 +1740,6 @@ export const PROVIDER_DEFINITIONS: Record<string, ProviderDefinition> = {
capabilities: {
temperature: { min: 0, max: 1 },
nativeStructuredOutputs: true,
-maxOutputTokens: { max: 64000, default: 8192 },
},
contextWindow: 200000,
},
@@ -1751,7 +1753,6 @@ export const PROVIDER_DEFINITIONS: Record<string, ProviderDefinition> = {
capabilities: {
temperature: { min: 0, max: 1 },
nativeStructuredOutputs: true,
-maxOutputTokens: { max: 64000, default: 8192 },
},
contextWindow: 200000,
},