fix(anthropic): token limits for streaming with tool calls (#3084)

* remove the max-token default for Bedrock since it handles this automatically

* fix

* fix inference config reference

* add to docs

* cap intermediate calls at the min of the user's max tokens and the non-streaming limit (see the sketch below)
Vikhyath Mondreti
2026-01-30 12:08:16 -08:00
committed by GitHub
parent f7c3de0591
commit 656beb8383
5 changed files with 41 additions and 21 deletions
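
Condensed, the capping described in the commit message works like the sketch below. The non-streaming limit would come from getMaxOutputTokensForModel(model, false) in the provider code further down; the standalone helper and the example values here are illustrative only, not the exact provider code.

// Illustrative TypeScript sketch: intermediate (tool-calling) requests are
// capped at the model's non-streaming limit, while the final streaming
// response keeps the user's configured max tokens.
function capIntermediateMaxTokens(
  requestedMaxTokens: number | undefined,
  nonStreamingLimit: number // e.g. 8,192 per the docs change below
): number {
  // A lower user-configured value is respected; anything higher is capped
  // to avoid the Anthropic SDK's timeout error on long non-streaming calls.
  return requestedMaxTokens !== undefined
    ? Math.min(requestedMaxTokens, nonStreamingLimit)
    : nonStreamingLimit
}

// Example: a configured 100,000 max tokens is capped to 8,192 for tool-call rounds.
const intermediateMaxTokens = capIntermediateMaxTokens(100_000, 8192) // => 8192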

View File

@@ -0,0 +1,7 @@
+Based on the given area of interest, please:
+1. Explore the codebase around that area of interest and gather general information such as keywords and an architecture overview.
+2. Spawn n=10 (unless specified otherwise) task agents to dig deeper into the codebase around that area of interest; some of them should take out-of-the-box approaches for variance.
+3. Once the task agents are done, use the information to do what the user wants.
+If the user is in plan mode, use the information to create the plan.

View File

@@ -58,7 +58,7 @@ Controls response randomness and creativity:
### Max Output Tokens
-Controls the maximum length of the model's response. For Anthropic models, Sim uses reliable defaults: streaming executions use the model's full capacity (e.g. 64,000 tokens for Claude 4.5), while non-streaming executions default to 8,192 to avoid timeout issues. For long-form content generation via API, explicitly set a higher value.
+Controls the maximum length of the model's response. For Anthropic models, Sim uses reliable defaults: streaming executions use the model's full capacity (e.g. 64,000 tokens for Claude 4.5), while non-streaming executions default to 8,192 to avoid timeout issues. When using tools with Anthropic models, intermediate tool-calling requests use a capped limit of 8,192 tokens to avoid SDK timeout errors, regardless of your configured max tokens—the final streaming response uses your full configured limit. This only affects Anthropic's direct API; AWS Bedrock handles this automatically. For long-form content generation via API, explicitly set a higher value.
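
To make the documented behavior concrete, here is a short illustrative example; the 8,192 figure comes from the paragraph above, while the configured value and variable names are hypothetical.

// Hypothetical user-configured max output tokens.
const configuredMaxTokens = 16000

// Intermediate tool-calling requests are capped at the non-streaming default.
const intermediateLimit = Math.min(configuredMaxTokens, 8192) // => 8192

// The final streaming response keeps the full configured limit.
const finalStreamingLimit = configuredMaxTokens // => 16000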
### API Key

View File

@@ -302,13 +302,21 @@ export const anthropicProvider: ProviderConfig = {
const providerStartTime = Date.now()
const providerStartTimeISO = new Date(providerStartTime).toISOString()
+// Cap intermediate calls at non-streaming limit to avoid SDK timeout errors,
+// but allow users to set lower values if desired
+const nonStreamingLimit = getMaxOutputTokensForModel(request.model, false)
+const nonStreamingMaxTokens = request.maxTokens
+  ? Math.min(Number.parseInt(String(request.maxTokens)), nonStreamingLimit)
+  : nonStreamingLimit
+const intermediatePayload = { ...payload, max_tokens: nonStreamingMaxTokens }
try {
const initialCallTime = Date.now()
-const originalToolChoice = payload.tool_choice
+const originalToolChoice = intermediatePayload.tool_choice
const forcedTools = preparedTools?.forcedTools || []
let usedForcedTools: string[] = []
-let currentResponse = await anthropic.messages.create(payload)
+let currentResponse = await anthropic.messages.create(intermediatePayload)
const firstResponseTime = Date.now() - initialCallTime
let content = ''
@@ -491,7 +499,7 @@ export const anthropicProvider: ProviderConfig = {
toolsTime += thisToolsTime
const nextPayload = {
-...payload,
+...intermediatePayload,
messages: currentMessages,
}
@@ -674,13 +682,21 @@ export const anthropicProvider: ProviderConfig = {
const providerStartTime = Date.now()
const providerStartTimeISO = new Date(providerStartTime).toISOString()
+// Cap intermediate calls at non-streaming limit to avoid SDK timeout errors,
+// but allow users to set lower values if desired
+const nonStreamingLimit = getMaxOutputTokensForModel(request.model, false)
+const toolLoopMaxTokens = request.maxTokens
+  ? Math.min(Number.parseInt(String(request.maxTokens)), nonStreamingLimit)
+  : nonStreamingLimit
+const toolLoopPayload = { ...payload, max_tokens: toolLoopMaxTokens }
try {
const initialCallTime = Date.now()
-const originalToolChoice = payload.tool_choice
+const originalToolChoice = toolLoopPayload.tool_choice
const forcedTools = preparedTools?.forcedTools || []
let usedForcedTools: string[] = []
-let currentResponse = await anthropic.messages.create(payload)
+let currentResponse = await anthropic.messages.create(toolLoopPayload)
const firstResponseTime = Date.now() - initialCallTime
let content = ''
@@ -867,7 +883,7 @@ export const anthropicProvider: ProviderConfig = {
toolsTime += thisToolsTime
const nextPayload = {
-...payload,
+...toolLoopPayload,
messages: currentMessages,
}

View File

@@ -20,11 +20,7 @@ import {
generateToolUseId,
getBedrockInferenceProfileId,
} from '@/providers/bedrock/utils'
-import {
-getMaxOutputTokensForModel,
-getProviderDefaultModel,
-getProviderModels,
-} from '@/providers/models'
+import { getProviderDefaultModel, getProviderModels } from '@/providers/models'
import type {
ProviderConfig,
ProviderRequest,
@@ -261,11 +257,11 @@ export const bedrockProvider: ProviderConfig = {
const systemPromptWithSchema = systemContent
-const inferenceConfig = {
+const inferenceConfig: { temperature: number; maxTokens?: number } = {
temperature: Number.parseFloat(String(request.temperature ?? 0.7)),
-maxTokens:
-Number.parseInt(String(request.maxTokens)) ||
-getMaxOutputTokensForModel(request.model, request.stream ?? false),
}
+if (request.maxTokens != null) {
+inferenceConfig.maxTokens = Number.parseInt(String(request.maxTokens))
+}
const shouldStreamToolCalls = request.streamToolCalls ?? false

View File

@@ -34,10 +34,15 @@ export interface ModelCapabilities {
toolUsageControl?: boolean
computerUse?: boolean
nativeStructuredOutputs?: boolean
+/**
+* Max output tokens configuration for Anthropic SDK's streaming timeout workaround.
+* The Anthropic SDK throws an error for non-streaming requests that may take >10 minutes.
+* This only applies to direct Anthropic API calls, not Bedrock (which uses AWS SDK).
+*/
maxOutputTokens?: {
/** Maximum tokens for streaming requests */
max: number
-/** Safe default for non-streaming requests (to avoid timeout issues) */
+/** Safe default for non-streaming requests (to avoid Anthropic SDK timeout errors) */
default: number
}
reasoningEffort?: {
@@ -1709,7 +1714,6 @@ export const PROVIDER_DEFINITIONS: Record<string, ProviderDefinition> = {
capabilities: {
temperature: { min: 0, max: 1 },
nativeStructuredOutputs: true,
-maxOutputTokens: { max: 64000, default: 8192 },
},
contextWindow: 200000,
},
@@ -1723,7 +1727,6 @@ export const PROVIDER_DEFINITIONS: Record<string, ProviderDefinition> = {
capabilities: {
temperature: { min: 0, max: 1 },
nativeStructuredOutputs: true,
-maxOutputTokens: { max: 64000, default: 8192 },
},
contextWindow: 200000,
},
@@ -1737,7 +1740,6 @@ export const PROVIDER_DEFINITIONS: Record<string, ProviderDefinition> = {
capabilities: {
temperature: { min: 0, max: 1 },
nativeStructuredOutputs: true,
-maxOutputTokens: { max: 64000, default: 8192 },
},
contextWindow: 200000,
},
@@ -1751,7 +1753,6 @@ export const PROVIDER_DEFINITIONS: Record<string, ProviderDefinition> = {
capabilities: {
temperature: { min: 0, max: 1 },
nativeStructuredOutputs: true,
-maxOutputTokens: { max: 64000, default: 8192 },
},
contextWindow: 200000,
},