Mirror of https://github.com/simstudioai/sim.git (synced 2026-02-01 02:05:18 -05:00)
fix(anthropic): token limits for streaming with tool calls (#3084)
* remove for bedrock since they handle on their own
* fix
* fix inference config reference
* add to docs
* make it min between max tokens
commit 656beb8383 (parent f7c3de0591), committed by GitHub
.cursor/commands/council.md (new file, +7)
@@ -0,0 +1,7 @@
Based on the given area of interest, please:

1. Dig around the codebase in terms of that given area of interest, gather general information such as keywords and architecture overview.
2. Spawn off n=10 (unless specified otherwise) task agents to dig deeper into the codebase in terms of that given area of interest, some of them should be out of the box for variance.
3. Once the task agents are done, use the information to do what the user wants.

If user is in plan mode, use the information to create the plan.
@@ -58,7 +58,7 @@ Controls response randomness and creativity:
### Max Output Tokens

-Controls the maximum length of the model's response. For Anthropic models, Sim uses reliable defaults: streaming executions use the model's full capacity (e.g. 64,000 tokens for Claude 4.5), while non-streaming executions default to 8,192 to avoid timeout issues. For long-form content generation via API, explicitly set a higher value.
+Controls the maximum length of the model's response. For Anthropic models, Sim uses reliable defaults: streaming executions use the model's full capacity (e.g. 64,000 tokens for Claude 4.5), while non-streaming executions default to 8,192 to avoid timeout issues. When using tools with Anthropic models, intermediate tool-calling requests use a capped limit of 8,192 tokens to avoid SDK timeout errors, regardless of your configured max tokens—the final streaming response uses your full configured limit. This only affects Anthropic's direct API; AWS Bedrock handles this automatically. For long-form content generation via API, explicitly set a higher value.

### API Key
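For orientation, here is a minimal sketch of the limit selection described in the paragraph above, assuming a model defined with maxOutputTokens: { max: 64000, default: 8192 } (the Claude 4.5 values that appear in the model definitions further down). The function names are illustrative shorthand, not Sim's actual helpers.

```ts
// Illustrative only: how the documented limits play out for one model.
const claudeLimits = { max: 64_000, default: 8_192 }

// Final streaming response: the configured value, or the model's full capacity when unset.
function streamingMaxTokens(configured?: number): number {
  return configured ?? claudeLimits.max
}

// Intermediate tool-calling requests: capped at the safe non-streaming default,
// regardless of a larger configured value, to stay clear of the SDK timeout.
function toolLoopMaxTokens(configured?: number): number {
  return configured ? Math.min(configured, claudeLimits.default) : claudeLimits.default
}

streamingMaxTokens(20_000) // 20000 (final response honors the configured limit)
toolLoopMaxTokens(20_000)  // 8192  (intermediate calls stay under the cap)
streamingMaxTokens()       // 64000 (full capacity when nothing is configured)
```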
@@ -302,13 +302,21 @@ export const anthropicProvider: ProviderConfig = {
const providerStartTime = Date.now()
const providerStartTimeISO = new Date(providerStartTime).toISOString()

+ // Cap intermediate calls at non-streaming limit to avoid SDK timeout errors,
+ // but allow users to set lower values if desired
+ const nonStreamingLimit = getMaxOutputTokensForModel(request.model, false)
+ const nonStreamingMaxTokens = request.maxTokens
+   ? Math.min(Number.parseInt(String(request.maxTokens)), nonStreamingLimit)
+   : nonStreamingLimit
+ const intermediatePayload = { ...payload, max_tokens: nonStreamingMaxTokens }

try {
  const initialCallTime = Date.now()
- const originalToolChoice = payload.tool_choice
+ const originalToolChoice = intermediatePayload.tool_choice
  const forcedTools = preparedTools?.forcedTools || []
  let usedForcedTools: string[] = []

- let currentResponse = await anthropic.messages.create(payload)
+ let currentResponse = await anthropic.messages.create(intermediatePayload)
  const firstResponseTime = Date.now() - initialCallTime

  let content = ''
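A quick worked example of the new cap above (illustrative numbers, not additional code from the commit): with the model's non-streaming limit at 8,192 and a request configured for 20,000 tokens, every intermediate tool-calling request sends max_tokens: 8192.

```ts
// Worked example of the cap above; the 8192 limit assumes the Claude values defined in
// models.ts (getMaxOutputTokensForModel(request.model, false) would resolve to the default).
const payload = { model: 'example-model', max_tokens: 64_000 } // stand-in request payload
const nonStreamingLimit = 8_192
const requestedMaxTokens = 20_000

const nonStreamingMaxTokens = Math.min(requestedMaxTokens, nonStreamingLimit) // 8192
const intermediatePayload = { ...payload, max_tokens: nonStreamingMaxTokens }
// -> { model: 'example-model', max_tokens: 8192 }
```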
@@ -491,7 +499,7 @@ export const anthropicProvider: ProviderConfig = {
      toolsTime += thisToolsTime

      const nextPayload = {
-       ...payload,
+       ...intermediatePayload,
        messages: currentMessages,
      }
@@ -674,13 +682,21 @@ export const anthropicProvider: ProviderConfig = {
const providerStartTime = Date.now()
const providerStartTimeISO = new Date(providerStartTime).toISOString()

+ // Cap intermediate calls at non-streaming limit to avoid SDK timeout errors,
+ // but allow users to set lower values if desired
+ const nonStreamingLimit = getMaxOutputTokensForModel(request.model, false)
+ const toolLoopMaxTokens = request.maxTokens
+   ? Math.min(Number.parseInt(String(request.maxTokens)), nonStreamingLimit)
+   : nonStreamingLimit
+ const toolLoopPayload = { ...payload, max_tokens: toolLoopMaxTokens }

try {
  const initialCallTime = Date.now()
- const originalToolChoice = payload.tool_choice
+ const originalToolChoice = toolLoopPayload.tool_choice
  const forcedTools = preparedTools?.forcedTools || []
  let usedForcedTools: string[] = []

- let currentResponse = await anthropic.messages.create(payload)
+ let currentResponse = await anthropic.messages.create(toolLoopPayload)
  const firstResponseTime = Date.now() - initialCallTime

  let content = ''
@@ -867,7 +883,7 @@ export const anthropicProvider: ProviderConfig = {
      toolsTime += thisToolsTime

      const nextPayload = {
-       ...payload,
+       ...toolLoopPayload,
        messages: currentMessages,
      }
@@ -20,11 +20,7 @@ import {
  generateToolUseId,
  getBedrockInferenceProfileId,
} from '@/providers/bedrock/utils'
- import {
-   getMaxOutputTokensForModel,
-   getProviderDefaultModel,
-   getProviderModels,
- } from '@/providers/models'
+ import { getProviderDefaultModel, getProviderModels } from '@/providers/models'
import type {
  ProviderConfig,
  ProviderRequest,
@@ -261,11 +257,11 @@ export const bedrockProvider: ProviderConfig = {
  const systemPromptWithSchema = systemContent

- const inferenceConfig = {
+ const inferenceConfig: { temperature: number; maxTokens?: number } = {
    temperature: Number.parseFloat(String(request.temperature ?? 0.7)),
-   maxTokens:
-     Number.parseInt(String(request.maxTokens)) ||
-     getMaxOutputTokensForModel(request.model, request.stream ?? false),
  }
+ if (request.maxTokens != null) {
+   inferenceConfig.maxTokens = Number.parseInt(String(request.maxTokens))
+ }

  const shouldStreamToolCalls = request.streamToolCalls ?? false
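The effect of the Bedrock change above, sketched in isolation: maxTokens is only forwarded when the caller actually sets it, so an unset value leaves the key off entirely and Bedrock applies its own per-model maximum. This sketch mirrors the diff but is simplified and is not the provider's full code.

```ts
// Simplified mirror of the new inferenceConfig logic, for illustration only.
function buildInferenceConfig(temperature?: number, maxTokens?: number | string) {
  const inferenceConfig: { temperature: number; maxTokens?: number } = {
    temperature: Number.parseFloat(String(temperature ?? 0.7)),
  }
  if (maxTokens != null) {
    inferenceConfig.maxTokens = Number.parseInt(String(maxTokens))
  }
  return inferenceConfig
}

buildInferenceConfig(0.5, 4096) // { temperature: 0.5, maxTokens: 4096 }
buildInferenceConfig(0.5)       // { temperature: 0.5 } (Bedrock falls back to the model's own limit)
```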
@@ -34,10 +34,15 @@ export interface ModelCapabilities {
  toolUsageControl?: boolean
  computerUse?: boolean
  nativeStructuredOutputs?: boolean
+ /**
+  * Max output tokens configuration for Anthropic SDK's streaming timeout workaround.
+  * The Anthropic SDK throws an error for non-streaming requests that may take >10 minutes.
+  * This only applies to direct Anthropic API calls, not Bedrock (which uses AWS SDK).
+  */
  maxOutputTokens?: {
    /** Maximum tokens for streaming requests */
    max: number
-   /** Safe default for non-streaming requests (to avoid timeout issues) */
+   /** Safe default for non-streaming requests (to avoid Anthropic SDK timeout errors) */
    default: number
  }
  reasoningEffort?: {
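getMaxOutputTokensForModel itself is not part of this diff; given the field shape above, a plausible reading of what the Anthropic provider relies on is sketched below, with the lookup reduced to the capability object. This is an assumption, not the actual implementation in @/providers/models.

```ts
// Assumed behavior, inferred from the maxOutputTokens capability shape; the real helper
// takes a model id and resolves the capability from the provider definitions.
interface MaxOutputTokensCapability {
  max: number     // ceiling for streaming requests
  default: number // safe value for non-streaming requests (SDK timeout workaround)
}

function resolveMaxOutputTokens(
  capability: MaxOutputTokensCapability | undefined,
  streaming: boolean,
  fallback = 8_192 // assumed fallback when a model defines no capability
): number {
  if (!capability) return fallback
  return streaming ? capability.max : capability.default
}

resolveMaxOutputTokens({ max: 64_000, default: 8_192 }, true)  // 64000
resolveMaxOutputTokens({ max: 64_000, default: 8_192 }, false) // 8192
```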
@@ -1709,7 +1714,6 @@ export const PROVIDER_DEFINITIONS: Record<string, ProviderDefinition> = {
    capabilities: {
      temperature: { min: 0, max: 1 },
      nativeStructuredOutputs: true,
      maxOutputTokens: { max: 64000, default: 8192 },
    },
    contextWindow: 200000,
  },
@@ -1723,7 +1727,6 @@ export const PROVIDER_DEFINITIONS: Record<string, ProviderDefinition> = {
    capabilities: {
      temperature: { min: 0, max: 1 },
      nativeStructuredOutputs: true,
      maxOutputTokens: { max: 64000, default: 8192 },
    },
    contextWindow: 200000,
  },
@@ -1737,7 +1740,6 @@ export const PROVIDER_DEFINITIONS: Record<string, ProviderDefinition> = {
    capabilities: {
      temperature: { min: 0, max: 1 },
      nativeStructuredOutputs: true,
      maxOutputTokens: { max: 64000, default: 8192 },
    },
    contextWindow: 200000,
  },
@@ -1751,7 +1753,6 @@ export const PROVIDER_DEFINITIONS: Record<string, ProviderDefinition> = {
    capabilities: {
      temperature: { min: 0, max: 1 },
      nativeStructuredOutputs: true,
      maxOutputTokens: { max: 64000, default: 8192 },
    },
    contextWindow: 200000,
  },