diff --git a/apps/sim/blocks/blocks/agent.ts b/apps/sim/blocks/blocks/agent.ts index bf8ec0d66..a841492c5 100644 --- a/apps/sim/blocks/blocks/agent.ts +++ b/apps/sim/blocks/blocks/agent.ts @@ -164,6 +164,7 @@ Return ONLY the JSON array.`, type: 'dropdown', placeholder: 'Select reasoning effort...', options: [ + { label: 'auto', id: 'auto' }, { label: 'low', id: 'low' }, { label: 'medium', id: 'medium' }, { label: 'high', id: 'high' }, @@ -173,9 +174,12 @@ Return ONLY the JSON array.`, const { useSubBlockStore } = await import('@/stores/workflows/subblock/store') const { useWorkflowRegistry } = await import('@/stores/workflows/registry/store') + const autoOption = { label: 'auto', id: 'auto' } + const activeWorkflowId = useWorkflowRegistry.getState().activeWorkflowId if (!activeWorkflowId) { return [ + autoOption, { label: 'low', id: 'low' }, { label: 'medium', id: 'medium' }, { label: 'high', id: 'high' }, @@ -188,6 +192,7 @@ Return ONLY the JSON array.`, if (!modelValue) { return [ + autoOption, { label: 'low', id: 'low' }, { label: 'medium', id: 'medium' }, { label: 'high', id: 'high' }, @@ -197,15 +202,15 @@ Return ONLY the JSON array.`, const validOptions = getReasoningEffortValuesForModel(modelValue) if (!validOptions) { return [ + autoOption, { label: 'low', id: 'low' }, { label: 'medium', id: 'medium' }, { label: 'high', id: 'high' }, ] } - return validOptions.map((opt) => ({ label: opt, id: opt })) + return [autoOption, ...validOptions.map((opt) => ({ label: opt, id: opt }))] }, - value: () => 'medium', condition: { field: 'model', value: MODELS_WITH_REASONING_EFFORT, @@ -217,6 +222,7 @@ Return ONLY the JSON array.`, type: 'dropdown', placeholder: 'Select verbosity...', options: [ + { label: 'auto', id: 'auto' }, { label: 'low', id: 'low' }, { label: 'medium', id: 'medium' }, { label: 'high', id: 'high' }, @@ -226,9 +232,12 @@ Return ONLY the JSON array.`, const { useSubBlockStore } = await import('@/stores/workflows/subblock/store') const { useWorkflowRegistry } = await import('@/stores/workflows/registry/store') + const autoOption = { label: 'auto', id: 'auto' } + const activeWorkflowId = useWorkflowRegistry.getState().activeWorkflowId if (!activeWorkflowId) { return [ + autoOption, { label: 'low', id: 'low' }, { label: 'medium', id: 'medium' }, { label: 'high', id: 'high' }, @@ -241,6 +250,7 @@ Return ONLY the JSON array.`, if (!modelValue) { return [ + autoOption, { label: 'low', id: 'low' }, { label: 'medium', id: 'medium' }, { label: 'high', id: 'high' }, @@ -250,15 +260,15 @@ Return ONLY the JSON array.`, const validOptions = getVerbosityValuesForModel(modelValue) if (!validOptions) { return [ + autoOption, { label: 'low', id: 'low' }, { label: 'medium', id: 'medium' }, { label: 'high', id: 'high' }, ] } - return validOptions.map((opt) => ({ label: opt, id: opt })) + return [autoOption, ...validOptions.map((opt) => ({ label: opt, id: opt }))] }, - value: () => 'medium', condition: { field: 'model', value: MODELS_WITH_VERBOSITY, @@ -270,6 +280,7 @@ Return ONLY the JSON array.`, type: 'dropdown', placeholder: 'Select thinking level...', options: [ + { label: 'none', id: 'none' }, { label: 'minimal', id: 'minimal' }, { label: 'low', id: 'low' }, { label: 'medium', id: 'medium' }, @@ -281,12 +292,11 @@ Return ONLY the JSON array.`, const { useSubBlockStore } = await import('@/stores/workflows/subblock/store') const { useWorkflowRegistry } = await import('@/stores/workflows/registry/store') + const noneOption = { label: 'none', id: 'none' } + const activeWorkflowId = 
useWorkflowRegistry.getState().activeWorkflowId if (!activeWorkflowId) { - return [ - { label: 'low', id: 'low' }, - { label: 'high', id: 'high' }, - ] + return [noneOption, { label: 'low', id: 'low' }, { label: 'high', id: 'high' }] } const workflowValues = useSubBlockStore.getState().workflowValues[activeWorkflowId] @@ -294,23 +304,16 @@ Return ONLY the JSON array.`, const modelValue = blockValues?.model as string if (!modelValue) { - return [ - { label: 'low', id: 'low' }, - { label: 'high', id: 'high' }, - ] + return [noneOption, { label: 'low', id: 'low' }, { label: 'high', id: 'high' }] } const validOptions = getThinkingLevelsForModel(modelValue) if (!validOptions) { - return [ - { label: 'low', id: 'low' }, - { label: 'high', id: 'high' }, - ] + return [noneOption, { label: 'low', id: 'low' }, { label: 'high', id: 'high' }] } - return validOptions.map((opt) => ({ label: opt, id: opt })) + return [noneOption, ...validOptions.map((opt) => ({ label: opt, id: opt }))] }, - value: () => 'high', condition: { field: 'model', value: MODELS_WITH_THINKING, diff --git a/apps/sim/executor/handlers/agent/agent-handler.ts b/apps/sim/executor/handlers/agent/agent-handler.ts index b4c2794a8..6664ebf31 100644 --- a/apps/sim/executor/handlers/agent/agent-handler.ts +++ b/apps/sim/executor/handlers/agent/agent-handler.ts @@ -906,24 +906,17 @@ export class AgentBlockHandler implements BlockHandler { } } - // Find first system message const firstSystemIndex = messages.findIndex((msg) => msg.role === 'system') if (firstSystemIndex === -1) { - // No system message exists - add at position 0 messages.unshift({ role: 'system', content }) } else if (firstSystemIndex === 0) { - // System message already at position 0 - replace it - // Explicit systemPrompt parameter takes precedence over memory/messages messages[0] = { role: 'system', content } } else { - // System message exists but not at position 0 - move it to position 0 - // and update with new content messages.splice(firstSystemIndex, 1) messages.unshift({ role: 'system', content }) } - // Remove any additional system messages (keep only the first one) for (let i = messages.length - 1; i >= 1; i--) { if (messages[i].role === 'system') { messages.splice(i, 1) @@ -996,6 +989,7 @@ export class AgentBlockHandler implements BlockHandler { blockNameMapping, reasoningEffort: inputs.reasoningEffort, verbosity: inputs.verbosity, + thinkingLevel: inputs.thinkingLevel, } } @@ -1064,6 +1058,7 @@ export class AgentBlockHandler implements BlockHandler { isDeployedContext: ctx.isDeployedContext, reasoningEffort: providerRequest.reasoningEffort, verbosity: providerRequest.verbosity, + thinkingLevel: providerRequest.thinkingLevel, }) return this.processProviderResponse(response, block, responseFormat) @@ -1081,8 +1076,6 @@ export class AgentBlockHandler implements BlockHandler { logger.info(`[${requestId}] Resolving Vertex AI credential: ${credentialId}`) - // Get the credential - we need to find the owner - // Since we're in a workflow context, we can query the credential directly const credential = await db.query.account.findFirst({ where: eq(account.id, credentialId), }) @@ -1091,7 +1084,6 @@ export class AgentBlockHandler implements BlockHandler { throw new Error(`Vertex AI credential not found: ${credentialId}`) } - // Refresh the token if needed const { accessToken } = await refreshTokenIfNeeded(requestId, credential, credentialId) if (!accessToken) { diff --git a/apps/sim/executor/handlers/agent/types.ts b/apps/sim/executor/handlers/agent/types.ts index 
36002b7b0..c0731d9ee 100644 --- a/apps/sim/executor/handlers/agent/types.ts +++ b/apps/sim/executor/handlers/agent/types.ts @@ -34,6 +34,7 @@ export interface AgentInputs { bedrockRegion?: string reasoningEffort?: string verbosity?: string + thinkingLevel?: string } export interface ToolInput { diff --git a/apps/sim/providers/anthropic/core.ts b/apps/sim/providers/anthropic/core.ts index 3cd16eb4d..926ce44cf 100644 --- a/apps/sim/providers/anthropic/core.ts +++ b/apps/sim/providers/anthropic/core.ts @@ -113,6 +113,28 @@ function buildThinkingConfig( } } +/** + * The Anthropic SDK requires streaming for non-streaming requests when max_tokens exceeds + * this threshold, to avoid HTTP timeouts. When thinking is enabled and pushes max_tokens + * above this limit, we use streaming internally and collect the final message. + */ +const ANTHROPIC_SDK_NON_STREAMING_MAX_TOKENS = 21333 + +/** + * Creates an Anthropic message, automatically using streaming internally when max_tokens + * exceeds the SDK's non-streaming threshold. Returns the same Message object either way. + */ +async function createMessage( + anthropic: Anthropic, + payload: any +): Promise { + if (payload.max_tokens > ANTHROPIC_SDK_NON_STREAMING_MAX_TOKENS && !payload.stream) { + const stream = anthropic.messages.stream(payload) + return stream.finalMessage() + } + return anthropic.messages.create(payload) as Promise +} + /** * Executes a request using the Anthropic API with full tool loop support. * This is the shared core implementation used by both the standard Anthropic provider @@ -268,13 +290,35 @@ export async function executeAnthropicProviderRequest( } // Add extended thinking configuration if supported and requested - if (request.thinkingLevel) { + // The 'none' sentinel means "disable thinking" — skip configuration entirely. + if (request.thinkingLevel && request.thinkingLevel !== 'none') { const thinkingConfig = buildThinkingConfig(request.model, request.thinkingLevel) if (thinkingConfig) { payload.thinking = thinkingConfig.thinking if (thinkingConfig.outputConfig) { payload.output_config = thinkingConfig.outputConfig } + + // Per Anthropic docs: budget_tokens must be less than max_tokens. + // Ensure max_tokens leaves room for both thinking and text output. + if ( + thinkingConfig.thinking.type === 'enabled' && + 'budget_tokens' in thinkingConfig.thinking + ) { + const budgetTokens = thinkingConfig.thinking.budget_tokens + const minMaxTokens = budgetTokens + 4096 + if (payload.max_tokens < minMaxTokens) { + const modelMax = getMaxOutputTokensForModel(request.model, true) + payload.max_tokens = Math.min(minMaxTokens, modelMax) + logger.info( + `Adjusted max_tokens to ${payload.max_tokens} to satisfy budget_tokens (${budgetTokens}) constraint` + ) + } + } + + // Per Anthropic docs: thinking is not compatible with temperature or top_k modifications. + payload.temperature = undefined + const isAdaptive = thinkingConfig.thinking.type === 'adaptive' logger.info( `Using ${isAdaptive ? 'adaptive' : 'extended'} thinking for model: ${modelId} with ${isAdaptive ? `effort: ${request.thinkingLevel}` : `budget: ${(thinkingConfig.thinking as { budget_tokens: number }).budget_tokens}`}` @@ -288,7 +332,16 @@ export async function executeAnthropicProviderRequest( if (anthropicTools?.length) { payload.tools = anthropicTools - if (toolChoice !== 'auto') { + // Per Anthropic docs: forced tool_choice (type: "tool" or "any") is incompatible with + // thinking. Only auto and none are supported when thinking is enabled. 
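
A minimal standalone sketch of the streaming fallback that `createMessage` above introduces, assuming the `@anthropic-ai/sdk` client; the 21,333-token threshold and the stream-then-`finalMessage()` pattern come from this hunk, while the model id, thinking budget, and prompt are placeholders.

```ts
import Anthropic from '@anthropic-ai/sdk'

const ANTHROPIC_SDK_NON_STREAMING_MAX_TOKENS = 21333

// Mirror of createMessage() above: when max_tokens is too large for a plain
// non-streaming call, stream internally and collect the completed message.
async function createMessageSketch(
  anthropic: Anthropic,
  payload: Anthropic.Messages.MessageCreateParamsNonStreaming
): Promise<Anthropic.Messages.Message> {
  if (payload.max_tokens > ANTHROPIC_SDK_NON_STREAMING_MAX_TOKENS) {
    // messages.stream() opens an SSE stream; finalMessage() resolves with the
    // same Message shape that messages.create() would return.
    return anthropic.messages.stream(payload).finalMessage()
  }
  return anthropic.messages.create(payload)
}

// Hypothetical usage: a thinking budget pushes max_tokens past the threshold.
async function demo() {
  const anthropic = new Anthropic({ apiKey: process.env.ANTHROPIC_API_KEY })
  const message = await createMessageSketch(anthropic, {
    model: 'claude-sonnet-4-5', // placeholder model id
    max_tokens: 32000,
    thinking: { type: 'enabled', budget_tokens: 16000 },
    messages: [{ role: 'user', content: 'Outline the tradeoffs of streaming vs. polling.' }],
  })
  console.log(message.content)
}
```

Callers never need to know which path was taken, which is why every non-streaming call in the tool loop is routed through this helper.
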
+ if (payload.thinking) { + // Per Anthropic docs: only 'auto' (default) and 'none' work with thinking. + if (toolChoice === 'none') { + payload.tool_choice = { type: 'none' } + } + } else if (toolChoice === 'none') { + payload.tool_choice = { type: 'none' } + } else if (toolChoice !== 'auto') { payload.tool_choice = toolChoice } } @@ -386,12 +439,16 @@ export async function executeAnthropicProviderRequest( const providerStartTimeISO = new Date(providerStartTime).toISOString() // Cap intermediate calls at non-streaming limit to avoid SDK timeout errors, - // but allow users to set lower values if desired + // but allow users to set lower values if desired. Use Math.max to preserve + // thinking-adjusted max_tokens from payload when it's higher. const nonStreamingLimit = getMaxOutputTokensForModel(request.model, false) const nonStreamingMaxTokens = request.maxTokens ? Math.min(Number.parseInt(String(request.maxTokens)), nonStreamingLimit) : nonStreamingLimit - const intermediatePayload = { ...payload, max_tokens: nonStreamingMaxTokens } + const intermediatePayload = { + ...payload, + max_tokens: Math.max(nonStreamingMaxTokens, payload.max_tokens), + } try { const initialCallTime = Date.now() @@ -399,7 +456,7 @@ export async function executeAnthropicProviderRequest( const forcedTools = preparedTools?.forcedTools || [] let usedForcedTools: string[] = [] - let currentResponse = await anthropic.messages.create(intermediatePayload) + let currentResponse = await createMessage(anthropic, intermediatePayload) const firstResponseTime = Date.now() - initialCallTime let content = '' @@ -583,11 +640,20 @@ export async function executeAnthropicProviderRequest( }) } - // Add ONE assistant message with ALL tool_use blocks + // Per Anthropic docs: thinking blocks must be preserved in assistant messages + // during tool use to maintain reasoning continuity. + const thinkingBlocks = currentResponse.content.filter( + (item) => item.type === 'thinking' || item.type === 'redacted_thinking' + ) + + // Add ONE assistant message with thinking + tool_use blocks if (toolUseBlocks.length > 0) { currentMessages.push({ role: 'assistant', - content: toolUseBlocks as unknown as Anthropic.Messages.ContentBlock[], + content: [ + ...thinkingBlocks, + ...toolUseBlocks, + ] as unknown as Anthropic.Messages.ContentBlock[], }) } @@ -607,7 +673,11 @@ export async function executeAnthropicProviderRequest( messages: currentMessages, } + // Per Anthropic docs: forced tool_choice is incompatible with thinking. + // Only auto and none are supported when thinking is enabled. 
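
The tool-loop hunks above and below carry `thinking` / `redacted_thinking` blocks forward on the assistant turn that holds the `tool_use` blocks. A hedged sketch of that message-building step in isolation, assuming the Anthropic SDK content-block types; `response` stands in for `currentResponse`, and the cast mirrors the one used in the handler.

```ts
import type Anthropic from '@anthropic-ai/sdk'

// Rebuild the assistant turn so reasoning continuity survives tool use: thinking
// blocks are replayed ahead of the tool_use blocks they produced.
function buildAssistantTurn(
  response: Anthropic.Messages.Message
): Anthropic.Messages.MessageParam | null {
  const thinkingBlocks = response.content.filter(
    (block) => block.type === 'thinking' || block.type === 'redacted_thinking'
  )
  const toolUseBlocks = response.content.filter((block) => block.type === 'tool_use')

  if (toolUseBlocks.length === 0) return null

  return {
    role: 'assistant',
    content: [
      ...thinkingBlocks,
      ...toolUseBlocks,
    ] as unknown as Anthropic.Messages.ContentBlockParam[],
  }
}
```

Dropping the thinking blocks here can cause the follow-up request that carries the tool results to be rejected once extended thinking is enabled, which is why both the non-streaming and streaming tool loops were updated the same way.
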
+ const thinkingEnabled = !!payload.thinking if ( + !thinkingEnabled && typeof originalToolChoice === 'object' && hasUsedForcedTool && forcedTools.length > 0 @@ -624,7 +694,11 @@ export async function executeAnthropicProviderRequest( nextPayload.tool_choice = undefined logger.info('All forced tools have been used, removing tool_choice parameter') } - } else if (hasUsedForcedTool && typeof originalToolChoice === 'object') { + } else if ( + !thinkingEnabled && + hasUsedForcedTool && + typeof originalToolChoice === 'object' + ) { nextPayload.tool_choice = undefined logger.info( 'Removing tool_choice parameter for subsequent requests after forced tool was used' @@ -633,7 +707,7 @@ export async function executeAnthropicProviderRequest( const nextModelStartTime = Date.now() - currentResponse = await anthropic.messages.create(nextPayload) + currentResponse = await createMessage(anthropic, nextPayload) const nextCheckResult = checkForForcedToolUsage( currentResponse, @@ -779,12 +853,16 @@ export async function executeAnthropicProviderRequest( const providerStartTimeISO = new Date(providerStartTime).toISOString() // Cap intermediate calls at non-streaming limit to avoid SDK timeout errors, - // but allow users to set lower values if desired + // but allow users to set lower values if desired. Use Math.max to preserve + // thinking-adjusted max_tokens from payload when it's higher. const nonStreamingLimit = getMaxOutputTokensForModel(request.model, false) const toolLoopMaxTokens = request.maxTokens ? Math.min(Number.parseInt(String(request.maxTokens)), nonStreamingLimit) : nonStreamingLimit - const toolLoopPayload = { ...payload, max_tokens: toolLoopMaxTokens } + const toolLoopPayload = { + ...payload, + max_tokens: Math.max(toolLoopMaxTokens, payload.max_tokens), + } try { const initialCallTime = Date.now() @@ -792,7 +870,7 @@ export async function executeAnthropicProviderRequest( const forcedTools = preparedTools?.forcedTools || [] let usedForcedTools: string[] = [] - let currentResponse = await anthropic.messages.create(toolLoopPayload) + let currentResponse = await createMessage(anthropic, toolLoopPayload) const firstResponseTime = Date.now() - initialCallTime let content = '' @@ -989,11 +1067,20 @@ export async function executeAnthropicProviderRequest( }) } - // Add ONE assistant message with ALL tool_use blocks + // Per Anthropic docs: thinking blocks must be preserved in assistant messages + // during tool use to maintain reasoning continuity. + const thinkingBlocks = currentResponse.content.filter( + (item) => item.type === 'thinking' || item.type === 'redacted_thinking' + ) + + // Add ONE assistant message with thinking + tool_use blocks if (toolUseBlocks.length > 0) { currentMessages.push({ role: 'assistant', - content: toolUseBlocks as unknown as Anthropic.Messages.ContentBlock[], + content: [ + ...thinkingBlocks, + ...toolUseBlocks, + ] as unknown as Anthropic.Messages.ContentBlock[], }) } @@ -1013,7 +1100,15 @@ export async function executeAnthropicProviderRequest( messages: currentMessages, } - if (typeof originalToolChoice === 'object' && hasUsedForcedTool && forcedTools.length > 0) { + // Per Anthropic docs: forced tool_choice is incompatible with thinking. + // Only auto and none are supported when thinking is enabled. 
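
The `Math.max(..., payload.max_tokens)` changes in both tool-loop payloads exist to preserve the thinking-adjusted `max_tokens` computed earlier in this file. A standalone sketch of that adjustment, assuming a `getMaxOutputTokensForModel`-style ceiling lookup; the 4,096-token headroom figure is taken from the hunk above.

```ts
// Per Anthropic's constraint that budget_tokens < max_tokens, leave headroom for
// visible text output on top of the thinking budget, capped at the model's
// streaming max-output ceiling.
function adjustMaxTokensForThinking(
  requestedMaxTokens: number,
  budgetTokens: number,
  modelStreamingMax: number
): number {
  const minMaxTokens = budgetTokens + 4096 // headroom for the final text output
  if (requestedMaxTokens >= minMaxTokens) return requestedMaxTokens
  return Math.min(minMaxTokens, modelStreamingMax)
}

// e.g. an 8,192 default with a 16,000-token thinking budget is raised to 20,096,
// and the later Math.max(...) keeps that value instead of re-capping it at the
// non-streaming default.
adjustMaxTokensForThinking(8192, 16000, 64000) // => 20096
```
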
+ const thinkingEnabled = !!payload.thinking + if ( + !thinkingEnabled && + typeof originalToolChoice === 'object' && + hasUsedForcedTool && + forcedTools.length > 0 + ) { const remainingTools = forcedTools.filter((tool) => !usedForcedTools.includes(tool)) if (remainingTools.length > 0) { @@ -1026,7 +1121,11 @@ export async function executeAnthropicProviderRequest( nextPayload.tool_choice = undefined logger.info('All forced tools have been used, removing tool_choice parameter') } - } else if (hasUsedForcedTool && typeof originalToolChoice === 'object') { + } else if ( + !thinkingEnabled && + hasUsedForcedTool && + typeof originalToolChoice === 'object' + ) { nextPayload.tool_choice = undefined logger.info( 'Removing tool_choice parameter for subsequent requests after forced tool was used' @@ -1035,7 +1134,7 @@ export async function executeAnthropicProviderRequest( const nextModelStartTime = Date.now() - currentResponse = await anthropic.messages.create(nextPayload) + currentResponse = await createMessage(anthropic, nextPayload) const nextCheckResult = checkForForcedToolUsage( currentResponse, diff --git a/apps/sim/providers/azure-openai/index.ts b/apps/sim/providers/azure-openai/index.ts index ca63904df..50bf8d693 100644 --- a/apps/sim/providers/azure-openai/index.ts +++ b/apps/sim/providers/azure-openai/index.ts @@ -98,8 +98,10 @@ async function executeChatCompletionsRequest( if (request.temperature !== undefined) payload.temperature = request.temperature if (request.maxTokens != null) payload.max_completion_tokens = request.maxTokens - if (request.reasoningEffort !== undefined) payload.reasoning_effort = request.reasoningEffort - if (request.verbosity !== undefined) payload.verbosity = request.verbosity + if (request.reasoningEffort !== undefined && request.reasoningEffort !== 'auto') + payload.reasoning_effort = request.reasoningEffort + if (request.verbosity !== undefined && request.verbosity !== 'auto') + payload.verbosity = request.verbosity if (request.responseFormat) { payload.response_format = { diff --git a/apps/sim/providers/bedrock/index.ts b/apps/sim/providers/bedrock/index.ts index 57935394a..9df5d4bf0 100644 --- a/apps/sim/providers/bedrock/index.ts +++ b/apps/sim/providers/bedrock/index.ts @@ -197,6 +197,9 @@ export const bedrockProvider: ProviderConfig = { } else if (tc.type === 'function' && tc.function?.name) { toolChoice = { tool: { name: tc.function.name } } logger.info(`Using Bedrock tool_choice format: force tool "${tc.function.name}"`) + } else if (tc.type === 'any') { + toolChoice = { any: {} } + logger.info('Using Bedrock tool_choice format: any tool') } else { toolChoice = { auto: {} } } @@ -860,6 +863,11 @@ export const bedrockProvider: ProviderConfig = { content, model: request.model, tokens, + cost: { + input: cost.input, + output: cost.output, + total: cost.total, + }, toolCalls: toolCalls.length > 0 ? toolCalls.map((tc) => ({ diff --git a/apps/sim/providers/gemini/core.ts b/apps/sim/providers/gemini/core.ts index 5050672ea..4e7164b82 100644 --- a/apps/sim/providers/gemini/core.ts +++ b/apps/sim/providers/gemini/core.ts @@ -24,7 +24,6 @@ import { extractTextContent, mapToThinkingLevel, } from '@/providers/google/utils' -import { getThinkingCapability } from '@/providers/models' import type { FunctionCallResponse, ProviderRequest, ProviderResponse } from '@/providers/types' import { calculateCost, @@ -432,13 +431,11 @@ export async function executeGeminiRequest( logger.warn('Gemini does not support responseFormat with tools. 
Structured output ignored.') } - // Configure thinking for models that support it - const thinkingCapability = getThinkingCapability(model) - if (thinkingCapability) { - const level = request.thinkingLevel ?? thinkingCapability.default ?? 'high' + // Configure thinking only when the user explicitly selects a thinking level + if (request.thinkingLevel && request.thinkingLevel !== 'none') { const thinkingConfig: ThinkingConfig = { includeThoughts: false, - thinkingLevel: mapToThinkingLevel(level), + thinkingLevel: mapToThinkingLevel(request.thinkingLevel), } geminiConfig.thinkingConfig = thinkingConfig } diff --git a/apps/sim/providers/models.ts b/apps/sim/providers/models.ts index 3662e1ca5..f503d2d03 100644 --- a/apps/sim/providers/models.ts +++ b/apps/sim/providers/models.ts @@ -40,9 +40,9 @@ export interface ModelCapabilities { * This only applies to direct Anthropic API calls, not Bedrock (which uses AWS SDK). */ maxOutputTokens?: { - /** Maximum tokens for streaming requests */ + /** Maximum supported output tokens (used for streaming requests) */ max: number - /** Safe default for non-streaming requests (to avoid Anthropic SDK timeout errors) */ + /** Conservative default when user doesn't specify maxTokens (controls cost/latency) */ default: number } reasoningEffort?: { @@ -109,7 +109,7 @@ export const PROVIDER_DEFINITIONS: Record = { name: 'OpenAI', description: "OpenAI's models", defaultModel: 'gpt-4o', - modelPatterns: [/^gpt/, /^o1/, /^text-embedding/], + modelPatterns: [/^gpt/, /^o\d/, /^text-embedding/], icon: OpenAIIcon, capabilities: { toolUsageControl: true, @@ -138,7 +138,7 @@ export const PROVIDER_DEFINITIONS: Record = { }, capabilities: { reasoningEffort: { - values: ['none', 'minimal', 'low', 'medium', 'high', 'xhigh'], + values: ['none', 'low', 'medium', 'high', 'xhigh'], }, verbosity: { values: ['low', 'medium', 'high'], @@ -164,60 +164,6 @@ export const PROVIDER_DEFINITIONS: Record = { }, contextWindow: 400000, }, - // { - // id: 'gpt-5.1-mini', - // pricing: { - // input: 0.25, - // cachedInput: 0.025, - // output: 2.0, - // updatedAt: '2025-11-14', - // }, - // capabilities: { - // reasoningEffort: { - // values: ['none', 'low', 'medium', 'high'], - // }, - // verbosity: { - // values: ['low', 'medium', 'high'], - // }, - // }, - // contextWindow: 400000, - // }, - // { - // id: 'gpt-5.1-nano', - // pricing: { - // input: 0.05, - // cachedInput: 0.005, - // output: 0.4, - // updatedAt: '2025-11-14', - // }, - // capabilities: { - // reasoningEffort: { - // values: ['none', 'low', 'medium', 'high'], - // }, - // verbosity: { - // values: ['low', 'medium', 'high'], - // }, - // }, - // contextWindow: 400000, - // }, - // { - // id: 'gpt-5.1-codex', - // pricing: { - // input: 1.25, - // cachedInput: 0.125, - // output: 10.0, - // updatedAt: '2025-11-14', - // }, - // capabilities: { - // reasoningEffort: { - // values: ['none', 'medium', 'high'], - // }, - // verbosity: { - // values: ['low', 'medium', 'high'], - // }, - // }, - // contextWindow: 400000, - // }, { id: 'gpt-5', pricing: { @@ -280,8 +226,10 @@ export const PROVIDER_DEFINITIONS: Record = { output: 10.0, updatedAt: '2025-08-07', }, - capabilities: {}, - contextWindow: 400000, + capabilities: { + temperature: { min: 0, max: 2 }, + }, + contextWindow: 128000, }, { id: 'o1', @@ -311,7 +259,7 @@ export const PROVIDER_DEFINITIONS: Record = { values: ['low', 'medium', 'high'], }, }, - contextWindow: 128000, + contextWindow: 200000, }, { id: 'o4-mini', @@ -326,7 +274,7 @@ export const PROVIDER_DEFINITIONS: 
Record = { values: ['low', 'medium', 'high'], }, }, - contextWindow: 128000, + contextWindow: 200000, }, { id: 'gpt-4.1', @@ -413,7 +361,7 @@ export const PROVIDER_DEFINITIONS: Record = { maxOutputTokens: { max: 64000, default: 8192 }, thinking: { levels: ['low', 'medium', 'high'], - default: 'medium', + default: 'high', }, }, contextWindow: 200000, @@ -429,10 +377,10 @@ export const PROVIDER_DEFINITIONS: Record = { capabilities: { temperature: { min: 0, max: 1 }, nativeStructuredOutputs: true, - maxOutputTokens: { max: 64000, default: 8192 }, + maxOutputTokens: { max: 32000, default: 8192 }, thinking: { levels: ['low', 'medium', 'high'], - default: 'medium', + default: 'high', }, }, contextWindow: 200000, @@ -447,10 +395,10 @@ export const PROVIDER_DEFINITIONS: Record = { }, capabilities: { temperature: { min: 0, max: 1 }, - maxOutputTokens: { max: 64000, default: 8192 }, + maxOutputTokens: { max: 32000, default: 8192 }, thinking: { levels: ['low', 'medium', 'high'], - default: 'medium', + default: 'high', }, }, contextWindow: 200000, @@ -469,7 +417,7 @@ export const PROVIDER_DEFINITIONS: Record = { maxOutputTokens: { max: 64000, default: 8192 }, thinking: { levels: ['low', 'medium', 'high'], - default: 'medium', + default: 'high', }, }, contextWindow: 200000, @@ -487,7 +435,7 @@ export const PROVIDER_DEFINITIONS: Record = { maxOutputTokens: { max: 64000, default: 8192 }, thinking: { levels: ['low', 'medium', 'high'], - default: 'medium', + default: 'high', }, }, contextWindow: 200000, @@ -506,7 +454,7 @@ export const PROVIDER_DEFINITIONS: Record = { maxOutputTokens: { max: 64000, default: 8192 }, thinking: { levels: ['low', 'medium', 'high'], - default: 'medium', + default: 'high', }, }, contextWindow: 200000, @@ -515,7 +463,7 @@ export const PROVIDER_DEFINITIONS: Record = { id: 'claude-3-haiku-20240307', pricing: { input: 0.25, - cachedInput: 0.025, + cachedInput: 0.03, output: 1.25, updatedAt: '2026-02-05', }, @@ -536,10 +484,10 @@ export const PROVIDER_DEFINITIONS: Record = { capabilities: { temperature: { min: 0, max: 1 }, computerUse: true, - maxOutputTokens: { max: 8192, default: 8192 }, + maxOutputTokens: { max: 64000, default: 8192 }, thinking: { levels: ['low', 'medium', 'high'], - default: 'medium', + default: 'high', }, }, contextWindow: 200000, @@ -580,7 +528,7 @@ export const PROVIDER_DEFINITIONS: Record = { }, capabilities: { reasoningEffort: { - values: ['none', 'minimal', 'low', 'medium', 'high', 'xhigh'], + values: ['none', 'low', 'medium', 'high', 'xhigh'], }, verbosity: { values: ['low', 'medium', 'high'], @@ -606,42 +554,6 @@ export const PROVIDER_DEFINITIONS: Record = { }, contextWindow: 400000, }, - { - id: 'azure/gpt-5.1-mini', - pricing: { - input: 0.25, - cachedInput: 0.025, - output: 2.0, - updatedAt: '2025-11-14', - }, - capabilities: { - reasoningEffort: { - values: ['none', 'low', 'medium', 'high'], - }, - verbosity: { - values: ['low', 'medium', 'high'], - }, - }, - contextWindow: 400000, - }, - { - id: 'azure/gpt-5.1-nano', - pricing: { - input: 0.05, - cachedInput: 0.005, - output: 0.4, - updatedAt: '2025-11-14', - }, - capabilities: { - reasoningEffort: { - values: ['none', 'low', 'medium', 'high'], - }, - verbosity: { - values: ['low', 'medium', 'high'], - }, - }, - contextWindow: 400000, - }, { id: 'azure/gpt-5.1-codex', pricing: { @@ -652,7 +564,7 @@ export const PROVIDER_DEFINITIONS: Record = { }, capabilities: { reasoningEffort: { - values: ['none', 'medium', 'high'], + values: ['none', 'low', 'medium', 'high'], }, verbosity: { values: ['low', 
'medium', 'high'], @@ -722,23 +634,25 @@ export const PROVIDER_DEFINITIONS: Record = { output: 10.0, updatedAt: '2025-08-07', }, - capabilities: {}, - contextWindow: 400000, + capabilities: { + temperature: { min: 0, max: 2 }, + }, + contextWindow: 128000, }, { id: 'azure/o3', pricing: { - input: 10, - cachedInput: 2.5, - output: 40, - updatedAt: '2025-06-15', + input: 2, + cachedInput: 0.5, + output: 8, + updatedAt: '2026-02-06', }, capabilities: { reasoningEffort: { values: ['low', 'medium', 'high'], }, }, - contextWindow: 128000, + contextWindow: 200000, }, { id: 'azure/o4-mini', @@ -753,7 +667,7 @@ export const PROVIDER_DEFINITIONS: Record = { values: ['low', 'medium', 'high'], }, }, - contextWindow: 128000, + contextWindow: 200000, }, { id: 'azure/gpt-4.1', @@ -763,7 +677,35 @@ export const PROVIDER_DEFINITIONS: Record = { output: 8.0, updatedAt: '2025-06-15', }, - capabilities: {}, + capabilities: { + temperature: { min: 0, max: 2 }, + }, + contextWindow: 1000000, + }, + { + id: 'azure/gpt-4.1-mini', + pricing: { + input: 0.4, + cachedInput: 0.1, + output: 1.6, + updatedAt: '2025-06-15', + }, + capabilities: { + temperature: { min: 0, max: 2 }, + }, + contextWindow: 1000000, + }, + { + id: 'azure/gpt-4.1-nano', + pricing: { + input: 0.1, + cachedInput: 0.025, + output: 0.4, + updatedAt: '2025-06-15', + }, + capabilities: { + temperature: { min: 0, max: 2 }, + }, contextWindow: 1000000, }, { @@ -775,7 +717,7 @@ export const PROVIDER_DEFINITIONS: Record = { updatedAt: '2025-06-15', }, capabilities: {}, - contextWindow: 1000000, + contextWindow: 200000, }, ], }, @@ -823,7 +765,7 @@ export const PROVIDER_DEFINITIONS: Record = { maxOutputTokens: { max: 64000, default: 8192 }, thinking: { levels: ['low', 'medium', 'high'], - default: 'medium', + default: 'high', }, }, contextWindow: 200000, @@ -842,7 +784,7 @@ export const PROVIDER_DEFINITIONS: Record = { maxOutputTokens: { max: 64000, default: 8192 }, thinking: { levels: ['low', 'medium', 'high'], - default: 'medium', + default: 'high', }, }, contextWindow: 200000, @@ -858,10 +800,10 @@ export const PROVIDER_DEFINITIONS: Record = { capabilities: { temperature: { min: 0, max: 1 }, nativeStructuredOutputs: true, - maxOutputTokens: { max: 64000, default: 8192 }, + maxOutputTokens: { max: 32000, default: 8192 }, thinking: { levels: ['low', 'medium', 'high'], - default: 'medium', + default: 'high', }, }, contextWindow: 200000, @@ -880,7 +822,7 @@ export const PROVIDER_DEFINITIONS: Record = { maxOutputTokens: { max: 64000, default: 8192 }, thinking: { levels: ['low', 'medium', 'high'], - default: 'medium', + default: 'high', }, }, contextWindow: 200000, diff --git a/apps/sim/providers/openai/core.ts b/apps/sim/providers/openai/core.ts index 8ed4c9386..4aca48033 100644 --- a/apps/sim/providers/openai/core.ts +++ b/apps/sim/providers/openai/core.ts @@ -130,14 +130,14 @@ export async function executeResponsesProviderRequest( if (request.temperature !== undefined) basePayload.temperature = request.temperature if (request.maxTokens != null) basePayload.max_output_tokens = request.maxTokens - if (request.reasoningEffort !== undefined) { + if (request.reasoningEffort !== undefined && request.reasoningEffort !== 'auto') { basePayload.reasoning = { effort: request.reasoningEffort, summary: 'auto', } } - if (request.verbosity !== undefined) { + if (request.verbosity !== undefined && request.verbosity !== 'auto') { basePayload.text = { ...(basePayload.text ?? 
{}), verbosity: request.verbosity, @@ -627,13 +627,13 @@ export async function executeResponsesProviderRequest( // Copy over non-tool related settings if (request.temperature !== undefined) finalPayload.temperature = request.temperature if (request.maxTokens != null) finalPayload.max_output_tokens = request.maxTokens - if (request.reasoningEffort !== undefined) { + if (request.reasoningEffort !== undefined && request.reasoningEffort !== 'auto') { finalPayload.reasoning = { effort: request.reasoningEffort, summary: 'auto', } } - if (request.verbosity !== undefined) { + if (request.verbosity !== undefined && request.verbosity !== 'auto') { finalPayload.text = { ...finalPayload.text, verbosity: request.verbosity, diff --git a/apps/sim/providers/utils.test.ts b/apps/sim/providers/utils.test.ts index 68575b875..60ab2467f 100644 --- a/apps/sim/providers/utils.test.ts +++ b/apps/sim/providers/utils.test.ts @@ -12,16 +12,22 @@ import { getApiKey, getBaseModelProviders, getHostedModels, + getMaxOutputTokensForModel, getMaxTemperature, + getModelPricing, getProvider, getProviderConfigFromModel, getProviderFromModel, getProviderModels, + getReasoningEffortValuesForModel, + getThinkingLevelsForModel, + getVerbosityValuesForModel, isProviderBlacklisted, MODELS_TEMP_RANGE_0_1, MODELS_TEMP_RANGE_0_2, MODELS_WITH_REASONING_EFFORT, MODELS_WITH_TEMPERATURE_SUPPORT, + MODELS_WITH_THINKING, MODELS_WITH_VERBOSITY, PROVIDERS_WITH_TOOL_USAGE_CONTROL, prepareToolExecution, @@ -169,6 +175,8 @@ describe('Model Capabilities', () => { 'gpt-4.1', 'gpt-4.1-mini', 'gpt-4.1-nano', + 'gpt-5-chat-latest', + 'azure/gpt-5-chat-latest', 'gemini-2.5-flash', 'claude-sonnet-4-0', 'claude-opus-4-0', @@ -186,34 +194,27 @@ describe('Model Capabilities', () => { it.concurrent('should return false for models that do not support temperature', () => { const unsupportedModels = [ 'unsupported-model', - 'cerebras/llama-3.3-70b', // Cerebras models don't have temperature defined - 'groq/meta-llama/llama-4-scout-17b-16e-instruct', // Groq models don't have temperature defined - // Reasoning models that don't support temperature + 'cerebras/llama-3.3-70b', + 'groq/meta-llama/llama-4-scout-17b-16e-instruct', 'o1', 'o3', 'o4-mini', 'azure/o3', 'azure/o4-mini', 'deepseek-r1', - // Chat models that don't support temperature 'deepseek-chat', - 'azure/gpt-4.1', 'azure/model-router', - // GPT-5.1 models don't support temperature (removed in our implementation) 'gpt-5.1', 'azure/gpt-5.1', 'azure/gpt-5.1-mini', 'azure/gpt-5.1-nano', 'azure/gpt-5.1-codex', - // GPT-5 models don't support temperature (removed in our implementation) 'gpt-5', 'gpt-5-mini', 'gpt-5-nano', - 'gpt-5-chat-latest', 'azure/gpt-5', 'azure/gpt-5-mini', 'azure/gpt-5-nano', - 'azure/gpt-5-chat-latest', ] for (const model of unsupportedModels) { @@ -240,6 +241,8 @@ describe('Model Capabilities', () => { const modelsRange02 = [ 'gpt-4o', 'azure/gpt-4o', + 'gpt-5-chat-latest', + 'azure/gpt-5-chat-latest', 'gemini-2.5-pro', 'gemini-2.5-flash', 'deepseek-v3', @@ -268,28 +271,23 @@ describe('Model Capabilities', () => { expect(getMaxTemperature('unsupported-model')).toBeUndefined() expect(getMaxTemperature('cerebras/llama-3.3-70b')).toBeUndefined() expect(getMaxTemperature('groq/meta-llama/llama-4-scout-17b-16e-instruct')).toBeUndefined() - // Reasoning models that don't support temperature expect(getMaxTemperature('o1')).toBeUndefined() expect(getMaxTemperature('o3')).toBeUndefined() expect(getMaxTemperature('o4-mini')).toBeUndefined() 
expect(getMaxTemperature('azure/o3')).toBeUndefined() expect(getMaxTemperature('azure/o4-mini')).toBeUndefined() expect(getMaxTemperature('deepseek-r1')).toBeUndefined() - // GPT-5.1 models don't support temperature expect(getMaxTemperature('gpt-5.1')).toBeUndefined() expect(getMaxTemperature('azure/gpt-5.1')).toBeUndefined() expect(getMaxTemperature('azure/gpt-5.1-mini')).toBeUndefined() expect(getMaxTemperature('azure/gpt-5.1-nano')).toBeUndefined() expect(getMaxTemperature('azure/gpt-5.1-codex')).toBeUndefined() - // GPT-5 models don't support temperature expect(getMaxTemperature('gpt-5')).toBeUndefined() expect(getMaxTemperature('gpt-5-mini')).toBeUndefined() expect(getMaxTemperature('gpt-5-nano')).toBeUndefined() - expect(getMaxTemperature('gpt-5-chat-latest')).toBeUndefined() expect(getMaxTemperature('azure/gpt-5')).toBeUndefined() expect(getMaxTemperature('azure/gpt-5-mini')).toBeUndefined() expect(getMaxTemperature('azure/gpt-5-nano')).toBeUndefined() - expect(getMaxTemperature('azure/gpt-5-chat-latest')).toBeUndefined() }) it.concurrent('should be case insensitive', () => { @@ -340,13 +338,13 @@ describe('Model Capabilities', () => { expect(MODELS_TEMP_RANGE_0_2).toContain('gpt-4o') expect(MODELS_TEMP_RANGE_0_2).toContain('gemini-2.5-flash') expect(MODELS_TEMP_RANGE_0_2).toContain('deepseek-v3') - expect(MODELS_TEMP_RANGE_0_2).not.toContain('claude-sonnet-4-0') // Should be in 0-1 range + expect(MODELS_TEMP_RANGE_0_2).not.toContain('claude-sonnet-4-0') }) it.concurrent('should have correct models in MODELS_TEMP_RANGE_0_1', () => { expect(MODELS_TEMP_RANGE_0_1).toContain('claude-sonnet-4-0') expect(MODELS_TEMP_RANGE_0_1).toContain('grok-3-latest') - expect(MODELS_TEMP_RANGE_0_1).not.toContain('gpt-4o') // Should be in 0-2 range + expect(MODELS_TEMP_RANGE_0_1).not.toContain('gpt-4o') }) it.concurrent('should have correct providers in PROVIDERS_WITH_TOOL_USAGE_CONTROL', () => { @@ -363,20 +361,19 @@ describe('Model Capabilities', () => { expect(MODELS_WITH_TEMPERATURE_SUPPORT.length).toBe( MODELS_TEMP_RANGE_0_2.length + MODELS_TEMP_RANGE_0_1.length ) - expect(MODELS_WITH_TEMPERATURE_SUPPORT).toContain('gpt-4o') // From 0-2 range - expect(MODELS_WITH_TEMPERATURE_SUPPORT).toContain('claude-sonnet-4-0') // From 0-1 range + expect(MODELS_WITH_TEMPERATURE_SUPPORT).toContain('gpt-4o') + expect(MODELS_WITH_TEMPERATURE_SUPPORT).toContain('claude-sonnet-4-0') } ) it.concurrent('should have correct models in MODELS_WITH_REASONING_EFFORT', () => { - // Should contain GPT-5.1 models that support reasoning effort expect(MODELS_WITH_REASONING_EFFORT).toContain('gpt-5.1') expect(MODELS_WITH_REASONING_EFFORT).toContain('azure/gpt-5.1') - expect(MODELS_WITH_REASONING_EFFORT).toContain('azure/gpt-5.1-mini') - expect(MODELS_WITH_REASONING_EFFORT).toContain('azure/gpt-5.1-nano') expect(MODELS_WITH_REASONING_EFFORT).toContain('azure/gpt-5.1-codex') - // Should contain GPT-5 models that support reasoning effort + expect(MODELS_WITH_REASONING_EFFORT).not.toContain('azure/gpt-5.1-mini') + expect(MODELS_WITH_REASONING_EFFORT).not.toContain('azure/gpt-5.1-nano') + expect(MODELS_WITH_REASONING_EFFORT).toContain('gpt-5') expect(MODELS_WITH_REASONING_EFFORT).toContain('gpt-5-mini') expect(MODELS_WITH_REASONING_EFFORT).toContain('gpt-5-nano') @@ -384,35 +381,30 @@ describe('Model Capabilities', () => { expect(MODELS_WITH_REASONING_EFFORT).toContain('azure/gpt-5-mini') expect(MODELS_WITH_REASONING_EFFORT).toContain('azure/gpt-5-nano') - // Should contain gpt-5.2 models 
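
These capability tests back the dropdown changes in `agent.ts`: the block prepends an `auto` (or `none`) sentinel to whatever per-model values the helpers return, and the providers skip the parameter entirely when the sentinel is selected. A hedged sketch of that round trip; the helper signature matches the ones exercised here, but the import path and option shape are assumptions based on the block code above.

```ts
// Assumed import path; the helper itself is the one these tests exercise.
import { getReasoningEffortValuesForModel } from '@/providers/utils'

// Block side: 'auto' is always the first option and means "send nothing".
function reasoningEffortOptions(model?: string) {
  const fallback = ['low', 'medium', 'high']
  const values = (model && getReasoningEffortValuesForModel(model)) || fallback
  return [{ label: 'auto', id: 'auto' }, ...values.map((v) => ({ label: v, id: v }))]
}

// Provider side: the sentinel never reaches the request payload.
function applyReasoningEffort(payload: Record<string, unknown>, effort?: string) {
  if (effort !== undefined && effort !== 'auto') {
    payload.reasoning_effort = effort
  }
  return payload
}
```

With `gpt-5.2` this yields `auto, none, low, medium, high, xhigh` in the dropdown, and selecting `auto` leaves `reasoning_effort` off the payload so the provider's own default applies.
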
expect(MODELS_WITH_REASONING_EFFORT).toContain('gpt-5.2') expect(MODELS_WITH_REASONING_EFFORT).toContain('azure/gpt-5.2') - // Should contain o-series reasoning models (reasoning_effort added Dec 17, 2024) expect(MODELS_WITH_REASONING_EFFORT).toContain('o1') expect(MODELS_WITH_REASONING_EFFORT).toContain('o3') expect(MODELS_WITH_REASONING_EFFORT).toContain('o4-mini') expect(MODELS_WITH_REASONING_EFFORT).toContain('azure/o3') expect(MODELS_WITH_REASONING_EFFORT).toContain('azure/o4-mini') - // Should NOT contain non-reasoning GPT-5 models expect(MODELS_WITH_REASONING_EFFORT).not.toContain('gpt-5-chat-latest') expect(MODELS_WITH_REASONING_EFFORT).not.toContain('azure/gpt-5-chat-latest') - // Should NOT contain other models expect(MODELS_WITH_REASONING_EFFORT).not.toContain('gpt-4o') expect(MODELS_WITH_REASONING_EFFORT).not.toContain('claude-sonnet-4-0') }) it.concurrent('should have correct models in MODELS_WITH_VERBOSITY', () => { - // Should contain GPT-5.1 models that support verbosity expect(MODELS_WITH_VERBOSITY).toContain('gpt-5.1') expect(MODELS_WITH_VERBOSITY).toContain('azure/gpt-5.1') - expect(MODELS_WITH_VERBOSITY).toContain('azure/gpt-5.1-mini') - expect(MODELS_WITH_VERBOSITY).toContain('azure/gpt-5.1-nano') expect(MODELS_WITH_VERBOSITY).toContain('azure/gpt-5.1-codex') - // Should contain GPT-5 models that support verbosity + expect(MODELS_WITH_VERBOSITY).not.toContain('azure/gpt-5.1-mini') + expect(MODELS_WITH_VERBOSITY).not.toContain('azure/gpt-5.1-nano') + expect(MODELS_WITH_VERBOSITY).toContain('gpt-5') expect(MODELS_WITH_VERBOSITY).toContain('gpt-5-mini') expect(MODELS_WITH_VERBOSITY).toContain('gpt-5-nano') @@ -420,26 +412,39 @@ describe('Model Capabilities', () => { expect(MODELS_WITH_VERBOSITY).toContain('azure/gpt-5-mini') expect(MODELS_WITH_VERBOSITY).toContain('azure/gpt-5-nano') - // Should contain gpt-5.2 models expect(MODELS_WITH_VERBOSITY).toContain('gpt-5.2') expect(MODELS_WITH_VERBOSITY).toContain('azure/gpt-5.2') - // Should NOT contain non-reasoning GPT-5 models expect(MODELS_WITH_VERBOSITY).not.toContain('gpt-5-chat-latest') expect(MODELS_WITH_VERBOSITY).not.toContain('azure/gpt-5-chat-latest') - // Should NOT contain o-series models (they support reasoning_effort but not verbosity) expect(MODELS_WITH_VERBOSITY).not.toContain('o1') expect(MODELS_WITH_VERBOSITY).not.toContain('o3') expect(MODELS_WITH_VERBOSITY).not.toContain('o4-mini') - // Should NOT contain other models expect(MODELS_WITH_VERBOSITY).not.toContain('gpt-4o') expect(MODELS_WITH_VERBOSITY).not.toContain('claude-sonnet-4-0') }) + it.concurrent('should have correct models in MODELS_WITH_THINKING', () => { + expect(MODELS_WITH_THINKING).toContain('claude-opus-4-6') + expect(MODELS_WITH_THINKING).toContain('claude-opus-4-5') + expect(MODELS_WITH_THINKING).toContain('claude-opus-4-1') + expect(MODELS_WITH_THINKING).toContain('claude-opus-4-0') + expect(MODELS_WITH_THINKING).toContain('claude-sonnet-4-5') + expect(MODELS_WITH_THINKING).toContain('claude-sonnet-4-0') + + expect(MODELS_WITH_THINKING).toContain('gemini-3-pro-preview') + expect(MODELS_WITH_THINKING).toContain('gemini-3-flash-preview') + + expect(MODELS_WITH_THINKING).toContain('claude-haiku-4-5') + + expect(MODELS_WITH_THINKING).not.toContain('gpt-4o') + expect(MODELS_WITH_THINKING).not.toContain('gpt-5') + expect(MODELS_WITH_THINKING).not.toContain('o3') + }) + it.concurrent('should have GPT-5 models in both reasoning effort and verbosity arrays', () => { - // GPT-5 series models support both reasoning effort and verbosity const 
gpt5ModelsWithReasoningEffort = MODELS_WITH_REASONING_EFFORT.filter( (m) => m.includes('gpt-5') && !m.includes('chat-latest') ) @@ -448,11 +453,229 @@ describe('Model Capabilities', () => { ) expect(gpt5ModelsWithReasoningEffort.sort()).toEqual(gpt5ModelsWithVerbosity.sort()) - // o-series models have reasoning effort but NOT verbosity expect(MODELS_WITH_REASONING_EFFORT).toContain('o1') expect(MODELS_WITH_VERBOSITY).not.toContain('o1') }) }) + describe('Reasoning Effort Values Per Model', () => { + it.concurrent('should return correct values for GPT-5.2', () => { + const values = getReasoningEffortValuesForModel('gpt-5.2') + expect(values).toBeDefined() + expect(values).toContain('none') + expect(values).toContain('low') + expect(values).toContain('medium') + expect(values).toContain('high') + expect(values).toContain('xhigh') + expect(values).not.toContain('minimal') + }) + + it.concurrent('should return correct values for GPT-5', () => { + const values = getReasoningEffortValuesForModel('gpt-5') + expect(values).toBeDefined() + expect(values).toContain('minimal') + expect(values).toContain('low') + expect(values).toContain('medium') + expect(values).toContain('high') + }) + + it.concurrent('should return correct values for o-series models', () => { + for (const model of ['o1', 'o3', 'o4-mini']) { + const values = getReasoningEffortValuesForModel(model) + expect(values).toBeDefined() + expect(values).toContain('low') + expect(values).toContain('medium') + expect(values).toContain('high') + expect(values).not.toContain('none') + expect(values).not.toContain('minimal') + } + }) + + it.concurrent('should return null for non-reasoning models', () => { + expect(getReasoningEffortValuesForModel('gpt-4o')).toBeNull() + expect(getReasoningEffortValuesForModel('claude-sonnet-4-5')).toBeNull() + expect(getReasoningEffortValuesForModel('gemini-2.5-flash')).toBeNull() + }) + + it.concurrent('should return correct values for Azure GPT-5.2', () => { + const values = getReasoningEffortValuesForModel('azure/gpt-5.2') + expect(values).toBeDefined() + expect(values).not.toContain('minimal') + expect(values).toContain('xhigh') + }) + }) + + describe('Verbosity Values Per Model', () => { + it.concurrent('should return correct values for GPT-5 family', () => { + for (const model of ['gpt-5.2', 'gpt-5.1', 'gpt-5', 'gpt-5-mini', 'gpt-5-nano']) { + const values = getVerbosityValuesForModel(model) + expect(values).toBeDefined() + expect(values).toContain('low') + expect(values).toContain('medium') + expect(values).toContain('high') + } + }) + + it.concurrent('should return null for o-series models', () => { + expect(getVerbosityValuesForModel('o1')).toBeNull() + expect(getVerbosityValuesForModel('o3')).toBeNull() + expect(getVerbosityValuesForModel('o4-mini')).toBeNull() + }) + + it.concurrent('should return null for non-reasoning models', () => { + expect(getVerbosityValuesForModel('gpt-4o')).toBeNull() + expect(getVerbosityValuesForModel('claude-sonnet-4-5')).toBeNull() + }) + }) + + describe('Thinking Levels Per Model', () => { + it.concurrent('should return correct levels for Claude Opus 4.6 (adaptive)', () => { + const levels = getThinkingLevelsForModel('claude-opus-4-6') + expect(levels).toBeDefined() + expect(levels).toContain('low') + expect(levels).toContain('medium') + expect(levels).toContain('high') + expect(levels).toContain('max') + }) + + it.concurrent('should return correct levels for other Claude models (budget_tokens)', () => { + for (const model of ['claude-opus-4-5', 'claude-sonnet-4-5', 
'claude-sonnet-4-0']) { + const levels = getThinkingLevelsForModel(model) + expect(levels).toBeDefined() + expect(levels).toContain('low') + expect(levels).toContain('medium') + expect(levels).toContain('high') + expect(levels).not.toContain('max') + } + }) + + it.concurrent('should return correct levels for Gemini 3 models', () => { + const proLevels = getThinkingLevelsForModel('gemini-3-pro-preview') + expect(proLevels).toBeDefined() + expect(proLevels).toContain('low') + expect(proLevels).toContain('high') + + const flashLevels = getThinkingLevelsForModel('gemini-3-flash-preview') + expect(flashLevels).toBeDefined() + expect(flashLevels).toContain('minimal') + expect(flashLevels).toContain('low') + expect(flashLevels).toContain('medium') + expect(flashLevels).toContain('high') + }) + + it.concurrent('should return correct levels for Claude Haiku 4.5', () => { + const levels = getThinkingLevelsForModel('claude-haiku-4-5') + expect(levels).toBeDefined() + expect(levels).toContain('low') + expect(levels).toContain('medium') + expect(levels).toContain('high') + }) + + it.concurrent('should return null for non-thinking models', () => { + expect(getThinkingLevelsForModel('gpt-4o')).toBeNull() + expect(getThinkingLevelsForModel('gpt-5')).toBeNull() + expect(getThinkingLevelsForModel('o3')).toBeNull() + }) + }) +}) + +describe('Max Output Tokens', () => { + describe('getMaxOutputTokensForModel', () => { + it.concurrent('should return higher value for streaming than non-streaming (Anthropic)', () => { + const streamingTokens = getMaxOutputTokensForModel('claude-opus-4-6', true) + const nonStreamingTokens = getMaxOutputTokensForModel('claude-opus-4-6', false) + expect(streamingTokens).toBeGreaterThan(nonStreamingTokens) + expect(streamingTokens).toBe(128000) + expect(nonStreamingTokens).toBe(8192) + }) + + it.concurrent('should return correct values for Claude Sonnet 4.5', () => { + expect(getMaxOutputTokensForModel('claude-sonnet-4-5', true)).toBe(64000) + expect(getMaxOutputTokensForModel('claude-sonnet-4-5', false)).toBe(8192) + }) + + it.concurrent('should return correct values for Claude Opus 4.1', () => { + expect(getMaxOutputTokensForModel('claude-opus-4-1', true)).toBe(32000) + expect(getMaxOutputTokensForModel('claude-opus-4-1', false)).toBe(8192) + }) + + it.concurrent('should return standard default for models without maxOutputTokens', () => { + expect(getMaxOutputTokensForModel('gpt-4o', false)).toBe(4096) + expect(getMaxOutputTokensForModel('gpt-4o', true)).toBe(4096) + }) + + it.concurrent('should return standard default for unknown models', () => { + expect(getMaxOutputTokensForModel('unknown-model', false)).toBe(4096) + expect(getMaxOutputTokensForModel('unknown-model', true)).toBe(4096) + }) + + it.concurrent( + 'non-streaming default should be within Anthropic SDK non-streaming threshold', + () => { + const SDK_NON_STREAMING_THRESHOLD = 21333 + const models = [ + 'claude-opus-4-6', + 'claude-opus-4-5', + 'claude-opus-4-1', + 'claude-sonnet-4-5', + 'claude-sonnet-4-0', + 'claude-haiku-4-5', + ] + + for (const model of models) { + const nonStreamingDefault = getMaxOutputTokensForModel(model, false) + expect(nonStreamingDefault).toBeLessThan(SDK_NON_STREAMING_THRESHOLD) + } + } + ) + }) +}) + +describe('Model Pricing Validation', () => { + it.concurrent('should have correct pricing for key Anthropic models', () => { + const opus46 = getModelPricing('claude-opus-4-6') + expect(opus46).toBeDefined() + expect(opus46.input).toBe(5.0) + expect(opus46.output).toBe(25.0) + + const 
sonnet45 = getModelPricing('claude-sonnet-4-5') + expect(sonnet45).toBeDefined() + expect(sonnet45.input).toBe(3.0) + expect(sonnet45.output).toBe(15.0) + }) + + it.concurrent('should have correct pricing for key OpenAI models', () => { + const gpt4o = getModelPricing('gpt-4o') + expect(gpt4o).toBeDefined() + expect(gpt4o.input).toBe(2.5) + expect(gpt4o.output).toBe(10.0) + + const o3 = getModelPricing('o3') + expect(o3).toBeDefined() + expect(o3.input).toBe(2.0) + expect(o3.output).toBe(8.0) + }) + + it.concurrent('should have correct pricing for Azure OpenAI o3', () => { + const azureO3 = getModelPricing('azure/o3') + expect(azureO3).toBeDefined() + expect(azureO3.input).toBe(2.0) + expect(azureO3.output).toBe(8.0) + }) + + it.concurrent('should return null for unknown models', () => { + expect(getModelPricing('unknown-model')).toBeNull() + }) +}) + +describe('Context Window Validation', () => { + it.concurrent('should have correct context windows for key models', () => { + const allModels = getAllModels() + + expect(allModels).toContain('gpt-5-chat-latest') + + expect(allModels).toContain('o3') + expect(allModels).toContain('o4-mini') + }) }) describe('Cost Calculation', () => { @@ -464,7 +687,7 @@ describe('Cost Calculation', () => { expect(result.output).toBeGreaterThan(0) expect(result.total).toBeCloseTo(result.input + result.output, 6) expect(result.pricing).toBeDefined() - expect(result.pricing.input).toBe(2.5) // GPT-4o pricing + expect(result.pricing.input).toBe(2.5) }) it.concurrent('should handle cached input pricing when enabled', () => { @@ -472,7 +695,7 @@ describe('Cost Calculation', () => { const cachedCost = calculateCost('gpt-4o', 1000, 500, true) expect(cachedCost.input).toBeLessThan(regularCost.input) - expect(cachedCost.output).toBe(regularCost.output) // Output cost should be same + expect(cachedCost.output).toBe(regularCost.output) }) it.concurrent('should return default pricing for unknown models', () => { @@ -481,7 +704,7 @@ describe('Cost Calculation', () => { expect(result.input).toBe(0) expect(result.output).toBe(0) expect(result.total).toBe(0) - expect(result.pricing.input).toBe(1.0) // Default pricing + expect(result.pricing.input).toBe(1.0) }) it.concurrent('should handle zero tokens', () => { @@ -528,19 +751,15 @@ describe('getHostedModels', () => { it.concurrent('should return OpenAI, Anthropic, and Google models as hosted', () => { const hostedModels = getHostedModels() - // OpenAI models expect(hostedModels).toContain('gpt-4o') expect(hostedModels).toContain('o1') - // Anthropic models expect(hostedModels).toContain('claude-sonnet-4-0') expect(hostedModels).toContain('claude-opus-4-0') - // Google models expect(hostedModels).toContain('gemini-2.5-pro') expect(hostedModels).toContain('gemini-2.5-flash') - // Should not contain models from other providers expect(hostedModels).not.toContain('deepseek-v3') expect(hostedModels).not.toContain('grok-4-latest') }) @@ -558,31 +777,24 @@ describe('getHostedModels', () => { describe('shouldBillModelUsage', () => { it.concurrent('should return true for exact matches of hosted models', () => { - // OpenAI models expect(shouldBillModelUsage('gpt-4o')).toBe(true) expect(shouldBillModelUsage('o1')).toBe(true) - // Anthropic models expect(shouldBillModelUsage('claude-sonnet-4-0')).toBe(true) expect(shouldBillModelUsage('claude-opus-4-0')).toBe(true) - // Google models expect(shouldBillModelUsage('gemini-2.5-pro')).toBe(true) expect(shouldBillModelUsage('gemini-2.5-flash')).toBe(true) }) it.concurrent('should return false 
for non-hosted models', () => { - // Other providers expect(shouldBillModelUsage('deepseek-v3')).toBe(false) expect(shouldBillModelUsage('grok-4-latest')).toBe(false) - // Unknown models expect(shouldBillModelUsage('unknown-model')).toBe(false) }) it.concurrent('should return false for versioned model names not in hosted list', () => { - // Versioned model names that are NOT in the hosted list - // These should NOT be billed (user provides own API key) expect(shouldBillModelUsage('claude-sonnet-4-20250514')).toBe(false) expect(shouldBillModelUsage('gpt-4o-2024-08-06')).toBe(false) expect(shouldBillModelUsage('claude-3-5-sonnet-20241022')).toBe(false) @@ -595,8 +807,7 @@ describe('shouldBillModelUsage', () => { }) it.concurrent('should not match partial model names', () => { - // Should not match partial/prefix models - expect(shouldBillModelUsage('gpt-4')).toBe(false) // gpt-4o is hosted, not gpt-4 + expect(shouldBillModelUsage('gpt-4')).toBe(false) expect(shouldBillModelUsage('claude-sonnet')).toBe(false) expect(shouldBillModelUsage('gemini')).toBe(false) }) @@ -612,8 +823,8 @@ describe('Provider Management', () => { }) it.concurrent('should use model patterns for pattern matching', () => { - expect(getProviderFromModel('gpt-5-custom')).toBe('openai') // Matches /^gpt/ pattern - expect(getProviderFromModel('claude-custom-model')).toBe('anthropic') // Matches /^claude/ pattern + expect(getProviderFromModel('gpt-5-custom')).toBe('openai') + expect(getProviderFromModel('claude-custom-model')).toBe('anthropic') }) it.concurrent('should default to ollama for unknown models', () => { @@ -667,7 +878,6 @@ describe('Provider Management', () => { expect(Array.isArray(allModels)).toBe(true) expect(allModels.length).toBeGreaterThan(0) - // Should contain models from different providers expect(allModels).toContain('gpt-4o') expect(allModels).toContain('claude-sonnet-4-0') expect(allModels).toContain('gemini-2.5-pro') @@ -712,7 +922,6 @@ describe('Provider Management', () => { const baseProviders = getBaseModelProviders() expect(typeof baseProviders).toBe('object') - // Should exclude ollama models }) }) @@ -720,10 +929,8 @@ describe('Provider Management', () => { it.concurrent('should update ollama models', () => { const mockModels = ['llama2', 'codellama', 'mistral'] - // This should not throw expect(() => updateOllamaProviderModels(mockModels)).not.toThrow() - // Verify the models were updated const ollamaModels = getProviderModels('ollama') expect(ollamaModels).toEqual(mockModels) }) @@ -754,7 +961,7 @@ describe('JSON and Structured Output', () => { }) it.concurrent('should clean up common JSON issues', () => { - const content = '{\n "key": "value",\n "number": 42,\n}' // Trailing comma + const content = '{\n "key": "value",\n "number": 42,\n}' const result = extractAndParseJSON(content) expect(result).toEqual({ key: 'value', number: 42 }) }) @@ -945,13 +1152,13 @@ describe('prepareToolExecution', () => { const { toolParams } = prepareToolExecution(tool, llmArgs, request) expect(toolParams.apiKey).toBe('user-key') - expect(toolParams.channel).toBe('#general') // User value wins + expect(toolParams.channel).toBe('#general') expect(toolParams.message).toBe('Hello world') }) it.concurrent('should filter out empty string user params', () => { const tool = { - params: { apiKey: 'user-key', channel: '' }, // Empty channel + params: { apiKey: 'user-key', channel: '' }, } const llmArgs = { message: 'Hello', channel: '#llm-channel' } const request = {} @@ -959,7 +1166,7 @@ describe('prepareToolExecution', 
() => { const { toolParams } = prepareToolExecution(tool, llmArgs, request) expect(toolParams.apiKey).toBe('user-key') - expect(toolParams.channel).toBe('#llm-channel') // LLM value used since user is empty + expect(toolParams.channel).toBe('#llm-channel') expect(toolParams.message).toBe('Hello') }) }) @@ -969,7 +1176,7 @@ describe('prepareToolExecution', () => { const tool = { params: { workflowId: 'child-workflow-123', - inputMapping: '{}', // Empty JSON string from UI + inputMapping: '{}', }, } const llmArgs = { @@ -979,7 +1186,6 @@ describe('prepareToolExecution', () => { const { toolParams } = prepareToolExecution(tool, llmArgs, request) - // LLM values should be used since user object is empty expect(toolParams.inputMapping).toEqual({ query: 'search term', limit: 10 }) expect(toolParams.workflowId).toBe('child-workflow-123') }) @@ -988,7 +1194,7 @@ describe('prepareToolExecution', () => { const tool = { params: { workflowId: 'child-workflow', - inputMapping: '{"query": "", "customField": "user-value"}', // Partial values + inputMapping: '{"query": "", "customField": "user-value"}', }, } const llmArgs = { @@ -998,7 +1204,6 @@ describe('prepareToolExecution', () => { const { toolParams } = prepareToolExecution(tool, llmArgs, request) - // LLM fills empty query, user's customField preserved, LLM's limit included expect(toolParams.inputMapping).toEqual({ query: 'llm-search', limit: 10, @@ -1020,7 +1225,6 @@ describe('prepareToolExecution', () => { const { toolParams } = prepareToolExecution(tool, llmArgs, request) - // User values win, but LLM's extra field is included expect(toolParams.inputMapping).toEqual({ query: 'user-search', limit: 5, @@ -1032,7 +1236,7 @@ describe('prepareToolExecution', () => { const tool = { params: { workflowId: 'child-workflow', - inputMapping: { query: '', customField: 'user-value' }, // Object, not string + inputMapping: { query: '', customField: 'user-value' }, }, } const llmArgs = { @@ -1051,7 +1255,7 @@ describe('prepareToolExecution', () => { it.concurrent('should use LLM inputMapping when user does not provide it', () => { const tool = { - params: { workflowId: 'child-workflow' }, // No inputMapping + params: { workflowId: 'child-workflow' }, } const llmArgs = { inputMapping: { query: 'llm-search', limit: 10 }, @@ -1070,7 +1274,7 @@ describe('prepareToolExecution', () => { inputMapping: '{"query": "user-search"}', }, } - const llmArgs = {} // No inputMapping from LLM + const llmArgs = {} const request = {} const { toolParams } = prepareToolExecution(tool, llmArgs, request) @@ -1092,7 +1296,6 @@ describe('prepareToolExecution', () => { const { toolParams } = prepareToolExecution(tool, llmArgs, request) - // Should use LLM values since user JSON is invalid expect(toolParams.inputMapping).toEqual({ query: 'llm-search' }) }) @@ -1105,9 +1308,8 @@ describe('prepareToolExecution', () => { const { toolParams } = prepareToolExecution(tool, llmArgs, request) - // Normal behavior: user values override LLM values expect(toolParams.apiKey).toBe('user-key') - expect(toolParams.channel).toBe('#general') // User value wins + expect(toolParams.channel).toBe('#general') expect(toolParams.message).toBe('Hello') }) @@ -1125,8 +1327,6 @@ describe('prepareToolExecution', () => { const { toolParams } = prepareToolExecution(tool, llmArgs, request) - // 0 and false should be preserved (they're valid values) - // empty string should be filled by LLM expect(toolParams.inputMapping).toEqual({ limit: 0, enabled: false,