* feat(providers): add Gemini Deep Research via Interactions API
* fix(providers): hide memory UI for deep research models
* feat(providers): add multi-turn support and token logging for deep research
* fix(providers): only collect user messages as deep research input
* fix(providers): forward previousInteractionId to provider request
* fix(blocks): hide memory child fields for deep research models
* remove memory params from models that don't support it in provider requests
* update blog
import {
  type Content,
  FunctionCallingConfigMode,
  type FunctionDeclaration,
  type GenerateContentConfig,
  type GenerateContentResponse,
  type GoogleGenAI,
  type Interactions,
  type Part,
  type Schema,
  type ThinkingConfig,
  type ToolConfig,
} from '@google/genai'
import { createLogger } from '@sim/logger'
import type { StreamingExecution } from '@/executor/types'
import { MAX_TOOL_ITERATIONS } from '@/providers'
import {
  checkForForcedToolUsage,
  cleanSchemaForGemini,
  convertToGeminiFormat,
  convertUsageMetadata,
  createReadableStreamFromGeminiStream,
  ensureStructResponse,
  extractAllFunctionCallParts,
  extractTextContent,
  mapToThinkingLevel,
} from '@/providers/google/utils'
import type { FunctionCallResponse, ProviderRequest, ProviderResponse } from '@/providers/types'
import {
  calculateCost,
  isDeepResearchModel,
  prepareToolExecution,
  prepareToolsWithUsageControl,
} from '@/providers/utils'
import { executeTool } from '@/tools'
import type { ExecutionState, GeminiProviderType, GeminiUsage } from './types'

/**
 * Creates initial execution state
 */
function createInitialState(
  contents: Content[],
  initialUsage: GeminiUsage,
  firstResponseTime: number,
  initialCallTime: number,
  model: string,
  toolConfig: ToolConfig | undefined
): ExecutionState {
  const initialCost = calculateCost(
    model,
    initialUsage.promptTokenCount,
    initialUsage.candidatesTokenCount
  )

  return {
    contents,
    tokens: {
      input: initialUsage.promptTokenCount,
      output: initialUsage.candidatesTokenCount,
      total: initialUsage.totalTokenCount,
    },
    cost: initialCost,
    toolCalls: [],
    toolResults: [],
    iterationCount: 0,
    modelTime: firstResponseTime,
    toolsTime: 0,
    timeSegments: [
      {
        type: 'model',
        name: 'Initial response',
        startTime: initialCallTime,
        endTime: initialCallTime + firstResponseTime,
        duration: firstResponseTime,
      },
    ],
    usedForcedTools: [],
    currentToolConfig: toolConfig,
  }
}

/**
 * Executes multiple tool calls in parallel and updates state.
 * Per Gemini docs, all function calls from a single response should be executed
 * together, with one model message containing all function calls and one user
 * message containing all function responses.
 */
async function executeToolCallsBatch(
  functionCallParts: Part[],
  request: ProviderRequest,
  state: ExecutionState,
  forcedTools: string[],
  logger: ReturnType<typeof createLogger>
): Promise<{ success: boolean; state: ExecutionState }> {
  if (functionCallParts.length === 0) {
    return { success: false, state }
  }

  const executionPromises = functionCallParts.map(async (part) => {
    const toolCallStartTime = Date.now()
    const functionCall = part.functionCall!
    const toolName = functionCall.name ?? ''
    const args = (functionCall.args ?? {}) as Record<string, unknown>

    const tool = request.tools?.find((t) => t.id === toolName)
    if (!tool) {
      logger.warn(`Tool ${toolName} not found in registry, skipping`)
      return {
        success: false,
        part,
        toolName,
        args,
        resultContent: { error: true, message: `Tool ${toolName} not found`, tool: toolName },
        toolParams: {},
        startTime: toolCallStartTime,
        endTime: Date.now(),
        duration: Date.now() - toolCallStartTime,
      }
    }

    try {
      const { toolParams, executionParams } = prepareToolExecution(tool, args, request)
      const result = await executeTool(toolName, executionParams)
      const toolCallEndTime = Date.now()
      const duration = toolCallEndTime - toolCallStartTime

      const resultContent: Record<string, unknown> = result.success
        ? ensureStructResponse(result.output)
        : { error: true, message: result.error || 'Tool execution failed', tool: toolName }

      return {
        success: result.success,
        part,
        toolName,
        args,
        resultContent,
        toolParams,
        result,
        startTime: toolCallStartTime,
        endTime: toolCallEndTime,
        duration,
      }
    } catch (error) {
      const toolCallEndTime = Date.now()
      logger.error('Error processing function call:', {
        error: error instanceof Error ? error.message : String(error),
        functionName: toolName,
      })
      return {
        success: false,
        part,
        toolName,
        args,
        resultContent: {
          error: true,
          message: error instanceof Error ? error.message : 'Tool execution failed',
          tool: toolName,
        },
        toolParams: {},
        startTime: toolCallStartTime,
        endTime: toolCallEndTime,
        duration: toolCallEndTime - toolCallStartTime,
      }
    }
  })

  const results = await Promise.all(executionPromises)

  // Check if at least one tool was found (not all failed due to missing tools)
  const hasValidResults = results.some((r) => r.result !== undefined)
  if (!hasValidResults && results.every((r) => !r.success)) {
    return { success: false, state }
  }

  // Build batched messages per Gemini spec:
  // ONE model message with ALL function call parts
  // ONE user message with ALL function responses
  const modelParts: Part[] = results.map((r) => r.part)
  const userParts: Part[] = results.map((r) => ({
    functionResponse: {
      name: r.toolName,
      response: r.resultContent,
    },
  }))

  const updatedContents: Content[] = [
    ...state.contents,
    { role: 'model', parts: modelParts },
    { role: 'user', parts: userParts },
  ]

  // Collect all tool calls and results
  const newToolCalls: FunctionCallResponse[] = []
  const newToolResults: Record<string, unknown>[] = []
  const newTimeSegments: ExecutionState['timeSegments'] = []
  let totalToolsTime = 0

  for (const r of results) {
    newToolCalls.push({
      name: r.toolName,
      arguments: r.toolParams,
      startTime: new Date(r.startTime).toISOString(),
      endTime: new Date(r.endTime).toISOString(),
      duration: r.duration,
      result: r.resultContent,
    })

    if (r.success && r.result?.output) {
      newToolResults.push(r.result.output as Record<string, unknown>)
    }

    newTimeSegments.push({
      type: 'tool',
      name: r.toolName,
      startTime: r.startTime,
      endTime: r.endTime,
      duration: r.duration,
    })

    totalToolsTime += r.duration
  }

  // Check forced tool usage for all executed tools
  const executedToolsInfo = results.map((r) => ({ name: r.toolName, args: r.args }))
  const forcedToolCheck = checkForForcedToolUsage(
    executedToolsInfo,
    state.currentToolConfig,
    forcedTools,
    state.usedForcedTools
  )

  return {
    success: true,
    state: {
      ...state,
      contents: updatedContents,
      toolCalls: [...state.toolCalls, ...newToolCalls],
      toolResults: [...state.toolResults, ...newToolResults],
      toolsTime: state.toolsTime + totalToolsTime,
      timeSegments: [...state.timeSegments, ...newTimeSegments],
      usedForcedTools: forcedToolCheck?.usedForcedTools ?? state.usedForcedTools,
      currentToolConfig: forcedToolCheck?.nextToolConfig ?? state.currentToolConfig,
    },
  }
}

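/*
 * Illustrative sketch (not part of the provider): the batched messages that
 * executeToolCallsBatch appends to `contents` for two parallel function calls,
 * per the Gemini batching convention described above. Tool names and arguments
 * here are hypothetical.
 *
 *   // ONE model message echoing both function call parts, then
 *   // ONE user message carrying both function responses:
 *   const appended: Content[] = [
 *     {
 *       role: 'model',
 *       parts: [
 *         { functionCall: { name: 'search_docs', args: { query: 'pricing' } } },
 *         { functionCall: { name: 'get_weather', args: { city: 'Berlin' } } },
 *       ],
 *     },
 *     {
 *       role: 'user',
 *       parts: [
 *         { functionResponse: { name: 'search_docs', response: { results: [] } } },
 *         { functionResponse: { name: 'get_weather', response: { tempC: 18 } } },
 *       ],
 *     },
 *   ]
 */
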
/**
 * Updates state with model response metadata
 */
function updateStateWithResponse(
  state: ExecutionState,
  response: GenerateContentResponse,
  model: string,
  startTime: number,
  endTime: number
): ExecutionState {
  const usage = convertUsageMetadata(response.usageMetadata)
  const cost = calculateCost(model, usage.promptTokenCount, usage.candidatesTokenCount)
  const duration = endTime - startTime

  return {
    ...state,
    tokens: {
      input: state.tokens.input + usage.promptTokenCount,
      output: state.tokens.output + usage.candidatesTokenCount,
      total: state.tokens.total + usage.totalTokenCount,
    },
    cost: {
      input: state.cost.input + cost.input,
      output: state.cost.output + cost.output,
      total: state.cost.total + cost.total,
      pricing: cost.pricing, // Use latest pricing
    },
    modelTime: state.modelTime + duration,
    timeSegments: [
      ...state.timeSegments,
      {
        type: 'model',
        name: `Model response (iteration ${state.iterationCount + 1})`,
        startTime,
        endTime,
        duration,
      },
    ],
    iterationCount: state.iterationCount + 1,
  }
}

/**
 * Builds config for next iteration
 */
function buildNextConfig(
  baseConfig: GenerateContentConfig,
  state: ExecutionState,
  forcedTools: string[],
  request: ProviderRequest,
  logger: ReturnType<typeof createLogger>
): GenerateContentConfig {
  const nextConfig = { ...baseConfig }
  const allForcedToolsUsed =
    forcedTools.length > 0 && state.usedForcedTools.length === forcedTools.length

  if (allForcedToolsUsed && request.responseFormat) {
    nextConfig.tools = undefined
    nextConfig.toolConfig = undefined
    nextConfig.responseMimeType = 'application/json'
    nextConfig.responseSchema = cleanSchemaForGemini(request.responseFormat.schema) as Schema
    logger.info('Using structured output for final response after tool execution')
  } else if (state.currentToolConfig) {
    nextConfig.toolConfig = state.currentToolConfig
  } else {
    nextConfig.toolConfig = { functionCallingConfig: { mode: FunctionCallingConfigMode.AUTO } }
  }

  return nextConfig
}

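/*
 * Illustrative sketch (assumed values, not part of the provider): once every
 * forced tool has been used and the request carries a responseFormat,
 * buildNextConfig drops tools entirely and switches the next call to native
 * structured output. The forced tool name below is hypothetical.
 *
 *   const nextConfig = buildNextConfig(geminiConfig, state, ['search_docs'], request, logger)
 *   // With state.usedForcedTools === ['search_docs'] and request.responseFormat set:
 *   // nextConfig.tools === undefined
 *   // nextConfig.toolConfig === undefined
 *   // nextConfig.responseMimeType === 'application/json'
 *   // nextConfig.responseSchema === cleanSchemaForGemini(request.responseFormat.schema)
 */
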
/**
 * Creates streaming execution result template
 */
function createStreamingResult(
  providerStartTime: number,
  providerStartTimeISO: string,
  firstResponseTime: number,
  initialCallTime: number,
  state?: ExecutionState
): StreamingExecution {
  return {
    stream: undefined as unknown as ReadableStream<Uint8Array>,
    execution: {
      success: true,
      output: {
        content: '',
        model: '',
        tokens: state?.tokens ?? { input: 0, output: 0, total: 0 },
        toolCalls: state?.toolCalls.length
          ? { list: state.toolCalls, count: state.toolCalls.length }
          : undefined,
        toolResults: state?.toolResults,
        providerTiming: {
          startTime: providerStartTimeISO,
          endTime: new Date().toISOString(),
          duration: Date.now() - providerStartTime,
          modelTime: state?.modelTime ?? firstResponseTime,
          toolsTime: state?.toolsTime ?? 0,
          firstResponseTime,
          iterations: (state?.iterationCount ?? 0) + 1,
          timeSegments: state?.timeSegments ?? [
            {
              type: 'model',
              name: 'Initial streaming response',
              startTime: initialCallTime,
              endTime: initialCallTime + firstResponseTime,
              duration: firstResponseTime,
            },
          ],
        },
        cost: state?.cost ?? {
          input: 0,
          output: 0,
          total: 0,
          pricing: { input: 0, output: 0, updatedAt: new Date().toISOString() },
        },
      },
      logs: [],
      metadata: {
        startTime: providerStartTimeISO,
        endTime: new Date().toISOString(),
        duration: Date.now() - providerStartTime,
      },
      isStreaming: true,
    },
  }
}

/**
 * Configuration for executing a Gemini request
 */
export interface GeminiExecutionConfig {
  ai: GoogleGenAI
  model: string
  request: ProviderRequest
  providerType: GeminiProviderType
}

const DEEP_RESEARCH_POLL_INTERVAL_MS = 10_000
const DEEP_RESEARCH_MAX_DURATION_MS = 60 * 60 * 1000

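/*
 * Polling budget implied by the constants above (informational note, not part of
 * the original source): the non-streaming path checks status every 10 seconds
 * for at most 60 minutes, i.e. up to 3_600_000 / 10_000 = 360 status requests
 * before timing out.
 */
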
/**
 * Sleeps for the specified number of milliseconds
 */
function sleep(ms: number): Promise<void> {
  return new Promise((resolve) => setTimeout(resolve, ms))
}

/**
 * Collapses a ProviderRequest into a single input string and optional system instruction
 * for the Interactions API, which takes a flat input rather than a messages array.
 *
 * Deep research is single-turn only — it takes one research query and returns a report.
 * Memory/conversation history is hidden in the UI for deep research models, so only
 * the last user message is used as input. System messages are passed via system_instruction.
 */
function collapseMessagesToInput(request: ProviderRequest): {
  input: string
  systemInstruction: string | undefined
} {
  const systemParts: string[] = []
  const userParts: string[] = []

  if (request.systemPrompt) {
    systemParts.push(request.systemPrompt)
  }

  if (request.messages) {
    for (const msg of request.messages) {
      if (msg.role === 'system' && msg.content) {
        systemParts.push(msg.content)
      } else if (msg.role === 'user' && msg.content) {
        userParts.push(msg.content)
      }
    }
  }

  return {
    input:
      userParts.length > 0
        ? userParts[userParts.length - 1]
        : 'Please conduct research on the provided topic.',
    systemInstruction: systemParts.length > 0 ? systemParts.join('\n\n') : undefined,
  }
}

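/*
 * Illustrative sketch (hypothetical request, not part of the provider): how a
 * request collapses to the flat Interactions API input. Only the last user
 * message survives; system content becomes the system instruction.
 *
 *   const { input, systemInstruction } = collapseMessagesToInput({
 *     systemPrompt: 'You are a market analyst.',
 *     messages: [
 *       { role: 'user', content: 'Earlier question, superseded by the last one' },
 *       { role: 'user', content: 'Research the EU battery recycling market.' },
 *     ],
 *   } as ProviderRequest)
 *   // input === 'Research the EU battery recycling market.'
 *   // systemInstruction === 'You are a market analyst.'
 */
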
/**
 * Extracts text content from a completed interaction's outputs array.
 * The outputs array can contain text, thought, google_search_result, and other types.
 * We concatenate all text outputs to get the full research report.
 */
function extractTextFromInteractionOutputs(outputs: Interactions.Interaction['outputs']): string {
  if (!outputs || outputs.length === 0) return ''

  const textParts: string[] = []
  for (const output of outputs) {
    if (output.type === 'text') {
      const text = (output as Interactions.TextContent).text
      if (text) textParts.push(text)
    }
  }

  return textParts.join('\n\n')
}

/**
 * Extracts token usage from an Interaction's Usage object.
 * The Interactions API provides total_input_tokens, total_output_tokens, total_tokens,
 * and total_reasoning_tokens (for thinking models).
 *
 * Also handles the raw API field name total_thought_tokens which the SDK may
 * map to total_reasoning_tokens.
 */
function extractInteractionUsage(usage: Interactions.Usage | undefined): {
  inputTokens: number
  outputTokens: number
  reasoningTokens: number
  totalTokens: number
} {
  if (!usage) {
    return { inputTokens: 0, outputTokens: 0, reasoningTokens: 0, totalTokens: 0 }
  }

  const usageLogger = createLogger('DeepResearchUsage')
  usageLogger.info('Raw interaction usage', { usage: JSON.stringify(usage) })

  const inputTokens = usage.total_input_tokens ?? 0
  const outputTokens = usage.total_output_tokens ?? 0
  const reasoningTokens =
    usage.total_reasoning_tokens ??
    ((usage as Record<string, unknown>).total_thought_tokens as number) ??
    0
  const totalTokens = usage.total_tokens ?? inputTokens + outputTokens

  return { inputTokens, outputTokens, reasoningTokens, totalTokens }
}

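/*
 * Illustrative sketch (hypothetical usage payload, not part of the provider):
 * how raw Interactions usage fields map to the normalized shape, including the
 * total_thought_tokens fallback for reasoning tokens.
 *
 *   const usage = extractInteractionUsage({
 *     total_input_tokens: 1200,
 *     total_output_tokens: 5400,
 *     total_thought_tokens: 900,
 *     total_tokens: 7500,
 *   } as unknown as Interactions.Usage)
 *   // => { inputTokens: 1200, outputTokens: 5400, reasoningTokens: 900, totalTokens: 7500 }
 */
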
/**
 * Builds a standard ProviderResponse from a completed deep research interaction.
 */
function buildDeepResearchResponse(
  content: string,
  model: string,
  usage: {
    inputTokens: number
    outputTokens: number
    reasoningTokens: number
    totalTokens: number
  },
  providerStartTime: number,
  providerStartTimeISO: string,
  interactionId?: string
): ProviderResponse {
  const providerEndTime = Date.now()
  const duration = providerEndTime - providerStartTime

  return {
    content,
    model,
    tokens: {
      input: usage.inputTokens,
      output: usage.outputTokens,
      total: usage.totalTokens,
    },
    timing: {
      startTime: providerStartTimeISO,
      endTime: new Date(providerEndTime).toISOString(),
      duration,
      modelTime: duration,
      toolsTime: 0,
      firstResponseTime: duration,
      iterations: 1,
      timeSegments: [
        {
          type: 'model',
          name: 'Deep research',
          startTime: providerStartTime,
          endTime: providerEndTime,
          duration,
        },
      ],
    },
    cost: calculateCost(model, usage.inputTokens, usage.outputTokens),
    interactionId,
  }
}

/**
 * Creates a ReadableStream from a deep research streaming interaction.
 *
 * Deep research streaming returns InteractionSSEEvent chunks including:
 * - interaction.start: initial interaction with ID
 * - content.delta: incremental text and thought_summary updates
 * - content.start / content.stop: output boundaries
 * - interaction.complete: final event (outputs is undefined in streaming; must reconstruct)
 * - error: error events
 *
 * We stream text deltas to the client and track usage from the interaction.complete event.
 */
function createDeepResearchStream(
  stream: AsyncIterable<Interactions.InteractionSSEEvent>,
  onComplete?: (
    content: string,
    usage: {
      inputTokens: number
      outputTokens: number
      reasoningTokens: number
      totalTokens: number
    },
    interactionId?: string
  ) => void
): ReadableStream<Uint8Array> {
  const streamLogger = createLogger('DeepResearchStream')
  let fullContent = ''
  let completionUsage = { inputTokens: 0, outputTokens: 0, reasoningTokens: 0, totalTokens: 0 }
  let completedInteractionId: string | undefined

  return new ReadableStream({
    async start(controller) {
      try {
        for await (const event of stream) {
          if (event.event_type === 'content.delta') {
            const delta = (event as Interactions.ContentDelta).delta
            if (delta?.type === 'text' && 'text' in delta && delta.text) {
              fullContent += delta.text
              controller.enqueue(new TextEncoder().encode(delta.text))
            }
          } else if (event.event_type === 'interaction.complete') {
            const interaction = (event as Interactions.InteractionEvent).interaction
            if (interaction?.usage) {
              completionUsage = extractInteractionUsage(interaction.usage)
            }
            completedInteractionId = interaction?.id
          } else if (event.event_type === 'interaction.start') {
            const interaction = (event as Interactions.InteractionEvent).interaction
            if (interaction?.id) {
              completedInteractionId = interaction.id
            }
          } else if (event.event_type === 'error') {
            const errorEvent = event as { error?: { code?: string; message?: string } }
            const message = errorEvent.error?.message ?? 'Unknown deep research stream error'
            streamLogger.error('Deep research stream error', {
              code: errorEvent.error?.code,
              message,
            })
            controller.error(new Error(message))
            return
          }
        }

        onComplete?.(fullContent, completionUsage, completedInteractionId)
        controller.close()
      } catch (error) {
        streamLogger.error('Error reading deep research stream', {
          error: error instanceof Error ? error.message : String(error),
        })
        controller.error(error)
      }
    },
  })
}

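/*
 * Illustrative sketch (not part of the provider): one way a caller could drain
 * the ReadableStream returned above, decoding text deltas as they arrive. The
 * helper name is hypothetical.
 *
 *   async function readDeepResearchStream(stream: ReadableStream<Uint8Array>): Promise<string> {
 *     const reader = stream.getReader()
 *     const decoder = new TextDecoder()
 *     let report = ''
 *     while (true) {
 *       const { done, value } = await reader.read()
 *       if (done) break
 *       report += decoder.decode(value, { stream: true })
 *     }
 *     return report
 *   }
 */
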
/**
 * Executes a deep research request using the Interactions API.
 *
 * Deep research uses the Interactions API ({@link https://ai.google.dev/api/interactions-api}),
 * a completely different surface from generateContent. It creates a background interaction
 * that performs comprehensive research (up to 60 minutes).
 *
 * Supports both streaming and non-streaming modes:
 * - Streaming: returns a StreamingExecution with a ReadableStream of text deltas
 * - Non-streaming: polls until completion and returns a ProviderResponse
 *
 * Deep research does NOT support custom function calling tools, MCP servers,
 * or structured output (response_format). These are gracefully ignored.
 */
export async function executeDeepResearchRequest(
  config: GeminiExecutionConfig
): Promise<ProviderResponse | StreamingExecution> {
  const { ai, model, request, providerType } = config
  const logger = createLogger(providerType === 'google' ? 'GoogleProvider' : 'VertexProvider')

  logger.info('Preparing deep research request', {
    model,
    hasSystemPrompt: !!request.systemPrompt,
    hasMessages: !!request.messages?.length,
    streaming: !!request.stream,
    hasPreviousInteractionId: !!request.previousInteractionId,
  })

  if (request.tools?.length) {
    logger.warn('Deep research does not support custom tools — ignoring tools parameter')
  }
  if (request.responseFormat) {
    logger.warn(
      'Deep research does not support structured output — ignoring responseFormat parameter'
    )
  }

  const providerStartTime = Date.now()
  const providerStartTimeISO = new Date(providerStartTime).toISOString()

  try {
    const { input, systemInstruction } = collapseMessagesToInput(request)

    // Deep research requires background=true and store=true (store defaults to true,
    // but we set it explicitly per API requirements)
    const baseParams = {
      agent: model as Interactions.CreateAgentInteractionParamsNonStreaming['agent'],
      input,
      background: true,
      store: true,
      ...(systemInstruction && { system_instruction: systemInstruction }),
      ...(request.previousInteractionId && {
        previous_interaction_id: request.previousInteractionId,
      }),
      agent_config: {
        type: 'deep-research' as const,
        thinking_summaries: 'auto' as const,
      },
    }

    logger.info('Creating deep research interaction', {
      inputLength: input.length,
      hasSystemInstruction: !!systemInstruction,
      streaming: !!request.stream,
    })

    // Streaming mode: create a streaming interaction and return a StreamingExecution
    if (request.stream) {
      const streamParams: Interactions.CreateAgentInteractionParamsStreaming = {
        ...baseParams,
        stream: true,
      }

      const streamResponse = await ai.interactions.create(streamParams)
      const firstResponseTime = Date.now() - providerStartTime

      const streamingResult: StreamingExecution = {
        stream: undefined as unknown as ReadableStream<Uint8Array>,
        execution: {
          success: true,
          output: {
            content: '',
            model,
            tokens: { input: 0, output: 0, total: 0 },
            providerTiming: {
              startTime: providerStartTimeISO,
              endTime: new Date().toISOString(),
              duration: Date.now() - providerStartTime,
              modelTime: firstResponseTime,
              toolsTime: 0,
              firstResponseTime,
              iterations: 1,
              timeSegments: [
                {
                  type: 'model',
                  name: 'Deep research (streaming)',
                  startTime: providerStartTime,
                  endTime: providerStartTime + firstResponseTime,
                  duration: firstResponseTime,
                },
              ],
            },
            cost: {
              input: 0,
              output: 0,
              total: 0,
              pricing: { input: 0, output: 0, updatedAt: new Date().toISOString() },
            },
          },
          logs: [],
          metadata: {
            startTime: providerStartTimeISO,
            endTime: new Date().toISOString(),
            duration: Date.now() - providerStartTime,
          },
          isStreaming: true,
        },
      }

      streamingResult.stream = createDeepResearchStream(
        streamResponse,
        (content, usage, streamInteractionId) => {
          streamingResult.execution.output.content = content
          streamingResult.execution.output.tokens = {
            input: usage.inputTokens,
            output: usage.outputTokens,
            total: usage.totalTokens,
          }
          streamingResult.execution.output.interactionId = streamInteractionId

          const cost = calculateCost(model, usage.inputTokens, usage.outputTokens)
          streamingResult.execution.output.cost = cost

          const streamEndTime = Date.now()
          if (streamingResult.execution.output.providerTiming) {
            streamingResult.execution.output.providerTiming.endTime = new Date(
              streamEndTime
            ).toISOString()
            streamingResult.execution.output.providerTiming.duration =
              streamEndTime - providerStartTime
            const segments = streamingResult.execution.output.providerTiming.timeSegments
            if (segments?.[0]) {
              segments[0].endTime = streamEndTime
              segments[0].duration = streamEndTime - providerStartTime
            }
          }
        }
      )

      return streamingResult
    }

    // Non-streaming mode: create and poll
    const createParams: Interactions.CreateAgentInteractionParamsNonStreaming = {
      ...baseParams,
      stream: false,
    }

    const interaction = await ai.interactions.create(createParams)
    const interactionId = interaction.id

    logger.info('Deep research interaction created', { interactionId, status: interaction.status })

    // Poll until a terminal status
    const pollStartTime = Date.now()
    let result: Interactions.Interaction = interaction

    while (Date.now() - pollStartTime < DEEP_RESEARCH_MAX_DURATION_MS) {
      if (result.status === 'completed') {
        break
      }

      if (result.status === 'failed') {
        throw new Error(`Deep research interaction failed: ${interactionId}`)
      }

      if (result.status === 'cancelled') {
        throw new Error(`Deep research interaction was cancelled: ${interactionId}`)
      }

      logger.info('Deep research in progress, polling...', {
        interactionId,
        status: result.status,
        elapsedMs: Date.now() - pollStartTime,
      })

      await sleep(DEEP_RESEARCH_POLL_INTERVAL_MS)
      result = await ai.interactions.get(interactionId)
    }

    if (result.status !== 'completed') {
      throw new Error(
        `Deep research timed out after ${DEEP_RESEARCH_MAX_DURATION_MS / 1000}s (status: ${result.status})`
      )
    }

    const content = extractTextFromInteractionOutputs(result.outputs)
    const usage = extractInteractionUsage(result.usage)

    logger.info('Deep research completed', {
      interactionId,
      contentLength: content.length,
      inputTokens: usage.inputTokens,
      outputTokens: usage.outputTokens,
      reasoningTokens: usage.reasoningTokens,
      totalTokens: usage.totalTokens,
      durationMs: Date.now() - providerStartTime,
    })

    return buildDeepResearchResponse(
      content,
      model,
      usage,
      providerStartTime,
      providerStartTimeISO,
      interactionId
    )
  } catch (error) {
    const providerEndTime = Date.now()
    const duration = providerEndTime - providerStartTime

    logger.error('Error in deep research request:', {
      error: error instanceof Error ? error.message : String(error),
      stack: error instanceof Error ? error.stack : undefined,
    })

    const enhancedError = error instanceof Error ? error : new Error(String(error))
    Object.assign(enhancedError, {
      timing: {
        startTime: providerStartTimeISO,
        endTime: new Date(providerEndTime).toISOString(),
        duration,
      },
    })
    throw enhancedError
  }
}

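/*
 * Illustrative sketch (hypothetical values, not part of the provider): a
 * follow-up turn reuses the interactionId from a prior deep research response
 * as previousInteractionId, which the function above forwards as
 * previous_interaction_id on the next interaction create call.
 *
 *   const first = await executeDeepResearchRequest({ ai, model, request, providerType: 'google' })
 *   const followUp = await executeDeepResearchRequest({
 *     ai,
 *     model,
 *     providerType: 'google',
 *     request: {
 *       ...request,
 *       messages: [{ role: 'user', content: 'Expand the section on pricing trends.' }],
 *       previousInteractionId: (first as ProviderResponse).interactionId,
 *     },
 *   })
 */
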
/**
 * Executes a request using the Gemini API
 *
 * This is the shared core logic for both Google and Vertex AI providers.
 * The only difference is how the GoogleGenAI client is configured.
 */
export async function executeGeminiRequest(
  config: GeminiExecutionConfig
): Promise<ProviderResponse | StreamingExecution> {
  const { ai, model, request, providerType } = config

  // Route deep research models to the interactions API
  if (isDeepResearchModel(model)) {
    return executeDeepResearchRequest(config)
  }

  const logger = createLogger(providerType === 'google' ? 'GoogleProvider' : 'VertexProvider')

  logger.info(`Preparing ${providerType} Gemini request`, {
    model,
    hasSystemPrompt: !!request.systemPrompt,
    hasMessages: !!request.messages?.length,
    hasTools: !!request.tools?.length,
    toolCount: request.tools?.length ?? 0,
    hasResponseFormat: !!request.responseFormat,
    streaming: !!request.stream,
  })

  const providerStartTime = Date.now()
  const providerStartTimeISO = new Date(providerStartTime).toISOString()

  try {
    const { contents, tools, systemInstruction } = convertToGeminiFormat(request)

    // Build configuration
    const geminiConfig: GenerateContentConfig = {}

    if (request.temperature !== undefined) {
      geminiConfig.temperature = request.temperature
    }
    if (request.maxTokens != null) {
      geminiConfig.maxOutputTokens = request.maxTokens
    }
    if (systemInstruction) {
      geminiConfig.systemInstruction = systemInstruction
    }

    // Handle response format (only when no tools)
    if (request.responseFormat && !tools?.length) {
      geminiConfig.responseMimeType = 'application/json'
      geminiConfig.responseSchema = cleanSchemaForGemini(request.responseFormat.schema) as Schema
      logger.info('Using Gemini native structured output format')
    } else if (request.responseFormat && tools?.length) {
      logger.warn('Gemini does not support responseFormat with tools. Structured output ignored.')
    }

    // Configure thinking only when the user explicitly selects a thinking level
    if (request.thinkingLevel && request.thinkingLevel !== 'none') {
      const thinkingConfig: ThinkingConfig = {
        includeThoughts: false,
        thinkingLevel: mapToThinkingLevel(request.thinkingLevel),
      }
      geminiConfig.thinkingConfig = thinkingConfig
    }

    // Prepare tools
    let preparedTools: ReturnType<typeof prepareToolsWithUsageControl> | null = null
    let toolConfig: ToolConfig | undefined

    if (tools?.length) {
      const functionDeclarations: FunctionDeclaration[] = tools.map((t) => ({
        name: t.name,
        description: t.description,
        parameters: t.parameters,
      }))

      preparedTools = prepareToolsWithUsageControl(
        functionDeclarations,
        request.tools,
        logger,
        'google'
      )
      const { tools: filteredTools, toolConfig: preparedToolConfig } = preparedTools

      if (filteredTools?.length) {
        geminiConfig.tools = [{ functionDeclarations: filteredTools as FunctionDeclaration[] }]

        if (preparedToolConfig) {
          toolConfig = {
            functionCallingConfig: {
              mode:
                {
                  AUTO: FunctionCallingConfigMode.AUTO,
                  ANY: FunctionCallingConfigMode.ANY,
                  NONE: FunctionCallingConfigMode.NONE,
                }[preparedToolConfig.functionCallingConfig.mode] ?? FunctionCallingConfigMode.AUTO,
              allowedFunctionNames: preparedToolConfig.functionCallingConfig.allowedFunctionNames,
            },
          }
          geminiConfig.toolConfig = toolConfig
        }

        logger.info('Gemini request with tools:', {
          toolCount: filteredTools.length,
          model,
          tools: filteredTools.map((t) => (t as FunctionDeclaration).name),
        })
      }
    }

    const initialCallTime = Date.now()
    const shouldStream = request.stream && !tools?.length

    // Streaming without tools
    if (shouldStream) {
      logger.info('Handling Gemini streaming response')

      const streamGenerator = await ai.models.generateContentStream({
        model,
        contents,
        config: geminiConfig,
      })
      const firstResponseTime = Date.now() - initialCallTime

      const streamingResult = createStreamingResult(
        providerStartTime,
        providerStartTimeISO,
        firstResponseTime,
        initialCallTime
      )
      streamingResult.execution.output.model = model

      streamingResult.stream = createReadableStreamFromGeminiStream(
        streamGenerator,
        (content: string, usage: GeminiUsage) => {
          streamingResult.execution.output.content = content
          streamingResult.execution.output.tokens = {
            input: usage.promptTokenCount,
            output: usage.candidatesTokenCount,
            total: usage.totalTokenCount,
          }

          const costResult = calculateCost(
            model,
            usage.promptTokenCount,
            usage.candidatesTokenCount
          )
          streamingResult.execution.output.cost = costResult

          const streamEndTime = Date.now()
          if (streamingResult.execution.output.providerTiming) {
            streamingResult.execution.output.providerTiming.endTime = new Date(
              streamEndTime
            ).toISOString()
            streamingResult.execution.output.providerTiming.duration =
              streamEndTime - providerStartTime
            const segments = streamingResult.execution.output.providerTiming.timeSegments
            if (segments?.[0]) {
              segments[0].endTime = streamEndTime
              segments[0].duration = streamEndTime - providerStartTime
            }
          }
        }
      )

      return streamingResult
    }

    // Non-streaming request
    const response = await ai.models.generateContent({ model, contents, config: geminiConfig })
    const firstResponseTime = Date.now() - initialCallTime

    // Check for UNEXPECTED_TOOL_CALL
    const candidate = response.candidates?.[0]
    if (candidate?.finishReason === 'UNEXPECTED_TOOL_CALL') {
      logger.warn('Gemini returned UNEXPECTED_TOOL_CALL - model attempted to call unknown tool')
    }

    const initialUsage = convertUsageMetadata(response.usageMetadata)
    let state = createInitialState(
      contents,
      initialUsage,
      firstResponseTime,
      initialCallTime,
      model,
      toolConfig
    )
    const forcedTools = preparedTools?.forcedTools ?? []

    let currentResponse = response
    let content = ''

    // Tool execution loop
    const functionCalls = response.functionCalls
    if (functionCalls?.length) {
      const functionNames = functionCalls.map((fc) => fc.name).join(', ')
      logger.info(`Received ${functionCalls.length} function call(s) from Gemini: ${functionNames}`)

      while (state.iterationCount < MAX_TOOL_ITERATIONS) {
        // Extract ALL function call parts from the response (Gemini can return multiple)
        const functionCallParts = extractAllFunctionCallParts(currentResponse.candidates?.[0])
        if (functionCallParts.length === 0) {
          content = extractTextContent(currentResponse.candidates?.[0])
          break
        }

        const callNames = functionCallParts.map((p) => p.functionCall?.name ?? 'unknown').join(', ')
        logger.info(
          `Processing ${functionCallParts.length} function call(s): ${callNames} (iteration ${state.iterationCount + 1})`
        )

        // Execute ALL function calls in this batch
        const { success, state: updatedState } = await executeToolCallsBatch(
          functionCallParts,
          request,
          state,
          forcedTools,
          logger
        )
        if (!success) {
          content = extractTextContent(currentResponse.candidates?.[0])
          break
        }

        state = { ...updatedState, iterationCount: updatedState.iterationCount + 1 }
        const nextConfig = buildNextConfig(geminiConfig, state, forcedTools, request, logger)

        // Stream final response if requested
        if (request.stream) {
          const checkResponse = await ai.models.generateContent({
            model,
            contents: state.contents,
            config: nextConfig,
          })
          state = updateStateWithResponse(state, checkResponse, model, Date.now() - 100, Date.now())

          if (checkResponse.functionCalls?.length) {
            currentResponse = checkResponse
            continue
          }

          logger.info('No more function calls, streaming final response')

          if (request.responseFormat) {
            nextConfig.tools = undefined
            nextConfig.toolConfig = undefined
            nextConfig.responseMimeType = 'application/json'
            nextConfig.responseSchema = cleanSchemaForGemini(
              request.responseFormat.schema
            ) as Schema
          }

          // Capture accumulated cost before streaming
          const accumulatedCost = {
            input: state.cost.input,
            output: state.cost.output,
            total: state.cost.total,
          }
          const accumulatedTokens = { ...state.tokens }

          const streamGenerator = await ai.models.generateContentStream({
            model,
            contents: state.contents,
            config: nextConfig,
          })

          const streamingResult = createStreamingResult(
            providerStartTime,
            providerStartTimeISO,
            firstResponseTime,
            initialCallTime,
            state
          )
          streamingResult.execution.output.model = model

          streamingResult.stream = createReadableStreamFromGeminiStream(
            streamGenerator,
            (streamContent: string, usage: GeminiUsage) => {
              streamingResult.execution.output.content = streamContent
              streamingResult.execution.output.tokens = {
                input: accumulatedTokens.input + usage.promptTokenCount,
                output: accumulatedTokens.output + usage.candidatesTokenCount,
                total: accumulatedTokens.total + usage.totalTokenCount,
              }

              const streamCost = calculateCost(
                model,
                usage.promptTokenCount,
                usage.candidatesTokenCount
              )
              streamingResult.execution.output.cost = {
                input: accumulatedCost.input + streamCost.input,
                output: accumulatedCost.output + streamCost.output,
                total: accumulatedCost.total + streamCost.total,
                pricing: streamCost.pricing,
              }

              if (streamingResult.execution.output.providerTiming) {
                streamingResult.execution.output.providerTiming.endTime = new Date().toISOString()
                streamingResult.execution.output.providerTiming.duration =
                  Date.now() - providerStartTime
              }
            }
          )

          return streamingResult
        }

        // Non-streaming: get next response
        const nextModelStartTime = Date.now()
        const nextResponse = await ai.models.generateContent({
          model,
          contents: state.contents,
          config: nextConfig,
        })
        state = updateStateWithResponse(state, nextResponse, model, nextModelStartTime, Date.now())
        currentResponse = nextResponse
      }

      if (!content) {
        content = extractTextContent(currentResponse.candidates?.[0])
      }
    } else {
      content = extractTextContent(candidate)
    }

    const providerEndTime = Date.now()

    return {
      content,
      model,
      tokens: state.tokens,
      toolCalls: state.toolCalls.length ? state.toolCalls : undefined,
      toolResults: state.toolResults.length ? state.toolResults : undefined,
      timing: {
        startTime: providerStartTimeISO,
        endTime: new Date(providerEndTime).toISOString(),
        duration: providerEndTime - providerStartTime,
        modelTime: state.modelTime,
        toolsTime: state.toolsTime,
        firstResponseTime,
        iterations: state.iterationCount + 1,
        timeSegments: state.timeSegments,
      },
      cost: state.cost,
    }
  } catch (error) {
    const providerEndTime = Date.now()
    const duration = providerEndTime - providerStartTime

    logger.error('Error in Gemini request:', {
      error: error instanceof Error ? error.message : String(error),
      stack: error instanceof Error ? error.stack : undefined,
    })

    const enhancedError = error instanceof Error ? error : new Error(String(error))
    Object.assign(enhancedError, {
      timing: {
        startTime: providerStartTimeISO,
        endTime: new Date(providerEndTime).toISOString(),
        duration,
      },
    })
    throw enhancedError
  }
}