diff --git a/apps/sim/app/workspace/[workspaceId]/w/[workflowId]/components/panel/components/editor/components/sub-block/components/messages-input/messages-input.tsx b/apps/sim/app/workspace/[workspaceId]/w/[workflowId]/components/panel/components/editor/components/sub-block/components/messages-input/messages-input.tsx index 73054d273..5e50646b7 100644 --- a/apps/sim/app/workspace/[workspaceId]/w/[workflowId]/components/panel/components/editor/components/sub-block/components/messages-input/messages-input.tsx +++ b/apps/sim/app/workspace/[workspaceId]/w/[workflowId]/components/panel/components/editor/components/sub-block/components/messages-input/messages-input.tsx @@ -32,6 +32,7 @@ import type { WandControlHandlers } from '@/app/workspace/[workspaceId]/w/[workf import { useAccessibleReferencePrefixes } from '@/app/workspace/[workspaceId]/w/[workflowId]/hooks/use-accessible-reference-prefixes' import { useWand } from '@/app/workspace/[workspaceId]/w/[workflowId]/hooks/use-wand' import type { SubBlockConfig } from '@/blocks/types' +import { supportsVision } from '@/providers/utils' import { useWorkflowRegistry } from '@/stores/workflows/registry/store' import { useSubBlockStore } from '@/stores/workflows/subblock/store' @@ -50,13 +51,13 @@ const MAX_TEXTAREA_HEIGHT_PX = 320 /** Pattern to match complete message objects in JSON */ const COMPLETE_MESSAGE_PATTERN = - /"role"\s*:\s*"(system|user|assistant|media)"[^}]*"content"\s*:\s*"((?:[^"\\]|\\.)*)"/g + /"role"\s*:\s*"(system|user|assistant|attachment)"[^}]*"content"\s*:\s*"((?:[^"\\]|\\.)*)"/g /** Pattern to match incomplete content at end of buffer */ const INCOMPLETE_CONTENT_PATTERN = /"content"\s*:\s*"((?:[^"\\]|\\.)*)$/ /** Pattern to match role before content */ -const ROLE_BEFORE_CONTENT_PATTERN = /"role"\s*:\s*"(system|user|assistant|media)"[^{]*$/ +const ROLE_BEFORE_CONTENT_PATTERN = /"role"\s*:\s*"(system|user|assistant|attachment)"[^{]*$/ /** * Unescapes JSON string content @@ -65,9 +66,9 @@ const unescapeContent = (str: string): string => str.replace(/\\n/g, '\n').replace(/\\"/g, '"').replace(/\\\\/g, '\\') /** - * Media content for multimodal messages + * Attachment content (files, images, documents) */ -interface MediaContent { +interface AttachmentContent { /** Source type: how the data was provided */ sourceType: 'url' | 'base64' | 'file' /** The URL or base64 data */ @@ -84,9 +85,9 @@ interface MediaContent { * Interface for individual message in the messages array */ interface Message { - role: 'system' | 'user' | 'assistant' | 'media' + role: 'system' | 'user' | 'assistant' | 'attachment' content: string - media?: MediaContent + attachment?: AttachmentContent } /** @@ -122,8 +123,8 @@ export function MessagesInput({ const [openPopoverIndex, setOpenPopoverIndex] = useState(null) const { activeWorkflowId } = useWorkflowRegistry() - // Local media mode state - basic = FileUpload, advanced = URL/base64 textarea - const [mediaMode, setMediaMode] = useState<'basic' | 'advanced'>('basic') + // Local attachment mode state - basic = FileUpload, advanced = URL/base64 textarea + const [attachmentMode, setAttachmentMode] = useState<'basic' | 'advanced'>('basic') // Workspace files for wand context const [workspaceFiles, setWorkspaceFiles] = useState([]) @@ -166,22 +167,49 @@ export function MessagesInput({ .join('\n') if (!filesList) { - return 'No media files in workspace. The user can upload files manually after generation.' + return 'No files in workspace. The user can upload files manually after generation.' } - return `AVAILABLE WORKSPACE FILES (optional - you don't have to select one):\n${filesList}\n\nTo use a file, include "fileId": "" in the media object. If not selecting a file, omit the fileId field.` + return `AVAILABLE WORKSPACE FILES (optional - you don't have to select one):\n${filesList}\n\nTo use a file, include "fileId": "" in the attachment object. If not selecting a file, omit the fileId field.` }, [workspaceFiles]) - // Get indices of media messages for subscription - const mediaIndices = useMemo( + // Get indices of attachment messages for subscription + const attachmentIndices = useMemo( () => localMessages - .map((msg, index) => (msg.role === 'media' ? index : -1)) + .map((msg, index) => (msg.role === 'attachment' ? index : -1)) .filter((i) => i !== -1), [localMessages] ) - // Subscribe to file upload values for all media messages + // Subscribe to model value to check vision capability + const modelSupportsVision = useSubBlockStore( + useCallback( + (state) => { + if (!activeWorkflowId) return true // Default to allowing attachments + const blockValues = state.workflowValues[activeWorkflowId]?.[blockId] ?? {} + const modelValue = blockValues.model as string | undefined + if (!modelValue) return true // No model selected, allow attachments + return supportsVision(modelValue) + }, + [activeWorkflowId, blockId] + ) + ) + + // Determine available roles based on model capabilities + const availableRoles = useMemo(() => { + const baseRoles: Array<'system' | 'user' | 'assistant' | 'attachment'> = [ + 'system', + 'user', + 'assistant', + ] + if (modelSupportsVision) { + baseRoles.push('attachment') + } + return baseRoles + }, [modelSupportsVision]) + + // Subscribe to file upload values for all attachment messages const fileUploadValues = useSubBlockStore( useCallback( (state) => { @@ -189,8 +217,8 @@ export function MessagesInput({ const blockValues = state.workflowValues[activeWorkflowId]?.[blockId] ?? {} const result: Record = {} - for (const index of mediaIndices) { - const fileUploadKey = `${subBlockId}-media-${index}` + for (const index of attachmentIndices) { + const fileUploadKey = `${subBlockId}-attachment-${index}` const fileValue = blockValues[fileUploadKey] if (fileValue && typeof fileValue === 'object' && 'path' in fileValue) { result[index] = fileValue as { name: string; path: string; type: string; size: number } @@ -198,21 +226,21 @@ export function MessagesInput({ } return result }, - [activeWorkflowId, blockId, subBlockId, mediaIndices] + [activeWorkflowId, blockId, subBlockId, attachmentIndices] ) ) - // Effect to sync FileUpload values to message media objects + // Effect to sync FileUpload values to message attachment objects useEffect(() => { if (!activeWorkflowId || isPreview) return let hasChanges = false const updatedMessages = localMessages.map((msg, index) => { - if (msg.role !== 'media') return msg + if (msg.role !== 'attachment') return msg const uploadedFile = fileUploadValues[index] if (uploadedFile) { - const newMedia: MediaContent = { + const newAttachment: AttachmentContent = { sourceType: 'file', data: uploadedFile.path, mimeType: uploadedFile.type, @@ -221,16 +249,16 @@ export function MessagesInput({ // Only update if different if ( - msg.media?.data !== newMedia.data || - msg.media?.sourceType !== newMedia.sourceType || - msg.media?.mimeType !== newMedia.mimeType || - msg.media?.fileName !== newMedia.fileName + msg.attachment?.data !== newAttachment.data || + msg.attachment?.sourceType !== newAttachment.sourceType || + msg.attachment?.mimeType !== newAttachment.mimeType || + msg.attachment?.fileName !== newAttachment.fileName ) { hasChanges = true return { ...msg, content: uploadedFile.name || msg.content, - media: newMedia, + attachment: newAttachment, } } } @@ -267,20 +295,22 @@ export function MessagesInput({ if (Array.isArray(parsed)) { const validMessages: Message[] = parsed .filter( - (m): m is { role: string; content: string; media?: MediaContent } => + (m): m is { role: string; content: string; attachment?: AttachmentContent } => typeof m === 'object' && m !== null && typeof m.role === 'string' && typeof m.content === 'string' ) .map((m) => { - const role = ['system', 'user', 'assistant', 'media'].includes(m.role) ? m.role : 'user' + const role = ['system', 'user', 'assistant', 'attachment'].includes(m.role) + ? m.role + : 'user' const message: Message = { role: role as Message['role'], content: m.content, } - if (m.media) { - message.media = m.media + if (m.attachment) { + message.attachment = m.attachment } return message }) @@ -344,14 +374,14 @@ export function MessagesInput({ onGeneratedContent: (content) => { const validMessages = parseMessages(content) if (validMessages) { - // Process media messages - only allow fileId to set files, sanitize other attempts + // Process attachment messages - only allow fileId to set files, sanitize other attempts validMessages.forEach((msg, index) => { - if (msg.role === 'media') { + if (msg.role === 'attachment') { // Check if this is an existing file with valid data (preserve it) const hasExistingFile = - msg.media?.sourceType === 'file' && - msg.media?.data?.startsWith('/api/') && - msg.media?.fileName + msg.attachment?.sourceType === 'file' && + msg.attachment?.data?.startsWith('/api/') && + msg.attachment?.fileName if (hasExistingFile) { // Preserve existing file data as-is @@ -359,11 +389,11 @@ export function MessagesInput({ } // Check if wand provided a fileId to select a workspace file - if (msg.media?.fileId) { - const file = workspaceFiles.find((f) => f.id === msg.media?.fileId) + if (msg.attachment?.fileId) { + const file = workspaceFiles.find((f) => f.id === msg.attachment?.fileId) if (file) { // Set the file value in SubBlockStore so FileUpload picks it up - const fileUploadKey = `${subBlockId}-media-${index}` + const fileUploadKey = `${subBlockId}-attachment-${index}` const uploadedFile = { name: file.name, path: file.path, @@ -372,16 +402,16 @@ export function MessagesInput({ } useSubBlockStore.getState().setValue(blockId, fileUploadKey, uploadedFile) - // Clear the media object - the FileUpload will sync the file data via useEffect - // DON'T set media.data here as it would appear in the ShortInput (advanced mode) - msg.media = undefined + // Clear the attachment object - the FileUpload will sync the file data via useEffect + // DON'T set attachment.data here as it would appear in the ShortInput (advanced mode) + msg.attachment = undefined return } } - // Sanitize: clear any media object that isn't a valid existing file or fileId match + // Sanitize: clear any attachment object that isn't a valid existing file or fileId match // This prevents the LLM from setting arbitrary data/variable references - msg.media = undefined + msg.attachment = undefined } }) @@ -458,22 +488,22 @@ export function MessagesInput({ ) const updateMessageRole = useCallback( - (index: number, role: 'system' | 'user' | 'assistant' | 'media') => { + (index: number, role: 'system' | 'user' | 'assistant' | 'attachment') => { if (isPreview || disabled) return const updatedMessages = [...localMessages] - if (role === 'media') { + if (role === 'attachment') { updatedMessages[index] = { ...updatedMessages[index], role, content: updatedMessages[index].content || '', - media: updatedMessages[index].media || { + attachment: updatedMessages[index].attachment || { sourceType: 'file', data: '', }, } } else { - const { media: _, ...rest } = updatedMessages[index] + const { attachment: _, ...rest } = updatedMessages[index] updatedMessages[index] = { ...rest, role, @@ -761,7 +791,7 @@ export function MessagesInput({
- {(['system', 'user', 'assistant', 'media'] as const).map((role) => ( + {availableRoles.map((role) => ( )} - {/* Mode toggle for media messages */} - {message.role === 'media' && ( + {/* Mode toggle for attachment messages */} + {message.role === 'attachment' && (
- {/* Content Input - different for media vs text messages */} - {message.role === 'media' ? ( + {/* Content Input - different for attachment vs text messages */} + {message.role === 'attachment' ? (
- {mediaMode === 'basic' ? ( + {attachmentMode === 'basic' ? ( { const updatedMessages = [...localMessages] - if (updatedMessages[index].role === 'media') { + if (updatedMessages[index].role === 'attachment') { // Determine sourceType based on content let sourceType: 'url' | 'base64' = 'url' if (newValue.startsWith('data:') || newValue.includes(';base64,')) { @@ -909,8 +941,8 @@ export function MessagesInput({ updatedMessages[index] = { ...updatedMessages[index], content: newValue.substring(0, 50), - media: { - ...updatedMessages[index].media, + attachment: { + ...updatedMessages[index].attachment, sourceType, data: newValue, }, diff --git a/apps/sim/blocks/blocks/agent.ts b/apps/sim/blocks/blocks/agent.ts index c26907105..7e3edebf6 100644 --- a/apps/sim/blocks/blocks/agent.ts +++ b/apps/sim/blocks/blocks/agent.ts @@ -100,7 +100,7 @@ Current messages: {context} RULES: 1. Generate ONLY a valid JSON array - no markdown, no explanations 2. Each message object must have "role" and "content" properties -3. Valid roles are: "system", "user", "assistant", "media" +3. Valid roles are: "system", "user", "assistant", "attachment" 4. Content can be as long as necessary - don't truncate 5. If editing existing messages, preserve structure unless asked to change it 6. For new agents, create DETAILED, PROFESSIONAL system prompts that include: @@ -110,15 +110,15 @@ RULES: - Critical thinking or quality guidelines - How to handle edge cases and uncertainty -MEDIA MESSAGES: -- Use role "media" to include images, audio, video, or documents in a multimodal conversation -- IMPORTANT: If a media message in the current context has a "media" object with file data, ALWAYS preserve that entire "media" object exactly as-is -- When creating NEW media messages, you can either: - 1. Just set role to "media" with descriptive content - user will upload the file manually - 2. Select a file from the available workspace files by including "fileId" in the media object (optional) +ATTACHMENTS: +- Use role "attachment" to include images, audio, video, or documents in a multimodal conversation +- IMPORTANT: If an attachment message in the current context has an "attachment" object with file data, ALWAYS preserve that entire "attachment" object exactly as-is +- When creating NEW attachment messages, you can either: + 1. Just set role to "attachment" with descriptive content - user will upload the file manually + 2. Select a file from the available workspace files by including "fileId" in the attachment object (optional) - You do NOT have to select a file - it's completely optional -- Example without file: {"role": "media", "content": "Analyze this image for text and objects"} -- Example with file selection: {"role": "media", "content": "Analyze this image", "media": {"fileId": "abc123"}} +- Example without file: {"role": "attachment", "content": "Analyze this image for text and objects"} +- Example with file selection: {"role": "attachment", "content": "Analyze this image", "attachment": {"fileId": "abc123"}} EXAMPLES: @@ -129,7 +129,7 @@ Code reviewer: [{"role": "system", "content": "You are a Senior Code Reviewer with expertise in software architecture, security, and best practices. Your role is to provide thorough, constructive code reviews that improve code quality and help developers grow.\\n\\n## Review Methodology\\n\\n1. **Security First**: Check for vulnerabilities including injection attacks, authentication flaws, data exposure, and insecure dependencies.\\n\\n2. **Code Quality**: Evaluate readability, maintainability, adherence to DRY/SOLID principles, and appropriate abstraction levels.\\n\\n3. **Performance**: Identify potential bottlenecks, unnecessary computations, memory leaks, and optimization opportunities.\\n\\n4. **Testing**: Assess test coverage, edge case handling, and testability of the code structure.\\n\\n## Output Format\\n\\n### Summary\\nBrief overview of the code's purpose and overall assessment.\\n\\n### Critical Issues\\nSecurity vulnerabilities or bugs that must be fixed before merging.\\n\\n### Improvements\\nSuggested enhancements with clear explanations of why and how.\\n\\n### Positive Aspects\\nHighlight well-written code to reinforce good practices.\\n\\nBe specific with line references. Provide code examples for suggested changes. Balance critique with encouragement."}, {"role": "user", "content": ""}] Image analysis agent: -[{"role": "system", "content": "You are an expert image analyst. Describe images in detail, identify objects, text, and patterns. Provide structured analysis."}, {"role": "media", "content": "Analyze this image"}] +[{"role": "system", "content": "You are an expert image analyst. Describe images in detail, identify objects, text, and patterns. Provide structured analysis."}, {"role": "attachment", "content": "Analyze this image"}] Return ONLY the JSON array.`, placeholder: 'Describe what you want to create or change...', diff --git a/apps/sim/executor/handlers/agent/agent-handler.ts b/apps/sim/executor/handlers/agent/agent-handler.ts index 612312e25..52b281295 100644 --- a/apps/sim/executor/handlers/agent/agent-handler.ts +++ b/apps/sim/executor/handlers/agent/agent-handler.ts @@ -3,8 +3,6 @@ import { account, mcpServers } from '@sim/db/schema' import { createLogger } from '@sim/logger' import { and, eq, inArray, isNull } from 'drizzle-orm' import { createMcpToolId } from '@/lib/mcp/utils' -import { bufferToBase64 } from '@/lib/uploads/utils/file-utils' -import { downloadFileFromUrl } from '@/lib/uploads/utils/file-utils.server' import { refreshTokenIfNeeded } from '@/app/api/auth/oauth/utils' import { getAllBlocks } from '@/blocks' import type { BlockOutput } from '@/blocks/types' @@ -27,6 +25,8 @@ import { validateModelProvider, } from '@/executor/utils/permission-check' import { executeProviderRequest } from '@/providers' +import { transformAttachmentMessages } from '@/providers/attachment' +import type { ProviderId } from '@/providers/types' import { getProviderFromModel, transformBlockTool } from '@/providers/utils' import type { SerializedBlock } from '@/serializer/types' import { executeTool } from '@/tools' @@ -62,9 +62,12 @@ export class AgentBlockHandler implements BlockHandler { const streamingConfig = this.getStreamingConfig(ctx, block) const rawMessages = await this.buildMessages(ctx, filteredInputs) - // Transform media messages to provider-specific format (async for file fetching) + // Transform attachment messages to provider-specific format (async for file fetching) const messages = rawMessages - ? await this.transformMediaMessages(rawMessages, providerId, ctx) + ? await transformAttachmentMessages(rawMessages, { + providerId: providerId as ProviderId, + model, + }) : undefined const providerRequest = this.buildProviderRequest({ @@ -848,445 +851,11 @@ export class AgentBlockHandler implements BlockHandler { 'role' in m && 'content' in m && typeof m.role === 'string' && - ['system', 'user', 'assistant', 'media'].includes(m.role) + ['system', 'user', 'assistant', 'attachment'].includes(m.role) ) }) } - /** - * Transforms messages with 'media' role into provider-compatible format. - * Media messages are merged with the preceding or following user message, - * or converted to a user message with multimodal content. - */ - private async transformMediaMessages( - messages: Message[], - providerId: string, - ctx: ExecutionContext - ): Promise { - const result: Message[] = [] - - for (let i = 0; i < messages.length; i++) { - const msg = messages[i] - - if (msg.role !== 'media') { - result.push(msg) - continue - } - - // Media message - transform based on provider (async for file fetching) - const mediaContent = await this.createProviderMediaContent(msg, providerId, ctx) - if (!mediaContent) { - logger.warn('Could not create media content for message', { msg }) - continue - } - - // Check if we should merge with the previous user message - const lastMessage = result[result.length - 1] - if (lastMessage && lastMessage.role === 'user') { - // Merge media into the previous user message's content array - const existingContent = this.ensureContentArray(lastMessage, providerId) - existingContent.push(mediaContent) - lastMessage.content = existingContent as any - } else { - // Create a new user message with the media content - result.push({ - role: 'user', - content: [mediaContent] as any, - }) - } - } - - // Post-process: ensure all user messages have consistent content format - return result.map((msg) => { - if (msg.role === 'user' && typeof msg.content === 'string') { - // Convert string content to provider-specific text format (wrapped in array for multimodal) - return { - ...msg, - content: [this.createTextContent(msg.content, providerId)] as any, - } - } - return msg - }) - } - - /** - * Ensures a user message has content as an array for multimodal support - */ - private ensureContentArray(msg: Message, providerId: string): any[] { - if (Array.isArray(msg.content)) { - return msg.content - } - if (typeof msg.content === 'string' && msg.content) { - return [this.createTextContent(msg.content, providerId)] - } - return [] - } - - /** - * Creates provider-specific text content block - */ - private createTextContent(text: string, providerId: string): any { - switch (providerId) { - case 'google': - case 'vertex': - return { text } - case 'anthropic': - return { type: 'text', text } - default: - // OpenAI format (used by most providers) - return { type: 'text', text } - } - } - - /** - * Creates provider-specific media content from a media message - */ - private async createProviderMediaContent( - msg: Message, - providerId: string, - ctx: ExecutionContext - ): Promise { - const media = msg.media - if (!media) return null - - const { sourceType, data, mimeType } = media - - // Validate data is not empty - if (!data || !data.trim()) { - logger.warn('Empty media data, skipping media content') - return null - } - - // Validate URL format if sourceType is URL - if (sourceType === 'url' || sourceType === 'file') { - const trimmedData = data.trim() - // Must start with http://, https://, or / (relative path for workspace files) - if ( - !trimmedData.startsWith('http://') && - !trimmedData.startsWith('https://') && - !trimmedData.startsWith('/') - ) { - logger.warn('Invalid URL format for media content', { data: trimmedData.substring(0, 50) }) - // Try to salvage by treating as text - return { type: 'text', text: `[Invalid media URL: ${trimmedData.substring(0, 30)}...]` } - } - } - - // Validate base64 format - if (sourceType === 'base64') { - const trimmedData = data.trim() - // Should be a data URL or raw base64 - if ( - !trimmedData.startsWith('data:') && - !/^[A-Za-z0-9+/]+=*$/.test(trimmedData.replace(/\s/g, '')) - ) { - logger.warn('Invalid base64 format for media content', { - data: trimmedData.substring(0, 50), - }) - return { type: 'text', text: `[Invalid base64 data]` } - } - } - - switch (providerId) { - case 'anthropic': - return this.createAnthropicMediaContent(sourceType, data, mimeType, ctx) - - case 'google': - case 'vertex': - return this.createGeminiMediaContent(sourceType, data, mimeType) - - case 'mistral': - return this.createMistralMediaContent(sourceType, data, mimeType) - - case 'bedrock': - return this.createBedrockMediaContent(sourceType, data, mimeType) - - default: - // OpenAI format (used by OpenAI, Azure, xAI, Groq, etc.) - return this.createOpenAIMediaContent(sourceType, data, mimeType) - } - } - - /** - * Creates OpenAI-compatible media content - */ - private createOpenAIMediaContent(sourceType: string, data: string, mimeType?: string): any { - const isImage = mimeType?.startsWith('image/') - const isAudio = mimeType?.startsWith('audio/') - // Treat 'file' as 'url' since workspace files are served via URL - const isUrl = sourceType === 'url' || sourceType === 'file' - - if (isImage) { - return { - type: 'image_url', - image_url: { url: data, detail: 'auto' }, - } - } - - if (isAudio) { - const base64Data = data.includes(',') ? data.split(',')[1] : data - return { - type: 'input_audio', - input_audio: { - data: base64Data, - format: mimeType === 'audio/wav' ? 'wav' : 'mp3', - }, - } - } - - // For documents/files, include as URL - if (sourceType === 'url') { - return { - type: 'file', - file: { url: data }, - } - } - - // Base64 file - some providers may not support this directly - logger.warn('Base64 file content may not be supported by this provider') - return { - type: 'text', - text: `[File: ${mimeType || 'unknown type'}]`, - } - } - - /** - * Creates Anthropic-compatible media content - * Anthropic requires base64 for internal/relative URLs since they can't fetch them - */ - private async createAnthropicMediaContent( - sourceType: string, - data: string, - mimeType?: string, - ctx?: ExecutionContext - ): Promise { - const isImage = mimeType?.startsWith('image/') - const isPdf = mimeType === 'application/pdf' - const isInternalUrl = data.startsWith('/') - const isExternalHttps = data.startsWith('https://') - - // For internal URLs (workspace files), fetch and convert to base64 - // Anthropic only supports external HTTPS URLs, not relative paths - if ((sourceType === 'url' || sourceType === 'file') && isInternalUrl) { - try { - logger.info('Fetching internal file for Anthropic base64 conversion', { - path: data.substring(0, 50), - }) - const buffer = await downloadFileFromUrl(data) - const base64Data = bufferToBase64(buffer) - - if (isImage) { - return { - type: 'image', - source: { - type: 'base64', - media_type: mimeType || 'image/png', - data: base64Data, - }, - } - } - - if (isPdf) { - return { - type: 'document', - source: { - type: 'base64', - media_type: 'application/pdf', - data: base64Data, - }, - } - } - - // Other file types - return as text fallback - return { - type: 'text', - text: `[File: ${mimeType || 'unknown type'}]`, - } - } catch (error) { - logger.error('Failed to fetch file for Anthropic', { error, path: data.substring(0, 50) }) - return { - type: 'text', - text: `[Failed to load file: ${mimeType || 'unknown type'}]`, - } - } - } - - // For external HTTPS URLs, Anthropic can fetch them directly - if ((sourceType === 'url' || sourceType === 'file') && isExternalHttps) { - if (isImage) { - return { - type: 'image', - source: { type: 'url', url: data }, - } - } - if (isPdf) { - return { - type: 'document', - source: { type: 'url', url: data }, - } - } - } - - // Already base64 encoded - if (sourceType === 'base64') { - const base64Data = data.includes(',') ? data.split(',')[1] : data - if (isImage) { - return { - type: 'image', - source: { - type: 'base64', - media_type: mimeType || 'image/png', - data: base64Data, - }, - } - } - if (isPdf) { - return { - type: 'document', - source: { - type: 'base64', - media_type: 'application/pdf', - data: base64Data, - }, - } - } - } - - // Fallback for unsupported types - return { - type: 'text', - text: `[File: ${mimeType || 'unknown type'}]`, - } - } - - /** - * Creates Google Gemini-compatible media content - */ - private createGeminiMediaContent(sourceType: string, data: string, mimeType?: string): any { - // Treat 'file' as 'url' since workspace files are served via URL - const isUrl = sourceType === 'url' || sourceType === 'file' - - if (isUrl) { - return { - fileData: { - mimeType: mimeType || 'application/octet-stream', - fileUri: data, - }, - } - } - - // base64 - const base64Data = data.includes(',') ? data.split(',')[1] : data - return { - inlineData: { - mimeType: mimeType || 'application/octet-stream', - data: base64Data, - }, - } - } - - /** - * Creates Mistral-compatible media content - * Note: Mistral uses a simplified format where image_url is a direct string, - * NOT a nested object like OpenAI - */ - private createMistralMediaContent(sourceType: string, data: string, mimeType?: string): any { - const isImage = mimeType?.startsWith('image/') - // Treat 'file' as 'url' since workspace files are served via URL - const isUrl = sourceType === 'url' || sourceType === 'file' - - if (isImage) { - if (isUrl) { - // Mistral uses direct string for image_url, not nested object - return { - type: 'image_url', - image_url: data, - } - } - // Base64 - Mistral accepts data URLs directly - const base64Data = data.includes(',') - ? data - : `data:${mimeType || 'image/png'};base64,${data}` - return { - type: 'image_url', - image_url: base64Data, - } - } - - // Fallback for non-image types - return { - type: 'text', - text: `[File: ${mimeType || 'unknown type'}]`, - } - } - - /** - * Creates AWS Bedrock Converse API-compatible media content - * Bedrock uses a different structure: { image: { format, source: { bytes } } } - * Note: The actual bytes conversion happens in the provider layer - */ - private createBedrockMediaContent(sourceType: string, data: string, mimeType?: string): any { - const isImage = mimeType?.startsWith('image/') - // Treat 'file' as 'url' since workspace files are served via URL - const isUrl = sourceType === 'url' || sourceType === 'file' - - // Determine format from mimeType - const getFormat = (mime?: string): string => { - if (!mime) return 'png' - if (mime.includes('jpeg') || mime.includes('jpg')) return 'jpeg' - if (mime.includes('png')) return 'png' - if (mime.includes('gif')) return 'gif' - if (mime.includes('webp')) return 'webp' - return 'png' - } - - if (isImage) { - if (isUrl) { - // For URLs, Bedrock needs S3 URIs or we need to fetch and convert - // Mark this for the provider layer to handle - return { - type: 'bedrock_image', - format: getFormat(mimeType), - sourceType: 'url', - url: data, - } - } - // Base64 - extract raw base64 data - const base64Data = data.includes(',') ? data.split(',')[1] : data - return { - type: 'bedrock_image', - format: getFormat(mimeType), - sourceType: 'base64', - data: base64Data, - } - } - - // Documents (PDFs) - Bedrock supports document content type - if (mimeType === 'application/pdf') { - if (isUrl) { - return { - type: 'bedrock_document', - format: 'pdf', - sourceType: 'url', - url: data, - } - } - const base64Data = data.includes(',') ? data.split(',')[1] : data - return { - type: 'bedrock_document', - format: 'pdf', - sourceType: 'base64', - data: base64Data, - } - } - - // Fallback for unsupported types - return { - type: 'text', - text: `[File: ${mimeType || 'unknown type'}]`, - } - } - private processMemories(memories: any): Message[] { if (!memories) return [] diff --git a/apps/sim/executor/handlers/agent/types.ts b/apps/sim/executor/handlers/agent/types.ts index 9007c527a..ae05e1884 100644 --- a/apps/sim/executor/handlers/agent/types.ts +++ b/apps/sim/executor/handlers/agent/types.ts @@ -43,9 +43,9 @@ export interface ToolInput { } /** - * Media content for multimodal messages + * Attachment content (files, images, documents) */ -export interface MediaContent { +export interface AttachmentContent { /** Source type: how the data was provided */ sourceType: 'url' | 'base64' | 'file' /** The URL or base64 data */ @@ -57,10 +57,10 @@ export interface MediaContent { } export interface Message { - role: 'system' | 'user' | 'assistant' | 'media' + role: 'system' | 'user' | 'assistant' | 'attachment' content: string - /** Media content for 'media' role messages */ - media?: MediaContent + /** Attachment content for 'attachment' role messages */ + attachment?: AttachmentContent executionId?: string function_call?: any tool_calls?: any[] diff --git a/apps/sim/providers/anthropic/index.ts b/apps/sim/providers/anthropic/index.ts index 27372c743..0367fc944 100644 --- a/apps/sim/providers/anthropic/index.ts +++ b/apps/sim/providers/anthropic/index.ts @@ -109,7 +109,7 @@ export const anthropicProvider: ProviderConfig = { ], }) } else { - // Handle content that's already in array format (from transformMediaMessages) + // Handle content that's already in array format (from transformAttachmentMessages) const content = Array.isArray(msg.content) ? msg.content : msg.content diff --git a/apps/sim/providers/attachment.ts b/apps/sim/providers/attachment.ts new file mode 100644 index 000000000..fe8ec3534 --- /dev/null +++ b/apps/sim/providers/attachment.ts @@ -0,0 +1,397 @@ +/** + * Centralized attachment content transformation for all providers. + * + * Strategy: Always normalize to base64 first, then create provider-specific formats. + * This eliminates URL accessibility issues and simplifies provider handling. + */ + +import { createLogger } from '@sim/logger' +import { bufferToBase64 } from '@/lib/uploads/utils/file-utils' +import { downloadFileFromUrl } from '@/lib/uploads/utils/file-utils.server' +import { supportsVision } from '@/providers/models' +import type { ProviderId } from '@/providers/types' + +const logger = createLogger('AttachmentTransformer') + +/** + * Generic message type for attachment transformation. + */ +interface TransformableMessage { + role: string + content: string | any[] | null + attachment?: AttachmentContent + [key: string]: any +} + +/** + * Attachment content (files, images, documents) + */ +export interface AttachmentContent { + sourceType: 'url' | 'base64' | 'file' + data: string + mimeType?: string + fileName?: string +} + +/** + * Normalized attachment data (always base64) + */ +interface NormalizedAttachment { + base64: string + mimeType: string +} + +/** + * Configuration for attachment transformation + */ +interface AttachmentTransformConfig { + providerId: ProviderId + model: string +} + +/** + * Checks if a model supports attachments (vision/multimodal content). + */ +export function modelSupportsAttachments(model: string): boolean { + return supportsVision(model) +} + +/** + * Transforms messages with 'attachment' role into provider-compatible format. + */ +export async function transformAttachmentMessages( + messages: T[], + config: AttachmentTransformConfig +): Promise { + const { providerId, model } = config + const supportsAttachments = modelSupportsAttachments(model) + + if (!supportsAttachments) { + return transformAttachmentsToText(messages) as T[] + } + + const result: T[] = [] + + for (const msg of messages) { + if (msg.role !== 'attachment') { + result.push(msg) + continue + } + + const attachmentContent = await createProviderAttachmentContent(msg, providerId) + if (!attachmentContent) { + logger.warn('Could not create attachment content for message', { msg }) + continue + } + + // Merge with previous user message or create new one + const lastMessage = result[result.length - 1] + if (lastMessage && lastMessage.role === 'user') { + const existingContent = ensureContentArray(lastMessage, providerId) + existingContent.push(attachmentContent) + lastMessage.content = existingContent as any + } else { + result.push({ + role: 'user', + content: [attachmentContent] as any, + } as T) + } + } + + // Ensure all user messages have consistent content format + return result.map((msg) => { + if (msg.role === 'user' && typeof msg.content === 'string') { + return { + ...msg, + content: [createTextContent(msg.content, providerId)] as any, + } + } + return msg + }) +} + +/** + * Transforms attachment messages to text placeholders for non-vision models + */ +function transformAttachmentsToText(messages: T[]): T[] { + const result: T[] = [] + + for (const msg of messages) { + if (msg.role !== 'attachment') { + result.push(msg) + continue + } + + const attachment = msg.attachment + const mimeType = attachment?.mimeType || 'unknown type' + const fileName = attachment?.fileName || 'file' + + const lastMessage = result[result.length - 1] + if (lastMessage && lastMessage.role === 'user') { + const currentContent = typeof lastMessage.content === 'string' ? lastMessage.content : '' + lastMessage.content = `${currentContent}\n[Attached file: ${fileName} (${mimeType}) - Note: This model does not support file/image inputs]` + } else { + result.push({ + role: 'user', + content: `[Attached file: ${fileName} (${mimeType}) - Note: This model does not support file/image inputs]`, + } as T) + } + } + + return result +} + +/** + * Ensures a user message has content as an array for multimodal support + */ +function ensureContentArray(msg: TransformableMessage, providerId: ProviderId): any[] { + if (Array.isArray(msg.content)) { + return msg.content + } + if (typeof msg.content === 'string' && msg.content) { + return [createTextContent(msg.content, providerId)] + } + return [] +} + +/** + * Creates provider-specific text content block + */ +export function createTextContent(text: string, providerId: ProviderId): any { + switch (providerId) { + case 'google': + case 'vertex': + return { text } + default: + return { type: 'text', text } + } +} + +/** + * Normalizes attachment data to base64. + * Fetches URLs and converts to base64, extracts base64 from data URLs. + */ +async function normalizeToBase64( + attachment: AttachmentContent +): Promise { + const { sourceType, data, mimeType } = attachment + + if (!data || !data.trim()) { + logger.warn('Empty attachment data') + return null + } + + const trimmedData = data.trim() + + // Already base64 + if (sourceType === 'base64') { + // Handle data URL format: data:mime;base64,xxx + if (trimmedData.startsWith('data:')) { + const match = trimmedData.match(/^data:([^;]+);base64,(.+)$/) + if (match) { + return { base64: match[2], mimeType: match[1] } + } + } + // Raw base64 + return { base64: trimmedData, mimeType: mimeType || 'application/octet-stream' } + } + + // URL or file path - need to fetch + if (sourceType === 'url' || sourceType === 'file') { + try { + logger.info('Fetching attachment for base64 conversion', { + url: trimmedData.substring(0, 50), + }) + const buffer = await downloadFileFromUrl(trimmedData) + const base64 = bufferToBase64(buffer) + return { base64, mimeType: mimeType || 'application/octet-stream' } + } catch (error) { + logger.error('Failed to fetch attachment', { error, url: trimmedData.substring(0, 50) }) + return null + } + } + + return null +} + +/** + * Creates provider-specific attachment content from an attachment message. + * First normalizes to base64, then creates the provider format. + */ +async function createProviderAttachmentContent( + msg: TransformableMessage, + providerId: ProviderId +): Promise { + const attachment = msg.attachment + if (!attachment) return null + + // Normalize to base64 first + const normalized = await normalizeToBase64(attachment) + if (!normalized) { + return createTextContent('[Failed to load attachment]', providerId) + } + + const { base64, mimeType } = normalized + + switch (providerId) { + case 'anthropic': + return createAnthropicContent(base64, mimeType) + + case 'google': + case 'vertex': + return createGeminiContent(base64, mimeType) + + case 'mistral': + return createMistralContent(base64, mimeType) + + case 'bedrock': + return createBedrockContent(base64, mimeType) + + default: + // OpenAI format (OpenAI, Azure, xAI, DeepSeek, Cerebras, Groq, OpenRouter, Ollama, vLLM) + return createOpenAIContent(base64, mimeType) + } +} + +/** + * OpenAI-compatible content (images only via base64 data URL) + */ +function createOpenAIContent(base64: string, mimeType: string): any { + const isImage = mimeType.startsWith('image/') + const isAudio = mimeType.startsWith('audio/') + + if (isImage) { + return { + type: 'image_url', + image_url: { + url: `data:${mimeType};base64,${base64}`, + detail: 'auto', + }, + } + } + + if (isAudio) { + return { + type: 'input_audio', + input_audio: { + data: base64, + format: mimeType === 'audio/wav' ? 'wav' : 'mp3', + }, + } + } + + // OpenAI Chat API doesn't support other file types directly + // For PDFs/docs, return a text placeholder + logger.warn(`OpenAI does not support ${mimeType} attachments in Chat API`) + return { + type: 'text', + text: `[Attached file: ${mimeType} - OpenAI Chat API only supports images and audio]`, + } +} + +/** + * Anthropic-compatible content (images and PDFs) + */ +function createAnthropicContent(base64: string, mimeType: string): any { + const isImage = mimeType.startsWith('image/') + const isPdf = mimeType === 'application/pdf' + + if (isImage) { + return { + type: 'image', + source: { + type: 'base64', + media_type: mimeType, + data: base64, + }, + } + } + + if (isPdf) { + return { + type: 'document', + source: { + type: 'base64', + media_type: 'application/pdf', + data: base64, + }, + } + } + + return { + type: 'text', + text: `[Attached file: ${mimeType} - Anthropic supports images and PDFs only]`, + } +} + +/** + * Google Gemini-compatible content (inlineData format) + */ +function createGeminiContent(base64: string, mimeType: string): any { + // Gemini supports a wide range of file types via inlineData + return { + inlineData: { + mimeType, + data: base64, + }, + } +} + +/** + * Mistral-compatible content (images only, data URL format) + */ +function createMistralContent(base64: string, mimeType: string): any { + const isImage = mimeType.startsWith('image/') + + if (isImage) { + // Mistral uses direct string for image_url, not nested object + return { + type: 'image_url', + image_url: `data:${mimeType};base64,${base64}`, + } + } + + return { + type: 'text', + text: `[Attached file: ${mimeType} - Mistral supports images only]`, + } +} + +/** + * AWS Bedrock-compatible content (images and PDFs) + */ +function createBedrockContent(base64: string, mimeType: string): any { + const isImage = mimeType.startsWith('image/') + const isPdf = mimeType === 'application/pdf' + + // Determine image format from mimeType + const getImageFormat = (mime: string): string => { + if (mime.includes('jpeg') || mime.includes('jpg')) return 'jpeg' + if (mime.includes('png')) return 'png' + if (mime.includes('gif')) return 'gif' + if (mime.includes('webp')) return 'webp' + return 'png' + } + + if (isImage) { + // Return a marker object that the Bedrock provider will convert to proper format + return { + type: 'bedrock_image', + format: getImageFormat(mimeType), + data: base64, + } + } + + if (isPdf) { + return { + type: 'bedrock_document', + format: 'pdf', + data: base64, + } + } + + return { + type: 'text', + text: `[Attached file: ${mimeType} - Bedrock supports images and PDFs only]`, + } +} diff --git a/apps/sim/providers/google/utils.ts b/apps/sim/providers/google/utils.ts index c5040aab4..b7ad7cf57 100644 --- a/apps/sim/providers/google/utils.ts +++ b/apps/sim/providers/google/utils.ts @@ -72,6 +72,75 @@ export function cleanSchemaForGemini(schema: SchemaUnion): SchemaUnion { return cleanedSchema } +/** + * Converts an array of content items to Gemini-compatible Part array. + * Handles various formats from the attachment transformer. + */ +function convertContentArrayToGeminiParts(contentArray: any[]): Part[] { + const parts: Part[] = [] + + for (const item of contentArray) { + if (!item) continue + + // Gemini-native text format: { text: "..." } + if (typeof item.text === 'string') { + parts.push({ text: item.text }) + continue + } + + // OpenAI-style text: { type: 'text', text: '...' } + if (item.type === 'text' && typeof item.text === 'string') { + parts.push({ text: item.text }) + continue + } + + // Gemini-native inlineData format (from attachment transformer) + if (item.inlineData) { + parts.push({ inlineData: item.inlineData }) + continue + } + + // Gemini-native fileData format (from attachment transformer) + if (item.fileData) { + parts.push({ fileData: item.fileData }) + continue + } + + // OpenAI-style image_url - convert to Gemini format + if (item.type === 'image_url' && item.image_url) { + const url = typeof item.image_url === 'string' ? item.image_url : item.image_url?.url + if (url) { + // Check if it's a data URL (base64) + if (url.startsWith('data:')) { + const match = url.match(/^data:([^;]+);base64,(.+)$/) + if (match) { + parts.push({ + inlineData: { + mimeType: match[1], + data: match[2], + }, + }) + } + } else { + // External URL + parts.push({ + fileData: { + mimeType: 'image/jpeg', // Default, Gemini will detect actual type + fileUri: url, + }, + }) + } + } + continue + } + + // Unknown type - log warning + logger.warn('Unknown content item type in Gemini conversion:', { type: item.type }) + } + + return parts +} + /** * Extracts text content from a Gemini response candidate. * Filters out thought parts (model reasoning) from the output. @@ -180,7 +249,13 @@ export function convertToGeminiFormat(request: ProviderRequest): { } else if (message.role === 'user' || message.role === 'assistant') { const geminiRole = message.role === 'user' ? 'user' : 'model' - if (message.content) { + // Handle multimodal content (arrays with text/image/file parts) + if (Array.isArray(message.content)) { + const parts: Part[] = convertContentArrayToGeminiParts(message.content) + if (parts.length > 0) { + contents.push({ role: geminiRole, parts }) + } + } else if (message.content) { contents.push({ role: geminiRole, parts: [{ text: message.content }] }) } diff --git a/apps/sim/providers/types.ts b/apps/sim/providers/types.ts index 7050d7621..4cb009e8a 100644 --- a/apps/sim/providers/types.ts +++ b/apps/sim/providers/types.ts @@ -112,9 +112,9 @@ export interface ProviderToolConfig { } /** - * Media content for multimodal messages + * Attachment content (files, images, documents) */ -export interface MediaContent { +export interface AttachmentContent { /** Source type: how the data was provided */ sourceType: 'url' | 'base64' | 'file' /** The URL or base64 data */ @@ -126,10 +126,10 @@ export interface MediaContent { } export interface Message { - role: 'system' | 'user' | 'assistant' | 'function' | 'tool' | 'media' + role: 'system' | 'user' | 'assistant' | 'function' | 'tool' | 'attachment' content: string | null - /** Media content for 'media' role messages */ - media?: MediaContent + /** Attachment content for 'attachment' role messages */ + attachment?: AttachmentContent name?: string function_call?: { name: string