import { Buffer } from 'buffer'
import { createHash } from 'crypto'
import fsPromises, { readFile } from 'fs/promises'
import path from 'path'
import { createLogger } from '@sim/logger'
import binaryExtensionsList from 'binary-extensions'
import { type NextRequest, NextResponse } from 'next/server'
import { checkHybridAuth } from '@/lib/auth/hybrid'
import {
  secureFetchWithPinnedIP,
  validateUrlWithDNS,
} from '@/lib/core/security/input-validation.server'
import { sanitizeUrlForLog } from '@/lib/core/utils/logging'
import { isSupportedFileType, parseFile } from '@/lib/file-parsers'
import { isUsingCloudStorage, type StorageContext, StorageService } from '@/lib/uploads'
import { uploadExecutionFile } from '@/lib/uploads/contexts/execution'
import { UPLOAD_DIR_SERVER } from '@/lib/uploads/core/setup.server'
import { getFileMetadataByKey } from '@/lib/uploads/server/metadata'
import {
  extractCleanFilename,
  extractStorageKey,
  extractWorkspaceIdFromExecutionKey,
  getMimeTypeFromExtension,
  getViewerUrl,
  inferContextFromKey,
  isInternalFileUrl,
} from '@/lib/uploads/utils/file-utils'
import { getUserEntityPermissions } from '@/lib/workspaces/permissions/utils'
import { verifyFileAccess } from '@/app/api/files/authorization'
import type { UserFile } from '@/executor/types'
import '@/lib/uploads/core/setup.server'

export const dynamic = 'force-dynamic'

const logger = createLogger('FilesParseAPI')

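// External downloads are size-capped twice: once via the Content-Length header
// before buffering, and again on the actual buffer length after download.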
const MAX_DOWNLOAD_SIZE_BYTES = 100 * 1024 * 1024 // 100 MB
const DOWNLOAD_TIMEOUT_MS = 30000 // 30 seconds

interface ExecutionContext {
  workspaceId: string
  workflowId: string
  executionId: string
}

interface ParseResult {
  success: boolean
  content?: string
  error?: string
  filePath: string
  originalName?: string // Original filename from database (for workspace files)
  viewerUrl?: string | null // Viewer URL for the file if available
  userFile?: UserFile // UserFile object for the raw file
  metadata?: {
    fileType: string
    size: number
    hash: string
    processingTime: number
  }
}

/**
 * Main API route handler. Accepts a single file path or an array of paths,
 * parses each one, and returns the extracted content plus metadata.
 */
export async function POST(request: NextRequest) {
  const startTime = Date.now()

  try {
    const authResult = await checkHybridAuth(request, { requireWorkflowId: true })

    if (!authResult.success) {
      logger.warn('Unauthorized file parse request', {
        error: authResult.error || 'Authentication failed',
      })
      return NextResponse.json({ success: false, error: 'Unauthorized' }, { status: 401 })
    }

    if (!authResult.userId) {
      logger.warn('File parse request missing userId', {
        authType: authResult.authType,
      })
      return NextResponse.json({ success: false, error: 'User context required' }, { status: 401 })
    }

    const userId = authResult.userId
    const requestData = await request.json()
    const { filePath, fileType, workspaceId, workflowId, executionId } = requestData

    if (!filePath || (typeof filePath === 'string' && filePath.trim() === '')) {
      return NextResponse.json({ success: false, error: 'No file path provided' }, { status: 400 })
    }

    // Build execution context if all required fields are present
    const executionContext: ExecutionContext | undefined =
      workspaceId && workflowId && executionId
        ? { workspaceId, workflowId, executionId }
        : undefined

    logger.info('File parse request received:', {
      filePath,
      fileType,
      workspaceId,
      userId,
      hasExecutionContext: !!executionContext,
    })

    if (Array.isArray(filePath)) {
      const results = []
      for (const singlePath of filePath) {
        if (!singlePath || (typeof singlePath === 'string' && singlePath.trim() === '')) {
          results.push({
            success: false,
            error: 'Empty file path in array',
            filePath: singlePath || '',
          })
          continue
        }

        const result = await parseFileSingle(
          singlePath,
          fileType,
          workspaceId,
          userId,
          executionContext
        )
        if (result.metadata) {
          result.metadata.processingTime = Date.now() - startTime
        }

        if (result.success) {
          const displayName =
            result.originalName || extractCleanFilename(result.filePath) || 'unknown'
          results.push({
            success: true,
            output: {
              content: result.content,
              name: displayName,
              fileType: result.metadata?.fileType || 'application/octet-stream',
              size: result.metadata?.size || 0,
              binary: false,
              file: result.userFile,
            },
            filePath: result.filePath,
            viewerUrl: result.viewerUrl,
          })
        } else {
          results.push(result)
        }
      }

      return NextResponse.json({
        success: true,
        results,
      })
    }

    const result = await parseFileSingle(filePath, fileType, workspaceId, userId, executionContext)

    if (result.metadata) {
      result.metadata.processingTime = Date.now() - startTime
    }

    if (result.success) {
      const displayName = result.originalName || extractCleanFilename(result.filePath) || 'unknown'
      return NextResponse.json({
        success: true,
        output: {
          content: result.content,
          name: displayName,
          fileType: result.metadata?.fileType || 'application/octet-stream',
          size: result.metadata?.size || 0,
          binary: false,
          file: result.userFile,
        },
        filePath: result.filePath,
        viewerUrl: result.viewerUrl,
      })
    }

    return NextResponse.json(result)
  } catch (error) {
    logger.error('Error in file parse API:', error)
    return NextResponse.json(
      {
        success: false,
        error: error instanceof Error ? error.message : 'Unknown error occurred',
        filePath: '',
      },
      { status: 500 }
    )
  }
}

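/*
 * Illustrative client call (assuming this route is mounted at /api/files/parse;
 * the exact path depends on this file's location in the app router):
 *
 *   const res = await fetch('/api/files/parse', {
 *     method: 'POST',
 *     headers: { 'Content-Type': 'application/json' },
 *     body: JSON.stringify({
 *       filePath: 'https://example.com/report.pdf', // or '/api/files/serve/<key>', or an array of paths
 *       fileType: 'application/pdf', // optional MIME hint
 *       workspaceId, workflowId, executionId, // optional; all three together enable execution storage
 *     }),
 *   })
 *   const { success, output } = await res.json()
 *   // output: { content, name, fileType, size, binary, file }
 */
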
/**
 * Parse a single file and return its content
 */
async function parseFileSingle(
  filePath: string,
  fileType: string,
  workspaceId: string,
  userId: string,
  executionContext?: ExecutionContext
): Promise<ParseResult> {
  logger.info('Parsing file:', filePath)

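  // Resolution order (mirrors the branches below): internal /api/files/serve
  // URLs and cloud keys go to handleCloudFile, external http(s) URLs to
  // handleExternalUrl, and anything else to handleLocalFile when cloud
  // storage is not in use.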
  if (!filePath || filePath.trim() === '') {
    return {
      success: false,
      error: 'Empty file path provided',
      filePath: filePath || '',
    }
  }

  const pathValidation = validateFilePath(filePath)
  if (!pathValidation.isValid) {
    return {
      success: false,
      error: pathValidation.error || 'Invalid path',
      filePath,
    }
  }

  if (isInternalFileUrl(filePath)) {
    return handleCloudFile(filePath, fileType, undefined, userId, executionContext)
  }

  if (filePath.startsWith('http://') || filePath.startsWith('https://')) {
    return handleExternalUrl(filePath, fileType, workspaceId, userId, executionContext)
  }

  if (isUsingCloudStorage()) {
    return handleCloudFile(filePath, fileType, undefined, userId, executionContext)
  }

  return handleLocalFile(filePath, fileType, userId, executionContext)
}

/**
 * Validate a file path for security: rejects null bytes, path traversal,
 * tilde expansion, and absolute paths outside the allowed directories.
 */
function validateFilePath(filePath: string): { isValid: boolean; error?: string } {
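  // Illustrative outcomes, derived from the checks below:
  //   'uploads/report.pdf'   → valid
  //   '../etc/passwd'        → rejected (path traversal)
  //   '~/secrets.txt'        → rejected (tilde)
  //   'C:\Windows\system32'  → rejected (absolute Windows path)
  //   '/etc/passwd'          → rejected unless it is an internal file URL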
  if (filePath.includes('\0')) {
    return { isValid: false, error: 'Invalid path: null byte detected' }
  }

  if (filePath.includes('..')) {
    return { isValid: false, error: 'Access denied: path traversal detected' }
  }

  if (filePath.includes('~')) {
    return { isValid: false, error: 'Invalid path: tilde character not allowed' }
  }

  if (filePath.startsWith('/') && !isInternalFileUrl(filePath)) {
    return { isValid: false, error: 'Path outside allowed directory' }
  }

  if (/^[A-Za-z]:\\/.test(filePath)) {
    return { isValid: false, error: 'Path outside allowed directory' }
  }

  return { isValid: true }
}

/**
 * Handle an external URL.
 * If workspaceId is provided, checks whether the file already exists in the
 * workspace and saves it there if not.
 * If executionContext is provided, also stores the file in execution storage
 * and returns a UserFile.
 */
async function handleExternalUrl(
  url: string,
  fileType: string,
  workspaceId: string,
  userId: string,
  executionContext?: ExecutionContext
): Promise<ParseResult> {
  try {
    logger.info('Fetching external URL:', url)
    logger.info('WorkspaceId for URL save:', workspaceId)

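    // The URL is vetted with a DNS-resolving validator, and the fetch below is
    // pinned to the resolved IP, which guards against SSRF and DNS rebinding
    // between validation and download.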
    const urlValidation = await validateUrlWithDNS(url, 'fileUrl')
    if (!urlValidation.isValid) {
      logger.warn(`Blocked external URL request: ${urlValidation.error}`)
      return {
        success: false,
        error: urlValidation.error || 'Invalid external URL',
        filePath: url,
      }
    }

    const urlPath = new URL(url).pathname
    const filename = urlPath.split('/').pop() || 'download'
    const extension = path.extname(filename).toLowerCase().substring(1)

    logger.info(`Extracted filename: ${filename}, workspaceId: ${workspaceId}`)

    const {
      S3_EXECUTION_FILES_CONFIG,
      BLOB_EXECUTION_FILES_CONFIG,
      USE_S3_STORAGE,
      USE_BLOB_STORAGE,
    } = await import('@/lib/uploads/config')

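    // S3 object URLs come in two shapes: virtual-hosted style
    // (https://<bucket>.s3.<region>.amazonaws.com/<key>) and path style
    // (https://s3.<region>.amazonaws.com/<bucket>/<key>), hence the two checks below.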
    let isExecutionFile = false
    try {
      const parsedUrl = new URL(url)

      if (USE_S3_STORAGE && S3_EXECUTION_FILES_CONFIG.bucket) {
        const bucketInHost = parsedUrl.hostname.startsWith(S3_EXECUTION_FILES_CONFIG.bucket)
        const bucketInPath = parsedUrl.pathname.startsWith(`/${S3_EXECUTION_FILES_CONFIG.bucket}/`)
        isExecutionFile = bucketInHost || bucketInPath
      } else if (USE_BLOB_STORAGE && BLOB_EXECUTION_FILES_CONFIG.containerName) {
        isExecutionFile = url.includes(`/${BLOB_EXECUTION_FILES_CONFIG.containerName}/`)
      }
    } catch (error) {
      logger.warn('Failed to parse URL for execution file check:', error)
      isExecutionFile = false
    }

    // Only apply workspace deduplication if:
    // 1. WorkspaceId is provided
    // 2. URL is NOT from execution files bucket/container
    const shouldCheckWorkspace = workspaceId && !isExecutionFile

    if (shouldCheckWorkspace) {
      const permission = await getUserEntityPermissions(userId, 'workspace', workspaceId)
      if (permission === null) {
        logger.warn('User does not have workspace access for file parse', {
          userId,
          workspaceId,
          filename,
        })
        return {
          success: false,
          error: 'File not found',
          filePath: url,
        }
      }

      const { fileExistsInWorkspace, listWorkspaceFiles } = await import(
        '@/lib/uploads/contexts/workspace'
      )
      const exists = await fileExistsInWorkspace(workspaceId, filename)

      if (exists) {
        logger.info(`File ${filename} already exists in workspace, using existing file`)
        const workspaceFiles = await listWorkspaceFiles(workspaceId)
        const existingFile = workspaceFiles.find((f) => f.name === filename)

        if (existingFile) {
          const storageFilePath = `/api/files/serve/${existingFile.key}`
          return handleCloudFile(storageFilePath, fileType, 'workspace', userId, executionContext)
        }
      }
    }

    const response = await secureFetchWithPinnedIP(url, urlValidation.resolvedIP!, {
      timeout: DOWNLOAD_TIMEOUT_MS,
    })
    if (!response.ok) {
      throw new Error(`Failed to fetch URL: ${response.status} ${response.statusText}`)
    }

    const contentLength = response.headers.get('content-length')
    if (contentLength && Number.parseInt(contentLength) > MAX_DOWNLOAD_SIZE_BYTES) {
      throw new Error(`File too large: ${contentLength} bytes (max: ${MAX_DOWNLOAD_SIZE_BYTES})`)
    }

    const buffer = Buffer.from(await response.arrayBuffer())

    if (buffer.length > MAX_DOWNLOAD_SIZE_BYTES) {
      throw new Error(`File too large: ${buffer.length} bytes (max: ${MAX_DOWNLOAD_SIZE_BYTES})`)
    }

    logger.info(`Downloaded file from URL: ${url}, size: ${buffer.length} bytes`)

    let userFile: UserFile | undefined
    const mimeType = response.headers.get('content-type') || getMimeTypeFromExtension(extension)

    if (executionContext) {
      try {
        userFile = await uploadExecutionFile(executionContext, buffer, filename, mimeType, userId)
        logger.info(`Stored file in execution storage: ${filename}`, { key: userFile.key })
      } catch (uploadError) {
        logger.warn(`Failed to store file in execution storage:`, uploadError)
        // Continue without userFile - parsing can still work
      }
    }

    if (shouldCheckWorkspace) {
      try {
        const permission = await getUserEntityPermissions(userId, 'workspace', workspaceId)
        if (permission !== 'admin' && permission !== 'write') {
          logger.warn('User does not have write permission for workspace file save', {
            userId,
            workspaceId,
            filename,
            permission,
          })
        } else {
          const { uploadWorkspaceFile } = await import('@/lib/uploads/contexts/workspace')
          await uploadWorkspaceFile(workspaceId, userId, buffer, filename, mimeType)
          logger.info(`Saved URL file to workspace storage: ${filename}`)
        }
      } catch (saveError) {
        logger.warn(`Failed to save URL file to workspace:`, saveError)
      }
    }

    let parseResult: ParseResult
    if (extension === 'pdf') {
      parseResult = await handlePdfBuffer(buffer, filename, fileType, url)
    } else if (extension === 'csv') {
      parseResult = await handleCsvBuffer(buffer, filename, fileType, url)
    } else if (isSupportedFileType(extension)) {
      parseResult = await handleGenericTextBuffer(buffer, filename, extension, fileType, url)
    } else {
      parseResult = handleGenericBuffer(buffer, filename, extension, fileType)
    }

    // Attach userFile to the result
    if (userFile) {
      parseResult.userFile = userFile
    }

    return parseResult
  } catch (error) {
    logger.error(`Error handling external URL ${sanitizeUrlForLog(url)}:`, error)
    return {
      success: false,
      error: `Error fetching URL: ${(error as Error).message}`,
      filePath: url,
    }
  }
}

/**
 * Handle a file stored in cloud storage.
 * If executionContext is provided and the file is not already from execution
 * storage, copies the file to execution storage and returns a UserFile.
 */
async function handleCloudFile(
  filePath: string,
  fileType: string,
  explicitContext: string | undefined,
  userId: string,
  executionContext?: ExecutionContext
): Promise<ParseResult> {
  try {
    const cloudKey = extractStorageKey(filePath)

    logger.info('Extracted cloud key:', cloudKey)

    const context = (explicitContext as StorageContext) || inferContextFromKey(cloudKey)

    const hasAccess = await verifyFileAccess(
      cloudKey,
      userId,
      undefined, // customConfig
      context, // context
      false // isLocal
    )

    if (!hasAccess) {
      logger.warn('Unauthorized cloud file parse attempt', { userId, key: cloudKey, context })
      return {
        success: false,
        error: 'File not found',
        filePath,
      }
    }

    let originalFilename: string | undefined
    if (context === 'workspace') {
      try {
        const fileRecord = await getFileMetadataByKey(cloudKey, 'workspace')

        if (fileRecord) {
          originalFilename = fileRecord.originalName
          logger.debug(`Found original filename for workspace file: ${originalFilename}`)
        }
      } catch (dbError) {
        logger.debug(`Failed to lookup original filename for ${cloudKey}:`, dbError)
      }
    }

    const fileBuffer = await StorageService.downloadFile({ key: cloudKey, context })
    logger.info(
      `Downloaded file from ${context} storage (${explicitContext ? 'explicit' : 'inferred'}): ${cloudKey}, size: ${fileBuffer.length} bytes`
    )

    const filename = originalFilename || cloudKey.split('/').pop() || cloudKey
    const extension = path.extname(filename).toLowerCase().substring(1)
    const mimeType = getMimeTypeFromExtension(extension)

    const normalizedFilePath = `/api/files/serve/${encodeURIComponent(cloudKey)}?context=${context}`
    let workspaceIdFromKey: string | undefined

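    // Storage keys encode their owning workspace: execution keys carry it in a
    // dedicated segment (recovered via extractWorkspaceIdFromExecutionKey),
    // while workspace keys are expected to start with the workspace UUID,
    // e.g. `<workspaceId>/<filename>`.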
    if (context === 'execution') {
      workspaceIdFromKey = extractWorkspaceIdFromExecutionKey(cloudKey) || undefined
    } else if (context === 'workspace') {
      const segments = cloudKey.split('/')
      if (segments.length >= 2 && /^[a-f0-9-]{36}$/.test(segments[0])) {
        workspaceIdFromKey = segments[0]
      }
    }

    const viewerUrl = getViewerUrl(cloudKey, workspaceIdFromKey)

    // Store file in execution storage if executionContext is provided
    let userFile: UserFile | undefined

    if (executionContext) {
      // If file is already from execution context, create UserFile reference without re-uploading
      if (context === 'execution') {
        userFile = {
          id: `file_${Date.now()}_${Math.random().toString(36).substring(2, 9)}`,
          name: filename,
          url: normalizedFilePath,
          size: fileBuffer.length,
          type: mimeType,
          key: cloudKey,
          context: 'execution',
        }
        logger.info(`Created UserFile reference for existing execution file: ${filename}`)
      } else {
        // Copy from workspace/other storage to execution storage
        try {
          userFile = await uploadExecutionFile(
            executionContext,
            fileBuffer,
            filename,
            mimeType,
            userId
          )
          logger.info(`Copied file to execution storage: ${filename}`, { key: userFile.key })
        } catch (uploadError) {
          logger.warn(`Failed to copy file to execution storage:`, uploadError)
        }
      }
    }

    let parseResult: ParseResult
    if (extension === 'pdf') {
      parseResult = await handlePdfBuffer(fileBuffer, filename, fileType, normalizedFilePath)
    } else if (extension === 'csv') {
      parseResult = await handleCsvBuffer(fileBuffer, filename, fileType, normalizedFilePath)
    } else if (isSupportedFileType(extension)) {
      parseResult = await handleGenericTextBuffer(
        fileBuffer,
        filename,
        extension,
        fileType,
        normalizedFilePath
      )
    } else {
      parseResult = handleGenericBuffer(fileBuffer, filename, extension, fileType)
      parseResult.filePath = normalizedFilePath
    }

    if (originalFilename) {
      parseResult.originalName = originalFilename
    }

    parseResult.viewerUrl = viewerUrl

    // Attach userFile to the result
    if (userFile) {
      parseResult.userFile = userFile
    }

    return parseResult
  } catch (error) {
    logger.error(`Error handling cloud file ${filePath}:`, error)

    const errorMessage = (error as Error).message
    if (errorMessage.includes('Access denied') || errorMessage.includes('Forbidden')) {
      throw new Error(`Error accessing file from cloud storage: ${errorMessage}`)
    }

    return {
      success: false,
      error: `Error accessing file from cloud storage: ${errorMessage}`,
      filePath,
    }
  }
}

/**
 * Handle a file stored on the local filesystem
 */
async function handleLocalFile(
  filePath: string,
  fileType: string,
  userId: string,
  executionContext?: ExecutionContext
): Promise<ParseResult> {
  try {
    const filename = filePath.split('/').pop() || filePath

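    // Only the basename is used and it is resolved strictly inside
    // UPLOAD_DIR_SERVER; together with validateFilePath this keeps reads
    // within the upload directory.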
    const context = inferContextFromKey(filename)
    const hasAccess = await verifyFileAccess(
      filename,
      userId,
      undefined, // customConfig
      context, // context
      true // isLocal
    )

    if (!hasAccess) {
      logger.warn('Unauthorized local file parse attempt', { userId, filename })
      return {
        success: false,
        error: 'File not found',
        filePath,
      }
    }

    const fullPath = path.join(UPLOAD_DIR_SERVER, filename)

    logger.info('Processing local file:', fullPath)

    try {
      await fsPromises.access(fullPath)
    } catch {
      throw new Error(`File not found: ${filename}`)
    }

    const result = await parseFile(fullPath)

    const stats = await fsPromises.stat(fullPath)
    const fileBuffer = await readFile(fullPath)
    const hash = createHash('md5').update(fileBuffer).digest('hex')

    const extension = path.extname(filename).toLowerCase().substring(1)
    const mimeType = fileType || getMimeTypeFromExtension(extension)

    // Store file in execution storage if executionContext is provided
    let userFile: UserFile | undefined
    if (executionContext) {
      try {
        userFile = await uploadExecutionFile(
          executionContext,
          fileBuffer,
          filename,
          mimeType,
          userId
        )
        logger.info(`Stored local file in execution storage: ${filename}`, { key: userFile.key })
      } catch (uploadError) {
        logger.warn(`Failed to store local file in execution storage:`, uploadError)
      }
    }

    return {
      success: true,
      content: result.content,
      filePath,
      userFile,
      metadata: {
        fileType: mimeType,
        size: stats.size,
        hash,
        processingTime: 0,
      },
    }
  } catch (error) {
    logger.error(`Error handling local file ${filePath}:`, error)
    return {
      success: false,
      error: `Error processing local file: ${(error as Error).message}`,
      filePath,
    }
  }
}

/**
 * Handle a PDF buffer directly in memory
 */
async function handlePdfBuffer(
  fileBuffer: Buffer,
  filename: string,
  fileType?: string,
  originalPath?: string
): Promise<ParseResult> {
  try {
    logger.info(`Parsing PDF in memory: ${filename}`)

    const result = await parseBufferAsPdf(fileBuffer)

    const content =
      result.content ||
      createPdfFallbackMessage(result.metadata?.pageCount || 0, fileBuffer.length, originalPath)

    return {
      success: true,
      content,
      filePath: originalPath || filename,
      metadata: {
        fileType: fileType || 'application/pdf',
        size: fileBuffer.length,
        hash: createHash('md5').update(fileBuffer).digest('hex'),
        processingTime: 0,
      },
    }
  } catch (error) {
    logger.error('Failed to parse PDF in memory:', error)

    const content = createPdfFailureMessage(
      0,
      fileBuffer.length,
      originalPath || filename,
      (error as Error).message
    )

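    // Note: a parsing failure still returns success with a readable fallback
    // message, so callers get displayable content instead of a hard error.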
    return {
      success: true,
      content,
      filePath: originalPath || filename,
      metadata: {
        fileType: fileType || 'application/pdf',
        size: fileBuffer.length,
        hash: createHash('md5').update(fileBuffer).digest('hex'),
        processingTime: 0,
      },
    }
  }
}

/**
 * Handle a CSV buffer directly in memory
 */
async function handleCsvBuffer(
  fileBuffer: Buffer,
  filename: string,
  fileType?: string,
  originalPath?: string
): Promise<ParseResult> {
  try {
    logger.info(`Parsing CSV in memory: ${filename}`)

    const { parseBuffer } = await import('@/lib/file-parsers')
    const result = await parseBuffer(fileBuffer, 'csv')

    return {
      success: true,
      content: result.content,
      filePath: originalPath || filename,
      metadata: {
        fileType: fileType || 'text/csv',
        size: fileBuffer.length,
        hash: createHash('md5').update(fileBuffer).digest('hex'),
        processingTime: 0,
      },
    }
  } catch (error) {
    logger.error('Failed to parse CSV in memory:', error)
    return {
      success: false,
      error: `Failed to parse CSV: ${(error as Error).message}`,
      filePath: originalPath || filename,
      metadata: {
        fileType: 'text/csv',
        size: 0,
        hash: '',
        processingTime: 0,
      },
    }
  }
}

/**
 * Handle a generic text file buffer in memory
 */
async function handleGenericTextBuffer(
  fileBuffer: Buffer,
  filename: string,
  extension: string,
  fileType?: string,
  originalPath?: string
): Promise<ParseResult> {
  try {
    logger.info(`Parsing text file in memory: ${filename}`)

    try {
      const { parseBuffer, isSupportedFileType } = await import('@/lib/file-parsers')

      if (isSupportedFileType(extension)) {
        const result = await parseBuffer(fileBuffer, extension)

        return {
          success: true,
          content: result.content,
          filePath: originalPath || filename,
          metadata: {
            fileType: fileType || getMimeTypeFromExtension(extension),
            size: fileBuffer.length,
            hash: createHash('md5').update(fileBuffer).digest('hex'),
            processingTime: 0,
          },
        }
      }
    } catch (parserError) {
      logger.warn('Specialized parser failed, falling back to generic parsing:', parserError)
    }

    const content = fileBuffer.toString('utf-8')

    return {
      success: true,
      content,
      filePath: originalPath || filename,
      metadata: {
        fileType: fileType || getMimeTypeFromExtension(extension),
        size: fileBuffer.length,
        hash: createHash('md5').update(fileBuffer).digest('hex'),
        processingTime: 0,
      },
    }
  } catch (error) {
    logger.error('Failed to parse text file in memory:', error)
    return {
      success: false,
      error: `Failed to parse file: ${(error as Error).message}`,
      filePath: originalPath || filename,
      metadata: {
        fileType: 'text/plain',
        size: 0,
        hash: '',
        processingTime: 0,
      },
    }
  }
}

/**
 * Handle a generic binary buffer
 */
function handleGenericBuffer(
  fileBuffer: Buffer,
  filename: string,
  extension: string,
  fileType?: string
): ParseResult {
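  // binary-extensions is a known list of binary file extensions (e.g. 'png',
  // 'zip'); anything not on the list is decoded as UTF-8 text as a best effort.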
  const isBinary = binaryExtensionsList.includes(extension)
  const content = isBinary
    ? `[Binary ${extension.toUpperCase()} file - ${fileBuffer.length} bytes]`
    : fileBuffer.toString('utf-8')

  return {
    success: true,
    content,
    filePath: filename,
    metadata: {
      fileType: fileType || getMimeTypeFromExtension(extension),
      size: fileBuffer.length,
      hash: createHash('md5').update(fileBuffer).digest('hex'),
      processingTime: 0,
    },
  }
}

/**
 * Parse a PDF buffer
 */
async function parseBufferAsPdf(buffer: Buffer) {
  try {
    const { PdfParser } = await import('@/lib/file-parsers/pdf-parser')
    const parser = new PdfParser()
    logger.info('Using main PDF parser for buffer')

    return await parser.parseBuffer(buffer)
  } catch (error) {
    throw new Error(`PDF parsing failed: ${(error as Error).message}`)
  }
}

/**
 * Format a byte count as a human-readable size
 */
function prettySize(bytes: number): string {
  if (bytes === 0) return '0 Bytes'

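  // e.g. 1536 → '1.5 KB', 1048576 → '1 MB' (the base-1024 log picks the unit)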
  const sizes = ['Bytes', 'KB', 'MB', 'GB', 'TB']
  const i = Math.floor(Math.log(bytes) / Math.log(1024))

  return `${Number.parseFloat((bytes / 1024 ** i).toFixed(2))} ${sizes[i]}`
}

/**
 * Create a formatted fallback message for PDF content
 */
function createPdfFallbackMessage(pageCount: number, size: number, path?: string): string {
  const formattedPath = path || 'Unknown path'

  return `PDF document - ${pageCount} page(s), ${prettySize(size)}
Path: ${formattedPath}

This file appears to be a PDF document that could not be fully processed as text.
Please use a PDF viewer for best results.`
}

/**
 * Create a readable error message for a PDF parsing failure
 */
function createPdfFailureMessage(
  pageCount: number,
  size: number,
  path: string,
  error: string
): string {
  return `PDF document - Processing failed, ${prettySize(size)}
Path: ${path}
Error: ${error}

This file appears to be a PDF document that could not be processed.
Please use a PDF viewer for best results.`
}