mirror of
https://github.com/simstudioai/sim.git
synced 2026-04-28 03:00:29 -04:00
* feat(tools): added textract * cleanup * ack pr comments * reorder * removed upload for textract async version * fix additional fields dropdown in editor, update parser to leave validation to be done on the server * added mistral v2, files v2, and finalized textract * updated the rest of the old file patterns, updated mistral outputs for v2 * updated tag dropdown to parse non-operation fields as well * updated extension finder * cleanup * added description for inputs to workflow * use helper for internal route check * fix tag dropdown merge conflict change * remove duplicate code --------- Co-authored-by: Vikhyath Mondreti <vikhyath@simstudio.ai>
557 lines
20 KiB
TypeScript
557 lines
20 KiB
TypeScript
import { createLogger } from '@sim/logger'
import { getBaseUrl } from '@/lib/core/utils/urls'
import type {
  MistralParserInput,
  MistralParserOutput,
  MistralParserV2Output,
} from '@/tools/mistral/types'
import type { ToolConfig } from '@/tools/types'

// Module-level logger used by the Mistral parser request/response handlers below.
const logger = createLogger('MistralParserTool')

/**
 * Tool definition for parsing PDF documents with the Mistral OCR API (v1 output shape).
 *
 * The request is POSTed to the internal `/api/tools/mistral/parse` route. `request.body`
 * validates the caller-supplied parameters and builds the payload; `transformResponse`
 * flattens the OCR pages into a single `content` string plus a `metadata` object.
 */
export const mistralParserTool: ToolConfig<MistralParserInput, MistralParserOutput> = {
  id: 'mistral_parser',
  name: 'Mistral PDF Parser',
  description: 'Parse PDF documents using Mistral OCR API',
  version: '1.0.0',

  params: {
    filePath: {
      type: 'string',
      required: true,
      visibility: 'user-only',
      description: 'URL to a PDF document to be processed',
    },
    fileUpload: {
      type: 'object',
      required: false,
      visibility: 'hidden',
      description: 'File upload data from file-upload component',
    },
    resultType: {
      type: 'string',
      required: false,
      visibility: 'user-or-llm',
      description: 'Type of parsed result (markdown, text, or json). Defaults to markdown.',
    },
    includeImageBase64: {
      type: 'boolean',
      required: false,
      visibility: 'hidden',
      description: 'Include base64-encoded images in the response',
    },
    pages: {
      type: 'array',
      required: false,
      visibility: 'user-only',
      description: 'Specific pages to process (array of page numbers, starting from 0)',
    },
    // Note: The following image-related parameters are still supported by the parser
    // but are disabled in the UI. They can be re-enabled if needed.
    imageLimit: {
      type: 'number',
      required: false,
      visibility: 'hidden',
      description: 'Maximum number of images to extract from the PDF',
    },
    imageMinSize: {
      type: 'number',
      required: false,
      visibility: 'hidden',
      description: 'Minimum height and width of images to extract from the PDF',
    },
    apiKey: {
      type: 'string',
      required: true,
      visibility: 'user-only',
      description: 'Mistral API key (MISTRAL_API_KEY)',
    },
  },

  request: {
    url: '/api/tools/mistral/parse',
    method: 'POST',
    headers: (params) => {
      return {
        'Content-Type': 'application/json',
        Accept: 'application/json',
        Authorization: `Bearer ${params.apiKey}`,
      }
    },
    /**
     * Validates the parameters and builds the JSON payload.
     *
     * @throws Error when params are missing/invalid, the API key is empty, no usable
     *   file location is available, the URL cannot be parsed, the protocol is not
     *   HTTP(S), or the URL points at Google Drive/Docs.
     */
    body: (params) => {
      if (!params || typeof params !== 'object') {
        throw new Error('Invalid parameters: Parameters must be provided as an object')
      }

      // Validate required parameters
      if (!params.apiKey || typeof params.apiKey !== 'string' || params.apiKey.trim() === '') {
        throw new Error('Missing or invalid API key: A valid Mistral API key is required')
      }

      // Fall back to the uploaded file when no direct URL was given
      // (the literal string 'null' is treated the same as an absent path).
      if (
        params.fileUpload &&
        (!params.filePath || params.filePath === 'null' || params.filePath === '')
      ) {
        if (
          typeof params.fileUpload === 'object' &&
          params.fileUpload !== null &&
          (params.fileUpload.url || params.fileUpload.path)
        ) {
          // Prefer url over path for UserFile compatibility
          let uploadedFilePath = params.fileUpload.url || params.fileUpload.path

          // A path starting with "/" is relative to this app; make it absolute
          if (uploadedFilePath.startsWith('/')) {
            const baseUrl = getBaseUrl()
            if (!baseUrl) throw new Error('Failed to get base URL for file path conversion')
            uploadedFilePath = `${baseUrl}${uploadedFilePath}`
          }

          // NOTE(review): params is mutated in place. transformResponse reads
          // params.filePath, so this is presumably relied upon — confirm before changing.
          params.filePath = uploadedFilePath
          logger.info('Using uploaded file:', uploadedFilePath)
        } else {
          throw new Error('Invalid file upload: Upload data is missing or invalid')
        }
      }

      if (
        !params.filePath ||
        typeof params.filePath !== 'string' ||
        params.filePath.trim() === ''
      ) {
        throw new Error('Missing or invalid file path: Please provide a URL to a PDF document')
      }

      let filePathToValidate = params.filePath.trim()
      if (filePathToValidate.startsWith('/')) {
        const baseUrl = getBaseUrl()
        if (!baseUrl) throw new Error('Failed to get base URL for file path conversion')
        filePathToValidate = `${baseUrl}${filePathToValidate}`
      }

      // Only the URL constructor is guarded here, so that the semantic validation
      // errors below are not mislabelled as "Invalid URL format".
      let url: URL
      try {
        url = new URL(filePathToValidate)
      } catch (error) {
        const errorMessage = error instanceof Error ? error.message : String(error)
        throw new Error(
          `Invalid URL format: ${errorMessage}. Please provide a valid HTTP or HTTPS URL to a PDF document (e.g., https://example.com/document.pdf)`
        )
      }

      // Validate protocol
      if (!['http:', 'https:'].includes(url.protocol)) {
        throw new Error(`Invalid protocol: ${url.protocol}. URL must use HTTP or HTTPS protocol`)
      }

      // Validate against known unsupported services
      if (url.hostname.includes('drive.google.com') || url.hostname.includes('docs.google.com')) {
        throw new Error(
          'Google Drive links are not supported by the Mistral OCR API. ' +
            'Please upload your PDF to a public web server or provide a direct download link ' +
            'that ends with .pdf extension.'
        )
      }

      // A non-.pdf path is only warned about, never rejected
      const pathname = url.pathname.toLowerCase()
      if (!pathname.endsWith('.pdf')) {
        if (!pathname.includes('pdf')) {
          logger.warn(
            'Warning: URL does not appear to point to a PDF document. ' +
              'The Mistral OCR API is designed to work with PDF files. ' +
              'Please ensure your URL points to a valid PDF document (ideally ending with .pdf extension).'
          )
        } else {
          // "pdf" appears in the path but not as the extension
          logger.warn(
            'Warning: URL contains "pdf" but does not end with .pdf extension. ' +
              'This might still work if the server returns a valid PDF document despite the missing extension.'
          )
        }
      }

      // Create the request body with required parameters
      const requestBody: Record<string, any> = {
        apiKey: params.apiKey,
        filePath: url.toString(),
      }

      // Internal workspace files are sent with their internal path instead of the absolute URL
      if (params.fileUpload?.path?.startsWith('/api/files/serve/')) {
        requestBody.filePath = params.fileUpload.path
      }

      // Optional parameters: invalid values are dropped with a warning rather than rejected

      // Include images (base64)
      if (params.includeImageBase64 !== undefined) {
        if (typeof params.includeImageBase64 !== 'boolean') {
          logger.warn('includeImageBase64 parameter should be a boolean, using default (false)')
        } else {
          requestBody.includeImageBase64 = params.includeImageBase64
        }
      }

      // Page selection - safely handle null and undefined
      if (params.pages !== undefined && params.pages !== null) {
        if (Array.isArray(params.pages) && params.pages.length > 0) {
          // Keep only non-negative integer page numbers
          const validPages = params.pages.filter(
            (page) => typeof page === 'number' && Number.isInteger(page) && page >= 0
          )

          if (validPages.length > 0) {
            requestBody.pages = validPages

            if (validPages.length !== params.pages.length) {
              logger.warn(
                `Some invalid page numbers were removed. Using ${validPages.length} valid pages: ${validPages.join(', ')}`
              )
            }
          } else {
            logger.warn('No valid page numbers provided, processing all pages')
          }
        } else if (Array.isArray(params.pages) && params.pages.length === 0) {
          logger.warn('Empty pages array provided, processing all pages')
        }
      }

      // Image limit - safely handle null and undefined
      if (params.imageLimit !== undefined && params.imageLimit !== null) {
        const imageLimit = Number(params.imageLimit)
        if (Number.isInteger(imageLimit) && imageLimit > 0) {
          requestBody.imageLimit = imageLimit
        } else {
          logger.warn('imageLimit must be a positive integer, ignoring this parameter')
        }
      }

      // Minimum image size - safely handle null and undefined
      if (params.imageMinSize !== undefined && params.imageMinSize !== null) {
        const imageMinSize = Number(params.imageMinSize)
        if (Number.isInteger(imageMinSize) && imageMinSize > 0) {
          requestBody.imageMinSize = imageMinSize
        } else {
          logger.warn('imageMinSize must be a positive integer, ignoring this parameter')
        }
      }

      return requestBody
    },
  },

  /**
   * Converts the OCR response into `{ success, output: { content, metadata } }`.
   *
   * @param response - HTTP response whose JSON body is the OCR result, optionally
   *   wrapped in an `output` property.
   * @param params - The original tool parameters, when available; used for the
   *   result type and for deriving the file name.
   * @throws Error when the body is not JSON or is not an object. Errors are logged
   *   and rethrown.
   */
  transformResponse: async (response, params?) => {
    try {
      let ocrResult
      try {
        ocrResult = await response.json()
      } catch (jsonError) {
        throw new Error(
          `Failed to parse Mistral OCR response: ${jsonError instanceof Error ? jsonError.message : String(jsonError)}`
        )
      }

      if (!ocrResult || typeof ocrResult !== 'object') {
        throw new Error('Invalid response format from Mistral OCR API')
      }

      // Unwrap `output` only when the top level does not itself carry `pages`
      const mistralData =
        ocrResult.output && typeof ocrResult.output === 'object' && !ocrResult.pages
          ? ocrResult.output
          : ocrResult

      let resultType: 'markdown' | 'text' | 'json' = 'markdown'
      let sourceUrl = ''
      let isFileUpload = false

      if (params && typeof params === 'object') {
        if (params.filePath && typeof params.filePath === 'string') {
          sourceUrl = params.filePath.trim()
        }

        isFileUpload = !!params.fileUpload

        if (params.resultType && ['markdown', 'text', 'json'].includes(params.resultType)) {
          resultType = params.resultType as 'markdown' | 'text' | 'json'
        }
      } else if (
        mistralData.document &&
        typeof mistralData.document === 'object' &&
        mistralData.document.document_url &&
        typeof mistralData.document.document_url === 'string'
      ) {
        // Without params, fall back to the document URL echoed in the response
        sourceUrl = mistralData.document.document_url
      }

      let content = ''
      const pageCount =
        mistralData.pages && Array.isArray(mistralData.pages) ? mistralData.pages.length : 0

      if (pageCount > 0) {
        // Join the non-empty per-page markdown with blank lines
        content = mistralData.pages
          .map((page: any) => (page && typeof page.markdown === 'string' ? page.markdown : ''))
          .filter(Boolean)
          .join('\n\n')
      } else {
        logger.warn('No pages found in OCR result, returning raw response')
        content = JSON.stringify(mistralData, null, 2)
      }

      if (resultType === 'text') {
        content = content
          // Strip ATX heading markers at line starts only, so inline text such as
          // "C# code" keeps its '#'.
          .replace(/^#{1,6}[ \t]+/gm, '')
          .replace(/\*\*/g, '') // Remove bold markers
          .replace(/\*/g, '') // Remove italic markers
          .replace(/\n{3,}/g, '\n\n') // Normalize newlines
      } else if (resultType === 'json') {
        content = JSON.stringify(mistralData, null, 2)
      }

      let fileName = 'document.pdf'
      let fileType = 'pdf'

      if (sourceUrl) {
        try {
          // Derive file name and extension from the last path segment of the URL
          const url = new URL(sourceUrl)
          const pathSegments = url.pathname.split('/')
          const lastSegment = pathSegments[pathSegments.length - 1]

          if (lastSegment && lastSegment.length > 0) {
            fileName = lastSegment
            const fileExtParts = fileName.split('.')
            if (fileExtParts.length > 1) {
              fileType = fileExtParts[fileExtParts.length - 1].toLowerCase()
            }
          }
        } catch (urlError) {
          // Unparseable URL: keep the default name and type
          logger.warn('Failed to parse document URL:', urlError)
        }
      }

      // Locally generated identifier; it does not come from the OCR response
      const timestamp = Date.now()
      const randomId = Math.random().toString(36).substring(2, 10)
      const jobId = `mistral-ocr-${timestamp}-${randomId}`

      const usageInfo =
        mistralData.usage_info && typeof mistralData.usage_info === 'object'
          ? {
              pagesProcessed:
                typeof mistralData.usage_info.pages_processed === 'number'
                  ? mistralData.usage_info.pages_processed
                  : Number(mistralData.usage_info.pages_processed),
              docSizeBytes:
                mistralData.usage_info.doc_size_bytes == null
                  ? null
                  : typeof mistralData.usage_info.doc_size_bytes === 'number'
                    ? mistralData.usage_info.doc_size_bytes
                    : Number(mistralData.usage_info.doc_size_bytes),
            }
          : undefined

      const metadata: any = {
        jobId,
        fileType,
        fileName,
        source: 'url',
        pageCount,
        usageInfo,
        model: typeof mistralData.model === 'string' ? mistralData.model : 'mistral-ocr-latest',
        resultType,
        processedAt: new Date().toISOString(),
      }

      // Expose the source URL only for direct URLs: not for uploads, internal
      // file-serve paths, or S3 URLs
      if (
        !isFileUpload &&
        sourceUrl &&
        !sourceUrl.includes('/api/files/serve/') &&
        !sourceUrl.includes('s3.amazonaws.com')
      ) {
        metadata.sourceUrl = sourceUrl
      }

      const parserResponse: MistralParserOutput = {
        success: true,
        output: {
          content,
          metadata,
        },
      }

      return parserResponse
    } catch (error) {
      logger.error('Error processing OCR result:', error)
      throw error
    }
  },

  outputs: {
    success: { type: 'boolean', description: 'Whether the PDF was parsed successfully' },
    content: {
      type: 'string',
      description: 'Extracted content in the requested format (markdown, text, or JSON)',
    },
    metadata: {
      type: 'object',
      description: 'Processing metadata including jobId, fileType, pageCount, and usage info',
      properties: {
        jobId: { type: 'string', description: 'Unique job identifier' },
        fileType: { type: 'string', description: 'File type (e.g., pdf)' },
        fileName: { type: 'string', description: 'Original file name' },
        source: { type: 'string', description: 'Source type (url)' },
        pageCount: { type: 'number', description: 'Number of pages processed' },
        model: { type: 'string', description: 'Mistral model used' },
        resultType: { type: 'string', description: 'Output format (markdown, text, json)' },
        processedAt: { type: 'string', description: 'Processing timestamp' },
        sourceUrl: { type: 'string', description: 'Source URL if applicable', optional: true },
        usageInfo: {
          type: 'object',
          description: 'Usage statistics from OCR processing',
          optional: true,
        },
      },
    },
  },
}

/**
 * V2 of the Mistral PDF parser tool.
 *
 * Reuses the v1 parameter schema and request builder, but returns the OCR result in
 * the raw Mistral shape (`pages`, `model`, `usage_info`, `document_annotation`)
 * instead of a flattened `content` string.
 */
export const mistralParserV2Tool: ToolConfig<MistralParserInput, MistralParserV2Output> = {
  id: 'mistral_parser_v2',
  name: 'Mistral PDF Parser',
  description: 'Parse PDF documents using Mistral OCR API',
  version: '2.0.0',

  params: mistralParserTool.params,
  request: mistralParserTool.request,

  /**
   * Passes the OCR payload through, filling defaults for missing fields.
   *
   * @param response - HTTP response whose JSON body is the OCR result, optionally
   *   wrapped in an `output` property.
   * @throws Error when the body is not JSON or is not an object.
   */
  transformResponse: async (response: Response) => {
    let ocrResult
    try {
      ocrResult = await response.json()
    } catch (jsonError) {
      throw new Error(
        `Failed to parse Mistral OCR response: ${jsonError instanceof Error ? jsonError.message : String(jsonError)}`
      )
    }

    if (!ocrResult || typeof ocrResult !== 'object') {
      throw new Error('Invalid response format from Mistral OCR API')
    }

    // Extract the actual Mistral data (may be nested in output from our API route)
    const mistralData =
      ocrResult.output && typeof ocrResult.output === 'object' && !ocrResult.pages
        ? ocrResult.output
        : ocrResult

    // Return raw Mistral API structure - no transformation beyond defaults
    return {
      success: true,
      output: {
        // Same Array.isArray guard as v1, so `pages` is always an array as declared in outputs
        pages: Array.isArray(mistralData.pages) ? mistralData.pages : [],
        model: mistralData.model ?? 'mistral-ocr-latest',
        usage_info: mistralData.usage_info ?? { pages_processed: 0, doc_size_bytes: null },
        document_annotation: mistralData.document_annotation ?? null,
      },
    }
  },

  outputs: {
    pages: {
      type: 'array',
      description: 'Array of page objects from Mistral OCR',
      items: {
        type: 'object',
        properties: {
          index: { type: 'number', description: 'Page index (zero-based)' },
          markdown: { type: 'string', description: 'Extracted markdown content' },
          images: {
            type: 'array',
            description: 'Images extracted from this page with bounding boxes',
            items: {
              type: 'object',
              properties: {
                id: { type: 'string', description: 'Image identifier (e.g., img-0.jpeg)' },
                top_left_x: { type: 'number', description: 'Top-left X coordinate in pixels' },
                top_left_y: { type: 'number', description: 'Top-left Y coordinate in pixels' },
                bottom_right_x: {
                  type: 'number',
                  description: 'Bottom-right X coordinate in pixels',
                },
                bottom_right_y: {
                  type: 'number',
                  description: 'Bottom-right Y coordinate in pixels',
                },
                image_base64: {
                  type: 'string',
                  description: 'Base64-encoded image data (when include_image_base64=true)',
                  optional: true,
                },
              },
            },
          },
          dimensions: {
            type: 'object',
            description: 'Page dimensions',
            properties: {
              dpi: { type: 'number', description: 'Dots per inch' },
              height: { type: 'number', description: 'Page height in pixels' },
              width: { type: 'number', description: 'Page width in pixels' },
            },
          },
          tables: {
            type: 'array',
            description:
              'Extracted tables as HTML/markdown (when table_format is set). Referenced via placeholders like [tbl-0.html]',
          },
          hyperlinks: {
            type: 'array',
            description:
              'Array of URL strings detected in the page (e.g., ["https://...", "mailto:..."])',
            items: {
              type: 'string',
              description: 'URL or mailto link',
            },
          },
          header: {
            type: 'string',
            description: 'Page header content (when extract_header=true)',
            optional: true,
          },
          footer: {
            type: 'string',
            description: 'Page footer content (when extract_footer=true)',
            optional: true,
          },
        },
      },
    },
    model: {
      type: 'string',
      description: 'Mistral OCR model identifier (e.g., mistral-ocr-latest)',
    },
    usage_info: {
      type: 'object',
      description: 'Usage and processing statistics',
      properties: {
        pages_processed: { type: 'number', description: 'Total number of pages processed' },
        doc_size_bytes: {
          type: 'number',
          description: 'Document file size in bytes',
          optional: true,
        },
      },
    },
    document_annotation: {
      type: 'string',
      description: 'Structured annotation data as JSON string (when applicable)',
      optional: true,
    },
  },
}