feat(ocr): added reducto and pulse for OCR (#2843)

* feat(ocr): added reducto and pulse for OCR

* ack comments
This commit is contained in:
Waleed
2026-01-15 18:30:39 -08:00
committed by GitHub
parent b813bf7f27
commit 12470a630c
18 changed files with 2212 additions and 0 deletions

View File

@@ -0,0 +1,2 @@
export { pulseParserTool } from '@/tools/pulse/parser'
export * from './types'

View File

@@ -0,0 +1,283 @@
import { createLogger } from '@sim/logger'
import { getBaseUrl } from '@/lib/core/utils/urls'
import type { PulseParserInput, PulseParserOutput } from '@/tools/pulse/types'
import type { ToolConfig } from '@/tools/types'
// Module-level logger created with the 'PulseParserTool' label.
const logger = createLogger('PulseParserTool')
/**
 * Tool definition for parsing documents with the Pulse OCR API.
 *
 * The request is POSTed to the app-relative route `/api/tools/pulse/parse`, with the
 * API key carried in the JSON body. The document is identified by `filePath` (an
 * HTTP/HTTPS URL or a root-relative path) or, when `filePath` is empty, by the
 * `fileUpload` descriptor.
 */
export const pulseParserTool: ToolConfig<PulseParserInput, PulseParserOutput> = {
  id: 'pulse_parser',
  name: 'Pulse Document Parser',
  description: 'Parse documents (PDF, images, Office docs) using Pulse OCR API',
  version: '1.0.0',

  params: {
    filePath: {
      type: 'string',
      required: true,
      visibility: 'user-only',
      description: 'URL to a document to be processed',
    },
    fileUpload: {
      type: 'object',
      required: false,
      visibility: 'hidden',
      description: 'File upload data from file-upload component',
    },
    pages: {
      type: 'string',
      required: false,
      visibility: 'user-only',
      description: 'Page range to process (1-indexed, e.g., "1-2,5")',
    },
    extractFigure: {
      type: 'boolean',
      required: false,
      visibility: 'hidden',
      description: 'Enable figure extraction from the document',
    },
    figureDescription: {
      type: 'boolean',
      required: false,
      visibility: 'hidden',
      description: 'Generate descriptions/captions for extracted figures',
    },
    returnHtml: {
      type: 'boolean',
      required: false,
      visibility: 'hidden',
      description: 'Include HTML in the response',
    },
    chunking: {
      type: 'string',
      required: false,
      visibility: 'user-only',
      description: 'Chunking strategies (comma-separated: semantic, header, page, recursive)',
    },
    chunkSize: {
      type: 'number',
      required: false,
      visibility: 'user-only',
      description: 'Maximum characters per chunk when chunking is enabled',
    },
    apiKey: {
      type: 'string',
      required: true,
      visibility: 'user-only',
      description: 'Pulse API key',
    },
  },

  request: {
    url: '/api/tools/pulse/parse',
    method: 'POST',
    headers: () => {
      return {
        'Content-Type': 'application/json',
        Accept: 'application/json',
      }
    },
    /**
     * Validates the parameters and builds the JSON payload for the parse route.
     * The caller's `params` object is never modified.
     *
     * @throws Error when `params`, the API key, the upload descriptor or the
     *   document URL is missing or invalid.
     */
    body: (params) => {
      if (!params || typeof params !== 'object') {
        throw new Error('Invalid parameters: Parameters must be provided as an object')
      }
      if (!params.apiKey || typeof params.apiKey !== 'string' || params.apiKey.trim() === '') {
        throw new Error('Missing or invalid API key: A valid Pulse API key is required')
      }

      // Root-relative paths are resolved against the application's base URL.
      const toAbsolute = (candidate: string): string => {
        if (!candidate.startsWith('/')) return candidate
        const baseUrl = getBaseUrl()
        if (!baseUrl) throw new Error('Failed to get base URL for file path conversion')
        return `${baseUrl}${candidate}`
      }

      // Local copy: the resolved path must not leak back into the caller's object.
      let filePath: string = params.filePath

      // Fall back to the upload descriptor when no usable direct URL was supplied.
      const upload = params.fileUpload
      if (upload && (!filePath || filePath === 'null' || filePath === '')) {
        // `||` (not `??`) so that an empty `url` still falls through to `path`.
        const uploadedPath = typeof upload === 'object' ? upload.url || upload.path : undefined
        if (!uploadedPath || typeof uploadedPath !== 'string') {
          throw new Error('Invalid file upload: Upload data is missing or invalid')
        }
        filePath = toAbsolute(uploadedPath)
        logger.info('Using uploaded file:', filePath)
      }

      if (!filePath || typeof filePath !== 'string' || filePath.trim() === '') {
        throw new Error('Missing or invalid file path: Please provide a URL to a document')
      }

      const filePathToValidate = toAbsolute(filePath.trim())

      let url
      try {
        url = new URL(filePathToValidate)
        if (!['http:', 'https:'].includes(url.protocol)) {
          throw new Error(`Invalid protocol: ${url.protocol}. URL must use HTTP or HTTPS protocol`)
        }
        if (url.hostname.includes('drive.google.com') || url.hostname.includes('docs.google.com')) {
          throw new Error(
            'Google Drive links are not supported. ' +
              'Please upload your document or provide a direct download link.'
          )
        }
      } catch (error) {
        // Both URL parse failures and the checks above surface with the same prefix.
        const errorMessage = error instanceof Error ? error.message : String(error)
        throw new Error(
          `Invalid URL format: ${errorMessage}. Please provide a valid HTTP or HTTPS URL to a document`
        )
      }

      const requestBody: Record<string, unknown> = {
        apiKey: params.apiKey.trim(),
        filePath: url.toString(),
      }

      // Internal workspace files are forwarded by their serve path, not as an absolute URL.
      if (params.fileUpload?.path?.startsWith('/api/files/serve/')) {
        requestBody.filePath = params.fileUpload.path
      }

      // Optional parameters are only included when they carry a usable value.
      if (params.pages && typeof params.pages === 'string' && params.pages.trim() !== '') {
        requestBody.pages = params.pages.trim()
      }
      if (params.extractFigure !== undefined) {
        requestBody.extractFigure = params.extractFigure
      }
      if (params.figureDescription !== undefined) {
        requestBody.figureDescription = params.figureDescription
      }
      if (params.returnHtml !== undefined) {
        requestBody.returnHtml = params.returnHtml
      }
      if (params.chunking && typeof params.chunking === 'string' && params.chunking.trim() !== '') {
        requestBody.chunking = params.chunking.trim()
      }
      if (params.chunkSize !== undefined && params.chunkSize > 0) {
        requestBody.chunkSize = params.chunkSize
      }

      return requestBody
    },
  },

  /**
   * Normalizes the route response into the tool output shape. Missing required
   * fields get neutral defaults; missing optional fields are returned as `null`.
   *
   * @throws Error when the response body is not JSON or not an object.
   */
  transformResponse: async (response) => {
    let parseResult
    try {
      parseResult = await response.json()
    } catch (jsonError) {
      throw new Error(
        `Failed to parse Pulse response: ${jsonError instanceof Error ? jsonError.message : String(jsonError)}`
      )
    }
    if (!parseResult || typeof parseResult !== 'object') {
      throw new Error('Invalid response format from Pulse API')
    }

    // The payload may be wrapped in an `output` envelope or returned bare.
    const pulseData =
      parseResult.output && typeof parseResult.output === 'object'
        ? parseResult.output
        : parseResult

    return {
      success: true,
      output: {
        markdown: pulseData.markdown ?? '',
        page_count: pulseData.page_count ?? 0,
        job_id: pulseData.job_id ?? '',
        'plan-info': pulseData['plan-info'] ?? { pages_used: 0, tier: 'unknown' },
        bounding_boxes: pulseData.bounding_boxes ?? null,
        extraction_url: pulseData.extraction_url ?? null,
        html: pulseData.html ?? null,
        structured_output: pulseData.structured_output ?? null,
        chunks: pulseData.chunks ?? null,
        figures: pulseData.figures ?? null,
      },
    }
  },

  outputs: {
    markdown: {
      type: 'string',
      description: 'Extracted content in markdown format',
    },
    page_count: {
      type: 'number',
      description: 'Number of pages in the document',
    },
    job_id: {
      type: 'string',
      description: 'Unique job identifier',
    },
    'plan-info': {
      type: 'object',
      description: 'Plan usage information',
      properties: {
        pages_used: { type: 'number', description: 'Number of pages used' },
        tier: { type: 'string', description: 'Plan tier' },
        note: { type: 'string', description: 'Optional note', optional: true },
      },
    },
    bounding_boxes: {
      type: 'json',
      description: 'Bounding box layout information',
      optional: true,
    },
    extraction_url: {
      type: 'string',
      description: 'URL for extraction results (for large documents)',
      optional: true,
    },
    html: {
      type: 'string',
      description: 'HTML content if requested',
      optional: true,
    },
    structured_output: {
      type: 'json',
      description: 'Structured output if schema was provided',
      optional: true,
    },
    chunks: {
      type: 'json',
      description: 'Chunked content if chunking was enabled',
      optional: true,
    },
    figures: {
      type: 'json',
      description: 'Extracted figures if figure extraction was enabled',
      optional: true,
    },
  },
}

View File

@@ -0,0 +1,93 @@
import type { ToolResponse } from '@/tools/types'
/**
 * Parameters accepted by the Pulse parser tool.
 */
export interface PulseParserInput {
  /** Pulse API key used to authenticate the request */
  apiKey: string

  /** URL of the document to process */
  filePath: string

  /** Upload descriptor supplied by the file-upload component */
  fileUpload?: {
    path?: string
    url?: string
  }

  /** 1-indexed page range to process, e.g. "1-2,5" */
  pages?: string

  /** Extract figures from the document when true */
  extractFigure?: boolean

  /** Generate descriptions/captions for extracted figures when true */
  figureDescription?: boolean

  /** Include HTML in the response when true */
  returnHtml?: boolean

  /** Comma-separated chunking strategies: semantic, header, page, recursive */
  chunking?: string

  /** Upper bound on characters per chunk; applies when chunking is enabled */
  chunkSize?: number
}
/**
 * Plan/usage details that the Pulse API attaches to a parse result.
 */
export interface PulsePlanInfo {
  /** Plan tier */
  tier: string

  /** Number of pages used */
  pages_used: number

  /** Free-form note; may be absent */
  note?: string
}
/**
 * Output structure of the Pulse parser tool, mirroring the native Pulse API fields.
 *
 * The optional fields are nullable: the tool's response transformer fills every
 * one of them with `null` when the API response does not include it.
 */
export interface PulseParserOutputData {
  /** Extracted content in markdown format */
  markdown: string
  /** Number of pages in the document */
  page_count: number
  /** Unique job identifier */
  job_id: string
  /** Plan usage information */
  'plan-info': PulsePlanInfo
  /** Bounding box layout information; `null` when not returned */
  bounding_boxes?: Record<string, unknown> | null
  /** URL for extraction results (for large documents); `null` when not returned */
  extraction_url?: string | null
  /** HTML content if requested; `null` otherwise */
  html?: string | null
  /** Structured output if schema was provided; `null` otherwise */
  structured_output?: Record<string, unknown> | null
  /** Chunked content if chunking was enabled; `null` otherwise */
  chunks?: unknown[] | null
  /** Extracted figures if figure extraction was enabled; `null` otherwise */
  figures?: unknown[] | null
}
/**
 * Tool-level response of the Pulse parser: the shared `ToolResponse` fields plus
 * the Pulse-specific payload.
 */
export interface PulseParserOutput extends ToolResponse {
  /** Payload carrying the native Pulse API fields */
  output: PulseParserOutputData
}

View File

@@ -0,0 +1,3 @@
import { reductoParserTool } from '@/tools/reducto/parser'
export { reductoParserTool }

View File

@@ -0,0 +1,203 @@
import { createLogger } from '@sim/logger'
import { getBaseUrl } from '@/lib/core/utils/urls'
import type { ReductoParserInput, ReductoParserOutput } from '@/tools/reducto/types'
import type { ToolConfig } from '@/tools/types'
// Module-level logger created with the 'ReductoParserTool' label.
const logger = createLogger('ReductoParserTool')
/**
 * Tool definition for parsing PDF documents with the Reducto OCR API.
 *
 * The request is POSTed to the app-relative route `/api/tools/reducto/parse`. The
 * API key is sent both as a Bearer `Authorization` header and in the JSON body.
 * The document is identified by `filePath` (an HTTP/HTTPS URL or a root-relative
 * path) or, when `filePath` is empty, by the `fileUpload` descriptor.
 */
export const reductoParserTool: ToolConfig<ReductoParserInput, ReductoParserOutput> = {
  id: 'reducto_parser',
  name: 'Reducto PDF Parser',
  description: 'Parse PDF documents using Reducto OCR API',
  version: '1.0.0',

  params: {
    filePath: {
      type: 'string',
      required: true,
      visibility: 'user-only',
      description: 'URL to a PDF document to be processed',
    },
    fileUpload: {
      type: 'object',
      required: false,
      visibility: 'hidden',
      description: 'File upload data from file-upload component',
    },
    pages: {
      type: 'array',
      required: false,
      visibility: 'user-only',
      description: 'Specific pages to process (1-indexed page numbers)',
    },
    tableOutputFormat: {
      type: 'string',
      required: false,
      visibility: 'user-or-llm',
      // Spell out the literal values the request builder accepts.
      description: 'Table output format: "html" or "md" (markdown). Defaults to markdown.',
    },
    apiKey: {
      type: 'string',
      required: true,
      visibility: 'user-only',
      description: 'Reducto API key (REDUCTO_API_KEY)',
    },
  },

  request: {
    url: '/api/tools/reducto/parse',
    method: 'POST',
    headers: (params) => {
      return {
        'Content-Type': 'application/json',
        Accept: 'application/json',
        Authorization: `Bearer ${params.apiKey}`,
      }
    },
    /**
     * Validates the parameters and builds the JSON payload for the parse route.
     * The caller's `params` object is never modified.
     *
     * @throws Error when `params`, the API key, the upload descriptor or the
     *   document URL is missing or invalid.
     */
    body: (params) => {
      if (!params || typeof params !== 'object') {
        throw new Error('Invalid parameters: Parameters must be provided as an object')
      }
      if (!params.apiKey || typeof params.apiKey !== 'string' || params.apiKey.trim() === '') {
        throw new Error('Missing or invalid API key: A valid Reducto API key is required')
      }

      // Root-relative paths are resolved against the application's base URL.
      const toAbsolute = (candidate: string): string => {
        if (!candidate.startsWith('/')) return candidate
        const baseUrl = getBaseUrl()
        if (!baseUrl) throw new Error('Failed to get base URL for file path conversion')
        return `${baseUrl}${candidate}`
      }

      // Local copy: the resolved path must not leak back into the caller's object.
      let filePath: string = params.filePath

      // Fall back to the upload descriptor when no usable direct URL was supplied.
      const upload = params.fileUpload
      if (upload && (!filePath || filePath === 'null' || filePath === '')) {
        const uploadedPath = typeof upload === 'object' ? upload.url || upload.path : undefined
        if (!uploadedPath || typeof uploadedPath !== 'string') {
          throw new Error('Invalid file upload: Upload data is missing or invalid')
        }
        filePath = toAbsolute(uploadedPath)
        logger.info('Using uploaded file:', filePath)
      }

      if (!filePath || typeof filePath !== 'string' || filePath.trim() === '') {
        throw new Error('Missing or invalid file path: Please provide a URL to a PDF document')
      }

      const filePathToValidate = toAbsolute(filePath.trim())

      let url
      try {
        url = new URL(filePathToValidate)
        if (!['http:', 'https:'].includes(url.protocol)) {
          throw new Error(`Invalid protocol: ${url.protocol}. URL must use HTTP or HTTPS protocol`)
        }
        if (url.hostname.includes('drive.google.com') || url.hostname.includes('docs.google.com')) {
          throw new Error(
            'Google Drive links are not supported by the Reducto API. ' +
              'Please upload your PDF to a public web server or provide a direct download link.'
          )
        }
      } catch (error) {
        // Both URL parse failures and the checks above surface with the same prefix.
        const errorMessage = error instanceof Error ? error.message : String(error)
        throw new Error(
          `Invalid URL format: ${errorMessage}. Please provide a valid HTTP or HTTPS URL to a PDF document.`
        )
      }

      const requestBody: Record<string, unknown> = {
        // Forward the key in the same trimmed form it was validated in.
        apiKey: params.apiKey.trim(),
        filePath: url.toString(),
      }

      // Internal workspace files are forwarded by their serve path, not as an absolute URL.
      if (params.fileUpload?.path?.startsWith('/api/files/serve/')) {
        requestBody.filePath = params.fileUpload.path
      }

      // Accept the documented "markdown" spelling as an alias for the literal "md".
      const requestedFormat =
        typeof params.tableOutputFormat === 'string'
          ? params.tableOutputFormat.trim().toLowerCase()
          : ''
      const tableFormat = requestedFormat === 'markdown' ? 'md' : requestedFormat
      if (tableFormat === 'html' || tableFormat === 'md') {
        requestBody.tableOutputFormat = tableFormat
      }

      // Page selection: pages are 1-indexed, so only positive integers are forwarded.
      if (Array.isArray(params.pages) && params.pages.length > 0) {
        const validPages = params.pages.filter(
          (page) => typeof page === 'number' && Number.isInteger(page) && page >= 1
        )
        if (validPages.length > 0) {
          requestBody.pages = validPages
        }
      }

      return requestBody
    },
  },

  /**
   * Normalizes the route response into the tool output shape; `pdf_url` and
   * `studio_link` are returned as `null` when absent.
   *
   * @throws Error when the response body is not JSON or not an object.
   */
  transformResponse: async (response) => {
    let data
    try {
      data = await response.json()
    } catch (jsonError) {
      throw new Error(
        `Failed to parse Reducto response: ${jsonError instanceof Error ? jsonError.message : String(jsonError)}`
      )
    }
    if (!data || typeof data !== 'object') {
      throw new Error('Invalid response format from Reducto API')
    }

    // The payload may be wrapped in an `output` envelope or returned bare.
    const reductoData = data.output && typeof data.output === 'object' ? data.output : data

    return {
      success: true,
      output: {
        job_id: reductoData.job_id,
        duration: reductoData.duration,
        usage: reductoData.usage,
        result: reductoData.result,
        pdf_url: reductoData.pdf_url ?? null,
        studio_link: reductoData.studio_link ?? null,
      },
    }
  },

  outputs: {
    job_id: { type: 'string', description: 'Unique identifier for the processing job' },
    duration: { type: 'number', description: 'Processing time in seconds' },
    usage: {
      type: 'json',
      description: 'Resource consumption data',
    },
    result: {
      type: 'json',
      description: 'Parsed document content with chunks and blocks',
    },
    pdf_url: {
      type: 'string',
      description: 'Storage URL of converted PDF',
      optional: true,
    },
    studio_link: {
      type: 'string',
      description: 'Link to Reducto studio interface',
      optional: true,
    },
  },
}

View File

@@ -0,0 +1,160 @@
import type { ToolResponse } from '@/tools/types'
/**
 * Parameters accepted by the Reducto parser tool.
 */
export interface ReductoParserInput {
  /** Reducto API key used to authenticate the request */
  apiKey: string

  /** URL of the document to process */
  filePath: string

  /** Upload descriptor supplied by the file-upload component */
  fileUpload?: {
    path?: string
    url?: string
  }

  /** 1-indexed page numbers to process */
  pages?: number[]

  /** Format used for tables in the output: html or md */
  tableOutputFormat?: 'html' | 'md'
}
/**
 * Spatial location of a piece of content: a rectangle plus the page it sits on.
 * Units and page indexing are defined by the Reducto API and are not specified here.
 */
export interface ReductoBoundingBox {
  /** Page the rectangle belongs to */
  page: number
  /** Horizontal offset of the rectangle */
  left: number
  /** Vertical offset of the rectangle */
  top: number
  /** Horizontal extent of the rectangle */
  width: number
  /** Vertical extent of the rectangle */
  height: number
}
/**
 * Per-aspect confidence scores for a parsed block.
 * Each score is a string, or `null` when not provided.
 */
export interface ReductoGranularConfidence {
  /** Confidence for layout */
  layout: string | null
  /** Confidence for OCR */
  ocr: string | null
  /** Confidence for ordering */
  order: string | null
}
/**
 * Classification label assigned to a parsed block.
 */
export type ReductoBlockType =
  | 'Caption'
  | 'Code'
  | 'Equation'
  | 'Figure'
  | 'Footer'
  | 'Handwriting'
  | 'Header'
  | 'ListItem'
  | 'Other'
  | 'PageNumber'
  | 'SectionHeader'
  | 'Table'
  | 'Text'
  | 'Title'
  | 'Watermark'
/**
 * A single structured content element of a parsed document.
 */
export interface ReductoParseBlock {
  /** Classification of the block */
  type: ReductoBlockType
  /** Content of the block */
  content: string
  /** Location of the block */
  bbox: ReductoBoundingBox
  /** Overall confidence, or `null` when not provided */
  confidence: string | null
  /** Per-aspect confidence, or `null` when not provided */
  granular_confidence: ReductoGranularConfidence | null
  /** Image URL associated with the block, or `null` */
  image_url: string | null
  /** Chart data associated with the block, or `null` */
  chart_data: string[] | null
  /** Additional untyped data, or `null` */
  extra: Record<string, unknown> | null
}
/**
 * A segment of the parsed document together with the blocks it contains.
 */
export interface ReductoParseChunk {
  /** Content of the chunk */
  content: string
  /** Blocks that make up the chunk */
  blocks: ReductoParseBlock[]
  /** Presumably the embedding-oriented form of the content; confirm against the Reducto API docs */
  embed: string
  /** Enriched content, or `null` when none is available */
  enriched: string | null
  /** Whether enrichment succeeded */
  enrichment_success: boolean
}
/**
 * A single word recognized by OCR.
 */
export interface ReductoOcrWord {
  /** Recognized text */
  text: string
  /** Numeric confidence for the word */
  confidence: number
  /** Location of the word */
  bbox: ReductoBoundingBox
}
/**
 * A line of text recognized by OCR, with the words that belong to it.
 */
export interface ReductoOcrLine {
  /** Recognized text of the line */
  text: string
  /** Words contained in the line */
  words: ReductoOcrWord[]
  /** Location of the line */
  bbox: ReductoBoundingBox
}
/**
 * OCR output for a document: recognized words and lines.
 */
export interface ReductoOcrResult {
  /** Recognized words */
  words: ReductoOcrWord[]
  /** Recognized lines */
  lines: ReductoOcrLine[]
}
/**
 * Inline result variant, used when the parsed content fits in the response payload.
 * Discriminated by `type: 'full'`.
 */
export interface ReductoFullResult {
  /** Discriminator for the inline variant */
  type: 'full'
  /** Parsed document segments */
  chunks: ReductoParseChunk[]
  /** OCR data, or `null` when not present */
  ocr: ReductoOcrResult | null
  /** Untyped custom data */
  custom: unknown
}
/**
 * By-reference result variant, used when the response exceeds size limits.
 * Discriminated by `type: 'url'`.
 */
export interface ReductoUrlResult {
  /** Discriminator for the by-reference variant */
  type: 'url'
  /** URL of the result */
  url: string
}
/**
 * Usage figures that the Reducto API reports for a job.
 */
export interface ReductoUsage {
  /** Credits reported for the job, or `null` when not provided */
  credits: number | null
  /** Number of pages reported for the job */
  num_pages: number
}
/**
 * Output structure of the Reducto parser tool, mirroring the native Reducto API response.
 */
export interface ReductoParserOutputData {
  /** Identifier of the processing job */
  job_id: string
  /** Duration reported for the job */
  duration: number
  /** Usage figures for the job */
  usage: ReductoUsage
  /** Parsed content, either inline (`full`) or by reference (`url`) */
  result: ReductoFullResult | ReductoUrlResult
  /** PDF URL, or `null` when not provided */
  pdf_url: string | null
  /** Studio link, or `null` when not provided */
  studio_link: string | null
}
/**
 * Tool-level response of the Reducto parser: the shared `ToolResponse` fields plus
 * the Reducto-specific payload.
 */
export interface ReductoParserOutput extends ToolResponse {
  /** Payload carrying the native Reducto API fields */
  output: ReductoParserOutputData
}

View File

@@ -1032,6 +1032,7 @@ import {
posthogUpdatePropertyDefinitionTool,
posthogUpdateSurveyTool,
} from '@/tools/posthog'
import { pulseParserTool } from '@/tools/pulse'
import { qdrantFetchTool, qdrantSearchTool, qdrantUpsertTool } from '@/tools/qdrant'
import {
rdsDeleteTool,
@@ -1056,6 +1057,7 @@ import {
redditUnsaveTool,
redditVoteTool,
} from '@/tools/reddit'
import { reductoParserTool } from '@/tools/reducto'
import { mailSendTool } from '@/tools/resend'
import {
s3CopyObjectTool,
@@ -2126,6 +2128,7 @@ export const tools: Record<string, ToolConfig> = {
google_slides_add_image: googleSlidesAddImageTool,
perplexity_chat: perplexityChatTool,
perplexity_search: perplexitySearchTool,
pulse_parser: pulseParserTool,
posthog_capture_event: posthogCaptureEventTool,
posthog_batch_events: posthogBatchEventsTool,
posthog_list_persons: posthogListPersonsTool,
@@ -2248,6 +2251,7 @@ export const tools: Record<string, ToolConfig> = {
apollo_task_search: apolloTaskSearchTool,
apollo_email_accounts: apolloEmailAccountsTool,
mistral_parser: mistralParserTool,
reducto_parser: reductoParserTool,
thinking_tool: thinkingTool,
tinybird_events: tinybirdEventsTool,
tinybird_query: tinybirdQueryTool,