feat(ocr): added reducto and pulse for OCR (#2843)

* feat(ocr): added reducto and pulse for OCR * ack comments
2026-04-28 03:00:29 -04:00 · 2026-01-15 18:30:39 -08:00
parent b813bf7f27
commit 12470a630c
18 changed files with 2212 additions and 0 deletions
--- a/apps/sim/tools/pulse/index.ts
+++ b/apps/sim/tools/pulse/index.ts
@@ -0,0 +1,2 @@
+export { pulseParserTool } from '@/tools/pulse/parser'
+export * from './types'
--- a/apps/sim/tools/pulse/parser.ts
+++ b/apps/sim/tools/pulse/parser.ts
@@ -0,0 +1,283 @@
+import { createLogger } from '@sim/logger'
+import { getBaseUrl } from '@/lib/core/utils/urls'
+import type { PulseParserInput, PulseParserOutput } from '@/tools/pulse/types'
+import type { ToolConfig } from '@/tools/types'
+
+const logger = createLogger('PulseParserTool')
+
+/**
+ * Tool configuration for the Pulse document parser.
+ *
+ * Requests go to the application's `/api/tools/pulse/parse` route. `request.body`
+ * validates the inputs and resolves the document location; `transformResponse`
+ * copies the reply onto the declared outputs, substituting defaults for absent fields.
+ */
+export const pulseParserTool: ToolConfig<PulseParserInput, PulseParserOutput> = {
+  id: 'pulse_parser',
+  name: 'Pulse Document Parser',
+  description: 'Parse documents (PDF, images, Office docs) using Pulse OCR API',
+  version: '1.0.0',
+
+  params: {
+    filePath: {
+      type: 'string',
+      required: true,
+      visibility: 'user-only',
+      description: 'URL to a document to be processed',
+    },
+    fileUpload: {
+      type: 'object',
+      required: false,
+      visibility: 'hidden',
+      description: 'File upload data from file-upload component',
+    },
+    pages: {
+      type: 'string',
+      required: false,
+      visibility: 'user-only',
+      description: 'Page range to process (1-indexed, e.g., "1-2,5")',
+    },
+    extractFigure: {
+      type: 'boolean',
+      required: false,
+      visibility: 'hidden',
+      description: 'Enable figure extraction from the document',
+    },
+    figureDescription: {
+      type: 'boolean',
+      required: false,
+      visibility: 'hidden',
+      description: 'Generate descriptions/captions for extracted figures',
+    },
+    returnHtml: {
+      type: 'boolean',
+      required: false,
+      visibility: 'hidden',
+      description: 'Include HTML in the response',
+    },
+    chunking: {
+      type: 'string',
+      required: false,
+      visibility: 'user-only',
+      description: 'Chunking strategies (comma-separated: semantic, header, page, recursive)',
+    },
+    chunkSize: {
+      type: 'number',
+      required: false,
+      visibility: 'user-only',
+      description: 'Maximum characters per chunk when chunking is enabled',
+    },
+    apiKey: {
+      type: 'string',
+      required: true,
+      visibility: 'user-only',
+      description: 'Pulse API key',
+    },
+  },
+
+  request: {
+    url: '/api/tools/pulse/parse',
+    method: 'POST',
+    headers: () => {
+      return {
+        'Content-Type': 'application/json',
+        Accept: 'application/json',
+      }
+    },
+    /**
+     * Builds the JSON body sent to the parse route.
+     *
+     * @param params - Tool inputs. The object is read but never modified.
+     * @returns A body holding the trimmed `apiKey`, the resolved `filePath`, and
+     *   whichever optional settings were supplied.
+     * @throws Error when the params object, API key, upload data, or document URL is invalid.
+     */
+    body: (params) => {
+      if (!params || typeof params !== 'object') {
+        throw new Error('Invalid parameters: Parameters must be provided as an object')
+      }
+
+      if (!params.apiKey || typeof params.apiKey !== 'string' || params.apiKey.trim() === '') {
+        throw new Error('Missing or invalid API key: A valid Pulse API key is required')
+      }
+
+      // Resolve the document location into a local so the caller's params stay untouched.
+      let filePath: string | undefined = params.filePath
+
+      // Fall back to the uploaded file only when no usable URL was supplied.
+      if (params.fileUpload && (!filePath || filePath === 'null')) {
+        const upload = params.fileUpload
+
+        // `||` rather than `??`: an empty-string `url` must fall through to `path`,
+        // otherwise an upload that only carries a usable `path` would be rejected.
+        let uploadedFilePath = typeof upload === 'object' ? upload.url || upload.path || '' : ''
+
+        if (!uploadedFilePath) {
+          throw new Error('Invalid file upload: Upload data is missing or invalid')
+        }
+
+        // Root-relative upload paths are turned into absolute URLs on the app's base URL.
+        if (uploadedFilePath.startsWith('/')) {
+          const baseUrl = getBaseUrl()
+          if (!baseUrl) throw new Error('Failed to get base URL for file path conversion')
+          uploadedFilePath = `${baseUrl}${uploadedFilePath}`
+        }
+
+        filePath = uploadedFilePath
+        logger.info('Using uploaded file:', uploadedFilePath)
+      }
+
+      if (!filePath || typeof filePath !== 'string' || filePath.trim() === '') {
+        throw new Error('Missing or invalid file path: Please provide a URL to a document')
+      }
+
+      let filePathToValidate = filePath.trim()
+      if (filePathToValidate.startsWith('/')) {
+        const baseUrl = getBaseUrl()
+        if (!baseUrl) throw new Error('Failed to get base URL for file path conversion')
+        filePathToValidate = `${baseUrl}${filePathToValidate}`
+      }
+
+      let url
+      try {
+        url = new URL(filePathToValidate)
+
+        if (!['http:', 'https:'].includes(url.protocol)) {
+          throw new Error(`Invalid protocol: ${url.protocol}. URL must use HTTP or HTTPS protocol`)
+        }
+
+        if (url.hostname.includes('drive.google.com') || url.hostname.includes('docs.google.com')) {
+          throw new Error(
+            'Google Drive links are not supported. ' +
+              'Please upload your document or provide a direct download link.'
+          )
+        }
+      } catch (error) {
+        // Parse failures and the checks above are both reported as an invalid URL.
+        const errorMessage = error instanceof Error ? error.message : String(error)
+        throw new Error(
+          `Invalid URL format: ${errorMessage}. Please provide a valid HTTP or HTTPS URL to a document`
+        )
+      }
+
+      const requestBody: Record<string, unknown> = {
+        apiKey: params.apiKey.trim(),
+        filePath: url.toString(),
+      }
+
+      // Internal workspace files are forwarded by their serve path instead of the absolute URL.
+      if (params.fileUpload?.path?.startsWith('/api/files/serve/')) {
+        requestBody.filePath = params.fileUpload.path
+      }
+
+      // Optional settings are included only when they carry a meaningful value.
+      if (params.pages && typeof params.pages === 'string' && params.pages.trim() !== '') {
+        requestBody.pages = params.pages.trim()
+      }
+
+      if (params.extractFigure !== undefined) {
+        requestBody.extractFigure = params.extractFigure
+      }
+
+      if (params.figureDescription !== undefined) {
+        requestBody.figureDescription = params.figureDescription
+      }
+
+      if (params.returnHtml !== undefined) {
+        requestBody.returnHtml = params.returnHtml
+      }
+
+      if (params.chunking && typeof params.chunking === 'string' && params.chunking.trim() !== '') {
+        requestBody.chunking = params.chunking.trim()
+      }
+
+      if (params.chunkSize !== undefined && params.chunkSize > 0) {
+        requestBody.chunkSize = params.chunkSize
+      }
+
+      return requestBody
+    },
+  },
+
+  /**
+   * Converts the route's JSON reply into the tool output.
+   *
+   * Accepts either a reply whose payload sits under an `output` object or the bare
+   * payload itself. Missing optional fields become `null`.
+   *
+   * @throws Error when the reply is not valid JSON or is not an object.
+   */
+  transformResponse: async (response) => {
+    let parseResult
+    try {
+      parseResult = await response.json()
+    } catch (jsonError) {
+      throw new Error(
+        `Failed to parse Pulse response: ${jsonError instanceof Error ? jsonError.message : String(jsonError)}`
+      )
+    }
+
+    if (!parseResult || typeof parseResult !== 'object') {
+      throw new Error('Invalid response format from Pulse API')
+    }
+
+    // Pass through the native Pulse API response
+    const pulseData =
+      parseResult.output && typeof parseResult.output === 'object'
+        ? parseResult.output
+        : parseResult
+
+    return {
+      success: true,
+      output: {
+        markdown: pulseData.markdown ?? '',
+        page_count: pulseData.page_count ?? 0,
+        job_id: pulseData.job_id ?? '',
+        'plan-info': pulseData['plan-info'] ?? { pages_used: 0, tier: 'unknown' },
+        bounding_boxes: pulseData.bounding_boxes ?? null,
+        extraction_url: pulseData.extraction_url ?? null,
+        html: pulseData.html ?? null,
+        structured_output: pulseData.structured_output ?? null,
+        chunks: pulseData.chunks ?? null,
+        figures: pulseData.figures ?? null,
+      },
+    }
+  },
+
+  outputs: {
+    markdown: {
+      type: 'string',
+      description: 'Extracted content in markdown format',
+    },
+    page_count: {
+      type: 'number',
+      description: 'Number of pages in the document',
+    },
+    job_id: {
+      type: 'string',
+      description: 'Unique job identifier',
+    },
+    'plan-info': {
+      type: 'object',
+      description: 'Plan usage information',
+      properties: {
+        pages_used: { type: 'number', description: 'Number of pages used' },
+        tier: { type: 'string', description: 'Plan tier' },
+        note: { type: 'string', description: 'Optional note', optional: true },
+      },
+    },
+    bounding_boxes: {
+      type: 'json',
+      description: 'Bounding box layout information',
+      optional: true,
+    },
+    extraction_url: {
+      type: 'string',
+      description: 'URL for extraction results (for large documents)',
+      optional: true,
+    },
+    html: {
+      type: 'string',
+      description: 'HTML content if requested',
+      optional: true,
+    },
+    structured_output: {
+      type: 'json',
+      description: 'Structured output if schema was provided',
+      optional: true,
+    },
+    chunks: {
+      type: 'json',
+      description: 'Chunked content if chunking was enabled',
+      optional: true,
+    },
+    figures: {
+      type: 'json',
+      description: 'Extracted figures if figure extraction was enabled',
+      optional: true,
+    },
+  },
+}
--- a/apps/sim/tools/pulse/types.ts
+++ b/apps/sim/tools/pulse/types.ts
@@ -0,0 +1,93 @@
+import type { ToolResponse } from '@/tools/types'
+
+/**
+ * Input parameters for the Pulse parser tool.
+ *
+ * The document is identified either by `filePath` or, when `filePath` is empty or
+ * the literal string `'null'`, by `fileUpload`.
+ */
+export interface PulseParserInput {
+  /** URL to a document to be processed; a path starting with `/` is resolved against the app base URL */
+  filePath: string
+
+  /** File upload data (from file-upload component); `url` or `path` locates the uploaded file */
+  fileUpload?: {
+    url?: string
+    path?: string
+  }
+
+  /** Pulse API key for authentication */
+  apiKey: string
+
+  /** Page range to process (1-indexed, e.g., "1-2,5") */
+  pages?: string
+
+  /** Whether to extract figures from the document */
+  extractFigure?: boolean
+
+  /** Whether to generate figure descriptions/captions */
+  figureDescription?: boolean
+
+  /** Whether to include HTML in the response */
+  returnHtml?: boolean
+
+  /** Chunking strategies (comma-separated: semantic, header, page, recursive) */
+  chunking?: string
+
+  /** Maximum characters per chunk when chunking is enabled; the parser forwards it only when greater than 0 */
+  chunkSize?: number
+}
+
+/**
+ * Plan info returned by the Pulse API.
+ *
+ * When the response carries no plan info, the parser tool substitutes
+ * `{ pages_used: 0, tier: 'unknown' }`.
+ */
+export interface PulsePlanInfo {
+  /** Number of pages used */
+  pages_used: number
+
+  /** Plan tier */
+  tier: string
+
+  /** Optional note */
+  note?: string
+}
+
+/**
+ * Native output structure from the Pulse API, as emitted by the parser tool.
+ *
+ * The optional fields admit `null` because the tool's `transformResponse` writes
+ * `null` (not `undefined`) for any of them that the response does not contain.
+ */
+export interface PulseParserOutputData {
+  /** Extracted content in markdown format */
+  markdown: string
+
+  /** Number of pages in the document */
+  page_count: number
+
+  /** Unique job identifier */
+  job_id: string
+
+  /** Plan usage information */
+  'plan-info': PulsePlanInfo
+
+  /** Bounding box layout information; `null` when absent from the response */
+  bounding_boxes?: Record<string, unknown> | null
+
+  /** URL for extraction results (for large documents); `null` when absent from the response */
+  extraction_url?: string | null
+
+  /** HTML content if requested; `null` when absent from the response */
+  html?: string | null
+
+  /** Structured output if schema was provided; `null` when absent from the response */
+  structured_output?: Record<string, unknown> | null
+
+  /** Chunked content if chunking was enabled; `null` when absent from the response */
+  chunks?: unknown[] | null
+
+  /** Extracted figures if figure extraction was enabled; `null` when absent from the response */
+  figures?: unknown[] | null
+}
+
+/**
+ * Complete response from the Pulse parser tool: the shared `ToolResponse`
+ * envelope with `output` narrowed to the Pulse payload.
+ */
+export interface PulseParserOutput extends ToolResponse {
+  /** The native Pulse API output */
+  output: PulseParserOutputData
+}
--- a/apps/sim/tools/reducto/index.ts
+++ b/apps/sim/tools/reducto/index.ts
@@ -0,0 +1,3 @@
+import { reductoParserTool } from '@/tools/reducto/parser'
+
+export { reductoParserTool }
--- a/apps/sim/tools/reducto/parser.ts
+++ b/apps/sim/tools/reducto/parser.ts
@@ -0,0 +1,203 @@
+import { createLogger } from '@sim/logger'
+import { getBaseUrl } from '@/lib/core/utils/urls'
+import type { ReductoParserInput, ReductoParserOutput } from '@/tools/reducto/types'
+import type { ToolConfig } from '@/tools/types'
+
+const logger = createLogger('ReductoParserTool')
+
+/**
+ * Tool configuration for the Reducto PDF parser.
+ *
+ * Requests go to the application's `/api/tools/reducto/parse` route. `request.body`
+ * validates the inputs and resolves the document location; `transformResponse`
+ * copies the reply onto the declared outputs.
+ */
+export const reductoParserTool: ToolConfig<ReductoParserInput, ReductoParserOutput> = {
+  id: 'reducto_parser',
+  name: 'Reducto PDF Parser',
+  description: 'Parse PDF documents using Reducto OCR API',
+  version: '1.0.0',
+
+  params: {
+    filePath: {
+      type: 'string',
+      required: true,
+      visibility: 'user-only',
+      description: 'URL to a PDF document to be processed',
+    },
+    fileUpload: {
+      type: 'object',
+      required: false,
+      visibility: 'hidden',
+      description: 'File upload data from file-upload component',
+    },
+    pages: {
+      type: 'array',
+      required: false,
+      visibility: 'user-only',
+      description: 'Specific pages to process (1-indexed page numbers)',
+    },
+    tableOutputFormat: {
+      type: 'string',
+      required: false,
+      visibility: 'user-or-llm',
+      // The description must name the exact accepted values: anything other than
+      // "html" or "md" is dropped by `request.body`.
+      description: 'Table output format ("html" or "md"). Defaults to markdown ("md").',
+    },
+    apiKey: {
+      type: 'string',
+      required: true,
+      visibility: 'user-only',
+      description: 'Reducto API key (REDUCTO_API_KEY)',
+    },
+  },
+
+  request: {
+    url: '/api/tools/reducto/parse',
+    method: 'POST',
+    headers: (params) => {
+      // NOTE(review): the key is also sent in the JSON body; confirm the route
+      // actually needs it in the Authorization header as well.
+      return {
+        'Content-Type': 'application/json',
+        Accept: 'application/json',
+        Authorization: `Bearer ${params.apiKey}`,
+      }
+    },
+    /**
+     * Builds the JSON body sent to the parse route.
+     *
+     * @param params - Tool inputs. The object is read but never modified.
+     * @returns A body holding the trimmed `apiKey`, the resolved `filePath`, and
+     *   the table format / page list when valid values were supplied.
+     * @throws Error when the params object, API key, upload data, or document URL is invalid.
+     */
+    body: (params) => {
+      if (!params || typeof params !== 'object') {
+        throw new Error('Invalid parameters: Parameters must be provided as an object')
+      }
+
+      if (!params.apiKey || typeof params.apiKey !== 'string' || params.apiKey.trim() === '') {
+        throw new Error('Missing or invalid API key: A valid Reducto API key is required')
+      }
+
+      // Resolve the document location into a local so the caller's params stay untouched.
+      let filePath: string | undefined = params.filePath
+
+      // Fall back to the uploaded file only when no usable URL was supplied.
+      if (params.fileUpload && (!filePath || filePath === 'null')) {
+        const upload = params.fileUpload
+        let uploadedFilePath = typeof upload === 'object' ? upload.url || upload.path || '' : ''
+
+        if (!uploadedFilePath) {
+          throw new Error('Invalid file upload: Upload data is missing or invalid')
+        }
+
+        // Root-relative upload paths are turned into absolute URLs on the app's base URL.
+        if (uploadedFilePath.startsWith('/')) {
+          const baseUrl = getBaseUrl()
+          if (!baseUrl) throw new Error('Failed to get base URL for file path conversion')
+          uploadedFilePath = `${baseUrl}${uploadedFilePath}`
+        }
+
+        filePath = uploadedFilePath
+        logger.info('Using uploaded file:', uploadedFilePath)
+      }
+
+      if (!filePath || typeof filePath !== 'string' || filePath.trim() === '') {
+        throw new Error('Missing or invalid file path: Please provide a URL to a PDF document')
+      }
+
+      let filePathToValidate = filePath.trim()
+      if (filePathToValidate.startsWith('/')) {
+        const baseUrl = getBaseUrl()
+        if (!baseUrl) throw new Error('Failed to get base URL for file path conversion')
+        filePathToValidate = `${baseUrl}${filePathToValidate}`
+      }
+
+      let url
+      try {
+        url = new URL(filePathToValidate)
+
+        if (!['http:', 'https:'].includes(url.protocol)) {
+          throw new Error(`Invalid protocol: ${url.protocol}. URL must use HTTP or HTTPS protocol`)
+        }
+
+        if (url.hostname.includes('drive.google.com') || url.hostname.includes('docs.google.com')) {
+          throw new Error(
+            'Google Drive links are not supported by the Reducto API. ' +
+              'Please upload your PDF to a public web server or provide a direct download link.'
+          )
+        }
+      } catch (error) {
+        // Parse failures and the checks above are both reported as an invalid URL.
+        const errorMessage = error instanceof Error ? error.message : String(error)
+        throw new Error(
+          `Invalid URL format: ${errorMessage}. Please provide a valid HTTP or HTTPS URL to a PDF document.`
+        )
+      }
+
+      const requestBody: Record<string, unknown> = {
+        // Send the same trimmed value that passed validation above.
+        apiKey: params.apiKey.trim(),
+        filePath: url.toString(),
+      }
+
+      // Internal workspace files are forwarded by their serve path instead of the absolute URL.
+      if (params.fileUpload?.path?.startsWith('/api/files/serve/')) {
+        requestBody.filePath = params.fileUpload.path
+      }
+
+      if (params.tableOutputFormat && ['html', 'md'].includes(params.tableOutputFormat)) {
+        requestBody.tableOutputFormat = params.tableOutputFormat
+      }
+
+      // Page selection: pages are 1-indexed, so only positive integers are kept.
+      if (Array.isArray(params.pages) && params.pages.length > 0) {
+        const validPages = params.pages.filter(
+          (page) => typeof page === 'number' && Number.isInteger(page) && page >= 1
+        )
+
+        if (validPages.length > 0) {
+          requestBody.pages = validPages
+        }
+      }
+
+      return requestBody
+    },
+  },
+
+  /**
+   * Converts the route's JSON reply into the tool output.
+   *
+   * Uses the reply's `output` property when present, otherwise the reply itself.
+   *
+   * @throws Error when the reply is not valid JSON or is not an object.
+   */
+  transformResponse: async (response) => {
+    let data
+    try {
+      data = await response.json()
+    } catch (jsonError) {
+      throw new Error(
+        `Failed to parse Reducto response: ${jsonError instanceof Error ? jsonError.message : String(jsonError)}`
+      )
+    }
+
+    if (!data || typeof data !== 'object') {
+      throw new Error('Invalid response format from Reducto API')
+    }
+
+    // Pass through the native Reducto response
+    const reductoData = data.output ?? data
+
+    return {
+      success: true,
+      output: {
+        job_id: reductoData.job_id,
+        duration: reductoData.duration,
+        usage: reductoData.usage,
+        result: reductoData.result,
+        pdf_url: reductoData.pdf_url ?? null,
+        studio_link: reductoData.studio_link ?? null,
+      },
+    }
+  },
+
+  outputs: {
+    job_id: { type: 'string', description: 'Unique identifier for the processing job' },
+    duration: { type: 'number', description: 'Processing time in seconds' },
+    usage: {
+      type: 'json',
+      description: 'Resource consumption data',
+    },
+    result: {
+      type: 'json',
+      description: 'Parsed document content with chunks and blocks',
+    },
+    pdf_url: {
+      type: 'string',
+      description: 'Storage URL of converted PDF',
+      optional: true,
+    },
+    studio_link: {
+      type: 'string',
+      description: 'Link to Reducto studio interface',
+      optional: true,
+    },
+  },
+}
--- a/apps/sim/tools/reducto/types.ts
+++ b/apps/sim/tools/reducto/types.ts
@@ -0,0 +1,160 @@
+import type { ToolResponse } from '@/tools/types'
+
+/**
+ * Input parameters for the Reducto parser tool.
+ *
+ * The document is identified either by `filePath` or, when `filePath` is empty or
+ * the literal string `'null'`, by `fileUpload`.
+ */
+export interface ReductoParserInput {
+  /** URL to a document to be processed; a path starting with `/` is resolved against the app base URL */
+  filePath: string
+
+  /** File upload data (from file-upload component); `url` or `path` locates the uploaded file */
+  fileUpload?: {
+    url?: string
+    path?: string
+  }
+
+  /** Reducto API key for authentication */
+  apiKey: string
+
+  /** Specific pages to process (1-indexed) */
+  pages?: number[]
+
+  /** Table output format (html or md); the parser ignores any other value */
+  tableOutputFormat?: 'html' | 'md'
+}
+
+/**
+ * Bounding box for spatial location data: a rectangle given by its left/top
+ * corner and its width/height, plus the page it belongs to.
+ */
+export interface ReductoBoundingBox {
+  // NOTE(review): coordinate units (pixels vs. page-relative fractions) are not
+  // visible here — confirm against the Reducto API reference.
+  left: number
+  top: number
+  width: number
+  height: number
+  // NOTE(review): presumably a page number; confirm whether it is 0- or 1-indexed.
+  page: number
+}
+
+/**
+ * Granular confidence scores, one per aspect; each is a string or `null`.
+ */
+export interface ReductoGranularConfidence {
+  // NOTE(review): the set of possible string values is not visible here — confirm upstream.
+  ocr: string | null
+  layout: string | null
+  order: string | null
+}
+
+/**
+ * Block type classification: the closed set of labels a `ReductoParseBlock`
+ * may carry in its `type` field.
+ */
+export type ReductoBlockType =
+  | 'Header'
+  | 'Footer'
+  | 'Title'
+  | 'SectionHeader'
+  | 'Text'
+  | 'ListItem'
+  | 'Table'
+  | 'Figure'
+  | 'Caption'
+  | 'Equation'
+  | 'Code'
+  | 'PageNumber'
+  | 'Watermark'
+  | 'Handwriting'
+  | 'Other'
+
+/**
+ * Parse block - structured content element with its classification, location,
+ * and text content. The remaining fields are nullable.
+ */
+export interface ReductoParseBlock {
+  /** Classification label of the block */
+  type: ReductoBlockType
+  /** Location of the block */
+  bbox: ReductoBoundingBox
+  /** Content of the block as a string */
+  content: string
+  // NOTE(review): the conditions under which the fields below are populated are
+  // not visible here — confirm against the Reducto API reference.
+  image_url: string | null
+  chart_data: string[] | null
+  confidence: string | null
+  granular_confidence: ReductoGranularConfidence | null
+  extra: Record<string, unknown> | null
+}
+
+/**
+ * Parse chunk - document segment made up of one or more blocks.
+ */
+export interface ReductoParseChunk {
+  /** Content of the chunk as a string */
+  content: string
+  // NOTE(review): presumably the text variant intended for embedding — confirm upstream.
+  embed: string
+  // NOTE(review): presumably related to `enrichment_success`; confirm when this is null.
+  enriched: string | null
+  /** Blocks contained in this chunk */
+  blocks: ReductoParseBlock[]
+  enrichment_success: boolean
+}
+
+/**
+ * OCR word data: the recognised text, its location, and a numeric confidence.
+ */
+export interface ReductoOcrWord {
+  text: string
+  bbox: ReductoBoundingBox
+  // NOTE(review): value range of the score is not visible here — confirm upstream.
+  confidence: number
+}
+
+/**
+ * OCR line data: the line's text, its location, and the words it contains.
+ */
+export interface ReductoOcrLine {
+  text: string
+  bbox: ReductoBoundingBox
+  words: ReductoOcrWord[]
+}
+
+/**
+ * OCR result data, exposed both as a list of lines and as a list of words.
+ */
+export interface ReductoOcrResult {
+  lines: ReductoOcrLine[]
+  words: ReductoOcrWord[]
+}
+
+/**
+ * Full result - when response fits in payload.
+ *
+ * Distinguished from `ReductoUrlResult` by the literal `type: 'full'`.
+ */
+export interface ReductoFullResult {
+  /** Discriminant tag */
+  type: 'full'
+  /** Parsed document segments */
+  chunks: ReductoParseChunk[]
+  /** OCR data, or `null` */
+  ocr: ReductoOcrResult | null
+  // NOTE(review): shape is intentionally untyped here — narrow before use.
+  custom: unknown
+}
+
+/**
+ * URL result - when response exceeds size limits.
+ *
+ * Distinguished from `ReductoFullResult` by the literal `type: 'url'`.
+ */
+export interface ReductoUrlResult {
+  /** Discriminant tag */
+  type: 'url'
+  // NOTE(review): presumably where the full result can be fetched — confirm upstream.
+  url: string
+}
+
+/**
+ * Usage information returned by Reducto API
+ */
+export interface ReductoUsage {
+  /** Page count reported for the job */
+  num_pages: number
+  /** Credits reported for the job, or `null` */
+  credits: number | null
+}
+
+/**
+ * Native Reducto API response structure, as emitted by the parser tool.
+ */
+export interface ReductoParserOutputData {
+  /** Unique identifier for the processing job */
+  job_id: string
+  /** Processing time in seconds */
+  duration: number
+  /** Resource consumption data */
+  usage: ReductoUsage
+  /** Parsed content, either inline or by URL; check `result.type` to tell the two apart */
+  result: ReductoFullResult | ReductoUrlResult
+  /** Storage URL of converted PDF; the parser writes `null` when the response omits it */
+  pdf_url: string | null
+  /** Link to Reducto studio interface; the parser writes `null` when the response omits it */
+  studio_link: string | null
+}
+
+/**
+ * Complete response from the Reducto parser tool: the shared `ToolResponse`
+ * envelope with `output` narrowed to the Reducto payload.
+ */
+export interface ReductoParserOutput extends ToolResponse {
+  /** The native Reducto API output */
+  output: ReductoParserOutputData
+}
--- a/apps/sim/tools/registry.ts
+++ b/apps/sim/tools/registry.ts
@@ -1032,6 +1032,7 @@ import {
  posthogUpdatePropertyDefinitionTool,
  posthogUpdateSurveyTool,
 } from '@/tools/posthog'
+import { pulseParserTool } from '@/tools/pulse'
 import { qdrantFetchTool, qdrantSearchTool, qdrantUpsertTool } from '@/tools/qdrant'
 import {
  rdsDeleteTool,
@@ -1056,6 +1057,7 @@ import {
  redditUnsaveTool,
  redditVoteTool,
 } from '@/tools/reddit'
+import { reductoParserTool } from '@/tools/reducto'
 import { mailSendTool } from '@/tools/resend'
 import {
  s3CopyObjectTool,
@@ -2126,6 +2128,7 @@ export const tools: Record<string, ToolConfig> = {
  google_slides_add_image: googleSlidesAddImageTool,
  perplexity_chat: perplexityChatTool,
  perplexity_search: perplexitySearchTool,
+  pulse_parser: pulseParserTool,
  posthog_capture_event: posthogCaptureEventTool,
  posthog_batch_events: posthogBatchEventsTool,
  posthog_list_persons: posthogListPersonsTool,
@@ -2248,6 +2251,7 @@ export const tools: Record<string, ToolConfig> = {
  apollo_task_search: apolloTaskSearchTool,
  apollo_email_accounts: apolloEmailAccountsTool,
  mistral_parser: mistralParserTool,
+  reducto_parser: reductoParserTool,
  thinking_tool: thinkingTool,
  tinybird_events: tinybirdEventsTool,
  tinybird_query: tinybirdQueryTool,