feat(ocr): added reducto and pulse for OCR (#2843)

* feat(ocr): added reducto and pulse for OCR
* ack comments
2026-04-28 03:00:29 -04:00 · 2026-01-15 18:30:39 -08:00
parent b813bf7f27
commit 12470a630c
18 changed files with 2212 additions and 0 deletions
--- a/apps/sim/blocks/blocks/pulse.ts
+++ b/apps/sim/blocks/blocks/pulse.ts
@@ -0,0 +1,165 @@
+import { PulseIcon } from '@/components/icons'
+import { AuthMode, type BlockConfig, type SubBlockType } from '@/blocks/types'
+import type { PulseParserOutput } from '@/tools/pulse/types'
+
+/**
+ * Block definition for the Pulse OCR integration.
+ *
+ * Offers a document URL or file upload as input, optional page, chunking and
+ * chunk-size settings, and forwards the validated values to the `pulse_parser` tool.
+ */
+export const PulseBlock: BlockConfig<PulseParserOutput> = {
+  type: 'pulse',
+  name: 'Pulse',
+  description: 'Extract text from documents using Pulse OCR',
+  authMode: AuthMode.ApiKey,
+  longDescription:
+    'Integrate Pulse into the workflow. Extract text from PDF documents, images, and Office files via URL or upload.',
+  docsLink: 'https://docs.sim.ai/tools/pulse',
+  category: 'tools',
+  bgColor: '#E0E0E0',
+  icon: PulseIcon,
+  subBlocks: [
+    {
+      id: 'inputMethod',
+      title: 'Select Input Method',
+      type: 'dropdown' as SubBlockType,
+      options: [
+        { id: 'url', label: 'Document URL' },
+        { id: 'upload', label: 'Upload Document' },
+      ],
+    },
+    {
+      id: 'filePath',
+      title: 'Document URL',
+      type: 'short-input' as SubBlockType,
+      placeholder: 'Enter full URL to a document (https://example.com/document.pdf)',
+      condition: {
+        field: 'inputMethod',
+        value: 'url',
+      },
+    },
+    {
+      id: 'fileUpload',
+      title: 'Upload Document',
+      type: 'file-upload' as SubBlockType,
+      acceptedTypes: 'application/pdf,image/*,.docx,.pptx,.xlsx',
+      condition: {
+        field: 'inputMethod',
+        value: 'upload',
+      },
+      maxSize: 50,
+    },
+    {
+      id: 'pages',
+      title: 'Specific Pages',
+      type: 'short-input',
+      placeholder: 'e.g. 1-3,5 (leave empty for all pages)',
+    },
+    {
+      id: 'chunking',
+      title: 'Chunking Strategy',
+      type: 'short-input',
+      placeholder: 'e.g. semantic,header,page,recursive',
+    },
+    {
+      id: 'chunkSize',
+      title: 'Chunk Size',
+      type: 'short-input',
+      placeholder: 'Max characters per chunk',
+    },
+    {
+      id: 'apiKey',
+      title: 'API Key',
+      type: 'short-input' as SubBlockType,
+      placeholder: 'Enter your Pulse API key',
+      password: true,
+      required: true,
+    },
+  ],
+  tools: {
+    access: ['pulse_parser'],
+    config: {
+      tool: () => 'pulse_parser',
+      /**
+       * Validates the sub-block values and builds the `pulse_parser` parameters.
+       *
+       * @param params - Sub-block values keyed by sub-block id.
+       * @returns The API key, the document source, and any optional settings that were provided.
+       * @throws Error when the API key or selected document source is missing, the input
+       *   method is not recognized, or the chunk size is not a positive integer.
+       */
+      params: (params) => {
+        if (!params || !params.apiKey || params.apiKey.trim() === '') {
+          throw new Error('Pulse API key is required')
+        }
+
+        const parameters: Record<string, unknown> = {
+          apiKey: params.apiKey.trim(),
+        }
+
+        // An unset dropdown is treated as URL input.
+        const inputMethod = params.inputMethod || 'url'
+        if (inputMethod === 'url') {
+          if (!params.filePath || params.filePath.trim() === '') {
+            throw new Error('Document URL is required')
+          }
+          parameters.filePath = params.filePath.trim()
+        } else if (inputMethod === 'upload') {
+          if (!params.fileUpload) {
+            throw new Error('Please upload a document')
+          }
+          parameters.fileUpload = params.fileUpload
+        } else {
+          // Fail here rather than return parameters that carry no document source.
+          throw new Error(`Unsupported input method: ${String(inputMethod)}`)
+        }
+
+        if (params.pages && params.pages.trim() !== '') {
+          parameters.pages = params.pages.trim()
+        }
+
+        if (params.chunking && params.chunking.trim() !== '') {
+          parameters.chunking = params.chunking.trim()
+        }
+
+        // String() lets a numeric value through instead of failing on a missing .trim().
+        const chunkSizeText = params.chunkSize ? String(params.chunkSize).trim() : ''
+        if (chunkSizeText !== '') {
+          // Digits only: Number.parseInt alone would read '100abc' as 100 and '12.5' as 12.
+          const size = /^\d+$/.test(chunkSizeText) ? Number(chunkSizeText) : Number.NaN
+          if (!Number.isSafeInteger(size) || size <= 0) {
+            throw new Error(`Invalid chunk size: ${chunkSizeText}. Expected a positive integer.`)
+          }
+          parameters.chunkSize = size
+        }
+
+        return parameters
+      },
+    },
+  },
+  inputs: {
+    inputMethod: { type: 'string', description: 'Input method selection' },
+    filePath: { type: 'string', description: 'Document URL' },
+    fileUpload: { type: 'json', description: 'Uploaded document file' },
+    apiKey: { type: 'string', description: 'Pulse API key' },
+    pages: { type: 'string', description: 'Page range selection' },
+    chunking: {
+      type: 'string',
+      description: 'Chunking strategies (semantic, header, page, recursive)',
+    },
+    chunkSize: { type: 'string', description: 'Maximum characters per chunk' },
+  },
+  outputs: {
+    markdown: { type: 'string', description: 'Extracted content in markdown format' },
+    page_count: { type: 'number', description: 'Number of pages in the document' },
+    job_id: { type: 'string', description: 'Unique job identifier' },
+    'plan-info': { type: 'json', description: 'Plan usage information' },
+    bounding_boxes: { type: 'json', description: 'Bounding box layout information' },
+    extraction_url: { type: 'string', description: 'URL for extraction results (large documents)' },
+    html: { type: 'string', description: 'HTML content if requested' },
+    structured_output: { type: 'json', description: 'Structured output if schema was provided' },
+    chunks: { type: 'json', description: 'Chunked content if chunking was enabled' },
+    figures: { type: 'json', description: 'Extracted figures if figure extraction was enabled' },
+  },
+}
--- a/apps/sim/blocks/blocks/reducto.ts
+++ b/apps/sim/blocks/blocks/reducto.ts
@@ -0,0 +1,160 @@
+import { ReductoIcon } from '@/components/icons'
+import { AuthMode, type BlockConfig, type SubBlockType } from '@/blocks/types'
+import type { ReductoParserOutput } from '@/tools/reducto/types'
+
+/**
+ * Block definition for the Reducto Parse integration.
+ *
+ * Accepts a PDF by URL or upload, an optional comma-separated page list and a table
+ * output format, and forwards the validated values to the `reducto_parser` tool.
+ */
+export const ReductoBlock: BlockConfig<ReductoParserOutput> = {
+  type: 'reducto',
+  name: 'Reducto',
+  description: 'Extract text from PDF documents',
+  authMode: AuthMode.ApiKey,
+  longDescription: `Integrate Reducto Parse into the workflow. Can extract text from uploaded PDF documents, or from a URL.`,
+  docsLink: 'https://docs.sim.ai/tools/reducto',
+  category: 'tools',
+  bgColor: '#5c0c5c',
+  icon: ReductoIcon,
+  subBlocks: [
+    {
+      id: 'inputMethod',
+      title: 'Select Input Method',
+      type: 'dropdown' as SubBlockType,
+      options: [
+        { id: 'url', label: 'PDF Document URL' },
+        { id: 'upload', label: 'Upload PDF Document' },
+      ],
+    },
+    {
+      id: 'filePath',
+      title: 'PDF Document URL',
+      type: 'short-input' as SubBlockType,
+      placeholder: 'Enter full URL to a PDF document (https://example.com/document.pdf)',
+      condition: {
+        field: 'inputMethod',
+        value: 'url',
+      },
+    },
+    {
+      id: 'fileUpload',
+      title: 'Upload PDF',
+      type: 'file-upload' as SubBlockType,
+      acceptedTypes: 'application/pdf',
+      condition: {
+        field: 'inputMethod',
+        value: 'upload',
+      },
+      maxSize: 50,
+    },
+    {
+      id: 'pages',
+      title: 'Specific Pages',
+      type: 'short-input',
+      placeholder: 'e.g. 1,2,3 (1-indexed, leave empty for all)',
+    },
+    {
+      id: 'tableOutputFormat',
+      title: 'Table Format',
+      type: 'dropdown',
+      options: [
+        { id: 'md', label: 'Markdown' },
+        { id: 'html', label: 'HTML' },
+      ],
+    },
+    {
+      id: 'apiKey',
+      title: 'API Key',
+      type: 'short-input' as SubBlockType,
+      placeholder: 'Enter your Reducto API key',
+      password: true,
+      required: true,
+    },
+  ],
+  tools: {
+    access: ['reducto_parser'],
+    config: {
+      tool: () => 'reducto_parser',
+      /**
+       * Validates the sub-block values and builds the `reducto_parser` parameters.
+       *
+       * @param params - Sub-block values keyed by sub-block id.
+       * @returns The API key, the PDF source, and any optional settings that were provided;
+       *   `pages` is emitted as an array of numbers.
+       * @throws Error when the API key or selected PDF source is missing, the input method
+       *   is not recognized, or a page entry is not a whole number of at least 1.
+       */
+      params: (params) => {
+        if (!params || !params.apiKey || params.apiKey.trim() === '') {
+          throw new Error('Reducto API key is required')
+        }
+
+        const parameters: Record<string, unknown> = {
+          apiKey: params.apiKey.trim(),
+        }
+
+        // An unset dropdown is treated as URL input.
+        const inputMethod = params.inputMethod || 'url'
+        if (inputMethod === 'url') {
+          if (!params.filePath || params.filePath.trim() === '') {
+            throw new Error('PDF Document URL is required')
+          }
+          parameters.filePath = params.filePath.trim()
+        } else if (inputMethod === 'upload') {
+          if (!params.fileUpload) {
+            throw new Error('Please upload a PDF document')
+          }
+          parameters.fileUpload = params.fileUpload
+        } else {
+          // Fail here rather than return parameters that carry no document source.
+          throw new Error(`Unsupported input method: ${String(inputMethod)}`)
+        }
+
+        if (params.pages && params.pages.trim() !== '') {
+          const pageNumbers: number[] = params.pages
+            .split(',')
+            .map((p: string) => p.trim())
+            .filter((p: string) => p.length > 0)
+            .map((p: string) => {
+              // Digits only: Number.parseInt alone would read '1-3' as 1 and '2.5' as 2.
+              const num = /^\d+$/.test(p) ? Number(p) : Number.NaN
+              // The field is labelled 1-indexed, so 0 is rejected as well.
+              if (!Number.isSafeInteger(num) || num < 1) {
+                throw new Error(`Page number format error: Invalid page number: ${p}`)
+              }
+              return num
+            })
+
+          // Input made only of separators (e.g. ',') yields no pages; omit the parameter.
+          if (pageNumbers.length > 0) {
+            parameters.pages = pageNumbers
+          }
+        }
+
+        if (params.tableOutputFormat) {
+          parameters.tableOutputFormat = params.tableOutputFormat
+        }
+
+        return parameters
+      },
+    },
+  },
+  inputs: {
+    inputMethod: { type: 'string', description: 'Input method selection' },
+    filePath: { type: 'string', description: 'PDF document URL' },
+    fileUpload: { type: 'json', description: 'Uploaded PDF file' },
+    apiKey: { type: 'string', description: 'Reducto API key' },
+    pages: { type: 'string', description: 'Page selection' },
+    tableOutputFormat: { type: 'string', description: 'Table output format' },
+  },
+  outputs: {
+    job_id: { type: 'string', description: 'Unique identifier for the processing job' },
+    duration: { type: 'number', description: 'Processing time in seconds' },
+    usage: { type: 'json', description: 'Resource consumption data (num_pages, credits)' },
+    result: { type: 'json', description: 'Parsed document content with chunks and blocks' },
+    pdf_url: { type: 'string', description: 'Storage URL of converted PDF' },
+    studio_link: { type: 'string', description: 'Link to Reducto studio interface' },
+  },
+}
--- a/apps/sim/blocks/registry.ts
+++ b/apps/sim/blocks/registry.ts
@@ -93,9 +93,11 @@ import { PipedriveBlock } from '@/blocks/blocks/pipedrive'
 import { PolymarketBlock } from '@/blocks/blocks/polymarket'
 import { PostgreSQLBlock } from '@/blocks/blocks/postgresql'
 import { PostHogBlock } from '@/blocks/blocks/posthog'
+import { PulseBlock } from '@/blocks/blocks/pulse'
 import { QdrantBlock } from '@/blocks/blocks/qdrant'
 import { RDSBlock } from '@/blocks/blocks/rds'
 import { RedditBlock } from '@/blocks/blocks/reddit'
+import { ReductoBlock } from '@/blocks/blocks/reducto'
 import { ResendBlock } from '@/blocks/blocks/resend'
 import { ResponseBlock } from '@/blocks/blocks/response'
 import { RouterBlock, RouterV2Block } from '@/blocks/blocks/router'
@@ -237,6 +239,7 @@ export const registry: Record<string, BlockConfig> = {
  microsoft_planner: MicrosoftPlannerBlock,
  microsoft_teams: MicrosoftTeamsBlock,
  mistral_parse: MistralParseBlock,
+  reducto: ReductoBlock,
  mongodb: MongoDBBlock,
  mysql: MySQLBlock,
  neo4j: Neo4jBlock,
@@ -253,6 +256,7 @@ export const registry: Record<string, BlockConfig> = {
  polymarket: PolymarketBlock,
  postgresql: PostgreSQLBlock,
  posthog: PostHogBlock,
+  pulse: PulseBlock,
  qdrant: QdrantBlock,
  rds: RDSBlock,
  sqs: SQSBlock,