feat(ocr): added reducto and pulse for OCR (#2843)

* feat(ocr): added reducto and pulse for OCR

* ack comments
This commit is contained in:
Waleed
2026-01-15 18:30:39 -08:00
committed by GitHub
parent b813bf7f27
commit 12470a630c
18 changed files with 2212 additions and 0 deletions

View File

@@ -0,0 +1,143 @@
import { PulseIcon } from '@/components/icons'
import { AuthMode, type BlockConfig, type SubBlockType } from '@/blocks/types'
import type { PulseParserOutput } from '@/tools/pulse/types'
/**
 * Block definition for the Pulse OCR integration.
 *
 * Declares the editor fields (`subBlocks`), routes execution to the
 * `pulse_parser` tool, and describes the block's inputs and outputs.
 * The document is supplied either as a URL (`filePath`) or as an uploaded
 * file (`fileUpload`), selected through the `inputMethod` dropdown.
 */
export const PulseBlock: BlockConfig<PulseParserOutput> = {
  type: 'pulse',
  name: 'Pulse',
  description: 'Extract text from documents using Pulse OCR',
  authMode: AuthMode.ApiKey,
  longDescription:
    'Integrate Pulse into the workflow. Extract text from PDF documents, images, and Office files via URL or upload.',
  docsLink: 'https://docs.sim.ai/tools/pulse',
  category: 'tools',
  bgColor: '#E0E0E0',
  icon: PulseIcon,
  subBlocks: [
    {
      id: 'inputMethod',
      title: 'Select Input Method',
      type: 'dropdown' as SubBlockType,
      options: [
        { id: 'url', label: 'Document URL' },
        { id: 'upload', label: 'Upload Document' },
      ],
    },
    {
      // Shown only when the "Document URL" input method is selected.
      id: 'filePath',
      title: 'Document URL',
      type: 'short-input' as SubBlockType,
      placeholder: 'Enter full URL to a document (https://example.com/document.pdf)',
      condition: {
        field: 'inputMethod',
        value: 'url',
      },
    },
    {
      // Shown only when the "Upload Document" input method is selected.
      id: 'fileUpload',
      title: 'Upload Document',
      type: 'file-upload' as SubBlockType,
      acceptedTypes: 'application/pdf,image/*,.docx,.pptx,.xlsx',
      condition: {
        field: 'inputMethod',
        value: 'upload',
      },
      maxSize: 50,
    },
    {
      id: 'pages',
      title: 'Specific Pages',
      type: 'short-input',
      placeholder: 'e.g. 1-3,5 (leave empty for all pages)',
    },
    {
      id: 'chunking',
      title: 'Chunking Strategy',
      type: 'short-input',
      placeholder: 'e.g. semantic,header,page,recursive',
    },
    {
      id: 'chunkSize',
      title: 'Chunk Size',
      type: 'short-input',
      placeholder: 'Max characters per chunk',
    },
    {
      id: 'apiKey',
      title: 'API Key',
      type: 'short-input' as SubBlockType,
      placeholder: 'Enter your Pulse API key',
      password: true,
      required: true,
    },
  ],
  tools: {
    access: ['pulse_parser'],
    config: {
      tool: () => 'pulse_parser',
      /**
       * Validates the raw field values and builds the argument object for
       * `pulse_parser`.
       *
       * Optional fields (`pages`, `chunking`, `chunkSize`) are included only
       * when they are non-empty; a `chunkSize` that does not parse to a
       * positive integer is omitted rather than rejected.
       *
       * @throws Error when the API key is missing, or when the selected input
       *   method has no document URL / uploaded file.
       */
      params: (params) => {
        // Field values are usually strings, but calling `.trim()` directly
        // crashes on any other type (for example a numeric chunk size).
        // Falsy values are treated as "not provided", matching the checks
        // this helper replaces.
        const clean = (value: unknown): string => (value ? String(value).trim() : '')

        const apiKey = params ? clean(params.apiKey) : ''
        if (apiKey === '') {
          throw new Error('Pulse API key is required')
        }

        const parameters: Record<string, unknown> = { apiKey }

        // An unset dropdown falls back to the URL input method.
        const inputMethod = params.inputMethod || 'url'
        if (inputMethod === 'url') {
          const filePath = clean(params.filePath)
          if (filePath === '') {
            throw new Error('Document URL is required')
          }
          parameters.filePath = filePath
        } else if (inputMethod === 'upload') {
          if (!params.fileUpload) {
            throw new Error('Please upload a document')
          }
          parameters.fileUpload = params.fileUpload
        }

        const pages = clean(params.pages)
        if (pages !== '') {
          parameters.pages = pages
        }

        const chunking = clean(params.chunking)
        if (chunking !== '') {
          parameters.chunking = chunking
        }

        const chunkSizeText = clean(params.chunkSize)
        if (chunkSizeText !== '') {
          const size = Number.parseInt(chunkSizeText, 10)
          if (!Number.isNaN(size) && size > 0) {
            parameters.chunkSize = size
          }
        }

        return parameters
      },
    },
  },
  inputs: {
    inputMethod: { type: 'string', description: 'Input method selection' },
    filePath: { type: 'string', description: 'Document URL' },
    fileUpload: { type: 'json', description: 'Uploaded document file' },
    apiKey: { type: 'string', description: 'Pulse API key' },
    pages: { type: 'string', description: 'Page range selection' },
    chunking: {
      type: 'string',
      description: 'Chunking strategies (semantic, header, page, recursive)',
    },
    chunkSize: { type: 'string', description: 'Maximum characters per chunk' },
  },
  outputs: {
    markdown: { type: 'string', description: 'Extracted content in markdown format' },
    page_count: { type: 'number', description: 'Number of pages in the document' },
    job_id: { type: 'string', description: 'Unique job identifier' },
    'plan-info': { type: 'json', description: 'Plan usage information' },
    bounding_boxes: { type: 'json', description: 'Bounding box layout information' },
    extraction_url: { type: 'string', description: 'URL for extraction results (large documents)' },
    html: { type: 'string', description: 'HTML content if requested' },
    structured_output: { type: 'json', description: 'Structured output if schema was provided' },
    chunks: { type: 'json', description: 'Chunked content if chunking was enabled' },
    figures: { type: 'json', description: 'Extracted figures if figure extraction was enabled' },
  },
}

View File

@@ -0,0 +1,148 @@
import { ReductoIcon } from '@/components/icons'
import { AuthMode, type BlockConfig, type SubBlockType } from '@/blocks/types'
import type { ReductoParserOutput } from '@/tools/reducto/types'
/**
 * Converts the "Specific Pages" field into a list of page numbers.
 *
 * The value is split on commas; blank entries are ignored. Every remaining
 * entry must consist of digits only and be at least 1, because the field is
 * presented to the user as 1-indexed.
 *
 * @param rawPages - Raw field value; falsy values mean "all pages".
 * @returns The parsed page numbers, or `undefined` when no page was given.
 * @throws Error when an entry is not a positive whole number.
 */
function parseReductoPages(rawPages: unknown): number[] | undefined {
  if (!rawPages) {
    return undefined
  }

  const tokens = String(rawPages)
    .split(',')
    .map((token) => token.trim())
    .filter((token) => token.length > 0)

  if (tokens.length === 0) {
    return undefined
  }

  return tokens.map((token) => {
    // Require digits only: Number.parseInt alone would silently read '1-3'
    // as 1 and '2.5' as 2, sending a different selection than the user typed.
    const page = /^\d+$/.test(token) ? Number.parseInt(token, 10) : Number.NaN
    if (!Number.isSafeInteger(page) || page < 1) {
      throw new Error(`Page number format error: Invalid page number: ${token}`)
    }
    return page
  })
}

/**
 * Block definition for the Reducto document parsing integration.
 *
 * Declares the editor fields (`subBlocks`), routes execution to the
 * `reducto_parser` tool, and describes the block's inputs and outputs.
 * The PDF is supplied either as a URL (`filePath`) or as an uploaded file
 * (`fileUpload`), selected through the `inputMethod` dropdown.
 */
export const ReductoBlock: BlockConfig<ReductoParserOutput> = {
  type: 'reducto',
  name: 'Reducto',
  description: 'Extract text from PDF documents',
  authMode: AuthMode.ApiKey,
  longDescription:
    'Integrate Reducto Parse into the workflow. Can extract text from uploaded PDF documents, or from a URL.',
  docsLink: 'https://docs.sim.ai/tools/reducto',
  category: 'tools',
  bgColor: '#5c0c5c',
  icon: ReductoIcon,
  subBlocks: [
    {
      id: 'inputMethod',
      title: 'Select Input Method',
      type: 'dropdown' as SubBlockType,
      options: [
        { id: 'url', label: 'PDF Document URL' },
        { id: 'upload', label: 'Upload PDF Document' },
      ],
    },
    {
      // Shown only when the "PDF Document URL" input method is selected.
      id: 'filePath',
      title: 'PDF Document URL',
      type: 'short-input' as SubBlockType,
      placeholder: 'Enter full URL to a PDF document (https://example.com/document.pdf)',
      condition: {
        field: 'inputMethod',
        value: 'url',
      },
    },
    {
      // Shown only when the "Upload PDF Document" input method is selected.
      id: 'fileUpload',
      title: 'Upload PDF',
      type: 'file-upload' as SubBlockType,
      acceptedTypes: 'application/pdf',
      condition: {
        field: 'inputMethod',
        value: 'upload',
      },
      maxSize: 50,
    },
    {
      id: 'pages',
      title: 'Specific Pages',
      type: 'short-input',
      placeholder: 'e.g. 1,2,3 (1-indexed, leave empty for all)',
    },
    {
      id: 'tableOutputFormat',
      title: 'Table Format',
      type: 'dropdown',
      options: [
        { id: 'md', label: 'Markdown' },
        { id: 'html', label: 'HTML' },
      ],
    },
    {
      id: 'apiKey',
      title: 'API Key',
      type: 'short-input' as SubBlockType,
      placeholder: 'Enter your Reducto API key',
      password: true,
      required: true,
    },
  ],
  tools: {
    access: ['reducto_parser'],
    config: {
      tool: () => 'reducto_parser',
      /**
       * Validates the raw field values and builds the argument object for
       * `reducto_parser`.
       *
       * `pages` is forwarded as an array of numbers and `tableOutputFormat`
       * is forwarded unchanged; both are omitted when not provided.
       *
       * @throws Error when the API key is missing, when the selected input
       *   method has no URL / uploaded file, or when a page entry is invalid.
       */
      params: (params) => {
        if (!params || !params.apiKey || params.apiKey.trim() === '') {
          throw new Error('Reducto API key is required')
        }

        const parameters: Record<string, unknown> = {
          apiKey: params.apiKey.trim(),
        }

        // An unset dropdown falls back to the URL input method.
        const inputMethod = params.inputMethod || 'url'
        if (inputMethod === 'url') {
          if (!params.filePath || params.filePath.trim() === '') {
            throw new Error('PDF Document URL is required')
          }
          parameters.filePath = params.filePath.trim()
        } else if (inputMethod === 'upload') {
          if (!params.fileUpload) {
            throw new Error('Please upload a PDF document')
          }
          parameters.fileUpload = params.fileUpload
        }

        const pages = parseReductoPages(params.pages)
        if (pages !== undefined) {
          parameters.pages = pages
        }

        if (params.tableOutputFormat) {
          parameters.tableOutputFormat = params.tableOutputFormat
        }

        return parameters
      },
    },
  },
  inputs: {
    inputMethod: { type: 'string', description: 'Input method selection' },
    filePath: { type: 'string', description: 'PDF document URL' },
    fileUpload: { type: 'json', description: 'Uploaded PDF file' },
    apiKey: { type: 'string', description: 'Reducto API key' },
    pages: { type: 'string', description: 'Page selection' },
    tableOutputFormat: { type: 'string', description: 'Table output format' },
  },
  outputs: {
    job_id: { type: 'string', description: 'Unique identifier for the processing job' },
    duration: { type: 'number', description: 'Processing time in seconds' },
    usage: { type: 'json', description: 'Resource consumption data (num_pages, credits)' },
    result: { type: 'json', description: 'Parsed document content with chunks and blocks' },
    pdf_url: { type: 'string', description: 'Storage URL of converted PDF' },
    studio_link: { type: 'string', description: 'Link to Reducto studio interface' },
  },
}

View File

@@ -93,9 +93,11 @@ import { PipedriveBlock } from '@/blocks/blocks/pipedrive'
import { PolymarketBlock } from '@/blocks/blocks/polymarket'
import { PostgreSQLBlock } from '@/blocks/blocks/postgresql'
import { PostHogBlock } from '@/blocks/blocks/posthog'
import { PulseBlock } from '@/blocks/blocks/pulse'
import { QdrantBlock } from '@/blocks/blocks/qdrant'
import { RDSBlock } from '@/blocks/blocks/rds'
import { RedditBlock } from '@/blocks/blocks/reddit'
import { ReductoBlock } from '@/blocks/blocks/reducto'
import { ResendBlock } from '@/blocks/blocks/resend'
import { ResponseBlock } from '@/blocks/blocks/response'
import { RouterBlock, RouterV2Block } from '@/blocks/blocks/router'
@@ -237,6 +239,7 @@ export const registry: Record<string, BlockConfig> = {
microsoft_planner: MicrosoftPlannerBlock,
microsoft_teams: MicrosoftTeamsBlock,
mistral_parse: MistralParseBlock,
reducto: ReductoBlock,
mongodb: MongoDBBlock,
mysql: MySQLBlock,
neo4j: Neo4jBlock,
@@ -253,6 +256,7 @@ export const registry: Record<string, BlockConfig> = {
polymarket: PolymarketBlock,
postgresql: PostgreSQLBlock,
posthog: PostHogBlock,
pulse: PulseBlock,
qdrant: QdrantBlock,
rds: RDSBlock,
sqs: SQSBlock,