mirror of
https://github.com/simstudioai/sim.git
synced 2026-04-28 03:00:29 -04:00
feat(ocr): added reducto and pulse for OCR (#2843)
* feat(ocr): added reducto and pulse for OCR
* ack comments
This commit is contained in:
143
apps/sim/blocks/blocks/pulse.ts
Normal file
143
apps/sim/blocks/blocks/pulse.ts
Normal file
@@ -0,0 +1,143 @@
|
||||
import { PulseIcon } from '@/components/icons'
|
||||
import { AuthMode, type BlockConfig, type SubBlockType } from '@/blocks/types'
|
||||
import type { PulseParserOutput } from '@/tools/pulse/types'
|
||||
|
||||
/**
 * Block definition for the Pulse OCR integration.
 *
 * Declares the editor fields (document source, page selection, chunking
 * options, API key), maps them onto the `pulse_parser` tool, and lists the
 * block's inputs and outputs.
 */
export const PulseBlock: BlockConfig<PulseParserOutput> = {
  type: 'pulse',
  name: 'Pulse',
  description: 'Extract text from documents using Pulse OCR',
  authMode: AuthMode.ApiKey,
  longDescription:
    'Integrate Pulse into the workflow. Extract text from PDF documents, images, and Office files via URL or upload.',
  docsLink: 'https://docs.sim.ai/tools/pulse',
  category: 'tools',
  bgColor: '#E0E0E0',
  icon: PulseIcon,
  subBlocks: [
    // Chooses which of the two document-source fields below is shown.
    {
      id: 'inputMethod',
      title: 'Select Input Method',
      type: 'dropdown' as SubBlockType,
      options: [
        { id: 'url', label: 'Document URL' },
        { id: 'upload', label: 'Upload Document' },
      ],
    },
    // Document source: remote URL (only when inputMethod is 'url').
    {
      id: 'filePath',
      title: 'Document URL',
      type: 'short-input' as SubBlockType,
      placeholder: 'Enter full URL to a document (https://example.com/document.pdf)',
      condition: {
        field: 'inputMethod',
        value: 'url',
      },
    },
    // Document source: uploaded file (only when inputMethod is 'upload').
    {
      id: 'fileUpload',
      title: 'Upload Document',
      type: 'file-upload' as SubBlockType,
      acceptedTypes: 'application/pdf,image/*,.docx,.pptx,.xlsx',
      condition: {
        field: 'inputMethod',
        value: 'upload',
      },
      maxSize: 50,
    },
    // Optional page selection, forwarded to the tool as a trimmed string.
    {
      id: 'pages',
      title: 'Specific Pages',
      type: 'short-input',
      placeholder: 'e.g. 1-3,5 (leave empty for all pages)',
    },
    // Optional chunking strategy list, forwarded to the tool as a trimmed string.
    {
      id: 'chunking',
      title: 'Chunking Strategy',
      type: 'short-input',
      placeholder: 'e.g. semantic,header,page,recursive',
    },
    // Optional chunk size; parsed to a positive integer before it reaches the tool.
    {
      id: 'apiKey',
      title: 'API Key',
      type: 'short-input' as SubBlockType,
      placeholder: 'Enter your Pulse API key',
      password: true,
      required: true,
    },
  ],
  tools: {
    access: ['pulse_parser'],
    config: {
      tool: () => 'pulse_parser',
      /**
       * Validates the raw sub-block values and builds the parameter object
       * for the `pulse_parser` tool.
       *
       * @param params - Raw values keyed by sub-block id.
       * @returns An object holding the trimmed API key, the document source
       *   (`filePath` or `fileUpload`), and any optional settings that were
       *   filled in.
       * @throws Error when the API key is missing or blank, or when the
       *   selected input method has no document URL / uploaded file.
       */
      params: (params) => {
        const apiKey = params && params.apiKey ? params.apiKey.trim() : ''
        if (apiKey === '') {
          throw new Error('Pulse API key is required')
        }

        const toolParams: Record<string, unknown> = { apiKey }

        // An unset input method is treated as 'url'.
        switch (params.inputMethod || 'url') {
          case 'url': {
            const url = params.filePath ? params.filePath.trim() : ''
            if (url === '') {
              throw new Error('Document URL is required')
            }
            toolParams.filePath = url
            break
          }
          case 'upload': {
            if (!params.fileUpload) {
              throw new Error('Please upload a document')
            }
            toolParams.fileUpload = params.fileUpload
            break
          }
          default:
            // Any other value adds no document source.
            break
        }

        // Optional free-text settings are forwarded only when non-blank.
        for (const field of ['pages', 'chunking'] as const) {
          const text = params[field] ? params[field].trim() : ''
          if (text !== '') {
            toolParams[field] = text
          }
        }

        const rawChunkSize = params.chunkSize ? params.chunkSize.trim() : ''
        if (rawChunkSize !== '') {
          const chunkSize = Number.parseInt(rawChunkSize, 10)
          // NaN fails this comparison too, so unparseable or non-positive
          // values are dropped without raising an error.
          if (chunkSize > 0) {
            toolParams.chunkSize = chunkSize
          }
        }

        return toolParams
      },
    },
  },
  inputs: {
    inputMethod: { type: 'string', description: 'Input method selection' },
    filePath: { type: 'string', description: 'Document URL' },
    fileUpload: { type: 'json', description: 'Uploaded document file' },
    apiKey: { type: 'string', description: 'Pulse API key' },
    pages: { type: 'string', description: 'Page range selection' },
    chunking: {
      type: 'string',
      description: 'Chunking strategies (semantic, header, page, recursive)',
    },
    chunkSize: { type: 'string', description: 'Maximum characters per chunk' },
  },
  outputs: {
    markdown: { type: 'string', description: 'Extracted content in markdown format' },
    page_count: { type: 'number', description: 'Number of pages in the document' },
    job_id: { type: 'string', description: 'Unique job identifier' },
    'plan-info': { type: 'json', description: 'Plan usage information' },
    bounding_boxes: { type: 'json', description: 'Bounding box layout information' },
    extraction_url: { type: 'string', description: 'URL for extraction results (large documents)' },
    html: { type: 'string', description: 'HTML content if requested' },
    structured_output: { type: 'json', description: 'Structured output if schema was provided' },
    chunks: { type: 'json', description: 'Chunked content if chunking was enabled' },
    figures: { type: 'json', description: 'Extracted figures if figure extraction was enabled' },
  },
}
|
||||
148
apps/sim/blocks/blocks/reducto.ts
Normal file
148
apps/sim/blocks/blocks/reducto.ts
Normal file
@@ -0,0 +1,148 @@
|
||||
import { ReductoIcon } from '@/components/icons'
|
||||
import { AuthMode, type BlockConfig, type SubBlockType } from '@/blocks/types'
|
||||
import type { ReductoParserOutput } from '@/tools/reducto/types'
|
||||
|
||||
/**
 * Block definition for the Reducto Parse integration.
 *
 * Declares the editor fields (PDF source, page selection, table format,
 * API key), maps them onto the `reducto_parser` tool, and lists the block's
 * inputs and outputs.
 */
export const ReductoBlock: BlockConfig<ReductoParserOutput> = {
  type: 'reducto',
  name: 'Reducto',
  description: 'Extract text from PDF documents',
  authMode: AuthMode.ApiKey,
  longDescription: `Integrate Reducto Parse into the workflow. Can extract text from uploaded PDF documents, or from a URL.`,
  docsLink: 'https://docs.sim.ai/tools/reducto',
  category: 'tools',
  bgColor: '#5c0c5c',
  icon: ReductoIcon,
  subBlocks: [
    // Chooses which of the two PDF-source fields below is shown.
    {
      id: 'inputMethod',
      title: 'Select Input Method',
      type: 'dropdown' as SubBlockType,
      options: [
        { id: 'url', label: 'PDF Document URL' },
        { id: 'upload', label: 'Upload PDF Document' },
      ],
    },
    {
      id: 'filePath',
      title: 'PDF Document URL',
      type: 'short-input' as SubBlockType,
      placeholder: 'Enter full URL to a PDF document (https://example.com/document.pdf)',
      condition: {
        field: 'inputMethod',
        value: 'url',
      },
    },
    {
      id: 'fileUpload',
      title: 'Upload PDF',
      type: 'file-upload' as SubBlockType,
      acceptedTypes: 'application/pdf',
      condition: {
        field: 'inputMethod',
        value: 'upload',
      },
      maxSize: 50,
    },
    // Comma-separated, 1-indexed page numbers; validated in `tools.config.params`.
    {
      id: 'pages',
      title: 'Specific Pages',
      type: 'short-input',
      placeholder: 'e.g. 1,2,3 (1-indexed, leave empty for all)',
    },
    {
      id: 'tableOutputFormat',
      title: 'Table Format',
      type: 'dropdown',
      options: [
        { id: 'md', label: 'Markdown' },
        { id: 'html', label: 'HTML' },
      ],
    },
    {
      id: 'apiKey',
      title: 'API Key',
      type: 'short-input' as SubBlockType,
      placeholder: 'Enter your Reducto API key',
      password: true,
      required: true,
    },
  ],
  tools: {
    access: ['reducto_parser'],
    config: {
      tool: () => 'reducto_parser',
      /**
       * Validates the raw sub-block values and builds the parameter object
       * for the `reducto_parser` tool.
       *
       * @param params - Raw values keyed by sub-block id.
       * @returns An object holding the trimmed API key, the PDF source
       *   (`filePath` or `fileUpload`), and, when provided, `pages` as an
       *   array of page numbers and `tableOutputFormat`.
       * @throws Error when the API key is missing or blank, when the selected
       *   input method has no URL / uploaded file, or when any entry in
       *   `pages` is not a positive whole number.
       */
      params: (params) => {
        if (!params || !params.apiKey || params.apiKey.trim() === '') {
          throw new Error('Reducto API key is required')
        }

        const parameters: Record<string, unknown> = {
          apiKey: params.apiKey.trim(),
        }

        // An unset input method is treated as 'url'.
        const inputMethod = params.inputMethod || 'url'
        if (inputMethod === 'url') {
          if (!params.filePath || params.filePath.trim() === '') {
            throw new Error('PDF Document URL is required')
          }
          parameters.filePath = params.filePath.trim()
        } else if (inputMethod === 'upload') {
          if (!params.fileUpload) {
            throw new Error('Please upload a PDF document')
          }
          parameters.fileUpload = params.fileUpload
        }

        if (params.pages && params.pages.trim() !== '') {
          const pageNumbers: number[] = params.pages
            .split(',')
            .map((entry: string) => entry.trim())
            // Tolerate stray separators such as '1,,2' or a trailing comma.
            .filter((entry: string) => entry.length > 0)
            .map((entry: string) => {
              // Require plain digits first: parseInt on its own accepts any
              // numeric prefix, so '1-3', '2.5' or '3abc' would be silently
              // truncated and the wrong pages requested.
              const page = /^\d+$/.test(entry) ? Number.parseInt(entry, 10) : Number.NaN
              // Pages are 1-indexed (see the field placeholder), so 0 is
              // rejected along with anything that is not a safe integer.
              if (!Number.isSafeInteger(page) || page < 1) {
                throw new Error(`Page number format error: Invalid page number: ${entry}`)
              }
              return page
            })

          // Input made only of separators (e.g. ',') selects no pages.
          if (pageNumbers.length > 0) {
            parameters.pages = pageNumbers
          }
        }

        if (params.tableOutputFormat) {
          parameters.tableOutputFormat = params.tableOutputFormat
        }

        return parameters
      },
    },
  },
  inputs: {
    inputMethod: { type: 'string', description: 'Input method selection' },
    filePath: { type: 'string', description: 'PDF document URL' },
    fileUpload: { type: 'json', description: 'Uploaded PDF file' },
    apiKey: { type: 'string', description: 'Reducto API key' },
    pages: { type: 'string', description: 'Page selection' },
    tableOutputFormat: { type: 'string', description: 'Table output format' },
  },
  outputs: {
    job_id: { type: 'string', description: 'Unique identifier for the processing job' },
    duration: { type: 'number', description: 'Processing time in seconds' },
    usage: { type: 'json', description: 'Resource consumption data (num_pages, credits)' },
    result: { type: 'json', description: 'Parsed document content with chunks and blocks' },
    pdf_url: { type: 'string', description: 'Storage URL of converted PDF' },
    studio_link: { type: 'string', description: 'Link to Reducto studio interface' },
  },
}
|
||||
@@ -93,9 +93,11 @@ import { PipedriveBlock } from '@/blocks/blocks/pipedrive'
|
||||
import { PolymarketBlock } from '@/blocks/blocks/polymarket'
|
||||
import { PostgreSQLBlock } from '@/blocks/blocks/postgresql'
|
||||
import { PostHogBlock } from '@/blocks/blocks/posthog'
|
||||
import { PulseBlock } from '@/blocks/blocks/pulse'
|
||||
import { QdrantBlock } from '@/blocks/blocks/qdrant'
|
||||
import { RDSBlock } from '@/blocks/blocks/rds'
|
||||
import { RedditBlock } from '@/blocks/blocks/reddit'
|
||||
import { ReductoBlock } from '@/blocks/blocks/reducto'
|
||||
import { ResendBlock } from '@/blocks/blocks/resend'
|
||||
import { ResponseBlock } from '@/blocks/blocks/response'
|
||||
import { RouterBlock, RouterV2Block } from '@/blocks/blocks/router'
|
||||
@@ -237,6 +239,7 @@ export const registry: Record<string, BlockConfig> = {
|
||||
microsoft_planner: MicrosoftPlannerBlock,
|
||||
microsoft_teams: MicrosoftTeamsBlock,
|
||||
mistral_parse: MistralParseBlock,
|
||||
reducto: ReductoBlock,
|
||||
mongodb: MongoDBBlock,
|
||||
mysql: MySQLBlock,
|
||||
neo4j: Neo4jBlock,
|
||||
@@ -253,6 +256,7 @@ export const registry: Record<string, BlockConfig> = {
|
||||
polymarket: PolymarketBlock,
|
||||
postgresql: PostgreSQLBlock,
|
||||
posthog: PostHogBlock,
|
||||
pulse: PulseBlock,
|
||||
qdrant: QdrantBlock,
|
||||
rds: RDSBlock,
|
||||
sqs: SQSBlock,
|
||||
|
||||
Reference in New Issue
Block a user