From ecf39c5a54c2bf1dc4ade313a75d39d80dedf432 Mon Sep 17 00:00:00 2001 From: waleed Date: Tue, 20 Jan 2026 11:06:38 -0800 Subject: [PATCH] feat(tools): added textract --- apps/docs/components/icons.tsx | 17 + apps/docs/components/ui/icon-mapping.ts | 2 + apps/docs/content/docs/en/tools/meta.json | 1 + apps/docs/content/docs/en/tools/textract.mdx | 120 ++++ .../sim/app/api/tools/textract/parse/route.ts | 617 ++++++++++++++++++ apps/sim/blocks/blocks/textract.ts | 251 +++++++ apps/sim/blocks/registry.ts | 2 + apps/sim/components/icons.tsx | 17 + .../sim/lib/core/security/input-validation.ts | 137 +++- apps/sim/lib/uploads/core/storage-service.ts | 24 + apps/sim/tools/registry.ts | 2 + apps/sim/tools/textract/index.ts | 2 + apps/sim/tools/textract/parser.ts | 413 ++++++++++++ apps/sim/tools/textract/types.ts | 114 ++++ 14 files changed, 1706 insertions(+), 13 deletions(-) create mode 100644 apps/docs/content/docs/en/tools/textract.mdx create mode 100644 apps/sim/app/api/tools/textract/parse/route.ts create mode 100644 apps/sim/blocks/blocks/textract.ts create mode 100644 apps/sim/tools/textract/index.ts create mode 100644 apps/sim/tools/textract/parser.ts create mode 100644 apps/sim/tools/textract/types.ts diff --git a/apps/docs/components/icons.tsx b/apps/docs/components/icons.tsx index 1c245ffaf..689dbb50a 100644 --- a/apps/docs/components/icons.tsx +++ b/apps/docs/components/icons.tsx @@ -4093,6 +4093,23 @@ export function SQSIcon(props: SVGProps) { ) } +export function TextractIcon(props: SVGProps) { + return ( + + + + ) +} + export function McpIcon(props: SVGProps) { return ( = { supabase: SupabaseIcon, tavily: TavilyIcon, telegram: TelegramIcon, + textract: TextractIcon, tinybird: TinybirdIcon, translate: TranslateIcon, trello: TrelloIcon, diff --git a/apps/docs/content/docs/en/tools/meta.json b/apps/docs/content/docs/en/tools/meta.json index ec3178013..f7db7e1be 100644 --- a/apps/docs/content/docs/en/tools/meta.json +++ 
b/apps/docs/content/docs/en/tools/meta.json @@ -106,6 +106,7 @@ "supabase", "tavily", "telegram", + "textract", "tinybird", "translate", "trello", diff --git a/apps/docs/content/docs/en/tools/textract.mdx b/apps/docs/content/docs/en/tools/textract.mdx new file mode 100644 index 000000000..7c106f37d --- /dev/null +++ b/apps/docs/content/docs/en/tools/textract.mdx @@ -0,0 +1,120 @@ +--- +title: AWS Textract +description: Extract text, tables, and forms from documents +--- + +import { BlockInfoCard } from "@/components/ui/block-info-card" + + + +{/* MANUAL-CONTENT-START:intro */} +[AWS Textract](https://aws.amazon.com/textract/) is a powerful AI service from Amazon Web Services designed to automatically extract printed text, handwriting, tables, forms, key-value pairs, and other structured data from scanned documents and images. Textract leverages advanced optical character recognition (OCR) and document analysis to transform documents into actionable data, enabling automation, analytics, compliance, and more. + +With AWS Textract, you can: + +- **Extract text from images and documents**: Recognize printed text and handwriting in formats such as PDF, JPEG, PNG, or TIFF +- **Detect and extract tables**: Automatically find tables and output their structured content +- **Parse forms and key-value pairs**: Pull structured data from forms, including fields and their corresponding values +- **Identify signatures and layout features**: Detect signatures, geometric layout, and relationships between document elements +- **Customize extraction with queries**: Extract specific fields and answers using query-based extraction (e.g., "What is the invoice number?") + +In Sim, the AWS Textract integration empowers your agents to intelligently process documents as part of their workflows. This unlocks automation scenarios such as data entry from invoices, onboarding documents, contracts, receipts, and more. 
Your agents can extract relevant data, analyze structured forms, and generate summaries or reports directly from document uploads or URLs. By connecting Sim with AWS Textract, you can reduce manual effort, improve data accuracy, and streamline your business processes with robust document understanding. +{/* MANUAL-CONTENT-END */} + + +## Usage Instructions + +Integrate AWS Textract into your workflow to extract text, tables, forms, and key-value pairs from documents. Sync mode supports JPEG, PNG, and single-page PDF. Async mode supports multi-page PDF and TIFF via S3. + + + +## Tools + +### `textract_parser` + +Parse documents using AWS Textract OCR and document analysis + +#### Input + +| Parameter | Type | Required | Description | +| --------- | ---- | -------- | ----------- | +| `accessKeyId` | string | Yes | AWS Access Key ID | +| `secretAccessKey` | string | Yes | AWS Secret Access Key | +| `region` | string | Yes | AWS region for Textract service \(e.g., us-east-1\) | +| `processingMode` | string | No | Document type: single-page or multi-page. Defaults to single-page. | +| `filePath` | string | No | URL to a document to be processed \(JPEG, PNG, PDF, or TIFF\). Required for sync mode. | +| `s3Uri` | string | No | S3 URI for async processing \(s3://bucket/key\). Required for async mode with S3 input. | +| `fileUpload` | object | No | File upload data from file-upload component | +| `featureTypes` | array | No | Feature types to detect: TABLES, FORMS, QUERIES, SIGNATURES, LAYOUT. If not specified, only text detection is performed. | +| `items` | string | No | Feature type | +| `queries` | array | No | Custom queries to extract specific information. Only used when featureTypes includes QUERIES. 
| +| `items` | object | No | Query configuration | +| `properties` | string | No | The query text | +| `Text` | string | No | No description | +| `Alias` | string | No | No description | + +#### Output + +| Parameter | Type | Description | +| --------- | ---- | ----------- | +| `blocks` | array | Array of Block objects containing detected text, tables, forms, and other elements | +| ↳ `BlockType` | string | Type of block \(PAGE, LINE, WORD, TABLE, CELL, KEY_VALUE_SET, etc.\) | +| ↳ `Id` | string | Unique identifier for the block | +| ↳ `Text` | string | Query text | +| ↳ `TextType` | string | Type of text \(PRINTED or HANDWRITING\) | +| ↳ `Confidence` | number | Confidence score \(0-100\) | +| ↳ `Page` | number | Page number | +| ↳ `Geometry` | object | Location and bounding box information | +| ↳ `BoundingBox` | object | Height as ratio of document height | +| ↳ `Height` | number | Height as ratio of document height | +| ↳ `Left` | number | Left position as ratio of document width | +| ↳ `Top` | number | Top position as ratio of document height | +| ↳ `Width` | number | Width as ratio of document width | +| ↳ `Height` | number | Height as ratio of document height | +| ↳ `Left` | number | Left position as ratio of document width | +| ↳ `Top` | number | Top position as ratio of document height | +| ↳ `Width` | number | Width as ratio of document width | +| ↳ `Polygon` | array | Polygon coordinates | +| ↳ `X` | number | X coordinate | +| ↳ `Y` | number | Y coordinate | +| ↳ `X` | number | X coordinate | +| ↳ `Y` | number | Y coordinate | +| ↳ `BoundingBox` | object | Height as ratio of document height | +| ↳ `Height` | number | Height as ratio of document height | +| ↳ `Left` | number | Left position as ratio of document width | +| ↳ `Top` | number | Top position as ratio of document height | +| ↳ `Width` | number | Width as ratio of document width | +| ↳ `Height` | number | Height as ratio of document height | +| ↳ `Left` | number | Left position as ratio of 
document width | +| ↳ `Top` | number | Top position as ratio of document height | +| ↳ `Width` | number | Width as ratio of document width | +| ↳ `Polygon` | array | Polygon coordinates | +| ↳ `X` | number | X coordinate | +| ↳ `Y` | number | Y coordinate | +| ↳ `X` | number | X coordinate | +| ↳ `Y` | number | Y coordinate | +| ↳ `Relationships` | array | Relationships to other blocks | +| ↳ `Type` | string | Relationship type \(CHILD, VALUE, ANSWER, etc.\) | +| ↳ `Ids` | array | IDs of related blocks | +| ↳ `Type` | string | Relationship type \(CHILD, VALUE, ANSWER, etc.\) | +| ↳ `Ids` | array | IDs of related blocks | +| ↳ `EntityTypes` | array | Entity types for KEY_VALUE_SET \(KEY or VALUE\) | +| ↳ `SelectionStatus` | string | For checkboxes: SELECTED or NOT_SELECTED | +| ↳ `RowIndex` | number | Row index for table cells | +| ↳ `ColumnIndex` | number | Column index for table cells | +| ↳ `RowSpan` | number | Row span for merged cells | +| ↳ `ColumnSpan` | number | Column span for merged cells | +| ↳ `Query` | object | Query information for QUERY blocks | +| ↳ `Text` | string | Query text | +| ↳ `Alias` | string | Query alias | +| ↳ `Pages` | array | Pages to search | +| ↳ `Alias` | string | Query alias | +| ↳ `Pages` | array | Pages to search | +| `documentMetadata` | object | Metadata about the analyzed document | +| ↳ `pages` | number | Number of pages in the document | +| `modelVersion` | string | Version of the Textract model used for processing | + + diff --git a/apps/sim/app/api/tools/textract/parse/route.ts b/apps/sim/app/api/tools/textract/parse/route.ts new file mode 100644 index 000000000..ad1cddd0e --- /dev/null +++ b/apps/sim/app/api/tools/textract/parse/route.ts @@ -0,0 +1,617 @@ +import crypto from 'crypto' +import { createLogger } from '@sim/logger' +import { type NextRequest, NextResponse } from 'next/server' +import { z } from 'zod' +import { checkHybridAuth } from '@/lib/auth/hybrid' +import { + validateAwsRegion, + validateExternalUrl, + 
validateS3BucketName, +} from '@/lib/core/security/input-validation' +import { generateRequestId } from '@/lib/core/utils/request' +import { getBaseUrl } from '@/lib/core/utils/urls' +import { StorageService } from '@/lib/uploads' +import { extractStorageKey, inferContextFromKey } from '@/lib/uploads/utils/file-utils' +import { verifyFileAccess } from '@/app/api/files/authorization' + +export const dynamic = 'force-dynamic' +export const maxDuration = 300 // 5 minutes for large multi-page PDF processing + +const logger = createLogger('TextractParseAPI') + +const QuerySchema = z.object({ + Text: z.string().min(1), + Alias: z.string().optional(), + Pages: z.array(z.string()).optional(), +}) + +const TextractParseSchema = z + .object({ + accessKeyId: z.string().min(1, 'AWS Access Key ID is required'), + secretAccessKey: z.string().min(1, 'AWS Secret Access Key is required'), + region: z.string().min(1, 'AWS region is required'), + processingMode: z.enum(['sync', 'async']).optional().default('sync'), + filePath: z.string().optional(), + s3Uri: z.string().optional(), + featureTypes: z + .array(z.enum(['TABLES', 'FORMS', 'QUERIES', 'SIGNATURES', 'LAYOUT'])) + .optional(), + queries: z.array(QuerySchema).optional(), + }) + .superRefine((data, ctx) => { + const regionValidation = validateAwsRegion(data.region, 'AWS region') + if (!regionValidation.isValid) { + ctx.addIssue({ + code: z.ZodIssueCode.custom, + message: regionValidation.error, + path: ['region'], + }) + } + }) + +/** + * Generate AWS Signature Version 4 signing key + */ +function getSignatureKey( + key: string, + dateStamp: string, + regionName: string, + serviceName: string +): Buffer { + const kDate = crypto.createHmac('sha256', `AWS4${key}`).update(dateStamp).digest() + const kRegion = crypto.createHmac('sha256', kDate).update(regionName).digest() + const kService = crypto.createHmac('sha256', kRegion).update(serviceName).digest() + const kSigning = crypto.createHmac('sha256', 
kService).update('aws4_request').digest() + return kSigning +} + +function signAwsRequest( + method: string, + host: string, + uri: string, + body: string, + accessKeyId: string, + secretAccessKey: string, + region: string, + service: string, + amzTarget: string +): Record { + const date = new Date() + const amzDate = date.toISOString().replace(/[:-]|\.\d{3}/g, '') + const dateStamp = amzDate.slice(0, 8) + + const payloadHash = crypto.createHash('sha256').update(body).digest('hex') + + const canonicalHeaders = + `content-type:application/x-amz-json-1.1\n` + + `host:${host}\n` + + `x-amz-date:${amzDate}\n` + + `x-amz-target:${amzTarget}\n` + + const signedHeaders = 'content-type;host;x-amz-date;x-amz-target' + + const canonicalRequest = `${method}\n${uri}\n\n${canonicalHeaders}\n${signedHeaders}\n${payloadHash}` + + const algorithm = 'AWS4-HMAC-SHA256' + const credentialScope = `${dateStamp}/${region}/${service}/aws4_request` + const stringToSign = `${algorithm}\n${amzDate}\n${credentialScope}\n${crypto.createHash('sha256').update(canonicalRequest).digest('hex')}` + + const signingKey = getSignatureKey(secretAccessKey, dateStamp, region, service) + const signature = crypto.createHmac('sha256', signingKey).update(stringToSign).digest('hex') + + const authorizationHeader = `${algorithm} Credential=${accessKeyId}/${credentialScope}, SignedHeaders=${signedHeaders}, Signature=${signature}` + + return { + 'Content-Type': 'application/x-amz-json-1.1', + Host: host, + 'X-Amz-Date': amzDate, + 'X-Amz-Target': amzTarget, + Authorization: authorizationHeader, + } +} + +async function fetchDocumentBytes(url: string): Promise<{ bytes: string; contentType: string }> { + const response = await fetch(url) + if (!response.ok) { + throw new Error(`Failed to fetch document: ${response.statusText}`) + } + + const arrayBuffer = await response.arrayBuffer() + const bytes = Buffer.from(arrayBuffer).toString('base64') + const contentType = response.headers.get('content-type') || 
'application/octet-stream' + + return { bytes, contentType } +} + +function parseS3Uri(s3Uri: string): { bucket: string; key: string } { + const match = s3Uri.match(/^s3:\/\/([^/]+)\/(.+)$/) + if (!match) { + throw new Error( + `Invalid S3 URI format: ${s3Uri}. Expected format: s3://bucket-name/path/to/object` + ) + } + + const bucket = match[1] + const key = match[2] + + const bucketValidation = validateS3BucketName(bucket, 'S3 bucket name') + if (!bucketValidation.isValid) { + throw new Error(bucketValidation.error) + } + + if (key.includes('..') || key.startsWith('/')) { + throw new Error('S3 key contains invalid path traversal sequences') + } + + return { bucket, key } +} + +function sleep(ms: number): Promise { + return new Promise((resolve) => setTimeout(resolve, ms)) +} + +async function callTextractAsync( + host: string, + amzTarget: string, + body: Record, + accessKeyId: string, + secretAccessKey: string, + region: string +): Promise> { + const bodyString = JSON.stringify(body) + const headers = signAwsRequest( + 'POST', + host, + '/', + bodyString, + accessKeyId, + secretAccessKey, + region, + 'textract', + amzTarget + ) + + const response = await fetch(`https://${host}/`, { + method: 'POST', + headers, + body: bodyString, + }) + + if (!response.ok) { + const errorText = await response.text() + let errorMessage = `Textract API error: ${response.statusText}` + try { + const errorJson = JSON.parse(errorText) + if (errorJson.Message) { + errorMessage = errorJson.Message + } else if (errorJson.__type) { + errorMessage = `${errorJson.__type}: ${errorJson.message || errorText}` + } + } catch { + // Use default error message + } + throw new Error(errorMessage) + } + + return response.json() +} + +async function pollForJobCompletion( + host: string, + jobId: string, + accessKeyId: string, + secretAccessKey: string, + region: string, + useAnalyzeDocument: boolean, + requestId: string +): Promise> { + const pollIntervalMs = 5000 // 5 seconds between polls + const 
maxPollTimeMs = 180000 // 3 minutes maximum polling time + const maxAttempts = Math.ceil(maxPollTimeMs / pollIntervalMs) + + const getTarget = useAnalyzeDocument + ? 'Textract.GetDocumentAnalysis' + : 'Textract.GetDocumentTextDetection' + + for (let attempt = 0; attempt < maxAttempts; attempt++) { + const result = await callTextractAsync( + host, + getTarget, + { JobId: jobId }, + accessKeyId, + secretAccessKey, + region + ) + + const jobStatus = result.JobStatus as string + + if (jobStatus === 'SUCCEEDED') { + logger.info(`[${requestId}] Async job completed successfully after ${attempt + 1} polls`) + + let allBlocks = (result.Blocks as unknown[]) || [] + let nextToken = result.NextToken as string | undefined + + while (nextToken) { + const nextResult = await callTextractAsync( + host, + getTarget, + { JobId: jobId, NextToken: nextToken }, + accessKeyId, + secretAccessKey, + region + ) + allBlocks = allBlocks.concat((nextResult.Blocks as unknown[]) || []) + nextToken = nextResult.NextToken as string | undefined + } + + return { + ...result, + Blocks: allBlocks, + } + } + + if (jobStatus === 'FAILED') { + throw new Error(`Textract job failed: ${result.StatusMessage || 'Unknown error'}`) + } + + if (jobStatus === 'PARTIAL_SUCCESS') { + logger.warn(`[${requestId}] Job completed with partial success: ${result.StatusMessage}`) + return result + } + + logger.info(`[${requestId}] Job status: ${jobStatus}, attempt ${attempt + 1}/${maxAttempts}`) + await sleep(pollIntervalMs) + } + + throw new Error( + `Timeout waiting for Textract job to complete (max ${maxPollTimeMs / 1000} seconds)` + ) +} + +export async function POST(request: NextRequest) { + const requestId = generateRequestId() + + try { + const authResult = await checkHybridAuth(request, { requireWorkflowId: false }) + + if (!authResult.success || !authResult.userId) { + logger.warn(`[${requestId}] Unauthorized Textract parse attempt`, { + error: authResult.error || 'Missing userId', + }) + return NextResponse.json( 
+ { + success: false, + error: authResult.error || 'Unauthorized', + }, + { status: 401 } + ) + } + + const userId = authResult.userId + const body = await request.json() + const validatedData = TextractParseSchema.parse(body) + + const processingMode = validatedData.processingMode || 'sync' + const featureTypes = validatedData.featureTypes ?? [] + const useAnalyzeDocument = featureTypes.length > 0 + const host = `textract.${validatedData.region}.amazonaws.com` + + logger.info(`[${requestId}] Textract parse request`, { + processingMode, + filePath: validatedData.filePath, + s3Uri: validatedData.s3Uri, + featureTypes, + userId, + }) + + if (processingMode === 'async') { + if (!validatedData.s3Uri && !validatedData.filePath) { + return NextResponse.json( + { + success: false, + error: 'S3 URI or file path is required for async processing', + }, + { status: 400 } + ) + } + + let s3Bucket: string + let s3Key: string + + if (validatedData.s3Uri) { + const parsed = parseS3Uri(validatedData.s3Uri) + s3Bucket = parsed.bucket + s3Key = parsed.key + } else if (validatedData.filePath?.includes('/api/files/serve/')) { + const storageKey = extractStorageKey(validatedData.filePath) + const context = inferContextFromKey(storageKey) + + const hasAccess = await verifyFileAccess(storageKey, userId, undefined, context, false) + if (!hasAccess) { + return NextResponse.json({ success: false, error: 'File not found' }, { status: 404 }) + } + + const s3Info = StorageService.getS3InfoForKey(storageKey, context) + s3Bucket = s3Info.bucket + s3Key = s3Info.key + } else { + return NextResponse.json( + { + success: false, + error: 'Async mode requires an S3 URI (s3://bucket/key) or an uploaded file', + }, + { status: 400 } + ) + } + + logger.info(`[${requestId}] Starting async Textract job`, { s3Bucket, s3Key }) + + const startTarget = useAnalyzeDocument + ? 
'Textract.StartDocumentAnalysis' + : 'Textract.StartDocumentTextDetection' + + const startBody: Record = { + DocumentLocation: { + S3Object: { + Bucket: s3Bucket, + Name: s3Key, + }, + }, + } + + if (useAnalyzeDocument) { + startBody.FeatureTypes = featureTypes + + if ( + validatedData.queries && + validatedData.queries.length > 0 && + featureTypes.includes('QUERIES') + ) { + startBody.QueriesConfig = { + Queries: validatedData.queries.map((q) => ({ + Text: q.Text, + Alias: q.Alias, + Pages: q.Pages, + })), + } + } + } + + const startResult = await callTextractAsync( + host, + startTarget, + startBody, + validatedData.accessKeyId, + validatedData.secretAccessKey, + validatedData.region + ) + + const jobId = startResult.JobId as string + if (!jobId) { + throw new Error('Failed to start Textract job: No JobId returned') + } + + logger.info(`[${requestId}] Async job started`, { jobId }) + + const textractData = await pollForJobCompletion( + host, + jobId, + validatedData.accessKeyId, + validatedData.secretAccessKey, + validatedData.region, + useAnalyzeDocument, + requestId + ) + + logger.info(`[${requestId}] Textract async parse successful`, { + pageCount: (textractData.DocumentMetadata as { Pages?: number })?.Pages ?? 0, + blockCount: (textractData.Blocks as unknown[])?.length ?? 0, + }) + + return NextResponse.json({ + success: true, + output: { + blocks: textractData.Blocks ?? [], + documentMetadata: { + pages: (textractData.DocumentMetadata as { Pages?: number })?.Pages ?? 0, + }, + modelVersion: (textractData.AnalyzeDocumentModelVersion ?? 
+ textractData.DetectDocumentTextModelVersion) as string | undefined, + }, + }) + } + + if (!validatedData.filePath) { + return NextResponse.json( + { + success: false, + error: 'File path is required for sync processing', + }, + { status: 400 } + ) + } + + let fileUrl = validatedData.filePath + + if (validatedData.filePath?.includes('/api/files/serve/')) { + try { + const storageKey = extractStorageKey(validatedData.filePath) + const context = inferContextFromKey(storageKey) + + const hasAccess = await verifyFileAccess(storageKey, userId, undefined, context, false) + + if (!hasAccess) { + logger.warn(`[${requestId}] Unauthorized presigned URL generation attempt`, { + userId, + key: storageKey, + context, + }) + return NextResponse.json( + { + success: false, + error: 'File not found', + }, + { status: 404 } + ) + } + + fileUrl = await StorageService.generatePresignedDownloadUrl(storageKey, context, 5 * 60) + logger.info(`[${requestId}] Generated presigned URL for ${context} file`) + } catch (error) { + logger.error(`[${requestId}] Failed to generate presigned URL:`, error) + return NextResponse.json( + { + success: false, + error: 'Failed to generate file access URL', + }, + { status: 500 } + ) + } + } else if (validatedData.filePath?.startsWith('/')) { + const baseUrl = getBaseUrl() + fileUrl = `${baseUrl}${validatedData.filePath}` + } else { + const urlValidation = validateExternalUrl(fileUrl, 'Document URL') + if (!urlValidation.isValid) { + logger.warn(`[${requestId}] SSRF attempt blocked`, { + userId, + url: fileUrl.substring(0, 100), + error: urlValidation.error, + }) + return NextResponse.json( + { + success: false, + error: urlValidation.error, + }, + { status: 400 } + ) + } + } + + const { bytes } = await fetchDocumentBytes(fileUrl) + + const uri = '/' + + let textractBody: Record + let amzTarget: string + + if (useAnalyzeDocument) { + amzTarget = 'Textract.AnalyzeDocument' + textractBody = { + Document: { + Bytes: bytes, + }, + FeatureTypes: 
featureTypes, + } + + if ( + validatedData.queries && + validatedData.queries.length > 0 && + featureTypes.includes('QUERIES') + ) { + textractBody.QueriesConfig = { + Queries: validatedData.queries.map((q) => ({ + Text: q.Text, + Alias: q.Alias, + Pages: q.Pages, + })), + } + } + } else { + amzTarget = 'Textract.DetectDocumentText' + textractBody = { + Document: { + Bytes: bytes, + }, + } + } + + const bodyString = JSON.stringify(textractBody) + + const headers = signAwsRequest( + 'POST', + host, + uri, + bodyString, + validatedData.accessKeyId, + validatedData.secretAccessKey, + validatedData.region, + 'textract', + amzTarget + ) + + const textractResponse = await fetch(`https://${host}${uri}`, { + method: 'POST', + headers, + body: bodyString, + }) + + if (!textractResponse.ok) { + const errorText = await textractResponse.text() + logger.error(`[${requestId}] Textract API error:`, errorText) + + let errorMessage = `Textract API error: ${textractResponse.statusText}` + try { + const errorJson = JSON.parse(errorText) + if (errorJson.Message) { + errorMessage = errorJson.Message + } else if (errorJson.__type) { + errorMessage = `${errorJson.__type}: ${errorJson.message || errorText}` + } + } catch { + // Use default error message + } + + return NextResponse.json( + { + success: false, + error: errorMessage, + }, + { status: textractResponse.status } + ) + } + + const textractData = await textractResponse.json() + + logger.info(`[${requestId}] Textract parse successful`, { + pageCount: textractData.DocumentMetadata?.Pages ?? 0, + blockCount: textractData.Blocks?.length ?? 0, + }) + + return NextResponse.json({ + success: true, + output: { + blocks: textractData.Blocks ?? [], + documentMetadata: { + pages: textractData.DocumentMetadata?.Pages ?? 0, + }, + modelVersion: + textractData.AnalyzeDocumentModelVersion ?? + textractData.DetectDocumentTextModelVersion ?? 
+ undefined, + }, + }) + } catch (error) { + if (error instanceof z.ZodError) { + logger.warn(`[${requestId}] Invalid request data`, { errors: error.errors }) + return NextResponse.json( + { + success: false, + error: 'Invalid request data', + details: error.errors, + }, + { status: 400 } + ) + } + + logger.error(`[${requestId}] Error in Textract parse:`, error) + + return NextResponse.json( + { + success: false, + error: error instanceof Error ? error.message : 'Internal server error', + }, + { status: 500 } + ) + } +} diff --git a/apps/sim/blocks/blocks/textract.ts b/apps/sim/blocks/blocks/textract.ts new file mode 100644 index 000000000..6f3da8b75 --- /dev/null +++ b/apps/sim/blocks/blocks/textract.ts @@ -0,0 +1,251 @@ +import { TextractIcon } from '@/components/icons' +import { AuthMode, type BlockConfig, type SubBlockType } from '@/blocks/types' +import type { TextractParserOutput } from '@/tools/textract/types' + +export const TextractBlock: BlockConfig = { + type: 'textract', + name: 'AWS Textract', + description: 'Extract text, tables, and forms from documents', + authMode: AuthMode.ApiKey, + longDescription: `Integrate AWS Textract into your workflow to extract text, tables, forms, and key-value pairs from documents. Sync mode supports JPEG, PNG, and single-page PDF. 
Async mode supports multi-page PDF and TIFF via S3.`, + docsLink: 'https://docs.sim.ai/tools/textract', + category: 'tools', + bgColor: 'linear-gradient(135deg, #055F4E 0%, #56C0A7 100%)', + icon: TextractIcon, + subBlocks: [ + { + id: 'processingMode', + title: 'Document Type', + type: 'dropdown' as SubBlockType, + options: [ + { id: 'sync', label: 'Single Page' }, + { id: 'async', label: 'Multi-Page' }, + ], + }, + { + id: 'inputMethod', + title: 'Select Input Method', + type: 'dropdown' as SubBlockType, + options: [ + { id: 'url', label: 'Document URL' }, + { id: 'upload', label: 'Upload Document' }, + ], + condition: { + field: 'processingMode', + value: 'async', + not: true, + }, + }, + { + id: 'asyncInputMethod', + title: 'Select Input Method', + type: 'dropdown' as SubBlockType, + options: [ + { id: 's3', label: 'S3 URI' }, + { id: 'upload', label: 'Upload Document' }, + ], + condition: { + field: 'processingMode', + value: 'async', + }, + }, + { + id: 'filePath', + title: 'Document URL', + type: 'short-input' as SubBlockType, + placeholder: 'Enter full URL to a document (JPEG, PNG, or single-page PDF)', + condition: { + field: 'inputMethod', + value: 'url', + and: { + field: 'processingMode', + value: 'async', + not: true, + }, + }, + }, + { + id: 's3Uri', + title: 'S3 URI', + type: 'short-input' as SubBlockType, + placeholder: 's3://bucket-name/path/to/document.pdf', + condition: { + field: 'asyncInputMethod', + value: 's3', + and: { + field: 'processingMode', + value: 'async', + }, + }, + }, + { + id: 'fileUpload', + title: 'Upload Document', + type: 'file-upload' as SubBlockType, + acceptedTypes: 'application/pdf,image/jpeg,image/png,image/tiff', + condition: { + field: 'inputMethod', + value: 'upload', + and: { + field: 'processingMode', + value: 'async', + not: true, + }, + }, + maxSize: 10, + }, + { + id: 'asyncFileUpload', + title: 'Upload Document', + type: 'file-upload' as SubBlockType, + acceptedTypes: 
'application/pdf,image/jpeg,image/png,image/tiff', + condition: { + field: 'asyncInputMethod', + value: 'upload', + and: { + field: 'processingMode', + value: 'async', + }, + }, + maxSize: 50, + }, + { + id: 'region', + title: 'AWS Region', + type: 'short-input' as SubBlockType, + placeholder: 'e.g., us-east-1', + required: true, + }, + { + id: 'accessKeyId', + title: 'AWS Access Key ID', + type: 'short-input' as SubBlockType, + placeholder: 'Enter your AWS Access Key ID', + password: true, + required: true, + }, + { + id: 'secretAccessKey', + title: 'AWS Secret Access Key', + type: 'short-input' as SubBlockType, + placeholder: 'Enter your AWS Secret Access Key', + password: true, + required: true, + }, + { + id: 'extractTables', + title: 'Extract Tables', + type: 'switch' as SubBlockType, + }, + { + id: 'extractForms', + title: 'Extract Forms (Key-Value Pairs)', + type: 'switch' as SubBlockType, + }, + { + id: 'detectSignatures', + title: 'Detect Signatures', + type: 'switch' as SubBlockType, + }, + { + id: 'analyzeLayout', + title: 'Analyze Document Layout', + type: 'switch' as SubBlockType, + }, + ], + tools: { + access: ['textract_parser'], + config: { + tool: () => 'textract_parser', + params: (params) => { + if (!params.accessKeyId || params.accessKeyId.trim() === '') { + throw new Error('AWS Access Key ID is required') + } + if (!params.secretAccessKey || params.secretAccessKey.trim() === '') { + throw new Error('AWS Secret Access Key is required') + } + if (!params.region || params.region.trim() === '') { + throw new Error('AWS Region is required') + } + + const processingMode = params.processingMode || 'sync' + const parameters: Record = { + accessKeyId: params.accessKeyId.trim(), + secretAccessKey: params.secretAccessKey.trim(), + region: params.region.trim(), + processingMode, + } + + if (processingMode === 'async') { + const asyncInputMethod = params.asyncInputMethod || 's3' + if (asyncInputMethod === 's3') { + if (!params.s3Uri || params.s3Uri.trim() 
=== '') { + throw new Error('S3 URI is required for async processing') + } + parameters.s3Uri = params.s3Uri.trim() + } else if (asyncInputMethod === 'upload') { + if (!params.asyncFileUpload) { + throw new Error('Please upload a document') + } + parameters.fileUpload = params.asyncFileUpload + } + } else { + const inputMethod = params.inputMethod || 'url' + if (inputMethod === 'url') { + if (!params.filePath || params.filePath.trim() === '') { + throw new Error('Document URL is required') + } + parameters.filePath = params.filePath.trim() + } else if (inputMethod === 'upload') { + if (!params.fileUpload) { + throw new Error('Please upload a document') + } + parameters.fileUpload = params.fileUpload + } + } + + const featureTypes: string[] = [] + if (params.extractTables) featureTypes.push('TABLES') + if (params.extractForms) featureTypes.push('FORMS') + if (params.detectSignatures) featureTypes.push('SIGNATURES') + if (params.analyzeLayout) featureTypes.push('LAYOUT') + + if (featureTypes.length > 0) { + parameters.featureTypes = featureTypes + } + + return parameters + }, + }, + }, + inputs: { + processingMode: { type: 'string', description: 'Document type: single-page or multi-page' }, + inputMethod: { type: 'string', description: 'Input method selection for sync mode' }, + asyncInputMethod: { type: 'string', description: 'Input method selection for async mode' }, + filePath: { type: 'string', description: 'Document URL' }, + s3Uri: { type: 'string', description: 'S3 URI for async processing (s3://bucket/key)' }, + fileUpload: { type: 'json', description: 'Uploaded document file for sync mode' }, + asyncFileUpload: { type: 'json', description: 'Uploaded document file for async mode' }, + extractTables: { type: 'boolean', description: 'Extract tables from document' }, + extractForms: { type: 'boolean', description: 'Extract form key-value pairs' }, + detectSignatures: { type: 'boolean', description: 'Detect signatures' }, + analyzeLayout: { type: 'boolean', 
description: 'Analyze document layout' }, + region: { type: 'string', description: 'AWS region' }, + accessKeyId: { type: 'string', description: 'AWS Access Key ID' }, + secretAccessKey: { type: 'string', description: 'AWS Secret Access Key' }, + }, + outputs: { + blocks: { + type: 'json', + description: 'Array of detected blocks (PAGE, LINE, WORD, TABLE, CELL, KEY_VALUE_SET, etc.)', + }, + documentMetadata: { + type: 'json', + description: 'Document metadata containing pages count', + }, + modelVersion: { + type: 'string', + description: 'Version of the Textract model used for processing', + }, + }, +} diff --git a/apps/sim/blocks/registry.ts b/apps/sim/blocks/registry.ts index 544c29432..201ea3b4b 100644 --- a/apps/sim/blocks/registry.ts +++ b/apps/sim/blocks/registry.ts @@ -123,6 +123,7 @@ import { SttBlock } from '@/blocks/blocks/stt' import { SupabaseBlock } from '@/blocks/blocks/supabase' import { TavilyBlock } from '@/blocks/blocks/tavily' import { TelegramBlock } from '@/blocks/blocks/telegram' +import { TextractBlock } from '@/blocks/blocks/textract' import { ThinkingBlock } from '@/blocks/blocks/thinking' import { TinybirdBlock } from '@/blocks/blocks/tinybird' import { TranslateBlock } from '@/blocks/blocks/translate' @@ -285,6 +286,7 @@ export const registry: Record = { stt: SttBlock, supabase: SupabaseBlock, tavily: TavilyBlock, telegram: TelegramBlock, + textract: TextractBlock, thinking: ThinkingBlock, tinybird: TinybirdBlock, diff --git a/apps/sim/components/icons.tsx b/apps/sim/components/icons.tsx index 1c245ffaf..689dbb50a 100644 --- a/apps/sim/components/icons.tsx +++ b/apps/sim/components/icons.tsx @@ -4093,6 +4093,23 @@ export function SQSIcon(props: SVGProps) { ) } +export function TextractIcon(props: SVGProps) { + return ( + + + + ) +} + export function McpIcon(props: SVGProps) { return ( maxLength) { logger.warn('Path segment exceeds maximum length', { paramName, @@ -86,7 +78,6 @@ } } - // Check for null
bytes (potential for bypass attacks) if (value.includes('\0') || value.includes('%00')) { logger.warn('Path segment contains null bytes', { paramName }) return { @@ -95,7 +86,6 @@ export function validatePathSegment( } } - // Check for path traversal patterns const pathTraversalPatterns = [ '..', './', @@ -124,7 +114,6 @@ export function validatePathSegment( } } - // Check for directory separators if (value.includes('/') || value.includes('\\')) { logger.warn('Path segment contains directory separators', { paramName }) return { @@ -133,7 +122,6 @@ export function validatePathSegment( } } - // Use custom pattern if provided if (customPattern) { if (!customPattern.test(value)) { logger.warn('Path segment failed custom pattern validation', { @@ -148,7 +136,6 @@ export function validatePathSegment( return { isValid: true, sanitized: value } } - // Build allowed character pattern let pattern = '^[a-zA-Z0-9' if (allowHyphens) pattern += '\\-' if (allowUnderscores) pattern += '_' @@ -947,6 +934,130 @@ export function validateAirtableId( return { isValid: true, sanitized: value } } +/** + * Validates an AWS region identifier + * + * AWS regions follow the pattern: {area}-{sub-area}-{number} + * Examples: us-east-1, eu-west-2, ap-southeast-1, sa-east-1 + * + * @param value - The AWS region to validate + * @param paramName - Name of the parameter for error messages + * @returns ValidationResult + * + * @example + * ```typescript + * const result = validateAwsRegion(region, 'region') + * if (!result.isValid) { + * return NextResponse.json({ error: result.error }, { status: 400 }) + * } + * ``` + */ +export function validateAwsRegion( + value: string | null | undefined, + paramName = 'region' +): ValidationResult { + if (value === null || value === undefined || value === '') { + return { + isValid: false, + error: `${paramName} is required`, + } + } + + // AWS region format: {area}-{sub-area}-{number} + // Examples: us-east-1, eu-west-2, ap-southeast-1, me-south-1, af-south-1 
+ const awsRegionPattern = /^[a-z]{2}-[a-z]+-\d{1,2}$/ + + if (!awsRegionPattern.test(value)) { + logger.warn('Invalid AWS region format', { + paramName, + value: value.substring(0, 50), + }) + return { + isValid: false, + error: `${paramName} must be a valid AWS region (e.g., us-east-1, eu-west-2)`, + } + } + + return { isValid: true, sanitized: value } +} + +/** + * Validates an S3 bucket name according to AWS naming rules + * + * S3 bucket names must: + * - Be 3-63 characters long + * - Start and end with a letter or number + * - Contain only lowercase letters, numbers, and hyphens + * - Not contain consecutive periods + * - Not be formatted as an IP address + * + * @param value - The S3 bucket name to validate + * @param paramName - Name of the parameter for error messages + * @returns ValidationResult + * + * @example + * ```typescript + * const result = validateS3BucketName(bucket, 'bucket') + * if (!result.isValid) { + * return NextResponse.json({ error: result.error }, { status: 400 }) + * } + * ``` + */ +export function validateS3BucketName( + value: string | null | undefined, + paramName = 'bucket' +): ValidationResult { + if (value === null || value === undefined || value === '') { + return { + isValid: false, + error: `${paramName} is required`, + } + } + + if (value.length < 3 || value.length > 63) { + logger.warn('S3 bucket name length invalid', { + paramName, + length: value.length, + }) + return { + isValid: false, + error: `${paramName} must be between 3 and 63 characters`, + } + } + + const bucketNamePattern = /^[a-z0-9][a-z0-9.-]*[a-z0-9]$|^[a-z0-9]$/ + + if (!bucketNamePattern.test(value)) { + logger.warn('Invalid S3 bucket name format', { + paramName, + value: value.substring(0, 63), + }) + return { + isValid: false, + error: `${paramName} must start and end with a letter or number, and contain only lowercase letters, numbers, hyphens, and periods`, + } + } + + if (value.includes('..')) { + logger.warn('S3 bucket name contains consecutive 
periods', { paramName }) + return { + isValid: false, + error: `${paramName} cannot contain consecutive periods`, + } + } + + const ipPattern = /^(\d{1,3}\.){3}\d{1,3}$/ + if (ipPattern.test(value)) { + logger.warn('S3 bucket name formatted as IP address', { paramName }) + return { + isValid: false, + error: `${paramName} cannot be formatted as an IP address`, + } + } + + return { isValid: true, sanitized: value } +} + /** * Validates a Google Calendar ID * diff --git a/apps/sim/lib/uploads/core/storage-service.ts b/apps/sim/lib/uploads/core/storage-service.ts index 0a7a004d8..b504db175 100644 --- a/apps/sim/lib/uploads/core/storage-service.ts +++ b/apps/sim/lib/uploads/core/storage-service.ts @@ -455,3 +455,27 @@ export async function generatePresignedDownloadUrl( export function hasCloudStorage(): boolean { return USE_BLOB_STORAGE || USE_S3_STORAGE } + +/** + * Get S3 bucket and key information for a storage key + * Useful for services that need direct S3 access (e.g., AWS Textract async) + */ +export function getS3InfoForKey( + key: string, + context: StorageContext +): { bucket: string; key: string } { + if (!USE_S3_STORAGE) { + throw new Error('S3 storage is not configured. 
Cannot retrieve S3 info for key.') + } + + const config = getStorageConfig(context) + + if (!config.bucket) { + throw new Error(`S3 bucket not configured for context: ${context}`) + } + + return { + bucket: config.bucket, + key, + } +} diff --git a/apps/sim/tools/registry.ts b/apps/sim/tools/registry.ts index 2a8088477..d15f72b5f 100644 --- a/apps/sim/tools/registry.ts +++ b/apps/sim/tools/registry.ts @@ -1500,6 +1500,7 @@ import { telegramSendPhotoTool, telegramSendVideoTool, } from '@/tools/telegram' +import { textractParserTool } from '@/tools/textract' import { thinkingTool } from '@/tools/thinking' import { tinybirdEventsTool, tinybirdQueryTool } from '@/tools/tinybird' import { @@ -2456,6 +2457,7 @@ export const tools: Record = { apollo_email_accounts: apolloEmailAccountsTool, mistral_parser: mistralParserTool, reducto_parser: reductoParserTool, + textract_parser: textractParserTool, thinking_tool: thinkingTool, tinybird_events: tinybirdEventsTool, tinybird_query: tinybirdQueryTool, diff --git a/apps/sim/tools/textract/index.ts b/apps/sim/tools/textract/index.ts new file mode 100644 index 000000000..5f618a8b4 --- /dev/null +++ b/apps/sim/tools/textract/index.ts @@ -0,0 +1,2 @@ +export { textractParserTool } from '@/tools/textract/parser' +export * from '@/tools/textract/types' diff --git a/apps/sim/tools/textract/parser.ts b/apps/sim/tools/textract/parser.ts new file mode 100644 index 000000000..58aa54c78 --- /dev/null +++ b/apps/sim/tools/textract/parser.ts @@ -0,0 +1,413 @@ +import { createLogger } from '@sim/logger' +import { getBaseUrl } from '@/lib/core/utils/urls' +import type { TextractParserInput, TextractParserOutput } from '@/tools/textract/types' +import type { ToolConfig } from '@/tools/types' + +const logger = createLogger('TextractParserTool') + +export const textractParserTool: ToolConfig = { + id: 'textract_parser', + name: 'AWS Textract Parser', + description: 'Parse documents using AWS Textract OCR and document analysis', + version: '1.0.0', 
+ + params: { + accessKeyId: { + type: 'string', + required: true, + visibility: 'user-only', + description: 'AWS Access Key ID', + }, + secretAccessKey: { + type: 'string', + required: true, + visibility: 'user-only', + description: 'AWS Secret Access Key', + }, + region: { + type: 'string', + required: true, + visibility: 'user-only', + description: 'AWS region for Textract service (e.g., us-east-1)', + }, + processingMode: { + type: 'string', + required: false, + visibility: 'user-only', + description: 'Processing mode: sync (single-page) or async (multi-page). Defaults to sync.', + }, + filePath: { + type: 'string', + required: false, + visibility: 'user-only', + description: + 'URL to a document to be processed (JPEG, PNG, PDF, or TIFF). Required for sync mode.', + }, + s3Uri: { + type: 'string', + required: false, + visibility: 'user-only', + description: + 'S3 URI for async processing (s3://bucket/key). Required for async mode with S3 input.', + }, + fileUpload: { + type: 'object', + required: false, + visibility: 'hidden', + description: 'File upload data from file-upload component', + }, + featureTypes: { + type: 'array', + required: false, + visibility: 'user-or-llm', + description: + 'Feature types to detect: TABLES, FORMS, QUERIES, SIGNATURES, LAYOUT. If not specified, only text detection is performed.', + items: { + type: 'string', + description: 'Feature type', + }, + }, + queries: { + type: 'array', + required: false, + visibility: 'user-or-llm', + description: + 'Custom queries to extract specific information.
Only used when featureTypes includes QUERIES.', + items: { + type: 'object', + description: 'Query configuration', + properties: { + Text: { type: 'string', description: 'The query text' }, + Alias: { type: 'string', description: 'Optional alias for the result' }, + }, + }, + }, + }, + + request: { + url: '/api/tools/textract/parse', + method: 'POST', + headers: () => { + return { + 'Content-Type': 'application/json', + Accept: 'application/json', + } + }, + body: (params) => { + if (!params || typeof params !== 'object') { + throw new Error('Invalid parameters: Parameters must be provided as an object') + } + + if ( + !params.accessKeyId || + typeof params.accessKeyId !== 'string' || + params.accessKeyId.trim() === '' + ) { + throw new Error('Missing or invalid AWS Access Key ID') + } + + if ( + !params.secretAccessKey || + typeof params.secretAccessKey !== 'string' || + params.secretAccessKey.trim() === '' + ) { + throw new Error('Missing or invalid AWS Secret Access Key') + } + + if (!params.region || typeof params.region !== 'string' || params.region.trim() === '') { + throw new Error('Missing or invalid AWS region') + } + + const processingMode = params.processingMode || 'sync' + + const requestBody: Record = { + accessKeyId: params.accessKeyId.trim(), + secretAccessKey: params.secretAccessKey.trim(), + region: params.region.trim(), + processingMode, + } + + if (processingMode === 'async') { + if (params.s3Uri && typeof params.s3Uri === 'string' && params.s3Uri.trim() !== '') { + const s3UriTrimmed = params.s3Uri.trim() + if (!s3UriTrimmed.match(/^s3:\/\/[^/]+\/.+$/)) { + throw new Error('Invalid S3 URI format. 
Expected: s3://bucket-name/path/to/object') + } + requestBody.s3Uri = s3UriTrimmed + } else if (params.fileUpload) { + if ( + typeof params.fileUpload === 'object' && + params.fileUpload !== null && + (params.fileUpload.url || params.fileUpload.path) + ) { + const uploadedFilePath = (params.fileUpload.path || params.fileUpload.url) as string + if (uploadedFilePath.startsWith('/api/files/serve/')) { + requestBody.filePath = uploadedFilePath + } else { + throw new Error('Async mode with upload requires files stored in S3') + } + } else { + throw new Error('Invalid file upload: Upload data is missing or invalid') + } + } else { + throw new Error('Async mode requires either an S3 URI or an uploaded file') + } + } else { + if ( + params.fileUpload && + (!params.filePath || params.filePath === 'null' || params.filePath === '') + ) { + if ( + typeof params.fileUpload === 'object' && + params.fileUpload !== null && + (params.fileUpload.url || params.fileUpload.path) + ) { + let uploadedFilePath = (params.fileUpload.url || params.fileUpload.path) as string + + if (uploadedFilePath.startsWith('/')) { + const baseUrl = getBaseUrl() + if (!baseUrl) throw new Error('Failed to get base URL for file path conversion') + uploadedFilePath = `${baseUrl}${uploadedFilePath}` + } + + params.filePath = uploadedFilePath + logger.info('Using uploaded file:', uploadedFilePath) + } else { + throw new Error('Invalid file upload: Upload data is missing or invalid') + } + } + + if ( + !params.filePath || + typeof params.filePath !== 'string' || + params.filePath.trim() === '' + ) { + throw new Error('Missing or invalid file path: Please provide a URL to a document') + } + + let filePathToValidate = params.filePath.trim() + if (filePathToValidate.startsWith('/')) { + const baseUrl = getBaseUrl() + if (!baseUrl) throw new Error('Failed to get base URL for file path conversion') + filePathToValidate = `${baseUrl}${filePathToValidate}` + } + + let url + try { + url = new URL(filePathToValidate) + + 
if (!['http:', 'https:'].includes(url.protocol)) { + throw new Error( + `Invalid protocol: ${url.protocol}. URL must use HTTP or HTTPS protocol` + ) + } + } catch (error) { + const errorMessage = error instanceof Error ? error.message : String(error) + throw new Error( + `Invalid URL format: ${errorMessage}. Please provide a valid HTTP or HTTPS URL to a document.` + ) + } + + requestBody.filePath = url.toString() + + if (params.fileUpload?.path?.startsWith('/api/files/serve/')) { + requestBody.filePath = params.fileUpload.path + } + } + + if (params.featureTypes && Array.isArray(params.featureTypes)) { + const validFeatures = ['TABLES', 'FORMS', 'QUERIES', 'SIGNATURES', 'LAYOUT'] + const filteredFeatures = params.featureTypes.filter((f) => + validFeatures.includes(f as string) + ) + if (filteredFeatures.length > 0) { + requestBody.featureTypes = filteredFeatures + } + } + + if (params.queries && Array.isArray(params.queries) && params.queries.length > 0) { + const validQueries = params.queries + .filter((q) => q && typeof q === 'object' && typeof q.Text === 'string' && q.Text.trim()) + .map((q) => ({ + Text: q.Text.trim(), + Alias: q.Alias?.trim() || undefined, + Pages: q.Pages || undefined, + })) + + if (validQueries.length > 0) { + requestBody.queries = validQueries + + if (!requestBody.featureTypes) { + requestBody.featureTypes = ['QUERIES'] + } else if ( + Array.isArray(requestBody.featureTypes) && + !requestBody.featureTypes.includes('QUERIES') + ) { + ;(requestBody.featureTypes as string[]).push('QUERIES') + } + } + } + + return requestBody + }, + }, + + transformResponse: async (response) => { + try { + let apiResult + try { + apiResult = await response.json() + } catch (jsonError) { + throw new Error( + `Failed to parse Textract response: ${jsonError instanceof Error ? 
jsonError.message : String(jsonError)}` + ) + } + + if (!apiResult || typeof apiResult !== 'object') { + throw new Error('Invalid response format from Textract API') + } + + if (!apiResult.success && apiResult.error) { + throw new Error(apiResult.error) + } + + const textractData = apiResult.output ?? apiResult + + return { + success: true, + output: { + blocks: textractData.Blocks ?? textractData.blocks ?? [], + documentMetadata: { + pages: + textractData.DocumentMetadata?.Pages ?? textractData.documentMetadata?.pages ?? 0, + }, + modelVersion: + textractData.AnalyzeDocumentModelVersion ?? + textractData.analyzeDocumentModelVersion ?? + textractData.DetectDocumentTextModelVersion ?? + textractData.detectDocumentTextModelVersion ?? + undefined, + }, + } + } catch (error) { + logger.error('Error processing Textract result:', error) + throw error + } + }, + + outputs: { + blocks: { + type: 'array', + description: + 'Array of Block objects containing detected text, tables, forms, and other elements', + items: { + type: 'object', + properties: { + BlockType: { + type: 'string', + description: 'Type of block (PAGE, LINE, WORD, TABLE, CELL, KEY_VALUE_SET, etc.)', + }, + Id: { type: 'string', description: 'Unique identifier for the block' }, + Text: { + type: 'string', + description: 'The text content (for LINE and WORD blocks)', + optional: true, + }, + TextType: { + type: 'string', + description: 'Type of text (PRINTED or HANDWRITING)', + optional: true, + }, + Confidence: { type: 'number', description: 'Confidence score (0-100)', optional: true }, + Page: { type: 'number', description: 'Page number', optional: true }, + Geometry: { + type: 'object', + description: 'Location and bounding box information', + optional: true, + properties: { + BoundingBox: { + type: 'object', + properties: { + Height: { type: 'number', description: 'Height as ratio of document height' }, + Left: { type: 'number', description: 'Left position as ratio of document width' }, + Top: { type: 
'number', description: 'Top position as ratio of document height' }, + Width: { type: 'number', description: 'Width as ratio of document width' }, + }, + }, + Polygon: { + type: 'array', + description: 'Polygon coordinates', + items: { + type: 'object', + properties: { + X: { type: 'number', description: 'X coordinate' }, + Y: { type: 'number', description: 'Y coordinate' }, + }, + }, + }, + }, + }, + Relationships: { + type: 'array', + description: 'Relationships to other blocks', + optional: true, + items: { + type: 'object', + properties: { + Type: { + type: 'string', + description: 'Relationship type (CHILD, VALUE, ANSWER, etc.)', + }, + Ids: { type: 'array', description: 'IDs of related blocks' }, + }, + }, + }, + EntityTypes: { + type: 'array', + description: 'Entity types for KEY_VALUE_SET (KEY or VALUE)', + optional: true, + }, + SelectionStatus: { + type: 'string', + description: 'For checkboxes: SELECTED or NOT_SELECTED', + optional: true, + }, + RowIndex: { type: 'number', description: 'Row index for table cells', optional: true }, + ColumnIndex: { + type: 'number', + description: 'Column index for table cells', + optional: true, + }, + RowSpan: { type: 'number', description: 'Row span for merged cells', optional: true }, + ColumnSpan: { + type: 'number', + description: 'Column span for merged cells', + optional: true, + }, + Query: { + type: 'object', + description: 'Query information for QUERY blocks', + optional: true, + properties: { + Text: { type: 'string', description: 'Query text' }, + Alias: { type: 'string', description: 'Query alias', optional: true }, + Pages: { type: 'array', description: 'Pages to search', optional: true }, + }, + }, + }, + }, + }, + documentMetadata: { + type: 'object', + description: 'Metadata about the analyzed document', + properties: { + pages: { type: 'number', description: 'Number of pages in the document' }, + }, + }, + modelVersion: { + type: 'string', + description: 'Version of the Textract model used for 
processing', + optional: true, + }, + }, +} diff --git a/apps/sim/tools/textract/types.ts b/apps/sim/tools/textract/types.ts new file mode 100644 index 000000000..4fb66eea2 --- /dev/null +++ b/apps/sim/tools/textract/types.ts @@ -0,0 +1,114 @@ +import type { ToolResponse } from '@/tools/types' + +export type TextractProcessingMode = 'sync' | 'async' + +export interface TextractParserInput { + accessKeyId: string + secretAccessKey: string + region: string + processingMode?: TextractProcessingMode + filePath?: string + s3Uri?: string + fileUpload?: { + url?: string + path?: string + } + featureTypes?: TextractFeatureType[] + queries?: TextractQuery[] +} + +export type TextractFeatureType = 'TABLES' | 'FORMS' | 'QUERIES' | 'SIGNATURES' | 'LAYOUT' + +export interface TextractQuery { + Text: string + Alias?: string + Pages?: string[] +} + +export interface TextractBoundingBox { + Height: number + Left: number + Top: number + Width: number +} + +export interface TextractPolygonPoint { + X: number + Y: number +} + +export interface TextractGeometry { + BoundingBox: TextractBoundingBox + Polygon: TextractPolygonPoint[] + RotationAngle?: number +} + +export interface TextractRelationship { + Type: string + Ids: string[] +} + +export interface TextractBlock { + BlockType: string + Id: string + Text?: string + TextType?: string + Confidence?: number + Geometry?: TextractGeometry + Relationships?: TextractRelationship[] + Page?: number + EntityTypes?: string[] + SelectionStatus?: string + RowIndex?: number + ColumnIndex?: number + RowSpan?: number + ColumnSpan?: number + Query?: { + Text: string + Alias?: string + Pages?: string[] + } +} + +/** AWS Textract DocumentMetadata - exact API format */ +export interface TextractDocumentMetadataRaw { + Pages: number +} + +/** Normalized DocumentMetadata (camelCase) */ +export interface TextractDocumentMetadata { + pages: number +} + +/** AWS Textract API Response - exact API format */ +export interface TextractApiResponse { + Blocks: 
TextractBlock[] + DocumentMetadata: TextractDocumentMetadataRaw + AnalyzeDocumentModelVersion?: string + DetectDocumentTextModelVersion?: string +} + +export interface TextractNormalizedOutput { + blocks: TextractBlock[] + documentMetadata: TextractDocumentMetadata + modelVersion?: string +} + +/** Async job status response from Textract */ +export interface TextractAsyncJobResponse { + JobStatus: 'IN_PROGRESS' | 'SUCCEEDED' | 'FAILED' | 'PARTIAL_SUCCESS' + StatusMessage?: string + Blocks?: TextractBlock[] + DocumentMetadata?: TextractDocumentMetadataRaw + NextToken?: string + AnalyzeDocumentModelVersion?: string + DetectDocumentTextModelVersion?: string +} + +export interface TextractStartJobResponse { + JobId: string +} + +export interface TextractParserOutput extends ToolResponse { + output: TextractNormalizedOutput +}