feat(tools): added textract

This commit is contained in:
waleed
2026-01-20 11:06:38 -08:00
parent a26a1a9737
commit ecf39c5a54
14 changed files with 1706 additions and 13 deletions

View File

@@ -4093,6 +4093,23 @@ export function SQSIcon(props: SVGProps<SVGSVGElement>) {
)
}
/**
 * AWS Textract service icon rendered as an inline SVG.
 *
 * All standard SVG props are forwarded via spread, and the glyph uses
 * `fill='currentColor'` so it inherits the surrounding text color.
 */
export function TextractIcon(props: SVGProps<SVGSVGElement>) {
  return (
    <svg
      {...props}
      viewBox='10 14 60 52'
      version='1.1'
      xmlns='http://www.w3.org/2000/svg'
      xmlnsXlink='http://www.w3.org/1999/xlink'
    >
      {/* Single-path glyph from the AWS architecture icon set */}
      <path
        d='M22.0624102,50 C24.3763895,53.603 28.4103535,56 33.0003125,56 C40.1672485,56 45.9991964,50.168 45.9991964,43 C45.9991964,35.832 40.1672485,30 33.0003125,30 C27.6033607,30 22.9664021,33.307 21.0024196,38 L23.2143999,38 C25.0393836,34.444 28.7363506,32 33.0003125,32 C39.0652583,32 43.9992143,36.935 43.9992143,43 C43.9992143,49.065 39.0652583,54 33.0003125,54 C29.5913429,54 26.5413702,52.441 24.5213882,50 L22.0624102,50 Z M37.0002768,45 L37.0002768,43 L41.9992321,43 C41.9992321,38.038 37.9622682,34 33.0003125,34 C28.0373568,34 23.9993929,38.038 23.9993929,43 L28.9993482,43 L28.9993482,45 L24.2313908,45 C25.1443826,49.002 28.7253507,52 33.0003125,52 C35.1362934,52 37.0992759,51.249 38.6442621,50 L34.0003036,50 L34.0003036,48 L40.4782457,48 C41.0812403,47.102 41.5202364,46.087 41.7682342,45 L37.0002768,45 Z M21.0024196,48 L23.2143999,48 C22.4434068,46.498 22.0004107,44.801 22.0004107,43 C22.0004107,41.959 22.1554093,40.955 22.4264069,40 L20.3634253,40 C20.1344274,40.965 19.9994286,41.966 19.9994286,43 C19.9994286,44.771 20.3584254,46.46 21.0024196,48 L21.0024196,48 Z M19.7434309,50 L17.0004554,50 L17.0004554,48 L18.8744386,48 C18.5344417,47.04 18.2894438,46.038 18.1494451,45 L15.4144695,45 L16.707458,46.293 L15.2924706,47.707 L12.2924974,44.707 C11.9025009,44.316 11.9025009,43.684 12.2924974,43.293 L15.2924706,40.293 L16.707458,41.707 L15.4144695,43 L18.0004464,43 C18.0004464,41.973 18.1044455,40.97 18.3024437,40 L17.0004554,40 L17.0004554,38 L18.8744386,38 C20.9404202,32.184 26.4833707,28 33.0003125,28 C37.427273,28 41.4002375,29.939 44.148213,33 L59.0000804,33 L59.0000804,35 L45.6661994,35 C47.1351863,37.318 47.9991786,40.058 47.9991786,43 L59.0000804,43 L59.0000804,45 L47.8501799,45 C46.8681887,52.327 40.5912447,58 33.0003125,58 C27.2563638,58 22.2624084,54.752 19.7434309,50 L19.7434309,50 Z M37.0002768,39 C37.0002768,38.448 36.5522808,38 36.0002857,38 L29.9993482,38 C29.4473442,38 28.9993482,38.448 28.9993482,39 L28.9993482,41 L31.0003304,41 L31.0003304,40 
L32.0003214,40 L32.0003214,43 L31.0003304,43 L31.0003304,45 L35.0002946,45 L35.0002946,43 L34.0003036,43 L34.0003036,40 L35.0002946,40 L35.0002946,41 L37.0002768,41 L37.0002768,39 Z M49.0001696,40 L59.0000804,40 L59.0000804,38 L49.0001696,38 L49.0001696,40 Z M49.0001696,50 L59.0000804,50 L59.0000804,48 L49.0001696,48 L49.0001696,50 Z M57.0000982,27 L60.5850662,27 L57.0000982,23.414 L57.0000982,27 Z M63.7070383,27.293 C63.8940367,27.48 64.0000357,27.735 64.0000357,28 L64.0000357,63 C64.0000357,63.552 63.5520397,64 63.0000446,64 L32.0003304,64 C31.4473264,64 31.0003304,63.552 31.0003304,63 L31.0003304,59 L33.0003125,59 L33.0003125,62 L62.0000536,62 L62.0000536,29 L56.0001071,29 C55.4471121,29 55.0001161,28.552 55.0001161,28 L55.0001161,22 L33.0003125,22 L33.0003125,27 L31.0003304,27 L31.0003304,21 C31.0003304,20.448 31.4473264,20 32.0003304,20 L56.0001071,20 C56.2651048,20 56.5191025,20.105 56.7071008,20.293 L63.7070383,27.293 Z M68,24.166 L68,61 C68,61.552 67.552004,62 67.0000089,62 L65.0000268,62 L65.0000268,60 L66.0000179,60 L66.0000179,24.612 L58.6170838,18 L36.0002857,18 L36.0002857,19 L34.0003036,19 L34.0003036,17 C34.0003036,16.448 34.4472996,16 35.0003036,16 L59.0000804,16 C59.2460782,16 59.483076,16.091 59.6660744,16.255 L67.666003,23.42 C67.8780011,23.61 68,23.881 68,24.166 L68,24.166 Z'
        fill='currentColor'
      />
    </svg>
  )
}
export function McpIcon(props: SVGProps<SVGSVGElement>) {
return (
<svg

View File

@@ -110,6 +110,7 @@ import {
SupabaseIcon,
TavilyIcon,
TelegramIcon,
TextractIcon,
TinybirdIcon,
TranslateIcon,
TrelloIcon,
@@ -237,6 +238,7 @@ export const blockTypeToIconMap: Record<string, IconComponent> = {
supabase: SupabaseIcon,
tavily: TavilyIcon,
telegram: TelegramIcon,
textract: TextractIcon,
tinybird: TinybirdIcon,
translate: TranslateIcon,
trello: TrelloIcon,

View File

@@ -106,6 +106,7 @@
"supabase",
"tavily",
"telegram",
"textract",
"tinybird",
"translate",
"trello",

View File

@@ -0,0 +1,120 @@
---
title: AWS Textract
description: Extract text, tables, and forms from documents
---
import { BlockInfoCard } from "@/components/ui/block-info-card"
<BlockInfoCard
type="textract"
color="linear-gradient(135deg, #055F4E 0%, #56C0A7 100%)"
/>
{/* MANUAL-CONTENT-START:intro */}
[AWS Textract](https://aws.amazon.com/textract/) is a powerful AI service from Amazon Web Services designed to automatically extract printed text, handwriting, tables, forms, key-value pairs, and other structured data from scanned documents and images. Textract leverages advanced optical character recognition (OCR) and document analysis to transform documents into actionable data, enabling automation, analytics, compliance, and more.
With AWS Textract, you can:
- **Extract text from images and documents**: Recognize printed text and handwriting in formats such as PDF, JPEG, PNG, or TIFF
- **Detect and extract tables**: Automatically find tables and output their structured content
- **Parse forms and key-value pairs**: Pull structured data from forms, including fields and their corresponding values
- **Identify signatures and layout features**: Detect signatures, geometric layout, and relationships between document elements
- **Customize extraction with queries**: Extract specific fields and answers using query-based extraction (e.g., "What is the invoice number?")
In Sim, the AWS Textract integration empowers your agents to intelligently process documents as part of their workflows. This unlocks automation scenarios such as data entry from invoices, onboarding documents, contracts, receipts, and more. Your agents can extract relevant data, analyze structured forms, and generate summaries or reports directly from document uploads or URLs. By connecting Sim with AWS Textract, you can reduce manual effort, improve data accuracy, and streamline your business processes with robust document understanding.
{/* MANUAL-CONTENT-END */}
## Usage Instructions
Integrate AWS Textract into your workflow to extract text, tables, forms, and key-value pairs from documents. Sync mode supports JPEG, PNG, and single-page PDF. Async mode supports multi-page PDF and TIFF via S3.
## Tools
### `textract_parser`
Parse documents using AWS Textract OCR and document analysis
#### Input
| Parameter | Type | Required | Description |
| --------- | ---- | -------- | ----------- |
| `accessKeyId` | string | Yes | AWS Access Key ID |
| `secretAccessKey` | string | Yes | AWS Secret Access Key |
| `region` | string | Yes | AWS region for Textract service \(e.g., us-east-1\) |
| `processingMode` | string | No | Document type: single-page or multi-page. Defaults to single-page. |
| `filePath` | string | No | URL to a document to be processed \(JPEG, PNG, PDF, or TIFF\). Required for sync mode. |
| `s3Uri` | string | No | S3 URI for async processing \(s3://bucket/key\). Required for async mode with S3 input. |
| `fileUpload` | object | No | File upload data from file-upload component |
| `featureTypes` | array | No | Feature types to detect: TABLES, FORMS, QUERIES, SIGNATURES, LAYOUT. If not specified, only text detection is performed. |
| `items` | string | No | Feature type |
| `queries` | array | No | Custom queries to extract specific information. Only used when featureTypes includes QUERIES. |
| `items` | object | No | Query configuration |
| `properties` | string | No | The query text |
| `Text` | string | No | The query text to ask of the document |
| `Alias` | string | No | Optional alias used to identify the query result |
#### Output
| Parameter | Type | Description |
| --------- | ---- | ----------- |
| `blocks` | array | Array of Block objects containing detected text, tables, forms, and other elements |
| ↳ `BlockType` | string | Type of block \(PAGE, LINE, WORD, TABLE, CELL, KEY_VALUE_SET, etc.\) |
| ↳ `Id` | string | Unique identifier for the block |
| ↳ `Text` | string | Text content detected in the block |
| ↳ `TextType` | string | Type of text \(PRINTED or HANDWRITING\) |
| ↳ `Confidence` | number | Confidence score \(0-100\) |
| ↳ `Page` | number | Page number |
| ↳ `Geometry` | object | Location and bounding box information |
| ↳ `BoundingBox` | object | Axis-aligned bounding box of the block |
| ↳ `Height` | number | Height as ratio of document height |
| ↳ `Left` | number | Left position as ratio of document width |
| ↳ `Top` | number | Top position as ratio of document height |
| ↳ `Width` | number | Width as ratio of document width |
| ↳ `Height` | number | Height as ratio of document height |
| ↳ `Left` | number | Left position as ratio of document width |
| ↳ `Top` | number | Top position as ratio of document height |
| ↳ `Width` | number | Width as ratio of document width |
| ↳ `Polygon` | array | Polygon coordinates |
| ↳ `X` | number | X coordinate |
| ↳ `Y` | number | Y coordinate |
| ↳ `X` | number | X coordinate |
| ↳ `Y` | number | Y coordinate |
| ↳ `BoundingBox` | object | Axis-aligned bounding box of the block |
| ↳ `Height` | number | Height as ratio of document height |
| ↳ `Left` | number | Left position as ratio of document width |
| ↳ `Top` | number | Top position as ratio of document height |
| ↳ `Width` | number | Width as ratio of document width |
| ↳ `Height` | number | Height as ratio of document height |
| ↳ `Left` | number | Left position as ratio of document width |
| ↳ `Top` | number | Top position as ratio of document height |
| ↳ `Width` | number | Width as ratio of document width |
| ↳ `Polygon` | array | Polygon coordinates |
| ↳ `X` | number | X coordinate |
| ↳ `Y` | number | Y coordinate |
| ↳ `X` | number | X coordinate |
| ↳ `Y` | number | Y coordinate |
| ↳ `Relationships` | array | Relationships to other blocks |
| ↳ `Type` | string | Relationship type \(CHILD, VALUE, ANSWER, etc.\) |
| ↳ `Ids` | array | IDs of related blocks |
| ↳ `Type` | string | Relationship type \(CHILD, VALUE, ANSWER, etc.\) |
| ↳ `Ids` | array | IDs of related blocks |
| ↳ `EntityTypes` | array | Entity types for KEY_VALUE_SET \(KEY or VALUE\) |
| ↳ `SelectionStatus` | string | For checkboxes: SELECTED or NOT_SELECTED |
| ↳ `RowIndex` | number | Row index for table cells |
| ↳ `ColumnIndex` | number | Column index for table cells |
| ↳ `RowSpan` | number | Row span for merged cells |
| ↳ `ColumnSpan` | number | Column span for merged cells |
| ↳ `Query` | object | Query information for QUERY blocks |
| ↳ `Text` | string | Query text |
| ↳ `Alias` | string | Query alias |
| ↳ `Pages` | array | Pages to search |
| ↳ `Alias` | string | Query alias |
| ↳ `Pages` | array | Pages to search |
| `documentMetadata` | object | Metadata about the analyzed document |
| ↳ `pages` | number | Number of pages in the document |
| `modelVersion` | string | Version of the Textract model used for processing |

View File

@@ -0,0 +1,617 @@
import crypto from 'crypto'
import { createLogger } from '@sim/logger'
import { type NextRequest, NextResponse } from 'next/server'
import { z } from 'zod'
import { checkHybridAuth } from '@/lib/auth/hybrid'
import {
validateAwsRegion,
validateExternalUrl,
validateS3BucketName,
} from '@/lib/core/security/input-validation'
import { generateRequestId } from '@/lib/core/utils/request'
import { getBaseUrl } from '@/lib/core/utils/urls'
import { StorageService } from '@/lib/uploads'
import { extractStorageKey, inferContextFromKey } from '@/lib/uploads/utils/file-utils'
import { verifyFileAccess } from '@/app/api/files/authorization'
// Always execute this route dynamically (never statically cached).
export const dynamic = 'force-dynamic'
export const maxDuration = 300 // 5 minutes for large multi-page PDF processing

const logger = createLogger('TextractParseAPI')

// A single Textract query (QUERIES feature): the question text plus an
// optional alias and optional page selection.
const QuerySchema = z.object({
  Text: z.string().min(1),
  Alias: z.string().optional(),
  Pages: z.array(z.string()).optional(),
})

// Request-body schema. `processingMode` selects the sync (inline bytes) vs
// async (S3-based) Textract API family; the region format is validated in
// superRefine so the issue is attached to the `region` path.
// NOTE(review): the tool docs/block config mention a `fileUpload` parameter,
// but it is not declared here and would be stripped by zod — confirm intended.
const TextractParseSchema = z
  .object({
    accessKeyId: z.string().min(1, 'AWS Access Key ID is required'),
    secretAccessKey: z.string().min(1, 'AWS Secret Access Key is required'),
    region: z.string().min(1, 'AWS region is required'),
    processingMode: z.enum(['sync', 'async']).optional().default('sync'),
    filePath: z.string().optional(),
    s3Uri: z.string().optional(),
    featureTypes: z
      .array(z.enum(['TABLES', 'FORMS', 'QUERIES', 'SIGNATURES', 'LAYOUT']))
      .optional(),
    queries: z.array(QuerySchema).optional(),
  })
  .superRefine((data, ctx) => {
    const regionValidation = validateAwsRegion(data.region, 'AWS region')
    if (!regionValidation.isValid) {
      ctx.addIssue({
        code: z.ZodIssueCode.custom,
        message: regionValidation.error,
        path: ['region'],
      })
    }
  })
/**
 * Derive the AWS Signature Version 4 signing key.
 *
 * Implements the HMAC chain from the SigV4 spec:
 * kSecret -> kDate -> kRegion -> kService -> kSigning.
 */
function getSignatureKey(
  key: string,
  dateStamp: string,
  regionName: string,
  serviceName: string
): Buffer {
  const hmac = (secret: Buffer | string, data: string): Buffer =>
    crypto.createHmac('sha256', secret).update(data).digest()
  // Fold the scope parts over the date key, starting from the secret key.
  return [regionName, serviceName, 'aws4_request'].reduce(
    (derived, scopePart) => hmac(derived, scopePart),
    hmac(`AWS4${key}`, dateStamp)
  )
}

/**
 * Build the headers for a SigV4-signed AWS JSON-1.1 request.
 *
 * The signature covers the canonical request (method, URI, headers, payload
 * hash) and uses the current UTC time, so the returned headers are only valid
 * for a short window.
 */
function signAwsRequest(
  method: string,
  host: string,
  uri: string,
  body: string,
  accessKeyId: string,
  secretAccessKey: string,
  region: string,
  service: string,
  amzTarget: string
): Record<string, string> {
  // Timestamps: amzDate is e.g. 20260120T190638Z; dateStamp is its date prefix.
  const amzDate = new Date().toISOString().replace(/[:-]|\.\d{3}/g, '')
  const dateStamp = amzDate.slice(0, 8)

  // Canonical headers must be lowercase, sorted, and newline-terminated.
  const headerList: Array<[string, string]> = [
    ['content-type', 'application/x-amz-json-1.1'],
    ['host', host],
    ['x-amz-date', amzDate],
    ['x-amz-target', amzTarget],
  ]
  const canonicalHeaders = headerList.map(([name, value]) => `${name}:${value}\n`).join('')
  const signedHeaders = headerList.map(([name]) => name).join(';')

  const payloadHash = crypto.createHash('sha256').update(body).digest('hex')
  const canonicalRequest = `${method}\n${uri}\n\n${canonicalHeaders}\n${signedHeaders}\n${payloadHash}`

  const algorithm = 'AWS4-HMAC-SHA256'
  const credentialScope = `${dateStamp}/${region}/${service}/aws4_request`
  const hashedCanonicalRequest = crypto
    .createHash('sha256')
    .update(canonicalRequest)
    .digest('hex')
  const stringToSign = `${algorithm}\n${amzDate}\n${credentialScope}\n${hashedCanonicalRequest}`

  const signingKey = getSignatureKey(secretAccessKey, dateStamp, region, service)
  const signature = crypto.createHmac('sha256', signingKey).update(stringToSign).digest('hex')

  return {
    'Content-Type': 'application/x-amz-json-1.1',
    Host: host,
    'X-Amz-Date': amzDate,
    'X-Amz-Target': amzTarget,
    Authorization: `${algorithm} Credential=${accessKeyId}/${credentialScope}, SignedHeaders=${signedHeaders}, Signature=${signature}`,
  }
}
/**
 * Download a document from a URL and return its contents base64-encoded
 * together with the reported content type.
 *
 * @throws Error when the HTTP response is not OK.
 */
async function fetchDocumentBytes(url: string): Promise<{ bytes: string; contentType: string }> {
  const response = await fetch(url)
  if (!response.ok) {
    throw new Error(`Failed to fetch document: ${response.statusText}`)
  }
  const payload = Buffer.from(await response.arrayBuffer())
  // Fall back to a generic binary type when the server omits the header.
  return {
    bytes: payload.toString('base64'),
    contentType: response.headers.get('content-type') || 'application/octet-stream',
  }
}
/**
 * Split an `s3://bucket/key` URI into its bucket and key components,
 * validating the bucket name and rejecting path-traversal keys.
 *
 * @throws Error for malformed URIs, invalid bucket names, or unsafe keys.
 */
function parseS3Uri(s3Uri: string): { bucket: string; key: string } {
  const parts = /^s3:\/\/([^/]+)\/(.+)$/.exec(s3Uri)
  if (!parts) {
    throw new Error(
      `Invalid S3 URI format: ${s3Uri}. Expected format: s3://bucket-name/path/to/object`
    )
  }
  const [, bucket, key] = parts
  const bucketValidation = validateS3BucketName(bucket, 'S3 bucket name')
  if (!bucketValidation.isValid) {
    throw new Error(bucketValidation.error)
  }
  // Keys must be relative and free of `..` segments to prevent traversal.
  if (key.startsWith('/') || key.includes('..')) {
    throw new Error('S3 key contains invalid path traversal sequences')
  }
  return { bucket, key }
}
/**
 * Promise-based delay helper used between async job polls.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((resolve) => {
    setTimeout(resolve, ms)
  })
}
/**
 * Issue a SigV4-signed POST to the Textract endpoint and return the parsed
 * JSON response.
 *
 * On non-2xx responses, the AWS error document is inspected for a `Message`
 * or `__type` field to surface a useful error; otherwise the HTTP status
 * text is used.
 *
 * @throws Error with the extracted message on any non-OK response.
 */
async function callTextractAsync(
  host: string,
  amzTarget: string,
  body: Record<string, unknown>,
  accessKeyId: string,
  secretAccessKey: string,
  region: string
): Promise<Record<string, unknown>> {
  const payload = JSON.stringify(body)
  const requestHeaders = signAwsRequest(
    'POST',
    host,
    '/',
    payload,
    accessKeyId,
    secretAccessKey,
    region,
    'textract',
    amzTarget
  )
  const response = await fetch(`https://${host}/`, {
    method: 'POST',
    headers: requestHeaders,
    body: payload,
  })
  if (response.ok) {
    return response.json()
  }

  // Non-OK: try to extract the AWS-provided error message from the body.
  const errorText = await response.text()
  let errorMessage = `Textract API error: ${response.statusText}`
  try {
    const errorJson = JSON.parse(errorText)
    if (errorJson.Message) {
      errorMessage = errorJson.Message
    } else if (errorJson.__type) {
      errorMessage = `${errorJson.__type}: ${errorJson.message || errorText}`
    }
  } catch {
    // Body was not JSON — keep the statusText-based message.
  }
  throw new Error(errorMessage)
}
/**
 * Poll an async Textract job until it completes, then gather all result
 * pages into a single response.
 *
 * Polls every 5 seconds for up to 3 minutes. On SUCCEEDED, follows
 * `NextToken` pagination and concatenates all `Blocks` into one array.
 * On FAILED, throws with the job's status message. On PARTIAL_SUCCESS, the
 * first result page is returned as-is (NextToken is not paginated here).
 *
 * @param useAnalyzeDocument - selects the Get* operation matching the Start*
 *   operation that created the job.
 * @throws Error on job failure or when polling times out.
 */
async function pollForJobCompletion(
  host: string,
  jobId: string,
  accessKeyId: string,
  secretAccessKey: string,
  region: string,
  useAnalyzeDocument: boolean,
  requestId: string
): Promise<Record<string, unknown>> {
  const pollIntervalMs = 5000 // 5 seconds between polls
  const maxPollTimeMs = 180000 // 3 minutes maximum polling time
  const maxAttempts = Math.ceil(maxPollTimeMs / pollIntervalMs)
  // The Get* operation must match the Start* operation used to create the job.
  const getTarget = useAnalyzeDocument
    ? 'Textract.GetDocumentAnalysis'
    : 'Textract.GetDocumentTextDetection'
  for (let attempt = 0; attempt < maxAttempts; attempt++) {
    const result = await callTextractAsync(
      host,
      getTarget,
      { JobId: jobId },
      accessKeyId,
      secretAccessKey,
      region
    )
    const jobStatus = result.JobStatus as string
    if (jobStatus === 'SUCCEEDED') {
      logger.info(`[${requestId}] Async job completed successfully after ${attempt + 1} polls`)
      // Collect every page of results by following NextToken.
      let allBlocks = (result.Blocks as unknown[]) || []
      let nextToken = result.NextToken as string | undefined
      while (nextToken) {
        const nextResult = await callTextractAsync(
          host,
          getTarget,
          { JobId: jobId, NextToken: nextToken },
          accessKeyId,
          secretAccessKey,
          region
        )
        allBlocks = allBlocks.concat((nextResult.Blocks as unknown[]) || [])
        nextToken = nextResult.NextToken as string | undefined
      }
      return {
        ...result,
        Blocks: allBlocks,
      }
    }
    if (jobStatus === 'FAILED') {
      throw new Error(`Textract job failed: ${result.StatusMessage || 'Unknown error'}`)
    }
    if (jobStatus === 'PARTIAL_SUCCESS') {
      logger.warn(`[${requestId}] Job completed with partial success: ${result.StatusMessage}`)
      return result
    }
    // Still IN_PROGRESS — wait before the next poll.
    logger.info(`[${requestId}] Job status: ${jobStatus}, attempt ${attempt + 1}/${maxAttempts}`)
    await sleep(pollIntervalMs)
  }
  throw new Error(
    `Timeout waiting for Textract job to complete (max ${maxPollTimeMs / 1000} seconds)`
  )
}
/**
 * POST handler: parse a document with AWS Textract using caller-supplied
 * AWS credentials.
 *
 * Sync mode (`processingMode: 'sync'`): the document is fetched from
 * `filePath` (internal upload, app-relative path, or SSRF-validated external
 * URL) and sent inline to DetectDocumentText / AnalyzeDocument.
 *
 * Async mode (`processingMode: 'async'`): the document must already be in S3
 * (explicit `s3Uri`, or an internal upload whose backing S3 object can be
 * resolved); a Start* job is created and polled to completion.
 *
 * Responses: 401 unauthorized, 400 validation/SSRF errors, 404 inaccessible
 * file, Textract's own status on API errors, 500 otherwise.
 */
export async function POST(request: NextRequest) {
  const requestId = generateRequestId()
  try {
    // Authenticate the caller (session, API key, or internal token).
    const authResult = await checkHybridAuth(request, { requireWorkflowId: false })
    if (!authResult.success || !authResult.userId) {
      logger.warn(`[${requestId}] Unauthorized Textract parse attempt`, {
        error: authResult.error || 'Missing userId',
      })
      return NextResponse.json(
        {
          success: false,
          error: authResult.error || 'Unauthorized',
        },
        { status: 401 }
      )
    }
    const userId = authResult.userId
    const body = await request.json()
    const validatedData = TextractParseSchema.parse(body)
    const processingMode = validatedData.processingMode || 'sync'
    const featureTypes = validatedData.featureTypes ?? []
    // Any requested feature type requires the AnalyzeDocument API family;
    // plain OCR uses DetectDocumentText.
    const useAnalyzeDocument = featureTypes.length > 0
    const host = `textract.${validatedData.region}.amazonaws.com`
    logger.info(`[${requestId}] Textract parse request`, {
      processingMode,
      filePath: validatedData.filePath,
      s3Uri: validatedData.s3Uri,
      featureTypes,
      userId,
    })
    if (processingMode === 'async') {
      // --- Async path: resolve the document's S3 location. ---
      if (!validatedData.s3Uri && !validatedData.filePath) {
        return NextResponse.json(
          {
            success: false,
            error: 'S3 URI or file path is required for async processing',
          },
          { status: 400 }
        )
      }
      let s3Bucket: string
      let s3Key: string
      if (validatedData.s3Uri) {
        const parsed = parseS3Uri(validatedData.s3Uri)
        s3Bucket = parsed.bucket
        s3Key = parsed.key
      } else if (validatedData.filePath?.includes('/api/files/serve/')) {
        // Internal upload: verify the caller may read it, then resolve the
        // backing S3 object.
        const storageKey = extractStorageKey(validatedData.filePath)
        const context = inferContextFromKey(storageKey)
        const hasAccess = await verifyFileAccess(storageKey, userId, undefined, context, false)
        if (!hasAccess) {
          return NextResponse.json({ success: false, error: 'File not found' }, { status: 404 })
        }
        const s3Info = StorageService.getS3InfoForKey(storageKey, context)
        s3Bucket = s3Info.bucket
        s3Key = s3Info.key
      } else {
        return NextResponse.json(
          {
            success: false,
            error: 'Async mode requires an S3 URI (s3://bucket/key) or an uploaded file',
          },
          { status: 400 }
        )
      }
      logger.info(`[${requestId}] Starting async Textract job`, { s3Bucket, s3Key })
      const startTarget = useAnalyzeDocument
        ? 'Textract.StartDocumentAnalysis'
        : 'Textract.StartDocumentTextDetection'
      const startBody: Record<string, unknown> = {
        DocumentLocation: {
          S3Object: {
            Bucket: s3Bucket,
            Name: s3Key,
          },
        },
      }
      if (useAnalyzeDocument) {
        startBody.FeatureTypes = featureTypes
        // Queries are only valid when the QUERIES feature is requested.
        if (
          validatedData.queries &&
          validatedData.queries.length > 0 &&
          featureTypes.includes('QUERIES')
        ) {
          startBody.QueriesConfig = {
            Queries: validatedData.queries.map((q) => ({
              Text: q.Text,
              Alias: q.Alias,
              Pages: q.Pages,
            })),
          }
        }
      }
      const startResult = await callTextractAsync(
        host,
        startTarget,
        startBody,
        validatedData.accessKeyId,
        validatedData.secretAccessKey,
        validatedData.region
      )
      const jobId = startResult.JobId as string
      if (!jobId) {
        throw new Error('Failed to start Textract job: No JobId returned')
      }
      logger.info(`[${requestId}] Async job started`, { jobId })
      const textractData = await pollForJobCompletion(
        host,
        jobId,
        validatedData.accessKeyId,
        validatedData.secretAccessKey,
        validatedData.region,
        useAnalyzeDocument,
        requestId
      )
      logger.info(`[${requestId}] Textract async parse successful`, {
        pageCount: (textractData.DocumentMetadata as { Pages?: number })?.Pages ?? 0,
        blockCount: (textractData.Blocks as unknown[])?.length ?? 0,
      })
      return NextResponse.json({
        success: true,
        output: {
          blocks: textractData.Blocks ?? [],
          documentMetadata: {
            pages: (textractData.DocumentMetadata as { Pages?: number })?.Pages ?? 0,
          },
          // The model-version field name depends on which API family ran.
          modelVersion: (textractData.AnalyzeDocumentModelVersion ??
            textractData.DetectDocumentTextModelVersion) as string | undefined,
        },
      })
    }
    // --- Sync path: fetch the document bytes and send them inline. ---
    if (!validatedData.filePath) {
      return NextResponse.json(
        {
          success: false,
          error: 'File path is required for sync processing',
        },
        { status: 400 }
      )
    }
    let fileUrl = validatedData.filePath
    if (validatedData.filePath?.includes('/api/files/serve/')) {
      // Internal upload: swap the serve path for a short-lived presigned URL.
      try {
        const storageKey = extractStorageKey(validatedData.filePath)
        const context = inferContextFromKey(storageKey)
        const hasAccess = await verifyFileAccess(storageKey, userId, undefined, context, false)
        if (!hasAccess) {
          logger.warn(`[${requestId}] Unauthorized presigned URL generation attempt`, {
            userId,
            key: storageKey,
            context,
          })
          return NextResponse.json(
            {
              success: false,
              error: 'File not found',
            },
            { status: 404 }
          )
        }
        fileUrl = await StorageService.generatePresignedDownloadUrl(storageKey, context, 5 * 60)
        logger.info(`[${requestId}] Generated presigned URL for ${context} file`)
      } catch (error) {
        logger.error(`[${requestId}] Failed to generate presigned URL:`, error)
        return NextResponse.json(
          {
            success: false,
            error: 'Failed to generate file access URL',
          },
          { status: 500 }
        )
      }
    } else if (validatedData.filePath?.startsWith('/')) {
      // App-relative path: resolve against this deployment's base URL.
      const baseUrl = getBaseUrl()
      fileUrl = `${baseUrl}${validatedData.filePath}`
    } else {
      // External URL: block private/internal destinations (SSRF protection).
      const urlValidation = validateExternalUrl(fileUrl, 'Document URL')
      if (!urlValidation.isValid) {
        logger.warn(`[${requestId}] SSRF attempt blocked`, {
          userId,
          url: fileUrl.substring(0, 100),
          error: urlValidation.error,
        })
        return NextResponse.json(
          {
            success: false,
            error: urlValidation.error,
          },
          { status: 400 }
        )
      }
    }
    const { bytes } = await fetchDocumentBytes(fileUrl)
    const uri = '/'
    let textractBody: Record<string, unknown>
    let amzTarget: string
    if (useAnalyzeDocument) {
      amzTarget = 'Textract.AnalyzeDocument'
      textractBody = {
        Document: {
          Bytes: bytes,
        },
        FeatureTypes: featureTypes,
      }
      // Queries are only valid when the QUERIES feature is requested.
      if (
        validatedData.queries &&
        validatedData.queries.length > 0 &&
        featureTypes.includes('QUERIES')
      ) {
        textractBody.QueriesConfig = {
          Queries: validatedData.queries.map((q) => ({
            Text: q.Text,
            Alias: q.Alias,
            Pages: q.Pages,
          })),
        }
      }
    } else {
      amzTarget = 'Textract.DetectDocumentText'
      textractBody = {
        Document: {
          Bytes: bytes,
        },
      }
    }
    const bodyString = JSON.stringify(textractBody)
    const headers = signAwsRequest(
      'POST',
      host,
      uri,
      bodyString,
      validatedData.accessKeyId,
      validatedData.secretAccessKey,
      validatedData.region,
      'textract',
      amzTarget
    )
    const textractResponse = await fetch(`https://${host}${uri}`, {
      method: 'POST',
      headers,
      body: bodyString,
    })
    if (!textractResponse.ok) {
      const errorText = await textractResponse.text()
      logger.error(`[${requestId}] Textract API error:`, errorText)
      // Prefer the AWS-provided error message when the body is JSON.
      let errorMessage = `Textract API error: ${textractResponse.statusText}`
      try {
        const errorJson = JSON.parse(errorText)
        if (errorJson.Message) {
          errorMessage = errorJson.Message
        } else if (errorJson.__type) {
          errorMessage = `${errorJson.__type}: ${errorJson.message || errorText}`
        }
      } catch {
        // Use default error message
      }
      return NextResponse.json(
        {
          success: false,
          error: errorMessage,
        },
        { status: textractResponse.status }
      )
    }
    const textractData = await textractResponse.json()
    logger.info(`[${requestId}] Textract parse successful`, {
      pageCount: textractData.DocumentMetadata?.Pages ?? 0,
      blockCount: textractData.Blocks?.length ?? 0,
    })
    return NextResponse.json({
      success: true,
      output: {
        blocks: textractData.Blocks ?? [],
        documentMetadata: {
          pages: textractData.DocumentMetadata?.Pages ?? 0,
        },
        modelVersion:
          textractData.AnalyzeDocumentModelVersion ??
          textractData.DetectDocumentTextModelVersion ??
          undefined,
      },
    })
  } catch (error) {
    // Validation failures surface as 400 with field-level details.
    if (error instanceof z.ZodError) {
      logger.warn(`[${requestId}] Invalid request data`, { errors: error.errors })
      return NextResponse.json(
        {
          success: false,
          error: 'Invalid request data',
          details: error.errors,
        },
        { status: 400 }
      )
    }
    logger.error(`[${requestId}] Error in Textract parse:`, error)
    return NextResponse.json(
      {
        success: false,
        error: error instanceof Error ? error.message : 'Internal server error',
      },
      { status: 500 }
    )
  }
}

View File

@@ -0,0 +1,251 @@
import { TextractIcon } from '@/components/icons'
import { AuthMode, type BlockConfig, type SubBlockType } from '@/blocks/types'
import type { TextractParserOutput } from '@/tools/textract/types'
/**
 * Workflow block definition for AWS Textract document parsing.
 *
 * Two "Document Type" choices map to the tool's processing modes:
 * - Single Page -> sync (inline document via URL or upload)
 * - Multi-Page  -> async (document in S3, via S3 URI or upload)
 *
 * The feature switches (tables/forms/signatures/layout) are translated into
 * Textract FeatureTypes by the params mapper in `tools.config`.
 */
export const TextractBlock: BlockConfig<TextractParserOutput> = {
  type: 'textract',
  name: 'AWS Textract',
  description: 'Extract text, tables, and forms from documents',
  authMode: AuthMode.ApiKey,
  longDescription: `Integrate AWS Textract into your workflow to extract text, tables, forms, and key-value pairs from documents. Sync mode supports JPEG, PNG, and single-page PDF. Async mode supports multi-page PDF and TIFF via S3.`,
  docsLink: 'https://docs.sim.ai/tools/textract',
  category: 'tools',
  bgColor: 'linear-gradient(135deg, #055F4E 0%, #56C0A7 100%)',
  icon: TextractIcon,
  subBlocks: [
    // Sync vs async selector; drives the visibility of the inputs below.
    {
      id: 'processingMode',
      title: 'Document Type',
      type: 'dropdown' as SubBlockType,
      options: [
        { id: 'sync', label: 'Single Page' },
        { id: 'async', label: 'Multi-Page' },
      ],
    },
    // Input-method selector shown only in sync mode (processingMode !== async).
    {
      id: 'inputMethod',
      title: 'Select Input Method',
      type: 'dropdown' as SubBlockType,
      options: [
        { id: 'url', label: 'Document URL' },
        { id: 'upload', label: 'Upload Document' },
      ],
      condition: {
        field: 'processingMode',
        value: 'async',
        not: true,
      },
    },
    // Input-method selector shown only in async mode.
    {
      id: 'asyncInputMethod',
      title: 'Select Input Method',
      type: 'dropdown' as SubBlockType,
      options: [
        { id: 's3', label: 'S3 URI' },
        { id: 'upload', label: 'Upload Document' },
      ],
      condition: {
        field: 'processingMode',
        value: 'async',
      },
    },
    // Sync + URL: direct document URL.
    {
      id: 'filePath',
      title: 'Document URL',
      type: 'short-input' as SubBlockType,
      placeholder: 'Enter full URL to a document (JPEG, PNG, or single-page PDF)',
      condition: {
        field: 'inputMethod',
        value: 'url',
        and: {
          field: 'processingMode',
          value: 'async',
          not: true,
        },
      },
    },
    // Async + S3: the document's S3 location.
    {
      id: 's3Uri',
      title: 'S3 URI',
      type: 'short-input' as SubBlockType,
      placeholder: 's3://bucket-name/path/to/document.pdf',
      condition: {
        field: 'asyncInputMethod',
        value: 's3',
        and: {
          field: 'processingMode',
          value: 'async',
        },
      },
    },
    // Sync + upload (max 10 MB).
    // NOTE(review): acceptedTypes includes image/tiff although the long
    // description says sync supports JPEG/PNG/single-page PDF — confirm.
    {
      id: 'fileUpload',
      title: 'Upload Document',
      type: 'file-upload' as SubBlockType,
      acceptedTypes: 'application/pdf,image/jpeg,image/png,image/tiff',
      condition: {
        field: 'inputMethod',
        value: 'upload',
        and: {
          field: 'processingMode',
          value: 'async',
          not: true,
        },
      },
      maxSize: 10,
    },
    // Async + upload (max 50 MB).
    {
      id: 'asyncFileUpload',
      title: 'Upload Document',
      type: 'file-upload' as SubBlockType,
      acceptedTypes: 'application/pdf,image/jpeg,image/png,image/tiff',
      condition: {
        field: 'asyncInputMethod',
        value: 'upload',
        and: {
          field: 'processingMode',
          value: 'async',
        },
      },
      maxSize: 50,
    },
    // AWS credentials and region (always required).
    {
      id: 'region',
      title: 'AWS Region',
      type: 'short-input' as SubBlockType,
      placeholder: 'e.g., us-east-1',
      required: true,
    },
    {
      id: 'accessKeyId',
      title: 'AWS Access Key ID',
      type: 'short-input' as SubBlockType,
      placeholder: 'Enter your AWS Access Key ID',
      password: true,
      required: true,
    },
    {
      id: 'secretAccessKey',
      title: 'AWS Secret Access Key',
      type: 'short-input' as SubBlockType,
      placeholder: 'Enter your AWS Secret Access Key',
      password: true,
      required: true,
    },
    // Feature switches, mapped to Textract FeatureTypes in params below.
    {
      id: 'extractTables',
      title: 'Extract Tables',
      type: 'switch' as SubBlockType,
    },
    {
      id: 'extractForms',
      title: 'Extract Forms (Key-Value Pairs)',
      type: 'switch' as SubBlockType,
    },
    {
      id: 'detectSignatures',
      title: 'Detect Signatures',
      type: 'switch' as SubBlockType,
    },
    {
      id: 'analyzeLayout',
      title: 'Analyze Document Layout',
      type: 'switch' as SubBlockType,
    },
  ],
  tools: {
    access: ['textract_parser'],
    config: {
      tool: () => 'textract_parser',
      // Validates the UI inputs and maps them onto the textract_parser
      // tool parameters; throws user-facing errors for missing fields.
      params: (params) => {
        if (!params.accessKeyId || params.accessKeyId.trim() === '') {
          throw new Error('AWS Access Key ID is required')
        }
        if (!params.secretAccessKey || params.secretAccessKey.trim() === '') {
          throw new Error('AWS Secret Access Key is required')
        }
        if (!params.region || params.region.trim() === '') {
          throw new Error('AWS Region is required')
        }
        const processingMode = params.processingMode || 'sync'
        const parameters: Record<string, unknown> = {
          accessKeyId: params.accessKeyId.trim(),
          secretAccessKey: params.secretAccessKey.trim(),
          region: params.region.trim(),
          processingMode,
        }
        if (processingMode === 'async') {
          const asyncInputMethod = params.asyncInputMethod || 's3'
          if (asyncInputMethod === 's3') {
            if (!params.s3Uri || params.s3Uri.trim() === '') {
              throw new Error('S3 URI is required for async processing')
            }
            parameters.s3Uri = params.s3Uri.trim()
          } else if (asyncInputMethod === 'upload') {
            if (!params.asyncFileUpload) {
              throw new Error('Please upload a document')
            }
            // Both upload variants are forwarded under the same key.
            parameters.fileUpload = params.asyncFileUpload
          }
        } else {
          const inputMethod = params.inputMethod || 'url'
          if (inputMethod === 'url') {
            if (!params.filePath || params.filePath.trim() === '') {
              throw new Error('Document URL is required')
            }
            parameters.filePath = params.filePath.trim()
          } else if (inputMethod === 'upload') {
            if (!params.fileUpload) {
              throw new Error('Please upload a document')
            }
            parameters.fileUpload = params.fileUpload
          }
        }
        // Translate the feature switches into Textract FeatureTypes.
        const featureTypes: string[] = []
        if (params.extractTables) featureTypes.push('TABLES')
        if (params.extractForms) featureTypes.push('FORMS')
        if (params.detectSignatures) featureTypes.push('SIGNATURES')
        if (params.analyzeLayout) featureTypes.push('LAYOUT')
        if (featureTypes.length > 0) {
          parameters.featureTypes = featureTypes
        }
        return parameters
      },
    },
  },
  inputs: {
    processingMode: { type: 'string', description: 'Document type: single-page or multi-page' },
    inputMethod: { type: 'string', description: 'Input method selection for sync mode' },
    asyncInputMethod: { type: 'string', description: 'Input method selection for async mode' },
    filePath: { type: 'string', description: 'Document URL' },
    s3Uri: { type: 'string', description: 'S3 URI for async processing (s3://bucket/key)' },
    fileUpload: { type: 'json', description: 'Uploaded document file for sync mode' },
    asyncFileUpload: { type: 'json', description: 'Uploaded document file for async mode' },
    extractTables: { type: 'boolean', description: 'Extract tables from document' },
    extractForms: { type: 'boolean', description: 'Extract form key-value pairs' },
    detectSignatures: { type: 'boolean', description: 'Detect signatures' },
    analyzeLayout: { type: 'boolean', description: 'Analyze document layout' },
    region: { type: 'string', description: 'AWS region' },
    accessKeyId: { type: 'string', description: 'AWS Access Key ID' },
    secretAccessKey: { type: 'string', description: 'AWS Secret Access Key' },
  },
  outputs: {
    blocks: {
      type: 'json',
      description: 'Array of detected blocks (PAGE, LINE, WORD, TABLE, CELL, KEY_VALUE_SET, etc.)',
    },
    documentMetadata: {
      type: 'json',
      description: 'Document metadata containing pages count',
    },
    modelVersion: {
      type: 'string',
      description: 'Version of the Textract model used for processing',
    },
  },
}

View File

@@ -123,6 +123,7 @@ import { SttBlock } from '@/blocks/blocks/stt'
import { SupabaseBlock } from '@/blocks/blocks/supabase'
import { TavilyBlock } from '@/blocks/blocks/tavily'
import { TelegramBlock } from '@/blocks/blocks/telegram'
import { TextractBlock } from '@/blocks/blocks/textract'
import { ThinkingBlock } from '@/blocks/blocks/thinking'
import { TinybirdBlock } from '@/blocks/blocks/tinybird'
import { TranslateBlock } from '@/blocks/blocks/translate'
@@ -285,6 +286,7 @@ export const registry: Record<string, BlockConfig> = {
stt: SttBlock,
supabase: SupabaseBlock,
tavily: TavilyBlock,
textract: TextractBlock,
telegram: TelegramBlock,
thinking: ThinkingBlock,
tinybird: TinybirdBlock,

View File

@@ -4093,6 +4093,23 @@ export function SQSIcon(props: SVGProps<SVGSVGElement>) {
)
}
/**
 * AWS Textract service icon rendered as an inline SVG.
 * Forwards all received SVG props (className, width, etc.) to the root element.
 */
export function TextractIcon(props: SVGProps<SVGSVGElement>) {
  return (
    <svg
      {...props}
      viewBox='10 14 60 52'
      version='1.1'
      xmlns='http://www.w3.org/2000/svg'
      xmlnsXlink='http://www.w3.org/1999/xlink'
    >
      {/* Single compound path (document + scan glyph); inherits color via currentColor */}
      <path
        d='M22.0624102,50 C24.3763895,53.603 28.4103535,56 33.0003125,56 C40.1672485,56 45.9991964,50.168 45.9991964,43 C45.9991964,35.832 40.1672485,30 33.0003125,30 C27.6033607,30 22.9664021,33.307 21.0024196,38 L23.2143999,38 C25.0393836,34.444 28.7363506,32 33.0003125,32 C39.0652583,32 43.9992143,36.935 43.9992143,43 C43.9992143,49.065 39.0652583,54 33.0003125,54 C29.5913429,54 26.5413702,52.441 24.5213882,50 L22.0624102,50 Z M37.0002768,45 L37.0002768,43 L41.9992321,43 C41.9992321,38.038 37.9622682,34 33.0003125,34 C28.0373568,34 23.9993929,38.038 23.9993929,43 L28.9993482,43 L28.9993482,45 L24.2313908,45 C25.1443826,49.002 28.7253507,52 33.0003125,52 C35.1362934,52 37.0992759,51.249 38.6442621,50 L34.0003036,50 L34.0003036,48 L40.4782457,48 C41.0812403,47.102 41.5202364,46.087 41.7682342,45 L37.0002768,45 Z M21.0024196,48 L23.2143999,48 C22.4434068,46.498 22.0004107,44.801 22.0004107,43 C22.0004107,41.959 22.1554093,40.955 22.4264069,40 L20.3634253,40 C20.1344274,40.965 19.9994286,41.966 19.9994286,43 C19.9994286,44.771 20.3584254,46.46 21.0024196,48 L21.0024196,48 Z M19.7434309,50 L17.0004554,50 L17.0004554,48 L18.8744386,48 C18.5344417,47.04 18.2894438,46.038 18.1494451,45 L15.4144695,45 L16.707458,46.293 L15.2924706,47.707 L12.2924974,44.707 C11.9025009,44.316 11.9025009,43.684 12.2924974,43.293 L15.2924706,40.293 L16.707458,41.707 L15.4144695,43 L18.0004464,43 C18.0004464,41.973 18.1044455,40.97 18.3024437,40 L17.0004554,40 L17.0004554,38 L18.8744386,38 C20.9404202,32.184 26.4833707,28 33.0003125,28 C37.427273,28 41.4002375,29.939 44.148213,33 L59.0000804,33 L59.0000804,35 L45.6661994,35 C47.1351863,37.318 47.9991786,40.058 47.9991786,43 L59.0000804,43 L59.0000804,45 L47.8501799,45 C46.8681887,52.327 40.5912447,58 33.0003125,58 C27.2563638,58 22.2624084,54.752 19.7434309,50 L19.7434309,50 Z M37.0002768,39 C37.0002768,38.448 36.5522808,38 36.0002857,38 L29.9993482,38 C29.4473442,38 28.9993482,38.448 28.9993482,39 L28.9993482,41 L31.0003304,41 L31.0003304,40 L32.0003214,40 L32.0003214,43 L31.0003304,43 L31.0003304,45 L35.0002946,45 L35.0002946,43 L34.0003036,43 L34.0003036,40 L35.0002946,40 L35.0002946,41 L37.0002768,41 L37.0002768,39 Z M49.0001696,40 L59.0000804,40 L59.0000804,38 L49.0001696,38 L49.0001696,40 Z M49.0001696,50 L59.0000804,50 L59.0000804,48 L49.0001696,48 L49.0001696,50 Z M57.0000982,27 L60.5850662,27 L57.0000982,23.414 L57.0000982,27 Z M63.7070383,27.293 C63.8940367,27.48 64.0000357,27.735 64.0000357,28 L64.0000357,63 C64.0000357,63.552 63.5520397,64 63.0000446,64 L32.0003304,64 C31.4473264,64 31.0003304,63.552 31.0003304,63 L31.0003304,59 L33.0003125,59 L33.0003125,62 L62.0000536,62 L62.0000536,29 L56.0001071,29 C55.4471121,29 55.0001161,28.552 55.0001161,28 L55.0001161,22 L33.0003125,22 L33.0003125,27 L31.0003304,27 L31.0003304,21 C31.0003304,20.448 31.4473264,20 32.0003304,20 L56.0001071,20 C56.2651048,20 56.5191025,20.105 56.7071008,20.293 L63.7070383,27.293 Z M68,24.166 L68,61 C68,61.552 67.552004,62 67.0000089,62 L65.0000268,62 L65.0000268,60 L66.0000179,60 L66.0000179,24.612 L58.6170838,18 L36.0002857,18 L36.0002857,19 L34.0003036,19 L34.0003036,17 C34.0003036,16.448 34.4472996,16 35.0003036,16 L59.0000804,16 C59.2460782,16 59.483076,16.091 59.6660744,16.255 L67.666003,23.42 C67.8780011,23.61 68,23.881 68,24.166 L68,24.166 Z'
        fill='currentColor'
      />
    </svg>
  )
}
export function McpIcon(props: SVGProps<SVGSVGElement>) {
return (
<svg

View File

@@ -3,18 +3,12 @@ import { createLogger } from '@sim/logger'
const logger = createLogger('InputValidation')
/**
* Result type for validation functions
*/
export interface ValidationResult {
isValid: boolean
error?: string
sanitized?: string
}
/**
* Options for path segment validation
*/
export interface PathSegmentOptions {
/** Name of the parameter for error messages */
paramName?: string
@@ -65,7 +59,6 @@ export function validatePathSegment(
customPattern,
} = options
// Check for null/undefined
if (value === null || value === undefined || value === '') {
return {
isValid: false,
@@ -73,7 +66,6 @@ export function validatePathSegment(
}
}
// Check length
if (value.length > maxLength) {
logger.warn('Path segment exceeds maximum length', {
paramName,
@@ -86,7 +78,6 @@ export function validatePathSegment(
}
}
// Check for null bytes (potential for bypass attacks)
if (value.includes('\0') || value.includes('%00')) {
logger.warn('Path segment contains null bytes', { paramName })
return {
@@ -95,7 +86,6 @@ export function validatePathSegment(
}
}
// Check for path traversal patterns
const pathTraversalPatterns = [
'..',
'./',
@@ -124,7 +114,6 @@ export function validatePathSegment(
}
}
// Check for directory separators
if (value.includes('/') || value.includes('\\')) {
logger.warn('Path segment contains directory separators', { paramName })
return {
@@ -133,7 +122,6 @@ export function validatePathSegment(
}
}
// Use custom pattern if provided
if (customPattern) {
if (!customPattern.test(value)) {
logger.warn('Path segment failed custom pattern validation', {
@@ -148,7 +136,6 @@ export function validatePathSegment(
return { isValid: true, sanitized: value }
}
// Build allowed character pattern
let pattern = '^[a-zA-Z0-9'
if (allowHyphens) pattern += '\\-'
if (allowUnderscores) pattern += '_'
@@ -947,6 +934,130 @@ export function validateAirtableId(
return { isValid: true, sanitized: value }
}
/**
 * Validates an AWS region identifier
 *
 * AWS regions follow the pattern: {area}[-{partition/sub-area}...]-{number}
 * Examples: us-east-1, eu-west-2, ap-southeast-1, sa-east-1, us-gov-west-1
 *
 * @param value - The AWS region to validate
 * @param paramName - Name of the parameter for error messages
 * @returns ValidationResult
 *
 * @example
 * ```typescript
 * const result = validateAwsRegion(region, 'region')
 * if (!result.isValid) {
 *   return NextResponse.json({ error: result.error }, { status: 400 })
 * }
 * ```
 */
export function validateAwsRegion(
  value: string | null | undefined,
  paramName = 'region'
): ValidationResult {
  if (value === null || value === undefined || value === '') {
    return {
      isValid: false,
      error: `${paramName} is required`,
    }
  }
  // Two-letter area code, one or more hyphenated lowercase name segments,
  // and a 1-2 digit number. The repeated middle group also accepts
  // partitioned regions such as us-gov-west-1 and us-iso-east-1, which the
  // single-segment form `[a-z]{2}-[a-z]+-\d` would wrongly reject.
  const awsRegionPattern = /^[a-z]{2}(-[a-z]+)+-\d{1,2}$/
  if (!awsRegionPattern.test(value)) {
    logger.warn('Invalid AWS region format', {
      paramName,
      // Truncate to avoid logging arbitrarily long attacker-controlled input.
      value: value.substring(0, 50),
    })
    return {
      isValid: false,
      error: `${paramName} must be a valid AWS region (e.g., us-east-1, eu-west-2)`,
    }
  }
  return { isValid: true, sanitized: value }
}
/**
 * Validates an S3 bucket name according to AWS naming rules
 *
 * S3 bucket names must:
 * - Be 3-63 characters long
 * - Start and end with a letter or number
 * - Contain only lowercase letters, numbers, hyphens, and periods
 * - Not contain consecutive periods
 * - Not be formatted as an IP address
 *
 * @param value - The S3 bucket name to validate
 * @param paramName - Name of the parameter for error messages
 * @returns ValidationResult
 *
 * @example
 * ```typescript
 * const result = validateS3BucketName(bucket, 'bucket')
 * if (!result.isValid) {
 *   return NextResponse.json({ error: result.error }, { status: 400 })
 * }
 * ```
 */
export function validateS3BucketName(
  value: string | null | undefined,
  paramName = 'bucket'
): ValidationResult {
  if (value === null || value === undefined || value === '') {
    return {
      isValid: false,
      error: `${paramName} is required`,
    }
  }
  if (value.length < 3 || value.length > 63) {
    logger.warn('S3 bucket name length invalid', {
      paramName,
      length: value.length,
    })
    return {
      isValid: false,
      error: `${paramName} must be between 3 and 63 characters`,
    }
  }
  // Alphanumeric first and last characters with letters, digits, periods,
  // and hyphens between. The length check above guarantees at least 3
  // characters, so no separate single-character alternative is needed.
  const bucketNamePattern = /^[a-z0-9][a-z0-9.-]*[a-z0-9]$/
  if (!bucketNamePattern.test(value)) {
    logger.warn('Invalid S3 bucket name format', {
      paramName,
      value: value.substring(0, 63),
    })
    return {
      isValid: false,
      error: `${paramName} must start and end with a letter or number, and contain only lowercase letters, numbers, hyphens, and periods`,
    }
  }
  if (value.includes('..')) {
    logger.warn('S3 bucket name contains consecutive periods', { paramName })
    return {
      isValid: false,
      error: `${paramName} cannot contain consecutive periods`,
    }
  }
  // Names like 192.168.1.1 are valid per the character pattern but are
  // explicitly forbidden by AWS.
  const ipPattern = /^(\d{1,3}\.){3}\d{1,3}$/
  if (ipPattern.test(value)) {
    logger.warn('S3 bucket name formatted as IP address', { paramName })
    return {
      isValid: false,
      error: `${paramName} cannot be formatted as an IP address`,
    }
  }
  return { isValid: true, sanitized: value }
}
/**
* Validates a Google Calendar ID
*

View File

@@ -455,3 +455,27 @@ export async function generatePresignedDownloadUrl(
/**
 * Returns true when a cloud storage backend is enabled —
 * i.e. either the blob-storage or S3-storage flag is set.
 */
export function hasCloudStorage(): boolean {
  return USE_BLOB_STORAGE || USE_S3_STORAGE
}
/**
 * Resolve the S3 bucket/key pair for a storage key.
 *
 * Intended for services that talk to S3 directly (e.g. AWS Textract async
 * jobs) rather than going through the storage abstraction.
 *
 * @param key - The storage key to resolve
 * @param context - Storage context whose bucket configuration applies
 * @returns The configured bucket name together with the unchanged key
 * @throws Error when S3 storage is disabled or the context has no bucket
 */
export function getS3InfoForKey(
  key: string,
  context: StorageContext
): { bucket: string; key: string } {
  if (!USE_S3_STORAGE) {
    throw new Error('S3 storage is not configured. Cannot retrieve S3 info for key.')
  }
  const { bucket } = getStorageConfig(context)
  if (!bucket) {
    throw new Error(`S3 bucket not configured for context: ${context}`)
  }
  return { bucket, key }
}

View File

@@ -1500,6 +1500,7 @@ import {
telegramSendPhotoTool,
telegramSendVideoTool,
} from '@/tools/telegram'
import { textractParserTool } from '@/tools/textract'
import { thinkingTool } from '@/tools/thinking'
import { tinybirdEventsTool, tinybirdQueryTool } from '@/tools/tinybird'
import {
@@ -2456,6 +2457,7 @@ export const tools: Record<string, ToolConfig> = {
apollo_email_accounts: apolloEmailAccountsTool,
mistral_parser: mistralParserTool,
reducto_parser: reductoParserTool,
textract_parser: textractParserTool,
thinking_tool: thinkingTool,
tinybird_events: tinybirdEventsTool,
tinybird_query: tinybirdQueryTool,

View File

@@ -0,0 +1,2 @@
/** Public entry point for the AWS Textract tool: the parser tool config plus its types. */
export { textractParserTool } from '@/tools/textract/parser'
export * from '@/tools/textract/types'

View File

@@ -0,0 +1,413 @@
import { createLogger } from '@sim/logger'
import { getBaseUrl } from '@/lib/core/utils/urls'
import type { TextractParserInput, TextractParserOutput } from '@/tools/textract/types'
import type { ToolConfig } from '@/tools/types'
const logger = createLogger('TextractParserTool')
/**
 * Tool configuration for AWS Textract document parsing.
 *
 * Validates credentials and document input client-side, posts to the
 * internal /api/tools/textract/parse route (which calls Textract), and
 * normalizes the PascalCase AWS response into camelCase output fields.
 */
export const textractParserTool: ToolConfig<TextractParserInput, TextractParserOutput> = {
  id: 'textract_parser',
  name: 'AWS Textract Parser',
  description: 'Parse documents using AWS Textract OCR and document analysis',
  version: '1.0.0',
  params: {
    accessKeyId: {
      type: 'string',
      required: true,
      visibility: 'user-only',
      description: 'AWS Access Key ID',
    },
    secretAccessKey: {
      type: 'string',
      required: true,
      visibility: 'user-only',
      description: 'AWS Secret Access Key',
    },
    region: {
      type: 'string',
      required: true,
      visibility: 'user-only',
      description: 'AWS region for Textract service (e.g., us-east-1)',
    },
    processingMode: {
      type: 'string',
      required: false,
      visibility: 'user-only',
      description: 'Document type: single-page or multi-page. Defaults to single-page.',
    },
    filePath: {
      type: 'string',
      required: false,
      visibility: 'user-only',
      description:
        'URL to a document to be processed (JPEG, PNG, PDF, or TIFF). Required for sync mode.',
    },
    s3Uri: {
      type: 'string',
      required: false,
      visibility: 'user-only',
      description:
        'S3 URI for async processing (s3://bucket/key). Required for async mode with S3 input.',
    },
    fileUpload: {
      type: 'object',
      required: false,
      visibility: 'hidden',
      description: 'File upload data from file-upload component',
    },
    featureTypes: {
      type: 'array',
      required: false,
      visibility: 'user-or-llm',
      description:
        'Feature types to detect: TABLES, FORMS, QUERIES, SIGNATURES, LAYOUT. If not specified, only text detection is performed.',
      items: {
        type: 'string',
        description: 'Feature type',
      },
    },
    queries: {
      type: 'array',
      required: false,
      visibility: 'user-or-llm',
      description:
        'Custom queries to extract specific information. Only used when featureTypes includes QUERIES.',
      items: {
        type: 'object',
        description: 'Query configuration',
        properties: {
          Text: { type: 'string', description: 'The query text' },
          Alias: { type: 'string', description: 'Optional alias for the result' },
        },
      },
    },
  },
  request: {
    url: '/api/tools/textract/parse',
    method: 'POST',
    headers: () => {
      return {
        'Content-Type': 'application/json',
        Accept: 'application/json',
      }
    },
    // Validates tool params and assembles the JSON body for the internal
    // parse route. All failures throw descriptive errors before any HTTP
    // call is made.
    body: (params) => {
      if (!params || typeof params !== 'object') {
        throw new Error('Invalid parameters: Parameters must be provided as an object')
      }
      // Credentials and region are required for every request.
      if (
        !params.accessKeyId ||
        typeof params.accessKeyId !== 'string' ||
        params.accessKeyId.trim() === ''
      ) {
        throw new Error('Missing or invalid AWS Access Key ID')
      }
      if (
        !params.secretAccessKey ||
        typeof params.secretAccessKey !== 'string' ||
        params.secretAccessKey.trim() === ''
      ) {
        throw new Error('Missing or invalid AWS Secret Access Key')
      }
      if (!params.region || typeof params.region !== 'string' || params.region.trim() === '') {
        throw new Error('Missing or invalid AWS region')
      }
      const processingMode = params.processingMode || 'sync'
      const requestBody: Record<string, unknown> = {
        accessKeyId: params.accessKeyId.trim(),
        secretAccessKey: params.secretAccessKey.trim(),
        region: params.region.trim(),
        processingMode,
      }
      if (processingMode === 'async') {
        // Async mode: prefer an explicit S3 URI; otherwise fall back to an
        // uploaded file that is already stored behind the serve endpoint.
        if (params.s3Uri && typeof params.s3Uri === 'string' && params.s3Uri.trim() !== '') {
          const s3UriTrimmed = params.s3Uri.trim()
          if (!s3UriTrimmed.match(/^s3:\/\/[^/]+\/.+$/)) {
            throw new Error('Invalid S3 URI format. Expected: s3://bucket-name/path/to/object')
          }
          requestBody.s3Uri = s3UriTrimmed
        } else if (params.fileUpload) {
          if (
            typeof params.fileUpload === 'object' &&
            params.fileUpload !== null &&
            (params.fileUpload.url || params.fileUpload.path)
          ) {
            // Note: `path` is preferred over `url` here (reversed in the sync
            // branch below) since async requires the S3-backed serve path.
            const uploadedFilePath = (params.fileUpload.path || params.fileUpload.url) as string
            if (uploadedFilePath.startsWith('/api/files/serve/')) {
              requestBody.filePath = uploadedFilePath
            } else {
              throw new Error('Async mode with upload requires files stored in S3')
            }
          } else {
            throw new Error('Invalid file upload: Upload data is missing or invalid')
          }
        } else {
          throw new Error('Async mode requires either an S3 URI or an uploaded file')
        }
      } else {
        // Sync mode: resolve an uploaded file into filePath when no explicit
        // URL was given, then validate the resulting URL.
        if (
          params.fileUpload &&
          (!params.filePath || params.filePath === 'null' || params.filePath === '')
        ) {
          if (
            typeof params.fileUpload === 'object' &&
            params.fileUpload !== null &&
            (params.fileUpload.url || params.fileUpload.path)
          ) {
            let uploadedFilePath = (params.fileUpload.url || params.fileUpload.path) as string
            // Relative serve paths are made absolute for URL validation.
            if (uploadedFilePath.startsWith('/')) {
              const baseUrl = getBaseUrl()
              if (!baseUrl) throw new Error('Failed to get base URL for file path conversion')
              uploadedFilePath = `${baseUrl}${uploadedFilePath}`
            }
            // NOTE(review): mutates the incoming params object; consider a
            // local variable instead — verify no caller relies on this.
            params.filePath = uploadedFilePath
            logger.info('Using uploaded file:', uploadedFilePath)
          } else {
            throw new Error('Invalid file upload: Upload data is missing or invalid')
          }
        }
        if (
          !params.filePath ||
          typeof params.filePath !== 'string' ||
          params.filePath.trim() === ''
        ) {
          throw new Error('Missing or invalid file path: Please provide a URL to a document')
        }
        let filePathToValidate = params.filePath.trim()
        if (filePathToValidate.startsWith('/')) {
          const baseUrl = getBaseUrl()
          if (!baseUrl) throw new Error('Failed to get base URL for file path conversion')
          filePathToValidate = `${baseUrl}${filePathToValidate}`
        }
        // Only HTTP(S) URLs are accepted for sync document fetching.
        let url
        try {
          url = new URL(filePathToValidate)
          if (!['http:', 'https:'].includes(url.protocol)) {
            throw new Error(
              `Invalid protocol: ${url.protocol}. URL must use HTTP or HTTPS protocol`
            )
          }
        } catch (error) {
          const errorMessage = error instanceof Error ? error.message : String(error)
          throw new Error(
            `Invalid URL format: ${errorMessage}. Please provide a valid HTTP or HTTPS URL to a document.`
          )
        }
        requestBody.filePath = url.toString()
        // Internally-served uploads are passed as the relative serve path so
        // the API route can read them directly, overriding the absolute URL.
        if (params.fileUpload?.path?.startsWith('/api/files/serve/')) {
          requestBody.filePath = params.fileUpload.path
        }
      }
      // Only forward recognized Textract feature types; drop unknown values.
      if (params.featureTypes && Array.isArray(params.featureTypes)) {
        const validFeatures = ['TABLES', 'FORMS', 'QUERIES', 'SIGNATURES', 'LAYOUT']
        const filteredFeatures = params.featureTypes.filter((f) =>
          validFeatures.includes(f as string)
        )
        if (filteredFeatures.length > 0) {
          requestBody.featureTypes = filteredFeatures
        }
      }
      // Sanitize queries and ensure QUERIES is present in featureTypes
      // whenever at least one valid query is supplied.
      if (params.queries && Array.isArray(params.queries) && params.queries.length > 0) {
        const validQueries = params.queries
          .filter((q) => q && typeof q === 'object' && typeof q.Text === 'string' && q.Text.trim())
          .map((q) => ({
            Text: q.Text.trim(),
            Alias: q.Alias?.trim() || undefined,
            Pages: q.Pages || undefined,
          }))
        if (validQueries.length > 0) {
          requestBody.queries = validQueries
          if (!requestBody.featureTypes) {
            requestBody.featureTypes = ['QUERIES']
          } else if (
            Array.isArray(requestBody.featureTypes) &&
            !requestBody.featureTypes.includes('QUERIES')
          ) {
            ;(requestBody.featureTypes as string[]).push('QUERIES')
          }
        }
      }
      return requestBody
    },
  },
  // Normalizes the API response: accepts both raw AWS PascalCase fields and
  // already-normalized camelCase fields, and surfaces API-reported errors as
  // thrown exceptions.
  transformResponse: async (response) => {
    try {
      let apiResult
      try {
        apiResult = await response.json()
      } catch (jsonError) {
        throw new Error(
          `Failed to parse Textract response: ${jsonError instanceof Error ? jsonError.message : String(jsonError)}`
        )
      }
      if (!apiResult || typeof apiResult !== 'object') {
        throw new Error('Invalid response format from Textract API')
      }
      if (!apiResult.success && apiResult.error) {
        throw new Error(apiResult.error)
      }
      // The route may wrap the Textract payload in an `output` envelope.
      const textractData = apiResult.output ?? apiResult
      return {
        success: true,
        output: {
          blocks: textractData.Blocks ?? textractData.blocks ?? [],
          documentMetadata: {
            pages:
              textractData.DocumentMetadata?.Pages ?? textractData.documentMetadata?.pages ?? 0,
          },
          // Model version differs depending on whether AnalyzeDocument or
          // DetectDocumentText was invoked; check both (and camelCase forms).
          modelVersion:
            textractData.AnalyzeDocumentModelVersion ??
            textractData.analyzeDocumentModelVersion ??
            textractData.DetectDocumentTextModelVersion ??
            textractData.detectDocumentTextModelVersion ??
            undefined,
        },
      }
    } catch (error) {
      logger.error('Error processing Textract result:', error)
      throw error
    }
  },
  outputs: {
    blocks: {
      type: 'array',
      description:
        'Array of Block objects containing detected text, tables, forms, and other elements',
      items: {
        type: 'object',
        properties: {
          BlockType: {
            type: 'string',
            description: 'Type of block (PAGE, LINE, WORD, TABLE, CELL, KEY_VALUE_SET, etc.)',
          },
          Id: { type: 'string', description: 'Unique identifier for the block' },
          Text: {
            type: 'string',
            description: 'The text content (for LINE and WORD blocks)',
            optional: true,
          },
          TextType: {
            type: 'string',
            description: 'Type of text (PRINTED or HANDWRITING)',
            optional: true,
          },
          Confidence: { type: 'number', description: 'Confidence score (0-100)', optional: true },
          Page: { type: 'number', description: 'Page number', optional: true },
          Geometry: {
            type: 'object',
            description: 'Location and bounding box information',
            optional: true,
            properties: {
              BoundingBox: {
                type: 'object',
                properties: {
                  Height: { type: 'number', description: 'Height as ratio of document height' },
                  Left: { type: 'number', description: 'Left position as ratio of document width' },
                  Top: { type: 'number', description: 'Top position as ratio of document height' },
                  Width: { type: 'number', description: 'Width as ratio of document width' },
                },
              },
              Polygon: {
                type: 'array',
                description: 'Polygon coordinates',
                items: {
                  type: 'object',
                  properties: {
                    X: { type: 'number', description: 'X coordinate' },
                    Y: { type: 'number', description: 'Y coordinate' },
                  },
                },
              },
            },
          },
          Relationships: {
            type: 'array',
            description: 'Relationships to other blocks',
            optional: true,
            items: {
              type: 'object',
              properties: {
                Type: {
                  type: 'string',
                  description: 'Relationship type (CHILD, VALUE, ANSWER, etc.)',
                },
                Ids: { type: 'array', description: 'IDs of related blocks' },
              },
            },
          },
          EntityTypes: {
            type: 'array',
            description: 'Entity types for KEY_VALUE_SET (KEY or VALUE)',
            optional: true,
          },
          SelectionStatus: {
            type: 'string',
            description: 'For checkboxes: SELECTED or NOT_SELECTED',
            optional: true,
          },
          RowIndex: { type: 'number', description: 'Row index for table cells', optional: true },
          ColumnIndex: {
            type: 'number',
            description: 'Column index for table cells',
            optional: true,
          },
          RowSpan: { type: 'number', description: 'Row span for merged cells', optional: true },
          ColumnSpan: {
            type: 'number',
            description: 'Column span for merged cells',
            optional: true,
          },
          Query: {
            type: 'object',
            description: 'Query information for QUERY blocks',
            optional: true,
            properties: {
              Text: { type: 'string', description: 'Query text' },
              Alias: { type: 'string', description: 'Query alias', optional: true },
              Pages: { type: 'array', description: 'Pages to search', optional: true },
            },
          },
        },
      },
    },
    documentMetadata: {
      type: 'object',
      description: 'Metadata about the analyzed document',
      properties: {
        pages: { type: 'number', description: 'Number of pages in the document' },
      },
    },
    modelVersion: {
      type: 'string',
      description: 'Version of the Textract model used for processing',
      optional: true,
    },
  },
}

View File

@@ -0,0 +1,114 @@
import type { ToolResponse } from '@/tools/types'
/** Processing strategy: 'sync' (inline HTTP document) or 'async' (S3-backed Textract job). */
export type TextractProcessingMode = 'sync' | 'async'
/** Input parameters accepted by the textract_parser tool. */
export interface TextractParserInput {
  accessKeyId: string
  secretAccessKey: string
  region: string
  // Defaults to 'sync' when omitted (see parser request builder).
  processingMode?: TextractProcessingMode
  // HTTP(S) URL to the document; used in sync mode.
  filePath?: string
  // s3://bucket/key URI; used in async mode.
  s3Uri?: string
  // File-upload component payload; either url or path may be set.
  fileUpload?: {
    url?: string
    path?: string
  }
  featureTypes?: TextractFeatureType[]
  queries?: TextractQuery[]
}
/** Analysis features Textract can run beyond plain text detection. */
export type TextractFeatureType = 'TABLES' | 'FORMS' | 'QUERIES' | 'SIGNATURES' | 'LAYOUT'
/** A custom query for the QUERIES feature (PascalCase per the AWS API). */
export interface TextractQuery {
  Text: string
  Alias?: string
  Pages?: string[]
}
/** Axis-aligned bounding box; values are ratios of document dimensions. */
export interface TextractBoundingBox {
  Height: number
  Left: number
  Top: number
  Width: number
}
/** A single vertex of a block's polygon outline. */
export interface TextractPolygonPoint {
  X: number
  Y: number
}
/** Positional information attached to a detected block. */
export interface TextractGeometry {
  BoundingBox: TextractBoundingBox
  Polygon: TextractPolygonPoint[]
  RotationAngle?: number
}
/** Link from one block to related blocks (e.g. CHILD, VALUE). */
export interface TextractRelationship {
  Type: string
  Ids: string[]
}
/** One detected element (page, line, word, table cell, key-value pair, ...). */
export interface TextractBlock {
  BlockType: string
  Id: string
  Text?: string
  TextType?: string
  Confidence?: number
  Geometry?: TextractGeometry
  Relationships?: TextractRelationship[]
  Page?: number
  EntityTypes?: string[]
  SelectionStatus?: string
  RowIndex?: number
  ColumnIndex?: number
  RowSpan?: number
  ColumnSpan?: number
  Query?: {
    Text: string
    Alias?: string
    Pages?: string[]
  }
}
/** AWS Textract DocumentMetadata - exact API format */
export interface TextractDocumentMetadataRaw {
  Pages: number
}
/** Normalized DocumentMetadata (camelCase) */
export interface TextractDocumentMetadata {
  pages: number
}
/** AWS Textract API Response - exact API format */
export interface TextractApiResponse {
  Blocks: TextractBlock[]
  DocumentMetadata: TextractDocumentMetadataRaw
  AnalyzeDocumentModelVersion?: string
  DetectDocumentTextModelVersion?: string
}
/** Tool-facing, camelCase-normalized Textract result. */
export interface TextractNormalizedOutput {
  blocks: TextractBlock[]
  documentMetadata: TextractDocumentMetadata
  modelVersion?: string
}
/** Async job status response from Textract */
export interface TextractAsyncJobResponse {
  JobStatus: 'IN_PROGRESS' | 'SUCCEEDED' | 'FAILED' | 'PARTIAL_SUCCESS'
  StatusMessage?: string
  Blocks?: TextractBlock[]
  DocumentMetadata?: TextractDocumentMetadataRaw
  // Present when additional result pages remain to be fetched.
  NextToken?: string
  AnalyzeDocumentModelVersion?: string
  DetectDocumentTextModelVersion?: string
}
/** Response from StartDocumentAnalysis/StartDocumentTextDetection. */
export interface TextractStartJobResponse {
  JobId: string
}
/** Full tool response wrapping the normalized Textract output. */
export interface TextractParserOutput extends ToolResponse {
  output: TextractNormalizedOutput
}