Files
sim/apps/sim/app/api/tools/textract/parse/route.ts
Waleed fcdcaed00d fix(memory): add Bun.gc, stream cancellation, and unconsumed fetch drains (#3416)
* fix(memory): add Bun.gc, stream cancellation, and unconsumed fetch drains

* fix(memory): await reader.cancel() and use non-blocking Bun.gc

* fix(memory): update Bun.gc comment to match non-blocking call

* fix(memory): use response.body.cancel() instead of response.text() for drains

* fix(executor): flush TextDecoder after streaming loop for multi-byte chars

* fix(memory): use text() drain for SecureFetchResponse which lacks body property

* fix(chat): prevent premature isExecuting=false from killing chat stream

The onExecutionCompleted/Error/Cancelled callbacks were setting
isExecuting=false as soon as the server-side SSE stream completed.
For chat executions, this triggered a useEffect in chat.tsx that
cancelled the client-side stream reader before it finished consuming
buffered data — causing empty or partial chat responses.

Skip the isExecuting=false in these callbacks for chat executions
since the chat's own finally block handles cleanup after the stream
is fully consumed.

* fix(chat): remove useEffect anti-pattern that killed chat stream on state change

The effect reacted to isExecuting becoming false to clean up streams,
but this is an anti-pattern per React guidelines — using state changes
as a proxy for events. All cleanup cases are already handled by proper
event paths: stream done (processStreamingResponse), user cancel
(handleStopStreaming), component unmount (cleanup effect), and
abort/error (catch block).

* fix(servicenow): remove invalid string comparison on numeric offset param

* upgrade turborepo
2026-03-04 17:46:20 -08:00

660 lines
20 KiB
TypeScript

import crypto from 'crypto'
import { createLogger } from '@sim/logger'
import { type NextRequest, NextResponse } from 'next/server'
import { z } from 'zod'
import { checkInternalAuth } from '@/lib/auth/hybrid'
import { DEFAULT_EXECUTION_TIMEOUT_MS } from '@/lib/core/execution-limits'
import { validateAwsRegion, validateS3BucketName } from '@/lib/core/security/input-validation'
import {
secureFetchWithPinnedIP,
validateUrlWithDNS,
} from '@/lib/core/security/input-validation.server'
import { generateRequestId } from '@/lib/core/utils/request'
import { RawFileInputSchema } from '@/lib/uploads/utils/file-schemas'
import { isInternalFileUrl, processSingleFileToUserFile } from '@/lib/uploads/utils/file-utils'
import {
downloadFileFromStorage,
resolveInternalFileUrl,
} from '@/lib/uploads/utils/file-utils.server'
// Route segment configuration exports.
// NOTE(review): presumably consumed by Next.js and expected to stay plain literals — confirm before refactoring.
export const dynamic = 'force-dynamic'
export const maxDuration = 300 // 5 minutes for large multi-page PDF processing
// Logger shared by every function in this file.
const logger = createLogger('TextractParseAPI')
/**
 * Shape of one entry in the request's `queries` array.
 *
 * The handler forwards `Text`, `Alias` and `Pages` under the same names inside
 * the Textract `QueriesConfig.Queries` list, which is why the keys are capitalized.
 */
const QuerySchema = z.object({
  // The query text; an empty string is rejected.
  Text: z.string().min(1),
  // Optional alias for the query.
  Alias: z.string().optional(),
  // Optional list of page selectors, each given as a string.
  Pages: z.string().array().optional(),
})
/**
 * Request body schema for the Textract parse endpoint.
 *
 * Besides the per-field rules, three cross-field rules are enforced in `superRefine`
 * (issues are added in this order):
 * - `region` must pass `validateAwsRegion`;
 * - `processingMode: 'async'` requires `s3Uri`;
 * - any other mode requires either `file` or `filePath`.
 */
const TextractParseSchema = z
  .object({
    // AWS credentials and region used to sign the Textract requests.
    accessKeyId: z.string().min(1, 'AWS Access Key ID is required'),
    secretAccessKey: z.string().min(1, 'AWS Secret Access Key is required'),
    region: z.string().min(1, 'AWS region is required'),
    // Defaults to 'sync' when omitted.
    processingMode: z.enum(['sync', 'async']).optional().default('sync'),
    // Document sources: `filePath` / `file` for sync mode, `s3Uri` for async mode.
    filePath: z.string().optional(),
    file: RawFileInputSchema.optional(),
    s3Uri: z.string().optional(),
    featureTypes: z.enum(['TABLES', 'FORMS', 'QUERIES', 'SIGNATURES', 'LAYOUT']).array().optional(),
    queries: z.array(QuerySchema).optional(),
  })
  .superRefine((input, ctx) => {
    // Every cross-field problem is reported as a custom issue attached to one field.
    const report = (field: 'region' | 's3Uri' | 'filePath', message: string | undefined): void => {
      ctx.addIssue({
        code: z.ZodIssueCode.custom,
        message,
        path: [field],
      })
    }

    const regionCheck = validateAwsRegion(input.region, 'AWS region')
    if (!regionCheck.isValid) {
      report('region', regionCheck.error)
    }

    const isAsync = input.processingMode === 'async'
    if (isAsync && !input.s3Uri) {
      report('s3Uri', 'S3 URI is required for multi-page processing (s3://bucket/key)')
    }

    const hasFileInput = Boolean(input.file) || Boolean(input.filePath)
    if (!isAsync && !hasFileInput) {
      report('filePath', 'File input is required for single-page processing')
    }
  })
/**
 * Derives the AWS Signature Version 4 signing key.
 *
 * The key is produced by a chain of HMAC-SHA256 operations: the secret (prefixed
 * with `AWS4`) keys an HMAC over the date stamp, and each result keys the next
 * HMAC over the region, the service name and finally the literal `aws4_request`.
 *
 * @param key - AWS secret access key.
 * @param dateStamp - Date portion of the request timestamp (the caller passes `YYYYMMDD`).
 * @param regionName - AWS region the request is scoped to.
 * @param serviceName - AWS service the request is scoped to.
 * @returns The raw signing key bytes.
 */
function getSignatureKey(
  key: string,
  dateStamp: string,
  regionName: string,
  serviceName: string
): Buffer {
  const hmacSha256 = (secret: string | Buffer, message: string): Buffer =>
    crypto.createHmac('sha256', secret).update(message).digest()

  const dateKey = hmacSha256(`AWS4${key}`, dateStamp)
  // Each remaining scope component is signed with the key produced by the previous step.
  return [regionName, serviceName, 'aws4_request'].reduce<Buffer>(
    (secret, scopePart) => hmacSha256(secret, scopePart),
    dateKey
  )
}
/**
 * Builds the HTTP headers for an AWS Signature Version 4 signed JSON request.
 *
 * The signature covers the `content-type`, `host`, `x-amz-date` and `x-amz-target`
 * headers plus the SHA-256 hash of `body`; the query string is treated as empty.
 * The timestamp is taken from the current clock, so each call yields a new signature.
 *
 * @param method - HTTP method placed in the canonical request.
 * @param host - Host header value that is signed and returned.
 * @param uri - Canonical URI (request path).
 * @param body - Exact request payload that will be sent.
 * @param accessKeyId - AWS access key id placed in the credential.
 * @param secretAccessKey - AWS secret used to derive the signing key.
 * @param region - AWS region for the credential scope.
 * @param service - AWS service name for the credential scope.
 * @param amzTarget - Value of the `X-Amz-Target` header.
 * @returns Headers to send with the request, including `Authorization`.
 */
function signAwsRequest(
  method: string,
  host: string,
  uri: string,
  body: string,
  accessKeyId: string,
  secretAccessKey: string,
  region: string,
  service: string,
  amzTarget: string
): Record<string, string> {
  const algorithm = 'AWS4-HMAC-SHA256'
  const contentType = 'application/x-amz-json-1.1'
  const sha256Hex = (data: string): string =>
    crypto.createHash('sha256').update(data).digest('hex')

  // ISO timestamp with dashes, colons and milliseconds removed, e.g. 20240101T120000Z.
  const amzDate = new Date().toISOString().replace(/[:-]|\.\d{3}/g, '')
  const dateStamp = amzDate.slice(0, 8)

  // Signed headers, already lower-cased and in alphabetical order.
  const headerPairs: Array<[string, string]> = [
    ['content-type', contentType],
    ['host', host],
    ['x-amz-date', amzDate],
    ['x-amz-target', amzTarget],
  ]
  const canonicalHeaders = headerPairs.map(([name, value]) => `${name}:${value}\n`).join('')
  const signedHeaders = headerPairs.map(([name]) => name).join(';')

  // The empty third element is the (absent) canonical query string.
  const canonicalRequest = [
    method,
    uri,
    '',
    canonicalHeaders,
    signedHeaders,
    sha256Hex(body),
  ].join('\n')

  const credentialScope = [dateStamp, region, service, 'aws4_request'].join('/')
  const stringToSign = [algorithm, amzDate, credentialScope, sha256Hex(canonicalRequest)].join(
    '\n'
  )

  const signature = crypto
    .createHmac('sha256', getSignatureKey(secretAccessKey, dateStamp, region, service))
    .update(stringToSign)
    .digest('hex')

  return {
    'Content-Type': contentType,
    Host: host,
    'X-Amz-Date': amzDate,
    'X-Amz-Target': amzTarget,
    Authorization: `${algorithm} Credential=${accessKeyId}/${credentialScope}, SignedHeaders=${signedHeaders}, Signature=${signature}`,
  }
}
/**
 * Downloads a document and returns it base64-encoded together with its content type.
 *
 * The URL is first checked with `validateUrlWithDNS`; the download is then issued
 * through `secureFetchWithPinnedIP` using the IP address that check resolved.
 *
 * @param url - Location of the document to download.
 * @returns Base64 payload and the `content-type` response header
 *          (`application/octet-stream` when the header is missing).
 * @throws Error when the URL fails validation or the response is not OK.
 */
async function fetchDocumentBytes(url: string): Promise<{ bytes: string; contentType: string }> {
  const validation = await validateUrlWithDNS(url, 'Document URL')
  if (!validation.isValid) {
    throw new Error(validation.error || 'Invalid document URL')
  }

  const response = await secureFetchWithPinnedIP(url, validation.resolvedIP!, {
    method: 'GET',
  })

  if (response.ok) {
    const payload = Buffer.from(await response.arrayBuffer())
    return {
      bytes: payload.toString('base64'),
      contentType: response.headers.get('content-type') || 'application/octet-stream',
    }
  }

  // Best-effort read of the unused error body before failing; read errors are ignored.
  await response.text().catch(() => {})
  throw new Error(`Failed to fetch document: ${response.statusText}`)
}
/**
 * Splits an `s3://bucket/key` URI into its bucket and key.
 *
 * @param s3Uri - URI of the form `s3://bucket-name/path/to/object`.
 * @returns The bucket name and object key.
 * @throws Error when the URI does not match the expected form, the bucket name is
 *         rejected by `validateS3BucketName`, or the key contains `..` or starts with `/`.
 */
function parseS3Uri(s3Uri: string): { bucket: string; key: string } {
  const parts = /^s3:\/\/([^/]+)\/(.+)$/.exec(s3Uri)
  if (parts === null) {
    throw new Error(
      `Invalid S3 URI format: ${s3Uri}. Expected format: s3://bucket-name/path/to/object`
    )
  }
  const [, bucket, key] = parts

  const bucketCheck = validateS3BucketName(bucket, 'S3 bucket name')
  if (!bucketCheck.isValid) {
    throw new Error(bucketCheck.error)
  }

  const looksLikeTraversal = key.startsWith('/') || key.includes('..')
  if (looksLikeTraversal) {
    throw new Error('S3 key contains invalid path traversal sequences')
  }

  return { bucket, key }
}
/**
 * Returns a promise that resolves after roughly `ms` milliseconds.
 *
 * @param ms - Delay passed to `setTimeout`.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms)
  })
}
/**
 * Sends one signed JSON `POST` to the Textract endpoint and returns the parsed response.
 *
 * @param host - Textract endpoint host name.
 * @param amzTarget - Operation name sent as `X-Amz-Target`.
 * @param body - Request payload; serialized with `JSON.stringify`.
 * @param accessKeyId - AWS access key id used for signing.
 * @param secretAccessKey - AWS secret access key used for signing.
 * @param region - AWS region used for signing.
 * @returns The response body parsed as JSON.
 * @throws Error when the response is not OK. The message is the error body's `Message`,
 *         or `__type: message` when only `__type` is present, or a generic message
 *         built from the status text when the body is not usable JSON.
 */
async function callTextractAsync(
  host: string,
  amzTarget: string,
  body: Record<string, unknown>,
  accessKeyId: string,
  secretAccessKey: string,
  region: string
): Promise<Record<string, unknown>> {
  // The exact string that is signed must also be the string that is sent.
  const payload = JSON.stringify(body)
  const signedHeaders = signAwsRequest(
    'POST',
    host,
    '/',
    payload,
    accessKeyId,
    secretAccessKey,
    region,
    'textract',
    amzTarget
  )

  const response = await fetch(`https://${host}/`, {
    method: 'POST',
    headers: signedHeaders,
    body: payload,
  })

  if (response.ok) {
    return response.json()
  }

  const rawError = await response.text()
  let message = `Textract API error: ${response.statusText}`
  try {
    const parsed = JSON.parse(rawError)
    if (parsed.Message) {
      message = parsed.Message
    } else if (parsed.__type) {
      message = `${parsed.__type}: ${parsed.message || rawError}`
    }
  } catch {
    // Error body was not usable JSON; keep the generic message.
  }
  throw new Error(message)
}
/**
 * Gathers the `Blocks` of a paginated Textract result.
 *
 * Starts from the blocks of `firstPage` and keeps requesting pages while the most
 * recent page carries a `NextToken`.
 *
 * @param firstPage - First result page, already fetched by the caller.
 * @param fetchPage - Fetches the page identified by the given `NextToken`.
 * @returns Blocks of all pages, in page order.
 */
async function collectPaginatedBlocks(
  firstPage: Record<string, unknown>,
  fetchPage: (nextToken: string) => Promise<Record<string, unknown>>
): Promise<unknown[]> {
  const blocks: unknown[] = [...((firstPage.Blocks as unknown[]) || [])]
  let nextToken = firstPage.NextToken as string | undefined
  while (nextToken) {
    const page = await fetchPage(nextToken)
    // Append in place; re-concatenating would copy the growing array on every page.
    for (const block of (page.Blocks as unknown[]) || []) {
      blocks.push(block)
    }
    nextToken = page.NextToken as string | undefined
  }
  return blocks
}

/**
 * Polls an asynchronous Textract job until it finishes and returns its full result.
 *
 * The job is queried every 5 seconds, for at most
 * `ceil(DEFAULT_EXECUTION_TIMEOUT_MS / 5000)` attempts. When the job reports
 * `SUCCEEDED` or `PARTIAL_SUCCESS`, every result page is fetched and the blocks are
 * merged; all other fields of the returned object come from the first page.
 *
 * @param host - Textract endpoint host name.
 * @param jobId - Id returned when the job was started.
 * @param accessKeyId - AWS access key id used for signing.
 * @param secretAccessKey - AWS secret access key used for signing.
 * @param region - AWS region used for signing.
 * @param useAnalyzeDocument - `true` to read results with `GetDocumentAnalysis`,
 *        `false` to use `GetDocumentTextDetection`.
 * @param requestId - Id included in log messages.
 * @returns The first result page with `Blocks` replaced by the blocks of all pages.
 * @throws Error when the job reports `FAILED`, when a Textract call fails, or when the
 *         job has not finished after the last attempt.
 */
async function pollForJobCompletion(
  host: string,
  jobId: string,
  accessKeyId: string,
  secretAccessKey: string,
  region: string,
  useAnalyzeDocument: boolean,
  requestId: string
): Promise<Record<string, unknown>> {
  const pollIntervalMs = 5000
  const maxPollTimeMs = DEFAULT_EXECUTION_TIMEOUT_MS
  const maxAttempts = Math.ceil(maxPollTimeMs / pollIntervalMs)
  const getTarget = useAnalyzeDocument
    ? 'Textract.GetDocumentAnalysis'
    : 'Textract.GetDocumentTextDetection'

  // Single place that knows how to request a page of this job's results.
  const fetchPage = (nextToken?: string): Promise<Record<string, unknown>> =>
    callTextractAsync(
      host,
      getTarget,
      nextToken ? { JobId: jobId, NextToken: nextToken } : { JobId: jobId },
      accessKeyId,
      secretAccessKey,
      region
    )

  for (let attempt = 0; attempt < maxAttempts; attempt++) {
    const result = await fetchPage()
    const jobStatus = result.JobStatus as string

    if (jobStatus === 'FAILED') {
      throw new Error(`Textract job failed: ${result.StatusMessage || 'Unknown error'}`)
    }

    if (jobStatus === 'SUCCEEDED' || jobStatus === 'PARTIAL_SUCCESS') {
      if (jobStatus === 'SUCCEEDED') {
        logger.info(`[${requestId}] Async job completed successfully after ${attempt + 1} polls`)
      } else {
        logger.warn(`[${requestId}] Job completed with partial success: ${result.StatusMessage}`)
      }
      const allBlocks = await collectPaginatedBlocks(result, fetchPage)
      return {
        ...result,
        Blocks: allBlocks,
      }
    }

    logger.info(`[${requestId}] Job status: ${jobStatus}, attempt ${attempt + 1}/${maxAttempts}`)
    // No point in waiting after the final attempt; fall through to the timeout error.
    if (attempt < maxAttempts - 1) {
      await sleep(pollIntervalMs)
    }
  }

  throw new Error(
    `Timeout waiting for Textract job to complete (max ${maxPollTimeMs / 1000} seconds)`
  )
}
/**
 * Handles `POST` requests that run a document through AWS Textract.
 *
 * Flow, as implemented below:
 * 1. Authenticate with `checkInternalAuth`; respond 401 when it fails or yields no user id.
 * 2. Validate the JSON body against `TextractParseSchema` (a `ZodError` becomes a 400).
 * 3. `processingMode === 'async'`: start a Textract job for the object named by `s3Uri`,
 *    wait for it with `pollForJobCompletion`, and return the collected blocks.
 *    A malformed `s3Uri` is answered with 400.
 * 4. Otherwise: load the document (uploaded `file`, internal file URL, or an external URL
 *    accepted by `validateUrlWithDNS`), base64-encode it, and call `AnalyzeDocument`
 *    when feature types were requested or `DetectDocumentText` when not.
 *
 * A non-OK Textract response in the synchronous path is relayed with Textract's own
 * status code; any other unexpected failure is reported as a 500.
 *
 * @param request - Incoming request whose JSON body matches `TextractParseSchema`.
 * @returns JSON of the form `{ success, output }` or `{ success: false, error }`.
 */
export async function POST(request: NextRequest) {
  const requestId = generateRequestId()

  try {
    const authResult = await checkInternalAuth(request, { requireWorkflowId: false })
    if (!authResult.success || !authResult.userId) {
      logger.warn(`[${requestId}] Unauthorized Textract parse attempt`, {
        error: authResult.error || 'Missing userId',
      })
      return NextResponse.json(
        {
          success: false,
          error: authResult.error || 'Unauthorized',
        },
        { status: 401 }
      )
    }
    const userId = authResult.userId

    const body = await request.json()
    const validatedData = TextractParseSchema.parse(body)
    const processingMode = validatedData.processingMode || 'sync'
    const featureTypes = validatedData.featureTypes ?? []
    const useAnalyzeDocument = featureTypes.length > 0
    const host = `textract.${validatedData.region}.amazonaws.com`

    // Built once and shared by the async and sync request bodies. Only sent when the
    // caller supplied queries AND asked for the QUERIES feature.
    const queriesConfig =
      useAnalyzeDocument &&
      validatedData.queries &&
      validatedData.queries.length > 0 &&
      featureTypes.includes('QUERIES')
        ? {
            Queries: validatedData.queries.map((q) => ({
              Text: q.Text,
              Alias: q.Alias,
              Pages: q.Pages,
            })),
          }
        : undefined

    logger.info(`[${requestId}] Textract parse request`, {
      processingMode,
      hasFile: Boolean(validatedData.file),
      hasS3Uri: Boolean(validatedData.s3Uri),
      featureTypes,
      userId,
    })

    if (processingMode === 'async') {
      if (!validatedData.s3Uri) {
        return NextResponse.json(
          {
            success: false,
            error: 'S3 URI is required for multi-page processing (s3://bucket/key)',
          },
          { status: 400 }
        )
      }

      // A malformed URI is a client error: answer 400 here instead of letting the
      // exception reach the generic handler, which would report it as a 500.
      let s3Location: { bucket: string; key: string }
      try {
        s3Location = parseS3Uri(validatedData.s3Uri)
      } catch (error) {
        const message = error instanceof Error ? error.message : 'Invalid S3 URI'
        logger.warn(`[${requestId}] Invalid S3 URI`, { error: message })
        return NextResponse.json(
          {
            success: false,
            error: message,
          },
          { status: 400 }
        )
      }
      const { bucket: s3Bucket, key: s3Key } = s3Location

      logger.info(`[${requestId}] Starting async Textract job`, { s3Bucket, s3Key })

      const startTarget = useAnalyzeDocument
        ? 'Textract.StartDocumentAnalysis'
        : 'Textract.StartDocumentTextDetection'
      const startBody: Record<string, unknown> = {
        DocumentLocation: {
          S3Object: {
            Bucket: s3Bucket,
            Name: s3Key,
          },
        },
      }
      if (useAnalyzeDocument) {
        startBody.FeatureTypes = featureTypes
        if (queriesConfig) {
          startBody.QueriesConfig = queriesConfig
        }
      }

      const startResult = await callTextractAsync(
        host,
        startTarget,
        startBody,
        validatedData.accessKeyId,
        validatedData.secretAccessKey,
        validatedData.region
      )
      const jobId = startResult.JobId as string
      if (!jobId) {
        throw new Error('Failed to start Textract job: No JobId returned')
      }
      logger.info(`[${requestId}] Async job started`, { jobId })

      const textractData = await pollForJobCompletion(
        host,
        jobId,
        validatedData.accessKeyId,
        validatedData.secretAccessKey,
        validatedData.region,
        useAnalyzeDocument,
        requestId
      )

      logger.info(`[${requestId}] Textract async parse successful`, {
        pageCount: (textractData.DocumentMetadata as { Pages?: number })?.Pages ?? 0,
        blockCount: (textractData.Blocks as unknown[])?.length ?? 0,
      })

      return NextResponse.json({
        success: true,
        output: {
          blocks: textractData.Blocks ?? [],
          documentMetadata: {
            pages: (textractData.DocumentMetadata as { Pages?: number })?.Pages ?? 0,
          },
          modelVersion: (textractData.AnalyzeDocumentModelVersion ??
            textractData.DetectDocumentTextModelVersion) as string | undefined,
        },
      })
    }

    // ----- Synchronous (single document sent inline) path -----
    let bytes = ''
    let contentType = 'application/octet-stream'
    // Only used to choose a more helpful message when Textract rejects the format.
    let isPdf = false

    if (validatedData.file) {
      let userFile
      try {
        userFile = processSingleFileToUserFile(validatedData.file, requestId, logger)
      } catch (error) {
        return NextResponse.json(
          {
            success: false,
            error: error instanceof Error ? error.message : 'Failed to process file',
          },
          { status: 400 }
        )
      }
      const buffer = await downloadFileFromStorage(userFile, requestId, logger)
      bytes = buffer.toString('base64')
      contentType = userFile.type || 'application/octet-stream'
      isPdf = contentType.includes('pdf') || userFile.name?.toLowerCase().endsWith('.pdf')
    } else if (validatedData.filePath) {
      let fileUrl = validatedData.filePath
      const isInternalFilePath = isInternalFileUrl(fileUrl)

      if (isInternalFilePath) {
        const resolution = await resolveInternalFileUrl(fileUrl, userId, requestId, logger)
        if (resolution.error) {
          return NextResponse.json(
            {
              success: false,
              error: resolution.error.message,
            },
            { status: resolution.error.status }
          )
        }
        fileUrl = resolution.fileUrl || fileUrl
      } else if (fileUrl.startsWith('/')) {
        logger.warn(`[${requestId}] Invalid internal path`, {
          userId,
          path: fileUrl.substring(0, 50),
        })
        return NextResponse.json(
          {
            success: false,
            error: 'Invalid file path. Only uploaded files are supported for internal paths.',
          },
          { status: 400 }
        )
      } else {
        const urlValidation = await validateUrlWithDNS(fileUrl, 'Document URL')
        if (!urlValidation.isValid) {
          logger.warn(`[${requestId}] SSRF attempt blocked`, {
            userId,
            url: fileUrl.substring(0, 100),
            error: urlValidation.error,
          })
          return NextResponse.json(
            {
              success: false,
              error: urlValidation.error,
            },
            { status: 400 }
          )
        }
      }

      const fetched = await fetchDocumentBytes(fileUrl)
      bytes = fetched.bytes
      contentType = fetched.contentType
      // Ignore any query string or fragment so URLs such as `doc.pdf?signature=...`
      // are still recognised as PDFs.
      const urlWithoutQuery = fileUrl.split(/[?#]/, 1)[0] ?? fileUrl
      isPdf = contentType.includes('pdf') || urlWithoutQuery.toLowerCase().endsWith('.pdf')
    } else {
      return NextResponse.json(
        {
          success: false,
          error: 'File input is required for single-page processing',
        },
        { status: 400 }
      )
    }

    const uri = '/'
    const amzTarget = useAnalyzeDocument
      ? 'Textract.AnalyzeDocument'
      : 'Textract.DetectDocumentText'
    const textractBody: Record<string, unknown> = {
      Document: {
        Bytes: bytes,
      },
    }
    if (useAnalyzeDocument) {
      textractBody.FeatureTypes = featureTypes
      if (queriesConfig) {
        textractBody.QueriesConfig = queriesConfig
      }
    }

    // The signed string and the transmitted body must be the same serialization.
    const bodyString = JSON.stringify(textractBody)
    const headers = signAwsRequest(
      'POST',
      host,
      uri,
      bodyString,
      validatedData.accessKeyId,
      validatedData.secretAccessKey,
      validatedData.region,
      'textract',
      amzTarget
    )

    const textractResponse = await fetch(`https://${host}${uri}`, {
      method: 'POST',
      headers,
      body: bodyString,
    })

    if (!textractResponse.ok) {
      const errorText = await textractResponse.text()
      logger.error(`[${requestId}] Textract API error:`, errorText)

      let errorMessage = `Textract API error: ${textractResponse.statusText}`
      let isUnsupportedFormat = false
      try {
        const errorJson = JSON.parse(errorText)
        if (errorJson.Message) {
          errorMessage = errorJson.Message
        } else if (errorJson.__type) {
          errorMessage = `${errorJson.__type}: ${errorJson.message || errorText}`
        }
        // Check for unsupported document format error
        isUnsupportedFormat =
          errorJson.__type === 'UnsupportedDocumentException' ||
          errorJson.Message?.toLowerCase().includes('unsupported document') ||
          errorText.toLowerCase().includes('unsupported document')
      } catch {
        // Body was not JSON; fall back to a plain substring check.
        isUnsupportedFormat = errorText.toLowerCase().includes('unsupported document')
      }

      // Provide helpful message for unsupported format (likely multi-page PDF)
      if (isUnsupportedFormat && isPdf) {
        errorMessage =
          'This document format is not supported in Single Page mode. If this is a multi-page PDF, please use "Multi-Page (PDF, TIFF via S3)" mode instead, which requires uploading your document to S3 first. Single Page mode only supports JPEG, PNG, and single-page PDF files.'
      }

      return NextResponse.json(
        {
          success: false,
          error: errorMessage,
        },
        { status: textractResponse.status }
      )
    }

    const textractData = await textractResponse.json()

    logger.info(`[${requestId}] Textract parse successful`, {
      pageCount: textractData.DocumentMetadata?.Pages ?? 0,
      blockCount: textractData.Blocks?.length ?? 0,
    })

    return NextResponse.json({
      success: true,
      output: {
        blocks: textractData.Blocks ?? [],
        documentMetadata: {
          pages: textractData.DocumentMetadata?.Pages ?? 0,
        },
        modelVersion:
          textractData.AnalyzeDocumentModelVersion ??
          textractData.DetectDocumentTextModelVersion ??
          undefined,
      },
    })
  } catch (error) {
    if (error instanceof z.ZodError) {
      logger.warn(`[${requestId}] Invalid request data`, { errors: error.errors })
      return NextResponse.json(
        {
          success: false,
          error: 'Invalid request data',
          details: error.errors,
        },
        { status: 400 }
      )
    }

    logger.error(`[${requestId}] Error in Textract parse:`, error)
    return NextResponse.json(
      {
        success: false,
        error: error instanceof Error ? error.message : 'Internal server error',
      },
      { status: 500 }
    )
  }
}