improvement(kb): encode non-ASCII headers for kb uploads (#1595)

* improvement(kb): encode non-ASCII headers for kb uploads

* cleanup

* increase timeouts to match trigger
Author: Waleed, 2025-10-10 16:36:25 -07:00 (committed by waleed)
Parent: 97a8778449
Commit: 241d9fd12d
7 changed files with 42 additions and 20 deletions


@@ -1,11 +1,11 @@
 import { readFile } from 'fs/promises'
 import type { NextRequest } from 'next/server'
 import { NextResponse } from 'next/server'
+import { checkHybridAuth } from '@/lib/auth/hybrid'
 import { createLogger } from '@/lib/logs/console/logger'
 import { downloadFile, getStorageProvider, isUsingCloudStorage } from '@/lib/uploads'
 import { S3_KB_CONFIG } from '@/lib/uploads/setup'
 import '@/lib/uploads/setup.server'
-import { getSession } from '@/lib/auth'
 import {
   createErrorResponse,
   createFileResponse,
@@ -29,23 +29,19 @@ export async function GET(
   logger.info('File serve request:', { path })

-  const session = await getSession()
-  if (!session?.user?.id) {
-    logger.warn('Unauthorized file access attempt', { path })
+  const authResult = await checkHybridAuth(request, { requireWorkflowId: false })
+  if (!authResult.success) {
+    logger.warn('Unauthorized file access attempt', { path, error: authResult.error })
     return NextResponse.json({ error: 'Unauthorized' }, { status: 401 })
   }

-  const userId = session.user.id
+  const userId = authResult.userId

   const fullPath = path.join('/')
   const isS3Path = path[0] === 's3'
   const isBlobPath = path[0] === 'blob'
   const isCloudPath = isS3Path || isBlobPath
   const cloudKey = isCloudPath ? path.slice(1).join('/') : fullPath
   const isExecutionFile = cloudKey.split('/').length >= 3 && !cloudKey.startsWith('kb/')

   if (!isExecutionFile) {
     logger.info('Authenticated file access granted', { userId, path: cloudKey })
   }

   if (isUsingCloudStorage() || isCloudPath) {
     const bucketType = request.nextUrl.searchParams.get('bucket')
@@ -64,7 +60,7 @@
   }
 }

-async function handleLocalFile(filename: string, userId: string): Promise<NextResponse> {
+async function handleLocalFile(filename: string, userId?: string): Promise<NextResponse> {
   try {
     const filePath = findLocalFile(filename)

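Note: the hunks above swap session-only auth for checkHybridAuth, so internal service tokens (used when the document processor fetches /api/files/serve/ URLs later in this diff) can also pass the check. Judging purely from the usage shown here, the result shape looks roughly like the sketch below; this is an inference, the real type lives in @/lib/auth/hybrid, and userId is plausibly optional since handleLocalFile now takes userId?: string.

// Sketch only: result shape implied by the usage above, not the actual type
type HybridAuthResult = {
  success: boolean
  userId?: string // may be absent for non-user (internal token) callers
  error?: string // set when success is false
}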

@@ -123,8 +123,7 @@ export async function POST(request: NextRequest) {
     }
   }

-  // Create the serve path
-  const servePath = `/api/files/serve/${result.key}`
+  const servePath = result.path

   const uploadResult = {
     name: originalName,


@@ -307,6 +307,22 @@ function getSecureFileHeaders(filename: string, originalContentType: string) {
   }
 }

+/**
+ * Encode filename for Content-Disposition header to support non-ASCII characters
+ * Uses RFC 5987 encoding for international characters
+ */
+function encodeFilenameForHeader(filename: string): string {
+  const hasNonAscii = /[^\x00-\x7F]/.test(filename)
+  if (!hasNonAscii) {
+    return `filename="${filename}"`
+  }
+
+  const encodedFilename = encodeURIComponent(filename)
+  const asciiSafe = filename.replace(/[^\x00-\x7F]/g, '_')
+  return `filename="${asciiSafe}"; filename*=UTF-8''${encodedFilename}`
+}
+
 /**
  * Create a file response with appropriate security headers
  */

@@ -317,7 +333,7 @@ export function createFileResponse(file: FileResponse): NextResponse {
     status: 200,
     headers: {
       'Content-Type': contentType,
-      'Content-Disposition': `${disposition}; filename="${file.filename}"`,
+      'Content-Disposition': `${disposition}; ${encodeFilenameForHeader(file.filename)}`,
       'Cache-Control': 'public, max-age=31536000', // Cache for 1 year
       'X-Content-Type-Options': 'nosniff',
       'Content-Security-Policy': "default-src 'none'; style-src 'unsafe-inline'; sandbox;",

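For illustration, this is what encodeFilenameForHeader as defined above should emit (the sample filenames are hypothetical). Per RFC 5987, ASCII-only names keep the plain filename= form, while non-ASCII names get an underscore ASCII fallback plus a UTF-8 filename* parameter that compliant clients prefer.

// Sketch: expected Content-Disposition fragments from the function above
encodeFilenameForHeader('report.pdf')
// -> filename="report.pdf"

encodeFilenameForHeader('résumé.pdf')
// -> filename="r_sum_.pdf"; filename*=UTF-8''r%C3%A9sum%C3%A9.pdf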

@@ -26,7 +26,7 @@ export type DocumentProcessingPayload = {

 export const processDocument = task({
   id: 'knowledge-process-document',
-  maxDuration: env.KB_CONFIG_MAX_DURATION || 300,
+  maxDuration: env.KB_CONFIG_MAX_DURATION || 600,
   retry: {
     maxAttempts: env.KB_CONFIG_MAX_ATTEMPTS || 3,
     factor: env.KB_CONFIG_RETRY_FACTOR || 2,


@@ -146,7 +146,7 @@ export const env = createEnv({
   RATE_LIMIT_ENTERPRISE_ASYNC: z.string().optional().default('1000'), // Enterprise tier async API executions per minute

   // Knowledge Base Processing Configuration - Shared across all processing methods
-  KB_CONFIG_MAX_DURATION: z.number().optional().default(300), // Max processing duration in seconds
+  KB_CONFIG_MAX_DURATION: z.number().optional().default(600), // Max processing duration in seconds (10 minutes)
   KB_CONFIG_MAX_ATTEMPTS: z.number().optional().default(3), // Max retry attempts
   KB_CONFIG_RETRY_FACTOR: z.number().optional().default(2), // Retry backoff factor
   KB_CONFIG_MIN_TIMEOUT: z.number().optional().default(1000), // Min timeout in ms


@@ -180,7 +180,9 @@ async function parseDocument(
 }

 async function handleFileForOCR(fileUrl: string, filename: string, mimeType: string) {
-  if (fileUrl.startsWith('https://')) {
+  const isExternalHttps = fileUrl.startsWith('https://') && !fileUrl.includes('/api/files/serve/')
+
+  if (isExternalHttps) {
     return { httpsUrl: fileUrl }
   }
@@ -207,7 +209,16 @@ async function downloadFileWithTimeout(fileUrl: string): Promise<Buffer> {
   const timeoutId = setTimeout(() => controller.abort(), TIMEOUTS.FILE_DOWNLOAD)

   try {
-    const response = await fetch(fileUrl, { signal: controller.signal })
+    const isInternalFileServe = fileUrl.includes('/api/files/serve/')
+    const headers: HeadersInit = {}
+
+    if (isInternalFileServe) {
+      const { generateInternalToken } = await import('@/lib/auth/internal')
+      const token = await generateInternalToken()
+      headers.Authorization = `Bearer ${token}`
+    }
+
+    const response = await fetch(fileUrl, { signal: controller.signal, headers })
     clearTimeout(timeoutId)

     if (!response.ok) {

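Note: this pairs with the auth change at the top of the diff. Now that /api/files/serve/ requires authentication, the document processor must present a token when fetching the app's own URLs. A minimal sketch of the pattern, assuming generateInternalToken issues a short-lived service token that checkHybridAuth accepts:

// Sketch, not repo code: authenticate server-to-server fetches of internal URLs
import { generateInternalToken } from '@/lib/auth/internal'

async function fetchInternalFile(url: string): Promise<Response> {
  const token = await generateInternalToken() // assumed: short-lived signed service token
  return fetch(url, { headers: { Authorization: `Bearer ${token}` } })
}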

@@ -17,14 +17,14 @@ import type { DocumentSortField, SortOrder } from './types'
 const logger = createLogger('DocumentService')

 const TIMEOUTS = {
-  OVERALL_PROCESSING: (env.KB_CONFIG_MAX_DURATION || 600) * 1000, // Increased to 10 minutes to match Trigger's timeout
+  OVERALL_PROCESSING: (env.KB_CONFIG_MAX_DURATION || 600) * 1000, // Default 10 minutes for KB document processing
   EMBEDDINGS_API: (env.KB_CONFIG_MAX_TIMEOUT || 10000) * 18,
 } as const

 // Configuration for handling large documents
 const LARGE_DOC_CONFIG = {
   MAX_CHUNKS_PER_BATCH: 500, // Insert embeddings in batches of 500
-  MAX_EMBEDDING_BATCH: 50, // Generate embeddings in batches of 50
+  MAX_EMBEDDING_BATCH: 500, // Generate embeddings in batches of 500
   MAX_FILE_SIZE: 100 * 1024 * 1024, // 100MB max file size
   MAX_CHUNKS_PER_DOCUMENT: 100000, // Maximum chunks allowed per document
 }
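
Rough math on the MAX_EMBEDDING_BATCH bump (a sketch, not repo code): raising the batch size from 50 to 500 cuts embedding API round-trips tenfold for large documents, which fits the raised 600 s processing budget above.

// Sketch: request-count impact of the batch sizes above
const MAX_EMBEDDING_BATCH = 500 // was 50 before this commit
const MAX_CHUNKS_PER_BATCH = 500

function requestCounts(totalChunks: number) {
  return {
    embeddingCalls: Math.ceil(totalChunks / MAX_EMBEDDING_BATCH),
    insertBatches: Math.ceil(totalChunks / MAX_CHUNKS_PER_BATCH),
  }
}

// e.g. requestCounts(10_000) -> { embeddingCalls: 20, insertBatches: 20 } (previously 200 embedding calls)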