mirror of https://github.com/simstudioai/sim.git
synced 2026-01-08 22:48:14 -05:00
improvement(kb): encode non-ASCII headers for kb uploads (#1595)
* improvement(kb): encode non-ASCII headers for kb uploads
* cleanup
* increase timeouts to match trigger

@@ -1,11 +1,11 @@
 import { readFile } from 'fs/promises'
 import type { NextRequest } from 'next/server'
 import { NextResponse } from 'next/server'
+import { checkHybridAuth } from '@/lib/auth/hybrid'
 import { createLogger } from '@/lib/logs/console/logger'
 import { downloadFile, getStorageProvider, isUsingCloudStorage } from '@/lib/uploads'
 import { S3_KB_CONFIG } from '@/lib/uploads/setup'
 import '@/lib/uploads/setup.server'
-import { getSession } from '@/lib/auth'
 import {
   createErrorResponse,
   createFileResponse,

@@ -29,23 +29,19 @@ export async function GET(

   logger.info('File serve request:', { path })

-  const session = await getSession()
-  if (!session?.user?.id) {
-    logger.warn('Unauthorized file access attempt', { path })
+  const authResult = await checkHybridAuth(request, { requireWorkflowId: false })
+
+  if (!authResult.success) {
+    logger.warn('Unauthorized file access attempt', { path, error: authResult.error })
     return NextResponse.json({ error: 'Unauthorized' }, { status: 401 })
   }

-  const userId = session.user.id
+  const userId = authResult.userId
   const fullPath = path.join('/')
   const isS3Path = path[0] === 's3'
   const isBlobPath = path[0] === 'blob'
   const isCloudPath = isS3Path || isBlobPath
   const cloudKey = isCloudPath ? path.slice(1).join('/') : fullPath
-  const isExecutionFile = cloudKey.split('/').length >= 3 && !cloudKey.startsWith('kb/')
-
-  if (!isExecutionFile) {
-    logger.info('Authenticated file access granted', { userId, path: cloudKey })
-  }

   if (isUsingCloudStorage() || isCloudPath) {
     const bucketType = request.nextUrl.searchParams.get('bucket')
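
The hunk above leans on checkHybridAuth's result object without showing its type. Judging purely from this usage, it resolves to something like the hypothetical shape below; the real definition lives in '@/lib/auth/hybrid' and may differ:

// Hypothetical result shape inferred from the usage above, not the
// actual type exported by '@/lib/auth/hybrid'.
interface HybridAuthResult {
  success: boolean // false -> the route answers 401
  userId?: string  // present on success; now also accepted as optional downstream
  error?: string   // failure reason, logged alongside the path
}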

@@ -64,7 +60,7 @@ export async function GET(
   }
 }

-async function handleLocalFile(filename: string, userId: string): Promise<NextResponse> {
+async function handleLocalFile(filename: string, userId?: string): Promise<NextResponse> {
   try {
     const filePath = findLocalFile(filename)


@@ -123,8 +123,7 @@ export async function POST(request: NextRequest) {
     }
   }

-  // Create the serve path
-  const servePath = `/api/files/serve/${result.key}`
+  const servePath = result.path

   const uploadResult = {
     name: originalName,

@@ -307,6 +307,22 @@ function getSecureFileHeaders(filename: string, originalContentType: string) {
   }
 }

+/**
+ * Encode filename for Content-Disposition header to support non-ASCII characters
+ * Uses RFC 5987 encoding for international characters
+ */
+function encodeFilenameForHeader(filename: string): string {
+  const hasNonAscii = /[^\x00-\x7F]/.test(filename)
+
+  if (!hasNonAscii) {
+    return `filename="${filename}"`
+  }
+
+  const encodedFilename = encodeURIComponent(filename)
+  const asciiSafe = filename.replace(/[^\x00-\x7F]/g, '_')
+  return `filename="${asciiSafe}"; filename*=UTF-8''${encodedFilename}`
+}
+
 /**
  * Create a file response with appropriate security headers
  */
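
To see what the new helper emits, here is a standalone copy of encodeFilenameForHeader from the hunk above with two sample calls; the outputs are computed for illustration, not taken from the repository:

function encodeFilenameForHeader(filename: string): string {
  const hasNonAscii = /[^\x00-\x7F]/.test(filename)
  if (!hasNonAscii) {
    return `filename="${filename}"`
  }
  const encodedFilename = encodeURIComponent(filename)
  const asciiSafe = filename.replace(/[^\x00-\x7F]/g, '_')
  return `filename="${asciiSafe}"; filename*=UTF-8''${encodedFilename}`
}

// ASCII names keep the plain quoted form:
console.log(encodeFilenameForHeader('report.pdf'))
// -> filename="report.pdf"

// Non-ASCII names get an ASCII fallback plus an RFC 5987 filename* parameter;
// browsers that understand filename* restore the original name on download:
console.log(encodeFilenameForHeader('résumé.pdf'))
// -> filename="r_sum_.pdf"; filename*=UTF-8''r%C3%A9sum%C3%A9.pdf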

@@ -317,7 +333,7 @@ export function createFileResponse(file: FileResponse): NextResponse {
     status: 200,
     headers: {
       'Content-Type': contentType,
-      'Content-Disposition': `${disposition}; filename="${file.filename}"`,
+      'Content-Disposition': `${disposition}; ${encodeFilenameForHeader(file.filename)}`,
       'Cache-Control': 'public, max-age=31536000', // Cache for 1 year
       'X-Content-Type-Options': 'nosniff',
       'Content-Security-Policy': "default-src 'none'; style-src 'unsafe-inline'; sandbox;",

@@ -26,7 +26,7 @@ export type DocumentProcessingPayload = {

 export const processDocument = task({
   id: 'knowledge-process-document',
-  maxDuration: env.KB_CONFIG_MAX_DURATION || 300,
+  maxDuration: env.KB_CONFIG_MAX_DURATION || 600,
   retry: {
     maxAttempts: env.KB_CONFIG_MAX_ATTEMPTS || 3,
     factor: env.KB_CONFIG_RETRY_FACTOR || 2,

@@ -146,7 +146,7 @@ export const env = createEnv({
   RATE_LIMIT_ENTERPRISE_ASYNC: z.string().optional().default('1000'), // Enterprise tier async API executions per minute

   // Knowledge Base Processing Configuration - Shared across all processing methods
-  KB_CONFIG_MAX_DURATION: z.number().optional().default(300), // Max processing duration in s
+  KB_CONFIG_MAX_DURATION: z.number().optional().default(600), // Max processing duration in seconds (10 minutes)
   KB_CONFIG_MAX_ATTEMPTS: z.number().optional().default(3), // Max retry attempts
   KB_CONFIG_RETRY_FACTOR: z.number().optional().default(2), // Retry backoff factor
   KB_CONFIG_MIN_TIMEOUT: z.number().optional().default(1000), // Min timeout in ms
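
This is the single knob the commit turns: the Trigger task above reads it as a per-run ceiling in seconds, and the DocumentService timeout further down multiplies the same value into milliseconds, so both stay at 10 minutes. In numbers, assuming the env var is left unset:

// Worked example with the new default; names mirror the diff, values assume
// KB_CONFIG_MAX_DURATION is not overridden in the environment.
const KB_CONFIG_MAX_DURATION = 600                        // seconds (zod default above)
const triggerMaxDuration = KB_CONFIG_MAX_DURATION         // 600 s task ceiling
const OVERALL_PROCESSING = KB_CONFIG_MAX_DURATION * 1000  // 600_000 ms = 10 minutes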

@@ -180,7 +180,9 @@ async function parseDocument(
 }

 async function handleFileForOCR(fileUrl: string, filename: string, mimeType: string) {
-  if (fileUrl.startsWith('https://')) {
+  const isExternalHttps = fileUrl.startsWith('https://') && !fileUrl.includes('/api/files/serve/')
+
+  if (isExternalHttps) {
     return { httpsUrl: fileUrl }
   }


@@ -207,7 +209,16 @@ async function downloadFileWithTimeout(fileUrl: string): Promise<Buffer> {
   const timeoutId = setTimeout(() => controller.abort(), TIMEOUTS.FILE_DOWNLOAD)

   try {
-    const response = await fetch(fileUrl, { signal: controller.signal })
+    const isInternalFileServe = fileUrl.includes('/api/files/serve/')
+    const headers: HeadersInit = {}
+
+    if (isInternalFileServe) {
+      const { generateInternalToken } = await import('@/lib/auth/internal')
+      const token = await generateInternalToken()
+      headers.Authorization = `Bearer ${token}`
+    }
+
+    const response = await fetch(fileUrl, { signal: controller.signal, headers })
     clearTimeout(timeoutId)

     if (!response.ok) {
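
The diff only shows generateInternalToken() producing a value used as a Bearer credential. Under that assumption, the download path now behaves roughly like this self-contained sketch; downloadWithInternalAuth, getToken, and the 30_000 ms default are stand-ins for the real downloadFileWithTimeout, '@/lib/auth/internal', and TIMEOUTS.FILE_DOWNLOAD:

async function downloadWithInternalAuth(
  fileUrl: string,
  getToken: () => Promise<string>,
  timeoutMs = 30_000
): Promise<Buffer> {
  const controller = new AbortController()
  const timeoutId = setTimeout(() => controller.abort(), timeoutMs)
  try {
    const headers: HeadersInit = {}
    // Only internal /api/files/serve/ URLs get the internal bearer token;
    // external URLs are fetched anonymously, as before this commit.
    if (fileUrl.includes('/api/files/serve/')) {
      headers.Authorization = `Bearer ${await getToken()}`
    }
    const response = await fetch(fileUrl, { signal: controller.signal, headers })
    if (!response.ok) throw new Error(`Download failed: ${response.status}`)
    return Buffer.from(await response.arrayBuffer())
  } finally {
    clearTimeout(timeoutId)
  }
}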

@@ -17,14 +17,14 @@ import type { DocumentSortField, SortOrder } from './types'
 const logger = createLogger('DocumentService')

 const TIMEOUTS = {
-  OVERALL_PROCESSING: (env.KB_CONFIG_MAX_DURATION || 600) * 1000, // Increased to 10 minutes to match Trigger's timeout
+  OVERALL_PROCESSING: (env.KB_CONFIG_MAX_DURATION || 600) * 1000, // Default 10 minutes for KB document processing
   EMBEDDINGS_API: (env.KB_CONFIG_MAX_TIMEOUT || 10000) * 18,
 } as const

 // Configuration for handling large documents
 const LARGE_DOC_CONFIG = {
   MAX_CHUNKS_PER_BATCH: 500, // Insert embeddings in batches of 500
-  MAX_EMBEDDING_BATCH: 50, // Generate embeddings in batches of 50
+  MAX_EMBEDDING_BATCH: 500, // Generate embeddings in batches of 500
   MAX_FILE_SIZE: 100 * 1024 * 1024, // 100MB max file size
   MAX_CHUNKS_PER_DOCUMENT: 100000, // Maximum chunks allowed per document
 }
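
The batching loop that consumes these constants is not part of the diff, but raising MAX_EMBEDDING_BATCH from 50 to 500 cuts embedding API round-trips per document by 10x. A generic sketch of how such a loop typically uses the constant (hypothetical, not the DocumentService implementation):

const MAX_EMBEDDING_BATCH = 500

async function embedInBatches(
  chunks: string[],
  embed: (batch: string[]) => Promise<number[][]>
): Promise<number[][]> {
  const embeddings: number[][] = []
  for (let i = 0; i < chunks.length; i += MAX_EMBEDDING_BATCH) {
    // e.g. a 10,000-chunk document now needs 20 calls instead of 200
    const batch = chunks.slice(i, i + MAX_EMBEDDING_BATCH)
    embeddings.push(...(await embed(batch)))
  }
  return embeddings
}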