sim/apps/sim/app/api/knowledge/[id]/documents/route.ts
Waleed Latif 76df2b9cd9 fix(sockets): added throttling, refactor entire socket server, added tests (#534)
* refactor(kb): use chonkie locally (#475)

* feat(parsers): text and markdown parsers (#473)

* feat: text and markdown parsers

* fix: don't readfile on buffer, convert buffer to string instead

* fix(knowledge-wh): fixed authentication error on webhook trigger

* feat(tools): add huggingface tools/block (#472)

* add hugging face tool

* docs: add Hugging Face tool documentation

* fix: format and lint Hugging Face integration files

* docs: add manual intro section to Hugging Face documentation

* feat: replace Record<string, any> with proper HuggingFaceRequestBody interface

* accidental local files added

* restore some docs

* make layout full for model field

* change huggingface logo

* add manual content

* fix lint

---------

Co-authored-by: Vikhyath Mondreti <vikhyathmondreti@Vikhyaths-MacBook-Air.local>

* fix(knowledge-ux): fixed ux for knowledge base (#478)

* fix(billing): bump better-auth version & fix existing subscription issue when adding seats (#484)

* bump better-auth version & fix existing subscription issue when adding seats

* ack PR comments

* fix(env): added NEXT_PUBLIC_APP_URL to .env.example (#485)

* feat(subworkflows): workflows as a block within workflows (#480)

* feat(subworkflows): workflows in workflows

* revert sync changes

* working output vars

* fix greptile comments

* add cycle detection

* add tests

* working tests

* works

* fix formatting

* fix input var handling

* add images

---------

Co-authored-by: Vikhyath Mondreti <vikhyathmondreti@Vikhyaths-MacBook-Air.local>
Co-authored-by: Vikhyath Mondreti <vikhyathmondreti@Vikhyaths-Air.attlocal.net>

* fix(kb): fixed kb race condition resulting in no chunks found (#487)

* fix: added all blocks activeExecutionPath (#486)

* refactor(chunker): replace chonkie with custom TextChunker (#479)

* refactor(chunker): replace chonkie with custom TextChunker implementation and update document processing logic

* chore: cleanup unimplemented types

* fix: KB tests updated

* fix(tab-sync): sync between tabs on change (#489)

* fix(tab-sync): sync between tabs on change

* refactor: optimize JSON.stringify operations that are redundant

* fix(file-upload): upload presigned url to kb for file upload instead of the whole file, circumvents 4.5MB serverless func limit (#491)

* feat(folders): folders to manage workflows (#490)

* feat(subworkflows): workflows in workflows

* revert sync changes

* working output vars

* fix greptile comments

* add cycle detection

* add tests

* working tests

* works

* fix formatting

* fix input var handling

* fix(tab-sync): sync between tabs on change

* feat(folders): folders to organize workflows

* address comments

* change schema types

* fix lint error

* fix typing error

* fix race cond

* delete unused files

* improved UI

* updated naming conventions

* revert unrelated changes to db schema

* fixed collapsed sidebar subfolders

* add logs filters for folders

---------

Co-authored-by: Vikhyath Mondreti <vikhyathmondreti@Vikhyaths-MacBook-Air.local>
Co-authored-by: Vikhyath Mondreti <vikhyathmondreti@Vikhyaths-Air.attlocal.net>
Co-authored-by: Waleed Latif <walif6@gmail.com>

* revert tab sync

* improvement(folders): added multi-select for moving folders (#493)

* added multi-select for folders

* allow drag into root

* remove extraneous comments

* instantly create workflow on plus

* styling improvements, fixed flicker

* small improvement to dragover container

* ack PR comments

* fix(deployed-chat): made the chat mobile friendly (#494)

* improvement(ui/ux): chat deploy (#496)

* improvement(ui/ux): chat deploy experience

* improvement(ui/ux): chat fontweight

* feat(gmail): added option to access raw gmail from gmail polling service (#495)

* added option to grab raw gmail from gmail polling service

* safe json parse for function block execution to prevent vars in raw email from being resolved as sim studio vars

* added tests

* remove extraneous comments

* fix(ui): fix the UI for folder deletion, huggingface icon, workflow block icon, standardized alert dialog (#498)

* fixed folder delete UI

* fixed UI for workflow block, huggingface, & added alert dialog for deleting folders

* consistently style all alert dialogs

* fix(reset-data): remove reset all data button from settings modal along with logic (#499)

* fix(airtable): fixed airtable oauth token refresh, added tests (#502)

* fixed airtable token refresh, added tests

* added helpers for refreshOAuthToken function

* feat(registration): disable registration + handle env booleans (#501)

* feat: disable registration + handle env booleans

* chore: removing pre-process because we need to use util

* chore: format

* feat(providers): added azure openai (#503)

* added azure openai

* fix request params being passed through agent block for azure

* remove o1 from azure-openai models list

* fix: add vscode settings to gitignore

* feat(file-upload): generalized storage to support azure blob, enhanced error logging in kb, added xlsx parser (#506)

* added blob storage option for azure, refactored storage client to be provider agnostic, tested kb & file upload and s3 is undisrupted, still have to test blob

* updated CORS policy for blob, added azure blob-specific headers

* remove extraneous comments

* add file size limit and timeout

* added some extra error handling in kb add documents

* grouped envvars

* ack PR comments

* added sheetjs and xlsx parser

* fix(folders): modified folder deletion to delete subfolders & workflows in it instead of moving to root (#508)

* modified folder deletion to delete subfolders & workflows in it instead of moving to root

* added additional testing utils

* ack PR comments

* feat: api response block and implementation

* improvement(local-storage): remove use of local storage except for oauth and last active workspace id (#497)

* remove local storage usage

* remove migration for last active workspace id

* Update apps/sim/app/w/[id]/components/workflow-block/components/sub-block/components/file-selector/components/jira-issue-selector.tsx

Add fallback for required scopes

Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>

* add url builder util

* fi

* fix lint

* lint

* modify pre commit hook

* fix oauth

* get last active workspace working again

* new workspace logic works

* fetch locks

* works now

* remove empty useEffect

* fix loading issue

* skip empty workflow syncs

* use isWorkspace in transition flag

* add logging

* add data initialized flag

* fix lint

* fix: build error by creating server-side utils

* remove migration snapshots

* reverse search for workspace based on workflow id

* fix lint

* improvement: loading check and animation

* remove unused utils

* remove console logs

---------

Co-authored-by: Vikhyath Mondreti <vikhyathmondreti@Vikhyaths-Air.attlocal.net>
Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>
Co-authored-by: Emir Karabeg <emirkarabeg@berkeley.edu>
Co-authored-by: Vikhyath Mondreti <vikhyathmondreti@vikhyaths-air.lan>

* feat(multi-select): simplified chat to always return readable stream, can select multiple outputs and get response streamed back in chat panel & deployed chat (#507)

* improvement: all workflow executions return ReadableStream & use sse to support multiple streamed outputs in chats

* fixed build

* remove extraneous comments

* general improvements

* ack PR comments

* fixed build

* improvement(workflow-state): split workflow state into separate tables (#511)

* new tables to track workflow state

* fix lint

* refactor into separate tables

* fix typing

* fix lint

* add tests

* fix lint

* add correct foreign key constraint

* add self ref

* remove unused checks

* fix types

* fix type

---------

Co-authored-by: Vikhyath Mondreti <vikhyathmondreti@Vikhyaths-Air.attlocal.net>

* feat(models): added new openai models, updated model pricing, added new groq model (#513)

* fix(autocomplete): fixed extra closing tag on tag dropdown autocomplete (#514)

* chore: enable input format again

* fix: process the input made on api calls with proper extraction

* feat: add json-object for ai generation for response block and others

* chore: add documentation for response block

* chore: rollback temp fix and uncomment original input handler

* chore: add missing mock for response handler

* chore: add missing mock

* chore: greptile recommendations

* added cost tracking for router & evaluator blocks, consolidated model information into a single file, hosted keys for evaluator & router, parallelized unit tests (#516)

* fix(deployState): deploy not persisting bug (#518)

* fix(undeploy-bug): fix deployment persistence failing bug

* fix lint

---------

Co-authored-by: Vikhyath Mondreti <vikhyathmondreti@Vikhyaths-MacBook-Air.local>

* fix decimal entry issues

* remove unused files

* fix(db): decimal position entry issues (#520)

* fix decimal entry issues

* remove unused files

---------

Co-authored-by: Vikhyath Mondreti <vikhyathmondreti@Vikhyaths-Air.attlocal.net>

* fix lint

* fix test

* improvement(kb): added configurability for chunks, query across multiple knowledge bases (#512)

* refactor: consolidate create modal file

* fix: identify dead processes

* fix: mark failed in DB after processing timeout

* improvement: added overlap chunks and fixed modal UI

* feat: multiselect logic

* fix: biome changes for css ordering warn instead of error

* improvement: create chunk ui

* fix: removed unused schema columns

* fix: removed references to deleted columns

* improvement: sped up vector search time

* feat: multi-kb search

* add bulk endpoint to disable/delete multiple chunks

* fix: removed unused schema columns

* fix: removed references to deleted columns

* made endpoints for knowledge more RESTful, added tests

* added batch operations for delete/enable/disable docs, already have this for chunks

* added migrations

---------

Co-authored-by: Waleed Latif <walif6@gmail.com>

* fix(models): remove temp from models that don't support it

* feat(sdk): added ts and python SDKs + docs (#524)

* added ts & python sdk, renamed cli from simstudio to cli

* added docs

* ack PR comments

* improvements

* fixed issue where it goes to random workspace when you click reload

fixed lint issue

* feat: better response builder + doc update

* fix(auth): added preview URLs to list of trusted origins (#525)

* trusted origins

* lint error

* removed localhost

* ran lint

---------

Co-authored-by: Waleed Latif <walif6@gmail.com>

* fix(sdk): remove dev script from SDK

* PR: changes for migration

* add changes on top of db migration changes

* fix: allow removing single input field

* improvement(permissions): workspace permissions improvements, added provider and reduced API calls by 85% (#530)

* improved permissions UI & access patterns, show outstanding invites

* added logger

* added provider for workspace permissions, 85% reduction in API calls to get user permissions and improved performance for invitations

* ack PR comments

* cleanup

* fix disabled tooltips

* improvement(tests): parallelized tests and build fixes (#531)

* added provider for workspace permissions, 85% reduction in API calls to get user permissions and improved performance for invitations

* parallelized more tests, fixed test warnings

* removed waitlist verification route, use more utils in tests

* fixed build

* ack PR comments

* fix

* fix(kb): reduced params in kb block, added advanced mode to starter block, updated docs

* feat(realtime): sockets + normalized tables + deprecate sync (#523)

* feat: implement real-time collaborative workflow editing with Socket.IO

- Add Socket.IO server with room-based architecture for workflow collaboration
- Implement socket context for client-side real-time communication
- Add collaborative workflow hook for synchronized state management
- Update CSP to allow socket connections to localhost:3002
- Add fallback authentication for testing collaborative features
- Enable real-time broadcasting of workflow operations between tabs
- Support multi-user editing of blocks, edges, and workflow state

Key components:
- socket-server/: Complete Socket.IO server with authentication and room management
- contexts/socket-context.tsx: Client-side socket connection and state management
- hooks/use-collaborative-workflow.ts: Hook for collaborative workflow operations
- Workflow store integration for real-time state synchronization

Status: Basic collaborative features working, authentication bypass enabled for testing

* feat: complete collaborative subblock editing implementation

All collaborative features now working perfectly:
- Real-time block movement and positioning
- Real-time subblock value editing (text fields, inputs)
- Real-time edge operations and parent updates
- Multi-user workflow rooms with proper broadcasting
- Socket.IO server with room-based architecture
- Permission bypass system for testing

🔧 Technical improvements:
- Modified useSubBlockValue hook to use collaborative event system
- All subblock setValue calls now dispatch 'update-subblock-value' events
- Collaborative workflow hook handles all real-time operations
- Socket server processes and persists all operations to database
- Clean separation between local and collaborative state management

🧪 Tested and verified:
- Multiple browser tabs with different fallback users
- Block dragging and positioning updates in real-time
- Subblock text editing reflects immediately across tabs
- Workflow room management and user presence
- Database persistence of all collaborative operations

Status: Full collaborative workflow editing working with fallback authentication

* feat: implement proper authentication for collaborative Socket.IO server

**Authentication System Complete**:
- Removed all fallback authentication code and bypasses
- Socket server now requires valid Better Auth session cookies
- Proper session validation using auth.api.getSession()
- Authentication errors properly handled and logged
- User info extracted from session: userId, userName, email, organizationId

🔧 **Technical Implementation**:
- Updated CSP to allow WebSocket connections (ws://localhost:3002)
- Socket authentication middleware validates session tokens
- Proper error handling for missing/invalid sessions
- Permission system enforces workflow access controls
- Clean separation between authenticated and unauthenticated states

🧪 **Testing Status**:
- Socket server properly rejects unauthenticated connections
- Authentication errors logged with clear messages
- CSP updated to allow both HTTP and WebSocket protocols
- Ready for testing with authenticated users

Status: Production-ready collaborative authentication system

* feat: complete authentication integration for collaborative Socket.IO system

🎉 **PRODUCTION-READY COLLABORATIVE SYSTEM**

**Authentication Integration Complete**:
- Fixed Socket.IO client to send credentials (withCredentials: true)
- Updated server CORS to accept credentials with specific origin
- Removed all fallback authentication bypasses
- Proper Better Auth session validation working

🔧 **Technical Fixes**:
- Socket client: Enable withCredentials for cookie transmission
- Socket server: Accept credentials with origin 'http://localhost:3000'
- Better Auth cookie utility integration for session parsing
- Comprehensive authentication middleware with proper error handling

🧪 **Verified Working Features**:
- Real user authentication (Vikhyath Mondreti authenticated)
- Multi-user workflow rooms (2+ users in same workflow)
- Permission system enforcing workflow access controls
- Real-time subblock editing across browser tabs
- Block movement and positioning updates
- Automatic room cleanup and management
- Database persistence of all collaborative operations

🚀 **Status**: Complete enterprise-grade collaborative workflow editing system
- No more fallback users - production authentication
- Multi-tab collaboration working perfectly
- Secure access control with Better Auth integration
- Real-time updates for all workflow operations

* remove sync system and move to server side

* fix lint

* delete unused file

* added socketio dep

* fix subblock persistence bug

* working deletion of workflows

* fix lint

* added railway

* add debug logging for railway deployment

* improve typing

* fix lint

* working subflow persistence

* fix lint

* working cascade deletion

* fix lint

* working subflow inside subflow

* works

* fix lint

* prevent subflow in subflow

* fix lint

* add additional logs, add localhost as allowedOrigin

* fix type error

* remove unused code

* fix lint

* fix tests

* fix lint

* fix build error

* working folder updates

* fix typing issue

* fix lint

* fix typing issues

* lib/

* fix tests

* added old presence component back, updated to use one-time-token better auth plugin for socket server auth, tested

* fix errors

* fix bugs

* add migration scripts to run

* fix lint

* fix deploy tests

* fix lint

* fix minor issues

* fix lint

* fix migration script

* allow comma-separated id file input to migration script

* fix lint

* fixed

* fix lint

* fix fallback case

* fix type errors

* address greptile comments

* fix lint

* fix script to generate new block ids

* fix lint

---------

Co-authored-by: Vikhyath Mondreti <vikhyathmondreti@Vikhyaths-Air.attlocal.net>
Co-authored-by: Vikhyath Mondreti <vikhyathmondreti@vikhyaths-air.lan>
Co-authored-by: Waleed Latif <walif6@gmail.com>
Co-authored-by: Vikhyath Mondreti <vikhyathmondreti@Vikhyaths-MacBook-Air.local>

* fix(sockets): updated CSP

* remove unnecessary logs

* fix lint

* added throttling, refactor entire socket server, added tests

* improvements

* remove self monitoring func, add block name event

* working isWide, isAdvanced toggles with sockets

* fix lint

* fix duplicate key issue for user avatar

* fix lint

* fix user presence

* working parallel badges / loop badges updates

* working connection output persistence

* fix lint

* fix build errors

* fix lint

* logs removed

* fix cascade var name update bug

* works

* fix lint

* fix parallel blocks

* fix placeholder

* fix test

* fixed tests

---------

Co-authored-by: Aditya Tripathi <aditya@climactic.co>
Co-authored-by: Adam Gough <77861281+aadamgough@users.noreply.github.com>
Co-authored-by: Vikhyath Mondreti <vikhyathvikku@gmail.com>
Co-authored-by: Vikhyath Mondreti <vikhyathmondreti@Vikhyaths-MacBook-Air.local>
Co-authored-by: Vikhyath Mondreti <vikhyathmondreti@Vikhyaths-Air.attlocal.net>
Co-authored-by: Emir Karabeg <emirkarabeg@berkeley.edu>
Co-authored-by: Emir Karabeg <78010029+emir-karabeg@users.noreply.github.com>
Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>
Co-authored-by: Vikhyath Mondreti <vikhyathmondreti@vikhyaths-air.lan>
Co-authored-by: Ajit Kadaveru <ajit.kadaveru@berkeley.edu>
2025-06-24 17:44:30 -07:00

536 lines
17 KiB
TypeScript

import crypto from 'node:crypto'
import { and, desc, eq, inArray, isNull } from 'drizzle-orm'
import { type NextRequest, NextResponse } from 'next/server'
import { z } from 'zod'
import { getSession } from '@/lib/auth'
import { createLogger } from '@/lib/logs/console-logger'
import { db } from '@/db'
import { document } from '@/db/schema'
import { checkKnowledgeBaseAccess, processDocumentAsync } from '../../utils'
const logger = createLogger('DocumentsAPI')
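
// Throttling configuration for background document processing: documents are
// handled in small batches, with a cap on concurrent processing and short
// delays between batches and documents to avoid overwhelming downstream services.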
const PROCESSING_CONFIG = {
  maxConcurrentDocuments: 3,
  batchSize: 5,
  delayBetweenBatches: 1000,
  delayBetweenDocuments: 500,
}
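
// Splits the created documents into fixed-size batches and processes them
// sequentially, pausing between batches per PROCESSING_CONFIG. Each batch is
// handed to processBatchWithConcurrency, which enforces the concurrency cap.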
async function processDocumentsWithConcurrencyControl(
  createdDocuments: Array<{
    documentId: string
    filename: string
    fileUrl: string
    fileSize: number
    mimeType: string
  }>,
  knowledgeBaseId: string,
  processingOptions: {
    chunkSize: number
    minCharactersPerChunk: number
    recipe: string
    lang: string
    chunkOverlap: number
  },
  requestId: string
): Promise<void> {
  const totalDocuments = createdDocuments.length
  const batches = []

  for (let i = 0; i < totalDocuments; i += PROCESSING_CONFIG.batchSize) {
    batches.push(createdDocuments.slice(i, i + PROCESSING_CONFIG.batchSize))
  }

  logger.info(`[${requestId}] Processing ${totalDocuments} documents in ${batches.length} batches`)

  for (const [batchIndex, batch] of batches.entries()) {
    logger.info(
      `[${requestId}] Starting batch ${batchIndex + 1}/${batches.length} with ${batch.length} documents`
    )

    await processBatchWithConcurrency(batch, knowledgeBaseId, processingOptions, requestId)

    if (batchIndex < batches.length - 1) {
      await new Promise((resolve) => setTimeout(resolve, PROCESSING_CONFIG.delayBetweenBatches))
    }
  }

  logger.info(`[${requestId}] Completed processing initiation for all ${totalDocuments} documents`)
}
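
// Processes one batch, limiting concurrency with a simple slot-based
// semaphore: each document polls for a free slot, marks it busy while
// processing, and frees it in the finally block. Failures are recorded on the
// document row (processingStatus: 'failed') rather than aborting the batch.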
async function processBatchWithConcurrency(
  batch: Array<{
    documentId: string
    filename: string
    fileUrl: string
    fileSize: number
    mimeType: string
  }>,
  knowledgeBaseId: string,
  processingOptions: {
    chunkSize: number
    minCharactersPerChunk: number
    recipe: string
    lang: string
    chunkOverlap: number
  },
  requestId: string
): Promise<void> {
  const semaphore = new Array(PROCESSING_CONFIG.maxConcurrentDocuments).fill(0)

  const processingPromises = batch.map(async (doc, index) => {
    if (index > 0) {
      await new Promise((resolve) =>
        setTimeout(resolve, index * PROCESSING_CONFIG.delayBetweenDocuments)
      )
    }

    await new Promise<void>((resolve) => {
      const checkSlot = () => {
        const availableIndex = semaphore.findIndex((slot) => slot === 0)
        if (availableIndex !== -1) {
          semaphore[availableIndex] = 1
          resolve()
        } else {
          setTimeout(checkSlot, 100)
        }
      }
      checkSlot()
    })

    try {
      logger.info(`[${requestId}] Starting processing for document: ${doc.filename}`)

      await processDocumentAsync(
        knowledgeBaseId,
        doc.documentId,
        {
          filename: doc.filename,
          fileUrl: doc.fileUrl,
          fileSize: doc.fileSize,
          mimeType: doc.mimeType,
        },
        processingOptions
      )

      logger.info(`[${requestId}] Successfully initiated processing for document: ${doc.filename}`)
    } catch (error: unknown) {
      logger.error(`[${requestId}] Failed to process document: ${doc.filename}`, {
        documentId: doc.documentId,
        filename: doc.filename,
        error: error instanceof Error ? error.message : 'Unknown error',
      })

      try {
        await db
          .update(document)
          .set({
            processingStatus: 'failed',
            processingError:
              error instanceof Error ? error.message : 'Failed to initiate processing',
            processingCompletedAt: new Date(),
          })
          .where(eq(document.id, doc.documentId))
      } catch (dbError: unknown) {
        logger.error(
          `[${requestId}] Failed to update document status for failed document: ${doc.documentId}`,
          dbError
        )
      }
    } finally {
      const slotIndex = semaphore.findIndex((slot) => slot === 1)
      if (slotIndex !== -1) {
        semaphore[slotIndex] = 0
      }
    }
  })

  await Promise.allSettled(processingPromises)
}
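
// Request validation schemas for the POST (single and bulk create) and
// PATCH (bulk enable/disable/delete) handlers below.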
const CreateDocumentSchema = z.object({
  filename: z.string().min(1, 'Filename is required'),
  fileUrl: z.string().url('File URL must be valid'),
  fileSize: z.number().min(1, 'File size must be greater than 0'),
  mimeType: z.string().min(1, 'MIME type is required'),
})

const BulkCreateDocumentsSchema = z.object({
  documents: z.array(CreateDocumentSchema),
  processingOptions: z.object({
    chunkSize: z.number().min(100).max(4000),
    minCharactersPerChunk: z.number().min(50).max(2000),
    recipe: z.string(),
    lang: z.string(),
    chunkOverlap: z.number().min(0).max(500),
  }),
  bulk: z.literal(true),
})

const BulkUpdateDocumentsSchema = z.object({
  operation: z.enum(['enable', 'disable', 'delete']),
  documentIds: z
    .array(z.string())
    .min(1, 'At least one document ID is required')
    .max(100, 'Cannot operate on more than 100 documents at once'),
})
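
// GET /api/knowledge/[id]/documents
// Lists the non-deleted documents in a knowledge base; disabled documents are
// excluded unless ?includeDisabled=true is passed.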
export async function GET(req: NextRequest, { params }: { params: Promise<{ id: string }> }) {
  const requestId = crypto.randomUUID().slice(0, 8)
  const { id: knowledgeBaseId } = await params

  try {
    const session = await getSession()
    if (!session?.user?.id) {
      logger.warn(`[${requestId}] Unauthorized documents access attempt`)
      return NextResponse.json({ error: 'Unauthorized' }, { status: 401 })
    }

    const accessCheck = await checkKnowledgeBaseAccess(knowledgeBaseId, session.user.id)
    if (!accessCheck.hasAccess) {
      if ('notFound' in accessCheck && accessCheck.notFound) {
        logger.warn(`[${requestId}] Knowledge base not found: ${knowledgeBaseId}`)
        return NextResponse.json({ error: 'Knowledge base not found' }, { status: 404 })
      }
      logger.warn(
        `[${requestId}] User ${session.user.id} attempted to access unauthorized knowledge base documents ${knowledgeBaseId}`
      )
      return NextResponse.json({ error: 'Unauthorized' }, { status: 401 })
    }

    const url = new URL(req.url)
    const includeDisabled = url.searchParams.get('includeDisabled') === 'true'

    // Build where conditions
    const whereConditions = [
      eq(document.knowledgeBaseId, knowledgeBaseId),
      isNull(document.deletedAt),
    ]

    // Filter out disabled documents unless specifically requested
    if (!includeDisabled) {
      whereConditions.push(eq(document.enabled, true))
    }

    const documents = await db
      .select({
        id: document.id,
        filename: document.filename,
        fileUrl: document.fileUrl,
        fileSize: document.fileSize,
        mimeType: document.mimeType,
        chunkCount: document.chunkCount,
        tokenCount: document.tokenCount,
        characterCount: document.characterCount,
        processingStatus: document.processingStatus,
        processingStartedAt: document.processingStartedAt,
        processingCompletedAt: document.processingCompletedAt,
        processingError: document.processingError,
        enabled: document.enabled,
        uploadedAt: document.uploadedAt,
      })
      .from(document)
      .where(and(...whereConditions))
      .orderBy(desc(document.uploadedAt))

    logger.info(
      `[${requestId}] Retrieved ${documents.length} documents for knowledge base ${knowledgeBaseId}`
    )

    return NextResponse.json({
      success: true,
      data: documents,
    })
  } catch (error) {
    logger.error(`[${requestId}] Error fetching documents`, error)
    return NextResponse.json({ error: 'Failed to fetch documents' }, { status: 500 })
  }
}
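
// POST /api/knowledge/[id]/documents
// Creates document records. With bulk: true, all records are inserted in one
// transaction and processing is kicked off in the background (fire-and-forget),
// so the response returns immediately with per-document pending statuses.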
export async function POST(req: NextRequest, { params }: { params: Promise<{ id: string }> }) {
  const requestId = crypto.randomUUID().slice(0, 8)
  const { id: knowledgeBaseId } = await params

  try {
    const session = await getSession()
    if (!session?.user?.id) {
      logger.warn(`[${requestId}] Unauthorized document creation attempt`)
      return NextResponse.json({ error: 'Unauthorized' }, { status: 401 })
    }

    const accessCheck = await checkKnowledgeBaseAccess(knowledgeBaseId, session.user.id)
    if (!accessCheck.hasAccess) {
      if ('notFound' in accessCheck && accessCheck.notFound) {
        logger.warn(`[${requestId}] Knowledge base not found: ${knowledgeBaseId}`)
        return NextResponse.json({ error: 'Knowledge base not found' }, { status: 404 })
      }
      logger.warn(
        `[${requestId}] User ${session.user.id} attempted to create document in unauthorized knowledge base ${knowledgeBaseId}`
      )
      return NextResponse.json({ error: 'Unauthorized' }, { status: 401 })
    }

    const body = await req.json()

    // Check if this is a bulk operation
    if (body.bulk === true) {
      // Handle bulk processing (replaces process-documents endpoint)
      try {
        const validatedData = BulkCreateDocumentsSchema.parse(body)

        const createdDocuments = await db.transaction(async (tx) => {
          const documentPromises = validatedData.documents.map(async (docData) => {
            const documentId = crypto.randomUUID()
            const now = new Date()

            const newDocument = {
              id: documentId,
              knowledgeBaseId,
              filename: docData.filename,
              fileUrl: docData.fileUrl,
              fileSize: docData.fileSize,
              mimeType: docData.mimeType,
              chunkCount: 0,
              tokenCount: 0,
              characterCount: 0,
              processingStatus: 'pending' as const,
              enabled: true,
              uploadedAt: now,
            }

            await tx.insert(document).values(newDocument)

            logger.info(
              `[${requestId}] Document record created: ${documentId} for file: ${docData.filename}`
            )

            return { documentId, ...docData }
          })

          return await Promise.all(documentPromises)
        })

        logger.info(
          `[${requestId}] Starting controlled async processing of ${createdDocuments.length} documents`
        )

        processDocumentsWithConcurrencyControl(
          createdDocuments,
          knowledgeBaseId,
          validatedData.processingOptions,
          requestId
        ).catch((error: unknown) => {
          logger.error(`[${requestId}] Critical error in document processing pipeline:`, error)
        })

        return NextResponse.json({
          success: true,
          data: {
            total: createdDocuments.length,
            documentsCreated: createdDocuments.map((doc) => ({
              documentId: doc.documentId,
              filename: doc.filename,
              status: 'pending',
            })),
            processingMethod: 'background',
            processingConfig: {
              maxConcurrentDocuments: PROCESSING_CONFIG.maxConcurrentDocuments,
              batchSize: PROCESSING_CONFIG.batchSize,
              totalBatches: Math.ceil(createdDocuments.length / PROCESSING_CONFIG.batchSize),
            },
          },
        })
      } catch (validationError) {
        if (validationError instanceof z.ZodError) {
          logger.warn(`[${requestId}] Invalid bulk processing request data`, {
            errors: validationError.errors,
          })
          return NextResponse.json(
            { error: 'Invalid request data', details: validationError.errors },
            { status: 400 }
          )
        }
        throw validationError
      }
    } else {
      // Handle single document creation
      try {
        const validatedData = CreateDocumentSchema.parse(body)

        const documentId = crypto.randomUUID()
        const now = new Date()

        const newDocument = {
          id: documentId,
          knowledgeBaseId,
          filename: validatedData.filename,
          fileUrl: validatedData.fileUrl,
          fileSize: validatedData.fileSize,
          mimeType: validatedData.mimeType,
          chunkCount: 0,
          tokenCount: 0,
          characterCount: 0,
          enabled: true,
          uploadedAt: now,
        }

        await db.insert(document).values(newDocument)

        logger.info(
          `[${requestId}] Document created: ${documentId} in knowledge base ${knowledgeBaseId}`
        )

        return NextResponse.json({
          success: true,
          data: newDocument,
        })
      } catch (validationError) {
        if (validationError instanceof z.ZodError) {
          logger.warn(`[${requestId}] Invalid document data`, {
            errors: validationError.errors,
          })
          return NextResponse.json(
            { error: 'Invalid request data', details: validationError.errors },
            { status: 400 }
          )
        }
        throw validationError
      }
    }
  } catch (error) {
    logger.error(`[${requestId}] Error creating document`, error)
    return NextResponse.json({ error: 'Failed to create document' }, { status: 500 })
  }
}
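
// PATCH /api/knowledge/[id]/documents
// Bulk enable/disable/delete for up to 100 documents at a time. Documents are
// first verified to belong to this knowledge base; delete only sets deletedAt
// (soft delete).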
export async function PATCH(req: NextRequest, { params }: { params: Promise<{ id: string }> }) {
  const requestId = crypto.randomUUID().slice(0, 8)
  const { id: knowledgeBaseId } = await params

  try {
    const session = await getSession()
    if (!session?.user?.id) {
      logger.warn(`[${requestId}] Unauthorized bulk document operation attempt`)
      return NextResponse.json({ error: 'Unauthorized' }, { status: 401 })
    }

    const accessCheck = await checkKnowledgeBaseAccess(knowledgeBaseId, session.user.id)
    if (!accessCheck.hasAccess) {
      if ('notFound' in accessCheck && accessCheck.notFound) {
        logger.warn(`[${requestId}] Knowledge base not found: ${knowledgeBaseId}`)
        return NextResponse.json({ error: 'Knowledge base not found' }, { status: 404 })
      }
      logger.warn(
        `[${requestId}] User ${session.user.id} attempted to perform bulk operation on unauthorized knowledge base ${knowledgeBaseId}`
      )
      return NextResponse.json({ error: 'Unauthorized' }, { status: 401 })
    }

    const body = await req.json()

    try {
      const validatedData = BulkUpdateDocumentsSchema.parse(body)
      const { operation, documentIds } = validatedData

      logger.info(
        `[${requestId}] Starting bulk ${operation} operation on ${documentIds.length} documents in knowledge base ${knowledgeBaseId}`
      )

      // Verify all documents belong to this knowledge base and user has access
      const documentsToUpdate = await db
        .select({
          id: document.id,
          enabled: document.enabled,
        })
        .from(document)
        .where(
          and(
            eq(document.knowledgeBaseId, knowledgeBaseId),
            inArray(document.id, documentIds),
            isNull(document.deletedAt)
          )
        )

      if (documentsToUpdate.length === 0) {
        return NextResponse.json({ error: 'No valid documents found to update' }, { status: 404 })
      }

      if (documentsToUpdate.length !== documentIds.length) {
        logger.warn(
          `[${requestId}] Some documents not found or don't belong to knowledge base. Requested: ${documentIds.length}, Found: ${documentsToUpdate.length}`
        )
      }

      // Perform the bulk operation
      let updateResult: Array<{ id: string; enabled?: boolean; deletedAt?: Date | null }>
      let successCount: number

      if (operation === 'delete') {
        // Handle bulk soft delete
        updateResult = await db
          .update(document)
          .set({
            deletedAt: new Date(),
          })
          .where(
            and(
              eq(document.knowledgeBaseId, knowledgeBaseId),
              inArray(document.id, documentIds),
              isNull(document.deletedAt)
            )
          )
          .returning({ id: document.id, deletedAt: document.deletedAt })

        successCount = updateResult.length
      } else {
        // Handle bulk enable/disable
        const enabled = operation === 'enable'

        updateResult = await db
          .update(document)
          .set({
            enabled,
          })
          .where(
            and(
              eq(document.knowledgeBaseId, knowledgeBaseId),
              inArray(document.id, documentIds),
              isNull(document.deletedAt)
            )
          )
          .returning({ id: document.id, enabled: document.enabled })

        successCount = updateResult.length
      }

      logger.info(
        `[${requestId}] Bulk ${operation} operation completed: ${successCount} documents updated in knowledge base ${knowledgeBaseId}`
      )

      return NextResponse.json({
        success: true,
        data: {
          operation,
          successCount,
          updatedDocuments: updateResult,
        },
      })
    } catch (validationError) {
      if (validationError instanceof z.ZodError) {
        logger.warn(`[${requestId}] Invalid bulk operation data`, {
          errors: validationError.errors,
        })
        return NextResponse.json(
          { error: 'Invalid request data', details: validationError.errors },
          { status: 400 }
        )
      }
      throw validationError
    }
  } catch (error) {
    logger.error(`[${requestId}] Error in bulk document operation`, error)
    return NextResponse.json({ error: 'Failed to perform bulk operation' }, { status: 500 })
  }
}
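
For reference, a minimal client-side sketch of calling these endpoints. This is illustrative only: the knowledge base id, filenames, and option values are hypothetical, and the request bodies simply follow CreateDocumentSchema, BulkCreateDocumentsSchema, and BulkUpdateDocumentsSchema above.

// Hypothetical client usage: ids, URLs, and option values are made up;
// body shapes follow the zod schemas defined in this route.
const kbId = 'kb_123'

// Bulk-create documents and kick off background processing
await fetch(`/api/knowledge/${kbId}/documents`, {
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  credentials: 'include', // getSession() requires the session cookie
  body: JSON.stringify({
    bulk: true,
    documents: [
      {
        filename: 'report.pdf',
        fileUrl: 'https://example.com/report.pdf',
        fileSize: 1024,
        mimeType: 'application/pdf',
      },
    ],
    processingOptions: {
      chunkSize: 1024, // allowed range 100-4000
      minCharactersPerChunk: 100, // allowed range 50-2000
      recipe: 'default', // hypothetical recipe name
      lang: 'en',
      chunkOverlap: 200, // allowed range 0-500
    },
  }),
})

// Disable several documents in one PATCH call (max 100 ids per request)
await fetch(`/api/knowledge/${kbId}/documents`, {
  method: 'PATCH',
  headers: { 'Content-Type': 'application/json' },
  credentials: 'include',
  body: JSON.stringify({ operation: 'disable', documentIds: ['doc_1', 'doc_2'] }),
})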