mirror of
https://github.com/simstudioai/sim.git
synced 2026-04-28 03:00:29 -04:00
improvement(kb): deferred content fetching and metadata-based hashes for connectors (#4044)
* improvement(kb): deferred content fetching and metadata-based hashes for connectors * fix(kb): remove message count from outlook contentHash to prevent list/get divergence * fix(kb): increase outlook getDocument message limit from 50 to 250 * fix(kb): skip outlook messages without conversationId to prevent broken stubs * fix(kb): scope outlook getDocument to same folder as listDocuments to prevent hash divergence * fix(kb): add missing connector sync cron job to Helm values The connector sync endpoint existed but had no cron job configured to trigger it, meaning scheduled syncs would never fire. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -2,7 +2,7 @@ import { createLogger } from '@sim/logger'
|
||||
import { AsanaIcon } from '@/components/icons'
|
||||
import { fetchWithRetry, VALIDATE_RETRY_OPTIONS } from '@/lib/knowledge/documents/utils'
|
||||
import type { ConnectorConfig, ExternalDocument, ExternalDocumentList } from '@/connectors/types'
|
||||
import { computeContentHash, joinTagArray, parseTagDate } from '@/connectors/utils'
|
||||
import { joinTagArray, parseTagDate } from '@/connectors/utils'
|
||||
|
||||
const logger = createLogger('AsanaConnector')
|
||||
|
||||
@@ -240,7 +240,6 @@ export const asanaConnector: ConnectorConfig = {
|
||||
|
||||
for (const task of result.data) {
|
||||
const content = buildTaskContent(task)
|
||||
const contentHash = await computeContentHash(content)
|
||||
const tagNames = task.tags?.map((t) => t.name).filter(Boolean) || []
|
||||
|
||||
documents.push({
|
||||
@@ -249,7 +248,7 @@ export const asanaConnector: ConnectorConfig = {
|
||||
content,
|
||||
mimeType: 'text/plain',
|
||||
sourceUrl: task.permalink_url || undefined,
|
||||
contentHash,
|
||||
contentHash: `asana:${task.gid}:${task.modified_at ?? ''}`,
|
||||
metadata: {
|
||||
project: currentProjectGid,
|
||||
assignee: task.assignee?.name,
|
||||
@@ -315,7 +314,6 @@ export const asanaConnector: ConnectorConfig = {
|
||||
if (!task) return null
|
||||
|
||||
const content = buildTaskContent(task)
|
||||
const contentHash = await computeContentHash(content)
|
||||
const tagNames = task.tags?.map((t) => t.name).filter(Boolean) || []
|
||||
|
||||
return {
|
||||
@@ -324,7 +322,7 @@ export const asanaConnector: ConnectorConfig = {
|
||||
content,
|
||||
mimeType: 'text/plain',
|
||||
sourceUrl: task.permalink_url || undefined,
|
||||
contentHash,
|
||||
contentHash: `asana:${task.gid}:${task.modified_at ?? ''}`,
|
||||
metadata: {
|
||||
assignee: task.assignee?.name,
|
||||
completed: task.completed,
|
||||
|
||||
@@ -2,7 +2,7 @@ import { createLogger } from '@sim/logger'
|
||||
import { FirefliesIcon } from '@/components/icons'
|
||||
import { fetchWithRetry, VALIDATE_RETRY_OPTIONS } from '@/lib/knowledge/documents/utils'
|
||||
import type { ConnectorConfig, ExternalDocument, ExternalDocumentList } from '@/connectors/types'
|
||||
import { computeContentHash, parseTagDate } from '@/connectors/utils'
|
||||
import { parseTagDate } from '@/connectors/utils'
|
||||
|
||||
const logger = createLogger('FirefliesConnector')
|
||||
|
||||
@@ -196,17 +196,6 @@ export const firefliesConnector: ConnectorConfig = {
|
||||
id
|
||||
name
|
||||
}
|
||||
sentences {
|
||||
index
|
||||
speaker_name
|
||||
text
|
||||
}
|
||||
summary {
|
||||
keywords
|
||||
action_items
|
||||
overview
|
||||
short_summary
|
||||
}
|
||||
}
|
||||
}`,
|
||||
variables
|
||||
@@ -214,32 +203,27 @@ export const firefliesConnector: ConnectorConfig = {
|
||||
|
||||
const transcripts = (data.transcripts || []) as FirefliesTranscript[]
|
||||
|
||||
const documents: ExternalDocument[] = await Promise.all(
|
||||
transcripts.map(async (transcript) => {
|
||||
const content = formatTranscriptContent(transcript)
|
||||
const contentHash = await computeContentHash(content)
|
||||
const documents: ExternalDocument[] = transcripts.map((transcript) => {
|
||||
const meetingDate = transcript.date ? new Date(transcript.date).toISOString() : undefined
|
||||
const speakerNames = transcript.speakers?.map((s) => s.name).filter(Boolean) ?? []
|
||||
|
||||
const meetingDate = transcript.date ? new Date(transcript.date).toISOString() : undefined
|
||||
const speakerNames = transcript.speakers?.map((s) => s.name).filter(Boolean) ?? []
|
||||
|
||||
return {
|
||||
externalId: transcript.id,
|
||||
title: transcript.title || 'Untitled Meeting',
|
||||
content,
|
||||
mimeType: 'text/plain' as const,
|
||||
sourceUrl: transcript.transcript_url || undefined,
|
||||
contentHash,
|
||||
metadata: {
|
||||
hostEmail: transcript.host_email,
|
||||
duration: transcript.duration,
|
||||
meetingDate,
|
||||
participants: transcript.participants,
|
||||
speakers: speakerNames,
|
||||
keywords: transcript.summary?.keywords,
|
||||
},
|
||||
}
|
||||
})
|
||||
)
|
||||
return {
|
||||
externalId: transcript.id,
|
||||
title: transcript.title || 'Untitled Meeting',
|
||||
content: '',
|
||||
contentDeferred: true,
|
||||
mimeType: 'text/plain' as const,
|
||||
sourceUrl: transcript.transcript_url || undefined,
|
||||
contentHash: `fireflies:${transcript.id}:${transcript.date ?? ''}:${transcript.duration ?? ''}`,
|
||||
metadata: {
|
||||
hostEmail: transcript.host_email,
|
||||
duration: transcript.duration,
|
||||
meetingDate,
|
||||
participants: transcript.participants,
|
||||
speakers: speakerNames,
|
||||
},
|
||||
}
|
||||
})
|
||||
|
||||
const totalFetched = ((syncContext?.totalDocsFetched as number) ?? 0) + documents.length
|
||||
if (syncContext) syncContext.totalDocsFetched = totalFetched
|
||||
@@ -296,7 +280,7 @@ export const firefliesConnector: ConnectorConfig = {
|
||||
if (!transcript) return null
|
||||
|
||||
const content = formatTranscriptContent(transcript)
|
||||
const contentHash = await computeContentHash(content)
|
||||
const contentHash = `fireflies:${transcript.id}:${transcript.date ?? ''}:${transcript.duration ?? ''}`
|
||||
|
||||
const meetingDate = transcript.date ? new Date(transcript.date).toISOString() : undefined
|
||||
const speakerNames = transcript.speakers?.map((s) => s.name).filter(Boolean) ?? []
|
||||
@@ -305,6 +289,7 @@ export const firefliesConnector: ConnectorConfig = {
|
||||
externalId: transcript.id,
|
||||
title: transcript.title || 'Untitled Meeting',
|
||||
content,
|
||||
contentDeferred: false,
|
||||
mimeType: 'text/plain',
|
||||
sourceUrl: transcript.transcript_url || undefined,
|
||||
contentHash,
|
||||
|
||||
@@ -2,7 +2,7 @@ import { createLogger } from '@sim/logger'
|
||||
import { GoogleCalendarIcon } from '@/components/icons'
|
||||
import { fetchWithRetry, VALIDATE_RETRY_OPTIONS } from '@/lib/knowledge/documents/utils'
|
||||
import type { ConnectorConfig, ExternalDocument, ExternalDocumentList } from '@/connectors/types'
|
||||
import { computeContentHash, parseTagDate } from '@/connectors/utils'
|
||||
import { parseTagDate } from '@/connectors/utils'
|
||||
|
||||
const logger = createLogger('GoogleCalendarConnector')
|
||||
|
||||
@@ -195,14 +195,12 @@ function getTimeRange(sourceConfig: Record<string, unknown>): { timeMin: string;
|
||||
/**
|
||||
* Converts a CalendarEvent to an ExternalDocument.
|
||||
*/
|
||||
async function eventToDocument(event: CalendarEvent): Promise<ExternalDocument | null> {
|
||||
function eventToDocument(event: CalendarEvent): ExternalDocument | null {
|
||||
if (event.status === 'cancelled') return null
|
||||
|
||||
const content = eventToContent(event)
|
||||
if (!content.trim()) return null
|
||||
|
||||
const contentHash = await computeContentHash(content)
|
||||
|
||||
const startTime = event.start?.dateTime || event.start?.date || ''
|
||||
const attendeeCount = event.attendees?.filter((a) => !a.resource).length || 0
|
||||
|
||||
@@ -212,7 +210,7 @@ async function eventToDocument(event: CalendarEvent): Promise<ExternalDocument |
|
||||
content,
|
||||
mimeType: 'text/plain',
|
||||
sourceUrl: event.htmlLink || `https://calendar.google.com/calendar/event?eid=${event.id}`,
|
||||
contentHash,
|
||||
contentHash: `gcal:${event.id}:${event.updated ?? ''}`,
|
||||
metadata: {
|
||||
startTime,
|
||||
endTime: event.end?.dateTime || event.end?.date || '',
|
||||
@@ -348,7 +346,7 @@ export const googleCalendarConnector: ConnectorConfig = {
|
||||
|
||||
const documents: ExternalDocument[] = []
|
||||
for (const event of events) {
|
||||
const doc = await eventToDocument(event)
|
||||
const doc = eventToDocument(event)
|
||||
if (doc) documents.push(doc)
|
||||
}
|
||||
|
||||
@@ -392,7 +390,7 @@ export const googleCalendarConnector: ConnectorConfig = {
|
||||
|
||||
if (event.status === 'cancelled') return null
|
||||
|
||||
return eventToDocument(event)
|
||||
return eventToDocument(event) ?? null
|
||||
},
|
||||
|
||||
validateConfig: async (
|
||||
|
||||
@@ -2,7 +2,7 @@ import { createLogger } from '@sim/logger'
|
||||
import { GoogleDocsIcon } from '@/components/icons'
|
||||
import { fetchWithRetry, VALIDATE_RETRY_OPTIONS } from '@/lib/knowledge/documents/utils'
|
||||
import type { ConnectorConfig, ExternalDocument, ExternalDocumentList } from '@/connectors/types'
|
||||
import { computeContentHash, joinTagArray, parseTagDate } from '@/connectors/utils'
|
||||
import { joinTagArray, parseTagDate } from '@/connectors/utils'
|
||||
|
||||
const logger = createLogger('GoogleDocsConnector')
|
||||
|
||||
@@ -117,40 +117,23 @@ async function fetchDocContent(accessToken: string, documentId: string): Promise
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts a Drive file entry into an ExternalDocument by fetching its content
|
||||
* from the Google Docs API.
|
||||
* Creates a lightweight stub from a Drive file entry. Content is deferred
|
||||
* and only fetched via getDocument for new or changed documents.
|
||||
*/
|
||||
async function fileToDocument(
|
||||
accessToken: string,
|
||||
file: DriveFile
|
||||
): Promise<ExternalDocument | null> {
|
||||
try {
|
||||
const content = await fetchDocContent(accessToken, file.id)
|
||||
if (!content.trim()) {
|
||||
logger.info(`Skipping empty document: ${file.name} (${file.id})`)
|
||||
return null
|
||||
}
|
||||
|
||||
const contentHash = await computeContentHash(content)
|
||||
|
||||
return {
|
||||
externalId: file.id,
|
||||
title: file.name || 'Untitled',
|
||||
content,
|
||||
mimeType: 'text/plain',
|
||||
sourceUrl: file.webViewLink || `https://docs.google.com/document/d/${file.id}/edit`,
|
||||
contentHash,
|
||||
metadata: {
|
||||
modifiedTime: file.modifiedTime,
|
||||
createdTime: file.createdTime,
|
||||
owners: file.owners?.map((o) => o.displayName || o.emailAddress).filter(Boolean),
|
||||
},
|
||||
}
|
||||
} catch (error) {
|
||||
logger.warn(`Failed to extract content from document: ${file.name} (${file.id})`, {
|
||||
error: error instanceof Error ? error.message : String(error),
|
||||
})
|
||||
return null
|
||||
function fileToStub(file: DriveFile): ExternalDocument {
|
||||
return {
|
||||
externalId: file.id,
|
||||
title: file.name || 'Untitled',
|
||||
content: '',
|
||||
contentDeferred: true,
|
||||
mimeType: 'text/plain',
|
||||
sourceUrl: file.webViewLink || `https://docs.google.com/document/d/${file.id}/edit`,
|
||||
contentHash: `gdocs:${file.id}:${file.modifiedTime ?? ''}`,
|
||||
metadata: {
|
||||
modifiedTime: file.modifiedTime,
|
||||
createdTime: file.createdTime,
|
||||
owners: file.owners?.map((o) => o.displayName || o.emailAddress).filter(Boolean),
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
@@ -246,18 +229,11 @@ export const googleDocsConnector: ConnectorConfig = {
|
||||
const maxDocs = sourceConfig.maxDocs ? Number(sourceConfig.maxDocs) : 0
|
||||
const previouslyFetched = (syncContext?.totalDocsFetched as number) ?? 0
|
||||
|
||||
const CONCURRENCY = 5
|
||||
const documents: ExternalDocument[] = []
|
||||
for (let i = 0; i < files.length; i += CONCURRENCY) {
|
||||
if (maxDocs > 0 && previouslyFetched + documents.length >= maxDocs) break
|
||||
const batch = files.slice(i, i + CONCURRENCY)
|
||||
const results = await Promise.all(batch.map((file) => fileToDocument(accessToken, file)))
|
||||
documents.push(...(results.filter(Boolean) as ExternalDocument[]))
|
||||
}
|
||||
let documents = files.map(fileToStub)
|
||||
if (maxDocs > 0) {
|
||||
const remaining = maxDocs - previouslyFetched
|
||||
if (documents.length > remaining) {
|
||||
documents.splice(remaining)
|
||||
documents = documents.slice(0, remaining)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -300,7 +276,17 @@ export const googleDocsConnector: ConnectorConfig = {
|
||||
if (file.trashed) return null
|
||||
if (file.mimeType !== 'application/vnd.google-apps.document') return null
|
||||
|
||||
return fileToDocument(accessToken, file)
|
||||
try {
|
||||
const content = await fetchDocContent(accessToken, file.id)
|
||||
if (!content.trim()) return null
|
||||
|
||||
return { ...fileToStub(file), content, contentDeferred: false }
|
||||
} catch (error) {
|
||||
logger.warn(`Failed to extract content from document: ${file.name} (${file.id})`, {
|
||||
error: error instanceof Error ? error.message : String(error),
|
||||
})
|
||||
return null
|
||||
}
|
||||
},
|
||||
|
||||
validateConfig: async (
|
||||
|
||||
@@ -2,7 +2,7 @@ import { createLogger } from '@sim/logger'
|
||||
import { GoogleSheetsIcon } from '@/components/icons'
|
||||
import { fetchWithRetry, VALIDATE_RETRY_OPTIONS } from '@/lib/knowledge/documents/utils'
|
||||
import type { ConnectorConfig, ExternalDocument, ExternalDocumentList } from '@/connectors/types'
|
||||
import { computeContentHash, parseTagDate } from '@/connectors/utils'
|
||||
import { parseTagDate } from '@/connectors/utils'
|
||||
|
||||
const logger = createLogger('GoogleSheetsConnector')
|
||||
|
||||
@@ -168,7 +168,6 @@ async function sheetToDocument(
|
||||
return null
|
||||
}
|
||||
|
||||
const contentHash = await computeContentHash(content)
|
||||
const rowCount = dataRows.length
|
||||
|
||||
return {
|
||||
@@ -177,7 +176,7 @@ async function sheetToDocument(
|
||||
content,
|
||||
mimeType: 'text/plain',
|
||||
sourceUrl: `https://docs.google.com/spreadsheets/d/${spreadsheetId}/edit#gid=${sheet.sheetId}`,
|
||||
contentHash,
|
||||
contentHash: `gsheets:${spreadsheetId}:${sheet.sheetId}:${modifiedTime ?? ''}`,
|
||||
metadata: {
|
||||
spreadsheetId,
|
||||
spreadsheetTitle,
|
||||
@@ -259,22 +258,24 @@ export const googleSheetsConnector: ConnectorConfig = {
|
||||
sheetCount: sheets.length,
|
||||
})
|
||||
|
||||
const documents: ExternalDocument[] = []
|
||||
for (let i = 0; i < sheets.length; i += CONCURRENCY) {
|
||||
const batch = sheets.slice(i, i + CONCURRENCY)
|
||||
const results = await Promise.all(
|
||||
batch.map((sheet) =>
|
||||
sheetToDocument(
|
||||
accessToken,
|
||||
spreadsheetId,
|
||||
metadata.properties.title,
|
||||
sheet,
|
||||
modifiedTime
|
||||
)
|
||||
)
|
||||
)
|
||||
documents.push(...(results.filter(Boolean) as ExternalDocument[]))
|
||||
}
|
||||
const documents: ExternalDocument[] = sheets.map((sheet) => ({
|
||||
externalId: `${spreadsheetId}__sheet__${sheet.sheetId}`,
|
||||
title: `${metadata.properties.title} - ${sheet.title}`,
|
||||
content: '',
|
||||
contentDeferred: true,
|
||||
mimeType: 'text/plain',
|
||||
sourceUrl: `https://docs.google.com/spreadsheets/d/${spreadsheetId}/edit#gid=${sheet.sheetId}`,
|
||||
contentHash: `gsheets:${spreadsheetId}:${sheet.sheetId}:${modifiedTime ?? ''}`,
|
||||
metadata: {
|
||||
spreadsheetId,
|
||||
spreadsheetTitle: metadata.properties.title,
|
||||
sheetTitle: sheet.title,
|
||||
sheetId: sheet.sheetId,
|
||||
rowCount: sheet.gridProperties?.rowCount,
|
||||
columnCount: sheet.gridProperties?.columnCount,
|
||||
...(modifiedTime ? { modifiedTime } : {}),
|
||||
},
|
||||
}))
|
||||
|
||||
return {
|
||||
documents,
|
||||
@@ -324,13 +325,15 @@ export const googleSheetsConnector: ConnectorConfig = {
|
||||
return null
|
||||
}
|
||||
|
||||
return sheetToDocument(
|
||||
const doc = await sheetToDocument(
|
||||
accessToken,
|
||||
spreadsheetId,
|
||||
metadata.properties.title,
|
||||
sheetEntry.properties,
|
||||
modifiedTime
|
||||
)
|
||||
if (!doc) return null
|
||||
return { ...doc, contentDeferred: false }
|
||||
},
|
||||
|
||||
validateConfig: async (
|
||||
|
||||
@@ -2,7 +2,7 @@ import { createLogger } from '@sim/logger'
|
||||
import { HubspotIcon } from '@/components/icons'
|
||||
import { fetchWithRetry, VALIDATE_RETRY_OPTIONS } from '@/lib/knowledge/documents/utils'
|
||||
import type { ConnectorConfig, ExternalDocument, ExternalDocumentList } from '@/connectors/types'
|
||||
import { computeContentHash, parseTagDate } from '@/connectors/utils'
|
||||
import { parseTagDate } from '@/connectors/utils'
|
||||
|
||||
const logger = createLogger('HubSpotConnector')
|
||||
|
||||
@@ -140,16 +140,15 @@ function buildRecordContent(objectType: string, properties: Record<string, strin
|
||||
/**
|
||||
* Converts a HubSpot CRM record to an ExternalDocument.
|
||||
*/
|
||||
async function recordToDocument(
|
||||
function recordToDocument(
|
||||
record: Record<string, unknown>,
|
||||
objectType: string,
|
||||
portalId: string
|
||||
): Promise<ExternalDocument> {
|
||||
): ExternalDocument {
|
||||
const id = record.id as string
|
||||
const properties = (record.properties || {}) as Record<string, string | null>
|
||||
|
||||
const content = buildRecordContent(objectType, properties)
|
||||
const contentHash = await computeContentHash(content)
|
||||
const title = buildRecordTitle(objectType, properties)
|
||||
|
||||
const lastModified =
|
||||
@@ -161,7 +160,7 @@ async function recordToDocument(
|
||||
content,
|
||||
mimeType: 'text/plain',
|
||||
sourceUrl: `https://app.hubspot.com/contacts/${portalId}/record/${objectType}/${id}`,
|
||||
contentHash,
|
||||
contentHash: `hubspot:${id}:${lastModified ?? ''}`,
|
||||
metadata: {
|
||||
objectType,
|
||||
owner: properties.hubspot_owner_id || undefined,
|
||||
@@ -260,8 +259,8 @@ export const hubspotConnector: ConnectorConfig = {
|
||||
const paging = data.paging as { next?: { after?: string } } | undefined
|
||||
const nextCursor = paging?.next?.after
|
||||
|
||||
const documents: ExternalDocument[] = await Promise.all(
|
||||
results.map((record) => recordToDocument(record, objectType, portalId))
|
||||
const documents: ExternalDocument[] = results.map((record) =>
|
||||
recordToDocument(record, objectType, portalId)
|
||||
)
|
||||
|
||||
const previouslyFetched = (syncContext?.totalDocsFetched as number) ?? 0
|
||||
@@ -322,7 +321,7 @@ export const hubspotConnector: ConnectorConfig = {
|
||||
}
|
||||
|
||||
const record = await response.json()
|
||||
return recordToDocument(record, objectType, portalId)
|
||||
return recordToDocument(record as Record<string, unknown>, objectType, portalId)
|
||||
},
|
||||
|
||||
validateConfig: async (
|
||||
|
||||
@@ -2,7 +2,7 @@ import { createLogger } from '@sim/logger'
|
||||
import { IntercomIcon } from '@/components/icons'
|
||||
import { fetchWithRetry, VALIDATE_RETRY_OPTIONS } from '@/lib/knowledge/documents/utils'
|
||||
import type { ConnectorConfig, ExternalDocument, ExternalDocumentList } from '@/connectors/types'
|
||||
import { computeContentHash, htmlToPlainText, parseTagDate } from '@/connectors/utils'
|
||||
import { htmlToPlainText, parseTagDate } from '@/connectors/utils'
|
||||
|
||||
const logger = createLogger('IntercomConnector')
|
||||
|
||||
@@ -309,7 +309,6 @@ export const intercomConnector: ConnectorConfig = {
|
||||
const content = formatArticle(article)
|
||||
if (!content.trim()) continue
|
||||
|
||||
const contentHash = await computeContentHash(content)
|
||||
const updatedAt = new Date(article.updated_at * 1000).toISOString()
|
||||
|
||||
documents.push({
|
||||
@@ -318,7 +317,7 @@ export const intercomConnector: ConnectorConfig = {
|
||||
content,
|
||||
mimeType: 'text/plain',
|
||||
sourceUrl: `https://app.intercom.com/a/apps/_/articles/articles/${article.id}/show`,
|
||||
contentHash,
|
||||
contentHash: `intercom:article-${article.id}:${article.updated_at}`,
|
||||
metadata: {
|
||||
type: 'article',
|
||||
state: article.state,
|
||||
@@ -337,28 +336,23 @@ export const intercomConnector: ConnectorConfig = {
|
||||
const conversations = await fetchConversations(accessToken, maxItems, conversationState)
|
||||
|
||||
for (const conversation of conversations) {
|
||||
const detail = await fetchConversationDetail(accessToken, conversation.id)
|
||||
const content = formatConversation(detail)
|
||||
if (!content.trim()) continue
|
||||
|
||||
const contentHash = await computeContentHash(content)
|
||||
const updatedAt = new Date(conversation.updated_at * 1000).toISOString()
|
||||
const tags = conversation.tags?.tags?.map((t) => t.name) || []
|
||||
|
||||
documents.push({
|
||||
externalId: `conversation-${conversation.id}`,
|
||||
title: conversation.title || `Conversation #${conversation.id}`,
|
||||
content,
|
||||
content: '',
|
||||
contentDeferred: true,
|
||||
mimeType: 'text/plain',
|
||||
sourceUrl: `https://app.intercom.com/a/apps/_/inbox/inbox/all/conversations/${conversation.id}`,
|
||||
contentHash,
|
||||
contentHash: `intercom:conversation-${conversation.id}:${conversation.updated_at}`,
|
||||
metadata: {
|
||||
type: 'conversation',
|
||||
state: conversation.state,
|
||||
tags: tags.join(', '),
|
||||
updatedAt,
|
||||
createdAt: new Date(conversation.created_at * 1000).toISOString(),
|
||||
messageCount: (detail.conversation_parts?.total_count ?? 0) + 1,
|
||||
},
|
||||
})
|
||||
}
|
||||
@@ -383,7 +377,6 @@ export const intercomConnector: ConnectorConfig = {
|
||||
const content = formatArticle(article)
|
||||
if (!content.trim()) return null
|
||||
|
||||
const contentHash = await computeContentHash(content)
|
||||
const updatedAt = new Date(article.updated_at * 1000).toISOString()
|
||||
|
||||
return {
|
||||
@@ -392,7 +385,7 @@ export const intercomConnector: ConnectorConfig = {
|
||||
content,
|
||||
mimeType: 'text/plain',
|
||||
sourceUrl: `https://app.intercom.com/a/apps/_/articles/articles/${article.id}/show`,
|
||||
contentHash,
|
||||
contentHash: `intercom:article-${article.id}:${article.updated_at}`,
|
||||
metadata: {
|
||||
type: 'article',
|
||||
state: article.state,
|
||||
@@ -410,7 +403,6 @@ export const intercomConnector: ConnectorConfig = {
|
||||
const content = formatConversation(detail)
|
||||
if (!content.trim()) return null
|
||||
|
||||
const contentHash = await computeContentHash(content)
|
||||
const updatedAt = new Date(detail.updated_at * 1000).toISOString()
|
||||
const tags = detail.tags?.tags?.map((t) => t.name) || []
|
||||
|
||||
@@ -418,9 +410,10 @@ export const intercomConnector: ConnectorConfig = {
|
||||
externalId,
|
||||
title: detail.title || `Conversation #${detail.id}`,
|
||||
content,
|
||||
contentDeferred: false,
|
||||
mimeType: 'text/plain',
|
||||
sourceUrl: `https://app.intercom.com/a/apps/_/inbox/inbox/all/conversations/${detail.id}`,
|
||||
contentHash,
|
||||
contentHash: `intercom:conversation-${detail.id}:${detail.updated_at}`,
|
||||
metadata: {
|
||||
type: 'conversation',
|
||||
state: detail.state,
|
||||
|
||||
@@ -2,7 +2,7 @@ import { createLogger } from '@sim/logger'
|
||||
import { JiraIcon } from '@/components/icons'
|
||||
import { fetchWithRetry, VALIDATE_RETRY_OPTIONS } from '@/lib/knowledge/documents/utils'
|
||||
import type { ConnectorConfig, ExternalDocument, ExternalDocumentList } from '@/connectors/types'
|
||||
import { computeContentHash, joinTagArray, parseTagDate } from '@/connectors/utils'
|
||||
import { joinTagArray, parseTagDate } from '@/connectors/utils'
|
||||
import { extractAdfText, getJiraCloudId } from '@/tools/jira/utils'
|
||||
|
||||
const logger = createLogger('JiraConnector')
|
||||
@@ -33,16 +33,12 @@ function buildIssueContent(fields: Record<string, unknown>): string {
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts a Jira issue API response to an ExternalDocument.
|
||||
* Extracts common metadata fields from a Jira issue into an ExternalDocument
|
||||
* stub with deferred content. The contentHash is metadata-based so it is
|
||||
* identical whether produced during listing or full fetch.
|
||||
*/
|
||||
async function issueToDocument(
|
||||
issue: Record<string, unknown>,
|
||||
domain: string
|
||||
): Promise<ExternalDocument> {
|
||||
function issueToStub(issue: Record<string, unknown>, domain: string): ExternalDocument {
|
||||
const fields = (issue.fields || {}) as Record<string, unknown>
|
||||
const content = buildIssueContent(fields)
|
||||
const contentHash = await computeContentHash(content)
|
||||
|
||||
const key = issue.key as string
|
||||
const issueType = fields.issuetype as Record<string, unknown> | undefined
|
||||
const status = fields.status as Record<string, unknown> | undefined
|
||||
@@ -51,14 +47,16 @@ async function issueToDocument(
|
||||
const reporter = fields.reporter as Record<string, unknown> | undefined
|
||||
const project = fields.project as Record<string, unknown> | undefined
|
||||
const labels = Array.isArray(fields.labels) ? (fields.labels as string[]) : []
|
||||
const updated = (fields.updated as string) ?? ''
|
||||
|
||||
return {
|
||||
externalId: String(issue.id),
|
||||
title: `${key}: ${(fields.summary as string) || 'Untitled'}`,
|
||||
content,
|
||||
content: '',
|
||||
contentDeferred: true,
|
||||
mimeType: 'text/plain',
|
||||
sourceUrl: `https://${domain}/browse/${key}`,
|
||||
contentHash,
|
||||
contentHash: `jira:${issue.id}:${updated}`,
|
||||
metadata: {
|
||||
key,
|
||||
issueType: issueType?.name,
|
||||
@@ -74,6 +72,22 @@ async function issueToDocument(
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts a fully-fetched Jira issue (with description and comments) into an
|
||||
* ExternalDocument with resolved content.
|
||||
*/
|
||||
function issueToFullDocument(issue: Record<string, unknown>, domain: string): ExternalDocument {
|
||||
const stub = issueToStub(issue, domain)
|
||||
const fields = (issue.fields || {}) as Record<string, unknown>
|
||||
const content = buildIssueContent(fields)
|
||||
|
||||
return {
|
||||
...stub,
|
||||
content,
|
||||
contentDeferred: false,
|
||||
}
|
||||
}
|
||||
|
||||
export const jiraConnector: ConnectorConfig = {
|
||||
id: 'jira',
|
||||
name: 'Jira',
|
||||
@@ -162,7 +176,7 @@ export const jiraConnector: ConnectorConfig = {
|
||||
params.append('maxResults', String(Math.min(PAGE_SIZE, remaining)))
|
||||
params.append(
|
||||
'fields',
|
||||
'summary,description,comment,issuetype,status,priority,assignee,reporter,project,labels,created,updated'
|
||||
'summary,issuetype,status,priority,assignee,reporter,project,labels,created,updated'
|
||||
)
|
||||
|
||||
const url = `https://api.atlassian.com/ex/jira/${cloudId}/rest/api/3/search?${params.toString()}`
|
||||
@@ -190,9 +204,7 @@ export const jiraConnector: ConnectorConfig = {
|
||||
const issues = (data.issues || []) as Record<string, unknown>[]
|
||||
const total = (data.total as number) ?? 0
|
||||
|
||||
const documents: ExternalDocument[] = await Promise.all(
|
||||
issues.map((issue) => issueToDocument(issue, domain))
|
||||
)
|
||||
const documents: ExternalDocument[] = issues.map((issue) => issueToStub(issue, domain))
|
||||
|
||||
const nextStart = startAt + issues.length
|
||||
const hasMore = nextStart < total && (maxIssues <= 0 || nextStart < maxIssues)
|
||||
@@ -239,7 +251,7 @@ export const jiraConnector: ConnectorConfig = {
|
||||
}
|
||||
|
||||
const issue = await response.json()
|
||||
return issueToDocument(issue, domain)
|
||||
return issueToFullDocument(issue, domain)
|
||||
},
|
||||
|
||||
validateConfig: async (
|
||||
|
||||
@@ -3,7 +3,7 @@ import { LinearIcon } from '@/components/icons'
|
||||
import type { RetryOptions } from '@/lib/knowledge/documents/utils'
|
||||
import { fetchWithRetry, VALIDATE_RETRY_OPTIONS } from '@/lib/knowledge/documents/utils'
|
||||
import type { ConnectorConfig, ExternalDocument, ExternalDocumentList } from '@/connectors/types'
|
||||
import { computeContentHash, joinTagArray, parseTagDate } from '@/connectors/utils'
|
||||
import { joinTagArray, parseTagDate } from '@/connectors/utils'
|
||||
|
||||
const logger = createLogger('LinearConnector')
|
||||
|
||||
@@ -278,36 +278,34 @@ export const linearConnector: ConnectorConfig = {
|
||||
const nodes = (issuesConn.nodes || []) as Record<string, unknown>[]
|
||||
const pageInfo = issuesConn.pageInfo as Record<string, unknown>
|
||||
|
||||
const documents: ExternalDocument[] = await Promise.all(
|
||||
nodes.map(async (issue) => {
|
||||
const content = buildIssueContent(issue)
|
||||
const contentHash = await computeContentHash(content)
|
||||
const documents: ExternalDocument[] = nodes.map((issue) => {
|
||||
const content = buildIssueContent(issue)
|
||||
const contentHash = `linear:${issue.id}:${issue.updatedAt}`
|
||||
|
||||
const labelNodes = ((issue.labels as Record<string, unknown>)?.nodes || []) as Record<
|
||||
string,
|
||||
unknown
|
||||
>[]
|
||||
const labelNodes = ((issue.labels as Record<string, unknown>)?.nodes || []) as Record<
|
||||
string,
|
||||
unknown
|
||||
>[]
|
||||
|
||||
return {
|
||||
externalId: issue.id as string,
|
||||
title: `${(issue.identifier as string) || ''}: ${(issue.title as string) || 'Untitled'}`,
|
||||
content,
|
||||
mimeType: 'text/plain' as const,
|
||||
sourceUrl: (issue.url as string) || undefined,
|
||||
contentHash,
|
||||
metadata: {
|
||||
identifier: issue.identifier,
|
||||
state: (issue.state as Record<string, unknown>)?.name,
|
||||
priority: issue.priorityLabel,
|
||||
assignee: (issue.assignee as Record<string, unknown>)?.name,
|
||||
labels: labelNodes.map((l) => l.name as string),
|
||||
team: (issue.team as Record<string, unknown>)?.name,
|
||||
project: (issue.project as Record<string, unknown>)?.name,
|
||||
lastModified: issue.updatedAt,
|
||||
},
|
||||
}
|
||||
})
|
||||
)
|
||||
return {
|
||||
externalId: issue.id as string,
|
||||
title: `${(issue.identifier as string) || ''}: ${(issue.title as string) || 'Untitled'}`,
|
||||
content,
|
||||
mimeType: 'text/plain' as const,
|
||||
sourceUrl: (issue.url as string) || undefined,
|
||||
contentHash,
|
||||
metadata: {
|
||||
identifier: issue.identifier,
|
||||
state: (issue.state as Record<string, unknown>)?.name,
|
||||
priority: issue.priorityLabel,
|
||||
assignee: (issue.assignee as Record<string, unknown>)?.name,
|
||||
labels: labelNodes.map((l) => l.name as string),
|
||||
team: (issue.team as Record<string, unknown>)?.name,
|
||||
project: (issue.project as Record<string, unknown>)?.name,
|
||||
lastModified: issue.updatedAt,
|
||||
},
|
||||
}
|
||||
})
|
||||
|
||||
const hasNextPage = Boolean(pageInfo.hasNextPage)
|
||||
const endCursor = (pageInfo.endCursor as string) || undefined
|
||||
@@ -335,7 +333,7 @@ export const linearConnector: ConnectorConfig = {
|
||||
if (!issue) return null
|
||||
|
||||
const content = buildIssueContent(issue)
|
||||
const contentHash = await computeContentHash(content)
|
||||
const contentHash = `linear:${issue.id}:${issue.updatedAt}`
|
||||
|
||||
const labelNodes = ((issue.labels as Record<string, unknown>)?.nodes || []) as Record<
|
||||
string,
|
||||
@@ -346,7 +344,7 @@ export const linearConnector: ConnectorConfig = {
|
||||
externalId: issue.id as string,
|
||||
title: `${(issue.identifier as string) || ''}: ${(issue.title as string) || 'Untitled'}`,
|
||||
content,
|
||||
mimeType: 'text/plain',
|
||||
mimeType: 'text/plain' as const,
|
||||
sourceUrl: (issue.url as string) || undefined,
|
||||
contentHash,
|
||||
metadata: {
|
||||
@@ -379,7 +377,6 @@ export const linearConnector: ConnectorConfig = {
|
||||
}
|
||||
|
||||
try {
|
||||
// Verify the token works by fetching teams
|
||||
const data = await linearGraphQL(accessToken, TEAMS_QUERY, undefined, VALIDATE_RETRY_OPTIONS)
|
||||
const teamsConn = data.teams as Record<string, unknown>
|
||||
const teams = (teamsConn.nodes || []) as Record<string, unknown>[]
|
||||
@@ -391,7 +388,6 @@ export const linearConnector: ConnectorConfig = {
|
||||
}
|
||||
}
|
||||
|
||||
// If teamId specified, verify it exists
|
||||
const teamId = sourceConfig.teamId as string | undefined
|
||||
if (teamId) {
|
||||
const found = teams.some((t) => t.id === teamId)
|
||||
|
||||
@@ -2,14 +2,37 @@ import { createLogger } from '@sim/logger'
|
||||
import { OutlookIcon } from '@/components/icons'
|
||||
import { fetchWithRetry, VALIDATE_RETRY_OPTIONS } from '@/lib/knowledge/documents/utils'
|
||||
import type { ConnectorConfig, ExternalDocument, ExternalDocumentList } from '@/connectors/types'
|
||||
import { computeContentHash, htmlToPlainText, parseTagDate } from '@/connectors/utils'
|
||||
import { htmlToPlainText, parseTagDate } from '@/connectors/utils'
|
||||
|
||||
const logger = createLogger('OutlookConnector')
|
||||
|
||||
const GRAPH_API_BASE = 'https://graph.microsoft.com/v1.0/me'
|
||||
const DEFAULT_MAX_CONVERSATIONS = 500
|
||||
const MESSAGES_PER_PAGE = 50
|
||||
const MESSAGE_FIELDS = [
|
||||
/**
|
||||
* Fields requested when listing messages (no body — deferred to getDocument).
|
||||
*/
|
||||
const LIST_MESSAGE_FIELDS = [
|
||||
'id',
|
||||
'conversationId',
|
||||
'subject',
|
||||
'from',
|
||||
'toRecipients',
|
||||
'receivedDateTime',
|
||||
'sentDateTime',
|
||||
'categories',
|
||||
'importance',
|
||||
'inferenceClassification',
|
||||
'hasAttachments',
|
||||
'webLink',
|
||||
'isDraft',
|
||||
'parentFolderId',
|
||||
].join(',')
|
||||
|
||||
/**
|
||||
* Fields requested when fetching full message content in getDocument.
|
||||
*/
|
||||
const FULL_MESSAGE_FIELDS = [
|
||||
'id',
|
||||
'conversationId',
|
||||
'subject',
|
||||
@@ -84,7 +107,7 @@ function buildInitialUrl(sourceConfig: Record<string, unknown>): string {
|
||||
|
||||
const params = new URLSearchParams({
|
||||
$top: String(MESSAGES_PER_PAGE),
|
||||
$select: MESSAGE_FIELDS,
|
||||
$select: LIST_MESSAGE_FIELDS,
|
||||
})
|
||||
|
||||
// Build $filter clauses
|
||||
@@ -353,7 +376,6 @@ export const outlookConnector: ConnectorConfig = {
|
||||
const headers: Record<string, string> = {
|
||||
Authorization: `Bearer ${accessToken}`,
|
||||
Accept: 'application/json',
|
||||
Prefer: 'outlook.body-content-type="text"',
|
||||
}
|
||||
|
||||
const response = await fetchWithRetry(url, { method: 'GET', headers })
|
||||
@@ -385,7 +407,8 @@ export const outlookConnector: ConnectorConfig = {
|
||||
continue
|
||||
}
|
||||
|
||||
const convId = msg.conversationId || msg.id
|
||||
if (!msg.conversationId) continue
|
||||
const convId = msg.conversationId
|
||||
if (!conversations[convId]) {
|
||||
conversations[convId] = []
|
||||
}
|
||||
@@ -407,8 +430,8 @@ export const outlookConnector: ConnectorConfig = {
|
||||
}
|
||||
}
|
||||
|
||||
// Phase 2: Group conversations into documents
|
||||
logger.info('Grouping Outlook messages into conversations', {
|
||||
// Phase 2: Build lightweight stubs — content is deferred to getDocument
|
||||
logger.info('Building Outlook conversation stubs', {
|
||||
totalMessages: syncContext?._totalMessagesFetched,
|
||||
totalConversations: Object.keys(conversations).length,
|
||||
})
|
||||
@@ -433,23 +456,26 @@ export const outlookConnector: ConnectorConfig = {
|
||||
|
||||
const documents: ExternalDocument[] = []
|
||||
for (const [convId, msgs] of limited) {
|
||||
const result = formatConversation(convId, msgs)
|
||||
if (!result) continue
|
||||
if (msgs.length === 0) continue
|
||||
|
||||
const contentHash = await computeContentHash(result.content)
|
||||
const lastDate = msgs.reduce((max, m) => {
|
||||
const d = m.receivedDateTime || ''
|
||||
return d > max ? d : max
|
||||
}, '')
|
||||
|
||||
// Use the first message's webLink as the source URL
|
||||
const subject = msgs[0].subject || 'No Subject'
|
||||
const firstWithLink = msgs.find((m) => m.webLink)
|
||||
const sourceUrl = firstWithLink?.webLink || `https://outlook.office.com/mail/inbox`
|
||||
const sourceUrl = firstWithLink?.webLink || 'https://outlook.office.com/mail/inbox'
|
||||
|
||||
documents.push({
|
||||
externalId: convId,
|
||||
title: result.subject,
|
||||
content: result.content,
|
||||
title: subject,
|
||||
content: '',
|
||||
contentDeferred: true,
|
||||
mimeType: 'text/plain',
|
||||
sourceUrl,
|
||||
contentHash,
|
||||
metadata: result.metadata,
|
||||
contentHash: `outlook:${convId}:${lastDate}`,
|
||||
metadata: {},
|
||||
})
|
||||
}
|
||||
|
||||
@@ -462,14 +488,25 @@ export const outlookConnector: ConnectorConfig = {
|
||||
externalId: string
|
||||
): Promise<ExternalDocument | null> => {
|
||||
try {
|
||||
// Fetch messages for this conversation
|
||||
// Scope to the same folder as listDocuments so contentHash stays consistent
|
||||
const folder = (sourceConfig.folder as string) || 'inbox'
|
||||
const basePath =
|
||||
folder === 'all'
|
||||
? `${GRAPH_API_BASE}/messages`
|
||||
: `${GRAPH_API_BASE}/mailFolders/${WELL_KNOWN_FOLDERS[folder] || folder}/messages`
|
||||
|
||||
const filterParts = [
|
||||
`conversationId eq '${externalId.replace(/'/g, "''")}'`,
|
||||
'isDraft eq false',
|
||||
]
|
||||
|
||||
const params = new URLSearchParams({
|
||||
$filter: `conversationId eq '${externalId.replace(/'/g, "''")}'`,
|
||||
$select: MESSAGE_FIELDS,
|
||||
$top: '50',
|
||||
$filter: filterParts.join(' and '),
|
||||
$select: FULL_MESSAGE_FIELDS,
|
||||
$top: '250',
|
||||
})
|
||||
|
||||
const url = `${GRAPH_API_BASE}/messages?${params.toString()}`
|
||||
const url = `${basePath}?${params.toString()}`
|
||||
|
||||
const response = await fetchWithRetry(url, {
|
||||
method: 'GET',
|
||||
@@ -493,16 +530,21 @@ export const outlookConnector: ConnectorConfig = {
|
||||
const result = formatConversation(externalId, messages)
|
||||
if (!result) return null
|
||||
|
||||
const contentHash = await computeContentHash(result.content)
|
||||
const lastDate = messages.reduce((max, m) => {
|
||||
const d = m.receivedDateTime || ''
|
||||
return d > max ? d : max
|
||||
}, '')
|
||||
|
||||
const firstWithLink = messages.find((m) => m.webLink)
|
||||
|
||||
return {
|
||||
externalId,
|
||||
title: result.subject,
|
||||
content: result.content,
|
||||
contentDeferred: false,
|
||||
mimeType: 'text/plain',
|
||||
sourceUrl: firstWithLink?.webLink || 'https://outlook.office.com/mail/inbox',
|
||||
contentHash,
|
||||
contentHash: `outlook:${externalId}:${lastDate}`,
|
||||
metadata: result.metadata,
|
||||
}
|
||||
} catch (error) {
|
||||
|
||||
@@ -2,7 +2,7 @@ import { createLogger } from '@sim/logger'
|
||||
import { RedditIcon } from '@/components/icons'
|
||||
import { fetchWithRetry, VALIDATE_RETRY_OPTIONS } from '@/lib/knowledge/documents/utils'
|
||||
import type { ConnectorConfig, ExternalDocument, ExternalDocumentList } from '@/connectors/types'
|
||||
import { computeContentHash, parseTagDate } from '@/connectors/utils'
|
||||
import { parseTagDate } from '@/connectors/utils'
|
||||
|
||||
const logger = createLogger('RedditConnector')
|
||||
|
||||
@@ -338,29 +338,23 @@ export const redditConnector: ConnectorConfig = {
|
||||
afterToken
|
||||
)
|
||||
|
||||
const documents: ExternalDocument[] = []
|
||||
|
||||
for (const post of posts) {
|
||||
const content = await formatPostContent(accessToken, post, COMMENTS_PER_POST)
|
||||
const contentHash = await computeContentHash(content)
|
||||
|
||||
documents.push({
|
||||
externalId: post.id,
|
||||
title: post.title,
|
||||
content,
|
||||
mimeType: 'text/plain',
|
||||
sourceUrl: `https://www.reddit.com${post.permalink}`,
|
||||
contentHash,
|
||||
metadata: {
|
||||
author: post.author,
|
||||
score: post.score,
|
||||
commentCount: post.num_comments,
|
||||
flair: post.link_flair_text ?? undefined,
|
||||
postDate: new Date(post.created_utc * 1000).toISOString(),
|
||||
subreddit: post.subreddit,
|
||||
},
|
||||
})
|
||||
}
|
||||
const documents: ExternalDocument[] = posts.map((post) => ({
|
||||
externalId: post.id,
|
||||
title: post.title,
|
||||
content: '',
|
||||
contentDeferred: true,
|
||||
mimeType: 'text/plain',
|
||||
sourceUrl: `https://www.reddit.com${post.permalink}`,
|
||||
contentHash: `reddit:${post.id}:${post.created_utc}`,
|
||||
metadata: {
|
||||
author: post.author,
|
||||
score: post.score,
|
||||
commentCount: post.num_comments,
|
||||
flair: post.link_flair_text ?? undefined,
|
||||
postDate: new Date(post.created_utc * 1000).toISOString(),
|
||||
subreddit: post.subreddit,
|
||||
},
|
||||
}))
|
||||
|
||||
const totalCollected = collectedSoFar + documents.length
|
||||
const hasMore = after !== null && totalCollected < maxPosts
|
||||
@@ -397,15 +391,15 @@ export const redditConnector: ConnectorConfig = {
|
||||
const comments =
|
||||
data.length >= 2 ? extractComments(data[1] as RedditListing, COMMENTS_PER_POST) : []
|
||||
const content = await formatPostContent(accessToken, post, COMMENTS_PER_POST, comments)
|
||||
const contentHash = await computeContentHash(content)
|
||||
|
||||
return {
|
||||
externalId: post.id,
|
||||
title: post.title,
|
||||
content,
|
||||
contentDeferred: false,
|
||||
mimeType: 'text/plain',
|
||||
sourceUrl: `https://www.reddit.com${post.permalink}`,
|
||||
contentHash,
|
||||
contentHash: `reddit:${post.id}:${post.created_utc}`,
|
||||
metadata: {
|
||||
author: post.author,
|
||||
score: post.score,
|
||||
|
||||
@@ -2,7 +2,7 @@ import { createLogger } from '@sim/logger'
|
||||
import { SalesforceIcon } from '@/components/icons'
|
||||
import { fetchWithRetry, VALIDATE_RETRY_OPTIONS } from '@/lib/knowledge/documents/utils'
|
||||
import type { ConnectorConfig, ExternalDocument, ExternalDocumentList } from '@/connectors/types'
|
||||
import { computeContentHash, htmlToPlainText, parseTagDate } from '@/connectors/utils'
|
||||
import { htmlToPlainText, parseTagDate } from '@/connectors/utils'
|
||||
|
||||
const logger = createLogger('SalesforceConnector')
|
||||
|
||||
@@ -12,7 +12,14 @@ const PAGE_SIZE = 200
|
||||
|
||||
/** SOQL field lists per object type. */
|
||||
const OBJECT_FIELDS: Record<string, string[]> = {
|
||||
KnowledgeArticleVersion: ['Id', 'Title', 'Summary', 'LastModifiedDate', 'ArticleNumber'],
|
||||
KnowledgeArticleVersion: [
|
||||
'Id',
|
||||
'Title',
|
||||
'Summary',
|
||||
'LastModifiedDate',
|
||||
'ArticleNumber',
|
||||
'PublishStatus',
|
||||
],
|
||||
Case: ['Id', 'Subject', 'Description', 'Status', 'LastModifiedDate', 'CaseNumber'],
|
||||
Account: ['Id', 'Name', 'Description', 'Industry', 'LastModifiedDate'],
|
||||
Opportunity: [
|
||||
@@ -146,36 +153,52 @@ function getRecordStatus(objectType: string, record: Record<string, unknown>): s
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts a Salesforce record to an ExternalDocument.
|
||||
* Creates a lightweight stub for a Salesforce record with metadata-based hash.
|
||||
* Content is deferred and fetched later via getDocument only for new/changed docs.
|
||||
*/
|
||||
async function recordToDocument(
|
||||
function recordToStub(
|
||||
record: Record<string, unknown>,
|
||||
objectType: string,
|
||||
instanceUrl: string
|
||||
): Promise<ExternalDocument> {
|
||||
): ExternalDocument {
|
||||
const id = record.Id as string
|
||||
const content = buildRecordContent(objectType, record)
|
||||
const contentHash = await computeContentHash(content)
|
||||
const title = buildRecordTitle(objectType, record)
|
||||
|
||||
const lastModified = (record.LastModifiedDate as string) || ''
|
||||
const baseUrl = instanceUrl.replace(`/services/data/${API_VERSION}/`, '')
|
||||
|
||||
return {
|
||||
externalId: id,
|
||||
title,
|
||||
content,
|
||||
content: '',
|
||||
contentDeferred: true,
|
||||
mimeType: 'text/plain',
|
||||
sourceUrl: `${baseUrl}/${id}`,
|
||||
contentHash,
|
||||
contentHash: `salesforce:${id}:${lastModified}`,
|
||||
metadata: {
|
||||
objectType,
|
||||
lastModified: (record.LastModifiedDate as string) || undefined,
|
||||
lastModified: lastModified || undefined,
|
||||
recordNumber: getRecordNumber(objectType, record),
|
||||
status: getRecordStatus(objectType, record),
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds a full ExternalDocument with content from a Salesforce record.
|
||||
*/
|
||||
function recordToDocument(
|
||||
record: Record<string, unknown>,
|
||||
objectType: string,
|
||||
instanceUrl: string
|
||||
): ExternalDocument {
|
||||
const stub = recordToStub(record, objectType, instanceUrl)
|
||||
return {
|
||||
...stub,
|
||||
content: buildRecordContent(objectType, record),
|
||||
contentDeferred: false,
|
||||
}
|
||||
}
|
||||
|
||||
export const salesforceConnector: ConnectorConfig = {
|
||||
id: 'salesforce',
|
||||
name: 'Salesforce',
|
||||
@@ -257,8 +280,8 @@ export const salesforceConnector: ConnectorConfig = {
|
||||
const records = (data.records || []) as Record<string, unknown>[]
|
||||
const nextRecordsUrl = data.nextRecordsUrl as string | undefined
|
||||
|
||||
const documents: ExternalDocument[] = await Promise.all(
|
||||
records.map((record) => recordToDocument(record, objectType, instanceUrl))
|
||||
const documents: ExternalDocument[] = records.map((record) =>
|
||||
recordToStub(record, objectType, instanceUrl)
|
||||
)
|
||||
|
||||
const previouslyFetched = (syncContext?.totalDocsFetched as number) ?? 0
|
||||
|
||||
@@ -2,7 +2,7 @@ import { createLogger } from '@sim/logger'
|
||||
import { ServiceNowIcon } from '@/components/icons'
|
||||
import { fetchWithRetry, VALIDATE_RETRY_OPTIONS } from '@/lib/knowledge/documents/utils'
|
||||
import type { ConnectorConfig, ExternalDocument, ExternalDocumentList } from '@/connectors/types'
|
||||
import { computeContentHash, htmlToPlainText, parseTagDate } from '@/connectors/utils'
|
||||
import { htmlToPlainText, parseTagDate } from '@/connectors/utils'
|
||||
|
||||
const logger = createLogger('ServiceNowConnector')
|
||||
|
||||
@@ -184,15 +184,13 @@ function priorityLabel(priority: string | undefined): string {
|
||||
/**
|
||||
* Converts a KB article record to an ExternalDocument.
|
||||
*/
|
||||
async function kbArticleToDocument(
|
||||
article: KBArticle,
|
||||
instanceUrl: string
|
||||
): Promise<ExternalDocument> {
|
||||
function kbArticleToDocument(article: KBArticle, instanceUrl: string): ExternalDocument {
|
||||
const title = rawValue(article.short_description) || rawValue(article.number) || article.sys_id
|
||||
const articleText = rawValue(article.text) || rawValue(article.wiki) || ''
|
||||
const content = htmlToPlainText(articleText)
|
||||
const contentHash = await computeContentHash(content)
|
||||
const sysId = rawValue(article.sys_id as unknown as string) || article.sys_id
|
||||
const updatedOn = rawValue(article.sys_updated_on) || ''
|
||||
const contentHash = `servicenow:${sysId}:${updatedOn}`
|
||||
const sourceUrl = `${instanceUrl}/kb_view.do?sys_kb_id=${sysId}`
|
||||
|
||||
return {
|
||||
@@ -218,10 +216,7 @@ async function kbArticleToDocument(
|
||||
/**
|
||||
* Converts an incident record to an ExternalDocument.
|
||||
*/
|
||||
async function incidentToDocument(
|
||||
incident: Incident,
|
||||
instanceUrl: string
|
||||
): Promise<ExternalDocument> {
|
||||
function incidentToDocument(incident: Incident, instanceUrl: string): ExternalDocument {
|
||||
const number = rawValue(incident.number)
|
||||
const shortDesc = rawValue(incident.short_description)
|
||||
const title = number ? `${number}: ${shortDesc || 'Untitled'}` : shortDesc || incident.sys_id
|
||||
@@ -258,8 +253,9 @@ async function incidentToDocument(
|
||||
}
|
||||
|
||||
const content = parts.join('\n')
|
||||
const contentHash = await computeContentHash(content)
|
||||
const sysId = rawValue(incident.sys_id as unknown as string) || incident.sys_id
|
||||
const updatedOn = rawValue(incident.sys_updated_on) || ''
|
||||
const contentHash = `servicenow:${sysId}:${updatedOn}`
|
||||
const sourceUrl = `${instanceUrl}/incident.do?sys_id=${sysId}`
|
||||
|
||||
return {
|
||||
@@ -478,8 +474,8 @@ export const servicenowConnector: ConnectorConfig = {
|
||||
const documents: ExternalDocument[] = []
|
||||
for (const record of result) {
|
||||
const doc = isKB
|
||||
? await kbArticleToDocument(record as unknown as KBArticle, instanceUrl)
|
||||
: await incidentToDocument(record as unknown as Incident, instanceUrl)
|
||||
? kbArticleToDocument(record as unknown as KBArticle, instanceUrl)
|
||||
: incidentToDocument(record as unknown as Incident, instanceUrl)
|
||||
|
||||
if (doc.content.trim()) {
|
||||
documents.push(doc)
|
||||
@@ -532,8 +528,8 @@ export const servicenowConnector: ConnectorConfig = {
|
||||
|
||||
const record = result[0]
|
||||
const doc = isKB
|
||||
? await kbArticleToDocument(record as unknown as KBArticle, instanceUrl)
|
||||
: await incidentToDocument(record as unknown as Incident, instanceUrl)
|
||||
? kbArticleToDocument(record as unknown as KBArticle, instanceUrl)
|
||||
: incidentToDocument(record as unknown as Incident, instanceUrl)
|
||||
|
||||
return doc.content.trim() ? doc : null
|
||||
} catch (error) {
|
||||
|
||||
@@ -2,7 +2,7 @@ import { createLogger } from '@sim/logger'
|
||||
import { WebflowIcon } from '@/components/icons'
|
||||
import { fetchWithRetry, VALIDATE_RETRY_OPTIONS } from '@/lib/knowledge/documents/utils'
|
||||
import type { ConnectorConfig, ExternalDocument, ExternalDocumentList } from '@/connectors/types'
|
||||
import { computeContentHash, htmlToPlainText, parseTagDate } from '@/connectors/utils'
|
||||
import { htmlToPlainText, parseTagDate } from '@/connectors/utils'
|
||||
|
||||
const logger = createLogger('WebflowConnector')
|
||||
|
||||
@@ -194,8 +194,8 @@ export const webflowConnector: ConnectorConfig = {
|
||||
}
|
||||
|
||||
const items = data.items || []
|
||||
let documents: ExternalDocument[] = await Promise.all(
|
||||
items.map((item) => itemToDocument(item, currentCollectionId, collectionName))
|
||||
let documents: ExternalDocument[] = items.map((item) =>
|
||||
itemToDocument(item, currentCollectionId, collectionName)
|
||||
)
|
||||
|
||||
if (maxItems > 0) {
|
||||
@@ -373,13 +373,14 @@ export const webflowConnector: ConnectorConfig = {
|
||||
/**
|
||||
* Converts a Webflow CMS item to an ExternalDocument.
|
||||
*/
|
||||
async function itemToDocument(
|
||||
function itemToDocument(
|
||||
item: WebflowItem,
|
||||
collectionId: string,
|
||||
collectionName: string
|
||||
): Promise<ExternalDocument> {
|
||||
): ExternalDocument {
|
||||
const plainText = itemToPlainText(item, collectionName)
|
||||
const contentHash = await computeContentHash(plainText)
|
||||
const lastModified = item.lastUpdated || item.lastPublished || item.createdOn || ''
|
||||
const contentHash = `webflow:${item.id}:${lastModified}`
|
||||
const title = extractItemTitle(item)
|
||||
const slug = (item.fieldData?.slug as string) || ''
|
||||
|
||||
|
||||
@@ -2,7 +2,7 @@ import { createLogger } from '@sim/logger'
|
||||
import { WordpressIcon } from '@/components/icons'
|
||||
import { fetchWithRetry, VALIDATE_RETRY_OPTIONS } from '@/lib/knowledge/documents/utils'
|
||||
import type { ConnectorConfig, ExternalDocument, ExternalDocumentList } from '@/connectors/types'
|
||||
import { computeContentHash, htmlToPlainText, joinTagArray, parseTagDate } from '@/connectors/utils'
|
||||
import { htmlToPlainText, joinTagArray, parseTagDate } from '@/connectors/utils'
|
||||
|
||||
const logger = createLogger('WordPressConnector')
|
||||
|
||||
@@ -59,10 +59,10 @@ function extractTagNames(tags: Record<string, { name: string }>): string[] {
|
||||
/**
|
||||
* Converts a WordPress post to an ExternalDocument.
|
||||
*/
|
||||
async function postToDocument(post: WordPressPost): Promise<ExternalDocument> {
|
||||
function postToDocument(post: WordPressPost): ExternalDocument {
|
||||
const plainText = htmlToPlainText(post.content)
|
||||
const fullContent = `# ${post.title}\n\n${plainText}`
|
||||
const contentHash = await computeContentHash(fullContent)
|
||||
const contentHash = `wordpress:${post.ID}:${post.modified || ''}`
|
||||
const categories = extractCategoryNames(post.categories)
|
||||
const tags = extractTagNames(post.tags)
|
||||
|
||||
@@ -182,7 +182,7 @@ export const wordpressConnector: ConnectorConfig = {
|
||||
const data = (await response.json()) as WordPressPostsResponse
|
||||
const posts = data.posts || []
|
||||
|
||||
const documents = await Promise.all(posts.map(postToDocument))
|
||||
const documents = posts.map(postToDocument)
|
||||
|
||||
const totalFetched = totalDocsFetched + documents.length
|
||||
if (syncContext) syncContext.totalDocsFetched = totalFetched
|
||||
@@ -226,7 +226,7 @@ export const wordpressConnector: ConnectorConfig = {
|
||||
}
|
||||
|
||||
const post = (await response.json()) as WordPressPost
|
||||
return await postToDocument(post)
|
||||
return postToDocument(post)
|
||||
} catch (error) {
|
||||
logger.warn('Failed to get WordPress document', {
|
||||
externalId,
|
||||
|
||||
@@ -2,7 +2,7 @@ import { createLogger } from '@sim/logger'
|
||||
import { ZendeskIcon } from '@/components/icons'
|
||||
import { fetchWithRetry, VALIDATE_RETRY_OPTIONS } from '@/lib/knowledge/documents/utils'
|
||||
import type { ConnectorConfig, ExternalDocument, ExternalDocumentList } from '@/connectors/types'
|
||||
import { computeContentHash, htmlToPlainText, joinTagArray, parseTagDate } from '@/connectors/utils'
|
||||
import { htmlToPlainText, joinTagArray, parseTagDate } from '@/connectors/utils'
|
||||
|
||||
const logger = createLogger('ZendeskConnector')
|
||||
|
||||
@@ -207,14 +207,11 @@ function formatTicketContent(ticket: ZendeskTicket, comments: ZendeskComment[]):
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts an article to an ExternalDocument.
|
||||
* Converts an article to an ExternalDocument with inline content.
|
||||
* Articles return body inline from the list API so no deferral is needed.
|
||||
*/
|
||||
async function articleToDocument(
|
||||
article: ZendeskArticle,
|
||||
subdomain: string
|
||||
): Promise<ExternalDocument> {
|
||||
function articleToDocument(article: ZendeskArticle, subdomain: string): ExternalDocument {
|
||||
const content = htmlToPlainText(article.body || '')
|
||||
const contentHash = await computeContentHash(content)
|
||||
|
||||
return {
|
||||
externalId: `article-${article.id}`,
|
||||
@@ -222,7 +219,7 @@ async function articleToDocument(
|
||||
content,
|
||||
mimeType: 'text/plain',
|
||||
sourceUrl: article.html_url || `https://${subdomain}.zendesk.com/hc/articles/${article.id}`,
|
||||
contentHash,
|
||||
contentHash: `zendesk:article:${article.id}:${article.updated_at}`,
|
||||
metadata: {
|
||||
type: 'article',
|
||||
articleId: article.id,
|
||||
@@ -238,23 +235,50 @@ async function articleToDocument(
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts a ticket (with comments) to an ExternalDocument.
|
||||
* Creates a deferred stub for a ticket. Content is not fetched here because
|
||||
* each ticket requires a separate comments API call. Full content is fetched
|
||||
* lazily via getDocument only for new/changed documents.
|
||||
*/
|
||||
async function ticketToDocument(
|
||||
function ticketToStub(ticket: ZendeskTicket, subdomain: string): ExternalDocument {
|
||||
return {
|
||||
externalId: `ticket-${ticket.id}`,
|
||||
title: `Ticket #${ticket.id}: ${ticket.subject}`,
|
||||
content: '',
|
||||
contentDeferred: true,
|
||||
mimeType: 'text/plain',
|
||||
sourceUrl: `https://${subdomain}.zendesk.com/agent/tickets/${ticket.id}`,
|
||||
contentHash: `zendesk:ticket:${ticket.id}:${ticket.updated_at}`,
|
||||
metadata: {
|
||||
type: 'ticket',
|
||||
ticketId: ticket.id,
|
||||
status: ticket.status,
|
||||
priority: ticket.priority,
|
||||
tags: ticket.tags,
|
||||
createdAt: ticket.created_at,
|
||||
updatedAt: ticket.updated_at,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts a ticket (with comments) to a full ExternalDocument.
|
||||
* Used by getDocument to resolve deferred ticket stubs.
|
||||
*/
|
||||
function ticketToDocument(
|
||||
ticket: ZendeskTicket,
|
||||
comments: ZendeskComment[],
|
||||
subdomain: string
|
||||
): Promise<ExternalDocument> {
|
||||
): ExternalDocument {
|
||||
const content = formatTicketContent(ticket, comments)
|
||||
const contentHash = await computeContentHash(content)
|
||||
|
||||
return {
|
||||
externalId: `ticket-${ticket.id}`,
|
||||
title: `Ticket #${ticket.id}: ${ticket.subject}`,
|
||||
content,
|
||||
contentDeferred: false,
|
||||
mimeType: 'text/plain',
|
||||
sourceUrl: `https://${subdomain}.zendesk.com/agent/tickets/${ticket.id}`,
|
||||
contentHash,
|
||||
contentHash: `zendesk:ticket:${ticket.id}:${ticket.updated_at}`,
|
||||
metadata: {
|
||||
type: 'ticket',
|
||||
ticketId: ticket.id,
|
||||
@@ -375,8 +399,7 @@ export const zendeskConnector: ConnectorConfig = {
|
||||
|
||||
for (const article of articles) {
|
||||
if (!article.body?.trim()) continue
|
||||
const doc = await articleToDocument(article, subdomain)
|
||||
documents.push(doc)
|
||||
documents.push(articleToDocument(article, subdomain))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -391,21 +414,8 @@ export const zendeskConnector: ConnectorConfig = {
|
||||
)
|
||||
logger.info(`Fetched ${tickets.length} tickets from Zendesk`)
|
||||
|
||||
const BATCH_SIZE = 5
|
||||
for (let i = 0; i < tickets.length; i += BATCH_SIZE) {
|
||||
const batch = tickets.slice(i, i + BATCH_SIZE)
|
||||
const batchResults = await Promise.all(
|
||||
batch.map(async (ticket) => {
|
||||
const comments = await fetchTicketComments(
|
||||
subdomain,
|
||||
accessToken,
|
||||
sourceConfig,
|
||||
ticket.id
|
||||
)
|
||||
return ticketToDocument(ticket, comments, subdomain)
|
||||
})
|
||||
)
|
||||
documents.push(...batchResults)
|
||||
for (const ticket of tickets) {
|
||||
documents.push(ticketToStub(ticket, subdomain))
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -985,6 +985,15 @@ cronjobs:
|
||||
successfulJobsHistoryLimit: 3
|
||||
failedJobsHistoryLimit: 1
|
||||
|
||||
connectorSync:
|
||||
enabled: true
|
||||
name: connector-sync
|
||||
schedule: "*/5 * * * *"
|
||||
path: "/api/knowledge/connectors/sync"
|
||||
concurrencyPolicy: Forbid
|
||||
successfulJobsHistoryLimit: 3
|
||||
failedJobsHistoryLimit: 1
|
||||
|
||||
# Global CronJob settings
|
||||
image:
|
||||
repository: curlimages/curl
|
||||
|
||||
Reference in New Issue
Block a user