import { createLogger } from '@sim/logger'
import type { ExtractParams, ExtractResponse } from '@/tools/firecrawl/types'
import type { ToolConfig } from '@/tools/types'

const logger = createLogger('FirecrawlExtractTool')

const POLL_INTERVAL_MS = 5000 // 5 seconds between polls
const MAX_POLL_TIME_MS = 300000 // 5 minutes maximum polling time
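
/**
 * Firecrawl Extract: submits an extraction job to the Firecrawl v2 API, then
 * polls the job until it completes, fails, or the polling window elapses.
 */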
export const extractTool: ToolConfig<ExtractParams, ExtractResponse> = {
  id: 'firecrawl_extract',
  name: 'Firecrawl Extract',
  description:
    'Extract structured data from entire webpages using natural language prompts and JSON schema. Powerful agentic feature for intelligent data extraction.',
  version: '1.0.0',
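
  // Visibility controls who supplies each value: 'user-or-llm' params can be
  // set by the user or filled in by the model, 'user-only' params must come
  // from the user, and 'hidden' params are configured internally.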
  params: {
    urls: {
      type: 'json',
      required: true,
      visibility: 'user-or-llm',
      description:
        'Array of URLs to extract data from (e.g., ["https://example.com/page1", "https://example.com/page2"] or ["https://example.com/*"])',
    },
    prompt: {
      type: 'string',
      required: false,
      visibility: 'user-or-llm',
      description: 'Natural language guidance for the extraction process',
    },
    schema: {
      type: 'json',
      required: false,
      visibility: 'user-or-llm',
      description: 'JSON Schema defining the structure of data to extract',
    },
    enableWebSearch: {
      type: 'boolean',
      required: false,
      visibility: 'user-only',
      description: 'Enable web search to find supplementary information (default: false)',
    },
    ignoreSitemap: {
      type: 'boolean',
      required: false,
      visibility: 'user-only',
      description: 'Ignore sitemap.xml files during scanning (default: false)',
    },
    includeSubdomains: {
      type: 'boolean',
      required: false,
      visibility: 'user-only',
      description: 'Extend scanning to subdomains (default: true)',
    },
    showSources: {
      type: 'boolean',
      required: false,
      visibility: 'user-only',
      description: 'Return data sources in the response (default: false)',
    },
    ignoreInvalidURLs: {
      type: 'boolean',
      required: false,
      visibility: 'user-only',
      description: 'Skip invalid URLs in the array (default: true)',
    },
    scrapeOptions: {
      type: 'json',
      required: false,
      visibility: 'hidden',
      description: 'Advanced scraping configuration options',
    },
    apiKey: {
      type: 'string',
      required: true,
      visibility: 'user-only',
      description: 'Firecrawl API key',
    },
  },
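
  // Only explicitly provided optional params are added to the request body,
  // so omitted fields fall back to the Firecrawl API's server-side defaults.
  // Illustrative payload (field values are examples, not defaults):
  // {
  //   "urls": ["https://example.com/*"],
  //   "prompt": "Extract product names and prices",
  //   "schema": { "type": "object", "properties": { "name": { "type": "string" } } }
  // }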
  request: {
    method: 'POST',
    url: 'https://api.firecrawl.dev/v2/extract',
    headers: (params) => ({
      'Content-Type': 'application/json',
      Authorization: `Bearer ${params.apiKey}`,
    }),
    body: (params) => {
      const body: Record<string, any> = {
        urls: params.urls,
      }

      if (params.prompt) body.prompt = params.prompt
      if (params.schema) body.schema = params.schema
      if (typeof params.enableWebSearch === 'boolean') body.enableWebSearch = params.enableWebSearch
      if (typeof params.ignoreSitemap === 'boolean') body.ignoreSitemap = params.ignoreSitemap
      if (typeof params.includeSubdomains === 'boolean')
        body.includeSubdomains = params.includeSubdomains
      if (typeof params.showSources === 'boolean') body.showSources = params.showSources
      if (typeof params.ignoreInvalidURLs === 'boolean')
        body.ignoreInvalidURLs = params.ignoreInvalidURLs

      if (params.scrapeOptions != null) {
        const cleanedScrapeOptions = Object.entries(params.scrapeOptions).reduce(
          (acc, [key, val]) => {
            if (val != null) {
              acc[key] = val
            }
            return acc
          },
          {} as Record<string, any>
        )
        if (Object.keys(cleanedScrapeOptions).length > 0) {
          body.scrapeOptions = cleanedScrapeOptions
        }
      }

      return body
    },
  },
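
  // The v2 extract endpoint is asynchronous: the POST returns a job id rather
  // than results, so output.success stays false until polling resolves it.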
  transformResponse: async (response: Response) => {
    const data = await response.json()

    return {
      success: true,
      output: {
        jobId: data.id,
        success: false,
        data: {},
      },
    }
  },
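  // Poll the job status every POLL_INTERVAL_MS, giving up after
  // MAX_POLL_TIME_MS so a stuck job cannot block the workflow indefinitely.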
  postProcess: async (result, params) => {
    if (!result.success) {
      return result
    }

    const jobId = result.output.jobId
    logger.info(`Firecrawl extract job ${jobId} created, polling for completion...`)

    let elapsedTime = 0

    while (elapsedTime < MAX_POLL_TIME_MS) {
      try {
        const statusResponse = await fetch(`https://api.firecrawl.dev/v2/extract/${jobId}`, {
          method: 'GET',
          headers: {
            Authorization: `Bearer ${params.apiKey}`,
            'Content-Type': 'application/json',
          },
        })

        if (!statusResponse.ok) {
          throw new Error(`Failed to get extract status: ${statusResponse.statusText}`)
        }

        const extractData = await statusResponse.json()
        logger.info(`Firecrawl extract job ${jobId} status: ${extractData.status}`)

        if (extractData.status === 'completed') {
          result.output = {
            jobId,
            success: true,
            data: extractData.data || {},
          }
          return result
        }

        if (extractData.status === 'failed') {
          return {
            ...result,
            success: false,
            error: `Extract job failed: ${extractData.error || 'Unknown error'}`,
          }
        }

        await new Promise((resolve) => setTimeout(resolve, POLL_INTERVAL_MS))
        elapsedTime += POLL_INTERVAL_MS
      } catch (error: any) {
        logger.error('Error polling for extract job status:', {
          message: error.message || 'Unknown error',
          jobId,
        })

        return {
          ...result,
          success: false,
          error: `Error polling for extract job status: ${error.message || 'Unknown error'}`,
        }
      }
    }

    logger.warn(
      `Extract job ${jobId} did not complete within the maximum polling time (${MAX_POLL_TIME_MS / 1000}s)`
    )
    return {
      ...result,
      success: false,
      error: `Extract job did not complete within the maximum polling time (${MAX_POLL_TIME_MS / 1000}s)`,
    }
  },
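
  // Outputs surfaced to downstream blocks once the job resolves.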
  outputs: {
    success: {
      type: 'boolean',
      description: 'Whether the extraction operation was successful',
    },
    data: {
      type: 'object',
      description: 'Extracted structured data according to the schema or prompt',
    },
  },
}