sim/apps/sim/tools/firecrawl/extract.ts

Waleed 6cb3977dd9 fix(visibility): updated visibility for non-sensitive tool params from user only to user or llm (#3095), 2026-01-31 11:31:08 -08:00
import { createLogger } from '@sim/logger'
import type { ExtractParams, ExtractResponse } from '@/tools/firecrawl/types'
import type { ToolConfig } from '@/tools/types'

const logger = createLogger('FirecrawlExtractTool')

const POLL_INTERVAL_MS = 5000 // 5 seconds between polls
const MAX_POLL_TIME_MS = 300000 // 5 minutes maximum polling time

export const extractTool: ToolConfig<ExtractParams, ExtractResponse> = {
  id: 'firecrawl_extract',
  name: 'Firecrawl Extract',
  description:
    'Extract structured data from entire webpages using natural language prompts and JSON schema. Powerful agentic feature for intelligent data extraction.',
  version: '1.0.0',
  params: {
    urls: {
      type: 'json',
      required: true,
      visibility: 'user-or-llm',
      description:
        'Array of URLs to extract data from (e.g., ["https://example.com/page1", "https://example.com/page2"] or ["https://example.com/*"])',
    },
    prompt: {
      type: 'string',
      required: false,
      visibility: 'user-or-llm',
      description: 'Natural language guidance for the extraction process',
    },
    schema: {
      type: 'json',
      required: false,
      visibility: 'user-or-llm',
      description: 'JSON Schema defining the structure of data to extract',
    },
    enableWebSearch: {
      type: 'boolean',
      required: false,
      visibility: 'user-only',
      description: 'Enable web search to find supplementary information (default: false)',
    },
    ignoreSitemap: {
      type: 'boolean',
      required: false,
      visibility: 'user-only',
      description: 'Ignore sitemap.xml files during scanning (default: false)',
    },
    includeSubdomains: {
      type: 'boolean',
      required: false,
      visibility: 'user-only',
      description: 'Extend scanning to subdomains (default: true)',
    },
    showSources: {
      type: 'boolean',
      required: false,
      visibility: 'user-only',
      description: 'Return data sources in the response (default: false)',
    },
    ignoreInvalidURLs: {
      type: 'boolean',
      required: false,
      visibility: 'user-only',
      description: 'Skip invalid URLs in the array (default: true)',
    },
    scrapeOptions: {
      type: 'json',
      required: false,
      visibility: 'hidden',
      description: 'Advanced scraping configuration options',
    },
    apiKey: {
      type: 'string',
      required: true,
      visibility: 'user-only',
      description: 'Firecrawl API key',
    },
  },
  request: {
    method: 'POST',
    url: 'https://api.firecrawl.dev/v2/extract',
    headers: (params) => ({
      'Content-Type': 'application/json',
      Authorization: `Bearer ${params.apiKey}`,
    }),
    body: (params) => {
      const body: Record<string, any> = {
        urls: params.urls,
      }

      // Only forward optional fields that were explicitly provided
      if (params.prompt) body.prompt = params.prompt
      if (params.schema) body.schema = params.schema
      if (typeof params.enableWebSearch === 'boolean') body.enableWebSearch = params.enableWebSearch
      if (typeof params.ignoreSitemap === 'boolean') body.ignoreSitemap = params.ignoreSitemap
      if (typeof params.includeSubdomains === 'boolean')
        body.includeSubdomains = params.includeSubdomains
      if (typeof params.showSources === 'boolean') body.showSources = params.showSources
      if (typeof params.ignoreInvalidURLs === 'boolean')
        body.ignoreInvalidURLs = params.ignoreInvalidURLs

      if (params.scrapeOptions != null) {
        // Drop null/undefined entries so they are not sent to the API
        const cleanedScrapeOptions = Object.entries(params.scrapeOptions).reduce(
          (acc, [key, val]) => {
            if (val != null) {
              acc[key] = val
            }
            return acc
          },
          {} as Record<string, any>
        )
        if (Object.keys(cleanedScrapeOptions).length > 0) {
          body.scrapeOptions = cleanedScrapeOptions
        }
      }

      return body
    },
  },
  transformResponse: async (response: Response) => {
    const data = await response.json()
    // The extract endpoint is asynchronous: it returns a job id, and the actual
    // extraction result is retrieved by polling in postProcess below
    return {
      success: true,
      output: {
        jobId: data.id,
        success: false,
        data: {},
      },
    }
  },
  postProcess: async (result, params) => {
    if (!result.success) {
      return result
    }

    const jobId = result.output.jobId
    logger.info(`Firecrawl extract job ${jobId} created, polling for completion...`)

    // Poll the job status until it completes, fails, or the time limit is reached
    let elapsedTime = 0
    while (elapsedTime < MAX_POLL_TIME_MS) {
      try {
        const statusResponse = await fetch(`https://api.firecrawl.dev/v2/extract/${jobId}`, {
          method: 'GET',
          headers: {
            Authorization: `Bearer ${params.apiKey}`,
            'Content-Type': 'application/json',
          },
        })

        if (!statusResponse.ok) {
          throw new Error(`Failed to get extract status: ${statusResponse.statusText}`)
        }

        const extractData = await statusResponse.json()
        logger.info(`Firecrawl extract job ${jobId} status: ${extractData.status}`)

        if (extractData.status === 'completed') {
          result.output = {
            jobId,
            success: true,
            data: extractData.data || {},
          }
          return result
        }

        if (extractData.status === 'failed') {
          return {
            ...result,
            success: false,
            error: `Extract job failed: ${extractData.error || 'Unknown error'}`,
          }
        }

        await new Promise((resolve) => setTimeout(resolve, POLL_INTERVAL_MS))
        elapsedTime += POLL_INTERVAL_MS
      } catch (error: any) {
        logger.error('Error polling for extract job status:', {
          message: error.message || 'Unknown error',
          jobId,
        })
        return {
          ...result,
          success: false,
          error: `Error polling for extract job status: ${error.message || 'Unknown error'}`,
        }
      }
    }

    logger.warn(
      `Extract job ${jobId} did not complete within the maximum polling time (${MAX_POLL_TIME_MS / 1000}s)`
    )
    return {
      ...result,
      success: false,
      error: `Extract job did not complete within the maximum polling time (${MAX_POLL_TIME_MS / 1000}s)`,
    }
  },
  outputs: {
    success: {
      type: 'boolean',
      description: 'Whether the extraction operation was successful',
    },
    data: {
      type: 'object',
      description: 'Extracted structured data according to the schema or prompt',
    },
  },
}
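
For reference, a minimal sketch of the POST payload this tool would produce. The field names come from the params and body builder above; the URLs, prompt, schema, and key value are made-up illustration data, not from the repository.

// Hypothetical example params for the extract tool (illustration only)
const exampleParams = {
  urls: ['https://example.com/pricing'],
  prompt: 'Extract the plan names and monthly prices',
  schema: {
    type: 'object',
    properties: {
      plans: {
        type: 'array',
        items: {
          type: 'object',
          properties: {
            name: { type: 'string' },
            monthlyPrice: { type: 'string' },
          },
        },
      },
    },
  },
  showSources: true,
  apiKey: 'fc-...', // placeholder; the key is sent in the Authorization header, not the body
}

// Body that the builder above would send to https://api.firecrawl.dev/v2/extract:
// optional fields appear only when explicitly set, and unset booleans are omitted.
const expectedBody = {
  urls: exampleParams.urls,
  prompt: exampleParams.prompt,
  schema: exampleParams.schema,
  showSources: true,
}

console.log(JSON.stringify(expectedBody, null, 2))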