Mirror of https://github.com/Significant-Gravitas/AutoGPT.git (synced 2026-04-08 03:00:28 -04:00)
feat(blocks): Add Firecrawl Integration for Web Scraping and Data Extraction (#10494)
### Changes 🏗️

This PR adds a Firecrawl integration to AutoGPT, providing web scraping and data extraction capabilities.

**New Blocks Added:**

⚠️ All of these blocks run synchronously, so they can take a while to finish; this keeps agent workflows simpler.

- **Firecrawl Scrape Block**: Scrapes single web pages with various output formats (Markdown, HTML, JSON, screenshots)
- **Firecrawl Crawl Block**: Crawls entire websites following links, with configurable depth and filters
- **Firecrawl Extract Block**: Extracts structured data from web pages using AI-powered prompts
- **Firecrawl Map Block**: Maps website structure and returns a list of all discovered URLs
- **Firecrawl Search Block**: Searches Google and scrapes the results

**Key Features:**

- Advanced anti-blocking technology to bypass scraping protections
- Multiple output formats, including Markdown, HTML, JSON, and screenshots
- AI-powered data extraction with custom prompts and schemas
- Configurable crawl depth and URL filtering
- Built-in caching and rate limiting
- Google search integration for discovering relevant content

**Use Cases:**

- Web data extraction for research and analysis
- Content monitoring and change tracking
- Competitive intelligence gathering
- SEO analysis and website mapping
- Automated data collection workflows

### Checklist 📋

#### For code changes:

- [x] I have clearly listed my changes in the PR description
- [x] I have made a test plan
- [x] I have tested my changes according to the test plan:
  - [x] Verified all Firecrawl blocks appear in the UI
  - [x] Tested scraping various websites with different formats
  - [x] Tested crawling with depth limits and URL filters
  - [x] Tested data extraction with custom prompts
  - [x] Verified error handling for invalid URLs and API failures
  - [x] Tested authentication with Firecrawl API key
  - [x] Confirmed proper rate limiting and caching behavior

<img width="1025" height="1027" alt="Screenshot 2025-07-30 at 15 20 28" src="https://github.com/user-attachments/assets/7b94d3cf-7a0e-4d09-a9c5-24c4e8a3b660" />

# Example Agent

[FC Testing_v12.json](https://github.com/user-attachments/files/21510608/FC.Testing_v12.json)
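For reference, here is roughly what the Scrape block does under the hood with the `firecrawl-py` SDK, using only the calls and arguments that appear in the diff below; a minimal sketch, with the API key and URL as placeholders:

```python
from firecrawl import FirecrawlApp

# Placeholders: substitute a real Firecrawl API key and target URL.
app = FirecrawlApp(api_key="fc-YOUR-API-KEY")

result = app.scrape_url(
    "https://example.com",
    formats=["markdown", "links"],  # any subset of the ScrapeFormat values
    only_main_content=True,         # drop headers, navs, footers, etc.
    max_age=3_600_000,              # accept cached results up to 1 hour old
    wait_for=200,                   # ms to let the page load before fetching
)
print(result.markdown)
print(result.links)
```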
autogpt_platform/backend/backend/blocks/firecrawl/_config.py (new file)

```python
from backend.sdk import BlockCostType, ProviderBuilder

# Register the "firecrawl" provider: API-key auth plus a base cost of 1 per run.
firecrawl = (
    ProviderBuilder("firecrawl")
    .with_api_key("FIRECRAWL_API_KEY", "Firecrawl API Key")
    .with_base_cost(1, BlockCostType.RUN)
    .build()
)
```
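The same builder pattern generalizes to any API-key provider; a hypothetical sketch (the provider name and environment variable below are invented for illustration only):

```python
from backend.sdk import BlockCostType, ProviderBuilder

# Hypothetical provider, invented purely to illustrate the builder pattern.
acmescraper = (
    ProviderBuilder("acmescraper")
    .with_api_key("ACMESCRAPER_API_KEY", "AcmeScraper API Key")
    .with_base_cost(1, BlockCostType.RUN)
    .build()
)
```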
autogpt_platform/backend/backend/blocks/firecrawl/crawl.py (new file, 114 lines)
```python
from enum import Enum
from typing import Any

from firecrawl import FirecrawlApp, ScrapeOptions

from backend.sdk import (
    APIKeyCredentials,
    Block,
    BlockCategory,
    BlockOutput,
    BlockSchema,
    CredentialsMetaInput,
    SchemaField,
)

from ._config import firecrawl


class ScrapeFormat(Enum):
    MARKDOWN = "markdown"
    HTML = "html"
    RAW_HTML = "rawHtml"
    LINKS = "links"
    SCREENSHOT = "screenshot"
    SCREENSHOT_FULL_PAGE = "screenshot@fullPage"
    JSON = "json"
    CHANGE_TRACKING = "changeTracking"


class FirecrawlCrawlBlock(Block):

    class Input(BlockSchema):
        credentials: CredentialsMetaInput = firecrawl.credentials_field()
        url: str = SchemaField(description="The URL to crawl")
        limit: int = SchemaField(description="The number of pages to crawl", default=10)
        only_main_content: bool = SchemaField(
            description="Only return the main content of the page, excluding headers, navs, footers, etc.",
            default=True,
        )
        max_age: int = SchemaField(
            description="The maximum age of the cached page in milliseconds - default is 1 hour",
            default=3600000,
        )
        wait_for: int = SchemaField(
            description="Delay in milliseconds before fetching the content, allowing the page sufficient time to load.",
            default=0,
        )
        formats: list[ScrapeFormat] = SchemaField(
            description="The formats of the crawl", default=[ScrapeFormat.MARKDOWN]
        )

    class Output(BlockSchema):
        data: list[dict[str, Any]] = SchemaField(description="The result of the crawl")
        markdown: str = SchemaField(description="The markdown of the crawl")
        html: str = SchemaField(description="The html of the crawl")
        raw_html: str = SchemaField(description="The raw html of the crawl")
        links: list[str] = SchemaField(description="The links of the crawl")
        screenshot: str = SchemaField(description="The screenshot of the crawl")
        screenshot_full_page: str = SchemaField(
            description="The full-page screenshot of the crawl"
        )
        json_data: dict[str, Any] = SchemaField(
            description="The json data of the crawl"
        )
        change_tracking: dict[str, Any] = SchemaField(
            description="The change tracking of the crawl"
        )

    def __init__(self):
        super().__init__(
            id="bdbbaba0-03b7-4971-970e-699e2de6015e",
            description="Firecrawl crawls websites to extract comprehensive data while bypassing blockers.",
            categories={BlockCategory.SEARCH},
            input_schema=self.Input,
            output_schema=self.Output,
        )

    async def run(
        self, input_data: Input, *, credentials: APIKeyCredentials, **kwargs
    ) -> BlockOutput:
        app = FirecrawlApp(api_key=credentials.api_key.get_secret_value())

        # Sync call
        crawl_result = app.crawl_url(
            input_data.url,
            limit=input_data.limit,
            scrape_options=ScrapeOptions(
                formats=[format.value for format in input_data.formats],
                onlyMainContent=input_data.only_main_content,
                maxAge=input_data.max_age,
                waitFor=input_data.wait_for,
            ),
        )
        yield "data", crawl_result.data

        # Emit each requested format for every crawled page.
        for data in crawl_result.data:
            for f in input_data.formats:
                if f == ScrapeFormat.MARKDOWN:
                    yield "markdown", data.markdown
                elif f == ScrapeFormat.HTML:
                    yield "html", data.html
                elif f == ScrapeFormat.RAW_HTML:
                    yield "raw_html", data.rawHtml
                elif f == ScrapeFormat.LINKS:
                    yield "links", data.links
                elif f == ScrapeFormat.SCREENSHOT:
                    yield "screenshot", data.screenshot
                elif f == ScrapeFormat.SCREENSHOT_FULL_PAGE:
                    # Firecrawl returns full-page captures in the same `screenshot` field.
                    yield "screenshot_full_page", data.screenshot
                elif f == ScrapeFormat.CHANGE_TRACKING:
                    yield "change_tracking", data.changeTracking
                elif f == ScrapeFormat.JSON:
                    # Yield key must match the `json_data` field in Output.
                    yield "json_data", data.json
```
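The Crawl block's underlying SDK call can be exercised directly; a minimal sketch using only the arguments shown above (key and URL are placeholders):

```python
from firecrawl import FirecrawlApp, ScrapeOptions

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")

# Crawl up to 10 pages; ScrapeOptions takes the camelCase field names the
# block passes above.
crawl_result = app.crawl_url(
    "https://example.com",
    limit=10,
    scrape_options=ScrapeOptions(
        formats=["markdown", "links"],
        onlyMainContent=True,
        maxAge=3_600_000,
        waitFor=0,
    ),
)
for page in crawl_result.data:
    print(page.markdown[:200])  # each page carries the requested formats
```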
autogpt_platform/backend/backend/blocks/firecrawl/extract.py (new file, 67 lines)
```python
from typing import Any

from firecrawl import FirecrawlApp

from backend.sdk import (
    APIKeyCredentials,
    Block,
    BlockCategory,
    BlockCost,
    BlockCostType,
    BlockOutput,
    BlockSchema,
    CredentialsMetaInput,
    SchemaField,
    cost,
)

from ._config import firecrawl


# Extraction is AI-powered, so it costs 2 per run instead of the base cost of 1.
@cost(BlockCost(2, BlockCostType.RUN))
class FirecrawlExtractBlock(Block):

    class Input(BlockSchema):
        credentials: CredentialsMetaInput = firecrawl.credentials_field()
        urls: list[str] = SchemaField(
            description="The URLs to extract from - at least one is required. Wildcards (/*) are supported."
        )
        prompt: str | None = SchemaField(
            description="The prompt to use for the extraction", default=None, advanced=False
        )
        output_schema: str | None = SchemaField(
            description="A more rigid structure if you already know the JSON layout.",
            default=None,
        )
        enable_web_search: bool = SchemaField(
            description="When true, extraction can follow links outside the specified domain.",
            default=False,
        )

    class Output(BlockSchema):
        data: dict[str, Any] = SchemaField(description="The result of the extraction")

    def __init__(self):
        super().__init__(
            id="d1774756-4d9e-40e6-bab1-47ec0ccd81b2",
            description="Firecrawl extracts structured data from websites while bypassing blockers.",
            categories={BlockCategory.SEARCH},
            input_schema=self.Input,
            output_schema=self.Output,
        )

    async def run(
        self, input_data: Input, *, credentials: APIKeyCredentials, **kwargs
    ) -> BlockOutput:
        app = FirecrawlApp(api_key=credentials.api_key.get_secret_value())

        # Sync call
        extract_result = app.extract(
            urls=input_data.urls,
            prompt=input_data.prompt,
            schema=input_data.output_schema,
            enable_web_search=input_data.enable_web_search,
        )

        yield "data", extract_result.data
```
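For context, the Extract block's SDK call with a prompt and schema looks roughly like this; a sketch in which the URL, prompt, and schema are placeholders, and the schema is passed as a string because the block above passes `output_schema: str | None` straight through:

```python
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")

# Placeholders throughout; wildcards like "/*" are supported in URLs.
extract_result = app.extract(
    urls=["https://example.com/*"],
    prompt="List every product name and price on the site.",
    schema='{"type": "object", "properties": {"products": {"type": "array"}}}',
    enable_web_search=False,  # True lets extraction follow off-domain links
)
print(extract_result.data)
```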
autogpt_platform/backend/backend/blocks/firecrawl/map.py (new file, 46 lines)
```python
from firecrawl import FirecrawlApp

from backend.sdk import (
    APIKeyCredentials,
    Block,
    BlockCategory,
    BlockOutput,
    BlockSchema,
    CredentialsMetaInput,
    SchemaField,
)

from ._config import firecrawl


class FirecrawlMapWebsiteBlock(Block):

    class Input(BlockSchema):
        credentials: CredentialsMetaInput = firecrawl.credentials_field()

        url: str = SchemaField(description="The website URL to map")

    class Output(BlockSchema):
        links: list[str] = SchemaField(description="The links discovered on the website")

    def __init__(self):
        super().__init__(
            id="f0f43e2b-c943-48a0-a7f1-40136ca4d3b9",
            description="Firecrawl maps a website to extract all the links.",
            categories={BlockCategory.SEARCH},
            input_schema=self.Input,
            output_schema=self.Output,
        )

    async def run(
        self, input_data: Input, *, credentials: APIKeyCredentials, **kwargs
    ) -> BlockOutput:
        app = FirecrawlApp(api_key=credentials.api_key.get_secret_value())

        # Sync call
        map_result = app.map_url(
            url=input_data.url,
        )

        yield "links", map_result.links
```
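The Map block reduces to a single SDK call; a minimal sketch (placeholders as before):

```python
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")

# Discover the site's URLs; the block yields these as its `links` output.
map_result = app.map_url(url="https://example.com")
print(map_result.links[:10])
```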
autogpt_platform/backend/backend/blocks/firecrawl/scrape.py (new file, 109 lines)
```python
from enum import Enum
from typing import Any

from firecrawl import FirecrawlApp

from backend.sdk import (
    APIKeyCredentials,
    Block,
    BlockCategory,
    BlockOutput,
    BlockSchema,
    CredentialsMetaInput,
    SchemaField,
)

from ._config import firecrawl


class ScrapeFormat(Enum):
    MARKDOWN = "markdown"
    HTML = "html"
    RAW_HTML = "rawHtml"
    LINKS = "links"
    SCREENSHOT = "screenshot"
    SCREENSHOT_FULL_PAGE = "screenshot@fullPage"
    JSON = "json"
    CHANGE_TRACKING = "changeTracking"


class FirecrawlScrapeBlock(Block):

    class Input(BlockSchema):
        credentials: CredentialsMetaInput = firecrawl.credentials_field()
        url: str = SchemaField(description="The URL to scrape")
        # Note: `limit` is accepted but not passed to scrape_url below.
        limit: int = SchemaField(description="The number of pages to crawl", default=10)
        only_main_content: bool = SchemaField(
            description="Only return the main content of the page, excluding headers, navs, footers, etc.",
            default=True,
        )
        max_age: int = SchemaField(
            description="The maximum age of the cached page in milliseconds - default is 1 hour",
            default=3600000,
        )
        wait_for: int = SchemaField(
            description="Delay in milliseconds before fetching the content, allowing the page sufficient time to load.",
            default=200,
        )
        formats: list[ScrapeFormat] = SchemaField(
            description="The formats of the scrape", default=[ScrapeFormat.MARKDOWN]
        )

    class Output(BlockSchema):
        data: dict[str, Any] = SchemaField(description="The result of the scrape")
        markdown: str = SchemaField(description="The markdown of the scrape")
        html: str = SchemaField(description="The html of the scrape")
        raw_html: str = SchemaField(description="The raw html of the scrape")
        links: list[str] = SchemaField(description="The links of the scrape")
        screenshot: str = SchemaField(description="The screenshot of the scrape")
        screenshot_full_page: str = SchemaField(
            description="The full-page screenshot of the scrape"
        )
        json_data: dict[str, Any] = SchemaField(
            description="The json data of the scrape"
        )
        change_tracking: dict[str, Any] = SchemaField(
            description="The change tracking of the scrape"
        )

    def __init__(self):
        super().__init__(
            id="ac444320-cf5e-4697-b586-2604c17a3e75",
            description="Firecrawl scrapes a website to extract comprehensive data while bypassing blockers.",
            categories={BlockCategory.SEARCH},
            input_schema=self.Input,
            output_schema=self.Output,
        )

    async def run(
        self, input_data: Input, *, credentials: APIKeyCredentials, **kwargs
    ) -> BlockOutput:
        app = FirecrawlApp(api_key=credentials.api_key.get_secret_value())

        scrape_result = app.scrape_url(
            input_data.url,
            formats=[format.value for format in input_data.formats],
            only_main_content=input_data.only_main_content,
            max_age=input_data.max_age,
            wait_for=input_data.wait_for,
        )
        yield "data", scrape_result

        # Emit each requested format individually.
        for f in input_data.formats:
            if f == ScrapeFormat.MARKDOWN:
                yield "markdown", scrape_result.markdown
            elif f == ScrapeFormat.HTML:
                yield "html", scrape_result.html
            elif f == ScrapeFormat.RAW_HTML:
                yield "raw_html", scrape_result.rawHtml
            elif f == ScrapeFormat.LINKS:
                yield "links", scrape_result.links
            elif f == ScrapeFormat.SCREENSHOT:
                yield "screenshot", scrape_result.screenshot
            elif f == ScrapeFormat.SCREENSHOT_FULL_PAGE:
                # Full-page captures arrive in the same `screenshot` field.
                yield "screenshot_full_page", scrape_result.screenshot
            elif f == ScrapeFormat.CHANGE_TRACKING:
                yield "change_tracking", scrape_result.changeTracking
            elif f == ScrapeFormat.JSON:
                # Yield key must match the `json_data` field in Output.
                yield "json_data", scrape_result.json
```
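One subtlety worth illustrating: the block above reads full-page captures from the same `screenshot` attribute as viewport captures, so requesting the `screenshot@fullPage` format looks like this (a sketch under that assumption, placeholders as before):

```python
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")

result = app.scrape_url(
    "https://example.com",
    formats=["screenshot@fullPage"],  # ScrapeFormat.SCREENSHOT_FULL_PAGE
    only_main_content=True,
)
# Full-page captures land in the same `screenshot` field, which is why the
# block yields `scrape_result.screenshot` for both screenshot formats.
print(result.screenshot)
```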
autogpt_platform/backend/backend/blocks/firecrawl/search.py (new file, 79 lines)
```python
from enum import Enum
from typing import Any

from firecrawl import FirecrawlApp, ScrapeOptions

from backend.sdk import (
    APIKeyCredentials,
    Block,
    BlockCategory,
    BlockOutput,
    BlockSchema,
    CredentialsMetaInput,
    SchemaField,
)

from ._config import firecrawl


class ScrapeFormat(Enum):
    MARKDOWN = "markdown"
    HTML = "html"
    RAW_HTML = "rawHtml"
    LINKS = "links"
    SCREENSHOT = "screenshot"
    SCREENSHOT_FULL_PAGE = "screenshot@fullPage"
    JSON = "json"
    CHANGE_TRACKING = "changeTracking"


class FirecrawlSearchBlock(Block):

    class Input(BlockSchema):
        credentials: CredentialsMetaInput = firecrawl.credentials_field()
        query: str = SchemaField(description="The query to search for")
        limit: int = SchemaField(description="The number of search results to return", default=10)
        max_age: int = SchemaField(
            description="The maximum age of the cached page in milliseconds - default is 1 hour",
            default=3600000,
        )
        wait_for: int = SchemaField(
            description="Delay in milliseconds before fetching the content, allowing the page sufficient time to load.",
            default=200,
        )
        formats: list[ScrapeFormat] = SchemaField(
            description="Return the content of each result in these formats, if specified",
            default=[],
        )

    class Output(BlockSchema):
        data: dict[str, Any] = SchemaField(description="The result of the search")
        site: dict[str, Any] = SchemaField(description="A single site from the search results")

    def __init__(self):
        super().__init__(
            id="f8d2f28d-b3a1-405b-804e-418c087d288b",
            description="Firecrawl searches the web for the given query.",
            categories={BlockCategory.SEARCH},
            input_schema=self.Input,
            output_schema=self.Output,
        )

    async def run(
        self, input_data: Input, *, credentials: APIKeyCredentials, **kwargs
    ) -> BlockOutput:
        app = FirecrawlApp(api_key=credentials.api_key.get_secret_value())

        # Sync call
        search_result = app.search(
            input_data.query,
            limit=input_data.limit,
            scrape_options=ScrapeOptions(
                formats=[format.value for format in input_data.formats],
                maxAge=input_data.max_age,
                waitFor=input_data.wait_for,
            ),
        )
        yield "data", search_result
        # Also emit each search result individually.
        for site in search_result.data:
            yield "site", site
```
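The Search block's SDK call, sketched with placeholder values; `formats` defaults to empty in the block above, in which case only result metadata comes back:

```python
from firecrawl import FirecrawlApp, ScrapeOptions

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")

search_result = app.search(
    "autogpt block sdk",       # placeholder query
    limit=10,
    scrape_options=ScrapeOptions(
        formats=["markdown"],  # omit to get result metadata only
        maxAge=3_600_000,
        waitFor=200,
    ),
)
for site in search_result.data:  # the block yields each result as `site`
    print(site)
```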
autogpt_platform/backend/poetry.lock (generated, 34 lines changed)

```diff
@@ -1212,6 +1212,26 @@ files = [
 [package.dependencies]
 packaging = ">=20"
 
+[[package]]
+name = "firecrawl-py"
+version = "2.16.3"
+description = "Python SDK for Firecrawl API"
+optional = false
+python-versions = ">=3.8"
+groups = ["main"]
+files = [
+    {file = "firecrawl_py-2.16.3-py3-none-any.whl", hash = "sha256:94bb46af5e0df6c8ec414ac999a5355c0f5a46f15fd1cf5a02a3b31062db0aa8"},
+    {file = "firecrawl_py-2.16.3.tar.gz", hash = "sha256:5fd063ef4acc4c4be62648f1e11467336bc127780b3afc28d39078a012e6a14c"},
+]
+
+[package.dependencies]
+aiohttp = "*"
+nest-asyncio = "*"
+pydantic = "*"
+python-dotenv = "*"
+requests = "*"
+websockets = "*"
+
 [[package]]
 name = "flake8"
 version = "7.3.0"
@@ -2902,6 +2922,18 @@ files = [
     {file = "mypy_extensions-1.1.0.tar.gz", hash = "sha256:52e68efc3284861e772bbcd66823fde5ae21fd2fdb51c62a211403730b916558"},
 ]
 
+[[package]]
+name = "nest-asyncio"
+version = "1.6.0"
+description = "Patch asyncio to allow nested event loops"
+optional = false
+python-versions = ">=3.5"
+groups = ["main"]
+files = [
+    {file = "nest_asyncio-1.6.0-py3-none-any.whl", hash = "sha256:87af6efd6b5e897c81050477ef65c62e2b2f35d51703cae01aff2905b1852e1c"},
+    {file = "nest_asyncio-1.6.0.tar.gz", hash = "sha256:6f172d5449aca15afd6c646851f4e31e02c598d553a667e38cafa997cfec55fe"},
+]
+
 [[package]]
 name = "nodeenv"
 version = "1.9.1"
@@ -6686,4 +6718,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.1"
 python-versions = ">=3.10,<3.13"
-content-hash = "e79fd4e1968b496b1012c4866130f5680fc6558d041ddddd53f466e8ec58869c"
+content-hash = "225ddae645d22cc57f46330e735c069fb52e708123aa642e74adbf077dda0796"
```
autogpt_platform/backend/pyproject.toml

```diff
@@ -74,6 +74,7 @@ aioclamd = "^1.0.0"
 setuptools = "^80.9.0"
 gcloud-aio-storage = "^9.5.0"
 pandas = "^2.3.1"
+firecrawl-py = "^2.16.3"
 
 [tool.poetry.group.dev.dependencies]
 aiohappyeyeballs = "^2.6.1"
```