feat(blocks): Add Firecrawl Integration for Web Scraping and Data Extraction (#10494)

### Changes 🏗️

This PR adds Firecrawl integration to AutoGPT, providing powerful web
scraping and data extraction capabilities:

**New Blocks Added:**

⚠️ All of these blocks run synchronously, so they can take a while to
finish; in exchange, agent workflows stay simpler.

- **Firecrawl Scrape Block**: Scrapes single web pages with various
output formats (Markdown, HTML, JSON, screenshots)
- **Firecrawl Crawl Block**: Crawls entire websites following links with
customizable depth and filters
- **Firecrawl Extract Block**: Extracts structured data from web pages
using AI-powered prompts
- **Firecrawl Map Block**: Maps website structure and returns a list of
all discovered URLs
- **Firecrawl Search Block**: Searches Google and scrapes the results

**Key Features:**
- Advanced anti-blocking technology to bypass scraping protections
- Multiple output formats including Markdown, HTML, JSON, and
screenshots
- AI-powered data extraction with custom prompts and schemas
- Configurable crawling depth and URL filtering
- Built-in caching and rate limiting
- Google search integration for discovering relevant content

**Use Cases:**
- Web data extraction for research and analysis
- Content monitoring and change tracking
- Competitive intelligence gathering
- SEO analysis and website mapping
- Automated data collection workflows

### Checklist 📋

#### For code changes:
- [x] I have clearly listed my changes in the PR description
- [x] I have made a test plan
- [x] I have tested my changes according to the test plan:
  <!-- Put your test plan here: -->
  - [x] Verified all Firecrawl blocks appear in the UI
  - [x] Tested scraping various websites with different formats
  - [x] Tested crawling with depth limits and URL filters
  - [x] Tested data extraction with custom prompts
  - [x] Verified error handling for invalid URLs and API failures
  - [x] Tested authentication with Firecrawl API key
  - [x] Confirmed proper rate limiting and caching behavior

<img width="1025" height="1027" alt="Screenshot 2025-07-30 at 15 20 28"
src="https://github.com/user-attachments/assets/7b94d3cf-7a0e-4d09-a9c5-24c4e8a3b660"
/>

# Example Agent
[FC
Testing_v12.json](https://github.com/user-attachments/files/21510608/FC.Testing_v12.json)
This commit is contained in:
Swifty
2025-07-31 11:47:49 +02:00
committed by GitHub
parent b429505c14
commit df399e5c51
10 changed files with 457 additions and 1 deletions

View File

@@ -0,0 +1,8 @@
from backend.sdk import BlockCostType, ProviderBuilder
# Shared provider object for the Firecrawl blocks; the block modules import it
# and call `firecrawl.credentials_field()` to declare their credentials input.
firecrawl = (
    ProviderBuilder("firecrawl")
    # presumably "FIRECRAWL_API_KEY" is the environment/settings key and the
    # second argument a human-readable title - verify against ProviderBuilder.
    .with_api_key("FIRECRAWL_API_KEY", "Firecrawl API Key")
    # Default cost: 1 unit of BlockCostType.RUN (blocks may override it with
    # their own cost decorator).
    .with_base_cost(1, BlockCostType.RUN)
    .build()
)

View File

@@ -0,0 +1,114 @@
from enum import Enum
from typing import Any
from firecrawl import FirecrawlApp, ScrapeOptions
from backend.sdk import (
APIKeyCredentials,
Block,
BlockCategory,
BlockOutput,
BlockSchema,
CredentialsMetaInput,
SchemaField,
)
from ._config import firecrawl
# Content formats a caller can request for each crawled page. The member
# values are the strings forwarded to Firecrawl through
# `ScrapeOptions(formats=...)`.
# NOTE: documented with comments rather than a docstring so that any schema
# generated from this enum stays exactly as it was.
class ScrapeFormat(Enum):
    MARKDOWN = "markdown"
    HTML = "html"
    RAW_HTML = "rawHtml"
    LINKS = "links"
    SCREENSHOT = "screenshot"
    SCREENSHOT_FULL_PAGE = "screenshot@fullPage"
    JSON = "json"
    CHANGE_TRACKING = "changeTracking"
class FirecrawlCrawlBlock(Block):
    """Crawl a website through Firecrawl and emit the pages it returns.

    The full list of crawled pages is emitted once on ``data``. Afterwards,
    for every page and every requested format, the matching page attribute is
    emitted on that format's dedicated output.
    """

    class Input(BlockSchema):
        credentials: CredentialsMetaInput = firecrawl.credentials_field()
        url: str = SchemaField(description="The URL to crawl")
        limit: int = SchemaField(description="The number of pages to crawl", default=10)
        only_main_content: bool = SchemaField(
            description="Only return the main content of the page excluding headers, navs, footers, etc.",
            default=True,
        )
        max_age: int = SchemaField(
            description="The maximum age of the page in milliseconds - default is 1 hour",
            default=3600000,
        )
        wait_for: int = SchemaField(
            description="Specify a delay in milliseconds before fetching the content, allowing the page sufficient time to load.",
            default=0,
        )
        formats: list[ScrapeFormat] = SchemaField(
            description="The format of the crawl", default=[ScrapeFormat.MARKDOWN]
        )

    class Output(BlockSchema):
        data: list[dict[str, Any]] = SchemaField(description="The result of the crawl")
        markdown: str = SchemaField(description="The markdown of the crawl")
        html: str = SchemaField(description="The html of the crawl")
        raw_html: str = SchemaField(description="The raw html of the crawl")
        links: list[str] = SchemaField(description="The links of the crawl")
        screenshot: str = SchemaField(description="The screenshot of the crawl")
        screenshot_full_page: str = SchemaField(
            description="The screenshot full page of the crawl"
        )
        json_data: dict[str, Any] = SchemaField(
            description="The json data of the crawl"
        )
        change_tracking: dict[str, Any] = SchemaField(
            description="The change tracking of the crawl"
        )

    def __init__(self):
        super().__init__(
            id="bdbbaba0-03b7-4971-970e-699e2de6015e",
            description="Firecrawl crawls websites to extract comprehensive data while bypassing blockers.",
            categories={BlockCategory.SEARCH},
            input_schema=self.Input,
            output_schema=self.Output,
        )

    async def run(
        self, input_data: Input, *, credentials: APIKeyCredentials, **kwargs
    ) -> BlockOutput:
        """Run the crawl and yield ``(output_name, value)`` pairs.

        Args:
            input_data: Crawl settings (start URL, page limit, scrape options
                and the formats to emit).
            credentials: API key credentials used to build the Firecrawl client.
            **kwargs: Extra execution context; not used here.

        Yields:
            ``("data", <all pages>)`` first, then one pair per page and
            requested format, using the names declared on ``Output``.
        """
        app = FirecrawlApp(api_key=credentials.api_key.get_secret_value())

        # Sync call: the SDK is invoked without `await`, so this coroutine
        # does not proceed until the crawl result is available.
        crawl_result = app.crawl_url(
            input_data.url,
            limit=input_data.limit,
            scrape_options=ScrapeOptions(
                formats=[fmt.value for fmt in input_data.formats],
                onlyMainContent=input_data.only_main_content,
                maxAge=input_data.max_age,
                waitFor=input_data.wait_for,
            ),
        )
        yield "data", crawl_result.data

        # Requested format -> (Output field name, attribute on the page object).
        format_outputs = {
            ScrapeFormat.MARKDOWN: ("markdown", "markdown"),
            ScrapeFormat.HTML: ("html", "html"),
            ScrapeFormat.RAW_HTML: ("raw_html", "rawHtml"),
            ScrapeFormat.LINKS: ("links", "links"),
            ScrapeFormat.SCREENSHOT: ("screenshot", "screenshot"),
            # The full-page capture is read from the same `screenshot`
            # attribute as the regular one.
            # NOTE(review): confirm the SDK has no separate full-page field.
            ScrapeFormat.SCREENSHOT_FULL_PAGE: ("screenshot_full_page", "screenshot"),
            ScrapeFormat.CHANGE_TRACKING: ("change_tracking", "changeTracking"),
            # Fixed: this used to be yielded as "json", which is not a field
            # of `Output`; the declared field is `json_data`.
            ScrapeFormat.JSON: ("json_data", "json"),
        }

        for page in crawl_result.data:
            for fmt in input_data.formats:
                target = format_outputs.get(fmt)
                if target is None:
                    # Unknown formats are skipped rather than raising.
                    continue
                output_name, attribute = target
                yield output_name, getattr(page, attribute)

View File

@@ -0,0 +1,67 @@
from typing import Any
from firecrawl import FirecrawlApp
from backend.sdk import (
APIKeyCredentials,
Block,
BlockCategory,
BlockCost,
BlockCostType,
BlockOutput,
BlockSchema,
CredentialsMetaInput,
SchemaField,
cost,
)
from ._config import firecrawl
@cost(BlockCost(2, BlockCostType.RUN))
class FirecrawlExtractBlock(Block):
    """Send URLs plus an optional prompt/schema to Firecrawl's extract call.

    The ``data`` attribute of the SDK response is emitted on the ``data``
    output. The block is registered with a cost of 2 per run.
    """

    class Input(BlockSchema):
        credentials: CredentialsMetaInput = firecrawl.credentials_field()
        urls: list[str] = SchemaField(
            description="The URLs to crawl - at least one is required. Wildcards are supported. (/*)"
        )
        prompt: str | None = SchemaField(
            description="The prompt to use for the crawl", default=None, advanced=False
        )
        # NOTE(review): typed as a string but forwarded unchanged as the SDK's
        # `schema` argument - confirm the SDK accepts a string here.
        output_schema: str | None = SchemaField(
            description="A more rigid structure if you already know the JSON layout.",
            default=None,
        )
        enable_web_search: bool = SchemaField(
            description="When true, extraction can follow links outside the specified domain.",
            default=False,
        )

    class Output(BlockSchema):
        data: dict[str, Any] = SchemaField(description="The result of the crawl")

    def __init__(self):
        super().__init__(
            id="d1774756-4d9e-40e6-bab1-47ec0ccd81b2",
            description="Firecrawl crawls websites to extract comprehensive data while bypassing blockers.",
            categories={BlockCategory.SEARCH},
            input_schema=self.Input,
            output_schema=self.Output,
        )

    async def run(
        self, input_data: Input, *, credentials: APIKeyCredentials, **kwargs
    ) -> BlockOutput:
        """Call Firecrawl's extract endpoint and yield its ``data`` payload.

        Args:
            input_data: URLs, optional prompt, optional schema and the
                web-search flag.
            credentials: API key credentials used to build the Firecrawl client.
            **kwargs: Extra execution context; not used here.

        Yields:
            A single ``("data", <response data>)`` pair.
        """
        api_key = credentials.api_key.get_secret_value()
        client = FirecrawlApp(api_key=api_key)

        extract_arguments = {
            "urls": input_data.urls,
            "prompt": input_data.prompt,
            "schema": input_data.output_schema,
            "enable_web_search": input_data.enable_web_search,
        }
        # Sync call: invoked without `await`, so the coroutine waits here.
        response = client.extract(**extract_arguments)

        yield "data", response.data

View File

@@ -0,0 +1,46 @@
from firecrawl import FirecrawlApp
from backend.sdk import (
APIKeyCredentials,
Block,
BlockCategory,
BlockOutput,
BlockSchema,
CredentialsMetaInput,
SchemaField,
)
from ._config import firecrawl
class FirecrawlMapWebsiteBlock(Block):
    """Ask Firecrawl to map a website and emit the links it reports."""

    class Input(BlockSchema):
        credentials: CredentialsMetaInput = firecrawl.credentials_field()
        url: str = SchemaField(description="The website url to map")

    class Output(BlockSchema):
        links: list[str] = SchemaField(description="The links of the website")

    def __init__(self):
        super().__init__(
            id="f0f43e2b-c943-48a0-a7f1-40136ca4d3b9",
            description="Firecrawl maps a website to extract all the links.",
            categories={BlockCategory.SEARCH},
            input_schema=self.Input,
            output_schema=self.Output,
        )

    async def run(
        self, input_data: Input, *, credentials: APIKeyCredentials, **kwargs
    ) -> BlockOutput:
        """Map ``input_data.url`` and yield the discovered links.

        Args:
            input_data: Holds the website URL to map.
            credentials: API key credentials used to build the Firecrawl client.
            **kwargs: Extra execution context; not used here.

        Yields:
            A single ``("links", <links from the map response>)`` pair.
        """
        api_key = credentials.api_key.get_secret_value()
        client = FirecrawlApp(api_key=api_key)

        # Sync call: invoked without `await`, so the coroutine waits here.
        site_map = client.map_url(url=input_data.url)

        yield "links", site_map.links

View File

@@ -0,0 +1,109 @@
from enum import Enum
from typing import Any
from firecrawl import FirecrawlApp
from backend.sdk import (
APIKeyCredentials,
Block,
BlockCategory,
BlockOutput,
BlockSchema,
CredentialsMetaInput,
SchemaField,
)
from ._config import firecrawl
# Content formats a caller can request for a scraped page. The member values
# are the strings forwarded to Firecrawl through `scrape_url(formats=...)`.
# NOTE: documented with comments rather than a docstring so that any schema
# generated from this enum stays exactly as it was.
class ScrapeFormat(Enum):
    MARKDOWN = "markdown"
    HTML = "html"
    RAW_HTML = "rawHtml"
    LINKS = "links"
    SCREENSHOT = "screenshot"
    SCREENSHOT_FULL_PAGE = "screenshot@fullPage"
    JSON = "json"
    CHANGE_TRACKING = "changeTracking"
class FirecrawlScrapeBlock(Block):
    """Scrape a single URL through Firecrawl and emit the requested formats.

    The whole scrape result is emitted once on ``data``. Afterwards, for every
    requested format, the matching result attribute is emitted on that
    format's dedicated output.
    """

    class Input(BlockSchema):
        credentials: CredentialsMetaInput = firecrawl.credentials_field()
        url: str = SchemaField(description="The URL to crawl")
        # NOTE: `limit` is accepted but not forwarded to `scrape_url`; it is
        # kept so existing callers that set it keep working.
        limit: int = SchemaField(description="The number of pages to crawl", default=10)
        only_main_content: bool = SchemaField(
            description="Only return the main content of the page excluding headers, navs, footers, etc.",
            default=True,
        )
        max_age: int = SchemaField(
            description="The maximum age of the page in milliseconds - default is 1 hour",
            default=3600000,
        )
        wait_for: int = SchemaField(
            description="Specify a delay in milliseconds before fetching the content, allowing the page sufficient time to load.",
            default=200,
        )
        formats: list[ScrapeFormat] = SchemaField(
            description="The format of the crawl", default=[ScrapeFormat.MARKDOWN]
        )

    class Output(BlockSchema):
        data: dict[str, Any] = SchemaField(description="The result of the crawl")
        markdown: str = SchemaField(description="The markdown of the crawl")
        html: str = SchemaField(description="The html of the crawl")
        raw_html: str = SchemaField(description="The raw html of the crawl")
        links: list[str] = SchemaField(description="The links of the crawl")
        screenshot: str = SchemaField(description="The screenshot of the crawl")
        screenshot_full_page: str = SchemaField(
            description="The screenshot full page of the crawl"
        )
        json_data: dict[str, Any] = SchemaField(
            description="The json data of the crawl"
        )
        change_tracking: dict[str, Any] = SchemaField(
            description="The change tracking of the crawl"
        )

    def __init__(self):
        super().__init__(
            id="ac444320-cf5e-4697-b586-2604c17a3e75",
            description="Firecrawl scrapes a website to extract comprehensive data while bypassing blockers.",
            categories={BlockCategory.SEARCH},
            input_schema=self.Input,
            output_schema=self.Output,
        )

    async def run(
        self, input_data: Input, *, credentials: APIKeyCredentials, **kwargs
    ) -> BlockOutput:
        """Scrape the URL and yield ``(output_name, value)`` pairs.

        Args:
            input_data: Scrape settings (URL, scrape options and the formats
                to emit).
            credentials: API key credentials used to build the Firecrawl client.
            **kwargs: Extra execution context; not used here.

        Yields:
            ``("data", <scrape result>)`` first, then one pair per requested
            format, using the names declared on ``Output``.
        """
        app = FirecrawlApp(api_key=credentials.api_key.get_secret_value())

        # Sync call: invoked without `await`, so the coroutine waits here.
        scrape_result = app.scrape_url(
            input_data.url,
            formats=[fmt.value for fmt in input_data.formats],
            only_main_content=input_data.only_main_content,
            max_age=input_data.max_age,
            wait_for=input_data.wait_for,
        )
        yield "data", scrape_result

        # Requested format -> (Output field name, attribute on the result).
        format_outputs = {
            ScrapeFormat.MARKDOWN: ("markdown", "markdown"),
            ScrapeFormat.HTML: ("html", "html"),
            ScrapeFormat.RAW_HTML: ("raw_html", "rawHtml"),
            ScrapeFormat.LINKS: ("links", "links"),
            ScrapeFormat.SCREENSHOT: ("screenshot", "screenshot"),
            # The full-page capture is read from the same `screenshot`
            # attribute as the regular one.
            # NOTE(review): confirm the SDK has no separate full-page field.
            ScrapeFormat.SCREENSHOT_FULL_PAGE: ("screenshot_full_page", "screenshot"),
            ScrapeFormat.CHANGE_TRACKING: ("change_tracking", "changeTracking"),
            # Fixed: this used to be yielded as "json", which is not a field
            # of `Output`; the declared field is `json_data`.
            ScrapeFormat.JSON: ("json_data", "json"),
        }

        for fmt in input_data.formats:
            target = format_outputs.get(fmt)
            if target is None:
                # Unknown formats are skipped rather than raising.
                continue
            output_name, attribute = target
            yield output_name, getattr(scrape_result, attribute)

View File

@@ -0,0 +1,79 @@
from enum import Enum
from typing import Any
from firecrawl import FirecrawlApp, ScrapeOptions
from backend.sdk import (
APIKeyCredentials,
Block,
BlockCategory,
BlockOutput,
BlockSchema,
CredentialsMetaInput,
SchemaField,
)
from ._config import firecrawl
# Content formats a caller can request for search results. The member values
# are the strings forwarded to Firecrawl through `ScrapeOptions(formats=...)`.
# NOTE: documented with comments rather than a docstring so that any schema
# generated from this enum stays exactly as it was.
class ScrapeFormat(Enum):
    MARKDOWN = "markdown"
    HTML = "html"
    RAW_HTML = "rawHtml"
    LINKS = "links"
    SCREENSHOT = "screenshot"
    SCREENSHOT_FULL_PAGE = "screenshot@fullPage"
    JSON = "json"
    CHANGE_TRACKING = "changeTracking"
class FirecrawlSearchBlock(Block):
    """Run a Firecrawl search and emit the response plus each result entry.

    The whole search response is emitted once on ``data``; every entry of the
    response's ``data`` collection is then emitted individually on ``site``.
    """

    class Input(BlockSchema):
        credentials: CredentialsMetaInput = firecrawl.credentials_field()
        query: str = SchemaField(description="The query to search for")
        limit: int = SchemaField(description="The number of pages to crawl", default=10)
        max_age: int = SchemaField(
            description="The maximum age of the page in milliseconds - default is 1 hour",
            default=3600000,
        )
        wait_for: int = SchemaField(
            description="Specify a delay in milliseconds before fetching the content, allowing the page sufficient time to load.",
            default=200,
        )
        formats: list[ScrapeFormat] = SchemaField(
            description="Returns the content of the search if specified", default=[]
        )

    class Output(BlockSchema):
        data: dict[str, Any] = SchemaField(description="The result of the search")
        site: dict[str, Any] = SchemaField(description="The site of the search")

    def __init__(self):
        super().__init__(
            id="f8d2f28d-b3a1-405b-804e-418c087d288b",
            description="Firecrawl searches the web for the given query.",
            categories={BlockCategory.SEARCH},
            input_schema=self.Input,
            output_schema=self.Output,
        )

    async def run(
        self, input_data: Input, *, credentials: APIKeyCredentials, **kwargs
    ) -> BlockOutput:
        """Search for ``input_data.query`` and yield the results.

        Args:
            input_data: Query, result limit, scrape options and the optional
                content formats.
            credentials: API key credentials used to build the Firecrawl client.
            **kwargs: Extra execution context; not used here.

        Yields:
            ``("data", <search response>)`` once, then ``("site", <entry>)``
            for each entry in the response's ``data``.
        """
        api_key = credentials.api_key.get_secret_value()
        client = FirecrawlApp(api_key=api_key)

        requested_formats = [fmt.value for fmt in input_data.formats]
        options = ScrapeOptions(
            formats=requested_formats,
            maxAge=input_data.max_age,
            waitFor=input_data.wait_for,
        )

        # Sync call: invoked without `await`, so the coroutine waits here.
        search_response = client.search(
            input_data.query,
            limit=input_data.limit,
            scrape_options=options,
        )

        yield "data", search_response
        for entry in search_response.data:
            yield "site", entry

View File

@@ -1212,6 +1212,26 @@ files = [
[package.dependencies]
packaging = ">=20"
[[package]]
name = "firecrawl-py"
version = "2.16.3"
description = "Python SDK for Firecrawl API"
optional = false
python-versions = ">=3.8"
groups = ["main"]
files = [
{file = "firecrawl_py-2.16.3-py3-none-any.whl", hash = "sha256:94bb46af5e0df6c8ec414ac999a5355c0f5a46f15fd1cf5a02a3b31062db0aa8"},
{file = "firecrawl_py-2.16.3.tar.gz", hash = "sha256:5fd063ef4acc4c4be62648f1e11467336bc127780b3afc28d39078a012e6a14c"},
]
[package.dependencies]
aiohttp = "*"
nest-asyncio = "*"
pydantic = "*"
python-dotenv = "*"
requests = "*"
websockets = "*"
[[package]]
name = "flake8"
version = "7.3.0"
@@ -2902,6 +2922,18 @@ files = [
{file = "mypy_extensions-1.1.0.tar.gz", hash = "sha256:52e68efc3284861e772bbcd66823fde5ae21fd2fdb51c62a211403730b916558"},
]
[[package]]
name = "nest-asyncio"
version = "1.6.0"
description = "Patch asyncio to allow nested event loops"
optional = false
python-versions = ">=3.5"
groups = ["main"]
files = [
{file = "nest_asyncio-1.6.0-py3-none-any.whl", hash = "sha256:87af6efd6b5e897c81050477ef65c62e2b2f35d51703cae01aff2905b1852e1c"},
{file = "nest_asyncio-1.6.0.tar.gz", hash = "sha256:6f172d5449aca15afd6c646851f4e31e02c598d553a667e38cafa997cfec55fe"},
]
[[package]]
name = "nodeenv"
version = "1.9.1"
@@ -6686,4 +6718,4 @@ cffi = ["cffi (>=1.11)"]
[metadata]
lock-version = "2.1"
python-versions = ">=3.10,<3.13"
content-hash = "e79fd4e1968b496b1012c4866130f5680fc6558d041ddddd53f466e8ec58869c"
content-hash = "225ddae645d22cc57f46330e735c069fb52e708123aa642e74adbf077dda0796"

View File

@@ -74,6 +74,7 @@ aioclamd = "^1.0.0"
setuptools = "^80.9.0"
gcloud-aio-storage = "^9.5.0"
pandas = "^2.3.1"
firecrawl-py = "^2.16.3"
[tool.poetry.group.dev.dependencies]
aiohappyeyeballs = "^2.6.1"