feat(blocks): Add Firecrawl Integration for Web Scraping and Data Extraction (#10494)

### Changes 🏗️ This PR adds Firecrawl integration to AutoGPT, providing powerful web scraping and data extraction capabilities: **New Blocks Added:** ⚠️ All these blocks are synchronous so take a while to finish, this allows a simpler agent workflow - **Firecrawl Scrape Block**: Scrapes single web pages with various output formats (Markdown, HTML, JSON, screenshots) - **Firecrawl Crawl Block**: Crawls entire websites following links with customizable depth and filters - **Firecrawl Extract Block**: Extracts structured data from web pages using AI-powered prompts - **Firecrawl Map Block**: Maps website structure and returns a list of all discovered URLs - **Firecrawl Search Block**: Searches Google and scrapes the results **Key Features:** - Advanced anti-blocking technology to bypass scraping protections - Multiple output formats including Markdown, HTML, JSON, and screenshots - AI-powered data extraction with custom prompts and schemas - Configurable crawling depth and URL filtering - Built-in caching and rate limiting - Google search integration for discovering relevant content **Use Cases:** - Web data extraction for research and analysis - Content monitoring and change tracking - Competitive intelligence gathering - SEO analysis and website mapping - Automated data collection workflows ### Checklist 📋 #### For code changes: - [x] I have clearly listed my changes in the PR description - [x] I have made a test plan - [x] I have tested my changes according to the test plan: <\!-- Put your test plan here: --> - [x] Verified all Firecrawl blocks appear in the UI - [x] Tested scraping various websites with different formats - [x] Tested crawling with depth limits and URL filters - [x] Tested data extraction with custom prompts - [x] Verified error handling for invalid URLs and API failures - [x] Tested authentication with Firecrawl API key - [x] Confirmed proper rate limiting and caching behavior <img width="1025" height="1027" alt="Screenshot 2025-07-30 at 15 20 28" 
src="https://github.com/user-attachments/assets/7b94d3cf-7a0e-4d09-a9c5-24c4e8a3b660" /> # Example Agent [FC Testing_v12.json](https://github.com/user-attachments/files/21510608/FC.Testing_v12.json)
2026-04-08 03:00:28 -04:00 · 2025-07-31 11:47:49 +02:00
parent b429505c14
commit df399e5c51
10 changed files with 457 additions and 1 deletions
--- a/autogpt_platform/backend/backend/blocks/firecrawl/init.py
+++ b/autogpt_platform/backend/backend/blocks/firecrawl/init.py
--- a/autogpt_platform/backend/backend/blocks/firecrawl/_api.py
+++ b/autogpt_platform/backend/backend/blocks/firecrawl/_api.py
--- a/autogpt_platform/backend/backend/blocks/firecrawl/_config.py
+++ b/autogpt_platform/backend/backend/blocks/firecrawl/_config.py
@@ -0,0 +1,8 @@
+from backend.sdk import BlockCostType, ProviderBuilder
+
+# Provider object shared by the Firecrawl blocks (they import it and call
+# `firecrawl.credentials_field()`). It is built under the name "firecrawl"
+# with an API-key credential and a base cost of 1 per BlockCostType.RUN.
+# NOTE(review): "FIRECRAWL_API_KEY" is presumably an environment variable
+# name and "Firecrawl API Key" its display title — confirm against
+# ProviderBuilder.with_api_key.
+firecrawl = (
+    ProviderBuilder("firecrawl")
+    .with_api_key("FIRECRAWL_API_KEY", "Firecrawl API Key")
+    .with_base_cost(1, BlockCostType.RUN)
+    .build()
+)
--- a/autogpt_platform/backend/backend/blocks/firecrawl/crawl.py
+++ b/autogpt_platform/backend/backend/blocks/firecrawl/crawl.py
@@ -0,0 +1,114 @@
+from enum import Enum
+from typing import Any
+
+from firecrawl import FirecrawlApp, ScrapeOptions
+
+from backend.sdk import (
+    APIKeyCredentials,
+    Block,
+    BlockCategory,
+    BlockOutput,
+    BlockSchema,
+    CredentialsMetaInput,
+    SchemaField,
+)
+
+from ._config import firecrawl
+
+
+class ScrapeFormat(Enum):
+    # Formats that can be requested for crawled pages. Each member's value is
+    # the string handed to the Firecrawl SDK (the block sends `format.value`).
+    # A `#` comment is used instead of a docstring so the enum's __doc__ (and
+    # anything derived from it) stays unchanged.
+    MARKDOWN = "markdown"
+    HTML = "html"
+    RAW_HTML = "rawHtml"
+    LINKS = "links"
+    SCREENSHOT = "screenshot"
+    SCREENSHOT_FULL_PAGE = "screenshot@fullPage"
+    JSON = "json"
+    CHANGE_TRACKING = "changeTracking"
+
+
+class FirecrawlCrawlBlock(Block):
+
+    class Input(BlockSchema):
+        credentials: CredentialsMetaInput = firecrawl.credentials_field()
+        url: str = SchemaField(description="The URL to crawl")
+        limit: int = SchemaField(description="The number of pages to crawl", default=10)
+        only_main_content: bool = SchemaField(
+            description="Only return the main content of the page excluding headers, navs, footers, etc.",
+            default=True,
+        )
+        max_age: int = SchemaField(
+            description="The maximum age of the page in milliseconds - default is 1 hour",
+            default=3600000,
+        )
+        wait_for: int = SchemaField(
+            description="Specify a delay in milliseconds before fetching the content, allowing the page sufficient time to load.",
+            default=0,
+        )
+        formats: list[ScrapeFormat] = SchemaField(
+            description="The format of the crawl", default=[ScrapeFormat.MARKDOWN]
+        )
+
+    class Output(BlockSchema):
+        data: list[dict[str, Any]] = SchemaField(description="The result of the crawl")
+        markdown: str = SchemaField(description="The markdown of the crawl")
+        html: str = SchemaField(description="The html of the crawl")
+        raw_html: str = SchemaField(description="The raw html of the crawl")
+        links: list[str] = SchemaField(description="The links of the crawl")
+        screenshot: str = SchemaField(description="The screenshot of the crawl")
+        screenshot_full_page: str = SchemaField(
+            description="The screenshot full page of the crawl"
+        )
+        json_data: dict[str, Any] = SchemaField(
+            description="The json data of the crawl"
+        )
+        change_tracking: dict[str, Any] = SchemaField(
+            description="The change tracking of the crawl"
+        )
+
+    def __init__(self):
+        super().__init__(
+            id="bdbbaba0-03b7-4971-970e-699e2de6015e",
+            description="Firecrawl crawls websites to extract comprehensive data while bypassing blockers.",
+            categories={BlockCategory.SEARCH},
+            input_schema=self.Input,
+            output_schema=self.Output,
+        )
+
+    async def run(
+        self, input_data: Input, *, credentials: APIKeyCredentials, **kwargs
+    ) -> BlockOutput:
+
+        app = FirecrawlApp(api_key=credentials.api_key.get_secret_value())
+
+        # Sync call
+        crawl_result = app.crawl_url(
+            input_data.url,
+            limit=input_data.limit,
+            scrape_options=ScrapeOptions(
+                formats=[format.value for format in input_data.formats],
+                onlyMainContent=input_data.only_main_content,
+                maxAge=input_data.max_age,
+                waitFor=input_data.wait_for,
+            ),
+        )
+        yield "data", crawl_result.data
+
+        for data in crawl_result.data:
+            for f in input_data.formats:
+                if f == ScrapeFormat.MARKDOWN:
+                    yield "markdown", data.markdown
+                elif f == ScrapeFormat.HTML:
+                    yield "html", data.html
+                elif f == ScrapeFormat.RAW_HTML:
+                    yield "raw_html", data.rawHtml
+                elif f == ScrapeFormat.LINKS:
+                    yield "links", data.links
+                elif f == ScrapeFormat.SCREENSHOT:
+                    yield "screenshot", data.screenshot
+                elif f == ScrapeFormat.SCREENSHOT_FULL_PAGE:
+                    yield "screenshot_full_page", data.screenshot
+                elif f == ScrapeFormat.CHANGE_TRACKING:
+                    yield "change_tracking", data.changeTracking
+                elif f == ScrapeFormat.JSON:
+                    yield "json", data.json
--- a/autogpt_platform/backend/backend/blocks/firecrawl/extract.py
+++ b/autogpt_platform/backend/backend/blocks/firecrawl/extract.py
@@ -0,0 +1,67 @@
+from typing import Any
+
+from firecrawl import FirecrawlApp
+
+from backend.sdk import (
+    APIKeyCredentials,
+    Block,
+    BlockCategory,
+    BlockCost,
+    BlockCostType,
+    BlockOutput,
+    BlockSchema,
+    CredentialsMetaInput,
+    SchemaField,
+    cost,
+)
+
+from ._config import firecrawl
+
+
+@cost(BlockCost(2, BlockCostType.RUN))
+class FirecrawlExtractBlock(Block):
+
+    class Input(BlockSchema):
+        credentials: CredentialsMetaInput = firecrawl.credentials_field()
+        urls: list[str] = SchemaField(
+            description="The URLs to crawl - at least one is required. Wildcards are supported. (/*)"
+        )
+        prompt: str | None = SchemaField(
+            description="The prompt to use for the crawl", default=None, advanced=False
+        )
+        output_schema: str | None = SchemaField(
+            description="A more rigid structure if you already know the JSON layout.",
+            default=None,
+        )
+        enable_web_search: bool = SchemaField(
+            description="When true, extraction can follow links outside the specified domain.",
+            default=False,
+        )
+
+    class Output(BlockSchema):
+        data: dict[str, Any] = SchemaField(description="The result of the crawl")
+
+    def __init__(self):
+        super().__init__(
+            id="d1774756-4d9e-40e6-bab1-47ec0ccd81b2",
+            description="Firecrawl crawls websites to extract comprehensive data while bypassing blockers.",
+            categories={BlockCategory.SEARCH},
+            input_schema=self.Input,
+            output_schema=self.Output,
+        )
+
+    async def run(
+        self, input_data: Input, *, credentials: APIKeyCredentials, **kwargs
+    ) -> BlockOutput:
+
+        app = FirecrawlApp(api_key=credentials.api_key.get_secret_value())
+
+        # Sync call
+        extract_result = app.extract(
+            urls=input_data.urls,
+            prompt=input_data.prompt,
+            schema=input_data.output_schema,
+            enable_web_search=input_data.enable_web_search,
+        )
+
+        yield "data", extract_result.data
--- a/autogpt_platform/backend/backend/blocks/firecrawl/map.py
+++ b/autogpt_platform/backend/backend/blocks/firecrawl/map.py
@@ -0,0 +1,46 @@
+from firecrawl import FirecrawlApp
+
+from backend.sdk import (
+    APIKeyCredentials,
+    Block,
+    BlockCategory,
+    BlockOutput,
+    BlockSchema,
+    CredentialsMetaInput,
+    SchemaField,
+)
+
+from ._config import firecrawl
+
+
+class FirecrawlMapWebsiteBlock(Block):
+
+    class Input(BlockSchema):
+        credentials: CredentialsMetaInput = firecrawl.credentials_field()
+
+        url: str = SchemaField(description="The website url to map")
+
+    class Output(BlockSchema):
+        links: list[str] = SchemaField(description="The links of the website")
+
+    def __init__(self):
+        super().__init__(
+            id="f0f43e2b-c943-48a0-a7f1-40136ca4d3b9",
+            description="Firecrawl maps a website to extract all the links.",
+            categories={BlockCategory.SEARCH},
+            input_schema=self.Input,
+            output_schema=self.Output,
+        )
+
+    async def run(
+        self, input_data: Input, *, credentials: APIKeyCredentials, **kwargs
+    ) -> BlockOutput:
+
+        app = FirecrawlApp(api_key=credentials.api_key.get_secret_value())
+
+        # Sync call
+        map_result = app.map_url(
+            url=input_data.url,
+        )
+
+        yield "links", map_result.links
--- a/autogpt_platform/backend/backend/blocks/firecrawl/scrape.py
+++ b/autogpt_platform/backend/backend/blocks/firecrawl/scrape.py
@@ -0,0 +1,109 @@
+from enum import Enum
+from typing import Any
+
+from firecrawl import FirecrawlApp
+
+from backend.sdk import (
+    APIKeyCredentials,
+    Block,
+    BlockCategory,
+    BlockOutput,
+    BlockSchema,
+    CredentialsMetaInput,
+    SchemaField,
+)
+
+from ._config import firecrawl
+
+
+class ScrapeFormat(Enum):
+    # Formats that can be requested for a scraped page. Each member's value is
+    # the string handed to the Firecrawl SDK (the block sends `format.value`).
+    # A `#` comment is used instead of a docstring so the enum's __doc__ (and
+    # anything derived from it) stays unchanged.
+    MARKDOWN = "markdown"
+    HTML = "html"
+    RAW_HTML = "rawHtml"
+    LINKS = "links"
+    SCREENSHOT = "screenshot"
+    SCREENSHOT_FULL_PAGE = "screenshot@fullPage"
+    JSON = "json"
+    CHANGE_TRACKING = "changeTracking"
+
+
+class FirecrawlScrapeBlock(Block):
+
+    class Input(BlockSchema):
+        credentials: CredentialsMetaInput = firecrawl.credentials_field()
+        url: str = SchemaField(description="The URL to crawl")
+        limit: int = SchemaField(description="The number of pages to crawl", default=10)
+        only_main_content: bool = SchemaField(
+            description="Only return the main content of the page excluding headers, navs, footers, etc.",
+            default=True,
+        )
+        max_age: int = SchemaField(
+            description="The maximum age of the page in milliseconds - default is 1 hour",
+            default=3600000,
+        )
+        wait_for: int = SchemaField(
+            description="Specify a delay in milliseconds before fetching the content, allowing the page sufficient time to load.",
+            default=200,
+        )
+        formats: list[ScrapeFormat] = SchemaField(
+            description="The format of the crawl", default=[ScrapeFormat.MARKDOWN]
+        )
+
+    class Output(BlockSchema):
+        data: dict[str, Any] = SchemaField(description="The result of the crawl")
+        markdown: str = SchemaField(description="The markdown of the crawl")
+        html: str = SchemaField(description="The html of the crawl")
+        raw_html: str = SchemaField(description="The raw html of the crawl")
+        links: list[str] = SchemaField(description="The links of the crawl")
+        screenshot: str = SchemaField(description="The screenshot of the crawl")
+        screenshot_full_page: str = SchemaField(
+            description="The screenshot full page of the crawl"
+        )
+        json_data: dict[str, Any] = SchemaField(
+            description="The json data of the crawl"
+        )
+        change_tracking: dict[str, Any] = SchemaField(
+            description="The change tracking of the crawl"
+        )
+
+    def __init__(self):
+        super().__init__(
+            id="ac444320-cf5e-4697-b586-2604c17a3e75",
+            description="Firecrawl scrapes a website to extract comprehensive data while bypassing blockers.",
+            categories={BlockCategory.SEARCH},
+            input_schema=self.Input,
+            output_schema=self.Output,
+        )
+
+    async def run(
+        self, input_data: Input, *, credentials: APIKeyCredentials, **kwargs
+    ) -> BlockOutput:
+
+        app = FirecrawlApp(api_key=credentials.api_key.get_secret_value())
+
+        scrape_result = app.scrape_url(
+            input_data.url,
+            formats=[format.value for format in input_data.formats],
+            only_main_content=input_data.only_main_content,
+            max_age=input_data.max_age,
+            wait_for=input_data.wait_for,
+        )
+        yield "data", scrape_result
+
+        for f in input_data.formats:
+            if f == ScrapeFormat.MARKDOWN:
+                yield "markdown", scrape_result.markdown
+            elif f == ScrapeFormat.HTML:
+                yield "html", scrape_result.html
+            elif f == ScrapeFormat.RAW_HTML:
+                yield "raw_html", scrape_result.rawHtml
+            elif f == ScrapeFormat.LINKS:
+                yield "links", scrape_result.links
+            elif f == ScrapeFormat.SCREENSHOT:
+                yield "screenshot", scrape_result.screenshot
+            elif f == ScrapeFormat.SCREENSHOT_FULL_PAGE:
+                yield "screenshot_full_page", scrape_result.screenshot
+            elif f == ScrapeFormat.CHANGE_TRACKING:
+                yield "change_tracking", scrape_result.changeTracking
+            elif f == ScrapeFormat.JSON:
+                yield "json", scrape_result.json
--- a/autogpt_platform/backend/backend/blocks/firecrawl/search.py
+++ b/autogpt_platform/backend/backend/blocks/firecrawl/search.py
@@ -0,0 +1,79 @@
+from enum import Enum
+from typing import Any
+
+from firecrawl import FirecrawlApp, ScrapeOptions
+
+from backend.sdk import (
+    APIKeyCredentials,
+    Block,
+    BlockCategory,
+    BlockOutput,
+    BlockSchema,
+    CredentialsMetaInput,
+    SchemaField,
+)
+
+from ._config import firecrawl
+
+
+class ScrapeFormat(Enum):
+    # Formats that can be requested for the content of search results. Each
+    # member's value is the string handed to the Firecrawl SDK (the block
+    # sends `format.value`).
+    # A `#` comment is used instead of a docstring so the enum's __doc__ (and
+    # anything derived from it) stays unchanged.
+    MARKDOWN = "markdown"
+    HTML = "html"
+    RAW_HTML = "rawHtml"
+    LINKS = "links"
+    SCREENSHOT = "screenshot"
+    SCREENSHOT_FULL_PAGE = "screenshot@fullPage"
+    JSON = "json"
+    CHANGE_TRACKING = "changeTracking"
+
+
+class FirecrawlSearchBlock(Block):
+
+    class Input(BlockSchema):
+        credentials: CredentialsMetaInput = firecrawl.credentials_field()
+        query: str = SchemaField(description="The query to search for")
+        limit: int = SchemaField(description="The number of pages to crawl", default=10)
+        max_age: int = SchemaField(
+            description="The maximum age of the page in milliseconds - default is 1 hour",
+            default=3600000,
+        )
+        wait_for: int = SchemaField(
+            description="Specify a delay in milliseconds before fetching the content, allowing the page sufficient time to load.",
+            default=200,
+        )
+        formats: list[ScrapeFormat] = SchemaField(
+            description="Returns the content of the search if specified", default=[]
+        )
+
+    class Output(BlockSchema):
+        data: dict[str, Any] = SchemaField(description="The result of the search")
+        site: dict[str, Any] = SchemaField(description="The site of the search")
+
+    def __init__(self):
+        super().__init__(
+            id="f8d2f28d-b3a1-405b-804e-418c087d288b",
+            description="Firecrawl searches the web for the given query.",
+            categories={BlockCategory.SEARCH},
+            input_schema=self.Input,
+            output_schema=self.Output,
+        )
+
+    async def run(
+        self, input_data: Input, *, credentials: APIKeyCredentials, **kwargs
+    ) -> BlockOutput:
+
+        app = FirecrawlApp(api_key=credentials.api_key.get_secret_value())
+
+        # Sync call
+        scrape_result = app.search(
+            input_data.query,
+            limit=input_data.limit,
+            scrape_options=ScrapeOptions(
+                formats=[format.value for format in input_data.formats],
+                maxAge=input_data.max_age,
+                waitFor=input_data.wait_for,
+            ),
+        )
+        yield "data", scrape_result
+        for site in scrape_result.data:
+            yield "site", site
--- a/autogpt_platform/backend/poetry.lock
+++ b/autogpt_platform/backend/poetry.lock
@@ -1212,6 +1212,26 @@ files = [
 [package.dependencies]
 packaging = ">=20"

+[[package]]
+name = "firecrawl-py"
+version = "2.16.3"
+description = "Python SDK for Firecrawl API"
+optional = false
+python-versions = ">=3.8"
+groups = ["main"]
+files = [
+    {file = "firecrawl_py-2.16.3-py3-none-any.whl", hash = "sha256:94bb46af5e0df6c8ec414ac999a5355c0f5a46f15fd1cf5a02a3b31062db0aa8"},
+    {file = "firecrawl_py-2.16.3.tar.gz", hash = "sha256:5fd063ef4acc4c4be62648f1e11467336bc127780b3afc28d39078a012e6a14c"},
+]
+
+[package.dependencies]
+aiohttp = "*"
+nest-asyncio = "*"
+pydantic = "*"
+python-dotenv = "*"
+requests = "*"
+websockets = "*"
+
 [[package]]
 name = "flake8"
 version = "7.3.0"
@@ -2902,6 +2922,18 @@ files = [
    {file = "mypy_extensions-1.1.0.tar.gz", hash = "sha256:52e68efc3284861e772bbcd66823fde5ae21fd2fdb51c62a211403730b916558"},
 ]

+[[package]]
+name = "nest-asyncio"
+version = "1.6.0"
+description = "Patch asyncio to allow nested event loops"
+optional = false
+python-versions = ">=3.5"
+groups = ["main"]
+files = [
+    {file = "nest_asyncio-1.6.0-py3-none-any.whl", hash = "sha256:87af6efd6b5e897c81050477ef65c62e2b2f35d51703cae01aff2905b1852e1c"},
+    {file = "nest_asyncio-1.6.0.tar.gz", hash = "sha256:6f172d5449aca15afd6c646851f4e31e02c598d553a667e38cafa997cfec55fe"},
+]
+
 [[package]]
 name = "nodeenv"
 version = "1.9.1"
@@ -6686,4 +6718,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.1"
 python-versions = ">=3.10,<3.13"
-content-hash = "e79fd4e1968b496b1012c4866130f5680fc6558d041ddddd53f466e8ec58869c"
+content-hash = "225ddae645d22cc57f46330e735c069fb52e708123aa642e74adbf077dda0796"
--- a/autogpt_platform/backend/pyproject.toml
+++ b/autogpt_platform/backend/pyproject.toml
@@ -74,6 +74,7 @@ aioclamd = "^1.0.0"
 setuptools = "^80.9.0"
 gcloud-aio-storage = "^9.5.0"
 pandas = "^2.3.1"
+firecrawl-py = "^2.16.3"

 [tool.poetry.group.dev.dependencies]
 aiohappyeyeballs = "^2.6.1"