feat(forge): add lightweight web fetch component

Add WebFetchComponent for fast HTTP-based page fetching without browser
overhead. Uses trafilatura for intelligent content extraction.

Commands:
- fetch_webpage: Extract main content as text/markdown/xml
  - Removes navigation, ads, boilerplate automatically
  - Extracts page metadata (title, description, author, date)
  - Extracts and lists page links
  - Much faster than Selenium-based read_webpage

- fetch_raw_html: Get raw HTML for structure inspection
  - Optional truncation for large pages

Features:
- Trafilatura-powered content extraction (best-in-class accuracy)
- Automatic link extraction with relative URL resolution
- Page metadata extraction (OG tags, meta tags)
- Configurable timeout, max content length, max links
- Proper error handling for timeouts and HTTP errors
- 19 comprehensive tests

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Nicholas Tindle
2026-01-19 01:04:22 -06:00
parent e0784f8f6b
commit cda9572acd
4 changed files with 804 additions and 5 deletions

View File

@@ -0,0 +1,291 @@
"""Tests for the web fetch component."""
from unittest.mock import MagicMock, patch
import httpx
import pytest
from forge.utils.exceptions import CommandExecutionError
from .web_fetch import WebFetchComponent, WebFetchConfiguration
@pytest.fixture
def web_fetch_component():
    """Provide a WebFetchComponent built with an all-defaults configuration."""
    return WebFetchComponent(WebFetchConfiguration())
@pytest.fixture
def sample_html():
    """Sample HTML for testing.

    A small but realistic page: <head> metadata (title, description, author),
    a <main>/<article> body containing one relative and one absolute link, and
    <nav>/<footer>/<script> chrome that content extraction should strip.
    """
    return """
<!DOCTYPE html>
<html>
<head>
<title>Test Page Title</title>
<meta name="description" content="Test page description">
<meta name="author" content="Test Author">
</head>
<body>
<nav>Navigation content to be removed</nav>
<main>
<article>
<h1>Main Article Title</h1>
<p>This is the main content of the article.</p>
<p>It contains multiple paragraphs with important information.</p>
<a href="/relative-link">Relative Link</a>
<a href="https://example.com/absolute">Absolute Link</a>
</article>
</main>
<footer>Footer content</footer>
<script>console.log('script to remove');</script>
</body>
</html>
"""
class TestFetchWebpage:
    """Test fetch_webpage command."""

    @staticmethod
    def _patch_get(mocker, component, response):
        """Swap the component's HTTP client GET for a canned response."""
        mocker.patch.object(component.client, "get", return_value=response)

    def test_fetch_webpage_extracts_content(
        self, mocker, web_fetch_component, sample_html
    ):
        canned = MagicMock(text=sample_html, headers={"content-length": "1000"})
        self._patch_get(mocker, web_fetch_component, canned)

        result = web_fetch_component.fetch_webpage(
            "https://example.com/test", include_links=False
        )

        assert "Main Article Title" in result or "main content" in result.lower()
        assert "Title:" in result  # Metadata included

    def test_fetch_webpage_extracts_metadata(
        self, mocker, web_fetch_component, sample_html
    ):
        canned = MagicMock(text=sample_html, headers={})
        self._patch_get(mocker, web_fetch_component, canned)

        result = web_fetch_component.fetch_webpage(
            "https://example.com/test", include_links=False
        )

        assert "Test Page Title" in result
        assert "Test page description" in result or "Description" in result

    def test_fetch_webpage_extracts_links(
        self, mocker, web_fetch_component, sample_html
    ):
        canned = MagicMock(text=sample_html, headers={})
        self._patch_get(mocker, web_fetch_component, canned)

        result = web_fetch_component.fetch_webpage(
            "https://example.com/test", include_links=True
        )

        assert "Links" in result
        assert "example.com" in result

    def test_fetch_webpage_handles_timeout(self, mocker, web_fetch_component):
        mocker.patch.object(
            web_fetch_component.client,
            "get",
            side_effect=httpx.TimeoutException("Timeout"),
        )

        with pytest.raises(CommandExecutionError) as exc_info:
            web_fetch_component.fetch_webpage("https://example.com/slow")

        assert "timed out" in str(exc_info.value).lower()

    def test_fetch_webpage_handles_http_error(self, mocker, web_fetch_component):
        not_found = MagicMock(status_code=404, reason_phrase="Not Found")
        mocker.patch.object(
            web_fetch_component.client,
            "get",
            side_effect=httpx.HTTPStatusError(
                "Not Found", request=MagicMock(), response=not_found
            ),
        )

        with pytest.raises(CommandExecutionError) as exc_info:
            web_fetch_component.fetch_webpage("https://example.com/missing")

        assert "404" in str(exc_info.value)

    def test_fetch_webpage_respects_max_content_length(
        self, mocker, web_fetch_component
    ):
        oversized = MagicMock(headers={"content-length": "999999999"})
        self._patch_get(mocker, web_fetch_component, oversized)

        with pytest.raises(CommandExecutionError) as exc_info:
            web_fetch_component.fetch_webpage("https://example.com/huge")

        assert "too large" in str(exc_info.value).lower()
class TestFetchRawHtml:
    """Test fetch_raw_html command."""

    def test_fetch_raw_html_returns_html(
        self, mocker, web_fetch_component, sample_html
    ):
        canned = MagicMock(text=sample_html, headers={})
        mocker.patch.object(
            web_fetch_component.client, "get", return_value=canned
        )

        result = web_fetch_component.fetch_raw_html("https://example.com/test")

        assert "<!DOCTYPE html>" in result
        assert "<title>Test Page Title</title>" in result
        assert "<script>" in result  # Raw HTML includes scripts

    def test_fetch_raw_html_truncates_long_content(self, mocker, web_fetch_component):
        padded = "<html>" + "x" * 100000 + "</html>"
        canned = MagicMock(text=padded, headers={})
        mocker.patch.object(
            web_fetch_component.client, "get", return_value=canned
        )

        result = web_fetch_component.fetch_raw_html(
            "https://example.com/long", max_length=1000
        )

        assert len(result) < len(padded)
        assert "truncated" in result.lower()
class TestLinkExtraction:
    """Test link extraction functionality."""

    BASE = "https://base.com"

    def test_extracts_absolute_links(self, web_fetch_component):
        snippet = '<a href="https://example.com/page">Link Text</a>'
        extracted = web_fetch_component._extract_links(snippet, self.BASE)

        assert len(extracted) == 1
        assert "https://example.com/page" in extracted[0]
        assert "Link Text" in extracted[0]

    def test_resolves_relative_links(self, web_fetch_component):
        snippet = '<a href="/relative/path">Relative</a>'
        extracted = web_fetch_component._extract_links(snippet, self.BASE)

        assert len(extracted) == 1
        assert "https://base.com/relative/path" in extracted[0]

    def test_skips_javascript_links(self, web_fetch_component):
        snippet = """
<a href="javascript:void(0)">JS Link</a>
<a href="mailto:test@example.com">Email</a>
<a href="tel:+1234567890">Phone</a>
<a href="#section">Anchor</a>
<a href="https://real.com">Real Link</a>
"""
        extracted = web_fetch_component._extract_links(snippet, self.BASE)

        assert len(extracted) == 1
        assert "real.com" in extracted[0]

    def test_respects_max_links(self, web_fetch_component):
        web_fetch_component.config.max_links = 3
        anchors = [
            f'<a href="https://example.com/{i}">Link {i}</a>' for i in range(10)
        ]
        extracted = web_fetch_component._extract_links("".join(anchors), self.BASE)

        assert len(extracted) == 3
class TestMetadataExtraction:
    """Test metadata extraction functionality."""

    def test_extracts_title(self, web_fetch_component):
        markup = "<html><head><title>Page Title</title></head></html>"
        meta = web_fetch_component._extract_metadata(markup)
        assert meta.get("title") == "Page Title"

    def test_extracts_description(self, web_fetch_component):
        markup = '<html><head><meta name="description" content="Page desc"></head></html>'
        meta = web_fetch_component._extract_metadata(markup)
        assert meta.get("description") == "Page desc"

    def test_extracts_author(self, web_fetch_component):
        markup = '<html><head><meta name="author" content="John Doe"></head></html>'
        meta = web_fetch_component._extract_metadata(markup)
        assert meta.get("author") == "John Doe"

    def test_extracts_og_metadata(self, web_fetch_component):
        markup = """
<html><head>
<meta property="og:title" content="OG Title">
<meta property="og:description" content="OG Description">
</head></html>
"""
        meta = web_fetch_component._extract_metadata(markup)
        assert meta.get("og_title") == "OG Title"
        assert meta.get("og_description") == "OG Description"
class TestConfiguration:
    """Test configuration handling."""

    def test_commands_available(self, web_fetch_component):
        names = [cmd.names[0] for cmd in web_fetch_component.get_commands()]
        assert "fetch_webpage" in names
        assert "fetch_raw_html" in names

    def test_resources_provided(self, web_fetch_component):
        resources = list(web_fetch_component.get_resources())
        assert len(resources) == 1
        lowered = resources[0].lower()
        assert "fetch" in lowered or "web" in lowered

    def test_custom_config(self):
        custom = WebFetchConfiguration(
            timeout=60,
            max_content_length=5_000_000,
            max_links=100,
        )
        component = WebFetchComponent(custom)

        assert component.config.timeout == 60
        assert component.config.max_content_length == 5_000_000
        assert component.config.max_links == 100

View File

@@ -0,0 +1,338 @@
"""
Lightweight web page fetching component.
Provides fast HTTP-based page fetching without browser overhead.
Uses trafilatura for intelligent content extraction.
"""
import logging
from typing import Iterator, Literal, Optional
from urllib.parse import urljoin, urlparse
import httpx
import trafilatura
from bs4 import BeautifulSoup
from pydantic import BaseModel
from forge.agent.components import ConfigurableComponent
from forge.agent.protocols import CommandProvider, DirectiveProvider
from forge.command import Command, command
from forge.models.json_schema import JSONSchema
from forge.utils.exceptions import CommandExecutionError
from forge.utils.url_validator import validate_url
logger = logging.getLogger(__name__)
# Default headers to mimic a browser. Some sites reject or degrade responses
# for clients that do not present a typical browser header set, so we send a
# plausible desktop-Chrome profile.
DEFAULT_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate",
    "DNT": "1",  # Do Not Track
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
}
class WebFetchConfiguration(BaseModel):
    """Configuration for the web fetch component.

    All fields have conservative defaults; override by constructing the
    model with keyword arguments (standard pydantic behavior).
    """

    timeout: int = 30
    """Request timeout in seconds"""
    max_content_length: int = 10_000_000  # 10MB
    """Maximum content length to download"""
    follow_redirects: bool = True
    """Whether to follow HTTP redirects"""
    extract_links: bool = True
    """Whether to extract links from pages"""
    max_links: int = 50
    """Maximum number of links to return"""
    include_metadata: bool = True
    """Whether to include page metadata (title, description, etc.)"""
class WebFetchComponent(
    DirectiveProvider, CommandProvider, ConfigurableComponent[WebFetchConfiguration]
):
    """
    Lightweight web page fetching component.

    Provides fast HTTP-based page fetching without browser overhead.
    Uses trafilatura for intelligent main content extraction.
    """

    config_class = WebFetchConfiguration

    def __init__(self, config: Optional[WebFetchConfiguration] = None):
        ConfigurableComponent.__init__(self, config)
        # Created lazily so constructing the component never opens sockets.
        self._client: Optional[httpx.Client] = None

    @property
    def client(self) -> httpx.Client:
        """Lazy-loaded HTTP client (created on first use, then reused)."""
        if self._client is None:
            self._client = httpx.Client(
                headers=DEFAULT_HEADERS,
                timeout=self.config.timeout,
                follow_redirects=self.config.follow_redirects,
            )
        return self._client

    def get_resources(self) -> Iterator[str]:
        yield "Ability to fetch and extract content from web pages."

    def get_commands(self) -> Iterator[Command]:
        yield self.fetch_webpage
        yield self.fetch_raw_html

    def _fetch_url(self, url: str) -> httpx.Response:
        """Fetch a URL and return the response.

        Raises:
            CommandExecutionError: on timeout, transport failure, non-2xx
                status, or when the content exceeds ``max_content_length``.
        """
        try:
            response = self.client.get(url)
            response.raise_for_status()
        except httpx.TimeoutException:
            raise CommandExecutionError(
                f"Request timed out after {self.config.timeout} seconds"
            )
        except httpx.HTTPStatusError as e:
            raise CommandExecutionError(
                f"HTTP {e.response.status_code}: {e.response.reason_phrase}"
            )
        except httpx.RequestError as e:
            raise CommandExecutionError(f"Request failed: {e}")

        # Enforce the size limit. Prefer the Content-Length header when it is
        # present and well-formed, but fall back to the size of the body we
        # actually received: the header is optional and untrusted, and a
        # malformed value must not escape as a bare ValueError.
        # NOTE: client.get() has already downloaded the full body at this
        # point, so this guard protects downstream parsing and memory use,
        # not bandwidth.
        size: Optional[int] = None
        content_length = response.headers.get("content-length")
        if content_length:
            try:
                size = int(content_length)
            except ValueError:
                size = None  # Malformed header: ignore it, use actual size
        if size is None:
            size = len(response.content)
        if size > self.config.max_content_length:
            raise CommandExecutionError(
                f"Content too large: {size} bytes "
                f"(max: {self.config.max_content_length})"
            )
        return response

    def _extract_links(self, html: str, base_url: str) -> list[str]:
        """Extract up to ``config.max_links`` hyperlinks from HTML content.

        Args:
            html: Raw HTML to scan for ``<a href>`` elements.
            base_url: Base URL used to resolve relative hrefs.

        Returns:
            Entries formatted as ``"<link text>: <absolute url>"``; only
            http(s) targets are included.
        """
        soup = BeautifulSoup(html, "html.parser")
        links: list[str] = []
        for a in soup.find_all("a", href=True):
            href = a["href"]
            # Skip non-navigable schemes and in-page anchors
            if href.startswith(("javascript:", "mailto:", "tel:", "#")):
                continue
            # Resolve relative URLs against the page URL
            absolute_url = urljoin(base_url, href)
            # Only include http(s) links
            if absolute_url.startswith(("http://", "https://")):
                # Cap link text at 100 chars to keep the listing compact
                text = a.get_text(strip=True)[:100] or "[no text]"
                links.append(f"{text}: {absolute_url}")
                if len(links) >= self.config.max_links:
                    break
        return links

    def _extract_metadata(self, html: str) -> dict[str, str]:
        """Extract page metadata from HTML.

        Looks for: <title>, meta description, Open Graph title/description,
        meta author, and a published date under several common attribute
        spellings. Only keys that were actually found are present in the
        returned dict.
        """
        soup = BeautifulSoup(html, "html.parser")
        metadata: dict[str, str] = {}
        # Title
        title_tag = soup.find("title")
        if title_tag:
            metadata["title"] = title_tag.get_text(strip=True)
        # Meta description
        desc = soup.find("meta", attrs={"name": "description"})
        if desc and desc.get("content"):
            metadata["description"] = desc["content"]
        # Open Graph title/description
        og_title = soup.find("meta", attrs={"property": "og:title"})
        if og_title and og_title.get("content"):
            metadata["og_title"] = og_title["content"]
        og_desc = soup.find("meta", attrs={"property": "og:description"})
        if og_desc and og_desc.get("content"):
            metadata["og_description"] = og_desc["content"]
        # Author
        author = soup.find("meta", attrs={"name": "author"})
        if author and author.get("content"):
            metadata["author"] = author["content"]
        # Published date: first match wins across common attribute spellings
        for attr in ["article:published_time", "datePublished", "date"]:
            date_tag = soup.find("meta", attrs={"property": attr}) or soup.find(
                "meta", attrs={"name": attr}
            )
            if date_tag and date_tag.get("content"):
                metadata["published"] = date_tag["content"]
                break
        return metadata

    @command(
        ["fetch_webpage", "fetch", "download_page"],
        "Fetch a webpage and extract its main content as clean text. "
        "Much faster than read_webpage (no browser needed).",
        {
            "url": JSONSchema(
                type=JSONSchema.Type.STRING,
                description="The URL to fetch",
                required=True,
            ),
            "output_format": JSONSchema(
                type=JSONSchema.Type.STRING,
                description=(
                    "Output format: 'text' (plain text), 'markdown' (with formatting), "
                    "or 'xml' (structured). Default: 'markdown'"
                ),
                required=False,
            ),
            "include_links": JSONSchema(
                type=JSONSchema.Type.BOOLEAN,
                description="Whether to include extracted links. Default: true",
                required=False,
            ),
            "include_comments": JSONSchema(
                type=JSONSchema.Type.BOOLEAN,
                description="Whether to include page comments. Default: false",
                required=False,
            ),
        },
    )
    @validate_url
    def fetch_webpage(
        self,
        url: str,
        output_format: Literal["text", "markdown", "xml"] = "markdown",
        include_links: bool = True,
        include_comments: bool = False,
    ) -> str:
        """
        Fetch a webpage and extract its main content.

        Uses trafilatura for intelligent content extraction - automatically
        removes navigation, ads, boilerplate, and extracts the main article
        text. Falls back to a plain BeautifulSoup text dump when trafilatura
        finds nothing.

        Args:
            url: The URL to fetch
            output_format: Output format (text, markdown, xml)
            include_links: Whether to include links from the page
            include_comments: Whether to include comments section

        Returns:
            Extracted content with optional metadata and links

        Raises:
            CommandExecutionError: if the page cannot be fetched or is too
                large (see ``_fetch_url``).
        """
        response = self._fetch_url(url)
        html = response.text

        # Extract main content using trafilatura
        extract_kwargs = {
            "include_comments": include_comments,
            # Inline links only make sense for markdown output
            "include_links": output_format == "markdown",
            "include_images": False,
            "include_tables": True,
            # NOTE(review): "no_fallback" is the pre-2.0 trafilatura spelling
            # (newer releases prefer fast=...) - confirm against the pinned
            # trafilatura version before changing.
            "no_fallback": False,
        }
        if output_format == "markdown":
            content = trafilatura.extract(
                html, output_format="markdown", **extract_kwargs
            )
        elif output_format == "xml":
            content = trafilatura.extract(html, output_format="xml", **extract_kwargs)
        else:
            content = trafilatura.extract(html, **extract_kwargs)

        if not content:
            # Fallback to basic BeautifulSoup extraction: strip obvious
            # non-content tags and take the remaining visible text.
            soup = BeautifulSoup(html, "html.parser")
            for tag in soup(["script", "style", "nav", "footer", "header"]):
                tag.decompose()
            content = soup.get_text(separator="\n", strip=True)
        if not content:
            return "Could not extract content from this page."

        # Build output
        output_parts = []

        # Add metadata
        if self.config.include_metadata:
            metadata = self._extract_metadata(html)
            if metadata:
                meta_lines = []
                if "title" in metadata:
                    meta_lines.append(f"**Title:** {metadata['title']}")
                if "description" in metadata:
                    meta_lines.append(f"**Description:** {metadata['description']}")
                if "author" in metadata:
                    meta_lines.append(f"**Author:** {metadata['author']}")
                if "published" in metadata:
                    meta_lines.append(f"**Published:** {metadata['published']}")
                if meta_lines:
                    output_parts.append("## Page Info\n" + "\n".join(meta_lines))

        # Add main content
        output_parts.append(f"## Content\n{content}")

        # Add links
        if include_links and self.config.extract_links:
            links = self._extract_links(html, url)
            if links:
                links_text = "\n".join(f"- {link}" for link in links)
                output_parts.append(f"## Links ({len(links)})\n{links_text}")

        return "\n\n".join(output_parts)

    @command(
        ["fetch_raw_html", "get_html"],
        "Fetch a webpage and return the raw HTML. Use this when you need "
        "to inspect the page structure or extract specific elements.",
        {
            "url": JSONSchema(
                type=JSONSchema.Type.STRING,
                description="The URL to fetch",
                required=True,
            ),
            "max_length": JSONSchema(
                type=JSONSchema.Type.INTEGER,
                description="Maximum characters to return. Default: 50000",
                required=False,
            ),
        },
    )
    @validate_url
    def fetch_raw_html(self, url: str, max_length: int = 50000) -> str:
        """
        Fetch a webpage and return the raw HTML.

        Args:
            url: The URL to fetch
            max_length: Maximum characters to return

        Returns:
            Raw HTML content (truncated with a trailing note if necessary)

        Raises:
            CommandExecutionError: if the page cannot be fetched (see
                ``_fetch_url``).
        """
        response = self._fetch_url(url)
        html = response.text
        if len(html) > max_length:
            return html[:max_length] + f"\n\n... [truncated, {len(html)} total chars]"
        return html

View File

@@ -322,6 +322,21 @@ files = [
{file = "attrs-25.4.0.tar.gz", hash = "sha256:16d5969b87f0859ef33a48b35d55ac1be6e42ae49d5e853b597db70c35c57e11"},
]
[[package]]
name = "babel"
version = "2.17.0"
description = "Internationalization utilities"
optional = false
python-versions = ">=3.8"
groups = ["main"]
files = [
{file = "babel-2.17.0-py3-none-any.whl", hash = "sha256:4d0b53093fdfb4b21c92b5213dba5a1b23885afa8383709427046b21c366e5f2"},
{file = "babel-2.17.0.tar.gz", hash = "sha256:0c54cffb19f690cdcc52a3b50bcbf71e07a808d1c80d549f2459b9d2cf0afb9d"},
]
[package.extras]
dev = ["backports.zoneinfo ; python_version < \"3.9\"", "freezegun (>=1.0,<2.0)", "jinja2 (>=3.0)", "pytest (>=6.0)", "pytest-cov", "pytz", "setuptools", "tzdata ; sys_platform == \"win32\""]
[[package]]
name = "backoff"
version = "2.2.1"
@@ -1715,6 +1730,26 @@ mypy = ["bokeh", "contourpy[bokeh,docs]", "docutils-stubs", "mypy (==1.15.0)", "
test = ["Pillow", "contourpy[test-no-images]", "matplotlib"]
test-no-images = ["pytest", "pytest-cov", "pytest-rerunfailures", "pytest-xdist", "wurlitzer"]
[[package]]
name = "courlan"
version = "1.3.2"
description = "Clean, filter and sample URLs to optimize data collection includes spam, content type and language filters."
optional = false
python-versions = ">=3.8"
groups = ["main"]
files = [
{file = "courlan-1.3.2-py3-none-any.whl", hash = "sha256:d0dab52cf5b5b1000ee2839fbc2837e93b2514d3cb5bb61ae158a55b7a04c6be"},
{file = "courlan-1.3.2.tar.gz", hash = "sha256:0b66f4db3a9c39a6e22dd247c72cfaa57d68ea660e94bb2c84ec7db8712af190"},
]
[package.dependencies]
babel = ">=2.16.0"
tld = ">=0.13"
urllib3 = ">=1.26,<3"
[package.extras]
dev = ["black", "flake8", "mypy", "pytest", "pytest-cov", "types-urllib3"]
[[package]]
name = "coverage"
version = "7.13.1"
@@ -1883,6 +1918,29 @@ files = [
{file = "cymem-2.0.11.tar.gz", hash = "sha256:efe49a349d4a518be6b6c6b255d4a80f740a341544bde1a807707c058b88d0bd"},
]
[[package]]
name = "dateparser"
version = "1.2.2"
description = "Date parsing library designed to parse dates from HTML pages"
optional = false
python-versions = ">=3.8"
groups = ["main"]
files = [
{file = "dateparser-1.2.2-py3-none-any.whl", hash = "sha256:5a5d7211a09013499867547023a2a0c91d5a27d15dd4dbcea676ea9fe66f2482"},
{file = "dateparser-1.2.2.tar.gz", hash = "sha256:986316f17cb8cdc23ea8ce563027c5ef12fc725b6fb1d137c14ca08777c5ecf7"},
]
[package.dependencies]
python-dateutil = ">=2.7.0"
pytz = ">=2024.2"
regex = ">=2024.9.11"
tzlocal = ">=0.2"
[package.extras]
calendars = ["convertdate (>=2.2.1)", "hijridate"]
fasttext = ["fasttext (>=0.9.1)", "numpy (>=1.19.3,<2)"]
langdetect = ["langdetect (>=1.0.0)"]
[[package]]
name = "ddgs"
version = "9.10.0"
@@ -3063,6 +3121,30 @@ files = [
{file = "hpack-4.1.0.tar.gz", hash = "sha256:ec5eca154f7056aa06f196a557655c5b009b382873ac8d1e66e79e87535f1dca"},
]
[[package]]
name = "htmldate"
version = "1.9.4"
description = "Fast and robust extraction of original and updated publication dates from URLs and web pages."
optional = false
python-versions = ">=3.8"
groups = ["main"]
files = [
{file = "htmldate-1.9.4-py3-none-any.whl", hash = "sha256:1b94bcc4e08232a5b692159903acf95548b6a7492dddca5bb123d89d6325921c"},
{file = "htmldate-1.9.4.tar.gz", hash = "sha256:1129063e02dd0354b74264de71e950c0c3fcee191178321418ccad2074cc8ed0"},
]
[package.dependencies]
charset_normalizer = ">=3.4.0"
dateparser = ">=1.1.2"
lxml = {version = ">=5.3.0", markers = "platform_system != \"Darwin\" or python_version > \"3.8\""}
python-dateutil = ">=2.9.0.post0"
urllib3 = ">=1.26,<3"
[package.extras]
all = ["htmldate[dev]", "htmldate[speed]"]
dev = ["black", "flake8", "mypy", "pytest", "pytest-cov", "types-dateparser", "types-lxml", "types-python-dateutil", "types-urllib3"]
speed = ["backports-datetime-fromisoformat ; python_version < \"3.11\"", "faust-cchardet (>=2.1.19)", "urllib3[brotli]"]
[[package]]
name = "httpcore"
version = "1.0.8"
@@ -3605,6 +3687,21 @@ files = [
[package.dependencies]
referencing = ">=0.31.0"
[[package]]
name = "justext"
version = "3.0.2"
description = "Heuristic based boilerplate removal tool"
optional = false
python-versions = "*"
groups = ["main"]
files = [
{file = "justext-3.0.2-py2.py3-none-any.whl", hash = "sha256:62b1c562b15c3c6265e121cc070874243a443bfd53060e869393f09d6b6cc9a7"},
{file = "justext-3.0.2.tar.gz", hash = "sha256:13496a450c44c4cd5b5a75a5efcd9996066d2a189794ea99a49949685a0beb05"},
]
[package.dependencies]
lxml = {version = ">=4.4.2", extras = ["html-clean"]}
[[package]]
name = "kiwisolver"
version = "1.4.9"
@@ -3946,12 +4043,30 @@ files = [
{file = "lxml-6.0.2.tar.gz", hash = "sha256:cd79f3367bd74b317dda655dc8fcfa304d9eb6e4fb06b7168c5cf27f96e0cd62"},
]
[package.dependencies]
lxml_html_clean = {version = "*", optional = true, markers = "extra == \"html-clean\""}
[package.extras]
cssselect = ["cssselect (>=0.7)"]
html-clean = ["lxml_html_clean"]
html5 = ["html5lib"]
htmlsoup = ["BeautifulSoup4"]
[[package]]
name = "lxml-html-clean"
version = "0.4.3"
description = "HTML cleaner from lxml project"
optional = false
python-versions = "*"
groups = ["main"]
files = [
{file = "lxml_html_clean-0.4.3-py3-none-any.whl", hash = "sha256:63fd7b0b9c3a2e4176611c2ca5d61c4c07ffca2de76c14059a81a2825833731e"},
{file = "lxml_html_clean-0.4.3.tar.gz", hash = "sha256:c9df91925b00f836c807beab127aac82575110eacff54d0a75187914f1bd9d8c"},
]
[package.dependencies]
lxml = "*"
[[package]]
name = "markdown-it-py"
version = "4.0.0"
@@ -6524,10 +6639,9 @@ dev = ["atomicwrites (==1.2.1)", "attrs (==19.2.0)", "coverage (==6.5.0)", "hatc
name = "pytz"
version = "2025.2"
description = "World timezone definitions, modern and historical"
optional = true
optional = false
python-versions = "*"
groups = ["main"]
markers = "extra == \"benchmark\""
files = [
{file = "pytz-2025.2-py2.py3-none-any.whl", hash = "sha256:5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00"},
{file = "pytz-2025.2.tar.gz", hash = "sha256:360b9e3dbb49a209c21ad61809c7fb453643e048b38924c765813546746e81c3"},
@@ -7906,6 +8020,18 @@ requests = ">=2.26.0"
[package.extras]
blobfile = ["blobfile (>=2)"]
[[package]]
name = "tld"
version = "0.13.1"
description = "Extract the top-level domain (TLD) from the URL given."
optional = false
python-versions = ">=3.7"
groups = ["main"]
files = [
{file = "tld-0.13.1-py2.py3-none-any.whl", hash = "sha256:a2d35109433ac83486ddf87e3c4539ab2c5c2478230e5d9c060a18af4b03aa7c"},
{file = "tld-0.13.1.tar.gz", hash = "sha256:75ec00936cbcf564f67361c41713363440b6c4ef0f0c1592b5b0fbe72c17a350"},
]
[[package]]
name = "tokenizers"
version = "0.22.2"
@@ -7982,6 +8108,31 @@ notebook = ["ipywidgets (>=6)"]
slack = ["slack-sdk"]
telegram = ["requests"]
[[package]]
name = "trafilatura"
version = "2.0.0"
description = "Python & Command-line tool to gather text and metadata on the Web: Crawling, scraping, extraction, output as CSV, JSON, HTML, MD, TXT, XML."
optional = false
python-versions = ">=3.8"
groups = ["main"]
files = [
{file = "trafilatura-2.0.0-py3-none-any.whl", hash = "sha256:77eb5d1e993747f6f20938e1de2d840020719735690c840b9a1024803a4cd51d"},
{file = "trafilatura-2.0.0.tar.gz", hash = "sha256:ceb7094a6ecc97e72fea73c7dba36714c5c5b577b6470e4520dca893706d6247"},
]
[package.dependencies]
certifi = "*"
charset_normalizer = ">=3.4.0"
courlan = ">=1.3.2"
htmldate = ">=1.9.2"
justext = ">=3.0.1"
lxml = {version = ">=5.3.0", markers = "platform_system != \"Darwin\" or python_version > \"3.8\""}
urllib3 = ">=1.26,<3"
[package.extras]
all = ["brotli", "cchardet (>=2.1.7) ; python_version < \"3.11\"", "faust-cchardet (>=2.1.19) ; python_version >= \"3.11\"", "htmldate[speed] (>=1.9.2)", "py3langid (>=0.3.0)", "pycurl (>=7.45.3)", "urllib3[socks]", "zstandard (>=0.23.0)"]
dev = ["flake8", "mypy", "pytest", "pytest-cov", "types-lxml", "types-urllib3"]
[[package]]
name = "traitlets"
version = "5.14.3"
@@ -8221,15 +8372,33 @@ typing-extensions = ">=4.12.0"
name = "tzdata"
version = "2025.3"
description = "Provider of IANA time zone data"
optional = true
optional = false
python-versions = ">=2"
groups = ["main"]
markers = "extra == \"benchmark\""
markers = "extra == \"benchmark\" or platform_system == \"Windows\""
files = [
{file = "tzdata-2025.3-py2.py3-none-any.whl", hash = "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1"},
{file = "tzdata-2025.3.tar.gz", hash = "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7"},
]
[[package]]
name = "tzlocal"
version = "5.3.1"
description = "tzinfo object for the local timezone"
optional = false
python-versions = ">=3.9"
groups = ["main"]
files = [
{file = "tzlocal-5.3.1-py3-none-any.whl", hash = "sha256:eb1a66c3ef5847adf7a834f1be0800581b683b5608e74f86ecbcef8ab91bb85d"},
{file = "tzlocal-5.3.1.tar.gz", hash = "sha256:cceffc7edecefea1f595541dbd6e990cb1ea3d19bf01b2809f362a03dd7921fd"},
]
[package.dependencies]
tzdata = {version = "*", markers = "platform_system == \"Windows\""}
[package.extras]
devenv = ["check-manifest", "pytest (>=4.3)", "pytest-cov", "pytest-mock (>=3.3)", "zest.releaser"]
[[package]]
name = "uritemplate"
version = "4.2.0"
@@ -9022,4 +9191,4 @@ benchmark = ["agbenchmark"]
[metadata]
lock-version = "2.1"
python-versions = "^3.12"
content-hash = "9388cd65752a0db5c42052e312a7df7b0f535256b2c27a9d9209d5118a418a25"
content-hash = "7de5064cccbd74f3bc85ded976c37b249f7da997d6ef5b5853240a57daae7695"

View File

@@ -37,6 +37,7 @@ colorama = "^0.4.6"
demjson3 = "^3.0.0"
docker = "*"
ddgs = "^9.9"
trafilatura = "^2.0"
en-core-web-sm = { url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl" }
fastapi = "^0.109.1"
gitpython = "^3.1.32"