feat(forge): add lightweight web fetch component

Add WebFetchComponent for fast HTTP-based page fetching without browser
overhead. Uses trafilatura for intelligent content extraction.

Commands:
- fetch_webpage: Extract main content as text/markdown/xml
  - Removes navigation, ads, boilerplate automatically
  - Extracts page metadata (title, description, author, date)
  - Extracts and lists page links
  - Much faster than Selenium-based read_webpage

- fetch_raw_html: Get raw HTML for structure inspection
  - Optional truncation for large pages

Features:
- Trafilatura-powered content extraction (best-in-class accuracy)
- Automatic link extraction with relative URL resolution
- Page metadata extraction (OG tags, meta tags)
- Configurable timeout, max content length, max links
- Proper error handling for timeouts and HTTP errors
- 19 comprehensive tests

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Nicholas Tindle
2026-01-19 01:04:22 -06:00
parent e0784f8f6b
commit cda9572acd
4 changed files with 804 additions and 5 deletions

View File

@@ -0,0 +1,291 @@
"""Tests for the web fetch component."""
from unittest.mock import MagicMock, patch
import httpx
import pytest
from forge.utils.exceptions import CommandExecutionError
from .web_fetch import WebFetchComponent, WebFetchConfiguration
@pytest.fixture
def web_fetch_component():
    """Provide a WebFetchComponent built with an all-defaults configuration."""
    return WebFetchComponent(WebFetchConfiguration())
@pytest.fixture
def sample_html():
    """Sample HTML for testing.

    A small but realistic page: <head> metadata (title, description, author),
    a <main>/<article> body containing one relative and one absolute link, and
    <nav>/<footer>/<script> chrome that content extraction should strip.
    """
    return """
<!DOCTYPE html>
<html>
<head>
<title>Test Page Title</title>
<meta name="description" content="Test page description">
<meta name="author" content="Test Author">
</head>
<body>
<nav>Navigation content to be removed</nav>
<main>
<article>
<h1>Main Article Title</h1>
<p>This is the main content of the article.</p>
<p>It contains multiple paragraphs with important information.</p>
<a href="/relative-link">Relative Link</a>
<a href="https://example.com/absolute">Absolute Link</a>
</article>
</main>
<footer>Footer content</footer>
<script>console.log('script to remove');</script>
</body>
</html>
"""
class TestFetchWebpage:
    """Test fetch_webpage command."""

    @staticmethod
    def _patch_get(mocker, component, response):
        """Swap the component's HTTP client GET for a canned response."""
        mocker.patch.object(component.client, "get", return_value=response)

    def test_fetch_webpage_extracts_content(
        self, mocker, web_fetch_component, sample_html
    ):
        canned = MagicMock(text=sample_html, headers={"content-length": "1000"})
        self._patch_get(mocker, web_fetch_component, canned)

        result = web_fetch_component.fetch_webpage(
            "https://example.com/test", include_links=False
        )

        assert "Main Article Title" in result or "main content" in result.lower()
        assert "Title:" in result  # Metadata included

    def test_fetch_webpage_extracts_metadata(
        self, mocker, web_fetch_component, sample_html
    ):
        canned = MagicMock(text=sample_html, headers={})
        self._patch_get(mocker, web_fetch_component, canned)

        result = web_fetch_component.fetch_webpage(
            "https://example.com/test", include_links=False
        )

        assert "Test Page Title" in result
        assert "Test page description" in result or "Description" in result

    def test_fetch_webpage_extracts_links(
        self, mocker, web_fetch_component, sample_html
    ):
        canned = MagicMock(text=sample_html, headers={})
        self._patch_get(mocker, web_fetch_component, canned)

        result = web_fetch_component.fetch_webpage(
            "https://example.com/test", include_links=True
        )

        assert "Links" in result
        assert "example.com" in result

    def test_fetch_webpage_handles_timeout(self, mocker, web_fetch_component):
        mocker.patch.object(
            web_fetch_component.client,
            "get",
            side_effect=httpx.TimeoutException("Timeout"),
        )

        with pytest.raises(CommandExecutionError) as exc_info:
            web_fetch_component.fetch_webpage("https://example.com/slow")

        assert "timed out" in str(exc_info.value).lower()

    def test_fetch_webpage_handles_http_error(self, mocker, web_fetch_component):
        not_found = MagicMock(status_code=404, reason_phrase="Not Found")
        mocker.patch.object(
            web_fetch_component.client,
            "get",
            side_effect=httpx.HTTPStatusError(
                "Not Found", request=MagicMock(), response=not_found
            ),
        )

        with pytest.raises(CommandExecutionError) as exc_info:
            web_fetch_component.fetch_webpage("https://example.com/missing")

        assert "404" in str(exc_info.value)

    def test_fetch_webpage_respects_max_content_length(
        self, mocker, web_fetch_component
    ):
        oversized = MagicMock(headers={"content-length": "999999999"})
        self._patch_get(mocker, web_fetch_component, oversized)

        with pytest.raises(CommandExecutionError) as exc_info:
            web_fetch_component.fetch_webpage("https://example.com/huge")

        assert "too large" in str(exc_info.value).lower()
class TestFetchRawHtml:
    """Test fetch_raw_html command."""

    def test_fetch_raw_html_returns_html(
        self, mocker, web_fetch_component, sample_html
    ):
        canned = MagicMock(text=sample_html, headers={})
        mocker.patch.object(
            web_fetch_component.client, "get", return_value=canned
        )

        result = web_fetch_component.fetch_raw_html("https://example.com/test")

        assert "<!DOCTYPE html>" in result
        assert "<title>Test Page Title</title>" in result
        assert "<script>" in result  # Raw HTML includes scripts

    def test_fetch_raw_html_truncates_long_content(self, mocker, web_fetch_component):
        padded = "<html>" + "x" * 100000 + "</html>"
        canned = MagicMock(text=padded, headers={})
        mocker.patch.object(
            web_fetch_component.client, "get", return_value=canned
        )

        result = web_fetch_component.fetch_raw_html(
            "https://example.com/long", max_length=1000
        )

        assert len(result) < len(padded)
        assert "truncated" in result.lower()
class TestLinkExtraction:
    """Test link extraction functionality."""

    BASE = "https://base.com"

    def test_extracts_absolute_links(self, web_fetch_component):
        snippet = '<a href="https://example.com/page">Link Text</a>'
        extracted = web_fetch_component._extract_links(snippet, self.BASE)

        assert len(extracted) == 1
        assert "https://example.com/page" in extracted[0]
        assert "Link Text" in extracted[0]

    def test_resolves_relative_links(self, web_fetch_component):
        snippet = '<a href="/relative/path">Relative</a>'
        extracted = web_fetch_component._extract_links(snippet, self.BASE)

        assert len(extracted) == 1
        assert "https://base.com/relative/path" in extracted[0]

    def test_skips_javascript_links(self, web_fetch_component):
        snippet = """
<a href="javascript:void(0)">JS Link</a>
<a href="mailto:test@example.com">Email</a>
<a href="tel:+1234567890">Phone</a>
<a href="#section">Anchor</a>
<a href="https://real.com">Real Link</a>
"""
        extracted = web_fetch_component._extract_links(snippet, self.BASE)

        assert len(extracted) == 1
        assert "real.com" in extracted[0]

    def test_respects_max_links(self, web_fetch_component):
        web_fetch_component.config.max_links = 3
        anchors = [
            f'<a href="https://example.com/{i}">Link {i}</a>' for i in range(10)
        ]
        extracted = web_fetch_component._extract_links("".join(anchors), self.BASE)

        assert len(extracted) == 3
class TestMetadataExtraction:
    """Test metadata extraction functionality."""

    def test_extracts_title(self, web_fetch_component):
        markup = "<html><head><title>Page Title</title></head></html>"
        meta = web_fetch_component._extract_metadata(markup)
        assert meta.get("title") == "Page Title"

    def test_extracts_description(self, web_fetch_component):
        markup = '<html><head><meta name="description" content="Page desc"></head></html>'
        meta = web_fetch_component._extract_metadata(markup)
        assert meta.get("description") == "Page desc"

    def test_extracts_author(self, web_fetch_component):
        markup = '<html><head><meta name="author" content="John Doe"></head></html>'
        meta = web_fetch_component._extract_metadata(markup)
        assert meta.get("author") == "John Doe"

    def test_extracts_og_metadata(self, web_fetch_component):
        markup = """
<html><head>
<meta property="og:title" content="OG Title">
<meta property="og:description" content="OG Description">
</head></html>
"""
        meta = web_fetch_component._extract_metadata(markup)
        assert meta.get("og_title") == "OG Title"
        assert meta.get("og_description") == "OG Description"
class TestConfiguration:
    """Test configuration handling."""

    def test_commands_available(self, web_fetch_component):
        names = [cmd.names[0] for cmd in web_fetch_component.get_commands()]
        assert "fetch_webpage" in names
        assert "fetch_raw_html" in names

    def test_resources_provided(self, web_fetch_component):
        resources = list(web_fetch_component.get_resources())
        assert len(resources) == 1
        lowered = resources[0].lower()
        assert "fetch" in lowered or "web" in lowered

    def test_custom_config(self):
        custom = WebFetchConfiguration(
            timeout=60,
            max_content_length=5_000_000,
            max_links=100,
        )
        component = WebFetchComponent(custom)

        assert component.config.timeout == 60
        assert component.config.max_content_length == 5_000_000
        assert component.config.max_links == 100

View File

@@ -0,0 +1,338 @@
"""
Lightweight web page fetching component.
Provides fast HTTP-based page fetching without browser overhead.
Uses trafilatura for intelligent content extraction.
"""
import logging
from typing import Iterator, Literal, Optional
from urllib.parse import urljoin, urlparse
import httpx
import trafilatura
from bs4 import BeautifulSoup
from pydantic import BaseModel
from forge.agent.components import ConfigurableComponent
from forge.agent.protocols import CommandProvider, DirectiveProvider
from forge.command import Command, command
from forge.models.json_schema import JSONSchema
from forge.utils.exceptions import CommandExecutionError
from forge.utils.url_validator import validate_url
logger = logging.getLogger(__name__)
# Default headers to mimic a browser. Some sites reject or degrade responses
# for clients that do not present a typical browser header set, so we send a
# plausible desktop-Chrome profile.
DEFAULT_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate",
    "DNT": "1",  # Do Not Track
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
}
class WebFetchConfiguration(BaseModel):
    """Configuration for the web fetch component.

    All fields have conservative defaults; override by constructing the
    model with keyword arguments (standard pydantic behavior).
    """

    timeout: int = 30
    """Request timeout in seconds"""
    max_content_length: int = 10_000_000  # 10MB
    """Maximum content length to download"""
    follow_redirects: bool = True
    """Whether to follow HTTP redirects"""
    extract_links: bool = True
    """Whether to extract links from pages"""
    max_links: int = 50
    """Maximum number of links to return"""
    include_metadata: bool = True
    """Whether to include page metadata (title, description, etc.)"""
class WebFetchComponent(
    DirectiveProvider, CommandProvider, ConfigurableComponent[WebFetchConfiguration]
):
    """
    Lightweight web page fetching component.

    Provides fast HTTP-based page fetching without browser overhead.
    Uses trafilatura for intelligent main content extraction.
    """

    config_class = WebFetchConfiguration

    def __init__(self, config: Optional[WebFetchConfiguration] = None):
        ConfigurableComponent.__init__(self, config)
        # Created lazily so constructing the component never opens sockets.
        self._client: Optional[httpx.Client] = None

    @property
    def client(self) -> httpx.Client:
        """Lazy-loaded HTTP client (created on first use, then reused)."""
        if self._client is None:
            self._client = httpx.Client(
                headers=DEFAULT_HEADERS,
                timeout=self.config.timeout,
                follow_redirects=self.config.follow_redirects,
            )
        return self._client

    def get_resources(self) -> Iterator[str]:
        yield "Ability to fetch and extract content from web pages."

    def get_commands(self) -> Iterator[Command]:
        yield self.fetch_webpage
        yield self.fetch_raw_html

    def _fetch_url(self, url: str) -> httpx.Response:
        """Fetch a URL and return the response.

        Raises:
            CommandExecutionError: on timeout, transport failure, non-2xx
                status, or when the content exceeds ``max_content_length``.
        """
        try:
            response = self.client.get(url)
            response.raise_for_status()
        except httpx.TimeoutException:
            raise CommandExecutionError(
                f"Request timed out after {self.config.timeout} seconds"
            )
        except httpx.HTTPStatusError as e:
            raise CommandExecutionError(
                f"HTTP {e.response.status_code}: {e.response.reason_phrase}"
            )
        except httpx.RequestError as e:
            raise CommandExecutionError(f"Request failed: {e}")

        # Enforce the size limit. Prefer the Content-Length header when it is
        # present and well-formed, but fall back to the size of the body we
        # actually received: the header is optional and untrusted, and a
        # malformed value must not escape as a bare ValueError.
        # NOTE: client.get() has already downloaded the full body at this
        # point, so this guard protects downstream parsing and memory use,
        # not bandwidth.
        size: Optional[int] = None
        content_length = response.headers.get("content-length")
        if content_length:
            try:
                size = int(content_length)
            except ValueError:
                size = None  # Malformed header: ignore it, use actual size
        if size is None:
            size = len(response.content)
        if size > self.config.max_content_length:
            raise CommandExecutionError(
                f"Content too large: {size} bytes "
                f"(max: {self.config.max_content_length})"
            )
        return response

    def _extract_links(self, html: str, base_url: str) -> list[str]:
        """Extract up to ``config.max_links`` hyperlinks from HTML content.

        Args:
            html: Raw HTML to scan for ``<a href>`` elements.
            base_url: Base URL used to resolve relative hrefs.

        Returns:
            Entries formatted as ``"<link text>: <absolute url>"``; only
            http(s) targets are included.
        """
        soup = BeautifulSoup(html, "html.parser")
        links: list[str] = []
        for a in soup.find_all("a", href=True):
            href = a["href"]
            # Skip non-navigable schemes and in-page anchors
            if href.startswith(("javascript:", "mailto:", "tel:", "#")):
                continue
            # Resolve relative URLs against the page URL
            absolute_url = urljoin(base_url, href)
            # Only include http(s) links
            if absolute_url.startswith(("http://", "https://")):
                # Cap link text at 100 chars to keep the listing compact
                text = a.get_text(strip=True)[:100] or "[no text]"
                links.append(f"{text}: {absolute_url}")
                if len(links) >= self.config.max_links:
                    break
        return links

    def _extract_metadata(self, html: str) -> dict[str, str]:
        """Extract page metadata from HTML.

        Looks for: <title>, meta description, Open Graph title/description,
        meta author, and a published date under several common attribute
        spellings. Only keys that were actually found are present in the
        returned dict.
        """
        soup = BeautifulSoup(html, "html.parser")
        metadata: dict[str, str] = {}
        # Title
        title_tag = soup.find("title")
        if title_tag:
            metadata["title"] = title_tag.get_text(strip=True)
        # Meta description
        desc = soup.find("meta", attrs={"name": "description"})
        if desc and desc.get("content"):
            metadata["description"] = desc["content"]
        # Open Graph title/description
        og_title = soup.find("meta", attrs={"property": "og:title"})
        if og_title and og_title.get("content"):
            metadata["og_title"] = og_title["content"]
        og_desc = soup.find("meta", attrs={"property": "og:description"})
        if og_desc and og_desc.get("content"):
            metadata["og_description"] = og_desc["content"]
        # Author
        author = soup.find("meta", attrs={"name": "author"})
        if author and author.get("content"):
            metadata["author"] = author["content"]
        # Published date: first match wins across common attribute spellings
        for attr in ["article:published_time", "datePublished", "date"]:
            date_tag = soup.find("meta", attrs={"property": attr}) or soup.find(
                "meta", attrs={"name": attr}
            )
            if date_tag and date_tag.get("content"):
                metadata["published"] = date_tag["content"]
                break
        return metadata

    @command(
        ["fetch_webpage", "fetch", "download_page"],
        "Fetch a webpage and extract its main content as clean text. "
        "Much faster than read_webpage (no browser needed).",
        {
            "url": JSONSchema(
                type=JSONSchema.Type.STRING,
                description="The URL to fetch",
                required=True,
            ),
            "output_format": JSONSchema(
                type=JSONSchema.Type.STRING,
                description=(
                    "Output format: 'text' (plain text), 'markdown' (with formatting), "
                    "or 'xml' (structured). Default: 'markdown'"
                ),
                required=False,
            ),
            "include_links": JSONSchema(
                type=JSONSchema.Type.BOOLEAN,
                description="Whether to include extracted links. Default: true",
                required=False,
            ),
            "include_comments": JSONSchema(
                type=JSONSchema.Type.BOOLEAN,
                description="Whether to include page comments. Default: false",
                required=False,
            ),
        },
    )
    @validate_url
    def fetch_webpage(
        self,
        url: str,
        output_format: Literal["text", "markdown", "xml"] = "markdown",
        include_links: bool = True,
        include_comments: bool = False,
    ) -> str:
        """
        Fetch a webpage and extract its main content.

        Uses trafilatura for intelligent content extraction - automatically
        removes navigation, ads, boilerplate, and extracts the main article
        text. Falls back to a plain BeautifulSoup text dump when trafilatura
        finds nothing.

        Args:
            url: The URL to fetch
            output_format: Output format (text, markdown, xml)
            include_links: Whether to include links from the page
            include_comments: Whether to include comments section

        Returns:
            Extracted content with optional metadata and links

        Raises:
            CommandExecutionError: if the page cannot be fetched or is too
                large (see ``_fetch_url``).
        """
        response = self._fetch_url(url)
        html = response.text

        # Extract main content using trafilatura
        extract_kwargs = {
            "include_comments": include_comments,
            # Inline links only make sense for markdown output
            "include_links": output_format == "markdown",
            "include_images": False,
            "include_tables": True,
            # NOTE(review): "no_fallback" is the pre-2.0 trafilatura spelling
            # (newer releases prefer fast=...) - confirm against the pinned
            # trafilatura version before changing.
            "no_fallback": False,
        }
        if output_format == "markdown":
            content = trafilatura.extract(
                html, output_format="markdown", **extract_kwargs
            )
        elif output_format == "xml":
            content = trafilatura.extract(html, output_format="xml", **extract_kwargs)
        else:
            content = trafilatura.extract(html, **extract_kwargs)

        if not content:
            # Fallback to basic BeautifulSoup extraction: strip obvious
            # non-content tags and take the remaining visible text.
            soup = BeautifulSoup(html, "html.parser")
            for tag in soup(["script", "style", "nav", "footer", "header"]):
                tag.decompose()
            content = soup.get_text(separator="\n", strip=True)
        if not content:
            return "Could not extract content from this page."

        # Build output
        output_parts = []

        # Add metadata
        if self.config.include_metadata:
            metadata = self._extract_metadata(html)
            if metadata:
                meta_lines = []
                if "title" in metadata:
                    meta_lines.append(f"**Title:** {metadata['title']}")
                if "description" in metadata:
                    meta_lines.append(f"**Description:** {metadata['description']}")
                if "author" in metadata:
                    meta_lines.append(f"**Author:** {metadata['author']}")
                if "published" in metadata:
                    meta_lines.append(f"**Published:** {metadata['published']}")
                if meta_lines:
                    output_parts.append("## Page Info\n" + "\n".join(meta_lines))

        # Add main content
        output_parts.append(f"## Content\n{content}")

        # Add links
        if include_links and self.config.extract_links:
            links = self._extract_links(html, url)
            if links:
                links_text = "\n".join(f"- {link}" for link in links)
                output_parts.append(f"## Links ({len(links)})\n{links_text}")

        return "\n\n".join(output_parts)

    @command(
        ["fetch_raw_html", "get_html"],
        "Fetch a webpage and return the raw HTML. Use this when you need "
        "to inspect the page structure or extract specific elements.",
        {
            "url": JSONSchema(
                type=JSONSchema.Type.STRING,
                description="The URL to fetch",
                required=True,
            ),
            "max_length": JSONSchema(
                type=JSONSchema.Type.INTEGER,
                description="Maximum characters to return. Default: 50000",
                required=False,
            ),
        },
    )
    @validate_url
    def fetch_raw_html(self, url: str, max_length: int = 50000) -> str:
        """
        Fetch a webpage and return the raw HTML.

        Args:
            url: The URL to fetch
            max_length: Maximum characters to return

        Returns:
            Raw HTML content (truncated with a trailing note if necessary)

        Raises:
            CommandExecutionError: if the page cannot be fetched (see
                ``_fetch_url``).
        """
        response = self._fetch_url(url)
        html = response.text
        if len(html) > max_length:
            return html[:max_length] + f"\n\n... [truncated, {len(html)} total chars]"
        return html

View File

@@ -322,6 +322,21 @@ files = [
{file = "attrs-25.4.0.tar.gz", hash = "sha256:16d5969b87f0859ef33a48b35d55ac1be6e42ae49d5e853b597db70c35c57e11"},
]
[[package]]
name = "babel"
version = "2.17.0"
description = "Internationalization utilities"
optional = false
python-versions = ">=3.8"
groups = ["main"]
files = [
{file = "babel-2.17.0-py3-none-any.whl", hash = "sha256:4d0b53093fdfb4b21c92b5213dba5a1b23885afa8383709427046b21c366e5f2"},
{file = "babel-2.17.0.tar.gz", hash = "sha256:0c54cffb19f690cdcc52a3b50bcbf71e07a808d1c80d549f2459b9d2cf0afb9d"},
]
[package.extras]
dev = ["backports.zoneinfo ; python_version < \"3.9\"", "freezegun (>=1.0,<2.0)", "jinja2 (>=3.0)", "pytest (>=6.0)", "pytest-cov", "pytz", "setuptools", "tzdata ; sys_platform == \"win32\""]
[[package]]
name = "backoff"
version = "2.2.1"
@@ -1715,6 +1730,26 @@ mypy = ["bokeh", "contourpy[bokeh,docs]", "docutils-stubs", "mypy (==1.15.0)", "
test = ["Pillow", "contourpy[test-no-images]", "matplotlib"]
test-no-images = ["pytest", "pytest-cov", "pytest-rerunfailures", "pytest-xdist", "wurlitzer"]
[[package]]
name = "courlan"
version = "1.3.2"
description = "Clean, filter and sample URLs to optimize data collection includes spam, content type and language filters."
optional = false
python-versions = ">=3.8"
groups = ["main"]
files = [
{file = "courlan-1.3.2-py3-none-any.whl", hash = "sha256:d0dab52cf5b5b1000ee2839fbc2837e93b2514d3cb5bb61ae158a55b7a04c6be"},
{file = "courlan-1.3.2.tar.gz", hash = "sha256:0b66f4db3a9c39a6e22dd247c72cfaa57d68ea660e94bb2c84ec7db8712af190"},
]
[package.dependencies]
babel = ">=2.16.0"
tld = ">=0.13"
urllib3 = ">=1.26,<3"
[package.extras]
dev = ["black", "flake8", "mypy", "pytest", "pytest-cov", "types-urllib3"]
[[package]]
name = "coverage"
version = "7.13.1"
@@ -1883,6 +1918,29 @@ files = [
{file = "cymem-2.0.11.tar.gz", hash = "sha256:efe49a349d4a518be6b6c6b255d4a80f740a341544bde1a807707c058b88d0bd"},
]
[[package]]
name = "dateparser"
version = "1.2.2"
description = "Date parsing library designed to parse dates from HTML pages"
optional = false
python-versions = ">=3.8"
groups = ["main"]
files = [
{file = "dateparser-1.2.2-py3-none-any.whl", hash = "sha256:5a5d7211a09013499867547023a2a0c91d5a27d15dd4dbcea676ea9fe66f2482"},
{file = "dateparser-1.2.2.tar.gz", hash = "sha256:986316f17cb8cdc23ea8ce563027c5ef12fc725b6fb1d137c14ca08777c5ecf7"},
]
[package.dependencies]
python-dateutil = ">=2.7.0"
pytz = ">=2024.2"
regex = ">=2024.9.11"
tzlocal = ">=0.2"
[package.extras]
calendars = ["convertdate (>=2.2.1)", "hijridate"]
fasttext = ["fasttext (>=0.9.1)", "numpy (>=1.19.3,<2)"]
langdetect = ["langdetect (>=1.0.0)"]
[[package]]
name = "ddgs"
version = "9.10.0"
@@ -3063,6 +3121,30 @@ files = [
{file = "hpack-4.1.0.tar.gz", hash = "sha256:ec5eca154f7056aa06f196a557655c5b009b382873ac8d1e66e79e87535f1dca"},
]
[[package]]
name = "htmldate"
version = "1.9.4"
description = "Fast and robust extraction of original and updated publication dates from URLs and web pages."
optional = false
python-versions = ">=3.8"
groups = ["main"]
files = [
{file = "htmldate-1.9.4-py3-none-any.whl", hash = "sha256:1b94bcc4e08232a5b692159903acf95548b6a7492dddca5bb123d89d6325921c"},
{file = "htmldate-1.9.4.tar.gz", hash = "sha256:1129063e02dd0354b74264de71e950c0c3fcee191178321418ccad2074cc8ed0"},
]
[package.dependencies]
charset_normalizer = ">=3.4.0"
dateparser = ">=1.1.2"
lxml = {version = ">=5.3.0", markers = "platform_system != \"Darwin\" or python_version > \"3.8\""}
python-dateutil = ">=2.9.0.post0"
urllib3 = ">=1.26,<3"
[package.extras]
all = ["htmldate[dev]", "htmldate[speed]"]
dev = ["black", "flake8", "mypy", "pytest", "pytest-cov", "types-dateparser", "types-lxml", "types-python-dateutil", "types-urllib3"]
speed = ["backports-datetime-fromisoformat ; python_version < \"3.11\"", "faust-cchardet (>=2.1.19)", "urllib3[brotli]"]
[[package]]
name = "httpcore"
version = "1.0.8"
@@ -3605,6 +3687,21 @@ files = [
[package.dependencies]
referencing = ">=0.31.0"
[[package]]
name = "justext"
version = "3.0.2"
description = "Heuristic based boilerplate removal tool"
optional = false
python-versions = "*"
groups = ["main"]
files = [
{file = "justext-3.0.2-py2.py3-none-any.whl", hash = "sha256:62b1c562b15c3c6265e121cc070874243a443bfd53060e869393f09d6b6cc9a7"},
{file = "justext-3.0.2.tar.gz", hash = "sha256:13496a450c44c4cd5b5a75a5efcd9996066d2a189794ea99a49949685a0beb05"},
]
[package.dependencies]
lxml = {version = ">=4.4.2", extras = ["html-clean"]}
[[package]]
name = "kiwisolver"
version = "1.4.9"
@@ -3946,12 +4043,30 @@ files = [
{file = "lxml-6.0.2.tar.gz", hash = "sha256:cd79f3367bd74b317dda655dc8fcfa304d9eb6e4fb06b7168c5cf27f96e0cd62"},
]
[package.dependencies]
lxml_html_clean = {version = "*", optional = true, markers = "extra == \"html-clean\""}
[package.extras]
cssselect = ["cssselect (>=0.7)"]
html-clean = ["lxml_html_clean"]
html5 = ["html5lib"]
htmlsoup = ["BeautifulSoup4"]
[[package]]
name = "lxml-html-clean"
version = "0.4.3"
description = "HTML cleaner from lxml project"
optional = false
python-versions = "*"
groups = ["main"]
files = [
{file = "lxml_html_clean-0.4.3-py3-none-any.whl", hash = "sha256:63fd7b0b9c3a2e4176611c2ca5d61c4c07ffca2de76c14059a81a2825833731e"},
{file = "lxml_html_clean-0.4.3.tar.gz", hash = "sha256:c9df91925b00f836c807beab127aac82575110eacff54d0a75187914f1bd9d8c"},
]
[package.dependencies]
lxml = "*"
[[package]]
name = "markdown-it-py"
version = "4.0.0"
@@ -6524,10 +6639,9 @@ dev = ["atomicwrites (==1.2.1)", "attrs (==19.2.0)", "coverage (==6.5.0)", "hatc
name = "pytz"
version = "2025.2"
description = "World timezone definitions, modern and historical"
optional = true
optional = false
python-versions = "*"
groups = ["main"]
markers = "extra == \"benchmark\""
files = [
{file = "pytz-2025.2-py2.py3-none-any.whl", hash = "sha256:5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00"},
{file = "pytz-2025.2.tar.gz", hash = "sha256:360b9e3dbb49a209c21ad61809c7fb453643e048b38924c765813546746e81c3"},
@@ -7906,6 +8020,18 @@ requests = ">=2.26.0"
[package.extras]
blobfile = ["blobfile (>=2)"]
[[package]]
name = "tld"
version = "0.13.1"
description = "Extract the top-level domain (TLD) from the URL given."
optional = false
python-versions = ">=3.7"
groups = ["main"]
files = [
{file = "tld-0.13.1-py2.py3-none-any.whl", hash = "sha256:a2d35109433ac83486ddf87e3c4539ab2c5c2478230e5d9c060a18af4b03aa7c"},
{file = "tld-0.13.1.tar.gz", hash = "sha256:75ec00936cbcf564f67361c41713363440b6c4ef0f0c1592b5b0fbe72c17a350"},
]
[[package]]
name = "tokenizers"
version = "0.22.2"
@@ -7982,6 +8108,31 @@ notebook = ["ipywidgets (>=6)"]
slack = ["slack-sdk"]
telegram = ["requests"]
[[package]]
name = "trafilatura"
version = "2.0.0"
description = "Python & Command-line tool to gather text and metadata on the Web: Crawling, scraping, extraction, output as CSV, JSON, HTML, MD, TXT, XML."
optional = false
python-versions = ">=3.8"
groups = ["main"]
files = [
{file = "trafilatura-2.0.0-py3-none-any.whl", hash = "sha256:77eb5d1e993747f6f20938e1de2d840020719735690c840b9a1024803a4cd51d"},
{file = "trafilatura-2.0.0.tar.gz", hash = "sha256:ceb7094a6ecc97e72fea73c7dba36714c5c5b577b6470e4520dca893706d6247"},
]
[package.dependencies]
certifi = "*"
charset_normalizer = ">=3.4.0"
courlan = ">=1.3.2"
htmldate = ">=1.9.2"
justext = ">=3.0.1"
lxml = {version = ">=5.3.0", markers = "platform_system != \"Darwin\" or python_version > \"3.8\""}
urllib3 = ">=1.26,<3"
[package.extras]
all = ["brotli", "cchardet (>=2.1.7) ; python_version < \"3.11\"", "faust-cchardet (>=2.1.19) ; python_version >= \"3.11\"", "htmldate[speed] (>=1.9.2)", "py3langid (>=0.3.0)", "pycurl (>=7.45.3)", "urllib3[socks]", "zstandard (>=0.23.0)"]
dev = ["flake8", "mypy", "pytest", "pytest-cov", "types-lxml", "types-urllib3"]
[[package]]
name = "traitlets"
version = "5.14.3"
@@ -8221,15 +8372,33 @@ typing-extensions = ">=4.12.0"
name = "tzdata"
version = "2025.3"
description = "Provider of IANA time zone data"
optional = true
optional = false
python-versions = ">=2"
groups = ["main"]
markers = "extra == \"benchmark\""
markers = "extra == \"benchmark\" or platform_system == \"Windows\""
files = [
{file = "tzdata-2025.3-py2.py3-none-any.whl", hash = "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1"},
{file = "tzdata-2025.3.tar.gz", hash = "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7"},
]
[[package]]
name = "tzlocal"
version = "5.3.1"
description = "tzinfo object for the local timezone"
optional = false
python-versions = ">=3.9"
groups = ["main"]
files = [
{file = "tzlocal-5.3.1-py3-none-any.whl", hash = "sha256:eb1a66c3ef5847adf7a834f1be0800581b683b5608e74f86ecbcef8ab91bb85d"},
{file = "tzlocal-5.3.1.tar.gz", hash = "sha256:cceffc7edecefea1f595541dbd6e990cb1ea3d19bf01b2809f362a03dd7921fd"},
]
[package.dependencies]
tzdata = {version = "*", markers = "platform_system == \"Windows\""}
[package.extras]
devenv = ["check-manifest", "pytest (>=4.3)", "pytest-cov", "pytest-mock (>=3.3)", "zest.releaser"]
[[package]]
name = "uritemplate"
version = "4.2.0"
@@ -9022,4 +9191,4 @@ benchmark = ["agbenchmark"]
[metadata]
lock-version = "2.1"
python-versions = "^3.12"
content-hash = "9388cd65752a0db5c42052e312a7df7b0f535256b2c27a9d9209d5118a418a25"
content-hash = "7de5064cccbd74f3bc85ded976c37b249f7da997d6ef5b5853240a57daae7695"

View File

@@ -37,6 +37,7 @@ colorama = "^0.4.6"
demjson3 = "^3.0.0"
docker = "*"
ddgs = "^9.9"
trafilatura = "^2.0"
en-core-web-sm = { url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl" }
fastapi = "^0.109.1"
gitpython = "^3.1.32"