mirror of
https://github.com/Significant-Gravitas/AutoGPT.git
synced 2026-04-08 03:00:28 -04:00
feat(forge): add lightweight web fetch component
Add WebFetchComponent for fast HTTP-based page fetching without browser overhead. Uses trafilatura for intelligent content extraction. Commands: - fetch_webpage: Extract main content as text/markdown/xml - Removes navigation, ads, boilerplate automatically - Extracts page metadata (title, description, author, date) - Extracts and lists page links - Much faster than Selenium-based read_webpage - fetch_raw_html: Get raw HTML for structure inspection - Optional truncation for large pages Features: - Trafilatura-powered content extraction (best-in-class accuracy) - Automatic link extraction with relative URL resolution - Page metadata extraction (OG tags, meta tags) - Configurable timeout, max content length, max links - Proper error handling for timeouts and HTTP errors - 19 comprehensive tests Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
291
classic/forge/forge/components/web/test_web_fetch.py
Normal file
291
classic/forge/forge/components/web/test_web_fetch.py
Normal file
@@ -0,0 +1,291 @@
|
||||
"""Tests for the web fetch component."""
|
||||
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import httpx
|
||||
import pytest
|
||||
|
||||
from forge.utils.exceptions import CommandExecutionError
|
||||
|
||||
from .web_fetch import WebFetchComponent, WebFetchConfiguration
|
||||
|
||||
|
||||
@pytest.fixture
def web_fetch_component():
    """Provide a WebFetchComponent built with the default configuration."""
    return WebFetchComponent(WebFetchConfiguration())
|
||||
|
||||
|
||||
@pytest.fixture
def sample_html():
    """Sample HTML for testing.

    Contains a <title>/<meta> head block, boilerplate that content
    extraction should strip (<nav>, <footer>, <script>), an <article>
    with the main text, and one relative plus one absolute link.
    """
    return """
<!DOCTYPE html>
<html>
<head>
<title>Test Page Title</title>
<meta name="description" content="Test page description">
<meta name="author" content="Test Author">
</head>
<body>
<nav>Navigation content to be removed</nav>
<main>
<article>
<h1>Main Article Title</h1>
<p>This is the main content of the article.</p>
<p>It contains multiple paragraphs with important information.</p>
<a href="/relative-link">Relative Link</a>
<a href="https://example.com/absolute">Absolute Link</a>
</article>
</main>
<footer>Footer content</footer>
<script>console.log('script to remove');</script>
</body>
</html>
"""
|
||||
|
||||
|
||||
class TestFetchWebpage:
    """Test fetch_webpage command."""

    @staticmethod
    def _patch_get(mocker, component, *, text=None, headers=None, side_effect=None):
        """Patch ``component.client.get`` with a canned response or an error."""
        if side_effect is not None:
            mocker.patch.object(component.client, "get", side_effect=side_effect)
            return None
        stub = MagicMock()
        if text is not None:
            stub.text = text
        stub.headers = {} if headers is None else headers
        mocker.patch.object(component.client, "get", return_value=stub)
        return stub

    def test_fetch_webpage_extracts_content(
        self, mocker, web_fetch_component, sample_html
    ):
        self._patch_get(
            mocker,
            web_fetch_component,
            text=sample_html,
            headers={"content-length": "1000"},
        )

        result = web_fetch_component.fetch_webpage(
            "https://example.com/test", include_links=False
        )

        assert "Main Article Title" in result or "main content" in result.lower()
        assert "Title:" in result  # Metadata included

    def test_fetch_webpage_extracts_metadata(
        self, mocker, web_fetch_component, sample_html
    ):
        self._patch_get(mocker, web_fetch_component, text=sample_html)

        result = web_fetch_component.fetch_webpage(
            "https://example.com/test", include_links=False
        )

        assert "Test Page Title" in result
        assert "Test page description" in result or "Description" in result

    def test_fetch_webpage_extracts_links(
        self, mocker, web_fetch_component, sample_html
    ):
        self._patch_get(mocker, web_fetch_component, text=sample_html)

        result = web_fetch_component.fetch_webpage(
            "https://example.com/test", include_links=True
        )

        assert "Links" in result
        assert "example.com" in result

    def test_fetch_webpage_handles_timeout(self, mocker, web_fetch_component):
        self._patch_get(
            mocker,
            web_fetch_component,
            side_effect=httpx.TimeoutException("Timeout"),
        )

        with pytest.raises(CommandExecutionError) as exc_info:
            web_fetch_component.fetch_webpage("https://example.com/slow")

        assert "timed out" in str(exc_info.value).lower()

    def test_fetch_webpage_handles_http_error(self, mocker, web_fetch_component):
        error_response = MagicMock()
        error_response.status_code = 404
        error_response.reason_phrase = "Not Found"

        self._patch_get(
            mocker,
            web_fetch_component,
            side_effect=httpx.HTTPStatusError(
                "Not Found", request=MagicMock(), response=error_response
            ),
        )

        with pytest.raises(CommandExecutionError) as exc_info:
            web_fetch_component.fetch_webpage("https://example.com/missing")

        assert "404" in str(exc_info.value)

    def test_fetch_webpage_respects_max_content_length(
        self, mocker, web_fetch_component
    ):
        # A huge content-length header alone should trigger the size guard
        self._patch_get(
            mocker,
            web_fetch_component,
            headers={"content-length": "999999999"},
        )

        with pytest.raises(CommandExecutionError) as exc_info:
            web_fetch_component.fetch_webpage("https://example.com/huge")

        assert "too large" in str(exc_info.value).lower()
|
||||
|
||||
|
||||
class TestFetchRawHtml:
    """Test fetch_raw_html command."""

    def test_fetch_raw_html_returns_html(
        self, mocker, web_fetch_component, sample_html
    ):
        response_stub = MagicMock()
        response_stub.text = sample_html
        response_stub.headers = {}
        mocker.patch.object(
            web_fetch_component.client, "get", return_value=response_stub
        )

        result = web_fetch_component.fetch_raw_html("https://example.com/test")

        # Raw mode keeps everything, including head tags and scripts
        for fragment in (
            "<!DOCTYPE html>",
            "<title>Test Page Title</title>",
            "<script>",
        ):
            assert fragment in result

    def test_fetch_raw_html_truncates_long_content(self, mocker, web_fetch_component):
        oversized_html = "<html>" + "x" * 100000 + "</html>"

        response_stub = MagicMock()
        response_stub.text = oversized_html
        response_stub.headers = {}
        mocker.patch.object(
            web_fetch_component.client, "get", return_value=response_stub
        )

        result = web_fetch_component.fetch_raw_html(
            "https://example.com/long", max_length=1000
        )

        assert len(result) < len(oversized_html)
        assert "truncated" in result.lower()
|
||||
|
||||
|
||||
class TestLinkExtraction:
    """Test link extraction functionality."""

    def test_extracts_absolute_links(self, web_fetch_component):
        markup = '<a href="https://example.com/page">Link Text</a>'
        links = web_fetch_component._extract_links(markup, "https://base.com")

        assert len(links) == 1
        only_link = links[0]
        assert "https://example.com/page" in only_link
        assert "Link Text" in only_link

    def test_resolves_relative_links(self, web_fetch_component):
        markup = '<a href="/relative/path">Relative</a>'
        links = web_fetch_component._extract_links(markup, "https://base.com")

        assert len(links) == 1
        assert "https://base.com/relative/path" in links[0]

    def test_skips_javascript_links(self, web_fetch_component):
        markup = """
<a href="javascript:void(0)">JS Link</a>
<a href="mailto:test@example.com">Email</a>
<a href="tel:+1234567890">Phone</a>
<a href="#section">Anchor</a>
<a href="https://real.com">Real Link</a>
"""
        links = web_fetch_component._extract_links(markup, "https://base.com")

        # Only the plain http(s) link should survive scheme filtering
        assert len(links) == 1
        assert "real.com" in links[0]

    def test_respects_max_links(self, web_fetch_component):
        web_fetch_component.config.max_links = 3
        markup = "".join(
            f'<a href="https://example.com/{i}">Link {i}</a>' for i in range(10)
        )
        links = web_fetch_component._extract_links(markup, "https://base.com")

        assert len(links) == 3
|
||||
|
||||
|
||||
class TestMetadataExtraction:
    """Test metadata extraction functionality."""

    def test_extracts_title(self, web_fetch_component):
        page = "<html><head><title>Page Title</title></head></html>"
        meta = web_fetch_component._extract_metadata(page)

        assert meta.get("title") == "Page Title"

    def test_extracts_description(self, web_fetch_component):
        page = '<html><head><meta name="description" content="Page desc"></head></html>'
        meta = web_fetch_component._extract_metadata(page)

        assert meta.get("description") == "Page desc"

    def test_extracts_author(self, web_fetch_component):
        page = '<html><head><meta name="author" content="John Doe"></head></html>'
        meta = web_fetch_component._extract_metadata(page)

        assert meta.get("author") == "John Doe"

    def test_extracts_og_metadata(self, web_fetch_component):
        page = """
<html><head>
<meta property="og:title" content="OG Title">
<meta property="og:description" content="OG Description">
</head></html>
"""
        meta = web_fetch_component._extract_metadata(page)

        assert meta.get("og_title") == "OG Title"
        assert meta.get("og_description") == "OG Description"
|
||||
|
||||
|
||||
class TestConfiguration:
    """Test configuration handling."""

    def test_commands_available(self, web_fetch_component):
        primary_names = {cmd.names[0] for cmd in web_fetch_component.get_commands()}

        assert "fetch_webpage" in primary_names
        assert "fetch_raw_html" in primary_names

    def test_resources_provided(self, web_fetch_component):
        resources = list(web_fetch_component.get_resources())

        assert len(resources) == 1
        resource = resources[0].lower()
        assert "fetch" in resource or "web" in resource

    def test_custom_config(self):
        custom = WebFetchConfiguration(
            timeout=60,
            max_content_length=5_000_000,
            max_links=100,
        )
        component = WebFetchComponent(custom)

        assert component.config.timeout == 60
        assert component.config.max_content_length == 5_000_000
        assert component.config.max_links == 100
|
||||
338
classic/forge/forge/components/web/web_fetch.py
Normal file
338
classic/forge/forge/components/web/web_fetch.py
Normal file
@@ -0,0 +1,338 @@
|
||||
"""
|
||||
Lightweight web page fetching component.
|
||||
|
||||
Provides fast HTTP-based page fetching without browser overhead.
|
||||
Uses trafilatura for intelligent content extraction.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Iterator, Literal, Optional
|
||||
from urllib.parse import urljoin, urlparse
|
||||
|
||||
import httpx
|
||||
import trafilatura
|
||||
from bs4 import BeautifulSoup
|
||||
from pydantic import BaseModel
|
||||
|
||||
from forge.agent.components import ConfigurableComponent
|
||||
from forge.agent.protocols import CommandProvider, DirectiveProvider
|
||||
from forge.command import Command, command
|
||||
from forge.models.json_schema import JSONSchema
|
||||
from forge.utils.exceptions import CommandExecutionError
|
||||
from forge.utils.url_validator import validate_url
|
||||
|
||||
logger = logging.getLogger(__name__)

# Default headers to mimic a browser.
# Some sites reject or alter responses for requests that don't look like a
# real browser; these values copy a recent desktop Chrome on macOS.
DEFAULT_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate",
    # "Do Not Track" — harmless, and part of the typical browser fingerprint
    "DNT": "1",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
}
|
||||
|
||||
|
||||
class WebFetchConfiguration(BaseModel):
    """Configuration for the web fetch component."""

    # Request timeout in seconds.
    timeout: int = 30
    # Maximum content length to download (bytes); default 10 MB.
    max_content_length: int = 10_000_000
    # Whether to follow HTTP redirects.
    follow_redirects: bool = True
    # Whether to extract links from pages.
    extract_links: bool = True
    # Maximum number of links to return.
    max_links: int = 50
    # Whether to include page metadata (title, description, etc.).
    include_metadata: bool = True
|
||||
|
||||
|
||||
class WebFetchComponent(
    DirectiveProvider, CommandProvider, ConfigurableComponent[WebFetchConfiguration]
):
    """
    Lightweight web page fetching component.

    Provides fast HTTP-based page fetching without browser overhead.
    Uses trafilatura for intelligent main content extraction.
    """

    config_class = WebFetchConfiguration

    def __init__(self, config: Optional[WebFetchConfiguration] = None):
        ConfigurableComponent.__init__(self, config)
        # Created lazily in `client` so no socket resources are held by
        # components that never fetch anything.
        self._client: Optional[httpx.Client] = None

    @property
    def client(self) -> httpx.Client:
        """Lazy-loaded HTTP client, reused across requests for keep-alive."""
        if self._client is None:
            self._client = httpx.Client(
                headers=DEFAULT_HEADERS,
                timeout=self.config.timeout,
                follow_redirects=self.config.follow_redirects,
            )
        return self._client

    def close(self) -> None:
        """
        Close the underlying HTTP client and release its connections.

        Safe to call multiple times; if the component is used again
        afterwards, a fresh client is created lazily.
        """
        if self._client is not None:
            self._client.close()
            self._client = None

    def get_resources(self) -> Iterator[str]:
        yield "Ability to fetch and extract content from web pages."

    def get_commands(self) -> Iterator[Command]:
        yield self.fetch_webpage
        yield self.fetch_raw_html

    def _fetch_url(self, url: str) -> httpx.Response:
        """
        Fetch a URL and return the response.

        Raises:
            CommandExecutionError: on timeout, HTTP error status, network
                failure, or when the declared content length exceeds
                ``config.max_content_length``.
        """
        try:
            # NOTE(review): client.get() downloads the full body, so the
            # content-length check below bounds what we *process*, not what
            # is downloaded; a streaming request would be needed to cap the
            # download itself.
            response = self.client.get(url)
            response.raise_for_status()

            # Reject oversized responses (servers that omit the
            # content-length header bypass this check).
            content_length = response.headers.get("content-length")
            if content_length and int(content_length) > self.config.max_content_length:
                raise CommandExecutionError(
                    f"Content too large: {int(content_length)} bytes "
                    f"(max: {self.config.max_content_length})"
                )

            return response

        except httpx.TimeoutException as e:
            raise CommandExecutionError(
                f"Request timed out after {self.config.timeout} seconds"
            ) from e
        except httpx.HTTPStatusError as e:
            raise CommandExecutionError(
                f"HTTP {e.response.status_code}: {e.response.reason_phrase}"
            ) from e
        except httpx.RequestError as e:
            raise CommandExecutionError(f"Request failed: {e}") from e

    def _extract_links(self, html: str, base_url: str) -> list[str]:
        """
        Extract up to ``config.max_links`` unique http(s) links from HTML.

        Relative URLs are resolved against *base_url*. javascript:, mailto:,
        tel: and fragment-only links are skipped, and duplicate URLs are
        dropped so they don't consume the link budget.

        Returns:
            List of ``"link text: absolute URL"`` strings.
        """
        soup = BeautifulSoup(html, "html.parser")
        links: list[str] = []
        seen: set[str] = set()

        for a in soup.find_all("a", href=True):
            href = a["href"]
            # Skip non-navigable schemes and in-page anchors
            if href.startswith(("javascript:", "mailto:", "tel:", "#")):
                continue

            # Resolve relative URLs against the page's own URL
            absolute_url = urljoin(base_url, href)

            # Only include http(s) links, once each
            if not absolute_url.startswith(("http://", "https://")):
                continue
            if absolute_url in seen:
                continue
            seen.add(absolute_url)

            text = a.get_text(strip=True)[:100] or "[no text]"
            links.append(f"{text}: {absolute_url}")

            if len(links) >= self.config.max_links:
                break

        return links

    def _extract_metadata(self, html: str) -> dict[str, str]:
        """
        Extract page metadata from HTML head tags.

        Returns:
            Dict with any of: ``title``, ``description``, ``og_title``,
            ``og_description``, ``author``, ``published``. Keys are simply
            absent when the page does not provide the value.
        """
        soup = BeautifulSoup(html, "html.parser")
        metadata: dict[str, str] = {}

        # <title>
        title_tag = soup.find("title")
        if title_tag:
            metadata["title"] = title_tag.get_text(strip=True)

        # <meta name="description">
        desc = soup.find("meta", attrs={"name": "description"})
        if desc and desc.get("content"):
            metadata["description"] = desc["content"]

        # Open Graph title/description
        og_title = soup.find("meta", attrs={"property": "og:title"})
        if og_title and og_title.get("content"):
            metadata["og_title"] = og_title["content"]

        og_desc = soup.find("meta", attrs={"property": "og:description"})
        if og_desc and og_desc.get("content"):
            metadata["og_description"] = og_desc["content"]

        # <meta name="author">
        author = soup.find("meta", attrs={"name": "author"})
        if author and author.get("content"):
            metadata["author"] = author["content"]

        # Published date: try common attribute names in priority order;
        # each may appear as either a property= or name= meta tag.
        for attr in ["article:published_time", "datePublished", "date"]:
            date_tag = soup.find("meta", attrs={"property": attr}) or soup.find(
                "meta", attrs={"name": attr}
            )
            if date_tag and date_tag.get("content"):
                metadata["published"] = date_tag["content"]
                break

        return metadata

    @command(
        ["fetch_webpage", "fetch", "download_page"],
        "Fetch a webpage and extract its main content as clean text. "
        "Much faster than read_webpage (no browser needed).",
        {
            "url": JSONSchema(
                type=JSONSchema.Type.STRING,
                description="The URL to fetch",
                required=True,
            ),
            "output_format": JSONSchema(
                type=JSONSchema.Type.STRING,
                description=(
                    "Output format: 'text' (plain text), 'markdown' (with formatting), "
                    "or 'xml' (structured). Default: 'markdown'"
                ),
                required=False,
            ),
            "include_links": JSONSchema(
                type=JSONSchema.Type.BOOLEAN,
                description="Whether to include extracted links. Default: true",
                required=False,
            ),
            "include_comments": JSONSchema(
                type=JSONSchema.Type.BOOLEAN,
                description="Whether to include page comments. Default: false",
                required=False,
            ),
        },
    )
    @validate_url
    def fetch_webpage(
        self,
        url: str,
        output_format: Literal["text", "markdown", "xml"] = "markdown",
        include_links: bool = True,
        include_comments: bool = False,
    ) -> str:
        """
        Fetch a webpage and extract its main content.

        Uses trafilatura for intelligent content extraction - automatically
        removes navigation, ads, boilerplate, and extracts the main article
        text. Falls back to a basic BeautifulSoup text dump when trafilatura
        cannot find a main content block.

        Args:
            url: The URL to fetch
            output_format: Output format (text, markdown, xml)
            include_links: Whether to include links from the page
            include_comments: Whether to include comments section

        Returns:
            Extracted content with optional metadata and links

        Raises:
            CommandExecutionError: if the request fails (see ``_fetch_url``).
        """
        response = self._fetch_url(url)
        html = response.text

        # Extract main content using trafilatura
        extract_kwargs = {
            "include_comments": include_comments,
            # Inline links only make sense in markdown output
            "include_links": output_format == "markdown",
            "include_images": False,
            "include_tables": True,
            # NOTE(review): `no_fallback` is the legacy name for what newer
            # trafilatura releases call `fast` — confirm on upgrade.
            "no_fallback": False,
        }

        if output_format in ("markdown", "xml"):
            content = trafilatura.extract(
                html, output_format=output_format, **extract_kwargs
            )
        else:
            content = trafilatura.extract(html, **extract_kwargs)

        if not content:
            # Fallback to basic BeautifulSoup extraction: strip obvious
            # boilerplate tags and return the remaining text.
            soup = BeautifulSoup(html, "html.parser")
            for tag in soup(["script", "style", "nav", "footer", "header"]):
                tag.decompose()
            content = soup.get_text(separator="\n", strip=True)
            if not content:
                return "Could not extract content from this page."

        # Build output
        output_parts = []

        # Add metadata section
        if self.config.include_metadata:
            metadata = self._extract_metadata(html)
            if metadata:
                meta_lines = []
                if "title" in metadata:
                    meta_lines.append(f"**Title:** {metadata['title']}")
                if "description" in metadata:
                    meta_lines.append(f"**Description:** {metadata['description']}")
                if "author" in metadata:
                    meta_lines.append(f"**Author:** {metadata['author']}")
                if "published" in metadata:
                    meta_lines.append(f"**Published:** {metadata['published']}")
                if meta_lines:
                    output_parts.append("## Page Info\n" + "\n".join(meta_lines))

        # Add main content
        output_parts.append(f"## Content\n{content}")

        # Add links section
        if include_links and self.config.extract_links:
            links = self._extract_links(html, url)
            if links:
                links_text = "\n".join(f"- {link}" for link in links)
                output_parts.append(f"## Links ({len(links)})\n{links_text}")

        return "\n\n".join(output_parts)

    @command(
        ["fetch_raw_html", "get_html"],
        "Fetch a webpage and return the raw HTML. Use this when you need "
        "to inspect the page structure or extract specific elements.",
        {
            "url": JSONSchema(
                type=JSONSchema.Type.STRING,
                description="The URL to fetch",
                required=True,
            ),
            "max_length": JSONSchema(
                type=JSONSchema.Type.INTEGER,
                description="Maximum characters to return. Default: 50000",
                required=False,
            ),
        },
    )
    @validate_url
    def fetch_raw_html(self, url: str, max_length: int = 50000) -> str:
        """
        Fetch a webpage and return the raw HTML.

        Args:
            url: The URL to fetch
            max_length: Maximum characters to return

        Returns:
            Raw HTML content (truncated with a trailing marker if necessary)

        Raises:
            CommandExecutionError: if the request fails (see ``_fetch_url``).
        """
        response = self._fetch_url(url)
        html = response.text

        if len(html) > max_length:
            return html[:max_length] + f"\n\n... [truncated, {len(html)} total chars]"

        return html
|
||||
179
classic/forge/poetry.lock
generated
179
classic/forge/poetry.lock
generated
@@ -322,6 +322,21 @@ files = [
|
||||
{file = "attrs-25.4.0.tar.gz", hash = "sha256:16d5969b87f0859ef33a48b35d55ac1be6e42ae49d5e853b597db70c35c57e11"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "babel"
|
||||
version = "2.17.0"
|
||||
description = "Internationalization utilities"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "babel-2.17.0-py3-none-any.whl", hash = "sha256:4d0b53093fdfb4b21c92b5213dba5a1b23885afa8383709427046b21c366e5f2"},
|
||||
{file = "babel-2.17.0.tar.gz", hash = "sha256:0c54cffb19f690cdcc52a3b50bcbf71e07a808d1c80d549f2459b9d2cf0afb9d"},
|
||||
]
|
||||
|
||||
[package.extras]
|
||||
dev = ["backports.zoneinfo ; python_version < \"3.9\"", "freezegun (>=1.0,<2.0)", "jinja2 (>=3.0)", "pytest (>=6.0)", "pytest-cov", "pytz", "setuptools", "tzdata ; sys_platform == \"win32\""]
|
||||
|
||||
[[package]]
|
||||
name = "backoff"
|
||||
version = "2.2.1"
|
||||
@@ -1715,6 +1730,26 @@ mypy = ["bokeh", "contourpy[bokeh,docs]", "docutils-stubs", "mypy (==1.15.0)", "
|
||||
test = ["Pillow", "contourpy[test-no-images]", "matplotlib"]
|
||||
test-no-images = ["pytest", "pytest-cov", "pytest-rerunfailures", "pytest-xdist", "wurlitzer"]
|
||||
|
||||
[[package]]
|
||||
name = "courlan"
|
||||
version = "1.3.2"
|
||||
description = "Clean, filter and sample URLs to optimize data collection – includes spam, content type and language filters."
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "courlan-1.3.2-py3-none-any.whl", hash = "sha256:d0dab52cf5b5b1000ee2839fbc2837e93b2514d3cb5bb61ae158a55b7a04c6be"},
|
||||
{file = "courlan-1.3.2.tar.gz", hash = "sha256:0b66f4db3a9c39a6e22dd247c72cfaa57d68ea660e94bb2c84ec7db8712af190"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
babel = ">=2.16.0"
|
||||
tld = ">=0.13"
|
||||
urllib3 = ">=1.26,<3"
|
||||
|
||||
[package.extras]
|
||||
dev = ["black", "flake8", "mypy", "pytest", "pytest-cov", "types-urllib3"]
|
||||
|
||||
[[package]]
|
||||
name = "coverage"
|
||||
version = "7.13.1"
|
||||
@@ -1883,6 +1918,29 @@ files = [
|
||||
{file = "cymem-2.0.11.tar.gz", hash = "sha256:efe49a349d4a518be6b6c6b255d4a80f740a341544bde1a807707c058b88d0bd"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "dateparser"
|
||||
version = "1.2.2"
|
||||
description = "Date parsing library designed to parse dates from HTML pages"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "dateparser-1.2.2-py3-none-any.whl", hash = "sha256:5a5d7211a09013499867547023a2a0c91d5a27d15dd4dbcea676ea9fe66f2482"},
|
||||
{file = "dateparser-1.2.2.tar.gz", hash = "sha256:986316f17cb8cdc23ea8ce563027c5ef12fc725b6fb1d137c14ca08777c5ecf7"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
python-dateutil = ">=2.7.0"
|
||||
pytz = ">=2024.2"
|
||||
regex = ">=2024.9.11"
|
||||
tzlocal = ">=0.2"
|
||||
|
||||
[package.extras]
|
||||
calendars = ["convertdate (>=2.2.1)", "hijridate"]
|
||||
fasttext = ["fasttext (>=0.9.1)", "numpy (>=1.19.3,<2)"]
|
||||
langdetect = ["langdetect (>=1.0.0)"]
|
||||
|
||||
[[package]]
|
||||
name = "ddgs"
|
||||
version = "9.10.0"
|
||||
@@ -3063,6 +3121,30 @@ files = [
|
||||
{file = "hpack-4.1.0.tar.gz", hash = "sha256:ec5eca154f7056aa06f196a557655c5b009b382873ac8d1e66e79e87535f1dca"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "htmldate"
|
||||
version = "1.9.4"
|
||||
description = "Fast and robust extraction of original and updated publication dates from URLs and web pages."
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "htmldate-1.9.4-py3-none-any.whl", hash = "sha256:1b94bcc4e08232a5b692159903acf95548b6a7492dddca5bb123d89d6325921c"},
|
||||
{file = "htmldate-1.9.4.tar.gz", hash = "sha256:1129063e02dd0354b74264de71e950c0c3fcee191178321418ccad2074cc8ed0"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
charset_normalizer = ">=3.4.0"
|
||||
dateparser = ">=1.1.2"
|
||||
lxml = {version = ">=5.3.0", markers = "platform_system != \"Darwin\" or python_version > \"3.8\""}
|
||||
python-dateutil = ">=2.9.0.post0"
|
||||
urllib3 = ">=1.26,<3"
|
||||
|
||||
[package.extras]
|
||||
all = ["htmldate[dev]", "htmldate[speed]"]
|
||||
dev = ["black", "flake8", "mypy", "pytest", "pytest-cov", "types-dateparser", "types-lxml", "types-python-dateutil", "types-urllib3"]
|
||||
speed = ["backports-datetime-fromisoformat ; python_version < \"3.11\"", "faust-cchardet (>=2.1.19)", "urllib3[brotli]"]
|
||||
|
||||
[[package]]
|
||||
name = "httpcore"
|
||||
version = "1.0.8"
|
||||
@@ -3605,6 +3687,21 @@ files = [
|
||||
[package.dependencies]
|
||||
referencing = ">=0.31.0"
|
||||
|
||||
[[package]]
|
||||
name = "justext"
|
||||
version = "3.0.2"
|
||||
description = "Heuristic based boilerplate removal tool"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "justext-3.0.2-py2.py3-none-any.whl", hash = "sha256:62b1c562b15c3c6265e121cc070874243a443bfd53060e869393f09d6b6cc9a7"},
|
||||
{file = "justext-3.0.2.tar.gz", hash = "sha256:13496a450c44c4cd5b5a75a5efcd9996066d2a189794ea99a49949685a0beb05"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
lxml = {version = ">=4.4.2", extras = ["html-clean"]}
|
||||
|
||||
[[package]]
|
||||
name = "kiwisolver"
|
||||
version = "1.4.9"
|
||||
@@ -3946,12 +4043,30 @@ files = [
|
||||
{file = "lxml-6.0.2.tar.gz", hash = "sha256:cd79f3367bd74b317dda655dc8fcfa304d9eb6e4fb06b7168c5cf27f96e0cd62"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
lxml_html_clean = {version = "*", optional = true, markers = "extra == \"html-clean\""}
|
||||
|
||||
[package.extras]
|
||||
cssselect = ["cssselect (>=0.7)"]
|
||||
html-clean = ["lxml_html_clean"]
|
||||
html5 = ["html5lib"]
|
||||
htmlsoup = ["BeautifulSoup4"]
|
||||
|
||||
[[package]]
|
||||
name = "lxml-html-clean"
|
||||
version = "0.4.3"
|
||||
description = "HTML cleaner from lxml project"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "lxml_html_clean-0.4.3-py3-none-any.whl", hash = "sha256:63fd7b0b9c3a2e4176611c2ca5d61c4c07ffca2de76c14059a81a2825833731e"},
|
||||
{file = "lxml_html_clean-0.4.3.tar.gz", hash = "sha256:c9df91925b00f836c807beab127aac82575110eacff54d0a75187914f1bd9d8c"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
lxml = "*"
|
||||
|
||||
[[package]]
|
||||
name = "markdown-it-py"
|
||||
version = "4.0.0"
|
||||
@@ -6524,10 +6639,9 @@ dev = ["atomicwrites (==1.2.1)", "attrs (==19.2.0)", "coverage (==6.5.0)", "hatc
|
||||
name = "pytz"
|
||||
version = "2025.2"
|
||||
description = "World timezone definitions, modern and historical"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
groups = ["main"]
|
||||
markers = "extra == \"benchmark\""
|
||||
files = [
|
||||
{file = "pytz-2025.2-py2.py3-none-any.whl", hash = "sha256:5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00"},
|
||||
{file = "pytz-2025.2.tar.gz", hash = "sha256:360b9e3dbb49a209c21ad61809c7fb453643e048b38924c765813546746e81c3"},
|
||||
@@ -7906,6 +8020,18 @@ requests = ">=2.26.0"
|
||||
[package.extras]
|
||||
blobfile = ["blobfile (>=2)"]
|
||||
|
||||
[[package]]
|
||||
name = "tld"
|
||||
version = "0.13.1"
|
||||
description = "Extract the top-level domain (TLD) from the URL given."
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "tld-0.13.1-py2.py3-none-any.whl", hash = "sha256:a2d35109433ac83486ddf87e3c4539ab2c5c2478230e5d9c060a18af4b03aa7c"},
|
||||
{file = "tld-0.13.1.tar.gz", hash = "sha256:75ec00936cbcf564f67361c41713363440b6c4ef0f0c1592b5b0fbe72c17a350"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokenizers"
|
||||
version = "0.22.2"
|
||||
@@ -7982,6 +8108,31 @@ notebook = ["ipywidgets (>=6)"]
|
||||
slack = ["slack-sdk"]
|
||||
telegram = ["requests"]
|
||||
|
||||
[[package]]
|
||||
name = "trafilatura"
|
||||
version = "2.0.0"
|
||||
description = "Python & Command-line tool to gather text and metadata on the Web: Crawling, scraping, extraction, output as CSV, JSON, HTML, MD, TXT, XML."
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "trafilatura-2.0.0-py3-none-any.whl", hash = "sha256:77eb5d1e993747f6f20938e1de2d840020719735690c840b9a1024803a4cd51d"},
|
||||
{file = "trafilatura-2.0.0.tar.gz", hash = "sha256:ceb7094a6ecc97e72fea73c7dba36714c5c5b577b6470e4520dca893706d6247"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
certifi = "*"
|
||||
charset_normalizer = ">=3.4.0"
|
||||
courlan = ">=1.3.2"
|
||||
htmldate = ">=1.9.2"
|
||||
justext = ">=3.0.1"
|
||||
lxml = {version = ">=5.3.0", markers = "platform_system != \"Darwin\" or python_version > \"3.8\""}
|
||||
urllib3 = ">=1.26,<3"
|
||||
|
||||
[package.extras]
|
||||
all = ["brotli", "cchardet (>=2.1.7) ; python_version < \"3.11\"", "faust-cchardet (>=2.1.19) ; python_version >= \"3.11\"", "htmldate[speed] (>=1.9.2)", "py3langid (>=0.3.0)", "pycurl (>=7.45.3)", "urllib3[socks]", "zstandard (>=0.23.0)"]
|
||||
dev = ["flake8", "mypy", "pytest", "pytest-cov", "types-lxml", "types-urllib3"]
|
||||
|
||||
[[package]]
|
||||
name = "traitlets"
|
||||
version = "5.14.3"
|
||||
@@ -8221,15 +8372,33 @@ typing-extensions = ">=4.12.0"
|
||||
name = "tzdata"
|
||||
version = "2025.3"
|
||||
description = "Provider of IANA time zone data"
|
||||
optional = false
|
||||
python-versions = ">=2"
|
||||
groups = ["main"]
|
||||
markers = "extra == \"benchmark\" or platform_system == \"Windows\""
|
||||
files = [
|
||||
{file = "tzdata-2025.3-py2.py3-none-any.whl", hash = "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1"},
|
||||
{file = "tzdata-2025.3.tar.gz", hash = "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tzlocal"
|
||||
version = "5.3.1"
|
||||
description = "tzinfo object for the local timezone"
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "tzlocal-5.3.1-py3-none-any.whl", hash = "sha256:eb1a66c3ef5847adf7a834f1be0800581b683b5608e74f86ecbcef8ab91bb85d"},
|
||||
{file = "tzlocal-5.3.1.tar.gz", hash = "sha256:cceffc7edecefea1f595541dbd6e990cb1ea3d19bf01b2809f362a03dd7921fd"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
tzdata = {version = "*", markers = "platform_system == \"Windows\""}
|
||||
|
||||
[package.extras]
|
||||
devenv = ["check-manifest", "pytest (>=4.3)", "pytest-cov", "pytest-mock (>=3.3)", "zest.releaser"]
|
||||
|
||||
[[package]]
|
||||
name = "uritemplate"
|
||||
version = "4.2.0"
|
||||
@@ -9022,4 +9191,4 @@ benchmark = ["agbenchmark"]
|
||||
[metadata]
|
||||
lock-version = "2.1"
|
||||
python-versions = "^3.12"
|
||||
content-hash = "7de5064cccbd74f3bc85ded976c37b249f7da997d6ef5b5853240a57daae7695"
|
||||
|
||||
@@ -37,6 +37,7 @@ colorama = "^0.4.6"
|
||||
demjson3 = "^3.0.0"
|
||||
docker = "*"
|
||||
ddgs = "^9.9"
|
||||
trafilatura = "^2.0"
|
||||
en-core-web-sm = { url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl" }
|
||||
fastapi = "^0.109.1"
|
||||
gitpython = "^3.1.32"
|
||||
|
||||
Reference in New Issue
Block a user