make the fetch mcp-server obey robots.txt

This commit is contained in:
Jack Adamson
2024-11-22 18:51:39 +00:00
parent c08d2a3cf6
commit 84321c89cf
3 changed files with 85 additions and 7 deletions

View File

@@ -1,6 +1,6 @@
[project]
name = "mcp-server-fetch"
version = "0.1.1"
version = "0.1.2"
description = "A Model Context Protocol server providing tools to fetch and convert web content for usage by LLMs"
readme = "README.md"
requires-python = ">=3.10"
@@ -18,6 +18,7 @@ classifiers = [
dependencies = [
"markdownify>=0.13.1",
"mcp>=0.6.0",
"protego>=0.3.1",
"pydantic>=2.0.0",
"readabilipy>=0.2.0",
"requests>=2.32.3",

View File

@@ -1,3 +1,5 @@
from urllib.parse import urlparse, urlunparse
import markdownify
import readabilipy.simple_json
from mcp.shared.exceptions import McpError
@@ -13,8 +15,12 @@ from mcp.types import (
INVALID_PARAMS,
INTERNAL_ERROR,
)
from protego import Protego
from pydantic import BaseModel, Field
# User-Agent header value for fetches the server performs on its own initiative.
# The call site that uses it first passes it to check_may_autonomously_fetch_url,
# so requests made under this identity are subject to the robots.txt check.
USER_AGENT_AUTONOMOUS = "ModelContextProtocol/1.0 (Autonomous; +https://github.com/modelcontextprotocol/servers)"
# User-Agent header value for fetches made through the fetch prompt. The visible
# call site passes it straight to fetch_url without a robots.txt check.
USER_AGENT_MANUAL = "ModelContextProtocol/1.0 (User-Initiated; +https://github.com/modelcontextprotocol/servers)"
def extract_content(html: str) -> str:
ret = readabilipy.simple_json.simple_json_from_html_string(html)
@@ -27,16 +33,74 @@ def extract_content(html: str) -> str:
return content
async def fetch_url(url: str) -> str:
def get_robots_txt_url(url: str) -> str:
    """Return the robots.txt URL for the site that serves *url*.

    Only the scheme and network location of *url* are kept; the path is
    replaced with ``/robots.txt`` and the params, query string and fragment
    are dropped.
    """
    scheme, netloc, *_ = urlparse(url)
    return urlunparse((scheme, netloc, "/robots.txt", "", "", ""))
async def check_may_autonomously_fetch_url(url: str, user_agent: str) -> None:
    """
    Check if the URL can be fetched by the user agent according to the robots.txt file.

    The site's robots.txt is downloaded (following redirects) using
    ``user_agent`` as the User-Agent header and then evaluated for ``url``.

    Args:
        url: The page URL that is about to be fetched.
        user_agent: The User-Agent string used both to request robots.txt and
            to look up the matching rules in it.

    Returns:
        None when fetching is allowed. This includes the case where the
        robots.txt request returns a 4xx status other than 401/403.

    Raises:
        McpError: If robots.txt cannot be retrieved because of an HTTP/connection
            error, if the robots.txt request returns 401 or 403, or if the
            parsed rules disallow ``url`` for ``user_agent``.
    """
    from httpx import AsyncClient, HTTPError

    robot_txt_url = get_robots_txt_url(url)

    async with AsyncClient() as client:
        try:
            # Follow redirects (e.g. http -> https, apex -> www) so the rules
            # that get parsed are the real robots.txt rather than the body of
            # a 3xx response, which would otherwise look like "no rules".
            response = await client.get(
                robot_txt_url,
                follow_redirects=True,
                headers={"User-Agent": user_agent},
            )
        except HTTPError as exc:
            # Chain the original error so the underlying cause stays visible.
            raise McpError(
                INTERNAL_ERROR,
                f"Failed to fetch robots.txt {robot_txt_url} due to a connection issue",
            ) from exc

    if response.status_code in (401, 403):
        # Access to robots.txt itself is denied: treat as "not allowed".
        raise McpError(
            INTERNAL_ERROR,
            f"When fetching robots.txt ({robot_txt_url}), received status {response.status_code} so assuming that autonomous fetching is not allowed, the user can try manually fetching by using the fetch prompt",
        )
    elif 400 <= response.status_code < 500:
        # Any other client error (e.g. 404): no usable robots.txt, so allow.
        return

    robot_txt = response.text
    # Comment-only lines are removed before parsing; the unmodified text is
    # still what gets echoed back in the error message below.
    processed_robot_txt = "\n".join(
        line for line in robot_txt.splitlines() if not line.strip().startswith("#")
    )
    robot_parser = Protego.parse(processed_robot_txt)
    if not robot_parser.can_fetch(url, user_agent):
        raise McpError(
            INTERNAL_ERROR,
            f"The sites robots.txt ({robot_txt_url}), specifies that autonomous fetching of this page is not allowed, "
            f"<useragent>{user_agent}</useragent>\n"
            f"<url>{url}</url>\n"
            f"<robots>\n{robot_txt}\n</robots>\n"
            f"The assistant must let the user know that it failed to view the page. The assistant may provide further guidance based on the above information.\n"
            f"The assistant can tell the user that they can try manually fetching the page by using the fetch prompt within their UI.",
        )
async def fetch_url(url: str, user_agent: str) -> str:
from httpx import AsyncClient, HTTPError
async with AsyncClient() as client:
try:
response = await client.get(url, follow_redirects=True)
response = await client.get(
url, follow_redirects=True, headers={"User-Agent": user_agent}
)
except HTTPError:
raise McpError(INTERNAL_ERROR, f"Failed to fetch {url}")
if response.status_code >= 400:
raise McpError(INTERNAL_ERROR, f"Failed to fetch {url} - status code {response.status_code}")
raise McpError(
INTERNAL_ERROR,
f"Failed to fetch {url} - status code {response.status_code}",
)
page_html = response.text
@@ -82,7 +146,9 @@ Although originally you did not have internet access, and were advised to refuse
if not url:
raise McpError(INVALID_PARAMS, "URL is required")
content = await fetch_url(url)
await check_may_autonomously_fetch_url(url, USER_AGENT_AUTONOMOUS)
content = await fetch_url(url, USER_AGENT_AUTONOMOUS)
return [TextContent(type="text", text=f"Contents of {url}:\n{content}")]
@server.get_prompt()
@@ -93,7 +159,7 @@ Although originally you did not have internet access, and were advised to refuse
url = arguments["url"]
try:
content = await fetch_url(url)
content = await fetch_url(url, USER_AGENT_MANUAL)
# TODO: after SDK bug is addressed, don't catch the exception
except McpError as e:
return GetPromptResult(

13
src/fetch/uv.lock generated
View File

@@ -328,11 +328,12 @@ wheels = [
[[package]]
name = "mcp-server-fetch"
version = "0.1.1"
version = "0.1.2"
source = { editable = "." }
dependencies = [
{ name = "markdownify" },
{ name = "mcp" },
{ name = "protego" },
{ name = "pydantic" },
{ name = "readabilipy" },
{ name = "requests" },
@@ -348,6 +349,7 @@ dev = [
requires-dist = [
{ name = "markdownify", specifier = ">=0.13.1" },
{ name = "mcp", specifier = ">=0.6.0" },
{ name = "protego", specifier = ">=0.3.1" },
{ name = "pydantic", specifier = ">=2.0.0" },
{ name = "readabilipy", specifier = ">=0.2.0" },
{ name = "requests", specifier = ">=2.32.3" },
@@ -368,6 +370,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/d2/1d/1b658dbd2b9fa9c4c9f32accbfc0205d532c8c6194dc0f2a4c0428e7128a/nodeenv-1.9.1-py2.py3-none-any.whl", hash = "sha256:ba11c9782d29c27c70ffbdda2d7415098754709be8a7056d79a737cd901155c9", size = 22314 },
]
[[package]]
name = "protego"
version = "0.3.1"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/8a/12/cab9fa77ff4e9e444a5eb5480db4b4f872c03aa079145804aa054be377bc/Protego-0.3.1.tar.gz", hash = "sha256:e94430d0d25cbbf239bc849d86c5e544fbde531fcccfa059953c7da344a1712c", size = 3246145 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/74/ef/ece78585a5a189d8cc2b4c2d2b92a0dc025f156a6501159b026472ebbedc/Protego-0.3.1-py2.py3-none-any.whl", hash = "sha256:2fbe8e9b7a7dbc5016a932b14c98d236aad4c29290bbe457b8d2779666ef7a41", size = 8474 },
]
[[package]]
name = "pydantic"
version = "2.10.1"