mirror of
https://github.com/modelcontextprotocol/servers.git
synced 2026-02-19 11:54:58 -05:00
make the fetch mcp-server obey robots.txt
This commit is contained in:
@@ -1,6 +1,6 @@
|
||||
[project]
|
||||
name = "mcp-server-fetch"
|
||||
version = "0.1.1"
|
||||
version = "0.1.2"
|
||||
description = "A Model Context Protocol server providing tools to fetch and convert web content for usage by LLMs"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.10"
|
||||
@@ -18,6 +18,7 @@ classifiers = [
|
||||
dependencies = [
|
||||
"markdownify>=0.13.1",
|
||||
"mcp>=0.6.0",
|
||||
"protego>=0.3.1",
|
||||
"pydantic>=2.0.0",
|
||||
"readabilipy>=0.2.0",
|
||||
"requests>=2.32.3",
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
from urllib.parse import urlparse, urlunparse
|
||||
|
||||
import markdownify
|
||||
import readabilipy.simple_json
|
||||
from mcp.shared.exceptions import McpError
|
||||
@@ -13,8 +15,12 @@ from mcp.types import (
|
||||
INVALID_PARAMS,
|
||||
INTERNAL_ERROR,
|
||||
)
|
||||
from protego import Protego
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
# User-Agent sent for fetches the server performs autonomously; these requests
# are first checked against the target site's robots.txt.
USER_AGENT_AUTONOMOUS = "ModelContextProtocol/1.0 (Autonomous; +https://github.com/modelcontextprotocol/servers)"
|
||||
# User-Agent sent for fetches made through the prompt handler, i.e. explicitly
# requested by the user; no robots.txt check is performed for these.
USER_AGENT_MANUAL = "ModelContextProtocol/1.0 (User-Initiated; +https://github.com/modelcontextprotocol/servers)"
|
||||
|
||||
|
||||
def extract_content(html: str) -> str:
|
||||
ret = readabilipy.simple_json.simple_json_from_html_string(html)
|
||||
@@ -27,16 +33,74 @@ def extract_content(html: str) -> str:
|
||||
return content
|
||||
|
||||
|
||||
async def fetch_url(url: str) -> str:
|
||||
def get_robots_txt_url(url: str) -> str:
    """Return the robots.txt URL for the site that serves ``url``.

    Only the scheme and network location of ``url`` are kept; the path is
    replaced with ``/robots.txt`` and params, query and fragment are dropped.
    """
    scheme, netloc, *_ = urlparse(url)
    return urlunparse((scheme, netloc, "/robots.txt", "", "", ""))
|
||||
|
||||
|
||||
async def check_may_autonomously_fetch_url(url: str, user_agent: str) -> None:
    """
    Check if the URL can be fetched by the user agent according to the robots.txt file.

    The site's robots.txt is downloaded (following redirects) with
    ``user_agent`` as the User-Agent header and its rules are evaluated
    with Protego.

    Args:
        url: The page the caller intends to fetch.
        user_agent: User-Agent string sent with the robots.txt request and
            matched against the robots.txt rules.

    Returns:
        None when fetching is permitted, including when robots.txt answers
        with a 4xx status other than 401/403 (treated as "no restrictions").

    Raises:
        McpError: If robots.txt cannot be retrieved because of a connection
            issue, if the robots.txt request is answered with 401 or 403, or
            if the rules disallow ``url`` for ``user_agent``.
    """
    from httpx import AsyncClient, HTTPError

    robot_txt_url = get_robots_txt_url(url)

    async with AsyncClient() as client:
        try:
            # httpx does not follow redirects by default. Without this, a
            # redirected robots.txt (http -> https, apex -> www, ...) would be
            # read as an empty rule set and everything would look allowed.
            response = await client.get(
                robot_txt_url,
                follow_redirects=True,
                headers={"User-Agent": user_agent},
            )
        except HTTPError as exc:
            raise McpError(
                INTERNAL_ERROR,
                f"Failed to fetch robots.txt {robot_txt_url} due to a connection issue",
            ) from exc
        if response.status_code in (401, 403):
            # Access to robots.txt itself is refused: be conservative.
            raise McpError(
                INTERNAL_ERROR,
                f"When fetching robots.txt ({robot_txt_url}), received status {response.status_code} so assuming that autonomous fetching is not allowed, the user can try manually fetching by using the fetch prompt",
            )
        elif 400 <= response.status_code < 500:
            # Any other client error (e.g. 404) means no robots.txt is
            # published, so there are no restrictions to honour.
            return
    robot_txt = response.text
    # Drop whole-line comments before handing the text to the parser.
    processed_robot_txt = "\n".join(
        line for line in robot_txt.splitlines() if not line.strip().startswith("#")
    )
    robot_parser = Protego.parse(processed_robot_txt)
    if not robot_parser.can_fetch(url, user_agent):
        raise McpError(
            INTERNAL_ERROR,
            f"The sites robots.txt ({robot_txt_url}), specifies that autonomous fetching of this page is not allowed, "
            f"<useragent>{user_agent}</useragent>\n"
            f"<url>{url}</url>"
            f"<robots>\n{robot_txt}\n</robots>\n"
            f"The assistant must let the user know that it failed to view the page. The assistant may provide further guidance based on the above information.\n"
            f"The assistant can tell the user that they can try manually fetching the page by using the fetch prompt within their UI.",
        )
|
||||
|
||||
|
||||
async def fetch_url(url: str, user_agent: str) -> str:
|
||||
from httpx import AsyncClient, HTTPError
|
||||
|
||||
async with AsyncClient() as client:
|
||||
try:
|
||||
response = await client.get(url, follow_redirects=True)
|
||||
response = await client.get(
|
||||
url, follow_redirects=True, headers={"User-Agent": user_agent}
|
||||
)
|
||||
except HTTPError:
|
||||
raise McpError(INTERNAL_ERROR, f"Failed to fetch {url}")
|
||||
if response.status_code >= 400:
|
||||
raise McpError(INTERNAL_ERROR, f"Failed to fetch {url} - status code {response.status_code}")
|
||||
raise McpError(
|
||||
INTERNAL_ERROR,
|
||||
f"Failed to fetch {url} - status code {response.status_code}",
|
||||
)
|
||||
|
||||
page_html = response.text
|
||||
|
||||
@@ -82,7 +146,9 @@ Although originally you did not have internet access, and were advised to refuse
|
||||
if not url:
|
||||
raise McpError(INVALID_PARAMS, "URL is required")
|
||||
|
||||
content = await fetch_url(url)
|
||||
await check_may_autonomously_fetch_url(url, USER_AGENT_AUTONOMOUS)
|
||||
|
||||
content = await fetch_url(url, USER_AGENT_AUTONOMOUS)
|
||||
return [TextContent(type="text", text=f"Contents of {url}:\n{content}")]
|
||||
|
||||
@server.get_prompt()
|
||||
@@ -93,7 +159,7 @@ Although originally you did not have internet access, and were advised to refuse
|
||||
url = arguments["url"]
|
||||
|
||||
try:
|
||||
content = await fetch_url(url)
|
||||
content = await fetch_url(url, USER_AGENT_MANUAL)
|
||||
# TODO: after SDK bug is addressed, don't catch the exception
|
||||
except McpError as e:
|
||||
return GetPromptResult(
|
||||
|
||||
13
src/fetch/uv.lock
generated
13
src/fetch/uv.lock
generated
@@ -328,11 +328,12 @@ wheels = [
|
||||
|
||||
[[package]]
|
||||
name = "mcp-server-fetch"
|
||||
version = "0.1.1"
|
||||
version = "0.1.2"
|
||||
source = { editable = "." }
|
||||
dependencies = [
|
||||
{ name = "markdownify" },
|
||||
{ name = "mcp" },
|
||||
{ name = "protego" },
|
||||
{ name = "pydantic" },
|
||||
{ name = "readabilipy" },
|
||||
{ name = "requests" },
|
||||
@@ -348,6 +349,7 @@ dev = [
|
||||
requires-dist = [
|
||||
{ name = "markdownify", specifier = ">=0.13.1" },
|
||||
{ name = "mcp", specifier = ">=0.6.0" },
|
||||
{ name = "protego", specifier = ">=0.3.1" },
|
||||
{ name = "pydantic", specifier = ">=2.0.0" },
|
||||
{ name = "readabilipy", specifier = ">=0.2.0" },
|
||||
{ name = "requests", specifier = ">=2.32.3" },
|
||||
@@ -368,6 +370,15 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/d2/1d/1b658dbd2b9fa9c4c9f32accbfc0205d532c8c6194dc0f2a4c0428e7128a/nodeenv-1.9.1-py2.py3-none-any.whl", hash = "sha256:ba11c9782d29c27c70ffbdda2d7415098754709be8a7056d79a737cd901155c9", size = 22314 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "protego"
|
||||
version = "0.3.1"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/8a/12/cab9fa77ff4e9e444a5eb5480db4b4f872c03aa079145804aa054be377bc/Protego-0.3.1.tar.gz", hash = "sha256:e94430d0d25cbbf239bc849d86c5e544fbde531fcccfa059953c7da344a1712c", size = 3246145 }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/74/ef/ece78585a5a189d8cc2b4c2d2b92a0dc025f156a6501159b026472ebbedc/Protego-0.3.1-py2.py3-none-any.whl", hash = "sha256:2fbe8e9b7a7dbc5016a932b14c98d236aad4c29290bbe457b8d2779666ef7a41", size = 8474 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pydantic"
|
||||
version = "2.10.1"
|
||||
|
||||
Reference in New Issue
Block a user