diff --git a/src/fetch/pyproject.toml b/src/fetch/pyproject.toml
index ac417cb2..a2681f93 100644
--- a/src/fetch/pyproject.toml
+++ b/src/fetch/pyproject.toml
@@ -1,6 +1,6 @@
[project]
name = "mcp-server-fetch"
-version = "0.1.1"
+version = "0.1.2"
description = "A Model Context Protocol server providing tools to fetch and convert web content for usage by LLMs"
readme = "README.md"
requires-python = ">=3.10"
@@ -18,6 +18,7 @@ classifiers = [
dependencies = [
"markdownify>=0.13.1",
"mcp>=0.6.0",
+ "protego>=0.3.1",
"pydantic>=2.0.0",
"readabilipy>=0.2.0",
"requests>=2.32.3",
diff --git a/src/fetch/src/mcp_server_fetch/server.py b/src/fetch/src/mcp_server_fetch/server.py
index 055dfc4b..d2e72c25 100644
--- a/src/fetch/src/mcp_server_fetch/server.py
+++ b/src/fetch/src/mcp_server_fetch/server.py
@@ -1,3 +1,5 @@
+from urllib.parse import urlparse, urlunparse
+
import markdownify
import readabilipy.simple_json
from mcp.shared.exceptions import McpError
@@ -13,8 +15,12 @@ from mcp.types import (
INVALID_PARAMS,
INTERNAL_ERROR,
)
+from protego import Protego
from pydantic import BaseModel, Field
+USER_AGENT_AUTONOMOUS = "ModelContextProtocol/1.0 (Autonomous; +https://github.com/modelcontextprotocol/servers)"
+USER_AGENT_MANUAL = "ModelContextProtocol/1.0 (User-Initiated; +https://github.com/modelcontextprotocol/servers)"
+
def extract_content(html: str) -> str:
ret = readabilipy.simple_json.simple_json_from_html_string(html)
@@ -27,16 +33,74 @@ def extract_content(html: str) -> str:
return content
-async def fetch_url(url: str) -> str:
+def get_robots_txt_url(url: str) -> str:
+    """Return the robots.txt URL for the site that serves ``url``.
+
+    The scheme and network location of ``url`` are kept; the path is replaced
+    with ``/robots.txt`` and the params, query and fragment are cleared.
+    """
+    site = urlparse(url)
+    return urlunparse(
+        site._replace(path="/robots.txt", params="", query="", fragment="")
+    )
+
+
+async def check_may_autonomously_fetch_url(url: str, user_agent: str):
+    """
+    Check if the URL can be fetched by the user agent according to the robots.txt file.
+    Raises an McpError if not.
+
+    Args:
+        url: The page the caller wants to fetch.
+        user_agent: Sent as the User-Agent header when requesting robots.txt,
+            and the agent name the robots.txt rules are evaluated for.
+
+    Returns:
+        None. Returning normally means the fetch is permitted, either because
+        the rules allow it or because robots.txt answered with a 4xx status
+        other than 401/403.
+
+    Raises:
+        McpError: If robots.txt cannot be retrieved, if it answers 401 or 403,
+            or if its rules do not allow ``user_agent`` to fetch ``url``.
+    """
+    from httpx import AsyncClient, HTTPError
+
+    robot_txt_url = get_robots_txt_url(url)
+
+    async with AsyncClient() as client:
+        try:
+            # Follow redirects (http -> https, apex -> www, ...). Otherwise a
+            # 3xx answer falls through the status checks below and its body is
+            # parsed as if it were the robots.txt, which allows everything.
+            response = await client.get(
+                robot_txt_url,
+                follow_redirects=True,
+                headers={"User-Agent": user_agent},
+            )
+        except HTTPError as err:
+            raise McpError(
+                INTERNAL_ERROR,
+                f"Failed to fetch robots.txt {robot_txt_url} due to a connection issue",
+            ) from err
+        if response.status_code in (401, 403):
+            raise McpError(
+                INTERNAL_ERROR,
+                f"When fetching robots.txt ({robot_txt_url}), received status {response.status_code} so assuming that autonomous fetching is not allowed, the user can try manually fetching by using the fetch prompt",
+            )
+        elif 400 <= response.status_code < 500:
+            # Any other client error: no rules are applied, fetching is allowed.
+            return
+        robot_txt = response.text
+    # Drop whole-line comments before handing the text to the parser.
+    processed_robot_txt = "\n".join(
+        line for line in robot_txt.splitlines() if not line.strip().startswith("#")
+    )
+    robot_parser = Protego.parse(processed_robot_txt)
+    if not robot_parser.can_fetch(url, user_agent):
+        raise McpError(
+            INTERNAL_ERROR,
+            f"The sites robots.txt ({robot_txt_url}), specifies that autonomous fetching of this page is not allowed, "
+            f"{user_agent}\n"
+            f"{url}"
+            f"\n{robot_txt}\n\n"
+            f"The assistant must let the user know that it failed to view the page. The assistant may provide further guidance based on the above information.\n"
+            f"The assistant can tell the user that they can try manually fetching the page by using the fetch prompt within their UI.",
+        )
+
+
+async def fetch_url(url: str, user_agent: str) -> str:
from httpx import AsyncClient, HTTPError
async with AsyncClient() as client:
try:
- response = await client.get(url, follow_redirects=True)
+ response = await client.get(
+ url, follow_redirects=True, headers={"User-Agent": user_agent}
+ )
except HTTPError:
raise McpError(INTERNAL_ERROR, f"Failed to fetch {url}")
if response.status_code >= 400:
- raise McpError(INTERNAL_ERROR, f"Failed to fetch {url} - status code {response.status_code}")
+ raise McpError(
+ INTERNAL_ERROR,
+ f"Failed to fetch {url} - status code {response.status_code}",
+ )
page_html = response.text
@@ -82,7 +146,9 @@ Although originally you did not have internet access, and were advised to refuse
if not url:
raise McpError(INVALID_PARAMS, "URL is required")
- content = await fetch_url(url)
+ await check_may_autonomously_fetch_url(url, USER_AGENT_AUTONOMOUS)
+
+ content = await fetch_url(url, USER_AGENT_AUTONOMOUS)
return [TextContent(type="text", text=f"Contents of {url}:\n{content}")]
@server.get_prompt()
@@ -93,7 +159,7 @@ Although originally you did not have internet access, and were advised to refuse
url = arguments["url"]
try:
- content = await fetch_url(url)
+ content = await fetch_url(url, USER_AGENT_MANUAL)
# TODO: after SDK bug is addressed, don't catch the exception
except McpError as e:
return GetPromptResult(
diff --git a/src/fetch/uv.lock b/src/fetch/uv.lock
index e8fc771a..45990366 100644
--- a/src/fetch/uv.lock
+++ b/src/fetch/uv.lock
@@ -328,11 +328,12 @@ wheels = [
[[package]]
name = "mcp-server-fetch"
-version = "0.1.1"
+version = "0.1.2"
source = { editable = "." }
dependencies = [
{ name = "markdownify" },
{ name = "mcp" },
+ { name = "protego" },
{ name = "pydantic" },
{ name = "readabilipy" },
{ name = "requests" },
@@ -348,6 +349,7 @@ dev = [
requires-dist = [
{ name = "markdownify", specifier = ">=0.13.1" },
{ name = "mcp", specifier = ">=0.6.0" },
+ { name = "protego", specifier = ">=0.3.1" },
{ name = "pydantic", specifier = ">=2.0.0" },
{ name = "readabilipy", specifier = ">=0.2.0" },
{ name = "requests", specifier = ">=2.32.3" },
@@ -368,6 +370,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/d2/1d/1b658dbd2b9fa9c4c9f32accbfc0205d532c8c6194dc0f2a4c0428e7128a/nodeenv-1.9.1-py2.py3-none-any.whl", hash = "sha256:ba11c9782d29c27c70ffbdda2d7415098754709be8a7056d79a737cd901155c9", size = 22314 },
]
+[[package]]
+name = "protego"
+version = "0.3.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/8a/12/cab9fa77ff4e9e444a5eb5480db4b4f872c03aa079145804aa054be377bc/Protego-0.3.1.tar.gz", hash = "sha256:e94430d0d25cbbf239bc849d86c5e544fbde531fcccfa059953c7da344a1712c", size = 3246145 }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/74/ef/ece78585a5a189d8cc2b4c2d2b92a0dc025f156a6501159b026472ebbedc/Protego-0.3.1-py2.py3-none-any.whl", hash = "sha256:2fbe8e9b7a7dbc5016a932b14c98d236aad4c29290bbe457b8d2779666ef7a41", size = 8474 },
+]
+
[[package]]
name = "pydantic"
version = "2.10.1"