diff --git a/src/fetch/pyproject.toml b/src/fetch/pyproject.toml index ac417cb2..a2681f93 100644 --- a/src/fetch/pyproject.toml +++ b/src/fetch/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "mcp-server-fetch" -version = "0.1.1" +version = "0.1.2" description = "A Model Context Protocol server providing tools to fetch and convert web content for usage by LLMs" readme = "README.md" requires-python = ">=3.10" @@ -18,6 +18,7 @@ classifiers = [ dependencies = [ "markdownify>=0.13.1", "mcp>=0.6.0", + "protego>=0.3.1", "pydantic>=2.0.0", "readabilipy>=0.2.0", "requests>=2.32.3", diff --git a/src/fetch/src/mcp_server_fetch/server.py b/src/fetch/src/mcp_server_fetch/server.py index 055dfc4b..d2e72c25 100644 --- a/src/fetch/src/mcp_server_fetch/server.py +++ b/src/fetch/src/mcp_server_fetch/server.py @@ -1,3 +1,5 @@ +from urllib.parse import urlparse, urlunparse + import markdownify import readabilipy.simple_json from mcp.shared.exceptions import McpError @@ -13,8 +15,12 @@ from mcp.types import ( INVALID_PARAMS, INTERNAL_ERROR, ) +from protego import Protego from pydantic import BaseModel, Field +USER_AGENT_AUTONOMOUS = "ModelContextProtocol/1.0 (Autonomous; +https://github.com/modelcontextprotocol/servers)" +USER_AGENT_MANUAL = "ModelContextProtocol/1.0 (User-Initiated; +https://github.com/modelcontextprotocol/servers)" + def extract_content(html: str) -> str: ret = readabilipy.simple_json.simple_json_from_html_string(html) @@ -27,16 +33,74 @@ def extract_content(html: str) -> str: return content -async def fetch_url(url: str) -> str: +def get_robots_txt_url(url: str) -> str: + # Parse the URL into components + parsed = urlparse(url) + + # Reconstruct the base URL with just scheme, netloc, and /robots.txt path + robots_url = urlunparse((parsed.scheme, parsed.netloc, "/robots.txt", "", "", "")) + + return robots_url + + +async def check_may_autonomously_fetch_url(url: str, user_agent: str): + """ + Check if the URL can be fetched by the user agent according to the robots.txt file. + Raises an McpError if not. + """ + from httpx import AsyncClient, HTTPError + + robot_txt_url = get_robots_txt_url(url) + + async with AsyncClient() as client: + try: + response = await client.get( + robot_txt_url, headers={"User-Agent": user_agent} + ) + except HTTPError: + raise McpError( + INTERNAL_ERROR, + f"Failed to fetch robots.txt {robot_txt_url} due to a connection issue", + ) + if response.status_code in (401, 403): + raise McpError( + INTERNAL_ERROR, + f"When fetching robots.txt ({robot_txt_url}), received status {response.status_code} so assuming that autonomous fetching is not allowed, the user can try manually fetching by using the fetch prompt", + ) + elif 400 <= response.status_code < 500: + return + robot_txt = response.text + processed_robot_txt = "\n".join( + line for line in robot_txt.splitlines() if not line.strip().startswith("#") + ) + robot_parser = Protego.parse(processed_robot_txt) + if not robot_parser.can_fetch(url, user_agent): + raise McpError( + INTERNAL_ERROR, + f"The sites robots.txt ({robot_txt_url}), specifies that autonomous fetching of this page is not allowed, " + f"{user_agent}\n" + f"{url}" + f"\n{robot_txt}\n\n" + f"The assistant must let the user know that it failed to view the page. The assistant may provide further guidance based on the above information.\n" + f"The assistant can tell the user that they can try manually fetching the page by using the fetch prompt within their UI.", + ) + + +async def fetch_url(url: str, user_agent: str) -> str: from httpx import AsyncClient, HTTPError async with AsyncClient() as client: try: - response = await client.get(url, follow_redirects=True) + response = await client.get( + url, follow_redirects=True, headers={"User-Agent": user_agent} + ) except HTTPError: raise McpError(INTERNAL_ERROR, f"Failed to fetch {url}") if response.status_code >= 400: - raise McpError(INTERNAL_ERROR, f"Failed to fetch {url} - status code {response.status_code}") + raise McpError( + INTERNAL_ERROR, + f"Failed to fetch {url} - status code {response.status_code}", + ) page_html = response.text @@ -82,7 +146,9 @@ Although originally you did not have internet access, and were advised to refuse if not url: raise McpError(INVALID_PARAMS, "URL is required") - content = await fetch_url(url) + await check_may_autonomously_fetch_url(url, USER_AGENT_AUTONOMOUS) + + content = await fetch_url(url, USER_AGENT_AUTONOMOUS) return [TextContent(type="text", text=f"Contents of {url}:\n{content}")] @server.get_prompt() @@ -93,7 +159,7 @@ Although originally you did not have internet access, and were advised to refuse url = arguments["url"] try: - content = await fetch_url(url) + content = await fetch_url(url, USER_AGENT_MANUAL) # TODO: after SDK bug is addressed, don't catch the exception except McpError as e: return GetPromptResult( diff --git a/src/fetch/uv.lock b/src/fetch/uv.lock index e8fc771a..45990366 100644 --- a/src/fetch/uv.lock +++ b/src/fetch/uv.lock @@ -328,11 +328,12 @@ wheels = [ [[package]] name = "mcp-server-fetch" -version = "0.1.1" +version = "0.1.2" source = { editable = "." } dependencies = [ { name = "markdownify" }, { name = "mcp" }, + { name = "protego" }, { name = "pydantic" }, { name = "readabilipy" }, { name = "requests" }, @@ -348,6 +349,7 @@ dev = [ requires-dist = [ { name = "markdownify", specifier = ">=0.13.1" }, { name = "mcp", specifier = ">=0.6.0" }, + { name = "protego", specifier = ">=0.3.1" }, { name = "pydantic", specifier = ">=2.0.0" }, { name = "readabilipy", specifier = ">=0.2.0" }, { name = "requests", specifier = ">=2.32.3" }, @@ -368,6 +370,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d2/1d/1b658dbd2b9fa9c4c9f32accbfc0205d532c8c6194dc0f2a4c0428e7128a/nodeenv-1.9.1-py2.py3-none-any.whl", hash = "sha256:ba11c9782d29c27c70ffbdda2d7415098754709be8a7056d79a737cd901155c9", size = 22314 }, ] +[[package]] +name = "protego" +version = "0.3.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/8a/12/cab9fa77ff4e9e444a5eb5480db4b4f872c03aa079145804aa054be377bc/Protego-0.3.1.tar.gz", hash = "sha256:e94430d0d25cbbf239bc849d86c5e544fbde531fcccfa059953c7da344a1712c", size = 3246145 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/74/ef/ece78585a5a189d8cc2b4c2d2b92a0dc025f156a6501159b026472ebbedc/Protego-0.3.1-py2.py3-none-any.whl", hash = "sha256:2fbe8e9b7a7dbc5016a932b14c98d236aad4c29290bbe457b8d2779666ef7a41", size = 8474 }, +] + [[package]] name = "pydantic" version = "2.10.1"