mirror of
https://github.com/modelcontextprotocol/servers.git
synced 2026-02-19 11:54:58 -05:00
Merge pull request #144 from SkywaveTM/main
fix(fetch): properly handles robots.txt
This commit is contained in:
@@ -74,7 +74,9 @@ async def check_may_autonomously_fetch_url(url: str, user_agent: str) -> None:
     async with AsyncClient() as client:
         try:
             response = await client.get(
-                robot_txt_url, headers={"User-Agent": user_agent}
+                robot_txt_url,
+                follow_redirects=True,
+                headers={"User-Agent": user_agent},
             )
         except HTTPError:
             raise McpError(
@@ -93,7 +95,7 @@ async def check_may_autonomously_fetch_url(url: str, user_agent: str) -> None:
         line for line in robot_txt.splitlines() if not line.strip().startswith("#")
     )
     robot_parser = Protego.parse(processed_robot_txt)
-    if not robot_parser.can_fetch(url, user_agent):
+    if not robot_parser.can_fetch(str(url), user_agent):
         raise McpError(
             INTERNAL_ERROR,
             f"The sites robots.txt ({robot_txt_url}), specifies that autonomous fetching of this page is not allowed, "
Reference in New Issue
Block a user