Merge pull request #144 from SkywaveTM/main

fix(fetch): properly handles robots.txt
Authored by Justin Spahr-Summers · committed by GitHub · 2024-12-03 07:18:33 -06:00


```diff
@@ -74,7 +74,9 @@ async def check_may_autonomously_fetch_url(url: str, user_agent: str) -> None:
     async with AsyncClient() as client:
         try:
             response = await client.get(
-                robot_txt_url, headers={"User-Agent": user_agent}
+                robot_txt_url,
+                follow_redirects=True,
+                headers={"User-Agent": user_agent},
             )
         except HTTPError:
             raise McpError(
@@ -93,7 +95,7 @@ async def check_may_autonomously_fetch_url(url: str, user_agent: str) -> None:
         line for line in robot_txt.splitlines() if not line.strip().startswith("#")
     )
     robot_parser = Protego.parse(processed_robot_txt)
-    if not robot_parser.can_fetch(url, user_agent):
+    if not robot_parser.can_fetch(str(url), user_agent):
         raise McpError(
             INTERNAL_ERROR,
             f"The sites robots.txt ({robot_txt_url}), specifies that autonomous fetching of this page is not allowed, "
```