Compare commits

..

1 Commits

Author SHA1 Message Date
DEEVEN SERU
b8f5c208d0 Handle errors in Jina ExtractWebsiteContentBlock (#12048)
## Summary
- catch Jina reader client/server errors in ExtractWebsiteContentBlock
and surface a clear error output keyed to the user URL
- guard empty responses to return an explicit error instead of yielding
blank content
- add regression tests covering the happy path and HTTP client failures
via a monkeypatched fetch

## Testing
- not run (pytest unavailable in this environment)

---------

Co-authored-by: Nicholas Tindle <nicktindle@outlook.com>
Co-authored-by: Nicholas Tindle <nicholas.tindle@agpt.co>
2026-02-13 19:15:09 +00:00
5 changed files with 127 additions and 52 deletions

View File

@@ -11,15 +11,45 @@ import re
from collections.abc import Callable from collections.abc import Callable
from typing import Any, cast from typing import Any, cast
from backend.api.features.chat.sdk.tool_adapter import ( from backend.api.features.chat.sdk.tool_adapter import MCP_TOOL_PREFIX
BLOCKED_TOOLS,
DANGEROUS_PATTERNS,
MCP_TOOL_PREFIX,
WORKSPACE_SCOPED_TOOLS,
)
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# Tools that are blocked entirely (CLI/system access).
# "Bash" (capital) is the SDK built-in — it's NOT in allowed_tools but blocked
# here as defence-in-depth. The agent uses mcp__copilot__bash_exec instead,
# which has kernel-level network isolation (unshare --net).
# The remaining entries are lowercase shell-like names.
# NOTE(review): presumably compared against the requested tool name by the
# security hook — confirm whether the comparison is exact or case-insensitive.
BLOCKED_TOOLS = {
"Bash",
"bash",
"shell",
"exec",
"terminal",
"command",
}
# Tools allowed only when their path argument stays within the SDK workspace.
# The SDK uses these to handle oversized tool results (writes to tool-results/
# files, then reads them back) and for workspace file operations.
WORKSPACE_SCOPED_TOOLS = {"Read", "Write", "Edit", "Glob", "Grep"}
# Dangerous patterns in tool inputs.
# Each entry is a regular-expression source string written as a raw literal
# (note the \s, \| and \( escapes), not a plain substring.
# NOTE(review): how these are compiled and applied (re.search vs re.match,
# case sensitivity, which input fields are scanned) is not visible here —
# confirm in the hook that consumes this list. Unanchored entries such as
# "sudo" or "subprocess" would presumably match anywhere in the input text.
DANGEROUS_PATTERNS = [
r"sudo",
r"rm\s+-rf",
r"dd\s+if=",
r"/etc/passwd",
r"/etc/shadow",
r"chmod\s+777",
r"curl\s+.*\|.*sh",
r"wget\s+.*\|.*sh",
r"eval\s*\(",
r"exec\s*\(",
r"__import__",
r"os\.system",
r"subprocess",
]
def _deny(reason: str) -> dict[str, Any]: def _deny(reason: str) -> dict[str, Any]:
"""Return a hook denial response.""" """Return a hook denial response."""

View File

@@ -41,7 +41,6 @@ from .response_adapter import SDKResponseAdapter
from .security_hooks import create_security_hooks from .security_hooks import create_security_hooks
from .tool_adapter import ( from .tool_adapter import (
COPILOT_TOOL_NAMES, COPILOT_TOOL_NAMES,
SDK_DISALLOWED_TOOLS,
LongRunningCallback, LongRunningCallback,
create_copilot_mcp_server, create_copilot_mcp_server,
set_execution_context, set_execution_context,
@@ -544,7 +543,7 @@ async def stream_chat_completion_sdk(
"system_prompt": system_prompt, "system_prompt": system_prompt,
"mcp_servers": {"copilot": mcp_server}, "mcp_servers": {"copilot": mcp_server},
"allowed_tools": COPILOT_TOOL_NAMES, "allowed_tools": COPILOT_TOOL_NAMES,
"disallowed_tools": SDK_DISALLOWED_TOOLS, "disallowed_tools": ["Bash"],
"hooks": security_hooks, "hooks": security_hooks,
"cwd": sdk_cwd, "cwd": sdk_cwd,
"max_buffer_size": config.claude_agent_max_buffer_size, "max_buffer_size": config.claude_agent_max_buffer_size,

View File

@@ -310,48 +310,7 @@ def create_copilot_mcp_server():
# Bash is NOT included — use the sandboxed MCP bash_exec tool instead, # Bash is NOT included — use the sandboxed MCP bash_exec tool instead,
# which provides kernel-level network isolation via unshare --net. # which provides kernel-level network isolation via unshare --net.
# Task allows spawning sub-agents (rate-limited by security hooks). # Task allows spawning sub-agents (rate-limited by security hooks).
# WebSearch uses Brave Search via Anthropic's API — safe, no SSRF risk. _SDK_BUILTIN_TOOLS = ["Read", "Write", "Edit", "Glob", "Grep", "Task"]
_SDK_BUILTIN_TOOLS = ["Read", "Write", "Edit", "Glob", "Grep", "Task", "WebSearch"]
# SDK built-in tools that must be explicitly blocked.
# Bash: dangerous — agent uses mcp__copilot__bash_exec with kernel-level
# network isolation (unshare --net) instead.
# WebFetch: SSRF risk — can reach internal network (localhost, 10.x, etc.).
# Agent uses the SSRF-protected mcp__copilot__web_fetch tool instead.
SDK_DISALLOWED_TOOLS = ["Bash", "WebFetch"]
# Tools that are blocked entirely in security hooks (defence-in-depth).
# Includes SDK_DISALLOWED_TOOLS plus common aliases/synonyms.
BLOCKED_TOOLS = {
*SDK_DISALLOWED_TOOLS,
"bash",
"shell",
"exec",
"terminal",
"command",
}
# Tools allowed only when their path argument stays within the SDK workspace.
# The SDK uses these to handle oversized tool results (writes to tool-results/
# files, then reads them back) and for workspace file operations.
WORKSPACE_SCOPED_TOOLS = {"Read", "Write", "Edit", "Glob", "Grep"}
# Dangerous patterns in tool inputs
DANGEROUS_PATTERNS = [
r"sudo",
r"rm\s+-rf",
r"dd\s+if=",
r"/etc/passwd",
r"/etc/shadow",
r"chmod\s+777",
r"curl\s+.*\|.*sh",
r"wget\s+.*\|.*sh",
r"eval\s*\(",
r"exec\s*\(",
r"__import__",
r"os\.system",
r"subprocess",
]
# List of tool names for allowed_tools configuration # List of tool names for allowed_tools configuration
# Include MCP tools, the MCP Read tool for oversized results, # Include MCP tools, the MCP Read tool for oversized results,

View File

@@ -17,6 +17,7 @@ from backend.blocks.jina._auth import (
from backend.blocks.search import GetRequest from backend.blocks.search import GetRequest
from backend.data.model import SchemaField from backend.data.model import SchemaField
from backend.util.exceptions import BlockExecutionError from backend.util.exceptions import BlockExecutionError
from backend.util.request import HTTPClientError, HTTPServerError, validate_url
class SearchTheWebBlock(Block, GetRequest): class SearchTheWebBlock(Block, GetRequest):
@@ -110,7 +111,12 @@ class ExtractWebsiteContentBlock(Block, GetRequest):
self, input_data: Input, *, credentials: JinaCredentials, **kwargs self, input_data: Input, *, credentials: JinaCredentials, **kwargs
) -> BlockOutput: ) -> BlockOutput:
if input_data.raw_content: if input_data.raw_content:
url = input_data.url try:
parsed_url, _, _ = await validate_url(input_data.url, [])
url = parsed_url.geturl()
except ValueError as e:
yield "error", f"Invalid URL: {e}"
return
headers = {} headers = {}
else: else:
url = f"https://r.jina.ai/{input_data.url}" url = f"https://r.jina.ai/{input_data.url}"
@@ -119,5 +125,20 @@ class ExtractWebsiteContentBlock(Block, GetRequest):
"Authorization": f"Bearer {credentials.api_key.get_secret_value()}", "Authorization": f"Bearer {credentials.api_key.get_secret_value()}",
} }
content = await self.get_request(url, json=False, headers=headers) try:
content = await self.get_request(url, json=False, headers=headers)
except HTTPClientError as e:
yield "error", f"Client error ({e.status_code}) fetching {input_data.url}: {e}"
return
except HTTPServerError as e:
yield "error", f"Server error ({e.status_code}) fetching {input_data.url}: {e}"
return
except Exception as e:
yield "error", f"Failed to fetch {input_data.url}: {e}"
return
if not content:
yield "error", f"No content returned for {input_data.url}"
return
yield "content", content yield "content", content

View File

@@ -0,0 +1,66 @@
from typing import cast
import pytest
from backend.blocks.jina._auth import (
TEST_CREDENTIALS,
TEST_CREDENTIALS_INPUT,
JinaCredentialsInput,
)
from backend.blocks.jina.search import ExtractWebsiteContentBlock
from backend.util.request import HTTPClientError
@pytest.mark.asyncio
async def test_extract_website_content_returns_content(monkeypatch):
    """Happy path: with raw_content=True the fetched body is emitted as "content".

    The block's ``get_request`` is replaced with a stub so no HTTP request is
    made by the fetch itself; the stub also checks the URL and headers the
    block passes to it.
    """
    extractor = ExtractWebsiteContentBlock()

    async def stub_get_request(url, json=False, headers=None):
        # In raw-content mode the user's URL is fetched directly, without
        # any authorization headers.
        assert url == "https://example.com"
        assert headers == {}
        return "page content"

    monkeypatch.setattr(extractor, "get_request", stub_get_request)

    request = extractor.Input(
        url="https://example.com",
        credentials=cast(JinaCredentialsInput, TEST_CREDENTIALS_INPUT),
        raw_content=True,
    )

    # Drain the async generator into a list of (output_name, value) pairs.
    outputs = []
    async for pair in extractor.run(
        input_data=request, credentials=TEST_CREDENTIALS
    ):
        outputs.append(pair)

    emitted_names = [name for name, _ in outputs]
    assert ("content", "page content") in outputs
    assert "error" not in emitted_names
@pytest.mark.asyncio
async def test_extract_website_content_handles_http_error(monkeypatch):
    """An HTTPClientError from the fetch is reported on the "error" output.

    With raw_content=False the block goes through its ``get_request`` helper,
    which is stubbed here to raise a 400 client error. The block is expected
    to emit an error message that names the status code and the URL the user
    supplied, and to emit no "content" output at all.
    """
    block = ExtractWebsiteContentBlock()
    input_data = block.Input(
        url="https://example.com",
        credentials=cast(JinaCredentialsInput, TEST_CREDENTIALS_INPUT),
        raw_content=False,
    )

    async def fake_get_request(url, json=False, headers=None):
        raise HTTPClientError("HTTP 400 Error: Bad Request", 400)

    monkeypatch.setattr(block, "get_request", fake_get_request)

    results = [
        output
        async for output in block.run(
            input_data=input_data, credentials=TEST_CREDENTIALS
        )
    ]

    # The stub never returns a body, so checking for one specific content
    # value would pass vacuously; assert that no "content" output of any
    # value was produced after the failure.
    assert all(key != "content" for key, _ in results)

    error_messages = [value for key, value in results if key == "error"]
    assert error_messages
    assert "Client error (400)" in error_messages[0]
    # The message must reference the user's URL, not the reader-proxy URL.
    assert "https://example.com" in error_messages[0]