Handle errors in Jina ExtractWebsiteContentBlock (#12048)

## Summary
- catch Jina reader client/server errors in ExtractWebsiteContentBlock
and surface a clear error output that names the user-supplied URL
- guard against empty responses by returning an explicit error instead of
yielding blank content
- add regression tests covering the happy path and HTTP client failures
via a monkeypatched fetch

## Testing
- not run (pytest unavailable in this environment)

---------

Co-authored-by: Nicholas Tindle <nicktindle@outlook.com>
Co-authored-by: Nicholas Tindle <nicholas.tindle@agpt.co>
This commit is contained in:
DEEVEN SERU
2026-02-14 00:45:09 +05:30
committed by GitHub
parent ca216dfd7f
commit b8f5c208d0
2 changed files with 89 additions and 2 deletions

View File

@@ -17,6 +17,7 @@ from backend.blocks.jina._auth import (
from backend.blocks.search import GetRequest
from backend.data.model import SchemaField
from backend.util.exceptions import BlockExecutionError
from backend.util.request import HTTPClientError, HTTPServerError, validate_url
class SearchTheWebBlock(Block, GetRequest):
@@ -110,7 +111,12 @@ class ExtractWebsiteContentBlock(Block, GetRequest):
self, input_data: Input, *, credentials: JinaCredentials, **kwargs
) -> BlockOutput:
if input_data.raw_content:
url = input_data.url
try:
parsed_url, _, _ = await validate_url(input_data.url, [])
url = parsed_url.geturl()
except ValueError as e:
yield "error", f"Invalid URL: {e}"
return
headers = {}
else:
url = f"https://r.jina.ai/{input_data.url}"
@@ -119,5 +125,20 @@ class ExtractWebsiteContentBlock(Block, GetRequest):
"Authorization": f"Bearer {credentials.api_key.get_secret_value()}",
}
content = await self.get_request(url, json=False, headers=headers)
try:
content = await self.get_request(url, json=False, headers=headers)
except HTTPClientError as e:
yield "error", f"Client error ({e.status_code}) fetching {input_data.url}: {e}"
return
except HTTPServerError as e:
yield "error", f"Server error ({e.status_code}) fetching {input_data.url}: {e}"
return
except Exception as e:
yield "error", f"Failed to fetch {input_data.url}: {e}"
return
if not content:
yield "error", f"No content returned for {input_data.url}"
return
yield "content", content

View File

@@ -0,0 +1,66 @@
from typing import cast
import pytest
from backend.blocks.jina._auth import (
TEST_CREDENTIALS,
TEST_CREDENTIALS_INPUT,
JinaCredentialsInput,
)
from backend.blocks.jina.search import ExtractWebsiteContentBlock
from backend.util.request import HTTPClientError
@pytest.mark.asyncio
async def test_extract_website_content_returns_content(monkeypatch):
    """Happy path: with raw_content=True the fetched body is yielded as "content"."""
    extractor = ExtractWebsiteContentBlock()
    request = extractor.Input(
        url="https://example.com",
        credentials=cast(JinaCredentialsInput, TEST_CREDENTIALS_INPUT),
        raw_content=True,
    )

    async def fake_get_request(url, json=False, headers=None):
        # In raw mode the block is expected to request the input URL itself
        # and to send an empty header dict.
        assert url == "https://example.com"
        assert headers == {}
        return "page content"

    # NOTE(review): only get_request is patched; the raw_content path also
    # calls validate_url — confirm that helper needs no network access here.
    monkeypatch.setattr(extractor, "get_request", fake_get_request)

    emitted = []
    async for pair in extractor.run(
        input_data=request, credentials=TEST_CREDENTIALS
    ):
        emitted.append(pair)

    assert ("content", "page content") in emitted
    assert not any(name == "error" for name, _ in emitted)
@pytest.mark.asyncio
async def test_extract_website_content_handles_http_error(monkeypatch):
    """An HTTPClientError raised by the fetch must surface as an "error" output.

    The fetch is replaced with a fake that always raises a 400 client error;
    the block is expected to report it (status code and the user's URL in the
    message) and to emit no "content" output.
    """
    block = ExtractWebsiteContentBlock()
    input_data = block.Input(
        url="https://example.com",
        credentials=cast(JinaCredentialsInput, TEST_CREDENTIALS_INPUT),
        raw_content=False,
    )

    async def fake_get_request(url, json=False, headers=None):
        raise HTTPClientError("HTTP 400 Error: Bad Request", 400)

    monkeypatch.setattr(block, "get_request", fake_get_request)

    results = [
        output
        async for output in block.run(
            input_data=input_data, credentials=TEST_CREDENTIALS
        )
    ]

    # The fake never returns a body, so checking for one specific content
    # tuple would pass vacuously; require that no content is emitted at all.
    assert all(key != "content" for key, _ in results)

    error_messages = [value for key, value in results if key == "error"]
    assert error_messages
    assert "Client error (400)" in error_messages[0]
    assert "https://example.com" in error_messages[0]