From b8f5c208d08e313306ad3ee87020d8746d9afbb4 Mon Sep 17 00:00:00 2001
From: DEEVEN SERU <144827577+DEVELOPER-DEEVEN@users.noreply.github.com>
Date: Sat, 14 Feb 2026 00:45:09 +0530
Subject: [PATCH] Handle errors in Jina ExtractWebsiteContentBlock (#12048)

## Summary
- catch Jina reader client/server errors in ExtractWebsiteContentBlock and surface a clear error output keyed to the user URL
- guard empty responses to return an explicit error instead of yielding blank content
- add regression tests covering the happy path and HTTP client failures via a monkeypatched fetch

## Testing
- not run (pytest unavailable in this environment)

---------

Co-authored-by: Nicholas Tindle
Co-authored-by: Nicholas Tindle
---
 .../backend/backend/blocks/jina/search.py    | 25 ++++++-
 .../test/blocks/test_jina_extract_website.py | 66 +++++++++++++++++++
 2 files changed, 89 insertions(+), 2 deletions(-)
 create mode 100644 autogpt_platform/backend/test/blocks/test_jina_extract_website.py

diff --git a/autogpt_platform/backend/backend/blocks/jina/search.py b/autogpt_platform/backend/backend/blocks/jina/search.py
index 22a883fa03..5e58ddcab4 100644
--- a/autogpt_platform/backend/backend/blocks/jina/search.py
+++ b/autogpt_platform/backend/backend/blocks/jina/search.py
@@ -17,6 +17,7 @@ from backend.blocks.jina._auth import (
 from backend.blocks.search import GetRequest
 from backend.data.model import SchemaField
 from backend.util.exceptions import BlockExecutionError
+from backend.util.request import HTTPClientError, HTTPServerError, validate_url
 
 
 class SearchTheWebBlock(Block, GetRequest):
@@ -110,7 +111,12 @@ class ExtractWebsiteContentBlock(Block, GetRequest):
         self, input_data: Input, *, credentials: JinaCredentials, **kwargs
     ) -> BlockOutput:
         if input_data.raw_content:
-            url = input_data.url
+            try:
+                parsed_url, _, _ = await validate_url(input_data.url, [])
+                url = parsed_url.geturl()
+            except ValueError as e:
+                yield "error", f"Invalid URL: {e}"
+                return
             headers = {}
         else:
             url = f"https://r.jina.ai/{input_data.url}"
@@ -119,5 +125,20 @@ class ExtractWebsiteContentBlock(Block, GetRequest):
                 "Authorization": f"Bearer {credentials.api_key.get_secret_value()}",
             }
 
-        content = await self.get_request(url, json=False, headers=headers)
+        try:
+            content = await self.get_request(url, json=False, headers=headers)
+        except HTTPClientError as e:
+            yield "error", f"Client error ({e.status_code}) fetching {input_data.url}: {e}"
+            return
+        except HTTPServerError as e:
+            yield "error", f"Server error ({e.status_code}) fetching {input_data.url}: {e}"
+            return
+        except Exception as e:
+            yield "error", f"Failed to fetch {input_data.url}: {e}"
+            return
+
+        if not content:
+            yield "error", f"No content returned for {input_data.url}"
+            return
+
         yield "content", content
diff --git a/autogpt_platform/backend/test/blocks/test_jina_extract_website.py b/autogpt_platform/backend/test/blocks/test_jina_extract_website.py
new file mode 100644
index 0000000000..335c43f966
--- /dev/null
+++ b/autogpt_platform/backend/test/blocks/test_jina_extract_website.py
@@ -0,0 +1,66 @@
+from typing import cast
+
+import pytest
+
+from backend.blocks.jina._auth import (
+    TEST_CREDENTIALS,
+    TEST_CREDENTIALS_INPUT,
+    JinaCredentialsInput,
+)
+from backend.blocks.jina.search import ExtractWebsiteContentBlock
+from backend.util.request import HTTPClientError
+
+
+@pytest.mark.asyncio
+async def test_extract_website_content_returns_content(monkeypatch):
+    block = ExtractWebsiteContentBlock()
+    input_data = block.Input(
+        url="https://example.com",
+        credentials=cast(JinaCredentialsInput, TEST_CREDENTIALS_INPUT),
+        raw_content=True,
+    )
+
+    async def fake_get_request(url, json=False, headers=None):
+        assert url == "https://example.com"
+        assert headers == {}
+        return "page content"
+
+    monkeypatch.setattr(block, "get_request", fake_get_request)
+
+    results = [
+        output
+        async for output in block.run(
+            input_data=input_data, credentials=TEST_CREDENTIALS
+        )
+    ]
+
+    assert ("content", "page content") in results
+    assert all(key != "error" for key, _ in results)
+
+
+@pytest.mark.asyncio
+async def test_extract_website_content_handles_http_error(monkeypatch):
+    block = ExtractWebsiteContentBlock()
+    input_data = block.Input(
+        url="https://example.com",
+        credentials=cast(JinaCredentialsInput, TEST_CREDENTIALS_INPUT),
+        raw_content=False,
+    )
+
+    async def fake_get_request(url, json=False, headers=None):
+        raise HTTPClientError("HTTP 400 Error: Bad Request", 400)
+
+    monkeypatch.setattr(block, "get_request", fake_get_request)
+
+    results = [
+        output
+        async for output in block.run(
+            input_data=input_data, credentials=TEST_CREDENTIALS
+        )
+    ]
+
+    assert ("content", "page content") not in results
+    error_messages = [value for key, value in results if key == "error"]
+    assert error_messages
+    assert "Client error (400)" in error_messages[0]
+    assert "https://example.com" in error_messages[0]