feat(blocks): Add web scraper block to autogpt_server

- Import the web_scraper module in blocks/__init__.py and add it to __all__
- Create a new file web_scraper.py with the WebScraper block
- Implement the run method to scrape content from a given URL via the Jina AI Reader (r.jina.ai)
- Handle HTTP errors and other request exceptions in the run method by re-raising them as ValueError
2026-04-08 03:00:28 -04:00 · 2024-07-14 01:49:59 +01:00
parent d3dae2264d
commit e0387bd857
2 changed files with 37 additions and 2 deletions
--- a/rnd/autogpt_server/autogpt_server/blocks/__init__.py
+++ b/rnd/autogpt_server/autogpt_server/blocks/__init__.py
@@ -1,4 +1,4 @@
-from autogpt_server.blocks import agent, sample, reddit, text, ai, wikipedia, discord
+from autogpt_server.blocks import agent, sample, reddit, text, ai, wikipedia, discord, web_scraper
 from autogpt_server.data.block import Block

 AVAILABLE_BLOCKS = {
@@ -6,4 +6,4 @@ AVAILABLE_BLOCKS = {
    for block in [v() for v in Block.__subclasses__()]
 }

-__all__ = ["agent", "ai", "sample", "reddit", "text", "AVAILABLE_BLOCKS", "wikipedia", "discord"]
+__all__ = ["agent", "ai", "sample", "reddit", "text", "AVAILABLE_BLOCKS", "wikipedia", "discord", "web_scraper"]
--- a/rnd/autogpt_server/autogpt_server/blocks/web_scraper.py
+++ b/rnd/autogpt_server/autogpt_server/blocks/web_scraper.py
@@ -0,0 +1,35 @@
+import requests
+from autogpt_server.data.block import Block, BlockSchema, BlockOutput
+
+class WebScraper(Block):
+    """Block that fetches a page's text through the Jina AI Reader proxy."""
+
+    class Input(BlockSchema):
+        url: str  # The URL to scrape
+
+    class Output(BlockSchema):
+        content: str  # The scraped content from the URL
+
+    def __init__(self):
+        super().__init__(
+            # NOTE(review): not a valid UUID (g-p are not hex digits). Left unchanged
+            # since a block ID may be referenced elsewhere -- confirm, then replace.
+            id="a1b2c3d4-5e6f-7g8h-9i0j-k1l2m3n4o5p6",
+            input_schema=WebScraper.Input,
+            output_schema=WebScraper.Output,
+            test_input={"url": "https://en.wikipedia.org/wiki/Artificial_intelligence"},
+            test_output={"content": "Artificial intelligence (AI) is intelligence..."},
+        )
+
+    def run(self, input_data: Input) -> BlockOutput:
+        """Yield ("content", text) for input_data.url; raise ValueError on failure."""
+        try:
+            # A timeout keeps an unresponsive upstream from hanging the block forever.
+            response = requests.get(f"https://r.jina.ai/{input_data.url}", timeout=30)
+            response.raise_for_status()
+        except requests.exceptions.HTTPError as http_err:
+            raise ValueError(f"HTTP error occurred: {http_err}") from http_err
+        except requests.RequestException as e:
+            raise ValueError(f"Request to Jina-ai Reader failed: {e}") from e
+        # Yield after the try: only the request calls above can raise request errors.
+        yield "content", response.text