Compare commits

...

4 Commits

Author                 SHA1        Message                                                             Date
Nicholas Tindle        4c29807ab8  Merge branch 'master' into aarushikansal-add-vector-store-support  2024-08-04 22:18:26 -07:00
Nicholas Tindle        c4cbf9e58a  Merge branch 'master' into aarushikansal-add-vector-store-support  2024-08-02 07:12:16 -05:00
Toran Bruce Richards   6de45f4081  Merge branch 'master' into aarushikansal-add-vector-store-support  2024-08-02 09:38:37 +01:00
Aarushi                ebffe0a022  add chunking and embeddings blocks                                  2024-07-14 14:01:29 +01:00
2 changed files with 136 additions and 0 deletions


@@ -0,0 +1,71 @@
import re
from typing import List, Optional

from pydantic import BaseModel, Field

from autogpt_server.data.block import Block, BlockOutput, BlockSchema


class ChunkingConfig(BaseModel):
    chunk_size: int = Field(default=1000, description="Maximum number of characters per chunk")
    overlap: int = Field(default=100, description="Number of characters to overlap between chunks")
    split_on: Optional[str] = Field(default=None, description="Regular expression to split on (e.g., '\n\n' for paragraphs)")


class ChunkingBlock(Block):
    class Input(BlockSchema):
        text: str = Field(description="Text to be chunked")
        config: ChunkingConfig = Field(description="Chunking configuration")

    class Output(BlockSchema):
        chunks: List[str] = Field(description="List of text chunks")

    def __init__(self):
        super().__init__(
            id="7d9e8f3a-2b5c-4e1d-9f3a-2b5c4e1d9f3a",
            input_schema=ChunkingBlock.Input,
            output_schema=ChunkingBlock.Output,
            test_input={
                "text": "This is a long piece of text that needs to be chunked. " * 20,
                "config": {
                    "chunk_size": 100,
                    "overlap": 20,
                    "split_on": None
                }
            },
            test_output=("chunks", [
                "This is a long piece of text that needs to be chunked. This is a long piece of text that needs to be chunked. ",
                "to be chunked. This is a long piece of text that needs to be chunked. This is a long piece of text that needs ",
                "text that needs to be chunked. This is a long piece of text that needs to be chunked. This is a long piece of ",
                "of text that needs to be chunked. This is a long piece of text that needs to be chunked. This is a long piece ",
                "piece of text that needs to be chunked. This is a long piece of text that needs to be chunked. This is a long "
            ]),
        )

    def chunk_text(self, text: str, config: ChunkingConfig) -> List[str]:
        # With split_on set, merge regex-split segments up to chunk_size;
        # otherwise fall back to fixed-size slices that overlap by `overlap` characters.
        if config.split_on:
            # Split on the specified pattern
            segments = re.split(config.split_on, text)
            chunks = []
            current_chunk = ""
            for segment in segments:
                if len(current_chunk) + len(segment) > config.chunk_size:
                    if current_chunk:
                        chunks.append(current_chunk.strip())
                    current_chunk = segment
                else:
                    current_chunk += (" " if current_chunk else "") + segment
            if current_chunk:
                chunks.append(current_chunk.strip())
        else:
            chunks = []
            start = 0
            while start < len(text):
                end = start + config.chunk_size
                chunk = text[start:end]
                chunks.append(chunk)
                start = end - config.overlap  # step back so consecutive chunks share `overlap` characters
        return chunks

    def run(self, input_data: Input) -> BlockOutput:
        chunks = self.chunk_text(input_data.text, input_data.config)
        yield "chunks", chunks


@@ -0,0 +1,65 @@
import logging
from enum import Enum
from typing import List

import openai
from pydantic import BaseModel, Field

from autogpt_server.data.block import Block, BlockOutput, BlockSchema
from autogpt_server.util import json

logger = logging.getLogger(__name__)


class EmbeddingModel(str, Enum):
    ada_002 = "text-embedding-ada-002"


class EmbeddingConfig(BaseModel):
    model: EmbeddingModel
    api_key: str


class EmbeddingBlock(Block):
    class Input(BlockSchema):
        config: EmbeddingConfig
        texts: List[str] = Field(description="List of texts to create embeddings for")

    class Output(BlockSchema):
        embeddings: List[List[float]]
        error: str

    def __init__(self):
        super().__init__(
            id="8f7e9a1c-3b7a-4b0f-9f1a-1c3b7a4b0f9f",
            input_schema=EmbeddingBlock.Input,
            output_schema=EmbeddingBlock.Output,
            test_input={
                "config": {
                    "model": "text-embedding-ada-002",
                    "api_key": "fake-api-key",
                },
                "texts": ["Hello, world!", "AutoGPT is amazing"],
            },
            test_output=("embeddings", [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]]),
            # Stands in for the real OpenAI call when the block's tests run.
            test_mock={"create_embeddings": lambda *args, **kwargs: [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]]},
        )

    @staticmethod
    def create_embeddings(api_key: str, model: EmbeddingModel, texts: List[str]) -> List[List[float]]:
        # Uses the module-level openai client; one embedding vector is returned per input text.
        openai.api_key = api_key
        response = openai.embeddings.create(
            model=model,
            input=texts,
        )
        return [embedding.embedding for embedding in response.data]

    def run(self, input_data: Input) -> BlockOutput:
        try:
            embeddings = self.create_embeddings(
                input_data.config.api_key,
                input_data.config.model,
                input_data.texts
            )
            yield "embeddings", embeddings
        except Exception as e:
            error_message = f"Error creating embeddings: {str(e)}"
            logger.error(error_message)
            yield "error", error_message