Mirror of https://github.com/Significant-Gravitas/AutoGPT.git (synced 2026-02-12 15:55:03 -05:00)
Compare commits

4 commits: fix/claude (aarushikan)
| Author | SHA1 | Date |
|---|---|---|
| | 4c29807ab8 | |
| | c4cbf9e58a | |
| | 6de45f4081 | |
| | ebffe0a022 | |
rnd/autogpt_server/autogpt_server/blocks/chunking.py (new file, +71 lines)
```python
import re
from typing import List, Optional

from pydantic import BaseModel, Field

from autogpt_server.data.block import Block, BlockOutput, BlockSchema


class ChunkingConfig(BaseModel):
    chunk_size: int = Field(default=1000, description="Maximum number of characters per chunk")
    overlap: int = Field(default=100, description="Number of characters to overlap between chunks")
    split_on: Optional[str] = Field(default=None, description="Regular expression to split on (e.g., '\n\n' for paragraphs)")


class ChunkingBlock(Block):
    class Input(BlockSchema):
        text: str = Field(description="Text to be chunked")
        config: ChunkingConfig = Field(description="Chunking configuration")

    class Output(BlockSchema):
        chunks: List[str] = Field(description="List of text chunks")

    def __init__(self):
        super().__init__(
            id="7d9e8f3a-2b5c-4e1d-9f3a-2b5c4e1d9f3a",
            input_schema=ChunkingBlock.Input,
            output_schema=ChunkingBlock.Output,
            test_input={
                "text": "This is a long piece of text that needs to be chunked. " * 20,
                "config": {
                    "chunk_size": 100,
                    "overlap": 20,
                    "split_on": None
                }
            },
            test_output=("chunks", [
                "This is a long piece of text that needs to be chunked. This is a long piece of text that needs to be chunked. ",
                "to be chunked. This is a long piece of text that needs to be chunked. This is a long piece of text that needs ",
                "text that needs to be chunked. This is a long piece of text that needs to be chunked. This is a long piece of ",
                "of text that needs to be chunked. This is a long piece of text that needs to be chunked. This is a long piece ",
                "piece of text that needs to be chunked. This is a long piece of text that needs to be chunked. This is a long "
            ]),
        )

    def chunk_text(self, text: str, config: ChunkingConfig) -> List[str]:
        if config.split_on:
            # Split on the specified pattern
            segments = re.split(config.split_on, text)
            chunks = []
            current_chunk = ""
            for segment in segments:
                if len(current_chunk) + len(segment) > config.chunk_size:
                    if current_chunk:
                        chunks.append(current_chunk.strip())
                    current_chunk = segment
                else:
                    current_chunk += (" " if current_chunk else "") + segment
            if current_chunk:
                chunks.append(current_chunk.strip())
        else:
            chunks = []
            start = 0
            while start < len(text):
                end = start + config.chunk_size
                chunk = text[start:end]
                chunks.append(chunk)
                start = end - config.overlap

        return chunks

    def run(self, input_data: Input) -> BlockOutput:
        chunks = self.chunk_text(input_data.text, input_data.config)
        yield "chunks", chunks
```
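For orientation, here is a standalone sketch of the arithmetic in the `split_on=None` branch of `chunk_text` above. It is not part of the diff, imports nothing from `autogpt_server`, and the function name `sliding_window` is made up for illustration.

```python
from typing import List


# Standalone illustration of the sliding-window path in chunk_text
# (split_on=None): windows of chunk_size characters that step forward
# by chunk_size - overlap, so consecutive windows share `overlap` characters.
def sliding_window(text: str, chunk_size: int = 100, overlap: int = 20) -> List[str]:
    chunks, start = [], 0
    while start < len(text):
        chunks.append(text[start:start + chunk_size])
        start += chunk_size - overlap
    return chunks


sample = "This is a long piece of text that needs to be chunked. " * 20
windows = sliding_window(sample)
print(len(windows), len(windows[0]))  # window count and width of the first window
# Each window after the first starts 80 characters later and repeats
# the previous window's last 20 characters.
```

With the defaults from the block's test input (chunk_size=100, overlap=20), each new window advances by 80 characters, which is the behavior the overlapping test chunks above are meant to exercise.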
rnd/autogpt_server/autogpt_server/blocks/embeddings.py (new file, +65 lines)
```python
import logging
from enum import Enum
from typing import List

import openai
from pydantic import BaseModel, Field

from autogpt_server.data.block import Block, BlockOutput, BlockSchema
from autogpt_server.util import json

logger = logging.getLogger(__name__)


class EmbeddingModel(str, Enum):
    ada_002 = "text-embedding-ada-002"


class EmbeddingConfig(BaseModel):
    model: EmbeddingModel
    api_key: str


class EmbeddingBlock(Block):
    class Input(BlockSchema):
        config: EmbeddingConfig
        texts: List[str] = Field(description="List of texts to create embeddings for")

    class Output(BlockSchema):
        embeddings: List[List[float]]
        error: str

    def __init__(self):
        super().__init__(
            id="8f7e9a1c-3b7a-4b0f-9f1a-1c3b7a4b0f9f",
            input_schema=EmbeddingBlock.Input,
            output_schema=EmbeddingBlock.Output,
            test_input={
                "config": {
                    "model": "text-embedding-ada-002",
                    "api_key": "fake-api-key",
                },
                "texts": ["Hello, world!", "AutoGPT is amazing"],
            },
            test_output=("embeddings", [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]]),
            test_mock={"create_embeddings": lambda *args, **kwargs: [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]]},
        )

    @staticmethod
    def create_embeddings(api_key: str, model: EmbeddingModel, texts: List[str]) -> List[List[float]]:
        openai.api_key = api_key
        response = openai.embeddings.create(
            model=model,
            input=texts,
        )
        return [embedding.embedding for embedding in response.data]

    def run(self, input_data: Input) -> BlockOutput:
        try:
            embeddings = self.create_embeddings(
                input_data.config.api_key,
                input_data.config.model,
                input_data.texts
            )
            yield "embeddings", embeddings
        except Exception as e:
            error_message = f"Error creating embeddings: {str(e)}"
            logger.error(error_message)
            yield "error", error_message
```
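As a non-authoritative usage sketch, the driver below shows how the block's `run` generator could be invoked directly. It assumes the `autogpt_server` package from this branch is importable, that `BlockSchema` accepts a nested dict for `config` the way a standard pydantic model does, and that a real OpenAI key replaces the `"sk-..."` placeholder; none of this driver code is part of the diff.

```python
# Hypothetical driver for the EmbeddingBlock added in this diff (not part of the change).
# Requires the autogpt_server package from this branch, the `openai` client,
# and a real API key in place of the "sk-..." placeholder.
from autogpt_server.blocks.embeddings import EmbeddingBlock

block = EmbeddingBlock()
input_data = EmbeddingBlock.Input(
    config={"model": "text-embedding-ada-002", "api_key": "sk-..."},  # placeholder key
    texts=["Hello, world!", "AutoGPT is amazing"],
)

# run() yields ("embeddings", [...]) on success, or ("error", message)
# if the OpenAI call raises.
for name, value in block.run(input_data):
    if name == "embeddings":
        print(f"got {len(value)} embeddings, first has {len(value[0])} dimensions")
    else:
        print(f"block reported an error: {value}")
```

Note that the block's built-in test substitutes `create_embeddings` via `test_mock`, so the registered block tests run without any network call.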