Compare commits

...

4 Commits

Author                 SHA1        Message                                                             Date
Nicholas Tindle        4c29807ab8  Merge branch 'master' into aarushikansal-add-vector-store-support  2024-08-04 22:18:26 -07:00
Nicholas Tindle        c4cbf9e58a  Merge branch 'master' into aarushikansal-add-vector-store-support  2024-08-02 07:12:16 -05:00
Toran Bruce Richards   6de45f4081  Merge branch 'master' into aarushikansal-add-vector-store-support  2024-08-02 09:38:37 +01:00
Aarushi                ebffe0a022  add chunking and embeddings blocks                                  2024-07-14 14:01:29 +01:00
2 changed files with 136 additions and 0 deletions


@@ -0,0 +1,71 @@
import re
from typing import List, Optional

from pydantic import BaseModel, Field

from autogpt_server.data.block import Block, BlockOutput, BlockSchema


class ChunkingConfig(BaseModel):
    chunk_size: int = Field(default=1000, description="Maximum number of characters per chunk")
    overlap: int = Field(default=100, description="Number of characters to overlap between chunks")
    split_on: Optional[str] = Field(default=None, description="Regular expression to split on (e.g., '\n\n' for paragraphs)")


class ChunkingBlock(Block):
    class Input(BlockSchema):
        text: str = Field(description="Text to be chunked")
        config: ChunkingConfig = Field(description="Chunking configuration")

    class Output(BlockSchema):
        chunks: List[str] = Field(description="List of text chunks")

    def __init__(self):
        super().__init__(
            id="7d9e8f3a-2b5c-4e1d-9f3a-2b5c4e1d9f3a",
            input_schema=ChunkingBlock.Input,
            output_schema=ChunkingBlock.Output,
            test_input={
                "text": "This is a long piece of text that needs to be chunked. " * 20,
                "config": {
                    "chunk_size": 100,
                    "overlap": 20,
                    "split_on": None
                }
            },
            test_output=("chunks", [
                "This is a long piece of text that needs to be chunked. This is a long piece of text that needs to be chunked. ",
                "to be chunked. This is a long piece of text that needs to be chunked. This is a long piece of text that needs ",
                "text that needs to be chunked. This is a long piece of text that needs to be chunked. This is a long piece of ",
                "of text that needs to be chunked. This is a long piece of text that needs to be chunked. This is a long piece ",
                "piece of text that needs to be chunked. This is a long piece of text that needs to be chunked. This is a long "
            ]),
        )

    def chunk_text(self, text: str, config: ChunkingConfig) -> List[str]:
        # With split_on set, merge regex-split segments up to chunk_size;
        # otherwise fall back to fixed-size slices that overlap by `overlap` characters.
        if config.split_on:
            # Split on the specified pattern
            segments = re.split(config.split_on, text)
            chunks = []
            current_chunk = ""
            for segment in segments:
                if len(current_chunk) + len(segment) > config.chunk_size:
                    if current_chunk:
                        chunks.append(current_chunk.strip())
                    current_chunk = segment
                else:
                    current_chunk += (" " if current_chunk else "") + segment
            if current_chunk:
                chunks.append(current_chunk.strip())
        else:
            chunks = []
            start = 0
            while start < len(text):
                end = start + config.chunk_size
                chunk = text[start:end]
                chunks.append(chunk)
                start = end - config.overlap  # step back so consecutive chunks share `overlap` characters
        return chunks

    def run(self, input_data: Input) -> BlockOutput:
        chunks = self.chunk_text(input_data.text, input_data.config)
        yield "chunks", chunks


@@ -0,0 +1,65 @@
import logging
from enum import Enum
from typing import List

import openai
from pydantic import BaseModel, Field

from autogpt_server.data.block import Block, BlockOutput, BlockSchema
from autogpt_server.util import json

logger = logging.getLogger(__name__)


class EmbeddingModel(str, Enum):
    ada_002 = "text-embedding-ada-002"


class EmbeddingConfig(BaseModel):
    model: EmbeddingModel
    api_key: str


class EmbeddingBlock(Block):
    class Input(BlockSchema):
        config: EmbeddingConfig
        texts: List[str] = Field(description="List of texts to create embeddings for")

    class Output(BlockSchema):
        embeddings: List[List[float]]
        error: str

    def __init__(self):
        super().__init__(
            id="8f7e9a1c-3b7a-4b0f-9f1a-1c3b7a4b0f9f",
            input_schema=EmbeddingBlock.Input,
            output_schema=EmbeddingBlock.Output,
            test_input={
                "config": {
                    "model": "text-embedding-ada-002",
                    "api_key": "fake-api-key",
                },
                "texts": ["Hello, world!", "AutoGPT is amazing"],
            },
            test_output=("embeddings", [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]]),
            # Stands in for the real OpenAI call when the block's tests run.
            test_mock={"create_embeddings": lambda *args, **kwargs: [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]]},
        )

    @staticmethod
    def create_embeddings(api_key: str, model: EmbeddingModel, texts: List[str]) -> List[List[float]]:
        # Uses the module-level openai client; one embedding vector is returned per input text.
        openai.api_key = api_key
        response = openai.embeddings.create(
            model=model,
            input=texts,
        )
        return [embedding.embedding for embedding in response.data]

    def run(self, input_data: Input) -> BlockOutput:
        try:
            embeddings = self.create_embeddings(
                input_data.config.api_key,
                input_data.config.model,
                input_data.texts
            )
            yield "embeddings", embeddings
        except Exception as e:
            error_message = f"Error creating embeddings: {str(e)}"
            logger.error(error_message)
            yield "error", error_message