Mirror of https://github.com/Significant-Gravitas/AutoGPT.git (synced 2026-02-12 15:55:03 -05:00)
Compare commits

4 commits: fix/claude (aarushikan)
| Author | SHA1 | Date |
|---|---|---|
| | 4c29807ab8 | |
| | c4cbf9e58a | |
| | 6de45f4081 | |
| | ebffe0a022 | |
rnd/autogpt_server/autogpt_server/blocks/chunking.py (new file, +71 lines)
```python
import re
from typing import List, Optional

from pydantic import BaseModel, Field

from autogpt_server.data.block import Block, BlockOutput, BlockSchema


class ChunkingConfig(BaseModel):
    chunk_size: int = Field(default=1000, description="Maximum number of characters per chunk")
    overlap: int = Field(default=100, description="Number of characters to overlap between chunks")
    split_on: Optional[str] = Field(default=None, description="Regular expression to split on (e.g., '\n\n' for paragraphs)")


class ChunkingBlock(Block):
    class Input(BlockSchema):
        text: str = Field(description="Text to be chunked")
        config: ChunkingConfig = Field(description="Chunking configuration")

    class Output(BlockSchema):
        chunks: List[str] = Field(description="List of text chunks")

    def __init__(self):
        super().__init__(
            id="7d9e8f3a-2b5c-4e1d-9f3a-2b5c4e1d9f3a",
            input_schema=ChunkingBlock.Input,
            output_schema=ChunkingBlock.Output,
            test_input={
                "text": "This is a long piece of text that needs to be chunked. " * 20,
                "config": {
                    "chunk_size": 100,
                    "overlap": 20,
                    "split_on": None
                }
            },
            test_output=("chunks", [
                "This is a long piece of text that needs to be chunked. This is a long piece of text that needs to be chunked. ",
                "to be chunked. This is a long piece of text that needs to be chunked. This is a long piece of text that needs ",
                "text that needs to be chunked. This is a long piece of text that needs to be chunked. This is a long piece of ",
                "of text that needs to be chunked. This is a long piece of text that needs to be chunked. This is a long piece ",
                "piece of text that needs to be chunked. This is a long piece of text that needs to be chunked. This is a long "
            ]),
        )

    def chunk_text(self, text: str, config: ChunkingConfig) -> List[str]:
        if config.split_on:
            # Split on the specified pattern
            segments = re.split(config.split_on, text)
            chunks = []
            current_chunk = ""
            for segment in segments:
                if len(current_chunk) + len(segment) > config.chunk_size:
                    if current_chunk:
                        chunks.append(current_chunk.strip())
                    current_chunk = segment
                else:
                    current_chunk += (" " if current_chunk else "") + segment
            if current_chunk:
                chunks.append(current_chunk.strip())
        else:
            chunks = []
            start = 0
            while start < len(text):
                end = start + config.chunk_size
                chunk = text[start:end]
                chunks.append(chunk)
                start = end - config.overlap

        return chunks

    def run(self, input_data: Input) -> BlockOutput:
        chunks = self.chunk_text(input_data.text, input_data.config)
        yield "chunks", chunks
```
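For orientation, here is a standalone sketch of the arithmetic in the `split_on=None` branch of `chunk_text` above. It is not part of the diff, imports nothing from `autogpt_server`, and the function name `sliding_window` is made up for illustration.

```python
from typing import List


# Standalone illustration of the sliding-window path in chunk_text
# (split_on=None): windows of chunk_size characters that step forward
# by chunk_size - overlap, so consecutive windows share `overlap` characters.
def sliding_window(text: str, chunk_size: int = 100, overlap: int = 20) -> List[str]:
    chunks, start = [], 0
    while start < len(text):
        chunks.append(text[start:start + chunk_size])
        start += chunk_size - overlap
    return chunks


sample = "This is a long piece of text that needs to be chunked. " * 20
windows = sliding_window(sample)
print(len(windows), len(windows[0]))  # window count and width of the first window
# Each window after the first starts 80 characters later and repeats
# the previous window's last 20 characters.
```

With the defaults from the block's test input (chunk_size=100, overlap=20), each new window advances by 80 characters, which is the behavior the overlapping test chunks above are meant to exercise.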
rnd/autogpt_server/autogpt_server/blocks/embeddings.py (new file, +65 lines)
```python
import logging
from enum import Enum
from typing import List

import openai
from pydantic import BaseModel, Field

from autogpt_server.data.block import Block, BlockOutput, BlockSchema
from autogpt_server.util import json

logger = logging.getLogger(__name__)


class EmbeddingModel(str, Enum):
    ada_002 = "text-embedding-ada-002"


class EmbeddingConfig(BaseModel):
    model: EmbeddingModel
    api_key: str


class EmbeddingBlock(Block):
    class Input(BlockSchema):
        config: EmbeddingConfig
        texts: List[str] = Field(description="List of texts to create embeddings for")

    class Output(BlockSchema):
        embeddings: List[List[float]]
        error: str

    def __init__(self):
        super().__init__(
            id="8f7e9a1c-3b7a-4b0f-9f1a-1c3b7a4b0f9f",
            input_schema=EmbeddingBlock.Input,
            output_schema=EmbeddingBlock.Output,
            test_input={
                "config": {
                    "model": "text-embedding-ada-002",
                    "api_key": "fake-api-key",
                },
                "texts": ["Hello, world!", "AutoGPT is amazing"],
            },
            test_output=("embeddings", [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]]),
            test_mock={"create_embeddings": lambda *args, **kwargs: [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]]},
        )

    @staticmethod
    def create_embeddings(api_key: str, model: EmbeddingModel, texts: List[str]) -> List[List[float]]:
        openai.api_key = api_key
        response = openai.embeddings.create(
            model=model,
            input=texts,
        )
        return [embedding.embedding for embedding in response.data]

    def run(self, input_data: Input) -> BlockOutput:
        try:
            embeddings = self.create_embeddings(
                input_data.config.api_key,
                input_data.config.model,
                input_data.texts
            )
            yield "embeddings", embeddings
        except Exception as e:
            error_message = f"Error creating embeddings: {str(e)}"
            logger.error(error_message)
            yield "error", error_message
```
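As a non-authoritative usage sketch, the driver below shows how the block's `run` generator could be invoked directly. It assumes the `autogpt_server` package from this branch is importable, that `BlockSchema` accepts a nested dict for `config` the way a standard pydantic model does, and that a real OpenAI key replaces the `"sk-..."` placeholder; none of this driver code is part of the diff.

```python
# Hypothetical driver for the EmbeddingBlock added in this diff (not part of the change).
# Requires the autogpt_server package from this branch, the `openai` client,
# and a real API key in place of the "sk-..." placeholder.
from autogpt_server.blocks.embeddings import EmbeddingBlock

block = EmbeddingBlock()
input_data = EmbeddingBlock.Input(
    config={"model": "text-embedding-ada-002", "api_key": "sk-..."},  # placeholder key
    texts=["Hello, world!", "AutoGPT is amazing"],
)

# run() yields ("embeddings", [...]) on success, or ("error", message)
# if the OpenAI call raises.
for name, value in block.run(input_data):
    if name == "embeddings":
        print(f"got {len(value)} embeddings, first has {len(value[0])} dimensions")
    else:
        print(f"block reported an error: {value}")
```

Note that the block's built-in test substitutes `create_embeddings` via `test_mock`, so the registered block tests run without any network call.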