mirror of
https://github.com/Significant-Gravitas/AutoGPT.git
synced 2026-01-13 17:18:08 -05:00
Compare commits
4 Commits
fix/creden
...
aarushikan
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
4c29807ab8 | ||
|
|
c4cbf9e58a | ||
|
|
6de45f4081 | ||
|
|
ebffe0a022 |
71
rnd/autogpt_server/autogpt_server/blocks/chunking.py
Normal file
71
rnd/autogpt_server/autogpt_server/blocks/chunking.py
Normal file
@@ -0,0 +1,71 @@
|
||||
import re
|
||||
from typing import List, Optional
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from autogpt_server.data.block import Block, BlockOutput, BlockSchema
|
||||
|
||||
class ChunkingConfig(BaseModel):
    """Configuration for splitting text into (optionally overlapping) chunks."""

    # Upper bound on chunk length in characters. In pattern-split mode a
    # single segment longer than this still becomes one (over-sized) chunk.
    chunk_size: int = Field(default=1000, description="Maximum number of characters per chunk")
    # Number of trailing characters of one chunk repeated at the start of the
    # next chunk (used only in fixed-window mode, i.e. when split_on is None).
    overlap: int = Field(default=100, description="Number of characters to overlap between chunks")
    # The backslashes are escaped so the description shows the two-character
    # sequence "\n\n"; the original string embedded real line breaks instead.
    split_on: Optional[str] = Field(default=None, description="Regular expression to split on (e.g., '\\n\\n' for paragraphs)")
|
||||
|
||||
class ChunkingBlock(Block):
    """Block that splits a text into chunks.

    Two modes, selected by the config:
      * pattern mode (``split_on`` set): the text is split on the regex and
        segments are greedily packed into chunks of at most ``chunk_size``
        characters (``overlap`` is ignored);
      * fixed-window mode (``split_on`` is None): fixed windows of
        ``chunk_size`` characters, each starting ``overlap`` characters
        before the previous window's end.
    """

    class Input(BlockSchema):
        text: str = Field(description="Text to be chunked")
        config: ChunkingConfig = Field(description="Chunking configuration")

    class Output(BlockSchema):
        chunks: List[str] = Field(description="List of text chunks")

    def __init__(self):
        super().__init__(
            id="7d9e8f3a-2b5c-4e1d-9f3a-2b5c4e1d9f3a",
            input_schema=ChunkingBlock.Input,
            output_schema=ChunkingBlock.Output,
            test_input={
                "text": "This is a long piece of text that needs to be chunked. " * 20,
                "config": {
                    "chunk_size": 100,
                    "overlap": 20,
                    "split_on": None
                }
            },
            test_output=("chunks", [
                "This is a long piece of text that needs to be chunked. This is a long piece of text that needs to be chunked. ",
                "to be chunked. This is a long piece of text that needs to be chunked. This is a long piece of text that needs ",
                "text that needs to be chunked. This is a long piece of text that needs to be chunked. This is a long piece of ",
                "of text that needs to be chunked. This is a long piece of text that needs to be chunked. This is a long piece ",
                "piece of text that needs to be chunked. This is a long piece of text that needs to be chunked. This is a long "
            ]),
        )

    def chunk_text(self, text: str, config: ChunkingConfig) -> List[str]:
        """Return the list of chunks of *text* according to *config*.

        Raises ``re.error`` if ``config.split_on`` is not a valid regex.
        """
        if config.split_on:
            # Pattern mode: greedily pack split segments into chunks.
            segments = re.split(config.split_on, text)
            chunks = []
            current_chunk = ""
            for segment in segments:
                if len(current_chunk) + len(segment) > config.chunk_size:
                    if current_chunk:
                        chunks.append(current_chunk.strip())
                    current_chunk = segment
                else:
                    current_chunk += (" " if current_chunk else "") + segment
            if current_chunk:
                chunks.append(current_chunk.strip())
        else:
            # Fixed-window mode. Advance by at least one character per
            # iteration: the original `start = end - overlap` step stops
            # advancing (infinite loop) whenever overlap >= chunk_size.
            # For overlap < chunk_size the step is identical to the original.
            step = max(1, config.chunk_size - config.overlap)
            chunks = []
            start = 0
            while start < len(text):
                chunks.append(text[start:start + config.chunk_size])
                start += step

        return chunks

    def run(self, input_data: Input) -> BlockOutput:
        """Chunk the input text and yield the result on the "chunks" pin."""
        chunks = self.chunk_text(input_data.text, input_data.config)
        yield "chunks", chunks
|
||||
65
rnd/autogpt_server/autogpt_server/blocks/embeddings.py
Normal file
65
rnd/autogpt_server/autogpt_server/blocks/embeddings.py
Normal file
@@ -0,0 +1,65 @@
|
||||
import logging
|
||||
from enum import Enum
|
||||
from typing import List
|
||||
|
||||
import openai
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from autogpt_server.data.block import Block, BlockOutput, BlockSchema
|
||||
from autogpt_server.util import json
|
||||
|
||||
# Module-level logger, named after this module per stdlib logging convention;
# used by EmbeddingBlock.run to record API failures.
logger = logging.getLogger(__name__)
|
||||
|
||||
class EmbeddingModel(str, Enum):
    """Closed set of supported embedding model identifiers.

    Subclasses ``str`` so members compare/serialize directly as the model
    name string expected by the embeddings API.
    """

    ada_002 = "text-embedding-ada-002"
|
||||
|
||||
class EmbeddingConfig(BaseModel):
    """Settings required to call the embeddings API."""

    # Which embedding model to request.
    model: EmbeddingModel
    # API key passed to the OpenAI client for this request.
    api_key: str
|
||||
|
||||
class EmbeddingBlock(Block):
    """Block that converts a list of texts into embedding vectors using the
    OpenAI embeddings API."""

    class Input(BlockSchema):
        config: EmbeddingConfig
        texts: List[str] = Field(description="List of texts to create embeddings for")

    class Output(BlockSchema):
        # One embedding vector per input text, in input order.
        embeddings: List[List[float]]
        # Human-readable failure message; emitted instead of `embeddings`.
        error: str

    def __init__(self):
        super().__init__(
            id="8f7e9a1c-3b7a-4b0f-9f1a-1c3b7a4b0f9f",
            input_schema=EmbeddingBlock.Input,
            output_schema=EmbeddingBlock.Output,
            test_input={
                "config": {
                    "model": "text-embedding-ada-002",
                    "api_key": "fake-api-key",
                },
                "texts": ["Hello, world!", "AutoGPT is amazing"],
            },
            test_output=("embeddings", [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]]),
            test_mock={"create_embeddings": lambda *args, **kwargs: [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]]},
        )

    @staticmethod
    def create_embeddings(api_key: str, model: EmbeddingModel, texts: List[str]) -> List[List[float]]:
        """Call the OpenAI embeddings endpoint; return one vector per text.

        Uses a per-call client rather than assigning the process-global
        ``openai.api_key`` (the original mutated shared module state, which
        races when blocks with different keys run concurrently).
        """
        client = openai.OpenAI(api_key=api_key)
        # EmbeddingModel subclasses str, so it is accepted directly as the
        # model name.
        response = client.embeddings.create(
            model=model,
            input=texts,
        )
        return [item.embedding for item in response.data]

    def run(self, input_data: Input) -> BlockOutput:
        """Yield embeddings for the input texts, or an error message if the
        API call fails (network, auth, quota, ...)."""
        try:
            embeddings = self.create_embeddings(
                input_data.config.api_key,
                input_data.config.model,
                input_data.texts,
            )
            yield "embeddings", embeddings
        except Exception as e:
            error_message = f"Error creating embeddings: {str(e)}"
            logger.error(error_message)
            yield "error", error_message
|
||||
Reference in New Issue
Block a user