Compare commits

...

53 Commits

Author SHA1 Message Date
Zamil Majdy ab3a62995f Revert 2024-09-17 21:00:47 -05:00
Zamil Majdy f2e9a8463d Move shutil in 2024-09-17 20:55:20 -05:00
Zamil Majdy 107148749b Move venv in 2024-09-17 20:51:56 -05:00
Zamil Majdy 17f1d33ed3 Revert 2024-09-17 20:46:01 -05:00
Zamil Majdy b3a0fc538a Revert 2024-09-17 20:39:29 -05:00
Zamil Majdy e818bbf859 Merge branch 'master' into ntindle/samples 2024-09-17 18:30:23 -05:00
Nicholas Tindle 069ec89691 Update test_manager.py 2024-09-17 10:07:41 -05:00
Nicholas Tindle 84d490bcb1 Merge branch 'master' into ntindle/samples 2024-09-17 09:46:22 -05:00
Nicholas Tindle 9368956d5d Update test_manager.py 2024-09-16 14:53:28 -05:00
Nicholas Tindle 2c3bde0c53 Merge branch 'master' into ntindle/samples 2024-09-16 14:40:12 -05:00
Nicholas Tindle 104b56628e fix: merge oops 2024-09-16 14:39:23 -05:00
Nicholas Tindle 15ac526eee Merge branch 'master' into ntindle/samples 2024-09-15 10:26:49 -05:00
Nicholas Tindle 5f83e354b9 Merge branch 'master' into ntindle/samples 2024-09-13 23:35:34 -05:00
Nicholas Tindle 70ebf4d58b Update test_manager.py 2024-09-13 22:31:33 -05:00
Nicholas Tindle 6d0d264d99 feat(server): set timeout to 44 second 2024-09-13 18:09:16 -05:00
Nicholas Tindle 8e24b546a3 Merge branch 'master' into ntindle/samples 2024-09-12 23:57:23 -05:00
Nicholas Tindle d4838cdc45 Update test_manager.py 2024-09-12 23:51:01 -05:00
Nicholas Tindle acaca35498 Update test_manager.py 2024-09-12 23:43:58 -05:00
Nicholas Tindle 9ee0825f21 Update test_manager.py 2024-09-12 23:36:06 -05:00
Nicholas Tindle 5fde0f2c67 lets try 34 secondsd 2024-09-12 23:29:13 -05:00
Nicholas Tindle e3407fdfb4 Update test_manager.py 2024-09-12 23:02:08 -05:00
Nicholas Tindle b98e62cdef Update test_manager.py 2024-09-12 22:54:07 -05:00
Nicholas Tindle 4d82f78f04 Update test_manager.py 2024-09-12 22:47:36 -05:00
Nicholas Tindle c5d2586f6c Update test_manager.py 2024-09-12 22:34:56 -05:00
Nicholas Tindle 589c8d94ec feat: warning 2024-09-12 10:59:33 -05:00
Nicholas Tindle 136d258a46 Merge branch 'master' into ntindle/samples 2024-09-12 07:02:42 -05:00
Nicholas Tindle 92bcc39f4d Merge branch 'master' into ntindle/samples 2024-09-08 07:48:24 -05:00
Nicholas Tindle 5909697215 Merge branch 'master' into ntindle/samples 2024-09-06 22:47:56 -05:00
Nicholas Tindle bf34801a74 feat: longer timeout? 2024-09-06 22:05:16 -05:00
Nicholas Tindle 154eccb9af fix: longer for tests? 2024-09-06 21:56:25 -05:00
Nicholas Tindle 14f8a92c20 Merge branch 'master' into ntindle/samples 2024-09-06 21:48:22 -05:00
Nicholas Tindle 2c07c64ccf Update code.py 2024-09-05 18:16:34 -05:00
Nicholas Tindle ef21d359a6 Update code.py 2024-09-05 17:44:37 -05:00
Nicholas Tindle f4bd998fa2 Merge branch 'master' into ntindle/samples 2024-09-05 17:43:45 -05:00
Nicholas Tindle 4ebae90f62 Merge branch 'master' into ntindle/samples 2024-08-26 13:05:11 -05:00
Bentlybro 09d3768948 fix output to make pytest work 2024-08-17 21:37:32 +01:00
Nicholas Tindle 8c6adaeaa1 feat(server): linting and bug fix on llm 2024-08-16 19:22:12 -05:00
Nicholas Tindle dabd2e1610 Merge branch 'master' into ntindle/samples 2024-08-16 17:19:36 -07:00
Nicholas Tindle b228c4445e feat(server): much better execution of unified 2024-08-15 10:29:03 -05:00
Nicholas Tindle 05c9931c11 feat(server): more complicated blocks 2024-08-14 21:42:04 -05:00
Nicholas Tindle 9198a86c0e fix(server): no default was provided 2024-08-14 21:41:45 -05:00
Nicholas Tindle c8fedf3dad feat(server): even more advanced 2024-08-14 21:28:51 -05:00
Nicholas Tindle 0c7e1838cd feat(server): more advanced coding blocks 2024-08-14 21:20:35 -05:00
Nicholas Tindle 979d80cd17 feat(server): broken code exec lol 2024-08-14 21:06:59 -05:00
Nicholas Tindle 4f7ffd13e4 feat(server): timeouts on code 2024-08-14 21:01:25 -05:00
Nicholas Tindle b944e0f6da feat(server): code args 2024-08-14 20:57:38 -05:00
Nicholas Tindle 51aaaf6ddc fix(server): stratified sampling 2024-08-14 20:42:46 -05:00
Nicholas Tindle 3c662af1ba feat(server): allow yielding all data at once rather than row by row 2024-08-14 20:40:17 -05:00
Nicholas Tindle 17370116f6 feat(server): improve various sampling techniques 2024-08-14 20:39:48 -05:00
Nicholas Tindle d15049e9a7 Merge branch 'master' into ntindle/samples 2024-08-14 16:12:00 -05:00
Nicholas Tindle da4afd4530 fix(server): anthropic retry didn't work 2024-08-14 13:48:12 -05:00
Nicholas Tindle 7617aa6d1f Merge branch 'master' into ntindle/samples 2024-08-14 13:29:39 -05:00
Nicholas Tindle b190e1f2aa feat(server): sampling and code block 2024-08-14 00:13:15 -05:00
3 changed files with 328 additions and 20 deletions

View File

@@ -14,7 +14,8 @@ class ReadCsvBlock(Block):
         skip_columns: list[str] = []

     class Output(BlockSchema):
-        data: dict[str, str]
+        row: dict[str, str]
+        all_data: list[dict[str, str]]

     def __init__(self):
         super().__init__(
@@ -27,8 +28,15 @@ class ReadCsvBlock(Block):
                 "contents": "a, b, c\n1,2,3\n4,5,6",
             },
             test_output=[
-                ("data", {"a": "1", "b": "2", "c": "3"}),
-                ("data", {"a": "4", "b": "5", "c": "6"}),
+                ("row", {"a": "1", "b": "2", "c": "3"}),
+                ("row", {"a": "4", "b": "5", "c": "6"}),
+                (
+                    "all_data",
+                    [
+                        {"a": "1", "b": "2", "c": "3"},
+                        {"a": "4", "b": "5", "c": "6"},
+                    ],
+                ),
             ],
         )
@@ -53,8 +61,7 @@ class ReadCsvBlock(Block):
         for _ in range(input_data.skip_rows):
             next(reader)

-        # join the data with the header
-        for row in reader:
+        def process_row(row):
             data = {}
             for i, value in enumerate(row):
                 if i not in input_data.skip_columns:
@@ -62,4 +69,12 @@ class ReadCsvBlock(Block):
                     data[header[i]] = value.strip() if input_data.strip else value
                 else:
                     data[str(i)] = value.strip() if input_data.strip else value
-            yield "data", data
+            return data
+
+        all_data = []
+        for row in reader:
+            processed_row = process_row(row)
+            all_data.append(processed_row)
+            yield "row", processed_row
+        yield "all_data", all_data

View File

@@ -1,6 +1,7 @@
 import logging
 from enum import Enum
-from typing import List, NamedTuple
+from json import JSONDecodeError
+from typing import Any, List, NamedTuple

 import anthropic
 import ollama
@@ -93,7 +94,7 @@ class AIStructuredResponseGeneratorBlock(Block):
         )

     class Output(BlockSchema):
-        response: dict[str, str]
+        response: dict[str, Any]
         error: str

     def __init__(self):
@@ -139,16 +140,33 @@ class AIStructuredResponseGeneratorBlock(Block):
             )
             return response.choices[0].message.content or ""
         elif provider == "anthropic":
-            sysprompt = "".join([p["content"] for p in prompt if p["role"] == "system"])
-            usrprompt = [p for p in prompt if p["role"] == "user"]
+            system_messages = [p["content"] for p in prompt if p["role"] == "system"]
+            sysprompt = " ".join(system_messages)
+
+            messages = []
+            last_role = None
+            for p in prompt:
+                if p["role"] in ["user", "assistant"]:
+                    if p["role"] != last_role:
+                        messages.append({"role": p["role"], "content": p["content"]})
+                        last_role = p["role"]
+                    else:
+                        # If the role is the same as the last one, combine the content
+                        messages[-1]["content"] += "\n" + p["content"]
+
             client = anthropic.Anthropic(api_key=api_key)
-            response = client.messages.create(
-                model=model.value,
-                max_tokens=4096,
-                system=sysprompt,
-                messages=usrprompt,  # type: ignore
-            )
-            return response.content[0].text if response.content else ""
+            try:
+                response = client.messages.create(
+                    model=model.value,
+                    max_tokens=4096,
+                    system=sysprompt,
+                    messages=messages,
+                )
+                return response.content[0].text if response.content else ""
+            except anthropic.APIError as e:
+                error_message = f"Anthropic API error: {str(e)}"
+                logger.error(error_message)
+                raise ValueError(error_message)
         elif provider == "groq":
             client = Groq(api_key=api_key)
             response_format = {"type": "json_object"} if json_format else None
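
The new message loop exists because Anthropic's Messages API takes system text through a separate system parameter and, at the time of this change, did not accept consecutive messages with the same role. A standalone sketch of that normalization (the function name and sample conversation are illustrative, not from the diff):

def normalize_for_anthropic(prompt: list[dict[str, str]]) -> tuple[str, list[dict[str, str]]]:
    """Split out system text and merge consecutive same-role turns."""
    sysprompt = " ".join(p["content"] for p in prompt if p["role"] == "system")

    messages: list[dict[str, str]] = []
    last_role = None
    for p in prompt:
        if p["role"] not in ("user", "assistant"):
            continue  # system messages were folded into sysprompt above
        if p["role"] != last_role:
            messages.append({"role": p["role"], "content": p["content"]})
            last_role = p["role"]
        else:
            # Same role twice in a row: concatenate instead of appending
            messages[-1]["content"] += "\n" + p["content"]
    return sysprompt, messages

sysprompt, messages = normalize_for_anthropic([
    {"role": "system", "content": "You are terse."},
    {"role": "user", "content": "First question."},
    {"role": "user", "content": "Second question."},
])
assert len(messages) == 1  # the two user turns were merged

The previous code also dropped assistant turns entirely (it only kept user messages), so multi-turn conversations lost half their context; the loop fixes that as well.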
@@ -199,14 +217,16 @@ class AIStructuredResponseGeneratorBlock(Block):
         prompt.append({"role": "user", "content": input_data.prompt})

-        def parse_response(resp: str) -> tuple[dict[str, str], str | None]:
+        def parse_response(resp: str) -> tuple[dict[str, Any], str | None]:
             try:
                 parsed = json.loads(resp)
+                if not isinstance(parsed, dict):
+                    return {}, f"Expected a dictionary, but got {type(parsed)}"
                 miss_keys = set(input_data.expected_format.keys()) - set(parsed.keys())
                 if miss_keys:
                     return parsed, f"Missing keys: {miss_keys}"
                 return parsed, None
-            except Exception as e:
+            except JSONDecodeError as e:
                 return {}, f"JSON decode error: {e}"

         logger.info(f"LLM request: {prompt}")
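
The hardened parse_response now separates three failure modes: unparseable text, valid JSON that is not an object, and an object missing expected keys; only genuine decode errors are swallowed, rather than a bare except Exception. A minimal standalone version (EXPECTED_KEYS stands in for input_data.expected_format.keys() and is an assumption of the sketch):

import json
from json import JSONDecodeError
from typing import Any

EXPECTED_KEYS = {"name", "age"}  # stand-in for input_data.expected_format.keys()

def parse_response(resp: str) -> tuple[dict[str, Any], str | None]:
    try:
        parsed = json.loads(resp)
        if not isinstance(parsed, dict):
            return {}, f"Expected a dictionary, but got {type(parsed)}"
        miss_keys = EXPECTED_KEYS - set(parsed.keys())
        if miss_keys:
            return parsed, f"Missing keys: {miss_keys}"
        return parsed, None
    except JSONDecodeError as e:
        return {}, f"JSON decode error: {e}"

assert parse_response('{"name": "a", "age": 1}') == ({"name": "a", "age": 1}, None)
assert "Expected a dictionary" in (parse_response("[1, 2]")[1] or "")
assert "JSON decode error" in (parse_response("not json")[1] or "")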
@@ -230,7 +250,16 @@ class AIStructuredResponseGeneratorBlock(Block):
         if input_data.expected_format:
             parsed_dict, parsed_error = parse_response(response_text)
             if not parsed_error:
-                yield "response", {k: str(v) for k, v in parsed_dict.items()}
+                yield "response", {
+                    k: (
+                        json.loads(v)
+                        if isinstance(v, str)
+                        and v.startswith("[")
+                        and v.endswith("]")
+                        else (", ".join(v) if isinstance(v, list) else v)
+                    )
+                    for k, v in parsed_dict.items()
+                }
                 return
         else:
             yield "response", {"response": response_text}

View File

@@ -0,0 +1,264 @@
import random
from collections import defaultdict
from enum import Enum
from typing import Any, Dict, List, Optional, Union

from autogpt_server.data.block import Block, BlockCategory, BlockOutput, BlockSchema
from autogpt_server.data.model import SchemaField


class SamplingMethod(str, Enum):
    RANDOM = "random"
    SYSTEMATIC = "systematic"
    TOP = "top"
    BOTTOM = "bottom"
    STRATIFIED = "stratified"
    WEIGHTED = "weighted"
    RESERVOIR = "reservoir"
    CLUSTER = "cluster"


class DataSamplingBlock(Block):
    class Input(BlockSchema):
        data: Union[Dict[str, Any], List[Union[dict, List[Any]]]] = SchemaField(
            description="The dataset to sample from. Can be a single dictionary, a list of dictionaries, or a list of lists.",
            placeholder="{'id': 1, 'value': 'a'} or [{'id': 1, 'value': 'a'}, {'id': 2, 'value': 'b'}, ...]",
        )
        sample_size: int = SchemaField(
            description="The number of samples to take from the dataset.",
            placeholder="10",
            default=10,
        )
        sampling_method: SamplingMethod = SchemaField(
            description="The method to use for sampling.",
            default=SamplingMethod.RANDOM,
        )
        accumulate: bool = SchemaField(
            description="Whether to accumulate data before sampling.",
            default=False,
        )
        random_seed: Optional[int] = SchemaField(
            description="Seed for random number generator (optional).",
            default=None,
        )
        stratify_key: Optional[str] = SchemaField(
            description="Key to use for stratified sampling (required for stratified sampling).",
            default=None,
        )
        weight_key: Optional[str] = SchemaField(
            description="Key to use for weighted sampling (required for weighted sampling).",
            default=None,
        )
        cluster_key: Optional[str] = SchemaField(
            description="Key to use for cluster sampling (required for cluster sampling).",
            default=None,
        )

    class Output(BlockSchema):
        sampled_data: List[Union[dict, List[Any]]] = SchemaField(
            description="The sampled subset of the input data."
        )
        sample_indices: List[int] = SchemaField(
            description="The indices of the sampled data in the original dataset."
        )

    def __init__(self):
        super().__init__(
            id="4a448883-71fa-49cf-91cf-70d793bd7d87",
            description="This block samples data from a given dataset using various sampling methods.",
            categories={BlockCategory.LOGIC},
            input_schema=DataSamplingBlock.Input,
            output_schema=DataSamplingBlock.Output,
            test_input={
                "data": [
                    {"id": i, "value": chr(97 + i), "group": i % 3} for i in range(10)
                ],
                "sample_size": 3,
                "sampling_method": SamplingMethod.STRATIFIED,
                "accumulate": False,
                "random_seed": 42,
                "stratify_key": "group",
            },
            test_output=[
                (
                    "sampled_data",
                    [
                        {"id": 0, "value": "a", "group": 0},
                        {"id": 1, "value": "b", "group": 1},
                        {"id": 8, "value": "i", "group": 2},
                    ],
                ),
                ("sample_indices", [0, 1, 8]),
            ],
        )
        self.accumulated_data = []

    def run(self, input_data: Input) -> BlockOutput:
        if input_data.accumulate:
            if isinstance(input_data.data, dict):
                self.accumulated_data.append(input_data.data)
            elif isinstance(input_data.data, list):
                self.accumulated_data.extend(input_data.data)
            else:
                raise ValueError(f"Unsupported data type: {type(input_data.data)}")

            # If we don't have enough data yet, return without sampling
            if len(self.accumulated_data) < input_data.sample_size:
                return

            data_to_sample = self.accumulated_data
        else:
            # If not accumulating, use the input data directly
            data_to_sample = (
                input_data.data
                if isinstance(input_data.data, list)
                else [input_data.data]
            )

        if input_data.random_seed is not None:
            random.seed(input_data.random_seed)

        data_size = len(data_to_sample)

        if input_data.sample_size > data_size:
            raise ValueError(
                f"Sample size ({input_data.sample_size}) cannot be larger than the dataset size ({data_size})."
            )

        indices = []

        if input_data.sampling_method == SamplingMethod.RANDOM:
            indices = random.sample(range(data_size), input_data.sample_size)
        elif input_data.sampling_method == SamplingMethod.SYSTEMATIC:
            step = data_size // input_data.sample_size
            start = random.randint(0, step - 1)
            indices = list(range(start, data_size, step))[: input_data.sample_size]
        elif input_data.sampling_method == SamplingMethod.TOP:
            indices = list(range(input_data.sample_size))
        elif input_data.sampling_method == SamplingMethod.BOTTOM:
            indices = list(range(data_size - input_data.sample_size, data_size))
        elif input_data.sampling_method == SamplingMethod.STRATIFIED:
            if not input_data.stratify_key:
                raise ValueError(
                    "Stratify key must be provided for stratified sampling."
                )
            strata = defaultdict(list)
            for i, item in enumerate(data_to_sample):
                if isinstance(item, dict):
                    strata_value = item.get(input_data.stratify_key)
                elif hasattr(item, input_data.stratify_key):
                    strata_value = getattr(item, input_data.stratify_key)
                else:
                    raise ValueError(
                        f"Stratify key '{input_data.stratify_key}' not found in item {item}"
                    )
                if strata_value is None:
                    raise ValueError(
                        f"Stratify value for key '{input_data.stratify_key}' is None"
                    )
                strata[str(strata_value)].append(i)

            # Calculate the number of samples to take from each stratum
            stratum_sizes = {
                k: max(1, int(len(v) / data_size * input_data.sample_size))
                for k, v in strata.items()
            }

            # Adjust sizes to ensure we get exactly sample_size samples
            while sum(stratum_sizes.values()) != input_data.sample_size:
                if sum(stratum_sizes.values()) < input_data.sample_size:
                    stratum_sizes[
                        max(stratum_sizes, key=lambda k: stratum_sizes[k])
                    ] += 1
                else:
                    stratum_sizes[
                        max(stratum_sizes, key=lambda k: stratum_sizes[k])
                    ] -= 1

            for stratum, size in stratum_sizes.items():
                indices.extend(random.sample(strata[stratum], size))
        elif input_data.sampling_method == SamplingMethod.WEIGHTED:
            if not input_data.weight_key:
                raise ValueError("Weight key must be provided for weighted sampling.")
            weights = []
            for item in data_to_sample:
                if isinstance(item, dict):
                    weight = item.get(input_data.weight_key)
                elif hasattr(item, input_data.weight_key):
                    weight = getattr(item, input_data.weight_key)
                else:
                    raise ValueError(
                        f"Weight key '{input_data.weight_key}' not found in item {item}"
                    )
                if weight is None:
                    raise ValueError(
                        f"Weight value for key '{input_data.weight_key}' is None"
                    )
                try:
                    weights.append(float(weight))
                except ValueError:
                    raise ValueError(
                        f"Weight value '{weight}' cannot be converted to a number"
                    )
            if not weights:
                raise ValueError(
                    f"No valid weights found using key '{input_data.weight_key}'"
                )
            indices = random.choices(
                range(data_size), weights=weights, k=input_data.sample_size
            )
        elif input_data.sampling_method == SamplingMethod.RESERVOIR:
            indices = list(range(input_data.sample_size))
            for i in range(input_data.sample_size, data_size):
                j = random.randint(0, i)
                if j < input_data.sample_size:
                    indices[j] = i
        elif input_data.sampling_method == SamplingMethod.CLUSTER:
            if not input_data.cluster_key:
                raise ValueError("Cluster key must be provided for cluster sampling.")
            clusters = defaultdict(list)
            for i, item in enumerate(data_to_sample):
                if isinstance(item, dict):
                    cluster_value = item.get(input_data.cluster_key)
                elif hasattr(item, input_data.cluster_key):
                    cluster_value = getattr(item, input_data.cluster_key)
                else:
                    raise TypeError(
                        f"Item {item} does not have the cluster key '{input_data.cluster_key}'"
                    )
                clusters[str(cluster_value)].append(i)

            # Randomly select clusters until we have enough samples
            selected_clusters = []
            while (
                sum(len(clusters[c]) for c in selected_clusters)
                < input_data.sample_size
            ):
                available_clusters = [c for c in clusters if c not in selected_clusters]
                if not available_clusters:
                    break
                selected_clusters.append(random.choice(available_clusters))

            for cluster in selected_clusters:
                indices.extend(clusters[cluster])

            # If we have more samples than needed, randomly remove some
            if len(indices) > input_data.sample_size:
                indices = random.sample(indices, input_data.sample_size)
        else:
            raise ValueError(f"Unknown sampling method: {input_data.sampling_method}")

        sampled_data = [data_to_sample[i] for i in indices]

        # Clear accumulated data after sampling if accumulation is enabled
        if input_data.accumulate:
            self.accumulated_data = []

        yield "sampled_data", sampled_data
        yield "sample_indices", indices