Mirror of https://github.com/Significant-Gravitas/AutoGPT.git (synced 2026-02-03 11:24:57 -05:00)
**FIX ALL LINT/TYPE ERRORS IN AUTOGPT, FORGE, AND BENCHMARK**

### Linting
- Clean up linter configs for `autogpt`, `forge`, and `benchmark`
- Add type checking with Pyright
- Create unified pre-commit config
- Create unified linting and type checking CI workflow

### Testing
- Synchronize CI test setups for `autogpt`, `forge`, and `benchmark`
- Add missing pytest-cov to benchmark dependencies
- Mark GCS tests as slow to speed up pre-commit test runs
- Repair the `forge` test suite
- Add an `AgentDB.close()` method for test DB teardown in db_test.py (a hedged fixture sketch follows this changelog)
- Use an actual temporary dir instead of forge/test_workspace/
- Move left-behind dependencies for moved `forge` code from `autogpt` to `forge`

### Notable type changes
- Replace uses of `ChatModelProvider` with `MultiProvider`
- Remove unnecessary exports from various `__init__.py` files
- Simplify the `FileStorage.open_file` signature by removing `IOBase` from the return type union
- Implement an `S3BinaryIOWrapper(BinaryIO)` type interposer for `S3FileStorage`
- Expand the overloads of `GCSFileStorage.open_file` for improved typing of read and write modes (see the overload sketch after this changelog).
  Type checking had to be silenced for the extra overloads because Pyright appears to report a false positive: https://github.com/microsoft/pyright/issues/8007
- Change the `count_tokens`, `get_tokenizer`, and `count_message_tokens` methods on `ModelProvider`s from class methods to instance methods
- Move the `CompletionModelFunction.schema` method to a helper function `format_function_def_for_openai` in `forge.llm.providers.openai`
- Rename `ModelProvider` -> `BaseModelProvider`
- Rename `ChatModelProvider` -> `BaseChatModelProvider`
- Add a `ChatModelProvider` type which is a union of all subclasses of `BaseChatModelProvider` (see the union sketch after this changelog)

### Removed rather than fixed
- Remove the deprecated and broken autogpt/agbenchmark_config/benchmarks.py
- Remove various base classes and properties on base classes in `forge.llm.providers.schema` and `forge.models.providers`

### Fixes for other issues that came to light
- Clean up `forge.agent_protocol.api_router`, `forge.agent_protocol.database`, and `forge.agent.agent`
- Add fallback behavior to `ImageGeneratorComponent`
- Remove the test for the deprecated failure behavior
- Fix the `agbenchmark.challenges.builtin` challenge exclusion mechanism on Windows
- Fix `_tool_calls_compat_extract_calls` in `forge.llm.providers.openai`
- Add support for `any` (= no type specified) in `JSONSchema.typescript_type`
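To illustrate the `GCSFileStorage.open_file` overload change, here is a minimal sketch of the pattern, assuming nothing about the real `forge` API: `typing.overload` plus `Literal` argument types let a type checker infer `TextIO` for text mode and `BinaryIO` for binary mode instead of a union. The class, parameter names, and local-filesystem backing below are placeholders for illustration only.

```python
from __future__ import annotations

import tempfile
from pathlib import Path
from typing import IO, BinaryIO, Literal, TextIO, overload


class StorageSketch:
    """Placeholder class, not the real GCSFileStorage; it only demonstrates
    the overload pattern on top of the local filesystem."""

    def __init__(self, root: Path) -> None:
        self._root = root

    @overload
    def open_file(
        self, path: str, mode: Literal["r", "w"] = "r", binary: Literal[False] = False
    ) -> TextIO: ...

    @overload
    def open_file(
        self, path: str, mode: Literal["r", "w"] = "r", *, binary: Literal[True]
    ) -> BinaryIO: ...

    def open_file(
        self, path: str, mode: Literal["r", "w"] = "r", binary: bool = False
    ) -> IO:
        # With the overloads above, callers get TextIO in text mode and
        # BinaryIO when binary=True, instead of a TextIO | BinaryIO union.
        return open(self._root / path, mode + ("b" if binary else ""))


with tempfile.TemporaryDirectory() as tmp:
    storage = StorageSketch(Path(tmp))
    with storage.open_file("notes.txt", "w") as f:
        f.write("hello")  # typed as TextIO, so str is accepted
    with storage.open_file("notes.txt", "r", binary=True) as f:
        data: bytes = f.read()  # typed as BinaryIO, so bytes comes back
```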
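And here is a minimal sketch of the rename-plus-union idea, with invented placeholder provider classes (the real classes live in `forge.llm.providers` and differ from these): an abstract base keeps the shared interface, while the old `ChatModelProvider` name becomes a concrete union of the known subclasses. It also shows `count_tokens` as an instance method rather than a class method, as described above.

```python
from __future__ import annotations

from abc import ABC, abstractmethod


class BaseChatModelProviderSketch(ABC):
    """Stand-in for the renamed base class; method names are assumptions."""

    @abstractmethod
    def count_tokens(self, text: str, model_name: str) -> int:
        """Instance method, so it can use per-instance config or credentials."""


class OpenAIProviderSketch(BaseChatModelProviderSketch):
    def count_tokens(self, text: str, model_name: str) -> int:
        return len(text.split())  # placeholder; a real provider would use tiktoken


class AnthropicProviderSketch(BaseChatModelProviderSketch):
    def count_tokens(self, text: str, model_name: str) -> int:
        return len(text) // 4  # placeholder heuristic


# The "ChatModelProvider" name is reused as a union of the concrete
# subclasses, which lets the type checker narrow provider-specific members.
ChatModelProvider = OpenAIProviderSketch | AnthropicProviderSketch


providers: list[ChatModelProvider] = [OpenAIProviderSketch(), AnthropicProviderSketch()]
for provider in providers:
    print(type(provider).__name__, provider.count_tokens("hello world", "some-model"))
```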
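The `AgentDB.close()` teardown could be wired into a pytest fixture roughly like this; the import path and constructor argument shown are assumptions for illustration, not taken from db_test.py.

```python
import pytest

# Assumed import path and constructor signature -- verify against the actual
# forge code (the class lives somewhere under forge.agent_protocol.database).
from forge.agent_protocol.database.db import AgentDB


@pytest.fixture
def agent_db(tmp_path):
    db = AgentDB(f"sqlite:///{tmp_path / 'agent.db'}")  # assumed connection-string arg
    yield db
    # Teardown: the new close() releases the connection so the temporary
    # database file can be removed cleanly after each test.
    db.close()
```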
94 lines · 3.1 KiB · Python
import datetime
import time

import pytest
import requests

URL_BENCHMARK = "http://localhost:8080/ap/v1"
URL_AGENT = "http://localhost:8000/ap/v1"

try:
    response = requests.get(f"{URL_AGENT}/agent/tasks")
except requests.exceptions.ConnectionError:
    pytest.skip("No agent available to test against", allow_module_level=True)


@pytest.mark.parametrize(
    "eval_id, input_text, expected_artifact_length, test_name, should_be_successful",
    [
        (
            "021c695a-6cc4-46c2-b93a-f3a9b0f4d123",
            "Write the word 'Washington' to a .txt file",
            0,
            "WriteFile",
            True,
        ),
        (
            "f219f3d3-a41b-45a9-a3d0-389832086ee8",
            "Read the file called file_to_read.txt "
            "and write its content to a file called output.txt",
            1,
            "ReadFile",
            False,
        ),
    ],
)
def test_entire_workflow(
    eval_id: str,
    input_text: str,
    expected_artifact_length: int,
    test_name: str,
    should_be_successful: bool,
):
    task_request = {"eval_id": eval_id, "input": input_text}
    response = requests.get(f"{URL_AGENT}/agent/tasks")
    task_count_before = response.json()["pagination"]["total_items"]
    # First POST request
    task_response_benchmark = requests.post(
        URL_BENCHMARK + "/agent/tasks", json=task_request
    )
    response = requests.get(f"{URL_AGENT}/agent/tasks")
    task_count_after = response.json()["pagination"]["total_items"]
    assert task_count_after == task_count_before + 1

    timestamp_after_task_eval_created = datetime.datetime.now(datetime.timezone.utc)
    time.sleep(1.1)  # To make sure the 2 timestamps to compare are different
    assert task_response_benchmark.status_code == 200
    task_response_benchmark = task_response_benchmark.json()
    assert task_response_benchmark["input"] == input_text

    task_response_benchmark_id = task_response_benchmark["task_id"]

    response_task_agent = requests.get(
        f"{URL_AGENT}/agent/tasks/{task_response_benchmark_id}"
    )
    assert response_task_agent.status_code == 200
    response_task_agent = response_task_agent.json()
    assert len(response_task_agent["artifacts"]) == expected_artifact_length

    step_request = {"input": input_text}

    step_response = requests.post(
        URL_BENCHMARK + "/agent/tasks/" + task_response_benchmark_id + "/steps",
        json=step_request,
    )
    assert step_response.status_code == 200
    step_response = step_response.json()
    assert step_response["is_last"] is True  # Assuming is_last is always True

    eval_response = requests.post(
        URL_BENCHMARK + "/agent/tasks/" + task_response_benchmark_id + "/evaluations",
        json={},
    )
    assert eval_response.status_code == 200
    eval_response = eval_response.json()
    print("eval_response")
    print(eval_response)
    assert eval_response["run_details"]["test_name"] == test_name
    assert eval_response["metrics"]["success"] == should_be_successful
    benchmark_start_time = datetime.datetime.fromisoformat(
        eval_response["run_details"]["benchmark_start_time"]
    )

    assert benchmark_start_time < timestamp_after_task_eval_created