Agenteval integration (#2672)
* first pass at offline agent eval integration
* Integrating AgentEval for offline scenarios
* removing old changes
* fixing notebook, updating docs
* fixing subcriteria bug
* updating class comment
* cleaning up agent constructors
* moving AgentEval agents to separate folder and adding a brief README
* fixing build breaks
* fixing formatting break
* fixing comments
* consolidating files in the agenteval folder under contrib and cleaning up imports
* fixing import ordering
* adding basic agenteval tests and fixing criteria parsing bug
* first try at adding openai agenteval tests to build process
* adding non-openai agenteval tests to build process
* updating test settings
* updating openai test
* Update test/agentchat/contrib/agent_eval/test_agent_eval.py
  Co-authored-by: Wael Karkoub <wael.karkoub96@gmail.com>
* Update .github/workflows/contrib-openai.yml
  Co-authored-by: Wael Karkoub <wael.karkoub96@gmail.com>
* test commit
* updating typing and converting to pydantic objects
* fixing test file

---------

Co-authored-by: Beibin Li <BeibinLi@users.noreply.github.com>
Co-authored-by: Chi Wang <wang.chi@microsoft.com>
Co-authored-by: Wael Karkoub <wael.karkoub96@gmail.com>
committed by GitHub
parent 4b747d731a
commit dad9c66104
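For orientation, a minimal sketch of the offline AgentEval flow these tests exercise: generate_criteria first derives evaluation criteria from a Task, and quantify_criteria then scores a single test case against them. Call signatures and result keys mirror test_agent_eval.py below; the config file location, model choice, and sample inputs are illustrative assumptions, not part of this commit.

# Illustrative sketch only; signatures mirror test_agent_eval.py, while the
# config location and the sample inputs below are assumptions.
import autogen
from autogen.agentchat.contrib.agent_eval.agent_eval import generate_criteria, quantify_criteria
from autogen.agentchat.contrib.agent_eval.task import Task

llm_config = {"config_list": autogen.config_list_from_json("OAI_CONFIG_LIST")}

task = Task(
    name="Math problem solving",
    description="Given any question, the system needs to solve the problem as concisely and accurately as possible",
    successful_response='{"message": "The answer is 5"}',
    failed_response='{"message": "I don\'t know the answer"}',
)

# Step 1: derive a list of Criterion objects (name, description, accepted_values) from the task.
criteria = generate_criteria(task=task, llm_config=llm_config)

# Step 2: score one chat log against those criteria. In the tests, ground_truth
# is the "is_correct" value that remove_ground_truth() strips from the test case.
quantified = quantify_criteria(
    llm_config=llm_config,
    criteria=criteria,
    task=task,
    test_case='{"message": "The answer is 5"}',
    ground_truth="True",
)
print(quantified["estimated_performance"])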
test/agentchat/contrib/agent_eval/test_agent_eval.py (new file, 100 lines)
@@ -0,0 +1,100 @@
#!/usr/bin/env python3 -m pytest

import json

import pytest
from conftest import reason, skip_openai  # noqa: E402

import autogen
from autogen.agentchat.contrib.agent_eval.agent_eval import generate_criteria, quantify_criteria
from autogen.agentchat.contrib.agent_eval.criterion import Criterion
from autogen.agentchat.contrib.agent_eval.task import Task

KEY_LOC = "notebook"
OAI_CONFIG_LIST = "OAI_CONFIG_LIST"


def remove_ground_truth(test_case: str):
    test_details = json.loads(test_case)
    # need to remove the ground truth from the test details
    correctness = test_details.pop("is_correct", None)
    test_details.pop("correct_ans", None)
    test_details.pop("check_result", None)
    return str(test_details), correctness


if not skip_openai:
    openai_config_list = autogen.config_list_from_json(
        OAI_CONFIG_LIST,
        file_location=KEY_LOC,
        # The Retrieval tool requires at least gpt-3.5-turbo-1106 (newer versions are supported) or gpt-4-turbo-preview models.
        # https://platform.openai.com/docs/models/overview
        filter_dict={
            "api_type": ["openai"],
            "model": [
                "gpt-4-turbo",
                "gpt-4-turbo-preview",
                "gpt-4-0125-preview",
                "gpt-4-1106-preview",
                "gpt-3.5-turbo",
                "gpt-3.5-turbo-0125",
                "gpt-3.5-turbo-1106",
            ],
        },
    )

    aoai_config_list = autogen.config_list_from_json(
        OAI_CONFIG_LIST,
        file_location=KEY_LOC,
        filter_dict={"api_type": ["azure"]},
    )

    success_str = open("test/test_files/agenteval-in-out/samples/sample_math_response_successful.txt", "r").read()
    response_successful = remove_ground_truth(success_str)[0]
    failed_str = open("test/test_files/agenteval-in-out/samples/sample_math_response_failed.txt", "r").read()
    response_failed = remove_ground_truth(failed_str)[0]
    task = Task(
        **{
            "name": "Math problem solving",
            "description": "Given any question, the system needs to solve the problem as concisely and accurately as possible",
            "successful_response": response_successful,
            "failed_response": response_failed,
        }
    )


@pytest.mark.skipif(
    skip_openai,
    reason=reason,
)
def test_generate_criteria():
    criteria = generate_criteria(task=task, llm_config={"config_list": aoai_config_list})
    assert criteria
    assert len(criteria) > 0
    assert criteria[0].description
    assert criteria[0].name
    assert criteria[0].accepted_values


@pytest.mark.skipif(
    skip_openai,
    reason=reason,
)
def test_quantify_criteria():
    criteria_file = "test/test_files/agenteval-in-out/samples/sample_math_criteria.json"
    criteria = open(criteria_file, "r").read()
    criteria = Criterion.parse_json_str(criteria)

    test_case = open("test/test_files/agenteval-in-out/samples/sample_test_case.json", "r").read()
    test_case, ground_truth = remove_ground_truth(test_case)

    quantified = quantify_criteria(
        llm_config={"config_list": aoai_config_list},
        criteria=criteria,
        task=task,
        test_case=test_case,
        ground_truth=ground_truth,
    )
    assert quantified
    assert quantified["actual_success"]
    assert quantified["estimated_performance"]
test/agentchat/contrib/agent_eval/test_criterion.py (new file, 59 lines)
@@ -0,0 +1,59 @@
#!/usr/bin/env python3 -m pytest

from autogen.agentchat.contrib.agent_eval.criterion import Criterion


def test_parse_json_str():
    criteria_file = "test/test_files/agenteval-in-out/samples/sample_math_criteria.json"
    criteria = open(criteria_file, "r").read()
    criteria = Criterion.parse_json_str(criteria)
    assert criteria
    assert len(criteria) == 6
    assert criteria[0].name == "Problem Interpretation"
    assert criteria[0].description == "Ability to correctly interpret the problem."
    assert len(criteria[0].accepted_values) == 5


def test_write_json():
    criteria1 = Criterion(name="test1", description="test1 description", accepted_values=["test1", "test2"])
    criteria2 = Criterion(name="test2", description="test2 description", accepted_values=["test1", "test2"])
    output = Criterion.write_json([criteria1, criteria2])
    assert (
        output
        == """[
    {
        "name": "test1",
        "description": "test1 description",
        "accepted_values": [
            "test1",
            "test2"
        ],
        "sub_criteria": []
    },
    {
        "name": "test2",
        "description": "test2 description",
        "accepted_values": [
            "test1",
            "test2"
        ],
        "sub_criteria": []
    }
]"""
    )


def test_write_parse_compatibility():
    criterion1 = Criterion(name="test1", description="test1 description", accepted_values=["test1", "test2"])
    criterion2 = Criterion(name="test2", description="test2 description", accepted_values=["test1", "test2"])

    output = Criterion.write_json([criterion1, criterion2])
    criteria = Criterion.parse_json_str(output)
    assert criteria
    assert len(criteria) == 2
    assert criteria[0].name == "test1"
    assert criteria[0].description == "test1 description"
    assert len(criteria[0].accepted_values) == 2
    assert criteria[1].name == "test2"
    assert criteria[1].description == "test2 description"
    assert len(criteria[1].accepted_values) == 2
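The round-trip above pins down the shape of the Criterion model fairly tightly. As a reading aid, here is a minimal sketch consistent with those assertions, assuming pydantic v2 (the commit message mentions "converting to pydantic objects"); the real definition in autogen/agentchat/contrib/agent_eval/criterion.py may differ in defaults, validators, or extra fields.

# Minimal sketch of a Criterion model consistent with the tests above.
# Assumes pydantic v2; the actual class may carry extra fields or validators.
import json
from typing import List

from pydantic import BaseModel


class Criterion(BaseModel):
    name: str
    description: str
    accepted_values: List[str]
    sub_criteria: List["Criterion"] = []

    @staticmethod
    def parse_json_str(criteria: str) -> List["Criterion"]:
        # One Criterion per element of the top-level JSON array; nested
        # sub_criteria dicts are validated recursively by pydantic.
        return [Criterion(**item) for item in json.loads(criteria)]

    @staticmethod
    def write_json(criteria: List["Criterion"]) -> str:
        # indent=4 reproduces the exact layout asserted in test_write_json.
        return json.dumps([c.model_dump() for c in criteria], indent=4)


# Resolve the self-referencing "Criterion" annotation.
Criterion.model_rebuild()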
test/agentchat/contrib/agent_eval/test_task.py (new file, 22 lines)
@@ -0,0 +1,22 @@
#!/usr/bin/env python3 -m pytest

from autogen.agentchat.contrib.agent_eval.task import Task


def test_parse_json_str():
    task = Task(
        **{
            "name": "Math problem solving",
            "description": "Given any question, the system needs to solve the problem as concisely and accurately as possible",
            "successful_response": '{"message": "The answer is 5", "is_correct": True}',
            "failed_response": '{"message": "I don\'t know the answer", "is_correct": False}',
        }
    )
    assert task
    assert task.name == "Math problem solving"
    assert (
        task.description
        == "Given any question, the system needs to solve the problem as concisely and accurately as possible"
    )
    assert task.successful_response == '{"message": "The answer is 5", "is_correct": True}'
    assert task.failed_response == '{"message": "I don\'t know the answer", "is_correct": False}'
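Likewise, the assertions above imply a Task model along these lines; a sketch under the same pydantic v2 assumption, with the real definition living in autogen/agentchat/contrib/agent_eval/task.py and possibly carrying additional fields or helpers.

# Sketch of the Task model implied by the test above (assumed pydantic v2);
# the real class may define additional fields or helper methods.
from pydantic import BaseModel


class Task(BaseModel):
    name: str
    description: str
    successful_response: str
    failed_response: str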