AgentEval integration (#2672)

* first pass at offline agent eval integration

* Integrating AgentEval for offline scenarios

* removing old changes

* fixing notebook, updating docs

* fixing subcriteria bug

* updating class comment

* cleaning up agent constructors

* moving AgentEval agents to separate folder and adding a brief README

* fixing build breaks

* fixing formatting break

* fixing comments

* consolidating files in the agenteval folder under contrib and cleaning up imports

* fixing import ordering

* adding basic agenteval tests and fixing criteria parsing bug

* first try at adding openai agenteval tests to build process

* adding non-openai agenteval tests to build process

* updating test settings

* updating openai test

* Update test/agentchat/contrib/agent_eval/test_agent_eval.py

Co-authored-by: Wael Karkoub <wael.karkoub96@gmail.com>

* Update .github/workflows/contrib-openai.yml

Co-authored-by: Wael Karkoub <wael.karkoub96@gmail.com>

* test commit

* updating typing and converting to pydantic objects

* fixing test file

---------

Co-authored-by: Beibin Li <BeibinLi@users.noreply.github.com>
Co-authored-by: Chi Wang <wang.chi@microsoft.com>
Co-authored-by: Wael Karkoub <wael.karkoub96@gmail.com>
Authored by: James Woffinden-Luey
Date: 2024-05-14 00:14:37 -07:00
Committed by: GitHub
Parent: 4b747d731a
Commit: dad9c66104
14 changed files with 2777 additions and 3038 deletions
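
For orientation, the files below add tests for the new offline AgentEval entry points: generate_criteria derives a list of Criterion objects from a Task, and quantify_criteria scores a single test case against those criteria. The following is a minimal usage sketch mirroring the test setup; the config file name, the literal response strings, and the ground-truth value are illustrative assumptions, not part of this commit.

import autogen
from autogen.agentchat.contrib.agent_eval.agent_eval import generate_criteria, quantify_criteria
from autogen.agentchat.contrib.agent_eval.task import Task

# Assumption: an OAI_CONFIG_LIST file is available, as in the tests below.
llm_config = {"config_list": autogen.config_list_from_json("OAI_CONFIG_LIST")}

task = Task(
    name="Math problem solving",
    description="Given any question, the system needs to solve the problem as concisely and accurately as possible",
    successful_response='{"message": "The answer is 5"}',  # illustrative sample
    failed_response='{"message": "I don\'t know the answer"}',  # illustrative sample
)

criteria = generate_criteria(task=task, llm_config=llm_config)  # LLM proposes evaluation criteria
result = quantify_criteria(
    llm_config=llm_config,
    criteria=criteria,
    task=task,
    test_case='{"message": "The answer is 5"}',  # the response being evaluated
    ground_truth="true",  # illustrative; kept separate from the test case
)
print(result["estimated_performance"])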


@@ -0,0 +1,100 @@
#!/usr/bin/env python3 -m pytest
import json

import pytest
from conftest import reason, skip_openai  # noqa: E402

import autogen
from autogen.agentchat.contrib.agent_eval.agent_eval import generate_criteria, quantify_criteria
from autogen.agentchat.contrib.agent_eval.criterion import Criterion
from autogen.agentchat.contrib.agent_eval.task import Task

KEY_LOC = "notebook"
OAI_CONFIG_LIST = "OAI_CONFIG_LIST"


def remove_ground_truth(test_case: str):
    test_details = json.loads(test_case)
    # need to remove the ground truth from the test details
    correctness = test_details.pop("is_correct", None)
    test_details.pop("correct_ans", None)
    test_details.pop("check_result", None)
    return str(test_details), correctness


if not skip_openai:
    openai_config_list = autogen.config_list_from_json(
        OAI_CONFIG_LIST,
        file_location=KEY_LOC,
        # Restrict to recent OpenAI chat models (newer versions are also supported).
        # https://platform.openai.com/docs/models/overview
        filter_dict={
            "api_type": ["openai"],
            "model": [
                "gpt-4-turbo",
                "gpt-4-turbo-preview",
                "gpt-4-0125-preview",
                "gpt-4-1106-preview",
                "gpt-3.5-turbo",
                "gpt-3.5-turbo-0125",
                "gpt-3.5-turbo-1106",
            ],
        },
    )

    aoai_config_list = autogen.config_list_from_json(
        OAI_CONFIG_LIST,
        file_location=KEY_LOC,
        filter_dict={"api_type": ["azure"]},
    )

    success_str = open("test/test_files/agenteval-in-out/samples/sample_math_response_successful.txt", "r").read()
    response_successful = remove_ground_truth(success_str)[0]
    failed_str = open("test/test_files/agenteval-in-out/samples/sample_math_response_failed.txt", "r").read()
    response_failed = remove_ground_truth(failed_str)[0]

    task = Task(
        **{
            "name": "Math problem solving",
            "description": "Given any question, the system needs to solve the problem as concisely and accurately as possible",
            "successful_response": response_successful,
            "failed_response": response_failed,
        }
    )


@pytest.mark.skipif(
    skip_openai,
    reason=reason,
)
def test_generate_criteria():
    criteria = generate_criteria(task=task, llm_config={"config_list": aoai_config_list})
    assert criteria
    assert len(criteria) > 0
    assert criteria[0].description
    assert criteria[0].name
    assert criteria[0].accepted_values


@pytest.mark.skipif(
    skip_openai,
    reason=reason,
)
def test_quantify_criteria():
    criteria_file = "test/test_files/agenteval-in-out/samples/sample_math_criteria.json"
    criteria = open(criteria_file, "r").read()
    criteria = Criterion.parse_json_str(criteria)

    test_case = open("test/test_files/agenteval-in-out/samples/sample_test_case.json", "r").read()
    test_case, ground_truth = remove_ground_truth(test_case)

    quantified = quantify_criteria(
        llm_config={"config_list": aoai_config_list},
        criteria=criteria,
        task=task,
        test_case=test_case,
        ground_truth=ground_truth,
    )
    assert quantified
    assert quantified["actual_success"]
    assert quantified["estimated_performance"]
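
The sample files read above are JSON strings that carry the ground truth alongside the model response; remove_ground_truth strips the is_correct, correct_ans, and check_result fields so the evaluator never sees the answer. A hypothetical example of the expected shape (field names other than the three stripped keys are made up):

sample = {
    "problem": "What is 2 + 3?",           # hypothetical test-case fields
    "response_with_ans": "The answer is 5",
    "is_correct": True,                     # ground truth, stripped before evaluation
    "correct_ans": "5",                     # ground truth, stripped before evaluation
    "check_result": True,                   # ground truth, stripped before evaluation
}
# remove_ground_truth(json.dumps(sample)) -> (stringified details without the three fields, True)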


@@ -0,0 +1,59 @@
#!/usr/bin/env python3 -m pytest
from autogen.agentchat.contrib.agent_eval.criterion import Criterion


def test_parse_json_str():
    criteria_file = "test/test_files/agenteval-in-out/samples/sample_math_criteria.json"
    criteria = open(criteria_file, "r").read()
    criteria = Criterion.parse_json_str(criteria)
    assert criteria
    assert len(criteria) == 6
    assert criteria[0].name == "Problem Interpretation"
    assert criteria[0].description == "Ability to correctly interpret the problem."
    assert len(criteria[0].accepted_values) == 5


def test_write_json():
    criteria1 = Criterion(name="test1", description="test1 description", accepted_values=["test1", "test2"])
    criteria2 = Criterion(name="test2", description="test2 description", accepted_values=["test1", "test2"])
    output = Criterion.write_json([criteria1, criteria2])
    assert (
        output
        == """[
    {
        "name": "test1",
        "description": "test1 description",
        "accepted_values": [
            "test1",
            "test2"
        ],
        "sub_criteria": []
    },
    {
        "name": "test2",
        "description": "test2 description",
        "accepted_values": [
            "test1",
            "test2"
        ],
        "sub_criteria": []
    }
]"""
    )


def test_write_parse_compatibility():
    criterion1 = Criterion(name="test1", description="test1 description", accepted_values=["test1", "test2"])
    criterion2 = Criterion(name="test2", description="test2 description", accepted_values=["test1", "test2"])
    output = Criterion.write_json([criterion1, criterion2])
    criteria = Criterion.parse_json_str(output)
    assert criteria
    assert len(criteria) == 2
    assert criteria[0].name == "test1"
    assert criteria[0].description == "test1 description"
    assert len(criteria[0].accepted_values) == 2
    assert criteria[1].name == "test2"
    assert criteria[1].description == "test2 description"
    assert len(criteria[1].accepted_values) == 2
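
The sub_criteria field serialized above supports nesting as well (one of the commit messages notes a sub-criteria parsing fix). A short sketch, under the assumption that sub_criteria accepts a list of Criterion objects:

parent = Criterion(
    name="accuracy",
    description="Overall correctness of the response",
    accepted_values=["poor", "fair", "good"],
    sub_criteria=[
        Criterion(name="arithmetic", description="Correctness of the arithmetic steps", accepted_values=["yes", "no"]),
    ],
)
# write_json / parse_json_str should round-trip nested criteria as well
round_tripped = Criterion.parse_json_str(Criterion.write_json([parent]))
assert round_tripped[0].sub_criteria[0].name == "arithmetic"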


@@ -0,0 +1,22 @@
#!/usr/bin/env python3 -m pytest
from autogen.agentchat.contrib.agent_eval.task import Task


def test_parse_json_str():
    task = Task(
        **{
            "name": "Math problem solving",
            "description": "Given any question, the system needs to solve the problem as concisely and accurately as possible",
            "successful_response": '{"message": "The answer is 5", "is_correct": True}',
            "failed_response": '{"message": "I don\'t know the answer", "is_correct": False}',
        }
    )
    assert task
    assert task.name == "Math problem solving"
    assert (
        task.description
        == "Given any question, the system needs to solve the problem as concisely and accurately as possible"
    )
    assert task.successful_response == '{"message": "The answer is 5", "is_correct": True}'
    assert task.failed_response == '{"message": "I don\'t know the answer", "is_correct": False}'