Agenteval integration (#2672)

* first pass at offline agent eval integration
* Integrating AgentEval for offline scenarios
* removing old changes
* fixing notebook, updating docs
* fixing subcriteria bug
* updating class comment
* cleaning up agent constructors
* moving AgentEval agents to separate folder and adding a brief README
* fixing build breaks
* fixing formatting break
* fixing comments
* consolidating files in the agenteval folder under contrib and cleaning up imports
* fixing import ordering
* adding basic agenteval tests and fixing criteria parsing bug
* first try at adding openai agenteval tests to build process
* adding non-openai agenteval tests to build process
* updating test settings
* updating openai test
* Update test/agentchat/contrib/agent_eval/test_agent_eval.py Co-authored-by: Wael Karkoub <wael.karkoub96@gmail.com>
* Update .github/workflows/contrib-openai.yml Co-authored-by: Wael Karkoub <wael.karkoub96@gmail.com>
* test commit
* updating typing and converting to pydantic objects
* fixing test file

---------

Co-authored-by: Beibin Li <BeibinLi@users.noreply.github.com>
Co-authored-by: Chi Wang <wang.chi@microsoft.com>
Co-authored-by: Wael Karkoub <wael.karkoub96@gmail.com>
2026-04-20 03:02:16 -04:00 · 2024-05-14 00:14:37 -07:00
parent 4b747d731a
commit dad9c66104
14 changed files with 2777 additions and 3038 deletions
--- a/test/test_files/agenteval-in-out/samples/sample_math_criteria.json
+++ b/test/test_files/agenteval-in-out/samples/sample_math_criteria.json
@@ -1,26 +1,26 @@
-{
-    "Problem Interpretation": {
-      "description": "Ability to correctly interpret the problem.",
-      "accepted_values": ["completely off", "slightly relevant", "relevant", "mostly accurate", "completely accurate"]
-    },
-    "Mathematical Methodology": {
-      "description": "Adequacy of the chosen mathematical or algorithmic methodology for the question",
-      "accepted_values": ["inappropriate", "barely adequate", "adequate", "mostly effective", "completely effective"]
-    },
-    "Calculation Correctness": {
-      "description": "Accuracy of calculations made and solutions given",
-      "accepted_values": ["completely incorrect", "mostly incorrect", "neither", "mostly correct", "completely correct"]
-    },
-    "Explanation Clarity": {
-      "description": "Clarity and comprehensibility of explanations, including language use and structure",
-      "accepted_values": ["not at all clear", "slightly clear", "moderately clear", "very clear", "completely clear"]
-    },
-    "Code Efficiency": {
-      "description": "Quality of code in terms of  efficiency and elegance",
-      "accepted_values": ["not at all efficient", "slightly efficient", "moderately efficient", "very efficient", "extremely efficient"]
-    },
-    "Code Correctness": {
-      "description": "Correctness of the provided code",
-      "accepted_values": ["completely incorrect", "mostly incorrect", "partly correct", "mostly correct", "completely correct"]
-    }
+[
+  { "name": "Problem Interpretation",
+    "description": "Ability to correctly interpret the problem.",
+    "accepted_values": ["completely off", "slightly relevant", "relevant", "mostly accurate", "completely accurate"]
+  },
+  { "name": "Mathematical Methodology",
+    "description": "Adequacy of the chosen mathematical or algorithmic methodology for the question",
+    "accepted_values": ["inappropriate", "barely adequate", "adequate", "mostly effective", "completely effective"]
+  },
+  { "name": "Calculation Correctness",
+    "description": "Accuracy of calculations made and solutions given",
+    "accepted_values": ["completely incorrect", "mostly incorrect", "neither", "mostly correct", "completely correct"]
+  },
+  { "name": "Explanation Clarity",
+    "description": "Clarity and comprehensibility of explanations, including language use and structure",
+    "accepted_values": ["not at all clear", "slightly clear", "moderately clear", "very clear", "completely clear"]
+  },
+  { "name": "Code Efficiency",
+    "description": "Quality of code in terms of efficiency and elegance",
+    "accepted_values": ["not at all efficient", "slightly efficient", "moderately efficient", "very efficient", "extremely efficient"]
+  },
+  { "name": "Code Correctness",
+    "description": "Correctness of the provided code",
+    "accepted_values": ["completely incorrect", "mostly incorrect", "partly correct", "mostly correct", "completely correct"]
  }
+]