AgentEval integration (#2672)

* first pass at offline agent eval integration

* Integrating AgentEval for offline scenarios

* removing old changes

* fixing notebook, updating docs

* fixing subcriteria bug

* updating class comment

* cleaning up agent constructors

* moving AgentEval agents to separate folder and adding a brief README

* fixing build breaks

* fixing formatting break

* fixing comments

* consolidating files in the agenteval folder under contrib and cleaning up imports

* fixing import ordering

* adding basic agenteval tests and fixing criteria parsing bug

* first try at adding openai agenteval tests to build process

* adding non-openai agenteval tests to build process

* updating test settings

* updating openai test

* Update test/agentchat/contrib/agent_eval/test_agent_eval.py

Co-authored-by: Wael Karkoub <wael.karkoub96@gmail.com>

* Update .github/workflows/contrib-openai.yml

Co-authored-by: Wael Karkoub <wael.karkoub96@gmail.com>

* test commit

* updating typing and converting to pydantic objects

* fixing test file

---------

Co-authored-by: Beibin Li <BeibinLi@users.noreply.github.com>
Co-authored-by: Chi Wang <wang.chi@microsoft.com>
Co-authored-by: Wael Karkoub <wael.karkoub96@gmail.com>
This commit is contained in:
James Woffinden-Luey
2024-05-14 00:14:37 -07:00
committed by GitHub
parent 4b747d731a
commit dad9c66104
14 changed files with 2777 additions and 3038 deletions

View File

@@ -1,26 +1,26 @@
{
"Problem Interpretation": {
"description": "Ability to correctly interpret the problem.",
"accepted_values": ["completely off", "slightly relevant", "relevant", "mostly accurate", "completely accurate"]
},
"Mathematical Methodology": {
"description": "Adequacy of the chosen mathematical or algorithmic methodology for the question",
"accepted_values": ["inappropriate", "barely adequate", "adequate", "mostly effective", "completely effective"]
},
"Calculation Correctness": {
"description": "Accuracy of calculations made and solutions given",
"accepted_values": ["completely incorrect", "mostly incorrect", "neither", "mostly correct", "completely correct"]
},
"Explanation Clarity": {
"description": "Clarity and comprehensibility of explanations, including language use and structure",
"accepted_values": ["not at all clear", "slightly clear", "moderately clear", "very clear", "completely clear"]
},
"Code Efficiency": {
"description": "Quality of code in terms of efficiency and elegance",
"accepted_values": ["not at all efficient", "slightly efficient", "moderately efficient", "very efficient", "extremely efficient"]
},
"Code Correctness": {
"description": "Correctness of the provided code",
"accepted_values": ["completely incorrect", "mostly incorrect", "partly correct", "mostly correct", "completely correct"]
}
[
{ "name": "Problem Interpretation",
"description": "Ability to correctly interpret the problem",
"accepted_values": ["completely off", "slightly relevant", "relevant", "mostly accurate", "completely accurate"]
},
{ "name": "Mathematical Methodology",
"description": "Adequacy of the chosen mathematical or algorithmic methodology for the question",
"accepted_values": ["inappropriate", "barely adequate", "adequate", "mostly effective", "completely effective"]
},
{ "name": "Calculation Correctness",
"description": "Accuracy of calculations made and solutions given",
"accepted_values": ["completely incorrect", "mostly incorrect", "neither", "mostly correct", "completely correct"]
},
{ "name": "Explanation Clarity",
"description": "Clarity and comprehensibility of explanations, including language use and structure",
"accepted_values": ["not at all clear", "slightly clear", "moderately clear", "very clear", "completely clear"]
},
{ "name": "Code Efficiency",
"description": "Quality of code in terms of efficiency and elegance",
"accepted_values": ["not at all efficient", "slightly efficient", "moderately efficient", "very efficient", "extremely efficient"]
},
{ "name": "Code Correctness",
"description": "Correctness of the provided code",
"accepted_values": ["completely incorrect", "mostly incorrect", "partly correct", "mostly correct", "completely correct"]
}
]