raise error when msg is invalid; fix docstr; improve ResponsiveAgent; update doc and packaging; capture ipython output; find code blocks with llm when regex fails. (#1154)

* autogen.agent -> autogen.agentchat * bug fix in portfolio * notebook * timeout * timeout * infer lang; close #1150 * timeout * message context * context handling * add sender to generate_reply * clean up the receive function * move mathchat to contrib * contrib * last_message * Add OptiGuide: agent and notebook * Optiguide notebook: add figures and URL 1. figures and code points to remote URL 2. simplify the prompt for the interpreter, because all information is already in the chat history. * Update name: Agent -> GenericAgent * Update notebook * Rename: GenericAgent -> ResponsiveAgent * Rebase to autogen.agentchat * OptiGuide: Comment, style, and notebook updates * simplify optiguide * raise error when msg is invalid; fix docstr * allow return None for generate_reply() * update_system_message * test update_system_message * simplify optiguide * simplify optiguide * simplify optiguide * simplify optiguide * move test * add test and fix bug * doc update * doc update * doc update * color * optiguide * prompt * test danger case * packaging * docker * remove path in traceback * capture ipython output * simplify * find code blocks with llm * find code with llm * order * order * fix bug in context handling * print executing msg * print executing msg * test find code * test find code * disable find_code * default_auto_reply * default auto reply * remove optiguide * remove -e --------- Co-authored-by: Beibin Li <beibin79@gmail.com>
2026-04-20 03:02:16 -04:00 · 2023-07-31 19:22:30 -07:00
parent da92238ffe
commit c48babd02f
39 changed files with 1225 additions and 866 deletions
--- a/test/autogen/agentchat/extensions/init.py
+++ b/test/autogen/agentchat/extensions/init.py
--- a/test/autogen/agentchat/extensions/tsp.py
+++ b/test/autogen/agentchat/extensions/tsp.py
--- a/test/autogen/agentchat/extensions/tsp_api.py
+++ b/test/autogen/agentchat/extensions/tsp_api.py
--- a/test/autogen/agentchat/test_assistant_agent.py
+++ b/test/autogen/agentchat/test_assistant_agent.py
@@ -1,10 +1,10 @@
 import os
 import sys
 import pytest
-from flaml import oai
+from flaml import autogen
 from flaml.autogen.agentchat import AssistantAgent, UserProxyAgent

-KEY_LOC = "test/autogen"
+KEY_LOC = "notebook"
 OAI_CONFIG_LIST = "OAI_CONFIG_LIST"
 here = os.path.abspath(os.path.dirname(__file__))

@@ -20,16 +20,16 @@ def test_ai_user_proxy_agent():
        return

    conversations = {}
-    oai.ChatCompletion.start_logging(conversations)
+    autogen.ChatCompletion.start_logging(conversations)

-    config_list = oai.config_list_from_json(
+    config_list = autogen.config_list_from_json(
        OAI_CONFIG_LIST,
        file_location=KEY_LOC,
    )
    assistant = AssistantAgent(
        "assistant",
        system_message="You are a helpful assistant.",
-        oai_config={
+        llm_config={
            "request_timeout": 600,
            "seed": 42,
            "config_list": config_list,
@@ -41,7 +41,7 @@ def test_ai_user_proxy_agent():
        human_input_mode="NEVER",
        max_consecutive_auto_reply=2,
        code_execution_config=False,
-        oai_config={
+        llm_config={
            "config_list": config_list,
        },
        # In the system message the "user" always refers to ther other agent.
@@ -62,7 +62,7 @@ def test_gpt35(human_input_mode="NEVER", max_consecutive_auto_reply=5):
        import openai
    except ImportError:
        return
-    config_list = oai.config_list_from_json(
+    config_list = autogen.config_list_from_json(
        OAI_CONFIG_LIST,
        file_location=KEY_LOC,
        filter_dict={
@@ -75,14 +75,14 @@ def test_gpt35(human_input_mode="NEVER", max_consecutive_auto_reply=5):
            },
        },
    )
+    llm_config = {
+        "seed": 42,
+        "config_list": config_list,
+        "max_tokens": 1024,
+    }
    assistant = AssistantAgent(
        "coding_agent",
-        oai_config={
-            # "request_timeout": 600,
-            "seed": 42,
-            "config_list": config_list,
-            "max_tokens": 1024,
-        },
+        llm_config=llm_config,
    )
    user = UserProxyAgent(
        "user",
@@ -94,6 +94,8 @@ def test_gpt35(human_input_mode="NEVER", max_consecutive_auto_reply=5):
            "use_docker": "python:3",
            "timeout": 60,
        },
+        llm_config=llm_config,
+        system_message="""Reply TERMINATE to end the conversation.""",
    )
    user.initiate_chat(assistant, message="TERMINATE")
    # should terminate without sending any message
@@ -115,16 +117,17 @@ def test_create_execute_script(human_input_mode="NEVER", max_consecutive_auto_re
    except ImportError:
        return

-    config_list = oai.config_list_from_json(OAI_CONFIG_LIST, file_location=KEY_LOC)
+    config_list = autogen.config_list_from_json(OAI_CONFIG_LIST, file_location=KEY_LOC)
    conversations = {}
-    oai.ChatCompletion.start_logging(conversations)
+    autogen.ChatCompletion.start_logging(conversations)
+    llm_config = {
+        "request_timeout": 600,
+        "seed": 42,
+        "config_list": config_list,
+    }
    assistant = AssistantAgent(
        "assistant",
-        oai_config={
-            "request_timeout": 600,
-            "seed": 42,
-            "config_list": config_list,
-        },
+        llm_config=llm_config,
    )
    user = UserProxyAgent(
        "user",
@@ -145,10 +148,10 @@ print('Hello world!')
 ```""",
    )
    print(conversations)
-    oai.ChatCompletion.start_logging(compact=False)
+    autogen.ChatCompletion.start_logging(compact=False)
    user.send("""Execute temp.py""", assistant)
-    print(oai.ChatCompletion.logged_history)
-    oai.ChatCompletion.stop_logging()
+    print(autogen.ChatCompletion.logged_history)
+    autogen.ChatCompletion.stop_logging()


 def test_tsp(human_input_mode="NEVER", max_consecutive_auto_reply=10):
@@ -157,7 +160,7 @@ def test_tsp(human_input_mode="NEVER", max_consecutive_auto_reply=10):
    except ImportError:
        return

-    config_list = oai.config_list_from_json(
+    config_list = autogen.config_list_from_json(
        OAI_CONFIG_LIST,
        file_location=KEY_LOC,
        filter_dict={
@@ -179,19 +182,17 @@ def test_tsp(human_input_mode="NEVER", max_consecutive_auto_reply=10):
        def generate_init_message(self, question) -> str:
            return self._prompt.format(question=question)

-    oai.ChatCompletion.start_logging()
-    assistant = AssistantAgent("assistant", oai_config={"temperature": 0, "config_list": config_list})
+    autogen.ChatCompletion.start_logging()
+    assistant = AssistantAgent("assistant", llm_config={"temperature": 0, "config_list": config_list})
    user = TSPUserProxyAgent(
        "user",
        code_execution_config={"work_dir": here},
        human_input_mode=human_input_mode,
        max_consecutive_auto_reply=max_consecutive_auto_reply,
    )
-    # agent.receive(prompt.format(question=hard_questions[0]), user)
-    # agent.receive(prompt.format(question=hard_questions[1]), user)
    user.initiate_chat(assistant, question=hard_questions[2])
-    print(oai.ChatCompletion.logged_history)
-    oai.ChatCompletion.stop_logging()
+    print(autogen.ChatCompletion.logged_history)
+    autogen.ChatCompletion.stop_logging()


 if __name__ == "__main__":
--- a/test/autogen/agentchat/test_math_user_proxy_agent.py
+++ b/test/autogen/agentchat/test_math_user_proxy_agent.py
@@ -1,14 +1,12 @@
-from flaml import oai
+import pytest
+import sys
+from flaml import autogen
 from flaml.autogen.agentchat.contrib.math_user_proxy_agent import (
    MathUserProxyAgent,
    _remove_print,
    _add_print_to_last_line,
 )
-import pytest
-import sys
-
-KEY_LOC = "test/autogen"
-OAI_CONFIG_LIST = "OAI_CONFIG_LIST"
+from test_assistant_agent import KEY_LOC, OAI_CONFIG_LIST


@pytest.mark.skipif(
@@ -24,9 +22,9 @@ def test_math_user_proxy_agent():
    from flaml.autogen.agentchat.assistant_agent import AssistantAgent

    conversations = {}
-    oai.ChatCompletion.start_logging(conversations)
+    autogen.ChatCompletion.start_logging(conversations)

-    config_list = oai.config_list_from_json(
+    config_list = autogen.config_list_from_json(
        OAI_CONFIG_LIST,
        file_location=KEY_LOC,
        filter_dict={
@@ -36,7 +34,7 @@ def test_math_user_proxy_agent():
    assistant = AssistantAgent(
        "assistant",
        system_message="You are a helpful assistant.",
-        oai_config={
+        llm_config={
            "request_timeout": 600,
            "seed": 42,
            "config_list": config_list,
--- a/test/autogen/agentchat/test_responsive_agent.py
+++ b/test/autogen/agentchat/test_responsive_agent.py
@@ -13,16 +13,20 @@ def test_responsive_agent(monkeypatch):
    monkeypatch.setattr(sys, "stdin", StringIO("TERMINATE\n\n"))
    dummy_agent_1.receive(
        {
-            "content": "hello",
+            "content": "hello {name}",
+            "context": {
+                "name": "dummy_agent_2",
+            },
        },
        dummy_agent_2,
    )  # receive a dict
-
+    assert "context" in dummy_agent_1.chat_messages["dummy_agent_2"][-2]
    # receive dict without openai fields to be printed, such as "content", 'function_call'. There should be no error raised.
-    pre_len = len(dummy_agent_1.oai_conversations["dummy_agent_2"])
-    dummy_agent_1.receive({"message": "hello"}, dummy_agent_2)
+    pre_len = len(dummy_agent_1.chat_messages["dummy_agent_2"])
+    with pytest.raises(ValueError):
+        dummy_agent_1.receive({"message": "hello"}, dummy_agent_2)
    assert pre_len == len(
-        dummy_agent_1.oai_conversations["dummy_agent_2"]
+        dummy_agent_1.chat_messages["dummy_agent_2"]
    ), "When the message is not an valid openai message, it should not be appended to the oai conversation."

    monkeypatch.setattr(sys, "stdin", StringIO("exit"))
@@ -36,14 +40,18 @@ def test_responsive_agent(monkeypatch):
    )  # send a dict

    # send dict with no openai fields
-    pre_len = len(dummy_agent_1.oai_conversations["dummy_agent_2"])
+    pre_len = len(dummy_agent_1.chat_messages["dummy_agent_2"])
    with pytest.raises(ValueError):
        dummy_agent_1.send({"message": "hello"}, dummy_agent_2)

    assert pre_len == len(
-        dummy_agent_1.oai_conversations["dummy_agent_2"]
+        dummy_agent_1.chat_messages["dummy_agent_2"]
    ), "When the message is not a valid openai message, it should not be appended to the oai conversation."

+    # update system message
+    dummy_agent_1.update_system_message("new system message")
+    assert dummy_agent_1._oai_system_message[0]["content"] == "new system message"
+

 if __name__ == "__main__":
    test_responsive_agent(pytest.monkeypatch)
--- a/test/autogen/agentchat/tsp_prompt.txt
+++ b/test/autogen/agentchat/tsp_prompt.txt
--- a/test/autogen/oai/test_completion.py
+++ b/test/autogen/oai/test_completion.py
@@ -5,30 +5,28 @@ import pytest
 from functools import partial
 import os
 import json
-from flaml import oai
+from flaml import autogen
 from flaml.autogen.code_utils import (
    eval_function_completions,
    generate_assertions,
    implement,
    generate_code,
-    improve_function,
-    improve_code,
 )
 from flaml.autogen.math_utils import eval_math_responses, solve_problem

-KEY_LOC = "test/autogen"
+KEY_LOC = "notebook"
 OAI_CONFIG_LIST = "OAI_CONFIG_LIST"
 here = os.path.abspath(os.path.dirname(__file__))


 def yes_or_no_filter(context, response, **_):
    return context.get("yes_or_no_choice", False) is False or any(
-        text in ["Yes.", "No."] for text in oai.Completion.extract_text(response)
+        text in ["Yes.", "No."] for text in autogen.Completion.extract_text(response)
    )


 def valid_json_filter(response, **_):
-    for text in oai.Completion.extract_text(response):
+    for text in autogen.Completion.extract_text(response):
        try:
            json.loads(text)
            return True
@@ -43,47 +41,47 @@ def test_filter():
    except ImportError as exc:
        print(exc)
        return
-    response = oai.Completion.create(
+    response = autogen.Completion.create(
        context={"yes_or_no_choice": True},
        config_list=[{"model": "text-ada-001"}, {"model": "gpt-3.5-turbo"}, {"model": "text-davinci-003"}],
        prompt="Is 37 a prime number? Please answer 'Yes.' or 'No.'",
        filter_func=yes_or_no_filter,
    )
    assert (
-        oai.Completion.extract_text(response)[0] in ["Yes.", "No."]
+        autogen.Completion.extract_text(response)[0] in ["Yes.", "No."]
        or not response["pass_filter"]
        and response["config_id"] == 2
    )
-    response = oai.Completion.create(
+    response = autogen.Completion.create(
        context={"yes_or_no_choice": False},
        config_list=[{"model": "text-ada-001"}, {"model": "gpt-3.5-turbo"}, {"model": "text-davinci-003"}],
        prompt="Is 37 a prime number?",
        filter_func=yes_or_no_filter,
    )
    assert response["model"] == "text-ada-001"
-    response = oai.Completion.create(
+    response = autogen.Completion.create(
        config_list=[{"model": "text-ada-001"}, {"model": "gpt-3.5-turbo"}, {"model": "text-davinci-003"}],
        prompt="How to construct a json request to Bing API to search for 'latest AI news'? Return the JSON request.",
        filter_func=valid_json_filter,
    )
    assert response["config_id"] == 2 or response["pass_filter"], "the response must pass filter unless all fail"
-    assert not response["pass_filter"] or json.loads(oai.Completion.extract_text(response)[0])
+    assert not response["pass_filter"] or json.loads(autogen.Completion.extract_text(response)[0])


 def test_chatcompletion():
-    params = oai.ChatCompletion._construct_params(
+    params = autogen.ChatCompletion._construct_params(
        context=None,
        config={"model": "unknown"},
        prompt="hi",
    )
    assert "messages" in params
-    params = oai.Completion._construct_params(
+    params = autogen.Completion._construct_params(
        context=None,
        config={"model": "unknown"},
        prompt="hi",
    )
    assert "messages" not in params
-    params = oai.Completion._construct_params(
+    params = autogen.Completion._construct_params(
        context=None,
        config={"model": "gpt-4"},
        prompt="hi",
@@ -97,46 +95,13 @@ def test_multi_model():
    except ImportError as exc:
        print(exc)
        return
-    response = oai.Completion.create(
-        config_list=oai.config_list_gpt4_gpt35(KEY_LOC),
+    response = autogen.Completion.create(
+        config_list=autogen.config_list_gpt4_gpt35(KEY_LOC),
        prompt="Hi",
    )
    print(response)


-def test_improve():
-    try:
-        import openai
-        import diskcache
-    except ImportError as exc:
-        print(exc)
-        return
-    config_list = oai.config_list_openai_aoai(KEY_LOC)
-    improved, _ = improve_function(
-        "flaml/autogen/math_utils.py",
-        "solve_problem",
-        "Solve math problems accurately, by avoiding calculation errors and reduce reasoning errors.",
-        config_list=config_list,
-    )
-    with open(f"{here}/math_utils.py.improved", "w") as f:
-        f.write(improved)
-    suggestion, _ = improve_code(
-        ["flaml/autogen/code_utils.py", "flaml/autogen/math_utils.py"],
-        "leverage generative AI smartly and cost-effectively",
-        config_list=config_list,
-    )
-    print(suggestion)
-    improvement, cost = improve_code(
-        ["flaml/autogen/code_utils.py", "flaml/autogen/math_utils.py"],
-        "leverage generative AI smartly and cost-effectively",
-        suggest_only=False,
-        config_list=config_list,
-    )
-    print(cost)
-    with open(f"{here}/suggested_improvement.txt", "w") as f:
-        f.write(improvement)
-
-
 def test_nocontext():
    try:
        import openai
@@ -144,12 +109,12 @@ def test_nocontext():
    except ImportError as exc:
        print(exc)
        return
-    response = oai.Completion.create(
+    response = autogen.Completion.create(
        model="text-ada-001", prompt="1+1=", max_tokens=1, use_cache=False, request_timeout=10
    )
    print(response)
    code, _ = generate_code(
-        config_list=oai.config_list_from_json(
+        config_list=autogen.config_list_from_json(
            OAI_CONFIG_LIST,
            file_location=KEY_LOC,
            filter_dict={
@@ -175,7 +140,7 @@ def test_nocontext():
    )
    print(code)

-    solution, cost = solve_problem("1+1=", config_list=oai.config_list_gpt4_gpt35(KEY_LOC))
+    solution, cost = solve_problem("1+1=", config_list=autogen.config_list_gpt4_gpt35(KEY_LOC))
    print(solution, cost)


@@ -184,7 +149,7 @@ def test_nocontext():
    reason="do not run on windows",
 )
 def test_humaneval(num_samples=1):
-    gpt35_config_list = oai.config_list_from_json(
+    gpt35_config_list = autogen.config_list_from_json(
        env_or_file="OAI_CONFIG_LIST",
        filter_dict={
            "model": {
@@ -221,17 +186,17 @@ def test_humaneval(num_samples=1):
        }
        for x in range(n_tune_data, len(data))
    ]
-    oai.Completion.clear_cache(cache_path_root="{here}/cache")
-    oai.Completion.set_cache(seed)
+    autogen.Completion.clear_cache(cache_path_root="{here}/cache")
+    autogen.Completion.set_cache(seed)
    try:
        import openai
        import diskcache
    except ImportError as exc:
        print(exc)
        return
-    oai.Completion.clear_cache(400)
+    autogen.Completion.clear_cache(400)
    # no error should be raised
-    response = oai.Completion.create(
+    response = autogen.Completion.create(
        context=test_data[0],
        config_list=[{"model": "gpt-3.5-turbo"}],
        prompt="",
@@ -241,7 +206,7 @@ def test_humaneval(num_samples=1):
    )
    # assert response == -1
    # a minimal tuning example
-    config, _ = oai.Completion.tune(
+    config, _ = autogen.Completion.tune(
        data=tune_data,
        metric="success",
        mode="max",
@@ -249,9 +214,9 @@ def test_humaneval(num_samples=1):
        n=1,
        prompt="{definition}",
    )
-    response = oai.Completion.create(context=test_data[0], **config)
+    response = autogen.Completion.create(context=test_data[0], **config)
    # a minimal tuning example for tuning chat completion models using the Completion class
-    config, _ = oai.Completion.tune(
+    config, _ = autogen.Completion.tune(
        data=tune_data,
        metric="succeed_assertions",
        mode="max",
@@ -260,10 +225,10 @@ def test_humaneval(num_samples=1):
        model="text-davinci-003",
        prompt="{definition}",
    )
-    response = oai.Completion.create(context=test_data[0], **config)
+    response = autogen.Completion.create(context=test_data[0], **config)
    # a minimal tuning example for tuning chat completion models using the ChatCompletion class
-    config_list = oai.config_list_openai_aoai(KEY_LOC)
-    config, _ = oai.ChatCompletion.tune(
+    config_list = autogen.config_list_openai_aoai(KEY_LOC)
+    config, _ = autogen.ChatCompletion.tune(
        data=tune_data,
        metric="expected_success",
        mode="max",
@@ -272,7 +237,7 @@ def test_humaneval(num_samples=1):
        messages=[{"role": "user", "content": "{definition}"}],
        config_list=config_list,
    )
-    response = oai.ChatCompletion.create(context=test_data[0], config_list=config_list, **config)
+    response = autogen.ChatCompletion.create(context=test_data[0], config_list=config_list, **config)
    print(response)
    from openai.error import RateLimitError

@@ -289,7 +254,7 @@ def test_humaneval(num_samples=1):
    assert selected == 0
    print(eval_function_completions([code], **tune_data[1]))
    # a more comprehensive tuning example
-    config2, analysis = oai.Completion.tune(
+    config2, analysis = autogen.Completion.tune(
        data=tune_data,
        metric="success",
        mode="max",
@@ -310,12 +275,12 @@ def test_humaneval(num_samples=1):
    print(config2)
    print(analysis.best_result)
    print(test_data[0])
-    response = oai.Completion.create(context=test_data[0], **config2)
+    response = autogen.Completion.create(context=test_data[0], **config2)
    print(response)
-    oai.Completion.data = test_data[:num_samples]
-    result = oai.Completion._eval(analysis.best_config, prune=False, eval_only=True)
+    autogen.Completion.data = test_data[:num_samples]
+    result = autogen.Completion._eval(analysis.best_config, prune=False, eval_only=True)
    print("result without pruning", result)
-    result = oai.Completion.test(test_data[:num_samples], **config2)
+    result = autogen.Completion.test(test_data[:num_samples], **config2)
    print(result)
    try:
        code, cost, selected = implement(
@@ -376,7 +341,7 @@ def test_math(num_samples=-1):
        % data["problem"]
    ]

-    oai.Completion.set_cache(seed)
+    autogen.Completion.set_cache(seed)
    vanilla_config = {
        "model": "text-davinci-003",
        "temperature": 1,
@@ -386,8 +351,8 @@ def test_math(num_samples=-1):
        "stop": "###",
    }
    test_data_sample = test_data[0:3]
-    result = oai.Completion.test(test_data_sample, eval_math_responses, **vanilla_config)
-    result = oai.Completion.test(
+    result = autogen.Completion.test(test_data_sample, eval_math_responses, **vanilla_config)
+    result = autogen.Completion.test(
        test_data_sample,
        eval_math_responses,
        agg_method="median",
@@ -400,13 +365,13 @@ def test_math(num_samples=-1):
    def my_average(results):
        return np.mean(results)

-    result = oai.Completion.test(
+    result = autogen.Completion.test(
        test_data_sample,
        eval_math_responses,
        agg_method=my_median,
        **vanilla_config,
    )
-    result = oai.Completion.test(
+    result = autogen.Completion.test(
        test_data_sample,
        eval_math_responses,
        agg_method={
@@ -420,7 +385,7 @@ def test_math(num_samples=-1):

    print(result)

-    config, _ = oai.Completion.tune(
+    config, _ = autogen.Completion.tune(
        data=tune_data,  # the data for tuning
        metric="expected_success",  # the metric to optimize
        mode="max",  # the optimization mode
@@ -433,7 +398,7 @@ def test_math(num_samples=-1):
        stop="###",  # the stop sequence
    )
    print("tuned config", config)
-    result = oai.Completion.test(test_data_sample, config_list=oai.config_list_openai_aoai(KEY_LOC), **config)
+    result = autogen.Completion.test(test_data_sample, config_list=autogen.config_list_openai_aoai(KEY_LOC), **config)
    print("result from tuned config:", result)
    print("empty responses", eval_math_responses([], None))

@@ -441,7 +406,7 @@ def test_math(num_samples=-1):
 if __name__ == "__main__":
    import openai

-    config_list = oai.config_list_openai_aoai(KEY_LOC)
+    config_list = autogen.config_list_openai_aoai(KEY_LOC)
    assert len(config_list) >= 3, config_list
    openai.api_key = os.environ["OPENAI_API_KEY"]

--- a/test/autogen/oai/test_utils.py
+++ b/test/autogen/oai/test_utils.py
@@ -1,22 +1,21 @@
 import json
 import os
-from flaml import oai
-
-KEY_LOC = "test/autogen"
+from flaml import autogen
+from test_completion import KEY_LOC, OAI_CONFIG_LIST


 def test_config_list_from_json():
-    config_list = oai.config_list_gpt4_gpt35(key_file_path=KEY_LOC)
+    config_list = autogen.config_list_gpt4_gpt35(key_file_path=KEY_LOC)
    json_file = os.path.join(KEY_LOC, "config_list_test.json")
    with open(json_file, "w") as f:
        json.dump(config_list, f, indent=4)
-    config_list_1 = oai.config_list_from_json(json_file)
+    config_list_1 = autogen.config_list_from_json(json_file)
    assert config_list == config_list_1
    os.environ["config_list_test"] = json.dumps(config_list)
-    config_list_2 = oai.config_list_from_json("config_list_test")
+    config_list_2 = autogen.config_list_from_json("config_list_test")
    assert config_list == config_list_2
-    config_list_3 = oai.config_list_from_json(
-        "OAI_CONFIG_LIST", file_location=KEY_LOC, filter_dict={"model": ["gpt4", "gpt-4-32k"]}
+    config_list_3 = autogen.config_list_from_json(
+        OAI_CONFIG_LIST, file_location=KEY_LOC, filter_dict={"model": ["gpt4", "gpt-4-32k"]}
    )
    assert all(config.get("model") in ["gpt4", "gpt-4-32k"] for config in config_list_3)
    del os.environ["config_list_test"]
@@ -24,7 +23,7 @@ def test_config_list_from_json():


 def test_config_list_openai_aoai():
-    config_list = oai.config_list_openai_aoai(key_file_path=KEY_LOC)
+    config_list = autogen.config_list_openai_aoai(key_file_path=KEY_LOC)
    assert all(config.get("api_type") in [None, "open_ai", "azure"] for config in config_list)


--- a/test/autogen/test_code.py
+++ b/test/autogen/test_code.py
@@ -1,11 +1,151 @@
 import sys
 import os
 import pytest
-from flaml.autogen.code_utils import UNKNOWN, extract_code, execute_code, infer_lang
+from flaml import autogen
+from flaml.autogen.code_utils import (
+    UNKNOWN,
+    extract_code,
+    execute_code,
+    infer_lang,
+    improve_code,
+    improve_function,
+)

+KEY_LOC = "notebook"
+OAI_CONFIG_LIST = "OAI_CONFIG_LIST"
 here = os.path.abspath(os.path.dirname(__file__))


+# def test_find_code():
+#     try:
+#         import openai
+#     except ImportError:
+#         return
+#     # need gpt-4 for this task
+#     config_list = autogen.config_list_from_json(
+#         OAI_CONFIG_LIST,
+#         file_location=KEY_LOC,
+#         filter_dict={
+#             "model": ["gpt-4", "gpt4", "gpt-4-32k", "gpt-4-32k-0314"],
+#         },
+#     )
+#     # config_list = autogen.config_list_from_json(
+#     #     OAI_CONFIG_LIST,
+#     #     file_location=KEY_LOC,
+#     #     filter_dict={
+#     #         "model": {
+#     #             "gpt-3.5-turbo",
+#     #             "gpt-3.5-turbo-16k",
+#     #             "gpt-3.5-turbo-0301",
+#     #             "chatgpt-35-turbo-0301",
+#     #             "gpt-35-turbo-v0301",
+#     #         },
+#     #     },
+#     # )
+#     seed = 42
+#     messages = [
+#         {
+#             "role": "user",
+#             "content": "Print hello world to a file called hello.txt",
+#         },
+#         {
+#             "role": "user",
+#             "content": """
+# # filename: write_hello.py
+# ```
+# with open('hello.txt', 'w') as f:
+#     f.write('Hello, World!')
+# print('Hello, World! printed to hello.txt')
+# ```
+# Please execute the above Python code to print "Hello, World!" to a file called hello.txt and print the success message.
+# """,
+#         },
+#     ]
+#     codeblocks, _ = find_code(messages, seed=seed, config_list=config_list)
+#     assert codeblocks[0][0] == "python", codeblocks
+#     messages += [
+#         {
+#             "role": "user",
+#             "content": """
+# exitcode: 0 (execution succeeded)
+# Code output:
+# Hello, World! printed to hello.txt
+# """,
+#         },
+#         {
+#             "role": "assistant",
+#             "content": "Great! Can I help you with anything else?",
+#         },
+#     ]
+#     codeblocks, content = find_code(messages, seed=seed, config_list=config_list)
+#     assert codeblocks[0][0] == "unknown", content
+#     messages += [
+#         {
+#             "role": "user",
+#             "content": "Save a pandas df with 3 rows and 3 columns to disk.",
+#         },
+#         {
+#             "role": "assistant",
+#             "content": """
+# ```
+# # filename: save_df.py
+# import pandas as pd
+
+# df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
+# df.to_csv('df.csv')
+# print('df saved to df.csv')
+# ```
+# Please execute the above Python code to save a pandas df with 3 rows and 3 columns to disk.
+# Before you run the code above, run
+# ```
+# pip install pandas
+# ```
+# first to install pandas.
+# """,
+#         },
+#     ]
+#     codeblocks, content = find_code(messages, seed=seed, config_list=config_list)
+#     assert (
+#         len(codeblocks) == 2
+#         and (codeblocks[0][0] == "sh"
+#         and codeblocks[1][0] == "python"
+#         or codeblocks[0][0] == "python"
+#         and codeblocks[1][0] == "sh")
+#     ), content
+
+#     messages += [
+#         {
+#             "role": "user",
+#             "content": "The code is unsafe to execute in my environment.",
+#         },
+#         {
+#             "role": "assistant",
+#             "content": "please run python write_hello.py",
+#         },
+#     ]
+#     # codeblocks, content = find_code(messages, config_list=config_list)
+#     # assert codeblocks[0][0] != "unknown", content
+#     # I'm sorry, but I cannot execute code from earlier messages. Please provide the code again if you would like me to execute it.
+
+#     messages[-1]["content"] = "please skip pip install pandas if you already have pandas installed"
+#     codeblocks, content = find_code(messages, seed=seed, config_list=config_list)
+#     assert codeblocks[0][0] != "sh", content
+
+#     messages += [
+#         {
+#             "role": "user",
+#             "content": "The code is still unsafe to execute in my environment.",
+#         },
+#         {
+#             "role": "assistant",
+#             "content": "Let me try something else. Do you have docker installed?",
+#         },
+#     ]
+#     codeblocks, content = find_code(messages, seed=seed, config_list=config_list)
+#     assert codeblocks[0][0] == "unknown", content
+#     print(content)
+
+
 def test_infer_lang():
    assert infer_lang("print('hello world')") == "python"
    assert infer_lang("pip install flaml") == "sh"
@@ -59,12 +199,16 @@ def test_execute_code():
        import docker
    except ImportError as exc:
        print(exc)
-        return
-    exitcode, msg, image = execute_code("print('hello world')", filename="tmp/codetest.py")
-    assert exitcode == 0 and msg == b"hello world\n", msg
+        docker = None
+    exit_code, msg, image = execute_code("print('hello world')", filename="tmp/codetest.py")
+    assert exit_code == 0 and msg == "hello world\n", msg
    # read a file
    print(execute_code("with open('tmp/codetest.py', 'r') as f: a=f.read()"))
    # create a file
+    exit_code, msg, image = execute_code(
+        "with open('tmp/codetest.py', 'w') as f: f.write('b=1')", work_dir=f"{here}/my_tmp", filename="tmp2/codetest.py"
+    )
+    assert exit_code and 'File "tmp2/codetest.py"' in msg, msg
    print(execute_code("with open('tmp/codetest.py', 'w') as f: f.write('b=1')", work_dir=f"{here}/my_tmp"))
    # execute code in a file
    print(execute_code(filename="tmp/codetest.py"))
@@ -72,20 +216,53 @@ def test_execute_code():
    # execute code for assertion error
    exit_code, msg, image = execute_code("assert 1==2")
    assert exit_code, msg
+    assert 'File ""' in msg
    # execute code which takes a long time
    exit_code, error, image = execute_code("import time; time.sleep(2)", timeout=1)
-    assert exit_code and error.decode() == "Timeout"
-    assert isinstance(image, str)
+    assert exit_code and error == "Timeout"
+    assert isinstance(image, str) or docker is None or os.path.exists("/.dockerenv")


 def test_execute_code_no_docker():
    exit_code, error, image = execute_code("import time; time.sleep(2)", timeout=1, use_docker=False)
    if sys.platform != "win32":
-        assert exit_code and error.decode() == "Timeout"
+        assert exit_code and error == "Timeout"
    assert image is None


+def test_improve():
+    try:
+        import openai
+    except ImportError:
+        return
+    config_list = autogen.config_list_openai_aoai(KEY_LOC)
+    improved, _ = improve_function(
+        "flaml/autogen/math_utils.py",
+        "solve_problem",
+        "Solve math problems accurately, by avoiding calculation errors and reduce reasoning errors.",
+        config_list=config_list,
+    )
+    with open(f"{here}/math_utils.py.improved", "w") as f:
+        f.write(improved)
+    suggestion, _ = improve_code(
+        ["flaml/autogen/code_utils.py", "flaml/autogen/math_utils.py"],
+        "leverage generative AI smartly and cost-effectively",
+        config_list=config_list,
+    )
+    print(suggestion)
+    improvement, cost = improve_code(
+        ["flaml/autogen/code_utils.py", "flaml/autogen/math_utils.py"],
+        "leverage generative AI smartly and cost-effectively",
+        suggest_only=False,
+        config_list=config_list,
+    )
+    print(cost)
+    with open(f"{here}/suggested_improvement.txt", "w") as f:
+        f.write(improvement)
+
+
 if __name__ == "__main__":
    # test_infer_lang()
    # test_extract_code()
    test_execute_code()
+    # test_find_code()
--- a/test/autogen/test_function_call.py
+++ b/test/autogen/test_function_call.py
@@ -4,15 +4,14 @@ except ImportError:
    openai = None
 import pytest
 import json
-from flaml import oai
+from flaml import autogen
 from flaml.autogen.math_utils import eval_math_responses
-
-KEY_LOC = "test/autogen"
+from test_code import KEY_LOC


@pytest.mark.skipif(openai is None, reason="openai not installed")
 def test_eval_math_responses():
-    config_list = oai.config_list_from_models(
+    config_list = autogen.config_list_from_models(
        KEY_LOC, exclude="aoai", model_list=["gpt-4-0613", "gpt-3.5-turbo-0613", "gpt-3.5-turbo-16k"]
    )
    functions = [
@@ -36,7 +35,7 @@ def test_eval_math_responses():
            },
        },
    ]
-    response = oai.ChatCompletion.create(
+    response = autogen.ChatCompletion.create(
        config_list=config_list,
        messages=[
            {
@@ -47,7 +46,7 @@ def test_eval_math_responses():
        functions=functions,
    )
    print(response)
-    responses = oai.ChatCompletion.extract_text_or_function_call(response)
+    responses = autogen.ChatCompletion.extract_text_or_function_call(response)
    print(responses[0])
    function_call = responses[0]["function_call"]
    name, arguments = function_call["name"], json.loads(function_call["arguments"])