# home-llm/tests/llama_conversation/test_agent.py
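"""Tests for the llama_conversation backend agents.

Each backend (llama.cpp, Ollama, text-generation-webui, generic OpenAI) gets a
fixture that patches out model loading and HTTP access, plus a test that runs
full conversation turns through async_process and asserts on the exact
parameters sent to the backend.
"""
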
import json
import logging
import pytest
import jinja2
from unittest.mock import patch, MagicMock, PropertyMock, AsyncMock, ANY
from custom_components.llama_conversation.backends.llamacpp import LlamaCppAgent
from custom_components.llama_conversation.backends.ollama import OllamaAPIAgent
from custom_components.llama_conversation.backends.tailored_openai import TextGenerationWebuiAgent
from custom_components.llama_conversation.backends.generic_openai import GenericOpenAIAPIAgent
from custom_components.llama_conversation.const import (
    CONF_CHAT_MODEL,
    CONF_MAX_TOKENS,
    CONF_PROMPT,
    CONF_TEMPERATURE,
    CONF_TOP_K,
    CONF_TOP_P,
    CONF_MIN_P,
    CONF_TYPICAL_P,
    CONF_REQUEST_TIMEOUT,
    CONF_BACKEND_TYPE,
    CONF_DOWNLOADED_MODEL_FILE,
    CONF_EXTRA_ATTRIBUTES_TO_EXPOSE,
    CONF_PROMPT_TEMPLATE,
    CONF_ENABLE_FLASH_ATTENTION,
    CONF_USE_GBNF_GRAMMAR,
    CONF_GBNF_GRAMMAR_FILE,
    CONF_USE_IN_CONTEXT_LEARNING_EXAMPLES,
    CONF_IN_CONTEXT_EXAMPLES_FILE,
    CONF_NUM_IN_CONTEXT_EXAMPLES,
    CONF_TEXT_GEN_WEBUI_PRESET,
    CONF_OPENAI_API_KEY,
    CONF_TEXT_GEN_WEBUI_ADMIN_KEY,
    CONF_REFRESH_SYSTEM_PROMPT,
    CONF_REMEMBER_CONVERSATION,
    CONF_REMEMBER_NUM_INTERACTIONS,
    CONF_PROMPT_CACHING_ENABLED,
    CONF_PROMPT_CACHING_INTERVAL,
    CONF_SERVICE_CALL_REGEX,
    CONF_REMOTE_USE_CHAT_ENDPOINT,
    CONF_TEXT_GEN_WEBUI_CHAT_MODE,
    CONF_OLLAMA_KEEP_ALIVE_MIN,
    CONF_OLLAMA_JSON_MODE,
    CONF_CONTEXT_LENGTH,
    CONF_BATCH_SIZE,
    CONF_THREAD_COUNT,
    CONF_BATCH_THREAD_COUNT,
    DEFAULT_CHAT_MODEL,
    DEFAULT_MAX_TOKENS,
    DEFAULT_PROMPT_BASE,
    DEFAULT_TEMPERATURE,
    DEFAULT_TOP_K,
    DEFAULT_TOP_P,
    DEFAULT_MIN_P,
    DEFAULT_TYPICAL_P,
    DEFAULT_BACKEND_TYPE,
    DEFAULT_REQUEST_TIMEOUT,
    DEFAULT_EXTRA_ATTRIBUTES_TO_EXPOSE,
    DEFAULT_PROMPT_TEMPLATE,
    DEFAULT_ENABLE_FLASH_ATTENTION,
    DEFAULT_USE_GBNF_GRAMMAR,
    DEFAULT_GBNF_GRAMMAR_FILE,
    DEFAULT_USE_IN_CONTEXT_LEARNING_EXAMPLES,
    DEFAULT_IN_CONTEXT_EXAMPLES_FILE,
    DEFAULT_NUM_IN_CONTEXT_EXAMPLES,
    DEFAULT_REFRESH_SYSTEM_PROMPT,
    DEFAULT_REMEMBER_CONVERSATION,
    DEFAULT_REMEMBER_NUM_INTERACTIONS,
    DEFAULT_PROMPT_CACHING_ENABLED,
    DEFAULT_PROMPT_CACHING_INTERVAL,
    DEFAULT_SERVICE_CALL_REGEX,
    DEFAULT_REMOTE_USE_CHAT_ENDPOINT,
    DEFAULT_TEXT_GEN_WEBUI_CHAT_MODE,
    DEFAULT_OLLAMA_KEEP_ALIVE_MIN,
    DEFAULT_OLLAMA_JSON_MODE,
    DEFAULT_CONTEXT_LENGTH,
    DEFAULT_BATCH_SIZE,
    DEFAULT_THREAD_COUNT,
    DEFAULT_BATCH_THREAD_COUNT,
    TEXT_GEN_WEBUI_CHAT_MODE_CHAT,
    TEXT_GEN_WEBUI_CHAT_MODE_INSTRUCT,
    TEXT_GEN_WEBUI_CHAT_MODE_CHAT_INSTRUCT,
    DOMAIN,
    PROMPT_TEMPLATE_DESCRIPTIONS,
    DEFAULT_OPTIONS,
)
from homeassistant.components.conversation import ConversationInput
from homeassistant.const import (
    CONF_HOST,
    CONF_PORT,
    CONF_SSL,
    CONF_LLM_HASS_API,
)
from homeassistant.helpers.llm import LLM_API_ASSIST, APIInstance

_LOGGER = logging.getLogger(__name__)


class WarnDict(dict):
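    """dict that logs a warning when a requested key is missing, to surface unset config values in tests."""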
    def get(self, _key, _default=None):
        if _key in self:
            return self[_key]
        _LOGGER.warning(f"attempting to get unset dictionary key {_key}")
        return _default


class MockConfigEntry:
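    """Minimal stand-in for a Home Assistant ConfigEntry, backed by warn-on-miss dicts."""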
    def __init__(self, entry_id='test_entry_id', data=None, options=None):
        self.entry_id = entry_id
        self.data = WarnDict(data or {})
        # copy into a fresh mutable dict so tests can tweak options in place
        self.options = WarnDict(dict(options or {}))


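# Config entry shared by all backend fixtures; tests mutate .options in place
# to exercise option changes between conversation turns.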
@pytest.fixture
def config_entry():
    yield MockConfigEntry(
        data={
            CONF_CHAT_MODEL: DEFAULT_CHAT_MODEL,
            CONF_BACKEND_TYPE: DEFAULT_BACKEND_TYPE,
            CONF_DOWNLOADED_MODEL_FILE: "/config/models/some-model.q4_k_m.gguf",
            CONF_HOST: "localhost",
            CONF_PORT: "5000",
            CONF_SSL: False,
            CONF_OPENAI_API_KEY: "OpenAI-API-Key",
            CONF_TEXT_GEN_WEBUI_ADMIN_KEY: "Text-Gen-Webui-Admin-Key"
        },
        options={
            **DEFAULT_OPTIONS,
            CONF_LLM_HASS_API: LLM_API_ASSIST,
            CONF_PROMPT: DEFAULT_PROMPT_BASE,
            CONF_SERVICE_CALL_REGEX: r"({[\S \t]*})"
        }
    )

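# llama.cpp backend fixture: patches importlib so no real llama-cpp-python
# install (or GGUF model file) is needed, and stubs generate/detokenize to
# return a fixed response containing a HassTurnOn tool call.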
@pytest.fixture
def local_llama_agent_fixture(config_entry, hass, enable_custom_integrations):
    with patch.object(LlamaCppAgent, '_load_icl_examples') as load_icl_examples_mock, \
            patch.object(LlamaCppAgent, '_load_grammar') as load_grammar_mock, \
            patch.object(LlamaCppAgent, 'entry', new_callable=PropertyMock) as entry_mock, \
            patch.object(LlamaCppAgent, '_async_get_exposed_entities') as get_exposed_entities_mock, \
            patch.object(APIInstance, 'async_call_tool') as call_tool_mock, \
            patch('homeassistant.helpers.template.Template') as template_mock, \
            patch('custom_components.llama_conversation.backends.llamacpp.importlib.import_module') as import_module_mock, \
            patch('custom_components.llama_conversation.utils.importlib.import_module') as import_module_mock_2, \
            patch('custom_components.llama_conversation.utils.install_llama_cpp_python') as install_llama_cpp_python_mock:
        entry_mock.return_value = config_entry

        # fake llama_cpp module whose Llama() constructor returns a mock instance
        llama_instance_mock = MagicMock()
        llama_class_mock = MagicMock()
        llama_class_mock.return_value = llama_instance_mock
        import_module_mock.return_value = MagicMock(Llama=llama_class_mock)
        import_module_mock_2.return_value = MagicMock(Llama=llama_class_mock)
        install_llama_cpp_python_mock.return_value = True

        get_exposed_entities_mock.return_value = (
            {
                "light.kitchen_light": { "state": "on" },
                "light.office_lamp": { "state": "on" },
                "switch.downstairs_hallway": { "state": "off" },
                "fan.bedroom": { "state": "on" },
            },
            ["light", "switch", "fan"]
        )

        # template_mock.side_effect = lambda template, _: jinja2.Template(template)
        generate_mock = llama_instance_mock.generate
        generate_mock.return_value = list(range(20))

        # the detokenized output carries both the speech and a HassTurnOn tool call
        detokenize_mock = llama_instance_mock.detokenize
        detokenize_mock.return_value = ("I am saying something!\n" + json.dumps({
            "name": "HassTurnOn",
            "arguments": {
                "name": "light.kitchen_light"
            }
        })).encode()
        call_tool_mock.return_value = {"result": "success"}

        agent_obj = LlamaCppAgent(
            hass,
            config_entry
        )

        all_mocks = {
            "llama_class": llama_class_mock,
            "tokenize": llama_instance_mock.tokenize,
            "generate": generate_mock,
        }
        yield agent_obj, all_mocks

# TODO: test base llama agent (ICL loading other languages)
async def test_local_llama_agent(local_llama_agent_fixture):
    local_llama_agent: LlamaCppAgent
    all_mocks: dict[str, MagicMock]
    local_llama_agent, all_mocks = local_llama_agent_fixture

    # invoke the conversation agent
    conversation_id = "test-conversation"
    result = await local_llama_agent.async_process(ConversationInput(
        "turn on the kitchen lights", MagicMock(), conversation_id, None, "en", agent_id="test-agent"
    ))

    # assert on results + check side effects
    assert result.response.speech['plain']['speech'] == "I am saying something!"
    all_mocks["llama_class"].assert_called_once_with(
        model_path=local_llama_agent.entry.data.get(CONF_DOWNLOADED_MODEL_FILE),
        n_ctx=local_llama_agent.entry.options.get(CONF_CONTEXT_LENGTH),
        n_batch=local_llama_agent.entry.options.get(CONF_BATCH_SIZE),
        n_threads=local_llama_agent.entry.options.get(CONF_THREAD_COUNT),
        n_threads_batch=local_llama_agent.entry.options.get(CONF_BATCH_THREAD_COUNT),
        flash_attn=local_llama_agent.entry.options.get(CONF_ENABLE_FLASH_ATTENTION)
    )
    all_mocks["tokenize"].assert_called_once()
    all_mocks["generate"].assert_called_once_with(
        ANY,
        temp=local_llama_agent.entry.options.get(CONF_TEMPERATURE),
        top_k=local_llama_agent.entry.options.get(CONF_TOP_K),
        top_p=local_llama_agent.entry.options.get(CONF_TOP_P),
        typical_p=local_llama_agent.entry.options[CONF_TYPICAL_P],
        min_p=local_llama_agent.entry.options[CONF_MIN_P],
        grammar=ANY,
    )

    # reset mock stats
    for mock in all_mocks.values():
        mock.reset_mock()

    # change options then apply them
    local_llama_agent.entry.options[CONF_CONTEXT_LENGTH] = 1024
    local_llama_agent.entry.options[CONF_BATCH_SIZE] = 1024
    local_llama_agent.entry.options[CONF_THREAD_COUNT] = 24
    local_llama_agent.entry.options[CONF_BATCH_THREAD_COUNT] = 24
    local_llama_agent.entry.options[CONF_TEMPERATURE] = 2.0
    local_llama_agent.entry.options[CONF_ENABLE_FLASH_ATTENTION] = True
    local_llama_agent.entry.options[CONF_TOP_K] = 20
    local_llama_agent.entry.options[CONF_TOP_P] = 0.9
    local_llama_agent.entry.options[CONF_MIN_P] = 0.2
    local_llama_agent.entry.options[CONF_TYPICAL_P] = 0.95
    local_llama_agent._update_options()

    all_mocks["llama_class"].assert_called_once_with(
        model_path=local_llama_agent.entry.data.get(CONF_DOWNLOADED_MODEL_FILE),
        n_ctx=local_llama_agent.entry.options.get(CONF_CONTEXT_LENGTH),
        n_batch=local_llama_agent.entry.options.get(CONF_BATCH_SIZE),
        n_threads=local_llama_agent.entry.options.get(CONF_THREAD_COUNT),
        n_threads_batch=local_llama_agent.entry.options.get(CONF_BATCH_THREAD_COUNT),
        flash_attn=local_llama_agent.entry.options.get(CONF_ENABLE_FLASH_ATTENTION)
    )

    # do another turn of the same conversation
    result = await local_llama_agent.async_process(ConversationInput(
        "turn off the office lights", MagicMock(), conversation_id, None, "en", agent_id="test-agent"
    ))
    assert result.response.speech['plain']['speech'] == "I am saying something!"
    all_mocks["tokenize"].assert_called_once()
    all_mocks["generate"].assert_called_once_with(
        ANY,
        temp=local_llama_agent.entry.options.get(CONF_TEMPERATURE),
        top_k=local_llama_agent.entry.options.get(CONF_TOP_K),
        top_p=local_llama_agent.entry.options.get(CONF_TOP_P),
        typical_p=local_llama_agent.entry.options[CONF_TYPICAL_P],
        min_p=local_llama_agent.entry.options[CONF_MIN_P],
        grammar=ANY,
    )

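# Ollama backend fixture: patches the aiohttp client session; the mocked GET
# response lets the startup model check against /api/tags succeed.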
@pytest.fixture
def ollama_agent_fixture(config_entry, hass, enable_custom_integrations):
    with patch.object(OllamaAPIAgent, '_load_icl_examples') as load_icl_examples_mock, \
            patch.object(OllamaAPIAgent, 'entry', new_callable=PropertyMock) as entry_mock, \
            patch.object(OllamaAPIAgent, '_async_get_exposed_entities') as get_exposed_entities_mock, \
            patch.object(APIInstance, 'async_call_tool') as call_tool_mock, \
            patch('homeassistant.helpers.template.Template') as template_mock, \
            patch('custom_components.llama_conversation.backends.ollama.async_get_clientsession') as get_clientsession_mock:
        entry_mock.return_value = config_entry
        get_exposed_entities_mock.return_value = (
            {
                "light.kitchen_light": { "state": "on" },
                "light.office_lamp": { "state": "on" },
                "switch.downstairs_hallway": { "state": "off" },
                "fan.bedroom": { "state": "on" },
            },
            ["light", "switch", "fan"]
        )

        # satisfy the startup model check against /api/tags
        response_mock = MagicMock()
        response_mock.json.return_value = { "models": [{ "name": config_entry.data[CONF_CHAT_MODEL] }] }
        get_clientsession_mock.get.return_value = response_mock
        call_tool_mock.return_value = {"result": "success"}

        agent_obj = OllamaAPIAgent(
            hass,
            config_entry
        )

        all_mocks = {
            "requests_get": get_clientsession_mock.get,
            "requests_post": get_clientsession_mock.post
        }
        yield agent_obj, all_mocks

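# Exercises both Ollama endpoints: /api/generate (raw mode) on the first turn,
# then /api/chat with JSON mode after flipping the relevant options.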
async def test_ollama_agent(ollama_agent_fixture):
    ollama_agent: OllamaAPIAgent
    all_mocks: dict[str, MagicMock]
    ollama_agent, all_mocks = ollama_agent_fixture

    all_mocks["requests_get"].assert_called_once_with(
        "http://localhost:5000/api/tags",
        headers={ "Authorization": "Bearer OpenAI-API-Key" }
    )

    response_mock = MagicMock()
    response_mock.json.return_value = {
        "model": ollama_agent.entry.data[CONF_CHAT_MODEL],
        "created_at": "2023-11-09T21:07:55.186497Z",
        "response": "I am saying something!\n" + json.dumps({
            "name": "HassTurnOn",
            "arguments": {
                "name": "light.kitchen_light"
            }
        }),
        "done": True,
        "context": [1, 2, 3],
        "total_duration": 4648158584,
        "load_duration": 4071084,
        "prompt_eval_count": 36,
        "prompt_eval_duration": 439038000,
        "eval_count": 180,
        "eval_duration": 4196918000
    }
    all_mocks["requests_post"].return_value = response_mock

    # invoke the conversation agent
    conversation_id = "test-conversation"
    result = await ollama_agent.async_process(ConversationInput(
        "turn on the kitchen lights", MagicMock(), conversation_id, None, "en", agent_id="test-agent"
    ))

    # assert on results + check side effects
    assert result.response.speech['plain']['speech'] == "I am saying something!"
    all_mocks["requests_post"].assert_called_once_with(
        "http://localhost:5000/api/generate",
        headers={ "Authorization": "Bearer OpenAI-API-Key" },
        json={
            "model": ollama_agent.entry.data[CONF_CHAT_MODEL],
            "stream": False,
            "keep_alive": f"{ollama_agent.entry.options[CONF_OLLAMA_KEEP_ALIVE_MIN]}m",  # prevent ollama from unloading the model
            "options": {
                "num_ctx": ollama_agent.entry.options[CONF_CONTEXT_LENGTH],
                "top_p": ollama_agent.entry.options[CONF_TOP_P],
                "top_k": ollama_agent.entry.options[CONF_TOP_K],
                "typical_p": ollama_agent.entry.options[CONF_TYPICAL_P],
                "temperature": ollama_agent.entry.options[CONF_TEMPERATURE],
                "num_predict": ollama_agent.entry.options[CONF_MAX_TOKENS],
            },
            "prompt": ANY,
            "raw": True
        },
        timeout=ollama_agent.entry.options[CONF_REQUEST_TIMEOUT]
    )

    # reset mock stats
    for mock in all_mocks.values():
        mock.reset_mock()

    # change options
    ollama_agent.entry.options[CONF_CONTEXT_LENGTH] = 1024
    ollama_agent.entry.options[CONF_MAX_TOKENS] = 10
    ollama_agent.entry.options[CONF_REQUEST_TIMEOUT] = 60
    ollama_agent.entry.options[CONF_OLLAMA_KEEP_ALIVE_MIN] = 99
    ollama_agent.entry.options[CONF_REMOTE_USE_CHAT_ENDPOINT] = True
    ollama_agent.entry.options[CONF_OLLAMA_JSON_MODE] = True
    ollama_agent.entry.options[CONF_TEMPERATURE] = 2.0
    ollama_agent.entry.options[CONF_TOP_K] = 20
    ollama_agent.entry.options[CONF_TOP_P] = 0.9
    ollama_agent.entry.options[CONF_TYPICAL_P] = 0.5

    # do another turn of the same conversation
    result = await ollama_agent.async_process(ConversationInput(
        "turn off the office lights", MagicMock(), conversation_id, None, "en", agent_id="test-agent"
    ))
    assert result.response.speech['plain']['speech'] == "I am saying something!"
    all_mocks["requests_post"].assert_called_once_with(
        "http://localhost:5000/api/chat",
        headers={ "Authorization": "Bearer OpenAI-API-Key" },
        json={
            "model": ollama_agent.entry.data[CONF_CHAT_MODEL],
            "stream": False,
            "format": "json",
            "keep_alive": f"{ollama_agent.entry.options[CONF_OLLAMA_KEEP_ALIVE_MIN]}m",  # prevent ollama from unloading the model
            "options": {
                "num_ctx": ollama_agent.entry.options[CONF_CONTEXT_LENGTH],
                "top_p": ollama_agent.entry.options[CONF_TOP_P],
                "top_k": ollama_agent.entry.options[CONF_TOP_K],
                "typical_p": ollama_agent.entry.options[CONF_TYPICAL_P],
                "temperature": ollama_agent.entry.options[CONF_TEMPERATURE],
                "num_predict": ollama_agent.entry.options[CONF_MAX_TOKENS],
            },
            "messages": ANY
        },
        timeout=ollama_agent.entry.options[CONF_REQUEST_TIMEOUT]
    )

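# text-generation-webui backend fixture: the mocked GET response satisfies the
# /v1/internal/model/info check performed with the admin key at startup.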
@pytest.fixture
def text_generation_webui_agent_fixture(config_entry, hass, enable_custom_integrations):
    with patch.object(TextGenerationWebuiAgent, '_load_icl_examples') as load_icl_examples_mock, \
            patch.object(TextGenerationWebuiAgent, 'entry', new_callable=PropertyMock) as entry_mock, \
            patch.object(TextGenerationWebuiAgent, '_async_get_exposed_entities') as get_exposed_entities_mock, \
            patch.object(APIInstance, 'async_call_tool') as call_tool_mock, \
            patch('homeassistant.helpers.template.Template') as template_mock, \
            patch('custom_components.llama_conversation.backends.tailored_openai.async_get_clientsession') as get_clientsession_mock:
        entry_mock.return_value = config_entry
        get_exposed_entities_mock.return_value = (
            {
                "light.kitchen_light": { "state": "on" },
                "light.office_lamp": { "state": "on" },
                "switch.downstairs_hallway": { "state": "off" },
                "fan.bedroom": { "state": "on" },
            },
            ["light", "switch", "fan"]
        )
        response_mock = MagicMock()
        response_mock.json.return_value = { "model_name": config_entry.data[CONF_CHAT_MODEL] }
        get_clientsession_mock.get.return_value = response_mock
        call_tool_mock.return_value = {"result": "success"}
        agent_obj = TextGenerationWebuiAgent(
            hass,
            config_entry
        )
        all_mocks = {
            "requests_get": get_clientsession_mock.get,
            "requests_post": get_clientsession_mock.post
        }
        yield agent_obj, all_mocks

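# Covers the completions endpoint with and without a preset, then the chat
# endpoint once CONF_REMOTE_USE_CHAT_ENDPOINT is enabled.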
async def test_text_generation_webui_agent(text_generation_webui_agent_fixture):
    text_generation_webui_agent: TextGenerationWebuiAgent
    all_mocks: dict[str, MagicMock]
    text_generation_webui_agent, all_mocks = text_generation_webui_agent_fixture

    all_mocks["requests_get"].assert_called_once_with(
        "http://localhost:5000/v1/internal/model/info",
        headers={ "Authorization": "Bearer Text-Gen-Webui-Admin-Key" }
    )

    response_mock = MagicMock()
    response_mock.json.return_value = {
        "id": "cmpl-uqkvlQyYK7bGYrRHQ0eXlWi7",
        "object": "text_completion",
        "created": 1589478378,
        "model": "gpt-3.5-turbo-instruct",
        "system_fingerprint": "fp_44709d6fcb",
        "choices": [{
            "text": "I am saying something!\n" + json.dumps({
                "name": "HassTurnOn",
                "arguments": {
                    "name": "light.kitchen_light"
                }
            }),
            "index": 0,
            "logprobs": None,
            "finish_reason": "length"
        }],
        "usage": {
            "prompt_tokens": 5,
            "completion_tokens": 7,
            "total_tokens": 12
        }
    }
    all_mocks["requests_post"].return_value = response_mock

    # invoke the conversation agent
    conversation_id = "test-conversation"
    result = await text_generation_webui_agent.async_process(ConversationInput(
        "turn on the kitchen lights", MagicMock(), conversation_id, None, "en", agent_id="test-agent"
    ))

    # assert on results + check side effects
    assert result.response.speech['plain']['speech'] == "I am saying something!"
    all_mocks["requests_post"].assert_called_once_with(
        "http://localhost:5000/v1/completions",
        json={
            "model": text_generation_webui_agent.entry.data[CONF_CHAT_MODEL],
            "top_p": text_generation_webui_agent.entry.options[CONF_TOP_P],
            "top_k": text_generation_webui_agent.entry.options[CONF_TOP_K],
            "temperature": text_generation_webui_agent.entry.options[CONF_TEMPERATURE],
            "min_p": text_generation_webui_agent.entry.options[CONF_MIN_P],
            "typical_p": text_generation_webui_agent.entry.options[CONF_TYPICAL_P],
            "truncation_length": text_generation_webui_agent.entry.options[CONF_CONTEXT_LENGTH],
            "max_tokens": text_generation_webui_agent.entry.options[CONF_MAX_TOKENS],
            "prompt": ANY
        },
        headers={ "Authorization": "Bearer OpenAI-API-Key" },
        timeout=text_generation_webui_agent.entry.options[CONF_REQUEST_TIMEOUT]
    )

    # reset mock stats
    for mock in all_mocks.values():
        mock.reset_mock()

    text_generation_webui_agent.entry.options[CONF_TEXT_GEN_WEBUI_PRESET] = "Some Preset"

    # do another turn of the same conversation and use a preset
    result = await text_generation_webui_agent.async_process(ConversationInput(
        "turn off the office lights", MagicMock(), conversation_id, None, "en", agent_id="test-agent"
    ))
    assert result.response.speech['plain']['speech'] == "I am saying something!"
    all_mocks["requests_post"].assert_called_once_with(
        "http://localhost:5000/v1/completions",
        json={
            "model": text_generation_webui_agent.entry.data[CONF_CHAT_MODEL],
            "top_p": text_generation_webui_agent.entry.options[CONF_TOP_P],
            "top_k": text_generation_webui_agent.entry.options[CONF_TOP_K],
            "temperature": text_generation_webui_agent.entry.options[CONF_TEMPERATURE],
            "min_p": text_generation_webui_agent.entry.options[CONF_MIN_P],
            "typical_p": text_generation_webui_agent.entry.options[CONF_TYPICAL_P],
            "truncation_length": text_generation_webui_agent.entry.options[CONF_CONTEXT_LENGTH],
            "max_tokens": text_generation_webui_agent.entry.options[CONF_MAX_TOKENS],
            "preset": "Some Preset",
            "prompt": ANY
        },
        headers={ "Authorization": "Bearer OpenAI-API-Key" },
        timeout=text_generation_webui_agent.entry.options[CONF_REQUEST_TIMEOUT]
    )

    # change options
    text_generation_webui_agent.entry.options[CONF_MAX_TOKENS] = 10
    text_generation_webui_agent.entry.options[CONF_REQUEST_TIMEOUT] = 60
    text_generation_webui_agent.entry.options[CONF_REMOTE_USE_CHAT_ENDPOINT] = True
    text_generation_webui_agent.entry.options[CONF_TEMPERATURE] = 2.0
    text_generation_webui_agent.entry.options[CONF_TOP_P] = 0.9
    text_generation_webui_agent.entry.options[CONF_MIN_P] = 0.2
    text_generation_webui_agent.entry.options[CONF_TYPICAL_P] = 0.95
    text_generation_webui_agent.entry.options[CONF_TEXT_GEN_WEBUI_PRESET] = ""

    response_mock.json.return_value = {
        "id": "chatcmpl-123",
        # text-gen-webui has a typo: it returns 'chat.completions', not 'chat.completion'
        "object": "chat.completions",
        "created": 1677652288,
        "model": text_generation_webui_agent.entry.data[CONF_CHAT_MODEL],
        "system_fingerprint": "fp_44709d6fcb",
        "choices": [{
            "index": 0,
            "message": {
                "role": "assistant",
                "content": "I am saying something!\n" + json.dumps({
                    "name": "HassTurnOn",
                    "arguments": {
                        "name": "light.kitchen_light"
                    }
                }),
            },
            "logprobs": None,
            "finish_reason": "stop"
        }],
        "usage": {
            "prompt_tokens": 9,
            "completion_tokens": 12,
            "total_tokens": 21
        }
    }

    # reset mock stats
    for mock in all_mocks.values():
        mock.reset_mock()

    # do another turn of the same conversation, but via the chat endpoint
    result = await text_generation_webui_agent.async_process(ConversationInput(
        "turn off the office lights", MagicMock(), conversation_id, None, "en", agent_id="test-agent"
    ))
    assert result.response.speech['plain']['speech'] == "I am saying something!"
    all_mocks["requests_post"].assert_called_once_with(
        "http://localhost:5000/v1/chat/completions",
        json={
            "model": text_generation_webui_agent.entry.data[CONF_CHAT_MODEL],
            "top_p": text_generation_webui_agent.entry.options[CONF_TOP_P],
            "top_k": text_generation_webui_agent.entry.options[CONF_TOP_K],
            "temperature": text_generation_webui_agent.entry.options[CONF_TEMPERATURE],
            "min_p": text_generation_webui_agent.entry.options[CONF_MIN_P],
            "typical_p": text_generation_webui_agent.entry.options[CONF_TYPICAL_P],
            "truncation_length": text_generation_webui_agent.entry.options[CONF_CONTEXT_LENGTH],
            "max_tokens": text_generation_webui_agent.entry.options[CONF_MAX_TOKENS],
            "mode": text_generation_webui_agent.entry.options[CONF_TEXT_GEN_WEBUI_CHAT_MODE],
            "messages": ANY
        },
        headers={ "Authorization": "Bearer OpenAI-API-Key" },
        timeout=text_generation_webui_agent.entry.options[CONF_REQUEST_TIMEOUT]
    )

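# Generic OpenAI-compatible backend fixture: unlike the others, no startup
# model check is mocked here; only the completion POST calls are exercised.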
@pytest.fixture
def generic_openai_agent_fixture(config_entry, hass, enable_custom_integrations):
    with patch.object(GenericOpenAIAPIAgent, '_load_icl_examples') as load_icl_examples_mock, \
            patch.object(GenericOpenAIAPIAgent, 'entry', new_callable=PropertyMock) as entry_mock, \
            patch.object(GenericOpenAIAPIAgent, '_async_get_exposed_entities') as get_exposed_entities_mock, \
            patch.object(APIInstance, 'async_call_tool') as call_tool_mock, \
            patch('homeassistant.helpers.template.Template') as template_mock, \
            patch('custom_components.llama_conversation.backends.generic_openai.async_get_clientsession') as get_clientsession_mock:
        entry_mock.return_value = config_entry
        get_exposed_entities_mock.return_value = (
            {
                "light.kitchen_light": { "state": "on" },
                "light.office_lamp": { "state": "on" },
                "switch.downstairs_hallway": { "state": "off" },
                "fan.bedroom": { "state": "on" },
            },
            ["light", "switch", "fan"]
        )
        call_tool_mock.return_value = {"result": "success"}
        agent_obj = GenericOpenAIAPIAgent(
            hass,
            config_entry
        )
        all_mocks = {
            "requests_get": get_clientsession_mock.get,
            "requests_post": get_clientsession_mock.post
        }
        yield agent_obj, all_mocks

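# Same flow as the tailored webui test, but against the plain OpenAI-compatible
# API: fewer sampling parameters are sent in the request body.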
async def test_generic_openai_agent(generic_openai_agent_fixture):
    generic_openai_agent: GenericOpenAIAPIAgent
    all_mocks: dict[str, MagicMock]
    generic_openai_agent, all_mocks = generic_openai_agent_fixture

    response_mock = MagicMock()
    response_mock.json.return_value = {
        "id": "cmpl-uqkvlQyYK7bGYrRHQ0eXlWi7",
        "object": "text_completion",
        "created": 1589478378,
        "model": "gpt-3.5-turbo-instruct",
        "system_fingerprint": "fp_44709d6fcb",
        "choices": [{
            "text": "I am saying something!\n" + json.dumps({
                "name": "HassTurnOn",
                "arguments": {
                    "name": "light.kitchen_light"
                }
            }),
            "index": 0,
            "logprobs": None,
            "finish_reason": "length"
        }],
        "usage": {
            "prompt_tokens": 5,
            "completion_tokens": 7,
            "total_tokens": 12
        }
    }
    all_mocks["requests_post"].return_value = response_mock

    # invoke the conversation agent
    conversation_id = "test-conversation"
    result = await generic_openai_agent.async_process(ConversationInput(
        "turn on the kitchen lights", MagicMock(), conversation_id, None, "en", agent_id="test-agent"
    ))

    # assert on results + check side effects
    assert result.response.speech['plain']['speech'] == "I am saying something!"
    all_mocks["requests_post"].assert_called_once_with(
        "http://localhost:5000/v1/completions",
        json={
            "model": generic_openai_agent.entry.data[CONF_CHAT_MODEL],
            "top_p": generic_openai_agent.entry.options[CONF_TOP_P],
            "temperature": generic_openai_agent.entry.options[CONF_TEMPERATURE],
            "max_tokens": generic_openai_agent.entry.options[CONF_MAX_TOKENS],
            "prompt": ANY
        },
        headers={ "Authorization": "Bearer OpenAI-API-Key" },
        timeout=generic_openai_agent.entry.options[CONF_REQUEST_TIMEOUT]
    )

    # reset mock stats
    for mock in all_mocks.values():
        mock.reset_mock()

    # change options
    generic_openai_agent.entry.options[CONF_MAX_TOKENS] = 10
    generic_openai_agent.entry.options[CONF_REQUEST_TIMEOUT] = 60
    generic_openai_agent.entry.options[CONF_REMOTE_USE_CHAT_ENDPOINT] = True
    generic_openai_agent.entry.options[CONF_TEMPERATURE] = 2.0
    generic_openai_agent.entry.options[CONF_TOP_P] = 0.9

    response_mock.json.return_value = {
        "id": "chatcmpl-123",
        "object": "chat.completion",
        "created": 1677652288,
        "model": generic_openai_agent.entry.data[CONF_CHAT_MODEL],
        "system_fingerprint": "fp_44709d6fcb",
        "choices": [{
            "index": 0,
            "message": {
                "role": "assistant",
                "content": "I am saying something!\n" + json.dumps({
                    "name": "HassTurnOn",
                    "arguments": {
                        "name": "light.kitchen_light"
                    }
                }),
            },
            "logprobs": None,
            "finish_reason": "stop"
        }],
        "usage": {
            "prompt_tokens": 9,
            "completion_tokens": 12,
            "total_tokens": 21
        }
    }

    # do another turn of the same conversation, but via the chat endpoint
    result = await generic_openai_agent.async_process(ConversationInput(
        "turn off the office lights", MagicMock(), conversation_id, None, "en", agent_id="test-agent"
    ))
    assert result.response.speech['plain']['speech'] == "I am saying something!"
    all_mocks["requests_post"].assert_called_once_with(
        "http://localhost:5000/v1/chat/completions",
        json={
            "model": generic_openai_agent.entry.data[CONF_CHAT_MODEL],
            "top_p": generic_openai_agent.entry.options[CONF_TOP_P],
            "temperature": generic_openai_agent.entry.options[CONF_TEMPERATURE],
            "max_tokens": generic_openai_agent.entry.options[CONF_MAX_TOKENS],
            "messages": ANY
        },
        headers={ "Authorization": "Bearer OpenAI-API-Key" },
        timeout=generic_openai_agent.entry.options[CONF_REQUEST_TIMEOUT]
    )