wire up options to support functiongemma properly

Alex O'Connell
2025-12-20 22:25:03 -05:00
parent 29d839eea8
commit 0e4031ef43
8 changed files with 91 additions and 37 deletions

View File

@@ -28,6 +28,7 @@ from custom_components.llama_conversation.const import (
CONF_REMEMBER_CONVERSATION_TIME_MINUTES,
CONF_GENERIC_OPENAI_PATH,
CONF_ENABLE_LEGACY_TOOL_CALLING,
CONF_TOOL_RESPONSE_AS_STRING,
CONF_RESPONSE_JSON_SCHEMA,
DEFAULT_MAX_TOKENS,
DEFAULT_TEMPERATURE,
@@ -37,6 +38,7 @@ from custom_components.llama_conversation.const import (
DEFAULT_REMEMBER_CONVERSATION_TIME_MINUTES,
DEFAULT_GENERIC_OPENAI_PATH,
DEFAULT_ENABLE_LEGACY_TOOL_CALLING,
DEFAULT_TOOL_RESPONSE_AS_STRING,
RECOMMENDED_CHAT_MODELS,
)
from custom_components.llama_conversation.entity import TextGenerationResult, LocalLLMClient
@@ -126,15 +128,18 @@ class GenericOpenAIAPIClient(LocalLLMClient):
model_name = entity_options[CONF_CHAT_MODEL]
temperature = entity_options.get(CONF_TEMPERATURE, DEFAULT_TEMPERATURE)
top_p = entity_options.get(CONF_TOP_P, DEFAULT_TOP_P)
max_tokens = entity_options.get(CONF_MAX_TOKENS, DEFAULT_MAX_TOKENS)
timeout = entity_options.get(CONF_REQUEST_TIMEOUT, DEFAULT_REQUEST_TIMEOUT)
enable_legacy_tool_calling = entity_options.get(CONF_ENABLE_LEGACY_TOOL_CALLING, DEFAULT_ENABLE_LEGACY_TOOL_CALLING)
tool_response_as_string = entity_options.get(CONF_TOOL_RESPONSE_AS_STRING, DEFAULT_TOOL_RESPONSE_AS_STRING)
endpoint, additional_params = self._chat_completion_params(entity_options)
messages = get_oai_formatted_messages(conversation, user_content_as_list=True)
messages = get_oai_formatted_messages(conversation, user_content_as_list=True, tool_result_to_str=tool_response_as_string)
request_params = {
"model": model_name,
"stream": True,
"max_tokens": max_tokens,
"temperature": temperature,
"top_p": top_p,
"messages": messages
@@ -152,8 +157,6 @@ class GenericOpenAIAPIClient(LocalLLMClient):
}
tools = None
# "legacy" tool calling passes the tools directly as part of the system prompt instead of as "tools"
# most local backends absolutely butcher any sort of prompt formatting when using tool calling
if llm_api and not enable_legacy_tool_calling:
tools = get_oai_formatted_tools(llm_api, self._async_get_all_exposed_domains())
request_params["tools"] = tools
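For reference, here is a minimal sketch (not the integration's actual code) of how the assembled request body differs between native tool calling and the "legacy" mode described in the comment above, where the tools are assumed to already be rendered into the system prompt. The model name and the HassTurnOn tool definition are placeholders.

# Hedged sketch: build an OpenAI-style chat completion payload, omitting "tools"
# when legacy tool calling is enabled (the prompt already contains them).
import json

def build_chat_request(messages, tools, enable_legacy_tool_calling):
    request = {
        "model": "example-model",  # placeholder model name
        "stream": True,
        "max_tokens": 512,
        "temperature": 0.1,
        "top_p": 1.0,
        "messages": messages,
    }
    if tools and not enable_legacy_tool_calling:
        request["tools"] = tools
    return request

example_tool = {"type": "function", "function": {"name": "HassTurnOn", "parameters": {"type": "object", "properties": {}}}}
print(json.dumps(build_chat_request([{"role": "user", "content": "Turn on the kitchen light."}], [example_tool], False), indent=2))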

View File

@@ -19,6 +19,8 @@ from homeassistant.helpers.event import async_track_state_change, async_call_lat
from custom_components.llama_conversation.utils import install_llama_cpp_python, validate_llama_cpp_python_installation, get_oai_formatted_messages, get_oai_formatted_tools
from custom_components.llama_conversation.const import (
CONF_ENABLE_LEGACY_TOOL_CALLING,
CONF_TOOL_RESPONSE_AS_STRING,
CONF_INSTALLED_LLAMACPP_VERSION,
CONF_CHAT_MODEL,
CONF_MAX_TOKENS,
@@ -38,7 +40,10 @@ from custom_components.llama_conversation.const import (
CONF_LLAMACPP_BATCH_SIZE,
CONF_LLAMACPP_THREAD_COUNT,
CONF_LLAMACPP_BATCH_THREAD_COUNT,
CONF_LLAMACPP_CACHE_SIZE_MB,
CONF_INSTALLED_LLAMACPP_VERSION,
DEFAULT_ENABLE_LEGACY_TOOL_CALLING,
DEFAULT_TOOL_RESPONSE_AS_STRING,
DEFAULT_MAX_TOKENS,
DEFAULT_PROMPT,
DEFAULT_TEMPERATURE,
@@ -55,6 +60,7 @@ from custom_components.llama_conversation.const import (
DEFAULT_LLAMACPP_BATCH_SIZE,
DEFAULT_LLAMACPP_THREAD_COUNT,
DEFAULT_LLAMACPP_BATCH_THREAD_COUNT,
DEFAULT_LLAMACPP_CACHE_SIZE_MB,
DOMAIN,
CONF_RESPONSE_JSON_SCHEMA,
)
@@ -84,6 +90,7 @@ def snapshot_settings(options: dict[str, Any]) -> dict[str, Any]:
CONF_LLAMACPP_BATCH_SIZE: options.get(CONF_LLAMACPP_BATCH_SIZE, DEFAULT_LLAMACPP_BATCH_SIZE),
CONF_LLAMACPP_THREAD_COUNT: options.get(CONF_LLAMACPP_THREAD_COUNT, DEFAULT_LLAMACPP_THREAD_COUNT),
CONF_LLAMACPP_BATCH_THREAD_COUNT: options.get(CONF_LLAMACPP_BATCH_THREAD_COUNT, DEFAULT_LLAMACPP_BATCH_THREAD_COUNT),
CONF_LLAMACPP_CACHE_SIZE_MB: options.get(CONF_LLAMACPP_CACHE_SIZE_MB, DEFAULT_LLAMACPP_CACHE_SIZE_MB),
CONF_LLAMACPP_ENABLE_FLASH_ATTENTION: options.get(CONF_LLAMACPP_ENABLE_FLASH_ATTENTION, DEFAULT_LLAMACPP_ENABLE_FLASH_ATTENTION),
CONF_INSTALLED_LLAMACPP_VERSION: options.get(CONF_INSTALLED_LLAMACPP_VERSION, ""),
CONF_GBNF_GRAMMAR_FILE: options.get(CONF_GBNF_GRAMMAR_FILE, DEFAULT_GBNF_GRAMMAR_FILE),
@@ -172,11 +179,13 @@ class LlamaCppClient(LocalLLMClient):
)
_LOGGER.debug("Model loaded")
# FIXME: make cache size configurable (0 means disabled)
llm.set_cache(LlamaDiskCache(
capacity_bytes=int(512 * 10e8),
cache_dir=os.path.join(self.hass.config.media_dirs.get("local", self.hass.config.path("media")), "kv_cache")
))
# create disk cache if enabled
cache_size = model_settings.get(CONF_LLAMACPP_CACHE_SIZE_MB, DEFAULT_LLAMACPP_CACHE_SIZE_MB)
if cache_size > 0:
llm.set_cache(LlamaDiskCache(
capacity_bytes=int(cache_size * (1024 ** 2)),  # cache size option is in MB, so convert MB to bytes
cache_dir=os.path.join(self.hass.config.media_dirs.get("local", self.hass.config.path("media")), "kv_cache")
))
if model_settings[CONF_PROMPT_CACHING_ENABLED]:
@callback
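As a side note on the new cache option, here is a small standalone sketch of the sizing rule it implies (assuming the value is in megabytes, as the option name and UI label state): zero disables the disk cache entirely, anything else becomes a byte capacity. LlamaDiskCache itself requires llama-cpp-python and a writable media directory, so it is not constructed here.

# Hedged sketch of the cache sizing rule only, not the integration's code.
DEFAULT_CACHE_SIZE_MB = 128  # mirrors DEFAULT_LLAMACPP_CACHE_SIZE_MB in const.py

def disk_cache_capacity_bytes(cache_size_mb: int) -> int | None:
    """Return the capacity in bytes, or None when the disk cache is disabled (0)."""
    if cache_size_mb <= 0:
        return None
    return cache_size_mb * 1024 * 1024

assert disk_cache_capacity_bytes(0) is None
assert disk_cache_capacity_bytes(DEFAULT_CACHE_SIZE_MB) == 134217728  # 128 MiB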
@@ -224,6 +233,8 @@ class LlamaCppClient(LocalLLMClient):
should_reload = True
elif loaded_options[CONF_LLAMACPP_ENABLE_FLASH_ATTENTION] != entity_options.get(CONF_LLAMACPP_ENABLE_FLASH_ATTENTION, DEFAULT_LLAMACPP_ENABLE_FLASH_ATTENTION):
should_reload = True
elif loaded_options[CONF_LLAMACPP_CACHE_SIZE_MB] != entity_options.get(CONF_LLAMACPP_CACHE_SIZE_MB, DEFAULT_LLAMACPP_CACHE_SIZE_MB):
should_reload = True
elif loaded_options[CONF_INSTALLED_LLAMACPP_VERSION] != entity_options.get(CONF_INSTALLED_LLAMACPP_VERSION):
should_reload = True
_LOGGER.debug(f"Reloading llama.cpp...")
@@ -437,12 +448,14 @@ class LlamaCppClient(LocalLLMClient):
min_p = entity_options.get(CONF_MIN_P, DEFAULT_MIN_P)
typical_p = entity_options.get(CONF_TYPICAL_P, DEFAULT_TYPICAL_P)
grammar = self.grammars.get(model_name) if entity_options.get(CONF_USE_GBNF_GRAMMAR, DEFAULT_USE_GBNF_GRAMMAR) else None
enable_legacy_tool_calling = entity_options.get(CONF_ENABLE_LEGACY_TOOL_CALLING, DEFAULT_ENABLE_LEGACY_TOOL_CALLING)
tool_response_as_string = entity_options.get(CONF_TOOL_RESPONSE_AS_STRING, DEFAULT_TOOL_RESPONSE_AS_STRING)
_LOGGER.debug(f"Options: {entity_options}")
messages = get_oai_formatted_messages(conversation)
messages = get_oai_formatted_messages(conversation, tool_result_to_str=tool_response_as_string)
tools = None
if llm_api:
if llm_api and not enable_legacy_tool_calling:
tools = get_oai_formatted_tools(llm_api, self._async_get_all_exposed_domains())
_LOGGER.debug(f"Generating completion with {len(messages)} messages and {len(tools) if tools else 0} tools...")

View File

@@ -33,6 +33,7 @@ from custom_components.llama_conversation.const import (
CONF_OLLAMA_JSON_MODE,
CONF_CONTEXT_LENGTH,
CONF_ENABLE_LEGACY_TOOL_CALLING,
CONF_TOOL_RESPONSE_AS_STRING,
CONF_RESPONSE_JSON_SCHEMA,
DEFAULT_MAX_TOKENS,
DEFAULT_TEMPERATURE,
@@ -47,6 +48,7 @@ from custom_components.llama_conversation.const import (
DEFAULT_OLLAMA_JSON_MODE,
DEFAULT_CONTEXT_LENGTH,
DEFAULT_ENABLE_LEGACY_TOOL_CALLING,
DEFAULT_TOOL_RESPONSE_AS_STRING,
)
from custom_components.llama_conversation.entity import LocalLLMClient, TextGenerationResult
@@ -194,7 +196,8 @@ class OllamaAPIClient(LocalLLMClient):
typical_p = entity_options.get(CONF_TYPICAL_P, DEFAULT_TYPICAL_P)
timeout = entity_options.get(CONF_REQUEST_TIMEOUT, DEFAULT_REQUEST_TIMEOUT)
keep_alive = entity_options.get(CONF_OLLAMA_KEEP_ALIVE_MIN, DEFAULT_OLLAMA_KEEP_ALIVE_MIN)
legacy_tool_calling = entity_options.get(CONF_ENABLE_LEGACY_TOOL_CALLING, DEFAULT_ENABLE_LEGACY_TOOL_CALLING)
enable_legacy_tool_calling = entity_options.get(CONF_ENABLE_LEGACY_TOOL_CALLING, DEFAULT_ENABLE_LEGACY_TOOL_CALLING)
tool_response_as_string = entity_options.get(CONF_TOOL_RESPONSE_AS_STRING, DEFAULT_TOOL_RESPONSE_AS_STRING)
think_mode = entity_options.get(CONF_ENABLE_THINK_MODE, DEFAULT_ENABLE_THINK_MODE)
json_mode = entity_options.get(CONF_OLLAMA_JSON_MODE, DEFAULT_OLLAMA_JSON_MODE)
@@ -208,9 +211,9 @@ class OllamaAPIClient(LocalLLMClient):
"min_p": entity_options.get(CONF_MIN_P, DEFAULT_MIN_P),
}
messages = get_oai_formatted_messages(conversation, tool_args_to_str=False)
messages = get_oai_formatted_messages(conversation, tool_args_to_str=False, tool_result_to_str=tool_response_as_string)
tools = None
if llm_api and not legacy_tool_calling:
if llm_api and not enable_legacy_tool_calling:
tools = get_oai_formatted_tools(llm_api, self._async_get_all_exposed_domains())
keep_alive_payload = self._format_keep_alive(keep_alive)
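To illustrate where these options end up, here is a hedged sketch of an Ollama /api/chat payload built from them. The field names follow Ollama's chat API as commonly documented and should be treated as assumptions; the model name and message are placeholders, and the snippet only constructs and prints the dictionary.

# Hedged sketch, not the integration's request code.
import json

payload = {
    "model": "example-model",  # placeholder
    "stream": True,
    "keep_alive": "30m",       # how long Ollama keeps the model loaded after the request
    "think": False,            # think-mode toggle for reasoning models (assumed field name)
    "messages": [{"role": "user", "content": "Turn on the kitchen light."}],
    "options": {
        "num_predict": 512,    # max tokens
        "num_ctx": 2048,       # context length
        "temperature": 0.1,
        "top_p": 1.0,
        "top_k": 40,
        "typical_p": 1.0,
        "min_p": 0.0,
    },
}
# native tool calling would add a "tools" list; JSON mode would set "format": "json"
print(json.dumps(payload, indent=2))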

View File

@@ -70,6 +70,7 @@ from .const import (
CONF_TOOL_CALL_PREFIX,
CONF_TOOL_CALL_SUFFIX,
CONF_ENABLE_LEGACY_TOOL_CALLING,
CONF_TOOL_RESPONSE_AS_STRING,
CONF_LLAMACPP_ENABLE_FLASH_ATTENTION,
CONF_USE_GBNF_GRAMMAR,
CONF_GBNF_GRAMMAR_FILE,
@@ -96,6 +97,7 @@ from .const import (
CONF_LLAMACPP_THREAD_COUNT,
CONF_LLAMACPP_BATCH_THREAD_COUNT,
CONF_LLAMACPP_REINSTALL,
CONF_LLAMACPP_CACHE_SIZE_MB,
DEFAULT_CHAT_MODEL,
DEFAULT_PORT,
DEFAULT_SSL,
@@ -121,6 +123,7 @@ from .const import (
DEFAULT_TOOL_CALL_PREFIX,
DEFAULT_TOOL_CALL_SUFFIX,
DEFAULT_ENABLE_LEGACY_TOOL_CALLING,
DEFAULT_TOOL_RESPONSE_AS_STRING,
DEFAULT_LLAMACPP_ENABLE_FLASH_ATTENTION,
DEFAULT_USE_GBNF_GRAMMAR,
DEFAULT_GBNF_GRAMMAR_FILE,
@@ -142,6 +145,7 @@ from .const import (
DEFAULT_LLAMACPP_BATCH_SIZE,
DEFAULT_LLAMACPP_THREAD_COUNT,
DEFAULT_LLAMACPP_BATCH_THREAD_COUNT,
DEFAULT_LLAMACPP_CACHE_SIZE_MB,
BACKEND_TYPE_LLAMA_CPP,
BACKEND_TYPE_TEXT_GEN_WEBUI,
BACKEND_TYPE_GENERIC_OPENAI,
@@ -621,6 +625,11 @@ def local_llama_config_option_schema(
description={"suggested_value": options.get(CONF_ENABLE_LEGACY_TOOL_CALLING)},
default=DEFAULT_ENABLE_LEGACY_TOOL_CALLING
): bool,
vol.Required(
CONF_TOOL_RESPONSE_AS_STRING,
description={"suggested_value": options.get(CONF_TOOL_RESPONSE_AS_STRING)},
default=DEFAULT_TOOL_RESPONSE_AS_STRING
): bool,
}
if subentry_type == ai_task.DOMAIN:
@@ -727,7 +736,7 @@ def local_llama_config_option_schema(
CONF_PROMPT_CACHING_INTERVAL,
description={"suggested_value": options.get(CONF_PROMPT_CACHING_INTERVAL)},
default=DEFAULT_PROMPT_CACHING_INTERVAL,
): NumberSelector(NumberSelectorConfig(min=1, max=60, step=1)),
): NumberSelector(NumberSelectorConfig(min=1, max=60, step=1))
})
result.update({
vol.Required(
@@ -781,6 +790,11 @@ def local_llama_config_option_schema(
description={"suggested_value": options.get(CONF_LLAMACPP_ENABLE_FLASH_ATTENTION)},
default=DEFAULT_LLAMACPP_ENABLE_FLASH_ATTENTION,
): BooleanSelector(BooleanSelectorConfig()),
vol.Required(
CONF_LLAMACPP_CACHE_SIZE_MB,
description={"suggested_value": options.get(CONF_LLAMACPP_CACHE_SIZE_MB)},
default=DEFAULT_LLAMACPP_CACHE_SIZE_MB,
): NumberSelector(NumberSelectorConfig(min=0, max=1024, step=1)),
vol.Required(
CONF_USE_GBNF_GRAMMAR,
description={"suggested_value": options.get(CONF_USE_GBNF_GRAMMAR)},
@@ -975,6 +989,7 @@ def local_llama_config_option_schema(
CONF_TOOL_CALL_SUFFIX,
CONF_MAX_TOOL_CALL_ITERATIONS,
CONF_ENABLE_LEGACY_TOOL_CALLING,
CONF_TOOL_RESPONSE_AS_STRING,
CONF_USE_GBNF_GRAMMAR,
CONF_GBNF_GRAMMAR_FILE,
# integration specific options
@@ -989,6 +1004,7 @@ def local_llama_config_option_schema(
CONF_IN_CONTEXT_EXAMPLES_FILE,
CONF_NUM_IN_CONTEXT_EXAMPLES,
# backend specific options
CONF_LLAMACPP_CACHE_SIZE_MB,
CONF_LLAMACPP_BATCH_SIZE,
CONF_LLAMACPP_THREAD_COUNT,
CONF_LLAMACPP_BATCH_THREAD_COUNT,
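For the two options added to the schema above, a small runnable sketch shows how voluptuous applies the defaults when a value is missing. Home Assistant's BooleanSelector/NumberSelector are swapped for plain validators here so the snippet runs outside Home Assistant; the default-filling behavior of vol.Required is the same.

# Hedged sketch of the default-filling behavior, not the config flow itself.
import voluptuous as vol

schema = vol.Schema({
    vol.Required("tool_response_as_string", default=True): bool,
    vol.Required("llama_cpp_cache_size_mb", default=128): vol.All(int, vol.Range(min=0, max=1024)),
})

print(schema({}))                              # defaults fill in: True and 128
print(schema({"llama_cpp_cache_size_mb": 0}))  # 0 is allowed and disables the disk cache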

View File

@@ -149,6 +149,8 @@ CONF_TOOL_CALL_SUFFIX = "tool_call_suffix"
DEFAULT_TOOL_CALL_SUFFIX = "</tool_call>"
CONF_ENABLE_LEGACY_TOOL_CALLING = "enable_legacy_tool_calling"
DEFAULT_ENABLE_LEGACY_TOOL_CALLING = False
CONF_TOOL_RESPONSE_AS_STRING = "tool_response_as_string"
DEFAULT_TOOL_RESPONSE_AS_STRING = True
CONF_LLAMACPP_ENABLE_FLASH_ATTENTION = "enable_flash_attention"
DEFAULT_LLAMACPP_ENABLE_FLASH_ATTENTION = False
CONF_USE_GBNF_GRAMMAR = "gbnf_grammar"
@@ -201,6 +203,8 @@ DEFAULT_LLAMACPP_THREAD_COUNT = os.cpu_count()
CONF_LLAMACPP_BATCH_THREAD_COUNT = "n_batch_threads"
DEFAULT_LLAMACPP_BATCH_THREAD_COUNT = os.cpu_count()
CONF_LLAMACPP_REINSTALL = "reinstall_llama_cpp"
CONF_LLAMACPP_CACHE_SIZE_MB = "llama_cpp_cache_size_mb"
DEFAULT_LLAMACPP_CACHE_SIZE_MB = 128
DEFAULT_OPTIONS = types.MappingProxyType(
{

View File

@@ -497,7 +497,6 @@ class LocalLLMClient:
entities_to_expose = self._async_get_exposed_entities()
extra_attributes_to_expose = entity_options.get(CONF_EXTRA_ATTRIBUTES_TO_EXPOSE, DEFAULT_EXTRA_ATTRIBUTES_TO_EXPOSE)
enable_legacy_tool_calling = entity_options.get(CONF_ENABLE_LEGACY_TOOL_CALLING, DEFAULT_ENABLE_LEGACY_TOOL_CALLING)
tool_call_prefix = entity_options.get(CONF_TOOL_CALL_PREFIX, DEFAULT_TOOL_CALL_PREFIX)
tool_call_suffix = entity_options.get(CONF_TOOL_CALL_SUFFIX, DEFAULT_TOOL_CALL_SUFFIX)
@@ -571,21 +570,16 @@ class LocalLLMClient:
"tool_call_suffix": tool_call_suffix,
}
if enable_legacy_tool_calling:
if llm_api:
tools = []
for tool in llm_api.tools:
tools.append(f"{tool.name}({','.join(flatten_vol_schema(tool.parameters))})")
render_variables["tools"] = tools
render_variables["formatted_tools"] = ", ".join(tools)
else:
message = "No tools were provided. If the user requests you interact with a device, tell them you are unable to do so."
render_variables["tools"] = [message]
render_variables["formatted_tools"] = message
if llm_api:
tools = []
for tool in llm_api.tools:
tools.append(f"{tool.name}({','.join(flatten_vol_schema(tool.parameters))})")
render_variables["tools"] = tools
render_variables["formatted_tools"] = ", ".join(tools)
else:
# Tools are passed via the API not the prompt
render_variables["tools"] = []
render_variables["formatted_tools"] = ""
message = "No tools were provided. If the user requests you interact with a device, tell them you are unable to do so."
render_variables["tools"] = [message]
render_variables["formatted_tools"] = message
# only pass examples if there are loaded examples + an API was exposed
if self.in_context_examples and llm_api:
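The legacy path above renders tool signatures into template variables instead of passing them through the API. A short sketch of how a system prompt template might consume those variables (the template text and tool signatures are illustrative, not the integration's default prompt):

# Hedged sketch using jinja2 directly; render_variables mimics the structure built above.
from jinja2 import Template

render_variables = {
    "tools": ["HassTurnOn(name,area,domain)", "HassTurnOff(name,area,domain)"],  # placeholder signatures
    "formatted_tools": "HassTurnOn(name,area,domain), HassTurnOff(name,area,domain)",
}

template = Template(
    "You can call the following tools:\n"
    "{% for tool in tools %}- {{ tool }}\n{% endfor %}"
)
print(template.render(**render_variables))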

View File

@@ -101,11 +101,13 @@
"batch_size": "(llama.cpp) Batch Size",
"n_threads": "(llama.cpp) Thread Count",
"n_batch_threads": "(llama.cpp) Batch Thread Count",
"llama_cpp_cache_size_mb": "(llama.cpp) Disk KV Cache Size (MB)",
"thinking_prefix": "Reasoning Content Prefix",
"thinking_suffix": "Reasoning Content Suffix",
"tool_call_prefix": "Tool Call Prefix",
"tool_call_suffix": "Tool Call Suffix",
"enable_legacy_tool_calling": "Enable Legacy Tool Calling",
"tool_response_as_string": "Tool Response as String",
"max_tool_call_iterations": "Maximum Tool Call Attempts"
},
"data_description": {
@@ -116,6 +118,7 @@
"gbnf_grammar": "Forces the model to output properly formatted responses. Ensure the file specified below exists in the integration directory.",
"prompt_caching": "Prompt caching attempts to pre-process the prompt (house state) and cache the processing that needs to be done to understand the prompt. Enabling this will cause the model to re-process the prompt any time an entity state changes in the house, restricted by the interval below.",
"enable_legacy_tool_calling": "Prefer to process tool calls locally rather than relying on the backend to handle the tool calling format. Can be more reliable, however it requires properly setting the tool call prefix and suffix.",
"tool_response_as_string": "Some prompt templates expect the tool response to be provided as a JSON serialized string, rather than the raw object.",
"max_tool_call_iterations": "Set to 0 to generate the response and tool call in one attempt, without looping (use this for Home models v1-v3)."
},
"description": "Please configure the model according to how it should be prompted. There are many different options and selecting the correct ones for your model is essential to getting optimal performance. See [here](https://github.com/acon96/home-llm/blob/develop/docs/Backend%20Configuration.md) for more information about the options on this page.\n\n**Some defaults may have been chosen for you based on the name of the selected model name or filename.** If you renamed a file or are using a fine-tuning of a supported model, then the defaults may not have been detected.",
@@ -155,11 +158,13 @@
"batch_size": "(llama.cpp) Batch Size",
"n_threads": "(llama.cpp) Thread Count",
"n_batch_threads": "(llama.cpp) Batch Thread Count",
"llama_cpp_cache_size_mb": "(llama.cpp) Disk KV Cache Size (MB)",
"thinking_prefix": "Reasoning Content Prefix",
"thinking_suffix": "Reasoning Content Suffix",
"tool_call_prefix": "Tool Call Prefix",
"tool_call_suffix": "Tool Call Suffix",
"enable_legacy_tool_calling": "Enable Legacy Tool Calling",
"tool_response_as_string": "Tool Response as String",
"max_tool_call_iterations": "Maximum Tool Call Attempts"
},
"data_description": {
@@ -170,6 +175,7 @@
"gbnf_grammar": "Forces the model to output properly formatted responses. Ensure the file specified below exists in the integration directory.",
"prompt_caching": "Prompt caching attempts to pre-process the prompt (house state) and cache the processing that needs to be done to understand the prompt. Enabling this will cause the model to re-process the prompt any time an entity state changes in the house, restricted by the interval below.",
"enable_legacy_tool_calling": "Prefer to process tool calls locally rather than relying on the backend to handle the tool calling format. Can be more reliable, however it requires properly setting the tool call prefix and suffix.",
"tool_response_as_string": "Some prompt templates expect the tool response to be provided as a JSON serialized string, rather than the raw object.",
"max_tool_call_iterations": "Set to 0 to generate the response and tool call in one attempt, without looping (use this for Home models v1-v3)."
},
"description": "Please configure the model according to how it should be prompted. There are many different options and selecting the correct ones for your model is essential to getting optimal performance. See [here](https://github.com/acon96/home-llm/blob/develop/docs/Backend%20Configuration.md) for more information about the options on this page.\n\n**Some defaults may have been chosen for you based on the name of the selected model name or filename.** If you renamed a file or are using a fine-tuning of a supported model, then the defaults may not have been detected.",
@@ -241,11 +247,13 @@
"batch_size": "(llama.cpp) Batch Size",
"n_threads": "(llama.cpp) Thread Count",
"n_batch_threads": "(llama.cpp) Batch Thread Count",
"llama_cpp_cache_size_mb": "(llama.cpp) Disk KV Cache Size (MB)",
"thinking_prefix": "Reasoning Content Prefix",
"thinking_suffix": "Reasoning Content Suffix",
"tool_call_prefix": "Tool Call Prefix",
"tool_call_suffix": "Tool Call Suffix",
"enable_legacy_tool_calling": "Enable Legacy Tool Calling",
"tool_response_as_string": "Tool Response as String",
"max_tool_call_iterations": "Maximum Tool Call Attempts",
"ai_task_extraction_method": "Structured Data Extraction Method",
"ai_task_retries": "Retry attempts for structured data extraction"
@@ -257,6 +265,7 @@
"gbnf_grammar": "Forces the model to output properly formatted responses. Ensure the file specified below exists in the integration directory.",
"prompt_caching": "Prompt caching attempts to pre-process the prompt (house state) and cache the processing that needs to be done to understand the prompt. Enabling this will cause the model to re-process the prompt any time an entity state changes in the house, restricted by the interval below.",
"enable_legacy_tool_calling": "Prefer to process tool calls locally rather than relying on the backend to handle the tool calling format. Can be more reliable, however it requires properly setting the tool call prefix and suffix.",
"tool_response_as_string": "Some prompt templates expect the tool response to be provided as a JSON serialized string, rather than the raw object.",
"max_tool_call_iterations": "Set to 0 to generate the response and tool call in one attempt, without looping (use this for Home models v1-v3).",
"ai_task_extraction_method": "Select the method used to extract structured data from the model's response. 'Structured Output' tells the backend to force the model to produce output following the provided JSON Schema; 'Tool Calling' provides a tool to the model that should be called with the appropriate arguments that match the desired output structure."
},
@@ -297,11 +306,13 @@
"batch_size": "(llama.cpp) Batch Size",
"n_threads": "(llama.cpp) Thread Count",
"n_batch_threads": "(llama.cpp) Batch Thread Count",
"llama_cpp_cache_size_mb": "(llama.cpp) Disk KV Cache Size (MB)",
"thinking_prefix": "Reasoning Content Prefix",
"thinking_suffix": "Reasoning Content Suffix",
"tool_call_prefix": "Tool Call Prefix",
"tool_call_suffix": "Tool Call Suffix",
"enable_legacy_tool_calling": "Enable Legacy Tool Calling",
"tool_response_as_string": "Tool Response as String",
"max_tool_call_iterations": "Maximum Tool Call Attempts"
},
"data_description": {
@@ -312,6 +323,7 @@
"gbnf_grammar": "Forces the model to output properly formatted responses. Ensure the file specified below exists in the integration directory.",
"prompt_caching": "Prompt caching attempts to pre-process the prompt (house state) and cache the processing that needs to be done to understand the prompt. Enabling this will cause the model to re-process the prompt any time an entity state changes in the house, restricted by the interval below.",
"enable_legacy_tool_calling": "Prefer to process tool calls locally rather than relying on the backend to handle the tool calling format. Can be more reliable, however it requires properly setting the tool call prefix and suffix.",
"tool_response_as_string": "Some prompt templates expect the tool response to be provided as a JSON serialized string, rather than the raw object.",
"max_tool_call_iterations": "Set to 0 to generate the response and tool call in one attempt, without looping (use this for Home models v1-v3)."
},
"description": "Please configure the model according to how it should be prompted. There are many different options and selecting the correct ones for your model is essential to getting optimal performance. See [here](https://github.com/acon96/home-llm/blob/develop/docs/Backend%20Configuration.md) for more information about the options on this page.\n\n**Some defaults may have been chosen for you based on the name of the selected model name or filename.** If you renamed a file or are using a fine-tuning of a supported model, then the defaults may not have been detected.",

View File

@@ -278,7 +278,8 @@ def format_url(*, hostname: str, port: str, ssl: bool, path: str):
def get_oai_formatted_tools(llm_api: llm.APIInstance, domains: list[str]) -> List[ChatCompletionTool]:
result: List[ChatCompletionTool] = []
for tool in llm_api.tools:
# sort tools by name to improve cache hits
for tool in sorted(llm_api.tools, key=lambda t: t.name):
# when combining with home assistant llm APIs, it adds a prefix to differentiate tools; compare against the suffix here
if tool.name.endswith(SERVICE_TOOL_NAME):
result.extend([{
@@ -302,7 +303,13 @@ def get_oai_formatted_tools(llm_api: llm.APIInstance, domains: list[str]) -> Lis
return result
def get_oai_formatted_messages(conversation: Sequence[conversation.Content], user_content_as_list: bool = False, tool_args_to_str: bool = True) -> List[ChatCompletionRequestMessage]:
def get_oai_formatted_messages(
conversation: Sequence[conversation.Content],
*,
user_content_as_list: bool = False,
tool_args_to_str: bool = True,
tool_result_to_str: bool = True,
) -> List[ChatCompletionRequestMessage]:
messages: List[ChatCompletionRequestMessage] = []
for message in conversation:
if message.role == "system":
@@ -354,14 +361,16 @@ def get_oai_formatted_messages(conversation: Sequence[conversation.Content], use
]
})
elif message.role == "tool_result":
messages.append({
"role": "tool",
# FIXME: what is the correct format for content here? gemma expects name and result
# "content": json.dumps(message.tool_result),
"content": {
if tool_result_to_str:
content = json.dumps(message.tool_result)
else:
content = {
"name": message.tool_name,
"response": { "result": message.tool_result },
},
}
messages.append({
"role": "tool",
"content": content,
"tool_call_id": message.tool_call_id
})
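Putting the new flag in concrete terms, here is a standalone sketch of the two tool message shapes it selects between: a JSON-serialized string for most chat templates, or the structured name/response object that Gemma-style templates expect. The tool name, call id, and result below are placeholders.

# Hedged sketch, not the integration's code.
import json

tool_name = "HassTurnOn"       # placeholder
tool_call_id = "call_1"        # placeholder
tool_result = {"success": True}

def format_tool_message(tool_result_to_str: bool) -> dict:
    if tool_result_to_str:
        content = json.dumps(tool_result)
    else:
        content = {"name": tool_name, "response": {"result": tool_result}}
    return {"role": "tool", "content": content, "tool_call_id": tool_call_id}

print(format_tool_message(True))   # content is the string '{"success": true}'
print(format_tool_message(False))  # content is a dict with "name" and "response"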