From 1425413fc989cb11ea85e914991518eee4933bfc Mon Sep 17 00:00:00 2001
From: Alex O'Connell
Date: Mon, 15 Sep 2025 22:10:25 -0400
Subject: [PATCH] add docker compose stack for testing + backends are mostly
 working at this point

---
 TODO.md                                       | 15 ++++-
 .../backends/generic_openai.py                | 22 +++----
 .../backends/tailored_openai.py               | 39 +-----------
 .../llama_conversation/config_flow.py         | 14 ++++-
 custom_components/llama_conversation/const.py | 14 ++++-
 .../llama_conversation/conversation.py        | 53 +++++++++++------
 .../llama_conversation/translations/en.json   | 40 +++++--------
 custom_components/llama_conversation/utils.py | 18 +++---
 docker-compose.yml                            | 59 +++++++++++++++++++
 scripts/fix_metadata.sh                       |  9 +++
 scripts/import_ollama_model.sh                |  8 +++
 scripts/zephyr_legacy.txt                     | 17 ++++++
 12 files changed, 203 insertions(+), 105 deletions(-)
 create mode 100644 docker-compose.yml
 create mode 100644 scripts/fix_metadata.sh
 create mode 100755 scripts/import_ollama_model.sh
 create mode 100644 scripts/zephyr_legacy.txt

diff --git a/TODO.md b/TODO.md
index 9081b13..5e8d446 100644
--- a/TODO.md
+++ b/TODO.md
@@ -1,8 +1,9 @@
 # TODO
-- [ ] proper tool calling support
+- [x] proper tool calling support
 - [ ] fix old GGUFs to support tool calling
 - [ ] home assistant component text streaming support
-- [ ] new models based on qwen3
+- [ ] new model based on qwen3 0.6b
+- [ ] new model based on gemma3 270m
 - [ ] support AI task API
 - [x] support new LLM APIs
   - rewrite how services are called
@@ -42,6 +43,16 @@

 - [x] use varied system prompts to add behaviors

+## v0.4 TODO for release:
+- [ ] re-order the settings on the options config flow page; the current order is very confusing
+- [ ] split out entity functionality so we can support conversation + ai tasks
+- [x] fix ICL examples to match new tool calling syntax config
+- [x] set up docker-compose for running all of the various backends
+- [ ] fix and re-upload all compatible old models (+ upload all original safetensors)
+- [ ] move llamacpp to a separate process because of all the crashing
+- [ ] dedicated localai backend (tailored openai variant w/ model loading)
+- [ ] fix the openai responses backend
+
 ## more complicated ideas
 - [ ] "context requests" - basically just let the model decide what RAG/extra context it wants
diff --git a/custom_components/llama_conversation/backends/generic_openai.py b/custom_components/llama_conversation/backends/generic_openai.py
index 7f358aa..41cb5bb 100644
--- a/custom_components/llama_conversation/backends/generic_openai.py
+++ b/custom_components/llama_conversation/backends/generic_openai.py
@@ -26,6 +26,7 @@ from custom_components.llama_conversation.const import (
     CONF_REMEMBER_CONVERSATION,
     CONF_REMEMBER_CONVERSATION_TIME_MINUTES,
     CONF_GENERIC_OPENAI_PATH,
+    CONF_ENABLE_LEGACY_TOOL_CALLING,
     DEFAULT_MAX_TOKENS,
     DEFAULT_TEMPERATURE,
     DEFAULT_TOP_P,
@@ -33,6 +34,7 @@ from custom_components.llama_conversation.const import (
     DEFAULT_REMEMBER_CONVERSATION,
     DEFAULT_REMEMBER_CONVERSATION_TIME_MINUTES,
     DEFAULT_GENERIC_OPENAI_PATH,
+    DEFAULT_ENABLE_LEGACY_TOOL_CALLING,
 )
 from custom_components.llama_conversation.conversation import LocalLLMAgent, TextGenerationResult

@@ -63,6 +65,7 @@ class GenericOpenAIAPIAgent(LocalLLMAgent):
         temperature = self.entry.options.get(CONF_TEMPERATURE, DEFAULT_TEMPERATURE)
         top_p = self.entry.options.get(CONF_TOP_P, DEFAULT_TOP_P)
         timeout = self.entry.options.get(CONF_REQUEST_TIMEOUT, DEFAULT_REQUEST_TIMEOUT)
+        enable_legacy_tool_calling = self.entry.options.get(CONF_ENABLE_LEGACY_TOOL_CALLING, DEFAULT_ENABLE_LEGACY_TOOL_CALLING)

         endpoint, additional_params = self._chat_completion_params()
         messages = get_oai_formatted_messages(conversation)
@@ -77,7 +80,9 @@ class GenericOpenAIAPIAgent(LocalLLMAgent):
         }

         tools = None
-        if llm_api:
+        # "legacy" tool calling passes the tools directly as part of the system prompt instead of as "tools"
+        # most local backends absolutely butcher any sort of prompt formatting when using tool calling
+        if llm_api and not enable_legacy_tool_calling:
             tools = get_oai_formatted_tools(llm_api, self._async_get_all_exposed_domains())
             request_params["tools"] = tools

@@ -103,22 +108,19 @@ class GenericOpenAIAPIAgent(LocalLLMAgent):
                 ) as response:
                     response.raise_for_status()
                     async for line_bytes in response.content:
-                        chunk = line_bytes.decode("utf-8").strip().removeprefix("data: ")
+                        raw_line = line_bytes.decode("utf-8").strip()
+                        if raw_line.startswith("error: "):
+                            raise Exception(f"Error from server: {raw_line}")
+                        chunk = raw_line.removeprefix("data: ")
                         if "[DONE]" in chunk:
                             break
-                        
+
                         if chunk and chunk.strip():
                             yield self._extract_response(json.loads(chunk), llm_api)
            except asyncio.TimeoutError as err:
                raise HomeAssistantError("The generation request timed out! Please check your connection settings, increase the timeout in settings, or decrease the number of exposed entities.") from err
            except aiohttp.ClientError as err:
                raise HomeAssistantError(f"Failed to communicate with the API! {err}") from err
-           except Exception as err:
-               _LOGGER.debug(f"Err was: {err}")
-               _LOGGER.debug(f"Request was: {request_params}")
-               _LOGGER.debug(f"Result was: {response}")
-               _LOGGER.debug(f"Chunk was {chunk}")
-               raise HomeAssistantError(f"An unknown error occurred! {err}") from err

        return self._async_parse_completion(llm_api, anext_token=anext_token())

@@ -159,8 +161,6 @@ class GenericOpenAIAPIAgent(LocalLLMAgent):
         if choice["finish_reason"] == "length" or choice["finish_reason"] == "content_filter":
             _LOGGER.warning("Model response did not end on a stop token (unfinished sentence)")

-        _LOGGER.debug("Model chunk '%s'", response_text)
-
         return response_text, tool_calls


diff --git a/custom_components/llama_conversation/backends/tailored_openai.py b/custom_components/llama_conversation/backends/tailored_openai.py
index ead6cd4..c7517b2 100644
--- a/custom_components/llama_conversation/backends/tailored_openai.py
+++ b/custom_components/llama_conversation/backends/tailored_openai.py
@@ -80,11 +80,11 @@ class TextGenerationWebuiAgent(GenericOpenAIAPIAgent):
             _LOGGER.debug("Connection error was: %s", repr(ex))
             raise ConfigEntryNotReady("There was a problem connecting to the remote server") from ex

-    def _chat_completion_params(self, conversation: List[Dict[str, str]]) -> Tuple[str, Dict]:
+    def _chat_completion_params(self) -> Tuple[str, Dict[str, Any]]:
         preset = self.entry.options.get(CONF_TEXT_GEN_WEBUI_PRESET)
         chat_mode = self.entry.options.get(CONF_TEXT_GEN_WEBUI_CHAT_MODE, DEFAULT_TEXT_GEN_WEBUI_CHAT_MODE)

-        endpoint, request_params = super()._chat_completion_params(conversation)
+        endpoint, request_params = super()._chat_completion_params()
         request_params["mode"] = chat_mode

         if chat_mode == TEXT_GEN_WEBUI_CHAT_MODE_CHAT or chat_mode == TEXT_GEN_WEBUI_CHAT_MODE_CHAT_INSTRUCT:
@@ -98,37 +98,6 @@ class TextGenerationWebuiAgent(GenericOpenAIAPIAgent):

         return endpoint, request_params

-    def _completion_params(self, conversation: List[Dict[str, str]]) -> Tuple[str, Dict[str, Any]]:
-        preset = self.entry.options.get(CONF_TEXT_GEN_WEBUI_PRESET)
-
-        endpoint, request_params = super()._completion_params(conversation)
-
-        if preset:
-            request_params["preset"] = preset
-
-        request_params["truncation_length"] = self.entry.options.get(CONF_CONTEXT_LENGTH, DEFAULT_CONTEXT_LENGTH)
-        request_params["top_k"] = self.entry.options.get(CONF_TOP_K, DEFAULT_TOP_K)
-        request_params["min_p"] = self.entry.options.get(CONF_MIN_P, DEFAULT_MIN_P)
-        request_params["typical_p"] = self.entry.options.get(CONF_TYPICAL_P, DEFAULT_TYPICAL_P)
-
-        return endpoint, request_params
-
-    def _extract_response(self, response_json: dict) -> TextGenerationResult:
-        choices = response_json["choices"]
-        if choices[0]["finish_reason"] != "stop":
-            _LOGGER.warning("Model response did not end on a stop token (unfinished sentence)")
-
-        context_len = self.entry.options.get(CONF_CONTEXT_LENGTH, DEFAULT_CONTEXT_LENGTH)
-        max_tokens = self.entry.options.get(CONF_MAX_TOKENS, DEFAULT_MAX_TOKENS)
-        if response_json["usage"]["prompt_tokens"] + max_tokens > context_len:
-            self._warn_context_size()
-
-        # text-gen-webui has a typo where it is 'chat.completions' not 'chat.completion'
-        if response_json["object"] == "chat.completions":
-            return choices[0]["message"]["content"]
-        else:
-            return choices[0]["text"]
-

 class LlamaCppServerAgent(GenericOpenAIAPIAgent):
     grammar: str
@@ -152,8 +121,4 @@ class LlamaCppServerAgent(GenericOpenAIAPIAgent):
         if self.entry.options.get(CONF_USE_GBNF_GRAMMAR, DEFAULT_USE_GBNF_GRAMMAR):
             request_params["grammar"] = self.grammar

-        # force usage of COMMON_CHAT_TOOL_CHOICE_NONE so it returns raw content and then parse ourself when using
-        # the custom home llm tool call syntax. otherwise let the server detect it automatically
-        request_params["tool_choice"] = "none"
-
         return endpoint, request_params
\ No newline at end of file
diff --git a/custom_components/llama_conversation/config_flow.py b/custom_components/llama_conversation/config_flow.py
index 74a1b16..2548783 100644
--- a/custom_components/llama_conversation/config_flow.py
+++ b/custom_components/llama_conversation/config_flow.py
@@ -34,8 +34,6 @@ from homeassistant.helpers.selector import (
     BooleanSelector,
     BooleanSelectorConfig,
 )
-from homeassistant.util.package import is_installed
-from importlib.metadata import version

 from .utils import download_model_from_hf, get_llama_cpp_python_version, install_llama_cpp_python, format_url, MissingQuantizationException
 from .const import (
@@ -58,6 +56,7 @@ from .const import (
     CONF_THINKING_SUFFIX,
     CONF_TOOL_CALL_PREFIX,
     CONF_TOOL_CALL_SUFFIX,
+    CONF_ENABLE_LEGACY_TOOL_CALLING,
     CONF_ENABLE_FLASH_ATTENTION,
     CONF_USE_GBNF_GRAMMAR,
     CONF_GBNF_GRAMMAR_FILE,
@@ -107,6 +106,7 @@ from .const import (
     DEFAULT_THINKING_SUFFIX,
     DEFAULT_TOOL_CALL_PREFIX,
     DEFAULT_TOOL_CALL_SUFFIX,
+    DEFAULT_ENABLE_LEGACY_TOOL_CALLING,
     DEFAULT_ENABLE_FLASH_ATTENTION,
     DEFAULT_USE_GBNF_GRAMMAR,
     DEFAULT_GBNF_GRAMMAR_FILE,
@@ -1068,6 +1068,11 @@ def local_llama_config_option_schema(hass: HomeAssistant, options: MappingProxyT
                 description={"suggested_value": options.get(CONF_REQUEST_TIMEOUT)},
                 default=DEFAULT_REQUEST_TIMEOUT,
             ): NumberSelector(NumberSelectorConfig(min=5, max=900, step=1, unit_of_measurement=UnitOfTime.SECONDS, mode=NumberSelectorMode.BOX)),
+            vol.Required(
+                CONF_ENABLE_LEGACY_TOOL_CALLING,
+                description={"suggested_value": options.get(CONF_ENABLE_LEGACY_TOOL_CALLING)},
+                default=DEFAULT_ENABLE_LEGACY_TOOL_CALLING
+            ): bool,
         })
     elif backend_type in BACKEND_TYPE_GENERIC_OPENAI_RESPONSES:
         del result[CONF_REMEMBER_NUM_INTERACTIONS]
@@ -1127,6 +1132,11 @@ def local_llama_config_option_schema(hass: HomeAssistant, options: MappingProxyT
                 description={"suggested_value": options.get(CONF_REQUEST_TIMEOUT)},
                 default=DEFAULT_REQUEST_TIMEOUT,
             ): NumberSelector(NumberSelectorConfig(min=5, max=900, step=1, unit_of_measurement=UnitOfTime.SECONDS, mode=NumberSelectorMode.BOX)),
+            vol.Required(
+                CONF_ENABLE_LEGACY_TOOL_CALLING,
+                description={"suggested_value": options.get(CONF_ENABLE_LEGACY_TOOL_CALLING)},
+                default=DEFAULT_ENABLE_LEGACY_TOOL_CALLING
+            ): bool,
         })
     elif backend_type == BACKEND_TYPE_OLLAMA:
         result = insert_after_key(result, CONF_MAX_TOKENS, {
diff --git a/custom_components/llama_conversation/const.py b/custom_components/llama_conversation/const.py
index c4ae940..f00879e 100644
--- a/custom_components/llama_conversation/const.py
+++ b/custom_components/llama_conversation/const.py
@@ -76,13 +76,13 @@ ICL_EXTRAS = """
 {% for item in response_examples %}
 {{ item.request }}
 {{ item.response }}
- {{ item.tool | to_json }}
+{{ tool_call_prefix }}{{ item.tool | to_json }}{{ tool_call_suffix }}
 {% endfor %}"""
 ICL_NO_SYSTEM_PROMPT_EXTRAS = """
 {% for item in response_examples %}
 {{ item.request }}
 {{ item.response }}
- {{ item.tool | to_json }}
+{{ tool_call_prefix }}{{ item.tool | to_json }}{{ tool_call_suffix }}
 {% endfor %}
 :"""
 DEFAULT_PROMPT = DEFAULT_PROMPT_BASE + ICL_EXTRAS
@@ -137,6 +137,8 @@ CONF_TOOL_CALL_PREFIX = "tool_call_prefix"
 DEFAULT_TOOL_CALL_PREFIX = ""
 CONF_TOOL_CALL_SUFFIX = "tool_call_suffix"
 DEFAULT_TOOL_CALL_SUFFIX = ""
+CONF_ENABLE_LEGACY_TOOL_CALLING = "enable_legacy_tool_calling"
+DEFAULT_ENABLE_LEGACY_TOOL_CALLING = False
 CONF_ENABLE_FLASH_ATTENTION = "enable_flash_attention"
 DEFAULT_ENABLE_FLASH_ATTENTION = False
 CONF_USE_GBNF_GRAMMAR = "gbnf_grammar"
@@ -179,7 +181,6 @@ CONF_GENERIC_OPENAI_PATH = "openai_path"
 DEFAULT_GENERIC_OPENAI_PATH = "v1"
 CONF_GENERIC_OPENAI_VALIDATE_MODEL = "openai_validate_model"
 DEFAULT_GENERIC_OPENAI_VALIDATE_MODEL = True
-
 CONF_CONTEXT_LENGTH = "context_length"
 DEFAULT_CONTEXT_LENGTH = 2048
 CONF_BATCH_SIZE = "batch_size"
@@ -228,6 +229,7 @@ OPTIONS_OVERRIDES = {
         CONF_TOOL_CALL_SUFFIX: "```",
         CONF_CONTEXT_LENGTH: 131072,
         CONF_MAX_TOOL_CALL_ITERATIONS: 1,
+        CONF_ENABLE_LEGACY_TOOL_CALLING: True
     },
     "home-3b-v3": {
         CONF_PROMPT: DEFAULT_PROMPT_BASE_LEGACY,
@@ -235,6 +237,7 @@
         CONF_TOOL_CALL_PREFIX: "```homeassistant",
         CONF_TOOL_CALL_SUFFIX: "```",
         CONF_MAX_TOOL_CALL_ITERATIONS: 1,
+        CONF_ENABLE_LEGACY_TOOL_CALLING: True
     },
     "home-3b-v2": {
         CONF_PROMPT: DEFAULT_PROMPT_BASE_LEGACY,
@@ -242,6 +245,7 @@
         CONF_TOOL_CALL_PREFIX: "```homeassistant",
         CONF_TOOL_CALL_SUFFIX: "```",
         CONF_MAX_TOOL_CALL_ITERATIONS: 1,
+        CONF_ENABLE_LEGACY_TOOL_CALLING: True
     },
     "home-3b-v1": {
         CONF_PROMPT: DEFAULT_PROMPT_BASE_LEGACY,
@@ -249,6 +253,7 @@
         CONF_TOOL_CALL_PREFIX: "```homeassistant",
         CONF_TOOL_CALL_SUFFIX: "```",
         CONF_MAX_TOOL_CALL_ITERATIONS: 1,
+        CONF_ENABLE_LEGACY_TOOL_CALLING: True
     },
     "home-1b-v3": {
         CONF_PROMPT: DEFAULT_PROMPT_BASE_LEGACY,
@@ -256,6 +261,7 @@
         CONF_TOOL_CALL_PREFIX: "```homeassistant",
         CONF_TOOL_CALL_SUFFIX: "```",
         CONF_MAX_TOOL_CALL_ITERATIONS: 1,
+        CONF_ENABLE_LEGACY_TOOL_CALLING: True
     },
     "home-1b-v2": {
         CONF_PROMPT: DEFAULT_PROMPT_BASE_LEGACY,
@@ -263,6 +269,7 @@
         CONF_TOOL_CALL_PREFIX: "```homeassistant",
         CONF_TOOL_CALL_SUFFIX: "```",
         CONF_MAX_TOOL_CALL_ITERATIONS: 1,
+        CONF_ENABLE_LEGACY_TOOL_CALLING: True
     },
     "home-1b-v1": {
         CONF_PROMPT: DEFAULT_PROMPT_BASE_LEGACY,
@@ -270,6 +277,7 @@
         CONF_TOOL_CALL_PREFIX: "```homeassistant",
         CONF_TOOL_CALL_SUFFIX: "```",
         CONF_MAX_TOOL_CALL_ITERATIONS: 1,
+        CONF_ENABLE_LEGACY_TOOL_CALLING: True
     },
     "mistral": {
         CONF_PROMPT: DEFAULT_PROMPT_BASE + ICL_NO_SYSTEM_PROMPT_EXTRAS,
diff --git a/custom_components/llama_conversation/conversation.py b/custom_components/llama_conversation/conversation.py
index c6b0048..a43789d 100644
--- a/custom_components/llama_conversation/conversation.py
+++ b/custom_components/llama_conversation/conversation.py
@@ -22,7 +22,7 @@ from homeassistant.helpers import config_validation as cv, intent, template, ent
 from homeassistant.helpers.entity_platform import AddConfigEntryEntitiesCallback
 from homeassistant.util import color

-from .utils import closest_color, parse_raw_tool_call
+from .utils import closest_color, parse_raw_tool_call, flatten_vol_schema
 from .const import (
     CONF_CHAT_MODEL,
     CONF_PROMPT,
@@ -40,6 +40,7 @@ from .const import (
     CONF_THINKING_SUFFIX,
     CONF_TOOL_CALL_PREFIX,
     CONF_TOOL_CALL_SUFFIX,
+    CONF_ENABLE_LEGACY_TOOL_CALLING,
     DEFAULT_PROMPT,
     DEFAULT_BACKEND_TYPE,
     DEFAULT_EXTRA_ATTRIBUTES_TO_EXPOSE,
@@ -58,6 +59,7 @@ from .const import (
     DEFAULT_THINKING_SUFFIX,
     DEFAULT_TOOL_CALL_PREFIX,
     DEFAULT_TOOL_CALL_SUFFIX,
+    DEFAULT_ENABLE_LEGACY_TOOL_CALLING,
 )

 _LOGGER = logging.getLogger(__name__)
@@ -82,8 +84,6 @@ async def update_listener(hass: HomeAssistant, entry: ConfigEntry):
     agent: LocalLLMAgent = entry.runtime_data
     await hass.async_add_executor_job(agent._update_options)

-    return True
-

 async def async_setup_entry(hass: HomeAssistant, entry: ConfigEntry, async_add_entities: AddConfigEntryEntitiesCallback) -> bool:
     """Set up Local LLM Conversation from a config entry."""
@@ -427,15 +427,18 @@ class LocalLLMAgent(ConversationEntity, AbstractConversationAgent):
             elif tool_suffix in potential_block and in_tool_call:
                 in_tool_call = False

-                tool_call, to_say = parse_raw_tool_call(tool_content.strip().removeprefix(tool_prefix).removesuffix(tool_suffix), llm_api)
-                _LOGGER.debug("Tool call parsed: %s", tool_call)
-
-                if tool_call:
-                    result.tool_calls = [tool_call]
-                if to_say:
-                    content = to_say
+                if not llm_api:
+                    _LOGGER.warning("Model attempted to call a tool but no LLM API was provided, ignoring tool calls")
                 else:
-                    content = None
+                    tool_call, to_say = parse_raw_tool_call(tool_content.strip().removeprefix(tool_prefix).removesuffix(tool_suffix), llm_api)
+                    _LOGGER.debug("Tool call parsed: %s", tool_call)
+
+                    if tool_call:
+                        result.tool_calls = [tool_call]
+                    if to_say:
+                        content = to_say
+                    else:
+                        content = None

         result.response = content

@@ -463,9 +466,9 @@ class LocalLLMAgent(ConversationEntity, AbstractConversationAgent):

         return list(domains)

-    def _async_get_exposed_entities(self) -> dict[str, str]:
+    def _async_get_exposed_entities(self) -> dict[str, dict]:
         """Gather exposed entity states"""
-        entity_states = {}
+        entity_states: dict[str, dict] = {}
         entity_registry = er.async_get(self.hass)
         device_registry = dr.async_get(self.hass)
         area_registry = ar.async_get(self.hass)
@@ -577,10 +580,12 @@ class LocalLLMAgent(ConversationEntity, AbstractConversationAgent):
         """Generate the system prompt with current entity states"""
         entities_to_expose = self._async_get_exposed_entities()

-        extra_attributes_to_expose = self.entry.options \
-            .get(CONF_EXTRA_ATTRIBUTES_TO_EXPOSE, DEFAULT_EXTRA_ATTRIBUTES_TO_EXPOSE)
+        extra_attributes_to_expose = self.entry.options.get(CONF_EXTRA_ATTRIBUTES_TO_EXPOSE, DEFAULT_EXTRA_ATTRIBUTES_TO_EXPOSE)
+        enable_legacy_tool_calling = self.entry.options.get(CONF_ENABLE_LEGACY_TOOL_CALLING, DEFAULT_ENABLE_LEGACY_TOOL_CALLING)
+        tool_call_prefix = self.entry.options.get(CONF_TOOL_CALL_PREFIX, DEFAULT_TOOL_CALL_PREFIX)
+        tool_call_suffix = self.entry.options.get(CONF_TOOL_CALL_SUFFIX, DEFAULT_TOOL_CALL_SUFFIX)

-        def expose_attributes(attributes) -> list[str]:
+        def expose_attributes(attributes: dict[str, Any]) -> list[str]:
             result = []
             for attribute_name in extra_attributes_to_expose:
                 if attribute_name not in attributes:
@@ -645,9 +650,23 @@ class LocalLLMAgent(ConversationEntity, AbstractConversationAgent):
         render_variables = {
             "devices": devices,
             "formatted_devices": formatted_devices,
-            "response_examples": []
+            "response_examples": [],
+            "tool_call_prefix": tool_call_prefix,
+            "tool_call_suffix": tool_call_suffix,
         }

+        if enable_legacy_tool_calling:
+            if llm_api:
+                tools = []
+                for tool in llm_api.tools:
+                    tools.append(f"{tool.name}({','.join(flatten_vol_schema(tool.parameters))})")
+                render_variables["tools"] = tools
+                render_variables["formatted_tools"] = ", ".join(tools)
+            else:
+                message = "No tools were provided. If the user requests you interact with a device, tell them you are unable to do so."
+                render_variables["tools"] = [message]
+                render_variables["formatted_tools"] = message
+
         # only pass examples if there are loaded examples + an API was exposed
         if self.in_context_examples and llm_api:
             num_examples = int(self.entry.options.get(CONF_NUM_IN_CONTEXT_EXAMPLES, DEFAULT_NUM_IN_CONTEXT_EXAMPLES))
diff --git a/custom_components/llama_conversation/translations/en.json b/custom_components/llama_conversation/translations/en.json
index 0b6c714..7fff800 100644
--- a/custom_components/llama_conversation/translations/en.json
+++ b/custom_components/llama_conversation/translations/en.json
@@ -92,7 +92,13 @@
           "context_length": "Context Length",
           "batch_size": "Batch Size",
           "n_threads": "Thread Count",
-          "n_batch_threads": "Batch Thread Count"
+          "n_batch_threads": "Batch Thread Count",
+          "thinking_prefix": "Reasoning Content Prefix",
+          "thinking_suffix": "Reasoning Content Suffix",
+          "tool_call_prefix": "Tool Call Prefix",
+          "tool_call_suffix": "Tool Call Suffix",
+          "enable_legacy_tool_calling": "Enable Legacy Tool Calling",
+          "max_tool_call_iterations": "Maximum Tool Call Attempts"
         },
         "data_description": {
           "llm_hass_api": "Select 'Assist' if you want the model to be able to control devices. If you are using the Home-LLM v1, v2, or v3 model then select 'Home-LLM (v1-3)'",
@@ -115,9 +121,6 @@
           "llm_hass_api": "Selected LLM API",
           "max_new_tokens": "Maximum tokens to return in response",
           "prompt": "System Prompt",
-          "prompt_template": "Prompt Format",
-          "tool_format": "Tool Format",
-          "tool_multi_turn_chat": "Multi-Turn Tool Use",
           "temperature": "Temperature",
           "top_k": "Top K",
           "top_p": "Top P",
@@ -147,7 +150,13 @@
           "context_length": "Context Length",
           "batch_size": "Batch Size",
           "n_threads": "Thread Count",
-          "n_batch_threads": "Batch Thread Count"
+          "n_batch_threads": "Batch Thread Count",
+          "thinking_prefix": "Reasoning Content Prefix",
+          "thinking_suffix": "Reasoning Content Suffix",
+          "tool_call_prefix": "Tool Call Prefix",
+          "tool_call_suffix": "Tool Call Suffix",
+          "enable_legacy_tool_calling": "Enable Legacy Tool Calling",
+          "max_tool_call_iterations": "Maximum Tool Call Attempts"
         },
         "data_description": {
           "llm_hass_api": "Select 'Assist' if you want the model to be able to control devices. If you are using the Home-LLM v1, v2, or v3 model then select 'Home-LLM (v1-3)'",
@@ -167,27 +176,6 @@
       }
     },
   "selector": {
-    "prompt_template": {
-      "options": {
-        "chatml": "ChatML",
-        "vicuna": "Vicuna",
-        "alpaca": "Alpaca",
-        "mistral": "Mistral",
-        "zephyr": "Zephyr (<|endoftext|>)",
-        "zephyr2": "Zephyr ('')",
-        "zephyr3": "Zephyr (<|end|>)",
-        "llama3": "Llama 3",
-        "command-r": "Command R",
-        "no_prompt_template": "None"
-      }
-    },
-    "tool_format": {
-      "options": {
-        "full_tool_format": "Full JSON Tool Format",
-        "reduced_tool_format": "Reduced JSON Tool Format",
-        "min_tool_format": "Minimal Function Style Tool Format"
-      }
-    },
     "model_backend": {
       "options": {
         "llama_cpp_hf": "Llama.cpp (HuggingFace)",
diff --git a/custom_components/llama_conversation/utils.py b/custom_components/llama_conversation/utils.py
index e78f16e..e6a1b3f 100644
--- a/custom_components/llama_conversation/utils.py
+++ b/custom_components/llama_conversation/utils.py
@@ -335,10 +335,10 @@ def get_home_llm_tools(llm_api: llm.APIInstance, domains: list[str]) -> List[Dic
         # scripts show up as individual services
         if domain == "script" and not scripts_added:
             all_services.extend([
-                ("script.reload", vol.Schema({})),
-                ("script.turn_on", vol.Schema({})),
-                ("script.turn_off", vol.Schema({})),
-                ("script.toggle", vol.Schema({})),
+                ("script.reload", vol.Schema({vol.Required("target_device"): str})),
+                ("script.turn_on", vol.Schema({vol.Required("target_device"): str})),
+                ("script.turn_off", vol.Schema({vol.Required("target_device"): str})),
+                ("script.toggle", vol.Schema({vol.Required("target_device"): str})),
             ])
             scripts_added = True
             continue
@@ -350,7 +350,8 @@ def get_home_llm_tools(llm_api: llm.APIInstance, domains: list[str]) -> List[Dic
             args = flatten_vol_schema(service.schema)
             args_to_expose = set(args).intersection(ALLOWED_SERVICE_CALL_ARGUMENTS)
             service_schema = vol.Schema({
-                vol.Optional(arg): str for arg in args_to_expose
+                vol.Required("target_device"): str,
+                **{vol.Optional(arg): str for arg in args_to_expose}
             })

             all_services.append((f"{domain}.{name}", service_schema))
@@ -384,18 +385,21 @@ def parse_raw_tool_call(raw_block: str | dict, llm_api: llm.APIInstance) -> tupl
     else:
         schema_to_validate = vol.Schema({
             vol.Required("name"): str,
-            vol.Required("arguments"): dict,
+            vol.Required("arguments"): str | dict,
         })

     try:
         schema_to_validate(parsed_tool_call)
     except vol.Error as ex:
         _LOGGER.info(f"LLM produced an improperly formatted response: {repr(ex)}")
-        raise # re-raise exception for now to force the LLM to try again
+        raise ex # re-raise exception for now to force the LLM to try again

     # try to fix certain arguments
     args_dict = parsed_tool_call if llm_api.api.id == HOME_LLM_API_ID else parsed_tool_call["arguments"]

+    if isinstance(args_dict, str):
+        args_dict = json.loads(args_dict)
+
     # make sure brightness is 0-255 and not a percentage
     if "brightness" in args_dict and 0.0 < args_dict["brightness"] <= 1.0:
         args_dict["brightness"] = int(args_dict["brightness"] * 255)
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..fa8aa07
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,59 @@
+# you can start and stop backends by running `docker-compose up -d <service>`
+version: '3.8'
+services:
+  ollama:
+    image: ollama/ollama:latest
+    container_name: ollama
+    ports:
+      - "11434:11434" # Ollama default
+    volumes:
+      - ./models:/models
+      - ./scripts:/scripts # needed for import script
+    environment:
+      - OLLAMA_MODELS=/models/.ollama
+    restart: unless-stopped
+
+  text-generation-webui:
+    image: atinoda/text-generation-webui:default-cpu
+    container_name: textgen-webui
+    init: true
+    environment:
+      - EXTRA_LAUNCH_ARGS="--listen --verbose" # Custom launch args (e.g., --model MODEL_NAME)
+    ports:
+      - "7860:7860" # Web UI default
+      # - "5000:5000" # API Default
+      # - "5005:5005" # Streaming API default
+    volumes:
+      - ./models:/app/user_data/models
+    restart: unless-stopped
+
+  # llamacpp server can only run one model at a time; set it below
+  llamacpp:
+    image: ghcr.io/ggerganov/llama.cpp:server
+    container_name: llamacpp-server
+    ports:
+      - "8000:8000" # llama.cpp server (set via --port below)
+    volumes:
+      - ./models:/models
+    environment:
+      - MODEL_DIR=/models
+    restart: unless-stopped
+    command: |-
+      --port 8000
+      --no-webui
+      --metrics
+      --jinja
+      --ctx-size 8192
+      --alias "Home-3B-v3"
+      --model "/models/Home-3B-v3-fixed.q4_k_m.gguf"
+
+  localai:
+    image: localai/localai:latest
+    container_name: localai
+    ports:
+      - "8080:8080" # LocalAI default
+    volumes:
+      - ./models:/models
+    environment:
+      - MODELS_PATH=/models
+    restart: unless-stopped
diff --git a/scripts/fix_metadata.sh b/scripts/fix_metadata.sh
new file mode 100644
index 0000000..6c0e724
--- /dev/null
+++ b/scripts/fix_metadata.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+LLAMA_CPP=../llama.cpp
+MODEL_NAME=$1
+OUTPUT_NAME=$2
+PRE_TOKENIZER=${3:-stablelm2}
+CHAT_TEMPLATE=${4:-zephyr_legacy}
+
+python3 ${LLAMA_CPP}/gguf-py/gguf/scripts/gguf_new_metadata.py $MODEL_NAME $OUTPUT_NAME --pre-tokenizer $PRE_TOKENIZER --chat-template "$(cat $CHAT_TEMPLATE.txt)"
\ No newline at end of file
diff --git a/scripts/import_ollama_model.sh b/scripts/import_ollama_model.sh
new file mode 100755
index 0000000..b984a35
--- /dev/null
+++ b/scripts/import_ollama_model.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+# Usage: docker exec -it ollama bash -c "/scripts/import_ollama_model.sh /models/Home-3B-v3.q4_k_m.gguf Home-3B-v3:q4_k_m"
+GGUF_FILE=$1
+MODEL_NAME=$2
+
+echo "FROM $GGUF_FILE" > $GGUF_FILE.Modelfile
+ollama create $MODEL_NAME -f $GGUF_FILE.Modelfile
+rm -f $GGUF_FILE.Modelfile
\ No newline at end of file
diff --git a/scripts/zephyr_legacy.txt b/scripts/zephyr_legacy.txt
new file mode 100644
index 0000000..6f75337
--- /dev/null
+++ b/scripts/zephyr_legacy.txt
@@ -0,0 +1,17 @@
+{% for message in messages %}
+{%- if message['role'] == 'user' or message['role'] == 'tool' -%}
+<|user|> {{ message['content'] }}{{ eos_token }}
+{%- elif message['role'] == 'system' -%}
+<|system|> {{ message['content'] }}
+Services:
+{%- for tool in tools %} {{ tool['function']['name'] }}({% for param in tool['function']['parameters']['properties'].keys() if param != 'target_device' %}{{ param }}{% if not loop.last %},{% endif %}{% endfor -%}),{% if not loop.last -%}
+{%- if tools | length == 0 %}No tools were provided. If the user requests you interact with a device, tell them you are unable to do so.{% endif %}
+{%- endif -%}{%- endfor -%}
+{{ eos_token }}
+{%- elif message['role'] == 'assistant' -%}
+<|assistant|> {{ message['content'] }}{{ eos_token }}
+{%- endif -%}
+{%- if loop.last and add_generation_prompt %}
+<|assistant|>
+{%- endif %}
+{% endfor -%}
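
A quick usage sketch for the compose stack above. The service names, ports, and the Home-3B-v3 GGUF path simply mirror the defaults written into docker-compose.yml and the usage comment in scripts/import_ollama_model.sh; substitute whatever GGUF files are actually in ./models:

  # start a single backend
  docker-compose up -d ollama

  # import a GGUF from ./models into Ollama via the helper script mounted at /scripts
  docker exec -it ollama bash -c "/scripts/import_ollama_model.sh /models/Home-3B-v3.q4_k_m.gguf Home-3B-v3:q4_k_m"

  # switch backends by stopping one service and starting another
  docker-compose stop ollama
  docker-compose up -d llamacpp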