From 1425413fc989cb11ea85e914991518eee4933bfc Mon Sep 17 00:00:00 2001
From: Alex O'Connell
Date: Mon, 15 Sep 2025 22:10:25 -0400
Subject: [PATCH] add docker compose stack for testing + backends are mostly
 working at this point

---
 TODO.md                                       | 15 ++++-
 .../backends/generic_openai.py                | 22 +++----
 .../backends/tailored_openai.py               | 39 +-----------
 .../llama_conversation/config_flow.py         | 14 ++++-
 custom_components/llama_conversation/const.py | 14 ++++-
 .../llama_conversation/conversation.py        | 53 +++++++++++------
 .../llama_conversation/translations/en.json   | 40 +++++--------
 custom_components/llama_conversation/utils.py | 18 +++---
 docker-compose.yml                            | 59 +++++++++++++++++++
 scripts/fix_metadata.sh                       |  9 +++
 scripts/import_ollama_model.sh                |  8 +++
 scripts/zephyr_legacy.txt                     | 17 ++++++
 12 files changed, 203 insertions(+), 105 deletions(-)
 create mode 100644 docker-compose.yml
 create mode 100644 scripts/fix_metadata.sh
 create mode 100755 scripts/import_ollama_model.sh
 create mode 100644 scripts/zephyr_legacy.txt

diff --git a/TODO.md b/TODO.md
index 9081b13..5e8d446 100644
--- a/TODO.md
+++ b/TODO.md
@@ -1,8 +1,9 @@
 # TODO
-- [ ] proper tool calling support
+- [x] proper tool calling support
 - [ ] fix old GGUFs to support tool calling
 - [ ] home assistant component text streaming support
-- [ ] new models based on qwen3
+- [ ] new model based on qwen3 0.6b
+- [ ] new model based on gemma3 270m
 - [ ] support AI task API
 - [x] support new LLM APIs
   - rewrite how services are called
@@ -42,6 +43,16 @@

 - [x] use varied system prompts to add behaviors

+## v0.4 TODO for release:
+- [ ] re-order the settings on the options config flow page; the current order is very confusing
+- [ ] split out entity functionality so we can support conversation + ai tasks
+- [x] fix ICL examples to match new tool calling syntax config
+- [x] set up docker-compose for running all of the various backends
+- [ ] fix and re-upload all compatible old models (+ upload all original safetensors)
+- [ ] move llamacpp to a separate process because of all the crashing
+- [ ] dedicated localai backend (tailored openai variant w/ model loading)
+- [ ] fix the openai responses backend
+
 ## more complicated ideas
 - [ ] "context requests" - basically just let the model decide what RAG/extra context it wants
diff --git a/custom_components/llama_conversation/backends/generic_openai.py b/custom_components/llama_conversation/backends/generic_openai.py
index 7f358aa..41cb5bb 100644
--- a/custom_components/llama_conversation/backends/generic_openai.py
+++ b/custom_components/llama_conversation/backends/generic_openai.py
@@ -26,6 +26,7 @@ from custom_components.llama_conversation.const import (
     CONF_REMEMBER_CONVERSATION,
     CONF_REMEMBER_CONVERSATION_TIME_MINUTES,
     CONF_GENERIC_OPENAI_PATH,
+    CONF_ENABLE_LEGACY_TOOL_CALLING,
     DEFAULT_MAX_TOKENS,
     DEFAULT_TEMPERATURE,
     DEFAULT_TOP_P,
@@ -33,6 +34,7 @@ from custom_components.llama_conversation.const import (
     DEFAULT_REMEMBER_CONVERSATION,
     DEFAULT_REMEMBER_CONVERSATION_TIME_MINUTES,
     DEFAULT_GENERIC_OPENAI_PATH,
+    DEFAULT_ENABLE_LEGACY_TOOL_CALLING,
 )
 from custom_components.llama_conversation.conversation import LocalLLMAgent, TextGenerationResult

@@ -63,6 +65,7 @@ class GenericOpenAIAPIAgent(LocalLLMAgent):
         temperature = self.entry.options.get(CONF_TEMPERATURE, DEFAULT_TEMPERATURE)
         top_p = self.entry.options.get(CONF_TOP_P, DEFAULT_TOP_P)
         timeout = self.entry.options.get(CONF_REQUEST_TIMEOUT, DEFAULT_REQUEST_TIMEOUT)
+        enable_legacy_tool_calling = self.entry.options.get(CONF_ENABLE_LEGACY_TOOL_CALLING, DEFAULT_ENABLE_LEGACY_TOOL_CALLING)

         endpoint, additional_params = self._chat_completion_params()
         messages = get_oai_formatted_messages(conversation)
@@ -77,7 +80,9 @@ class GenericOpenAIAPIAgent(LocalLLMAgent):
         }

         tools = None
-        if llm_api:
+        # "legacy" tool calling passes the tools directly as part of the system prompt instead of as "tools"
+        # most local backends absolutely butcher any sort of prompt formatting when using tool calling
+        if llm_api and not enable_legacy_tool_calling:
             tools = get_oai_formatted_tools(llm_api, self._async_get_all_exposed_domains())
             request_params["tools"] = tools

@@ -103,22 +108,19 @@ class GenericOpenAIAPIAgent(LocalLLMAgent):
                 ) as response:
                     response.raise_for_status()
                     async for line_bytes in response.content:
-                        chunk = line_bytes.decode("utf-8").strip().removeprefix("data: ")
+                        raw_line = line_bytes.decode("utf-8").strip()
+                        if raw_line.startswith("error: "):
+                            raise Exception(f"Error from server: {raw_line}")
+                        chunk = raw_line.removeprefix("data: ")
                         if "[DONE]" in chunk:
                             break
-                        
+
                         if chunk and chunk.strip():
                             yield self._extract_response(json.loads(chunk), llm_api)
            except asyncio.TimeoutError as err:
                raise HomeAssistantError("The generation request timed out! Please check your connection settings, increase the timeout in settings, or decrease the number of exposed entities.") from err
            except aiohttp.ClientError as err:
                raise HomeAssistantError(f"Failed to communicate with the API! {err}") from err
-           except Exception as err:
-               _LOGGER.debug(f"Err was: {err}")
-               _LOGGER.debug(f"Request was: {request_params}")
-               _LOGGER.debug(f"Result was: {response}")
-               _LOGGER.debug(f"Chunk was {chunk}")
-               raise HomeAssistantError(f"An unknown error occurred! {err}") from err

        return self._async_parse_completion(llm_api, anext_token=anext_token())

@@ -159,8 +161,6 @@ class GenericOpenAIAPIAgent(LocalLLMAgent):
         if choice["finish_reason"] == "length" or choice["finish_reason"] == "content_filter":
             _LOGGER.warning("Model response did not end on a stop token (unfinished sentence)")

-        _LOGGER.debug("Model chunk '%s'", response_text)
-
         return response_text, tool_calls


diff --git a/custom_components/llama_conversation/backends/tailored_openai.py b/custom_components/llama_conversation/backends/tailored_openai.py
index ead6cd4..c7517b2 100644
--- a/custom_components/llama_conversation/backends/tailored_openai.py
+++ b/custom_components/llama_conversation/backends/tailored_openai.py
@@ -80,11 +80,11 @@ class TextGenerationWebuiAgent(GenericOpenAIAPIAgent):
             _LOGGER.debug("Connection error was: %s", repr(ex))
             raise ConfigEntryNotReady("There was a problem connecting to the remote server") from ex

-    def _chat_completion_params(self, conversation: List[Dict[str, str]]) -> Tuple[str, Dict]:
+    def _chat_completion_params(self) -> Tuple[str, Dict[str, Any]]:
         preset = self.entry.options.get(CONF_TEXT_GEN_WEBUI_PRESET)
         chat_mode = self.entry.options.get(CONF_TEXT_GEN_WEBUI_CHAT_MODE, DEFAULT_TEXT_GEN_WEBUI_CHAT_MODE)

-        endpoint, request_params = super()._chat_completion_params(conversation)
+        endpoint, request_params = super()._chat_completion_params()
         request_params["mode"] = chat_mode

         if chat_mode == TEXT_GEN_WEBUI_CHAT_MODE_CHAT or chat_mode == TEXT_GEN_WEBUI_CHAT_MODE_CHAT_INSTRUCT:
@@ -98,37 +98,6 @@ class TextGenerationWebuiAgent(GenericOpenAIAPIAgent):

         return endpoint, request_params

-    def _completion_params(self, conversation: List[Dict[str, str]]) -> Tuple[str, Dict[str, Any]]:
-        preset = self.entry.options.get(CONF_TEXT_GEN_WEBUI_PRESET)
-
-        endpoint, request_params = super()._completion_params(conversation)
-
-        if preset:
-            request_params["preset"] = preset
-
-        request_params["truncation_length"] = self.entry.options.get(CONF_CONTEXT_LENGTH, DEFAULT_CONTEXT_LENGTH)
-        request_params["top_k"] = self.entry.options.get(CONF_TOP_K, DEFAULT_TOP_K)
-        request_params["min_p"] = self.entry.options.get(CONF_MIN_P, DEFAULT_MIN_P)
-        request_params["typical_p"] = self.entry.options.get(CONF_TYPICAL_P, DEFAULT_TYPICAL_P)
-
-        return endpoint, request_params
-
-    def _extract_response(self, response_json: dict) -> TextGenerationResult:
-        choices = response_json["choices"]
-        if choices[0]["finish_reason"] != "stop":
-            _LOGGER.warning("Model response did not end on a stop token (unfinished sentence)")
-
-        context_len = self.entry.options.get(CONF_CONTEXT_LENGTH, DEFAULT_CONTEXT_LENGTH)
-        max_tokens = self.entry.options.get(CONF_MAX_TOKENS, DEFAULT_MAX_TOKENS)
-        if response_json["usage"]["prompt_tokens"] + max_tokens > context_len:
-            self._warn_context_size()
-
-        # text-gen-webui has a typo where it is 'chat.completions' not 'chat.completion'
-        if response_json["object"] == "chat.completions":
-            return choices[0]["message"]["content"]
-        else:
-            return choices[0]["text"]
-

 class LlamaCppServerAgent(GenericOpenAIAPIAgent):
     grammar: str
@@ -152,8 +121,4 @@ class LlamaCppServerAgent(GenericOpenAIAPIAgent):
         if self.entry.options.get(CONF_USE_GBNF_GRAMMAR, DEFAULT_USE_GBNF_GRAMMAR):
             request_params["grammar"] = self.grammar

-        # force usage of COMMON_CHAT_TOOL_CHOICE_NONE so it returns raw content and then parse ourself when using
-        # the custom home llm tool call syntax. otherwise let the server detect it automatically
-        request_params["tool_choice"] = "none"
-
         return endpoint, request_params
\ No newline at end of file
diff --git a/custom_components/llama_conversation/config_flow.py b/custom_components/llama_conversation/config_flow.py
index 74a1b16..2548783 100644
--- a/custom_components/llama_conversation/config_flow.py
+++ b/custom_components/llama_conversation/config_flow.py
@@ -34,8 +34,6 @@ from homeassistant.helpers.selector import (
     BooleanSelector,
     BooleanSelectorConfig,
 )
-from homeassistant.util.package import is_installed
-from importlib.metadata import version

 from .utils import download_model_from_hf, get_llama_cpp_python_version, install_llama_cpp_python, format_url, MissingQuantizationException
 from .const import (
@@ -58,6 +56,7 @@ from .const import (
     CONF_THINKING_SUFFIX,
     CONF_TOOL_CALL_PREFIX,
     CONF_TOOL_CALL_SUFFIX,
+    CONF_ENABLE_LEGACY_TOOL_CALLING,
     CONF_ENABLE_FLASH_ATTENTION,
     CONF_USE_GBNF_GRAMMAR,
     CONF_GBNF_GRAMMAR_FILE,
@@ -107,6 +106,7 @@ from .const import (
     DEFAULT_THINKING_SUFFIX,
     DEFAULT_TOOL_CALL_PREFIX,
     DEFAULT_TOOL_CALL_SUFFIX,
+    DEFAULT_ENABLE_LEGACY_TOOL_CALLING,
     DEFAULT_ENABLE_FLASH_ATTENTION,
     DEFAULT_USE_GBNF_GRAMMAR,
     DEFAULT_GBNF_GRAMMAR_FILE,
@@ -1068,6 +1068,11 @@ def local_llama_config_option_schema(hass: HomeAssistant, options: MappingProxyT
                 description={"suggested_value": options.get(CONF_REQUEST_TIMEOUT)},
                 default=DEFAULT_REQUEST_TIMEOUT,
             ): NumberSelector(NumberSelectorConfig(min=5, max=900, step=1, unit_of_measurement=UnitOfTime.SECONDS, mode=NumberSelectorMode.BOX)),
+            vol.Required(
+                CONF_ENABLE_LEGACY_TOOL_CALLING,
+                description={"suggested_value": options.get(CONF_ENABLE_LEGACY_TOOL_CALLING)},
+                default=DEFAULT_ENABLE_LEGACY_TOOL_CALLING
+            ): bool,
         })
     elif backend_type in BACKEND_TYPE_GENERIC_OPENAI_RESPONSES:
         del result[CONF_REMEMBER_NUM_INTERACTIONS]
@@ -1127,6 +1132,11 @@ def local_llama_config_option_schema(hass: HomeAssistant, options: MappingProxyT
                 description={"suggested_value": options.get(CONF_REQUEST_TIMEOUT)},
                 default=DEFAULT_REQUEST_TIMEOUT,
             ): NumberSelector(NumberSelectorConfig(min=5, max=900, step=1, unit_of_measurement=UnitOfTime.SECONDS, mode=NumberSelectorMode.BOX)),
+            vol.Required(
+                CONF_ENABLE_LEGACY_TOOL_CALLING,
+                description={"suggested_value": options.get(CONF_ENABLE_LEGACY_TOOL_CALLING)},
+                default=DEFAULT_ENABLE_LEGACY_TOOL_CALLING
+            ): bool,
         })
     elif backend_type == BACKEND_TYPE_OLLAMA:
         result = insert_after_key(result, CONF_MAX_TOKENS, {
diff --git a/custom_components/llama_conversation/const.py b/custom_components/llama_conversation/const.py
index c4ae940..f00879e 100644
--- a/custom_components/llama_conversation/const.py
+++ b/custom_components/llama_conversation/const.py
@@ -76,13 +76,13 @@ ICL_EXTRAS = """
 {% for item in response_examples %}
 {{ item.request }}
 {{ item.response }}
- {{ item.tool | to_json }}
+{{ tool_call_prefix }}{{ item.tool | to_json }}{{ tool_call_suffix }}
 {% endfor %}"""
 ICL_NO_SYSTEM_PROMPT_EXTRAS = """
 {% for item in response_examples %}
 {{ item.request }}
 {{ item.response }}
- {{ item.tool | to_json }}
+{{ tool_call_prefix }}{{ item.tool | to_json }}{{ tool_call_suffix }}
 {% endfor %}
 :"""
 DEFAULT_PROMPT = DEFAULT_PROMPT_BASE + ICL_EXTRAS
@@ -137,6 +137,8 @@ CONF_TOOL_CALL_PREFIX = "tool_call_prefix"
 DEFAULT_TOOL_CALL_PREFIX = ""
 CONF_TOOL_CALL_SUFFIX = "tool_call_suffix"
 DEFAULT_TOOL_CALL_SUFFIX = ""
+CONF_ENABLE_LEGACY_TOOL_CALLING = "enable_legacy_tool_calling"
+DEFAULT_ENABLE_LEGACY_TOOL_CALLING = False
 CONF_ENABLE_FLASH_ATTENTION = "enable_flash_attention"
 DEFAULT_ENABLE_FLASH_ATTENTION = False
 CONF_USE_GBNF_GRAMMAR = "gbnf_grammar"
@@ -179,7 +181,6 @@ CONF_GENERIC_OPENAI_PATH = "openai_path"
 DEFAULT_GENERIC_OPENAI_PATH = "v1"
 CONF_GENERIC_OPENAI_VALIDATE_MODEL = "openai_validate_model"
 DEFAULT_GENERIC_OPENAI_VALIDATE_MODEL = True
-
 CONF_CONTEXT_LENGTH = "context_length"
 DEFAULT_CONTEXT_LENGTH = 2048
 CONF_BATCH_SIZE = "batch_size"
@@ -228,6 +229,7 @@ OPTIONS_OVERRIDES = {
         CONF_TOOL_CALL_SUFFIX: "```",
         CONF_CONTEXT_LENGTH: 131072,
         CONF_MAX_TOOL_CALL_ITERATIONS: 1,
+        CONF_ENABLE_LEGACY_TOOL_CALLING: True
     },
     "home-3b-v3": {
         CONF_PROMPT: DEFAULT_PROMPT_BASE_LEGACY,
@@ -235,6 +237,7 @@
         CONF_TOOL_CALL_PREFIX: "```homeassistant",
         CONF_TOOL_CALL_SUFFIX: "```",
         CONF_MAX_TOOL_CALL_ITERATIONS: 1,
+        CONF_ENABLE_LEGACY_TOOL_CALLING: True
     },
     "home-3b-v2": {
         CONF_PROMPT: DEFAULT_PROMPT_BASE_LEGACY,
@@ -242,6 +245,7 @@
         CONF_TOOL_CALL_PREFIX: "```homeassistant",
         CONF_TOOL_CALL_SUFFIX: "```",
         CONF_MAX_TOOL_CALL_ITERATIONS: 1,
+        CONF_ENABLE_LEGACY_TOOL_CALLING: True
     },
     "home-3b-v1": {
         CONF_PROMPT: DEFAULT_PROMPT_BASE_LEGACY,
@@ -249,6 +253,7 @@
         CONF_TOOL_CALL_PREFIX: "```homeassistant",
         CONF_TOOL_CALL_SUFFIX: "```",
         CONF_MAX_TOOL_CALL_ITERATIONS: 1,
+        CONF_ENABLE_LEGACY_TOOL_CALLING: True
     },
     "home-1b-v3": {
         CONF_PROMPT: DEFAULT_PROMPT_BASE_LEGACY,
@@ -256,6 +261,7 @@
         CONF_TOOL_CALL_PREFIX: "```homeassistant",
         CONF_TOOL_CALL_SUFFIX: "```",
         CONF_MAX_TOOL_CALL_ITERATIONS: 1,
+        CONF_ENABLE_LEGACY_TOOL_CALLING: True
     },
     "home-1b-v2": {
         CONF_PROMPT: DEFAULT_PROMPT_BASE_LEGACY,
@@ -263,6 +269,7 @@
         CONF_TOOL_CALL_PREFIX: "```homeassistant",
         CONF_TOOL_CALL_SUFFIX: "```",
         CONF_MAX_TOOL_CALL_ITERATIONS: 1,
+        CONF_ENABLE_LEGACY_TOOL_CALLING: True
     },
     "home-1b-v1": {
         CONF_PROMPT: DEFAULT_PROMPT_BASE_LEGACY,
@@ -270,6 +277,7 @@
         CONF_TOOL_CALL_PREFIX: "```homeassistant",
         CONF_TOOL_CALL_SUFFIX: "```",
         CONF_MAX_TOOL_CALL_ITERATIONS: 1,
+        CONF_ENABLE_LEGACY_TOOL_CALLING: True
     },
     "mistral": {
         CONF_PROMPT: DEFAULT_PROMPT_BASE + ICL_NO_SYSTEM_PROMPT_EXTRAS,
diff --git a/custom_components/llama_conversation/conversation.py b/custom_components/llama_conversation/conversation.py
index c6b0048..a43789d 100644
--- a/custom_components/llama_conversation/conversation.py
+++ b/custom_components/llama_conversation/conversation.py
@@ -22,7 +22,7 @@ from homeassistant.helpers import config_validation as cv, intent, template, ent
 from homeassistant.helpers.entity_platform import AddConfigEntryEntitiesCallback
 from homeassistant.util import color

-from .utils import closest_color, parse_raw_tool_call
+from .utils import closest_color, parse_raw_tool_call, flatten_vol_schema
 from .const import (
     CONF_CHAT_MODEL,
     CONF_PROMPT,
@@ -40,6 +40,7 @@ from .const import (
     CONF_THINKING_SUFFIX,
     CONF_TOOL_CALL_PREFIX,
     CONF_TOOL_CALL_SUFFIX,
+    CONF_ENABLE_LEGACY_TOOL_CALLING,
     DEFAULT_PROMPT,
     DEFAULT_BACKEND_TYPE,
     DEFAULT_EXTRA_ATTRIBUTES_TO_EXPOSE,
@@ -58,6 +59,7 @@ from .const import (
     DEFAULT_THINKING_SUFFIX,
     DEFAULT_TOOL_CALL_PREFIX,
     DEFAULT_TOOL_CALL_SUFFIX,
+    DEFAULT_ENABLE_LEGACY_TOOL_CALLING,
 )

 _LOGGER = logging.getLogger(__name__)
@@ -82,8 +84,6 @@ async def update_listener(hass: HomeAssistant, entry: ConfigEntry):
     agent: LocalLLMAgent = entry.runtime_data
     await hass.async_add_executor_job(agent._update_options)

-    return True
-

 async def async_setup_entry(hass: HomeAssistant, entry: ConfigEntry, async_add_entities: AddConfigEntryEntitiesCallback) -> bool:
     """Set up Local LLM Conversation from a config entry."""
@@ -427,15 +427,18 @@ class LocalLLMAgent(ConversationEntity, AbstractConversationAgent):
             elif tool_suffix in potential_block and in_tool_call:
                 in_tool_call = False

-                tool_call, to_say = parse_raw_tool_call(tool_content.strip().removeprefix(tool_prefix).removesuffix(tool_suffix), llm_api)
-                _LOGGER.debug("Tool call parsed: %s", tool_call)
-
-                if tool_call:
-                    result.tool_calls = [tool_call]
-                if to_say:
-                    content = to_say
+                if not llm_api:
+                    _LOGGER.warning("Model attempted to call a tool but no LLM API was provided, ignoring tool calls")
                 else:
-                    content = None
+                    tool_call, to_say = parse_raw_tool_call(tool_content.strip().removeprefix(tool_prefix).removesuffix(tool_suffix), llm_api)
+                    _LOGGER.debug("Tool call parsed: %s", tool_call)
+
+                    if tool_call:
+                        result.tool_calls = [tool_call]
+                    if to_say:
+                        content = to_say
+                    else:
+                        content = None

         result.response = content

@@ -463,9 +466,9 @@ class LocalLLMAgent(ConversationEntity, AbstractConversationAgent):

         return list(domains)

-    def _async_get_exposed_entities(self) -> dict[str, str]:
+    def _async_get_exposed_entities(self) -> dict[str, dict]:
         """Gather exposed entity states"""
-        entity_states = {}
+        entity_states: dict[str, dict] = {}
         entity_registry = er.async_get(self.hass)
         device_registry = dr.async_get(self.hass)
         area_registry = ar.async_get(self.hass)
@@ -577,10 +580,12 @@ class LocalLLMAgent(ConversationEntity, AbstractConversationAgent):
         """Generate the system prompt with current entity states"""
         entities_to_expose = self._async_get_exposed_entities()

-        extra_attributes_to_expose = self.entry.options \
-            .get(CONF_EXTRA_ATTRIBUTES_TO_EXPOSE, DEFAULT_EXTRA_ATTRIBUTES_TO_EXPOSE)
+        extra_attributes_to_expose = self.entry.options.get(CONF_EXTRA_ATTRIBUTES_TO_EXPOSE, DEFAULT_EXTRA_ATTRIBUTES_TO_EXPOSE)
+        enable_legacy_tool_calling = self.entry.options.get(CONF_ENABLE_LEGACY_TOOL_CALLING, DEFAULT_ENABLE_LEGACY_TOOL_CALLING)
+        tool_call_prefix = self.entry.options.get(CONF_TOOL_CALL_PREFIX, DEFAULT_TOOL_CALL_PREFIX)
+        tool_call_suffix = self.entry.options.get(CONF_TOOL_CALL_SUFFIX, DEFAULT_TOOL_CALL_SUFFIX)

-        def expose_attributes(attributes) -> list[str]:
+        def expose_attributes(attributes: dict[str, Any]) -> list[str]:
             result = []
             for attribute_name in extra_attributes_to_expose:
                 if attribute_name not in attributes:
@@ -645,9 +650,23 @@ class LocalLLMAgent(ConversationEntity, AbstractConversationAgent):
         render_variables = {
             "devices": devices,
             "formatted_devices": formatted_devices,
-            "response_examples": []
+            "response_examples": [],
+            "tool_call_prefix": tool_call_prefix,
+            "tool_call_suffix": tool_call_suffix,
         }

+        if enable_legacy_tool_calling:
+            if llm_api:
+                tools = []
+                for tool in llm_api.tools:
+                    tools.append(f"{tool.name}({','.join(flatten_vol_schema(tool.parameters))})")
+                render_variables["tools"] = tools
+                render_variables["formatted_tools"] = ", ".join(tools)
+            else:
+                message = "No tools were provided. If the user requests you interact with a device, tell them you are unable to do so."
+                render_variables["tools"] = [message]
+                render_variables["formatted_tools"] = message
+
         # only pass examples if there are loaded examples + an API was exposed
         if self.in_context_examples and llm_api:
             num_examples = int(self.entry.options.get(CONF_NUM_IN_CONTEXT_EXAMPLES, DEFAULT_NUM_IN_CONTEXT_EXAMPLES))
diff --git a/custom_components/llama_conversation/translations/en.json b/custom_components/llama_conversation/translations/en.json
index 0b6c714..7fff800 100644
--- a/custom_components/llama_conversation/translations/en.json
+++ b/custom_components/llama_conversation/translations/en.json
@@ -92,7 +92,13 @@
           "context_length": "Context Length",
           "batch_size": "Batch Size",
           "n_threads": "Thread Count",
-          "n_batch_threads": "Batch Thread Count"
+          "n_batch_threads": "Batch Thread Count",
+          "thinking_prefix": "Reasoning Content Prefix",
+          "thinking_suffix": "Reasoning Content Suffix",
+          "tool_call_prefix": "Tool Call Prefix",
+          "tool_call_suffix": "Tool Call Suffix",
+          "enable_legacy_tool_calling": "Enable Legacy Tool Calling",
+          "max_tool_call_iterations": "Maximum Tool Call Attempts"
         },
         "data_description": {
           "llm_hass_api": "Select 'Assist' if you want the model to be able to control devices. If you are using the Home-LLM v1, v2, or v3 model then select 'Home-LLM (v1-3)'",
@@ -115,9 +121,6 @@
           "llm_hass_api": "Selected LLM API",
           "max_new_tokens": "Maximum tokens to return in response",
           "prompt": "System Prompt",
-          "prompt_template": "Prompt Format",
-          "tool_format": "Tool Format",
-          "tool_multi_turn_chat": "Multi-Turn Tool Use",
           "temperature": "Temperature",
           "top_k": "Top K",
           "top_p": "Top P",
@@ -147,7 +150,13 @@
           "context_length": "Context Length",
           "batch_size": "Batch Size",
           "n_threads": "Thread Count",
-          "n_batch_threads": "Batch Thread Count"
+          "n_batch_threads": "Batch Thread Count",
+          "thinking_prefix": "Reasoning Content Prefix",
+          "thinking_suffix": "Reasoning Content Suffix",
+          "tool_call_prefix": "Tool Call Prefix",
+          "tool_call_suffix": "Tool Call Suffix",
+          "enable_legacy_tool_calling": "Enable Legacy Tool Calling",
+          "max_tool_call_iterations": "Maximum Tool Call Attempts"
         },
         "data_description": {
           "llm_hass_api": "Select 'Assist' if you want the model to be able to control devices. If you are using the Home-LLM v1, v2, or v3 model then select 'Home-LLM (v1-3)'",
@@ -167,27 +176,6 @@
       }
     },
   "selector": {
-    "prompt_template": {
-      "options": {
-        "chatml": "ChatML",
-        "vicuna": "Vicuna",
-        "alpaca": "Alpaca",
-        "mistral": "Mistral",
-        "zephyr": "Zephyr (<|endoftext|>)",
-        "zephyr2": "Zephyr ('')",
-        "zephyr3": "Zephyr (<|end|>)",
-        "llama3": "Llama 3",
-        "command-r": "Command R",
-        "no_prompt_template": "None"
-      }
-    },
-    "tool_format": {
-      "options": {
-        "full_tool_format": "Full JSON Tool Format",
-        "reduced_tool_format": "Reduced JSON Tool Format",
-        "min_tool_format": "Minimal Function Style Tool Format"
-      }
-    },
     "model_backend": {
       "options": {
         "llama_cpp_hf": "Llama.cpp (HuggingFace)",
diff --git a/custom_components/llama_conversation/utils.py b/custom_components/llama_conversation/utils.py
index e78f16e..e6a1b3f 100644
--- a/custom_components/llama_conversation/utils.py
+++ b/custom_components/llama_conversation/utils.py
@@ -335,10 +335,10 @@ def get_home_llm_tools(llm_api: llm.APIInstance, domains: list[str]) -> List[Dic
         # scripts show up as individual services
         if domain == "script" and not scripts_added:
             all_services.extend([
-                ("script.reload", vol.Schema({})),
-                ("script.turn_on", vol.Schema({})),
-                ("script.turn_off", vol.Schema({})),
-                ("script.toggle", vol.Schema({})),
+                ("script.reload", vol.Schema({vol.Required("target_device"): str})),
+                ("script.turn_on", vol.Schema({vol.Required("target_device"): str})),
+                ("script.turn_off", vol.Schema({vol.Required("target_device"): str})),
+                ("script.toggle", vol.Schema({vol.Required("target_device"): str})),
             ])
             scripts_added = True
             continue
@@ -350,7 +350,8 @@ def get_home_llm_tools(llm_api: llm.APIInstance, domains: list[str]) -> List[Dic
             args = flatten_vol_schema(service.schema)
             args_to_expose = set(args).intersection(ALLOWED_SERVICE_CALL_ARGUMENTS)
             service_schema = vol.Schema({
-                vol.Optional(arg): str for arg in args_to_expose
+                vol.Required("target_device"): str,
+                **{vol.Optional(arg): str for arg in args_to_expose}
             })

             all_services.append((f"{domain}.{name}", service_schema))
@@ -384,18 +385,21 @@ def parse_raw_tool_call(raw_block: str | dict, llm_api: llm.APIInstance) -> tupl
     else:
         schema_to_validate = vol.Schema({
             vol.Required("name"): str,
-            vol.Required("arguments"): dict,
+            vol.Required("arguments"): str | dict,
         })

     try:
         schema_to_validate(parsed_tool_call)
     except vol.Error as ex:
         _LOGGER.info(f"LLM produced an improperly formatted response: {repr(ex)}")
-        raise # re-raise exception for now to force the LLM to try again
+        raise ex # re-raise exception for now to force the LLM to try again

     # try to fix certain arguments
     args_dict = parsed_tool_call if llm_api.api.id == HOME_LLM_API_ID else parsed_tool_call["arguments"]

+    if isinstance(args_dict, str):
+        args_dict = json.loads(args_dict)
+
     # make sure brightness is 0-255 and not a percentage
     if "brightness" in args_dict and 0.0 < args_dict["brightness"] <= 1.0:
         args_dict["brightness"] = int(args_dict["brightness"] * 255)
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..fa8aa07
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,59 @@
+# you can start and stop backends by running `docker-compose up -d <service>`
+version: '3.8'
+services:
+  ollama:
+    image: ollama/ollama:latest
+    container_name: ollama
+    ports:
+      - "11434:11434" # Ollama default
+    volumes:
+      - ./models:/models
+      - ./scripts:/scripts # needed for import script
+    environment:
+      - OLLAMA_MODELS=/models/.ollama
+    restart: unless-stopped
+
+  text-generation-webui:
+    image: atinoda/text-generation-webui:default-cpu
+    container_name: textgen-webui
+    init: true
+    environment:
+      - EXTRA_LAUNCH_ARGS="--listen --verbose" # Custom launch args (e.g., --model MODEL_NAME)
+    ports:
+      - "7860:7860" # Web UI default
+      # - "5000:5000" # API Default
+      # - "5005:5005" # Streaming API default
+    volumes:
+      - ./models:/app/user_data/models
+    restart: unless-stopped
+
+  # llamacpp server can only run one model at a time; set it below
+  llamacpp:
+    image: ghcr.io/ggerganov/llama.cpp:server
+    container_name: llamacpp-server
+    ports:
+      - "8000:8000" # llama.cpp server (set via --port below)
+    volumes:
+      - ./models:/models
+    environment:
+      - MODEL_DIR=/models
+    restart: unless-stopped
+    command: |-
+      --port 8000
+      --no-webui
+      --metrics
+      --jinja
+      --ctx-size 8192
+      --alias "Home-3B-v3"
+      --model "/models/Home-3B-v3-fixed.q4_k_m.gguf"
+
+  localai:
+    image: localai/localai:latest
+    container_name: localai
+    ports:
+      - "8080:8080" # LocalAI default
+    volumes:
+      - ./models:/models
+    environment:
+      - MODELS_PATH=/models
+    restart: unless-stopped
diff --git a/scripts/fix_metadata.sh b/scripts/fix_metadata.sh
new file mode 100644
index 0000000..6c0e724
--- /dev/null
+++ b/scripts/fix_metadata.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+LLAMA_CPP=../llama.cpp
+MODEL_NAME=$1
+OUTPUT_NAME=$2
+PRE_TOKENIZER=${3:-stablelm2}
+CHAT_TEMPLATE=${4:-zephyr_legacy}
+
+python3 ${LLAMA_CPP}/gguf-py/gguf/scripts/gguf_new_metadata.py $MODEL_NAME $OUTPUT_NAME --pre-tokenizer $PRE_TOKENIZER --chat-template "$(cat $CHAT_TEMPLATE.txt)"
\ No newline at end of file
diff --git a/scripts/import_ollama_model.sh b/scripts/import_ollama_model.sh
new file mode 100755
index 0000000..b984a35
--- /dev/null
+++ b/scripts/import_ollama_model.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+# Usage: docker exec -it ollama bash -c "/scripts/import_ollama_model.sh /models/Home-3B-v3.q4_k_m.gguf Home-3B-v3:q4_k_m"
+GGUF_FILE=$1
+MODEL_NAME=$2
+
+echo "FROM $GGUF_FILE" > $GGUF_FILE.Modelfile
+ollama create $MODEL_NAME -f $GGUF_FILE.Modelfile
+rm -f $GGUF_FILE.Modelfile
\ No newline at end of file
diff --git a/scripts/zephyr_legacy.txt b/scripts/zephyr_legacy.txt
new file mode 100644
index 0000000..6f75337
--- /dev/null
+++ b/scripts/zephyr_legacy.txt
@@ -0,0 +1,17 @@
+{% for message in messages %}
+{%- if message['role'] == 'user' or message['role'] == 'tool' -%}
+<|user|> {{ message['content'] }}{{ eos_token }}
+{%- elif message['role'] == 'system' -%}
+<|system|> {{ message['content'] }}
+Services:
+{%- for tool in tools %} {{ tool['function']['name'] }}({% for param in tool['function']['parameters']['properties'].keys() if param != 'target_device' %}{{ param }}{% if not loop.last %},{% endif %}{% endfor -%}),{% if not loop.last -%}
+{%- if tools | length == 0 %}No tools were provided. If the user requests you interact with a device, tell them you are unable to do so.{% endif %}
+{%- endif -%}{%- endfor -%}
+{{ eos_token }}
+{%- elif message['role'] == 'assistant' -%}
+<|assistant|> {{ message['content'] }}{{ eos_token }}
+{%- endif -%}
+{%- if loop.last and add_generation_prompt %}
+<|assistant|>
+{%- endif %}
+{% endfor -%}
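
A quick usage sketch for the compose stack above. The service names, ports, and the Home-3B-v3 GGUF path simply mirror the defaults written into docker-compose.yml and the usage comment in scripts/import_ollama_model.sh; substitute whatever GGUF files are actually in ./models:

  # start a single backend
  docker-compose up -d ollama

  # import a GGUF from ./models into Ollama via the helper script mounted at /scripts
  docker exec -it ollama bash -c "/scripts/import_ollama_model.sh /models/Home-3B-v3.q4_k_m.gguf Home-3B-v3:q4_k_m"

  # switch backends by stopping one service and starting another
  docker-compose stop ollama
  docker-compose up -d llamacpp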