add docker compose stack for testing + backends are mostly working at this point

Author: Alex O'Connell
Date: 2025-09-15 22:10:25 -04:00
parent 9b8baeed07
commit 1425413fc9
12 changed files with 203 additions and 105 deletions

15
TODO.md

@@ -1,8 +1,9 @@
 # TODO
-- [ ] proper tool calling support
+- [x] proper tool calling support
 - [ ] fix old GGUFs to support tool calling
 - [ ] home assistant component text streaming support
-- [ ] new models based on qwen3
+- [ ] new model based on qwen3 0.6b
+- [ ] new model based on gemma3 270m
 - [ ] support AI task API
 - [x] support new LLM APIs
   - rewrite how services are called
@@ -42,6 +43,16 @@
 - [x] use varied system prompts to add behaviors
+## v0.4 TODO for release:
+[ ] re-order the settings on the options config flow page. the order is very confusing
+[ ] split out entity functionality so we can support conversation + ai tasks
+[x] fix icl examples to match new tool calling syntax config
+[x] set up docker-compose for running all of the various backends
+[ ] fix and re-upload all compatible old models (+ upload all original safetensors)
+[ ] move llamacpp to a separate process because of all the crashing
+[ ] dedicated localai backend (tailored openai variant /w model loading)
+[ ] fix the openai responses backend
 ## more complicated ideas
 - [ ] "context requests"
   - basically just let the model decide what RAG/extra context it wants

View File

@@ -26,6 +26,7 @@ from custom_components.llama_conversation.const import (
     CONF_REMEMBER_CONVERSATION,
     CONF_REMEMBER_CONVERSATION_TIME_MINUTES,
     CONF_GENERIC_OPENAI_PATH,
+    CONF_ENABLE_LEGACY_TOOL_CALLING,
     DEFAULT_MAX_TOKENS,
     DEFAULT_TEMPERATURE,
     DEFAULT_TOP_P,
@@ -33,6 +34,7 @@ from custom_components.llama_conversation.const import (
     DEFAULT_REMEMBER_CONVERSATION,
     DEFAULT_REMEMBER_CONVERSATION_TIME_MINUTES,
     DEFAULT_GENERIC_OPENAI_PATH,
+    DEFAULT_ENABLE_LEGACY_TOOL_CALLING,
 )
 from custom_components.llama_conversation.conversation import LocalLLMAgent, TextGenerationResult
@@ -63,6 +65,7 @@ class GenericOpenAIAPIAgent(LocalLLMAgent):
         temperature = self.entry.options.get(CONF_TEMPERATURE, DEFAULT_TEMPERATURE)
         top_p = self.entry.options.get(CONF_TOP_P, DEFAULT_TOP_P)
         timeout = self.entry.options.get(CONF_REQUEST_TIMEOUT, DEFAULT_REQUEST_TIMEOUT)
+        enable_legacy_tool_calling = self.entry.options.get(CONF_ENABLE_LEGACY_TOOL_CALLING, DEFAULT_ENABLE_LEGACY_TOOL_CALLING)

         endpoint, additional_params = self._chat_completion_params()
         messages = get_oai_formatted_messages(conversation)
@@ -77,7 +80,9 @@ class GenericOpenAIAPIAgent(LocalLLMAgent):
         }

         tools = None
-        if llm_api:
+        # "legacy" tool calling passes the tools directly as part of the system prompt instead of as "tools"
+        # most local backends absolutely butcher any sort of prompt formatting when using tool calling
+        if llm_api and not enable_legacy_tool_calling:
             tools = get_oai_formatted_tools(llm_api, self._async_get_all_exposed_domains())
             request_params["tools"] = tools
@@ -103,22 +108,19 @@ class GenericOpenAIAPIAgent(LocalLLMAgent):
                ) as response:
                    response.raise_for_status()

                    async for line_bytes in response.content:
-                       chunk = line_bytes.decode("utf-8").strip().removeprefix("data: ")
+                       raw_line = line_bytes.decode("utf-8").strip()
+                       if raw_line.startswith("error: "):
+                           raise Exception(f"Error from server: {raw_line}")
+                       chunk = raw_line.removeprefix("data: ")
                        if "[DONE]" in chunk:
                            break
                        if chunk and chunk.strip():
                            yield self._extract_response(json.loads(chunk), llm_api)
            except asyncio.TimeoutError as err:
                raise HomeAssistantError("The generation request timed out! Please check your connection settings, increase the timeout in settings, or decrease the number of exposed entities.") from err
            except aiohttp.ClientError as err:
                raise HomeAssistantError(f"Failed to communicate with the API! {err}") from err
-           except Exception as err:
-               _LOGGER.debug(f"Err was: {err}")
-               _LOGGER.debug(f"Request was: {request_params}")
-               _LOGGER.debug(f"Result was: {response}")
-               _LOGGER.debug(f"Chunk was {chunk}")
-               raise HomeAssistantError(f"An unknown error occurred! {err}") from err

        return self._async_parse_completion(llm_api, anext_token=anext_token())
@@ -159,8 +161,6 @@ class GenericOpenAIAPIAgent(LocalLLMAgent):
        if choice["finish_reason"] == "length" or choice["finish_reason"] == "content_filter":
            _LOGGER.warning("Model response did not end on a stop token (unfinished sentence)")
-       _LOGGER.debug("Model chunk '%s'", response_text)

        return response_text, tool_calls
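For reference, a minimal standalone sketch of the streaming parse above (hypothetical helper names; the real logic lives in the agent's nested async generator):

```python
# Minimal sketch of the SSE handling above, assuming OpenAI-style
# "data: {...}" lines, an explicit "error: ..." line from the server,
# and a "[DONE]" sentinel. Names here are illustrative only.
import json
from typing import Any, Iterator

def parse_sse_lines(lines: Iterator[bytes]) -> Iterator[dict[str, Any]]:
    for line_bytes in lines:
        raw_line = line_bytes.decode("utf-8").strip()
        if raw_line.startswith("error: "):
            raise RuntimeError(f"Error from server: {raw_line}")
        chunk = raw_line.removeprefix("data: ")
        if "[DONE]" in chunk:
            break
        if chunk and chunk.strip():
            yield json.loads(chunk)

# example with a canned stream
fake_stream = [
    b'data: {"choices": [{"delta": {"content": "Hello"}}]}',
    b"",  # keep-alive blank line is skipped
    b"data: [DONE]",
]
for parsed in parse_sse_lines(iter(fake_stream)):
    print(parsed["choices"][0]["delta"]["content"])  # -> Hello
```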

View File

@@ -80,11 +80,11 @@ class TextGenerationWebuiAgent(GenericOpenAIAPIAgent):
            _LOGGER.debug("Connection error was: %s", repr(ex))
            raise ConfigEntryNotReady("There was a problem connecting to the remote server") from ex

-    def _chat_completion_params(self, conversation: List[Dict[str, str]]) -> Tuple[str, Dict]:
+    def _chat_completion_params(self) -> Tuple[str, Dict[str, Any]]:
         preset = self.entry.options.get(CONF_TEXT_GEN_WEBUI_PRESET)
         chat_mode = self.entry.options.get(CONF_TEXT_GEN_WEBUI_CHAT_MODE, DEFAULT_TEXT_GEN_WEBUI_CHAT_MODE)

-        endpoint, request_params = super()._chat_completion_params(conversation)
+        endpoint, request_params = super()._chat_completion_params()

         request_params["mode"] = chat_mode
         if chat_mode == TEXT_GEN_WEBUI_CHAT_MODE_CHAT or chat_mode == TEXT_GEN_WEBUI_CHAT_MODE_CHAT_INSTRUCT:
@@ -98,37 +98,6 @@ class TextGenerationWebuiAgent(GenericOpenAIAPIAgent):
         return endpoint, request_params

-    def _completion_params(self, conversation: List[Dict[str, str]]) -> Tuple[str, Dict[str, Any]]:
-        preset = self.entry.options.get(CONF_TEXT_GEN_WEBUI_PRESET)
-
-        endpoint, request_params = super()._completion_params(conversation)
-
-        if preset:
-            request_params["preset"] = preset
-
-        request_params["truncation_length"] = self.entry.options.get(CONF_CONTEXT_LENGTH, DEFAULT_CONTEXT_LENGTH)
-        request_params["top_k"] = self.entry.options.get(CONF_TOP_K, DEFAULT_TOP_K)
-        request_params["min_p"] = self.entry.options.get(CONF_MIN_P, DEFAULT_MIN_P)
-        request_params["typical_p"] = self.entry.options.get(CONF_TYPICAL_P, DEFAULT_TYPICAL_P)
-
-        return endpoint, request_params
-
-    def _extract_response(self, response_json: dict) -> TextGenerationResult:
-        choices = response_json["choices"]
-        if choices[0]["finish_reason"] != "stop":
-            _LOGGER.warning("Model response did not end on a stop token (unfinished sentence)")
-
-        context_len = self.entry.options.get(CONF_CONTEXT_LENGTH, DEFAULT_CONTEXT_LENGTH)
-        max_tokens = self.entry.options.get(CONF_MAX_TOKENS, DEFAULT_MAX_TOKENS)
-        if response_json["usage"]["prompt_tokens"] + max_tokens > context_len:
-            self._warn_context_size()
-
-        # text-gen-webui has a typo where it is 'chat.completions' not 'chat.completion'
-        if response_json["object"] == "chat.completions":
-            return choices[0]["message"]["content"]
-        else:
-            return choices[0]["text"]

 class LlamaCppServerAgent(GenericOpenAIAPIAgent):
     grammar: str
@@ -152,8 +121,4 @@ class LlamaCppServerAgent(GenericOpenAIAPIAgent):
         if self.entry.options.get(CONF_USE_GBNF_GRAMMAR, DEFAULT_USE_GBNF_GRAMMAR):
             request_params["grammar"] = self.grammar

-        # force usage of COMMON_CHAT_TOOL_CHOICE_NONE so it returns raw content and then parse ourself when using
-        # the custom home llm tool call syntax. otherwise let the server detect it automatically
-        request_params["tool_choice"] = "none"
-
         return endpoint, request_params
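For context, the dropped override relates to standard OpenAI-style `tool_choice` semantics: forcing `"none"` makes the server return plain assistant text (which the integration then parses for its own tool-call syntax), while omitting it lets the server emit structured `tool_calls`. A rough sketch of the two request shapes (illustrative values, not the component's exact payload):

```python
# Illustrative only: the two request shapes against an OpenAI-compatible
# llama.cpp server. Values are placeholders, not the component's payload.
base_request = {
    "model": "Home-3B-v3",
    "messages": [{"role": "user", "content": "turn off the porch light"}],
    "tools": [{
        "type": "function",
        "function": {
            "name": "light.turn_off",
            "parameters": {
                "type": "object",
                "properties": {"target_device": {"type": "string"}},
            },
        },
    }],
}

old_style = {**base_request, "tool_choice": "none"}  # raw text; parse the custom syntax ourselves
new_style = dict(base_request)                       # let the server detect and return tool_calls
```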

View File

@@ -34,8 +34,6 @@ from homeassistant.helpers.selector import (
     BooleanSelector,
     BooleanSelectorConfig,
 )
-from homeassistant.util.package import is_installed
-from importlib.metadata import version

 from .utils import download_model_from_hf, get_llama_cpp_python_version, install_llama_cpp_python, format_url, MissingQuantizationException
 from .const import (
@@ -58,6 +56,7 @@ from .const import (
     CONF_THINKING_SUFFIX,
     CONF_TOOL_CALL_PREFIX,
     CONF_TOOL_CALL_SUFFIX,
+    CONF_ENABLE_LEGACY_TOOL_CALLING,
     CONF_ENABLE_FLASH_ATTENTION,
     CONF_USE_GBNF_GRAMMAR,
     CONF_GBNF_GRAMMAR_FILE,
@@ -107,6 +106,7 @@ from .const import (
     DEFAULT_THINKING_SUFFIX,
     DEFAULT_TOOL_CALL_PREFIX,
     DEFAULT_TOOL_CALL_SUFFIX,
+    DEFAULT_ENABLE_LEGACY_TOOL_CALLING,
     DEFAULT_ENABLE_FLASH_ATTENTION,
     DEFAULT_USE_GBNF_GRAMMAR,
     DEFAULT_GBNF_GRAMMAR_FILE,
@@ -1068,6 +1068,11 @@ def local_llama_config_option_schema(hass: HomeAssistant, options: MappingProxyT
                 description={"suggested_value": options.get(CONF_REQUEST_TIMEOUT)},
                 default=DEFAULT_REQUEST_TIMEOUT,
             ): NumberSelector(NumberSelectorConfig(min=5, max=900, step=1, unit_of_measurement=UnitOfTime.SECONDS, mode=NumberSelectorMode.BOX)),
+            vol.Required(
+                CONF_ENABLE_LEGACY_TOOL_CALLING,
+                description={"suggested_value": options.get(CONF_ENABLE_LEGACY_TOOL_CALLING)},
+                default=DEFAULT_ENABLE_LEGACY_TOOL_CALLING
+            ): bool,
         })
     elif backend_type in BACKEND_TYPE_GENERIC_OPENAI_RESPONSES:
         del result[CONF_REMEMBER_NUM_INTERACTIONS]
@@ -1127,6 +1132,11 @@ def local_llama_config_option_schema(hass: HomeAssistant, options: MappingProxyT
                 description={"suggested_value": options.get(CONF_REQUEST_TIMEOUT)},
                 default=DEFAULT_REQUEST_TIMEOUT,
             ): NumberSelector(NumberSelectorConfig(min=5, max=900, step=1, unit_of_measurement=UnitOfTime.SECONDS, mode=NumberSelectorMode.BOX)),
+            vol.Required(
+                CONF_ENABLE_LEGACY_TOOL_CALLING,
+                description={"suggested_value": options.get(CONF_ENABLE_LEGACY_TOOL_CALLING)},
+                default=DEFAULT_ENABLE_LEGACY_TOOL_CALLING
+            ): bool,
         })
     elif backend_type == BACKEND_TYPE_OLLAMA:
         result = insert_after_key(result, CONF_MAX_TOKENS, {
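As a rough illustration of the pattern above (voluptuous only; `insert_after_key`, the selector classes, and the exact schema wiring are the component's own):

```python
# Sketch of registering a boolean option with a default and reading it back,
# assuming constants that mirror the ones added in this commit.
import voluptuous as vol

CONF_ENABLE_LEGACY_TOOL_CALLING = "enable_legacy_tool_calling"
DEFAULT_ENABLE_LEGACY_TOOL_CALLING = False

OPTIONS_SCHEMA = vol.Schema({
    vol.Required(
        CONF_ENABLE_LEGACY_TOOL_CALLING,
        default=DEFAULT_ENABLE_LEGACY_TOOL_CALLING,
    ): bool,
})

options = OPTIONS_SCHEMA({})  # defaults applied
assert options[CONF_ENABLE_LEGACY_TOOL_CALLING] is False
```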

View File

@@ -76,13 +76,13 @@ ICL_EXTRAS = """
 {% for item in response_examples %}
 {{ item.request }}
 {{ item.response }}
-<functioncall> {{ item.tool | to_json }}
+{{ tool_call_prefix }}{{ item.tool | to_json }}{{ tool_call_suffix }}
 {% endfor %}"""

 ICL_NO_SYSTEM_PROMPT_EXTRAS = """
 {% for item in response_examples %}
 {{ item.request }}
 {{ item.response }}
-<functioncall> {{ item.tool | to_json }}
+{{ tool_call_prefix }}{{ item.tool | to_json }}{{ tool_call_suffix }}
 {% endfor %}
 <user_instruction>:"""

 DEFAULT_PROMPT = DEFAULT_PROMPT_BASE + ICL_EXTRAS
@@ -137,6 +137,8 @@ CONF_TOOL_CALL_PREFIX = "tool_call_prefix"
 DEFAULT_TOOL_CALL_PREFIX = "<tool_call>"
 CONF_TOOL_CALL_SUFFIX = "tool_call_suffix"
 DEFAULT_TOOL_CALL_SUFFIX = "</tool_call>"
+CONF_ENABLE_LEGACY_TOOL_CALLING = "enable_legacy_tool_calling"
+DEFAULT_ENABLE_LEGACY_TOOL_CALLING = False
 CONF_ENABLE_FLASH_ATTENTION = "enable_flash_attention"
 DEFAULT_ENABLE_FLASH_ATTENTION = False
 CONF_USE_GBNF_GRAMMAR = "gbnf_grammar"
@@ -179,7 +181,6 @@ CONF_GENERIC_OPENAI_PATH = "openai_path"
 DEFAULT_GENERIC_OPENAI_PATH = "v1"
 CONF_GENERIC_OPENAI_VALIDATE_MODEL = "openai_validate_model"
 DEFAULT_GENERIC_OPENAI_VALIDATE_MODEL = True
 CONF_CONTEXT_LENGTH = "context_length"
 DEFAULT_CONTEXT_LENGTH = 2048
 CONF_BATCH_SIZE = "batch_size"
@@ -228,6 +229,7 @@ OPTIONS_OVERRIDES = {
         CONF_TOOL_CALL_SUFFIX: "```",
         CONF_CONTEXT_LENGTH: 131072,
         CONF_MAX_TOOL_CALL_ITERATIONS: 1,
+        CONF_ENABLE_LEGACY_TOOL_CALLING: True
     },
     "home-3b-v3": {
         CONF_PROMPT: DEFAULT_PROMPT_BASE_LEGACY,
@@ -235,6 +237,7 @@ OPTIONS_OVERRIDES = {
         CONF_TOOL_CALL_PREFIX: "```homeassistant",
         CONF_TOOL_CALL_SUFFIX: "```",
         CONF_MAX_TOOL_CALL_ITERATIONS: 1,
+        CONF_ENABLE_LEGACY_TOOL_CALLING: True
     },
     "home-3b-v2": {
         CONF_PROMPT: DEFAULT_PROMPT_BASE_LEGACY,
@@ -242,6 +245,7 @@ OPTIONS_OVERRIDES = {
         CONF_TOOL_CALL_PREFIX: "```homeassistant",
         CONF_TOOL_CALL_SUFFIX: "```",
         CONF_MAX_TOOL_CALL_ITERATIONS: 1,
+        CONF_ENABLE_LEGACY_TOOL_CALLING: True
     },
     "home-3b-v1": {
         CONF_PROMPT: DEFAULT_PROMPT_BASE_LEGACY,
@@ -249,6 +253,7 @@ OPTIONS_OVERRIDES = {
         CONF_TOOL_CALL_PREFIX: "```homeassistant",
         CONF_TOOL_CALL_SUFFIX: "```",
         CONF_MAX_TOOL_CALL_ITERATIONS: 1,
+        CONF_ENABLE_LEGACY_TOOL_CALLING: True
     },
     "home-1b-v3": {
         CONF_PROMPT: DEFAULT_PROMPT_BASE_LEGACY,
@@ -256,6 +261,7 @@ OPTIONS_OVERRIDES = {
         CONF_TOOL_CALL_PREFIX: "```homeassistant",
         CONF_TOOL_CALL_SUFFIX: "```",
         CONF_MAX_TOOL_CALL_ITERATIONS: 1,
+        CONF_ENABLE_LEGACY_TOOL_CALLING: True
     },
     "home-1b-v2": {
         CONF_PROMPT: DEFAULT_PROMPT_BASE_LEGACY,
@@ -263,6 +269,7 @@ OPTIONS_OVERRIDES = {
         CONF_TOOL_CALL_PREFIX: "```homeassistant",
         CONF_TOOL_CALL_SUFFIX: "```",
         CONF_MAX_TOOL_CALL_ITERATIONS: 1,
+        CONF_ENABLE_LEGACY_TOOL_CALLING: True
     },
     "home-1b-v1": {
         CONF_PROMPT: DEFAULT_PROMPT_BASE_LEGACY,
@@ -270,6 +277,7 @@ OPTIONS_OVERRIDES = {
         CONF_TOOL_CALL_PREFIX: "```homeassistant",
         CONF_TOOL_CALL_SUFFIX: "```",
         CONF_MAX_TOOL_CALL_ITERATIONS: 1,
+        CONF_ENABLE_LEGACY_TOOL_CALLING: True
     },
     "mistral": {
         CONF_PROMPT: DEFAULT_PROMPT_BASE + ICL_NO_SYSTEM_PROMPT_EXTRAS,
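Since the ICL examples above now render the configurable prefix/suffix instead of a hard-coded `<functioncall>` tag, the same example can be formatted for either the new `<tool_call>` syntax or the legacy fenced syntax. A standalone Jinja2 sketch (the component registers its own `to_json` filter; the built-in `tojson` stands in here):

```python
# Standalone Jinja2 sketch of one rendered ICL example, first with the default
# <tool_call> prefix/suffix and then with the legacy "```homeassistant" override.
from jinja2 import Template

example = Template(
    "{{ item.request }}\n"
    "{{ item.response }}\n"
    "{{ tool_call_prefix }}{{ item.tool | tojson }}{{ tool_call_suffix }}"
)
item = {
    "request": "turn on the kitchen light",
    "response": "Turning on the kitchen light for you.",
    "tool": {"name": "light.turn_on", "arguments": {"target_device": "light.kitchen"}},
}

print(example.render(item=item, tool_call_prefix="<tool_call>", tool_call_suffix="</tool_call>"))
print(example.render(item=item, tool_call_prefix="```homeassistant\n", tool_call_suffix="\n```"))
```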

View File

@@ -22,7 +22,7 @@ from homeassistant.helpers import config_validation as cv, intent, template, ent
 from homeassistant.helpers.entity_platform import AddConfigEntryEntitiesCallback
 from homeassistant.util import color

-from .utils import closest_color, parse_raw_tool_call
+from .utils import closest_color, parse_raw_tool_call, flatten_vol_schema
 from .const import (
     CONF_CHAT_MODEL,
     CONF_PROMPT,
@@ -40,6 +40,7 @@ from .const import (
     CONF_THINKING_SUFFIX,
     CONF_TOOL_CALL_PREFIX,
     CONF_TOOL_CALL_SUFFIX,
+    CONF_ENABLE_LEGACY_TOOL_CALLING,
     DEFAULT_PROMPT,
     DEFAULT_BACKEND_TYPE,
     DEFAULT_EXTRA_ATTRIBUTES_TO_EXPOSE,
@@ -58,6 +59,7 @@ from .const import (
     DEFAULT_THINKING_SUFFIX,
     DEFAULT_TOOL_CALL_PREFIX,
     DEFAULT_TOOL_CALL_SUFFIX,
+    DEFAULT_ENABLE_LEGACY_TOOL_CALLING,
 )

 _LOGGER = logging.getLogger(__name__)
@@ -82,8 +84,6 @@ async def update_listener(hass: HomeAssistant, entry: ConfigEntry):
     agent: LocalLLMAgent = entry.runtime_data
     await hass.async_add_executor_job(agent._update_options)

-    return True

 async def async_setup_entry(hass: HomeAssistant, entry: ConfigEntry, async_add_entities: AddConfigEntryEntitiesCallback) -> bool:
     """Set up Local LLM Conversation from a config entry."""
@@ -427,15 +427,18 @@ class LocalLLMAgent(ConversationEntity, AbstractConversationAgent):
            elif tool_suffix in potential_block and in_tool_call:
                in_tool_call = False

-               tool_call, to_say = parse_raw_tool_call(tool_content.strip().removeprefix(tool_prefix).removesuffix(tool_suffix), llm_api)
-               _LOGGER.debug("Tool call parsed: %s", tool_call)
-               if tool_call:
-                   result.tool_calls = [tool_call]
-               if to_say:
-                   content = to_say
-               else:
-                   content = None
+               if not llm_api:
+                   _LOGGER.warning("Model attempted to call a tool but no LLM API was provided, ignoring tool calls")
+               else:
+                   tool_call, to_say = parse_raw_tool_call(tool_content.strip().removeprefix(tool_prefix).removesuffix(tool_suffix), llm_api)
+                   _LOGGER.debug("Tool call parsed: %s", tool_call)
+                   if tool_call:
+                       result.tool_calls = [tool_call]
+                   if to_say:
+                       content = to_say
+                   else:
+                       content = None

                result.response = content
@@ -463,9 +466,9 @@ class LocalLLMAgent(ConversationEntity, AbstractConversationAgent):
        return list(domains)

-    def _async_get_exposed_entities(self) -> dict[str, str]:
+    def _async_get_exposed_entities(self) -> dict[str, dict]:
         """Gather exposed entity states"""
-        entity_states = {}
+        entity_states: dict[str, dict] = {}
         entity_registry = er.async_get(self.hass)
         device_registry = dr.async_get(self.hass)
         area_registry = ar.async_get(self.hass)
@@ -577,10 +580,12 @@ class LocalLLMAgent(ConversationEntity, AbstractConversationAgent):
         """Generate the system prompt with current entity states"""
         entities_to_expose = self._async_get_exposed_entities()

-        extra_attributes_to_expose = self.entry.options \
-            .get(CONF_EXTRA_ATTRIBUTES_TO_EXPOSE, DEFAULT_EXTRA_ATTRIBUTES_TO_EXPOSE)
+        extra_attributes_to_expose = self.entry.options.get(CONF_EXTRA_ATTRIBUTES_TO_EXPOSE, DEFAULT_EXTRA_ATTRIBUTES_TO_EXPOSE)
+        enable_legacy_tool_calling = self.entry.options.get(CONF_ENABLE_LEGACY_TOOL_CALLING, DEFAULT_ENABLE_LEGACY_TOOL_CALLING)
+        tool_call_prefix = self.entry.options.get(CONF_TOOL_CALL_PREFIX, DEFAULT_TOOL_CALL_PREFIX)
+        tool_call_suffix = self.entry.options.get(CONF_TOOL_CALL_SUFFIX, DEFAULT_TOOL_CALL_SUFFIX)

-        def expose_attributes(attributes) -> list[str]:
+        def expose_attributes(attributes: dict[str, Any]) -> list[str]:
             result = []
             for attribute_name in extra_attributes_to_expose:
                 if attribute_name not in attributes:
@@ -645,9 +650,23 @@ class LocalLLMAgent(ConversationEntity, AbstractConversationAgent):
         render_variables = {
             "devices": devices,
             "formatted_devices": formatted_devices,
-            "response_examples": []
+            "response_examples": [],
+            "tool_call_prefix": tool_call_prefix,
+            "tool_call_suffix": tool_call_suffix,
         }

+        if enable_legacy_tool_calling:
+            if llm_api:
+                tools = []
+                for tool in llm_api.tools:
+                    tools.append(f"{tool.name}({','.join(flatten_vol_schema(tool.parameters))})")
+                render_variables["tools"] = tools
+                render_variables["formatted_tools"] = ", ".join(tools)
+            else:
+                message = "No tools were provided. If the user requests you interact with a device, tell them you are unable to do so."
+                render_variables["tools"] = [message]
+                render_variables["formatted_tools"] = message

         # only pass examples if there are loaded examples + an API was exposed
         if self.in_context_examples and llm_api:
             num_examples = int(self.entry.options.get(CONF_NUM_IN_CONTEXT_EXAMPLES, DEFAULT_NUM_IN_CONTEXT_EXAMPLES))
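With legacy tool calling enabled, the prompt receives a flat `name(arg1,arg2,...)` listing of the exposed tools rather than native `tools` parameters. A standalone sketch of that formatting step (a simplified `flatten_vol_schema` and plain objects stand in for Home Assistant's `llm_api.tools`):

```python
# Sketch of the legacy tool listing built above; a simplified
# flatten_vol_schema and a small stand-in class replace llm_api.tools.
from dataclasses import dataclass
import voluptuous as vol

def flatten_vol_schema(schema: vol.Schema) -> list[str]:
    """Simplified: return only the top-level key names of a Schema."""
    return [str(key) for key in schema.schema]

@dataclass
class FakeTool:
    name: str
    parameters: vol.Schema

llm_api_tools = [
    FakeTool("light.turn_on", vol.Schema({
        vol.Required("target_device"): str,
        vol.Optional("brightness"): str,
    })),
    FakeTool("script.turn_off", vol.Schema({vol.Required("target_device"): str})),
]

tools = [f"{tool.name}({','.join(flatten_vol_schema(tool.parameters))})" for tool in llm_api_tools]
print(", ".join(tools))
# light.turn_on(target_device,brightness), script.turn_off(target_device)
```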

View File

@@ -92,7 +92,13 @@
         "context_length": "Context Length",
         "batch_size": "Batch Size",
         "n_threads": "Thread Count",
-        "n_batch_threads": "Batch Thread Count"
+        "n_batch_threads": "Batch Thread Count",
+        "thinking_prefix": "Reasoning Content Prefix",
+        "thinking_suffix": "Reasoning Content Suffix",
+        "tool_call_prefix": "Tool Call Prefix",
+        "tool_call_suffix": "Tool Call Suffix",
+        "enable_legacy_tool_calling": "Enable Legacy Tool Calling",
+        "max_tool_call_iterations": "Maximum Tool Call Attempts"
       },
       "data_description": {
         "llm_hass_api": "Select 'Assist' if you want the model to be able to control devices. If you are using the Home-LLM v1, v2, or v3 model then select 'Home-LLM (v1-3)'",
@@ -115,9 +121,6 @@
         "llm_hass_api": "Selected LLM API",
         "max_new_tokens": "Maximum tokens to return in response",
         "prompt": "System Prompt",
-        "prompt_template": "Prompt Format",
-        "tool_format": "Tool Format",
-        "tool_multi_turn_chat": "Multi-Turn Tool Use",
         "temperature": "Temperature",
         "top_k": "Top K",
         "top_p": "Top P",
@@ -147,7 +150,13 @@
         "context_length": "Context Length",
         "batch_size": "Batch Size",
         "n_threads": "Thread Count",
-        "n_batch_threads": "Batch Thread Count"
+        "n_batch_threads": "Batch Thread Count",
+        "thinking_prefix": "Reasoning Content Prefix",
+        "thinking_suffix": "Reasoning Content Suffix",
+        "tool_call_prefix": "Tool Call Prefix",
+        "tool_call_suffix": "Tool Call Suffix",
+        "enable_legacy_tool_calling": "Enable Legacy Tool Calling",
+        "max_tool_call_iterations": "Maximum Tool Call Attempts"
       },
       "data_description": {
         "llm_hass_api": "Select 'Assist' if you want the model to be able to control devices. If you are using the Home-LLM v1, v2, or v3 model then select 'Home-LLM (v1-3)'",
@@ -167,27 +176,6 @@
       }
     },
     "selector": {
-      "prompt_template": {
-        "options": {
-          "chatml": "ChatML",
-          "vicuna": "Vicuna",
-          "alpaca": "Alpaca",
-          "mistral": "Mistral",
-          "zephyr": "Zephyr (<|endoftext|>)",
-          "zephyr2": "Zephyr ('</s>')",
-          "zephyr3": "Zephyr (<|end|>)",
-          "llama3": "Llama 3",
-          "command-r": "Command R",
-          "no_prompt_template": "None"
-        }
-      },
-      "tool_format": {
-        "options": {
-          "full_tool_format": "Full JSON Tool Format",
-          "reduced_tool_format": "Reduced JSON Tool Format",
-          "min_tool_format": "Minimal Function Style Tool Format"
-        }
-      },
       "model_backend": {
         "options": {
           "llama_cpp_hf": "Llama.cpp (HuggingFace)",

View File

@@ -335,10 +335,10 @@ def get_home_llm_tools(llm_api: llm.APIInstance, domains: list[str]) -> List[Dic
         # scripts show up as individual services
         if domain == "script" and not scripts_added:
             all_services.extend([
-                ("script.reload", vol.Schema({})),
-                ("script.turn_on", vol.Schema({})),
-                ("script.turn_off", vol.Schema({})),
-                ("script.toggle", vol.Schema({})),
+                ("script.reload", vol.Schema({vol.Required("target_device"): str})),
+                ("script.turn_on", vol.Schema({vol.Required("target_device"): str})),
+                ("script.turn_off", vol.Schema({vol.Required("target_device"): str})),
+                ("script.toggle", vol.Schema({vol.Required("target_device"): str})),
             ])
             scripts_added = True
             continue
@@ -350,7 +350,8 @@ def get_home_llm_tools(llm_api: llm.APIInstance, domains: list[str]) -> List[Dic
             args = flatten_vol_schema(service.schema)
             args_to_expose = set(args).intersection(ALLOWED_SERVICE_CALL_ARGUMENTS)
             service_schema = vol.Schema({
-                vol.Optional(arg): str for arg in args_to_expose
+                vol.Required("target_device"): str,
+                **{vol.Optional(arg): str for arg in args_to_expose}
             })
             all_services.append((f"{domain}.{name}", service_schema))
@@ -384,18 +385,21 @@ def parse_raw_tool_call(raw_block: str | dict, llm_api: llm.APIInstance) -> tupl
     else:
         schema_to_validate = vol.Schema({
             vol.Required("name"): str,
-            vol.Required("arguments"): dict,
+            vol.Required("arguments"): str | dict,
         })

     try:
         schema_to_validate(parsed_tool_call)
     except vol.Error as ex:
         _LOGGER.info(f"LLM produced an improperly formatted response: {repr(ex)}")
-        raise # re-raise exception for now to force the LLM to try again
+        raise ex # re-raise exception for now to force the LLM to try again

     # try to fix certain arguments
     args_dict = parsed_tool_call if llm_api.api.id == HOME_LLM_API_ID else parsed_tool_call["arguments"]
+    if isinstance(args_dict, str):
+        args_dict = json.loads(args_dict)

     # make sure brightness is 0-255 and not a percentage
     if "brightness" in args_dict and 0.0 < args_dict["brightness"] <= 1.0:
         args_dict["brightness"] = int(args_dict["brightness"] * 255)

59
docker-compose.yml Normal file

@@ -0,0 +1,59 @@
# you can start and stop backends by running `docker-compose up -d <service name>`
version: '3.8'

services:
  ollama:
    image: ollama/ollama:latest
    container_name: ollama
    ports:
      - "11434:11434" # Ollama default
    volumes:
      - ./models:/models
      - ./scripts:/scripts # needed for import script
    environment:
      - OLLAMA_MODELS=/models/.ollama
    restart: unless-stopped

  text-generation-webui:
    image: atinoda/text-generation-webui:default-cpu
    container_name: textgen-webui
    init: true
    environment:
      - EXTRA_LAUNCH_ARGS="--listen --verbose" # Custom launch args (e.g., --model MODEL_NAME)
    ports:
      - "7860:7860" # Web UI default
      # - "5000:5000" # API Default
      # - "5005:5005" # Streaming API default
    volumes:
      - ./models:/app/user_data/models
    restart: unless-stopped

  # llamacpp server can only run one model at a time; set it below
  llamacpp:
    image: ghcr.io/ggerganov/llama.cpp:server
    container_name: llamacpp-server
    ports:
      - "8000:8000" # llama.cpp server default
    volumes:
      - ./models:/models
    environment:
      - MODEL_DIR=/models
    restart: unless-stopped
    command: |-
      --port 8000
      --no-webui
      --metrics
      --jinja
      --ctx-size 8192
      --alias "Home-3B-v3"
      --model "/models/Home-3B-v3-fixed.q4_k_m.gguf"

  localai:
    image: localai/localai:latest
    container_name: localai
    ports:
      - "8080:8080" # LocalAI default
    volumes:
      - ./models:/models
    environment:
      - MODELS_PATH=/models
    restart: unless-stopped
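Since the compose stack exists to exercise the integration against several backends at once, a quick reachability check can confirm which services are up before pointing the component at them. The ports below come from the compose file; the endpoint paths are common defaults for each server and may need adjusting:

```python
# Quick reachability check for the backends defined in docker-compose.yml.
# Ports come from the compose file above; paths are common defaults and
# may need adjusting for your setup.
import urllib.request

BACKENDS = {
    "ollama": "http://localhost:11434/api/tags",
    "text-generation-webui": "http://localhost:7860/",
    "llama.cpp server": "http://localhost:8000/health",
    "LocalAI": "http://localhost:8080/readyz",
}

for name, url in BACKENDS.items():
    try:
        with urllib.request.urlopen(url, timeout=5) as resp:
            print(f"{name}: HTTP {resp.status}")
    except OSError as err:
        print(f"{name}: unreachable ({err})")
```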

9
scripts/fix_metadata.sh Normal file

@@ -0,0 +1,9 @@
#!/bin/bash
# Usage: ./fix_metadata.sh <input.gguf> <output.gguf> [pre_tokenizer] [chat_template_name]
# Rewrites GGUF metadata (pre-tokenizer + chat template) using llama.cpp's gguf_new_metadata.py.
LLAMA_CPP=../llama.cpp
MODEL_NAME=$1
OUTPUT_NAME=$2
PRE_TOKENIZER=${3:-stablelm2}
CHAT_TEMPLATE=${4:-zephyr_legacy}
python3 "${LLAMA_CPP}/gguf-py/gguf/scripts/gguf_new_metadata.py" "$MODEL_NAME" "$OUTPUT_NAME" --pre-tokenizer "$PRE_TOKENIZER" --chat-template "$(cat "$CHAT_TEMPLATE.txt")"

8
scripts/import_ollama_model.sh Executable file

@@ -0,0 +1,8 @@
#!/bin/bash
# Usage: docker exec -it ollama bash -c "/scripts/import_ollama_model.sh /models/Home-3B-v3.q4_k_m.gguf Home-3B-v3:q4_k_m"
LLAMA_CPP=../llama.cpp
GGUF_FILE=$1
MODEL_NAME=$2
echo "FROM $GGUF_FILE" > "$GGUF_FILE.Modelfile"
ollama create "$MODEL_NAME" -f "$GGUF_FILE.Modelfile"
rm -f "$GGUF_FILE.Modelfile"

17
scripts/zephyr_legacy.txt Normal file

@@ -0,0 +1,17 @@
{% for message in messages %}
{%- if message['role'] == 'user' or message['role'] == 'tool' -%}
<|user|> {{ message['content'] }}{{ eos_token }}
{%- elif message['role'] == 'system' -%}
<|system|> {{ message['content'] }}
Services:
{%- for tool in tools %} {{ tool['function']['name'] }}({% for param in tool['function']['parameters']['properties'].keys() if param != 'target_device' %}{{ param }}{% if not loop.last %},{% endif %}{% endfor -%}),{% if not loop.last -%}
{%- if tools | length == 0 %}No tools were provided. If the user requests you interact with a device, tell them you are unable to do so.{% endif %}
{%- endif -%}{%- endfor -%}
{{ eos_token }}
{%- elif message['role'] == 'assistant' -%}
<|assistant|> {{ message['content'] }}{{ eos_token }}
{%- endif -%}
{%- if loop.last and add_generation_prompt %}
<|assistant|>
{%- endif %}
{% endfor -%}
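To sanity-check the template before baking it into a GGUF with fix_metadata.sh, it can be rendered directly with Jinja2 (the sample messages and tool definition below are illustrative only):

```python
# Render scripts/zephyr_legacy.txt with a sample conversation to preview the
# prompt format; message/tool contents here are illustrative only.
from jinja2 import Template

with open("scripts/zephyr_legacy.txt") as f:
    template = Template(f.read())

print(template.render(
    messages=[
        {"role": "system", "content": "You are 'Al', a Home Assistant agent."},
        {"role": "user", "content": "turn on the kitchen light"},
    ],
    tools=[{
        "function": {
            "name": "light.turn_on",
            "parameters": {"properties": {"target_device": {}, "brightness": {}}},
        },
    }],
    eos_token="<|endoftext|>",
    add_generation_prompt=True,
))
```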