add docker compose stack for testing + backends are mostly working at this point

Alex O'Connell
2025-09-15 22:10:25 -04:00
parent 9b8baeed07
commit 1425413fc9
12 changed files with 203 additions and 105 deletions

TODO.md

@@ -1,8 +1,9 @@
# TODO
- [ ] proper tool calling support
- [x] proper tool calling support
- [ ] fix old GGUFs to support tool calling
- [ ] home assistant component text streaming support
- [ ] new models based on qwen3
- [ ] new model based on qwen3 0.6b
- [ ] new model based on gemma3 270m
- [ ] support AI task API
- [x] support new LLM APIs
- rewrite how services are called
@@ -42,6 +43,16 @@
- [x] use varied system prompts to add behaviors
## v0.4 TODO for release:
- [ ] re-order the settings on the options config flow page. the order is very confusing
- [ ] split out entity functionality so we can support conversation + AI tasks
- [x] fix icl examples to match new tool calling syntax config
- [x] set up docker-compose for running all of the various backends
- [ ] fix and re-upload all compatible old models (+ upload all original safetensors)
- [ ] move llamacpp to a separate process because of all the crashing
- [ ] dedicated localai backend (tailored openai variant w/ model loading)
- [ ] fix the openai responses backend
## more complicated ideas
- [ ] "context requests"
- basically just let the model decide what RAG/extra context it wants


@@ -26,6 +26,7 @@ from custom_components.llama_conversation.const import (
CONF_REMEMBER_CONVERSATION,
CONF_REMEMBER_CONVERSATION_TIME_MINUTES,
CONF_GENERIC_OPENAI_PATH,
CONF_ENABLE_LEGACY_TOOL_CALLING,
DEFAULT_MAX_TOKENS,
DEFAULT_TEMPERATURE,
DEFAULT_TOP_P,
@@ -33,6 +34,7 @@ from custom_components.llama_conversation.const import (
DEFAULT_REMEMBER_CONVERSATION,
DEFAULT_REMEMBER_CONVERSATION_TIME_MINUTES,
DEFAULT_GENERIC_OPENAI_PATH,
DEFAULT_ENABLE_LEGACY_TOOL_CALLING,
)
from custom_components.llama_conversation.conversation import LocalLLMAgent, TextGenerationResult
@@ -63,6 +65,7 @@ class GenericOpenAIAPIAgent(LocalLLMAgent):
temperature = self.entry.options.get(CONF_TEMPERATURE, DEFAULT_TEMPERATURE)
top_p = self.entry.options.get(CONF_TOP_P, DEFAULT_TOP_P)
timeout = self.entry.options.get(CONF_REQUEST_TIMEOUT, DEFAULT_REQUEST_TIMEOUT)
enable_legacy_tool_calling = self.entry.options.get(CONF_ENABLE_LEGACY_TOOL_CALLING, DEFAULT_ENABLE_LEGACY_TOOL_CALLING)
endpoint, additional_params = self._chat_completion_params()
messages = get_oai_formatted_messages(conversation)
@@ -77,7 +80,9 @@ class GenericOpenAIAPIAgent(LocalLLMAgent):
}
tools = None
if llm_api:
# "legacy" tool calling passes the tools directly as part of the system prompt instead of as "tools"
# most local backends absolutely butcher any sort of prompt formatting when using tool calling
if llm_api and not enable_legacy_tool_calling:
tools = get_oai_formatted_tools(llm_api, self._async_get_all_exposed_domains())
request_params["tools"] = tools
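For illustration, here is a minimal standalone sketch (hypothetical helper and placeholder names, not the integration's actual code) of the two request shapes this flag switches between: native tool calling sends the OpenAI-style `tools` array, while legacy tool calling omits it and relies on the tool descriptions already rendered into the system prompt, expecting the model to emit a marked-up tool call that the integration parses itself.

```python
# Sketch only: the two chat-completion payload shapes toggled by
# enable_legacy_tool_calling. Names here are illustrative.
import json

def build_payload(messages, oai_tools, legacy_tool_calling: bool) -> dict:
    payload = {
        "model": "some-local-model",  # placeholder model name
        "messages": messages,         # in legacy mode the system prompt already lists the tools
        "stream": True,
    }
    if oai_tools and not legacy_tool_calling:
        # native mode: the backend formats and detects tool calls itself
        payload["tools"] = oai_tools
    # legacy mode: no "tools" key; the model is expected to emit the tool call
    # as plain text between the configured prefix/suffix markers
    return payload

messages = [{"role": "system", "content": "...tool descriptions rendered here..."},
            {"role": "user", "content": "turn on the kitchen light"}]
print(json.dumps(build_payload(messages, None, legacy_tool_calling=True), indent=2))
```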
@@ -103,22 +108,19 @@ class GenericOpenAIAPIAgent(LocalLLMAgent):
) as response:
response.raise_for_status()
async for line_bytes in response.content:
chunk = line_bytes.decode("utf-8").strip().removeprefix("data: ")
raw_line = line_bytes.decode("utf-8").strip()
if raw_line.startswith("error: "):
raise Exception(f"Error from server: {raw_line}")
chunk = raw_line.removeprefix("data: ")
if "[DONE]" in chunk:
break
if chunk and chunk.strip():
yield self._extract_response(json.loads(chunk), llm_api)
except asyncio.TimeoutError as err:
raise HomeAssistantError("The generation request timed out! Please check your connection settings, increase the timeout in settings, or decrease the number of exposed entities.") from err
except aiohttp.ClientError as err:
raise HomeAssistantError(f"Failed to communicate with the API! {err}") from err
except Exception as err:
_LOGGER.debug(f"Err was: {err}")
_LOGGER.debug(f"Request was: {request_params}")
_LOGGER.debug(f"Result was: {response}")
_LOGGER.debug(f"Chunk was {chunk}")
raise HomeAssistantError(f"An unknown error occurred! {err}") from err
return self._async_parse_completion(llm_api, anext_token=anext_token())
@@ -159,8 +161,6 @@ class GenericOpenAIAPIAgent(LocalLLMAgent):
if choice["finish_reason"] == "length" or choice["finish_reason"] == "content_filter":
_LOGGER.warning("Model response did not end on a stop token (unfinished sentence)")
_LOGGER.debug("Model chunk '%s'", response_text)
return response_text, tool_calls


@@ -80,11 +80,11 @@ class TextGenerationWebuiAgent(GenericOpenAIAPIAgent):
_LOGGER.debug("Connection error was: %s", repr(ex))
raise ConfigEntryNotReady("There was a problem connecting to the remote server") from ex
def _chat_completion_params(self, conversation: List[Dict[str, str]]) -> Tuple[str, Dict]:
def _chat_completion_params(self) -> Tuple[str, Dict[str, Any]]:
preset = self.entry.options.get(CONF_TEXT_GEN_WEBUI_PRESET)
chat_mode = self.entry.options.get(CONF_TEXT_GEN_WEBUI_CHAT_MODE, DEFAULT_TEXT_GEN_WEBUI_CHAT_MODE)
endpoint, request_params = super()._chat_completion_params(conversation)
endpoint, request_params = super()._chat_completion_params()
request_params["mode"] = chat_mode
if chat_mode == TEXT_GEN_WEBUI_CHAT_MODE_CHAT or chat_mode == TEXT_GEN_WEBUI_CHAT_MODE_CHAT_INSTRUCT:
@@ -98,37 +98,6 @@ class TextGenerationWebuiAgent(GenericOpenAIAPIAgent):
return endpoint, request_params
def _completion_params(self, conversation: List[Dict[str, str]]) -> Tuple[str, Dict[str, Any]]:
preset = self.entry.options.get(CONF_TEXT_GEN_WEBUI_PRESET)
endpoint, request_params = super()._completion_params(conversation)
if preset:
request_params["preset"] = preset
request_params["truncation_length"] = self.entry.options.get(CONF_CONTEXT_LENGTH, DEFAULT_CONTEXT_LENGTH)
request_params["top_k"] = self.entry.options.get(CONF_TOP_K, DEFAULT_TOP_K)
request_params["min_p"] = self.entry.options.get(CONF_MIN_P, DEFAULT_MIN_P)
request_params["typical_p"] = self.entry.options.get(CONF_TYPICAL_P, DEFAULT_TYPICAL_P)
return endpoint, request_params
def _extract_response(self, response_json: dict) -> TextGenerationResult:
choices = response_json["choices"]
if choices[0]["finish_reason"] != "stop":
_LOGGER.warning("Model response did not end on a stop token (unfinished sentence)")
context_len = self.entry.options.get(CONF_CONTEXT_LENGTH, DEFAULT_CONTEXT_LENGTH)
max_tokens = self.entry.options.get(CONF_MAX_TOKENS, DEFAULT_MAX_TOKENS)
if response_json["usage"]["prompt_tokens"] + max_tokens > context_len:
self._warn_context_size()
# text-gen-webui has a typo where it is 'chat.completions' not 'chat.completion'
if response_json["object"] == "chat.completions":
return choices[0]["message"]["content"]
else:
return choices[0]["text"]
class LlamaCppServerAgent(GenericOpenAIAPIAgent):
grammar: str
@@ -152,8 +121,4 @@ class LlamaCppServerAgent(GenericOpenAIAPIAgent):
if self.entry.options.get(CONF_USE_GBNF_GRAMMAR, DEFAULT_USE_GBNF_GRAMMAR):
request_params["grammar"] = self.grammar
# force COMMON_CHAT_TOOL_CHOICE_NONE so the server returns raw content that we parse ourselves when using
# the custom Home LLM tool call syntax; otherwise let the server detect tool calls automatically
request_params["tool_choice"] = "none"
return endpoint, request_params
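A rough sketch of the manual parsing this enables (simplified; the integration's `parse_raw_tool_call` does additional schema validation): with `tool_choice` forced to `"none"`, the tool call arrives as plain text wrapped in the configured prefix/suffix markers and is decoded by hand.

```python
# Simplified stand-in for the manual tool-call extraction described above.
import json

def extract_tool_call(raw: str, prefix: str = "<tool_call>", suffix: str = "</tool_call>"):
    """Return (tool_call_dict, remaining_text) from raw model output."""
    if prefix not in raw:
        return None, raw
    before, rest = raw.split(prefix, 1)
    block, _, after = rest.partition(suffix)
    tool_call = json.loads(block.strip())
    return tool_call, (before + after).strip()

raw_output = 'Sure, turning it on. <tool_call>{"name": "light.turn_on", "arguments": {"brightness": 255}}</tool_call>'
call, to_say = extract_tool_call(raw_output)
print(call["name"], "|", to_say)  # light.turn_on | Sure, turning it on.
```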


@@ -34,8 +34,6 @@ from homeassistant.helpers.selector import (
BooleanSelector,
BooleanSelectorConfig,
)
from homeassistant.util.package import is_installed
from importlib.metadata import version
from .utils import download_model_from_hf, get_llama_cpp_python_version, install_llama_cpp_python, format_url, MissingQuantizationException
from .const import (
@@ -58,6 +56,7 @@ from .const import (
CONF_THINKING_SUFFIX,
CONF_TOOL_CALL_PREFIX,
CONF_TOOL_CALL_SUFFIX,
CONF_ENABLE_LEGACY_TOOL_CALLING,
CONF_ENABLE_FLASH_ATTENTION,
CONF_USE_GBNF_GRAMMAR,
CONF_GBNF_GRAMMAR_FILE,
@@ -107,6 +106,7 @@ from .const import (
DEFAULT_THINKING_SUFFIX,
DEFAULT_TOOL_CALL_PREFIX,
DEFAULT_TOOL_CALL_SUFFIX,
DEFAULT_ENABLE_LEGACY_TOOL_CALLING,
DEFAULT_ENABLE_FLASH_ATTENTION,
DEFAULT_USE_GBNF_GRAMMAR,
DEFAULT_GBNF_GRAMMAR_FILE,
@@ -1068,6 +1068,11 @@ def local_llama_config_option_schema(hass: HomeAssistant, options: MappingProxyT
description={"suggested_value": options.get(CONF_REQUEST_TIMEOUT)},
default=DEFAULT_REQUEST_TIMEOUT,
): NumberSelector(NumberSelectorConfig(min=5, max=900, step=1, unit_of_measurement=UnitOfTime.SECONDS, mode=NumberSelectorMode.BOX)),
vol.Required(
CONF_ENABLE_LEGACY_TOOL_CALLING,
description={"suggested_value": options.get(CONF_ENABLE_LEGACY_TOOL_CALLING)},
default=DEFAULT_ENABLE_LEGACY_TOOL_CALLING
): bool,
})
elif backend_type in BACKEND_TYPE_GENERIC_OPENAI_RESPONSES:
del result[CONF_REMEMBER_NUM_INTERACTIONS]
@@ -1127,6 +1132,11 @@ def local_llama_config_option_schema(hass: HomeAssistant, options: MappingProxyT
description={"suggested_value": options.get(CONF_REQUEST_TIMEOUT)},
default=DEFAULT_REQUEST_TIMEOUT,
): NumberSelector(NumberSelectorConfig(min=5, max=900, step=1, unit_of_measurement=UnitOfTime.SECONDS, mode=NumberSelectorMode.BOX)),
vol.Required(
CONF_ENABLE_LEGACY_TOOL_CALLING,
description={"suggested_value": options.get(CONF_ENABLE_LEGACY_TOOL_CALLING)},
default=DEFAULT_ENABLE_LEGACY_TOOL_CALLING
): bool,
})
elif backend_type == BACKEND_TYPE_OLLAMA:
result = insert_after_key(result, CONF_MAX_TOKENS, {


@@ -76,13 +76,13 @@ ICL_EXTRAS = """
{% for item in response_examples %}
{{ item.request }}
{{ item.response }}
<functioncall> {{ item.tool | to_json }}
{{ tool_call_prefix }}{{ item.tool | to_json }}{{ tool_call_suffix }}
{% endfor %}"""
ICL_NO_SYSTEM_PROMPT_EXTRAS = """
{% for item in response_examples %}
{{ item.request }}
{{ item.response }}
<functioncall> {{ item.tool | to_json }}
{{ tool_call_prefix }}{{ item.tool | to_json }}{{ tool_call_suffix }}
{% endfor %}
<user_instruction>:"""
DEFAULT_PROMPT = DEFAULT_PROMPT_BASE + ICL_EXTRAS
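A small check of how the updated ICL snippet renders once `tool_call_prefix`/`tool_call_suffix` are substituted; this uses plain `jinja2` with a `to_json` filter registered to mimic Home Assistant's template filter, so it is a sketch rather than the integration's render path.

```python
# Sketch: render the ICL example snippet with configurable tool-call markers.
import json
from jinja2 import Environment

env = Environment()
env.filters["to_json"] = json.dumps  # stand-in for HA's to_json filter

ICL_EXTRAS = """
{% for item in response_examples %}
{{ item.request }}
{{ item.response }}
{{ tool_call_prefix }}{{ item.tool | to_json }}{{ tool_call_suffix }}
{% endfor %}"""

print(env.from_string(ICL_EXTRAS).render(
    response_examples=[{
        "request": "turn off the fan",
        "response": "Turning off the fan for you.",
        "tool": {"name": "fan.turn_off", "arguments": {"name": "ceiling fan"}},
    }],
    tool_call_prefix="<tool_call> ",   # default markers; model overrides swap these out
    tool_call_suffix=" </tool_call>",
))
```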
@@ -137,6 +137,8 @@ CONF_TOOL_CALL_PREFIX = "tool_call_prefix"
DEFAULT_TOOL_CALL_PREFIX = "<tool_call>"
CONF_TOOL_CALL_SUFFIX = "tool_call_suffix"
DEFAULT_TOOL_CALL_SUFFIX = "</tool_call>"
CONF_ENABLE_LEGACY_TOOL_CALLING = "enable_legacy_tool_calling"
DEFAULT_ENABLE_LEGACY_TOOL_CALLING = False
CONF_ENABLE_FLASH_ATTENTION = "enable_flash_attention"
DEFAULT_ENABLE_FLASH_ATTENTION = False
CONF_USE_GBNF_GRAMMAR = "gbnf_grammar"
@@ -179,7 +181,6 @@ CONF_GENERIC_OPENAI_PATH = "openai_path"
DEFAULT_GENERIC_OPENAI_PATH = "v1"
CONF_GENERIC_OPENAI_VALIDATE_MODEL = "openai_validate_model"
DEFAULT_GENERIC_OPENAI_VALIDATE_MODEL = True
CONF_CONTEXT_LENGTH = "context_length"
DEFAULT_CONTEXT_LENGTH = 2048
CONF_BATCH_SIZE = "batch_size"
@@ -228,6 +229,7 @@ OPTIONS_OVERRIDES = {
CONF_TOOL_CALL_SUFFIX: "```",
CONF_CONTEXT_LENGTH: 131072,
CONF_MAX_TOOL_CALL_ITERATIONS: 1,
CONF_ENABLE_LEGACY_TOOL_CALLING: True
},
"home-3b-v3": {
CONF_PROMPT: DEFAULT_PROMPT_BASE_LEGACY,
@@ -235,6 +237,7 @@ OPTIONS_OVERRIDES = {
CONF_TOOL_CALL_PREFIX: "```homeassistant",
CONF_TOOL_CALL_SUFFIX: "```",
CONF_MAX_TOOL_CALL_ITERATIONS: 1,
CONF_ENABLE_LEGACY_TOOL_CALLING: True
},
"home-3b-v2": {
CONF_PROMPT: DEFAULT_PROMPT_BASE_LEGACY,
@@ -242,6 +245,7 @@ OPTIONS_OVERRIDES = {
CONF_TOOL_CALL_PREFIX: "```homeassistant",
CONF_TOOL_CALL_SUFFIX: "```",
CONF_MAX_TOOL_CALL_ITERATIONS: 1,
CONF_ENABLE_LEGACY_TOOL_CALLING: True
},
"home-3b-v1": {
CONF_PROMPT: DEFAULT_PROMPT_BASE_LEGACY,
@@ -249,6 +253,7 @@ OPTIONS_OVERRIDES = {
CONF_TOOL_CALL_PREFIX: "```homeassistant",
CONF_TOOL_CALL_SUFFIX: "```",
CONF_MAX_TOOL_CALL_ITERATIONS: 1,
CONF_ENABLE_LEGACY_TOOL_CALLING: True
},
"home-1b-v3": {
CONF_PROMPT: DEFAULT_PROMPT_BASE_LEGACY,
@@ -256,6 +261,7 @@ OPTIONS_OVERRIDES = {
CONF_TOOL_CALL_PREFIX: "```homeassistant",
CONF_TOOL_CALL_SUFFIX: "```",
CONF_MAX_TOOL_CALL_ITERATIONS: 1,
CONF_ENABLE_LEGACY_TOOL_CALLING: True
},
"home-1b-v2": {
CONF_PROMPT: DEFAULT_PROMPT_BASE_LEGACY,
@@ -263,6 +269,7 @@ OPTIONS_OVERRIDES = {
CONF_TOOL_CALL_PREFIX: "```homeassistant",
CONF_TOOL_CALL_SUFFIX: "```",
CONF_MAX_TOOL_CALL_ITERATIONS: 1,
CONF_ENABLE_LEGACY_TOOL_CALLING: True
},
"home-1b-v1": {
CONF_PROMPT: DEFAULT_PROMPT_BASE_LEGACY,
@@ -270,6 +277,7 @@ OPTIONS_OVERRIDES = {
CONF_TOOL_CALL_PREFIX: "```homeassistant",
CONF_TOOL_CALL_SUFFIX: "```",
CONF_MAX_TOOL_CALL_ITERATIONS: 1,
CONF_ENABLE_LEGACY_TOOL_CALLING: True
},
"mistral": {
CONF_PROMPT: DEFAULT_PROMPT_BASE + ICL_NO_SYSTEM_PROMPT_EXTRAS,


@@ -22,7 +22,7 @@ from homeassistant.helpers import config_validation as cv, intent, template, ent
from homeassistant.helpers.entity_platform import AddConfigEntryEntitiesCallback
from homeassistant.util import color
from .utils import closest_color, parse_raw_tool_call
from .utils import closest_color, parse_raw_tool_call, flatten_vol_schema
from .const import (
CONF_CHAT_MODEL,
CONF_PROMPT,
@@ -40,6 +40,7 @@ from .const import (
CONF_THINKING_SUFFIX,
CONF_TOOL_CALL_PREFIX,
CONF_TOOL_CALL_SUFFIX,
CONF_ENABLE_LEGACY_TOOL_CALLING,
DEFAULT_PROMPT,
DEFAULT_BACKEND_TYPE,
DEFAULT_EXTRA_ATTRIBUTES_TO_EXPOSE,
@@ -58,6 +59,7 @@ from .const import (
DEFAULT_THINKING_SUFFIX,
DEFAULT_TOOL_CALL_PREFIX,
DEFAULT_TOOL_CALL_SUFFIX,
DEFAULT_ENABLE_LEGACY_TOOL_CALLING,
)
_LOGGER = logging.getLogger(__name__)
@@ -82,8 +84,6 @@ async def update_listener(hass: HomeAssistant, entry: ConfigEntry):
agent: LocalLLMAgent = entry.runtime_data
await hass.async_add_executor_job(agent._update_options)
return True
async def async_setup_entry(hass: HomeAssistant, entry: ConfigEntry, async_add_entities: AddConfigEntryEntitiesCallback) -> bool:
"""Set up Local LLM Conversation from a config entry."""
@@ -427,15 +427,18 @@ class LocalLLMAgent(ConversationEntity, AbstractConversationAgent):
elif tool_suffix in potential_block and in_tool_call:
in_tool_call = False
tool_call, to_say = parse_raw_tool_call(tool_content.strip().removeprefix(tool_prefix).removesuffix(tool_suffix), llm_api)
_LOGGER.debug("Tool call parsed: %s", tool_call)
if tool_call:
result.tool_calls = [tool_call]
if to_say:
content = to_say
if not llm_api:
_LOGGER.warning("Model attempted to call a tool but no LLM API was provided, ignoring tool calls")
else:
content = None
tool_call, to_say = parse_raw_tool_call(tool_content.strip().removeprefix(tool_prefix).removesuffix(tool_suffix), llm_api)
_LOGGER.debug("Tool call parsed: %s", tool_call)
if tool_call:
result.tool_calls = [tool_call]
if to_say:
content = to_say
else:
content = None
result.response = content
@@ -463,9 +466,9 @@ class LocalLLMAgent(ConversationEntity, AbstractConversationAgent):
return list(domains)
def _async_get_exposed_entities(self) -> dict[str, str]:
def _async_get_exposed_entities(self) -> dict[str, dict]:
"""Gather exposed entity states"""
entity_states = {}
entity_states: dict[str, dict] = {}
entity_registry = er.async_get(self.hass)
device_registry = dr.async_get(self.hass)
area_registry = ar.async_get(self.hass)
@@ -577,10 +580,12 @@ class LocalLLMAgent(ConversationEntity, AbstractConversationAgent):
"""Generate the system prompt with current entity states"""
entities_to_expose = self._async_get_exposed_entities()
extra_attributes_to_expose = self.entry.options \
.get(CONF_EXTRA_ATTRIBUTES_TO_EXPOSE, DEFAULT_EXTRA_ATTRIBUTES_TO_EXPOSE)
extra_attributes_to_expose = self.entry.options.get(CONF_EXTRA_ATTRIBUTES_TO_EXPOSE, DEFAULT_EXTRA_ATTRIBUTES_TO_EXPOSE)
enable_legacy_tool_calling = self.entry.options.get(CONF_ENABLE_LEGACY_TOOL_CALLING, DEFAULT_ENABLE_LEGACY_TOOL_CALLING)
tool_call_prefix = self.entry.options.get(CONF_TOOL_CALL_PREFIX, DEFAULT_TOOL_CALL_PREFIX)
tool_call_suffix = self.entry.options.get(CONF_TOOL_CALL_SUFFIX, DEFAULT_TOOL_CALL_SUFFIX)
def expose_attributes(attributes) -> list[str]:
def expose_attributes(attributes: dict[str, Any]) -> list[str]:
result = []
for attribute_name in extra_attributes_to_expose:
if attribute_name not in attributes:
@@ -645,9 +650,23 @@ class LocalLLMAgent(ConversationEntity, AbstractConversationAgent):
render_variables = {
"devices": devices,
"formatted_devices": formatted_devices,
"response_examples": []
"response_examples": [],
"tool_call_prefix": tool_call_prefix,
"tool_call_suffix": tool_call_suffix,
}
if enable_legacy_tool_calling:
if llm_api:
tools = []
for tool in llm_api.tools:
tools.append(f"{tool.name}({','.join(flatten_vol_schema(tool.parameters))})")
render_variables["tools"] = tools
render_variables["formatted_tools"] = ", ".join(tools)
else:
message = "No tools were provided. If the user requests you interact with a device, tell them you are unable to do so."
render_variables["tools"] = [message]
render_variables["formatted_tools"] = message
# only pass examples if there are loaded examples + an API was exposed
if self.in_context_examples and llm_api:
num_examples = int(self.entry.options.get(CONF_NUM_IN_CONTEXT_EXAMPLES, DEFAULT_NUM_IN_CONTEXT_EXAMPLES))
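A hedged sketch of what the legacy prompt variables end up looking like; `flatten_vol_schema` below is a simplified stand-in for the integration's helper of the same name, assumed to return the schema's top-level argument names.

```python
# Illustration of the "tools"/"formatted_tools" render variables built for
# legacy tool calling. flatten_vol_schema is a stand-in, not the real helper.
import voluptuous as vol

def flatten_vol_schema(schema: vol.Schema) -> list[str]:
    # assumption: the top-level keys of the schema are the argument names
    return [str(key) for key in schema.schema]

tools = {
    "light.turn_on": vol.Schema({
        vol.Required("target_device"): str,
        vol.Optional("brightness"): str,
        vol.Optional("rgb_color"): str,
    }),
    "script.reload": vol.Schema({vol.Required("target_device"): str}),
}

rendered = [f"{name}({','.join(flatten_vol_schema(schema))})" for name, schema in tools.items()]
print(", ".join(rendered))
# light.turn_on(target_device,brightness,rgb_color), script.reload(target_device)
```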


@@ -92,7 +92,13 @@
"context_length": "Context Length",
"batch_size": "Batch Size",
"n_threads": "Thread Count",
"n_batch_threads": "Batch Thread Count"
"n_batch_threads": "Batch Thread Count",
"thinking_prefix": "Reasoning Content Prefix",
"thinking_suffix": "Reasoning Content Suffix",
"tool_call_prefix": "Tool Call Prefix",
"tool_call_suffix": "Tool Call Suffix",
"enable_legacy_tool_calling": "Enable Legacy Tool Calling",
"max_tool_call_iterations": "Maximum Tool Call Attempts"
},
"data_description": {
"llm_hass_api": "Select 'Assist' if you want the model to be able to control devices. If you are using the Home-LLM v1, v2, or v3 model then select 'Home-LLM (v1-3)'",
@@ -115,9 +121,6 @@
"llm_hass_api": "Selected LLM API",
"max_new_tokens": "Maximum tokens to return in response",
"prompt": "System Prompt",
"prompt_template": "Prompt Format",
"tool_format": "Tool Format",
"tool_multi_turn_chat": "Multi-Turn Tool Use",
"temperature": "Temperature",
"top_k": "Top K",
"top_p": "Top P",
@@ -147,7 +150,13 @@
"context_length": "Context Length",
"batch_size": "Batch Size",
"n_threads": "Thread Count",
"n_batch_threads": "Batch Thread Count"
"n_batch_threads": "Batch Thread Count",
"thinking_prefix": "Reasoning Content Prefix",
"thinking_suffix": "Reasoning Content Suffix",
"tool_call_prefix": "Tool Call Prefix",
"tool_call_suffix": "Tool Call Suffix",
"enable_legacy_tool_calling": "Enable Legacy Tool Calling",
"max_tool_call_iterations": "Maximum Tool Call Attempts"
},
"data_description": {
"llm_hass_api": "Select 'Assist' if you want the model to be able to control devices. If you are using the Home-LLM v1, v2, or v3 model then select 'Home-LLM (v1-3)'",
@@ -167,27 +176,6 @@
}
},
"selector": {
"prompt_template": {
"options": {
"chatml": "ChatML",
"vicuna": "Vicuna",
"alpaca": "Alpaca",
"mistral": "Mistral",
"zephyr": "Zephyr (<|endoftext|>)",
"zephyr2": "Zephyr ('</s>')",
"zephyr3": "Zephyr (<|end|>)",
"llama3": "Llama 3",
"command-r": "Command R",
"no_prompt_template": "None"
}
},
"tool_format": {
"options": {
"full_tool_format": "Full JSON Tool Format",
"reduced_tool_format": "Reduced JSON Tool Format",
"min_tool_format": "Minimal Function Style Tool Format"
}
},
"model_backend": {
"options": {
"llama_cpp_hf": "Llama.cpp (HuggingFace)",


@@ -335,10 +335,10 @@ def get_home_llm_tools(llm_api: llm.APIInstance, domains: list[str]) -> List[Dic
# scripts show up as individual services
if domain == "script" and not scripts_added:
all_services.extend([
("script.reload", vol.Schema({})),
("script.turn_on", vol.Schema({})),
("script.turn_off", vol.Schema({})),
("script.toggle", vol.Schema({})),
("script.reload", vol.Schema({vol.Required("target_device"): str})),
("script.turn_on", vol.Schema({vol.Required("target_device"): str})),
("script.turn_off", vol.Schema({vol.Required("target_device"): str})),
("script.toggle", vol.Schema({vol.Required("target_device"): str})),
])
scripts_added = True
continue
@@ -350,7 +350,8 @@ def get_home_llm_tools(llm_api: llm.APIInstance, domains: list[str]) -> List[Dic
args = flatten_vol_schema(service.schema)
args_to_expose = set(args).intersection(ALLOWED_SERVICE_CALL_ARGUMENTS)
service_schema = vol.Schema({
vol.Optional(arg): str for arg in args_to_expose
vol.Required("target_device"): str,
**{vol.Optional(arg): str for arg in args_to_expose}
})
all_services.append((f"{domain}.{name}", service_schema))
@@ -384,18 +385,21 @@ def parse_raw_tool_call(raw_block: str | dict, llm_api: llm.APIInstance) -> tupl
else:
schema_to_validate = vol.Schema({
vol.Required("name"): str,
vol.Required("arguments"): dict,
vol.Required("arguments"): str | dict,
})
try:
schema_to_validate(parsed_tool_call)
except vol.Error as ex:
_LOGGER.info(f"LLM produced an improperly formatted response: {repr(ex)}")
raise # re-raise exception for now to force the LLM to try again
raise ex # re-raise exception for now to force the LLM to try again
# try to fix certain arguments
args_dict = parsed_tool_call if llm_api.api.id == HOME_LLM_API_ID else parsed_tool_call["arguments"]
if isinstance(args_dict, str):
args_dict = json.loads(args_dict)
# make sure brightness is 0-255 and not a percentage
if "brightness" in args_dict and 0.0 < args_dict["brightness"] <= 1.0:
args_dict["brightness"] = int(args_dict["brightness"] * 255)
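As a quick worked example of the two fixes above (accepting `arguments` as either a JSON-encoded string or a dict, and rescaling fractional brightness), here is a standalone sketch rather than the integration's exact code path:

```python
# Standalone sketch of the argument normalization shown in the diff above.
import json

def normalize_arguments(arguments):
    """Accept a JSON-encoded string or a dict, then fix fractional brightness."""
    args = json.loads(arguments) if isinstance(arguments, str) else dict(arguments)
    # some models emit brightness as a 0.0-1.0 fraction; scale it to 0-255
    if "brightness" in args and 0.0 < args["brightness"] <= 1.0:
        args["brightness"] = int(args["brightness"] * 255)
    return args

print(normalize_arguments('{"brightness": 0.5}'))  # {'brightness': 127}
print(normalize_arguments({"brightness": 200}))    # unchanged: {'brightness': 200}
```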

docker-compose.yml Normal file

@@ -0,0 +1,59 @@
# start a backend with `docker-compose up -d <service name>` and stop it with `docker-compose stop <service name>`
version: '3.8'
services:
ollama:
image: ollama/ollama:latest
container_name: ollama
ports:
- "11434:11434" # Ollama default
volumes:
- ./models:/models
- ./scripts:/scripts # needed for import script
environment:
- OLLAMA_MODELS=/models/.ollama
restart: unless-stopped
text-generation-webui:
image: atinoda/text-generation-webui:default-cpu
container_name: textgen-webui
init: true
environment:
- EXTRA_LAUNCH_ARGS="--listen --verbose" # Custom launch args (e.g., --model MODEL_NAME)
ports:
- "7860:7860" # Web UI default
# - "5000:5000" # API Default
# - "5005:5005" # Streaming API default
volumes:
- ./models:/app/user_data/models
restart: unless-stopped
# llamacpp server can only run one model at a time; set it below
llamacpp:
image: ghcr.io/ggerganov/llama.cpp:server
container_name: llamacpp-server
ports:
- "8000:8000" # llama.cpp server default
volumes:
- ./models:/models
environment:
- MODEL_DIR=/models
restart: unless-stopped
command: |-
--port 8000
--no-webui
--metrics
--jinja
--ctx-size 8192
--alias "Home-3B-v3"
--model "/models/Home-3B-v3-fixed.q4_k_m.gguf"
localai:
image: localai/localai:latest
container_name: localai
ports:
- "8080:8080" # LocalAI default
volumes:
- ./models:/models
environment:
- MODELS_PATH=/models
restart: unless-stopped

scripts/fix_metadata.sh Normal file

@@ -0,0 +1,9 @@
#!/bin/bash
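# Rewrites the pre-tokenizer and chat template metadata of a GGUF using llama.cpp's gguf_new_metadata.py
# Usage: ./fix_metadata.sh <input.gguf> <output.gguf> [pre-tokenizer] [chat-template-name]
# The chat template is read from <chat-template-name>.txt (defaults to zephyr_legacy.txt)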
LLAMA_CPP=../llama.cpp
MODEL_NAME=$1
OUTPUT_NAME=$2
PRE_TOKENIZER=${3:-stablelm2}
CHAT_TEMPLATE=${4:-zephyr_legacy}
python3 "${LLAMA_CPP}/gguf-py/gguf/scripts/gguf_new_metadata.py" "$MODEL_NAME" "$OUTPUT_NAME" --pre-tokenizer "$PRE_TOKENIZER" --chat-template "$(cat "${CHAT_TEMPLATE}.txt")"

scripts/import_ollama_model.sh Executable file

@@ -0,0 +1,8 @@
#!/bin/bash
# Usage: docker exec -it ollama bash -c "/scripts/import_ollama_model.sh /models/Home-3B-v3.q4_k_m.gguf Home-3B-v3:q4_k_m"
GGUF_FILE=$1
MODEL_NAME=$2
echo "FROM $GGUF_FILE" > $GGUF_FILE.Modelfile
ollama create $MODEL_NAME -f $GGUF_FILE.Modelfile
rm -f $GGUF_FILE.Modelfile

scripts/zephyr_legacy.txt Normal file

@@ -0,0 +1,17 @@
{% for message in messages %}
{%- if message['role'] == 'user' or message['role'] == 'tool' -%}
<|user|> {{ message['content'] }}{{ eos_token }}
{%- elif message['role'] == 'system' -%}
<|system|> {{ message['content'] }}
Services:
{%- for tool in tools %} {{ tool['function']['name'] }}({% for param in tool['function']['parameters']['properties'].keys() if param != 'target_device' %}{{ param }}{% if not loop.last %},{% endif %}{% endfor %}){% if not loop.last %},{% endif %}{% endfor -%}
{%- if tools | length == 0 %} No tools were provided. If the user requests you interact with a device, tell them you are unable to do so.{% endif -%}
{{ eos_token }}
{%- elif message['role'] == 'assistant' -%}
<|assistant|> {{ message['content'] }}{{ eos_token }}
{%- endif -%}
{%- if loop.last and add_generation_prompt %}
<|assistant|>
{%- endif %}
{% endfor -%}