Mirror of https://github.com/acon96/home-llm.git (synced 2026-01-07 21:04:08 -05:00)

Commit: add docker compose stack for testing + backends are mostly working at this point

TODO.md (15 changed lines)
@@ -1,8 +1,9 @@
# TODO
- [ ] proper tool calling support
- [x] proper tool calling support
- [ ] fix old GGUFs to support tool calling
- [ ] home assistant component text streaming support
- [ ] new models based on qwen3
- [ ] new model based on qwen3 0.6b
- [ ] new model based on gemma3 270m
- [ ] support AI task API
- [x] support new LLM APIs
  - rewrite how services are called
@@ -42,6 +43,16 @@
- [x] use varied system prompts to add behaviors


## v0.4 TODO for release:
[ ] re-order the settings on the options config flow page. the order is very confusing
[ ] split out entity functionality so we can support conversation + ai tasks
[x] fix icl examples to match new tool calling syntax config
[x] set up docker-compose for running all of the various backends
[ ] fix and re-upload all compatible old models (+ upload all original safetensors)
[ ] move llamacpp to a separate process because of all the crashing
[ ] dedicated localai backend (tailored openai variant /w model loading)
[ ] fix the openai responses backend

## more complicated ideas
- [ ] "context requests"
  - basically just let the model decide what RAG/extra context it wants

@@ -26,6 +26,7 @@ from custom_components.llama_conversation.const import (
    CONF_REMEMBER_CONVERSATION,
    CONF_REMEMBER_CONVERSATION_TIME_MINUTES,
    CONF_GENERIC_OPENAI_PATH,
    CONF_ENABLE_LEGACY_TOOL_CALLING,
    DEFAULT_MAX_TOKENS,
    DEFAULT_TEMPERATURE,
    DEFAULT_TOP_P,
@@ -33,6 +34,7 @@ from custom_components.llama_conversation.const import (
    DEFAULT_REMEMBER_CONVERSATION,
    DEFAULT_REMEMBER_CONVERSATION_TIME_MINUTES,
    DEFAULT_GENERIC_OPENAI_PATH,
    DEFAULT_ENABLE_LEGACY_TOOL_CALLING,
)
from custom_components.llama_conversation.conversation import LocalLLMAgent, TextGenerationResult

@@ -63,6 +65,7 @@ class GenericOpenAIAPIAgent(LocalLLMAgent):
        temperature = self.entry.options.get(CONF_TEMPERATURE, DEFAULT_TEMPERATURE)
        top_p = self.entry.options.get(CONF_TOP_P, DEFAULT_TOP_P)
        timeout = self.entry.options.get(CONF_REQUEST_TIMEOUT, DEFAULT_REQUEST_TIMEOUT)
        enable_legacy_tool_calling = self.entry.options.get(CONF_ENABLE_LEGACY_TOOL_CALLING, DEFAULT_ENABLE_LEGACY_TOOL_CALLING)

        endpoint, additional_params = self._chat_completion_params()
        messages = get_oai_formatted_messages(conversation)
@@ -77,7 +80,9 @@ class GenericOpenAIAPIAgent(LocalLLMAgent):
        }

        tools = None
        if llm_api:
        # "legacy" tool calling passes the tools directly as part of the system prompt instead of as "tools"
        # most local backends absolutely butcher any sort of prompt formatting when using tool calling
        if llm_api and not enable_legacy_tool_calling:
            tools = get_oai_formatted_tools(llm_api, self._async_get_all_exposed_domains())
            request_params["tools"] = tools

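For reference, a minimal sketch (not the component's actual code) of the distinction this hunk introduces: native tool calling attaches an OpenAI-style `tools` array to the request, while legacy tool calling renders the same tool signatures into the system prompt text and leaves `tools` unset. Helper and field names below are illustrative.

```python
from typing import Any

def build_request(messages: list[dict], tool_specs: list[dict], legacy: bool) -> dict[str, Any]:
    """Illustrative only: show where the tools end up in each mode."""
    request: dict[str, Any] = {"messages": list(messages)}
    if legacy:
        # Legacy mode: describe the tools inside the system prompt so backends
        # that mangle structured tool prompts still see them.
        signatures = ", ".join(
            f"{spec['function']['name']}({', '.join(spec['function']['parameters']['properties'])})"
            for spec in tool_specs
        )
        request["messages"] = [{"role": "system", "content": f"Tools: {signatures}"}] + list(messages)
    else:
        # Native mode: pass the OpenAI-format tool definitions as a separate field.
        request["tools"] = tool_specs
    return request
```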
@@ -103,22 +108,19 @@ class GenericOpenAIAPIAgent(LocalLLMAgent):
                ) as response:
                    response.raise_for_status()
                    async for line_bytes in response.content:
                        chunk = line_bytes.decode("utf-8").strip().removeprefix("data: ")
                        raw_line = line_bytes.decode("utf-8").strip()
                        if raw_line.startswith("error: "):
                            raise Exception(f"Error from server: {raw_line}")
                        chunk = raw_line.removeprefix("data: ")
                        if "[DONE]" in chunk:
                            break


                        if chunk and chunk.strip():
                            yield self._extract_response(json.loads(chunk), llm_api)
            except asyncio.TimeoutError as err:
                raise HomeAssistantError("The generation request timed out! Please check your connection settings, increase the timeout in settings, or decrease the number of exposed entities.") from err
            except aiohttp.ClientError as err:
                raise HomeAssistantError(f"Failed to communicate with the API! {err}") from err
            except Exception as err:
                _LOGGER.debug(f"Err was: {err}")
                _LOGGER.debug(f"Request was: {request_params}")
                _LOGGER.debug(f"Result was: {response}")
                _LOGGER.debug(f"Chunk was {chunk}")
                raise HomeAssistantError(f"An unknown error occurred! {err}") from err

        return self._async_parse_completion(llm_api, anext_token=anext_token())

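The streaming loop above consumes a server-sent-events style response: each line is either an `error:` line, a `data: ` payload, or the `[DONE]` sentinel. A standalone sketch of that parsing logic, independent of aiohttp and of this integration's classes, assuming the same line format:

```python
import json
from typing import Iterable, Iterator

def parse_sse_lines(lines: Iterable[bytes]) -> Iterator[dict]:
    """Yield decoded JSON chunks from an OpenAI-compatible streaming response."""
    for line_bytes in lines:
        raw_line = line_bytes.decode("utf-8").strip()
        if not raw_line:
            continue
        if raw_line.startswith("error: "):
            raise RuntimeError(f"Error from server: {raw_line}")
        chunk = raw_line.removeprefix("data: ")
        if "[DONE]" in chunk:
            break
        if chunk.strip():
            yield json.loads(chunk)

# Example:
# for delta in parse_sse_lines([b'data: {"choices": [{"delta": {"content": "hi"}}]}', b"data: [DONE]"]):
#     print(delta["choices"][0]["delta"]["content"])
```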
@@ -159,8 +161,6 @@ class GenericOpenAIAPIAgent(LocalLLMAgent):
        if choice["finish_reason"] == "length" or choice["finish_reason"] == "content_filter":
            _LOGGER.warning("Model response did not end on a stop token (unfinished sentence)")

        _LOGGER.debug("Model chunk '%s'", response_text)

        return response_text, tool_calls


@@ -80,11 +80,11 @@ class TextGenerationWebuiAgent(GenericOpenAIAPIAgent):
            _LOGGER.debug("Connection error was: %s", repr(ex))
            raise ConfigEntryNotReady("There was a problem connecting to the remote server") from ex

    def _chat_completion_params(self, conversation: List[Dict[str, str]]) -> Tuple[str, Dict]:
    def _chat_completion_params(self) -> Tuple[str, Dict[str, Any]]:
        preset = self.entry.options.get(CONF_TEXT_GEN_WEBUI_PRESET)
        chat_mode = self.entry.options.get(CONF_TEXT_GEN_WEBUI_CHAT_MODE, DEFAULT_TEXT_GEN_WEBUI_CHAT_MODE)

        endpoint, request_params = super()._chat_completion_params(conversation)
        endpoint, request_params = super()._chat_completion_params()

        request_params["mode"] = chat_mode
        if chat_mode == TEXT_GEN_WEBUI_CHAT_MODE_CHAT or chat_mode == TEXT_GEN_WEBUI_CHAT_MODE_CHAT_INSTRUCT:
@@ -98,37 +98,6 @@ class TextGenerationWebuiAgent(GenericOpenAIAPIAgent):

        return endpoint, request_params

    def _completion_params(self, conversation: List[Dict[str, str]]) -> Tuple[str, Dict[str, Any]]:
        preset = self.entry.options.get(CONF_TEXT_GEN_WEBUI_PRESET)

        endpoint, request_params = super()._completion_params(conversation)

        if preset:
            request_params["preset"] = preset

        request_params["truncation_length"] = self.entry.options.get(CONF_CONTEXT_LENGTH, DEFAULT_CONTEXT_LENGTH)
        request_params["top_k"] = self.entry.options.get(CONF_TOP_K, DEFAULT_TOP_K)
        request_params["min_p"] = self.entry.options.get(CONF_MIN_P, DEFAULT_MIN_P)
        request_params["typical_p"] = self.entry.options.get(CONF_TYPICAL_P, DEFAULT_TYPICAL_P)

        return endpoint, request_params

    def _extract_response(self, response_json: dict) -> TextGenerationResult:
        choices = response_json["choices"]
        if choices[0]["finish_reason"] != "stop":
            _LOGGER.warning("Model response did not end on a stop token (unfinished sentence)")

        context_len = self.entry.options.get(CONF_CONTEXT_LENGTH, DEFAULT_CONTEXT_LENGTH)
        max_tokens = self.entry.options.get(CONF_MAX_TOKENS, DEFAULT_MAX_TOKENS)
        if response_json["usage"]["prompt_tokens"] + max_tokens > context_len:
            self._warn_context_size()

        # text-gen-webui has a typo where it is 'chat.completions' not 'chat.completion'
        if response_json["object"] == "chat.completions":
            return choices[0]["message"]["content"]
        else:
            return choices[0]["text"]

class LlamaCppServerAgent(GenericOpenAIAPIAgent):
    grammar: str

@@ -152,8 +121,4 @@ class LlamaCppServerAgent(GenericOpenAIAPIAgent):
        if self.entry.options.get(CONF_USE_GBNF_GRAMMAR, DEFAULT_USE_GBNF_GRAMMAR):
            request_params["grammar"] = self.grammar

        # force usage of COMMON_CHAT_TOOL_CHOICE_NONE so it returns raw content and then parse ourself when using
        # the custom home llm tool call syntax. otherwise let the server detect it automatically
        request_params["tool_choice"] = "none"

        return endpoint, request_params
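As a rough illustration of what the resulting llama.cpp server request body looks like when the integration's custom tool-call syntax is in use (values here are placeholders, not the component's exact output):

```python
# Hypothetical request body for the llama.cpp server's OpenAI-compatible endpoint:
request_params = {
    "messages": [{"role": "user", "content": "turn on the kitchen light"}],
    "grammar": "root ::= ...",   # GBNF grammar text loaded from the configured grammar file
    "tool_choice": "none",       # return raw content; the integration parses the tool call itself
}
```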
@@ -34,8 +34,6 @@ from homeassistant.helpers.selector import (
    BooleanSelector,
    BooleanSelectorConfig,
)
from homeassistant.util.package import is_installed
from importlib.metadata import version

from .utils import download_model_from_hf, get_llama_cpp_python_version, install_llama_cpp_python, format_url, MissingQuantizationException
from .const import (
@@ -58,6 +56,7 @@ from .const import (
    CONF_THINKING_SUFFIX,
    CONF_TOOL_CALL_PREFIX,
    CONF_TOOL_CALL_SUFFIX,
    CONF_ENABLE_LEGACY_TOOL_CALLING,
    CONF_ENABLE_FLASH_ATTENTION,
    CONF_USE_GBNF_GRAMMAR,
    CONF_GBNF_GRAMMAR_FILE,
@@ -107,6 +106,7 @@ from .const import (
    DEFAULT_THINKING_SUFFIX,
    DEFAULT_TOOL_CALL_PREFIX,
    DEFAULT_TOOL_CALL_SUFFIX,
    DEFAULT_ENABLE_LEGACY_TOOL_CALLING,
    DEFAULT_ENABLE_FLASH_ATTENTION,
    DEFAULT_USE_GBNF_GRAMMAR,
    DEFAULT_GBNF_GRAMMAR_FILE,
@@ -1068,6 +1068,11 @@ def local_llama_config_option_schema(hass: HomeAssistant, options: MappingProxyT
                description={"suggested_value": options.get(CONF_REQUEST_TIMEOUT)},
                default=DEFAULT_REQUEST_TIMEOUT,
            ): NumberSelector(NumberSelectorConfig(min=5, max=900, step=1, unit_of_measurement=UnitOfTime.SECONDS, mode=NumberSelectorMode.BOX)),
            vol.Required(
                CONF_ENABLE_LEGACY_TOOL_CALLING,
                description={"suggested_value": options.get(CONF_ENABLE_LEGACY_TOOL_CALLING)},
                default=DEFAULT_ENABLE_LEGACY_TOOL_CALLING
            ): bool,
        })
    elif backend_type in BACKEND_TYPE_GENERIC_OPENAI_RESPONSES:
        del result[CONF_REMEMBER_NUM_INTERACTIONS]
@@ -1127,6 +1132,11 @@ def local_llama_config_option_schema(hass: HomeAssistant, options: MappingProxyT
                description={"suggested_value": options.get(CONF_REQUEST_TIMEOUT)},
                default=DEFAULT_REQUEST_TIMEOUT,
            ): NumberSelector(NumberSelectorConfig(min=5, max=900, step=1, unit_of_measurement=UnitOfTime.SECONDS, mode=NumberSelectorMode.BOX)),
            vol.Required(
                CONF_ENABLE_LEGACY_TOOL_CALLING,
                description={"suggested_value": options.get(CONF_ENABLE_LEGACY_TOOL_CALLING)},
                default=DEFAULT_ENABLE_LEGACY_TOOL_CALLING
            ): bool,
        })
    elif backend_type == BACKEND_TYPE_OLLAMA:
        result = insert_after_key(result, CONF_MAX_TOKENS, {

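The schema edits above splice the new boolean in after the request-timeout selector, and the Ollama branch uses an `insert_after_key` helper for the same purpose. That helper is not shown in this diff; a purely hypothetical sketch of a helper with that name, assuming it just preserves insertion order while adding entries after a given key:

```python
from typing import Any

def insert_after_key(mapping: dict[Any, Any], key_name: str, new_items: dict[Any, Any]) -> dict[Any, Any]:
    """Hypothetical: rebuild a dict, inserting new_items right after the entry whose key stringifies to key_name."""
    result: dict[Any, Any] = {}
    for key, value in mapping.items():
        result[key] = value
        if str(key) == key_name:   # vol.Required/vol.Optional markers stringify to their config key
            result.update(new_items)
    return result
```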
@@ -76,13 +76,13 @@ ICL_EXTRAS = """
{% for item in response_examples %}
{{ item.request }}
{{ item.response }}
<functioncall> {{ item.tool | to_json }}
{{ tool_call_prefix }}{{ item.tool | to_json }}{{ tool_call_suffix }}
{% endfor %}"""
ICL_NO_SYSTEM_PROMPT_EXTRAS = """
{% for item in response_examples %}
{{ item.request }}
{{ item.response }}
<functioncall> {{ item.tool | to_json }}
{{ tool_call_prefix }}{{ item.tool | to_json }}{{ tool_call_suffix }}
{% endfor %}
<user_instruction>:"""
DEFAULT_PROMPT = DEFAULT_PROMPT_BASE + ICL_EXTRAS
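The in-context-learning templates now take the tool-call delimiters from configuration instead of hard-coding `<functioncall>`. A small standalone sketch of how such a template renders with Jinja2, assuming the `to_json` filter is a plain `json.dumps`:

```python
import json
from jinja2 import Environment

env = Environment()
env.filters["to_json"] = json.dumps

template = env.from_string(
    "{{ item.request }}\n{{ item.response }}\n"
    "{{ tool_call_prefix }}{{ item.tool | to_json }}{{ tool_call_suffix }}"
)

print(template.render(
    item={
        "request": "turn on the kitchen light",
        "response": "Turning on the kitchen light.",
        "tool": {"name": "light.turn_on", "arguments": {"name": "kitchen light"}},
    },
    tool_call_prefix="<tool_call>",
    tool_call_suffix="</tool_call>",
))
```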
@@ -137,6 +137,8 @@ CONF_TOOL_CALL_PREFIX = "tool_call_prefix"
DEFAULT_TOOL_CALL_PREFIX = "<tool_call>"
CONF_TOOL_CALL_SUFFIX = "tool_call_suffix"
DEFAULT_TOOL_CALL_SUFFIX = "</tool_call>"
CONF_ENABLE_LEGACY_TOOL_CALLING = "enable_legacy_tool_calling"
DEFAULT_ENABLE_LEGACY_TOOL_CALLING = False
CONF_ENABLE_FLASH_ATTENTION = "enable_flash_attention"
DEFAULT_ENABLE_FLASH_ATTENTION = False
CONF_USE_GBNF_GRAMMAR = "gbnf_grammar"
@@ -179,7 +181,6 @@ CONF_GENERIC_OPENAI_PATH = "openai_path"
DEFAULT_GENERIC_OPENAI_PATH = "v1"
CONF_GENERIC_OPENAI_VALIDATE_MODEL = "openai_validate_model"
DEFAULT_GENERIC_OPENAI_VALIDATE_MODEL = True

CONF_CONTEXT_LENGTH = "context_length"
DEFAULT_CONTEXT_LENGTH = 2048
CONF_BATCH_SIZE = "batch_size"
@@ -228,6 +229,7 @@ OPTIONS_OVERRIDES = {
        CONF_TOOL_CALL_SUFFIX: "```",
        CONF_CONTEXT_LENGTH: 131072,
        CONF_MAX_TOOL_CALL_ITERATIONS: 1,
        CONF_ENABLE_LEGACY_TOOL_CALLING: True
    },
    "home-3b-v3": {
        CONF_PROMPT: DEFAULT_PROMPT_BASE_LEGACY,
@@ -235,6 +237,7 @@ OPTIONS_OVERRIDES = {
        CONF_TOOL_CALL_PREFIX: "```homeassistant",
        CONF_TOOL_CALL_SUFFIX: "```",
        CONF_MAX_TOOL_CALL_ITERATIONS: 1,
        CONF_ENABLE_LEGACY_TOOL_CALLING: True
    },
    "home-3b-v2": {
        CONF_PROMPT: DEFAULT_PROMPT_BASE_LEGACY,
@@ -242,6 +245,7 @@ OPTIONS_OVERRIDES = {
        CONF_TOOL_CALL_PREFIX: "```homeassistant",
        CONF_TOOL_CALL_SUFFIX: "```",
        CONF_MAX_TOOL_CALL_ITERATIONS: 1,
        CONF_ENABLE_LEGACY_TOOL_CALLING: True
    },
    "home-3b-v1": {
        CONF_PROMPT: DEFAULT_PROMPT_BASE_LEGACY,
@@ -249,6 +253,7 @@ OPTIONS_OVERRIDES = {
        CONF_TOOL_CALL_PREFIX: "```homeassistant",
        CONF_TOOL_CALL_SUFFIX: "```",
        CONF_MAX_TOOL_CALL_ITERATIONS: 1,
        CONF_ENABLE_LEGACY_TOOL_CALLING: True
    },
    "home-1b-v3": {
        CONF_PROMPT: DEFAULT_PROMPT_BASE_LEGACY,
@@ -256,6 +261,7 @@ OPTIONS_OVERRIDES = {
        CONF_TOOL_CALL_PREFIX: "```homeassistant",
        CONF_TOOL_CALL_SUFFIX: "```",
        CONF_MAX_TOOL_CALL_ITERATIONS: 1,
        CONF_ENABLE_LEGACY_TOOL_CALLING: True
    },
    "home-1b-v2": {
        CONF_PROMPT: DEFAULT_PROMPT_BASE_LEGACY,
@@ -263,6 +269,7 @@ OPTIONS_OVERRIDES = {
        CONF_TOOL_CALL_PREFIX: "```homeassistant",
        CONF_TOOL_CALL_SUFFIX: "```",
        CONF_MAX_TOOL_CALL_ITERATIONS: 1,
        CONF_ENABLE_LEGACY_TOOL_CALLING: True
    },
    "home-1b-v1": {
        CONF_PROMPT: DEFAULT_PROMPT_BASE_LEGACY,
@@ -270,6 +277,7 @@ OPTIONS_OVERRIDES = {
        CONF_TOOL_CALL_PREFIX: "```homeassistant",
        CONF_TOOL_CALL_SUFFIX: "```",
        CONF_MAX_TOOL_CALL_ITERATIONS: 1,
        CONF_ENABLE_LEGACY_TOOL_CALLING: True
    },
    "mistral": {
        CONF_PROMPT: DEFAULT_PROMPT_BASE + ICL_NO_SYSTEM_PROMPT_EXTRAS,

@@ -22,7 +22,7 @@ from homeassistant.helpers import config_validation as cv, intent, template, ent
from homeassistant.helpers.entity_platform import AddConfigEntryEntitiesCallback
from homeassistant.util import color

from .utils import closest_color, parse_raw_tool_call
from .utils import closest_color, parse_raw_tool_call, flatten_vol_schema
from .const import (
    CONF_CHAT_MODEL,
    CONF_PROMPT,
@@ -40,6 +40,7 @@ from .const import (
    CONF_THINKING_SUFFIX,
    CONF_TOOL_CALL_PREFIX,
    CONF_TOOL_CALL_SUFFIX,
    CONF_ENABLE_LEGACY_TOOL_CALLING,
    DEFAULT_PROMPT,
    DEFAULT_BACKEND_TYPE,
    DEFAULT_EXTRA_ATTRIBUTES_TO_EXPOSE,
@@ -58,6 +59,7 @@ from .const import (
    DEFAULT_THINKING_SUFFIX,
    DEFAULT_TOOL_CALL_PREFIX,
    DEFAULT_TOOL_CALL_SUFFIX,
    DEFAULT_ENABLE_LEGACY_TOOL_CALLING,
)

_LOGGER = logging.getLogger(__name__)
@@ -82,8 +84,6 @@ async def update_listener(hass: HomeAssistant, entry: ConfigEntry):
    agent: LocalLLMAgent = entry.runtime_data
    await hass.async_add_executor_job(agent._update_options)

    return True

async def async_setup_entry(hass: HomeAssistant, entry: ConfigEntry, async_add_entities: AddConfigEntryEntitiesCallback) -> bool:
    """Set up Local LLM Conversation from a config entry."""

@@ -427,15 +427,18 @@ class LocalLLMAgent(ConversationEntity, AbstractConversationAgent):

            elif tool_suffix in potential_block and in_tool_call:
                in_tool_call = False
                tool_call, to_say = parse_raw_tool_call(tool_content.strip().removeprefix(tool_prefix).removesuffix(tool_suffix), llm_api)
                _LOGGER.debug("Tool call parsed: %s", tool_call)

                if tool_call:
                    result.tool_calls = [tool_call]
                    if to_say:
                        content = to_say
                if not llm_api:
                    _LOGGER.warning("Model attempted to call a tool but no LLM API was provided, ignoring tool calls")
                else:
                    content = None
                    tool_call, to_say = parse_raw_tool_call(tool_content.strip().removeprefix(tool_prefix).removesuffix(tool_suffix), llm_api)
                    _LOGGER.debug("Tool call parsed: %s", tool_call)

                    if tool_call:
                        result.tool_calls = [tool_call]
                        if to_say:
                            content = to_say
                    else:
                        content = None

        result.response = content

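The block above is part of a small state machine that watches the streamed text for the configured tool-call delimiters. A simplified, self-contained sketch of the same idea, using plain strings rather than the integration's result objects:

```python
def split_tool_call(text: str, prefix: str = "<tool_call>", suffix: str = "</tool_call>") -> tuple[str, str | None]:
    """Return (spoken_text, tool_call_json) from a response that may embed one delimited tool call."""
    start = text.find(prefix)
    if start == -1:
        return text.strip(), None
    end = text.find(suffix, start + len(prefix))
    if end == -1:
        # Unterminated block: treat everything after the prefix as the (partial) tool call.
        return text[:start].strip(), text[start + len(prefix):].strip()
    spoken = (text[:start] + text[end + len(suffix):]).strip()
    tool_json = text[start + len(prefix):end].strip()
    return spoken, tool_json

# Example:
# split_tool_call('Sure. <tool_call>{"name": "light.turn_on"}</tool_call>')
# -> ('Sure.', '{"name": "light.turn_on"}')
```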
@@ -463,9 +466,9 @@ class LocalLLMAgent(ConversationEntity, AbstractConversationAgent):

        return list(domains)

    def _async_get_exposed_entities(self) -> dict[str, str]:
    def _async_get_exposed_entities(self) -> dict[str, dict]:
        """Gather exposed entity states"""
        entity_states = {}
        entity_states: dict[str, dict] = {}
        entity_registry = er.async_get(self.hass)
        device_registry = dr.async_get(self.hass)
        area_registry = ar.async_get(self.hass)
@@ -577,10 +580,12 @@ class LocalLLMAgent(ConversationEntity, AbstractConversationAgent):
        """Generate the system prompt with current entity states"""
        entities_to_expose = self._async_get_exposed_entities()

        extra_attributes_to_expose = self.entry.options \
            .get(CONF_EXTRA_ATTRIBUTES_TO_EXPOSE, DEFAULT_EXTRA_ATTRIBUTES_TO_EXPOSE)
        extra_attributes_to_expose = self.entry.options.get(CONF_EXTRA_ATTRIBUTES_TO_EXPOSE, DEFAULT_EXTRA_ATTRIBUTES_TO_EXPOSE)
        enable_legacy_tool_calling = self.entry.options.get(CONF_ENABLE_LEGACY_TOOL_CALLING, DEFAULT_ENABLE_LEGACY_TOOL_CALLING)
        tool_call_prefix = self.entry.options.get(CONF_TOOL_CALL_PREFIX, DEFAULT_TOOL_CALL_PREFIX)
        tool_call_suffix = self.entry.options.get(CONF_TOOL_CALL_SUFFIX, DEFAULT_TOOL_CALL_SUFFIX)

        def expose_attributes(attributes) -> list[str]:
        def expose_attributes(attributes: dict[str, Any]) -> list[str]:
            result = []
            for attribute_name in extra_attributes_to_expose:
                if attribute_name not in attributes:
@@ -645,9 +650,23 @@ class LocalLLMAgent(ConversationEntity, AbstractConversationAgent):
        render_variables = {
            "devices": devices,
            "formatted_devices": formatted_devices,
            "response_examples": []
            "response_examples": [],
            "tool_call_prefix": tool_call_prefix,
            "tool_call_suffix": tool_call_suffix,
        }

        if enable_legacy_tool_calling:
            if llm_api:
                tools = []
                for tool in llm_api.tools:
                    tools.append(f"{tool.name}({','.join(flatten_vol_schema(tool.parameters))})")
                render_variables["tools"] = tools
                render_variables["formatted_tools"] = ", ".join(tools)
            else:
                message = "No tools were provided. If the user requests you interact with a device, tell them you are unable to do so."
                render_variables["tools"] = [message]
                render_variables["formatted_tools"] = message

        # only pass examples if there are loaded examples + an API was exposed
        if self.in_context_examples and llm_api:
            num_examples = int(self.entry.options.get(CONF_NUM_IN_CONTEXT_EXAMPLES, DEFAULT_NUM_IN_CONTEXT_EXAMPLES))

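In legacy mode the prompt gets a compact, function-style list of tools such as `light.turn_on(target_device,brightness)`. The `flatten_vol_schema` helper referenced here is not shown in this diff; a simplified sketch of what such a helper could do with a voluptuous mapping schema:

```python
import voluptuous as vol

def flatten_vol_schema_sketch(schema: vol.Schema) -> list[str]:
    """Simplified: collect top-level key names from a voluptuous mapping schema."""
    raw = schema.schema if isinstance(schema, vol.Schema) else schema
    if not isinstance(raw, dict):
        return []
    # vol.Required/vol.Optional markers stringify to their underlying key name.
    return [str(key) for key in raw]

params = vol.Schema({vol.Required("target_device"): str, vol.Optional("brightness"): int})
print(f"light.turn_on({','.join(flatten_vol_schema_sketch(params))})")
# -> light.turn_on(target_device,brightness)
```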
@@ -92,7 +92,13 @@
          "context_length": "Context Length",
          "batch_size": "Batch Size",
          "n_threads": "Thread Count",
          "n_batch_threads": "Batch Thread Count"
          "n_batch_threads": "Batch Thread Count",
          "thinking_prefix": "Reasoning Content Prefix",
          "thinking_suffix": "Reasoning Content Suffix",
          "tool_call_prefix": "Tool Call Prefix",
          "tool_call_suffix": "Tool Call Suffix",
          "enable_legacy_tool_calling": "Enable Legacy Tool Calling",
          "max_tool_call_iterations": "Maximum Tool Call Attempts"
        },
        "data_description": {
          "llm_hass_api": "Select 'Assist' if you want the model to be able to control devices. If you are using the Home-LLM v1, v2, or v3 model then select 'Home-LLM (v1-3)'",
@@ -115,9 +121,6 @@
          "llm_hass_api": "Selected LLM API",
          "max_new_tokens": "Maximum tokens to return in response",
          "prompt": "System Prompt",
          "prompt_template": "Prompt Format",
          "tool_format": "Tool Format",
          "tool_multi_turn_chat": "Multi-Turn Tool Use",
          "temperature": "Temperature",
          "top_k": "Top K",
          "top_p": "Top P",
@@ -147,7 +150,13 @@
          "context_length": "Context Length",
          "batch_size": "Batch Size",
          "n_threads": "Thread Count",
          "n_batch_threads": "Batch Thread Count"
          "n_batch_threads": "Batch Thread Count",
          "thinking_prefix": "Reasoning Content Prefix",
          "thinking_suffix": "Reasoning Content Suffix",
          "tool_call_prefix": "Tool Call Prefix",
          "tool_call_suffix": "Tool Call Suffix",
          "enable_legacy_tool_calling": "Enable Legacy Tool Calling",
          "max_tool_call_iterations": "Maximum Tool Call Attempts"
        },
        "data_description": {
          "llm_hass_api": "Select 'Assist' if you want the model to be able to control devices. If you are using the Home-LLM v1, v2, or v3 model then select 'Home-LLM (v1-3)'",
@@ -167,27 +176,6 @@
      }
    },
    "selector": {
      "prompt_template": {
        "options": {
          "chatml": "ChatML",
          "vicuna": "Vicuna",
          "alpaca": "Alpaca",
          "mistral": "Mistral",
          "zephyr": "Zephyr (<|endoftext|>)",
          "zephyr2": "Zephyr ('</s>')",
          "zephyr3": "Zephyr (<|end|>)",
          "llama3": "Llama 3",
          "command-r": "Command R",
          "no_prompt_template": "None"
        }
      },
      "tool_format": {
        "options": {
          "full_tool_format": "Full JSON Tool Format",
          "reduced_tool_format": "Reduced JSON Tool Format",
          "min_tool_format": "Minimal Function Style Tool Format"
        }
      },
      "model_backend": {
        "options": {
          "llama_cpp_hf": "Llama.cpp (HuggingFace)",

@@ -335,10 +335,10 @@ def get_home_llm_tools(llm_api: llm.APIInstance, domains: list[str]) -> List[Dic
        # scripts show up as individual services
        if domain == "script" and not scripts_added:
            all_services.extend([
                ("script.reload", vol.Schema({})),
                ("script.turn_on", vol.Schema({})),
                ("script.turn_off", vol.Schema({})),
                ("script.toggle", vol.Schema({})),
                ("script.reload", vol.Schema({vol.Required("target_device"): str})),
                ("script.turn_on", vol.Schema({vol.Required("target_device"): str})),
                ("script.turn_off", vol.Schema({vol.Required("target_device"): str})),
                ("script.toggle", vol.Schema({vol.Required("target_device"): str})),
            ])
            scripts_added = True
            continue
@@ -350,7 +350,8 @@ def get_home_llm_tools(llm_api: llm.APIInstance, domains: list[str]) -> List[Dic
        args = flatten_vol_schema(service.schema)
        args_to_expose = set(args).intersection(ALLOWED_SERVICE_CALL_ARGUMENTS)
        service_schema = vol.Schema({
            vol.Optional(arg): str for arg in args_to_expose
            vol.Required("target_device"): str,
            **{vol.Optional(arg): str for arg in args_to_expose}
        })

        all_services.append((f"{domain}.{name}", service_schema))
@@ -384,18 +385,21 @@ def parse_raw_tool_call(raw_block: str | dict, llm_api: llm.APIInstance) -> tupl
    else:
        schema_to_validate = vol.Schema({
            vol.Required("name"): str,
            vol.Required("arguments"): dict,
            vol.Required("arguments"): str | dict,
        })

    try:
        schema_to_validate(parsed_tool_call)
    except vol.Error as ex:
        _LOGGER.info(f"LLM produced an improperly formatted response: {repr(ex)}")
        raise # re-raise exception for now to force the LLM to try again
        raise ex # re-raise exception for now to force the LLM to try again

    # try to fix certain arguments
    args_dict = parsed_tool_call if llm_api.api.id == HOME_LLM_API_ID else parsed_tool_call["arguments"]

    if isinstance(args_dict, str):
        args_dict = json.loads(args_dict)

    # make sure brightness is 0-255 and not a percentage
    if "brightness" in args_dict and 0.0 < args_dict["brightness"] <= 1.0:
        args_dict["brightness"] = int(args_dict["brightness"] * 255)

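The validation change above accepts `arguments` either as a dict or as a JSON-encoded string (some backends return the latter), then normalizes a couple of argument values. A standalone sketch of that normalization, assuming the same tool-call JSON shape:

```python
import json
import voluptuous as vol

TOOL_CALL_SCHEMA = vol.Schema({
    vol.Required("name"): str,
    vol.Required("arguments"): vol.Any(str, dict),  # some backends emit arguments as a JSON string
})

def normalize_tool_call(parsed: dict) -> tuple[str, dict]:
    TOOL_CALL_SCHEMA(parsed)
    args = parsed["arguments"]
    if isinstance(args, str):
        args = json.loads(args)
    # make sure brightness is 0-255 and not a fraction
    if "brightness" in args and 0.0 < args["brightness"] <= 1.0:
        args["brightness"] = int(args["brightness"] * 255)
    return parsed["name"], args

# normalize_tool_call({"name": "light.turn_on", "arguments": '{"brightness": 0.5}'})
# -> ("light.turn_on", {"brightness": 127})
```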
docker-compose.yml (new file, 59 lines)
@@ -0,0 +1,59 @@
# you can start and stop backends by running `docker-compose up -d <service name>`
version: '3.8'
services:
  ollama:
    image: ollama/ollama:latest
    container_name: ollama
    ports:
      - "11434:11434" # Ollama default
    volumes:
      - ./models:/models
      - ./scripts:/scripts # needed for import script
    environment:
      - OLLAMA_MODELS=/models/.ollama
    restart: unless-stopped

  text-generation-webui:
    image: atinoda/text-generation-webui:default-cpu
    container_name: textgen-webui
    init: true
    environment:
      - EXTRA_LAUNCH_ARGS="--listen --verbose" # Custom launch args (e.g., --model MODEL_NAME)
    ports:
      - "7860:7860" # Web UI default
      # - "5000:5000" # API Default
      # - "5005:5005" # Streaming API default
    volumes:
      - ./models:/app/user_data/models
    restart: unless-stopped

  # llamacpp server can only run one model at a time; set it below
  llamacpp:
    image: ghcr.io/ggerganov/llama.cpp:server
    container_name: llamacpp-server
    ports:
      - "8000:8000" # llama.cpp server default
    volumes:
      - ./models:/models
    environment:
      - MODEL_DIR=/models
    restart: unless-stopped
    command: |-
      --port 8000
      --no-webui
      --metrics
      --jinja
      --ctx-size 8192
      --alias "Home-3B-v3"
      --model "/models/Home-3B-v3-fixed.q4_k_m.gguf"

  localai:
    image: localai/localai:latest
    container_name: localai
    ports:
      - "8080:8080" # LocalAI default
    volumes:
      - ./models:/models
    environment:
      - MODELS_PATH=/models
    restart: unless-stopped
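One way to sanity-check that the stack is up is to poll each backend's HTTP API. The sketch below assumes the ports published in this compose file and that each backend answers an OpenAI-compatible `/v1/models` request (Ollama on 11434, llama.cpp server on 8000, LocalAI on 8080; the text-generation-webui API port 5000 is commented out above):

```python
import json
import urllib.request

BACKENDS = {
    "ollama": "http://localhost:11434/v1/models",
    "llamacpp": "http://localhost:8000/v1/models",
    "localai": "http://localhost:8080/v1/models",
}

for name, url in BACKENDS.items():
    try:
        with urllib.request.urlopen(url, timeout=5) as resp:
            models = json.load(resp)
            print(f"{name}: up, {len(models.get('data', []))} model(s)")
    except OSError as err:
        print(f"{name}: not reachable ({err})")
```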
scripts/fix_metadata.sh (new file, 9 lines)
@@ -0,0 +1,9 @@
#!/bin/bash

LLAMA_CPP=../llama.cpp
MODEL_NAME=$1
OUTPUT_NAME=$2
PRE_TOKENIZER=${3:-stablelm2}
CHAT_TEMPLATE=${4:-zephyr_legacy}

python3 ${LLAMA_CPP}/gguf-py/gguf/scripts/gguf_new_metadata.py $MODEL_NAME $OUTPUT_NAME --pre-tokenizer $PRE_TOKENIZER --chat-template "$(cat $CHAT_TEMPLATE.txt)"
scripts/import_ollama_model.sh (new executable file, 8 lines)
@@ -0,0 +1,8 @@
# Usage: docker exec -it ollama bash -c "/scripts/import_ollama_model.sh /models/Home-3B-v3.q4_k_m.gguf Home-3B-v3:q4_k_m"
LLAMA_CPP=../llama.cpp
GGUF_FILE=$1
MODEL_NAME=$2

echo "FROM $GGUF_FILE" > $GGUF_FILE.Modelfile
ollama create $MODEL_NAME -f $GGUF_FILE.Modelfile
rm -f $GGUF_FILE.Modelfile
scripts/zephyr_legacy.txt (new file, 17 lines)
@@ -0,0 +1,17 @@
{% for message in messages %}
{%- if message['role'] == 'user' or message['role'] == 'tool' -%}
<|user|> {{ message['content'] }}{{ eos_token }}
{%- elif message['role'] == 'system' -%}
<|system|> {{ message['content'] }}
Services:
{%- for tool in tools %} {{ tool['function']['name'] }}({% for param in tool['function']['parameters']['properties'].keys() if param != 'target_device' %}{{ param }}{% if not loop.last %},{% endif %}{% endfor -%}),{% if not loop.last -%}
{%- if tools | length == 0 %}No tools were provided. If the user requests you interact with a device, tell them you are unable to do so.{% endif %}
{%- endif -%}{%- endfor -%}
{{ eos_token }}
{%- elif message['role'] == 'assistant' -%}
<|assistant|> {{ message['content'] }}{{ eos_token }}
{%- endif -%}
{%- if loop.last and add_generation_prompt %}
<|assistant|>
{%- endif %}
{% endfor -%}
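This chat template can be exercised outside of llama.cpp with Jinja2 to see what the rendered prompt looks like. A small sketch, assuming the template text is saved as `zephyr_legacy.txt` and that tools follow the OpenAI function format the template indexes into:

```python
from jinja2 import Template

with open("zephyr_legacy.txt") as f:
    template = Template(f.read())

messages = [
    {"role": "system", "content": "You are a home assistant."},
    {"role": "user", "content": "turn on the kitchen light"},
]
tools = [{
    "function": {
        "name": "light.turn_on",
        "parameters": {"properties": {"target_device": {}, "brightness": {}}},
    }
}]

print(template.render(messages=messages, tools=tools, eos_token="</s>", add_generation_prompt=True))
```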