mirror of https://github.com/acon96/home-llm.git

Commit: clean up the options page

TODO.md
@@ -45,12 +45,11 @@
 ## v0.4 TODO for release:
-- [ ] re-order the settings on the options config flow page. the order is very confusing
+- [x] re-order the settings on the options config flow page. the order is very confusing
 - [ ] split out entity functionality so we can support conversation + ai tasks
 - [x] fix icl examples to match new tool calling syntax config
 - [x] set up docker-compose for running all of the various backends
 - [ ] fix and re-upload all compatible old models (+ upload all original safetensors)
 - [ ] dedicated localai backend (tailored openai variant /w model loading)
 - [x] fix the openai responses backend

 ## more complicated ideas
@@ -6,7 +6,7 @@ import logging
 import os
 import threading
 import time
-from typing import Any, Callable, List, Generator, AsyncGenerator, Optional
+from typing import Any, Callable, List, Generator, AsyncGenerator, Optional, cast

 from homeassistant.components import conversation as conversation
 from homeassistant.components.conversation.const import DOMAIN as CONVERSATION_DOMAIN
@@ -28,15 +28,15 @@ from custom_components.llama_conversation.const import (
     CONF_TYPICAL_P,
     CONF_MIN_P,
     CONF_DOWNLOADED_MODEL_FILE,
-    CONF_ENABLE_FLASH_ATTENTION,
+    CONF_LLAMACPP_ENABLE_FLASH_ATTENTION,
     CONF_USE_GBNF_GRAMMAR,
     CONF_GBNF_GRAMMAR_FILE,
     CONF_PROMPT_CACHING_ENABLED,
     CONF_PROMPT_CACHING_INTERVAL,
     CONF_CONTEXT_LENGTH,
-    CONF_BATCH_SIZE,
-    CONF_THREAD_COUNT,
-    CONF_BATCH_THREAD_COUNT,
+    CONF_LLAMACPP_BATCH_SIZE,
+    CONF_LLAMACPP_THREAD_COUNT,
+    CONF_LLAMACPP_BATCH_THREAD_COUNT,
     DEFAULT_MAX_TOKENS,
     DEFAULT_PROMPT,
     DEFAULT_TEMPERATURE,
@@ -44,15 +44,16 @@ from custom_components.llama_conversation.const import (
     DEFAULT_TOP_P,
     DEFAULT_MIN_P,
     DEFAULT_TYPICAL_P,
-    DEFAULT_ENABLE_FLASH_ATTENTION,
+    DEFAULT_LLAMACPP_ENABLE_FLASH_ATTENTION,
     DEFAULT_USE_GBNF_GRAMMAR,
     DEFAULT_GBNF_GRAMMAR_FILE,
     DEFAULT_PROMPT_CACHING_ENABLED,
     DEFAULT_PROMPT_CACHING_INTERVAL,
     DEFAULT_CONTEXT_LENGTH,
-    DEFAULT_BATCH_SIZE,
-    DEFAULT_THREAD_COUNT,
-    DEFAULT_BATCH_THREAD_COUNT,
+    DEFAULT_LLAMACPP_BATCH_SIZE,
+    DEFAULT_LLAMACPP_THREAD_COUNT,
+    DEFAULT_LLAMACPP_BATCH_THREAD_COUNT,
     DOMAIN,
 )
 from custom_components.llama_conversation.conversation import LocalLLMAgent, TextGenerationResult
@@ -71,7 +72,7 @@ class LlamaCppAgent(LocalLLMAgent):
     llm: LlamaType
     grammar: Any
     llama_cpp_module: Any
-    remove_prompt_caching_listener: Callable
+    remove_prompt_caching_listener: Optional[Callable]
     model_lock: threading.Lock
     last_cache_prime: float
     last_updated_entities: dict[str, float]
@@ -81,7 +82,7 @@ class LlamaCppAgent(LocalLLMAgent):
     _attr_supports_streaming = True

     def _load_model(self, entry: ConfigEntry) -> None:
-        self.model_path = entry.data.get(CONF_DOWNLOADED_MODEL_FILE)
+        self.model_path = entry.data.get(CONF_DOWNLOADED_MODEL_FILE, "")

         _LOGGER.info(
             "Using model file '%s'", self.model_path
@@ -109,18 +110,18 @@ class LlamaCppAgent(LocalLLMAgent):
         _LOGGER.debug(f"Loading model '{self.model_path}'...")
         self.loaded_model_settings = {}
         self.loaded_model_settings[CONF_CONTEXT_LENGTH] = entry.options.get(CONF_CONTEXT_LENGTH, DEFAULT_CONTEXT_LENGTH)
-        self.loaded_model_settings[CONF_BATCH_SIZE] = entry.options.get(CONF_BATCH_SIZE, DEFAULT_BATCH_SIZE)
-        self.loaded_model_settings[CONF_THREAD_COUNT] = entry.options.get(CONF_THREAD_COUNT, DEFAULT_THREAD_COUNT)
-        self.loaded_model_settings[CONF_BATCH_THREAD_COUNT] = entry.options.get(CONF_BATCH_THREAD_COUNT, DEFAULT_BATCH_THREAD_COUNT)
-        self.loaded_model_settings[CONF_ENABLE_FLASH_ATTENTION] = entry.options.get(CONF_ENABLE_FLASH_ATTENTION, DEFAULT_ENABLE_FLASH_ATTENTION)
+        self.loaded_model_settings[CONF_LLAMACPP_BATCH_SIZE] = entry.options.get(CONF_LLAMACPP_BATCH_SIZE, DEFAULT_LLAMACPP_BATCH_SIZE)
+        self.loaded_model_settings[CONF_LLAMACPP_THREAD_COUNT] = entry.options.get(CONF_LLAMACPP_THREAD_COUNT, DEFAULT_LLAMACPP_THREAD_COUNT)
+        self.loaded_model_settings[CONF_LLAMACPP_BATCH_THREAD_COUNT] = entry.options.get(CONF_LLAMACPP_BATCH_THREAD_COUNT, DEFAULT_LLAMACPP_BATCH_THREAD_COUNT)
+        self.loaded_model_settings[CONF_LLAMACPP_ENABLE_FLASH_ATTENTION] = entry.options.get(CONF_LLAMACPP_ENABLE_FLASH_ATTENTION, DEFAULT_LLAMACPP_ENABLE_FLASH_ATTENTION)

         self.llm = Llama(
             model_path=self.model_path,
             n_ctx=int(self.loaded_model_settings[CONF_CONTEXT_LENGTH]),
-            n_batch=int(self.loaded_model_settings[CONF_BATCH_SIZE]),
-            n_threads=int(self.loaded_model_settings[CONF_THREAD_COUNT]),
-            n_threads_batch=int(self.loaded_model_settings[CONF_BATCH_THREAD_COUNT]),
-            flash_attn=self.loaded_model_settings[CONF_ENABLE_FLASH_ATTENTION],
+            n_batch=int(self.loaded_model_settings[CONF_LLAMACPP_BATCH_SIZE]),
+            n_threads=int(self.loaded_model_settings[CONF_LLAMACPP_THREAD_COUNT]),
+            n_threads_batch=int(self.loaded_model_settings[CONF_LLAMACPP_BATCH_THREAD_COUNT]),
+            flash_attn=self.loaded_model_settings[CONF_LLAMACPP_ENABLE_FLASH_ATTENTION],
         )
         _LOGGER.debug("Model loaded")

@@ -136,7 +137,7 @@ class LlamaCppAgent(LocalLLMAgent):
         # ))

         self.remove_prompt_caching_listener = None
-        self.last_cache_prime = None
+        self.last_cache_prime = 0.0
         self.last_updated_entities = {}
         self.cache_refresh_after_cooldown = False
         self.model_lock = threading.Lock()
@@ -167,26 +168,26 @@ class LlamaCppAgent(LocalLLMAgent):

         model_reloaded = False
         if self.loaded_model_settings[CONF_CONTEXT_LENGTH] != self.entry.options.get(CONF_CONTEXT_LENGTH, DEFAULT_CONTEXT_LENGTH) or \
-            self.loaded_model_settings[CONF_BATCH_SIZE] != self.entry.options.get(CONF_BATCH_SIZE, DEFAULT_BATCH_SIZE) or \
-            self.loaded_model_settings[CONF_THREAD_COUNT] != self.entry.options.get(CONF_THREAD_COUNT, DEFAULT_THREAD_COUNT) or \
-            self.loaded_model_settings[CONF_BATCH_THREAD_COUNT] != self.entry.options.get(CONF_BATCH_THREAD_COUNT, DEFAULT_BATCH_THREAD_COUNT) or \
-            self.loaded_model_settings[CONF_ENABLE_FLASH_ATTENTION] != self.entry.options.get(CONF_ENABLE_FLASH_ATTENTION, DEFAULT_ENABLE_FLASH_ATTENTION):
+            self.loaded_model_settings[CONF_LLAMACPP_BATCH_SIZE] != self.entry.options.get(CONF_LLAMACPP_BATCH_SIZE, DEFAULT_LLAMACPP_BATCH_SIZE) or \
+            self.loaded_model_settings[CONF_LLAMACPP_THREAD_COUNT] != self.entry.options.get(CONF_LLAMACPP_THREAD_COUNT, DEFAULT_LLAMACPP_THREAD_COUNT) or \
+            self.loaded_model_settings[CONF_LLAMACPP_BATCH_THREAD_COUNT] != self.entry.options.get(CONF_LLAMACPP_BATCH_THREAD_COUNT, DEFAULT_LLAMACPP_BATCH_THREAD_COUNT) or \
+            self.loaded_model_settings[CONF_LLAMACPP_ENABLE_FLASH_ATTENTION] != self.entry.options.get(CONF_LLAMACPP_ENABLE_FLASH_ATTENTION, DEFAULT_LLAMACPP_ENABLE_FLASH_ATTENTION):

             _LOGGER.debug(f"Reloading model '{self.model_path}'...")
             self.loaded_model_settings[CONF_CONTEXT_LENGTH] = self.entry.options.get(CONF_CONTEXT_LENGTH, DEFAULT_CONTEXT_LENGTH)
-            self.loaded_model_settings[CONF_BATCH_SIZE] = self.entry.options.get(CONF_BATCH_SIZE, DEFAULT_BATCH_SIZE)
-            self.loaded_model_settings[CONF_THREAD_COUNT] = self.entry.options.get(CONF_THREAD_COUNT, DEFAULT_THREAD_COUNT)
-            self.loaded_model_settings[CONF_BATCH_THREAD_COUNT] = self.entry.options.get(CONF_BATCH_THREAD_COUNT, DEFAULT_BATCH_THREAD_COUNT)
-            self.loaded_model_settings[CONF_ENABLE_FLASH_ATTENTION] = self.entry.options.get(CONF_ENABLE_FLASH_ATTENTION, DEFAULT_ENABLE_FLASH_ATTENTION)
+            self.loaded_model_settings[CONF_LLAMACPP_BATCH_SIZE] = self.entry.options.get(CONF_LLAMACPP_BATCH_SIZE, DEFAULT_LLAMACPP_BATCH_SIZE)
+            self.loaded_model_settings[CONF_LLAMACPP_THREAD_COUNT] = self.entry.options.get(CONF_LLAMACPP_THREAD_COUNT, DEFAULT_LLAMACPP_THREAD_COUNT)
+            self.loaded_model_settings[CONF_LLAMACPP_BATCH_THREAD_COUNT] = self.entry.options.get(CONF_LLAMACPP_BATCH_THREAD_COUNT, DEFAULT_LLAMACPP_BATCH_THREAD_COUNT)
+            self.loaded_model_settings[CONF_LLAMACPP_ENABLE_FLASH_ATTENTION] = self.entry.options.get(CONF_LLAMACPP_ENABLE_FLASH_ATTENTION, DEFAULT_LLAMACPP_ENABLE_FLASH_ATTENTION)

             Llama = getattr(self.llama_cpp_module, "Llama")
             self.llm = Llama(
                 model_path=self.model_path,
                 n_ctx=int(self.loaded_model_settings[CONF_CONTEXT_LENGTH]),
-                n_batch=int(self.loaded_model_settings[CONF_BATCH_SIZE]),
-                n_threads=int(self.loaded_model_settings[CONF_THREAD_COUNT]),
-                n_threads_batch=int(self.loaded_model_settings[CONF_BATCH_THREAD_COUNT]),
-                flash_attn=self.loaded_model_settings[CONF_ENABLE_FLASH_ATTENTION],
+                n_batch=int(self.loaded_model_settings[CONF_LLAMACPP_BATCH_SIZE]),
+                n_threads=int(self.loaded_model_settings[CONF_LLAMACPP_THREAD_COUNT]),
+                n_threads_batch=int(self.loaded_model_settings[CONF_LLAMACPP_BATCH_THREAD_COUNT]),
+                flash_attn=self.loaded_model_settings[CONF_LLAMACPP_ENABLE_FLASH_ATTENTION],
             )
             _LOGGER.debug("Model loaded")
             model_reloaded = True
@@ -211,7 +212,7 @@ class LlamaCppAgent(LocalLLMAgent):
         else:
             self._set_prompt_caching(enabled=False)

-    def _async_get_exposed_entities(self) -> dict[str, str]:
+    def _async_get_exposed_entities(self) -> dict[str, dict[str, str]]:
         """Takes the super class function results and sorts the entities with the recently updated at the end"""
         entities = LocalLLMAgent._async_get_exposed_entities(self)

@@ -219,7 +220,7 @@ class LlamaCppAgent(LocalLLMAgent):
         if not self.entry.options.get(CONF_PROMPT_CACHING_ENABLED, DEFAULT_PROMPT_CACHING_ENABLED):
             return entities

-        entity_order = { name: None for name in entities.keys() }
+        entity_order: dict[str, Optional[float]] = { name: None for name in entities.keys() }
         entity_order.update(self.last_updated_entities)

         def sort_key(item):
@@ -235,7 +236,7 @@ class LlamaCppAgent(LocalLLMAgent):

         _LOGGER.debug(f"sorted_items: {sorted_items}")

-        sorted_entities = {}
+        sorted_entities: dict[str, dict[str, str]] = {}
         for item_name, _ in sorted_items:
             sorted_entities[item_name] = entities[item_name]
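The three hunks above only tighten type annotations, but they sit inside the prompt-caching path: when prompt caching is enabled, exposed entities are re-sorted so the most recently updated ones land at the end of the prompt, which keeps the cached prefix stable. The diff never shows the body of `sort_key`, so the sketch below is purely illustrative (entity names, timestamp, and the key function are assumptions, not the integration's code):

```python
# Illustrative sketch only: names, timestamps, and sort_key are assumptions.
from typing import Optional

entities = {"light.kitchen": {}, "sensor.porch": {}, "switch.fan": {}}
last_updated_entities: dict[str, float] = {"switch.fan": 1700000123.0}

# entities with no recorded update stay at None; updated ones carry a timestamp
entity_order: dict[str, Optional[float]] = {name: None for name in entities}
entity_order.update(last_updated_entities)

def sort_key(item: tuple[str, Optional[float]]) -> tuple[bool, float]:
    _name, updated = item
    # never-updated entities first, then oldest-to-newest updates
    return (updated is not None, updated or 0.0)

sorted_items = sorted(entity_order.items(), key=sort_key)
sorted_entities = {name: entities[name] for name, _ in sorted_items}
print(list(sorted_entities))  # ['light.kitchen', 'sensor.porch', 'switch.fan']
```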
@@ -271,6 +272,7 @@ class LlamaCppAgent(LocalLLMAgent):
         try:
             llm_api = await llm.async_get_api(
                 self.hass, self.entry.options[CONF_LLM_HASS_API],
+                llm_context=llm.LLMContext(DOMAIN, context=None, language=None, assistant=None, device_id=None)
             )
         except HomeAssistantError:
             _LOGGER.exception("Failed to get LLM API when caching prompt!")
@@ -301,33 +303,48 @@
             return

         try:
+            # Build system/user messages and use the chat-completion API to prime
+            # the model. We request only a single token (max_tokens=1) and
+            # discard the result. This avoids implementing any streaming logic
+            # while still priming the KV cache with the system prompt.
             raw_prompt = self.entry.options.get(CONF_PROMPT, DEFAULT_PROMPT)
-            prompt = self._format_prompt([
-                { "role": "system", "message": self._generate_system_prompt(raw_prompt, llm_api)},
-                { "role": "user", "message": "" }
-            ], include_generation_prompt=False)
+            system_prompt = self._generate_system_prompt(raw_prompt, llm_api)

-            input_tokens = self.llm.tokenize(
-                prompt.encode(), add_bos=False
-            )
+            messages = get_oai_formatted_messages([
+                conversation.SystemContent(content=system_prompt),
+                conversation.UserContent(content="")
+            ])
+            tools = None
+            if llm_api:
+                tools = get_oai_formatted_tools(llm_api, self._async_get_all_exposed_domains())

             temperature = self.entry.options.get(CONF_TEMPERATURE, DEFAULT_TEMPERATURE)
             top_k = int(self.entry.options.get(CONF_TOP_K, DEFAULT_TOP_K))
             top_p = self.entry.options.get(CONF_TOP_P, DEFAULT_TOP_P)
             min_p = self.entry.options.get(CONF_MIN_P, DEFAULT_MIN_P)
             typical_p = self.entry.options.get(CONF_TYPICAL_P, DEFAULT_TYPICAL_P)
             grammar = self.grammar if self.entry.options.get(CONF_USE_GBNF_GRAMMAR, DEFAULT_USE_GBNF_GRAMMAR) else None

-            _LOGGER.debug(f"Processing {len(input_tokens)} input tokens...")
+            _LOGGER.debug("Priming model cache via chat completion API...")

-            # grab just one token. should prime the kv cache with the system prompt
-            next(self.llm.generate(
-                input_tokens,
-                temp=temperature,
-                top_k=top_k,
-                top_p=top_p,
-                grammar=grammar
-            ))
+            try:
+                # avoid strict typing issues from the llama-cpp-python bindings
+                self.llm.create_chat_completion(
+                    messages,
+                    tools=tools,
+                    temperature=temperature,
+                    top_k=top_k,
+                    top_p=top_p,
+                    min_p=min_p,
+                    typical_p=typical_p,
+                    max_tokens=1,
+                    grammar=grammar,
+                    stream=False,
+                )

-            self.last_cache_prime = time.time()
+                self.last_cache_prime = time.time()
+            except Exception:
+                _LOGGER.exception("Failed to prime model cache")
         finally:
             self.model_lock.release()
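In the rewritten cache-priming hunk above, `get_oai_formatted_messages` and `get_oai_formatted_tools` are helpers internal to this integration; the underlying call is plain `Llama.create_chat_completion` from llama-cpp-python. A standalone sketch of the same single-token priming trick, with a placeholder model path and prompt rather than anything from this repository:

```python
# Standalone sketch of KV-cache priming with llama-cpp-python.
# The model file and prompt text are placeholders, not values from home-llm.
from llama_cpp import Llama

llm = Llama(model_path="./model.q4_k_m.gguf", n_ctx=2048)

system_prompt = "You control a smart home. Devices: light.kitchen = off, ..."

# Request a single token and discard it; evaluating the request leaves the
# system prompt in the model's KV cache.
llm.create_chat_completion(
    [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": ""},
    ],
    max_tokens=1,
    stream=False,
)

# A later request that starts with the same system prompt can reuse the cached
# prefix, so only the new user turn has to be evaluated.
```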
@@ -121,4 +121,4 @@ class LlamaCppServerAgent(GenericOpenAIAPIAgent):
         if self.entry.options.get(CONF_USE_GBNF_GRAMMAR, DEFAULT_USE_GBNF_GRAMMAR):
             request_params["grammar"] = self.grammar

-        return endpoint, request_params
+        return endpoint, request_params
@@ -57,7 +57,7 @@ from .const import (
     CONF_TOOL_CALL_PREFIX,
     CONF_TOOL_CALL_SUFFIX,
     CONF_ENABLE_LEGACY_TOOL_CALLING,
-    CONF_ENABLE_FLASH_ATTENTION,
+    CONF_LLAMACPP_ENABLE_FLASH_ATTENTION,
     CONF_USE_GBNF_GRAMMAR,
     CONF_GBNF_GRAMMAR_FILE,
     CONF_EXTRA_ATTRIBUTES_TO_EXPOSE,
@@ -80,9 +80,9 @@ from .const import (
     CONF_GENERIC_OPENAI_PATH,
     CONF_GENERIC_OPENAI_VALIDATE_MODEL,
     CONF_CONTEXT_LENGTH,
-    CONF_BATCH_SIZE,
-    CONF_THREAD_COUNT,
-    CONF_BATCH_THREAD_COUNT,
+    CONF_LLAMACPP_BATCH_SIZE,
+    CONF_LLAMACPP_THREAD_COUNT,
+    CONF_LLAMACPP_BATCH_THREAD_COUNT,
     DEFAULT_CHAT_MODEL,
     DEFAULT_PORT,
     DEFAULT_SSL,
@@ -107,7 +107,7 @@ from .const import (
     DEFAULT_TOOL_CALL_PREFIX,
     DEFAULT_TOOL_CALL_SUFFIX,
     DEFAULT_ENABLE_LEGACY_TOOL_CALLING,
-    DEFAULT_ENABLE_FLASH_ATTENTION,
+    DEFAULT_LLAMACPP_ENABLE_FLASH_ATTENTION,
     DEFAULT_USE_GBNF_GRAMMAR,
     DEFAULT_GBNF_GRAMMAR_FILE,
     DEFAULT_EXTRA_ATTRIBUTES_TO_EXPOSE,
@@ -126,9 +126,9 @@ from .const import (
     DEFAULT_GENERIC_OPENAI_PATH,
     DEFAULT_GENERIC_OPENAI_VALIDATE_MODEL,
     DEFAULT_CONTEXT_LENGTH,
-    DEFAULT_BATCH_SIZE,
-    DEFAULT_THREAD_COUNT,
-    DEFAULT_BATCH_THREAD_COUNT,
+    DEFAULT_LLAMACPP_BATCH_SIZE,
+    DEFAULT_LLAMACPP_THREAD_COUNT,
+    DEFAULT_LLAMACPP_BATCH_THREAD_COUNT,
     BACKEND_TYPE_LLAMA_HF,
     BACKEND_TYPE_LLAMA_EXISTING,
     BACKEND_TYPE_TEXT_GEN_WEBUI,
@@ -882,7 +882,7 @@ def local_llama_config_option_schema(hass: HomeAssistant, options: MappingProxyT
         vol.Required(
             CONF_MAX_TOKENS,
             description={"suggested_value": options.get(CONF_MAX_TOKENS)},
             default=DEFAULT_MAX_TOKENS,
-        ): int,
+        ): NumberSelector(NumberSelectorConfig(min=1, max=8192, step=1)),
         vol.Required(
             CONF_EXTRA_ATTRIBUTES_TO_EXPOSE,
             description={"suggested_value": options.get(CONF_EXTRA_ATTRIBUTES_TO_EXPOSE)},
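The hunk above swaps the bare `int` validator for the max-tokens option with a Home Assistant `NumberSelector`, so the options UI shows a bounded number field instead of free-form text. A simplified sketch of the pattern (not the integration's full schema; the 512 default mirrors the new `DEFAULT_MAX_TOKENS` later in this commit):

```python
# Simplified sketch: a voluptuous options schema with a bounded NumberSelector.
import voluptuous as vol
from homeassistant.helpers.selector import NumberSelector, NumberSelectorConfig

OPTIONS_SCHEMA = vol.Schema(
    {
        # "max_new_tokens" is the integration's CONF_MAX_TOKENS key
        vol.Required("max_new_tokens", default=512): NumberSelector(
            NumberSelectorConfig(min=1, max=8192, step=1)
        ),
    }
)
```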
@@ -926,7 +926,7 @@ def local_llama_config_option_schema(hass: HomeAssistant, options: MappingProxyT
     }

     if is_local_backend(backend_type):
-        result = insert_after_key(result, CONF_MAX_TOKENS, {
+        result.update({
             vol.Required(
                 CONF_TOP_K,
                 description={"suggested_value": options.get(CONF_TOP_K)},
@@ -969,24 +969,24 @@ def local_llama_config_option_schema(hass: HomeAssistant, options: MappingProxyT
                 default=DEFAULT_CONTEXT_LENGTH,
             ): NumberSelector(NumberSelectorConfig(min=512, max=32768, step=1)),
             vol.Required(
-                CONF_BATCH_SIZE,
-                description={"suggested_value": options.get(CONF_BATCH_SIZE)},
-                default=DEFAULT_BATCH_SIZE,
+                CONF_LLAMACPP_BATCH_SIZE,
+                description={"suggested_value": options.get(CONF_LLAMACPP_BATCH_SIZE)},
+                default=DEFAULT_LLAMACPP_BATCH_SIZE,
             ): NumberSelector(NumberSelectorConfig(min=1, max=8192, step=1)),
             vol.Required(
-                CONF_THREAD_COUNT,
-                description={"suggested_value": options.get(CONF_THREAD_COUNT)},
-                default=DEFAULT_THREAD_COUNT,
+                CONF_LLAMACPP_THREAD_COUNT,
+                description={"suggested_value": options.get(CONF_LLAMACPP_THREAD_COUNT)},
+                default=DEFAULT_LLAMACPP_THREAD_COUNT,
             ): NumberSelector(NumberSelectorConfig(min=1, max=(os.cpu_count() * 2), step=1)),
             vol.Required(
-                CONF_BATCH_THREAD_COUNT,
-                description={"suggested_value": options.get(CONF_BATCH_THREAD_COUNT)},
-                default=DEFAULT_BATCH_THREAD_COUNT,
+                CONF_LLAMACPP_BATCH_THREAD_COUNT,
+                description={"suggested_value": options.get(CONF_LLAMACPP_BATCH_THREAD_COUNT)},
+                default=DEFAULT_LLAMACPP_BATCH_THREAD_COUNT,
             ): NumberSelector(NumberSelectorConfig(min=1, max=(os.cpu_count() * 2), step=1)),
             vol.Required(
-                CONF_ENABLE_FLASH_ATTENTION,
-                description={"suggested_value": options.get(CONF_ENABLE_FLASH_ATTENTION)},
-                default=DEFAULT_ENABLE_FLASH_ATTENTION,
+                CONF_LLAMACPP_ENABLE_FLASH_ATTENTION,
+                description={"suggested_value": options.get(CONF_LLAMACPP_ENABLE_FLASH_ATTENTION)},
+                default=DEFAULT_LLAMACPP_ENABLE_FLASH_ATTENTION,
             ): BooleanSelector(BooleanSelectorConfig()),
             vol.Required(
                 CONF_USE_GBNF_GRAMMAR,
@@ -1000,7 +1000,7 @@ def local_llama_config_option_schema(hass: HomeAssistant, options: MappingProxyT
             ): str
         })
     elif backend_type == BACKEND_TYPE_TEXT_GEN_WEBUI:
-        result = insert_after_key(result, CONF_MAX_TOKENS, {
+        result.update({
             vol.Required(
                 CONF_CONTEXT_LENGTH,
                 description={"suggested_value": options.get(CONF_CONTEXT_LENGTH)},
@@ -1052,7 +1052,7 @@ def local_llama_config_option_schema(hass: HomeAssistant, options: MappingProxyT
             )),
         })
     elif backend_type in BACKEND_TYPE_GENERIC_OPENAI:
-        result = insert_after_key(result, CONF_MAX_TOKENS, {
+        result.update({
             vol.Required(
                 CONF_TEMPERATURE,
                 description={"suggested_value": options.get(CONF_TEMPERATURE)},
@@ -1076,7 +1076,7 @@ def local_llama_config_option_schema(hass: HomeAssistant, options: MappingProxyT
         })
     elif backend_type in BACKEND_TYPE_GENERIC_OPENAI_RESPONSES:
         del result[CONF_REMEMBER_NUM_INTERACTIONS]
-        result = insert_after_key(result, CONF_REMEMBER_CONVERSATION, {
+        result.update({
             vol.Required(
                 CONF_REMEMBER_CONVERSATION_TIME_MINUTES,
                 description={"suggested_value": options.get(CONF_REMEMBER_CONVERSATION_TIME_MINUTES)},
@@ -1101,7 +1101,7 @@ def local_llama_config_option_schema(hass: HomeAssistant, options: MappingProxyT
             ): NumberSelector(NumberSelectorConfig(min=5, max=900, step=1, unit_of_measurement=UnitOfTime.SECONDS, mode=NumberSelectorMode.BOX)),
         })
     elif backend_type == BACKEND_TYPE_LLAMA_CPP_SERVER:
-        result = insert_after_key(result, CONF_MAX_TOKENS, {
+        result.update({
             vol.Required(
                 CONF_TOP_K,
                 description={"suggested_value": options.get(CONF_TOP_K)},
@@ -1139,7 +1139,7 @@ def local_llama_config_option_schema(hass: HomeAssistant, options: MappingProxyT
             ): bool,
         })
     elif backend_type == BACKEND_TYPE_OLLAMA:
-        result = insert_after_key(result, CONF_MAX_TOKENS, {
+        result.update({
             vol.Required(
                 CONF_CONTEXT_LENGTH,
                 description={"suggested_value": options.get(CONF_CONTEXT_LENGTH)},
@@ -1182,4 +1182,53 @@ def local_llama_config_option_schema(hass: HomeAssistant, options: MappingProxyT
             ): NumberSelector(NumberSelectorConfig(min=-1, max=1440, step=1, unit_of_measurement=UnitOfTime.MINUTES, mode=NumberSelectorMode.BOX)),
         })

+    # sort the options
+    global_order = [
+        # general
+        CONF_LLM_HASS_API,
+        CONF_PROMPT,
+        CONF_CONTEXT_LENGTH,
+        CONF_MAX_TOKENS,
+        CONF_OPENAI_API_KEY,
+        CONF_REQUEST_TIMEOUT,
+        # sampling parameters
+        CONF_TEMPERATURE,
+        CONF_TOP_K,
+        CONF_TOP_P,
+        CONF_MIN_P,
+        CONF_TYPICAL_P,
+        # tool calling/reasoning
+        CONF_THINKING_PREFIX,
+        CONF_THINKING_SUFFIX,
+        CONF_TOOL_CALL_PREFIX,
+        CONF_TOOL_CALL_SUFFIX,
+        CONF_MAX_TOOL_CALL_ITERATIONS,
+        CONF_ENABLE_LEGACY_TOOL_CALLING,
+        CONF_USE_GBNF_GRAMMAR,
+        CONF_GBNF_GRAMMAR_FILE,
+        # integration specific options
+        CONF_EXTRA_ATTRIBUTES_TO_EXPOSE,
+        CONF_REFRESH_SYSTEM_PROMPT,
+        CONF_REMEMBER_CONVERSATION,
+        CONF_REMEMBER_NUM_INTERACTIONS,
+        CONF_REMEMBER_CONVERSATION_TIME_MINUTES,
+        CONF_PROMPT_CACHING_ENABLED,
+        CONF_PROMPT_CACHING_INTERVAL,
+        CONF_USE_IN_CONTEXT_LEARNING_EXAMPLES,
+        CONF_IN_CONTEXT_EXAMPLES_FILE,
+        CONF_NUM_IN_CONTEXT_EXAMPLES,
+        # backend specific options
+        CONF_LLAMACPP_BATCH_SIZE,
+        CONF_LLAMACPP_THREAD_COUNT,
+        CONF_LLAMACPP_BATCH_THREAD_COUNT,
+        CONF_LLAMACPP_ENABLE_FLASH_ATTENTION,
+        CONF_TEXT_GEN_WEBUI_ADMIN_KEY,
+        CONF_TEXT_GEN_WEBUI_PRESET,
+        CONF_TEXT_GEN_WEBUI_CHAT_MODE,
+        CONF_OLLAMA_KEEP_ALIVE_MIN,
+        CONF_OLLAMA_JSON_MODE,
+    ]
+
+    result = { k: v for k, v in sorted(result.items(), key=lambda item: global_order.index(item[0]) if item[0] in global_order else 9999) }
+
     return result
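The final re-ordering works because voluptuous markers (`vol.Required` / `vol.Optional`) hash and compare equal to the string key they wrap, so a flat list of key names can drive the sort. A small self-contained sketch of the trick, using made-up option keys:

```python
# Self-contained sketch of the ordering trick; the option keys are made up.
import voluptuous as vol

schema = {
    vol.Required("top_k"): int,
    vol.Optional("prompt"): str,
    vol.Required("max_new_tokens"): int,
}

global_order = ["prompt", "max_new_tokens", "top_k"]

# known keys follow global_order; unknown keys fall to the end (index 9999)
schema = {
    k: v
    for k, v in sorted(
        schema.items(),
        key=lambda item: global_order.index(item[0]) if item[0] in global_order else 9999,
    )
}

print([str(k) for k in schema])  # ['prompt', 'max_new_tokens', 'top_k']
```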
@@ -90,7 +90,7 @@ CONF_CHAT_MODEL = "huggingface_model"
 DEFAULT_CHAT_MODEL = "acon96/Home-3B-v3-GGUF"
 RECOMMENDED_CHAT_MODELS = [ "acon96/Home-3B-v3-GGUF", "acon96/Home-1B-v3-GGUF", "TheBloke/Mistral-7B-Instruct-v0.2-GGUF" ]
 CONF_MAX_TOKENS = "max_new_tokens"
-DEFAULT_MAX_TOKENS = 128
+DEFAULT_MAX_TOKENS = 512
 CONF_TOP_K = "top_k"
 DEFAULT_TOP_K = 40
 CONF_TOP_P = "top_p"
@@ -139,8 +139,8 @@ CONF_TOOL_CALL_SUFFIX = "tool_call_suffix"
 DEFAULT_TOOL_CALL_SUFFIX = "</tool_call>"
 CONF_ENABLE_LEGACY_TOOL_CALLING = "enable_legacy_tool_calling"
 DEFAULT_ENABLE_LEGACY_TOOL_CALLING = False
-CONF_ENABLE_FLASH_ATTENTION = "enable_flash_attention"
-DEFAULT_ENABLE_FLASH_ATTENTION = False
+CONF_LLAMACPP_ENABLE_FLASH_ATTENTION = "enable_flash_attention"
+DEFAULT_LLAMACPP_ENABLE_FLASH_ATTENTION = False
 CONF_USE_GBNF_GRAMMAR = "gbnf_grammar"
 DEFAULT_USE_GBNF_GRAMMAR = False
 CONF_GBNF_GRAMMAR_FILE = "gbnf_grammar_file"
@@ -183,12 +183,12 @@ CONF_GENERIC_OPENAI_VALIDATE_MODEL = "openai_validate_model"
 DEFAULT_GENERIC_OPENAI_VALIDATE_MODEL = True
 CONF_CONTEXT_LENGTH = "context_length"
 DEFAULT_CONTEXT_LENGTH = 2048
-CONF_BATCH_SIZE = "batch_size"
-DEFAULT_BATCH_SIZE = 512
-CONF_THREAD_COUNT = "n_threads"
-DEFAULT_THREAD_COUNT = os.cpu_count()
-CONF_BATCH_THREAD_COUNT = "n_batch_threads"
-DEFAULT_BATCH_THREAD_COUNT = os.cpu_count()
+CONF_LLAMACPP_BATCH_SIZE = "batch_size"
+DEFAULT_LLAMACPP_BATCH_SIZE = 512
+CONF_LLAMACPP_THREAD_COUNT = "n_threads"
+DEFAULT_LLAMACPP_THREAD_COUNT = os.cpu_count()
+CONF_LLAMACPP_BATCH_THREAD_COUNT = "n_batch_threads"
+DEFAULT_LLAMACPP_BATCH_THREAD_COUNT = os.cpu_count()

 DEFAULT_OPTIONS = types.MappingProxyType(
     {
@@ -200,7 +200,7 @@ DEFAULT_OPTIONS = types.MappingProxyType(
         CONF_TYPICAL_P: DEFAULT_TYPICAL_P,
         CONF_TEMPERATURE: DEFAULT_TEMPERATURE,
         CONF_REQUEST_TIMEOUT: DEFAULT_REQUEST_TIMEOUT,
-        CONF_ENABLE_FLASH_ATTENTION: DEFAULT_ENABLE_FLASH_ATTENTION,
+        CONF_LLAMACPP_ENABLE_FLASH_ATTENTION: DEFAULT_LLAMACPP_ENABLE_FLASH_ATTENTION,
         CONF_USE_GBNF_GRAMMAR: DEFAULT_USE_GBNF_GRAMMAR,
         CONF_EXTRA_ATTRIBUTES_TO_EXPOSE: DEFAULT_EXTRA_ATTRIBUTES_TO_EXPOSE,
         CONF_REFRESH_SYSTEM_PROMPT: DEFAULT_REFRESH_SYSTEM_PROMPT,
@@ -210,9 +210,9 @@ DEFAULT_OPTIONS = types.MappingProxyType(
         CONF_IN_CONTEXT_EXAMPLES_FILE: DEFAULT_IN_CONTEXT_EXAMPLES_FILE,
         CONF_NUM_IN_CONTEXT_EXAMPLES: DEFAULT_NUM_IN_CONTEXT_EXAMPLES,
         CONF_CONTEXT_LENGTH: DEFAULT_CONTEXT_LENGTH,
-        CONF_BATCH_SIZE: DEFAULT_BATCH_SIZE,
-        CONF_THREAD_COUNT: DEFAULT_THREAD_COUNT,
-        CONF_BATCH_THREAD_COUNT: DEFAULT_BATCH_THREAD_COUNT,
+        CONF_LLAMACPP_BATCH_SIZE: DEFAULT_LLAMACPP_BATCH_SIZE,
+        CONF_LLAMACPP_THREAD_COUNT: DEFAULT_LLAMACPP_THREAD_COUNT,
+        CONF_LLAMACPP_BATCH_THREAD_COUNT: DEFAULT_LLAMACPP_BATCH_THREAD_COUNT,
         CONF_PROMPT_CACHING_ENABLED: DEFAULT_PROMPT_CACHING_ENABLED,
         CONF_OLLAMA_KEEP_ALIVE_MIN: DEFAULT_OLLAMA_KEEP_ALIVE_MIN,
         CONF_OLLAMA_JSON_MODE: DEFAULT_OLLAMA_JSON_MODE,
@@ -466,7 +466,7 @@ class LocalLLMAgent(ConversationEntity, AbstractConversationAgent):

         return list(domains)

-    def _async_get_exposed_entities(self) -> dict[str, dict]:
+    def _async_get_exposed_entities(self) -> dict[str, dict[str, Any]]:
         """Gather exposed entity states"""
         entity_states: dict[str, dict] = {}
         entity_registry = er.async_get(self.hass)
@@ -40,7 +40,6 @@
         "openai_validate_model": "Validate model exists?",
         "text_generation_webui_admin_key": "Admin Key",
         "text_generation_webui_preset": "Generation Preset/Character Name",
-        "remote_use_chat_endpoint": "Use chat completions endpoint",
         "text_generation_webui_chat_mode": "Chat Mode",
         "selected_language": "Model Language"
       },
@@ -85,7 +84,6 @@
         "in_context_examples_file": "In context learning examples CSV filename",
         "num_in_context_examples": "Number of ICL examples to generate",
         "text_generation_webui_preset": "Generation Preset/Character Name",
-        "remote_use_chat_endpoint": "Use chat completions endpoint",
         "text_generation_webui_chat_mode": "Chat Mode",
         "prompt_caching": "Enable Prompt Caching",
         "prompt_caching_interval": "Prompt Caching fastest refresh interval (sec)",
@@ -104,7 +102,6 @@
         "llm_hass_api": "Select 'Assist' if you want the model to be able to control devices. If you are using the Home-LLM v1, v2, or v3 model then select 'Home-LLM (v1-3)'",
         "prompt": "See [here](https://github.com/acon96/home-llm/blob/develop/docs/Model%20Prompting.md) for more information on model prompting.",
         "in_context_examples": "If you are using a model that is not specifically fine-tuned for use with this integration: enable this",
-        "remote_use_chat_endpoint": "If this is enabled, then the integration will use the chat completion HTTP endpoint instead of the text completion one.",
         "extra_attributes_to_expose": "This is the list of Home Assistant 'attributes' that are exposed to the model. This limits how much information the model is able to see and answer questions on.",
         "gbnf_grammar": "Forces the model to output properly formatted responses. Ensure the file specified below exists in the integration directory.",
         "prompt_caching": "Prompt caching attempts to pre-process the prompt (house state) and cache the processing that needs to be done to understand the prompt. Enabling this will cause the model to re-process the prompt any time an entity state changes in the house, restricted by the interval below."
@@ -127,10 +124,10 @@
         "min_p": "Min P",
         "typical_p": "Typical P",
         "request_timeout": "Remote Request Timeout (seconds)",
-        "ollama_keep_alive": "Keep Alive/Inactivity Timeout (minutes)",
-        "ollama_json_mode": "JSON Output Mode",
+        "ollama_keep_alive": "(ollama) Keep Alive/Inactivity Timeout (minutes)",
+        "ollama_json_mode": "(ollama) JSON Output Mode",
         "extra_attributes_to_expose": "Additional attribute to expose in the context",
-        "enable_flash_attention": "Enable Flash Attention",
+        "enable_flash_attention": "(llama.cpp) Enable Flash Attention",
         "gbnf_grammar": "Enable GBNF Grammar",
         "gbnf_grammar_file": "GBNF Grammar Filename",
         "openai_api_key": "API Key",
@@ -142,15 +139,14 @@
         "in_context_examples": "Enable in context learning (ICL) examples",
         "in_context_examples_file": "In context learning examples CSV filename",
         "num_in_context_examples": "Number of ICL examples to generate",
-        "text_generation_webui_preset": "Generation Preset/Character Name",
-        "remote_use_chat_endpoint": "Use chat completions endpoint",
-        "text_generation_webui_chat_mode": "Chat Mode",
+        "text_generation_webui_preset": "(text-generation-webui) Generation Preset/Character Name",
+        "text_generation_webui_chat_mode": "(text-generation-webui) Chat Mode",
         "prompt_caching": "Enable Prompt Caching",
         "prompt_caching_interval": "Prompt Caching fastest refresh interval (sec)",
         "context_length": "Context Length",
-        "batch_size": "Batch Size",
-        "n_threads": "Thread Count",
-        "n_batch_threads": "Batch Thread Count",
+        "batch_size": "(llama.cpp) Batch Size",
+        "n_threads": "(llama.cpp) Thread Count",
+        "n_batch_threads": "(llama.cpp) Batch Thread Count",
         "thinking_prefix": "Reasoning Content Prefix",
         "thinking_suffix": "Reasoning Content Suffix",
         "tool_call_prefix": "Tool Call Prefix",
@@ -162,7 +158,6 @@
         "llm_hass_api": "Select 'Assist' if you want the model to be able to control devices. If you are using the Home-LLM v1, v2, or v3 model then select 'Home-LLM (v1-3)'",
         "prompt": "See [here](https://github.com/acon96/home-llm/blob/develop/docs/Model%20Prompting.md) for more information on model prompting.",
         "in_context_examples": "If you are using a model that is not specifically fine-tuned for use with this integration: enable this",
-        "remote_use_chat_endpoint": "If this is enabled, then the integration will use the chat completion HTTP endpoint instead of the text completion one.",
         "extra_attributes_to_expose": "This is the list of Home Assistant 'attributes' that are exposed to the model. This limits how much information the model is able to see and answer questions on.",
         "gbnf_grammar": "Forces the model to output properly formatted responses. Ensure the file specified below exists in the integration directory.",
         "prompt_caching": "Prompt caching attempts to pre-process the prompt (house state) and cache the processing that needs to be done to understand the prompt. Enabling this will cause the model to re-process the prompt any time an entity state changes in the house, restricted by the interval below."