wire up options to support functiongemma properly

Alex O'Connell
2025-12-20 22:25:03 -05:00
parent 29d839eea8
commit 0e4031ef43
8 changed files with 91 additions and 37 deletions

View File

@@ -28,6 +28,7 @@ from custom_components.llama_conversation.const import (
CONF_REMEMBER_CONVERSATION_TIME_MINUTES,
CONF_GENERIC_OPENAI_PATH,
CONF_ENABLE_LEGACY_TOOL_CALLING,
CONF_TOOL_RESPONSE_AS_STRING,
CONF_RESPONSE_JSON_SCHEMA,
DEFAULT_MAX_TOKENS,
DEFAULT_TEMPERATURE,
@@ -37,6 +38,7 @@ from custom_components.llama_conversation.const import (
DEFAULT_REMEMBER_CONVERSATION_TIME_MINUTES,
DEFAULT_GENERIC_OPENAI_PATH,
DEFAULT_ENABLE_LEGACY_TOOL_CALLING,
DEFAULT_TOOL_RESPONSE_AS_STRING,
RECOMMENDED_CHAT_MODELS,
)
from custom_components.llama_conversation.entity import TextGenerationResult, LocalLLMClient
@@ -126,15 +128,18 @@ class GenericOpenAIAPIClient(LocalLLMClient):
model_name = entity_options[CONF_CHAT_MODEL]
temperature = entity_options.get(CONF_TEMPERATURE, DEFAULT_TEMPERATURE)
top_p = entity_options.get(CONF_TOP_P, DEFAULT_TOP_P)
max_tokens = entity_options.get(CONF_MAX_TOKENS, DEFAULT_MAX_TOKENS)
timeout = entity_options.get(CONF_REQUEST_TIMEOUT, DEFAULT_REQUEST_TIMEOUT)
enable_legacy_tool_calling = entity_options.get(CONF_ENABLE_LEGACY_TOOL_CALLING, DEFAULT_ENABLE_LEGACY_TOOL_CALLING)
tool_response_as_string = entity_options.get(CONF_TOOL_RESPONSE_AS_STRING, DEFAULT_TOOL_RESPONSE_AS_STRING)
endpoint, additional_params = self._chat_completion_params(entity_options)
messages = get_oai_formatted_messages(conversation, user_content_as_list=True)
messages = get_oai_formatted_messages(conversation, user_content_as_list=True, tool_result_to_str=tool_response_as_string)
request_params = {
"model": model_name,
"stream": True,
"max_tokens": max_tokens,
"temperature": temperature,
"top_p": top_p,
"messages": messages
@@ -152,8 +157,6 @@ class GenericOpenAIAPIClient(LocalLLMClient):
}
tools = None
# "legacy" tool calling passes the tools directly as part of the system prompt instead of as "tools"
# most local backends absolutely butcher any sort of prompt formatting when using tool calling
if llm_api and not enable_legacy_tool_calling:
tools = get_oai_formatted_tools(llm_api, self._async_get_all_exposed_domains())
request_params["tools"] = tools
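For reference, here is a minimal sketch (not the integration's actual code) of how the assembled request body differs between native tool calling and the "legacy" mode described in the comment above, where the tools are assumed to already be rendered into the system prompt. The model name and the HassTurnOn tool definition are placeholders.

# Hedged sketch: build an OpenAI-style chat completion payload, omitting "tools"
# when legacy tool calling is enabled (the prompt already contains them).
import json

def build_chat_request(messages, tools, enable_legacy_tool_calling):
    request = {
        "model": "example-model",  # placeholder model name
        "stream": True,
        "max_tokens": 512,
        "temperature": 0.1,
        "top_p": 1.0,
        "messages": messages,
    }
    if tools and not enable_legacy_tool_calling:
        request["tools"] = tools
    return request

example_tool = {"type": "function", "function": {"name": "HassTurnOn", "parameters": {"type": "object", "properties": {}}}}
print(json.dumps(build_chat_request([{"role": "user", "content": "Turn on the kitchen light."}], [example_tool], False), indent=2))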

View File

@@ -19,6 +19,8 @@ from homeassistant.helpers.event import async_track_state_change, async_call_lat
from custom_components.llama_conversation.utils import install_llama_cpp_python, validate_llama_cpp_python_installation, get_oai_formatted_messages, get_oai_formatted_tools
from custom_components.llama_conversation.const import (
CONF_ENABLE_LEGACY_TOOL_CALLING,
CONF_TOOL_RESPONSE_AS_STRING,
CONF_INSTALLED_LLAMACPP_VERSION,
CONF_CHAT_MODEL,
CONF_MAX_TOKENS,
@@ -38,7 +40,10 @@ from custom_components.llama_conversation.const import (
CONF_LLAMACPP_BATCH_SIZE,
CONF_LLAMACPP_THREAD_COUNT,
CONF_LLAMACPP_BATCH_THREAD_COUNT,
CONF_LLAMACPP_CACHE_SIZE_MB,
CONF_INSTALLED_LLAMACPP_VERSION,
DEFAULT_ENABLE_LEGACY_TOOL_CALLING,
DEFAULT_TOOL_RESPONSE_AS_STRING,
DEFAULT_MAX_TOKENS,
DEFAULT_PROMPT,
DEFAULT_TEMPERATURE,
@@ -55,6 +60,7 @@ from custom_components.llama_conversation.const import (
DEFAULT_LLAMACPP_BATCH_SIZE,
DEFAULT_LLAMACPP_THREAD_COUNT,
DEFAULT_LLAMACPP_BATCH_THREAD_COUNT,
DEFAULT_LLAMACPP_CACHE_SIZE_MB,
DOMAIN,
CONF_RESPONSE_JSON_SCHEMA,
)
@@ -84,6 +90,7 @@ def snapshot_settings(options: dict[str, Any]) -> dict[str, Any]:
CONF_LLAMACPP_BATCH_SIZE: options.get(CONF_LLAMACPP_BATCH_SIZE, DEFAULT_LLAMACPP_BATCH_SIZE),
CONF_LLAMACPP_THREAD_COUNT: options.get(CONF_LLAMACPP_THREAD_COUNT, DEFAULT_LLAMACPP_THREAD_COUNT),
CONF_LLAMACPP_BATCH_THREAD_COUNT: options.get(CONF_LLAMACPP_BATCH_THREAD_COUNT, DEFAULT_LLAMACPP_BATCH_THREAD_COUNT),
CONF_LLAMACPP_CACHE_SIZE_MB: options.get(CONF_LLAMACPP_CACHE_SIZE_MB, DEFAULT_LLAMACPP_CACHE_SIZE_MB),
CONF_LLAMACPP_ENABLE_FLASH_ATTENTION: options.get(CONF_LLAMACPP_ENABLE_FLASH_ATTENTION, DEFAULT_LLAMACPP_ENABLE_FLASH_ATTENTION),
CONF_INSTALLED_LLAMACPP_VERSION: options.get(CONF_INSTALLED_LLAMACPP_VERSION, ""),
CONF_GBNF_GRAMMAR_FILE: options.get(CONF_GBNF_GRAMMAR_FILE, DEFAULT_GBNF_GRAMMAR_FILE),
@@ -172,11 +179,13 @@ class LlamaCppClient(LocalLLMClient):
)
_LOGGER.debug("Model loaded")
# FIXME: make cache size configurable (0 means disabled)
llm.set_cache(LlamaDiskCache(
capacity_bytes=int(512 * 10e8),
cache_dir=os.path.join(self.hass.config.media_dirs.get("local", self.hass.config.path("media")), "kv_cache")
))
# create disk cache if enabled
cache_size = model_settings.get(CONF_LLAMACPP_CACHE_SIZE_MB, DEFAULT_LLAMACPP_CACHE_SIZE_MB)
if cache_size > 0:
llm.set_cache(LlamaDiskCache(
capacity_bytes=int(cache_size * (1024 ** 2)),  # cache size option is in MB, so convert MB to bytes
cache_dir=os.path.join(self.hass.config.media_dirs.get("local", self.hass.config.path("media")), "kv_cache")
))
if model_settings[CONF_PROMPT_CACHING_ENABLED]:
@callback
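As a side note on the new cache option, here is a small standalone sketch of the sizing rule it implies (assuming the value is in megabytes, as the option name and UI label state): zero disables the disk cache entirely, anything else becomes a byte capacity. LlamaDiskCache itself requires llama-cpp-python and a writable media directory, so it is not constructed here.

# Hedged sketch of the cache sizing rule only, not the integration's code.
DEFAULT_CACHE_SIZE_MB = 128  # mirrors DEFAULT_LLAMACPP_CACHE_SIZE_MB in const.py

def disk_cache_capacity_bytes(cache_size_mb: int) -> int | None:
    """Return the capacity in bytes, or None when the disk cache is disabled (0)."""
    if cache_size_mb <= 0:
        return None
    return cache_size_mb * 1024 * 1024

assert disk_cache_capacity_bytes(0) is None
assert disk_cache_capacity_bytes(DEFAULT_CACHE_SIZE_MB) == 134217728  # 128 MiB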
@@ -224,6 +233,8 @@ class LlamaCppClient(LocalLLMClient):
should_reload = True
elif loaded_options[CONF_LLAMACPP_ENABLE_FLASH_ATTENTION] != entity_options.get(CONF_LLAMACPP_ENABLE_FLASH_ATTENTION, DEFAULT_LLAMACPP_ENABLE_FLASH_ATTENTION):
should_reload = True
elif loaded_options[CONF_LLAMACPP_CACHE_SIZE_MB] != entity_options.get(CONF_LLAMACPP_CACHE_SIZE_MB, DEFAULT_LLAMACPP_CACHE_SIZE_MB):
should_reload = True
elif loaded_options[CONF_INSTALLED_LLAMACPP_VERSION] != entity_options.get(CONF_INSTALLED_LLAMACPP_VERSION):
should_reload = True
_LOGGER.debug(f"Reloading llama.cpp...")
@@ -437,12 +448,14 @@ class LlamaCppClient(LocalLLMClient):
min_p = entity_options.get(CONF_MIN_P, DEFAULT_MIN_P)
typical_p = entity_options.get(CONF_TYPICAL_P, DEFAULT_TYPICAL_P)
grammar = self.grammars.get(model_name) if entity_options.get(CONF_USE_GBNF_GRAMMAR, DEFAULT_USE_GBNF_GRAMMAR) else None
enable_legacy_tool_calling = entity_options.get(CONF_ENABLE_LEGACY_TOOL_CALLING, DEFAULT_ENABLE_LEGACY_TOOL_CALLING)
tool_response_as_string = entity_options.get(CONF_TOOL_RESPONSE_AS_STRING, DEFAULT_TOOL_RESPONSE_AS_STRING)
_LOGGER.debug(f"Options: {entity_options}")
messages = get_oai_formatted_messages(conversation)
messages = get_oai_formatted_messages(conversation, tool_result_to_str=tool_response_as_string)
tools = None
if llm_api:
if llm_api and not enable_legacy_tool_calling:
tools = get_oai_formatted_tools(llm_api, self._async_get_all_exposed_domains())
_LOGGER.debug(f"Generating completion with {len(messages)} messages and {len(tools) if tools else 0} tools...")

View File

@@ -33,6 +33,7 @@ from custom_components.llama_conversation.const import (
CONF_OLLAMA_JSON_MODE,
CONF_CONTEXT_LENGTH,
CONF_ENABLE_LEGACY_TOOL_CALLING,
CONF_TOOL_RESPONSE_AS_STRING,
CONF_RESPONSE_JSON_SCHEMA,
DEFAULT_MAX_TOKENS,
DEFAULT_TEMPERATURE,
@@ -47,6 +48,7 @@ from custom_components.llama_conversation.const import (
DEFAULT_OLLAMA_JSON_MODE,
DEFAULT_CONTEXT_LENGTH,
DEFAULT_ENABLE_LEGACY_TOOL_CALLING,
DEFAULT_TOOL_RESPONSE_AS_STRING,
)
from custom_components.llama_conversation.entity import LocalLLMClient, TextGenerationResult
@@ -194,7 +196,8 @@ class OllamaAPIClient(LocalLLMClient):
typical_p = entity_options.get(CONF_TYPICAL_P, DEFAULT_TYPICAL_P)
timeout = entity_options.get(CONF_REQUEST_TIMEOUT, DEFAULT_REQUEST_TIMEOUT)
keep_alive = entity_options.get(CONF_OLLAMA_KEEP_ALIVE_MIN, DEFAULT_OLLAMA_KEEP_ALIVE_MIN)
legacy_tool_calling = entity_options.get(CONF_ENABLE_LEGACY_TOOL_CALLING, DEFAULT_ENABLE_LEGACY_TOOL_CALLING)
enable_legacy_tool_calling = entity_options.get(CONF_ENABLE_LEGACY_TOOL_CALLING, DEFAULT_ENABLE_LEGACY_TOOL_CALLING)
tool_response_as_string = entity_options.get(CONF_TOOL_RESPONSE_AS_STRING, DEFAULT_TOOL_RESPONSE_AS_STRING)
think_mode = entity_options.get(CONF_ENABLE_THINK_MODE, DEFAULT_ENABLE_THINK_MODE)
json_mode = entity_options.get(CONF_OLLAMA_JSON_MODE, DEFAULT_OLLAMA_JSON_MODE)
@@ -208,9 +211,9 @@ class OllamaAPIClient(LocalLLMClient):
"min_p": entity_options.get(CONF_MIN_P, DEFAULT_MIN_P),
}
messages = get_oai_formatted_messages(conversation, tool_args_to_str=False)
messages = get_oai_formatted_messages(conversation, tool_args_to_str=False, tool_result_to_str=tool_response_as_string)
tools = None
if llm_api and not legacy_tool_calling:
if llm_api and not enable_legacy_tool_calling:
tools = get_oai_formatted_tools(llm_api, self._async_get_all_exposed_domains())
keep_alive_payload = self._format_keep_alive(keep_alive)
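To illustrate where these options end up, here is a hedged sketch of an Ollama /api/chat payload built from them. The field names follow Ollama's chat API as commonly documented and should be treated as assumptions; the model name and message are placeholders, and the snippet only constructs and prints the dictionary.

# Hedged sketch, not the integration's request code.
import json

payload = {
    "model": "example-model",  # placeholder
    "stream": True,
    "keep_alive": "30m",       # how long Ollama keeps the model loaded after the request
    "think": False,            # think-mode toggle for reasoning models (assumed field name)
    "messages": [{"role": "user", "content": "Turn on the kitchen light."}],
    "options": {
        "num_predict": 512,    # max tokens
        "num_ctx": 2048,       # context length
        "temperature": 0.1,
        "top_p": 1.0,
        "top_k": 40,
        "typical_p": 1.0,
        "min_p": 0.0,
    },
}
# native tool calling would add a "tools" list; JSON mode would set "format": "json"
print(json.dumps(payload, indent=2))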

View File

@@ -70,6 +70,7 @@ from .const import (
CONF_TOOL_CALL_PREFIX,
CONF_TOOL_CALL_SUFFIX,
CONF_ENABLE_LEGACY_TOOL_CALLING,
CONF_TOOL_RESPONSE_AS_STRING,
CONF_LLAMACPP_ENABLE_FLASH_ATTENTION,
CONF_USE_GBNF_GRAMMAR,
CONF_GBNF_GRAMMAR_FILE,
@@ -96,6 +97,7 @@ from .const import (
CONF_LLAMACPP_THREAD_COUNT,
CONF_LLAMACPP_BATCH_THREAD_COUNT,
CONF_LLAMACPP_REINSTALL,
CONF_LLAMACPP_CACHE_SIZE_MB,
DEFAULT_CHAT_MODEL,
DEFAULT_PORT,
DEFAULT_SSL,
@@ -121,6 +123,7 @@ from .const import (
DEFAULT_TOOL_CALL_PREFIX,
DEFAULT_TOOL_CALL_SUFFIX,
DEFAULT_ENABLE_LEGACY_TOOL_CALLING,
DEFAULT_TOOL_RESPONSE_AS_STRING,
DEFAULT_LLAMACPP_ENABLE_FLASH_ATTENTION,
DEFAULT_USE_GBNF_GRAMMAR,
DEFAULT_GBNF_GRAMMAR_FILE,
@@ -142,6 +145,7 @@ from .const import (
DEFAULT_LLAMACPP_BATCH_SIZE,
DEFAULT_LLAMACPP_THREAD_COUNT,
DEFAULT_LLAMACPP_BATCH_THREAD_COUNT,
DEFAULT_LLAMACPP_CACHE_SIZE_MB,
BACKEND_TYPE_LLAMA_CPP,
BACKEND_TYPE_TEXT_GEN_WEBUI,
BACKEND_TYPE_GENERIC_OPENAI,
@@ -621,6 +625,11 @@ def local_llama_config_option_schema(
description={"suggested_value": options.get(CONF_ENABLE_LEGACY_TOOL_CALLING)},
default=DEFAULT_ENABLE_LEGACY_TOOL_CALLING
): bool,
vol.Required(
CONF_TOOL_RESPONSE_AS_STRING,
description={"suggested_value": options.get(CONF_TOOL_RESPONSE_AS_STRING)},
default=DEFAULT_TOOL_RESPONSE_AS_STRING
): bool,
}
if subentry_type == ai_task.DOMAIN:
@@ -727,7 +736,7 @@ def local_llama_config_option_schema(
CONF_PROMPT_CACHING_INTERVAL,
description={"suggested_value": options.get(CONF_PROMPT_CACHING_INTERVAL)},
default=DEFAULT_PROMPT_CACHING_INTERVAL,
): NumberSelector(NumberSelectorConfig(min=1, max=60, step=1)),
): NumberSelector(NumberSelectorConfig(min=1, max=60, step=1))
})
result.update({
vol.Required(
@@ -781,6 +790,11 @@ def local_llama_config_option_schema(
description={"suggested_value": options.get(CONF_LLAMACPP_ENABLE_FLASH_ATTENTION)},
default=DEFAULT_LLAMACPP_ENABLE_FLASH_ATTENTION,
): BooleanSelector(BooleanSelectorConfig()),
vol.Required(
CONF_LLAMACPP_CACHE_SIZE_MB,
description={"suggested_value": options.get(CONF_LLAMACPP_CACHE_SIZE_MB)},
default=DEFAULT_LLAMACPP_CACHE_SIZE_MB,
): NumberSelector(NumberSelectorConfig(min=0, max=1024, step=1)),
vol.Required(
CONF_USE_GBNF_GRAMMAR,
description={"suggested_value": options.get(CONF_USE_GBNF_GRAMMAR)},
@@ -975,6 +989,7 @@ def local_llama_config_option_schema(
CONF_TOOL_CALL_SUFFIX,
CONF_MAX_TOOL_CALL_ITERATIONS,
CONF_ENABLE_LEGACY_TOOL_CALLING,
CONF_TOOL_RESPONSE_AS_STRING,
CONF_USE_GBNF_GRAMMAR,
CONF_GBNF_GRAMMAR_FILE,
# integration specific options
@@ -989,6 +1004,7 @@ def local_llama_config_option_schema(
CONF_IN_CONTEXT_EXAMPLES_FILE,
CONF_NUM_IN_CONTEXT_EXAMPLES,
# backend specific options
CONF_LLAMACPP_CACHE_SIZE_MB,
CONF_LLAMACPP_BATCH_SIZE,
CONF_LLAMACPP_THREAD_COUNT,
CONF_LLAMACPP_BATCH_THREAD_COUNT,
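For the two options added to the schema above, a small runnable sketch shows how voluptuous applies the defaults when a value is missing. Home Assistant's BooleanSelector/NumberSelector are swapped for plain validators here so the snippet runs outside Home Assistant; the default-filling behavior of vol.Required is the same.

# Hedged sketch of the default-filling behavior, not the config flow itself.
import voluptuous as vol

schema = vol.Schema({
    vol.Required("tool_response_as_string", default=True): bool,
    vol.Required("llama_cpp_cache_size_mb", default=128): vol.All(int, vol.Range(min=0, max=1024)),
})

print(schema({}))                              # defaults fill in: True and 128
print(schema({"llama_cpp_cache_size_mb": 0}))  # 0 is allowed and disables the disk cache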

View File

@@ -149,6 +149,8 @@ CONF_TOOL_CALL_SUFFIX = "tool_call_suffix"
DEFAULT_TOOL_CALL_SUFFIX = "</tool_call>"
CONF_ENABLE_LEGACY_TOOL_CALLING = "enable_legacy_tool_calling"
DEFAULT_ENABLE_LEGACY_TOOL_CALLING = False
CONF_TOOL_RESPONSE_AS_STRING = "tool_response_as_string"
DEFAULT_TOOL_RESPONSE_AS_STRING = True
CONF_LLAMACPP_ENABLE_FLASH_ATTENTION = "enable_flash_attention"
DEFAULT_LLAMACPP_ENABLE_FLASH_ATTENTION = False
CONF_USE_GBNF_GRAMMAR = "gbnf_grammar"
@@ -201,6 +203,8 @@ DEFAULT_LLAMACPP_THREAD_COUNT = os.cpu_count()
CONF_LLAMACPP_BATCH_THREAD_COUNT = "n_batch_threads"
DEFAULT_LLAMACPP_BATCH_THREAD_COUNT = os.cpu_count()
CONF_LLAMACPP_REINSTALL = "reinstall_llama_cpp"
CONF_LLAMACPP_CACHE_SIZE_MB = "llama_cpp_cache_size_mb"
DEFAULT_LLAMACPP_CACHE_SIZE_MB = 128
DEFAULT_OPTIONS = types.MappingProxyType(
{

View File

@@ -497,7 +497,6 @@ class LocalLLMClient:
entities_to_expose = self._async_get_exposed_entities()
extra_attributes_to_expose = entity_options.get(CONF_EXTRA_ATTRIBUTES_TO_EXPOSE, DEFAULT_EXTRA_ATTRIBUTES_TO_EXPOSE)
enable_legacy_tool_calling = entity_options.get(CONF_ENABLE_LEGACY_TOOL_CALLING, DEFAULT_ENABLE_LEGACY_TOOL_CALLING)
tool_call_prefix = entity_options.get(CONF_TOOL_CALL_PREFIX, DEFAULT_TOOL_CALL_PREFIX)
tool_call_suffix = entity_options.get(CONF_TOOL_CALL_SUFFIX, DEFAULT_TOOL_CALL_SUFFIX)
@@ -571,21 +570,16 @@ class LocalLLMClient:
"tool_call_suffix": tool_call_suffix,
}
if enable_legacy_tool_calling:
if llm_api:
tools = []
for tool in llm_api.tools:
tools.append(f"{tool.name}({','.join(flatten_vol_schema(tool.parameters))})")
render_variables["tools"] = tools
render_variables["formatted_tools"] = ", ".join(tools)
else:
message = "No tools were provided. If the user requests you interact with a device, tell them you are unable to do so."
render_variables["tools"] = [message]
render_variables["formatted_tools"] = message
if llm_api:
tools = []
for tool in llm_api.tools:
tools.append(f"{tool.name}({','.join(flatten_vol_schema(tool.parameters))})")
render_variables["tools"] = tools
render_variables["formatted_tools"] = ", ".join(tools)
else:
# Tools are passed via the API not the prompt
render_variables["tools"] = []
render_variables["formatted_tools"] = ""
message = "No tools were provided. If the user requests you interact with a device, tell them you are unable to do so."
render_variables["tools"] = [message]
render_variables["formatted_tools"] = message
# only pass examples if there are loaded examples + an API was exposed
if self.in_context_examples and llm_api:
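The legacy path above renders tool signatures into template variables instead of passing them through the API. A short sketch of how a system prompt template might consume those variables (the template text and tool signatures are illustrative, not the integration's default prompt):

# Hedged sketch using jinja2 directly; render_variables mimics the structure built above.
from jinja2 import Template

render_variables = {
    "tools": ["HassTurnOn(name,area,domain)", "HassTurnOff(name,area,domain)"],  # placeholder signatures
    "formatted_tools": "HassTurnOn(name,area,domain), HassTurnOff(name,area,domain)",
}

template = Template(
    "You can call the following tools:\n"
    "{% for tool in tools %}- {{ tool }}\n{% endfor %}"
)
print(template.render(**render_variables))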

View File

@@ -101,11 +101,13 @@
"batch_size": "(llama.cpp) Batch Size",
"n_threads": "(llama.cpp) Thread Count",
"n_batch_threads": "(llama.cpp) Batch Thread Count",
"llama_cpp_cache_size_mb": "(llama.cpp) Disk KV Cache Size (MB)",
"thinking_prefix": "Reasoning Content Prefix",
"thinking_suffix": "Reasoning Content Suffix",
"tool_call_prefix": "Tool Call Prefix",
"tool_call_suffix": "Tool Call Suffix",
"enable_legacy_tool_calling": "Enable Legacy Tool Calling",
"tool_response_as_string": "Tool Response as String",
"max_tool_call_iterations": "Maximum Tool Call Attempts"
},
"data_description": {
@@ -116,6 +118,7 @@
"gbnf_grammar": "Forces the model to output properly formatted responses. Ensure the file specified below exists in the integration directory.",
"prompt_caching": "Prompt caching attempts to pre-process the prompt (house state) and cache the processing that needs to be done to understand the prompt. Enabling this will cause the model to re-process the prompt any time an entity state changes in the house, restricted by the interval below.",
"enable_legacy_tool_calling": "Prefer to process tool calls locally rather than relying on the backend to handle the tool calling format. Can be more reliable, however it requires properly setting the tool call prefix and suffix.",
"tool_response_as_string": "Some prompt templates expect the tool response to be provided as a JSON serialized string, rather than the raw object.",
"max_tool_call_iterations": "Set to 0 to generate the response and tool call in one attempt, without looping (use this for Home models v1-v3)."
},
"description": "Please configure the model according to how it should be prompted. There are many different options and selecting the correct ones for your model is essential to getting optimal performance. See [here](https://github.com/acon96/home-llm/blob/develop/docs/Backend%20Configuration.md) for more information about the options on this page.\n\n**Some defaults may have been chosen for you based on the name of the selected model name or filename.** If you renamed a file or are using a fine-tuning of a supported model, then the defaults may not have been detected.",
@@ -155,11 +158,13 @@
"batch_size": "(llama.cpp) Batch Size",
"n_threads": "(llama.cpp) Thread Count",
"n_batch_threads": "(llama.cpp) Batch Thread Count",
"llama_cpp_cache_size_mb": "(llama.cpp) Disk KV Cache Size (MB)",
"thinking_prefix": "Reasoning Content Prefix",
"thinking_suffix": "Reasoning Content Suffix",
"tool_call_prefix": "Tool Call Prefix",
"tool_call_suffix": "Tool Call Suffix",
"enable_legacy_tool_calling": "Enable Legacy Tool Calling",
"tool_response_as_string": "Tool Response as String",
"max_tool_call_iterations": "Maximum Tool Call Attempts"
},
"data_description": {
@@ -170,6 +175,7 @@
"gbnf_grammar": "Forces the model to output properly formatted responses. Ensure the file specified below exists in the integration directory.",
"prompt_caching": "Prompt caching attempts to pre-process the prompt (house state) and cache the processing that needs to be done to understand the prompt. Enabling this will cause the model to re-process the prompt any time an entity state changes in the house, restricted by the interval below.",
"enable_legacy_tool_calling": "Prefer to process tool calls locally rather than relying on the backend to handle the tool calling format. Can be more reliable, however it requires properly setting the tool call prefix and suffix.",
"tool_response_as_string": "Some prompt templates expect the tool response to be provided as a JSON serialized string, rather than the raw object.",
"max_tool_call_iterations": "Set to 0 to generate the response and tool call in one attempt, without looping (use this for Home models v1-v3)."
},
"description": "Please configure the model according to how it should be prompted. There are many different options and selecting the correct ones for your model is essential to getting optimal performance. See [here](https://github.com/acon96/home-llm/blob/develop/docs/Backend%20Configuration.md) for more information about the options on this page.\n\n**Some defaults may have been chosen for you based on the name of the selected model name or filename.** If you renamed a file or are using a fine-tuning of a supported model, then the defaults may not have been detected.",
@@ -241,11 +247,13 @@
"batch_size": "(llama.cpp) Batch Size",
"n_threads": "(llama.cpp) Thread Count",
"n_batch_threads": "(llama.cpp) Batch Thread Count",
"llama_cpp_cache_size_mb": "(llama.cpp) Disk KV Cache Size (MB)",
"thinking_prefix": "Reasoning Content Prefix",
"thinking_suffix": "Reasoning Content Suffix",
"tool_call_prefix": "Tool Call Prefix",
"tool_call_suffix": "Tool Call Suffix",
"enable_legacy_tool_calling": "Enable Legacy Tool Calling",
"tool_response_as_string": "Tool Response as String",
"max_tool_call_iterations": "Maximum Tool Call Attempts",
"ai_task_extraction_method": "Structured Data Extraction Method",
"ai_task_retries": "Retry attempts for structured data extraction"
@@ -257,6 +265,7 @@
"gbnf_grammar": "Forces the model to output properly formatted responses. Ensure the file specified below exists in the integration directory.",
"prompt_caching": "Prompt caching attempts to pre-process the prompt (house state) and cache the processing that needs to be done to understand the prompt. Enabling this will cause the model to re-process the prompt any time an entity state changes in the house, restricted by the interval below.",
"enable_legacy_tool_calling": "Prefer to process tool calls locally rather than relying on the backend to handle the tool calling format. Can be more reliable, however it requires properly setting the tool call prefix and suffix.",
"tool_response_as_string": "Some prompt templates expect the tool response to be provided as a JSON serialized string, rather than the raw object.",
"max_tool_call_iterations": "Set to 0 to generate the response and tool call in one attempt, without looping (use this for Home models v1-v3).",
"ai_task_extraction_method": "Select the method used to extract structured data from the model's response. 'Structured Output' tells the backend to force the model to produce output following the provided JSON Schema; 'Tool Calling' provides a tool to the model that should be called with the appropriate arguments that match the desired output structure."
},
@@ -297,11 +306,13 @@
"batch_size": "(llama.cpp) Batch Size",
"n_threads": "(llama.cpp) Thread Count",
"n_batch_threads": "(llama.cpp) Batch Thread Count",
"llama_cpp_cache_size_mb": "(llama.cpp) Disk KV Cache Size (MB)",
"thinking_prefix": "Reasoning Content Prefix",
"thinking_suffix": "Reasoning Content Suffix",
"tool_call_prefix": "Tool Call Prefix",
"tool_call_suffix": "Tool Call Suffix",
"enable_legacy_tool_calling": "Enable Legacy Tool Calling",
"tool_response_as_string": "Tool Response as String",
"max_tool_call_iterations": "Maximum Tool Call Attempts"
},
"data_description": {
@@ -312,6 +323,7 @@
"gbnf_grammar": "Forces the model to output properly formatted responses. Ensure the file specified below exists in the integration directory.",
"prompt_caching": "Prompt caching attempts to pre-process the prompt (house state) and cache the processing that needs to be done to understand the prompt. Enabling this will cause the model to re-process the prompt any time an entity state changes in the house, restricted by the interval below.",
"enable_legacy_tool_calling": "Prefer to process tool calls locally rather than relying on the backend to handle the tool calling format. Can be more reliable, however it requires properly setting the tool call prefix and suffix.",
"tool_response_as_string": "Some prompt templates expect the tool response to be provided as a JSON serialized string, rather than the raw object.",
"max_tool_call_iterations": "Set to 0 to generate the response and tool call in one attempt, without looping (use this for Home models v1-v3)."
},
"description": "Please configure the model according to how it should be prompted. There are many different options and selecting the correct ones for your model is essential to getting optimal performance. See [here](https://github.com/acon96/home-llm/blob/develop/docs/Backend%20Configuration.md) for more information about the options on this page.\n\n**Some defaults may have been chosen for you based on the name of the selected model name or filename.** If you renamed a file or are using a fine-tuning of a supported model, then the defaults may not have been detected.",

View File

@@ -278,7 +278,8 @@ def format_url(*, hostname: str, port: str, ssl: bool, path: str):
def get_oai_formatted_tools(llm_api: llm.APIInstance, domains: list[str]) -> List[ChatCompletionTool]:
result: List[ChatCompletionTool] = []
for tool in llm_api.tools:
# sort tools by name to improve cache hits
for tool in sorted(llm_api.tools, key=lambda t: t.name):
# when combining with home assistant llm APIs, it adds a prefix to differentiate tools; compare against the suffix here
if tool.name.endswith(SERVICE_TOOL_NAME):
result.extend([{
@@ -302,7 +303,13 @@ def get_oai_formatted_tools(llm_api: llm.APIInstance, domains: list[str]) -> Lis
return result
def get_oai_formatted_messages(conversation: Sequence[conversation.Content], user_content_as_list: bool = False, tool_args_to_str: bool = True) -> List[ChatCompletionRequestMessage]:
def get_oai_formatted_messages(
conversation: Sequence[conversation.Content],
*,
user_content_as_list: bool = False,
tool_args_to_str: bool = True,
tool_result_to_str: bool = True,
) -> List[ChatCompletionRequestMessage]:
messages: List[ChatCompletionRequestMessage] = []
for message in conversation:
if message.role == "system":
@@ -354,14 +361,16 @@ def get_oai_formatted_messages(conversation: Sequence[conversation.Content], use
]
})
elif message.role == "tool_result":
messages.append({
"role": "tool",
# FIXME: what is the correct format for content here? gemma expects name and result
# "content": json.dumps(message.tool_result),
"content": {
if tool_result_to_str:
content = json.dumps(message.tool_result)
else:
content = {
"name": message.tool_name,
"response": { "result": message.tool_result },
},
}
messages.append({
"role": "tool",
"content": content,
"tool_call_id": message.tool_call_id
})
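Putting the new flag in concrete terms, here is a standalone sketch of the two tool message shapes it selects between: a JSON-serialized string for most chat templates, or the structured name/response object that Gemma-style templates expect. The tool name, call id, and result below are placeholders.

# Hedged sketch, not the integration's code.
import json

tool_name = "HassTurnOn"       # placeholder
tool_call_id = "call_1"        # placeholder
tool_result = {"success": True}

def format_tool_message(tool_result_to_str: bool) -> dict:
    if tool_result_to_str:
        content = json.dumps(tool_result)
    else:
        content = {"name": tool_name, "response": {"result": tool_result}}
    return {"role": "tool", "content": content, "tool_call_id": tool_call_id}

print(format_tool_message(True))   # content is the string '{"success": true}'
print(format_tool_message(False))  # content is a dict with "name" and "response"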