Build llama.cpp wheels in forked repo + support reinstallation

Alex O'Connell
2025-10-08 21:19:06 -04:00
parent 286cf9a888
commit 2df454985d
10 changed files with 196 additions and 197 deletions

View File

@@ -1,126 +0,0 @@
name: Create Release
on:
  workflow_dispatch:
    inputs:
      release_notes:
        description: "Release Notes"
        required: true
        type: string
permissions:
  contents: write
jobs:
  build_wheels:
    name: Build wheels for ${{ matrix.arch }} (HA ${{ matrix.home_assistant_image }})
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        include:
          # ARM64
          - home_assistant_image: "aarch64-homeassistant:2025.4.1"
            arch: "aarch64"
          # 32bit ARM (Raspberry pis)
          - home_assistant_image: "armhf-homeassistant:2025.4.1"
            arch: "armhf"
          # x64
          - home_assistant_image: "amd64-homeassistant:2025.4.1"
            arch: "x86_64"
          # 32 bit for older processors
          - home_assistant_image: "i386-homeassistant:2025.4.1"
            arch: "i386"
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Verify version match
        if: startsWith(github.event.ref, 'refs/tags/v')
        run: |
          tag_version=$(echo ${{ github.ref }} | sed 's/refs\/tags\/v//')
          component_version_manifest=$(jq -r '.version' custom_components/llama_conversation/manifest.json)
          component_version_const=$(cat custom_components/llama_conversation/const.py | grep "INTEGRATION_VERSION" | tr -d ' ' | tr -d '"' | tr -d 'INTEGRATION_VERSION=')
          if [ "$tag_version" != "$component_version_manifest" ]; then
            echo "The version in the GitHub tag ($tag_version) does not match the version in the Home Assistant custom component manifest ($component_version_manifest)!"
            exit 1
          fi
          if [ "$tag_version" != "$component_version_const" ]; then
            echo "The version in the GitHub tag ($tag_version) does not match the version in const.py ($component_version_const)!"
            exit 1
          fi
          echo "All required versions match."
      - name: Read llama-cpp-python version
        run: cat custom_components/llama_conversation/const.py | grep "EMBEDDED_LLAMA_CPP_PYTHON_VERSION" | tr -d ' ' | tr -d '"' >> $GITHUB_ENV
      - name: Build artifact
        uses: uraimo/run-on-arch-action@v2
        id: build
        with:
          arch: none
          distro: none
          base_image: homeassistant/${{ matrix.home_assistant_image }}
          # Create an artifacts directory
          setup: |
            mkdir -p "${PWD}/artifacts"
          # Mount the artifacts directory as /artifacts in the container
          dockerRunArgs: |
            --volume "${PWD}/artifacts:/artifacts"
          # The shell to run commands with in the container
          shell: /bin/bash
          # Produce a binary artifact and place it in the mounted volume
          run: |
            apk update
            apk add build-base python3-dev cmake
            pip3 install build
            cd /tmp
            git clone --quiet --recurse-submodules https://github.com/abetlen/llama-cpp-python --branch "v${{ env.EMBEDDED_LLAMA_CPP_PYTHON_VERSION }}"
            cd llama-cpp-python
            tag="homellm"
            sed -i -E "s/^(__version__ *= *\"[0-9]+\.[0-9]+\.[0-9]+)\"/\1+${tag}\"/" llama_cpp/__init__.py
            export CMAKE_ARGS="-DLLAVA_BUILD=OFF -DGGML_NATIVE=OFF -DGGML_CPU_ALL_VARIANTS=ON -DGGML_BACKEND_DL=ON"
            python3 -m build --wheel
            mv ./dist/*.whl /artifacts
            ls -la /artifacts/
      - name: Upload artifacts
        uses: actions/upload-artifact@v4
        with:
          path: ./artifacts/*.whl
          name: artifact_${{ matrix.arch }}
  release:
    name: Create Release
    needs: [ build_wheels ]
    runs-on: ubuntu-latest
    if: startsWith(github.event.ref, 'refs/tags/v')
    steps:
      - name: Download artifacts
        uses: actions/download-artifact@v4
        with:
          path: dist
          merge-multiple: true
      - name: Create GitHub release
        uses: softprops/action-gh-release@v2
        with:
          files: dist/*
          body: ${{ inputs.release_notes }}
          make_latest: true
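
For reference, the `sed` step in the removed workflow appends a PEP 440 local version tag to llama-cpp-python's `__version__` before the wheel is built. A rough Python equivalent of that substitution (illustrative values only; the actual build now lives in the forked llama-cpp-python repository's own CI):

```python
import re

# Roughly what the sed command above does: turn '0.3.16' into '0.3.16+homellm'.
line = '__version__ = "0.3.16"'
tag = "homellm"
patched = re.sub(r'^(__version__ *= *"[0-9]+\.[0-9]+\.[0-9]+)"', rf'\1+{tag}"', line)
print(patched)  # __version__ = "0.3.16+homellm"
```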

View File

@@ -2,6 +2,7 @@
 - [x] proper tool calling support
 - [ ] fix old GGUFs to support tool calling
 - [x] home assistant component text streaming support
+- [x] move llama-cpp build to forked repo + add support for multi backend builds (no more -noavx)
 - [ ] new model based on qwen3 0.6b
 - [ ] new model based on gemma3 270m
 - [ ] support AI task API

View File

@@ -44,7 +44,6 @@ from .const import (
     BACKEND_TYPE_OLLAMA,
     BACKEND_TYPE_LLAMA_EXISTING_OLD,
     BACKEND_TYPE_LLAMA_HF_OLD,
-    EMBEDDED_LLAMA_CPP_PYTHON_VERSION
 )
 from .entity import LocalLLMClient, LocalLLMConfigEntry
 from .backends.llamacpp import LlamaCppClient
@@ -141,7 +140,7 @@ async def async_migrate_entry(hass: HomeAssistant, config_entry: LocalLLMConfigE
     if backend == BACKEND_TYPE_LLAMA_EXISTING_OLD or backend == BACKEND_TYPE_LLAMA_HF_OLD:
         backend = BACKEND_TYPE_LLAMA_CPP
         entry_data[CONF_BACKEND_TYPE] = BACKEND_TYPE_LLAMA_CPP
-        entry_options[CONF_INSTALLED_LLAMACPP_VERSION] = await hass.async_add_executor_job(get_llama_cpp_python_version) or EMBEDDED_LLAMA_CPP_PYTHON_VERSION
+        entry_options[CONF_INSTALLED_LLAMACPP_VERSION] = await hass.async_add_executor_job(get_llama_cpp_python_version)
     else:
         # ensure all remote backends have a path set
         entry_options[CONF_GENERIC_OPENAI_PATH] = entry_options.get(CONF_GENERIC_OPENAI_PATH, "")
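
The migration now records whichever llama-cpp-python version is actually installed instead of falling back to the embedded constant. A minimal sketch of that lookup (the helper name here is hypothetical; the component's version lives in `utils.get_llama_cpp_python_version`):

```python
from importlib.metadata import PackageNotFoundError, version

# Returns the full version string, including any "+<tag>" local suffix,
# or None when llama-cpp-python is not installed.
def installed_llama_cpp_version() -> str | None:
    try:
        return version("llama-cpp-python")
    except PackageNotFoundError:
        return None
```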

View File

@@ -39,6 +39,7 @@ from custom_components.llama_conversation.const import (
     CONF_LLAMACPP_BATCH_SIZE,
     CONF_LLAMACPP_THREAD_COUNT,
     CONF_LLAMACPP_BATCH_THREAD_COUNT,
+    CONF_INSTALLED_LLAMACPP_VERSION,
     DEFAULT_MAX_TOKENS,
     DEFAULT_PROMPT,
     DEFAULT_TEMPERATURE,
@@ -78,6 +79,7 @@ def snapshot_settings(options: dict[str, Any]) -> dict[str, Any]:
         CONF_LLAMACPP_THREAD_COUNT: options.get(CONF_LLAMACPP_THREAD_COUNT, DEFAULT_LLAMACPP_THREAD_COUNT),
         CONF_LLAMACPP_BATCH_THREAD_COUNT: options.get(CONF_LLAMACPP_BATCH_THREAD_COUNT, DEFAULT_LLAMACPP_BATCH_THREAD_COUNT),
         CONF_LLAMACPP_ENABLE_FLASH_ATTENTION: options.get(CONF_LLAMACPP_ENABLE_FLASH_ATTENTION, DEFAULT_LLAMACPP_ENABLE_FLASH_ATTENTION),
+        CONF_INSTALLED_LLAMACPP_VERSION: options.get(CONF_INSTALLED_LLAMACPP_VERSION, ""),
         CONF_GBNF_GRAMMAR_FILE: options.get(CONF_GBNF_GRAMMAR_FILE, DEFAULT_GBNF_GRAMMAR_FILE),
         CONF_PROMPT_CACHING_ENABLED: options.get(CONF_PROMPT_CACHING_ENABLED, DEFAULT_PROMPT_CACHING_ENABLED),
     }
@@ -115,7 +117,7 @@ class LlamaCppClient(LocalLLMClient):
     @staticmethod
     def get_name(client_options: dict[str, Any]):
-        return f"Llama.cpp (llama-cpp-python v{client_options[CONF_INSTALLED_LLAMACPP_VERSION]})"
+        return "Llama.cpp"
 
     async def async_get_available_models(self) -> List[str]:
         return [] # TODO: find available "huggingface_hub" models that have been downloaded
@@ -215,6 +217,11 @@ class LlamaCppClient(LocalLLMClient):
             should_reload = True
         elif loaded_options[CONF_LLAMACPP_ENABLE_FLASH_ATTENTION] != entity_options.get(CONF_LLAMACPP_ENABLE_FLASH_ATTENTION, DEFAULT_LLAMACPP_ENABLE_FLASH_ATTENTION):
             should_reload = True
+        elif loaded_options[CONF_INSTALLED_LLAMACPP_VERSION] != entity_options.get(CONF_INSTALLED_LLAMACPP_VERSION):
+            should_reload = True
+            _LOGGER.debug(f"Reloading llama.cpp...")
+            if self.llama_cpp_module:
+                self.llama_cpp_module = importlib.reload(self.llama_cpp_module)
 
         model_path = entity_options.get(CONF_DOWNLOADED_MODEL_FILE, "")
         model_name = entity_options.get(CONF_CHAT_MODEL, "")
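
The new `elif` branch reloads the already-imported `llama_cpp` module when the recorded version changes, because upgrading the package with pip does not affect module objects that are already loaded. A minimal standalone sketch of that pattern (assumes llama-cpp-python is installed):

```python
import importlib

import llama_cpp  # assumes llama-cpp-python is installed

# After a new wheel is installed into the running interpreter, the module
# imported earlier still points at the old code, so reload it explicitly.
llama_cpp = importlib.reload(llama_cpp)
print(llama_cpp.__version__)  # reports the freshly installed version
```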

View File

@@ -1,6 +1,7 @@
"""Config flow for Local LLM Conversation integration.""" """Config flow for Local LLM Conversation integration."""
from __future__ import annotations from __future__ import annotations
from asyncio import Task
import logging import logging
import os import os
from typing import Any from typing import Any
@@ -39,7 +40,8 @@ from homeassistant.helpers.selector import (
     BooleanSelectorConfig,
 )
 
-from .utils import download_model_from_hf, get_llama_cpp_python_version, install_llama_cpp_python, is_valid_hostname, MissingQuantizationException
+from .utils import download_model_from_hf, get_llama_cpp_python_version, install_llama_cpp_python, \
+    is_valid_hostname, get_available_llama_cpp_versions, MissingQuantizationException
 from .const import (
     CONF_CHAT_MODEL,
     CONF_MAX_TOKENS,
@@ -87,6 +89,7 @@ from .const import (
     CONF_LLAMACPP_BATCH_SIZE,
     CONF_LLAMACPP_THREAD_COUNT,
     CONF_LLAMACPP_BATCH_THREAD_COUNT,
+    CONF_LLAMACPP_REINSTALL,
     DEFAULT_CHAT_MODEL,
     DEFAULT_PORT,
     DEFAULT_SSL,
@@ -258,14 +261,14 @@ class ConfigFlow(BaseConfigFlow, domain=DOMAIN):
         if backend == BACKEND_TYPE_LLAMA_CPP:
             installed_version = await self.hass.async_add_executor_job(get_llama_cpp_python_version)
             _LOGGER.debug(f"installed version: {installed_version}")
-            if installed_version == EMBEDDED_LLAMA_CPP_PYTHON_VERSION:
+            if installed_version and installed_version == EMBEDDED_LLAMA_CPP_PYTHON_VERSION:
                 self.client_config[CONF_INSTALLED_LLAMACPP_VERSION] = installed_version
                 return await self.async_step_finish()
             else:
                 self.internal_step = "install_local_wheels"
                 _LOGGER.debug("Queuing install task...")
 
                 async def install_task():
-                    await self.hass.async_add_executor_job(
+                    return await self.hass.async_add_executor_job(
                         install_llama_cpp_python, self.hass.config.config_dir
                     )
@@ -376,7 +379,7 @@ class ConfigFlow(BaseConfigFlow, domain=DOMAIN):
     @classmethod
     def async_supports_options_flow(cls, config_entry: ConfigEntry) -> bool:
-        return config_entry.data[CONF_BACKEND_TYPE] != BACKEND_TYPE_LLAMA_CPP
+        return True
 
     @staticmethod
     def async_get_options_flow(
@@ -399,6 +402,9 @@ class OptionsFlow(BaseOptionsFlow):
"""Local LLM config flow options handler.""" """Local LLM config flow options handler."""
model_config: dict[str, Any] | None = None model_config: dict[str, Any] | None = None
reinstall_task: Task[Any] | None = None
wheel_install_error: str | None = None
wheel_install_successful: bool = False
async def async_step_init( async def async_step_init(
self, user_input: dict[str, Any] | None = None self, user_input: dict[str, Any] | None = None
@@ -410,32 +416,112 @@ class OptionsFlow(BaseOptionsFlow):
         backend_type = self.config_entry.data.get(CONF_BACKEND_TYPE, DEFAULT_BACKEND_TYPE)
         client_config = dict(self.config_entry.options)
-        if user_input is not None:
-            client_config.update(user_input)
-
-            # validate remote connections
-            connect_err = await BACKEND_TO_CLS[backend_type].async_validate_connection(self.hass, client_config)
-            if not connect_err:
-                return self.async_create_entry(data=client_config)
-            else:
-                errors["base"] = "failed_to_connect"
-                description_placeholders["exception"] = str(connect_err)
-
-        schema = remote_connection_schema(
-            backend_type=backend_type,
-            host=client_config.get(CONF_HOST),
-            port=client_config.get(CONF_PORT),
-            ssl=client_config.get(CONF_SSL),
-            selected_path=client_config.get(CONF_GENERIC_OPENAI_PATH)
-        )
-
-        return self.async_show_form(
-            step_id="init",
-            data_schema=schema,
-            errors=errors,
-            description_placeholders=description_placeholders,
-        )
+        if self.wheel_install_error:
+            _LOGGER.warning("Failed to install wheel: %s", repr(self.wheel_install_error))
+            return self.async_abort(reason="pip_wheel_error")
+
+        if self.wheel_install_successful:
+            client_config[CONF_INSTALLED_LLAMACPP_VERSION] = await self.hass.async_add_executor_job(get_llama_cpp_python_version)
+            _LOGGER.debug(f"new version is: {client_config[CONF_INSTALLED_LLAMACPP_VERSION]}")
+            return self.async_create_entry(data=client_config)
+
+        if backend_type == BACKEND_TYPE_LLAMA_CPP:
+            potential_versions = await get_available_llama_cpp_versions(self.hass)
+
+            schema = vol.Schema({
+                vol.Required(CONF_LLAMACPP_REINSTALL, default=False): BooleanSelector(BooleanSelectorConfig()),
+                vol.Required(CONF_INSTALLED_LLAMACPP_VERSION, default=client_config.get(CONF_INSTALLED_LLAMACPP_VERSION, "not installed")): SelectSelector(
+                    SelectSelectorConfig(
+                        options=[ SelectOptionDict(value=x[0], label=x[0] if not x[1] else f"{x[0]} (local)") for x in potential_versions ],
+                        mode=SelectSelectorMode.DROPDOWN,
+                    )
+                )
+            })
+
+            return self.async_show_form(
+                step_id="reinstall",
+                data_schema=schema,
+            )
+        else:
+            if user_input is not None:
+                client_config.update(user_input)
+
+                # validate remote connections
+                connect_err = await BACKEND_TO_CLS[backend_type].async_validate_connection(self.hass, client_config)
+                if not connect_err:
+                    return self.async_create_entry(data=client_config)
+                else:
+                    errors["base"] = "failed_to_connect"
+                    description_placeholders["exception"] = str(connect_err)
+
+            schema = remote_connection_schema(
+                backend_type=backend_type,
+                host=client_config.get(CONF_HOST),
+                port=client_config.get(CONF_PORT),
+                ssl=client_config.get(CONF_SSL),
+                selected_path=client_config.get(CONF_GENERIC_OPENAI_PATH)
+            )
+
+            return self.async_show_form(
+                step_id="init",
+                data_schema=schema,
+                errors=errors,
+                description_placeholders=description_placeholders,
+            )
+
+    async def async_step_reinstall(self, user_input: dict[str, Any] | None = None) -> ConfigFlowResult:
+        client_config = dict(self.config_entry.options)
+
+        if user_input is not None:
+            if not user_input[CONF_LLAMACPP_REINSTALL]:
+                _LOGGER.debug("Reinstall was not selected, finishing")
+                return self.async_create_entry(data=client_config)
+
+        if not self.reinstall_task:
+            if not user_input:
+                return self.async_abort(reason="unknown")
+
+            desired_version = user_input.get(CONF_INSTALLED_LLAMACPP_VERSION)
+            async def install_task():
+                return await self.hass.async_add_executor_job(
+                    install_llama_cpp_python, self.hass.config.config_dir, True, desired_version
+                )
+            self.reinstall_task = self.hass.async_create_background_task(
+                install_task(), name="llama_cpp_python_installation")
+
+            _LOGGER.debug("Queuing reinstall task...")
+            return self.async_show_progress(
+                progress_task=self.reinstall_task,
+                step_id="reinstall",
+                progress_action="install_local_wheels",
+            )
+
+        if not self.reinstall_task.done():
+            return self.async_show_progress(
+                progress_task=self.reinstall_task,
+                step_id="reinstall",
+                progress_action="install_local_wheels",
+            )
+
+        _LOGGER.debug("done... checking result")
+        install_exception = self.reinstall_task.exception()
+        if install_exception:
+            self.wheel_install_error = repr(install_exception)
+            _LOGGER.debug(f"Hit error: {self.wheel_install_error}")
+            return self.async_show_progress_done(next_step_id="init")
+        else:
+            wheel_install_result = self.reinstall_task.result()
+            if not wheel_install_result:
+                self.wheel_install_error = "Pip returned false"
+                _LOGGER.debug(f"Hit error: {self.wheel_install_error} ({wheel_install_result})")
+                return self.async_show_progress_done(next_step_id="init")
+            else:
+                _LOGGER.debug(f"Finished install: {wheel_install_result}")
+                self.wheel_install_successful = True
+                return self.async_show_progress_done(next_step_id="init")
 
 def STEP_LOCAL_MODEL_SELECTION_DATA_SCHEMA(model_file=None, chat_model=None, downloaded_model_quantization=None, available_quantizations=None):
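
The reinstall form above is driven by the `(version, is_local)` tuples returned by `get_available_llama_cpp_versions`: remote index versions are listed plainly and local wheel files get a `(local)` suffix. A small sketch of how those dropdown labels come out (the tuple values below are made up for illustration):

```python
# Hypothetical return value: remote index versions first, then local wheels.
potential_versions = [
    ("0.3.16+b6153", False),
    ("llama_cpp_python-0.3.16+b6153-cp313-cp313-linux_x86_64.whl", True),
]

labels = [v if not is_local else f"{v} (local)" for v, is_local in potential_versions]
print(labels)
# ['0.3.16+b6153', 'llama_cpp_python-0.3.16+b6153-cp313-cp313-linux_x86_64.whl (local)']
```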

View File

@@ -191,6 +191,7 @@ CONF_LLAMACPP_THREAD_COUNT = "n_threads"
 DEFAULT_LLAMACPP_THREAD_COUNT = os.cpu_count()
 CONF_LLAMACPP_BATCH_THREAD_COUNT = "n_batch_threads"
 DEFAULT_LLAMACPP_BATCH_THREAD_COUNT = os.cpu_count()
+CONF_LLAMACPP_REINSTALL = "reinstall_llama_cpp"
 
 DEFAULT_OPTIONS = types.MappingProxyType(
     {
@@ -318,4 +319,4 @@ OPTIONS_OVERRIDES = {
 # INTEGRATION_VERSION = "0.4.0"
 INTEGRATION_VERSION = "0.3.11"
-EMBEDDED_LLAMA_CPP_PYTHON_VERSION = "0.3.16"
+EMBEDDED_LLAMA_CPP_PYTHON_VERSION = "0.3.16+b6153"
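
Note that the embedded version string now carries a build tag, and the installer in utils.py compares the installed version against it with a plain string comparison, so an existing plain `0.3.16` install no longer matches and is flagged as the wrong version. A trivial sketch of that check:

```python
# Same comparison style as install_llama_cpp_python in utils.py.
EMBEDDED_LLAMA_CPP_PYTHON_VERSION = "0.3.16+b6153"

installed = "0.3.16"
print(installed != EMBEDDED_LLAMA_CPP_PYTHON_VERSION)  # True -> treated as wrong version
```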

View File

@@ -187,13 +187,27 @@
"text_generation_webui_admin_key": "Admin Key", "text_generation_webui_admin_key": "Admin Key",
"text_generation_webui_preset": "Generation Preset/Character Name", "text_generation_webui_preset": "Generation Preset/Character Name",
"text_generation_webui_chat_mode": "Chat Mode" "text_generation_webui_chat_mode": "Chat Mode"
} },
"description": "Please provide the connection details to connect to the API that is hosting the model.",
"title": "Configure Connection"
},
"reinstall": {
"data": {
"reinstall_llama_cpp": "Reinstall Llama.cpp",
"installed_llama_cpp_version": "Version to (re)install"
},
"description": "__If you are experiencing issues with Llama.cpp__, you can force a reinstall of the package here. This will attempt to re-install or upgrade the llama-cpp-python package from GitHub *or* a local wheel file placed in the `/config/custom_components/llama_conversation/` directory.",
"title": "Reinstall Llama.cpp"
} }
}, },
"error": { "error": {
"failed_to_connect": "Failed to connect to the remote API: {exception}", "failed_to_connect": "Failed to connect to the remote API: {exception}",
"invalid_hostname": "The provided hostname was invalid. Please ensure you only provide the domain or IP address and not the full API endpoint.", "invalid_hostname": "The provided hostname was invalid. Please ensure you only provide the domain or IP address and not the full API endpoint.",
"unknown": "Unexpected error" "unknown": "Unexpected error",
"pip_wheel_error": "Pip returned an error while installing the wheel! Please check the Home Assistant logs for more details."
},
"progress": {
"install_local_wheels": "Please wait while Llama.cpp is installed..."
} }
}, },
"selector": { "selector": {

View File

@@ -9,13 +9,14 @@ import multiprocessing
 import voluptuous as vol
 import webcolors
 import json
-from typing import Any, Dict, List, Sequence, cast
+from typing import Any, Dict, List, Sequence, Tuple, cast
 from webcolors import CSS3
 from importlib.metadata import version
 
+from homeassistant.core import HomeAssistant
 from homeassistant.components import conversation
 from homeassistant.helpers import config_validation as cv
-from homeassistant.helpers import intent, llm
+from homeassistant.helpers import intent, llm, aiohttp_client
 from homeassistant.requirements import pip_kwargs
 from homeassistant.util import color
 from homeassistant.util.package import install_package, is_installed
@@ -191,18 +192,11 @@ def validate_llama_cpp_python_installation():
 def get_llama_cpp_python_version():
     if not is_installed("llama-cpp-python"):
         return None
-    return version("llama-cpp-python").split("+")[0]
+    return version("llama-cpp-python")
 
-def install_llama_cpp_python(config_dir: str):
-    installed_wrong_version = False
-    if is_installed("llama-cpp-python"):
-        if version("llama-cpp-python") != EMBEDDED_LLAMA_CPP_PYTHON_VERSION:
-            installed_wrong_version = True
-        else:
-            time.sleep(0.5) # I still don't know why this is required
-            return True
-
+def get_runtime_and_platform_suffix() -> Tuple[str, str]:
+    runtime_version = f"cp{sys.version_info.major}{sys.version_info.minor}"
     platform_suffix = platform.machine()
     # remap other names for architectures to the names we use
     if platform_suffix == "arm64":
@@ -210,42 +204,65 @@ def install_llama_cpp_python(config_dir: str):
if platform_suffix == "i386" or platform_suffix == "amd64": if platform_suffix == "i386" or platform_suffix == "amd64":
platform_suffix = "x86_64" platform_suffix = "x86_64"
runtime_version = f"cp{sys.version_info.major}{sys.version_info.minor}" return runtime_version, platform_suffix
async def get_available_llama_cpp_versions(hass: HomeAssistant) -> List[Tuple[str, bool]]:
github_index_url = "https://acon96.github.io/llama-cpp-python/whl/ha/llama-cpp-python/"
session = aiohttp_client.async_get_clientsession(hass)
try:
async with session.get(github_index_url) as resp:
if resp.status != 200:
raise Exception(f"Failed to fetch available versions from GitHub (HTTP {resp.status})")
text = await resp.text()
# pull version numbers out of h2 tags
versions = re.findall(r"<h2.*>(.+)</h2>", text)
remote = sorted([(v, False) for v in versions], reverse=True)
except Exception as ex:
_LOGGER.warning(f"Error fetching available versions from GitHub: {repr(ex)}")
remote = []
runtime_version, platform_suffix = get_runtime_and_platform_suffix()
folder = os.path.dirname(__file__) folder = os.path.dirname(__file__)
potential_wheels = sorted([ path for path in os.listdir(folder) if path.endswith(f"{platform_suffix}.whl") ], reverse=True) potential_wheels = sorted([ path for path in os.listdir(folder) if path.endswith(f"{platform_suffix}.whl") ], reverse=True)
potential_wheels = [ wheel for wheel in potential_wheels if runtime_version in wheel ] local = [ (wheel, True) for wheel in potential_wheels if runtime_version in wheel and "llama_cpp_python" in wheel]
potential_wheels = [ wheel for wheel in potential_wheels if f"{EMBEDDED_LLAMA_CPP_PYTHON_VERSION}+homellm" in wheel ]
return remote + local
_LOGGER.debug(f"{potential_wheels=}") def install_llama_cpp_python(config_dir: str, force_reinstall: bool = False, specific_version: str | None = None) -> bool:
if len(potential_wheels) > 0:
latest_wheel = potential_wheels[0] installed_wrong_version = False
if is_installed("llama-cpp-python") and not force_reinstall:
_LOGGER.info("Installing llama-cpp-python from local wheel") if version("llama-cpp-python") != EMBEDDED_LLAMA_CPP_PYTHON_VERSION:
_LOGGER.debug(f"Wheel location: {latest_wheel}") installed_wrong_version = True
return install_package(os.path.join(folder, latest_wheel), **pip_kwargs(config_dir)) else:
time.sleep(0.5) # I still don't know why this is required
return True
# scikit-build-core v0.9.7+ doesn't recognize these builds as musllinux, and just tags them as generic linux runtime_version, platform_suffix = get_runtime_and_platform_suffix()
# github_release_url = f"https://github.com/acon96/home-llm/releases/download/v{INTEGRATION_VERSION}/llama_cpp_python-{EMBEDDED_LLAMA_CPP_PYTHON_VERSION}+homellm-{runtime_version}-{runtime_version}-musllinux_1_2_{platform_suffix}.whl"
github_release_url = f"https://github.com/acon96/home-llm/releases/download/v{INTEGRATION_VERSION}/llama_cpp_python-{EMBEDDED_LLAMA_CPP_PYTHON_VERSION}+homellm-{runtime_version}-{runtime_version}-linux_{platform_suffix}.whl" if not specific_version:
if install_package(github_release_url, **pip_kwargs(config_dir)): specific_version = EMBEDDED_LLAMA_CPP_PYTHON_VERSION
_LOGGER.info("llama-cpp-python successfully installed from GitHub release")
if ".whl" in specific_version:
wheel_location = os.path.join(os.path.dirname(__file__), specific_version)
else:
wheel_location = f"https://github.com/acon96/llama-cpp-python/releases/download/{specific_version}/llama_cpp_python-{specific_version}-{runtime_version}-{runtime_version}-linux_{platform_suffix}.whl"
if install_package(wheel_location, **pip_kwargs(config_dir)):
_LOGGER.info("llama-cpp-python successfully installed")
return True return True
# if it is just the wrong version installed then ignore the installation error # if it is just the wrong version installed then ignore the installation error
if not installed_wrong_version: if not installed_wrong_version:
_LOGGER.error( _LOGGER.error(
"Error installing llama-cpp-python. Could not install the binary wheels from GitHub for " + \ "Error installing llama-cpp-python. Could not install the binary wheels from GitHub." + \
f"platform: {platform_suffix}, python version: {sys.version_info.major}.{sys.version_info.minor}. " + \
"Please manually build or download the wheels and place them in the `/config/custom_components/llama_conversation` directory." + \ "Please manually build or download the wheels and place them in the `/config/custom_components/llama_conversation` directory." + \
"Make sure that you download the correct .whl file for your platform and python version from the GitHub releases page." "Make sure that you download the correct .whl file for your platform and python version from the GitHub releases page."
) )
return False return False
else: else:
_LOGGER.info( _LOGGER.info(
"Error installing llama-cpp-python. Could not install the binary wheels from GitHub for " + \ "Error installing llama-cpp-python. Could not install the binary wheels from GitHub." + \
f"platform: {platform_suffix}, python version: {sys.version_info.major}.{sys.version_info.minor}. " + \
f"You already have a version of llama-cpp-python ({version('llama-cpp-python')}) installed, however it may not be compatible!" f"You already have a version of llama-cpp-python ({version('llama-cpp-python')}) installed, however it may not be compatible!"
) )
time.sleep(0.5) # I still don't know why this is required time.sleep(0.5) # I still don't know why this is required
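
For a concrete sense of what `install_llama_cpp_python` now downloads, here is the wheel URL the f-string above produces for one assumed combination (Python 3.13 on x86_64 with the embedded default version); the real code derives these values at runtime via `get_runtime_and_platform_suffix()`:

```python
# Assumed example values; actual values depend on the host machine.
specific_version = "0.3.16+b6153"
runtime_version = "cp313"
platform_suffix = "x86_64"

wheel_location = (
    "https://github.com/acon96/llama-cpp-python/releases/download/"
    f"{specific_version}/llama_cpp_python-{specific_version}-"
    f"{runtime_version}-{runtime_version}-linux_{platform_suffix}.whl"
)
print(wheel_location)
```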

View File

@@ -2,15 +2,15 @@
 # Don't run this. This is executed inside of the home assistant container to build the wheel
 apk update
-apk add build-base python3-dev
+apk add build-base python3-dev linux-headers
+
+tag=$1
 
 cd /tmp
-git clone --recurse-submodules https://github.com/abetlen/llama-cpp-python --branch $1
+git clone --recurse-submodules https://github.com/acon96/llama-cpp-python --branch $tag --depth 1 --shallow-submodules
 cd llama-cpp-python
 pip3 install build
 
-tag="homellm"
-sed -i -E "s/^(__version__ *= *\"[0-9]+\.[0-9]+\.[0-9]+)\"/\1+${tag}\"/" llama_cpp/__init__.py
+sed -i -E "s/^(__version__ *= *\")[^\"]+\"/\1${tag}\"/" llama_cpp/__init__.py
 
 export CMAKE_ARGS="-DLLAVA_BUILD=OFF -DGGML_NATIVE=ON"
 python3 -m build --wheel

View File

@@ -1,6 +1,6 @@
 #!/bin/bash
-VERSION_TO_BUILD="v0.3.16"
+VERSION_TO_BUILD="0.3.16+b6713"
 
 # make python 11 wheels
 # docker run -it --rm \