From 2df454985d3feabb3d4898c44812ab0506889496 Mon Sep 17 00:00:00 2001
From: Alex O'Connell
Date: Wed, 8 Oct 2025 21:19:06 -0400
Subject: [PATCH] Build llama.cpp wheels in forked repo + support reinstallation

---
 .github/workflows/create-release.yml          | 126 ----------------
 TODO.md                                       |   1 +
 .../llama_conversation/__init__.py            |   3 +-
 .../llama_conversation/backends/llamacpp.py   |   9 +-
 .../llama_conversation/config_flow.py         | 140 ++++++++++++++----
 custom_components/llama_conversation/const.py |   3 +-
 .../llama_conversation/translations/en.json   |  18 ++-
 custom_components/llama_conversation/utils.py |  81 ++++++----
 scripts/make_wheel.sh                         |  10 +-
 scripts/run_docker_to_make_wheels.sh          |   2 +-
 10 files changed, 196 insertions(+), 197 deletions(-)
 delete mode 100644 .github/workflows/create-release.yml

diff --git a/.github/workflows/create-release.yml b/.github/workflows/create-release.yml
deleted file mode 100644
index a8cab75..0000000
--- a/.github/workflows/create-release.yml
+++ /dev/null
@@ -1,126 +0,0 @@
-name: Create Release
-
-on:
-  workflow_dispatch:
-    inputs:
-      release_notes:
-        description: "Release Notes"
-        required: true
-        type: string
-
-permissions:
-  contents: write
-
-jobs:
-  build_wheels:
-    name: Build wheels for ${{ matrix.arch }} (HA ${{ matrix.home_assistant_image }})
-    runs-on: ubuntu-latest
-    strategy:
-      fail-fast: false
-      matrix:
-        include:
-          # ARM64
-          - home_assistant_image: "aarch64-homeassistant:2025.4.1"
-            arch: "aarch64"
-
-          # 32bit ARM (Raspberry pis)
-          - home_assistant_image: "armhf-homeassistant:2025.4.1"
-            arch: "armhf"
-
-          # x64
-          - home_assistant_image: "amd64-homeassistant:2025.4.1"
-            arch: "x86_64"
-
-          # 32 bit for older processors
-          - home_assistant_image: "i386-homeassistant:2025.4.1"
-            arch: "i386"
-
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-
-      - name: Verify version match
-        if: startsWith(github.event.ref, 'refs/tags/v')
-        run: |
-          tag_version=$(echo ${{ github.ref }} | sed 's/refs\/tags\/v//')
-          component_version_manifest=$(jq -r '.version' custom_components/llama_conversation/manifest.json)
-          component_version_const=$(cat custom_components/llama_conversation/const.py | grep "INTEGRATION_VERSION" | tr -d ' ' | tr -d '"' | tr -d 'INTEGRATION_VERSION=')
-
-          if [ "$tag_version" != "$component_version_manifest" ]; then
-            echo "The version in the GitHub tag ($tag_version) does not match the version in the Home Assistant custom component manifest ($component_version_manifest)!"
-            exit 1
-          fi
-
-          if [ "$tag_version" != "$component_version_const" ]; then
-            echo "The version in the GitHub tag ($tag_version) does not match the version in const.py ($component_version_const)!"
-            exit 1
-          fi
-
-          echo "All required versions match."
-
-      - name: Read llama-cpp-python version
-        run: cat custom_components/llama_conversation/const.py | grep "EMBEDDED_LLAMA_CPP_PYTHON_VERSION" | tr -d ' ' | tr -d '"' >> $GITHUB_ENV
-
-      - name: Build artifact
-        uses: uraimo/run-on-arch-action@v2
-        id: build
-        with:
-          arch: none
-          distro: none
-          base_image: homeassistant/${{ matrix.home_assistant_image }}
-
-          # Create an artifacts directory
-          setup: |
-            mkdir -p "${PWD}/artifacts"
-
-          # Mount the artifacts directory as /artifacts in the container
-          dockerRunArgs: |
-            --volume "${PWD}/artifacts:/artifacts"
-
-          # The shell to run commands with in the container
-          shell: /bin/bash
-
-          # Produce a binary artifact and place it in the mounted volume
-          run: |
-            apk update
-            apk add build-base python3-dev cmake
-            pip3 install build
-
-            cd /tmp
-            git clone --quiet --recurse-submodules https://github.com/abetlen/llama-cpp-python --branch "v${{ env.EMBEDDED_LLAMA_CPP_PYTHON_VERSION }}"
-            cd llama-cpp-python
-
-            tag="homellm"
-            sed -i -E "s/^(__version__ *= *\"[0-9]+\.[0-9]+\.[0-9]+)\"/\1+${tag}\"/" llama_cpp/__init__.py
-
-            export CMAKE_ARGS="-DLLAVA_BUILD=OFF -DGGML_NATIVE=OFF -DGGML_CPU_ALL_VARIANTS=ON -DGGML_BACKEND_DL=ON"
-            python3 -m build --wheel
-
-            mv ./dist/*.whl /artifacts
-            ls -la /artifacts/
-
-      - name: Upload artifacts
-        uses: actions/upload-artifact@v4
-        with:
-          path: ./artifacts/*.whl
-          name: artifact_${{ matrix.arch }}
-
-  release:
-    name: Create Release
-    needs: [ build_wheels ]
-    runs-on: ubuntu-latest
-    if: startsWith(github.event.ref, 'refs/tags/v')
-
-    steps:
-      - name: Download artifacts
-        uses: actions/download-artifact@v4
-        with:
-          path: dist
-          merge-multiple: true
-
-      - name: Create GitHub release
-        uses: softprops/action-gh-release@v2
-        with:
-          files: dist/*
-          body: ${{ inputs.release_notes }}
-          make_latest: true
\ No newline at end of file
diff --git a/TODO.md b/TODO.md
index c421433..508f1db 100644
--- a/TODO.md
+++ b/TODO.md
@@ -2,6 +2,7 @@
 - [x] proper tool calling support
 - [ ] fix old GGUFs to support tool calling
 - [x] home assistant component text streaming support
+- [x] move llama-cpp build to forked repo + add support for multi backend builds (no more -noavx)
 - [ ] new model based on qwen3 0.6b
 - [ ] new model based on gemma3 270m
 - [ ] support AI task API
diff --git a/custom_components/llama_conversation/__init__.py b/custom_components/llama_conversation/__init__.py
index 8d2cf81..ef4eaf6 100644
--- a/custom_components/llama_conversation/__init__.py
+++ b/custom_components/llama_conversation/__init__.py
@@ -44,7 +44,6 @@ from .const import (
     BACKEND_TYPE_OLLAMA,
     BACKEND_TYPE_LLAMA_EXISTING_OLD,
     BACKEND_TYPE_LLAMA_HF_OLD,
-    EMBEDDED_LLAMA_CPP_PYTHON_VERSION
 )
 from .entity import LocalLLMClient, LocalLLMConfigEntry
 from .backends.llamacpp import LlamaCppClient
@@ -141,7 +140,7 @@ async def async_migrate_entry(hass: HomeAssistant, config_entry: LocalLLMConfigE
     if backend == BACKEND_TYPE_LLAMA_EXISTING_OLD or backend == BACKEND_TYPE_LLAMA_HF_OLD:
         backend = BACKEND_TYPE_LLAMA_CPP
         entry_data[CONF_BACKEND_TYPE] = BACKEND_TYPE_LLAMA_CPP
-        entry_options[CONF_INSTALLED_LLAMACPP_VERSION] = await hass.async_add_executor_job(get_llama_cpp_python_version) or EMBEDDED_LLAMA_CPP_PYTHON_VERSION
+        entry_options[CONF_INSTALLED_LLAMACPP_VERSION] = await hass.async_add_executor_job(get_llama_cpp_python_version)
     else:
         # ensure all remote backends have a path set
         entry_options[CONF_GENERIC_OPENAI_PATH] = entry_options.get(CONF_GENERIC_OPENAI_PATH, "")
diff --git a/custom_components/llama_conversation/backends/llamacpp.py b/custom_components/llama_conversation/backends/llamacpp.py
index 0fcb3a5..68b1902 100644
--- a/custom_components/llama_conversation/backends/llamacpp.py
+++ b/custom_components/llama_conversation/backends/llamacpp.py
@@ -39,6 +39,7 @@ from custom_components.llama_conversation.const import (
     CONF_LLAMACPP_BATCH_SIZE,
     CONF_LLAMACPP_THREAD_COUNT,
     CONF_LLAMACPP_BATCH_THREAD_COUNT,
+    CONF_INSTALLED_LLAMACPP_VERSION,
     DEFAULT_MAX_TOKENS,
     DEFAULT_PROMPT,
     DEFAULT_TEMPERATURE,
@@ -78,6 +79,7 @@ def snapshot_settings(options: dict[str, Any]) -> dict[str, Any]:
         CONF_LLAMACPP_THREAD_COUNT: options.get(CONF_LLAMACPP_THREAD_COUNT, DEFAULT_LLAMACPP_THREAD_COUNT),
         CONF_LLAMACPP_BATCH_THREAD_COUNT: options.get(CONF_LLAMACPP_BATCH_THREAD_COUNT, DEFAULT_LLAMACPP_BATCH_THREAD_COUNT),
         CONF_LLAMACPP_ENABLE_FLASH_ATTENTION: options.get(CONF_LLAMACPP_ENABLE_FLASH_ATTENTION, DEFAULT_LLAMACPP_ENABLE_FLASH_ATTENTION),
+        CONF_INSTALLED_LLAMACPP_VERSION: options.get(CONF_INSTALLED_LLAMACPP_VERSION, ""),
         CONF_GBNF_GRAMMAR_FILE: options.get(CONF_GBNF_GRAMMAR_FILE, DEFAULT_GBNF_GRAMMAR_FILE),
         CONF_PROMPT_CACHING_ENABLED: options.get(CONF_PROMPT_CACHING_ENABLED, DEFAULT_PROMPT_CACHING_ENABLED),
     }
@@ -115,7 +117,7 @@ class LlamaCppClient(LocalLLMClient):
 
     @staticmethod
     def get_name(client_options: dict[str, Any]):
-        return f"Llama.cpp (llama-cpp-python v{client_options[CONF_INSTALLED_LLAMACPP_VERSION]})"
+        return "Llama.cpp"
 
     async def async_get_available_models(self) -> List[str]:
         return [] # TODO: find available "huggingface_hub" models that have been downloaded
@@ -215,6 +217,11 @@ class LlamaCppClient(LocalLLMClient):
             should_reload = True
         elif loaded_options[CONF_LLAMACPP_ENABLE_FLASH_ATTENTION] != entity_options.get(CONF_LLAMACPP_ENABLE_FLASH_ATTENTION, DEFAULT_LLAMACPP_ENABLE_FLASH_ATTENTION):
             should_reload = True
+        elif loaded_options[CONF_INSTALLED_LLAMACPP_VERSION] != entity_options.get(CONF_INSTALLED_LLAMACPP_VERSION):
+            should_reload = True
+            _LOGGER.debug(f"Reloading llama.cpp...")
+            if self.llama_cpp_module:
+                self.llama_cpp_module = importlib.reload(self.llama_cpp_module)
 
         model_path = entity_options.get(CONF_DOWNLOADED_MODEL_FILE, "")
         model_name = entity_options.get(CONF_CHAT_MODEL, "")
diff --git a/custom_components/llama_conversation/config_flow.py b/custom_components/llama_conversation/config_flow.py
index d456f4e..b2baf19 100644
--- a/custom_components/llama_conversation/config_flow.py
+++ b/custom_components/llama_conversation/config_flow.py
@@ -1,6 +1,7 @@
 """Config flow for Local LLM Conversation integration."""
 from __future__ import annotations
 
+from asyncio import Task
 import logging
 import os
 from typing import Any
@@ -39,7 +40,8 @@ from homeassistant.helpers.selector import (
     BooleanSelectorConfig,
 )
 
-from .utils import download_model_from_hf, get_llama_cpp_python_version, install_llama_cpp_python, is_valid_hostname, MissingQuantizationException
+from .utils import download_model_from_hf, get_llama_cpp_python_version, install_llama_cpp_python, \
+    is_valid_hostname, get_available_llama_cpp_versions, MissingQuantizationException
 from .const import (
     CONF_CHAT_MODEL,
     CONF_MAX_TOKENS,
@@ -87,6 +89,7 @@ from .const import (
     CONF_LLAMACPP_BATCH_SIZE,
     CONF_LLAMACPP_THREAD_COUNT,
     CONF_LLAMACPP_BATCH_THREAD_COUNT,
+    CONF_LLAMACPP_REINSTALL,
     DEFAULT_CHAT_MODEL,
     DEFAULT_PORT,
     DEFAULT_SSL,
@@ -258,14 +261,14 @@ class ConfigFlow(BaseConfigFlow, domain=DOMAIN):
         if backend == BACKEND_TYPE_LLAMA_CPP:
             installed_version = await self.hass.async_add_executor_job(get_llama_cpp_python_version)
             _LOGGER.debug(f"installed version: {installed_version}")
-            if installed_version == EMBEDDED_LLAMA_CPP_PYTHON_VERSION:
+            if installed_version and installed_version == EMBEDDED_LLAMA_CPP_PYTHON_VERSION:
                 self.client_config[CONF_INSTALLED_LLAMACPP_VERSION] = installed_version
                 return await self.async_step_finish()
             else:
                 self.internal_step = "install_local_wheels"
                 _LOGGER.debug("Queuing install task...")
                 async def install_task():
-                    await self.hass.async_add_executor_job(
+                    return await self.hass.async_add_executor_job(
                         install_llama_cpp_python, self.hass.config.config_dir
                     )
 
@@ -376,7 +379,7 @@ class ConfigFlow(BaseConfigFlow, domain=DOMAIN):
 
     @classmethod
     def async_supports_options_flow(cls, config_entry: ConfigEntry) -> bool:
-        return config_entry.data[CONF_BACKEND_TYPE] != BACKEND_TYPE_LLAMA_CPP
+        return True
 
     @staticmethod
     def async_get_options_flow(
@@ -399,6 +402,9 @@ class OptionsFlow(BaseOptionsFlow):
     """Local LLM config flow options handler."""
 
     model_config: dict[str, Any] | None = None
+    reinstall_task: Task[Any] | None = None
+    wheel_install_error: str | None = None
+    wheel_install_successful: bool = False
 
     async def async_step_init(
         self, user_input: dict[str, Any] | None = None
@@ -410,32 +416,112 @@ class OptionsFlow(BaseOptionsFlow):
         backend_type = self.config_entry.data.get(CONF_BACKEND_TYPE, DEFAULT_BACKEND_TYPE)
         client_config = dict(self.config_entry.options)
 
+        if self.wheel_install_error:
+            _LOGGER.warning("Failed to install wheel: %s", repr(self.wheel_install_error))
+            return self.async_abort(reason="pip_wheel_error")
+
+        if self.wheel_install_successful:
+            client_config[CONF_INSTALLED_LLAMACPP_VERSION] = await self.hass.async_add_executor_job(get_llama_cpp_python_version)
+            _LOGGER.debug(f"new version is: {client_config[CONF_INSTALLED_LLAMACPP_VERSION]}")
+            return self.async_create_entry(data=client_config)
+
+        if backend_type == BACKEND_TYPE_LLAMA_CPP:
+            potential_versions = await get_available_llama_cpp_versions(self.hass)
+
+            schema = vol.Schema({
+                vol.Required(CONF_LLAMACPP_REINSTALL, default=False): BooleanSelector(BooleanSelectorConfig()),
+                vol.Required(CONF_INSTALLED_LLAMACPP_VERSION, default=client_config.get(CONF_INSTALLED_LLAMACPP_VERSION, "not installed")): SelectSelector(
+                    SelectSelectorConfig(
+                        options=[ SelectOptionDict(value=x[0], label=x[0] if not x[1] else f"{x[0]} (local)") for x in potential_versions ],
+                        mode=SelectSelectorMode.DROPDOWN,
+                    )
+                )
+            })
+
+            return self.async_show_form(
+                step_id="reinstall",
+                data_schema=schema,
+            )
+        else:
+
+            if user_input is not None:
+                client_config.update(user_input)
+
+                # validate remote connections
+                connect_err = await BACKEND_TO_CLS[backend_type].async_validate_connection(self.hass, client_config)
+
+                if not connect_err:
+                    return self.async_create_entry(data=client_config)
+                else:
+                    errors["base"] = "failed_to_connect"
+                    description_placeholders["exception"] = str(connect_err)
+
+            schema = remote_connection_schema(
+                backend_type=backend_type,
+                host=client_config.get(CONF_HOST),
+                port=client_config.get(CONF_PORT),
+                ssl=client_config.get(CONF_SSL),
+                selected_path=client_config.get(CONF_GENERIC_OPENAI_PATH)
+            )
+
+            return self.async_show_form(
+                step_id="init",
+                data_schema=schema,
+                errors=errors,
+                description_placeholders=description_placeholders,
+            )
+
+    async def async_step_reinstall(self, user_input: dict[str, Any] | None = None) -> ConfigFlowResult:
+        client_config = dict(self.config_entry.options)
+
         if user_input is not None:
-            client_config.update(user_input)
-
-            # validate remote connections
-            connect_err = await BACKEND_TO_CLS[backend_type].async_validate_connection(self.hass, client_config)
-
-            if not connect_err:
+            if not user_input[CONF_LLAMACPP_REINSTALL]:
+                _LOGGER.debug("Reinstall was not selected, finishing")
                 return self.async_create_entry(data=client_config)
+
+        if not self.reinstall_task:
+            if not user_input:
+                return self.async_abort(reason="unknown")
+
+            desired_version = user_input.get(CONF_INSTALLED_LLAMACPP_VERSION)
+            async def install_task():
+                return await self.hass.async_add_executor_job(
+                    install_llama_cpp_python, self.hass.config.config_dir, True, desired_version
+                )
+
+            self.reinstall_task = self.hass.async_create_background_task(
+                install_task(), name="llama_cpp_python_installation")
+
+            _LOGGER.debug("Queuing reinstall task...")
+            return self.async_show_progress(
+                progress_task=self.reinstall_task,
+                step_id="reinstall",
+                progress_action="install_local_wheels",
+            )
+
+        if not self.reinstall_task.done():
+            return self.async_show_progress(
+                progress_task=self.reinstall_task,
+                step_id="reinstall",
+                progress_action="install_local_wheels",
+            )
+
+        _LOGGER.debug("done... checking result")
+        install_exception = self.reinstall_task.exception()
+        if install_exception:
+            self.wheel_install_error = repr(install_exception)
+            _LOGGER.debug(f"Hit error: {self.wheel_install_error}")
+            return self.async_show_progress_done(next_step_id="init")
+        else:
+            wheel_install_result = self.reinstall_task.result()
+            if not wheel_install_result:
+                self.wheel_install_error = "Pip returned false"
+                _LOGGER.debug(f"Hit error: {self.wheel_install_error} ({wheel_install_result})")
+                return self.async_show_progress_done(next_step_id="init")
             else:
-                errors["base"] = "failed_to_connect"
-                description_placeholders["exception"] = str(connect_err)
-
-        schema = remote_connection_schema(
-            backend_type=backend_type,
-            host=client_config.get(CONF_HOST),
-            port=client_config.get(CONF_PORT),
-            ssl=client_config.get(CONF_SSL),
-            selected_path=client_config.get(CONF_GENERIC_OPENAI_PATH)
-        )
-
-        return self.async_show_form(
-            step_id="init",
-            data_schema=schema,
-            errors=errors,
-            description_placeholders=description_placeholders,
-        )
+                _LOGGER.debug(f"Finished install: {wheel_install_result}")
+                self.wheel_install_successful = True
+                return self.async_show_progress_done(next_step_id="init")
 
 
 def STEP_LOCAL_MODEL_SELECTION_DATA_SCHEMA(model_file=None, chat_model=None, downloaded_model_quantization=None, available_quantizations=None):
diff --git a/custom_components/llama_conversation/const.py b/custom_components/llama_conversation/const.py
index 42cf340..9029b94 100644
--- a/custom_components/llama_conversation/const.py
+++ b/custom_components/llama_conversation/const.py
@@ -191,6 +191,7 @@ CONF_LLAMACPP_THREAD_COUNT = "n_threads"
 DEFAULT_LLAMACPP_THREAD_COUNT = os.cpu_count()
 CONF_LLAMACPP_BATCH_THREAD_COUNT = "n_batch_threads"
 DEFAULT_LLAMACPP_BATCH_THREAD_COUNT = os.cpu_count()
+CONF_LLAMACPP_REINSTALL = "reinstall_llama_cpp"
 
 DEFAULT_OPTIONS = types.MappingProxyType(
     {
@@ -318,4 +319,4 @@ OPTIONS_OVERRIDES = {
 
 # INTEGRATION_VERSION = "0.4.0"
 INTEGRATION_VERSION = "0.3.11"
-EMBEDDED_LLAMA_CPP_PYTHON_VERSION = "0.3.16"
+EMBEDDED_LLAMA_CPP_PYTHON_VERSION = "0.3.16+b6153"
diff --git a/custom_components/llama_conversation/translations/en.json b/custom_components/llama_conversation/translations/en.json
index c1ab991..4c3f1da 100644
--- a/custom_components/llama_conversation/translations/en.json
+++ b/custom_components/llama_conversation/translations/en.json
@@ -187,13 +187,27 @@
           "text_generation_webui_admin_key": "Admin Key",
"Admin Key", "text_generation_webui_preset": "Generation Preset/Character Name", "text_generation_webui_chat_mode": "Chat Mode" - } + }, + "description": "Please provide the connection details to connect to the API that is hosting the model.", + "title": "Configure Connection" + }, + "reinstall": { + "data": { + "reinstall_llama_cpp": "Reinstall Llama.cpp", + "installed_llama_cpp_version": "Version to (re)install" + }, + "description": "__If you are experiencing issues with Llama.cpp__, you can force a reinstall of the package here. This will attempt to re-install or upgrade the llama-cpp-python package from GitHub *or* a local wheel file placed in the `/config/custom_components/llama_conversation/` directory.", + "title": "Reinstall Llama.cpp" } }, "error": { "failed_to_connect": "Failed to connect to the remote API: {exception}", "invalid_hostname": "The provided hostname was invalid. Please ensure you only provide the domain or IP address and not the full API endpoint.", - "unknown": "Unexpected error" + "unknown": "Unexpected error", + "pip_wheel_error": "Pip returned an error while installing the wheel! Please check the Home Assistant logs for more details." + }, + "progress": { + "install_local_wheels": "Please wait while Llama.cpp is installed..." } }, "selector": { diff --git a/custom_components/llama_conversation/utils.py b/custom_components/llama_conversation/utils.py index f9dd084..bc1cf31 100644 --- a/custom_components/llama_conversation/utils.py +++ b/custom_components/llama_conversation/utils.py @@ -9,13 +9,14 @@ import multiprocessing import voluptuous as vol import webcolors import json -from typing import Any, Dict, List, Sequence, cast +from typing import Any, Dict, List, Sequence, Tuple, cast from webcolors import CSS3 from importlib.metadata import version +from homeassistant.core import HomeAssistant from homeassistant.components import conversation from homeassistant.helpers import config_validation as cv -from homeassistant.helpers import intent, llm +from homeassistant.helpers import intent, llm, aiohttp_client from homeassistant.requirements import pip_kwargs from homeassistant.util import color from homeassistant.util.package import install_package, is_installed @@ -191,18 +192,11 @@ def validate_llama_cpp_python_installation(): def get_llama_cpp_python_version(): if not is_installed("llama-cpp-python"): return None - return version("llama-cpp-python").split("+")[0] + return version("llama-cpp-python") -def install_llama_cpp_python(config_dir: str): +def get_runtime_and_platform_suffix() -> Tuple[str, str]: + runtime_version = f"cp{sys.version_info.major}{sys.version_info.minor}" - installed_wrong_version = False - if is_installed("llama-cpp-python"): - if version("llama-cpp-python") != EMBEDDED_LLAMA_CPP_PYTHON_VERSION: - installed_wrong_version = True - else: - time.sleep(0.5) # I still don't know why this is required - return True - platform_suffix = platform.machine() # remap other names for architectures to the names we use if platform_suffix == "arm64": @@ -210,42 +204,65 @@ def install_llama_cpp_python(config_dir: str): if platform_suffix == "i386" or platform_suffix == "amd64": platform_suffix = "x86_64" - runtime_version = f"cp{sys.version_info.major}{sys.version_info.minor}" - + return runtime_version, platform_suffix + +async def get_available_llama_cpp_versions(hass: HomeAssistant) -> List[Tuple[str, bool]]: + github_index_url = "https://acon96.github.io/llama-cpp-python/whl/ha/llama-cpp-python/" + session = aiohttp_client.async_get_clientsession(hass) 
+    try:
+        async with session.get(github_index_url) as resp:
+            if resp.status != 200:
+                raise Exception(f"Failed to fetch available versions from GitHub (HTTP {resp.status})")
+            text = await resp.text()
+            # pull version numbers out of h2 tags
+            versions = re.findall(r"(.+)", text)
+            remote = sorted([(v, False) for v in versions], reverse=True)
+    except Exception as ex:
+        _LOGGER.warning(f"Error fetching available versions from GitHub: {repr(ex)}")
+        remote = []
+
+    runtime_version, platform_suffix = get_runtime_and_platform_suffix()
     folder = os.path.dirname(__file__)
     potential_wheels = sorted([ path for path in os.listdir(folder) if path.endswith(f"{platform_suffix}.whl") ], reverse=True)
-    potential_wheels = [ wheel for wheel in potential_wheels if runtime_version in wheel ]
-    potential_wheels = [ wheel for wheel in potential_wheels if f"{EMBEDDED_LLAMA_CPP_PYTHON_VERSION}+homellm" in wheel ]
+    local = [ (wheel, True) for wheel in potential_wheels if runtime_version in wheel and "llama_cpp_python" in wheel]
+
+    return remote + local
 
-    _LOGGER.debug(f"{potential_wheels=}")
-    if len(potential_wheels) > 0:
+def install_llama_cpp_python(config_dir: str, force_reinstall: bool = False, specific_version: str | None = None) -> bool:
 
-        latest_wheel = potential_wheels[0]
-
-        _LOGGER.info("Installing llama-cpp-python from local wheel")
-        _LOGGER.debug(f"Wheel location: {latest_wheel}")
-        return install_package(os.path.join(folder, latest_wheel), **pip_kwargs(config_dir))
+    installed_wrong_version = False
+    if is_installed("llama-cpp-python") and not force_reinstall:
+        if version("llama-cpp-python") != EMBEDDED_LLAMA_CPP_PYTHON_VERSION:
+            installed_wrong_version = True
+        else:
+            time.sleep(0.5) # I still don't know why this is required
+            return True
 
-    # scikit-build-core v0.9.7+ doesn't recognize these builds as musllinux, and just tags them as generic linux
-    # github_release_url = f"https://github.com/acon96/home-llm/releases/download/v{INTEGRATION_VERSION}/llama_cpp_python-{EMBEDDED_LLAMA_CPP_PYTHON_VERSION}+homellm-{runtime_version}-{runtime_version}-musllinux_1_2_{platform_suffix}.whl"
-    github_release_url = f"https://github.com/acon96/home-llm/releases/download/v{INTEGRATION_VERSION}/llama_cpp_python-{EMBEDDED_LLAMA_CPP_PYTHON_VERSION}+homellm-{runtime_version}-{runtime_version}-linux_{platform_suffix}.whl"
-    if install_package(github_release_url, **pip_kwargs(config_dir)):
-        _LOGGER.info("llama-cpp-python successfully installed from GitHub release")
+    runtime_version, platform_suffix = get_runtime_and_platform_suffix()
+
+    if not specific_version:
+        specific_version = EMBEDDED_LLAMA_CPP_PYTHON_VERSION
+
+    if ".whl" in specific_version:
+        wheel_location = os.path.join(os.path.dirname(__file__), specific_version)
+    else:
+        wheel_location = f"https://github.com/acon96/llama-cpp-python/releases/download/{specific_version}/llama_cpp_python-{specific_version}-{runtime_version}-{runtime_version}-linux_{platform_suffix}.whl"
+
+    if install_package(wheel_location, **pip_kwargs(config_dir)):
+        _LOGGER.info("llama-cpp-python successfully installed")
         return True
 
     # if it is just the wrong version installed then ignore the installation error
     if not installed_wrong_version:
         _LOGGER.error(
-            "Error installing llama-cpp-python. Could not install the binary wheels from GitHub for " + \
-            f"platform: {platform_suffix}, python version: {sys.version_info.major}.{sys.version_info.minor}. " + \
+            "Error installing llama-cpp-python. Could not install the binary wheels from GitHub." + \
             "Please manually build or download the wheels and place them in the `/config/custom_components/llama_conversation` directory." + \
             "Make sure that you download the correct .whl file for your platform and python version from the GitHub releases page."
         )
         return False
     else:
         _LOGGER.info(
-            "Error installing llama-cpp-python. Could not install the binary wheels from GitHub for " + \
-            f"platform: {platform_suffix}, python version: {sys.version_info.major}.{sys.version_info.minor}. " + \
+            "Error installing llama-cpp-python. Could not install the binary wheels from GitHub." + \
             f"You already have a version of llama-cpp-python ({version('llama-cpp-python')}) installed, however it may not be compatible!"
         )
         time.sleep(0.5) # I still don't know why this is required
diff --git a/scripts/make_wheel.sh b/scripts/make_wheel.sh
index 71e2e17..a65e18e 100644
--- a/scripts/make_wheel.sh
+++ b/scripts/make_wheel.sh
@@ -2,15 +2,15 @@
 # Don't run this. This is executed inside of the home assistant container to build the wheel
 
 apk update
-apk add build-base python3-dev
+apk add build-base python3-dev linux-headers
+
+tag=$1
 
 cd /tmp
-git clone --recurse-submodules https://github.com/abetlen/llama-cpp-python --branch $1
+git clone --recurse-submodules https://github.com/acon96/llama-cpp-python --branch $tag --depth 1 --shallow-submodules
 cd llama-cpp-python
 pip3 install build
-
-tag="homellm"
-sed -i -E "s/^(__version__ *= *\"[0-9]+\.[0-9]+\.[0-9]+)\"/\1+${tag}\"/" llama_cpp/__init__.py
+sed -i -E "s/^(__version__ *= *\")[^\"]+\"/\1${tag}\"/" llama_cpp/__init__.py
 
 export CMAKE_ARGS="-DLLAVA_BUILD=OFF -DGGML_NATIVE=ON"
 python3 -m build --wheel
diff --git a/scripts/run_docker_to_make_wheels.sh b/scripts/run_docker_to_make_wheels.sh
index 007a0f3..3dda311 100755
--- a/scripts/run_docker_to_make_wheels.sh
+++ b/scripts/run_docker_to_make_wheels.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-VERSION_TO_BUILD="v0.3.16"
+VERSION_TO_BUILD="0.3.16+b6713"
 
 # make python 11 wheels
 # docker run -it --rm \