Merge pull request #132 from acon96/release/v0.2.14

Release v0.2.14
2026-01-10 06:07:58 -05:00 · 2024-05-02 21:55:06 -04:00
parent f301e5cf45 875547d2e2
commit 9ed95dd987
10 changed files with 180 additions and 45 deletions
--- a/.github/workflows/create-release.yml
+++ b/.github/workflows/create-release.yml
@@ -25,7 +25,7 @@ jobs:
        - home_assistant_version: "2024.2.1"
          arch: "amd64"
          suffix: "-noavx"
-          extra_defines: "-DLLAMA_NATIVE=OFF -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF"
+          extra_defines: "-DLLAMA_NATIVE=OFF -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DLLAMA_F16C=OFF"
        - home_assistant_version: "2024.2.1"
          arch: "amd64"
          suffix: "-avx512"
@@ -33,7 +33,7 @@ jobs:
        - home_assistant_version: "2024.2.1"
          arch: "i386"
          suffix: "-noavx"
-          extra_defines: "-DLLAMA_NATIVE=OFF -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF"
+          extra_defines: "-DLLAMA_NATIVE=OFF -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DLLAMA_F16C=OFF"
        - home_assistant_version: "2024.2.1"
          arch: "i386"
          suffix: "-avx512"
--- a/README.md
+++ b/README.md
@@ -126,6 +126,7 @@ In order to facilitate running the project entirely on the system where Home Ass
 ## Version History
 | Version | Description                                                                                                                                                                                                          |
 |---------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| v0.2.14 | Fix llama.cpp wheels + AVX detection                                                                                                                                                                                 |
 | v0.2.13 | Add support for Llama 3, build llama.cpp wheels that are compatible with non-AVX systems, fix an error with exposing script entities, fix multiple small Ollama backend issues, and add basic multi-language support |
 | v0.2.12 | Fix cover ICL examples, allow setting number of ICL examples, add min P and typical P sampler options, recommend models during setup, add JSON mode for Ollama backend, fix missing default options                  |
 | v0.2.11 | Add prompt caching, expose llama.cpp runtime settings, build llama-cpp-python wheels using GitHub actions, and install wheels directly from GitHub                                                                   |
--- a/addon/Dockerfile
+++ b/addon/Dockerfile
@@ -23,7 +23,7 @@ RUN \
        python3-venv \
        python3-pip \
    \
-    && git clone https://github.com/oobabooga/text-generation-webui.git ${APP_DIR} --branch snapshot-2024-04-14 \
+    && git clone https://github.com/oobabooga/text-generation-webui.git ${APP_DIR} --branch snapshot-2024-04-28 \
    && python3 -m pip install torch torchvision torchaudio py-cpuinfo==9.0.0 \
    && python3 -m pip install -r ${APP_DIR}/requirements_cpu_only_noavx2.txt llama-cpp-python \
    && apt-get purge -y --auto-remove \
--- a/addon/config.yaml
+++ b/addon/config.yaml
@@ -1,6 +1,6 @@
 ---
 name: oobabooga-text-generation-webui
-version: 2024.04.14
+version: 2024.04.28
 slug: text-generation-webui
 description: "A tool for running Large Language Models"
 url: "https://github.com/oobabooga/text-generation-webui"
--- a/custom_components/llama_conversation/agent.py
+++ b/custom_components/llama_conversation/agent.py
@@ -25,7 +25,7 @@ from homeassistant.helpers import config_validation as cv, intent, template, ent
 from homeassistant.helpers.event import async_track_state_change, async_call_later
 from homeassistant.util import ulid

-from .utils import closest_color, flatten_vol_schema, install_llama_cpp_python
+from .utils import closest_color, flatten_vol_schema, install_llama_cpp_python, validate_llama_cpp_python_installation
 from .const import (
    CONF_CHAT_MODEL,
    CONF_MAX_TOKENS,
@@ -426,7 +426,7 @@ class LLaMAAgent(AbstractConversationAgent):

            # if we filtered everything then just sample randomly
            if len(selected_in_context_examples) == 0:
-                selected_in_context_examples = self.in_context_examples
+                selected_in_context_examples = self.in_context_examples[:]

            random.shuffle(selected_in_context_examples)
            random.shuffle(entity_names)
@@ -536,6 +536,8 @@ class LocalLLaMAAgent(LLaMAAgent):

        if not self.model_path:
            raise Exception(f"Model was not found at '{self.model_path}'!")
+        
+        validate_llama_cpp_python_installation()

        # don't import it until now because the wheel is installed by config_flow.py
        try:
--- a/custom_components/llama_conversation/config_flow.py
+++ b/custom_components/llama_conversation/config_flow.py
@@ -361,7 +361,6 @@ class ConfigFlow(BaseLlamaConversationConfigFlow, config_entries.ConfigFlow, dom
        else:
            wheel_install_result = self.install_wheel_task.result()
            if not wheel_install_result:
-                _LOGGER.warning("Failed to install wheel: %s", repr(wheel_install_result))
                self.install_wheel_error = "pip_wheel_error"
                next_step = "pick_backend"
            else:
--- a/custom_components/llama_conversation/const.py
+++ b/custom_components/llama_conversation/const.py
@@ -271,5 +271,5 @@ OPTIONS_OVERRIDES = {
    }
 }

-INTEGRATION_VERSION = "0.2.13"
-EMBEDDED_LLAMA_CPP_PYTHON_VERSION = "0.2.64"
+INTEGRATION_VERSION = "0.2.14"
+EMBEDDED_LLAMA_CPP_PYTHON_VERSION = "0.2.69"
--- a/custom_components/llama_conversation/utils.py
+++ b/custom_components/llama_conversation/utils.py
@@ -67,6 +67,24 @@ def download_model_from_hf(model_name: str, quantization_type: str, storage_fold
        cache_dir=storage_folder,
    )

+def _load_extension():
+    """Import llama_cpp. Must stay at the root file level because the 'spawn' start method is used."""
+    from importlib import import_module
+    import_module("llama_cpp")
+    
+def validate_llama_cpp_python_installation():
+    """Import llama_cpp in a spawned child process so a failing import cannot crash the main process.
+
+    Raises Exception if the child process exits with a non-zero exit code."""
+    import multiprocessing
+    # 'spawn' is required because of aio; get_context() can be called repeatedly, unlike set_start_method()
+    mp_ctx = multiprocessing.get_context('spawn')
+    process = mp_ctx.Process(target=_load_extension)
+    process.start()
+    process.join()
+    if process.exitcode != 0:
+        raise Exception(f"Failed to properly initialize llama-cpp-python. (Exit code {process.exitcode}.)")
+
 def install_llama_cpp_python(config_dir: str):

    installed_wrong_version = False
@@ -90,7 +108,7 @@ def install_llama_cpp_python(config_dir: str):
                cpu_features = [ line for line in f.readlines() if line.startswith("Features") or line.startswith("flags")][0]
            if "avx512f" in cpu_features and "avx512bw" in cpu_features:
                instruction_extensions_suffix = "-avx512"
-            elif "avx2" not in cpu_features:
+            elif "avx2" not in cpu_features or "avx" not in cpu_features or "f16c" not in cpu_features or "fma" not in cpu_features or not ("sse3" in cpu_features or "ssse3" in cpu_features):
                instruction_extensions_suffix = "-noavx"
        except Exception as ex:
            _LOGGER.debug(f"Couldn't detect CPU features: {ex}")
@@ -129,4 +147,5 @@ def install_llama_cpp_python(config_dir: str):
            f"You already have a version of llama-cpp-python ({version('llama-cpp-python')}) installed, however it may not be compatible!"
        )
        time.sleep(0.5) # I still don't know why this is required
+
        return True
--- a/evaluate.py
+++ b/evaluate.py
@@ -1,15 +1,18 @@
 #!/usr/bin/env python3

-import argparse, os, re, json
+import argparse, os, re, json, csv, random
 import torch
 from datasets import load_dataset
 from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
 from peft import PeftConfig, PeftModel
 from tqdm import tqdm

+torch.set_default_device("cuda")
+
 CTX_SIZE = 2048
 TRUST_REMOTE_CODE = False

+
 """
 python3 evaluate.py stablehome-1_6b-rev3 --batch-size 8 --all-checkpoints
 python3 evaluate.py tinyhome-rev1 --batch-size 12 --all-checkpoints
@@ -17,6 +20,57 @@ python3 evaluate.py stablehome-3b-rev6 --batch-size 4 --lora --overwrite
 """

 service_call_regex = re.compile(r"```homeassistant\n([\S \t\n]*?)```")
+json_regex = re.compile(r"({[\S \t]*?})")
+service_names_regex = re.compile(r"\b\w+\.\w+\([^)]*\)")
+entity_ids_regex = re.compile(r"\b\w+\.\w+(?=\s'|\s=)")
+
+try:
+    with open("custom_components/llama_conversation/in_context_examples.csv", encoding="utf-8-sig") as f:
+        in_context_examples = list(csv.DictReader(f))
+except Exception:  # best-effort: fall back to an empty example list, but let Ctrl-C / SystemExit propagate
+    in_context_examples = []
+
+def icl_example_generator(num_examples, entity_names, service_names):
+    """Build up to `num_examples` in-context examples, one JSON object per line.
+
+    Rows of the module-level `in_context_examples` are kept when their service is listed in
+    `service_names` and its domain matches the domain of an entity in `entity_names`; when
+    nothing survives that filter, every row is used. Each emitted example is paired with the
+    first same-domain entity after shuffling; rows with no matching entity are skipped.
+    """
+    shuffled_entities = entity_names[:]
+    known_domains = {name.split(".")[0] for name in shuffled_entities}
+    candidates = [
+        row for row in in_context_examples
+        if row["service"] in service_names and row["service"].split(".")[0] in known_domains
+    ]
+    if not candidates:
+        # nothing matched the filter, so fall back to sampling from every example
+        candidates = in_context_examples[:]
+    random.shuffle(candidates)
+    random.shuffle(shuffled_entities)
+
+    wanted = min(num_examples, len(candidates))
+    if wanted < num_examples:
+        print(f"Attempted to generate {num_examples} ICL examples for conversation, but only {len(candidates)} are available!")
+    picked = []
+    # walk the shuffled rows from the back, mirroring repeated pop() calls
+    for row in reversed(candidates):
+        if len(picked) >= wanted:
+            break
+        service = row["service"]
+        domain = service.split(".")[0]
+        device = next((name for name in shuffled_entities if name.split(".")[0] == domain), None)
+        if device is None:
+            continue
+        picked.append(json.dumps({
+            "to_say": row["response"],
+            "service": service,
+            "target_device": device,
+        }))
+    # the earliest pick ends up last, matching insertion at the front of the list
+    picked.reverse()
+    return "\n".join(picked)

 def tokenize(tokenizer, prompt):
    return tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=CTX_SIZE)
@@ -28,8 +82,8 @@ def generate(model, tokenizer, prompts):
    text = tokenizer.batch_decode(outputs)
    return text

-def evaluate(output_folder, trained_model, trained_tokenizer, dataset, batch_size):
-    split = trained_tokenizer.apply_chat_template(conversation=[{"role": "assistant", "content":  r"%%%%%%%%%%%%%%%%"}], tokenize=False).split( r"%%%%%%%%%%%%%%%%")[0]
+def evaluate(output_folder, trained_model, trained_tokenizer, dataset, batch_size, use_icl):
+    split = trained_tokenizer.apply_chat_template(conversation=[{"role": "assistant", "content":  r"%%%%%%%%%%%%%%%%"}], tokenize=False).split( r"%%%%%%%%%%%%%%%%")[0].replace(trained_tokenizer.bos_token, "")

    print("Evaluating...")
    correct_answers = 0
@@ -54,6 +108,18 @@ def evaluate(output_folder, trained_model, trained_tokenizer, dataset, batch_siz
                expected_responses = []
                for example in batch["conversations"]:
                    conversation = [ { "role": x["from"], "content": x["value"] } for x in example if x["from"] != "assistant"]
+
+                    if use_icl:
+                        new_conversation = []
+                        for turn in conversation:
+                            if turn["role"] == "system":
+                                entity_names = entity_ids_regex.findall(turn["content"])
+                                service_names = [ x.split("(")[0] for x in service_names_regex.findall(turn["content"]) ]
+                                icl_examples = icl_example_generator(5, entity_names, service_names)
+                                turn["content"] = turn["content"] + "Respond to the following user instruction by responding in the same format as the following examples:\n" + icl_examples
+                            new_conversation.append(turn)
+                        conversation = new_conversation
+                    
                    prompts.append(trained_tokenizer.apply_chat_template(
                        conversation=conversation,
                        max_length=CTX_SIZE,
@@ -61,7 +127,14 @@ def evaluate(output_folder, trained_model, trained_tokenizer, dataset, batch_siz
                        tokenize=False,
                        add_generation_prompt=True,
                    ))
-                    expected_responses.append([x["value"] for x in example if x["from"] == "assistant"][0])
+
+                    if use_icl:
+                        response = [x["value"] for x in example if x["from"] == "assistant"][0]
+                        expected_calls = service_call_regex.findall(response)
+                        to_say = service_call_regex.sub("", response)
+                        expected_responses.append(expected_calls[0])
+                    else:
+                        expected_responses.append([x["value"] for x in example if x["from"] == "assistant"][0])
            output = generate(trained_model, trained_tokenizer, prompts)

            for model_output, expected_response in zip(output, expected_responses):
@@ -69,14 +142,19 @@ def evaluate(output_folder, trained_model, trained_tokenizer, dataset, batch_siz

                expected_service_calls = []

-                for block in service_call_regex.findall(expected_response.strip()):
+                if use_icl:
+                    regex_to_use = json_regex
+                else:
+                    regex_to_use = service_call_regex
+
+                for block in regex_to_use.findall(expected_response.strip()):
                    for line in block.split("\n"):
                        if len(line) == 0:
                            continue
                        expected_service_calls.append(json.loads(line))
                        total_answers = total_answers + 1
                
-                found_responses = service_call_regex.findall(response.strip())
+                found_responses = regex_to_use.findall(response.strip())

                if len(expected_service_calls) == 0:
                    total_answers = total_answers + 1
@@ -101,6 +179,9 @@ def evaluate(output_folder, trained_model, trained_tokenizer, dataset, batch_siz
                            failed_examples.append({ "expected": expected_response, "actual": response, "invalid_json": True })
                            continue

+                        if use_icl:
+                            json_output.pop("to_say")
+                            
                        if json_output in expected_service_calls:
                            expected_service_calls.pop(expected_service_calls.index(json_output))
                            correct_answers = correct_answers + 1
@@ -136,7 +217,7 @@ def evaluate(output_folder, trained_model, trained_tokenizer, dataset, batch_siz
            "failed_examples": failed_examples,
        }, f, indent=4)

-def load_model(model_name, is_lora, checkpoint_name):
+def load_model(model_name, is_lora, is_hf, load_in_8bit, checkpoint_name):
    lora_folder = f"./loras/{model_name}/"
    model_folder = f"./models/{model_name}/"
    
@@ -147,7 +228,21 @@ def load_model(model_name, is_lora, checkpoint_name):
        lora_folder = lora_folder + f"{checkpoint_name}/"
        model_folder = model_folder + f"{checkpoint_name}/"

-    if is_lora:
+    if is_hf:
+        print(f"Loading model {model_name}...")
+        trained_model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            trust_remote_code=TRUST_REMOTE_CODE,
+            torch_dtype=torch.bfloat16,
+            load_in_8bit=load_in_8bit,
+        )
+
+        trained_tokenizer = AutoTokenizer.from_pretrained(
+            model_name,
+            trust_remote_code=TRUST_REMOTE_CODE,
+            padding_side='left',
+        )
+    elif is_lora:
        adapter_config = PeftConfig.from_pretrained(lora_folder)
        base_model_name = adapter_config.base_model_name_or_path
        print(f"Loading lora from {lora_folder} ({base_model_name})...")
@@ -175,6 +270,7 @@ def load_model(model_name, is_lora, checkpoint_name):
            model_folder,
            trust_remote_code=TRUST_REMOTE_CODE,
            torch_dtype=torch.bfloat16,
+            load_in_8bit=load_in_8bit,
        )

        trained_tokenizer = AutoTokenizer.from_pretrained(
@@ -183,6 +279,9 @@ def load_model(model_name, is_lora, checkpoint_name):
            padding_side='left',
        )

+    if not trained_tokenizer.pad_token:
+        trained_tokenizer.pad_token = trained_tokenizer.eos_token
+
    trained_model.generation_config = GenerationConfig(
        max_new_tokens=128,
        use_cache=True,
@@ -191,13 +290,15 @@ def load_model(model_name, is_lora, checkpoint_name):
        top_k=40,
        top_p=1.0,
        repetition_penalty=1.15,
-        eos_token_id=trained_model.config.eos_token_id,
+        # eos_token_id=trained_model.config.eos_token_id,
+        eos_token_id=128009,
        pad_token_id=trained_model.config.pad_token_id if trained_model.config.pad_token_id else trained_model.config.eos_token_id,
    )

    return trained_model, trained_tokenizer

 def main():
+    global in_context_examples
    parser = argparse.ArgumentParser(description="Evaluate the function calling for a model")
    parser.add_argument("model")
    parser.add_argument("--dataset-file", default="./data/home_assistant_test.jsonl")
@@ -205,6 +306,8 @@ def main():
    parser.add_argument("--lora", default=False, action='store_const', const=True)
    parser.add_argument("--all-checkpoints", default=False, action='store_const', const=True)
    parser.add_argument("--overwrite", default=False, action='store_const', const=True)
+    parser.add_argument("--hf", default=False, action='store_const', const=True)
+    parser.add_argument("--load-in-8bit", default=False, action='store_const', const=True)

    args = parser.parse_args()
    batch_size = int(args.batch_size)
@@ -213,36 +316,42 @@ def main():

    print(f"Got {len(dataset)} examples to test")

-    model_folder = f"./loras/{args.model}/" if args.lora else f"./models/{args.model}/"
+    if args.hf:
+        output_folder = "./"
+        trained_model, trained_tokenizer = load_model(args.model, args.lora, True, args.load_in_8bit, None)
+        evaluate(output_folder, trained_model, trained_tokenizer, dataset, batch_size, True)

-    if not os.path.isdir(model_folder):
-        print(f"Model Not Found: {args.model}")
-        return
-
-    torch.set_default_device("cuda")
-    if not args.all_checkpoints:
-        checkpoints = [None]
    else:
-        checkpoints = [x for x in os.listdir(model_folder) if os.path.isdir(os.path.join(model_folder, x)) and "checkpoint" in x]
-        checkpoints = sorted(checkpoints, key=lambda x: int(x.split('-')[-1]))
-        checkpoints.append(None)
+        model_folder = f"./loras/{args.model}/" if args.lora else f"./models/{args.model}/"

-        print(f"Found {len(checkpoints) - 1} checkpoints to test (plus the final model)")
+        if not os.path.isdir(model_folder):
+            print(f"Model Not Found: {args.model}")
+            return

-    for ckpt in checkpoints:
-        if ckpt:
-            output_folder = os.path.join(model_folder, ckpt)
-        else:
-            output_folder = model_folder
        
-        output_filename = os.path.join(output_folder, "eval_results.json")
-        if os.path.exists(output_filename):
-            if not args.overwrite:
-                print(f"Evaluation already exists for {output_folder}. Skipping...")
-                continue
+        if not args.all_checkpoints:
+            checkpoints = [None]
+        else:
+            checkpoints = [x for x in os.listdir(model_folder) if os.path.isdir(os.path.join(model_folder, x)) and "checkpoint" in x]
+            checkpoints = sorted(checkpoints, key=lambda x: int(x.split('-')[-1]))
+            checkpoints.append(None)

-        trained_model, trained_tokenizer = load_model(args.model, args.lora, ckpt)
-        evaluate(output_folder, trained_model, trained_tokenizer, dataset, batch_size)
+            print(f"Found {len(checkpoints) - 1} checkpoints to test (plus the final model)")
+
+        for ckpt in checkpoints:
+            if ckpt:
+                output_folder = os.path.join(model_folder, ckpt)
+            else:
+                output_folder = model_folder
+
+            output_filename = os.path.join(output_folder, "eval_results.json")
+            if os.path.exists(output_filename):
+                if not args.overwrite:
+                    print(f"Evaluation already exists for {output_folder}. Skipping...")
+                    continue
+            # load_model takes (model_name, is_lora, is_hf, load_in_8bit, checkpoint_name)
+            trained_model, trained_tokenizer = load_model(args.model, args.lora, False, args.load_in_8bit, ckpt)
+            evaluate(output_folder, trained_model, trained_tokenizer, dataset, batch_size, False)


 if __name__ == "__main__":
--- a/scripts/convert_and_quantize.sh
+++ b/scripts/convert_and_quantize.sh
@@ -10,8 +10,13 @@ if [[ ! -d "./models/$MODEL_NAME" ]]; then
 fi

 echo "Converting to GGUF..."
-$LLAMA_CPP/convert.py --outfile ./models/$MODEL_NAME/$MODEL_NAME.f16.gguf --outtype f16 ./models/$MODEL_NAME/
-# $LLAMA_CPP/convert-hf-to-gguf.py --outfile ./models/$MODEL_NAME/$MODEL_NAME.f16.gguf --outtype f16 ./models/$MODEL_NAME/
+if [ ! -f "./models/$MODEL_NAME/$MODEL_NAME.f16.gguf" ]; then  # only convert when the f16 GGUF is not already present
+    $LLAMA_CPP/convert.py --outfile ./models/$MODEL_NAME/$MODEL_NAME.f16.gguf --outtype f16 ./models/$MODEL_NAME/
+    # $LLAMA_CPP/convert-hf-to-gguf.py --outfile ./models/$MODEL_NAME/$MODEL_NAME.f16.gguf --outtype f16 ./models/$MODEL_NAME/
+else
+    echo "Converted model for $MODEL_NAME already exists. Skipping..."
+fi
+

 DESIRED_QUANTS=("Q8_0" "Q5_K_M" "Q4_K_M" "Q3_K_M" "Q2_K")
 for QUANT in "${DESIRED_QUANTS[@]}"