add merge with alpaca dataset

Alex O'Connell
2023-12-13 21:53:48 -05:00
parent 46b6ee4f65
commit 2384b75583
6 changed files with 53 additions and 21406 deletions

.gitignore

@@ -2,4 +2,6 @@ models/
 loras/
 core/
 config/
-.DS_Store
+.DS_Store
+data/*.json
+*.pyc

generate_home_assistant_data.py

@@ -367,8 +367,10 @@ def format_example(example):
     sys_prompt = "You are 'Al', a helpful AI Assistant that controls the devices in a house. Complete the following task as instructed or answer the following question with the information provided only."
     services_block = "Services: " + ", ".join(sorted(example["available_services"]))
     states_block = "Devices:\n" + "\n".join(example["states"])
-    question = "Request:\n" + example["question"]
-    answers = "Response:\n" + " ".join(example["answers"])
+    # question = "Request:\n" + example["question"]
+    # answers = "Response:\n" + " ".join(example["answers"])
+    question = example["question"]
+    answers = " ".join(example["answers"])
     system_block = "\n".join(["<|im_start|>system " + sys_prompt, services_block, states_block, "<|im_end|>" ])
     user_block = "<|im_start|>user " + question + "<|im_end|>"
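
For context, here is a rough, self-contained sketch of the system and user blocks this hunk now assembles, with the "Request:"/"Response:" prefixes dropped. Only the lines visible in the diff are reproduced; the example request, device states, and available services below are made up for illustration, and the assistant/response side of the prompt is outside this hunk.

# Illustrative sketch only: the example values below are hypothetical, not from the dataset.
example = {
    "available_services": ["light.turn_off", "light.turn_on"],
    "states": ["light.kitchen 'Kitchen Light' = off"],
    "question": "Turn on the kitchen light.",
    "answers": ["Turning on the kitchen light."],
}

sys_prompt = "You are 'Al', a helpful AI Assistant that controls the devices in a house. Complete the following task as instructed or answer the following question with the information provided only."
services_block = "Services: " + ", ".join(sorted(example["available_services"]))
states_block = "Devices:\n" + "\n".join(example["states"])
question = example["question"]          # no "Request:" prefix after this change
answers = " ".join(example["answers"])  # no "Response:" prefix after this change

system_block = "\n".join(["<|im_start|>system " + sys_prompt, services_block, states_block, "<|im_end|>"])
user_block = "<|im_start|>user " + question + "<|im_end|>"

print(system_block)
print(user_block)
print(answers)  # presumably fed to the assistant block, which is not shown in this hunk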

File diff suppressed because it is too large

File diff suppressed because it is too large

@@ -1 +1,43 @@
# TODO: merge the home_assistant dataset with alpaca locally
import random

from datasets import load_dataset, concatenate_datasets

from generate_home_assistant_data import format_example, random_device_list, SUPPORTED_DEVICES

# 90/10 train/test split of the cleaned alpaca dataset; the home assistant data
# is already split into its own train/test JSON files.
alpaca_dataset = load_dataset("yahma/alpaca-cleaned")["train"].train_test_split(test_size=0.1)
home_assistant_dataset = load_dataset("json", data_files={ "train": "home_assistant_train.json", "test": "home_assistant_test.json" })

random.seed(42)

def format_alpaca(example):
    # combine the alpaca instruction and its optional input into a single question
    question = example["instruction"]
    if example["input"]:
        question = question + "\n" + example["input"]
    answer = example["output"]

    # attach a random set of smart-home devices and the services they expose,
    # so alpaca examples look like the home assistant examples
    device_list, device_types = random_device_list(max_devices=32, avoid_device_names=[])
    available_services = []
    for x in device_types:
        available_services.extend([ f"{x}.{y}" for y in SUPPORTED_DEVICES[x].services ])

    # reuse the home assistant prompt formatter so both datasets share one format
    text = format_example(example={
        "states": device_list,
        "available_services": list(available_services),
        "question": question,
        "answers": [ answer ],
        "service_calls": []
    })

    result = {
        "text": text
    }
    return result

alpaca_dataset = alpaca_dataset.map(format_alpaca).remove_columns(["input", "output", "instruction"])

combined_dataset_train = concatenate_datasets([home_assistant_dataset["train"], alpaca_dataset["train"]]).shuffle(seed=42)
combined_dataset_test = concatenate_datasets([home_assistant_dataset["test"], alpaca_dataset["test"]]).shuffle(seed=42)

combined_dataset_train.to_json("home_assistant_alpaca_merged_train.json")
combined_dataset_test.to_json("home_assistant_alpaca_merged_test.json")
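
As a quick sanity check (not part of the commit), the merged splits written above can be loaded back the same way the script loads the original JSON files; the snippet below is illustrative and assumes it runs from the directory containing the two merged files.

from datasets import load_dataset

# Illustrative only: reload the merged splits and confirm both sources collapsed
# into a single "text" column with the expected number of rows.
merged = load_dataset("json", data_files={
    "train": "home_assistant_alpaca_merged_train.json",
    "test": "home_assistant_alpaca_merged_test.json",
})
print(merged["train"].column_names)               # expected to be just ["text"] if both sources share the schema
print(len(merged["train"]), len(merged["test"]))  # row counts of each split
print(merged["train"][0]["text"][:200])           # peek at one formatted example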

@@ -14,8 +14,9 @@ python3 train.py \
 --add_pad_token \
 --add_chatml_tokens \
 --bf16 \
---train_dataset data/home_assistant_train.json \
---test_dataset data/home_assistant_test.json
+--train_dataset data/home_assistant_alpaca_merged_train.json \
+--test_dataset data/home_assistant_alpaca_merged_test.json \
+--learning_rate 2e-5
 """
 """