add merge with alpaca dataset

Alex O'Connell
2023-12-13 21:53:48 -05:00
parent 46b6ee4f65
commit 2384b75583
6 changed files with 53 additions and 21406 deletions

.gitignore

@@ -2,4 +2,6 @@ models/
 loras/
 core/
 config/
-.DS_Store
+.DS_Store
+data/*.json
+*.pyc

generate_home_assistant_data.py

@@ -367,8 +367,10 @@ def format_example(example):
     sys_prompt = "You are 'Al', a helpful AI Assistant that controls the devices in a house. Complete the following task as instructed or answer the following question with the information provided only."
     services_block = "Services: " + ", ".join(sorted(example["available_services"]))
     states_block = "Devices:\n" + "\n".join(example["states"])
-    question = "Request:\n" + example["question"]
-    answers = "Response:\n" + " ".join(example["answers"])
+    # question = "Request:\n" + example["question"]
+    # answers = "Response:\n" + " ".join(example["answers"])
+    question = example["question"]
+    answers = " ".join(example["answers"])
     system_block = "\n".join(["<|im_start|>system " + sys_prompt, services_block, states_block, "<|im_end|>" ])
     user_block = "<|im_start|>user " + question + "<|im_end|>"
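
For context, here is a rough, self-contained sketch of the system and user blocks this hunk now assembles, with the "Request:"/"Response:" prefixes dropped. Only the lines visible in the diff are reproduced; the example request, device states, and available services below are made up for illustration, and the assistant/response side of the prompt is outside this hunk.

# Illustrative sketch only: the example values below are hypothetical, not from the dataset.
example = {
    "available_services": ["light.turn_off", "light.turn_on"],
    "states": ["light.kitchen 'Kitchen Light' = off"],
    "question": "Turn on the kitchen light.",
    "answers": ["Turning on the kitchen light."],
}

sys_prompt = "You are 'Al', a helpful AI Assistant that controls the devices in a house. Complete the following task as instructed or answer the following question with the information provided only."
services_block = "Services: " + ", ".join(sorted(example["available_services"]))
states_block = "Devices:\n" + "\n".join(example["states"])
question = example["question"]          # no "Request:" prefix after this change
answers = " ".join(example["answers"])  # no "Response:" prefix after this change

system_block = "\n".join(["<|im_start|>system " + sys_prompt, services_block, states_block, "<|im_end|>"])
user_block = "<|im_start|>user " + question + "<|im_end|>"

print(system_block)
print(user_block)
print(answers)  # presumably fed to the assistant block, which is not shown in this hunk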

File diff suppressed because it is too large

File diff suppressed because it is too large

@@ -1 +1,43 @@
# TODO: merge the home_assistant dataset with alpaca locally
import random

from datasets import load_dataset, concatenate_datasets

from generate_home_assistant_data import format_example, random_device_list, SUPPORTED_DEVICES

# 90/10 train/test split of the cleaned alpaca dataset; the home assistant data
# is already split into its own train/test JSON files.
alpaca_dataset = load_dataset("yahma/alpaca-cleaned")["train"].train_test_split(test_size=0.1)
home_assistant_dataset = load_dataset("json", data_files={ "train": "home_assistant_train.json", "test": "home_assistant_test.json" })

random.seed(42)

def format_alpaca(example):
    # combine the alpaca instruction and its optional input into a single question
    question = example["instruction"]
    if example["input"]:
        question = question + "\n" + example["input"]
    answer = example["output"]

    # attach a random set of smart-home devices and the services they expose,
    # so alpaca examples look like the home assistant examples
    device_list, device_types = random_device_list(max_devices=32, avoid_device_names=[])
    available_services = []
    for x in device_types:
        available_services.extend([ f"{x}.{y}" for y in SUPPORTED_DEVICES[x].services ])

    # reuse the home assistant prompt formatter so both datasets share one format
    text = format_example(example={
        "states": device_list,
        "available_services": list(available_services),
        "question": question,
        "answers": [ answer ],
        "service_calls": []
    })

    result = {
        "text": text
    }
    return result

alpaca_dataset = alpaca_dataset.map(format_alpaca).remove_columns(["input", "output", "instruction"])

combined_dataset_train = concatenate_datasets([home_assistant_dataset["train"], alpaca_dataset["train"]]).shuffle(seed=42)
combined_dataset_test = concatenate_datasets([home_assistant_dataset["test"], alpaca_dataset["test"]]).shuffle(seed=42)

combined_dataset_train.to_json("home_assistant_alpaca_merged_train.json")
combined_dataset_test.to_json("home_assistant_alpaca_merged_test.json")
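
As a quick sanity check (not part of the commit), the merged splits written above can be loaded back the same way the script loads the original JSON files; the snippet below is illustrative and assumes it runs from the directory containing the two merged files.

from datasets import load_dataset

# Illustrative only: reload the merged splits and confirm both sources collapsed
# into a single "text" column with the expected number of rows.
merged = load_dataset("json", data_files={
    "train": "home_assistant_alpaca_merged_train.json",
    "test": "home_assistant_alpaca_merged_test.json",
})
print(merged["train"].column_names)               # expected to be just ["text"] if both sources share the schema
print(len(merged["train"]), len(merged["test"]))  # row counts of each split
print(merged["train"][0]["text"][:200])           # peek at one formatted example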

@@ -14,8 +14,9 @@ python3 train.py \
 --add_pad_token \
 --add_chatml_tokens \
 --bf16 \
---train_dataset data/home_assistant_train.json \
---test_dataset data/home_assistant_test.json
+--train_dataset data/home_assistant_alpaca_merged_train.json \
+--test_dataset data/home_assistant_alpaca_merged_test.json \
+--learning_rate 2e-5
 """
 """