mirror of
https://github.com/acon96/home-llm.git
synced 2026-01-09 13:48:05 -05:00
add merge with alpaca dataset
This commit is contained in:
4
.gitignore
vendored
4
.gitignore
vendored
@@ -2,4 +2,6 @@ models/
|
||||
loras/
|
||||
core/
|
||||
config/
|
||||
.DS_Store
|
||||
.DS_Store
|
||||
data/*.json
|
||||
*.pyc
|
||||
@@ -367,8 +367,10 @@ def format_example(example):
|
||||
sys_prompt = "You are 'Al', a helpful AI Assistant that controls the devices in a house. Complete the following task as instructed or answer the following question with the information provided only."
|
||||
services_block = "Services: " + ", ".join(sorted(example["available_services"]))
|
||||
states_block = "Devices:\n" + "\n".join(example["states"])
|
||||
question = "Request:\n" + example["question"]
|
||||
answers = "Response:\n" + " ".join(example["answers"])
|
||||
# question = "Request:\n" + example["question"]
|
||||
# answers = "Response:\n" + " ".join(example["answers"])
|
||||
question = example["question"]
|
||||
answers = " ".join(example["answers"])
|
||||
|
||||
system_block = "\n".join(["<|im_start|>system " + sys_prompt, services_block, states_block, "<|im_end|>" ])
|
||||
user_block = "<|im_start|>user " + question + "<|im_end|>"
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -1 +1,43 @@
|
||||
# TODO: merge the home_assistant dataset with alpaca locally
|
||||
import random
|
||||
from datasets import load_dataset, concatenate_datasets
|
||||
from generate_home_assistant_data import format_example, random_device_list, SUPPORTED_DEVICES
|
||||
|
||||
# Hold out 10% of alpaca-cleaned for evaluation. The split is seeded explicitly:
# random.seed() below only seeds the stdlib `random` module and runs after the
# split, so without `seed=` the train/test partition would change on every run.
alpaca_dataset = load_dataset("yahma/alpaca-cleaned")["train"].train_test_split(test_size=0.1, seed=42)
home_assistant_dataset = load_dataset("json", data_files={ "train": "home_assistant_train.json", "test": "home_assistant_test.json" })

# Seed the stdlib RNG before any alpaca example is formatted further down.
random.seed(42)
|
||||
|
||||
def format_alpaca(example):
    """Convert one alpaca record into a single formatted "text" example.

    The instruction (plus the optional input, separated by a newline) becomes
    the question and the output becomes the only answer. A device list from
    random_device_list and the services of the returned device types are
    attached before the record is passed to format_example.

    Args:
        example: mapping with the "instruction", "input" and "output" keys.

    Returns:
        A dict with a single "text" key holding the value returned by
        format_example.
    """
    question = example["instruction"]
    if example["input"]:
        # Append the optional input to the instruction. The previous chained
        # assignment (`question = question = "\n" + ...`) threw the
        # instruction away and kept only the input.
        question = question + "\n" + example["input"]

    answer = example["output"]

    device_list, device_types = random_device_list(max_devices=32, avoid_device_names=[])

    # One "<device_type>.<service>" entry per service of every returned type.
    available_services = [
        f"{device_type}.{service}"
        for device_type in device_types
        for service in SUPPORTED_DEVICES[device_type].services
    ]

    text = format_example(example={
        "states": device_list,
        "available_services": available_services,
        "question": question,
        "answers": [ answer ],
        "service_calls": []
    })

    return {"text": text}
|
||||
|
||||
# Format every alpaca record and drop the raw columns, leaving only the new "text" column.
alpaca_dataset = alpaca_dataset.map(format_alpaca).remove_columns(["input", "output", "instruction"])

# Merge each split with its home-assistant counterpart and shuffle with a fixed seed.
combined_dataset_train = concatenate_datasets([home_assistant_dataset["train"], alpaca_dataset["train"]]).shuffle(seed=42)
combined_dataset_test = concatenate_datasets([home_assistant_dataset["test"], alpaca_dataset["test"]]).shuffle(seed=42)

combined_dataset_train.to_json("home_assistant_alpaca_merged_train.json")
# Write the test split here; previously the train split was written to the
# test file as well, so the merged test data was never exported.
combined_dataset_test.to_json("home_assistant_alpaca_merged_test.json")
|
||||
5
train.py
5
train.py
@@ -14,8 +14,9 @@ python3 train.py \
|
||||
--add_pad_token \
|
||||
--add_chatml_tokens \
|
||||
--bf16 \
|
||||
--train_dataset data/home_assistant_train.json \
|
||||
--test_dataset data/home_assistant_test.json
|
||||
--train_dataset data/home_assistant_alpaca_merged_train.json \
|
||||
--test_dataset data/home_assistant_alpaca_merged_test.json \
|
||||
--learning_rate 2e-5
|
||||
"""
|
||||
|
||||
"""
|
||||
|
||||
Reference in New Issue
Block a user