Mirror of https://github.com/acon96/home-llm.git (synced 2026-01-10 14:18:00 -05:00)

Commit: wizardlm merge + fix eval
@@ -567,7 +567,7 @@ def generate_example_file(filename: str, seed: int, *, static_factor: int, templ
 def format_alpaca(example):
     question = example["instruction"]
-    if example["input"]:
+    if "input" in example and example["input"]:
         question = question + "\n" + example["input"]
     answer = example["output"]
@@ -592,13 +592,13 @@ def format_alpaca(example):
 
     return result
 
 
-def merge_with_dataset(dataset_name, seed, outupt_name, format_function):
+def merge_with_dataset(dataset_name, seed, outupt_name, format_function, dataset_column_names):
     alpaca_dataset = load_dataset(dataset_name)["train"].train_test_split(test_size=0.1)
     home_assistant_dataset = load_dataset("json", data_files={ "train": "home_assistant_train.json", "test": "home_assistant_test.json" })
 
     random.seed(seed)
 
-    alpaca_dataset = alpaca_dataset.map(format_function).remove_columns(["input", "output", "instruction"])
+    alpaca_dataset = alpaca_dataset.map(format_function).remove_columns(dataset_column_names)
 
     combined_dataset_train = concatenate_datasets([home_assistant_dataset["train"], alpaca_dataset["train"]]).shuffle(seed=42)
     combined_dataset_test = concatenate_datasets([home_assistant_dataset["test"], alpaca_dataset["test"]]).shuffle(seed=42)
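For orientation, a minimal self-contained sketch of the merge path after this change, assuming the Hugging Face `datasets` library. The diff does not show how the formatted text is built or how the merged splits are written out, so the `text` column and the output file names below are illustrative assumptions, not the repo's code; the column lists mirror the two call sites added in `main()`.

```python
# Sketch of the refactored merge path (illustrative; the "text" column and the
# output file names are assumptions, not lines from the diff).
import random
from datasets import load_dataset, concatenate_datasets

def format_alpaca(example):
    question = example["instruction"]
    # WizardLM 70k rows have no "input" column, hence the added membership check.
    if "input" in example and example["input"]:
        question = question + "\n" + example["input"]
    answer = example["output"]
    # The real script renders question/answer into its chat format here;
    # a plain "text" column stands in for that below.
    return {"text": f"{question}\n{answer}"}

def merge_with_dataset(dataset_name, seed, output_name, format_function, dataset_column_names):
    external = load_dataset(dataset_name)["train"].train_test_split(test_size=0.1)
    home = load_dataset("json", data_files={
        "train": "home_assistant_train.json",
        "test": "home_assistant_test.json",
    })

    random.seed(seed)

    # Drop whichever raw columns this particular dataset carries.
    external = external.map(format_function).remove_columns(dataset_column_names)

    train = concatenate_datasets([home["train"], external["train"]]).shuffle(seed=42)
    test = concatenate_datasets([home["test"], external["test"]]).shuffle(seed=42)
    # Hypothetical output step; the diff does not show how the merged splits are saved.
    train.to_json(f"home_assistant_{output_name}_train.json")
    test.to_json(f"home_assistant_{output_name}_test.json")

# Alpaca has instruction/input/output columns; WizardLM 70k only instruction/output.
merge_with_dataset("yahma/alpaca-cleaned", 42, "alpaca", format_alpaca,
                   ["input", "output", "instruction"])
merge_with_dataset("WizardLM/WizardLM_evol_instruct_70k", 42, "wizardlm70k", format_alpaca,
                   ["output", "instruction"])
```

The extra `dataset_column_names` parameter is what lets the same formatter serve both Alpaca (instruction/input/output) and WizardLM 70k (instruction/output only), which also motivates the `"input" in example` guard above.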
@@ -616,20 +616,33 @@ def main():
     parser.add_argument("--sample", action="store_true", help="Set this flag to enable generation of the sample dataset.")
     parser.add_argument("--test", action="store_true", help="Set this flag to enable generation of the test dataset.")
     parser.add_argument("--train", action="store_true", help="Set this flag to enable generation of the train dataset.")
-    parser.add_argument("--merge-alpaca", action="store_true", help="Set this flag to merge the generated datasets with the alpaca-cleaned dataset.")
+    parser.add_argument("--merge", help="Set this flag to merge the generated datasets with the specified dataset.")
+    train_size_group = parser.add_mutually_exclusive_group()
+    train_size_group.add_argument('--small', action='store_const', const='small', dest='size')
+    train_size_group.add_argument('--medium', action='store_const', const='medium', dest='size')
+    train_size_group.add_argument('--large', action='store_const', const='large', dest='size')
 
     args = parser.parse_args()
 
     if args.sample:
         generate_example_file("sample", 42, static_factor=1, template_factor=1, status_request_factor=1)
     if args.train:
-        # TODO: add small, medium, large cli flags
-        # generate_example_file("home_assistant_train", 42, static_factor=1, template_factor=10, status_request_factor=8)
-        generate_example_file("home_assistant_train", 42, static_factor=5, template_factor=15, status_request_factor=12)
-        # generate_example_file("home_assistant_train", 42, static_factor=5, template_factor=20, status_request_factor=15)
+        if args.size == "small":
+            generate_example_file("home_assistant_train", 42, static_factor=1, template_factor=10, status_request_factor=8)
+        elif args.size == "medium":
+            generate_example_file("home_assistant_train", 42, static_factor=5, template_factor=15, status_request_factor=12)
+        elif args.size == "large":
+            generate_example_file("home_assistant_train", 42, static_factor=5, template_factor=20, status_request_factor=15)
+        else:
+            raise Exception(f"Unrecognized dataset size: {args.size}")
     if args.test:
         generate_example_file("home_assistant_test", 12345, static_factor=0.25, template_factor=3, status_request_factor=2)
-    if args.merge_alpaca:
-        merge_with_dataset("yahma/alpaca-cleaned", 42, "alpaca", format_alpaca)
+    if args.merge == "alpaca":
+        merge_with_dataset("yahma/alpaca-cleaned", 42, "alpaca", format_alpaca, ["input", "output", "instruction"])
+    elif args.merge == "wizardlm70k":
+        merge_with_dataset("WizardLM/WizardLM_evol_instruct_70k", 42, "wizardlm70k", format_alpaca, ["output", "instruction"])
 
 if __name__ == "__main__":
     main()
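A quick standalone sketch of the new size-selection flags: because the mutually exclusive group uses `store_const` with `dest='size'`, `args.size` is `None` when no size flag is passed, which is what lands in the `Unrecognized dataset size` branch above.

```python
# Minimal reproduction of the size-flag behaviour added above.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--train", action="store_true")

size_group = parser.add_mutually_exclusive_group()
size_group.add_argument("--small", action="store_const", const="small", dest="size")
size_group.add_argument("--medium", action="store_const", const="medium", dest="size")
size_group.add_argument("--large", action="store_const", const="large", dest="size")

print(parser.parse_args(["--train", "--large"]).size)  # "large"
print(parser.parse_args(["--train"]).size)             # None -> raises in the script above
# parser.parse_args(["--small", "--large"])            # argparse error: mutually exclusive
```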
@@ -1,3 +1,4 @@
# home-llm experiments (phi1.5)
rev1 - original test
- 1 epoch
- train ctx 1900
@@ -217,7 +218,8 @@ rev 9 - reduced dataset size
 
 ------
 
-home-1b-rev1
+# Home 1B
+## home-1b-rev1
 - 1 epoch
 - 2048 train ctx
 - batch size 8
@@ -231,4 +233,80 @@ home-1b-rev1
+ it works OK with low temperatures
+ seems to handle the alpaca dataset not so well

home-1b-rev2
Eval results for existing models:
Home-1b-v1: 0.767816091954023
Home-3b-v2: 0.6908045977011494

## home-1b-rev5 series
- 1 epoch
- 2048 train ctx
- batch size 8
- learning rate 1e-5
- weight decay 0.1
- gradient clipping 1.0
- save model every 200 steps

home-1b-rev5
- dataset size: medium
- evaluation results:
    - 200: 0.553448275862069
    - 400: 0.7482758620689656 (+.19)
    - 600: 0.8103448275862069 (+.06)
    - 800: 0.8316091954022988 (+.02)
    - 1000: 0.8396551724137931 (+.008)
    - 1200: 0.8488505747126437 (+.009)
    - Final (1467): 0.8494252873563218 (+.0006)

home-1b-rev5_1
- dataset size: small
- evaluation results:
    - 200: 0.6057471264367816
    - 400: 0.7494252873563219 (+.143)
    - 600: 0.7683908045977011 (+.018)
    - 800: 0.7729885057471264 (+.0046)
    - Final (869): bad

home-1b-rev5_2
- dataset size: large
- evaluation results:
    - 200: --
    - 400: --
    - 600: 0.8425287356321839
    - 800: 0.8666666666666667
    - 1000: 0.8770114942528736
    - 1200: 0.8844827586206897
    - 1400: 0.8879310344827587
    - 1600: 0.8844827586206897
    - Final (1848): 0.8833333333333333
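The deltas quoted in parentheses are just successive differences between checkpoint accuracies; a small Python sketch reproducing them for the home-1b-rev5 run (scores copied from the list above):

```python
# Successive accuracy deltas for the home-1b-rev5 checkpoints listed above.
scores = [
    (200, 0.553448275862069),
    (400, 0.7482758620689656),
    (600, 0.8103448275862069),
    (800, 0.8316091954022988),
    (1000, 0.8396551724137931),
    (1200, 0.8488505747126437),
    (1467, 0.8494252873563218),  # final checkpoint
]

for (prev_step, prev_acc), (step, acc) in zip(scores, scores[1:]):
    print(f"{step}: {acc:.4f} ({acc - prev_acc:+.4f} since step {prev_step})")
```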

home-3b-v3-rev1
- dataset size: large
- evaluation results: 0.9091954022988505

home-3b-v3-rev2
- dataset size: large + alpaca
- evaluation results:

# Datasets

## SFT
Alpaca: https://huggingface.co/datasets/yahma/alpaca-cleaned
Alpaca (Translated): https://huggingface.co/datasets/saillab/taco-datasets
WizardLM 200k: https://huggingface.co/datasets/WizardLM/WizardLM_evol_instruct_V2_196k
WizardLM 70k: https://huggingface.co/datasets/WizardLM/WizardLM_evol_instruct_70k
Huggingface Ultrachat 200k: https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k
OpenOrca Slim Deduped (363k): https://huggingface.co/datasets/Open-Orca/SlimOrca-Dedup

## DPO
Intel Orca DPO Pairs: https://huggingface.co/datasets/Intel/orca_dpo_pairs
Huggingface Ultrachat: https://huggingface.co/datasets/HuggingFaceH4/ultrafeedback_binarized

----------------------------------------------------------------------------------------------------
python3 evaluate.py home-1b-rev5_2/checkpoint-600 --batch-size 12 && \
python3 evaluate.py home-1b-rev5_2/checkpoint-800 --batch-size 12 && \
python3 evaluate.py home-1b-rev5_2/checkpoint-1000 --batch-size 12 && \
python3 evaluate.py home-1b-rev5_2/checkpoint-1200 --batch-size 12 && \
python3 evaluate.py home-1b-rev5_2/checkpoint-1400 --batch-size 12 && \
python3 evaluate.py home-1b-rev5_2/checkpoint-1600 --batch-size 12 && \
python3 evaluate.py home-1b-rev5_2/checkpoint-1800 --batch-size 12 && \
python3 evaluate.py home-1b-rev5_2 --batch-size 12
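The same checkpoint sweep can also be driven from Python rather than a chained shell command; a minimal sketch, assuming `evaluate.py` is only invoked as a CLI and checkpoints sit in `checkpoint-<step>` subdirectories as above (the `evaluate_run` helper is hypothetical):

```python
# Hypothetical helper: run evaluate.py over every checkpoint of a run, then the final model.
import glob
import subprocess

def evaluate_run(run_dir: str, batch_size: int = 12) -> None:
    checkpoints = sorted(glob.glob(f"{run_dir}/checkpoint-*"),
                         key=lambda p: int(p.rsplit("-", 1)[-1]))
    for target in checkpoints + [run_dir]:
        subprocess.run(["python3", "evaluate.py", target, "--batch-size", str(batch_size)],
                       check=True)

evaluate_run("home-1b-rev5_2")
```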
train.py (17 lines changed)
@@ -17,7 +17,7 @@ Phi Modules: fc1,fc2,q_proj,v_proj,k_proj,dense,embed_tokens,lm_head
 
 """
 python3 train.py \
-    --run_name home-3b-v3-rev1 \
+    --run_name home-3b-v3-rev2 \
     --base_model microsoft/phi-2 \
     --add_pad_token \
     --add_chatml_tokens \
@@ -33,7 +33,7 @@ python3 train.py \
 
 """
 python3 train.py \
-    --run_name home-1b-rev4 \
+    --run_name home-1b-rev5 \
     --base_model microsoft/phi-1_5 \
     --add_pad_token \
     --add_chatml_tokens \
@@ -42,7 +42,7 @@ python3 train.py \
     --test_dataset data/home_assistant_test.json \
     --learning_rate 1e-5 \
     --micro_batch_size 4 --gradient_checkpointing \
-    --ctx_size 2048
+    --ctx_size 2048 --save_steps 200
 """
 
 """
@@ -73,6 +73,7 @@ class TrainingRunArguments:
     resume_from_checkpoint: str = field(default="", metadata={"help": "The name of the checkpoint to resume training from"})
     eval_steps: int = field(default=100, metadata={"help": "The number of steps in between evaluations of the model"})
     save_steps: int = field(default=-1, metadata={"help": "The number of steps in between model checkpoints; set to -1 to save every epoch"})
+    save_total_limit: int = field(default=1, metadata={"help": "The number of recent checkpoints of the model to save (not including the final model)"})
     group_by_length: bool = field(default=False, metadata={"help": "If enabled, the training data will be grouped by length to optimize use of padding"})
 
     # Quantization
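These dataclass fields become CLI flags through transformers' `HfArgumentParser`, matching the `parse_args_into_dataclasses` call shown in the next hunk; a trimmed sketch with just the checkpointing fields (defaults and help strings copied from the lines above):

```python
# Sketch: how TrainingRunArguments fields turn into --flags (subset of the real dataclass).
from dataclasses import dataclass, field
from transformers import HfArgumentParser

@dataclass
class TrainingRunArguments:
    eval_steps: int = field(default=100, metadata={"help": "The number of steps in between evaluations of the model"})
    save_steps: int = field(default=-1, metadata={"help": "The number of steps in between model checkpoints; set to -1 to save every epoch"})
    save_total_limit: int = field(default=1, metadata={"help": "The number of recent checkpoints of the model to save (not including the final model)"})

parser = HfArgumentParser([TrainingRunArguments])
# e.g. python3 train.py --ctx_size 2048 --save_steps 200  ->  save_steps == 200
training_run_args, _ = parser.parse_args_into_dataclasses(return_remaining_strings=True)
print(training_run_args.save_steps, training_run_args.save_total_limit)
```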
@@ -101,8 +102,6 @@ training_run_args, _ = parser.parse_args_into_dataclasses(return_remaining_strin
 if sum([training_run_args.load_in_8bit, training_run_args.load_in_4bit, training_run_args.load_as_gptq]) > 1:
     raise Exception("Please select at most one of 'load_in_8bit', 'load_in_4bit', or 'load_as_gptq'")
 
-# TODO: write a proper evaluation script
-
 print(f"Loading model '{training_run_args.base_model}'...")
 
 model_kwargs = {}
@@ -139,7 +138,7 @@ model = AutoModelForCausalLM.from_pretrained(
     max_memory=find_max_vram(),
     **model_kwargs
 )
-tokenizer = AutoTokenizer.from_pretrained(training_run_args.base_model, trust_remote_code=True, use_fast=False)
+tokenizer = AutoTokenizer.from_pretrained(training_run_args.base_model, trust_remote_code=True)
 
 if training_run_args.add_pad_token:
     tokenizer.add_special_tokens({'pad_token': '<|pad|>'})
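Adding special tokens like `<|pad|>` only changes the tokenizer; the model's embedding table also has to grow to cover the new ids. A hedged sketch of the usual companion step, reusing the script's `tokenizer`, `model`, and flags: the ChatML token strings and the `resize_token_embeddings` call are standard Hugging Face practice assumed here, not lines shown in this diff.

```python
# Assumed companion step: register the special tokens, then resize the model's embeddings.
if training_run_args.add_pad_token:
    tokenizer.add_special_tokens({"pad_token": "<|pad|>"})
if training_run_args.add_chatml_tokens:
    # Conventional ChatML delimiters; the exact strings are an assumption here.
    tokenizer.add_special_tokens({"additional_special_tokens": ["<|im_start|>", "<|im_end|>"]})

# Without this, the new token ids would index past the end of the embedding table.
model.resize_token_embeddings(len(tokenizer))
```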
@@ -196,8 +195,8 @@ training_args = TrainingArguments(
     # per_device_eval_batch_size=1,
     gradient_accumulation_steps=training_run_args.batch_size//training_run_args.micro_batch_size,
     gradient_checkpointing=training_run_args.gradient_checkpointing,
-    # weight_decay=training_run_args.weight_decay,
-    # max_grad_norm=training_run_args.gradient_clip,
+    weight_decay=training_run_args.weight_decay,
+    max_grad_norm=training_run_args.gradient_clip,
     evaluation_strategy="steps",
     eval_steps=training_run_args.eval_steps,
     save_strategy=("steps" if training_run_args.save_steps != -1 else "epoch"),
@@ -206,7 +205,7 @@ training_args = TrainingArguments(
     logging_steps=5,
     output_dir=model_dir,
     num_train_epochs=training_run_args.epochs,
-    save_total_limit=1,
+    save_total_limit=training_run_args.save_total_limit,
     # dataloader_pin_memory=False,
     report_to="tensorboard",
     learning_rate=training_run_args.learning_rate,
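Pulling the changed pieces together, checkpointing and regularization now flow from the CLI into `TrainingArguments` roughly as below. This is a consolidation of the hunks above, not extra code from the repo; in particular, how `save_steps` is handled when saving per epoch is an assumption.

```python
# Consolidated view of the TrainingArguments wiring after this commit
# (training_run_args and model_dir come from earlier in train.py).
from transformers import TrainingArguments

save_kwargs = {"save_strategy": "epoch"}
if training_run_args.save_steps != -1:
    # e.g. --save_steps 200 from the train commands above
    save_kwargs = {"save_strategy": "steps", "save_steps": training_run_args.save_steps}

training_args = TrainingArguments(
    output_dir=model_dir,
    num_train_epochs=training_run_args.epochs,
    learning_rate=training_run_args.learning_rate,
    gradient_accumulation_steps=training_run_args.batch_size // training_run_args.micro_batch_size,
    gradient_checkpointing=training_run_args.gradient_checkpointing,
    weight_decay=training_run_args.weight_decay,        # no longer commented out
    max_grad_norm=training_run_args.gradient_clip,      # no longer commented out
    evaluation_strategy="steps",
    eval_steps=training_run_args.eval_steps,
    save_total_limit=training_run_args.save_total_limit,  # was hard-coded to 1
    logging_steps=5,
    report_to="tensorboard",
    **save_kwargs,
)
```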