wizardlm merge + fix eval

Alex O'Connell
2024-01-25 20:46:59 -05:00
parent 57634519ca
commit e6fae06133
3 changed files with 110 additions and 20 deletions

View File

@@ -567,7 +567,7 @@ def generate_example_file(filename: str, seed: int, *, static_factor: int, templ
def format_alpaca(example):
question = example["instruction"]
if example["input"]:
if "input" in example and example["input"]:
question = question + "\n" + example["input"]
answer = example["output"]
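The new guard is needed because the two merge targets below have different schemas: yahma/alpaca-cleaned rows carry instruction/input/output, while WizardLM_evol_instruct_70k rows carry only instruction/output (compare the column lists passed to merge_with_dataset further down). A rough sketch of the two row shapes format_alpaca now has to accept, with made-up values:
# alpaca-cleaned style row: "input" exists but may be empty
{"instruction": "Summarize the passage.", "input": "Some passage ...", "output": "A short summary."}
# WizardLM_evol_instruct_70k style row: no "input" key at all, hence the `"input" in example` check
{"instruction": "Explain how quicksort works.", "output": "Quicksort partitions the list ..."}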
@@ -592,13 +592,13 @@ def format_alpaca(example):
return result
def merge_with_dataset(dataset_name, seed, outupt_name, format_function):
def merge_with_dataset(dataset_name, seed, outupt_name, format_function, dataset_column_names):
alpaca_dataset = load_dataset(dataset_name)["train"].train_test_split(test_size=0.1)
home_assistant_dataset = load_dataset("json", data_files={ "train": "home_assistant_train.json", "test": "home_assistant_test.json" })
random.seed(seed)
alpaca_dataset = alpaca_dataset.map(format_function).remove_columns(["input", "output", "instruction"])
alpaca_dataset = alpaca_dataset.map(format_function).remove_columns(dataset_column_names)
combined_dataset_train = concatenate_datasets([home_assistant_dataset["train"], alpaca_dataset["train"]]).shuffle(seed=42)
combined_dataset_test = concatenate_datasets([home_assistant_dataset["test"], alpaca_dataset["test"]]).shuffle(seed=42)
@@ -616,20 +616,33 @@ def main():
parser.add_argument("--sample", action="store_true", help="Set this flag to enable generation of the train dataset.")
parser.add_argument("--test", action="store_true", help="Set this flag to enable generation of the train dataset..")
parser.add_argument("--train", action="store_true", help="Set this flag to enable generation of the train dataset.")
parser.add_argument("--merge-alpaca", action="store_true", help="Set this flag to merge the generated datasets with the alpaca-cleaned dataset.")
parser.add_argument("--merge", help="Set this flag to merge the generated datasets with the specified dataset.")
train_size_group = parser.add_mutually_exclusive_group()
train_size_group.add_argument('--small', action='store_const', const='small', dest='size')
train_size_group.add_argument('--medium', action='store_const', const='medium', dest='size')
train_size_group.add_argument('--large', action='store_const', const='large', dest='size')
args = parser.parse_args()
if args.sample:
generate_example_file("sample", 42, static_factor=1, template_factor=1, status_request_factor=1)
if args.train:
# TODO: add small, medium, large cli flags
# generate_example_file("home_assistant_train", 42, static_factor=1, template_factor=10, status_request_factor=8)
generate_example_file("home_assistant_train", 42, static_factor=5, template_factor=15, status_request_factor=12)
# generate_example_file("home_assistant_train", 42, static_factor=5, template_factor=20, status_request_factor=15)
if args.size == "small":
generate_example_file("home_assistant_train", 42, static_factor=1, template_factor=10, status_request_factor=8)
elif args.size == "medium":
generate_example_file("home_assistant_train", 42, static_factor=5, template_factor=15, status_request_factor=12)
elif args.size == "large":
generate_example_file("home_assistant_train", 42, static_factor=5, template_factor=20, status_request_factor=15)
else:
raise Exception(f"Unrecognized dataset size: {args.size}")
if args.test:
generate_example_file("home_assistant_test", 12345, static_factor=0.25, template_factor=3, status_request_factor=2)
if args.merge_alpaca:
merge_with_dataset("yahma/alpaca-cleaned", 42, "alpaca", format_alpaca)
if args.merge == "alpaca":
merge_with_dataset("yahma/alpaca-cleaned", 42, "alpaca", format_alpaca, ["input", "output", "instruction"])
elif args.merge == "wizardlm70k":
merge_with_dataset("WizardLM/WizardLM_evol_instruct_70k", 42, "wizardlm70k", format_alpaca, ["output", "instruction"])
if __name__ == "__main__":
main()
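Putting the new flags together, a full run of this script would presumably look something like the line below. The script filename is not shown in this diff, so the name here is a guess; note also that --train now requires one of --small/--medium/--large, otherwise the size check raises.
# hypothetical invocation, script name assumed rather than taken from this diff
python3 generate_home_assistant_data.py --train --medium --test --merge wizardlm70k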

View File

@@ -1,3 +1,4 @@
# home-llm experiments (phi1.5)
rev1 - original test
- 1 epoch
- train ctx 1900
@@ -217,7 +218,8 @@ rev 9 - reduced dataset size
------
home-1b-rev1
# Home 1B
## home-1b-rev1
- 1 epoch
- 2048 train ctx
- batch size 8
@@ -231,4 +233,80 @@ home-1b-rev1
+ it works OK with low temperatures
+ doesn't seem to handle the alpaca dataset very well
home-1b-rev2
Eval results for existing models:
Home-1b-v1: 0.767816091954023
Home-3b-v2: 0.6908045977011494
## home-1b-rev5 series
- 1 epoch
- 2048 train ctx
- batch size 8
- learning rate 1e-5
- weight decay 0.1
- gradient clipping 1.0
- save model every 200 steps
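These settings correspond roughly to the train.py invocation documented elsewhere in this commit; a sketch is below. The --batch_size, --weight_decay and --gradient_clip flag names are assumed from the training_run_args fields referenced in train.py, and the train dataset path is left out because it does not appear in the diff.
python3 train.py \
  --run_name home-1b-rev5 \
  --base_model microsoft/phi-1_5 \
  --add_pad_token --add_chatml_tokens \
  --test_dataset data/home_assistant_test.json \
  --learning_rate 1e-5 --weight_decay 0.1 --gradient_clip 1.0 \
  --batch_size 8 --micro_batch_size 4 --gradient_checkpointing \
  --ctx_size 2048 --save_steps 200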
home-1b-rev5
- dataset size: medium
- evaluation results:
- 200: 0.553448275862069
- 400: 0.7482758620689656 (+.19)
- 600: 0.8103448275862069 (+.06)
- 800: 0.8316091954022988 (+.02)
- 1000: 0.8396551724137931 (+.008)
- 1200: 0.8488505747126437 (+.009)
- Final (1467): 0.8494252873563218 (+.0005)
home-1b-rev5_1
- dataset size: small
- evaluation results:
- 200: 0.6057471264367816
- 400: 0.7494252873563219 (+.143)
- 600: 0.7683908045977011 (+.018)
- 800: 0.7729885057471264 (+.0046)
- Final (869): bad
home-1b-rev5_2
- dataset size: large
- evaluation results:
- 200: --
- 400: --
- 600: 0.8425287356321839
- 800: 0.8666666666666667
- 1000: 0.8770114942528736
- 1200: 0.8844827586206897
- 1400: 0.8879310344827587
- 1600: 0.8844827586206897
- Final (1848): 0.8833333333333333
home-3b-v3-rev1
- dataset size: large
- evaluation results: 0.9091954022988505
home-3b-v3-rev2
- dataset size: large + alpaca
- evaluation results:
# Datasets
## SFT
Alpaca: https://huggingface.co/datasets/yahma/alpaca-cleaned
Alpaca (Translated): https://huggingface.co/datasets/saillab/taco-datasets
WizardLM 200k: https://huggingface.co/datasets/WizardLM/WizardLM_evol_instruct_V2_196k
WizardLM 70k: https://huggingface.co/datasets/WizardLM/WizardLM_evol_instruct_70k
Huggingface Ultrachat 200k: https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k
OpenOrca Slim Deduped (363k): https://huggingface.co/datasets/Open-Orca/SlimOrca-Dedup
## DPO
Intel Orca DPO Pairs: https://huggingface.co/datasets/Intel/orca_dpo_pairs
Huggingface Ultrachat: https://huggingface.co/datasets/HuggingFaceH4/ultrafeedback_binarized
----------------------------------------------------------------------------------------------------
python3 evaluate.py home-1b-rev5_2/checkpoint-600 --batch-size 12 && \
python3 evaluate.py home-1b-rev5_2/checkpoint-800 --batch-size 12 && \
python3 evaluate.py home-1b-rev5_2/checkpoint-1000 --batch-size 12 && \
python3 evaluate.py home-1b-rev5_2/checkpoint-1200 --batch-size 12 && \
python3 evaluate.py home-1b-rev5_2/checkpoint-1400 --batch-size 12 && \
python3 evaluate.py home-1b-rev5_2/checkpoint-1600 --batch-size 12 && \
python3 evaluate.py home-1b-rev5_2/checkpoint-1800 --batch-size 12 && \
python3 evaluate.py home-1b-rev5_2 --batch-size 12

View File

@@ -17,7 +17,7 @@ Phi Modules: fc1,fc2,q_proj,v_proj,k_proj,dense,embed_tokens,lm_head
"""
python3 train.py \
--run_name home-3b-v3-rev1 \
--run_name home-3b-v3-rev2 \
--base_model microsoft/phi-2 \
--add_pad_token \
--add_chatml_tokens \
@@ -33,7 +33,7 @@ python3 train.py \
"""
python3 train.py \
--run_name home-1b-rev4 \
--run_name home-1b-rev5 \
--base_model microsoft/phi-1_5 \
--add_pad_token \
--add_chatml_tokens \
@@ -42,7 +42,7 @@ python3 train.py \
--test_dataset data/home_assistant_test.json \
--learning_rate 1e-5 \
--micro_batch_size 4 --gradient_checkpointing \
--ctx_size 2048
--ctx_size 2048 --save_steps 200
"""
"""
@@ -73,6 +73,7 @@ class TrainingRunArguments:
resume_from_checkpoint: str = field(default="", metadata={"help": "The name of the checkpoint to resume training from"})
eval_steps: int = field(default=100, metadata={"help": "The number of steps in between evaluations of the model"})
save_steps: int = field(default=-1, metadata={"help": "The number of steps in between model checkpoints; set to -1 to save every epoch"})
save_total_limit: int = field(default=1, metadata={"help": "The number of recent checkpoints of the model to save (not including the final model)"})
group_by_length: bool = field(default=False, metadata={"help": "If enabled, the training data will be grouped by length to optimize use of padding"})
# Quantization
@@ -101,8 +102,6 @@ training_run_args, _ = parser.parse_args_into_dataclasses(return_remaining_strin
if sum([training_run_args.load_in_8bit, training_run_args.load_in_4bit, training_run_args.load_as_gptq]) > 1:
raise Exception("Please select exactly one of 'load_in_8bit', 'load_in_4bit', or 'load_as_gptq")
# TODO: write a proper evaluation script
print(f"Loading model '{training_run_args.base_model}'...")
model_kwargs = {}
@@ -139,7 +138,7 @@ model = AutoModelForCausalLM.from_pretrained(
max_memory=find_max_vram(),
**model_kwargs
)
tokenizer = AutoTokenizer.from_pretrained(training_run_args.base_model, trust_remote_code=True, use_fast=False)
tokenizer = AutoTokenizer.from_pretrained(training_run_args.base_model, trust_remote_code=True)
if training_run_args.add_pad_token:
tokenizer.add_special_tokens({'pad_token': '<|pad|>'})
@@ -196,8 +195,8 @@ training_args = TrainingArguments(
# per_device_eval_batch_size=1,
gradient_accumulation_steps=training_run_args.batch_size//training_run_args.micro_batch_size,
gradient_checkpointing=training_run_args.gradient_checkpointing,
# weight_decay=training_run_args.weight_decay,
# max_grad_norm=training_run_args.gradient_clip,
weight_decay=training_run_args.weight_decay,
max_grad_norm=training_run_args.gradient_clip,
evaluation_strategy="steps",
eval_steps=training_run_args.eval_steps,
save_strategy=("steps" if training_run_args.save_steps != -1 else "epoch"),
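Note that weight_decay and max_grad_norm were commented out before this commit and are only now forwarded to TrainingArguments, which presumably is what the "weight decay 0.1" and "gradient clipping 1.0" entries in the rev5 notes refer to. For the batch settings used in those runs, the accumulation works out as in this small illustration (values taken from the notes, not from code shown in this diff):
# effective gradient accumulation for the home-1b-rev5 runs
batch_size, micro_batch_size = 8, 4
gradient_accumulation_steps = batch_size // micro_batch_size  # 2 micro-batches accumulated per optimizer step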
@@ -206,7 +205,7 @@ training_args = TrainingArguments(
logging_steps=5,
output_dir=model_dir,
num_train_epochs=training_run_args.epochs,
save_total_limit=1,
save_total_limit=training_run_args.save_total_limit,
# dataloader_pin_memory=False,
report_to="tensorboard",
learning_rate=training_run_args.learning_rate,
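Since save_total_limit is now a CLI-tunable field rather than a hard-coded 1, keeping the intermediate checkpoints that the per-checkpoint evaluations in the notes rely on presumably means raising it; a hedged example (flag names follow the dataclass fields above, the value 10 is illustrative):
# keep the 200-step checkpoints around for evaluate.py instead of only the most recent one
python3 train.py ... --save_steps 200 --eval_steps 200 --save_total_limit 10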