wizardlm merge + fix eval

Alex O'Connell
2024-01-25 20:46:59 -05:00
parent 57634519ca
commit e6fae06133
3 changed files with 110 additions and 20 deletions

View File

@@ -567,7 +567,7 @@ def generate_example_file(filename: str, seed: int, *, static_factor: int, templ
def format_alpaca(example):
question = example["instruction"]
if example["input"]:
if "input" in example and example["input"]:
question = question + "\n" + example["input"]
answer = example["output"]
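The new guard is needed because the two merge targets below have different schemas: yahma/alpaca-cleaned rows carry instruction/input/output, while WizardLM_evol_instruct_70k rows carry only instruction/output (compare the column lists passed to merge_with_dataset further down). A rough sketch of the two row shapes format_alpaca now has to accept, with made-up values:
# alpaca-cleaned style row: "input" exists but may be empty
{"instruction": "Summarize the passage.", "input": "Some passage ...", "output": "A short summary."}
# WizardLM_evol_instruct_70k style row: no "input" key at all, hence the `"input" in example` check
{"instruction": "Explain how quicksort works.", "output": "Quicksort partitions the list ..."}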
@@ -592,13 +592,13 @@ def format_alpaca(example):
return result
def merge_with_dataset(dataset_name, seed, outupt_name, format_function):
def merge_with_dataset(dataset_name, seed, outupt_name, format_function, dataset_column_names):
alpaca_dataset = load_dataset(dataset_name)["train"].train_test_split(test_size=0.1)
home_assistant_dataset = load_dataset("json", data_files={ "train": "home_assistant_train.json", "test": "home_assistant_test.json" })
random.seed(seed)
alpaca_dataset = alpaca_dataset.map(format_function).remove_columns(["input", "output", "instruction"])
alpaca_dataset = alpaca_dataset.map(format_function).remove_columns(dataset_column_names)
combined_dataset_train = concatenate_datasets([home_assistant_dataset["train"], alpaca_dataset["train"]]).shuffle(seed=42)
combined_dataset_test = concatenate_datasets([home_assistant_dataset["test"], alpaca_dataset["test"]]).shuffle(seed=42)
@@ -616,20 +616,33 @@ def main():
parser.add_argument("--sample", action="store_true", help="Set this flag to enable generation of the train dataset.")
parser.add_argument("--test", action="store_true", help="Set this flag to enable generation of the train dataset..")
parser.add_argument("--train", action="store_true", help="Set this flag to enable generation of the train dataset.")
parser.add_argument("--merge-alpaca", action="store_true", help="Set this flag to merge the generated datasets with the alpaca-cleaned dataset.")
parser.add_argument("--merge", help="Set this flag to merge the generated datasets with the specified dataset.")
train_size_group = parser.add_mutually_exclusive_group()
train_size_group.add_argument('--small', action='store_const', const='small', dest='size')
train_size_group.add_argument('--medium', action='store_const', const='medium', dest='size')
train_size_group.add_argument('--large', action='store_const', const='large', dest='size')
args = parser.parse_args()
if args.sample:
generate_example_file("sample", 42, static_factor=1, template_factor=1, status_request_factor=1)
if args.train:
# TODO: add small, medium, large cli flags
# generate_example_file("home_assistant_train", 42, static_factor=1, template_factor=10, status_request_factor=8)
generate_example_file("home_assistant_train", 42, static_factor=5, template_factor=15, status_request_factor=12)
# generate_example_file("home_assistant_train", 42, static_factor=5, template_factor=20, status_request_factor=15)
if args.size == "small":
generate_example_file("home_assistant_train", 42, static_factor=1, template_factor=10, status_request_factor=8)
elif args.size == "medium":
generate_example_file("home_assistant_train", 42, static_factor=5, template_factor=15, status_request_factor=12)
elif args.size == "large":
generate_example_file("home_assistant_train", 42, static_factor=5, template_factor=20, status_request_factor=15)
else:
raise Exception(f"Unrecognized dataset size: {args.size}")
if args.test:
generate_example_file("home_assistant_test", 12345, static_factor=0.25, template_factor=3, status_request_factor=2)
if args.merge_alpaca:
merge_with_dataset("yahma/alpaca-cleaned", 42, "alpaca", format_alpaca)
if args.merge == "alpaca":
merge_with_dataset("yahma/alpaca-cleaned", 42, "alpaca", format_alpaca, ["input", "output", "instruction"])
elif args.merge == "wizardlm70k":
merge_with_dataset("WizardLM/WizardLM_evol_instruct_70k", 42, "wizardlm70k", format_alpaca, ["output", "instruction"])
if __name__ == "__main__":
main()
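Putting the new flags together, a full run of this script would presumably look something like the line below. The script filename is not shown in this diff, so the name here is a guess; note also that --train now requires one of --small/--medium/--large, otherwise the size check raises.
# hypothetical invocation, script name assumed rather than taken from this diff
python3 generate_home_assistant_data.py --train --medium --test --merge wizardlm70k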

View File

@@ -1,3 +1,4 @@
# home-llm experiments (phi1.5)
rev1 - original test
- 1 epoch
- train ctx 1900
@@ -217,7 +218,8 @@ rev 9 - reduced dataset size
------
home-1b-rev1
# Home 1B
## home-1b-rev1
- 1 epoch
- 2048 train ctx
- batch size 8
@@ -231,4 +233,80 @@ home-1b-rev1
+ it works OK with low temperatures
+ doesn't seem to handle the alpaca dataset very well
home-1b-rev2
Eval results for existing models:
Home-1b-v1: 0.767816091954023
Home-3b-v2: 0.6908045977011494
## home-1b-rev5 series
- 1 epoch
- 2048 train ctx
- batch size 8
- learning rate 1e-5
- weight decay 0.1
- gradient clipping 1.0
- save model every 200 steps
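These settings correspond roughly to the train.py invocation documented elsewhere in this commit; a sketch is below. The --batch_size, --weight_decay and --gradient_clip flag names are assumed from the training_run_args fields referenced in train.py, and the train dataset path is left out because it does not appear in the diff.
python3 train.py \
  --run_name home-1b-rev5 \
  --base_model microsoft/phi-1_5 \
  --add_pad_token --add_chatml_tokens \
  --test_dataset data/home_assistant_test.json \
  --learning_rate 1e-5 --weight_decay 0.1 --gradient_clip 1.0 \
  --batch_size 8 --micro_batch_size 4 --gradient_checkpointing \
  --ctx_size 2048 --save_steps 200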
home-1b-rev5
- dataset size: medium
- evaluation results:
- 200: 0.553448275862069
- 400: 0.7482758620689656 (+.19)
- 600: 0.8103448275862069 (+.06)
- 800: 0.8316091954022988 (+.02)
- 1000: 0.8396551724137931 (+.008)
- 1200: 0.8488505747126437 (+.009)
- Final (1467): 0.8494252873563218 (+.0005)
home-1b-rev5_1
- dataset size: small
- evaluation results:
- 200: 0.6057471264367816
- 400: 0.7494252873563219 (+.143)
- 600: 0.7683908045977011 (+.018)
- 800: 0.7729885057471264 (+.0046)
- Final (869): bad
home-1b-rev5_2
- dataset size: large
- evaluation results:
- 200: --
- 400: --
- 600: 0.8425287356321839
- 800: 0.8666666666666667
- 1000: 0.8770114942528736
- 1200: 0.8844827586206897
- 1400: 0.8879310344827587
- 1600: 0.8844827586206897
- Final (1848): 0.8833333333333333
home-3b-v3-rev1
- dataset size: large
- evaluation results: 0.9091954022988505
home-3b-v3-rev2
- dataset size: large + alpaca
- evaluation results:
# Datasets
## SFT
Alpaca: https://huggingface.co/datasets/yahma/alpaca-cleaned
Alpaca (Translated): https://huggingface.co/datasets/saillab/taco-datasets
WizardLM 200k: https://huggingface.co/datasets/WizardLM/WizardLM_evol_instruct_V2_196k
WizardLM 70k: https://huggingface.co/datasets/WizardLM/WizardLM_evol_instruct_70k
Huggingface Ultrachat 200k: https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k
OpenOrca Slim Deduped (363k): https://huggingface.co/datasets/Open-Orca/SlimOrca-Dedup
## DPO
Intel Orca DPO Pairs: https://huggingface.co/datasets/Intel/orca_dpo_pairs
Huggingface Ultrachat: https://huggingface.co/datasets/HuggingFaceH4/ultrafeedback_binarized
----------------------------------------------------------------------------------------------------
python3 evaluate.py home-1b-rev5_2/checkpoint-600 --batch-size 12 && \
python3 evaluate.py home-1b-rev5_2/checkpoint-800 --batch-size 12 && \
python3 evaluate.py home-1b-rev5_2/checkpoint-1000 --batch-size 12 && \
python3 evaluate.py home-1b-rev5_2/checkpoint-1200 --batch-size 12 && \
python3 evaluate.py home-1b-rev5_2/checkpoint-1400 --batch-size 12 && \
python3 evaluate.py home-1b-rev5_2/checkpoint-1600 --batch-size 12 && \
python3 evaluate.py home-1b-rev5_2/checkpoint-1800 --batch-size 12 && \
python3 evaluate.py home-1b-rev5_2 --batch-size 12

View File

@@ -17,7 +17,7 @@ Phi Modules: fc1,fc2,q_proj,v_proj,k_proj,dense,embed_tokens,lm_head
"""
python3 train.py \
--run_name home-3b-v3-rev1 \
--run_name home-3b-v3-rev2 \
--base_model microsoft/phi-2 \
--add_pad_token \
--add_chatml_tokens \
@@ -33,7 +33,7 @@ python3 train.py \
"""
python3 train.py \
--run_name home-1b-rev4 \
--run_name home-1b-rev5 \
--base_model microsoft/phi-1_5 \
--add_pad_token \
--add_chatml_tokens \
@@ -42,7 +42,7 @@ python3 train.py \
--test_dataset data/home_assistant_test.json \
--learning_rate 1e-5 \
--micro_batch_size 4 --gradient_checkpointing \
--ctx_size 2048
--ctx_size 2048 --save_steps 200
"""
"""
@@ -73,6 +73,7 @@ class TrainingRunArguments:
resume_from_checkpoint: str = field(default="", metadata={"help": "The name of the checkpoint to resume training from"})
eval_steps: int = field(default=100, metadata={"help": "The number of steps in between evaluations of the model"})
save_steps: int = field(default=-1, metadata={"help": "The number of steps in between model checkpoints; set to -1 to save every epoch"})
save_total_limit: int = field(default=1, metadata={"help": "The number of recent checkpoints of the model to save (not including the final model)"})
group_by_length: bool = field(default=False, metadata={"help": "If enabled, the training data will be grouped by length to optimize use of padding"})
# Quantization
@@ -101,8 +102,6 @@ training_run_args, _ = parser.parse_args_into_dataclasses(return_remaining_strin
if sum([training_run_args.load_in_8bit, training_run_args.load_in_4bit, training_run_args.load_as_gptq]) > 1:
raise Exception("Please select exactly one of 'load_in_8bit', 'load_in_4bit', or 'load_as_gptq")
# TODO: write a proper evaluation script
print(f"Loading model '{training_run_args.base_model}'...")
model_kwargs = {}
@@ -139,7 +138,7 @@ model = AutoModelForCausalLM.from_pretrained(
max_memory=find_max_vram(),
**model_kwargs
)
tokenizer = AutoTokenizer.from_pretrained(training_run_args.base_model, trust_remote_code=True, use_fast=False)
tokenizer = AutoTokenizer.from_pretrained(training_run_args.base_model, trust_remote_code=True)
if training_run_args.add_pad_token:
tokenizer.add_special_tokens({'pad_token': '<|pad|>'})
@@ -196,8 +195,8 @@ training_args = TrainingArguments(
# per_device_eval_batch_size=1,
gradient_accumulation_steps=training_run_args.batch_size//training_run_args.micro_batch_size,
gradient_checkpointing=training_run_args.gradient_checkpointing,
# weight_decay=training_run_args.weight_decay,
# max_grad_norm=training_run_args.gradient_clip,
weight_decay=training_run_args.weight_decay,
max_grad_norm=training_run_args.gradient_clip,
evaluation_strategy="steps",
eval_steps=training_run_args.eval_steps,
save_strategy=("steps" if training_run_args.save_steps != -1 else "epoch"),
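Note that weight_decay and max_grad_norm were commented out before this commit and are only now forwarded to TrainingArguments, which presumably is what the "weight decay 0.1" and "gradient clipping 1.0" entries in the rev5 notes refer to. For the batch settings used in those runs, the accumulation works out as in this small illustration (values taken from the notes, not from code shown in this diff):
# effective gradient accumulation for the home-1b-rev5 runs
batch_size, micro_batch_size = 8, 4
gradient_accumulation_steps = batch_size // micro_batch_size  # 2 micro-batches accumulated per optimizer step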
@@ -206,7 +205,7 @@ training_args = TrainingArguments(
logging_steps=5,
output_dir=model_dir,
num_train_epochs=training_run_args.epochs,
save_total_limit=1,
save_total_limit=training_run_args.save_total_limit,
# dataloader_pin_memory=False,
report_to="tensorboard",
learning_rate=training_run_args.learning_rate,
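Since save_total_limit is now a CLI-tunable field rather than a hard-coded 1, keeping the intermediate checkpoints that the per-checkpoint evaluations in the notes rely on presumably means raising it; a hedged example (flag names follow the dataclass fields above, the value 10 is illustrative):
# keep the 200-step checkpoints around for evaluate.py instead of only the most recent one
python3 train.py ... --save_steps 200 --eval_steps 200 --save_total_limit 10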