fixes for training zephyr base

Alex O'Connell
2024-02-04 11:40:03 -05:00
parent cecf9bc53e
commit 7b01251f5d
2 changed files with 70 additions and 44 deletions

docs/perf.md (new file, 31 additions)

@@ -0,0 +1,31 @@
# Home 1B V2 GGUF Q4_K_M RPI5
christmas.txt
llama_print_timings: load time = 678.37 ms
llama_print_timings: sample time = 16.38 ms / 45 runs ( 0.36 ms per token, 2747.09 tokens per second)
llama_print_timings: prompt eval time = 31356.56 ms / 487 tokens ( 64.39 ms per token, 15.53 tokens per second)
llama_print_timings: eval time = 4868.37 ms / 44 runs ( 110.64 ms per token, 9.04 tokens per second)
llama_print_timings: total time = 36265.33 ms / 531 tokens
climate.txt
llama_print_timings: load time = 613.87 ms
llama_print_timings: sample time = 20.62 ms / 55 runs ( 0.37 ms per token, 2667.96 tokens per second)
llama_print_timings: prompt eval time = 27324.34 ms / 431 tokens ( 63.40 ms per token, 15.77 tokens per second)
llama_print_timings: eval time = 5780.72 ms / 54 runs ( 107.05 ms per token, 9.34 tokens per second)
llama_print_timings: total time = 33152.48 ms / 485 tokens
# Home 3B V2 GGUF Q4_K_M RPI5
climate.txt
llama_print_timings: load time = 1179.64 ms
llama_print_timings: sample time = 19.25 ms / 52 runs ( 0.37 ms per token, 2702.00 tokens per second)
llama_print_timings: prompt eval time = 52688.82 ms / 431 tokens ( 122.25 ms per token, 8.18 tokens per second)
llama_print_timings: eval time = 10206.12 ms / 51 runs ( 200.12 ms per token, 5.00 tokens per second)
llama_print_timings: total time = 62942.85 ms / 482 tokens
sonnet.txt
llama_print_timings: load time = 1076.44 ms
llama_print_timings: sample time = 1225.34 ms / 236 runs ( 5.19 ms per token, 192.60 tokens per second)
llama_print_timings: prompt eval time = 60754.40 ms / 490 tokens ( 123.99 ms per token, 8.07 tokens per second)
llama_print_timings: eval time = 44885.82 ms / 213 runs ( 210.73 ms per token, 4.75 tokens per second)
llama_print_timings: total time = 107127.16 ms / 703 tokens
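The figures above are llama.cpp `llama_print_timings` output for the quantized Home models running on a Raspberry Pi 5, fed the prompt files named before each block. For reference, one rough way to collect comparable timings from Python is via llama-cpp-python; this is not necessarily how these numbers were produced, and the model path below is a hypothetical placeholder.

```python
# Hedged reproduction sketch using llama-cpp-python; not necessarily how the
# figures above were collected. Model/prompt paths are placeholders.
from llama_cpp import Llama

llm = Llama(
    model_path="Home-1B-v2.Q4_K_M.gguf",  # hypothetical local GGUF path
    n_ctx=2048,
    n_threads=4,    # Raspberry Pi 5 has 4 cores
    verbose=True,   # llama.cpp prints llama_print_timings to stderr
)

with open("climate.txt") as f:  # one of the prompt files listed above
    prompt = f.read()

result = llm(prompt, max_tokens=64)
print(result["choices"][0]["text"])
```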

train.py (39 additions, 44 deletions)

@@ -12,7 +12,14 @@ from dataclasses import dataclass, field
from typing import Dict, Optional, Sequence, Sized, Iterator
"""
-Phi Modules: fc1,fc2,q_proj,v_proj,k_proj,dense,embed_tokens,lm_head
+Phi Modules:
+- MLP: fc1,fc2
+- MHA: q_proj,v_proj,k_proj,dense
+- Embeddings: embed_tokens (input) lm_head (output)
+StableLM Modules:
+- MLP: up_proj,down_proj,gate_proj
+- MHA: q_proj,v_proj,k_proj,o_proj
+- Embeddings: embed_tokens (input) lm_head (output)
"""
"""
@@ -22,7 +29,7 @@ python3 train.py \
--add_pad_token \
--add_chatml_tokens \
--bf16 \
---train_dataset data/home_assistant_train.json \
+--train_dataset data/home_assistant_train.jsonl \
--learning_rate 1e-5 \
--save_steps 1000 \
--micro_batch_size 2 --gradient_checkpointing \
@@ -37,8 +44,8 @@ python3 train.py \
--add_pad_token \
--add_chatml_tokens \
--bf16 \
---train_dataset data/home_assistant_train.json \
---test_dataset data/home_assistant_test.json \
+--train_dataset data/home_assistant_train.jsonl \
+--test_dataset data/home_assistant_test.jsonl \
--learning_rate 1e-5 \
--micro_batch_size 4 --gradient_checkpointing \
--ctx_size 2048 --save_steps 200
@@ -49,10 +56,9 @@ python3 train.py \
--run_name stablehome-1_6b-rev1 \
--base_model stabilityai/stablelm-2-zephyr-1_6b \
--bf16 \
---train_dataset data/home_assistant_train.json \
---test_dataset data/home_assistant_test.json \
+--train_dataset data/home_assistant_train.jsonl \
--learning_rate 1e-5 \
---micro_batch_size 4 --gradient_checkpointing \
+--micro_batch_size 2 --gradient_checkpointing \
--ctx_size 2048 --save_steps 200
"""
@@ -61,8 +67,8 @@ python3 train.py \
python3 train.py \
--run_name home-7b-rev2 \
--base_model TheBloke/Llama-2-7B-GPTQ \
---train_dataset data/home_assistant_train.json \
---test_dataset data/home_assistant_test.json \
+--train_dataset data/home_assistant_train.jsonl \
+--test_dataset data/home_assistant_test.jsonl \
--load_as_gptq --use_lora --gradient_checkpointing \
--add_pad_token --bf16 --micro_batch_size 4 --learning_rate 2e-5
"""
@@ -134,6 +140,7 @@ else:
# model_kwargs["resid_pdrop"] = 0.0
# model_kwargs["revision"] = "accfee56d8988cae60915486310362db5831b1bd"
model_kwargs["use_cache"] = False
def find_max_vram(min_buffer_mib=800):
total_mem = (torch.cuda.get_device_properties(0).total_memory / (1024 * 1024))
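find_max_vram() starts from the device's total memory in MiB and reserves a buffer (min_buffer_mib) before placing model weights. A minimal sketch of that idea is below, assuming the result feeds max_memory/device_map in from_pretrained; the exact cap logic and return format in train.py may differ, so the helper is renamed to make clear it is an assumption.

```python
# Minimal sketch of the idea behind find_max_vram(): reserve a buffer on the
# GPU and cap weight placement via max_memory. The cap logic and return
# format used by train.py itself are assumptions here.
import torch
from transformers import AutoModelForCausalLM

def find_max_vram_sketch(min_buffer_mib: int = 800) -> str:
    total_mib = torch.cuda.get_device_properties(0).total_memory / (1024 * 1024)
    return f"{int(total_mib - min_buffer_mib)}MiB"

model = AutoModelForCausalLM.from_pretrained(
    "stabilityai/stablelm-2-zephyr-1_6b",  # base model from the commands above
    device_map="auto",
    max_memory={0: find_max_vram_sketch(), "cpu": "32GiB"},
)
```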
@@ -168,7 +175,7 @@ if training_run_args.add_chatml_tokens:
model.config.eos_token_id = tokenizer.eos_token_id
if training_run_args.add_chatml_prompt_template:
-tokenizer.default_chat_template = (
+tokenizer.chat_template = (
"{% for message in messages %}"
"{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}"
"{% endfor %}"
@@ -253,16 +260,12 @@ class DataCollatorForSupervisedFineTuning(object):
prefix_ids: list[int]
suffix_ids: list[int]
-def __init__(self,
-*,
-tokenizer: AutoTokenizer,
-response_prefix: str = "<|im_start|>assistant",
-response_suffix: str = "<|im_end|>",
-):
+def __init__(self, *, tokenizer: AutoTokenizer):
self.tokenizer = tokenizer
-self.response_prefix = response_prefix
-self.response_suffix = response_suffix
+assistant_prompt = tokenizer.apply_chat_template(conversation=[{"role": "assistant", "content": r"%%%%%%%%%%%%%%%%"}], tokenize=False).split( r"%%%%%%%%%%%%%%%%")
+self.response_prefix = assistant_prompt[0]
+self.response_suffix = assistant_prompt[1]
self.prefix_ids = self.tokenizer(self.response_prefix, add_special_tokens=False)["input_ids"]
self.suffix_ids = self.tokenizer(self.response_suffix, add_special_tokens=False)["input_ids"]
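The rewritten collator no longer hard-codes the assistant prefix and suffix; it renders a single assistant turn containing a sentinel string through the active chat template and splits on the sentinel to recover whatever the template wraps around assistant content. A standalone sketch of that trick follows; the exact prefix/suffix strings depend on whichever chat template is set on the tokenizer.

```python
# Standalone sketch of the sentinel trick used in the collator above.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("stabilityai/stablelm-2-zephyr-1_6b")

sentinel = r"%%%%%%%%%%%%%%%%"
rendered = tokenizer.apply_chat_template(
    conversation=[{"role": "assistant", "content": sentinel}],
    tokenize=False,
)
# Everything before the sentinel is the assistant prefix, everything after
# is the suffix, regardless of which chat template is active.
response_prefix, response_suffix = rendered.split(sentinel)
print(repr(response_prefix))  # e.g. '<|im_start|>assistant\n' for a ChatML-style template
print(repr(response_suffix))  # e.g. '<|im_end|>\n'
```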
@@ -353,30 +356,35 @@ if training_run_args.test_dataset:
data_files["test"] = training_run_args.test_dataset
datasets = load_dataset("json", data_files=data_files)
-def tokenize_raw_example(example):
+def tokenize_raw_example(batch):
return tokenizer(
-text=example["text"],
+text=batch["text"],
max_length=training_run_args.ctx_size,
truncation=True,
add_special_tokens=False,
)
-def tokenize_sharegpt_example(example):
-conversation = [ { "role": x["from"], "content": x["value"] } for x in example["conversation"]]
-return tokenizer.apply_chat_template(
-conversation=conversation,
-max_length=training_run_args.ctx_size,
-truncation=True,
-)
+def tokenize_sharegpt_example(batch):
+# TODO: figure out how to properly batch this
+result = []
+for example in batch["conversations"]:
+conversation = [ { "role": x["from"], "content": x["value"] } for x in example ]
+result.append(tokenizer.apply_chat_template(
+conversation=conversation,
+max_length=training_run_args.ctx_size,
+truncation=True,
+))
+return {"input_ids": result}
print("Tokenizing datasets...")
if "text" in datasets["train"].column_names:
tokenize_function = tokenize_raw_example
columns_to_remove = ["text"]
elif "conversation" in datasets["train"].column_names:
elif "conversations" in datasets["train"].column_names:
tokenize_function = tokenize_sharegpt_example
columns_to_remove = ["conversation"]
columns_to_remove = ["conversations"]
else:
raise Exception("Unknown dataset input format (not raw corpus or sharegpt)")
@@ -484,18 +492,6 @@ class CustomTrainer(Trainer):
torch.save(trainable_params, saved_weights_file)
super()._save_optimizer_and_scheduler(output_dir)
-def compute_metrics(pred: EvalPrediction):
-inputs = pred.inputs
-label_ids = pred.label_ids
-logits = pred.predictions
-return {"accuracy": 1.0 }
-def preprocess_logits_for_metrics(logits, labels):
-"""https://discuss.huggingface.co/t/cuda-out-of-memory-when-using-trainer-with-compute-metrics/2941/22"""
-pred_ids = torch.argmax(logits, dim=-1)
-return pred_ids, labels
trainer = CustomTrainer(
model=model,
@@ -503,8 +499,6 @@ trainer = CustomTrainer(
train_dataset=tokenized_train_dataset,
eval_dataset=tokenized_test_dataset,
data_collator=data_collator,
-# compute_metrics=compute_metrics,
-# preprocess_logits_for_metrics=preprocess_logits_for_metrics,
)
tensorboard_process = None
@@ -523,7 +517,8 @@ try:
else:
trainer.train()
-# trainer.evaluate_all()
+if training_run_args.train_dataset:
+trainer.evaluate_all()
if training_run_args.use_lora and training_run_args.lora_merge:
trainer.save_model() # save lora