mirror of https://github.com/acon96/home-llm.git
synced 2026-01-09 21:58:00 -05:00
fixes for training zephyr base
docs/perf.md (new file, 31 lines)
@@ -0,0 +1,31 @@
+# Home 1B V2 GGUF Q4_K_M RPI5
+
+christmas.txt
+llama_print_timings: load time = 678.37 ms
+llama_print_timings: sample time = 16.38 ms / 45 runs ( 0.36 ms per token, 2747.09 tokens per second)
+llama_print_timings: prompt eval time = 31356.56 ms / 487 tokens ( 64.39 ms per token, 15.53 tokens per second)
+llama_print_timings: eval time = 4868.37 ms / 44 runs ( 110.64 ms per token, 9.04 tokens per second)
+llama_print_timings: total time = 36265.33 ms / 531 tokens
+
+climate.txt
+llama_print_timings: load time = 613.87 ms
+llama_print_timings: sample time = 20.62 ms / 55 runs ( 0.37 ms per token, 2667.96 tokens per second)
+llama_print_timings: prompt eval time = 27324.34 ms / 431 tokens ( 63.40 ms per token, 15.77 tokens per second)
+llama_print_timings: eval time = 5780.72 ms / 54 runs ( 107.05 ms per token, 9.34 tokens per second)
+llama_print_timings: total time = 33152.48 ms / 485 tokens
+
+# Home 3B V2 GGUF Q4_K_M RPI5
+
+climate.txt
+llama_print_timings: load time = 1179.64 ms
+llama_print_timings: sample time = 19.25 ms / 52 runs ( 0.37 ms per token, 2702.00 tokens per second)
+llama_print_timings: prompt eval time = 52688.82 ms / 431 tokens ( 122.25 ms per token, 8.18 tokens per second)
+llama_print_timings: eval time = 10206.12 ms / 51 runs ( 200.12 ms per token, 5.00 tokens per second)
+llama_print_timings: total time = 62942.85 ms / 482 tokens
+
+sonnet.txt
+llama_print_timings: load time = 1076.44 ms
+llama_print_timings: sample time = 1225.34 ms / 236 runs ( 5.19 ms per token, 192.60 tokens per second)
+llama_print_timings: prompt eval time = 60754.40 ms / 490 tokens ( 123.99 ms per token, 8.07 tokens per second)
+llama_print_timings: eval time = 44885.82 ms / 213 runs ( 210.73 ms per token, 4.75 tokens per second)
+llama_print_timings: total time = 107127.16 ms / 703 tokens
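The figures above are llama.cpp's `llama_print_timings` output for the Home 1B/3B GGUF models on a Raspberry Pi 5. For context, a minimal sketch of producing a comparable run through the llama-cpp-python bindings, which print the same timing block when `verbose=True`; the model filename, context size, and sampling settings are assumptions, not taken from this commit:

```python
# Hedged sketch: produce a comparable llama_print_timings block via llama-cpp-python.
# Model path, n_ctx, and generation settings are illustrative placeholders.
from llama_cpp import Llama

llm = Llama(
    model_path="Home-1B-v2.q4_k_m.gguf",  # assumed local GGUF file
    n_ctx=2048,
    verbose=True,  # emits the llama_print_timings lines shown above
)

with open("christmas.txt") as f:
    prompt = f.read()

result = llm(prompt, max_tokens=64, temperature=0.1)
print(result["choices"][0]["text"])
```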
train.py (83 lines changed)
@@ -12,7 +12,14 @@ from dataclasses import dataclass, field
 from typing import Dict, Optional, Sequence, Sized, Iterator
 
 """
-Phi Modules: fc1,fc2,q_proj,v_proj,k_proj,dense,embed_tokens,lm_head
+Phi Modules:
+- MLP: fc1,fc2
+- MHA: q_proj,v_proj,k_proj,dense
+- Embeddings: embed_tokens (input) lm_head (output)
+StableLM Modules:
+- MLP: up_proj,down_proj,gate_proj
+- MHA: q_proj,v_proj,k_proj,o_proj
+- Embeddings: embed_tokens (input) lm_head (output)
 """
 
 """
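The module lists in the updated docstring are the layer names you would typically hand to LoRA as target modules when training with `--use_lora`. A minimal sketch assuming the `peft` library; the rank, alpha, and dropout values are illustrative and not taken from train.py:

```python
# Hedged sketch: using the StableLM module names from the docstring as LoRA targets.
# r / lora_alpha / lora_dropout are illustrative; older transformers releases may
# need trust_remote_code=True for StableLM checkpoints.
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("stabilityai/stablelm-2-zephyr-1_6b")
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "up_proj", "down_proj", "gate_proj"],
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # only the adapter weights are trainable
```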
@@ -22,7 +29,7 @@ python3 train.py \
     --add_pad_token \
     --add_chatml_tokens \
     --bf16 \
-    --train_dataset data/home_assistant_train.json \
+    --train_dataset data/home_assistant_train.jsonl \
     --learning_rate 1e-5 \
     --save_steps 1000 \
     --micro_batch_size 2 --gradient_checkpointing \
@@ -37,8 +44,8 @@ python3 train.py \
     --add_pad_token \
     --add_chatml_tokens \
     --bf16 \
-    --train_dataset data/home_assistant_train.json \
-    --test_dataset data/home_assistant_test.json \
+    --train_dataset data/home_assistant_train.jsonl \
+    --test_dataset data/home_assistant_test.jsonl \
     --learning_rate 1e-5 \
     --micro_batch_size 4 --gradient_checkpointing \
     --ctx_size 2048 --save_steps 200
@@ -49,10 +56,9 @@ python3 train.py \
     --run_name stablehome-1_6b-rev1 \
     --base_model stabilityai/stablelm-2-zephyr-1_6b \
     --bf16 \
-    --train_dataset data/home_assistant_train.json \
-    --test_dataset data/home_assistant_test.json \
+    --train_dataset data/home_assistant_train.jsonl \
     --learning_rate 1e-5 \
-    --micro_batch_size 4 --gradient_checkpointing \
+    --micro_batch_size 2 --gradient_checkpointing \
     --ctx_size 2048 --save_steps 200
 
 """
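The training examples now point at `.jsonl` (JSON Lines) files. Hugging Face `datasets` reads those with the same `json` builder used later in train.py, so a quick sanity check of the renamed file might look like this (a sketch, assuming the path from the flag above exists locally):

```python
# Hedged sketch: confirm the renamed .jsonl dataset loads with the HF "json" builder.
from datasets import load_dataset

ds = load_dataset("json", data_files={"train": "data/home_assistant_train.jsonl"})
print(ds["train"].column_names)  # expected to contain "text" or "conversations"
print(ds["train"][0])
```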
@@ -61,8 +67,8 @@ python3 train.py \
 python3 train.py \
     --run_name home-7b-rev2 \
     --base_model TheBloke/Llama-2-7B-GPTQ \
-    --train_dataset data/home_assistant_train.json \
-    --test_dataset data/home_assistant_test.json \
+    --train_dataset data/home_assistant_train.jsonl \
+    --test_dataset data/home_assistant_test.jsonl \
     --load_as_gptq --use_lora --gradient_checkpointing \
     --add_pad_token --bf16 --micro_batch_size 4 --learning_rate 2e-5
 """
@@ -134,6 +140,7 @@ else:
 
 # model_kwargs["resid_pdrop"] = 0.0
 # model_kwargs["revision"] = "accfee56d8988cae60915486310362db5831b1bd"
+model_kwargs["use_cache"] = False
 
 def find_max_vram(min_buffer_mib=800):
     total_mem = (torch.cuda.get_device_properties(0).total_memory / (1024 * 1024))
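Setting `use_cache` to `False` in `model_kwargs` avoids the conflict between the generation KV cache and gradient checkpointing during training (transformers otherwise warns and disables the cache itself). A minimal sketch of how that kwarg reaches the model at load time; the base model name is only an example:

```python
# Hedged sketch: use_cache=False passed at load time, as the diff does via model_kwargs.
# (Older transformers releases may need trust_remote_code=True for some checkpoints.)
from transformers import AutoModelForCausalLM

model_kwargs = {"use_cache": False}  # KV cache is incompatible with gradient checkpointing
model = AutoModelForCausalLM.from_pretrained(
    "stabilityai/stablelm-2-zephyr-1_6b",  # illustrative base model
    **model_kwargs,
)
model.gradient_checkpointing_enable()
```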
@@ -168,7 +175,7 @@ if training_run_args.add_chatml_tokens:
     model.config.eos_token_id = tokenizer.eos_token_id
 
 if training_run_args.add_chatml_prompt_template:
-    tokenizer.default_chat_template = (
+    tokenizer.chat_template = (
         "{% for message in messages %}"
         "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}"
         "{% endfor %}"
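The switch from `default_chat_template` to `chat_template` matters because `apply_chat_template` renders `tokenizer.chat_template`; the deprecated `default_chat_template` is only a fallback and does not override a template the tokenizer already ships with. A short sketch of what the ChatML template above produces (the messages are illustrative):

```python
# Hedged sketch: rendering the ChatML template assigned above.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("stabilityai/stablelm-2-zephyr-1_6b")
tokenizer.chat_template = (
    "{% for message in messages %}"
    "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}"
    "{% endfor %}"
)
messages = [
    {"role": "system", "content": "You are a home assistant."},
    {"role": "user", "content": "Turn off the kitchen light."},
]
print(tokenizer.apply_chat_template(messages, tokenize=False))
# <|im_start|>system
# You are a home assistant.<|im_end|>
# <|im_start|>user
# Turn off the kitchen light.<|im_end|>
```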
@@ -253,16 +260,12 @@ class DataCollatorForSupervisedFineTuning(object):
     prefix_ids: list[int]
     suffix_ids: list[int]
 
-    def __init__(self,
-                 *,
-                 tokenizer: AutoTokenizer,
-                 response_prefix: str = "<|im_start|>assistant",
-                 response_suffix: str = "<|im_end|>",
-                 ):
+    def __init__(self, *, tokenizer: AutoTokenizer):
 
         self.tokenizer = tokenizer
-        self.response_prefix = response_prefix
-        self.response_suffix = response_suffix
+        assistant_prompt = tokenizer.apply_chat_template(conversation=[{"role": "assistant", "content": r"%%%%%%%%%%%%%%%%"}], tokenize=False).split( r"%%%%%%%%%%%%%%%%")
+        self.response_prefix = assistant_prompt[0]
+        self.response_suffix = assistant_prompt[1]
 
         self.prefix_ids = self.tokenizer(self.response_prefix, add_special_tokens=False)["input_ids"]
         self.suffix_ids = self.tokenizer(self.response_suffix, add_special_tokens=False)["input_ids"]
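Instead of hard-coding the ChatML markers, the collator now recovers the assistant prefix and suffix from whatever chat template the tokenizer carries: it renders a sentinel assistant message and splits the rendered string around the sentinel. The same trick in isolation (the model name is illustrative):

```python
# Hedged sketch: derive the assistant response prefix/suffix from a chat template
# by rendering a sentinel message and splitting around it, as the new __init__ does.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("stabilityai/stablelm-2-zephyr-1_6b")
sentinel = "%%%%%%%%%%%%%%%%"
rendered = tokenizer.apply_chat_template(
    conversation=[{"role": "assistant", "content": sentinel}],
    tokenize=False,
)
response_prefix, response_suffix = rendered.split(sentinel)
print(repr(response_prefix))   # e.g. '<|im_start|>assistant\n' for a ChatML template
print(repr(response_suffix))   # e.g. '<|im_end|>\n'
```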
@@ -353,30 +356,35 @@ if training_run_args.test_dataset:
     data_files["test"] = training_run_args.test_dataset
 datasets = load_dataset("json", data_files=data_files)
 
-def tokenize_raw_example(example):
+def tokenize_raw_example(batch):
     return tokenizer(
-        text=example["text"],
+        text=batch["text"],
         max_length=training_run_args.ctx_size,
         truncation=True,
         add_special_tokens=False,
     )
 
-def tokenize_sharegpt_example(example):
-    conversation = [ { "role": x["from"], "content": x["value"] } for x in example["conversation"]]
-    return tokenizer.apply_chat_template(
-        conversation=conversation,
-        max_length=training_run_args.ctx_size,
-        truncation=True,
-    )
+def tokenize_sharegpt_example(batch):
+    # TODO: figure out how to properly batch this
+    result = []
+    for example in batch["conversations"]:
+        conversation = [ { "role": x["from"], "content": x["value"] } for x in example ]
+        result.append(tokenizer.apply_chat_template(
+            conversation=conversation,
+            max_length=training_run_args.ctx_size,
+            truncation=True,
+        ))
+
+    return {"input_ids": result}
 
 print("Tokenizing datasets...")
 
 if "text" in datasets["train"].column_names:
     tokenize_function = tokenize_raw_example
     columns_to_remove = ["text"]
-elif "conversation" in datasets["train"].column_names:
+elif "conversations" in datasets["train"].column_names:
     tokenize_function = tokenize_sharegpt_example
-    columns_to_remove = ["conversation"]
+    columns_to_remove = ["conversations"]
 else:
     raise Exception("Unknown dataset input format (not raw corpus or sharegpt)")
 
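`tokenize_sharegpt_example` now expects a batch, i.e. a dict of lists, which is what `datasets.map` passes when `batched=True`; hence the loop over `batch["conversations"]` and the `{"input_ids": result}` return shape. The mapping call itself sits elsewhere in train.py and is not part of this hunk, but it would typically take this form (a sketch continuing from the names in the hunk above):

```python
# Hedged sketch: applying the batched tokenize function selected above.
# `datasets`, `tokenize_function`, and `columns_to_remove` come from the hunk;
# batched=True is what makes batch["conversations"] a list of conversations.
tokenized_train_dataset = datasets["train"].map(
    tokenize_function,
    batched=True,
    remove_columns=columns_to_remove,
)
print(tokenized_train_dataset[0]["input_ids"][:16])
```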
@@ -484,18 +492,6 @@ class CustomTrainer(Trainer):
         torch.save(trainable_params, saved_weights_file)
 
         super()._save_optimizer_and_scheduler(output_dir)
 
-def compute_metrics(pred: EvalPrediction):
-    inputs = pred.inputs
-    label_ids = pred.label_ids
-    logits = pred.predictions
-
-    return {"accuracy": 1.0 }
-
-def preprocess_logits_for_metrics(logits, labels):
-    """https://discuss.huggingface.co/t/cuda-out-of-memory-when-using-trainer-with-compute-metrics/2941/22"""
-    pred_ids = torch.argmax(logits, dim=-1)
-    return pred_ids, labels
-
 trainer = CustomTrainer(
     model=model,
@@ -503,8 +499,6 @@ trainer = CustomTrainer(
     train_dataset=tokenized_train_dataset,
     eval_dataset=tokenized_test_dataset,
     data_collator=data_collator,
-    # compute_metrics=compute_metrics,
-    # preprocess_logits_for_metrics=preprocess_logits_for_metrics,
 )
 
 tensorboard_process = None
@@ -523,7 +517,8 @@ try:
     else:
         trainer.train()
 
-    # trainer.evaluate_all()
+    if training_run_args.train_dataset:
+        trainer.evaluate_all()
 
     if training_run_args.use_lora and training_run_args.lora_merge:
         trainer.save_model() # save lora