experiment notes
@@ -101,3 +101,41 @@
- 3600: 0.9736842105263158
- 4000: 0.9706477732793523
- Final: 0.9711538461538461

# rev6
- lora rank: 64, alpha: 128
- batch size: 32
- dataset size: medium (with new device types)
+ evaluation results:
- 100: 0.7545546558704453
- 200: 0.8567813765182186
- 300: 0.8977732793522267
- 400: 0.9068825910931174
- 500: 0.9261133603238867
- 600: 0.9342105263157895
- 700: 0.9407894736842105
- 800: 0.9478744939271255
- 900: 0.937246963562753
- 1000: 0.9438259109311741
- Final: 0.9453441295546559

# rev7
- lora rank: 64, alpha: 128
- epochs: 2
- batch size: 128
- dataset size: large (with fixed service names)
+ evaluation results:
- 50: 0.6022267206477733
- 100: 0.8254048582995951
- 150: 0.8689271255060729
- 200: 0.9013157894736842
- 250: 0.9073886639676113
- 300: 0.9210526315789473
- 350: 0.937753036437247
- 400: 0.9362348178137652
- 450: 0.9478744939271255
- 500: 0.9463562753036437
- 550:
- 600: 0.9473684210526315
- 650: 0.9387651821862348
- Final: 0.9463562753036437
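Not part of the commit, but handy for comparing the two runs above: a minimal matplotlib sketch that plots the recorded checkpoint results for rev6 and rev7. The step/score pairs are copied (rounded) from the lists above; the notes do not name the metric, so the axis is just labelled "evaluation score", and the blank rev7 entry at step 550 is skipped.

# Sketch only: plot the checkpoint scores recorded in the notes above.
import matplotlib.pyplot as plt

rev6 = {100: 0.7546, 200: 0.8568, 300: 0.8978, 400: 0.9069, 500: 0.9261,
        600: 0.9342, 700: 0.9408, 800: 0.9479, 900: 0.9372, 1000: 0.9438}
rev7 = {50: 0.6022, 100: 0.8254, 150: 0.8689, 200: 0.9013, 250: 0.9074,
        300: 0.9211, 350: 0.9378, 400: 0.9362, 450: 0.9479, 500: 0.9464,
        600: 0.9474, 650: 0.9388}  # step 550 has no recorded value in the notes

for name, results in (("rev6", rev6), ("rev7", rev7)):
    steps = sorted(results)
    plt.plot(steps, [results[s] for s in steps], marker="o", label=name)

plt.xlabel("checkpoint step")
plt.ylabel("evaluation score")
plt.legend()
plt.savefig("checkpoint_scores.png")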
evaluate.py
@@ -10,19 +10,24 @@ from tqdm import tqdm
CTX_SIZE = 2048

"""
python3 evaluate.py stablehome-3b-rev1/checkpoint-400 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev1/checkpoint-800 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev1/checkpoint-1200 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev1/checkpoint-1600 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev1/checkpoint-2000 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev1/checkpoint-2400 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev1/checkpoint-2800 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev1/checkpoint-3200 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev1/checkpoint-3600 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev1/checkpoint-4000 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev1 --batch-size 4 --lora
python3 evaluate.py stablehome-3b-rev7/checkpoint-50 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev7/checkpoint-100 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev7/checkpoint-150 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev7/checkpoint-200 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev7/checkpoint-250 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev7/checkpoint-300 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev7/checkpoint-350 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev7/checkpoint-400 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev7/checkpoint-450 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev7/checkpoint-500 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev7/checkpoint-550 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev7/checkpoint-600 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev7/checkpoint-650 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev7 --batch-size 4 --lora
"""

# TODO: auto detect all the checkpoints to run

def tokenize(tokenizer, prompt):
    return tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=CTX_SIZE)
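Not part of the diff: the "# TODO: auto detect all the checkpoints to run" above could be handled with a small helper along these lines. It assumes the usual HuggingFace checkpoint-<step> directory layout inside a run directory; evaluate_checkpoint() is a hypothetical stand-in for whatever entry point evaluate.py exposes, since the script is currently driven by the shell one-liners in the docstring.

# Sketch: enumerate checkpoint-<step> folders so each can be evaluated in order.
import os
import re

def find_checkpoints(run_dir):
    """Return checkpoint subdirectories of run_dir, sorted by training step."""
    pattern = re.compile(r"^checkpoint-(\d+)$")
    found = []
    for name in os.listdir(run_dir):
        match = pattern.match(name)
        path = os.path.join(run_dir, name)
        if match and os.path.isdir(path):
            found.append((int(match.group(1)), path))
    return [path for _, path in sorted(found)]

# for ckpt in find_checkpoints("stablehome-3b-rev7"):
#     evaluate_checkpoint(ckpt)  # hypothetical: same work as `python3 evaluate.py <ckpt> --lora`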
train.py
@@ -67,15 +67,15 @@ python3 train.py \
"""
python3 train.py \
--run_name stablehome-3b-rev5 \
--run_name stablehome-3b-rev8 \
--base_model stabilityai/stablelm-zephyr-3b \
--bf16 \
--train_dataset data/home_assistant_train.jsonl \
--test_dataset data/home_assistant_test.jsonl \
--learning_rate 1e-5 \
--micro_batch_size 4 --gradient_checkpointing \
--learning_rate 1e-5 --batch_size 128 --epochs 2 \
--micro_batch_size 8 --gradient_checkpointing \
--ctx_size 2048 \
--save_steps 400 --save_total_limit 20 \
--save_steps 50 --save_total_limit 20 --eval_steps 100 --logging_steps 2 \
--use_lora --lora_rank 64 --lora_alpha 128 --lora_modules up_proj,down_proj,q_proj,v_proj,o_proj --lora_merge
"""
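For reference (not from the repo): the --use_lora --lora_rank 64 --lora_alpha 128 --lora_modules ... flags in the command above map roughly onto a PEFT adapter configuration like the sketch below. lora_dropout and task_type are assumptions here, since the flags shown do not specify them.

# Rough PEFT equivalent of the LoRA flags above (sketch, not the repo's actual code).
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=64,                       # --lora_rank 64
    lora_alpha=128,             # --lora_alpha 128
    target_modules=["up_proj", "down_proj", "q_proj", "v_proj", "o_proj"],  # --lora_modules
    lora_dropout=0.05,          # assumption: not set by the flags above
    task_type="CAUSAL_LM",      # assumption
)
# model = get_peft_model(base_model, lora_config)  # base_model: the loaded stablelm-zephyr-3b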
@@ -109,6 +109,7 @@ class TrainingRunArguments:
    eval_steps: int = field(default=200, metadata={"help": "The number of steps in between evaluations of the model; set to -1 to evaluate every epoch"})
    save_steps: int = field(default=-1, metadata={"help": "The number of steps in between model checkpoints; set to -1 to save every epoch"})
    save_total_limit: int = field(default=1, metadata={"help": "The number of recent checkpoints of the model to save (not including the final model)"})
    logging_steps: int = field(default=5, metadata={"help": "Sets the number of steps in between log output for the training run"})
    group_by_length: bool = field(default=False, metadata={"help": "If enabled, the training data will be grouped by length to optimize use of padding"})
    pre_allocate_cuda_buffers: bool = field(default=True, metadata={"help": "If enabled, runs a forward and backward pass on the model before training to force pytorch to allocate the correct size CUDA buffers up front"})
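Side note (an assumption, since the parsing code is not in this hunk): fields declared as field(default=..., metadata={"help": ...}) follow the pattern that transformers' HfArgumentParser turns into CLI flags, which is presumably how the new logging_steps field becomes the --logging_steps option used in the docstring. A minimal illustration of that pattern:

# Sketch of the dataclass-to-CLI pattern, not the repo's actual parsing code.
from dataclasses import dataclass, field
from transformers import HfArgumentParser

@dataclass
class ExampleArguments:
    logging_steps: int = field(default=5, metadata={"help": "Steps between log outputs"})

(args,) = HfArgumentParser(ExampleArguments).parse_args_into_dataclasses()
print(args.logging_steps)  # `python script.py --logging_steps 2` would print 2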
@@ -255,7 +256,7 @@ training_args = TrainingArguments(
    save_strategy=("steps" if training_run_args.save_steps != -1 else "epoch"),
    save_steps=(training_run_args.save_steps if training_run_args.save_steps != -1 else None),
    save_safetensors=True,
    logging_steps=5,
    logging_steps=training_run_args.logging_steps,
    output_dir=model_dir,
    num_train_epochs=training_run_args.epochs,
    save_total_limit=training_run_args.save_total_limit,
@@ -265,7 +266,7 @@ training_args = TrainingArguments(
    log_level="info",
    bf16=training_run_args.bf16,
    group_by_length=training_run_args.group_by_length,
    skip_memory_metrics=True,
    # skip_memory_metrics=True,
    **training_kwargs,
    # include_inputs_for_metrics=True,
)
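One arithmetic note on the rev8 command earlier in this diff: with --batch_size 128 and --micro_batch_size 8, the trainer presumably accumulates gradients over 128 / 8 = 16 micro-batches per optimizer step (assuming batch_size is the effective batch size, which the flag names suggest but this diff does not show).

# Sketch: effective batch size vs. gradient accumulation (assumption about how
# train.py maps --batch_size and --micro_batch_size onto the HF Trainer).
batch_size = 128        # --batch_size 128
micro_batch_size = 8    # --micro_batch_size 8
gradient_accumulation_steps = batch_size // micro_batch_size
assert gradient_accumulation_steps == 16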