experiment notes

Alex O'Connell
2024-03-20 23:05:22 -04:00
parent fa31682c51
commit c67759e16f
3 changed files with 61 additions and 17 deletions

View File

@@ -101,3 +101,41 @@
- 3600: 0.9736842105263158
- 4000: 0.9706477732793523
- Final: 0.9711538461538461
# rev6
- lora rank: 64, alpha: 128
- batch size: 32
- dataset size: medium (with new device types)
+ evaluation results:
- 100: 0.7545546558704453
- 200: 0.8567813765182186
- 300: 0.8977732793522267
- 400: 0.9068825910931174
- 500: 0.9261133603238867
- 600: 0.9342105263157895
- 700: 0.9407894736842105
- 800: 0.9478744939271255
- 900: 0.937246963562753
- 1000: 0.9438259109311741
- Final: 0.9453441295546559
# rev7
- lora rank: 64, alpha: 128
- epochs: 2
- batch size: 128
- dataset size: large (with fixed service names)
+ evaluation results:
- 50: 0.6022267206477733
- 100: 0.8254048582995951
- 150: 0.8689271255060729
- 200: 0.9013157894736842
- 250: 0.9073886639676113
- 300: 0.9210526315789473
- 350: 0.937753036437247
- 400: 0.9362348178137652
- 450: 0.9478744939271255
- 500: 0.9463562753036437
- 550:
- 600: 0.9473684210526315
- 650: 0.9387651821862348
- Final: 0.9463562753036437
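
For reference, a minimal sketch of what the rev6/rev7 LoRA settings above would look like as a standard PEFT `LoraConfig`, assuming the run targets the same modules as the `--lora_modules` flag in the train.py command; the dropout value is an assumption and is not recorded in these notes:

```python
# Hypothetical illustration of the rev6/rev7 adapter settings as a peft LoraConfig.
from peft import LoraConfig

lora_config = LoraConfig(
    r=64,                 # lora rank from the notes above
    lora_alpha=128,       # alpha from the notes above
    lora_dropout=0.05,    # assumed value; not recorded in the notes
    target_modules=["up_proj", "down_proj", "q_proj", "v_proj", "o_proj"],
    task_type="CAUSAL_LM",
)
```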

View File

@@ -10,19 +10,24 @@ from tqdm import tqdm
CTX_SIZE = 2048
"""
python3 evaluate.py stablehome-3b-rev1/checkpoint-400 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev1/checkpoint-800 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev1/checkpoint-1200 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev1/checkpoint-1600 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev1/checkpoint-2000 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev1/checkpoint-2400 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev1/checkpoint-2800 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev1/checkpoint-3200 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev1/checkpoint-3600 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev1/checkpoint-4000 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev1 --batch-size 4 --lora
python3 evaluate.py stablehome-3b-rev7/checkpoint-50 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev7/checkpoint-100 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev7/checkpoint-150 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev7/checkpoint-200 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev7/checkpoint-250 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev7/checkpoint-300 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev7/checkpoint-350 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev7/checkpoint-400 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev7/checkpoint-450 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev7/checkpoint-500 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev7/checkpoint-550 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev7/checkpoint-600 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev7/checkpoint-650 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev7 --batch-size 4 --lora
"""
# TODO: auto-detect all the checkpoints to run
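# One possible way to address the TODO above (a sketch only, not wired into the CLI here):
# discover every checkpoint-<step> directory under a run and return them in step order,
# assuming the standard HF Trainer layout of <run_dir>/checkpoint-<step>.
from pathlib import Path

def find_checkpoints(run_dir: str) -> list[str]:
    ckpts = [p for p in Path(run_dir).glob("checkpoint-*") if p.is_dir()]
    ckpts.sort(key=lambda p: int(p.name.rsplit("-", 1)[-1]))
    # evaluate the final (merged) model last, matching the command sequence above
    return [str(p) for p in ckpts] + [run_dir]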
def tokenize(tokenizer, prompt):
return tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=CTX_SIZE)

View File

@@ -67,15 +67,15 @@ python3 train.py \
"""
python3 train.py \
--run_name stablehome-3b-rev5 \
--run_name stablehome-3b-rev8 \
--base_model stabilityai/stablelm-zephyr-3b \
--bf16 \
--train_dataset data/home_assistant_train.jsonl \
--test_dataset data/home_assistant_test.jsonl \
--learning_rate 1e-5 \
--micro_batch_size 4 --gradient_checkpointing \
--learning_rate 1e-5 --batch_size 128 --epochs 2 \
--micro_batch_size 8 --gradient_checkpointing \
--ctx_size 2048 \
--save_steps 400 --save_total_limit 20 \
--save_steps 50 --save_total_limit 20 --eval_steps 100 --logging_steps 2 \
--use_lora --lora_rank 64 --lora_alpha 128 --lora_modules up_proj,down_proj,q_proj,v_proj,o_proj --lora_merge
"""
@@ -109,6 +109,7 @@ class TrainingRunArguments:
eval_steps: int = field(default=200, metadata={"help": "The number of steps in between evaluations of the model; set to -1 to evaluate every epoch"})
save_steps: int = field(default=-1, metadata={"help": "The number of steps in between model checkpoints; set to -1 to save every epoch"})
save_total_limit: int = field(default=1, metadata={"help": "The number of recent checkpoints of the model to save (not including the final model)"})
logging_steps: int = field(default=5, metadata={"help": "The number of steps in between log outputs for the training run"})
group_by_length: bool = field(default=False, metadata={"help": "If enabled, the training data will be grouped by length to optimize use of padding"})
pre_allocate_cuda_buffers: bool = field(default=True, metadata={"help": "If enabled, runs a forward and backward pass on the model before training to force pytorch to allocate the correct size CUDA buffers up front"})
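# For context (a sketch of code assumed to live elsewhere in this file): HfArgumentParser turns
# each dataclass field above into a CLI flag, which is what makes the new --logging_steps 2
# option used in the docstring work, e.g.:
#   from transformers import HfArgumentParser
#   (training_run_args,) = HfArgumentParser(TrainingRunArguments).parse_args_into_dataclasses()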
@@ -255,7 +256,7 @@ training_args = TrainingArguments(
save_strategy=("steps" if training_run_args.save_steps != -1 else "epoch"),
save_steps=(training_run_args.save_steps if training_run_args.save_steps != -1 else None),
save_safetensors=True,
logging_steps=5,
logging_steps=training_run_args.logging_steps,
output_dir=model_dir,
num_train_epochs=training_run_args.epochs,
save_total_limit=training_run_args.save_total_limit,
@@ -265,7 +266,7 @@ training_args = TrainingArguments(
log_level="info",
bf16=training_run_args.bf16,
group_by_length=training_run_args.group_by_length,
skip_memory_metrics=True,
# skip_memory_metrics=True,
**training_kwargs,
# include_inputs_for_metrics=True,
)