experiment notes

Alex O'Connell
2024-03-20 23:05:22 -04:00
parent fa31682c51
commit c67759e16f
3 changed files with 61 additions and 17 deletions

View File

@@ -101,3 +101,41 @@
- 3600: 0.9736842105263158
- 4000: 0.9706477732793523
- Final: 0.9711538461538461
# rev6
- lora rank: 64, alpha: 128
- batch size: 32
- dataset size: medium (with new device types)
+ evaluation results:
- 100: 0.7545546558704453
- 200: 0.8567813765182186
- 300: 0.8977732793522267
- 400: 0.9068825910931174
- 500: 0.9261133603238867
- 600: 0.9342105263157895
- 700: 0.9407894736842105
- 800: 0.9478744939271255
- 900: 0.937246963562753
- 1000: 0.9438259109311741
- Final: 0.9453441295546559
# rev7
- lora rank: 64, alpha: 128
- epochs: 2
- batch size: 128
- dataset size: large (with fixed service names)
+ evaluation results:
- 50: 0.6022267206477733
- 100: 0.8254048582995951
- 150: 0.8689271255060729
- 200: 0.9013157894736842
- 250: 0.9073886639676113
- 300: 0.9210526315789473
- 350: 0.937753036437247
- 400: 0.9362348178137652
- 450: 0.9478744939271255
- 500: 0.9463562753036437
- 550:
- 600: 0.9473684210526315
- 650: 0.9387651821862348
- Final: 0.9463562753036437
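
For reference, a minimal sketch of what the rev6/rev7 LoRA settings above would look like as a standard PEFT `LoraConfig`, assuming the run targets the same modules as the `--lora_modules` flag in the train.py command; the dropout value is an assumption and is not recorded in these notes:

```python
# Hypothetical illustration of the rev6/rev7 adapter settings as a peft LoraConfig.
from peft import LoraConfig

lora_config = LoraConfig(
    r=64,                 # lora rank from the notes above
    lora_alpha=128,       # alpha from the notes above
    lora_dropout=0.05,    # assumed value; not recorded in the notes
    target_modules=["up_proj", "down_proj", "q_proj", "v_proj", "o_proj"],
    task_type="CAUSAL_LM",
)
```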

View File

@@ -10,19 +10,24 @@ from tqdm import tqdm
CTX_SIZE = 2048
"""
python3 evaluate.py stablehome-3b-rev1/checkpoint-400 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev1/checkpoint-800 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev1/checkpoint-1200 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev1/checkpoint-1600 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev1/checkpoint-2000 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev1/checkpoint-2400 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev1/checkpoint-2800 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev1/checkpoint-3200 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev1/checkpoint-3600 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev1/checkpoint-4000 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev1 --batch-size 4 --lora
python3 evaluate.py stablehome-3b-rev7/checkpoint-50 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev7/checkpoint-100 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev7/checkpoint-150 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev7/checkpoint-200 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev7/checkpoint-250 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev7/checkpoint-300 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev7/checkpoint-350 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev7/checkpoint-400 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev7/checkpoint-450 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev7/checkpoint-500 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev7/checkpoint-550 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev7/checkpoint-600 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev7/checkpoint-650 --batch-size 4 --lora && \
python3 evaluate.py stablehome-3b-rev7 --batch-size 4 --lora
"""
# TODO: auto-detect all the checkpoints to run
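# One possible way to address the TODO above (a sketch only, not wired into the CLI here):
# discover every checkpoint-<step> directory under a run and return them in step order,
# assuming the standard HF Trainer layout of <run_dir>/checkpoint-<step>.
from pathlib import Path

def find_checkpoints(run_dir: str) -> list[str]:
    ckpts = [p for p in Path(run_dir).glob("checkpoint-*") if p.is_dir()]
    ckpts.sort(key=lambda p: int(p.name.rsplit("-", 1)[-1]))
    # evaluate the final (merged) model last, matching the command sequence above
    return [str(p) for p in ckpts] + [run_dir]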
def tokenize(tokenizer, prompt):
return tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=CTX_SIZE)

View File

@@ -67,15 +67,15 @@ python3 train.py \
"""
python3 train.py \
--run_name stablehome-3b-rev5 \
--run_name stablehome-3b-rev8 \
--base_model stabilityai/stablelm-zephyr-3b \
--bf16 \
--train_dataset data/home_assistant_train.jsonl \
--test_dataset data/home_assistant_test.jsonl \
--learning_rate 1e-5 \
--micro_batch_size 4 --gradient_checkpointing \
--learning_rate 1e-5 --batch_size 128 --epochs 2 \
--micro_batch_size 8 --gradient_checkpointing \
--ctx_size 2048 \
--save_steps 400 --save_total_limit 20 \
--save_steps 50 --save_total_limit 20 --eval_steps 100 --logging_steps 2 \
--use_lora --lora_rank 64 --lora_alpha 128 --lora_modules up_proj,down_proj,q_proj,v_proj,o_proj --lora_merge
"""
@@ -109,6 +109,7 @@ class TrainingRunArguments:
eval_steps: int = field(default=200, metadata={"help": "The number of steps in between evaluations of the model; set to -1 to evaluate every epoch"})
save_steps: int = field(default=-1, metadata={"help": "The number of steps in between model checkpoints; set to -1 to save every epoch"})
save_total_limit: int = field(default=1, metadata={"help": "The number of recent checkpoints of the model to save (not including the final model)"})
logging_steps: int = field(default=5, metadata={"help": "The number of steps in between log outputs for the training run"})
group_by_length: bool = field(default=False, metadata={"help": "If enabled, the training data will be grouped by length to optimize use of padding"})
pre_allocate_cuda_buffers: bool = field(default=True, metadata={"help": "If enabled, runs a forward and backward pass on the model before training to force pytorch to allocate the correct size CUDA buffers up front"})
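# For context (a sketch of code assumed to live elsewhere in this file): HfArgumentParser turns
# each dataclass field above into a CLI flag, which is what makes the new --logging_steps 2
# option used in the docstring work, e.g.:
#   from transformers import HfArgumentParser
#   (training_run_args,) = HfArgumentParser(TrainingRunArguments).parse_args_into_dataclasses()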
@@ -255,7 +256,7 @@ training_args = TrainingArguments(
save_strategy=("steps" if training_run_args.save_steps != -1 else "epoch"),
save_steps=(training_run_args.save_steps if training_run_args.save_steps != -1 else None),
save_safetensors=True,
logging_steps=5,
logging_steps=training_run_args.logging_steps,
output_dir=model_dir,
num_train_epochs=training_run_args.epochs,
save_total_limit=training_run_args.save_total_limit,
@@ -265,7 +266,7 @@ training_args = TrainingArguments(
log_level="info",
bf16=training_run_args.bf16,
group_by_length=training_run_args.group_by_length,
skip_memory_metrics=True,
# skip_memory_metrics=True,
**training_kwargs,
# include_inputs_for_metrics=True,
)