mirror of https://github.com/acon96/home-llm.git

save rev 8.9
@@ -345,14 +345,18 @@ def format_example(example):
     sys_prompt = "You are 'Al', a helpful AI Assistant that controls the devices in a house. Complete the following task as instructed with the information provided only."
     services_block = "Services: " + ", ".join(sorted(example["available_services"]))
     states_block = "Devices:\n" + "\n".join(example["states"])
-    answers = "Response: " + " ".join(example["answers"])
-    question = "Request: " + example["question"]
+    question = "Request:\n" + example["question"]
+    answers = "Response:\n" + " ".join(example["answers"])
 
+    example_lines = [sys_prompt, services_block, states_block, question, answers]
     if len(example["service_calls"]) > 0:
-        code_block = "```homeassistant\n" + "\n".join(example["service_calls"]) + "\n```done"
-    else:
-        code_block = ""
-    result = "\n".join([sys_prompt, services_block, states_block, question, answers, code_block])
+        code_block = "```homeassistant\n" + "\n".join(example["service_calls"]) + "\n```"
+        # code_block = "```homeassistant " + "\n```homeassistant ".join(example["service_calls"])
+        example_lines.append(code_block)
+
+    # code_block = "Actions:\n```homeassistant\n" + "\n".join(example["service_calls"]) + "\n```done"
+
+    result = "\n".join(example_lines) + "\n"
     if "<device_name" in result:
         print("bad templating")
     return result
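For reference, a quick sketch of what the reworked example layout produces. The record below is made up (the field names follow the hunk above; the values are not from the repo's dataset), and the function body simply mirrors the new version of format_example:

def format_example(example):
    sys_prompt = "You are 'Al', a helpful AI Assistant that controls the devices in a house. Complete the following task as instructed with the information provided only."
    services_block = "Services: " + ", ".join(sorted(example["available_services"]))
    states_block = "Devices:\n" + "\n".join(example["states"])
    question = "Request:\n" + example["question"]
    answers = "Response:\n" + " ".join(example["answers"])

    example_lines = [sys_prompt, services_block, states_block, question, answers]
    if len(example["service_calls"]) > 0:
        # the ```homeassistant block is only emitted when there are service calls
        code_block = "```homeassistant\n" + "\n".join(example["service_calls"]) + "\n```"
        example_lines.append(code_block)
    return "\n".join(example_lines) + "\n"

# made-up record with the fields the function reads
example = {
    "available_services": ["light.turn_off", "light.turn_on"],
    "states": ["light.kitchen = off"],
    "question": "turn on the kitchen light",
    "answers": ["Turning on the kitchen light."],
    "service_calls": ["light.turn_on(light.kitchen)"],
}
print(format_example(example))
# You are 'Al', a helpful AI Assistant that controls the devices in a house. Complete the following task as instructed with the information provided only.
# Services: light.turn_off, light.turn_on
# Devices:
# light.kitchen = off
# Request:
# turn on the kitchen light
# Response:
# Turning on the kitchen light.
# ```homeassistant
# light.turn_on(light.kitchen)
# ```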
2 file diffs suppressed because they are too large
@@ -150,4 +150,58 @@ rev 8.3 - further reduced training rate
- 1 epoch
- train ctx 512
- batch size 8
- learning rate 8e-6
+ certainly not overfit like < rev7
+ has some creativity with how it responds
+ will often get the device name wrong on the first try

rev 8.4 - re-ordered prompt
- 1 epoch
- train ctx 512
- batch size 8
- learning rate 8e-6
- put actions before the response and also made actions its own "block"
+ it *works* but is incredibly open ended
+ basically never stops generating text

rev 8.5 - tweaked prompt format again
- 1 epoch
- train ctx 512
- batch size 8
- learning rate 8e-6
- re-ordered response before actions again but made actions less like a "block" so it might stop generation
+ that worked rather badly

rev 8.6 - make prompt look more like other examples it has seen before
- 1 epoch
- train ctx 512
- batch size 8
- learning rate 8e-6
- change ```done to just ``` and add 3 newlines at the end (it keeps doing that for other prompts before stopping)
+ it wants to generate the other prompt types much more with this config
+ only gets the correct response about 50% of the time
+ it totally stops correctly when it DOES work

rev 8.7 - try to fit a bit more; the last iteration jumps around on which format it chooses
- 1 epoch
- train ctx 512
- batch size 8
- learning rate 1e-5
+ similar issues as the last model
+ altering the format (with newlines) makes it pick our format more often
+ comparing to 8.6 with the modified format shows this one is better at getting device names right

rev 8.8 - train with newlines instead of spaces in request/response
- 1 epoch
- train ctx 512
- batch size 8
- learning rate 1e-5
+ definitely worse than the previous one
+ for some reason both 8.7 and 8.8 are horrible when using their actual template, but if you deviate slightly it works a lot better at inference

rev 8.9 - actually fix pad token
- 1 epoch
- train ctx 512
- batch size 8
- learning rate 1e-5
+ properly generates a response (+ terminates) when using the actual template
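A rough way (not part of the repo) to check the rev 8.9 note that the model now terminates when prompted with its actual template. The checkpoint path and the device/service names below are assumptions, built from OUTPUT_DIR in train.py and the prompt layout shown above:

from transformers import AutoModelForCausalLM, AutoTokenizer

model_dir = "./models/home-llm-rev8.9"   # assumed: OUTPUT_DIR from train.py below
model = AutoModelForCausalLM.from_pretrained(model_dir, trust_remote_code=True).to("cuda")
tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)

# prompt assembled in the training layout; the devices/services are made up
prompt = (
    "You are 'Al', a helpful AI Assistant that controls the devices in a house. "
    "Complete the following task as instructed with the information provided only.\n"
    "Services: light.turn_off, light.turn_on\n"
    "Devices:\nlight.kitchen = off\n"
    "Request:\nturn on the kitchen light\n"
    "Response:\n"
)
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens=128, eos_token_id=tokenizer.eos_token_id)
# a "fixed" checkpoint should answer, emit the ```homeassistant block, then stop
# instead of rambling into new Request:/Response: blocks
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))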
train.py (29 lines changed)
@@ -1,9 +1,8 @@
 import torch
 import torch.nn.functional
-from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, TrainingArguments, Trainer, DataCollatorForLanguageModeling
+from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling
 from datasets import load_dataset
 import evaluate
 import numpy as np
 from dataclasses import dataclass
 
 torch.set_default_device("cuda")
 torch.set_default_tensor_type('torch.cuda.FloatTensor')
@@ -13,22 +12,22 @@ TRAIN_CTX_SIZE = 512 # The number of tokens to pad + truncate the input examples
 BATCH_SIZE = 8 # The simulated "batch size" that we will train on; will tweak gradient accumulation steps
 MICRO_BATCH_SIZE = 2 # The actual batch size that will fit into VRAM on this machine
 TRAINING_EPOCHS = 1 # The number of times to train the model on each example
-LEARNING_RATE_START = 8e-6 # The starting learning rate (speed at which the model trains)
+LEARNING_RATE_START = 1e-5 # The starting learning rate (speed at which the model trains)
 LEARNING_RATE_SCHEDULE = "cosine" # How fast the learning rate is reduced during training
-RUN_NAME = "home-llm-rev8.3"
+RUN_NAME = "home-llm-rev8.9"
 OUTPUT_DIR = f"./models/{RUN_NAME}"
 
 # TODO: write a proper evaluation script
 
 model = AutoModelForCausalLM.from_pretrained("microsoft/phi-1_5", trust_remote_code=True).to(dtype=torch.bfloat16, device="cuda")
 tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5", trust_remote_code=True)
-tokenizer.pad_token = tokenizer.eos_token
-pad_token_id = tokenizer(tokenizer.pad_token)["input_ids"][0]
-# tokenizer.add_special_tokens({'pad_token': '<|pad|>'})
+# tokenizer.pad_token = tokenizer.eos_token
+# pad_token_id = tokenizer(tokenizer.pad_token)["input_ids"][0]
+tokenizer.add_special_tokens({'pad_token': '<|pad|>'})
 # model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=8)
 
 def tokenize_function(example):
-    result = tokenizer(example['text'],
+    result = tokenizer(example['text'] + tokenizer.eos_token,
                        return_attention_mask=False,
                        padding=True, max_length=TRAIN_CTX_SIZE, truncation=True)
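Why the pad-token change matters, as a minimal sketch (my illustration, not code from the repo; it uses the stock DataCollatorForLanguageModeling rather than the repo's NoAttentionMaskDataCollator subclass):

# DataCollatorForLanguageModeling(mlm=False) masks every pad_token_id in the
# labels to -100, so when pad_token == eos_token the trailing eos appended in
# tokenize_function is excluded from the loss and the model never learns to stop.
from transformers import AutoTokenizer, DataCollatorForLanguageModeling

tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5", trust_remote_code=True)

# old behaviour: pad with eos -> the eos label gets masked out
tokenizer.pad_token = tokenizer.eos_token
collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
batch = collator([tokenizer("turn on the kitchen light" + tokenizer.eos_token)])
print(batch["labels"][0][-1])   # should print tensor(-100): eos is ignored

# new behaviour: dedicated pad token -> the eos stays a real training target
tokenizer.add_special_tokens({'pad_token': '<|pad|>'})
collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
batch = collator([tokenizer("turn on the kitchen light" + tokenizer.eos_token)])
print(batch["labels"][0][-1])   # should print the eos token id

This also lines up with the rev 8.9 note above: with a dedicated '<|pad|>' token, the eos appended to each example survives into the labels, so the model actually learns to terminate.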
@@ -70,11 +69,23 @@ class NoAttentionMaskDataCollator(DataCollatorForLanguageModeling):
 
 data_collator = NoAttentionMaskDataCollator(tokenizer, mlm=False)
 
+# TODO: ignore user input when training
+# @dataclass
+# class CustomDataCollator:
+#     tokenizer: AutoTokenizer
+#     train_ctx_size: int
+#     def __call__(self, features, **kwargs):
+#         for feature in features:
+
+# data_collator = CustomDataCollator(tokenizer=tokenizer)
+
 trainer = Trainer(
     model=model,
     args=training_args,
     train_dataset=tokenized_train_dataset,
     eval_dataset=tokenized_test_dataset,
     # train_dataset=datasets["train"],
     # eval_dataset=datasets["test"],
     data_collator=data_collator,
 )
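The commented-out CustomDataCollator above sketches the TODO of ignoring user input during training. One way it could be completed (a sketch under my own assumptions, not the repo's implementation; the class name and the "Response:\n" boundary are hypothetical):

from dataclasses import dataclass
from transformers import AutoTokenizer

@dataclass
class PromptMaskingDataCollator:
    tokenizer: AutoTokenizer
    train_ctx_size: int

    def __call__(self, features, **kwargs):
        # pad everything to a fixed length and start from plain next-token labels
        batch = self.tokenizer.pad(features, padding="max_length",
                                   max_length=self.train_ctx_size,
                                   return_tensors="pt")
        batch.pop("attention_mask", None)   # mirror the NoAttentionMaskDataCollator intent
        labels = batch["input_ids"].clone()
        labels[labels == self.tokenizer.pad_token_id] = -100

        # assumed boundary: only train on tokens from "Response:\n" onward;
        # matching token ids like this is approximate because BPE merges can
        # differ slightly depending on the surrounding text
        boundary = self.tokenizer("Response:\n", add_special_tokens=False)["input_ids"]
        for i, ids in enumerate(batch["input_ids"].tolist()):
            for start in range(len(ids) - len(boundary) + 1):
                if ids[start:start + len(boundary)] == boundary:
                    labels[i, :start + len(boundary)] = -100
                    break
        batch["labels"] = labels
        return batch

# data_collator = PromptMaskingDataCollator(tokenizer=tokenizer, train_ctx_size=TRAIN_CTX_SIZE)

The trade-off is the usual one: masking the prompt keeps the loss focused on the response and the service calls, at the cost of fewer supervised tokens per example.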