save state for rev 8.3

Alex O'Connell
2023-10-23 01:09:41 -04:00
parent 67a27c6a1b
commit 575efeac41
5 changed files with 6316 additions and 16479 deletions


@@ -352,7 +352,7 @@ def format_example(example):
    else:
        code_block = ""
-    result = "\n".join([sys_prompt, services_block, states_block, question, answers, code_block]) + "<endresponse>"
+    result = "\n".join([sys_prompt, services_block, states_block, question, answers, code_block])
    if "<device_name" in result:
        print("bad templating")
    return result
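The change above drops the literal "<endresponse>" suffix (the "fake end of sentence token" from the rev 7.3 notes below) from the joined prompt; end-of-text handling presumably falls to the tokenizer's real EOS token, which the training script maps the pad token to. The "<device_name" check is just a cheap guard against template slots that never got substituted. A minimal sketch of that assemble-and-validate idea, with placeholder field construction rather than the real generator's:

```python
def format_example(example: dict) -> str:
    # placeholder field assembly; the real script builds these blocks from its templates
    sys_prompt = example.get("sys_prompt", "")
    services_block = example.get("services", "")
    states_block = example.get("states", "")
    question = example.get("question", "")
    answers = example.get("answers", "")
    code_block = example.get("code", "")  # empty when the example has no service call

    result = "\n".join([sys_prompt, services_block, states_block, question, answers, code_block])

    # cheap sanity check: an unfilled slot like "<device_name>" means the templating
    # step never substituted a real device into this example
    if "<device_name" in result:
        print("bad templating")
    return result

# usage with a made-up example record
print(format_example({"question": "turn on the kitchen light",
                      "code": "light.turn_on(kitchen_light)"}))
```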
@@ -390,7 +390,7 @@ def generate_example_file(filename: str, seed: int, *, static_factor: int, templ
# TODO: make more randomized names for devices (random words or people's names)
# TODO: answer questions about more than one thing in the state list at once
def main():
-    generate_example_file("home_assistant_train", 42, static_factor=3, template_factor=30, status_request_factor=20)
+    generate_example_file("home_assistant_train", 42, static_factor=3, template_factor=20, status_request_factor=10)
    generate_example_file("home_assistant_test", 42, static_factor=1, template_factor=3, status_request_factor=2)
if __name__ == "__main__":
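The three keyword factors read like multipliers on how many examples of each kind land in the output file; the rev 8.1 "tweak example counts + ratios" note, and this commit halving template_factor and status_request_factor for the train split, point the same way. That reading is an assumption since the generator body isn't shown in this diff, but as a rough sketch with made-up base pool sizes:

```python
# hypothetical sketch of what the factors might control; base counts are made up
NUM_STATIC_EXAMPLES = 25
NUM_SERVICE_TEMPLATES = 40
NUM_STATUS_TEMPLATES = 40

def example_counts(*, static_factor: int, template_factor: int, status_request_factor: int) -> dict:
    """One plausible reading: each factor multiplies a base pool of examples."""
    return {
        "static": static_factor * NUM_STATIC_EXAMPLES,
        "service_requests": template_factor * NUM_SERVICE_TEMPLATES,
        "status_requests": status_request_factor * NUM_STATUS_TEMPLATES,
    }

# rev 8.3 train split vs. the much smaller test split
print(example_counts(static_factor=3, template_factor=20, status_request_factor=10))
print(example_counts(static_factor=1, template_factor=3, status_request_factor=2))
```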

File diff suppressed because it is too large.

File diff suppressed because it is too large.


@@ -103,7 +103,6 @@ rev 6.3 - higher batch
- batch size 12
- learning rate cosine 1e-4
rev 7 - tweak dataset again
- 2 epochs
- train ctx 512
@@ -126,5 +125,29 @@ rev 7.3 - try adding fake end of sentence token
- batch size 8
- learning rate 1e-4
# TODO
rev 7.4 - dataset tweaks
rev 8 - dataset tweaks. add status requests
+ service requests still mostly work but status requests are pretty broken
rev 8.1 - tweak example counts + ratios
- 1 epoch
- train ctx 512
- batch size 8
- learning rate 1e-4
+ seems to have worked better with lower example counts
rev 8.2 - tune the learning rate so the loss doesn't bottom out until the end of training
- 1 epoch
- train ctx 512
- batch size 8
- learning rate 8e-5 (didn't change loss at all)
- learning rate 5e-5 (same)
- learning rate 1e-5 (wayyyy better)
+ pretty sure I've been overcranking the learning rate on most of these runs and destroying the model in the process
+ yep, it was overcranked. the 1e-5 run nails both request types (and even ends generation properly)
+ needs ambiguous device name examples; I just asked it an ambiguous question and it answered about the device I wasn't expecting
rev 8.3 - further reduced learning rate
- 1 epoch
- train ctx 512
- batch size 8
- learning rate 8e-6
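The rev 8.2 / 8.3 findings boil down to the peak learning rate: with a cosine schedule the LR stays within a factor of ~2 of its peak for the first half of training, so dropping the peak from 1e-4 to 1e-5 (and now 8e-6) changes the whole run, not just the start. A small sketch comparing the peaks with the stock cosine scheduler from transformers (step count and warmup are placeholders, not values from the training script):

```python
import torch
from transformers import get_cosine_schedule_with_warmup

TOTAL_STEPS = 1000   # placeholder; the real value depends on dataset size / batch size
WARMUP_STEPS = 0     # the training script above doesn't obviously use warmup

for peak_lr in (1e-4, 1e-5, 8e-6):
    # dummy parameter/optimizer just to drive the scheduler
    param = torch.nn.Parameter(torch.zeros(1))
    opt = torch.optim.AdamW([param], lr=peak_lr)
    sched = get_cosine_schedule_with_warmup(opt, WARMUP_STEPS, TOTAL_STEPS)

    lrs = []
    for _ in range(TOTAL_STEPS):
        lrs.append(sched.get_last_lr()[0])
        opt.step()
        sched.step()

    # halfway through training the cosine schedule has only decayed to ~50% of peak,
    # so a 10x difference in peak LR stays roughly a 10x difference for most of the run
    print(f"peak={peak_lr:.0e}  step {TOTAL_STEPS//2}: {lrs[TOTAL_STEPS//2]:.2e}  final: {lrs[-1]:.2e}")
```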


@@ -13,11 +13,13 @@ TRAIN_CTX_SIZE = 512 # The number of tokens to pad + truncate the input examples
BATCH_SIZE = 8 # The simulated "batch size" that we will train on. will tweak gradient accumulation steps
MICRO_BATCH_SIZE = 2 # The actual batch size that will fit into VRAM on this machine
TRAINING_EPOCHS = 1 # The number of times to train the model on each example
-LEARNING_RATE_START = 1e-4 # The starting learning rate (speed at which the model trains)
+LEARNING_RATE_START = 8e-6 # The starting learning rate (speed at which the model trains)
LEARNING_RATE_SCHEDULE = "cosine" # How fast the learning rate is reduced during training
-RUN_NAME = "home-llm-rev7.3"
+RUN_NAME = "home-llm-rev8.3"
OUTPUT_DIR = f"./models/{RUN_NAME}"
# TODO: write a proper evaluation script
model = AutoModelForCausalLM.from_pretrained("microsoft/phi-1_5", trust_remote_code=True).to(dtype=torch.bfloat16, device="cuda")
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5", trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
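For reference, here's roughly how these constants would wire into a Hugging Face TrainingArguments object: the simulated batch size of 8 becomes micro-batches of 2 with 4 gradient-accumulation steps. This is a sketch of the standard Trainer setup, not the rest of the actual script (which isn't shown in this diff), and logging_steps is a made-up placeholder:

```python
from transformers import TrainingArguments

# constants from the hunk above
BATCH_SIZE = 8
MICRO_BATCH_SIZE = 2
TRAINING_EPOCHS = 1
LEARNING_RATE_START = 8e-6
LEARNING_RATE_SCHEDULE = "cosine"
RUN_NAME = "home-llm-rev8.3"
OUTPUT_DIR = f"./models/{RUN_NAME}"

# the simulated batch size is reached via gradient accumulation:
# 2 examples per forward pass * 4 accumulation steps = 8 examples per optimizer step
GRAD_ACCUM_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    run_name=RUN_NAME,
    num_train_epochs=TRAINING_EPOCHS,
    per_device_train_batch_size=MICRO_BATCH_SIZE,   # what actually fits in VRAM
    gradient_accumulation_steps=GRAD_ACCUM_STEPS,
    learning_rate=LEARNING_RATE_START,              # 8e-6 as of rev 8.3
    lr_scheduler_type=LEARNING_RATE_SCHEDULE,       # cosine decay over the run
    bf16=True,                                      # matches the bfloat16 model load above
    logging_steps=10,                               # placeholder
)
```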