save rev 8.9

Alex O'Connell
2023-10-23 14:43:19 -04:00
parent a90a861065
commit 7f8e2fe112
5 changed files with 7374 additions and 7305 deletions


@@ -345,14 +345,18 @@ def format_example(example):
     sys_prompt = "You are 'Al', a helpful AI Assistant that controls the devices in a house. Complete the following task as instructed with the information provided only."
     services_block = "Services: " + ", ".join(sorted(example["available_services"]))
     states_block = "Devices:\n" + "\n".join(example["states"])
-    answers = "Response: " + " ".join(example["answers"])
-    question = "Request: " + example["question"]
+    question = "Request:\n" + example["question"]
+    answers = "Response:\n" + " ".join(example["answers"])
+    example_lines = [sys_prompt, services_block, states_block, question, answers]
     if len(example["service_calls"]) > 0:
-        code_block = "```homeassistant\n" + "\n".join(example["service_calls"]) + "\n```done"
-    else:
-        code_block = ""
+        code_block = "```homeassistant\n" + "\n".join(example["service_calls"]) + "\n```"
+        # code_block = "```homeassistant " + "\n```homeassistant ".join(example["service_calls"])
+        example_lines.append(code_block)
     # code_block = "Actions:\n```homeassistant\n" + "\n".join(example["service_calls"]) + "\n```done"
-    result = "\n".join([sys_prompt, services_block, states_block, question, answers, code_block])
+    result = "\n".join(example_lines) + "\n"
     if "<device_name" in result:
         print("bad templating")
     return result
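For reference, this is roughly what the reworked format_example emits. A minimal sketch assuming the function above is in scope; the example dict, the state-string format, and the service-call syntax are invented for illustration, and only the field names and block ordering come from the diff:

# hypothetical input; field names match the diff above
example = {
    "available_services": ["light.turn_on", "light.turn_off"],
    "states": ["light.kitchen = off"],                              # assumed state format
    "question": "turn on the kitchen light",
    "answers": ["Sure, turning on the kitchen light."],
    "service_calls": ['light.turn_on(entity_id="light.kitchen")'],  # assumed call syntax
}
print(format_example(example))
# You are 'Al', a helpful AI Assistant ... (system prompt)
# Services: light.turn_off, light.turn_on
# Devices:
# light.kitchen = off
# Request:
# turn on the kitchen light
# Response:
# Sure, turning on the kitchen light.
# ```homeassistant
# light.turn_on(entity_id="light.kitchen")
# ```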

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -150,4 +150,58 @@ rev 8.3 - further reduced learning rate
- 1 epoch
- train ctx 512
- batch size 8
- learning rate 8e-6
+ certainly not overfit like < rev7
+ has some creativity with how it responds
+ will often get the device name wrong on the first try
rev 8.4 - re-ordered prompt
- 1 epoch
- train ctx 512
- batch size 8
- learning rate 8e-6
- put actions before response and also made actions its own "block"
+ it *works* but is incredibly open ended
+ basically never stops generating text
rev 8.5 - tweaked prompt format again
- 1 epoch
- train ctx 512
- batch size 8
- learning rate 8e-6
- re-ordered response before actions again but made actions less like a "block" so it might stop generating
+ that worked rather badly
rev 8.6 - make prompt look more like other examples it has seen before
- 1 epoch
- train ctx 512
- batch size 8
- learning rate 8e-6
- change ```done to just ``` and add 3 newlines at the end (idk it keeps doing that for other prompts before stopping)
+ it wants to generate the other prompt types much more with this config
+ only gets the correct response about 50% of the time
+ it totally stops correctly when it DOES work
rev 8.7 - try to fit a bit more. the last iteration jumps around on which format it chooses
- 1 epoch
- train ctx 512
- batch size 8
- learning rate 1e-5
+ similar issues as last model
+ altering the format (with newlines) makes it pick our format more often
+ comparing to 8.6 with modified format shows this one is better at getting device names right
rev 8.8 - train with newlines instead of spaces in requests/responses
- 1 epoch
- train ctx 512
- batch size 8
- learning rate 1e-5
+ definitely worse than the previous one
+ for some reason both 8.7 and 8.8 are horrible when using their actual template, but if you deviate slightly at inference time it works a lot better
rev 8.9 - actually fix pad token
- 1 epoch
- train ctx 512
- batch size 8
- learning rate 1e-5
+ properly generates a response (+ terminates) when using the actual template
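A likely mechanism for this fix, assuming Hugging Face's default collator behavior (NoAttentionMaskDataCollator below extends DataCollatorForLanguageModeling): with mlm=False the collator copies input_ids into labels and masks every pad_token_id to -100. When pad_token is aliased to eos_token (revs <= 8.8), the EOS at the end of each example is masked along with the padding, so the model gets no loss signal for stopping. A small sketch of the old, broken setup (not code from this repo):

from transformers import AutoTokenizer, DataCollatorForLanguageModeling

tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5", trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token  # the pre-8.9 setup
collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

batch = collator([tokenizer("turn on the light" + tokenizer.eos_token)])
print(batch["labels"][0][-1])  # tensor(-100): the EOS label is masked, never trained

Giving the tokenizer a dedicated <|pad|> token, as rev 8.9 does below, leaves the EOS label intact so the model actually learns to terminate.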


@@ -1,9 +1,8 @@
 import torch
 import torch.nn.functional
-from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, TrainingArguments, Trainer, DataCollatorForLanguageModeling
+from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling
 from datasets import load_dataset
 import evaluate
 import numpy as np
-from dataclasses import dataclass
 torch.set_default_device("cuda")
 torch.set_default_tensor_type('torch.cuda.FloatTensor')
@@ -13,22 +12,22 @@ TRAIN_CTX_SIZE = 512 # The number of tokens to pad + truncate the input examples
 BATCH_SIZE = 8 # The simulated "batch size" that we will train on. will tweak gradient accumulation steps
 MICRO_BATCH_SIZE = 2 # The actual batch size that will fit into VRAM on this machine
 TRAINING_EPOCHS = 1 # The number of times to train the model on each example
-LEARNING_RATE_START = 8e-6 # The starting learning rate (speed at which the model trains)
+LEARNING_RATE_START = 1e-5 # The starting learning rate (speed at which the model trains)
 LEARNING_RATE_SCHEDULE = "cosine" # How fast the learning rate is reduced during training
-RUN_NAME = "home-llm-rev8.3"
+RUN_NAME = "home-llm-rev8.9"
 OUTPUT_DIR = f"./models/{RUN_NAME}"
 # TODO: write a proper evaluation script
 model = AutoModelForCausalLM.from_pretrained("microsoft/phi-1_5", trust_remote_code=True).to(dtype=torch.bfloat16, device="cuda")
 tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5", trust_remote_code=True)
-tokenizer.pad_token = tokenizer.eos_token
-pad_token_id = tokenizer(tokenizer.pad_token)["input_ids"][0]
-# tokenizer.add_special_tokens({'pad_token': '<|pad|>'})
+# tokenizer.pad_token = tokenizer.eos_token
+# pad_token_id = tokenizer(tokenizer.pad_token)["input_ids"][0]
+tokenizer.add_special_tokens({'pad_token': '<|pad|>'})
 # model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=8)
 def tokenize_function(example):
-    result = tokenizer(example['text'],
+    result = tokenizer(example['text'] + tokenizer.eos_token,
         return_attention_mask=False,
         padding=True, max_length=TRAIN_CTX_SIZE, truncation=True)
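One thing worth checking with the newly added <|pad|> token: model.resize_token_embeddings stays commented out. phi-1_5's config appears to report a vocab_size larger than the tokenizer's vocabulary, so the fresh token id should still index a valid embedding row, but it is cheap to assert. A hypothetical sanity check, not part of the original script:

# the added pad token must fall inside the model's (padded) embedding table,
# otherwise resize_token_embeddings really would be required
assert tokenizer.pad_token_id < model.config.vocab_size, \
    f"pad id {tokenizer.pad_token_id} outside vocab {model.config.vocab_size}"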
@@ -70,11 +69,23 @@ class NoAttentionMaskDataCollator(DataCollatorForLanguageModeling):
 data_collator = NoAttentionMaskDataCollator(tokenizer, mlm=False)
 # TODO: ignore user input when training
+# @dataclass
+# class CustomDataCollator:
+#     tokenizer: AutoTokenizer
+#     train_ctx_size: int
+#     def __call__(self, features, **kwargs):
+#         for feature in features:
+# data_collator = CustomDataCollator(tokenizer=tokenizer)
 trainer = Trainer(
     model=model,
     args=training_args,
     train_dataset=tokenized_train_dataset,
     eval_dataset=tokenized_test_dataset,
+    # train_dataset=datasets["train"],
+    # eval_dataset=datasets["test"],
     data_collator=data_collator,
 )
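The "ignore user input when training" TODO presumably means masking the prompt portion out of the labels so the loss only covers the model's response and the homeassistant block. One possible shape for that collator; entirely hypothetical, not the author's code. The marker string and the naive subsequence search are assumptions, and token boundaries can shift in context, so a robust version would track character offsets instead:

import torch
from transformers import DataCollatorForLanguageModeling

class PromptMaskingCollator(DataCollatorForLanguageModeling):
    # Masks every label up to and including the response marker with -100,
    # so only the model's answer contributes to the loss.
    def __init__(self, tokenizer, marker="Response:\n", **kwargs):
        super().__init__(tokenizer, mlm=False, **kwargs)
        self.marker_ids = torch.tensor(tokenizer(marker)["input_ids"])

    def torch_call(self, examples):
        batch = super().torch_call(examples)  # pads inputs and builds labels
        m = self.marker_ids
        for labels in batch["labels"]:
            # find the first occurrence of the marker and mask through it
            for i in range(len(labels) - len(m) + 1):
                if torch.equal(labels[i:i + len(m)], m):
                    labels[:i + len(m)] = -100
                    break
        return batch

# usage sketch: data_collator = PromptMaskingCollator(tokenizer)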