mirror of https://github.com/acon96/home-llm.git

save rev 8.9
@@ -345,14 +345,18 @@ def format_example(example):
     sys_prompt = "You are 'Al', a helpful AI Assistant that controls the devices in a house. Complete the following task as instructed with the information provided only."
     services_block = "Services: " + ", ".join(sorted(example["available_services"]))
     states_block = "Devices:\n" + "\n".join(example["states"])
-    answers = "Response: " + " ".join(example["answers"])
-    question = "Request: " + example["question"]
+    question = "Request:\n" + example["question"]
+    answers = "Response:\n" + " ".join(example["answers"])
 
+    example_lines = [sys_prompt, services_block, states_block, question, answers]
     if len(example["service_calls"]) > 0:
-        code_block = "```homeassistant\n" + "\n".join(example["service_calls"]) + "\n```done"
-    else:
-        code_block = ""
-    result = "\n".join([sys_prompt, services_block, states_block, question, answers, code_block])
+        code_block = "```homeassistant\n" + "\n".join(example["service_calls"]) + "\n```"
+        # code_block = "```homeassistant " + "\n```homeassistant ".join(example["service_calls"])
+        example_lines.append(code_block)
+
+    # code_block = "Actions:\n```homeassistant\n" + "\n".join(example["service_calls"]) + "\n```done"
+
+    result = "\n".join(example_lines) + "\n"
     if "<device_name" in result:
         print("bad templating")
     return result
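For reference, a quick sketch of what the reworked example layout produces. The record below is made up (the field names follow the hunk above; the values are not from the repo's dataset), and the function body simply mirrors the new version of format_example:

def format_example(example):
    sys_prompt = "You are 'Al', a helpful AI Assistant that controls the devices in a house. Complete the following task as instructed with the information provided only."
    services_block = "Services: " + ", ".join(sorted(example["available_services"]))
    states_block = "Devices:\n" + "\n".join(example["states"])
    question = "Request:\n" + example["question"]
    answers = "Response:\n" + " ".join(example["answers"])

    example_lines = [sys_prompt, services_block, states_block, question, answers]
    if len(example["service_calls"]) > 0:
        # the ```homeassistant block is only emitted when there are service calls
        code_block = "```homeassistant\n" + "\n".join(example["service_calls"]) + "\n```"
        example_lines.append(code_block)
    return "\n".join(example_lines) + "\n"

# made-up record with the fields the function reads
example = {
    "available_services": ["light.turn_off", "light.turn_on"],
    "states": ["light.kitchen = off"],
    "question": "turn on the kitchen light",
    "answers": ["Turning on the kitchen light."],
    "service_calls": ["light.turn_on(light.kitchen)"],
}
print(format_example(example))
# You are 'Al', a helpful AI Assistant that controls the devices in a house. Complete the following task as instructed with the information provided only.
# Services: light.turn_off, light.turn_on
# Devices:
# light.kitchen = off
# Request:
# turn on the kitchen light
# Response:
# Turning on the kitchen light.
# ```homeassistant
# light.turn_on(light.kitchen)
# ```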
2 file diffs suppressed because they are too large
@@ -150,4 +150,58 @@ rev 8.3 - further reduced training rate
- 1 epoch
- train ctx 512
- batch size 8
- learning rate 8e-6
+ certainly not overfit like < rev7
+ has some creativity with how it responds
+ will often get the device name wrong on the first try

rev 8.4 - re-ordered prompt
- 1 epoch
- train ctx 512
- batch size 8
- learning rate 8e-6
- put actions before the response and also made actions its own "block"
+ it *works* but is incredibly open ended
+ basically never stops generating text

rev 8.5 - tweaked prompt format again
- 1 epoch
- train ctx 512
- batch size 8
- learning rate 8e-6
- re-ordered response before actions again but made actions less like a "block" so it might stop generation
+ that worked rather badly

rev 8.6 - make prompt look more like other examples it has seen before
- 1 epoch
- train ctx 512
- batch size 8
- learning rate 8e-6
- change ```done to just ``` and add 3 newlines at the end (it keeps doing that for other prompts before stopping)
+ it wants to generate the other prompt types much more with this config
+ only gets the correct response about 50% of the time
+ it totally stops correctly when it DOES work

rev 8.7 - try to fit a bit more; the last iteration jumps around on which format it chooses
- 1 epoch
- train ctx 512
- batch size 8
- learning rate 1e-5
+ similar issues as the last model
+ altering the format (with newlines) makes it pick our format more often
+ comparing to 8.6 with the modified format shows this one is better at getting device names right

rev 8.8 - train with newlines instead of spaces in request/response
- 1 epoch
- train ctx 512
- batch size 8
- learning rate 1e-5
+ definitely worse than the previous one
+ for some reason both 8.7 and 8.8 are horrible when using their actual template, but if you deviate slightly it works a lot better at inference

rev 8.9 - actually fix pad token
- 1 epoch
- train ctx 512
- batch size 8
- learning rate 1e-5
+ properly generates a response (+ terminates) when using the actual template
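A rough way (not part of the repo) to check the rev 8.9 note that the model now terminates when prompted with its actual template. The checkpoint path and the device/service names below are assumptions, built from OUTPUT_DIR in train.py and the prompt layout shown above:

from transformers import AutoModelForCausalLM, AutoTokenizer

model_dir = "./models/home-llm-rev8.9"   # assumed: OUTPUT_DIR from train.py below
model = AutoModelForCausalLM.from_pretrained(model_dir, trust_remote_code=True).to("cuda")
tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)

# prompt assembled in the training layout; the devices/services are made up
prompt = (
    "You are 'Al', a helpful AI Assistant that controls the devices in a house. "
    "Complete the following task as instructed with the information provided only.\n"
    "Services: light.turn_off, light.turn_on\n"
    "Devices:\nlight.kitchen = off\n"
    "Request:\nturn on the kitchen light\n"
    "Response:\n"
)
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens=128, eos_token_id=tokenizer.eos_token_id)
# a "fixed" checkpoint should answer, emit the ```homeassistant block, then stop
# instead of rambling into new Request:/Response: blocks
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))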
train.py (29 lines changed)
@@ -1,9 +1,8 @@
 import torch
 import torch.nn.functional
-from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, TrainingArguments, Trainer, DataCollatorForLanguageModeling
+from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling
 from datasets import load_dataset
 import evaluate
 import numpy as np
 from dataclasses import dataclass
 
 torch.set_default_device("cuda")
 torch.set_default_tensor_type('torch.cuda.FloatTensor')
@@ -13,22 +12,22 @@ TRAIN_CTX_SIZE = 512 # The number of tokens to pad + truncate the input examples
 BATCH_SIZE = 8 # The simulated "batch size" that we will train on; will tweak gradient accumulation steps
 MICRO_BATCH_SIZE = 2 # The actual batch size that will fit into VRAM on this machine
 TRAINING_EPOCHS = 1 # The number of times to train the model on each example
-LEARNING_RATE_START = 8e-6 # The starting learning rate (speed at which the model trains)
+LEARNING_RATE_START = 1e-5 # The starting learning rate (speed at which the model trains)
 LEARNING_RATE_SCHEDULE = "cosine" # How fast the learning rate is reduced during training
-RUN_NAME = "home-llm-rev8.3"
+RUN_NAME = "home-llm-rev8.9"
 OUTPUT_DIR = f"./models/{RUN_NAME}"
 
 # TODO: write a proper evaluation script
 
 model = AutoModelForCausalLM.from_pretrained("microsoft/phi-1_5", trust_remote_code=True).to(dtype=torch.bfloat16, device="cuda")
 tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5", trust_remote_code=True)
-tokenizer.pad_token = tokenizer.eos_token
-pad_token_id = tokenizer(tokenizer.pad_token)["input_ids"][0]
-# tokenizer.add_special_tokens({'pad_token': '<|pad|>'})
+# tokenizer.pad_token = tokenizer.eos_token
+# pad_token_id = tokenizer(tokenizer.pad_token)["input_ids"][0]
+tokenizer.add_special_tokens({'pad_token': '<|pad|>'})
 # model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=8)
 
 def tokenize_function(example):
-    result = tokenizer(example['text'],
+    result = tokenizer(example['text'] + tokenizer.eos_token,
                        return_attention_mask=False,
                        padding=True, max_length=TRAIN_CTX_SIZE, truncation=True)
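Why the pad-token change matters, as a minimal sketch (my illustration, not code from the repo; it uses the stock DataCollatorForLanguageModeling rather than the repo's NoAttentionMaskDataCollator subclass):

# DataCollatorForLanguageModeling(mlm=False) masks every pad_token_id in the
# labels to -100, so when pad_token == eos_token the trailing eos appended in
# tokenize_function is excluded from the loss and the model never learns to stop.
from transformers import AutoTokenizer, DataCollatorForLanguageModeling

tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5", trust_remote_code=True)

# old behaviour: pad with eos -> the eos label gets masked out
tokenizer.pad_token = tokenizer.eos_token
collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
batch = collator([tokenizer("turn on the kitchen light" + tokenizer.eos_token)])
print(batch["labels"][0][-1])   # should print tensor(-100): eos is ignored

# new behaviour: dedicated pad token -> the eos stays a real training target
tokenizer.add_special_tokens({'pad_token': '<|pad|>'})
collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
batch = collator([tokenizer("turn on the kitchen light" + tokenizer.eos_token)])
print(batch["labels"][0][-1])   # should print the eos token id

This also lines up with the rev 8.9 note above: with a dedicated '<|pad|>' token, the eos appended to each example survives into the labels, so the model actually learns to terminate.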
@@ -70,11 +69,23 @@ class NoAttentionMaskDataCollator(DataCollatorForLanguageModeling):
 
 data_collator = NoAttentionMaskDataCollator(tokenizer, mlm=False)
 
+# TODO: ignore user input when training
+# @dataclass
+# class CustomDataCollator:
+#     tokenizer: AutoTokenizer
+#     train_ctx_size: int
+#     def __call__(self, features, **kwargs):
+#         for feature in features:
+
+# data_collator = CustomDataCollator(tokenizer=tokenizer)
+
 trainer = Trainer(
     model=model,
     args=training_args,
     train_dataset=tokenized_train_dataset,
     eval_dataset=tokenized_test_dataset,
     # train_dataset=datasets["train"],
     # eval_dataset=datasets["test"],
     data_collator=data_collator,
 )
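The commented-out CustomDataCollator above sketches the TODO of ignoring user input during training. One way it could be completed (a sketch under my own assumptions, not the repo's implementation; the class name and the "Response:\n" boundary are hypothetical):

from dataclasses import dataclass
from transformers import AutoTokenizer

@dataclass
class PromptMaskingDataCollator:
    tokenizer: AutoTokenizer
    train_ctx_size: int

    def __call__(self, features, **kwargs):
        # pad everything to a fixed length and start from plain next-token labels
        batch = self.tokenizer.pad(features, padding="max_length",
                                   max_length=self.train_ctx_size,
                                   return_tensors="pt")
        batch.pop("attention_mask", None)   # mirror the NoAttentionMaskDataCollator intent
        labels = batch["input_ids"].clone()
        labels[labels == self.tokenizer.pad_token_id] = -100

        # assumed boundary: only train on tokens from "Response:\n" onward;
        # matching token ids like this is approximate because BPE merges can
        # differ slightly depending on the surrounding text
        boundary = self.tokenizer("Response:\n", add_special_tokens=False)["input_ids"]
        for i, ids in enumerate(batch["input_ids"].tolist()):
            for start in range(len(ids) - len(boundary) + 1):
                if ids[start:start + len(boundary)] == boundary:
                    labels[i, :start + len(boundary)] = -100
                    break
        batch["labels"] = labels
        return batch

# data_collator = PromptMaskingDataCollator(tokenizer=tokenizer, train_ctx_size=TRAIN_CTX_SIZE)

The trade-off is the usual one: masking the prompt keeps the loss focused on the response and the service calls, at the cost of fewer supervised tokens per example.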