diff --git a/README.md b/README.md
index 96e785b..dc550cf 100644
--- a/README.md
+++ b/README.md
@@ -99,7 +99,7 @@ accelerate launch --config_file fsdp_config.yaml train.py \
 
-The 1B model was trained as a full fine-tuning on an RTX 3090 (24GB). Training took approximately 2.5 hours. It was trained on the `--medium` dataset variant.
+The 1B model was trained as a full fine-tuning on an RTX 3090 (24GB). Training took approximately 2 hours. It was trained on the `--medium` dataset variant.
 
 Training Arguments
diff --git a/data/translate_data.py b/data/translate_data.py
index d30daba..f745a45 100644
--- a/data/translate_data.py
+++ b/data/translate_data.py
@@ -5,8 +5,10 @@ import time
 import re
 
 from deep_translator import GoogleTranslator
+from deep_translator.base import BaseTranslator
 from deep_translator.exceptions import TooManyRequests
 from tqdm import tqdm
+from transformers import pipeline
 
 SUPPORTED_DEVICES = [
     "light",
@@ -25,15 +27,25 @@ SUPPORTED_DEVICES = [
 def format_device_name(input_str):
     return input_str.replace('-', '_').replace(' ', '_').lower()
 
+class Seq2SeqTranslator(BaseTranslator):
+
+    def __init__(self, model_name: str, **kwargs):
+        super().__init__(**kwargs)
+
+        self.translator = pipeline("translation", model=model_name, tokenizer=model_name, device=0)
+
+    def translate(self, text: str, **kwargs):
+        return self.translator(text)[0]["translation_text"]
+
 class DatasetTranslator:
-    translator: GoogleTranslator
+    translator: Seq2SeqTranslator
     source_language: str
     target_language: str
 
-    def __init__(self, source_language, target_language):
+    def __init__(self, source_language, target_language, model_name):
         self.source_language = source_language
         self.target_language = target_language
-        self.translator = GoogleTranslator(source=source_language, target=target_language)
+        self.translator = Seq2SeqTranslator(model_name=model_name)
 
     def translate_all_piles(self):
         os.makedirs(f"./piles/{self.target_language}", exist_ok=True)
@@ -323,6 +335,6 @@ class DatasetTranslator:
 
 # TODO: cmd line args
 
-DatasetTranslator("english", "german").translate_all_piles()
-DatasetTranslator("english", "spanish").translate_all_piles()
-DatasetTranslator("english", "french").translate_all_piles()
\ No newline at end of file
+DatasetTranslator("english", "german", "Helsinki-NLP/opus-mt-en-de").translate_all_piles()
+# DatasetTranslator("english", "spanish", "Helsinki-NLP/opus-mt-en-es").translate_all_piles()
+# DatasetTranslator("english", "french", "Helsinki-NLP/opus-mt-en-fr").translate_all_piles()
\ No newline at end of file
diff --git a/train.py b/train.py
index 776f664..8b1091b 100644
--- a/train.py
+++ b/train.py
@@ -133,7 +133,7 @@ python3 train.py \
 
 """
 python3 train.py \
-    --run_name tinyhome-rev3 \
+    --run_name tinyhome-rev4 \
     --base_model TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
    --bf16 \
     --train_dataset data/home_assistant_train.jsonl \
@@ -668,13 +668,13 @@ if not training_run_args.dpo:
     tokens_in_train_set, longest_example = sum(example_lengths), max(example_lengths)
     print(f"Train dataset has {int(tokens_in_train_set / 1000000)}M tokens. Longest Example: {longest_example} tokens")
 
-    data_collator = DataCollatorForSupervisedFineTuning(tokenizer=tokenizer)
+    # data_collator = DataCollatorForSupervisedFineTuning(tokenizer=tokenizer)
     # fix for tinyllama not detecting split properly
-    # data_collator = DataCollatorForSupervisedFineTuning(
-    #     tokenizer=tokenizer,
-    #     prefix_ids=[29966, 29989, 465, 22137, 29989, 29958, 13],
-    #     suffix_ids=[2],
-    # )
+    data_collator = DataCollatorForSupervisedFineTuning(
+        tokenizer=tokenizer,
+        prefix_ids=[29966, 29989, 465, 22137, 29989, 29958, 13],
+        suffix_ids=[2],
+    )
 
     trainer = CustomSFTTrainer(
         model=model,
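
A note on the data/translate_data.py change above: it swaps the rate-limited GoogleTranslator (the source of the TooManyRequests handling) for a local Helsinki-NLP MarianMT model driven through the transformers translation pipeline. A minimal standalone sketch of that pattern, using the en-de model name from the diff (device=0 assumes a CUDA GPU is available; device=-1 would run on CPU):

```python
from transformers import pipeline

# Load the English->German MarianMT model locally; no API rate limits apply.
translator = pipeline(
    "translation",
    model="Helsinki-NLP/opus-mt-en-de",
    tokenizer="Helsinki-NLP/opus-mt-en-de",
    device=0,
)

# The pipeline returns a list of dicts keyed by "translation_text".
print(translator("Turn off the kitchen light.")[0]["translation_text"])
```

Passing a list of strings to the pipeline translates them as a batch, which is typically much faster on GPU than calling it once per sentence.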
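The `# TODO: cmd line args` left in the diff could be closed with a small argparse wrapper at the bottom of translate_data.py, replacing the hard-coded calls. A hypothetical sketch (flag names and defaults are assumptions, not part of the repo):

```python
import argparse

# Hypothetical CLI for translate_data.py; flag names are illustrative only.
parser = argparse.ArgumentParser(description="Translate dataset piles with a local seq2seq model")
parser.add_argument("--source", default="english", help="source language name")
parser.add_argument("--target", required=True, help="target language name, e.g. german")
parser.add_argument("--model", required=True, help="translation model, e.g. Helsinki-NLP/opus-mt-en-de")
args = parser.parse_args()

DatasetTranslator(args.source, args.target, args.model).translate_all_piles()
```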
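On the train.py change: the hard-coded prefix_ids are meant to be the token ids of TinyLlama's `<|assistant|>\n` turn header, and suffix_ids=[2] its `</s>` EOS token, so the collator can locate the assistant response when it isn't detected from the chat template. A quick sanity check of those ids (expected values are taken from the diff, not independently verified):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")

# Encode the assistant header without special tokens to compare against prefix_ids
print(tokenizer.encode("<|assistant|>\n", add_special_tokens=False))
# Expected per the diff: [29966, 29989, 465, 22137, 29989, 29958, 13]

print(tokenizer.eos_token_id)  # should be 2, matching suffix_ids
```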