clean up training folder + update TODO

2026-01-08 21:28:05 -05:00 · 2025-12-20 23:10:00 -05:00
parent 0b776c0a23
commit ee5d8c4a67
9 changed files with 21 additions and 16 deletions
--- a/TODO.md
+++ b/TODO.md
@@ -1,6 +1,7 @@
 # TODO
 - [ ] new model based on qwen3 0.6b, 1.7b and 4b  
 - [ ] add examples of 'fixing' a failed tool call to the dataset
 - [ ] add proper 'refusals' to the dataset (i.e. tool/device not available or device is already in the desired state)
 - [ ] new model based on qwen3 0.6b, 1.7b and 4b  
 - [x] new model based on gemma3 270m  
 - [x] support AI task API  
 - [ ] vision support for remote backends  
--- a/repository.yaml
+++ b/repository.yaml
@@ -1,4 +0,0 @@
 # https://developers.home-assistant.io/docs/add-ons/repository#repository-configuration
 name: text-generation-webui - Home Assistant Add-on
 url: 'https://github.com/acon96/home-llm'
 maintainer: acon96
--- a/train/README.md
+++ b/train/README.md
@@ -0,0 +1,3 @@
 # Training Home LLM Models
 This directory contains resources and instructions for training Home LLM models. Currently, it is recommended to use axolotl via a Docker container for training. Several example model configurations are provided in the `configs/` folder. Additionally, you can refer to the [Axolotl documentation](https://docs.axolotl.ai/) for more detailed guidance on setting up and running training sessions.
--- a/train/configs/functiongemma-270m.yml
+++ b/train/configs/functiongemma-270m.yml
--- a/train/configs/gemma3-270m.yml
+++ b/train/configs/gemma3-270m.yml
--- a/train/evaluate.py
+++ b/train/evaluate.py
--- a/train/run.sh
+++ b/train/run.sh
@@ -1,8 +0,0 @@
 # Start the axolotl-cloud image as a detached container that is removed
 # when it exits. All GPUs are exposed, port 8888 is published on the host,
 # and four host directories under /mnt/data are mounted into /workspace.
 docker run --detach --rm \
    --gpus all \
    --publish 8888:8888 \
    --volume /mnt/data/training-runs:/workspace/data/axolotl-artifacts \
    --volume /mnt/data/training-data:/workspace/data/datasets \
    --volume /mnt/data/training-configs:/workspace/configs \
    --volume /mnt/data/hf-cache:/workspace/data/huggingface-cache \
    axolotlai/axolotl-cloud:main-py3.11-cu128-2.8.0
--- a/train/train.sh
+++ b/train/train.sh
@@ -0,0 +1,12 @@
 #!/bin/bash
 #
 # Submit an axolotl training job for a single model configuration.
 #
 # Usage: train.sh <config-name> <remote-server>
 #   <config-name>    name of a file under configs/, without the .yml suffix
 #   <remote-server>  scp destination host that receives the config file
 #
 # Steps:
 #   1. Copy configs/<config-name>.yml to /mnt/data/training-configs/ on
 #      the remote server.
 #   2. Replace every MODEL_NAME placeholder in training-job.yml with
 #      <config-name> and pipe the result to `kubectl create`.
 #
 # configs/ and training-job.yml are resolved relative to the current
 # working directory.
 #
 # Stop at the first failing step (including inside the pipeline) so that a
 # failed copy never results in a job being submitted.
 set -euo pipefail
 #
 # Default to empty so that `set -u` does not pre-empt the usage message.
 MODEL_NAME=${1:-}
 REMOTE_SERVER=${2:-}
 if [[ -z "$MODEL_NAME" || -z "$REMOTE_SERVER" ]]; then
   echo "Usage: $0 <config-name> <remote-server>" >&2
   exit 1
 fi
 #
 # The name is spliced into a sed program and a Kubernetes manifest, so
 # reject anything that could alter the sed expression or the YAML.
 case "$MODEL_NAME" in
   *[!A-Za-z0-9._-]*)
     echo "Invalid config name '$MODEL_NAME': only letters, digits, '.', '_' and '-' are allowed" >&2
     exit 1
     ;;
 esac
 #
 CONFIG_FILE="configs/${MODEL_NAME}.yml"
 if [[ ! -f "$CONFIG_FILE" ]]; then
   echo "Config file not found: $CONFIG_FILE" >&2
   exit 1
 fi
 #
 scp "$CONFIG_FILE" "${REMOTE_SERVER}:/mnt/data/training-configs/"
 sed "s/MODEL_NAME/${MODEL_NAME}/g" training-job.yml | kubectl create -f -
--- a/train/training-job.yml
+++ b/train/training-job.yml
@@ -1,11 +1,12 @@
 apiVersion: batch/v1
 kind: Job
 metadata:
-  generateName: training-job-
+  generateName: training-job-MODEL_NAME-  # trailing '-' keeps the generated suffix separate from the model name
  namespace: ai
  labels:
    app: training-job
 spec:
  ttlSecondsAfterFinished: 604800 # 7 days (7 * 24 * 60 * 60)
  template:
    metadata:
      labels:
@@ -18,7 +19,7 @@ spec:
          command: 
            - axolotl
            - train
-            - /workspace/configs/functiongemma-270m.yml
+            - /workspace/configs/MODEL_NAME.yml
          env:
            - name: AXOLOTL_DO_NOT_TRACK
              value: "1"
@@ -38,7 +39,7 @@ spec:
              mountPath: /workspace/data/huggingface-cache
          resources:
            limits:
-              nvidia.com/gpu: 2
+              nvidia.com/gpu: 2 # number of GPUs to assign to this pod
      initContainers:
        - name: tensorboard
          image: python:3.11-slim
		`@@ -0,0 +1,3 @@`
							`# Training Home LLM Models`

							This directory contains resources and instructions for training Home LLM models. Currently, it is recommended to use axolotl via a Docker container for training. Several example model configurations are provided in the `configs/` folder. Additionally, you can refer to the [Axolotl documentation](https://docs.axolotl.ai/) for more detailed guidance on setting up and running training sessions.