clean up training folder + update TODO

2026-01-07 21:04:08 -05:00 · 2025-12-20 23:10:00 -05:00
parent 0b776c0a23
commit ee5d8c4a67
9 changed files with 21 additions and 16 deletions
--- a/TODO.md
+++ b/TODO.md
@@ -1,6 +1,7 @@
 # TODO
- [ ] new model based on qwen3 0.6b, 1.7b and 4b  
 - [ ] add examples of 'fixing' a failed tool call to the dataset
+- [ ] add proper 'refusals' to the dataset (i.e. tool/device not available or device is already in the desired state)
+- [ ] new model based on qwen3 0.6b, 1.7b and 4b  
 - [x] new model based on gemma3 270m  
 - [x] support AI task API  
 - [ ] vision support for remote backends  
--- a/repository.yaml
+++ b/repository.yaml
@@ -1,4 +0,0 @@
-# https://developers.home-assistant.io/docs/add-ons/repository#repository-configuration
-name: text-generation-webui - Home Assistant Add-on
-url: 'https://github.com/acon96/home-llm'
-maintainer: acon96
--- a/train/README.md
+++ b/train/README.md
@@ -0,0 +1,3 @@
+# Training Home LLM Models
+
+This directory contains resources and instructions for training Home LLM models. Currently, it is recommended to use Axolotl via a Docker container for training. Various example model configurations are provided in the `configs/` folder. Additionally, you can refer to the [Axolotl documentation](https://docs.axolotl.ai/) for more detailed guidance on setting up and running training sessions.
--- a/train/configs/functiongemma-270m.yml
+++ b/train/configs/functiongemma-270m.yml
--- a/train/configs/gemma3-270m.yml
+++ b/train/configs/gemma3-270m.yml
--- a/train/evaluate.py
+++ b/train/evaluate.py
--- a/train/run.sh
+++ b/train/run.sh
@@ -1,8 +0,0 @@
-docker run -d --rm \
-    --gpus all \
-    -p 8888:8888 \
-    -v /mnt/data/training-runs:/workspace/data/axolotl-artifacts \
-    -v /mnt/data/training-data:/workspace/data/datasets \
-    -v /mnt/data/training-configs:/workspace/configs \
-    -v /mnt/data/hf-cache:/workspace/data/huggingface-cache \
-    axolotlai/axolotl-cloud:main-py3.11-cu128-2.8.0
--- a/train/train.sh
+++ b/train/train.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+MODEL_NAME=${1}
+REMOTE_SERVER=${2}
+
+if [ -z "$MODEL_NAME" ] || [ -z "$REMOTE_SERVER" ]; then
+  echo "Usage: $0 <config-name> <remote-server>"
+  exit 1
+fi
+
+scp configs/${MODEL_NAME}.yml ${REMOTE_SERVER}:/mnt/data/training-configs/
+cat training-job.yml | sed "s/MODEL_NAME/${MODEL_NAME}/g" | kubectl create -f -
--- a/train/training-job.yml
+++ b/train/training-job.yml
@@ -1,11 +1,12 @@
 apiVersion: batch/v1
 kind: Job
 metadata:
-  generateName: training-job-
+  generateName: training-job-MODEL_NAME
  namespace: ai
  labels:
    app: training-job
 spec:
+  ttlSecondsAfterFinished: 604800 # 7 days (7 * 24 * 60 * 60)
  template:
    metadata:
      labels:
@@ -18,7 +19,7 @@ spec:
          command: 
            - axolotl
            - train
-            - /workspace/configs/functiongemma-270m.yml
+            - /workspace/configs/MODEL_NAME.yml
          env:
            - name: AXOLOTL_DO_NOT_TRACK
              value: "1"
@@ -38,7 +39,7 @@ spec:
              mountPath: /workspace/data/huggingface-cache
          resources:
            limits:
-              nvidia.com/gpu: 2
+              nvidia.com/gpu: 2 # number of GPUs to assign to this pod
      initContainers:
        - name: tensorboard
          image: python:3.11-slim