clean up training folder + update TODO

This commit is contained in:
Alex O'Connell
2025-12-20 23:10:00 -05:00
parent 0b776c0a23
commit ee5d8c4a67
9 changed files with 21 additions and 16 deletions

View File

@@ -1,6 +1,7 @@
# TODO
- [ ] new model based on qwen3 0.6b, 1.7b and 4b
- [ ] add examples of 'fixing' a failed tool call to the dataset
- [ ] add proper 'refusals' to the dataset (i.e. tool/device not available or device is already in the desired state)
- [ ] new model based on qwen3 0.6b, 1.7b and 4b
- [x] new model based on gemma3 270m
- [x] support AI task API
- [ ] vision support for remote backends

View File

@@ -1,4 +0,0 @@
# https://developers.home-assistant.io/docs/add-ons/repository#repository-configuration
name: text-generation-webui - Home Assistant Add-on
url: 'https://github.com/acon96/home-llm'
maintainer: acon96

3
train/README.md Normal file
View File

@@ -0,0 +1,3 @@
# Training Home LLM Models
This directory contains resources and instructions for training Home LLM models. Currently, it is recommended to use axolotl via a Docker container for training. There are various examples of model configurations provided in the `configs/` folder. Additionally, you can refer to the [Axolotl documentation](https://docs.axolotl.ai/) for more detailed guidance on setting up and running training sessions.

View File

@@ -1,8 +0,0 @@
docker run -d --rm \
--gpus all \
-p 8888:8888 \
-v /mnt/data/training-runs:/workspace/data/axolotl-artifacts \
-v /mnt/data/training-data:/workspace/data/datasets \
-v /mnt/data/training-configs:/workspace/configs \
-v /mnt/data/hf-cache:/workspace/data/huggingface-cache \
axolotlai/axolotl-cloud:main-py3.11-cu128-2.8.0

12
train/train.sh Normal file
View File

@@ -0,0 +1,12 @@
#!/bin/bash
# Launch a training run for one model configuration.
#
# Copies configs/<config-name>.yml to the remote server's
# /mnt/data/training-configs/ directory, then substitutes the config name
# into the training-job.yml template and submits it with kubectl.
#
# Usage: train.sh <config-name> <remote-server>
#   <config-name>    name of a file in configs/ without the .yml extension
#   <remote-server>  scp destination host (e.g. user@host)
#
# Must be run from the directory containing configs/ and training-job.yml,
# because both paths are relative.

# Abort on the first failure so the job is never submitted when the config
# upload did not succeed; also treat unset variables and pipe failures as errors.
set -euo pipefail

MODEL_NAME=${1:-}
REMOTE_SERVER=${2:-}
if [ -z "$MODEL_NAME" ] || [ -z "$REMOTE_SERVER" ]; then
    echo "Usage: $0 <config-name> <remote-server>" >&2
    exit 1
fi

CONFIG_FILE="configs/${MODEL_NAME}.yml"
if [ ! -f "$CONFIG_FILE" ]; then
    echo "Config file not found: $CONFIG_FILE" >&2
    exit 1
fi

scp "$CONFIG_FILE" "${REMOTE_SERVER}:/mnt/data/training-configs/"

# Escape characters that are special in a sed replacement (\, / and &) so the
# config name is always inserted literally into the template.
ESCAPED_NAME=$(printf '%s' "$MODEL_NAME" | sed 's/[\/&]/\\&/g')
sed "s/MODEL_NAME/${ESCAPED_NAME}/g" training-job.yml | kubectl create -f -

View File

@@ -1,11 +1,12 @@
apiVersion: batch/v1
kind: Job
metadata:
generateName: training-job-
generateName: training-job-MODEL_NAME
namespace: ai
labels:
app: training-job
spec:
ttlSecondsAfterFinished: 604800 # 7 days (7 * 24 * 60 * 60)
template:
metadata:
labels:
@@ -18,7 +19,7 @@ spec:
command:
- axolotl
- train
- /workspace/configs/functiongemma-270m.yml
- /workspace/configs/MODEL_NAME.yml
env:
- name: AXOLOTL_DO_NOT_TRACK
value: "1"
@@ -38,7 +39,7 @@ spec:
mountPath: /workspace/data/huggingface-cache
resources:
limits:
nvidia.com/gpu: 2
nvidia.com/gpu: 2 # number of GPUs to assign to this pod
initContainers:
- name: tensorboard
image: python:3.11-slim