mirror of
https://github.com/acon96/home-llm.git
synced 2026-01-07 21:04:08 -05:00
clean up training folder + update TODO
This commit is contained in:
3
TODO.md
3
TODO.md
@@ -1,6 +1,7 @@
|
||||
# TODO
|
||||
- [ ] new model based on qwen3 0.6b, 1.7b and 4b
|
||||
- [ ] add examples of 'fixing' a failed tool call to the dataset
|
||||
- [ ] add proper 'refusals' to the dataset (i.e. tool/device not available or device is already in the desired state)
|
||||
- [ ] new model based on qwen3 0.6b, 1.7b and 4b
|
||||
- [x] new model based on gemma3 270m
|
||||
- [x] support AI task API
|
||||
- [ ] vision support for remote backends
|
||||
|
||||
@@ -1,4 +0,0 @@
|
||||
# https://developers.home-assistant.io/docs/add-ons/repository#repository-configuration
|
||||
name: text-generation-webui - Home Assistant Add-on
|
||||
url: 'https://github.com/acon96/home-llm'
|
||||
maintainer: acon96
|
||||
3
train/README.md
Normal file
3
train/README.md
Normal file
@@ -0,0 +1,3 @@
|
||||
# Training Home LLM Models
|
||||
|
||||
This directory contains resources and instructions for training Home LLM models. The currently recommended approach is to run Axolotl in a Docker container. Example model configurations are provided in the `configs/` folder. For more detailed guidance on setting up and running training sessions, refer to the [Axolotl documentation](https://docs.axolotl.ai/).
|
||||
@@ -1,8 +0,0 @@
|
||||
docker run -d --rm \
|
||||
--gpus all \
|
||||
-p 8888:8888 \
|
||||
-v /mnt/data/training-runs:/workspace/data/axolotl-artifacts \
|
||||
-v /mnt/data/training-data:/workspace/data/datasets \
|
||||
-v /mnt/data/training-configs:/workspace/configs \
|
||||
-v /mnt/data/hf-cache:/workspace/data/huggingface-cache \
|
||||
axolotlai/axolotl-cloud:main-py3.11-cu128-2.8.0
|
||||
12
train/train.sh
Normal file
12
train/train.sh
Normal file
@@ -0,0 +1,12 @@
|
||||
#!/bin/bash
#
# Upload a training config to a remote server and create a Kubernetes
# training Job for it.
#
# Usage: train.sh <config-name> <remote-server>
#   <config-name>    base name of a file in configs/ (without the .yml suffix)
#   <remote-server>  scp destination host that receives the config file
#
# Relative paths (configs/ and training-job.yml) are resolved against the
# current working directory.

# Stop at the first failure so a Job is never created for a config that
# failed to upload; pipefail also surfaces a failing sed in the final pipeline.
set -euo pipefail

# Default to empty so the usage check below still runs under `set -u`.
MODEL_NAME=${1:-}
REMOTE_SERVER=${2:-}

if [ -z "$MODEL_NAME" ] || [ -z "$REMOTE_SERVER" ]; then
    echo "Usage: $0 <config-name> <remote-server>"
    exit 1
fi

# Quote both operands so names containing spaces or glob characters are
# passed to scp intact.
scp "configs/${MODEL_NAME}.yml" "${REMOTE_SERVER}:/mnt/data/training-configs/"

# Escape the characters that are special in a sed replacement (\, / and &)
# so the config name is substituted literally into the manifest.
ESCAPED_MODEL_NAME=$(printf '%s' "$MODEL_NAME" | sed 's/[\/&]/\\&/g')

# Fill in the manifest template and submit it.
sed "s/MODEL_NAME/${ESCAPED_MODEL_NAME}/g" training-job.yml | kubectl create -f -
|
||||
@@ -1,11 +1,12 @@
|
||||
apiVersion: batch/v1
|
||||
kind: Job
|
||||
metadata:
|
||||
generateName: training-job-
|
||||
generateName: training-job-MODEL_NAME-  # trailing dash separates the random suffix the API server appends
|
||||
namespace: ai
|
||||
labels:
|
||||
app: training-job
|
||||
spec:
|
||||
ttlSecondsAfterFinished: 604800 # 7 days (7 * 24 * 60 * 60)
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
@@ -18,7 +19,7 @@ spec:
|
||||
command:
|
||||
- axolotl
|
||||
- train
|
||||
- /workspace/configs/functiongemma-270m.yml
|
||||
- /workspace/configs/MODEL_NAME.yml
|
||||
env:
|
||||
- name: AXOLOTL_DO_NOT_TRACK
|
||||
value: "1"
|
||||
@@ -38,7 +39,7 @@ spec:
|
||||
mountPath: /workspace/data/huggingface-cache
|
||||
resources:
|
||||
limits:
|
||||
nvidia.com/gpu: 2
|
||||
nvidia.com/gpu: 2 # number of GPUs to assign to this pod
|
||||
initContainers:
|
||||
- name: tensorboard
|
||||
image: python:3.11-slim
|
||||
|
||||
Reference in New Issue
Block a user