mirror of
https://github.com/acon96/home-llm.git
synced 2026-01-08 21:28:05 -05:00
clean up training folder + update TODO
This commit is contained in:
3
TODO.md
3
TODO.md
@@ -1,6 +1,7 @@
|
|||||||
# TODO
|
# TODO
|
||||||
- [ ] new model based on qwen3 0.6b, 1.7b and 4b
|
|
||||||
- [ ] add examples of 'fixing' a failed tool call to the dataset
|
- [ ] add examples of 'fixing' a failed tool call to the dataset
|
||||||
|
- [ ] add proper 'refusals' to the dataset (i.e. tool/device not available or device is already in the desired state)
|
||||||
|
- [ ] new model based on qwen3 0.6b, 1.7b and 4b
|
||||||
- [x] new model based on gemma3 270m
|
- [x] new model based on gemma3 270m
|
||||||
- [x] support AI task API
|
- [x] support AI task API
|
||||||
- [ ] vision support for remote backends
|
- [ ] vision support for remote backends
|
||||||
|
|||||||
@@ -1,4 +0,0 @@
|
|||||||
# https://developers.home-assistant.io/docs/add-ons/repository#repository-configuration
|
|
||||||
name: text-generation-webui - Home Assistant Add-on
|
|
||||||
url: 'https://github.com/acon96/home-llm'
|
|
||||||
maintainer: acon96
|
|
||||||
3
train/README.md
Normal file
3
train/README.md
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
# Training Home LLM Models
|
||||||
|
|
||||||
|
This directory contains resources and instructions for training Home LLM models. The currently recommended approach is to run Axolotl in a Docker container. Example model configurations are provided in the `configs/` folder. For more detailed guidance on setting up and running training sessions, refer to the [Axolotl documentation](https://docs.axolotl.ai/).
|
||||||
@@ -1,8 +0,0 @@
|
|||||||
docker run -d --rm \
|
|
||||||
--gpus all \
|
|
||||||
-p 8888:8888 \
|
|
||||||
-v /mnt/data/training-runs:/workspace/data/axolotl-artifacts \
|
|
||||||
-v /mnt/data/training-data:/workspace/data/datasets \
|
|
||||||
-v /mnt/data/training-configs:/workspace/configs \
|
|
||||||
-v /mnt/data/hf-cache:/workspace/data/huggingface-cache \
|
|
||||||
axolotlai/axolotl-cloud:main-py3.11-cu128-2.8.0
|
|
||||||
12
train/train.sh
Normal file
12
train/train.sh
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
MODEL_NAME=${1}
|
||||||
|
REMOTE_SERVER=${2}
|
||||||
|
|
||||||
|
if [ -z "$MODEL_NAME" ] || [ -z "$REMOTE_SERVER" ]; then
|
||||||
|
echo "Usage: $0 <config-name> <remote-server>"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
scp configs/${MODEL_NAME}.yml ${REMOTE_SERVER}:/mnt/data/training-configs/
|
||||||
|
cat training-job.yml | sed "s/MODEL_NAME/${MODEL_NAME}/g" | kubectl create -f -
|
||||||
@@ -1,11 +1,12 @@
|
|||||||
apiVersion: batch/v1
|
apiVersion: batch/v1
|
||||||
kind: Job
|
kind: Job
|
||||||
metadata:
|
metadata:
|
||||||
generateName: training-job-
|
generateName: training-job-MODEL_NAME
|
||||||
namespace: ai
|
namespace: ai
|
||||||
labels:
|
labels:
|
||||||
app: training-job
|
app: training-job
|
||||||
spec:
|
spec:
|
||||||
|
ttlSecondsAfterFinished: 604800 # 7 days (7 * 24 * 60 * 60)
|
||||||
template:
|
template:
|
||||||
metadata:
|
metadata:
|
||||||
labels:
|
labels:
|
||||||
@@ -18,7 +19,7 @@ spec:
|
|||||||
command:
|
command:
|
||||||
- axolotl
|
- axolotl
|
||||||
- train
|
- train
|
||||||
- /workspace/configs/functiongemma-270m.yml
|
- /workspace/configs/MODEL_NAME.yml
|
||||||
env:
|
env:
|
||||||
- name: AXOLOTL_DO_NOT_TRACK
|
- name: AXOLOTL_DO_NOT_TRACK
|
||||||
value: "1"
|
value: "1"
|
||||||
@@ -38,7 +39,7 @@ spec:
|
|||||||
mountPath: /workspace/data/huggingface-cache
|
mountPath: /workspace/data/huggingface-cache
|
||||||
resources:
|
resources:
|
||||||
limits:
|
limits:
|
||||||
nvidia.com/gpu: 2
|
nvidia.com/gpu: 2 # number of GPUs to assign to this pod
|
||||||
initContainers:
|
initContainers:
|
||||||
- name: tensorboard
|
- name: tensorboard
|
||||||
image: python:3.11-slim
|
image: python:3.11-slim
|
||||||
|
|||||||
Reference in New Issue
Block a user