Files
home-llm/train/training-job.yml
2025-12-21 14:14:31 -05:00

104 lines
3.2 KiB
YAML

---
# Kubernetes Job that fine-tunes a model with Axolotl on a GPU node.
#
# Pod layout (init containers run in the order listed):
#   1. initContainer "preprocess-dataset" runs `axolotl preprocess` to
#      completion.
#   2. initContainer "tensorboard" is a sidecar (restartPolicy: Always) that
#      serves the training-runs directory on port 8080 while the pod lives.
#   3. container "axolotl" runs `axolotl train` with 2 GPUs.
#
# MODEL_NAME looks like a placeholder that is substituted before submission
# (TODO confirm against the launch script).
# The manifest uses metadata.generateName, so submit it with
# `kubectl create -f`; `kubectl apply` does not support generateName.
apiVersion: batch/v1
kind: Job
metadata:
  # The API server appends a random suffix to this prefix; the trailing
  # hyphen keeps the suffix visually separate from the model name.
  generateName: training-job-MODEL_NAME-
  namespace: ai
  labels:
    app: training-job
spec:
  # Finished Jobs (and their pods) are garbage-collected after this delay.
  ttlSecondsAfterFinished: 604800  # 7 days (7 * 24 * 60 * 60)
  # NOTE(review): backoffLimit is not set, so the Kubernetes default applies.
  # Confirm that many retries are acceptable for a multi-GPU training run.
  template:
    metadata:
      labels:
        app: training-job
    spec:
      containers:
        # Main workload: the actual training run.
        - name: axolotl
          image: axolotlai/axolotl-cloud:main-py3.11-cu128-2.8.0
          imagePullPolicy: IfNotPresent
          command:
            - axolotl
            - train
            - /workspace/configs/MODEL_NAME.yml
          env:
            # Presumably opts out of Axolotl telemetry -- TODO confirm.
            - name: AXOLOTL_DO_NOT_TRACK
              value: "1"
            # Hugging Face token, read from the "hf-token" Secret in this
            # namespace rather than being stored in the manifest.
            - name: HF_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-token
                  key: token
          volumeMounts:
            - name: training-runs
              mountPath: /workspace/data/training-runs
            - name: training-data
              mountPath: /workspace/data/datasets
            - name: training-configs
              mountPath: /workspace/configs
            - name: hf-cache
              mountPath: /workspace/data/huggingface-cache
          resources:
            limits:
              nvidia.com/gpu: 2  # number of GPUs to assign to this pod
      initContainers:
        # Runs once before training; same image, env and mounts as the main
        # container, but no GPU limit is requested here.
        - name: preprocess-dataset
          image: axolotlai/axolotl-cloud:main-py3.11-cu128-2.8.0
          imagePullPolicy: IfNotPresent
          command:
            - axolotl
            - preprocess
            - /workspace/configs/MODEL_NAME.yml
            - --debug
          env:
            - name: AXOLOTL_DO_NOT_TRACK
              value: "1"
            - name: HF_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-token
                  key: token
          volumeMounts:
            - name: training-runs
              mountPath: /workspace/data/training-runs
            - name: training-data
              mountPath: /workspace/data/datasets
            - name: training-configs
              mountPath: /workspace/configs
            - name: hf-cache
              mountPath: /workspace/data/huggingface-cache
        # TensorBoard sidecar over the shared training-runs volume.
        - name: tensorboard
          image: python:3.11-slim
          imagePullPolicy: IfNotPresent
          restartPolicy: Always  # mark as sidecar
          command:
            - bash
            - -c
            # tensorboard is installed at container start because the slim
            # Python image does not ship it. `exec` replaces the shell so
            # tensorboard receives the termination signal directly instead
            # of depending on bash to relay it when the pod shuts down.
            - >-
              pip3 install tensorboard &&
              exec tensorboard
              --logdir=/workspace/data/training-runs
              --host=0.0.0.0
              --port 8080
          ports:
            - containerPort: 8080
              name: tensorboard
              protocol: TCP
          volumeMounts:
            - name: training-runs
              mountPath: /workspace/data/training-runs
      # Run the pod's containers under the "nvidia" RuntimeClass and only on
      # nodes labelled nvidia.com/gpu=true.
      runtimeClassName: nvidia
      nodeSelector:
        nvidia.com/gpu: "true"
      # Failed containers are restarted in place inside the same pod.
      restartPolicy: OnFailure
      # All storage is hostPath, i.e. local to whichever node runs the pod;
      # DirectoryOrCreate creates a missing directory on that node.
      volumes:
        - name: training-runs
          hostPath:
            path: /mnt/data/training-runs
            type: DirectoryOrCreate
        - name: training-data
          hostPath:
            path: /mnt/data/training-data
            type: DirectoryOrCreate
        - name: training-configs
          hostPath:
            path: /mnt/data/training-configs
            type: DirectoryOrCreate
        - name: hf-cache
          hostPath:
            path: /mnt/data/hf-cache
            type: DirectoryOrCreate