# Mirror of https://github.com/acon96/home-llm.git
# Synced 2026-01-07 21:04:08 -05:00 (104 lines, 3.2 KiB, YAML)
---
# Kubernetes Job that runs `axolotl train` against
# /workspace/configs/MODEL_NAME.yml on a GPU node, after a dataset
# preprocessing init step and alongside a TensorBoard sidecar.
#
# NOTE(review): MODEL_NAME looks like a placeholder that is substituted before
# the manifest is submitted — confirm against whatever launches this job.
apiVersion: batch/v1
kind: Job
metadata:
  # generateName (rather than name) makes the API server append a random
  # suffix, so submit this with `kubectl create`; `kubectl apply` needs a
  # fixed name.
  generateName: training-job-MODEL_NAME
  namespace: ai
  labels:
    app: training-job
spec:
  ttlSecondsAfterFinished: 604800 # 7 days (7 * 24 * 60 * 60)
  template:
    metadata:
      labels:
        app: training-job
    spec:
      containers:
        # Main workload: the training run itself.
        - name: axolotl
          image: axolotlai/axolotl-cloud:main-py3.11-cu128-2.8.0
          imagePullPolicy: IfNotPresent
          command:
            - axolotl
            - train
            - /workspace/configs/MODEL_NAME.yml
          env:
            # Presumably opts out of Axolotl telemetry — confirm.
            - name: AXOLOTL_DO_NOT_TRACK
              value: "1"
            # Hugging Face token, read from key `token` of Secret `hf-token`.
            - name: HF_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-token
                  key: token
          volumeMounts:
            - name: training-runs
              mountPath: /workspace/data/training-runs
            - name: training-data
              mountPath: /workspace/data/datasets
            - name: training-configs
              mountPath: /workspace/configs
            - name: hf-cache
              mountPath: /workspace/data/huggingface-cache
          resources:
            limits:
              nvidia.com/gpu: 2 # number of GPUs to assign to this pod
      initContainers:
        # Runs `axolotl preprocess` on the same config before training starts.
        # Uses the same image, env and mounts as the training container, but
        # requests no GPU resources.
        - name: preprocess-dataset
          image: axolotlai/axolotl-cloud:main-py3.11-cu128-2.8.0
          imagePullPolicy: IfNotPresent
          command:
            - axolotl
            - preprocess
            - /workspace/configs/MODEL_NAME.yml
            - --debug
          env:
            - name: AXOLOTL_DO_NOT_TRACK
              value: "1"
            - name: HF_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-token
                  key: token
          volumeMounts:
            - name: training-runs
              mountPath: /workspace/data/training-runs
            - name: training-data
              mountPath: /workspace/data/datasets
            - name: training-configs
              mountPath: /workspace/configs
            - name: hf-cache
              mountPath: /workspace/data/huggingface-cache
        # TensorBoard sidecar: installs tensorboard with pip at startup and
        # serves the training-runs directory on port 8080.
        - name: tensorboard
          image: python:3.11-slim
          imagePullPolicy: IfNotPresent
          restartPolicy: Always # mark as sidecar
          command:
            - bash
            - -c
            - "pip3 install tensorboard && tensorboard --logdir=/workspace/data/training-runs --host=0.0.0.0 --port 8080"
          ports:
            - containerPort: 8080
              name: tensorboard
              protocol: TCP
          volumeMounts:
            - name: training-runs
              mountPath: /workspace/data/training-runs
      runtimeClassName: nvidia
      # Schedule only onto nodes labelled nvidia.com/gpu=true.
      nodeSelector:
        nvidia.com/gpu: "true"
      restartPolicy: OnFailure
      # All volumes are hostPath directories under /mnt/data on the node;
      # DirectoryOrCreate creates the directory if it does not exist yet.
      volumes:
        - name: training-runs
          hostPath:
            path: /mnt/data/training-runs
            type: DirectoryOrCreate
        - name: training-data
          hostPath:
            path: /mnt/data/training-data
            type: DirectoryOrCreate
        - name: training-configs
          hostPath:
            path: /mnt/data/training-configs
            type: DirectoryOrCreate
        - name: hf-cache
          hostPath:
            path: /mnt/data/hf-cache
            type: DirectoryOrCreate