add imatrix quants and k8s training job

This commit is contained in:
Alex O'Connell
2025-12-13 18:25:17 -05:00
parent dac9973cb5
commit edc09c40e7
2 changed files with 99 additions and 11 deletions

View File

@@ -1,36 +1,53 @@
#!/bin/bash
# Converts a model directory to GGUF with $LLAMA_CPP/convert_hf_to_gguf.py,
# generates an imatrix file for it, and quantizes the converted model to each
# entry of DESIRED_QUANTS. Every step is skipped when its output file exists.
#
# NOTE(review): this listing looks like a rendered diff with removed and added
# lines interleaved and no +/- markers. As shown it has more `if` than `fi`
# and two `done` for a single `for`, so it is not runnable verbatim. Lines that
# hard-code ./models or f16 are presumably the pre-change versions of the
# $MODELS_DIR / $OUT_TYPE lines that follow them -- confirm against the repo.
#
# Arguments:
#   $1 - model name (subdirectory of the models directory)
#   $2 - output type passed to --outtype for the conversion (default: f16)
#   $3 - models directory (default: ./models)
#   $4 - path to the llama.cpp checkout (default: ./llama.cpp)

# Abort on the first command that fails.
set -e
# NOTE(review): this assignment is overwritten by the ${4:-...} form below;
# presumably it is the removed side of the diff.
LLAMA_CPP=../llama.cpp
MODEL_NAME=$1
OUT_TYPE=${2:-"f16"}
MODELS_DIR=${3:-"./models"}
LLAMA_CPP=${4:-"./llama.cpp"}
# Refuse to continue when the model directory does not exist.
if [[ ! -d "./models/$MODEL_NAME" ]]; then
if [[ ! -d "$MODELS_DIR/$MODEL_NAME" ]]; then
echo "Unknown model $MODEL_NAME"
# NOTE(review): -1 is outside the 0-255 exit status range; `exit 1` would be
# the conventional spelling -- confirm no caller depends on the current value.
exit -1
fi
# Optional per-model metadata: when gguf_overrides.json is present it is passed
# to the converter through --metadata, otherwise OVERRIDES stays empty.
if [ -f "./models/$MODEL_NAME/gguf_overrides.json" ]; then
OVERRIDES="--metadata ./models/$MODEL_NAME/gguf_overrides.json"
echo "Using metadata from ./models/$MODEL_NAME/gguf_overrides.json"
if [ -f "$MODELS_DIR/$MODEL_NAME/gguf_overrides.json" ]; then
OVERRIDES="--metadata $MODELS_DIR/$MODEL_NAME/gguf_overrides.json"
echo "Using metadata from $MODELS_DIR/$MODEL_NAME/gguf_overrides.json"
else
OVERRIDES=""
fi
# Step 1: convert the model directory to a single GGUF file, unless that file
# is already there.
# NOTE(review): the expansions on the command lines are unquoted, so paths or
# model names containing whitespace would be word-split. $OVERRIDES relies on
# that splitting to become two arguments.
echo "Converting to GGUF..."
if [ ! -f "./models/$MODEL_NAME/$MODEL_NAME.f16.gguf" ]; then
$LLAMA_CPP/convert_hf_to_gguf.py --outfile ./models/$MODEL_NAME/$MODEL_NAME.f16.gguf --outtype f16 ./models/$MODEL_NAME/ $OVERRIDES
if [ ! -f "$MODELS_DIR/$MODEL_NAME/$MODEL_NAME.$OUT_TYPE.gguf" ]; then
$LLAMA_CPP/convert_hf_to_gguf.py --outfile $MODELS_DIR/$MODEL_NAME/$MODEL_NAME.$OUT_TYPE.gguf --outtype $OUT_TYPE $MODELS_DIR/$MODEL_NAME/ $OVERRIDES
else
# NOTE(review): the message below reads as if a word is missing after "for".
echo "Converted model for already exists. Skipping..."
fi
# Step 2: make sure the text file used for the imatrix run is available in the
# current working directory, downloading it once with wget if it is missing.
echo "Generate imatrix for model..."
if [ ! -f "groups_merged.txt" ]; then
echo "Downloading groups_merged.txt..."
wget https://huggingface.co/datasets/froggeric/imatrix/resolve/main/groups_merged.txt
fi
# NOTE(review): DESIRED_QUANTS is assigned again further down and that later
# list is the one the loop iterates; this one is presumably the removed line.
DESIRED_QUANTS=("Q8_0" "Q5_K_M" "Q4_0" "Q4_1" "Q4_K_M")
# Run llama-imatrix on the converted model with groups_merged.txt as input and
# write $MODEL_NAME.imatrix.gguf next to it, unless that file already exists.
# NOTE(review): -ngl 999 / -c 512 presumably select GPU offload and context
# size -- confirm against the llama-imatrix help output.
if [ ! -f "$MODELS_DIR/$MODEL_NAME/$MODEL_NAME.imatrix.gguf" ]; then
$LLAMA_CPP/build/bin/llama-imatrix -m $MODELS_DIR/$MODEL_NAME/$MODEL_NAME.$OUT_TYPE.gguf -ngl 999 -c 512 -f groups_merged.txt -o $MODELS_DIR/$MODEL_NAME/$MODEL_NAME.imatrix.gguf
else
echo "Imatrix model already exists. Skipping..."
fi
# Step 3: quantize the converted model once per entry in DESIRED_QUANTS.
DESIRED_QUANTS=("Q8_0" "Q6_K" "Q5_K_M" "Q4_0" "Q4_1" "Q3_K_M" "IQ4_NL" "IQ4_XS")
for QUANT in "${DESIRED_QUANTS[@]}"
do
echo "Quantizing to $QUANT..."
# Output files use the lower-cased quant name as their suffix.
QUANT_LOWER=$(echo "$QUANT" | awk '{print tolower($0)}')
if [ ! -f "./models/$MODEL_NAME/$MODEL_NAME.$QUANT_LOWER.gguf" ]; then
$LLAMA_CPP/build/bin/llama-quantize ./models/$MODEL_NAME/$MODEL_NAME.f16.gguf ./models/$MODEL_NAME/$MODEL_NAME.$QUANT_LOWER.gguf $QUANT
if [ ! -f "$MODELS_DIR/$MODEL_NAME/$MODEL_NAME.$QUANT_LOWER.gguf" ]; then
# This variant additionally passes the imatrix file produced in step 2.
$LLAMA_CPP/build/bin/llama-quantize --imatrix $MODELS_DIR/$MODEL_NAME/$MODEL_NAME.imatrix.gguf $MODELS_DIR/$MODEL_NAME/$MODEL_NAME.$OUT_TYPE.gguf $MODELS_DIR/$MODEL_NAME/$MODEL_NAME.$QUANT_LOWER.gguf $QUANT
else
echo "Quantized model for '$QUANT' already exists. Skipping..."
fi
done
done
echo "All done!"

71
train/training-job.yml Normal file
View File

@@ -0,0 +1,71 @@
# Kubernetes Job that runs an axolotl training command alongside a TensorBoard
# container in the same pod. The name is generated from the "training-job-"
# prefix, so the manifest has to be submitted with `kubectl create`, which
# supports generateName, rather than `kubectl apply`.
apiVersion: batch/v1
kind: Job
metadata:
  generateName: training-job-
  namespace: ai
  labels:
    app: training-job
spec:
  template:
    metadata:
      labels:
        app: training-job
    spec:
      containers:
        # Training container: runs `axolotl train` against a config file that
        # is read from the training-configs volume.
        - name: axolotl
          image: axolotlai/axolotl-cloud:main-py3.11-cu128-2.8.0
          imagePullPolicy: IfNotPresent
          command:
            - axolotl
            - train
            - /workspace/configs/gemma3-270m.yml
          env:
            - name: AXOLOTL_DO_NOT_TRACK
              value: "1"
          volumeMounts:
            - name: training-runs
              mountPath: /workspace/data/training-runs
            - name: training-data
              mountPath: /workspace/data/datasets
            - name: training-configs
              mountPath: /workspace/configs
            - name: hf-cache
              mountPath: /workspace/data/huggingface-cache
          resources:
            limits:
              nvidia.com/gpu: 2
        # TensorBoard container: installs tensorboard at start-up and serves
        # the shared training-runs directory on port 8080.
        # NOTE(review): this server does not exit on its own, so the pod may
        # never reach completion after training finishes -- confirm that this
        # is intended for the Job.
        - name: tensorboard
          image: python:3.11-slim
          imagePullPolicy: IfNotPresent
          command:
            - bash
            - -c
            - "pip3 install tensorboard && tensorboard --logdir=/workspace/data/training-runs --host=0.0.0.0 --port 8080"
          ports:
            - containerPort: 8080
              name: tensorboard
              protocol: TCP
          volumeMounts:
            - name: training-runs
              mountPath: /workspace/data/training-runs
      # Pod-level scheduling: use the nvidia runtime class and only nodes that
      # carry the nvidia.com/gpu=true label.
      runtimeClassName: nvidia
      nodeSelector:
        nvidia.com/gpu: "true"
      restartPolicy: OnFailure
      # All volumes are hostPath directories under /mnt/data on the node,
      # created on first use if they do not exist.
      volumes:
        - name: training-runs
          hostPath:
            path: /mnt/data/training-runs
            type: DirectoryOrCreate
        - name: training-data
          hostPath:
            path: /mnt/data/training-data
            type: DirectoryOrCreate
        - name: training-configs
          hostPath:
            path: /mnt/data/training-configs
            type: DirectoryOrCreate
        - name: hf-cache
          hostPath:
            path: /mnt/data/hf-cache
            type: DirectoryOrCreate