diff --git a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/stable_diffusion/implementations/tinybox_8xMI300X/dev_run.sh b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/stable_diffusion/implementations/tinybox_8xMI300X/dev_run.sh
new file mode 100755
index 0000000000..5e35ff65a4
--- /dev/null
+++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/stable_diffusion/implementations/tinybox_8xMI300X/dev_run.sh
@@ -0,0 +1,79 @@
+#!/usr/bin/env bash
+
+DATETIME=${2:-$(date "+%m%d%H%M")}
+LOGFILE="${HOME}/logs/sd_mi300x_${DATETIME}.log"
+# UNET_CKPTDIR must be set: training saves checkpoints to this path, then a separate eval process scans this path to know which checkpoints to eval
+export UNET_CKPTDIR="${HOME}/stable_diffusion/training_checkpoints/${DATETIME}"
+mkdir -p "${HOME}/logs" "$UNET_CKPTDIR"
+
+# with --bg, relaunch this script detached in the background (logging to $LOGFILE) and exit
+if [[ "${1:-}" == "--bg" ]]; then
+  echo "logging output to $LOGFILE"
+  echo "saving UNet checkpoints to $UNET_CKPTDIR"
+  script_path="$(readlink -f "${BASH_SOURCE[0]}")"
+  nohup bash "$script_path" run "$DATETIME" >"$LOGFILE" 2>&1 & disown $!
+  exit 0
+fi
+
+# venv management
+if [[ -d .venv-sd-mlperf ]]; then
+  . .venv-sd-mlperf/bin/activate
+else
+  python3 -m venv .venv-sd-mlperf && . .venv-sd-mlperf/bin/activate
+  pip install --index-url https://download.pytorch.org/whl/cpu torch && pip install tqdm numpy ftfy regex pillow scipy wandb webdataset
+fi
+# log package and driver versions for reproducibility
+pip list
+apt list --installed | grep amdgpu
+rocm-smi --version
+modinfo amdgpu | grep version
+
+export BEAM=2 BEAM_UOPS_MAX=8000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 IGNORE_JIT_FIRST_BEAM=1 HCQDEV_WAIT_TIMEOUT_MS=300000
+export AMD_LLVM=0 # bf16 seems to require this
+export DATADIR="/raid/datasets/stable_diffusion"
+export CKPTDIR="/raid/weights/stable_diffusion"
+export EVAL_CKPT_DIR="$UNET_CKPTDIR"
+export MODEL="stable_diffusion" PYTHONPATH="."
+export GPUS=8 BS=304
+export CONTEXT_BS=816 DENOISE_BS=600 DECODE_BS=384 INCEPTION_BS=560 CLIP_BS=240
+export WANDB=1
+export PARALLEL=4
+export PYTHONUNBUFFERED=1
+sudo rocm-smi -d 0 1 2 3 4 5 6 7 --setperfdeterminism 1500 || exit 1
+
+# Retry the command if it fails before BEAM COMPLETE is printed, but not after that; returns 0 on success, 1 on failure after BEAM COMPLETE, 2 when retries are exhausted
+run_retry(){ local try=0 max=5 code tmp py pgid kids
+  while :; do
+    tmp=$(mktemp)
+    setsid bash -c 'exec env "$@"' _ "$@" > >(tee -a "$LOGFILE" | tee "$tmp") 2>&1 &
+    py=$!; pgid=$(ps -o pgid= -p "$py" | tr -d ' ')
+    wait "$py"; code=$?
+    [[ -n "$pgid" ]] && { kill -TERM -"$pgid" 2>/dev/null; sleep 1; kill -KILL -"$pgid" 2>/dev/null; }
+    kids=$(pgrep -P "$py" || true)
+    while [[ -n "$kids" ]]; do
+      kill -TERM $kids 2>/dev/null; sleep 0.5
+      kids=$(for k in $kids; do pgrep -P "$k" || true; done)
+    done
+    ((code==0)) && { rm -f "$tmp"; return 0; }
+    grep -q 'BEAM COMPLETE' "$tmp" && { rm -f "$tmp"; return 1; }
+    rm -f "$tmp"
+    ((try>=max)) && return 2
+    ((try++)); sleep 90; echo "try = ${try}"
+  done
+}
+
+# Run at the full 750W power cap; limiting to 400W is only needed if the GPUs fall out of sync at higher power (observed at 450W), which increases train time ~2.2x
+sudo rocm-smi -d 0 1 2 3 4 5 6 7 --setpoweroverdrive 750 || exit 1
+run_retry TOTAL_CKPTS=7 python3 examples/mlperf/model_train.py; (( $? == 2 )) && { echo "training failed before BEAM completion"; exit 2; }
+sleep 90
+
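+# First eval pass runs under run_retry so that a crash during the eval BEAM search gets retried
+# (assumption: EVAL_SAMPLES=600 limits this pass to a subset of samples, ahead of the full eval below)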
+run_retry EVAL_SAMPLES=600 python3 examples/mlperf/model_eval.py; (( $? == 2 )) && { echo "eval failed before BEAM completion"; exit 2; }
+# Checkpoints are evaluated in reverse chronological order, even if the training above crashed early
+# STOP_IF_CONVERGED=1: stop the eval the first time convergence is detected; no further checkpoints are evaluated after that.
+STOP_IF_CONVERGED=1 python3 examples/mlperf/model_eval.py
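+
+# usage sketch (assumed invocations; matches the arg handling at the top of this script):
+#   ./dev_run.sh --bg           # start a detached run; DATETIME defaults to $(date "+%m%d%H%M")
+#   ./dev_run.sh run 01311200   # run in the foreground under the example DATETIME tag 01311200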