diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/llama8b/implementations/tinybox_8xMI350X/dev_beam.sh b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/llama8b/implementations/tinybox_8xMI350X/dev_beam.sh
new file mode 100755
index 0000000000..e5e32ed339
--- /dev/null
+++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/llama8b/implementations/tinybox_8xMI350X/dev_beam.sh
@@ -0,0 +1,37 @@
+#!/usr/bin/env bash
+# Dev launcher for the Llama3-8B MLPerf training benchmark on a tinybox 8xMI350X.
+# Every knob is exported as an env var consumed by examples/mlperf/model_train.py.
+
+export PYTHONPATH="."
+export DEV=${DEV:-AMD}
+export IGNORE_OOB=1
+export REWRITE_STACK_LIMIT=5000000 HCQDEV_WAIT_TIMEOUT_MS=240000
+
+export DEBUG=${DEBUG:-2}
+export FLASH_ATTENTION=${FLASH_ATTENTION:-1}
+export ALL2ALL=${ALL2ALL:-1}
+
+# Precision and batch geometry: global batch size = BS * GRADIENT_ACC_STEPS.
+export DEFAULT_FLOAT="bfloat16" OPTIM_DTYPE="bfloat16"
+export DP=8 BS=8 EVAL_BS=8 GRADIENT_ACC_STEPS=1
+export GBS=$((BS * GRADIENT_ACC_STEPS))
+
+export MODEL="llama3"
+export BASEDIR="/raid/datasets/c4-8b/"
+export SMALL=1
+export LLAMA3_SIZE=${LLAMA3_SIZE:-"8B"}
+export EVAL_TARGET=3.3 EVAL_FREQ=12288
+export LR="4e-4" END_LR="4e-5" WARMUP_SAMPLES=256 MAX_STEPS=1200000
+export WARMUP_STEPS=$((WARMUP_SAMPLES / GBS))
+export SAMPLES=$((MAX_STEPS * GBS))
+
+export SEED=5760
+
+# BEAM kernel-search tuning knobs.
+export JITBEAM=3
+export BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
+
+# Dev-only shortcuts: fake data, short benchmark run, truncated model
+# (NOTE(review): presumably 10 benchmark steps / 2 layers — confirm in model_train.py).
+export FAKEDATA=1 BENCHMARK=10 LLAMA_LAYERS=2
+
+python3 examples/mlperf/model_train.py