From 0c769289ebd6852b38db30ea48736dc71c176003 Mon Sep 17 00:00:00 2001
From: wozeparrot
Date: Thu, 5 Mar 2026 14:18:03 +0800
Subject: [PATCH] llama3: more scripts (#15107)

---
Reviewer notes (commentary only; this text sits between the three-dash
line and the first file diff and is not part of the commit or the diff):

* The patch was re-flowed to one record per line. The script payload is
  unchanged, so the hunk line counts below still match. The indentation of
  the "export LLAMA_LAYERS=2" lines was lost before the re-flow and is
  assumed to be two spaces - TODO confirm against the upstream commit.
* New llama405b dev_beam.sh and dev_run.sh: both default to DP=1 MP=8 BS=1
  EVAL_BS=1 and LLAMA3_SIZE=405B. dev_beam.sh exports FAKEDATA=1
  BENCHMARK=10 and, when FULL_LAYERS is empty or unset, LLAMA_LAYERS=2.
  dev_run.sh defaults GRADIENT_ACC_STEPS to 1152 and SEED to $RANDOM.
* Existing llama8b dev_beam.sh and dev_run.sh: add an MP default of 1 and
  change LR 4e-4 to 1e-3, END_LR 4e-5 to 1e-4, WARMUP_SAMPLES 256 to 4096.
  dev_beam.sh also exports DEVICE_IN_FUNCTION_BUG=1; dev_run.sh raises the
  GRADIENT_ACC_STEPS default from 2 to 4.
* New llama8b dev_beam_mp.sh and dev_run_mp.sh: default to DP=1 MP=8 BS=1
  EVAL_BS=1 with WQKV=1 and OFFLOAD_OPTIM=1. GRADIENT_ACC_STEPS defaults to
  2 in the beam script and 32 in the run script.
* Settings written as ${VAR:-default} can be overridden from the caller's
  environment; plain assignments (EMULATE, CHECK_OOB, MODEL, BASEDIR, LR,
  the BEAM_* limits, ...) are fixed by the script.
* Every script uses relative paths (PYTHONPATH="." and
  examples/mlperf/model_train.py), so they are presumably meant to be
  launched from the repository root - verify before running elsewhere.

 .../tinybox_8xMI350X/dev_beam.sh              | 36 ++++++++++++++++
 .../tinybox_8xMI350X/dev_run.sh               | 31 +++++++++++++
 .../tinybox_8xMI350X/dev_beam.sh              |  5 ++-
 .../tinybox_8xMI350X/dev_beam_mp.sh           | 43 +++++++++++++++++++
 .../tinybox_8xMI350X/dev_run.sh               |  4 +-
 .../tinybox_8xMI350X/dev_run_mp.sh            | 38 ++++++++++++++++
 6 files changed, 153 insertions(+), 4 deletions(-)
 create mode 100755 examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/llama405b/implementations/tinybox_8xMI350X/dev_beam.sh
 create mode 100755 examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/llama405b/implementations/tinybox_8xMI350X/dev_run.sh
 create mode 100755 examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/llama8b/implementations/tinybox_8xMI350X/dev_beam_mp.sh
 create mode 100755 examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/llama8b/implementations/tinybox_8xMI350X/dev_run_mp.sh

diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/llama405b/implementations/tinybox_8xMI350X/dev_beam.sh b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/llama405b/implementations/tinybox_8xMI350X/dev_beam.sh
new file mode 100755
index 0000000000..e32fab719d
--- /dev/null
+++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/llama405b/implementations/tinybox_8xMI350X/dev_beam.sh
@@ -0,0 +1,36 @@
+#!/usr/bin/env bash
+
+export PYTHONPATH="."
+export DEV=${DEV:-AMD}
+export EMULATE="AMD_CDNA4"
+export CHECK_OOB=0
+export REWRITE_STACK_LIMIT=5000000 HCQDEV_WAIT_TIMEOUT_MS=240000
+
+export DEBUG=${DEBUG:-2}
+export HK_FLASH_ATTENTION=${HK_FLASH_ATTENTION:-1}
+export ALL2ALL=${ALL2ALL:-1}
+export USE_ATOMICS=${USE_ATOMICS:-0}
+export ASM_GEMM=${ASM_GEMM:-1}
+export WQKV=${WQKV:-1}
+
+export DEFAULT_FLOAT="bfloat16" OPTIM_DTYPE="bfloat16"
+export DP=${DP:-1} MP=${MP:-8}
+export BS=${BS:-1} EVAL_BS=${EVAL_BS:-1} GRADIENT_ACC_STEPS=${GRADIENT_ACC_STEPS:-2}
+
+export MODEL="llama3"
+export BASEDIR="/raid/datasets/c4/"
+export LLAMA3_SIZE=${LLAMA3_SIZE:-"405B"}
+export SEQLEN=${SEQLEN:-8192}
+
+export SEED=${SEED:-5760}
+export DATA_SEED=${DATA_SEED:-5760}
+
+export JITBEAM=${JITBEAM:-3}
+export BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 BEAM_PADTO=1
+
+export FAKEDATA=1 BENCHMARK=10
+if [ -z "$FULL_LAYERS" ]; then
+  export LLAMA_LAYERS=2
+fi
+
+python3 examples/mlperf/model_train.py
diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/llama405b/implementations/tinybox_8xMI350X/dev_run.sh b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/llama405b/implementations/tinybox_8xMI350X/dev_run.sh
new file mode 100755
index 0000000000..66be620677
--- /dev/null
+++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/llama405b/implementations/tinybox_8xMI350X/dev_run.sh
@@ -0,0 +1,31 @@
+#!/usr/bin/env bash
+
+export PYTHONPATH="."
+export DEV=${DEV:-AMD}
+export EMULATE="AMD_CDNA4"
+export CHECK_OOB=0
+export REWRITE_STACK_LIMIT=5000000 HCQDEV_WAIT_TIMEOUT_MS=240000
+
+export DEBUG=${DEBUG:-0}
+export HK_FLASH_ATTENTION=${HK_FLASH_ATTENTION:-1}
+export ALL2ALL=${ALL2ALL:-1}
+export USE_ATOMICS=${USE_ATOMICS:-0}
+export ASM_GEMM=${ASM_GEMM:-1}
+export WQKV=${WQKV:-1}
+
+export DEFAULT_FLOAT="bfloat16" OPTIM_DTYPE="bfloat16"
+export DP=${DP:-1} MP=${MP:-8}
+export BS=${BS:-1} EVAL_BS=${EVAL_BS:-1} GRADIENT_ACC_STEPS=${GRADIENT_ACC_STEPS:-1152}
+
+export MODEL="llama3"
+export BASEDIR="/raid/datasets/c4/"
+export LLAMA3_SIZE=${LLAMA3_SIZE:-"405B"}
+export SEQLEN=${SEQLEN:-8192}
+
+export SEED=${SEED:-$RANDOM}
+export DATA_SEED=${DATA_SEED:-5760}
+
+export JITBEAM=${JITBEAM:-3}
+export BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 BEAM_PADTO=1
+
+python3 examples/mlperf/model_train.py
diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/llama8b/implementations/tinybox_8xMI350X/dev_beam.sh b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/llama8b/implementations/tinybox_8xMI350X/dev_beam.sh
index 62c1048632..277c340f5b 100755
--- a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/llama8b/implementations/tinybox_8xMI350X/dev_beam.sh
+++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/llama8b/implementations/tinybox_8xMI350X/dev_beam.sh
@@ -5,6 +5,7 @@ export DEV=${DEV:-AMD}
 export EMULATE="AMD_CDNA4"
 export CHECK_OOB=0
 export REWRITE_STACK_LIMIT=5000000 HCQDEV_WAIT_TIMEOUT_MS=240000
+export DEVICE_IN_FUNCTION_BUG=1
 
 export DEBUG=${DEBUG:-2}
 export HK_FLASH_ATTENTION=${HK_FLASH_ATTENTION:-1}
@@ -14,7 +15,7 @@ export ASM_GEMM=${ASM_GEMM:-1}
 export WQKV=${WQKV:-0}
 
 export DEFAULT_FLOAT="bfloat16" OPTIM_DTYPE="bfloat16"
-export DP=${DP:-8} BS=${BS:-8} EVAL_BS=${EVAL_BS:-8} GRADIENT_ACC_STEPS=${GRADIENT_ACC_STEPS:-2}
+export DP=${DP:-8} MP=${MP:-1} BS=${BS:-8} EVAL_BS=${EVAL_BS:-8} GRADIENT_ACC_STEPS=${GRADIENT_ACC_STEPS:-2}
 export GBS=$((BS * GRADIENT_ACC_STEPS))
 
 export MODEL="llama3"
@@ -22,7 +23,7 @@ export BASEDIR="/raid/datasets/c4-8b/"
 export SMALL=1
 export LLAMA3_SIZE=${LLAMA3_SIZE:-"8B"}
 export EVAL_TARGET=3.3 EVAL_FREQ=12288
-export LR="4e-4" END_LR="4e-5" WARMUP_SAMPLES=256 MAX_STEPS=1200000
+export LR="1e-3" END_LR="1e-4" WARMUP_SAMPLES=4096 MAX_STEPS=1200000
 export WARMUP_STEPS=$((WARMUP_SAMPLES / GBS))
 export SAMPLES=$((MAX_STEPS * GBS))
 export SEQLEN=${SEQLEN:-8192}
diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/llama8b/implementations/tinybox_8xMI350X/dev_beam_mp.sh b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/llama8b/implementations/tinybox_8xMI350X/dev_beam_mp.sh
new file mode 100755
index 0000000000..69e902ddbe
--- /dev/null
+++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/llama8b/implementations/tinybox_8xMI350X/dev_beam_mp.sh
@@ -0,0 +1,43 @@
+#!/usr/bin/env bash
+
+export PYTHONPATH="."
+export DEV=${DEV:-AMD}
+export EMULATE="AMD_CDNA4"
+export CHECK_OOB=0
+export REWRITE_STACK_LIMIT=5000000 HCQDEV_WAIT_TIMEOUT_MS=240000
+export DEVICE_IN_FUNCTION_BUG=1
+
+export DEBUG=${DEBUG:-2}
+export HK_FLASH_ATTENTION=${HK_FLASH_ATTENTION:-1}
+export ALL2ALL=${ALL2ALL:-1}
+export USE_ATOMICS=${USE_ATOMICS:-0}
+export ASM_GEMM=${ASM_GEMM:-1}
+export WQKV=${WQKV:-1}
+export OFFLOAD_OPTIM=${OFFLOAD_OPTIM:-1}
+
+export DEFAULT_FLOAT="bfloat16" OPTIM_DTYPE="bfloat16"
+export DP=${DP:-1} MP=${MP:-8} BS=${BS:-1} EVAL_BS=${EVAL_BS:-1} GRADIENT_ACC_STEPS=${GRADIENT_ACC_STEPS:-2}
+export GBS=$((BS * GRADIENT_ACC_STEPS))
+
+export MODEL="llama3"
+export BASEDIR="/raid/datasets/c4-8b/"
+export SMALL=1
+export LLAMA3_SIZE=${LLAMA3_SIZE:-"8B"}
+export EVAL_TARGET=3.3 EVAL_FREQ=12288
+export LR="1e-3" END_LR="1e-4" WARMUP_SAMPLES=4096 MAX_STEPS=1200000
+export WARMUP_STEPS=$((WARMUP_SAMPLES / GBS))
+export SAMPLES=$((MAX_STEPS * GBS))
+export SEQLEN=${SEQLEN:-8192}
+
+export SEED=${SEED:-5760}
+export DATA_SEED=${DATA_SEED:-5760}
+
+export JITBEAM=${JITBEAM:-3}
+export BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 BEAM_PADTO=1
+
+export FAKEDATA=1 BENCHMARK=10
+if [ -z "$FULL_LAYERS" ]; then
+  export LLAMA_LAYERS=2
+fi
+
+python3 examples/mlperf/model_train.py
diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/llama8b/implementations/tinybox_8xMI350X/dev_run.sh b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/llama8b/implementations/tinybox_8xMI350X/dev_run.sh
index 89c48e4d6d..5c175514c8 100755
--- a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/llama8b/implementations/tinybox_8xMI350X/dev_run.sh
+++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/llama8b/implementations/tinybox_8xMI350X/dev_run.sh
@@ -15,7 +15,7 @@ export ASM_GEMM=${ASM_GEMM:-1}
 export WQKV=${WQKV:-0}
 
 export DEFAULT_FLOAT="bfloat16" OPTIM_DTYPE="bfloat16"
-export DP=${DP:-8} BS=${BS:-8} EVAL_BS=${EVAL_BS:-8} GRADIENT_ACC_STEPS=${GRADIENT_ACC_STEPS:-2}
+export DP=${DP:-8} MP=${MP:-1} BS=${BS:-8} EVAL_BS=${EVAL_BS:-8} GRADIENT_ACC_STEPS=${GRADIENT_ACC_STEPS:-4}
 export GBS=$((BS * GRADIENT_ACC_STEPS))
 
 export MODEL="llama3"
@@ -23,7 +23,7 @@ export BASEDIR="/raid/datasets/c4-8b/"
 export SMALL=1
 export LLAMA3_SIZE=${LLAMA3_SIZE:-"8B"}
 export EVAL_TARGET=3.3 EVAL_FREQ=12288
-export LR="4e-4" END_LR="4e-5" WARMUP_SAMPLES=256 MAX_STEPS=1200000
+export LR="1e-3" END_LR="1e-4" WARMUP_SAMPLES=4096 MAX_STEPS=1200000
 export WARMUP_STEPS=$((WARMUP_SAMPLES / GBS))
 export SAMPLES=$((MAX_STEPS * GBS))
 export SEQLEN=${SEQLEN:-8192}
diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/llama8b/implementations/tinybox_8xMI350X/dev_run_mp.sh b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/llama8b/implementations/tinybox_8xMI350X/dev_run_mp.sh
new file mode 100755
index 0000000000..04b40de827
--- /dev/null
+++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/llama8b/implementations/tinybox_8xMI350X/dev_run_mp.sh
@@ -0,0 +1,38 @@
+#!/usr/bin/env bash
+
+export PYTHONPATH="."
+export DEV=${DEV:-AMD}
+export EMULATE="AMD_CDNA4"
+export CHECK_OOB=0
+export REWRITE_STACK_LIMIT=5000000 HCQDEV_WAIT_TIMEOUT_MS=240000
+export DEVICE_IN_FUNCTION_BUG=1
+
+export DEBUG=${DEBUG:-0}
+export HK_FLASH_ATTENTION=${HK_FLASH_ATTENTION:-1}
+export ALL2ALL=${ALL2ALL:-1}
+export USE_ATOMICS=${USE_ATOMICS:-0}
+export ASM_GEMM=${ASM_GEMM:-1}
+export WQKV=${WQKV:-1}
+export OFFLOAD_OPTIM=${OFFLOAD_OPTIM:-1}
+
+export DEFAULT_FLOAT="bfloat16" OPTIM_DTYPE="bfloat16"
+export DP=${DP:-1} MP=${MP:-8} BS=${BS:-1} EVAL_BS=${EVAL_BS:-1} GRADIENT_ACC_STEPS=${GRADIENT_ACC_STEPS:-32}
+export GBS=$((BS * GRADIENT_ACC_STEPS))
+
+export MODEL="llama3"
+export BASEDIR="/raid/datasets/c4-8b/"
+export SMALL=1
+export LLAMA3_SIZE=${LLAMA3_SIZE:-"8B"}
+export EVAL_TARGET=3.3 EVAL_FREQ=12288
+export LR="1e-3" END_LR="1e-4" WARMUP_SAMPLES=4096 MAX_STEPS=1200000
+export WARMUP_STEPS=$((WARMUP_SAMPLES / GBS))
+export SAMPLES=$((MAX_STEPS * GBS))
+export SEQLEN=${SEQLEN:-8192}
+
+export SEED=${SEED:-$RANDOM}
+export DATA_SEED=${DATA_SEED:-5760}
+
+export JITBEAM=${JITBEAM:-3}
+export BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 BEAM_PADTO=1
+
+python3 examples/mlperf/model_train.py