llama3: more scripts (#15107)

This commit is contained in:
wozeparrot
2026-03-05 14:18:03 +08:00
committed by GitHub
parent fb43b415f9
commit 0c769289eb
6 changed files with 153 additions and 4 deletions

View File

@@ -0,0 +1,36 @@
#!/usr/bin/env bash
export PYTHONPATH="."
export DEV=${DEV:-AMD}
export EMULATE="AMD_CDNA4"
export CHECK_OOB=0
export REWRITE_STACK_LIMIT=5000000 HCQDEV_WAIT_TIMEOUT_MS=240000
export DEBUG=${DEBUG:-2}
export HK_FLASH_ATTENTION=${HK_FLASH_ATTENTION:-1}
export ALL2ALL=${ALL2ALL:-1}
export USE_ATOMICS=${USE_ATOMICS:-0}
export ASM_GEMM=${ASM_GEMM:-1}
export WQKV=${WQKV:-1}
export DEFAULT_FLOAT="bfloat16" OPTIM_DTYPE="bfloat16"
export DP=${DP:-1} MP=${MP:-8}
export BS=${BS:-1} EVAL_BS=${EVAL_BS:-1} GRADIENT_ACC_STEPS=${GRADIENT_ACC_STEPS:-2}
export MODEL="llama3"
export BASEDIR="/raid/datasets/c4/"
export LLAMA3_SIZE=${LLAMA3_SIZE:-"405B"}
export SEQLEN=${SEQLEN:-8192}
export SEED=${SEED:-5760}
export DATA_SEED=${DATA_SEED:-5760}
export JITBEAM=${JITBEAM:-3}
export BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 BEAM_PADTO=1
export FAKEDATA=1 BENCHMARK=10
if [ -z "$FULL_LAYERS" ]; then
export LLAMA_LAYERS=2
fi
python3 examples/mlperf/model_train.py

View File

@@ -0,0 +1,31 @@
#!/usr/bin/env bash
export PYTHONPATH="."
export DEV=${DEV:-AMD}
export EMULATE="AMD_CDNA4"
export CHECK_OOB=0
export REWRITE_STACK_LIMIT=5000000 HCQDEV_WAIT_TIMEOUT_MS=240000
export DEBUG=${DEBUG:-0}
export HK_FLASH_ATTENTION=${HK_FLASH_ATTENTION:-1}
export ALL2ALL=${ALL2ALL:-1}
export USE_ATOMICS=${USE_ATOMICS:-0}
export ASM_GEMM=${ASM_GEMM:-1}
export WQKV=${WQKV:-1}
export DEFAULT_FLOAT="bfloat16" OPTIM_DTYPE="bfloat16"
export DP=${DP:-1} MP=${MP:-8}
export BS=${BS:-1} EVAL_BS=${EVAL_BS:-1} GRADIENT_ACC_STEPS=${GRADIENT_ACC_STEPS:-1152}
export MODEL="llama3"
export BASEDIR="/raid/datasets/c4/"
export LLAMA3_SIZE=${LLAMA3_SIZE:-"405B"}
export SEQLEN=${SEQLEN:-8192}
export SEED=${SEED:-$RANDOM}
export DATA_SEED=${DATA_SEED:-5760}
export JITBEAM=${JITBEAM:-3}
export BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 BEAM_PADTO=1
python3 examples/mlperf/model_train.py

View File

@@ -5,6 +5,7 @@ export DEV=${DEV:-AMD}
export EMULATE="AMD_CDNA4"
export CHECK_OOB=0
export REWRITE_STACK_LIMIT=5000000 HCQDEV_WAIT_TIMEOUT_MS=240000
export DEVICE_IN_FUNCTION_BUG=1
export DEBUG=${DEBUG:-2}
export HK_FLASH_ATTENTION=${HK_FLASH_ATTENTION:-1}
@@ -14,7 +15,7 @@ export ASM_GEMM=${ASM_GEMM:-1}
export WQKV=${WQKV:-0}
export DEFAULT_FLOAT="bfloat16" OPTIM_DTYPE="bfloat16"
export DP=${DP:-8} BS=${BS:-8} EVAL_BS=${EVAL_BS:-8} GRADIENT_ACC_STEPS=${GRADIENT_ACC_STEPS:-2}
export DP=${DP:-8} MP=${MP:-1} BS=${BS:-8} EVAL_BS=${EVAL_BS:-8} GRADIENT_ACC_STEPS=${GRADIENT_ACC_STEPS:-2}
export GBS=$((BS * GRADIENT_ACC_STEPS))
export MODEL="llama3"
@@ -22,7 +23,7 @@ export BASEDIR="/raid/datasets/c4-8b/"
export SMALL=1
export LLAMA3_SIZE=${LLAMA3_SIZE:-"8B"}
export EVAL_TARGET=3.3 EVAL_FREQ=12288
export LR="4e-4" END_LR="4e-5" WARMUP_SAMPLES=256 MAX_STEPS=1200000
export LR="1e-3" END_LR="1e-4" WARMUP_SAMPLES=4096 MAX_STEPS=1200000
export WARMUP_STEPS=$((WARMUP_SAMPLES / GBS))
export SAMPLES=$((MAX_STEPS * GBS))
export SEQLEN=${SEQLEN:-8192}

View File

@@ -0,0 +1,43 @@
#!/usr/bin/env bash
export PYTHONPATH="."
export DEV=${DEV:-AMD}
export EMULATE="AMD_CDNA4"
export CHECK_OOB=0
export REWRITE_STACK_LIMIT=5000000 HCQDEV_WAIT_TIMEOUT_MS=240000
export DEVICE_IN_FUNCTION_BUG=1
export DEBUG=${DEBUG:-2}
export HK_FLASH_ATTENTION=${HK_FLASH_ATTENTION:-1}
export ALL2ALL=${ALL2ALL:-1}
export USE_ATOMICS=${USE_ATOMICS:-0}
export ASM_GEMM=${ASM_GEMM:-1}
export WQKV=${WQKV:-1}
export OFFLOAD_OPTIM=${OFFLOAD_OPTIM:-1}
export DEFAULT_FLOAT="bfloat16" OPTIM_DTYPE="bfloat16"
export DP=${DP:-1} MP=${MP:-8} BS=${BS:-1} EVAL_BS=${EVAL_BS:-1} GRADIENT_ACC_STEPS=${GRADIENT_ACC_STEPS:-2}
export GBS=$((BS * GRADIENT_ACC_STEPS))
export MODEL="llama3"
export BASEDIR="/raid/datasets/c4-8b/"
export SMALL=1
export LLAMA3_SIZE=${LLAMA3_SIZE:-"8B"}
export EVAL_TARGET=3.3 EVAL_FREQ=12288
export LR="1e-3" END_LR="1e-4" WARMUP_SAMPLES=4096 MAX_STEPS=1200000
export WARMUP_STEPS=$((WARMUP_SAMPLES / GBS))
export SAMPLES=$((MAX_STEPS * GBS))
export SEQLEN=${SEQLEN:-8192}
export SEED=${SEED:-5760}
export DATA_SEED=${DATA_SEED:-5760}
export JITBEAM=${JITBEAM:-3}
export BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 BEAM_PADTO=1
export FAKEDATA=1 BENCHMARK=10
if [ -z "$FULL_LAYERS" ]; then
export LLAMA_LAYERS=2
fi
python3 examples/mlperf/model_train.py

View File

@@ -15,7 +15,7 @@ export ASM_GEMM=${ASM_GEMM:-1}
export WQKV=${WQKV:-0}
export DEFAULT_FLOAT="bfloat16" OPTIM_DTYPE="bfloat16"
export DP=${DP:-8} BS=${BS:-8} EVAL_BS=${EVAL_BS:-8} GRADIENT_ACC_STEPS=${GRADIENT_ACC_STEPS:-2}
export DP=${DP:-8} MP=${MP:-1} BS=${BS:-8} EVAL_BS=${EVAL_BS:-8} GRADIENT_ACC_STEPS=${GRADIENT_ACC_STEPS:-4}
export GBS=$((BS * GRADIENT_ACC_STEPS))
export MODEL="llama3"
@@ -23,7 +23,7 @@ export BASEDIR="/raid/datasets/c4-8b/"
export SMALL=1
export LLAMA3_SIZE=${LLAMA3_SIZE:-"8B"}
export EVAL_TARGET=3.3 EVAL_FREQ=12288
export LR="4e-4" END_LR="4e-5" WARMUP_SAMPLES=256 MAX_STEPS=1200000
export LR="1e-3" END_LR="1e-4" WARMUP_SAMPLES=4096 MAX_STEPS=1200000
export WARMUP_STEPS=$((WARMUP_SAMPLES / GBS))
export SAMPLES=$((MAX_STEPS * GBS))
export SEQLEN=${SEQLEN:-8192}

View File

@@ -0,0 +1,38 @@
#!/usr/bin/env bash
export PYTHONPATH="."
export DEV=${DEV:-AMD}
export EMULATE="AMD_CDNA4"
export CHECK_OOB=0
export REWRITE_STACK_LIMIT=5000000 HCQDEV_WAIT_TIMEOUT_MS=240000
export DEVICE_IN_FUNCTION_BUG=1
export DEBUG=${DEBUG:-0}
export HK_FLASH_ATTENTION=${HK_FLASH_ATTENTION:-1}
export ALL2ALL=${ALL2ALL:-1}
export USE_ATOMICS=${USE_ATOMICS:-0}
export ASM_GEMM=${ASM_GEMM:-1}
export WQKV=${WQKV:-1}
export OFFLOAD_OPTIM=${OFFLOAD_OPTIM:-1}
export DEFAULT_FLOAT="bfloat16" OPTIM_DTYPE="bfloat16"
export DP=${DP:-1} MP=${MP:-8} BS=${BS:-1} EVAL_BS=${EVAL_BS:-1} GRADIENT_ACC_STEPS=${GRADIENT_ACC_STEPS:-32}
export GBS=$((BS * GRADIENT_ACC_STEPS))
export MODEL="llama3"
export BASEDIR="/raid/datasets/c4-8b/"
export SMALL=1
export LLAMA3_SIZE=${LLAMA3_SIZE:-"8B"}
export EVAL_TARGET=3.3 EVAL_FREQ=12288
export LR="1e-3" END_LR="1e-4" WARMUP_SAMPLES=4096 MAX_STEPS=1200000
export WARMUP_STEPS=$((WARMUP_SAMPLES / GBS))
export SAMPLES=$((MAX_STEPS * GBS))
export SEQLEN=${SEQLEN:-8192}
export SEED=${SEED:-$RANDOM}
export DATA_SEED=${DATA_SEED:-5760}
export JITBEAM=${JITBEAM:-3}
export BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 BEAM_PADTO=1
python3 examples/mlperf/model_train.py