mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-04-29 03:00:14 -04:00
llama3: more scripts (#15107)
This commit is contained in:
@@ -0,0 +1,36 @@
|
||||
#!/usr/bin/env bash
# Benchmark config for llama3 405B on emulated AMD CDNA4 (tinygrad MLPerf).
# Runs a short FAKEDATA benchmark (10 steps) with a reduced layer count
# unless FULL_LAYERS is set. All ${VAR:-default} settings can be overridden
# from the caller's environment.

export PYTHONPATH="."
export DEV=${DEV:-AMD}
export EMULATE="AMD_CDNA4"          # emulate a CDNA4 target — value consumed by tinygrad's EMULATE flag
export CHECK_OOB=0
export REWRITE_STACK_LIMIT=5000000 HCQDEV_WAIT_TIMEOUT_MS=240000

export DEBUG=${DEBUG:-2}
export HK_FLASH_ATTENTION=${HK_FLASH_ATTENTION:-1}
export ALL2ALL=${ALL2ALL:-1}
export USE_ATOMICS=${USE_ATOMICS:-0}
export ASM_GEMM=${ASM_GEMM:-1}
export WQKV=${WQKV:-1}

# bfloat16 everywhere; model-parallel over 8 devices, tiny batch.
export DEFAULT_FLOAT="bfloat16" OPTIM_DTYPE="bfloat16"
export DP=${DP:-1} MP=${MP:-8}
export BS=${BS:-1} EVAL_BS=${EVAL_BS:-1} GRADIENT_ACC_STEPS=${GRADIENT_ACC_STEPS:-2}

export MODEL="llama3"
export BASEDIR="/raid/datasets/c4/"
export LLAMA3_SIZE=${LLAMA3_SIZE:-"405B"}
export SEQLEN=${SEQLEN:-8192}

# Fixed seeds for a reproducible benchmark run.
export SEED=${SEED:-5760}
export DATA_SEED=${DATA_SEED:-5760}

# BEAM search settings for kernel tuning.
export JITBEAM=${JITBEAM:-3}
export BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 BEAM_PADTO=1

# Synthetic data, 10 benchmark steps; shrink to 2 layers unless FULL_LAYERS is set.
export FAKEDATA=1 BENCHMARK=10
if [ -z "$FULL_LAYERS" ]; then
  export LLAMA_LAYERS=2
fi

python3 examples/mlperf/model_train.py
|
||||
@@ -0,0 +1,31 @@
|
||||
#!/usr/bin/env bash
# Training-run config for llama3 405B on emulated AMD CDNA4 (tinygrad MLPerf).
# Unlike the benchmark variant this uses real data (no FAKEDATA), a large
# gradient-accumulation count, and a random run SEED by default. All
# ${VAR:-default} settings can be overridden from the caller's environment.

export PYTHONPATH="."
export DEV=${DEV:-AMD}
export EMULATE="AMD_CDNA4"          # emulate a CDNA4 target — value consumed by tinygrad's EMULATE flag
export CHECK_OOB=0
export REWRITE_STACK_LIMIT=5000000 HCQDEV_WAIT_TIMEOUT_MS=240000

export DEBUG=${DEBUG:-0}
export HK_FLASH_ATTENTION=${HK_FLASH_ATTENTION:-1}
export ALL2ALL=${ALL2ALL:-1}
export USE_ATOMICS=${USE_ATOMICS:-0}
export ASM_GEMM=${ASM_GEMM:-1}
export WQKV=${WQKV:-1}

# bfloat16 everywhere; model-parallel over 8 devices, 1152-step grad accumulation.
export DEFAULT_FLOAT="bfloat16" OPTIM_DTYPE="bfloat16"
export DP=${DP:-1} MP=${MP:-8}
export BS=${BS:-1} EVAL_BS=${EVAL_BS:-1} GRADIENT_ACC_STEPS=${GRADIENT_ACC_STEPS:-1152}

export MODEL="llama3"
export BASEDIR="/raid/datasets/c4/"
export LLAMA3_SIZE=${LLAMA3_SIZE:-"405B"}
export SEQLEN=${SEQLEN:-8192}

# Random run seed by default; data shuffling stays fixed for comparability.
export SEED=${SEED:-$RANDOM}
export DATA_SEED=${DATA_SEED:-5760}

# BEAM search settings for kernel tuning.
export JITBEAM=${JITBEAM:-3}
export BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 BEAM_PADTO=1

python3 examples/mlperf/model_train.py
|
||||
@@ -5,6 +5,7 @@ export DEV=${DEV:-AMD}
|
||||
export EMULATE="AMD_CDNA4"
|
||||
export CHECK_OOB=0
|
||||
export REWRITE_STACK_LIMIT=5000000 HCQDEV_WAIT_TIMEOUT_MS=240000
|
||||
export DEVICE_IN_FUNCTION_BUG=1
|
||||
|
||||
export DEBUG=${DEBUG:-2}
|
||||
export HK_FLASH_ATTENTION=${HK_FLASH_ATTENTION:-1}
|
||||
@@ -14,7 +15,7 @@ export ASM_GEMM=${ASM_GEMM:-1}
|
||||
export WQKV=${WQKV:-0}
|
||||
|
||||
export DEFAULT_FLOAT="bfloat16" OPTIM_DTYPE="bfloat16"
|
||||
export DP=${DP:-8} BS=${BS:-8} EVAL_BS=${EVAL_BS:-8} GRADIENT_ACC_STEPS=${GRADIENT_ACC_STEPS:-2}
|
||||
export DP=${DP:-8} MP=${MP:-1} BS=${BS:-8} EVAL_BS=${EVAL_BS:-8} GRADIENT_ACC_STEPS=${GRADIENT_ACC_STEPS:-2}
|
||||
export GBS=$((BS * GRADIENT_ACC_STEPS))
|
||||
|
||||
export MODEL="llama3"
|
||||
@@ -22,7 +23,7 @@ export BASEDIR="/raid/datasets/c4-8b/"
|
||||
export SMALL=1
|
||||
export LLAMA3_SIZE=${LLAMA3_SIZE:-"8B"}
|
||||
export EVAL_TARGET=3.3 EVAL_FREQ=12288
|
||||
export LR="4e-4" END_LR="4e-5" WARMUP_SAMPLES=256 MAX_STEPS=1200000
|
||||
export LR="1e-3" END_LR="1e-4" WARMUP_SAMPLES=4096 MAX_STEPS=1200000
|
||||
export WARMUP_STEPS=$((WARMUP_SAMPLES / GBS))
|
||||
export SAMPLES=$((MAX_STEPS * GBS))
|
||||
export SEQLEN=${SEQLEN:-8192}
|
||||
|
||||
@@ -0,0 +1,43 @@
|
||||
#!/usr/bin/env bash
# Benchmark config for llama3 8B (SMALL) on emulated AMD CDNA4 (tinygrad MLPerf).
# Runs a short FAKEDATA benchmark (10 steps) with a reduced layer count
# unless FULL_LAYERS is set. All ${VAR:-default} settings can be overridden
# from the caller's environment.

export PYTHONPATH="."
export DEV=${DEV:-AMD}
export EMULATE="AMD_CDNA4"          # emulate a CDNA4 target — value consumed by tinygrad's EMULATE flag
export CHECK_OOB=0
export REWRITE_STACK_LIMIT=5000000 HCQDEV_WAIT_TIMEOUT_MS=240000
export DEVICE_IN_FUNCTION_BUG=1

export DEBUG=${DEBUG:-2}
export HK_FLASH_ATTENTION=${HK_FLASH_ATTENTION:-1}
export ALL2ALL=${ALL2ALL:-1}
export USE_ATOMICS=${USE_ATOMICS:-0}
export ASM_GEMM=${ASM_GEMM:-1}
export WQKV=${WQKV:-1}
export OFFLOAD_OPTIM=${OFFLOAD_OPTIM:-1}

# bfloat16 everywhere; model-parallel over 8 devices.
export DEFAULT_FLOAT="bfloat16" OPTIM_DTYPE="bfloat16"
export DP=${DP:-1} MP=${MP:-8} BS=${BS:-1} EVAL_BS=${EVAL_BS:-1} GRADIENT_ACC_STEPS=${GRADIENT_ACC_STEPS:-2}
export GBS=$((BS * GRADIENT_ACC_STEPS))   # effective global batch size

export MODEL="llama3"
export BASEDIR="/raid/datasets/c4-8b/"
export SMALL=1
export LLAMA3_SIZE=${LLAMA3_SIZE:-"8B"}
export EVAL_TARGET=3.3 EVAL_FREQ=12288
export LR="1e-3" END_LR="1e-4" WARMUP_SAMPLES=4096 MAX_STEPS=1200000
export WARMUP_STEPS=$((WARMUP_SAMPLES / GBS))   # warmup expressed in optimizer steps
export SAMPLES=$((MAX_STEPS * GBS))             # total training samples
export SEQLEN=${SEQLEN:-8192}

# Fixed seeds for a reproducible benchmark run.
export SEED=${SEED:-5760}
export DATA_SEED=${DATA_SEED:-5760}

# BEAM search settings for kernel tuning.
export JITBEAM=${JITBEAM:-3}
export BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 BEAM_PADTO=1

# Synthetic data, 10 benchmark steps; shrink to 2 layers unless FULL_LAYERS is set.
export FAKEDATA=1 BENCHMARK=10
if [ -z "$FULL_LAYERS" ]; then
  export LLAMA_LAYERS=2
fi

python3 examples/mlperf/model_train.py
|
||||
@@ -15,7 +15,7 @@ export ASM_GEMM=${ASM_GEMM:-1}
|
||||
export WQKV=${WQKV:-0}
|
||||
|
||||
export DEFAULT_FLOAT="bfloat16" OPTIM_DTYPE="bfloat16"
|
||||
export DP=${DP:-8} BS=${BS:-8} EVAL_BS=${EVAL_BS:-8} GRADIENT_ACC_STEPS=${GRADIENT_ACC_STEPS:-2}
|
||||
export DP=${DP:-8} MP=${MP:-1} BS=${BS:-8} EVAL_BS=${EVAL_BS:-8} GRADIENT_ACC_STEPS=${GRADIENT_ACC_STEPS:-4}
|
||||
export GBS=$((BS * GRADIENT_ACC_STEPS))
|
||||
|
||||
export MODEL="llama3"
|
||||
@@ -23,7 +23,7 @@ export BASEDIR="/raid/datasets/c4-8b/"
|
||||
export SMALL=1
|
||||
export LLAMA3_SIZE=${LLAMA3_SIZE:-"8B"}
|
||||
export EVAL_TARGET=3.3 EVAL_FREQ=12288
|
||||
export LR="4e-4" END_LR="4e-5" WARMUP_SAMPLES=256 MAX_STEPS=1200000
|
||||
export LR="1e-3" END_LR="1e-4" WARMUP_SAMPLES=4096 MAX_STEPS=1200000
|
||||
export WARMUP_STEPS=$((WARMUP_SAMPLES / GBS))
|
||||
export SAMPLES=$((MAX_STEPS * GBS))
|
||||
export SEQLEN=${SEQLEN:-8192}
|
||||
|
||||
@@ -0,0 +1,38 @@
|
||||
#!/usr/bin/env bash
# Training-run config for llama3 8B (SMALL) on emulated AMD CDNA4 (tinygrad MLPerf).
# Unlike the benchmark variant this uses real data (no FAKEDATA), a larger
# gradient-accumulation count, and a random run SEED by default. All
# ${VAR:-default} settings can be overridden from the caller's environment.

export PYTHONPATH="."
export DEV=${DEV:-AMD}
export EMULATE="AMD_CDNA4"          # emulate a CDNA4 target — value consumed by tinygrad's EMULATE flag
export CHECK_OOB=0
export REWRITE_STACK_LIMIT=5000000 HCQDEV_WAIT_TIMEOUT_MS=240000
export DEVICE_IN_FUNCTION_BUG=1

export DEBUG=${DEBUG:-0}
export HK_FLASH_ATTENTION=${HK_FLASH_ATTENTION:-1}
export ALL2ALL=${ALL2ALL:-1}
export USE_ATOMICS=${USE_ATOMICS:-0}
export ASM_GEMM=${ASM_GEMM:-1}
export WQKV=${WQKV:-1}
export OFFLOAD_OPTIM=${OFFLOAD_OPTIM:-1}

# bfloat16 everywhere; model-parallel over 8 devices, 32-step grad accumulation.
export DEFAULT_FLOAT="bfloat16" OPTIM_DTYPE="bfloat16"
export DP=${DP:-1} MP=${MP:-8} BS=${BS:-1} EVAL_BS=${EVAL_BS:-1} GRADIENT_ACC_STEPS=${GRADIENT_ACC_STEPS:-32}
export GBS=$((BS * GRADIENT_ACC_STEPS))   # effective global batch size

export MODEL="llama3"
export BASEDIR="/raid/datasets/c4-8b/"
export SMALL=1
export LLAMA3_SIZE=${LLAMA3_SIZE:-"8B"}
export EVAL_TARGET=3.3 EVAL_FREQ=12288
export LR="1e-3" END_LR="1e-4" WARMUP_SAMPLES=4096 MAX_STEPS=1200000
export WARMUP_STEPS=$((WARMUP_SAMPLES / GBS))   # warmup expressed in optimizer steps
export SAMPLES=$((MAX_STEPS * GBS))             # total training samples
export SEQLEN=${SEQLEN:-8192}

# Random run seed by default; data shuffling stays fixed for comparability.
export SEED=${SEED:-$RANDOM}
export DATA_SEED=${DATA_SEED:-5760}

# BEAM search settings for kernel tuning.
export JITBEAM=${JITBEAM:-3}
export BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 BEAM_PADTO=1

python3 examples/mlperf/model_train.py
|
||||
Reference in New Issue
Block a user