fix bert training config (#12647)

Default FREE_INTERMEDIATE to 0 in train_bert and export REWRITE_STACK_LIMIT=500000 in the BERT training scripts.
This commit is contained in:
chenyu
2025-10-13 15:03:47 -04:00
committed by GitHub
parent f1041dc0ac
commit 77b5e6774e
10 changed files with 13 additions and 2 deletions

View File

@@ -1188,7 +1188,9 @@ def train_bert():
if MLLOGGER and RUNMLPERF:
MLLOGGER.start(key=mllog_constants.EVAL_START, value=None, metadata={"epoch_num": i*GBS, "step_num": i})
if getenv("RESET_STEP"): train_step_bert.reset()
elif getenv("FREE_INTERMEDIATE", 1) and train_step_bert.captured is not None: train_step_bert.captured.free_intermediates()
elif getenv("FREE_INTERMEDIATE", 0) and train_step_bert.captured is not None:
# TODO: with FREE_INTERMEDIATE enabled, values became NaN after JIT step 2; investigate before turning it back on
train_step_bert.captured.free_intermediates()
eval_lm_losses = []
eval_clsf_losses = []
eval_lm_accs = []
@@ -1222,7 +1224,7 @@ def train_bert():
return
if getenv("RESET_STEP"): eval_step_bert.reset()
elif getenv("FREE_INTERMEDIATE", 1) and eval_step_bert.captured is not None: eval_step_bert.captured.free_intermediates()
elif getenv("FREE_INTERMEDIATE", 0) and eval_step_bert.captured is not None: eval_step_bert.captured.free_intermediates()
del eval_data
avg_lm_loss = sum(eval_lm_losses) / len(eval_lm_losses)

View File

@@ -6,6 +6,7 @@ export DEFAULT_FLOAT="HALF" GPUS=8 BS=1024 EVAL_BS=1024
export OPT_BASE_LEARNING_RATE=0.0011 OPT_LAMB_BETA_1=0.60466 OPT_LAMB_BETA_2=0.85437 DECAY=0.1
export IGNORE_OOB=1
export REWRITE_STACK_LIMIT=500000
export BEAM=3 BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
export IGNORE_JIT_FIRST_BEAM=1 FREE_INTERMEDIATE=0

View File

@@ -9,6 +9,7 @@ export OPT_BASE_LEARNING_RATE=0.0011 OPT_LAMB_BETA_1=0.60466 OPT_LAMB_BETA_2=0.8
export TRAIN_STEPS=3900
export IGNORE_OOB=1
export REWRITE_STACK_LIMIT=500000
export BEAM=3 BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
export IGNORE_JIT_FIRST_BEAM=1 FREE_INTERMEDIATE=0

View File

@@ -12,6 +12,7 @@ export OPT_BASE_LEARNING_RATE=0.0011 OPT_LAMB_BETA_1=0.60466 OPT_LAMB_BETA_2=0.8
export TRAIN_STEPS=3900
export IGNORE_OOB=1
export REWRITE_STACK_LIMIT=500000
export BEAM=3 BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
export IGNORE_JIT_FIRST_BEAM=1 FREE_INTERMEDIATE=0

View File

@@ -5,6 +5,7 @@ export MODEL="bert"
export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=90 EVAL_BS=90
export IGNORE_OOB=1
export REWRITE_STACK_LIMIT=500000
export BEAM=8 BEAM_UOPS_MAX=10000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
export IGNORE_JIT_FIRST_BEAM=1

View File

@@ -5,6 +5,7 @@ export MODEL="bert"
export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=90 EVAL_BS=90
export IGNORE_OOB=1
export REWRITE_STACK_LIMIT=500000
export BEAM=8 BEAM_UOPS_MAX=10000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
export IGNORE_JIT_FIRST_BEAM=1

View File

@@ -8,6 +8,7 @@ export SUBMISSION_PLATFORM="tinybox_green"
export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=90 EVAL_BS=90
export IGNORE_OOB=1
export REWRITE_STACK_LIMIT=500000
export BEAM=8 BEAM_UOPS_MAX=10000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
export IGNORE_JIT_FIRST_BEAM=1

View File

@@ -5,6 +5,7 @@ export MODEL="bert"
export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=90 EVAL_BS=90
export IGNORE_OOB=1
export REWRITE_STACK_LIMIT=500000
export BEAM=5 BEAM_UOPS_MAX=8000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
export IGNORE_JIT_FIRST_BEAM=1

View File

@@ -5,6 +5,7 @@ export MODEL="bert"
export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=90 EVAL_BS=90
export IGNORE_OOB=1
export REWRITE_STACK_LIMIT=500000
export BEAM=5 BEAM_UOPS_MAX=8000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
export IGNORE_JIT_FIRST_BEAM=1

View File

@@ -8,6 +8,7 @@ export SUBMISSION_PLATFORM="tinybox_red"
export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=90 EVAL_BS=90
export IGNORE_OOB=1
export REWRITE_STACK_LIMIT=500000
export BEAM=5 BEAM_UOPS_MAX=8000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
export IGNORE_JIT_FIRST_BEAM=1