From 77b5e6774e4bf8452c932d60ff1f4a834fccc871 Mon Sep 17 00:00:00 2001 From: chenyu Date: Mon, 13 Oct 2025 15:03:47 -0400 Subject: [PATCH] fix bert training config (#12647) FREE_INTERMEDIATE=0 REWRITE_STACK_LIMIT=500000 --- examples/mlperf/model_train.py | 6 ++++-- .../bert/implementations/tinybox_8xMI300X/dev_beam.sh | 1 + .../bert/implementations/tinybox_8xMI300X/dev_run.sh | 1 + .../bert/implementations/tinybox_8xMI300X/run_and_time.sh | 1 + .../bert/implementations/tinybox_green/dev_beam.sh | 1 + .../bert/implementations/tinybox_green/dev_run.sh | 1 + .../bert/implementations/tinybox_green/run_and_time.sh | 1 + .../benchmarks/bert/implementations/tinybox_red/dev_beam.sh | 1 + .../benchmarks/bert/implementations/tinybox_red/dev_run.sh | 1 + .../bert/implementations/tinybox_red/run_and_time.sh | 1 + 10 files changed, 13 insertions(+), 2 deletions(-) diff --git a/examples/mlperf/model_train.py b/examples/mlperf/model_train.py index 6354155b14..a8189c4ee5 100644 --- a/examples/mlperf/model_train.py +++ b/examples/mlperf/model_train.py @@ -1188,7 +1188,9 @@ def train_bert(): if MLLOGGER and RUNMLPERF: MLLOGGER.start(key=mllog_constants.EVAL_START, value=None, metadata={"epoch_num": i*GBS, "step_num": i}) if getenv("RESET_STEP"): train_step_bert.reset() - elif getenv("FREE_INTERMEDIATE", 1) and train_step_bert.captured is not None: train_step_bert.captured.free_intermediates() + elif getenv("FREE_INTERMEDIATE", 0) and train_step_bert.captured is not None: + # TODO: FREE_INTERMEDIATE nan'ed after jit step 2 + train_step_bert.captured.free_intermediates() eval_lm_losses = [] eval_clsf_losses = [] eval_lm_accs = [] @@ -1222,7 +1224,7 @@ def train_bert(): return if getenv("RESET_STEP"): eval_step_bert.reset() - elif getenv("FREE_INTERMEDIATE", 1) and eval_step_bert.captured is not None: eval_step_bert.captured.free_intermediates() + elif getenv("FREE_INTERMEDIATE", 0) and eval_step_bert.captured is not None: eval_step_bert.captured.free_intermediates() del eval_data avg_lm_loss = sum(eval_lm_losses) / len(eval_lm_losses) diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/dev_beam.sh b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/dev_beam.sh index cfaad1e59e..278eff316d 100755 --- a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/dev_beam.sh +++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/dev_beam.sh @@ -6,6 +6,7 @@ export DEFAULT_FLOAT="HALF" GPUS=8 BS=1024 EVAL_BS=1024 export OPT_BASE_LEARNING_RATE=0.0011 OPT_LAMB_BETA_1=0.60466 OPT_LAMB_BETA_2=0.85437 DECAY=0.1 export IGNORE_OOB=1 +export REWRITE_STACK_LIMIT=500000 export BEAM=3 BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 export IGNORE_JIT_FIRST_BEAM=1 FREE_INTERMEDIATE=0 diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/dev_run.sh b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/dev_run.sh index 6ef7c1b996..a6a42a6de0 100755 --- a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/dev_run.sh +++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/dev_run.sh @@ -9,6 +9,7 @@ export OPT_BASE_LEARNING_RATE=0.0011 OPT_LAMB_BETA_1=0.60466 OPT_LAMB_BETA_2=0.8 export TRAIN_STEPS=3900 export IGNORE_OOB=1 +export REWRITE_STACK_LIMIT=500000 export BEAM=3 BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 export IGNORE_JIT_FIRST_BEAM=1 FREE_INTERMEDIATE=0 diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/run_and_time.sh b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/run_and_time.sh index cd2f30579b..1dbef0e48e 100755 --- a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/run_and_time.sh +++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/run_and_time.sh @@ -12,6 +12,7 @@ export OPT_BASE_LEARNING_RATE=0.0011 OPT_LAMB_BETA_1=0.60466 OPT_LAMB_BETA_2=0.8 export TRAIN_STEPS=3900 export IGNORE_OOB=1 +export REWRITE_STACK_LIMIT=500000 export BEAM=3 BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 export IGNORE_JIT_FIRST_BEAM=1 FREE_INTERMEDIATE=0 diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_beam.sh b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_beam.sh index a2d477312d..2865fbe06d 100755 --- a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_beam.sh +++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_beam.sh @@ -5,6 +5,7 @@ export MODEL="bert" export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=90 EVAL_BS=90 export IGNORE_OOB=1 +export REWRITE_STACK_LIMIT=500000 export BEAM=8 BEAM_UOPS_MAX=10000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 export IGNORE_JIT_FIRST_BEAM=1 diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_run.sh b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_run.sh index 4365466211..22573ae491 100755 --- a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_run.sh +++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_run.sh @@ -5,6 +5,7 @@ export MODEL="bert" export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=90 EVAL_BS=90 export IGNORE_OOB=1 +export REWRITE_STACK_LIMIT=500000 export BEAM=8 BEAM_UOPS_MAX=10000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 export IGNORE_JIT_FIRST_BEAM=1 diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh index 4b3b911933..e533aea2a7 100755 --- a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh +++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh @@ -8,6 +8,7 @@ export SUBMISSION_PLATFORM="tinybox_green" export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=90 EVAL_BS=90 export IGNORE_OOB=1 +export REWRITE_STACK_LIMIT=500000 export BEAM=8 BEAM_UOPS_MAX=10000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 export IGNORE_JIT_FIRST_BEAM=1 diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_beam.sh b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_beam.sh index 881dd247b4..98f8d560d5 100755 --- a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_beam.sh +++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_beam.sh @@ -5,6 +5,7 @@ export MODEL="bert" export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=90 EVAL_BS=90 export IGNORE_OOB=1 +export REWRITE_STACK_LIMIT=500000 export BEAM=5 BEAM_UOPS_MAX=8000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 export IGNORE_JIT_FIRST_BEAM=1 diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_run.sh b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_run.sh index 719ecd5bf9..426e657ab9 100755 --- a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_run.sh +++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_run.sh @@ -5,6 +5,7 @@ export MODEL="bert" export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=90 EVAL_BS=90 export IGNORE_OOB=1 +export REWRITE_STACK_LIMIT=500000 export BEAM=5 BEAM_UOPS_MAX=8000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 export IGNORE_JIT_FIRST_BEAM=1 diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh index 4b30305947..f54ba4b9d0 100755 --- a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh +++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh @@ -8,6 +8,7 @@ export SUBMISSION_PLATFORM="tinybox_red" export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=90 EVAL_BS=90 export IGNORE_OOB=1 +export REWRITE_STACK_LIMIT=500000 export BEAM=5 BEAM_UOPS_MAX=8000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 export IGNORE_JIT_FIRST_BEAM=1