From 77b5e6774e4bf8452c932d60ff1f4a834fccc871 Mon Sep 17 00:00:00 2001
From: chenyu <chenyu@fastmail.com>
Date: Mon, 13 Oct 2025 15:03:47 -0400
Subject: [PATCH] fix bert training config (#12647)

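Default FREE_INTERMEDIATE to 0 in train_bert, since freeing the JIT
intermediates produced NaN after JIT step 2, and export
REWRITE_STACK_LIMIT=500000 in all BERT dev_beam/dev_run/run_and_time scripts.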
---
 examples/mlperf/model_train.py                              | 6 ++++--
 .../bert/implementations/tinybox_8xMI300X/dev_beam.sh       | 1 +
 .../bert/implementations/tinybox_8xMI300X/dev_run.sh        | 1 +
 .../bert/implementations/tinybox_8xMI300X/run_and_time.sh   | 1 +
 .../bert/implementations/tinybox_green/dev_beam.sh          | 1 +
 .../bert/implementations/tinybox_green/dev_run.sh           | 1 +
 .../bert/implementations/tinybox_green/run_and_time.sh      | 1 +
 .../benchmarks/bert/implementations/tinybox_red/dev_beam.sh | 1 +
 .../benchmarks/bert/implementations/tinybox_red/dev_run.sh  | 1 +
 .../bert/implementations/tinybox_red/run_and_time.sh        | 1 +
 10 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/examples/mlperf/model_train.py b/examples/mlperf/model_train.py
index 6354155b14..a8189c4ee5 100644
--- a/examples/mlperf/model_train.py
+++ b/examples/mlperf/model_train.py
@@ -1188,7 +1188,9 @@ def train_bert():
       if MLLOGGER and RUNMLPERF:
         MLLOGGER.start(key=mllog_constants.EVAL_START, value=None, metadata={"epoch_num": i*GBS, "step_num": i})
       if getenv("RESET_STEP"): train_step_bert.reset()
-      elif getenv("FREE_INTERMEDIATE", 1) and train_step_bert.captured is not None: train_step_bert.captured.free_intermediates()
+      elif getenv("FREE_INTERMEDIATE", 0) and train_step_bert.captured is not None:
+        # TODO: FREE_INTERMEDIATE=1 produces NaN after JIT step 2, so it is off by default until that is fixed
+        train_step_bert.captured.free_intermediates()
       eval_lm_losses = []
       eval_clsf_losses = []
       eval_lm_accs = []
@@ -1222,7 +1224,7 @@ def train_bert():
           return
 
       if getenv("RESET_STEP"): eval_step_bert.reset()
-      elif getenv("FREE_INTERMEDIATE", 1) and eval_step_bert.captured is not None: eval_step_bert.captured.free_intermediates()
+      elif getenv("FREE_INTERMEDIATE", 0) and eval_step_bert.captured is not None: eval_step_bert.captured.free_intermediates()
 
       del eval_data
       avg_lm_loss = sum(eval_lm_losses) / len(eval_lm_losses)
diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/dev_beam.sh b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/dev_beam.sh
index cfaad1e59e..278eff316d 100755
--- a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/dev_beam.sh
+++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/dev_beam.sh
@@ -6,6 +6,7 @@ export DEFAULT_FLOAT="HALF" GPUS=8 BS=1024 EVAL_BS=1024
 export OPT_BASE_LEARNING_RATE=0.0011 OPT_LAMB_BETA_1=0.60466 OPT_LAMB_BETA_2=0.85437 DECAY=0.1
 
 export IGNORE_OOB=1
+export REWRITE_STACK_LIMIT=500000
 
 export BEAM=3 BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
 export IGNORE_JIT_FIRST_BEAM=1 FREE_INTERMEDIATE=0
diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/dev_run.sh b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/dev_run.sh
index 6ef7c1b996..a6a42a6de0 100755
--- a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/dev_run.sh
+++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/dev_run.sh
@@ -9,6 +9,7 @@ export OPT_BASE_LEARNING_RATE=0.0011 OPT_LAMB_BETA_1=0.60466 OPT_LAMB_BETA_2=0.8
 export TRAIN_STEPS=3900
 
 export IGNORE_OOB=1
+export REWRITE_STACK_LIMIT=500000
 
 export BEAM=3 BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
 export IGNORE_JIT_FIRST_BEAM=1 FREE_INTERMEDIATE=0
diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/run_and_time.sh b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/run_and_time.sh
index cd2f30579b..1dbef0e48e 100755
--- a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/run_and_time.sh
+++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/run_and_time.sh
@@ -12,6 +12,7 @@ export OPT_BASE_LEARNING_RATE=0.0011 OPT_LAMB_BETA_1=0.60466 OPT_LAMB_BETA_2=0.8
 export TRAIN_STEPS=3900
 
 export IGNORE_OOB=1
+export REWRITE_STACK_LIMIT=500000
 
 export BEAM=3 BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
 export IGNORE_JIT_FIRST_BEAM=1 FREE_INTERMEDIATE=0
diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_beam.sh b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_beam.sh
index a2d477312d..2865fbe06d 100755
--- a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_beam.sh
+++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_beam.sh
@@ -5,6 +5,7 @@ export MODEL="bert"
 export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=90 EVAL_BS=90
 
 export IGNORE_OOB=1
+export REWRITE_STACK_LIMIT=500000
 
 export BEAM=8 BEAM_UOPS_MAX=10000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
 export IGNORE_JIT_FIRST_BEAM=1
diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_run.sh b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_run.sh
index 4365466211..22573ae491 100755
--- a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_run.sh
+++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_run.sh
@@ -5,6 +5,7 @@ export MODEL="bert"
 export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=90 EVAL_BS=90
 
 export IGNORE_OOB=1
+export REWRITE_STACK_LIMIT=500000
 
 export BEAM=8 BEAM_UOPS_MAX=10000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
 export IGNORE_JIT_FIRST_BEAM=1
diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh
index 4b3b911933..e533aea2a7 100755
--- a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh
+++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh
@@ -8,6 +8,7 @@ export SUBMISSION_PLATFORM="tinybox_green"
 export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=90 EVAL_BS=90
 
 export IGNORE_OOB=1
+export REWRITE_STACK_LIMIT=500000
 
 export BEAM=8 BEAM_UOPS_MAX=10000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
 export IGNORE_JIT_FIRST_BEAM=1
diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_beam.sh b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_beam.sh
index 881dd247b4..98f8d560d5 100755
--- a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_beam.sh
+++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_beam.sh
@@ -5,6 +5,7 @@ export MODEL="bert"
 export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=90 EVAL_BS=90
 
 export IGNORE_OOB=1
+export REWRITE_STACK_LIMIT=500000
 
 export BEAM=5 BEAM_UOPS_MAX=8000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
 export IGNORE_JIT_FIRST_BEAM=1
diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_run.sh b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_run.sh
index 719ecd5bf9..426e657ab9 100755
--- a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_run.sh
+++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_run.sh
@@ -5,6 +5,7 @@ export MODEL="bert"
 export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=90 EVAL_BS=90
 
 export IGNORE_OOB=1
+export REWRITE_STACK_LIMIT=500000
 
 export BEAM=5 BEAM_UOPS_MAX=8000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
 export IGNORE_JIT_FIRST_BEAM=1
diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh
index 4b30305947..f54ba4b9d0 100755
--- a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh
+++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh
@@ -8,6 +8,7 @@ export SUBMISSION_PLATFORM="tinybox_red"
 export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=90 EVAL_BS=90
 
 export IGNORE_OOB=1
+export REWRITE_STACK_LIMIT=500000
 
 export BEAM=5 BEAM_UOPS_MAX=8000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
 export IGNORE_JIT_FIRST_BEAM=1