bert: use BS=66 and update hparams (#6932)

With the dropout memory improvement, BS=66 now fits in memory. Also revert the hyperparameters to those from #5891.
2026-01-10 07:28:15 -05:00 · 2024-10-07 05:08:27 -04:00
parent 32ac24c45c
commit 0cf815a93a
7 changed files with 9 additions and 9 deletions
--- a/examples/mlperf/model_train.py
+++ b/examples/mlperf/model_train.py
@@ -647,9 +647,9 @@ def train_bert():
  # ** hyperparameters **
  BS                 = config["GLOBAL_BATCH_SIZE"]      = getenv("BS", 11 * len(GPUS) if dtypes.default_float in (dtypes.float16, dtypes.bfloat16) else 8 * len(GPUS))
  EVAL_BS            = config["EVAL_BS"]                = getenv("EVAL_BS", 1 * len(GPUS))
-  max_lr             = config["OPT_BASE_LEARNING_RATE"] = getenv("OPT_BASE_LEARNING_RATE", 0.0001 * math.sqrt(BS/54))
+  max_lr             = config["OPT_BASE_LEARNING_RATE"] = getenv("OPT_BASE_LEARNING_RATE", 0.0001 * math.sqrt(BS/66))

-  train_steps        = config["TRAIN_STEPS"]            = getenv("TRAIN_STEPS", 3300000 // BS)
+  train_steps        = config["TRAIN_STEPS"]            = getenv("TRAIN_STEPS", 3000000 // BS)
  warmup_steps       = config["NUM_WARMUP_STEPS"]       = getenv("NUM_WARMUP_STEPS", 1)
  max_eval_steps     = config["MAX_EVAL_STEPS"]         = getenv("MAX_EVAL_STEPS", (10000 + EVAL_BS - 1) // EVAL_BS) # EVAL_BS * MAX_EVAL_STEPS >= 10000
  eval_step_freq     = config["EVAL_STEP_FREQ"]         = getenv("EVAL_STEP_FREQ", int((math.floor(0.05 * (230.23 * BS + 3000000) / 25000) * 25000) / BS)) # Round down
@@ -658,7 +658,7 @@ def train_bert():
  save_ckpt_dir      = config["SAVE_CKPT_DIR"]          = getenv("SAVE_CKPT_DIR", "./ckpts")
  init_ckpt          = config["INIT_CKPT_DIR"]          = getenv("INIT_CKPT_DIR", BASEDIR)

-  loss_scaler        = config["LOSS_SCALER"]            = getenv("LOSS_SCALER", 2.0**10 if dtypes.default_float == dtypes.float16 else 1.0)
+  loss_scaler        = config["LOSS_SCALER"]            = getenv("LOSS_SCALER", 2.0**13 if dtypes.default_float == dtypes.float16 else 1.0)
  decay              = config["DECAY"]                  = getenv("DECAY", 0.01)
  epsilon            = config["EPSILON"]                = getenv("EPSILON", 1e-6)
  poly_power         = config["POLY_POWER"]             = getenv("POLY_POWER", 1.0)
--- a/examples/mlperf/training_submission_v4.1/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_beam.sh
+++ b/examples/mlperf/training_submission_v4.1/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_beam.sh
@@ -2,7 +2,7 @@

 export PYTHONPATH="."
 export MODEL="bert"
-export DEFAULT_FLOAT="HALF" GPUS=6 BS=54 EVAL_BS=6
+export DEFAULT_FLOAT="HALF" GPUS=6 BS=66 EVAL_BS=6

 export BEAM=4
 export IGNORE_JIT_FIRST_BEAM=1
--- a/examples/mlperf/training_submission_v4.1/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_run.sh
+++ b/examples/mlperf/training_submission_v4.1/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_run.sh
@@ -2,7 +2,7 @@

 export PYTHONPATH="."
 export MODEL="bert"
-export DEFAULT_FLOAT="HALF" GPUS=6 BS=54 EVAL_BS=6
+export DEFAULT_FLOAT="HALF" GPUS=6 BS=66 EVAL_BS=6

 export BEAM=4
 export IGNORE_JIT_FIRST_BEAM=1
--- a/examples/mlperf/training_submission_v4.1/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh
+++ b/examples/mlperf/training_submission_v4.1/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh
@@ -3,7 +3,7 @@
 export PYTHONPATH="."
 export MODEL="bert"
 export SUBMISSION_PLATFORM="tinybox_green"
-export DEFAULT_FLOAT="HALF" GPUS=6 BS=54 EVAL_BS=6
+export DEFAULT_FLOAT="HALF" GPUS=6 BS=66 EVAL_BS=6

 export BEAM=4
 export IGNORE_JIT_FIRST_BEAM=1
--- a/examples/mlperf/training_submission_v4.1/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_beam.sh
+++ b/examples/mlperf/training_submission_v4.1/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_beam.sh
@@ -2,7 +2,7 @@

 export PYTHONPATH="."
 export MODEL="bert"
-export DEFAULT_FLOAT="HALF" GPUS=6 BS=54 EVAL_BS=6
+export DEFAULT_FLOAT="HALF" GPUS=6 BS=66 EVAL_BS=6

 export BEAM=3
 export IGNORE_JIT_FIRST_BEAM=1
--- a/examples/mlperf/training_submission_v4.1/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_run.sh
+++ b/examples/mlperf/training_submission_v4.1/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_run.sh
@@ -2,7 +2,7 @@

 export PYTHONPATH="."
 export MODEL="bert"
-export DEFAULT_FLOAT="HALF" GPUS=6 BS=54 EVAL_BS=6
+export DEFAULT_FLOAT="HALF" GPUS=6 BS=66 EVAL_BS=6

 export BEAM=3
 export IGNORE_JIT_FIRST_BEAM=1
--- a/examples/mlperf/training_submission_v4.1/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh
+++ b/examples/mlperf/training_submission_v4.1/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh
@@ -3,7 +3,7 @@
 export PYTHONPATH="."
 export MODEL="bert"
 export SUBMISSION_PLATFORM="tinybox_red"
-export DEFAULT_FLOAT="HALF" GPUS=6 BS=54 EVAL_BS=6
+export DEFAULT_FLOAT="HALF" GPUS=6 BS=66 EVAL_BS=6

 export BEAM=3
 export IGNORE_JIT_FIRST_BEAM=1