diff --git a/examples/mlperf/model_train.py b/examples/mlperf/model_train.py index a011f3c1d2..1a4e7c9e17 100644 --- a/examples/mlperf/model_train.py +++ b/examples/mlperf/model_train.py @@ -647,9 +647,9 @@ def train_bert(): # ** hyperparameters ** BS = config["GLOBAL_BATCH_SIZE"] = getenv("BS", 11 * len(GPUS) if dtypes.default_float in (dtypes.float16, dtypes.bfloat16) else 8 * len(GPUS)) EVAL_BS = config["EVAL_BS"] = getenv("EVAL_BS", 1 * len(GPUS)) - max_lr = config["OPT_BASE_LEARNING_RATE"] = getenv("OPT_BASE_LEARNING_RATE", 0.0001 * math.sqrt(BS/54)) + max_lr = config["OPT_BASE_LEARNING_RATE"] = getenv("OPT_BASE_LEARNING_RATE", 0.0001 * math.sqrt(BS/66)) - train_steps = config["TRAIN_STEPS"] = getenv("TRAIN_STEPS", 3300000 // BS) + train_steps = config["TRAIN_STEPS"] = getenv("TRAIN_STEPS", 3000000 // BS) warmup_steps = config["NUM_WARMUP_STEPS"] = getenv("NUM_WARMUP_STEPS", 1) max_eval_steps = config["MAX_EVAL_STEPS"] = getenv("MAX_EVAL_STEPS", (10000 + EVAL_BS - 1) // EVAL_BS) # EVAL_BS * MAX_EVAL_STEPS >= 10000 eval_step_freq = config["EVAL_STEP_FREQ"] = getenv("EVAL_STEP_FREQ", int((math.floor(0.05 * (230.23 * BS + 3000000) / 25000) * 25000) / BS)) # Round down @@ -658,7 +658,7 @@ def train_bert(): save_ckpt_dir = config["SAVE_CKPT_DIR"] = getenv("SAVE_CKPT_DIR", "./ckpts") init_ckpt = config["INIT_CKPT_DIR"] = getenv("INIT_CKPT_DIR", BASEDIR) - loss_scaler = config["LOSS_SCALER"] = getenv("LOSS_SCALER", 2.0**10 if dtypes.default_float == dtypes.float16 else 1.0) + loss_scaler = config["LOSS_SCALER"] = getenv("LOSS_SCALER", 2.0**13 if dtypes.default_float == dtypes.float16 else 1.0) decay = config["DECAY"] = getenv("DECAY", 0.01) epsilon = config["EPSILON"] = getenv("EPSILON", 1e-6) poly_power = config["POLY_POWER"] = getenv("POLY_POWER", 1.0) diff --git a/examples/mlperf/training_submission_v4.1/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_beam.sh b/examples/mlperf/training_submission_v4.1/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_beam.sh index 98bacec1a1..dc31e449a1 100755 --- a/examples/mlperf/training_submission_v4.1/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_beam.sh +++ b/examples/mlperf/training_submission_v4.1/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_beam.sh @@ -2,7 +2,7 @@ export PYTHONPATH="." export MODEL="bert" -export DEFAULT_FLOAT="HALF" GPUS=6 BS=54 EVAL_BS=6 +export DEFAULT_FLOAT="HALF" GPUS=6 BS=66 EVAL_BS=6 export BEAM=4 export IGNORE_JIT_FIRST_BEAM=1 diff --git a/examples/mlperf/training_submission_v4.1/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_run.sh b/examples/mlperf/training_submission_v4.1/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_run.sh index 41bbc19740..3206693a33 100755 --- a/examples/mlperf/training_submission_v4.1/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_run.sh +++ b/examples/mlperf/training_submission_v4.1/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_run.sh @@ -2,7 +2,7 @@ export PYTHONPATH="." export MODEL="bert" -export DEFAULT_FLOAT="HALF" GPUS=6 BS=54 EVAL_BS=6 +export DEFAULT_FLOAT="HALF" GPUS=6 BS=66 EVAL_BS=6 export BEAM=4 export IGNORE_JIT_FIRST_BEAM=1 diff --git a/examples/mlperf/training_submission_v4.1/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh b/examples/mlperf/training_submission_v4.1/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh index a300ce39aa..12b547b16b 100755 --- a/examples/mlperf/training_submission_v4.1/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh +++ b/examples/mlperf/training_submission_v4.1/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh @@ -3,7 +3,7 @@ export PYTHONPATH="." export MODEL="bert" export SUBMISSION_PLATFORM="tinybox_green" -export DEFAULT_FLOAT="HALF" GPUS=6 BS=54 EVAL_BS=6 +export DEFAULT_FLOAT="HALF" GPUS=6 BS=66 EVAL_BS=6 export BEAM=4 export IGNORE_JIT_FIRST_BEAM=1 diff --git a/examples/mlperf/training_submission_v4.1/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_beam.sh b/examples/mlperf/training_submission_v4.1/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_beam.sh index 9c2d247f16..649d3942ef 100755 --- a/examples/mlperf/training_submission_v4.1/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_beam.sh +++ b/examples/mlperf/training_submission_v4.1/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_beam.sh @@ -2,7 +2,7 @@ export PYTHONPATH="." export MODEL="bert" -export DEFAULT_FLOAT="HALF" GPUS=6 BS=54 EVAL_BS=6 +export DEFAULT_FLOAT="HALF" GPUS=6 BS=66 EVAL_BS=6 export BEAM=3 export IGNORE_JIT_FIRST_BEAM=1 diff --git a/examples/mlperf/training_submission_v4.1/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_run.sh b/examples/mlperf/training_submission_v4.1/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_run.sh index 02d2902f1b..9a6ee03432 100755 --- a/examples/mlperf/training_submission_v4.1/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_run.sh +++ b/examples/mlperf/training_submission_v4.1/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_run.sh @@ -2,7 +2,7 @@ export PYTHONPATH="." export MODEL="bert" -export DEFAULT_FLOAT="HALF" GPUS=6 BS=54 EVAL_BS=6 +export DEFAULT_FLOAT="HALF" GPUS=6 BS=66 EVAL_BS=6 export BEAM=3 export IGNORE_JIT_FIRST_BEAM=1 diff --git a/examples/mlperf/training_submission_v4.1/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh b/examples/mlperf/training_submission_v4.1/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh index b25865214f..9b2f895b04 100755 --- a/examples/mlperf/training_submission_v4.1/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh +++ b/examples/mlperf/training_submission_v4.1/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh @@ -3,7 +3,7 @@ export PYTHONPATH="." export MODEL="bert" export SUBMISSION_PLATFORM="tinybox_red" -export DEFAULT_FLOAT="HALF" GPUS=6 BS=54 EVAL_BS=6 +export DEFAULT_FLOAT="HALF" GPUS=6 BS=66 EVAL_BS=6 export BEAM=3 export IGNORE_JIT_FIRST_BEAM=1