From 102dfe5510bddfdbd646de44c3641bdc34cc6bc3 Mon Sep 17 00:00:00 2001 From: chenyu Date: Mon, 7 Oct 2024 10:17:21 -0400 Subject: [PATCH] back to 2**10 for bert loss scaler (#6934) Getting 2 NaNs with the 2**13 loss scaler, so revert back to 2**10. --- examples/mlperf/model_train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/mlperf/model_train.py b/examples/mlperf/model_train.py index 1a4e7c9e17..b083b3bdd2 100644 --- a/examples/mlperf/model_train.py +++ b/examples/mlperf/model_train.py @@ -658,7 +658,7 @@ def train_bert(): save_ckpt_dir = config["SAVE_CKPT_DIR"] = getenv("SAVE_CKPT_DIR", "./ckpts") init_ckpt = config["INIT_CKPT_DIR"] = getenv("INIT_CKPT_DIR", BASEDIR) - loss_scaler = config["LOSS_SCALER"] = getenv("LOSS_SCALER", 2.0**13 if dtypes.default_float == dtypes.float16 else 1.0) + loss_scaler = config["LOSS_SCALER"] = getenv("LOSS_SCALER", 2.0**10 if dtypes.default_float == dtypes.float16 else 1.0) decay = config["DECAY"] = getenv("DECAY", 0.01) epsilon = config["EPSILON"] = getenv("EPSILON", 1e-6) poly_power = config["POLY_POWER"] = getenv("POLY_POWER", 1.0)