From 7cb226d757ec020a2065feb553ea704ccfab69a4 Mon Sep 17 00:00:00 2001 From: Francis Lata Date: Wed, 26 Feb 2025 15:43:20 +0000 Subject: [PATCH] Revert "Revert "add nan check during training"" This reverts commit b7b2943197f386e73e7943642b307b8a3a754c40. --- examples/mlperf/model_train.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/examples/mlperf/model_train.py b/examples/mlperf/model_train.py index 6e1680400f..6ce2180bc9 100644 --- a/examples/mlperf/model_train.py +++ b/examples/mlperf/model_train.py @@ -497,6 +497,10 @@ def train_retinanet(): cl = time.perf_counter() if BENCHMARK: step_times.append(cl - st) + if not math.isfinite(loss): + print("loss is nan") + return + tqdm.write( f"{i:5} {((cl - st)) * 1000.0:7.2f} ms run, {(pt - st) * 1000.0:7.2f} ms python, {(dt - pt) * 1000.0:6.2f} ms fetch data, " f"{(cl - dt) * 1000.0:7.2f} ms {device_str}, {loss:5.2f} loss, {losses['classification_loss'].item():5.4f} classification loss, {losses['regression_loss'].item():5.4f} regression loss, "