Revert "Revert "add nan check during training""

This reverts commit b7b2943197.
This commit is contained in:
Francis Lata
2025-02-26 15:43:20 +00:00
parent e0e50fc482
commit 7cb226d757

View File

@@ -497,6 +497,10 @@ def train_retinanet():
cl = time.perf_counter()
if BENCHMARK: step_times.append(cl - st)
if not math.isfinite(loss):
print("loss is nan")
return
tqdm.write(
f"{i:5} {((cl - st)) * 1000.0:7.2f} ms run, {(pt - st) * 1000.0:7.2f} ms python, {(dt - pt) * 1000.0:6.2f} ms fetch data, "
f"{(cl - dt) * 1000.0:7.2f} ms {device_str}, {loss:5.2f} loss, {losses['classification_loss'].item():5.4f} classification loss, {losses['regression_loss'].item():5.4f} regression loss, "