From 39d962106f8145b6744a8c41951e9831526ebeef Mon Sep 17 00:00:00 2001 From: chenyu Date: Mon, 22 Dec 2025 11:28:29 -0500 Subject: [PATCH] update llama logging (#13803) ``` REWRITE_STACK_LIMIT=1000000 SMALL=1 BASEDIR=/raid/datasets/c4-8b SAMPLES=1000 BS=8 DP=8 DEFAULT_FLOAT=bfloat16 OPTIM_DTYPE=bfloat16 LLAMA3_SIZE=8B SEQLEN=1024 PYTHONPATH=. MODEL=llama3 python3 examples/mlperf/model_train.py 1 93.44 s run, 11.8750 loss, 0.000000000001 LR, 642.43 GB used, 19644.30 GFLOPS 2 101.78 s run, 11.8750 loss, 0.000000000001 LR, 1454.57 GB used, 17039.35 GFLOPS 3 7.34 s run, 11.8750 loss, 0.000000000002 LR, 1454.57 GB used, 236258.78 GFLOPS 4 4.32 s run, 11.8750 loss, 0.000000000002 LR, 1454.57 GB used, 401488.40 GFLOPS 5 4.36 s run, 11.9375 loss, 0.000000000003 LR, 1454.57 GB used, 398116.13 GFLOPS 6 4.32 s run, 11.8750 loss, 0.000000000003 LR, 1454.57 GB used, 401878.60 GFLOPS 7 4.34 s run, 11.8750 loss, 0.000000000004 LR, 1454.57 GB used, 399822.57 GFLOPS 8 4.35 s run, 11.8750 loss, 0.000000000004 LR, 1454.57 GB used, 398512.24 GFLOPS 9 4.36 s run, 11.8750 loss, 0.000000000005 LR, 1454.57 GB used, 397832.61 GFLOPS 10 4.40 s run, 11.8750 loss, 0.000000000005 LR, 1454.57 GB used, 394520.83 GFLOPS ``` --- examples/mlperf/model_train.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/examples/mlperf/model_train.py b/examples/mlperf/model_train.py index 884bf338ca..7c2611f7e6 100644 --- a/examples/mlperf/model_train.py +++ b/examples/mlperf/model_train.py @@ -1442,11 +1442,16 @@ def train_llama3(): GlobalCounters.reset() loss, lr = train_step(model, tokens) loss = loss.float().item() + lr = lr.item() i += 1 sequences_seen += tokens.shape[0] - tqdm.write(f"{loss:.4f} loss, {lr.item():.12f} LR, {GlobalCounters.mem_used / 1e9:.2f} GB used, {time.perf_counter()-t:.2f} s") + sec = time.perf_counter()-t + tqdm.write( + f"{i:5} {sec:.2f} s run, {loss:.4f} loss, {lr:.12f} LR, {GlobalCounters.mem_used / 1e9:.2f} GB used, " + f"{GlobalCounters.global_ops * 1e-9 
/ sec:9.2f} GFLOPS") + if (fname:=getenv("LOSS_FILE", "")): with open(fname, "a") as f: - f.write(f"{i} {loss:.4f} {lr.item():.12f} {GlobalCounters.mem_used / 1e9:.2f}\n") + f.write(f"{i} {loss:.4f} {lr:.12f} {GlobalCounters.mem_used / 1e9:.2f}\n")  # lr is already a float here