update llama logging (#13803)

```
REWRITE_STACK_LIMIT=1000000 SMALL=1 BASEDIR=/raid/datasets/c4-8b SAMPLES=1000 BS=8 DP=8 DEFAULT_FLOAT=bfloat16 OPTIM_DTYPE=bfloat16 LLAMA3_SIZE=8B SEQLEN=1024 PYTHONPATH=. MODEL=llama3 python3 examples/mlperf/model_train.py

    1 93.44 s run, 11.8750 loss, 0.000000000001 LR, 642.43 GB used,  19644.30 GFLOPS
    2 101.78 s run, 11.8750 loss, 0.000000000001 LR, 1454.57 GB used,  17039.35 GFLOPS
    3 7.34 s run, 11.8750 loss, 0.000000000002 LR, 1454.57 GB used, 236258.78 GFLOPS
    4 4.32 s run, 11.8750 loss, 0.000000000002 LR, 1454.57 GB used, 401488.40 GFLOPS
    5 4.36 s run, 11.9375 loss, 0.000000000003 LR, 1454.57 GB used, 398116.13 GFLOPS
    6 4.32 s run, 11.8750 loss, 0.000000000003 LR, 1454.57 GB used, 401878.60 GFLOPS
    7 4.34 s run, 11.8750 loss, 0.000000000004 LR, 1454.57 GB used, 399822.57 GFLOPS
    8 4.35 s run, 11.8750 loss, 0.000000000004 LR, 1454.57 GB used, 398512.24 GFLOPS
    9 4.36 s run, 11.8750 loss, 0.000000000005 LR, 1454.57 GB used, 397832.61 GFLOPS
   10 4.40 s run, 11.8750 loss, 0.000000000005 LR, 1454.57 GB used, 394520.83 GFLOPS
```
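Each row above is one training step: step index, wall-clock seconds for the step, loss, learning rate, GB used, and achieved GFLOPS. As a hedged illustration of the last column, throughput is just the step's total FLOP count divided by its wall-clock time; the sketch below uses made-up values for `global_ops` (standing in for tinygrad's `GlobalCounters.global_ops`) and `sec`, picked to roughly reproduce step 4 of the log:

```
# Sketch of the GFLOPS column: total FLOPs for the step divided by
# wall-clock seconds, scaled by 1e-9. Both values are illustrative;
# `global_ops` stands in for tinygrad's GlobalCounters.global_ops.
global_ops = 1_734_430 * 1e9  # hypothetical FLOP count for one step
sec = 4.32                    # hypothetical wall-clock step time
print(f"{4:5} {sec:.2f} s run, {global_ops * 1e-9 / sec:9.2f} GFLOPS")
# prints: "    4 4.32 s run, 401488.43 GFLOPS"
```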
commit 39d962106f (parent 389f01c7f4)
Author: chenyu
Date:   2025-12-22 11:28:29 -05:00 (committed by GitHub)


```
@@ -1442,11 +1442,16 @@ def train_llama3():
       GlobalCounters.reset()
       loss, lr = train_step(model, tokens)
       loss = loss.float().item()
+      lr = lr.item()
       i += 1
       sequences_seen += tokens.shape[0]
-      tqdm.write(f"{loss:.4f} loss, {lr.item():.12f} LR, {GlobalCounters.mem_used / 1e9:.2f} GB used, {time.perf_counter()-t:.2f} s")
+      sec = time.perf_counter()-t
+      tqdm.write(
+        f"{i:5} {sec:.2f} s run, {loss:.4f} loss, {lr:.12f} LR, {GlobalCounters.mem_used / 1e9:.2f} GB used, "
+        f"{GlobalCounters.global_ops * 1e-9 / sec:9.2f} GFLOPS")
       if (fname:=getenv("LOSS_FILE", "")):
         with open(fname, "a") as f:
-          f.write(f"{i} {loss:.4f} {lr.item():.12f} {GlobalCounters.mem_used / 1e9:.2f}\n")
+          f.write(f"{i} {loss:.4f} {lr:.12f} {GlobalCounters.mem_used / 1e9:.2f}\n")
```
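When `LOSS_FILE` is set, the branch above appends one space-separated row per step, in the order of the format string: step index, loss, LR, and GB used. A minimal sketch for reading the file back, assuming a hypothetical `loss.txt`:

```
# Minimal sketch: parse rows appended by the LOSS_FILE branch above.
# Column order follows the f.write format: step, loss, LR, GB used.
# "loss.txt" is a hypothetical filename for illustration.
with open("loss.txt") as f:
  for line in f:
    step, loss, lr, gb = line.split()
    print(f"step {int(step):5}: {float(loss):.4f} loss, {float(lr):.12f} LR, {float(gb):.2f} GB used")
```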