From 2471b49e45f6144f3be9f764426639837b6def29 Mon Sep 17 00:00:00 2001 From: chenyu Date: Mon, 8 Dec 2025 16:04:14 -0500 Subject: [PATCH] minor bert / llama change from grad acc branch (#13622) * minor bert / llama change from grad acc branch * revert those --- .github/workflows/benchmark.yml | 4 ++-- examples/mlperf/model_train.py | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 6850bd5e4e..3855abd206 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -342,7 +342,7 @@ jobs: run: BENCHMARK_LOG=resnet_10steps_6gpu NV=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=1536 GPUS=6 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet.txt - name: Run 10 MLPerf Bert training steps (6 gpu) # TODO: remove BERT_LAYERS once scheduler is fast - run: BENCHMARK_LOG=bert_10steps_6gpu NV=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=66 GPUS=6 BERT_LAYERS=2 MODEL=bert python3 examples/mlperf/model_train.py | tee train_bert.txt + run: BENCHMARK_LOG=bert_10steps_6gpu NV=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=72 GPUS=6 BERT_LAYERS=2 MODEL=bert python3 examples/mlperf/model_train.py | tee train_bert.txt - uses: actions/upload-artifact@v4 with: name: Speed (NVIDIA Training) @@ -594,7 +594,7 @@ jobs: run: BENCHMARK_LOG=resnet_10steps_6gpu AMD=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=1536 GPUS=6 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet.txt - name: Run 10 MLPerf Bert training steps (6 gpu) # TODO: remove BERT_LAYERS once scheduler is fast - run: BENCHMARK_LOG=bert_10steps_6gpu AMD=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=66 GPUS=6 BERT_LAYERS=2 MODEL=bert python3 examples/mlperf/model_train.py | tee train_bert.txt + run: BENCHMARK_LOG=bert_10steps_6gpu AMD=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=72 GPUS=6 BERT_LAYERS=2 MODEL=bert python3 examples/mlperf/model_train.py | tee train_bert.txt - uses: actions/upload-artifact@v4 with: name: Speed (AMD MLPerf) diff --git a/examples/mlperf/model_train.py b/examples/mlperf/model_train.py index 51dd9cb2c9..1b80181067 100644 --- a/examples/mlperf/model_train.py +++ b/examples/mlperf/model_train.py @@ -1040,8 +1040,8 @@ def train_bert(): # ** Optimizer ** parameters_no_wd = [v for k, v in get_state_dict(model).items() if "bias" in k or "LayerNorm" in k] - parameters = [x for x in parameters if x not in set(parameters_no_wd)] - optimizer_wd = LAMB(parameters, lr=max_lr, b1=opt_lamb_beta_1, b2=opt_lamb_beta_2, eps=epsilon, weight_decay=decay, adam=False) + parameters_wd = [x for x in parameters if x not in set(parameters_no_wd)] + optimizer_wd = LAMB(parameters_wd, lr=max_lr, b1=opt_lamb_beta_1, b2=opt_lamb_beta_2, eps=epsilon, weight_decay=decay, adam=False) optimizer_no_wd = LAMB(parameters_no_wd, lr=max_lr, b1=opt_lamb_beta_1, b2=opt_lamb_beta_2, eps=epsilon, weight_decay=0.0, adam=False) optimizer_group = OptimizerGroup(optimizer_wd, optimizer_no_wd) @@ -1418,14 +1418,14 @@ def train_llama3(): def get_train_iter(): if getenv("FAKEDATA", 0): - return fake_data(GBS, SAMPLES) + return fake_data(BS, SAMPLES) else: if SMALL: from examples.mlperf.dataloader import batch_load_llama3_small - return batch_load_llama3_small(GBS, SAMPLES, SEQLEN, BASEDIR, seed=SEED, val=bool(TRAIN_ON_VAL)) + return batch_load_llama3_small(BS, SAMPLES, SEQLEN, BASEDIR, seed=SEED, val=bool(TRAIN_ON_VAL)) else: from examples.mlperf.dataloader import batch_load_llama3 - return batch_load_llama3(GBS, SAMPLES, SEQLEN, BASEDIR, seed=SEED, val=bool(TRAIN_ON_VAL)) + return batch_load_llama3(BS, SAMPLES, SEQLEN, BASEDIR, seed=SEED, val=bool(TRAIN_ON_VAL)) def get_eval_iter(): if getenv("FAKEDATA", 0):