dev_run for bert on mi300x (#9706)

2026-01-08 22:48:25 -05:00 · 2025-04-02 21:12:55 -04:00
parent d96b4983ac
commit a6fec2f5ae
3 changed files with 16 additions and 2 deletions
--- a/examples/mlperf/model_train.py
+++ b/examples/mlperf/model_train.py
@@ -873,7 +873,7 @@ def train_bert():

      if WANDB:
        wandb.log({"eval/lm_loss": avg_lm_loss, "eval/clsf_loss": avg_clsf_loss, "eval/lm_accuracy": avg_lm_acc, \
-                    "eval/clsf_accuracy": avg_clsf_acc, "eval/forward_time": avg_fw_time})
+                    "eval/clsf_accuracy": avg_clsf_acc, "eval/forward_time": avg_fw_time, "epoch": (i+1)*BS})

      if MLLOGGER and RUNMLPERF:
        MLLOGGER.end(key=mllog_constants.EVAL_STOP, value=i*BS, metadata={"epoch_count": i*BS, "step_num": i, "samples_count": config["EVAL_BS"] * config["MAX_EVAL_STEPS"]})
--- a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_amd/dev_beam.sh
+++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_amd/dev_beam.sh
@@ -2,7 +2,7 @@

 export PYTHONPATH="."
 export MODEL="bert"
-export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=8 BS=1024 EVAL_BS=1024
+export DEFAULT_FLOAT="HALF" GPUS=8 BS=1024 EVAL_BS=1024

 export BEAM=3 BEAM_UOPS_MAX=4000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
 export IGNORE_JIT_FIRST_BEAM=1
--- a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_amd/dev_run.sh
+++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_amd/dev_run.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+export PYTHONPATH="."
+export MODEL="bert"
+export DEFAULT_FLOAT="HALF" GPUS=8 BS=1024 EVAL_BS=1024
+export OPT_BASE_LEARNING_RATE=0.001
+
+export BEAM=3 BEAM_UOPS_MAX=4000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
+export IGNORE_JIT_FIRST_BEAM=1
+export BASEDIR="/raid/datasets/wiki"
+
+export WANDB=1 PARALLEL=0
+
+RUNMLPERF=1 python3 examples/mlperf/model_train.py