diff --git a/examples/mlperf/model_train.py b/examples/mlperf/model_train.py index 4a8192c80f..147dafc19c 100644 --- a/examples/mlperf/model_train.py +++ b/examples/mlperf/model_train.py @@ -873,7 +873,7 @@ def train_bert(): if WANDB: wandb.log({"eval/lm_loss": avg_lm_loss, "eval/clsf_loss": avg_clsf_loss, "eval/lm_accuracy": avg_lm_acc, \ - "eval/clsf_accuracy": avg_clsf_acc, "eval/forward_time": avg_fw_time}) + "eval/clsf_accuracy": avg_clsf_acc, "eval/forward_time": avg_fw_time, "epoch": (i+1)*BS}) if MLLOGGER and RUNMLPERF: MLLOGGER.end(key=mllog_constants.EVAL_STOP, value=i*BS, metadata={"epoch_count": i*BS, "step_num": i, "samples_count": config["EVAL_BS"] * config["MAX_EVAL_STEPS"]}) diff --git a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_amd/dev_beam.sh b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_amd/dev_beam.sh index 1b6ff92c14..da566ed807 100755 --- a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_amd/dev_beam.sh +++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_amd/dev_beam.sh @@ -2,7 +2,7 @@ export PYTHONPATH="." export MODEL="bert" -export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=8 BS=1024 EVAL_BS=1024 +export DEFAULT_FLOAT="HALF" GPUS=8 BS=1024 EVAL_BS=1024 export BEAM=3 BEAM_UOPS_MAX=4000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 export IGNORE_JIT_FIRST_BEAM=1 diff --git a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_amd/dev_run.sh b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_amd/dev_run.sh new file mode 100755 index 0000000000..a2eb6494f8 --- /dev/null +++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_amd/dev_run.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +export PYTHONPATH="." +export MODEL="bert" +export DEFAULT_FLOAT="HALF" GPUS=8 BS=1024 EVAL_BS=1024 +export OPT_BASE_LEARNING_RATE=0.001 + +export BEAM=3 BEAM_UOPS_MAX=4000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 +export IGNORE_JIT_FIRST_BEAM=1 +export BASEDIR="/raid/datasets/wiki" + +export WANDB=1 PARALLEL=0 + +RUNMLPERF=1 python3 examples/mlperf/model_train.py \ No newline at end of file