dev_run for bert on mi300x (#9706)

This commit is contained in:
chenyu
2025-04-02 21:12:55 -04:00
committed by GitHub
parent d96b4983ac
commit a6fec2f5ae
3 changed files with 16 additions and 2 deletions

View File

@@ -873,7 +873,7 @@ def train_bert():
if WANDB:
wandb.log({"eval/lm_loss": avg_lm_loss, "eval/clsf_loss": avg_clsf_loss, "eval/lm_accuracy": avg_lm_acc, \
"eval/clsf_accuracy": avg_clsf_acc, "eval/forward_time": avg_fw_time})
"eval/clsf_accuracy": avg_clsf_acc, "eval/forward_time": avg_fw_time, "epoch": (i+1)*BS})
if MLLOGGER and RUNMLPERF:
MLLOGGER.end(key=mllog_constants.EVAL_STOP, value=i*BS, metadata={"epoch_count": i*BS, "step_num": i, "samples_count": config["EVAL_BS"] * config["MAX_EVAL_STEPS"]})

View File

@@ -2,7 +2,7 @@
export PYTHONPATH="."
export MODEL="bert"
export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=8 BS=1024 EVAL_BS=1024
export DEFAULT_FLOAT="HALF" GPUS=8 BS=1024 EVAL_BS=1024
export BEAM=3 BEAM_UOPS_MAX=4000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
export IGNORE_JIT_FIRST_BEAM=1

View File

@@ -0,0 +1,14 @@
#!/bin/bash
# Launcher for a BERT training run via examples/mlperf/model_train.py.
# Every setting below is handed to the trainer as an environment variable.
# NOTE(review): the relative paths assume this is started from the repository root.

# Make the current directory importable by Python.
export PYTHONPATH="."

# Which model the trainer should run.
export MODEL="bert"

# Default float type, device count, and train/eval batch sizes.
export DEFAULT_FLOAT="HALF"
export GPUS=8
export BS=1024
export EVAL_BS=1024

# Base learning rate for the optimizer.
export OPT_BASE_LEARNING_RATE=0.001

# BEAM-prefixed tuning knobs.
# NOTE(review): the exact meaning of each limit is not visible here — confirm
# against the tinygrad environment-variable documentation.
export BEAM=3
export BEAM_UOPS_MAX=4000
export BEAM_UPCAST_MAX=256
export BEAM_LOCAL_MAX=1024
export BEAM_MIN_PROGRESS=5
export IGNORE_JIT_FIRST_BEAM=1

# Dataset location.
export BASEDIR="/raid/datasets/wiki"

# WANDB=1 presumably turns on Weights & Biases logging; PARALLEL=0 semantics
# are defined by the trainer — TODO confirm both against model_train.py.
export WANDB=1
export PARALLEL=0

# RUNMLPERF is set for this one command only (it is not exported).
RUNMLPERF=1 python3 examples/mlperf/model_train.py