free_intermediates in bert (#9040)

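Also re-enable dropout (drop DISABLE_DROPOUT=1 from the run scripts), raise EVAL_BS from 36 to 66, and change the RESET_STEP default from 1 to 0 now that free_intermediates() is called on the captured train/eval steps.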
This commit is contained in:
chenyu
2025-02-12 10:00:39 -05:00
committed by GitHub
parent 916d5e7f08
commit 7b5ac2c15e
7 changed files with 10 additions and 20 deletions

View File

@@ -804,7 +804,8 @@ def train_bert():
if i % eval_step_freq == 0 or (BENCHMARK and i == BENCHMARK):
if MLLOGGER and RUNMLPERF:
MLLOGGER.start(key=mllog_constants.EVAL_START, value=None, metadata={"epoch_num": i*BS, "step_num": i})
if getenv("RESET_STEP", 1): train_step_bert.reset()
if getenv("RESET_STEP", 0): train_step_bert.reset()
train_step_bert.captured.free_intermediates()
eval_lm_losses = []
eval_clsf_losses = []
eval_lm_accs = []
@@ -840,7 +841,8 @@ def train_bert():
MLLOGGER.event(key=mllog_constants.INIT_STOP, value=None)
return
if getenv("RESET_STEP", 1): eval_step_bert.reset()
if getenv("RESET_STEP", 0): eval_step_bert.reset()
eval_step_bert.captured.free_intermediates()
del eval_data
avg_lm_loss = sum(eval_lm_losses) / len(eval_lm_losses)
avg_clsf_loss = sum(eval_clsf_losses) / len(eval_clsf_losses)

View File

@@ -2,13 +2,11 @@
export PYTHONPATH="."
export MODEL="bert"
export DEFAULT_FLOAT="HALF" GPUS=6 BS=66 EVAL_BS=36
export DEFAULT_FLOAT="HALF" GPUS=6 BS=66 EVAL_BS=66
export BEAM=4 BEAM_UOPS_MAX=2000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024
export IGNORE_JIT_FIRST_BEAM=1
export BASEDIR="/raid/datasets/wiki"
# TODO: remove DISABLE_DROPOUT=1
export DISABLE_DROPOUT=1
export BENCHMARK=10 DEBUG=2

View File

@@ -2,13 +2,11 @@
export PYTHONPATH="."
export MODEL="bert"
export DEFAULT_FLOAT="HALF" GPUS=6 BS=66 EVAL_BS=36
export DEFAULT_FLOAT="HALF" GPUS=6 BS=66 EVAL_BS=66
export BEAM=4 BEAM_UOPS_MAX=2000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024
export IGNORE_JIT_FIRST_BEAM=1
export BASEDIR="/raid/datasets/wiki"
# TODO: remove DISABLE_DROPOUT=1
export DISABLE_DROPOUT=1
export WANDB=1 PARALLEL=0

View File

@@ -3,13 +3,11 @@
export PYTHONPATH="."
export MODEL="bert"
export SUBMISSION_PLATFORM="tinybox_green"
export DEFAULT_FLOAT="HALF" GPUS=6 BS=66 EVAL_BS=36
export DEFAULT_FLOAT="HALF" GPUS=6 BS=66 EVAL_BS=66
export BEAM=4 BEAM_UOPS_MAX=2000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024
export IGNORE_JIT_FIRST_BEAM=1
export BASEDIR="/raid/datasets/wiki"
# TODO: remove DISABLE_DROPOUT=1
export DISABLE_DROPOUT=1
# pip install -e ".[mlperf]"
export LOGMLPERF=1

View File

@@ -2,13 +2,11 @@
export PYTHONPATH="."
export MODEL="bert"
export DEFAULT_FLOAT="HALF" GPUS=6 BS=66 EVAL_BS=36
export DEFAULT_FLOAT="HALF" GPUS=6 BS=66 EVAL_BS=66
export BEAM=3 BEAM_UOPS_MAX=3000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024
export IGNORE_JIT_FIRST_BEAM=1
export BASEDIR="/raid/datasets/wiki"
# TODO: remove DISABLE_DROPOUT=1
export DISABLE_DROPOUT=1
export BENCHMARK=10 DEBUG=2

View File

@@ -2,13 +2,11 @@
export PYTHONPATH="."
export MODEL="bert"
export DEFAULT_FLOAT="HALF" GPUS=6 BS=66 EVAL_BS=36
export DEFAULT_FLOAT="HALF" GPUS=6 BS=66 EVAL_BS=66
export BEAM=3 BEAM_UOPS_MAX=3000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024
export IGNORE_JIT_FIRST_BEAM=1
export BASEDIR="/raid/datasets/wiki"
# TODO: remove DISABLE_DROPOUT=1
export DISABLE_DROPOUT=1
export WANDB=1 PARALLEL=0

View File

@@ -3,13 +3,11 @@
export PYTHONPATH="."
export MODEL="bert"
export SUBMISSION_PLATFORM="tinybox_red"
export DEFAULT_FLOAT="HALF" GPUS=6 BS=66 EVAL_BS=36
export DEFAULT_FLOAT="HALF" GPUS=6 BS=66 EVAL_BS=66
export BEAM=3 BEAM_UOPS_MAX=3000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024
export IGNORE_JIT_FIRST_BEAM=1
export BASEDIR="/raid/datasets/wiki"
# TODO: remove DISABLE_DROPOUT=1
export DISABLE_DROPOUT=1
# pip install -e ".[mlperf]"
export LOGMLPERF=1