mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-01-08 22:48:25 -05:00
bert dev_beam for mi300x box (#9648)
* bert dev_beam for mi300x box
* terminate BENCHMARK properly
This commit is contained in:
@@ -853,7 +853,7 @@ def train_bert():
|
||||
et = time.time()
|
||||
eval_times.append(et - st)
|
||||
|
||||
if BENCHMARK and j == BENCHMARK:
|
||||
if BENCHMARK and (j+1) == min(BENCHMARK, max_eval_steps):
|
||||
# assume INITMLPERF has BENCHMARK set
|
||||
if MLLOGGER and INITMLPERF:
|
||||
MLLOGGER.event(key=mllog_constants.INIT_STOP, value=None)
|
||||
@@ -900,6 +900,9 @@ def train_bert():
|
||||
# stop once hitting the target
|
||||
break
|
||||
|
||||
# should not happen, BENCHMARK not properly terminated
|
||||
if BENCHMARK: assert i < BENCHMARK, i
|
||||
|
||||
if getenv("CKPT") and i % save_ckpt_freq == 0:
|
||||
if MLLOGGER and RUNMLPERF:
|
||||
if previous_step:
|
||||
|
||||
@@ -0,0 +1,15 @@
|
||||
#!/bin/bash
|
||||
|
||||
export PYTHONPATH="."
|
||||
export MODEL="bert"
|
||||
export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=8 BS=1024 EVAL_BS=1024
|
||||
|
||||
export BEAM=3 BEAM_UOPS_MAX=4000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
|
||||
export IGNORE_JIT_FIRST_BEAM=1
|
||||
# export BEAM_LOG_SURPASS_MAX=1
|
||||
# export BASEDIR="/raid/datasets/wiki"
|
||||
|
||||
export RESET_STEP=1
|
||||
export BENCHMARK=10 DEBUG=2
|
||||
|
||||
python3 examples/mlperf/model_train.py
|
||||
Reference in New Issue
Block a user