mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-01-08 22:48:25 -05:00
increase bert TRAIN_STEPS for mi300x (#9833)
got a few non converged ones so try to increase steps. we need >= 90% runs to converge
This commit is contained in:
@@ -7,7 +7,7 @@ export DEFAULT_FLOAT="HALF" GPUS=8 BS=1024 EVAL_BS=1024
|
||||
|
||||
# similar to https://github.com/mlcommons/training_results_v3.1/blob/d06288b2bd675a9d88e0e6181f5bb5626b71ec19/Quanta_Cloud_Technology/results/D54U-3U/bert/result_1.txt#L54
|
||||
export OPT_BASE_LEARNING_RATE=0.0011 OPT_LAMB_BETA_1=0.60466 OPT_LAMB_BETA_2=0.85437 DECAY=0.1
|
||||
export TRAIN_STEPS=3418
|
||||
export TRAIN_STEPS=3600
|
||||
|
||||
export BEAM=3 BEAM_UOPS_MAX=3000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
|
||||
export IGNORE_JIT_FIRST_BEAM=1 FREE_INTERMEDIATE=0
|
||||
|
||||
Reference in New Issue
Block a user