mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-01-10 07:28:15 -05:00
bert use BS=66 and update hparams (#6932)
With the dropout memory improvement, BS=66 now fits in memory. This also reverts the hyperparameters to those from #5891.
This commit is contained in:
@@ -647,9 +647,9 @@ def train_bert():
|
||||
# ** hyperparameters **
|
||||
BS = config["GLOBAL_BATCH_SIZE"] = getenv("BS", 11 * len(GPUS) if dtypes.default_float in (dtypes.float16, dtypes.bfloat16) else 8 * len(GPUS))
|
||||
EVAL_BS = config["EVAL_BS"] = getenv("EVAL_BS", 1 * len(GPUS))
|
||||
max_lr = config["OPT_BASE_LEARNING_RATE"] = getenv("OPT_BASE_LEARNING_RATE", 0.0001 * math.sqrt(BS/54))
|
||||
max_lr = config["OPT_BASE_LEARNING_RATE"] = getenv("OPT_BASE_LEARNING_RATE", 0.0001 * math.sqrt(BS/66))
|
||||
|
||||
train_steps = config["TRAIN_STEPS"] = getenv("TRAIN_STEPS", 3300000 // BS)
|
||||
train_steps = config["TRAIN_STEPS"] = getenv("TRAIN_STEPS", 3000000 // BS)
|
||||
warmup_steps = config["NUM_WARMUP_STEPS"] = getenv("NUM_WARMUP_STEPS", 1)
|
||||
max_eval_steps = config["MAX_EVAL_STEPS"] = getenv("MAX_EVAL_STEPS", (10000 + EVAL_BS - 1) // EVAL_BS) # EVAL_BS * MAX_EVAL_STEPS >= 10000
|
||||
eval_step_freq = config["EVAL_STEP_FREQ"] = getenv("EVAL_STEP_FREQ", int((math.floor(0.05 * (230.23 * BS + 3000000) / 25000) * 25000) / BS)) # Round down
|
||||
@@ -658,7 +658,7 @@ def train_bert():
|
||||
save_ckpt_dir = config["SAVE_CKPT_DIR"] = getenv("SAVE_CKPT_DIR", "./ckpts")
|
||||
init_ckpt = config["INIT_CKPT_DIR"] = getenv("INIT_CKPT_DIR", BASEDIR)
|
||||
|
||||
loss_scaler = config["LOSS_SCALER"] = getenv("LOSS_SCALER", 2.0**10 if dtypes.default_float == dtypes.float16 else 1.0)
|
||||
loss_scaler = config["LOSS_SCALER"] = getenv("LOSS_SCALER", 2.0**13 if dtypes.default_float == dtypes.float16 else 1.0)
|
||||
decay = config["DECAY"] = getenv("DECAY", 0.01)
|
||||
epsilon = config["EPSILON"] = getenv("EPSILON", 1e-6)
|
||||
poly_power = config["POLY_POWER"] = getenv("POLY_POWER", 1.0)
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
export PYTHONPATH="."
|
||||
export MODEL="bert"
|
||||
export DEFAULT_FLOAT="HALF" GPUS=6 BS=54 EVAL_BS=6
|
||||
export DEFAULT_FLOAT="HALF" GPUS=6 BS=66 EVAL_BS=6
|
||||
|
||||
export BEAM=4
|
||||
export IGNORE_JIT_FIRST_BEAM=1
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
export PYTHONPATH="."
|
||||
export MODEL="bert"
|
||||
export DEFAULT_FLOAT="HALF" GPUS=6 BS=54 EVAL_BS=6
|
||||
export DEFAULT_FLOAT="HALF" GPUS=6 BS=66 EVAL_BS=6
|
||||
|
||||
export BEAM=4
|
||||
export IGNORE_JIT_FIRST_BEAM=1
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
export PYTHONPATH="."
|
||||
export MODEL="bert"
|
||||
export SUBMISSION_PLATFORM="tinybox_green"
|
||||
export DEFAULT_FLOAT="HALF" GPUS=6 BS=54 EVAL_BS=6
|
||||
export DEFAULT_FLOAT="HALF" GPUS=6 BS=66 EVAL_BS=6
|
||||
|
||||
export BEAM=4
|
||||
export IGNORE_JIT_FIRST_BEAM=1
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
export PYTHONPATH="."
|
||||
export MODEL="bert"
|
||||
export DEFAULT_FLOAT="HALF" GPUS=6 BS=54 EVAL_BS=6
|
||||
export DEFAULT_FLOAT="HALF" GPUS=6 BS=66 EVAL_BS=6
|
||||
|
||||
export BEAM=3
|
||||
export IGNORE_JIT_FIRST_BEAM=1
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
export PYTHONPATH="."
|
||||
export MODEL="bert"
|
||||
export DEFAULT_FLOAT="HALF" GPUS=6 BS=54 EVAL_BS=6
|
||||
export DEFAULT_FLOAT="HALF" GPUS=6 BS=66 EVAL_BS=6
|
||||
|
||||
export BEAM=3
|
||||
export IGNORE_JIT_FIRST_BEAM=1
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
export PYTHONPATH="."
|
||||
export MODEL="bert"
|
||||
export SUBMISSION_PLATFORM="tinybox_red"
|
||||
export DEFAULT_FLOAT="HALF" GPUS=6 BS=54 EVAL_BS=6
|
||||
export DEFAULT_FLOAT="HALF" GPUS=6 BS=66 EVAL_BS=6
|
||||
|
||||
export BEAM=3
|
||||
export IGNORE_JIT_FIRST_BEAM=1
|
||||
|
||||
Reference in New Issue
Block a user