Mirror of https://github.com/tinygrad/tinygrad.git (synced 2026-01-07 22:23:55 -05:00)
minor bert / llama change from grad acc branch (#13622)
* minor bert / llama change from grad acc branch
* revert those
.github/workflows/benchmark.yml (vendored): 4 changed lines
@@ -342,7 +342,7 @@ jobs:
       run: BENCHMARK_LOG=resnet_10steps_6gpu NV=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=1536 GPUS=6 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet.txt
     - name: Run 10 MLPerf Bert training steps (6 gpu)
       # TODO: remove BERT_LAYERS once scheduler is fast
-      run: BENCHMARK_LOG=bert_10steps_6gpu NV=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=66 GPUS=6 BERT_LAYERS=2 MODEL=bert python3 examples/mlperf/model_train.py | tee train_bert.txt
+      run: BENCHMARK_LOG=bert_10steps_6gpu NV=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=72 GPUS=6 BERT_LAYERS=2 MODEL=bert python3 examples/mlperf/model_train.py | tee train_bert.txt
     - uses: actions/upload-artifact@v4
       with:
         name: Speed (NVIDIA Training)
@@ -594,7 +594,7 @@ jobs:
       run: BENCHMARK_LOG=resnet_10steps_6gpu AMD=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=1536 GPUS=6 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet.txt
     - name: Run 10 MLPerf Bert training steps (6 gpu)
       # TODO: remove BERT_LAYERS once scheduler is fast
-      run: BENCHMARK_LOG=bert_10steps_6gpu AMD=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=66 GPUS=6 BERT_LAYERS=2 MODEL=bert python3 examples/mlperf/model_train.py | tee train_bert.txt
+      run: BENCHMARK_LOG=bert_10steps_6gpu AMD=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=72 GPUS=6 BERT_LAYERS=2 MODEL=bert python3 examples/mlperf/model_train.py | tee train_bert.txt
     - uses: actions/upload-artifact@v4
       with:
         name: Speed (AMD MLPerf)
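Both workflow hunks make the same one-line change: the Bert benchmark's batch size goes from BS=66 to BS=72 on the 6-GPU NVIDIA and AMD runners. The rest of each `run:` line is configuration the training script reads from the environment. As a rough, hedged sketch (the defaults and the variable handling below are assumptions for illustration, not a quote of model_train.py), such knobs are typically consumed through tinygrad's getenv helper:

# Hedged sketch: how BS/GPUS/BERT_LAYERS/BENCHMARK from the workflow's `run:` lines
# could be read inside a training script via tinygrad's getenv helper.
# The default values here are illustrative only.
from tinygrad.helpers import getenv

BS          = getenv("BS", 72)          # per-step batch size (the workflow now passes BS=72)
GPUS        = getenv("GPUS", 6)         # number of devices the batch is sharded across
BERT_LAYERS = getenv("BERT_LAYERS", 2)  # reduced layer count; the workflow TODO says to drop this
BENCHMARK   = getenv("BENCHMARK", 0)    # when > 0, run only this many steps and exit

if __name__ == "__main__":
  print(f"BS={BS} GPUS={GPUS} BERT_LAYERS={BERT_LAYERS} BENCHMARK={BENCHMARK}")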
@@ -1040,8 +1040,8 @@ def train_bert():
 
   # ** Optimizer **
   parameters_no_wd = [v for k, v in get_state_dict(model).items() if "bias" in k or "LayerNorm" in k]
-  parameters = [x for x in parameters if x not in set(parameters_no_wd)]
-  optimizer_wd = LAMB(parameters, lr=max_lr, b1=opt_lamb_beta_1, b2=opt_lamb_beta_2, eps=epsilon, weight_decay=decay, adam=False)
+  parameters_wd = [x for x in parameters if x not in set(parameters_no_wd)]
+  optimizer_wd = LAMB(parameters_wd, lr=max_lr, b1=opt_lamb_beta_1, b2=opt_lamb_beta_2, eps=epsilon, weight_decay=decay, adam=False)
   optimizer_no_wd = LAMB(parameters_no_wd, lr=max_lr, b1=opt_lamb_beta_1, b2=opt_lamb_beta_2, eps=epsilon, weight_decay=0.0, adam=False)
   optimizer_group = OptimizerGroup(optimizer_wd, optimizer_no_wd)
 
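The hunk above (from what the run commands suggest is examples/mlperf/model_train.py) does not change what the optimizer sees: instead of overwriting `parameters` with the weight-decay subset, that subset now gets its own name, `parameters_wd`, so the full `parameters` list stays intact for later use on the gradient-accumulation branch. Below is a minimal, self-contained sketch of the same grouping pattern on a toy model; LAMB, OptimizerGroup and get_state_dict are tinygrad APIs, while the toy model and hyperparameter values are placeholders, not the MLPerf settings.

# Minimal sketch of the weight-decay / no-weight-decay split used in train_bert,
# applied to a toy model. The model and hyperparameters are placeholders.
from tinygrad import Tensor, nn
from tinygrad.nn.state import get_state_dict
from tinygrad.nn.optim import LAMB, OptimizerGroup

class ToyModel:
  def __init__(self):
    self.proj = nn.Linear(16, 16)   # contributes "proj.weight" and "proj.bias"
    self.norm = nn.LayerNorm(16)    # contributes "norm.weight" and "norm.bias"
  def __call__(self, x: Tensor) -> Tensor:
    return self.norm(self.proj(x))

model = ToyModel()
state = get_state_dict(model)

# biases and normalization parameters are excluded from weight decay
parameters_no_wd = [v for k, v in state.items() if "bias" in k or "norm" in k]
parameters = list(state.values())
# keep `parameters` intact; the decayed subset gets its own list (the rename in this commit)
parameters_wd = [x for x in parameters if x not in set(parameters_no_wd)]

optimizer_wd    = LAMB(parameters_wd,    lr=1e-3, weight_decay=0.01, adam=False)
optimizer_no_wd = LAMB(parameters_no_wd, lr=1e-3, weight_decay=0.0,  adam=False)
optimizer_group = OptimizerGroup(optimizer_wd, optimizer_no_wd)
# optimizer_group.zero_grad() / optimizer_group.step() then drive both groups together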
@@ -1418,14 +1418,14 @@ def train_llama3():
 
   def get_train_iter():
     if getenv("FAKEDATA", 0):
-      return fake_data(GBS, SAMPLES)
+      return fake_data(BS, SAMPLES)
     else:
       if SMALL:
         from examples.mlperf.dataloader import batch_load_llama3_small
-        return batch_load_llama3_small(GBS, SAMPLES, SEQLEN, BASEDIR, seed=SEED, val=bool(TRAIN_ON_VAL))
+        return batch_load_llama3_small(BS, SAMPLES, SEQLEN, BASEDIR, seed=SEED, val=bool(TRAIN_ON_VAL))
       else:
         from examples.mlperf.dataloader import batch_load_llama3
-        return batch_load_llama3(GBS, SAMPLES, SEQLEN, BASEDIR, seed=SEED, val=bool(TRAIN_ON_VAL))
+        return batch_load_llama3(BS, SAMPLES, SEQLEN, BASEDIR, seed=SEED, val=bool(TRAIN_ON_VAL))
 
   def get_eval_iter():
     if getenv("FAKEDATA", 0):
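The train_llama3 hunk is the symmetric change: all three branches of get_train_iter now build the iterator with BS instead of GBS (fake data, the small loader, and the full loader). Reading BS as the per-step batch and GBS as the global batch spanned by gradient accumulation is an inference from the branch name, not something the diff itself states. Below is a hedged, standalone sketch of the FAKEDATA-gated selection; fake_data and the constants are stand-ins, not the script's real definitions.

# Hedged sketch of the FAKEDATA-gated train-iterator selection in train_llama3.
# fake_data below is a stand-in; the real loaders come from examples.mlperf.dataloader.
import os
os.environ.setdefault("FAKEDATA", "1")   # force the fake path for this standalone demo

import numpy as np
from tinygrad.helpers import getenv

BS, SAMPLES, SEQLEN = getenv("BS", 8), getenv("SAMPLES", 64), getenv("SEQLEN", 128)

def fake_data(bs: int, samples: int):
  # yields random token batches of shape (bs, SEQLEN); stands in for the real generator
  for _ in range(samples // bs):
    yield np.random.randint(0, 32000, size=(bs, SEQLEN), dtype=np.int32)

def get_train_iter():
  if getenv("FAKEDATA", 0):
    # the commit's point: the iterator is built with the per-step BS, not the global GBS
    return fake_data(BS, SAMPLES)
  # the real script would return batch_load_llama3_small(BS, ...) or batch_load_llama3(BS, ...)
  raise NotImplementedError("real dataloaders live in examples.mlperf.dataloader")

for batch in get_train_iter():
  print(batch.shape)   # (8, 128) with the defaults above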