mirror of https://github.com/tinygrad/tinygrad.git (synced 2026-01-09 15:08:02 -05:00)
minor bert / llama change from grad acc branch (#13622)

* minor bert / llama change from grad acc branch
* revert those
.github/workflows/benchmark.yml (vendored, 4 changes)
@@ -342,7 +342,7 @@ jobs:
       run: BENCHMARK_LOG=resnet_10steps_6gpu NV=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=1536 GPUS=6 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet.txt
     - name: Run 10 MLPerf Bert training steps (6 gpu)
       # TODO: remove BERT_LAYERS once scheduler is fast
-      run: BENCHMARK_LOG=bert_10steps_6gpu NV=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=66 GPUS=6 BERT_LAYERS=2 MODEL=bert python3 examples/mlperf/model_train.py | tee train_bert.txt
+      run: BENCHMARK_LOG=bert_10steps_6gpu NV=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=72 GPUS=6 BERT_LAYERS=2 MODEL=bert python3 examples/mlperf/model_train.py | tee train_bert.txt
     - uses: actions/upload-artifact@v4
       with:
         name: Speed (NVIDIA Training)
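The only edit in this hunk is the Bert benchmark batch size, BS=66 -> BS=72; both values shard evenly across the 6 GPUs (11 vs 12 samples per device). As a minimal sketch of how model_train.py-style scripts consume these knobs, assuming the usual tinygrad getenv pattern (the defaults shown are illustrative, not the script's real ones):

from tinygrad.helpers import getenv  # tinygrad's env-var helper: getenv(key, default)

BS = getenv("BS", 72)                    # global batch size set on the workflow line above
GPUS = getenv("GPUS", 6)                 # number of devices the batch is sharded across
BENCHMARK = getenv("BENCHMARK", 0)       # nonzero: run only this many timed steps
BERT_LAYERS = getenv("BERT_LAYERS", 24)  # illustrative default; the workflow pins 2 per the TODO

assert BS % GPUS == 0, "batch must shard evenly across devices"
print(f"{BS} samples/step over {GPUS} GPUs -> {BS // GPUS} per GPU")

The AMD hunk below makes the identical 66 -> 72 change.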
@@ -594,7 +594,7 @@ jobs:
       run: BENCHMARK_LOG=resnet_10steps_6gpu AMD=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=1536 GPUS=6 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet.txt
     - name: Run 10 MLPerf Bert training steps (6 gpu)
       # TODO: remove BERT_LAYERS once scheduler is fast
-      run: BENCHMARK_LOG=bert_10steps_6gpu AMD=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=66 GPUS=6 BERT_LAYERS=2 MODEL=bert python3 examples/mlperf/model_train.py | tee train_bert.txt
+      run: BENCHMARK_LOG=bert_10steps_6gpu AMD=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=72 GPUS=6 BERT_LAYERS=2 MODEL=bert python3 examples/mlperf/model_train.py | tee train_bert.txt
     - uses: actions/upload-artifact@v4
      with:
         name: Speed (AMD MLPerf)
examples/mlperf/model_train.py

@@ -1040,8 +1040,8 @@ def train_bert():
 
   # ** Optimizer **
   parameters_no_wd = [v for k, v in get_state_dict(model).items() if "bias" in k or "LayerNorm" in k]
-  parameters = [x for x in parameters if x not in set(parameters_no_wd)]
-  optimizer_wd = LAMB(parameters, lr=max_lr, b1=opt_lamb_beta_1, b2=opt_lamb_beta_2, eps=epsilon, weight_decay=decay, adam=False)
+  parameters_wd = [x for x in parameters if x not in set(parameters_no_wd)]
+  optimizer_wd = LAMB(parameters_wd, lr=max_lr, b1=opt_lamb_beta_1, b2=opt_lamb_beta_2, eps=epsilon, weight_decay=decay, adam=False)
   optimizer_no_wd = LAMB(parameters_no_wd, lr=max_lr, b1=opt_lamb_beta_1, b2=opt_lamb_beta_2, eps=epsilon, weight_decay=0.0, adam=False)
   optimizer_group = OptimizerGroup(optimizer_wd, optimizer_no_wd)
 
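The rename gives the filtered list its own name instead of clobbering the full `parameters` list; behavior is unchanged in this excerpt, but presumably the grad acc branch still needs the full list afterwards. A minimal, self-contained sketch of the same weight-decay split using tinygrad's LAMB and OptimizerGroup (TinyNet and the hyperparameters are placeholders, not the MLPerf values):

from tinygrad import Tensor, nn
from tinygrad.nn.state import get_state_dict, get_parameters
from tinygrad.nn.optim import LAMB, OptimizerGroup

class TinyNet:
  def __init__(self):
    self.linear = nn.Linear(16, 16)
    self.LayerNorm = nn.LayerNorm(16)  # named like the BERT module so the filter below matches it
  def __call__(self, x: Tensor) -> Tensor:
    return self.LayerNorm(self.linear(x)).relu()

model = TinyNet()
parameters = get_parameters(model)
# biases and LayerNorm tensors are exempt from weight decay; everything else decays
parameters_no_wd = [v for k, v in get_state_dict(model).items() if "bias" in k or "LayerNorm" in k]
parameters_wd = [x for x in parameters if x not in set(parameters_no_wd)]
optimizer_wd = LAMB(parameters_wd, lr=1e-3, weight_decay=0.01, adam=False)
optimizer_no_wd = LAMB(parameters_no_wd, lr=1e-3, weight_decay=0.0, adam=False)
optimizer_group = OptimizerGroup(optimizer_wd, optimizer_no_wd)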
@@ -1418,14 +1418,14 @@ def train_llama3():
 
   def get_train_iter():
     if getenv("FAKEDATA", 0):
-      return fake_data(GBS, SAMPLES)
+      return fake_data(BS, SAMPLES)
     else:
       if SMALL:
         from examples.mlperf.dataloader import batch_load_llama3_small
-        return batch_load_llama3_small(GBS, SAMPLES, SEQLEN, BASEDIR, seed=SEED, val=bool(TRAIN_ON_VAL))
+        return batch_load_llama3_small(BS, SAMPLES, SEQLEN, BASEDIR, seed=SEED, val=bool(TRAIN_ON_VAL))
       else:
         from examples.mlperf.dataloader import batch_load_llama3
-        return batch_load_llama3(GBS, SAMPLES, SEQLEN, BASEDIR, seed=SEED, val=bool(TRAIN_ON_VAL))
+        return batch_load_llama3(BS, SAMPLES, SEQLEN, BASEDIR, seed=SEED, val=bool(TRAIN_ON_VAL))
 
   def get_eval_iter():
     if getenv("FAKEDATA", 0):