add bert to benchmark ci (#8741)

with `DISABLE_DROPOUT=1 BERT_LAYERS=2` for now
2026-02-17 10:02:00 -05:00 · 2025-01-24 14:45:11 -05:00
parent e0e176efbc
commit 0c759e1ff6
1 changed file with 12 additions and 2 deletions
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -299,6 +299,10 @@ jobs:
      run: NV=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=256 GPUS=1 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet_one_gpu.txt
    - name: Run 10 MLPerf ResNet50 training steps (6 gpu)
      run: NV=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=1536 GPUS=6 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet.txt
+    - name: Run 10 MLPerf Bert training steps (6 gpu)
+      # TODO: remove DISABLE_DROPOUT once dropout is fixed
+      # TODO: remove BERT_LAYERS once scheduler is fast
+      run: NV=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=66 GPUS=6 DISABLE_DROPOUT=1 BERT_LAYERS=2 MODEL=bert python3 examples/mlperf/model_train.py | tee train_bert.txt
    - uses: actions/upload-artifact@v4
      with:
        name: Speed (NVIDIA Training)
@@ -309,9 +313,10 @@ jobs:
          train_cifar_bf16.txt
          train_cifar_wino.txt
          train_cifar_one_gpu.txt
+          train_cifar_six_gpu.txt
          train_resnet.txt
          train_resnet_one_gpu.txt
-          train_cifar_six_gpu.txt
+          train_bert.txt
    - name: Run process replay tests
      run: cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3 process_replay.py

@@ -492,6 +497,10 @@ jobs:
      run: AMD=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=256 GPUS=1 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet_one_gpu.txt
    - name: Run 10 MLPerf ResNet50 training steps (6 gpu)
      run: AMD=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=1536 GPUS=6 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet.txt
+    - name: Run 10 MLPerf Bert training steps (6 gpu)
+      # TODO: remove DISABLE_DROPOUT once dropout is fixed
+      # TODO: remove BERT_LAYERS once scheduler is fast
+      run: AMD=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=66 GPUS=6 DISABLE_DROPOUT=1 BERT_LAYERS=2 MODEL=bert python3 examples/mlperf/model_train.py | tee train_bert.txt
    - uses: actions/upload-artifact@v4
      with:
        name: Speed (AMD Training)
@@ -502,9 +511,10 @@ jobs:
          train_cifar_bf16.txt
          train_cifar_wino.txt
          train_cifar_one_gpu.txt
+          train_cifar_six_gpu.txt
          train_resnet.txt
          train_resnet_one_gpu.txt
-          train_cifar_six_gpu.txt
+          train_bert.txt
    - name: Run process replay tests
      run: cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3 process_replay.py