remove FUSE_ARANGE_UINT (#11567)

also add IGNORE_OOB=1 to bert runs. lowered BS on tinybox to 90 since 96 oom during eval without reset
2026-01-07 22:23:55 -05:00 · 2025-08-07 13:49:06 -07:00
parent 6ed2dfd187
commit 702e38dc19
12 changed files with 24 additions and 17 deletions
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -325,7 +325,7 @@ jobs:
      run: BENCHMARK_LOG=resnet_10steps_6gpu NV=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=1536 GPUS=6 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet.txt
    - name: Run 10 MLPerf Bert training steps (6 gpu)
      # TODO: remove BERT_LAYERS once scheduler is fast
-      run: BENCHMARK_LOG=bert_10steps_6gpu NV=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=66 GPUS=6 BERT_LAYERS=2 FUSE_ARANGE=1 FUSE_ARANGE_UINT=0 MODEL=bert python3 examples/mlperf/model_train.py | tee train_bert.txt
+      run: BENCHMARK_LOG=bert_10steps_6gpu NV=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=66 GPUS=6 BERT_LAYERS=2 MODEL=bert python3 examples/mlperf/model_train.py | tee train_bert.txt
    - uses: actions/upload-artifact@v4
      with:
        name: Speed (NVIDIA Training)
@@ -576,7 +576,7 @@ jobs:
      run: BENCHMARK_LOG=resnet_10steps_6gpu AMD=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=1536 GPUS=6 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet.txt
    - name: Run 10 MLPerf Bert training steps (6 gpu)
      # TODO: remove BERT_LAYERS once scheduler is fast
-      run: BENCHMARK_LOG=bert_10steps_6gpu AMD=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=66 GPUS=6 BERT_LAYERS=2 FUSE_ARANGE=1 FUSE_ARANGE_UINT=0 MODEL=bert python3 examples/mlperf/model_train.py | tee train_bert.txt
+      run: BENCHMARK_LOG=bert_10steps_6gpu AMD=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=66 GPUS=6 BERT_LAYERS=2 MODEL=bert python3 examples/mlperf/model_train.py | tee train_bert.txt
    - uses: actions/upload-artifact@v4
      with:
        name: Speed (AMD MLPerf)
@@ -697,7 +697,7 @@ jobs:
    #   run: BENCHMARK_LOG=resnet_10steps AMD=1 MNISTMOCK=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=256 GPUS=1 MODEL=resnet python3 examples/mlperf/model_train.py | tee am_train_resnet_one_gpu.txt
    - name: Run 10 MLPerf Bert training steps (1 gpu)
      # TODO: remove BERT_LAYERS once scheduler is fast
-      run: BENCHMARK_LOG=bert_10steps AMD=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=66 GPUS=1 BERT_LAYERS=2 FUSE_ARANGE=1 FUSE_ARANGE_UINT=0 MODEL=bert python3 examples/mlperf/model_train.py | tee am_train_bert_one_gpu.txt
+      run: BENCHMARK_LOG=bert_10steps AMD=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=66 GPUS=1 BERT_LAYERS=2 MODEL=bert python3 examples/mlperf/model_train.py | tee am_train_bert_one_gpu.txt
    - uses: actions/upload-artifact@v4
      with:
        name: Speed (AM Driver)
@@ -755,7 +755,7 @@ jobs:
      run: BENCHMARK_LOG=resnet_10steps NV=1 MNISTMOCK=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=256 GPUS=1 MODEL=resnet python3 examples/mlperf/model_train.py | tee nv_train_resnet_one_gpu.txt
    - name: Run 10 MLPerf Bert training steps (1 gpu)
      # TODO: remove BERT_LAYERS once scheduler is fast
-      run: BENCHMARK_LOG=bert_10steps NV=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=66 GPUS=1 BERT_LAYERS=2 FUSE_ARANGE=1 FUSE_ARANGE_UINT=0 MODEL=bert python3 examples/mlperf/model_train.py | tee nv_train_bert_one_gpu.txt
+      run: BENCHMARK_LOG=bert_10steps NV=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=66 GPUS=1 BERT_LAYERS=2 MODEL=bert python3 examples/mlperf/model_train.py | tee nv_train_bert_one_gpu.txt
    - uses: actions/upload-artifact@v4
      with:
        name: Speed (NV Driver)
--- a/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_1xMI300X/dev_beam.sh
+++ b/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_1xMI300X/dev_beam.sh
@@ -4,6 +4,8 @@ export PYTHONPATH="." AMD=1
 export MODEL="bert"
 export DEFAULT_FLOAT="HALF" GPUS=1 BS=128 EVAL_BS=128

+export IGNORE_OOB=1
+
 export BEAM=3 BEAM_UOPS_MAX=4000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
 export IGNORE_JIT_FIRST_BEAM=1
 # export BEAM_LOG_SURPASS_MAX=1
--- a/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/dev_beam.sh
+++ b/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/dev_beam.sh
@@ -5,6 +5,8 @@ export MODEL="bert"
 export DEFAULT_FLOAT="HALF" GPUS=8 BS=1024 EVAL_BS=1024
 export OPT_BASE_LEARNING_RATE=0.0011 OPT_LAMB_BETA_1=0.60466 OPT_LAMB_BETA_2=0.85437 DECAY=0.1

+export IGNORE_OOB=1
+
 export BEAM=3 BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
 export IGNORE_JIT_FIRST_BEAM=1 FREE_INTERMEDIATE=0
 export BASEDIR="/raid/datasets/wiki"
--- a/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/dev_run.sh
+++ b/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/dev_run.sh
@@ -8,6 +8,8 @@ export DEFAULT_FLOAT="HALF" GPUS=8 BS=1024 EVAL_BS=1024
 export OPT_BASE_LEARNING_RATE=0.0011 OPT_LAMB_BETA_1=0.60466 OPT_LAMB_BETA_2=0.85437 DECAY=0.1
 export TRAIN_STEPS=3900

+export IGNORE_OOB=1
+
 export BEAM=3 BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
 export IGNORE_JIT_FIRST_BEAM=1 FREE_INTERMEDIATE=0
 export BASEDIR="/raid/datasets/wiki"
--- a/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/run_and_time.sh
+++ b/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/run_and_time.sh
@@ -11,6 +11,8 @@ export DEFAULT_FLOAT="HALF" GPUS=8 BS=1024 EVAL_BS=1024
 export OPT_BASE_LEARNING_RATE=0.0011 OPT_LAMB_BETA_1=0.60466 OPT_LAMB_BETA_2=0.85437 DECAY=0.1
 export TRAIN_STEPS=3900

+export IGNORE_OOB=1
+
 export BEAM=3 BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
 export IGNORE_JIT_FIRST_BEAM=1 FREE_INTERMEDIATE=0
 export BASEDIR="/raid/datasets/wiki"
--- a/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_beam.sh
+++ b/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_beam.sh
@@ -2,9 +2,9 @@

 export PYTHONPATH="." NV=1
 export MODEL="bert"
-export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=96 EVAL_BS=96
+export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=90 EVAL_BS=90

-export FUSE_ARANGE=1 FUSE_ARANGE_UINT=0
+export IGNORE_OOB=1

 export BEAM=8 BEAM_UOPS_MAX=10000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
 export IGNORE_JIT_FIRST_BEAM=1
--- a/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_run.sh
+++ b/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_run.sh
@@ -2,9 +2,9 @@

 export PYTHONPATH="." NV=1
 export MODEL="bert"
-export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=96 EVAL_BS=96
+export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=90 EVAL_BS=90

-export FUSE_ARANGE=1 FUSE_ARANGE_UINT=0
+export IGNORE_OOB=1

 export BEAM=8 BEAM_UOPS_MAX=10000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
 export IGNORE_JIT_FIRST_BEAM=1
--- a/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh
+++ b/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh
@@ -5,9 +5,9 @@ set -o pipefail  # Make pipeline fail if any command fails
 export PYTHONPATH="." NV=1
 export MODEL="bert"
 export SUBMISSION_PLATFORM="tinybox_green"
-export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=96 EVAL_BS=96
+export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=90 EVAL_BS=90

-export FUSE_ARANGE=1 FUSE_ARANGE_UINT=0
+export IGNORE_OOB=1

 export BEAM=8 BEAM_UOPS_MAX=10000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
 export IGNORE_JIT_FIRST_BEAM=1
--- a/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_beam.sh
+++ b/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_beam.sh
@@ -2,9 +2,9 @@

 export PYTHONPATH="." AMD=1
 export MODEL="bert"
-export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=96 EVAL_BS=96
+export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=90 EVAL_BS=90

-export FUSE_ARANGE=1 FUSE_ARANGE_UINT=0
+export IGNORE_OOB=1

 export BEAM=5 BEAM_UOPS_MAX=8000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
 export IGNORE_JIT_FIRST_BEAM=1
--- a/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_run.sh
+++ b/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_run.sh
@@ -2,9 +2,9 @@

 export PYTHONPATH="." AMD=1
 export MODEL="bert"
-export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=96 EVAL_BS=96
+export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=90 EVAL_BS=90

-export FUSE_ARANGE=1 FUSE_ARANGE_UINT=0
+export IGNORE_OOB=1

 export BEAM=5 BEAM_UOPS_MAX=8000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
 export IGNORE_JIT_FIRST_BEAM=1
--- a/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh
+++ b/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh
@@ -5,9 +5,9 @@ set -o pipefail  # Make pipeline fail if any command fails
 export PYTHONPATH="." AMD=1
 export MODEL="bert"
 export SUBMISSION_PLATFORM="tinybox_red"
-export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=96 EVAL_BS=96
+export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=90 EVAL_BS=90

-export FUSE_ARANGE=1 FUSE_ARANGE_UINT=0
+export IGNORE_OOB=1

 export BEAM=5 BEAM_UOPS_MAX=8000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
 export IGNORE_JIT_FIRST_BEAM=1
--- a/extra/hcqfuzz/tests/bert.py
+++ b/extra/hcqfuzz/tests/bert.py
@@ -8,7 +8,6 @@ bert_train_params = {
  "BS": 96,
  "EVAL_BS": 96,
  "FUSE_ARANGE": 1,
-  "FUSE_ARANGE_UINT": 0,
  "BASEDIR": "/raid/datasets/wiki",
 }