RUN_PROCESS_REPLAY=0 on llama 70B and resnet training (#7272)

* RUN_PROCESS_REPLAY=0 on llama 70B and resnet training also added a 15 minutes total timeout, this cannot grow indefinitely * add a few more * a few more just for NV
2026-04-29 03:00:14 -04:00 · 2024-10-24 12:09:54 -04:00
parent b777cfdcba
commit e6929f2402
1 changed files with 25 additions and 18 deletions
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -1,6 +1,7 @@
 name: Benchmarks
 env:
  # TODO: this rescheduling makes gpt2, mixtral and llama unjitted slower
+  # TODO: very slow for llama 70B and resnet training 6 GPU
  RUN_PROCESS_REPLAY: "1"
  ASSERT_PROCESS_REPLAY: "0"
  PYTHONPATH: .
@@ -24,6 +25,7 @@ jobs:
  testmacbenchmark:
    name: Mac Benchmark
    runs-on: [self-hosted, macOS]
+    timeout-minutes: 15
    defaults:
      run:
        shell: bash -o pipefail {0}
@@ -126,6 +128,7 @@ jobs:
  testnvidiabenchmark:
    name: tinybox green Benchmark
    runs-on: [self-hosted, Linux, tinyboxgreen]
+    timeout-minutes: 15
    defaults:
      run:
        shell: bash -o pipefail {0}
@@ -152,9 +155,9 @@ jobs:
    - name: reset process replay
      run: test/external/process_replay/reset.py
    - name: Run model inference benchmark
-      run: NV=1 NOCLANG=1 python3 test/external/external_model_benchmark.py
+      run: NV=1 RUN_PROCESS_REPLAY=0 NOCLANG=1 python3 test/external/external_model_benchmark.py
    - name: Test speed vs torch
-      run: NV=1 BIG=2 TORCHCUDA=1 python3 test/test_speed_v_torch.py | tee torch_speed.txt
+      run: NV=1 RUN_PROCESS_REPLAY=0 BIG=2 TORCHCUDA=1 python3 test/test_speed_v_torch.py | tee torch_speed.txt
    - name: Test tensor cores
      run: |
        NV=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded
@@ -172,7 +175,7 @@ jobs:
    - name: Run Stable Diffusion
      run: NV=1 python3 examples/stable_diffusion.py --seed 0 --noshow --timing | tee sd.txt
    - name: Run SDXL
-      run: NV=1 python3 examples/sdxl.py --seed 0 --noshow --timing | tee sdxl.txt
+      run: NV=1 RUN_PROCESS_REPLAY=0 python3 examples/sdxl.py --seed 0 --noshow --timing | tee sdxl.txt
    - name: Run LLaMA
      run: |
        NV=1 JIT=0 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_unjitted.txt
@@ -180,19 +183,19 @@ jobs:
    - name: Run LLaMA with BEAM
      run: NV=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_beam.txt
    - name: Run LLaMA 7B on 4 GPUs
-      run: NV=1 python3 examples/llama.py --gen 1 --size 7B --shard 4 --prompt "Hello." --count 10 --temperature 0  --timing | tee llama_four_gpu.txt
+      run: NV=1 RUN_PROCESS_REPLAY=0 python3 examples/llama.py --gen 1 --size 7B --shard 4 --prompt "Hello." --count 10 --temperature 0  --timing | tee llama_four_gpu.txt
    - name: Run LLaMA 7B on 6 GPUs
-      run: NV=1 python3 examples/llama.py --gen 1 --size 7B --shard 6 --prompt "Hello." --count 10 --temperature 0  --timing | tee llama_six_gpu.txt
+      run: NV=1 RUN_PROCESS_REPLAY=0 python3 examples/llama.py --gen 1 --size 7B --shard 6 --prompt "Hello." --count 10 --temperature 0  --timing | tee llama_six_gpu.txt
    - name: Run LLaMA-3 8B BEAM
      run: NV=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama3.py --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_beam.txt
    - name: Run LLaMA-3 8B on 4 GPUs
-      run: NV=1 python3 examples/llama3.py --shard 4 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_four_gpu.txt
+      run: NV=1 RUN_PROCESS_REPLAY=0 python3 examples/llama3.py --shard 4 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_four_gpu.txt
    - name: Run LLaMA-3 8B on 6 GPUs
-      run: NV=1 python3 examples/llama3.py --shard 6 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_six_gpu.txt
+      run: NV=1 RUN_PROCESS_REPLAY=0 python3 examples/llama3.py --shard 6 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_six_gpu.txt
    - name: Run LLaMA-2 70B
-      run: NV=1 MAX_CONTEXT=256 python3 examples/llama.py --gen 2 --size 70B --shard 6 --prompt "Hello." --count 10 --temperature 0  --timing | tee llama_2_70B.txt
+      run: NV=1 RUN_PROCESS_REPLAY=0 MAX_CONTEXT=256 python3 examples/llama.py --gen 2 --size 70B --shard 6 --prompt "Hello." --count 10 --temperature 0  --timing | tee llama_2_70B.txt
    - name: Run Mixtral 8x7B
-      run: time NV=1 python3 examples/mixtral.py --temperature 0 --count 10 --timing | tee mixtral.txt
+      run: time NV=1 RUN_PROCESS_REPLAY=0 python3 examples/mixtral.py --temperature 0 --count 10 --timing | tee mixtral.txt
    - name: Run GPT2
      run: |
        NV=1 JIT=0 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_unjitted.txt
@@ -233,6 +236,7 @@ jobs:
  testmorenvidiabenchmark:
    name: tinybox green Training Benchmark
    runs-on: [self-hosted, Linux, tinyboxgreen]
+    timeout-minutes: 15
    defaults:
      run:
        shell: bash -o pipefail {0}
@@ -270,17 +274,17 @@ jobs:
    - name: Run 10 CIFAR training steps w BF16
      run: NV=1 STEPS=10 DEFAULT_FLOAT=BFLOAT16 python3 examples/hlb_cifar10.py | tee train_cifar_bf16.txt
    - name: Run 10 CIFAR training steps w winograd
-      run: NV=1 WINO=1 STEPS=10 python3 examples/hlb_cifar10.py | tee train_cifar_wino.txt
+      run: NV=1 RUN_PROCESS_REPLAY=0 WINO=1 STEPS=10 python3 examples/hlb_cifar10.py | tee train_cifar_wino.txt
    - name: Run full CIFAR training w 1 GPU
      run: time NV=1 DEFAULT_FLOAT=HALF LATEWINO=1 STEPS=1000 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee train_cifar_one_gpu.txt
    - name: Run full CIFAR training steps w 6 GPUS
-      run: time NV=1 DEFAULT_FLOAT=HALF STEPS=350 BS=1536 GPUS=6 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee train_cifar_six_gpu.txt
+      run: time RUN_PROCESS_REPLAY=0 NV=1 DEFAULT_FLOAT=HALF STEPS=350 BS=1536 GPUS=6 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee train_cifar_six_gpu.txt
    - name: Run MLPerf resnet eval on training data
      run: time NV=1 MODEL=resnet python3 examples/mlperf/model_eval.py
    - name: Run 10 MLPerf ResNet50 training steps (1 gpu)
      run: NV=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=256 GPUS=1 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet_one_gpu.txt
    - name: Run 10 MLPerf ResNet50 training steps (6 gpu)
-      run: NV=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=1536 GPUS=6 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet.txt
+      run: NV=1 RUN_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=1536 GPUS=6 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet.txt
    - uses: actions/upload-artifact@v4
      with:
        name: Speed (NVIDIA Training)
@@ -300,6 +304,7 @@ jobs:
  testamdbenchmark:
    name: tinybox red Benchmark
    runs-on: [self-hosted, Linux, tinybox]
+    timeout-minutes: 15
    defaults:
      run:
        shell: bash -o pipefail {0}
@@ -354,17 +359,17 @@ jobs:
    - name: Run LLaMA 7B with BEAM
      run: AMD=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_beam.txt
    - name: Run LLaMA 7B on 4 GPUs
-      run: AMD=1 python3 examples/llama.py --gen 1 --size 7B --shard 4 --prompt "Hello." --count 10 --temperature 0  --timing | tee llama_four_gpu.txt
+      run: AMD=1 RUN_PROCESS_REPLAY=0 python3 examples/llama.py --gen 1 --size 7B --shard 4 --prompt "Hello." --count 10 --temperature 0  --timing | tee llama_four_gpu.txt
    - name: Run LLaMA 7B on 6 GPUs
-      run: AMD=1 python3 examples/llama.py --gen 1 --size 7B --shard 6 --prompt "Hello." --count 10 --temperature 0  --timing | tee llama_six_gpu.txt
+      run: AMD=1 RUN_PROCESS_REPLAY=0 python3 examples/llama.py --gen 1 --size 7B --shard 6 --prompt "Hello." --count 10 --temperature 0  --timing | tee llama_six_gpu.txt
    - name: Run LLaMA-3 8B BEAM
      run: AMD=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama3.py --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_beam.txt
    - name: Run LLaMA-3 8B on 4 GPUs
-      run: AMD=1 python3 examples/llama3.py --shard 4 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_four_gpu.txt
+      run: AMD=1 RUN_PROCESS_REPLAY=0 python3 examples/llama3.py --shard 4 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_four_gpu.txt
    - name: Run LLaMA-3 8B on 6 GPUs
-      run: AMD=1 python3 examples/llama3.py --shard 6 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_six_gpu.txt
+      run: AMD=1 RUN_PROCESS_REPLAY=0 python3 examples/llama3.py --shard 6 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_six_gpu.txt
    - name: Run LLaMA-2 70B
-      run: AMD=1 python3 examples/llama.py --gen 2 --size 70B --shard 6 --prompt "Hello." --count 10 --temperature 0  --timing | tee llama_2_70B.txt
+      run: AMD=1 RUN_PROCESS_REPLAY=0 python3 examples/llama.py --gen 2 --size 70B --shard 6 --prompt "Hello." --count 10 --temperature 0  --timing | tee llama_2_70B.txt
    - name: Run Mixtral 8x7B
      run: time AMD=1 python3 examples/mixtral.py --temperature 0 --count 10 --timing | tee mixtral.txt
    - name: Run GPT2
@@ -405,6 +410,7 @@ jobs:
  testmoreamdbenchmark:
    name: tinybox red Training Benchmark
    runs-on: [self-hosted, Linux, tinybox]
+    timeout-minutes: 15
    defaults:
      run:
        shell: bash -o pipefail {0}
@@ -448,7 +454,7 @@ jobs:
    - name: Run 10 MLPerf ResNet50 training steps (1 gpu)
      run: AMD=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=256 GPUS=1 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet_one_gpu.txt
    - name: Run 10 MLPerf ResNet50 training steps (6 gpu)
-      run: AMD=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=1536 GPUS=6 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet.txt
+      run: AMD=1 RUN_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=1536 GPUS=6 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet.txt
    - uses: actions/upload-artifact@v4
      with:
        name: Speed (AMD Training)
@@ -468,6 +474,7 @@ jobs:
  testqualcommbenchmark:
    name: comma Benchmark
    runs-on: [self-hosted, Linux, comma]
+    timeout-minutes: 15
    defaults:
      run:
        shell: bash -o pipefail {0}