@@ -2,7 +2,7 @@ name: Benchmarks
 env:
   # TODO: this rescheduling makes gpt2, mixtral and llama unjitted slower
   # TODO: very slow for llama 70B and resnet training 6 GPU
-  RUN_PROCESS_REPLAY: "1"
+  CAPTURE_PROCESS_REPLAY: "1"
   ASSERT_PROCESS_REPLAY: "0"
   PYTHONPATH: .
   GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
@@ -163,9 +163,9 @@ jobs:
     - name: reset process replay
       run: test/external/process_replay/reset.py
     - name: Run model inference benchmark
-      run: NV=1 RUN_PROCESS_REPLAY=0 NOCLANG=1 python3 test/external/external_model_benchmark.py
+      run: NV=1 CAPTURE_PROCESS_REPLAY=0 NOCLANG=1 python3 test/external/external_model_benchmark.py
     - name: Test speed vs torch
-      run: NV=1 RUN_PROCESS_REPLAY=0 HALF=1 BIG=2 TORCHCUDA=1 python3 test/test_speed_v_torch.py | tee torch_speed.txt
+      run: NV=1 CAPTURE_PROCESS_REPLAY=0 HALF=1 BIG=2 TORCHCUDA=1 python3 test/test_speed_v_torch.py | tee torch_speed.txt
     - name: Test speed vs theoretical
       run: NV=1 IGNORE_BEAM_CACHE=1 BEAM_DEBUG=1 DEBUG=1 python -m pytest -rA test/external/speed_v_theoretical.py --durations=20
     - name: Test benchmark allreduce
@@ -189,7 +189,7 @@ jobs:
     - name: Run Stable Diffusion
       run: NV=1 python3 examples/stable_diffusion.py --fp16 --seed 0 --noshow --timing | tee sd.txt
     - name: Run SDXL
-      run: NV=1 RUN_PROCESS_REPLAY=0 python3 examples/sdxl.py --seed 0 --noshow --timing | tee sdxl.txt
+      run: NV=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/sdxl.py --seed 0 --noshow --timing | tee sdxl.txt
     - name: Run LLaMA
       run: |
         NV=1 JIT=0 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_unjitted.txt
@@ -197,19 +197,19 @@ jobs:
     - name: Run LLaMA with BEAM
       run: NV=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_beam.txt
     # - name: Run LLaMA 7B on 4 GPUs
-    # run: NV=1 RUN_PROCESS_REPLAY=0 python3 examples/llama.py --gen 1 --size 7B --shard 4 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_four_gpu.txt
+    # run: NV=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama.py --gen 1 --size 7B --shard 4 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_four_gpu.txt
     # - name: Run LLaMA 7B on 6 GPUs
-    # run: NV=1 RUN_PROCESS_REPLAY=0 python3 examples/llama.py --gen 1 --size 7B --shard 6 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_six_gpu.txt
+    # run: NV=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama.py --gen 1 --size 7B --shard 6 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_six_gpu.txt
     - name: Run LLaMA-3 8B BEAM
       run: NV=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama3.py --size 8B --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_beam.txt
     - name: Run LLaMA-3 8B on 4 GPUs
-      run: NV=1 RUN_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 4 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_four_gpu.txt
+      run: NV=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 4 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_four_gpu.txt
     - name: Run LLaMA-3 8B on 6 GPUs
-      run: NV=1 RUN_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 6 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_six_gpu.txt
+      run: NV=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 6 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_six_gpu.txt
     - name: Run LLaMA-2 70B
-      run: NV=1 RUN_PROCESS_REPLAY=0 MAX_CONTEXT=256 python3 examples/llama.py --gen 2 --size 70B --shard 6 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_2_70B.txt
+      run: NV=1 CAPTURE_PROCESS_REPLAY=0 MAX_CONTEXT=256 python3 examples/llama.py --gen 2 --size 70B --shard 6 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_2_70B.txt
     - name: Run Mixtral 8x7B
-      run: time NV=1 RUN_PROCESS_REPLAY=0 python3 examples/mixtral.py --temperature 0 --count 10 --timing | tee mixtral.txt
+      run: time NV=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/mixtral.py --temperature 0 --count 10 --timing | tee mixtral.txt
     - name: Run GPT2
       run: |
         NV=1 JIT=0 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_unjitted.txt
@@ -286,17 +286,17 @@ jobs:
     - name: Run 10 CIFAR training steps w BF16
       run: NV=1 STEPS=10 DEFAULT_FLOAT=BFLOAT16 python3 examples/hlb_cifar10.py | tee train_cifar_bf16.txt
     - name: Run 10 CIFAR training steps w winograd
-      run: NV=1 RUN_PROCESS_REPLAY=0 WINO=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_wino.txt
+      run: NV=1 CAPTURE_PROCESS_REPLAY=0 WINO=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_wino.txt
     - name: Run full CIFAR training w 1 GPU
       run: time NV=1 DEFAULT_FLOAT=HALF LATEWINO=1 STEPS=1000 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee train_cifar_one_gpu.txt
     - name: Run full CIFAR training steps w 6 GPUS
-      run: time RUN_PROCESS_REPLAY=0 NV=1 DEFAULT_FLOAT=HALF STEPS=350 BS=1536 GPUS=6 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee train_cifar_six_gpu.txt
+      run: time CAPTURE_PROCESS_REPLAY=0 NV=1 DEFAULT_FLOAT=HALF STEPS=350 BS=1536 GPUS=6 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee train_cifar_six_gpu.txt
     - name: Run MLPerf resnet eval on training data
       run: time NV=1 MODEL=resnet python3 examples/mlperf/model_eval.py
     - name: Run 10 MLPerf ResNet50 training steps (1 gpu)
       run: NV=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=256 GPUS=1 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet_one_gpu.txt
     - name: Run 10 MLPerf ResNet50 training steps (6 gpu)
-      run: NV=1 RUN_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=1536 GPUS=6 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet.txt
+      run: NV=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=1536 GPUS=6 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet.txt
     - uses: actions/upload-artifact@v4
       with:
         name: Speed (NVIDIA Training)
@@ -380,17 +380,17 @@ jobs:
     - name: Run LLaMA 7B with BEAM
       run: AMD=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_beam.txt
     # - name: Run LLaMA 7B on 4 GPUs
-    # run: AMD=1 RUN_PROCESS_REPLAY=0 python3 examples/llama.py --gen 1 --size 7B --shard 4 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_four_gpu.txt
+    # run: AMD=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama.py --gen 1 --size 7B --shard 4 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_four_gpu.txt
     # - name: Run LLaMA 7B on 6 GPUs
-    # run: AMD=1 RUN_PROCESS_REPLAY=0 python3 examples/llama.py --gen 1 --size 7B --shard 6 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_six_gpu.txt
+    # run: AMD=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama.py --gen 1 --size 7B --shard 6 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_six_gpu.txt
     - name: Run LLaMA-3 8B BEAM
       run: AMD=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama3.py --size 8B --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_beam.txt
     - name: Run LLaMA-3 8B on 4 GPUs
-      run: AMD=1 RUN_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 4 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_four_gpu.txt
+      run: AMD=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 4 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_four_gpu.txt
     - name: Run LLaMA-3 8B on 6 GPUs
-      run: AMD=1 RUN_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 6 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_six_gpu.txt
+      run: AMD=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 6 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_six_gpu.txt
     - name: Run LLaMA-2 70B
-      run: AMD=1 RUN_PROCESS_REPLAY=0 python3 examples/llama.py --gen 2 --size 70B --shard 6 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_2_70B.txt
+      run: AMD=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama.py --gen 2 --size 70B --shard 6 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_2_70B.txt
     - name: Run Mixtral 8x7B
       run: time AMD=1 python3 examples/mixtral.py --temperature 0 --count 10 --timing | tee mixtral.txt
     - name: Run GPT2
@@ -477,7 +477,7 @@ jobs:
     - name: Run 10 MLPerf ResNet50 training steps (1 gpu)
       run: AMD=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=256 GPUS=1 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet_one_gpu.txt
     - name: Run 10 MLPerf ResNet50 training steps (6 gpu)
-      run: AMD=1 RUN_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=1536 GPUS=6 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet.txt
+      run: AMD=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=1536 GPUS=6 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet.txt
     - uses: actions/upload-artifact@v4
       with:
         name: Speed (AMD Training)
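
For a local sanity check outside CI, the same commands can be run from the repo root. A minimal sketch, assuming an NVIDIA machine and the same setup the workflow relies on (PYTHONPATH set to the repo root, weights and datasets already present); the commands themselves are taken verbatim from the steps above:

    export PYTHONPATH=.
    # model inference benchmark with process replay capture disabled, as in the CI step
    NV=1 CAPTURE_PROCESS_REPLAY=0 NOCLANG=1 python3 test/external/external_model_benchmark.py
    # short single-GPU CIFAR training smoke run
    NV=1 STEPS=10 DEFAULT_FLOAT=BFLOAT16 python3 examples/hlb_cifar10.py | tee train_cifar_bf16.txt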