process replay diffs 3 things now (#5731)

* github api infra

* process replay is 3 parts now

* parse benchmarks

* add gh_token

* complete diff

* move process replay tests

* last successful run

* add tempdir

* skip master
This commit is contained in:
qazal
2024-07-27 17:52:20 +08:00
committed by GitHub
parent 57b4a8e98d
commit 3e49d86c01
3 changed files with 75 additions and 37 deletions

View File

@@ -3,6 +3,7 @@ env:
RUN_PROCESS_REPLAY: "1"
ASSERT_PROCESS_REPLAY: "0"
PYTHONPATH: .
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
on:
push:
@@ -90,9 +91,6 @@ jobs:
# run: STEPS=10 DEFAULT_FLOAT=BFLOAT16 python3 examples/hlb_cifar10.py | tee train_cifar_bf16.txt
- name: Run 10 CIFAR training steps w winograd
run: JIT=2 WINO=1 STEPS=10 python3 examples/hlb_cifar10.py | tee train_cifar_wino.txt
- name: Run process replay tests
if: env.RUN_PROCESS_REPLAY == '1'
run: cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3 process_replay.py
- uses: actions/upload-artifact@v4
with:
name: Speed (Mac)
@@ -119,6 +117,8 @@ jobs:
train_cifar_half.txt
train_cifar_bf16.txt
train_cifar_wino.txt
- name: Run process replay tests
run: cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3 process_replay.py
testnvidiabenchmark:
name: tinybox green Benchmark
@@ -200,9 +200,6 @@ jobs:
run: NV=1 HALF=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half.txt
- name: Run GPT2 w HALF/BEAM
run: NV=1 HALF=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 CAST_BEFORE_VIEW=0 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt
- name: Run process replay tests
if: env.RUN_PROCESS_REPLAY == '1'
run: cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3 process_replay.py
- uses: actions/upload-artifact@v4
with:
name: Speed (NVIDIA)
@@ -229,6 +226,8 @@ jobs:
gpt2_jitted.txt
gpt2_half.txt
gpt2_half_beam.txt
- name: Run process replay tests
run: cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3 process_replay.py
testmorenvidiabenchmark:
name: tinybox green Training Benchmark
@@ -275,9 +274,6 @@ jobs:
run: NV=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=256 GPUS=1 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet_one_gpu.txt
- name: Run 10 MLPerf ResNet50 training steps (6 gpu)
run: NV=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=1536 GPUS=6 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet.txt
- name: Run process replay tests
if: env.RUN_PROCESS_REPLAY == '1'
run: cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3 process_replay.py
- uses: actions/upload-artifact@v4
with:
name: Speed (NVIDIA Training)
@@ -291,6 +287,8 @@ jobs:
train_resnet.txt
train_resnet_one_gpu.txt
train_cifar_six_gpu.txt
- name: Run process replay tests
run: cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3 process_replay.py
testamdbenchmark:
name: tinybox red Benchmark
@@ -368,9 +366,6 @@ jobs:
run: AMD=1 HALF=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half.txt
- name: Run GPT2 w HALF/BEAM
run: AMD=1 HALF=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 CAST_BEFORE_VIEW=0 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt
- name: Run process replay tests
if: env.RUN_PROCESS_REPLAY == '1'
run: cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3 process_replay.py
- uses: actions/upload-artifact@v4
with:
name: Speed (AMD)
@@ -395,6 +390,8 @@ jobs:
sd.txt
sdxl.txt
mixtral.txt
- name: Run process replay tests
run: cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3 process_replay.py
testmoreamdbenchmark:
name: tinybox red Training Benchmark
@@ -441,9 +438,6 @@ jobs:
run: AMD=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=256 GPUS=1 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet_one_gpu.txt
- name: Run 10 MLPerf ResNet50 training steps (6 gpu)
run: AMD=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=1536 GPUS=6 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet.txt
- name: Run process replay tests
if: env.RUN_PROCESS_REPLAY == '1'
run: cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3 process_replay.py
- uses: actions/upload-artifact@v4
with:
name: Speed (AMD Training)
@@ -457,3 +451,5 @@ jobs:
train_resnet.txt
train_resnet_one_gpu.txt
train_cifar_six_gpu.txt
- name: Run process replay tests
run: cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3 process_replay.py

View File

@@ -3,6 +3,7 @@ env:
# increment this when downloads change substantially (the cache exists to avoid hitting the internet)
DOWNLOAD_CACHE_VERSION: '5'
RUN_PROCESS_REPLAY: 1
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
on:
push:
@@ -508,7 +509,7 @@ jobs:
run: TRANSCENDENTAL=2 python -m pytest -n=auto test/test_ops.py::TestOps::test_sin test/test_ops.py::TestOps::test_cos test/test_ops.py::TestOps::test_tan test/test_ops.py::TestOps::test_exp test/test_ops.py::TestOps::test_log --durations=20
- name: Run process replay tests
run: |
if [ "${{ matrix.backend }}" == "amd" ]; then
if [ "${{ matrix.backend }}" == "amd" ] && [ "${GITHUB_REF_NAME}" != "master" ]; then
MAX_DIFF_PCT=1 RUN_PROCESS_REPLAY=0 test/external/process_replay/test_process_replay.sh
fi
export PR_TITLE=$(jq -r .pull_request.title "$GITHUB_EVENT_PATH")