|
|
|
|
@@ -52,14 +52,14 @@ jobs:
|
|
|
|
|
- name: reset process replay
|
|
|
|
|
run: python3.11 test/external/process_replay/reset.py
|
|
|
|
|
- name: Run Stable Diffusion
|
|
|
|
|
run: BENCHMARK_LOG=stable_diffusion JIT=1 python3.11 examples/stable_diffusion.py --fp16 --seed 0 --noshow --timing | tee sd.txt
|
|
|
|
|
run: BENCHMARK_LOG=stable_diffusion JIT=1 ASSERT_MIN_STEP_TIME=500 python3.11 examples/stable_diffusion.py --fp16 --seed 0 --noshow --timing | tee sd.txt
|
|
|
|
|
- name: Run Stable Diffusion without fp16
|
|
|
|
|
run: BENCHMARK_LOG=stable_diffusion_fp32 JIT=1 python3.11 examples/stable_diffusion.py --seed 0 --noshow --timing | tee sd_no_fp16.txt
|
|
|
|
|
run: BENCHMARK_LOG=stable_diffusion_fp32 JIT=1 ASSERT_MIN_STEP_TIME=700 python3.11 examples/stable_diffusion.py --seed 0 --noshow --timing | tee sd_no_fp16.txt
|
|
|
|
|
- name: Run Stable Diffusion v2
|
|
|
|
|
run: BENCHMARK_LOG=stable_diffusion_v2 JIT=1 python3.11 examples/sdv2.py --fp16 --seed 0 --noshow --timing | tee sdv2.txt
|
|
|
|
|
run: BENCHMARK_LOG=stable_diffusion_v2 JIT=1 ASSERT_MIN_STEP_TIME=1600 python3.11 examples/sdv2.py --fp16 --seed 0 --noshow --timing | tee sdv2.txt
|
|
|
|
|
# process replay can't capture this, the graph is too large
|
|
|
|
|
- name: Run SDXL
|
|
|
|
|
run: BENCHMARK_LOG=stable_diffusion_xl CAPTURE_PROCESS_REPLAY=0 JIT=1 python3.11 examples/sdxl.py --seed 0 --noshow --timing | tee sdxl.txt
|
|
|
|
|
run: BENCHMARK_LOG=stable_diffusion_xl ASSERT_MIN_STEP_TIME=3000 CAPTURE_PROCESS_REPLAY=0 JIT=1 python3.11 examples/sdxl.py --seed 0 --noshow --timing | tee sdxl.txt
|
|
|
|
|
- name: Run model inference benchmark
|
|
|
|
|
run: METAL=1 python3.11 test/external/external_model_benchmark.py
|
|
|
|
|
- name: Test speed vs torch
|
|
|
|
|
@@ -99,7 +99,7 @@ jobs:
|
|
|
|
|
- name: Run GPT2
|
|
|
|
|
run: |
|
|
|
|
|
BENCHMARK_LOG=gpt2_nojit JIT=0 python3.11 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_unjitted.txt
|
|
|
|
|
BENCHMARK_LOG=gpt2 JIT=1 python3.11 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_jitted.txt
|
|
|
|
|
BENCHMARK_LOG=gpt2 JIT=1 ASSERT_MIN_STEP_TIME=8 python3.11 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_jitted.txt
|
|
|
|
|
- name: Run GPT2 w HALF
|
|
|
|
|
run: BENCHMARK_LOG=gpt2_half HALF=1 python3.11 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half.txt
|
|
|
|
|
- name: Run GPT2 w HALF/BEAM
|
|
|
|
|
@@ -109,13 +109,13 @@ jobs:
|
|
|
|
|
- name: Train MNIST
|
|
|
|
|
run: time PYTHONPATH=. TARGET_EVAL_ACC_PCT=96.0 python3.11 examples/beautiful_mnist.py | tee beautiful_mnist.txt
|
|
|
|
|
- name: Run 10 CIFAR training steps
|
|
|
|
|
run: BENCHMARK_LOG=cifar_10steps JIT=1 STEPS=10 python3.11 examples/hlb_cifar10.py | tee train_cifar.txt
|
|
|
|
|
run: BENCHMARK_LOG=cifar_10steps JIT=1 ASSERT_MIN_STEP_TIME=320 STEPS=10 python3.11 examples/hlb_cifar10.py | tee train_cifar.txt
|
|
|
|
|
- name: Run 10 CIFAR training steps w HALF
|
|
|
|
|
run: BENCHMARK_LOG=cifar_10steps_half JIT=2 STEPS=10 DEFAULT_FLOAT=HALF python3.11 examples/hlb_cifar10.py | tee train_cifar_half.txt
|
|
|
|
|
run: BENCHMARK_LOG=cifar_10steps_half JIT=2 ASSERT_MIN_STEP_TIME=385 STEPS=10 DEFAULT_FLOAT=HALF python3.11 examples/hlb_cifar10.py | tee train_cifar_half.txt
|
|
|
|
|
#- name: Run 10 CIFAR training steps w BF16
|
|
|
|
|
# run: STEPS=10 DEFAULT_FLOAT=BFLOAT16 python3.11 examples/hlb_cifar10.py | tee train_cifar_bf16.txt
|
|
|
|
|
- name: Run 10 CIFAR training steps w winograd
|
|
|
|
|
run: BENCHMARK_LOG=cifar_10steps_wino JIT=1 WINO=1 STEPS=10 python3.11 examples/hlb_cifar10.py | tee train_cifar_wino.txt
|
|
|
|
|
run: BENCHMARK_LOG=cifar_10steps_wino JIT=1 ASSERT_MIN_STEP_TIME=150 WINO=1 STEPS=10 python3.11 examples/hlb_cifar10.py | tee train_cifar_wino.txt
|
|
|
|
|
- name: UsbGPU boot time
|
|
|
|
|
run: sudo -E PYTHONPATH=. DEBUG=2 AM_RESET=1 AMD=1 AMD_IFACE=USB time python3.11 test/test_tiny.py TestTiny.test_plus
|
|
|
|
|
- name: UsbGPU tiny tests
|
|
|
|
|
@@ -214,7 +214,7 @@ jobs:
|
|
|
|
|
- name: Run Stable Diffusion
|
|
|
|
|
run: BENCHMARK_LOG=stable_diffusion NV=1 python3 examples/stable_diffusion.py --fp16 --seed 0 --noshow --timing | tee sd.txt
|
|
|
|
|
- name: Run SDXL
|
|
|
|
|
run: BENCHMARK_LOG=stable_diffusion_xl CAPTURE_PROCESS_REPLAY=0 NV=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/sdxl.py --seed 0 --noshow --timing | tee sdxl.txt
|
|
|
|
|
run: BENCHMARK_LOG=stable_diffusion_xl ASSERT_MIN_STEP_TIME=2000 CAPTURE_PROCESS_REPLAY=0 NV=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/sdxl.py --seed 0 --noshow --timing | tee sdxl.txt
|
|
|
|
|
- name: Run LLaMA
|
|
|
|
|
run: |
|
|
|
|
|
BENCHMARK_LOG=llama_nojit NV=1 JIT=0 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_unjitted.txt
|
|
|
|
|
@@ -238,9 +238,9 @@ jobs:
|
|
|
|
|
- name: Run GPT2
|
|
|
|
|
run: |
|
|
|
|
|
BENCHMARK_LOG=gpt2_nojit NV=1 JIT=0 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_unjitted.txt
|
|
|
|
|
BENCHMARK_LOG=gpt2 NV=1 JIT=1 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_jitted.txt
|
|
|
|
|
BENCHMARK_LOG=gpt2 NV=1 JIT=1 ASSERT_MIN_STEP_TIME=5 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_jitted.txt
|
|
|
|
|
- name: Run GPT2 w HALF
|
|
|
|
|
run: BENCHMARK_LOG=gpt2_half NV=1 HALF=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half.txt
|
|
|
|
|
run: BENCHMARK_LOG=gpt2_half NV=1 HALF=1 ASSERT_MIN_STEP_TIME=5 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half.txt
|
|
|
|
|
- name: Run GPT2 w HALF/BEAM
|
|
|
|
|
run: BENCHMARK_LOG=gpt2_half_beam NV=1 HALF=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt
|
|
|
|
|
- uses: actions/upload-artifact@v4
|
|
|
|
|
@@ -306,13 +306,13 @@ jobs:
|
|
|
|
|
- name: Train MNIST
|
|
|
|
|
run: time PYTHONPATH=. NV=1 TARGET_EVAL_ACC_PCT=96.0 python3 examples/beautiful_mnist.py | tee beautiful_mnist.txt
|
|
|
|
|
- name: Run 10 CIFAR training steps
|
|
|
|
|
run: BENCHMARK_LOG=cifar_10steps NV=1 STEPS=10 python3 examples/hlb_cifar10.py | tee train_cifar.txt
|
|
|
|
|
run: BENCHMARK_LOG=cifar_10steps ASSERT_MIN_STEP_TIME=85 NV=1 STEPS=10 python3 examples/hlb_cifar10.py | tee train_cifar.txt
|
|
|
|
|
- name: Run 10 CIFAR training steps w HALF
|
|
|
|
|
run: BENCHMARK_LOG=cifar_10steps_half NV=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_half.txt
|
|
|
|
|
run: BENCHMARK_LOG=cifar_10steps_half ASSERT_MIN_STEP_TIME=68 NV=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_half.txt
|
|
|
|
|
- name: Run 10 CIFAR training steps w BF16
|
|
|
|
|
run: BENCHMARK_LOG=cifar_10steps_bf16 NV=1 STEPS=10 DEFAULT_FLOAT=BFLOAT16 python3 examples/hlb_cifar10.py | tee train_cifar_bf16.txt
|
|
|
|
|
run: BENCHMARK_LOG=cifar_10steps_bf16 ASSERT_MIN_STEP_TIME=75 NV=1 STEPS=10 DEFAULT_FLOAT=BFLOAT16 python3 examples/hlb_cifar10.py | tee train_cifar_bf16.txt
|
|
|
|
|
- name: Run 10 CIFAR training steps w winograd
|
|
|
|
|
run: BENCHMARK_LOG=cifar_10steps_half_wino NV=1 CAPTURE_PROCESS_REPLAY=0 WINO=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_wino.txt
|
|
|
|
|
run: BENCHMARK_LOG=cifar_10steps_half_wino ASSERT_MIN_STEP_TIME=35 NV=1 CAPTURE_PROCESS_REPLAY=0 WINO=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_wino.txt
|
|
|
|
|
- name: Run full CIFAR training w 1 GPU
|
|
|
|
|
run: time BENCHMARK_LOG=cifar NV=1 DEFAULT_FLOAT=HALF LATEWINO=1 STEPS=1000 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee train_cifar_one_gpu.txt
|
|
|
|
|
- name: Run full CIFAR training steps w 6 GPUS
|
|
|
|
|
@@ -415,9 +415,9 @@ jobs:
|
|
|
|
|
- name: Test AM warm start time
|
|
|
|
|
run: time AMD=1 python3 test/test_tiny.py TestTiny.test_plus
|
|
|
|
|
- name: Run Stable Diffusion
|
|
|
|
|
run: BENCHMARK_LOG=stable_diffusion AMD=1 python3 examples/stable_diffusion.py --fp16 --seed 0 --noshow --timing | tee sd.txt
|
|
|
|
|
run: BENCHMARK_LOG=stable_diffusion ASSERT_MIN_STEP_TIME=450 AMD=1 python3 examples/stable_diffusion.py --fp16 --seed 0 --noshow --timing | tee sd.txt
|
|
|
|
|
- name: Run SDXL
|
|
|
|
|
run: BENCHMARK_LOG=stable_diffusion_xl CAPTURE_PROCESS_REPLAY=0 AMD=1 python3 examples/sdxl.py --seed 0 --noshow --timing | tee sdxl.txt
|
|
|
|
|
run: BENCHMARK_LOG=stable_diffusion_xl ASSERT_MIN_STEP_TIME=1400 CAPTURE_PROCESS_REPLAY=0 AMD=1 python3 examples/sdxl.py --seed 0 --noshow --timing | tee sdxl.txt
|
|
|
|
|
- name: Run LLaMA 7B
|
|
|
|
|
run: |
|
|
|
|
|
BENCHMARK_LOG=llama_nojit AMD=1 JIT=0 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_unjitted.txt
|
|
|
|
|
@@ -443,9 +443,9 @@ jobs:
|
|
|
|
|
- name: Run GPT2
|
|
|
|
|
run: |
|
|
|
|
|
BENCHMARK_LOG=gpt2_nojit AMD=1 JIT=0 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_unjitted.txt
|
|
|
|
|
BENCHMARK_LOG=gpt2 AMD=1 JIT=1 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_jitted.txt
|
|
|
|
|
BENCHMARK_LOG=gpt2 AMD=1 JIT=1 ASSERT_MIN_STEP_TIME=5 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_jitted.txt
|
|
|
|
|
- name: Run GPT2 w HALF
|
|
|
|
|
run: BENCHMARK_LOG=gpt2_half AMD=1 HALF=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half.txt
|
|
|
|
|
run: BENCHMARK_LOG=gpt2_half AMD=1 HALF=1 ASSERT_MIN_STEP_TIME=5 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half.txt
|
|
|
|
|
- name: Run GPT2 w HALF/BEAM
|
|
|
|
|
run: BENCHMARK_LOG=gpt2_half_beam AMD=1 HALF=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt
|
|
|
|
|
- uses: actions/upload-artifact@v4
|
|
|
|
|
@@ -508,13 +508,13 @@ jobs:
|
|
|
|
|
- name: Train MNIST
|
|
|
|
|
run: time PYTHONPATH=. AMD=1 TARGET_EVAL_ACC_PCT=96.0 python3 examples/beautiful_mnist.py | tee beautiful_mnist.txt
|
|
|
|
|
- name: Run 10 CIFAR training steps
|
|
|
|
|
run: BENCHMARK_LOG=cifar_10steps AMD=1 STEPS=10 python3 examples/hlb_cifar10.py | tee train_cifar.txt
|
|
|
|
|
run: BENCHMARK_LOG=cifar_10steps ASSERT_MIN_STEP_TIME=85 AMD=1 STEPS=10 python3 examples/hlb_cifar10.py | tee train_cifar.txt
|
|
|
|
|
- name: Run 10 CIFAR training steps w HALF
|
|
|
|
|
run: BENCHMARK_LOG=cifar_10steps_half AMD=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_half.txt
|
|
|
|
|
run: BENCHMARK_LOG=cifar_10steps_half ASSERT_MIN_STEP_TIME=188 AMD=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_half.txt
|
|
|
|
|
- name: Run 10 CIFAR training steps w BF16
|
|
|
|
|
run: BENCHMARK_LOG=cifar_10steps_bf16 AMD=1 STEPS=10 DEFAULT_FLOAT=BFLOAT16 python3 examples/hlb_cifar10.py | tee train_cifar_bf16.txt
|
|
|
|
|
run: BENCHMARK_LOG=cifar_10steps_bf16 ASSERT_MIN_STEP_TIME=288 AMD=1 STEPS=10 DEFAULT_FLOAT=BFLOAT16 python3 examples/hlb_cifar10.py | tee train_cifar_bf16.txt
|
|
|
|
|
- name: Run 10 CIFAR training steps w winograd
|
|
|
|
|
run: BENCHMARK_LOG=cifar_10steps_half_wino AMD=1 WINO=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_wino.txt
|
|
|
|
|
run: BENCHMARK_LOG=cifar_10steps_half_wino ASSERT_MIN_STEP_TIME=66 AMD=1 WINO=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_wino.txt
|
|
|
|
|
- name: Run full CIFAR training w 1 GPU
|
|
|
|
|
run: time BENCHMARK_LOG=cifar AMD=1 DEFAULT_FLOAT=HALF LATEWINO=1 STEPS=1000 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee train_cifar_one_gpu.txt
|
|
|
|
|
#- name: Run full CIFAR training steps w 6 GPUS
|
|
|
|
|
@@ -606,11 +606,11 @@ jobs:
|
|
|
|
|
- name: reset process replay
|
|
|
|
|
run: test/external/process_replay/reset.py
|
|
|
|
|
- name: benchmark openpilot 0.9.9 driving_vision
|
|
|
|
|
run: BENCHMARK_LOG=openpilot_0_9_9_vision PYTHONPATH=. NOLOCALS=1 FLOAT16=1 IMAGE=2 QCOM=1 taskset -c 4-7 python3 test/external/external_benchmark_openpilot.py https://github.com/commaai/openpilot/raw/v0.9.9/selfdrive/modeld/models/driving_vision.onnx
|
|
|
|
|
run: BENCHMARK_LOG=openpilot_0_9_9_vision ASSERT_MIN_STEP_TIME=30 PYTHONPATH=. NOLOCALS=1 FLOAT16=1 IMAGE=2 QCOM=1 taskset -c 4-7 python3 test/external/external_benchmark_openpilot.py https://github.com/commaai/openpilot/raw/v0.9.9/selfdrive/modeld/models/driving_vision.onnx
|
|
|
|
|
- name: benchmark openpilot 0.9.9 driving_policy
|
|
|
|
|
run: BENCHMARK_LOG=openpilot_0_9_9_policy PYTHONPATH=. NOLOCALS=1 FLOAT16=1 IMAGE=2 QCOM=1 taskset -c 4-7 python3 test/external/external_benchmark_openpilot.py https://github.com/commaai/openpilot/raw/v0.9.9/selfdrive/modeld/models/driving_policy.onnx
|
|
|
|
|
run: BENCHMARK_LOG=openpilot_0_9_9_policy ASSERT_MIN_STEP_TIME=45 PYTHONPATH=. NOLOCALS=1 FLOAT16=1 IMAGE=2 QCOM=1 taskset -c 4-7 python3 test/external/external_benchmark_openpilot.py https://github.com/commaai/openpilot/raw/v0.9.9/selfdrive/modeld/models/driving_policy.onnx
|
|
|
|
|
- name: benchmark openpilot 0.9.9 dmonitoring
|
|
|
|
|
run: BENCHMARK_LOG=openpilot_0_9_9_dmonitoring PYTHONPATH=. NOLOCALS=1 FLOAT16=1 IMAGE=2 QCOM=1 taskset -c 4-7 python3 test/external/external_benchmark_openpilot.py https://github.com/commaai/openpilot/raw/v0.9.9/selfdrive/modeld/models/dmonitoring_model.onnx
|
|
|
|
|
run: BENCHMARK_LOG=openpilot_0_9_9_dmonitoring ASSERT_MIN_STEP_TIME=70 PYTHONPATH=. NOLOCALS=1 FLOAT16=1 IMAGE=2 QCOM=1 taskset -c 4-7 python3 test/external/external_benchmark_openpilot.py https://github.com/commaai/openpilot/raw/v0.9.9/selfdrive/modeld/models/dmonitoring_model.onnx
|
|
|
|
|
- name: openpilot compile3 0.9.9 driving_vision
|
|
|
|
|
run: PYTHONPATH="." QCOM=1 taskset -c 4-7 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/v0.9.9/selfdrive/modeld/models/driving_vision.onnx
|
|
|
|
|
- name: openpilot compile3 0.9.9 driving_policy
|
|
|
|
|
|