move benchmark stat tracking to influxdb (#10185)

Author: wozeparrot
Date: 2025-05-15 16:14:56 -07:00
Committed by: GitHub
Parent: f59ecf2116
Commit: 1ed04f993b
15 changed files with 527 additions and 280 deletions

View File

@@ -52,14 +52,14 @@ jobs:
- name: reset process replay
run: python3.11 test/external/process_replay/reset.py
- name: Run Stable Diffusion
run: JIT=1 python3.11 examples/stable_diffusion.py --fp16 --seed 0 --noshow --timing | tee sd.txt
run: BENCHMARK_LOG=stable_diffusion JIT=1 python3.11 examples/stable_diffusion.py --fp16 --seed 0 --noshow --timing | tee sd.txt
- name: Run Stable Diffusion without fp16
run: JIT=1 python3.11 examples/stable_diffusion.py --seed 0 --noshow --timing | tee sd_no_fp16.txt
run: BENCHMARK_LOG=stable_diffusion_fp32 JIT=1 python3.11 examples/stable_diffusion.py --seed 0 --noshow --timing | tee sd_no_fp16.txt
- name: Run Stable Diffusion v2
run: JIT=1 python3.11 examples/sdv2.py --fp16 --seed 0 --noshow --timing | tee sdv2.txt
run: BENCHMARK_LOG=stable_diffusion_v2 JIT=1 python3.11 examples/sdv2.py --fp16 --seed 0 --noshow --timing | tee sdv2.txt
# process replay can't capture this, the graph is too large
- name: Run SDXL
run: CAPTURE_PROCESS_REPLAY=0 JIT=1 python3.11 examples/sdxl.py --seed 0 --noshow --timing | tee sdxl.txt
run: BENCHMARK_LOG=stable_diffusion_xl CAPTURE_PROCESS_REPLAY=0 JIT=1 python3.11 examples/sdxl.py --seed 0 --noshow --timing | tee sdxl.txt
- name: Run model inference benchmark
run: METAL=1 python3.11 test/external/external_model_benchmark.py
- name: Test speed vs torch
@@ -80,40 +80,40 @@ jobs:
run: METAL=1 M_START=6 M_STOP=10 M_STEP=1 N_START=6 N_STOP=10 N_STEP=1 K_START=6 K_STOP=24 K_STEP=1 TC_OPT=2 DEBUG=2 python3.11 ./extra/gemm/fuzz_matmul.py
- name: Run LLaMA
run: |
JIT=0 python3.11 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_unjitted.txt
JIT=1 python3.11 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_jitted.txt
BENCHMARK_LOG=llama_nojit JIT=0 python3.11 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_unjitted.txt
BENCHMARK_LOG=llama JIT=1 python3.11 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_jitted.txt
- name: Run LLaMA with BEAM
run: JITBEAM=2 IGNORE_BEAM_CACHE=1 python3.11 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_beam.txt
run: BENCHMARK_LOG=llama_beam JITBEAM=2 IGNORE_BEAM_CACHE=1 python3.11 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_beam.txt
- name: Run quantized LLaMA
run: |
python3.11 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing --quantize int8 | tee llama_int8.txt
python3.11 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing --quantize nf4 | tee llama_nf4.txt
BENCHMARK_LOG=llama_int8 python3.11 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing --quantize int8 | tee llama_int8.txt
BENCHMARK_LOG=llama_nf4 python3.11 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing --quantize nf4 | tee llama_nf4.txt
- name: Run quantized LLaMA3
run: |
python3.11 examples/llama3.py --size 8B --temperature 0 --benchmark --quantize int8 | tee llama3_int8.txt
python3.11 examples/llama3.py --size 8B --temperature 0 --benchmark --quantize nf4 | tee llama3_nf4.txt
BENCHMARK_LOG=llama3_int8 python3.11 examples/llama3.py --size 8B --temperature 0 --benchmark --quantize int8 | tee llama3_int8.txt
BENCHMARK_LOG=llama3_nf4 python3.11 examples/llama3.py --size 8B --temperature 0 --benchmark --quantize nf4 | tee llama3_nf4.txt
#- name: Run LLaMA 7B on 4 (virtual) GPUs
# run: python3.11 examples/llama.py --gen 1 --size 7B --shard 4 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_four_gpu.txt
- name: Run GPT2
run: |
JIT=0 python3.11 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_unjitted.txt
JIT=1 python3.11 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_jitted.txt
BENCHMARK_LOG=gpt2_nojit JIT=0 python3.11 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_unjitted.txt
BENCHMARK_LOG=gpt2 JIT=1 python3.11 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_jitted.txt
- name: Run GPT2 w HALF
run: HALF=1 python3.11 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half.txt
run: BENCHMARK_LOG=gpt2_half HALF=1 python3.11 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half.txt
- name: Run GPT2 w HALF/BEAM
run: HALF=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3.11 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt
run: BENCHMARK_LOG=gpt2_half_beam HALF=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3.11 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt
- name: Run OLMoE
run: python3.11 examples/olmoe.py
run: BENCHMARK_LOG=olmoe python3.11 examples/olmoe.py
- name: Train MNIST
run: time PYTHONPATH=. TARGET_EVAL_ACC_PCT=96.0 python3.11 examples/beautiful_mnist.py | tee beautiful_mnist.txt
- name: Run 10 CIFAR training steps
run: JIT=1 STEPS=10 python3.11 examples/hlb_cifar10.py | tee train_cifar.txt
run: BENCHMARK_LOG=cifar_10steps JIT=1 STEPS=10 python3.11 examples/hlb_cifar10.py | tee train_cifar.txt
- name: Run 10 CIFAR training steps w HALF
run: JIT=2 STEPS=10 DEFAULT_FLOAT=HALF python3.11 examples/hlb_cifar10.py | tee train_cifar_half.txt
run: BENCHMARK_LOG=cifar_10steps_half JIT=2 STEPS=10 DEFAULT_FLOAT=HALF python3.11 examples/hlb_cifar10.py | tee train_cifar_half.txt
#- name: Run 10 CIFAR training steps w BF16
# run: STEPS=10 DEFAULT_FLOAT=BFLOAT16 python3.11 examples/hlb_cifar10.py | tee train_cifar_bf16.txt
- name: Run 10 CIFAR training steps w winograd
run: JIT=1 WINO=1 STEPS=10 python3.11 examples/hlb_cifar10.py | tee train_cifar_wino.txt
run: BENCHMARK_LOG=cifar_10steps_wino JIT=1 WINO=1 STEPS=10 python3.11 examples/hlb_cifar10.py | tee train_cifar_wino.txt
- name: UsbGPU boot time
run: sudo -E PYTHONPATH=. DEBUG=2 AM_RESET=1 AMD=1 AMD_IFACE=USB time python3.11 test/test_tiny.py TestTiny.test_plus
- name: UsbGPU tiny tests
@@ -210,37 +210,37 @@ jobs:
- name: Test CUDA=1
run: DEBUG=2 CUDA=1 python -m pytest -rA test/test_tiny.py
- name: Run Stable Diffusion
run: NV=1 python3 examples/stable_diffusion.py --fp16 --seed 0 --noshow --timing | tee sd.txt
run: BENCHMARK_LOG=stable_diffusion NV=1 python3 examples/stable_diffusion.py --fp16 --seed 0 --noshow --timing | tee sd.txt
- name: Run SDXL
run: CAPTURE_PROCESS_REPLAY=0 NV=1 python3 examples/sdxl.py --seed 0 --noshow --timing | tee sdxl.txt
run: BENCHMARK_LOG=stable_diffusion_xl CAPTURE_PROCESS_REPLAY=0 NV=1 python3 examples/sdxl.py --seed 0 --noshow --timing | tee sdxl.txt
- name: Run LLaMA
run: |
NV=1 JIT=0 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_unjitted.txt
NV=1 JIT=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_jitted.txt
BENCHMARK_LOG=llama_nojit NV=1 JIT=0 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_unjitted.txt
BENCHMARK_LOG=llama NV=1 JIT=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_jitted.txt
- name: Run LLaMA with BEAM
run: NV=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_beam.txt
run: BENCHMARK_LOG=llama_beam NV=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_beam.txt
# - name: Run LLaMA 7B on 4 GPUs
# run: NV=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama.py --gen 1 --size 7B --shard 4 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_four_gpu.txt
# - name: Run LLaMA 7B on 6 GPUs
# run: NV=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama.py --gen 1 --size 7B --shard 6 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_six_gpu.txt
- name: Run LLaMA-3 8B BEAM
run: NV=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama3.py --size 8B --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_beam.txt
run: BENCHMARK_LOG=llama3_beam NV=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama3.py --size 8B --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_beam.txt
- name: Run LLaMA-3 8B on 4 GPUs with BEAM
run: NV=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 4 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_four_gpu.txt
run: BENCHMARK_LOG=llama3_beam_4gpu NV=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 4 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_four_gpu.txt
# - name: Run LLaMA-3 8B on 6 GPUs
# run: NV=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 6 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_six_gpu.txt
# - name: Run LLaMA-2 70B
# run: NV=1 CAPTURE_PROCESS_REPLAY=0 MAX_CONTEXT=256 python3 examples/llama.py --gen 2 --size 70B --shard 6 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_2_70B.txt
- name: Run Mixtral 8x7B
run: time NV=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/mixtral.py --temperature 0 --count 10 --timing | tee mixtral.txt
run: time BENCHMARK_LOG=mixtral NV=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/mixtral.py --temperature 0 --count 10 --timing | tee mixtral.txt
- name: Run GPT2
run: |
NV=1 JIT=0 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_unjitted.txt
NV=1 JIT=1 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_jitted.txt
BENCHMARK_LOG=gpt2_nojit NV=1 JIT=0 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_unjitted.txt
BENCHMARK_LOG=gpt2 NV=1 JIT=1 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_jitted.txt
- name: Run GPT2 w HALF
run: NV=1 HALF=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half.txt
run: BENCHMARK_LOG=gpt2_half NV=1 HALF=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half.txt
- name: Run GPT2 w HALF/BEAM
run: NV=1 HALF=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt
run: BENCHMARK_LOG=gpt2_half_beam NV=1 HALF=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt
- uses: actions/upload-artifact@v4
with:
name: Speed (NVIDIA)
@@ -304,26 +304,26 @@ jobs:
- name: Train MNIST
run: time PYTHONPATH=. NV=1 TARGET_EVAL_ACC_PCT=96.0 python3 examples/beautiful_mnist.py | tee beautiful_mnist.txt
- name: Run 10 CIFAR training steps
run: NV=1 STEPS=10 python3 examples/hlb_cifar10.py | tee train_cifar.txt
run: BENCHMARK_LOG=cifar_10steps NV=1 STEPS=10 python3 examples/hlb_cifar10.py | tee train_cifar.txt
- name: Run 10 CIFAR training steps w HALF
run: NV=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_half.txt
run: BENCHMARK_LOG=cifar_10steps_half NV=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_half.txt
- name: Run 10 CIFAR training steps w BF16
run: NV=1 STEPS=10 DEFAULT_FLOAT=BFLOAT16 python3 examples/hlb_cifar10.py | tee train_cifar_bf16.txt
run: BENCHMARK_LOG=cifar_10steps_bf16 NV=1 STEPS=10 DEFAULT_FLOAT=BFLOAT16 python3 examples/hlb_cifar10.py | tee train_cifar_bf16.txt
- name: Run 10 CIFAR training steps w winograd
run: NV=1 CAPTURE_PROCESS_REPLAY=0 WINO=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_wino.txt
run: BENCHMARK_LOG=cifar_10steps_half_wino NV=1 CAPTURE_PROCESS_REPLAY=0 WINO=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_wino.txt
- name: Run full CIFAR training w 1 GPU
run: time NV=1 DEFAULT_FLOAT=HALF LATEWINO=1 STEPS=1000 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee train_cifar_one_gpu.txt
run: time BENCHMARK_LOG=cifar NV=1 DEFAULT_FLOAT=HALF LATEWINO=1 STEPS=1000 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee train_cifar_one_gpu.txt
- name: Run full CIFAR training steps w 6 GPUS
run: time CAPTURE_PROCESS_REPLAY=0 NV=1 DEFAULT_FLOAT=HALF STEPS=350 BS=1536 GPUS=6 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee train_cifar_six_gpu.txt
run: time BENCHMARK_LOG=cifar_6gpu CAPTURE_PROCESS_REPLAY=0 NV=1 DEFAULT_FLOAT=HALF STEPS=350 BS=1536 GPUS=6 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee train_cifar_six_gpu.txt
- name: Run MLPerf resnet eval on training data
run: time NV=1 MODEL=resnet python3 examples/mlperf/model_eval.py
run: time BENCHMARK_LOG=resnet_eval NV=1 MODEL=resnet python3 examples/mlperf/model_eval.py
- name: Run 10 MLPerf ResNet50 training steps (1 gpu)
run: NV=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=256 GPUS=1 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet_one_gpu.txt
run: BENCHMARK_LOG=resnet_10steps NV=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=256 GPUS=1 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet_one_gpu.txt
- name: Run 10 MLPerf ResNet50 training steps (6 gpu)
run: NV=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=1536 GPUS=6 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet.txt
run: BENCHMARK_LOG=resnet_10steps_6gpu NV=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=1536 GPUS=6 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet.txt
- name: Run 10 MLPerf Bert training steps (6 gpu)
# TODO: remove BERT_LAYERS once scheduler is fast
run: NV=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=66 GPUS=6 BERT_LAYERS=2 FUSE_ARANGE=1 FUSE_ARANGE_UINT=0 MODEL=bert python3 examples/mlperf/model_train.py | tee train_bert.txt
run: BENCHMARK_LOG=bert_10steps_6gpu NV=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=66 GPUS=6 BERT_LAYERS=2 FUSE_ARANGE=1 FUSE_ARANGE_UINT=0 MODEL=bert python3 examples/mlperf/model_train.py | tee train_bert.txt
- uses: actions/upload-artifact@v4
with:
name: Speed (NVIDIA Training)
@@ -409,23 +409,23 @@ jobs:
- name: Test AM warm start time
run: time AMD=1 python3 test/test_tiny.py TestTiny.test_plus
- name: Run Stable Diffusion
run: AMD=1 python3 examples/stable_diffusion.py --fp16 --seed 0 --noshow --timing | tee sd.txt
run: BENCHMARK_LOG=stable_diffusion AMD=1 python3 examples/stable_diffusion.py --fp16 --seed 0 --noshow --timing | tee sd.txt
- name: Run SDXL
run: CAPTURE_PROCESS_REPLAY=0 AMD=1 python3 examples/sdxl.py --seed 0 --noshow --timing | tee sdxl.txt
run: BENCHMARK_LOG=stable_diffusion_xl CAPTURE_PROCESS_REPLAY=0 AMD=1 python3 examples/sdxl.py --seed 0 --noshow --timing | tee sdxl.txt
- name: Run LLaMA 7B
run: |
AMD=1 JIT=0 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_unjitted.txt
AMD=1 JIT=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_jitted.txt
BENCHMARK_LOG=llama_nojit AMD=1 JIT=0 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_unjitted.txt
BENCHMARK_LOG=llama AMD=1 JIT=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_jitted.txt
- name: Run LLaMA 7B with BEAM
run: AMD=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_beam.txt
run: BENCHMARK_LOG=llama_beam AMD=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_beam.txt
# - name: Run LLaMA 7B on 4 GPUs
# run: AMD=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama.py --gen 1 --size 7B --shard 4 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_four_gpu.txt
# - name: Run LLaMA 7B on 6 GPUs
# run: AMD=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama.py --gen 1 --size 7B --shard 6 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_six_gpu.txt
- name: Run LLaMA-3 8B BEAM
run: AMD=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama3.py --size 8B --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_beam.txt
run: BENCHMARK_LOG=llama3_beam AMD=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama3.py --size 8B --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_beam.txt
- name: Run LLaMA-3 8B on 4 GPUs with BEAM
run: AMD=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 4 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_four_gpu.txt
run: BENCHMARK_LOG=llama3_beam_4gpu AMD=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 4 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_four_gpu.txt
# - name: Run LLaMA-3 8B on 6 GPUs
# run: AMD=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 6 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_six_gpu.txt
- name: Restore amdgpu
@@ -433,15 +433,15 @@ jobs:
# - name: Run LLaMA-2 70B
# run: AMD=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama.py --gen 2 --size 70B --shard 6 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_2_70B.txt
- name: Run Mixtral 8x7B
run: time AMD=1 python3 examples/mixtral.py --temperature 0 --count 10 --timing | tee mixtral.txt
run: time BENCHMARK_LOG=mixtral AMD=1 python3 examples/mixtral.py --temperature 0 --count 10 --timing | tee mixtral.txt
- name: Run GPT2
run: |
AMD=1 JIT=0 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_unjitted.txt
AMD=1 JIT=1 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_jitted.txt
BENCHMARK_LOG=gpt2_nojit AMD=1 JIT=0 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_unjitted.txt
BENCHMARK_LOG=gpt2 AMD=1 JIT=1 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_jitted.txt
- name: Run GPT2 w HALF
run: AMD=1 HALF=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half.txt
run: BENCHMARK_LOG=gpt2_half AMD=1 HALF=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half.txt
- name: Run GPT2 w HALF/BEAM
run: AMD=1 HALF=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt
run: BENCHMARK_LOG=gpt2_half_beam AMD=1 HALF=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt
- uses: actions/upload-artifact@v4
with:
name: Speed (AMD)
@@ -500,26 +500,26 @@ jobs:
- name: Train MNIST
run: time PYTHONPATH=. AMD=1 TARGET_EVAL_ACC_PCT=96.0 python3 examples/beautiful_mnist.py | tee beautiful_mnist.txt
- name: Run 10 CIFAR training steps
run: AMD=1 STEPS=10 python3 examples/hlb_cifar10.py | tee train_cifar.txt
run: BENCHMARK_LOG=cifar_10steps AMD=1 STEPS=10 python3 examples/hlb_cifar10.py | tee train_cifar.txt
- name: Run 10 CIFAR training steps w HALF
run: AMD=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_half.txt
run: BENCHMARK_LOG=cifar_10steps_half AMD=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_half.txt
- name: Run 10 CIFAR training steps w BF16
run: AMD=1 STEPS=10 DEFAULT_FLOAT=BFLOAT16 python3 examples/hlb_cifar10.py | tee train_cifar_bf16.txt
run: BENCHMARK_LOG=cifar_10steps_bf16 AMD=1 STEPS=10 DEFAULT_FLOAT=BFLOAT16 python3 examples/hlb_cifar10.py | tee train_cifar_bf16.txt
- name: Run 10 CIFAR training steps w winograd
run: AMD=1 WINO=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_wino.txt
run: BENCHMARK_LOG=cifar_10steps_half_wino AMD=1 WINO=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_wino.txt
- name: Run full CIFAR training w 1 GPU
run: time AMD=1 DEFAULT_FLOAT=HALF LATEWINO=1 STEPS=1000 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee train_cifar_one_gpu.txt
run: time BENCHMARK_LOG=cifar AMD=1 DEFAULT_FLOAT=HALF LATEWINO=1 STEPS=1000 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee train_cifar_one_gpu.txt
- name: Run full CIFAR training steps w 6 GPUS
run: time AMD=1 DEFAULT_FLOAT=HALF STEPS=350 BS=1536 GPUS=6 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee train_cifar_six_gpu.txt
run: time BENCHMARK_LOG=cifar_6gpu AMD=1 DEFAULT_FLOAT=HALF STEPS=350 BS=1536 GPUS=6 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee train_cifar_six_gpu.txt
- name: Run MLPerf resnet eval
run: time AMD=1 MODEL=resnet python3 examples/mlperf/model_eval.py
run: time BENCHMARK_LOG=resnet_eval AMD=1 MODEL=resnet python3 examples/mlperf/model_eval.py
- name: Run 10 MLPerf ResNet50 training steps (1 gpu)
run: AMD=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=256 GPUS=1 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet_one_gpu.txt
run: BENCHMARK_LOG=resnet_10steps AMD=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=256 GPUS=1 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet_one_gpu.txt
- name: Run 10 MLPerf ResNet50 training steps (6 gpu)
run: AMD=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=1536 GPUS=6 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet.txt
run: BENCHMARK_LOG=resnet_10steps_6gpu AMD=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=1536 GPUS=6 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet.txt
- name: Run 10 MLPerf Bert training steps (6 gpu)
# TODO: remove BERT_LAYERS once scheduler is fast
run: AMD=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=66 GPUS=6 BERT_LAYERS=2 FUSE_ARANGE=1 FUSE_ARANGE_UINT=0 MODEL=bert python3 examples/mlperf/model_train.py | tee train_bert.txt
run: BENCHMARK_LOG=bert_10steps_6gpu AMD=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=66 GPUS=6 BERT_LAYERS=2 FUSE_ARANGE=1 FUSE_ARANGE_UINT=0 MODEL=bert python3 examples/mlperf/model_train.py | tee train_bert.txt
- uses: actions/upload-artifact@v4
with:
name: Speed (AMD Training)
@@ -558,13 +558,13 @@ jobs:
- name: validate openpilot 0.9.7
run: PYTHONPATH=. FLOAT16=0 IMAGE=2 QCOM=1 taskset -c 4-7 python3 test/external/external_benchmark_openpilot.py https://github.com/commaai/openpilot/raw/v0.9.7/selfdrive/modeld/models/supercombo.onnx | tee openpilot_image_0_9_7.txt
- name: benchmark openpilot 0.9.4
run: PYTHONPATH=. QCOM=1 taskset -c 4-7 python3 test/external/external_benchmark_openpilot.py https://github.com/commaai/openpilot/raw/v0.9.4/selfdrive/modeld/models/supercombo.onnx | tee openpilot_0_9_4.txt
run: BENCHMARK_LOG=openpilot_0_9_4 PYTHONPATH=. QCOM=1 taskset -c 4-7 python3 test/external/external_benchmark_openpilot.py https://github.com/commaai/openpilot/raw/v0.9.4/selfdrive/modeld/models/supercombo.onnx | tee openpilot_0_9_4.txt
- name: benchmark openpilot 0.9.7
run: PYTHONPATH=. QCOM=1 taskset -c 4-7 python3 test/external/external_benchmark_openpilot.py https://github.com/commaai/openpilot/raw/v0.9.7/selfdrive/modeld/models/supercombo.onnx | tee openpilot_0_9_7.txt
run: BENCHMARK_LOG=openpilot_0_9_7 PYTHONPATH=. QCOM=1 taskset -c 4-7 python3 test/external/external_benchmark_openpilot.py https://github.com/commaai/openpilot/raw/v0.9.7/selfdrive/modeld/models/supercombo.onnx | tee openpilot_0_9_7.txt
- name: benchmark openpilot w IMAGE=2 0.9.4
run: PYTHONPATH=. NOLOCALS=1 FLOAT16=1 IMAGE=2 QCOM=1 taskset -c 4-7 python3 test/external/external_benchmark_openpilot.py https://github.com/commaai/openpilot/raw/v0.9.4/selfdrive/modeld/models/supercombo.onnx | tee openpilot_image_0_9_4.txt
run: BENCHMARK_LOG=openpilot_0_9_4_image PYTHONPATH=. NOLOCALS=1 FLOAT16=1 IMAGE=2 QCOM=1 taskset -c 4-7 python3 test/external/external_benchmark_openpilot.py https://github.com/commaai/openpilot/raw/v0.9.4/selfdrive/modeld/models/supercombo.onnx | tee openpilot_image_0_9_4.txt
- name: benchmark openpilot w IMAGE=2 0.9.7
run: PYTHONPATH=. NOLOCALS=1 FLOAT16=1 IMAGE=2 QCOM=1 taskset -c 4-7 python3 test/external/external_benchmark_openpilot.py https://github.com/commaai/openpilot/raw/v0.9.7/selfdrive/modeld/models/supercombo.onnx | tee openpilot_image_0_9_7.txt
run: BENCHMARK_LOG=openpilot_0_9_7_image PYTHONPATH=. NOLOCALS=1 FLOAT16=1 IMAGE=2 QCOM=1 taskset -c 4-7 python3 test/external/external_benchmark_openpilot.py https://github.com/commaai/openpilot/raw/v0.9.7/selfdrive/modeld/models/supercombo.onnx | tee openpilot_image_0_9_7.txt
- name: openpilot compile3 0.9.7
run: PYTHONPATH="." QCOM=1 taskset -c 4-7 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/v0.9.7/selfdrive/modeld/models/supercombo.onnx
- name: openpilot compile3 0.9.7+ tomb raider
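
Note: every BENCHMARK_LOG value added above does double duty: it switches stat collection on and names the InfluxDB measurement the points are written to, so runs of the same example under different configs (e.g. gpt2 vs gpt2_half_beam) stay separate series. A minimal sketch of that gating pattern, condensed from the new extra/bench_log.py further down:

from tinygrad.helpers import ContextVar

BENCHMARK_LOG = ContextVar("BENCHMARK_LOG", "")  # unset/empty -> logging is a no-op
if BENCHMARK_LOG:
  # the client library is imported only when logging is active, so plain
  # benchmark runs don't need influxdb3-python installed
  from influxdb_client_3 import InfluxDBClient3, Point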

View File

@@ -7,6 +7,7 @@ from tinygrad.ops import UOp
from tinygrad.helpers import Timing, DEBUG, JIT, getenv, fetch, colored, trange
from tinygrad.nn import Embedding, Linear, LayerNorm
from tinygrad.nn.state import gguf_load, torch_load, load_state_dict, get_state_dict
from extra.bench_log import BenchEvent, WallTimeEvent
MAX_CONTEXT = getenv("MAX_CONTEXT", 128)
HALF = getenv("HALF")
@@ -134,11 +135,12 @@ class GPT2:
# lm head and wte are tied
weights['lm_head.weight'] = weights['wte.weight']
load_state_dict(model, weights)
with WallTimeEvent(BenchEvent.LOAD_WEIGHTS):
load_state_dict(model, weights)
if HALF:
for l in get_state_dict(model).values():
l.replace(l.half().realize())
if HALF:
for l in get_state_dict(model).values():
l.replace(l.half().realize())
return GPT2(model, tokenizer)
@@ -167,7 +169,8 @@ class GPT2:
return key
state_dict = { _remap_gguf_key(k): v for k, v in state_dict.items() }
model = Transformer(**gpt2_params)
load_state_dict(model, state_dict)
with WallTimeEvent(BenchEvent.LOAD_WEIGHTS):
load_state_dict(model, state_dict)
return GPT2(model, tiktoken.get_encoding("gpt2"))
def __init__(self, model, tokenizer):
@@ -185,11 +188,12 @@ class GPT2:
with Timing("ran model in ", on_exit=(lambda et: (f", {(GlobalCounters.time_sum_s-st)*1e3:.2f} ms on GPU" if DEBUG>=2 else "")+
f", {GlobalCounters.global_ops*1e-9:.2f} GOPS, {GlobalCounters.global_mem*1e-9:.2f} GB"+
(f", {GlobalCounters.global_mem*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s" if DEBUG>=2 else "")) if DEBUG else None, enabled=timing):
if batch_size == 1 and len(toks[0][start_pos:]) == 1:
tokens = Variable("tokens", 0, VOCAB_SIZE).bind(toks[0][start_pos])
else:
tokens = Tensor([x[start_pos:] for x in toks])
tok = self.model(tokens, Variable("start_pos", 1 if start_pos else 0, MAX_CONTEXT-1).bind(start_pos), temperature).tolist()
with WallTimeEvent(BenchEvent.STEP):
if batch_size == 1 and len(toks[0][start_pos:]) == 1:
tokens = Variable("tokens", 0, VOCAB_SIZE).bind(toks[0][start_pos])
else:
tokens = Tensor([x[start_pos:] for x in toks])
tok = self.model(tokens, Variable("start_pos", 1 if start_pos else 0, MAX_CONTEXT-1).bind(start_pos), temperature).tolist()
start_pos = len(toks[0])
for i,t in enumerate(tok): toks[i].append(t)
return [self.tokenizer.decode(x) for x in toks]
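
The instrumentation pattern is identical across the examples in this commit: weight loading goes under BenchEvent.LOAD_WEIGHTS and each generation or training step under BenchEvent.STEP, with the existing Timing/DEBUG reporting left intact inside the block. A runnable toy sketch of the pattern (the Tensor ops are stand-ins for the examples' real code, not part of this commit):

from extra.bench_log import BenchEvent, WallTimeEvent
from tinygrad import Tensor

with WallTimeEvent(BenchEvent.LOAD_WEIGHTS):   # one wall-time sample per wrapped block
  w = Tensor.rand(32, 32).realize()            # stand-in for load_state_dict(...)

for _ in range(3):
  with WallTimeEvent(BenchEvent.STEP):         # one wall-time sample per step
    out = (Tensor.rand(1, 32) @ w).realize()   # stand-in for a model step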

View File

@@ -11,6 +11,7 @@ from tinygrad import nn, dtypes, Tensor, Device, GlobalCounters, TinyJit
from tinygrad.nn.state import get_state_dict, get_parameters
from tinygrad.nn import optim
from tinygrad.helpers import Context, BEAM, WINO, getenv, colored, prod
from extra.bench_log import BenchEvent, WallTimeEvent
cifar_mean = [0.4913997551666284, 0.48215855929893703, 0.4465309133731618]
cifar_std = [0.24703225141799082, 0.24348516474564, 0.26158783926049628]
@@ -395,20 +396,23 @@ def train_cifar():
if STEPS == 0 or i == STEPS: break
GlobalCounters.reset()
X, Y = next(batcher)
if len(GPUS) > 1:
X.shard_(GPUS, axis=0)
Y.shard_(GPUS, axis=0)
with Context(BEAM=getenv("LATEBEAM", BEAM.value), WINO=getenv("LATEWINO", WINO.value)):
loss = train_step_jitted(model, optim.OptimizerGroup(opt_bias, opt_non_bias), [lr_sched_bias, lr_sched_non_bias], X, Y)
et = time.monotonic()
loss_cpu = loss.numpy()
# EMA for network weights
if getenv("EMA") and i > hyp['ema']['steps'] and (i+1) % hyp['ema']['every_n_steps'] == 0:
if model_ema is None:
model_ema = modelEMA(W, model)
model_ema.update(model, Tensor([projected_ema_decay_val*(i/STEPS)**hyp['ema']['decay_pow']]))
with WallTimeEvent(BenchEvent.STEP):
X, Y = next(batcher)
if len(GPUS) > 1:
X.shard_(GPUS, axis=0)
Y.shard_(GPUS, axis=0)
with Context(BEAM=getenv("LATEBEAM", BEAM.value), WINO=getenv("LATEWINO", WINO.value)):
loss = train_step_jitted(model, optim.OptimizerGroup(opt_bias, opt_non_bias), [lr_sched_bias, lr_sched_non_bias], X, Y)
et = time.monotonic()
loss_cpu = loss.numpy()
# EMA for network weights
if getenv("EMA") and i > hyp['ema']['steps'] and (i+1) % hyp['ema']['every_n_steps'] == 0:
if model_ema is None:
model_ema = modelEMA(W, model)
model_ema.update(model, Tensor([projected_ema_decay_val*(i/STEPS)**hyp['ema']['decay_pow']]))
cl = time.monotonic()
device_str = loss.device if isinstance(loss.device, str) else f"{loss.device[0]} * {len(loss.device)}"
# 53 221.74 ms run, 2.22 ms python, 219.52 ms CL, 803.39 loss, 0.000807 LR, 4.66 GB used, 3042.49 GFLOPS, 674.65 GOPS
@@ -424,4 +428,5 @@ def train_cifar():
raise ValueError(colored(f"{eval_acc_pct=} < {target}", "red"))
if __name__ == "__main__":
train_cifar()
with WallTimeEvent(BenchEvent.FULL):
train_cifar()

View File

@@ -13,6 +13,7 @@ from extra.models.llama import Transformer, convert_from_huggingface, fix_bf16
from sentencepiece import SentencePieceProcessor
import tiktoken, sys
from tiktoken.load import load_tiktoken_bpe
from extra.bench_log import BenchEvent, WallTimeEvent
MAX_CONTEXT = getenv("MAX_CONTEXT", 4096)
@@ -206,42 +207,43 @@ class LLaMa:
model = Transformer(**params["args"], linear=linear, max_context=MAX_CONTEXT, jit=bool(JIT))
if model_path.is_dir():
weights = concat_weights([load(filename) for filename in [f"{model_path}/consolidated.{i:02d}.pth" for i in range(params["files"])]], device[0] if isinstance(device, tuple) else device)
else:
weights = load(str(model_path))
if "model.embed_tokens.weight" in weights:
weights = convert_from_huggingface(weights, params["args"]["n_layers"], params["args"]["n_heads"], params["args"].get("n_kv_heads", params["args"]["n_heads"]))
with WallTimeEvent(BenchEvent.LOAD_WEIGHTS):
if model_path.is_dir():
weights = concat_weights([load(filename) for filename in [f"{model_path}/consolidated.{i:02d}.pth" for i in range(params["files"])]], device[0] if isinstance(device, tuple) else device)
else:
weights = load(str(model_path))
if "model.embed_tokens.weight" in weights:
weights = convert_from_huggingface(weights, params["args"]["n_layers"], params["args"]["n_heads"], params["args"].get("n_kv_heads", params["args"]["n_heads"]))
weights = fix_bf16(weights)
weights = fix_bf16(weights)
# prevent tracking model weights
# this is a part of a larger problem with BUFFER UOps and gc in TRACK_MATCH_STATS=2
with Context(BEAM=0, TRACK_MATCH_STATS=0):
# quantize
if quantize is not None:
weights = linear.quantize(weights, device)
for _,v in weights.items(): v.realize()
# prevent tracking model weights
# this is a part of a larger problem with BUFFER UOps and gc in TRACK_MATCH_STATS=2
with Context(BEAM=0, TRACK_MATCH_STATS=0):
# quantize
if quantize is not None:
weights = linear.quantize(weights, device)
for _,v in weights.items(): v.realize()
# shard
if isinstance(device, tuple):
for k,v in nn.state.get_state_dict(model).items():
if 'scale' in k: v.shard_(device, axis=None) # from quantized
elif '.attention.' in k:
if getenv("SHARD_KVCACHE") and ('.wq.' in k or '.wk.' in k or '.wv.' in k): v.shard_(device, axis=0)
else: v.shard_(device, axis=-1)
elif '.feed_forward.w1.' in k: v.shard_(device, axis=0)
elif '.feed_forward.w3.' in k: v.shard_(device, axis=0)
elif '.feed_forward.' in k: v.shard_(device, axis=-1)
elif 'tok_embeddings.weight' in k: v.shard_(device, axis=0)
elif 'output.weight' in k: v.shard_(device, axis=-1)
#elif k.endswith('.weight'): v.shard_(device, axis=-1)
#elif 'norm.' in k: v.shard_(device, axis=-1)
else: v.shard_(device, axis=None)
#print(k, v.shape, v.lazydata.axis)
# shard
if isinstance(device, tuple):
for k,v in nn.state.get_state_dict(model).items():
if 'scale' in k: v.shard_(device, axis=None) # from quantized
elif '.attention.' in k:
if getenv("SHARD_KVCACHE") and ('.wq.' in k or '.wk.' in k or '.wv.' in k): v.shard_(device, axis=0)
else: v.shard_(device, axis=-1)
elif '.feed_forward.w1.' in k: v.shard_(device, axis=0)
elif '.feed_forward.w3.' in k: v.shard_(device, axis=0)
elif '.feed_forward.' in k: v.shard_(device, axis=-1)
elif 'tok_embeddings.weight' in k: v.shard_(device, axis=0)
elif 'output.weight' in k: v.shard_(device, axis=-1)
#elif k.endswith('.weight'): v.shard_(device, axis=-1)
#elif 'norm.' in k: v.shard_(device, axis=-1)
else: v.shard_(device, axis=None)
#print(k, v.shape, v.lazydata.axis)
# replace weights in model
load_state_dict(model, weights, strict=False, consume=True)
# replace weights in model
load_state_dict(model, weights, strict=False, consume=True)
return LLaMa(model, tokenizer)
@@ -477,11 +479,12 @@ After you are done speaking, output [EOS]. You are not Chad.
next_tok = Tensor([toks[start_pos:]], device=device) if tok_tensor is None or (len(toks)-start_pos) > 1 else tok_tensor.reshape(1, 1)
with Profiling(enabled=args.profile):
with Timing("total ", enabled=args.timing, on_exit=lambda x: f", {1e9/x:.2f} tok/s, {GlobalCounters.global_mem/x:.2f} GB/s, param {param_bytes/x:.2f} GB/s"):
with Timing("enqueue in ", on_exit=(lambda et: (f", {(GlobalCounters.time_sum_s-st)*1e3:.2f} ms on GPU" if DEBUG>=2 else "")+
f", {GlobalCounters.global_ops*1e-9:.2f} GOPS, {GlobalCounters.global_mem*1e-9:.2f} GB"+
(f", {GlobalCounters.global_mem*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s, param {param_bytes*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s" if DEBUG>=2 else "")) if DEBUG else None, enabled=args.timing):
tok_tensor = llama.model(next_tok, start_pos, args.temperature)
tok = tok_tensor.item()
with WallTimeEvent(BenchEvent.STEP):
with Timing("enqueue in ", on_exit=(lambda et: (f", {(GlobalCounters.time_sum_s-st)*1e3:.2f} ms on GPU" if DEBUG>=2 else "")+
f", {GlobalCounters.global_ops*1e-9:.2f} GOPS, {GlobalCounters.global_mem*1e-9:.2f} GB"+
(f", {GlobalCounters.global_mem*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s, param {param_bytes*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s" if DEBUG>=2 else "")) if DEBUG else None, enabled=args.timing):
tok_tensor = llama.model(next_tok, start_pos, args.temperature)
tok = tok_tensor.item()
# use the kv cache
start_pos = len(toks)

View File

@@ -7,6 +7,7 @@ from extra.models.llama import Transformer, convert_from_huggingface, convert_fr
from tinygrad.nn.state import safe_load, torch_load, load_state_dict, get_parameters, gguf_load
from tinygrad import Tensor, dtypes, nn, Context, Device, GlobalCounters
from tinygrad.helpers import Profiling, Timing, DEBUG, colored, fetch, tqdm
from extra.bench_log import BenchEvent, WallTimeEvent
class Tokenizer:
pat_str = r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"
@@ -166,40 +167,42 @@ def build_transformer(model_path: Path, model_size="8B", quantize=None, scale_dt
model = Transformer(**MODEL_PARAMS[model_size]["args"], linear=linear, embedding=embedding, max_context=max_context, jit=True)
if not load_weights: return model
# load weights
if model_path.is_dir():
if (model_path / "model.safetensors.index.json").exists(): weights = load(str(model_path / "model.safetensors.index.json"))
elif (model_path / "model.safetensors").exists(): weights = load(str(model_path / "model.safetensors"))
else: weights = concat_weights([load(str(model_path / f"consolidated.{i:02d}.pth")) for i in range(MODEL_PARAMS[model_size]["files"])], device[0] if isinstance(device, tuple) else device)
else:
weights = load(str(model_path))
if "model.embed_tokens.weight" in weights:
weights = convert_from_huggingface(weights, MODEL_PARAMS[model_size]["args"]["n_layers"], MODEL_PARAMS[model_size]["args"]["n_heads"], MODEL_PARAMS[model_size]["args"]["n_kv_heads"])
elif "token_embd.weight" in weights:
weights = convert_from_gguf(weights, MODEL_PARAMS[model_size]["args"]["n_layers"])
weights = fix_bf16(weights)
with WallTimeEvent(BenchEvent.LOAD_WEIGHTS):
if model_path.is_dir():
if (model_path / "model.safetensors.index.json").exists(): weights = load(str(model_path / "model.safetensors.index.json"))
elif (model_path / "model.safetensors").exists(): weights = load(str(model_path / "model.safetensors"))
else: weights = concat_weights([load(str(model_path / f"consolidated.{i:02d}.pth")) for i in range(MODEL_PARAMS[model_size]["files"])], device[0] if isinstance(device, tuple) else device)
else:
weights = load(str(model_path))
if "model.embed_tokens.weight" in weights:
weights = convert_from_huggingface(weights, MODEL_PARAMS[model_size]["args"]["n_layers"], MODEL_PARAMS[model_size]["args"]["n_heads"], MODEL_PARAMS[model_size]["args"]["n_kv_heads"])
elif "token_embd.weight" in weights:
weights = convert_from_gguf(weights, MODEL_PARAMS[model_size]["args"]["n_layers"])
weights = fix_bf16(weights)
with Context(BEAM=0):
# quantize
if quantize == "float16": weights = {k:v.cast(quantize).contiguous() for k,v in weights.items()}
elif quantize is not None:
weights = linear.quantize(weights, device, scale_dtype, quantize_embeds)
for _,v in weights.items(): v.realize()
with Context(BEAM=0):
# quantize
if quantize == "float16": weights = {k:v.cast(quantize).contiguous() for k,v in weights.items()}
elif quantize is not None:
weights = linear.quantize(weights, device, scale_dtype, quantize_embeds)
for _,v in weights.items(): v.realize()
# shard
if isinstance(device, tuple):
for k,v in nn.state.get_state_dict(model).items():
if 'scale' in k: v.shard_(device, axis=None) # from quantized
elif '.attention.' in k: v.shard_(device, axis=-1)
elif '.feed_forward.w1.' in k: v.shard_(device, axis=0)
elif '.feed_forward.w3.' in k: v.shard_(device, axis=0)
elif '.feed_forward.' in k: v.shard_(device, axis=-1)
elif 'tok_embeddings.weight' in k: v.shard_(device, axis=0)
elif 'output.weight' in k: v.shard_(device, axis=0)
else: v.shard_(device, axis=None)
# shard
if isinstance(device, tuple):
for k,v in nn.state.get_state_dict(model).items():
if 'scale' in k: v.shard_(device, axis=None) # from quantized
elif '.attention.' in k: v.shard_(device, axis=-1)
elif '.feed_forward.w1.' in k: v.shard_(device, axis=0)
elif '.feed_forward.w3.' in k: v.shard_(device, axis=0)
elif '.feed_forward.' in k: v.shard_(device, axis=-1)
elif 'tok_embeddings.weight' in k: v.shard_(device, axis=0)
elif 'output.weight' in k: v.shard_(device, axis=0)
else: v.shard_(device, axis=None)
# replace weights in model
load_state_dict(model, weights, strict=False, consume=True)
# replace weights in model
load_state_dict(model, weights, strict=False, consume=True)
return model
# default settings
@@ -435,11 +438,12 @@ if __name__ == "__main__":
st = GlobalCounters.time_sum_s
with Profiling(enabled=args.profile):
with Timing("total ", on_exit=lambda x: f", {1e9/x:.2f} tok/s, {GlobalCounters.global_mem/x:.2f} GB/s, param {param_bytes/x:.2f} GB/s"):
with Timing("enqueue in ", on_exit=(lambda et: (f", {(GlobalCounters.time_sum_s-st)*1e3:.2f} ms on GPU" if DEBUG>=2 else "")+
f", {GlobalCounters.global_ops*1e-9:.2f} GOPS, {GlobalCounters.global_mem*1e-9:.2f} GB"+
(f", {GlobalCounters.global_mem*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s, param {param_bytes*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s" if DEBUG>=2 else "")) if DEBUG else None):
tok = model(Tensor([[last_tok]], device=device), start_pos, TEMPERATURE, TOP_K, TOP_P, ALPHA_F, ALPHA_P)
tok = tok.item()
with WallTimeEvent(BenchEvent.STEP):
with Timing("enqueue in ", on_exit=(lambda et: (f", {(GlobalCounters.time_sum_s-st)*1e3:.2f} ms on GPU" if DEBUG>=2 else "")+
f", {GlobalCounters.global_ops*1e-9:.2f} GOPS, {GlobalCounters.global_mem*1e-9:.2f} GB"+
(f", {GlobalCounters.global_mem*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s, param {param_bytes*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s" if DEBUG>=2 else "")) if DEBUG else None):
tok = model(Tensor([[last_tok]], device=device), start_pos, TEMPERATURE, TOP_K, TOP_P, ALPHA_F, ALPHA_P)
tok = tok.item()
start_pos += 1
last_tok = tok
generated += tokenizer.decode([tok])

View File

@@ -3,6 +3,7 @@ from tinygrad import Tensor, nn, Device, GlobalCounters, Variable
from tinygrad.helpers import Timing, Profiling, CI, tqdm
from tinygrad.nn.state import torch_load, get_state_dict
from extra.models.llama import FeedForward, Transformer
from extra.bench_log import BenchEvent, WallTimeEvent
class MixtureFeedForward:
def __init__(self, num_experts:int, dim:int, hidden_dim:int, linear=nn.Linear):
@@ -30,18 +31,19 @@ if __name__ == "__main__":
help="Path to the downloaded weights")
args = parser.parse_args()
state = torch_load(args.weights + "/consolidated.00.pth.b")
model = Transformer(n_layers=32, dim=4096, hidden_dim=14336, n_heads=32, n_kv_heads=8, norm_eps=1e-5, vocab_size=32000, feed_forward=functools.partial(MixtureFeedForward, 8), jit=False)
model_state_dict = get_state_dict(model)
with WallTimeEvent(BenchEvent.LOAD_WEIGHTS):
state = torch_load(args.weights + "/consolidated.00.pth.b")
model = Transformer(n_layers=32, dim=4096, hidden_dim=14336, n_heads=32, n_kv_heads=8, norm_eps=1e-5, vocab_size=32000, feed_forward=functools.partial(MixtureFeedForward, 8), jit=False)
model_state_dict = get_state_dict(model)
for k in (t := tqdm(state, disable=CI)):
if 'feed_forward.experts.' in k:
expert_no = int(k.split('feed_forward.experts.')[1].split('.')[0])
device = Device.DEFAULT + ":" + str((expert_no//2)+1)
else:
device = Device.DEFAULT
t.set_description(f"ram used: {GlobalCounters.mem_used/1e9:5.2f} GB, loading {k} to {device}")
model_state_dict[k].replace(state[k].to(device).half()).realize()
for k in (t := tqdm(state, disable=CI)):
if 'feed_forward.experts.' in k:
expert_no = int(k.split('feed_forward.experts.')[1].split('.')[0])
device = Device.DEFAULT + ":" + str((expert_no//2)+1)
else:
device = Device.DEFAULT
t.set_description(f"ram used: {GlobalCounters.mem_used/1e9:5.2f} GB, loading {k} to {device}")
model_state_dict[k].replace(state[k].to(device).half()).realize()
if CI: print(f"ram used: {GlobalCounters.mem_used/1e9:5.2f} GB")
from sentencepiece import SentencePieceProcessor
@@ -53,7 +55,8 @@ if __name__ == "__main__":
GlobalCounters.reset()
with Profiling(sort="time", frac=0.1, enabled=args.profile):
with Timing("total ", enabled=args.timing, on_exit=lambda x: f", {1e9/x:.2f} tok/sec"):
tok = model(Tensor([toks[start_pos:]]), 0 if start_pos == 0 else Variable("start_pos", 1, 1024).bind(start_pos), args.temperature).item()
with WallTimeEvent(BenchEvent.STEP):
tok = model(Tensor([toks[start_pos:]]), 0 if start_pos == 0 else Variable("start_pos", 1, 1024).bind(start_pos), args.temperature).item()
toks.append(tok)
start_pos += 1
print(spp.decode(toks))

View File

@@ -5,61 +5,63 @@ import numpy as np
from tinygrad import Tensor, Device, dtypes, GlobalCounters, TinyJit
from tinygrad.nn.state import get_parameters, load_state_dict, safe_load
from tinygrad.helpers import getenv
from extra.bench_log import BenchEvent, WallTimeEvent
def tlog(x): print(f"{x:25s} @ {time.perf_counter()-start:5.2f}s")
def eval_resnet():
Tensor.no_grad = True
# Resnet50-v1.5
from extra.models.resnet import ResNet50
tlog("imports")
GPUS = [f'{Device.DEFAULT}:{i}' for i in range(getenv("GPUS", 6))]
for x in GPUS: Device[x]
tlog("got devices") # NOTE: this is faster with rocm-smi running
with WallTimeEvent(BenchEvent.FULL):
# Resnet50-v1.5
from extra.models.resnet import ResNet50
tlog("imports")
GPUS = [f'{Device.DEFAULT}:{i}' for i in range(getenv("GPUS", 6))]
for x in GPUS: Device[x]
tlog("got devices") # NOTE: this is faster with rocm-smi running
class ResnetRunner:
def __init__(self, device=None):
self.mdl = ResNet50()
for x in get_parameters(self.mdl) if device else []: x.to_(device)
if (fn:=getenv("RESNET_MODEL", "")): load_state_dict(self.mdl, safe_load(fn))
else: self.mdl.load_from_pretrained()
self.input_mean = Tensor([0.485, 0.456, 0.406], device=device).reshape(1, -1, 1, 1)
self.input_std = Tensor([0.229, 0.224, 0.225], device=device).reshape(1, -1, 1, 1)
def __call__(self, x:Tensor) -> Tensor:
x = x.permute([0,3,1,2]).cast(dtypes.float32) / 255.0
x -= self.input_mean
x /= self.input_std
return self.mdl(x).log_softmax().argmax(axis=1).realize()
class ResnetRunner:
def __init__(self, device=None):
self.mdl = ResNet50()
for x in get_parameters(self.mdl) if device else []: x.to_(device)
if (fn:=getenv("RESNET_MODEL", "")): load_state_dict(self.mdl, safe_load(fn))
else: self.mdl.load_from_pretrained()
self.input_mean = Tensor([0.485, 0.456, 0.406], device=device).reshape(1, -1, 1, 1)
self.input_std = Tensor([0.229, 0.224, 0.225], device=device).reshape(1, -1, 1, 1)
def __call__(self, x:Tensor) -> Tensor:
x = x.permute([0,3,1,2]).cast(dtypes.float32) / 255.0
x -= self.input_mean
x /= self.input_std
return self.mdl(x).log_softmax().argmax(axis=1).realize()
mdl = TinyJit(ResnetRunner(GPUS))
tlog("loaded models")
mdl = TinyJit(ResnetRunner(GPUS))
tlog("loaded models")
# evaluation on the mlperf classes of the validation set from imagenet
from examples.mlperf.dataloader import batch_load_resnet
iterator = batch_load_resnet(getenv("BS", 128*6), val=getenv("VAL", 1), shuffle=False, pad_first_batch=True)
def data_get():
x,y,cookie = next(iterator)
return x.shard(GPUS, axis=0).realize(), y, cookie
n,d = 0,0
proc = data_get()
tlog("loaded initial data")
st = time.perf_counter()
while proc is not None:
GlobalCounters.reset()
proc = (mdl(proc[0]), proc[1], proc[2]) # this frees the images
run = time.perf_counter()
# load the next data here
try: next_proc = data_get()
except StopIteration: next_proc = None
nd = time.perf_counter()
y = np.array(proc[1])
proc = (proc[0].numpy() == y) & (y != -1) # this realizes the models and frees the cookies
n += proc.sum()
d += (y != -1).sum()
et = time.perf_counter()
tlog(f"****** {n:5d}/{d:5d} {n*100.0/d:.2f}% -- {(run-st)*1000:7.2f} ms to enqueue, {(et-run)*1000:7.2f} ms to realize ({(nd-run)*1000:7.2f} ms fetching). {(len(proc))/(et-st):8.2f} examples/sec. {GlobalCounters.global_ops*1e-12/(et-st):5.2f} TFLOPS")
st = et
proc, next_proc = next_proc, None
tlog("done")
# evaluation on the mlperf classes of the validation set from imagenet
from examples.mlperf.dataloader import batch_load_resnet
iterator = batch_load_resnet(getenv("BS", 128*6), val=getenv("VAL", 1), shuffle=False, pad_first_batch=True)
def data_get():
x,y,cookie = next(iterator)
return x.shard(GPUS, axis=0).realize(), y, cookie
n,d = 0,0
proc = data_get()
tlog("loaded initial data")
st = time.perf_counter()
while proc is not None:
GlobalCounters.reset()
proc = (mdl(proc[0]), proc[1], proc[2]) # this frees the images
run = time.perf_counter()
# load the next data here
try: next_proc = data_get()
except StopIteration: next_proc = None
nd = time.perf_counter()
y = np.array(proc[1])
proc = (proc[0].numpy() == y) & (y != -1) # this realizes the models and frees the cookies
n += proc.sum()
d += (y != -1).sum()
et = time.perf_counter()
tlog(f"****** {n:5d}/{d:5d} {n*100.0/d:.2f}% -- {(run-st)*1000:7.2f} ms to enqueue, {(et-run)*1000:7.2f} ms to realize ({(nd-run)*1000:7.2f} ms fetching). {(len(proc))/(et-st):8.2f} examples/sec. {GlobalCounters.global_ops*1e-12/(et-st):5.2f} TFLOPS")
st = et
proc, next_proc = next_proc, None
tlog("done")
def eval_unet3d():
# UNet3D

View File

@@ -9,6 +9,7 @@ from tinygrad.nn.optim import LAMB, LARS, SGD, OptimizerGroup, Adam
from extra.lr_scheduler import LRSchedulerGroup
from examples.mlperf.helpers import get_training_state, load_training_state
from extra.bench_log import BenchEvent, WallTimeEvent
# TODO: fix benchmark logging and use tinygrad tqdm
from tqdm import tqdm
@@ -205,24 +206,25 @@ def train_resnet():
st = time.perf_counter()
while proc is not None:
GlobalCounters.reset()
(loss, top_1), y, proc = train_step(proc[0], proc[1]), proc[2], proc[3]
with WallTimeEvent(BenchEvent.STEP):
(loss, top_1), y, proc = train_step(proc[0], proc[1]), proc[2], proc[3]
pt = time.perf_counter()
pt = time.perf_counter()
if len(prev_cookies) == getenv("STORE_COOKIES", 1): prev_cookies = [] # free previous cookies after gpu work has been enqueued
try:
if INITMLPERF:
next_proc = fake_data_get(BS)
else:
next_proc = data_get(it)
except StopIteration:
next_proc = None
if len(prev_cookies) == getenv("STORE_COOKIES", 1): prev_cookies = [] # free previous cookies after gpu work has been enqueued
try:
if INITMLPERF:
next_proc = fake_data_get(BS)
else:
next_proc = data_get(it)
except StopIteration:
next_proc = None
dt = time.perf_counter()
dt = time.perf_counter()
device_str = loss.device if isinstance(loss.device, str) else f"{loss.device[0]} * {len(loss.device)}"
loss, top_1 = loss.numpy().item(), top_1.numpy().item()
top_1_acc = top_1 / sum(yi != -1 for yi in y)
device_str = loss.device if isinstance(loss.device, str) else f"{loss.device[0]} * {len(loss.device)}"
loss, top_1 = loss.numpy().item(), top_1.numpy().item()
top_1_acc = top_1 / sum(yi != -1 for yi in y)
cl = time.perf_counter()
if BENCHMARK:
@@ -1124,23 +1126,24 @@ def train_bert():
BEAM.value = TRAIN_BEAM
st = time.perf_counter()
GlobalCounters.reset()
loss, global_norm, lr = train_step_bert(model, optimizer_group, scheduler_group, loss_scaler,
train_data["input_ids"], train_data["segment_ids"], train_data["input_mask"], train_data["masked_lm_positions"], \
train_data["masked_lm_ids"], train_data["masked_lm_weights"], train_data["next_sentence_labels"], GPUS)
with WallTimeEvent(BenchEvent.STEP):
loss, global_norm, lr = train_step_bert(model, optimizer_group, scheduler_group, loss_scaler,
train_data["input_ids"], train_data["segment_ids"], train_data["input_mask"], train_data["masked_lm_positions"], \
train_data["masked_lm_ids"], train_data["masked_lm_weights"], train_data["next_sentence_labels"], GPUS)
pt = time.perf_counter()
pt = time.perf_counter()
try:
next_data = next(train_it)
except StopIteration:
next_data = None
try:
next_data = next(train_it)
except StopIteration:
next_data = None
dt = time.perf_counter()
dt = time.perf_counter()
device_str = parameters[0].device if isinstance(parameters[0].device, str) else f"{parameters[0].device[0]} * {len(parameters[0].device)}"
loss = loss.item()
assert not math.isnan(loss)
lr = lr.item()
device_str = parameters[0].device if isinstance(parameters[0].device, str) else f"{parameters[0].device[0]} * {len(parameters[0].device)}"
loss = loss.item()
assert not math.isnan(loss)
lr = lr.item()
cl = time.perf_counter()
if BENCHMARK: step_times.append(cl - st)

View File

@@ -5,6 +5,7 @@ from examples.stable_diffusion import AutoencoderKL, get_alphas_cumprod
from examples.sdxl import DPMPP2MSampler, append_dims, LegacyDDPMDiscretization
from extra.models.unet import UNetModel
from extra.models.clip import FrozenOpenClipEmbedder
from extra.bench_log import BenchEvent, WallTimeEvent
from typing import Dict
import argparse, tempfile, os
@@ -117,12 +118,14 @@ if __name__ == "__main__":
if not weights_fn:
weights_url = args.weights_url if args.weights_url else default_weights_url
weights_fn = fetch(weights_url, os.path.basename(str(weights_url)))
load_state_dict(model, safe_load(weights_fn), strict=False)
if args.fp16:
for k,v in get_state_dict(model).items():
if k.startswith("model"):
v.replace(v.cast(dtypes.float16).realize())
with WallTimeEvent(BenchEvent.LOAD_WEIGHTS):
load_state_dict(model, safe_load(weights_fn), strict=False)
if args.fp16:
for k,v in get_state_dict(model).items():
if k.startswith("model"):
v.replace(v.cast(dtypes.float16).realize())
c = { "crossattn": model.cond_stage_model(args.prompt) }
uc = { "crossattn": model.cond_stage_model("") }

View File

@@ -9,6 +9,7 @@ from tinygrad.nn.state import safe_load, load_state_dict, get_state_dict
from tinygrad.helpers import fetch, trange, colored, Timing
from extra.models.clip import Embedder, FrozenClosedClipEmbedder, FrozenOpenClipEmbedder
from extra.models.unet import UNetModel, Upsample, Downsample, timestep_embedding
from extra.bench_log import BenchEvent, WallTimeEvent
from examples.stable_diffusion import ResnetBlock, Mid
import numpy as np
@@ -346,17 +347,18 @@ class DPMPP2MSampler:
for i in trange(num_sigmas - 1):
with Timing("step in ", enabled=timing, on_exit=lambda _: f", using {GlobalCounters.mem_used/1e9:.2f} GB"):
GlobalCounters.reset()
x, old_denoised = self.sampler_step(
old_denoised=old_denoised,
prev_sigma=(None if i==0 else sigmas[i-1].expand(x.shape[0])),
sigma=sigmas[i].expand(x.shape[0]),
next_sigma=sigmas[i+1].expand(x.shape[0]),
denoiser=denoiser,
x=x,
c=c,
uc=uc,
)
x.realize(old_denoised)
with WallTimeEvent(BenchEvent.STEP):
x, old_denoised = self.sampler_step(
old_denoised=old_denoised,
prev_sigma=(None if i==0 else sigmas[i-1].expand(x.shape[0])),
sigma=sigmas[i].expand(x.shape[0]),
next_sigma=sigmas[i+1].expand(x.shape[0]),
denoiser=denoiser,
x=x,
c=c,
uc=uc,
)
x.realize(old_denoised)
return x
@@ -388,7 +390,8 @@ if __name__ == "__main__":
start_mem_used = GlobalCounters.mem_used
with Timing("loaded weights in ", lambda et_ns: f", {(B:=(GlobalCounters.mem_used-start_mem_used))/1e9:.2f} GB loaded at {B/et_ns:.2f} GB/s"):
Tensor.realize(*loaded_weights)
with WallTimeEvent(BenchEvent.LOAD_WEIGHTS):
Tensor.realize(*loaded_weights)
del loaded_weights
N = 1

View File

@@ -14,6 +14,7 @@ from tinygrad.nn import Conv2d, GroupNorm
from tinygrad.nn.state import torch_load, load_state_dict, get_state_dict
from extra.models.clip import Closed, Tokenizer
from extra.models.unet import UNetModel
from extra.bench_log import BenchEvent, WallTimeEvent
class AttnBlock:
def __init__(self, in_channels):
@@ -232,12 +233,13 @@ if __name__ == "__main__":
model = StableDiffusion()
# load in weights
load_state_dict(model, torch_load(fetch('https://huggingface.co/CompVis/stable-diffusion-v-1-4-original/resolve/main/sd-v1-4.ckpt', 'sd-v1-4.ckpt'))['state_dict'], strict=False)
with WallTimeEvent(BenchEvent.LOAD_WEIGHTS):
load_state_dict(model, torch_load(fetch('https://huggingface.co/CompVis/stable-diffusion-v-1-4-original/resolve/main/sd-v1-4.ckpt', 'sd-v1-4.ckpt'))['state_dict'], strict=False)
if args.fp16:
for k,v in get_state_dict(model).items():
if k.startswith("model"):
v.replace(v.cast(dtypes.float16).realize())
if args.fp16:
for k,v in get_state_dict(model).items():
if k.startswith("model"):
v.replace(v.cast(dtypes.float16).realize())
# run through CLIP to get context
tokenizer = Tokenizer.ClipTokenizer()
@@ -270,9 +272,10 @@ if __name__ == "__main__":
GlobalCounters.reset()
t.set_description("%3d %3d" % (index, timestep))
with Timing("step in ", enabled=args.timing, on_exit=lambda _: f", using {GlobalCounters.mem_used/1e9:.2f} GB"):
tid = Tensor([index])
latent = run(model, unconditional_context, context, latent, Tensor([timestep]), alphas[tid], alphas_prev[tid], Tensor([args.guidance]))
if args.timing: Device[Device.DEFAULT].synchronize()
with WallTimeEvent(BenchEvent.STEP):
tid = Tensor([index])
latent = run(model, unconditional_context, context, latent, Tensor([timestep]), alphas[tid], alphas_prev[tid], Tensor([args.guidance]))
if args.timing: Device[Device.DEFAULT].synchronize()
del run
# upsample latent space to image with autoencoder

extra/bench_log.py (new file, 108 lines)
View File

@@ -0,0 +1,108 @@
import time, atexit, uuid
from enum import Enum
from tinygrad.device import Device
from tinygrad.helpers import DEBUG, ContextVar, getenv, GlobalCounters
BENCHMARK_LOG = ContextVar("BENCHMARK_LOG", "")
if BENCHMARK_LOG:
from influxdb_client_3 import InfluxDBClient3, Point, WriteOptions, write_client_options
from influxdb_client_3.write_client.client.write_api import WriteType
class BenchEvent(Enum):
LOAD_WEIGHTS = "load_weights"
STEP = "step"
FULL = "full"
class InstantBenchEvent(Enum):
GFLOPS = "gflops"
_events = {}
def clear_events():
for event in BenchEvent:
_events[event] = {"wall": [], "kernel": []}
for event in InstantBenchEvent:
_events[event] = []
clear_events()
class WallTimeEvent:
def __init__(self, event:BenchEvent):
self.event = event
def __enter__(self):
self.start = time.monotonic()
return self
def __exit__(self, *_):
_events[self.event]["wall"].append(time.monotonic() - self.start)
return False
class KernelTimeEvent:
def __init__(self, event:BenchEvent):
if DEBUG < 2:
raise Exception("KernelTimeEvent should only be used with DEBUG >= 2")
self.event = event
def __enter__(self):
self.start = GlobalCounters.time_sum_s
return self
def __exit__(self, *_):
_events[self.event]["kernel"].append(GlobalCounters.time_sum_s - self.start)
return False
def log_event_instant(event:InstantBenchEvent, value:float):
_events[event].append(value)
if BENCHMARK_LOG:
INFLUXDB_HOST = getenv("INFLUXDB_HOST", "")
INFLUXDB_ORG = getenv("INFLUXDB_ORG", "tiny")
INFLUXDB_TOKEN = getenv("INFLUXDB_TOKEN", "")
def _create_point(run_id, i, attempt, ref, commit, name, value, run):
point = Point(BENCHMARK_LOG.value).tag("id", run_id).tag("index", i)
point = point.tag("device", Device.DEFAULT)
point = point.tag("attempt", attempt).tag("ref", ref).tag("commit", commit)
point = point.field(name, value).field("x", run)
return point
@atexit.register
def write_events():
# see if there are any events to write
have_events = False
for event in _events:
if isinstance(event, BenchEvent):
for event_type, values in _events[event].items():
if len(values) > 0:
have_events = True
else:
if len(_events[event]) > 0:
have_events = True
if not have_events:
return
# pull from github envvars
ref = getenv("GITHUB_REF_NAME", "")
commit = getenv("GITHUB_SHA", "")
run = getenv("GITHUB_RUN_NUMBER", "")
attempt = getenv("GITHUB_RUN_ATTEMPT", "")
points = []
for event in _events:
run_id = str(uuid.uuid4())
if isinstance(event, BenchEvent):
for event_type, values in _events[event].items():
for i, value in enumerate(values):
point = _create_point(run_id, i, attempt, ref, commit, f"{event.value}_{event_type}", value, run)
points.append(point)
else:
for i, value in enumerate(_events[event]):
point = _create_point(run_id, i, attempt, ref, commit, event.value, value, run)
points.append(point)
write_options = WriteOptions(write_type=WriteType.synchronous, retry_interval=5000, max_retries=5, max_retry_delay=30000, exponential_base=2)
wco = write_client_options(write_options=write_options)
with InfluxDBClient3(
host=INFLUXDB_HOST,
org=INFLUXDB_ORG,
token=INFLUXDB_TOKEN,
auth_scheme="Basic",
database="benchmarks",
write_client_options=wco) as client:
client.write(points)
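
End to end, a script only wraps its hot sections; tagging with device/ref/commit/attempt, batching into points, and the atexit flush all happen in this file. A hedged usage sketch (hypothetical script; the write at exit needs BENCHMARK_LOG set, plus INFLUXDB_HOST and INFLUXDB_TOKEN in the environment):

from extra.bench_log import BenchEvent, InstantBenchEvent, WallTimeEvent, log_event_instant
from tinygrad import Tensor

with WallTimeEvent(BenchEvent.FULL):              # whole-run wall time
  for _ in range(10):
    with WallTimeEvent(BenchEvent.STEP):          # per-step wall time
      Tensor.rand(64, 64).sum().realize()         # stand-in for one real step
log_event_instant(InstantBenchEvent.GFLOPS, 123.4)  # the value here is made up
# at interpreter exit, write_events() turns each recorded value into one point in the
# measurement named by BENCHMARK_LOG (fields like "step_wall" / "full_wall"; instant
# events log under their own name, e.g. "gflops"); KernelTimeEvent behaves the same
# way but requires DEBUG >= 2 and logs "<event>_kernel"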

View File

@@ -73,7 +73,8 @@ setup(name='tinygrad',
"capstone",
"pycocotools",
"boto3",
"pandas"
"pandas",
"influxdb3-python"
],
'docs': [
"mkdocs",

View File

@@ -7,6 +7,7 @@ from tinygrad import Tensor, dtypes, TinyJit
from tinygrad.helpers import IMAGE, GlobalCounters, fetch, colored, getenv, trange
from tinygrad.tensor import _from_np_dtype
import numpy as np
from extra.bench_log import BenchEvent, WallTimeEvent
OPENPILOT_MODEL = sys.argv[1] if len(sys.argv) > 1 else "https://github.com/commaai/openpilot/raw/v0.9.4/selfdrive/modeld/models/supercombo.onnx"
@@ -33,10 +34,11 @@ if __name__ == "__main__":
for _ in range(20):
GlobalCounters.reset()
st = time.perf_counter_ns()
# Need to cast non-image inputs from numpy, this is only realistic way to run model
inputs = {**{k:v for k,v in new_inputs_junk.items() if 'img' in k},
**{k:Tensor(v) for k,v in new_inputs_junk_numpy.items() if 'img' not in k}}
ret = next(iter(run_onnx_jit(**inputs).values())).cast(dtypes.float32).numpy()
with WallTimeEvent(BenchEvent.STEP):
# Need to cast non-image inputs from numpy; this is the only realistic way to run the model
inputs = {**{k:v for k,v in new_inputs_junk.items() if 'img' in k},
**{k:Tensor(v) for k,v in new_inputs_junk_numpy.items() if 'img' not in k}}
ret = next(iter(run_onnx_jit(**inputs).values())).cast(dtypes.float32).numpy()
print(f"jitted: {(time.perf_counter_ns() - st)*1e-6:7.4f} ms")
suffix = ""

View File

@@ -0,0 +1,103 @@
import unittest, time
from extra.bench_log import BenchEvent, InstantBenchEvent, WallTimeEvent, KernelTimeEvent, log_event_instant, _events, clear_events
from tinygrad.helpers import Context
from tinygrad.tensor import Tensor
class TestBenchLog(unittest.TestCase):
def setUp(self):
clear_events()
def test_log_single_wall_time(self):
for event in BenchEvent:
with WallTimeEvent(event):
time.sleep(0.1)
# check event list
for event in BenchEvent:
self.assertEqual(len(_events[event]["wall"]), 1)
self.assertGreater(_events[event]["wall"][0], 0)
def test_log_double_wall_time(self):
for event in BenchEvent:
with WallTimeEvent(event):
time.sleep(0.1)
for event in reversed(BenchEvent):
with WallTimeEvent(event):
time.sleep(0.2)
# check event list
for event in BenchEvent:
self.assertEqual(len(_events[event]["wall"]), 2)
self.assertGreater(_events[event]["wall"][0], 0)
self.assertGreater(_events[event]["wall"][1], 0)
def test_log_single_kernel_time(self):
wall_times = []
with Context(DEBUG=2):
for event in BenchEvent:
with KernelTimeEvent(event):
st = time.perf_counter()
Tensor.rand(32, 32).sum().realize().item()
wall_times.append(time.perf_counter() - st)
# check event list
for event in BenchEvent:
self.assertEqual(len(_events[event]["kernel"]), 1)
self.assertLess(_events[event]["kernel"][0], wall_times[0])
self.assertGreater(_events[event]["kernel"][0], 0)
def test_interleaved_wall_kernel_time(self):
wall_times = []
with Context(DEBUG=2):
for event in BenchEvent:
with KernelTimeEvent(event):
st = time.perf_counter()
Tensor.rand(32, 32).sum().realize().item()
wall_times.append(time.perf_counter() - st)
with WallTimeEvent(event):
st = time.perf_counter()
Tensor.rand(32, 32).sum().realize().item()
wall_times.append(time.perf_counter() - st)
# check event list
for event in BenchEvent:
self.assertEqual(len(_events[event]["wall"]), 1)
self.assertEqual(len(_events[event]["kernel"]), 1)
self.assertLess(_events[event]["kernel"][0], wall_times[0])
self.assertGreater(_events[event]["kernel"][0], 0)
def test_stacked_wall_kernel_time(self):
with Context(DEBUG=2):
for event in BenchEvent:
with KernelTimeEvent(event):
with WallTimeEvent(event):
Tensor.rand(32, 32).sum().realize().item()
for event in BenchEvent:
with WallTimeEvent(event):
with KernelTimeEvent(event):
Tensor.rand(32, 32).sum().realize().item()
for event in BenchEvent:
self.assertEqual(len(_events[event]["wall"]), 2)
self.assertEqual(len(_events[event]["kernel"]), 2)
self.assertLess(_events[event]["kernel"][0], _events[event]["wall"][0])
self.assertGreater(_events[event]["kernel"][0], 0)
self.assertLess(_events[event]["kernel"][1], _events[event]["wall"][1])
self.assertGreater(_events[event]["kernel"][1], 0)
def test_log_instant_event(self):
for event in InstantBenchEvent:
log_event_instant(event, 1000)
# check event list
for event in InstantBenchEvent:
self.assertEqual(len(_events[event]), 1)
self.assertEqual(_events[event][0], 1000)
if __name__ == '__main__':
unittest.main()