diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 3ed6348fe4..cc7c9551a9 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -52,14 +52,14 @@ jobs:
     - name: reset process replay
       run: python3.11 test/external/process_replay/reset.py
     - name: Run Stable Diffusion
-      run: JIT=1 python3.11 examples/stable_diffusion.py --fp16 --seed 0 --noshow --timing | tee sd.txt
+      run: BENCHMARK_LOG=stable_diffusion JIT=1 python3.11 examples/stable_diffusion.py --fp16 --seed 0 --noshow --timing | tee sd.txt
     - name: Run Stable Diffusion without fp16
-      run: JIT=1 python3.11 examples/stable_diffusion.py --seed 0 --noshow --timing | tee sd_no_fp16.txt
+      run: BENCHMARK_LOG=stable_diffusion_fp32 JIT=1 python3.11 examples/stable_diffusion.py --seed 0 --noshow --timing | tee sd_no_fp16.txt
     - name: Run Stable Diffusion v2
-      run: JIT=1 python3.11 examples/sdv2.py --fp16 --seed 0 --noshow --timing | tee sdv2.txt
+      run: BENCHMARK_LOG=stable_diffusion_v2 JIT=1 python3.11 examples/sdv2.py --fp16 --seed 0 --noshow --timing | tee sdv2.txt
     # process replay can't capture this, the graph is too large
     - name: Run SDXL
-      run: CAPTURE_PROCESS_REPLAY=0 JIT=1 python3.11 examples/sdxl.py --seed 0 --noshow --timing | tee sdxl.txt
+      run: BENCHMARK_LOG=stable_diffusion_xl CAPTURE_PROCESS_REPLAY=0 JIT=1 python3.11 examples/sdxl.py --seed 0 --noshow --timing | tee sdxl.txt
     - name: Run model inference benchmark
       run: METAL=1 python3.11 test/external/external_model_benchmark.py
     - name: Test speed vs torch
@@ -80,40 +80,40 @@ jobs:
       run: METAL=1 M_START=6 M_STOP=10 M_STEP=1 N_START=6 N_STOP=10 N_STEP=1 K_START=6 K_STOP=24 K_STEP=1 TC_OPT=2 DEBUG=2 python3.11 ./extra/gemm/fuzz_matmul.py
     - name: Run LLaMA
       run: |
-        JIT=0 python3.11 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_unjitted.txt
-        JIT=1 python3.11 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_jitted.txt
+        BENCHMARK_LOG=llama_nojit JIT=0 python3.11 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_unjitted.txt
+        BENCHMARK_LOG=llama JIT=1 python3.11 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_jitted.txt
     - name: Run LLaMA with BEAM
-      run: JITBEAM=2 IGNORE_BEAM_CACHE=1 python3.11 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_beam.txt
+      run: BENCHMARK_LOG=llama_beam JITBEAM=2 IGNORE_BEAM_CACHE=1 python3.11 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_beam.txt
     - name: Run quantized LLaMA
       run: |
-        python3.11 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing --quantize int8 | tee llama_int8.txt
-        python3.11 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing --quantize nf4 | tee llama_nf4.txt
+        BENCHMARK_LOG=llama_int8 python3.11 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing --quantize int8 | tee llama_int8.txt
+        BENCHMARK_LOG=llama_nf4 python3.11 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing --quantize nf4 | tee llama_nf4.txt
     - name: Run quantized LLaMA3
       run: |
-        python3.11 examples/llama3.py --size 8B --temperature 0 --benchmark --quantize int8 | tee llama3_int8.txt
-        python3.11 examples/llama3.py --size 8B --temperature 0 --benchmark --quantize nf4 | tee llama3_nf4.txt
+        BENCHMARK_LOG=llama3_int8 python3.11 examples/llama3.py --size 8B --temperature 0 --benchmark --quantize int8 | tee llama3_int8.txt
+        BENCHMARK_LOG=llama3_nf4 python3.11 examples/llama3.py --size 8B --temperature 0 --benchmark --quantize nf4 | tee llama3_nf4.txt
     #- name: Run LLaMA 7B on 4 (virtual) GPUs
     #  run: python3.11 examples/llama.py --gen 1 --size 7B --shard 4 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_four_gpu.txt
     - name: Run GPT2
       run: |
-        JIT=0 python3.11 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_unjitted.txt
-        JIT=1 python3.11 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_jitted.txt
+        BENCHMARK_LOG=gpt2_nojit JIT=0 python3.11 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_unjitted.txt
+        BENCHMARK_LOG=gpt2 JIT=1 python3.11 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_jitted.txt
     - name: Run GPT2 w HALF
-      run: HALF=1 python3.11 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half.txt
+      run: BENCHMARK_LOG=gpt2_half HALF=1 python3.11 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half.txt
     - name: Run GPT2 w HALF/BEAM
-      run: HALF=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3.11 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt
+      run: BENCHMARK_LOG=gpt2_half_beam HALF=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3.11 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt
     - name: Run OLMoE
-      run: python3.11 examples/olmoe.py
+      run: BENCHMARK_LOG=olmoe python3.11 examples/olmoe.py
     - name: Train MNIST
       run: time PYTHONPATH=. TARGET_EVAL_ACC_PCT=96.0 python3.11 examples/beautiful_mnist.py | tee beautiful_mnist.txt
     - name: Run 10 CIFAR training steps
-      run: JIT=1 STEPS=10 python3.11 examples/hlb_cifar10.py | tee train_cifar.txt
+      run: BENCHMARK_LOG=cifar_10steps JIT=1 STEPS=10 python3.11 examples/hlb_cifar10.py | tee train_cifar.txt
     - name: Run 10 CIFAR training steps w HALF
-      run: JIT=2 STEPS=10 DEFAULT_FLOAT=HALF python3.11 examples/hlb_cifar10.py | tee train_cifar_half.txt
+      run: BENCHMARK_LOG=cifar_10steps_half JIT=2 STEPS=10 DEFAULT_FLOAT=HALF python3.11 examples/hlb_cifar10.py | tee train_cifar_half.txt
     #- name: Run 10 CIFAR training steps w BF16
     #  run: STEPS=10 DEFAULT_FLOAT=BFLOAT16 python3.11 examples/hlb_cifar10.py | tee train_cifar_bf16.txt
     - name: Run 10 CIFAR training steps w winograd
-      run: JIT=1 WINO=1 STEPS=10 python3.11 examples/hlb_cifar10.py | tee train_cifar_wino.txt
+      run: BENCHMARK_LOG=cifar_10steps_wino JIT=1 WINO=1 STEPS=10 python3.11 examples/hlb_cifar10.py | tee train_cifar_wino.txt
     - name: UsbGPU boot time
       run: sudo -E PYTHONPATH=. DEBUG=2 AM_RESET=1 AMD=1 AMD_IFACE=USB time python3.11 test/test_tiny.py TestTiny.test_plus
     - name: UsbGPU tiny tests
@@ -210,37 +210,37 @@ jobs:
     - name: Test CUDA=1
      run: DEBUG=2 CUDA=1 python -m pytest -rA test/test_tiny.py
     - name: Run Stable Diffusion
-      run: NV=1 python3 examples/stable_diffusion.py --fp16 --seed 0 --noshow --timing | tee sd.txt
+      run: BENCHMARK_LOG=stable_diffusion NV=1 python3 examples/stable_diffusion.py --fp16 --seed 0 --noshow --timing | tee sd.txt
     - name: Run SDXL
-      run: CAPTURE_PROCESS_REPLAY=0 NV=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/sdxl.py --seed 0 --noshow --timing | tee sdxl.txt
+      run: BENCHMARK_LOG=stable_diffusion_xl CAPTURE_PROCESS_REPLAY=0 NV=1 python3 examples/sdxl.py --seed 0 --noshow --timing | tee sdxl.txt
     - name: Run LLaMA
       run: |
-        NV=1 JIT=0 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_unjitted.txt
-        NV=1 JIT=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_jitted.txt
+        BENCHMARK_LOG=llama_nojit NV=1 JIT=0 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_unjitted.txt
+        BENCHMARK_LOG=llama NV=1 JIT=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_jitted.txt
     - name: Run LLaMA with BEAM
-      run: NV=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_beam.txt
+      run: BENCHMARK_LOG=llama_beam NV=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_beam.txt
     # - name: Run LLaMA 7B on 4 GPUs
     #   run: NV=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama.py --gen 1 --size 7B --shard 4 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_four_gpu.txt
     # - name: Run LLaMA 7B on 6 GPUs
     #   run: NV=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama.py --gen 1 --size 7B --shard 6 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_six_gpu.txt
     - name: Run LLaMA-3 8B BEAM
-      run: NV=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama3.py --size 8B --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_beam.txt
+      run: BENCHMARK_LOG=llama3_beam NV=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama3.py --size 8B --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_beam.txt
     - name: Run LLaMA-3 8B on 4 GPUs with BEAM
-      run: NV=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 4 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_four_gpu.txt
+      run: BENCHMARK_LOG=llama3_beam_4gpu NV=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 4 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_four_gpu.txt
     # - name: Run LLaMA-3 8B on 6 GPUs
     #   run: NV=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 6 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_six_gpu.txt
     # - name: Run LLaMA-2 70B
     #   run: NV=1 CAPTURE_PROCESS_REPLAY=0 MAX_CONTEXT=256 python3 examples/llama.py --gen 2 --size 70B --shard 6 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_2_70B.txt
     - name: Run Mixtral 8x7B
-      run: time NV=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/mixtral.py --temperature 0 --count 10 --timing | tee mixtral.txt
+      run: time BENCHMARK_LOG=mixtral NV=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/mixtral.py --temperature 0 --count 10 --timing | tee mixtral.txt
     - name: Run GPT2
       run: |
-        NV=1 JIT=0 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_unjitted.txt
-        NV=1 JIT=1 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_jitted.txt
+        BENCHMARK_LOG=gpt2_nojit NV=1 JIT=0 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_unjitted.txt
+        BENCHMARK_LOG=gpt2 NV=1 JIT=1 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_jitted.txt
     - name: Run GPT2 w HALF
-      run: NV=1 HALF=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half.txt
+      run: BENCHMARK_LOG=gpt2_half NV=1 HALF=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half.txt
     - name: Run GPT2 w HALF/BEAM
-      run: NV=1 HALF=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt
+      run: BENCHMARK_LOG=gpt2_half_beam NV=1 HALF=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt
     - uses: actions/upload-artifact@v4
       with:
         name: Speed (NVIDIA)
@@ -304,26 +304,26 @@ jobs:
     - name: Train MNIST
       run: time PYTHONPATH=. NV=1 TARGET_EVAL_ACC_PCT=96.0 python3 examples/beautiful_mnist.py | tee beautiful_mnist.txt
     - name: Run 10 CIFAR training steps
-      run: NV=1 STEPS=10 python3 examples/hlb_cifar10.py | tee train_cifar.txt
+      run: BENCHMARK_LOG=cifar_10steps NV=1 STEPS=10 python3 examples/hlb_cifar10.py | tee train_cifar.txt
     - name: Run 10 CIFAR training steps w HALF
-      run: NV=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_half.txt
+      run: BENCHMARK_LOG=cifar_10steps_half NV=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_half.txt
     - name: Run 10 CIFAR training steps w BF16
-      run: NV=1 STEPS=10 DEFAULT_FLOAT=BFLOAT16 python3 examples/hlb_cifar10.py | tee train_cifar_bf16.txt
+      run: BENCHMARK_LOG=cifar_10steps_bf16 NV=1 STEPS=10 DEFAULT_FLOAT=BFLOAT16 python3 examples/hlb_cifar10.py | tee train_cifar_bf16.txt
     - name: Run 10 CIFAR training steps w winograd
-      run: NV=1 CAPTURE_PROCESS_REPLAY=0 WINO=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_wino.txt
+      run: BENCHMARK_LOG=cifar_10steps_half_wino NV=1 CAPTURE_PROCESS_REPLAY=0 WINO=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_wino.txt
     - name: Run full CIFAR training w 1 GPU
-      run: time NV=1 DEFAULT_FLOAT=HALF LATEWINO=1 STEPS=1000 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee train_cifar_one_gpu.txt
+      run: time BENCHMARK_LOG=cifar NV=1 DEFAULT_FLOAT=HALF LATEWINO=1 STEPS=1000 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee train_cifar_one_gpu.txt
     - name: Run full CIFAR training steps w 6 GPUS
-      run: time CAPTURE_PROCESS_REPLAY=0 NV=1 DEFAULT_FLOAT=HALF STEPS=350 BS=1536 GPUS=6 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee train_cifar_six_gpu.txt
+      run: time BENCHMARK_LOG=cifar_6gpu CAPTURE_PROCESS_REPLAY=0 NV=1 DEFAULT_FLOAT=HALF STEPS=350 BS=1536 GPUS=6 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee train_cifar_six_gpu.txt
     - name: Run MLPerf resnet eval on training data
-      run: time NV=1 MODEL=resnet python3 examples/mlperf/model_eval.py
+      run: time BENCHMARK_LOG=resnet_eval NV=1 MODEL=resnet python3 examples/mlperf/model_eval.py
     - name: Run 10 MLPerf ResNet50 training steps (1 gpu)
-      run: NV=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=256 GPUS=1 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet_one_gpu.txt
+      run: BENCHMARK_LOG=resnet_10steps NV=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=256 GPUS=1 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet_one_gpu.txt
     - name: Run 10 MLPerf ResNet50 training steps (6 gpu)
-      run: NV=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=1536 GPUS=6 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet.txt
+      run: BENCHMARK_LOG=resnet_10steps_6gpu NV=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=1536 GPUS=6 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet.txt
     - name: Run 10 MLPerf Bert training steps (6 gpu)
       # TODO: remove BERT_LAYERS once scheduler is fast
-      run: NV=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=66 GPUS=6 BERT_LAYERS=2 FUSE_ARANGE=1 FUSE_ARANGE_UINT=0 MODEL=bert python3 examples/mlperf/model_train.py | tee train_bert.txt
+      run: BENCHMARK_LOG=bert_10steps_6gpu NV=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=66 GPUS=6 BERT_LAYERS=2 FUSE_ARANGE=1 FUSE_ARANGE_UINT=0 MODEL=bert python3 examples/mlperf/model_train.py | tee train_bert.txt
     - uses: actions/upload-artifact@v4
       with:
         name: Speed (NVIDIA Training)
@@ -409,23 +409,23 @@ jobs:
     - name: Test AM warm start time
       run: time AMD=1 python3 test/test_tiny.py TestTiny.test_plus
     - name: Run Stable Diffusion
-      run: AMD=1 python3 examples/stable_diffusion.py --fp16 --seed 0 --noshow --timing | tee sd.txt
+      run: BENCHMARK_LOG=stable_diffusion AMD=1 python3 examples/stable_diffusion.py --fp16 --seed 0 --noshow --timing | tee sd.txt
     - name: Run SDXL
-      run: CAPTURE_PROCESS_REPLAY=0 AMD=1 python3 examples/sdxl.py --seed 0 --noshow --timing | tee sdxl.txt
+      run: BENCHMARK_LOG=stable_diffusion_xl CAPTURE_PROCESS_REPLAY=0 AMD=1 python3 examples/sdxl.py --seed 0 --noshow --timing | tee sdxl.txt
     - name: Run LLaMA 7B
       run: |
-        AMD=1 JIT=0 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_unjitted.txt
-        AMD=1 JIT=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_jitted.txt
+        BENCHMARK_LOG=llama_nojit AMD=1 JIT=0 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_unjitted.txt
+        BENCHMARK_LOG=llama AMD=1 JIT=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_jitted.txt
     - name: Run LLaMA 7B with BEAM
-      run: AMD=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_beam.txt
+      run: BENCHMARK_LOG=llama_beam AMD=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_beam.txt
     # - name: Run LLaMA 7B on 4 GPUs
     #   run: AMD=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama.py --gen 1 --size 7B --shard 4 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_four_gpu.txt
     # - name: Run LLaMA 7B on 6 GPUs
     #   run: AMD=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama.py --gen 1 --size 7B --shard 6 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_six_gpu.txt
     - name: Run LLaMA-3 8B BEAM
-      run: AMD=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama3.py --size 8B --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_beam.txt
+      run: BENCHMARK_LOG=llama3_beam AMD=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama3.py --size 8B --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_beam.txt
     - name: Run LLaMA-3 8B on 4 GPUs with BEAM
-      run: AMD=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 4 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_four_gpu.txt
+      run: BENCHMARK_LOG=llama3_beam_4gpu AMD=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 4 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_four_gpu.txt
     # - name: Run LLaMA-3 8B on 6 GPUs
     #   run: AMD=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 6 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_six_gpu.txt
     - name: Restore amdgpu
@@ -433,15 +433,15 @@ jobs:
     # - name: Run LLaMA-2 70B
     #   run: AMD=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama.py --gen 2 --size 70B --shard 6 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_2_70B.txt
     - name: Run Mixtral 8x7B
-      run: time AMD=1 python3 examples/mixtral.py --temperature 0 --count 10 --timing | tee mixtral.txt
+      run: time BENCHMARK_LOG=mixtral AMD=1 python3 examples/mixtral.py --temperature 0 --count 10 --timing | tee mixtral.txt
     - name: Run GPT2
       run: |
-        AMD=1 JIT=0 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_unjitted.txt
-        AMD=1 JIT=1 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_jitted.txt
+        BENCHMARK_LOG=gpt2_nojit AMD=1 JIT=0 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_unjitted.txt
+        BENCHMARK_LOG=gpt2 AMD=1 JIT=1 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_jitted.txt
     - name: Run GPT2 w HALF
-      run: AMD=1 HALF=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half.txt
+      run: BENCHMARK_LOG=gpt2_half AMD=1 HALF=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half.txt
     - name: Run GPT2 w HALF/BEAM
-      run: AMD=1 HALF=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt
+      run: BENCHMARK_LOG=gpt2_half_beam AMD=1 HALF=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt
     - uses: actions/upload-artifact@v4
       with:
         name: Speed (AMD)
@@ -500,26 +500,26 @@ jobs:
     - name: Train MNIST
       run: time PYTHONPATH=. AMD=1 TARGET_EVAL_ACC_PCT=96.0 python3 examples/beautiful_mnist.py | tee beautiful_mnist.txt
     - name: Run 10 CIFAR training steps
-      run: AMD=1 STEPS=10 python3 examples/hlb_cifar10.py | tee train_cifar.txt
+      run: BENCHMARK_LOG=cifar_10steps AMD=1 STEPS=10 python3 examples/hlb_cifar10.py | tee train_cifar.txt
     - name: Run 10 CIFAR training steps w HALF
-      run: AMD=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_half.txt
+      run: BENCHMARK_LOG=cifar_10steps_half AMD=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_half.txt
     - name: Run 10 CIFAR training steps w BF16
-      run: AMD=1 STEPS=10 DEFAULT_FLOAT=BFLOAT16 python3 examples/hlb_cifar10.py | tee train_cifar_bf16.txt
+      run: BENCHMARK_LOG=cifar_10steps_bf16 AMD=1 STEPS=10 DEFAULT_FLOAT=BFLOAT16 python3 examples/hlb_cifar10.py | tee train_cifar_bf16.txt
     - name: Run 10 CIFAR training steps w winograd
-      run: AMD=1 WINO=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_wino.txt
+      run: BENCHMARK_LOG=cifar_10steps_half_wino AMD=1 WINO=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_wino.txt
     - name: Run full CIFAR training w 1 GPU
-      run: time AMD=1 DEFAULT_FLOAT=HALF LATEWINO=1 STEPS=1000 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee train_cifar_one_gpu.txt
+      run: time BENCHMARK_LOG=cifar AMD=1 DEFAULT_FLOAT=HALF LATEWINO=1 STEPS=1000 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee train_cifar_one_gpu.txt
     - name: Run full CIFAR training steps w 6 GPUS
-      run: time AMD=1 DEFAULT_FLOAT=HALF STEPS=350 BS=1536 GPUS=6 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee train_cifar_six_gpu.txt
+      run: time BENCHMARK_LOG=cifar_6gpu AMD=1 DEFAULT_FLOAT=HALF STEPS=350 BS=1536 GPUS=6 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee train_cifar_six_gpu.txt
     - name: Run MLPerf resnet eval
-      run: time AMD=1 MODEL=resnet python3 examples/mlperf/model_eval.py
+      run: time BENCHMARK_LOG=resnet_eval AMD=1 MODEL=resnet python3 examples/mlperf/model_eval.py
     - name: Run 10 MLPerf ResNet50 training steps (1 gpu)
-      run: AMD=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=256 GPUS=1 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet_one_gpu.txt
+      run: BENCHMARK_LOG=resnet_10steps AMD=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=256 GPUS=1 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet_one_gpu.txt
     - name: Run 10 MLPerf ResNet50 training steps (6 gpu)
-      run: AMD=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=1536 GPUS=6 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet.txt
+      run: BENCHMARK_LOG=resnet_10steps_6gpu AMD=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=1536 GPUS=6 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet.txt
     - name: Run 10 MLPerf Bert training steps (6 gpu)
       # TODO: remove BERT_LAYERS once scheduler is fast
-      run: AMD=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=66 GPUS=6 BERT_LAYERS=2 FUSE_ARANGE=1 FUSE_ARANGE_UINT=0 MODEL=bert python3 examples/mlperf/model_train.py | tee train_bert.txt
+      run: BENCHMARK_LOG=bert_10steps_6gpu AMD=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=66 GPUS=6 BERT_LAYERS=2 FUSE_ARANGE=1 FUSE_ARANGE_UINT=0 MODEL=bert python3 examples/mlperf/model_train.py | tee train_bert.txt
     - uses: actions/upload-artifact@v4
       with:
         name: Speed (AMD Training)
@@ -558,13 +558,13 @@ jobs:
     - name: validate openpilot 0.9.7
       run: PYTHONPATH=. FLOAT16=0 IMAGE=2 QCOM=1 taskset -c 4-7 python3 test/external/external_benchmark_openpilot.py https://github.com/commaai/openpilot/raw/v0.9.7/selfdrive/modeld/models/supercombo.onnx | tee openpilot_image_0_9_7.txt
     - name: benchmark openpilot 0.9.4
-      run: PYTHONPATH=. QCOM=1 taskset -c 4-7 python3 test/external/external_benchmark_openpilot.py https://github.com/commaai/openpilot/raw/v0.9.4/selfdrive/modeld/models/supercombo.onnx | tee openpilot_0_9_4.txt
+      run: BENCHMARK_LOG=openpilot_0_9_4 PYTHONPATH=. QCOM=1 taskset -c 4-7 python3 test/external/external_benchmark_openpilot.py https://github.com/commaai/openpilot/raw/v0.9.4/selfdrive/modeld/models/supercombo.onnx | tee openpilot_0_9_4.txt
     - name: benchmark openpilot 0.9.7
-      run: PYTHONPATH=. QCOM=1 taskset -c 4-7 python3 test/external/external_benchmark_openpilot.py https://github.com/commaai/openpilot/raw/v0.9.7/selfdrive/modeld/models/supercombo.onnx | tee openpilot_0_9_7.txt
+      run: BENCHMARK_LOG=openpilot_0_9_7 PYTHONPATH=. QCOM=1 taskset -c 4-7 python3 test/external/external_benchmark_openpilot.py https://github.com/commaai/openpilot/raw/v0.9.7/selfdrive/modeld/models/supercombo.onnx | tee openpilot_0_9_7.txt
     - name: benchmark openpilot w IMAGE=2 0.9.4
-      run: PYTHONPATH=. NOLOCALS=1 FLOAT16=1 IMAGE=2 QCOM=1 taskset -c 4-7 python3 test/external/external_benchmark_openpilot.py https://github.com/commaai/openpilot/raw/v0.9.4/selfdrive/modeld/models/supercombo.onnx | tee openpilot_image_0_9_4.txt
+      run: BENCHMARK_LOG=openpilot_0_9_4_image PYTHONPATH=. NOLOCALS=1 FLOAT16=1 IMAGE=2 QCOM=1 taskset -c 4-7 python3 test/external/external_benchmark_openpilot.py https://github.com/commaai/openpilot/raw/v0.9.4/selfdrive/modeld/models/supercombo.onnx | tee openpilot_image_0_9_4.txt
     - name: benchmark openpilot w IMAGE=2 0.9.7
-      run: PYTHONPATH=. NOLOCALS=1 FLOAT16=1 IMAGE=2 QCOM=1 taskset -c 4-7 python3 test/external/external_benchmark_openpilot.py https://github.com/commaai/openpilot/raw/v0.9.7/selfdrive/modeld/models/supercombo.onnx | tee openpilot_image_0_9_7.txt
+      run: BENCHMARK_LOG=openpilot_0_9_7_image PYTHONPATH=. NOLOCALS=1 FLOAT16=1 IMAGE=2 QCOM=1 taskset -c 4-7 python3 test/external/external_benchmark_openpilot.py https://github.com/commaai/openpilot/raw/v0.9.7/selfdrive/modeld/models/supercombo.onnx | tee openpilot_image_0_9_7.txt
     - name: openpilot compile3 0.9.7
       run: PYTHONPATH="." QCOM=1 taskset -c 4-7 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/v0.9.7/selfdrive/modeld/models/supercombo.onnx
     - name: openpilot compile3 0.9.7+ tomb raider
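The workflow changes above are mechanical: each benchmark invocation gets a `BENCHMARK_LOG=<name>` prefix, and that name becomes the InfluxDB measurement the collected events are written to at process exit (see `extra/bench_log.py` later in this diff). A minimal sketch of the flow, assuming `influxdb3-python` is installed whenever the variable is set, since the module imports the client conditionally at import time:

```python
# sketch: what `BENCHMARK_LOG=gpt2_half python3 examples/gpt2.py ...` does for a run.
# the variable is read once, through tinygrad's ContextVar, when extra/bench_log.py
# is imported; if it is empty, events are still collected but nothing is uploaded.
import os
os.environ["BENCHMARK_LOG"] = "gpt2_half"  # normally set on the workflow's `run:` line

from extra.bench_log import BENCHMARK_LOG, BenchEvent, WallTimeEvent

with WallTimeEvent(BenchEvent.STEP):
  pass  # one unit of benchmark work; the elapsed wall time is queued as a sample

print(BENCHMARK_LOG.value)  # "gpt2_half" -> the measurement name used on upload
```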
diff --git a/examples/gpt2.py b/examples/gpt2.py
index 16fd4966bd..d24afcc590 100644
--- a/examples/gpt2.py
+++ b/examples/gpt2.py
@@ -7,6 +7,7 @@ from tinygrad.ops import UOp
 from tinygrad.helpers import Timing, DEBUG, JIT, getenv, fetch, colored, trange
 from tinygrad.nn import Embedding, Linear, LayerNorm
 from tinygrad.nn.state import gguf_load, torch_load, load_state_dict, get_state_dict
+from extra.bench_log import BenchEvent, WallTimeEvent
 
 MAX_CONTEXT = getenv("MAX_CONTEXT", 128)
 HALF = getenv("HALF")
@@ -134,11 +135,12 @@ class GPT2:
     # lm head and wte are tied
     weights['lm_head.weight'] = weights['wte.weight']
 
-    load_state_dict(model, weights)
+    with WallTimeEvent(BenchEvent.LOAD_WEIGHTS):
+      load_state_dict(model, weights)
 
-    if HALF:
-      for l in get_state_dict(model).values():
-        l.replace(l.half().realize())
+      if HALF:
+        for l in get_state_dict(model).values():
+          l.replace(l.half().realize())
 
     return GPT2(model, tokenizer)
 
@@ -167,7 +169,8 @@ class GPT2:
       return key
     state_dict = { _remap_gguf_key(k): v for k, v in state_dict.items() }
     model = Transformer(**gpt2_params)
-    load_state_dict(model, state_dict)
+    with WallTimeEvent(BenchEvent.LOAD_WEIGHTS):
+      load_state_dict(model, state_dict)
     return GPT2(model, tiktoken.get_encoding("gpt2"))
 
   def __init__(self, model, tokenizer):
@@ -185,11 +188,12 @@ class GPT2:
     with Timing("ran model in ", on_exit=(lambda et: (f", {(GlobalCounters.time_sum_s-st)*1e3:.2f} ms on GPU" if DEBUG>=2 else "")+
       f", {GlobalCounters.global_ops*1e-9:.2f} GOPS, {GlobalCounters.global_mem*1e-9:.2f} GB"+
       (f", {GlobalCounters.global_mem*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s" if DEBUG>=2 else "")) if DEBUG else None, enabled=timing):
-      if batch_size == 1 and len(toks[0][start_pos:]) == 1:
-        tokens = Variable("tokens", 0, VOCAB_SIZE).bind(toks[0][start_pos])
-      else:
-        tokens = Tensor([x[start_pos:] for x in toks])
-      tok = self.model(tokens, Variable("start_pos", 1 if start_pos else 0, MAX_CONTEXT-1).bind(start_pos), temperature).tolist()
+      with WallTimeEvent(BenchEvent.STEP):
+        if batch_size == 1 and len(toks[0][start_pos:]) == 1:
+          tokens = Variable("tokens", 0, VOCAB_SIZE).bind(toks[0][start_pos])
+        else:
+          tokens = Tensor([x[start_pos:] for x in toks])
+        tok = self.model(tokens, Variable("start_pos", 1 if start_pos else 0, MAX_CONTEXT-1).bind(start_pos), temperature).tolist()
     start_pos = len(toks[0])
     for i,t in enumerate(tok): toks[i].append(t)
     return [self.tokenizer.decode(x) for x in toks]
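The same two-event pattern repeats in every example below: `LOAD_WEIGHTS` around state-dict loading and `STEP` around one generation or training step. The context manager itself is deliberately thin; a reduced sketch of what `WallTimeEvent` does (the real class, defined in `extra/bench_log.py` later in this diff, appends into a shared module-level `_events` dict rather than a local list):

```python
import time

# reduced sketch of WallTimeEvent: a context manager that appends one
# wall-clock sample per `with` block into whatever bucket it is given
class WallTimeSketch:
  def __init__(self, bucket:list): self.bucket = bucket
  def __enter__(self):
    self.start = time.monotonic()
    return self
  def __exit__(self, *_):
    self.bucket.append(time.monotonic() - self.start)
    return False  # never swallow exceptions from the timed block

steps = []
for _ in range(3):
  with WallTimeSketch(steps):
    time.sleep(0.01)  # stands in for one model step
print(steps)  # three samples, one per step
```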
diff --git a/examples/hlb_cifar10.py b/examples/hlb_cifar10.py
index d2008ef87d..3dec8bf08d 100644
--- a/examples/hlb_cifar10.py
+++ b/examples/hlb_cifar10.py
@@ -11,6 +11,7 @@ from tinygrad import nn, dtypes, Tensor, Device, GlobalCounters, TinyJit
 from tinygrad.nn.state import get_state_dict, get_parameters
 from tinygrad.nn import optim
 from tinygrad.helpers import Context, BEAM, WINO, getenv, colored, prod
+from extra.bench_log import BenchEvent, WallTimeEvent
 
 cifar_mean = [0.4913997551666284, 0.48215855929893703, 0.4465309133731618]
 cifar_std = [0.24703225141799082, 0.24348516474564, 0.26158783926049628]
@@ -395,20 +396,23 @@ def train_cifar():
       if STEPS == 0 or i == STEPS: break
       GlobalCounters.reset()
-      X, Y = next(batcher)
-      if len(GPUS) > 1:
-        X.shard_(GPUS, axis=0)
-        Y.shard_(GPUS, axis=0)
-      with Context(BEAM=getenv("LATEBEAM", BEAM.value), WINO=getenv("LATEWINO", WINO.value)):
-        loss = train_step_jitted(model, optim.OptimizerGroup(opt_bias, opt_non_bias), [lr_sched_bias, lr_sched_non_bias], X, Y)
-      et = time.monotonic()
-      loss_cpu = loss.numpy()
-      # EMA for network weights
-      if getenv("EMA") and i > hyp['ema']['steps'] and (i+1) % hyp['ema']['every_n_steps'] == 0:
-        if model_ema is None:
-          model_ema = modelEMA(W, model)
-        model_ema.update(model, Tensor([projected_ema_decay_val*(i/STEPS)**hyp['ema']['decay_pow']]))
+      with WallTimeEvent(BenchEvent.STEP):
+        X, Y = next(batcher)
+        if len(GPUS) > 1:
+          X.shard_(GPUS, axis=0)
+          Y.shard_(GPUS, axis=0)
+
+        with Context(BEAM=getenv("LATEBEAM", BEAM.value), WINO=getenv("LATEWINO", WINO.value)):
+          loss = train_step_jitted(model, optim.OptimizerGroup(opt_bias, opt_non_bias), [lr_sched_bias, lr_sched_non_bias], X, Y)
+        et = time.monotonic()
+        loss_cpu = loss.numpy()
+        # EMA for network weights
+        if getenv("EMA") and i > hyp['ema']['steps'] and (i+1) % hyp['ema']['every_n_steps'] == 0:
+          if model_ema is None:
+            model_ema = modelEMA(W, model)
+          model_ema.update(model, Tensor([projected_ema_decay_val*(i/STEPS)**hyp['ema']['decay_pow']]))
+
       cl = time.monotonic()
       device_str = loss.device if isinstance(loss.device, str) else f"{loss.device[0]} * {len(loss.device)}"
       #  53  221.74 ms run,    2.22 ms python,  219.52 ms CL,  803.39 loss, 0.000807 LR, 4.66 GB used,   3042.49 GFLOPS,    674.65 GOPS
@@ -424,4 +428,5 @@ def train_cifar():
       raise ValueError(colored(f"{eval_acc_pct=} < {target}", "red"))
 
 if __name__ == "__main__":
-  train_cifar()
+  with WallTimeEvent(BenchEvent.FULL):
+    train_cifar()
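Note that hlb_cifar10.py uses both granularities: `STEP` inside the training loop and `FULL` around the whole `train_cifar()` call, so one run yields per-iteration samples plus a single end-to-end sample. The events nest freely; a short sketch of the resulting shape:

```python
from extra.bench_log import BenchEvent, WallTimeEvent, _events

with WallTimeEvent(BenchEvent.FULL):      # one sample for the whole run
  for _ in range(10):
    with WallTimeEvent(BenchEvent.STEP):  # one sample per iteration
      pass                                # training step goes here

assert len(_events[BenchEvent.STEP]["wall"]) == 10
assert len(_events[BenchEvent.FULL]["wall"]) == 1
```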
diff --git a/examples/llama.py b/examples/llama.py
index 4dd2455cec..8abdd9df98 100755
--- a/examples/llama.py
+++ b/examples/llama.py
@@ -13,6 +13,7 @@ from extra.models.llama import Transformer, convert_from_huggingface, fix_bf16
 from sentencepiece import SentencePieceProcessor
 import tiktoken, sys
 from tiktoken.load import load_tiktoken_bpe
+from extra.bench_log import BenchEvent, WallTimeEvent
 
 MAX_CONTEXT = getenv("MAX_CONTEXT", 4096)
 
@@ -206,42 +207,43 @@ class LLaMa:
     model = Transformer(**params["args"], linear=linear, max_context=MAX_CONTEXT, jit=bool(JIT))
 
-    if model_path.is_dir():
-      weights = concat_weights([load(filename) for filename in [f"{model_path}/consolidated.{i:02d}.pth" for i in range(params["files"])]], device[0] if isinstance(device, tuple) else device)
-    else:
-      weights = load(str(model_path))
-    if "model.embed_tokens.weight" in weights:
-      weights = convert_from_huggingface(weights, params["args"]["n_layers"], params["args"]["n_heads"], params["args"].get("n_kv_heads", params["args"]["n_heads"]))
+    with WallTimeEvent(BenchEvent.LOAD_WEIGHTS):
+      if model_path.is_dir():
+        weights = concat_weights([load(filename) for filename in [f"{model_path}/consolidated.{i:02d}.pth" for i in range(params["files"])]], device[0] if isinstance(device, tuple) else device)
+      else:
+        weights = load(str(model_path))
+      if "model.embed_tokens.weight" in weights:
+        weights = convert_from_huggingface(weights, params["args"]["n_layers"], params["args"]["n_heads"], params["args"].get("n_kv_heads", params["args"]["n_heads"]))
 
-    weights = fix_bf16(weights)
+      weights = fix_bf16(weights)
 
-    # prevent tracking model weights
-    # this is a part of a larger problem with BUFFER UOps and gc in TRACK_MATCH_STATS=2
-    with Context(BEAM=0, TRACK_MATCH_STATS=0):
-      # quantize
-      if quantize is not None:
-        weights = linear.quantize(weights, device)
-        for _,v in weights.items(): v.realize()
+      # prevent tracking model weights
+      # this is a part of a larger problem with BUFFER UOps and gc in TRACK_MATCH_STATS=2
+      with Context(BEAM=0, TRACK_MATCH_STATS=0):
+        # quantize
+        if quantize is not None:
+          weights = linear.quantize(weights, device)
+          for _,v in weights.items(): v.realize()
 
-      # shard
-      if isinstance(device, tuple):
-        for k,v in nn.state.get_state_dict(model).items():
-          if 'scale' in k: v.shard_(device, axis=None)  # from quantized
-          elif '.attention.' in k:
-            if getenv("SHARD_KVCACHE") and ('.wq.' in k or '.wk.' in k or '.wv.' in k): v.shard_(device, axis=0)
-            else: v.shard_(device, axis=-1)
-          elif '.feed_forward.w1.' in k: v.shard_(device, axis=0)
-          elif '.feed_forward.w3.' in k: v.shard_(device, axis=0)
-          elif '.feed_forward.' in k: v.shard_(device, axis=-1)
-          elif 'tok_embeddings.weight' in k: v.shard_(device, axis=0)
-          elif 'output.weight' in k: v.shard_(device, axis=-1)
-          #elif k.endswith('.weight'): v.shard_(device, axis=-1)
-          #elif 'norm.' in k: v.shard_(device, axis=-1)
-          else: v.shard_(device, axis=None)
-          #print(k, v.shape, v.lazydata.axis)
+        # shard
+        if isinstance(device, tuple):
+          for k,v in nn.state.get_state_dict(model).items():
+            if 'scale' in k: v.shard_(device, axis=None)  # from quantized
+            elif '.attention.' in k:
+              if getenv("SHARD_KVCACHE") and ('.wq.' in k or '.wk.' in k or '.wv.' in k): v.shard_(device, axis=0)
+              else: v.shard_(device, axis=-1)
+            elif '.feed_forward.w1.' in k: v.shard_(device, axis=0)
+            elif '.feed_forward.w3.' in k: v.shard_(device, axis=0)
+            elif '.feed_forward.' in k: v.shard_(device, axis=-1)
+            elif 'tok_embeddings.weight' in k: v.shard_(device, axis=0)
+            elif 'output.weight' in k: v.shard_(device, axis=-1)
+            #elif k.endswith('.weight'): v.shard_(device, axis=-1)
+            #elif 'norm.' in k: v.shard_(device, axis=-1)
+            else: v.shard_(device, axis=None)
+            #print(k, v.shape, v.lazydata.axis)
 
-      # replace weights in model
-      load_state_dict(model, weights, strict=False, consume=True)
+        # replace weights in model
+        load_state_dict(model, weights, strict=False, consume=True)
 
     return LLaMa(model, tokenizer)
 
@@ -477,11 +479,12 @@ After you are done speaking, output [EOS]. You are not Chad.
       next_tok = Tensor([toks[start_pos:]], device=device) if tok_tensor is None or (len(toks)-start_pos) > 1 else tok_tensor.reshape(1, 1)
       with Profiling(enabled=args.profile):
         with Timing("total ", enabled=args.timing, on_exit=lambda x: f", {1e9/x:.2f} tok/s, {GlobalCounters.global_mem/x:.2f} GB/s, param {param_bytes/x:.2f} GB/s"):
-          with Timing("enqueue in ", on_exit=(lambda et: (f", {(GlobalCounters.time_sum_s-st)*1e3:.2f} ms on GPU" if DEBUG>=2 else "")+
-            f", {GlobalCounters.global_ops*1e-9:.2f} GOPS, {GlobalCounters.global_mem*1e-9:.2f} GB"+
-            (f", {GlobalCounters.global_mem*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s, param {param_bytes*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s" if DEBUG>=2 else "")) if DEBUG else None, enabled=args.timing):
-            tok_tensor = llama.model(next_tok, start_pos, args.temperature)
-          tok = tok_tensor.item()
+          with WallTimeEvent(BenchEvent.STEP):
+            with Timing("enqueue in ", on_exit=(lambda et: (f", {(GlobalCounters.time_sum_s-st)*1e3:.2f} ms on GPU" if DEBUG>=2 else "")+
+              f", {GlobalCounters.global_ops*1e-9:.2f} GOPS, {GlobalCounters.global_mem*1e-9:.2f} GB"+
+              (f", {GlobalCounters.global_mem*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s, param {param_bytes*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s" if DEBUG>=2 else "")) if DEBUG else None, enabled=args.timing):
+              tok_tensor = llama.model(next_tok, start_pos, args.temperature)
+            tok = tok_tensor.item()
 
       # use the kv cache
       start_pos = len(toks)
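In llama.py the `LOAD_WEIGHTS` block spans everything from reading the checkpoint through quantization, sharding, and `load_state_dict`, so the sample measures end-to-end weight preparation rather than just file I/O. Collected samples can be inspected in-process before the `atexit` flush; a hedged example (the printed value is purely illustrative):

```python
# after LLaMa.build(...) has run, one wall-time sample sits in the queue
from extra.bench_log import BenchEvent, _events
print(_events[BenchEvent.LOAD_WEIGHTS]["wall"])  # e.g. [42.7] seconds, one per build
```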
diff --git a/examples/llama3.py b/examples/llama3.py
index 6969818faa..0e49371caa 100644
--- a/examples/llama3.py
+++ b/examples/llama3.py
@@ -7,6 +7,7 @@ from extra.models.llama import Transformer, convert_from_huggingface, convert_fr
 from tinygrad.nn.state import safe_load, torch_load, load_state_dict, get_parameters, gguf_load
 from tinygrad import Tensor, dtypes, nn, Context, Device, GlobalCounters
 from tinygrad.helpers import Profiling, Timing, DEBUG, colored, fetch, tqdm
+from extra.bench_log import BenchEvent, WallTimeEvent
 
 class Tokenizer:
   pat_str = r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"
@@ -166,40 +167,42 @@ def build_transformer(model_path: Path, model_size="8B", quantize=None, scale_dt
   model = Transformer(**MODEL_PARAMS[model_size]["args"], linear=linear, embedding=embedding, max_context=max_context, jit=True)
   if not load_weights: return model
+
   # load weights
-  if model_path.is_dir():
-    if (model_path / "model.safetensors.index.json").exists(): weights = load(str(model_path / "model.safetensors.index.json"))
-    elif (model_path / "model.safetensors").exists(): weights = load(str(model_path / "model.safetensors"))
-    else: weights = concat_weights([load(str(model_path / f"consolidated.{i:02d}.pth")) for i in range(MODEL_PARAMS[model_size]["files"])], device[0] if isinstance(device, tuple) else device)
-  else:
-    weights = load(str(model_path))
-  if "model.embed_tokens.weight" in weights:
-    weights = convert_from_huggingface(weights, MODEL_PARAMS[model_size]["args"]["n_layers"], MODEL_PARAMS[model_size]["args"]["n_heads"], MODEL_PARAMS[model_size]["args"]["n_kv_heads"])
-  elif "token_embd.weight" in weights:
-    weights = convert_from_gguf(weights, MODEL_PARAMS[model_size]["args"]["n_layers"])
-  weights = fix_bf16(weights)
+  with WallTimeEvent(BenchEvent.LOAD_WEIGHTS):
+    if model_path.is_dir():
+      if (model_path / "model.safetensors.index.json").exists(): weights = load(str(model_path / "model.safetensors.index.json"))
+      elif (model_path / "model.safetensors").exists(): weights = load(str(model_path / "model.safetensors"))
+      else: weights = concat_weights([load(str(model_path / f"consolidated.{i:02d}.pth")) for i in range(MODEL_PARAMS[model_size]["files"])], device[0] if isinstance(device, tuple) else device)
+    else:
+      weights = load(str(model_path))
+    if "model.embed_tokens.weight" in weights:
+      weights = convert_from_huggingface(weights, MODEL_PARAMS[model_size]["args"]["n_layers"], MODEL_PARAMS[model_size]["args"]["n_heads"], MODEL_PARAMS[model_size]["args"]["n_kv_heads"])
+    elif "token_embd.weight" in weights:
+      weights = convert_from_gguf(weights, MODEL_PARAMS[model_size]["args"]["n_layers"])
+    weights = fix_bf16(weights)
 
-  with Context(BEAM=0):
-    # quantize
-    if quantize == "float16": weights = {k:v.cast(quantize).contiguous() for k,v in weights.items()}
-    elif quantize is not None:
-      weights = linear.quantize(weights, device, scale_dtype, quantize_embeds)
-      for _,v in weights.items(): v.realize()
+    with Context(BEAM=0):
+      # quantize
+      if quantize == "float16": weights = {k:v.cast(quantize).contiguous() for k,v in weights.items()}
+      elif quantize is not None:
+        weights = linear.quantize(weights, device, scale_dtype, quantize_embeds)
+        for _,v in weights.items(): v.realize()
 
-    # shard
-    if isinstance(device, tuple):
-      for k,v in nn.state.get_state_dict(model).items():
-        if 'scale' in k: v.shard_(device, axis=None)  # from quantized
-        elif '.attention.' in k: v.shard_(device, axis=-1)
-        elif '.feed_forward.w1.' in k: v.shard_(device, axis=0)
-        elif '.feed_forward.w3.' in k: v.shard_(device, axis=0)
-        elif '.feed_forward.' in k: v.shard_(device, axis=-1)
-        elif 'tok_embeddings.weight' in k: v.shard_(device, axis=0)
-        elif 'output.weight' in k: v.shard_(device, axis=0)
-        else: v.shard_(device, axis=None)
+      # shard
+      if isinstance(device, tuple):
+        for k,v in nn.state.get_state_dict(model).items():
+          if 'scale' in k: v.shard_(device, axis=None)  # from quantized
+          elif '.attention.' in k: v.shard_(device, axis=-1)
+          elif '.feed_forward.w1.' in k: v.shard_(device, axis=0)
+          elif '.feed_forward.w3.' in k: v.shard_(device, axis=0)
+          elif '.feed_forward.' in k: v.shard_(device, axis=-1)
+          elif 'tok_embeddings.weight' in k: v.shard_(device, axis=0)
+          elif 'output.weight' in k: v.shard_(device, axis=0)
+          else: v.shard_(device, axis=None)
 
-    # replace weights in model
-    load_state_dict(model, weights, strict=False, consume=True)
+      # replace weights in model
+      load_state_dict(model, weights, strict=False, consume=True)
 
   return model
 
 # default settings
@@ -435,11 +438,12 @@ if __name__ == "__main__":
       st = GlobalCounters.time_sum_s
       with Profiling(enabled=args.profile):
        with Timing("total ", on_exit=lambda x: f", {1e9/x:.2f} tok/s, {GlobalCounters.global_mem/x:.2f} GB/s, param {param_bytes/x:.2f} GB/s"):
-          with Timing("enqueue in ", on_exit=(lambda et: (f", {(GlobalCounters.time_sum_s-st)*1e3:.2f} ms on GPU" if DEBUG>=2 else "")+
-            f", {GlobalCounters.global_ops*1e-9:.2f} GOPS, {GlobalCounters.global_mem*1e-9:.2f} GB"+
-            (f", {GlobalCounters.global_mem*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s, param {param_bytes*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s" if DEBUG>=2 else "")) if DEBUG else None):
-            tok = model(Tensor([[last_tok]], device=device), start_pos, TEMPERATURE, TOP_K, TOP_P, ALPHA_F, ALPHA_P)
-            tok = tok.item()
+          with WallTimeEvent(BenchEvent.STEP):
+            with Timing("enqueue in ", on_exit=(lambda et: (f", {(GlobalCounters.time_sum_s-st)*1e3:.2f} ms on GPU" if DEBUG>=2 else "")+
+              f", {GlobalCounters.global_ops*1e-9:.2f} GOPS, {GlobalCounters.global_mem*1e-9:.2f} GB"+
+              (f", {GlobalCounters.global_mem*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s, param {param_bytes*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s" if DEBUG>=2 else "")) if DEBUG else None):
+              tok = model(Tensor([[last_tok]], device=device), start_pos, TEMPERATURE, TOP_K, TOP_P, ALPHA_F, ALPHA_P)
+            tok = tok.item()
       start_pos += 1
       last_tok = tok
       generated += tokenizer.decode([tok])
diff --git a/examples/mixtral.py b/examples/mixtral.py
index f2627f5dff..3266c8248e 100644
--- a/examples/mixtral.py
+++ b/examples/mixtral.py
@@ -3,6 +3,7 @@ from tinygrad import Tensor, nn, Device, GlobalCounters, Variable
 from tinygrad.helpers import Timing, Profiling, CI, tqdm
 from tinygrad.nn.state import torch_load, get_state_dict
 from extra.models.llama import FeedForward, Transformer
+from extra.bench_log import BenchEvent, WallTimeEvent
 
 class MixtureFeedForward:
   def __init__(self, num_experts:int, dim:int, hidden_dim:int, linear=nn.Linear):
@@ -30,18 +31,19 @@ if __name__ == "__main__":
                       help="Path to the downloaded weights")
   args = parser.parse_args()
 
-  state = torch_load(args.weights + "/consolidated.00.pth.b")
-  model = Transformer(n_layers=32, dim=4096, hidden_dim=14336, n_heads=32, n_kv_heads=8, norm_eps=1e-5, vocab_size=32000, feed_forward=functools.partial(MixtureFeedForward, 8), jit=False)
-  model_state_dict = get_state_dict(model)
+  with WallTimeEvent(BenchEvent.LOAD_WEIGHTS):
+    state = torch_load(args.weights + "/consolidated.00.pth.b")
+    model = Transformer(n_layers=32, dim=4096, hidden_dim=14336, n_heads=32, n_kv_heads=8, norm_eps=1e-5, vocab_size=32000, feed_forward=functools.partial(MixtureFeedForward, 8), jit=False)
+    model_state_dict = get_state_dict(model)
 
-  for k in (t := tqdm(state, disable=CI)):
-    if 'feed_forward.experts.' in k:
-      expert_no = int(k.split('feed_forward.experts.')[1].split('.')[0])
-      device = Device.DEFAULT + ":" + str((expert_no//2)+1)
-    else:
-      device = Device.DEFAULT
-    t.set_description(f"ram used: {GlobalCounters.mem_used/1e9:5.2f} GB, loading {k} to {device}")
-    model_state_dict[k].replace(state[k].to(device).half()).realize()
+    for k in (t := tqdm(state, disable=CI)):
+      if 'feed_forward.experts.' in k:
+        expert_no = int(k.split('feed_forward.experts.')[1].split('.')[0])
+        device = Device.DEFAULT + ":" + str((expert_no//2)+1)
+      else:
+        device = Device.DEFAULT
+      t.set_description(f"ram used: {GlobalCounters.mem_used/1e9:5.2f} GB, loading {k} to {device}")
+      model_state_dict[k].replace(state[k].to(device).half()).realize()
   if CI: print(f"ram used: {GlobalCounters.mem_used/1e9:5.2f} GB")
 
   from sentencepiece import SentencePieceProcessor
@@ -53,7 +55,8 @@ if __name__ == "__main__":
     GlobalCounters.reset()
     with Profiling(sort="time", frac=0.1, enabled=args.profile):
       with Timing("total ", enabled=args.timing, on_exit=lambda x: f", {1e9/x:.2f} tok/sec"):
-        tok = model(Tensor([toks[start_pos:]]), 0 if start_pos == 0 else Variable("start_pos", 1, 1024).bind(start_pos), args.temperature).item()
+        with WallTimeEvent(BenchEvent.STEP):
+          tok = model(Tensor([toks[start_pos:]]), 0 if start_pos == 0 else Variable("start_pos", 1, 1024).bind(start_pos), args.temperature).item()
     toks.append(tok)
     start_pos += 1
   print(spp.decode(toks))
diff --git a/examples/mlperf/model_eval.py b/examples/mlperf/model_eval.py
index 3630ebeb80..35ad33eabb 100644
--- a/examples/mlperf/model_eval.py
+++ b/examples/mlperf/model_eval.py
@@ -5,61 +5,63 @@ import numpy as np
 from tinygrad import Tensor, Device, dtypes, GlobalCounters, TinyJit
 from tinygrad.nn.state import get_parameters, load_state_dict, safe_load
 from tinygrad.helpers import getenv
+from extra.bench_log import BenchEvent, WallTimeEvent
 
 def tlog(x): print(f"{x:25s} @ {time.perf_counter()-start:5.2f}s")
 
 def eval_resnet():
   Tensor.no_grad = True
-  # Resnet50-v1.5
-  from extra.models.resnet import ResNet50
-  tlog("imports")
-  GPUS = [f'{Device.DEFAULT}:{i}' for i in range(getenv("GPUS", 6))]
-  for x in GPUS: Device[x]
-  tlog("got devices")  # NOTE: this is faster with rocm-smi running
+  with WallTimeEvent(BenchEvent.FULL):
+    # Resnet50-v1.5
+    from extra.models.resnet import ResNet50
+    tlog("imports")
+    GPUS = [f'{Device.DEFAULT}:{i}' for i in range(getenv("GPUS", 6))]
+    for x in GPUS: Device[x]
+    tlog("got devices")  # NOTE: this is faster with rocm-smi running
 
-  class ResnetRunner:
-    def __init__(self, device=None):
-      self.mdl = ResNet50()
-      for x in get_parameters(self.mdl) if device else []: x.to_(device)
-      if (fn:=getenv("RESNET_MODEL", "")): load_state_dict(self.mdl, safe_load(fn))
-      else: self.mdl.load_from_pretrained()
-      self.input_mean = Tensor([0.485, 0.456, 0.406], device=device).reshape(1, -1, 1, 1)
-      self.input_std = Tensor([0.229, 0.224, 0.225], device=device).reshape(1, -1, 1, 1)
-    def __call__(self, x:Tensor) -> Tensor:
-      x = x.permute([0,3,1,2]).cast(dtypes.float32) / 255.0
-      x -= self.input_mean
-      x /= self.input_std
-      return self.mdl(x).log_softmax().argmax(axis=1).realize()
+    class ResnetRunner:
+      def __init__(self, device=None):
+        self.mdl = ResNet50()
+        for x in get_parameters(self.mdl) if device else []: x.to_(device)
+        if (fn:=getenv("RESNET_MODEL", "")): load_state_dict(self.mdl, safe_load(fn))
+        else: self.mdl.load_from_pretrained()
+        self.input_mean = Tensor([0.485, 0.456, 0.406], device=device).reshape(1, -1, 1, 1)
+        self.input_std = Tensor([0.229, 0.224, 0.225], device=device).reshape(1, -1, 1, 1)
+      def __call__(self, x:Tensor) -> Tensor:
+        x = x.permute([0,3,1,2]).cast(dtypes.float32) / 255.0
+        x -= self.input_mean
+        x /= self.input_std
+        return self.mdl(x).log_softmax().argmax(axis=1).realize()
 
-  mdl = TinyJit(ResnetRunner(GPUS))
-  tlog("loaded models")
+    mdl = TinyJit(ResnetRunner(GPUS))
+    tlog("loaded models")
 
-  # evaluation on the mlperf classes of the validation set from imagenet
-  from examples.mlperf.dataloader import batch_load_resnet
-  iterator = batch_load_resnet(getenv("BS", 128*6), val=getenv("VAL", 1), shuffle=False, pad_first_batch=True)
-  def data_get():
-    x,y,cookie = next(iterator)
-    return x.shard(GPUS, axis=0).realize(), y, cookie
-  n,d = 0,0
-  proc = data_get()
-  tlog("loaded initial data")
-  st = time.perf_counter()
-  while proc is not None:
-    GlobalCounters.reset()
-    proc = (mdl(proc[0]), proc[1], proc[2])  # this frees the images
-    run = time.perf_counter()
-    # load the next data here
-    try: next_proc = data_get()
-    except StopIteration: next_proc = None
-    nd = time.perf_counter()
-    y = np.array(proc[1])
-    proc = (proc[0].numpy() == y) & (y != -1)  # this realizes the models and frees the cookies
-    n += proc.sum()
-    d += (y != -1).sum()
-    et = time.perf_counter()
-    tlog(f"****** {n:5d}/{d:5d} {n*100.0/d:.2f}% -- {(run-st)*1000:7.2f} ms to enqueue, {(et-run)*1000:7.2f} ms to realize ({(nd-run)*1000:7.2f} ms fetching). {(len(proc))/(et-st):8.2f} examples/sec. {GlobalCounters.global_ops*1e-12/(et-st):5.2f} TFLOPS")
-    st = et
-    proc, next_proc = next_proc, None
-  tlog("done")
+    # evaluation on the mlperf classes of the validation set from imagenet
+    from examples.mlperf.dataloader import batch_load_resnet
+    iterator = batch_load_resnet(getenv("BS", 128*6), val=getenv("VAL", 1), shuffle=False, pad_first_batch=True)
+    def data_get():
+      x,y,cookie = next(iterator)
+      return x.shard(GPUS, axis=0).realize(), y, cookie
+    n,d = 0,0
+    proc = data_get()
+    tlog("loaded initial data")
+    st = time.perf_counter()
+    while proc is not None:
+      GlobalCounters.reset()
+      proc = (mdl(proc[0]), proc[1], proc[2])  # this frees the images
+      run = time.perf_counter()
+      # load the next data here
+      try: next_proc = data_get()
+      except StopIteration: next_proc = None
+      nd = time.perf_counter()
+      y = np.array(proc[1])
+      proc = (proc[0].numpy() == y) & (y != -1)  # this realizes the models and frees the cookies
+      n += proc.sum()
+      d += (y != -1).sum()
+      et = time.perf_counter()
+      tlog(f"****** {n:5d}/{d:5d} {n*100.0/d:.2f}% -- {(run-st)*1000:7.2f} ms to enqueue, {(et-run)*1000:7.2f} ms to realize ({(nd-run)*1000:7.2f} ms fetching). {(len(proc))/(et-st):8.2f} examples/sec. {GlobalCounters.global_ops*1e-12/(et-st):5.2f} TFLOPS")
+      st = et
+      proc, next_proc = next_proc, None
+    tlog("done")
 
 def eval_unet3d():
   # UNet3D
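The examples in this PR only record wall time, but bench_log also defines `KernelTimeEvent`, which samples `GlobalCounters.time_sum_s` (accumulated GPU kernel time, only tracked at DEBUG>=2) instead of the monotonic clock. A sketch of the difference, mirroring the assertions in the new test file at the end of this diff:

```python
from tinygrad import Tensor
from tinygrad.helpers import Context
from extra.bench_log import BenchEvent, WallTimeEvent, KernelTimeEvent, _events

with Context(DEBUG=2):                      # kernel times are only summed at DEBUG>=2
  with WallTimeEvent(BenchEvent.STEP):      # wall: includes scheduling + python overhead
    with KernelTimeEvent(BenchEvent.STEP):  # kernel: only time spent inside GPU kernels
      Tensor.rand(32, 32).sum().realize().item()

wall, kernel = _events[BenchEvent.STEP]["wall"][0], _events[BenchEvent.STEP]["kernel"][0]
assert kernel < wall  # kernel time is a strict subset of the wall time
```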
diff --git a/examples/mlperf/model_train.py b/examples/mlperf/model_train.py
index 3a74571afd..b971c8b6c1 100644
--- a/examples/mlperf/model_train.py
+++ b/examples/mlperf/model_train.py
@@ -9,6 +9,7 @@ from tinygrad.nn.optim import LAMB, LARS, SGD, OptimizerGroup, Adam
 
 from extra.lr_scheduler import LRSchedulerGroup
 from examples.mlperf.helpers import get_training_state, load_training_state
+from extra.bench_log import BenchEvent, WallTimeEvent
 
 # TODO: fix benchmark logging and use tinygrad tqdm
 from tqdm import tqdm
@@ -205,24 +206,25 @@ def train_resnet():
     st = time.perf_counter()
     while proc is not None:
       GlobalCounters.reset()
-      (loss, top_1), y, proc = train_step(proc[0], proc[1]), proc[2], proc[3]
+      with WallTimeEvent(BenchEvent.STEP):
+        (loss, top_1), y, proc = train_step(proc[0], proc[1]), proc[2], proc[3]
 
-      pt = time.perf_counter()
+        pt = time.perf_counter()
 
-      if len(prev_cookies) == getenv("STORE_COOKIES", 1): prev_cookies = []  # free previous cookies after gpu work has been enqueued
-      try:
-        if INITMLPERF:
-          next_proc = fake_data_get(BS)
-        else:
-          next_proc = data_get(it)
-      except StopIteration:
-        next_proc = None
+        if len(prev_cookies) == getenv("STORE_COOKIES", 1): prev_cookies = []  # free previous cookies after gpu work has been enqueued
+        try:
+          if INITMLPERF:
+            next_proc = fake_data_get(BS)
+          else:
+            next_proc = data_get(it)
+        except StopIteration:
+          next_proc = None
 
-      dt = time.perf_counter()
+        dt = time.perf_counter()
 
-      device_str = loss.device if isinstance(loss.device, str) else f"{loss.device[0]} * {len(loss.device)}"
-      loss, top_1 = loss.numpy().item(), top_1.numpy().item()
-      top_1_acc = top_1 / sum(yi != -1 for yi in y)
+        device_str = loss.device if isinstance(loss.device, str) else f"{loss.device[0]} * {len(loss.device)}"
+        loss, top_1 = loss.numpy().item(), top_1.numpy().item()
+        top_1_acc = top_1 / sum(yi != -1 for yi in y)
 
       cl = time.perf_counter()
       if BENCHMARK:
@@ -1124,23 +1126,24 @@ def train_bert():
       BEAM.value = TRAIN_BEAM
       st = time.perf_counter()
       GlobalCounters.reset()
-      loss, global_norm, lr = train_step_bert(model, optimizer_group, scheduler_group, loss_scaler,
-        train_data["input_ids"], train_data["segment_ids"], train_data["input_mask"], train_data["masked_lm_positions"], \
-        train_data["masked_lm_ids"], train_data["masked_lm_weights"], train_data["next_sentence_labels"], GPUS)
+      with WallTimeEvent(BenchEvent.STEP):
+        loss, global_norm, lr = train_step_bert(model, optimizer_group, scheduler_group, loss_scaler,
+          train_data["input_ids"], train_data["segment_ids"], train_data["input_mask"], train_data["masked_lm_positions"], \
+          train_data["masked_lm_ids"], train_data["masked_lm_weights"], train_data["next_sentence_labels"], GPUS)
 
-      pt = time.perf_counter()
+        pt = time.perf_counter()
 
-      try:
-        next_data = next(train_it)
-      except StopIteration:
-        next_data = None
+        try:
+          next_data = next(train_it)
+        except StopIteration:
+          next_data = None
 
-      dt = time.perf_counter()
+        dt = time.perf_counter()
 
-      device_str = parameters[0].device if isinstance(parameters[0].device, str) else f"{parameters[0].device[0]} * {len(parameters[0].device)}"
-      loss = loss.item()
-      assert not math.isnan(loss)
-      lr = lr.item()
+        device_str = parameters[0].device if isinstance(parameters[0].device, str) else f"{parameters[0].device[0]} * {len(parameters[0].device)}"
+        loss = loss.item()
+        assert not math.isnan(loss)
+        lr = lr.item()
 
       cl = time.perf_counter()
       if BENCHMARK:
         step_times.append(cl - st)
diff --git a/examples/sdv2.py b/examples/sdv2.py
index a3f4d30bca..89af31a8a8 100644
--- a/examples/sdv2.py
+++ b/examples/sdv2.py
@@ -5,6 +5,7 @@ from examples.stable_diffusion import AutoencoderKL, get_alphas_cumprod
 from examples.sdxl import DPMPP2MSampler, append_dims, LegacyDDPMDiscretization
 from extra.models.unet import UNetModel
 from extra.models.clip import FrozenOpenClipEmbedder
+from extra.bench_log import BenchEvent, WallTimeEvent
 
 from typing import Dict
 import argparse, tempfile, os
@@ -117,12 +118,14 @@ if __name__ == "__main__":
   if not weights_fn:
     weights_url = args.weights_url if args.weights_url else default_weights_url
     weights_fn = fetch(weights_url, os.path.basename(str(weights_url)))
-  load_state_dict(model, safe_load(weights_fn), strict=False)
 
-  if args.fp16:
-    for k,v in get_state_dict(model).items():
-      if k.startswith("model"):
-        v.replace(v.cast(dtypes.float16).realize())
+  with WallTimeEvent(BenchEvent.LOAD_WEIGHTS):
+    load_state_dict(model, safe_load(weights_fn), strict=False)
+
+    if args.fp16:
+      for k,v in get_state_dict(model).items():
+        if k.startswith("model"):
+          v.replace(v.cast(dtypes.float16).realize())
 
   c = { "crossattn": model.cond_stage_model(args.prompt) }
   uc = { "crossattn": model.cond_stage_model("") }
diff --git a/examples/sdxl.py b/examples/sdxl.py
index 149642cd28..0b7e13cc82 100644
--- a/examples/sdxl.py
+++ b/examples/sdxl.py
@@ -9,6 +9,7 @@ from tinygrad.nn.state import safe_load, load_state_dict, get_state_dict
 from tinygrad.helpers import fetch, trange, colored, Timing
 from extra.models.clip import Embedder, FrozenClosedClipEmbedder, FrozenOpenClipEmbedder
 from extra.models.unet import UNetModel, Upsample, Downsample, timestep_embedding
+from extra.bench_log import BenchEvent, WallTimeEvent
 from examples.stable_diffusion import ResnetBlock, Mid
 import numpy as np
 
@@ -346,17 +347,18 @@ class DPMPP2MSampler:
     for i in trange(num_sigmas - 1):
       with Timing("step in ", enabled=timing, on_exit=lambda _: f", using {GlobalCounters.mem_used/1e9:.2f} GB"):
         GlobalCounters.reset()
-        x, old_denoised = self.sampler_step(
-          old_denoised=old_denoised,
-          prev_sigma=(None if i==0 else sigmas[i-1].expand(x.shape[0])),
-          sigma=sigmas[i].expand(x.shape[0]),
-          next_sigma=sigmas[i+1].expand(x.shape[0]),
-          denoiser=denoiser,
-          x=x,
-          c=c,
-          uc=uc,
-        )
-        x.realize(old_denoised)
+        with WallTimeEvent(BenchEvent.STEP):
+          x, old_denoised = self.sampler_step(
+            old_denoised=old_denoised,
+            prev_sigma=(None if i==0 else sigmas[i-1].expand(x.shape[0])),
+            sigma=sigmas[i].expand(x.shape[0]),
+            next_sigma=sigmas[i+1].expand(x.shape[0]),
+            denoiser=denoiser,
+            x=x,
+            c=c,
+            uc=uc,
+          )
+          x.realize(old_denoised)
 
     return x
 
@@ -388,7 +390,8 @@ if __name__ == "__main__":
   start_mem_used = GlobalCounters.mem_used
   with Timing("loaded weights in ", lambda et_ns: f", {(B:=(GlobalCounters.mem_used-start_mem_used))/1e9:.2f} GB loaded at {B/et_ns:.2f} GB/s"):
-    Tensor.realize(*loaded_weights)
+    with WallTimeEvent(BenchEvent.LOAD_WEIGHTS):
+      Tensor.realize(*loaded_weights)
   del loaded_weights
 
   N = 1
WallTimeEvent class AttnBlock: def __init__(self, in_channels): @@ -232,12 +233,13 @@ if __name__ == "__main__": model = StableDiffusion() # load in weights - load_state_dict(model, torch_load(fetch('https://huggingface.co/CompVis/stable-diffusion-v-1-4-original/resolve/main/sd-v1-4.ckpt', 'sd-v1-4.ckpt'))['state_dict'], strict=False) + with WallTimeEvent(BenchEvent.LOAD_WEIGHTS): + load_state_dict(model, torch_load(fetch('https://huggingface.co/CompVis/stable-diffusion-v-1-4-original/resolve/main/sd-v1-4.ckpt', 'sd-v1-4.ckpt'))['state_dict'], strict=False) - if args.fp16: - for k,v in get_state_dict(model).items(): - if k.startswith("model"): - v.replace(v.cast(dtypes.float16).realize()) + if args.fp16: + for k,v in get_state_dict(model).items(): + if k.startswith("model"): + v.replace(v.cast(dtypes.float16).realize()) # run through CLIP to get context tokenizer = Tokenizer.ClipTokenizer() @@ -270,9 +272,10 @@ if __name__ == "__main__": GlobalCounters.reset() t.set_description("%3d %3d" % (index, timestep)) with Timing("step in ", enabled=args.timing, on_exit=lambda _: f", using {GlobalCounters.mem_used/1e9:.2f} GB"): - tid = Tensor([index]) - latent = run(model, unconditional_context, context, latent, Tensor([timestep]), alphas[tid], alphas_prev[tid], Tensor([args.guidance])) - if args.timing: Device[Device.DEFAULT].synchronize() + with WallTimeEvent(BenchEvent.STEP): + tid = Tensor([index]) + latent = run(model, unconditional_context, context, latent, Tensor([timestep]), alphas[tid], alphas_prev[tid], Tensor([args.guidance])) + if args.timing: Device[Device.DEFAULT].synchronize() del run # upsample latent space to image with autoencoder diff --git a/extra/bench_log.py b/extra/bench_log.py new file mode 100644 index 0000000000..8294bdfc2b --- /dev/null +++ b/extra/bench_log.py @@ -0,0 +1,108 @@ +import time, atexit, uuid +from enum import Enum + +from tinygrad.device import Device +from tinygrad.helpers import DEBUG, ContextVar, getenv, GlobalCounters + +BENCHMARK_LOG = ContextVar("BENCHMARK_LOG", "") + +if BENCHMARK_LOG: + from influxdb_client_3 import InfluxDBClient3, Point, WriteOptions, write_client_options + from influxdb_client_3.write_client.client.write_api import WriteType + +class BenchEvent(Enum): + LOAD_WEIGHTS = "load_weights" + STEP = "step" + FULL = "full" +class InstantBenchEvent(Enum): + GFLOPS = "gflops" + +_events = {} +def clear_events(): + for event in BenchEvent: + _events[event] = {"wall": [], "kernel": []} + for event in InstantBenchEvent: + _events[event] = [] +clear_events() + +class WallTimeEvent: + def __init__(self, event:BenchEvent): + self.event = event + def __enter__(self): + self.start = time.monotonic() + return self + def __exit__(self, *_): + _events[self.event]["wall"].append(time.monotonic() - self.start) + return False + +class KernelTimeEvent: + def __init__(self, event:BenchEvent): + if DEBUG < 2: + raise Exception("KernelTimeEvent should only be used in DEBUG >= 2") + self.event = event + def __enter__(self): + self.start = GlobalCounters.time_sum_s + return self + def __exit__(self, *_): + _events[self.event]["kernel"].append(GlobalCounters.time_sum_s - self.start) + return False + +def log_event_instant(event:InstantBenchEvent, value:float): + _events[event].append(value) + +if BENCHMARK_LOG: + INFLUXDB_HOST = getenv("INFLUXDB_HOST", "") + INFLUXDB_ORG = getenv("INFLUXDB_ORG", "tiny") + INFLUXDB_TOKEN = getenv("INFLUXDB_TOKEN", "") + + def _create_point(run_id, i, attempt, ref, commit, name, value, run): + point = 
diff --git a/extra/bench_log.py b/extra/bench_log.py
new file mode 100644
index 0000000000..8294bdfc2b
--- /dev/null
+++ b/extra/bench_log.py
@@ -0,0 +1,108 @@
+import time, atexit, uuid
+from enum import Enum
+
+from tinygrad.device import Device
+from tinygrad.helpers import DEBUG, ContextVar, getenv, GlobalCounters
+
+BENCHMARK_LOG = ContextVar("BENCHMARK_LOG", "")
+
+if BENCHMARK_LOG:
+  from influxdb_client_3 import InfluxDBClient3, Point, WriteOptions, write_client_options
+  from influxdb_client_3.write_client.client.write_api import WriteType
+
+class BenchEvent(Enum):
+  LOAD_WEIGHTS = "load_weights"
+  STEP = "step"
+  FULL = "full"
+class InstantBenchEvent(Enum):
+  GFLOPS = "gflops"
+
+_events = {}
+def clear_events():
+  for event in BenchEvent:
+    _events[event] = {"wall": [], "kernel": []}
+  for event in InstantBenchEvent:
+    _events[event] = []
+clear_events()
+
+class WallTimeEvent:
+  def __init__(self, event:BenchEvent):
+    self.event = event
+  def __enter__(self):
+    self.start = time.monotonic()
+    return self
+  def __exit__(self, *_):
+    _events[self.event]["wall"].append(time.monotonic() - self.start)
+    return False
+
+class KernelTimeEvent:
+  def __init__(self, event:BenchEvent):
+    if DEBUG < 2:
+      raise Exception("KernelTimeEvent should only be used with DEBUG >= 2")
+    self.event = event
+  def __enter__(self):
+    self.start = GlobalCounters.time_sum_s
+    return self
+  def __exit__(self, *_):
+    _events[self.event]["kernel"].append(GlobalCounters.time_sum_s - self.start)
+    return False
+
+def log_event_instant(event:InstantBenchEvent, value:float):
+  _events[event].append(value)
+
+if BENCHMARK_LOG:
+  INFLUXDB_HOST = getenv("INFLUXDB_HOST", "")
+  INFLUXDB_ORG = getenv("INFLUXDB_ORG", "tiny")
+  INFLUXDB_TOKEN = getenv("INFLUXDB_TOKEN", "")
+
+  def _create_point(run_id, i, attempt, ref, commit, name, value, run):
+    point = Point(BENCHMARK_LOG.value).tag("id", run_id).tag("index", i)
+    point = point.tag("device", Device.DEFAULT)
+    point = point.tag("attempt", attempt).tag("ref", ref).tag("commit", commit)
+    point = point.field(name, value).field("x", run)
+    return point
+
+  @atexit.register
+  def write_events():
+    # see if there are any events to write
+    have_events = False
+    for event in _events:
+      if isinstance(event, BenchEvent):
+        for event_type, values in _events[event].items():
+          if len(values) > 0:
+            have_events = True
+      else:
+        if len(_events[event]) > 0:
+          have_events = True
+    if not have_events:
+      return
+
+    # pull from github envvars
+    ref = getenv("GITHUB_REF_NAME", "")
+    commit = getenv("GITHUB_SHA", "")
+    run = getenv("GITHUB_RUN_NUMBER", "")
+    attempt = getenv("GITHUB_RUN_ATTEMPT", "")
+
+    points = []
+    for event in _events:
+      run_id = str(uuid.uuid4())
+      if isinstance(event, BenchEvent):
+        for event_type, values in _events[event].items():
+          for i, value in enumerate(values):
+            point = _create_point(run_id, i, attempt, ref, commit, f"{event.value}_{event_type}", value, run)
+            points.append(point)
+      else:
+        for i, value in enumerate(_events[event]):
+          point = _create_point(run_id, i, attempt, ref, commit, event.value, value, run)
+          points.append(point)
+
+    write_options = WriteOptions(write_type=WriteType.synchronous, retry_interval=5000, max_retries=5, max_retry_delay=30000, exponential_base=2)
+    wco = write_client_options(write_options=write_options)
+    with InfluxDBClient3(
+      host=INFLUXDB_HOST,
+      org=INFLUXDB_ORG,
+      token=INFLUXDB_TOKEN,
+      auth_scheme="Basic",
+      database="benchmarks",
+      write_client_options=wco) as client:
+      client.write(points)
diff --git a/setup.py b/setup.py
index d7676f59bc..2a503fba71 100644
--- a/setup.py
+++ b/setup.py
@@ -73,7 +73,8 @@ setup(name='tinygrad',
       "capstone",
       "pycocotools",
       "boto3",
-      "pandas"
+      "pandas",
+      "influxdb3-python"
     ],
     'docs': [
       "mkdocs",
diff --git a/test/external/external_benchmark_openpilot.py b/test/external/external_benchmark_openpilot.py
index 910d8a5ed3..7316a11bb0 100644
--- a/test/external/external_benchmark_openpilot.py
+++ b/test/external/external_benchmark_openpilot.py
@@ -7,6 +7,7 @@ from tinygrad import Tensor, dtypes, TinyJit
 from tinygrad.helpers import IMAGE, GlobalCounters, fetch, colored, getenv, trange
 from tinygrad.tensor import _from_np_dtype
 import numpy as np
+from extra.bench_log import BenchEvent, WallTimeEvent

 OPENPILOT_MODEL = sys.argv[1] if len(sys.argv) > 1 else "https://github.com/commaai/openpilot/raw/v0.9.4/selfdrive/modeld/models/supercombo.onnx"
@@ -33,10 +34,11 @@ if __name__ == "__main__":
   for _ in range(20):
     GlobalCounters.reset()
     st = time.perf_counter_ns()
-    # Need to cast non-image inputs from numpy, this is only realistic way to run model
-    inputs = {**{k:v for k,v in new_inputs_junk.items() if 'img' in k},
-              **{k:Tensor(v) for k,v in new_inputs_junk_numpy.items() if 'img' not in k}}
-    ret = next(iter(run_onnx_jit(**inputs).values())).cast(dtypes.float32).numpy()
+    with WallTimeEvent(BenchEvent.STEP):
+      # Need to cast non-image inputs from numpy, this is only realistic way to run model
+      inputs = {**{k:v for k,v in new_inputs_junk.items() if 'img' in k},
+                **{k:Tensor(v) for k,v in new_inputs_junk_numpy.items() if 'img' not in k}}
+      ret = next(iter(run_onnx_jit(**inputs).values())).cast(dtypes.float32).numpy()
     print(f"jitted: {(time.perf_counter_ns() - st)*1e-6:7.4f} ms")

   suffix = ""
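At process exit, each recorded sample becomes one InfluxDB point: the measurement is the BENCHMARK_LOG value, the tags identify the run, and the field key encodes event name and timer type. An illustrative reconstruction of a single point (tag and field names are taken from _create_point above; all concrete values are made up):

from influxdb_client_3 import Point

point = (Point("gpt2")                             # measurement = BENCHMARK_LOG value
         .tag("id", "some-uuid").tag("index", 0)   # fresh uuid per event, index per sample
         .tag("device", "METAL")                   # Device.DEFAULT
         .tag("attempt", "1").tag("ref", "master").tag("commit", "abc1234")
         .field("step_wall", 0.042)                # f"{event.value}_{event_type}"
         .field("x", "12345"))                     # GITHUB_RUN_NUMBER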
diff --git a/test/testextra/test_bench_log.py b/test/testextra/test_bench_log.py
new file mode 100644
index 0000000000..0798ac561e
--- /dev/null
+++ b/test/testextra/test_bench_log.py
@@ -0,0 +1,103 @@
+import unittest, time
+
+from extra.bench_log import BenchEvent, InstantBenchEvent, WallTimeEvent, KernelTimeEvent, log_event_instant, _events, clear_events
+from tinygrad.helpers import Context
+from tinygrad.tensor import Tensor
+
+class TestBenchLog(unittest.TestCase):
+  def setUp(self):
+    clear_events()
+
+  def test_log_single_wall_time(self):
+    for event in BenchEvent:
+      with WallTimeEvent(event):
+        time.sleep(0.1)
+
+    # check event list
+    for event in BenchEvent:
+      self.assertEqual(len(_events[event]["wall"]), 1)
+      self.assertGreater(_events[event]["wall"][0], 0)
+
+  def test_log_double_wall_time(self):
+    for event in BenchEvent:
+      with WallTimeEvent(event):
+        time.sleep(0.1)
+
+    for event in reversed(BenchEvent):
+      with WallTimeEvent(event):
+        time.sleep(0.2)
+
+    # check event list
+    for event in BenchEvent:
+      self.assertEqual(len(_events[event]["wall"]), 2)
+      self.assertGreater(_events[event]["wall"][0], 0)
+      self.assertGreater(_events[event]["wall"][1], 0)
+
+  def test_log_single_kernel_time(self):
+    wall_times = []
+
+    with Context(DEBUG=2):
+      for event in BenchEvent:
+        with KernelTimeEvent(event):
+          st = time.perf_counter()
+          Tensor.rand(32, 32).sum().realize().item()
+          wall_times.append(time.perf_counter() - st)
+
+    # check event list
+    for event in BenchEvent:
+      self.assertEqual(len(_events[event]["kernel"]), 1)
+      self.assertLess(_events[event]["kernel"][0], wall_times[0])
+      self.assertGreater(_events[event]["kernel"][0], 0)
+
+  def test_interleaved_wall_kernel_time(self):
+    wall_times = []
+    with Context(DEBUG=2):
+      for event in BenchEvent:
+        with KernelTimeEvent(event):
+          st = time.perf_counter()
+          Tensor.rand(32, 32).sum().realize().item()
+          wall_times.append(time.perf_counter() - st)
+
+        with WallTimeEvent(event):
+          st = time.perf_counter()
+          Tensor.rand(32, 32).sum().realize().item()
+          wall_times.append(time.perf_counter() - st)
+
+    # check event list
+    for event in BenchEvent:
+      self.assertEqual(len(_events[event]["wall"]), 1)
+      self.assertEqual(len(_events[event]["kernel"]), 1)
+      self.assertLess(_events[event]["kernel"][0], wall_times[0])
+      self.assertGreater(_events[event]["kernel"][0], 0)
+
+  def test_stacked_wall_kernel_time(self):
+    with Context(DEBUG=2):
+      for event in BenchEvent:
+        with KernelTimeEvent(event):
+          with WallTimeEvent(event):
+            Tensor.rand(32, 32).sum().realize().item()
+
+      for event in BenchEvent:
+        with WallTimeEvent(event):
+          with KernelTimeEvent(event):
+            Tensor.rand(32, 32).sum().realize().item()
+
+    for event in BenchEvent:
+      self.assertEqual(len(_events[event]["wall"]), 2)
+      self.assertEqual(len(_events[event]["kernel"]), 2)
+      self.assertLess(_events[event]["kernel"][0], _events[event]["wall"][0])
+      self.assertGreater(_events[event]["kernel"][0], 0)
+      self.assertLess(_events[event]["kernel"][1], _events[event]["wall"][1])
+      self.assertGreater(_events[event]["kernel"][1], 0)
+
+  def test_log_instant_event(self):
+    for event in InstantBenchEvent:
+      log_event_instant(event, 1000)
+
+    # check event list
+    for event in InstantBenchEvent:
+      self.assertEqual(len(_events[event]), 1)
+      self.assertEqual(_events[event][0], 1000)
+
+if __name__ == '__main__':
+  unittest.main()
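The test file above only depends on unittest and runs standalone, so one way to exercise it from a tinygrad checkout (the invocation is an assumption, not part of this diff) is:

PYTHONPATH=. python3 test/testextra/test_bench_log.py

Note the tests never set BENCHMARK_LOG, so nothing is written to InfluxDB: the atexit write_events() hook is only registered when BENCHMARK_LOG is set, and then also expects INFLUXDB_HOST and INFLUXDB_TOKEN (INFLUXDB_ORG defaults to "tiny").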