move benchmark stat tracking to influxdb (#10185)

Author: wozeparrot
Date: 2025-05-15 16:14:56 -07:00
Committed by: GitHub
Parent: f59ecf2116
Commit: 1ed04f993b
15 changed files with 527 additions and 280 deletions

View File

@@ -52,14 +52,14 @@ jobs:
- name: reset process replay
run: python3.11 test/external/process_replay/reset.py
- name: Run Stable Diffusion
run: JIT=1 python3.11 examples/stable_diffusion.py --fp16 --seed 0 --noshow --timing | tee sd.txt
run: BENCHMARK_LOG=stable_diffusion JIT=1 python3.11 examples/stable_diffusion.py --fp16 --seed 0 --noshow --timing | tee sd.txt
- name: Run Stable Diffusion without fp16
run: JIT=1 python3.11 examples/stable_diffusion.py --seed 0 --noshow --timing | tee sd_no_fp16.txt
run: BENCHMARK_LOG=stable_diffusion_fp32 JIT=1 python3.11 examples/stable_diffusion.py --seed 0 --noshow --timing | tee sd_no_fp16.txt
- name: Run Stable Diffusion v2
run: JIT=1 python3.11 examples/sdv2.py --fp16 --seed 0 --noshow --timing | tee sdv2.txt
run: BENCHMARK_LOG=stable_diffusion_v2 JIT=1 python3.11 examples/sdv2.py --fp16 --seed 0 --noshow --timing | tee sdv2.txt
# process replay can't capture this, the graph is too large
- name: Run SDXL
run: CAPTURE_PROCESS_REPLAY=0 JIT=1 python3.11 examples/sdxl.py --seed 0 --noshow --timing | tee sdxl.txt
run: BENCHMARK_LOG=stable_diffusion_xl CAPTURE_PROCESS_REPLAY=0 JIT=1 python3.11 examples/sdxl.py --seed 0 --noshow --timing | tee sdxl.txt
- name: Run model inference benchmark
run: METAL=1 python3.11 test/external/external_model_benchmark.py
- name: Test speed vs torch
@@ -80,40 +80,40 @@ jobs:
run: METAL=1 M_START=6 M_STOP=10 M_STEP=1 N_START=6 N_STOP=10 N_STEP=1 K_START=6 K_STOP=24 K_STEP=1 TC_OPT=2 DEBUG=2 python3.11 ./extra/gemm/fuzz_matmul.py
- name: Run LLaMA
run: |
JIT=0 python3.11 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_unjitted.txt
JIT=1 python3.11 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_jitted.txt
BENCHMARK_LOG=llama_nojit JIT=0 python3.11 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_unjitted.txt
BENCHMARK_LOG=llama JIT=1 python3.11 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_jitted.txt
- name: Run LLaMA with BEAM
run: JITBEAM=2 IGNORE_BEAM_CACHE=1 python3.11 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_beam.txt
run: BENCHMARK_LOG=llama_beam JITBEAM=2 IGNORE_BEAM_CACHE=1 python3.11 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_beam.txt
- name: Run quantized LLaMA
run: |
python3.11 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing --quantize int8 | tee llama_int8.txt
python3.11 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing --quantize nf4 | tee llama_nf4.txt
BENCHMARK_LOG=llama_int8 python3.11 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing --quantize int8 | tee llama_int8.txt
BENCHMARK_LOG=llama_nf4 python3.11 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing --quantize nf4 | tee llama_nf4.txt
- name: Run quantized LLaMA3
run: |
python3.11 examples/llama3.py --size 8B --temperature 0 --benchmark --quantize int8 | tee llama3_int8.txt
python3.11 examples/llama3.py --size 8B --temperature 0 --benchmark --quantize nf4 | tee llama3_nf4.txt
BENCHMARK_LOG=llama3_int8 python3.11 examples/llama3.py --size 8B --temperature 0 --benchmark --quantize int8 | tee llama3_int8.txt
BENCHMARK_LOG=llama3_nf4 python3.11 examples/llama3.py --size 8B --temperature 0 --benchmark --quantize nf4 | tee llama3_nf4.txt
#- name: Run LLaMA 7B on 4 (virtual) GPUs
# run: python3.11 examples/llama.py --gen 1 --size 7B --shard 4 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_four_gpu.txt
- name: Run GPT2
run: |
JIT=0 python3.11 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_unjitted.txt
JIT=1 python3.11 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_jitted.txt
BENCHMARK_LOG=gpt2_nojit JIT=0 python3.11 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_unjitted.txt
BENCHMARK_LOG=gpt2 JIT=1 python3.11 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_jitted.txt
- name: Run GPT2 w HALF
run: HALF=1 python3.11 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half.txt
run: BENCHMARK_LOG=gpt2_half HALF=1 python3.11 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half.txt
- name: Run GPT2 w HALF/BEAM
run: HALF=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3.11 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt
run: BENCHMARK_LOG=gpt2_half_beam HALF=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3.11 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt
- name: Run OLMoE
run: python3.11 examples/olmoe.py
run: BENCHMARK_LOG=olmoe python3.11 examples/olmoe.py
- name: Train MNIST
run: time PYTHONPATH=. TARGET_EVAL_ACC_PCT=96.0 python3.11 examples/beautiful_mnist.py | tee beautiful_mnist.txt
- name: Run 10 CIFAR training steps
run: JIT=1 STEPS=10 python3.11 examples/hlb_cifar10.py | tee train_cifar.txt
run: BENCHMARK_LOG=cifar_10steps JIT=1 STEPS=10 python3.11 examples/hlb_cifar10.py | tee train_cifar.txt
- name: Run 10 CIFAR training steps w HALF
run: JIT=2 STEPS=10 DEFAULT_FLOAT=HALF python3.11 examples/hlb_cifar10.py | tee train_cifar_half.txt
run: BENCHMARK_LOG=cifar_10steps_half JIT=2 STEPS=10 DEFAULT_FLOAT=HALF python3.11 examples/hlb_cifar10.py | tee train_cifar_half.txt
#- name: Run 10 CIFAR training steps w BF16
# run: STEPS=10 DEFAULT_FLOAT=BFLOAT16 python3.11 examples/hlb_cifar10.py | tee train_cifar_bf16.txt
- name: Run 10 CIFAR training steps w winograd
run: JIT=1 WINO=1 STEPS=10 python3.11 examples/hlb_cifar10.py | tee train_cifar_wino.txt
run: BENCHMARK_LOG=cifar_10steps_wino JIT=1 WINO=1 STEPS=10 python3.11 examples/hlb_cifar10.py | tee train_cifar_wino.txt
- name: UsbGPU boot time
run: sudo -E PYTHONPATH=. DEBUG=2 AM_RESET=1 AMD=1 AMD_IFACE=USB time python3.11 test/test_tiny.py TestTiny.test_plus
- name: UsbGPU tiny tests
@@ -210,37 +210,37 @@ jobs:
- name: Test CUDA=1
run: DEBUG=2 CUDA=1 python -m pytest -rA test/test_tiny.py
- name: Run Stable Diffusion
run: NV=1 python3 examples/stable_diffusion.py --fp16 --seed 0 --noshow --timing | tee sd.txt
run: BENCHMARK_LOG=stable_diffusion NV=1 python3 examples/stable_diffusion.py --fp16 --seed 0 --noshow --timing | tee sd.txt
- name: Run SDXL
run: CAPTURE_PROCESS_REPLAY=0 NV=1 python3 examples/sdxl.py --seed 0 --noshow --timing | tee sdxl.txt
run: BENCHMARK_LOG=stable_diffusion_xl CAPTURE_PROCESS_REPLAY=0 NV=1 python3 examples/sdxl.py --seed 0 --noshow --timing | tee sdxl.txt
- name: Run LLaMA
run: |
NV=1 JIT=0 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_unjitted.txt
NV=1 JIT=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_jitted.txt
BENCHMARK_LOG=llama_nojit NV=1 JIT=0 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_unjitted.txt
BENCHMARK_LOG=llama NV=1 JIT=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_jitted.txt
- name: Run LLaMA with BEAM
run: NV=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_beam.txt
run: BENCHMARK_LOG=llama_beam NV=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_beam.txt
# - name: Run LLaMA 7B on 4 GPUs
# run: NV=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama.py --gen 1 --size 7B --shard 4 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_four_gpu.txt
# - name: Run LLaMA 7B on 6 GPUs
# run: NV=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama.py --gen 1 --size 7B --shard 6 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_six_gpu.txt
- name: Run LLaMA-3 8B BEAM
run: NV=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama3.py --size 8B --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_beam.txt
run: BENCHMARK_LOG=llama3_beam NV=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama3.py --size 8B --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_beam.txt
- name: Run LLaMA-3 8B on 4 GPUs with BEAM
run: NV=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 4 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_four_gpu.txt
run: BENCHMARK_LOG=llama3_beam_4gpu NV=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 4 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_four_gpu.txt
# - name: Run LLaMA-3 8B on 6 GPUs
# run: NV=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 6 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_six_gpu.txt
# - name: Run LLaMA-2 70B
# run: NV=1 CAPTURE_PROCESS_REPLAY=0 MAX_CONTEXT=256 python3 examples/llama.py --gen 2 --size 70B --shard 6 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_2_70B.txt
- name: Run Mixtral 8x7B
run: time NV=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/mixtral.py --temperature 0 --count 10 --timing | tee mixtral.txt
run: time BENCHMARK_LOG=mixtral NV=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/mixtral.py --temperature 0 --count 10 --timing | tee mixtral.txt
- name: Run GPT2
run: |
NV=1 JIT=0 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_unjitted.txt
NV=1 JIT=1 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_jitted.txt
BENCHMARK_LOG=gpt2_nojit NV=1 JIT=0 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_unjitted.txt
BENCHMARK_LOG=gpt2 NV=1 JIT=1 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_jitted.txt
- name: Run GPT2 w HALF
run: NV=1 HALF=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half.txt
run: BENCHMARK_LOG=gpt2_half NV=1 HALF=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half.txt
- name: Run GPT2 w HALF/BEAM
run: NV=1 HALF=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt
run: BENCHMARK_LOG=gpt2_half_beam NV=1 HALF=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt
- uses: actions/upload-artifact@v4
with:
name: Speed (NVIDIA)
@@ -304,26 +304,26 @@ jobs:
- name: Train MNIST
run: time PYTHONPATH=. NV=1 TARGET_EVAL_ACC_PCT=96.0 python3 examples/beautiful_mnist.py | tee beautiful_mnist.txt
- name: Run 10 CIFAR training steps
run: NV=1 STEPS=10 python3 examples/hlb_cifar10.py | tee train_cifar.txt
run: BENCHMARK_LOG=cifar_10steps NV=1 STEPS=10 python3 examples/hlb_cifar10.py | tee train_cifar.txt
- name: Run 10 CIFAR training steps w HALF
run: NV=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_half.txt
run: BENCHMARK_LOG=cifar_10steps_half NV=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_half.txt
- name: Run 10 CIFAR training steps w BF16
run: NV=1 STEPS=10 DEFAULT_FLOAT=BFLOAT16 python3 examples/hlb_cifar10.py | tee train_cifar_bf16.txt
run: BENCHMARK_LOG=cifar_10steps_bf16 NV=1 STEPS=10 DEFAULT_FLOAT=BFLOAT16 python3 examples/hlb_cifar10.py | tee train_cifar_bf16.txt
- name: Run 10 CIFAR training steps w winograd
run: NV=1 CAPTURE_PROCESS_REPLAY=0 WINO=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_wino.txt
run: BENCHMARK_LOG=cifar_10steps_half_wino NV=1 CAPTURE_PROCESS_REPLAY=0 WINO=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_wino.txt
- name: Run full CIFAR training w 1 GPU
run: time NV=1 DEFAULT_FLOAT=HALF LATEWINO=1 STEPS=1000 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee train_cifar_one_gpu.txt
run: time BENCHMARK_LOG=cifar NV=1 DEFAULT_FLOAT=HALF LATEWINO=1 STEPS=1000 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee train_cifar_one_gpu.txt
- name: Run full CIFAR training steps w 6 GPUS
run: time CAPTURE_PROCESS_REPLAY=0 NV=1 DEFAULT_FLOAT=HALF STEPS=350 BS=1536 GPUS=6 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee train_cifar_six_gpu.txt
run: time BENCHMARK_LOG=cifar_6gpu CAPTURE_PROCESS_REPLAY=0 NV=1 DEFAULT_FLOAT=HALF STEPS=350 BS=1536 GPUS=6 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee train_cifar_six_gpu.txt
- name: Run MLPerf resnet eval on training data
run: time NV=1 MODEL=resnet python3 examples/mlperf/model_eval.py
run: time BENCHMARK_LOG=resnet_eval NV=1 MODEL=resnet python3 examples/mlperf/model_eval.py
- name: Run 10 MLPerf ResNet50 training steps (1 gpu)
run: NV=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=256 GPUS=1 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet_one_gpu.txt
run: BENCHMARK_LOG=resnet_10steps NV=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=256 GPUS=1 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet_one_gpu.txt
- name: Run 10 MLPerf ResNet50 training steps (6 gpu)
run: NV=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=1536 GPUS=6 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet.txt
run: BENCHMARK_LOG=resnet_10steps_6gpu NV=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=1536 GPUS=6 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet.txt
- name: Run 10 MLPerf Bert training steps (6 gpu)
# TODO: remove BERT_LAYERS once scheduler is fast
run: NV=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=66 GPUS=6 BERT_LAYERS=2 FUSE_ARANGE=1 FUSE_ARANGE_UINT=0 MODEL=bert python3 examples/mlperf/model_train.py | tee train_bert.txt
run: BENCHMARK_LOG=bert_10steps_6gpu NV=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=66 GPUS=6 BERT_LAYERS=2 FUSE_ARANGE=1 FUSE_ARANGE_UINT=0 MODEL=bert python3 examples/mlperf/model_train.py | tee train_bert.txt
- uses: actions/upload-artifact@v4
with:
name: Speed (NVIDIA Training)
@@ -409,23 +409,23 @@ jobs:
- name: Test AM warm start time
run: time AMD=1 python3 test/test_tiny.py TestTiny.test_plus
- name: Run Stable Diffusion
run: AMD=1 python3 examples/stable_diffusion.py --fp16 --seed 0 --noshow --timing | tee sd.txt
run: BENCHMARK_LOG=stable_diffusion AMD=1 python3 examples/stable_diffusion.py --fp16 --seed 0 --noshow --timing | tee sd.txt
- name: Run SDXL
run: CAPTURE_PROCESS_REPLAY=0 AMD=1 python3 examples/sdxl.py --seed 0 --noshow --timing | tee sdxl.txt
run: BENCHMARK_LOG=stable_diffusion_xl CAPTURE_PROCESS_REPLAY=0 AMD=1 python3 examples/sdxl.py --seed 0 --noshow --timing | tee sdxl.txt
- name: Run LLaMA 7B
run: |
AMD=1 JIT=0 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_unjitted.txt
AMD=1 JIT=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_jitted.txt
BENCHMARK_LOG=llama_nojit AMD=1 JIT=0 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_unjitted.txt
BENCHMARK_LOG=llama AMD=1 JIT=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_jitted.txt
- name: Run LLaMA 7B with BEAM
run: AMD=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_beam.txt
run: BENCHMARK_LOG=llama_beam AMD=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_beam.txt
# - name: Run LLaMA 7B on 4 GPUs
# run: AMD=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama.py --gen 1 --size 7B --shard 4 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_four_gpu.txt
# - name: Run LLaMA 7B on 6 GPUs
# run: AMD=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama.py --gen 1 --size 7B --shard 6 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_six_gpu.txt
- name: Run LLaMA-3 8B BEAM
run: AMD=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama3.py --size 8B --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_beam.txt
run: BENCHMARK_LOG=llama3_beam AMD=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama3.py --size 8B --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_beam.txt
- name: Run LLaMA-3 8B on 4 GPUs with BEAM
run: AMD=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 4 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_four_gpu.txt
run: BENCHMARK_LOG=llama3_beam_4gpu AMD=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 4 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_four_gpu.txt
# - name: Run LLaMA-3 8B on 6 GPUs
# run: AMD=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 6 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_six_gpu.txt
- name: Restore amdgpu
@@ -433,15 +433,15 @@ jobs:
# - name: Run LLaMA-2 70B
# run: AMD=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama.py --gen 2 --size 70B --shard 6 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_2_70B.txt
- name: Run Mixtral 8x7B
run: time AMD=1 python3 examples/mixtral.py --temperature 0 --count 10 --timing | tee mixtral.txt
run: time BENCHMARK_LOG=mixtral AMD=1 python3 examples/mixtral.py --temperature 0 --count 10 --timing | tee mixtral.txt
- name: Run GPT2
run: |
AMD=1 JIT=0 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_unjitted.txt
AMD=1 JIT=1 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_jitted.txt
BENCHMARK_LOG=gpt2_nojit AMD=1 JIT=0 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_unjitted.txt
BENCHMARK_LOG=gpt2 AMD=1 JIT=1 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_jitted.txt
- name: Run GPT2 w HALF
run: AMD=1 HALF=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half.txt
run: BENCHMARK_LOG=gpt2_half AMD=1 HALF=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half.txt
- name: Run GPT2 w HALF/BEAM
run: AMD=1 HALF=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt
run: BENCHMARK_LOG=gpt2_half_beam AMD=1 HALF=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt
- uses: actions/upload-artifact@v4
with:
name: Speed (AMD)
@@ -500,26 +500,26 @@ jobs:
- name: Train MNIST
run: time PYTHONPATH=. AMD=1 TARGET_EVAL_ACC_PCT=96.0 python3 examples/beautiful_mnist.py | tee beautiful_mnist.txt
- name: Run 10 CIFAR training steps
run: AMD=1 STEPS=10 python3 examples/hlb_cifar10.py | tee train_cifar.txt
run: BENCHMARK_LOG=cifar_10steps AMD=1 STEPS=10 python3 examples/hlb_cifar10.py | tee train_cifar.txt
- name: Run 10 CIFAR training steps w HALF
run: AMD=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_half.txt
run: BENCHMARK_LOG=cifar_10steps_half AMD=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_half.txt
- name: Run 10 CIFAR training steps w BF16
run: AMD=1 STEPS=10 DEFAULT_FLOAT=BFLOAT16 python3 examples/hlb_cifar10.py | tee train_cifar_bf16.txt
run: BENCHMARK_LOG=cifar_10steps_bf16 AMD=1 STEPS=10 DEFAULT_FLOAT=BFLOAT16 python3 examples/hlb_cifar10.py | tee train_cifar_bf16.txt
- name: Run 10 CIFAR training steps w winograd
run: AMD=1 WINO=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_wino.txt
run: BENCHMARK_LOG=cifar_10steps_half_wino AMD=1 WINO=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_wino.txt
- name: Run full CIFAR training w 1 GPU
run: time AMD=1 DEFAULT_FLOAT=HALF LATEWINO=1 STEPS=1000 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee train_cifar_one_gpu.txt
run: time BENCHMARK_LOG=cifar AMD=1 DEFAULT_FLOAT=HALF LATEWINO=1 STEPS=1000 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee train_cifar_one_gpu.txt
- name: Run full CIFAR training steps w 6 GPUS
run: time AMD=1 DEFAULT_FLOAT=HALF STEPS=350 BS=1536 GPUS=6 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee train_cifar_six_gpu.txt
run: time BENCHMARK_LOG=cifar_6gpu AMD=1 DEFAULT_FLOAT=HALF STEPS=350 BS=1536 GPUS=6 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee train_cifar_six_gpu.txt
- name: Run MLPerf resnet eval
run: time AMD=1 MODEL=resnet python3 examples/mlperf/model_eval.py
run: time BENCHMARK_LOG=resnet_eval AMD=1 MODEL=resnet python3 examples/mlperf/model_eval.py
- name: Run 10 MLPerf ResNet50 training steps (1 gpu)
run: AMD=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=256 GPUS=1 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet_one_gpu.txt
run: BENCHMARK_LOG=resnet_10steps AMD=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=256 GPUS=1 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet_one_gpu.txt
- name: Run 10 MLPerf ResNet50 training steps (6 gpu)
run: AMD=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=1536 GPUS=6 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet.txt
run: BENCHMARK_LOG=resnet_10steps_6gpu AMD=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=1536 GPUS=6 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet.txt
- name: Run 10 MLPerf Bert training steps (6 gpu)
# TODO: remove BERT_LAYERS once scheduler is fast
run: AMD=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=66 GPUS=6 BERT_LAYERS=2 FUSE_ARANGE=1 FUSE_ARANGE_UINT=0 MODEL=bert python3 examples/mlperf/model_train.py | tee train_bert.txt
run: BENCHMARK_LOG=bert_10steps_6gpu AMD=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=66 GPUS=6 BERT_LAYERS=2 FUSE_ARANGE=1 FUSE_ARANGE_UINT=0 MODEL=bert python3 examples/mlperf/model_train.py | tee train_bert.txt
- uses: actions/upload-artifact@v4
with:
name: Speed (AMD Training)
@@ -558,13 +558,13 @@ jobs:
- name: validate openpilot 0.9.7
run: PYTHONPATH=. FLOAT16=0 IMAGE=2 QCOM=1 taskset -c 4-7 python3 test/external/external_benchmark_openpilot.py https://github.com/commaai/openpilot/raw/v0.9.7/selfdrive/modeld/models/supercombo.onnx | tee openpilot_image_0_9_7.txt
- name: benchmark openpilot 0.9.4
run: PYTHONPATH=. QCOM=1 taskset -c 4-7 python3 test/external/external_benchmark_openpilot.py https://github.com/commaai/openpilot/raw/v0.9.4/selfdrive/modeld/models/supercombo.onnx | tee openpilot_0_9_4.txt
run: BENCHMARK_LOG=openpilot_0_9_4 PYTHONPATH=. QCOM=1 taskset -c 4-7 python3 test/external/external_benchmark_openpilot.py https://github.com/commaai/openpilot/raw/v0.9.4/selfdrive/modeld/models/supercombo.onnx | tee openpilot_0_9_4.txt
- name: benchmark openpilot 0.9.7
run: PYTHONPATH=. QCOM=1 taskset -c 4-7 python3 test/external/external_benchmark_openpilot.py https://github.com/commaai/openpilot/raw/v0.9.7/selfdrive/modeld/models/supercombo.onnx | tee openpilot_0_9_7.txt
run: BENCHMARK_LOG=openpilot_0_9_7 PYTHONPATH=. QCOM=1 taskset -c 4-7 python3 test/external/external_benchmark_openpilot.py https://github.com/commaai/openpilot/raw/v0.9.7/selfdrive/modeld/models/supercombo.onnx | tee openpilot_0_9_7.txt
- name: benchmark openpilot w IMAGE=2 0.9.4
run: PYTHONPATH=. NOLOCALS=1 FLOAT16=1 IMAGE=2 QCOM=1 taskset -c 4-7 python3 test/external/external_benchmark_openpilot.py https://github.com/commaai/openpilot/raw/v0.9.4/selfdrive/modeld/models/supercombo.onnx | tee openpilot_image_0_9_4.txt
run: BENCHMARK_LOG=openpilot_0_9_4_image PYTHONPATH=. NOLOCALS=1 FLOAT16=1 IMAGE=2 QCOM=1 taskset -c 4-7 python3 test/external/external_benchmark_openpilot.py https://github.com/commaai/openpilot/raw/v0.9.4/selfdrive/modeld/models/supercombo.onnx | tee openpilot_image_0_9_4.txt
- name: benchmark openpilot w IMAGE=2 0.9.7
run: PYTHONPATH=. NOLOCALS=1 FLOAT16=1 IMAGE=2 QCOM=1 taskset -c 4-7 python3 test/external/external_benchmark_openpilot.py https://github.com/commaai/openpilot/raw/v0.9.7/selfdrive/modeld/models/supercombo.onnx | tee openpilot_image_0_9_7.txt
run: BENCHMARK_LOG=openpilot_0_9_7_image PYTHONPATH=. NOLOCALS=1 FLOAT16=1 IMAGE=2 QCOM=1 taskset -c 4-7 python3 test/external/external_benchmark_openpilot.py https://github.com/commaai/openpilot/raw/v0.9.7/selfdrive/modeld/models/supercombo.onnx | tee openpilot_image_0_9_7.txt
- name: openpilot compile3 0.9.7
run: PYTHONPATH="." QCOM=1 taskset -c 4-7 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/v0.9.7/selfdrive/modeld/models/supercombo.onnx
- name: openpilot compile3 0.9.7+ tomb raider
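
Note: every BENCHMARK_LOG value added above does double duty: it switches stat collection on and names the InfluxDB measurement the points are written to, so runs of the same example under different configs (e.g. gpt2 vs gpt2_half_beam) stay separate series. A minimal sketch of that gating pattern, condensed from the new extra/bench_log.py further down:

from tinygrad.helpers import ContextVar

BENCHMARK_LOG = ContextVar("BENCHMARK_LOG", "")  # unset/empty -> logging is a no-op
if BENCHMARK_LOG:
  # the client library is imported only when logging is active, so plain
  # benchmark runs don't need influxdb3-python installed
  from influxdb_client_3 import InfluxDBClient3, Point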

View File

@@ -7,6 +7,7 @@ from tinygrad.ops import UOp
from tinygrad.helpers import Timing, DEBUG, JIT, getenv, fetch, colored, trange
from tinygrad.nn import Embedding, Linear, LayerNorm
from tinygrad.nn.state import gguf_load, torch_load, load_state_dict, get_state_dict
from extra.bench_log import BenchEvent, WallTimeEvent
MAX_CONTEXT = getenv("MAX_CONTEXT", 128)
HALF = getenv("HALF")
@@ -134,11 +135,12 @@ class GPT2:
# lm head and wte are tied
weights['lm_head.weight'] = weights['wte.weight']
load_state_dict(model, weights)
with WallTimeEvent(BenchEvent.LOAD_WEIGHTS):
load_state_dict(model, weights)
if HALF:
for l in get_state_dict(model).values():
l.replace(l.half().realize())
if HALF:
for l in get_state_dict(model).values():
l.replace(l.half().realize())
return GPT2(model, tokenizer)
@@ -167,7 +169,8 @@ class GPT2:
return key
state_dict = { _remap_gguf_key(k): v for k, v in state_dict.items() }
model = Transformer(**gpt2_params)
load_state_dict(model, state_dict)
with WallTimeEvent(BenchEvent.LOAD_WEIGHTS):
load_state_dict(model, state_dict)
return GPT2(model, tiktoken.get_encoding("gpt2"))
def __init__(self, model, tokenizer):
@@ -185,11 +188,12 @@ class GPT2:
with Timing("ran model in ", on_exit=(lambda et: (f", {(GlobalCounters.time_sum_s-st)*1e3:.2f} ms on GPU" if DEBUG>=2 else "")+
f", {GlobalCounters.global_ops*1e-9:.2f} GOPS, {GlobalCounters.global_mem*1e-9:.2f} GB"+
(f", {GlobalCounters.global_mem*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s" if DEBUG>=2 else "")) if DEBUG else None, enabled=timing):
if batch_size == 1 and len(toks[0][start_pos:]) == 1:
tokens = Variable("tokens", 0, VOCAB_SIZE).bind(toks[0][start_pos])
else:
tokens = Tensor([x[start_pos:] for x in toks])
tok = self.model(tokens, Variable("start_pos", 1 if start_pos else 0, MAX_CONTEXT-1).bind(start_pos), temperature).tolist()
with WallTimeEvent(BenchEvent.STEP):
if batch_size == 1 and len(toks[0][start_pos:]) == 1:
tokens = Variable("tokens", 0, VOCAB_SIZE).bind(toks[0][start_pos])
else:
tokens = Tensor([x[start_pos:] for x in toks])
tok = self.model(tokens, Variable("start_pos", 1 if start_pos else 0, MAX_CONTEXT-1).bind(start_pos), temperature).tolist()
start_pos = len(toks[0])
for i,t in enumerate(tok): toks[i].append(t)
return [self.tokenizer.decode(x) for x in toks]
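
The instrumentation pattern is identical across the examples in this commit: weight loading goes under BenchEvent.LOAD_WEIGHTS and each generation or training step under BenchEvent.STEP, with the existing Timing/DEBUG reporting left intact inside the block. A runnable toy sketch of the pattern (the Tensor ops are stand-ins for the examples' real code, not part of this commit):

from extra.bench_log import BenchEvent, WallTimeEvent
from tinygrad import Tensor

with WallTimeEvent(BenchEvent.LOAD_WEIGHTS):   # one wall-time sample per wrapped block
  w = Tensor.rand(32, 32).realize()            # stand-in for load_state_dict(...)

for _ in range(3):
  with WallTimeEvent(BenchEvent.STEP):         # one wall-time sample per step
    out = (Tensor.rand(1, 32) @ w).realize()   # stand-in for a model step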

View File

@@ -11,6 +11,7 @@ from tinygrad import nn, dtypes, Tensor, Device, GlobalCounters, TinyJit
from tinygrad.nn.state import get_state_dict, get_parameters
from tinygrad.nn import optim
from tinygrad.helpers import Context, BEAM, WINO, getenv, colored, prod
from extra.bench_log import BenchEvent, WallTimeEvent
cifar_mean = [0.4913997551666284, 0.48215855929893703, 0.4465309133731618]
cifar_std = [0.24703225141799082, 0.24348516474564, 0.26158783926049628]
@@ -395,20 +396,23 @@ def train_cifar():
if STEPS == 0 or i == STEPS: break
GlobalCounters.reset()
X, Y = next(batcher)
if len(GPUS) > 1:
X.shard_(GPUS, axis=0)
Y.shard_(GPUS, axis=0)
with Context(BEAM=getenv("LATEBEAM", BEAM.value), WINO=getenv("LATEWINO", WINO.value)):
loss = train_step_jitted(model, optim.OptimizerGroup(opt_bias, opt_non_bias), [lr_sched_bias, lr_sched_non_bias], X, Y)
et = time.monotonic()
loss_cpu = loss.numpy()
# EMA for network weights
if getenv("EMA") and i > hyp['ema']['steps'] and (i+1) % hyp['ema']['every_n_steps'] == 0:
if model_ema is None:
model_ema = modelEMA(W, model)
model_ema.update(model, Tensor([projected_ema_decay_val*(i/STEPS)**hyp['ema']['decay_pow']]))
with WallTimeEvent(BenchEvent.STEP):
X, Y = next(batcher)
if len(GPUS) > 1:
X.shard_(GPUS, axis=0)
Y.shard_(GPUS, axis=0)
with Context(BEAM=getenv("LATEBEAM", BEAM.value), WINO=getenv("LATEWINO", WINO.value)):
loss = train_step_jitted(model, optim.OptimizerGroup(opt_bias, opt_non_bias), [lr_sched_bias, lr_sched_non_bias], X, Y)
et = time.monotonic()
loss_cpu = loss.numpy()
# EMA for network weights
if getenv("EMA") and i > hyp['ema']['steps'] and (i+1) % hyp['ema']['every_n_steps'] == 0:
if model_ema is None:
model_ema = modelEMA(W, model)
model_ema.update(model, Tensor([projected_ema_decay_val*(i/STEPS)**hyp['ema']['decay_pow']]))
cl = time.monotonic()
device_str = loss.device if isinstance(loss.device, str) else f"{loss.device[0]} * {len(loss.device)}"
# 53 221.74 ms run, 2.22 ms python, 219.52 ms CL, 803.39 loss, 0.000807 LR, 4.66 GB used, 3042.49 GFLOPS, 674.65 GOPS
@@ -424,4 +428,5 @@ def train_cifar():
raise ValueError(colored(f"{eval_acc_pct=} < {target}", "red"))
if __name__ == "__main__":
train_cifar()
with WallTimeEvent(BenchEvent.FULL):
train_cifar()

View File

@@ -13,6 +13,7 @@ from extra.models.llama import Transformer, convert_from_huggingface, fix_bf16
from sentencepiece import SentencePieceProcessor
import tiktoken, sys
from tiktoken.load import load_tiktoken_bpe
from extra.bench_log import BenchEvent, WallTimeEvent
MAX_CONTEXT = getenv("MAX_CONTEXT", 4096)
@@ -206,42 +207,43 @@ class LLaMa:
model = Transformer(**params["args"], linear=linear, max_context=MAX_CONTEXT, jit=bool(JIT))
if model_path.is_dir():
weights = concat_weights([load(filename) for filename in [f"{model_path}/consolidated.{i:02d}.pth" for i in range(params["files"])]], device[0] if isinstance(device, tuple) else device)
else:
weights = load(str(model_path))
if "model.embed_tokens.weight" in weights:
weights = convert_from_huggingface(weights, params["args"]["n_layers"], params["args"]["n_heads"], params["args"].get("n_kv_heads", params["args"]["n_heads"]))
with WallTimeEvent(BenchEvent.LOAD_WEIGHTS):
if model_path.is_dir():
weights = concat_weights([load(filename) for filename in [f"{model_path}/consolidated.{i:02d}.pth" for i in range(params["files"])]], device[0] if isinstance(device, tuple) else device)
else:
weights = load(str(model_path))
if "model.embed_tokens.weight" in weights:
weights = convert_from_huggingface(weights, params["args"]["n_layers"], params["args"]["n_heads"], params["args"].get("n_kv_heads", params["args"]["n_heads"]))
weights = fix_bf16(weights)
weights = fix_bf16(weights)
# prevent tracking model weights
# this is a part of a larger problem with BUFFER UOps and gc in TRACK_MATCH_STATS=2
with Context(BEAM=0, TRACK_MATCH_STATS=0):
# quantize
if quantize is not None:
weights = linear.quantize(weights, device)
for _,v in weights.items(): v.realize()
# prevent tracking model weights
# this is a part of a larger problem with BUFFER UOps and gc in TRACK_MATCH_STATS=2
with Context(BEAM=0, TRACK_MATCH_STATS=0):
# quantize
if quantize is not None:
weights = linear.quantize(weights, device)
for _,v in weights.items(): v.realize()
# shard
if isinstance(device, tuple):
for k,v in nn.state.get_state_dict(model).items():
if 'scale' in k: v.shard_(device, axis=None) # from quantized
elif '.attention.' in k:
if getenv("SHARD_KVCACHE") and ('.wq.' in k or '.wk.' in k or '.wv.' in k): v.shard_(device, axis=0)
else: v.shard_(device, axis=-1)
elif '.feed_forward.w1.' in k: v.shard_(device, axis=0)
elif '.feed_forward.w3.' in k: v.shard_(device, axis=0)
elif '.feed_forward.' in k: v.shard_(device, axis=-1)
elif 'tok_embeddings.weight' in k: v.shard_(device, axis=0)
elif 'output.weight' in k: v.shard_(device, axis=-1)
#elif k.endswith('.weight'): v.shard_(device, axis=-1)
#elif 'norm.' in k: v.shard_(device, axis=-1)
else: v.shard_(device, axis=None)
#print(k, v.shape, v.lazydata.axis)
# shard
if isinstance(device, tuple):
for k,v in nn.state.get_state_dict(model).items():
if 'scale' in k: v.shard_(device, axis=None) # from quantized
elif '.attention.' in k:
if getenv("SHARD_KVCACHE") and ('.wq.' in k or '.wk.' in k or '.wv.' in k): v.shard_(device, axis=0)
else: v.shard_(device, axis=-1)
elif '.feed_forward.w1.' in k: v.shard_(device, axis=0)
elif '.feed_forward.w3.' in k: v.shard_(device, axis=0)
elif '.feed_forward.' in k: v.shard_(device, axis=-1)
elif 'tok_embeddings.weight' in k: v.shard_(device, axis=0)
elif 'output.weight' in k: v.shard_(device, axis=-1)
#elif k.endswith('.weight'): v.shard_(device, axis=-1)
#elif 'norm.' in k: v.shard_(device, axis=-1)
else: v.shard_(device, axis=None)
#print(k, v.shape, v.lazydata.axis)
# replace weights in model
load_state_dict(model, weights, strict=False, consume=True)
# replace weights in model
load_state_dict(model, weights, strict=False, consume=True)
return LLaMa(model, tokenizer)
@@ -477,11 +479,12 @@ After you are done speaking, output [EOS]. You are not Chad.
next_tok = Tensor([toks[start_pos:]], device=device) if tok_tensor is None or (len(toks)-start_pos) > 1 else tok_tensor.reshape(1, 1)
with Profiling(enabled=args.profile):
with Timing("total ", enabled=args.timing, on_exit=lambda x: f", {1e9/x:.2f} tok/s, {GlobalCounters.global_mem/x:.2f} GB/s, param {param_bytes/x:.2f} GB/s"):
with Timing("enqueue in ", on_exit=(lambda et: (f", {(GlobalCounters.time_sum_s-st)*1e3:.2f} ms on GPU" if DEBUG>=2 else "")+
f", {GlobalCounters.global_ops*1e-9:.2f} GOPS, {GlobalCounters.global_mem*1e-9:.2f} GB"+
(f", {GlobalCounters.global_mem*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s, param {param_bytes*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s" if DEBUG>=2 else "")) if DEBUG else None, enabled=args.timing):
tok_tensor = llama.model(next_tok, start_pos, args.temperature)
tok = tok_tensor.item()
with WallTimeEvent(BenchEvent.STEP):
with Timing("enqueue in ", on_exit=(lambda et: (f", {(GlobalCounters.time_sum_s-st)*1e3:.2f} ms on GPU" if DEBUG>=2 else "")+
f", {GlobalCounters.global_ops*1e-9:.2f} GOPS, {GlobalCounters.global_mem*1e-9:.2f} GB"+
(f", {GlobalCounters.global_mem*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s, param {param_bytes*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s" if DEBUG>=2 else "")) if DEBUG else None, enabled=args.timing):
tok_tensor = llama.model(next_tok, start_pos, args.temperature)
tok = tok_tensor.item()
# use the kv cache
start_pos = len(toks)

View File

@@ -7,6 +7,7 @@ from extra.models.llama import Transformer, convert_from_huggingface, convert_fr
from tinygrad.nn.state import safe_load, torch_load, load_state_dict, get_parameters, gguf_load
from tinygrad import Tensor, dtypes, nn, Context, Device, GlobalCounters
from tinygrad.helpers import Profiling, Timing, DEBUG, colored, fetch, tqdm
from extra.bench_log import BenchEvent, WallTimeEvent
class Tokenizer:
pat_str = r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"
@@ -166,40 +167,42 @@ def build_transformer(model_path: Path, model_size="8B", quantize=None, scale_dt
model = Transformer(**MODEL_PARAMS[model_size]["args"], linear=linear, embedding=embedding, max_context=max_context, jit=True)
if not load_weights: return model
# load weights
if model_path.is_dir():
if (model_path / "model.safetensors.index.json").exists(): weights = load(str(model_path / "model.safetensors.index.json"))
elif (model_path / "model.safetensors").exists(): weights = load(str(model_path / "model.safetensors"))
else: weights = concat_weights([load(str(model_path / f"consolidated.{i:02d}.pth")) for i in range(MODEL_PARAMS[model_size]["files"])], device[0] if isinstance(device, tuple) else device)
else:
weights = load(str(model_path))
if "model.embed_tokens.weight" in weights:
weights = convert_from_huggingface(weights, MODEL_PARAMS[model_size]["args"]["n_layers"], MODEL_PARAMS[model_size]["args"]["n_heads"], MODEL_PARAMS[model_size]["args"]["n_kv_heads"])
elif "token_embd.weight" in weights:
weights = convert_from_gguf(weights, MODEL_PARAMS[model_size]["args"]["n_layers"])
weights = fix_bf16(weights)
with WallTimeEvent(BenchEvent.LOAD_WEIGHTS):
if model_path.is_dir():
if (model_path / "model.safetensors.index.json").exists(): weights = load(str(model_path / "model.safetensors.index.json"))
elif (model_path / "model.safetensors").exists(): weights = load(str(model_path / "model.safetensors"))
else: weights = concat_weights([load(str(model_path / f"consolidated.{i:02d}.pth")) for i in range(MODEL_PARAMS[model_size]["files"])], device[0] if isinstance(device, tuple) else device)
else:
weights = load(str(model_path))
if "model.embed_tokens.weight" in weights:
weights = convert_from_huggingface(weights, MODEL_PARAMS[model_size]["args"]["n_layers"], MODEL_PARAMS[model_size]["args"]["n_heads"], MODEL_PARAMS[model_size]["args"]["n_kv_heads"])
elif "token_embd.weight" in weights:
weights = convert_from_gguf(weights, MODEL_PARAMS[model_size]["args"]["n_layers"])
weights = fix_bf16(weights)
with Context(BEAM=0):
# quantize
if quantize == "float16": weights = {k:v.cast(quantize).contiguous() for k,v in weights.items()}
elif quantize is not None:
weights = linear.quantize(weights, device, scale_dtype, quantize_embeds)
for _,v in weights.items(): v.realize()
with Context(BEAM=0):
# quantize
if quantize == "float16": weights = {k:v.cast(quantize).contiguous() for k,v in weights.items()}
elif quantize is not None:
weights = linear.quantize(weights, device, scale_dtype, quantize_embeds)
for _,v in weights.items(): v.realize()
# shard
if isinstance(device, tuple):
for k,v in nn.state.get_state_dict(model).items():
if 'scale' in k: v.shard_(device, axis=None) # from quantized
elif '.attention.' in k: v.shard_(device, axis=-1)
elif '.feed_forward.w1.' in k: v.shard_(device, axis=0)
elif '.feed_forward.w3.' in k: v.shard_(device, axis=0)
elif '.feed_forward.' in k: v.shard_(device, axis=-1)
elif 'tok_embeddings.weight' in k: v.shard_(device, axis=0)
elif 'output.weight' in k: v.shard_(device, axis=0)
else: v.shard_(device, axis=None)
# shard
if isinstance(device, tuple):
for k,v in nn.state.get_state_dict(model).items():
if 'scale' in k: v.shard_(device, axis=None) # from quantized
elif '.attention.' in k: v.shard_(device, axis=-1)
elif '.feed_forward.w1.' in k: v.shard_(device, axis=0)
elif '.feed_forward.w3.' in k: v.shard_(device, axis=0)
elif '.feed_forward.' in k: v.shard_(device, axis=-1)
elif 'tok_embeddings.weight' in k: v.shard_(device, axis=0)
elif 'output.weight' in k: v.shard_(device, axis=0)
else: v.shard_(device, axis=None)
# replace weights in model
load_state_dict(model, weights, strict=False, consume=True)
# replace weights in model
load_state_dict(model, weights, strict=False, consume=True)
return model
# default settings
@@ -435,11 +438,12 @@ if __name__ == "__main__":
st = GlobalCounters.time_sum_s
with Profiling(enabled=args.profile):
with Timing("total ", on_exit=lambda x: f", {1e9/x:.2f} tok/s, {GlobalCounters.global_mem/x:.2f} GB/s, param {param_bytes/x:.2f} GB/s"):
with Timing("enqueue in ", on_exit=(lambda et: (f", {(GlobalCounters.time_sum_s-st)*1e3:.2f} ms on GPU" if DEBUG>=2 else "")+
f", {GlobalCounters.global_ops*1e-9:.2f} GOPS, {GlobalCounters.global_mem*1e-9:.2f} GB"+
(f", {GlobalCounters.global_mem*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s, param {param_bytes*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s" if DEBUG>=2 else "")) if DEBUG else None):
tok = model(Tensor([[last_tok]], device=device), start_pos, TEMPERATURE, TOP_K, TOP_P, ALPHA_F, ALPHA_P)
tok = tok.item()
with WallTimeEvent(BenchEvent.STEP):
with Timing("enqueue in ", on_exit=(lambda et: (f", {(GlobalCounters.time_sum_s-st)*1e3:.2f} ms on GPU" if DEBUG>=2 else "")+
f", {GlobalCounters.global_ops*1e-9:.2f} GOPS, {GlobalCounters.global_mem*1e-9:.2f} GB"+
(f", {GlobalCounters.global_mem*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s, param {param_bytes*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s" if DEBUG>=2 else "")) if DEBUG else None):
tok = model(Tensor([[last_tok]], device=device), start_pos, TEMPERATURE, TOP_K, TOP_P, ALPHA_F, ALPHA_P)
tok = tok.item()
start_pos += 1
last_tok = tok
generated += tokenizer.decode([tok])

View File

@@ -3,6 +3,7 @@ from tinygrad import Tensor, nn, Device, GlobalCounters, Variable
from tinygrad.helpers import Timing, Profiling, CI, tqdm
from tinygrad.nn.state import torch_load, get_state_dict
from extra.models.llama import FeedForward, Transformer
from extra.bench_log import BenchEvent, WallTimeEvent
class MixtureFeedForward:
def __init__(self, num_experts:int, dim:int, hidden_dim:int, linear=nn.Linear):
@@ -30,18 +31,19 @@ if __name__ == "__main__":
help="Path to the downloaded weights")
args = parser.parse_args()
state = torch_load(args.weights + "/consolidated.00.pth.b")
model = Transformer(n_layers=32, dim=4096, hidden_dim=14336, n_heads=32, n_kv_heads=8, norm_eps=1e-5, vocab_size=32000, feed_forward=functools.partial(MixtureFeedForward, 8), jit=False)
model_state_dict = get_state_dict(model)
with WallTimeEvent(BenchEvent.LOAD_WEIGHTS):
state = torch_load(args.weights + "/consolidated.00.pth.b")
model = Transformer(n_layers=32, dim=4096, hidden_dim=14336, n_heads=32, n_kv_heads=8, norm_eps=1e-5, vocab_size=32000, feed_forward=functools.partial(MixtureFeedForward, 8), jit=False)
model_state_dict = get_state_dict(model)
for k in (t := tqdm(state, disable=CI)):
if 'feed_forward.experts.' in k:
expert_no = int(k.split('feed_forward.experts.')[1].split('.')[0])
device = Device.DEFAULT + ":" + str((expert_no//2)+1)
else:
device = Device.DEFAULT
t.set_description(f"ram used: {GlobalCounters.mem_used/1e9:5.2f} GB, loading {k} to {device}")
model_state_dict[k].replace(state[k].to(device).half()).realize()
for k in (t := tqdm(state, disable=CI)):
if 'feed_forward.experts.' in k:
expert_no = int(k.split('feed_forward.experts.')[1].split('.')[0])
device = Device.DEFAULT + ":" + str((expert_no//2)+1)
else:
device = Device.DEFAULT
t.set_description(f"ram used: {GlobalCounters.mem_used/1e9:5.2f} GB, loading {k} to {device}")
model_state_dict[k].replace(state[k].to(device).half()).realize()
if CI: print(f"ram used: {GlobalCounters.mem_used/1e9:5.2f} GB")
from sentencepiece import SentencePieceProcessor
@@ -53,7 +55,8 @@ if __name__ == "__main__":
GlobalCounters.reset()
with Profiling(sort="time", frac=0.1, enabled=args.profile):
with Timing("total ", enabled=args.timing, on_exit=lambda x: f", {1e9/x:.2f} tok/sec"):
tok = model(Tensor([toks[start_pos:]]), 0 if start_pos == 0 else Variable("start_pos", 1, 1024).bind(start_pos), args.temperature).item()
with WallTimeEvent(BenchEvent.STEP):
tok = model(Tensor([toks[start_pos:]]), 0 if start_pos == 0 else Variable("start_pos", 1, 1024).bind(start_pos), args.temperature).item()
toks.append(tok)
start_pos += 1
print(spp.decode(toks))

View File

@@ -5,61 +5,63 @@ import numpy as np
from tinygrad import Tensor, Device, dtypes, GlobalCounters, TinyJit
from tinygrad.nn.state import get_parameters, load_state_dict, safe_load
from tinygrad.helpers import getenv
from extra.bench_log import BenchEvent, WallTimeEvent
def tlog(x): print(f"{x:25s} @ {time.perf_counter()-start:5.2f}s")
def eval_resnet():
Tensor.no_grad = True
# Resnet50-v1.5
from extra.models.resnet import ResNet50
tlog("imports")
GPUS = [f'{Device.DEFAULT}:{i}' for i in range(getenv("GPUS", 6))]
for x in GPUS: Device[x]
tlog("got devices") # NOTE: this is faster with rocm-smi running
with WallTimeEvent(BenchEvent.FULL):
# Resnet50-v1.5
from extra.models.resnet import ResNet50
tlog("imports")
GPUS = [f'{Device.DEFAULT}:{i}' for i in range(getenv("GPUS", 6))]
for x in GPUS: Device[x]
tlog("got devices") # NOTE: this is faster with rocm-smi running
class ResnetRunner:
def __init__(self, device=None):
self.mdl = ResNet50()
for x in get_parameters(self.mdl) if device else []: x.to_(device)
if (fn:=getenv("RESNET_MODEL", "")): load_state_dict(self.mdl, safe_load(fn))
else: self.mdl.load_from_pretrained()
self.input_mean = Tensor([0.485, 0.456, 0.406], device=device).reshape(1, -1, 1, 1)
self.input_std = Tensor([0.229, 0.224, 0.225], device=device).reshape(1, -1, 1, 1)
def __call__(self, x:Tensor) -> Tensor:
x = x.permute([0,3,1,2]).cast(dtypes.float32) / 255.0
x -= self.input_mean
x /= self.input_std
return self.mdl(x).log_softmax().argmax(axis=1).realize()
class ResnetRunner:
def __init__(self, device=None):
self.mdl = ResNet50()
for x in get_parameters(self.mdl) if device else []: x.to_(device)
if (fn:=getenv("RESNET_MODEL", "")): load_state_dict(self.mdl, safe_load(fn))
else: self.mdl.load_from_pretrained()
self.input_mean = Tensor([0.485, 0.456, 0.406], device=device).reshape(1, -1, 1, 1)
self.input_std = Tensor([0.229, 0.224, 0.225], device=device).reshape(1, -1, 1, 1)
def __call__(self, x:Tensor) -> Tensor:
x = x.permute([0,3,1,2]).cast(dtypes.float32) / 255.0
x -= self.input_mean
x /= self.input_std
return self.mdl(x).log_softmax().argmax(axis=1).realize()
mdl = TinyJit(ResnetRunner(GPUS))
tlog("loaded models")
mdl = TinyJit(ResnetRunner(GPUS))
tlog("loaded models")
# evaluation on the mlperf classes of the validation set from imagenet
from examples.mlperf.dataloader import batch_load_resnet
iterator = batch_load_resnet(getenv("BS", 128*6), val=getenv("VAL", 1), shuffle=False, pad_first_batch=True)
def data_get():
x,y,cookie = next(iterator)
return x.shard(GPUS, axis=0).realize(), y, cookie
n,d = 0,0
proc = data_get()
tlog("loaded initial data")
st = time.perf_counter()
while proc is not None:
GlobalCounters.reset()
proc = (mdl(proc[0]), proc[1], proc[2]) # this frees the images
run = time.perf_counter()
# load the next data here
try: next_proc = data_get()
except StopIteration: next_proc = None
nd = time.perf_counter()
y = np.array(proc[1])
proc = (proc[0].numpy() == y) & (y != -1) # this realizes the models and frees the cookies
n += proc.sum()
d += (y != -1).sum()
et = time.perf_counter()
tlog(f"****** {n:5d}/{d:5d} {n*100.0/d:.2f}% -- {(run-st)*1000:7.2f} ms to enqueue, {(et-run)*1000:7.2f} ms to realize ({(nd-run)*1000:7.2f} ms fetching). {(len(proc))/(et-st):8.2f} examples/sec. {GlobalCounters.global_ops*1e-12/(et-st):5.2f} TFLOPS")
st = et
proc, next_proc = next_proc, None
tlog("done")
# evaluation on the mlperf classes of the validation set from imagenet
from examples.mlperf.dataloader import batch_load_resnet
iterator = batch_load_resnet(getenv("BS", 128*6), val=getenv("VAL", 1), shuffle=False, pad_first_batch=True)
def data_get():
x,y,cookie = next(iterator)
return x.shard(GPUS, axis=0).realize(), y, cookie
n,d = 0,0
proc = data_get()
tlog("loaded initial data")
st = time.perf_counter()
while proc is not None:
GlobalCounters.reset()
proc = (mdl(proc[0]), proc[1], proc[2]) # this frees the images
run = time.perf_counter()
# load the next data here
try: next_proc = data_get()
except StopIteration: next_proc = None
nd = time.perf_counter()
y = np.array(proc[1])
proc = (proc[0].numpy() == y) & (y != -1) # this realizes the models and frees the cookies
n += proc.sum()
d += (y != -1).sum()
et = time.perf_counter()
tlog(f"****** {n:5d}/{d:5d} {n*100.0/d:.2f}% -- {(run-st)*1000:7.2f} ms to enqueue, {(et-run)*1000:7.2f} ms to realize ({(nd-run)*1000:7.2f} ms fetching). {(len(proc))/(et-st):8.2f} examples/sec. {GlobalCounters.global_ops*1e-12/(et-st):5.2f} TFLOPS")
st = et
proc, next_proc = next_proc, None
tlog("done")
def eval_unet3d():
# UNet3D

View File

@@ -9,6 +9,7 @@ from tinygrad.nn.optim import LAMB, LARS, SGD, OptimizerGroup, Adam
from extra.lr_scheduler import LRSchedulerGroup
from examples.mlperf.helpers import get_training_state, load_training_state
from extra.bench_log import BenchEvent, WallTimeEvent
# TODO: fix benchmark logging and use tinygrad tqdm
from tqdm import tqdm
@@ -205,24 +206,25 @@ def train_resnet():
st = time.perf_counter()
while proc is not None:
GlobalCounters.reset()
(loss, top_1), y, proc = train_step(proc[0], proc[1]), proc[2], proc[3]
with WallTimeEvent(BenchEvent.STEP):
(loss, top_1), y, proc = train_step(proc[0], proc[1]), proc[2], proc[3]
pt = time.perf_counter()
pt = time.perf_counter()
if len(prev_cookies) == getenv("STORE_COOKIES", 1): prev_cookies = [] # free previous cookies after gpu work has been enqueued
try:
if INITMLPERF:
next_proc = fake_data_get(BS)
else:
next_proc = data_get(it)
except StopIteration:
next_proc = None
if len(prev_cookies) == getenv("STORE_COOKIES", 1): prev_cookies = [] # free previous cookies after gpu work has been enqueued
try:
if INITMLPERF:
next_proc = fake_data_get(BS)
else:
next_proc = data_get(it)
except StopIteration:
next_proc = None
dt = time.perf_counter()
dt = time.perf_counter()
device_str = loss.device if isinstance(loss.device, str) else f"{loss.device[0]} * {len(loss.device)}"
loss, top_1 = loss.numpy().item(), top_1.numpy().item()
top_1_acc = top_1 / sum(yi != -1 for yi in y)
device_str = loss.device if isinstance(loss.device, str) else f"{loss.device[0]} * {len(loss.device)}"
loss, top_1 = loss.numpy().item(), top_1.numpy().item()
top_1_acc = top_1 / sum(yi != -1 for yi in y)
cl = time.perf_counter()
if BENCHMARK:
@@ -1124,23 +1126,24 @@ def train_bert():
BEAM.value = TRAIN_BEAM
st = time.perf_counter()
GlobalCounters.reset()
loss, global_norm, lr = train_step_bert(model, optimizer_group, scheduler_group, loss_scaler,
train_data["input_ids"], train_data["segment_ids"], train_data["input_mask"], train_data["masked_lm_positions"], \
train_data["masked_lm_ids"], train_data["masked_lm_weights"], train_data["next_sentence_labels"], GPUS)
with WallTimeEvent(BenchEvent.STEP):
loss, global_norm, lr = train_step_bert(model, optimizer_group, scheduler_group, loss_scaler,
train_data["input_ids"], train_data["segment_ids"], train_data["input_mask"], train_data["masked_lm_positions"], \
train_data["masked_lm_ids"], train_data["masked_lm_weights"], train_data["next_sentence_labels"], GPUS)
pt = time.perf_counter()
pt = time.perf_counter()
try:
next_data = next(train_it)
except StopIteration:
next_data = None
try:
next_data = next(train_it)
except StopIteration:
next_data = None
dt = time.perf_counter()
dt = time.perf_counter()
device_str = parameters[0].device if isinstance(parameters[0].device, str) else f"{parameters[0].device[0]} * {len(parameters[0].device)}"
loss = loss.item()
assert not math.isnan(loss)
lr = lr.item()
device_str = parameters[0].device if isinstance(parameters[0].device, str) else f"{parameters[0].device[0]} * {len(parameters[0].device)}"
loss = loss.item()
assert not math.isnan(loss)
lr = lr.item()
cl = time.perf_counter()
if BENCHMARK: step_times.append(cl - st)

View File

@@ -5,6 +5,7 @@ from examples.stable_diffusion import AutoencoderKL, get_alphas_cumprod
from examples.sdxl import DPMPP2MSampler, append_dims, LegacyDDPMDiscretization
from extra.models.unet import UNetModel
from extra.models.clip import FrozenOpenClipEmbedder
from extra.bench_log import BenchEvent, WallTimeEvent
from typing import Dict
import argparse, tempfile, os
@@ -117,12 +118,14 @@ if __name__ == "__main__":
if not weights_fn:
weights_url = args.weights_url if args.weights_url else default_weights_url
weights_fn = fetch(weights_url, os.path.basename(str(weights_url)))
load_state_dict(model, safe_load(weights_fn), strict=False)
if args.fp16:
for k,v in get_state_dict(model).items():
if k.startswith("model"):
v.replace(v.cast(dtypes.float16).realize())
with WallTimeEvent(BenchEvent.LOAD_WEIGHTS):
load_state_dict(model, safe_load(weights_fn), strict=False)
if args.fp16:
for k,v in get_state_dict(model).items():
if k.startswith("model"):
v.replace(v.cast(dtypes.float16).realize())
c = { "crossattn": model.cond_stage_model(args.prompt) }
uc = { "crossattn": model.cond_stage_model("") }

View File

@@ -9,6 +9,7 @@ from tinygrad.nn.state import safe_load, load_state_dict, get_state_dict
from tinygrad.helpers import fetch, trange, colored, Timing
from extra.models.clip import Embedder, FrozenClosedClipEmbedder, FrozenOpenClipEmbedder
from extra.models.unet import UNetModel, Upsample, Downsample, timestep_embedding
from extra.bench_log import BenchEvent, WallTimeEvent
from examples.stable_diffusion import ResnetBlock, Mid
import numpy as np
@@ -346,17 +347,18 @@ class DPMPP2MSampler:
for i in trange(num_sigmas - 1):
with Timing("step in ", enabled=timing, on_exit=lambda _: f", using {GlobalCounters.mem_used/1e9:.2f} GB"):
GlobalCounters.reset()
x, old_denoised = self.sampler_step(
old_denoised=old_denoised,
prev_sigma=(None if i==0 else sigmas[i-1].expand(x.shape[0])),
sigma=sigmas[i].expand(x.shape[0]),
next_sigma=sigmas[i+1].expand(x.shape[0]),
denoiser=denoiser,
x=x,
c=c,
uc=uc,
)
x.realize(old_denoised)
with WallTimeEvent(BenchEvent.STEP):
x, old_denoised = self.sampler_step(
old_denoised=old_denoised,
prev_sigma=(None if i==0 else sigmas[i-1].expand(x.shape[0])),
sigma=sigmas[i].expand(x.shape[0]),
next_sigma=sigmas[i+1].expand(x.shape[0]),
denoiser=denoiser,
x=x,
c=c,
uc=uc,
)
x.realize(old_denoised)
return x
@@ -388,7 +390,8 @@ if __name__ == "__main__":
start_mem_used = GlobalCounters.mem_used
with Timing("loaded weights in ", lambda et_ns: f", {(B:=(GlobalCounters.mem_used-start_mem_used))/1e9:.2f} GB loaded at {B/et_ns:.2f} GB/s"):
Tensor.realize(*loaded_weights)
with WallTimeEvent(BenchEvent.LOAD_WEIGHTS):
Tensor.realize(*loaded_weights)
del loaded_weights
N = 1

View File

@@ -14,6 +14,7 @@ from tinygrad.nn import Conv2d, GroupNorm
from tinygrad.nn.state import torch_load, load_state_dict, get_state_dict
from extra.models.clip import Closed, Tokenizer
from extra.models.unet import UNetModel
from extra.bench_log import BenchEvent, WallTimeEvent
class AttnBlock:
def __init__(self, in_channels):
@@ -232,12 +233,13 @@ if __name__ == "__main__":
model = StableDiffusion()
# load in weights
load_state_dict(model, torch_load(fetch('https://huggingface.co/CompVis/stable-diffusion-v-1-4-original/resolve/main/sd-v1-4.ckpt', 'sd-v1-4.ckpt'))['state_dict'], strict=False)
with WallTimeEvent(BenchEvent.LOAD_WEIGHTS):
load_state_dict(model, torch_load(fetch('https://huggingface.co/CompVis/stable-diffusion-v-1-4-original/resolve/main/sd-v1-4.ckpt', 'sd-v1-4.ckpt'))['state_dict'], strict=False)
if args.fp16:
for k,v in get_state_dict(model).items():
if k.startswith("model"):
v.replace(v.cast(dtypes.float16).realize())
if args.fp16:
for k,v in get_state_dict(model).items():
if k.startswith("model"):
v.replace(v.cast(dtypes.float16).realize())
# run through CLIP to get context
tokenizer = Tokenizer.ClipTokenizer()
@@ -270,9 +272,10 @@ if __name__ == "__main__":
GlobalCounters.reset()
t.set_description("%3d %3d" % (index, timestep))
with Timing("step in ", enabled=args.timing, on_exit=lambda _: f", using {GlobalCounters.mem_used/1e9:.2f} GB"):
tid = Tensor([index])
latent = run(model, unconditional_context, context, latent, Tensor([timestep]), alphas[tid], alphas_prev[tid], Tensor([args.guidance]))
if args.timing: Device[Device.DEFAULT].synchronize()
with WallTimeEvent(BenchEvent.STEP):
tid = Tensor([index])
latent = run(model, unconditional_context, context, latent, Tensor([timestep]), alphas[tid], alphas_prev[tid], Tensor([args.guidance]))
if args.timing: Device[Device.DEFAULT].synchronize()
del run
# upsample latent space to image with autoencoder

extra/bench_log.py (new file, 108 lines)
View File

@@ -0,0 +1,108 @@
import time, atexit, uuid
from enum import Enum
from tinygrad.device import Device
from tinygrad.helpers import DEBUG, ContextVar, getenv, GlobalCounters
BENCHMARK_LOG = ContextVar("BENCHMARK_LOG", "")
if BENCHMARK_LOG:
from influxdb_client_3 import InfluxDBClient3, Point, WriteOptions, write_client_options
from influxdb_client_3.write_client.client.write_api import WriteType
class BenchEvent(Enum):
LOAD_WEIGHTS = "load_weights"
STEP = "step"
FULL = "full"
class InstantBenchEvent(Enum):
GFLOPS = "gflops"
_events = {}
def clear_events():
for event in BenchEvent:
_events[event] = {"wall": [], "kernel": []}
for event in InstantBenchEvent:
_events[event] = []
clear_events()
class WallTimeEvent:
def __init__(self, event:BenchEvent):
self.event = event
def __enter__(self):
self.start = time.monotonic()
return self
def __exit__(self, *_):
_events[self.event]["wall"].append(time.monotonic() - self.start)
return False
class KernelTimeEvent:
def __init__(self, event:BenchEvent):
if DEBUG < 2:
raise Exception("KernelTimeEvent should only be used with DEBUG >= 2")
self.event = event
def __enter__(self):
self.start = GlobalCounters.time_sum_s
return self
def __exit__(self, *_):
_events[self.event]["kernel"].append(GlobalCounters.time_sum_s - self.start)
return False
def log_event_instant(event:InstantBenchEvent, value:float):
_events[event].append(value)
if BENCHMARK_LOG:
INFLUXDB_HOST = getenv("INFLUXDB_HOST", "")
INFLUXDB_ORG = getenv("INFLUXDB_ORG", "tiny")
INFLUXDB_TOKEN = getenv("INFLUXDB_TOKEN", "")
def _create_point(run_id, i, attempt, ref, commit, name, value, run):
point = Point(BENCHMARK_LOG.value).tag("id", run_id).tag("index", i)
point = point.tag("device", Device.DEFAULT)
point = point.tag("attempt", attempt).tag("ref", ref).tag("commit", commit)
point = point.field(name, value).field("x", run)
return point
@atexit.register
def write_events():
# see if there are any events to write
have_events = False
for event in _events:
if isinstance(event, BenchEvent):
for event_type, values in _events[event].items():
if len(values) > 0:
have_events = True
else:
if len(_events[event]) > 0:
have_events = True
if not have_events:
return
# pull from github envvars
ref = getenv("GITHUB_REF_NAME", "")
commit = getenv("GITHUB_SHA", "")
run = getenv("GITHUB_RUN_NUMBER", "")
attempt = getenv("GITHUB_RUN_ATTEMPT", "")
points = []
for event in _events:
run_id = str(uuid.uuid4())
if isinstance(event, BenchEvent):
for event_type, values in _events[event].items():
for i, value in enumerate(values):
point = _create_point(run_id, i, attempt, ref, commit, f"{event.value}_{event_type}", value, run)
points.append(point)
else:
for i, value in enumerate(_events[event]):
point = _create_point(run_id, i, attempt, ref, commit, event.value, value, run)
points.append(point)
write_options = WriteOptions(write_type=WriteType.synchronous, retry_interval=5000, max_retries=5, max_retry_delay=30000, exponential_base=2)
wco = write_client_options(write_options=write_options)
with InfluxDBClient3(
host=INFLUXDB_HOST,
org=INFLUXDB_ORG,
token=INFLUXDB_TOKEN,
auth_scheme="Basic",
database="benchmarks",
write_client_options=wco) as client:
client.write(points)
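
End to end, a script only wraps its hot sections; tagging with device/ref/commit/attempt, batching into points, and the atexit flush all happen in this file. A hedged usage sketch (hypothetical script; the write at exit needs BENCHMARK_LOG set, plus INFLUXDB_HOST and INFLUXDB_TOKEN in the environment):

from extra.bench_log import BenchEvent, InstantBenchEvent, WallTimeEvent, log_event_instant
from tinygrad import Tensor

with WallTimeEvent(BenchEvent.FULL):              # whole-run wall time
  for _ in range(10):
    with WallTimeEvent(BenchEvent.STEP):          # per-step wall time
      Tensor.rand(64, 64).sum().realize()         # stand-in for one real step
log_event_instant(InstantBenchEvent.GFLOPS, 123.4)  # the value here is made up
# at interpreter exit, write_events() turns each recorded value into one point in the
# measurement named by BENCHMARK_LOG (fields like "step_wall" / "full_wall"; instant
# events log under their own name, e.g. "gflops"); KernelTimeEvent behaves the same
# way but requires DEBUG >= 2 and logs "<event>_kernel"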

View File

@@ -73,7 +73,8 @@ setup(name='tinygrad',
"capstone",
"pycocotools",
"boto3",
"pandas"
"pandas",
"influxdb3-python"
],
'docs': [
"mkdocs",

View File

@@ -7,6 +7,7 @@ from tinygrad import Tensor, dtypes, TinyJit
from tinygrad.helpers import IMAGE, GlobalCounters, fetch, colored, getenv, trange
from tinygrad.tensor import _from_np_dtype
import numpy as np
from extra.bench_log import BenchEvent, WallTimeEvent
OPENPILOT_MODEL = sys.argv[1] if len(sys.argv) > 1 else "https://github.com/commaai/openpilot/raw/v0.9.4/selfdrive/modeld/models/supercombo.onnx"
@@ -33,10 +34,11 @@ if __name__ == "__main__":
for _ in range(20):
GlobalCounters.reset()
st = time.perf_counter_ns()
# Need to cast non-image inputs from numpy, this is only realistic way to run model
inputs = {**{k:v for k,v in new_inputs_junk.items() if 'img' in k},
**{k:Tensor(v) for k,v in new_inputs_junk_numpy.items() if 'img' not in k}}
ret = next(iter(run_onnx_jit(**inputs).values())).cast(dtypes.float32).numpy()
with WallTimeEvent(BenchEvent.STEP):
# Need to cast non-image inputs from numpy; this is the only realistic way to run the model
inputs = {**{k:v for k,v in new_inputs_junk.items() if 'img' in k},
**{k:Tensor(v) for k,v in new_inputs_junk_numpy.items() if 'img' not in k}}
ret = next(iter(run_onnx_jit(**inputs).values())).cast(dtypes.float32).numpy()
print(f"jitted: {(time.perf_counter_ns() - st)*1e-6:7.4f} ms")
suffix = ""

View File

@@ -0,0 +1,103 @@
import unittest, time
from extra.bench_log import BenchEvent, InstantBenchEvent, WallTimeEvent, KernelTimeEvent, log_event_instant, _events, clear_events
from tinygrad.helpers import Context
from tinygrad.tensor import Tensor
class TestBenchLog(unittest.TestCase):
def setUp(self):
clear_events()
def test_log_single_wall_time(self):
for event in BenchEvent:
with WallTimeEvent(event):
time.sleep(0.1)
# check event list
for event in BenchEvent:
self.assertEqual(len(_events[event]["wall"]), 1)
self.assertGreater(_events[event]["wall"][0], 0)
def test_log_double_wall_time(self):
for event in BenchEvent:
with WallTimeEvent(event):
time.sleep(0.1)
for event in reversed(BenchEvent):
with WallTimeEvent(event):
time.sleep(0.2)
# check event list
for event in BenchEvent:
self.assertEqual(len(_events[event]["wall"]), 2)
self.assertGreater(_events[event]["wall"][0], 0)
self.assertGreater(_events[event]["wall"][1], 0)
def test_log_single_kernel_time(self):
wall_times = []
with Context(DEBUG=2):
for event in BenchEvent:
with KernelTimeEvent(event):
st = time.perf_counter()
Tensor.rand(32, 32).sum().realize().item()
wall_times.append(time.perf_counter() - st)
# check event list
for event in BenchEvent:
self.assertEqual(len(_events[event]["kernel"]), 1)
self.assertLess(_events[event]["kernel"][0], wall_times[0])
self.assertGreater(_events[event]["kernel"][0], 0)
def test_interleaved_wall_kernel_time(self):
wall_times = []
with Context(DEBUG=2):
for event in BenchEvent:
with KernelTimeEvent(event):
st = time.perf_counter()
Tensor.rand(32, 32).sum().realize().item()
wall_times.append(time.perf_counter() - st)
with WallTimeEvent(event):
st = time.perf_counter()
Tensor.rand(32, 32).sum().realize().item()
wall_times.append(time.perf_counter() - st)
# check event list
for event in BenchEvent:
self.assertEqual(len(_events[event]["wall"]), 1)
self.assertEqual(len(_events[event]["kernel"]), 1)
self.assertLess(_events[event]["kernel"][0], wall_times[0])
self.assertGreater(_events[event]["kernel"][0], 0)
def test_stacked_wall_kernel_time(self):
with Context(DEBUG=2):
for event in BenchEvent:
with KernelTimeEvent(event):
with WallTimeEvent(event):
Tensor.rand(32, 32).sum().realize().item()
for event in BenchEvent:
with WallTimeEvent(event):
with KernelTimeEvent(event):
Tensor.rand(32, 32).sum().realize().item()
for event in BenchEvent:
self.assertEqual(len(_events[event]["wall"]), 2)
self.assertEqual(len(_events[event]["kernel"]), 2)
self.assertLess(_events[event]["kernel"][0], _events[event]["wall"][0])
self.assertGreater(_events[event]["kernel"][0], 0)
self.assertLess(_events[event]["kernel"][1], _events[event]["wall"][1])
self.assertGreater(_events[event]["kernel"][1], 0)
def test_log_instant_event(self):
for event in InstantBenchEvent:
log_event_instant(event, 1000)
# check event list
for event in InstantBenchEvent:
self.assertEqual(len(_events[event]), 1)
self.assertEqual(_events[event][0], 1000)
if __name__ == '__main__':
unittest.main()