mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-01-08 22:48:25 -05:00
move benchmark stat tracking to influxdb (#10185)
This commit is contained in:
134
.github/workflows/benchmark.yml
vendored
134
.github/workflows/benchmark.yml
vendored
@@ -52,14 +52,14 @@ jobs:
|
||||
- name: reset process replay
|
||||
run: python3.11 test/external/process_replay/reset.py
|
||||
- name: Run Stable Diffusion
|
||||
run: JIT=1 python3.11 examples/stable_diffusion.py --fp16 --seed 0 --noshow --timing | tee sd.txt
|
||||
run: BENCHMARK_LOG=stable_diffusion JIT=1 python3.11 examples/stable_diffusion.py --fp16 --seed 0 --noshow --timing | tee sd.txt
|
||||
- name: Run Stable Diffusion without fp16
|
||||
run: JIT=1 python3.11 examples/stable_diffusion.py --seed 0 --noshow --timing | tee sd_no_fp16.txt
|
||||
run: BENCHMARK_LOG=stable_diffusion_fp32 JIT=1 python3.11 examples/stable_diffusion.py --seed 0 --noshow --timing | tee sd_no_fp16.txt
|
||||
- name: Run Stable Diffusion v2
|
||||
run: JIT=1 python3.11 examples/sdv2.py --fp16 --seed 0 --noshow --timing | tee sdv2.txt
|
||||
run: BENCHMARK_LOG=stable_diffusion_v2 JIT=1 python3.11 examples/sdv2.py --fp16 --seed 0 --noshow --timing | tee sdv2.txt
|
||||
# process replay can't capture this, the graph is too large
|
||||
- name: Run SDXL
|
||||
run: CAPTURE_PROCESS_REPLAY=0 JIT=1 python3.11 examples/sdxl.py --seed 0 --noshow --timing | tee sdxl.txt
|
||||
run: BENCHMARK_LOG=stable_diffusion_xl CAPTURE_PROCESS_REPLAY=0 JIT=1 python3.11 examples/sdxl.py --seed 0 --noshow --timing | tee sdxl.txt
|
||||
- name: Run model inference benchmark
|
||||
run: METAL=1 python3.11 test/external/external_model_benchmark.py
|
||||
- name: Test speed vs torch
|
||||
@@ -80,40 +80,40 @@ jobs:
|
||||
run: METAL=1 M_START=6 M_STOP=10 M_STEP=1 N_START=6 N_STOP=10 N_STEP=1 K_START=6 K_STOP=24 K_STEP=1 TC_OPT=2 DEBUG=2 python3.11 ./extra/gemm/fuzz_matmul.py
|
||||
- name: Run LLaMA
|
||||
run: |
|
||||
JIT=0 python3.11 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_unjitted.txt
|
||||
JIT=1 python3.11 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_jitted.txt
|
||||
BENCHMARK_LOG=llama_nojit JIT=0 python3.11 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_unjitted.txt
|
||||
BENCHMARK_LOG=llama JIT=1 python3.11 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_jitted.txt
|
||||
- name: Run LLaMA with BEAM
|
||||
run: JITBEAM=2 IGNORE_BEAM_CACHE=1 python3.11 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_beam.txt
|
||||
run: BENCHMARK_LOG=llama_beam JITBEAM=2 IGNORE_BEAM_CACHE=1 python3.11 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_beam.txt
|
||||
- name: Run quantized LLaMA
|
||||
run: |
|
||||
python3.11 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing --quantize int8 | tee llama_int8.txt
|
||||
python3.11 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing --quantize nf4 | tee llama_nf4.txt
|
||||
BENCHMARK_LOG=llama_int8 python3.11 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing --quantize int8 | tee llama_int8.txt
|
||||
BENCHMARK_LOG=llama_nf4 python3.11 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing --quantize nf4 | tee llama_nf4.txt
|
||||
- name: Run quantized LLaMA3
|
||||
run: |
|
||||
python3.11 examples/llama3.py --size 8B --temperature 0 --benchmark --quantize int8 | tee llama3_int8.txt
|
||||
python3.11 examples/llama3.py --size 8B --temperature 0 --benchmark --quantize nf4 | tee llama3_nf4.txt
|
||||
BENCHMARK_LOG=llama3_int8 python3.11 examples/llama3.py --size 8B --temperature 0 --benchmark --quantize int8 | tee llama3_int8.txt
|
||||
BENCHMARK_LOG=llama3_nf4 python3.11 examples/llama3.py --size 8B --temperature 0 --benchmark --quantize nf4 | tee llama3_nf4.txt
|
||||
#- name: Run LLaMA 7B on 4 (virtual) GPUs
|
||||
# run: python3.11 examples/llama.py --gen 1 --size 7B --shard 4 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_four_gpu.txt
|
||||
- name: Run GPT2
|
||||
run: |
|
||||
JIT=0 python3.11 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_unjitted.txt
|
||||
JIT=1 python3.11 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_jitted.txt
|
||||
BENCHMARK_LOG=gpt2_nojit JIT=0 python3.11 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_unjitted.txt
|
||||
BENCHMARK_LOG=gpt2 JIT=1 python3.11 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_jitted.txt
|
||||
- name: Run GPT2 w HALF
|
||||
run: HALF=1 python3.11 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half.txt
|
||||
run: BENCHMARK_LOG=gpt2_half HALF=1 python3.11 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half.txt
|
||||
- name: Run GPT2 w HALF/BEAM
|
||||
run: HALF=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3.11 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt
|
||||
run: BENCHMARK_LOG=gpt2_half_beam HALF=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3.11 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt
|
||||
- name: Run OLMoE
|
||||
run: python3.11 examples/olmoe.py
|
||||
run: BENCHMARK_LOG=olmoe python3.11 examples/olmoe.py
|
||||
- name: Train MNIST
|
||||
run: time PYTHONPATH=. TARGET_EVAL_ACC_PCT=96.0 python3.11 examples/beautiful_mnist.py | tee beautiful_mnist.txt
|
||||
- name: Run 10 CIFAR training steps
|
||||
run: JIT=1 STEPS=10 python3.11 examples/hlb_cifar10.py | tee train_cifar.txt
|
||||
run: BENCHMARK_LOG=cifar_10steps JIT=1 STEPS=10 python3.11 examples/hlb_cifar10.py | tee train_cifar.txt
|
||||
- name: Run 10 CIFAR training steps w HALF
|
||||
run: JIT=2 STEPS=10 DEFAULT_FLOAT=HALF python3.11 examples/hlb_cifar10.py | tee train_cifar_half.txt
|
||||
run: BENCHMARK_LOG=cifar_10steps_half JIT=2 STEPS=10 DEFAULT_FLOAT=HALF python3.11 examples/hlb_cifar10.py | tee train_cifar_half.txt
|
||||
#- name: Run 10 CIFAR training steps w BF16
|
||||
# run: STEPS=10 DEFAULT_FLOAT=BFLOAT16 python3.11 examples/hlb_cifar10.py | tee train_cifar_bf16.txt
|
||||
- name: Run 10 CIFAR training steps w winograd
|
||||
run: JIT=1 WINO=1 STEPS=10 python3.11 examples/hlb_cifar10.py | tee train_cifar_wino.txt
|
||||
run: BENCHMARK_LOG=cifar_10steps_wino JIT=1 WINO=1 STEPS=10 python3.11 examples/hlb_cifar10.py | tee train_cifar_wino.txt
|
||||
- name: UsbGPU boot time
|
||||
run: sudo -E PYTHONPATH=. DEBUG=2 AM_RESET=1 AMD=1 AMD_IFACE=USB time python3.11 test/test_tiny.py TestTiny.test_plus
|
||||
- name: UsbGPU tiny tests
|
||||
@@ -210,37 +210,37 @@ jobs:
|
||||
- name: Test CUDA=1
|
||||
run: DEBUG=2 CUDA=1 python -m pytest -rA test/test_tiny.py
|
||||
- name: Run Stable Diffusion
|
||||
run: NV=1 python3 examples/stable_diffusion.py --fp16 --seed 0 --noshow --timing | tee sd.txt
|
||||
run: BENCHMARK_LOG=stable_diffusion NV=1 python3 examples/stable_diffusion.py --fp16 --seed 0 --noshow --timing | tee sd.txt
|
||||
- name: Run SDXL
|
||||
run: CAPTURE_PROCESS_REPLAY=0 NV=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/sdxl.py --seed 0 --noshow --timing | tee sdxl.txt
|
||||
run: BENCHMARK_LOG=stable_diffusion_xl CAPTURE_PROCESS_REPLAY=0 NV=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/sdxl.py --seed 0 --noshow --timing | tee sdxl.txt
|
||||
- name: Run LLaMA
|
||||
run: |
|
||||
NV=1 JIT=0 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_unjitted.txt
|
||||
NV=1 JIT=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_jitted.txt
|
||||
BENCHMARK_LOG=llama_nojit NV=1 JIT=0 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_unjitted.txt
|
||||
BENCHMARK_LOG=llama NV=1 JIT=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_jitted.txt
|
||||
- name: Run LLaMA with BEAM
|
||||
run: NV=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_beam.txt
|
||||
run: BENCHMARK_LOG=llama_beam NV=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_beam.txt
|
||||
# - name: Run LLaMA 7B on 4 GPUs
|
||||
# run: NV=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama.py --gen 1 --size 7B --shard 4 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_four_gpu.txt
|
||||
# - name: Run LLaMA 7B on 6 GPUs
|
||||
# run: NV=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama.py --gen 1 --size 7B --shard 6 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_six_gpu.txt
|
||||
- name: Run LLaMA-3 8B BEAM
|
||||
run: NV=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama3.py --size 8B --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_beam.txt
|
||||
run: BENCHMARK_LOG=llama3_beam NV=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama3.py --size 8B --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_beam.txt
|
||||
- name: Run LLaMA-3 8B on 4 GPUs with BEAM
|
||||
run: NV=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 4 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_four_gpu.txt
|
||||
run: BENCHMARK_LOG=llama3_beam_4gpu NV=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 4 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_four_gpu.txt
|
||||
# - name: Run LLaMA-3 8B on 6 GPUs
|
||||
# run: NV=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 6 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_six_gpu.txt
|
||||
# - name: Run LLaMA-2 70B
|
||||
# run: NV=1 CAPTURE_PROCESS_REPLAY=0 MAX_CONTEXT=256 python3 examples/llama.py --gen 2 --size 70B --shard 6 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_2_70B.txt
|
||||
- name: Run Mixtral 8x7B
|
||||
run: time NV=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/mixtral.py --temperature 0 --count 10 --timing | tee mixtral.txt
|
||||
run: time BENCHMARK_LOG=mixtral NV=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/mixtral.py --temperature 0 --count 10 --timing | tee mixtral.txt
|
||||
- name: Run GPT2
|
||||
run: |
|
||||
NV=1 JIT=0 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_unjitted.txt
|
||||
NV=1 JIT=1 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_jitted.txt
|
||||
BENCHMARK_LOG=gpt2_nojit NV=1 JIT=0 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_unjitted.txt
|
||||
BENCHMARK_LOG=gpt2 NV=1 JIT=1 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_jitted.txt
|
||||
- name: Run GPT2 w HALF
|
||||
run: NV=1 HALF=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half.txt
|
||||
run: BENCHMARK_LOG=gpt2_half NV=1 HALF=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half.txt
|
||||
- name: Run GPT2 w HALF/BEAM
|
||||
run: NV=1 HALF=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt
|
||||
run: BENCHMARK_LOG=gpt2_half_beam NV=1 HALF=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt
|
||||
- uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: Speed (NVIDIA)
|
||||
@@ -304,26 +304,26 @@ jobs:
|
||||
- name: Train MNIST
|
||||
run: time PYTHONPATH=. NV=1 TARGET_EVAL_ACC_PCT=96.0 python3 examples/beautiful_mnist.py | tee beautiful_mnist.txt
|
||||
- name: Run 10 CIFAR training steps
|
||||
run: NV=1 STEPS=10 python3 examples/hlb_cifar10.py | tee train_cifar.txt
|
||||
run: BENCHMARK_LOG=cifar_10steps NV=1 STEPS=10 python3 examples/hlb_cifar10.py | tee train_cifar.txt
|
||||
- name: Run 10 CIFAR training steps w HALF
|
||||
run: NV=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_half.txt
|
||||
run: BENCHMARK_LOG=cifar_10steps_half NV=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_half.txt
|
||||
- name: Run 10 CIFAR training steps w BF16
|
||||
run: NV=1 STEPS=10 DEFAULT_FLOAT=BFLOAT16 python3 examples/hlb_cifar10.py | tee train_cifar_bf16.txt
|
||||
run: BENCHMARK_LOG=cifar_10steps_bf16 NV=1 STEPS=10 DEFAULT_FLOAT=BFLOAT16 python3 examples/hlb_cifar10.py | tee train_cifar_bf16.txt
|
||||
- name: Run 10 CIFAR training steps w winograd
|
||||
run: NV=1 CAPTURE_PROCESS_REPLAY=0 WINO=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_wino.txt
|
||||
run: BENCHMARK_LOG=cifar_10steps_half_wino NV=1 CAPTURE_PROCESS_REPLAY=0 WINO=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_wino.txt
|
||||
- name: Run full CIFAR training w 1 GPU
|
||||
run: time NV=1 DEFAULT_FLOAT=HALF LATEWINO=1 STEPS=1000 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee train_cifar_one_gpu.txt
|
||||
run: time BENCHMARK_LOG=cifar NV=1 DEFAULT_FLOAT=HALF LATEWINO=1 STEPS=1000 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee train_cifar_one_gpu.txt
|
||||
- name: Run full CIFAR training steps w 6 GPUS
|
||||
run: time CAPTURE_PROCESS_REPLAY=0 NV=1 DEFAULT_FLOAT=HALF STEPS=350 BS=1536 GPUS=6 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee train_cifar_six_gpu.txt
|
||||
run: time BENCHMARK_LOG=cifar_6gpu CAPTURE_PROCESS_REPLAY=0 NV=1 DEFAULT_FLOAT=HALF STEPS=350 BS=1536 GPUS=6 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee train_cifar_six_gpu.txt
|
||||
- name: Run MLPerf resnet eval on training data
|
||||
run: time NV=1 MODEL=resnet python3 examples/mlperf/model_eval.py
|
||||
run: time BENCHMARK_LOG=resnet_eval NV=1 MODEL=resnet python3 examples/mlperf/model_eval.py
|
||||
- name: Run 10 MLPerf ResNet50 training steps (1 gpu)
|
||||
run: NV=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=256 GPUS=1 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet_one_gpu.txt
|
||||
run: BENCHMARK_LOG=resnet_10steps NV=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=256 GPUS=1 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet_one_gpu.txt
|
||||
- name: Run 10 MLPerf ResNet50 training steps (6 gpu)
|
||||
run: NV=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=1536 GPUS=6 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet.txt
|
||||
run: BENCHMARK_LOG=resnet_10steps_6gpu NV=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=1536 GPUS=6 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet.txt
|
||||
- name: Run 10 MLPerf Bert training steps (6 gpu)
|
||||
# TODO: remove BERT_LAYERS once scheduler is fast
|
||||
run: NV=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=66 GPUS=6 BERT_LAYERS=2 FUSE_ARANGE=1 FUSE_ARANGE_UINT=0 MODEL=bert python3 examples/mlperf/model_train.py | tee train_bert.txt
|
||||
run: BENCHMARK_LOG=bert_10steps_6gpu NV=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=66 GPUS=6 BERT_LAYERS=2 FUSE_ARANGE=1 FUSE_ARANGE_UINT=0 MODEL=bert python3 examples/mlperf/model_train.py | tee train_bert.txt
|
||||
- uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: Speed (NVIDIA Training)
|
||||
@@ -409,23 +409,23 @@ jobs:
|
||||
- name: Test AM warm start time
|
||||
run: time AMD=1 python3 test/test_tiny.py TestTiny.test_plus
|
||||
- name: Run Stable Diffusion
|
||||
run: AMD=1 python3 examples/stable_diffusion.py --fp16 --seed 0 --noshow --timing | tee sd.txt
|
||||
run: BENCHMARK_LOG=stable_diffusion AMD=1 python3 examples/stable_diffusion.py --fp16 --seed 0 --noshow --timing | tee sd.txt
|
||||
- name: Run SDXL
|
||||
run: CAPTURE_PROCESS_REPLAY=0 AMD=1 python3 examples/sdxl.py --seed 0 --noshow --timing | tee sdxl.txt
|
||||
run: BENCHMARK_LOG=stable_diffusion_xl CAPTURE_PROCESS_REPLAY=0 AMD=1 python3 examples/sdxl.py --seed 0 --noshow --timing | tee sdxl.txt
|
||||
- name: Run LLaMA 7B
|
||||
run: |
|
||||
AMD=1 JIT=0 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_unjitted.txt
|
||||
AMD=1 JIT=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_jitted.txt
|
||||
BENCHMARK_LOG=llama_nojit AMD=1 JIT=0 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_unjitted.txt
|
||||
BENCHMARK_LOG=llama AMD=1 JIT=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_jitted.txt
|
||||
- name: Run LLaMA 7B with BEAM
|
||||
run: AMD=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_beam.txt
|
||||
run: BENCHMARK_LOG=llama_beam AMD=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_beam.txt
|
||||
# - name: Run LLaMA 7B on 4 GPUs
|
||||
# run: AMD=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama.py --gen 1 --size 7B --shard 4 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_four_gpu.txt
|
||||
# - name: Run LLaMA 7B on 6 GPUs
|
||||
# run: AMD=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama.py --gen 1 --size 7B --shard 6 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_six_gpu.txt
|
||||
- name: Run LLaMA-3 8B BEAM
|
||||
run: AMD=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama3.py --size 8B --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_beam.txt
|
||||
run: BENCHMARK_LOG=llama3_beam AMD=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama3.py --size 8B --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_beam.txt
|
||||
- name: Run LLaMA-3 8B on 4 GPUs with BEAM
|
||||
run: AMD=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 4 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_four_gpu.txt
|
||||
run: BENCHMARK_LOG=llama3_beam_4gpu AMD=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 4 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_four_gpu.txt
|
||||
# - name: Run LLaMA-3 8B on 6 GPUs
|
||||
# run: AMD=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 6 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_six_gpu.txt
|
||||
- name: Restore amdgpu
|
||||
@@ -433,15 +433,15 @@ jobs:
|
||||
# - name: Run LLaMA-2 70B
|
||||
# run: AMD=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama.py --gen 2 --size 70B --shard 6 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_2_70B.txt
|
||||
- name: Run Mixtral 8x7B
|
||||
run: time AMD=1 python3 examples/mixtral.py --temperature 0 --count 10 --timing | tee mixtral.txt
|
||||
run: time BENCHMARK_LOG=mixtral AMD=1 python3 examples/mixtral.py --temperature 0 --count 10 --timing | tee mixtral.txt
|
||||
- name: Run GPT2
|
||||
run: |
|
||||
AMD=1 JIT=0 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_unjitted.txt
|
||||
AMD=1 JIT=1 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_jitted.txt
|
||||
BENCHMARK_LOG=gpt2_nojit AMD=1 JIT=0 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_unjitted.txt
|
||||
BENCHMARK_LOG=gpt2 AMD=1 JIT=1 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_jitted.txt
|
||||
- name: Run GPT2 w HALF
|
||||
run: AMD=1 HALF=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half.txt
|
||||
run: BENCHMARK_LOG=gpt2_half AMD=1 HALF=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half.txt
|
||||
- name: Run GPT2 w HALF/BEAM
|
||||
run: AMD=1 HALF=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt
|
||||
run: BENCHMARK_LOG=gpt2_half_beam AMD=1 HALF=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt
|
||||
- uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: Speed (AMD)
|
||||
@@ -500,26 +500,26 @@ jobs:
|
||||
- name: Train MNIST
|
||||
run: time PYTHONPATH=. AMD=1 TARGET_EVAL_ACC_PCT=96.0 python3 examples/beautiful_mnist.py | tee beautiful_mnist.txt
|
||||
- name: Run 10 CIFAR training steps
|
||||
run: AMD=1 STEPS=10 python3 examples/hlb_cifar10.py | tee train_cifar.txt
|
||||
run: BENCHMARK_LOG=cifar_10steps AMD=1 STEPS=10 python3 examples/hlb_cifar10.py | tee train_cifar.txt
|
||||
- name: Run 10 CIFAR training steps w HALF
|
||||
run: AMD=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_half.txt
|
||||
run: BENCHMARK_LOG=cifar_10steps_half AMD=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_half.txt
|
||||
- name: Run 10 CIFAR training steps w BF16
|
||||
run: AMD=1 STEPS=10 DEFAULT_FLOAT=BFLOAT16 python3 examples/hlb_cifar10.py | tee train_cifar_bf16.txt
|
||||
run: BENCHMARK_LOG=cifar_10steps_bf16 AMD=1 STEPS=10 DEFAULT_FLOAT=BFLOAT16 python3 examples/hlb_cifar10.py | tee train_cifar_bf16.txt
|
||||
- name: Run 10 CIFAR training steps w winograd
|
||||
run: AMD=1 WINO=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_wino.txt
|
||||
run: BENCHMARK_LOG=cifar_10steps_half_wino AMD=1 WINO=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_wino.txt
|
||||
- name: Run full CIFAR training w 1 GPU
|
||||
run: time AMD=1 DEFAULT_FLOAT=HALF LATEWINO=1 STEPS=1000 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee train_cifar_one_gpu.txt
|
||||
run: time BENCHMARK_LOG=cifar AMD=1 DEFAULT_FLOAT=HALF LATEWINO=1 STEPS=1000 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee train_cifar_one_gpu.txt
|
||||
- name: Run full CIFAR training steps w 6 GPUS
|
||||
run: time AMD=1 DEFAULT_FLOAT=HALF STEPS=350 BS=1536 GPUS=6 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee train_cifar_six_gpu.txt
|
||||
run: time BENCHMARK_LOG=cifar_6gpu AMD=1 DEFAULT_FLOAT=HALF STEPS=350 BS=1536 GPUS=6 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee train_cifar_six_gpu.txt
|
||||
- name: Run MLPerf resnet eval
|
||||
run: time AMD=1 MODEL=resnet python3 examples/mlperf/model_eval.py
|
||||
run: time BENCHMARK_LOG=resnet_eval AMD=1 MODEL=resnet python3 examples/mlperf/model_eval.py
|
||||
- name: Run 10 MLPerf ResNet50 training steps (1 gpu)
|
||||
run: AMD=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=256 GPUS=1 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet_one_gpu.txt
|
||||
run: BENCHMARK_LOG=resnet_10steps AMD=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=256 GPUS=1 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet_one_gpu.txt
|
||||
- name: Run 10 MLPerf ResNet50 training steps (6 gpu)
|
||||
run: AMD=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=1536 GPUS=6 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet.txt
|
||||
run: BENCHMARK_LOG=resnet_10steps_6gpu AMD=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=1536 GPUS=6 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet.txt
|
||||
- name: Run 10 MLPerf Bert training steps (6 gpu)
|
||||
# TODO: remove BERT_LAYERS once scheduler is fast
|
||||
run: AMD=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=66 GPUS=6 BERT_LAYERS=2 FUSE_ARANGE=1 FUSE_ARANGE_UINT=0 MODEL=bert python3 examples/mlperf/model_train.py | tee train_bert.txt
|
||||
run: BENCHMARK_LOG=bert_10steps_6gpu AMD=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=66 GPUS=6 BERT_LAYERS=2 FUSE_ARANGE=1 FUSE_ARANGE_UINT=0 MODEL=bert python3 examples/mlperf/model_train.py | tee train_bert.txt
|
||||
- uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: Speed (AMD Training)
|
||||
@@ -558,13 +558,13 @@ jobs:
|
||||
- name: validate openpilot 0.9.7
|
||||
run: PYTHONPATH=. FLOAT16=0 IMAGE=2 QCOM=1 taskset -c 4-7 python3 test/external/external_benchmark_openpilot.py https://github.com/commaai/openpilot/raw/v0.9.7/selfdrive/modeld/models/supercombo.onnx | tee openpilot_image_0_9_7.txt
|
||||
- name: benchmark openpilot 0.9.4
|
||||
run: PYTHONPATH=. QCOM=1 taskset -c 4-7 python3 test/external/external_benchmark_openpilot.py https://github.com/commaai/openpilot/raw/v0.9.4/selfdrive/modeld/models/supercombo.onnx | tee openpilot_0_9_4.txt
|
||||
run: BENCHMARK_LOG=openpilot_0_9_4 PYTHONPATH=. QCOM=1 taskset -c 4-7 python3 test/external/external_benchmark_openpilot.py https://github.com/commaai/openpilot/raw/v0.9.4/selfdrive/modeld/models/supercombo.onnx | tee openpilot_0_9_4.txt
|
||||
- name: benchmark openpilot 0.9.7
|
||||
run: PYTHONPATH=. QCOM=1 taskset -c 4-7 python3 test/external/external_benchmark_openpilot.py https://github.com/commaai/openpilot/raw/v0.9.7/selfdrive/modeld/models/supercombo.onnx | tee openpilot_0_9_7.txt
|
||||
run: BENCHMARK_LOG=openpilot_0_9_7 PYTHONPATH=. QCOM=1 taskset -c 4-7 python3 test/external/external_benchmark_openpilot.py https://github.com/commaai/openpilot/raw/v0.9.7/selfdrive/modeld/models/supercombo.onnx | tee openpilot_0_9_7.txt
|
||||
- name: benchmark openpilot w IMAGE=2 0.9.4
|
||||
run: PYTHONPATH=. NOLOCALS=1 FLOAT16=1 IMAGE=2 QCOM=1 taskset -c 4-7 python3 test/external/external_benchmark_openpilot.py https://github.com/commaai/openpilot/raw/v0.9.4/selfdrive/modeld/models/supercombo.onnx | tee openpilot_image_0_9_4.txt
|
||||
run: BENCHMARK_LOG=openpilot_0_9_4_image PYTHONPATH=. NOLOCALS=1 FLOAT16=1 IMAGE=2 QCOM=1 taskset -c 4-7 python3 test/external/external_benchmark_openpilot.py https://github.com/commaai/openpilot/raw/v0.9.4/selfdrive/modeld/models/supercombo.onnx | tee openpilot_image_0_9_4.txt
|
||||
- name: benchmark openpilot w IMAGE=2 0.9.7
|
||||
run: PYTHONPATH=. NOLOCALS=1 FLOAT16=1 IMAGE=2 QCOM=1 taskset -c 4-7 python3 test/external/external_benchmark_openpilot.py https://github.com/commaai/openpilot/raw/v0.9.7/selfdrive/modeld/models/supercombo.onnx | tee openpilot_image_0_9_7.txt
|
||||
run: BENCHMARK_LOG=openpilot_0_9_7_image PYTHONPATH=. NOLOCALS=1 FLOAT16=1 IMAGE=2 QCOM=1 taskset -c 4-7 python3 test/external/external_benchmark_openpilot.py https://github.com/commaai/openpilot/raw/v0.9.7/selfdrive/modeld/models/supercombo.onnx | tee openpilot_image_0_9_7.txt
|
||||
- name: openpilot compile3 0.9.7
|
||||
run: PYTHONPATH="." QCOM=1 taskset -c 4-7 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/v0.9.7/selfdrive/modeld/models/supercombo.onnx
|
||||
- name: openpilot compile3 0.9.7+ tomb raider
|
||||
|
||||
@@ -7,6 +7,7 @@ from tinygrad.ops import UOp
|
||||
from tinygrad.helpers import Timing, DEBUG, JIT, getenv, fetch, colored, trange
|
||||
from tinygrad.nn import Embedding, Linear, LayerNorm
|
||||
from tinygrad.nn.state import gguf_load, torch_load, load_state_dict, get_state_dict
|
||||
from extra.bench_log import BenchEvent, WallTimeEvent
|
||||
|
||||
MAX_CONTEXT = getenv("MAX_CONTEXT", 128)
|
||||
HALF = getenv("HALF")
|
||||
@@ -134,11 +135,12 @@ class GPT2:
|
||||
# lm head and wte are tied
|
||||
weights['lm_head.weight'] = weights['wte.weight']
|
||||
|
||||
load_state_dict(model, weights)
|
||||
with WallTimeEvent(BenchEvent.LOAD_WEIGHTS):
|
||||
load_state_dict(model, weights)
|
||||
|
||||
if HALF:
|
||||
for l in get_state_dict(model).values():
|
||||
l.replace(l.half().realize())
|
||||
if HALF:
|
||||
for l in get_state_dict(model).values():
|
||||
l.replace(l.half().realize())
|
||||
|
||||
return GPT2(model, tokenizer)
|
||||
|
||||
@@ -167,7 +169,8 @@ class GPT2:
|
||||
return key
|
||||
state_dict = { _remap_gguf_key(k): v for k, v in state_dict.items() }
|
||||
model = Transformer(**gpt2_params)
|
||||
load_state_dict(model, state_dict)
|
||||
with WallTimeEvent(BenchEvent.LOAD_WEIGHTS):
|
||||
load_state_dict(model, state_dict)
|
||||
return GPT2(model, tiktoken.get_encoding("gpt2"))
|
||||
|
||||
def __init__(self, model, tokenizer):
|
||||
@@ -185,11 +188,12 @@ class GPT2:
|
||||
with Timing("ran model in ", on_exit=(lambda et: (f", {(GlobalCounters.time_sum_s-st)*1e3:.2f} ms on GPU" if DEBUG>=2 else "")+
|
||||
f", {GlobalCounters.global_ops*1e-9:.2f} GOPS, {GlobalCounters.global_mem*1e-9:.2f} GB"+
|
||||
(f", {GlobalCounters.global_mem*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s" if DEBUG>=2 else "")) if DEBUG else None, enabled=timing):
|
||||
if batch_size == 1 and len(toks[0][start_pos:]) == 1:
|
||||
tokens = Variable("tokens", 0, VOCAB_SIZE).bind(toks[0][start_pos])
|
||||
else:
|
||||
tokens = Tensor([x[start_pos:] for x in toks])
|
||||
tok = self.model(tokens, Variable("start_pos", 1 if start_pos else 0, MAX_CONTEXT-1).bind(start_pos), temperature).tolist()
|
||||
with WallTimeEvent(BenchEvent.STEP):
|
||||
if batch_size == 1 and len(toks[0][start_pos:]) == 1:
|
||||
tokens = Variable("tokens", 0, VOCAB_SIZE).bind(toks[0][start_pos])
|
||||
else:
|
||||
tokens = Tensor([x[start_pos:] for x in toks])
|
||||
tok = self.model(tokens, Variable("start_pos", 1 if start_pos else 0, MAX_CONTEXT-1).bind(start_pos), temperature).tolist()
|
||||
start_pos = len(toks[0])
|
||||
for i,t in enumerate(tok): toks[i].append(t)
|
||||
return [self.tokenizer.decode(x) for x in toks]
|
||||
|
||||
@@ -11,6 +11,7 @@ from tinygrad import nn, dtypes, Tensor, Device, GlobalCounters, TinyJit
|
||||
from tinygrad.nn.state import get_state_dict, get_parameters
|
||||
from tinygrad.nn import optim
|
||||
from tinygrad.helpers import Context, BEAM, WINO, getenv, colored, prod
|
||||
from extra.bench_log import BenchEvent, WallTimeEvent, WallTimeEvent
|
||||
|
||||
cifar_mean = [0.4913997551666284, 0.48215855929893703, 0.4465309133731618]
|
||||
cifar_std = [0.24703225141799082, 0.24348516474564, 0.26158783926049628]
|
||||
@@ -395,20 +396,23 @@ def train_cifar():
|
||||
if STEPS == 0 or i == STEPS: break
|
||||
|
||||
GlobalCounters.reset()
|
||||
X, Y = next(batcher)
|
||||
if len(GPUS) > 1:
|
||||
X.shard_(GPUS, axis=0)
|
||||
Y.shard_(GPUS, axis=0)
|
||||
|
||||
with Context(BEAM=getenv("LATEBEAM", BEAM.value), WINO=getenv("LATEWINO", WINO.value)):
|
||||
loss = train_step_jitted(model, optim.OptimizerGroup(opt_bias, opt_non_bias), [lr_sched_bias, lr_sched_non_bias], X, Y)
|
||||
et = time.monotonic()
|
||||
loss_cpu = loss.numpy()
|
||||
# EMA for network weights
|
||||
if getenv("EMA") and i > hyp['ema']['steps'] and (i+1) % hyp['ema']['every_n_steps'] == 0:
|
||||
if model_ema is None:
|
||||
model_ema = modelEMA(W, model)
|
||||
model_ema.update(model, Tensor([projected_ema_decay_val*(i/STEPS)**hyp['ema']['decay_pow']]))
|
||||
with WallTimeEvent(BenchEvent.STEP):
|
||||
X, Y = next(batcher)
|
||||
if len(GPUS) > 1:
|
||||
X.shard_(GPUS, axis=0)
|
||||
Y.shard_(GPUS, axis=0)
|
||||
|
||||
with Context(BEAM=getenv("LATEBEAM", BEAM.value), WINO=getenv("LATEWINO", WINO.value)):
|
||||
loss = train_step_jitted(model, optim.OptimizerGroup(opt_bias, opt_non_bias), [lr_sched_bias, lr_sched_non_bias], X, Y)
|
||||
et = time.monotonic()
|
||||
loss_cpu = loss.numpy()
|
||||
# EMA for network weights
|
||||
if getenv("EMA") and i > hyp['ema']['steps'] and (i+1) % hyp['ema']['every_n_steps'] == 0:
|
||||
if model_ema is None:
|
||||
model_ema = modelEMA(W, model)
|
||||
model_ema.update(model, Tensor([projected_ema_decay_val*(i/STEPS)**hyp['ema']['decay_pow']]))
|
||||
|
||||
cl = time.monotonic()
|
||||
device_str = loss.device if isinstance(loss.device, str) else f"{loss.device[0]} * {len(loss.device)}"
|
||||
# 53 221.74 ms run, 2.22 ms python, 219.52 ms CL, 803.39 loss, 0.000807 LR, 4.66 GB used, 3042.49 GFLOPS, 674.65 GOPS
|
||||
@@ -424,4 +428,5 @@ def train_cifar():
|
||||
raise ValueError(colored(f"{eval_acc_pct=} < {target}", "red"))
|
||||
|
||||
if __name__ == "__main__":
|
||||
train_cifar()
|
||||
with WallTimeEvent(BenchEvent.FULL):
|
||||
train_cifar()
|
||||
|
||||
@@ -13,6 +13,7 @@ from extra.models.llama import Transformer, convert_from_huggingface, fix_bf16
|
||||
from sentencepiece import SentencePieceProcessor
|
||||
import tiktoken, sys
|
||||
from tiktoken.load import load_tiktoken_bpe
|
||||
from extra.bench_log import BenchEvent, WallTimeEvent
|
||||
|
||||
MAX_CONTEXT = getenv("MAX_CONTEXT", 4096)
|
||||
|
||||
@@ -206,42 +207,43 @@ class LLaMa:
|
||||
|
||||
model = Transformer(**params["args"], linear=linear, max_context=MAX_CONTEXT, jit=bool(JIT))
|
||||
|
||||
if model_path.is_dir():
|
||||
weights = concat_weights([load(filename) for filename in [f"{model_path}/consolidated.{i:02d}.pth" for i in range(params["files"])]], device[0] if isinstance(device, tuple) else device)
|
||||
else:
|
||||
weights = load(str(model_path))
|
||||
if "model.embed_tokens.weight" in weights:
|
||||
weights = convert_from_huggingface(weights, params["args"]["n_layers"], params["args"]["n_heads"], params["args"].get("n_kv_heads", params["args"]["n_heads"]))
|
||||
with WallTimeEvent(BenchEvent.LOAD_WEIGHTS):
|
||||
if model_path.is_dir():
|
||||
weights = concat_weights([load(filename) for filename in [f"{model_path}/consolidated.{i:02d}.pth" for i in range(params["files"])]], device[0] if isinstance(device, tuple) else device)
|
||||
else:
|
||||
weights = load(str(model_path))
|
||||
if "model.embed_tokens.weight" in weights:
|
||||
weights = convert_from_huggingface(weights, params["args"]["n_layers"], params["args"]["n_heads"], params["args"].get("n_kv_heads", params["args"]["n_heads"]))
|
||||
|
||||
weights = fix_bf16(weights)
|
||||
weights = fix_bf16(weights)
|
||||
|
||||
# prevent tracking model weights
|
||||
# this is a part of a larger problem with BUFFER UOps and gc in TRACK_MATCH_STATS=2
|
||||
with Context(BEAM=0, TRACK_MATCH_STATS=0):
|
||||
# quantize
|
||||
if quantize is not None:
|
||||
weights = linear.quantize(weights, device)
|
||||
for _,v in weights.items(): v.realize()
|
||||
# prevent tracking model weights
|
||||
# this is a part of a larger problem with BUFFER UOps and gc in TRACK_MATCH_STATS=2
|
||||
with Context(BEAM=0, TRACK_MATCH_STATS=0):
|
||||
# quantize
|
||||
if quantize is not None:
|
||||
weights = linear.quantize(weights, device)
|
||||
for _,v in weights.items(): v.realize()
|
||||
|
||||
# shard
|
||||
if isinstance(device, tuple):
|
||||
for k,v in nn.state.get_state_dict(model).items():
|
||||
if 'scale' in k: v.shard_(device, axis=None) # from quantized
|
||||
elif '.attention.' in k:
|
||||
if getenv("SHARD_KVCACHE") and ('.wq.' in k or '.wk.' in k or '.wv.' in k): v.shard_(device, axis=0)
|
||||
else: v.shard_(device, axis=-1)
|
||||
elif '.feed_forward.w1.' in k: v.shard_(device, axis=0)
|
||||
elif '.feed_forward.w3.' in k: v.shard_(device, axis=0)
|
||||
elif '.feed_forward.' in k: v.shard_(device, axis=-1)
|
||||
elif 'tok_embeddings.weight' in k: v.shard_(device, axis=0)
|
||||
elif 'output.weight' in k: v.shard_(device, axis=-1)
|
||||
#elif k.endswith('.weight'): v.shard_(device, axis=-1)
|
||||
#elif 'norm.' in k: v.shard_(device, axis=-1)
|
||||
else: v.shard_(device, axis=None)
|
||||
#print(k, v.shape, v.lazydata.axis)
|
||||
# shard
|
||||
if isinstance(device, tuple):
|
||||
for k,v in nn.state.get_state_dict(model).items():
|
||||
if 'scale' in k: v.shard_(device, axis=None) # from quantized
|
||||
elif '.attention.' in k:
|
||||
if getenv("SHARD_KVCACHE") and ('.wq.' in k or '.wk.' in k or '.wv.' in k): v.shard_(device, axis=0)
|
||||
else: v.shard_(device, axis=-1)
|
||||
elif '.feed_forward.w1.' in k: v.shard_(device, axis=0)
|
||||
elif '.feed_forward.w3.' in k: v.shard_(device, axis=0)
|
||||
elif '.feed_forward.' in k: v.shard_(device, axis=-1)
|
||||
elif 'tok_embeddings.weight' in k: v.shard_(device, axis=0)
|
||||
elif 'output.weight' in k: v.shard_(device, axis=-1)
|
||||
#elif k.endswith('.weight'): v.shard_(device, axis=-1)
|
||||
#elif 'norm.' in k: v.shard_(device, axis=-1)
|
||||
else: v.shard_(device, axis=None)
|
||||
#print(k, v.shape, v.lazydata.axis)
|
||||
|
||||
# replace weights in model
|
||||
load_state_dict(model, weights, strict=False, consume=True)
|
||||
# replace weights in model
|
||||
load_state_dict(model, weights, strict=False, consume=True)
|
||||
|
||||
return LLaMa(model, tokenizer)
|
||||
|
||||
@@ -477,11 +479,12 @@ After you are done speaking, output [EOS]. You are not Chad.
|
||||
next_tok = Tensor([toks[start_pos:]], device=device) if tok_tensor is None or (len(toks)-start_pos) > 1 else tok_tensor.reshape(1, 1)
|
||||
with Profiling(enabled=args.profile):
|
||||
with Timing("total ", enabled=args.timing, on_exit=lambda x: f", {1e9/x:.2f} tok/s, {GlobalCounters.global_mem/x:.2f} GB/s, param {param_bytes/x:.2f} GB/s"):
|
||||
with Timing("enqueue in ", on_exit=(lambda et: (f", {(GlobalCounters.time_sum_s-st)*1e3:.2f} ms on GPU" if DEBUG>=2 else "")+
|
||||
f", {GlobalCounters.global_ops*1e-9:.2f} GOPS, {GlobalCounters.global_mem*1e-9:.2f} GB"+
|
||||
(f", {GlobalCounters.global_mem*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s, param {param_bytes*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s" if DEBUG>=2 else "")) if DEBUG else None, enabled=args.timing):
|
||||
tok_tensor = llama.model(next_tok, start_pos, args.temperature)
|
||||
tok = tok_tensor.item()
|
||||
with WallTimeEvent(BenchEvent.STEP):
|
||||
with Timing("enqueue in ", on_exit=(lambda et: (f", {(GlobalCounters.time_sum_s-st)*1e3:.2f} ms on GPU" if DEBUG>=2 else "")+
|
||||
f", {GlobalCounters.global_ops*1e-9:.2f} GOPS, {GlobalCounters.global_mem*1e-9:.2f} GB"+
|
||||
(f", {GlobalCounters.global_mem*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s, param {param_bytes*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s" if DEBUG>=2 else "")) if DEBUG else None, enabled=args.timing):
|
||||
tok_tensor = llama.model(next_tok, start_pos, args.temperature)
|
||||
tok = tok_tensor.item()
|
||||
|
||||
# use the kv cache
|
||||
start_pos = len(toks)
|
||||
|
||||
@@ -7,6 +7,7 @@ from extra.models.llama import Transformer, convert_from_huggingface, convert_fr
|
||||
from tinygrad.nn.state import safe_load, torch_load, load_state_dict, get_parameters, gguf_load
|
||||
from tinygrad import Tensor, dtypes, nn, Context, Device, GlobalCounters
|
||||
from tinygrad.helpers import Profiling, Timing, DEBUG, colored, fetch, tqdm
|
||||
from extra.bench_log import BenchEvent, WallTimeEvent
|
||||
|
||||
class Tokenizer:
|
||||
pat_str = r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"
|
||||
@@ -166,40 +167,42 @@ def build_transformer(model_path: Path, model_size="8B", quantize=None, scale_dt
|
||||
model = Transformer(**MODEL_PARAMS[model_size]["args"], linear=linear, embedding=embedding, max_context=max_context, jit=True)
|
||||
|
||||
if not load_weights: return model
|
||||
|
||||
# load weights
|
||||
if model_path.is_dir():
|
||||
if (model_path / "model.safetensors.index.json").exists(): weights = load(str(model_path / "model.safetensors.index.json"))
|
||||
elif (model_path / "model.safetensors").exists(): weights = load(str(model_path / "model.safetensors"))
|
||||
else: weights = concat_weights([load(str(model_path / f"consolidated.{i:02d}.pth")) for i in range(MODEL_PARAMS[model_size]["files"])], device[0] if isinstance(device, tuple) else device)
|
||||
else:
|
||||
weights = load(str(model_path))
|
||||
if "model.embed_tokens.weight" in weights:
|
||||
weights = convert_from_huggingface(weights, MODEL_PARAMS[model_size]["args"]["n_layers"], MODEL_PARAMS[model_size]["args"]["n_heads"], MODEL_PARAMS[model_size]["args"]["n_kv_heads"])
|
||||
elif "token_embd.weight" in weights:
|
||||
weights = convert_from_gguf(weights, MODEL_PARAMS[model_size]["args"]["n_layers"])
|
||||
weights = fix_bf16(weights)
|
||||
with WallTimeEvent(BenchEvent.LOAD_WEIGHTS):
|
||||
if model_path.is_dir():
|
||||
if (model_path / "model.safetensors.index.json").exists(): weights = load(str(model_path / "model.safetensors.index.json"))
|
||||
elif (model_path / "model.safetensors").exists(): weights = load(str(model_path / "model.safetensors"))
|
||||
else: weights = concat_weights([load(str(model_path / f"consolidated.{i:02d}.pth")) for i in range(MODEL_PARAMS[model_size]["files"])], device[0] if isinstance(device, tuple) else device)
|
||||
else:
|
||||
weights = load(str(model_path))
|
||||
if "model.embed_tokens.weight" in weights:
|
||||
weights = convert_from_huggingface(weights, MODEL_PARAMS[model_size]["args"]["n_layers"], MODEL_PARAMS[model_size]["args"]["n_heads"], MODEL_PARAMS[model_size]["args"]["n_kv_heads"])
|
||||
elif "token_embd.weight" in weights:
|
||||
weights = convert_from_gguf(weights, MODEL_PARAMS[model_size]["args"]["n_layers"])
|
||||
weights = fix_bf16(weights)
|
||||
|
||||
with Context(BEAM=0):
|
||||
# quantize
|
||||
if quantize == "float16": weights = {k:v.cast(quantize).contiguous() for k,v in weights.items()}
|
||||
elif quantize is not None:
|
||||
weights = linear.quantize(weights, device, scale_dtype, quantize_embeds)
|
||||
for _,v in weights.items(): v.realize()
|
||||
with Context(BEAM=0):
|
||||
# quantize
|
||||
if quantize == "float16": weights = {k:v.cast(quantize).contiguous() for k,v in weights.items()}
|
||||
elif quantize is not None:
|
||||
weights = linear.quantize(weights, device, scale_dtype, quantize_embeds)
|
||||
for _,v in weights.items(): v.realize()
|
||||
|
||||
# shard
|
||||
if isinstance(device, tuple):
|
||||
for k,v in nn.state.get_state_dict(model).items():
|
||||
if 'scale' in k: v.shard_(device, axis=None) # from quantized
|
||||
elif '.attention.' in k: v.shard_(device, axis=-1)
|
||||
elif '.feed_forward.w1.' in k: v.shard_(device, axis=0)
|
||||
elif '.feed_forward.w3.' in k: v.shard_(device, axis=0)
|
||||
elif '.feed_forward.' in k: v.shard_(device, axis=-1)
|
||||
elif 'tok_embeddings.weight' in k: v.shard_(device, axis=0)
|
||||
elif 'output.weight' in k: v.shard_(device, axis=0)
|
||||
else: v.shard_(device, axis=None)
|
||||
# shard
|
||||
if isinstance(device, tuple):
|
||||
for k,v in nn.state.get_state_dict(model).items():
|
||||
if 'scale' in k: v.shard_(device, axis=None) # from quantized
|
||||
elif '.attention.' in k: v.shard_(device, axis=-1)
|
||||
elif '.feed_forward.w1.' in k: v.shard_(device, axis=0)
|
||||
elif '.feed_forward.w3.' in k: v.shard_(device, axis=0)
|
||||
elif '.feed_forward.' in k: v.shard_(device, axis=-1)
|
||||
elif 'tok_embeddings.weight' in k: v.shard_(device, axis=0)
|
||||
elif 'output.weight' in k: v.shard_(device, axis=0)
|
||||
else: v.shard_(device, axis=None)
|
||||
|
||||
# replace weights in model
|
||||
load_state_dict(model, weights, strict=False, consume=True)
|
||||
# replace weights in model
|
||||
load_state_dict(model, weights, strict=False, consume=True)
|
||||
return model
|
||||
|
||||
# default settings
|
||||
@@ -435,11 +438,12 @@ if __name__ == "__main__":
|
||||
st = GlobalCounters.time_sum_s
|
||||
with Profiling(enabled=args.profile):
|
||||
with Timing("total ", on_exit=lambda x: f", {1e9/x:.2f} tok/s, {GlobalCounters.global_mem/x:.2f} GB/s, param {param_bytes/x:.2f} GB/s"):
|
||||
with Timing("enqueue in ", on_exit=(lambda et: (f", {(GlobalCounters.time_sum_s-st)*1e3:.2f} ms on GPU" if DEBUG>=2 else "")+
|
||||
f", {GlobalCounters.global_ops*1e-9:.2f} GOPS, {GlobalCounters.global_mem*1e-9:.2f} GB"+
|
||||
(f", {GlobalCounters.global_mem*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s, param {param_bytes*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s" if DEBUG>=2 else "")) if DEBUG else None):
|
||||
tok = model(Tensor([[last_tok]], device=device), start_pos, TEMPERATURE, TOP_K, TOP_P, ALPHA_F, ALPHA_P)
|
||||
tok = tok.item()
|
||||
with WallTimeEvent(BenchEvent.STEP):
|
||||
with Timing("enqueue in ", on_exit=(lambda et: (f", {(GlobalCounters.time_sum_s-st)*1e3:.2f} ms on GPU" if DEBUG>=2 else "")+
|
||||
f", {GlobalCounters.global_ops*1e-9:.2f} GOPS, {GlobalCounters.global_mem*1e-9:.2f} GB"+
|
||||
(f", {GlobalCounters.global_mem*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s, param {param_bytes*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s" if DEBUG>=2 else "")) if DEBUG else None):
|
||||
tok = model(Tensor([[last_tok]], device=device), start_pos, TEMPERATURE, TOP_K, TOP_P, ALPHA_F, ALPHA_P)
|
||||
tok = tok.item()
|
||||
start_pos += 1
|
||||
last_tok = tok
|
||||
generated += tokenizer.decode([tok])
|
||||
|
||||
@@ -3,6 +3,7 @@ from tinygrad import Tensor, nn, Device, GlobalCounters, Variable
|
||||
from tinygrad.helpers import Timing, Profiling, CI, tqdm
|
||||
from tinygrad.nn.state import torch_load, get_state_dict
|
||||
from extra.models.llama import FeedForward, Transformer
|
||||
from extra.bench_log import BenchEvent, WallTimeEvent
|
||||
|
||||
class MixtureFeedForward:
|
||||
def __init__(self, num_experts:int, dim:int, hidden_dim:int, linear=nn.Linear):
|
||||
@@ -30,18 +31,19 @@ if __name__ == "__main__":
|
||||
help="Path to the downloaded weights")
|
||||
args = parser.parse_args()
|
||||
|
||||
state = torch_load(args.weights + "/consolidated.00.pth.b")
|
||||
model = Transformer(n_layers=32, dim=4096, hidden_dim=14336, n_heads=32, n_kv_heads=8, norm_eps=1e-5, vocab_size=32000, feed_forward=functools.partial(MixtureFeedForward, 8), jit=False)
|
||||
model_state_dict = get_state_dict(model)
|
||||
with WallTimeEvent(BenchEvent.LOAD_WEIGHTS):
|
||||
state = torch_load(args.weights + "/consolidated.00.pth.b")
|
||||
model = Transformer(n_layers=32, dim=4096, hidden_dim=14336, n_heads=32, n_kv_heads=8, norm_eps=1e-5, vocab_size=32000, feed_forward=functools.partial(MixtureFeedForward, 8), jit=False)
|
||||
model_state_dict = get_state_dict(model)
|
||||
|
||||
for k in (t := tqdm(state, disable=CI)):
|
||||
if 'feed_forward.experts.' in k:
|
||||
expert_no = int(k.split('feed_forward.experts.')[1].split('.')[0])
|
||||
device = Device.DEFAULT + ":" + str((expert_no//2)+1)
|
||||
else:
|
||||
device = Device.DEFAULT
|
||||
t.set_description(f"ram used: {GlobalCounters.mem_used/1e9:5.2f} GB, loading {k} to {device}")
|
||||
model_state_dict[k].replace(state[k].to(device).half()).realize()
|
||||
for k in (t := tqdm(state, disable=CI)):
|
||||
if 'feed_forward.experts.' in k:
|
||||
expert_no = int(k.split('feed_forward.experts.')[1].split('.')[0])
|
||||
device = Device.DEFAULT + ":" + str((expert_no//2)+1)
|
||||
else:
|
||||
device = Device.DEFAULT
|
||||
t.set_description(f"ram used: {GlobalCounters.mem_used/1e9:5.2f} GB, loading {k} to {device}")
|
||||
model_state_dict[k].replace(state[k].to(device).half()).realize()
|
||||
if CI: print(f"ram used: {GlobalCounters.mem_used/1e9:5.2f} GB")
|
||||
|
||||
from sentencepiece import SentencePieceProcessor
|
||||
@@ -53,7 +55,8 @@ if __name__ == "__main__":
|
||||
GlobalCounters.reset()
|
||||
with Profiling(sort="time", frac=0.1, enabled=args.profile):
|
||||
with Timing("total ", enabled=args.timing, on_exit=lambda x: f", {1e9/x:.2f} tok/sec"):
|
||||
tok = model(Tensor([toks[start_pos:]]), 0 if start_pos == 0 else Variable("start_pos", 1, 1024).bind(start_pos), args.temperature).item()
|
||||
with WallTimeEvent(BenchEvent.STEP):
|
||||
tok = model(Tensor([toks[start_pos:]]), 0 if start_pos == 0 else Variable("start_pos", 1, 1024).bind(start_pos), args.temperature).item()
|
||||
toks.append(tok)
|
||||
start_pos += 1
|
||||
print(spp.decode(toks))
|
||||
|
||||
@@ -5,61 +5,63 @@ import numpy as np
|
||||
from tinygrad import Tensor, Device, dtypes, GlobalCounters, TinyJit
|
||||
from tinygrad.nn.state import get_parameters, load_state_dict, safe_load
|
||||
from tinygrad.helpers import getenv
|
||||
from extra.bench_log import BenchEvent, WallTimeEvent
|
||||
def tlog(x):
  """Print the label `x` padded to 25 columns, followed by the seconds elapsed since `start`.

  `start` is not defined in this block; presumably it is a module-level
  `time.perf_counter()` timestamp taken at import time -- TODO confirm.
  """
  print(f"{x:25s} @ {time.perf_counter()-start:5.2f}s")
|
||||
|
||||
def eval_resnet():
|
||||
Tensor.no_grad = True
|
||||
# Resnet50-v1.5
|
||||
from extra.models.resnet import ResNet50
|
||||
tlog("imports")
|
||||
GPUS = [f'{Device.DEFAULT}:{i}' for i in range(getenv("GPUS", 6))]
|
||||
for x in GPUS: Device[x]
|
||||
tlog("got devices") # NOTE: this is faster with rocm-smi running
|
||||
with WallTimeEvent(BenchEvent.FULL):
|
||||
# Resnet50-v1.5
|
||||
from extra.models.resnet import ResNet50
|
||||
tlog("imports")
|
||||
GPUS = [f'{Device.DEFAULT}:{i}' for i in range(getenv("GPUS", 6))]
|
||||
for x in GPUS: Device[x]
|
||||
tlog("got devices") # NOTE: this is faster with rocm-smi running
|
||||
|
||||
class ResnetRunner:
|
||||
def __init__(self, device=None):
|
||||
self.mdl = ResNet50()
|
||||
for x in get_parameters(self.mdl) if device else []: x.to_(device)
|
||||
if (fn:=getenv("RESNET_MODEL", "")): load_state_dict(self.mdl, safe_load(fn))
|
||||
else: self.mdl.load_from_pretrained()
|
||||
self.input_mean = Tensor([0.485, 0.456, 0.406], device=device).reshape(1, -1, 1, 1)
|
||||
self.input_std = Tensor([0.229, 0.224, 0.225], device=device).reshape(1, -1, 1, 1)
|
||||
def __call__(self, x:Tensor) -> Tensor:
|
||||
x = x.permute([0,3,1,2]).cast(dtypes.float32) / 255.0
|
||||
x -= self.input_mean
|
||||
x /= self.input_std
|
||||
return self.mdl(x).log_softmax().argmax(axis=1).realize()
|
||||
class ResnetRunner:
|
||||
def __init__(self, device=None):
|
||||
self.mdl = ResNet50()
|
||||
for x in get_parameters(self.mdl) if device else []: x.to_(device)
|
||||
if (fn:=getenv("RESNET_MODEL", "")): load_state_dict(self.mdl, safe_load(fn))
|
||||
else: self.mdl.load_from_pretrained()
|
||||
self.input_mean = Tensor([0.485, 0.456, 0.406], device=device).reshape(1, -1, 1, 1)
|
||||
self.input_std = Tensor([0.229, 0.224, 0.225], device=device).reshape(1, -1, 1, 1)
|
||||
def __call__(self, x:Tensor) -> Tensor:
|
||||
x = x.permute([0,3,1,2]).cast(dtypes.float32) / 255.0
|
||||
x -= self.input_mean
|
||||
x /= self.input_std
|
||||
return self.mdl(x).log_softmax().argmax(axis=1).realize()
|
||||
|
||||
mdl = TinyJit(ResnetRunner(GPUS))
|
||||
tlog("loaded models")
|
||||
mdl = TinyJit(ResnetRunner(GPUS))
|
||||
tlog("loaded models")
|
||||
|
||||
# evaluation on the mlperf classes of the validation set from imagenet
|
||||
from examples.mlperf.dataloader import batch_load_resnet
|
||||
iterator = batch_load_resnet(getenv("BS", 128*6), val=getenv("VAL", 1), shuffle=False, pad_first_batch=True)
|
||||
def data_get():
|
||||
x,y,cookie = next(iterator)
|
||||
return x.shard(GPUS, axis=0).realize(), y, cookie
|
||||
n,d = 0,0
|
||||
proc = data_get()
|
||||
tlog("loaded initial data")
|
||||
st = time.perf_counter()
|
||||
while proc is not None:
|
||||
GlobalCounters.reset()
|
||||
proc = (mdl(proc[0]), proc[1], proc[2]) # this frees the images
|
||||
run = time.perf_counter()
|
||||
# load the next data here
|
||||
try: next_proc = data_get()
|
||||
except StopIteration: next_proc = None
|
||||
nd = time.perf_counter()
|
||||
y = np.array(proc[1])
|
||||
proc = (proc[0].numpy() == y) & (y != -1) # this realizes the models and frees the cookies
|
||||
n += proc.sum()
|
||||
d += (y != -1).sum()
|
||||
et = time.perf_counter()
|
||||
tlog(f"****** {n:5d}/{d:5d} {n*100.0/d:.2f}% -- {(run-st)*1000:7.2f} ms to enqueue, {(et-run)*1000:7.2f} ms to realize ({(nd-run)*1000:7.2f} ms fetching). {(len(proc))/(et-st):8.2f} examples/sec. {GlobalCounters.global_ops*1e-12/(et-st):5.2f} TFLOPS")
|
||||
st = et
|
||||
proc, next_proc = next_proc, None
|
||||
tlog("done")
|
||||
# evaluation on the mlperf classes of the validation set from imagenet
|
||||
from examples.mlperf.dataloader import batch_load_resnet
|
||||
iterator = batch_load_resnet(getenv("BS", 128*6), val=getenv("VAL", 1), shuffle=False, pad_first_batch=True)
|
||||
def data_get():
|
||||
x,y,cookie = next(iterator)
|
||||
return x.shard(GPUS, axis=0).realize(), y, cookie
|
||||
n,d = 0,0
|
||||
proc = data_get()
|
||||
tlog("loaded initial data")
|
||||
st = time.perf_counter()
|
||||
while proc is not None:
|
||||
GlobalCounters.reset()
|
||||
proc = (mdl(proc[0]), proc[1], proc[2]) # this frees the images
|
||||
run = time.perf_counter()
|
||||
# load the next data here
|
||||
try: next_proc = data_get()
|
||||
except StopIteration: next_proc = None
|
||||
nd = time.perf_counter()
|
||||
y = np.array(proc[1])
|
||||
proc = (proc[0].numpy() == y) & (y != -1) # this realizes the models and frees the cookies
|
||||
n += proc.sum()
|
||||
d += (y != -1).sum()
|
||||
et = time.perf_counter()
|
||||
tlog(f"****** {n:5d}/{d:5d} {n*100.0/d:.2f}% -- {(run-st)*1000:7.2f} ms to enqueue, {(et-run)*1000:7.2f} ms to realize ({(nd-run)*1000:7.2f} ms fetching). {(len(proc))/(et-st):8.2f} examples/sec. {GlobalCounters.global_ops*1e-12/(et-st):5.2f} TFLOPS")
|
||||
st = et
|
||||
proc, next_proc = next_proc, None
|
||||
tlog("done")
|
||||
|
||||
def eval_unet3d():
|
||||
# UNet3D
|
||||
|
||||
@@ -9,6 +9,7 @@ from tinygrad.nn.optim import LAMB, LARS, SGD, OptimizerGroup, Adam
|
||||
|
||||
from extra.lr_scheduler import LRSchedulerGroup
|
||||
from examples.mlperf.helpers import get_training_state, load_training_state
|
||||
from extra.bench_log import BenchEvent, WallTimeEvent
|
||||
# TODO: fix benchmark logging and use tinygrad tqdm
|
||||
from tqdm import tqdm
|
||||
|
||||
@@ -205,24 +206,25 @@ def train_resnet():
|
||||
st = time.perf_counter()
|
||||
while proc is not None:
|
||||
GlobalCounters.reset()
|
||||
(loss, top_1), y, proc = train_step(proc[0], proc[1]), proc[2], proc[3]
|
||||
with WallTimeEvent(BenchEvent.STEP):
|
||||
(loss, top_1), y, proc = train_step(proc[0], proc[1]), proc[2], proc[3]
|
||||
|
||||
pt = time.perf_counter()
|
||||
pt = time.perf_counter()
|
||||
|
||||
if len(prev_cookies) == getenv("STORE_COOKIES", 1): prev_cookies = [] # free previous cookies after gpu work has been enqueued
|
||||
try:
|
||||
if INITMLPERF:
|
||||
next_proc = fake_data_get(BS)
|
||||
else:
|
||||
next_proc = data_get(it)
|
||||
except StopIteration:
|
||||
next_proc = None
|
||||
if len(prev_cookies) == getenv("STORE_COOKIES", 1): prev_cookies = [] # free previous cookies after gpu work has been enqueued
|
||||
try:
|
||||
if INITMLPERF:
|
||||
next_proc = fake_data_get(BS)
|
||||
else:
|
||||
next_proc = data_get(it)
|
||||
except StopIteration:
|
||||
next_proc = None
|
||||
|
||||
dt = time.perf_counter()
|
||||
dt = time.perf_counter()
|
||||
|
||||
device_str = loss.device if isinstance(loss.device, str) else f"{loss.device[0]} * {len(loss.device)}"
|
||||
loss, top_1 = loss.numpy().item(), top_1.numpy().item()
|
||||
top_1_acc = top_1 / sum(yi != -1 for yi in y)
|
||||
device_str = loss.device if isinstance(loss.device, str) else f"{loss.device[0]} * {len(loss.device)}"
|
||||
loss, top_1 = loss.numpy().item(), top_1.numpy().item()
|
||||
top_1_acc = top_1 / sum(yi != -1 for yi in y)
|
||||
|
||||
cl = time.perf_counter()
|
||||
if BENCHMARK:
|
||||
@@ -1124,23 +1126,24 @@ def train_bert():
|
||||
BEAM.value = TRAIN_BEAM
|
||||
st = time.perf_counter()
|
||||
GlobalCounters.reset()
|
||||
loss, global_norm, lr = train_step_bert(model, optimizer_group, scheduler_group, loss_scaler,
|
||||
train_data["input_ids"], train_data["segment_ids"], train_data["input_mask"], train_data["masked_lm_positions"], \
|
||||
train_data["masked_lm_ids"], train_data["masked_lm_weights"], train_data["next_sentence_labels"], GPUS)
|
||||
with WallTimeEvent(BenchEvent.STEP):
|
||||
loss, global_norm, lr = train_step_bert(model, optimizer_group, scheduler_group, loss_scaler,
|
||||
train_data["input_ids"], train_data["segment_ids"], train_data["input_mask"], train_data["masked_lm_positions"], \
|
||||
train_data["masked_lm_ids"], train_data["masked_lm_weights"], train_data["next_sentence_labels"], GPUS)
|
||||
|
||||
pt = time.perf_counter()
|
||||
pt = time.perf_counter()
|
||||
|
||||
try:
|
||||
next_data = next(train_it)
|
||||
except StopIteration:
|
||||
next_data = None
|
||||
try:
|
||||
next_data = next(train_it)
|
||||
except StopIteration:
|
||||
next_data = None
|
||||
|
||||
dt = time.perf_counter()
|
||||
dt = time.perf_counter()
|
||||
|
||||
device_str = parameters[0].device if isinstance(parameters[0].device, str) else f"{parameters[0].device[0]} * {len(parameters[0].device)}"
|
||||
loss = loss.item()
|
||||
assert not math.isnan(loss)
|
||||
lr = lr.item()
|
||||
device_str = parameters[0].device if isinstance(parameters[0].device, str) else f"{parameters[0].device[0]} * {len(parameters[0].device)}"
|
||||
loss = loss.item()
|
||||
assert not math.isnan(loss)
|
||||
lr = lr.item()
|
||||
|
||||
cl = time.perf_counter()
|
||||
if BENCHMARK: step_times.append(cl - st)
|
||||
|
||||
@@ -5,6 +5,7 @@ from examples.stable_diffusion import AutoencoderKL, get_alphas_cumprod
|
||||
from examples.sdxl import DPMPP2MSampler, append_dims, LegacyDDPMDiscretization
|
||||
from extra.models.unet import UNetModel
|
||||
from extra.models.clip import FrozenOpenClipEmbedder
|
||||
from extra.bench_log import BenchEvent, WallTimeEvent
|
||||
|
||||
from typing import Dict
|
||||
import argparse, tempfile, os
|
||||
@@ -117,12 +118,14 @@ if __name__ == "__main__":
|
||||
if not weights_fn:
|
||||
weights_url = args.weights_url if args.weights_url else default_weights_url
|
||||
weights_fn = fetch(weights_url, os.path.basename(str(weights_url)))
|
||||
load_state_dict(model, safe_load(weights_fn), strict=False)
|
||||
|
||||
if args.fp16:
|
||||
for k,v in get_state_dict(model).items():
|
||||
if k.startswith("model"):
|
||||
v.replace(v.cast(dtypes.float16).realize())
|
||||
with WallTimeEvent(BenchEvent.LOAD_WEIGHTS):
|
||||
load_state_dict(model, safe_load(weights_fn), strict=False)
|
||||
|
||||
if args.fp16:
|
||||
for k,v in get_state_dict(model).items():
|
||||
if k.startswith("model"):
|
||||
v.replace(v.cast(dtypes.float16).realize())
|
||||
|
||||
c = { "crossattn": model.cond_stage_model(args.prompt) }
|
||||
uc = { "crossattn": model.cond_stage_model("") }
|
||||
|
||||
@@ -9,6 +9,7 @@ from tinygrad.nn.state import safe_load, load_state_dict, get_state_dict
|
||||
from tinygrad.helpers import fetch, trange, colored, Timing
|
||||
from extra.models.clip import Embedder, FrozenClosedClipEmbedder, FrozenOpenClipEmbedder
|
||||
from extra.models.unet import UNetModel, Upsample, Downsample, timestep_embedding
|
||||
from extra.bench_log import BenchEvent, WallTimeEvent
|
||||
from examples.stable_diffusion import ResnetBlock, Mid
|
||||
import numpy as np
|
||||
|
||||
@@ -346,17 +347,18 @@ class DPMPP2MSampler:
|
||||
for i in trange(num_sigmas - 1):
|
||||
with Timing("step in ", enabled=timing, on_exit=lambda _: f", using {GlobalCounters.mem_used/1e9:.2f} GB"):
|
||||
GlobalCounters.reset()
|
||||
x, old_denoised = self.sampler_step(
|
||||
old_denoised=old_denoised,
|
||||
prev_sigma=(None if i==0 else sigmas[i-1].expand(x.shape[0])),
|
||||
sigma=sigmas[i].expand(x.shape[0]),
|
||||
next_sigma=sigmas[i+1].expand(x.shape[0]),
|
||||
denoiser=denoiser,
|
||||
x=x,
|
||||
c=c,
|
||||
uc=uc,
|
||||
)
|
||||
x.realize(old_denoised)
|
||||
with WallTimeEvent(BenchEvent.STEP):
|
||||
x, old_denoised = self.sampler_step(
|
||||
old_denoised=old_denoised,
|
||||
prev_sigma=(None if i==0 else sigmas[i-1].expand(x.shape[0])),
|
||||
sigma=sigmas[i].expand(x.shape[0]),
|
||||
next_sigma=sigmas[i+1].expand(x.shape[0]),
|
||||
denoiser=denoiser,
|
||||
x=x,
|
||||
c=c,
|
||||
uc=uc,
|
||||
)
|
||||
x.realize(old_denoised)
|
||||
|
||||
return x
|
||||
|
||||
@@ -388,7 +390,8 @@ if __name__ == "__main__":
|
||||
|
||||
start_mem_used = GlobalCounters.mem_used
|
||||
with Timing("loaded weights in ", lambda et_ns: f", {(B:=(GlobalCounters.mem_used-start_mem_used))/1e9:.2f} GB loaded at {B/et_ns:.2f} GB/s"):
|
||||
Tensor.realize(*loaded_weights)
|
||||
with WallTimeEvent(BenchEvent.LOAD_WEIGHTS):
|
||||
Tensor.realize(*loaded_weights)
|
||||
del loaded_weights
|
||||
|
||||
N = 1
|
||||
|
||||
@@ -14,6 +14,7 @@ from tinygrad.nn import Conv2d, GroupNorm
|
||||
from tinygrad.nn.state import torch_load, load_state_dict, get_state_dict
|
||||
from extra.models.clip import Closed, Tokenizer
|
||||
from extra.models.unet import UNetModel
|
||||
from extra.bench_log import BenchEvent, WallTimeEvent
|
||||
|
||||
class AttnBlock:
|
||||
def __init__(self, in_channels):
|
||||
@@ -232,12 +233,13 @@ if __name__ == "__main__":
|
||||
model = StableDiffusion()
|
||||
|
||||
# load in weights
|
||||
load_state_dict(model, torch_load(fetch('https://huggingface.co/CompVis/stable-diffusion-v-1-4-original/resolve/main/sd-v1-4.ckpt', 'sd-v1-4.ckpt'))['state_dict'], strict=False)
|
||||
with WallTimeEvent(BenchEvent.LOAD_WEIGHTS):
|
||||
load_state_dict(model, torch_load(fetch('https://huggingface.co/CompVis/stable-diffusion-v-1-4-original/resolve/main/sd-v1-4.ckpt', 'sd-v1-4.ckpt'))['state_dict'], strict=False)
|
||||
|
||||
if args.fp16:
|
||||
for k,v in get_state_dict(model).items():
|
||||
if k.startswith("model"):
|
||||
v.replace(v.cast(dtypes.float16).realize())
|
||||
if args.fp16:
|
||||
for k,v in get_state_dict(model).items():
|
||||
if k.startswith("model"):
|
||||
v.replace(v.cast(dtypes.float16).realize())
|
||||
|
||||
# run through CLIP to get context
|
||||
tokenizer = Tokenizer.ClipTokenizer()
|
||||
@@ -270,9 +272,10 @@ if __name__ == "__main__":
|
||||
GlobalCounters.reset()
|
||||
t.set_description("%3d %3d" % (index, timestep))
|
||||
with Timing("step in ", enabled=args.timing, on_exit=lambda _: f", using {GlobalCounters.mem_used/1e9:.2f} GB"):
|
||||
tid = Tensor([index])
|
||||
latent = run(model, unconditional_context, context, latent, Tensor([timestep]), alphas[tid], alphas_prev[tid], Tensor([args.guidance]))
|
||||
if args.timing: Device[Device.DEFAULT].synchronize()
|
||||
with WallTimeEvent(BenchEvent.STEP):
|
||||
tid = Tensor([index])
|
||||
latent = run(model, unconditional_context, context, latent, Tensor([timestep]), alphas[tid], alphas_prev[tid], Tensor([args.guidance]))
|
||||
if args.timing: Device[Device.DEFAULT].synchronize()
|
||||
del run
|
||||
|
||||
# upsample latent space to image with autoencoder
|
||||
|
||||
108
extra/bench_log.py
Normal file
108
extra/bench_log.py
Normal file
@@ -0,0 +1,108 @@
|
||||
import time, atexit, uuid
|
||||
from enum import Enum
|
||||
|
||||
from tinygrad.device import Device
|
||||
from tinygrad.helpers import DEBUG, ContextVar, getenv, GlobalCounters
|
||||
|
||||
BENCHMARK_LOG = ContextVar("BENCHMARK_LOG", "")
|
||||
|
||||
if BENCHMARK_LOG:
|
||||
from influxdb_client_3 import InfluxDBClient3, Point, WriteOptions, write_client_options
|
||||
from influxdb_client_3.write_client.client.write_api import WriteType
|
||||
|
||||
class BenchEvent(Enum):
  """Benchmark phases whose duration is recorded.

  Each member gets a "wall" and a "kernel" list of samples in the event store,
  and its value is used as the prefix of the logged field name
  (for example "step_wall").
  """
  LOAD_WEIGHTS = "load_weights"
  STEP = "step"
  FULL = "full"
|
||||
class InstantBenchEvent(Enum):
  """Benchmark metrics logged as single values rather than as timed spans.

  Each member gets one flat list of samples in the event store, and its value
  is used directly as the logged field name.
  """
  GFLOPS = "gflops"
|
||||
|
||||
# In-memory store of everything recorded during this process, keyed by enum member:
#   BenchEvent        -> {"wall": [samples...], "kernel": [samples...]}
#   InstantBenchEvent -> [values...]
_events = {}

def clear_events():
  """Reset the event store so that every known event maps to empty sample lists.

  The `_events` dict is updated in place (it is never rebound), so references
  to it held elsewhere keep seeing the current state.
  """
  for event in BenchEvent:
    _events[event] = {"wall": [], "kernel": []}
  for event in InstantBenchEvent:
    _events[event] = []

# populate the store once at import time so recorders can append without checking for keys
clear_events()
|
||||
|
||||
class WallTimeEvent:
  """Context manager that records how long its `with` body took in wall-clock time.

  On exit, the elapsed `time.monotonic()` seconds are appended to the "wall"
  samples of `event` in the module-level `_events` store.
  """
  def __init__(self, event:BenchEvent):
    self.event = event

  def __enter__(self):
    self.start = time.monotonic()
    return self

  def __exit__(self, *_):
    # the sample is recorded whether or not the body raised
    _events[self.event]["wall"].append(time.monotonic() - self.start)
    # returning False lets any exception from the body propagate
    return False
|
||||
|
||||
class KernelTimeEvent:
  """Context manager that records the growth of `GlobalCounters.time_sum_s` across its body.

  On exit, the difference between the counter at exit and at entry is appended
  to the "kernel" samples of `event` in the module-level `_events` store.

  Raises:
    RuntimeError: if constructed while DEBUG < 2.
  """
  def __init__(self, event:BenchEvent):
    # NOTE(review): presumably time_sum_s is only accumulated when DEBUG >= 2 -- confirm
    if DEBUG < 2:
      # RuntimeError is still caught by `except Exception`, so existing handlers keep working
      raise RuntimeError("KernelTimeEvent should only be used in DEBUG >= 2")
    self.event = event

  def __enter__(self):
    self.start = GlobalCounters.time_sum_s
    return self

  def __exit__(self, *_):
    # the sample is recorded whether or not the body raised
    _events[self.event]["kernel"].append(GlobalCounters.time_sum_s - self.start)
    # returning False lets any exception from the body propagate
    return False
|
||||
|
||||
def log_event_instant(event:InstantBenchEvent, value:float):
  """Append `value` to the sample list stored for the instant event `event`."""
  samples = _events[event]
  samples.append(value)
|
||||
|
||||
if BENCHMARK_LOG:
  # InfluxDB connection settings, read through getenv
  INFLUXDB_HOST = getenv("INFLUXDB_HOST", "")
  INFLUXDB_ORG = getenv("INFLUXDB_ORG", "tiny")
  INFLUXDB_TOKEN = getenv("INFLUXDB_TOKEN", "")

  def _create_point(run_id, i, attempt, ref, commit, name, value, run):
    """Build the Point for one recorded sample.

    The Point is named after BENCHMARK_LOG's value and tagged, in order, with
    id, index, device (Device.DEFAULT), attempt, ref and commit. The sample is
    stored in the field `name`; `run` is stored in the field "x".
    """
    tags = {"id": run_id, "index": i, "device": Device.DEFAULT, "attempt": attempt, "ref": ref, "commit": commit}
    point = Point(BENCHMARK_LOG.value)
    for tag_key, tag_value in tags.items():
      point = point.tag(tag_key, tag_value)
    return point.field(name, value).field("x", run)

  @atexit.register
  def write_events():
    """Upload every recorded sample to the "benchmarks" database at interpreter exit.

    Does nothing when no sample was recorded. Otherwise each sample becomes one
    Point: timed events use the field name "<event value>_<wall|kernel>",
    instant events use the event value itself. All points of one event share a
    freshly generated UUID as their "id" tag.
    """
    def _series(event):
      # timed events hold one sample list per clock, instant events hold a single list
      if isinstance(event, BenchEvent):
        for clock, samples in _events[event].items():
          yield f"{event.value}_{clock}", samples
      else:
        yield event.value, _events[event]

    # see if there are any events to write
    if not any(samples for event in _events for _, samples in _series(event)):
      return

    # pull from github envvars
    ref = getenv("GITHUB_REF_NAME", "")
    commit = getenv("GITHUB_SHA", "")
    run = getenv("GITHUB_RUN_NUMBER", "")
    attempt = getenv("GITHUB_RUN_ATTEMPT", "")

    points = []
    for event in _events:
      run_id = str(uuid.uuid4())
      for field_name, samples in _series(event):
        points.extend(_create_point(run_id, idx, attempt, ref, commit, field_name, sample, run)
                      for idx, sample in enumerate(samples))

    # synchronous writes with the retry settings below
    write_options = WriteOptions(write_type=WriteType.synchronous, retry_interval=5000, max_retries=5,
                                 max_retry_delay=30000, exponential_base=2)
    wco = write_client_options(write_options=write_options)
    client_args = dict(host=INFLUXDB_HOST, org=INFLUXDB_ORG, token=INFLUXDB_TOKEN, auth_scheme="Basic",
                       database="benchmarks", write_client_options=wco)
    with InfluxDBClient3(**client_args) as client:
      client.write(points)
|
||||
3
setup.py
3
setup.py
@@ -73,7 +73,8 @@ setup(name='tinygrad',
|
||||
"capstone",
|
||||
"pycocotools",
|
||||
"boto3",
|
||||
"pandas"
|
||||
"pandas",
|
||||
"influxdb3-python"
|
||||
],
|
||||
'docs': [
|
||||
"mkdocs",
|
||||
|
||||
10
test/external/external_benchmark_openpilot.py
vendored
10
test/external/external_benchmark_openpilot.py
vendored
@@ -7,6 +7,7 @@ from tinygrad import Tensor, dtypes, TinyJit
|
||||
from tinygrad.helpers import IMAGE, GlobalCounters, fetch, colored, getenv, trange
|
||||
from tinygrad.tensor import _from_np_dtype
|
||||
import numpy as np
|
||||
from extra.bench_log import BenchEvent, WallTimeEvent
|
||||
|
||||
OPENPILOT_MODEL = sys.argv[1] if len(sys.argv) > 1 else "https://github.com/commaai/openpilot/raw/v0.9.4/selfdrive/modeld/models/supercombo.onnx"
|
||||
|
||||
@@ -33,10 +34,11 @@ if __name__ == "__main__":
|
||||
for _ in range(20):
|
||||
GlobalCounters.reset()
|
||||
st = time.perf_counter_ns()
|
||||
# Need to cast non-image inputs from numpy, this is only realistic way to run model
|
||||
inputs = {**{k:v for k,v in new_inputs_junk.items() if 'img' in k},
|
||||
**{k:Tensor(v) for k,v in new_inputs_junk_numpy.items() if 'img' not in k}}
|
||||
ret = next(iter(run_onnx_jit(**inputs).values())).cast(dtypes.float32).numpy()
|
||||
with WallTimeEvent(BenchEvent.STEP):
|
||||
# Need to cast non-image inputs from numpy, this is only realistic way to run model
|
||||
inputs = {**{k:v for k,v in new_inputs_junk.items() if 'img' in k},
|
||||
**{k:Tensor(v) for k,v in new_inputs_junk_numpy.items() if 'img' not in k}}
|
||||
ret = next(iter(run_onnx_jit(**inputs).values())).cast(dtypes.float32).numpy()
|
||||
print(f"jitted: {(time.perf_counter_ns() - st)*1e-6:7.4f} ms")
|
||||
|
||||
suffix = ""
|
||||
|
||||
103
test/testextra/test_bench_log.py
Normal file
103
test/testextra/test_bench_log.py
Normal file
@@ -0,0 +1,103 @@
|
||||
import unittest, time
|
||||
|
||||
from extra.bench_log import BenchEvent, InstantBenchEvent, WallTimeEvent, KernelTimeEvent, log_event_instant, _events, clear_events
|
||||
from tinygrad.helpers import Context
|
||||
from tinygrad.tensor import Tensor
|
||||
|
||||
class TestBenchLog(unittest.TestCase):
  """Checks that the bench_log recorders store their samples in the shared `_events` registry."""

  def setUp(self):
    # every test starts from an empty registry
    clear_events()

  def test_log_single_wall_time(self):
    """One WallTimeEvent per event leaves exactly one positive wall sample."""
    for event in BenchEvent:
      with WallTimeEvent(event):
        time.sleep(0.1)

    # check event list
    for event in BenchEvent:
      self.assertEqual(len(_events[event]["wall"]), 1)
      self.assertGreater(_events[event]["wall"][0], 0)

  def test_log_double_wall_time(self):
    """Samples accumulate per event no matter the order the events are timed in."""
    for event in BenchEvent:
      with WallTimeEvent(event):
        time.sleep(0.1)

    for event in reversed(BenchEvent):
      with WallTimeEvent(event):
        time.sleep(0.2)

    # check event list
    for event in BenchEvent:
      self.assertEqual(len(_events[event]["wall"]), 2)
      self.assertGreater(_events[event]["wall"][0], 0)
      self.assertGreater(_events[event]["wall"][1], 0)

  def test_log_single_kernel_time(self):
    """A KernelTimeEvent records a positive time below the wall time of the same section."""
    wall_times = []

    with Context(DEBUG=2):
      for event in BenchEvent:
        with KernelTimeEvent(event):
          st = time.perf_counter()
          Tensor.rand(32, 32).sum().realize().item()
          wall_times.append(time.perf_counter() - st)

    # check event list: pair each event with the wall time measured around it,
    # not with the first event's wall time
    for event, wall_time in zip(BenchEvent, wall_times):
      self.assertEqual(len(_events[event]["kernel"]), 1)
      self.assertLess(_events[event]["kernel"][0], wall_time)
      self.assertGreater(_events[event]["kernel"][0], 0)

  def test_interleaved_wall_kernel_time(self):
    """Kernel and wall recorders used back to back each store their own single sample."""
    kernel_section_wall_times = []
    with Context(DEBUG=2):
      for event in BenchEvent:
        with KernelTimeEvent(event):
          st = time.perf_counter()
          Tensor.rand(32, 32).sum().realize().item()
          kernel_section_wall_times.append(time.perf_counter() - st)

        with WallTimeEvent(event):
          Tensor.rand(32, 32).sum().realize().item()

    # check event list: each kernel sample is bounded by the wall time of its own section
    for event, wall_time in zip(BenchEvent, kernel_section_wall_times):
      self.assertEqual(len(_events[event]["wall"]), 1)
      self.assertEqual(len(_events[event]["kernel"]), 1)
      self.assertLess(_events[event]["kernel"][0], wall_time)
      self.assertGreater(_events[event]["kernel"][0], 0)

  def test_stacked_wall_kernel_time(self):
    """Nesting the two recorders in either order records one sample each per pass."""
    with Context(DEBUG=2):
      for event in BenchEvent:
        with KernelTimeEvent(event):
          with WallTimeEvent(event):
            Tensor.rand(32, 32).sum().realize().item()

      for event in BenchEvent:
        with WallTimeEvent(event):
          with KernelTimeEvent(event):
            Tensor.rand(32, 32).sum().realize().item()

    for event in BenchEvent:
      self.assertEqual(len(_events[event]["wall"]), 2)
      self.assertEqual(len(_events[event]["kernel"]), 2)
      self.assertLess(_events[event]["kernel"][0], _events[event]["wall"][0])
      self.assertGreater(_events[event]["kernel"][0], 0)
      self.assertLess(_events[event]["kernel"][1], _events[event]["wall"][1])
      self.assertGreater(_events[event]["kernel"][1], 0)

  def test_log_instant_event(self):
    """log_event_instant appends the given value unchanged."""
    for event in InstantBenchEvent:
      log_event_instant(event, 1000)

    # check event list
    for event in InstantBenchEvent:
      self.assertEqual(len(_events[event]), 1)
      self.assertEqual(_events[event][0], 1000)
|
||||
|
||||
# run the test suite when this file is executed directly
if __name__ == '__main__':
  unittest.main()
|
||||
Reference in New Issue
Block a user