diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 9c7b59eba4..cf5f1e33b6 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -637,3 +637,127 @@ jobs:
         openpilot_0_9_7.txt
         openpilot_image_0_9_4.txt
         openpilot_image_0_9_7.txt
+
+  testreddriverbenchmark:
+    name: AM Benchmark
+    runs-on: [self-hosted, Linux, tinyboxrandom]
+    timeout-minutes: 10
+    defaults:
+      run:
+        shell: bash -e -o pipefail {0}
+    if: github.repository_owner == 'tinygrad'
+    steps:
+      - name: Checkout Code
+        uses: actions/checkout@v4
+      - name: Remove amdgpu
+        run: sudo rmmod amdgpu || true
+      - name: Cleanup running AM processes
+        run: python extra/amdpci/am_smi.py --pids --kill
+      - name: Symlink models and datasets
+        run: |
+          mkdir -p weights
+          mkdir -p extra/datasets
+          ln -s ~/tinygrad/weights/bpe_simple_vocab_16e6.txt.gz weights/bpe_simple_vocab_16e6.txt.gz
+          ln -s ~/tinygrad/weights/LLaMA weights/LLaMA
+          ln -s ~/tinygrad/extra/datasets/cifar-10-python.tar.gz extra/datasets/cifar-10-python.tar.gz
+          ln -s /raid/weights/mixtral-8x7b-32kseqlen weights/mixtral-8x7b-32kseqlen
+          ln -s /raid/weights/LLaMA-2 weights/LLaMA-2
+          ln -s /raid/datasets/imagenet extra/datasets/imagenet
+      - name: setup staging db
+        if: github.ref == 'refs/heads/update_benchmark_staging'
+        run: |
+          echo "CACHEDB=/tmp/staging.db" >> $GITHUB_ENV
+          rm -f /tmp/staging.db /tmp/staging.db-shm /tmp/staging.db-wal
+      - name: reset process replay
+        run: test/external/process_replay/reset.py
+      # Fails on 9070
+      # - name: Test tensor cores
+      #   run: |
+      #     AMD=1 AMD_LLVM=0 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded_amd TestLinearizer.test_tensor_cores_padded_uops
+      #     AMD=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded_amd TestLinearizer.test_tensor_cores_padded_uops
+      #     AMD=1 SHOULD_USE_TC=1 BFLOAT16=1 DEBUG=2 python3 extra/gemm/simple_matmul.py
+      - name: Run Tensor Core GEMM (AMD)
+        run: AMD=1 SHOULD_USE_TC=1 HALF=1 DEBUG=2 ATOL=2e-2 python3 extra/gemm/simple_matmul.py | tee am_matmul_amd.txt
+      - name: Test AMD=1
+        run: DEBUG=2 AMD=1 python -m pytest -rA test/test_tiny.py
+      - name: Test driver cold start time
+        run: time AMD=1 AM_RESET=1 python3 test/test_tiny.py TestTiny.test_plus
+      - name: Test driver warm start time
+        run: time AMD=1 python3 test/test_tiny.py TestTiny.test_plus
+      - name: Test DISK copy time
+        run: AMD=1 TESTFILE=/raid/downloads/llama3-8b-sfr/model-00001-of-00004.safetensors python3 test/external/external_benchmark_disk_raw.py
+      - name: Run full CIFAR training w 1 GPU
+        run: time BENCHMARK_LOG=cifar AMD=1 DEFAULT_FLOAT=HALF LATEWINO=1 STEPS=1000 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee am_train_cifar_one_gpu.txt
+      - name: Run 10 MLPerf ResNet50 training steps (1 gpu)
+        run: BENCHMARK_LOG=resnet_10steps AMD=1 MNISTMOCK=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=256 GPUS=1 MODEL=resnet python3 examples/mlperf/model_train.py | tee am_train_resnet_one_gpu.txt
+      - name: Run 10 MLPerf Bert training steps (1 gpu)
+        # TODO: remove BERT_LAYERS once scheduler is fast
+        run: BENCHMARK_LOG=bert_10steps AMD=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=66 GPUS=1 BERT_LAYERS=2 FUSE_ARANGE=1 FUSE_ARANGE_UINT=0 MODEL=bert python3 examples/mlperf/model_train.py | tee am_train_bert_one_gpu.txt
+      - uses: actions/upload-artifact@v4
+        with:
+          name: Speed (AM Driver)
+          path: |
+            am_matmul_amd.txt
+            am_train_cifar_one_gpu.txt
+            am_train_resnet_one_gpu.txt
+            am_train_bert_one_gpu.txt
+      - name: Run process replay tests
+        run: cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3 process_replay.py
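For context: TestTiny.test_plus, used by the cold/warm start steps above, is a trivial tensor add, so the `time` wrapper is effectively measuring interpreter startup plus driver bring-up; running it once with AM_RESET=1 and once without separates full device re-init from the warm path. A minimal sketch of that kind of smoke test, assuming only that tinygrad is importable (the real test lives in test/test_tiny.py):

```python
# illustrative smoke test in the style of test/test_tiny.py; the add itself is
# trivial, so timing this process measures import + device init, not compute
import unittest
from tinygrad import Tensor

class TestTiny(unittest.TestCase):
  def test_plus(self):
    out = Tensor([1.0, 2.0, 3.0]) + Tensor([4.0, 5.0, 6.0])
    self.assertEqual(out.tolist(), [5.0, 7.0, 9.0])

if __name__ == "__main__":
  unittest.main()
```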
+
+  testgreendriverbenchmark:
+    name: NV Benchmark
+    runs-on: [self-hosted, Linux, tinyboxrandom]
+    timeout-minutes: 10
+    defaults:
+      run:
+        shell: bash -e -o pipefail {0}
+    if: github.repository_owner == 'tinygrad'
+    steps:
+      - name: Checkout Code
+        uses: actions/checkout@v4
+      - name: Remove nv modules
+        run: ./extra/nvpci/nv_smi.py rmmod
+      - name: Symlink models and datasets
+        run: |
+          mkdir -p weights
+          mkdir -p extra/datasets
+          ln -s ~/tinygrad/weights/bpe_simple_vocab_16e6.txt.gz weights/bpe_simple_vocab_16e6.txt.gz
+          ln -s ~/tinygrad/weights/LLaMA weights/LLaMA
+          ln -s ~/tinygrad/extra/datasets/cifar-10-python.tar.gz extra/datasets/cifar-10-python.tar.gz
+          ln -s /raid/weights/mixtral-8x7b-32kseqlen weights/mixtral-8x7b-32kseqlen
+          ln -s /raid/weights/LLaMA-2 weights/LLaMA-2
+          ln -s /raid/datasets/imagenet extra/datasets/imagenet
+      - name: setup staging db
+        if: github.ref == 'refs/heads/update_benchmark_staging'
+        run: |
+          echo "CACHEDB=/tmp/staging.db" >> $GITHUB_ENV
+          rm -f /tmp/staging.db /tmp/staging.db-shm /tmp/staging.db-wal
+      - name: reset process replay
+        run: test/external/process_replay/reset.py
+      - name: Test tensor cores
+        run: NV=1 ALLOW_TF32=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded TestLinearizer.test_tensor_cores_padded_uops
+      - name: Test driver cold start time
+        run: time NV=1 AM_RESET=1 python3 test/test_tiny.py TestTiny.test_plus
+      - name: Test driver warm start time
+        run: time NV=1 python3 test/test_tiny.py TestTiny.test_plus
+      - name: Test DISK copy time
+        run: NV=1 TESTFILE=/raid/downloads/llama3-8b-sfr/model-00001-of-00004.safetensors python3 test/external/external_benchmark_disk_raw.py
+      - name: Test LLAMA-3
+        run: BENCHMARK_LOG=llama3_beam NV=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama3.py --size 8B --benchmark --temperature 0 | tee nv_llama3_beam.txt
+      - name: Run full CIFAR training w 1 GPU
+        run: time BENCHMARK_LOG=cifar NV=1 DEFAULT_FLOAT=HALF LATEWINO=1 STEPS=1000 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee nv_train_cifar_one_gpu.txt
+      - name: Run 10 MLPerf ResNet50 training steps (1 gpu)
+        run: BENCHMARK_LOG=resnet_10steps NV=1 MNISTMOCK=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=256 GPUS=1 MODEL=resnet python3 examples/mlperf/model_train.py | tee nv_train_resnet_one_gpu.txt
+      - name: Run 10 MLPerf Bert training steps (1 gpu)
+        # TODO: remove BERT_LAYERS once scheduler is fast
+        run: BENCHMARK_LOG=bert_10steps NV=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=66 GPUS=1 BERT_LAYERS=2 FUSE_ARANGE=1 FUSE_ARANGE_UINT=0 MODEL=bert python3 examples/mlperf/model_train.py | tee nv_train_bert_one_gpu.txt
+      - uses: actions/upload-artifact@v4
+        with:
+          name: Speed (NV Driver)
+          path: |
+            nv_llama3_beam.txt
+            nv_train_cifar_one_gpu.txt
+            nv_train_resnet_one_gpu.txt
+            nv_train_bert_one_gpu.txt
+      - name: Run process replay tests
+        run: cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3 process_replay.py
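The tensor core GEMM steps rely on simple_matmul.py's built-in correctness check, with ATOL loosened to 2e-2 for HALF inputs. A rough sketch of that kind of check; N, the RNG, and the rtol below are illustrative placeholders, not the script's exact values:

```python
# illustrative version of a simple_matmul.py-style check: a HALF GEMM on the
# device compared against a float32 NumPy reference within a loose tolerance
import numpy as np
from tinygrad import Tensor, dtypes

N = 4096
a = Tensor.rand(N, N, dtype=dtypes.half)
b = Tensor.rand(N, N, dtype=dtypes.half)
c = (a @ b).realize()  # with SHOULD_USE_TC=1 this is expected to hit the tensor-core path
ref = a.numpy().astype(np.float32) @ b.numpy().astype(np.float32)
np.testing.assert_allclose(c.numpy().astype(np.float32), ref, atol=2e-2, rtol=1e-2)
```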
diff --git a/test/external/external_benchmark_disk_raw.py b/test/external/external_benchmark_disk_raw.py
index 6566ae4109..4d4962352d 100644
--- a/test/external/external_benchmark_disk_raw.py
+++ b/test/external/external_benchmark_disk_raw.py
@@ -1,7 +1,8 @@
 import pathlib
 from tinygrad import Tensor, Device, Context
+from tinygrad.helpers import getenv
 
 if __name__ == "__main__":
   with Context(DEBUG=2):
-    disk_llama = Tensor(pathlib.Path("/raid/weights/LLaMA-3/8B/consolidated.00.pth"))
+    disk_llama = Tensor(pathlib.Path(getenv("TESTFILE", "/raid/weights/LLaMA-3/8B/consolidated.00.pth")))
     device_llama = disk_llama.to(Device.DEFAULT).realize()
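The TESTFILE override above is what lets both workflow jobs point the DISK copy benchmark at a specific safetensors shard instead of the hardcoded LLaMA-3 checkpoint. tinygrad.helpers.getenv coerces the environment value to the type of its default and returns the default when the variable is unset; a minimal sketch of that behavior (the variable names besides TESTFILE are just examples):

```python
# sketch of tinygrad.helpers.getenv semantics: the env string is coerced to the
# type of the default, so a str default yields a str and an int default an int
from tinygrad.helpers import getenv

path = getenv("TESTFILE", "/raid/weights/LLaMA-3/8B/consolidated.00.pth")  # str -> str
steps = getenv("BENCHMARK", 0)  # "10" in the environment becomes the int 10
```

Usage, as in the workflow: `TESTFILE=/raid/downloads/llama3-8b-sfr/model-00001-of-00004.safetensors python3 test/external/external_benchmark_disk_raw.py`.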