mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-01-08 22:48:25 -05:00
ci: add h machines (#11416)
* ci: add h machines * more * fix names * names not collide * 20 * 10
This commit is contained in:
124
.github/workflows/benchmark.yml
vendored
124
.github/workflows/benchmark.yml
vendored
@@ -637,3 +637,127 @@ jobs:
|
||||
openpilot_0_9_7.txt
|
||||
openpilot_image_0_9_4.txt
|
||||
openpilot_image_0_9_7.txt
|
||||
|
||||
testreddriverbenchmark:
|
||||
name: AM Benchmark
|
||||
runs-on: [self-hosted, Linux, tinyboxrandom]
|
||||
timeout-minutes: 10
|
||||
defaults:
|
||||
run:
|
||||
shell: bash -e -o pipefail {0}
|
||||
if: github.repository_owner == 'tinygrad'
|
||||
steps:
|
||||
- name: Checkout Code
|
||||
uses: actions/checkout@v4
|
||||
- name: Remove amdgpu
|
||||
run: sudo rmmod amdgpu || true
|
||||
- name: Cleanup running AM processes
|
||||
run: python extra/amdpci/am_smi.py --pids --kill
|
||||
- name: Symlink models and datasets
|
||||
run: |
|
||||
mkdir -p weights
|
||||
ln -s ~/tinygrad/weights/bpe_simple_vocab_16e6.txt.gz weights/bpe_simple_vocab_16e6.txt.gz
|
||||
ln -s ~/tinygrad/weights/LLaMA weights/LLaMA
|
||||
ln -s ~/tinygrad/extra/datasets/cifar-10-python.tar.gz extra/datasets/cifar-10-python.tar.gz
|
||||
ln -s /raid/weights/mixtral-8x7b-32kseqlen weights/mixtral-8x7b-32kseqlen
|
||||
ln -s /raid/weights/LLaMA-2 weights/LLaMA-2
|
||||
mkdir -p extra/datasets
|
||||
ln -s /raid/datasets/imagenet extra/datasets/imagenet
|
||||
- name: setup staging db
|
||||
if: github.ref == 'refs/heads/update_benchmark_staging'
|
||||
run: |
|
||||
echo "CACHEDB=/tmp/staging.db" >> $GITHUB_ENV
|
||||
rm -f /tmp/staging.db /tmp/staging.db-shm /tmp/staging.db-wal
|
||||
- name: reset process replay
|
||||
run: test/external/process_replay/reset.py
|
||||
# Fails on 9070
|
||||
# - name: Test tensor cores
|
||||
# run: |
|
||||
# AMD=1 AMD_LLVM=0 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded_amd TestLinearizer.test_tensor_cores_padded_uops
|
||||
# AMD=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded_amd TestLinearizer.test_tensor_cores_padded_uops
|
||||
# AMD=1 SHOULD_USE_TC=1 BFLOAT16=1 DEBUG=2 python3 extra/gemm/simple_matmul.py
|
||||
- name: Run Tensor Core GEMM (AMD)
|
||||
run: AMD=1 SHOULD_USE_TC=1 HALF=1 DEBUG=2 ATOL=2e-2 python3 extra/gemm/simple_matmul.py | tee am_matmul_amd.txt
|
||||
- name: Test AMD=1
|
||||
run: DEBUG=2 AMD=1 python -m pytest -rA test/test_tiny.py
|
||||
- name: Test driver cold start time
|
||||
run: time AMD=1 AM_RESET=1 python3 test/test_tiny.py TestTiny.test_plus
|
||||
- name: Test driver warm start time
|
||||
run: time AMD=1 python3 test/test_tiny.py TestTiny.test_plus
|
||||
- name: Test DISK copy time
|
||||
run: AMD=1 TESTFILE=/raid/downloads/llama3-8b-sfr/model-00001-of-00004.safetensors python3 test/external/external_benchmark_disk_raw.py
|
||||
- name: Run full CIFAR training w 1 GPU
|
||||
run: time BENCHMARK_LOG=cifar AMD=1 DEFAULT_FLOAT=HALF LATEWINO=1 STEPS=1000 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee am_train_cifar_one_gpu.txt
|
||||
- name: Run 10 MLPerf ResNet50 training steps (1 gpu)
|
||||
run: BENCHMARK_LOG=resnet_10steps AMD=1 MNISTMOCK=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=256 GPUS=1 MODEL=resnet python3 examples/mlperf/model_train.py | tee am_train_resnet_one_gpu.txt
|
||||
- name: Run 10 MLPerf Bert training steps (6 gpu)
|
||||
# TODO: remove BERT_LAYERS once scheduler is fast
|
||||
run: BENCHMARK_LOG=bert_10steps AMD=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=66 GPUS=1 BERT_LAYERS=2 FUSE_ARANGE=1 FUSE_ARANGE_UINT=0 MODEL=bert python3 examples/mlperf/model_train.py | tee am_train_bert_one_gpu.txt
|
||||
- uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: Speed (AM Driver)
|
||||
path: |
|
||||
am_matmul_amd.txt
|
||||
am_train_cifar_one_gpu.txt
|
||||
am_train_resnet_one_gpu.txt
|
||||
am_train_bert_one_gpu.txt
|
||||
- name: Run process replay tests
|
||||
run: cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3 process_replay.py
|
||||
|
||||
testgreendriverbenchmark:
|
||||
name: NV Benchmark
|
||||
runs-on: [self-hosted, Linux, tinyboxrandom]
|
||||
timeout-minutes: 10
|
||||
defaults:
|
||||
run:
|
||||
shell: bash -e -o pipefail {0}
|
||||
if: github.repository_owner == 'tinygrad'
|
||||
steps:
|
||||
- name: Checkout Code
|
||||
uses: actions/checkout@v4
|
||||
- name: Remove nv modules
|
||||
run: ./extra/nvpci/nv_smi.py rmmod
|
||||
- name: Symlink models and datasets
|
||||
run: |
|
||||
mkdir -p weights
|
||||
ln -s ~/tinygrad/weights/bpe_simple_vocab_16e6.txt.gz weights/bpe_simple_vocab_16e6.txt.gz
|
||||
ln -s ~/tinygrad/weights/LLaMA weights/LLaMA
|
||||
ln -s ~/tinygrad/extra/datasets/cifar-10-python.tar.gz extra/datasets/cifar-10-python.tar.gz
|
||||
ln -s /raid/weights/mixtral-8x7b-32kseqlen weights/mixtral-8x7b-32kseqlen
|
||||
ln -s /raid/weights/LLaMA-2 weights/LLaMA-2
|
||||
mkdir -p extra/datasets
|
||||
ln -s /raid/datasets/imagenet extra/datasets/imagenet
|
||||
- name: setup staging db
|
||||
if: github.ref == 'refs/heads/update_benchmark_staging'
|
||||
run: |
|
||||
echo "CACHEDB=/tmp/staging.db" >> $GITHUB_ENV
|
||||
rm -f /tmp/staging.db /tmp/staging.db-shm /tmp/staging.db-wal
|
||||
- name: reset process replay
|
||||
run: test/external/process_replay/reset.py
|
||||
- name: Test tensor cores
|
||||
run: NV=1 ALLOW_TF32=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded TestLinearizer.test_tensor_cores_padded_uops
|
||||
- name: Test driver cold start time
|
||||
run: time NV=1 AM_RESET=1 python3 test/test_tiny.py TestTiny.test_plus
|
||||
- name: Test driver warm start time
|
||||
run: time NV=1 python3 test/test_tiny.py TestTiny.test_plus
|
||||
- name: Test DISK copy time
|
||||
run: NV=1 TESTFILE=/raid/downloads/llama3-8b-sfr/model-00001-of-00004.safetensors python3 test/external/external_benchmark_disk_raw.py
|
||||
- name: Test LLAMA-3
|
||||
run: BENCHMARK_LOG=llama3_beam NV=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama3.py --size 8B --benchmark --temperature 0 | tee nv_llama3_beam.txt
|
||||
- name: Run full CIFAR training w 1 GPU
|
||||
run: time BENCHMARK_LOG=cifar AMD=1 DEFAULT_FLOAT=HALF LATEWINO=1 STEPS=1000 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee nv_train_cifar_one_gpu.txt
|
||||
- name: Run 10 MLPerf ResNet50 training steps (1 gpu)
|
||||
run: BENCHMARK_LOG=resnet_10steps NV=1 MNISTMOCK=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=256 GPUS=1 MODEL=resnet python3 examples/mlperf/model_train.py | tee nv_train_resnet_one_gpu.txt
|
||||
- name: Run 10 MLPerf Bert training steps (6 gpu)
|
||||
# TODO: remove BERT_LAYERS once scheduler is fast
|
||||
run: BENCHMARK_LOG=bert_10steps NV=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=66 GPUS=1 BERT_LAYERS=2 FUSE_ARANGE=1 FUSE_ARANGE_UINT=0 MODEL=bert python3 examples/mlperf/model_train.py | tee nv_train_bert_one_gpu.txt
|
||||
- uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: Speed (NV Driver)
|
||||
path: |
|
||||
nv_llama3_beam.txt
|
||||
nv_train_cifar_one_gpu.txt
|
||||
nv_train_resnet_one_gpu.txt
|
||||
nv_train_bert_one_gpu.txt
|
||||
- name: Run process replay tests
|
||||
run: cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3 process_replay.py
|
||||
|
||||
3
test/external/external_benchmark_disk_raw.py
vendored
3
test/external/external_benchmark_disk_raw.py
vendored
@@ -1,7 +1,8 @@
|
||||
import pathlib
|
||||
from tinygrad import Tensor, Device, Context
|
||||
from tinygrad.helpers import getenv
|
||||
|
||||
if __name__ == "__main__":
|
||||
with Context(DEBUG=2):
|
||||
disk_llama = Tensor(pathlib.Path("/raid/weights/LLaMA-3/8B/consolidated.00.pth"))
|
||||
disk_llama = Tensor(pathlib.Path(getenv("TESTFILE", "/raid/weights/LLaMA-3/8B/consolidated.00.pth")))
|
||||
device_llama = disk_llama.to(Device.DEFAULT).realize()
|
||||
|
||||
Reference in New Issue
Block a user