diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_1xMI300X/dev_beam.sh b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_1xMI300X/dev_beam.sh new file mode 100755 index 0000000000..68e5fdfcde --- /dev/null +++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_1xMI300X/dev_beam.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +export PYTHONPATH="." AMD=1 +export MODEL="bert" +export DEFAULT_FLOAT="HALF" GPUS=1 BS=128 EVAL_BS=128 + +export IGNORE_OOB=1 + +export BEAM=3 BEAM_UOPS_MAX=4000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 +export IGNORE_JIT_FIRST_BEAM=1 +# export BEAM_LOG_SURPASS_MAX=1 +# export BASEDIR="/raid/datasets/wiki" + +export RESET_STEP=1 +export BENCHMARK=10 BERT_LAYERS=2 DEBUG=2 + +python3 examples/mlperf/model_train.py diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/README.md b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/README.md new file mode 100644 index 0000000000..844b90f949 --- /dev/null +++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/README.md @@ -0,0 +1,69 @@ +# 1. Problem + +This problem uses BERT for NLP. + +## Requirements + +Install tinygrad and mlperf-logging (uncomment mlperf from setup.py) from branch mlperf_training_v5.0. +``` +git clone https://github.com/tinygrad/tinygrad.git +python3 -m pip install -e ".[mlperf]" +``` +Also install gdown (for dataset), numpy, tqdm and tensorflow. +``` +pip install gdown numpy tqdm tensorflow +``` + +### tinybox_green +Install the p2p driver per [README](https://github.com/tinygrad/open-gpu-kernel-modules/blob/550.54.15-p2p/README.md) +This is the default on production tinybox green. + +# 2. Directions + +## Steps to download and verify data + +### 1. Download raw data + +``` +BASEDIR="/raid/datasets/wiki" WIKI_TRAIN=1 VERIFY_CHECKSUM=1 python3 extra/datasets/wikipedia_download.py +``` + +### 2. Preprocess train and validation data + +Note: The number of threads used for preprocessing is limited by available memory. With 128GB of RAM, a maximum of 16 threads is recommended. 
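+
+A rough way to choose `NUM_WORKERS` (an illustrative sketch, not part of the submission scripts; it assumes roughly 8 GB of RAM per preprocessing worker, the ratio implied by the note above):
+```
+# derive a NUM_WORKERS value from available RAM, assuming ~8 GB per preprocessing worker
+MEM_GB=$(free -g | awk '/^Mem:/ {print $7}')   # "available" column, in GiB
+NUM_WORKERS=$(( MEM_GB / 8 ))
+(( NUM_WORKERS < 1 )) && NUM_WORKERS=1
+(( NUM_WORKERS > $(nproc) )) && NUM_WORKERS=$(nproc)
+echo "suggested NUM_WORKERS=$NUM_WORKERS"
+```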
+ +#### Training: +``` +BASEDIR="/raid/datasets/wiki" NUM_WORKERS=16 python3 extra/datasets/wikipedia.py pre-train all +``` + +Generating a specific topic (Between 0 and 499) +``` +BASEDIR="/raid/datasets/wiki" python3 extra/datasets/wikipedia.py pre-train 42 +``` + +#### Validation: +``` +BASEDIR="/raid/datasets/wiki" python3 extra/datasets/wikipedia.py pre-eval +``` +## Running + +### tinybox_green + +#### Steps to run benchmark +``` +examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh +``` + +### tinybox_red + +#### Steps to run benchmark +``` +examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh +``` +### tinybox_8xMI300X + +#### Steps to run benchmark +``` +examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/run_and_time.sh +``` \ No newline at end of file diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/dev_beam.sh b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/dev_beam.sh new file mode 100755 index 0000000000..cfaad1e59e --- /dev/null +++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/dev_beam.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +export PYTHONPATH="." AMD=1 +export MODEL="bert" +export DEFAULT_FLOAT="HALF" GPUS=8 BS=1024 EVAL_BS=1024 +export OPT_BASE_LEARNING_RATE=0.0011 OPT_LAMB_BETA_1=0.60466 OPT_LAMB_BETA_2=0.85437 DECAY=0.1 + +export IGNORE_OOB=1 + +export BEAM=3 BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 +export IGNORE_JIT_FIRST_BEAM=1 FREE_INTERMEDIATE=0 +export BASEDIR="/raid/datasets/wiki" + +export BENCHMARK=10 BERT_LAYERS=2 + +python3 examples/mlperf/model_train.py diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/dev_run.sh b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/dev_run.sh new file mode 100755 index 0000000000..6ef7c1b996 --- /dev/null +++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/dev_run.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +export PYTHONPATH="." 
AMD=1 +export MODEL="bert" +export DEFAULT_FLOAT="HALF" GPUS=8 BS=1024 EVAL_BS=1024 + +# similar to https://github.com/mlcommons/training_results_v3.1/blob/d06288b2bd675a9d88e0e6181f5bb5626b71ec19/Quanta_Cloud_Technology/results/D54U-3U/bert/result_1.txt#L54 +export OPT_BASE_LEARNING_RATE=0.0011 OPT_LAMB_BETA_1=0.60466 OPT_LAMB_BETA_2=0.85437 DECAY=0.1 +export TRAIN_STEPS=3900 + +export IGNORE_OOB=1 + +export BEAM=3 BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 +export IGNORE_JIT_FIRST_BEAM=1 FREE_INTERMEDIATE=0 +export BASEDIR="/raid/datasets/wiki" + +export WANDB=1 PARALLEL=0 + +RUNMLPERF=1 python3 examples/mlperf/model_train.py \ No newline at end of file diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/run_and_time.sh b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/run_and_time.sh new file mode 100755 index 0000000000..cd2f30579b --- /dev/null +++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/run_and_time.sh @@ -0,0 +1,30 @@ +#!/bin/bash +set -e # Exit on any error +set -o pipefail # Make pipeline fail if any command fails + +export PYTHONPATH="." AMD=1 +export MODEL="bert" +export SUBMISSION_PLATFORM="tinybox_8xMI300X" +export DEFAULT_FLOAT="HALF" GPUS=8 BS=1024 EVAL_BS=1024 + +# similar to https://github.com/mlcommons/training_results_v3.1/blob/d06288b2bd675a9d88e0e6181f5bb5626b71ec19/Quanta_Cloud_Technology/results/D54U-3U/bert/result_1.txt#L54 +export OPT_BASE_LEARNING_RATE=0.0011 OPT_LAMB_BETA_1=0.60466 OPT_LAMB_BETA_2=0.85437 DECAY=0.1 +export TRAIN_STEPS=3900 + +export IGNORE_OOB=1 + +export BEAM=3 BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 +export IGNORE_JIT_FIRST_BEAM=1 FREE_INTERMEDIATE=0 +export BASEDIR="/raid/datasets/wiki" + +# pip install -e ".[mlperf]" +export LOGMLPERF=1 + +export SEED=$RANDOM +DATETIME=$(date "+%m%d%H%M") +LOGFILE="bert_8xMI300x_${DATETIME}_${SEED}.log" + +BENCHMARK=10 INITMLPERF=1 BERT_LAYERS=2 python3 examples/mlperf/model_train.py | tee $LOGFILE + +# run +PARALLEL=0 RUNMLPERF=1 python3 examples/mlperf/model_train.py | tee -a $LOGFILE diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_green/README.md b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_green/README.md new file mode 100644 index 0000000000..844b90f949 --- /dev/null +++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_green/README.md @@ -0,0 +1,69 @@ +# 1. Problem + +This problem uses BERT for NLP. + +## Requirements + +Install tinygrad and mlperf-logging (uncomment mlperf from setup.py) from branch mlperf_training_v5.0. +``` +git clone https://github.com/tinygrad/tinygrad.git +python3 -m pip install -e ".[mlperf]" +``` +Also install gdown (for dataset), numpy, tqdm and tensorflow. +``` +pip install gdown numpy tqdm tensorflow +``` + +### tinybox_green +Install the p2p driver per [README](https://github.com/tinygrad/open-gpu-kernel-modules/blob/550.54.15-p2p/README.md) +This is the default on production tinybox green. + +# 2. Directions + +## Steps to download and verify data + +### 1. Download raw data + +``` +BASEDIR="/raid/datasets/wiki" WIKI_TRAIN=1 VERIFY_CHECKSUM=1 python3 extra/datasets/wikipedia_download.py +``` + +### 2. 
Preprocess train and validation data + +Note: The number of threads used for preprocessing is limited by available memory. With 128GB of RAM, a maximum of 16 threads is recommended. + +#### Training: +``` +BASEDIR="/raid/datasets/wiki" NUM_WORKERS=16 python3 extra/datasets/wikipedia.py pre-train all +``` + +Generating a specific topic (Between 0 and 499) +``` +BASEDIR="/raid/datasets/wiki" python3 extra/datasets/wikipedia.py pre-train 42 +``` + +#### Validation: +``` +BASEDIR="/raid/datasets/wiki" python3 extra/datasets/wikipedia.py pre-eval +``` +## Running + +### tinybox_green + +#### Steps to run benchmark +``` +examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh +``` + +### tinybox_red + +#### Steps to run benchmark +``` +examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh +``` +### tinybox_8xMI300X + +#### Steps to run benchmark +``` +examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/run_and_time.sh +``` \ No newline at end of file diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_beam.sh b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_beam.sh new file mode 100755 index 0000000000..a2d477312d --- /dev/null +++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_beam.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +export PYTHONPATH="." NV=1 +export MODEL="bert" +export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=90 EVAL_BS=90 + +export IGNORE_OOB=1 + +export BEAM=8 BEAM_UOPS_MAX=10000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 +export IGNORE_JIT_FIRST_BEAM=1 +export BEAM_LOG_SURPASS_MAX=1 +export BASEDIR="/raid/datasets/wiki" + +export BENCHMARK=10 BERT_LAYERS=2 DEBUG=2 + +python3 examples/mlperf/model_train.py diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_run.sh b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_run.sh new file mode 100755 index 0000000000..4365466211 --- /dev/null +++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_run.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +export PYTHONPATH="." NV=1 +export MODEL="bert" +export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=90 EVAL_BS=90 + +export IGNORE_OOB=1 + +export BEAM=8 BEAM_UOPS_MAX=10000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 +export IGNORE_JIT_FIRST_BEAM=1 +export BASEDIR="/raid/datasets/wiki" + +export WANDB=1 PARALLEL=0 + +RUNMLPERF=1 python3 examples/mlperf/model_train.py \ No newline at end of file diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh new file mode 100755 index 0000000000..4b3b911933 --- /dev/null +++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh @@ -0,0 +1,27 @@ +#!/bin/bash +set -e # Exit on any error +set -o pipefail # Make pipeline fail if any command fails + +export PYTHONPATH="." 
NV=1 +export MODEL="bert" +export SUBMISSION_PLATFORM="tinybox_green" +export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=90 EVAL_BS=90 + +export IGNORE_OOB=1 + +export BEAM=8 BEAM_UOPS_MAX=10000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 +export IGNORE_JIT_FIRST_BEAM=1 +export BASEDIR="/raid/datasets/wiki" + +# pip install -e ".[mlperf]" +export LOGMLPERF=1 + +export SEED=$RANDOM +DATETIME=$(date "+%m%d%H%M") +LOGFILE="bert_green_${DATETIME}_${SEED}.log" + +# init +BENCHMARK=10 INITMLPERF=1 BERT_LAYERS=2 python3 examples/mlperf/model_train.py | tee $LOGFILE + +# run +PARALLEL=0 RUNMLPERF=1 python3 examples/mlperf/model_train.py | tee -a $LOGFILE diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_red/README.md b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_red/README.md new file mode 100644 index 0000000000..844b90f949 --- /dev/null +++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_red/README.md @@ -0,0 +1,69 @@ +# 1. Problem + +This problem uses BERT for NLP. + +## Requirements + +Install tinygrad and mlperf-logging (uncomment mlperf from setup.py) from branch mlperf_training_v5.0. +``` +git clone https://github.com/tinygrad/tinygrad.git +python3 -m pip install -e ".[mlperf]" +``` +Also install gdown (for dataset), numpy, tqdm and tensorflow. +``` +pip install gdown numpy tqdm tensorflow +``` + +### tinybox_green +Install the p2p driver per [README](https://github.com/tinygrad/open-gpu-kernel-modules/blob/550.54.15-p2p/README.md) +This is the default on production tinybox green. + +# 2. Directions + +## Steps to download and verify data + +### 1. Download raw data + +``` +BASEDIR="/raid/datasets/wiki" WIKI_TRAIN=1 VERIFY_CHECKSUM=1 python3 extra/datasets/wikipedia_download.py +``` + +### 2. Preprocess train and validation data + +Note: The number of threads used for preprocessing is limited by available memory. With 128GB of RAM, a maximum of 16 threads is recommended. + +#### Training: +``` +BASEDIR="/raid/datasets/wiki" NUM_WORKERS=16 python3 extra/datasets/wikipedia.py pre-train all +``` + +Generating a specific topic (Between 0 and 499) +``` +BASEDIR="/raid/datasets/wiki" python3 extra/datasets/wikipedia.py pre-train 42 +``` + +#### Validation: +``` +BASEDIR="/raid/datasets/wiki" python3 extra/datasets/wikipedia.py pre-eval +``` +## Running + +### tinybox_green + +#### Steps to run benchmark +``` +examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh +``` + +### tinybox_red + +#### Steps to run benchmark +``` +examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh +``` +### tinybox_8xMI300X + +#### Steps to run benchmark +``` +examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/run_and_time.sh +``` \ No newline at end of file diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_beam.sh b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_beam.sh new file mode 100755 index 0000000000..881dd247b4 --- /dev/null +++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_beam.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +export PYTHONPATH="." 
AMD=1 +export MODEL="bert" +export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=90 EVAL_BS=90 + +export IGNORE_OOB=1 + +export BEAM=5 BEAM_UOPS_MAX=8000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 +export IGNORE_JIT_FIRST_BEAM=1 +export BEAM_LOG_SURPASS_MAX=1 +export BASEDIR="/raid/datasets/wiki" + +export RESET_STEP=1 +export BENCHMARK=10 BERT_LAYERS=2 DEBUG=2 + +python3 examples/mlperf/model_train.py diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_run.sh b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_run.sh new file mode 100755 index 0000000000..719ecd5bf9 --- /dev/null +++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_run.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +export PYTHONPATH="." AMD=1 +export MODEL="bert" +export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=90 EVAL_BS=90 + +export IGNORE_OOB=1 + +export BEAM=5 BEAM_UOPS_MAX=8000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 +export IGNORE_JIT_FIRST_BEAM=1 +export BASEDIR="/raid/datasets/wiki" + +export WANDB=1 PARALLEL=0 + +RUNMLPERF=1 python3 examples/mlperf/model_train.py \ No newline at end of file diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh new file mode 100755 index 0000000000..4b30305947 --- /dev/null +++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh @@ -0,0 +1,30 @@ +#!/bin/bash +set -e # Exit on any error +set -o pipefail # Make pipeline fail if any command fails + +export PYTHONPATH="." AMD=1 +export MODEL="bert" +export SUBMISSION_PLATFORM="tinybox_red" +export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=90 EVAL_BS=90 + +export IGNORE_OOB=1 + +export BEAM=5 BEAM_UOPS_MAX=8000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 +export IGNORE_JIT_FIRST_BEAM=1 +export BASEDIR="/raid/datasets/wiki" + +# pip install -e ".[mlperf]" +export LOGMLPERF=1 + +export SEED=$RANDOM +DATETIME=$(date "+%m%d%H%M") +LOGFILE="bert_red_${DATETIME}_${SEED}.log" + +export HCQDEV_WAIT_TIMEOUT_MS=100000 # prevents hang? + +# init +sleep 5 && sudo rmmod amdgpu || true +BENCHMARK=10 INITMLPERF=1 BERT_LAYERS=2 python3 examples/mlperf/model_train.py | tee $LOGFILE + +# run +PARALLEL=0 RUNMLPERF=1 python3 examples/mlperf/model_train.py | tee -a $LOGFILE diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/resnet/implementations/tinybox_green/README.md b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/resnet/implementations/tinybox_green/README.md new file mode 100644 index 0000000000..d380cec5b5 --- /dev/null +++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/resnet/implementations/tinybox_green/README.md @@ -0,0 +1,50 @@ +# 1. Problem + +This problem uses the ResNet-50 CNN to do image classification. + +## Requirements + +Install tinygrad and mlperf-logging from master. +``` +git clone https://github.com/tinygrad/tinygrad.git +python3 -m pip install -e ".[mlperf]" +``` + +### tinybox_green +Install the p2p driver per [README](https://github.com/tinygrad/open-gpu-kernel-modules/blob/550.54.15-p2p/README.md) +This is the default on production tinybox green. + +### tinybox_red +Disable cwsr +This is the default on production tinybox red. 
+``` +# write the amdgpu module option (root required) +sudo tee /etc/modprobe.d/amdgpu.conf <<EOF +options amdgpu cwsr_enable=0 +EOF +sudo update-initramfs -u +sudo reboot + +# validate +sudo cat /sys/module/amdgpu/parameters/cwsr_enable #= 0 +``` + +# 2. Directions + +## Steps to download and verify data + +``` +IMGNET_TRAIN=1 python3 extra/datasets/imagenet_download.py +``` + +## Steps for one time setup + +### tinybox_red +``` +examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/setup.sh +``` + +## Steps to run benchmark +``` +examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/resnet/implementations/tinybox_green/run_and_time.sh +``` diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/resnet/implementations/tinybox_green/dev_beam.sh b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/resnet/implementations/tinybox_green/dev_beam.sh new file mode 100755 index 0000000000..2319da3fdc --- /dev/null +++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/resnet/implementations/tinybox_green/dev_beam.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +export PYTHONPATH="." NV=1 +export MODEL="resnet" +export DEFAULT_FLOAT="HALF" GPUS=6 BS=1536 EVAL_BS=192 + +export RESET_STEP=0 + +export TRAIN_BEAM=4 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=1500 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=10 BEAM_PADTO=0 + +export BENCHMARK=10 DEBUG=2 + +python3 examples/mlperf/model_train.py diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/resnet/implementations/tinybox_green/dev_run.sh b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/resnet/implementations/tinybox_green/dev_run.sh new file mode 100755 index 0000000000..ebe927c373 --- /dev/null +++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/resnet/implementations/tinybox_green/dev_run.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +export PYTHONPATH="." NV=1 +export MODEL="resnet" +export DEFAULT_FLOAT="HALF" GPUS=6 BS=1536 EVAL_BS=192 + +export RESET_STEP=0 + +export TRAIN_BEAM=4 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=1500 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=10 BEAM_PADTO=0 + +export EVAL_START_EPOCH=3 EVAL_FREQ=4 + +export WANDB=1 PARALLEL=0 + +python3 examples/mlperf/model_train.py diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/resnet/implementations/tinybox_green/run_and_time.sh b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/resnet/implementations/tinybox_green/run_and_time.sh new file mode 100755 index 0000000000..9c7193288a --- /dev/null +++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/resnet/implementations/tinybox_green/run_and_time.sh @@ -0,0 +1,25 @@ +#!/bin/bash +set -e # Exit on any error +set -o pipefail # Make pipeline fail if any command fails + +export PYTHONPATH="."
NV=1 +export MODEL="resnet" +export SUBMISSION_PLATFORM="tinybox_green" +export DEFAULT_FLOAT="HALF" GPUS=6 BS=1536 EVAL_BS=192 + +export RESET_STEP=0 + +export TRAIN_BEAM=4 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=1500 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=10 BEAM_PADTO=0 + +# pip install -e ".[mlperf]" +export LOGMLPERF=${LOGMLPERF:-1} + +export SEED=$RANDOM +DATETIME=$(date "+%m%d%H%M") +LOGFILE="resnet_green_${DATETIME}_${SEED}.log" + +# init +BENCHMARK=10 INITMLPERF=1 python3 examples/mlperf/model_train.py | tee $LOGFILE + +# run +PARALLEL=0 RUNMLPERF=1 EVAL_START_EPOCH=3 EVAL_FREQ=4 python3 examples/mlperf/model_train.py | tee -a $LOGFILE diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/README.md b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/README.md new file mode 100644 index 0000000000..d380cec5b5 --- /dev/null +++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/README.md @@ -0,0 +1,50 @@ +# 1. Problem + +This problem uses the ResNet-50 CNN to do image classification. + +## Requirements + +Install tinygrad and mlperf-logging from master. +``` +git clone https://github.com/tinygrad/tinygrad.git +python3 -m pip install -e ".[mlperf]" +``` + +### tinybox_green +Install the p2p driver per [README](https://github.com/tinygrad/open-gpu-kernel-modules/blob/550.54.15-p2p/README.md) +This is the default on production tinybox green. + +### tinybox_red +Disable cwsr +This is the default on production tinybox red. +``` +# write the amdgpu module option (root required) +sudo tee /etc/modprobe.d/amdgpu.conf <<EOF +options amdgpu cwsr_enable=0 +EOF +sudo update-initramfs -u +sudo reboot + +# validate +sudo cat /sys/module/amdgpu/parameters/cwsr_enable #= 0 +``` + +# 2. Directions + +## Steps to download and verify data + +``` +IMGNET_TRAIN=1 python3 extra/datasets/imagenet_download.py +``` + +## Steps for one time setup + +### tinybox_red +``` +examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/setup.sh +``` + +## Steps to run benchmark +``` +examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/run_and_time.sh +``` diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/dev_beam.sh b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/dev_beam.sh new file mode 100755 index 0000000000..7bcbec2f03 --- /dev/null +++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/dev_beam.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +export PYTHONPATH="." AMD=1 +export MODEL="resnet" +export DEFAULT_FLOAT="HALF" GPUS=6 BS=1536 EVAL_BS=192 + +export RESET_STEP=0 + +export TRAIN_BEAM=4 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=2000 BEAM_UPCAST_MAX=96 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 BEAM_PADTO=0 + +export BENCHMARK=10 DEBUG=${DEBUG:-2} + +python3 examples/mlperf/model_train.py diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/dev_run.sh b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/dev_run.sh new file mode 100755 index 0000000000..aad23e43df --- /dev/null +++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/dev_run.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +export PYTHONPATH="."
AMD=1 +export MODEL="resnet" +export DEFAULT_FLOAT="HALF" GPUS=6 BS=1536 EVAL_BS=192 + +export RESET_STEP=0 + +export TRAIN_BEAM=4 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=2000 BEAM_UPCAST_MAX=96 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 BEAM_PADTO=0 + +export EVAL_START_EPOCH=3 EVAL_FREQ=4 + +export WANDB=1 PARALLEL=0 + +python3 examples/mlperf/model_train.py diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/run_and_time.sh b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/run_and_time.sh new file mode 100755 index 0000000000..7a93d435a5 --- /dev/null +++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/run_and_time.sh @@ -0,0 +1,26 @@ +#!/bin/bash +set -e # Exit on any error +set -o pipefail # Make pipeline fail if any command fails + +export PYTHONPATH="." AMD=1 +export MODEL="resnet" +export SUBMISSION_PLATFORM="tinybox_red" +export DEFAULT_FLOAT="HALF" GPUS=6 BS=1536 EVAL_BS=192 + +export RESET_STEP=0 + +export TRAIN_BEAM=4 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=2000 BEAM_UPCAST_MAX=96 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 BEAM_PADTO=0 + +# pip install -e ".[mlperf]" +export LOGMLPERF=${LOGMLPERF:-1} + +export SEED=$RANDOM +DATETIME=$(date "+%m%d%H%M") +LOGFILE="resnet_red_${DATETIME}_${SEED}.log" + +# init +sleep 5 && sudo rmmod amdgpu || true +BENCHMARK=10 INITMLPERF=1 python3 examples/mlperf/model_train.py | tee $LOGFILE + +# run +PARALLEL=0 RUNMLPERF=1 EVAL_START_EPOCH=3 EVAL_FREQ=4 python3 examples/mlperf/model_train.py | tee -a $LOGFILE diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/setup.sh b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/setup.sh new file mode 100755 index 0000000000..a9806164f4 --- /dev/null +++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/setup.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +rocm-smi --setprofile compute +rocm-smi --setmclk 3 +rocm-smi --setperflevel high + +# power cap to 350W +echo "350000000" | sudo tee /sys/class/drm/card{1..6}/device/hwmon/hwmon*/power1_cap diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/retinanet/implementations/tinybox_green/README.md b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/retinanet/implementations/tinybox_green/README.md new file mode 100644 index 0000000000..ce1ac9b9a3 --- /dev/null +++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/retinanet/implementations/tinybox_green/README.md @@ -0,0 +1,38 @@ +# 1. Problem + +This problem uses RetinaNet for SSD. + +## Requirements + +Install tinygrad and mlperf-logging (uncomment mlperf from setup.py) from branch mlperf_training_v5.0. +``` +git clone https://github.com/tinygrad/tinygrad.git +python3 -m pip install -e ".[mlperf]" +``` + +Also install the following dependencies: +``` +pip install tqdm numpy pycocotools boto3 pandas torch torchvision +``` + +### tinybox_green +Install the p2p driver per [README](https://github.com/tinygrad/open-gpu-kernel-modules/blob/550.54.15-p2p/README.md) +This is the default on production tinybox green. + +# 2. 
Directions + +## Steps to download data + +Run the following: +``` +BASEDIR=/raid/datasets/openimages python3 extra/datasets/openimages.py +``` + +## Running + +### tinybox_green + +#### Steps to run benchmark +``` +examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/retinanet/implementations/tinybox_green/run_and_time.sh +``` diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/retinanet/implementations/tinybox_green/dev_beam.sh b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/retinanet/implementations/tinybox_green/dev_beam.sh new file mode 100755 index 0000000000..6e25bb9671 --- /dev/null +++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/retinanet/implementations/tinybox_green/dev_beam.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +export PYTHONPATH="." NV=1 +export MODEL="retinanet" +export DEFAULT_FLOAT="HALF" GPUS=6 BS=96 EVAL_BS=96 +export BASEDIR="/raid/datasets/openimages" + +# export RESET_STEP=0 + +export TRAIN_BEAM=2 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=1500 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 BEAM_PADTO=0 + +export BENCHMARK=5 DEBUG=2 + +python examples/mlperf/model_train.py diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/retinanet/implementations/tinybox_green/dev_run.sh b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/retinanet/implementations/tinybox_green/dev_run.sh new file mode 100755 index 0000000000..7a3ee0dfa2 --- /dev/null +++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/retinanet/implementations/tinybox_green/dev_run.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +export PYTHONPATH="." NV=1 +export MODEL="retinanet" +export DEFAULT_FLOAT="HALF" GPUS=6 BS=96 EVAL_BS=96 +export BASEDIR="/raid/datasets/openimages" + +# export RESET_STEP=0 + +export TRAIN_BEAM=2 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=1500 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 BEAM_PADTO=0 + +export WANDB=1 PARALLEL=0 +export RUNMLPERF=1 + +python examples/mlperf/model_train.py diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/retinanet/implementations/tinybox_green/run_and_time.sh b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/retinanet/implementations/tinybox_green/run_and_time.sh new file mode 100755 index 0000000000..74cdc87a1b --- /dev/null +++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/retinanet/implementations/tinybox_green/run_and_time.sh @@ -0,0 +1,25 @@ +#!/bin/bash +set -e # Exit on any error +set -o pipefail # Make pipeline fail if any command fails + +export PYTHONPATH="." 
NV=1 +export MODEL="retinanet" +export SUBMISSION_PLATFORM="tinybox_green" +export DEFAULT_FLOAT="HALF" GPUS=6 BS=96 EVAL_BS=96 + +export TRAIN_BEAM=2 BEAM_UOPS_MAX=1500 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 BEAM_PADTO=0 +export IGNORE_JIT_FIRST_BEAM=1 +export BASEDIR="/raid/datasets/openimages" + +# pip install -e ".[mlperf]" +export LOGMLPERF=1 + +export SEED=$RANDOM +DATETIME=$(date "+%m%d%H%M") +LOGFILE="retinanet_green_${DATETIME}_${SEED}.log" + +# init +BENCHMARK=10 INITMLPERF=1 python3 examples/mlperf/model_train.py | tee $LOGFILE + +# run +PARALLEL=0 RUNMLPERF=1 python3 examples/mlperf/model_train.py | tee -a $LOGFILE diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/retinanet/implementations/tinybox_red/dev_beam.sh b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/retinanet/implementations/tinybox_red/dev_beam.sh new file mode 100755 index 0000000000..97aa5155eb --- /dev/null +++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/retinanet/implementations/tinybox_red/dev_beam.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +export PYTHONPATH="." AMD=1 +export MODEL="retinanet" +export DEFAULT_FLOAT="HALF" GPUS=6 BS=96 EVAL_BS=96 +export BASEDIR="/raid/datasets/openimages" + +# export RESET_STEP=0 + +export TRAIN_BEAM=2 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=1500 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 BEAM_PADTO=0 + +export BENCHMARK=5 DEBUG=2 + +python examples/mlperf/model_train.py diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/retinanet/implementations/tinybox_red/dev_run.sh b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/retinanet/implementations/tinybox_red/dev_run.sh new file mode 100755 index 0000000000..5fb4d109fd --- /dev/null +++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/retinanet/implementations/tinybox_red/dev_run.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +export PYTHONPATH="." 
AMD=1 +export MODEL="retinanet" +export DEFAULT_FLOAT="HALF" GPUS=6 BS=96 EVAL_BS=96 +export BASEDIR="/raid/datasets/openimages" + +# export RESET_STEP=0 + +export TRAIN_BEAM=2 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=1500 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 BEAM_PADTO=0 + +export WANDB=1 PARALLEL=0 +export RUNMLPERF=1 + +python examples/mlperf/model_train.py diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/systems/tinybox_8xMI300X.json b/examples/mlperf/training_submission_v6.0/tinycorp/systems/tinybox_8xMI300X.json new file mode 100644 index 0000000000..1e0f789430 --- /dev/null +++ b/examples/mlperf/training_submission_v6.0/tinycorp/systems/tinybox_8xMI300X.json @@ -0,0 +1,38 @@ +{ + "submitter": "tinycorp", + "division": "closed", + "status": "Available on-premise", + "system_name": "tinybox 8xMI300X", + "number_of_nodes": "1", + "host_processors_per_node": "2", + "host_processor_model_name": "AMD EPYC 9354", + "host_processor_core_count": "32", + "host_processor_vcpu_count": "64", + "host_processor_frequency": "", + "host_processor_caches": "", + "host_processor_interconnect": "", + "host_memory_capacity": "2304GB", + "host_storage_type": "NVMe SSD", + "host_storage_capacity": "3x 4TB raid array", + "host_networking": "", + "host_networking_topology": "", + "host_memory_configuration": "24x 96GB DDR5", + "accelerators_per_node": "8", + "accelerator_model_name": "AMD Instinct MI300X 192GB HBM3", + "accelerator_host_interconnect": "PCIe 5.0 x16", + "accelerator_frequency": "", + "accelerator_on-chip_memories": "", + "accelerator_memory_configuration": "HBM3", + "accelerator_memory_capacity": "192GB", + "accelerator_interconnect": "", + "accelerator_interconnect_topology": "", + "cooling": "air", + "hw_notes": "", + "framework": "tinygrad, branch mlperf_training_v5.0", + "other_software_stack": { + "python": "3.10.16", + "ROCm": "3.0.0+94441cb" + }, + "operating_system": "Ubuntu 24.04.1 LTS", + "sw_notes": "" + } \ No newline at end of file diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/systems/tinybox_green.json b/examples/mlperf/training_submission_v6.0/tinycorp/systems/tinybox_green.json new file mode 100644 index 0000000000..24cbce1f1c --- /dev/null +++ b/examples/mlperf/training_submission_v6.0/tinycorp/systems/tinybox_green.json @@ -0,0 +1,38 @@ +{ + "submitter": "tinycorp", + "division": "closed", + "status": "Available on-premise", + "system_name": "tinybox green", + "number_of_nodes": "1", + "host_processors_per_node": "1", + "host_processor_model_name": "AMD EPYC 7532", + "host_processor_core_count": "32", + "host_processor_vcpu_count": "64", + "host_processor_frequency": "", + "host_processor_caches": "", + "host_processor_interconnect": "", + "host_memory_capacity": "128GB", + "host_storage_type": "NVMe SSD", + "host_storage_capacity": "4 TB raid array + 1 TB boot", + "host_networking": "", + "host_networking_topology": "", + "host_memory_configuration": "8x 16GB DDR4", + "accelerators_per_node": "6", + "accelerator_model_name": "NVIDIA GeForce RTX 4090", + "accelerator_host_interconnect": "PCIe 4.0 x16", + "accelerator_frequency": "", + "accelerator_on-chip_memories": "", + "accelerator_memory_configuration": "GDDR6X", + "accelerator_memory_capacity": "24GB", + "accelerator_interconnect": "", + "accelerator_interconnect_topology": "", + "cooling": "air", + "hw_notes": "", + "framework": "tinygrad, branch mlperf_training_v5.0", + "other_software_stack": { + "python": "3.10.12", + "CUDA": "12.4" + }, + "operating_system": 
"Ubuntu 22.04.4", + "sw_notes": "" +} \ No newline at end of file diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/systems/tinybox_red.json b/examples/mlperf/training_submission_v6.0/tinycorp/systems/tinybox_red.json new file mode 100644 index 0000000000..58b6efe77c --- /dev/null +++ b/examples/mlperf/training_submission_v6.0/tinycorp/systems/tinybox_red.json @@ -0,0 +1,37 @@ +{ + "submitter": "tinycorp", + "division": "closed", + "status": "Available on-premise", + "system_name": "tinybox red", + "number_of_nodes": "1", + "host_processors_per_node": "1", + "host_processor_model_name": "AMD EPYC 7532", + "host_processor_core_count": "32", + "host_processor_vcpu_count": "64", + "host_processor_frequency": "", + "host_processor_caches": "", + "host_processor_interconnect": "", + "host_memory_capacity": "128GB", + "host_storage_type": "NVMe SSD", + "host_storage_capacity": "4 TB raid array + 1 TB boot", + "host_networking": "", + "host_networking_topology": "", + "host_memory_configuration": "8x 16GB DDR4", + "accelerators_per_node": "6", + "accelerator_model_name": "AMD Radeon RX 7900 XTX", + "accelerator_host_interconnect": "PCIe 4.0 x16", + "accelerator_frequency": "", + "accelerator_on-chip_memories": "", + "accelerator_memory_configuration": "GDDR6", + "accelerator_memory_capacity": "24GB", + "accelerator_interconnect": "", + "accelerator_interconnect_topology": "", + "cooling": "air", + "hw_notes": "", + "framework": "tinygrad, branch mlperf_training_v5.0", + "other_software_stack": { + "python": "3.10.12" + }, + "operating_system": "Ubuntu 22.04.4", + "sw_notes": "" +} \ No newline at end of file