copy mlperf stuff to 5.1 (#10576)

5.0 is finalized, new changes go to 5.1
This commit is contained in:
chenyu
2025-05-30 16:12:39 -04:00
committed by GitHub
parent 883bb4541c
commit baf482d314
31 changed files with 847 additions and 0 deletions

View File

@@ -0,0 +1,15 @@
#!/bin/bash
# BERT dev BEAM-tuning benchmark (AMD backend, single GPU): 10 steps on a 2-layer model.
export PYTHONPATH="."
export AMD=1
export MODEL="bert"
# half-precision defaults and batch sizes
export DEFAULT_FLOAT="HALF"
export GPUS=1
export BS=128
export EVAL_BS=128
# kernel BEAM-search parameters
export BEAM=3
export BEAM_UOPS_MAX=4000
export BEAM_UPCAST_MAX=256
export BEAM_LOCAL_MAX=1024
export BEAM_MIN_PROGRESS=5
export IGNORE_JIT_FIRST_BEAM=1
# export BEAM_LOG_SURPASS_MAX=1
# export BASEDIR="/raid/datasets/wiki"
# reset step counter; short benchmark with BERT_LAYERS reduced to 2
export RESET_STEP=1
export BENCHMARK=10
export BERT_LAYERS=2
export DEBUG=2
python3 examples/mlperf/model_train.py

View File

@@ -0,0 +1,69 @@
# 1. Problem
This problem uses BERT for NLP.
## Requirements
Install tinygrad and mlperf-logging (uncomment mlperf from setup.py) from branch mlperf_training_v5.1.
```
git clone https://github.com/tinygrad/tinygrad.git
python3 -m pip install -e ".[mlperf]"
```
Also install gdown (for dataset), numpy, tqdm and tensorflow.
```
pip install gdown numpy tqdm tensorflow
```
### tinybox_green
Install the p2p driver per [README](https://github.com/tinygrad/open-gpu-kernel-modules/blob/550.54.15-p2p/README.md)
This is the default on production tinybox green.
# 2. Directions
## Steps to download and verify data
### 1. Download raw data
```
BASEDIR="/raid/datasets/wiki" WIKI_TRAIN=1 VERIFY_CHECKSUM=1 python3 extra/datasets/wikipedia_download.py
```
### 2. Preprocess train and validation data
Note: The number of threads used for preprocessing is limited by available memory. With 128GB of RAM, a maximum of 16 threads is recommended.
#### Training:
```
BASEDIR="/raid/datasets/wiki" NUM_WORKERS=16 python3 extra/datasets/wikipedia.py pre-train all
```
Generating a specific topic (between 0 and 499):
```
BASEDIR="/raid/datasets/wiki" python3 extra/datasets/wikipedia.py pre-train 42
```
#### Validation:
```
BASEDIR="/raid/datasets/wiki" python3 extra/datasets/wikipedia.py pre-eval
```
## Running
### tinybox_green
#### Steps to run benchmark
```
examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh
```
### tinybox_red
#### Steps to run benchmark
```
examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh
```
### tinybox_8xMI300X
#### Steps to run benchmark
```
examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/run_and_time.sh
```

View File

@@ -0,0 +1,14 @@
#!/bin/bash
# BERT BEAM-tuning benchmark on 8 GPUs (AMD backend), 2-layer model, 10 steps.
export PYTHONPATH="."
export AMD=1
export MODEL="bert"
export DEFAULT_FLOAT="HALF"
export GPUS=8
export BS=1024
export EVAL_BS=1024
# LAMB optimizer hyperparameters
export OPT_BASE_LEARNING_RATE=0.0011
export OPT_LAMB_BETA_1=0.60466
export OPT_LAMB_BETA_2=0.85437
export DECAY=0.1
# kernel BEAM-search parameters
export BEAM=3
export BEAM_UOPS_MAX=6000
export BEAM_UPCAST_MAX=256
export BEAM_LOCAL_MAX=1024
export BEAM_MIN_PROGRESS=5
export IGNORE_JIT_FIRST_BEAM=1
export FREE_INTERMEDIATE=0
export BASEDIR="/raid/datasets/wiki"
export BENCHMARK=10
export BERT_LAYERS=2
export DEBUG=2
python3 examples/mlperf/model_train.py

View File

@@ -0,0 +1,17 @@
#!/bin/bash
# BERT dev training run on 8 GPUs (AMD backend) with W&B logging enabled.
export PYTHONPATH="."
export AMD=1
export MODEL="bert"
export DEFAULT_FLOAT="HALF"
export GPUS=8
export BS=1024
export EVAL_BS=1024
# similar to https://github.com/mlcommons/training_results_v3.1/blob/d06288b2bd675a9d88e0e6181f5bb5626b71ec19/Quanta_Cloud_Technology/results/D54U-3U/bert/result_1.txt#L54
export OPT_BASE_LEARNING_RATE=0.0011
export OPT_LAMB_BETA_1=0.60466
export OPT_LAMB_BETA_2=0.85437
export DECAY=0.1
export TRAIN_STEPS=3900
# kernel BEAM-search parameters
export BEAM=3
export BEAM_UOPS_MAX=6000
export BEAM_UPCAST_MAX=256
export BEAM_LOCAL_MAX=1024
export BEAM_MIN_PROGRESS=5
export IGNORE_JIT_FIRST_BEAM=1
export FREE_INTERMEDIATE=0
export BASEDIR="/raid/datasets/wiki"
export WANDB=1
export PARALLEL=0
RUNMLPERF=1 python3 examples/mlperf/model_train.py

View File

@@ -0,0 +1,28 @@
#!/bin/bash
# MLPerf BERT submission run for tinybox 8xMI300X: BEAM init pass, then the timed run.
set -e # Exit on any error
export PYTHONPATH="." AMD=1
export MODEL="bert"
export SUBMISSION_PLATFORM="tinybox_8xMI300X"
export DEFAULT_FLOAT="HALF" GPUS=8 BS=1024 EVAL_BS=1024
# similar to https://github.com/mlcommons/training_results_v3.1/blob/d06288b2bd675a9d88e0e6181f5bb5626b71ec19/Quanta_Cloud_Technology/results/D54U-3U/bert/result_1.txt#L54
export OPT_BASE_LEARNING_RATE=0.0011 OPT_LAMB_BETA_1=0.60466 OPT_LAMB_BETA_2=0.85437 DECAY=0.1
export TRAIN_STEPS=3900
export BEAM=3 BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
export IGNORE_JIT_FIRST_BEAM=1 FREE_INTERMEDIATE=0
export BASEDIR="/raid/datasets/wiki"
# pip install -e ".[mlperf]"
export LOGMLPERF=1
export SEED=$RANDOM
DATETIME=$(date "+%m%d%H%M")
# log file name carries timestamp and seed so repeated runs never collide
LOGFILE="bert_8xMI300x_${DATETIME}_${SEED}.log"
# init # TODO: without DEBUG=2 it hangs
BENCHMARK=10 INITMLPERF=1 BERT_LAYERS=2 DEBUG=2 python3 examples/mlperf/model_train.py | tee "$LOGFILE"
# run
PARALLEL=0 RUNMLPERF=1 python3 examples/mlperf/model_train.py | tee -a "$LOGFILE"

View File

@@ -0,0 +1,69 @@
# 1. Problem
This problem uses BERT for NLP.
## Requirements
Install tinygrad and mlperf-logging (uncomment mlperf from setup.py) from branch mlperf_training_v5.1.
```
git clone https://github.com/tinygrad/tinygrad.git
python3 -m pip install -e ".[mlperf]"
```
Also install gdown (for dataset), numpy, tqdm and tensorflow.
```
pip install gdown numpy tqdm tensorflow
```
### tinybox_green
Install the p2p driver per [README](https://github.com/tinygrad/open-gpu-kernel-modules/blob/550.54.15-p2p/README.md)
This is the default on production tinybox green.
# 2. Directions
## Steps to download and verify data
### 1. Download raw data
```
BASEDIR="/raid/datasets/wiki" WIKI_TRAIN=1 VERIFY_CHECKSUM=1 python3 extra/datasets/wikipedia_download.py
```
### 2. Preprocess train and validation data
Note: The number of threads used for preprocessing is limited by available memory. With 128GB of RAM, a maximum of 16 threads is recommended.
#### Training:
```
BASEDIR="/raid/datasets/wiki" NUM_WORKERS=16 python3 extra/datasets/wikipedia.py pre-train all
```
Generating a specific topic (between 0 and 499):
```
BASEDIR="/raid/datasets/wiki" python3 extra/datasets/wikipedia.py pre-train 42
```
#### Validation:
```
BASEDIR="/raid/datasets/wiki" python3 extra/datasets/wikipedia.py pre-eval
```
## Running
### tinybox_green
#### Steps to run benchmark
```
examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh
```
### tinybox_red
#### Steps to run benchmark
```
examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh
```
### tinybox_8xMI300X
#### Steps to run benchmark
```
examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/run_and_time.sh
```

View File

@@ -0,0 +1,16 @@
#!/bin/bash
# BERT BEAM-tuning benchmark for tinybox green (NV backend, 6 GPUs), 2-layer model.
export PYTHONPATH="."
export NV=1
export MODEL="bert"
export DEFAULT_FLOAT="HALF"
export SUM_DTYPE="HALF"
export GPUS=6
export BS=96
export EVAL_BS=96
export FUSE_ARANGE=1
export FUSE_ARANGE_UINT=0
# kernel BEAM-search parameters
export BEAM=8
export BEAM_UOPS_MAX=10000
export BEAM_UPCAST_MAX=256
export BEAM_LOCAL_MAX=1024
export BEAM_MIN_PROGRESS=5
export IGNORE_JIT_FIRST_BEAM=1
export BEAM_LOG_SURPASS_MAX=1
export BASEDIR="/raid/datasets/wiki"
export BENCHMARK=10
export BERT_LAYERS=2
export DEBUG=2
python3 examples/mlperf/model_train.py

View File

@@ -0,0 +1,15 @@
#!/bin/bash
# BERT dev training run for tinybox green (NV backend, 6 GPUs) with W&B logging.
export PYTHONPATH="."
export NV=1
export MODEL="bert"
export DEFAULT_FLOAT="HALF"
export SUM_DTYPE="HALF"
export GPUS=6
export BS=96
export EVAL_BS=96
export FUSE_ARANGE=1
export FUSE_ARANGE_UINT=0
# kernel BEAM-search parameters
export BEAM=8
export BEAM_UOPS_MAX=10000
export BEAM_UPCAST_MAX=256
export BEAM_LOCAL_MAX=1024
export BEAM_MIN_PROGRESS=5
export IGNORE_JIT_FIRST_BEAM=1
export BASEDIR="/raid/datasets/wiki"
export WANDB=1
export PARALLEL=0
RUNMLPERF=1 python3 examples/mlperf/model_train.py

View File

@@ -0,0 +1,26 @@
#!/bin/bash
# MLPerf BERT submission run for tinybox green: BEAM init pass, then the timed run.
set -e # Exit on any error
export PYTHONPATH="." NV=1
export MODEL="bert"
export SUBMISSION_PLATFORM="tinybox_green"
export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=96 EVAL_BS=96
export FUSE_ARANGE=1 FUSE_ARANGE_UINT=0
export BEAM=8 BEAM_UOPS_MAX=10000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
export IGNORE_JIT_FIRST_BEAM=1
export BASEDIR="/raid/datasets/wiki"
# pip install -e ".[mlperf]"
export LOGMLPERF=1
export SEED=$RANDOM
DATETIME=$(date "+%m%d%H%M")
# log file name carries timestamp and seed so repeated runs never collide
LOGFILE="bert_green_${DATETIME}_${SEED}.log"
# init
BENCHMARK=10 INITMLPERF=1 BERT_LAYERS=2 python3 examples/mlperf/model_train.py | tee "$LOGFILE"
# run
PARALLEL=0 RUNMLPERF=1 python3 examples/mlperf/model_train.py | tee -a "$LOGFILE"

View File

@@ -0,0 +1,69 @@
# 1. Problem
This problem uses BERT for NLP.
## Requirements
Install tinygrad and mlperf-logging (uncomment mlperf from setup.py) from branch mlperf_training_v5.1.
```
git clone https://github.com/tinygrad/tinygrad.git
python3 -m pip install -e ".[mlperf]"
```
Also install gdown (for dataset), numpy, tqdm and tensorflow.
```
pip install gdown numpy tqdm tensorflow
```
### tinybox_green
Install the p2p driver per [README](https://github.com/tinygrad/open-gpu-kernel-modules/blob/550.54.15-p2p/README.md)
This is the default on production tinybox green.
# 2. Directions
## Steps to download and verify data
### 1. Download raw data
```
BASEDIR="/raid/datasets/wiki" WIKI_TRAIN=1 VERIFY_CHECKSUM=1 python3 extra/datasets/wikipedia_download.py
```
### 2. Preprocess train and validation data
Note: The number of threads used for preprocessing is limited by available memory. With 128GB of RAM, a maximum of 16 threads is recommended.
#### Training:
```
BASEDIR="/raid/datasets/wiki" NUM_WORKERS=16 python3 extra/datasets/wikipedia.py pre-train all
```
Generating a specific topic (between 0 and 499):
```
BASEDIR="/raid/datasets/wiki" python3 extra/datasets/wikipedia.py pre-train 42
```
#### Validation:
```
BASEDIR="/raid/datasets/wiki" python3 extra/datasets/wikipedia.py pre-eval
```
## Running
### tinybox_green
#### Steps to run benchmark
```
examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh
```
### tinybox_red
#### Steps to run benchmark
```
examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh
```
### tinybox_8xMI300X
#### Steps to run benchmark
```
examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/run_and_time.sh
```

View File

@@ -0,0 +1,17 @@
#!/bin/bash
# BERT BEAM-tuning benchmark for tinybox red (AMD backend, 6 GPUs), 2-layer model.
export PYTHONPATH="."
export AMD=1
export MODEL="bert"
export DEFAULT_FLOAT="HALF"
export SUM_DTYPE="HALF"
export GPUS=6
export BS=96
export EVAL_BS=96
export FUSE_ARANGE=1
export FUSE_ARANGE_UINT=0
# kernel BEAM-search parameters
export BEAM=5
export BEAM_UOPS_MAX=8000
export BEAM_UPCAST_MAX=256
export BEAM_LOCAL_MAX=1024
export BEAM_MIN_PROGRESS=5
export IGNORE_JIT_FIRST_BEAM=1
export BEAM_LOG_SURPASS_MAX=1
export BASEDIR="/raid/datasets/wiki"
export RESET_STEP=1
export BENCHMARK=10
export BERT_LAYERS=2
export DEBUG=2
python3 examples/mlperf/model_train.py

View File

@@ -0,0 +1,15 @@
#!/bin/bash
# BERT dev training run for tinybox red (AMD backend, 6 GPUs) with W&B logging.
export PYTHONPATH="."
export AMD=1
export MODEL="bert"
export DEFAULT_FLOAT="HALF"
export SUM_DTYPE="HALF"
export GPUS=6
export BS=96
export EVAL_BS=96
export FUSE_ARANGE=1
export FUSE_ARANGE_UINT=0
# kernel BEAM-search parameters
export BEAM=5
export BEAM_UOPS_MAX=8000
export BEAM_UPCAST_MAX=256
export BEAM_LOCAL_MAX=1024
export BEAM_MIN_PROGRESS=5
export IGNORE_JIT_FIRST_BEAM=1
export BASEDIR="/raid/datasets/wiki"
export WANDB=1
export PARALLEL=0
RUNMLPERF=1 python3 examples/mlperf/model_train.py

View File

@@ -0,0 +1,31 @@
#!/bin/bash
# MLPerf BERT submission run for tinybox red: BEAM init pass, then the timed run.
set -e # Exit on any error
export PYTHONPATH="." AMD=1
export MODEL="bert"
export SUBMISSION_PLATFORM="tinybox_red"
export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=96 EVAL_BS=96
export FUSE_ARANGE=1 FUSE_ARANGE_UINT=0
export BEAM=5 BEAM_UOPS_MAX=8000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
export IGNORE_JIT_FIRST_BEAM=1
export BASEDIR="/raid/datasets/wiki"
# pip install -e ".[mlperf]"
export LOGMLPERF=1
export SEED=$RANDOM
DATETIME=$(date "+%m%d%H%M")
# log file name carries timestamp and seed so repeated runs never collide
LOGFILE="bert_red_${DATETIME}_${SEED}.log"
export HCQDEV_WAIT_TIMEOUT_MS=100000 # prevents hang?
# init — unload the amdgpu kernel module first (best-effort; || true tolerates it being absent)
sleep 5 && sudo rmmod amdgpu || true
BENCHMARK=10 INITMLPERF=1 BERT_LAYERS=2 python3 examples/mlperf/model_train.py | tee "$LOGFILE"
# run
# TODO: AM driver resulted in nan
sudo modprobe amdgpu
PARALLEL=0 RUNMLPERF=1 python3 examples/mlperf/model_train.py | tee -a "$LOGFILE"

View File

@@ -0,0 +1,50 @@
# 1. Problem
This problem uses the ResNet-50 CNN to do image classification.
## Requirements
Install tinygrad and mlperf-logging from master.
```
git clone https://github.com/tinygrad/tinygrad.git
python3 -m pip install -e ".[mlperf]"
```
### tinybox_green
Install the p2p driver per [README](https://github.com/tinygrad/open-gpu-kernel-modules/blob/550.54.15-p2p/README.md)
This is the default on production tinybox green.
### tinybox_red
Disable cwsr
This is the default on production tinybox red.
```
sudo vi /etc/modprobe.d/amdgpu.conf
cat <<EOF > /etc/modprobe.d/amdgpu.conf
options amdgpu cwsr_enable=0
EOF
sudo update-initramfs -u
sudo reboot
# validate
sudo cat /sys/module/amdgpu/parameters/cwsr_enable #= 0
```
# 2. Directions
## Steps to download and verify data
```
IMGNET_TRAIN=1 python3 extra/datasets/imagenet_download.py
```
## Steps for one time setup
### tinybox_red
```
examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/resnet/implementations/tinybox_red/setup.sh
```
## Steps to run benchmark
```
examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/resnet/implementations/tinybox_red/run_and_time.sh
```

View File

@@ -0,0 +1,13 @@
#!/bin/bash
# ResNet BEAM-tuning benchmark for tinybox green (NV backend, 6 GPUs), 10 steps.
export PYTHONPATH="."
export NV=1
export MODEL="resnet"
export DEFAULT_FLOAT="HALF"
export GPUS=6
export BS=1536
export EVAL_BS=192
export RESET_STEP=0
# kernel BEAM-search parameters (training only)
export TRAIN_BEAM=4
export IGNORE_JIT_FIRST_BEAM=1
export BEAM_UOPS_MAX=1500
export BEAM_UPCAST_MAX=64
export BEAM_LOCAL_MAX=1024
export BEAM_MIN_PROGRESS=10
export BEAM_PADTO=0
export BENCHMARK=10
export DEBUG=2
python3 examples/mlperf/model_train.py

View File

@@ -0,0 +1,15 @@
#!/bin/bash
# ResNet dev training run for tinybox green (NV backend, 6 GPUs) with W&B logging.
export PYTHONPATH="."
export NV=1
export MODEL="resnet"
export DEFAULT_FLOAT="HALF"
export GPUS=6
export BS=1536
export EVAL_BS=192
export RESET_STEP=0
# kernel BEAM-search parameters (training only)
export TRAIN_BEAM=4
export IGNORE_JIT_FIRST_BEAM=1
export BEAM_UOPS_MAX=1500
export BEAM_UPCAST_MAX=64
export BEAM_LOCAL_MAX=1024
export BEAM_MIN_PROGRESS=10
export BEAM_PADTO=0
# evaluate starting at epoch 3, then every 4 epochs
export EVAL_START_EPOCH=3
export EVAL_FREQ=4
export WANDB=1
export PARALLEL=0
python3 examples/mlperf/model_train.py

View File

@@ -0,0 +1,24 @@
#!/bin/bash
# MLPerf ResNet submission run for tinybox green: BEAM init pass, then the timed run.
set -e # Exit on any error
export PYTHONPATH="." NV=1
export MODEL="resnet"
export SUBMISSION_PLATFORM="tinybox_green"
export DEFAULT_FLOAT="HALF" GPUS=6 BS=1536 EVAL_BS=192
export RESET_STEP=0
export TRAIN_BEAM=4 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=1500 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=10 BEAM_PADTO=0
# pip install -e ".[mlperf]"
export LOGMLPERF=${LOGMLPERF:-1}
export SEED=$RANDOM
DATETIME=$(date "+%m%d%H%M")
# log file name carries timestamp and seed so repeated runs never collide
LOGFILE="resnet_green_${DATETIME}_${SEED}.log"
# init
BENCHMARK=10 INITMLPERF=1 python3 examples/mlperf/model_train.py | tee "$LOGFILE"
# run
PARALLEL=0 RUNMLPERF=1 EVAL_START_EPOCH=3 EVAL_FREQ=4 python3 examples/mlperf/model_train.py | tee -a "$LOGFILE"

View File

@@ -0,0 +1,50 @@
# 1. Problem
This problem uses the ResNet-50 CNN to do image classification.
## Requirements
Install tinygrad and mlperf-logging from master.
```
git clone https://github.com/tinygrad/tinygrad.git
python3 -m pip install -e ".[mlperf]"
```
### tinybox_green
Install the p2p driver per [README](https://github.com/tinygrad/open-gpu-kernel-modules/blob/550.54.15-p2p/README.md)
This is the default on production tinybox green.
### tinybox_red
Disable cwsr
This is the default on production tinybox red.
```
sudo vi /etc/modprobe.d/amdgpu.conf
cat <<EOF > /etc/modprobe.d/amdgpu.conf
options amdgpu cwsr_enable=0
EOF
sudo update-initramfs -u
sudo reboot
# validate
sudo cat /sys/module/amdgpu/parameters/cwsr_enable #= 0
```
# 2. Directions
## Steps to download and verify data
```
IMGNET_TRAIN=1 python3 extra/datasets/imagenet_download.py
```
## Steps for one time setup
### tinybox_red
```
examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/resnet/implementations/tinybox_red/setup.sh
```
## Steps to run benchmark
```
examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/resnet/implementations/tinybox_red/run_and_time.sh
```

View File

@@ -0,0 +1,13 @@
#!/bin/bash
# ResNet BEAM-tuning benchmark for tinybox red (AMD backend, 6 GPUs), 10 steps.
export PYTHONPATH="."
export AMD=1
export MODEL="resnet"
export DEFAULT_FLOAT="HALF"
export GPUS=6
export BS=1536
export EVAL_BS=192
export RESET_STEP=0
# kernel BEAM-search parameters (training only)
export TRAIN_BEAM=4
export IGNORE_JIT_FIRST_BEAM=1
export BEAM_UOPS_MAX=2000
export BEAM_UPCAST_MAX=96
export BEAM_LOCAL_MAX=1024
export BEAM_MIN_PROGRESS=5
export BEAM_PADTO=0
export BENCHMARK=10
# keep caller-provided DEBUG, defaulting to 2
export DEBUG=${DEBUG:-2}
python3 examples/mlperf/model_train.py

View File

@@ -0,0 +1,15 @@
#!/bin/bash
# ResNet dev training run for tinybox red (AMD backend, 6 GPUs) with W&B logging.
export PYTHONPATH="."
export AMD=1
export MODEL="resnet"
export DEFAULT_FLOAT="HALF"
export GPUS=6
export BS=1536
export EVAL_BS=192
export RESET_STEP=0
# kernel BEAM-search parameters (training only)
export TRAIN_BEAM=4
export IGNORE_JIT_FIRST_BEAM=1
export BEAM_UOPS_MAX=2000
export BEAM_UPCAST_MAX=96
export BEAM_LOCAL_MAX=1024
export BEAM_MIN_PROGRESS=5
export BEAM_PADTO=0
# evaluate starting at epoch 3, then every 4 epochs
export EVAL_START_EPOCH=3
export EVAL_FREQ=4
export WANDB=1
export PARALLEL=0
python3 examples/mlperf/model_train.py

View File

@@ -0,0 +1,25 @@
#!/bin/bash
# MLPerf ResNet submission run for tinybox red: BEAM init pass, then the timed run.
set -e # Exit on any error
export PYTHONPATH="." AMD=1
export MODEL="resnet"
export SUBMISSION_PLATFORM="tinybox_red"
export DEFAULT_FLOAT="HALF" GPUS=6 BS=1536 EVAL_BS=192
export RESET_STEP=0
export TRAIN_BEAM=4 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=2000 BEAM_UPCAST_MAX=96 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 BEAM_PADTO=0
# pip install -e ".[mlperf]"
export LOGMLPERF=${LOGMLPERF:-1}
export SEED=$RANDOM
DATETIME=$(date "+%m%d%H%M")
# log file name carries timestamp and seed so repeated runs never collide
LOGFILE="resnet_red_${DATETIME}_${SEED}.log"
# init — unload the amdgpu kernel module first (best-effort; || true tolerates it being absent)
sleep 5 && sudo rmmod amdgpu || true
BENCHMARK=10 INITMLPERF=1 python3 examples/mlperf/model_train.py | tee "$LOGFILE"
# run
PARALLEL=0 RUNMLPERF=1 EVAL_START_EPOCH=3 EVAL_FREQ=4 python3 examples/mlperf/model_train.py | tee -a "$LOGFILE"

View File

@@ -0,0 +1,8 @@
#!/bin/bash
# One-time GPU setup for tinybox red: pin compute profile, memory clock and perf level.
# NOTE(review): assumes the 6 GPUs appear as /sys/class/drm/card1..card6 — verify on host.
rocm-smi --setprofile compute
rocm-smi --setmclk 3
rocm-smi --setperflevel high
# power cap to 350W
echo "350000000" | sudo tee /sys/class/drm/card{1..6}/device/hwmon/hwmon*/power1_cap

View File

@@ -0,0 +1,38 @@
# 1. Problem
This problem uses RetinaNet for SSD.
## Requirements
Install tinygrad and mlperf-logging (uncomment mlperf from setup.py) from branch mlperf_training_v5.1.
```
git clone https://github.com/tinygrad/tinygrad.git
python3 -m pip install -e ".[mlperf]"
```
Also install the following dependencies:
```
pip install tqdm numpy pycocotools boto3 pandas torch torchvision
```
### tinybox_green
Install the p2p driver per [README](https://github.com/tinygrad/open-gpu-kernel-modules/blob/550.54.15-p2p/README.md)
This is the default on production tinybox green.
# 2. Directions
## Steps to download data
Run the following:
```
BASEDIR=/raid/datasets/openimages python3 extra/datasets/openimages.py
```
## Running
### tinybox_green
#### Steps to run benchmark
```
examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/retinanet/implementations/tinybox_green/run_and_time.sh
```

View File

@@ -0,0 +1,14 @@
#!/bin/bash
# RetinaNet BEAM-tuning benchmark for tinybox green (NV backend, 6 GPUs), 5 steps.
export PYTHONPATH="." NV=1
export MODEL="retinanet"
export DEFAULT_FLOAT="HALF" GPUS=6 BS=96 EVAL_BS=96
export BASEDIR="/raid/datasets/openimages"
# export RESET_STEP=0
export TRAIN_BEAM=2 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=1500 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 BEAM_PADTO=0
export BENCHMARK=5 DEBUG=2
# python3 explicitly, for consistency with every other benchmark script
python3 examples/mlperf/model_train.py

View File

@@ -0,0 +1,15 @@
#!/bin/bash
# RetinaNet dev training run for tinybox green (NV backend, 6 GPUs) with W&B logging.
export PYTHONPATH="." NV=1
export MODEL="retinanet"
export DEFAULT_FLOAT="HALF" GPUS=6 BS=96 EVAL_BS=96
export BASEDIR="/raid/datasets/openimages"
# export RESET_STEP=0
export TRAIN_BEAM=2 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=1500 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 BEAM_PADTO=0
export WANDB=1 PARALLEL=0
export RUNMLPERF=1
# python3 explicitly, for consistency with every other benchmark script
python3 examples/mlperf/model_train.py

View File

@@ -0,0 +1,24 @@
#!/bin/bash
# MLPerf RetinaNet submission run for tinybox green: BEAM init pass, then the timed run.
set -e # Exit on any error
export PYTHONPATH="." NV=1
export MODEL="retinanet"
export SUBMISSION_PLATFORM="tinybox_green"
export DEFAULT_FLOAT="HALF" GPUS=6 BS=96 EVAL_BS=96
export TRAIN_BEAM=2 BEAM_UOPS_MAX=1500 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 BEAM_PADTO=0
export IGNORE_JIT_FIRST_BEAM=1
export BASEDIR="/raid/datasets/openimages"
# pip install -e ".[mlperf]"
export LOGMLPERF=1
export SEED=$RANDOM
DATETIME=$(date "+%m%d%H%M")
# log file name carries timestamp and seed so repeated runs never collide
LOGFILE="retinanet_green_${DATETIME}_${SEED}.log"
# init
BENCHMARK=10 INITMLPERF=1 python3 examples/mlperf/model_train.py | tee "$LOGFILE"
# run
PARALLEL=0 RUNMLPERF=1 python3 examples/mlperf/model_train.py | tee -a "$LOGFILE"

View File

@@ -0,0 +1,14 @@
#!/bin/bash
# RetinaNet BEAM-tuning benchmark for tinybox red (AMD backend, 6 GPUs), 5 steps.
export PYTHONPATH="." AMD=1
export MODEL="retinanet"
export DEFAULT_FLOAT="HALF" GPUS=6 BS=96 EVAL_BS=96
export BASEDIR="/raid/datasets/openimages"
# export RESET_STEP=0
export TRAIN_BEAM=2 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=1500 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 BEAM_PADTO=0
export BENCHMARK=5 DEBUG=2
# python3 explicitly, for consistency with every other benchmark script
python3 examples/mlperf/model_train.py

View File

@@ -0,0 +1,15 @@
#!/bin/bash
# RetinaNet dev training run for tinybox red (AMD backend, 6 GPUs) with W&B logging.
export PYTHONPATH="." AMD=1
export MODEL="retinanet"
export DEFAULT_FLOAT="HALF" GPUS=6 BS=96 EVAL_BS=96
export BASEDIR="/raid/datasets/openimages"
# export RESET_STEP=0
export TRAIN_BEAM=2 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=1500 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 BEAM_PADTO=0
export WANDB=1 PARALLEL=0
export RUNMLPERF=1
# python3 explicitly, for consistency with every other benchmark script
python3 examples/mlperf/model_train.py

View File

@@ -0,0 +1,38 @@
{
"submitter": "tinycorp",
"division": "closed",
"status": "Available on-premise",
"system_name": "tinybox 8xMI300X",
"number_of_nodes": "1",
"host_processors_per_node": "2",
"host_processor_model_name": "AMD EPYC 9354",
"host_processor_core_count": "32",
"host_processor_vcpu_count": "64",
"host_processor_frequency": "",
"host_processor_caches": "",
"host_processor_interconnect": "",
"host_memory_capacity": "2304GB",
"host_storage_type": "NVMe SSD",
"host_storage_capacity": "3x 4TB raid array",
"host_networking": "",
"host_networking_topology": "",
"host_memory_configuration": "24x 96GB DDR5",
"accelerators_per_node": "8",
"accelerator_model_name": "AMD Instinct MI300X 192GB HBM3",
"accelerator_host_interconnect": "PCIe 5.0 x16",
"accelerator_frequency": "",
"accelerator_on-chip_memories": "",
"accelerator_memory_configuration": "HBM3",
"accelerator_memory_capacity": "192GB",
"accelerator_interconnect": "",
"accelerator_interconnect_topology": "",
"cooling": "air",
"hw_notes": "",
"framework": "tinygrad, branch mlperf_training_v5.1",
"other_software_stack": {
"python": "3.10.16",
"ROCm": "3.0.0+94441cb"
},
"operating_system": "Ubuntu 24.04.1 LTS",
"sw_notes": ""
}

View File

@@ -0,0 +1,38 @@
{
"submitter": "tinycorp",
"division": "closed",
"status": "Available on-premise",
"system_name": "tinybox green",
"number_of_nodes": "1",
"host_processors_per_node": "1",
"host_processor_model_name": "AMD EPYC 7532",
"host_processor_core_count": "32",
"host_processor_vcpu_count": "64",
"host_processor_frequency": "",
"host_processor_caches": "",
"host_processor_interconnect": "",
"host_memory_capacity": "128GB",
"host_storage_type": "NVMe SSD",
"host_storage_capacity": "4 TB raid array + 1 TB boot",
"host_networking": "",
"host_networking_topology": "",
"host_memory_configuration": "8x 16GB DDR4",
"accelerators_per_node": "6",
"accelerator_model_name": "NVIDIA GeForce RTX 4090",
"accelerator_host_interconnect": "PCIe 4.0 x16",
"accelerator_frequency": "",
"accelerator_on-chip_memories": "",
"accelerator_memory_configuration": "GDDR6X",
"accelerator_memory_capacity": "24GB",
"accelerator_interconnect": "",
"accelerator_interconnect_topology": "",
"cooling": "air",
"hw_notes": "",
"framework": "tinygrad, branch mlperf_training_v5.1",
"other_software_stack": {
"python": "3.10.12",
"CUDA": "12.4"
},
"operating_system": "Ubuntu 22.04.4",
"sw_notes": ""
}

View File

@@ -0,0 +1,37 @@
{
"submitter": "tinycorp",
"division": "closed",
"status": "Available on-premise",
"system_name": "tinybox red",
"number_of_nodes": "1",
"host_processors_per_node": "1",
"host_processor_model_name": "AMD EPYC 7532",
"host_processor_core_count": "32",
"host_processor_vcpu_count": "64",
"host_processor_frequency": "",
"host_processor_caches": "",
"host_processor_interconnect": "",
"host_memory_capacity": "128GB",
"host_storage_type": "NVMe SSD",
"host_storage_capacity": "4 TB raid array + 1 TB boot",
"host_networking": "",
"host_networking_topology": "",
"host_memory_configuration": "8x 16GB DDR4",
"accelerators_per_node": "6",
"accelerator_model_name": "AMD Radeon RX 7900 XTX",
"accelerator_host_interconnect": "PCIe 4.0 x16",
"accelerator_frequency": "",
"accelerator_on-chip_memories": "",
"accelerator_memory_configuration": "GDDR6",
"accelerator_memory_capacity": "24GB",
"accelerator_interconnect": "",
"accelerator_interconnect_topology": "",
"cooling": "air",
"hw_notes": "",
"framework": "tinygrad, branch mlperf_training_v5.1",
"other_software_stack": {
"python": "3.10.12"
},
"operating_system": "Ubuntu 22.04.4",
"sw_notes": ""
}