mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-01-09 23:18:04 -05:00
copy mlperf 4.0 to mlperf 4.1 (#5614)
This commit is contained in:
@@ -1,7 +1,6 @@
|
|||||||
{
|
{
|
||||||
"submitter": "tinycorp",
|
"submitter": "tinycorp",
|
||||||
"division": "closed",
|
"division": "closed",
|
||||||
"system_type": "datacenter",
|
|
||||||
"status": "available",
|
"status": "available",
|
||||||
"system_name": "tinybox green",
|
"system_name": "tinybox green",
|
||||||
"number_of_nodes": "1",
|
"number_of_nodes": "1",
|
||||||
|
|||||||
@@ -1,7 +1,6 @@
|
|||||||
{
|
{
|
||||||
"submitter": "tinycorp",
|
"submitter": "tinycorp",
|
||||||
"division": "closed",
|
"division": "closed",
|
||||||
"system_type": "datacenter",
|
|
||||||
"status": "available",
|
"status": "available",
|
||||||
"system_name": "tinybox red",
|
"system_name": "tinybox red",
|
||||||
"number_of_nodes": "1",
|
"number_of_nodes": "1",
|
||||||
|
|||||||
@@ -0,0 +1,50 @@
|
|||||||
|
# 1. Problem
|
||||||
|
|
||||||
|
This problem uses the ResNet-50 CNN to do image classification.
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
Install tinygrad and mlperf-logging from master.
|
||||||
|
```
|
||||||
|
git clone https://github.com/tinygrad/tinygrad.git
cd tinygrad
|
||||||
|
python3 -m pip install -e ".[mlperf]"
|
||||||
|
```
|
||||||
|
|
||||||
|
### tinybox_green
|
||||||
|
Install the p2p driver per [README](https://github.com/tinygrad/open-gpu-kernel-modules/blob/550.54.15-p2p/README.md)
|
||||||
|
This is the default on production tinybox green.
|
||||||
|
|
||||||
|
### tinybox_red
|
||||||
|
Disable CWSR (compute wave save/restore) in the amdgpu driver.
|
||||||
|
This is the default on production tinybox red.
|
||||||
|
```
|
||||||
|
sudo vi /etc/modprobe.d/amdgpu.conf
|
||||||
|
sudo tee /etc/modprobe.d/amdgpu.conf <<EOF
|
||||||
|
options amdgpu cwsr_enable=0
|
||||||
|
EOF
|
||||||
|
sudo update-initramfs -u
|
||||||
|
sudo reboot
|
||||||
|
|
||||||
|
# validate
|
||||||
|
sudo cat /sys/module/amdgpu/parameters/cwsr_enable #= 0
|
||||||
|
```
|
||||||
|
|
||||||
|
# 2. Directions
|
||||||
|
|
||||||
|
## Steps to download and verify data
|
||||||
|
|
||||||
|
```
|
||||||
|
IMGNET_TRAIN=1 python3 extra/datasets/imagenet_download.py
|
||||||
|
```
|
||||||
|
|
||||||
|
## Steps for one time setup
|
||||||
|
|
||||||
|
### tinybox_red
|
||||||
|
```
|
||||||
|
examples/mlperf/training_submission_v4.1/tinycorp/benchmarks/resnet/implementations/tinybox_red/setup.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
## Steps to run benchmark
|
||||||
|
```
|
||||||
|
examples/mlperf/training_submission_v4.1/tinycorp/benchmarks/resnet/implementations/tinybox_green/run_and_time.sh
|
||||||
|
```
|
||||||
@@ -0,0 +1,13 @@
|
|||||||
|
#!/bin/bash
# Launches examples/mlperf/model_train.py for MODEL=resnet with BENCHMARK=10
# and DEBUG=2 exported. All configuration is handed over through environment
# variables; how each one is interpreted is up to model_train.py / tinygrad.
#
# PYTHONPATH="." and the relative script path mean this has to be started
# from the directory that contains examples/.

# Model, default float type, device count and batch sizes.
export \
  PYTHONPATH="." \
  MODEL="resnet" \
  DEFAULT_FLOAT="HALF" \
  GPUS=6 \
  BS=1536 \
  EVAL_BS=192

export \
  LAZYCACHE=0 \
  RESET_STEP=0

# BEAM-related settings for this machine.
export \
  TRAIN_BEAM=4 \
  IGNORE_JIT_FIRST_BEAM=1 \
  BEAM_UOPS_MAX=1500 \
  BEAM_UPCAST_MAX=64 \
  BEAM_LOCAL_MAX=1024 \
  BEAM_MIN_PROGRESS=10 \
  BEAM_PADTO=0

export \
  BENCHMARK=10 \
  DEBUG=2

python3 examples/mlperf/model_train.py
|
||||||
@@ -0,0 +1,15 @@
|
|||||||
|
#!/bin/bash
# Launches examples/mlperf/model_train.py for MODEL=resnet with WANDB=1 and
# PARALLEL=0 exported. All configuration is handed over through environment
# variables; how each one is interpreted is up to model_train.py / tinygrad.
#
# PYTHONPATH="." and the relative script path mean this has to be started
# from the directory that contains examples/.

# Model, default float type, device count and batch sizes.
export \
  PYTHONPATH="." \
  MODEL="resnet" \
  DEFAULT_FLOAT="HALF" \
  GPUS=6 \
  BS=1536 \
  EVAL_BS=192

export \
  LAZYCACHE=0 \
  RESET_STEP=0

# BEAM-related settings for this machine.
export \
  TRAIN_BEAM=4 \
  IGNORE_JIT_FIRST_BEAM=1 \
  BEAM_UOPS_MAX=1500 \
  BEAM_UPCAST_MAX=64 \
  BEAM_LOCAL_MAX=1024 \
  BEAM_MIN_PROGRESS=10 \
  BEAM_PADTO=0

# Evaluation schedule.
export \
  EVAL_START_EPOCH=3 \
  EVAL_FREQ=4

# NOTE(review): WANDB=1 presumably turns on Weights & Biases logging --
# confirm in model_train.py.
export \
  WANDB=1 \
  PARALLEL=0

python3 examples/mlperf/model_train.py
|
||||||
@@ -0,0 +1,23 @@
|
|||||||
|
#!/bin/bash
# Timed ResNet run for SUBMISSION_PLATFORM=tinybox_green.
#
# Runs examples/mlperf/model_train.py twice -- an init pass and then the
# actual run -- and collects the output of both in a single log file named
# resnet_green_<MMDDhhmm>_<seed>.log in the current directory.
#
# PYTHONPATH="." and the relative script path mean this has to be started
# from the directory that contains examples/.
#
# Exit status: non-zero if either python3 invocation (or tee) fails.

# Without pipefail the status of `python3 ... | tee` is tee's, so a crashed
# training process would be reported as success.
set -o pipefail

export PYTHONPATH="."
export MODEL="resnet"
export SUBMISSION_PLATFORM="tinybox_green"
export DEFAULT_FLOAT="HALF" GPUS=6 BS=1536 EVAL_BS=192

export LAZYCACHE=0 RESET_STEP=0

export TRAIN_BEAM=4 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=1500 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=10 BEAM_PADTO=0

# pip install -e ".[mlperf]"
export LOGMLPERF=1

# A fresh seed per invocation; it is exported for the training script and
# also embedded in the log file name.
export SEED=$RANDOM
DATETIME=$(date "+%m%d%H%M")
LOGFILE="resnet_green_${DATETIME}_${SEED}.log"

# init -- stop here if it fails instead of starting the timed run anyway.
BENCHMARK=10 INITMLPERF=1 python3 examples/mlperf/model_train.py | tee "$LOGFILE" || exit

# run -- appended to the same log; its status becomes the script's status.
PARALLEL=0 RUNMLPERF=1 EVAL_START_EPOCH=3 EVAL_FREQ=4 python3 examples/mlperf/model_train.py | tee -a "$LOGFILE"
|
||||||
@@ -0,0 +1,50 @@
|
|||||||
|
# 1. Problem
|
||||||
|
|
||||||
|
This problem uses the ResNet-50 CNN to do image classification.
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
Install tinygrad and mlperf-logging from master.
|
||||||
|
```
|
||||||
|
git clone https://github.com/tinygrad/tinygrad.git
cd tinygrad
|
||||||
|
python3 -m pip install -e ".[mlperf]"
|
||||||
|
```
|
||||||
|
|
||||||
|
### tinybox_green
|
||||||
|
Install the p2p driver per [README](https://github.com/tinygrad/open-gpu-kernel-modules/blob/550.54.15-p2p/README.md)
|
||||||
|
This is the default on production tinybox green.
|
||||||
|
|
||||||
|
### tinybox_red
|
||||||
|
Disable CWSR (compute wave save/restore) in the amdgpu driver.
|
||||||
|
This is the default on production tinybox red.
|
||||||
|
```
|
||||||
|
sudo vi /etc/modprobe.d/amdgpu.conf
|
||||||
|
sudo tee /etc/modprobe.d/amdgpu.conf <<EOF
|
||||||
|
options amdgpu cwsr_enable=0
|
||||||
|
EOF
|
||||||
|
sudo update-initramfs -u
|
||||||
|
sudo reboot
|
||||||
|
|
||||||
|
# validate
|
||||||
|
sudo cat /sys/module/amdgpu/parameters/cwsr_enable #= 0
|
||||||
|
```
|
||||||
|
|
||||||
|
# 2. Directions
|
||||||
|
|
||||||
|
## Steps to download and verify data
|
||||||
|
|
||||||
|
```
|
||||||
|
IMGNET_TRAIN=1 python3 extra/datasets/imagenet_download.py
|
||||||
|
```
|
||||||
|
|
||||||
|
## Steps for one time setup
|
||||||
|
|
||||||
|
### tinybox_red
|
||||||
|
```
|
||||||
|
examples/mlperf/training_submission_v4.1/tinycorp/benchmarks/resnet/implementations/tinybox_red/setup.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
## Steps to run benchmark
|
||||||
|
```
|
||||||
|
examples/mlperf/training_submission_v4.1/tinycorp/benchmarks/resnet/implementations/tinybox_red/run_and_time.sh
|
||||||
|
```
|
||||||
@@ -0,0 +1,13 @@
|
|||||||
|
#!/bin/bash
# Launches examples/mlperf/model_train.py for MODEL=resnet with BENCHMARK=10
# and DEBUG=2 exported. All configuration is handed over through environment
# variables; how each one is interpreted is up to model_train.py / tinygrad.
#
# PYTHONPATH="." and the relative script path mean this has to be started
# from the directory that contains examples/.

# Model, default float type, device count and batch sizes.
export \
  PYTHONPATH="." \
  MODEL="resnet" \
  DEFAULT_FLOAT="HALF" \
  GPUS=6 \
  BS=1536 \
  EVAL_BS=192

export \
  LAZYCACHE=0 \
  RESET_STEP=0

# BEAM-related settings for this machine.
export \
  TRAIN_BEAM=4 \
  IGNORE_JIT_FIRST_BEAM=1 \
  BEAM_UOPS_MAX=2000 \
  BEAM_UPCAST_MAX=96 \
  BEAM_LOCAL_MAX=1024 \
  BEAM_MIN_PROGRESS=5 \
  BEAM_PADTO=0

export \
  BENCHMARK=10 \
  DEBUG=2

python3 examples/mlperf/model_train.py
|
||||||
@@ -0,0 +1,15 @@
|
|||||||
|
#!/bin/bash
# Launches examples/mlperf/model_train.py for MODEL=resnet with WANDB=1 and
# PARALLEL=0 exported. All configuration is handed over through environment
# variables; how each one is interpreted is up to model_train.py / tinygrad.
#
# PYTHONPATH="." and the relative script path mean this has to be started
# from the directory that contains examples/.

# Model, default float type, device count and batch sizes.
export \
  PYTHONPATH="." \
  MODEL="resnet" \
  DEFAULT_FLOAT="HALF" \
  GPUS=6 \
  BS=1536 \
  EVAL_BS=192

export \
  LAZYCACHE=0 \
  RESET_STEP=0

# BEAM-related settings for this machine.
export \
  TRAIN_BEAM=4 \
  IGNORE_JIT_FIRST_BEAM=1 \
  BEAM_UOPS_MAX=2000 \
  BEAM_UPCAST_MAX=96 \
  BEAM_LOCAL_MAX=1024 \
  BEAM_MIN_PROGRESS=5 \
  BEAM_PADTO=0

# Evaluation schedule.
export \
  EVAL_START_EPOCH=3 \
  EVAL_FREQ=4

# NOTE(review): WANDB=1 presumably turns on Weights & Biases logging --
# confirm in model_train.py.
export \
  WANDB=1 \
  PARALLEL=0

python3 examples/mlperf/model_train.py
|
||||||
@@ -0,0 +1,23 @@
|
|||||||
|
#!/bin/bash
# Timed ResNet run for SUBMISSION_PLATFORM=tinybox_red.
#
# Runs examples/mlperf/model_train.py twice -- an init pass and then the
# actual run -- and collects the output of both in a single log file named
# resnet_red_<MMDDhhmm>_<seed>.log in the current directory.
#
# PYTHONPATH="." and the relative script path mean this has to be started
# from the directory that contains examples/.
#
# Exit status: non-zero if either python3 invocation (or tee) fails.

# Without pipefail the status of `python3 ... | tee` is tee's, so a crashed
# training process would be reported as success.
set -o pipefail

export PYTHONPATH="."
export MODEL="resnet"
export SUBMISSION_PLATFORM="tinybox_red"
export DEFAULT_FLOAT="HALF" GPUS=6 BS=1536 EVAL_BS=192

export LAZYCACHE=0 RESET_STEP=0

export TRAIN_BEAM=4 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=2000 BEAM_UPCAST_MAX=96 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 BEAM_PADTO=0

# pip install -e ".[mlperf]"
export LOGMLPERF=1

# A fresh seed per invocation; it is exported for the training script and
# also embedded in the log file name.
export SEED=$RANDOM
DATETIME=$(date "+%m%d%H%M")
LOGFILE="resnet_red_${DATETIME}_${SEED}.log"

# init -- stop here if it fails instead of starting the timed run anyway.
BENCHMARK=10 INITMLPERF=1 python3 examples/mlperf/model_train.py | tee "$LOGFILE" || exit

# run -- appended to the same log; its status becomes the script's status.
PARALLEL=0 RUNMLPERF=1 EVAL_START_EPOCH=3 EVAL_FREQ=4 python3 examples/mlperf/model_train.py | tee -a "$LOGFILE"
|
||||||
@@ -0,0 +1,8 @@
|
|||||||
|
#!/bin/bash
# One-time machine preparation: applies three rocm-smi settings and then
# writes a power cap for cards 1 through 6. Needs sudo for the sysfs write.

# Value written to every power1_cap file. The hwmon sysfs ABI expresses
# power limits in microwatts, so this is the 350W cap.
readonly POWER_CAP_UW="350000000"

# GPU profile, memory clock level and performance level.
rocm-smi --setprofile compute
rocm-smi --setmclk 3
rocm-smi --setperflevel high

# power cap to 350W
# A single tee writes the same value to every matching file (and echoes it
# once to stdout); the here-string supplies the value plus trailing newline.
sudo tee /sys/class/drm/card{1..6}/device/hwmon/hwmon*/power1_cap <<<"$POWER_CAP_UW"
|
||||||
@@ -0,0 +1,37 @@
|
|||||||
|
{
|
||||||
|
"submitter": "tinycorp",
|
||||||
|
"division": "closed",
|
||||||
|
"status": "available",
|
||||||
|
"system_name": "tinybox green",
|
||||||
|
"number_of_nodes": "1",
|
||||||
|
"host_processors_per_node": "1",
|
||||||
|
"host_processor_model_name": "AMD EPYC 7532 32-Core Processor",
|
||||||
|
"host_processor_core_count": "64",
|
||||||
|
"host_processor_frequency": "",
|
||||||
|
"host_processor_caches": "",
|
||||||
|
"host_processor_interconnect": "",
|
||||||
|
"host_memory_capacity": "128GB",
|
||||||
|
"host_storage_type": "NVMe SSD",
|
||||||
|
"host_storage_capacity": "4 TB raid array + 1 TB boot",
|
||||||
|
"host_networking": "",
|
||||||
|
"host_networking_topology": "",
|
||||||
|
"host_memory_configuration": "8x 16GB DDR4",
|
||||||
|
"accelerators_per_node": "6",
|
||||||
|
"accelerator_model_name": "NVIDIA GeForce RTX 4090",
|
||||||
|
"accelerator_host_interconnect": "PCIe 4.0 x16",
|
||||||
|
"accelerator_frequency": "",
|
||||||
|
"accelerator_on-chip_memories": "",
|
||||||
|
"accelerator_memory_configuration": "GDDR6X",
|
||||||
|
"accelerator_memory_capacity": "24GB",
|
||||||
|
"accelerator_interconnect": "",
|
||||||
|
"accelerator_interconnect_topology": "",
|
||||||
|
"cooling": "air",
|
||||||
|
"hw_notes": "",
|
||||||
|
"framework": "tinygrad, commit 0e8aa0e2886bf9a2d3ce093bce87305e182e6d4a",
|
||||||
|
"other_software_stack": {
|
||||||
|
"python": "3.10.12",
|
||||||
|
"CUDA": "12.4"
|
||||||
|
},
|
||||||
|
"operating_system": "Ubuntu 22.04.4",
|
||||||
|
"sw_notes": ""
|
||||||
|
}
|
||||||
@@ -0,0 +1,37 @@
|
|||||||
|
{
|
||||||
|
"submitter": "tinycorp",
|
||||||
|
"division": "closed",
|
||||||
|
"status": "available",
|
||||||
|
"system_name": "tinybox red",
|
||||||
|
"number_of_nodes": "1",
|
||||||
|
"host_processors_per_node": "1",
|
||||||
|
"host_processor_model_name": "AMD EPYC 7532 32-Core Processor",
|
||||||
|
"host_processor_core_count": "64",
|
||||||
|
"host_processor_frequency": "",
|
||||||
|
"host_processor_caches": "",
|
||||||
|
"host_processor_interconnect": "",
|
||||||
|
"host_memory_capacity": "128GB",
|
||||||
|
"host_storage_type": "NVMe SSD",
|
||||||
|
"host_storage_capacity": "4 TB raid array + 1 TB boot",
|
||||||
|
"host_networking": "",
|
||||||
|
"host_networking_topology": "",
|
||||||
|
"host_memory_configuration": "8x 16GB DDR4",
|
||||||
|
"accelerators_per_node": "6",
|
||||||
|
"accelerator_model_name": "AMD Radeon RX 7900 XTX",
|
||||||
|
"accelerator_host_interconnect": "PCIe 4.0 x16",
|
||||||
|
"accelerator_frequency": "",
|
||||||
|
"accelerator_on-chip_memories": "",
|
||||||
|
"accelerator_memory_configuration": "GDDR6",
|
||||||
|
"accelerator_memory_capacity": "24GB",
|
||||||
|
"accelerator_interconnect": "",
|
||||||
|
"accelerator_interconnect_topology": "",
|
||||||
|
"cooling": "air",
|
||||||
|
"hw_notes": "",
|
||||||
|
"framework": "tinygrad, commit 0e8aa0e2886bf9a2d3ce093bce87305e182e6d4a",
|
||||||
|
"other_software_stack": {
|
||||||
|
"python": "3.10.12",
|
||||||
|
"ROCm": "6.1"
|
||||||
|
},
|
||||||
|
"operating_system": "Ubuntu 22.04.4",
|
||||||
|
"sw_notes": ""
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user