Compare commits

...

22 Commits

Author SHA1 Message Date
Adel Johar
5393e90a8e Merge pull request #4393 from ROCm/docs_fix_arch
Docs: Fix gpu-arch-spec.rst
2025-02-27 16:35:33 +01:00
Peter Park
fbc2815223 Merge pull request #4417 from peterjunpark/docs/6.3.3
[docs/6.3.3] Update PT and TF docker inventories in compatibility docs (#4415)
2025-02-26 09:28:30 -05:00
Peter Park
2b96a37b08 Fix tensorflow-rocm repo.radeon.com url 2025-02-25 12:58:02 -05:00
Peter Park
1e5ad14d86 Update PT and TF docker inventories in compatibility docs (#4415)
* update PyTorch docker inventories in compatibility doc

* update TF docker inventories in compatibility doc

* update text to rocm 6.3.3

(cherry picked from commit 934767322b)
2025-02-25 12:38:25 -05:00
Peter Park
f9d6bd4db8 Merge pull request #4410 from peterjunpark/docs/6.3.3
[docs/6.3.3] fix tab sync and nested tab Megatron-LM doc (#4409)
2025-02-21 17:23:06 -05:00
Peter Park
23e78c8d55 fix tab sync and nested tab Megatron-LM doc (#4409)
(cherry picked from commit 1ea1c5c6e0)
2025-02-21 17:20:15 -05:00
Peter Park
0edd31bde6 Merge pull request #4408 from peterjunpark/docs/6.3.3
Update docs on Megatron-LM and PyTorch training Dockers (#4407)
2025-02-21 13:29:10 -05:00
Peter Park
4af488e27d Update docs on Megatron-LM and PyTorch training Dockers (#4407)
* Update Megatron-LM and PyTorch Training Docker docs

Also restructure TOC

* Apply suggestions from code review

Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com>

update "start training" text

Apply suggestions from code review

Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com>

update conf.py

fix spacing

fix branding issue

add disable numa

reorg

remove extra text

(cherry picked from commit 389fa7071b)
2025-02-21 13:10:42 -05:00
Parag Bhandari
7ae7046301 Merge branch 'roc-6.3.x' into docs/6.3.3 2025-02-19 17:25:14 -05:00
Parag Bhandari
358092386e Merge branch 'develop' into roc-6.3.x 2025-02-19 17:25:03 -05:00
Parag Bhandari
e071738908 Merge branch 'roc-6.3.x' into docs/6.3.3 2025-02-19 17:22:38 -05:00
pbhandar-amd
cd79403931 Update vllm-benchmark.rst 2025-02-19 17:21:29 -05:00
pbhandar-amd
e44499357e Merge pull request #4400 from ROCm/amd/pbhandar/roc_633
Add changes for rocm 6.3.3 release.
2025-02-19 17:15:53 -05:00
pbhandar-amd
ce3bc46fcb Create rocm-6.3.3.xml 2025-02-19 17:10:47 -05:00
pbhandar-amd
7f66041b96 Update components.xml 2025-02-19 17:00:34 -05:00
pbhandar-amd
1d312ac9fd Update default.xml 2025-02-19 17:00:06 -05:00
pbhandar-amd
ebc39487a8 Update README.md 2025-02-19 16:59:26 -05:00
Parag Bhandari
275ef1d511 Merge branch 'roc-6.3.x' into docs/6.3.3 2025-02-19 16:41:11 -05:00
Parag Bhandari
be36c1808e Merge branch 'develop' into docs/6.3.3 2025-02-19 15:34:46 -05:00
Parag Bhandari
64c362a961 Manually update requirements.in and txt 2025-02-19 11:35:30 -05:00
pbhandar-amd
d392eca232 Update documentation requirements 2025-02-19 11:10:09 -05:00
pbhandar-amd
1b58c08394 Sync develop into docs/6.3.3 2025-02-18 14:05:45 -05:00
18 changed files with 1273 additions and 665 deletions

View File

@@ -117,6 +117,7 @@ FX
Filesystem
FindDb
Flang
FluxBenchmark
Fortran
Fuyu
GALB
@@ -131,6 +132,7 @@ GDS
GEMM
GEMMs
GFortran
GFXIP
Gemma
GiB
GIM
@@ -317,6 +319,7 @@ PipelineParallel
PnP
PowerEdge
PowerShell
Pretraining
Profiler's
PyPi
Pytest
@@ -716,6 +719,7 @@ preprocessing
preprocessor
prequantized
prerequisites
pretraining
profiler
profilers
protobuf

View File

@@ -50,7 +50,7 @@ The following example shows how to use the repo tool to download the ROCm source
```bash
mkdir -p ~/ROCm/
cd ~/ROCm/
export ROCM_VERSION=6.3.2
export ROCM_VERSION=6.3.3
~/bin/repo init -u http://github.com/ROCm/ROCm.git -b roc-6.3.x -m tools/rocm-build/rocm-${ROCM_VERSION}.xml
~/bin/repo sync
```
@@ -77,8 +77,8 @@ The Build time will reduce significantly if we limit the GPU Architecture/s agai
mkdir -p ~/WORKSPACE/ # Or any folder name other than WORKSPACE
cd ~/WORKSPACE/
export ROCM_VERSION=6.3.2
~/bin/repo init -u http://github.com/ROCm/ROCm.git -b develop -m tools/rocm-build/rocm-${ROCM_VERSION}.xml
export ROCM_VERSION=6.3.3
~/bin/repo init -u http://github.com/ROCm/ROCm.git -b roc-6.3.x -m tools/rocm-build/rocm-${ROCM_VERSION}.xml
~/bin/repo sync
# --------------------------------------

View File

@@ -1,7 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<manifest>
<remote name="rocm-org" fetch="https://github.com/ROCm/" />
<default revision="refs/tags/rocm-6.3.2"
<default revision="refs/tags/rocm-6.3.3"
remote="rocm-org"
sync-c="true"
sync-j="4" />

View File

@@ -56,7 +56,7 @@ Docker image compatibility
AMD validates and publishes ready-made `PyTorch images <https://hub.docker.com/r/rocm/pytorch>`_
with ROCm backends on Docker Hub. The following Docker image tags and
associated inventories are validated for `ROCm 6.3.0 <https://repo.radeon.com/rocm/apt/6.3/>`_.
associated inventories are validated for `ROCm 6.3.3 <https://repo.radeon.com/rocm/apt/6.3.3/>`_.
Click the |docker-icon| icon to view the image on Docker Hub.
.. list-table:: PyTorch Docker image components
@@ -77,26 +77,26 @@ Click the |docker-icon| icon to view the image on Docker Hub.
* - .. raw:: html
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3_ubuntu24.04_py3.12_pytorch_release_2.4.0/images/sha256-98ddf20333bd01ff749b8092b1190ee369a75d3b8c71c2fac80ffdcb1a98d529?context=explore"><i class="fab fa-docker fa-lg"></i></a>
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3.3_ubuntu24.04_py3.12_pytorch_release_2.4.0/images/sha256-6c798857b2c9526b44ba535710b93a1737546acea79b53a93c646195c272f1d5"><i class="fab fa-docker fa-lg"></i></a>
- `2.4.0 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
- 24.04
- `3.12 <https://www.python.org/downloads/release/python-3128/>`_
- `3.12.9 <https://www.python.org/downloads/release/python-3129/>`_
- `1.4.0 <https://github.com/ROCm/apex/tree/release/1.4.0>`_
- `0.19.0 <https://github.com/pytorch/vision/tree/v0.19.0>`_
- `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13.0>`_
- `master <https://bitbucket.org/icl/magma/src/master/>`_
- `1.10.0 <https://github.com/openucx/ucx/tree/v1.10.0>`_
- `4.0.7 <https://github.com/open-mpi/ompi/tree/v4.0.7>`_
- `4.0.3 <https://github.com/open-mpi/ompi/tree/v4.0.3>`_
- `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_
* - .. raw:: html
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3_ubuntu22.04_py3.10_pytorch_release_2.4.0/images/sha256-402c9b4f1a6b5a81c634a1932b56cbe01abb699cfcc7463d226276997c6cf8ea?context=explore"><i class="fab fa-docker fa-lg"></i></a>
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3.3_ubuntu22.04_py3.10_pytorch_release_2.4.0/images/sha256-a09b21248133876fc8912a5ff4e6ee2c8d62b14120313e426b3dadda5702713d"><i class="fab fa-docker fa-lg"></i></a>
- `2.4.0 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
- 22.04
- `3.10 <https://www.python.org/downloads/release/python-31016/>`_
- `3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
- `1.4.0 <https://github.com/ROCm/apex/tree/release/1.4.0>`_
- `0.19.0 <https://github.com/pytorch/vision/tree/v0.19.0>`_
- `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13.0>`_
@@ -107,11 +107,11 @@ Click the |docker-icon| icon to view the image on Docker Hub.
* - .. raw:: html
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3_ubuntu22.04_py3.9_pytorch_release_2.4.0/images/sha256-e0608b55d408c3bfe5c19fdd57a4ced3e0eb3a495b74c309980b60b156c526dd?context=explore"><i class="fab fa-docker fa-lg"></i></a>
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3.3_ubuntu22.04_py3.9_pytorch_release_2.4.0/images/sha256-963187534467f0f9da77996762fc1d112a6faa5372277c348a505533e7876ec8"><i class="fab fa-docker fa-lg"></i></a>
- `2.4.0 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
- 22.04
- `3.9.18 <https://www.python.org/downloads/release/python-3918/>`_
- `3.9.21 <https://www.python.org/downloads/release/python-3921/>`_
- `1.4.0 <https://github.com/ROCm/apex/tree/release/1.4.0>`_
- `0.19.0 <https://github.com/pytorch/vision/tree/v0.19.0>`_
- `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13.0>`_
@@ -122,11 +122,11 @@ Click the |docker-icon| icon to view the image on Docker Hub.
* - .. raw:: html
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3_ubuntu22.04_py3.10_pytorch_release_2.3.0/images/sha256-652cf25263d05b1de548222970aeb76e60b12de101de66751264709c0d0ff9d8?context=explore"><i class="fab fa-docker fa-lg"></i></a>
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3.3_ubuntu22.04_py3.10_pytorch_release_2.3.0/images/sha256-952f2621bd2bf3078bef19061e05b209105a82a7908e7e6cdf85014938a4d93a"><i class="fab fa-docker fa-lg"></i></a>
- `2.3.0 <https://github.com/ROCm/pytorch/tree/release/2.3>`_
- 22.04
- `3.10 <https://www.python.org/downloads/release/python-31016/>`_
- `3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
- `1.3.0 <https://github.com/ROCm/apex/tree/release/1.3.0>`_
- `0.18.0 <https://github.com/pytorch/vision/tree/v0.18.0>`_
- `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13.0>`_
@@ -137,7 +137,7 @@ Click the |docker-icon| icon to view the image on Docker Hub.
* - .. raw:: html
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3_ubuntu22.04_py3.10_pytorch_release_2.2.1/images/sha256-051976f26beab8f9aa65d999e3ad546c027b39240a0cc3ee81b114a9024f2912?context=explore"><i class="fab fa-docker fa-lg"></i></a>
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3.3_ubuntu22.04_py3.10_pytorch_release_2.2.1/images/sha256-a2fe20e170feb9e05da3e5728bb98e40d08567e137be8e6ba797962ed2852608"><i class="fab fa-docker fa-lg"></i></a>
- `2.2.1 <https://github.com/ROCm/pytorch/tree/release/2.2>`_
- 22.04
@@ -152,7 +152,7 @@ Click the |docker-icon| icon to view the image on Docker Hub.
* - .. raw:: html
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3_ubuntu20.04_py3.9_pytorch_release_2.2.1/images/sha256-88c839a364d109d3748c100385bfa100d28090d25118cc723fd0406390ab2f7e?context=explore"><i class="fab fa-docker fa-lg"></i></a>
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3.3_ubuntu20.04_py3.9_pytorch_release_2.2.1/images/sha256-7f231937c897cca5f89e360be33c70a2017d60f62d1fbe81292be48c15fe345b"><i class="fab fa-docker fa-lg"></i></a>
- `2.2.1 <https://github.com/ROCm/pytorch/tree/release/2.2>`_
- 20.04
@@ -167,14 +167,14 @@ Click the |docker-icon| icon to view the image on Docker Hub.
* - .. raw:: html
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3_ubuntu22.04_py3.9_pytorch_release_1.13.1/images/sha256-994424ed07a63113f79dd9aa72159124c00f5fbfe18127151e6658f7d0b6f821?context=explore"><i class="fab fa-docker fa-lg"></i></a>
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3.3_ubuntu22.04_py3.9_pytorch_release_1.13.1/images/sha256-616a47758004f91951e2da6c1fe291f903de65a7b2318d4b18359b48fe3032f4"><i class="fab fa-docker fa-lg"></i></a>
- `1.13.1 <https://github.com/ROCm/pytorch/tree/release/1.13>`_
- 22.04
- `3.9.21 <https://www.python.org/downloads/release/python-3921/>`_
- `1.0.0 <https://github.com/ROCm/apex/tree/release/1.0.0>`_
- `0.14.0 <https://github.com/pytorch/vision/tree/v0.14.0>`_
- `2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18>`_
- `2.19.0 <https://github.com/tensorflow/tensorboard/tree/2.19>`_
- `master <https://bitbucket.org/icl/magma/src/master/>`_
- `1.14.1 <https://github.com/openucx/ucx/tree/v1.14.1>`_
- `4.1.5 <https://github.com/open-mpi/ompi/tree/v4.1.5>`_
@@ -182,7 +182,7 @@ Click the |docker-icon| icon to view the image on Docker Hub.
* - .. raw:: html
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3_ubuntu20.04_py3.9_pytorch_release_1.13.1/images/sha256-7b8139fe40a9aeb4bca3aecd15c22c1fa96e867d93479fa3a24fdeeeeafa1219?context=explore"><i class="fab fa-docker fa-lg"></i></a>
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3.3_ubuntu20.04_py3.9_pytorch_release_1.13.1/images/sha256-a2cfb365aea58b84595e241ffdb0d5ef3e6566e98c10b5499f4aa29983a74ea2"><i class="fab fa-docker fa-lg"></i></a>
- `1.13.1 <https://github.com/ROCm/pytorch/tree/release/1.13>`_
- 20.04

View File

@@ -54,7 +54,7 @@ Docker image compatibility
AMD validates and publishes ready-made `TensorFlow images
<https://hub.docker.com/r/rocm/tensorflow>`_ with ROCm backends on
Docker Hub. The following Docker image tags and associated inventories are
validated for `ROCm 6.3.1 <https://repo.radeon.com/rocm/apt/6.3.1/>`_. Click
validated for `ROCm 6.3.3 <https://repo.radeon.com/rocm/apt/6.3.3/>`_. Click
the |docker-icon| icon to view the image on Docker Hub.
.. list-table:: TensorFlow Docker image components
@@ -68,47 +68,47 @@ the |docker-icon| icon to view the image on Docker Hub.
* - .. raw:: html
<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.3.1-py3.12-tf2.17.0-dev/images/sha256-804121ee4985718277ba7dcec53c57bdade130a1ef42f544b6c48090ad379c17"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.3.3-py3.12-tf2.17-dev/images/sha256-fd2653f436880366cc874aa24264ca9dabd892d76ccb63fb807debba459bcaaf"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
- `tensorflow-rocm 2.17.0 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.3/tensorflow_rocm-2.17.0-cp312-cp312-manylinux_2_28_x86_64.whl>`__
- `tensorflow-rocm 2.17.0 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.3.3/tensorflow_rocm-2.17.0-cp312-cp312-manylinux_2_28_x86_64.whl>`__
- dev
- `Python 3.12 <https://www.python.org/downloads/release/python-3124/>`_
- `Python 3.12.4 <https://www.python.org/downloads/release/python-3124/>`_
- `TensorBoard 2.17.1 <https://github.com/tensorflow/tensorboard/tree/2.17.1>`_
* - .. raw:: html
<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.3.1-py3.10-tf2.17.0-dev/images/sha256-776837ffa945913f6c466bfe477810a11453d21d5b6afb200be1c36e48fbc08e"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.3.3-py3.10-tf2.17-dev/images/sha256-8a5eb7443798935dd269575e2abae847b702e1dfb06766ab84f081a6314d8b95"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
- `tensorflow-rocm 2.17.0 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.3/tensorflow_rocm-2.17.0-cp310-cp310-manylinux_2_28_x86_64.whl>`__
- `tensorflow-rocm 2.17.0 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.3.3/tensorflow_rocm-2.17.0-cp310-cp310-manylinux_2_28_x86_64.whl>`__
- dev
- `Python 3.10 <https://www.python.org/downloads/release/python-31012/>`_
- `TensorBoard 2.17.0 <https://github.com/tensorflow/tensorboard/tree/2.17.0>`_
- `Python 3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
- `TensorBoard 2.17.1 <https://github.com/tensorflow/tensorboard/tree/2.17.1>`_
* - .. raw:: html
<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.3.1-py3.12-tf2.16.2-dev/images/sha256-c793e1483e30809c3c28fc5d7805bedc033c73da224f839fff370717cb100944"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.3.3-py3.12-tf2.16-dev/images/sha256-8fc939b10cdd6d2b11407474880d4c8ab2b52ab6e2d1743c921fc2adbfd0422f"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
- `tensorflow-rocm 2.16.2 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.3/tensorflow_rocm-2.16.2-cp312-cp312-manylinux_2_28_x86_64.whl>`__
- `tensorflow-rocm 2.16.2 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.3.3/tensorflow_rocm-2.16.2-cp312-cp312-manylinux_2_28_x86_64.whl>`__
- dev
- `Python 3.12 <https://www.python.org/downloads/release/python-3124/>`_
- `Python 3.12.4 <https://www.python.org/downloads/release/python-3124/>`_
- `TensorBoard 2.16.2 <https://github.com/tensorflow/tensorboard/tree/2.16.2>`_
* - .. raw:: html
<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.3.1-py3.10-tf2.16.0-dev/images/sha256-263e78414ae85d7bcd52a025a94131d0a279872a45ed632b9165336dfdcd4443"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.3.3-py3.10-tf2.16-dev/images/sha256-a4cc6ab23d59fdf5459ceac1f0a603e6c16ae7f885d30e42c0c2b3ac60c2ad10"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
- `tensorflow-rocm 2.16.2 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.3/tensorflow_rocm-2.16.2-cp310-cp310-manylinux_2_28_x86_64.whl>`__
- `tensorflow-rocm 2.16.2 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.3.3/tensorflow_rocm-2.16.2-cp310-cp310-manylinux_2_28_x86_64.whl>`__
- dev
- `Python 3.10 <https://www.python.org/downloads/release/python-31012/>`_
- `Python 3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
- `TensorBoard 2.16.2 <https://github.com/tensorflow/tensorboard/tree/2.16.2>`_
* - .. raw:: html
<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.3.1-py3.10-tf2.15.0-dev/images/sha256-479046a8477ca701a9494a813ab17e8ab4f6baa54641e65dc8d07629f1e6a880"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.3.3-py3.10-tf2.15-dev/images/sha256-60887c488421184adcb60b9ed4f72a8bd7bdb64d238e50943ca7cbde38e4aa48"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
- `tensorflow-rocm 2.15.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.3/tensorflow_rocm-2.15.1-cp310-cp310-manylinux_2_28_x86_64.whl>`_
- `tensorflow-rocm 2.15.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.3.3/tensorflow_rocm-2.15.1-cp310-cp310-manylinux_2_28_x86_64.whl>`_
- dev
- `Python 3.10 <https://www.python.org/downloads/release/python-31012/>`_
- `Python 3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
- `TensorBoard 2.15.2 <https://github.com/tensorflow/tensorboard/tree/2.15.2>`_
Critical ROCm libraries for TensorFlow

View File

@@ -49,6 +49,9 @@ article_pages = [
{"file": "how-to/rocm-for-ai/training/index", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/train-a-model", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/prerequisite-system-validation", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/train-a-model/benchmark-docker/megatron-lm", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/train-a-model/benchmark-docker/pytorch-training", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/scale-model-training", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/fine-tuning/index", "os": ["linux"]},

View File

@@ -444,8 +444,6 @@ Further reading
- To learn how to optimize inference on LLMs, see
:doc:`Inference optimization <../inference-optimization/index>`.
<<<<<<< HEAD:docs/how-to/performance-validation/mi300x/vllm-benchmark.rst
=======
- To learn how to fine-tune LLMs, see
:doc:`Fine-tuning LLMs <../fine-tuning/index>`.

View File

@@ -0,0 +1,547 @@
:orphan:
.. meta::
:description: How to train a model using Megatron-LM for ROCm.
:keywords: ROCm, AI, LLM, train, Megatron-LM, megatron, Llama, tutorial, docker, torch
******************************************
Training a model with Megatron-LM for ROCm
******************************************
The Megatron-LM framework for ROCm is a specialized fork of the robust Megatron-LM,
designed to enable efficient training of large-scale language models on AMD
GPUs. By leveraging AMD Instinct™ MI300X series accelerators, Megatron-LM delivers
enhanced scalability, performance, and resource utilization for AI workloads.
It is purpose-built to support models like Llama 2, Llama 3, Llama 3.1, and
DeepSeek, enabling developers to train next-generation AI models more
efficiently. See the GitHub repository at `<https://github.com/ROCm/Megatron-LM>`__.
AMD provides a ready-to-use Docker image for MI300X accelerators containing
essential components, including PyTorch, ROCm libraries, and Megatron-LM
utilities. It contains the following software components to accelerate training
workloads:
+--------------------------+--------------------------------+
| Software component | Version |
+==========================+================================+
| ROCm | 6.3.0 |
+--------------------------+--------------------------------+
| PyTorch | 2.7.0a0+git637433 |
+--------------------------+--------------------------------+
| Python | 3.10 |
+--------------------------+--------------------------------+
| Transformer Engine | 1.11 |
+--------------------------+--------------------------------+
| Flash Attention | 3.0.0 |
+--------------------------+--------------------------------+
| hipBLASLt | git258a2162 |
+--------------------------+--------------------------------+
| Triton | 3.1 |
+--------------------------+--------------------------------+
Supported features and models
=============================
Megatron-LM provides the following key features to train large language models efficiently:
- Transformer Engine (TE)
- APEX
- GEMM tuning
- Torch.compile
- 3D parallelism: TP + SP + CP
- Distributed optimizer
- Flash Attention (FA) 3
- Fused kernels
- Pre-training
.. _amd-megatron-lm-model-support:
The following models are pre-optimized for performance on the AMD Instinct MI300X accelerator.
* Llama 2 7B
* Llama 2 70B
* Llama 3 8B
* Llama 3 70B
* Llama 3.1 8B
* Llama 3.1 70B
* DeepSeek-V2-Lite
.. note::
Some models, such as Llama 3, require an external license agreement through
a third party (for example, Meta).
System validation
=================
If you have already validated your system settings, skip this step. Otherwise,
complete the :ref:`system validation and optimization steps <train-a-model-system-validation>`
to set up your system before starting training.
Disable NUMA auto-balancing
---------------------------
Generally, application performance can benefit from disabling NUMA auto-balancing. However,
it might be detrimental to performance with certain types of workloads.
Run the command ``cat /proc/sys/kernel/numa_balancing`` to check your current NUMA (Non-Uniform
Memory Access) settings. Output ``0`` indicates this setting is disabled. If there is no output or
the output is ``1``, run the following command to disable NUMA auto-balancing.
.. code-block:: shell
sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
See :ref:`mi300x-disable-numa` for more information.
.. _mi300x-amd-megatron-lm-training:
Environment setup
=================
The pre-built ROCm Megatron-LM environment allows users to quickly validate system performance, conduct
training benchmarks, and achieve superior performance for models like Llama 3.1, Llama 2, and DeepSeek V2.
Use the following instructions to set up the environment, configure the script to train models, and
reproduce the benchmark results on the MI300X accelerators with the AMD Megatron-LM Docker
image.
.. _amd-megatron-lm-requirements:
Download the Docker image
-------------------------
1. Use the following command to pull the Docker image from Docker Hub.
.. code-block:: shell
docker pull rocm/megatron-lm:v25.3
2. Launch the Docker container.
.. code-block:: shell
docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v $HOME/.ssh:/root/.ssh --shm-size 64G --name megatron_training_env rocm/megatron-lm:v25.3
3. Use these commands if you exit the ``megatron_training_env`` container and need to return to it.
.. code-block:: shell
docker start megatron_training_env
docker exec -it megatron_training_env bash
The Docker container includes a pre-installed, verified version of Megatron-LM from the `release branch <https://github.com/ROCm/Megatron-LM/tree/megatron_release_v25.3>`_.
.. _amd-megatron-lm-environment-setup:
Configuration scripts
---------------------
.. tab-set::
.. tab-item:: Llama
:sync: llama
If you're working with Llama 2 7B or Llama 2 70B, use the ``train_llama2.sh`` configuration
script in the ``examples/llama`` directory of
`<https://github.com/ROCm/Megatron-LM/tree/megatron_release_v25.3/examples/llama>`__.
Likewise, if you're working with Llama 3 or Llama 3.1, then use ``train_llama3.sh`` and update
the configuration script accordingly.
.. tab-item:: DeepSeek V2
:sync: deepseek
Use the ``train_deepseek_v2.sh`` configuration script in the ``examples/deepseek_v2``
directory of
`<https://github.com/ROCm/Megatron-LM/tree/megatron_release_v25.3/examples/deepseek_v2>`__
and update the configuration script accordingly.
Network interface
^^^^^^^^^^^^^^^^^
.. tab-set::
.. tab-item:: Llama
:sync: llama
To avoid connectivity issues in multi-node deployments, ensure the correct network interface
is set in your training scripts.
1. Run the following command (outside the container) to find the active network interface on your system.
.. code-block:: shell
ip a
2. Update the ``NCCL_SOCKET_IFNAME`` and ``GLOO_SOCKET_IFNAME`` variables with your system's network interface. For
example:
.. code-block:: shell
export NCCL_SOCKET_IFNAME=ens50f0np0
export GLOO_SOCKET_IFNAME=ens50f0np0
Dataset options
^^^^^^^^^^^^^^^
.. tab-set::
.. tab-item:: Llama
:sync: llama
You can use either mock data or real data for training.
* Mock data can be useful for testing and validation. Use the ``MOCK_DATA`` variable to toggle between mock and real data. The default value is ``1`` (mock data enabled).
.. code-block:: bash
MOCK_DATA=1
* If you're using a real dataset, update the ``DATA_PATH`` variable to point to the location of your dataset.
.. code-block:: bash
MOCK_DATA=0
DATA_PATH=${DATA_PATH:-"/data/bookcorpus_text_sentence"} # Change to where your dataset is stored
Ensure that the files are accessible inside the Docker container. A bind-mount example follows this section.
.. tab-item:: DeepSeek V2
:sync: deepseek
If you don't already have the dataset, download the DeepSeek dataset using the following
commands:
.. code-block:: shell
mkdir deepseek-datasets
cd deepseek-datasets
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/SlimPajama.json
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-train.json
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-valid.json
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/mmap_deepseekv2_datasets_text_document.bin
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/mmap_deepseekv2_datasets_text_document.idx
You can use either mock data or real data for training.
* Mock data can be useful for testing and validation. Use the ``MOCK_DATA`` variable to toggle between mock and real data. The default value is ``1`` (mock data enabled).
.. code-block:: bash
MOCK_DATA=1
* If you're using a real dataset, update the ``DATA_DIR`` variable to point to the location of your dataset.
.. code-block:: bash
MOCK_DATA=0
DATA_DIR="/root/data/deepseek-datasets" # Change to where your dataset is stored
Ensure that the files are accessible inside the Docker container.
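If your dataset is stored outside the directories mounted by the launch command, one way to make it visible in the container is to add a bind mount when starting it. The following is a sketch that reuses the earlier ``docker run`` flags; the host path ``/data`` is a placeholder for your dataset location:
.. code-block:: shell
docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v /data:/data --shm-size 64G --name megatron_training_env rocm/megatron-lm:v25.3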
Tokenizer
^^^^^^^^^
Tokenization is the process of converting raw text into tokens that can be processed by the model. For Llama
models, this typically involves sub-word tokenization, where words are broken down into smaller units based on
a fixed vocabulary. The tokenizer is trained along with the model on a large corpus of text, and it learns a
fixed vocabulary that can represent a wide range of text from different domains. This allows Llama models to
handle a variety of input sequences, including unseen words or domain-specific terms.
.. tab-set::
.. tab-item:: Llama
:sync: llama
To train any of the Llama 2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``Llama2Tokenizer``.
To train any of the Llama 3 and Llama 3.1 models that this Docker image supports, use the ``HuggingFaceTokenizer``.
Set the Hugging Face model link in the ``TOKENIZER_MODEL`` variable.
For example, if you're using the Llama 3.1 8B model:
.. code-block:: shell
TOKENIZER_MODEL=meta-llama/Llama-3.1-8B
.. tab-item:: DeepSeek V2
:sync: deepseek
To train any of the DeepSeek V2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``DeepSeekV2Tokenizer``.
Multi-node training
^^^^^^^^^^^^^^^^^^^
.. tab-set::
.. tab-item:: Llama
:sync: llama
If you're running multi-node training, update the following environment variables. They can also be passed as command line arguments. A combined example follows this list.
* Change ``localhost`` to the master node's hostname:
.. code-block:: shell
MASTER_ADDR="${MASTER_ADDR:-localhost}"
* Set the number of nodes you want to train on (for instance, ``2``, ``4``, ``8``):
.. code-block:: shell
NNODES="${NNODES:-1}"
* Set the rank of each node (0 for master, 1 for the first worker node, and so on):
.. code-block:: shell
NODE_RANK="${NODE_RANK:-0}"
* Set ``DATA_CACHE_PATH`` to a common directory accessible by all the nodes (for example, an
NFS directory) for multi-node runs:
.. code-block:: shell
DATA_CACHE_PATH=/root/cache # Set to a common directory for multi-node runs
* For multi-node runs, make sure the correct network drivers are installed on the nodes. If running inside Docker, either install the drivers inside the container or pass them through from the host when creating the container.
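Putting these together, a hypothetical master-node setup for a two-node run might look like the following (the hostname and cache path are placeholders):
.. code-block:: shell
MASTER_ADDR=node0.example.com # master node's hostname
NNODES=2 # total number of nodes
NODE_RANK=0 # use 1 on the worker node
DATA_CACHE_PATH=/nfs/cache # shared directory accessible by all nodes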
Start training on AMD Instinct accelerators
===========================================
The prebuilt Megatron-LM with ROCm training environment allows users to quickly validate
system performance, conduct training benchmarks, and achieve superior
performance for models like Llama 3.1 and Llama 2. This container should not be
expected to provide generalized performance across all training workloads. You can expect the container to perform well with the model configurations described in the following section, but other configurations are not validated by AMD.
Use the following instructions to set up the environment, configure the script
to train models, and reproduce the benchmark results on MI300X series
accelerators with the AMD Megatron-LM Docker image.
.. tab-set::
.. tab-item:: Llama
:sync: llama
.. tab-set::
.. tab-item:: Single node training
:sync: single-node
To run training on a single node, navigate to the Megatron-LM folder and use the
following command:
.. code-block:: shell
TEE_OUTPUT=1 MBS=2 BS=128 TP=1 TE_FP8=1 SEQ_LENGTH=8192 MODEL_SIZE=8 bash examples/llama/train_llama3.sh
.. tab-item:: Multi-node training
:sync: multi-node
To run training on multiple nodes, launch the Docker container on each node. For example, for a two-node setup (``NODE0`` as the master node), use these commands.
* On the master node ``NODE0``:
.. code-block:: shell
TEE_OUTPUT=1 MBS=2 BS=256 TP=1 TE_FP8=1 SEQ_LENGTH=8192 MODEL_SIZE=8 MASTER_ADDR=IP_NODE0 NNODES=2 NODE_RANK=0 bash examples/llama/train_llama3.sh
* On the worker node ``NODE1``:
.. code-block:: shell
TEE_OUTPUT=1 MBS=2 BS=256 TP=1 TE_FP8=1 SEQ_LENGTH=8192 MODEL_SIZE=8 MASTER_ADDR=IP_NODE0 NNODES=2 NODE_RANK=1 bash examples/llama/train_llama3.sh
.. tab-item:: DeepSeek V2
:sync: deepseek
To run the training on a single node, navigate to the ``/workspace/Megatron-LM`` directory and use the following command:
.. code-block:: shell
cd /workspace/Megatron-LM
GEMM_TUNING=1 PR=bf16 MBS=4 AC=none bash examples/deepseek_v2/train_deepseekv2.sh
Key options
-----------
.. _amd-megatron-lm-benchmark-test-vars:
The benchmark tests support the following sets of variables:
.. tab-set::
.. tab-item:: Llama
:sync: llama
``TEE_OUTPUT``
``1`` to enable training logs or ``0`` to disable.
``TE_FP8``
``0`` for BF16 (default) or ``1`` for FP8 GEMMs.
``GEMM_TUNING``
``1`` to enable GEMM tuning, which boosts performance by using the best GEMM kernels.
``USE_FLASH_ATTN``
``1`` to enable Flash Attention.
``ENABLE_PROFILING``
``1`` to enable PyTorch profiling for performance analysis.
``transformer-impl``
``transformer_engine`` to use the Transformer Engine (TE) or ``local`` to disable TE.
``MODEL_SIZE``
``8B`` or ``70B`` for Llama 3 and 3.1. ``7B`` or ``70B`` for Llama 2.
``TOTAL_ITERS``
The total number of iterations -- ``10`` by default.
``MOCK_DATA``
``1`` to use mock data or ``0`` to use real data provided by you.
``MBS``
Micro batch size.
``BS``
Global batch size.
``TP``
Tensor parallel (``1``, ``2``, ``4``, ``8``).
``SEQ_LENGTH``
Input sequence length.
.. tab-item:: DeepSeek V2
:sync: deepseek
``PR``
Precision for training. ``bf16`` for BF16 (default) or ``fp8`` for FP8 GEMMs.
``GEMM_TUNING``
``1`` to enable GEMM tuning, which boosts performance by using the best GEMM kernels.
``TOTAL_ITERS``
The total number of iterations -- ``10`` by default.
``MOCK_DATA``
``1`` to use mock data or ``0`` to use real data provided by you.
``MBS``
Micro batch size.
``GBS``
Global batch size.
Benchmarking examples
---------------------
.. tab-set::
.. tab-item:: Llama
:sync: llama
.. tab-set::
.. tab-item:: Single node training
:sync: single-node
Use this command to run training with the Llama 2 7B model on a single node. You can specify the MBS, BS, datatype, and so on.
.. code-block:: bash
TEE_OUTPUT=1 MBS=5 BS=120 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1 SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh
You can find the training logs at the location defined in ``$TRAIN_LOG`` in the :ref:`configuration script <amd-megatron-lm-environment-setup>`.
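To watch a run while it's in progress, you can tail that log file (a minimal sketch; ``TRAIN_LOG`` is defined by the training script):
.. code-block:: shell
tail -f "$TRAIN_LOG"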
See the sample output:
.. image:: ../../../../data/how-to/rocm-for-ai/llama2-7b-training-log-sample.png
:width: 800
.. tab-item:: Multi-node training
:sync: multi-node
Launch the Docker container on each node.
In this example, run training with the Llama 2 7B model on 2 nodes with a specific MBS, BS, datatype, and so on.
On the master node:
.. code-block:: bash
TEE_OUTPUT=1 MBS=4 BS=64 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1 SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh
On the worker node:
.. code-block:: bash
TEE_OUTPUT=1 MBS=4 BS=64 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1 SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh
You can find the training logs at the location defined in ``$TRAIN_LOG`` in the :ref:`configuration script <amd-megatron-lm-environment-setup>`.
Sample output for 2-node training:
Master node:
.. image:: ../../../../data/how-to/rocm-for-ai/2-node-training-master.png
:width: 800
Worker node:
.. image:: ../../../../data/how-to/rocm-for-ai/2-node-training-worker.png
:width: 800
Previous versions
=================
This table lists previous versions of the ROCm Megatron-LM Docker image for training
performance validation. For detailed information about available models for
benchmarking, see the version-specific documentation.
.. list-table::
:header-rows: 1
:stub-columns: 1
* - ROCm version
- Megatron-LM version
- PyTorch version
- Resources
* - 6.1
- 24.12-dev
- 2.4.0
-
* `Documentation <https://rocm.docs.amd.com/en/docs-6.3.0/how-to/rocm-for-ai/train-a-model.html>`_
* `Docker Hub <https://hub.docker.com/layers/rocm/megatron-lm/24.12-dev/images/sha256-5818c50334ce3d69deeeb8f589d83ec29003817da34158ebc9e2d112b929bf2e>`_

View File

@@ -0,0 +1,341 @@
:orphan:
.. meta::
:description: How to train a model using PyTorch for ROCm.
:keywords: ROCm, AI, LLM, train, PyTorch, torch, Llama, flux, tutorial, docker
**************************************
Training a model with PyTorch for ROCm
**************************************
PyTorch is an open-source machine learning framework that is widely used for
model training with GPU-optimized components for transformer-based models.
The PyTorch for ROCm training Docker image (``rocm/pytorch-training:v25.3``) provides a prebuilt, optimized environment for fine-tuning and pretraining a model on AMD Instinct MI325X and MI300X accelerators.
software components to accelerate training workloads:
+--------------------------+--------------------------------+
| Software component | Version |
+==========================+================================+
| ROCm | 6.3.0 |
+--------------------------+--------------------------------+
| PyTorch | 2.7.0a0+git637433 |
+--------------------------+--------------------------------+
| Python | 3.10 |
+--------------------------+--------------------------------+
| Transformer Engine | 1.11 |
+--------------------------+--------------------------------+
| Flash Attention | 3.0.0 |
+--------------------------+--------------------------------+
| hipBLASLt | git258a2162 |
+--------------------------+--------------------------------+
| Triton | 3.1 |
+--------------------------+--------------------------------+
.. _amd-pytorch-training-model-support:
Supported models
================
The following models are pre-optimized for performance on the AMD Instinct MI300X accelerator.
* Llama 3.1 8B
* Llama 3.1 70B
* FLUX.1-dev
.. note::
Only these models are supported in the following steps.
Some models, such as Llama 3, require an external license agreement through
a third party (for example, Meta).
System validation
=================
If you have already validated your system settings, skip this step. Otherwise,
complete the :ref:`system validation and optimization steps <train-a-model-system-validation>`
to set up your system before starting training.
Disable NUMA auto-balancing
---------------------------
Generally, application performance can benefit from disabling NUMA auto-balancing. However,
it might be detrimental to performance with certain types of workloads.
Run the command ``cat /proc/sys/kernel/numa_balancing`` to check your current NUMA (Non-Uniform
Memory Access) settings. Output ``0`` indicates this setting is disabled. If there is no output or
the output is ``1``, run the following command to disable NUMA auto-balancing.
.. code-block:: shell
sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
See :ref:`mi300x-disable-numa` for more information.
Environment setup
=================
This Docker image is optimized for specific model configurations outlined
below. Performance can vary for other training workloads, as AMD
doesn't validate configurations and run conditions outside those described.
Download the Docker image
-------------------------
1. Use the following command to pull the Docker image from Docker Hub.
.. code-block:: shell
docker pull rocm/pytorch-training:v25.3
2. Run the Docker container.
.. code-block:: shell
docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v $HOME/.ssh:/root/.ssh --shm-size 64G --name training_env rocm/pytorch-training:v25.3
3. Use these commands if you exit the ``training_env`` container and need to return to it.
.. code-block:: shell
docker start training_env
docker exec -it training_env bash
4. In the Docker container, clone the `<https://github.com/ROCm/MAD>`__ repository and navigate to the benchmark scripts directory.
.. code-block:: shell
git clone https://github.com/ROCm/MAD
cd MAD/scripts/pytorch-train
Prepare training datasets and dependencies
------------------------------------------
The following benchmarking examples may require downloading models and datasets
from Hugging Face. To ensure successful access to gated repos, set your
``HF_TOKEN``.
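For example, you might export the token before running the setup script (the value shown is a placeholder, not a real credential):
.. code-block:: shell
export HF_TOKEN="<your Hugging Face access token>"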
Run the setup script to install libraries and datasets needed for benchmarking.
.. code-block:: shell
./pytorch_benchmark_setup.sh
``pytorch_benchmark_setup.sh`` installs the following libraries:
.. list-table::
:header-rows: 1
* - Library
- Benchmark model
- Reference
* - ``accelerate``
- Llama 3.1 8B, FLUX
- `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
* - ``datasets``
- Llama 3.1 8B, 70B, FLUX
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
* - ``torchdata``
- Llama 3.1 70B
- `TorchData <https://pytorch.org/data/beta/index.html>`_
* - ``tomli``
- Llama 3.1 70B
- `Tomli <https://pypi.org/project/tomli/>`_
* - ``tiktoken``
- Llama 3.1 70B
- `tiktoken <https://github.com/openai/tiktoken>`_
* - ``blobfile``
- Llama 3.1 70B
- `blobfile <https://pypi.org/project/blobfile/>`_
* - ``tabulate``
- Llama 3.1 70B
- `tabulate <https://pypi.org/project/tabulate/>`_
* - ``wandb``
- Llama 3.1 70B
- `Weights & Biases <https://github.com/wandb/wandb>`_
* - ``sentencepiece``
- Llama 3.1 70B, FLUX
- `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0
* - ``tensorboard``
- Llama 3.1 70B, FLUX
- `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0
* - ``csvkit``
- FLUX
- `csvkit <https://csvkit.readthedocs.io/en/latest/>`_ 2.0.1
* - ``deepspeed``
- FLUX
- `DeepSpeed <https://github.com/deepspeedai/DeepSpeed>`_ 0.16.2
* - ``diffusers``
- FLUX
- `Hugging Face Diffusers <https://huggingface.co/docs/diffusers/en/index>`_ 0.31.0
* - ``GitPython``
- FLUX
- `GitPython <https://github.com/gitpython-developers/GitPython>`_ 3.1.44
* - ``opencv-python-headless``
- FLUX
- `opencv-python-headless <https://pypi.org/project/opencv-python-headless/>`_ 4.10.0.84
* - ``peft``
- FLUX
- `PEFT <https://huggingface.co/docs/peft/en/index>`_ 0.14.0
* - ``protobuf``
- FLUX
- `Protocol Buffers <https://github.com/protocolbuffers/protobuf>`_ 5.29.2
* - ``pytest``
- FLUX
- `PyTest <https://docs.pytest.org/en/stable/>`_ 8.3.4
* - ``python-dotenv``
- FLUX
- `python-dotenv <https://pypi.org/project/python-dotenv/>`_ 1.0.1
* - ``seaborn``
- FLUX
- `Seaborn <https://seaborn.pydata.org/>`_ 0.13.2
* - ``transformers``
- FLUX
- `Transformers <https://huggingface.co/docs/transformers/en/index>`_ 4.47.0
``pytorch_benchmark_setup.sh`` downloads the following models from Hugging Face:
* `meta-llama/Llama-3.1-70B-Instruct <https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct>`_
* `black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
Along with the following datasets:
* `WikiText <https://huggingface.co/datasets/Salesforce/wikitext>`_
* `bghira/pseudo-camera-10k <https://huggingface.co/datasets/bghira/pseudo-camera-10k>`_
Start training on AMD Instinct accelerators
===========================================
The prebuilt PyTorch with ROCm training environment allows users to quickly validate
system performance, conduct training benchmarks, and achieve superior
performance for models like Llama 3.1 and Llama 2. This container should not be
expected to provide generalized performance across all training workloads. You can expect the container to perform well with the model configurations described in the following section, but other configurations are not validated by AMD.
Use the following instructions to set up the environment, configure the script
to train models, and reproduce the benchmark results on MI300X series
accelerators with the AMD PyTorch training Docker image.
Once your environment is set up, use the following commands and examples to start benchmarking.
Pretraining
-----------
To start the pretraining benchmark, use the following command with the
appropriate options. See the following list of options and their descriptions.
.. code-block:: shell
./pytorch_benchmark_report.sh -t $training_mode -m $model_repo -p $datatype -s $sequence_length
Options and available models
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. list-table::
:header-rows: 1
* - Name
- Options
- Description
* - ``$training_mode``
- ``pretrain``
- Benchmark pretraining
* -
- ``finetune_fw``
- Benchmark full weight fine-tuning (Llama 3.1 70B with BF16)
* -
- ``finetune_lora``
- Benchmark LoRA fine-tuning (Llama 3.1 70B with BF16)
* - ``$datatype``
- FP8 or BF16
- Only Llama 3.1 8B supports FP8 precision.
* - ``$model_repo``
- Llama-3.1-8B
- `Llama 3.1 8B <https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct>`_
* -
- Llama-3.1-70B
- `Llama 3.1 70B <https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct>`_
* -
- Flux
- `FLUX.1 [dev] <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
Fine-tuning
-----------
To start the fine-tuning benchmark, use the following command. It runs the benchmarking example of Llama 3.1 70B with the WikiText dataset using the AMD fork of `torchtune <https://github.com/AMD-AIG-AIMA/torchtune>`_.
.. code-block:: shell
./pytorch_benchmark_report.sh -t {finetune_fw, finetune_lora} -p BF16 -m Llama-3.1-70B
Benchmarking examples
---------------------
Here are some examples of how to use the command.
* Example 1: Llama 3.1 70B with BF16 precision using `torchtitan <https://github.com/ROCm/torchtitan>`_.
.. code-block:: shell
./pytorch_benchmark_report.sh -t pretrain -p BF16 -m Llama-3.1-70B -s 8192
* Example 2: Llama 3.1 8B with FP8 precision using Transformer Engine (TE) and Hugging Face Accelerate.
.. code-block:: shell
./pytorch_benchmark_report.sh -t pretrain -p FP8 -m Llama-3.1-8B -s 8192
* Example 3: FLUX.1-dev with BF16 precision using FluxBenchmark.
.. code-block:: shell
./pytorch_benchmark_report.sh -t pretrain -p BF16 -m Flux
* Example 4: Torchtune full weight fine-tuning with Llama 3.1 70B
.. code-block:: shell
./pytorch_benchmark_report.sh -t finetune_fw -p BF16 -m Llama-3.1-70B
* Example 5: Torchtune LoRA fine-tuning with Llama 3.1 70B
.. code-block:: shell
./pytorch_benchmark_report.sh -t finetune_lora -p BF16 -m Llama-3.1-70B

View File

@@ -19,6 +19,10 @@ training, fine-tuning, and inference. It leverages popular machine learning fram
In this guide, you'll learn about:
- :doc:`Training a model <train-a-model>`
- Training a model
- :doc:`Scale model training <scale-model-training>`
- :doc:`Train a model with Megatron-LM <benchmark-docker/megatron-lm>`
- :doc:`Train a model with PyTorch <benchmark-docker/pytorch-training>`
- :doc:`Scaling model training <scale-model-training>`

View File

@@ -0,0 +1,130 @@
:orphan:
.. meta::
:description: Prerequisite system validation before using ROCm for AI.
:keywords: ROCm, AI, LLM, train, megatron, Llama, tutorial, docker, torch, pytorch, jax
.. _train-a-model-system-validation:
**********************************************
Prerequisite system validation before training
**********************************************
Complete the following system validation and optimization steps to set up your system before starting training.
Disable NUMA auto-balancing
---------------------------
Generally, application performance can benefit from disabling NUMA auto-balancing. However,
it might be detrimental to performance with certain types of workloads.
Run the command ``cat /proc/sys/kernel/numa_balancing`` to check your current NUMA (Non-Uniform
Memory Access) settings. Output ``0`` indicates this setting is disabled. If there is no output or
the output is ``1``, run the following command to disable NUMA auto-balancing.
.. code-block:: shell
sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
See :ref:`mi300x-disable-numa` for more information.
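To confirm the change, re-read the value; it should now print ``0``:
.. code-block:: shell
cat /proc/sys/kernel/numa_balancing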
Hardware verification with ROCm
-------------------------------
Use the command ``rocm-smi --setperfdeterminism 1900`` to set the max clock speed up to 1900 MHz
instead of the default 2100 MHz. This can reduce the chance of a PCC event lowering the attainable
GPU clocks. This setting will not be required for new IFWI releases with the production PRC feature.
You can restore this setting to its default value with the ``rocm-smi -r`` command.
Run the command:
.. code-block:: shell
rocm-smi --setperfdeterminism 1900
See :ref:`mi300x-hardware-verification-with-rocm` for more information.
RCCL Bandwidth Test for multi-node setups
-----------------------------------------
ROCm Collective Communications Library (RCCL) is a standalone library of standard collective communication
routines for GPUs. See the :doc:`RCCL documentation <rccl:index>` for more information. Before starting
pretraining, running a RCCL bandwidth test helps ensure that the multi-GPU or multi-node setup is optimized
for efficient distributed training.
Running the RCCL bandwidth test helps verify that:
- The GPUs can communicate across nodes or within a single node.
- The interconnect (such as InfiniBand, Ethernet, or Infinity Fabric) is functioning as expected and
provides adequate bandwidth for communication.
- There are no hardware setup or cabling issues that could affect communication between GPUs.
Tuning and optimizing hyperparameters
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
In distributed training, specific hyperparameters related to distributed communication can be tuned based on
the results of the RCCL bandwidth test. These variables are already set in the Docker image:
.. code-block:: shell
# force all RCCL streams to be high priority
export TORCH_NCCL_HIGH_PRIORITY=1
# specify which RDMA interfaces to use for communication
export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7
# define the Global ID index used in RoCE mode
export NCCL_IB_GID_INDEX=3
# avoid data corruption/mismatch issue that existed in past releases
export RCCL_MSCCL_ENABLE=0
Running the RCCL Bandwidth Test
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
It's recommended that you run the RCCL bandwidth test before launching training to confirm that system performance is sufficient. The RCCL tests are not included in the AMD Megatron-LM Docker
image; follow the instructions in `<https://github.com/ROCm/rccl-tests>`__ to get started.
See :ref:`mi300x-rccl` for more information.
Run on 8 GPUs (``-g 8``), scanning from 8 bytes to 10 GB:
.. code-block:: shell
./build/all_reduce_perf -b 8 -e 10G -f 2 -g 8
.. image:: ../../../data/how-to/rocm-for-ai/rccl-tests-8-gpu.png
:width: 800
Using one MPI process per GPU and ``-g 1`` for performance-oriented runs on both single-node and multi-node is
recommended. So, a run on 8 GPUs looks something like:
.. code-block:: shell
mpirun -np 8 --bind-to numa ./build/all_reduce_perf -b 8 -e 10G -f 2 -g 1
.. image:: ../../../data/how-to/rocm-for-ai/rccl-tests-1-mpi-process-per-gpu.png
:width: 800
Running with one MPI process per GPU ensures a one-to-one mapping for CPUs and GPUs, which can be beneficial
for smaller message sizes. This better represents the real-world use of RCCL in deep learning frameworks like
PyTorch and TensorFlow.
Use the following script to run the RCCL test for four MI300X GPU nodes. Modify paths and node addresses as needed.
.. code-block:: shell
/home/$USER/ompi_for_gpu/ompi/bin/mpirun -np 32 -H tw022:8,tw024:8,tw010:8,tw015:8 \
--mca pml ucx \
--mca btl ^openib \
-x NCCL_SOCKET_IFNAME=ens50f0np0 \
-x NCCL_IB_HCA=rdma0:1,rdma1:1,rdma2:1,rdma3:1,rdma4:1,rdma5:1,rdma6:1,rdma7:1 \
-x NCCL_IB_GID_INDEX=3 \
-x NCCL_MIN_NCHANNELS=40 \
-x NCCL_DEBUG=version \
$HOME/rccl-tests/build/all_reduce_perf -b 8 -e 8g -f 2 -g 1
.. image:: ../../../data/how-to/rocm-for-ai/rccl-tests-4-mi300x-gpu-nodes.png
:width: 800

View File

@@ -1,503 +0,0 @@
.. meta::
:description: How to train a model using ROCm Megatron-LM
:keywords: ROCm, AI, LLM, train, Megatron-LM, megatron, Llama, tutorial, docker, torch
**************************************
Training a model with ROCm Megatron-LM
**************************************
.. _amd-megatron-lm:
The ROCm Megatron-LM framework is a specialized fork of the robust Megatron-LM, designed to
enable efficient training of large-scale language models on AMD GPUs. By leveraging AMD Instinct™ MI300X
accelerators, AMD Megatron-LM delivers enhanced scalability, performance, and resource utilization for AI
workloads. It is purpose-built to :ref:`support models <amd-megatron-lm-model-support>`
like Meta's Llama 2, Llama 3, and Llama 3.1, enabling developers to train next-generation AI models with greater
efficiency. See the GitHub repository at `<https://github.com/ROCm/Megatron-LM>`__.
For ease of use, AMD provides a ready-to-use Docker image for MI300X accelerators containing essential
components, including PyTorch, PyTorch Lightning, ROCm libraries, and Megatron-LM utilities. It contains the
following software to accelerate training workloads:
+--------------------------+--------------------------------+
| Software component | Version |
+==========================+================================+
| ROCm | 6.1 |
+--------------------------+--------------------------------+
| PyTorch | 2.4.0 |
+--------------------------+--------------------------------+
| PyTorch Lightning | 2.4.0 |
+--------------------------+--------------------------------+
| Megatron Core | 0.9.0 |
+--------------------------+--------------------------------+
| Transformer Engine | 1.5.0 |
+--------------------------+--------------------------------+
| Flash Attention | v2.6 |
+--------------------------+--------------------------------+
| Transformers | 4.44.0 |
+--------------------------+--------------------------------+
Supported features and models
=============================
Megatron-LM provides the following key features to train large language models efficiently:
- Transformer Engine (TE)
- APEX
- GEMM tuning
- Torch.compile
- 3D parallelism: TP + SP + CP
- Distributed optimizer
- Flash Attention (FA) 2
- Fused kernels
- Pre-training
.. _amd-megatron-lm-model-support:
The following models are pre-optimized for performance on the AMD Instinct MI300X accelerator.
* Llama 2 7B
* Llama 2 70B
* Llama 3 8B
* Llama 3 70B
* Llama 3.1 8B
* Llama 3.1 70B
Prerequisite system validation steps
====================================
Complete the following system validation and optimization steps to set up your system before starting training.
Disable NUMA auto-balancing
---------------------------
Generally, application performance can benefit from disabling NUMA auto-balancing. However,
it might be detrimental to performance with certain types of workloads.
Run the command ``cat /proc/sys/kernel/numa_balancing`` to check your current NUMA (Non-Uniform
Memory Access) settings. Output ``0`` indicates this setting is disabled. If there is no output or
the output is ``1``, run the following command to disable NUMA auto-balancing.
.. code-block:: shell
sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
See :ref:`mi300x-disable-numa` for more information.
Hardware verification with ROCm
-------------------------------
Use the command ``rocm-smi --setperfdeterminism 1900`` to set the max clock speed up to 1900 MHz
instead of the default 2100 MHz. This can reduce the chance of a PCC event lowering the attainable
GPU clocks. This setting will not be required for new IFWI releases with the production PRC feature.
You can restore this setting to its default value with the ``rocm-smi -r`` command.
Run the command:
.. code-block:: shell
rocm-smi --setperfdeterminism 1900
See :ref:`mi300x-hardware-verification-with-rocm` for more information.
RCCL Bandwidth Test
-------------------
ROCm Collective Communications Library (RCCL) is a standalone library of standard collective communication
routines for GPUs. See the :doc:`RCCL documentation <rccl:index>` for more information. Before starting
pre-training, running a RCCL bandwidth test helps ensure that the multi-GPU or multi-node setup is optimized
for efficient distributed training.
Running the RCCL bandwidth test helps verify that:
- The GPUs can communicate across nodes or within a single node.
- The interconnect (such as InfiniBand, Ethernet, or Infinity Fabric) is functioning as expected and
provides adequate bandwidth for communication.
- There are no hardware setup or cabling issues that could affect communication between GPUs.
Tuning and optimizing hyperparameters
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
In distributed training, specific hyperparameters related to distributed communication can be tuned based on
the results of the RCCL bandwidth test. These variables are already set in the Docker image:
.. code-block:: shell
# force all RCCL streams to be high priority
export TORCH_NCCL_HIGH_PRIORITY=1
# specify which RDMA interfaces to use for communication
export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7
# define the Global ID index used in RoCE mode
export NCCL_IB_GID_INDEX=3
# avoid data corruption/mismatch issue that existed in past releases
export RCCL_MSCCL_ENABLE=0
Running the RCCL Bandwidth Test
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
It's recommended that you run the RCCL bandwidth test before launching training to confirm that system performance is sufficient. The RCCL tests are not included in the AMD Megatron-LM Docker
image; follow the instructions in `<https://github.com/ROCm/rccl-tests>`__ to get started.
See :ref:`mi300x-rccl` for more information.
Run on 8 GPUs (``-g 8``), scanning from 8 bytes to 10 GB:
.. code-block:: shell
./build/all_reduce_perf -b 8 -e 10G -f 2 -g 8
.. image:: ../../../data/how-to/rocm-for-ai/rccl-tests-8-gpu.png
:width: 800
Using one MPI process per GPU and ``-g 1`` for performance-oriented runs on both single-node and multi-node is
recommended. So, a run on 8 GPUs looks something like:
.. code-block:: shell
mpirun -np 8 --bind-to numa ./build/all_reduce_perf -b 8 -e 10G -f 2 -g 1
.. image:: ../../../data/how-to/rocm-for-ai/rccl-tests-1-mpi-process-per-gpu.png
:width: 800
Running with one MPI process per GPU ensures a one-to-one mapping between CPUs and GPUs, which can be beneficial
for smaller message sizes. This better represents the real-world use of RCCL in deep learning frameworks like
PyTorch and TensorFlow.
Use the following script to run the RCCL test across four MI300X GPU nodes. Modify the paths and node addresses as needed.
.. code-block:: shell
/home/$USER/ompi_for_gpu/ompi/bin/mpirun -np 32 -H tw022:8,tw024:8,tw010:8,tw015:8 \
--mca pml ucx \
--mca btl ^openib \
-x NCCL_SOCKET_IFNAME=ens50f0np0 \
-x NCCL_IB_HCA=rdma0:1,rdma1:1,rdma2:1,rdma3:1,rdma4:1,rdma5:1,rdma6:1,rdma7:1 \
-x NCCL_IB_GID_INDEX=3 \
-x NCCL_MIN_NCHANNELS=40 \
-x NCCL_DEBUG=version \
$HOME/rccl-tests/build/all_reduce_perf -b 8 -e 8g -f 2 -g 1
.. image:: ../../../data/how-to/rocm-for-ai/rccl-tests-4-mi300x-gpu-nodes.png
:width: 800
.. _mi300x-amd-megatron-lm-training:
Start training on MI300X accelerators
=====================================
The pre-built ROCm Megatron-LM environment allows users to quickly validate system performance, conduct
training benchmarks, and achieve superior performance for models like Llama 2 and Llama 3.1.
Use the following instructions to set up the environment, configure the script to train models, and
reproduce the benchmark results on the MI300X accelerators with the AMD Megatron-LM Docker
image.
.. _amd-megatron-lm-requirements:
Download the Docker image and required packages
-----------------------------------------------
1. Use the following command to pull the Docker image from Docker Hub.
.. code-block:: shell
docker pull rocm/megatron-lm:24.12-dev
2. Launch the Docker container.
.. code-block:: shell
docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $CACHE_DIR:/root/.cache --name megatron-dev-env rocm/megatron-lm:24.12-dev /bin/bash
3. Clone the ROCm Megatron-LM repository to a local directory and install the required packages on the host machine.
.. code-block:: shell
git clone https://github.com/ROCm/Megatron-LM
cd Megatron-LM
.. note::
This release is validated with ``ROCm/Megatron-LM`` commit `bb93ccb <https://github.com/ROCm/Megatron-LM/tree/bb93ccbfeae6363c67b361a97a27c74ab86e7e92>`_.
Checking out this specific commit is recommended for a stable and reproducible environment.
.. code-block:: shell
git checkout bb93ccbfeae6363c67b361a97a27c74ab86e7e92
Prepare training datasets
-------------------------
If you already have the preprocessed data, you can skip this section.
Use the following command to process datasets. GPT data is used here as an example. You can change the merge table, add an
end-of-document token, remove sentence splitting, and adjust the tokenizer type.
.. code-block:: shell
python tools/preprocess_data.py \
--input my-corpus.json \
--output-prefix my-gpt2 \
--vocab-file gpt2-vocab.json \
--tokenizer-type GPT2BPETokenizer \
--merge-file gpt2-merges.txt \
--append-eod
In this case, the automatically generated output files are named ``my-gpt2_text_document.bin`` and
``my-gpt2_text_document.idx``.
.. image:: ../../../data/how-to/rocm-for-ai/prep-training-datasets-my-gpt2-text-document.png
:width: 800
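To confirm that preprocessing produced both files, list them before moving on.
.. code-block:: shell
ls my-gpt2_text_document.*
# Expected output: my-gpt2_text_document.bin  my-gpt2_text_document.idx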
.. _amd-megatron-lm-environment-setup:
Environment setup
-----------------
In the ``examples/llama`` directory of Megatron-LM, use the ``train_llama2.sh`` configuration script if
you're working with Llama 2 7B or Llama 2 70B. Likewise, if you're working with Llama 3 or Llama 3.1, use
``train_llama3.sh`` and update the configuration script accordingly.
Network interface
^^^^^^^^^^^^^^^^^
To avoid connectivity issues, ensure the correct network interface is set in your training scripts.
1. Run the following command to find the active network interface on your system.
.. code-block:: shell
ip a
2. Update the ``NCCL_SOCKET_IFNAME`` and ``GLOO_SOCKET_IFNAME`` variables with your system's network interface. For
example:
.. code-block:: shell
export NCCL_SOCKET_IFNAME=ens50f0np0
export GLOO_SOCKET_IFNAME=ens50f0np0
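If you're unsure which interface to choose from the ``ip a`` output, one common heuristic (a convenience, not part of the training scripts) is to pick the interface that carries the default route:
.. code-block:: shell
ip route get 8.8.8.8 | awk '{for (i = 1; i < NF; i++) if ($i == "dev") print $(i + 1)}'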
Dataset options
^^^^^^^^^^^^^^^
You can use either mock data or real data for training.
* If you're using a real dataset, update the ``DATA_PATH`` variable to point to the location of your dataset.
.. code-block:: shell
DATA_DIR="/root/.cache/data" # Change to where your dataset is stored
DATA_PATH=${DATA_DIR}/bookcorpus_text_sentence
Then pass the path to the training script through the ``--data-path`` argument.
.. code-block:: shell
--data-path $DATA_PATH
Ensure that the files are accessible inside the Docker container, for example, through a bind mount (see the sketch after this list).
* Mock data can be useful for testing and validation. If you're using mock data, replace ``--data-path $DATA_PATH`` with the ``--mock-data`` option.
.. code-block:: shell
--mock-data
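For reference, a bind mount can expose a host dataset directory at the path the training script expects. The host path below is hypothetical, and most flags from the earlier ``docker run`` command are omitted for brevity.
.. code-block:: shell
docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host \
-v /data/bookcorpus:/root/.cache/data \
rocm/megatron-lm:24.12-dev /bin/bash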
Tokenizer
^^^^^^^^^
Tokenization is the process of converting raw text into tokens that can be processed by the model. For Llama
models, this typically involves sub-word tokenization, where words are broken down into smaller units based on
a fixed vocabulary. The tokenizer is trained along with the model on a large corpus of text, and it learns a
fixed vocabulary that can represent a wide range of text from different domains. This allows Llama models to
handle a variety of input sequences, including unseen words or domain-specific terms.
To train any of the Llama 2 models that this Docker image supports, use the ``Llama2Tokenizer``.
To train any of the Llama 3 and Llama 3.1 models that this Docker image supports, use the ``HuggingFaceTokenizer``.
Set the Hugging Face model link in the ``TOKENIZER_MODEL`` variable.
For example, if you're using the Llama 3.1 8B model:
.. code-block:: shell
TOKENIZER_MODEL=meta-llama/Llama-3.1-8B
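As an optional sanity check (not part of the training scripts), you can load the same tokenizer with the Hugging Face ``transformers`` package and inspect its output. This assumes ``transformers`` is installed and your Hugging Face account has access to the gated model.
.. code-block:: shell
python - <<'EOF'
from transformers import AutoTokenizer
# Sub-word tokenization splits rare words into smaller vocabulary units.
tok = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B")
print(tok.tokenize("Megatron-LM pretraining on MI300X accelerators"))
EOF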
Run benchmark tests
-------------------
.. note::
If you're running **multi-node training**, update the following environment variables. They can
also be passed as command-line arguments.
* Change ``localhost`` to the master node's hostname:
.. code-block:: shell
MASTER_ADDR="${MASTER_ADDR:-localhost}"
* Set the number of nodes you want to train on (for instance, ``2``, ``4``, ``8``):
.. code-block:: shell
NNODES="${NNODES:-1}"
* Set the rank of each node (0 for master, 1 for the first worker node, and so on):
.. code-block:: shell
NODE_RANK="${NODE_RANK:-0}"
* Use this command to run a performance benchmark test of any of the Llama 2 models that this Docker image supports (see :ref:`variables <amd-megatron-lm-benchmark-test-vars>`).
.. code-block:: shell
{variables} bash examples/llama/train_llama2.sh
* Use this command to run a performance benchmark test of any of the Llama 3 and Llama 3.1 models that this Docker image supports (see :ref:`variables <amd-megatron-lm-benchmark-test-vars>`).
.. code-block:: shell
{variables} bash examples/llama/train_llama3.sh
.. _amd-megatron-lm-benchmark-test-vars:
The benchmark tests support the same set of variables:
+--------------------------+-----------------------+-----------------------+
| Name                     | Options               | Description           |
+==========================+=======================+=======================+
| ``TEE_OUTPUT``           | 0 or 1                | 0: disable training   |
|                          |                       | log                   |
|                          |                       |                       |
|                          |                       | 1: enable training    |
|                          |                       | log                   |
+--------------------------+-----------------------+-----------------------+
| ``MBS``                  |                       | Micro batch size      |
+--------------------------+-----------------------+-----------------------+
| ``BS``                   |                       | Batch size            |
+--------------------------+-----------------------+-----------------------+
| ``TP``                   | 1, 2, 4, 8            | Tensor parallel size  |
+--------------------------+-----------------------+-----------------------+
| ``TE_FP8``               | 0 or 1                | Datatype.             |
|                          |                       |                       |
|                          |                       | 1: FP8                |
|                          |                       |                       |
|                          |                       | 0: BF16               |
+--------------------------+-----------------------+-----------------------+
| ``NO_TORCH_COMPILE``     | 0 or 1                | 1: enable             |
|                          |                       | torch.compile         |
|                          |                       |                       |
|                          |                       | 0: disable            |
|                          |                       | torch.compile         |
|                          |                       | (default)             |
+--------------------------+-----------------------+-----------------------+
| ``SEQ_LENGTH``           |                       | Input sequence length |
+--------------------------+-----------------------+-----------------------+
| ``GEMM_TUNING``          | 0 or 1                | 1: enable GEMM        |
|                          |                       | tuning                |
|                          |                       |                       |
|                          |                       | 0: disable GEMM       |
|                          |                       | tuning                |
+--------------------------+-----------------------+-----------------------+
| ``USE_FLASH_ATTN``       | 0 or 1                | 0: disable Flash      |
|                          |                       | Attention             |
|                          |                       |                       |
|                          |                       | 1: enable Flash       |
|                          |                       | Attention             |
+--------------------------+-----------------------+-----------------------+
| ``ENABLE_PROFILING``     | 0 or 1                | 0: disable torch      |
|                          |                       | profiling             |
|                          |                       |                       |
|                          |                       | 1: enable torch       |
|                          |                       | profiling             |
+--------------------------+-----------------------+-----------------------+
| ``MODEL_SIZE``           |                       | The size of the       |
|                          |                       | model: 7B, 70B, and   |
|                          |                       | so on                 |
+--------------------------+-----------------------+-----------------------+
| ``TOTAL_ITERS``          |                       | Total number of       |
|                          |                       | iterations            |
+--------------------------+-----------------------+-----------------------+
| ``transformer-impl``     | transformer_engine    | Transformer           |
|                          | or local              | implementation.       |
|                          |                       | Uses                  |
|                          |                       | transformer_engine    |
|                          |                       | by default            |
+--------------------------+-----------------------+-----------------------+
Benchmarking examples
^^^^^^^^^^^^^^^^^^^^^
.. tab-set::
.. tab-item:: Single node training
:sync: single
Use this command to run training with the Llama 2 7B model on a single node. You can specify MBS, BS, TP,
datatype, and so on.
.. code-block:: bash
TEE_OUTPUT=1 MBS=5 BS=120 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1 \
SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh
You can find the training logs at the location defined in ``$TRAIN_LOG`` in the :ref:`configuration script <amd-megatron-lm-environment-setup>`.
See the sample output:
.. image:: ../../../data/how-to/rocm-for-ai/llama2-7b-training-log-sample.png
:width: 800
.. tab-item:: Multi node training
:sync: multi
Launch the Docker container on each node.
In this example, run training with the Llama 2 7B model on 2 nodes with a specific MBS, BS, TP, datatype, and
so on.
On the master node:
.. code-block:: bash
TEE_OUTPUT=1 MBS=4 BS=64 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1 \
SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh
On the worker node:
.. code-block:: bash
TEE_OUTPUT=1 MBS=4 BS=64 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1 \
SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh
You can find the training logs at the location defined in ``$TRAIN_LOG`` in the :ref:`configuration script <amd-megatron-lm-environment-setup>`.
Sample output for 2-node training:
Master node:
.. image:: ../../../data/how-to/rocm-for-ai/2-node-training-master.png
:width: 800
Worker node:
.. image:: ../../../data/how-to/rocm-for-ai/2-node-training-worker.png
:width: 800
View File
@@ -21,8 +21,6 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
- Model
- Architecture
- LLVM target name
- Device Major version
- Device Minor version
- VRAM (GiB)
- Compute Units
- Wavefront Size
@@ -34,12 +32,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
- L1 Instruction Cache (KiB)
- VGPR File (KiB)
- SGPR File (KiB)
- GFXIP Major version
- GFXIP Minor version
*
- MI325X
- CDNA3
- gfx942
- 9
- 4
- 256
- 304 (38 per XCD)
- 64
@@ -51,12 +49,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
- 64 per 2 CUs
- 512
- 12.5
- 9
- 4
*
- MI300X
- CDNA3
- gfx942
- 9
- 4
- 192
- 304 (38 per XCD)
- 64
@@ -68,12 +66,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
- 64 per 2 CUs
- 512
- 12.5
- 9
- 4
*
- MI300A
- CDNA3
- gfx942
- 9
- 4
- 128
- 228 (38 per XCD)
- 64
@@ -85,12 +83,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
- 64 per 2 CUs
- 512
- 12.5
- 9
- 4
*
- MI250X
- CDNA2
- gfx90a
- 9
- 0
- 128
- 220 (110 per GCD)
- 64
@@ -102,12 +100,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
- 32 per 2 CUs
- 512
- 12.5
- 9
- 0
*
- MI250
- CDNA2
- gfx90a
- 9
- 0
- 128
- 208 (104 per GCD)
- 64
@@ -119,12 +117,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
- 32 per 2 CUs
- 512
- 12.5
- 9
- 0
*
- MI210
- CDNA2
- gfx90a
- 9
- 0
- 64
- 104
- 64
@@ -136,12 +134,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
- 32 per 2 CUs
- 512
- 12.5
- 9
- 0
*
- MI100
- CDNA
- gfx908
- 9
- 0
- 32
- 120
- 64
@@ -153,12 +151,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
- 32 per 3 CUs
- 256 VGPR and 256 AccVGPR
- 12.5
- 9
- 0
*
- MI60
- GCN5.1
- gfx906
- 9
- 0
- 32
- 64
- 64
@@ -170,12 +168,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
- 32 per 3 CUs
- 256
- 12.5
- 9
- 0
*
- MI50 (32GB)
- GCN5.1
- gfx906
- 9
- 0
- 32
- 60
- 64
@@ -187,12 +185,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
- 32 per 3 CUs
- 256
- 12.5
- 9
- 0
*
- MI50 (16GB)
- GCN5.1
- gfx906
- 9
- 0
- 16
- 60
- 64
@@ -204,12 +202,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
- 32 per 3 CUs
- 256
- 12.5
- 9
- 0
*
- MI25
- GCN5.0
- gfx900
- 9
- 0
- 16
- 64
- 64
@@ -221,12 +219,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
- 32 per 3 CUs
- 256
- 12.5
- 9
- 0
*
- MI8
- GCN3.0
- gfx803
- 8
- 0
- 4
- 64
- 64
@@ -238,12 +236,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
- 32 per 4 CUs
- 256
- 12.5
- 8
- 0
*
- MI6
- GCN4.0
- gfx803
- 8
- 0
- 16
- 36
- 64
@@ -255,6 +253,8 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
- 32 per 4 CUs
- 256
- 12.5
- 8
- 0
.. tab-item:: AMD Radeon PRO GPUs
@@ -266,8 +266,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
- Model
- Architecture
- LLVM target name
- Device Major version
- Device Minor version
- VRAM (GiB)
- Compute Units
- Wavefront Size
@@ -280,12 +279,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
- L0 Instruction Cache (KiB)
- VGPR File (KiB)
- SGPR File (KiB)
- GFXIP Major version
- GFXIP Minor version
*
- Radeon PRO V710
- RDNA3
- gfx1101
- 11
- 0
- 28
- 54
- 32
@@ -298,12 +297,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
- 32
- 768
- 16
- 11
- 0
*
- Radeon PRO W7900 Dual Slot
- RDNA3
- gfx1100
- 11
- 0
- 48
- 96
- 32
@@ -316,12 +315,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
- 32
- 768
- 16
- 11
- 0
*
- Radeon PRO W7900
- RDNA3
- gfx1100
- 11
- 0
- 48
- 96
- 32
@@ -334,12 +333,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
- 32
- 768
- 16
- 11
- 0
*
- Radeon PRO W7800
- RDNA3
- gfx1100
- 11
- 0
- 32
- 70
- 32
@@ -352,12 +351,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
- 32
- 768
- 16
- 11
- 0
*
- Radeon PRO W7700
- RDNA3
- gfx1101
- 11
- 0
- 16
- 48
- 32
@@ -370,12 +369,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
- 32
- 768
- 16
- 11
- 0
*
- Radeon PRO W6800
- RDNA2
- gfx1030
- 10
- 3
- 32
- 60
- 32
@@ -388,12 +387,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
- 32
- 512
- 16
- 10
- 3
*
- Radeon PRO W6600
- RDNA2
- gfx1032
- 10
- 3
- 8
- 28
- 32
@@ -406,12 +405,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
- 32
- 512
- 16
- 10
- 3
*
- Radeon PRO V620
- RDNA2
- gfx1030
- 10
- 3
- 32
- 72
- 32
@@ -424,12 +423,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
- 32
- 512
- 16
- 10
- 3
*
- Radeon Pro W5500
- RDNA
- gfx1012
- 10
- 1
- 8
- 22
- 32
@@ -442,12 +441,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
- 32
- 512
- 20
- 10
- 1
*
- Radeon Pro VII
- GCN5.1
- gfx906
- 9
- 0
- 16
- 60
- 64
@@ -460,6 +459,8 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
- 32 per 3 CUs
- 256
- 12.5
- 9
- 0
.. tab-item:: AMD Radeon GPUs
@@ -471,8 +472,6 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
- Model
- Architecture
- LLVM target name
- Device Major version
- Device Minor version
- VRAM (GiB)
- Compute Units
- Wavefront Size
@@ -485,12 +484,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
- L0 Instruction Cache (KiB)
- VGPR File (KiB)
- SGPR File (KiB)
- GFXIP Major version
- GFXIP Minor version
*
- Radeon RX 7900 XTX
- RDNA3
- gfx1100
- 11
- 0
- 24
- 96
- 32
@@ -503,12 +502,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
- 32
- 768
- 16
- 11
- 0
*
- Radeon RX 7900 XT
- RDNA3
- gfx1100
- 11
- 0
- 20
- 84
- 32
@@ -521,12 +520,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
- 32
- 768
- 16
- 11
- 0
*
- Radeon RX 7900 GRE
- RDNA3
- gfx1100
- 11
- 0
- 16
- 80
- 32
@@ -539,12 +538,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
- 32
- 768
- 16
- 11
- 0
*
- Radeon RX 7800 XT
- RDNA3
- gfx1101
- 11
- 0
- 16
- 60
- 32
@@ -557,12 +556,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
- 32
- 768
- 16
- 11
- 0
*
- Radeon RX 7700 XT
- RDNA3
- gfx1101
- 11
- 0
- 12
- 54
- 32
@@ -575,12 +574,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
- 32
- 768
- 16
- 11
- 0
*
- Radeon RX 7600
- RDNA3
- gfx1102
- 11
- 0
- 8
- 32
- 32
@@ -593,12 +592,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
- 32
- 512
- 16
- 11
- 0
*
- Radeon RX 6950 XT
- RDNA2
- gfx1030
- 10
- 3
- 16
- 80
- 32
@@ -611,12 +610,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
- 32
- 512
- 16
- 10
- 3
*
- Radeon RX 6900 XT
- RDNA2
- gfx1030
- 10
- 3
- 16
- 80
- 32
@@ -629,12 +628,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
- 32
- 512
- 16
- 10
- 3
*
- Radeon RX 6800 XT
- RDNA2
- gfx1030
- 10
- 3
- 16
- 72
- 32
@@ -647,12 +646,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
- 32
- 512
- 16
- 10
- 3
*
- Radeon RX 6800
- RDNA2
- gfx1030
- 10
- 3
- 16
- 60
- 32
@@ -665,12 +664,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
- 32
- 512
- 16
- 10
- 3
*
- Radeon RX 6750 XT
- RDNA2
- gfx1031
- 10
- 3
- 12
- 40
- 32
@@ -683,12 +682,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
- 32
- 512
- 16
- 10
- 3
*
- Radeon RX 6700 XT
- RDNA2
- gfx1031
- 10
- 3
- 12
- 40
- 32
@@ -701,13 +700,13 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
- 32
- 512
- 16
- 10
- 3
*
- Radeon RX 6700
- RDNA2
- gfx1031
- 10
- 3
- 10
- 36
- 32
- 128
@@ -719,12 +718,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
- 32
- 512
- 16
- 10
- 3
*
- Radeon RX 6650 XT
- RDNA2
- gfx1032
- 10
- 3
- 8
- 32
- 32
@@ -737,12 +736,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
- 32
- 512
- 16
- 10
- 3
*
- Radeon RX 6600 XT
- RDNA2
- gfx1032
- 10
- 3
- 8
- 32
- 32
@@ -755,12 +754,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
- 32
- 512
- 16
- 10
- 3
*
- Radeon RX 6600
- RDNA2
- gfx1032
- 10
- 3
- 8
- 28
- 32
@@ -773,12 +772,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
- 32
- 512
- 16
- 10
- 3
*
- Radeon VII
- GCN5.1
- gfx906
- 9
- 0
- 16
- 60
- 64
@@ -791,6 +790,8 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
- 32 per 3 CUs
- 256
- 12.5
- 9
- 0
Glossary
========
@@ -804,18 +805,6 @@ For more information about the terms used, see the
Argument to pass to clang in ``--offload-arch`` to compile code for the given
architecture.
**Device major version**
Indicates the core instruction set of the GPU architecture. For example, a value
of 11 would correspond to Navi III (RDNA3).
**Device minor version**
Indicates a particular configuration, feature set, or variation within the group
represented by the device compute version. For example, different models within
the same major version might have varying levels of support for certain features
or optimizations.
**VRAM**
Amount of memory available on the GPU.
@@ -898,6 +887,26 @@ Purpose Vector Registers, used specifically in matrix instructions.
Size of the Scalar General Purpose Register (SGPR) file. Holds data used in
scalar instructions.
**GFXIP**
GFXIP (Graphics IP) is a versioning system used by AMD to identify the GPU
architecture and its instruction set. It helps categorize different generations
of GPUs and their feature sets.
**GFXIP major version**
Defines the GPU's core instruction set and architecture, which determines
compatibility with software stacks such as HIP and OpenCL. For example, a GFXIP
11 major version corresponds to the RDNA 3 (Navi 3x) architecture, influencing
driver support and available compute features.
**GFXIP minor version**
Represents specific variations within a GFXIP major version and affects feature sets,
optimizations, and driver behavior in software stacks such as HIP and OpenCL. Different
GPU models within the same major version can have unique capabilities, impacting
performance and supported instructions.
**GCD**
Graphics Compute Die.
View File
@@ -40,11 +40,13 @@ subtrees:
title: Training
subtrees:
- entries:
- file: how-to/rocm-for-ai/training/train-a-model.rst
title: Train a model
- file: how-to/rocm-for-ai/training/benchmark-docker/megatron-lm
title: Train a model with Megatron-LM
- file: how-to/rocm-for-ai/training/benchmark-docker/pytorch-training
title: Train a model with PyTorch
- file: how-to/rocm-for-ai/training/scale-model-training.rst
title: Scale model training
- file: how-to/rocm-for-ai/fine-tuning/index.rst
title: Fine-tuning LLMs
subtrees:
View File
@@ -1,3 +1,3 @@
rocm-docs-core==1.15.0
rocm-docs-core==1.17.0
sphinx-reredirects
sphinx-sitemap
View File
@@ -1,5 +1,5 @@
#
# This file is autogenerated by pip-compile with Python 3.10
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile requirements.in
@@ -8,6 +8,8 @@ accessible-pygments==0.0.5
# via pydata-sphinx-theme
alabaster==1.0.0
# via sphinx
appnope==0.1.4
# via ipykernel
asttokens==3.0.0
# via stack-data
attrs==25.1.0
@@ -23,7 +25,7 @@ beautifulsoup4==4.12.3
# via pydata-sphinx-theme
breathe==4.35.0
# via rocm-docs-core
certifi==2024.8.30
certifi==2024.12.14
# via requests
cffi==1.17.1
# via
@@ -37,7 +39,7 @@ click==8.1.7
# sphinx-external-toc
comm==0.2.2
# via ipykernel
cryptography==44.0.1
cryptography==44.0.0
# via pyjwt
debugpy==1.8.12
# via ipykernel
@@ -51,11 +53,9 @@ docutils==0.21.2
# myst-parser
# pydata-sphinx-theme
# sphinx
exceptiongroup==1.2.2
# via ipython
executing==2.2.0
# via stack-data
fastjsonschema==2.20.0
fastjsonschema==2.21.1
# via
# nbformat
# rocm-docs-core
@@ -63,8 +63,6 @@ gitdb==4.0.11
# via gitpython
gitpython==3.1.43
# via rocm-docs-core
greenlet==3.1.1
# via sqlalchemy
idna==3.10
# via requests
imagesize==1.4.1
@@ -75,13 +73,13 @@ importlib-metadata==8.6.1
# myst-nb
ipykernel==6.29.5
# via myst-nb
ipython==8.31.0
ipython==8.32.0
# via
# ipykernel
# myst-nb
jedi==0.19.2
# via ipython
jinja2==3.1.5
jinja2==3.1.4
# via
# myst-parser
# sphinx
@@ -115,7 +113,7 @@ mdit-py-plugins==0.4.2
# via myst-parser
mdurl==0.1.2
# via markdown-it-py
myst-nb==1.1.2
myst-nb==1.2.0
# via rocm-docs-core
myst-parser==4.0.0
# via myst-nb
@@ -142,7 +140,7 @@ platformdirs==4.3.6
# via jupyter-core
prompt-toolkit==3.0.50
# via ipython
psutil==6.1.1
psutil==7.0.0
# via ipykernel
ptyprocess==0.7.0
# via pexpect
@@ -150,7 +148,7 @@ pure-eval==0.2.3
# via stack-data
pycparser==2.22
# via cffi
pydata-sphinx-theme==0.16.0
pydata-sphinx-theme==0.16.1
# via
# rocm-docs-core
# sphinx-book-theme
@@ -162,7 +160,7 @@ pygments==2.18.0
# ipython
# pydata-sphinx-theme
# sphinx
pyjwt[crypto]==2.10.0
pyjwt[crypto]==2.10.1
# via pygithub
pynacl==1.5.0
# via pygithub
@@ -175,7 +173,7 @@ pyyaml==6.0.2
# myst-parser
# rocm-docs-core
# sphinx-external-toc
pyzmq==26.2.0
pyzmq==26.2.1
# via
# ipykernel
# jupyter-client
@@ -187,7 +185,7 @@ requests==2.32.3
# via
# pygithub
# sphinx
rocm-docs-core==1.15.0
rocm-docs-core==1.17.0
# via -r requirements.in
rpds-py==0.22.3
# via
@@ -241,14 +239,12 @@ sphinxcontrib-qthelp==2.0.0
# via sphinx
sphinxcontrib-serializinghtml==2.0.0
# via sphinx
sqlalchemy==2.0.37
sqlalchemy==2.0.38
# via jupyter-cache
stack-data==0.6.3
# via ipython
tabulate==0.9.0
# via jupyter-cache
tomli==2.1.0
# via sphinx
tornado==6.4.2
# via
# ipykernel
View File
@@ -1,7 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<manifest>
<remote name="rocm-org" fetch="https://github.com/ROCm/" />
<default revision="refs/tags/rocm-6.3.2"
<default revision="refs/tags/rocm-6.3.3"
remote="rocm-org"
sync-c="true"
sync-j="4" />
View File
@@ -0,0 +1,77 @@
<?xml version="1.0" encoding="UTF-8"?>
<manifest>
<remote name="rocm-org" fetch="https://github.com/ROCm/" />
<default revision="refs/tags/rocm-6.3.3"
remote="rocm-org"
sync-c="true"
sync-j="4" />
<!--list of projects for ROCm-->
<project name="ROCm" revision="roc-6.3.x" />
<project name="ROCK-Kernel-Driver" />
<project name="ROCR-Runtime" />
<project name="amdsmi" />
<project name="rdc" />
<project name="rocm_bandwidth_test" />
<project name="rocm_smi_lib" />
<project name="rocm-core" />
<project name="rocm-examples" />
<project name="rocminfo" />
<project name="rocprofiler" />
<project name="rocprofiler-register" />
<project name="rocprofiler-sdk" />
<project name="rocprofiler-compute" />
<project name="rocprofiler-systems" />
<project name="roctracer" />
<!--HIP Projects-->
<project name="HIP" />
<project name="hip-tests" />
<project name="HIPIFY" />
<project name="clr" />
<project name="hipother" />
<!-- The following projects are all associated with the AMDGPU LLVM compiler -->
<project name="half" />
<project name="llvm-project" />
<!-- gdb projects -->
<project name="ROCdbgapi" />
<project name="ROCgdb" />
<project name="rocr_debug_agent" />
<!-- ROCm Libraries -->
<project groups="mathlibs" name="AMDMIGraphX" />
<project groups="mathlibs" name="MIOpen" />
<project groups="mathlibs" name="MIVisionX" />
<project groups="mathlibs" name="ROCmValidationSuite" />
<project groups="mathlibs" name="Tensile" />
<project groups="mathlibs" name="composable_kernel" />
<project groups="mathlibs" name="hipBLAS-common" />
<project groups="mathlibs" name="hipBLAS" />
<project groups="mathlibs" name="hipBLASLt" />
<project groups="mathlibs" name="hipCUB" />
<project groups="mathlibs" name="hipFFT" />
<project groups="mathlibs" name="hipRAND" />
<project groups="mathlibs" name="hipSOLVER" />
<project groups="mathlibs" name="hipSPARSE" />
<project groups="mathlibs" name="hipSPARSELt" />
<project groups="mathlibs" name="hipTensor" />
<project groups="mathlibs" name="hipfort" />
<project groups="mathlibs" name="rccl" />
<project groups="mathlibs" name="rocAL" />
<project groups="mathlibs" name="rocALUTION" />
<project groups="mathlibs" name="rocBLAS" />
<project groups="mathlibs" name="rocDecode" />
<project groups="mathlibs" name="rocJPEG" />
<project groups="mathlibs" name="rocPyDecode" />
<project groups="mathlibs" name="rocFFT" />
<project groups="mathlibs" name="rocPRIM" />
<project groups="mathlibs" name="rocRAND" />
<project groups="mathlibs" name="rocSOLVER" />
<project groups="mathlibs" name="rocSPARSE" />
<project groups="mathlibs" name="rocThrust" />
<project groups="mathlibs" name="rocWMMA" />
<project groups="mathlibs" name="rocm-cmake" />
<project groups="mathlibs" name="rpp" />
<project groups="mathlibs" name="TransferBench" />
<!-- Projects for OpenMP-Extras -->
<project name="aomp" path="openmp-extras/aomp" />
<project name="aomp-extras" path="openmp-extras/aomp-extras" />
<project name="flang" path="openmp-extras/flang" />
</manifest>