mirror of
https://github.com/ROCm/ROCm.git
synced 2026-01-09 14:48:06 -05:00
Compare commits
47 Commits
rocm-7.1.1
...
docs_6.3.3
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5393e90a8e | ||
|
|
fbc2815223 | ||
|
|
2b96a37b08 | ||
|
|
1e5ad14d86 | ||
|
|
f9d6bd4db8 | ||
|
|
23e78c8d55 | ||
|
|
0edd31bde6 | ||
|
|
4af488e27d | ||
|
|
7ae7046301 | ||
|
|
358092386e | ||
|
|
e071738908 | ||
|
|
cd79403931 | ||
|
|
275ef1d511 | ||
|
|
065fe8b138 | ||
|
|
be36c1808e | ||
|
|
64c362a961 | ||
|
|
d392eca232 | ||
|
|
1b58c08394 | ||
|
|
73ab81fbaf | ||
|
|
ddfb5bda12 | ||
|
|
ae7f47a0a2 | ||
|
|
5e5f7d6bb7 | ||
|
|
da1125e228 | ||
|
|
e55b9f2a33 | ||
|
|
761a524d03 | ||
|
|
c895ee483c | ||
|
|
e049d952d4 | ||
|
|
ce41922bb5 | ||
|
|
2b53b40caa | ||
|
|
9250e1ba28 | ||
|
|
3c055ab65b | ||
|
|
44aaf1b57c | ||
|
|
822e789998 | ||
|
|
243ac78609 | ||
|
|
c2f483332f | ||
|
|
b35267b6bd | ||
|
|
deb4895b11 | ||
|
|
8c036531e8 | ||
|
|
484cbefc2e | ||
|
|
721b60d52f | ||
|
|
8ebe7be283 | ||
|
|
7e8947fdb4 | ||
|
|
66cac5301f | ||
|
|
9f3a1de117 | ||
|
|
0915fb17e8 | ||
|
|
0d3eb1d774 | ||
|
|
7a258cdba9 |
@@ -1,29 +0,0 @@
|
||||
variables:
|
||||
- group: common
|
||||
- template: /.azuredevops/variables-global.yml
|
||||
|
||||
parameters:
|
||||
- name: checkoutRef
|
||||
type: string
|
||||
default: refs/tags/$(LATEST_RELEASE_TAG)
|
||||
|
||||
resources:
|
||||
repositories:
|
||||
- repository: pipelines_repo
|
||||
type: github
|
||||
endpoint: ROCm
|
||||
name: ROCm/ROCm
|
||||
- repository: release_repo
|
||||
type: github
|
||||
endpoint: ROCm
|
||||
name: ROCm/TransferBench
|
||||
ref: ${{ parameters.checkoutRef }}
|
||||
|
||||
trigger: none
|
||||
pr: none
|
||||
|
||||
jobs:
|
||||
- template: ${{ variables.CI_COMPONENT_PATH }}/TransferBench.yml
|
||||
parameters:
|
||||
checkoutRepo: release_repo
|
||||
checkoutRef: ${{ parameters.checkoutRef }}
|
||||
@@ -117,6 +117,7 @@ FX
|
||||
Filesystem
|
||||
FindDb
|
||||
Flang
|
||||
FluxBenchmark
|
||||
Fortran
|
||||
Fuyu
|
||||
GALB
|
||||
@@ -131,6 +132,7 @@ GDS
|
||||
GEMM
|
||||
GEMMs
|
||||
GFortran
|
||||
GFXIP
|
||||
Gemma
|
||||
GiB
|
||||
GIM
|
||||
@@ -154,6 +156,7 @@ HCA
|
||||
HGX
|
||||
HIPCC
|
||||
HIPExtension
|
||||
HIPification
|
||||
HIPIFY
|
||||
HIPification
|
||||
HIPify
|
||||
@@ -316,6 +319,7 @@ PipelineParallel
|
||||
PnP
|
||||
PowerEdge
|
||||
PowerShell
|
||||
Pretraining
|
||||
Profiler's
|
||||
PyPi
|
||||
Pytest
|
||||
@@ -715,6 +719,7 @@ preprocessing
|
||||
preprocessor
|
||||
prequantized
|
||||
prerequisites
|
||||
pretraining
|
||||
profiler
|
||||
profilers
|
||||
protobuf
|
||||
|
||||
@@ -56,7 +56,7 @@ Docker image compatibility
|
||||
|
||||
AMD validates and publishes ready-made `PyTorch images <https://hub.docker.com/r/rocm/pytorch>`_
|
||||
with ROCm backends on Docker Hub. The following Docker image tags and
|
||||
associated inventories are validated for `ROCm 6.3.0 <https://repo.radeon.com/rocm/apt/6.3/>`_.
|
||||
associated inventories are validated for `ROCm 6.3.3 <https://repo.radeon.com/rocm/apt/6.3.3/>`_.
|
||||
Click the |docker-icon| icon to view the image on Docker Hub.
|
||||
|
||||
.. list-table:: PyTorch Docker image components
|
||||
@@ -77,26 +77,26 @@ Click the |docker-icon| icon to view the image on Docker Hub.
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3_ubuntu24.04_py3.12_pytorch_release_2.4.0/images/sha256-98ddf20333bd01ff749b8092b1190ee369a75d3b8c71c2fac80ffdcb1a98d529?context=explore"><i class="fab fa-docker fa-lg"></i></a>
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3.3_ubuntu24.04_py3.12_pytorch_release_2.4.0/images/sha256-6c798857b2c9526b44ba535710b93a1737546acea79b53a93c646195c272f1d5"><i class="fab fa-docker fa-lg"></i></a>
|
||||
|
||||
- `2.4.0 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
|
||||
- 24.04
|
||||
- `3.12 <https://www.python.org/downloads/release/python-3128/>`_
|
||||
- `3.12.9 <https://www.python.org/downloads/release/python-3129/>`_
|
||||
- `1.4.0 <https://github.com/ROCm/apex/tree/release/1.4.0>`_
|
||||
- `0.19.0 <https://github.com/pytorch/vision/tree/v0.19.0>`_
|
||||
- `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13.0>`_
|
||||
- `master <https://bitbucket.org/icl/magma/src/master/>`_
|
||||
- `1.10.0 <https://github.com/openucx/ucx/tree/v1.10.0>`_
|
||||
- `4.0.7 <https://github.com/open-mpi/ompi/tree/v4.0.7>`_
|
||||
- `4.0.3 <https://github.com/open-mpi/ompi/tree/v4.0.3>`_
|
||||
- `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3_ubuntu22.04_py3.10_pytorch_release_2.4.0/images/sha256-402c9b4f1a6b5a81c634a1932b56cbe01abb699cfcc7463d226276997c6cf8ea?context=explore"><i class="fab fa-docker fa-lg"></i></a>
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3.3_ubuntu22.04_py3.10_pytorch_release_2.4.0/images/sha256-a09b21248133876fc8912a5ff4e6ee2c8d62b14120313e426b3dadda5702713d"><i class="fab fa-docker fa-lg"></i></a>
|
||||
|
||||
- `2.4.0 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
|
||||
- 22.04
|
||||
- `3.10 <https://www.python.org/downloads/release/python-31016/>`_
|
||||
- `3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
|
||||
- `1.4.0 <https://github.com/ROCm/apex/tree/release/1.4.0>`_
|
||||
- `0.19.0 <https://github.com/pytorch/vision/tree/v0.19.0>`_
|
||||
- `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13.0>`_
|
||||
@@ -107,11 +107,11 @@ Click the |docker-icon| icon to view the image on Docker Hub.
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3_ubuntu22.04_py3.9_pytorch_release_2.4.0/images/sha256-e0608b55d408c3bfe5c19fdd57a4ced3e0eb3a495b74c309980b60b156c526dd?context=explore"><i class="fab fa-docker fa-lg"></i></a>
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3.3_ubuntu22.04_py3.9_pytorch_release_2.4.0/images/sha256-963187534467f0f9da77996762fc1d112a6faa5372277c348a505533e7876ec8"><i class="fab fa-docker fa-lg"></i></a>
|
||||
|
||||
- `2.4.0 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
|
||||
- 22.04
|
||||
- `3.9.18 <https://www.python.org/downloads/release/python-3918/>`_
|
||||
- `3.9.21 <https://www.python.org/downloads/release/python-3921/>`_
|
||||
- `1.4.0 <https://github.com/ROCm/apex/tree/release/1.4.0>`_
|
||||
- `0.19.0 <https://github.com/pytorch/vision/tree/v0.19.0>`_
|
||||
- `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13.0>`_
|
||||
@@ -122,11 +122,11 @@ Click the |docker-icon| icon to view the image on Docker Hub.
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3_ubuntu22.04_py3.10_pytorch_release_2.3.0/images/sha256-652cf25263d05b1de548222970aeb76e60b12de101de66751264709c0d0ff9d8?context=explore"><i class="fab fa-docker fa-lg"></i></a>
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3.3_ubuntu22.04_py3.10_pytorch_release_2.3.0/images/sha256-952f2621bd2bf3078bef19061e05b209105a82a7908e7e6cdf85014938a4d93a"><i class="fab fa-docker fa-lg"></i></a>
|
||||
|
||||
- `2.3.0 <https://github.com/ROCm/pytorch/tree/release/2.3>`_
|
||||
- 22.04
|
||||
- `3.10 <https://www.python.org/downloads/release/python-31016/>`_
|
||||
- `3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
|
||||
- `1.3.0 <https://github.com/ROCm/apex/tree/release/1.3.0>`_
|
||||
- `0.18.0 <https://github.com/pytorch/vision/tree/v0.18.0>`_
|
||||
- `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13.0>`_
|
||||
@@ -137,7 +137,7 @@ Click the |docker-icon| icon to view the image on Docker Hub.
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3_ubuntu22.04_py3.10_pytorch_release_2.2.1/images/sha256-051976f26beab8f9aa65d999e3ad546c027b39240a0cc3ee81b114a9024f2912?context=explore"><i class="fab fa-docker fa-lg"></i></a>
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3.3_ubuntu22.04_py3.10_pytorch_release_2.2.1/images/sha256-a2fe20e170feb9e05da3e5728bb98e40d08567e137be8e6ba797962ed2852608"><i class="fab fa-docker fa-lg"></i></a>
|
||||
|
||||
- `2.2.1 <https://github.com/ROCm/pytorch/tree/release/2.2>`_
|
||||
- 22.04
|
||||
@@ -152,7 +152,7 @@ Click the |docker-icon| icon to view the image on Docker Hub.
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3_ubuntu20.04_py3.9_pytorch_release_2.2.1/images/sha256-88c839a364d109d3748c100385bfa100d28090d25118cc723fd0406390ab2f7e?context=explore"><i class="fab fa-docker fa-lg"></i></a>
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3.3_ubuntu20.04_py3.9_pytorch_release_2.2.1/images/sha256-7f231937c897cca5f89e360be33c70a2017d60f62d1fbe81292be48c15fe345b"><i class="fab fa-docker fa-lg"></i></a>
|
||||
|
||||
- `2.2.1 <https://github.com/ROCm/pytorch/tree/release/2.2>`_
|
||||
- 20.04
|
||||
@@ -167,14 +167,14 @@ Click the |docker-icon| icon to view the image on Docker Hub.
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3_ubuntu22.04_py3.9_pytorch_release_1.13.1/images/sha256-994424ed07a63113f79dd9aa72159124c00f5fbfe18127151e6658f7d0b6f821?context=explore"><i class="fab fa-docker fa-lg"></i></a>
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3.3_ubuntu22.04_py3.9_pytorch_release_1.13.1/images/sha256-616a47758004f91951e2da6c1fe291f903de65a7b2318d4b18359b48fe3032f4"><i class="fab fa-docker fa-lg"></i></a>
|
||||
|
||||
- `1.13.1 <https://github.com/ROCm/pytorch/tree/release/1.13>`_
|
||||
- 22.04
|
||||
- `3.9.21 <https://www.python.org/downloads/release/python-3921/>`_
|
||||
- `1.0.0 <https://github.com/ROCm/apex/tree/release/1.0.0>`_
|
||||
- `0.14.0 <https://github.com/pytorch/vision/tree/v0.14.0>`_
|
||||
- `2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18>`_
|
||||
- `2.19.0 <https://github.com/tensorflow/tensorboard/tree/2.19>`_
|
||||
- `master <https://bitbucket.org/icl/magma/src/master/>`_
|
||||
- `1.14.1 <https://github.com/openucx/ucx/tree/v1.14.1>`_
|
||||
- `4.1.5 <https://github.com/open-mpi/ompi/tree/v4.1.5>`_
|
||||
@@ -182,7 +182,7 @@ Click the |docker-icon| icon to view the image on Docker Hub.
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3_ubuntu20.04_py3.9_pytorch_release_1.13.1/images/sha256-7b8139fe40a9aeb4bca3aecd15c22c1fa96e867d93479fa3a24fdeeeeafa1219?context=explore"><i class="fab fa-docker fa-lg"></i></a>
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3.3_ubuntu20.04_py3.9_pytorch_release_1.13.1/images/sha256-a2cfb365aea58b84595e241ffdb0d5ef3e6566e98c10b5499f4aa29983a74ea2"><i class="fab fa-docker fa-lg"></i></a>
|
||||
|
||||
- `1.13.1 <https://github.com/ROCm/pytorch/tree/release/1.13>`_
|
||||
- 20.04
|
||||
|
||||
@@ -54,7 +54,7 @@ Docker image compatibility
|
||||
AMD validates and publishes ready-made `TensorFlow images
|
||||
<https://hub.docker.com/r/rocm/tensorflow>`_ with ROCm backends on
|
||||
Docker Hub. The following Docker image tags and associated inventories are
|
||||
validated for `ROCm 6.3.1 <https://repo.radeon.com/rocm/apt/6.3.1/>`_. Click
|
||||
validated for `ROCm 6.3.3 <https://repo.radeon.com/rocm/apt/6.3.3/>`_. Click
|
||||
the |docker-icon| icon to view the image on Docker Hub.
|
||||
|
||||
.. list-table:: TensorFlow Docker image components
|
||||
@@ -68,47 +68,47 @@ the |docker-icon| icon to view the image on Docker Hub.
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.3.1-py3.12-tf2.17.0-dev/images/sha256-804121ee4985718277ba7dcec53c57bdade130a1ef42f544b6c48090ad379c17"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
|
||||
<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.3.3-py3.12-tf2.17-dev/images/sha256-fd2653f436880366cc874aa24264ca9dabd892d76ccb63fb807debba459bcaaf"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
|
||||
|
||||
- `tensorflow-rocm 2.17.0 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.3/tensorflow_rocm-2.17.0-cp312-cp312-manylinux_2_28_x86_64.whl>`__
|
||||
- `tensorflow-rocm 2.17.0 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.3.3/tensorflow_rocm-2.17.0-cp312-cp312-manylinux_2_28_x86_64.whl>`__
|
||||
- dev
|
||||
- `Python 3.12 <https://www.python.org/downloads/release/python-3124/>`_
|
||||
- `Python 3.12.4 <https://www.python.org/downloads/release/python-3124/>`_
|
||||
- `TensorBoard 2.17.1 <https://github.com/tensorflow/tensorboard/tree/2.17.1>`_
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.3.1-py3.10-tf2.17.0-dev/images/sha256-776837ffa945913f6c466bfe477810a11453d21d5b6afb200be1c36e48fbc08e"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
|
||||
<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.3.3-py3.10-tf2.17-dev/images/sha256-8a5eb7443798935dd269575e2abae847b702e1dfb06766ab84f081a6314d8b95"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
|
||||
|
||||
- `tensorflow-rocm 2.17.0 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.3/tensorflow_rocm-2.17.0-cp310-cp310-manylinux_2_28_x86_64.whl>`__
|
||||
- `tensorflow-rocm 2.17.0 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.3.3/tensorflow_rocm-2.17.0-cp310-cp310-manylinux_2_28_x86_64.whl>`__
|
||||
- dev
|
||||
- `Python 3.10 <https://www.python.org/downloads/release/python-31012/>`_
|
||||
- `TensorBoard 2.17.0 <https://github.com/tensorflow/tensorboard/tree/2.17.0>`_
|
||||
- `Python 3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
|
||||
- `TensorBoard 2.17.1 <https://github.com/tensorflow/tensorboard/tree/2.17.1>`_
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.3.1-py3.12-tf2.16.2-dev/images/sha256-c793e1483e30809c3c28fc5d7805bedc033c73da224f839fff370717cb100944"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
|
||||
<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.3.3-py3.12-tf2.16-dev/images/sha256-8fc939b10cdd6d2b11407474880d4c8ab2b52ab6e2d1743c921fc2adbfd0422f"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
|
||||
|
||||
- `tensorflow-rocm 2.16.2 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.3/tensorflow_rocm-2.16.2-cp312-cp312-manylinux_2_28_x86_64.whl>`__
|
||||
- `tensorflow-rocm 2.16.2 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.3.3/tensorflow_rocm-2.16.2-cp312-cp312-manylinux_2_28_x86_64.whl>`__
|
||||
- dev
|
||||
- `Python 3.12 <https://www.python.org/downloads/release/python-3124/>`_
|
||||
- `Python 3.12.4 <https://www.python.org/downloads/release/python-3124/>`_
|
||||
- `TensorBoard 2.16.2 <https://github.com/tensorflow/tensorboard/tree/2.16.2>`_
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.3.1-py3.10-tf2.16.0-dev/images/sha256-263e78414ae85d7bcd52a025a94131d0a279872a45ed632b9165336dfdcd4443"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
|
||||
<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.3.3-py3.10-tf2.16-dev/images/sha256-a4cc6ab23d59fdf5459ceac1f0a603e6c16ae7f885d30e42c0c2b3ac60c2ad10"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
|
||||
|
||||
- `tensorflow-rocm 2.16.2 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.3/tensorflow_rocm-2.16.2-cp310-cp310-manylinux_2_28_x86_64.whl>`__
|
||||
- `tensorflow-rocm 2.16.2 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.3.3/tensorflow_rocm-2.16.2-cp310-cp310-manylinux_2_28_x86_64.whl>`__
|
||||
- dev
|
||||
- `Python 3.10 <https://www.python.org/downloads/release/python-31012/>`_
|
||||
- `Python 3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
|
||||
- `TensorBoard 2.16.2 <https://github.com/tensorflow/tensorboard/tree/2.16.2>`_
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.3.1-py3.10-tf2.15.0-dev/images/sha256-479046a8477ca701a9494a813ab17e8ab4f6baa54641e65dc8d07629f1e6a880"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
|
||||
<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.3.3-py3.10-tf2.15-dev/images/sha256-60887c488421184adcb60b9ed4f72a8bd7bdb64d238e50943ca7cbde38e4aa48"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
|
||||
|
||||
- `tensorflow-rocm 2.15.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.3/tensorflow_rocm-2.15.1-cp310-cp310-manylinux_2_28_x86_64.whl>`_
|
||||
- `tensorflow-rocm 2.15.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.3.3/tensorflow_rocm-2.15.1-cp310-cp310-manylinux_2_28_x86_64.whl>`_
|
||||
- dev
|
||||
- `Python 3.10 <https://www.python.org/downloads/release/python-31012/>`_
|
||||
- `Python 3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
|
||||
- `TensorBoard 2.15.2 <https://github.com/tensorflow/tensorboard/tree/2.15.2>`_
|
||||
|
||||
Critical ROCm libraries for TensorFlow
|
||||
|
||||
916
docs/compatibility/pytorch-compatibility.rst
Normal file
916
docs/compatibility/pytorch-compatibility.rst
Normal file
@@ -0,0 +1,916 @@
|
||||
.. meta::
|
||||
:description: PyTorch compatibility
|
||||
:keywords: GPU, PyTorch compatibility
|
||||
|
||||
********************************************************************************
|
||||
PyTorch compatibility
|
||||
********************************************************************************
|
||||
|
||||
`PyTorch <https://pytorch.org/>`_ is an open-source tensor library designed for
|
||||
deep learning. PyTorch on ROCm provides mixed-precision and large-scale training
|
||||
using `MIOpen <https://github.com/ROCm/MIOpen>`_ and
|
||||
`RCCL <https://github.com/ROCm/rccl>`_ libraries.
|
||||
|
||||
ROCm support for PyTorch is upstreamed into the official PyTorch repository. Due to independent
|
||||
compatibility considerations, this results in two distinct release cycles for PyTorch on ROCm:
|
||||
|
||||
- ROCm PyTorch release:
|
||||
|
||||
- Provides the latest version of ROCm but doesn't immediately support the latest stable PyTorch
|
||||
version.
|
||||
|
||||
- Offers :ref:`Docker images <pytorch-docker-compat>` with ROCm and PyTorch
|
||||
pre-installed.
|
||||
|
||||
- ROCm PyTorch repository: `<https://github.com/rocm/pytorch>`__
|
||||
|
||||
- See the :doc:`ROCm PyTorch installation guide <rocm-install-on-linux:install/3rd-party/pytorch-install>` to get started.
|
||||
|
||||
- Official PyTorch release:
|
||||
|
||||
- Provides the latest stable version of PyTorch but doesn't immediately support the latest ROCm version.
|
||||
|
||||
- Official PyTorch repository: `<https://github.com/pytorch/pytorch>`__
|
||||
|
||||
- See the `Nightly and latest stable version installation guide <https://pytorch.org/get-started/locally/>`_
|
||||
or `Previous versions <https://pytorch.org/get-started/previous-versions/>`_ to get started.
|
||||
|
||||
The upstream PyTorch includes an automatic HIPification solution that automatically generates HIP
|
||||
source code from the CUDA backend. This approach allows PyTorch to support ROCm without requiring
|
||||
manual code modifications.
|
||||
|
||||
ROCm's development is aligned with the stable release of PyTorch while upstream PyTorch testing uses
|
||||
the stable release of ROCm to maintain consistency.
|
||||
|
||||
.. _pytorch-docker-compat:
|
||||
|
||||
Docker image compatibility
|
||||
================================================================================
|
||||
|
||||
AMD validates and publishes ready-made `PyTorch <https://hub.docker.com/r/rocm/pytorch>`_
|
||||
images with ROCm backends on Docker Hub. The following Docker image tags and
|
||||
associated inventories are validated for `ROCm 6.3.0 <https://repo.radeon.com/rocm/apt/6.3/>`_.
|
||||
|
||||
.. list-table:: PyTorch Docker image components
|
||||
:header-rows: 1
|
||||
:class: docker-image-compatibility
|
||||
|
||||
* - Docker
|
||||
- PyTorch
|
||||
- Ubuntu
|
||||
- Python
|
||||
- Apex
|
||||
- torchvision
|
||||
- TensorBoard
|
||||
- MAGMA
|
||||
- UCX
|
||||
- OMPI
|
||||
- OFED
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3_ubuntu24.04_py3.12_pytorch_release_2.4.0/images/sha256-98ddf20333bd01ff749b8092b1190ee369a75d3b8c71c2fac80ffdcb1a98d529?context=explore"><i class="fab fa-docker fa-lg"></i></a>
|
||||
|
||||
- `2.4.0 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
|
||||
- 24.04
|
||||
- `3.12 <https://www.python.org/downloads/release/python-3128/>`_
|
||||
- `1.4.0 <https://github.com/ROCm/apex/tree/release/1.4.0>`_
|
||||
- `0.19.0 <https://github.com/pytorch/vision/tree/v0.19.0>`_
|
||||
- `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13>`_
|
||||
- `master <https://bitbucket.org/icl/magma/src/master/>`_
|
||||
- `1.10.0 <https://github.com/openucx/ucx/tree/v1.10.0>`_
|
||||
- `4.0.7 <https://github.com/open-mpi/ompi/tree/v4.0.7>`_
|
||||
- `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3_ubuntu22.04_py3.10_pytorch_release_2.4.0/images/sha256-402c9b4f1a6b5a81c634a1932b56cbe01abb699cfcc7463d226276997c6cf8ea?context=explore"><i class="fab fa-docker fa-lg"></i></a>
|
||||
|
||||
- `2.4.0 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
|
||||
- 22.04
|
||||
- `3.10 <https://www.python.org/downloads/release/python-31016/>`_
|
||||
- `1.4.0 <https://github.com/ROCm/apex/tree/release/1.4.0>`_
|
||||
- `0.19.0 <https://github.com/pytorch/vision/tree/v0.19.0>`_
|
||||
- `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13>`_
|
||||
- `master <https://bitbucket.org/icl/magma/src/master/>`_
|
||||
- `1.10.0 <https://github.com/openucx/ucx/tree/v1.10.0>`_
|
||||
- `4.0.7 <https://github.com/open-mpi/ompi/tree/v4.0.7>`_
|
||||
- `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3_ubuntu22.04_py3.9_pytorch_release_2.4.0/images/sha256-e0608b55d408c3bfe5c19fdd57a4ced3e0eb3a495b74c309980b60b156c526dd?context=explore"><i class="fab fa-docker fa-lg"></i></a>
|
||||
|
||||
- `2.4.0 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
|
||||
- 22.04
|
||||
- `3.9 <https://www.python.org/downloads/release/python-3918/>`_
|
||||
- `1.4.0 <https://github.com/ROCm/apex/tree/release/1.4.0>`_
|
||||
- `0.19.0 <https://github.com/pytorch/vision/tree/v0.19.0>`_
|
||||
- `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13>`_
|
||||
- `master <https://bitbucket.org/icl/magma/src/master/>`_
|
||||
- `1.10.0 <https://github.com/openucx/ucx/tree/v1.10.0>`_
|
||||
- `4.0.7 <https://github.com/open-mpi/ompi/tree/v4.0.7>`_
|
||||
- `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3_ubuntu22.04_py3.10_pytorch_release_2.3.0/images/sha256-652cf25263d05b1de548222970aeb76e60b12de101de66751264709c0d0ff9d8?context=explore"><i class="fab fa-docker fa-lg"></i></a>
|
||||
|
||||
- `2.3.0 <https://github.com/ROCm/pytorch/tree/release/2.3>`_
|
||||
- 22.04
|
||||
- `3.10 <https://www.python.org/downloads/release/python-31016/>`_
|
||||
- `1.3.0 <https://github.com/ROCm/apex/tree/release/1.3.0>`_
|
||||
- `0.18.0 <https://github.com/pytorch/vision/tree/v0.18.0>`_
|
||||
- `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13>`_
|
||||
- `master <https://bitbucket.org/icl/magma/src/master/>`_
|
||||
- `1.14.1 <https://github.com/openucx/ucx/tree/v1.14.1>`_
|
||||
- `4.1.5 <https://github.com/open-mpi/ompi/tree/v4.1.5>`_
|
||||
- `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3_ubuntu22.04_py3.10_pytorch_release_2.2.1/images/sha256-051976f26beab8f9aa65d999e3ad546c027b39240a0cc3ee81b114a9024f2912?context=explore"><i class="fab fa-docker fa-lg"></i></a>
|
||||
|
||||
- `2.2.1 <https://github.com/ROCm/pytorch/tree/release/2.2>`_
|
||||
- 22.04
|
||||
- `3.10 <https://www.python.org/downloads/release/python-31016/>`_
|
||||
- `1.2.0 <https://github.com/ROCm/apex/tree/release/1.2.0>`_
|
||||
- `0.17.1 <https://github.com/pytorch/vision/tree/v0.17.1>`_
|
||||
- `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13>`_
|
||||
- `master <https://bitbucket.org/icl/magma/src/master/>`_
|
||||
- `1.14.1 <https://github.com/openucx/ucx/tree/v1.14.1>`_
|
||||
- `4.1.5 <https://github.com/open-mpi/ompi/tree/v4.1.5>`_
|
||||
- `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3_ubuntu20.04_py3.9_pytorch_release_2.2.1/images/sha256-88c839a364d109d3748c100385bfa100d28090d25118cc723fd0406390ab2f7e?context=explore"><i class="fab fa-docker fa-lg"></i></a>
|
||||
|
||||
- `2.2.1 <https://github.com/ROCm/pytorch/tree/release/2.2>`_
|
||||
- 20.04
|
||||
- `3.9 <https://www.python.org/downloads/release/python-3921/>`_
|
||||
- `1.2.0 <https://github.com/ROCm/apex/tree/release/1.2.0>`_
|
||||
- `0.17.1 <https://github.com/pytorch/vision/tree/v0.17.1>`_
|
||||
- `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13.0>`_
|
||||
- `master <https://bitbucket.org/icl/magma/src/master/>`_
|
||||
- `1.10.0 <https://github.com/openucx/ucx/tree/v1.10.0>`_
|
||||
- `4.0.3 <https://github.com/open-mpi/ompi/tree/v4.0.3>`_
|
||||
- `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3_ubuntu22.04_py3.9_pytorch_release_1.13.1/images/sha256-994424ed07a63113f79dd9aa72159124c00f5fbfe18127151e6658f7d0b6f821?context=explore"><i class="fab fa-docker fa-lg"></i></a>
|
||||
|
||||
- `1.13.1 <https://github.com/ROCm/pytorch/tree/release/1.13>`_
|
||||
- 22.04
|
||||
- `3.9 <https://www.python.org/downloads/release/python-3921/>`_
|
||||
- `1.0.0 <https://github.com/ROCm/apex/tree/release/1.0.0>`_
|
||||
- `0.14.0 <https://github.com/pytorch/vision/tree/v0.14.0>`_
|
||||
- `2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18>`_
|
||||
- `master <https://bitbucket.org/icl/magma/src/master/>`_
|
||||
- `1.14.1 <https://github.com/openucx/ucx/tree/v1.14.1>`_
|
||||
- `4.1.5 <https://github.com/open-mpi/ompi/tree/v4.1.5>`_
|
||||
- `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3_ubuntu20.04_py3.9_pytorch_release_1.13.1/images/sha256-7b8139fe40a9aeb4bca3aecd15c22c1fa96e867d93479fa3a24fdeeeeafa1219?context=explore"><i class="fab fa-docker fa-lg"></i></a>
|
||||
|
||||
- `1.13.1 <https://github.com/ROCm/pytorch/tree/release/1.13>`_
|
||||
- 20.04
|
||||
- `3.9 <https://www.python.org/downloads/release/python-3921/>`_
|
||||
- `1.0.0 <https://github.com/ROCm/apex/tree/release/1.0.0>`_
|
||||
- `0.14.0 <https://github.com/pytorch/vision/tree/v0.14.0>`_
|
||||
- `2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18>`_
|
||||
- `master <https://bitbucket.org/icl/magma/src/master/>`_
|
||||
- `1.10.0 <https://github.com/openucx/ucx/tree/v1.10.0>`_
|
||||
- `4.0.3 <https://github.com/open-mpi/ompi/tree/v4.0.3>`_
|
||||
- `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_
|
||||
|
||||
Critical ROCm libraries for PyTorch
|
||||
================================================================================
|
||||
|
||||
The functionality of PyTorch with ROCm is shaped by its underlying library
|
||||
dependencies. These critical ROCm components affect the capabilities,
|
||||
performance, and feature set available to developers.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - ROCm library
|
||||
- Version
|
||||
- Purpose
|
||||
- Used in
|
||||
* - `Composable Kernel <https://github.com/ROCm/composable_kernel>`_
|
||||
- 1.1.0
|
||||
- Enables faster execution of core operations like matrix multiplication
|
||||
(GEMM), convolutions and transformations.
|
||||
- Speeds up ``torch.permute``, ``torch.view``, ``torch.matmul``,
|
||||
``torch.mm``, ``torch.bmm``, ``torch.nn.Conv2d``, ``torch.nn.Conv3d``
|
||||
and ``torch.nn.MultiheadAttention``.
|
||||
* - `hipBLAS <https://github.com/ROCm/hipBLAS>`_
|
||||
- 2.3.0
|
||||
- Provides GPU-accelerated Basic Linear Algebra Subprograms (BLAS) for
|
||||
matrix and vector operations.
|
||||
- Supports operations like matrix multiplication, matrix-vector products,
|
||||
and tensor contractions. Utilized in both dense and batched linear
|
||||
algebra operations.
|
||||
* - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`_
|
||||
- 0.10.0
|
||||
- hipBLASLt is an extension of the hipBLAS library, providing additional
|
||||
features like epilogues fused into the matrix multiplication kernel or
|
||||
use of integer tensor cores.
|
||||
- It accelerates operations like ``torch.matmul``, ``torch.mm``, and the
|
||||
matrix multiplications used in convolutional and linear layers.
|
||||
* - `hipCUB <https://github.com/ROCm/hipCUB>`_
|
||||
- 3.3.0
|
||||
- Provides a C++ template library for parallel algorithms for reduction,
|
||||
scan, sort and select.
|
||||
- Supports operations like ``torch.sum``, ``torch.cumsum``, ``torch.sort``
|
||||
and ``torch.topk``. Operations on sparse tensors or tensors with
|
||||
irregular shapes often involve scanning, sorting, and filtering, which
|
||||
hipCUB handles efficiently.
|
||||
* - `hipFFT <https://github.com/ROCm/hipFFT>`_
|
||||
- 1.0.17
|
||||
- Provides GPU-accelerated Fast Fourier Transform (FFT) operations.
|
||||
- Used in functions like the ``torch.fft`` module.
|
||||
* - `hipRAND <https://github.com/ROCm/hipRAND>`_
|
||||
- 2.11.0
|
||||
- Provides fast random number generation for GPUs.
|
||||
- The ``torch.rand``, ``torch.randn`` and stochastic layers like
|
||||
``torch.nn.Dropout``.
|
||||
* - `hipSOLVER <https://github.com/ROCm/hipSOLVER>`_
|
||||
- 2.3.0
|
||||
- Provides GPU-accelerated solvers for linear systems, eigenvalues, and
|
||||
singular value decompositions (SVD).
|
||||
- Supports functions like ``torch.linalg.solve``,
|
||||
``torch.linalg.eig``, and ``torch.linalg.svd``.
|
||||
* - `hipSPARSE <https://github.com/ROCm/hipSPARSE>`_
|
||||
- 3.1.2
|
||||
- Accelerates operations on sparse matrices, such as sparse matrix-vector
|
||||
or matrix-matrix products.
|
||||
- Sparse tensor operations ``torch.sparse``.
|
||||
* - `hipSPARSELt <https://github.com/ROCm/hipSPARSELt>`_
|
||||
- 0.2.2
|
||||
- Accelerates operations on sparse matrices, such as sparse matrix-vector
|
||||
or matrix-matrix products.
|
||||
- Sparse tensor operations ``torch.sparse``.
|
||||
* - `hipTensor <https://github.com/ROCm/hipTensor>`_
|
||||
- 1.4.0
|
||||
- Optimizes for high-performance tensor operations, such as contractions.
|
||||
- Accelerates tensor algebra, especially in deep learning and scientific
|
||||
computing.
|
||||
* - `MIOpen <https://github.com/ROCm/MIOpen>`_
|
||||
- 3.3.0
|
||||
- Optimizes deep learning primitives such as convolutions, pooling,
|
||||
normalization, and activation functions.
|
||||
- Speeds up convolutional neural networks (CNNs), recurrent neural
|
||||
networks (RNNs), and other layers. Used in operations like
|
||||
``torch.nn.Conv2d``, ``torch.nn.ReLU``, and ``torch.nn.LSTM``.
|
||||
* - `MIGraphX <https://github.com/ROCm/AMDMIGraphX>`_
|
||||
- 2.11.0
|
||||
- Add graph-level optimizations, ONNX models and mixed precision support
|
||||
and enable Ahead-of-Time (AOT) Compilation.
|
||||
- Speeds up inference models and executes ONNX models for
|
||||
compatibility with other frameworks.
|
||||
``torch.nn.Conv2d``, ``torch.nn.ReLU``, and ``torch.nn.LSTM``.
|
||||
* - `MIVisionX <https://github.com/ROCm/MIVisionX>`_
|
||||
- 3.1.0
|
||||
- Optimizes acceleration for computer vision and AI workloads like
|
||||
preprocessing, augmentation, and inferencing.
|
||||
- Faster data preprocessing and augmentation pipelines for datasets like
|
||||
ImageNet or COCO and easy to integrate into PyTorch's ``torch.utils.data``
|
||||
and ``torchvision`` workflows.
|
||||
* - `rocAL <https://github.com/ROCm/rocAL>`_
|
||||
- 2.1.0
|
||||
- Accelerates the data pipeline by offloading intensive preprocessing and
|
||||
augmentation tasks. rocAL is part of MIVisionX.
|
||||
- Easy to integrate into PyTorch's ``torch.utils.data`` and
|
||||
``torchvision`` data load workloads.
|
||||
* - `RCCL <https://github.com/ROCm/rccl>`_
|
||||
- 2.21.5
|
||||
- Optimizes for multi-GPU communication for operations like AllReduce and
|
||||
Broadcast.
|
||||
- Distributed data parallel training (``torch.nn.parallel.DistributedDataParallel``).
|
||||
Handles communication in multi-GPU setups.
|
||||
* - `rocDecode <https://github.com/ROCm/rocDecode>`_
|
||||
- 0.8.0
|
||||
- Provide hardware-accelerated data decoding capabilities, particularly
|
||||
for image, video, and other dataset formats.
|
||||
- Can be integrated in ``torch.utils.data``, ``torchvision.transforms``
|
||||
and ``torch.distributed``.
|
||||
* - `rocJPEG <https://github.com/ROCm/rocJPEG>`_
|
||||
- 0.6.0
|
||||
- Provide hardware-accelerated JPEG image decoding and encoding.
|
||||
- GPU accelerated ``torchvision.io.decode_jpeg`` and
|
||||
``torchvision.io.encode_jpeg`` and can be integrated in
|
||||
``torch.utils.data`` and ``torchvision``.
|
||||
* - `RPP <https://github.com/ROCm/RPP>`_
|
||||
- 1.9.1
|
||||
- Speed up data augmentation, transformation, and other preprocessing step.
|
||||
- Easy to integrate into PyTorch's ``torch.utils.data`` and
|
||||
``torchvision`` data load workloads.
|
||||
* - `rocThrust <https://github.com/ROCm/rocThrust>`_
|
||||
- 3.3.0
|
||||
- Provides a C++ template library for parallel algorithms like sorting,
|
||||
reduction, and scanning.
|
||||
- Utilized in backend operations for tensor computations requiring
|
||||
parallel processing.
|
||||
* - `rocWMMA <https://github.com/ROCm/rocWMMA>`_
|
||||
- 1.6.0
|
||||
- Accelerates warp-level matrix-multiply and matrix-accumulate to speed up matrix
|
||||
multiplication (GEMM) and accumulation operations with mixed precision
|
||||
support.
|
||||
- Linear layers (``torch.nn.Linear``), convolutional layers
|
||||
(``torch.nn.Conv2d``), attention layers, general tensor operations that
|
||||
involve matrix products, such as ``torch.matmul``, ``torch.bmm``, and
|
||||
more.
|
||||
|
||||
Supported and unsupported features
|
||||
================================================================================
|
||||
|
||||
The following section maps GPU-accelerated PyTorch features to their supported
|
||||
ROCm and PyTorch versions.
|
||||
|
||||
torch
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
`torch <https://pytorch.org/docs/stable/index.html>`_ is the central module of
|
||||
PyTorch, providing data structures for multi-dimensional tensors and
|
||||
implementing mathematical operations on them. It also includes utilities for
|
||||
efficient serialization of tensors and arbitrary data types, along with various
|
||||
other tools.
|
||||
|
||||
Tensor data types
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
The data type of a tensor is specified using the ``dtype`` attribute or argument, and PyTorch supports a wide range of data types for different use cases.
|
||||
|
||||
The following table lists `torch.Tensor <https://pytorch.org/docs/stable/tensors.html>`_'s single data types:
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Data type
|
||||
- Description
|
||||
- Since PyTorch
|
||||
- Since ROCm
|
||||
* - ``torch.float8_e4m3fn``
|
||||
- 8-bit floating point, e4m3
|
||||
- 2.3
|
||||
- 5.5
|
||||
* - ``torch.float8_e5m2``
|
||||
- 8-bit floating point, e5m2
|
||||
- 2.3
|
||||
- 5.5
|
||||
* - ``torch.float16`` or ``torch.half``
|
||||
- 16-bit floating point
|
||||
- 0.1.6
|
||||
- 2.0
|
||||
* - ``torch.bfloat16``
|
||||
- 16-bit floating point
|
||||
- 1.6
|
||||
- 2.6
|
||||
* - ``torch.float32`` or ``torch.float``
|
||||
- 32-bit floating point
|
||||
- 0.1.12_2
|
||||
- 2.0
|
||||
* - ``torch.float64`` or ``torch.double``
|
||||
- 64-bit floating point
|
||||
- 0.1.12_2
|
||||
- 2.0
|
||||
* - ``torch.complex32`` or ``torch.chalf``
|
||||
- PyTorch provides native support for 32-bit complex numbers
|
||||
- 1.6
|
||||
- 2.0
|
||||
* - ``torch.complex64`` or ``torch.cfloat``
|
||||
- PyTorch provides native support for 64-bit complex numbers
|
||||
- 1.6
|
||||
- 2.0
|
||||
* - ``torch.complex128`` or ``torch.cdouble``
|
||||
- PyTorch provides native support for 128-bit complex numbers
|
||||
- 1.6
|
||||
- 2.0
|
||||
* - ``torch.uint8``
|
||||
- 8-bit integer (unsigned)
|
||||
- 0.1.12_2
|
||||
- 2.0
|
||||
* - ``torch.uint16``
|
||||
- 16-bit integer (unsigned)
|
||||
- 2.3
|
||||
- Not natively supported
|
||||
* - ``torch.uint32``
|
||||
- 32-bit integer (unsigned)
|
||||
- 2.3
|
||||
- Not natively supported
|
||||
* - ``torch.uint64``
|
||||
- 32-bit integer (unsigned)
|
||||
- 2.3
|
||||
- Not natively supported
|
||||
* - ``torch.int8``
|
||||
- 8-bit integer (signed)
|
||||
- 1.12
|
||||
- 5.0
|
||||
* - ``torch.int16`` or ``torch.short``
|
||||
- 16-bit integer (signed)
|
||||
- 0.1.12_2
|
||||
- 2.0
|
||||
* - ``torch.int32`` or ``torch.int``
|
||||
- 32-bit integer (signed)
|
||||
- 0.1.12_2
|
||||
- 2.0
|
||||
* - ``torch.int64`` or ``torch.long``
|
||||
- 64-bit integer (signed)
|
||||
- 0.1.12_2
|
||||
- 2.0
|
||||
* - ``torch.bool``
|
||||
- Boolean
|
||||
- 1.2
|
||||
- 2.0
|
||||
* - ``torch.quint8``
|
||||
- Quantized 8-bit integer (unsigned)
|
||||
- 1.8
|
||||
- 5.0
|
||||
* - ``torch.qint8``
|
||||
- Quantized 8-bit integer (signed)
|
||||
- 1.8
|
||||
- 5.0
|
||||
* - ``torch.qint32``
|
||||
- Quantized 32-bit integer (signed)
|
||||
- 1.8
|
||||
- 5.0
|
||||
* - ``torch.quint4x2``
|
||||
- Quantized 4-bit integer (unsigned)
|
||||
- 1.8
|
||||
- 5.0
|
||||
|
||||
.. note::
|
||||
|
||||
Unsigned types aside from ``uint8`` are currently only have limited support in
|
||||
eager mode (they primarily exist to assist usage with ``torch.compile``).
|
||||
|
||||
The :doc:`ROCm precision support page <rocm:reference/precision-support>`
|
||||
collected the native HW support of different data types.
|
||||
|
||||
torch.cuda
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
``torch.cuda`` in PyTorch is a module that provides utilities and functions for
|
||||
managing and utilizing AMD and NVIDIA GPUs. It enables GPU-accelerated
|
||||
computations, memory management, and efficient execution of tensor operations,
|
||||
leveraging ROCm and CUDA as the underlying frameworks.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Data type
|
||||
- Description
|
||||
- Since PyTorch
|
||||
- Since ROCm
|
||||
* - Device management
|
||||
- Utilities for managing and interacting with GPUs.
|
||||
- 0.4.0
|
||||
- 3.8
|
||||
* - Tensor operations on GPU
|
||||
- Perform tensor operations such as addition and matrix multiplications on
|
||||
the GPU.
|
||||
- 0.4.0
|
||||
- 3.8
|
||||
* - Streams and events
|
||||
- Streams allow overlapping computation and communication for optimized
|
||||
performance, events enable synchronization.
|
||||
- 1.6.0
|
||||
- 3.8
|
||||
* - Memory management
|
||||
- Functions to manage and inspect memory usage like
|
||||
``torch.cuda.memory_allocated()``, ``torch.cuda.max_memory_allocated()``,
|
||||
``torch.cuda.memory_reserved()`` and ``torch.cuda.empty_cache()``.
|
||||
- 0.3.0
|
||||
- 1.9.2
|
||||
* - Running process lists of memory management
|
||||
- Return a human-readable printout of the running processes and their GPU
|
||||
memory use for a given device with functions like
|
||||
``torch.cuda.memory_stats()`` and ``torch.cuda.memory_summary()``.
|
||||
- 1.8.0
|
||||
- 4.0
|
||||
* - Communication collectives
|
||||
- A set of APIs that enable efficient communication between multiple GPUs,
|
||||
allowing for distributed computing and data parallelism.
|
||||
- 1.9.0
|
||||
- 5.0
|
||||
* - ``torch.cuda.CUDAGraph``
|
||||
- Graphs capture sequences of GPU operations to minimize kernel launch
|
||||
overhead and improve performance.
|
||||
- 1.10.0
|
||||
- 5.3
|
||||
* - TunableOp
|
||||
- A mechanism that allows certain operations to be more flexible and
|
||||
optimized for performance. It enables automatic tuning of kernel
|
||||
configurations and other settings to achieve the best possible
|
||||
performance based on the specific hardware (GPU) and workload.
|
||||
- 2.0
|
||||
- 5.4
|
||||
* - NVIDIA Tools Extension (NVTX)
|
||||
- Integration with NVTX for profiling and debugging GPU performance using
|
||||
NVIDIA's Nsight tools.
|
||||
- 1.8.0
|
||||
- ❌
|
||||
* - Lazy loading NVRTC
|
||||
- Delays JIT compilation with NVRTC until the code is explicitly needed.
|
||||
- 1.13.0
|
||||
- ❌
|
||||
* - Jiterator (beta)
|
||||
- Jiterator allows asynchronous data streaming into computation streams
|
||||
during training loops.
|
||||
- 1.13.0
|
||||
- 5.2
|
||||
|
||||
.. Need to validate and extend.
|
||||
|
||||
torch.backends.cuda
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
``torch.backends.cuda`` is a PyTorch module that provides configuration options
|
||||
and flags to control the behavior of CUDA or ROCm operations. It is part of the
|
||||
PyTorch backend configuration system, which allows users to fine-tune how
|
||||
PyTorch interacts with the CUDA or ROCm environment.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Data type
|
||||
- Description
|
||||
- Since PyTorch
|
||||
- Since ROCm
|
||||
* - ``cufft_plan_cache``
|
||||
- Manages caching of GPU FFT plans to optimize repeated FFT computations.
|
||||
- 1.7.0
|
||||
- 5.0
|
||||
* - ``matmul.allow_tf32``
|
||||
- Enables or disables the use of TensorFloat-32 (TF32) precision for
|
||||
faster matrix multiplications on GPUs with Tensor Cores.
|
||||
- 1.10.0
|
||||
- ❌
|
||||
* - ``matmul.allow_fp16_reduced_precision_reduction``
|
||||
- Reduced precision reductions (e.g., with fp16 accumulation type) are
|
||||
allowed with fp16 GEMMs.
|
||||
- 2.0
|
||||
- ❌
|
||||
* - ``matmul.allow_bf16_reduced_precision_reduction``
|
||||
- Reduced precision reductions are allowed with bf16 GEMMs.
|
||||
- 2.0
|
||||
- ❌
|
||||
* - ``enable_cudnn_sdp``
|
||||
- Globally enables cuDNN SDPA's kernels within SDPA.
|
||||
- 2.0
|
||||
- ❌
|
||||
* - ``enable_flash_sdp``
|
||||
- Globally enables or disables FlashAttention for SDPA.
|
||||
- 2.1
|
||||
- ❌
|
||||
* - ``enable_mem_efficient_sdp``
|
||||
- Globally enables or disables Memory-Efficient Attention for SDPA.
|
||||
- 2.1
|
||||
- ❌
|
||||
* - ``enable_math_sdp``
|
||||
- Globally enables or disables the PyTorch C++ implementation within SDPA.
|
||||
- 2.1
|
||||
- ❌
|
||||
|
||||
.. Need to validate and extend.
|
||||
|
||||
torch.backends.cudnn
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Supported ``torch`` options:
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Data type
|
||||
- Description
|
||||
- Since PyTorch
|
||||
- Since ROCm
|
||||
* - ``allow_tf32``
|
||||
- TensorFloat-32 tensor cores may be used in cuDNN convolutions on NVIDIA
|
||||
Ampere or newer GPUs.
|
||||
- 1.12.0
|
||||
- ❌
|
||||
* - ``deterministic``
|
||||
- A bool that, if True, causes cuDNN to only use deterministic
|
||||
convolution algorithms.
|
||||
- 1.12.0
|
||||
- 6.0
|
||||
|
||||
Automatic mixed precision: torch.amp
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
PyTorch that automates the process of using both 16-bit (half-precision,
|
||||
float16) and 32-bit (single-precision, float32) floating-point types in model
|
||||
training and inference.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Data type
|
||||
- Description
|
||||
- Since PyTorch
|
||||
- Since ROCm
|
||||
* - Autocasting
|
||||
- Instances of autocast serve as context managers or decorators that allow
|
||||
regions of your script to run in mixed precision.
|
||||
- 1.9
|
||||
- 2.5
|
||||
* - Gradient scaling
|
||||
- To prevent underflow, “gradient scaling” multiplies the network’s
|
||||
loss(es) by a scale factor and invokes a backward pass on the scaled
|
||||
loss(es). Gradients flowing backward through the network are then
|
||||
scaled by the same factor. In other words, gradient values have a
|
||||
larger magnitude, so they don’t flush to zero.
|
||||
- 1.9
|
||||
- 2.5
|
||||
* - CUDA op-specific behavior
|
||||
- These ops always go through autocasting whether they are invoked as part
|
||||
of a ``torch.nn.Module``, as a function, or as a ``torch.Tensor`` method. If
|
||||
functions are exposed in multiple namespaces, they go through
|
||||
autocasting regardless of the namespace.
|
||||
- 1.9
|
||||
- 2.5
|
||||
|
||||
Distributed library features
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
The PyTorch distributed library includes a collective of parallelism modules, a
|
||||
communications layer, and infrastructure for launching and debugging large
|
||||
training jobs. See :ref:`rocm-for-ai-pytorch-distributed` for more information.
|
||||
|
||||
The Distributed Library feature in PyTorch provides tools and APIs for building
|
||||
and running distributed machine learning workflows. It allows training models
|
||||
across multiple processes, GPUs, or nodes in a cluster, enabling efficient use
|
||||
of computational resources and scalability for large-scale tasks.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Features
|
||||
- Description
|
||||
- Since PyTorch
|
||||
- Since ROCm
|
||||
* - TensorPipe
|
||||
- TensorPipe is a point-to-point communication library integrated into
|
||||
PyTorch for distributed training. It is designed to handle tensor data
|
||||
transfers efficiently between different processes or devices, including
|
||||
those on separate machines.
|
||||
- 1.8
|
||||
- 5.4
|
||||
* - Gloo
|
||||
- Gloo is designed for multi-machine and multi-GPU setups, enabling
|
||||
efficient communication and synchronization between processes. Gloo is
|
||||
one of the default backends for PyTorch's Distributed Data Parallel
|
||||
(DDP) and RPC frameworks, alongside other backends like NCCL and MPI.
|
||||
- 1.0
|
||||
- 2.0
|
||||
|
||||
torch.compiler
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Features
|
||||
- Description
|
||||
- Since PyTorch
|
||||
- Since ROCm
|
||||
* - ``torch.compiler`` (AOT Autograd)
|
||||
- Autograd captures not only the user-level code, but also backpropagation,
|
||||
which results in capturing the backwards pass “ahead-of-time”. This
|
||||
enables acceleration of both forwards and backwards pass using
|
||||
``TorchInductor``.
|
||||
- 2.0
|
||||
- 5.3
|
||||
* - ``torch.compiler`` (TorchInductor)
|
||||
- The default ``torch.compile`` deep learning compiler that generates fast
|
||||
code for multiple accelerators and backends. You need to use a backend
|
||||
compiler to make speedups through ``torch.compile`` possible. For AMD,
|
||||
NVIDIA, and Intel GPUs, it leverages OpenAI Triton as the key building block.
|
||||
- 2.0
|
||||
- 5.3
|
||||
|
||||
torchaudio
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
The `torchaudio <https://pytorch.org/audio/stable/index.html>`_ library provides
|
||||
utilities for processing audio data in PyTorch, such as audio loading,
|
||||
transformations, and feature extraction.
|
||||
|
||||
To ensure GPU-acceleration with ``torchaudio.transforms``, you need to move audio
|
||||
data (waveform tensor) explicitly to GPU using ``.to('cuda')``.
|
||||
|
||||
The following ``torchaudio`` features are GPU-accelerated.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Features
|
||||
- Description
|
||||
- Since torchaudio version
|
||||
- Since ROCm
|
||||
* - ``torchaudio.transforms.Spectrogram``
|
||||
- Generate spectrogram of an input waveform using STFT.
|
||||
- 0.6.0
|
||||
- 4.5
|
||||
* - ``torchaudio.transforms.MelSpectrogram``
|
||||
- Generate the mel-scale spectrogram of raw audio signals.
|
||||
- 0.9.0
|
||||
- 4.5
|
||||
* - ``torchaudio.transforms.MFCC``
|
||||
- Extract of MFCC features.
|
||||
- 0.9.0
|
||||
- 4.5
|
||||
* - ``torchaudio.transforms.Resample``
|
||||
- Resample a signal from one frequency to another
|
||||
- 0.9.0
|
||||
- 4.5
|
||||
|
||||
torchvision
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
The `torchvision <https://pytorch.org/vision/stable/index.html>`_ library
|
||||
provide datasets, model architectures, and common image transformations for
|
||||
computer vision.
|
||||
|
||||
The following ``torchvision`` features are GPU-accelerated.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Features
|
||||
- Description
|
||||
- Since torchvision version
|
||||
- Since ROCm
|
||||
* - ``torchvision.transforms.functional``
|
||||
- Provides GPU-compatible transformations for image preprocessing like
|
||||
resize, normalize, rotate and crop.
|
||||
- 0.2.0
|
||||
- 4.0
|
||||
* - ``torchvision.ops``
|
||||
- GPU-accelerated operations for object detection and segmentation tasks.
|
||||
``torchvision.ops.roi_align``, ``torchvision.ops.nms`` and
|
||||
``box_convert``.
|
||||
- 0.6.0
|
||||
- 3.3
|
||||
* - ``torchvision.models`` with ``.to('cuda')``
|
||||
- ``torchvision`` provides several pre-trained models (ResNet, Faster
|
||||
R-CNN, Mask R-CNN, ...) that can run on CUDA for faster inference and
|
||||
training.
|
||||
- 0.1.6
|
||||
- 2.x
|
||||
* - ``torchvision.io``
|
||||
- Video decoding and frame extraction using GPU acceleration with NVIDIA’s
|
||||
NVDEC and nvJPEG (rocJPEG) on CUDA-enabled GPUs.
|
||||
- 0.4.0
|
||||
- 6.3
|
||||
|
||||
torchtext
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
The `torchtext <https://pytorch.org/text/stable/index.html>`_ library provides
|
||||
utilities for processing and working with text data in PyTorch, including
|
||||
tokenization, vocabulary management, and text embeddings. torchtext supports
|
||||
preprocessing pipelines and integration with PyTorch models, simplifying the
|
||||
implementation of natural language processing (NLP) tasks.
|
||||
|
||||
To leverage GPU acceleration in torchtext, you need to move tensors
|
||||
explicitly to the GPU using ``.to('cuda')``.
|
||||
|
||||
* torchtext does not implement its own kernels. ROCm support is enabled by linking against ROCm libraries.
|
||||
|
||||
* Only official release exists.
|
||||
|
||||
torchtune
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
The `torchtune <https://pytorch.org/torchtune/stable/index.html>`_ library for
|
||||
authoring, fine-tuning and experimenting with LLMs.
|
||||
|
||||
* Usage: It works out-of-the-box, enabling developers to fine-tune ROCm PyTorch solutions.
|
||||
|
||||
* Only official release exists.
|
||||
|
||||
torchserve
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
The `torchserve <https://pytorch.org/torchserve/>`_ is a PyTorch domain library
|
||||
for common sparsity and parallelism primitives needed for large-scale recommender
|
||||
systems.
|
||||
|
||||
* torchtext does not implement its own kernels. ROCm support is enabled by linking against ROCm libraries.
|
||||
|
||||
* Only official release exists.
|
||||
|
||||
torchrec
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
The `torchrec <https://pytorch.org/torchrec/>`_ is a PyTorch domain library for
|
||||
common sparsity and parallelism primitives needed for large-scale recommender
|
||||
systems.
|
||||
|
||||
* torchrec does not implement its own kernels. ROCm support is enabled by linking against ROCm libraries.
|
||||
|
||||
* Only official release exists.
|
||||
|
||||
Unsupported PyTorch features
|
||||
----------------------------
|
||||
|
||||
The following are GPU-accelerated PyTorch features not currently supported by ROCm.
|
||||
|
||||
.. list-table::
|
||||
:widths: 30, 60, 10
|
||||
:header-rows: 1
|
||||
|
||||
* - Data type
|
||||
- Description
|
||||
- Since PyTorch
|
||||
* - APEX batch norm
|
||||
- Use APEX batch norm instead of PyTorch batch norm.
|
||||
- 1.6.0
|
||||
* - ``torch.backends.cuda`` / ``matmul.allow_tf32``
|
||||
- A bool that controls whether TensorFloat-32 tensor cores may be used in
|
||||
matrix multiplications.
|
||||
- 1.7
|
||||
* - ``torch.cuda`` / NVIDIA Tools Extension (NVTX)
|
||||
- Integration with NVTX for profiling and debugging GPU performance using
|
||||
NVIDIA's Nsight tools.
|
||||
- 1.7.0
|
||||
* - ``torch.cuda`` / Lazy loading NVRTC
|
||||
- Delays JIT compilation with NVRTC until the code is explicitly needed.
|
||||
- 1.8.0
|
||||
* - ``torch-tensorrt``
|
||||
- Integrate TensorRT library for optimizing and deploying PyTorch models.
|
||||
ROCm does not have equialent library for TensorRT.
|
||||
- 1.9.0
|
||||
* - ``torch.backends`` / ``cudnn.allow_tf32``
|
||||
- TensorFloat-32 tensor cores may be used in cuDNN convolutions.
|
||||
- 1.10.0
|
||||
* - ``torch.backends.cuda`` / ``matmul.allow_fp16_reduced_precision_reduction``
|
||||
- Reduced precision reductions with fp16 accumulation type are
|
||||
allowed with fp16 GEMMs.
|
||||
- 2.0
|
||||
* - ``torch.backends.cuda`` / ``matmul.allow_bf16_reduced_precision_reduction``
|
||||
- Reduced precision reductions are allowed with bf16 GEMMs.
|
||||
- 2.0
|
||||
* - ``torch.nn.functional`` / ``scaled_dot_product_attention``
|
||||
- Flash attention backend for SDPA to accelerate attention computation in
|
||||
transformer-based models.
|
||||
- 2.0
|
||||
* - ``torch.backends.cuda`` / ``enable_cudnn_sdp``
|
||||
- Globally enables cuDNN SDPA's kernels within SDPA.
|
||||
- 2.0
|
||||
* - ``torch.backends.cuda`` / ``enable_flash_sdp``
|
||||
- Globally enables or disables FlashAttention for SDPA.
|
||||
- 2.1
|
||||
* - ``torch.backends.cuda`` / ``enable_mem_efficient_sdp``
|
||||
- Globally enables or disables Memory-Efficient Attention for SDPA.
|
||||
- 2.1
|
||||
* - ``torch.backends.cuda`` / ``enable_math_sdp``
|
||||
- Globally enables or disables the PyTorch C++ implementation within SDPA.
|
||||
- 2.1
|
||||
* - Dynamic parallelism
|
||||
- PyTorch itself does not directly expose dynamic parallelism as a core
|
||||
feature. Dynamic parallelism allow GPU threads to launch additional
|
||||
threads which can be reached using custom operations via the
|
||||
``torch.utils.cpp_extension`` module.
|
||||
- Not a core feature
|
||||
* - Unified memory support in PyTorch
|
||||
- Unified Memory is not directly exposed in PyTorch's core API, it can be
|
||||
utilized effectively through custom CUDA extensions or advanced
|
||||
workflows.
|
||||
- Not a core feature
|
||||
|
||||
Use cases and recommendations
|
||||
================================================================================
|
||||
|
||||
* :doc:`Using ROCm for AI: training a model </how-to/rocm-for-ai/train-a-model>` provides
|
||||
guidance on how to leverage the ROCm platform for training AI models. It covers the steps, tools, and best practices
|
||||
for optimizing training workflows on AMD GPUs using PyTorch features.
|
||||
|
||||
* :doc:`Single-GPU fine-tuning and inference </how-to/llm-fine-tuning-optimization/single-gpu-fine-tuning-and-inference>`
|
||||
describes and demonstrates how to use the ROCm platform for the fine-tuning and inference of
|
||||
machine learning models, particularly large language models (LLMs), on systems with a single AMD
|
||||
Instinct MI300X accelerator. This page provides a detailed guide for setting up, optimizing, and
|
||||
executing fine-tuning and inference workflows in such environments.
|
||||
|
||||
* :doc:`Multi-GPU fine-tuning and inference optimization </how-to/llm-fine-tuning-optimization/multi-gpu-fine-tuning-and-inference>`
|
||||
describes and demonstrates the fine-tuning and inference of machine learning models on systems
|
||||
with multi MI300X accelerators.
|
||||
|
||||
* The :doc:`Instinct MI300X workload optimization guide </how-to/tuning-guides/mi300x/workload>` provides detailed
|
||||
guidance on optimizing workloads for the AMD Instinct MI300X accelerator using ROCm. This guide is aimed at helping
|
||||
users achieve optimal performance for deep learning and other high-performance computing tasks on the MI300X
|
||||
accelerator.
|
||||
|
||||
* The :doc:`Inception with PyTorch documentation </conceptual/ai-pytorch-inception>`
|
||||
describes how PyTorch integrates with ROCm for AI workloads It outlines the use of PyTorch on the ROCm platform and
|
||||
focuses on how to efficiently leverage AMD GPU hardware for training and inference tasks in AI applications.
|
||||
|
||||
For more use cases and recommendations, see `ROCm PyTorch blog posts <https://rocm.blogs.amd.com/blog/tag/pytorch.html>`_
|
||||
@@ -49,6 +49,9 @@ article_pages = [
|
||||
|
||||
{"file": "how-to/rocm-for-ai/training/index", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/train-a-model", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/prerequisite-system-validation", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/train-a-model/benchmark-docker/megatron-lm", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/train-a-model/benchmark-docker/pytorch-training", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/scale-model-training", "os": ["linux"]},
|
||||
|
||||
{"file": "how-to/rocm-for-ai/fine-tuning/index", "os": ["linux"]},
|
||||
|
||||
@@ -0,0 +1,547 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: How to train a model using Megatron-LM for ROCm.
|
||||
:keywords: ROCm, AI, LLM, train, Megatron-LM, megatron, Llama, tutorial, docker, torch
|
||||
|
||||
******************************************
|
||||
Training a model with Megatron-LM for ROCm
|
||||
******************************************
|
||||
|
||||
The Megatron-LM framework for ROCm is a specialized fork of the robust Megatron-LM,
|
||||
designed to enable efficient training of large-scale language models on AMD
|
||||
GPUs. By leveraging AMD Instinct™ MI300X series accelerators, Megatron-LM delivers
|
||||
enhanced scalability, performance, and resource utilization for AI workloads.
|
||||
It is purpose-built to support models like Llama 2, Llama 3, Llama 3.1, and
|
||||
DeepSeek, enabling developers to train next-generation AI models more
|
||||
efficiently. See the GitHub repository at `<https://github.com/ROCm/Megatron-LM>`__.
|
||||
|
||||
AMD provides a ready-to-use Docker image for MI300X accelerators containing
|
||||
essential components, including PyTorch, ROCm libraries, and Megatron-LM
|
||||
utilities. It contains the following software components to accelerate training
|
||||
workloads:
|
||||
|
||||
+--------------------------+--------------------------------+
|
||||
| Software component | Version |
|
||||
+==========================+================================+
|
||||
| ROCm | 6.3.0 |
|
||||
+--------------------------+--------------------------------+
|
||||
| PyTorch | 2.7.0a0+git637433 |
|
||||
+--------------------------+--------------------------------+
|
||||
| Python | 3.10 |
|
||||
+--------------------------+--------------------------------+
|
||||
| Transformer Engine | 1.11 |
|
||||
+--------------------------+--------------------------------+
|
||||
| Flash Attention | 3.0.0 |
|
||||
+--------------------------+--------------------------------+
|
||||
| hipBLASLt | git258a2162 |
|
||||
+--------------------------+--------------------------------+
|
||||
| Triton | 3.1 |
|
||||
+--------------------------+--------------------------------+
|
||||
|
||||
Supported features and models
|
||||
=============================
|
||||
|
||||
Megatron-LM provides the following key features to train large language models efficiently:
|
||||
|
||||
- Transformer Engine (TE)
|
||||
|
||||
- APEX
|
||||
|
||||
- GEMM tuning
|
||||
|
||||
- Torch.compile
|
||||
|
||||
- 3D parallelism: TP + SP + CP
|
||||
|
||||
- Distributed optimizer
|
||||
|
||||
- Flash Attention (FA) 3
|
||||
|
||||
- Fused kernels
|
||||
|
||||
- Pre-training
|
||||
|
||||
.. _amd-megatron-lm-model-support:
|
||||
|
||||
The following models are pre-optimized for performance on the AMD Instinct MI300X accelerator.
|
||||
|
||||
* Llama 2 7B
|
||||
|
||||
* Llama 2 70B
|
||||
|
||||
* Llama 3 8B
|
||||
|
||||
* Llama 3 70B
|
||||
|
||||
* Llama 3.1 8B
|
||||
|
||||
* Llama 3.1 70B
|
||||
|
||||
* DeepSeek-V2-Lite
|
||||
|
||||
.. note::
|
||||
|
||||
Some models, such as Llama 3, require an external license agreement through
|
||||
a third party (for example, Meta).
|
||||
|
||||
System validation
|
||||
=================
|
||||
|
||||
If you have already validated your system settings, skip this step. Otherwise,
|
||||
complete the :ref:`system validation and optimization steps <train-a-model-system-validation>`
|
||||
to set up your system before starting training.
|
||||
|
||||
Disable NUMA auto-balancing
|
||||
---------------------------
|
||||
|
||||
Generally, application performance can benefit from disabling NUMA auto-balancing. However,
|
||||
it might be detrimental to performance with certain types of workloads.
|
||||
|
||||
Run the command ``cat /proc/sys/kernel/numa_balancing`` to check your current NUMA (Non-Uniform
|
||||
Memory Access) settings. Output ``0`` indicates this setting is disabled. If there is no output or
|
||||
the output is ``1``, run the following command to disable NUMA auto-balancing.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
|
||||
|
||||
See :ref:`mi300x-disable-numa` for more information.
|
||||
|
||||
.. _mi300x-amd-megatron-lm-training:
|
||||
|
||||
Environment setup
|
||||
=================
|
||||
|
||||
The pre-built ROCm Megatron-LM environment allows users to quickly validate system performance, conduct
|
||||
training benchmarks, and achieve superior performance for models like Llama 3.1, Llama 2, and DeepSeek V2.
|
||||
|
||||
Use the following instructions to set up the environment, configure the script to train models, and
|
||||
reproduce the benchmark results on the MI300X accelerators with the AMD Megatron-LM Docker
|
||||
image.
|
||||
|
||||
.. _amd-megatron-lm-requirements:
|
||||
|
||||
Download the Docker image
|
||||
-------------------------
|
||||
|
||||
1. Use the following command to pull the Docker image from Docker Hub.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull rocm/megatron-lm:v25.3
|
||||
|
||||
2. Launch the Docker container.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v $HOME/.ssh:/root/.ssh --shm-size 64G --name megatron_training_env rocm/megatron-lm:v25.3
|
||||
|
||||
3. Use these commands if you exit the ``megatron_training_env`` container and need to return to it.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker start megatron_training_env
|
||||
docker exec -it megatron_training_env bash
|
||||
|
||||
The Docker container includes a pre-installed, verified version of Megatron-LM from the `release branch <https://github.com/ROCm/Megatron-LM/tree/megatron_release_v25.3>`_.
|
||||
|
||||
.. _amd-megatron-lm-environment-setup:
|
||||
|
||||
Configuration scripts
|
||||
---------------------
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: Llama
|
||||
:sync: llama
|
||||
|
||||
If you're working with Llama 2 7B or Llama 2 70 B, use the ``train_llama2.sh`` configuration
|
||||
script in the ``examples/llama`` directory of
|
||||
`<https://github.com/ROCm/Megatron-LM/tree/megatron_release_v25.3/examples/llama>`__.
|
||||
Likewise, if you're working with Llama 3 or Llama 3.1, then use ``train_llama3.sh`` and update
|
||||
the configuration script accordingly.
|
||||
|
||||
.. tab-item:: DeepSeek V2
|
||||
:sync: deepseek
|
||||
|
||||
Use the ``train_deepseek_v2.sh`` configuration script in the ``examples/deepseek_v2``
|
||||
directory of
|
||||
`<https://github.com/ROCm/Megatron-LM/tree/megatron_release_v25.3/examples/deepseek_v2>`__
|
||||
and update the configuration script accordingly.
|
||||
|
||||
Network interface
|
||||
^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: Llama
|
||||
:sync: llama
|
||||
|
||||
To avoid connectivity issues in multi-node deployments, ensure the correct network interface
|
||||
is set in your training scripts.
|
||||
|
||||
1. Run the following command (outside the container) to find the active network interface on your system.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
ip a
|
||||
|
||||
2. Update the ``NCCL_SOCKET_IFNAME`` and ``GLOO_SOCKET_IFNAME`` variables with your system’s network interface. For
|
||||
example:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export NCCL_SOCKET_IFNAME=ens50f0np0
|
||||
|
||||
export GLOO_SOCKET_IFNAME=ens50f0np0
|
||||
|
||||
Dataset options
|
||||
^^^^^^^^^^^^^^^
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: Llama
|
||||
:sync: llama
|
||||
|
||||
You can use either mock data or real data for training.
|
||||
|
||||
* Mock data can be useful for testing and validation. Use the ``MOCK_DATA`` variable to toggle between mock and real data. The default
|
||||
value is ``1`` for enabled.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
MOCK_DATA=1
|
||||
|
||||
* If you're using a real dataset, update the ``DATA_PATH`` variable to point to the location of your dataset.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
MOCK_DATA=0
|
||||
|
||||
DATA_PATH=${DATA_PATH:-"/data/bookcorpus_text_sentence"} # Change to where your dataset is stored
|
||||
|
||||
Ensure that the files are accessible inside the Docker container.
|
||||
|
||||
.. tab-item:: DeepSeek V2
|
||||
:sync: deepseek
|
||||
|
||||
If you don't already have the dataset, download the DeepSeek dataset using the following
|
||||
commands:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
mkdir deepseek-datasets
|
||||
cd deepseek-datasets
|
||||
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/SlimPajama.json
|
||||
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-train.json
|
||||
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-valid.json
|
||||
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/mmap_deepseekv2_datasets_text_document.bin
|
||||
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/mmap_deepseekv2_datasets_text_document.idx
|
||||
|
||||
You can use either mock data or real data for training.
|
||||
|
||||
* Mock data can be useful for testing and validation. Use the ``MOCK_DATA`` variable to toggle between mock and real data. The default
|
||||
value is ``1`` for enabled.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
MOCK_DATA=1
|
||||
|
||||
* If you're using a real dataset, update the ``DATA_DIR`` variable to point to the location of your dataset.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
MOCK_DATA=0
|
||||
|
||||
DATA_DIR="/root/data/deepseek-datasets" # Change to where your dataset is stored
|
||||
|
||||
Ensure that the files are accessible inside the Docker container.
|
||||
|
||||
Tokenizer
|
||||
^^^^^^^^^
|
||||
|
||||
Tokenization is the process of converting raw text into tokens that can be processed by the model. For Llama
|
||||
models, this typically involves sub-word tokenization, where words are broken down into smaller units based on
|
||||
a fixed vocabulary. The tokenizer is trained along with the model on a large corpus of text, and it learns a
|
||||
fixed vocabulary that can represent a wide range of text from different domains. This allows Llama models to
|
||||
handle a variety of input sequences, including unseen words or domain-specific terms.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: Llama
|
||||
:sync: llama
|
||||
|
||||
To train any of the Llama 2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``Llama2Tokenizer``.
|
||||
|
||||
To train any of Llama 3 and Llama 3.1 models that this Docker image supports, use the ``HuggingFaceTokenizer``.
|
||||
Set the Hugging Face model link in the ``TOKENIZER_MODEL`` variable.
|
||||
|
||||
For example, if you're using the Llama 3.1 8B model:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
TOKENIZER_MODEL=meta-llama/Llama-3.1-8B
|
||||
|
||||
.. tab-item:: DeepSeek V2
|
||||
:sync: deepseek
|
||||
|
||||
To train any of the DeepSeek V2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``DeepSeekV2Tokenizer``.
|
||||
|
||||
Multi-node training
|
||||
^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: Llama
|
||||
:sync: llama
|
||||
|
||||
If you're running multi-node training, update the following environment variables. They can
|
||||
also be passed as command line arguments.
|
||||
|
||||
* Change ``localhost`` to the master node's hostname:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
MASTER_ADDR="${MASTER_ADDR:-localhost}"
|
||||
|
||||
* Set the number of nodes you want to train on (for instance, ``2``, ``4``, ``8``):
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
NNODES="${NNODES:-1}"
|
||||
|
||||
* Set the rank of each node (0 for master, 1 for the first worker node, and so on):
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
NODE_RANK="${NODE_RANK:-0}"
|
||||
|
||||
* Set ``DATA_CACHE_PATH`` to a common directory accessible by all the nodes (for example, an
|
||||
NFS directory) for multi-node runs:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
DATA_CACHE_PATH=/root/cache # Set to a common directory for multi-node runs
|
||||
|
||||
* For multi-node runs, make sure the correct network drivers are installed on the nodes. If
|
||||
inside a Docker, either install the drivers inside the Docker container or pass the network
|
||||
drivers from the host while creating the Docker container.
|
||||
|
||||
Start training on AMD Instinct accelerators
|
||||
===========================================
|
||||
|
||||
The prebuilt Megatron-LM with ROCm training environment allows users to quickly validate
|
||||
system performance, conduct training benchmarks, and achieve superior
|
||||
performance for models like Llama 3.1 and Llama 2. This container should not be
|
||||
expected to provide generalized performance across all training workloads. You
|
||||
can expect the container to perform in the model configurations described in
|
||||
the following section, but other configurations are not validated by AMD.
|
||||
|
||||
Use the following instructions to set up the environment, configure the script
|
||||
to train models, and reproduce the benchmark results on MI300X series
|
||||
accelerators with the AMD Megatron-LM Docker image.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: Llama
|
||||
:sync: llama
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: Single node training
|
||||
:sync: single-node
|
||||
|
||||
To run training on a single node, navigate to the Megatron-LM folder and use the
|
||||
following command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
TEE_OUTPUT=1 MBS=2 BS=128 TP=1 TE_FP8=1 SEQ_LENGTH=8192 MODEL_SIZE=8 bash examples/llama/train_llama3.sh
|
||||
|
||||
.. tab-item:: Multi-node training
|
||||
:sync: multi-node
|
||||
|
||||
To run training on multiple nodes, launch the Docker container on each node. For example, for a two node setup (``NODE0`` as the master node), use these commands.
|
||||
|
||||
* On the master node ``NODE0``:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
TEE_OUTPUT=1 MBS=2 BS=256 TP=1 TE_FP8=1 SEQ_LENGTH=8192 MODEL_SIZE=8 MASTER_ADDR=IP_NODE0 NNODES=2 NODE_RANK=0 bash examples/llama/train_llama3.sh
|
||||
|
||||
* On the worker node ``NODE1``:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
TEE_OUTPUT=1 MBS=2 BS=256 TP=1 TE_FP8=1 SEQ_LENGTH=8192 MODEL_SIZE=8 MASTER_ADDR=IP_NODE0 NNODES=2 NODE_RANK=1 bash examples/llama/train_llama3.sh
|
||||
|
||||
|
||||
.. tab-item:: DeepSeek V2
|
||||
:sync: deepseek
|
||||
|
||||
To run the training on a single node, go to ``/Megatron-LM`` folder and use the following command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
cd /workspace/Megatron-LM
|
||||
GEMM_TUNING=1 PR=bf16 MBS=4 AC=none bash examples/deepseek_v2/train_deepseekv2.sh
|
||||
|
||||
Key options
|
||||
-----------
|
||||
|
||||
.. _amd-megatron-lm-benchmark-test-vars:
|
||||
|
||||
The benchmark tests support the following sets of variables:
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: Llama
|
||||
:sync: llama
|
||||
|
||||
``TEE_OUTPUT``
|
||||
``1`` to enable training logs or ``0`` to disable.
|
||||
|
||||
``TE_FP8``
|
||||
``0`` for BP16 (default) or ``1`` for FP8 GEMMs.
|
||||
|
||||
``GEMM_TUNING``
|
||||
``1`` to enable GEMM tuning, which boosts performance by using the best GEMM kernels.
|
||||
|
||||
``USE_FLASH_ATTN``
|
||||
``1`` to enable Flash Attention.
|
||||
|
||||
``ENABLE_PROFILING``
|
||||
``1`` to enable PyTorch profiling for performance analysis.
|
||||
|
||||
``transformer-impl``
|
||||
``transformer_engine`` to use the Transformer Engine (TE) or ``local`` to disable TE.
|
||||
|
||||
``MODEL_SIZE``
|
||||
``8B`` or ``70B`` for Llama 3 and 3.1. ``7B`` or ``70B`` for Llama 2.
|
||||
|
||||
``TOTAL_ITERS``
|
||||
The total number of iterations -- ``10`` by default.
|
||||
|
||||
``MOCK_DATA``
|
||||
``1`` to use mock data or ``0`` to use real data provided by you.
|
||||
|
||||
``MBS``
|
||||
Micro batch size.
|
||||
|
||||
``BS``
|
||||
Global batch size.
|
||||
|
||||
``TP``
|
||||
Tensor parallel (``1``, ``2``, ``4``, ``8``).
|
||||
|
||||
``SEQ_LENGTH``
|
||||
Input sequence length.
|
||||
|
||||
.. tab-item:: DeepSeek V2
|
||||
:sync: deepseek
|
||||
|
||||
``PR``
|
||||
Precision for training. ``bf16`` for BF16 (default) or ``fp8`` for FP8 GEMMs.
|
||||
|
||||
``GEMM_TUNING``
|
||||
``1`` to enable GEMM tuning, which boosts performance by using the best GEMM kernels.
|
||||
|
||||
``TOTAL_ITERS``
|
||||
The total number of iterations -- ``10`` by default.
|
||||
|
||||
``MOCK_DATA``
|
||||
``1`` to use mock data or ``0`` to use real data provided by you.
|
||||
|
||||
``MBS``
|
||||
Micro batch size.
|
||||
|
||||
``GBS``
|
||||
Global batch size.
|
||||
|
||||
Benchmarking examples
|
||||
---------------------
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: Llama
|
||||
:sync: llama
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: Single node training
|
||||
:sync: single-node
|
||||
|
||||
Use this command to run training with Llama 2 7B model on a single node. You can specify MBS, BS, FP,
|
||||
datatype, and so on.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
TEE_OUTPUT=1 MBS=5 BS=120 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1
|
||||
SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh
|
||||
|
||||
You can find the training logs at the location defined in ``$TRAIN_LOG`` in the :ref:`configuration script <amd-megatron-lm-environment-setup>`.
|
||||
|
||||
See the sample output:
|
||||
|
||||
.. image:: ../../../../data/how-to/rocm-for-ai/llama2-7b-training-log-sample.png
|
||||
:width: 800
|
||||
|
||||
.. tab-item:: Multi-node training
|
||||
:sync: multi-node
|
||||
|
||||
Launch the Docker container on each node.
|
||||
|
||||
In this example, run training with Llama 2 7B model on 2 nodes with specific MBS, BS, FP, datatype, and
|
||||
so on.
|
||||
|
||||
On the master node:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
TEE_OUTPUT=1 MBS=4 BS=64 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1
|
||||
SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh
|
||||
|
||||
On the worker node:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
TEE_OUTPUT=1 MBS=4 BS=64 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1
|
||||
SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh
|
||||
|
||||
You can find the training logs at the location defined in ``$TRAIN_LOG`` in the :ref:`configuration script <amd-megatron-lm-environment-setup>`.
|
||||
|
||||
Sample output for 2-node training:
|
||||
|
||||
Master node:
|
||||
|
||||
.. image:: ../../../../data/how-to/rocm-for-ai/2-node-training-master.png
|
||||
:width: 800
|
||||
|
||||
Worker node:
|
||||
|
||||
.. image:: ../../../../data/how-to/rocm-for-ai/2-node-training-worker.png
|
||||
:width: 800
|
||||
|
||||
Previous versions
|
||||
=================
|
||||
|
||||
This table lists previous versions of the ROCm Megatron-LM Docker image for training
|
||||
performance validation. For detailed information about available models for
|
||||
benchmarking, see the version-specific documentation.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:stub-columns: 1
|
||||
|
||||
* - ROCm version
|
||||
- Megatron-LM version
|
||||
- PyTorch version
|
||||
- Resources
|
||||
|
||||
* - 6.1
|
||||
- 24.12-dev
|
||||
- 2.4.0
|
||||
-
|
||||
* `Documentation <https://rocm.docs.amd.com/en/docs-6.3.0/how-to/rocm-for-ai/train-a-model.html>`_
|
||||
* `Docker Hub <https://hub.docker.com/layers/rocm/megatron-lm/24.12-dev/images/sha256-5818c50334ce3d69deeeb8f589d83ec29003817da34158ebc9e2d112b929bf2e>`_
|
||||
@@ -0,0 +1,341 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: How to train a model using PyTorch for ROCm.
|
||||
:keywords: ROCm, AI, LLM, train, PyTorch, torch, Llama, flux, tutorial, docker
|
||||
|
||||
**************************************
|
||||
Training a model with PyTorch for ROCm
|
||||
**************************************
|
||||
|
||||
PyTorch is an open-source machine learning framework that is widely used for
|
||||
model training with GPU-optimized components for transformer-based models.
|
||||
|
||||
The PyTorch for ROCm training Docker (``rocm/pytorch-training:v25.3``) image
|
||||
provides a prebuilt optimized environment for fine-tuning and pretraining a
|
||||
model on AMD Instinct MI325X and MI300X accelerators. It includes the following
|
||||
software components to accelerate training workloads:
|
||||
|
||||
+--------------------------+--------------------------------+
|
||||
| Software component | Version |
|
||||
+==========================+================================+
|
||||
| ROCm | 6.3.0 |
|
||||
+--------------------------+--------------------------------+
|
||||
| PyTorch | 2.7.0a0+git637433 |
|
||||
+--------------------------+--------------------------------+
|
||||
| Python | 3.10 |
|
||||
+--------------------------+--------------------------------+
|
||||
| Transformer Engine | 1.11 |
|
||||
+--------------------------+--------------------------------+
|
||||
| Flash Attention | 3.0.0 |
|
||||
+--------------------------+--------------------------------+
|
||||
| hipBLASLt | git258a2162 |
|
||||
+--------------------------+--------------------------------+
|
||||
| Triton | 3.1 |
|
||||
+--------------------------+--------------------------------+
|
||||
|
||||
.. _amd-pytorch-training-model-support:
|
||||
|
||||
Supported models
|
||||
================
|
||||
|
||||
The following models are pre-optimized for performance on the AMD Instinct MI300X accelerator.
|
||||
|
||||
* Llama 3.1 8B
|
||||
|
||||
* Llama 3.1 70B
|
||||
|
||||
* FLUX.1-dev
|
||||
|
||||
.. note::
|
||||
|
||||
Only these models are supported in the following steps.
|
||||
|
||||
Some models, such as Llama 3, require an external license agreement through
|
||||
a third party (for example, Meta).
|
||||
|
||||
System validation
|
||||
=================
|
||||
|
||||
If you have already validated your system settings, skip this step. Otherwise,
|
||||
complete the :ref:`system validation and optimization steps <train-a-model-system-validation>`
|
||||
to set up your system before starting training.
|
||||
|
||||
Disable NUMA auto-balancing
|
||||
---------------------------
|
||||
|
||||
Generally, application performance can benefit from disabling NUMA auto-balancing. However,
|
||||
it might be detrimental to performance with certain types of workloads.
|
||||
|
||||
Run the command ``cat /proc/sys/kernel/numa_balancing`` to check your current NUMA (Non-Uniform
|
||||
Memory Access) settings. Output ``0`` indicates this setting is disabled. If there is no output or
|
||||
the output is ``1``, run the following command to disable NUMA auto-balancing.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
|
||||
|
||||
See :ref:`mi300x-disable-numa` for more information.
|
||||
|
||||
Environment setup
|
||||
=================
|
||||
|
||||
This Docker image is optimized for specific model configurations outlined
|
||||
below. Performance can vary for other training workloads, as AMD
|
||||
doesn’t validate configurations and run conditions outside those described.
|
||||
|
||||
Download the Docker image
|
||||
-------------------------
|
||||
|
||||
1. Use the following command to pull the Docker image from Docker Hub.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull rocm/pytorch-training:v25.3
|
||||
|
||||
2. Run the Docker container.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v $HOME/.ssh:/root/.ssh --shm-size 64G --name training_env rocm/pytorch-training:v25.3
|
||||
|
||||
3. Use these commands if you exit the ``training_env`` container and need to return to it.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker start training_env
|
||||
docker exec -it training_env bash
|
||||
|
||||
4. In the Docker container, clone the `<https://github.com/ROCm/MAD>`__ repository and navigate to the benchmark scripts directory.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git clone https://github.com/ROCm/MAD
|
||||
cd MAD/scripts/pytorch-train
|
||||
|
||||
Prepare training datasets and dependencies
|
||||
------------------------------------------
|
||||
|
||||
The following benchmarking examples may require downloading models and datasets
|
||||
from Hugging Face. To ensure successful access to gated repos, set your
|
||||
``HF_TOKEN``.
|
||||
|
||||
Run the setup script to install libraries and datasets needed for benchmarking.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./pytorch_benchmark_setup.sh
|
||||
|
||||
``pytorch_benchmark_setup.sh`` installs the following libraries:
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Library
|
||||
- Benchmark model
|
||||
- Reference
|
||||
|
||||
* - ``accelerate``
|
||||
- Llama 3.1 8B, FLUX
|
||||
- `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
|
||||
|
||||
* - ``datasets``
|
||||
- Llama 3.1 8B, 70B, FLUX
|
||||
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
|
||||
|
||||
* - ``torchdata``
|
||||
- Llama 3.1 70B
|
||||
- `TorchData <https://pytorch.org/data/beta/index.html>`_
|
||||
|
||||
* - ``tomli``
|
||||
- Llama 3.1 70B
|
||||
- `Tomli <https://pypi.org/project/tomli/>`_
|
||||
|
||||
* - ``tiktoken``
|
||||
- Llama 3.1 70B
|
||||
- `tiktoken <https://github.com/openai/tiktoken>`_
|
||||
|
||||
* - ``blobfile``
|
||||
- Llama 3.1 70B
|
||||
- `blobfile <https://pypi.org/project/blobfile/>`_
|
||||
|
||||
* - ``tabulate``
|
||||
- Llama 3.1 70B
|
||||
- `tabulate <https://pypi.org/project/tabulate/>`_
|
||||
|
||||
* - ``wandb``
|
||||
- Llama 3.1 70B
|
||||
- `Weights & Biases <https://github.com/wandb/wandb>`_
|
||||
|
||||
* - ``sentencepiece``
|
||||
- Llama 3.1 70B, FLUX
|
||||
- `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0
|
||||
|
||||
* - ``tensorboard``
|
||||
- Llama 3.1 70 B, FLUX
|
||||
- `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0
|
||||
|
||||
* - ``csvkit``
|
||||
- FLUX
|
||||
- `csvkit <https://csvkit.readthedocs.io/en/latest/>`_ 2.0.1
|
||||
|
||||
* - ``deepspeed``
|
||||
- FLUX
|
||||
- `DeepSpeed <https://github.com/deepspeedai/DeepSpeed>`_ 0.16.2
|
||||
|
||||
* - ``diffusers``
|
||||
- FLUX
|
||||
- `Hugging Face Diffusers <https://huggingface.co/docs/diffusers/en/index>`_ 0.31.0
|
||||
|
||||
* - ``GitPython``
|
||||
- FLUX
|
||||
- `GitPython <https://github.com/gitpython-developers/GitPython>`_ 3.1.44
|
||||
|
||||
* - ``opencv-python-headless``
|
||||
- FLUX
|
||||
- `opencv-python-headless <https://pypi.org/project/opencv-python-headless/>`_ 4.10.0.84
|
||||
|
||||
* - ``peft``
|
||||
- FLUX
|
||||
- `PEFT <https://huggingface.co/docs/peft/en/index>`_ 0.14.0
|
||||
|
||||
* - ``protobuf``
|
||||
- FLUX
|
||||
- `Protocol Buffers <https://github.com/protocolbuffers/protobuf>`_ 5.29.2
|
||||
|
||||
* - ``pytest``
|
||||
- FLUX
|
||||
- `PyTest <https://docs.pytest.org/en/stable/>`_ 8.3.4
|
||||
|
||||
* - ``python-dotenv``
|
||||
- FLUX
|
||||
- `python-dotenv <https://pypi.org/project/python-dotenv/>`_ 1.0.1
|
||||
|
||||
* - ``seaborn``
|
||||
- FLUX
|
||||
- `Seaborn <https://seaborn.pydata.org/>`_ 0.13.2
|
||||
|
||||
* - ``transformers``
|
||||
- FLUX
|
||||
- `Transformers <https://huggingface.co/docs/transformers/en/index>`_ 4.47.0
|
||||
|
||||
``pytorch_benchmark_setup.sh`` downloads the following models from Hugging Face:
|
||||
|
||||
* `meta-llama/Llama-3.1-70B-Instruct <https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct>`_
|
||||
|
||||
* `black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
|
||||
|
||||
Along with the following datasets:
|
||||
|
||||
* `WikiText <https://huggingface.co/datasets/Salesforce/wikitext>`_
|
||||
|
||||
* `bghira/pseudo-camera-10k <https://huggingface.co/datasets/bghira/pseudo-camera-10k>`_
|
||||
|
||||
Start training on AMD Instinct accelerators
|
||||
===========================================
|
||||
|
||||
The prebuilt PyTorch with ROCm training environment allows users to quickly validate
|
||||
system performance, conduct training benchmarks, and achieve superior
|
||||
performance for models like Llama 3.1 and Llama 2. This container should not be
|
||||
expected to provide generalized performance across all training workloads. You
|
||||
can expect the container to perform in the model configurations described in
|
||||
the following section, but other configurations are not validated by AMD.
|
||||
|
||||
Use the following instructions to set up the environment, configure the script
|
||||
to train models, and reproduce the benchmark results on MI300X series
|
||||
accelerators with the AMD PyTorch training Docker image.
|
||||
|
||||
Once your environment is set up, use the following commands and examples to start benchmarking.
|
||||
|
||||
Pretraining
|
||||
-----------
|
||||
|
||||
To start the pretraining benchmark, use the following command with the
|
||||
appropriate options. See the following list of options and their descriptions.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./pytorch_benchmark_report.sh -t $training_mode -m $model_repo -p $datatype -s $sequence_length
|
||||
|
||||
Options and available models
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Name
|
||||
- Options
|
||||
- Description
|
||||
|
||||
* - ``$training_mode``
|
||||
- ``pretrain``
|
||||
- Benchmark pretraining
|
||||
|
||||
* -
|
||||
- ``finetune_fw``
|
||||
- Benchmark full weight fine-tuning (Llama 3.1 70B with BF16)
|
||||
|
||||
* -
|
||||
- ``finetune_lora``
|
||||
- Benchmark LoRA fine-tuning (Llama 3.1 70B with BF16)
|
||||
|
||||
* - ``$datatype``
|
||||
- FP8 or BF16
|
||||
- Only Llama 3.1 8B supports FP8 precision.
|
||||
|
||||
* - ``$model_repo``
|
||||
- Llama-3.1-8B
|
||||
- `Llama 3.1 8B <https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct>`_
|
||||
|
||||
* -
|
||||
- Llama-3.1-70B
|
||||
- `Llama 3.1 70B <https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct>`_
|
||||
|
||||
* -
|
||||
- Flux
|
||||
- `FLUX.1 [dev] <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
|
||||
|
||||
Fine-tuning
|
||||
-----------
|
||||
|
||||
To start the fine-tuning benchmark, use the following command. It will run the benchmarking example of Llama 2 70B
|
||||
with the WikiText dataset using the AMD fork of `torchtune <https://github.com/AMD-AIG-AIMA/torchtune>`_.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./pytorch_benchmark_report.sh -t {finetune_fw, finetune_lora} -p BF16 -m Llama-3.1-70B
|
||||
|
||||
Benchmarking examples
|
||||
---------------------
|
||||
|
||||
Here are some examples of how to use the command.
|
||||
|
||||
* Example 1: Llama 3.1 70B with BF16 precision with `torchtitan <https://github.com/ROCm/torchtitan>`_.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./pytorch_benchmark_report.sh -t pretrain -p BF16 -m Llama-3.1-70B -s 8192
|
||||
|
||||
* Example 2: Llama 3.1 8B with FP8 precision using Transformer Engine (TE) and Hugging Face Accelerator.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./pytorch_benchmark_report.sh -t pretrain -p FP8 -m Llama-3.1-70B -s 8192
|
||||
|
||||
* Example 3: FLUX.1-dev with BF16 precision with FluxBenchmark.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./pytorch_benchmark_report.sh -t pretrain -p BF16 -m Flux
|
||||
|
||||
* Example 4: Torchtune full weight fine-tuning with Llama 3.1 70B
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./pytorch_benchmark_report.sh -t finetune_fw -p BF16 -m Llama-3.1-70B
|
||||
|
||||
* Example 5: Torchtune LoRA fine-tuning with Llama 3.1 70B
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./pytorch_benchmark_report.sh -t finetune_lora -p BF16 -m Llama-3.1-70B
|
||||
@@ -19,6 +19,10 @@ training, fine-tuning, and inference. It leverages popular machine learning fram
|
||||
|
||||
In this guide, you'll learn about:
|
||||
|
||||
- :doc:`Training a model <train-a-model>`
|
||||
- Training a model
|
||||
|
||||
- :doc:`Scale model training <scale-model-training>`
|
||||
- :doc:`Train a model with Megatron-LM <benchmark-docker/megatron-lm>`
|
||||
|
||||
- :doc:`Train a model with PyTorch <benchmark-docker/pytorch-training>`
|
||||
|
||||
- :doc:`Scaling model training <scale-model-training>`
|
||||
|
||||
@@ -0,0 +1,130 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: Prerequisite system validation before using ROCm for AI.
|
||||
:keywords: ROCm, AI, LLM, train, megatron, Llama, tutorial, docker, torch, pytorch, jax
|
||||
|
||||
.. _train-a-model-system-validation:
|
||||
|
||||
**********************************************
|
||||
Prerequisite system validation before training
|
||||
**********************************************
|
||||
|
||||
Complete the following system validation and optimization steps to set up your system before starting training.
|
||||
|
||||
Disable NUMA auto-balancing
|
||||
---------------------------
|
||||
|
||||
Generally, application performance can benefit from disabling NUMA auto-balancing. However,
|
||||
it might be detrimental to performance with certain types of workloads.
|
||||
|
||||
Run the command ``cat /proc/sys/kernel/numa_balancing`` to check your current NUMA (Non-Uniform
|
||||
Memory Access) settings. Output ``0`` indicates this setting is disabled. If there is no output or
|
||||
the output is ``1``, run the following command to disable NUMA auto-balancing.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
|
||||
|
||||
See :ref:`mi300x-disable-numa` for more information.
|
||||
|
||||
Hardware verification with ROCm
|
||||
-------------------------------
|
||||
|
||||
Use the command ``rocm-smi --setperfdeterminism 1900`` to set the max clock speed up to 1900 MHz
|
||||
instead of the default 2100 MHz. This can reduce the chance of a PCC event lowering the attainable
|
||||
GPU clocks. This setting will not be required for new IFWI releases with the production PRC feature.
|
||||
You can restore this setting to its default value with the ``rocm-smi -r`` command.
|
||||
|
||||
Run the command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
rocm-smi --setperfdeterminism 1900
|
||||
|
||||
See :ref:`mi300x-hardware-verification-with-rocm` for more information.
|
||||
|
||||
RCCL Bandwidth Test for multi-node setups
|
||||
-----------------------------------------
|
||||
|
||||
ROCm Collective Communications Library (RCCL) is a standalone library of standard collective communication
|
||||
routines for GPUs. See the :doc:`RCCL documentation <rccl:index>` for more information. Before starting
|
||||
pretraining, running a RCCL bandwidth test helps ensure that the multi-GPU or multi-node setup is optimized
|
||||
for efficient distributed training.
|
||||
|
||||
Running the RCCL bandwidth test helps verify that:
|
||||
|
||||
- The GPUs can communicate across nodes or within a single node.
|
||||
|
||||
- The interconnect (such as InfiniBand, Ethernet, or Infinite fabric) is functioning as expected and
|
||||
provides adequate bandwidth for communication.
|
||||
|
||||
- No hardware setup or cabling issues could affect the communication between GPUs
|
||||
|
||||
Tuning and optimizing hyperparameters
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
In distributed training, specific hyperparameters related to distributed communication can be tuned based on
|
||||
the results of the RCCL bandwidth test. These variables are already set in the Docker image:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# force all RCCL streams to be high priority
|
||||
export TORCH_NCCL_HIGH_PRIORITY=1
|
||||
|
||||
# specify which RDMA interfaces to use for communication
|
||||
export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7
|
||||
|
||||
# define the Global ID index used in RoCE mode
|
||||
export NCCL_IB_GID_INDEX=3
|
||||
|
||||
# avoid data corruption/mismatch issue that existed in past releases
|
||||
export RCCL_MSCCL_ENABLE=0
|
||||
|
||||
Running the RCCL Bandwidth Test
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
It's recommended you run the RCCL bandwidth test before launching training. It ensures system
|
||||
performance is sufficient to launch training. RCCL is not included in the AMD Megatron-LM Docker
|
||||
image; follow the instructions in `<https://github.com/ROCm/rccl-tests>`__ to get started.
|
||||
See :ref:`mi300x-rccl` for more information.
|
||||
|
||||
Run on 8 GPUs (``-g 8``), scanning from 8 bytes to 10 GB:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./build/all_reduce_perf -b 8 -e 10G -f 2 -g 8
|
||||
|
||||
.. image:: ../../../data/how-to/rocm-for-ai/rccl-tests-8-gpu.png
|
||||
:width: 800
|
||||
|
||||
Using one MPI process per GPU and ``-g 1`` for performance-oriented runs on both single-node and multi-node is
|
||||
recommended. So, a run on 8 GPUs looks something like:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
mpirun -np 8 --bind-to numa ./build/all_reduce_perf -b 8 -e 10G -f 2 -g 1
|
||||
|
||||
.. image:: ../../../data/how-to/rocm-for-ai/rccl-tests-1-mpi-process-per-gpu.png
|
||||
:width: 800
|
||||
|
||||
Running with one MPI process per GPU ensures a one-to-one mapping for CPUs and GPUs, which can be beneficial
|
||||
for smaller message sizes. This better represents the real-world use of RCCL in deep learning frameworks like
|
||||
PyTorch and TensorFlow.
|
||||
|
||||
Use the following script to run the RCCL test for four MI300X GPU nodes. Modify paths and node addresses as needed.
|
||||
|
||||
.. code-block::
|
||||
|
||||
/home/$USER/ompi_for_gpu/ompi/bin/mpirun -np 32 -H tw022:8,tw024:8,tw010:8, tw015:8 \
|
||||
--mca pml ucx \
|
||||
--mca btl ^openib \
|
||||
-x NCCL_SOCKET_IFNAME=ens50f0np0 \
|
||||
-x NCCL_IB_HCA=rdma0:1,rdma1:1,rdma2:1,rdma3:1,rdma4:1,rdma5:1,rdma6:1,rdma7:1 \
|
||||
-x NCCL_IB_GID_INDEX=3 \
|
||||
-x NCCL_MIN_NCHANNELS=40 \
|
||||
-x NCCL_DEBUG=version \
|
||||
$HOME/rccl-tests/build/all_reduce_perf -b 8 -e 8g -f 2 -g 1
|
||||
|
||||
.. image:: ../../../data/how-to/rocm-for-ai/rccl-tests-4-mi300x-gpu-nodes.png
|
||||
:width: 800
|
||||
@@ -1,503 +0,0 @@
|
||||
.. meta::
|
||||
:description: How to train a model using ROCm Megatron-LM
|
||||
:keywords: ROCm, AI, LLM, train, Megatron-LM, megatron, Llama, tutorial, docker, torch
|
||||
|
||||
**************************************
|
||||
Training a model with ROCm Megatron-LM
|
||||
**************************************
|
||||
|
||||
.. _amd-megatron-lm:
|
||||
|
||||
The ROCm Megatron-LM framework is a specialized fork of the robust Megatron-LM, designed to
|
||||
enable efficient training of large-scale language models on AMD GPUs. By leveraging AMD Instinct™ MI300X
|
||||
accelerators, AMD Megatron-LM delivers enhanced scalability, performance, and resource utilization for AI
|
||||
workloads. It is purpose-built to :ref:`support models <amd-megatron-lm-model-support>`
|
||||
like Meta's Llama 2, Llama 3, and Llama 3.1, enabling developers to train next-generation AI models with greater
|
||||
efficiency. See the GitHub repository at `<https://github.com/ROCm/Megatron-LM>`__.
|
||||
|
||||
For ease of use, AMD provides a ready-to-use Docker image for MI300X accelerators containing essential
|
||||
components, including PyTorch, PyTorch Lightning, ROCm libraries, and Megatron-LM utilities. It contains the
|
||||
following software to accelerate training workloads:
|
||||
|
||||
+--------------------------+--------------------------------+
|
||||
| Software component | Version |
|
||||
+==========================+================================+
|
||||
| ROCm | 6.1 |
|
||||
+--------------------------+--------------------------------+
|
||||
| PyTorch | 2.4.0 |
|
||||
+--------------------------+--------------------------------+
|
||||
| PyTorch Lightning | 2.4.0 |
|
||||
+--------------------------+--------------------------------+
|
||||
| Megatron Core | 0.9.0 |
|
||||
+--------------------------+--------------------------------+
|
||||
| Transformer Engine | 1.5.0 |
|
||||
+--------------------------+--------------------------------+
|
||||
| Flash Attention | v2.6 |
|
||||
+--------------------------+--------------------------------+
|
||||
| Transformers | 4.44.0 |
|
||||
+--------------------------+--------------------------------+
|
||||
|
||||
Supported features and models
|
||||
=============================
|
||||
|
||||
Megatron-LM provides the following key features to train large language models efficiently:
|
||||
|
||||
- Transformer Engine (TE)
|
||||
|
||||
- APEX
|
||||
|
||||
- GEMM tuning
|
||||
|
||||
- Torch.compile
|
||||
|
||||
- 3D parallelism: TP + SP + CP
|
||||
|
||||
- Distributed optimizer
|
||||
|
||||
- Flash Attention (FA) 2
|
||||
|
||||
- Fused kernels
|
||||
|
||||
- Pre-training
|
||||
|
||||
.. _amd-megatron-lm-model-support:
|
||||
|
||||
The following models are pre-optimized for performance on the AMD Instinct MI300X accelerator.
|
||||
|
||||
* Llama 2 7B
|
||||
|
||||
* Llama 2 70B
|
||||
|
||||
* Llama 3 8B
|
||||
|
||||
* Llama 3 70B
|
||||
|
||||
* Llama 3.1 8B
|
||||
|
||||
* Llama 3.1 70B
|
||||
|
||||
Prerequisite system validation steps
|
||||
====================================
|
||||
|
||||
Complete the following system validation and optimization steps to set up your system before starting training.
|
||||
|
||||
Disable NUMA auto-balancing
|
||||
---------------------------
|
||||
|
||||
Generally, application performance can benefit from disabling NUMA auto-balancing. However,
|
||||
it might be detrimental to performance with certain types of workloads.
|
||||
|
||||
Run the command ``cat /proc/sys/kernel/numa_balancing`` to check your current NUMA (Non-Uniform
|
||||
Memory Access) settings. Output ``0`` indicates this setting is disabled. If there is no output or
|
||||
the output is ``1``, run the following command to disable NUMA auto-balancing.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
|
||||
|
||||
See :ref:`mi300x-disable-numa` for more information.
|
||||
|
||||
Hardware verification with ROCm
|
||||
-------------------------------
|
||||
|
||||
Use the command ``rocm-smi --setperfdeterminism 1900`` to set the max clock speed up to 1900 MHz
|
||||
instead of the default 2100 MHz. This can reduce the chance of a PCC event lowering the attainable
|
||||
GPU clocks. This setting will not be required for new IFWI releases with the production PRC feature.
|
||||
You can restore this setting to its default value with the ``rocm-smi -r`` command.
|
||||
|
||||
Run the command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
rocm-smi --setperfdeterminism 1900
|
||||
|
||||
See :ref:`mi300x-hardware-verification-with-rocm` for more information.
|
||||
|
||||
RCCL Bandwidth Test
|
||||
-------------------
|
||||
|
||||
ROCm Collective Communications Library (RCCL) is a standalone library of standard collective communication
|
||||
routines for GPUs. See the :doc:`RCCL documentation <rccl:index>` for more information. Before starting
|
||||
pre-training, running a RCCL bandwidth test helps ensure that the multi-GPU or multi-node setup is optimized
|
||||
for efficient distributed training.
|
||||
|
||||
Running the RCCL bandwidth test helps verify that:
|
||||
|
||||
- The GPUs can communicate across nodes or within a single node.
|
||||
|
||||
- The interconnect (such as InfiniBand, Ethernet, or Infinite fabric) is functioning as expected and
|
||||
provides adequate bandwidth for communication.
|
||||
|
||||
- No hardware setup or cabling issues could affect the communication between GPUs
|
||||
|
||||
Tuning and optimizing hyperparameters
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
In distributed training, specific hyperparameters related to distributed communication can be tuned based on
|
||||
the results of the RCCL bandwidth test. These variables are already set in the Docker image:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# force all RCCL streams to be high priority
|
||||
export TORCH_NCCL_HIGH_PRIORITY=1
|
||||
|
||||
# specify which RDMA interfaces to use for communication
|
||||
export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7
|
||||
|
||||
# define the Global ID index used in RoCE mode
|
||||
export NCCL_IB_GID_INDEX=3
|
||||
|
||||
# avoid data corruption/mismatch issue that existed in past releases
|
||||
export RCCL_MSCCL_ENABLE=0
|
||||
|
||||
Running the RCCL Bandwidth Test
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
It's recommended you run the RCCL bandwidth test before launching training. It ensures system
|
||||
performance is sufficient to launch training. RCCL is not included in the AMD Megatron-LM Docker
|
||||
image; follow the instructions in `<https://github.com/ROCm/rccl-tests>`__ to get started.
|
||||
See :ref:`mi300x-rccl` for more information.
|
||||
|
||||
Run on 8 GPUs (``-g 8``), scanning from 8 bytes to 10 GB:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./build/all_reduce_perf -b 8 -e 10G -f 2 -g 8
|
||||
|
||||
.. image:: ../../../data/how-to/rocm-for-ai/rccl-tests-8-gpu.png
|
||||
:width: 800
|
||||
|
||||
Using one MPI process per GPU and ``-g 1`` for performance-oriented runs on both single-node and multi-node is
|
||||
recommended. So, a run on 8 GPUs looks something like:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
mpirun -np 8 --bind-to numa ./build/all_reduce_perf -b 8 -e 10G -f 2 -g 1
|
||||
|
||||
.. image:: ../../../data/how-to/rocm-for-ai/rccl-tests-1-mpi-process-per-gpu.png
|
||||
:width: 800
|
||||
|
||||
Running with one MPI process per GPU ensures a one-to-one mapping for CPUs and GPUs, which can be beneficial
|
||||
for smaller message sizes. This better represents the real-world use of RCCL in deep learning frameworks like
|
||||
PyTorch and TensorFlow.
|
||||
|
||||
Use the following script to run the RCCL test for four MI300X GPU nodes. Modify paths and node addresses as needed.
|
||||
|
||||
.. code-block::
|
||||
|
||||
/home/$USER/ompi_for_gpu/ompi/bin/mpirun -np 32 -H tw022:8,tw024:8,tw010:8, tw015:8 \
|
||||
--mca pml ucx \
|
||||
--mca btl ^openib \
|
||||
-x NCCL_SOCKET_IFNAME=ens50f0np0 \
|
||||
-x NCCL_IB_HCA=rdma0:1,rdma1:1,rdma2:1,rdma3:1,rdma4:1,rdma5:1,rdma6:1,rdma7:1 \
|
||||
-x NCCL_IB_GID_INDEX=3 \
|
||||
-x NCCL_MIN_NCHANNELS=40 \
|
||||
-x NCCL_DEBUG=version \
|
||||
$HOME/rccl-tests/build/all_reduce_perf -b 8 -e 8g -f 2 -g 1
|
||||
|
||||
.. image:: ../../../data/how-to/rocm-for-ai/rccl-tests-4-mi300x-gpu-nodes.png
|
||||
:width: 800
|
||||
|
||||
.. _mi300x-amd-megatron-lm-training:
|
||||
|
||||
Start training on MI300X accelerators
|
||||
=====================================
|
||||
|
||||
The pre-built ROCm Megatron-LM environment allows users to quickly validate system performance, conduct
|
||||
training benchmarks, and achieve superior performance for models like Llama 2 and Llama 3.1.
|
||||
|
||||
Use the following instructions to set up the environment, configure the script to train models, and
|
||||
reproduce the benchmark results on the MI300X accelerators with the AMD Megatron-LM Docker
|
||||
image.
|
||||
|
||||
.. _amd-megatron-lm-requirements:
|
||||
|
||||
Download the Docker image and required packages
|
||||
-----------------------------------------------
|
||||
|
||||
1. Use the following command to pull the Docker image from Docker Hub.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull rocm/megatron-lm:24.12-dev
|
||||
|
||||
2. Launch the Docker container.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $CACHE_DIR:/root/.cache --name megatron-dev-env rocm/megatron-lm:24.12-dev /bin/bash
|
||||
|
||||
3. Clone the ROCm Megatron-LM repository to a local directory and install the required packages on the host machine.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git clone https://github.com/ROCm/Megatron-LM
|
||||
cd Megatron-LM
|
||||
|
||||
.. note::
|
||||
|
||||
This release is validated with ``ROCm/Megatron-LM`` commit `bb93ccb <https://github.com/ROCm/Megatron-LM/tree/bb93ccbfeae6363c67b361a97a27c74ab86e7e92>`_.
|
||||
Checking out this specific commit is recommended for a stable and reproducible environment.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git checkout bb93ccbfeae6363c67b361a97a27c74ab86e7e92
|
||||
|
||||
Prepare training datasets
|
||||
-------------------------
|
||||
|
||||
If you already have the preprocessed data, you can skip this section.
|
||||
|
||||
Use the following command to process datasets. We use GPT data as an example. You may change the merge table, use an
|
||||
end-of-document token, remove sentence splitting, and use the tokenizer type.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
python tools/preprocess_data.py \
|
||||
--input my-corpus.json \
|
||||
--output-prefix my-gpt2 \
|
||||
--vocab-file gpt2-vocab.json \
|
||||
--tokenizer-type GPT2BPETokenizer \
|
||||
--merge-file gpt2-merges.txt \
|
||||
--append-eod
|
||||
|
||||
In this case, the automatically generated output files are named ``my-gpt2_text_document.bin`` and
|
||||
``my-gpt2_text_document.idx``.
|
||||
|
||||
.. image:: ../../../data/how-to/rocm-for-ai/prep-training-datasets-my-gpt2-text-document.png
|
||||
:width: 800
|
||||
|
||||
.. _amd-megatron-lm-environment-setup:
|
||||
|
||||
Environment setup
|
||||
-----------------
|
||||
|
||||
In the ``examples/llama`` directory of Megatron-LM, if you're working with Llama 2 7B or Llama 2 70 B, use the
|
||||
``train_llama2.sh`` configuration script. Likewise, if you're working with Llama 3 or Llama 3.1, then use
|
||||
``train_llama3.sh`` and update the configuration script accordingly.
|
||||
|
||||
Network interface
|
||||
^^^^^^^^^^^^^^^^^
|
||||
|
||||
To avoid connectivity issues, ensure the correct network interface is set in your training scripts.
|
||||
|
||||
1. Run the following command to find the active network interface on your system.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
ip a
|
||||
|
||||
2. Update the ``NCCL_SOCKET_IFNAME`` and ``GLOO_SOCKET_IFNAME`` variables with your system’s network interface. For
|
||||
example:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export NCCL_SOCKET_IFNAME=ens50f0np0
|
||||
|
||||
export GLOO_SOCKET_IFNAME=ens50f0np0
|
||||
|
||||
Dataset options
|
||||
^^^^^^^^^^^^^^^
|
||||
|
||||
You can use either mock data or real data for training.
|
||||
|
||||
* If you're using a real dataset, update the ``DATA_PATH`` variable to point to the location of your dataset.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
DATA_DIR="/root/.cache/data" # Change to where your dataset is stored
|
||||
|
||||
DATA_PATH=${DATA_DIR}/bookcorpus_text_sentence
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
--data-path $DATA_PATH
|
||||
|
||||
Ensure that the files are accessible inside the Docker container.
|
||||
|
||||
* Mock data can be useful for testing and validation. If you're using mock data, replace ``--data-path $DATA_PATH`` with the ``--mock-data`` option.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
--mock-data
|
||||
|
||||
Tokenizer
|
||||
^^^^^^^^^
|
||||
|
||||
Tokenization is the process of converting raw text into tokens that can be processed by the model. For Llama
|
||||
models, this typically involves sub-word tokenization, where words are broken down into smaller units based on
|
||||
a fixed vocabulary. The tokenizer is trained along with the model on a large corpus of text, and it learns a
|
||||
fixed vocabulary that can represent a wide range of text from different domains. This allows Llama models to
|
||||
handle a variety of input sequences, including unseen words or domain-specific terms.
|
||||
|
||||
To train any of the Llama 2 models that this Docker image supports, use the ``Llama2Tokenizer``.
|
||||
|
||||
To train any of Llama 3 and Llama 3.1 models that this Docker image supports, use the ``HuggingFaceTokenizer``.
|
||||
Set the Hugging Face model link in the ``TOKENIZER_MODEL`` variable.
|
||||
|
||||
For example, if you're using the Llama 3.1 8B model:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
TOKENIZER_MODEL=meta-llama/Llama-3.1-8B
|
||||
|
||||
Run benchmark tests
|
||||
-------------------
|
||||
|
||||
.. note::
|
||||
|
||||
If you're running **multi node training**, update the following environment variables. They can
|
||||
also be passed as command line arguments.
|
||||
|
||||
* Change ``localhost`` to the master node's hostname:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
MASTER_ADDR="${MASTER_ADDR:-localhost}"
|
||||
|
||||
* Set the number of nodes you want to train on (for instance, ``2``, ``4``, ``8``):
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
NNODES="${NNODES:-1}"
|
||||
|
||||
* Set the rank of each node (0 for master, 1 for the first worker node, and so on):
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
NODE_RANK="${NODE_RANK:-0}"
|
||||
|
||||
* Use this command to run a performance benchmark test of any of the Llama 2 models that this Docker image supports (see :ref:`variables <amd-megatron-lm-benchmark-test-vars>`).
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
{variables} bash examples/llama/train_llama2.sh
|
||||
|
||||
* Use this command to run a performance benchmark test of any of the Llama 3 and Llama 3.1 models that this Docker image supports (see :ref:`variables <amd-megatron-lm-benchmark-test-vars>`).
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
{variables} bash examples/llama/train_llama3.sh
|
||||
|
||||
.. _amd-megatron-lm-benchmark-test-vars:
|
||||
|
||||
The benchmark tests support the same set of variables:
|
||||
|
||||
+--------------------------+-----------------------+-----------------------+
|
||||
| Name | Options | Description |
|
||||
+==========================+=======================+=======================+
|
||||
| ``TEE_OUTPUT`` | 0 or 1 | 0: disable training |
|
||||
| | | log |
|
||||
| | | |
|
||||
| | | 1: enable training |
|
||||
| | | log |
|
||||
+--------------------------+-----------------------+-----------------------+
|
||||
| ``MBS`` | | Micro batch size |
|
||||
+--------------------------+-----------------------+-----------------------+
|
||||
| ``BS`` | | Batch size |
|
||||
+--------------------------+-----------------------+-----------------------+
|
||||
| ``TP`` | 1, 2, 4, 8 | Tensor parallel |
|
||||
+--------------------------+-----------------------+-----------------------+
|
||||
| ``TE_FP8`` | 0 or 1 | Datatype. |
|
||||
| | | If it is set to 1, |
|
||||
| | | FP8. |
|
||||
| | | |
|
||||
| | | If it is set to 0. |
|
||||
| | | BP16 |
|
||||
+--------------------------+-----------------------+-----------------------+
|
||||
| ``NO_TORCH_COMPILE`` | 0 or 1 | If it is set to 1, |
|
||||
| | | enable torch.compile. |
|
||||
| | | |
|
||||
| | | If it is set to 0. |
|
||||
| | | Disable torch.compile |
|
||||
| | | (default) |
|
||||
+--------------------------+-----------------------+-----------------------+
|
||||
| ``SEQ_LENGTH`` | | Input sequence length |
|
||||
+--------------------------+-----------------------+-----------------------+
|
||||
| ``GEMM_TUNING`` | 0 or 1 | If it is set to 1, |
|
||||
| | | enable gemm tuning. |
|
||||
| | | |
|
||||
| | | If it is set to 0, |
|
||||
| | | disable gemm tuning |
|
||||
+--------------------------+-----------------------+-----------------------+
|
||||
| ``USE_FLASH_ATTN`` | 0 or 1 | 0: disable flash |
|
||||
| | | attention |
|
||||
| | | |
|
||||
| | | 1: enable flash |
|
||||
| | | attention |
|
||||
+--------------------------+-----------------------+-----------------------+
|
||||
| ``ENABLE_PROFILING`` | 0 or 1 | 0: disable torch |
|
||||
| | | profiling |
|
||||
| | | |
|
||||
| | | 1: enable torch |
|
||||
| | | profiling |
|
||||
+--------------------------+-----------------------+-----------------------+
|
||||
| ``MODEL_SIZE`` | | The size of the mode: |
|
||||
| | | 7B/70B, etc. |
|
||||
+--------------------------+-----------------------+-----------------------+
|
||||
| ``TOTAL_ITERS`` | | Total number of |
|
||||
| | | iterations |
|
||||
+--------------------------+-----------------------+-----------------------+
|
||||
| ``transformer-impl`` | transformer_engine or | Enable transformer |
|
||||
| | local | engine by default |
|
||||
+--------------------------+-----------------------+-----------------------+
|
||||
|
||||
Benchmarking examples
|
||||
^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: Single node training
|
||||
:sync: single
|
||||
|
||||
Use this command to run training with Llama 2 7B model on a single node. You can specify MBS, BS, FP,
|
||||
datatype, and so on.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
TEE_OUTPUT=1 MBS=5 BS=120 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1
|
||||
SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh
|
||||
|
||||
You can find the training logs at the location defined in ``$TRAIN_LOG`` in the :ref:`configuration script <amd-megatron-lm-environment-setup>`.
|
||||
|
||||
See the sample output:
|
||||
|
||||
.. image:: ../../../data/how-to/rocm-for-ai/llama2-7b-training-log-sample.png
|
||||
:width: 800
|
||||
|
||||
.. tab-item:: Multi node training
|
||||
:sync: multi
|
||||
|
||||
Launch the Docker container on each node.
|
||||
|
||||
In this example, run training with Llama 2 7B model on 2 nodes with specific MBS, BS, FP, datatype, and
|
||||
so on.
|
||||
|
||||
On the master node:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
TEE_OUTPUT=1 MBS=4 BS=64 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1
|
||||
SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh
|
||||
|
||||
On the worker node:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
TEE_OUTPUT=1 MBS=4 BS=64 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1
|
||||
SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh
|
||||
|
||||
You can find the training logs at the location defined in ``$TRAIN_LOG`` in the :ref:`configuration script <amd-megatron-lm-environment-setup>`.
|
||||
|
||||
Sample output for 2-node training:
|
||||
|
||||
Master node:
|
||||
|
||||
.. image:: ../../../data/how-to/rocm-for-ai/2-node-training-master.png
|
||||
:width: 800
|
||||
|
||||
Worker node:
|
||||
|
||||
.. image:: ../../../data/how-to/rocm-for-ai/2-node-training-worker.png
|
||||
:width: 800
|
||||
|
||||
@@ -21,8 +21,6 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- Model
|
||||
- Architecture
|
||||
- LLVM target name
|
||||
- Device Major version
|
||||
- Device Minor version
|
||||
- VRAM (GiB)
|
||||
- Compute Units
|
||||
- Wavefront Size
|
||||
@@ -34,12 +32,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- L1 Instruction Cache (KiB)
|
||||
- VGPR File (KiB)
|
||||
- SGPR File (KiB)
|
||||
- GFXIP Major version
|
||||
- GFXIP Minor version
|
||||
*
|
||||
- MI325X
|
||||
- CDNA3
|
||||
- gfx942
|
||||
- 9
|
||||
- 4
|
||||
- 256
|
||||
- 304 (38 per XCD)
|
||||
- 64
|
||||
@@ -51,12 +49,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 64 per 2 CUs
|
||||
- 512
|
||||
- 12.5
|
||||
- 9
|
||||
- 4
|
||||
*
|
||||
- MI300X
|
||||
- CDNA3
|
||||
- gfx942
|
||||
- 9
|
||||
- 4
|
||||
- 192
|
||||
- 304 (38 per XCD)
|
||||
- 64
|
||||
@@ -68,12 +66,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 64 per 2 CUs
|
||||
- 512
|
||||
- 12.5
|
||||
- 9
|
||||
- 4
|
||||
*
|
||||
- MI300A
|
||||
- CDNA3
|
||||
- gfx942
|
||||
- 9
|
||||
- 4
|
||||
- 128
|
||||
- 228 (38 per XCD)
|
||||
- 64
|
||||
@@ -85,12 +83,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 64 per 2 CUs
|
||||
- 512
|
||||
- 12.5
|
||||
- 9
|
||||
- 4
|
||||
*
|
||||
- MI250X
|
||||
- CDNA2
|
||||
- gfx90a
|
||||
- 9
|
||||
- 0
|
||||
- 128
|
||||
- 220 (110 per GCD)
|
||||
- 64
|
||||
@@ -102,12 +100,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32 per 2 CUs
|
||||
- 512
|
||||
- 12.5
|
||||
- 9
|
||||
- 0
|
||||
*
|
||||
- MI250
|
||||
- CDNA2
|
||||
- gfx90a
|
||||
- 9
|
||||
- 0
|
||||
- 128
|
||||
- 208 (104 per GCD)
|
||||
- 64
|
||||
@@ -119,12 +117,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32 per 2 CUs
|
||||
- 512
|
||||
- 12.5
|
||||
- 9
|
||||
- 0
|
||||
*
|
||||
- MI210
|
||||
- CDNA2
|
||||
- gfx90a
|
||||
- 9
|
||||
- 0
|
||||
- 64
|
||||
- 104
|
||||
- 64
|
||||
@@ -136,12 +134,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32 per 2 CUs
|
||||
- 512
|
||||
- 12.5
|
||||
- 9
|
||||
- 0
|
||||
*
|
||||
- MI100
|
||||
- CDNA
|
||||
- gfx908
|
||||
- 9
|
||||
- 0
|
||||
- 32
|
||||
- 120
|
||||
- 64
|
||||
@@ -153,12 +151,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32 per 3 CUs
|
||||
- 256 VGPR and 256 AccVGPR
|
||||
- 12.5
|
||||
- 9
|
||||
- 0
|
||||
*
|
||||
- MI60
|
||||
- GCN5.1
|
||||
- gfx906
|
||||
- 9
|
||||
- 0
|
||||
- 32
|
||||
- 64
|
||||
- 64
|
||||
@@ -170,12 +168,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32 per 3 CUs
|
||||
- 256
|
||||
- 12.5
|
||||
- 9
|
||||
- 0
|
||||
*
|
||||
- MI50 (32GB)
|
||||
- GCN5.1
|
||||
- gfx906
|
||||
- 9
|
||||
- 0
|
||||
- 32
|
||||
- 60
|
||||
- 64
|
||||
@@ -187,12 +185,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32 per 3 CUs
|
||||
- 256
|
||||
- 12.5
|
||||
- 9
|
||||
- 0
|
||||
*
|
||||
- MI50 (16GB)
|
||||
- GCN5.1
|
||||
- gfx906
|
||||
- 9
|
||||
- 0
|
||||
- 16
|
||||
- 60
|
||||
- 64
|
||||
@@ -204,12 +202,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32 per 3 CUs
|
||||
- 256
|
||||
- 12.5
|
||||
- 9
|
||||
- 0
|
||||
*
|
||||
- MI25
|
||||
- GCN5.0
|
||||
- gfx900
|
||||
- 9
|
||||
- 0
|
||||
- 16
|
||||
- 64
|
||||
- 64
|
||||
@@ -221,12 +219,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32 per 3 CUs
|
||||
- 256
|
||||
- 12.5
|
||||
- 9
|
||||
- 0
|
||||
*
|
||||
- MI8
|
||||
- GCN3.0
|
||||
- gfx803
|
||||
- 8
|
||||
- 0
|
||||
- 4
|
||||
- 64
|
||||
- 64
|
||||
@@ -238,12 +236,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32 per 4 CUs
|
||||
- 256
|
||||
- 12.5
|
||||
- 8
|
||||
- 0
|
||||
*
|
||||
- MI6
|
||||
- GCN4.0
|
||||
- gfx803
|
||||
- 8
|
||||
- 0
|
||||
- 16
|
||||
- 36
|
||||
- 64
|
||||
@@ -255,6 +253,8 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32 per 4 CUs
|
||||
- 256
|
||||
- 12.5
|
||||
- 8
|
||||
- 0
|
||||
|
||||
.. tab-item:: AMD Radeon PRO GPUs
|
||||
|
||||
@@ -266,8 +266,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- Model
|
||||
- Architecture
|
||||
- LLVM target name
|
||||
- Device Major version
|
||||
- Device Minor version
|
||||
|
||||
- VRAM (GiB)
|
||||
- Compute Units
|
||||
- Wavefront Size
|
||||
@@ -280,12 +279,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- L0 Instruction Cache (KiB)
|
||||
- VGPR File (KiB)
|
||||
- SGPR File (KiB)
|
||||
- GFXIP Major version
|
||||
- GFXIP Minor version
|
||||
*
|
||||
- Radeon PRO V710
|
||||
- RDNA3
|
||||
- gfx1101
|
||||
- 11
|
||||
- 0
|
||||
- 28
|
||||
- 54
|
||||
- 32
|
||||
@@ -298,12 +297,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32
|
||||
- 768
|
||||
- 16
|
||||
- 11
|
||||
- 0
|
||||
*
|
||||
- Radeon PRO W7900 Dual Slot
|
||||
- RDNA3
|
||||
- gfx1100
|
||||
- 11
|
||||
- 0
|
||||
- 48
|
||||
- 96
|
||||
- 32
|
||||
@@ -316,12 +315,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32
|
||||
- 768
|
||||
- 16
|
||||
- 11
|
||||
- 0
|
||||
*
|
||||
- Radeon PRO W7900
|
||||
- RDNA3
|
||||
- gfx1100
|
||||
- 11
|
||||
- 0
|
||||
- 48
|
||||
- 96
|
||||
- 32
|
||||
@@ -334,12 +333,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32
|
||||
- 768
|
||||
- 16
|
||||
- 11
|
||||
- 0
|
||||
*
|
||||
- Radeon PRO W7800
|
||||
- RDNA3
|
||||
- gfx1100
|
||||
- 11
|
||||
- 0
|
||||
- 32
|
||||
- 70
|
||||
- 32
|
||||
@@ -352,12 +351,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32
|
||||
- 768
|
||||
- 16
|
||||
- 11
|
||||
- 0
|
||||
*
|
||||
- Radeon PRO W7700
|
||||
- RDNA3
|
||||
- gfx1101
|
||||
- 11
|
||||
- 0
|
||||
- 16
|
||||
- 48
|
||||
- 32
|
||||
@@ -370,12 +369,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32
|
||||
- 768
|
||||
- 16
|
||||
- 11
|
||||
- 0
|
||||
*
|
||||
- Radeon PRO W6800
|
||||
- RDNA2
|
||||
- gfx1030
|
||||
- 10
|
||||
- 3
|
||||
- 32
|
||||
- 60
|
||||
- 32
|
||||
@@ -388,12 +387,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32
|
||||
- 512
|
||||
- 16
|
||||
- 10
|
||||
- 3
|
||||
*
|
||||
- Radeon PRO W6600
|
||||
- RDNA2
|
||||
- gfx1032
|
||||
- 10
|
||||
- 3
|
||||
- 8
|
||||
- 28
|
||||
- 32
|
||||
@@ -406,12 +405,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32
|
||||
- 512
|
||||
- 16
|
||||
- 10
|
||||
- 3
|
||||
*
|
||||
- Radeon PRO V620
|
||||
- RDNA2
|
||||
- gfx1030
|
||||
- 10
|
||||
- 3
|
||||
- 32
|
||||
- 72
|
||||
- 32
|
||||
@@ -424,12 +423,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32
|
||||
- 512
|
||||
- 16
|
||||
- 10
|
||||
- 3
|
||||
*
|
||||
- Radeon Pro W5500
|
||||
- RDNA
|
||||
- gfx1012
|
||||
- 10
|
||||
- 1
|
||||
- 8
|
||||
- 22
|
||||
- 32
|
||||
@@ -442,12 +441,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32
|
||||
- 512
|
||||
- 20
|
||||
- 10
|
||||
- 1
|
||||
*
|
||||
- Radeon Pro VII
|
||||
- GCN5.1
|
||||
- gfx906
|
||||
- 9
|
||||
- 0
|
||||
- 16
|
||||
- 60
|
||||
- 64
|
||||
@@ -460,6 +459,8 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32 per 3 CUs
|
||||
- 256
|
||||
- 12.5
|
||||
- 9
|
||||
- 0
|
||||
|
||||
.. tab-item:: AMD Radeon GPUs
|
||||
|
||||
@@ -471,8 +472,6 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- Model
|
||||
- Architecture
|
||||
- LLVM target name
|
||||
- Device Major version
|
||||
- Device Minor version
|
||||
- VRAM (GiB)
|
||||
- Compute Units
|
||||
- Wavefront Size
|
||||
@@ -485,12 +484,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- L0 Instruction Cache (KiB)
|
||||
- VGPR File (KiB)
|
||||
- SGPR File (KiB)
|
||||
- GFXIP Major version
|
||||
- GFXIP Minor version
|
||||
*
|
||||
- Radeon RX 7900 XTX
|
||||
- RDNA3
|
||||
- gfx1100
|
||||
- 11
|
||||
- 0
|
||||
- 24
|
||||
- 96
|
||||
- 32
|
||||
@@ -503,12 +502,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32
|
||||
- 768
|
||||
- 16
|
||||
- 11
|
||||
- 0
|
||||
*
|
||||
- Radeon RX 7900 XT
|
||||
- RDNA3
|
||||
- gfx1100
|
||||
- 11
|
||||
- 0
|
||||
- 20
|
||||
- 84
|
||||
- 32
|
||||
@@ -521,12 +520,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32
|
||||
- 768
|
||||
- 16
|
||||
- 11
|
||||
- 0
|
||||
*
|
||||
- Radeon RX 7900 GRE
|
||||
- RDNA3
|
||||
- gfx1100
|
||||
- 11
|
||||
- 0
|
||||
- 16
|
||||
- 80
|
||||
- 32
|
||||
@@ -539,12 +538,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32
|
||||
- 768
|
||||
- 16
|
||||
- 11
|
||||
- 0
|
||||
*
|
||||
- Radeon RX 7800 XT
|
||||
- RDNA3
|
||||
- gfx1101
|
||||
- 11
|
||||
- 0
|
||||
- 16
|
||||
- 60
|
||||
- 32
|
||||
@@ -557,12 +556,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32
|
||||
- 768
|
||||
- 16
|
||||
- 11
|
||||
- 0
|
||||
*
|
||||
- Radeon RX 7700 XT
|
||||
- RDNA3
|
||||
- gfx1101
|
||||
- 11
|
||||
- 0
|
||||
- 12
|
||||
- 54
|
||||
- 32
|
||||
@@ -575,12 +574,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32
|
||||
- 768
|
||||
- 16
|
||||
- 11
|
||||
- 0
|
||||
*
|
||||
- Radeon RX 7600
|
||||
- RDNA3
|
||||
- gfx1102
|
||||
- 11
|
||||
- 0
|
||||
- 8
|
||||
- 32
|
||||
- 32
|
||||
@@ -593,12 +592,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32
|
||||
- 512
|
||||
- 16
|
||||
- 11
|
||||
- 0
|
||||
*
|
||||
- Radeon RX 6950 XT
|
||||
- RDNA2
|
||||
- gfx1030
|
||||
- 10
|
||||
- 3
|
||||
- 16
|
||||
- 80
|
||||
- 32
|
||||
@@ -611,12 +610,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32
|
||||
- 512
|
||||
- 16
|
||||
- 10
|
||||
- 3
|
||||
*
|
||||
- Radeon RX 6900 XT
|
||||
- RDNA2
|
||||
- gfx1030
|
||||
- 10
|
||||
- 3
|
||||
- 16
|
||||
- 80
|
||||
- 32
|
||||
@@ -629,12 +628,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32
|
||||
- 512
|
||||
- 16
|
||||
- 10
|
||||
- 3
|
||||
*
|
||||
- Radeon RX 6800 XT
|
||||
- RDNA2
|
||||
- gfx1030
|
||||
- 10
|
||||
- 3
|
||||
- 16
|
||||
- 72
|
||||
- 32
|
||||
@@ -647,12 +646,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32
|
||||
- 512
|
||||
- 16
|
||||
- 10
|
||||
- 3
|
||||
*
|
||||
- Radeon RX 6800
|
||||
- RDNA2
|
||||
- gfx1030
|
||||
- 10
|
||||
- 3
|
||||
- 16
|
||||
- 60
|
||||
- 32
|
||||
@@ -665,12 +664,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32
|
||||
- 512
|
||||
- 16
|
||||
- 10
|
||||
- 3
|
||||
*
|
||||
- Radeon RX 6750 XT
|
||||
- RDNA2
|
||||
- gfx1031
|
||||
- 10
|
||||
- 3
|
||||
- 12
|
||||
- 40
|
||||
- 32
|
||||
@@ -683,12 +682,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32
|
||||
- 512
|
||||
- 16
|
||||
- 10
|
||||
- 3
|
||||
*
|
||||
- Radeon RX 6700 XT
|
||||
- RDNA2
|
||||
- gfx1031
|
||||
- 10
|
||||
- 3
|
||||
- 12
|
||||
- 40
|
||||
- 32
|
||||
@@ -701,13 +700,13 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32
|
||||
- 512
|
||||
- 16
|
||||
- 10
|
||||
- 3
|
||||
*
|
||||
- Radeon RX 6700
|
||||
- RDNA2
|
||||
- gfx1031
|
||||
- 10
|
||||
- 3
|
||||
- 10
|
||||
- 36
|
||||
- 32
|
||||
- 128
|
||||
@@ -719,12 +718,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32
|
||||
- 512
|
||||
- 16
|
||||
- 10
|
||||
- 3
|
||||
*
|
||||
- Radeon RX 6650 XT
|
||||
- RDNA2
|
||||
- gfx1032
|
||||
- 10
|
||||
- 3
|
||||
- 8
|
||||
- 32
|
||||
- 32
|
||||
@@ -737,12 +736,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32
|
||||
- 512
|
||||
- 16
|
||||
- 10
|
||||
- 3
|
||||
*
|
||||
- Radeon RX 6600 XT
|
||||
- RDNA2
|
||||
- gfx1032
|
||||
- 10
|
||||
- 3
|
||||
- 8
|
||||
- 32
|
||||
- 32
|
||||
@@ -755,12 +754,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32
|
||||
- 512
|
||||
- 16
|
||||
- 10
|
||||
- 3
|
||||
*
|
||||
- Radeon RX 6600
|
||||
- RDNA2
|
||||
- gfx1032
|
||||
- 10
|
||||
- 3
|
||||
- 8
|
||||
- 28
|
||||
- 32
|
||||
@@ -773,12 +772,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32
|
||||
- 512
|
||||
- 16
|
||||
- 10
|
||||
- 3
|
||||
*
|
||||
- Radeon VII
|
||||
- GCN5.1
|
||||
- gfx906
|
||||
- 9
|
||||
- 0
|
||||
- 16
|
||||
- 60
|
||||
- 64
|
||||
@@ -791,6 +790,8 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32 per 3 CUs
|
||||
- 256
|
||||
- 12.5
|
||||
- 9
|
||||
- 0
|
||||
|
||||
Glossary
|
||||
========
|
||||
@@ -804,18 +805,6 @@ For more information about the terms used, see the
|
||||
Argument to pass to clang in ``--offload-arch`` to compile code for the given
|
||||
architecture.
|
||||
|
||||
**Device major version**
|
||||
|
||||
Indicates the core instruction set of the GPU architecture. For example, a value
|
||||
of 11 would correspond to Navi III (RDNA3).
|
||||
|
||||
**Device minor version**
|
||||
|
||||
Indicates a particular configuration, feature set, or variation within the group
|
||||
represented by the device compute version. For example, different models within
|
||||
the same major version might have varying levels of support for certain features
|
||||
or optimizations.
|
||||
|
||||
**VRAM**
|
||||
|
||||
Amount of memory available on the GPU.
|
||||
@@ -898,6 +887,26 @@ Purpose Vector Registers, used specifically in matrix instructions.
|
||||
Size of the Scalar General Purpose Register (SGPR) file. Holds data used in
|
||||
scalar instructions.
|
||||
|
||||
**GFXIP**
|
||||
|
||||
GFXIP (Graphics IP) is a versioning system used by AMD to identify the GPU
|
||||
architecture and its instruction set. It helps categorize different generations
|
||||
of GPUs and their feature sets.
|
||||
|
||||
**GFXIP major version**
|
||||
|
||||
Defines the GPU's core instruction set and architecture, which determines
|
||||
compatibility with software stacks such as HIP and OpenCL. For example, a GFXIP
|
||||
11 major version corresponds to the RDNA 3 (Navi 3x) architecture, influencing
|
||||
driver support and available compute features.
|
||||
|
||||
**GFXIP minor version**
|
||||
|
||||
Represents specific variations within a GFXIP major version and affects feature sets,
|
||||
optimizations, and driver behavior in software stacks such as HIP and OpenCL. Different
|
||||
GPU models within the same major version can have unique capabilities, impacting
|
||||
performance and supported instructions.
|
||||
|
||||
**GCD**
|
||||
|
||||
Graphics Compute Die.
|
||||
|
||||
@@ -40,11 +40,13 @@ subtrees:
|
||||
title: Training
|
||||
subtrees:
|
||||
- entries:
|
||||
- file: how-to/rocm-for-ai/training/train-a-model.rst
|
||||
title: Train a model
|
||||
- file: how-to/rocm-for-ai/training/benchmark-docker/megatron-lm
|
||||
title: Train a model with Megatron-LM
|
||||
- file: how-to/rocm-for-ai/training/benchmark-docker/pytorch-training
|
||||
title: Train a model with PyTorch
|
||||
- file: how-to/rocm-for-ai/training/scale-model-training.rst
|
||||
title: Scale model training
|
||||
|
||||
|
||||
- file: how-to/rocm-for-ai/fine-tuning/index.rst
|
||||
title: Fine-tuning LLMs
|
||||
subtrees:
|
||||
|
||||
@@ -1,3 +1,3 @@
|
||||
rocm-docs-core==1.15.0
|
||||
rocm-docs-core==1.17.0
|
||||
sphinx-reredirects
|
||||
sphinx-sitemap
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with Python 3.10
|
||||
# This file is autogenerated by pip-compile with Python 3.11
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements.in
|
||||
@@ -8,6 +8,8 @@ accessible-pygments==0.0.5
|
||||
# via pydata-sphinx-theme
|
||||
alabaster==1.0.0
|
||||
# via sphinx
|
||||
appnope==0.1.4
|
||||
# via ipykernel
|
||||
asttokens==3.0.0
|
||||
# via stack-data
|
||||
attrs==25.1.0
|
||||
@@ -23,7 +25,7 @@ beautifulsoup4==4.12.3
|
||||
# via pydata-sphinx-theme
|
||||
breathe==4.35.0
|
||||
# via rocm-docs-core
|
||||
certifi==2024.8.30
|
||||
certifi==2024.12.14
|
||||
# via requests
|
||||
cffi==1.17.1
|
||||
# via
|
||||
@@ -37,7 +39,7 @@ click==8.1.7
|
||||
# sphinx-external-toc
|
||||
comm==0.2.2
|
||||
# via ipykernel
|
||||
cryptography==44.0.1
|
||||
cryptography==44.0.0
|
||||
# via pyjwt
|
||||
debugpy==1.8.12
|
||||
# via ipykernel
|
||||
@@ -51,11 +53,9 @@ docutils==0.21.2
|
||||
# myst-parser
|
||||
# pydata-sphinx-theme
|
||||
# sphinx
|
||||
exceptiongroup==1.2.2
|
||||
# via ipython
|
||||
executing==2.2.0
|
||||
# via stack-data
|
||||
fastjsonschema==2.20.0
|
||||
fastjsonschema==2.21.1
|
||||
# via
|
||||
# nbformat
|
||||
# rocm-docs-core
|
||||
@@ -63,8 +63,6 @@ gitdb==4.0.11
|
||||
# via gitpython
|
||||
gitpython==3.1.43
|
||||
# via rocm-docs-core
|
||||
greenlet==3.1.1
|
||||
# via sqlalchemy
|
||||
idna==3.10
|
||||
# via requests
|
||||
imagesize==1.4.1
|
||||
@@ -75,13 +73,13 @@ importlib-metadata==8.6.1
|
||||
# myst-nb
|
||||
ipykernel==6.29.5
|
||||
# via myst-nb
|
||||
ipython==8.31.0
|
||||
ipython==8.32.0
|
||||
# via
|
||||
# ipykernel
|
||||
# myst-nb
|
||||
jedi==0.19.2
|
||||
# via ipython
|
||||
jinja2==3.1.5
|
||||
jinja2==3.1.4
|
||||
# via
|
||||
# myst-parser
|
||||
# sphinx
|
||||
@@ -115,7 +113,7 @@ mdit-py-plugins==0.4.2
|
||||
# via myst-parser
|
||||
mdurl==0.1.2
|
||||
# via markdown-it-py
|
||||
myst-nb==1.1.2
|
||||
myst-nb==1.2.0
|
||||
# via rocm-docs-core
|
||||
myst-parser==4.0.0
|
||||
# via myst-nb
|
||||
@@ -142,7 +140,7 @@ platformdirs==4.3.6
|
||||
# via jupyter-core
|
||||
prompt-toolkit==3.0.50
|
||||
# via ipython
|
||||
psutil==6.1.1
|
||||
psutil==7.0.0
|
||||
# via ipykernel
|
||||
ptyprocess==0.7.0
|
||||
# via pexpect
|
||||
@@ -150,7 +148,7 @@ pure-eval==0.2.3
|
||||
# via stack-data
|
||||
pycparser==2.22
|
||||
# via cffi
|
||||
pydata-sphinx-theme==0.16.0
|
||||
pydata-sphinx-theme==0.16.1
|
||||
# via
|
||||
# rocm-docs-core
|
||||
# sphinx-book-theme
|
||||
@@ -162,7 +160,7 @@ pygments==2.18.0
|
||||
# ipython
|
||||
# pydata-sphinx-theme
|
||||
# sphinx
|
||||
pyjwt[crypto]==2.10.0
|
||||
pyjwt[crypto]==2.10.1
|
||||
# via pygithub
|
||||
pynacl==1.5.0
|
||||
# via pygithub
|
||||
@@ -175,7 +173,7 @@ pyyaml==6.0.2
|
||||
# myst-parser
|
||||
# rocm-docs-core
|
||||
# sphinx-external-toc
|
||||
pyzmq==26.2.0
|
||||
pyzmq==26.2.1
|
||||
# via
|
||||
# ipykernel
|
||||
# jupyter-client
|
||||
@@ -187,7 +185,7 @@ requests==2.32.3
|
||||
# via
|
||||
# pygithub
|
||||
# sphinx
|
||||
rocm-docs-core==1.15.0
|
||||
rocm-docs-core==1.17.0
|
||||
# via -r requirements.in
|
||||
rpds-py==0.22.3
|
||||
# via
|
||||
@@ -241,14 +239,12 @@ sphinxcontrib-qthelp==2.0.0
|
||||
# via sphinx
|
||||
sphinxcontrib-serializinghtml==2.0.0
|
||||
# via sphinx
|
||||
sqlalchemy==2.0.37
|
||||
sqlalchemy==2.0.38
|
||||
# via jupyter-cache
|
||||
stack-data==0.6.3
|
||||
# via ipython
|
||||
tabulate==0.9.0
|
||||
# via jupyter-cache
|
||||
tomli==2.1.0
|
||||
# via sphinx
|
||||
tornado==6.4.2
|
||||
# via
|
||||
# ipykernel
|
||||
|
||||
@@ -68,85 +68,6 @@ set_address_sanitizer_off() {
|
||||
export LDFLAGS=""
|
||||
}
|
||||
|
||||
build_miopen_ckProf() {
|
||||
ENABLE_ADDRESS_SANITIZER=false
|
||||
echo "Start Building Composable Kernel Profiler"
|
||||
if [ "${ENABLE_ADDRESS_SANITIZER}" == "true" ]; then
|
||||
set_asan_env_vars
|
||||
set_address_sanitizer_on
|
||||
else
|
||||
unset_asan_env_vars
|
||||
set_address_sanitizer_off
|
||||
fi
|
||||
|
||||
cd $COMPONENT_SRC
|
||||
cd "$BUILD_DIR"
|
||||
rm -rf *
|
||||
|
||||
architectures='gfx10 gfx11 gfx90 gfx94'
|
||||
if [ -n "$GPU_ARCHS" ]; then
|
||||
architectures=$(echo ${GPU_ARCHS} | awk -F';' '{for(i=1;i<=NF;i++) a[substr($i,1,5)]} END{for(i in a) printf i" "}')
|
||||
fi
|
||||
|
||||
for arch in ${architectures}
|
||||
do
|
||||
if [ "${ASAN_CMAKE_PARAMS}" == "true" ] ; then
|
||||
cmake -DBUILD_DEV=OFF \
|
||||
-DCMAKE_PREFIX_PATH="${ROCM_PATH%-*}/lib/cmake;${ROCM_PATH%-*}/$ASAN_LIBDIR;${ROCM_PATH%-*}/llvm;${ROCM_PATH%-*}" \
|
||||
-DCMAKE_BUILD_TYPE=${BUILD_TYPE:-'RelWithDebInfo'} \
|
||||
-DCMAKE_SHARED_LINKER_FLAGS_INIT="-Wl,--enable-new-dtags,--rpath,$ROCM_ASAN_LIB_RPATH" \
|
||||
-DCMAKE_EXE_LINKER_FLAGS_INIT="-Wl,--enable-new-dtags,--rpath,$ROCM_ASAN_EXE_RPATH" \
|
||||
-DCMAKE_VERBOSE_MAKEFILE=1 \
|
||||
-DCMAKE_INSTALL_RPATH_USE_LINK_PATH=FALSE \
|
||||
-DCMAKE_INSTALL_PREFIX="${ROCM_PATH}" \
|
||||
-DCMAKE_PACKAGING_INSTALL_PREFIX="${ROCM_PATH}" \
|
||||
-DBUILD_FILE_REORG_BACKWARD_COMPATIBILITY=OFF \
|
||||
-DROCM_SYMLINK_LIBS=OFF \
|
||||
-DCPACK_PACKAGING_INSTALL_PREFIX="${ROCM_PATH}" \
|
||||
-DROCM_DISABLE_LDCONFIG=ON \
|
||||
-DROCM_PATH="${ROCM_PATH}" \
|
||||
-DCPACK_GENERATOR="${PKGTYPE^^}" \
|
||||
-DCMAKE_CXX_COMPILER="${ROCM_PATH}/llvm/bin/clang++" \
|
||||
-DCMAKE_C_COMPILER="${ROCM_PATH}/llvm/bin/clang" \
|
||||
${LAUNCHER_FLAGS} \
|
||||
-DPROFILER_ONLY=ON \
|
||||
-DENABLE_ASAN_PACKAGING=true \
|
||||
-DGPU_ARCH="${arch}" \
|
||||
"$COMPONENT_SRC"
|
||||
else
|
||||
cmake -DBUILD_DEV=OFF \
|
||||
-DCMAKE_PREFIX_PATH="${ROCM_PATH%-*}" \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DCMAKE_SHARED_LINKER_FLAGS_INIT='-Wl,--enable-new-dtags,--rpath,$ORIGIN' \
|
||||
-DCMAKE_EXE_LINKER_FLAGS_INIT='-Wl,--enable-new-dtags,--rpath,$ORIGIN/../lib' \
|
||||
-DCMAKE_VERBOSE_MAKEFILE=1 \
|
||||
-DCMAKE_INSTALL_RPATH_USE_LINK_PATH=FALSE \
|
||||
-DCMAKE_INSTALL_PREFIX="${ROCM_PATH}" \
|
||||
-DCMAKE_PACKAGING_INSTALL_PREFIX="${ROCM_PATH}" \
|
||||
-DBUILD_FILE_REORG_BACKWARD_COMPATIBILITY=OFF \
|
||||
-DROCM_SYMLINK_LIBS=OFF \
|
||||
-DCPACK_PACKAGING_INSTALL_PREFIX="${ROCM_PATH}" \
|
||||
-DROCM_DISABLE_LDCONFIG=ON \
|
||||
-DROCM_PATH="${ROCM_PATH}" \
|
||||
-DCPACK_GENERATOR="${PKGTYPE^^}" \
|
||||
-DCMAKE_CXX_COMPILER="${ROCM_PATH}/llvm/bin/clang++" \
|
||||
-DCMAKE_C_COMPILER="${ROCM_PATH}/llvm/bin/clang" \
|
||||
${LAUNCHER_FLAGS} \
|
||||
-DPROFILER_ONLY=ON \
|
||||
-DGPU_ARCH="${arch}" \
|
||||
"$COMPONENT_SRC"
|
||||
fi
|
||||
|
||||
cmake --build . -- -j${PROC} package
|
||||
cp ./*ckprofiler*.${PKGTYPE} $PACKAGE_DIR
|
||||
rm -rf *
|
||||
done
|
||||
rm -rf _CPack_Packages/ && find -name '*.o' -delete
|
||||
|
||||
echo "Finished building Composable Kernel"
|
||||
show_build_cache_stats
|
||||
}
|
||||
|
||||
clean_miopen_ck() {
|
||||
echo "Cleaning MIOpen-CK build directory: ${BUILD_DIR} ${PACKAGE_DIR}"
|
||||
rm -rf "$BUILD_DIR" "$PACKAGE_DIR"
|
||||
|
||||
@@ -42,7 +42,6 @@ DEB_PATH="$(getDebPath $PROJ_NAME)"
|
||||
RPM_PATH="$(getRpmPath $PROJ_NAME)"
|
||||
INSTALL_PATH="${ROCM_INSTALL_PATH}/lib/llvm"
|
||||
LLVM_ROOT_LCL="${LLVM_ROOT}"
|
||||
ROCM_WHEEL_DIR="${BUILD_PATH}/_wheel"
|
||||
|
||||
TARGET="all"
|
||||
MAKEOPTS="$DASH_JAY"
|
||||
@@ -150,7 +149,6 @@ ENABLE_RUNTIMES="$ENABLE_RUNTIMES;libcxx;libcxxabi"
|
||||
BOOTSTRAPPING_BUILD_LIBCXX=1
|
||||
|
||||
clean_lightning() {
|
||||
rm -rf "$ROCM_WHEEL_DIR"
|
||||
rm -rf "$BUILD_PATH"
|
||||
rm -rf "$DEB_PATH"
|
||||
rm -rf "$RPM_PATH"
|
||||
@@ -332,15 +330,6 @@ build_lightning() {
|
||||
echo "End Workaround for race condition"
|
||||
cmake --build . -- $MAKEOPTS
|
||||
|
||||
case "$DISTRO_ID" in
|
||||
(rhel*|centos*)
|
||||
RHEL_BUILD=1
|
||||
;;
|
||||
(*)
|
||||
RHEL_BUILD=0
|
||||
;;
|
||||
esac
|
||||
|
||||
if [ $SKIP_LIT_TESTS -eq 0 ]; then
|
||||
if [ $RHEL_BUILD -eq 1 ]; then
|
||||
cmake --build . -- $MAKEOPTS check-lld check-mlir
|
||||
@@ -1158,9 +1147,4 @@ case $TARGET in
|
||||
(*) die "Invalid target $TARGET" ;;
|
||||
esac
|
||||
|
||||
if [[ $WHEEL_PACKAGE == true ]]; then
|
||||
echo "Wheel Package build started !!!!"
|
||||
create_wheel_package
|
||||
fi
|
||||
|
||||
echo "Operation complete"
|
||||
|
||||
@@ -1,171 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
source "$(dirname "${BASH_SOURCE}")/compute_utils.sh"
|
||||
|
||||
printUsage() {
|
||||
echo
|
||||
echo "Usage: ${BASH_SOURCE##*/} [options ...]"
|
||||
echo
|
||||
echo "Options:"
|
||||
echo " -c, --clean Clean output and delete all intermediate work"
|
||||
echo " -s, --static Build static lib (.a). build instead of dynamic/shared(.so) "
|
||||
echo " -p, --package <type> Specify packaging format"
|
||||
echo " -r, --release Make a release build instead of a debug build"
|
||||
echo " -a, --address_sanitizer Enable address sanitizer"
|
||||
echo " -o, --outdir <pkg_type> Print path of output directory containing packages of
|
||||
type referred to by pkg_type"
|
||||
echo " -w, --wheel Creates python wheel package of omniperf.
|
||||
It needs to be used along with -r option"
|
||||
echo " -h, --help Prints this help"
|
||||
echo
|
||||
echo "Possible values for <type>:"
|
||||
echo " deb -> Debian format (default)"
|
||||
echo " rpm -> RPM format"
|
||||
echo
|
||||
|
||||
return 0
|
||||
}
|
||||
|
||||
API_NAME="omniperf"
|
||||
PROJ_NAME="$API_NAME"
|
||||
LIB_NAME="lib${API_NAME}"
|
||||
TARGET="build"
|
||||
MAKETARGET="deb"
|
||||
PACKAGE_ROOT="$(getPackageRoot)"
|
||||
PACKAGE_LIB="$(getLibPath)"
|
||||
BUILD_DIR="$(getBuildPath $API_NAME)"
|
||||
PACKAGE_DEB="$(getPackageRoot)/deb/$API_NAME"
|
||||
PACKAGE_RPM="$(getPackageRoot)/rpm/$API_NAME"
|
||||
ROCM_WHEEL_DIR="${BUILD_DIR}/_wheel"
|
||||
BUILD_TYPE="Debug"
|
||||
MAKE_OPTS="$DASH_JAY -C $BUILD_DIR"
|
||||
SHARED_LIBS="ON"
|
||||
CLEAN_OR_OUT=0;
|
||||
MAKETARGET="deb"
|
||||
PKGTYPE="deb"
|
||||
WHEEL_PACKAGE=false
|
||||
|
||||
|
||||
#parse the arguments
|
||||
VALID_STR=$(getopt -o hcraso:p:w --long help,clean,release,static,address_sanitizer,outdir:,package:,wheel -- "$@")
|
||||
eval set -- "$VALID_STR"
|
||||
|
||||
while true ;
|
||||
do
|
||||
case "$1" in
|
||||
-h | --help)
|
||||
printUsage ; exit 0;;
|
||||
-c | --clean)
|
||||
TARGET="clean" ; ((CLEAN_OR_OUT|=1)) ; shift ;;
|
||||
-r | --release)
|
||||
BUILD_TYPE="Release" ; shift ;;
|
||||
-a | --address_sanitizer)
|
||||
set_asan_env_vars
|
||||
set_address_sanitizer_on ; shift ;;
|
||||
-s | --static)
|
||||
SHARED_LIBS="OFF" ; shift ;;
|
||||
-o | --outdir)
|
||||
TARGET="outdir"; PKGTYPE=$2 ; OUT_DIR_SPECIFIED=1 ; ((CLEAN_OR_OUT|=2)) ; shift 2 ;;
|
||||
-p | --package)
|
||||
MAKETARGET="$2" ; shift 2 ;;
|
||||
-w | --wheel)
|
||||
WHEEL_PACKAGE=true ; shift ;;
|
||||
--) shift; break;; # end delimiter
|
||||
*)
|
||||
echo " This should never come but just incase : UNEXPECTED ERROR Parm : [$1] ">&2 ; exit 20;;
|
||||
esac
|
||||
|
||||
done
|
||||
|
||||
RET_CONFLICT=1
|
||||
check_conflicting_options "$CLEAN_OR_OUT" "$PKGTYPE" "$MAKETARGET"
|
||||
if [ $RET_CONFLICT -ge 30 ]; then
|
||||
print_vars "$API_NAME" "$TARGET" "$BUILD_TYPE" "$SHARED_LIBS" "$CLEAN_OR_OUT" "$PKGTYPE" "$MAKETARGET"
|
||||
exit $RET_CONFLICT
|
||||
fi
|
||||
|
||||
clean() {
|
||||
echo "Cleaning $PROJ_NAME"
|
||||
rm -rf "$ROCM_WHEEL_DIR"
|
||||
rm -rf "$BUILD_DIR"
|
||||
rm -rf "$PACKAGE_DEB"
|
||||
rm -rf "$PACKAGE_RPM"
|
||||
rm -rf "$PACKAGE_ROOT/${PROJ_NAME:?}"
|
||||
rm -rf "$PACKAGE_LIB/${LIB_NAME:?}"*
|
||||
}
|
||||
|
||||
build() {
|
||||
echo "Building $PROJ_NAME"
|
||||
if [ "$DISTRO_ID" = centos-7 ]; then
|
||||
echo "Skip make and uploading packages for Omniperf on Centos7 distro, due to python dependency"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
if [ ! -d "$BUILD_DIR" ]; then
|
||||
mkdir -p "$BUILD_DIR"
|
||||
pushd "$BUILD_DIR" || exit
|
||||
|
||||
echo "ROCm CMake Params: $(rocm_cmake_params)"
|
||||
echo "ROCm Common CMake Params: $(rocm_common_cmake_params)"
|
||||
|
||||
print_lib_type $SHARED_LIBS
|
||||
cmake \
|
||||
$(rocm_cmake_params) \
|
||||
$(rocm_common_cmake_params) \
|
||||
-DCHECK_PYTHON_DEPS=NO \
|
||||
-DPYTHON_DEPS=${BUILD_DIR}/python-libs \
|
||||
-DMOD_INSTALL_PATH=${BUILD_DIR}/modulefiles \
|
||||
"$OMNIPERF_ROOT"
|
||||
fi
|
||||
|
||||
make $MAKE_OPTS
|
||||
make $MAKE_OPTS install
|
||||
make $MAKE_OPTS package
|
||||
|
||||
copy_if DEB "${CPACKGEN:-"DEB;RPM"}" "$PACKAGE_DEB" "$BUILD_DIR/${API_NAME}"*.deb
|
||||
copy_if RPM "${CPACKGEN:-"DEB;RPM"}" "$PACKAGE_RPM" "$BUILD_DIR/${API_NAME}"*.rpm
|
||||
}
|
||||
|
||||
create_wheel_package() {
|
||||
echo "Creating Omniperf wheel package"
|
||||
|
||||
# Copy the setup.py generator to build folder
|
||||
mkdir -p "$ROCM_WHEEL_DIR"
|
||||
cp -f "$SCRIPT_ROOT"/generate_setup_py.py "$ROCM_WHEEL_DIR"
|
||||
cp -f "$SCRIPT_ROOT"/repackage_wheel.sh "$ROCM_WHEEL_DIR"
|
||||
cd "$ROCM_WHEEL_DIR" || exit
|
||||
|
||||
# Currently only supports python3.6
|
||||
./repackage_wheel.sh "$BUILD_DIR"/*.rpm python3.6
|
||||
|
||||
# Copy the wheel created to RPM folder which will be uploaded to artifactory
|
||||
copy_if WHL "WHL" "$PACKAGE_RPM" "$ROCM_WHEEL_DIR"/dist/*.whl
|
||||
}
|
||||
|
||||
print_output_directory() {
|
||||
case ${PKGTYPE} in
|
||||
("deb")
|
||||
echo "${PACKAGE_DEB}";;
|
||||
("rpm")
|
||||
echo "${PACKAGE_RPM}";;
|
||||
(*)
|
||||
echo "Invalid package type \"${PKGTYPE}\" provided for -o" >&2; exit 1;;
|
||||
esac
|
||||
exit
|
||||
}
|
||||
|
||||
verifyEnvSetup
|
||||
|
||||
case "$TARGET" in
|
||||
(clean) clean ;;
|
||||
(build) build ;;
|
||||
(outdir) print_output_directory ;;
|
||||
(*) die "Invalid target $TARGET" ;;
|
||||
esac
|
||||
|
||||
if [[ $WHEEL_PACKAGE == true ]]; then
|
||||
echo "Wheel Package build started !!!!"
|
||||
create_wheel_package
|
||||
fi
|
||||
|
||||
echo "Operation complete"
|
||||
@@ -1,191 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
source "$(dirname "${BASH_SOURCE}")/compute_utils.sh"
|
||||
|
||||
printUsage() {
|
||||
echo
|
||||
echo "Usage: ${BASH_SOURCE##*/} [options ...]"
|
||||
echo
|
||||
echo "Options:"
|
||||
echo " -c, --clean Clean output and delete all intermediate work"
|
||||
echo " -s, --static Build static lib (.a). build instead of dynamic/shared(.so) "
|
||||
echo " -p, --package <type> Specify packaging format"
|
||||
echo " -r, --release Make a release build instead of a debug build"
|
||||
echo " -a, --address_sanitizer Enable address sanitizer"
|
||||
echo " -o, --outdir <pkg_type> Print path of output directory containing packages of
|
||||
type referred to by pkg_type"
|
||||
echo " -w, --wheel Creates python wheel package of omnitrace.
|
||||
It needs to be used along with -r option"
|
||||
echo " -h, --help Prints this help"
|
||||
echo
|
||||
echo "Possible values for <type>:"
|
||||
echo " deb -> Debian format (default)"
|
||||
echo " rpm -> RPM format"
|
||||
echo
|
||||
|
||||
return 0
|
||||
}
|
||||
|
||||
API_NAME="omnitrace"
|
||||
PROJ_NAME="$API_NAME"
|
||||
LIB_NAME="lib${API_NAME}"
|
||||
TARGET="build"
|
||||
MAKETARGET="deb"
|
||||
PACKAGE_ROOT="$(getPackageRoot)"
|
||||
PACKAGE_LIB="$(getLibPath)"
|
||||
BUILD_DIR="$(getBuildPath $API_NAME)"
|
||||
PACKAGE_DEB="$(getPackageRoot)/deb/$API_NAME"
|
||||
PACKAGE_RPM="$(getPackageRoot)/rpm/$API_NAME"
|
||||
BUILD_TYPE="Debug"
|
||||
MAKE_OPTS="-j 8"
|
||||
SHARED_LIBS="ON"
|
||||
CLEAN_OR_OUT=0
|
||||
MAKETARGET="deb"
|
||||
PKGTYPE="deb"
|
||||
ASAN=0
|
||||
|
||||
#parse the arguments
|
||||
VALID_STR=$(getopt -o hcraso:p:w --long help,clean,release,address_sanitizer,static,outdir:,package:,wheel -- "$@")
|
||||
eval set -- "$VALID_STR"
|
||||
|
||||
while true; do
|
||||
case "$1" in
|
||||
-h | --help)
|
||||
printUsage
|
||||
exit 0
|
||||
;;
|
||||
-c | --clean)
|
||||
TARGET="clean"
|
||||
((CLEAN_OR_OUT |= 1))
|
||||
shift
|
||||
;;
|
||||
-r | --release)
|
||||
BUILD_TYPE="RelWithDebInfo"
|
||||
shift
|
||||
;;
|
||||
-a | --address_sanitizer)
|
||||
ack_and_ignore_asan
|
||||
|
||||
ASAN=1
|
||||
shift
|
||||
;;
|
||||
-s | --static)
|
||||
SHARED_LIBS="OFF"
|
||||
shift
|
||||
;;
|
||||
-o | --outdir)
|
||||
TARGET="outdir"
|
||||
PKGTYPE=$2
|
||||
((CLEAN_OR_OUT |= 2))
|
||||
shift 2
|
||||
;;
|
||||
-p | --package)
|
||||
MAKETARGET="$2"
|
||||
shift 2
|
||||
;;
|
||||
-w | --wheel)
|
||||
echo "omnitrace: wheel build option accepted and ignored"
|
||||
shift
|
||||
;;
|
||||
--)
|
||||
shift
|
||||
break
|
||||
;;
|
||||
*)
|
||||
echo " This should never come but just incase : UNEXPECTED ERROR Parm : [$1] " >&2
|
||||
exit 20
|
||||
;;
|
||||
esac
|
||||
|
||||
done
|
||||
|
||||
RET_CONFLICT=1
|
||||
check_conflicting_options $CLEAN_OR_OUT $PKGTYPE $MAKETARGET
|
||||
if [ $RET_CONFLICT -ge 30 ]; then
|
||||
print_vars $API_NAME $TARGET $BUILD_TYPE $SHARED_LIBS $CLEAN_OR_OUT $PKGTYPE $MAKETARGET
|
||||
exit $RET_CONFLICT
|
||||
fi
|
||||
|
||||
clean() {
|
||||
echo "Cleaning $PROJ_NAME"
|
||||
rm -rf "$BUILD_DIR"
|
||||
rm -rf "$PACKAGE_DEB"
|
||||
rm -rf "$PACKAGE_RPM"
|
||||
rm -rf "$PACKAGE_ROOT/${PROJ_NAME:?}"
|
||||
rm -rf "$PACKAGE_LIB/${LIB_NAME:?}"*
|
||||
}
|
||||
|
||||
build_omnitrace() {
|
||||
echo "Building $PROJ_NAME"
|
||||
if [ "$DISTRO_ID" = "mariner-2.0" ] || [ "$DISTRO_ID" = "ubuntu-24.04" ] || [ "$DISTRO_ID" = "azurelinux-3.0" ]; then
|
||||
echo "Skip make and uploading packages for Omnitrace on \"${DISTRO_ID}\" distro"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
if [ $ASAN == 1 ]; then
|
||||
echo "Skip make and uploading packages for Omnitrace on ASAN build"
|
||||
exit 0
|
||||
fi
|
||||
if [ ! -d "$BUILD_DIR" ]; then
|
||||
mkdir -p "$BUILD_DIR"
|
||||
echo "Created build directory: $BUILD_DIR"
|
||||
fi
|
||||
|
||||
echo "Build directory: $BUILD_DIR"
|
||||
pushd "$BUILD_DIR" || exit
|
||||
print_lib_type $SHARED_LIBS
|
||||
|
||||
echo "ROCm CMake Params: $(rocm_cmake_params)"
|
||||
echo "ROCm Common CMake Params: $(rocm_common_cmake_params)"
|
||||
|
||||
|
||||
if [ $ASAN == 1 ]; then
|
||||
echo "Address Sanitizer path"
|
||||
|
||||
else
|
||||
cmake \
|
||||
$(rocm_cmake_params) \
|
||||
$(rocm_common_cmake_params) \
|
||||
-DOMNITRACE_BUILD_{LIBUNWIND,DYNINST}=ON \
|
||||
-DDYNINST_BUILD_{TBB,BOOST,ELFUTILS,LIBIBERTY}=ON \
|
||||
"$OMNITRACE_ROOT"
|
||||
fi
|
||||
|
||||
|
||||
popd || exit
|
||||
|
||||
echo "Make Options: $MAKE_OPTS"
|
||||
cmake --build "$BUILD_DIR" --target all -- $MAKE_OPTS
|
||||
cmake --build "$BUILD_DIR" --target install -- $MAKE_OPTS
|
||||
cmake --build "$BUILD_DIR" --target package -- $MAKE_OPTS
|
||||
|
||||
copy_if DEB "${CPACKGEN:-"DEB;RPM"}" "$PACKAGE_DEB" "$BUILD_DIR/${API_NAME}"*.deb
|
||||
copy_if RPM "${CPACKGEN:-"DEB;RPM"}" "$PACKAGE_RPM" "$BUILD_DIR/${API_NAME}"*.rpm
|
||||
}
|
||||
|
||||
print_output_directory() {
|
||||
case ${PKGTYPE} in
|
||||
"deb")
|
||||
echo "${PACKAGE_DEB}"
|
||||
;;
|
||||
"rpm")
|
||||
echo "${PACKAGE_RPM}"
|
||||
;;
|
||||
*)
|
||||
echo "Invalid package type \"${PKGTYPE}\" provided for -o" >&2
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
exit
|
||||
}
|
||||
|
||||
verifyEnvSetup
|
||||
|
||||
case "$TARGET" in
|
||||
clean) clean ;;
|
||||
build) build_omnitrace ;;
|
||||
outdir) print_output_directory ;;
|
||||
*) die "Invalid target $TARGET" ;;
|
||||
esac
|
||||
|
||||
echo "Operation complete"
|
||||
@@ -1,141 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
source "$(dirname "${BASH_SOURCE}")/compute_utils.sh"
|
||||
PROJ_NAME=OpenCL-ICD-Loader
|
||||
TARGET="build"
|
||||
MAKEOPTS="$DASH_JAY"
|
||||
BUILD_TYPE="Debug"
|
||||
PACKAGE_ROOT="$(getPackageRoot)"
|
||||
PACKAGE_DEB="$PACKAGE_ROOT/deb/${PROJ_NAME,,}"
|
||||
PACKAGE_RPM="$PACKAGE_ROOT/rpm/${PROJ_NAME,,}"
|
||||
CLEAN_OR_OUT=0;
|
||||
PKGTYPE="deb"
|
||||
MAKETARGET="deb"
|
||||
API_NAME="rocm-opencl-icd-loader"
|
||||
|
||||
printUsage() {
|
||||
echo
|
||||
echo "Usage: $(basename "${BASH_SOURCE}") [options ...]"
|
||||
echo
|
||||
echo "Options:"
|
||||
echo " -c, --clean Clean output and delete all intermediate work"
|
||||
echo " -p, --package <type> Specify packaging format"
|
||||
echo " -r, --release Make a release build instead of a debug build"
|
||||
echo " -h, --help Prints this help"
|
||||
echo " -o, --outdir Print path of output directory containing packages"
|
||||
echo " -s, --static Component/Build does not support static builds just accepting this param & ignore. No effect of the param on this build"
|
||||
echo
|
||||
echo "Possible values for <type>:"
|
||||
echo " deb -> Debian format (default)"
|
||||
echo " rpm -> RPM format"
|
||||
echo
|
||||
return 0
|
||||
}
|
||||
|
||||
RET_CONFLICT=1
|
||||
check_conflicting_options $CLEAN_OR_OUT $PKGTYPE $MAKETARGET
|
||||
if [ $RET_CONFLICT -ge 30 ]; then
|
||||
print_vars $TARGET $BUILD_TYPE $CLEAN_OR_OUT $PKGTYPE $MAKETARGET
|
||||
exit $RET_CONFLICT
|
||||
fi
|
||||
|
||||
clean_opencl_icd_loader() {
|
||||
echo "Cleaning $PROJ_NAME"
|
||||
rm -rf "$PACKAGE_DEB"
|
||||
rm -rf "$PACKAGE_RPM"
|
||||
rm -rf "$PACKAGE_ROOT/${PROJ_NAME,,}"
|
||||
}
|
||||
|
||||
copy_pkg_files_to_rocm() {
|
||||
local comp_folder=$1
|
||||
local comp_pkg_name=$2
|
||||
|
||||
cd "${OUT_DIR}/${PKGTYPE}/${comp_folder}"|| exit 2
|
||||
if [ "${PKGTYPE}" = 'deb' ]; then
|
||||
dpkg-deb -x ${comp_pkg_name}_*.deb pkg/
|
||||
else
|
||||
mkdir pkg && pushd pkg/ || exit 2
|
||||
if [[ "${comp_pkg_name}" != *-dev* ]]; then
|
||||
rpm2cpio ../${comp_pkg_name}-*.rpm | cpio -idmv
|
||||
else
|
||||
rpm2cpio ../${comp_pkg_name}el-*.rpm | cpio -idmv
|
||||
fi
|
||||
popd || exit 2
|
||||
fi
|
||||
ls ./pkg -alt
|
||||
cp -r ./pkg/*/rocm*/* "${ROCM_PATH}" || exit 2
|
||||
rm -rf pkg/
|
||||
}
|
||||
|
||||
build_opencl_icd_loader() {
|
||||
echo "Downloading $PROJ_NAME" package
|
||||
if [ "$DISTRO_NAME" = ubuntu ]; then
|
||||
mkdir -p "$PACKAGE_DEB"
|
||||
local rocm_ver=${ROCM_VERSION}
|
||||
if [ ${ROCM_VERSION##*.} = 0 ]; then
|
||||
rocm_ver=${ROCM_VERSION%.*}
|
||||
fi
|
||||
local url="https://repo.radeon.com/rocm/apt/${rocm_ver}/pool/main/r/${API_NAME}/"
|
||||
local package
|
||||
package=$(curl -s "$url" | grep -Po 'href="\K[^"]*' | grep "${DISTRO_RELEASE}" | head -n 1)
|
||||
|
||||
if [ -z "$package" ]; then
|
||||
echo "No package found for Ubuntu version $DISTRO_RELEASE"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
wget -t3 -P "$PACKAGE_DEB" "${url}${package}"
|
||||
copy_pkg_files_to_rocm ${PROJ_NAME,,} ${API_NAME}
|
||||
else
|
||||
echo "$DISTRO_ID is not supported..."
|
||||
exit 2
|
||||
fi
|
||||
|
||||
echo "Installing $PROJ_NAME" package
|
||||
}
|
||||
|
||||
print_output_directory() {
|
||||
case ${PKGTYPE} in
|
||||
("deb")
|
||||
echo ${PACKAGE_DEB};;
|
||||
("rpm")
|
||||
echo ${PACKAGE_RPM};;
|
||||
(*)
|
||||
echo "Invalid package type \"${PKGTYPE}\" provided for -o" >&2; exit 1;;
|
||||
esac
|
||||
exit
|
||||
}
|
||||
|
||||
VALID_STR=`getopt -o hcraswlo:p: --long help,clean,release,outdir:,package: -- "$@"`
|
||||
eval set -- "$VALID_STR"
|
||||
while true ;
|
||||
do
|
||||
case "$1" in
|
||||
(-c | --clean )
|
||||
TARGET="clean" ; ((CLEAN_OR_OUT|=1)) ; shift ;;
|
||||
(-r | --release )
|
||||
BUILD_TYPE="RelWithDebInfo" ; shift ;;
|
||||
(-h | --help )
|
||||
printUsage ; exit 0 ;;
|
||||
(-a | --address_sanitizer)
|
||||
ack_and_ignore_asan ; shift ;;
|
||||
(-o | --outdir)
|
||||
TARGET="outdir"; PKGTYPE=$2 ; OUT_DIR_SPECIFIED=1 ; ((CLEAN_OR_OUT|=2)) ; shift 2 ;;
|
||||
(-p | --package)
|
||||
MAKETARGET="$2" ; shift 2;;
|
||||
(-s | --static)
|
||||
echo "-s parameter accepted but ignored" ; shift ;;
|
||||
--) shift; break;;
|
||||
(*)
|
||||
echo " This should never come but just incase : UNEXPECTED ERROR Parm : [$1] ">&2 ; exit 20;;
|
||||
esac
|
||||
done
|
||||
|
||||
case $TARGET in
|
||||
(clean) clean_opencl_icd_loader ;;
|
||||
(build) build_opencl_icd_loader ;;
|
||||
(outdir) print_output_directory ;;
|
||||
(*) die "Invalid target $TARGET" ;;
|
||||
esac
|
||||
|
||||
echo "Operation complete"
|
||||
@@ -32,7 +32,6 @@ ROCM_CMAKE_BUILD_DIR="$(getBuildPath rocm-cmake)"
|
||||
ROCM_CMAKE_BUILD_DIR="$(getBuildPath rocm-cmake)"
|
||||
ROCM_CMAKE_PACKAGE_DEB="$(getPackageRoot)/deb/rocm-cmake"
|
||||
ROCM_CMAKE_PACKAGE_RPM="$(getPackageRoot)/rpm/rocm-cmake"
|
||||
ROCM_WHEEL_DIR="${ROCM_CMAKE_BUILD_DIR}/_wheel"
|
||||
ROCM_CMAKE_BUILD_TYPE="debug"
|
||||
BUILD_TYPE="Debug"
|
||||
SHARED_LIBS="ON"
|
||||
@@ -56,8 +55,6 @@ do
|
||||
ack_and_ignore_asan ; shift ;;
|
||||
(-s | --static)
|
||||
SHARED_LIBS="OFF" ; shift ;;
|
||||
(-w | --wheel)
|
||||
WHEEL_PACKAGE=true ; shift ;;
|
||||
(-o | --outdir)
|
||||
TARGET="outdir"; PKGTYPE=$2 ; OUT_DIR_SPECIFIED=1 ; ((CLEAN_OR_OUT|=2)) ; shift 2 ;;
|
||||
(-p | --package)
|
||||
@@ -78,7 +75,6 @@ fi
|
||||
|
||||
|
||||
clean_rocm_cmake() {
|
||||
rm -rf "$ROCM_WHEEL_DIR"
|
||||
rm -rf $ROCM_CMAKE_BUILD_DIR
|
||||
rm -rf $ROCM_CMAKE_PACKAGE_DEB
|
||||
rm -rf $ROCM_CMAKE_PACKAGE_RPM
|
||||
@@ -106,19 +102,6 @@ build_rocm_cmake() {
|
||||
copy_if RPM "${CPACKGEN:-"DEB;RPM"}" "$ROCM_CMAKE_PACKAGE_RPM" $ROCM_CMAKE_BUILD_DIR/rocm-cmake*.rpm
|
||||
}
|
||||
|
||||
create_wheel_package() {
|
||||
echo "Creating rocm-cmake wheel package"
|
||||
# Copy the setup.py generator to build folder
|
||||
mkdir -p $ROCM_WHEEL_DIR
|
||||
cp -f $SCRIPT_ROOT/generate_setup_py.py $ROCM_WHEEL_DIR
|
||||
cp -f $SCRIPT_ROOT/repackage_wheel.sh $ROCM_WHEEL_DIR
|
||||
cd $ROCM_WHEEL_DIR
|
||||
# Currently only supports python3.6
|
||||
./repackage_wheel.sh $ROCM_CMAKE_BUILD_DIR/rocm-cmake*.rpm python3.6
|
||||
# Copy the wheel created to RPM folder which will be uploaded to artifactory
|
||||
copy_if WHL "WHL" "$ROCM_CMAKE_PACKAGE_RPM" "$ROCM_WHEEL_DIR"/dist/*.whl
|
||||
}
|
||||
|
||||
print_output_directory() {
|
||||
case ${PKGTYPE} in
|
||||
("deb")
|
||||
@@ -138,9 +121,4 @@ case $TARGET in
|
||||
(*) die "Invalid target $TARGET" ;;
|
||||
esac
|
||||
|
||||
if [[ $WHEEL_PACKAGE == true ]]; then
|
||||
echo "Wheel Package build started !!!!"
|
||||
create_wheel_package
|
||||
fi
|
||||
|
||||
echo "Operation complete"
|
||||
|
||||
@@ -7,7 +7,6 @@ bison
|
||||
bridge-utils
|
||||
build-essential
|
||||
bzip2
|
||||
ccache
|
||||
check
|
||||
chrpath
|
||||
cifs-utils
|
||||
@@ -121,11 +120,9 @@ python3-yaml
|
||||
python3.8-dev
|
||||
re2c
|
||||
redis-tools
|
||||
# Eventually we should be able to remove rpm for debian builds.
|
||||
rpm
|
||||
rsync
|
||||
ssh
|
||||
# This makes life more pleasent inside the container
|
||||
strace
|
||||
sudo
|
||||
systemtap-sdt-dev
|
||||
|
||||
@@ -1,285 +0,0 @@
|
||||
#! /usr/bin/bash
|
||||
|
||||
set -x
|
||||
|
||||
apt-get -y update
|
||||
DEBIAN_FRONTEND=noninteractive DEBCONF_NONINTERACTIVE_SEEN=true apt-get install --no-install-recommends -y $(sed 's/#.*//' /tmp/packages)
|
||||
apt-get clean
|
||||
rm -rf /var/cache/apt/ /var/lib/apt/lists/* /etc/apt/apt.conf.d/01proxy
|
||||
|
||||
#Install 2.17.1 version of git as we are seeing issues with 2.25 , where it was not allowing to add git submodules if the user is different for parent git directory
|
||||
curl -o git.tar.gz https://cdn.kernel.org/pub/software/scm/git/git-2.17.1.tar.gz
|
||||
tar -zxf git.tar.gz
|
||||
cd git-*
|
||||
make prefix=/usr/local all
|
||||
make prefix=/usr/local install
|
||||
git --version
|
||||
|
||||
#install argparse and CppHeaderParser python modules for roctracer and rocprofiler
|
||||
#install rocm-docs-core for the docs-as-code project. Only needed on one OS
|
||||
# CppHeader needs setuptools. setuptools needs wheel.
|
||||
# Looks like I need them as seperate commands
|
||||
# Sigh, install both python2 and python 3 version
|
||||
pip3 install --no-cache-dir setuptools wheel tox
|
||||
pip3 install --no-cache-dir CppHeaderParser argparse requests lxml barectf recommonmark jinja2==3.0.0 websockets matplotlib numpy scipy minimal msgpack pytest sphinx joblib PyYAML rocm-docs-core cmake==3.25.2 pandas myst-parser
|
||||
|
||||
# Allow sudo for everyone user
|
||||
echo 'ALL ALL=(ALL) NOPASSWD:ALL' > /etc/sudoers.d/everyone
|
||||
|
||||
# Install OCaml packages to build LLVM's OCaml bindings to be used in lightning compiler test pipeline
|
||||
wget -nv https://sourceforge.net/projects/opam.mirror/files/2.1.4/opam-2.1.4-x86_64-linux -O /usr/local/bin/opam
|
||||
chmod +x /usr/local/bin/opam
|
||||
opam init --yes --disable-sandboxing
|
||||
opam install ctypes --yes
|
||||
|
||||
# Install and modify git-repo (#!/usr/bin/env python -> #!/usr/bin/env python3)
|
||||
curl https://storage.googleapis.com/git-repo-downloads/repo > /usr/bin/repo
|
||||
chmod a+x /usr/bin/repo
|
||||
|
||||
# Build ccache from the source
|
||||
cd /tmp
|
||||
git clone https://github.com/ccache/ccache -b v4.7.5
|
||||
cd ccache
|
||||
mkdir build
|
||||
cd build
|
||||
cmake -DCMAKE_BUILD_TYPE=Release ..
|
||||
make
|
||||
make install
|
||||
cd /tmp
|
||||
rm -rf ccache
|
||||
|
||||
# Install sharp from MLNX_OFED_LINUX as dependency for rccl-rdma-sharp-plugins
|
||||
cd /var/tmp
|
||||
mkdir mlnx
|
||||
wget -O mlnx/tar.tgz https://content.mellanox.com/ofed/MLNX_OFED-24.01-0.3.3.1/MLNX_OFED_LINUX-24.01-0.3.3.1-ubuntu22.04-x86_64.tgz
|
||||
tar -xz -C mlnx -f mlnx/tar.tgz
|
||||
apt-key add mlnx/*/RPM-GPG-KEY-Mellanox
|
||||
echo "deb [arch=amd64] file:$(echo $PWD/mlnx/*/DEBS) ./" > /etc/apt/sources.list.d/sharp.list
|
||||
apt update
|
||||
apt install -y sharp
|
||||
apt clean
|
||||
rm -rf /var/cache/apt/ /var/lib/apt/lists/* mlnx /etc/apt/sources.list.d/sharp.list
|
||||
|
||||
apt update
|
||||
apt -y install libunwind-dev
|
||||
apt -y install libgoogle-glog-dev
|
||||
|
||||
# Install python3.8 from source
|
||||
curl -LO https://www.python.org/ftp/python/3.8.13/Python-3.8.13.tar.xz
|
||||
tar -xvf Python-3.8.13.tar.xz
|
||||
pwd
|
||||
ls /var/tmp/
|
||||
ls Python-3.8.13
|
||||
mv Python-3.8.13 /opt/
|
||||
apt install build-essential zlib1g-dev libncurses5-dev libgdbm-dev libnss3-dev libssl-dev libsqlite3-dev libreadline-dev libffi-dev curl libbz2-dev pkg-config make -y
|
||||
cd /opt/Python-3.8.13/
|
||||
./configure --enable-optimizations --enable-shared
|
||||
make
|
||||
make -j 6
|
||||
make altinstall
|
||||
ldconfig /opt/Python3.8.13
|
||||
python3.8 --version
|
||||
|
||||
# roctracer and rocprofiler needs this python3.8
|
||||
python3.8 -m pip install setuptools wheel
|
||||
python3.8 -m pip install CppHeaderParser argparse requests lxml PyYAML joblib
|
||||
|
||||
#Install older version of hwloc-devel package for rocrtst
|
||||
curl -lO https://download.open-mpi.org/release/hwloc/v1.11/hwloc-1.11.13.tar.bz2
|
||||
tar -xvf hwloc-1.11.13.tar.bz2
|
||||
cd hwloc-1.11.13
|
||||
./configure
|
||||
make
|
||||
make install
|
||||
cp /usr/local/lib/libhwloc.so.5 /usr/lib
|
||||
hwloc-info --version
|
||||
|
||||
# Install gtest
|
||||
mkdir -p /tmp/gtest
|
||||
cd /tmp/gtest
|
||||
wget https://github.com/google/googletest/archive/refs/tags/v1.14.0.zip -O googletest.zip
|
||||
unzip googletest.zip
|
||||
cd googletest-1.14.0/
|
||||
mkdir build
|
||||
cd build
|
||||
cmake ..
|
||||
make -j$(nproc)
|
||||
make install
|
||||
rm -rf /tmp/gtest
|
||||
|
||||
## Install gRPC from source
|
||||
## RDC Pre-requisites
|
||||
GRPC_ARCHIVE=grpc-1.61.0.tar.gz
|
||||
mkdir /tmp/grpc
|
||||
mkdir /usr/grpc
|
||||
cd /tmp
|
||||
git clone --recurse-submodules -b v1.61.0 https://github.com/grpc/grpc
|
||||
cd grpc
|
||||
mkdir -p build
|
||||
cd build
|
||||
cmake -DgRPC_INSTALL=ON -DBUILD_SHARED_LIBS=ON -DgRPC_BUILD_TESTS=OFF -DCMAKE_INSTALL_PREFIX=/usr/grpc -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_STANDARD=14 -DCMAKE_SHARED_LINKER_FLAGS_INIT=-Wl,--enable-new-dtags,--build-id=sha1,--rpath,'$ORIGIN' ..
|
||||
make -j $(nproc) install
|
||||
rm -rf /tmp/grpc
|
||||
|
||||
## rocBLAS Pre-requisites
|
||||
## Download prebuilt AMD multithreaded blis (2.0)
|
||||
## Reference : https://github.com/ROCmSoftwarePlatform/rocBLAS/blob/develop/install.sh#L403
|
||||
mkdir -p /tmp/blis
|
||||
cd /tmp/blis
|
||||
wget -O - https://github.com/amd/blis/releases/download/2.0/aocl-blis-mt-ubuntu-2.0.tar.gz | tar xfz -
|
||||
mv amd-blis-mt /usr/blis
|
||||
cd /
|
||||
rm -rf /tmp/blis
|
||||
|
||||
## rocBLAS Pre-requisites(SWDEV-404612)
|
||||
## Download aocl-linux-gcc-4.2.0_1_amd64.deb
|
||||
mkdir -p /tmp/aocl
|
||||
cd /tmp/aocl
|
||||
wget -nv https://download.amd.com/developer/eula/aocl/aocl-4-2/aocl-linux-gcc-4.2.0_1_amd64.deb
|
||||
apt install ./aocl-linux-gcc-4.2.0_1_amd64.deb
|
||||
rm -rf /tmp/aocl
|
||||
|
||||
## hipBLAS Pre-requisites
|
||||
## lapack(3.9.1v)
|
||||
## Reference https://github.com/ROCmSoftwarePlatform/rocSOLVER/blob/develop/install.sh#L174
|
||||
lapack_version=3.9.1
|
||||
lapack_srcdir=lapack-$lapack_version
|
||||
lapack_blddir=lapack-$lapack_version-bld
|
||||
mkdir -p /tmp/lapack
|
||||
cd /tmp/lapack
|
||||
rm -rf "$lapack_srcdir" "$lapack_blddir"
|
||||
wget -O - https://github.com/Reference-LAPACK/lapack/archive/refs/tags/v3.9.1.tar.gz | tar xzf -
|
||||
cmake -H$lapack_srcdir -B$lapack_blddir -DCMAKE_BUILD_TYPE=Release -DCMAKE_Fortran_FLAGS=-fno-optimize-sibling-calls -DBUILD_TESTING=OFF -DCBLAS=ON -DLAPACKE=OFF
|
||||
make -j$(nproc) -C "$lapack_blddir"
|
||||
make -C "$lapack_blddir" install
|
||||
cd $lapack_blddir
|
||||
cp -r ./include/* /usr/local/include/
|
||||
cp -r ./lib/* /usr/local/lib
|
||||
cd /
|
||||
rm -rf /tmp/lapack
|
||||
|
||||
## rocSOLVER Pre-requisites
|
||||
## FMT(7.1.3v)
|
||||
## Reference https://github.com/ROCmSoftwarePlatform/rocSOLVER/blob/develop/install.sh#L152
|
||||
fmt_version=7.1.3
|
||||
fmt_srcdir=fmt-$fmt_version
|
||||
fmt_blddir=fmt-$fmt_version-bld
|
||||
mkdir -p /tmp/fmt
|
||||
cd /tmp/fmt
|
||||
rm -rf "$fmt_srcdir" "$fmt_blddir"
|
||||
wget -O - https://github.com/fmtlib/fmt/archive/refs/tags/7.1.3.tar.gz | tar xzf -
|
||||
cmake -H$fmt_srcdir -B$fmt_blddir -DCMAKE_BUILD_TYPE=Release -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DCMAKE_CXX_STANDARD=17 -DCMAKE_CXX_EXTENSIONS=OFF -DCMAKE_CXX_STANDARD_REQUIRED=ON -DFMT_DOC=OFF -DFMT_TEST=OFF
|
||||
make -j$(nproc) -C "$fmt_blddir"
|
||||
make -C "$fmt_blddir" install
|
||||
|
||||
# Build and install libjpeg-turbo
|
||||
mkdir -p /tmp/libjpeg-turbo
|
||||
cd /tmp/libjpeg-turbo
|
||||
wget -nv https://github.com/rrawther/libjpeg-turbo/archive/refs/heads/2.0.6.2.zip -O libjpeg-turbo-2.0.6.2.zip
|
||||
unzip libjpeg-turbo-2.0.6.2.zip
|
||||
cd libjpeg-turbo-2.0.6.2
|
||||
mkdir build
|
||||
cd build
|
||||
cmake -DCMAKE_INSTALL_PREFIX=/usr -DCMAKE_BUILD_TYPE=RELEASE -DENABLE_STATIC=FALSE -DCMAKE_INSTALL_DEFAULT_LIBDIR=lib ..
|
||||
make -j$(nproc) install
|
||||
rm -rf /tmp/libjpeg-turbo
|
||||
|
||||
# Get released ninja from source
|
||||
mkdir -p /tmp/ninja
|
||||
cd /tmp/ninja
|
||||
wget -nv https://codeload.github.com/Kitware/ninja/zip/refs/tags/v1.11.1.g95dee.kitware.jobserver-1 -O ninja.zip
|
||||
unzip ninja.zip
|
||||
cd ninja-1.11.1.g95dee.kitware.jobserver-1
|
||||
./configure.py --bootstrap
|
||||
cp ninja /usr/local/bin/
|
||||
rm -rf /tmp/ninja
|
||||
|
||||
# Install FFmpeg and dependencies
|
||||
# Build NASM
|
||||
mkdir -p /tmp/nasm-2.15.05
|
||||
cd /tmp
|
||||
wget -qO- "https://distfiles.macports.org/nasm/nasm-2.15.05.tar.bz2" | tar -xvj
|
||||
cd nasm-2.15.05
|
||||
./autogen.sh
|
||||
./configure --prefix="/usr/local"
|
||||
make -j$(nproc) install
|
||||
rm -rf /tmp/nasm-2.15.05
|
||||
|
||||
# Build YASM
|
||||
mkdir -p /tmp/yasm-1.3.0
|
||||
cd /tmp
|
||||
wget -qO- "http://www.tortall.net/projects/yasm/releases/yasm-1.3.0.tar.gz" | tar -xvz
|
||||
cd yasm-1.3.0
|
||||
./configure --prefix="/usr/local"
|
||||
make -j$(nproc) install
|
||||
rm -rf /tmp/yasm-1.3.0
|
||||
|
||||
# Build x264
|
||||
mkdir -p /tmp/x264-snapshot-20191217-2245-stable
|
||||
cd /tmp
|
||||
wget -qO- "https://download.videolan.org/pub/videolan/x264/snapshots/x264-snapshot-20191217-2245-stable.tar.bz2" | tar -xvj
|
||||
cd /tmp/x264-snapshot-20191217-2245-stable
|
||||
PKG_CONFIG_PATH="/usr/local/lib/pkgconfig" ./configure --prefix="/usr/local" --enable-shared
|
||||
make -j$(nproc) install
|
||||
rm -rf /tmp/x264-snapshot-20191217-2245-stable
|
||||
|
||||
# Build x265
|
||||
mkdir -p /tmp/x265_2.7
|
||||
cd /tmp
|
||||
wget -qO- "https://get.videolan.org/x265/x265_2.7.tar.gz" | tar -xvz
|
||||
cd /tmp/x265_2.7/build/linux
|
||||
cmake -G "Unix Makefiles" -DCMAKE_INSTALL_PREFIX="/usr/local" -DENABLE_SHARED:bool=on ../../source
|
||||
make -j$(nproc) install
|
||||
rm -rf /tmp/x265_2.7
|
||||
|
||||
# Build fdk-aac
|
||||
mkdir -p /tmp/fdk-aac-2.0.2
|
||||
cd /tmp
|
||||
wget -qO- "https://sourceforge.net/projects/opencore-amr/files/fdk-aac/fdk-aac-2.0.2.tar.gz" | tar -xvz
|
||||
cd /tmp/fdk-aac-2.0.2
|
||||
autoreconf -fiv
|
||||
./configure --prefix="/usr/local" --enable-shared --disable-static
|
||||
make -j$(nproc) install
|
||||
rm -rf /tmp/fdk-aac-2.0.2
|
||||
|
||||
# Build FFmpeg
|
||||
cd /tmp
|
||||
git clone -b release/4.4 https://git.ffmpeg.org/ffmpeg.git ffmpeg
|
||||
cd ffmpeg
|
||||
PKG_CONFIG_PATH="/usr/local/lib/pkgconfig"
|
||||
./configure --prefix="/usr/local" --extra-cflags="-I/usr/local/include" --extra-ldflags="-L/usr/local/lib" --extra-libs=-lpthread --extra-libs=-lm --enable-shared --disable-static --enable-libx264 --enable-libx265 --enable-libfdk-aac --enable-gpl --enable-nonfree
|
||||
make -j$(nproc) install
|
||||
rm -rf /tmp/ffmpeg
|
||||
|
||||
cp /tmp/local-pin-600 /etc/apt/preferences.d
|
||||
|
||||
command -v lbzip2
|
||||
ln -sf $(command -v lbzip2) /usr/local/bin/compressor || ln -sf $(command -v bzip2) /usr/local/bin/compressor
|
||||
|
||||
# Install Google Benchmark
|
||||
mkdir -p /tmp/Gbenchmark
|
||||
cd /tmp/Gbenchmark
|
||||
wget -qO- https://github.com/google/benchmark/archive/refs/tags/v1.6.1.tar.gz | tar xz
|
||||
cmake -Sbenchmark-1.6.1 -Bbuild -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=OFF -DBENCHMARK_ENABLE_TESTING=OFF -DCMAKE_CXX_STANDARD=14
|
||||
make -j -C build
|
||||
cd /tmp/Gbenchmark/build
|
||||
make install
|
||||
|
||||
# Build boost-1.85.0 from source for RPP
|
||||
# Installing in a non-standard location since the test packages of hipFFT and rocFFT pick up the version of
|
||||
# the installed Boost library and declare a package dependency on that specific version of Boost.
|
||||
# For example, if this was installed in the standard location it would declare a dependency on libboost-dev(el)1.85.0
|
||||
# which is not available as a package in any distro.
|
||||
# Once this is fixed, we can remove the Boost package from the requirements list and install this
|
||||
# in the standard location
|
||||
mkdir -p /tmp/boost-1.85.0
|
||||
cd /tmp/boost-1.85.0
|
||||
wget -nv https://sourceforge.net/projects/boost/files/boost/1.85.0/boost_1_85_0.tar.bz2 -O ./boost_1_85_0.tar.bz2
|
||||
tar -xf boost_1_85_0.tar.bz2 --use-compress-program="/usr/local/bin/compressor"
|
||||
cd boost_1_85_0
|
||||
./bootstrap.sh --prefix=${RPP_DEPS_LOCATION} --with-python=python3
|
||||
./b2 stage -j$(nproc) threading=multi link=shared cxxflags="-std=c++11"
|
||||
./b2 install threading=multi link=shared --with-system --with-filesystem
|
||||
./b2 stage -j$(nproc) threading=multi link=static cxxflags="-std=c++11 -fpic" cflags="-fpic"
|
||||
./b2 install threading=multi link=static --with-system --with-filesystem
|
||||
rm -rf /tmp/boost-1.85.0
|
||||
@@ -7,7 +7,6 @@ bison
|
||||
bridge-utils
|
||||
build-essential
|
||||
bzip2
|
||||
ccache
|
||||
check
|
||||
chrpath
|
||||
cifs-utils
|
||||
|
||||
Reference in New Issue
Block a user