mirror of
https://github.com/ROCm/ROCm.git
synced 2026-01-10 07:08:08 -05:00
Compare commits
56 Commits
revert-533
...
docs_6.3.3
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b12eb75be3 | ||
|
|
3d59247e7a | ||
|
|
d5b1fd4389 | ||
|
|
a05d9e2fa0 | ||
|
|
7ddb10a0fc | ||
|
|
63f9bc30bd | ||
|
|
b174ab767e | ||
|
|
f75ef9e2c1 | ||
|
|
e5bf76ead1 | ||
|
|
5393e90a8e | ||
|
|
fbc2815223 | ||
|
|
2b96a37b08 | ||
|
|
1e5ad14d86 | ||
|
|
f9d6bd4db8 | ||
|
|
23e78c8d55 | ||
|
|
0edd31bde6 | ||
|
|
4af488e27d | ||
|
|
7ae7046301 | ||
|
|
358092386e | ||
|
|
e071738908 | ||
|
|
cd79403931 | ||
|
|
275ef1d511 | ||
|
|
065fe8b138 | ||
|
|
be36c1808e | ||
|
|
64c362a961 | ||
|
|
d392eca232 | ||
|
|
1b58c08394 | ||
|
|
73ab81fbaf | ||
|
|
ddfb5bda12 | ||
|
|
ae7f47a0a2 | ||
|
|
5e5f7d6bb7 | ||
|
|
da1125e228 | ||
|
|
e55b9f2a33 | ||
|
|
761a524d03 | ||
|
|
c895ee483c | ||
|
|
e049d952d4 | ||
|
|
ce41922bb5 | ||
|
|
2b53b40caa | ||
|
|
9250e1ba28 | ||
|
|
3c055ab65b | ||
|
|
44aaf1b57c | ||
|
|
822e789998 | ||
|
|
243ac78609 | ||
|
|
c2f483332f | ||
|
|
b35267b6bd | ||
|
|
deb4895b11 | ||
|
|
8c036531e8 | ||
|
|
484cbefc2e | ||
|
|
721b60d52f | ||
|
|
8ebe7be283 | ||
|
|
7e8947fdb4 | ||
|
|
66cac5301f | ||
|
|
9f3a1de117 | ||
|
|
0915fb17e8 | ||
|
|
0d3eb1d774 | ||
|
|
7a258cdba9 |
@@ -1,29 +0,0 @@
|
||||
variables:
|
||||
- group: common
|
||||
- template: /.azuredevops/variables-global.yml
|
||||
|
||||
parameters:
|
||||
- name: checkoutRef
|
||||
type: string
|
||||
default: refs/tags/$(LATEST_RELEASE_TAG)
|
||||
|
||||
resources:
|
||||
repositories:
|
||||
- repository: pipelines_repo
|
||||
type: github
|
||||
endpoint: ROCm
|
||||
name: ROCm/ROCm
|
||||
- repository: release_repo
|
||||
type: github
|
||||
endpoint: ROCm
|
||||
name: ROCm/TransferBench
|
||||
ref: ${{ parameters.checkoutRef }}
|
||||
|
||||
trigger: none
|
||||
pr: none
|
||||
|
||||
jobs:
|
||||
- template: ${{ variables.CI_COMPONENT_PATH }}/TransferBench.yml
|
||||
parameters:
|
||||
checkoutRepo: release_repo
|
||||
checkoutRef: ${{ parameters.checkoutRef }}
|
||||
1
.gitignore
vendored
1
.gitignore
vendored
@@ -11,6 +11,7 @@ _toc.yml
|
||||
docBin/
|
||||
_doxygen/
|
||||
_readthedocs/
|
||||
__pycache__/
|
||||
|
||||
# avoid duplicating contributing.md due to conf.py
|
||||
docs/CHANGELOG.md
|
||||
|
||||
@@ -117,6 +117,7 @@ FX
|
||||
Filesystem
|
||||
FindDb
|
||||
Flang
|
||||
FluxBenchmark
|
||||
Fortran
|
||||
Fuyu
|
||||
GALB
|
||||
@@ -131,6 +132,7 @@ GDS
|
||||
GEMM
|
||||
GEMMs
|
||||
GFortran
|
||||
GFXIP
|
||||
Gemma
|
||||
GiB
|
||||
GIM
|
||||
@@ -154,6 +156,7 @@ HCA
|
||||
HGX
|
||||
HIPCC
|
||||
HIPExtension
|
||||
HIPification
|
||||
HIPIFY
|
||||
HIPification
|
||||
HIPify
|
||||
@@ -316,6 +319,7 @@ PipelineParallel
|
||||
PnP
|
||||
PowerEdge
|
||||
PowerShell
|
||||
Pretraining
|
||||
Profiler's
|
||||
PyPi
|
||||
Pytest
|
||||
@@ -478,6 +482,7 @@ ZenDNN
|
||||
accuracies
|
||||
activations
|
||||
addr
|
||||
ai
|
||||
alloc
|
||||
allocatable
|
||||
allocator
|
||||
@@ -543,6 +548,7 @@ cTDP
|
||||
dataset
|
||||
datasets
|
||||
dataspace
|
||||
datatemplate
|
||||
datatype
|
||||
datatypes
|
||||
dbgapi
|
||||
@@ -571,6 +577,7 @@ el
|
||||
embeddings
|
||||
enablement
|
||||
encodings
|
||||
endfor
|
||||
endpgm
|
||||
enqueue
|
||||
env
|
||||
@@ -691,6 +698,7 @@ pageable
|
||||
pallas
|
||||
parallelization
|
||||
parallelizing
|
||||
param
|
||||
parameterization
|
||||
passthrough
|
||||
perfcounter
|
||||
@@ -715,6 +723,7 @@ preprocessing
|
||||
preprocessor
|
||||
prequantized
|
||||
prerequisites
|
||||
pretraining
|
||||
profiler
|
||||
profilers
|
||||
protobuf
|
||||
@@ -807,6 +816,7 @@ supercomputing
|
||||
symlink
|
||||
symlinks
|
||||
sys
|
||||
tabindex
|
||||
td
|
||||
tensorfloat
|
||||
th
|
||||
@@ -852,6 +862,7 @@ vectorizes
|
||||
virtualize
|
||||
virtualized
|
||||
vjxb
|
||||
vllm
|
||||
voxel
|
||||
walkthrough
|
||||
walkthroughs
|
||||
|
||||
@@ -4,6 +4,8 @@
|
||||
:description: JAX compatibility
|
||||
:keywords: GPU, JAX compatibility
|
||||
|
||||
.. version-set:: rocm_version latest
|
||||
|
||||
*******************************************************************************
|
||||
JAX compatibility
|
||||
*******************************************************************************
|
||||
@@ -119,7 +121,8 @@ Critical ROCm libraries for JAX
|
||||
|
||||
The functionality of JAX with ROCm is determined by its underlying library
|
||||
dependencies. These critical ROCm components affect the capabilities,
|
||||
performance, and feature set available to developers.
|
||||
performance, and feature set available to developers. The versions described
|
||||
are available in ROCm :version:`rocm_version`.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
@@ -129,7 +132,7 @@ performance, and feature set available to developers.
|
||||
- Purpose
|
||||
- Used in
|
||||
* - `hipBLAS <https://github.com/ROCm/hipBLAS>`_
|
||||
- 2.3.0
|
||||
- :version-ref:`hipBLAS rocm_version`
|
||||
- Provides GPU-accelerated Basic Linear Algebra Subprograms (BLAS) for
|
||||
matrix and vector operations.
|
||||
- Matrix multiplication in ``jax.numpy.matmul``, ``jax.lax.dot`` and
|
||||
@@ -138,7 +141,7 @@ performance, and feature set available to developers.
|
||||
``jax.numpy.einsum`` with matrix-multiplication patterns algebra
|
||||
operations.
|
||||
* - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`_
|
||||
- 0.10.0
|
||||
- :version-ref:`hipBLASLt rocm_version`
|
||||
- hipBLASLt is an extension of hipBLAS, providing additional
|
||||
features like epilogues fused into the matrix multiplication kernel or
|
||||
use of integer tensor cores.
|
||||
@@ -147,7 +150,7 @@ performance, and feature set available to developers.
|
||||
operations, mixed-precision support, and hardware-specific
|
||||
optimizations.
|
||||
* - `hipCUB <https://github.com/ROCm/hipCUB>`_
|
||||
- 3.3.0
|
||||
- :version-ref:`hipCUB rocm_version`
|
||||
- Provides a C++ template library for parallel algorithms for reduction,
|
||||
scan, sort and select.
|
||||
- Reduction functions (``jax.numpy.sum``, ``jax.numpy.mean``,
|
||||
@@ -155,23 +158,23 @@ performance, and feature set available to developers.
|
||||
(``jax.numpy.cumsum``, ``jax.numpy.cumprod``) and sorting
|
||||
(``jax.numpy.sort``, ``jax.numpy.argsort``).
|
||||
* - `hipFFT <https://github.com/ROCm/hipFFT>`_
|
||||
- 1.0.17
|
||||
- :version-ref:`hipFFT rocm_version`
|
||||
- Provides GPU-accelerated Fast Fourier Transform (FFT) operations.
|
||||
- Used in functions like ``jax.numpy.fft``.
|
||||
* - `hipRAND <https://github.com/ROCm/hipRAND>`_
|
||||
- 2.11.0
|
||||
- :version-ref:`hipRAND rocm_version`
|
||||
- Provides fast random number generation for GPUs.
|
||||
- The ``jax.random.uniform``, ``jax.random.normal``,
|
||||
``jax.random.randint`` and ``jax.random.split``.
|
||||
* - `hipSOLVER <https://github.com/ROCm/hipSOLVER>`_
|
||||
- 2.3.0
|
||||
- :version-ref:`hipSOLVER rocm_version`
|
||||
- Provides GPU-accelerated solvers for linear systems, eigenvalues, and
|
||||
singular value decompositions (SVD).
|
||||
- Solving linear systems (``jax.numpy.linalg.solve``), matrix
|
||||
factorizations, SVD (``jax.numpy.linalg.svd``) and eigenvalue problems
|
||||
(``jax.numpy.linalg.eig``).
|
||||
* - `hipSPARSE <https://github.com/ROCm/hipSPARSE>`_
|
||||
- 3.1.2
|
||||
- :version-ref:`hipSPARSE rocm_version`
|
||||
- Accelerates operations on sparse matrices, such as sparse matrix-vector
|
||||
or matrix-matrix products.
|
||||
- Sparse matrix multiplication (``jax.numpy.matmul``), sparse
|
||||
@@ -179,28 +182,28 @@ performance, and feature set available to developers.
|
||||
(``jax.experimental.sparse.dot``), sparse linear system solvers and
|
||||
sparse data handling.
|
||||
* - `hipSPARSELt <https://github.com/ROCm/hipSPARSELt>`_
|
||||
- 0.2.2
|
||||
- :version-ref:`hipSPARSELt rocm_version`
|
||||
- Accelerates operations on sparse matrices, such as sparse matrix-vector
|
||||
or matrix-matrix products.
|
||||
- Sparse matrix multiplication (``jax.numpy.matmul``), sparse
|
||||
matrix-vector and matrix-matrix products
|
||||
(``jax.experimental.sparse.dot``) and sparse linear system solvers.
|
||||
* - `MIOpen <https://github.com/ROCm/MIOpen>`_
|
||||
- 3.3.0
|
||||
- :version-ref:`MIOpen rocm_version`
|
||||
- Optimized for deep learning primitives such as convolutions, pooling,
|
||||
normalization, and activation functions.
|
||||
- Speeds up convolutional neural networks (CNNs), recurrent neural
|
||||
networks (RNNs), and other layers. Used in operations like
|
||||
``jax.nn.conv``, ``jax.nn.relu``, and ``jax.nn.batch_norm``.
|
||||
* - `RCCL <https://github.com/ROCm/rccl>`_
|
||||
- 2.21.5
|
||||
- :version-ref:`RCCL rocm_version`
|
||||
- Optimized for multi-GPU communication for operations like all-reduce,
|
||||
broadcast, and scatter.
|
||||
- Distribute computations across multiple GPU with ``pmap`` and
|
||||
``jax.distributed``. XLA automatically uses rccl when executing
|
||||
operations across multiple GPUs on AMD hardware.
|
||||
* - `rocThrust <https://github.com/ROCm/rocThrust>`_
|
||||
- 3.3.0
|
||||
- :version-ref:`rocThrust rocm_version`
|
||||
- Provides a C++ template library for parallel algorithms like sorting,
|
||||
reduction, and scanning.
|
||||
- Reduction operations like ``jax.numpy.sum``, ``jax.pmap`` for
|
||||
|
||||
@@ -4,6 +4,8 @@
|
||||
:description: PyTorch compatibility
|
||||
:keywords: GPU, PyTorch compatibility
|
||||
|
||||
.. version-set:: rocm_version latest
|
||||
|
||||
********************************************************************************
|
||||
PyTorch compatibility
|
||||
********************************************************************************
|
||||
@@ -56,7 +58,7 @@ Docker image compatibility
|
||||
|
||||
AMD validates and publishes ready-made `PyTorch images <https://hub.docker.com/r/rocm/pytorch>`_
|
||||
with ROCm backends on Docker Hub. The following Docker image tags and
|
||||
associated inventories are validated for `ROCm 6.3.0 <https://repo.radeon.com/rocm/apt/6.3/>`_.
|
||||
associated inventories are validated for `ROCm 6.3.3 <https://repo.radeon.com/rocm/apt/6.3.3/>`_.
|
||||
Click the |docker-icon| icon to view the image on Docker Hub.
|
||||
|
||||
.. list-table:: PyTorch Docker image components
|
||||
@@ -77,26 +79,26 @@ Click the |docker-icon| icon to view the image on Docker Hub.
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3_ubuntu24.04_py3.12_pytorch_release_2.4.0/images/sha256-98ddf20333bd01ff749b8092b1190ee369a75d3b8c71c2fac80ffdcb1a98d529?context=explore"><i class="fab fa-docker fa-lg"></i></a>
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3.3_ubuntu24.04_py3.12_pytorch_release_2.4.0/images/sha256-6c798857b2c9526b44ba535710b93a1737546acea79b53a93c646195c272f1d5"><i class="fab fa-docker fa-lg"></i></a>
|
||||
|
||||
- `2.4.0 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
|
||||
- 24.04
|
||||
- `3.12 <https://www.python.org/downloads/release/python-3128/>`_
|
||||
- `3.12.9 <https://www.python.org/downloads/release/python-3129/>`_
|
||||
- `1.4.0 <https://github.com/ROCm/apex/tree/release/1.4.0>`_
|
||||
- `0.19.0 <https://github.com/pytorch/vision/tree/v0.19.0>`_
|
||||
- `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13.0>`_
|
||||
- `master <https://bitbucket.org/icl/magma/src/master/>`_
|
||||
- `1.10.0 <https://github.com/openucx/ucx/tree/v1.10.0>`_
|
||||
- `4.0.7 <https://github.com/open-mpi/ompi/tree/v4.0.7>`_
|
||||
- `4.0.3 <https://github.com/open-mpi/ompi/tree/v4.0.3>`_
|
||||
- `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3_ubuntu22.04_py3.10_pytorch_release_2.4.0/images/sha256-402c9b4f1a6b5a81c634a1932b56cbe01abb699cfcc7463d226276997c6cf8ea?context=explore"><i class="fab fa-docker fa-lg"></i></a>
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3.3_ubuntu22.04_py3.10_pytorch_release_2.4.0/images/sha256-a09b21248133876fc8912a5ff4e6ee2c8d62b14120313e426b3dadda5702713d"><i class="fab fa-docker fa-lg"></i></a>
|
||||
|
||||
- `2.4.0 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
|
||||
- 22.04
|
||||
- `3.10 <https://www.python.org/downloads/release/python-31016/>`_
|
||||
- `3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
|
||||
- `1.4.0 <https://github.com/ROCm/apex/tree/release/1.4.0>`_
|
||||
- `0.19.0 <https://github.com/pytorch/vision/tree/v0.19.0>`_
|
||||
- `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13.0>`_
|
||||
@@ -107,11 +109,11 @@ Click the |docker-icon| icon to view the image on Docker Hub.
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3_ubuntu22.04_py3.9_pytorch_release_2.4.0/images/sha256-e0608b55d408c3bfe5c19fdd57a4ced3e0eb3a495b74c309980b60b156c526dd?context=explore"><i class="fab fa-docker fa-lg"></i></a>
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3.3_ubuntu22.04_py3.9_pytorch_release_2.4.0/images/sha256-963187534467f0f9da77996762fc1d112a6faa5372277c348a505533e7876ec8"><i class="fab fa-docker fa-lg"></i></a>
|
||||
|
||||
- `2.4.0 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
|
||||
- 22.04
|
||||
- `3.9.18 <https://www.python.org/downloads/release/python-3918/>`_
|
||||
- `3.9.21 <https://www.python.org/downloads/release/python-3921/>`_
|
||||
- `1.4.0 <https://github.com/ROCm/apex/tree/release/1.4.0>`_
|
||||
- `0.19.0 <https://github.com/pytorch/vision/tree/v0.19.0>`_
|
||||
- `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13.0>`_
|
||||
@@ -122,11 +124,11 @@ Click the |docker-icon| icon to view the image on Docker Hub.
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3_ubuntu22.04_py3.10_pytorch_release_2.3.0/images/sha256-652cf25263d05b1de548222970aeb76e60b12de101de66751264709c0d0ff9d8?context=explore"><i class="fab fa-docker fa-lg"></i></a>
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3.3_ubuntu22.04_py3.10_pytorch_release_2.3.0/images/sha256-952f2621bd2bf3078bef19061e05b209105a82a7908e7e6cdf85014938a4d93a"><i class="fab fa-docker fa-lg"></i></a>
|
||||
|
||||
- `2.3.0 <https://github.com/ROCm/pytorch/tree/release/2.3>`_
|
||||
- 22.04
|
||||
- `3.10 <https://www.python.org/downloads/release/python-31016/>`_
|
||||
- `3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
|
||||
- `1.3.0 <https://github.com/ROCm/apex/tree/release/1.3.0>`_
|
||||
- `0.18.0 <https://github.com/pytorch/vision/tree/v0.18.0>`_
|
||||
- `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13.0>`_
|
||||
@@ -137,7 +139,7 @@ Click the |docker-icon| icon to view the image on Docker Hub.
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3_ubuntu22.04_py3.10_pytorch_release_2.2.1/images/sha256-051976f26beab8f9aa65d999e3ad546c027b39240a0cc3ee81b114a9024f2912?context=explore"><i class="fab fa-docker fa-lg"></i></a>
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3.3_ubuntu22.04_py3.10_pytorch_release_2.2.1/images/sha256-a2fe20e170feb9e05da3e5728bb98e40d08567e137be8e6ba797962ed2852608"><i class="fab fa-docker fa-lg"></i></a>
|
||||
|
||||
- `2.2.1 <https://github.com/ROCm/pytorch/tree/release/2.2>`_
|
||||
- 22.04
|
||||
@@ -152,7 +154,7 @@ Click the |docker-icon| icon to view the image on Docker Hub.
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3_ubuntu20.04_py3.9_pytorch_release_2.2.1/images/sha256-88c839a364d109d3748c100385bfa100d28090d25118cc723fd0406390ab2f7e?context=explore"><i class="fab fa-docker fa-lg"></i></a>
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3.3_ubuntu20.04_py3.9_pytorch_release_2.2.1/images/sha256-7f231937c897cca5f89e360be33c70a2017d60f62d1fbe81292be48c15fe345b"><i class="fab fa-docker fa-lg"></i></a>
|
||||
|
||||
- `2.2.1 <https://github.com/ROCm/pytorch/tree/release/2.2>`_
|
||||
- 20.04
|
||||
@@ -167,14 +169,14 @@ Click the |docker-icon| icon to view the image on Docker Hub.
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3_ubuntu22.04_py3.9_pytorch_release_1.13.1/images/sha256-994424ed07a63113f79dd9aa72159124c00f5fbfe18127151e6658f7d0b6f821?context=explore"><i class="fab fa-docker fa-lg"></i></a>
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3.3_ubuntu22.04_py3.9_pytorch_release_1.13.1/images/sha256-616a47758004f91951e2da6c1fe291f903de65a7b2318d4b18359b48fe3032f4"><i class="fab fa-docker fa-lg"></i></a>
|
||||
|
||||
- `1.13.1 <https://github.com/ROCm/pytorch/tree/release/1.13>`_
|
||||
- 22.04
|
||||
- `3.9.21 <https://www.python.org/downloads/release/python-3921/>`_
|
||||
- `1.0.0 <https://github.com/ROCm/apex/tree/release/1.0.0>`_
|
||||
- `0.14.0 <https://github.com/pytorch/vision/tree/v0.14.0>`_
|
||||
- `2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18>`_
|
||||
- `2.19.0 <https://github.com/tensorflow/tensorboard/tree/2.19>`_
|
||||
- `master <https://bitbucket.org/icl/magma/src/master/>`_
|
||||
- `1.14.1 <https://github.com/openucx/ucx/tree/v1.14.1>`_
|
||||
- `4.1.5 <https://github.com/open-mpi/ompi/tree/v4.1.5>`_
|
||||
@@ -182,7 +184,7 @@ Click the |docker-icon| icon to view the image on Docker Hub.
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3_ubuntu20.04_py3.9_pytorch_release_1.13.1/images/sha256-7b8139fe40a9aeb4bca3aecd15c22c1fa96e867d93479fa3a24fdeeeeafa1219?context=explore"><i class="fab fa-docker fa-lg"></i></a>
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3.3_ubuntu20.04_py3.9_pytorch_release_1.13.1/images/sha256-a2cfb365aea58b84595e241ffdb0d5ef3e6566e98c10b5499f4aa29983a74ea2"><i class="fab fa-docker fa-lg"></i></a>
|
||||
|
||||
- `1.13.1 <https://github.com/ROCm/pytorch/tree/release/1.13>`_
|
||||
- 20.04
|
||||
@@ -200,7 +202,8 @@ Critical ROCm libraries for PyTorch
|
||||
|
||||
The functionality of PyTorch with ROCm is determined by its underlying library
|
||||
dependencies. These critical ROCm components affect the capabilities,
|
||||
performance, and feature set available to developers.
|
||||
performance, and feature set available to developers. The versions described
|
||||
are available in ROCm :version:`rocm_version`.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
@@ -210,28 +213,28 @@ performance, and feature set available to developers.
|
||||
- Purpose
|
||||
- Used in
|
||||
* - `Composable Kernel <https://github.com/ROCm/composable_kernel>`_
|
||||
- 1.1.0
|
||||
- :version-ref:`"Composable Kernel" rocm_version`
|
||||
- Enables faster execution of core operations like matrix multiplication
|
||||
(GEMM), convolutions and transformations.
|
||||
- Speeds up ``torch.permute``, ``torch.view``, ``torch.matmul``,
|
||||
``torch.mm``, ``torch.bmm``, ``torch.nn.Conv2d``, ``torch.nn.Conv3d``
|
||||
and ``torch.nn.MultiheadAttention``.
|
||||
* - `hipBLAS <https://github.com/ROCm/hipBLAS>`_
|
||||
- 2.3.0
|
||||
- :version-ref:`hipBLAS rocm_version`
|
||||
- Provides GPU-accelerated Basic Linear Algebra Subprograms (BLAS) for
|
||||
matrix and vector operations.
|
||||
- Supports operations like matrix multiplication, matrix-vector products,
|
||||
and tensor contractions. Utilized in both dense and batched linear
|
||||
algebra operations.
|
||||
* - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`_
|
||||
- 0.10.0
|
||||
- :version-ref:`hipBLASLt rocm_version`
|
||||
- hipBLASLt is an extension of the hipBLAS library, providing additional
|
||||
features like epilogues fused into the matrix multiplication kernel or
|
||||
use of integer tensor cores.
|
||||
- It accelerates operations like ``torch.matmul``, ``torch.mm``, and the
|
||||
matrix multiplications used in convolutional and linear layers.
|
||||
* - `hipCUB <https://github.com/ROCm/hipCUB>`_
|
||||
- 3.3.0
|
||||
- :version-ref:`hipCUB rocm_version`
|
||||
- Provides a C++ template library for parallel algorithms for reduction,
|
||||
scan, sort and select.
|
||||
- Supports operations like ``torch.sum``, ``torch.cumsum``, ``torch.sort``
|
||||
@@ -239,93 +242,93 @@ performance, and feature set available to developers.
|
||||
irregular shapes often involve scanning, sorting, and filtering, which
|
||||
hipCUB handles efficiently.
|
||||
* - `hipFFT <https://github.com/ROCm/hipFFT>`_
|
||||
- 1.0.17
|
||||
- :version-ref:`hipFFT rocm_version`
|
||||
- Provides GPU-accelerated Fast Fourier Transform (FFT) operations.
|
||||
- Used in functions like the ``torch.fft`` module.
|
||||
* - `hipRAND <https://github.com/ROCm/hipRAND>`_
|
||||
- 2.11.0
|
||||
- :version-ref:`hipRAND rocm_version`
|
||||
- Provides fast random number generation for GPUs.
|
||||
- The ``torch.rand``, ``torch.randn`` and stochastic layers like
|
||||
``torch.nn.Dropout``.
|
||||
* - `hipSOLVER <https://github.com/ROCm/hipSOLVER>`_
|
||||
- 2.3.0
|
||||
- :version-ref:`hipSOLVER rocm_version`
|
||||
- Provides GPU-accelerated solvers for linear systems, eigenvalues, and
|
||||
singular value decompositions (SVD).
|
||||
- Supports functions like ``torch.linalg.solve``,
|
||||
``torch.linalg.eig``, and ``torch.linalg.svd``.
|
||||
* - `hipSPARSE <https://github.com/ROCm/hipSPARSE>`_
|
||||
- 3.1.2
|
||||
- :version-ref:`hipSPARSE rocm_version`
|
||||
- Accelerates operations on sparse matrices, such as sparse matrix-vector
|
||||
or matrix-matrix products.
|
||||
- Sparse tensor operations ``torch.sparse``.
|
||||
* - `hipSPARSELt <https://github.com/ROCm/hipSPARSELt>`_
|
||||
- 0.2.2
|
||||
- :version-ref:`hipSPARSELt rocm_version`
|
||||
- Accelerates operations on sparse matrices, such as sparse matrix-vector
|
||||
or matrix-matrix products.
|
||||
- Sparse tensor operations ``torch.sparse``.
|
||||
* - `hipTensor <https://github.com/ROCm/hipTensor>`_
|
||||
- 1.4.0
|
||||
- :version-ref:`hipTensor rocm_version`
|
||||
- Optimizes for high-performance tensor operations, such as contractions.
|
||||
- Accelerates tensor algebra, especially in deep learning and scientific
|
||||
computing.
|
||||
* - `MIOpen <https://github.com/ROCm/MIOpen>`_
|
||||
- 3.3.0
|
||||
- :version-ref:`MIOpen rocm_version`
|
||||
- Optimizes deep learning primitives such as convolutions, pooling,
|
||||
normalization, and activation functions.
|
||||
- Speeds up convolutional neural networks (CNNs), recurrent neural
|
||||
networks (RNNs), and other layers. Used in operations like
|
||||
``torch.nn.Conv2d``, ``torch.nn.ReLU``, and ``torch.nn.LSTM``.
|
||||
* - `MIGraphX <https://github.com/ROCm/AMDMIGraphX>`_
|
||||
- 2.11.0
|
||||
- :version-ref:`MIGraphX rocm_version`
|
||||
- Adds graph-level optimizations, ONNX models and mixed precision support
|
||||
and enable Ahead-of-Time (AOT) Compilation.
|
||||
- Speeds up inference models and executes ONNX models for
|
||||
compatibility with other frameworks.
|
||||
``torch.nn.Conv2d``, ``torch.nn.ReLU``, and ``torch.nn.LSTM``.
|
||||
* - `MIVisionX <https://github.com/ROCm/MIVisionX>`_
|
||||
- 3.1.0
|
||||
- :version-ref:`MIVisionX rocm_version`
|
||||
- Optimizes acceleration for computer vision and AI workloads like
|
||||
preprocessing, augmentation, and inferencing.
|
||||
- Faster data preprocessing and augmentation pipelines for datasets like
|
||||
ImageNet or COCO and easy to integrate into PyTorch's ``torch.utils.data``
|
||||
and ``torchvision`` workflows.
|
||||
* - `rocAL <https://github.com/ROCm/rocAL>`_
|
||||
- 2.1.0
|
||||
- :version-ref:`rocAL rocm_version`
|
||||
- Accelerates the data pipeline by offloading intensive preprocessing and
|
||||
augmentation tasks. rocAL is part of MIVisionX.
|
||||
- Easy to integrate into PyTorch's ``torch.utils.data`` and
|
||||
``torchvision`` data load workloads.
|
||||
* - `RCCL <https://github.com/ROCm/rccl>`_
|
||||
- 2.21.5
|
||||
- :version-ref:`RCCL rocm_version`
|
||||
- Optimizes for multi-GPU communication for operations like AllReduce and
|
||||
Broadcast.
|
||||
- Distributed data parallel training (``torch.nn.parallel.DistributedDataParallel``).
|
||||
Handles communication in multi-GPU setups.
|
||||
* - `rocDecode <https://github.com/ROCm/rocDecode>`_
|
||||
- 0.8.0
|
||||
- :version-ref:`rocDecode rocm_version`
|
||||
- Provides hardware-accelerated data decoding capabilities, particularly
|
||||
for image, video, and other dataset formats.
|
||||
- Can be integrated in ``torch.utils.data``, ``torchvision.transforms``
|
||||
and ``torch.distributed``.
|
||||
* - `rocJPEG <https://github.com/ROCm/rocJPEG>`_
|
||||
- 0.6.0
|
||||
- :version-ref:`rocJPEG rocm_version`
|
||||
- Provides hardware-accelerated JPEG image decoding and encoding.
|
||||
- GPU accelerated ``torchvision.io.decode_jpeg`` and
|
||||
``torchvision.io.encode_jpeg`` and can be integrated in
|
||||
``torch.utils.data`` and ``torchvision``.
|
||||
* - `RPP <https://github.com/ROCm/RPP>`_
|
||||
- 1.9.1
|
||||
- :version-ref:`RPP rocm_version`
|
||||
- Speeds up data augmentation, transformation, and other preprocessing steps.
|
||||
- Easy to integrate into PyTorch's ``torch.utils.data`` and
|
||||
``torchvision`` data load workloads.
|
||||
* - `rocThrust <https://github.com/ROCm/rocThrust>`_
|
||||
- 3.3.0
|
||||
- :version-ref:`rocThrust rocm_version`
|
||||
- Provides a C++ template library for parallel algorithms like sorting,
|
||||
reduction, and scanning.
|
||||
- Utilized in backend operations for tensor computations requiring
|
||||
parallel processing.
|
||||
* - `rocWMMA <https://github.com/ROCm/rocWMMA>`_
|
||||
- 1.6.0
|
||||
- :version-ref:`rocWMMA rocm_version`
|
||||
- Accelerates warp-level matrix-multiply and matrix-accumulate to speed up matrix
|
||||
multiplication (GEMM) and accumulation operations with mixed precision
|
||||
support.
|
||||
|
||||
@@ -4,6 +4,8 @@
|
||||
:description: TensorFlow compatibility
|
||||
:keywords: GPU, TensorFlow compatibility
|
||||
|
||||
.. version-set:: rocm_version latest
|
||||
|
||||
*******************************************************************************
|
||||
TensorFlow compatibility
|
||||
*******************************************************************************
|
||||
@@ -54,7 +56,7 @@ Docker image compatibility
|
||||
AMD validates and publishes ready-made `TensorFlow images
|
||||
<https://hub.docker.com/r/rocm/tensorflow>`_ with ROCm backends on
|
||||
Docker Hub. The following Docker image tags and associated inventories are
|
||||
validated for `ROCm 6.3.1 <https://repo.radeon.com/rocm/apt/6.3.1/>`_. Click
|
||||
validated for `ROCm 6.3.3 <https://repo.radeon.com/rocm/apt/6.3.3/>`_. Click
|
||||
the |docker-icon| icon to view the image on Docker Hub.
|
||||
|
||||
.. list-table:: TensorFlow Docker image components
|
||||
@@ -68,47 +70,47 @@ the |docker-icon| icon to view the image on Docker Hub.
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.3.1-py3.12-tf2.17.0-dev/images/sha256-804121ee4985718277ba7dcec53c57bdade130a1ef42f544b6c48090ad379c17"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
|
||||
<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.3.3-py3.12-tf2.17-dev/images/sha256-fd2653f436880366cc874aa24264ca9dabd892d76ccb63fb807debba459bcaaf"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
|
||||
|
||||
- `tensorflow-rocm 2.17.0 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.3/tensorflow_rocm-2.17.0-cp312-cp312-manylinux_2_28_x86_64.whl>`__
|
||||
- `tensorflow-rocm 2.17.0 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.3.3/tensorflow_rocm-2.17.0-cp312-cp312-manylinux_2_28_x86_64.whl>`__
|
||||
- dev
|
||||
- `Python 3.12 <https://www.python.org/downloads/release/python-3124/>`_
|
||||
- `Python 3.12.4 <https://www.python.org/downloads/release/python-3124/>`_
|
||||
- `TensorBoard 2.17.1 <https://github.com/tensorflow/tensorboard/tree/2.17.1>`_
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.3.1-py3.10-tf2.17.0-dev/images/sha256-776837ffa945913f6c466bfe477810a11453d21d5b6afb200be1c36e48fbc08e"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
|
||||
<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.3.3-py3.10-tf2.17-dev/images/sha256-8a5eb7443798935dd269575e2abae847b702e1dfb06766ab84f081a6314d8b95"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
|
||||
|
||||
- `tensorflow-rocm 2.17.0 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.3/tensorflow_rocm-2.17.0-cp310-cp310-manylinux_2_28_x86_64.whl>`__
|
||||
- `tensorflow-rocm 2.17.0 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.3.3/tensorflow_rocm-2.17.0-cp310-cp310-manylinux_2_28_x86_64.whl>`__
|
||||
- dev
|
||||
- `Python 3.10 <https://www.python.org/downloads/release/python-31012/>`_
|
||||
- `TensorBoard 2.17.0 <https://github.com/tensorflow/tensorboard/tree/2.17.0>`_
|
||||
- `Python 3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
|
||||
- `TensorBoard 2.17.1 <https://github.com/tensorflow/tensorboard/tree/2.17.1>`_
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.3.1-py3.12-tf2.16.2-dev/images/sha256-c793e1483e30809c3c28fc5d7805bedc033c73da224f839fff370717cb100944"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
|
||||
<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.3.3-py3.12-tf2.16-dev/images/sha256-8fc939b10cdd6d2b11407474880d4c8ab2b52ab6e2d1743c921fc2adbfd0422f"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
|
||||
|
||||
- `tensorflow-rocm 2.16.2 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.3/tensorflow_rocm-2.16.2-cp312-cp312-manylinux_2_28_x86_64.whl>`__
|
||||
- `tensorflow-rocm 2.16.2 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.3.3/tensorflow_rocm-2.16.2-cp312-cp312-manylinux_2_28_x86_64.whl>`__
|
||||
- dev
|
||||
- `Python 3.12 <https://www.python.org/downloads/release/python-3124/>`_
|
||||
- `Python 3.12.4 <https://www.python.org/downloads/release/python-3124/>`_
|
||||
- `TensorBoard 2.16.2 <https://github.com/tensorflow/tensorboard/tree/2.16.2>`_
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.3.1-py3.10-tf2.16.0-dev/images/sha256-263e78414ae85d7bcd52a025a94131d0a279872a45ed632b9165336dfdcd4443"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
|
||||
<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.3.3-py3.10-tf2.16-dev/images/sha256-a4cc6ab23d59fdf5459ceac1f0a603e6c16ae7f885d30e42c0c2b3ac60c2ad10"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
|
||||
|
||||
- `tensorflow-rocm 2.16.2 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.3/tensorflow_rocm-2.16.2-cp310-cp310-manylinux_2_28_x86_64.whl>`__
|
||||
- `tensorflow-rocm 2.16.2 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.3.3/tensorflow_rocm-2.16.2-cp310-cp310-manylinux_2_28_x86_64.whl>`__
|
||||
- dev
|
||||
- `Python 3.10 <https://www.python.org/downloads/release/python-31012/>`_
|
||||
- `Python 3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
|
||||
- `TensorBoard 2.16.2 <https://github.com/tensorflow/tensorboard/tree/2.16.2>`_
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.3.1-py3.10-tf2.15.0-dev/images/sha256-479046a8477ca701a9494a813ab17e8ab4f6baa54641e65dc8d07629f1e6a880"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
|
||||
<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.3.3-py3.10-tf2.15-dev/images/sha256-60887c488421184adcb60b9ed4f72a8bd7bdb64d238e50943ca7cbde38e4aa48"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
|
||||
|
||||
- `tensorflow-rocm 2.15.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.3/tensorflow_rocm-2.15.1-cp310-cp310-manylinux_2_28_x86_64.whl>`_
|
||||
- `tensorflow-rocm 2.15.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.3.3/tensorflow_rocm-2.15.1-cp310-cp310-manylinux_2_28_x86_64.whl>`_
|
||||
- dev
|
||||
- `Python 3.10 <https://www.python.org/downloads/release/python-31012/>`_
|
||||
- `Python 3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
|
||||
- `TensorBoard 2.15.2 <https://github.com/tensorflow/tensorboard/tree/2.15.2>`_
|
||||
|
||||
Critical ROCm libraries for TensorFlow
|
||||
@@ -117,7 +119,8 @@ Critical ROCm libraries for TensorFlow
|
||||
TensorFlow depends on multiple components and the supported features of those
|
||||
components can affect the TensorFlow ROCm supported feature set. The versions
|
||||
in the following table refer to the first TensorFlow version where the ROCm
|
||||
library was introduced as a dependency.
|
||||
library was introduced as a dependency. The versions described
|
||||
are available in ROCm :version:`rocm_version`.
|
||||
|
||||
.. list-table::
|
||||
:widths: 25, 10, 35, 30
|
||||
@@ -128,43 +131,43 @@ library was introduced as a dependency.
|
||||
- Purpose
|
||||
- Used in
|
||||
* - `hipBLAS <https://github.com/ROCm/hipBLAS>`_
|
||||
- 2.3.0
|
||||
- :version-ref:`hipBLAS rocm_version`
|
||||
- Provides GPU-accelerated Basic Linear Algebra Subprograms (BLAS) for
|
||||
matrix and vector operations.
|
||||
- Accelerates operations like ``tf.matmul``, ``tf.linalg.matmul``, and
|
||||
other matrix multiplications commonly used in neural network layers.
|
||||
* - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`_
|
||||
- 0.10.0
|
||||
- :version-ref:`hipBLASLt rocm_version`
|
||||
- Extends hipBLAS with additional optimizations like fused kernels and
|
||||
integer tensor cores.
|
||||
- Optimizes matrix multiplications and linear algebra operations used in
|
||||
layers like dense, convolutional, and RNNs in TensorFlow.
|
||||
* - `hipCUB <https://github.com/ROCm/hipCUB>`_
|
||||
- 3.3.0
|
||||
- :version-ref:`hipCUB rocm_version`
|
||||
- Provides a C++ template library for parallel algorithms for reduction,
|
||||
scan, sort and select.
|
||||
- Supports operations like ``tf.reduce_sum``, ``tf.cumsum``, ``tf.sort``
|
||||
and other tensor operations in TensorFlow, especially those involving
|
||||
scanning, sorting, and filtering.
|
||||
* - `hipFFT <https://github.com/ROCm/hipFFT>`_
|
||||
- 1.0.17
|
||||
- :version-ref:`hipFFT rocm_version`
|
||||
- Accelerates Fast Fourier Transforms (FFT) for signal processing tasks.
|
||||
- Used for operations like signal processing, image filtering, and
|
||||
certain types of neural networks requiring FFT-based transformations.
|
||||
* - `hipSOLVER <https://github.com/ROCm/hipSOLVER>`_
|
||||
- 2.3.0
|
||||
- :version-ref:`hipSOLVER rocm_version`
|
||||
- Provides GPU-accelerated direct linear solvers for dense and sparse
|
||||
systems.
|
||||
- Optimizes linear algebra functions such as solving systems of linear
|
||||
equations, often used in optimization and training tasks.
|
||||
* - `hipSPARSE <https://github.com/ROCm/hipSPARSE>`_
|
||||
- 3.1.2
|
||||
- :version-ref:`hipSPARSE rocm_version`
|
||||
- Optimizes sparse matrix operations for efficient computations on sparse
|
||||
data.
|
||||
- Accelerates sparse matrix operations in models with sparse weight
|
||||
matrices or activations, commonly used in neural networks.
|
||||
* - `MIOpen <https://github.com/ROCm/MIOpen>`_
|
||||
- 3.3.0
|
||||
- :version-ref:`MIOpen rocm_version`
|
||||
- Provides optimized deep learning primitives such as convolutions,
|
||||
pooling,
|
||||
normalization, and activation functions.
|
||||
@@ -172,13 +175,13 @@ library was introduced as a dependency.
|
||||
in TensorFlow for layers like ``tf.nn.conv2d``, ``tf.nn.relu``, and
|
||||
``tf.nn.lstm_cell``.
|
||||
* - `RCCL <https://github.com/ROCm/rccl>`_
|
||||
- 2.21.5
|
||||
- :version-ref:`RCCL rocm_version`
|
||||
- Optimizes for multi-GPU communication for operations like AllReduce and
|
||||
Broadcast.
|
||||
- Distributed data parallel training (``tf.distribute.MirroredStrategy``).
|
||||
Handles communication in multi-GPU setups.
|
||||
* - `rocThrust <https://github.com/ROCm/rocThrust>`_
|
||||
- 3.3.0
|
||||
- :version-ref:`rocThrust rocm_version`
|
||||
- Provides a C++ template library for parallel algorithms like sorting,
|
||||
reduction, and scanning.
|
||||
- Reduction operations like ``tf.reduce_sum``, ``tf.cumsum`` for computing
|
||||
|
||||
916
docs/compatibility/pytorch-compatibility.rst
Normal file
916
docs/compatibility/pytorch-compatibility.rst
Normal file
@@ -0,0 +1,916 @@
|
||||
.. meta::
|
||||
:description: PyTorch compatibility
|
||||
:keywords: GPU, PyTorch compatibility
|
||||
|
||||
********************************************************************************
|
||||
PyTorch compatibility
|
||||
********************************************************************************
|
||||
|
||||
`PyTorch <https://pytorch.org/>`_ is an open-source tensor library designed for
|
||||
deep learning. PyTorch on ROCm provides mixed-precision and large-scale training
|
||||
using `MIOpen <https://github.com/ROCm/MIOpen>`_ and
|
||||
`RCCL <https://github.com/ROCm/rccl>`_ libraries.
|
||||
|
||||
ROCm support for PyTorch is upstreamed into the official PyTorch repository. Due to independent
|
||||
compatibility considerations, this results in two distinct release cycles for PyTorch on ROCm:
|
||||
|
||||
- ROCm PyTorch release:
|
||||
|
||||
- Provides the latest version of ROCm but doesn't immediately support the latest stable PyTorch
|
||||
version.
|
||||
|
||||
- Offers :ref:`Docker images <pytorch-docker-compat>` with ROCm and PyTorch
|
||||
pre-installed.
|
||||
|
||||
- ROCm PyTorch repository: `<https://github.com/rocm/pytorch>`__
|
||||
|
||||
- See the :doc:`ROCm PyTorch installation guide <rocm-install-on-linux:install/3rd-party/pytorch-install>` to get started.
|
||||
|
||||
- Official PyTorch release:
|
||||
|
||||
- Provides the latest stable version of PyTorch but doesn't immediately support the latest ROCm version.
|
||||
|
||||
- Official PyTorch repository: `<https://github.com/pytorch/pytorch>`__
|
||||
|
||||
- See the `Nightly and latest stable version installation guide <https://pytorch.org/get-started/locally/>`_
|
||||
or `Previous versions <https://pytorch.org/get-started/previous-versions/>`_ to get started.
|
||||
|
||||
The upstream PyTorch includes an automatic HIPification solution that automatically generates HIP
|
||||
source code from the CUDA backend. This approach allows PyTorch to support ROCm without requiring
|
||||
manual code modifications.
|
||||
|
||||
ROCm's development is aligned with the stable release of PyTorch while upstream PyTorch testing uses
|
||||
the stable release of ROCm to maintain consistency.
|
||||
|
||||
.. _pytorch-docker-compat:
|
||||
|
||||
Docker image compatibility
|
||||
================================================================================
|
||||
|
||||
AMD validates and publishes ready-made `PyTorch <https://hub.docker.com/r/rocm/pytorch>`_
|
||||
images with ROCm backends on Docker Hub. The following Docker image tags and
|
||||
associated inventories are validated for `ROCm 6.3.0 <https://repo.radeon.com/rocm/apt/6.3/>`_.
|
||||
|
||||
.. list-table:: PyTorch Docker image components
|
||||
:header-rows: 1
|
||||
:class: docker-image-compatibility
|
||||
|
||||
* - Docker
|
||||
- PyTorch
|
||||
- Ubuntu
|
||||
- Python
|
||||
- Apex
|
||||
- torchvision
|
||||
- TensorBoard
|
||||
- MAGMA
|
||||
- UCX
|
||||
- OMPI
|
||||
- OFED
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3_ubuntu24.04_py3.12_pytorch_release_2.4.0/images/sha256-98ddf20333bd01ff749b8092b1190ee369a75d3b8c71c2fac80ffdcb1a98d529?context=explore"><i class="fab fa-docker fa-lg"></i></a>
|
||||
|
||||
- `2.4.0 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
|
||||
- 24.04
|
||||
- `3.12 <https://www.python.org/downloads/release/python-3128/>`_
|
||||
- `1.4.0 <https://github.com/ROCm/apex/tree/release/1.4.0>`_
|
||||
- `0.19.0 <https://github.com/pytorch/vision/tree/v0.19.0>`_
|
||||
- `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13>`_
|
||||
- `master <https://bitbucket.org/icl/magma/src/master/>`_
|
||||
- `1.10.0 <https://github.com/openucx/ucx/tree/v1.10.0>`_
|
||||
- `4.0.7 <https://github.com/open-mpi/ompi/tree/v4.0.7>`_
|
||||
- `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3_ubuntu22.04_py3.10_pytorch_release_2.4.0/images/sha256-402c9b4f1a6b5a81c634a1932b56cbe01abb699cfcc7463d226276997c6cf8ea?context=explore"><i class="fab fa-docker fa-lg"></i></a>
|
||||
|
||||
- `2.4.0 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
|
||||
- 22.04
|
||||
- `3.10 <https://www.python.org/downloads/release/python-31016/>`_
|
||||
- `1.4.0 <https://github.com/ROCm/apex/tree/release/1.4.0>`_
|
||||
- `0.19.0 <https://github.com/pytorch/vision/tree/v0.19.0>`_
|
||||
- `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13>`_
|
||||
- `master <https://bitbucket.org/icl/magma/src/master/>`_
|
||||
- `1.10.0 <https://github.com/openucx/ucx/tree/v1.10.0>`_
|
||||
- `4.0.7 <https://github.com/open-mpi/ompi/tree/v4.0.7>`_
|
||||
- `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3_ubuntu22.04_py3.9_pytorch_release_2.4.0/images/sha256-e0608b55d408c3bfe5c19fdd57a4ced3e0eb3a495b74c309980b60b156c526dd?context=explore"><i class="fab fa-docker fa-lg"></i></a>
|
||||
|
||||
- `2.4.0 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
|
||||
- 22.04
|
||||
- `3.9 <https://www.python.org/downloads/release/python-3918/>`_
|
||||
- `1.4.0 <https://github.com/ROCm/apex/tree/release/1.4.0>`_
|
||||
- `0.19.0 <https://github.com/pytorch/vision/tree/v0.19.0>`_
|
||||
- `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13>`_
|
||||
- `master <https://bitbucket.org/icl/magma/src/master/>`_
|
||||
- `1.10.0 <https://github.com/openucx/ucx/tree/v1.10.0>`_
|
||||
- `4.0.7 <https://github.com/open-mpi/ompi/tree/v4.0.7>`_
|
||||
- `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3_ubuntu22.04_py3.10_pytorch_release_2.3.0/images/sha256-652cf25263d05b1de548222970aeb76e60b12de101de66751264709c0d0ff9d8?context=explore"><i class="fab fa-docker fa-lg"></i></a>
|
||||
|
||||
- `2.3.0 <https://github.com/ROCm/pytorch/tree/release/2.3>`_
|
||||
- 22.04
|
||||
- `3.10 <https://www.python.org/downloads/release/python-31016/>`_
|
||||
- `1.3.0 <https://github.com/ROCm/apex/tree/release/1.3.0>`_
|
||||
- `0.18.0 <https://github.com/pytorch/vision/tree/v0.18.0>`_
|
||||
- `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13>`_
|
||||
- `master <https://bitbucket.org/icl/magma/src/master/>`_
|
||||
- `1.14.1 <https://github.com/openucx/ucx/tree/v1.14.1>`_
|
||||
- `4.1.5 <https://github.com/open-mpi/ompi/tree/v4.1.5>`_
|
||||
- `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3_ubuntu22.04_py3.10_pytorch_release_2.2.1/images/sha256-051976f26beab8f9aa65d999e3ad546c027b39240a0cc3ee81b114a9024f2912?context=explore"><i class="fab fa-docker fa-lg"></i></a>
|
||||
|
||||
- `2.2.1 <https://github.com/ROCm/pytorch/tree/release/2.2>`_
|
||||
- 22.04
|
||||
- `3.10 <https://www.python.org/downloads/release/python-31016/>`_
|
||||
- `1.2.0 <https://github.com/ROCm/apex/tree/release/1.2.0>`_
|
||||
- `0.17.1 <https://github.com/pytorch/vision/tree/v0.17.1>`_
|
||||
- `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13>`_
|
||||
- `master <https://bitbucket.org/icl/magma/src/master/>`_
|
||||
- `1.14.1 <https://github.com/openucx/ucx/tree/v1.14.1>`_
|
||||
- `4.1.5 <https://github.com/open-mpi/ompi/tree/v4.1.5>`_
|
||||
- `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3_ubuntu20.04_py3.9_pytorch_release_2.2.1/images/sha256-88c839a364d109d3748c100385bfa100d28090d25118cc723fd0406390ab2f7e?context=explore"><i class="fab fa-docker fa-lg"></i></a>
|
||||
|
||||
- `2.2.1 <https://github.com/ROCm/pytorch/tree/release/2.2>`_
|
||||
- 20.04
|
||||
- `3.9 <https://www.python.org/downloads/release/python-3921/>`_
|
||||
- `1.2.0 <https://github.com/ROCm/apex/tree/release/1.2.0>`_
|
||||
- `0.17.1 <https://github.com/pytorch/vision/tree/v0.17.1>`_
|
||||
- `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13.0>`_
|
||||
- `master <https://bitbucket.org/icl/magma/src/master/>`_
|
||||
- `1.10.0 <https://github.com/openucx/ucx/tree/v1.10.0>`_
|
||||
- `4.0.3 <https://github.com/open-mpi/ompi/tree/v4.0.3>`_
|
||||
- `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3_ubuntu22.04_py3.9_pytorch_release_1.13.1/images/sha256-994424ed07a63113f79dd9aa72159124c00f5fbfe18127151e6658f7d0b6f821?context=explore"><i class="fab fa-docker fa-lg"></i></a>
|
||||
|
||||
- `1.13.1 <https://github.com/ROCm/pytorch/tree/release/1.13>`_
|
||||
- 22.04
|
||||
- `3.9 <https://www.python.org/downloads/release/python-3921/>`_
|
||||
- `1.0.0 <https://github.com/ROCm/apex/tree/release/1.0.0>`_
|
||||
- `0.14.0 <https://github.com/pytorch/vision/tree/v0.14.0>`_
|
||||
- `2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18>`_
|
||||
- `master <https://bitbucket.org/icl/magma/src/master/>`_
|
||||
- `1.14.1 <https://github.com/openucx/ucx/tree/v1.14.1>`_
|
||||
- `4.1.5 <https://github.com/open-mpi/ompi/tree/v4.1.5>`_
|
||||
- `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3_ubuntu20.04_py3.9_pytorch_release_1.13.1/images/sha256-7b8139fe40a9aeb4bca3aecd15c22c1fa96e867d93479fa3a24fdeeeeafa1219?context=explore"><i class="fab fa-docker fa-lg"></i></a>
|
||||
|
||||
- `1.13.1 <https://github.com/ROCm/pytorch/tree/release/1.13>`_
|
||||
- 20.04
|
||||
- `3.9 <https://www.python.org/downloads/release/python-3921/>`_
|
||||
- `1.0.0 <https://github.com/ROCm/apex/tree/release/1.0.0>`_
|
||||
- `0.14.0 <https://github.com/pytorch/vision/tree/v0.14.0>`_
|
||||
- `2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18>`_
|
||||
- `master <https://bitbucket.org/icl/magma/src/master/>`_
|
||||
- `1.10.0 <https://github.com/openucx/ucx/tree/v1.10.0>`_
|
||||
- `4.0.3 <https://github.com/open-mpi/ompi/tree/v4.0.3>`_
|
||||
- `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_
|
||||
|
||||
Critical ROCm libraries for PyTorch
|
||||
================================================================================
|
||||
|
||||
The functionality of PyTorch with ROCm is shaped by its underlying library
|
||||
dependencies. These critical ROCm components affect the capabilities,
|
||||
performance, and feature set available to developers.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - ROCm library
|
||||
- Version
|
||||
- Purpose
|
||||
- Used in
|
||||
* - `Composable Kernel <https://github.com/ROCm/composable_kernel>`_
|
||||
- 1.1.0
|
||||
- Enables faster execution of core operations like matrix multiplication
|
||||
(GEMM), convolutions and transformations.
|
||||
- Speeds up ``torch.permute``, ``torch.view``, ``torch.matmul``,
|
||||
``torch.mm``, ``torch.bmm``, ``torch.nn.Conv2d``, ``torch.nn.Conv3d``
|
||||
and ``torch.nn.MultiheadAttention``.
|
||||
* - `hipBLAS <https://github.com/ROCm/hipBLAS>`_
|
||||
- 2.3.0
|
||||
- Provides GPU-accelerated Basic Linear Algebra Subprograms (BLAS) for
|
||||
matrix and vector operations.
|
||||
- Supports operations like matrix multiplication, matrix-vector products,
|
||||
and tensor contractions. Utilized in both dense and batched linear
|
||||
algebra operations.
|
||||
* - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`_
|
||||
- 0.10.0
|
||||
- hipBLASLt is an extension of the hipBLAS library, providing additional
|
||||
features like epilogues fused into the matrix multiplication kernel or
|
||||
use of integer tensor cores.
|
||||
- It accelerates operations like ``torch.matmul``, ``torch.mm``, and the
|
||||
matrix multiplications used in convolutional and linear layers.
|
||||
* - `hipCUB <https://github.com/ROCm/hipCUB>`_
|
||||
- 3.3.0
|
||||
- Provides a C++ template library for parallel algorithms for reduction,
|
||||
scan, sort and select.
|
||||
- Supports operations like ``torch.sum``, ``torch.cumsum``, ``torch.sort``
|
||||
and ``torch.topk``. Operations on sparse tensors or tensors with
|
||||
irregular shapes often involve scanning, sorting, and filtering, which
|
||||
hipCUB handles efficiently.
|
||||
* - `hipFFT <https://github.com/ROCm/hipFFT>`_
|
||||
- 1.0.17
|
||||
- Provides GPU-accelerated Fast Fourier Transform (FFT) operations.
|
||||
- Used in functions like the ``torch.fft`` module.
|
||||
* - `hipRAND <https://github.com/ROCm/hipRAND>`_
|
||||
- 2.11.0
|
||||
- Provides fast random number generation for GPUs.
|
||||
- The ``torch.rand``, ``torch.randn`` and stochastic layers like
|
||||
``torch.nn.Dropout``.
|
||||
* - `hipSOLVER <https://github.com/ROCm/hipSOLVER>`_
|
||||
- 2.3.0
|
||||
- Provides GPU-accelerated solvers for linear systems, eigenvalues, and
|
||||
singular value decompositions (SVD).
|
||||
- Supports functions like ``torch.linalg.solve``,
|
||||
``torch.linalg.eig``, and ``torch.linalg.svd``.
|
||||
* - `hipSPARSE <https://github.com/ROCm/hipSPARSE>`_
|
||||
- 3.1.2
|
||||
- Accelerates operations on sparse matrices, such as sparse matrix-vector
|
||||
or matrix-matrix products.
|
||||
- Sparse tensor operations ``torch.sparse``.
|
||||
* - `hipSPARSELt <https://github.com/ROCm/hipSPARSELt>`_
|
||||
- 0.2.2
|
||||
- Accelerates operations on sparse matrices, such as sparse matrix-vector
|
||||
or matrix-matrix products.
|
||||
- Sparse tensor operations ``torch.sparse``.
|
||||
* - `hipTensor <https://github.com/ROCm/hipTensor>`_
|
||||
- 1.4.0
|
||||
- Optimizes for high-performance tensor operations, such as contractions.
|
||||
- Accelerates tensor algebra, especially in deep learning and scientific
|
||||
computing.
|
||||
* - `MIOpen <https://github.com/ROCm/MIOpen>`_
|
||||
- 3.3.0
|
||||
- Optimizes deep learning primitives such as convolutions, pooling,
|
||||
normalization, and activation functions.
|
||||
- Speeds up convolutional neural networks (CNNs), recurrent neural
|
||||
networks (RNNs), and other layers. Used in operations like
|
||||
``torch.nn.Conv2d``, ``torch.nn.ReLU``, and ``torch.nn.LSTM``.
|
||||
* - `MIGraphX <https://github.com/ROCm/AMDMIGraphX>`_
|
||||
- 2.11.0
|
||||
- Add graph-level optimizations, ONNX models and mixed precision support
|
||||
and enable Ahead-of-Time (AOT) Compilation.
|
||||
- Speeds up inference models and executes ONNX models for
|
||||
compatibility with other frameworks.
|
||||
``torch.nn.Conv2d``, ``torch.nn.ReLU``, and ``torch.nn.LSTM``.
|
||||
* - `MIVisionX <https://github.com/ROCm/MIVisionX>`_
|
||||
- 3.1.0
|
||||
- Optimizes acceleration for computer vision and AI workloads like
|
||||
preprocessing, augmentation, and inferencing.
|
||||
- Faster data preprocessing and augmentation pipelines for datasets like
|
||||
ImageNet or COCO and easy to integrate into PyTorch's ``torch.utils.data``
|
||||
and ``torchvision`` workflows.
|
||||
* - `rocAL <https://github.com/ROCm/rocAL>`_
|
||||
- 2.1.0
|
||||
- Accelerates the data pipeline by offloading intensive preprocessing and
|
||||
augmentation tasks. rocAL is part of MIVisionX.
|
||||
- Easy to integrate into PyTorch's ``torch.utils.data`` and
|
||||
``torchvision`` data load workloads.
|
||||
* - `RCCL <https://github.com/ROCm/rccl>`_
|
||||
- 2.21.5
|
||||
- Optimizes for multi-GPU communication for operations like AllReduce and
|
||||
Broadcast.
|
||||
- Distributed data parallel training (``torch.nn.parallel.DistributedDataParallel``).
|
||||
Handles communication in multi-GPU setups.
|
||||
* - `rocDecode <https://github.com/ROCm/rocDecode>`_
|
||||
- 0.8.0
|
||||
- Provide hardware-accelerated data decoding capabilities, particularly
|
||||
for image, video, and other dataset formats.
|
||||
- Can be integrated in ``torch.utils.data``, ``torchvision.transforms``
|
||||
and ``torch.distributed``.
|
||||
* - `rocJPEG <https://github.com/ROCm/rocJPEG>`_
|
||||
- 0.6.0
|
||||
- Provide hardware-accelerated JPEG image decoding and encoding.
|
||||
- GPU accelerated ``torchvision.io.decode_jpeg`` and
|
||||
``torchvision.io.encode_jpeg`` and can be integrated in
|
||||
``torch.utils.data`` and ``torchvision``.
|
||||
* - `RPP <https://github.com/ROCm/RPP>`_
|
||||
- 1.9.1
|
||||
- Speed up data augmentation, transformation, and other preprocessing step.
|
||||
- Easy to integrate into PyTorch's ``torch.utils.data`` and
|
||||
``torchvision`` data load workloads.
|
||||
* - `rocThrust <https://github.com/ROCm/rocThrust>`_
|
||||
- 3.3.0
|
||||
- Provides a C++ template library for parallel algorithms like sorting,
|
||||
reduction, and scanning.
|
||||
- Utilized in backend operations for tensor computations requiring
|
||||
parallel processing.
|
||||
* - `rocWMMA <https://github.com/ROCm/rocWMMA>`_
|
||||
- 1.6.0
|
||||
- Accelerates warp-level matrix-multiply and matrix-accumulate to speed up matrix
|
||||
multiplication (GEMM) and accumulation operations with mixed precision
|
||||
support.
|
||||
- Linear layers (``torch.nn.Linear``), convolutional layers
|
||||
(``torch.nn.Conv2d``), attention layers, general tensor operations that
|
||||
involve matrix products, such as ``torch.matmul``, ``torch.bmm``, and
|
||||
more.
|
||||
|
||||
Supported and unsupported features
|
||||
================================================================================
|
||||
|
||||
The following section maps GPU-accelerated PyTorch features to their supported
|
||||
ROCm and PyTorch versions.
|
||||
|
||||
torch
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
`torch <https://pytorch.org/docs/stable/index.html>`_ is the central module of
|
||||
PyTorch, providing data structures for multi-dimensional tensors and
|
||||
implementing mathematical operations on them. It also includes utilities for
|
||||
efficient serialization of tensors and arbitrary data types, along with various
|
||||
other tools.
|
||||
|
||||
Tensor data types
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
The data type of a tensor is specified using the ``dtype`` attribute or argument, and PyTorch supports a wide range of data types for different use cases.
|
||||
|
||||
The following table lists `torch.Tensor <https://pytorch.org/docs/stable/tensors.html>`_'s single data types:
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Data type
|
||||
- Description
|
||||
- Since PyTorch
|
||||
- Since ROCm
|
||||
* - ``torch.float8_e4m3fn``
|
||||
- 8-bit floating point, e4m3
|
||||
- 2.3
|
||||
- 5.5
|
||||
* - ``torch.float8_e5m2``
|
||||
- 8-bit floating point, e5m2
|
||||
- 2.3
|
||||
- 5.5
|
||||
* - ``torch.float16`` or ``torch.half``
|
||||
- 16-bit floating point
|
||||
- 0.1.6
|
||||
- 2.0
|
||||
* - ``torch.bfloat16``
|
||||
- 16-bit floating point
|
||||
- 1.6
|
||||
- 2.6
|
||||
* - ``torch.float32`` or ``torch.float``
|
||||
- 32-bit floating point
|
||||
- 0.1.12_2
|
||||
- 2.0
|
||||
* - ``torch.float64`` or ``torch.double``
|
||||
- 64-bit floating point
|
||||
- 0.1.12_2
|
||||
- 2.0
|
||||
* - ``torch.complex32`` or ``torch.chalf``
|
||||
- PyTorch provides native support for 32-bit complex numbers
|
||||
- 1.6
|
||||
- 2.0
|
||||
* - ``torch.complex64`` or ``torch.cfloat``
|
||||
- PyTorch provides native support for 64-bit complex numbers
|
||||
- 1.6
|
||||
- 2.0
|
||||
* - ``torch.complex128`` or ``torch.cdouble``
|
||||
- PyTorch provides native support for 128-bit complex numbers
|
||||
- 1.6
|
||||
- 2.0
|
||||
* - ``torch.uint8``
|
||||
- 8-bit integer (unsigned)
|
||||
- 0.1.12_2
|
||||
- 2.0
|
||||
* - ``torch.uint16``
|
||||
- 16-bit integer (unsigned)
|
||||
- 2.3
|
||||
- Not natively supported
|
||||
* - ``torch.uint32``
|
||||
- 32-bit integer (unsigned)
|
||||
- 2.3
|
||||
- Not natively supported
|
||||
* - ``torch.uint64``
|
||||
- 32-bit integer (unsigned)
|
||||
- 2.3
|
||||
- Not natively supported
|
||||
* - ``torch.int8``
|
||||
- 8-bit integer (signed)
|
||||
- 1.12
|
||||
- 5.0
|
||||
* - ``torch.int16`` or ``torch.short``
|
||||
- 16-bit integer (signed)
|
||||
- 0.1.12_2
|
||||
- 2.0
|
||||
* - ``torch.int32`` or ``torch.int``
|
||||
- 32-bit integer (signed)
|
||||
- 0.1.12_2
|
||||
- 2.0
|
||||
* - ``torch.int64`` or ``torch.long``
|
||||
- 64-bit integer (signed)
|
||||
- 0.1.12_2
|
||||
- 2.0
|
||||
* - ``torch.bool``
|
||||
- Boolean
|
||||
- 1.2
|
||||
- 2.0
|
||||
* - ``torch.quint8``
|
||||
- Quantized 8-bit integer (unsigned)
|
||||
- 1.8
|
||||
- 5.0
|
||||
* - ``torch.qint8``
|
||||
- Quantized 8-bit integer (signed)
|
||||
- 1.8
|
||||
- 5.0
|
||||
* - ``torch.qint32``
|
||||
- Quantized 32-bit integer (signed)
|
||||
- 1.8
|
||||
- 5.0
|
||||
* - ``torch.quint4x2``
|
||||
- Quantized 4-bit integer (unsigned)
|
||||
- 1.8
|
||||
- 5.0
|
||||
|
||||
.. note::
|
||||
|
||||
Unsigned types aside from ``uint8`` are currently only have limited support in
|
||||
eager mode (they primarily exist to assist usage with ``torch.compile``).
|
||||
|
||||
The :doc:`ROCm precision support page <rocm:reference/precision-support>`
|
||||
collected the native HW support of different data types.
|
||||
|
||||
torch.cuda
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
``torch.cuda`` in PyTorch is a module that provides utilities and functions for
|
||||
managing and utilizing AMD and NVIDIA GPUs. It enables GPU-accelerated
|
||||
computations, memory management, and efficient execution of tensor operations,
|
||||
leveraging ROCm and CUDA as the underlying frameworks.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Data type
|
||||
- Description
|
||||
- Since PyTorch
|
||||
- Since ROCm
|
||||
* - Device management
|
||||
- Utilities for managing and interacting with GPUs.
|
||||
- 0.4.0
|
||||
- 3.8
|
||||
* - Tensor operations on GPU
|
||||
- Perform tensor operations such as addition and matrix multiplications on
|
||||
the GPU.
|
||||
- 0.4.0
|
||||
- 3.8
|
||||
* - Streams and events
|
||||
- Streams allow overlapping computation and communication for optimized
|
||||
performance, events enable synchronization.
|
||||
- 1.6.0
|
||||
- 3.8
|
||||
* - Memory management
|
||||
- Functions to manage and inspect memory usage like
|
||||
``torch.cuda.memory_allocated()``, ``torch.cuda.max_memory_allocated()``,
|
||||
``torch.cuda.memory_reserved()`` and ``torch.cuda.empty_cache()``.
|
||||
- 0.3.0
|
||||
- 1.9.2
|
||||
* - Running process lists of memory management
|
||||
- Return a human-readable printout of the running processes and their GPU
|
||||
memory use for a given device with functions like
|
||||
``torch.cuda.memory_stats()`` and ``torch.cuda.memory_summary()``.
|
||||
- 1.8.0
|
||||
- 4.0
|
||||
* - Communication collectives
|
||||
- A set of APIs that enable efficient communication between multiple GPUs,
|
||||
allowing for distributed computing and data parallelism.
|
||||
- 1.9.0
|
||||
- 5.0
|
||||
* - ``torch.cuda.CUDAGraph``
|
||||
- Graphs capture sequences of GPU operations to minimize kernel launch
|
||||
overhead and improve performance.
|
||||
- 1.10.0
|
||||
- 5.3
|
||||
* - TunableOp
|
||||
- A mechanism that allows certain operations to be more flexible and
|
||||
optimized for performance. It enables automatic tuning of kernel
|
||||
configurations and other settings to achieve the best possible
|
||||
performance based on the specific hardware (GPU) and workload.
|
||||
- 2.0
|
||||
- 5.4
|
||||
* - NVIDIA Tools Extension (NVTX)
|
||||
- Integration with NVTX for profiling and debugging GPU performance using
|
||||
NVIDIA's Nsight tools.
|
||||
- 1.8.0
|
||||
- ❌
|
||||
* - Lazy loading NVRTC
|
||||
- Delays JIT compilation with NVRTC until the code is explicitly needed.
|
||||
- 1.13.0
|
||||
- ❌
|
||||
* - Jiterator (beta)
|
||||
- Jiterator allows asynchronous data streaming into computation streams
|
||||
during training loops.
|
||||
- 1.13.0
|
||||
- 5.2
|
||||
|
||||
.. Need to validate and extend.
|
||||
|
||||
torch.backends.cuda
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
``torch.backends.cuda`` is a PyTorch module that provides configuration options
|
||||
and flags to control the behavior of CUDA or ROCm operations. It is part of the
|
||||
PyTorch backend configuration system, which allows users to fine-tune how
|
||||
PyTorch interacts with the CUDA or ROCm environment.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Data type
|
||||
- Description
|
||||
- Since PyTorch
|
||||
- Since ROCm
|
||||
* - ``cufft_plan_cache``
|
||||
- Manages caching of GPU FFT plans to optimize repeated FFT computations.
|
||||
- 1.7.0
|
||||
- 5.0
|
||||
* - ``matmul.allow_tf32``
|
||||
- Enables or disables the use of TensorFloat-32 (TF32) precision for
|
||||
faster matrix multiplications on GPUs with Tensor Cores.
|
||||
- 1.10.0
|
||||
- ❌
|
||||
* - ``matmul.allow_fp16_reduced_precision_reduction``
|
||||
- Reduced precision reductions (e.g., with fp16 accumulation type) are
|
||||
allowed with fp16 GEMMs.
|
||||
- 2.0
|
||||
- ❌
|
||||
* - ``matmul.allow_bf16_reduced_precision_reduction``
|
||||
- Reduced precision reductions are allowed with bf16 GEMMs.
|
||||
- 2.0
|
||||
- ❌
|
||||
* - ``enable_cudnn_sdp``
|
||||
- Globally enables cuDNN SDPA's kernels within SDPA.
|
||||
- 2.0
|
||||
- ❌
|
||||
* - ``enable_flash_sdp``
|
||||
- Globally enables or disables FlashAttention for SDPA.
|
||||
- 2.1
|
||||
- ❌
|
||||
* - ``enable_mem_efficient_sdp``
|
||||
- Globally enables or disables Memory-Efficient Attention for SDPA.
|
||||
- 2.1
|
||||
- ❌
|
||||
* - ``enable_math_sdp``
|
||||
- Globally enables or disables the PyTorch C++ implementation within SDPA.
|
||||
- 2.1
|
||||
- ❌
|
||||
|
||||
.. Need to validate and extend.
|
||||
|
||||
torch.backends.cudnn
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Supported ``torch`` options:
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Data type
|
||||
- Description
|
||||
- Since PyTorch
|
||||
- Since ROCm
|
||||
* - ``allow_tf32``
|
||||
- TensorFloat-32 tensor cores may be used in cuDNN convolutions on NVIDIA
|
||||
Ampere or newer GPUs.
|
||||
- 1.12.0
|
||||
- ❌
|
||||
* - ``deterministic``
|
||||
- A bool that, if True, causes cuDNN to only use deterministic
|
||||
convolution algorithms.
|
||||
- 1.12.0
|
||||
- 6.0
|
||||
|
||||
Automatic mixed precision: torch.amp
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
PyTorch that automates the process of using both 16-bit (half-precision,
|
||||
float16) and 32-bit (single-precision, float32) floating-point types in model
|
||||
training and inference.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Data type
|
||||
- Description
|
||||
- Since PyTorch
|
||||
- Since ROCm
|
||||
* - Autocasting
|
||||
- Instances of autocast serve as context managers or decorators that allow
|
||||
regions of your script to run in mixed precision.
|
||||
- 1.9
|
||||
- 2.5
|
||||
* - Gradient scaling
|
||||
- To prevent underflow, “gradient scaling” multiplies the network’s
|
||||
loss(es) by a scale factor and invokes a backward pass on the scaled
|
||||
loss(es). Gradients flowing backward through the network are then
|
||||
scaled by the same factor. In other words, gradient values have a
|
||||
larger magnitude, so they don’t flush to zero.
|
||||
- 1.9
|
||||
- 2.5
|
||||
* - CUDA op-specific behavior
|
||||
- These ops always go through autocasting whether they are invoked as part
|
||||
of a ``torch.nn.Module``, as a function, or as a ``torch.Tensor`` method. If
|
||||
functions are exposed in multiple namespaces, they go through
|
||||
autocasting regardless of the namespace.
|
||||
- 1.9
|
||||
- 2.5
|
||||
|
||||
Distributed library features
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
The PyTorch distributed library includes a collective of parallelism modules, a
|
||||
communications layer, and infrastructure for launching and debugging large
|
||||
training jobs. See :ref:`rocm-for-ai-pytorch-distributed` for more information.
|
||||
|
||||
The Distributed Library feature in PyTorch provides tools and APIs for building
|
||||
and running distributed machine learning workflows. It allows training models
|
||||
across multiple processes, GPUs, or nodes in a cluster, enabling efficient use
|
||||
of computational resources and scalability for large-scale tasks.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Features
|
||||
- Description
|
||||
- Since PyTorch
|
||||
- Since ROCm
|
||||
* - TensorPipe
|
||||
- TensorPipe is a point-to-point communication library integrated into
|
||||
PyTorch for distributed training. It is designed to handle tensor data
|
||||
transfers efficiently between different processes or devices, including
|
||||
those on separate machines.
|
||||
- 1.8
|
||||
- 5.4
|
||||
* - Gloo
|
||||
- Gloo is designed for multi-machine and multi-GPU setups, enabling
|
||||
efficient communication and synchronization between processes. Gloo is
|
||||
one of the default backends for PyTorch's Distributed Data Parallel
|
||||
(DDP) and RPC frameworks, alongside other backends like NCCL and MPI.
|
||||
- 1.0
|
||||
- 2.0
|
||||
|
||||
torch.compiler
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Features
|
||||
- Description
|
||||
- Since PyTorch
|
||||
- Since ROCm
|
||||
* - ``torch.compiler`` (AOT Autograd)
|
||||
- Autograd captures not only the user-level code, but also backpropagation,
|
||||
which results in capturing the backwards pass “ahead-of-time”. This
|
||||
enables acceleration of both forwards and backwards pass using
|
||||
``TorchInductor``.
|
||||
- 2.0
|
||||
- 5.3
|
||||
* - ``torch.compiler`` (TorchInductor)
|
||||
- The default ``torch.compile`` deep learning compiler that generates fast
|
||||
code for multiple accelerators and backends. You need to use a backend
|
||||
compiler to make speedups through ``torch.compile`` possible. For AMD,
|
||||
NVIDIA, and Intel GPUs, it leverages OpenAI Triton as the key building block.
|
||||
- 2.0
|
||||
- 5.3
|
||||
|
||||
torchaudio
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
The `torchaudio <https://pytorch.org/audio/stable/index.html>`_ library provides
|
||||
utilities for processing audio data in PyTorch, such as audio loading,
|
||||
transformations, and feature extraction.
|
||||
|
||||
To ensure GPU-acceleration with ``torchaudio.transforms``, you need to move audio
|
||||
data (waveform tensor) explicitly to GPU using ``.to('cuda')``.
|
||||
|
||||
The following ``torchaudio`` features are GPU-accelerated.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Features
|
||||
- Description
|
||||
- Since torchaudio version
|
||||
- Since ROCm
|
||||
* - ``torchaudio.transforms.Spectrogram``
|
||||
- Generate spectrogram of an input waveform using STFT.
|
||||
- 0.6.0
|
||||
- 4.5
|
||||
* - ``torchaudio.transforms.MelSpectrogram``
|
||||
- Generate the mel-scale spectrogram of raw audio signals.
|
||||
- 0.9.0
|
||||
- 4.5
|
||||
* - ``torchaudio.transforms.MFCC``
|
||||
- Extract of MFCC features.
|
||||
- 0.9.0
|
||||
- 4.5
|
||||
* - ``torchaudio.transforms.Resample``
|
||||
- Resample a signal from one frequency to another
|
||||
- 0.9.0
|
||||
- 4.5
|
||||
|
||||
torchvision
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
The `torchvision <https://pytorch.org/vision/stable/index.html>`_ library
|
||||
provide datasets, model architectures, and common image transformations for
|
||||
computer vision.
|
||||
|
||||
The following ``torchvision`` features are GPU-accelerated.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Features
|
||||
- Description
|
||||
- Since torchvision version
|
||||
- Since ROCm
|
||||
* - ``torchvision.transforms.functional``
|
||||
- Provides GPU-compatible transformations for image preprocessing like
|
||||
resize, normalize, rotate and crop.
|
||||
- 0.2.0
|
||||
- 4.0
|
||||
* - ``torchvision.ops``
|
||||
- GPU-accelerated operations for object detection and segmentation tasks.
|
||||
``torchvision.ops.roi_align``, ``torchvision.ops.nms`` and
|
||||
``box_convert``.
|
||||
- 0.6.0
|
||||
- 3.3
|
||||
* - ``torchvision.models`` with ``.to('cuda')``
|
||||
- ``torchvision`` provides several pre-trained models (ResNet, Faster
|
||||
R-CNN, Mask R-CNN, ...) that can run on CUDA for faster inference and
|
||||
training.
|
||||
- 0.1.6
|
||||
- 2.x
|
||||
* - ``torchvision.io``
|
||||
- Video decoding and frame extraction using GPU acceleration with NVIDIA’s
|
||||
NVDEC and nvJPEG (rocJPEG) on CUDA-enabled GPUs.
|
||||
- 0.4.0
|
||||
- 6.3
|
||||
|
||||
torchtext
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
The `torchtext <https://pytorch.org/text/stable/index.html>`_ library provides
|
||||
utilities for processing and working with text data in PyTorch, including
|
||||
tokenization, vocabulary management, and text embeddings. torchtext supports
|
||||
preprocessing pipelines and integration with PyTorch models, simplifying the
|
||||
implementation of natural language processing (NLP) tasks.
|
||||
|
||||
To leverage GPU acceleration in torchtext, you need to move tensors
|
||||
explicitly to the GPU using ``.to('cuda')``.
|
||||
|
||||
* torchtext does not implement its own kernels. ROCm support is enabled by linking against ROCm libraries.
|
||||
|
||||
* Only official release exists.
|
||||
|
||||
torchtune
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
The `torchtune <https://pytorch.org/torchtune/stable/index.html>`_ library for
|
||||
authoring, fine-tuning and experimenting with LLMs.
|
||||
|
||||
* Usage: It works out-of-the-box, enabling developers to fine-tune ROCm PyTorch solutions.
|
||||
|
||||
* Only official release exists.
|
||||
|
||||
torchserve
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
The `torchserve <https://pytorch.org/torchserve/>`_ is a PyTorch domain library
|
||||
for common sparsity and parallelism primitives needed for large-scale recommender
|
||||
systems.
|
||||
|
||||
* torchtext does not implement its own kernels. ROCm support is enabled by linking against ROCm libraries.
|
||||
|
||||
* Only official release exists.
|
||||
|
||||
torchrec
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
The `torchrec <https://pytorch.org/torchrec/>`_ is a PyTorch domain library for
|
||||
common sparsity and parallelism primitives needed for large-scale recommender
|
||||
systems.
|
||||
|
||||
* torchrec does not implement its own kernels. ROCm support is enabled by linking against ROCm libraries.
|
||||
|
||||
* Only official release exists.
|
||||
|
||||
Unsupported PyTorch features
|
||||
----------------------------
|
||||
|
||||
The following are GPU-accelerated PyTorch features not currently supported by ROCm.
|
||||
|
||||
.. list-table::
|
||||
:widths: 30, 60, 10
|
||||
:header-rows: 1
|
||||
|
||||
* - Data type
|
||||
- Description
|
||||
- Since PyTorch
|
||||
* - APEX batch norm
|
||||
- Use APEX batch norm instead of PyTorch batch norm.
|
||||
- 1.6.0
|
||||
* - ``torch.backends.cuda`` / ``matmul.allow_tf32``
|
||||
- A bool that controls whether TensorFloat-32 tensor cores may be used in
|
||||
matrix multiplications.
|
||||
- 1.7
|
||||
* - ``torch.cuda`` / NVIDIA Tools Extension (NVTX)
|
||||
- Integration with NVTX for profiling and debugging GPU performance using
|
||||
NVIDIA's Nsight tools.
|
||||
- 1.7.0
|
||||
* - ``torch.cuda`` / Lazy loading NVRTC
|
||||
- Delays JIT compilation with NVRTC until the code is explicitly needed.
|
||||
- 1.8.0
|
||||
* - ``torch-tensorrt``
|
||||
- Integrate TensorRT library for optimizing and deploying PyTorch models.
|
||||
ROCm does not have equialent library for TensorRT.
|
||||
- 1.9.0
|
||||
* - ``torch.backends`` / ``cudnn.allow_tf32``
|
||||
- TensorFloat-32 tensor cores may be used in cuDNN convolutions.
|
||||
- 1.10.0
|
||||
* - ``torch.backends.cuda`` / ``matmul.allow_fp16_reduced_precision_reduction``
|
||||
- Reduced precision reductions with fp16 accumulation type are
|
||||
allowed with fp16 GEMMs.
|
||||
- 2.0
|
||||
* - ``torch.backends.cuda`` / ``matmul.allow_bf16_reduced_precision_reduction``
|
||||
- Reduced precision reductions are allowed with bf16 GEMMs.
|
||||
- 2.0
|
||||
* - ``torch.nn.functional`` / ``scaled_dot_product_attention``
|
||||
- Flash attention backend for SDPA to accelerate attention computation in
|
||||
transformer-based models.
|
||||
- 2.0
|
||||
* - ``torch.backends.cuda`` / ``enable_cudnn_sdp``
|
||||
- Globally enables cuDNN SDPA's kernels within SDPA.
|
||||
- 2.0
|
||||
* - ``torch.backends.cuda`` / ``enable_flash_sdp``
|
||||
- Globally enables or disables FlashAttention for SDPA.
|
||||
- 2.1
|
||||
* - ``torch.backends.cuda`` / ``enable_mem_efficient_sdp``
|
||||
- Globally enables or disables Memory-Efficient Attention for SDPA.
|
||||
- 2.1
|
||||
* - ``torch.backends.cuda`` / ``enable_math_sdp``
|
||||
- Globally enables or disables the PyTorch C++ implementation within SDPA.
|
||||
- 2.1
|
||||
* - Dynamic parallelism
|
||||
- PyTorch itself does not directly expose dynamic parallelism as a core
|
||||
feature. Dynamic parallelism allow GPU threads to launch additional
|
||||
threads which can be reached using custom operations via the
|
||||
``torch.utils.cpp_extension`` module.
|
||||
- Not a core feature
|
||||
* - Unified memory support in PyTorch
|
||||
- Unified Memory is not directly exposed in PyTorch's core API, it can be
|
||||
utilized effectively through custom CUDA extensions or advanced
|
||||
workflows.
|
||||
- Not a core feature
|
||||
|
||||
Use cases and recommendations
|
||||
================================================================================
|
||||
|
||||
* :doc:`Using ROCm for AI: training a model </how-to/rocm-for-ai/train-a-model>` provides
|
||||
guidance on how to leverage the ROCm platform for training AI models. It covers the steps, tools, and best practices
|
||||
for optimizing training workflows on AMD GPUs using PyTorch features.
|
||||
|
||||
* :doc:`Single-GPU fine-tuning and inference </how-to/llm-fine-tuning-optimization/single-gpu-fine-tuning-and-inference>`
|
||||
describes and demonstrates how to use the ROCm platform for the fine-tuning and inference of
|
||||
machine learning models, particularly large language models (LLMs), on systems with a single AMD
|
||||
Instinct MI300X accelerator. This page provides a detailed guide for setting up, optimizing, and
|
||||
executing fine-tuning and inference workflows in such environments.
|
||||
|
||||
* :doc:`Multi-GPU fine-tuning and inference optimization </how-to/llm-fine-tuning-optimization/multi-gpu-fine-tuning-and-inference>`
|
||||
describes and demonstrates the fine-tuning and inference of machine learning models on systems
|
||||
with multi MI300X accelerators.
|
||||
|
||||
* The :doc:`Instinct MI300X workload optimization guide </how-to/tuning-guides/mi300x/workload>` provides detailed
|
||||
guidance on optimizing workloads for the AMD Instinct MI300X accelerator using ROCm. This guide is aimed at helping
|
||||
users achieve optimal performance for deep learning and other high-performance computing tasks on the MI300X
|
||||
accelerator.
|
||||
|
||||
* The :doc:`Inception with PyTorch documentation </conceptual/ai-pytorch-inception>`
|
||||
describes how PyTorch integrates with ROCm for AI workloads It outlines the use of PyTorch on the ROCm platform and
|
||||
focuses on how to efficiently leverage AMD GPU hardware for training and inference tasks in AI applications.
|
||||
|
||||
For more use cases and recommendations, see `ROCm PyTorch blog posts <https://rocm.blogs.amd.com/blog/tag/pytorch.html>`_
|
||||
@@ -32,7 +32,7 @@ architecture.
|
||||
|
||||
* [AMD Instinct™ MI250 microarchitecture](./gpu-arch/mi250.md)
|
||||
* [AMD Instinct MI200/CDNA2 ISA](https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf)
|
||||
* [White paper](https://www.amd.com/system/files/documents/amd-cdna2-white-paper.pdf)
|
||||
* [White paper](https://www.amd.com/content/dam/amd/en/documents/instinct-business-docs/white-papers/amd-cdna2-white-paper.pdf)
|
||||
* [Performance counters](./gpu-arch/mi300-mi200-performance-counters.rst)
|
||||
|
||||
:::
|
||||
@@ -45,7 +45,7 @@ architecture.
|
||||
|
||||
* [AMD Instinct™ MI100 microarchitecture](./gpu-arch/mi100.md)
|
||||
* [AMD Instinct MI100/CDNA1 ISA](https://www.amd.com/system/files/TechDocs/instinct-mi100-cdna1-shader-instruction-set-architecture%C2%A0.pdf)
|
||||
* [White paper](https://www.amd.com/system/files/documents/amd-cdna-whitepaper.pdf)
|
||||
* [White paper](https://www.amd.com/content/dam/amd/en/documents/instinct-business-docs/white-papers/amd-cdna-white-paper.pdf)
|
||||
|
||||
:::
|
||||
|
||||
@@ -55,7 +55,6 @@ architecture.
|
||||
* [AMD RDNA3 ISA](https://www.amd.com/system/files/TechDocs/rdna3-shader-instruction-set-architecture-feb-2023_0.pdf)
|
||||
* [AMD RDNA2 ISA](https://www.amd.com/system/files/TechDocs/rdna2-shader-instruction-set-architecture.pdf)
|
||||
* [AMD RDNA ISA](https://www.amd.com/system/files/TechDocs/rdna-shader-instruction-set-architecture.pdf)
|
||||
* [AMD RDNA Architecture White Paper](https://www.amd.com/system/files/documents/rdna-whitepaper.pdf)
|
||||
|
||||
:::
|
||||
|
||||
|
||||
21
docs/conf.py
21
docs/conf.py
@@ -6,6 +6,8 @@
|
||||
|
||||
import os
|
||||
import shutil
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
shutil.copy2("../RELEASE.md", "./about/release-notes.md")
|
||||
|
||||
@@ -49,6 +51,9 @@ article_pages = [
|
||||
|
||||
{"file": "how-to/rocm-for-ai/training/index", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/train-a-model", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/prerequisite-system-validation", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/megatron-lm", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/pytorch-training", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/scale-model-training", "os": ["linux"]},
|
||||
|
||||
{"file": "how-to/rocm-for-ai/fine-tuning/index", "os": ["linux"]},
|
||||
@@ -63,7 +68,7 @@ article_pages = [
|
||||
{"file": "how-to/rocm-for-ai/inference/llm-inference-frameworks", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/vllm-benchmark", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/deploy-your-model", "os": ["linux"]},
|
||||
|
||||
|
||||
{"file": "how-to/rocm-for-ai/inference-optimization/index", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference-optimization/model-quantization", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference-optimization/model-acceleration-libraries", "os": ["linux"]},
|
||||
@@ -86,11 +91,16 @@ article_pages = [
|
||||
|
||||
external_toc_path = "./sphinx/_toc.yml"
|
||||
|
||||
extensions = ["rocm_docs", "sphinx_reredirects", "sphinx_sitemap"]
|
||||
# Add the _extensions directory to Python's search path
|
||||
sys.path.append(str(Path(__file__).parent / 'extension'))
|
||||
|
||||
extensions = ["rocm_docs", "sphinx_reredirects", "sphinx_sitemap", "sphinxcontrib.datatemplates", "version-ref"]
|
||||
|
||||
compatibility_matrix_file = str(Path(__file__).parent / 'compatibility/compatibility-matrix-historical-6.0.csv')
|
||||
|
||||
external_projects_current_project = "rocm"
|
||||
|
||||
# Uncomment if facing rate limit exceed issue with local build
|
||||
# Uncomment if facing rate limit exceed issue with local build
|
||||
# external_projects_remote_repository = ""
|
||||
|
||||
html_baseurl = os.environ.get("READTHEDOCS_CANONICAL_URL", "https://rocm-stg.amd.com/")
|
||||
@@ -101,8 +111,9 @@ if os.environ.get("READTHEDOCS", "") == "True":
|
||||
html_theme = "rocm_docs_theme"
|
||||
html_theme_options = {"flavor": "rocm-docs-home"}
|
||||
|
||||
html_static_path = ["sphinx/static/css"]
|
||||
html_css_files = ["rocm_custom.css", "rocm_rn.css"]
|
||||
html_static_path = ["sphinx/static/css", "extension/how-to/rocm-for-ai/inference"]
|
||||
html_css_files = ["rocm_custom.css", "rocm_rn.css", "vllm-benchmark.css"]
|
||||
html_js_files = ["vllm-benchmark.js"]
|
||||
|
||||
html_title = "ROCm Documentation"
|
||||
|
||||
|
||||
@@ -0,0 +1,153 @@
|
||||
vllm_benchmark:
|
||||
unified_docker:
|
||||
latest:
|
||||
pull_tag: rocm/vllm:rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6/images/sha256-9a12ef62bbbeb5a4c30a01f702c8e025061f575aa129f291a49fbd02d6b4d6c9
|
||||
rocm_version: 6.3.1
|
||||
vllm_version: 0.6.6
|
||||
pytorch_version: 2.7.0 (2.7.0a0+git3a58512)
|
||||
model_groups:
|
||||
- group: Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: pyt_vllm_llama-3.1-8b
|
||||
model_repo: meta-llama/Llama-3.1-8B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-8B
|
||||
precision: float16
|
||||
- model: Llama 3.1 70B
|
||||
mad_tag: pyt_vllm_llama-3.1-70b
|
||||
model_repo: meta-llama/Llama-3.1-70B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
|
||||
precision: float16
|
||||
- model: Llama 3.1 405B
|
||||
mad_tag: pyt_vllm_llama-3.1-405b
|
||||
model_repo: meta-llama/Llama-3.1-405B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
|
||||
precision: float16
|
||||
- model: Llama 3.2 11B Vision
|
||||
mad_tag: pyt_vllm_llama-3.2-11b-vision-instruct
|
||||
model_repo: meta-llama/Llama-3.2-11B-Vision-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct
|
||||
precision: float16
|
||||
- model: Llama 2 7B
|
||||
mad_tag: pyt_vllm_llama-2-7b
|
||||
model_repo: meta-llama/Llama-2-7b-chat-hf
|
||||
url: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
|
||||
precision: float16
|
||||
- model: Llama 2 70B
|
||||
mad_tag: pyt_vllm_llama-2-70b
|
||||
model_repo: meta-llama/Llama-2-70b-chat-hf
|
||||
url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
|
||||
precision: float16
|
||||
- model: Llama 3.1 70B FP8
|
||||
mad_tag: pyt_vllm_llama-3.1-70b_fp8
|
||||
model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
- model: Llama 3.1 405B FP8
|
||||
mad_tag: pyt_vllm_llama-3.1-405b_fp8
|
||||
model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
- group: Mistral
|
||||
tag: mistral
|
||||
models:
|
||||
- model: Mixtral MoE 8x7B
|
||||
mad_tag: pyt_vllm_mixtral-8x7b
|
||||
model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
|
||||
url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
|
||||
precision: float16
|
||||
- model: Mixtral MoE 8x22B
|
||||
mad_tag: pyt_vllm_mixtral-8x22b
|
||||
model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
|
||||
url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
|
||||
precision: float16
|
||||
- model: Mistral 7B
|
||||
mad_tag: pyt_vllm_mistral-7b
|
||||
model_repo: mistralai/Mistral-7B-Instruct-v0.3
|
||||
url: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3
|
||||
precision: float16
|
||||
- model: Mixtral MoE 8x7B FP8
|
||||
mad_tag: pyt_vllm_mixtral-8x7b_fp8
|
||||
model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
|
||||
url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
|
||||
precision: float8
|
||||
- model: Mixtral MoE 8x22B FP8
|
||||
mad_tag: pyt_vllm_mixtral-8x22b_fp8
|
||||
model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
|
||||
url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
|
||||
precision: float8
|
||||
- model: Mistral 7B FP8
|
||||
mad_tag: pyt_vllm_mistral-7b_fp8
|
||||
model_repo: amd/Mistral-7B-v0.1-FP8-KV
|
||||
url: https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV
|
||||
precision: float8
|
||||
- group: Qwen
|
||||
tag: qwen
|
||||
models:
|
||||
- model: Qwen2 7B
|
||||
mad_tag: pyt_vllm_qwen2-7b
|
||||
model_repo: Qwen/Qwen2-7B-Instruct
|
||||
url: https://huggingface.co/Qwen/Qwen2-7B-Instruct
|
||||
precision: float16
|
||||
- model: Qwen2 72B
|
||||
mad_tag: pyt_vllm_qwen2-72b
|
||||
model_repo: Qwen/Qwen2-72B-Instruct
|
||||
url: https://huggingface.co/Qwen/Qwen2-72B-Instruct
|
||||
precision: float16
|
||||
- group: JAIS
|
||||
tag: jais
|
||||
models:
|
||||
- model: JAIS 13B
|
||||
mad_tag: pyt_vllm_jais-13b
|
||||
model_repo: core42/jais-13b-chat
|
||||
url: https://huggingface.co/core42/jais-13b-chat
|
||||
precision: float16
|
||||
- model: JAIS 30B
|
||||
mad_tag: pyt_vllm_jais-30b
|
||||
model_repo: core42/jais-30b-chat-v3
|
||||
url: https://huggingface.co/core42/jais-30b-chat-v3
|
||||
precision: float16
|
||||
- group: DBRX
|
||||
tag: dbrx
|
||||
models:
|
||||
- model: DBRX Instruct
|
||||
mad_tag: pyt_vllm_dbrx-instruct
|
||||
model_repo: databricks/dbrx-instruct
|
||||
url: https://huggingface.co/databricks/dbrx-instruct
|
||||
precision: float16
|
||||
- model: DBRX Instruct FP8
|
||||
mad_tag: pyt_vllm_dbrx_fp8
|
||||
model_repo: amd/dbrx-instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/dbrx-instruct-FP8-KV
|
||||
precision: float8
|
||||
- group: Gemma
|
||||
tag: gemma
|
||||
models:
|
||||
- model: Gemma 2 27B
|
||||
mad_tag: pyt_vllm_gemma-2-27b
|
||||
model_repo: google/gemma-2-27b
|
||||
url: https://huggingface.co/google/gemma-2-27b
|
||||
precision: float16
|
||||
- group: Cohere
|
||||
tag: cohere
|
||||
models:
|
||||
- model: C4AI Command R+ 08-2024
|
||||
mad_tag: pyt_vllm_c4ai-command-r-plus-08-2024
|
||||
model_repo: CohereForAI/c4ai-command-r-plus-08-2024
|
||||
url: https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024
|
||||
precision: float16
|
||||
- model: C4AI Command R+ 08-2024 FP8
|
||||
mad_tag: pyt_vllm_command-r-plus_fp8
|
||||
model_repo: amd/c4ai-command-r-plus-FP8-KV
|
||||
url: https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV
|
||||
precision: float8
|
||||
- group: DeepSeek
|
||||
tag: deepseek
|
||||
models:
|
||||
- model: DeepSeek MoE 16B
|
||||
mad_tag: pyt_vllm_deepseek-moe-16b-chat
|
||||
model_repo: deepseek-ai/deepseek-moe-16b-chat
|
||||
url: https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat
|
||||
precision: float16
|
||||
0
docs/extension/__init__.py
Normal file
0
docs/extension/__init__.py
Normal file
212
docs/extension/how-to/rocm-for-ai/inference/vllm-benchmark.js
Normal file
212
docs/extension/how-to/rocm-for-ai/inference/vllm-benchmark.js
Normal file
@@ -0,0 +1,212 @@
|
||||
function ready(proc) {
|
||||
// Check if page is loaded. If so, init.
|
||||
if (document.readyState !== "loading") {
|
||||
proc();
|
||||
} else {
|
||||
// Otherwise, wait for DOMContentLoaded event.
|
||||
document.addEventListener("DOMContentLoaded", proc);
|
||||
}
|
||||
}
|
||||
|
||||
ready(() => {
|
||||
const ModelPicker = {
|
||||
// Selector strings for DOM elements
|
||||
SELECTORS: {
|
||||
CONTAINER: "#vllm-benchmark-ud-params-picker",
|
||||
MODEL_GROUP_BTN: 'div[data-param-k="model-group"][data-param-v]',
|
||||
MODEL_PARAM_BTN: 'div[data-param-k="model"][data-param-v]',
|
||||
MODEL_DOC: "div.model-doc",
|
||||
},
|
||||
CSS_CLASSES: {
|
||||
HIDDEN: "hidden",
|
||||
},
|
||||
ATTRIBUTES: {
|
||||
PARAM_KEY: "data-param-k", // URL search parameter key (i.e., "model")
|
||||
PARAM_VALUE: "data-param-v", // URL search param value (e.g., "pyt_vllm_llama-3.1-8b", "pyt_vllm_llama-3.1-70b") -- these are MAD model tags
|
||||
PARAM_GROUP: "data-param-group", // Model group (e.g., "llama", "mistral")
|
||||
PARAM_STATE: "data-param-state", // Selection state
|
||||
},
|
||||
|
||||
// Cache DOM elements
|
||||
elements: {
|
||||
container: null,
|
||||
modelGroups: null,
|
||||
modelParams: null,
|
||||
modelDocs: null,
|
||||
},
|
||||
|
||||
data: {
|
||||
availableModels: new Set(),
|
||||
modelsByGroup: new Map(),
|
||||
modelToGroupMap: new Map(),
|
||||
formattedModelClassMap: new Map(), //TODO
|
||||
},
|
||||
|
||||
init() {
|
||||
this.elements.container = document.querySelector(
|
||||
this.SELECTORS.CONTAINER,
|
||||
);
|
||||
if (!this.elements.container) return;
|
||||
|
||||
this.cacheDOMElements();
|
||||
if (!this.validateElements()) return;
|
||||
|
||||
this.buildModelData();
|
||||
this.bindEvents();
|
||||
this.initializeState();
|
||||
},
|
||||
|
||||
cacheDOMElements() {
|
||||
const { CONTAINER, MODEL_GROUP_BTN, MODEL_PARAM_BTN, MODEL_DOC } =
|
||||
this.SELECTORS;
|
||||
this.elements = {
|
||||
container: document.querySelector(CONTAINER),
|
||||
modelGroups: document.querySelectorAll(MODEL_GROUP_BTN),
|
||||
modelParams: document.querySelectorAll(MODEL_PARAM_BTN),
|
||||
modelDocs: document.querySelectorAll(MODEL_DOC),
|
||||
};
|
||||
},
|
||||
|
||||
validateElements() {
|
||||
const { modelGroups, modelParams } = this.elements;
|
||||
if (!modelGroups.length || !modelParams.length) {
|
||||
console.warn("Model picker is missing required elements");
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
},
|
||||
|
||||
buildModelData() {
|
||||
const { PARAM_VALUE, PARAM_GROUP } = this.ATTRIBUTES;
|
||||
|
||||
this.elements.modelParams.forEach((model) => {
|
||||
const modelTag = model.getAttribute(PARAM_VALUE);
|
||||
const groupTag = model.getAttribute(PARAM_GROUP);
|
||||
|
||||
if (!modelTag || !groupTag) return;
|
||||
|
||||
this.data.availableModels.add(modelTag);
|
||||
this.data.modelToGroupMap.set(modelTag, groupTag);
|
||||
|
||||
// FIXME: this is because Sphinx auto-formats class names to use dashes
|
||||
this.data.formattedModelClassMap.set(
|
||||
modelTag,
|
||||
modelTag.replace(/[^a-zA-Z0-9]/g, "-"),
|
||||
);
|
||||
|
||||
if (!this.data.modelsByGroup.has(groupTag)) {
|
||||
this.data.modelsByGroup.set(groupTag, []);
|
||||
}
|
||||
this.data.modelsByGroup.get(groupTag).push(modelTag);
|
||||
});
|
||||
},
|
||||
|
||||
// Event listeners for user interactions
|
||||
bindEvents() {
|
||||
const handleInteraction = (event) => {
|
||||
const target = event.target.closest(`[${this.ATTRIBUTES.PARAM_KEY}]`);
|
||||
if (!target) return;
|
||||
|
||||
const paramType = target.getAttribute(this.ATTRIBUTES.PARAM_KEY);
|
||||
const paramValue = target.getAttribute(this.ATTRIBUTES.PARAM_VALUE);
|
||||
|
||||
if (paramType === "model") {
|
||||
const groupTag = target.getAttribute(this.ATTRIBUTES.PARAM_GROUP);
|
||||
if (groupTag) this.updateUI(paramValue, groupTag);
|
||||
} else if (paramType === "model-group") {
|
||||
const firstModelInGroup = this.data.modelsByGroup.get(paramValue)
|
||||
?.[0];
|
||||
if (firstModelInGroup) this.updateUI(firstModelInGroup, paramValue);
|
||||
}
|
||||
};
|
||||
|
||||
this.elements.container.addEventListener("click", handleInteraction);
|
||||
this.elements.container.addEventListener("keydown", (event) => {
|
||||
if (event.key === "Enter" || event.key === " ") {
|
||||
event.preventDefault();
|
||||
handleInteraction(event);
|
||||
}
|
||||
});
|
||||
},
|
||||
|
||||
// Update the page based on the selected model
|
||||
updateUI(modelTag, groupTag) {
|
||||
const validModel = this.setModelSearchParam(modelTag);
|
||||
|
||||
// Update model group buttons
|
||||
this.elements.modelGroups.forEach((group) => {
|
||||
const isSelected =
|
||||
group.getAttribute(this.ATTRIBUTES.PARAM_VALUE) === groupTag;
|
||||
group.setAttribute(
|
||||
this.ATTRIBUTES.PARAM_STATE,
|
||||
isSelected ? "selected" : "",
|
||||
);
|
||||
group.setAttribute("aria-selected", isSelected.toString());
|
||||
});
|
||||
|
||||
// Update model buttons
|
||||
this.elements.modelParams.forEach((model) => {
|
||||
const isInSelectedGroup =
|
||||
model.getAttribute(this.ATTRIBUTES.PARAM_GROUP) === groupTag;
|
||||
const isSelectedModel =
|
||||
model.getAttribute(this.ATTRIBUTES.PARAM_VALUE) === validModel;
|
||||
|
||||
model.classList.toggle(this.CSS_CLASSES.HIDDEN, !isInSelectedGroup);
|
||||
model.setAttribute(
|
||||
this.ATTRIBUTES.PARAM_STATE,
|
||||
isSelectedModel ? "selected" : "",
|
||||
);
|
||||
model.setAttribute("aria-selected", isSelectedModel.toString());
|
||||
});
|
||||
|
||||
// Update visibility of doc sections
|
||||
const formattedClass = this.data.formattedModelClassMap.get(validModel);
|
||||
if (formattedClass) {
|
||||
this.elements.modelDocs.forEach((doc) => {
|
||||
doc.classList.toggle(
|
||||
this.CSS_CLASSES.HIDDEN,
|
||||
!doc.classList.contains(formattedClass),
|
||||
);
|
||||
});
|
||||
}
|
||||
},
|
||||
|
||||
// Get the current model from the URL search parameters.
|
||||
getModelSearchParam() {
|
||||
return new URLSearchParams(location.search).get("model");
|
||||
},
|
||||
|
||||
// Set the model in the URL search parameters, or fallback to the first available one.
|
||||
setModelSearchParam(modelTag) {
|
||||
const defaultModel = [...this.data.availableModels][0];
|
||||
const model = this.data.availableModels.has(modelTag)
|
||||
? modelTag
|
||||
: defaultModel;
|
||||
|
||||
const searchParams = new URLSearchParams(location.search);
|
||||
searchParams.set("model", model);
|
||||
|
||||
history.replaceState(
|
||||
{},
|
||||
"",
|
||||
`${location.pathname}?${searchParams.toString()}`,
|
||||
);
|
||||
return model;
|
||||
},
|
||||
|
||||
// Initialize the UI state based on the current URL search parameter or default values.
|
||||
initializeState() {
|
||||
const currentModel = this.getModelSearchParam();
|
||||
const validModel = this.setModelSearchParam(currentModel);
|
||||
|
||||
const initialGroup = this.data.modelToGroupMap.get(validModel) ??
|
||||
[...this.data.modelsByGroup.keys()][0];
|
||||
|
||||
if (initialGroup) {
|
||||
this.updateUI(validModel, initialGroup);
|
||||
}
|
||||
},
|
||||
};
|
||||
|
||||
ModelPicker.init();
|
||||
});
|
||||
266
docs/extension/version-ref.py
Normal file
266
docs/extension/version-ref.py
Normal file
@@ -0,0 +1,266 @@
|
||||
from docutils import nodes
|
||||
from docutils.parsers.rst import Directive
|
||||
from sphinx.util import logging
|
||||
import csv
|
||||
from io import StringIO
|
||||
import re
|
||||
import shlex
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class VersionReference(nodes.Inline, nodes.TextElement):
|
||||
"""Represents an inline version reference."""
|
||||
pass
|
||||
|
||||
class VersionSetDirective(Directive):
|
||||
"""Directive for setting version references within a page scope."""
|
||||
required_arguments = 2 # name and value
|
||||
optional_arguments = 0
|
||||
|
||||
def run(self):
|
||||
env = self.state.document.settings.env
|
||||
if not hasattr(env, 'doc_version_refs'):
|
||||
env.doc_version_refs = {}
|
||||
current_doc = env.docname
|
||||
if current_doc not in env.doc_version_refs:
|
||||
env.doc_version_refs[current_doc] = {}
|
||||
|
||||
name, value = self.arguments
|
||||
if name.lower() == 'latest':
|
||||
logger.warning('Cannot override the "latest" keyword with version-set')
|
||||
return []
|
||||
|
||||
# Handle 'latest' value by getting the actual version
|
||||
if value.lower() == 'latest':
|
||||
data = getattr(env, 'compatibility_matrix', None)
|
||||
if data:
|
||||
latest_version = get_latest_rocm_version(data)
|
||||
if latest_version:
|
||||
value = latest_version
|
||||
|
||||
env.doc_version_refs[current_doc][name] = value
|
||||
return []
|
||||
|
||||
def clean_library_name(name):
|
||||
"""Extract library name from RST formatting."""
|
||||
# Handle :doc: format
|
||||
doc_match = re.search(r':doc:`([^<]+)(?:\s+<[^>]+>)?`', name)
|
||||
if doc_match:
|
||||
return doc_match.group(1).strip()
|
||||
|
||||
# Handle other link formats
|
||||
link_match = re.search(r'`([^<]+)(?:\s+<[^>]+>)?`_?', name)
|
||||
if link_match:
|
||||
return link_match.group(1).strip()
|
||||
|
||||
return name.strip()
|
||||
|
||||
def get_latest_rocm_version(data):
|
||||
"""Get the latest ROCm version from the matrix headers."""
|
||||
if not data or len(data) == 0:
|
||||
return None
|
||||
|
||||
# Get all column names except 'ROCm Version'
|
||||
columns = [col for col in data[0].keys() if col != 'ROCm Version']
|
||||
# Return the first column name (assumed to be the latest version)
|
||||
return columns[0] if columns else None
|
||||
|
||||
def version_role(name, rawtext, text, lineno, inliner, options={}, content=[]):
|
||||
"""
|
||||
Role function to print version value.
|
||||
Usage: :version:`version_name`
|
||||
"""
|
||||
try:
|
||||
version_name = text.strip()
|
||||
env = inliner.document.settings.env
|
||||
|
||||
if hasattr(env, 'doc_version_refs'):
|
||||
current_doc = env.docname
|
||||
if current_doc in env.doc_version_refs:
|
||||
doc_refs = env.doc_version_refs[current_doc]
|
||||
if version_name in doc_refs:
|
||||
version = doc_refs[version_name]
|
||||
node = nodes.Text(version)
|
||||
return [node], []
|
||||
|
||||
msg = inliner.reporter.warning(
|
||||
f'No version defined for name {version_name}',
|
||||
line=lineno
|
||||
)
|
||||
return [], [msg]
|
||||
|
||||
except Exception as e:
|
||||
msg = inliner.reporter.error(
|
||||
f'Error looking up version: {str(e)}',
|
||||
line=lineno
|
||||
)
|
||||
prb = inliner.problematic(rawtext, rawtext, msg)
|
||||
return [prb], [msg]
|
||||
|
||||
def version_ref_role(name, rawtext, text, lineno, inliner, options={}, content=[]):
|
||||
"""
|
||||
Role function for version references.
|
||||
Usage: :version-ref:`library_name release`
|
||||
:version-ref:`"library name" release`
|
||||
:version-ref:`library_name latest`
|
||||
:version-ref:`rocm latest`
|
||||
"""
|
||||
try:
|
||||
# Parse the text - handle both quoted and unquoted formats
|
||||
if '"' in text:
|
||||
parts = shlex.split(text)
|
||||
else:
|
||||
parts = text.split()
|
||||
|
||||
if len(parts) != 2:
|
||||
msg = inliner.reporter.error(
|
||||
'Version reference must be in format "library_name release" or "\\"library name\\" release"',
|
||||
line=lineno
|
||||
)
|
||||
prb = inliner.problematic(rawtext, rawtext, msg)
|
||||
return [prb], [msg]
|
||||
|
||||
library_name, release = parts
|
||||
env = inliner.document.settings.env
|
||||
|
||||
# Check if release is a version reference in current document
|
||||
if hasattr(env, 'doc_version_refs'):
|
||||
current_doc = env.docname
|
||||
if current_doc in env.doc_version_refs:
|
||||
doc_refs = env.doc_version_refs[current_doc]
|
||||
if release in doc_refs:
|
||||
release = doc_refs[release]
|
||||
|
||||
# Handle special case for "rocm latest"
|
||||
if library_name.lower() == 'rocm' and release.lower() == 'latest':
|
||||
data = getattr(env, 'compatibility_matrix', None)
|
||||
if not data:
|
||||
raise ValueError("Compatibility matrix not found in environment")
|
||||
|
||||
latest_version = get_latest_rocm_version(data)
|
||||
if latest_version:
|
||||
node = VersionReference()
|
||||
node += nodes.Text(latest_version)
|
||||
return [node], []
|
||||
else:
|
||||
msg = inliner.reporter.warning(
|
||||
'No ROCm versions found in compatibility matrix',
|
||||
line=lineno
|
||||
)
|
||||
return [], [msg]
|
||||
|
||||
version = lookup_version(inliner, library_name, release)
|
||||
|
||||
if version:
|
||||
node = VersionReference()
|
||||
node += nodes.Text(version)
|
||||
return [node], []
|
||||
else:
|
||||
msg = inliner.reporter.warning(
|
||||
f'No version found for library {library_name} in release {release}',
|
||||
line=lineno
|
||||
)
|
||||
return [], [msg]
|
||||
|
||||
except Exception as e:
|
||||
msg = inliner.reporter.error(
|
||||
f'Error looking up version: {str(e)}',
|
||||
line=lineno
|
||||
)
|
||||
prb = inliner.problematic(rawtext, rawtext, msg)
|
||||
return [prb], [msg]
|
||||
|
||||
def lookup_version(inliner, library_name, release):
|
||||
"""Look up the version in the compatibility matrix."""
|
||||
env = inliner.document.settings.env
|
||||
data = getattr(env, 'compatibility_matrix', None)
|
||||
|
||||
if not data:
|
||||
raise ValueError("Compatibility matrix not found in environment")
|
||||
|
||||
# Handle the 'latest' keyword
|
||||
if release.lower() == 'latest':
|
||||
latest_version = get_latest_rocm_version(data)
|
||||
if not latest_version:
|
||||
return None
|
||||
release = latest_version
|
||||
|
||||
# For ROCm, check if the version exists in column headers
|
||||
if library_name.lower() == 'rocm':
|
||||
columns = [col for col in data[0].keys() if col != 'ROCm Version']
|
||||
if release in columns:
|
||||
return release
|
||||
return None
|
||||
|
||||
# Find the library version
|
||||
for row in data:
|
||||
row_lib_name = clean_library_name(row['ROCm Version'])
|
||||
if row_lib_name == library_name:
|
||||
# Get the version, removing any whitespace
|
||||
version = row.get(release, '').strip()
|
||||
if version:
|
||||
return version
|
||||
|
||||
# If not found, try a case-insensitive search
|
||||
for row in data:
|
||||
row_lib_name = clean_library_name(row['ROCm Version'])
|
||||
if row_lib_name.lower() == library_name.lower():
|
||||
version = row.get(release, '').strip()
|
||||
if version:
|
||||
return version
|
||||
|
||||
return None
|
||||
|
||||
def visit_version_reference(self, node):
|
||||
self.body.append(f'<span class="version-reference">')
|
||||
|
||||
def depart_version_reference(self, node):
|
||||
self.body.append('</span>')
|
||||
|
||||
def load_compatibility_matrix(app):
|
||||
"""Load the compatibility matrix content from CSV."""
|
||||
if not app.config.compatibility_matrix_file:
|
||||
logger.warning('No compatibility matrix file configured')
|
||||
return
|
||||
|
||||
try:
|
||||
with open(app.config.compatibility_matrix_file, 'r', encoding='utf-8') as f:
|
||||
reader = csv.DictReader(f)
|
||||
app.env.compatibility_matrix = list(reader)
|
||||
logger.info('Successfully loaded compatibility matrix')
|
||||
|
||||
# Debug: print first few rows with their library names
|
||||
for row in list(app.env.compatibility_matrix)[:5]:
|
||||
if 'ROCm Version' in row:
|
||||
lib_name = clean_library_name(row['ROCm Version'])
|
||||
logger.debug(f"Loaded library: {lib_name}")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f'Error loading compatibility matrix: {str(e)}')
|
||||
|
||||
def purge_version_refs(app, env, docname):
|
||||
"""Remove version references for a document when it is purged"""
|
||||
if hasattr(env, 'doc_version_refs'):
|
||||
if docname in env.doc_version_refs:
|
||||
del env.doc_version_refs[docname]
|
||||
|
||||
def setup(app):
|
||||
app.add_node(VersionReference,
|
||||
html=(visit_version_reference, depart_version_reference))
|
||||
app.add_role('version-ref', version_ref_role)
|
||||
app.add_role('version', version_role)
|
||||
app.add_directive('version-set', VersionSetDirective)
|
||||
|
||||
# Add a config value for the compatibility matrix file path
|
||||
app.add_config_value('compatibility_matrix_file', None, 'env')
|
||||
|
||||
# Connect to the builder-inited event to load the matrix
|
||||
app.connect('builder-inited', load_compatibility_matrix)
|
||||
|
||||
# Connect to env-purge-doc event to clean up document-specific version refs
|
||||
app.connect('env-purge-doc', purge_version_refs)
|
||||
|
||||
return {
|
||||
'parallel_read_safe': True,
|
||||
'parallel_write_safe': True,
|
||||
}
|
||||
@@ -9,422 +9,266 @@ LLM inference performance validation on AMD Instinct MI300X
|
||||
|
||||
.. _vllm-benchmark-unified-docker:
|
||||
|
||||
The `ROCm vLLM Docker <https://hub.docker.com/r/rocm/vllm/tags>`_ image offers
|
||||
a prebuilt, optimized environment for validating large language model (LLM)
|
||||
inference performance on the AMD Instinct™ MI300X accelerator. This ROCm vLLM
|
||||
Docker image integrates vLLM and PyTorch tailored specifically for the MI300X
|
||||
accelerator and includes the following components:
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
|
||||
|
||||
* `ROCm 6.3.1 <https://github.com/ROCm/ROCm>`_
|
||||
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
|
||||
{% set model_groups = data.vllm_benchmark.model_groups %}
|
||||
|
||||
* `vLLM 0.6.6 <https://docs.vllm.ai/en/latest>`_
|
||||
The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers
|
||||
a prebuilt, optimized environment for validating large language model (LLM)
|
||||
inference performance on the AMD Instinct™ MI300X accelerator. This ROCm vLLM
|
||||
Docker image integrates vLLM and PyTorch tailored specifically for the MI300X
|
||||
accelerator and includes the following components:
|
||||
|
||||
* `PyTorch 2.7.0 (2.7.0a0+git3a58512) <https://github.com/pytorch/pytorch>`_
|
||||
* `ROCm {{ unified_docker.rocm_version }} <https://github.com/ROCm/ROCm>`_
|
||||
|
||||
With this Docker image, you can quickly validate the expected inference
|
||||
performance numbers for the MI300X accelerator. This topic also provides tips on
|
||||
optimizing performance with popular AI models. For more information, see the lists of
|
||||
:ref:`available models for MAD-integrated benchmarking <vllm-benchmark-mad-models>`
|
||||
and :ref:`standalone benchmarking <vllm-benchmark-standalone-options>`.
|
||||
* `vLLM {{ unified_docker.vllm_version }} <https://docs.vllm.ai/en/latest>`_
|
||||
|
||||
.. _vllm-benchmark-vllm:
|
||||
* `PyTorch {{ unified_docker.pytorch_version }} <https://github.com/pytorch/pytorch>`_
|
||||
|
||||
.. note::
|
||||
With this Docker image, you can quickly validate the expected inference
|
||||
performance numbers for the MI300X accelerator. This topic also provides tips on
|
||||
optimizing performance with popular AI models.
|
||||
|
||||
vLLM is a toolkit and library for LLM inference and serving. AMD implements
|
||||
high-performance custom kernels and modules in vLLM to enhance performance.
|
||||
See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
|
||||
more information.
|
||||
.. _vllm-benchmark-available-models:
|
||||
|
||||
Getting started
|
||||
===============
|
||||
Available models
|
||||
================
|
||||
|
||||
Use the following procedures to reproduce the benchmark results on an
|
||||
MI300X accelerator with the prebuilt vLLM Docker image.
|
||||
.. raw:: html
|
||||
|
||||
.. _vllm-benchmark-get-started:
|
||||
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||
<div class="row">
|
||||
<div class="col-2 me-2 model-param-head">Model</div>
|
||||
<div class="row col-10">
|
||||
{% for model_group in model_groups %}
|
||||
<div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
1. Disable NUMA auto-balancing.
|
||||
<div class="row mt-1">
|
||||
<div class="col-2 me-2 model-param-head">Model variant</div>
|
||||
<div class="row col-10">
|
||||
{% for model_group in model_groups %}
|
||||
{% set models = model_group.models %}
|
||||
{% for model in models %}
|
||||
{% if models|length % 3 == 0 %}
|
||||
<div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% else %}
|
||||
<div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
|
||||
might hang until the periodic balancing is finalized. For more information,
|
||||
see :ref:`AMD Instinct MI300X system optimization <mi300x-disable-numa>`.
|
||||
.. _vllm-benchmark-vllm:
|
||||
|
||||
.. code-block:: shell
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
# disable automatic NUMA balancing
|
||||
sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
|
||||
# check if NUMA balancing is disabled (returns 0 if disabled)
|
||||
cat /proc/sys/kernel/numa_balancing
|
||||
0
|
||||
.. container:: model-doc {{model.mad_tag}}
|
||||
|
||||
2. Download the :ref:`ROCm vLLM Docker image <vllm-benchmark-unified-docker>`.
|
||||
.. note::
|
||||
|
||||
Use the following command to pull the Docker image from Docker Hub.
|
||||
See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
|
||||
Some models require access authorization prior to use via an external license agreement through a third party.
|
||||
|
||||
.. code-block:: shell
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
docker pull rocm/vllm:rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6
|
||||
|
||||
Once the setup is complete, choose between two options to reproduce the
|
||||
benchmark results:
|
||||
.. note::
|
||||
|
||||
- :ref:`MAD-integrated benchmarking <vllm-benchmark-mad>`
|
||||
vLLM is a toolkit and library for LLM inference and serving. AMD implements
|
||||
high-performance custom kernels and modules in vLLM to enhance performance.
|
||||
See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
|
||||
more information.
|
||||
|
||||
- :ref:`Standalone benchmarking <vllm-benchmark-standalone>`
|
||||
Getting started
|
||||
===============
|
||||
|
||||
.. _vllm-benchmark-mad:
|
||||
Use the following procedures to reproduce the benchmark results on an
|
||||
MI300X accelerator with the prebuilt vLLM Docker image.
|
||||
|
||||
MAD-integrated benchmarking
|
||||
===========================
|
||||
.. _vllm-benchmark-get-started:
|
||||
|
||||
Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
||||
directory and install the required packages on the host machine.
|
||||
1. Disable NUMA auto-balancing.
|
||||
|
||||
.. code-block:: shell
|
||||
To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
|
||||
might hang until the periodic balancing is finalized. For more information,
|
||||
see :ref:`AMD Instinct MI300X system optimization <mi300x-disable-numa>`.
|
||||
|
||||
git clone https://github.com/ROCm/MAD
|
||||
cd MAD
|
||||
pip install -r requirements.txt
|
||||
.. code-block:: shell
|
||||
|
||||
Use this command to run a performance benchmark test of the Llama 3.1 8B model
|
||||
on one GPU with ``float16`` data type in the host machine.
|
||||
# disable automatic NUMA balancing
|
||||
sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
|
||||
# check if NUMA balancing is disabled (returns 0 if disabled)
|
||||
cat /proc/sys/kernel/numa_balancing
|
||||
0
|
||||
|
||||
.. code-block:: shell
|
||||
2. Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.
|
||||
|
||||
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
|
||||
python3 tools/run_models.py --tags pyt_vllm_llama-3.1-8b --keep-model-dir --live-output --timeout 28800
|
||||
Use the following command to pull the Docker image from Docker Hub.
|
||||
|
||||
ROCm MAD launches a Docker container with the name
|
||||
``container_ci-pyt_vllm_llama-3.1-8b``. The latency and throughput reports of the
|
||||
model are collected in the following path: ``~/MAD/reports_float16/``.
|
||||
.. code-block:: shell
|
||||
|
||||
Although the following models are preconfigured to collect latency and
|
||||
throughput performance data, you can also change the benchmarking parameters.
|
||||
Refer to the :ref:`Standalone benchmarking <vllm-benchmark-standalone>` section.
|
||||
docker pull {{ unified_docker.pull_tag }}
|
||||
|
||||
.. _vllm-benchmark-mad-models:
|
||||
Benchmarking
|
||||
============
|
||||
|
||||
Available models
|
||||
----------------
|
||||
Once the setup is complete, choose between two options to reproduce the
|
||||
benchmark results:
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:widths: 2, 3
|
||||
.. _vllm-benchmark-mad:
|
||||
|
||||
* - Model name
|
||||
- Tag
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
* - `Llama 3.1 8B <https://huggingface.co/meta-llama/Llama-3.1-8B>`_
|
||||
- ``pyt_vllm_llama-3.1-8b``
|
||||
.. container:: model-doc {{model.mad_tag}}
|
||||
|
||||
* - `Llama 3.1 70B <https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct>`_
|
||||
- ``pyt_vllm_llama-3.1-70b``
|
||||
.. tab-set::
|
||||
|
||||
* - `Llama 3.1 405B <https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct>`_
|
||||
- ``pyt_vllm_llama-3.1-405b``
|
||||
.. tab-item:: MAD-integrated benchmarking
|
||||
|
||||
* - `Llama 3.2 11B Vision <https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct>`_
|
||||
- ``pyt_vllm_llama-3.2-11b-vision-instruct``
|
||||
Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
||||
directory and install the required packages on the host machine.
|
||||
|
||||
* - `Llama 2 7B <https://huggingface.co/meta-llama/Llama-2-7b-chat-hf>`_
|
||||
- ``pyt_vllm_llama-2-7b``
|
||||
.. code-block:: shell
|
||||
|
||||
* - `Llama 2 70B <https://huggingface.co/meta-llama/Llama-2-70b-chat-hf>`_
|
||||
- ``pyt_vllm_llama-2-70b``
|
||||
git clone https://github.com/ROCm/MAD
|
||||
cd MAD
|
||||
pip install -r requirements.txt
|
||||
|
||||
* - `Mixtral MoE 8x7B <https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1>`_
|
||||
- ``pyt_vllm_mixtral-8x7b``
|
||||
Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
|
||||
using one GPU with the ``{{model.precision}}`` data type on the host machine.
|
||||
|
||||
* - `Mixtral MoE 8x22B <https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1>`_
|
||||
- ``pyt_vllm_mixtral-8x22b``
|
||||
.. code-block:: shell
|
||||
|
||||
* - `Mistral 7B <https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3>`_
|
||||
- ``pyt_vllm_mistral-7b``
|
||||
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
|
||||
python3 tools/run_models.py --tags {{model.mad_tag}} --keep-model-dir --live-output --timeout 28800
|
||||
|
||||
* - `Qwen2 7B <https://huggingface.co/Qwen/Qwen2-7B-Instruct>`_
|
||||
- ``pyt_vllm_qwen2-7b``
|
||||
MAD launches a Docker container with the name
|
||||
``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
|
||||
model are collected in the following path: ``~/MAD/reports_{{model.precision}}/``.
|
||||
|
||||
* - `Qwen2 72B <https://huggingface.co/Qwen/Qwen2-72B-Instruct>`_
|
||||
- ``pyt_vllm_qwen2-72b``
|
||||
Although the :ref:`available models <vllm-benchmark-available-models>` are preconfigured
|
||||
to collect latency and throughput performance data, you can also change the benchmarking
|
||||
parameters. See the standalone benchmarking tab for more information.
|
||||
|
||||
* - `JAIS 13B <https://huggingface.co/core42/jais-13b-chat>`_
|
||||
- ``pyt_vllm_jais-13b``
|
||||
.. tab-item:: Standalone benchmarking
|
||||
|
||||
* - `JAIS 30B <https://huggingface.co/core42/jais-30b-chat-v3>`_
|
||||
- ``pyt_vllm_jais-30b``
|
||||
Run the vLLM benchmark tool independently by starting the
|
||||
`Docker container <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6/images/sha256-9a12ef62bbbeb5a4c30a01f702c8e025061f575aa129f291a49fbd02d6b4d6c9>`_
|
||||
as shown in the following snippet.
|
||||
|
||||
* - `DBRX Instruct <https://huggingface.co/databricks/dbrx-instruct>`_
|
||||
- ``pyt_vllm_dbrx-instruct``
|
||||
.. code-block::
|
||||
|
||||
* - `Gemma 2 27B <https://huggingface.co/google/gemma-2-27b>`_
|
||||
- ``pyt_vllm_gemma-2-27b``
|
||||
docker pull rocm/vllm:rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6
|
||||
docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 16G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --cap-add=SYS_PTRACE -v $(pwd):/workspace --env HUGGINGFACE_HUB_CACHE=/workspace --name vllm_v0.6.6 rocm/vllm:rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6
|
||||
|
||||
* - `C4AI Command R+ 08-2024 <https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024>`_
|
||||
- ``pyt_vllm_c4ai-command-r-plus-08-2024``
|
||||
In the Docker container, clone the ROCm MAD repository and navigate to the
|
||||
benchmark scripts directory at ``~/MAD/scripts/vllm``.
|
||||
|
||||
* - `DeepSeek MoE 16B <https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat>`_
|
||||
- ``pyt_vllm_deepseek-moe-16b-chat``
|
||||
.. code-block::
|
||||
|
||||
* - `Llama 3.1 70B FP8 <https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV>`_
|
||||
- ``pyt_vllm_llama-3.1-70b_fp8``
|
||||
git clone https://github.com/ROCm/MAD
|
||||
cd MAD/scripts/vllm
|
||||
|
||||
* - `Llama 3.1 405B FP8 <https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV>`_
|
||||
- ``pyt_vllm_llama-3.1-405b_fp8``
|
||||
To start the benchmark, use the following command with the appropriate options.
|
||||
|
||||
* - `Mixtral MoE 8x7B FP8 <https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV>`_
|
||||
- ``pyt_vllm_mixtral-8x7b_fp8``
|
||||
.. code-block::
|
||||
|
||||
* - `Mixtral MoE 8x22B FP8 <https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV>`_
|
||||
- ``pyt_vllm_mixtral-8x22b_fp8``
|
||||
./vllm_benchmark_report.sh -s $test_option -m {{model.model_repo}} -g $num_gpu -d {{model.precision}}
|
||||
|
||||
* - `Mistral 7B FP8 <https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV>`_
|
||||
- ``pyt_vllm_mistral-7b_fp8``
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:align: center
|
||||
|
||||
* - `DBRX Instruct FP8 <https://huggingface.co/amd/dbrx-instruct-FP8-KV>`_
|
||||
- ``pyt_vllm_dbrx_fp8``
|
||||
* - Name
|
||||
- Options
|
||||
- Description
|
||||
|
||||
* - `C4AI Command R+ 08-2024 FP8 <https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV>`_
|
||||
- ``pyt_vllm_command-r-plus_fp8``
|
||||
* - ``$test_option``
|
||||
- latency
|
||||
- Measure decoding token latency
|
||||
|
||||
.. _vllm-benchmark-standalone:
|
||||
* -
|
||||
- throughput
|
||||
- Measure token generation throughput
|
||||
|
||||
Standalone benchmarking
|
||||
=======================
|
||||
* -
|
||||
- all
|
||||
- Measure both throughput and latency
|
||||
|
||||
You can run the vLLM benchmark tool independently by starting the
|
||||
`Docker container <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6/images/sha256-9a12ef62bbbeb5a4c30a01f702c8e025061f575aa129f291a49fbd02d6b4d6c9>`_
|
||||
as shown in the following snippet.
|
||||
* - ``$num_gpu``
|
||||
- 1 or 8
|
||||
- Number of GPUs
|
||||
|
||||
.. code-block::
|
||||
* - ``$datatype``
|
||||
- ``float16`` or ``float8``
|
||||
- Data type
|
||||
|
||||
docker pull rocm/vllm:rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6
|
||||
docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 16G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --cap-add=SYS_PTRACE -v $(pwd):/workspace --env HUGGINGFACE_HUB_CACHE=/workspace --name vllm_v0.6.6 rocm/vllm:rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6
|
||||
.. note::
|
||||
|
||||
In the Docker container, clone the ROCm MAD repository and navigate to the
|
||||
benchmark scripts directory at ``~/MAD/scripts/vllm``.
|
||||
The input sequence length, output sequence length, and tensor parallel (TP) are
|
||||
already configured. You don't need to specify them with this script.
|
||||
|
||||
.. code-block::
|
||||
.. note::
|
||||
|
||||
git clone https://github.com/ROCm/MAD
|
||||
cd MAD/scripts/vllm
|
||||
If you encounter the following error, pass your access-authorized Hugging
|
||||
Face token to the gated models.
|
||||
|
||||
Command
|
||||
-------
|
||||
.. code-block::
|
||||
|
||||
To start the benchmark, use the following command with the appropriate options.
|
||||
See :ref:`Options <vllm-benchmark-standalone-options>` for the list of
|
||||
options and their descriptions.
|
||||
OSError: You are trying to access a gated repo.
|
||||
|
||||
.. code-block:: shell
|
||||
# pass your HF_TOKEN
|
||||
export HF_TOKEN=$your_personal_hf_token
|
||||
|
||||
./vllm_benchmark_report.sh -s $test_option -m $model_repo -g $num_gpu -d $datatype
|
||||
Here are some examples of running the benchmark with various options.
|
||||
|
||||
See the :ref:`examples <vllm-benchmark-run-benchmark>` for more information.
|
||||
* Latency benchmark
|
||||
|
||||
.. note::
|
||||
Use this command to benchmark the latency of the {{model.model}} model on eight GPUs with the ``{{model.precision}}`` data type.
|
||||
|
||||
The input sequence length, output sequence length, and tensor parallel (TP) are
|
||||
already configured. You don't need to specify them with this script.
|
||||
.. code-block::
|
||||
|
||||
.. note::
|
||||
./vllm_benchmark_report.sh -s latency -m {{model.model_repo}} -g 8 -d {{model.precision}}
|
||||
|
||||
If you encounter the following error, pass your access-authorized Hugging
|
||||
Face token to the gated models.
|
||||
Find the latency report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_latency_report.csv``.
|
||||
|
||||
.. code-block:: shell
|
||||
* Throughput benchmark
|
||||
|
||||
OSError: You are trying to access a gated repo.
|
||||
Use this command to throughput the latency of the {{model.model}} model on eight GPUs with the ``{{model.precision}}`` data type.
|
||||
|
||||
# pass your HF_TOKEN
|
||||
export HF_TOKEN=$your_personal_hf_token
|
||||
.. code-block:: shell
|
||||
|
||||
.. _vllm-benchmark-standalone-options:
|
||||
./vllm_benchmark_report.sh -s latency -m {{model.model_repo}} -g 8 -d {{model.precision}}
|
||||
|
||||
Options and available models
|
||||
----------------------------
|
||||
Find the throughput report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_throughput_report.csv``.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:align: center
|
||||
.. raw:: html
|
||||
|
||||
* - Name
|
||||
- Options
|
||||
- Description
|
||||
<style>
|
||||
mjx-container[jax="CHTML"][display="true"] {
|
||||
text-align: left;
|
||||
margin: 0;
|
||||
}
|
||||
</style>
|
||||
|
||||
* - ``$test_option``
|
||||
- latency
|
||||
- Measure decoding token latency
|
||||
.. note::
|
||||
|
||||
* -
|
||||
- throughput
|
||||
- Measure token generation throughput
|
||||
Throughput is calculated as:
|
||||
|
||||
* -
|
||||
- all
|
||||
- Measure both throughput and latency
|
||||
- .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
|
||||
|
||||
* - ``$model_repo``
|
||||
- ``meta-llama/Llama-3.1-8B-Instruct``
|
||||
- `Llama 3.1 8B <https://huggingface.co/meta-llama/Llama-3.1-8B>`_
|
||||
|
||||
* - (``float16``)
|
||||
- ``meta-llama/Llama-3.1-70B-Instruct``
|
||||
- `Llama 3.1 70B <https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct>`_
|
||||
|
||||
* -
|
||||
- ``meta-llama/Llama-3.1-405B-Instruct``
|
||||
- `Llama 3.1 405B <https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct>`_
|
||||
|
||||
* -
|
||||
- ``meta-llama/Llama-3.2-11B-Vision-Instruct``
|
||||
- `Llama 3.2 11B Vision <https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct>`_
|
||||
|
||||
* -
|
||||
- ``meta-llama/Llama-2-7b-chat-hf``
|
||||
- `Llama 2 7B <https://huggingface.co/meta-llama/Llama-2-7b-chat-hf>`_
|
||||
|
||||
* -
|
||||
- ``meta-llama/Llama-2-70b-chat-hf``
|
||||
- `Llama 2 7B <https://huggingface.co/meta-llama/Llama-2-70b-chat-hf>`_
|
||||
|
||||
* -
|
||||
- ``mistralai/Mixtral-8x7B-Instruct-v0.1``
|
||||
- `Mixtral MoE 8x7B <https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1>`_
|
||||
|
||||
* -
|
||||
- ``mistralai/Mixtral-8x22B-Instruct-v0.1``
|
||||
- `Mixtral MoE 8x22B <https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1>`_
|
||||
|
||||
* -
|
||||
- ``mistralai/Mistral-7B-Instruct-v0.3``
|
||||
- `Mistral 7B <https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3>`_
|
||||
|
||||
* -
|
||||
- ``Qwen/Qwen2-7B-Instruct``
|
||||
- `Qwen2 7B <https://huggingface.co/Qwen/Qwen2-7B-Instruct>`_
|
||||
|
||||
* -
|
||||
- ``Qwen/Qwen2-72B-Instruct``
|
||||
- `Qwen2 72B <https://huggingface.co/Qwen/Qwen2-72B-Instruct>`_
|
||||
|
||||
* -
|
||||
- ``core42/jais-13b-chat``
|
||||
- `JAIS 13B <https://huggingface.co/core42/jais-13b-chat>`_
|
||||
|
||||
* -
|
||||
- ``core42/jais-30b-chat-v3``
|
||||
- `JAIS 30B <https://huggingface.co/core42/jais-30b-chat-v3>`_
|
||||
|
||||
* -
|
||||
- ``databricks/dbrx-instruct``
|
||||
- `DBRX Instruct <https://huggingface.co/databricks/dbrx-instruct>`_
|
||||
|
||||
* -
|
||||
- ``google/gemma-2-27b``
|
||||
- `Gemma 2 27B <https://huggingface.co/google/gemma-2-27b>`_
|
||||
|
||||
* -
|
||||
- ``CohereForAI/c4ai-command-r-plus-08-2024``
|
||||
- `C4AI Command R+ 08-2024 <https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024>`_
|
||||
|
||||
* -
|
||||
- ``deepseek-ai/deepseek-moe-16b-chat``
|
||||
- `DeepSeek MoE 16B <https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat>`_
|
||||
|
||||
* - ``$model_repo``
|
||||
- ``amd/Llama-3.1-70B-Instruct-FP8-KV``
|
||||
- `Llama 3.1 70B FP8 <https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV>`_
|
||||
|
||||
* - (``float8``)
|
||||
- ``amd/Llama-3.1-405B-Instruct-FP8-KV``
|
||||
- `Llama 3.1 405B FP8 <https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV>`_
|
||||
|
||||
* -
|
||||
- ``amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV``
|
||||
- `Mixtral MoE 8x7B FP8 <https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV>`_
|
||||
|
||||
* -
|
||||
- ``amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV``
|
||||
- `Mixtral MoE 8x22B FP8 <https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV>`_
|
||||
|
||||
* -
|
||||
- ``amd/Mistral-7B-v0.1-FP8-KV``
|
||||
- `Mistral 7B FP8 <https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV>`_
|
||||
|
||||
* -
|
||||
- ``amd/dbrx-instruct-FP8-KV``
|
||||
- `DBRX Instruct FP8 <https://huggingface.co/amd/dbrx-instruct-FP8-KV>`_
|
||||
|
||||
* -
|
||||
- ``amd/c4ai-command-r-plus-FP8-KV``
|
||||
- `C4AI Command R+ 08-2024 FP8 <https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV>`_
|
||||
|
||||
* - ``$num_gpu``
|
||||
- 1 or 8
|
||||
- Number of GPUs
|
||||
|
||||
* - ``$datatype``
|
||||
- ``float16`` or ``float8``
|
||||
- Data type
|
||||
|
||||
.. _vllm-benchmark-run-benchmark:
|
||||
|
||||
Running the benchmark on the MI300X accelerator
|
||||
-----------------------------------------------
|
||||
|
||||
Here are some examples of running the benchmark with various options.
|
||||
See :ref:`Options <vllm-benchmark-standalone-options>` for the list of
|
||||
options and their descriptions.
|
||||
|
||||
Example 1: latency benchmark
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Use this command to benchmark the latency of the Llama 3.1 70B model on eight GPUs with the ``float16`` and ``float8`` data types.
|
||||
|
||||
.. code-block::
|
||||
|
||||
./vllm_benchmark_report.sh -s latency -m meta-llama/Llama-3.1-70B-Instruct -g 8 -d float16
|
||||
./vllm_benchmark_report.sh -s latency -m amd/Llama-3.1-70B-Instruct-FP8-KV -g 8 -d float8
|
||||
|
||||
Find the latency reports at:
|
||||
|
||||
- ``./reports_float16/summary/Llama-3.1-70B-Instruct_latency_report.csv``
|
||||
|
||||
- ``./reports_float8/summary/Llama-3.1-70B-Instruct-FP8-KV_latency_report.csv``
|
||||
|
||||
Example 2: throughput benchmark
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Use this command to benchmark the throughput of the Llama 3.1 70B model on eight GPUs with the ``float16`` and ``float8`` data types.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./vllm_benchmark_report.sh -s throughput -m meta-llama/Llama-3.1-70B-Instruct -g 8 -d float16
|
||||
./vllm_benchmark_report.sh -s throughput -m amd/Llama-3.1-70B-Instruct-FP8-KV -g 8 -d float8
|
||||
|
||||
Find the throughput reports at:
|
||||
|
||||
- ``./reports_float16/summary/Llama-3.1-70B-Instruct_throughput_report.csv``
|
||||
|
||||
- ``./reports_float8/summary/Llama-3.1-70B-Instruct-FP8-KV_throughput_report.csv``
|
||||
|
||||
.. raw:: html
|
||||
|
||||
<style>
|
||||
mjx-container[jax="CHTML"][display="true"] {
|
||||
text-align: left;
|
||||
margin: 0;
|
||||
}
|
||||
</style>
|
||||
|
||||
.. note::
|
||||
|
||||
Throughput is calculated as:
|
||||
|
||||
- .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
|
||||
|
||||
- .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
|
||||
- .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
Further reading
|
||||
===============
|
||||
@@ -446,33 +290,3 @@ Further reading
|
||||
|
||||
- To learn how to fine-tune LLMs, see
|
||||
:doc:`Fine-tuning LLMs <../fine-tuning/index>`.
|
||||
|
||||
Previous versions
|
||||
=================
|
||||
|
||||
This table lists previous versions of the ROCm vLLM Docker image for inference
|
||||
performance validation. For detailed information about available models for
|
||||
benchmarking, see the version-specific documentation.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:stub-columns: 1
|
||||
|
||||
* - ROCm version
|
||||
- vLLM version
|
||||
- PyTorch version
|
||||
- Resources
|
||||
|
||||
* - 6.2.1
|
||||
- 0.6.4
|
||||
- 2.5.0
|
||||
-
|
||||
* `Documentation <https://rocm.docs.amd.com/en/docs-6.3.0/how-to/performance-validation/mi300x/vllm-benchmark.html>`_
|
||||
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4/images/sha256-ccbb74cc9e7adecb8f7bdab9555f7ac6fc73adb580836c2a35ca96ff471890d8>`_
|
||||
|
||||
* - 6.2.0
|
||||
- 0.4.3
|
||||
- 2.4.0
|
||||
-
|
||||
* `Documentation <https://rocm.docs.amd.com/en/docs-6.2.0/how-to/performance-validation/mi300x/vllm-benchmark.html>`_
|
||||
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.2_mi300_ubuntu22.04_py3.9_vllm_7c5fd50/images/sha256-9e4dd4788a794c3d346d7d0ba452ae5e92d39b8dfac438b2af8efdc7f15d22c0>`_
|
||||
|
||||
@@ -0,0 +1,547 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: How to train a model using Megatron-LM for ROCm.
|
||||
:keywords: ROCm, AI, LLM, train, Megatron-LM, megatron, Llama, tutorial, docker, torch
|
||||
|
||||
******************************************
|
||||
Training a model with Megatron-LM for ROCm
|
||||
******************************************
|
||||
|
||||
The Megatron-LM framework for ROCm is a specialized fork of the robust Megatron-LM,
|
||||
designed to enable efficient training of large-scale language models on AMD
|
||||
GPUs. By leveraging AMD Instinct™ MI300X series accelerators, Megatron-LM delivers
|
||||
enhanced scalability, performance, and resource utilization for AI workloads.
|
||||
It is purpose-built to support models like Llama 2, Llama 3, Llama 3.1, and
|
||||
DeepSeek, enabling developers to train next-generation AI models more
|
||||
efficiently. See the GitHub repository at `<https://github.com/ROCm/Megatron-LM>`__.
|
||||
|
||||
AMD provides a ready-to-use Docker image for MI300X accelerators containing
|
||||
essential components, including PyTorch, ROCm libraries, and Megatron-LM
|
||||
utilities. It contains the following software components to accelerate training
|
||||
workloads:
|
||||
|
||||
+--------------------------+--------------------------------+
|
||||
| Software component | Version |
|
||||
+==========================+================================+
|
||||
| ROCm | 6.3.0 |
|
||||
+--------------------------+--------------------------------+
|
||||
| PyTorch | 2.7.0a0+git637433 |
|
||||
+--------------------------+--------------------------------+
|
||||
| Python | 3.10 |
|
||||
+--------------------------+--------------------------------+
|
||||
| Transformer Engine | 1.11 |
|
||||
+--------------------------+--------------------------------+
|
||||
| Flash Attention | 3.0.0 |
|
||||
+--------------------------+--------------------------------+
|
||||
| hipBLASLt | git258a2162 |
|
||||
+--------------------------+--------------------------------+
|
||||
| Triton | 3.1 |
|
||||
+--------------------------+--------------------------------+
|
||||
|
||||
Supported features and models
|
||||
=============================
|
||||
|
||||
Megatron-LM provides the following key features to train large language models efficiently:
|
||||
|
||||
- Transformer Engine (TE)
|
||||
|
||||
- APEX
|
||||
|
||||
- GEMM tuning
|
||||
|
||||
- Torch.compile
|
||||
|
||||
- 3D parallelism: TP + SP + CP
|
||||
|
||||
- Distributed optimizer
|
||||
|
||||
- Flash Attention (FA) 3
|
||||
|
||||
- Fused kernels
|
||||
|
||||
- Pre-training
|
||||
|
||||
.. _amd-megatron-lm-model-support:
|
||||
|
||||
The following models are pre-optimized for performance on the AMD Instinct MI300X accelerator.
|
||||
|
||||
* Llama 2 7B
|
||||
|
||||
* Llama 2 70B
|
||||
|
||||
* Llama 3 8B
|
||||
|
||||
* Llama 3 70B
|
||||
|
||||
* Llama 3.1 8B
|
||||
|
||||
* Llama 3.1 70B
|
||||
|
||||
* DeepSeek-V2-Lite
|
||||
|
||||
.. note::
|
||||
|
||||
Some models, such as Llama 3, require an external license agreement through
|
||||
a third party (for example, Meta).
|
||||
|
||||
System validation
|
||||
=================
|
||||
|
||||
If you have already validated your system settings, skip this step. Otherwise,
|
||||
complete the :ref:`system validation and optimization steps <train-a-model-system-validation>`
|
||||
to set up your system before starting training.
|
||||
|
||||
Disable NUMA auto-balancing
|
||||
---------------------------
|
||||
|
||||
Generally, application performance can benefit from disabling NUMA auto-balancing. However,
|
||||
it might be detrimental to performance with certain types of workloads.
|
||||
|
||||
Run the command ``cat /proc/sys/kernel/numa_balancing`` to check your current NUMA (Non-Uniform
|
||||
Memory Access) settings. Output ``0`` indicates this setting is disabled. If there is no output or
|
||||
the output is ``1``, run the following command to disable NUMA auto-balancing.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
|
||||
|
||||
See :ref:`mi300x-disable-numa` for more information.
|
||||
|
||||
.. _mi300x-amd-megatron-lm-training:
|
||||
|
||||
Environment setup
|
||||
=================
|
||||
|
||||
The pre-built ROCm Megatron-LM environment allows users to quickly validate system performance, conduct
|
||||
training benchmarks, and achieve superior performance for models like Llama 3.1, Llama 2, and DeepSeek V2.
|
||||
|
||||
Use the following instructions to set up the environment, configure the script to train models, and
|
||||
reproduce the benchmark results on the MI300X accelerators with the AMD Megatron-LM Docker
|
||||
image.
|
||||
|
||||
.. _amd-megatron-lm-requirements:
|
||||
|
||||
Download the Docker image
|
||||
-------------------------
|
||||
|
||||
1. Use the following command to pull the Docker image from Docker Hub.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull rocm/megatron-lm:v25.3
|
||||
|
||||
2. Launch the Docker container.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v $HOME/.ssh:/root/.ssh --shm-size 64G --name megatron_training_env rocm/megatron-lm:v25.3
|
||||
|
||||
3. Use these commands if you exit the ``megatron_training_env`` container and need to return to it.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker start megatron_training_env
|
||||
docker exec -it megatron_training_env bash
|
||||
|
||||
The Docker container includes a pre-installed, verified version of Megatron-LM from the `release branch <https://github.com/ROCm/Megatron-LM/tree/megatron_release_v25.3>`_.
|
||||
|
||||
.. _amd-megatron-lm-environment-setup:
|
||||
|
||||
Configuration scripts
|
||||
---------------------
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: Llama
|
||||
:sync: llama
|
||||
|
||||
If you're working with Llama 2 7B or Llama 2 70 B, use the ``train_llama2.sh`` configuration
|
||||
script in the ``examples/llama`` directory of
|
||||
`<https://github.com/ROCm/Megatron-LM/tree/megatron_release_v25.3/examples/llama>`__.
|
||||
Likewise, if you're working with Llama 3 or Llama 3.1, then use ``train_llama3.sh`` and update
|
||||
the configuration script accordingly.
|
||||
|
||||
.. tab-item:: DeepSeek V2
|
||||
:sync: deepseek
|
||||
|
||||
Use the ``train_deepseek_v2.sh`` configuration script in the ``examples/deepseek_v2``
|
||||
directory of
|
||||
`<https://github.com/ROCm/Megatron-LM/tree/megatron_release_v25.3/examples/deepseek_v2>`__
|
||||
and update the configuration script accordingly.
|
||||
|
||||
Network interface
|
||||
^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: Llama
|
||||
:sync: llama
|
||||
|
||||
To avoid connectivity issues in multi-node deployments, ensure the correct network interface
|
||||
is set in your training scripts.
|
||||
|
||||
1. Run the following command (outside the container) to find the active network interface on your system.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
ip a
|
||||
|
||||
2. Update the ``NCCL_SOCKET_IFNAME`` and ``GLOO_SOCKET_IFNAME`` variables with your system’s network interface. For
|
||||
example:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export NCCL_SOCKET_IFNAME=ens50f0np0
|
||||
|
||||
export GLOO_SOCKET_IFNAME=ens50f0np0
|
||||
|
||||
Dataset options
|
||||
^^^^^^^^^^^^^^^
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: Llama
|
||||
:sync: llama
|
||||
|
||||
You can use either mock data or real data for training.
|
||||
|
||||
* Mock data can be useful for testing and validation. Use the ``MOCK_DATA`` variable to toggle between mock and real data. The default
|
||||
value is ``1`` for enabled.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
MOCK_DATA=1
|
||||
|
||||
* If you're using a real dataset, update the ``DATA_PATH`` variable to point to the location of your dataset.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
MOCK_DATA=0
|
||||
|
||||
DATA_PATH=${DATA_PATH:-"/data/bookcorpus_text_sentence"} # Change to where your dataset is stored
|
||||
|
||||
Ensure that the files are accessible inside the Docker container.
|
||||
|
||||
.. tab-item:: DeepSeek V2
|
||||
:sync: deepseek
|
||||
|
||||
If you don't already have the dataset, download the DeepSeek dataset using the following
|
||||
commands:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
mkdir deepseek-datasets
|
||||
cd deepseek-datasets
|
||||
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/SlimPajama.json
|
||||
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-train.json
|
||||
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-valid.json
|
||||
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/mmap_deepseekv2_datasets_text_document.bin
|
||||
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/mmap_deepseekv2_datasets_text_document.idx
|
||||
|
||||
You can use either mock data or real data for training.
|
||||
|
||||
* Mock data can be useful for testing and validation. Use the ``MOCK_DATA`` variable to toggle between mock and real data. The default
|
||||
value is ``1`` for enabled.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
MOCK_DATA=1
|
||||
|
||||
* If you're using a real dataset, update the ``DATA_DIR`` variable to point to the location of your dataset.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
MOCK_DATA=0
|
||||
|
||||
DATA_DIR="/root/data/deepseek-datasets" # Change to where your dataset is stored
|
||||
|
||||
Ensure that the files are accessible inside the Docker container.
|
||||
|
||||
Tokenizer
|
||||
^^^^^^^^^
|
||||
|
||||
Tokenization is the process of converting raw text into tokens that can be processed by the model. For Llama
|
||||
models, this typically involves sub-word tokenization, where words are broken down into smaller units based on
|
||||
a fixed vocabulary. The tokenizer is trained along with the model on a large corpus of text, and it learns a
|
||||
fixed vocabulary that can represent a wide range of text from different domains. This allows Llama models to
|
||||
handle a variety of input sequences, including unseen words or domain-specific terms.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: Llama
|
||||
:sync: llama
|
||||
|
||||
To train any of the Llama 2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``Llama2Tokenizer``.
|
||||
|
||||
To train any of Llama 3 and Llama 3.1 models that this Docker image supports, use the ``HuggingFaceTokenizer``.
|
||||
Set the Hugging Face model link in the ``TOKENIZER_MODEL`` variable.
|
||||
|
||||
For example, if you're using the Llama 3.1 8B model:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
TOKENIZER_MODEL=meta-llama/Llama-3.1-8B
|
||||
|
||||
.. tab-item:: DeepSeek V2
|
||||
:sync: deepseek
|
||||
|
||||
To train any of the DeepSeek V2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``DeepSeekV2Tokenizer``.
|
||||
|
||||
Multi-node training
|
||||
^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: Llama
|
||||
:sync: llama
|
||||
|
||||
If you're running multi-node training, update the following environment variables. They can
|
||||
also be passed as command line arguments.
|
||||
|
||||
* Change ``localhost`` to the master node's hostname:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
MASTER_ADDR="${MASTER_ADDR:-localhost}"
|
||||
|
||||
* Set the number of nodes you want to train on (for instance, ``2``, ``4``, ``8``):
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
NNODES="${NNODES:-1}"
|
||||
|
||||
* Set the rank of each node (0 for master, 1 for the first worker node, and so on):
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
NODE_RANK="${NODE_RANK:-0}"
|
||||
|
||||
* Set ``DATA_CACHE_PATH`` to a common directory accessible by all the nodes (for example, an
|
||||
NFS directory) for multi-node runs:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
DATA_CACHE_PATH=/root/cache # Set to a common directory for multi-node runs
|
||||
|
||||
* For multi-node runs, make sure the correct network drivers are installed on the nodes. If
|
||||
inside a Docker, either install the drivers inside the Docker container or pass the network
|
||||
drivers from the host while creating the Docker container.
|
||||
|
||||
Start training on AMD Instinct accelerators
|
||||
===========================================
|
||||
|
||||
The prebuilt Megatron-LM with ROCm training environment allows users to quickly validate
|
||||
system performance, conduct training benchmarks, and achieve superior
|
||||
performance for models like Llama 3.1 and Llama 2. This container should not be
|
||||
expected to provide generalized performance across all training workloads. You
|
||||
can expect the container to perform in the model configurations described in
|
||||
the following section, but other configurations are not validated by AMD.
|
||||
|
||||
Use the following instructions to set up the environment, configure the script
|
||||
to train models, and reproduce the benchmark results on MI300X series
|
||||
accelerators with the AMD Megatron-LM Docker image.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: Llama
|
||||
:sync: llama
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: Single node training
|
||||
:sync: single-node
|
||||
|
||||
To run training on a single node, navigate to the Megatron-LM folder and use the
|
||||
following command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
TEE_OUTPUT=1 MBS=2 BS=128 TP=1 TE_FP8=1 SEQ_LENGTH=8192 MODEL_SIZE=8 bash examples/llama/train_llama3.sh
|
||||
|
||||
.. tab-item:: Multi-node training
|
||||
:sync: multi-node
|
||||
|
||||
To run training on multiple nodes, launch the Docker container on each node. For example, for a two node setup (``NODE0`` as the master node), use these commands.
|
||||
|
||||
* On the master node ``NODE0``:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
TEE_OUTPUT=1 MBS=2 BS=256 TP=1 TE_FP8=1 SEQ_LENGTH=8192 MODEL_SIZE=8 MASTER_ADDR=IP_NODE0 NNODES=2 NODE_RANK=0 bash examples/llama/train_llama3.sh
|
||||
|
||||
* On the worker node ``NODE1``:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
TEE_OUTPUT=1 MBS=2 BS=256 TP=1 TE_FP8=1 SEQ_LENGTH=8192 MODEL_SIZE=8 MASTER_ADDR=IP_NODE0 NNODES=2 NODE_RANK=1 bash examples/llama/train_llama3.sh
|
||||
|
||||
|
||||
.. tab-item:: DeepSeek V2
|
||||
:sync: deepseek
|
||||
|
||||
To run the training on a single node, go to ``/Megatron-LM`` folder and use the following command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
cd /workspace/Megatron-LM
|
||||
GEMM_TUNING=1 PR=bf16 MBS=4 AC=none bash examples/deepseek_v2/train_deepseekv2.sh
|
||||
|
||||
Key options
|
||||
-----------
|
||||
|
||||
.. _amd-megatron-lm-benchmark-test-vars:
|
||||
|
||||
The benchmark tests support the following sets of variables:
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: Llama
|
||||
:sync: llama
|
||||
|
||||
``TEE_OUTPUT``
|
||||
``1`` to enable training logs or ``0`` to disable.
|
||||
|
||||
``TE_FP8``
|
||||
``0`` for BP16 (default) or ``1`` for FP8 GEMMs.
|
||||
|
||||
``GEMM_TUNING``
|
||||
``1`` to enable GEMM tuning, which boosts performance by using the best GEMM kernels.
|
||||
|
||||
``USE_FLASH_ATTN``
|
||||
``1`` to enable Flash Attention.
|
||||
|
||||
``ENABLE_PROFILING``
|
||||
``1`` to enable PyTorch profiling for performance analysis.
|
||||
|
||||
``transformer-impl``
|
||||
``transformer_engine`` to use the Transformer Engine (TE) or ``local`` to disable TE.
|
||||
|
||||
``MODEL_SIZE``
|
||||
``8B`` or ``70B`` for Llama 3 and 3.1. ``7B`` or ``70B`` for Llama 2.
|
||||
|
||||
``TOTAL_ITERS``
|
||||
The total number of iterations -- ``10`` by default.
|
||||
|
||||
``MOCK_DATA``
|
||||
``1`` to use mock data or ``0`` to use real data provided by you.
|
||||
|
||||
``MBS``
|
||||
Micro batch size.
|
||||
|
||||
``BS``
|
||||
Global batch size.
|
||||
|
||||
``TP``
|
||||
Tensor parallel (``1``, ``2``, ``4``, ``8``).
|
||||
|
||||
``SEQ_LENGTH``
|
||||
Input sequence length.
|
||||
|
||||
.. tab-item:: DeepSeek V2
|
||||
:sync: deepseek
|
||||
|
||||
``PR``
|
||||
Precision for training. ``bf16`` for BF16 (default) or ``fp8`` for FP8 GEMMs.
|
||||
|
||||
``GEMM_TUNING``
|
||||
``1`` to enable GEMM tuning, which boosts performance by using the best GEMM kernels.
|
||||
|
||||
``TOTAL_ITERS``
|
||||
The total number of iterations -- ``10`` by default.
|
||||
|
||||
``MOCK_DATA``
|
||||
``1`` to use mock data or ``0`` to use real data provided by you.
|
||||
|
||||
``MBS``
|
||||
Micro batch size.
|
||||
|
||||
``GBS``
|
||||
Global batch size.
|
||||
|
||||
Benchmarking examples
|
||||
---------------------
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: Llama
|
||||
:sync: llama
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: Single node training
|
||||
:sync: single-node
|
||||
|
||||
Use this command to run training with Llama 2 7B model on a single node. You can specify MBS, BS, FP,
|
||||
datatype, and so on.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
TEE_OUTPUT=1 MBS=5 BS=120 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1
|
||||
SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh
|
||||
|
||||
You can find the training logs at the location defined in ``$TRAIN_LOG`` in the :ref:`configuration script <amd-megatron-lm-environment-setup>`.
|
||||
|
||||
See the sample output:
|
||||
|
||||
.. image:: ../../../../data/how-to/rocm-for-ai/llama2-7b-training-log-sample.png
|
||||
:width: 800
|
||||
|
||||
.. tab-item:: Multi-node training
|
||||
:sync: multi-node
|
||||
|
||||
Launch the Docker container on each node.
|
||||
|
||||
In this example, run training with Llama 2 7B model on 2 nodes with specific MBS, BS, FP, datatype, and
|
||||
so on.
|
||||
|
||||
On the master node:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
TEE_OUTPUT=1 MBS=4 BS=64 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1
|
||||
SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh
|
||||
|
||||
On the worker node:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
TEE_OUTPUT=1 MBS=4 BS=64 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1
|
||||
SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh
|
||||
|
||||
You can find the training logs at the location defined in ``$TRAIN_LOG`` in the :ref:`configuration script <amd-megatron-lm-environment-setup>`.
|
||||
|
||||
Sample output for 2-node training:
|
||||
|
||||
Master node:
|
||||
|
||||
.. image:: ../../../../data/how-to/rocm-for-ai/2-node-training-master.png
|
||||
:width: 800
|
||||
|
||||
Worker node:
|
||||
|
||||
.. image:: ../../../../data/how-to/rocm-for-ai/2-node-training-worker.png
|
||||
:width: 800
|
||||
|
||||
Previous versions
|
||||
=================
|
||||
|
||||
This table lists previous versions of the ROCm Megatron-LM Docker image for training
|
||||
performance validation. For detailed information about available models for
|
||||
benchmarking, see the version-specific documentation.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:stub-columns: 1
|
||||
|
||||
* - ROCm version
|
||||
- Megatron-LM version
|
||||
- PyTorch version
|
||||
- Resources
|
||||
|
||||
* - 6.1
|
||||
- 24.12-dev
|
||||
- 2.4.0
|
||||
-
|
||||
* `Documentation <https://rocm.docs.amd.com/en/docs-6.3.0/how-to/rocm-for-ai/train-a-model.html>`_
|
||||
* `Docker Hub <https://hub.docker.com/layers/rocm/megatron-lm/24.12-dev/images/sha256-5818c50334ce3d69deeeb8f589d83ec29003817da34158ebc9e2d112b929bf2e>`_
|
||||
@@ -0,0 +1,341 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: How to train a model using PyTorch for ROCm.
|
||||
:keywords: ROCm, AI, LLM, train, PyTorch, torch, Llama, flux, tutorial, docker
|
||||
|
||||
**************************************
|
||||
Training a model with PyTorch for ROCm
|
||||
**************************************
|
||||
|
||||
PyTorch is an open-source machine learning framework that is widely used for
|
||||
model training with GPU-optimized components for transformer-based models.
|
||||
|
||||
The PyTorch for ROCm training Docker (``rocm/pytorch-training:v25.3``) image
|
||||
provides a prebuilt optimized environment for fine-tuning and pretraining a
|
||||
model on AMD Instinct MI325X and MI300X accelerators. It includes the following
|
||||
software components to accelerate training workloads:
|
||||
|
||||
+--------------------------+--------------------------------+
|
||||
| Software component | Version |
|
||||
+==========================+================================+
|
||||
| ROCm | 6.3.0 |
|
||||
+--------------------------+--------------------------------+
|
||||
| PyTorch | 2.7.0a0+git637433 |
|
||||
+--------------------------+--------------------------------+
|
||||
| Python | 3.10 |
|
||||
+--------------------------+--------------------------------+
|
||||
| Transformer Engine | 1.11 |
|
||||
+--------------------------+--------------------------------+
|
||||
| Flash Attention | 3.0.0 |
|
||||
+--------------------------+--------------------------------+
|
||||
| hipBLASLt | git258a2162 |
|
||||
+--------------------------+--------------------------------+
|
||||
| Triton | 3.1 |
|
||||
+--------------------------+--------------------------------+
|
||||
|
||||
.. _amd-pytorch-training-model-support:
|
||||
|
||||
Supported models
|
||||
================
|
||||
|
||||
The following models are pre-optimized for performance on the AMD Instinct MI300X accelerator.
|
||||
|
||||
* Llama 3.1 8B
|
||||
|
||||
* Llama 3.1 70B
|
||||
|
||||
* FLUX.1-dev
|
||||
|
||||
.. note::
|
||||
|
||||
Only these models are supported in the following steps.
|
||||
|
||||
Some models, such as Llama 3, require an external license agreement through
|
||||
a third party (for example, Meta).
|
||||
|
||||
System validation
|
||||
=================
|
||||
|
||||
If you have already validated your system settings, skip this step. Otherwise,
|
||||
complete the :ref:`system validation and optimization steps <train-a-model-system-validation>`
|
||||
to set up your system before starting training.
|
||||
|
||||
Disable NUMA auto-balancing
|
||||
---------------------------
|
||||
|
||||
Generally, application performance can benefit from disabling NUMA auto-balancing. However,
|
||||
it might be detrimental to performance with certain types of workloads.
|
||||
|
||||
Run the command ``cat /proc/sys/kernel/numa_balancing`` to check your current NUMA (Non-Uniform
|
||||
Memory Access) settings. Output ``0`` indicates this setting is disabled. If there is no output or
|
||||
the output is ``1``, run the following command to disable NUMA auto-balancing.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
|
||||
|
||||
See :ref:`mi300x-disable-numa` for more information.
|
||||
|
||||
Environment setup
|
||||
=================
|
||||
|
||||
This Docker image is optimized for specific model configurations outlined
|
||||
below. Performance can vary for other training workloads, as AMD
|
||||
doesn’t validate configurations and run conditions outside those described.
|
||||
|
||||
Download the Docker image
|
||||
-------------------------
|
||||
|
||||
1. Use the following command to pull the Docker image from Docker Hub.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull rocm/pytorch-training:v25.3
|
||||
|
||||
2. Run the Docker container.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v $HOME/.ssh:/root/.ssh --shm-size 64G --name training_env rocm/pytorch-training:v25.3
|
||||
|
||||
3. Use these commands if you exit the ``training_env`` container and need to return to it.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker start training_env
|
||||
docker exec -it training_env bash
|
||||
|
||||
4. In the Docker container, clone the `<https://github.com/ROCm/MAD>`__ repository and navigate to the benchmark scripts directory.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git clone https://github.com/ROCm/MAD
|
||||
cd MAD/scripts/pytorch-train
|
||||
|
||||
Prepare training datasets and dependencies
|
||||
------------------------------------------
|
||||
|
||||
The following benchmarking examples may require downloading models and datasets
|
||||
from Hugging Face. To ensure successful access to gated repos, set your
|
||||
``HF_TOKEN``.
|
||||
|
||||
Run the setup script to install libraries and datasets needed for benchmarking.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./pytorch_benchmark_setup.sh
|
||||
|
||||
``pytorch_benchmark_setup.sh`` installs the following libraries:
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Library
|
||||
- Benchmark model
|
||||
- Reference
|
||||
|
||||
* - ``accelerate``
|
||||
- Llama 3.1 8B, FLUX
|
||||
- `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
|
||||
|
||||
* - ``datasets``
|
||||
- Llama 3.1 8B, 70B, FLUX
|
||||
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
|
||||
|
||||
* - ``torchdata``
|
||||
- Llama 3.1 70B
|
||||
- `TorchData <https://pytorch.org/data/beta/index.html>`_
|
||||
|
||||
* - ``tomli``
|
||||
- Llama 3.1 70B
|
||||
- `Tomli <https://pypi.org/project/tomli/>`_
|
||||
|
||||
* - ``tiktoken``
|
||||
- Llama 3.1 70B
|
||||
- `tiktoken <https://github.com/openai/tiktoken>`_
|
||||
|
||||
* - ``blobfile``
|
||||
- Llama 3.1 70B
|
||||
- `blobfile <https://pypi.org/project/blobfile/>`_
|
||||
|
||||
* - ``tabulate``
|
||||
- Llama 3.1 70B
|
||||
- `tabulate <https://pypi.org/project/tabulate/>`_
|
||||
|
||||
* - ``wandb``
|
||||
- Llama 3.1 70B
|
||||
- `Weights & Biases <https://github.com/wandb/wandb>`_
|
||||
|
||||
* - ``sentencepiece``
|
||||
- Llama 3.1 70B, FLUX
|
||||
- `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0
|
||||
|
||||
* - ``tensorboard``
|
||||
- Llama 3.1 70 B, FLUX
|
||||
- `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0
|
||||
|
||||
* - ``csvkit``
|
||||
- FLUX
|
||||
- `csvkit <https://csvkit.readthedocs.io/en/latest/>`_ 2.0.1
|
||||
|
||||
* - ``deepspeed``
|
||||
- FLUX
|
||||
- `DeepSpeed <https://github.com/deepspeedai/DeepSpeed>`_ 0.16.2
|
||||
|
||||
* - ``diffusers``
|
||||
- FLUX
|
||||
- `Hugging Face Diffusers <https://huggingface.co/docs/diffusers/en/index>`_ 0.31.0
|
||||
|
||||
* - ``GitPython``
|
||||
- FLUX
|
||||
- `GitPython <https://github.com/gitpython-developers/GitPython>`_ 3.1.44
|
||||
|
||||
* - ``opencv-python-headless``
|
||||
- FLUX
|
||||
- `opencv-python-headless <https://pypi.org/project/opencv-python-headless/>`_ 4.10.0.84
|
||||
|
||||
* - ``peft``
|
||||
- FLUX
|
||||
- `PEFT <https://huggingface.co/docs/peft/en/index>`_ 0.14.0
|
||||
|
||||
* - ``protobuf``
|
||||
- FLUX
|
||||
- `Protocol Buffers <https://github.com/protocolbuffers/protobuf>`_ 5.29.2
|
||||
|
||||
* - ``pytest``
|
||||
- FLUX
|
||||
- `PyTest <https://docs.pytest.org/en/stable/>`_ 8.3.4
|
||||
|
||||
* - ``python-dotenv``
|
||||
- FLUX
|
||||
- `python-dotenv <https://pypi.org/project/python-dotenv/>`_ 1.0.1
|
||||
|
||||
* - ``seaborn``
|
||||
- FLUX
|
||||
- `Seaborn <https://seaborn.pydata.org/>`_ 0.13.2
|
||||
|
||||
* - ``transformers``
|
||||
- FLUX
|
||||
- `Transformers <https://huggingface.co/docs/transformers/en/index>`_ 4.47.0
|
||||
|
||||
``pytorch_benchmark_setup.sh`` downloads the following models from Hugging Face:
|
||||
|
||||
* `meta-llama/Llama-3.1-70B-Instruct <https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct>`_
|
||||
|
||||
* `black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
|
||||
|
||||
Along with the following datasets:
|
||||
|
||||
* `WikiText <https://huggingface.co/datasets/Salesforce/wikitext>`_
|
||||
|
||||
* `bghira/pseudo-camera-10k <https://huggingface.co/datasets/bghira/pseudo-camera-10k>`_
|
||||
|
||||
Start training on AMD Instinct accelerators
|
||||
===========================================
|
||||
|
||||
The prebuilt PyTorch with ROCm training environment allows users to quickly validate
|
||||
system performance, conduct training benchmarks, and achieve superior
|
||||
performance for models like Llama 3.1 and Llama 2. This container should not be
|
||||
expected to provide generalized performance across all training workloads. You
|
||||
can expect the container to perform in the model configurations described in
|
||||
the following section, but other configurations are not validated by AMD.
|
||||
|
||||
Use the following instructions to set up the environment, configure the script
|
||||
to train models, and reproduce the benchmark results on MI300X series
|
||||
accelerators with the AMD PyTorch training Docker image.
|
||||
|
||||
Once your environment is set up, use the following commands and examples to start benchmarking.
|
||||
|
||||
Pretraining
|
||||
-----------
|
||||
|
||||
To start the pretraining benchmark, use the following command with the
|
||||
appropriate options. See the following list of options and their descriptions.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./pytorch_benchmark_report.sh -t $training_mode -m $model_repo -p $datatype -s $sequence_length
|
||||
|
||||
Options and available models
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Name
|
||||
- Options
|
||||
- Description
|
||||
|
||||
* - ``$training_mode``
|
||||
- ``pretrain``
|
||||
- Benchmark pretraining
|
||||
|
||||
* -
|
||||
- ``finetune_fw``
|
||||
- Benchmark full weight fine-tuning (Llama 3.1 70B with BF16)
|
||||
|
||||
* -
|
||||
- ``finetune_lora``
|
||||
- Benchmark LoRA fine-tuning (Llama 3.1 70B with BF16)
|
||||
|
||||
* - ``$datatype``
|
||||
- FP8 or BF16
|
||||
- Only Llama 3.1 8B supports FP8 precision.
|
||||
|
||||
* - ``$model_repo``
|
||||
- Llama-3.1-8B
|
||||
- `Llama 3.1 8B <https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct>`_
|
||||
|
||||
* -
|
||||
- Llama-3.1-70B
|
||||
- `Llama 3.1 70B <https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct>`_
|
||||
|
||||
* -
|
||||
- Flux
|
||||
- `FLUX.1 [dev] <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
|
||||
|
||||
Fine-tuning
|
||||
-----------
|
||||
|
||||
To start the fine-tuning benchmark, use the following command. It will run the benchmarking example of Llama 2 70B
|
||||
with the WikiText dataset using the AMD fork of `torchtune <https://github.com/AMD-AIG-AIMA/torchtune>`_.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./pytorch_benchmark_report.sh -t {finetune_fw, finetune_lora} -p BF16 -m Llama-3.1-70B
|
||||
|
||||
Benchmarking examples
|
||||
---------------------
|
||||
|
||||
Here are some examples of how to use the command.
|
||||
|
||||
* Example 1: Llama 3.1 70B with BF16 precision with `torchtitan <https://github.com/ROCm/torchtitan>`_.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./pytorch_benchmark_report.sh -t pretrain -p BF16 -m Llama-3.1-70B -s 8192
|
||||
|
||||
* Example 2: Llama 3.1 8B with FP8 precision using Transformer Engine (TE) and Hugging Face Accelerator.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./pytorch_benchmark_report.sh -t pretrain -p FP8 -m Llama-3.1-70B -s 8192
|
||||
|
||||
* Example 3: FLUX.1-dev with BF16 precision with FluxBenchmark.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./pytorch_benchmark_report.sh -t pretrain -p BF16 -m Flux
|
||||
|
||||
* Example 4: Torchtune full weight fine-tuning with Llama 3.1 70B
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./pytorch_benchmark_report.sh -t finetune_fw -p BF16 -m Llama-3.1-70B
|
||||
|
||||
* Example 5: Torchtune LoRA fine-tuning with Llama 3.1 70B
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./pytorch_benchmark_report.sh -t finetune_lora -p BF16 -m Llama-3.1-70B
|
||||
@@ -19,6 +19,10 @@ training, fine-tuning, and inference. It leverages popular machine learning fram
|
||||
|
||||
In this guide, you'll learn about:
|
||||
|
||||
- :doc:`Training a model <train-a-model>`
|
||||
- Training a model
|
||||
|
||||
- :doc:`Scale model training <scale-model-training>`
|
||||
- :doc:`Train a model with Megatron-LM <benchmark-docker/megatron-lm>`
|
||||
|
||||
- :doc:`Train a model with PyTorch <benchmark-docker/pytorch-training>`
|
||||
|
||||
- :doc:`Scaling model training <scale-model-training>`
|
||||
|
||||
@@ -0,0 +1,130 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: Prerequisite system validation before using ROCm for AI.
|
||||
:keywords: ROCm, AI, LLM, train, megatron, Llama, tutorial, docker, torch, pytorch, jax
|
||||
|
||||
.. _train-a-model-system-validation:
|
||||
|
||||
**********************************************
|
||||
Prerequisite system validation before training
|
||||
**********************************************
|
||||
|
||||
Complete the following system validation and optimization steps to set up your system before starting training.
|
||||
|
||||
Disable NUMA auto-balancing
|
||||
---------------------------
|
||||
|
||||
Generally, application performance can benefit from disabling NUMA auto-balancing. However,
|
||||
it might be detrimental to performance with certain types of workloads.
|
||||
|
||||
Run the command ``cat /proc/sys/kernel/numa_balancing`` to check your current NUMA (Non-Uniform
|
||||
Memory Access) settings. Output ``0`` indicates this setting is disabled. If there is no output or
|
||||
the output is ``1``, run the following command to disable NUMA auto-balancing.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
|
||||
|
||||
See :ref:`mi300x-disable-numa` for more information.
|
||||
|
||||
Hardware verification with ROCm
|
||||
-------------------------------
|
||||
|
||||
Use the command ``rocm-smi --setperfdeterminism 1900`` to set the max clock speed up to 1900 MHz
|
||||
instead of the default 2100 MHz. This can reduce the chance of a PCC event lowering the attainable
|
||||
GPU clocks. This setting will not be required for new IFWI releases with the production PRC feature.
|
||||
You can restore this setting to its default value with the ``rocm-smi -r`` command.
|
||||
|
||||
Run the command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
rocm-smi --setperfdeterminism 1900
|
||||
|
||||
See :ref:`mi300x-hardware-verification-with-rocm` for more information.
|
||||
|
||||
RCCL Bandwidth Test for multi-node setups
|
||||
-----------------------------------------
|
||||
|
||||
ROCm Collective Communications Library (RCCL) is a standalone library of standard collective communication
|
||||
routines for GPUs. See the :doc:`RCCL documentation <rccl:index>` for more information. Before starting
|
||||
pretraining, running a RCCL bandwidth test helps ensure that the multi-GPU or multi-node setup is optimized
|
||||
for efficient distributed training.
|
||||
|
||||
Running the RCCL bandwidth test helps verify that:
|
||||
|
||||
- The GPUs can communicate across nodes or within a single node.
|
||||
|
||||
- The interconnect (such as InfiniBand, Ethernet, or Infinite fabric) is functioning as expected and
|
||||
provides adequate bandwidth for communication.
|
||||
|
||||
- No hardware setup or cabling issues could affect the communication between GPUs
|
||||
|
||||
Tuning and optimizing hyperparameters
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
In distributed training, specific hyperparameters related to distributed communication can be tuned based on
|
||||
the results of the RCCL bandwidth test. These variables are already set in the Docker image:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# force all RCCL streams to be high priority
|
||||
export TORCH_NCCL_HIGH_PRIORITY=1
|
||||
|
||||
# specify which RDMA interfaces to use for communication
|
||||
export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7
|
||||
|
||||
# define the Global ID index used in RoCE mode
|
||||
export NCCL_IB_GID_INDEX=3
|
||||
|
||||
# avoid data corruption/mismatch issue that existed in past releases
|
||||
export RCCL_MSCCL_ENABLE=0
|
||||
|
||||
Running the RCCL Bandwidth Test
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
It's recommended you run the RCCL bandwidth test before launching training. It ensures system
|
||||
performance is sufficient to launch training. RCCL is not included in the AMD Megatron-LM Docker
|
||||
image; follow the instructions in `<https://github.com/ROCm/rccl-tests>`__ to get started.
|
||||
See :ref:`mi300x-rccl` for more information.
|
||||
|
||||
Run on 8 GPUs (``-g 8``), scanning from 8 bytes to 10 GB:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./build/all_reduce_perf -b 8 -e 10G -f 2 -g 8
|
||||
|
||||
.. image:: ../../../data/how-to/rocm-for-ai/rccl-tests-8-gpu.png
|
||||
:width: 800
|
||||
|
||||
Using one MPI process per GPU and ``-g 1`` for performance-oriented runs on both single-node and multi-node is
|
||||
recommended. So, a run on 8 GPUs looks something like:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
mpirun -np 8 --bind-to numa ./build/all_reduce_perf -b 8 -e 10G -f 2 -g 1
|
||||
|
||||
.. image:: ../../../data/how-to/rocm-for-ai/rccl-tests-1-mpi-process-per-gpu.png
|
||||
:width: 800
|
||||
|
||||
Running with one MPI process per GPU ensures a one-to-one mapping for CPUs and GPUs, which can be beneficial
|
||||
for smaller message sizes. This better represents the real-world use of RCCL in deep learning frameworks like
|
||||
PyTorch and TensorFlow.
|
||||
|
||||
Use the following script to run the RCCL test for four MI300X GPU nodes. Modify paths and node addresses as needed.
|
||||
|
||||
.. code-block::
|
||||
|
||||
/home/$USER/ompi_for_gpu/ompi/bin/mpirun -np 32 -H tw022:8,tw024:8,tw010:8, tw015:8 \
|
||||
--mca pml ucx \
|
||||
--mca btl ^openib \
|
||||
-x NCCL_SOCKET_IFNAME=ens50f0np0 \
|
||||
-x NCCL_IB_HCA=rdma0:1,rdma1:1,rdma2:1,rdma3:1,rdma4:1,rdma5:1,rdma6:1,rdma7:1 \
|
||||
-x NCCL_IB_GID_INDEX=3 \
|
||||
-x NCCL_MIN_NCHANNELS=40 \
|
||||
-x NCCL_DEBUG=version \
|
||||
$HOME/rccl-tests/build/all_reduce_perf -b 8 -e 8g -f 2 -g 1
|
||||
|
||||
.. image:: ../../../data/how-to/rocm-for-ai/rccl-tests-4-mi300x-gpu-nodes.png
|
||||
:width: 800
|
||||
@@ -1,503 +0,0 @@
|
||||
.. meta::
|
||||
:description: How to train a model using ROCm Megatron-LM
|
||||
:keywords: ROCm, AI, LLM, train, Megatron-LM, megatron, Llama, tutorial, docker, torch
|
||||
|
||||
**************************************
|
||||
Training a model with ROCm Megatron-LM
|
||||
**************************************
|
||||
|
||||
.. _amd-megatron-lm:
|
||||
|
||||
The ROCm Megatron-LM framework is a specialized fork of the robust Megatron-LM, designed to
|
||||
enable efficient training of large-scale language models on AMD GPUs. By leveraging AMD Instinct™ MI300X
|
||||
accelerators, AMD Megatron-LM delivers enhanced scalability, performance, and resource utilization for AI
|
||||
workloads. It is purpose-built to :ref:`support models <amd-megatron-lm-model-support>`
|
||||
like Meta's Llama 2, Llama 3, and Llama 3.1, enabling developers to train next-generation AI models with greater
|
||||
efficiency. See the GitHub repository at `<https://github.com/ROCm/Megatron-LM>`__.
|
||||
|
||||
For ease of use, AMD provides a ready-to-use Docker image for MI300X accelerators containing essential
|
||||
components, including PyTorch, PyTorch Lightning, ROCm libraries, and Megatron-LM utilities. It contains the
|
||||
following software to accelerate training workloads:
|
||||
|
||||
+--------------------------+--------------------------------+
|
||||
| Software component | Version |
|
||||
+==========================+================================+
|
||||
| ROCm | 6.1 |
|
||||
+--------------------------+--------------------------------+
|
||||
| PyTorch | 2.4.0 |
|
||||
+--------------------------+--------------------------------+
|
||||
| PyTorch Lightning | 2.4.0 |
|
||||
+--------------------------+--------------------------------+
|
||||
| Megatron Core | 0.9.0 |
|
||||
+--------------------------+--------------------------------+
|
||||
| Transformer Engine | 1.5.0 |
|
||||
+--------------------------+--------------------------------+
|
||||
| Flash Attention | v2.6 |
|
||||
+--------------------------+--------------------------------+
|
||||
| Transformers | 4.44.0 |
|
||||
+--------------------------+--------------------------------+
|
||||
|
||||
Supported features and models
|
||||
=============================
|
||||
|
||||
Megatron-LM provides the following key features to train large language models efficiently:
|
||||
|
||||
- Transformer Engine (TE)
|
||||
|
||||
- APEX
|
||||
|
||||
- GEMM tuning
|
||||
|
||||
- Torch.compile
|
||||
|
||||
- 3D parallelism: TP + SP + CP
|
||||
|
||||
- Distributed optimizer
|
||||
|
||||
- Flash Attention (FA) 2
|
||||
|
||||
- Fused kernels
|
||||
|
||||
- Pre-training
|
||||
|
||||
.. _amd-megatron-lm-model-support:
|
||||
|
||||
The following models are pre-optimized for performance on the AMD Instinct MI300X accelerator.
|
||||
|
||||
* Llama 2 7B
|
||||
|
||||
* Llama 2 70B
|
||||
|
||||
* Llama 3 8B
|
||||
|
||||
* Llama 3 70B
|
||||
|
||||
* Llama 3.1 8B
|
||||
|
||||
* Llama 3.1 70B
|
||||
|
||||
Prerequisite system validation steps
|
||||
====================================
|
||||
|
||||
Complete the following system validation and optimization steps to set up your system before starting training.
|
||||
|
||||
Disable NUMA auto-balancing
|
||||
---------------------------
|
||||
|
||||
Generally, application performance can benefit from disabling NUMA auto-balancing. However,
|
||||
it might be detrimental to performance with certain types of workloads.
|
||||
|
||||
Run the command ``cat /proc/sys/kernel/numa_balancing`` to check your current NUMA (Non-Uniform
|
||||
Memory Access) settings. Output ``0`` indicates this setting is disabled. If there is no output or
|
||||
the output is ``1``, run the following command to disable NUMA auto-balancing.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
|
||||
|
||||
See :ref:`mi300x-disable-numa` for more information.
|
||||
|
||||
Hardware verification with ROCm
|
||||
-------------------------------
|
||||
|
||||
Use the command ``rocm-smi --setperfdeterminism 1900`` to set the max clock speed up to 1900 MHz
|
||||
instead of the default 2100 MHz. This can reduce the chance of a PCC event lowering the attainable
|
||||
GPU clocks. This setting will not be required for new IFWI releases with the production PRC feature.
|
||||
You can restore this setting to its default value with the ``rocm-smi -r`` command.
|
||||
|
||||
Run the command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
rocm-smi --setperfdeterminism 1900
|
||||
|
||||
See :ref:`mi300x-hardware-verification-with-rocm` for more information.
|
||||
|
||||
RCCL Bandwidth Test
|
||||
-------------------
|
||||
|
||||
ROCm Collective Communications Library (RCCL) is a standalone library of standard collective communication
|
||||
routines for GPUs. See the :doc:`RCCL documentation <rccl:index>` for more information. Before starting
|
||||
pre-training, running a RCCL bandwidth test helps ensure that the multi-GPU or multi-node setup is optimized
|
||||
for efficient distributed training.
|
||||
|
||||
Running the RCCL bandwidth test helps verify that:
|
||||
|
||||
- The GPUs can communicate across nodes or within a single node.
|
||||
|
||||
- The interconnect (such as InfiniBand, Ethernet, or Infinite fabric) is functioning as expected and
|
||||
provides adequate bandwidth for communication.
|
||||
|
||||
- No hardware setup or cabling issues could affect the communication between GPUs
|
||||
|
||||
Tuning and optimizing hyperparameters
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
In distributed training, specific hyperparameters related to distributed communication can be tuned based on
|
||||
the results of the RCCL bandwidth test. These variables are already set in the Docker image:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# force all RCCL streams to be high priority
|
||||
export TORCH_NCCL_HIGH_PRIORITY=1
|
||||
|
||||
# specify which RDMA interfaces to use for communication
|
||||
export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7
|
||||
|
||||
# define the Global ID index used in RoCE mode
|
||||
export NCCL_IB_GID_INDEX=3
|
||||
|
||||
# avoid data corruption/mismatch issue that existed in past releases
|
||||
export RCCL_MSCCL_ENABLE=0
|
||||
|
||||
Running the RCCL Bandwidth Test
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
It's recommended you run the RCCL bandwidth test before launching training. It ensures system
|
||||
performance is sufficient to launch training. RCCL is not included in the AMD Megatron-LM Docker
|
||||
image; follow the instructions in `<https://github.com/ROCm/rccl-tests>`__ to get started.
|
||||
See :ref:`mi300x-rccl` for more information.
|
||||
|
||||
Run on 8 GPUs (``-g 8``), scanning from 8 bytes to 10 GB:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./build/all_reduce_perf -b 8 -e 10G -f 2 -g 8
|
||||
|
||||
.. image:: ../../../data/how-to/rocm-for-ai/rccl-tests-8-gpu.png
|
||||
:width: 800
|
||||
|
||||
Using one MPI process per GPU and ``-g 1`` for performance-oriented runs on both single-node and multi-node is
|
||||
recommended. So, a run on 8 GPUs looks something like:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
mpirun -np 8 --bind-to numa ./build/all_reduce_perf -b 8 -e 10G -f 2 -g 1
|
||||
|
||||
.. image:: ../../../data/how-to/rocm-for-ai/rccl-tests-1-mpi-process-per-gpu.png
|
||||
:width: 800
|
||||
|
||||
Running with one MPI process per GPU ensures a one-to-one mapping for CPUs and GPUs, which can be beneficial
|
||||
for smaller message sizes. This better represents the real-world use of RCCL in deep learning frameworks like
|
||||
PyTorch and TensorFlow.
|
||||
|
||||
Use the following script to run the RCCL test for four MI300X GPU nodes. Modify paths and node addresses as needed.
|
||||
|
||||
.. code-block::
|
||||
|
||||
/home/$USER/ompi_for_gpu/ompi/bin/mpirun -np 32 -H tw022:8,tw024:8,tw010:8, tw015:8 \
|
||||
--mca pml ucx \
|
||||
--mca btl ^openib \
|
||||
-x NCCL_SOCKET_IFNAME=ens50f0np0 \
|
||||
-x NCCL_IB_HCA=rdma0:1,rdma1:1,rdma2:1,rdma3:1,rdma4:1,rdma5:1,rdma6:1,rdma7:1 \
|
||||
-x NCCL_IB_GID_INDEX=3 \
|
||||
-x NCCL_MIN_NCHANNELS=40 \
|
||||
-x NCCL_DEBUG=version \
|
||||
$HOME/rccl-tests/build/all_reduce_perf -b 8 -e 8g -f 2 -g 1
|
||||
|
||||
.. image:: ../../../data/how-to/rocm-for-ai/rccl-tests-4-mi300x-gpu-nodes.png
|
||||
:width: 800
|
||||
|
||||
.. _mi300x-amd-megatron-lm-training:
|
||||
|
||||
Start training on MI300X accelerators
|
||||
=====================================
|
||||
|
||||
The pre-built ROCm Megatron-LM environment allows users to quickly validate system performance, conduct
|
||||
training benchmarks, and achieve superior performance for models like Llama 2 and Llama 3.1.
|
||||
|
||||
Use the following instructions to set up the environment, configure the script to train models, and
|
||||
reproduce the benchmark results on the MI300X accelerators with the AMD Megatron-LM Docker
|
||||
image.
|
||||
|
||||
.. _amd-megatron-lm-requirements:
|
||||
|
||||
Download the Docker image and required packages
|
||||
-----------------------------------------------
|
||||
|
||||
1. Use the following command to pull the Docker image from Docker Hub.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull rocm/megatron-lm:24.12-dev
|
||||
|
||||
2. Launch the Docker container.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $CACHE_DIR:/root/.cache --name megatron-dev-env rocm/megatron-lm:24.12-dev /bin/bash
|
||||
|
||||
3. Clone the ROCm Megatron-LM repository to a local directory and install the required packages on the host machine.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git clone https://github.com/ROCm/Megatron-LM
|
||||
cd Megatron-LM
|
||||
|
||||
.. note::
|
||||
|
||||
This release is validated with ``ROCm/Megatron-LM`` commit `bb93ccb <https://github.com/ROCm/Megatron-LM/tree/bb93ccbfeae6363c67b361a97a27c74ab86e7e92>`_.
|
||||
Checking out this specific commit is recommended for a stable and reproducible environment.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git checkout bb93ccbfeae6363c67b361a97a27c74ab86e7e92
|
||||
|
||||
Prepare training datasets
|
||||
-------------------------
|
||||
|
||||
If you already have the preprocessed data, you can skip this section.
|
||||
|
||||
Use the following command to process datasets. We use GPT data as an example. You may change the merge table, use an
|
||||
end-of-document token, remove sentence splitting, and use the tokenizer type.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
python tools/preprocess_data.py \
|
||||
--input my-corpus.json \
|
||||
--output-prefix my-gpt2 \
|
||||
--vocab-file gpt2-vocab.json \
|
||||
--tokenizer-type GPT2BPETokenizer \
|
||||
--merge-file gpt2-merges.txt \
|
||||
--append-eod
|
||||
|
||||
In this case, the automatically generated output files are named ``my-gpt2_text_document.bin`` and
|
||||
``my-gpt2_text_document.idx``.
|
||||
|
||||
.. image:: ../../../data/how-to/rocm-for-ai/prep-training-datasets-my-gpt2-text-document.png
|
||||
:width: 800
|
||||
|
||||
.. _amd-megatron-lm-environment-setup:
|
||||
|
||||
Environment setup
|
||||
-----------------
|
||||
|
||||
In the ``examples/llama`` directory of Megatron-LM, if you're working with Llama 2 7B or Llama 2 70 B, use the
|
||||
``train_llama2.sh`` configuration script. Likewise, if you're working with Llama 3 or Llama 3.1, then use
|
||||
``train_llama3.sh`` and update the configuration script accordingly.
|
||||
|
||||
Network interface
|
||||
^^^^^^^^^^^^^^^^^
|
||||
|
||||
To avoid connectivity issues, ensure the correct network interface is set in your training scripts.
|
||||
|
||||
1. Run the following command to find the active network interface on your system.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
ip a
|
||||
|
||||
2. Update the ``NCCL_SOCKET_IFNAME`` and ``GLOO_SOCKET_IFNAME`` variables with your system’s network interface. For
|
||||
example:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export NCCL_SOCKET_IFNAME=ens50f0np0
|
||||
|
||||
export GLOO_SOCKET_IFNAME=ens50f0np0
|
||||
|
||||
Dataset options
|
||||
^^^^^^^^^^^^^^^
|
||||
|
||||
You can use either mock data or real data for training.
|
||||
|
||||
* If you're using a real dataset, update the ``DATA_PATH`` variable to point to the location of your dataset.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
DATA_DIR="/root/.cache/data" # Change to where your dataset is stored
|
||||
|
||||
DATA_PATH=${DATA_DIR}/bookcorpus_text_sentence
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
--data-path $DATA_PATH
|
||||
|
||||
Ensure that the files are accessible inside the Docker container.
|
||||
|
||||
* Mock data can be useful for testing and validation. If you're using mock data, replace ``--data-path $DATA_PATH`` with the ``--mock-data`` option.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
--mock-data
|
||||
|
||||
Tokenizer
|
||||
^^^^^^^^^
|
||||
|
||||
Tokenization is the process of converting raw text into tokens that can be processed by the model. For Llama
|
||||
models, this typically involves sub-word tokenization, where words are broken down into smaller units based on
|
||||
a fixed vocabulary. The tokenizer is trained along with the model on a large corpus of text, and it learns a
|
||||
fixed vocabulary that can represent a wide range of text from different domains. This allows Llama models to
|
||||
handle a variety of input sequences, including unseen words or domain-specific terms.
|
||||
|
||||
To train any of the Llama 2 models that this Docker image supports, use the ``Llama2Tokenizer``.
|
||||
|
||||
To train any of Llama 3 and Llama 3.1 models that this Docker image supports, use the ``HuggingFaceTokenizer``.
|
||||
Set the Hugging Face model link in the ``TOKENIZER_MODEL`` variable.
|
||||
|
||||
For example, if you're using the Llama 3.1 8B model:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
TOKENIZER_MODEL=meta-llama/Llama-3.1-8B
|
||||
|
||||
Run benchmark tests
|
||||
-------------------
|
||||
|
||||
.. note::
|
||||
|
||||
If you're running **multi node training**, update the following environment variables. They can
|
||||
also be passed as command line arguments.
|
||||
|
||||
* Change ``localhost`` to the master node's hostname:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
MASTER_ADDR="${MASTER_ADDR:-localhost}"
|
||||
|
||||
* Set the number of nodes you want to train on (for instance, ``2``, ``4``, ``8``):
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
NNODES="${NNODES:-1}"
|
||||
|
||||
* Set the rank of each node (0 for master, 1 for the first worker node, and so on):
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
NODE_RANK="${NODE_RANK:-0}"
|
||||
|
||||
* Use this command to run a performance benchmark test of any of the Llama 2 models that this Docker image supports (see :ref:`variables <amd-megatron-lm-benchmark-test-vars>`).
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
{variables} bash examples/llama/train_llama2.sh
|
||||
|
||||
* Use this command to run a performance benchmark test of any of the Llama 3 and Llama 3.1 models that this Docker image supports (see :ref:`variables <amd-megatron-lm-benchmark-test-vars>`).
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
{variables} bash examples/llama/train_llama3.sh
|
||||
|
||||
.. _amd-megatron-lm-benchmark-test-vars:
|
||||
|
||||
The benchmark tests support the same set of variables:
|
||||
|
||||
+--------------------------+-----------------------+-----------------------+
|
||||
| Name | Options | Description |
|
||||
+==========================+=======================+=======================+
|
||||
| ``TEE_OUTPUT`` | 0 or 1 | 0: disable training |
|
||||
| | | log |
|
||||
| | | |
|
||||
| | | 1: enable training |
|
||||
| | | log |
|
||||
+--------------------------+-----------------------+-----------------------+
|
||||
| ``MBS`` | | Micro batch size |
|
||||
+--------------------------+-----------------------+-----------------------+
|
||||
| ``BS`` | | Batch size |
|
||||
+--------------------------+-----------------------+-----------------------+
|
||||
| ``TP`` | 1, 2, 4, 8 | Tensor parallel |
|
||||
+--------------------------+-----------------------+-----------------------+
|
||||
| ``TE_FP8`` | 0 or 1 | Datatype. |
|
||||
| | | If it is set to 1, |
|
||||
| | | FP8. |
|
||||
| | | |
|
||||
| | | If it is set to 0. |
|
||||
| | | BP16 |
|
||||
+--------------------------+-----------------------+-----------------------+
|
||||
| ``NO_TORCH_COMPILE`` | 0 or 1 | If it is set to 1, |
|
||||
| | | enable torch.compile. |
|
||||
| | | |
|
||||
| | | If it is set to 0. |
|
||||
| | | Disable torch.compile |
|
||||
| | | (default) |
|
||||
+--------------------------+-----------------------+-----------------------+
|
||||
| ``SEQ_LENGTH`` | | Input sequence length |
|
||||
+--------------------------+-----------------------+-----------------------+
|
||||
| ``GEMM_TUNING`` | 0 or 1 | If it is set to 1, |
|
||||
| | | enable gemm tuning. |
|
||||
| | | |
|
||||
| | | If it is set to 0, |
|
||||
| | | disable gemm tuning |
|
||||
+--------------------------+-----------------------+-----------------------+
|
||||
| ``USE_FLASH_ATTN`` | 0 or 1 | 0: disable flash |
|
||||
| | | attention |
|
||||
| | | |
|
||||
| | | 1: enable flash |
|
||||
| | | attention |
|
||||
+--------------------------+-----------------------+-----------------------+
|
||||
| ``ENABLE_PROFILING`` | 0 or 1 | 0: disable torch |
|
||||
| | | profiling |
|
||||
| | | |
|
||||
| | | 1: enable torch |
|
||||
| | | profiling |
|
||||
+--------------------------+-----------------------+-----------------------+
|
||||
| ``MODEL_SIZE`` | | The size of the mode: |
|
||||
| | | 7B/70B, etc. |
|
||||
+--------------------------+-----------------------+-----------------------+
|
||||
| ``TOTAL_ITERS`` | | Total number of |
|
||||
| | | iterations |
|
||||
+--------------------------+-----------------------+-----------------------+
|
||||
| ``transformer-impl`` | transformer_engine or | Enable transformer |
|
||||
| | local | engine by default |
|
||||
+--------------------------+-----------------------+-----------------------+
|
||||
|
||||
Benchmarking examples
|
||||
^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: Single node training
|
||||
:sync: single
|
||||
|
||||
Use this command to run training with Llama 2 7B model on a single node. You can specify MBS, BS, FP,
|
||||
datatype, and so on.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
TEE_OUTPUT=1 MBS=5 BS=120 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1
|
||||
SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh
|
||||
|
||||
You can find the training logs at the location defined in ``$TRAIN_LOG`` in the :ref:`configuration script <amd-megatron-lm-environment-setup>`.
|
||||
|
||||
See the sample output:
|
||||
|
||||
.. image:: ../../../data/how-to/rocm-for-ai/llama2-7b-training-log-sample.png
|
||||
:width: 800
|
||||
|
||||
.. tab-item:: Multi node training
|
||||
:sync: multi
|
||||
|
||||
Launch the Docker container on each node.
|
||||
|
||||
In this example, run training with Llama 2 7B model on 2 nodes with specific MBS, BS, FP, datatype, and
|
||||
so on.
|
||||
|
||||
On the master node:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
TEE_OUTPUT=1 MBS=4 BS=64 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1
|
||||
SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh
|
||||
|
||||
On the worker node:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
TEE_OUTPUT=1 MBS=4 BS=64 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1
|
||||
SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh
|
||||
|
||||
You can find the training logs at the location defined in ``$TRAIN_LOG`` in the :ref:`configuration script <amd-megatron-lm-environment-setup>`.
|
||||
|
||||
Sample output for 2-node training:
|
||||
|
||||
Master node:
|
||||
|
||||
.. image:: ../../../data/how-to/rocm-for-ai/2-node-training-master.png
|
||||
:width: 800
|
||||
|
||||
Worker node:
|
||||
|
||||
.. image:: ../../../data/how-to/rocm-for-ai/2-node-training-worker.png
|
||||
:width: 800
|
||||
|
||||
@@ -308,6 +308,24 @@ Otherwise, if the system has Intel host CPUs add this instead to
|
||||
|
||||
intel_iommu=on iommu=pt
|
||||
|
||||
``modprobe.blacklist=amdgpu``
|
||||
For some system configurations, the ``amdgpu`` driver needs to be blocked during kernel initialization to avoid an issue where after boot, the GPUs are not listed when running the command ``rocm-smi`` or ``amd-smi``.
|
||||
|
||||
Alternatively, configuring the AMD recommended system optimized BIOS settings might remove the need for using this setting. Some manufacturers and users might not implement the recommended system optimized BIOS settings.
|
||||
|
||||
If you experience the mentioned issue, then add this to ``GRUB_CMDLINE_LINUX``:
|
||||
|
||||
.. code-block:: text
|
||||
|
||||
modprobe.blacklist=amdgpu
|
||||
|
||||
After the change, the ``amdgpu`` module must be loaded to support the ROCm framework
|
||||
software tools and utilities. Run the following command to load the ``amdgpu`` module:
|
||||
|
||||
.. code-block:: text
|
||||
|
||||
sudo modprobe amdgpu
|
||||
|
||||
Update GRUB
|
||||
-----------
|
||||
|
||||
|
||||
@@ -21,8 +21,6 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- Model
|
||||
- Architecture
|
||||
- LLVM target name
|
||||
- Device Major version
|
||||
- Device Minor version
|
||||
- VRAM (GiB)
|
||||
- Compute Units
|
||||
- Wavefront Size
|
||||
@@ -34,12 +32,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- L1 Instruction Cache (KiB)
|
||||
- VGPR File (KiB)
|
||||
- SGPR File (KiB)
|
||||
- GFXIP Major version
|
||||
- GFXIP Minor version
|
||||
*
|
||||
- MI325X
|
||||
- CDNA3
|
||||
- gfx942
|
||||
- 9
|
||||
- 4
|
||||
- 256
|
||||
- 304 (38 per XCD)
|
||||
- 64
|
||||
@@ -51,12 +49,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 64 per 2 CUs
|
||||
- 512
|
||||
- 12.5
|
||||
- 9
|
||||
- 4
|
||||
*
|
||||
- MI300X
|
||||
- CDNA3
|
||||
- gfx942
|
||||
- 9
|
||||
- 4
|
||||
- 192
|
||||
- 304 (38 per XCD)
|
||||
- 64
|
||||
@@ -68,12 +66,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 64 per 2 CUs
|
||||
- 512
|
||||
- 12.5
|
||||
- 9
|
||||
- 4
|
||||
*
|
||||
- MI300A
|
||||
- CDNA3
|
||||
- gfx942
|
||||
- 9
|
||||
- 4
|
||||
- 128
|
||||
- 228 (38 per XCD)
|
||||
- 64
|
||||
@@ -85,12 +83,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 64 per 2 CUs
|
||||
- 512
|
||||
- 12.5
|
||||
- 9
|
||||
- 4
|
||||
*
|
||||
- MI250X
|
||||
- CDNA2
|
||||
- gfx90a
|
||||
- 9
|
||||
- 0
|
||||
- 128
|
||||
- 220 (110 per GCD)
|
||||
- 64
|
||||
@@ -102,12 +100,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32 per 2 CUs
|
||||
- 512
|
||||
- 12.5
|
||||
- 9
|
||||
- 0
|
||||
*
|
||||
- MI250
|
||||
- CDNA2
|
||||
- gfx90a
|
||||
- 9
|
||||
- 0
|
||||
- 128
|
||||
- 208 (104 per GCD)
|
||||
- 64
|
||||
@@ -119,12 +117,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32 per 2 CUs
|
||||
- 512
|
||||
- 12.5
|
||||
- 9
|
||||
- 0
|
||||
*
|
||||
- MI210
|
||||
- CDNA2
|
||||
- gfx90a
|
||||
- 9
|
||||
- 0
|
||||
- 64
|
||||
- 104
|
||||
- 64
|
||||
@@ -136,12 +134,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32 per 2 CUs
|
||||
- 512
|
||||
- 12.5
|
||||
- 9
|
||||
- 0
|
||||
*
|
||||
- MI100
|
||||
- CDNA
|
||||
- gfx908
|
||||
- 9
|
||||
- 0
|
||||
- 32
|
||||
- 120
|
||||
- 64
|
||||
@@ -153,12 +151,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32 per 3 CUs
|
||||
- 256 VGPR and 256 AccVGPR
|
||||
- 12.5
|
||||
- 9
|
||||
- 0
|
||||
*
|
||||
- MI60
|
||||
- GCN5.1
|
||||
- gfx906
|
||||
- 9
|
||||
- 0
|
||||
- 32
|
||||
- 64
|
||||
- 64
|
||||
@@ -170,12 +168,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32 per 3 CUs
|
||||
- 256
|
||||
- 12.5
|
||||
- 9
|
||||
- 0
|
||||
*
|
||||
- MI50 (32GB)
|
||||
- GCN5.1
|
||||
- gfx906
|
||||
- 9
|
||||
- 0
|
||||
- 32
|
||||
- 60
|
||||
- 64
|
||||
@@ -187,12 +185,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32 per 3 CUs
|
||||
- 256
|
||||
- 12.5
|
||||
- 9
|
||||
- 0
|
||||
*
|
||||
- MI50 (16GB)
|
||||
- GCN5.1
|
||||
- gfx906
|
||||
- 9
|
||||
- 0
|
||||
- 16
|
||||
- 60
|
||||
- 64
|
||||
@@ -204,12 +202,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32 per 3 CUs
|
||||
- 256
|
||||
- 12.5
|
||||
- 9
|
||||
- 0
|
||||
*
|
||||
- MI25
|
||||
- GCN5.0
|
||||
- gfx900
|
||||
- 9
|
||||
- 0
|
||||
- 16
|
||||
- 64
|
||||
- 64
|
||||
@@ -221,12 +219,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32 per 3 CUs
|
||||
- 256
|
||||
- 12.5
|
||||
- 9
|
||||
- 0
|
||||
*
|
||||
- MI8
|
||||
- GCN3.0
|
||||
- gfx803
|
||||
- 8
|
||||
- 0
|
||||
- 4
|
||||
- 64
|
||||
- 64
|
||||
@@ -238,12 +236,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32 per 4 CUs
|
||||
- 256
|
||||
- 12.5
|
||||
- 8
|
||||
- 0
|
||||
*
|
||||
- MI6
|
||||
- GCN4.0
|
||||
- gfx803
|
||||
- 8
|
||||
- 0
|
||||
- 16
|
||||
- 36
|
||||
- 64
|
||||
@@ -255,6 +253,8 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32 per 4 CUs
|
||||
- 256
|
||||
- 12.5
|
||||
- 8
|
||||
- 0
|
||||
|
||||
.. tab-item:: AMD Radeon PRO GPUs
|
||||
|
||||
@@ -266,8 +266,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- Model
|
||||
- Architecture
|
||||
- LLVM target name
|
||||
- Device Major version
|
||||
- Device Minor version
|
||||
|
||||
- VRAM (GiB)
|
||||
- Compute Units
|
||||
- Wavefront Size
|
||||
@@ -280,12 +279,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- L0 Instruction Cache (KiB)
|
||||
- VGPR File (KiB)
|
||||
- SGPR File (KiB)
|
||||
- GFXIP Major version
|
||||
- GFXIP Minor version
|
||||
*
|
||||
- Radeon PRO V710
|
||||
- RDNA3
|
||||
- gfx1101
|
||||
- 11
|
||||
- 0
|
||||
- 28
|
||||
- 54
|
||||
- 32
|
||||
@@ -298,12 +297,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32
|
||||
- 768
|
||||
- 16
|
||||
- 11
|
||||
- 0
|
||||
*
|
||||
- Radeon PRO W7900 Dual Slot
|
||||
- RDNA3
|
||||
- gfx1100
|
||||
- 11
|
||||
- 0
|
||||
- 48
|
||||
- 96
|
||||
- 32
|
||||
@@ -316,12 +315,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32
|
||||
- 768
|
||||
- 16
|
||||
- 11
|
||||
- 0
|
||||
*
|
||||
- Radeon PRO W7900
|
||||
- RDNA3
|
||||
- gfx1100
|
||||
- 11
|
||||
- 0
|
||||
- 48
|
||||
- 96
|
||||
- 32
|
||||
@@ -334,12 +333,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32
|
||||
- 768
|
||||
- 16
|
||||
- 11
|
||||
- 0
|
||||
*
|
||||
- Radeon PRO W7800
|
||||
- RDNA3
|
||||
- gfx1100
|
||||
- 11
|
||||
- 0
|
||||
- 32
|
||||
- 70
|
||||
- 32
|
||||
@@ -352,12 +351,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32
|
||||
- 768
|
||||
- 16
|
||||
- 11
|
||||
- 0
|
||||
*
|
||||
- Radeon PRO W7700
|
||||
- RDNA3
|
||||
- gfx1101
|
||||
- 11
|
||||
- 0
|
||||
- 16
|
||||
- 48
|
||||
- 32
|
||||
@@ -370,12 +369,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32
|
||||
- 768
|
||||
- 16
|
||||
- 11
|
||||
- 0
|
||||
*
|
||||
- Radeon PRO W6800
|
||||
- RDNA2
|
||||
- gfx1030
|
||||
- 10
|
||||
- 3
|
||||
- 32
|
||||
- 60
|
||||
- 32
|
||||
@@ -388,12 +387,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32
|
||||
- 512
|
||||
- 16
|
||||
- 10
|
||||
- 3
|
||||
*
|
||||
- Radeon PRO W6600
|
||||
- RDNA2
|
||||
- gfx1032
|
||||
- 10
|
||||
- 3
|
||||
- 8
|
||||
- 28
|
||||
- 32
|
||||
@@ -406,12 +405,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32
|
||||
- 512
|
||||
- 16
|
||||
- 10
|
||||
- 3
|
||||
*
|
||||
- Radeon PRO V620
|
||||
- RDNA2
|
||||
- gfx1030
|
||||
- 10
|
||||
- 3
|
||||
- 32
|
||||
- 72
|
||||
- 32
|
||||
@@ -424,12 +423,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32
|
||||
- 512
|
||||
- 16
|
||||
- 10
|
||||
- 3
|
||||
*
|
||||
- Radeon Pro W5500
|
||||
- RDNA
|
||||
- gfx1012
|
||||
- 10
|
||||
- 1
|
||||
- 8
|
||||
- 22
|
||||
- 32
|
||||
@@ -442,12 +441,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32
|
||||
- 512
|
||||
- 20
|
||||
- 10
|
||||
- 1
|
||||
*
|
||||
- Radeon Pro VII
|
||||
- GCN5.1
|
||||
- gfx906
|
||||
- 9
|
||||
- 0
|
||||
- 16
|
||||
- 60
|
||||
- 64
|
||||
@@ -460,6 +459,8 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32 per 3 CUs
|
||||
- 256
|
||||
- 12.5
|
||||
- 9
|
||||
- 0
|
||||
|
||||
.. tab-item:: AMD Radeon GPUs
|
||||
|
||||
@@ -471,8 +472,6 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- Model
|
||||
- Architecture
|
||||
- LLVM target name
|
||||
- Device Major version
|
||||
- Device Minor version
|
||||
- VRAM (GiB)
|
||||
- Compute Units
|
||||
- Wavefront Size
|
||||
@@ -485,12 +484,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- L0 Instruction Cache (KiB)
|
||||
- VGPR File (KiB)
|
||||
- SGPR File (KiB)
|
||||
- GFXIP Major version
|
||||
- GFXIP Minor version
|
||||
*
|
||||
- Radeon RX 7900 XTX
|
||||
- RDNA3
|
||||
- gfx1100
|
||||
- 11
|
||||
- 0
|
||||
- 24
|
||||
- 96
|
||||
- 32
|
||||
@@ -503,12 +502,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32
|
||||
- 768
|
||||
- 16
|
||||
- 11
|
||||
- 0
|
||||
*
|
||||
- Radeon RX 7900 XT
|
||||
- RDNA3
|
||||
- gfx1100
|
||||
- 11
|
||||
- 0
|
||||
- 20
|
||||
- 84
|
||||
- 32
|
||||
@@ -521,12 +520,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32
|
||||
- 768
|
||||
- 16
|
||||
- 11
|
||||
- 0
|
||||
*
|
||||
- Radeon RX 7900 GRE
|
||||
- RDNA3
|
||||
- gfx1100
|
||||
- 11
|
||||
- 0
|
||||
- 16
|
||||
- 80
|
||||
- 32
|
||||
@@ -539,12 +538,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32
|
||||
- 768
|
||||
- 16
|
||||
- 11
|
||||
- 0
|
||||
*
|
||||
- Radeon RX 7800 XT
|
||||
- RDNA3
|
||||
- gfx1101
|
||||
- 11
|
||||
- 0
|
||||
- 16
|
||||
- 60
|
||||
- 32
|
||||
@@ -557,12 +556,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32
|
||||
- 768
|
||||
- 16
|
||||
- 11
|
||||
- 0
|
||||
*
|
||||
- Radeon RX 7700 XT
|
||||
- RDNA3
|
||||
- gfx1101
|
||||
- 11
|
||||
- 0
|
||||
- 12
|
||||
- 54
|
||||
- 32
|
||||
@@ -575,12 +574,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32
|
||||
- 768
|
||||
- 16
|
||||
- 11
|
||||
- 0
|
||||
*
|
||||
- Radeon RX 7600
|
||||
- RDNA3
|
||||
- gfx1102
|
||||
- 11
|
||||
- 0
|
||||
- 8
|
||||
- 32
|
||||
- 32
|
||||
@@ -593,12 +592,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32
|
||||
- 512
|
||||
- 16
|
||||
- 11
|
||||
- 0
|
||||
*
|
||||
- Radeon RX 6950 XT
|
||||
- RDNA2
|
||||
- gfx1030
|
||||
- 10
|
||||
- 3
|
||||
- 16
|
||||
- 80
|
||||
- 32
|
||||
@@ -611,12 +610,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32
|
||||
- 512
|
||||
- 16
|
||||
- 10
|
||||
- 3
|
||||
*
|
||||
- Radeon RX 6900 XT
|
||||
- RDNA2
|
||||
- gfx1030
|
||||
- 10
|
||||
- 3
|
||||
- 16
|
||||
- 80
|
||||
- 32
|
||||
@@ -629,12 +628,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32
|
||||
- 512
|
||||
- 16
|
||||
- 10
|
||||
- 3
|
||||
*
|
||||
- Radeon RX 6800 XT
|
||||
- RDNA2
|
||||
- gfx1030
|
||||
- 10
|
||||
- 3
|
||||
- 16
|
||||
- 72
|
||||
- 32
|
||||
@@ -647,12 +646,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32
|
||||
- 512
|
||||
- 16
|
||||
- 10
|
||||
- 3
|
||||
*
|
||||
- Radeon RX 6800
|
||||
- RDNA2
|
||||
- gfx1030
|
||||
- 10
|
||||
- 3
|
||||
- 16
|
||||
- 60
|
||||
- 32
|
||||
@@ -665,12 +664,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32
|
||||
- 512
|
||||
- 16
|
||||
- 10
|
||||
- 3
|
||||
*
|
||||
- Radeon RX 6750 XT
|
||||
- RDNA2
|
||||
- gfx1031
|
||||
- 10
|
||||
- 3
|
||||
- 12
|
||||
- 40
|
||||
- 32
|
||||
@@ -683,12 +682,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32
|
||||
- 512
|
||||
- 16
|
||||
- 10
|
||||
- 3
|
||||
*
|
||||
- Radeon RX 6700 XT
|
||||
- RDNA2
|
||||
- gfx1031
|
||||
- 10
|
||||
- 3
|
||||
- 12
|
||||
- 40
|
||||
- 32
|
||||
@@ -701,13 +700,13 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32
|
||||
- 512
|
||||
- 16
|
||||
- 10
|
||||
- 3
|
||||
*
|
||||
- Radeon RX 6700
|
||||
- RDNA2
|
||||
- gfx1031
|
||||
- 10
|
||||
- 3
|
||||
- 10
|
||||
- 36
|
||||
- 32
|
||||
- 128
|
||||
@@ -719,12 +718,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32
|
||||
- 512
|
||||
- 16
|
||||
- 10
|
||||
- 3
|
||||
*
|
||||
- Radeon RX 6650 XT
|
||||
- RDNA2
|
||||
- gfx1032
|
||||
- 10
|
||||
- 3
|
||||
- 8
|
||||
- 32
|
||||
- 32
|
||||
@@ -737,12 +736,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32
|
||||
- 512
|
||||
- 16
|
||||
- 10
|
||||
- 3
|
||||
*
|
||||
- Radeon RX 6600 XT
|
||||
- RDNA2
|
||||
- gfx1032
|
||||
- 10
|
||||
- 3
|
||||
- 8
|
||||
- 32
|
||||
- 32
|
||||
@@ -755,12 +754,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32
|
||||
- 512
|
||||
- 16
|
||||
- 10
|
||||
- 3
|
||||
*
|
||||
- Radeon RX 6600
|
||||
- RDNA2
|
||||
- gfx1032
|
||||
- 10
|
||||
- 3
|
||||
- 8
|
||||
- 28
|
||||
- 32
|
||||
@@ -773,12 +772,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32
|
||||
- 512
|
||||
- 16
|
||||
- 10
|
||||
- 3
|
||||
*
|
||||
- Radeon VII
|
||||
- GCN5.1
|
||||
- gfx906
|
||||
- 9
|
||||
- 0
|
||||
- 16
|
||||
- 60
|
||||
- 64
|
||||
@@ -791,6 +790,8 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32 per 3 CUs
|
||||
- 256
|
||||
- 12.5
|
||||
- 9
|
||||
- 0
|
||||
|
||||
Glossary
|
||||
========
|
||||
@@ -804,18 +805,6 @@ For more information about the terms used, see the
|
||||
Argument to pass to clang in ``--offload-arch`` to compile code for the given
|
||||
architecture.
|
||||
|
||||
**Device major version**
|
||||
|
||||
Indicates the core instruction set of the GPU architecture. For example, a value
|
||||
of 11 would correspond to Navi III (RDNA3).
|
||||
|
||||
**Device minor version**
|
||||
|
||||
Indicates a particular configuration, feature set, or variation within the group
|
||||
represented by the device compute version. For example, different models within
|
||||
the same major version might have varying levels of support for certain features
|
||||
or optimizations.
|
||||
|
||||
**VRAM**
|
||||
|
||||
Amount of memory available on the GPU.
|
||||
@@ -898,6 +887,26 @@ Purpose Vector Registers, used specifically in matrix instructions.
|
||||
Size of the Scalar General Purpose Register (SGPR) file. Holds data used in
|
||||
scalar instructions.
|
||||
|
||||
**GFXIP**
|
||||
|
||||
GFXIP (Graphics IP) is a versioning system used by AMD to identify the GPU
|
||||
architecture and its instruction set. It helps categorize different generations
|
||||
of GPUs and their feature sets.
|
||||
|
||||
**GFXIP major version**
|
||||
|
||||
Defines the GPU's core instruction set and architecture, which determines
|
||||
compatibility with software stacks such as HIP and OpenCL. For example, a GFXIP
|
||||
11 major version corresponds to the RDNA 3 (Navi 3x) architecture, influencing
|
||||
driver support and available compute features.
|
||||
|
||||
**GFXIP minor version**
|
||||
|
||||
Represents specific variations within a GFXIP major version and affects feature sets,
|
||||
optimizations, and driver behavior in software stacks such as HIP and OpenCL. Different
|
||||
GPU models within the same major version can have unique capabilities, impacting
|
||||
performance and supported instructions.
|
||||
|
||||
**GCD**
|
||||
|
||||
Graphics Compute Die.
|
||||
|
||||
@@ -9,16 +9,14 @@
|
||||
Data types and precision support
|
||||
*************************************************************
|
||||
|
||||
This topic lists the supported data types of AMD GPUs and ROCm libraries.
|
||||
Corresponding :doc:`HIP <hip:index>` data types are also noted.
|
||||
This topic lists the data types support on AMD GPUs, ROCm libraries along
|
||||
with corresponding :doc:`HIP <hip:index>` data types.
|
||||
|
||||
Integral types
|
||||
==========================================
|
||||
==============
|
||||
|
||||
The signed and unsigned integral types supported by ROCm are listed in
|
||||
the following table, along with their corresponding HIP type and a short
|
||||
description.
|
||||
|
||||
the following table.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
@@ -48,10 +46,9 @@ description.
|
||||
.. _precision_support_floating_point_types:
|
||||
|
||||
Floating-point types
|
||||
==========================================
|
||||
====================
|
||||
|
||||
The floating-point types supported by ROCm are listed in the following
|
||||
table, along with their corresponding HIP type and a short description.
|
||||
The floating-point types supported by ROCm are listed in the following table.
|
||||
|
||||
.. image:: ../data/about/compatibility/floating-point-data-types.png
|
||||
:alt: Supported floating-point types
|
||||
@@ -66,18 +63,18 @@ table, along with their corresponding HIP type and a short description.
|
||||
- Description
|
||||
*
|
||||
- float8 (E4M3)
|
||||
- ``-``
|
||||
- ``__hip_fp8_e4m3_fnuz``
|
||||
- An 8-bit floating-point number that mostly follows IEEE-754 conventions
|
||||
and **S1E4M3** bit layout, as described in `8-bit Numerical Formats for Deep Neural Networks <https://arxiv.org/abs/2206.02915>`_ ,
|
||||
with expanded range and no infinity or signed zero. NaN is
|
||||
represented as negative zero.
|
||||
and **S1E4M3** bit layout, as described in `8-bit Numerical Formats for Deep Neural Networks <https://arxiv.org/abs/2206.02915>`_,
|
||||
with expanded range and no infinity or signed zero. NaN is represented
|
||||
as negative zero.
|
||||
*
|
||||
- float8 (E5M2)
|
||||
- ``-``
|
||||
- ``__hip_fp8_e5m2_fnuz``
|
||||
- An 8-bit floating-point number mostly following IEEE-754 conventions and
|
||||
**S1E5M2** bit layout, as described in `8-bit Numerical Formats for Deep Neural Networks <https://arxiv.org/abs/2206.02915>`_ ,
|
||||
with expanded range and no infinity or signed zero. NaN is
|
||||
represented as negative zero.
|
||||
**S1E5M2** bit layout, as described in `8-bit Numerical Formats for Deep Neural Networks <https://arxiv.org/abs/2206.02915>`_,
|
||||
with expanded range and no infinity or signed zero. NaN is represented
|
||||
as negative zero.
|
||||
*
|
||||
- float16
|
||||
- ``half``
|
||||
@@ -90,7 +87,7 @@ table, along with their corresponding HIP type and a short description.
|
||||
format.
|
||||
*
|
||||
- tensorfloat32
|
||||
- ``-``
|
||||
- Not available
|
||||
- A floating-point number that occupies 32 bits or less of storage,
|
||||
providing improved range compared to half (16-bit) format, at
|
||||
(potentially) greater throughput than single-precision (32-bit) formats.
|
||||
@@ -117,12 +114,15 @@ table, along with their corresponding HIP type and a short description.
|
||||
|
||||
* In some AMD documents and articles, float8 (E5M2) is referred to as bfloat8.
|
||||
|
||||
ROCm support icons
|
||||
==========================================
|
||||
* The :doc:`low precision floating point types page <hip:reference/low_fp_types>`
|
||||
describes how to use these types in HIP with examples.
|
||||
|
||||
In the following sections, icons represent the level of support. These
|
||||
icons, described in the following table, are also used in the library data type
|
||||
support pages.
|
||||
Level of support definitions
|
||||
============================
|
||||
|
||||
In the following sections, icons represent the level of support. These icons,
|
||||
described in the following table, are also used in the library data type support
|
||||
pages.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
@@ -130,6 +130,11 @@ support pages.
|
||||
*
|
||||
- Icon
|
||||
- Definition
|
||||
|
||||
*
|
||||
- NA
|
||||
- Not applicable
|
||||
|
||||
*
|
||||
- ❌
|
||||
- Not supported
|
||||
@@ -158,16 +163,15 @@ support pages.
|
||||
* Any type can be emulated by software, but this page does not cover such
|
||||
cases.
|
||||
|
||||
Hardware data type support
|
||||
Data type support by Hardware Architecture
|
||||
==========================================
|
||||
|
||||
The following tables provide information about AMD Instinct accelerators support
|
||||
for various data types. The MI200 series GPUs, which include MI210, MI250, and
|
||||
MI250X, are based on the CDNA2 architecture. The MI300 series GPUs, consisting
|
||||
of MI300A, MI300X, and MI325X, are built on the CDNA3 architecture.
|
||||
The MI200 series GPUs, which include MI210, MI250, and MI250X, are based on the
|
||||
CDNA2 architecture. The MI300 series GPUs, consisting of MI300A, MI300X, and
|
||||
MI325X, are based on the CDNA3 architecture.
|
||||
|
||||
Compute units support
|
||||
-------------------------------------------------------------------------------
|
||||
---------------------
|
||||
|
||||
The following table lists data type support for compute units.
|
||||
|
||||
@@ -248,7 +252,7 @@ The following table lists data type support for compute units.
|
||||
- ✅
|
||||
|
||||
Matrix core support
|
||||
-------------------------------------------------------------------------------
|
||||
-------------------
|
||||
|
||||
The following table lists data type support for AMD GPU matrix cores.
|
||||
|
||||
@@ -329,7 +333,7 @@ The following table lists data type support for AMD GPU matrix cores.
|
||||
- ✅
|
||||
|
||||
Atomic operations support
|
||||
-------------------------------------------------------------------------------
|
||||
-------------------------
|
||||
|
||||
The following table lists data type support for atomic operations.
|
||||
|
||||
@@ -416,14 +420,14 @@ The following table lists data type support for atomic operations.
|
||||
performance impact when they frequently access the same memory address.
|
||||
|
||||
Data type support in ROCm libraries
|
||||
==========================================
|
||||
===================================
|
||||
|
||||
ROCm library support for int8, float8 (E4M3), float8 (E5M2), int16, float16,
|
||||
bfloat16, int32, tensorfloat32, float32, int64, and float64 is listed in the
|
||||
following tables.
|
||||
|
||||
Libraries input/output type support
|
||||
-------------------------------------------------------------------------------
|
||||
-----------------------------------
|
||||
|
||||
The following tables list ROCm library support for specific input and output
|
||||
data types. Refer to the corresponding library data type support page for a
|
||||
@@ -444,37 +448,37 @@ detailed description.
|
||||
- int32
|
||||
- int64
|
||||
*
|
||||
- hipSPARSELt (:doc:`details <hipsparselt:reference/data-type-support>`)
|
||||
- :doc:`hipSPARSELt <hipsparselt:reference/data-type-support>`
|
||||
- ✅/✅
|
||||
- ❌/❌
|
||||
- ❌/❌
|
||||
- ❌/❌
|
||||
*
|
||||
- rocRAND (:doc:`details <rocrand:api-reference/data-type-support>`)
|
||||
- -/✅
|
||||
- -/✅
|
||||
- -/✅
|
||||
- -/✅
|
||||
- :doc:`rocRAND <rocrand:api-reference/data-type-support>`
|
||||
- NA/✅
|
||||
- NA/✅
|
||||
- NA/✅
|
||||
- NA/✅
|
||||
*
|
||||
- hipRAND (:doc:`details <hiprand:api-reference/data-type-support>`)
|
||||
- -/✅
|
||||
- -/✅
|
||||
- -/✅
|
||||
- -/✅
|
||||
- :doc:`hipRAND <hiprand:api-reference/data-type-support>`
|
||||
- NA/✅
|
||||
- NA/✅
|
||||
- NA/✅
|
||||
- NA/✅
|
||||
*
|
||||
- rocPRIM (:doc:`details <rocprim:reference/data-type-support>`)
|
||||
- :doc:`rocPRIM <rocprim:reference/data-type-support>`
|
||||
- ✅/✅
|
||||
- ✅/✅
|
||||
- ✅/✅
|
||||
- ✅/✅
|
||||
*
|
||||
- hipCUB (:doc:`details <hipcub:api-reference/data-type-support>`)
|
||||
- :doc:`hipCUB <hipcub:api-reference/data-type-support>`
|
||||
- ✅/✅
|
||||
- ✅/✅
|
||||
- ✅/✅
|
||||
- ✅/✅
|
||||
*
|
||||
- rocThrust (:doc:`details <rocthrust:data-type-support>`)
|
||||
- :doc:`rocThrust <rocthrust:data-type-support>`
|
||||
- ✅/✅
|
||||
- ✅/✅
|
||||
- ✅/✅
|
||||
@@ -496,7 +500,7 @@ detailed description.
|
||||
- float32
|
||||
- float64
|
||||
*
|
||||
- hipSPARSELt (:doc:`details <hipsparselt:reference/data-type-support>`)
|
||||
- :doc:`hipSPARSELt <hipsparselt:reference/data-type-support>`
|
||||
- ❌/❌
|
||||
- ❌/❌
|
||||
- ✅/✅
|
||||
@@ -505,25 +509,25 @@ detailed description.
|
||||
- ❌/❌
|
||||
- ❌/❌
|
||||
*
|
||||
- rocRAND (:doc:`details <rocrand:api-reference/data-type-support>`)
|
||||
- -/❌
|
||||
- -/❌
|
||||
- -/✅
|
||||
- -/❌
|
||||
- -/❌
|
||||
- -/✅
|
||||
- -/✅
|
||||
- :doc:`rocRAND <rocrand:api-reference/data-type-support>`
|
||||
- NA/❌
|
||||
- NA/❌
|
||||
- NA/✅
|
||||
- NA/❌
|
||||
- NA/❌
|
||||
- NA/✅
|
||||
- NA/✅
|
||||
*
|
||||
- hipRAND (:doc:`details <hiprand:api-reference/data-type-support>`)
|
||||
- -/❌
|
||||
- -/❌
|
||||
- -/✅
|
||||
- -/❌
|
||||
- -/❌
|
||||
- -/✅
|
||||
- -/✅
|
||||
- :doc:`hipRAND <hiprand:api-reference/data-type-support>`
|
||||
- NA/❌
|
||||
- NA/❌
|
||||
- NA/✅
|
||||
- NA/❌
|
||||
- NA/❌
|
||||
- NA/✅
|
||||
- NA/✅
|
||||
*
|
||||
- rocPRIM (:doc:`details <rocprim:reference/data-type-support>`)
|
||||
- :doc:`rocPRIM <rocprim:reference/data-type-support>`
|
||||
- ❌/❌
|
||||
- ❌/❌
|
||||
- ✅/✅
|
||||
@@ -532,7 +536,7 @@ detailed description.
|
||||
- ✅/✅
|
||||
- ✅/✅
|
||||
*
|
||||
- hipCUB (:doc:`details <hipcub:api-reference/data-type-support>`)
|
||||
- :doc:`hipCUB <hipcub:api-reference/data-type-support>`
|
||||
- ❌/❌
|
||||
- ❌/❌
|
||||
- ✅/✅
|
||||
@@ -541,7 +545,7 @@ detailed description.
|
||||
- ✅/✅
|
||||
- ✅/✅
|
||||
*
|
||||
- rocThrust (:doc:`details <rocthrust:data-type-support>`)
|
||||
- :doc:`rocThrust <rocthrust:data-type-support>`
|
||||
- ❌/❌
|
||||
- ❌/❌
|
||||
- ⚠️/⚠️
|
||||
@@ -550,9 +554,14 @@ detailed description.
|
||||
- ✅/✅
|
||||
- ✅/✅
|
||||
|
||||
.. note::
|
||||
|
||||
As random number generation libraries, rocRAND and hipRAND only specify output
|
||||
data types for the random values they generate, with no need for input data
|
||||
types.
|
||||
|
||||
Libraries internal calculations type support
|
||||
-------------------------------------------------------------------------------
|
||||
--------------------------------------------
|
||||
|
||||
The following tables list ROCm library support for specific internal data types.
|
||||
Refer to the corresponding library data type support page for a detailed
|
||||
@@ -573,7 +582,7 @@ description.
|
||||
- int32
|
||||
- int64
|
||||
*
|
||||
- hipSPARSELt (:doc:`details <hipsparselt:reference/data-type-support>`)
|
||||
- :doc:`hipSPARSELt <hipsparselt:reference/data-type-support>`
|
||||
- ❌
|
||||
- ❌
|
||||
- ✅
|
||||
@@ -596,7 +605,7 @@ description.
|
||||
- float32
|
||||
- float64
|
||||
*
|
||||
- hipSPARSELt (:doc:`details <hipsparselt:reference/data-type-support>`)
|
||||
- :doc:`hipSPARSELt <hipsparselt:reference/data-type-support>`
|
||||
- ❌
|
||||
- ❌
|
||||
- ❌
|
||||
|
||||
@@ -40,11 +40,13 @@ subtrees:
|
||||
title: Training
|
||||
subtrees:
|
||||
- entries:
|
||||
- file: how-to/rocm-for-ai/training/train-a-model.rst
|
||||
title: Train a model
|
||||
- file: how-to/rocm-for-ai/training/benchmark-docker/megatron-lm
|
||||
title: Train a model with Megatron-LM
|
||||
- file: how-to/rocm-for-ai/training/benchmark-docker/pytorch-training
|
||||
title: Train a model with PyTorch
|
||||
- file: how-to/rocm-for-ai/training/scale-model-training.rst
|
||||
title: Scale model training
|
||||
|
||||
|
||||
- file: how-to/rocm-for-ai/fine-tuning/index.rst
|
||||
title: Fine-tuning LLMs
|
||||
subtrees:
|
||||
@@ -152,7 +154,7 @@ subtrees:
|
||||
- entries:
|
||||
- url: https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf
|
||||
title: AMD Instinct MI200/CDNA2 ISA
|
||||
- url: https://www.amd.com/system/files/documents/amd-cdna2-white-paper.pdf
|
||||
- url: https://www.amd.com/content/dam/amd/en/documents/instinct-business-docs/white-papers/amd-cdna2-white-paper.pdf
|
||||
title: White paper
|
||||
- file: conceptual/gpu-arch/mi100.md
|
||||
title: MI100 microarchitecture
|
||||
@@ -160,7 +162,7 @@ subtrees:
|
||||
- entries:
|
||||
- url: https://www.amd.com/system/files/TechDocs/instinct-mi100-cdna1-shader-instruction-set-architecture%C2%A0.pdf
|
||||
title: AMD Instinct MI100/CDNA1 ISA
|
||||
- url: https://www.amd.com/system/files/documents/amd-cdna-whitepaper.pdf
|
||||
- url: https://www.amd.com/content/dam/amd/en/documents/instinct-business-docs/white-papers/amd-cdna-white-paper.pdf
|
||||
title: White paper
|
||||
- file: conceptual/iommu.rst
|
||||
title: Input-Output Memory Management Unit (IOMMU)
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
rocm-docs-core==1.15.0
|
||||
rocm-docs-core==1.17.0
|
||||
sphinx-reredirects
|
||||
sphinx-sitemap
|
||||
sphinxcontrib.datatemplates==0.11.0
|
||||
|
||||
@@ -1,13 +1,15 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with Python 3.10
|
||||
# This file is autogenerated by pip-compile with Python 3.11
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements.in
|
||||
# pip-compile docs/sphinx/requirements.in
|
||||
#
|
||||
accessible-pygments==0.0.5
|
||||
# via pydata-sphinx-theme
|
||||
alabaster==1.0.0
|
||||
# via sphinx
|
||||
appnope==0.1.4
|
||||
# via ipykernel
|
||||
asttokens==3.0.0
|
||||
# via stack-data
|
||||
attrs==25.1.0
|
||||
@@ -23,7 +25,7 @@ beautifulsoup4==4.12.3
|
||||
# via pydata-sphinx-theme
|
||||
breathe==4.35.0
|
||||
# via rocm-docs-core
|
||||
certifi==2024.8.30
|
||||
certifi==2024.12.14
|
||||
# via requests
|
||||
cffi==1.17.1
|
||||
# via
|
||||
@@ -37,12 +39,14 @@ click==8.1.7
|
||||
# sphinx-external-toc
|
||||
comm==0.2.2
|
||||
# via ipykernel
|
||||
cryptography==44.0.1
|
||||
cryptography==44.0.0
|
||||
# via pyjwt
|
||||
debugpy==1.8.12
|
||||
# via ipykernel
|
||||
decorator==5.1.1
|
||||
# via ipython
|
||||
defusedxml==0.7.1
|
||||
# via sphinxcontrib-datatemplates
|
||||
deprecated==1.2.15
|
||||
# via pygithub
|
||||
docutils==0.21.2
|
||||
@@ -51,11 +55,9 @@ docutils==0.21.2
|
||||
# myst-parser
|
||||
# pydata-sphinx-theme
|
||||
# sphinx
|
||||
exceptiongroup==1.2.2
|
||||
# via ipython
|
||||
executing==2.2.0
|
||||
# via stack-data
|
||||
fastjsonschema==2.20.0
|
||||
fastjsonschema==2.21.1
|
||||
# via
|
||||
# nbformat
|
||||
# rocm-docs-core
|
||||
@@ -63,8 +65,6 @@ gitdb==4.0.11
|
||||
# via gitpython
|
||||
gitpython==3.1.43
|
||||
# via rocm-docs-core
|
||||
greenlet==3.1.1
|
||||
# via sqlalchemy
|
||||
idna==3.10
|
||||
# via requests
|
||||
imagesize==1.4.1
|
||||
@@ -75,13 +75,13 @@ importlib-metadata==8.6.1
|
||||
# myst-nb
|
||||
ipykernel==6.29.5
|
||||
# via myst-nb
|
||||
ipython==8.31.0
|
||||
ipython==8.32.0
|
||||
# via
|
||||
# ipykernel
|
||||
# myst-nb
|
||||
jedi==0.19.2
|
||||
# via ipython
|
||||
jinja2==3.1.5
|
||||
jinja2==3.1.4
|
||||
# via
|
||||
# myst-parser
|
||||
# sphinx
|
||||
@@ -115,7 +115,7 @@ mdit-py-plugins==0.4.2
|
||||
# via myst-parser
|
||||
mdurl==0.1.2
|
||||
# via markdown-it-py
|
||||
myst-nb==1.1.2
|
||||
myst-nb==1.2.0
|
||||
# via rocm-docs-core
|
||||
myst-parser==4.0.0
|
||||
# via myst-nb
|
||||
@@ -142,7 +142,7 @@ platformdirs==4.3.6
|
||||
# via jupyter-core
|
||||
prompt-toolkit==3.0.50
|
||||
# via ipython
|
||||
psutil==6.1.1
|
||||
psutil==7.0.0
|
||||
# via ipykernel
|
||||
ptyprocess==0.7.0
|
||||
# via pexpect
|
||||
@@ -150,7 +150,7 @@ pure-eval==0.2.3
|
||||
# via stack-data
|
||||
pycparser==2.22
|
||||
# via cffi
|
||||
pydata-sphinx-theme==0.16.0
|
||||
pydata-sphinx-theme==0.16.1
|
||||
# via
|
||||
# rocm-docs-core
|
||||
# sphinx-book-theme
|
||||
@@ -162,7 +162,7 @@ pygments==2.18.0
|
||||
# ipython
|
||||
# pydata-sphinx-theme
|
||||
# sphinx
|
||||
pyjwt[crypto]==2.10.0
|
||||
pyjwt[crypto]==2.10.1
|
||||
# via pygithub
|
||||
pynacl==1.5.0
|
||||
# via pygithub
|
||||
@@ -175,7 +175,7 @@ pyyaml==6.0.2
|
||||
# myst-parser
|
||||
# rocm-docs-core
|
||||
# sphinx-external-toc
|
||||
pyzmq==26.2.0
|
||||
pyzmq==26.2.1
|
||||
# via
|
||||
# ipykernel
|
||||
# jupyter-client
|
||||
@@ -187,7 +187,7 @@ requests==2.32.3
|
||||
# via
|
||||
# pygithub
|
||||
# sphinx
|
||||
rocm-docs-core==1.15.0
|
||||
rocm-docs-core==1.17.0
|
||||
# via -r requirements.in
|
||||
rpds-py==0.22.3
|
||||
# via
|
||||
@@ -215,6 +215,8 @@ sphinx==8.1.3
|
||||
# sphinx-notfound-page
|
||||
# sphinx-reredirects
|
||||
# sphinx-sitemap
|
||||
# sphinxcontrib-datatemplates
|
||||
# sphinxcontrib-runcmd
|
||||
sphinx-book-theme==1.1.3
|
||||
# via rocm-docs-core
|
||||
sphinx-copybutton==0.5.2
|
||||
@@ -226,11 +228,13 @@ sphinx-external-toc==1.0.1
|
||||
sphinx-notfound-page==1.0.4
|
||||
# via rocm-docs-core
|
||||
sphinx-reredirects==0.1.5
|
||||
# via -r requirements.in
|
||||
# via -r docs/sphinx/requirements.in
|
||||
sphinx-sitemap==2.6.0
|
||||
# via -r requirements.in
|
||||
# via -r docs/sphinx/requirements.in
|
||||
sphinxcontrib-applehelp==2.0.0
|
||||
# via sphinx
|
||||
sphinxcontrib-datatemplates==0.11.0
|
||||
# via -r docs/sphinx/requirements.in
|
||||
sphinxcontrib-devhelp==2.0.0
|
||||
# via sphinx
|
||||
sphinxcontrib-htmlhelp==2.1.0
|
||||
@@ -239,16 +243,16 @@ sphinxcontrib-jsmath==1.0.1
|
||||
# via sphinx
|
||||
sphinxcontrib-qthelp==2.0.0
|
||||
# via sphinx
|
||||
sphinxcontrib-runcmd==0.2.0
|
||||
# via sphinxcontrib-datatemplates
|
||||
sphinxcontrib-serializinghtml==2.0.0
|
||||
# via sphinx
|
||||
sqlalchemy==2.0.37
|
||||
sqlalchemy==2.0.38
|
||||
# via jupyter-cache
|
||||
stack-data==0.6.3
|
||||
# via ipython
|
||||
tabulate==0.9.0
|
||||
# via jupyter-cache
|
||||
tomli==2.1.0
|
||||
# via sphinx
|
||||
tornado==6.4.2
|
||||
# via
|
||||
# ipykernel
|
||||
|
||||
102
docs/sphinx/static/css/vllm-benchmark.css
Normal file
102
docs/sphinx/static/css/vllm-benchmark.css
Normal file
@@ -0,0 +1,102 @@
|
||||
/* ------------------ Compatibility options grid ------------------ */
|
||||
html {
|
||||
--compat-border-radius: 2px;
|
||||
--compat-accent-color: var(--pst-color-primary);
|
||||
--compat-bg-color: var(--pst-color-on-background);
|
||||
--compat-fg-color: var(--pst-color-primary-text);
|
||||
--compat-head-color: var(--pst-color-surface);
|
||||
--compat-param-hover-color: var(--pst-color-link-hover);
|
||||
--compat-param-selected-color: var(--pst-color-primary);
|
||||
}
|
||||
|
||||
html[data-theme="light"] {
|
||||
--compat-border-color: var(--pst-gray-500);
|
||||
--compat-param-disabled-color: var(--pst-gray-300);
|
||||
}
|
||||
|
||||
html[data-theme="dark"] {
|
||||
--compat-border-color: var(--pst-gray-600);
|
||||
--compat-param-disabled-color: var(--pst-gray-600);
|
||||
}
|
||||
|
||||
div#vllm-benchmark-ud-params-picker.container-fluid {
|
||||
padding: 0 0 1rem 0;
|
||||
}
|
||||
|
||||
div[data-param-k="model"] {
|
||||
background-color: var(--compat-bg-color);
|
||||
padding: 2px;
|
||||
border: solid 1px var(--compat-border-color);
|
||||
font-weight: 500;
|
||||
cursor: pointer;
|
||||
}
|
||||
|
||||
div[data-param-k="model"][data-param-state="selected"] {
|
||||
background-color: var(--compat-param-selected-color);
|
||||
color: var(--compat-fg-color);
|
||||
}
|
||||
|
||||
div[data-param-k="model"][data-param-state="latest-version"] {
|
||||
background-color: var(--compat-param-selected-color);
|
||||
color: var(--compat-fg-color);
|
||||
}
|
||||
|
||||
div[data-param-k="model"][data-param-state="disabled"] {
|
||||
background-color: var(--compat-param-disabled-color);
|
||||
text-decoration: line-through;
|
||||
/* text-decoration-color: var(--pst-color-danger); */
|
||||
cursor: auto;
|
||||
}
|
||||
|
||||
div[data-param-k="model"]:not([data-param-state]):hover {
|
||||
background-color: var(--compat-param-hover-color);
|
||||
}
|
||||
|
||||
div[data-param-k="model-group"] {
|
||||
background-color: var(--compat-bg-color);
|
||||
padding: 2px;
|
||||
border: solid 1px var(--compat-border-color);
|
||||
font-weight: 500;
|
||||
cursor: pointer;
|
||||
}
|
||||
|
||||
div[data-param-k="model-group"][data-param-state="selected"] {
|
||||
background-color: var(--compat-param-selected-color);
|
||||
color: var(--compat-fg-color);
|
||||
}
|
||||
|
||||
div[data-param-k="model-group"][data-param-state="latest-version"] {
|
||||
background-color: var(--compat-param-selected-color);
|
||||
color: var(--compat-fg-color);
|
||||
}
|
||||
|
||||
div[data-param-k="model-group"][data-param-state="disabled"] {
|
||||
background-color: var(--compat-param-disabled-color);
|
||||
text-decoration: line-through;
|
||||
/* text-decoration-color: var(--pst-color-danger); */
|
||||
cursor: auto;
|
||||
}
|
||||
|
||||
div[data-param-k="model-group"]:not([data-param-state]):hover {
|
||||
background-color: var(--compat-param-hover-color);
|
||||
}
|
||||
|
||||
.model-param-head {
|
||||
background-color: var(--compat-head-color);
|
||||
padding: 0.15rem 0.15rem 0.15rem 0.67rem;
|
||||
/* margin: 2px; */
|
||||
border-right: solid 2px var(--compat-accent-color);
|
||||
font-weight: 600;
|
||||
}
|
||||
|
||||
.model-param {
|
||||
/* padding: 2px; */
|
||||
/* margin: 0 2px 0 2px; */
|
||||
/* margin: 2px; */
|
||||
border: solid 1px var(--compat-border-color);
|
||||
font-weight: 500;
|
||||
}
|
||||
|
||||
.hidden {
|
||||
display: none !important;
|
||||
}
|
||||
@@ -68,85 +68,6 @@ set_address_sanitizer_off() {
|
||||
export LDFLAGS=""
|
||||
}
|
||||
|
||||
build_miopen_ckProf() {
|
||||
ENABLE_ADDRESS_SANITIZER=false
|
||||
echo "Start Building Composable Kernel Profiler"
|
||||
if [ "${ENABLE_ADDRESS_SANITIZER}" == "true" ]; then
|
||||
set_asan_env_vars
|
||||
set_address_sanitizer_on
|
||||
else
|
||||
unset_asan_env_vars
|
||||
set_address_sanitizer_off
|
||||
fi
|
||||
|
||||
cd $COMPONENT_SRC
|
||||
cd "$BUILD_DIR"
|
||||
rm -rf *
|
||||
|
||||
architectures='gfx10 gfx11 gfx90 gfx94'
|
||||
if [ -n "$GPU_ARCHS" ]; then
|
||||
architectures=$(echo ${GPU_ARCHS} | awk -F';' '{for(i=1;i<=NF;i++) a[substr($i,1,5)]} END{for(i in a) printf i" "}')
|
||||
fi
|
||||
|
||||
for arch in ${architectures}
|
||||
do
|
||||
if [ "${ASAN_CMAKE_PARAMS}" == "true" ] ; then
|
||||
cmake -DBUILD_DEV=OFF \
|
||||
-DCMAKE_PREFIX_PATH="${ROCM_PATH%-*}/lib/cmake;${ROCM_PATH%-*}/$ASAN_LIBDIR;${ROCM_PATH%-*}/llvm;${ROCM_PATH%-*}" \
|
||||
-DCMAKE_BUILD_TYPE=${BUILD_TYPE:-'RelWithDebInfo'} \
|
||||
-DCMAKE_SHARED_LINKER_FLAGS_INIT="-Wl,--enable-new-dtags,--rpath,$ROCM_ASAN_LIB_RPATH" \
|
||||
-DCMAKE_EXE_LINKER_FLAGS_INIT="-Wl,--enable-new-dtags,--rpath,$ROCM_ASAN_EXE_RPATH" \
|
||||
-DCMAKE_VERBOSE_MAKEFILE=1 \
|
||||
-DCMAKE_INSTALL_RPATH_USE_LINK_PATH=FALSE \
|
||||
-DCMAKE_INSTALL_PREFIX="${ROCM_PATH}" \
|
||||
-DCMAKE_PACKAGING_INSTALL_PREFIX="${ROCM_PATH}" \
|
||||
-DBUILD_FILE_REORG_BACKWARD_COMPATIBILITY=OFF \
|
||||
-DROCM_SYMLINK_LIBS=OFF \
|
||||
-DCPACK_PACKAGING_INSTALL_PREFIX="${ROCM_PATH}" \
|
||||
-DROCM_DISABLE_LDCONFIG=ON \
|
||||
-DROCM_PATH="${ROCM_PATH}" \
|
||||
-DCPACK_GENERATOR="${PKGTYPE^^}" \
|
||||
-DCMAKE_CXX_COMPILER="${ROCM_PATH}/llvm/bin/clang++" \
|
||||
-DCMAKE_C_COMPILER="${ROCM_PATH}/llvm/bin/clang" \
|
||||
${LAUNCHER_FLAGS} \
|
||||
-DPROFILER_ONLY=ON \
|
||||
-DENABLE_ASAN_PACKAGING=true \
|
||||
-DGPU_ARCH="${arch}" \
|
||||
"$COMPONENT_SRC"
|
||||
else
|
||||
cmake -DBUILD_DEV=OFF \
|
||||
-DCMAKE_PREFIX_PATH="${ROCM_PATH%-*}" \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DCMAKE_SHARED_LINKER_FLAGS_INIT='-Wl,--enable-new-dtags,--rpath,$ORIGIN' \
|
||||
-DCMAKE_EXE_LINKER_FLAGS_INIT='-Wl,--enable-new-dtags,--rpath,$ORIGIN/../lib' \
|
||||
-DCMAKE_VERBOSE_MAKEFILE=1 \
|
||||
-DCMAKE_INSTALL_RPATH_USE_LINK_PATH=FALSE \
|
||||
-DCMAKE_INSTALL_PREFIX="${ROCM_PATH}" \
|
||||
-DCMAKE_PACKAGING_INSTALL_PREFIX="${ROCM_PATH}" \
|
||||
-DBUILD_FILE_REORG_BACKWARD_COMPATIBILITY=OFF \
|
||||
-DROCM_SYMLINK_LIBS=OFF \
|
||||
-DCPACK_PACKAGING_INSTALL_PREFIX="${ROCM_PATH}" \
|
||||
-DROCM_DISABLE_LDCONFIG=ON \
|
||||
-DROCM_PATH="${ROCM_PATH}" \
|
||||
-DCPACK_GENERATOR="${PKGTYPE^^}" \
|
||||
-DCMAKE_CXX_COMPILER="${ROCM_PATH}/llvm/bin/clang++" \
|
||||
-DCMAKE_C_COMPILER="${ROCM_PATH}/llvm/bin/clang" \
|
||||
${LAUNCHER_FLAGS} \
|
||||
-DPROFILER_ONLY=ON \
|
||||
-DGPU_ARCH="${arch}" \
|
||||
"$COMPONENT_SRC"
|
||||
fi
|
||||
|
||||
cmake --build . -- -j${PROC} package
|
||||
cp ./*ckprofiler*.${PKGTYPE} $PACKAGE_DIR
|
||||
rm -rf *
|
||||
done
|
||||
rm -rf _CPack_Packages/ && find -name '*.o' -delete
|
||||
|
||||
echo "Finished building Composable Kernel"
|
||||
show_build_cache_stats
|
||||
}
|
||||
|
||||
clean_miopen_ck() {
|
||||
echo "Cleaning MIOpen-CK build directory: ${BUILD_DIR} ${PACKAGE_DIR}"
|
||||
rm -rf "$BUILD_DIR" "$PACKAGE_DIR"
|
||||
|
||||
@@ -42,7 +42,6 @@ DEB_PATH="$(getDebPath $PROJ_NAME)"
|
||||
RPM_PATH="$(getRpmPath $PROJ_NAME)"
|
||||
INSTALL_PATH="${ROCM_INSTALL_PATH}/lib/llvm"
|
||||
LLVM_ROOT_LCL="${LLVM_ROOT}"
|
||||
ROCM_WHEEL_DIR="${BUILD_PATH}/_wheel"
|
||||
|
||||
TARGET="all"
|
||||
MAKEOPTS="$DASH_JAY"
|
||||
@@ -150,7 +149,6 @@ ENABLE_RUNTIMES="$ENABLE_RUNTIMES;libcxx;libcxxabi"
|
||||
BOOTSTRAPPING_BUILD_LIBCXX=1
|
||||
|
||||
clean_lightning() {
|
||||
rm -rf "$ROCM_WHEEL_DIR"
|
||||
rm -rf "$BUILD_PATH"
|
||||
rm -rf "$DEB_PATH"
|
||||
rm -rf "$RPM_PATH"
|
||||
@@ -332,15 +330,6 @@ build_lightning() {
|
||||
echo "End Workaround for race condition"
|
||||
cmake --build . -- $MAKEOPTS
|
||||
|
||||
case "$DISTRO_ID" in
|
||||
(rhel*|centos*)
|
||||
RHEL_BUILD=1
|
||||
;;
|
||||
(*)
|
||||
RHEL_BUILD=0
|
||||
;;
|
||||
esac
|
||||
|
||||
if [ $SKIP_LIT_TESTS -eq 0 ]; then
|
||||
if [ $RHEL_BUILD -eq 1 ]; then
|
||||
cmake --build . -- $MAKEOPTS check-lld check-mlir
|
||||
@@ -1158,9 +1147,4 @@ case $TARGET in
|
||||
(*) die "Invalid target $TARGET" ;;
|
||||
esac
|
||||
|
||||
if [[ $WHEEL_PACKAGE == true ]]; then
|
||||
echo "Wheel Package build started !!!!"
|
||||
create_wheel_package
|
||||
fi
|
||||
|
||||
echo "Operation complete"
|
||||
|
||||
@@ -1,171 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
source "$(dirname "${BASH_SOURCE}")/compute_utils.sh"
|
||||
|
||||
printUsage() {
|
||||
echo
|
||||
echo "Usage: ${BASH_SOURCE##*/} [options ...]"
|
||||
echo
|
||||
echo "Options:"
|
||||
echo " -c, --clean Clean output and delete all intermediate work"
|
||||
echo " -s, --static Build static lib (.a). build instead of dynamic/shared(.so) "
|
||||
echo " -p, --package <type> Specify packaging format"
|
||||
echo " -r, --release Make a release build instead of a debug build"
|
||||
echo " -a, --address_sanitizer Enable address sanitizer"
|
||||
echo " -o, --outdir <pkg_type> Print path of output directory containing packages of
|
||||
type referred to by pkg_type"
|
||||
echo " -w, --wheel Creates python wheel package of omniperf.
|
||||
It needs to be used along with -r option"
|
||||
echo " -h, --help Prints this help"
|
||||
echo
|
||||
echo "Possible values for <type>:"
|
||||
echo " deb -> Debian format (default)"
|
||||
echo " rpm -> RPM format"
|
||||
echo
|
||||
|
||||
return 0
|
||||
}
|
||||
|
||||
API_NAME="omniperf"
|
||||
PROJ_NAME="$API_NAME"
|
||||
LIB_NAME="lib${API_NAME}"
|
||||
TARGET="build"
|
||||
MAKETARGET="deb"
|
||||
PACKAGE_ROOT="$(getPackageRoot)"
|
||||
PACKAGE_LIB="$(getLibPath)"
|
||||
BUILD_DIR="$(getBuildPath $API_NAME)"
|
||||
PACKAGE_DEB="$(getPackageRoot)/deb/$API_NAME"
|
||||
PACKAGE_RPM="$(getPackageRoot)/rpm/$API_NAME"
|
||||
ROCM_WHEEL_DIR="${BUILD_DIR}/_wheel"
|
||||
BUILD_TYPE="Debug"
|
||||
MAKE_OPTS="$DASH_JAY -C $BUILD_DIR"
|
||||
SHARED_LIBS="ON"
|
||||
CLEAN_OR_OUT=0;
|
||||
MAKETARGET="deb"
|
||||
PKGTYPE="deb"
|
||||
WHEEL_PACKAGE=false
|
||||
|
||||
|
||||
#parse the arguments
|
||||
VALID_STR=$(getopt -o hcraso:p:w --long help,clean,release,static,address_sanitizer,outdir:,package:,wheel -- "$@")
|
||||
eval set -- "$VALID_STR"
|
||||
|
||||
while true ;
|
||||
do
|
||||
case "$1" in
|
||||
-h | --help)
|
||||
printUsage ; exit 0;;
|
||||
-c | --clean)
|
||||
TARGET="clean" ; ((CLEAN_OR_OUT|=1)) ; shift ;;
|
||||
-r | --release)
|
||||
BUILD_TYPE="Release" ; shift ;;
|
||||
-a | --address_sanitizer)
|
||||
set_asan_env_vars
|
||||
set_address_sanitizer_on ; shift ;;
|
||||
-s | --static)
|
||||
SHARED_LIBS="OFF" ; shift ;;
|
||||
-o | --outdir)
|
||||
TARGET="outdir"; PKGTYPE=$2 ; OUT_DIR_SPECIFIED=1 ; ((CLEAN_OR_OUT|=2)) ; shift 2 ;;
|
||||
-p | --package)
|
||||
MAKETARGET="$2" ; shift 2 ;;
|
||||
-w | --wheel)
|
||||
WHEEL_PACKAGE=true ; shift ;;
|
||||
--) shift; break;; # end delimiter
|
||||
*)
|
||||
echo " This should never come but just incase : UNEXPECTED ERROR Parm : [$1] ">&2 ; exit 20;;
|
||||
esac
|
||||
|
||||
done
|
||||
|
||||
RET_CONFLICT=1
|
||||
check_conflicting_options "$CLEAN_OR_OUT" "$PKGTYPE" "$MAKETARGET"
|
||||
if [ $RET_CONFLICT -ge 30 ]; then
|
||||
print_vars "$API_NAME" "$TARGET" "$BUILD_TYPE" "$SHARED_LIBS" "$CLEAN_OR_OUT" "$PKGTYPE" "$MAKETARGET"
|
||||
exit $RET_CONFLICT
|
||||
fi
|
||||
|
||||
clean() {
|
||||
echo "Cleaning $PROJ_NAME"
|
||||
rm -rf "$ROCM_WHEEL_DIR"
|
||||
rm -rf "$BUILD_DIR"
|
||||
rm -rf "$PACKAGE_DEB"
|
||||
rm -rf "$PACKAGE_RPM"
|
||||
rm -rf "$PACKAGE_ROOT/${PROJ_NAME:?}"
|
||||
rm -rf "$PACKAGE_LIB/${LIB_NAME:?}"*
|
||||
}
|
||||
|
||||
build() {
|
||||
echo "Building $PROJ_NAME"
|
||||
if [ "$DISTRO_ID" = centos-7 ]; then
|
||||
echo "Skip make and uploading packages for Omniperf on Centos7 distro, due to python dependency"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
if [ ! -d "$BUILD_DIR" ]; then
|
||||
mkdir -p "$BUILD_DIR"
|
||||
pushd "$BUILD_DIR" || exit
|
||||
|
||||
echo "ROCm CMake Params: $(rocm_cmake_params)"
|
||||
echo "ROCm Common CMake Params: $(rocm_common_cmake_params)"
|
||||
|
||||
print_lib_type $SHARED_LIBS
|
||||
cmake \
|
||||
$(rocm_cmake_params) \
|
||||
$(rocm_common_cmake_params) \
|
||||
-DCHECK_PYTHON_DEPS=NO \
|
||||
-DPYTHON_DEPS=${BUILD_DIR}/python-libs \
|
||||
-DMOD_INSTALL_PATH=${BUILD_DIR}/modulefiles \
|
||||
"$OMNIPERF_ROOT"
|
||||
fi
|
||||
|
||||
make $MAKE_OPTS
|
||||
make $MAKE_OPTS install
|
||||
make $MAKE_OPTS package
|
||||
|
||||
copy_if DEB "${CPACKGEN:-"DEB;RPM"}" "$PACKAGE_DEB" "$BUILD_DIR/${API_NAME}"*.deb
|
||||
copy_if RPM "${CPACKGEN:-"DEB;RPM"}" "$PACKAGE_RPM" "$BUILD_DIR/${API_NAME}"*.rpm
|
||||
}
|
||||
|
||||
create_wheel_package() {
|
||||
echo "Creating Omniperf wheel package"
|
||||
|
||||
# Copy the setup.py generator to build folder
|
||||
mkdir -p "$ROCM_WHEEL_DIR"
|
||||
cp -f "$SCRIPT_ROOT"/generate_setup_py.py "$ROCM_WHEEL_DIR"
|
||||
cp -f "$SCRIPT_ROOT"/repackage_wheel.sh "$ROCM_WHEEL_DIR"
|
||||
cd "$ROCM_WHEEL_DIR" || exit
|
||||
|
||||
# Currently only supports python3.6
|
||||
./repackage_wheel.sh "$BUILD_DIR"/*.rpm python3.6
|
||||
|
||||
# Copy the wheel created to RPM folder which will be uploaded to artifactory
|
||||
copy_if WHL "WHL" "$PACKAGE_RPM" "$ROCM_WHEEL_DIR"/dist/*.whl
|
||||
}
|
||||
|
||||
print_output_directory() {
|
||||
case ${PKGTYPE} in
|
||||
("deb")
|
||||
echo "${PACKAGE_DEB}";;
|
||||
("rpm")
|
||||
echo "${PACKAGE_RPM}";;
|
||||
(*)
|
||||
echo "Invalid package type \"${PKGTYPE}\" provided for -o" >&2; exit 1;;
|
||||
esac
|
||||
exit
|
||||
}
|
||||
|
||||
verifyEnvSetup
|
||||
|
||||
case "$TARGET" in
|
||||
(clean) clean ;;
|
||||
(build) build ;;
|
||||
(outdir) print_output_directory ;;
|
||||
(*) die "Invalid target $TARGET" ;;
|
||||
esac
|
||||
|
||||
if [[ $WHEEL_PACKAGE == true ]]; then
|
||||
echo "Wheel Package build started !!!!"
|
||||
create_wheel_package
|
||||
fi
|
||||
|
||||
echo "Operation complete"
|
||||
@@ -1,191 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
source "$(dirname "${BASH_SOURCE}")/compute_utils.sh"
|
||||
|
||||
printUsage() {
|
||||
echo
|
||||
echo "Usage: ${BASH_SOURCE##*/} [options ...]"
|
||||
echo
|
||||
echo "Options:"
|
||||
echo " -c, --clean Clean output and delete all intermediate work"
|
||||
echo " -s, --static Build static lib (.a). build instead of dynamic/shared(.so) "
|
||||
echo " -p, --package <type> Specify packaging format"
|
||||
echo " -r, --release Make a release build instead of a debug build"
|
||||
echo " -a, --address_sanitizer Enable address sanitizer"
|
||||
echo " -o, --outdir <pkg_type> Print path of output directory containing packages of
|
||||
type referred to by pkg_type"
|
||||
echo " -w, --wheel Creates python wheel package of omnitrace.
|
||||
It needs to be used along with -r option"
|
||||
echo " -h, --help Prints this help"
|
||||
echo
|
||||
echo "Possible values for <type>:"
|
||||
echo " deb -> Debian format (default)"
|
||||
echo " rpm -> RPM format"
|
||||
echo
|
||||
|
||||
return 0
|
||||
}
|
||||
|
||||
API_NAME="omnitrace"
|
||||
PROJ_NAME="$API_NAME"
|
||||
LIB_NAME="lib${API_NAME}"
|
||||
TARGET="build"
|
||||
MAKETARGET="deb"
|
||||
PACKAGE_ROOT="$(getPackageRoot)"
|
||||
PACKAGE_LIB="$(getLibPath)"
|
||||
BUILD_DIR="$(getBuildPath $API_NAME)"
|
||||
PACKAGE_DEB="$(getPackageRoot)/deb/$API_NAME"
|
||||
PACKAGE_RPM="$(getPackageRoot)/rpm/$API_NAME"
|
||||
BUILD_TYPE="Debug"
|
||||
MAKE_OPTS="-j 8"
|
||||
SHARED_LIBS="ON"
|
||||
CLEAN_OR_OUT=0
|
||||
MAKETARGET="deb"
|
||||
PKGTYPE="deb"
|
||||
ASAN=0
|
||||
|
||||
#parse the arguments
|
||||
VALID_STR=$(getopt -o hcraso:p:w --long help,clean,release,address_sanitizer,static,outdir:,package:,wheel -- "$@")
|
||||
eval set -- "$VALID_STR"
|
||||
|
||||
while true; do
|
||||
case "$1" in
|
||||
-h | --help)
|
||||
printUsage
|
||||
exit 0
|
||||
;;
|
||||
-c | --clean)
|
||||
TARGET="clean"
|
||||
((CLEAN_OR_OUT |= 1))
|
||||
shift
|
||||
;;
|
||||
-r | --release)
|
||||
BUILD_TYPE="RelWithDebInfo"
|
||||
shift
|
||||
;;
|
||||
-a | --address_sanitizer)
|
||||
ack_and_ignore_asan
|
||||
|
||||
ASAN=1
|
||||
shift
|
||||
;;
|
||||
-s | --static)
|
||||
SHARED_LIBS="OFF"
|
||||
shift
|
||||
;;
|
||||
-o | --outdir)
|
||||
TARGET="outdir"
|
||||
PKGTYPE=$2
|
||||
((CLEAN_OR_OUT |= 2))
|
||||
shift 2
|
||||
;;
|
||||
-p | --package)
|
||||
MAKETARGET="$2"
|
||||
shift 2
|
||||
;;
|
||||
-w | --wheel)
|
||||
echo "omnitrace: wheel build option accepted and ignored"
|
||||
shift
|
||||
;;
|
||||
--)
|
||||
shift
|
||||
break
|
||||
;;
|
||||
*)
|
||||
echo " This should never come but just incase : UNEXPECTED ERROR Parm : [$1] " >&2
|
||||
exit 20
|
||||
;;
|
||||
esac
|
||||
|
||||
done
|
||||
|
||||
RET_CONFLICT=1
|
||||
check_conflicting_options $CLEAN_OR_OUT $PKGTYPE $MAKETARGET
|
||||
if [ $RET_CONFLICT -ge 30 ]; then
|
||||
print_vars $API_NAME $TARGET $BUILD_TYPE $SHARED_LIBS $CLEAN_OR_OUT $PKGTYPE $MAKETARGET
|
||||
exit $RET_CONFLICT
|
||||
fi
|
||||
|
||||
clean() {
|
||||
echo "Cleaning $PROJ_NAME"
|
||||
rm -rf "$BUILD_DIR"
|
||||
rm -rf "$PACKAGE_DEB"
|
||||
rm -rf "$PACKAGE_RPM"
|
||||
rm -rf "$PACKAGE_ROOT/${PROJ_NAME:?}"
|
||||
rm -rf "$PACKAGE_LIB/${LIB_NAME:?}"*
|
||||
}
|
||||
|
||||
build_omnitrace() {
|
||||
echo "Building $PROJ_NAME"
|
||||
if [ "$DISTRO_ID" = "mariner-2.0" ] || [ "$DISTRO_ID" = "ubuntu-24.04" ] || [ "$DISTRO_ID" = "azurelinux-3.0" ]; then
|
||||
echo "Skip make and uploading packages for Omnitrace on \"${DISTRO_ID}\" distro"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
if [ $ASAN == 1 ]; then
|
||||
echo "Skip make and uploading packages for Omnitrace on ASAN build"
|
||||
exit 0
|
||||
fi
|
||||
if [ ! -d "$BUILD_DIR" ]; then
|
||||
mkdir -p "$BUILD_DIR"
|
||||
echo "Created build directory: $BUILD_DIR"
|
||||
fi
|
||||
|
||||
echo "Build directory: $BUILD_DIR"
|
||||
pushd "$BUILD_DIR" || exit
|
||||
print_lib_type $SHARED_LIBS
|
||||
|
||||
echo "ROCm CMake Params: $(rocm_cmake_params)"
|
||||
echo "ROCm Common CMake Params: $(rocm_common_cmake_params)"
|
||||
|
||||
|
||||
if [ $ASAN == 1 ]; then
|
||||
echo "Address Sanitizer path"
|
||||
|
||||
else
|
||||
cmake \
|
||||
$(rocm_cmake_params) \
|
||||
$(rocm_common_cmake_params) \
|
||||
-DOMNITRACE_BUILD_{LIBUNWIND,DYNINST}=ON \
|
||||
-DDYNINST_BUILD_{TBB,BOOST,ELFUTILS,LIBIBERTY}=ON \
|
||||
"$OMNITRACE_ROOT"
|
||||
fi
|
||||
|
||||
|
||||
popd || exit
|
||||
|
||||
echo "Make Options: $MAKE_OPTS"
|
||||
cmake --build "$BUILD_DIR" --target all -- $MAKE_OPTS
|
||||
cmake --build "$BUILD_DIR" --target install -- $MAKE_OPTS
|
||||
cmake --build "$BUILD_DIR" --target package -- $MAKE_OPTS
|
||||
|
||||
copy_if DEB "${CPACKGEN:-"DEB;RPM"}" "$PACKAGE_DEB" "$BUILD_DIR/${API_NAME}"*.deb
|
||||
copy_if RPM "${CPACKGEN:-"DEB;RPM"}" "$PACKAGE_RPM" "$BUILD_DIR/${API_NAME}"*.rpm
|
||||
}
|
||||
|
||||
print_output_directory() {
|
||||
case ${PKGTYPE} in
|
||||
"deb")
|
||||
echo "${PACKAGE_DEB}"
|
||||
;;
|
||||
"rpm")
|
||||
echo "${PACKAGE_RPM}"
|
||||
;;
|
||||
*)
|
||||
echo "Invalid package type \"${PKGTYPE}\" provided for -o" >&2
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
exit
|
||||
}
|
||||
|
||||
verifyEnvSetup
|
||||
|
||||
case "$TARGET" in
|
||||
clean) clean ;;
|
||||
build) build_omnitrace ;;
|
||||
outdir) print_output_directory ;;
|
||||
*) die "Invalid target $TARGET" ;;
|
||||
esac
|
||||
|
||||
echo "Operation complete"
|
||||
@@ -1,141 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
source "$(dirname "${BASH_SOURCE}")/compute_utils.sh"
|
||||
PROJ_NAME=OpenCL-ICD-Loader
|
||||
TARGET="build"
|
||||
MAKEOPTS="$DASH_JAY"
|
||||
BUILD_TYPE="Debug"
|
||||
PACKAGE_ROOT="$(getPackageRoot)"
|
||||
PACKAGE_DEB="$PACKAGE_ROOT/deb/${PROJ_NAME,,}"
|
||||
PACKAGE_RPM="$PACKAGE_ROOT/rpm/${PROJ_NAME,,}"
|
||||
CLEAN_OR_OUT=0;
|
||||
PKGTYPE="deb"
|
||||
MAKETARGET="deb"
|
||||
API_NAME="rocm-opencl-icd-loader"
|
||||
|
||||
printUsage() {
|
||||
echo
|
||||
echo "Usage: $(basename "${BASH_SOURCE}") [options ...]"
|
||||
echo
|
||||
echo "Options:"
|
||||
echo " -c, --clean Clean output and delete all intermediate work"
|
||||
echo " -p, --package <type> Specify packaging format"
|
||||
echo " -r, --release Make a release build instead of a debug build"
|
||||
echo " -h, --help Prints this help"
|
||||
echo " -o, --outdir Print path of output directory containing packages"
|
||||
echo " -s, --static Component/Build does not support static builds just accepting this param & ignore. No effect of the param on this build"
|
||||
echo
|
||||
echo "Possible values for <type>:"
|
||||
echo " deb -> Debian format (default)"
|
||||
echo " rpm -> RPM format"
|
||||
echo
|
||||
return 0
|
||||
}
|
||||
|
||||
RET_CONFLICT=1
|
||||
check_conflicting_options $CLEAN_OR_OUT $PKGTYPE $MAKETARGET
|
||||
if [ $RET_CONFLICT -ge 30 ]; then
|
||||
print_vars $TARGET $BUILD_TYPE $CLEAN_OR_OUT $PKGTYPE $MAKETARGET
|
||||
exit $RET_CONFLICT
|
||||
fi
|
||||
|
||||
clean_opencl_icd_loader() {
|
||||
echo "Cleaning $PROJ_NAME"
|
||||
rm -rf "$PACKAGE_DEB"
|
||||
rm -rf "$PACKAGE_RPM"
|
||||
rm -rf "$PACKAGE_ROOT/${PROJ_NAME,,}"
|
||||
}
|
||||
|
||||
copy_pkg_files_to_rocm() {
|
||||
local comp_folder=$1
|
||||
local comp_pkg_name=$2
|
||||
|
||||
cd "${OUT_DIR}/${PKGTYPE}/${comp_folder}"|| exit 2
|
||||
if [ "${PKGTYPE}" = 'deb' ]; then
|
||||
dpkg-deb -x ${comp_pkg_name}_*.deb pkg/
|
||||
else
|
||||
mkdir pkg && pushd pkg/ || exit 2
|
||||
if [[ "${comp_pkg_name}" != *-dev* ]]; then
|
||||
rpm2cpio ../${comp_pkg_name}-*.rpm | cpio -idmv
|
||||
else
|
||||
rpm2cpio ../${comp_pkg_name}el-*.rpm | cpio -idmv
|
||||
fi
|
||||
popd || exit 2
|
||||
fi
|
||||
ls ./pkg -alt
|
||||
cp -r ./pkg/*/rocm*/* "${ROCM_PATH}" || exit 2
|
||||
rm -rf pkg/
|
||||
}
|
||||
|
||||
build_opencl_icd_loader() {
|
||||
echo "Downloading $PROJ_NAME" package
|
||||
if [ "$DISTRO_NAME" = ubuntu ]; then
|
||||
mkdir -p "$PACKAGE_DEB"
|
||||
local rocm_ver=${ROCM_VERSION}
|
||||
if [ ${ROCM_VERSION##*.} = 0 ]; then
|
||||
rocm_ver=${ROCM_VERSION%.*}
|
||||
fi
|
||||
local url="https://repo.radeon.com/rocm/apt/${rocm_ver}/pool/main/r/${API_NAME}/"
|
||||
local package
|
||||
package=$(curl -s "$url" | grep -Po 'href="\K[^"]*' | grep "${DISTRO_RELEASE}" | head -n 1)
|
||||
|
||||
if [ -z "$package" ]; then
|
||||
echo "No package found for Ubuntu version $DISTRO_RELEASE"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
wget -t3 -P "$PACKAGE_DEB" "${url}${package}"
|
||||
copy_pkg_files_to_rocm ${PROJ_NAME,,} ${API_NAME}
|
||||
else
|
||||
echo "$DISTRO_ID is not supported..."
|
||||
exit 2
|
||||
fi
|
||||
|
||||
echo "Installing $PROJ_NAME" package
|
||||
}
|
||||
|
||||
print_output_directory() {
|
||||
case ${PKGTYPE} in
|
||||
("deb")
|
||||
echo ${PACKAGE_DEB};;
|
||||
("rpm")
|
||||
echo ${PACKAGE_RPM};;
|
||||
(*)
|
||||
echo "Invalid package type \"${PKGTYPE}\" provided for -o" >&2; exit 1;;
|
||||
esac
|
||||
exit
|
||||
}
|
||||
|
||||
VALID_STR=`getopt -o hcraswlo:p: --long help,clean,release,outdir:,package: -- "$@"`
|
||||
eval set -- "$VALID_STR"
|
||||
while true ;
|
||||
do
|
||||
case "$1" in
|
||||
(-c | --clean )
|
||||
TARGET="clean" ; ((CLEAN_OR_OUT|=1)) ; shift ;;
|
||||
(-r | --release )
|
||||
BUILD_TYPE="RelWithDebInfo" ; shift ;;
|
||||
(-h | --help )
|
||||
printUsage ; exit 0 ;;
|
||||
(-a | --address_sanitizer)
|
||||
ack_and_ignore_asan ; shift ;;
|
||||
(-o | --outdir)
|
||||
TARGET="outdir"; PKGTYPE=$2 ; OUT_DIR_SPECIFIED=1 ; ((CLEAN_OR_OUT|=2)) ; shift 2 ;;
|
||||
(-p | --package)
|
||||
MAKETARGET="$2" ; shift 2;;
|
||||
(-s | --static)
|
||||
echo "-s parameter accepted but ignored" ; shift ;;
|
||||
--) shift; break;;
|
||||
(*)
|
||||
echo " This should never come but just incase : UNEXPECTED ERROR Parm : [$1] ">&2 ; exit 20;;
|
||||
esac
|
||||
done
|
||||
|
||||
case $TARGET in
|
||||
(clean) clean_opencl_icd_loader ;;
|
||||
(build) build_opencl_icd_loader ;;
|
||||
(outdir) print_output_directory ;;
|
||||
(*) die "Invalid target $TARGET" ;;
|
||||
esac
|
||||
|
||||
echo "Operation complete"
|
||||
@@ -32,7 +32,6 @@ ROCM_CMAKE_BUILD_DIR="$(getBuildPath rocm-cmake)"
|
||||
ROCM_CMAKE_BUILD_DIR="$(getBuildPath rocm-cmake)"
|
||||
ROCM_CMAKE_PACKAGE_DEB="$(getPackageRoot)/deb/rocm-cmake"
|
||||
ROCM_CMAKE_PACKAGE_RPM="$(getPackageRoot)/rpm/rocm-cmake"
|
||||
ROCM_WHEEL_DIR="${ROCM_CMAKE_BUILD_DIR}/_wheel"
|
||||
ROCM_CMAKE_BUILD_TYPE="debug"
|
||||
BUILD_TYPE="Debug"
|
||||
SHARED_LIBS="ON"
|
||||
@@ -56,8 +55,6 @@ do
|
||||
ack_and_ignore_asan ; shift ;;
|
||||
(-s | --static)
|
||||
SHARED_LIBS="OFF" ; shift ;;
|
||||
(-w | --wheel)
|
||||
WHEEL_PACKAGE=true ; shift ;;
|
||||
(-o | --outdir)
|
||||
TARGET="outdir"; PKGTYPE=$2 ; OUT_DIR_SPECIFIED=1 ; ((CLEAN_OR_OUT|=2)) ; shift 2 ;;
|
||||
(-p | --package)
|
||||
@@ -78,7 +75,6 @@ fi
|
||||
|
||||
|
||||
clean_rocm_cmake() {
|
||||
rm -rf "$ROCM_WHEEL_DIR"
|
||||
rm -rf $ROCM_CMAKE_BUILD_DIR
|
||||
rm -rf $ROCM_CMAKE_PACKAGE_DEB
|
||||
rm -rf $ROCM_CMAKE_PACKAGE_RPM
|
||||
@@ -106,19 +102,6 @@ build_rocm_cmake() {
|
||||
copy_if RPM "${CPACKGEN:-"DEB;RPM"}" "$ROCM_CMAKE_PACKAGE_RPM" $ROCM_CMAKE_BUILD_DIR/rocm-cmake*.rpm
|
||||
}
|
||||
|
||||
create_wheel_package() {
|
||||
echo "Creating rocm-cmake wheel package"
|
||||
# Copy the setup.py generator to build folder
|
||||
mkdir -p $ROCM_WHEEL_DIR
|
||||
cp -f $SCRIPT_ROOT/generate_setup_py.py $ROCM_WHEEL_DIR
|
||||
cp -f $SCRIPT_ROOT/repackage_wheel.sh $ROCM_WHEEL_DIR
|
||||
cd $ROCM_WHEEL_DIR
|
||||
# Currently only supports python3.6
|
||||
./repackage_wheel.sh $ROCM_CMAKE_BUILD_DIR/rocm-cmake*.rpm python3.6
|
||||
# Copy the wheel created to RPM folder which will be uploaded to artifactory
|
||||
copy_if WHL "WHL" "$ROCM_CMAKE_PACKAGE_RPM" "$ROCM_WHEEL_DIR"/dist/*.whl
|
||||
}
|
||||
|
||||
print_output_directory() {
|
||||
case ${PKGTYPE} in
|
||||
("deb")
|
||||
@@ -138,9 +121,4 @@ case $TARGET in
|
||||
(*) die "Invalid target $TARGET" ;;
|
||||
esac
|
||||
|
||||
if [[ $WHEEL_PACKAGE == true ]]; then
|
||||
echo "Wheel Package build started !!!!"
|
||||
create_wheel_package
|
||||
fi
|
||||
|
||||
echo "Operation complete"
|
||||
|
||||
@@ -7,7 +7,6 @@ bison
|
||||
bridge-utils
|
||||
build-essential
|
||||
bzip2
|
||||
ccache
|
||||
check
|
||||
chrpath
|
||||
cifs-utils
|
||||
@@ -121,11 +120,9 @@ python3-yaml
|
||||
python3.8-dev
|
||||
re2c
|
||||
redis-tools
|
||||
# Eventually we should be able to remove rpm for debian builds.
|
||||
rpm
|
||||
rsync
|
||||
ssh
|
||||
# This makes life more pleasent inside the container
|
||||
strace
|
||||
sudo
|
||||
systemtap-sdt-dev
|
||||
|
||||
@@ -1,285 +0,0 @@
|
||||
#! /usr/bin/bash
|
||||
|
||||
set -x
|
||||
|
||||
apt-get -y update
|
||||
DEBIAN_FRONTEND=noninteractive DEBCONF_NONINTERACTIVE_SEEN=true apt-get install --no-install-recommends -y $(sed 's/#.*//' /tmp/packages)
|
||||
apt-get clean
|
||||
rm -rf /var/cache/apt/ /var/lib/apt/lists/* /etc/apt/apt.conf.d/01proxy
|
||||
|
||||
#Install 2.17.1 version of git as we are seeing issues with 2.25 , where it was not allowing to add git submodules if the user is different for parent git directory
|
||||
curl -o git.tar.gz https://cdn.kernel.org/pub/software/scm/git/git-2.17.1.tar.gz
|
||||
tar -zxf git.tar.gz
|
||||
cd git-*
|
||||
make prefix=/usr/local all
|
||||
make prefix=/usr/local install
|
||||
git --version
|
||||
|
||||
#install argparse and CppHeaderParser python modules for roctracer and rocprofiler
|
||||
#install rocm-docs-core for the docs-as-code project. Only needed on one OS
|
||||
# CppHeader needs setuptools. setuptools needs wheel.
|
||||
# Looks like I need them as seperate commands
|
||||
# Sigh, install both python2 and python 3 version
|
||||
pip3 install --no-cache-dir setuptools wheel tox
|
||||
pip3 install --no-cache-dir CppHeaderParser argparse requests lxml barectf recommonmark jinja2==3.0.0 websockets matplotlib numpy scipy minimal msgpack pytest sphinx joblib PyYAML rocm-docs-core cmake==3.25.2 pandas myst-parser
|
||||
|
||||
# Allow sudo for everyone user
|
||||
echo 'ALL ALL=(ALL) NOPASSWD:ALL' > /etc/sudoers.d/everyone
|
||||
|
||||
# Install OCaml packages to build LLVM's OCaml bindings to be used in lightning compiler test pipeline
|
||||
wget -nv https://sourceforge.net/projects/opam.mirror/files/2.1.4/opam-2.1.4-x86_64-linux -O /usr/local/bin/opam
|
||||
chmod +x /usr/local/bin/opam
|
||||
opam init --yes --disable-sandboxing
|
||||
opam install ctypes --yes
|
||||
|
||||
# Install and modify git-repo (#!/usr/bin/env python -> #!/usr/bin/env python3)
|
||||
curl https://storage.googleapis.com/git-repo-downloads/repo > /usr/bin/repo
|
||||
chmod a+x /usr/bin/repo
|
||||
|
||||
# Build ccache from the source
|
||||
cd /tmp
|
||||
git clone https://github.com/ccache/ccache -b v4.7.5
|
||||
cd ccache
|
||||
mkdir build
|
||||
cd build
|
||||
cmake -DCMAKE_BUILD_TYPE=Release ..
|
||||
make
|
||||
make install
|
||||
cd /tmp
|
||||
rm -rf ccache
|
||||
|
||||
# Install sharp from MLNX_OFED_LINUX as dependency for rccl-rdma-sharp-plugins
|
||||
cd /var/tmp
|
||||
mkdir mlnx
|
||||
wget -O mlnx/tar.tgz https://content.mellanox.com/ofed/MLNX_OFED-24.01-0.3.3.1/MLNX_OFED_LINUX-24.01-0.3.3.1-ubuntu22.04-x86_64.tgz
|
||||
tar -xz -C mlnx -f mlnx/tar.tgz
|
||||
apt-key add mlnx/*/RPM-GPG-KEY-Mellanox
|
||||
echo "deb [arch=amd64] file:$(echo $PWD/mlnx/*/DEBS) ./" > /etc/apt/sources.list.d/sharp.list
|
||||
apt update
|
||||
apt install -y sharp
|
||||
apt clean
|
||||
rm -rf /var/cache/apt/ /var/lib/apt/lists/* mlnx /etc/apt/sources.list.d/sharp.list
|
||||
|
||||
apt update
|
||||
apt -y install libunwind-dev
|
||||
apt -y install libgoogle-glog-dev
|
||||
|
||||
# Install python3.8 from source
|
||||
curl -LO https://www.python.org/ftp/python/3.8.13/Python-3.8.13.tar.xz
|
||||
tar -xvf Python-3.8.13.tar.xz
|
||||
pwd
|
||||
ls /var/tmp/
|
||||
ls Python-3.8.13
|
||||
mv Python-3.8.13 /opt/
|
||||
apt install build-essential zlib1g-dev libncurses5-dev libgdbm-dev libnss3-dev libssl-dev libsqlite3-dev libreadline-dev libffi-dev curl libbz2-dev pkg-config make -y
|
||||
cd /opt/Python-3.8.13/
|
||||
./configure --enable-optimizations --enable-shared
|
||||
make
|
||||
make -j 6
|
||||
make altinstall
|
||||
ldconfig /opt/Python3.8.13
|
||||
python3.8 --version
|
||||
|
||||
# roctracer and rocprofiler needs this python3.8
|
||||
python3.8 -m pip install setuptools wheel
|
||||
python3.8 -m pip install CppHeaderParser argparse requests lxml PyYAML joblib
|
||||
|
||||
#Install older version of hwloc-devel package for rocrtst
|
||||
curl -lO https://download.open-mpi.org/release/hwloc/v1.11/hwloc-1.11.13.tar.bz2
|
||||
tar -xvf hwloc-1.11.13.tar.bz2
|
||||
cd hwloc-1.11.13
|
||||
./configure
|
||||
make
|
||||
make install
|
||||
cp /usr/local/lib/libhwloc.so.5 /usr/lib
|
||||
hwloc-info --version
|
||||
|
||||
# Install gtest
|
||||
mkdir -p /tmp/gtest
|
||||
cd /tmp/gtest
|
||||
wget https://github.com/google/googletest/archive/refs/tags/v1.14.0.zip -O googletest.zip
|
||||
unzip googletest.zip
|
||||
cd googletest-1.14.0/
|
||||
mkdir build
|
||||
cd build
|
||||
cmake ..
|
||||
make -j$(nproc)
|
||||
make install
|
||||
rm -rf /tmp/gtest
|
||||
|
||||
## Install gRPC from source
|
||||
## RDC Pre-requisites
|
||||
GRPC_ARCHIVE=grpc-1.61.0.tar.gz
|
||||
mkdir /tmp/grpc
|
||||
mkdir /usr/grpc
|
||||
cd /tmp
|
||||
git clone --recurse-submodules -b v1.61.0 https://github.com/grpc/grpc
|
||||
cd grpc
|
||||
mkdir -p build
|
||||
cd build
|
||||
cmake -DgRPC_INSTALL=ON -DBUILD_SHARED_LIBS=ON -DgRPC_BUILD_TESTS=OFF -DCMAKE_INSTALL_PREFIX=/usr/grpc -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_STANDARD=14 -DCMAKE_SHARED_LINKER_FLAGS_INIT=-Wl,--enable-new-dtags,--build-id=sha1,--rpath,'$ORIGIN' ..
|
||||
make -j $(nproc) install
|
||||
rm -rf /tmp/grpc
|
||||
|
||||
## rocBLAS Pre-requisites
|
||||
## Download prebuilt AMD multithreaded blis (2.0)
|
||||
## Reference : https://github.com/ROCmSoftwarePlatform/rocBLAS/blob/develop/install.sh#L403
|
||||
mkdir -p /tmp/blis
|
||||
cd /tmp/blis
|
||||
wget -O - https://github.com/amd/blis/releases/download/2.0/aocl-blis-mt-ubuntu-2.0.tar.gz | tar xfz -
|
||||
mv amd-blis-mt /usr/blis
|
||||
cd /
|
||||
rm -rf /tmp/blis
|
||||
|
||||
## rocBLAS Pre-requisites(SWDEV-404612)
|
||||
## Download aocl-linux-gcc-4.2.0_1_amd64.deb
|
||||
mkdir -p /tmp/aocl
|
||||
cd /tmp/aocl
|
||||
wget -nv https://download.amd.com/developer/eula/aocl/aocl-4-2/aocl-linux-gcc-4.2.0_1_amd64.deb
|
||||
apt install ./aocl-linux-gcc-4.2.0_1_amd64.deb
|
||||
rm -rf /tmp/aocl
|
||||
|
||||
## hipBLAS Pre-requisites
|
||||
## lapack(3.9.1v)
|
||||
## Reference https://github.com/ROCmSoftwarePlatform/rocSOLVER/blob/develop/install.sh#L174
|
||||
lapack_version=3.9.1
|
||||
lapack_srcdir=lapack-$lapack_version
|
||||
lapack_blddir=lapack-$lapack_version-bld
|
||||
mkdir -p /tmp/lapack
|
||||
cd /tmp/lapack
|
||||
rm -rf "$lapack_srcdir" "$lapack_blddir"
|
||||
wget -O - https://github.com/Reference-LAPACK/lapack/archive/refs/tags/v3.9.1.tar.gz | tar xzf -
|
||||
cmake -H$lapack_srcdir -B$lapack_blddir -DCMAKE_BUILD_TYPE=Release -DCMAKE_Fortran_FLAGS=-fno-optimize-sibling-calls -DBUILD_TESTING=OFF -DCBLAS=ON -DLAPACKE=OFF
|
||||
make -j$(nproc) -C "$lapack_blddir"
|
||||
make -C "$lapack_blddir" install
|
||||
cd $lapack_blddir
|
||||
cp -r ./include/* /usr/local/include/
|
||||
cp -r ./lib/* /usr/local/lib
|
||||
cd /
|
||||
rm -rf /tmp/lapack
|
||||
|
||||
## rocSOLVER Pre-requisites
|
||||
## FMT(7.1.3v)
|
||||
## Reference https://github.com/ROCmSoftwarePlatform/rocSOLVER/blob/develop/install.sh#L152
|
||||
fmt_version=7.1.3
|
||||
fmt_srcdir=fmt-$fmt_version
|
||||
fmt_blddir=fmt-$fmt_version-bld
|
||||
mkdir -p /tmp/fmt
|
||||
cd /tmp/fmt
|
||||
rm -rf "$fmt_srcdir" "$fmt_blddir"
|
||||
wget -O - https://github.com/fmtlib/fmt/archive/refs/tags/7.1.3.tar.gz | tar xzf -
|
||||
cmake -H$fmt_srcdir -B$fmt_blddir -DCMAKE_BUILD_TYPE=Release -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DCMAKE_CXX_STANDARD=17 -DCMAKE_CXX_EXTENSIONS=OFF -DCMAKE_CXX_STANDARD_REQUIRED=ON -DFMT_DOC=OFF -DFMT_TEST=OFF
|
||||
make -j$(nproc) -C "$fmt_blddir"
|
||||
make -C "$fmt_blddir" install
|
||||
|
||||
# Build and install libjpeg-turbo
|
||||
mkdir -p /tmp/libjpeg-turbo
|
||||
cd /tmp/libjpeg-turbo
|
||||
wget -nv https://github.com/rrawther/libjpeg-turbo/archive/refs/heads/2.0.6.2.zip -O libjpeg-turbo-2.0.6.2.zip
|
||||
unzip libjpeg-turbo-2.0.6.2.zip
|
||||
cd libjpeg-turbo-2.0.6.2
|
||||
mkdir build
|
||||
cd build
|
||||
cmake -DCMAKE_INSTALL_PREFIX=/usr -DCMAKE_BUILD_TYPE=RELEASE -DENABLE_STATIC=FALSE -DCMAKE_INSTALL_DEFAULT_LIBDIR=lib ..
|
||||
make -j$(nproc) install
|
||||
rm -rf /tmp/libjpeg-turbo
|
||||
|
||||
# Get released ninja from source
|
||||
mkdir -p /tmp/ninja
|
||||
cd /tmp/ninja
|
||||
wget -nv https://codeload.github.com/Kitware/ninja/zip/refs/tags/v1.11.1.g95dee.kitware.jobserver-1 -O ninja.zip
|
||||
unzip ninja.zip
|
||||
cd ninja-1.11.1.g95dee.kitware.jobserver-1
|
||||
./configure.py --bootstrap
|
||||
cp ninja /usr/local/bin/
|
||||
rm -rf /tmp/ninja
|
||||
|
||||
# Install FFmpeg and dependencies
|
||||
# Build NASM
|
||||
mkdir -p /tmp/nasm-2.15.05
|
||||
cd /tmp
|
||||
wget -qO- "https://distfiles.macports.org/nasm/nasm-2.15.05.tar.bz2" | tar -xvj
|
||||
cd nasm-2.15.05
|
||||
./autogen.sh
|
||||
./configure --prefix="/usr/local"
|
||||
make -j$(nproc) install
|
||||
rm -rf /tmp/nasm-2.15.05
|
||||
|
||||
# Build YASM
|
||||
mkdir -p /tmp/yasm-1.3.0
|
||||
cd /tmp
|
||||
wget -qO- "http://www.tortall.net/projects/yasm/releases/yasm-1.3.0.tar.gz" | tar -xvz
|
||||
cd yasm-1.3.0
|
||||
./configure --prefix="/usr/local"
|
||||
make -j$(nproc) install
|
||||
rm -rf /tmp/yasm-1.3.0
|
||||
|
||||
# Build x264
|
||||
mkdir -p /tmp/x264-snapshot-20191217-2245-stable
|
||||
cd /tmp
|
||||
wget -qO- "https://download.videolan.org/pub/videolan/x264/snapshots/x264-snapshot-20191217-2245-stable.tar.bz2" | tar -xvj
|
||||
cd /tmp/x264-snapshot-20191217-2245-stable
|
||||
PKG_CONFIG_PATH="/usr/local/lib/pkgconfig" ./configure --prefix="/usr/local" --enable-shared
|
||||
make -j$(nproc) install
|
||||
rm -rf /tmp/x264-snapshot-20191217-2245-stable
|
||||
|
||||
# Build x265
|
||||
mkdir -p /tmp/x265_2.7
|
||||
cd /tmp
|
||||
wget -qO- "https://get.videolan.org/x265/x265_2.7.tar.gz" | tar -xvz
|
||||
cd /tmp/x265_2.7/build/linux
|
||||
cmake -G "Unix Makefiles" -DCMAKE_INSTALL_PREFIX="/usr/local" -DENABLE_SHARED:bool=on ../../source
|
||||
make -j$(nproc) install
|
||||
rm -rf /tmp/x265_2.7
|
||||
|
||||
# Build fdk-aac
|
||||
mkdir -p /tmp/fdk-aac-2.0.2
|
||||
cd /tmp
|
||||
wget -qO- "https://sourceforge.net/projects/opencore-amr/files/fdk-aac/fdk-aac-2.0.2.tar.gz" | tar -xvz
|
||||
cd /tmp/fdk-aac-2.0.2
|
||||
autoreconf -fiv
|
||||
./configure --prefix="/usr/local" --enable-shared --disable-static
|
||||
make -j$(nproc) install
|
||||
rm -rf /tmp/fdk-aac-2.0.2
|
||||
|
||||
# Build FFmpeg
|
||||
cd /tmp
|
||||
git clone -b release/4.4 https://git.ffmpeg.org/ffmpeg.git ffmpeg
|
||||
cd ffmpeg
|
||||
PKG_CONFIG_PATH="/usr/local/lib/pkgconfig"
|
||||
./configure --prefix="/usr/local" --extra-cflags="-I/usr/local/include" --extra-ldflags="-L/usr/local/lib" --extra-libs=-lpthread --extra-libs=-lm --enable-shared --disable-static --enable-libx264 --enable-libx265 --enable-libfdk-aac --enable-gpl --enable-nonfree
|
||||
make -j$(nproc) install
|
||||
rm -rf /tmp/ffmpeg
|
||||
|
||||
cp /tmp/local-pin-600 /etc/apt/preferences.d
|
||||
|
||||
command -v lbzip2
|
||||
ln -sf $(command -v lbzip2) /usr/local/bin/compressor || ln -sf $(command -v bzip2) /usr/local/bin/compressor
|
||||
|
||||
# Install Google Benchmark
|
||||
mkdir -p /tmp/Gbenchmark
|
||||
cd /tmp/Gbenchmark
|
||||
wget -qO- https://github.com/google/benchmark/archive/refs/tags/v1.6.1.tar.gz | tar xz
|
||||
cmake -Sbenchmark-1.6.1 -Bbuild -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=OFF -DBENCHMARK_ENABLE_TESTING=OFF -DCMAKE_CXX_STANDARD=14
|
||||
make -j -C build
|
||||
cd /tmp/Gbenchmark/build
|
||||
make install
|
||||
|
||||
# Build boost-1.85.0 from source for RPP
|
||||
# Installing in a non-standard location since the test packages of hipFFT and rocFFT pick up the version of
|
||||
# the installed Boost library and declare a package dependency on that specific version of Boost.
|
||||
# For example, if this was installed in the standard location it would declare a dependency on libboost-dev(el)1.85.0
|
||||
# which is not available as a package in any distro.
|
||||
# Once this is fixed, we can remove the Boost package from the requirements list and install this
|
||||
# in the standard location
|
||||
mkdir -p /tmp/boost-1.85.0
|
||||
cd /tmp/boost-1.85.0
|
||||
wget -nv https://sourceforge.net/projects/boost/files/boost/1.85.0/boost_1_85_0.tar.bz2 -O ./boost_1_85_0.tar.bz2
|
||||
tar -xf boost_1_85_0.tar.bz2 --use-compress-program="/usr/local/bin/compressor"
|
||||
cd boost_1_85_0
|
||||
./bootstrap.sh --prefix=${RPP_DEPS_LOCATION} --with-python=python3
|
||||
./b2 stage -j$(nproc) threading=multi link=shared cxxflags="-std=c++11"
|
||||
./b2 install threading=multi link=shared --with-system --with-filesystem
|
||||
./b2 stage -j$(nproc) threading=multi link=static cxxflags="-std=c++11 -fpic" cflags="-fpic"
|
||||
./b2 install threading=multi link=static --with-system --with-filesystem
|
||||
rm -rf /tmp/boost-1.85.0
|
||||
@@ -7,7 +7,6 @@ bison
|
||||
bridge-utils
|
||||
build-essential
|
||||
bzip2
|
||||
ccache
|
||||
check
|
||||
chrpath
|
||||
cifs-utils
|
||||
|
||||
Reference in New Issue
Block a user